
All posts (33)
[security] jceks hadoop credential provider api

# list
hadoop credential list -provider jceks:///dir/filename.jceks
# create
hadoop credential create myalias -value 'pwd' -provider jceks:///dir/filename.jceks
hadoop credential create myalias2 -value 'pwd2' -provider jceks:///dir/filename.jceks

def getHadoopCredentail(credProviderPath="jceks:///dir/filename.jceks", pwAlias=None):
    spark = SparkSession.builder \
        .appName(appName1) \
        .enableHiv..
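
The preview cuts off before the alias is actually resolved inside getHadoopCredentail. A minimal sketch of the usual lookup, assuming the standard Hadoop Configuration.getPassword() route; the alias and app name are placeholders, not the post's exact code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("credential-demo").enableHiveSupport().getOrCreate()

# point the Hadoop configuration at the JCEKS store, then ask it for the alias
hconf = spark.sparkContext._jsc.hadoopConfiguration()
hconf.set("hadoop.security.credential.provider.path", "jceks:///dir/filename.jceks")

chars = hconf.getPassword("myalias")   # Java char[] via py4j, or None if the alias is missing
password = ''.join(str(c) for c in chars) if chars is not None else None
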
[python] Getting startingOffsets value and reading kafka value into spark dataframe

from pyspark.sql import SparkSession
import pyspark.sql.functions as func

# get maxoffset
spark1 = SparkSession.builder \
    .appName(appName1) \
    .config("spark.executor.memory","11g") \
    .enableHiveSupport() \
    .getOrCreate()

strStartOffset = 'earliest'
if len(listPath) > 0:
    dfOffset = spark1.read.parquet(path+"/"+listPath['pathSuffix'].max())
    dfOffset.createOrReplaceTempView("temp_offset")
    dfMaxOff..
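
The preview stops before the Kafka read itself. A hedged sketch of the usual batch pattern follows; the topic name, servers and example offsets are placeholders. startingOffsets takes either 'earliest' or a per-partition JSON string, which is what the max-offset lookup above would feed in.

import json

topic = "topic1"   # placeholder topic name
bootstrap = "kafka01.host.com:9092,kafka02.host.com:9092,kafka03.host.com:9092"

# either the literal 'earliest' or a per-partition JSON built from the stored max offsets
strStartOffset = json.dumps({topic: {"0": 1000, "1": 1000}})   # assumed example offsets

dfKafka = spark1.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", bootstrap) \
    .option("subscribe", topic) \
    .option("startingOffsets", strStartOffset) \
    .option("endingOffsets", "latest") \
    .load()

# value arrives as binary, so cast it before parsing
dfValue = dfKafka.selectExpr("CAST(value AS STRING) AS value", "partition", "offset")
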
[python] get hdfs path info

import requests
import pandas as pd

def getPathInfo(hdfsPath):
    try:
        s = requests.session()
        userName = "hive"  # getpass.getuser()
        operation = "LISTSTATUS"
        httpFsUrl = "http://hostip:14000"
        req = "{0}/webhdfs/v1{1}?user.name={2}&op={3}".format(httpFsUrl, hdfsPath, userName, operation)
        response = s.get(req, headers={'Connection':'close'})
        result = response.json()["FileStatuses"]["FileStatus"]
        dfTmp = pd..
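
The function is cut off right where the FileStatus list becomes a DataFrame. A hedged completion under the same WebHDFS response shape (pathSuffix, type, length and modificationTime are standard FileStatus fields); the sorting and error handling are assumptions.

import requests
import pandas as pd

def getPathInfo(hdfsPath, httpFsUrl="http://hostip:14000", userName="hive"):
    # LISTSTATUS returns one FileStatus dict per child of hdfsPath
    req = "{0}/webhdfs/v1{1}?user.name={2}&op=LISTSTATUS".format(httpFsUrl, hdfsPath, userName)
    try:
        response = requests.get(req, headers={'Connection': 'close'})
        result = response.json()["FileStatuses"]["FileStatus"]
        dfTmp = pd.DataFrame(result)            # columns include pathSuffix, type, length, modificationTime
        return dfTmp.sort_values("pathSuffix")  # sorted so callers can take e.g. pathSuffix.max()
    except Exception as e:
        print(e)
        return pd.DataFrame()
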
[spark dataframe] extract date value using pyspark udf lambda

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
import datetime

fnDataReplace = func.udf(lambda s: s.replace('\\',''))
fnGetBaseDate = func.udf(lambda value1, s1, s2, s3: extractBaseDate(value1, s1, s2, s3))

def extractBaseDate(value1, dateCol1, dateCol2, timestampCol):
    if (dateCol1 is not None) and len(dateCol1) > 13:
        baseDate = datetime.datetime.strptime(dateCol1, "..
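
The preview ends inside strptime. Below is a hedged sketch of how such a UDF is typically completed and wired onto a DataFrame; the column names, date formats and fallback order are assumptions, not the post's exact logic.

from pyspark.sql import SparkSession
import pyspark.sql.functions as func
import datetime

def extractBaseDate(value1, dateCol1, dateCol2, timestampCol):
    # assumed fallback order: long dateCol1, then dateCol2, then the timestamp, then the default value
    if dateCol1 is not None and len(dateCol1) > 13:
        return datetime.datetime.strptime(dateCol1[:8], "%Y%m%d").strftime("%Y%m%d")
    if dateCol2 is not None and len(dateCol2) >= 8:
        return dateCol2[:8]
    if timestampCol is not None:
        return timestampCol[:8]
    return value1

fnGetBaseDate = func.udf(extractBaseDate)

spark = SparkSession.builder.appName("udf-demo").getOrCreate()
df = spark.createDataFrame(
    [("20240101", "20240102123456789", None, None)],
    schema="default_dt string, dt1 string, dt2 string, ts string")
df = df.withColumn("base_dt",
                   fnGetBaseDate(func.col("default_dt"), func.col("dt1"),
                                 func.col("dt2"), func.col("ts")))
df.show()
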
[command] kafka topic CLI

# topic list check
kafka-topics --bootstrap-server kafka01.host.com:9092,kafka02.host.com:9092,kafka03.host.com:9092 --list
# topic details
kafka-topics --bootstrap-server kafka01.host.com:9092,kafka02.host.com:9092,kafka03.host.com:9092 --topic topic1 --describe
# change topic partition
kafka-topics --bootstrap-server kafka01.host.com:9092,kafka02.host.com:9092,kafka03.host.com:9092 --alter --t..
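
The same checks can be scripted. A minimal sketch with confluent-kafka's AdminClient, which is an assumption rather than what the post uses; it lists every topic with its partition count.

from confluent_kafka.admin import AdminClient

admin = AdminClient({"bootstrap.servers": "kafka01.host.com:9092,kafka02.host.com:9092,kafka03.host.com:9092"})
metadata = admin.list_topics(timeout=10)   # ClusterMetadata for the whole cluster

for name, topic in sorted(metadata.topics.items()):
    print(name, "partitions:", len(topic.partitions))
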
Filebeat to Kafka procedure

# ssh
ssh id@log_generating_server
# cd
cd /dir/filebeat_to_kafka
# unzip
tar -xvf filebeat_to_kafka.tar
# Check running processes
ps -ef | grep filebeat
# stop work
kill -9 `ps -ef | grep -w filebeat-big.yml | grep -v grep | awk '{print $2}'`
# update yml file
vi filebeat-big.yml
#-----------------------filebeat prospectors---------------------------------
- input_type: log
  paths:
    - /dir/AAA???..
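
After restarting Filebeat, a quick sanity check is to consume a few messages and confirm events are landing in Kafka. A hedged sketch with kafka-python; both the library and the topic name are assumptions, not part of the original procedure.

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "filebeat-big",                        # placeholder topic name
    bootstrap_servers="kafka01.host.com:9092",
    auto_offset_reset="latest",
    consumer_timeout_ms=10000)             # stop iterating after 10s with no new messages

for msg in consumer:
    print(msg.partition, msg.offset, msg.value[:200])
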
[impala / oracle / presto / spark] Confusing Date SQL Statement Differences

1. impala
tx_dt between from_timestamp(date_add(to_timestamp(t1.base_dt, 'yyyyMMdd'),-3),'yyyyMMdd') and t1.base_dt2
a.nbr = cast(b.nbr as string)
select substr(to_date(months_add(now(),-1)),1,7) as base_ym
base_dt = to_date(now())
base_dt = to_date(date_add(now(),-1))
to_timestamp(tx_tmstmp,'yyyyMMddHHmmss') > date_add(now(), interval -10 minutes)
count(case when t1.base_dt = from_timestamp(last..
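
For the Spark side of the comparison, a hedged sketch of roughly equivalent expressions run as literals (no table needed); Spark uses date_sub and add_months where Impala has date_add with a negative offset and months_add. The mapping is the point, the values are placeholders.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("date-sql-demo").getOrCreate()

spark.sql("""
  SELECT date_format(current_date(), 'yyyyMMdd')                        AS base_dt,         -- impala: to_date(now())
         date_format(date_sub(current_date(), 1), 'yyyyMMdd')           AS base_dt_minus1,  -- impala: to_date(date_add(now(),-1))
         substr(CAST(add_months(current_date(), -1) AS STRING), 1, 7)   AS base_ym,         -- impala: substr(to_date(months_add(now(),-1)),1,7)
         CAST(12345 AS STRING)                                          AS nbr_str          -- impala: cast(b.nbr as string)
""").show(truncate=False)
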
[ML] XGBClassifier sample code

# First XGBoost model for Pima Indians dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
seed = 7
test_size = ..
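
The snippet truncates at the train/test split. The remainder of the standard Pima Indians tutorial flow is roughly as follows, continuing from the variables above; test_size=0.33 and the default XGBClassifier are the usual tutorial choices, assumed here.

test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
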
[pandas] MinMaxScaling sample code

import pandas as pd
a = pd.read_csv('data/mtcars.csv', index_col=0)

# case1: scale manually, then count rows whose scaled value exceeds 0.5
min_data = a['qsec'].min()
max_data = a['qsec'].max()
a['qsec2'] = (a['qsec'] - min_data) / (max_data - min_data)
result = len(a[a['qsec2'] > 0.5])   # len() of the bare boolean Series would just return the total row count
print(result)

# case2: the same with sklearn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
a['qsec3'] = scaler.fit_transform(a[['qsec']])
result2 = len(a[a['qsec3'] > 0.5])
print(r..
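
An equivalent count without building the filtered frame: summing the boolean Series gives the same number, and since both columns are min-max scaled to [0,1] the same way, the sklearn column should match case1.

print(int((a['qsec2'] > 0.5).sum()))   # True counts as 1, so the sum is the number of rows above 0.5
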
[pandas] dataframe filter

import pandas as pd
df = df[df.col1 == 'value1']
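
A few common variants of the same filter, with placeholder column names and values:

import pandas as pd

# multiple conditions need parentheses around each comparison
df2 = df[(df.col1 == 'value1') & (df.col2 > 10)]

# the same filter written with query()
df3 = df.query("col1 == 'value1' and col2 > 10")

# membership test against a list of values
df4 = df[df.col1.isin(['value1', 'value2'])]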