Need help to retrieve data from a remote host in this script.
#!python3
import pyodbc, psycopg2
import datetime, time, pytz, tzlocal, socket, configparser
import os, hashlib, csv, sys, sysconfig, logging, logging.config
# Machine's local timezone, resolved once at import time (used by utc_to_local).
local_tz = tzlocal.get_localzone()
def utc_to_local(utc_dt):
    """Convert a naive UTC datetime into an aware datetime in the local zone."""
    aware_utc = utc_dt.replace(tzinfo=pytz.utc)
    converted = aware_utc.astimezone(local_tz)
    # .normalize is a pytz-ism; harmless here, kept for DST edge cases.
    return local_tz.normalize(converted)
def aslocaltimestr(utc_dt):
    """Format a naive UTC datetime as a local-time string with zone name."""
    local_dt = utc_to_local(utc_dt)
    return local_dt.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
# Column order of the per-table audit CSV written by _Main's DictWriter.
CSV_ColumnNames = ['TableName', 'TSchema', 'Cols', 'ColumnNames', 'HAWQColumnNames', 'ColumnDefinition', 'HAWQColumnDefinition',
                   'CreateTable', 'CreateTableLocal', 'DropTable', 'GrantPermissions',
                   'Records', 'HAWQRecords', 'Insert_stmt', 'Location', 'Format', 'TimeStamp', 'ObjectID']
# Batch size used by the bulk-insert routines (rows fetched/inserted per round).
g_cursor_row_size = 100
# Default directory holding the exported perf CSV files (Windows share path).
default_loc = r'S:\nas\temp\export_201505201535'
# Audit log of every table processed (columns: CSV_ColumnNames).
csvFile = r'Perf_Tables.csv'
# Source-type -> HAWQ-type mapping file, consumed by col_map().
colmapCsv = r'ColumnTypes.csv'
def set_globals(location):
    """Initialise module globals: passwords, logger, element manager, location.

    Exits the process when the interpreter is too old or when another instance
    already holds the single-instance lock (see get_lock).  Returns True on
    success.

    Fix vs. original: the version gate compared float(sysconfig.get_python_version()),
    and float('3.10') == 3.1 < 3.4, so any 3.10+ interpreter was wrongly
    rejected.  The check now uses sys.version_info; pyver is still set for
    backward compatibility.
    """
    global hawq_pass1, sqla_pass1, gLocation, logger, pyver, element_mgr
    gLocation = location
    # Kept for callers that may read it; NOT safe for comparisons (see above).
    pyver = float(sysconfig.get_python_version())
    min_req = (3, 4)
    scriptName = os.path.basename(__file__)
    if sys.version_info < min_req:
        sys.exit("Error# {}: This program, {} is not tested on Python Version: {}, \nPlease upgrade to at least Version: {}.".
                 format(9999, scriptName, sysconfig.get_python_version(), "3.4"))
    elif get_lock(scriptName):
        logging.basicConfig(filename='ETL_Adapter.log', level=logging.DEBUG)
        # fileConfig replaces the basicConfig handlers with those in the .conf.
        logging.config.fileConfig('ETL_Adapter.conf')
        # create logger debug,info,warn,error,critical
        logger = logging.getLogger('ETL_DFM')
        configParser = configparser.ConfigParser()
        configParser.read('ETL_Adapter.conf')
        hawq_pass1 = configParser.get('Authentication', 'hawq_pass1')
        sqla_pass1 = configParser.get('Authentication', 'sqla_pass1')
        element_mgr = configParser.get('Element Managers', 'NetApp7M')
        return True
def get_lock(scriptName, port=10000):
    """Acquire a machine-wide single-instance lock by binding a TCP port.

    The socket is stored in a module global so it lives (and holds the lock)
    for the life of the process.  Exits the process when the port is already
    bound, i.e. another copy of the script is running.

    Generalization vs. original: the lock port (previously hard-coded 10000)
    is now a keyword parameter with the same default.
    """
    global lock_socket  # Without this our lock gets garbage collected
    lock_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        server_address = (socket.gethostname(), port)
        lock_socket.bind(server_address)
        return True
    except socket.error:
        sys.exit("Error# {}: This program, {} is already running, {}, Exiting...".format(9999, scriptName, lock_socket))
# Table metadata registry, keyed by table name.  Each value is a snapshot of
# the tab_info dict produced by build_create_drop (DDL, insert statement,
# record counts, timestamps, ...).
table_info_dict = {}
def build_create_drop_sqla(tab_info):
    """Fill in the SQL-Anywhere-specific fields, then build the shared DDL."""
    tab_info.update({
        'TSchema': "bigdata_sandbox.perf_stn_",
        'Records': "",
        'Location': GSQLA_conn_str,
        'Format': "SQL",
    })
    build_create_drop(tab_info)
def build_create_drop_csv(tab_info, object_id):
    """Fill in the CSV-source-specific fields, then build the shared DDL."""
    tab_info.update({
        'TSchema': "bigdata_sandbox.perf_st_",
        'ObjectID': object_id,
        'Location': gLocation,
        'Format': "CSV",
    })
    build_create_drop(tab_info)
def build_create_drop(tab_info):
    """Derive the full DDL/DML statements for one table and register it.

    Expects TableName/TSchema/Cols/ColumnNames/ColumnDefinition to be set
    already; stores a snapshot of the completed dict in table_info_dict.
    """
    tab_info['Records'] = ""
    tab_info['TimeStamp'] = aslocaltimestr(datetime.datetime.utcnow())
    qualified = tab_info['TSchema'] + tab_info['TableName']
    ddl_body = ("CREATE TABLE " + qualified +
                " (distinct_id varchar(255), element_mgr varchar(32), " +
                tab_info['ColumnDefinition'] +
                ", entity_last_updated timestamp DEFAULT current_timestamp)")
    tab_info['CreateTableLocal'] = ddl_body + ";"
    tab_info['CreateTable'] = ddl_body + " DISTRIBUTED BY (distinct_id);"
    tab_info['DropTable'] = "DROP TABLE {}; -- Columns: {}".format(qualified, tab_info['Cols'])
    tab_info['GrantPermissions'] = "GRANT SELECT ON TABLE " + qualified + " TO team_cisa;"
    # distinct_id, element_mgr and entity_last_updated are synthesised columns.
    all_cols = "distinct_id,element_mgr," + tab_info['ColumnNames'] + ",entity_last_updated"
    tab_info['HAWQColumnNames'] = all_cols
    placeholders = ",".join("%({})s".format(name) for name in all_cols.split(','))
    tab_info['Insert_stmt'] = """INSERT INTO %s (%s) VALUES(%s)""" % (qualified, all_cols, placeholders)
    table_info_dict[tab_info['TableName']] = tab_info.copy()
def get_rows_from_csv(table_info, header_row):
    """Load one exported perf CSV and return (row_count, rows).

    The file name is <Location>/<TableName>_<ObjectID>.  Each data row is
    re-ordered from on-disk column order to [row[2], row[0], row[1], row[3]];
    downstream (build_recs_with_uuid_from_rec_array) reads the result as
    [sample_time, instance_id, counter_id, sample_value] — so on disk the
    third column is presumably the sample time (TODO confirm export format).
    When header_row is True the file's first line is skipped.

    Fixes vs. original: os.path.join instead of a hard-coded "\\" separator
    (portable), newline='' as the csv module documents, and the needless
    rows.copy() on return removed.
    """
    csv_path = os.path.join(table_info['Location'],
                            table_info['TableName'] + "_" + str(table_info['ObjectID']))
    rows = []
    file_size = os.path.getsize(csv_path)
    if file_size > 0:
        with open(csv_path, mode='r', newline='') as infile:
            for row in csv.reader(infile):
                if header_row:
                    header_row = False  # skip the header line only
                else:
                    rows.append([row[2], row[0], row[1], row[3]])
    logger.info("Loading: {0}, {1:.2f} KB's and {2} Rows.".format(csv_path, file_size / 1024, len(rows)))
    return (len(rows), rows)
def col_map():
    """Read colmapCsv into {source_type: [hawq_type, needs_precision, extra]}."""
    with open(colmapCsv, mode='r') as infile:
        return {row[0]: [row[1], row[2], row[3]] for row in csv.reader(infile)}
def dictfetchall(cursor):
    """Return every remaining row from `cursor` as a list of column->value dicts."""
    col_names = [col[0] for col in cursor.description]
    return [dict(zip(col_names, row)) for row in cursor.fetchall()]
def build_uuid_map(cursor1, table_rec):
    """Return the list of distinct_id hashes already stored in the HAWQ table."""
    uuid_stmt = 'select distinct_id from ' + table_rec['TSchema'] + table_rec['TableName'] + ";"
    (howmany, rows) = select_qry_hawq(cursor1, uuid_stmt)
    if howmany <= 0:
        return []
    columns = [column[0] for column in cursor1.description]
    return [dict(zip(columns, row))['distinct_id'] for row in rows]
def select_execute_only(cursor1, table_rec):
    """Run 'select * from <table>' and return (rowcount, result) via select_qry."""
    stmt = "select " + " * from " + table_rec['TableName'] + ";"
    return select_qry(cursor1, stmt)
# TODO: Merge 2 functions build_recs_with_uuid_from_rec_array and build_recs_with_uuid
def build_recs_with_uuid(old_uuids, cursor1, howmany, fetch_rows):
    """Fetch up to fetch_rows rows from cursor1 and return them as insert-ready dicts.

    Each row is hashed (md5 of its sorted items, before the synthesised columns
    are added) into a distinct_id.  Rows whose hash is already in old_uuids
    (previously loaded) or seen earlier in this batch are dropped; new hashes
    are appended to the caller's old_uuids list so later batches see them.

    Fixes vs. original: the column-name list is built once instead of per row,
    m.hexdigest() is computed once instead of up to four times per row, and
    duplicate checks use sets (O(1)) instead of scanning growing lists, while
    old_uuids keeps its list semantics for the caller.
    """
    entity_last_updated = aslocaltimestr(datetime.datetime.utcnow())
    seen_old = set(old_uuids)   # fast membership; old_uuids list still appended to
    seen_new = set()
    table_data = []
    rows = cursor1.fetchmany(fetch_rows)
    if howmany > 0:
        columns = [column[0] for column in cursor1.description]  # loop-invariant
        cur_cnt = 0
        newDupe = existingDupe = 0
        for row in rows:
            record = dict(zip(columns, row))
            # Hash the raw record (sorted for a stable representation).
            rec_str = str(sorted(record.items(), key=lambda t: t[0]))
            digest = hashlib.md5(rec_str.encode('utf-8')).hexdigest()
            record['distinct_id'] = digest
            record['element_mgr'] = element_mgr
            if digest in seen_old:
                existingDupe += 1
            elif digest in seen_new:
                newDupe += 1
            else:
                seen_new.add(digest)
                seen_old.add(digest)
                old_uuids.append(digest)
                record['entity_last_updated'] = entity_last_updated
                table_data.append(record)
            cur_cnt += 1
            if cur_cnt >= fetch_rows:
                break
    return table_data
def build_recs_with_uuid_from_rec_array(object_id, old_uuids, csv_row_array, start, fetch_rows):
    """Turn up to fetch_rows CSV rows (from index `start`) into insert-ready dicts.

    Each CSV row is read as [sample_time('%Y%m%d%H%M%S'), instance_id,
    counter_id, sample_value] (see get_rows_from_csv).  interval_minutes is
    derived from the gap between consecutive sample times.  Rows are
    deduplicated by an md5 of the record; new hashes are appended to the
    caller's old_uuids list so later batches see them.

    Fixes vs. original: m.hexdigest() computed once per row, and duplicate
    checks use sets (O(1)) instead of scanning growing lists (was quadratic).
    """
    entity_last_updated = aslocaltimestr(datetime.datetime.utcnow())
    seen_old = set(old_uuids)   # fast membership; old_uuids list still appended to
    seen_new = set()
    table_data = []
    if len(csv_row_array) > 0:
        cur_cnt = start
        index = 0
        newDupe = existingDupe = 0
        prev_sample_time = time.mktime(datetime.datetime.strptime(csv_row_array[start][0], '%Y%m%d%H%M%S').timetuple())
        while cur_cnt < len(csv_row_array) and index < fetch_rows:
            value = csv_row_array[cur_cnt]
            perf_id = '{0}:{1}'.format(object_id, cur_cnt)
            cur_cnt += 1
            index += 1
            sample_time = time.mktime(datetime.datetime.strptime(value[0], '%Y%m%d%H%M%S').timetuple())
            tdelta = sample_time - prev_sample_time  # seconds between consecutive samples
            prev_sample_time = sample_time
            record = {'element_mgr': element_mgr,
                      'perfdata_id': perf_id,
                      'instance_id': value[1],
                      'counter_id': value[2],
                      'interval_minutes': datetime.timedelta(seconds=tdelta),
                      'sample_time': value[0],
                      'sample_value': value[3],
                      'sample_value_min': 0,
                      'sample_value_max': 0,
                      'sample_value_calc': 0,
                      'calc_method': 'None'}
            # Hash before distinct_id/entity_last_updated are added.
            rec_str = str(sorted(record.items(), key=lambda t: t[0]))
            digest = hashlib.md5(rec_str.encode('utf-8')).hexdigest()
            record['distinct_id'] = digest
            if digest in seen_old:
                existingDupe += 1
            elif digest in seen_new:
                newDupe += 1
            else:
                seen_new.add(digest)
                seen_old.add(digest)
                old_uuids.append(digest)
                record['entity_last_updated'] = entity_last_updated
                table_data.append(record)
            if cur_cnt >= fetch_rows + start:
                break
    return table_data
def ds2dl_insert_bulk(HAWQcursor, SQLAcursor, table_rec, old_uuids):
    """Copy all rows of table_rec from SQL Anywhere into HAWQ in fixed batches.

    Returns the remaining-row counter (<= 0 on normal completion, -1 on a
    psycopg2 insert error).

    Bug fix vs. original: the error handler referenced e.pgerro (typo), which
    raised AttributeError inside the except block; corrected to e.pgerror.
    """
    inserted = 0
    try:
        (howmany, rows) = select_execute_only(SQLAcursor, table_rec)
        while howmany > 0:
            # Fetch how many rows at a time and insert.
            fetch_rows = g_cursor_row_size
            values = build_recs_with_uuid(old_uuids, SQLAcursor, howmany, fetch_rows)
            if len(values) > 0:
                inserted += len(values)
                logger.debug("Table: {} Total {} From: {}, Insert: {}".format(table_rec['TableName'], str(len(values)), inserted, howmany))
                HAWQcursor.executemany(table_rec['Insert_stmt'], values)
            #TODO: Resume from failure and insert only fixed rows. Remove below Comment to insert all rows
            #return howmany
            howmany -= fetch_rows
    except psycopg2.Error as e:
        logger.info("Insert Error {}, {}".format(e.pgerror, e.diag.message_primary))
        howmany = -1
    return howmany
def ds2dl_insert_bulk_from_array(HAWQCursor, rec_array, table_rec, old_uuids):
    """Insert rec_array (rows from a perf CSV) into HAWQ in fixed-size batches.

    Bug fix vs. original: the error handler referenced e.pgerro (typo), which
    raised AttributeError inside the except block; corrected to e.pgerror.
    """
    insert_from = 0
    inserted = 0
    # Fetch fixed rows at a time and insert.
    fetch_rows = g_cursor_row_size
    try:
        while insert_from <= len(rec_array):
            values = build_recs_with_uuid_from_rec_array(table_rec['ObjectID'], old_uuids, rec_array, insert_from, fetch_rows)
            if len(values) > 0:
                logger.debug("Table: {} Total {} From: {}, Insert: {}".format(table_rec['TableName'], str(len(rec_array)), insert_from, len(values)))
                HAWQCursor.executemany(table_rec['Insert_stmt'], values)
                inserted += len(values)
            insert_from += fetch_rows
    except psycopg2.Error as e:
        logger.info("Insert Error {}, {}".format(e.pgerror, e.diag.message_primary))
def select_qry(cursor1, qry1):
    """Execute qry1 on cursor1; return (abs(rowcount), execute() result).

    Returns (0, None) on failure.  Fix vs. original: catches Exception instead
    of a bare except, which also swallowed SystemExit/KeyboardInterrupt.
    """
    howmany = 0
    try:
        rows = cursor1.execute(qry1)
        howmany = abs(cursor1.rowcount)
        return (howmany, rows)
    except Exception:
        logger.error("Select Error")  # , cursor1.statusmessage, "\n", cursor1.query)
        return (howmany, None)
def select_qry_many(cursor1, many):
    """fetchmany(many) from cursor1; return (abs(rowcount), rows), or (0, None).

    Fix vs. original: catches Exception instead of a bare except.
    """
    howmany = 0
    try:
        rows = cursor1.fetchmany(many)
        howmany = abs(cursor1.rowcount)
        return (howmany, rows)
    except Exception:
        logger.error("Select Error")
        return (howmany, None)
def select_qry_hawq(cursor1, qry1):
    """Execute qry1 against HAWQ; return (abs(rowcount), fetchall rows).

    Logs psycopg2 diagnostics and returns (0, None) on failure.
    """
    try:
        cursor1.execute(qry1)
        result = cursor1.fetchall()
        return (abs(cursor1.rowcount), result)
    except psycopg2.Error as e:
        logger.error("Select Error : {}, {}".format(e.pgerror, e.diag.message_primary))
        return (0, None)
def delete_rows(cursor1, stmt1):
    """Execute a delete statement; return its rowcount, or 0 on error.

    Fix vs. original: catches Exception rather than a bare except.
    """
    try:
        cursor1.execute(stmt1)
        howmany = cursor1.rowcount
    except Exception:
        logger.error("Delete Error: {}, {}".format(cursor1.statusmessage, cursor1.query))
        howmany = 0
    return howmany
def truncate_hawq_table(cursor1, table_rec):
    """TRUNCATE the HAWQ table named by table_rec; True on success.

    Fix vs. original: catches Exception rather than a bare except.
    """
    sql_stmt = "TRUNCATE TABLE " + table_rec['TSchema'] + table_rec['TableName']
    try:
        cursor1.execute(sql_stmt)
        return True
    except Exception:
        logger.error("Truncate Error: {}, {}".format(cursor1.statusmessage, cursor1.query))
        return False
def get_column_def(data_types, rowdef):
    """Map one SYSCOLUMNS row to a target column-definition string.

    rowdef is (tname, cname, coltype, length, ...); data_types maps
    coltype -> [target_type, needs_precision_flag('Y'/'N'), ...].
    """
    target_type, needs_precision = data_types[rowdef[2]][:2]
    definition = "{} {}".format(rowdef[1], target_type)
    if needs_precision == 'Y':
        definition += "({})".format(rowdef[3])  # append precision/length
    return definition
def get_rowcount(SQLCursor, table_name):
    """Return select count(*) for table_name on the SQLA cursor (0 when the query fails)."""
    (howmany, rows) = select_qry(SQLCursor, 'select count(*) from ' + table_name)
    if howmany <= 0:
        return 0
    for row in rows:
        row_cnt = '{}'.format(','.join(map(str, row)))
    return int(row_cnt)
def get_HAWQ_rowcount(HAWQCursor, table_name, tschema):
    """Return select count(*) for tschema+table_name in HAWQ (0 when the query fails)."""
    (howmany, rows) = select_qry_hawq(HAWQCursor, 'select count(*) from ' + tschema + table_name)
    if howmany <= 0:
        return 0
    for row in rows:
        row_cnt = '{}'.format(','.join(map(str, row)))
    return int(row_cnt)
def GrantPermissions(cursor1, grant_def):
    """Execute a GRANT statement (no-op for empty input); 0 on success, -1 on error."""
    if not grant_def:
        return 0
    try:
        logger.debug("Granting Permissions: " + grant_def + "\n")
        cursor1.execute(grant_def)
    except psycopg2.Error as e:
        logger.error("Error Granting Permissions" + e.diag.message_primary + "\n")
        return -1
    return 0
def drop_table(cursor1, drop_table_def):
    """Execute a DROP TABLE statement (no-op for empty input); 0 on success, -1 on error."""
    if not drop_table_def:
        return 0
    try:
        logger.debug("Dropping Table: " + drop_table_def + "\n")
        cursor1.execute(drop_table_def)
    except psycopg2.Error as e:
        logger.error("Error Dropping Table" + e.diag.message_primary + "\n")
        return -1
    return 0
def create_table(cursor1, drop_table_def, table_def, force):
    """Create a table, optionally dropping the old one first when force is True.

    Returns 0 on success (or empty table_def), -1 on a psycopg2 error.
    """
    if force:
        drop_table(cursor1, drop_table_def)
        # return 0 # Uncomment to drop tables.
    if not table_def:
        return 0
    try:
        cursor1.execute(table_def)
    except psycopg2.Error as e:
        logger.error("Error Creating Table" + e.diag.message_primary + "\n")
        return -1
    return 0
def main_etl_routine_sql(writer1, HAWQCursor, SQLACursor, recreate_table, truncate_table):
    """Discover DFMG% tables from SYS.SYSCOLUMNS and bulk-load them into HAWQ.

    Walks the tname-ordered column catalogue, grouping consecutive rows by
    table name into one tab_info record per table (registered via
    build_create_drop_sqla), then iterates table_info_dict: optionally
    (re)creates each HAWQ table, optionally truncates it, and copies the
    SQL Anywhere rows across in batches.  One audit row per loaded table is
    written to writer1.
    """
    tab_info = {}
    col_num = -1              # -1 marks "no table seen yet" (skip flush on first row)
    column_definitions = ""
    table_name = ""
    column_names = ""
    tab_num = 0
    data_types_csv = col_map()  # source coltype -> target type mapping
    logger.info("Started main_etl_routine_sql")
    table = "SYS.SYSCOLUMNS"
    columns = tuple("tname,cname,coltype,length,nulls,in_primary_key,default_value".split(","))
    where = " where creator like 'DFMG%' order by 1"
    stmt_qry = """SELECT %s FROM %s""" % (",".join(columns), table + where)
    select_qry(SQLACursor, stmt_qry)
    for table_row in SQLACursor.fetchall():
        if (table_name == table_row[0]):  # If Same table
            col_num += 1
            column_definitions += ", "
            column_names += "," + table_row[1]
        else:
            # A new table started: flush the previous table's accumulated metadata.
            if (col_num != -1):  # Skip first row
                tab_info['TableName'] = table_name
                tab_info['Cols'] = col_num
                tab_info['ColumnNames'] = column_names
                tab_info['ColumnDefinition'] = column_definitions
                build_create_drop_sqla(tab_info)
                column_definitions = ""
                tab_num += 1
            col_num = 1
            column_names = table_row[1]
            table_name = table_row[0]
        # Runs for every catalogue row (both branches): append this column's DDL.
        col_data = get_column_def(data_types_csv, table_row)
        column_definitions += col_data
    # Set for last table.
    tab_info['TableName'] = table_name
    tab_info['Cols'] = col_num
    tab_info['ColumnNames'] = column_names
    tab_info['ColumnDefinition'] = column_definitions
    build_create_drop_sqla(tab_info)
    for table_name, table_detail in table_info_dict.items():
        table_detail['Records'] = get_rowcount(SQLACursor, table_name)
        if table_detail['Records'] > 0:  # and table_name == 'objectView':
            # Recreate table if structure is changed
            if recreate_table:
                create_table(HAWQCursor, table_detail['DropTable'], table_detail['CreateTable'], True)
                GrantPermissions(HAWQCursor, table_detail['GrantPermissions'])
            # NOTE(review): tab_info here still holds the *last* parsed table; this
            # only works because every SQLA table shares one TSchema prefix.
            # table_detail['TSchema'] looks like the intended value -- confirm.
            table_detail['HAWQRecords'] = get_HAWQ_rowcount(HAWQCursor, table_name, tab_info['TSchema'])
            # Check and truncate table only if records exists and requested.
            old_uuids = []
            if truncate_table and table_detail['HAWQRecords'] > 0:
                # NOTE(review): passes tab_info, not table_detail -- same caveat as
                # above; as written this truncates the last parsed table's name.
                result = truncate_hawq_table(HAWQCursor, tab_info)
                table_detail['HAWQRecords'] = 0;
                logger.debug("Truncated Table: {}, Result: {}".format(table_detail['TableName'], result))
            elif table_detail['HAWQRecords'] > 0:
                # Pre-load existing distinct_ids so the copy is incremental.
                old_uuids = build_uuid_map(HAWQCursor, table_detail)
            ds2dl_insert_bulk(HAWQCursor, SQLACursor, table_detail, old_uuids)
            logger.info("Table: {}, New Count: {}, Old Count: {}, Processed: {}".format(table_detail['TableName'], len(old_uuids), table_detail['HAWQRecords'], table_detail['Records']))
            table_detail['HAWQRecords'] = len(old_uuids)
            writer1.writerow(table_detail)
def main_etl_routine_csv(writer1, HAWQCursor, recreate_table, truncate_table):
    """Load exported perf-counter CSV files (one per object id) into HAWQ.

    Builds a fixed samples_vfiler schema, then for each object id reads the
    matching CSV (see get_rows_from_csv), optionally (re)creates/truncates the
    target table, and bulk-inserts the rows.  One audit row per object id is
    written to writer1.
    """
    logger.info("Started main_etl_routine_csv")
    table_detail = {}
    column_definitions = """perfdata_id VARCHAR(32),instance_id VARCHAR(32),counter_id VARCHAR(32),interval_minutes interval,
sample_time timestamptz,sample_value decimal(36,6),sample_value_min decimal(36,6),sample_value_max decimal(36,6),sample_value_calc decimal(36,6),calc_method VARCHAR(45)"""
    table_name = "samples_vfiler"
    column_names = "perfdata_id,instance_id,counter_id,interval_minutes,sample_time,sample_value,sample_value_min,sample_value_max,sample_value_calc,calc_method"
    table_detail['TableName'] = table_name
    table_detail['Cols'] = len(column_names.split(','))
    table_detail['ColumnNames'] = column_names
    table_detail['ColumnDefinition'] = column_definitions
    # TODO: Add routine to get object list
    object_list = [19523]
    for object_id in object_list:
        table_detail['ObjectID'] = object_id
        build_create_drop_csv(table_detail, object_id)
        (csv_rows, csv_row_array) = get_rows_from_csv(table_detail, True)  # True: skip header line
        table_detail['Records'] = csv_rows
        if table_detail['Records'] > 0:
            if recreate_table:
                create_table(HAWQCursor, table_detail['DropTable'], table_detail['CreateTable'], True)
                GrantPermissions(HAWQCursor, table_detail['GrantPermissions'])
            table_detail['HAWQRecords'] = get_HAWQ_rowcount(HAWQCursor, table_name, table_detail['TSchema'])
            old_uuids = []
            if truncate_table and table_detail['HAWQRecords'] > 0:
                result = truncate_hawq_table(HAWQCursor, table_detail)
                table_detail['HAWQRecords'] = 0;
                logger.debug("Truncated Table: {}, Result: {}".format(table_detail['TableName'], result))
            elif table_detail['HAWQRecords'] > 0:
                # Existing hashes make the load incremental (skip already-loaded rows).
                old_uuids = build_uuid_map(HAWQCursor, table_detail)
            ds2dl_insert_bulk_from_array(HAWQCursor, csv_row_array, table_detail, old_uuids)
            logger.info("Table: {}, New Count: {}, Old Count: {}, Processed: {}".format(table_detail['TableName'], len(old_uuids), table_detail['HAWQRecords'], table_detail['Records']))
            table_detail['HAWQRecords'] = len(old_uuids)
            writer1.writerow(table_detail)
# _Main Function opens CSV File: DictWriter, Gets Table Definitions, and calls ETL Routine:
# Base connection strings; passwords are appended at connect time in _Main.
GHAWQ_conn_str = "dbname={0} user={1} host={2}".format('bigdata','g430878','gbl20053091.eu.hedani.net')
server = "{0}:{1}".format('usw20045038.gbl.ad.hedani.net', '2638')
GSQLA_conn_str = 'DRIVER={SQL Anywhere 16};HOST=' + server + ';DATABASE=MonitorDB;UID=PerfData'
def _Main():
    """Entry point: open the audit CSV, connect to HAWQ and SQL Anywhere, run both ETL routines.

    Fixes vs. original:
    - a failed HAWQ connection now exits immediately (previously the error was
      swallowed -- sys.exit was commented out -- and HAWQConnection.cursor()
      then crashed with NameError),
    - the SQL Anywhere connection error message no longer claims "HAWQ",
    - deprecated logging.warn replaced by logging.warning.
    """
    set_globals(default_loc)
    logger.info("Starting Main DataStream to DataLake Program")
    outfile = open(csvFile, mode='w+')
    try:
        # By default, the classes in the csv module use Windows-style line terminators (\r\n) rather than Unix-style (\n).
        logger.info("Opened CSV Dict Writer File: " + csvFile + " for logging load operation.")
        writer1 = csv.DictWriter(outfile, lineterminator='\n', fieldnames=CSV_ColumnNames)
        writer1.writeheader()
        try:  # HAWQ Connection, Use 64Bit Driver from http://www.stickpeople.com/projects/python/win-psycopg/
            HAWQ_conn_str = "{0} password={1}".format(GHAWQ_conn_str, hawq_pass1)
            HAWQConnection = psycopg2.connect(HAWQ_conn_str)
            HAWQConnection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        except psycopg2.Error as e:
            errMsg = "{0}: Error while connecting to the HAWQ database, pgError {1}, Diag Msg: {2}".format(1001, e.pgerror, e.diag.message_primary)
            logging.warning(errMsg)
            sys.exit(errMsg)
        HAWQCursor = HAWQConnection.cursor()
        try:  # SQL Anywhere Connection via ODBC, http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyodbc
            SQLA_conn_str = "{0};PWD={1}".format(GSQLA_conn_str, sqla_pass1)
            SQLAConnection = pyodbc.connect(SQLA_conn_str)
        except pyodbc.Error as e:
            errMsg = "{0}: Error while connecting to the SQL Anywhere database, Error {1}".format(1002, e)
            logging.warning(e)
            sys.exit(errMsg)
        SQLACursor = SQLAConnection.cursor()
        # This is main ETL routine for SQL Data
        main_etl_routine_sql(writer1, HAWQCursor, SQLACursor, False, False)
        SQLAConnection.close()
        # This is main ETL routine for CSV Perf Data
        main_etl_routine_csv(writer1, HAWQCursor, False, False)
        HAWQConnection.close()
    finally:
        outfile.close()
# Main Ends here.
if __name__ == '__main__':
    try:
        _Main()
    finally:
        # Fix vs. original: `logger` only exists once set_globals() has run, so
        # an early failure in _Main raised NameError here and masked the real
        # error.  logging.getLogger returns the same 'ETL_DFM' logger safely.
        logging.getLogger('ETL_DFM').info("Completed Main")
# [loggers]
# keys=root,ETL_DFM
#
# [handlers]
# keys=consoleHandler
#
# [formatters]
# keys=simpleFormatter
#
# [logger_root]
# level=DEBUG
# handlers=consoleHandler
#
# [logger_ETL_DFM]
# level=DEBUG
# handlers=consoleHandler
# qualname=ETL_DFM
# propagate=0
#
# [handler_consoleHandler]
# class=StreamHandler
# level=DEBUG
# formatter=simpleFormatter
# args=(sys.stdout,)
#
# [formatter_simpleFormatter]
# format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
# datefmt=
#
# [Authentication]
# hawq_pass1=
# sqla_pass1=
#
# [Element Managers]
# NetApp7M=
# NetAppCM=