-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path211209_EDA01_yn.py
117 lines (100 loc) · 4.47 KB
/
211209_EDA01_yn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
# coding: utf-8
# This EDA script derives basic statistics for each of the 300 lab components: total count, accepted count, min, max, mean, and standard deviation.
import csv
import glob
import io
import math
import os
import statistics

import numpy as np
import pandas as pd
import tqdm
# Yearly lab export files to scan; the file names differ only in the year
# suffix, which runs from 2009 through 2020 inclusive.
listOfYearlyImportFiles = [
    'Data/lab_data/req1512_lab_{}.csv'.format(exportYear)
    for exportYear in range(2009, 2021)
]
print(len(listOfYearlyImportFiles))
print(listOfYearlyImportFiles)
# Load the lab crosswalk table. Each row supplies one primary component name
# plus up to four related component names.
componentDataDF = pd.read_csv('Data/211209_Lab_Crosswalk_YN.csv')
componentDataDF.head()  # result is discarded when run as a script
relatedComponentColumns = (
    'Related Components 0',
    'Related Components 1',
    'Related Components 2',
    'Related Components 3',
)
# Flatten the crosswalk into a single list: for every row, the primary
# component first, followed by whichever related components are present.
listOfLabComponents = []
for _, crosswalkRow in componentDataDF.iterrows():
    listOfLabComponents.append(crosswalkRow['First Alphabetical Component'])
    # Only string cells are kept; non-string cells (presumably NaN for blank
    # entries - confirm against the crosswalk file) are skipped.
    listOfLabComponents.extend(
        crosswalkRow[columnName]
        for columnName in relatedComponentColumns
        if isinstance(crosswalkRow[columnName], str)
    )
print('number of lab components:', len(listOfLabComponents))
# Debug aid for restarting partway through the component list:
#print(listOfLabComponents[250])
#listOfLabComponents=listOfLabComponents[250:]
# function to preprocess list to remove nonsensical inputs and get only numerical values
def filterList(oldList):
    """Return the finite numeric values found in ``oldList`` as floats.

    Each item is converted to text, surrounding whitespace and any leading or
    trailing ``<`` / ``>`` markers are removed (so ``'<5'`` is read as
    ``5.0``), and the remainder is parsed with ``float``. Items that do not
    parse are dropped, as are values that parse to NaN or infinity, because
    either would corrupt the summary statistics computed downstream.

    Args:
        oldList: Iterable of raw values. Items are usually strings, but any
            object is accepted and is stringified before parsing.

    Returns:
        list: The parsed floats, in their original order.
    """
    numericValues = []
    for rawItem in oldList:
        # Strip whitespace first so a marker hidden behind a space (' <5')
        # is still removed, then strip again in case of '< 5'.
        cleanedText = str(rawItem).strip().strip('<>').strip()
        try:
            parsedValue = float(cleanedText)
        except ValueError:
            # Free-text results ('see note', 'negative', '') are not numeric.
            continue
        if math.isfinite(parsedValue):
            numericValues.append(parsedValue)
    return numericValues
# get basic parameters of list like min, max, mean, standard deviation
def analyzeList(thisList):
    """Summarize a sequence of numbers.

    Args:
        thisList: Sized sequence of numeric values (may be empty).

    Returns:
        dict: Keys ``'acceptedCount'`` (``len(thisList)``), ``'min'``,
        ``'max'``, ``'mean'`` and ``'stdev'`` (sample standard deviation).
        Any statistic that cannot be computed is reported as ``-1``: all four
        for an empty sequence, and ``'stdev'`` alone when there is only one
        value.
    """
    def _statOrSentinel(statFunc):
        # min/max raise ValueError on empty input; statistics.StatisticsError
        # is a ValueError subclass; TypeError covers non-numeric entries.
        # Catching only these keeps the -1 fallback without swallowing
        # KeyboardInterrupt/SystemExit, which a bare ``except`` would turn
        # into a silent -1 in the results.
        try:
            return statFunc(thisList)
        except (ValueError, TypeError):
            return -1

    return {
        'acceptedCount': len(thisList),
        'min': _statOrSentinel(min),
        'max': _statOrSentinel(max),
        'mean': _statOrSentinel(statistics.mean),
        'stdev': _statOrSentinel(statistics.stdev),
    }
# Statistic columns of the results file, in output order. The component name
# is always written first, ahead of these.
paramList = ['totalCount', 'acceptedCount', 'min', 'max', 'mean', 'stdev']
# Build one CSV row of the results file.
def genString(componentName, paramDict):
    """Format one component's statistics as a single CSV line.

    Args:
        componentName: Name of the lab component. Leading and trailing commas
            are removed; non-string values are stringified.
        paramDict: Mapping that contains every key listed in ``paramList``.

    Returns:
        str: ``name,totalCount,acceptedCount,min,max,mean,stdev`` terminated
        by ``'\\n'``. Fields containing a comma, double quote or line break
        are quoted per CSV rules so they cannot shift the columns.

    Raises:
        KeyError: If ``paramDict`` lacks one of the ``paramList`` keys.
    """
    rowFields = [str(componentName).strip(',')]
    rowFields.extend(str(paramDict[eachParam]) for eachParam in paramList)
    rowBuffer = io.StringIO()
    # lineterminator='\n' keeps the same line ending the hand-built row used.
    csv.writer(rowBuffer, lineterminator='\n').writerow(rowFields)
    return rowBuffer.getvalue()
# Main pass: for every lab component, collect its ORD_VALUE entries from all
# yearly files and append one row of summary statistics to the results file.
resultsPath = "outputs/results.csv"
# open(..., "a") creates the file but not a missing parent directory.
os.makedirs(os.path.dirname(resultsPath), exist_ok=True)
# Rows are appended so that an interrupted run can be continued. Write the
# header only when the file is new or empty; otherwise every re-run would
# insert another header line in the middle of the data.
if not os.path.exists(resultsPath) or os.path.getsize(resultsPath) == 0:
    with open(resultsPath, "a") as text_file:
        text_file.write(','.join(['component name'] + paramList) + '\n')
for eachComponent in listOfLabComponents:
    print(eachComponent)
    completeListOfORDvalues = []
    totalCount = 0
    # NOTE(review): every yearly file is re-read for every component. Reading
    # each file once and grouping by COMMON_NAME would be far faster, at the
    # cost of holding all components' values in memory at the same time.
    for eachYearlyImportFile in listOfYearlyImportFiles:
        # Only the two columns used below are loaded.
        workingDataFrame = pd.read_csv(eachYearlyImportFile, usecols=['COMMON_NAME', 'ORD_VALUE'])
        # Vectorized equality test; rows with a missing COMMON_NAME compare
        # unequal and are excluded.
        matchedRows = workingDataFrame[workingDataFrame['COMMON_NAME'] == eachComponent]
        # Stringify so filterList always receives text (missing -> 'nan').
        eachYearListOfORDvalues = matchedRows['ORD_VALUE'].astype('str').values
        totalCount += len(eachYearListOfORDvalues)  # count of cases before filtering
        eachYearListOfORDvalues = filterList(eachYearListOfORDvalues)
        print(len(eachYearListOfORDvalues))
        completeListOfORDvalues += eachYearListOfORDvalues
        print(len(completeListOfORDvalues))
    resultDict = analyzeList(completeListOfORDvalues)
    resultDict['totalCount'] = totalCount
    stringToSave = genString(eachComponent, resultDict)
    # Append immediately so finished components survive a later failure.
    with open(resultsPath, "a") as text_file:
        text_file.write(stringToSave)