-
Notifications
You must be signed in to change notification settings - Fork 0
/
importClusters.py
84 lines (67 loc) · 2.76 KB
/
importClusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import csv
import pickle
import pyroprinting
import config
def loadIsolateList(filename):
    """Check an isolate-id list against the cached isolate set and pickle the
    intersection.

    Reads a single comma-separated line of (possibly single-quoted) isolate
    ids from `filename`, compares it with the isolates cached in
    "isolatesAll.pickle", prints the ids present on only one side, and writes
    the shared isolates to "isolatesShared.pickle".

    Args:
        filename: path to a text file whose first line is a comma-separated
            list of isolate ids.
    """
    # NOTE: the previous version loaded the config here but never used it;
    # the isolate cache filename is hard-coded, so the call was dropped.
    isolates = pyroprinting.loadIsolatesFromFile("isolatesAll.pickle")
    isolateIdMap = {iso.name.strip(): iso for iso in isolates}
    with open(filename) as listFile:
        # ids may be padded with whitespace and wrapped in single quotes
        isoIds = {isoId.strip().strip("'").strip() for isoId in listFile.readline().split(',')}
    missingIsos = [iso.name for iso in isolates if iso.name.strip() not in isoIds]
    extraIsos = [isoId for isoId in isoIds if isoId not in isolateIdMap]
    print(extraIsos)
    print("extraIsoCount: {}".format(len(extraIsos)))
    print(missingIsos)
    print("missingIsoCount: {}".format(len(missingIsos)))
    sharedIsos = [iso for iso in isolates if iso.name.strip() in isoIds]
    print("{}/{} shared".format(len(sharedIsos), len(isolates)))
    with open("isolatesShared.pickle", mode='w+b') as cacheFile:
        pickle.dump(sharedIsos, cacheFile)
def loadFromCSV(filename, outfile):
    """Parse one of Aldrin's cluster csv files and pickle the clusters.

    Rows before the "Cluster Id" header row are skipped. After the header,
    column 0 is a cluster id and column 1 an isolate id; consecutive rows
    with the same cluster id are grouped into one set of Isolate objects.
    Only the first clustering in the file is read — a row starting with
    "Threshold:" or equal to "******" marks the start of another clustering
    and stops parsing. Isolate ids not found in the loaded isolate set are
    reported and skipped.

    Args:
        filename: path to the input csv file.
        outfile: path the resulting list of sets is pickled to.
    """
    cfg = config.loadConfig()
    isolates = pyroprinting.loadIsolates(cfg)
    isolateIdMap = {iso.name.strip(): iso for iso in isolates}
    clusters = []
    with open(filename) as csvFile:
        # drop blank lines before handing the text to the csv reader
        csvLines = "".join(line for line in csvFile if line.strip()).splitlines(True)
        csvReader = csv.reader(csvLines, delimiter=',')
        pastHeader = False  # because Aldrin's csv files have some header rows
        currentClusterId = None
        currentCluster = None
        for row in csvReader:
            if row[0] == "Cluster Id":
                pastHeader = True
            elif pastHeader:
                if row[0].startswith("Threshold:") or row[0] == "******":
                    print("Multiple clusterings detected in file. Skipping the rest.")
                    break
                isoId = row[1].strip()
                if isoId in isolateIdMap:
                    if row[0] != currentClusterId:
                        # a new cluster id starts a new set
                        currentClusterId = row[0]
                        currentCluster = set()
                        clusters.append(currentCluster)
                    currentCluster.add(isolateIdMap[isoId])
                else:
                    print("extra isolate: {}".format(isoId))
    print(len(clusters))
    with open(outfile, mode='w+b') as cacheFile:
        pickle.dump(clusters, cacheFile)
def getOHClustClusters(threshold):
    """Load and return the OHClust! clusters cached for `threshold`.

    Reads "ohclust{threshold}.pickle" (as written by loadFromCSV), e.g.
    threshold 99 -> "ohclust99.pickle".

    Raises:
        FileNotFoundError: if the cache file does not exist.
    """
    # 'rb' rather than 'r+b': the cache is only read here, never written,
    # so there is no reason to open it for update.
    with open("ohclust{}.pickle".format(threshold), mode='rb') as cacheFile:
        ohclustClusters = pickle.load(cacheFile)
    return ohclustClusters
def getAgglomerativeClusters():
    """Load and return the agglomerative clustering cached in
    "agglomerative99.pickle" (as written by loadFromCSV).

    Raises:
        FileNotFoundError: if the cache file does not exist.
    """
    # 'rb' rather than 'r+b': this function only reads the cache.
    with open("agglomerative99.pickle", mode='rb') as cacheFile:
        agglomerativeClusters = pickle.load(cacheFile)
    return agglomerativeClusters
if __name__ == '__main__':
    # Sanity-check the isolate id list first, then import each of Aldrin's
    # clusterings into its pickle cache.
    loadIsolateList("database_iso_ids")
    conversions = [
        ("aldrins/database_oh_clusters_josh_ont_99.csv", "ohclust99.pickle"),
        ("aldrins/database_oh_clusters_josh_ont_995.csv", "ohclust995.pickle"),
        ("aldrins/database_oh_clusters_agglomerative_99.csv", "agglomerative99.pickle"),
    ]
    for csvPath, cachePath in conversions:
        loadFromCSV(csvPath, cachePath)