-
Notifications
You must be signed in to change notification settings - Fork 1
/
getInputData.py
133 lines (110 loc) · 4.54 KB
/
getInputData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from pandas import pandas as pd
import numpy as np
# Input / output locations (relative paths).
bedIntersectPath = "../results/bedIntersectWaWbTFBSinGenes.bed"
filteredBedIntersectPath = "../results/bedIntersectWaWbTFBSinGenesFiltered.tsv"
tfbsCountsInTissuesPath = "../results/tfbsCountsInTissues.tsv"
tfAndTissuesPath = "../results/tfAndTissues.tsv"
tfAndTissuesTopPath = "../results/tfAndTissues-top.tsv"
genesPerTissueFolder = "../input/genesPerTissue"

# Tissue identifiers. Each one is also the base name of the "<tissue>.txt"
# gene list read from genesPerTissueFolder, so the spelling (including
# "thyriod") must match the file names on disk.
tissueNames = [
    "adrenal_gland",
    "brain",
    "breast",
    "colon",
    "heart",
    "kidney",
    "leukocyte",
    "liver",
    "lung",
    "lymph_node",
    "ovary",
    "prostate",
    "skeletal_muscle",
    "testis",
    "thyriod",
]

# Lookup tables filled in by the functions of this script.
genesPerTissue = {}   # tissue name -> list of gene names
tissuePerGene = {}    # gene name -> list of tissue names
geneNames = set()     # every gene name seen in any tissue file
tfsInTissue = {}      # tissue name -> {TF name -> binding-site count}
tfs = set()           # every TF name seen in the input records
def readGenesFromTissue(tissueName):
    """Load the gene list of one tissue into the module-level lookup tables.

    Reads ``<genesPerTissueFolder>/<tissueName>.txt``, treating every line
    (with its newline removed) as one gene name, and records each gene in
    ``genesPerTissue``, ``tissuePerGene`` and ``geneNames``.

    Args:
        tissueName: Tissue identifier; also the base name of the input file.

    Raises:
        IOError/OSError: If the tissue file cannot be opened or read.
    """
    # NOTE(review): the file is read on every call, so calling this twice for
    # the same tissue appends its genes a second time.
    if tissueName not in genesPerTissue:
        genesPerTissue[tissueName] = list()
    fileName = genesPerTissueFolder + "/" + tissueName + ".txt"
    # The context manager closes the file even if reading fails part-way.
    with open(fileName, 'r') as f:
        for line in f:
            gene = line.replace('\n', '')
            genesPerTissue[tissueName].append(gene)
            if gene not in tissuePerGene:
                tissuePerGene[gene] = list()
            tissuePerGene[gene].append(tissueName)
            geneNames.add(gene)
def readAllTissueGenes():
    """Read the gene list file of every tissue listed in ``tissueNames``."""
    for tissue in tissueNames:
        readGenesFromTissue(tissue)
def addTfbsToTfCounts(row):
    """Tally one binding-site record into the per-tissue TF counters.

    The TF name is added to ``tfs``; then, for every tissue whose gene list
    contains the record's gene, that tissue's counter for the TF is
    incremented by one.

    Args:
        row: Mapping-like record providing 'geneFullName' and 'tfName'.
    """
    gene = row['geneFullName']
    factor = row['tfName']
    tfs.add(factor)
    for tissue in tissueNames:
        if gene not in genesPerTissue[tissue]:
            continue
        counters = tfsInTissue[tissue]
        counters[factor] = counters.get(factor, 0) + 1
def mostUsedInTissue(row):
    """Return the tissue name whose column holds the largest value in *row*.

    Ties resolve to the tissue that comes first in ``tissueNames`` because a
    later tissue only wins with a strictly greater value.
    """
    leader = tissueNames[0]
    for candidate in tissueNames:
        leader = candidate if row[candidate] > row[leader] else leader
    return leader
def calcDiffFactor(row):
    """Return the count in the row's most-used tissue divided by the row mean.

    Args:
        row: Mapping-like record with 'mostUsedInTissue' (a column name),
            'mean', and the column that 'mostUsedInTissue' names.

    Returns:
        ``row[mostUsed] / mean``, or NaN when the mean is zero (the ratio is
        undefined there and plain Python numbers would raise
        ZeroDivisionError).
    """
    mostUsed = row['mostUsedInTissue']
    mean = row['mean']
    if mean == 0:
        return float('nan')
    return row[mostUsed] / mean
def countTfbsInTissues(df):
    """Count binding sites per transcription factor and tissue.

    Resets the per-tissue counters in ``tfsInTissue``, feeds every row of
    *df* to ``addTfbsToTfCounts`` and assembles the counts into a table.

    Args:
        df: DataFrame of binding-site records; ``addTfbsToTfCounts`` reads
            the 'tfName' and 'geneFullName' columns of each row.

    Returns:
        DataFrame with one row per TF name in ``tfs`` (sorted by name) and
        the columns 'tfName', one count column per tissue, 'totalBS',
        'median' and 'mean' (the last two taken over the tissue columns).

    Note:
        ``tfs`` is not cleared here, so TF names collected by earlier calls
        still get a row.
    """
    for tissueName in tissueNames:
        tfsInTissue[tissueName] = dict()

    # Explicit iteration instead of DataFrame.apply: the callback exists only
    # for its side effects, and older pandas releases invoke an applied
    # function twice on the first row, which would inflate its counts.
    for _, record in df.iterrows():
        addTfbsToTfCounts(record)

    print("Creating dataframe")
    cols = ['tfName'] + list(tissueNames) + ['totalBS']

    rows = []
    # Set iteration order is arbitrary; sorting keeps the output stable
    # from run to run.
    for tfName in sorted(tfs):
        row = {'tfName': tfName}
        totalBS = 0
        for tissueName in tissueNames:
            amount = tfsInTissue[tissueName].get(tfName, 0)
            row[tissueName] = amount
            totalBS += amount
        row['totalBS'] = totalBS
        rows.append(row)

    tfDf = pd.DataFrame(rows, columns=cols)
    tissueCounts = tfDf[list(tissueNames)]
    tfDf['median'] = tissueCounts.median(axis=1)
    tfDf['mean'] = tissueCounts.mean(axis=1)
    # NOTE: the per-TF 'mostUsedInTissue' / 'diffFactor' columns are not
    # added here; that step is disabled.
    return tfDf
def getUniquenessPerTF(row, newDFRows):
    """Append one (TF, tissue, diffFactor) record per tissue to *newDFRows*.

    ``diffFactor`` is the TF's count in the tissue divided by the row's
    'mean' value.

    Args:
        row: Mapping-like record with 'tfName', 'mean' and one count per
            name in ``tissueNames``.
        newDFRows: List that receives the new dict records (mutated in place).
    """
    mean = row["mean"]
    for tissueName in tissueNames:
        tfbsCount = row[tissueName]
        if mean == 0:
            # A zero mean makes the ratio undefined; plain Python numbers
            # would raise ZeroDivisionError here, so record NaN instead.
            diffFactor = float('nan')
        else:
            diffFactor = tfbsCount / mean
        newDFRows.append({
            "tfName": row["tfName"],
            "tissue": tissueName,
            "diffFactor": diffFactor,
        })
def calcExpressionForEachTissue(tFactorDF):
    """Build a long-format table of per-tissue diff factors.

    Every row of *tFactorDF* is passed to ``getUniquenessPerTF``, which
    appends its records to a shared list; the list becomes a DataFrame.

    Args:
        tFactorDF: Per-TF table; ``getUniquenessPerTF`` reads 'tfName',
            'mean' and the tissue count columns of each row.

    Returns:
        DataFrame with columns 'tfName', 'tissue' and 'diffFactor', sorted
        by 'diffFactor' in descending order.
    """
    newDFRows = []
    # Explicit iteration: the callback is used only for its side effect of
    # appending to newDFRows, which DataFrame.apply is not meant for.
    for _, record in tFactorDF.iterrows():
        getUniquenessPerTF(record, newDFRows)
    diffFactorDF = pd.DataFrame(newDFRows, columns=['tfName', 'tissue', 'diffFactor'])
    if hasattr(diffFactorDF, 'sort_values'):
        # DataFrame.sort no longer exists in current pandas.
        diffFactorDF.sort_values(by='diffFactor', ascending=False, inplace=True)
    else:
        # Legacy pandas releases that predate sort_values.
        diffFactorDF.sort(['diffFactor'], inplace=True, ascending=False)
    return diffFactorDF
# --- Script body: runs on import/execution, in this order. ---

# 1. Fill genesPerTissue / tissuePerGene / geneNames from the tissue files.
print("Reading tissue genes lists")
readAllTissueGenes()
# 2. Load the filtered intersect table (tab-separated) and count binding
#    sites per TF and tissue.
rawDF = pd.read_csv(filteredBedIntersectPath, sep='\t')
tFactorDF = countTfbsInTissues(rawDF)
print(tFactorDF.head())
# 3. Derive the per-(TF, tissue) diff factors from the count table.
expressionOnTissuesDF = calcExpressionForEachTissue(tFactorDF)
print("Writing result files")
# Only the count table is written; the other exports below are disabled.
tFactorDF.to_csv(tfbsCountsInTissuesPath, sep='\t', index=False)
#expressionOnTissuesDF.to_csv(tfAndTissuesPath, sep='\t', index=False)
# Keep the (TF, tissue) pairs whose diffFactor is at or above the 95th
# percentile. The result is currently not written anywhere.
diffFactorQuantile = expressionOnTissuesDF['diffFactor'].quantile(0.95)
tfAndTissuesTop = expressionOnTissuesDF[expressionOnTissuesDF.diffFactor >= diffFactorQuantile]
#tfAndTissuesTop.to_csv(tfAndTissuesTopPath, sep='\t', index=False)
# NOTE(review): the disabled lines below refer to names (createFilteredDf,
# df_sliced, genesIdentified, genesRead) that are not defined in this file.
#createFilteredDf()
#df_sliced.to_csv(filteredBedIntersectPath, sep='\t')
#print(str(genesIdentified/genesRead))