-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_mixtures.py
65 lines (52 loc) · 1.36 KB
/
generate_mixtures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# This script takes pseudobulk RNA-seq data (e.g. MCA) and generates mixtures at known percentages, for testing the accuracy of tools such as CIBERSORT
# AUTHOR: Adam Reid
# Copyright (C) 2019 Genome Research Ltd.
# This program is distributed under the terms of the GNU General Public License
import sys
import pandas as pd
# Stage proportions for each synthetic mixture, keyed by mixture name.
# The name spells out the percentages: F=Female, M=Male, R=Ring,
# T=Trophozoite, S=Schizont (e.g. 'F60M10R10T10S10' is 60% Female and 10%
# of every other stage).
mixtures = {
    'F20M20R20T20S20': {
        'Female': 0.2,
        'Male': 0.2,
        'Schizont': 0.2,
        'Trophozoite': 0.2,
        'Ring': 0.2,
    },
    'F60M10R10T10S10': {
        'Female': 0.6,
        'Male': 0.1,
        'Schizont': 0.1,
        'Trophozoite': 0.1,
        'Ring': 0.1,
    },
    'F10M60R10T10S10': {
        'Female': 0.1,
        'Male': 0.6,
        'Schizont': 0.1,
        'Trophozoite': 0.1,
        'Ring': 0.1,
    },
    'F10M10R10T10S60': {
        'Female': 0.1,
        'Male': 0.1,
        'Schizont': 0.6,
        'Trophozoite': 0.1,
        'Ring': 0.1,
    },
    'F25M25R25T25S0': {
        'Female': 0.25,
        'Male': 0.25,
        'Schizont': 0,
        'Trophozoite': 0.25,
        'Ring': 0.25,
    },
}


def make_mixtures(pb, specs):
    """Build one weighted-sum column per mixture from pseudobulk columns.

    Args:
        pb: DataFrame holding one column per stage named in ``specs``. Its
            index is reused as the index of the result.
        specs: Mapping of mixture name -> {stage column name: proportion}.
            Proportions are applied as given; they are neither normalised
            nor checked to sum to 1.

    Returns:
        A DataFrame indexed like ``pb`` with one column per mixture, in the
        iteration order of ``specs``. A mixture with no stages produces no
        column.

    Raises:
        KeyError: If a mixture refers to a stage that is not a column of
            ``pb``.
    """
    mix_df = pd.DataFrame(index=pb.index)
    for name, proportions in specs.items():
        # Fail early with the mixture name and every missing stage, rather
        # than the bare column name a failed pb[...] lookup would report.
        missing = [stage for stage in proportions if stage not in pb.columns]
        if missing:
            raise KeyError(
                f'mixture {name}: stage column(s) {missing} not found in '
                f'pseudobulk data'
            )

        # Accumulate the terms in spec order so the floating-point result
        # matches a plain left-to-right running sum.
        total = None
        for stage, weight in proportions.items():
            term = pb[stage] * weight
            total = term if total is None else total + term

        if total is not None:
            mix_df[name] = total
    return mix_df


def main(argv=None):
    """Read a pseudobulk table, build the mixtures and write them out.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first is the tab-separated pseudobulk
            file (its first column is used as the index), the second is
            the output stem. Any further arguments are ignored.

    The result is written tab-separated to ``<outstem>.dat``. Exits with a
    usage message if fewer than two arguments are supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <pseudobulk_tsv> <outstem>')
    pb_file, outstem = args[0], args[1]

    pb = pd.read_csv(pb_file, sep='\t', index_col=0)
    mix_df = make_mixtures(pb, mixtures)
    mix_df.to_csv(f'{outstem}.dat', sep='\t')


if __name__ == '__main__':
    main()