-
Notifications
You must be signed in to change notification settings - Fork 6
/
tcga_downloader.py
148 lines (108 loc) · 4.64 KB
/
tcga_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import subprocess
import requests
import json
import sys
import pandas as pd
import ssl
# Unless PYTHONHTTPSVERIFY is set to a non-empty value, point the ssl module's
# default HTTPS context factory at the unverified one (when this Python
# version provides it).
# NOTE(review): this turns off certificate verification for anything that
# relies on ssl's default HTTPS context - confirm it is still required.
if not os.environ.get('PYTHONHTTPSVERIFY', ''):
    if getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context
# Example usage:
#   from tcga_downloader import *
#   ids = get_ids(manifest_file)
'''
Match tumor and normal samples to a patient through the GDC API. This provides metadata
for the associated files, including sample type (tumor/normal), patient ID (TCGA barcode), etc.
The general query structure will look like this:
curl "https://gdc-api.nci.nih.gov/files?size=100000&format=tsv&filters=XXXX&fields=YYYY"
filters= :
The XXXX will be replaced with a URL-encoded JSON query that will filter a list of files. The JSON query can be generated yourself and then URL encoded, or the filtering can be performed on the GDC Data Portal (https://gdc-portal.nci.nih.gov/search/s?facetTab=files). You can copy and paste the string that appears where the * are displayed in the following:
https://gdc-portal.nci.nih.gov/search/f?filters=**********&facetTab=cases
'''
def prepare_payload(ids, data_type=None, payloadfile='payloadv3.txt'):
    """Write the JSON request body for a GDC ``files`` query and return its path.

    The body selects the given file ids (optionally restricted to one data
    type) and asks for a TSV response holding the file, case, sample and
    aliquot fields listed below.

    Args:
        ids: Iterable of file UUIDs. Each entry may be a bare UUID or wrapped
            in double quotes (the form ``get_ids`` returns); surrounding
            whitespace and double quotes are removed before serialising.
        data_type: Value for the ``files.data_type`` filter, for example
            "Gene Expression Quantification". When ``None`` the data-type
            clause is left out instead of filtering on the literal string
            "None".
        payloadfile: Path of the file the request body is written to.

    Returns:
        The path of the payload file that was written.
    """
    # Related GDC facet values noted by the original author for reference:
    #   Workflow Type: HTSeq-Counts
    #   Data Category: transcriptome profilling
    #   Experimental Strategy: RNA-Seq
    fields = (
        "file_id",
        "file_name",
        "cases.submitter_id",
        "cases.disease_type",
        "cases.case_id",
        "data_category",
        "data_type",
        "cases.samples.tumor_descriptor",
        "cases.samples.tissue_type",
        "cases.samples.sample_type",
        "cases.samples.submitter_id",
        "cases.samples.sample_id",
        "cases.samples.portions.analytes.aliquots.aliquot_id",
        "cases.samples.portions.analytes.aliquots.submitter_id",
    )

    # Accept both quoted and unquoted ids; json.dump adds the quoting and
    # escaping that the payload needs.
    file_ids = [str(file_id).strip().strip('"') for file_id in ids]

    clauses = [
        {
            "op": "in",
            "content": {"field": "files.file_id", "value": file_ids},
        },
    ]
    if data_type is not None:
        clauses.append(
            {
                "op": "=",
                "content": {"field": "files.data_type", "value": data_type},
            }
        )

    payload = {
        "filters": {"op": "and", "content": clauses},
        "format": "TSV",
        "fields": ",".join(fields),
        # One result row per requested file; sent as a string, as before.
        "size": "%d" % len(file_ids),
    }

    with open(payloadfile, 'w') as output_:
        json.dump(payload, output_, indent=4)
    return payloadfile
def get_ids(manifest):
    """Read the file ids from a tab-separated manifest file.

    The first line is treated as a header and skipped. For every remaining
    non-blank line the first tab-separated column is taken as the file id.

    Args:
        manifest: Path to the manifest file.

    Returns:
        A list of ids, each wrapped in double quotes (e.g. ``'"abc"'``), or
        ``None`` when the manifest cannot be opened or read; the reason is
        written to stderr in that case.
    """
    try:
        with open(manifest, 'r') as input_:
            next(input_, None)  # skip the header row
            ids = []
            for line in input_:
                file_id = line.rstrip('\r\n').split('\t')[0].strip()
                # Blank lines (commonly a trailing newline) would otherwise
                # become an empty "" id in the query.
                if file_id:
                    ids.append("\"%s\"" % file_id)
            return ids
    except (OSError, ValueError, TypeError) as ex:
        # Callers rely on None to signal an unusable manifest; keep that
        # contract but say why instead of failing silently.
        sys.stderr.write('Could not read manifest %r: %s\n' % (manifest, ex))
        return None
#print(ids)
def get_metadata(payloadfile, metadata='Metadata.tsv'):
    """POST a payload file to the GDC ``files`` endpoint with curl.

    curl is started directly with an argument list (no shell), so payload and
    output paths containing spaces or shell metacharacters are passed through
    literally. curl's standard output is written to ``metadata``.

    Args:
        payloadfile: Path of the JSON request body to send (``--data @file``).
        metadata: Path of the file that receives the response.

    Returns:
        The ``metadata`` path. It is returned even when the request fails; in
        that case a message is written to stderr.
    """
    webaddr = 'https://api.gdc.cancer.gov/files'
    cmd = ['curl', '--request', 'POST',
           '--header', 'Content-Type: application/json',
           '--data', '@%s' % payloadfile, webaddr]
    print('%s > %s' % (' '.join(cmd), metadata))
    try:
        # Opening the sink first creates/truncates the file, as the previous
        # shell redirection did.
        with open(metadata, 'wb') as sink:
            status = subprocess.call(cmd, stdout=sink)
    except OSError as ex:
        # Raised when curl is not installed or the output cannot be opened.
        sys.stderr.write('Metadata request could not be run: %s\n' % ex)
    else:
        if status != 0:
            sys.stderr.write('curl exited with status %d while fetching metadata\n' % status)
    return metadata
#def get_metadatada():
# ids=process_manifest()
# if ids==None:
# print('Error encountered\nPlease ensure that you are using the correct manifest file')
# else:
# payloadfile=prepare_payload(ids)
# download_data(payloadfile)
def download_data(metadatafile, sep='\t', outdir='downloads'):
    """Download every file listed in a metadata table, grouped by sample type.

    The table must contain the columns ``file_id`` and
    ``cases.0.samples.0.sample_type``. One sub-directory of ``outdir`` is
    created per distinct sample type, and each file id in that group is
    fetched with curl from ``https://api.gdc.cancer.gov/data/<file_id>`` into
    it. Rows without a sample type are skipped.

    curl is started directly with an argument list and a ``cwd`` instead of
    going through a shell and ``os.chdir``, so the caller's working directory
    is never changed and ids from the table are not interpreted by a shell.

    Args:
        metadatafile: Path of the delimited metadata file.
        sep: Column separator of ``metadatafile``.
        outdir: Directory that receives the per-sample-type sub-directories;
            missing parent directories are created.

    Returns:
        None. Failed downloads are reported on stderr and do not stop the
        remaining downloads.
    """
    sample_col = 'cases.0.samples.0.sample_type'
    data_df = pd.read_csv(metadatafile, sep=sep)

    if not os.path.exists(outdir):
        os.makedirs(outdir)
    else:
        print('output directory exists\ndata may be overwritten')

    # groupby matches sample types exactly (the earlier substring/regex match
    # could put a file under several types) and keeps first-seen order.
    for sampletype, sel in data_df.groupby(sample_col, sort=False):
        sampledir = os.path.join(outdir, str(sampletype))
        if not os.path.exists(sampledir):
            os.mkdir(sampledir)
        else:
            print('sample type directory exists\ndata may be overwritten')

        for file_id in sel['file_id'].values:
            cmd = ['curl', '--remote-name', '--remote-header-name',
                   'https://api.gdc.cancer.gov/data/%s' % file_id]
            print('downloading %s' % file_id)
            try:
                # cwd makes curl's --remote-name write into the sample folder.
                status = subprocess.call(cmd, cwd=sampledir)
            except OSError as ex:
                sys.stderr.write('could not run curl for %s: %s\n' % (file_id, ex))
                continue
            if status != 0:
                sys.stderr.write('curl exited with status %d for %s\n' % (status, file_id))

    print('Download complete\nAll data has been downloaded to ------------->%s' % outdir)