forked from lundmb/ExoFOP-Tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_downloader.py
69 lines (62 loc) · 2.64 KB
/
file_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
import pandas as pd
import requests
from bs4 import BeautifulSoup
from os import listdir
# Allows for large-scale downloading of files with predictable file names.
# Uses the ExoFOP imaging, spectroscopy, or time-series observation table to
# get the list of TIC IDs to check for files.
# Get the downloadable file links for a single TIC ID
def file_get(TIC, file_substring=None):
    """Return the downloadable file links listed on the ExoFOP page of *TIC*.

    The target page is fetched, every ``<a>`` element is collected, and only
    links whose href contains ``get_file`` and whose link text contains
    *every* entry of ``file_substring`` are kept.

    Args:
        TIC: Target identifier; it is converted with ``str()`` and appended
            to the target-page URL.
        file_substring: A string or an iterable of strings that must all
            occur (as literal text, not as regular expressions) in the link
            text. When omitted, a module-level ``file_substring`` variable is
            used if one exists (the behaviour of the earlier one-argument
            version of this function); otherwise ``(".tbl",)``.

    Returns:
        A pandas DataFrame with the columns ``url`` (the href) and ``text``
        (the link text) for the matching links.
    """
    if file_substring is None:
        # Backward compatibility: this function used to read the global.
        file_substring = globals().get("file_substring", (".tbl",))
    if isinstance(file_substring, str):
        # A bare string would otherwise be iterated character by character.
        file_substring = [file_substring]

    url = "https://exofop.ipac.caltech.edu/tess/target.php?id=" + str(TIC)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = [[link.get('href'), link.get_text()] for link in soup.find_all('a')]
    df = pd.DataFrame(links, columns=['url', 'text'])
    # Anchors without an href give None and cannot be string-matched.
    df = df.dropna(how='any')
    df = df[df['url'].str.contains("get_file", regex=False)]
    for substring in file_substring:
        # regex=False: "." in ".tbl" must match a literal dot only.
        df = df[df['text'].str.contains(substring, regex=False)]
    return df


def bulk_download(obs_type, file_dir=".", user=None, file_substring=(".tbl",),
                  verbose=None):
    """Download every matching file for the targets in an ExoFOP table.

    The pipe-separated observation table for the chosen category is read,
    optionally filtered by uploader, and for each distinct ``TIC ID`` the
    links returned by ``file_get`` are downloaded into ``file_dir`` unless a
    file of the same name is already there.

    Args:
        obs_type: Observation category. Only the first character is
            inspected, case-insensitively: ``i`` (imaging), ``s``
            (spectroscopy) or ``t`` (time series).
        file_dir: Directory that receives the files and is checked for
            earlier downloads. It is created when it does not exist.
        user: When truthy, only rows whose ``User`` column contains this
            text are kept (pandas' default pattern matching applies). Rows
            without a user are dropped by the filter.
        file_substring: A string or an iterable of strings that must all
            occur literally in a file's link text.
        verbose: Print intermediate tables when true. When omitted, a
            module-level ``verbose`` variable is honoured if one exists
            (the earlier version read that global); otherwise ``False``.

    Raises:
        ValueError: If ``obs_type`` does not name a known category. The
            check happens before any file-system or network access.
    """
    from os import makedirs
    from os.path import basename, join

    page_terms = {"i": "imaging", "s": "spect", "t": "tseries"}
    # Slicing instead of indexing keeps an empty obs_type from raising IndexError.
    page_term = page_terms.get(str(obs_type)[:1].lower())
    if not page_term:
        raise ValueError("Not a valid category of observations")

    if verbose is None:
        # Backward compatibility with the script-level ``verbose`` global.
        verbose = bool(globals().get("verbose", False))
    if isinstance(file_substring, str):
        file_substring = [file_substring]

    makedirs(file_dir, exist_ok=True)
    existing_files = set(listdir(file_dir))

    source_file = ("https://exofop.ipac.caltech.edu/tess/download_"
                   + page_term + ".php?sort=id&output=pipe")
    print(source_file)
    obs_df = pd.read_csv(source_file, sep='|')
    if user:
        # na=False: a missing user must yield False, not NaN, or the boolean
        # mask cannot be applied.
        obs_df = obs_df[obs_df['User'].str.contains(user, na=False)]
    if verbose:
        print(obs_df)

    # dict.fromkeys removes duplicate IDs while keeping first-seen order.
    observed_TIC_list = list(dict.fromkeys(obs_df['TIC ID'].values))
    if verbose:
        print(observed_TIC_list, len(observed_TIC_list))
    print("%.0f TOIs observed" % len(observed_TIC_list))

    downloads = 0
    for TIC in observed_TIC_list:
        TIC_table = file_get(TIC, file_substring)
        if verbose:
            print(TIC_table)
            print(TIC_table['url'].values)
        for href, text in zip(TIC_table['url'], TIC_table['text']):
            # The name comes from remote HTML: strip any directory part so a
            # crafted link text cannot write outside file_dir.
            file_name = basename(text)
            if not file_name:
                continue
            if file_name in existing_files:
                print("Already downloaded %s" % file_name)
                continue
            print("Downloading %s" % file_name)
            response = requests.get(
                "https://exofop.ipac.caltech.edu/tess/" + href,
                allow_redirects=True,
            )
            if not response.ok:
                # Do not save an HTTP error page as if it were the data file.
                print("Failed to download %s (HTTP %s)"
                      % (file_name, response.status_code))
                continue
            with open(join(file_dir, file_name), 'wb') as out_file:
                out_file.write(response.content)
            # Remember the file so a repeated link is not fetched twice.
            existing_files.add(file_name)
            downloads += 1
    print("%.0f files downloaded" % downloads)
if __name__ == '__main__':
    # Settings for a script run. `verbose` and `file_substring` have to stay
    # module-level names: the download functions look them up as globals.
    file_substring = [".tbl", "-dc"]  # all of these must occur in a file name
    user = "ciardi"  # keep only table rows whose User column contains this
    file_dir = "ExoFOP_data"  # where files are saved and earlier downloads are looked for
    obs_type = "Imaging"  # one of: Imaging, Spectroscopy, Time Series
    verbose = False  # True prints the intermediate tables
    bulk_download(
        obs_type,
        file_dir,
        file_substring=file_substring,
        user=user,
    )