-
Notifications
You must be signed in to change notification settings - Fork 0
/
NIST-scraper.py
125 lines (116 loc) · 4.55 KB
/
NIST-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
"""
Used to scrape enthalpies from the NIST database.
"""
import os
import pickle
import sys
from argparse import ArgumentParser

import requests
from lxml import etree
from lxml import html
NIST_HOST = 'http://webbook.nist.gov'
NIST_QUERY_URL = NIST_HOST + '/cgi/cbook.cgi'
# Query arguments sent with every lookup, after the ID/Name argument.
NIST_QUERY_TAIL = [('Units', 'CAL'), ('cTG', 'on')]
TIMEOUT = 100  # handed to requests.get() as its timeout, for every request
INPUT_NAME = 'NIST-CHONP.txt'
OUTPUT_NAME = 'NIST-CHONP.pickle'


def progress_text(index, total):
    """Return the progress-bar text shown while processing row *index*.

    The bar is 20 characters wide; the count is the zero-based row index
    followed by the total number of rows and a percentage.
    """
    bar = '=' * (index * 20 // total + 1)
    return "[%-20s] %d/%d %d%%" % (bar, index, total,
                                   index * 100 // total + 1)


def split_record(raw_line):
    """Split one tab-separated input line into its fields.

    Returns None for a blank line so the caller can skip it instead of
    failing on a missing column.
    """
    if not raw_line.strip():
        return None
    return raw_line.rstrip('\r\n').split('\t')


def parse_enthalpy(text):
    """Parse the text of a property cell into ``[value, CI]``.

    A three-token text (value, separator, uncertainty) gives both numbers;
    anything else gives ``[value, False]``. Returns None when the cell has
    no text at all (the original comment cites Cubane as such a case).
    """
    tokens = text.split() if text else []
    if not tokens:
        return None
    if len(tokens) == 3:
        return [float(tokens[0]), float(tokens[2])]
    return [float(tokens[0]), False]


def fetch_tree(key, value):
    """Query the NIST page for ``key=value`` and return the parsed HTML tree.

    The query is passed through ``params`` so that requests URL-encodes the
    value; a timeout is applied to every request.
    """
    query = [(key, value)] + NIST_QUERY_TAIL
    page = requests.get(NIST_QUERY_URL, params=query, timeout=TIMEOUT)
    return html.fromstring(page.content)


def lookup_species(fields):
    """Return the parsed page for a record, or None if no page was found.

    The CAS number (column 7) is tried first unless it is 'N/A' or absent;
    if the page reports an unknown registry number, the name (column 1) is
    tried instead.
    """
    cas = fields[6] if len(fields) > 6 else 'N/A'
    if cas != 'N/A':
        tree = fetch_tree('ID', cas)
        if not tree.xpath('//meta[@content="Registry Number Not Found"]'):
            return tree
    # NOTE(review): as written, each replacement maps a Greek letter to
    # itself, so these calls change nothing. They are kept verbatim --
    # confirm whether a different source/target spelling was intended.
    fields[0] = fields[0].replace('α', 'α')
    fields[0] = fields[0].replace('β', 'β')
    fields[0] = fields[0].replace('γ', 'γ')
    fields[0] = fields[0].replace('δ', 'δ')
    tree = fetch_tree('Name', fields[0])
    if tree.xpath('//meta[@content="Name Not Found"]'):
        return None
    return tree


def extract_enthalpies(tree):
    """Collect ``[value, CI]`` entries from the one-dimensional data table.

    Returns an empty list when the table or a matching row is missing.
    """
    found = []
    tables = tree.xpath('//table[@aria-label="One dimensional data"]')
    if not tables:
        return found
    label_cells = tables[0].xpath('.//td[1]')
    value_cells = tables[0].xpath('.//td[2]')
    # zip() keeps the index pairing of the two cell lists but cannot run
    # past the shorter one.
    for label_cell, value_cell in zip(label_cells, value_cells):
        # NOTE(review): the label is compared verbatim with the cell text;
        # confirm this literal matches what the page actually renders.
        if label_cell.text_content() == u'fH\xb0gas':
            entry = parse_enthalpy(value_cell.text)
            if entry is not None:
                found.append(entry)
    return found


def extract_structure_urls(tree):
    """Return ``[2d_url, 3d_url]`` for the structure files, or ``[]``.

    The list is empty when the page has no "Chemical structure:" item.
    A missing file is recorded as ``[False]``.
    """
    urls = []
    items = tree.xpath('//li[contains(.,"Chemical structure:")]')
    if not items:
        return urls
    for link_text in ('2d Mol file', '3d SD file'):
        # Relative XPath: only look at links inside the structure item.
        links = items[0].xpath('.//a[text()="%s"]' % link_text)
        if links:
            urls.append(NIST_HOST + links[0].get('href'))
        else:
            # NOTE(review): the placeholder is the one-element list
            # [False] (which is truthy), kept so the pickled layout does
            # not change for existing readers.
            urls.append([False])
    return urls


def extract_stereoisomers(tree):
    """Return ``[name, url]`` pairs for the links in the stereoisomer item."""
    pairs = []
    items = tree.xpath('//li[contains(.,"Stereoisomers:")]')
    if not items:
        return pairs
    for link in items[0].xpath('.//a'):
        # str() yields a plain string; presumably this is what the original
        # no-op isinstance/`name = name` statement was meant to achieve.
        name = str(link.text_content())
        pairs.append([name, NIST_HOST + link.get('href')])
    return pairs


def scrape_record(fields):
    """Look up one record and append its status (and data) to *fields*.

    Appends 'WebpageLoadingFailed' when no page is found, 'DHfMissing'
    when the page has no enthalpy entry, and otherwise 'Good' followed by
    ``[dh, sf_url, siso]``. Returns the same list object.
    """
    tree = lookup_species(fields)
    if tree is None:
        fields.append('WebpageLoadingFailed')
        return fields
    dh = extract_enthalpies(tree)
    if not dh:
        fields.append('DHfMissing')
        return fields
    fields.append('Good')
    fields.append([dh,
                   extract_structure_urls(tree),
                   extract_stereoisomers(tree)])
    return fields


def main():
    """Read the species list, scrape every record, and pickle the results.

    The input list is read from, and the pickle written to, the folder
    given on the command line.
    """
    parser = ArgumentParser()
    parser.add_argument('output_folder',
                        help='Folder to save downloaded files')
    args = parser.parse_args()
    folder = args.output_folder

    # os.path.join works whether or not the folder has a trailing separator.
    with open(os.path.join(folder, INPUT_NAME), 'r') as fobjin:
        lines = fobjin.readlines()

    data = []
    total = len(lines)
    for index, raw_line in enumerate(lines):
        sys.stdout.write('\r')
        sys.stdout.write(progress_text(index, total))
        sys.stdout.flush()
        fields = split_record(raw_line)
        if fields is None:
            continue
        data.append(scrape_record(fields))

    # The output is opened only once all data is collected, so a failure
    # during scraping does not truncate an existing pickle.
    with open(os.path.join(folder, OUTPUT_NAME), 'wb') as fobjout:
        pickle.dump(data, fobjout)


if __name__ == '__main__':
    main()