-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_pubmed.py
131 lines (117 loc) · 5.92 KB
/
fetch_pubmed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import argparse, configparser, os, json
from pathlib import Path
import requests
from glob import glob
import lxml.etree as ET
class PubRef:
    """One publication record with its identifiers, journal and author IDs.

    Attributes:
        pubmed: PubMed identifier.
        pmc: PubMed Central identifier ("" when unknown).
        doi: DOI ("" when unknown).
        issn: Journal ISSN ("" when unknown).
        journal: Journal title ("" when unknown).
        journal_abbreviation: Journal abbreviation ("" when unknown).
        authors: List with one string per author, filled in by the caller
            after construction (starts empty).

    Equality and hashing are both based on the four identifiers
    (pubmed, doi, pmc, issn) only, so instances can safely be stored in
    sets or used as dictionary keys even though ``authors`` is a mutable
    list.
    """

    # Attributes that define the identity of a publication.
    _IDENTITY_FIELDS = ("pubmed", "doi", "pmc", "issn")

    def __init__(self, pubmed, pmc="", doi="", issn="", journal="", abbv=""):
        self.pubmed = pubmed
        self.doi = doi
        self.issn = issn
        self.pmc = pmc
        self.journal = journal
        self.journal_abbreviation = abbv
        self.authors = []

    def __eq__(self, other):
        """Return True when *other* is a PubRef with the same identifiers."""
        if not isinstance(other, self.__class__):
            return False
        return all(
            getattr(other, name, None) == getattr(self, name)
            for name in self._IDENTITY_FIELDS
        )

    def __hash__(self):
        """Hash the same identifiers that ``__eq__`` compares.

        The author list is deliberately excluded: a list is unhashable and
        it does not take part in equality either.
        """
        return hash(tuple(getattr(self, name) for name in self._IDENTITY_FIELDS))

    def __str__(self):
        """Return the record as one tab-separated line (no trailing newline).

        The six fixed columns come first (PMID, PMC, DOI, ISSN, journal,
        abbreviation), followed by one extra column per author entry.
        """
        columns = [
            str(self.pubmed),
            str(self.pmc),
            str(self.doi),
            str(self.issn),
            str(self.journal),
            str(self.journal_abbreviation),
        ]
        columns.extend(self.authors)
        return '\t'.join(columns)
def _pubref_from_result(pub_data):
    """Build a PubRef from one entry of a Europe PMC ``result`` list.

    Optional fields that are absent become "". ``authors`` receives one
    entry per listed author: the ORCID value when the author's ``authorId``
    has type "ORCID", otherwise "".
    """
    journal = (pub_data.get("journalInfo") or {}).get("journal") or {}
    pub_ref = PubRef(
        pub_data["id"],
        pub_data.get("pmcid", ""),
        pub_data.get("doi", ""),
        journal.get("issn", ""),
        journal.get("title", ""),
        journal.get("medlineAbbreviation", ""),
    )
    authors = (pub_data.get("authorList") or {}).get("author") or []
    for author in authors:
        author_id = author.get("authorId") or {}
        if author_id.get("type") == "ORCID":
            pub_ref.authors.append(author_id.get("value", ""))  # Found ORCID
        else:
            pub_ref.authors.append("")  # No ORCID
    return pub_ref


def call_ePubmedCentral(pubmed_list, uri, batch_size=None, timeout=60):
    """Query Europe PMC for a list of PubMed IDs and return PubRef records.

    The IDs are sent in batches; each batch is one POST request whose query
    is ``SRC:MED AND EXT_ID:(id1 OR id2 ...)``. A batch that fails (network
    error, non-200 status, or a body that is not valid JSON) is reported with
    a ``WARN`` line on stdout and skipped, so one bad batch does not discard
    the results of the others.

    Args:
        pubmed_list: Iterable of PubMed IDs. Empty/None entries are ignored.
        uri: URL of the Europe PMC search endpoint.
        batch_size: Number of IDs per request. When omitted, the module-level
            ``N`` set by the command-line entry point is used if it exists,
            otherwise 500.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List of PubRef objects, one per record returned by the service.

    Raises:
        ValueError: If the batch size is smaller than 1.
    """
    if batch_size is None:
        # Backward compatibility: the script publishes the batch size as the
        # module-level name N instead of passing it as an argument.
        batch_size = globals().get("N", 500)
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Drop missing IDs up front; a None entry would break the ' OR ' join.
    pmids = [str(pmid) for pmid in pubmed_list if pmid]

    publications = []
    for start in range(0, len(pmids), batch_size):
        subset = pmids[start:start + batch_size]
        query = ' OR '.join(subset)
        queryString = f"SRC:MED AND EXT_ID:({query})"
        data = {'query': queryString,
                'format': 'json',
                'resultType': 'core',
                "pageSize": batch_size}
        try:
            response = requests.post(uri, data=data, timeout=timeout)
        except requests.RequestException:
            print(f"WARN: Failed to connect to Europe PMC on {queryString}")
            continue
        if response.status_code != 200:
            print(f"WARN: Failed to connect to Europe PMC on {queryString}")
            continue

        try:
            pmcjdata = json.loads(response.text)
        except json.JSONDecodeError:
            print(f"WARN: Failed to read the result of {queryString}")
            continue

        # A payload without resultList/result simply contributes no records.
        result_list = pmcjdata.get("resultList") if isinstance(pmcjdata, dict) else None
        for pub_data in (result_list or {}).get("result") or []:
            publications.append(_pubref_from_result(pub_data))
    return publications
def get_pubmed_ids(header_dir):
    """Collect the PubMed IDs cited as primary citation in EMDB header files.

    Every entry directly under *header_dir* whose name has the form
    ``<prefix>-<id>`` (exactly one hyphen) is expected to contain
    ``header/emd-<id>-v30.xml``. Entries whose header file is missing or
    cannot be parsed are reported on stdout and skipped.

    Args:
        header_dir: Directory holding one sub-directory per entry.

    Returns:
        Sorted list of unique PubMed ID strings.
    """
    pubmed_ids = set()
    for xml_dirpath in glob(os.path.join(str(header_dir), '*')):
        # Split only the entry name: hyphens elsewhere in the path must not
        # affect which entries are recognised.
        name_parts = os.path.basename(xml_dirpath).split('-')
        if len(name_parts) != 2:
            continue
        id_num = name_parts[1]
        xml_filepath = os.path.join(xml_dirpath, f"header/emd-{id_num}-v30.xml")
        if not os.path.isfile(xml_filepath):
            print(f"{xml_filepath} not found.")
            continue
        try:
            tree = ET.parse(xml_filepath)
        except ET.XMLSyntaxError as err:
            print(f"{xml_filepath} could not be parsed: {err}")
            continue
        xrefs = tree.xpath("//crossreferences/citation_list/primary_citation/*/external_references")
        for ref in xrefs:
            if ref.attrib.get('type') != "PUBMED":
                continue
            # An empty element has text None; skip it rather than store None.
            pmid = (ref.text or "").strip()
            if pmid:
                pubmed_ids.add(pmid)
    # Sorted so that repeated runs query and export in the same order.
    return sorted(pubmed_ids)
if __name__ == "__main__":
    # Command-line entry point: read the PubMed IDs from the EMDB header
    # files, look them up in Europe PMC and write the result as a TSV file
    # named EPMC_pubmed.tsv inside the working directory.
    prog = "Publication (EMICSS)"
    usage = """
Collect Publication and author EMICSS
Example:
python fetch_pubmed.py -w '[{"/path/to/working/folder"}]'
-f '[{"/path/to/EMDB/header/files/folder"}]' -N 400
"""
    parser = argparse.ArgumentParser(prog=prog, usage=usage, add_help=False,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")
    # Both directories are required: without them the run would only fail
    # after all header files were read and all queries were sent.
    parser.add_argument('-w', '--workDir', type=Path, required=True,
                        help="Main working directory path .")
    parser.add_argument('-f', '--headerDir', type=Path, required=True,
                        help="Directory path to the EMDB version 3.0 header files.")
    parser.add_argument('-N', type=int, default=500,
                        help="Number of simultaneosly papers to be included in a query.")
    args = parser.parse_args()
    if args.N < 1:
        parser.error("-N must be a positive integer.")
    workDir = args.workDir
    headerDir = args.headerDir
    # N must stay a module-level name: call_ePubmedCentral reads it as the
    # number of IDs per request.
    N = args.N

    # Get config variables: config.ini is expected next to this script and
    # must provide the Europe PMC endpoint as option "pmc" in section [api].
    config = configparser.ConfigParser()
    env_file = os.path.join(Path(__file__).parent.absolute(), "config.ini")
    if not config.read(env_file):
        # ConfigParser.read() silently ignores missing files; fail clearly.
        parser.error(f"Configuration file not found or unreadable: {env_file}")
    pmc_uri = config.get("api", "pmc")

    pubmed_list = get_pubmed_ids(headerDir)
    publications = call_ePubmedCentral(pubmed_list, pmc_uri)

    # Export results. Explicit UTF-8 so journal titles with non-ASCII
    # characters are written the same way on every platform.
    with open(os.path.join(workDir, "EPMC_pubmed.tsv"), "w", encoding="utf-8") as fw:
        fw.write("PMID\tPMC\tDOI\tISSN\tJOURNAL\tABBREVIATION\t[AUTHORS ORCID]\n")
        fw.writelines(f"{str(pub)}\n" for pub in publications)