-
Notifications
You must be signed in to change notification settings - Fork 3
/
simbad2rdf.py
executable file
·76 lines (61 loc) · 2.81 KB
/
simbad2rdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import getopt
import re
import sys
from namespaces import *
from urllib import quote_plus
from rdflib import URIRef, Namespace, Literal, BNode
from rdflib import ConjunctiveGraph
import uuid
#from lxml import etree as ElementTree
import HTMLParser
import os.path
"""
{
'1999ApJ...522..718Q': [
{'mtype': '~,~,~', 'otype': 'LINER ~', 'refcode': '1999ApJ...522..718Q', 'ra': '184.84673421', '_raw': 'NGC 4261|184.84673421|+05.82491522|LINER ~|~,~,~|E...,D,~', 'dec': '+05.82491522', 'id': 'NGC 4261'},
{'mtype': '~,~,~', 'otype': 'Seyfert_2 ~', 'refcode': '1999ApJ...522..718Q', 'ra': '186.26559721', '_raw': 'M 84|186.26559721|+12.88698314|Seyfert_2 ~|~,~,~|E...,D,~', 'dec': '+12.88698314', 'id': 'M 84'},
{'mtype': '~,~,~', 'otype': 'GinGroup ~', 'refcode': '1999ApJ...522..718Q', 'ra': '193.092133', '_raw': 'NGC 4753|193.092133|-01.199689|GinGroup ~|~,~,~|I...,D,~', 'dec': '-01.199689', 'id': 'NGC 4753'},
{'mtype': '~,~,~', 'otype': 'Seyfert_2 ~', 'refcode': '1999ApJ...522..718Q', 'ra': '201.36506279', '_raw': 'NGC 5128|201.36506279|-43.01911258|Seyfert_2 ~|~,~,~|E,D,~', 'dec': '-43.01911258', 'id': 'NGC 5128'}
]
}
"""
# --- Command line -----------------------------------------------------------
# Exactly one or two positional arguments are accepted:
#   dictfile   - file containing the SIMBAD dictionary (required)
#   targetdir  - root directory for the generated RDF (optional)
if len(sys.argv) < 2 or len(sys.argv) > 3:
    # Single-argument parenthesised form behaves the same on Python 2 and 3.
    print("Usage: python simbad2rdf.py dictfile [targetdir]")
    sys.exit(-1)

# --- Load the SIMBAD dictionary ---------------------------------------------
# The input file is expected to hold one Python dict literal mapping a bibcode
# to a list of per-object dicts (see the sample string near the top of this
# file).  It is parsed with ast.literal_eval rather than eval so that a
# malformed or malicious input file cannot execute arbitrary code; only plain
# literals (dicts, lists, strings, numbers, ...) are accepted.
import ast  # local import: only needed for parsing the input file

filetoread = sys.argv[1]
# The context manager closes the file even if read() raises.
with open(filetoread) as fd:
    stuff = fd.read()
simbad = ast.literal_eval(stuff)

# --- Output location --------------------------------------------------------
# The optional second argument overrides the default output root.
if len(sys.argv) == 3:
    DATA = sys.argv[2]
else:
    DATA = "../chandra-rdf"
    # Alternative default used previously:
    #DATA="../mast_hut-rdf"

# The file to read is the output of simad1.py and is assumed to be named
# <bibcode>.simbad.
# NOTE(review): "simad1.py" may be a typo for "simbad1.py" -- confirm.
#
# Debugging aid: uncomment to dump the entries of a single bibcode and stop.
#dabib='2005ApJ...629..700N'
#print "SIMBAD", simbad[dabib]
#sys.exit(-1)

# Known issue: some sources appear again and again and carry multiple metadata
# strings.  This is believed to be fine, since the triplestore discards
# repeated triples.  However, if the same source arrives in different
# contexts we may end up with multiple statements; that can be dealt with,
# but it is something to remember.

# All RDF files are written below <DATA>/data/rdf; create it if needed.
odir = DATA + "/data/rdf"
if not os.path.isdir(odir):
    os.makedirs(odir)
# Emit one RDF file per bibcode, describing every astronomical source that
# SIMBAD associates with that publication.
# bindgraph, gadd, a, uri_bib, uri_source, uri_conf, adsbase and adsobsv are
# not defined in this file; presumably they come from the star-import of the
# project's `namespaces` module.
for bibcode in simbad:
    # A fresh graph per bibcode keeps each output file self-contained.
    g = ConjunctiveGraph(identifier=URIRef(None))
    bindgraph(g)

    # The publication URI is the same for every source of this bibcode.
    euri = uri_bib[bibcode]

    for aobject in simbad[bibcode]:
        #print bibcode, aobject['id']
        # Build a URL-safe identifier from the object name: whitespace runs
        # are collapsed to "_" and the result is percent-encoded.
        eleid = quote_plus("_".join(aobject['id'].split()))
        source_uri = uri_source[eleid]

        gadd(g, euri, adsbase.hasAstronomicalSource, source_uri)
        gadd(g, source_uri, a, adsbase.AstronomicalSource)
        gadd(g, source_uri, adsbase.name, Literal(aobject['id']))
        gadd(g, source_uri, adsobsv.curatedAt, uri_conf['SIMBAD'])
        # The full record is kept verbatim (as the dict's string form) so no
        # SIMBAD metadata is lost.
        gadd(g, source_uri, adsbase.hasMetadataString, Literal(str(aobject)))

    serializedstuff = g.serialize()

    # The output directory (odir) is created once before this loop, so it is
    # not re-checked on every iteration.  The context manager guarantees the
    # file is closed even if write() raises.
    with open(odir + "/simbad." + quote_plus(bibcode) + ".rdf", "w") as fd:
        fd.write(serializedstuff)