diff --git a/oekg evaluation/How to use.txt b/oekg evaluation/How to use.txt index 1cf98d5..5f8beb5 100644 --- a/oekg evaluation/How to use.txt +++ b/oekg evaluation/How to use.txt @@ -1,21 +1,14 @@ -There are 3 scripts. +There are 2 scripts. -This one is querying a file: +This one is querying a file or the oekg SPARQL endpoint: -python3 wordFinder.py [path to your rdf data file] [path to the file holding labels]. +python3 labelChecker.py [endpoint|file] [path to the file holding labels] [path to your rdf data file]. + +If you want to query the endpoint, use "endpoint" as the first argument. Use "file" if you want to query a file. In that case you need to specify the path to the file in the third argument. The output will be printed into a file called abstract_evaluation.txt. You may need to instal rdflib first. -/// - -This one is querying the OEKG endpoint: - -python3 endpointWordFinder.py [path to the file holding labels] - -The output will be printed into a file called abstract_evaluation2.txt. - -/// The file with all current OEO labels taken from the glossary and current alternative labels extracted from the omn files is provided in this folder and is called allLabels.txt. @@ -24,9 +17,11 @@ The file with all current OEO labels taken from the glossary and current alterna This one is for querying... 1) the number of studys in the sample 2) the number of scenarios per study -3) the descriptors of each study +3) the keywords of each study + +python3 oekgQuery.py [endpoint|file] [studynumber|scenarionumber|keywords] [path to your rdf data file] -python3 oekgQuery.py [studynumber|scenarionumber|descriptors] +If you want to query the endpoint, use "endpoint" as the first argument. Use "file" if you want to query a file. In that case you need to specify the path to the file in the third argument. Chosing "studynumber" will print the number in the terminal. -Chosing "scenarionumber" or "descriptors" will output a file named "scenariosPerStudy.txt" or "studyDescriptors.txt" respectively. 
+Choosing "scenarionumber" or "keywords" will output a file named "scenariosPerStudy.txt" or "studyKeywords.txt" respectively. It will also print a dictionary of the results to the terminal. diff --git a/oekg evaluation/endpointWordFinder.py b/oekg evaluation/endpointWordFinder.py deleted file mode 100644 index 806e896..0000000 --- a/oekg evaluation/endpointWordFinder.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -import re -import requests - -keys = sys.argv[1] #path to the file with all the labels - -sparql_endpoint = "https://openenergyplatform.org/sparql_query/sparql" -sparql_query = { - "query": """ - PREFIX rdf: - PREFIX rdfs: - - SELECT * WHERE { - ?sub ?obj. - - }""" -} - -r = requests.get(url=sparql_endpoint, params=sparql_query) - -start = '"sub": {"type": "uri", "value": "' -start2 = '"obj": {"type": "literal", "value": "' -end = '"}}' -end2 = '"},' - -subjects = re.findall('%s(.*?)%s' % (start, end2), r.text) #get the URIs -abstracts = re.findall('%s(.*?)%s' % (start2, end), r.text) #get the abstracts - -keywords = [] - -with open(keys,"r") as file: - for line in file: - keywords.append(line.strip()) #collect all the labels - -i = 0 - -with open("abstract_evaluation2.txt", "w") as file: - - for x in abstracts: #go trough abstracts - file.write("\n") - file.write(subjects[i]+"\n") #move trough URIs at the same pace - i = i+1 - for y in keywords: #check every label - if y in x: - file.write(y+": " + str(x.count(y)) + "\n") - - - - - - diff --git a/oekg evaluation/label extraction/labels.py b/oekg evaluation/label extraction/labels.py index 25b41bd..9a5ec6a 100644 --- a/oekg evaluation/label extraction/labels.py +++ b/oekg evaluation/label extraction/labels.py @@ -1,44 +1,53 @@ import re import sys -path = sys.argv[1] -mode = sys.argv[2] -#all of those were searched for alternative labels: -#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-social.omn" -#path = 
r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-physical.omn" -#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-sector.omn" -#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-shared.omn" - def write(result, p): with open("labels.txt", p) as file: for x in result: file.write(str(x)+"\n") -def find(start, end): +def find(start, end,s): result = re.findall('%s(.*)%s' % (start, end), s) return result -s = "" -start2 = 'rdfs:label "' -start = ' "' -end = '"@en' -end2 = '"@de' - -with open(path, "r") as file: - for line in file: - s = s + str(line) - -if mode == "alt" or mode == "all": - res = find(start,end) - write(res,"w") - -if mode == "ger" or mode == "all": - res = find(start,end2) - write(res,"a+") +def main(): + s = "" + start2 = 'rdfs:label "' + start = ' "' + end = '"@en' + end2 = '"@de' + + with open(path, "r") as file: + for line in file: + s = s + str(line) + + if mode == "alt" or mode == "all": + res = find(start,end,s) + write(res,"w") + + if mode == "ger" or mode == "all": + res = find(start,end2,s) + write(res,"a+") + + if mode == "label" or mode == "all": + res = find(start2,end,s) + write(res,"a+") + +try: + path = sys.argv[1] + mode = sys.argv[2] + if not (mode =="alt" or mode == "all" or mode == "ger" or mode == "label"): + print("Error:Not a valid mode") + else: + main() +except: + print("Error: Missing or invalid argument!") +#all of those were searched for alternative labels: +#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-social.omn" +#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-physical.omn" +#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-sector.omn" +#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-shared.omn" -if mode == "label" or mode == "all": - res = find(start2,end) - write(res,"a+") diff --git a/oekg evaluation/label extraction/mergeLists.py b/oekg evaluation/label 
extraction/mergeLists.py index 667a0ca..fd22eaf 100644 --- a/oekg evaluation/label extraction/mergeLists.py +++ b/oekg evaluation/label extraction/mergeLists.py @@ -1,24 +1,38 @@ import sys -path1 = sys.argv[1] -path2 = sys.argv[2] +path1 = None +path2 = None + +try: + path1 = sys.argv[1] + path2 = sys.argv[2] +except: + print("Error: Missing or invalid argument") + list1 = [] list2 = [] +if not (path1 == None or path2 == None): -with open(path1, "r") as file: - for line in file: - list1.append(line.strip()) + try: + with open(path1, "r") as file: + for line in file: + list1.append(line.strip()) + except: + print("Error: First file not found!") -with open(path2, "r") as file: - for line in file: - list2.append(line.strip()) + try: + with open(path2, "r") as file: + for line in file: + list2.append(line.strip()) + except: + print("Error: Second file not found!") -for line in list1: - if line not in list2: - list2.append(line) + for line in list1: + if line not in list2: + list2.append(line) -with (open("mergedList.txt","w") as file): - for x in list2: - file.write(str(x) + "\n") \ No newline at end of file + with (open("mergedList.txt","w") as file): + for x in list2: + file.write(str(x) + "\n") \ No newline at end of file diff --git a/oekg evaluation/labelChecker.py b/oekg evaluation/labelChecker.py new file mode 100644 index 0000000..b24952b --- /dev/null +++ b/oekg evaluation/labelChecker.py @@ -0,0 +1,86 @@ +import sys +import rdflib +import re +import requests + +def queryFile(): + g = rdflib.Graph() + g.parse(data) + + knows_query = """ + SELECT ?s ?b + WHERE { + ?s DC:abstract ?b. 
+ }""" + + subjects = [] + objects = [] + + qres = g.query(knows_query) + for row in qres: + subjects.append(str(row.s)) # stores the study URI + objects.append(str(row.b)) # stores the abstracts + + return subjects, objects + +def queryEndpoint(): + sparql_endpoint = "https://openenergyplatform.org/sparql_query/sparql" + sparql_query = { + "query": """ + PREFIX rdf: + PREFIX rdfs: + PREFIX DC: + + SELECT * WHERE { + ?sub DC:abstract ?obj. + + }""" + } + + r = requests.get(url=sparql_endpoint, params=sparql_query) + start = '"sub": {"type": "uri", "value": "' + start2 = '"obj": {"type": "literal", "value": "' + end = '"}}' + end2 = '"},' + + subjects = re.findall('%s(.*?)%s' % (start, end2), r.text) # get the URIs + objects = re.findall('%s(.*?)%s' % (start2, end), r.text) # get the abstracts + + return subjects, objects + +def checkKeywords(keys): + keywords = [] + + with open(keys, "r") as file: + for line in file: + keywords.append(line.strip()) # collect all the labels + + i = 0 + + with open("abstractEvaluation.txt", "w") as file: + + for x in objects: # go trough objects + file.write("\n") + file.write(subjects[i] + "\n") # move trough URIs at the same pace + i = i + 1 + for y in keywords: # check every label + if y in x: + file.write(y + ": " + str(x.count(y)) + "\n") + + +try: + mode = sys.argv[1] # endpoint | file + if not (mode == "endpoint" or mode == "file"): + print("Error: Invalid mode! 
Use 'endpoint' or 'file' as first argument!") + else: + keys = sys.argv[2] # path to file with all the labes + if mode == "endpoint": + subjects, objects = queryEndpoint() + if mode == "file": + data = sys.argv[3] # path to rdf data file + subjects, objects = queryFile() + + checkKeywords(keys) + +except: + print("Error: Missing or invalid argument!") \ No newline at end of file diff --git a/oekg evaluation/oekgQuery.py b/oekg evaluation/oekgQuery.py index 9eed3bf..c6b2462 100644 --- a/oekg evaluation/oekgQuery.py +++ b/oekg evaluation/oekgQuery.py @@ -1,6 +1,7 @@ import sys import re import requests +import rdflib def buildquery(pred): #input: predicate for chosen query, output: full query string return '\nPREFIX rdf: \nPREFIX rdfs: \nSELECT * WHERE{\n?subj ' + pred + ' ?obj.\n}' @@ -27,58 +28,85 @@ def duplicateFilter(subjects, objects): #input: lists of subjects and objects, o if x not in subjectsFilter: subjectsFilter.append(x) objectFilter.append(objects[i]) - else: #add object on the same position as first isntance of subject + else: #add object on the same position as first instance of subject objectFilter[subjectsFilter.index(x)] = objectFilter[subjectsFilter.index(x)] + ", " + objects[i] i = i + 1 - results = [] - results.append(subjectsFilter) - results.append(objectFilter) - return results #return list of both lists + return subjectsFilter, objectFilter #return list of both lists def main(): - arg = str(sys.argv[1]) + mode = sys.argv[1] + subjects = [] + objects = [] + if not (mode == "endpoint" or mode == "file"): + print("Error: Invalid mode! 
Use 'endpoint' or 'file' as first argument!") + else: + arg = str(sys.argv[2]) - if arg == "scenarionumber": - pred = "" - start2 = 'obj": {"type": "uri", "value": "' - - elif arg == "descriptors": - pred = "" - start2 = 'obj": {"type": "literal", "value": "' # scenarionumber, descriptors - - elif arg == "studynumber": - pred = "" - - - r = queryEndpoint(buildquery(pred)) - start = '"subj": {"type": "uri", "value": "' - end = '"},' - subjects = re.findall('%s(.*?)%s' % (start, end), r.text) #get the URIs - - if arg == "studynumber": - print(len(scenarioFilter(subjects))) #how many unique study URIs are in the sample - - if arg == "scenarionumber" or arg == "descriptors": - end2 = '"}}' - objects = re.findall('%s(.*?)%s' % (start2, end2), r.text) - subjectsFiltered = duplicateFilter(subjects, objects)[0] - objectsFiltered = duplicateFilter(subjects, objects)[1] if arg == "scenarionumber": - i = 0 - with open("scenariosPerStudy.txt", "w") as file: - for x in objectsFiltered: - file.write("\n") - file.write(subjectsFiltered[i] + "\n") - file.write(str(objectsFiltered[i].count(",") + 1)) #count number ob entrys in one object position - i = i + 1 - if arg == "descriptors": - i = 0 - with open("studyDescriptors.txt", "w") as file: - for x in objectsFiltered: - file.write("\n") - file.write(subjectsFiltered[i] + "\n") - file.write(objectsFiltered[i] + "\n") - i = i + 1 + pred = "" + start2 = 'obj": {"type": "uri", "value": "' + + elif arg == "keywords": + pred = "" + start2 = 'obj": {"type": "literal", "value": "' + + elif arg == "studynumber": + pred = "" + + else: + print("Error: Invalid query! 
Use 'scenarionumber', 'keywords' or 'studynumber'!") + return + + if mode == "endpoint": + r = queryEndpoint(buildquery(pred)) + start = '"subj": {"type": "uri", "value": "' + end = '"},' + subjects = re.findall('%s(.*?)%s' % (start, end), r.text) # get the URIs + + if mode == "file": + try: + data = sys.argv[3] + g = rdflib.Graph() + g.parse(data) + knows_query = buildquery(pred) + qres = g.query(knows_query) + for row in qres: + subjects.append(str(row.subj)) + objects.append(str(row.obj)) + except: + print("Error: Invalid data argument!") + + if arg == "studynumber": + print(len(scenarioFilter(subjects))) # how many unique study URIs are in the sample + + if arg == "scenarionumber" or arg == "keywords": + if mode == "endpoint": + end2 = '"}}' + objects = re.findall('%s(.*?)%s' % (start2, end2), r.text) + + dictionary = {objects[i]: subjects[i] for i in range(len(objects))} + + print(dictionary) + + subjectsFiltered, objectsFiltered = duplicateFilter(subjects,objects) + if arg == "scenarionumber": + i = 0 + with open("scenariosPerStudy.txt", "w") as file: + for x in objectsFiltered: + file.write("\n") + file.write(subjectsFiltered[i] + "\n") + file.write( + str(objectsFiltered[i].count(",") + 1)) # count number ob entrys in one object position + i = i + 1 + if arg == "keywords": + i = 0 + with open("studyKeywords.txt", "w") as file: + for x in objectsFiltered: + file.write("\n") + file.write(subjectsFiltered[i] + "\n") + file.write(objectsFiltered[i] + "\n") + i = i + 1 + main() diff --git a/oekg evaluation/wordFinder.py b/oekg evaluation/wordFinder.py deleted file mode 100644 index 221ae09..0000000 --- a/oekg evaluation/wordFinder.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import rdflib - -data = sys.argv[1] #path to the rdf data file -keys = sys.argv[2] #path to the file with all the labels - -#example: -#data = /home/madeleine/Schreibtisch/oekg1.nq -#keys = /home/madeleine/PycharmProjects/pythonProject/allLabels.txt - -def main(): - g = rdflib.Graph() - 
g.parse(data) - - knows_query = """ - SELECT ?s ?b - WHERE { - ?s ?b. - }""" - - subjects = [] - abstracts = [] - - qres = g.query(knows_query) - for row in qres: - subjects.append(str(row.s)) #stores the study URI - abstracts.append(str(row.b)) #stores the abstracts - - keywords = [] - - with open(keys,"r") as file: - for line in file: - keywords.append(line.strip()) #collect all the labels - - i = 0 - - with open("abstract_evaluation.txt", "w") as file: - - for x in abstracts: #go trough abstracts - file.write("\n") - file.write(subjects[i]+"\n") #move trough URIs at the same pace - i = i+1 - for y in keywords: #check every label - if y in x: - file.write(y+": " + str(x.count(y)) + "\n") - - - -main()