Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework the oekg evaluation tools #25

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 10 additions & 15 deletions oekg evaluation/How to use.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
There are 3 scripts.
There are 2 scripts.

This one is querying a file:
This one is querying a file or the oekg SPARQL endpoint:

python3 wordFinder.py [path to your rdf data file] [path to the file holding labels].
python3 labelChecker.py <endpoint|file> <path to the file holding labels> [path to your rdf data file].

If you want to query the endpoint, use "endpoint" as the first argument. Use "file" if you want to query a file. In that case you need to specify the path to the file in the third argument.

The output will be printed into a file called abstractEvaluation.txt.
You may need to install rdflib and requests first.

///

This one is querying the OEKG endpoint:

python3 endpointWordFinder.py [path to the file holding labels]

The output will be printed into a file called abstract_evaluation2.txt.

///

The file with all current OEO labels taken from the glossary and current alternative labels extracted from the omn files is provided in this folder and is called allLabels.txt.

Expand All @@ -24,9 +17,11 @@ The file with all current OEO labels taken from the glossary and current alterna
This one is for querying...
1) the number of studies in the sample
2) the number of scenarios per study
3) the descriptors of each study
3) the keywords of each study

python3 oekgQuery.py <endpoint|file> <studynumber|scenarionumber|keywords> [path to your rdf data file]

python3 oekgQuery.py [studynumber|scenarionumber|descriptors]
If you want to query the endpoint, use "endpoint" as the first argument. Use "file" if you want to query a file. In that case you need to specify the path to the file in the third argument.

Choosing "studynumber" will print the number in the terminal.
Chosing "scenarionumber" or "descriptors" will output a file named "scenariosPerStudy.txt" or "studyDescriptors.txt" respectively.
Choosing "scenarionumber" or "descriptors" will output a file named "scenariosPerStudy.txt" or "studyDescriptors.txt" respectively. It will also print a dictionary of the results to the terminal.
51 changes: 0 additions & 51 deletions oekg evaluation/endpointWordFinder.py

This file was deleted.

67 changes: 38 additions & 29 deletions oekg evaluation/label extraction/labels.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,53 @@
import re
import sys

path = sys.argv[1]
mode = sys.argv[2]
#all of those were searched for alternative labels:
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-social.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-physical.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-sector.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-shared.omn"

def write(result, p):
    """Write every item of *result* on its own line to ``labels.txt``.

    Args:
        result: Iterable of items; each one is converted with ``str()``.
        p: Mode string handed to ``open()`` (the callers in this script use
            ``"w"`` to start a fresh file and ``"a+"`` to append).
    """
    with open("labels.txt", p) as out:
        out.writelines(str(item) + "\n" for item in result)

def find(start, end):
def find(start, end, s):
    """Return every text fragment of *s* enclosed by *start* and *end*.

    Both delimiters are matched literally. They are escaped before being put
    into the pattern, so characters such as ``.`` or ``(`` in a delimiter
    (e.g. the dots of an IRI) cannot act as regex metacharacters.

    Args:
        start: Literal text that precedes the wanted fragment.
        end: Literal text that follows the wanted fragment.
        s: The text to search.

    Returns:
        A list with one string per match. ``.`` does not match a newline, so
        a match never spans more than one line; within a line the capture is
        greedy.
    """
    pattern = '%s(.*)%s' % (re.escape(start), re.escape(end))
    return re.findall(pattern, s)

s = ""
start2 = 'rdfs:label "'
start = '<http://purl.obolibrary.org/obo/IAO_0000118> "'
end = '"@en'
end2 = '"@de'

with open(path, "r") as file:
for line in file:
s = s + str(line)

if mode == "alt" or mode == "all":
res = find(start,end)
write(res,"w")

if mode == "ger" or mode == "all":
res = find(start,end2)
write(res,"a+")
def main():
    """Extract labels from the file at ``path`` and store them in labels.txt.

    Reads the module-level ``path`` and ``mode`` set by the script entry
    point. Depending on ``mode`` up to three extractions run, in this order:

    * ``alt`` / ``all``: text between the IAO_0000118 annotation and ``"@en``;
      this pass opens labels.txt with ``"w"`` and so starts a fresh file.
    * ``ger`` / ``all``: text between the IAO_0000118 annotation and ``"@de``,
      appended.
    * ``label`` / ``all``: text between ``rdfs:label "`` and ``"@en``,
      appended.
    """
    label_start = 'rdfs:label "'
    alt_start = '<http://purl.obolibrary.org/obo/IAO_0000118> "'
    english_end = '"@en'
    german_end = '"@de'

    # Read the whole file in one call instead of concatenating line by line,
    # which is quadratic in the file size.
    with open(path, "r") as file:
        s = file.read()

    if mode in ("alt", "all"):
        write(find(alt_start, english_end, s), "w")

    if mode in ("ger", "all"):
        write(find(alt_start, german_end, s), "a+")

    if mode in ("label", "all"):
        write(find(label_start, english_end, s), "a+")

# Script entry point: labels.py <path to the .omn file> <alt|ger|label|all>
# Arguments are checked explicitly so that only a genuinely missing argument
# produces the "missing argument" message; file errors report their own cause.
if len(sys.argv) < 3:
    print("Error: Missing or invalid argument!")
else:
    path = sys.argv[1]  # read by main() as a module-level name
    mode = sys.argv[2]  # read by main() as a module-level name
    if mode not in ("alt", "all", "ger", "label"):
        print("Error:Not a valid mode")
    else:
        try:
            main()
        except (OSError, UnicodeError) as err:
            # e.g. the input file does not exist or cannot be decoded
            print(f"Error: {err}")
#all of those were searched for alternative labels:
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-social.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-physical.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-sector.omn"
#path = r"/home/madeleine/Schreibtisch/ontology/src/ontology/edits/oeo-shared.omn"

if mode == "label" or mode == "all":
res = find(start2,end)
write(res,"a+")



Expand Down
42 changes: 28 additions & 14 deletions oekg evaluation/label extraction/mergeLists.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,38 @@
import sys

path1 = sys.argv[1]
path2 = sys.argv[2]
path1 = None
path2 = None

try:
path1 = sys.argv[1]
path2 = sys.argv[2]
except:
print("Error: Missing or invalid argument")


list1 = []
list2 = []

if not (path1 == None or path2 == None):

with open(path1, "r") as file:
for line in file:
list1.append(line.strip())
try:
with open(path1, "r") as file:
for line in file:
list1.append(line.strip())
except:
print("Error: First file not found!")

with open(path2, "r") as file:
for line in file:
list2.append(line.strip())
try:
with open(path2, "r") as file:
for line in file:
list2.append(line.strip())
except:
print("Error: Second file not found!")

for line in list1:
if line not in list2:
list2.append(line)
for line in list1:
if line not in list2:
list2.append(line)

with (open("mergedList.txt","w") as file):
for x in list2:
file.write(str(x) + "\n")
with (open("mergedList.txt","w") as file):
for x in list2:
file.write(str(x) + "\n")
86 changes: 86 additions & 0 deletions oekg evaluation/labelChecker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import sys
import rdflib
import re
import requests

def queryFile():
    """Collect all ``DC:abstract`` statements from a local RDF file.

    The file location is taken from the module-level name ``data``, which the
    script entry point sets before calling this function.

    Returns:
        A pair ``(subjects, objects)`` of equally long lists: the subject of
        each statement as a string and, at the same index, its abstract.
    """
    graph = rdflib.Graph()
    graph.parse(data)

    # NOTE(review): the query declares no PREFIX for "DC", so it presumably
    # relies on the parsed data binding that prefix - confirm.
    abstract_query = """
SELECT ?s ?b
WHERE {
?s DC:abstract ?b.
}"""

    pairs = [(str(row.s), str(row.b)) for row in graph.query(abstract_query)]
    subjects = [uri for uri, _ in pairs]  # the study URIs
    objects = [abstract for _, abstract in pairs]  # the abstracts

    return subjects, objects

def queryEndpoint():
    """Fetch all ``DC:abstract`` statements from the OEKG SPARQL endpoint.

    The response is decoded as a SPARQL JSON result document rather than
    scraped with regular expressions. Each result row yields its URI and its
    abstract together, so the two returned lists cannot get out of step, and
    JSON escape sequences inside the abstracts are decoded.

    Returns:
        A pair ``(subjects, objects)`` of equally long lists: the value bound
        to ``?sub`` for each row and, at the same index, the value bound to
        ``?obj``.

    Raises:
        requests.RequestException: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
    """
    sparql_endpoint = "https://openenergyplatform.org/sparql_query/sparql"
    sparql_query = {
        "query": """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX DC: <http://purl.org/dc/terms/>

SELECT * WHERE {
?sub DC:abstract ?obj.

}"""
    }

    r = requests.get(url=sparql_endpoint, params=sparql_query)
    bindings = r.json().get("results", {}).get("bindings", [])

    subjects = []
    objects = []
    for binding in bindings:
        sub = binding.get("sub")
        obj = binding.get("obj")
        if sub is None or obj is None:
            # Skip incomplete rows so both lists stay index-aligned.
            continue
        subjects.append(sub["value"])  # the study URI
        objects.append(obj["value"])  # the abstract

    return subjects, objects

def checkKeywords(keys):
    """Count label occurrences in every abstract and write a report.

    Reads the module-level lists ``subjects`` and ``objects`` (filled by
    ``queryEndpoint()`` or ``queryFile()``) and writes the report to
    ``abstractEvaluation.txt`` in the current working directory. For each
    abstract the report holds an empty line, the matching entry of
    ``subjects``, and one ``<label>: <count>`` line per label that occurs in
    the abstract at least once.

    Args:
        keys: Path to a text file holding one label per line.
    """
    with open(keys, "r") as file:
        # Blank lines are dropped: the empty string is a substring of every
        # abstract and would otherwise produce a meaningless ": <n>" entry.
        keywords = [label for label in (line.strip() for line in file) if label]

    with open("abstractEvaluation.txt", "w") as file:
        # zip keeps each abstract paired with its URI without a manual index.
        for subject, abstract in zip(subjects, objects):
            file.write("\n")
            file.write(subject + "\n")
            for label in keywords:
                hits = abstract.count(label)
                if hits:
                    file.write(label + ": " + str(hits) + "\n")


# Script entry point:
#   labelChecker.py <endpoint|file> <path to labels file> [path to rdf data file]
# Arguments are validated explicitly so that only a genuinely missing argument
# yields the "missing argument" message; network, parse and file errors are
# reported with their real cause instead of being mislabelled.
args = sys.argv[1:]
if not args:
    print("Error: Missing or invalid argument!")
elif args[0] not in ("endpoint", "file"):
    print("Error: Invalid mode! Use 'endpoint' or 'file' as first argument!")
elif len(args) < 2 or (args[0] == "file" and len(args) < 3):
    print("Error: Missing or invalid argument!")
else:
    mode = args[0]  # endpoint | file
    keys = args[1]  # path to the file with all the labels
    try:
        if mode == "endpoint":
            subjects, objects = queryEndpoint()
        else:
            data = args[2]  # path to the rdf data file; read by queryFile()
            subjects, objects = queryFile()

        # checkKeywords() reads the module-level subjects/objects set above.
        checkKeywords(keys)
    except Exception as err:
        # Top-level boundary: show what actually went wrong.
        print(f"Error: {err}")
Loading