Skip to content

Commit

Permalink
bugs with merging
Browse files Browse the repository at this point in the history
  • Loading branch information
Volodymyr Savchenko authored and Volodymyr Savchenko committed May 23, 2024
1 parent 08a10f6 commit 2abb753
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 8 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/aggregate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,10 @@ jobs:
poetry run workflowhub-graph-source-crates
poetry run workflowhub-graph-merge merged.ttl
# TODO: store the artifact somewhere: Zenodo? a GitHub release (tagged daily)? GitHub Pages?
# TODO: make a nice picture
# TODO: make a nice picture?
- name: Plot the data
run: |
apt update
apt install -y graphviz
python <(curl https://raw.githubusercontent.com/oda-hub/rdflib/lesstext/rdflib/tools/rdf2dot.py) merged.ttl | sfdp -x -Tpdf > merged.pdf
20 changes: 13 additions & 7 deletions workflowhub_graph/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from workflowhub_graph.cachedurlopen import patch_rdflib_urlopen
from workflowhub_graph.constants import BASE_URL


# TODO: check if names like "#Husen" are correctly represented in the graph
def merge_all_files(
pattern="data/*.json", base_url: str = BASE_URL, cache_kwargs: None = Optional[dict]
pattern="data/*.json", base_url: str = BASE_URL, cache_kwargs: Optional[dict] = None
) -> rdflib.Graph:
"""
Merges all JSON-LD files in the given pattern into a single RDF graph.
Expand All @@ -22,13 +22,13 @@ def merge_all_files(
:return: The merged RDF graph.
"""

if cache_kwargs is None:
cache_kwargs = dict()

G = rdflib.Graph()

filenames = glob.glob(pattern)

# TODO: this can be sped up considerably by caching the context
# TODO: collect statistics about file:// references


for i, fn in enumerate(filenames):
with open(fn, "r") as f:
print(f"Processing {fn}, {i}/{len(filenames)}")
Expand All @@ -49,9 +49,15 @@ def main():
argparser.add_argument(
"output_filename", help="The output filename.", default="merged.ttl"
)
argparser.add_argument(
"-p",
"--pattern",
help="The pattern to match the files.",
default="data/*.json",
)
args = argparser.parse_args()

G = merge_all_files()
G = merge_all_files(pattern=args.pattern)
G.serialize(args.output_filename, format="ttl")


Expand Down

0 comments on commit 2abb753

Please sign in to comment.