Skip to content

Commit

Permalink
Merge pull request #649 from KnowledgeCaptureAndDiscovery/dev
Browse files Browse the repository at this point in the history
fix #635
  • Loading branch information
dgarijo authored Oct 2, 2024
2 parents f123fbb + 04deedf commit 72a0162
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ src/somef/create_corpus_for_NER.py
src/somef.egg-info/*
local_tests/*
Dockerfile_old
repos.txt
4 changes: 3 additions & 1 deletion src/somef/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def configure(auto, base_uri):
"--in_file",
"-i",
type=click.Path(exists=True),
help="A file of newline separated links to GitHub/Gitlab repositories to process in bulk"
help="""A file of newline separated links to GitHub/Gitlab repositories to process in bulk. Each repository will be
stored in a different file called $out_$url.json where $out is the name selected as out file and $url is the
url of the target repository (url encoded)"""
)
@optgroup.group('Output', cls=RequiredAnyOptionGroup)
@optgroup.option(
Expand Down
50 changes: 35 additions & 15 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import sys
from uu import encode

import validators
import logging
import os
import tempfile
import urllib.parse

from os import path
from . import header_analysis, regular_expressions, process_repository, configuration, process_files, \
Expand Down Expand Up @@ -200,18 +203,34 @@ def run_cli(*,
# convert to a set to ensure uniqueness (we don't want to get the same data multiple times)
repo_set = set(repo_list)
# check if the urls in repo_set are valid
remove_urls = []
urls_to_process = []
for repo_elem in repo_set:
if not validators.url(repo_elem):
logging.error("Not a valid repository url. Please check the url provided: " + repo_elem)
remove_urls.append(repo_elem)
# remove non valid urls in repo_set
for remove_url in remove_urls:
repo_set.remove(remove_url)
if len(repo_set) > 0:
repo_data = [cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder) for repo_url in
repo_set]
repo_elem = repo_elem.strip()
if validators.url(repo_elem):
urls_to_process.append(repo_elem)
else:
logging.error(repo_elem +" is not a valid repository url. Please check the url provided ")
if len(urls_to_process) > 0:
# repo_data = [cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
# keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder) for repo_url in
# urls_to_process]
for repo_url in urls_to_process:
try:
encoded_url = urllib.parse.quote(repo_url, safe='')
encoded_url = encoded_url.replace(".","") #removing dots just in case
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url,
ignore_github_metadata=ignore_github_metadata, readme_only=readme_only,
keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder)
if output is not None:
output = output.replace(".json","")
output = output + "_" + encoded_url + ".json"
json_export.save_json_output(repo_data.results, output, missing, pretty=pretty)
if codemeta_out is not None:
codemeta_out = codemeta_out.replace(".json", "")
codemeta_out = codemeta_out + "_" + encoded_url + ".json"
json_export.save_codemeta_output(repo_data.results, codemeta_out, pretty=pretty)
except:
logging.error("Error when processing repo: " + repo_url)
else:
return None

Expand All @@ -227,8 +246,10 @@ def run_cli(*,
repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers,
doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder)

if output is not None:
json_export.save_json_output(repo_data.results, output, missing, pretty=pretty)
if output is not None:
json_export.save_json_output(repo_data.results, output, missing, pretty=pretty)
if codemeta_out is not None:
json_export.save_codemeta_output(repo_data.results, codemeta_out, pretty=pretty)

if graph_out is not None:
logging.info("Generating triples...")
Expand All @@ -241,5 +262,4 @@ def run_cli(*,

data_graph.export_to_file(graph_out, graph_format)

if codemeta_out is not None:
json_export.save_codemeta_output(repo_data.results, codemeta_out, pretty=pretty)

0 comments on commit 72a0162

Please sign in to comment.