Skip to content

Commit

Permalink
Merge pull request #625 from KnowledgeCaptureAndDiscovery/dev
Browse files: browse the repository at this point in the history
Dev
  • Loading branch information
dgarijo authored Mar 7, 2024
2 parents bbb8f7e + 39dc979 commit 3967b59
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 38 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Software Metadata Extraction Framework (SOMEF)
# Software Metadata Extraction Framework (SOMEF)
[![Documentation Status](https://readthedocs.org/projects/somef/badge/?version=latest)](https://somef.readthedocs.io/en/latest/?badge=latest)
[![Python](https://img.shields.io/pypi/pyversions/somef.svg?style=plastic)](https://badge.fury.io/py/somef) [![PyPI](https://badge.fury.io/py/somef.svg)](https://badge.fury.io/py/somef) [![DOI](https://zenodo.org/badge/190487675.svg)](https://zenodo.org/badge/latestdoi/190487675) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb) [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)

<img src="docs/logo.png" alt="logo" width="150"/>
Expand Down
4 changes: 0 additions & 4 deletions src/somef/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ class URLParamType(click.types.StringParamType):
@click.version_option(__version__)
def cli():
click.echo("SOftware Metadata Extraction Framework (SOMEF) Command Line Interface")
# Logging setup
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s',
datefmt='%d-%b-%y %H:%M:%S')
# logging.getLogger("bibtexparser").setLevel(logging.WARNING)


@cli.command(help="Configure GitHub credentials and classifiers file path")
Expand Down
17 changes: 10 additions & 7 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import base64
import logging
import os
import zipfile
Expand All @@ -11,6 +10,7 @@
from . import configuration
from .process_results import Result


# Constructs a template HTTP header, which:
# - has a key for the authorization token if passed via the authorization argument, otherwise
# - has a key for the authorization token if specified via config, otherwise
Expand Down Expand Up @@ -39,8 +39,9 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
rate_limit_remaining = response.headers["X-RateLimit-Remaining"]
epochtime = int(response.headers["X-RateLimit-Reset"])
date_reset = datetime.fromtimestamp(epochtime)
logging.info("Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str(
date_reset))
logging.info(
"Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str(
date_reset))
if 'message' in response and 'API rate limit exceeded' in response['message']:
rate_limited = True
logging.warning(f"rate limited. Backing off for {initial_backoff} seconds")
Expand Down Expand Up @@ -324,7 +325,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
return None

logging.info(f"Loading Repository {repository_url} Information....")

# Create template header with optional authorization token
header = header_template(authorization)
header['accept'] = constants.GITHUB_ACCEPT_HEADER
Expand Down Expand Up @@ -438,7 +439,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,

# get releases
releases_list_raw, date = rate_limit_get(repo_api_base_url + "/releases",
headers=header)
headers=header)
releases_list = releases_list_raw.json()
if isinstance(releases_list, dict) and 'message' in releases_list.keys():
logging.error("Releases Error: " + releases_list['message'])
Expand Down Expand Up @@ -492,7 +493,8 @@ def do_crosswalk(data, crosswalk_table):
return output


def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, authorization=None):
def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None,
authorization=None):
"""
Given a repository, this method will download its files and return the readme text
Parameters
Expand Down Expand Up @@ -543,7 +545,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
logging.info(f"Trying to download {repo_archive_url}")
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
Expand Down Expand Up @@ -593,6 +595,7 @@ class GithubUrlError(Exception):
# print("The URL provided seems to be incorrect")
pass


def get_readme_content(readme_url):
"""Function to retrieve the content of a readme file given its URL (github)"""
readme_url = readme_url.replace("/blob/", "/raw/")
Expand Down
51 changes: 27 additions & 24 deletions src/somef/regular_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,30 +476,33 @@ def extract_bibtex(readme_text, repository_metadata: Result, readme_source) -> R
-------
@returns Result object with the bibtex associated with this software component
"""
bib_database = bibtexparser.loads(readme_text)
entries = bib_database.entries
for entry in entries:
# dumping the found fields does not seem to work, so rebuilding the object:
exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n"
for key, value in entry.items():
if key not in ('ENTRYTYPE', 'ID'):
exported_bibtex += f" {key} = {{{value}}},\n"
exported_bibtex += "}"
result = {
constants.PROP_VALUE: exported_bibtex,
constants.PROP_TYPE: constants.TEXT_EXCERPT,
constants.PROP_FORMAT: constants.FORMAT_BIB
}
if constants.PROP_DOI in entry:
result[constants.PROP_DOI] = entry[constants.PROP_DOI]
if constants.PROP_TITLE in entry:
result[constants.PROP_TITLE] = entry[constants.PROP_TITLE]
if constants.PROP_AUTHOR in entry:
result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR]
if constants.PROP_URL in entry:
result[constants.PROP_URL] = entry[constants.PROP_URL]
repository_metadata.add_result(constants.CAT_CITATION, result, 1,
constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source)
try:
bib_database = bibtexparser.loads(readme_text)
entries = bib_database.entries
for entry in entries:
# dumping the found fields does not seem to work, so rebuilding the object:
exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n"
for key, value in entry.items():
if key not in ('ENTRYTYPE', 'ID'):
exported_bibtex += f" {key} = {{{value}}},\n"
exported_bibtex += "}"
result = {
constants.PROP_VALUE: exported_bibtex,
constants.PROP_TYPE: constants.TEXT_EXCERPT,
constants.PROP_FORMAT: constants.FORMAT_BIB
}
if constants.PROP_DOI in entry:
result[constants.PROP_DOI] = entry[constants.PROP_DOI]
if constants.PROP_TITLE in entry:
result[constants.PROP_TITLE] = entry[constants.PROP_TITLE]
if constants.PROP_AUTHOR in entry:
result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR]
if constants.PROP_URL in entry:
result[constants.PROP_URL] = entry[constants.PROP_URL]
repository_metadata.add_result(constants.CAT_CITATION, result, 1,
constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source)
except Exception as e:
logging.warning("An error occurred when trying to extract bibtex from README " + str(e))
return repository_metadata


Expand Down
7 changes: 5 additions & 2 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
-------
@return: Dictionary with the results found by SOMEF, formatted as a Result object.
"""
# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s',
datefmt='%d-%b-%y %H:%M:%S', force=True)
logging.getLogger("bibtexparser").setLevel(logging.ERROR)

file_paths = configuration.get_configuration_file()
repo_type = constants.RepositoryType.GITHUB
repository_metadata = Result()
Expand Down Expand Up @@ -152,8 +157,6 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
logging.info("Completed extracting regular expressions")

return repository_metadata


except Exception as e:
logging.error("Error processing repository " + str(e))
return repository_metadata
Expand Down

0 comments on commit 3967b59

Please sign in to comment.