Merge pull request #625 from KnowledgeCaptureAndDiscovery/dev

Dev
KnowledgeCaptureAndDiscovery · Mar 7, 2024 · 3967b59
2 parents bbb8f7e + 39dc979
commit 3967b59
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,5 @@
-# Software Metadata Extraction Framework (SOMEF) 
+# Software Metadata Extraction Framework (SOMEF)
+[![Documentation Status](https://readthedocs.org/projects/somef/badge/?version=latest)](https://somef.readthedocs.io/en/latest/?badge=latest) 
 [![Python](https://img.shields.io/pypi/pyversions/somef.svg?style=plastic)](https://badge.fury.io/py/somef) [![PyPI](https://badge.fury.io/py/somef.svg)](https://badge.fury.io/py/somef) [![DOI](https://zenodo.org/badge/190487675.svg)](https://zenodo.org/badge/latestdoi/190487675) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)  [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
 
 <img src="docs/logo.png" alt="logo" width="150"/>

diff --git a/src/somef/__main__.py b/src/somef/__main__.py
@@ -17,10 +17,6 @@ class URLParamType(click.types.StringParamType):
 @click.version_option(__version__)
 def cli():
     click.echo("SOftware Metadata Extraction Framework (SOMEF) Command Line Interface")
-    # Logging setup
-    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s',
-                        datefmt='%d-%b-%y %H:%M:%S')
-    # logging.getLogger("bibtexparser").setLevel(logging.WARNING)
 
 
 @cli.command(help="Configure GitHub credentials and classifiers file path")

diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
@@ -1,4 +1,3 @@
-import base64
 import logging
 import os
 import zipfile
@@ -11,6 +10,7 @@
 from . import configuration
 from .process_results import Result
 
+
 # Constructs a template HTTP header, which:
 # - has a key for the authorization token if passed via the authorization argument, otherwise
 # - has a key for the authorization token if specified via config, otherwise
@@ -39,8 +39,9 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, **kwargs):
             rate_limit_remaining = response.headers["X-RateLimit-Remaining"]
             epochtime = int(response.headers["X-RateLimit-Reset"])
             date_reset = datetime.fromtimestamp(epochtime)
-            logging.info("Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str(
-            date_reset))
+            logging.info(
+                "Remaining GitHub API requests: " + rate_limit_remaining + " ### Next rate limit reset at: " + str(
+                    date_reset))
         if 'message' in response and 'API rate limit exceeded' in response['message']:
             rate_limited = True
             logging.warning(f"rate limited. Backing off for {initial_backoff} seconds")
@@ -324,7 +325,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
         return None
 
     logging.info(f"Loading Repository {repository_url} Information....")
-    
+
     # Create template header with optional authorization token
     header = header_template(authorization)
     header['accept'] = constants.GITHUB_ACCEPT_HEADER
@@ -438,7 +439,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
 
         # get releases
         releases_list_raw, date = rate_limit_get(repo_api_base_url + "/releases",
-                                             headers=header)
+                                                 headers=header)
         releases_list = releases_list_raw.json()
         if isinstance(releases_list, dict) and 'message' in releases_list.keys():
             logging.error("Releases Error: " + releases_list['message'])
@@ -492,7 +493,8 @@ def do_crosswalk(data, crosswalk_table):
     return output
 
 
-def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None, authorization=None):
+def download_repository_files(owner, repo_name, default_branch, repo_type, target_dir, repo_ref=None,
+                              authorization=None):
     """
     Given a repository, this method will download its files and return the readme text
     Parameters
@@ -543,7 +545,7 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
         logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
         repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
         logging.info(f"Trying to download {repo_archive_url}")
-        repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization)) 
+        repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
 
     if repo_download.status_code != 200:
         sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
@@ -593,6 +595,7 @@ class GithubUrlError(Exception):
     # print("The URL provided seems to be incorrect")
     pass
 
+
 def get_readme_content(readme_url):
     """Function to retrieve the content of a readme file given its URL (github)"""
     readme_url = readme_url.replace("/blob/", "/raw/")

diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
@@ -476,30 +476,33 @@ def extract_bibtex(readme_text, repository_metadata: Result, readme_source) -> R
     -------
     @returns Result object with the bibtex associated with this software component
     """
-    bib_database = bibtexparser.loads(readme_text)
-    entries = bib_database.entries
-    for entry in entries:
-        # dumping the found fields does not seem to work, so rebuilding the object:
-        exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n"
-        for key, value in entry.items():
-            if key not in ('ENTRYTYPE', 'ID'):
-                exported_bibtex += f"    {key} = {{{value}}},\n"
-        exported_bibtex += "}"
-        result = {
-            constants.PROP_VALUE: exported_bibtex,
-            constants.PROP_TYPE: constants.TEXT_EXCERPT,
-            constants.PROP_FORMAT: constants.FORMAT_BIB
-        }
-        if constants.PROP_DOI in entry:
-            result[constants.PROP_DOI] = entry[constants.PROP_DOI]
-        if constants.PROP_TITLE in entry:
-            result[constants.PROP_TITLE] = entry[constants.PROP_TITLE]
-        if constants.PROP_AUTHOR in entry:
-            result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR]
-        if constants.PROP_URL in entry:
-            result[constants.PROP_URL] = entry[constants.PROP_URL]
-        repository_metadata.add_result(constants.CAT_CITATION, result, 1,
-                                       constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source)
+    try:
+        bib_database = bibtexparser.loads(readme_text)
+        entries = bib_database.entries
+        for entry in entries:
+            # dumping the found fields does not seem to work, so rebuilding the object:
+            exported_bibtex = f"@{entry['ENTRYTYPE']}{{{entry['ID']},\n"
+            for key, value in entry.items():
+                if key not in ('ENTRYTYPE', 'ID'):
+                    exported_bibtex += f"    {key} = {{{value}}},\n"
+            exported_bibtex += "}"
+            result = {
+                constants.PROP_VALUE: exported_bibtex,
+                constants.PROP_TYPE: constants.TEXT_EXCERPT,
+                constants.PROP_FORMAT: constants.FORMAT_BIB
+            }
+            if constants.PROP_DOI in entry:
+                result[constants.PROP_DOI] = entry[constants.PROP_DOI]
+            if constants.PROP_TITLE in entry:
+                result[constants.PROP_TITLE] = entry[constants.PROP_TITLE]
+            if constants.PROP_AUTHOR in entry:
+                result[constants.PROP_AUTHOR] = entry[constants.PROP_AUTHOR]
+            if constants.PROP_URL in entry:
+                result[constants.PROP_URL] = entry[constants.PROP_URL]
+            repository_metadata.add_result(constants.CAT_CITATION, result, 1,
+                                           constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source)
+    except Exception as e:
+        logging.warning("An error occurred when trying to extract bibtex from README " + str(e))
     return repository_metadata
 
 

diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py
@@ -37,6 +37,11 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
     -------
     @return: Dictionary with the results found by SOMEF, formatted as a Result object.
     """
+    # Set up logging
+    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s-%(levelname)s-%(message)s',
+                        datefmt='%d-%b-%y %H:%M:%S', force=True)
+    logging.getLogger("bibtexparser").setLevel(logging.ERROR)
+
     file_paths = configuration.get_configuration_file()
     repo_type = constants.RepositoryType.GITHUB
     repository_metadata = Result()
@@ -152,8 +157,6 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
             logging.info("Completed extracting regular expressions")
 
         return repository_metadata
-
-
     except Exception as e:
         logging.error("Error processing repository " + str(e))
         return repository_metadata