Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a regex in the config file for each resource to exclude certain types of files #29

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions hashtheplanet/config/config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
"""
This module handles the config file.
"""
from enum import Enum
import json
from typing import Dict, List

from hashtheplanet.resources.git_resource import GitResource
from hashtheplanet.resources.npm_resource import NpmResource

class ConfigField(Enum):
"""
This enum contains every field that can be found in the config file
"""
TARGETS = "targets"
EXCLUDE_REGEX = "exclude_regex"
Comment on lines +11 to +16
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Je ne vois pas vraiment l'intérêt d'utiliser une énumération ici plutôt qu'une classe pure. Plutôt que de devoir passer l'argument config_field à chaque fonction, ça pourrait être plus simple pour la maintenabilité d'avoir un fichier definition.py contenant les abstractions des noms de champs. En pratique je suis pas certain qu'abstraire les champs soit utile puisqu'on aimerait assurer la stabilité de l'interface proposée par le json, et qu'il vaudrait mieux juste déprécier des champs si on souhaite faire des changements.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Je pense qu'une énumération est mieux pour la compréhension des fields disponibles dans le fichier de config parce qu'elle permet d'avoir un namespace: ConfigField et surtout un type pour récupérer le field que l'on souhaite. Pour parler de maintenabilité, on pourrait déplacer la classe ConfigField dans le fichier definition.py ce qui permettrait de tout regrouper au même endroit.


class Config():
"""
This class implements methods to manipulate the config file.
Expand All @@ -25,15 +33,15 @@ def parse(self, config_path: str):
with open(config_path, "r", encoding="utf-8") as file_fp:
self._config = json.load(file_fp)

def get_targets(self, resource_name: str) -> List[str]:
def get(self, resource_name: str, config_field: ConfigField):
"""
This methods returns the targets used by the given resource.
This methods returns a field content used by the given resource.
"""
module_info: Dict = self._config.get(resource_name)
field_content: Dict = self._config.get(resource_name)

if module_info is None:
return []
return module_info.get("targets")
if not config_field or not field_content:
return None
return field_content.get(config_field.value)

def get_used_resources(self) -> List[str]:
"""
Expand Down
7 changes: 4 additions & 3 deletions hashtheplanet/core/hashtheplanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from sqlalchemy.orm import sessionmaker

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField
from hashtheplanet.executor.executor import Executor
from hashtheplanet.sql.db_connector import Base, DbConnector, Hash

Expand Down Expand Up @@ -90,10 +90,11 @@ def compute_hashs(self):
self._config.parse(self._input_file)

for resource_name in self._config.get_used_resources():
targets = self._config.get_targets(resource_name)
targets = self._config.get(resource_name, ConfigField.TARGETS) or []
exclude_regex = self._config.get(resource_name, ConfigField.EXCLUDE_REGEX)

for target in targets:
self._executor.execute(resource_name, target)
self._executor.execute(resource_name, target, exclude_regex)

logger.info("Computing done")

Expand Down
5 changes: 3 additions & 2 deletions hashtheplanet/executor/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module handles the resource executions.
"""
from importlib import import_module
from typing import Optional

from loguru import logger

Expand All @@ -17,7 +18,7 @@ def __init__(self, database: DbConnector, session_scope):
self._database = database
self._session_scope = session_scope

def execute(self, resource_name: str, target: str):
def execute(self, resource_name: str, target: str, exclude_regex: Optional[str] = None):
"""
This method executes a resource to compute hashes.
"""
Expand All @@ -31,4 +32,4 @@ def execute(self, resource_name: str, target: str):
return

resource_instance: Resource = getattr(module, resource_class_name)(self._database)
resource_instance.compute_hashes(self._session_scope, target)
resource_instance.compute_hashes(self._session_scope, target, exclude_regex)
14 changes: 9 additions & 5 deletions hashtheplanet/resources/git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import subprocess
import tempfile
from stat import S_ISDIR, S_ISREG
from typing import List, Tuple
from typing import List, Optional, Tuple

# third party imports
from git import GitCommandError, Repo
Expand Down Expand Up @@ -54,10 +54,11 @@ def get_all_files_from_commit(commit: Commit) -> List[Tuple[FilePath, BlobHash]]
file_list.append((blob.path, blob.hexsha))
return file_list

@staticmethod
def _hash_files(
self,
files: List[GitFileMetadata],
repo_dir_path: str
repo_dir_path: str,
exclude_regex: Optional[str]
) -> List[FileMetadata]:
"""
This method calculates the SHA256 hashes of input files.
Expand All @@ -69,6 +70,8 @@ def _hash_files(
os.chdir(repo_dir_path)

for (file_path, tag_name, blob_hash) in files:
if not self.should_save(exclude_regex, file_path):
continue
try:
# We need to use a subprocess and not the GitPython library
# because when we execute "git cat-file -p [blob]" with it, it always removes the \n from the last line.
Expand Down Expand Up @@ -192,7 +195,7 @@ def _filter_stored_tags(stored_versions: List[VersionTable], found_tags: List[Ta
result.append(found_tag)
return result

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method clones the repository from url, retrieves tags, compares each tags to retrieve only modified files,
computes their hashes and then stores the tags & files information in the database.
Expand All @@ -203,6 +206,7 @@ def compute_hashes(self, session_scope, target: str):

with tempfile.TemporaryDirectory() as tmp_dir_name:
try:
logger.info(f"Cloning {target}")
repo = self.clone_repository(target, tmp_dir_name)
except GitCommandError as error:
logger.warning(f"Error while cloning repository on {target}: {error}")
Expand All @@ -225,7 +229,7 @@ def compute_hashes(self, session_scope, target: str):
files += self._get_diff_files(tags)

logger.info("Generating hashes ...")
files_info = self._hash_files(files, tmp_dir_name)
files_info = self._hash_files(files, tmp_dir_name, exclude_regex)

logger.info("Saving hashes ...")
self._save_hashes(session_scope, files_info, tags, technology)
11 changes: 5 additions & 6 deletions hashtheplanet/resources/npm_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#standard imports
import tarfile
import tempfile
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple
import requests

# third party imports
Expand Down Expand Up @@ -61,8 +61,7 @@ def save_tar_to_disk(file_path: str, npm_module_name: str, version: str):
with open(file_path, 'wb') as file_fd:
file_fd.write(request.content)

@staticmethod
def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]:
def extract_hashes_from_tar(self, file_path: str, exclude_regex: Optional[str]) -> List[FileMetadata]:
"""
This method returns all hashes of all files contained in a tar file.
"""
Expand All @@ -72,7 +71,7 @@ def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]:
for member in tar.getmembers():
file = tar.extractfile(member)

if file is None:
if file is None or not self.should_save(exclude_regex, member.path):
continue
files.append((member.path, Hash.hash_bytes(file.read())))
return files
Expand All @@ -94,7 +93,7 @@ def _save_hashes(
self._database.insert_file(session, npm_module_name, file_path)
self._database.insert_or_update_hash(session, file_hash, npm_module_name, [version])

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method downloads all versions of an npm module and stores all the versions with their associated files
and hashes and stores them in the database.
Expand All @@ -108,6 +107,6 @@ def compute_hashes(self, session_scope, target: str):
file_path = f"{tmp_dir_name}/{target}-{version}.tgz"

self.save_tar_to_disk(file_path, target, version)
files_info[version] = self.extract_hashes_from_tar(file_path)
files_info[version] = self.extract_hashes_from_tar(file_path, exclude_regex)

self._save_hashes(session_scope, files_info, versions, target)
16 changes: 15 additions & 1 deletion hashtheplanet/resources/resource.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""
This module contains the base class for the resources.
"""
import re
from typing import Optional

from hashtheplanet.sql.db_connector import DbConnector

class Resource(): # pylint: disable=too-few-public-methods
Expand All @@ -12,8 +15,19 @@ class Resource(): # pylint: disable=too-few-public-methods
def __init__(self, database: DbConnector):
self._database = database

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method computes all the versions and their associated files & hashes and stores them in the database.
"""
raise NotImplementedError()

@staticmethod
def should_save(exclude_regex: str, file_path: str):
"""
This method permits to verify if the specified file should be saved in the database or not
"""
if not file_path:
return False
if not exclude_regex:
return True
return not re.search(exclude_regex, file_path)
6 changes: 4 additions & 2 deletions src/tech_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
"https://github.com/drupal/drupal.git",
"https://github.com/magento/magento2.git",
"https://github.com/joomla/joomla-cms.git"
]
],
"exclude_regex": "\\.php|tests\\/|test\\/|\\.gitignore|\\.package|\\.idea|qunit|\\.editorconfig|vendor\\/|\\.vue|\\.scss|\\.less"
},
"npm": {
"targets": [
"underscore",
"jquery"
]
],
"exclude_regex": "tests\\/|test\\/|\\.jshintrc|\\.npmignore|\\.bowerrc|\\.jscsrc|\\.bower.json|\\.eslintrc.json|\\.jshintignore|\\.eslintrc|\\.scss|\\.less|\\.vue|\\.scss|\\.less|\\.idea"
}
}
6 changes: 3 additions & 3 deletions tests/config/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from unittest.mock import MagicMock, mock_open, patch

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField

def get_mock_open(files: Dict[str, str]):
def open_mock(filename, *args, **kwargs):
Expand Down Expand Up @@ -61,9 +61,9 @@ def test_get_targets():
config = Config()

with patch.dict(config._config, {"git": {"targets": ["target1", "target2"]}}):
assert len(config.get_targets("git")) == 2
assert len(config.get("git", ConfigField.TARGETS)) == 2

assert len(config.get_targets("npm")) == 0
assert not config.get("npm", ConfigField.TARGETS)

def test_get_used_resources():
config = Config()
Expand Down
33 changes: 27 additions & 6 deletions tests/resources/test_git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,11 @@ def test_hash_files():

git_resource = GitResource(None)

# when there are no excluded files
with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path")
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", None)

assert sp_mock.call_count == 3
sp_mock.assert_called_with(['git', 'cat-file', '-p', 'e42f952edc48e2c085c206166bf4f1ead4d4b058'], shell=False)
Expand All @@ -114,11 +115,31 @@ def test_hash_files():
assert files_metadata[1][1] == "1.2.5"
assert files_metadata[1][2] == hashlib.sha256(blobs.get("e42f952edc48e2c085c206166bf4f1ead4d4b058")).hexdigest()

git_resource = GitResource(None)

# When the *.cfg files are excluded
with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", "\\.cfg$")

assert sp_mock.call_count == 2

getcwd_mock.assert_called_once()

assert chdir_mock.call_count == 2
chdir_mock.assert_called_with("/foobar/")

assert len(files_metadata) == 1

assert files_metadata[0][0] == "LICENSE"
assert files_metadata[0][1] == "1.2.3"
assert files_metadata[0][2] == hashlib.sha256(blobs.get("d159169d1050894d3ea3b98e1c965c4058208fe1")).hexdigest()

with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path")
files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path", None)

assert sp_mock.call_count == 1
sp_mock.assert_called_with(['git', 'cat-file', '-p', 'empty'], shell=False)
Expand All @@ -134,7 +155,7 @@ def test_hash_files():
with mock.patch.object(subprocess, "check_output", MagicMock(side_effect=ValueError("error"))) as mock_exec, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
git_resource._hash_files(git_files_metadata, "repo_dir_path")
git_resource._hash_files(git_files_metadata, "repo_dir_path", None)

getcwd_mock.assert_called_once()

Expand Down Expand Up @@ -411,7 +432,7 @@ def mock_tmp_dir():
session = MagicMock()
git_resource = GitResource(DbConnector())

git_resource.compute_hashes(session, repo_url)
git_resource.compute_hashes(session, repo_url, None)

# In this situation, we verify that by giving a good repo_url & a good tmp_dir_path
# we download the tags, calculate hash & store them in the database
Expand All @@ -420,7 +441,7 @@ def mock_tmp_dir():
mock_get_tag_files.assert_called_once_with(tags[0])
mock_filter_stored_tags.assert_called_once_with([], tags)
mock_get_diff_files.assert_called_once_with(tags)
mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path)
mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path, None)
mock_save_hashes.assert_called_once_with(session, "hashed files", tags, "foobar")

with patch.object(
Expand All @@ -435,7 +456,7 @@ def mock_tmp_dir():
patch.object(GitResource, "_hash_files", return_value="hashed files") as mock_hash_files, \
patch.object(GitResource, "_save_hashes") as mock_save_hashes, \
patch.object(DbConnector, "get_versions") as mock_get_versions:
git_resource.compute_hashes(MagicMock(), repo_url)
git_resource.compute_hashes(MagicMock(), repo_url, None)
mock_clone_repo.assert_called_once_with(repo_url, tmp_dir_path)

# In this situation, we verify that by giving a wrong repository we stop the function
Expand Down
Loading