From 9a76ddd3e4588f954fb3ee7aec80c5d9de495d6f Mon Sep 17 00:00:00 2001 From: Matthew Brady Date: Thu, 6 Jun 2024 22:25:14 +0100 Subject: [PATCH] v1.5 - Added logic for duplicate dependencies --- README.md | 92 ++++++++++++++++++++++------- bd_sig_filter/ComponentClass.py | 15 +++-- bd_sig_filter/ComponentListClass.py | 53 ++++++++++++++--- bd_sig_filter/SigEntryClass.py | 4 +- pyproject.toml | 2 +- 5 files changed, 131 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index a374d0b..2552062 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# bd_sig_filter +# bd_sig_filter - v1.5 BD Script to ignore components matched from Signature scan likely to be partial or invalid matches. ## PROVISION OF THIS SCRIPT @@ -10,10 +10,12 @@ If you have comments or issues, please raise a GitHub issue here. Synopsys suppo Black Duck Signature matching is a unique and powerful way to find OSS and 3rd party code within your applications and environments. -Signature matching uses hierarchical folder analysis to find matches with depth, identifying the most likely components matching the project. -Many competitive SCA solutions use individual file matching across all files in the project which is not effective -to identify component matches because the majority of files in a component do not change between versions, -so multiple matches will be identified for every file. +Signature matching uses hierarchical folder analysis to find matches with depth, identifying the most likely components +matching the project by examining all files in all folders as a whole. +Many competitive SCA solutions use individual file matching for files in the project, but this is not suitable +to identify component matches because the majority of files in components do not change between versions, +so multiple version matches will be identified for every file. It is therefore impossible to infer an overall component +version by looking at the individual files. 
However, Signature matching can still produce false positive matches, especially where template code hierarchies exist in custom and OSS code. @@ -29,16 +31,14 @@ name and version in the path to determine matches which are likely correct and o It can also ignore components only matched from paths which should be excluded (Synopsys tools, cache/config folders and test folders), and components which are duplicates across versions where the version string is not found -in the signature match path. +in the signature match path, or where one of the duplicates is a dependency. -Options can be used to enable ignore and review actions, and other features. +Options are available to enable ignore and review actions, and other features. ## PREREQUISITES - Python 3.8+ must be installed prior to using this script. ## INSTALLATION - The package can be installed using the command: python3 -m pip install bd-sig-filter @@ -52,7 +52,6 @@ Alternatively, the repository can be cloned and the script run directly using th python3 bd_sig_filter/bd_sig_filter.py OPTIONS ## USAGE - If installed as a package, run the utility using the command `bd-sig-filter`. 
Alternatively if you have cloned the repo, use a command similar to: @@ -61,7 +60,7 @@ Alternatively if you have cloned the repo, use a command similar to: The package can be invoked as follows: - usage: bd_sig_filter [-h] [--blackduck_url BLACKDUCK_URL] [--blackduck_api_token BLACKDUCK_API_TOKEN] [--blackduck_trust_cert] [-p PROJECT] [-v VERSION] [--debug] [--logfile LOGFILE] + usage: bd-sig-filter [-h] [--blackduck_url BLACKDUCK_URL] [--blackduck_api_token BLACKDUCK_API_TOKEN] [--blackduck_trust_cert] [-p PROJECT] [-v VERSION] [--debug] [--logfile LOGFILE] [--report_file REPORT_FILE] [--version_match_reqd] [--ignore] [--review] [--no_ignore_test] [--no_ignore_synopsys] [--no_ignore_defaults] [--ignore_no_path_matches] @@ -90,6 +89,8 @@ The package can be invoked as follows: --ignore_no_path_matches Also ignore components with no component/version match in signature path (Use with caution) + --report_unmatched Report the list of components which will be left Unreviewed and why - these may need + to be manually reviewed. The minimum required options are: @@ -102,7 +103,8 @@ Environment variables BLACKDUCK_URL, BLACKDUCK_API_TOKEN and BLACKDUCK_TRUST_CER ## SCRIPT BEHAVIOUR The default behaviour of the script is to create a table of BOM components with details about what actions can be taken. -By default no actions will be taken, with only the table being created. +By default, no actions will be taken, with only the tables being created to explain what would happen if `--ignore` and `--review` +options were specified. 
An example of the output table is shown below: @@ -120,6 +122,8 @@ An example of the output table is shown below: Amazon MSK Library for AW/2.0.2 Dep+Sig False False False True Mark REVIEWED - Dependency Apache HttpComponents Cor/5.2.4 Sig False False True False Mark IGNORED - compname or version not found in paths & --ignore_no_path_matches set WSDL4J/1.5.1 Sig False False False False No Action + Xalan Java Serializer/2.7.2 Sig False False False False No Action - Is a duplicate of dependency 'Xalan Java Serializer/2.7.3', has different component id/version but version found in sigpaths + Xalan Java Serializer/2.7.3 Dep False False False True Mark REVIEWED - Dependency Note component names are truncated at 25 characters. @@ -146,25 +150,71 @@ The following options can be specified: as it may exclude components which are legitimate (the Signature match path does not have to include the component name or version). -The options --report_file and --logfile can be used to output the tabular report and logging data to +The options `--report_file` and `--logfile` can be used to output the tabular report and logging data to specified files. ## PROPOSED WORKFLOW - -The script provides automatic classification of Signature scan results. +The script can classify Signature scan results. It can mark components as reviewed which are either Dependencies, or which have signature match paths containing -the component name (and optionally component version) and therefore highly likely to be correctly identified +the component name (and optionally component version) and which are therefore highly likely to be correctly identified by Signature matching. Fuzzy pattern matching is used so there is the possibility that components could be marked as reviewed where only a partial match exists, or components which should be matched -are not identified meaning that manual curation may still be required. +are not identified meaning that some manual curation may still be required. 
It will also ignore components only matched within extraneous folders (for example created by Synopsys tools, config/cache folders or test folders). Components shown with `No action` are Signature matches where the component name or version could not be identified in the signature paths, so they are potential false matches and require manual review. -Specify the `--ignore_no_path_matches` option to ignore these components automatically. -Duplicate components with multiple versions where the version -is not found in the signature match path are also marked as ignored. - +Specify the `--ignore_no_path_matches` option to ignore these components automatically; +however, this should be used with caution, as these components may be valid and should be manually reviewed. + +## PROCESSING DUPLICATE COMPONENTS +The script processes multiple versions of the same component in the BOM in several ways as described below: + +### SCENARIO 1 +- Comp1 and Comp2 are different versions of the same component +- Comp1 and Comp2 are BOTH dependencies + +Outcome: +- Comp1 will be marked REVIEWED +- Comp2 will be marked REVIEWED + +### SCENARIO 2 +- Comp1 and Comp2 are different versions of the same component +- Comp1 is a dependency and Comp2 is a signature match +- Comp2 name IS found but version string is NOT found in the Signature match paths + +Outcome: +- Comp1 will be marked REVIEWED +- Comp2 will be marked IGNORED + +### SCENARIO 3 +- Comp1 and Comp2 are different versions of the same component +- Comp1 is a dependency and Comp2 is a signature match +- Comp2 name and version strings ARE found in the Signature match paths + +Outcome: +- Comp1 will be marked REVIEWED +- Comp2 will be left unignored and not marked reviewed - for manual review + +### SCENARIO 4 +- Comp1 and Comp2 are different versions of the same component +- Comp1 and Comp2 are both signature matches +- Comp1 name and version strings ARE both found in the Signature match paths +- Comp2 name IS found but version string 
is NOT found in the Signature match paths + +Outcome: +- Comp1 will be marked REVIEWED +- Comp2 will be IGNORED + +### SCENARIO 5 +- Comp1 and Comp2 are different versions of the same component +- Comp1 and Comp2 are both signature matches +- Comp1 name string IS found but version string is NOT found in the Signature match paths +- Comp2 name string IS found but version string is NOT found in the Signature match paths + +Outcome: +- Comp1 will be marked REVIEWED +- Comp2 will be left unignored and not reviewed - for manual review diff --git a/bd_sig_filter/ComponentClass.py b/bd_sig_filter/ComponentClass.py index 9f1d507..abfd9da 100644 --- a/bd_sig_filter/ComponentClass.py +++ b/bd_sig_filter/ComponentClass.py @@ -48,7 +48,8 @@ def is_dependency(self): return False def is_signature(self): - sig_types = ['FILE_EXACT', 'FILE_SOME_FILES_MODIFIED', 'FILE_FILES_ADDED_DELETED_AND_MODIFIED'] + sig_types = ['FILE_EXACT', 'FILE_SOME_FILES_MODIFIED', 'FILE_FILES_ADDED_DELETED_AND_MODIFIED', + 'FILE_EXACT_FILE_MATCH'] match_types = self.get_matchtypes() for m in sig_types: if m in match_types: @@ -60,6 +61,10 @@ def is_only_signature(self): def set_ignore(self): self.ignore = True + self.mark_reviewed = False + + def set_unignore(self): + self.ignore = False def get_reviewed_status(self): try: @@ -70,9 +75,11 @@ def get_reviewed_status(self): return False def set_reviewed(self): - if not self.get_reviewed_status(): - self.mark_reviewed = True - return + self.mark_reviewed = True + self.ignore = False + + def set_notreviewed(self): + self.mark_reviewed = False def is_ignored(self): try: diff --git a/bd_sig_filter/ComponentListClass.py b/bd_sig_filter/ComponentListClass.py index 1fa263e..3fe8093 100644 --- a/bd_sig_filter/ComponentListClass.py +++ b/bd_sig_filter/ComponentListClass.py @@ -103,34 +103,73 @@ def process(self): comp.reason = 'No action - not Signature match' # look for duplicate components (same compid) and ignore - logging.debug("\nDUPLICATE SIGNATURE MATCHES 
FILTER PHASE") + logging.debug("\nDUPLICATE COMPONENT FILTER PHASE") for i in range(len(self.components)): comp1 = self.components[i] - if comp1.is_ignored() or comp1.is_dependency() or not comp1.is_only_signature() or comp1.ignore: + if comp1.is_ignored() or comp1.ignore: continue for j in range(i + 1, len(self.components)): comp2 = self.components[j] - if comp2.is_ignored() or comp2.is_dependency() or not comp2.is_only_signature() or comp2.ignore: + if comp2.is_ignored() or comp2.ignore: continue - - if comp1.get_compid() == comp2.get_compid(): - if comp1.compname_found and not comp2.compname_found: - logging.debug(f"IGNORING {comp2.name}/{comp2.version} as it is a duplicate to {comp1.name}/{comp1.version}") + if comp1.get_compid() == comp2.get_compid() or comp1.name.lower() == comp2.name.lower(): + if comp1.is_dependency() and comp2.is_dependency(): + continue + elif comp1.is_dependency(): + if comp2.is_only_signature() and not comp2.compver_found: + logging.debug(f"IGNORING {comp2.name[:25]}/{comp2.version} as it has no version in sigpaths and is a duplicate to {comp1.name[:25]}/{comp1.version} which is a dependency") + comp2.reason = f"Mark IGNORED - Is a duplicate of dependency '{comp1.name[:25]}/{comp1.version[:10]}', has different component id or version and no version in sigpaths" + comp2.set_ignore() + comp2.set_notreviewed() + else: + logging.debug( + f"No Action for {comp2.name[:25]}/{comp2.version} as it has version in sigpaths but is a duplicate to {comp1.name[:25]}/{comp1.version} which is a dependency") + comp2.reason = f"No Action - Is a duplicate of dependency '{comp1.name[:25]}/{comp1.version[:10]}', has different component id/version but version found in sigpaths" + comp2.set_notreviewed() + comp2.set_unignore() + elif comp2.is_dependency(): + if comp1.is_only_signature() and not comp1.compver_found: + logging.debug( + f"IGNORING {comp1.name[:25]}/{comp1.version} as it has no version in sigpaths and is a duplicate to 
{comp2.name[:25]}/{comp2.version} which is a dependency") + comp1.reason = f"Mark IGNORED - Is a duplicate of dependency '{comp2.name[:25]}/{comp2.version[:10]}' but has different component id or version and no version in sigpaths" + comp1.set_ignore() + comp1.set_notreviewed() + else: + logging.debug( + f"No Action for {comp1.name[:25]}/{comp1.version} as it has version in sigpaths but is a duplicate to {comp2.name[:25]}/{comp2.version} which is a dependency") + comp1.reason = f"No Action - Is a duplicate of dependency '{comp2.name[:25]}/{comp2.version[:10]}', has different component id/version but version found in sigpaths" + comp1.set_notreviewed() + comp1.set_unignore() + + elif comp1.compname_found and not comp2.compname_found: + logging.debug(f"IGNORING {comp2.name[:25]}/{comp2.version} as it is a duplicate to {comp1.name}/{comp1.version}") comp2.reason = f"Mark IGNORED - Is a duplicate of '{comp1.name[:25]}/{comp1.version[:10]}' but has no compname in Signature paths" comp2.set_ignore() + comp2.set_notreviewed() elif not comp1.compname_found and comp2.compname_found: logging.debug(f"IGNORING {comp1.name}/{comp1.version} as it is a duplicate to {comp2.name}/{comp2.version}") comp1.set_ignore() comp1.reason = f"Mark IGNORED - Is a duplicate to '{comp2.name[:25]}/{comp2.version[:10]}' but has no compname in Signature paths" + comp1.set_notreviewed() elif comp1.compver_found and not comp2.compver_found: logging.debug(f"Will ignore {comp2.name}/{comp2.version} as it is a duplicate to {comp1.name}/{comp1.version} and path misses version") comp2.set_ignore() comp2.reason = f"Mark IGNORED - Is a duplicate to '{comp1.name[:25]}/{comp1.version[:10]}' but has no version in Signature paths" + comp2.set_notreviewed() elif not comp1.compver_found and comp2.compver_found: logging.debug(f"Will ignore {comp1.name}/{comp1.version} as it is a duplicate to {comp2.name}/{comp2.version} and path misses version") comp1.set_ignore() comp1.reason = f"Mark IGNORED - Is a duplicate to 
'{comp2.name[:25]}/{comp2.version[:10]}' but has no version in Signature paths" + comp1.set_notreviewed() + elif not comp1.compver_found and not comp2.compver_found: + # Both components have no versions - mark comp1 reviewed + logging.debug(f"- Duplicate components {comp1.name}/{comp1.version} and {comp2.name}/{comp2.version} - " + f"{comp1.name} marked as REVIEWED") + comp1.set_reviewed() + comp1.reason = f"Mark REVIEWED - Is a duplicate to '{comp2.name[:25]}/{comp2.version[:10]}' but both have no version in Signature paths (chose '{comp1.version}')" + comp2.set_notreviewed() + comp2.reason = f"No Action - Is a duplicate to '{comp1.name[:25]}/{comp1.version[:10]}' but both have no version in Signature paths (chose '{comp1.version}')" else: # Nothing to do logging.debug(f"- Will retain both components {comp1.filter_name}/{comp1.filter_version} and {comp2.filter_name}/{comp2.filter_version} - " diff --git a/bd_sig_filter/SigEntryClass.py b/bd_sig_filter/SigEntryClass.py index aef0d0d..62cbc42 100644 --- a/bd_sig_filter/SigEntryClass.py +++ b/bd_sig_filter/SigEntryClass.py @@ -41,7 +41,7 @@ def search_component(self, compname_arr, compver): # comp_in_path = fuzz.token_set_ratio(compstring, newpath) compname_in_path = fuzz.token_set_ratio(cname, newpath) compver_in_path = fuzz.token_set_ratio(compver, newpath) - if compname_in_path + compver_in_path > 90: + if compname_in_path + compver_in_path > 100: if compname_in_path + compver_in_path > best_match_name + best_match_ver: best_match_name = compname_in_path best_match_ver = compver_in_path @@ -52,7 +52,7 @@ def search_component(self, compname_arr, compver): ver_bool = False if best_match_name > 45: name_bool = True - if best_match_ver > 45: + if best_match_ver > 60: ver_bool = True return name_bool, ver_bool, best_match_name + best_match_ver diff --git a/pyproject.toml b/pyproject.toml index 2d87a29..c97ec03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" 
[project] name = "bd_sig_filter" -version = "1.4" +version = "1.5" authors = [ { name="Matthew Brady", email="mbrad@synopsys.com" }, ]