From 2a8fee5178e4216df6b516d49469daf8ccd179e8 Mon Sep 17 00:00:00 2001 From: Thomas Druez Date: Tue, 18 Jul 2023 14:46:14 +0400 Subject: [PATCH 1/9] Prototype implementation of a C/C++ d2d pipeline #812 Signed-off-by: Thomas Druez --- scanpipe/filters.py | 3 + scanpipe/pipelines/d2d_dwarf.py | 50 ++++++++++++ scanpipe/pipes/d2d.py | 77 +++++++++++++++++++ scanpipe/pipes/input.py | 6 +- scanpipe/pipes/scancode.py | 15 +++- .../templates/scanpipe/relation_list.html | 2 +- setup.cfg | 1 + 7 files changed, 149 insertions(+), 5 deletions(-) create mode 100644 scanpipe/pipelines/d2d_dwarf.py diff --git a/scanpipe/filters.py b/scanpipe/filters.py index 75b4970c8..e23910736 100644 --- a/scanpipe/filters.py +++ b/scanpipe/filters.py @@ -307,6 +307,8 @@ def __init__(self, *args, **kwargs): ("any", "Any map"), ("many", "Many map"), ("about_file", "about file"), + ("dwarf_compiled_paths", "dwarf compiled paths"), + ("dwarf_included_paths", "dwarf included paths"), ("java_to_class", "java to class"), ("jar_to_source", "jar to source"), ("js_compiled", "js compiled"), @@ -373,6 +375,7 @@ class ResourceFilterSet(FilterSetUtilsMixin, django_filters.FilterSet): relation_map_type = RelationMapTypeFilter( label="Relation map type", field_name="related_from__map_type", + distinct=True, ) class Meta: diff --git a/scanpipe/pipelines/d2d_dwarf.py b/scanpipe/pipelines/d2d_dwarf.py new file mode 100644 index 000000000..a560501b1 --- /dev/null +++ b/scanpipe/pipelines/d2d_dwarf.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. 
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from scanpipe.pipelines.deploy_to_develop import DeployToDevelop +from scanpipe.pipes import d2d +from scanpipe.pipes import input + + +class DWARF(DeployToDevelop): + """ELFs and DWARFs.""" + + @classmethod + def steps(cls): + return ( + cls.get_inputs, + cls.build_inventory_from_scans, + cls.flag_ignored_resources, + cls.map_dwarf_paths, + cls.flag_mapped_resources_and_ignored_directories, + ) + + def build_inventory_from_scans(self): + """Build inventories""" + for input_path, tag in [(self.from_file, "from"), (self.to_file, "to")]: + input.load_inventory_from_toolkit_scan( + self.project, input_path, resource_defaults={"tag": tag} + ) + + def map_dwarf_paths(self): + """Map DWARF paths""" + d2d.map_dwarf_path(project=self.project, logger=self.log) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 6d9ab9a61..d5b124736 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -840,3 +840,80 @@ def _map_javascript_path_resource( extra_data=extra_data, ) return len(transpiled) + + +def _map_dwarf_path_resource(to_resource, from_resources, from_resources_index): + dwarf_source_paths = 
to_resource.extra_data.get("dwarf_source_paths") or {} + compiled_paths = dwarf_source_paths.get("compiled_paths") or [] + included_paths = dwarf_source_paths.get("included_paths") or [] + paths_and_map_type = [ + (compiled_paths, "dwarf_compiled_paths"), + (included_paths, "dwarf_included_paths"), + ] + + for paths, map_type in paths_and_map_type: + for path in paths: + match = pathmap.find_paths(path, from_resources_index) + if not match: + continue + + # Only create relations when the number of matches if inferior or equal to + # the current number of path segment matched. + if len(match.resource_ids) > match.matched_path_length: + to_resource.update(status=flag.TOO_MANY_MAPS) + continue + + for resource_id in match.resource_ids: + from_resource = from_resources.get(id=resource_id) + + # Do not count the "to/" segment as it is not "matchable" + to_path_length = len(to_resource.path.split("/")) - 1 + extra_data = { + "path_score": f"{match.matched_path_length}/{to_path_length}", + } + + pipes.make_relation( + from_resource=from_resource, + to_resource=to_resource, + map_type=map_type, + extra_data=extra_data, + ) + + +def map_dwarf_path(project, logger=None): + """Map DWARF paths suffix similarities.""" + project_files = project.codebaseresources.files().no_status() + from_resources = project_files.from_codebase() + to_resources = project_files.to_codebase().has_no_relation() + + # TODO: Review the performances of this + # Replace by something along: .exclude(extra_data__dwarf_source_paths=[]) + to_resources = to_resources.json_field_contains( + field_name="extra_data", + value="dwarf_source_paths", + ) + resource_count = to_resources.count() + + if logger: + logger( + f"Mapping {resource_count:,d} to/ resources using DWARF path map " + f"against from/ codebase" + ) + + from_resources_index = pathmap.build_index( + from_resources.values_list("id", "path"), with_subpaths=True + ) + + resource_iterator = to_resources.iterator(chunk_size=2000) + last_percent = 0 + 
start_time = timer() + for resource_index, to_resource in enumerate(resource_iterator): + last_percent = pipes.log_progress( + logger, + resource_index, + resource_count, + last_percent, + increment_percent=10, + start_time=start_time, + ) + _map_dwarf_path_resource(to_resource, from_resources, from_resources_index) diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index fa52e02b5..206d24c2d 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -65,14 +65,16 @@ def get_tool_name_from_scan_headers(scan_data): return tool_name -def load_inventory_from_toolkit_scan(project, input_location): +def load_inventory_from_toolkit_scan(project, input_location, resource_defaults=None): """ Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan results located at ``input_location``. """ scanned_codebase = scancode.get_virtual_codebase(project, input_location) scancode.create_discovered_packages(project, scanned_codebase) - scancode.create_codebase_resources(project, scanned_codebase) + scancode.create_codebase_resources( + project, scanned_codebase, defaults=resource_defaults + ) scancode.create_discovered_dependencies( project, scanned_codebase, strip_datafile_path_root=True ) diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index c52dd30a0..3303cfb0a 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -466,7 +466,7 @@ def get_virtual_codebase(project, input_location): return VirtualCodebase(input_location, temp_dir=str(temp_path), max_in_memory=0) -def create_codebase_resources(project, scanned_codebase): +def create_codebase_resources(project, scanned_codebase, defaults=None): """ Save the resources of a ScanCode `scanned_codebase` scancode.resource.Codebase object to the database as a CodebaseResource of the `project`. 
@@ -488,7 +488,18 @@ def create_codebase_resources(project, scanned_codebase): resource_type = "FILE" if scanned_resource.is_file else "DIRECTORY" resource_data["type"] = CodebaseResource.Type[resource_type] - resource_path = scanned_resource.get_path(strip_root=True) + # TODO review this: + # resource_path = scanned_resource.get_path(strip_root=True) + resource_path = scanned_resource.get_path(strip_root=False) + + if dwarf_source_paths := getattr(scanned_resource, "dwarf_source_paths", None): + resource_data["extra_data"] = {"dwarf_source_paths": dwarf_source_paths} + + if defaults: + resource_data.update(defaults) + # TODO: Use a new path_prefix attribute? + if tag := defaults.get("tag"): + resource_path = f"{tag}/{resource_path.lstrip('/')}" codebase_resource, _ = CodebaseResource.objects.get_or_create( project=project, diff --git a/scanpipe/templates/scanpipe/relation_list.html b/scanpipe/templates/scanpipe/relation_list.html index 01ce97899..96d205383 100644 --- a/scanpipe/templates/scanpipe/relation_list.html +++ b/scanpipe/templates/scanpipe/relation_list.html @@ -31,7 +31,7 @@ {{ resource.status }} - + {{ relation.map_type }} {% if relation.extra_data.path_score %} {{ relation.extra_data.path_score }} diff --git a/setup.cfg b/setup.cfg index a9d91e666..0b876d3af 100644 --- a/setup.cfg +++ b/setup.cfg @@ -120,6 +120,7 @@ scancodeio_pipelines = deploy_to_develop = scanpipe.pipelines.deploy_to_develop:DeployToDevelop docker = scanpipe.pipelines.docker:Docker docker_windows = scanpipe.pipelines.docker_windows:DockerWindows + d2d_dwarf = scanpipe.pipelines.d2d_dwarf:DWARF find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities inspect_manifest = scanpipe.pipelines.inspect_manifest:InspectManifest load_inventory = scanpipe.pipelines.load_inventory:LoadInventory From 92f21f43812054d60841b89099fcf9a2d70ec8c8 Mon Sep 17 00:00:00 2001 From: Thomas Druez Date: Wed, 19 Jul 2023 15:17:35 +0400 Subject: [PATCH 2/9] Use proper JSON lookup in 
map_dwarf_path #812 Signed-off-by: Thomas Druez --- scanpipe/pipes/d2d.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index d5b124736..ec14ea3ef 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -885,13 +885,7 @@ def map_dwarf_path(project, logger=None): project_files = project.codebaseresources.files().no_status() from_resources = project_files.from_codebase() to_resources = project_files.to_codebase().has_no_relation() - - # TODO: Review the performances of this - # Replace by something along: .exclude(extra_data__dwarf_source_paths=[]) - to_resources = to_resources.json_field_contains( - field_name="extra_data", - value="dwarf_source_paths", - ) + to_resources = to_resources.filter(extra_data__has_key="dwarf_source_paths") resource_count = to_resources.count() if logger: From b2f9b4cd309fa625a2d68eae65e03234f8b59167 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sun, 20 Aug 2023 19:23:05 +0200 Subject: [PATCH 3/9] Improve dwarf mappings Signed-off-by: Philippe Ombredanne --- scanpipe/pipes/__init__.py | 62 ++++++++++++++++++---- scanpipe/pipes/d2d.py | 104 ++++++++++++++++++++++++++++--------- 2 files changed, 132 insertions(+), 34 deletions(-) diff --git a/scanpipe/pipes/__init__.py b/scanpipe/pipes/__init__.py index 374785cee..c92146bfc 100644 --- a/scanpipe/pipes/__init__.py +++ b/scanpipe/pipes/__init__.py @@ -186,25 +186,55 @@ def update_or_create_dependency( def get_or_create_relation(project, relation_data): """ - Get or create a CodebaseRelation then return it. + Get or create a CodebaseRelation then return it. The support for update is not useful as there is no fields on the model that could be updated. 
""" + resource_qs = project.codebaseresources + from_resource_path = relation_data.get("from_resource") + from_resource = resource_qs.get(path=from_resource_path) + to_resource_path = relation_data.get("to_resource") - resource_qs = project.codebaseresources + to_resource = resource_qs.get(path=to_resource_path) + + map_type = relation_data.get("map_type") + + codebase_relation, _ = get_or_create_relation_from_resources( + from_resource=from_resource, + to_resource=to_resource, + map_type=map_type, + ) + return codebase_relation + + +def get_or_create_relation_from_resources( + from_resource, to_resource, map_type, extra_data=None +): + """ + Get or create a Code baseRelationrelation of type ``map_type`` between the + ``from_resource`` and the ``to_resource`` and return it. + ``extra_data`` if any will override any pre-existing value for these. + """ codebase_relation, _ = CodebaseRelation.objects.get_or_create( - project=project, - from_resource=resource_qs.get(path=from_resource_path), - to_resource=resource_qs.get(path=to_resource_path), - map_type=relation_data.get("map_type"), + project=from_resource.project, + from_resource=from_resource, + to_resource=to_resource, + map_type=map_type, ) + if extra_data: + codebase_relation.extra_data = extra_data + codebase_relation.save() return codebase_relation def make_relation(from_resource, to_resource, map_type, **extra_fields): + """ + Create a Code baseRelationrelation of type ``map_type`` between the + ``from_resource`` and the ``to_resource`` and return it. + """ return CodebaseRelation.objects.create( project=from_resource.project, from_resource=from_resource, @@ -303,11 +333,14 @@ def log_progress( return last_percent -def get_text_str_diff_ratio(str_a, str_b): +def get_text_str_diff_ratio(str_a, str_b, as_lines=True): """ Return a similarity ratio as a float between 0 and 1 by comparing the text content of the ``str_a`` and ``str_b``. + Split the text in lines and compare lines if ``as_lines`` is True. 
+ Otherwise, process the input as-is. + Return None if any of the two resources str is empty. """ if not (str_a and str_b): @@ -315,20 +348,29 @@ def get_text_str_diff_ratio(str_a, str_b): if not isinstance(str_a, str) or not isinstance(str_b, str): raise ValueError("Values must be str") - - matcher = difflib.SequenceMatcher(a=str_a.splitlines(), b=str_b.splitlines()) + if as_lines: + a = str_a.splitlines() + b = str_b.splitlines() + else: + a = str_a + b = str_b + matcher = difflib.SequenceMatcher(a=a, b=b) return matcher.quick_ratio() -def get_resource_diff_ratio(resource_a, resource_b): +def get_resource_diff_ratio(resource_a, resource_b, as_lines=True): """ Return a similarity ratio as a float between 0 and 1 by comparing the text content of the CodebaseResource ``resource_a`` and ``resource_b``. + Split the text in lines and compare lines if ``as_lines`` is True. + Otherwise, compare the files text content as-is. + Return None if any of the two resources are not readable as text. """ with suppress(IOError): return get_text_str_diff_ratio( str_a=resource_a.file_content, str_b=resource_b.file_content, + as_lines=as_lines, ) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 7c3908da9..cfa73c9ae 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -991,46 +991,87 @@ def flag_processed_archives(project): to_archive.update(status=flag.ARCHIVE_PROCESSED) -def _map_dwarf_path_resource(to_resource, from_resources, from_resources_index): +def _map_dwarf_path_resource( + to_resource, from_resources, from_resources_index, logger=None, +): + """ + Map DWARF dwarf_paths found in the ``to_resource`` extrac data to + dwarf_paths of the ``from_resources`` CodebaseResource queryset using the + precomputed ``from_resources_index`` path index. 
+ """ dwarf_source_paths = to_resource.extra_data.get("dwarf_source_paths") or {} compiled_paths = dwarf_source_paths.get("compiled_paths") or [] included_paths = dwarf_source_paths.get("included_paths") or [] - paths_and_map_type = [ + dwarf_paths_and_map_type = [ (compiled_paths, "dwarf_compiled_paths"), (included_paths, "dwarf_included_paths"), ] - for paths, map_type in paths_and_map_type: - for path in paths: - match = pathmap.find_paths(path, from_resources_index) - if not match: - continue + dpnm = to_resource.extra_data["dwarf_paths_not_mapped"] = [] + relations = {} + + for dwarf_paths, map_type in dwarf_paths_and_map_type: + for dwarf_path in dwarf_paths: - # Only create relations when the number of matches if inferior or equal to - # the current number of path segment matched. - if len(match.resource_ids) > match.matched_path_length: - to_resource.update(status=flag.TOO_MANY_MAPS) + match = pathmap.find_paths(dwarf_path, from_resources_index) + if not match: + dpnm.append(dwarf_path) continue - for resource_id in match.resource_ids: - from_resource = from_resources.get(id=resource_id) + # short dwarf path matched more than once is treated as not mapped for now + matched_path_length = match.matched_path_length - # Do not count the "to/" segment as it is not "matchable" - to_path_length = len(to_resource.path.split("/")) - 1 - extra_data = { - "path_score": f"{match.matched_path_length}/{to_path_length}", - } + if matched_path_length == 1 and len(match.resource_ids) != 1: + dpnm.append(dwarf_path) + continue - pipes.make_relation( - from_resource=from_resource, + # Sort match by most similar to the From/ side dwarf_path e.g. if we match + # some/foo/bar/baz.c and this/other/foo/bar/baz.c and the From is + # that/foo/bar/baz.c, some/foo/bar/baz.c has the most segments + # matched wins, e.g., the shortest From/ path wins. 
+ matched_from_resources = [ + from_resources.get(id=rid) for rid in match.resource_ids + ] + matched_from_resources.sort(key=lambda res: (len(res.path.strip("/").split("/")), res.path)) + winning_from_resource = matched_from_resources[0] + + # Do not count the "to/" segment as it is not "matchable" + # always strip leading segment ("to" or "from" first segment) + dwarf_path_length = len(dwarf_path.strip("/").split("/")) - 1 + + extra_data = { + "path_score": f"{matched_path_length}/{dwarf_path_length}", + "dwarf_path": dwarf_path, + } + + rel_key = (winning_from_resource.path, to_resource.path, map_type) + if rel_key not in relations: + relation = CodebaseRelation( + project=winning_from_resource.project, + from_resource=winning_from_resource, to_resource=to_resource, map_type=map_type, extra_data=extra_data, ) + relations[rel_key] = relation + + if relations: + rels = CodebaseRelation.objects.bulk_create(relations.values()) + if logger: + logger(f"Created {len(rels)} mapping using DWARF for: {to_resource.path!r}") + else: + if logger: + logger(f"No mapping using DWARF for: {to_resource.path!r}") + + if dpnm: + # save the "dwarf_paths_not_mapped" extra data + to_resource.save() + if logger: + logger(f"WARNING: DWARF paths NOT mapped for: {to_resource.path!r}: " + ", ".join(map(repr, dpnm))) def map_dwarf_path(project, logger=None): - """Map DWARF paths suffix similarities.""" + """Map DWARF paths using similarities of path suffixes.""" project_files = project.codebaseresources.files().no_status() from_resources = project_files.from_codebase() to_resources = project_files.to_codebase().has_no_relation() @@ -1039,14 +1080,19 @@ def map_dwarf_path(project, logger=None): if logger: logger( - f"Mapping {resource_count:,d} to/ resources using DWARF path map " - f"against from/ codebase" + f"Mapping {resource_count:,d} to/ resources using DWARF paths " + f"with {from_resources.count():,d} from/ resources." 
) from_resources_index = pathmap.build_index( from_resources.values_list("id", "path"), with_subpaths=True ) + if logger: + logger( + f"Done building from/ resources index." + ) + resource_iterator = to_resources.iterator(chunk_size=2000) last_percent = 0 start_time = timer() @@ -1059,4 +1105,14 @@ def map_dwarf_path(project, logger=None): increment_percent=10, start_time=start_time, ) - _map_dwarf_path_resource(to_resource, from_resources, from_resources_index) + if logger: + logger( + f"Mapping to/ resource: {to_resource.path!r} using DWARF paths." + ) + + _map_dwarf_path_resource( + to_resource, + from_resources, + from_resources_index, + logger=logger, + ) From 0505cbe620d94ff8297b5af31a7e698a808daff9 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sun, 20 Aug 2023 19:31:05 +0200 Subject: [PATCH 4/9] Add elf d2d map filters Signed-off-by: Philippe Ombredanne --- scanpipe/filters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scanpipe/filters.py b/scanpipe/filters.py index 8dd1811e7..3265ab3f5 100644 --- a/scanpipe/filters.py +++ b/scanpipe/filters.py @@ -344,6 +344,8 @@ def filter(self, qs, value): ("js_path", "js path"), ("path", "path"), ("sha1", "sha1"), + ("dwarf_included_paths", "dwarf_included_paths"), + ("dwarf_compiled_paths", "dwarf_compiled_paths"), ) @@ -427,6 +429,7 @@ class ResourceFilterSet(FilterSetUtilsMixin, django_filters.FilterSet): relation_map_type = RelationMapTypeFilter( label="Relation map type", field_name="related_from__map_type", + distinct=True, ) class Meta: From d75c1927d4f1aa8525ba00b634cd703ace5d1a76 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 21 Aug 2023 14:49:09 +0200 Subject: [PATCH 5/9] Use correct name Signed-off-by: Philippe Ombredanne --- scanpipe/pipelines/d2d_dwarf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/pipelines/d2d_dwarf.py b/scanpipe/pipelines/d2d_dwarf.py index a560501b1..b024d7dbe 100644 --- a/scanpipe/pipelines/d2d_dwarf.py +++ 
b/scanpipe/pipelines/d2d_dwarf.py @@ -35,7 +35,7 @@ def steps(cls): cls.build_inventory_from_scans, cls.flag_ignored_resources, cls.map_dwarf_paths, - cls.flag_mapped_resources_and_ignored_directories, + cls.flag_mapped_resources_archives_and_ignored_directories, ) def build_inventory_from_scans(self): From 62a4a35ca1634ea07e44ddb4b656472cf79d3acd Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 21 Aug 2023 14:49:25 +0200 Subject: [PATCH 6/9] Fix typo Signed-off-by: Philippe Ombredanne --- scanpipe/pipes/d2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index cfa73c9ae..63c4b0e35 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -995,7 +995,7 @@ def _map_dwarf_path_resource( to_resource, from_resources, from_resources_index, logger=None, ): """ - Map DWARF dwarf_paths found in the ``to_resource`` extrac data to + Map DWARF dwarf_paths found in the ``to_resource`` extra_data to dwarf_paths of the ``from_resources`` CodebaseResource queryset using the precomputed ``from_resources_index`` path index. """ From 13e6c40e4341db3c3cabc2502a9e2d72e2824af3 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 7 Nov 2023 10:19:27 +0100 Subject: [PATCH 7/9] Improve path mapping logging Signed-off-by: Philippe Ombredanne --- scanpipe/pipes/pathmap.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipes/pathmap.py b/scanpipe/pipes/pathmap.py index 0de855376..a1d014cb7 100644 --- a/scanpipe/pipes/pathmap.py +++ b/scanpipe/pipes/pathmap.py @@ -92,7 +92,7 @@ def find_paths(path, index): return Match(matched_length, resource_ids) -def build_index(resource_id_and_paths, with_subpaths=True): +def build_index(resource_id_and_paths, with_subpaths=True, logger=None): """ Return an index (an index) built from a ``resource_id_and_paths`` iterable of tuples of (resource_id int, resource_path string). 
@@ -111,7 +111,7 @@ def build_index(resource_id_and_paths, with_subpaths=True): # create a new empty automaton. index = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_STRING) - for resource_id, resource_path in resource_id_and_paths: + for i, (resource_id, resource_path) in enumerate(resource_id_and_paths): segments = get_reversed_path_segments(resource_path) segments_count = len(segments) if with_subpaths: @@ -119,6 +119,9 @@ def build_index(resource_id_and_paths, with_subpaths=True): else: add_path(resource_id, segments, segments_count, index) + if logger: + logger(f"Indexed {i} total resources") + index.make_automaton() return index From 405ba816f56a3b434db3d3ff5b8539a0df511d18 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 7 Nov 2023 10:20:22 +0100 Subject: [PATCH 8/9] Import all extra data when importing a scan Signed-off-by: Philippe Ombredanne --- scanpipe/pipes/scancode.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py index ece9a6fcb..3c88b438f 100644 --- a/scanpipe/pipes/scancode.py +++ b/scanpipe/pipes/scancode.py @@ -491,31 +491,36 @@ def create_codebase_resources(project, scanned_codebase, defaults=None): """ for scanned_resource in scanned_codebase.walk(skip_root=True): resource_data = {} + extra_data = {} - for field in CodebaseResource._meta.fields: + known_field_names = set(field.name for field in CodebaseResource._meta.fields) + for field_name, value in scanned_resource.to_dict().items(): # Do not include the path as provided by the scanned_resource since it # includes the "root". The `get_path` method is used instead. 
- if field.name == "path": + if field_name == "path": continue - value = getattr(scanned_resource, field.name, None) if value is not None: - resource_data[field.name] = value + if field_name in known_field_names: + resource_data[field_name] = value + else: + extra_data[field_name] = value resource_type = "FILE" if scanned_resource.is_file else "DIRECTORY" resource_data["type"] = CodebaseResource.Type[resource_type] + # TODO review this: # resource_path = scanned_resource.get_path(strip_root=True) resource_path = scanned_resource.get_path(strip_root=False) - if dwarf_source_paths := getattr(scanned_resource, "dwarf_source_paths", None): - resource_data["extra_data"] = {"dwarf_source_paths": dwarf_source_paths} - if defaults: resource_data.update(defaults) # TODO: Use a new path_prefix attribute? if tag := defaults.get("tag"): resource_path = f"{tag}/{resource_path.lstrip('/')}" + if extra_data: + resource_data["extra_data"] = extra_data + codebase_resource, _ = CodebaseResource.objects.get_or_create( project=project, path=resource_path, From 48e583a0961db192afceb9698648bee2a5799d0b Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 7 Nov 2023 10:22:04 +0100 Subject: [PATCH 9/9] Support multiple inputs Signed-off-by: Philippe Ombredanne --- scanpipe/pipelines/d2d_dwarf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scanpipe/pipelines/d2d_dwarf.py b/scanpipe/pipelines/d2d_dwarf.py index b024d7dbe..120622118 100644 --- a/scanpipe/pipelines/d2d_dwarf.py +++ b/scanpipe/pipelines/d2d_dwarf.py @@ -38,12 +38,15 @@ def steps(cls): cls.flag_mapped_resources_archives_and_ignored_directories, ) + + def build_inventory_from_scans(self): """Build inventories""" - for input_path, tag in [(self.from_file, "from"), (self.to_file, "to")]: - input.load_inventory_from_toolkit_scan( - self.project, input_path, resource_defaults={"tag": tag} - ) + for input_paths, tag in [(self.from_files, "from"), (self.to_files, "to")]: + for input_path in 
input_paths: + input.load_inventory_from_toolkit_scan( + self.project, input_path, resource_defaults={"tag": tag} + ) def map_dwarf_paths(self): """Map DWARF paths"""