aboutcode-org · tdruez · Jul 18, 2023 · Jul 19, 2023 · Aug 20, 2023 · Aug 20, 2023
diff --git a/scanpipe/filters.py b/scanpipe/filters.py
@@ -409,6 +409,8 @@ def filter(self, qs, value):
     ("js_path", "js path"),
     ("path", "path"),
     ("sha1", "sha1"),
+    ("dwarf_included_paths", "dwarf_included_paths"),
+    ("dwarf_compiled_paths", "dwarf_compiled_paths"),
 )
 
 
@@ -492,6 +494,7 @@ class ResourceFilterSet(FilterSetUtilsMixin, django_filters.FilterSet):
     relation_map_type = RelationMapTypeFilter(
         label="Relation map type",
         field_name="related_from__map_type",
+        distinct=True,
     )
 
     class Meta:

diff --git a/scanpipe/pipelines/d2d_dwarf.py b/scanpipe/pipelines/d2d_dwarf.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from scanpipe.pipelines.deploy_to_develop import DeployToDevelop
+from scanpipe.pipes import d2d
+from scanpipe.pipes import input
+
+
+class DWARF(DeployToDevelop):
+    """Map to/ resources to from/ resources loaded from scans using DWARF paths."""
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.get_inputs,
+            cls.build_inventory_from_scans,
+            cls.flag_ignored_resources,
+            cls.map_dwarf_paths,
+            cls.flag_mapped_resources_archives_and_ignored_directories,
+        )
+
+    def build_inventory_from_scans(self):
+        """Load the from/ and to/ scans, each with its ``tag`` resource default."""
+        # Pair each tag with the input files it applies to.
+        tagged_inputs = {"from": self.from_files, "to": self.to_files}
+        for tag, scan_paths in tagged_inputs.items():
+            for scan_path in scan_paths:
+                input.load_inventory_from_toolkit_scan(
+                    self.project, scan_path, resource_defaults={"tag": tag}
+                )
+
+    def map_dwarf_paths(self):
+        """Create relations between to/ and from/ resources using DWARF paths."""
+        d2d.map_dwarf_path(project=self.project, logger=self.log)
diff --git a/scanpipe/pipes/__init__.py b/scanpipe/pipes/__init__.py
@@ -239,25 +239,55 @@ def update_or_create_dependency(
 
 def get_or_create_relation(project, relation_data):
     """
-    Get  or create a CodebaseRelation then return it.
+    Get or create a CodebaseRelation then return it.
     The support for update is not useful as there is no fields on the model that
     could be updated.
     """
+    resource_qs = project.codebaseresources
+    # Both resources are looked up by path among the project resources.
     from_resource_path = relation_data.get("from_resource")
+    from_resource = resource_qs.get(path=from_resource_path)
+
     to_resource_path = relation_data.get("to_resource")
-    resource_qs = project.codebaseresources
+    to_resource = resource_qs.get(path=to_resource_path)
+
+    map_type = relation_data.get("map_type")
+    # The helper returns the relation itself, not a ``(relation, created)`` tuple.
+    codebase_relation = get_or_create_relation_from_resources(
+        from_resource=from_resource,
+        to_resource=to_resource,
+        map_type=map_type,
+    )
 
+    return codebase_relation
+
+
+def get_or_create_relation_from_resources(
+    from_resource, to_resource, map_type, extra_data=None
+):
+    """
+    Get or create a CodebaseRelation of type ``map_type`` between the
+    ``from_resource`` and the ``to_resource`` and return it.
+    ``extra_data``, if any, replaces the relation ``extra_data`` which is then saved.
+    """
     codebase_relation, _ = CodebaseRelation.objects.get_or_create(
-        project=project,
-        from_resource=resource_qs.get(path=from_resource_path),
-        to_resource=resource_qs.get(path=to_resource_path),
-        map_type=relation_data.get("map_type"),
+        project=from_resource.project,
+        from_resource=from_resource,
+        to_resource=to_resource,
+        map_type=map_type,
     )
+    if extra_data:
+        codebase_relation.extra_data = extra_data
+        codebase_relation.save()
 
     return codebase_relation
 
 
 def make_relation(from_resource, to_resource, map_type, **extra_fields):
+    """
+    Create a CodebaseRelation of type ``map_type`` between the
+    ``from_resource`` and the ``to_resource`` and return it.
+    """
     return CodebaseRelation.objects.create(
         project=from_resource.project,
         from_resource=from_resource,
@@ -387,32 +417,44 @@ def iter(self, iterator):
             yield item
 
 
-def get_text_str_diff_ratio(str_a, str_b):
+def get_text_str_diff_ratio(str_a, str_b, as_lines=True):
     """
     Return a similarity ratio as a float between 0 and 1 by comparing the
     text content of the ``str_a`` and ``str_b``.
 
+    Split the texts into lines and compare lines if ``as_lines`` is True.
+    Otherwise, compare the strings as-is, as sequences of characters.
+
     Return None if any of the two resources str is empty.
     """
     if not (str_a and str_b):
         return
 
     if not isinstance(str_a, str) or not isinstance(str_b, str):
         raise ValueError("Values must be str")
-
-    matcher = difflib.SequenceMatcher(a=str_a.splitlines(), b=str_b.splitlines())
+
+    if as_lines:
+        seq_a, seq_b = str_a.splitlines(), str_b.splitlines()
+    else:
+        # A str is used directly as a sequence of characters.
+        seq_a, seq_b = str_a, str_b
+    matcher = difflib.SequenceMatcher(a=seq_a, b=seq_b)
     return matcher.quick_ratio()
 
 
-def get_resource_diff_ratio(resource_a, resource_b):
+def get_resource_diff_ratio(resource_a, resource_b, as_lines=True):
     """
     Return a similarity ratio as a float between 0 and 1 by comparing the
     text content of the CodebaseResource ``resource_a`` and ``resource_b``.
 
+    Split the texts into lines and compare lines if ``as_lines`` is True.
+    Otherwise, compare the text contents of the files as-is.
+
     Return None if any of the two resources are not readable as text.
     """
     with suppress(IOError):
         return get_text_str_diff_ratio(
             str_a=resource_a.file_content,
             str_b=resource_b.file_content,
+            as_lines=as_lines,
         )
diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py
@@ -1409,3 +1409,130 @@ def flag_whitespace_files(project):
         # If resource contains only whitespace characters.
         if not non_whitespace_bytes:
             resource.update(status=flag.IGNORED_WHITESPACE_FILE)
+
+
+def _map_dwarf_path_resource(
+    to_resource, from_resources, from_resources_index, logger=None
+):
+    """
+    Map the DWARF paths found in the ``to_resource`` extra_data to resources of
+    the ``from_resources`` queryset using the precomputed
+    ``from_resources_index`` path index.
+    Create the relations, store the paths that could not be mapped under the
+    ``dwarf_paths_not_mapped`` extra_data key and log using ``logger`` if any.
+    """
+    dwarf_source_paths = to_resource.extra_data.get("dwarf_source_paths") or {}
+    dwarf_paths_and_map_type = [
+        (dwarf_source_paths.get("compiled_paths") or [], "dwarf_compiled_paths"),
+        (dwarf_source_paths.get("included_paths") or [], "dwarf_included_paths"),
+    ]
+
+    not_mapped = to_resource.extra_data["dwarf_paths_not_mapped"] = []
+    relations = {}
+
+    for dwarf_paths, map_type in dwarf_paths_and_map_type:
+        for dwarf_path in dwarf_paths:
+            match = pathmap.find_paths(dwarf_path, from_resources_index)
+            if not match:
+                not_mapped.append(dwarf_path)
+                continue
+
+            # A one-segment match found more than once is treated as not mapped.
+            matched_path_length = match.matched_path_length
+            if matched_path_length == 1 and len(match.resource_ids) != 1:
+                not_mapped.append(dwarf_path)
+                continue
+
+            # Fetch all candidates in one query; a path without any is not mapped.
+            candidates = list(from_resources.filter(id__in=match.resource_ids))
+            if not candidates:
+                not_mapped.append(dwarf_path)
+                continue
+
+            # With several candidates, the from/ path with the fewest segments
+            # wins, and the path itself breaks ties to get a stable result.
+            winning_from_resource = min(
+                candidates,
+                key=lambda res: (len(res.path.strip("/").split("/")), res.path),
+            )
+
+            # Do not count the leading segment as it is not "matchable".
+            dwarf_path_length = len(dwarf_path.strip("/").split("/")) - 1
+
+            # Only the first relation built for a given key is kept.
+            rel_key = (winning_from_resource.path, to_resource.path, map_type)
+            if rel_key not in relations:
+                relations[rel_key] = CodebaseRelation(
+                    project=winning_from_resource.project,
+                    from_resource=winning_from_resource,
+                    to_resource=to_resource,
+                    map_type=map_type,
+                    extra_data={
+                        "path_score": f"{matched_path_length}/{dwarf_path_length}",
+                        "dwarf_path": dwarf_path,
+                    },
+                )
+
+    if relations:
+        rels = CodebaseRelation.objects.bulk_create(relations.values())
+        if logger:
+            logger(f"Created {len(rels)} mapping using DWARF for: {to_resource.path!r}")
+    elif logger:
+        logger(f"No mapping using DWARF for: {to_resource.path!r}")
+
+    if not_mapped:
+        # Save the "dwarf_paths_not_mapped" stored in the extra_data.
+        to_resource.save()
+        if logger:
+            logger(
+                f"WARNING: DWARF paths NOT mapped for: {to_resource.path!r}: "
+                + ", ".join(map(repr, not_mapped))
+            )
+
+
+def map_dwarf_path(project, logger=None):
+    """
+    Map the to/ resources that have ``dwarf_source_paths`` in their extra_data
+    to from/ resources using similarities of path suffixes.
+    """
+    candidate_files = project.codebaseresources.files().no_status()
+    from_resources = candidate_files.from_codebase()
+    to_resources = (
+        candidate_files.to_codebase()
+        .has_no_relation()
+        .filter(extra_data__has_key="dwarf_source_paths")
+    )
+    resource_count = to_resources.count()
+
+    if logger:
+        logger(
+            f"Mapping {resource_count:,d} to/ resources using DWARF paths "
+            f"with {from_resources.count():,d} from/ resources."
+        )
+
+    # The from/ paths index is built once and shared by all the to/ resources.
+    from_resources_index = pathmap.build_index(
+        from_resources.values_list("id", "path"), with_subpaths=True
+    )
+
+    if logger:
+        logger("Done building from/ resources index.")
+
+    to_resources_iterator = to_resources.iterator(chunk_size=2000)
+    last_percent = 0
+    start_time = timer()
+    for position, to_resource in enumerate(to_resources_iterator):
+        last_percent = pipes.log_progress(
+            logger,
+            position,
+            resource_count,
+            last_percent,
+            increment_percent=10,
+            start_time=start_time,
+        )
+        if logger:
+            logger(f"Mapping to/ resource: {to_resource.path!r} using DWARF paths.")
+
+        _map_dwarf_path_resource(
+            to_resource, from_resources, from_resources_index, logger=logger
+        )
diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py
@@ -65,14 +65,18 @@ def get_tool_name_from_scan_headers(scan_data):
         return tool_name
 
 
-def load_inventory_from_toolkit_scan(project, input_location):
+def load_inventory_from_toolkit_scan(project, input_location, resource_defaults=None):
     """
     Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan
     results located at ``input_location``.
+    The optional ``resource_defaults`` are passed as ``defaults`` when creating
+    the codebase resources.
     """
     scanned_codebase = scancode.get_virtual_codebase(project, input_location)
     scancode.create_discovered_packages(project, scanned_codebase)
-    scancode.create_codebase_resources(project, scanned_codebase)
+    scancode.create_codebase_resources(
+        project, scanned_codebase, defaults=resource_defaults
+    )
     scancode.create_discovered_dependencies(
         project, scanned_codebase, strip_datafile_path_root=True
     )

diff --git a/scanpipe/pipes/pathmap.py b/scanpipe/pipes/pathmap.py
@@ -92,7 +92,7 @@ def find_paths(path, index):
     return Match(matched_length, resource_ids)
 
 
-def build_index(resource_id_and_paths, with_subpaths=True):
+def build_index(resource_id_and_paths, with_subpaths=True, logger=None):
     """
     Return an index (an index) built from a ``resource_id_and_paths``
     iterable of tuples of (resource_id int, resource_path string).
@@ -111,14 +111,21 @@ def build_index(resource_id_and_paths, with_subpaths=True):
     # create a new empty automaton.
     index = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_STRING)
 
+    # Count explicitly: an ``enumerate`` index is one short of the total and is
+    # not even defined when ``resource_id_and_paths`` is empty.
+    indexed_count = 0
     for resource_id, resource_path in resource_id_and_paths:
         segments = get_reversed_path_segments(resource_path)
         segments_count = len(segments)
         if with_subpaths:
             add_subpaths(resource_id, segments, segments_count, index)
         else:
             add_path(resource_id, segments, segments_count, index)
+        indexed_count += 1
 
+    if logger:
+        logger(f"Indexed {indexed_count} total resources")
+
     index.make_automaton()
     return index