Skip to content

Map deployed ELFs to development C/C++ source code #812 #819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions scanpipe/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,8 @@ def filter(self, qs, value):
("js_path", "js path"),
("path", "path"),
("sha1", "sha1"),
("dwarf_included_paths", "dwarf_included_paths"),
("dwarf_compiled_paths", "dwarf_compiled_paths"),
)


Expand Down Expand Up @@ -492,6 +494,7 @@ class ResourceFilterSet(FilterSetUtilsMixin, django_filters.FilterSet):
relation_map_type = RelationMapTypeFilter(
label="Relation map type",
field_name="related_from__map_type",
distinct=True,
)

class Meta:
Expand Down
53 changes: 53 additions & 0 deletions scanpipe/pipelines/d2d_dwarf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines.deploy_to_develop import DeployToDevelop
from scanpipe.pipes import d2d
from scanpipe.pipes import input


class DWARF(DeployToDevelop):
    """Map deployed ELFs to development sources using their DWARF paths."""

    @classmethod
    def steps(cls):
        """Return the ordered tuple of steps run by this pipeline."""
        return (
            cls.get_inputs,
            cls.build_inventory_from_scans,
            cls.flag_ignored_resources,
            cls.map_dwarf_paths,
            cls.flag_mapped_resources_archives_and_ignored_directories,
        )

    def build_inventory_from_scans(self):
        """
        Load each ScanCode-toolkit scan input in the project inventory.

        Resources loaded from ``self.from_files`` are created with a "from" tag
        and resources loaded from ``self.to_files`` with a "to" tag.
        """
        tagged_inputs = ((self.from_files, "from"), (self.to_files, "to"))
        for scan_locations, tag in tagged_inputs:
            for scan_location in scan_locations:
                # A new defaults mapping is built for every call on purpose so
                # that the loader never shares state between inputs.
                input.load_inventory_from_toolkit_scan(
                    self.project,
                    scan_location,
                    resource_defaults={"tag": tag},
                )

    def map_dwarf_paths(self):
        """Relate to/ resources to from/ resources using DWARF source paths."""
        d2d.map_dwarf_path(project=self.project, logger=self.log)
62 changes: 52 additions & 10 deletions scanpipe/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,25 +239,55 @@ def update_or_create_dependency(

def get_or_create_relation(project, relation_data):
    """
    Get or create a CodebaseRelation then return it.

    ``relation_data`` is a mapping providing the ``from_resource`` and
    ``to_resource`` paths, looked up in the ``project`` codebase resources, and
    the ``map_type`` of the relation.

    The support for update is not useful as there is no fields on the model that
    could be updated.
    """
    resource_qs = project.codebaseresources

    from_resource = resource_qs.get(path=relation_data.get("from_resource"))
    to_resource = resource_qs.get(path=relation_data.get("to_resource"))

    # ``get_or_create_relation_from_resources`` returns the relation itself and
    # not an ``(object, created)`` tuple: its return value must not be unpacked.
    return get_or_create_relation_from_resources(
        from_resource=from_resource,
        to_resource=to_resource,
        map_type=relation_data.get("map_type"),
    )


def get_or_create_relation_from_resources(
    from_resource, to_resource, map_type, extra_data=None
):
    """
    Get or create a CodebaseRelation of type ``map_type`` between the
    ``from_resource`` and the ``to_resource`` and return it.

    The relation is attached to the project of the ``from_resource``.
    ``extra_data``, if any, replaces the ``extra_data`` of the relation, whether
    the relation was just created or already existed.
    """
    codebase_relation, _ = CodebaseRelation.objects.get_or_create(
        project=from_resource.project,
        from_resource=from_resource,
        to_resource=to_resource,
        map_type=map_type,
    )

    if extra_data:
        # ``extra_data`` is not part of the lookup: it is applied afterwards so
        # that it never causes a second relation to be created.
        codebase_relation.extra_data = extra_data
        codebase_relation.save()

    return codebase_relation


def make_relation(from_resource, to_resource, map_type, **extra_fields):
"""
Create a Code baseRelationrelation of type ``map_type`` between the
``from_resource`` and the ``to_resource`` and return it.
"""
return CodebaseRelation.objects.create(
project=from_resource.project,
from_resource=from_resource,
Expand Down Expand Up @@ -387,32 +417,44 @@ def iter(self, iterator):
yield item


def get_text_str_diff_ratio(str_a, str_b, as_lines=True):
    """
    Return a similarity ratio as a float between 0 and 1 by comparing the
    text content of the ``str_a`` and ``str_b``.

    Split the text in lines and compare lines if ``as_lines`` is True.
    Otherwise, process the input as-is, comparing character by character.

    The ratio is computed with ``difflib.SequenceMatcher.quick_ratio()`` which
    is an upper bound on the exact ``ratio()``.

    Return None if any of the two str is empty.
    Raise a ValueError if any of the two non-empty values is not a str.
    """
    if not (str_a and str_b):
        return None

    if not isinstance(str_a, str) or not isinstance(str_b, str):
        raise ValueError("Values must be str")

    if as_lines:
        seq_a = str_a.splitlines()
        seq_b = str_b.splitlines()
    else:
        seq_a = str_a
        seq_b = str_b

    matcher = difflib.SequenceMatcher(a=seq_a, b=seq_b)
    return matcher.quick_ratio()


def get_resource_diff_ratio(resource_a, resource_b, as_lines=True):
    """
    Return a similarity ratio as a float between 0 and 1 by comparing the
    text content of the CodebaseResource ``resource_a`` and ``resource_b``.

    Split the text in lines and compare lines if ``as_lines`` is True.
    Otherwise, compare the files text content as-is.

    Return None if any of the two resources are not readable as text.
    """
    # Reading ``file_content`` may raise an IOError: this is treated as
    # "not comparable" rather than as a failure.
    with suppress(IOError):
        return get_text_str_diff_ratio(
            str_a=resource_a.file_content,
            str_b=resource_b.file_content,
            as_lines=as_lines,
        )
    return None
127 changes: 127 additions & 0 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -1409,3 +1409,130 @@ def flag_whitespace_files(project):
# If resource contains only whitespace characters.
if not non_whitespace_bytes:
resource.update(status=flag.IGNORED_WHITESPACE_FILE)


def _map_dwarf_path_resource(
    to_resource, from_resources, from_resources_index, logger=None,
):
    """
    Create relations between the ``to_resource`` and the ``from_resources``
    that best match the DWARF paths stored in the ``to_resource`` extra_data.

    The "compiled_paths" and "included_paths" found under the
    "dwarf_source_paths" extra_data key are looked up in the precomputed
    ``from_resources_index`` path index. Paths that cannot be mapped are
    collected under the "dwarf_paths_not_mapped" extra_data key of the
    ``to_resource``, which is saved only when that list is not empty.
    """
    source_paths = to_resource.extra_data.get("dwarf_source_paths") or {}
    map_type_and_paths = (
        ("dwarf_compiled_paths", source_paths.get("compiled_paths") or []),
        ("dwarf_included_paths", source_paths.get("included_paths") or []),
    )

    # The very same list object is referenced from the extra_data such that
    # appending to ``unmapped_paths`` also updates what gets saved.
    unmapped_paths = []
    to_resource.extra_data["dwarf_paths_not_mapped"] = unmapped_paths

    relations_by_key = {}

    def count_segments(path):
        return len(path.strip("/").split("/"))

    def shortest_path_first(resource):
        return count_segments(resource.path), resource.path

    for map_type, dwarf_paths in map_type_and_paths:
        for dwarf_path in dwarf_paths:
            match = pathmap.find_paths(dwarf_path, from_resources_index)
            if not match:
                unmapped_paths.append(dwarf_path)
                continue

            matched_path_length = match.matched_path_length

            # A single-segment match that hits several resources is ambiguous:
            # it is treated as not mapped for now.
            if matched_path_length == 1 and len(match.resource_ids) != 1:
                unmapped_paths.append(dwarf_path)
                continue

            # When several from/ resources match, the one with the fewest path
            # segments wins, using the path itself as a tie breaker.
            candidates = [
                from_resources.get(id=resource_id)
                for resource_id in match.resource_ids
            ]
            winner = sorted(candidates, key=shortest_path_first)[0]

            # One segment is discounted from the DWARF path length.
            # NOTE(review): the original comment says this is the leading
            # "to" or "from" segment, which is not matchable -- confirm that
            # DWARF paths always carry such a leading segment.
            dwarf_path_length = count_segments(dwarf_path) - 1

            relation_key = (winner.path, to_resource.path, map_type)
            if relation_key in relations_by_key:
                # Only the first DWARF path seen for a given pair is recorded.
                continue

            relations_by_key[relation_key] = CodebaseRelation(
                project=winner.project,
                from_resource=winner,
                to_resource=to_resource,
                map_type=map_type,
                extra_data={
                    "path_score": f"{matched_path_length}/{dwarf_path_length}",
                    "dwarf_path": dwarf_path,
                },
            )

    if relations_by_key:
        created = CodebaseRelation.objects.bulk_create(relations_by_key.values())
        if logger:
            logger(
                f"Created {len(created)} mapping using DWARF for: "
                f"{to_resource.path!r}"
            )
    elif logger:
        logger(f"No mapping using DWARF for: {to_resource.path!r}")

    if unmapped_paths:
        # Persist the list of DWARF paths that could not be mapped.
        to_resource.save()
        if logger:
            not_mapped = ", ".join(repr(path) for path in unmapped_paths)
            logger(
                f"WARNING: DWARF paths NOT mapped for: {to_resource.path!r}: "
                + not_mapped
            )


def map_dwarf_path(project, logger=None):
    """
    Map DWARF paths using similarities of path suffixes.

    Only the to/ file resources of the ``project`` that have no status, no
    relation yet and a "dwarf_source_paths" extra_data key are processed. They
    are matched against an index of the from/ file resources paths.
    Progress and results are reported through the optional ``logger`` callable.
    """
    candidate_files = project.codebaseresources.files().no_status()
    from_resources = candidate_files.from_codebase()
    to_resources = (
        candidate_files.to_codebase()
        .has_no_relation()
        .filter(extra_data__has_key="dwarf_source_paths")
    )
    resource_count = to_resources.count()

    if logger:
        logger(
            f"Mapping {resource_count:,d} to/ resources using DWARF paths "
            f"with {from_resources.count():,d} from/ resources."
        )

    id_and_paths = from_resources.values_list("id", "path")
    from_resources_index = pathmap.build_index(id_and_paths, with_subpaths=True)

    if logger:
        logger("Done building from/ resources index.")

    last_percent = 0
    start_time = timer()
    to_resources_iterator = to_resources.iterator(chunk_size=2000)

    for position, to_resource in enumerate(to_resources_iterator):
        last_percent = pipes.log_progress(
            logger,
            position,
            resource_count,
            last_percent,
            increment_percent=10,
            start_time=start_time,
        )

        if logger:
            logger(f"Mapping to/ resource: {to_resource.path!r} using DWARF paths.")

        _map_dwarf_path_resource(
            to_resource,
            from_resources,
            from_resources_index,
            logger=logger,
        )
6 changes: 4 additions & 2 deletions scanpipe/pipes/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ def get_tool_name_from_scan_headers(scan_data):
return tool_name


def load_inventory_from_toolkit_scan(project, input_location):
def load_inventory_from_toolkit_scan(project, input_location, resource_defaults=None):
"""
Create packages, dependencies, and resources loaded from the ScanCode-toolkit scan
results located at ``input_location``.
"""
scanned_codebase = scancode.get_virtual_codebase(project, input_location)
scancode.create_discovered_packages(project, scanned_codebase)
scancode.create_codebase_resources(project, scanned_codebase)
scancode.create_codebase_resources(
project, scanned_codebase, defaults=resource_defaults
)
scancode.create_discovered_dependencies(
project, scanned_codebase, strip_datafile_path_root=True
)
Expand Down
7 changes: 5 additions & 2 deletions scanpipe/pipes/pathmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def find_paths(path, index):
return Match(matched_length, resource_ids)


def build_index(resource_id_and_paths, with_subpaths=True):
def build_index(resource_id_and_paths, with_subpaths=True, logger=None):
"""
Return an index (an index) built from a ``resource_id_and_paths``
iterable of tuples of (resource_id int, resource_path string).
Expand All @@ -111,14 +111,17 @@ def build_index(resource_id_and_paths, with_subpaths=True):
# create a new empty automaton.
index = ahocorasick.Automaton(ahocorasick.STORE_ANY, ahocorasick.KEY_STRING)

for resource_id, resource_path in resource_id_and_paths:
for i, (resource_id, resource_path) in enumerate(resource_id_and_paths):
segments = get_reversed_path_segments(resource_path)
segments_count = len(segments)
if with_subpaths:
add_subpaths(resource_id, segments, segments_count, index)
else:
add_path(resource_id, segments, segments_count, index)

if logger:
logger(f"Indexed {i} total resources")

index.make_automaton()
return index

Expand Down
Loading