Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rust binary support #1435 #1488

Merged
merged 6 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions scanpipe/pipelines/deploy_to_develop.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import d2d
from scanpipe.pipes import flag
from scanpipe.pipes import input
from scanpipe.pipes import matchcode
from scanpipe.pipes import purldb
from scanpipe.pipes import scancode
Expand Down Expand Up @@ -72,6 +73,7 @@ def steps(cls):
cls.map_javascript,
cls.map_elf,
cls.map_go,
cls.map_rust,
cls.match_directories_to_purldb,
cls.match_resources_to_purldb,
cls.map_javascript_post_purldb_match,
Expand Down Expand Up @@ -129,7 +131,10 @@ def extract_inputs_to_codebase_directory(self):

for input_files, codebase_path in inputs_with_codebase_path_destination:
for input_file_path in input_files:
self.extract_archive(input_file_path, codebase_path)
if input.is_archive(input_file_path):
self.extract_archive(input_file_path, codebase_path)
else:
input.copy_input(input_file_path, codebase_path)

# Reload the project env post-extraction as the scancode-config.yml file
# may be located in one of the extracted archives.
Expand Down Expand Up @@ -198,9 +203,14 @@ def map_elf(self):

@optional_step("Go")
def map_go(self):
"""Map Go binaries to their sources."""
"""Map Go binaries to their sources using paths."""
d2d.map_go_paths(project=self.project, logger=self.log)

@optional_step("Rust")
def map_rust(self):
    """
    Map Rust binaries to their sources using symbols.

    Optional step of the "Rust" group; all the work is delegated to
    ``d2d.map_rust_paths`` with this pipeline's project and logger.
    """
    project, log = self.project, self.log
    d2d.map_rust_paths(project=project, logger=log)

def match_directories_to_purldb(self):
"""Match selected directories in PurlDB."""
if not purldb.is_available():
Expand Down
74 changes: 70 additions & 4 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from extractcode import EXTRACT_SUFFIX
from go_inspector.plugin import collect_and_parse_symbols
from packagedcode.npm import NpmPackageJsonHandler
from rust_inspector.binary import collect_and_parse_rust_symbols
from summarycode.classify import LEGAL_STARTS_ENDS

from aboutcode.pipeline import LoopProgress
Expand All @@ -57,6 +58,8 @@
from scanpipe.pipes import purldb
from scanpipe.pipes import resolve
from scanpipe.pipes import scancode
from scanpipe.pipes import symbolmap
from scanpipe.pipes import symbols

FROM = "from/"
TO = "to/"
Expand Down Expand Up @@ -1794,8 +1797,14 @@ def map_elfs(project, logger=None):
try:
paths = get_elf_file_dwarf_paths(resource.location_path)
resource.update_extra_data(paths)
except Exception as e:
logger(f"Can not parse {resource.location_path!r} {e!r}")
except Exception as exception:
project.add_warning(
exception=exception,
object_instance=resource,
description=f"Cannot parse binary at {resource.path}",
model="map_elfs",
details={"path": resource.path},
)

if logger:
logger(
Expand Down Expand Up @@ -1860,8 +1869,14 @@ def map_go_paths(project, logger=None):
try:
paths = get_go_file_paths(resource.location_path)
resource.update_extra_data(paths)
except Exception as e:
logger(f"Can not parse {resource.location_path!r} {e!r}")
except Exception as exception:
project.add_warning(
exception=exception,
object_instance=resource,
description=f"Cannot parse binary at {resource.path}",
model="map_go_paths",
details={"path": resource.path},
)

if logger:
logger(
Expand All @@ -1886,3 +1901,54 @@ def map_go_paths(project, logger=None):
map_types=["go_file_paths"],
logger=logger,
)


def map_rust_paths(project, logger=None):
    """
    Map Rust binaries to their source in ``project`` using symbols.

    The from/ side is limited to files with the ``.rs`` extension. The to/ side
    is the queryset selected with
    ``.to_codebase().has_no_relation().executable_binaries()``.

    Steps:

    1. Collect and store the tree-sitter symbols of the ``.rs`` from/ files.
    2. Collect the Rust symbols of each to/ binary and store them in its
       ``extra_data``. A binary that cannot be parsed is recorded as a project
       warning and does not stop the processing of the other binaries.
    3. Map each to/ binary that has ``rust_symbols`` in its ``extra_data`` to
       the from/ files through ``symbolmap.map_resources_with_symbols``.

    ``logger`` is an optional callable receiving progress messages.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files()
        .to_codebase()
        .has_no_relation()
        .executable_binaries()
    )

    # Collect source symbols from rust source files
    rust_from_resources = from_resources.filter(extension=".rs")
    symbols.collect_and_store_tree_sitter_symbols_and_strings(
        project=project,
        logger=logger,
        project_files=rust_from_resources,
    )

    # Collect binary symbols from rust binaries
    for resource in to_resources:
        try:
            binary_symbols = collect_and_parse_rust_symbols(resource.location_path)
            resource.update_extra_data(binary_symbols)
        except Exception as exception:
            # ``logger`` is optional, so failures are reported as project
            # warnings, the same way as in ``map_elfs`` and ``map_go_paths``.
            project.add_warning(
                exception=exception,
                object_instance=resource,
                description=f"Cannot parse binary at {resource.path}",
                model="map_rust_paths",
                details={"path": resource.path},
            )

    # Count once: the value is used for both the log message and the progress.
    to_resources_count = to_resources.count()
    if logger:
        logger(
            f"Mapping {to_resources_count:,d} to/ resources using symbols "
            f"with {rust_from_resources.count():,d} from/ resources."
        )

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources_count, logger)
    for to_resource in progress.iter(resource_iterator):
        binary_symbols = to_resource.extra_data.get("rust_symbols")
        if not binary_symbols:
            # Not a Rust binary, or no symbols could be collected from it.
            continue

        if logger:
            logger(f"Mapping source files to binary at {to_resource.path}")

        symbolmap.map_resources_with_symbols(
            to_resource=to_resource,
            from_resources=rust_from_resources,
            binary_symbols=binary_symbols,
            map_type="rust_symbols",
            logger=logger,
        )
1 change: 1 addition & 0 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@

ABOUT_MAPPED = "about-mapped"
MAPPED = "mapped"
MAPPED_BY_SYMBOL = "mapped-by-symbol"
ARCHIVE_PROCESSED = "archive-processed"
MATCHED_TO_PURLDB_PACKAGE = "matched-to-purldb-package"
MATCHED_TO_PURLDB_RESOURCE = "matched-to-purldb-resource"
Expand Down
4 changes: 4 additions & 0 deletions scanpipe/pipes/matchcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ def fingerprint_codebase_directories(project, to_codebase_only=False):
resources = project.codebaseresources.all()
if to_codebase_only:
resources = resources.to_codebase()

if not resources.directories():
return

virtual_codebase = codebase.get_basic_virtual_codebase(resources)
virtual_codebase = compute_codebase_directory_fingerprints(virtual_codebase)
save_directory_fingerprints(
Expand Down
156 changes: 156 additions & 0 deletions scanpipe/pipes/symbolmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from collections import Counter

from aboutcode.pipeline import LoopProgress
from scanpipe.models import CodebaseRelation
from scanpipe.pipes import flag

"""
Path matching using source and binary symbols.

The approach is to collect the set of symbols found in each Rust binary and to
match it against the symbols collected from each source file.
"""


def map_resources_with_symbols(
    to_resource, from_resources, binary_symbols, map_type, logger=None
):
    """
    Map the ``to_resource`` binary to the resources of the ``from_resources``
    CodebaseResource queryset whose source symbols match ``binary_symbols``.

    - A ``CodebaseRelation`` of type ``map_type`` is created for each matched
      from/ resource.
    - The paths of the from/ resources that did not match are collected in the
      ``{map_type}_not_mapped`` entry of the ``to_resource`` extra_data.
    - The ``to_resource`` is flagged as ``REQUIRES_REVIEW`` when at least one
      of the unmatched paths is not located under a ``/tests/`` directory.

    Do nothing when ``binary_symbols`` is empty.
    ``logger`` is an optional callable receiving progress messages.
    """
    if not binary_symbols:
        return

    # Accumulate unique relation objects for bulk creation
    relations_to_create = {}

    # These are of type string
    paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
    for item in match_source_paths_to_binary(
        to_resource=to_resource,
        from_resources=from_resources,
        binary_symbols=binary_symbols,
        map_type=map_type,
        logger=logger,
    ):
        # The generator yields either a not-mapped path as a string or a
        # ``(rel_key, relation)`` tuple for a matched resource.
        if isinstance(item, str):
            paths_not_mapped.append(item)
        else:
            rel_key, relation = item
            relations_to_create.setdefault(rel_key, relation)

    # If there are any non-test files in the rust source files which
    # are not mapped, we mark the binary as REQUIRES_REVIEW
    if any("/tests/" not in path for path in paths_not_mapped):
        to_resource.status = flag.REQUIRES_REVIEW
        # NOTE: this is the only ``save()`` call in this function, so the
        # not-mapped list set in extra_data above is stored here as well.
        to_resource.save()
        if logger:
            logger(
                f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT mapped for: "
                f"{to_resource.path!r}"
            )

    if relations_to_create:
        rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
        from_resources.has_relation().update(status=flag.MAPPED_BY_SYMBOL)
        if logger:
            logger(
                f"Created {len(rels)} mappings using "
                f"{map_type} for: {to_resource.path!r}"
            )

    elif logger:
        logger(f"No mappings using {map_type} for: {to_resource.path!r}")


def match_source_symbols_to_binary(source_symbols, binary_symbols):
    """
    Return a ``(is_matched, stats)`` tuple telling whether the
    ``source_symbols`` of a source file are found in the ``binary_symbols``
    of a binary.

    ``stats`` is a mapping with two ratios, both relative to the source file:

    - ``common_symbols_unique_ratio``: share of the unique source symbols that
      are present in the binary.
    - ``common_symbols_ratio``: same share, counting each source symbol as
      many times as it occurs in ``source_symbols``.

    The source is matched when either ratio is above 0.5, or above 0.4 when
    there are more than 20 source symbols.

    An empty (or None) ``source_symbols`` never matches and yields zero ratios
    rather than a division by zero. An empty (or None) ``binary_symbols``
    shares no symbol with the source.
    """
    if not source_symbols:
        stats = {
            "common_symbols_unique_ratio": 0.0,
            "common_symbols_ratio": 0.0,
        }
        return False, stats

    source_symbols_counter = Counter(source_symbols)
    source_symbols_count = len(source_symbols)
    source_symbols_unique_count = len(source_symbols_counter)

    common_symbols = set(source_symbols_counter).intersection(binary_symbols or ())
    common_symbols_count = sum(
        source_symbols_counter[symbol] for symbol in common_symbols
    )

    common_symbols_ratio = common_symbols_count / source_symbols_count
    common_symbols_unique_ratio = len(common_symbols) / source_symbols_unique_count
    stats = {
        "common_symbols_unique_ratio": common_symbols_unique_ratio,
        "common_symbols_ratio": common_symbols_ratio,
    }

    # Either ratio may satisfy a threshold, so only the best one matters.
    best_ratio = max(common_symbols_ratio, common_symbols_unique_ratio)
    if best_ratio > 0.5:
        return True, stats

    # Files with more than 20 symbols are accepted with a lower threshold.
    if source_symbols_count > 20 and best_ratio > 0.4:
        return True, stats

    return False, stats


def match_source_paths_to_binary(
    to_resource,
    from_resources,
    binary_symbols,
    map_type,
    logger=None,
):
    """
    Iterate over ``from_resources`` and yield one item per resource:

    - a ``(rel_key, relation)`` tuple when the ``source_symbols`` stored in the
      resource extra_data match the ``binary_symbols``. ``relation`` is an
      unsaved ``CodebaseRelation`` to ``to_resource`` of type ``map_type``
      carrying the match statistics as extra_data.
    - the resource path as a string otherwise, including when the resource has
      no ``source_symbols``.

    ``logger`` is passed to ``LoopProgress`` to report the progress.
    """
    candidates = from_resources.iterator(chunk_size=2000)
    tracker = LoopProgress(from_resources.count(), logger)

    for from_resource in tracker.iter(candidates):
        symbols_from_source = from_resource.extra_data.get("source_symbols")

        # A resource without symbols is never compared and never matches.
        matched, stats = False, None
        if symbols_from_source:
            matched, stats = match_source_symbols_to_binary(
                source_symbols=symbols_from_source,
                binary_symbols=binary_symbols,
            )

        if matched:
            key = (from_resource.path, to_resource.path, map_type)
            yield key, CodebaseRelation(
                project=from_resource.project,
                from_resource=from_resource,
                to_resource=to_resource,
                map_type=map_type,
                extra_data=stats,
            )
        else:
            yield from_resource.path
14 changes: 12 additions & 2 deletions scanpipe/pipes/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,19 @@ def _collect_and_store_pygments_symbols_and_strings(resource):
)


def collect_and_store_tree_sitter_symbols_and_strings(project, logger=None):
def collect_and_store_tree_sitter_symbols_and_strings(
project, logger=None, project_files=None
):
"""
Collect symbols from codebase files using tree-sitter and store
them in the extra data field.

Collect from `project_files` instead of all codebase files if specified.
"""
from source_inspector import symbols_tree_sitter

project_files = project.codebaseresources.files()
if not project_files:
project_files = project.codebaseresources.files()

language_qs = Q()

Expand All @@ -131,6 +136,11 @@ def collect_and_store_tree_sitter_symbols_and_strings(project, logger=None):
).filter(language_qs)

resources_count = resources.count()
if logger:
logger(
f"Getting source symbols and strings from {resources_count:,d}"
" from/ resources using tree-sitter."
)

resource_iterator = resources.iterator(chunk_size=2000)
progress = LoopProgress(resources_count, logger)
Expand Down
Binary file not shown.
Binary file not shown.
Loading
Loading