Skip to content

Commit

Permalink
Add rust binary support for packages and symbols
Browse files Browse the repository at this point in the history
* Add support to get packages from rust binaries
* Add support for rust source to binary mapping using symbols

Reference: #1435
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Dec 27, 2024
1 parent e1607c7 commit 828c6a4
Show file tree
Hide file tree
Showing 11 changed files with 5,203 additions and 2 deletions.
6 changes: 6 additions & 0 deletions scanpipe/pipelines/deploy_to_develop.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def steps(cls):
cls.map_javascript,
cls.map_elf,
cls.map_go,
cls.map_rust,
cls.match_directories_to_purldb,
cls.match_resources_to_purldb,
cls.map_javascript_post_purldb_match,
Expand Down Expand Up @@ -201,6 +202,11 @@ def map_go(self):
"""Map Go binaries to their sources."""
d2d.map_go_paths(project=self.project, logger=self.log)

@optional_step("Rust")
def map_rust(self):
    """Map the Rust binaries of the project to their sources, using symbols."""
    project, log = self.project, self.log
    d2d.map_rust_paths(project=project, logger=log)

def match_directories_to_purldb(self):
"""Match selected directories in PurlDB."""
if not purldb.is_available():
Expand Down
54 changes: 54 additions & 0 deletions scanpipe/pipes/d2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from extractcode import EXTRACT_SUFFIX
from go_inspector.plugin import collect_and_parse_symbols
from packagedcode.npm import NpmPackageJsonHandler
from rust_inspector.binary import collect_and_parse_rust_symbols
from summarycode.classify import LEGAL_STARTS_ENDS

from aboutcode.pipeline import LoopProgress
Expand All @@ -57,6 +58,8 @@
from scanpipe.pipes import purldb
from scanpipe.pipes import resolve
from scanpipe.pipes import scancode
from scanpipe.pipes import symbolmap
from scanpipe.pipes import symbols

FROM = "from/"
TO = "to/"
Expand Down Expand Up @@ -1886,3 +1889,54 @@ def map_go_paths(project, logger=None):
map_types=["go_file_paths"],
logger=logger,
)


def map_rust_paths(project, logger=None):
    """
    Map Rust binaries to their source in ``project``.

    Source symbols are collected from the ``.rs`` files of the from/ codebase,
    and binary symbols from the to/ executable binaries that have no relation
    yet. Each binary whose extra_data holds ``rust_symbols`` is then mapped to
    the Rust sources with the ``rust_symbols`` map type.

    ``logger`` is an optional callable receiving message strings; nothing is
    logged when it is ``None``.
    """
    from_resources = project.codebaseresources.files().from_codebase()
    to_resources = (
        project.codebaseresources.files()
        .to_codebase()
        .has_no_relation()
        .executable_binaries()
    )

    # Collect source symbols from rust source files
    rust_from_resources = from_resources.filter(extension=".rs")
    symbols.collect_and_store_tree_sitter_symbols_and_strings(
        project=project,
        logger=logger,
        project_files=rust_from_resources,
    )

    # Collect binary symbols from rust binaries
    for resource in to_resources:
        try:
            binary_symbols = collect_and_parse_rust_symbols(resource.location_path)
            resource.update_extra_data(binary_symbols)
        except Exception as e:
            # Best effort: a binary that cannot be parsed is skipped.
            # ``logger`` is optional, so it must be guarded here as everywhere
            # else, otherwise a parse failure turns into a TypeError.
            if logger:
                logger(f"Can not parse {resource.location_path!r} {e!r}")

    # Counted once and reused for both the log message and the progress loop.
    to_resources_count = to_resources.count()

    if logger:
        logger(
            f"Mapping {to_resources_count:,d} to/ resources using symbols "
            f"with {rust_from_resources.count():,d} from/ resources."
        )

    resource_iterator = to_resources.iterator(chunk_size=2000)
    progress = LoopProgress(to_resources_count, logger)
    for to_resource in progress.iter(resource_iterator):
        binary_symbols = to_resource.extra_data.get("rust_symbols")
        if not binary_symbols:
            # Nothing was collected for this binary: no symbols to map with.
            continue

        if logger:
            logger(f"Mapping source files to binary at {to_resource.path}")

        symbolmap.map_resources_with_symbols(
            to_resource=to_resource,
            from_resources=rust_from_resources,
            binary_symbols=binary_symbols,
            map_type="rust_symbols",
            logger=logger,
        )
1 change: 1 addition & 0 deletions scanpipe/pipes/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@

ABOUT_MAPPED = "about-mapped"
MAPPED = "mapped"
MAPPED_BY_SYMBOL = "mapped-by-symbol"
ARCHIVE_PROCESSED = "archive-processed"
MATCHED_TO_PURLDB_PACKAGE = "matched-to-purldb-package"
MATCHED_TO_PURLDB_RESOURCE = "matched-to-purldb-resource"
Expand Down
156 changes: 156 additions & 0 deletions scanpipe/pipes/symbolmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from collections import Counter

from aboutcode.pipeline import LoopProgress
from scanpipe.models import CodebaseRelation
from scanpipe.pipes import flag

"""
Path matching using source and binary symbols.
The approach is to build a set of the symbols obtained from each Rust binary
and match it against the symbols obtained from each source file.
"""


def map_resources_with_symbols(
    to_resource, from_resources, binary_symbols, map_type, logger=None
):
    """
    Map the resources of the ``from_resources`` CodebaseResource queryset to the
    ``to_resource`` binary by matching their source symbols against the
    provided ``binary_symbols``.

    A CodebaseRelation of type ``map_type`` is created for each matched source.
    The paths of the sources that are not matched are stored under the
    ``{map_type}_not_mapped`` key of the ``to_resource`` extra_data, and the
    ``to_resource`` is flagged as REQUIRES_REVIEW when any of these paths is
    outside of a ``/tests/`` directory.

    Do nothing when ``binary_symbols`` is empty. ``logger`` is an optional
    callable receiving message strings.
    """
    if not binary_symbols:
        return

    # Accumulate unique relation objects for bulk creation
    relations_to_create = {}

    # These are of type string
    paths_not_mapped = to_resource.extra_data[f"{map_type}_not_mapped"] = []
    for item in match_source_paths_to_binary(
        to_resource=to_resource,
        from_resources=from_resources,
        binary_symbols=binary_symbols,
        map_type=map_type,
        logger=logger,
    ):
        if isinstance(item, str):
            paths_not_mapped.append(item)
        else:
            rel_key, relation = item
            # Keep only the first relation seen for a given key.
            relations_to_create.setdefault(rel_key, relation)

    # If there are any non-test files in the rust source files which
    # are not mapped, we mark the binary as REQUIRES_REVIEW
    # NOTE: the ``to_resource`` is only saved in this branch.
    if any("/tests/" not in path for path in paths_not_mapped):
        to_resource.status = flag.REQUIRES_REVIEW
        to_resource.save()
        if logger:
            logger(
                f"WARNING: #{len(paths_not_mapped)} {map_type} paths NOT mapped for: "
                f"{to_resource.path!r}"
            )

    if relations_to_create:
        rels = CodebaseRelation.objects.bulk_create(relations_to_create.values())
        from_resources.has_relation().update(status=flag.MAPPED_BY_SYMBOL)
        if logger:
            logger(
                f"Created {len(rels)} mappings using "
                f"{map_type} for: {to_resource.path!r}"
            )

    elif logger:
        logger(f"No mappings using {map_type} for: " f"{to_resource.path!r}")


def match_source_symbols_to_binary(source_symbols, binary_symbols):
    """
    Return an ``(is_matched, stats)`` tuple telling whether enough of the
    ``source_symbols`` are also present in ``binary_symbols``.

    ``stats`` is a mapping with two ratios:

    - ``common_symbols_ratio``: occurrences of the common symbols in
      ``source_symbols`` divided by the number of source symbols, duplicates
      included.
    - ``common_symbols_unique_ratio``: number of distinct common symbols
      divided by the number of distinct source symbols.

    The source is matched when either ratio is above 0.5, or above 0.4 when
    there are more than 20 source symbols. Empty ``source_symbols`` never
    match and yield ratios of 0.0.
    """
    if not source_symbols:
        # Without source symbols there is nothing to compare, and the ratios
        # below would divide by zero.
        return False, {
            "common_symbols_unique_ratio": 0.0,
            "common_symbols_ratio": 0.0,
        }

    binary_symbols_set = set(binary_symbols)
    source_symbols_set = set(source_symbols)
    source_symbols_count = len(source_symbols)
    source_symbols_unique_count = len(source_symbols_set)

    source_symbols_counter = Counter(source_symbols)

    common_symbols = source_symbols_set & binary_symbols_set
    # Weight each common symbol by how many times it occurs in the source.
    common_symbols_count = sum(
        source_symbols_counter[symbol] for symbol in common_symbols
    )
    common_symbols_ratio = common_symbols_count / source_symbols_count
    common_symbols_unique_ratio = len(common_symbols) / source_symbols_unique_count
    stats = {
        "common_symbols_unique_ratio": common_symbols_unique_ratio,
        "common_symbols_ratio": common_symbols_ratio,
    }

    best_ratio = max(common_symbols_ratio, common_symbols_unique_ratio)
    # A lower threshold applies to sources with more than 20 symbols.
    threshold = 0.4 if source_symbols_count > 20 else 0.5
    return best_ratio > threshold, stats


def match_source_paths_to_binary(
    to_resource,
    from_resources,
    binary_symbols,
    map_type,
    logger=None,
):
    """
    Iterate over the ``from_resources`` queryset and yield one item per
    resource:

    - the resource path, as a string, when the resource has no
      ``source_symbols`` in its extra_data or when these symbols are not
      matched to the ``binary_symbols``;
    - a ``(rel_key, relation)`` tuple otherwise, where ``relation`` is an
      unsaved ``map_type`` CodebaseRelation to ``to_resource`` carrying the
      match stats as extra_data.
    """
    sources = from_resources.iterator(chunk_size=2000)
    loop = LoopProgress(from_resources.count(), logger)

    for source in loop.iter(sources):
        symbols_of_source = source.extra_data.get("source_symbols")

        matched, stats = False, None
        if symbols_of_source:
            matched, stats = match_source_symbols_to_binary(
                source_symbols=symbols_of_source,
                binary_symbols=binary_symbols,
            )

        if not matched:
            # Reported by path so that the caller can tell it from a relation.
            yield source.path
            continue

        yield (
            (source.path, to_resource.path, map_type),
            CodebaseRelation(
                project=source.project,
                from_resource=source,
                to_resource=to_resource,
                map_type=map_type,
                extra_data=stats,
            ),
        )
14 changes: 12 additions & 2 deletions scanpipe/pipes/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,19 @@ def _collect_and_store_pygments_symbols_and_strings(resource):
)


def collect_and_store_tree_sitter_symbols_and_strings(project, logger=None):
def collect_and_store_tree_sitter_symbols_and_strings(
project, logger=None, project_files=None
):
"""
Collect symbols from codebase files using tree-sitter and store
them in the extra data field.
Collect from `project_files` instead of all codebase files if specified.
"""
from source_inspector import symbols_tree_sitter

project_files = project.codebaseresources.files()
if not project_files:
project_files = project.codebaseresources.files()

language_qs = Q()

Expand All @@ -131,6 +136,11 @@ def collect_and_store_tree_sitter_symbols_and_strings(project, logger=None):
).filter(language_qs)

resources_count = resources.count()
if logger:
logger(
f"Getting source symbols and strings from {resources_count:,d}"
" from/ resources using tree-sitter."
)

resource_iterator = resources.iterator(chunk_size=2000)
progress = LoopProgress(resources_count, logger)
Expand Down
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 828c6a4

Please sign in to comment.