From 489af1b5b62c83d46fb80e9ce0dfab1416a40aba Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:12:54 +0530 Subject: [PATCH 01/13] Simplify GemfileHandler path patterns Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/rubygems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py index 9cbbf6d7553..e80295c48af 100644 --- a/src/packagedcode/rubygems.py +++ b/src/packagedcode/rubygems.py @@ -211,7 +211,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): # TODO: https://stackoverflow.com/questions/41454333/meaning-of-new-block-git-sourcegithub-in-gemfile class GemfileHandler(GemspecHandler): datasource_id = 'gemfile' - path_patterns = ('*/Gemfile', '*/*.gemfile', '*/Gemfile-*') + path_patterns = ('*/Gemfile', '*.gemfile', '*/Gemfile-*') default_package_type = 'gem' default_primary_language = 'Ruby' description = 'RubyGems Bundler Gemfile' From 8d6fa73bdba4c7e8658f38b2db6ce5cb07f4ed4a Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:15:51 +0530 Subject: [PATCH 02/13] Add multiregex as a dependency Reference: https://github.com/Quantco/multiregex Signed-off-by: Ayan Sinha Mahapatra --- requirements.txt | 1 + setup-mini.cfg | 1 + setup.cfg | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d7b458c84d..f9e6b6a0a28 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,6 +40,7 @@ license-expression==30.4.4 lxml==5.4.0 MarkupSafe==3.0.2 more-itertools==10.7.0 +multiregex==2.0.3 normality==2.6.1 packageurl-python==0.17.1 packaging==25.0 diff --git a/setup-mini.cfg b/setup-mini.cfg index 8f3a043d8af..7251d59715a 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -89,6 +89,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 diff 
--git a/setup.cfg b/setup.cfg index 770b70542b3..c02d1ce9f13 100644 --- a/setup.cfg +++ b/setup.cfg @@ -74,6 +74,7 @@ install_requires = colorama >= 0.3.9 commoncode >= 32.4.0 container-inspector >= 31.0.0 + cyseq >= 0.0.2 debian-inspector >= 31.1.0 dparse2 >= 0.7.0 fasteners @@ -90,6 +91,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 @@ -116,7 +118,6 @@ install_requires = typecode >= 30.0.1 typecode[full] >= 30.0.1 extractcode[full] >= 31.0.0 - cyseq >= 0.0.2 [options.packages.find] From 4fc3af2f834a7db0d126a5fa5b44ad251d752511 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:16:44 +0530 Subject: [PATCH 03/13] Add initial multiregex implementation Use multiregex to use a cached regex path patterns and datafile handlers mapping to detect package datafiles faster. Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4064 Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4061 Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/__init__.py | 18 ++- src/packagedcode/cache.py | 200 ++++++++++++++++++++++++++++++++++ src/packagedcode/recognize.py | 60 +++++++--- src/scancode_config.py | 6 + 4 files changed, 269 insertions(+), 15 deletions(-) create mode 100644 src/packagedcode/cache.py diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index 9cc46d0e09b..d65e535bee6 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -246,15 +246,29 @@ win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler, ] + +# These handlers are special as they use filetype to +# detect these binaries instead of datafile path patterns +# as these are optionally installed, we can skip checking +# for filetype if these are not available +BINARY_HANDLERS_PRESENT = False +BINARY_PACKAGE_DATAFILE_HANDLERS = [] + try: from go_inspector.binary import 
get_go_binary_handler - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler()) + handler = get_go_binary_handler() + APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_HANDLERS_PRESENT = True except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler()) + handler = get_rust_binary_handler() + APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_HANDLERS_PRESENT = True except ImportError: pass diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py new file mode 100644 index 00000000000..6412f7c6e8c --- /dev/null +++ b/src/packagedcode/cache.py @@ -0,0 +1,200 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os +import json +import attr +import fnmatch + +from commoncode.fileutils import create_dir + +from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS +from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS + +from scancode_config import packagedcode_cache_dir +from scancode_config import scancode_cache_dir + +""" +An on-disk persistent cache of package manifest patterns and related package +manifest handlers mapping. Loading and dumping the cached package manifest +patterns is safe to use across multiple processes using lock files. 
+""" + +# global in-memory cache of the PkgManifestPatternsCache +_PACKAGE_CACHE = None + +PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 +PACKAGE_INDEX_DIR = 'package_patterns_index' +PACKAGE_INDEX_FILENAME = 'index_cache' +PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile' +PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums' + + +@attr.s +class PkgManifestPatternsCache: + """ + Represent cachable package manifest regex patterns, prematchers + and mappings from regex patterns to datasource IDs for all datafile + handlers. + """ + + handler_by_regex = attr.ib(default=attr.Factory(dict)) + system_multiregex_patterns = attr.ib(default=attr.Factory(list)) + application_multiregex_patterns = attr.ib(default=attr.Factory(list)) + + @staticmethod + def all_multiregex_patterns(self): + return self.application_multiregex_patterns + [ + multiregex_pattern + for multiregex_pattern in self.system_multiregex_patterns + if multiregex_pattern not in self.application_multiregex_patterns + ] + + @classmethod + def from_mapping(cls, cache_mapping): + return cls(**cache_mapping) + + @staticmethod + def load_or_build( + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=False, + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + ): + """ + Load or build and save and return a PkgManifestPatternsCache object. + + We either load a cached PkgManifestPatternsCache or build and cache the patterns. + + - If the cache exists, it is returned unless corrupted. + - If ``force`` is True, or if the cache does not exist a new index is built + and cached. 
+ """ + idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR) + create_dir(idx_cache_dir) + cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME) + has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file) + + # bypass build if cache exists + if has_cache and not force: + try: + return load_cache_file(cache_file) + except Exception as e: + # work around some rare Windows quirks + import traceback + print('Inconsistent License cache: rebuilding index.') + print(str(e)) + print(traceback.format_exc()) + + + from scancode import lockfile + lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) + + # here, we have no cache: lock, check and rebuild + try: + # acquire lock and wait until timeout to get a lock or die + with lockfile.FileLock(lock_file).locked(timeout=timeout): + + system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + ) + application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + ) + package_cache = PkgManifestPatternsCache( + handler_by_regex=system_handlers_by_regex + application_handlers_by_regex, + system_multiregex_patterns=system_multiregex_patterns, + application_multiregex_patterns=application_multiregex_patterns, + ) + package_cache.dump(cache_file) + return package_cache + + except lockfile.LockTimeout: + # TODO: handle unable to lock in a nicer way + raise + + def dump(self, cache_file): + """ + Dump this package cache on disk at ``cache_file``. 
+ """ + package_cache = {} + with open(cache_file, 'w') as f: + json.dump(package_cache, f) + + +def get_prematchers_from_glob_pattern(pattern): + return [ + prematcher.lower().lstrip("/") + for prematcher in pattern.split("*") + if prematcher + ] + + +def build_mappings_and_multiregex_patterns( + datafile_handlers, +): + """ + Return an index built from rules and licenses directories + """ + with_patterns = [] + + for handler in datafile_handlers: + if handler.path_patterns: + with_patterns.append(handler) + + handler_by_regex = {} + prematchers_by_regex = {} + + for handler in with_patterns: + for pattern in handler.path_patterns: + regex_pattern = fnmatch.translate(pattern) + regex_pattern = fr"{regex_pattern}" + + prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern) + + if regex_pattern in handler_by_regex: + handler_by_regex[regex_pattern].append(handler.datasource_id) + else: + handler_by_regex[regex_pattern]= [handler.datasource_id] + + multiregex_patterns = [] + for regex in handler_by_regex.keys(): + regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) + multiregex_patterns.append(regex_and_prematcher) + + return handler_by_regex, multiregex_patterns + + +def get_cache( + force=False, +): + """ + Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. + """ + global _PACKAGE_CACHE + + if force or not _PACKAGE_CACHE: + _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=force, + # used for testing only + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + ) + return _PACKAGE_CACHE + + +def load_cache_file(cache_file): + """ + Return a PkgManifestPatternsCache loaded from JSON ``cache_file``. 
+ """ + with open(cache_file) as f: + cache = json.load(f) + + return PkgManifestPatternsCache.from_mapping(cache) diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index e41d29c82df..bc3704fb64d 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -10,11 +10,16 @@ import os import sys +import multiregex + from commoncode import filetype -from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS -from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS -from packagedcode import ALL_DATAFILE_HANDLERS +from commoncode.fileutils import as_posixpath + +from packagedcode import HANDLER_BY_DATASOURCE_ID +from packagedcode import BINARY_HANDLERS_PRESENT +from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS from packagedcode import models +from packagedcode.cache import get_cache TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) @@ -56,25 +61,19 @@ def recognize_package_data( if not filetype.is_file(location): return [] - assert application or system or package_only - if package_only or (application and system): - datafile_handlers = ALL_DATAFILE_HANDLERS - elif application: - datafile_handlers = APPLICATION_PACKAGE_DATAFILE_HANDLERS - elif system: - datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS - return list(_parse( location=location, package_only=package_only, - datafile_handlers=datafile_handlers, + application=application, + system=system, )) def _parse( location, + application=True, + system=False, package_only=False, - datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, ): """ Yield parsed PackageData objects from ``location``. Raises Exceptions on errors. 
@@ -83,6 +82,41 @@ def _parse( Default to use application packages """ + package_path = as_posixpath(location) + package_patterns = get_cache() + + assert application or system or package_only + if package_only or (application and system): + multiregex_patterns = package_patterns.all_multiregex_patterns + elif application: + multiregex_patterns = package_patterns.application_multiregex_patterns + elif system: + multiregex_patterns = package_patterns.system_multiregex_patterns + + package_matcher = multiregex.RegexMatcher(multiregex_patterns) + matched_patterns = package_matcher.match(package_path) + + datafile_handlers = [] + for matched_pattern in matched_patterns: + regex, _match = matched_pattern + handler_ids = package_patterns.handler_by_regex.get(regex.pattern) + if TRACE: + logger_debug(f'_parse:.handler_ids: {handler_ids}') + + datafile_handlers = [ + HANDLER_BY_DATASOURCE_ID.get(handler_id) + for handler_id in handler_ids + ] + + if not datafile_handlers: + if BINARY_HANDLERS_PRESENT: + datafile_handlers = BINARY_PACKAGE_DATAFILE_HANDLERS + else: + if TRACE: + logger_debug(f'_parse: no package datafile detected at {package_path}') + + return + for handler in datafile_handlers: if TRACE: logger_debug(f'_parse:.is_datafile: {handler}') diff --git a/src/scancode_config.py b/src/scancode_config.py index 9b6e2b7d075..520a0af9396 100644 --- a/src/scancode_config.py +++ b/src/scancode_config.py @@ -185,7 +185,13 @@ def _create_dir(location): __env_license_cache_dir = os.getenv('SCANCODE_LICENSE_INDEX_CACHE') licensedcode_cache_dir = (__env_license_cache_dir or std_license_cache_dir) + +std_package_cache_dir = join(scancode_src_dir, 'packagedcode', 'data', 'cache') +__env_package_cache_dir = os.getenv('SCANCODE_PACKAGE_INDEX_CACHE') +packagedcode_cache_dir = (__env_package_cache_dir or std_package_cache_dir) + _create_dir(licensedcode_cache_dir) +_create_dir(packagedcode_cache_dir) _create_dir(scancode_cache_dir) # - scancode_temp_dir: for short-lived temporary 
files which are import- or run- From 4438526050feb0ae6c600ca3ecf32783c9f226d5 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 19:27:01 +0530 Subject: [PATCH 04/13] Add minimal tests for package cache Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/cache.py | 48 +++++++++++++----- src/packagedcode/data/.gitignore | 1 + .../package_patterns_index/index_cache | 1 + .../data/plugin/plugins_list_linux.txt | 2 +- tests/packagedcode/test_cache.py | 49 +++++++++++++++++++ 5 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 src/packagedcode/data/.gitignore create mode 100644 tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache create mode 100644 tests/packagedcode/test_cache.py diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index 6412f7c6e8c..ff596f80cf8 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -66,6 +66,8 @@ def load_or_build( scancode_cache_dir=scancode_cache_dir, force=False, timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, ): """ Load or build and save and return a PkgManifestPatternsCache object. 
@@ -88,7 +90,7 @@ def load_or_build( except Exception as e: # work around some rare Windows quirks import traceback - print('Inconsistent License cache: rebuilding index.') + print('Inconsistent Package cache: rebuilding index.') print(str(e)) print(traceback.format_exc()) @@ -102,13 +104,13 @@ def load_or_build( with lockfile.FileLock(lock_file).locked(timeout=timeout): system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( - datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + datafile_handlers=system_package_datafile_handlers, ) application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( - datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + datafile_handlers=application_package_datafile_handlers, ) package_cache = PkgManifestPatternsCache( - handler_by_regex=system_handlers_by_regex + application_handlers_by_regex, + handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, system_multiregex_patterns=system_multiregex_patterns, application_multiregex_patterns=application_multiregex_patterns, ) @@ -123,7 +125,11 @@ def dump(self, cache_file): """ Dump this package cache on disk at ``cache_file``. """ - package_cache = {} + package_cache = { + "handler_by_regex": self.handler_by_regex, + "system_multiregex_patterns": self.system_multiregex_patterns, + "application_multiregex_patterns": self.application_multiregex_patterns, + } with open(cache_file, 'w') as f: json.dump(package_cache, f) @@ -136,19 +142,23 @@ def get_prematchers_from_glob_pattern(pattern): ] -def build_mappings_and_multiregex_patterns( - datafile_handlers, -): +def build_mappings_and_multiregex_patterns(datafile_handlers): """ - Return an index built from rules and licenses directories + Return a mapping of regex patterns to datafile handler IDs and + multiregex patterns consisting of regex patterns and prematchers. 
""" + handler_by_regex = {} + multiregex_patterns = [] + + if not datafile_handlers: + return multiregex_patterns, handler_by_regex + with_patterns = [] for handler in datafile_handlers: if handler.path_patterns: with_patterns.append(handler) - handler_by_regex = {} prematchers_by_regex = {} for handler in with_patterns: @@ -163,16 +173,17 @@ def build_mappings_and_multiregex_patterns( else: handler_by_regex[regex_pattern]= [handler.datasource_id] - multiregex_patterns = [] for regex in handler_by_regex.keys(): regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) multiregex_patterns.append(regex_and_prematcher) - return handler_by_regex, multiregex_patterns + return multiregex_patterns, handler_by_regex def get_cache( force=False, + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, ): """ Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. @@ -197,4 +208,15 @@ def load_cache_file(cache_file): with open(cache_file) as f: cache = json.load(f) - return PkgManifestPatternsCache.from_mapping(cache) + # convert multiregex patterns from list to tuples while loading + cache_transformed = {"handler_by_regex": cache.get("handler_by_regex")} + cache_transformed["system_multiregex_patterns"] = [ + tuple(multiregex_pattern) + for multiregex_pattern in cache.get("system_multiregex_patterns") + ] + cache_transformed["application_multiregex_patterns"] = [ + tuple(multiregex_pattern) + for multiregex_pattern in cache.get("application_multiregex_patterns") + ] + + return PkgManifestPatternsCache.from_mapping(cache_transformed) diff --git a/src/packagedcode/data/.gitignore b/src/packagedcode/data/.gitignore new file mode 100644 index 00000000000..0a2101fab9b --- /dev/null +++ b/src/packagedcode/data/.gitignore @@ -0,0 +1 @@ +/cache/ diff --git a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache 
b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache new file mode 100644 index 00000000000..2c820bcff1c --- /dev/null +++ b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache @@ -0,0 +1 @@ +{"handler_by_regex": {"(?s:.*\\.ABOUT)\\Z": ["about_file"]}, "system_multiregex_patterns": [], "application_multiregex_patterns": [["(?s:.*\\.ABOUT)\\Z", [".about"]]]} \ No newline at end of file diff --git a/tests/packagedcode/data/plugin/plugins_list_linux.txt b/tests/packagedcode/data/plugin/plugins_list_linux.txt index e24512dfd91..eb4763d6c7e 100755 --- a/tests/packagedcode/data/plugin/plugins_list_linux.txt +++ b/tests/packagedcode/data/plugin/plugins_list_linux.txt @@ -410,7 +410,7 @@ Package type: gem documentation URL: https://bundler.io/man/gemfile.5.html primary language: Ruby description: RubyGems Bundler Gemfile - path_patterns: '*/Gemfile', '*/*.gemfile', '*/Gemfile-*' + path_patterns: '*/Gemfile', '*.gemfile', '*/Gemfile-*' -------------------------------------------- Package type: gem datasource_id: gemfile_extracted diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py new file mode 100644 index 00000000000..84614d17e59 --- /dev/null +++ b/tests/packagedcode/test_cache.py @@ -0,0 +1,49 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os.path + +from packagedcode import cache +from packages_test_utils import PackageTester +from scancode_config import REGEN_TEST_FIXTURES +from scancode.cli_test_utils import run_scan_click +from scancode.cli_test_utils import check_json_scan + + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + + +class TestMultiregexPatterns(PackageTester): + test_data_dir = TEST_DATA_DIR + + def test_build_mappings_and_multiregex_patterns_works(self): + from packagedcode.about import AboutFileHandler + + multiregex_patterns, handler_by_regex = cache.build_mappings_and_multiregex_patterns( + datafile_handlers=[AboutFileHandler], + ) + assert multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])] + assert handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']} + + def test_build_package_cache_works(self): + from packagedcode.about import AboutFileHandler + + package_cache_dir = self.get_test_loc('cache/package_patterns_index') + package_cache = cache.PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=package_cache_dir, + application_package_datafile_handlers=[AboutFileHandler], + system_package_datafile_handlers=[], + force=True, + ) + + assert not package_cache.system_multiregex_patterns + assert len(package_cache.application_multiregex_patterns) == 1 + assert '(?s:.*\\.ABOUT)\\Z' in package_cache.handler_by_regex + + From 3bbf35e14d8ae6134684e14c1f3b33d687301f32 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 18 Nov 2025 16:37:40 +0530 Subject: [PATCH 05/13] Cache multiregex matchers instead of patterns Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/cache.py | 83 +++++++++---------- src/packagedcode/recognize.py | 22 ++--- tests/packagedcode/data/cache/.gitignore | 1 + .../package_patterns_index/index_cache | 1 - tests/packagedcode/test_cache.py | 22 +++-- tests/packagedcode/test_recognize.py | 5 ++ 6 files changed, 71 insertions(+), 63 deletions(-) create mode 100644 
tests/packagedcode/data/cache/.gitignore delete mode 100644 tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index ff596f80cf8..163efd18f56 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -8,12 +8,13 @@ # import os -import json -import attr import fnmatch +import pickle +import multiregex -from commoncode.fileutils import create_dir +import attr +from commoncode.fileutils import create_dir from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS @@ -29,6 +30,9 @@ # global in-memory cache of the PkgManifestPatternsCache _PACKAGE_CACHE = None +# This is the Pickle protocol we use, which was added in Python 3.4. +PICKLE_PROTOCOL = 4 + PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 PACKAGE_INDEX_DIR = 'package_patterns_index' PACKAGE_INDEX_FILENAME = 'index_cache' @@ -45,23 +49,21 @@ class PkgManifestPatternsCache: """ handler_by_regex = attr.ib(default=attr.Factory(dict)) - system_multiregex_patterns = attr.ib(default=attr.Factory(list)) - application_multiregex_patterns = attr.ib(default=attr.Factory(list)) + system_package_matcher = attr.ib(default=None) + application_package_matcher = attr.ib(default=None) + all_package_matcher = attr.ib(default=None) @staticmethod - def all_multiregex_patterns(self): - return self.application_multiregex_patterns + [ + def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns): + return application_multiregex_patterns + [ multiregex_pattern - for multiregex_pattern in self.system_multiregex_patterns - if multiregex_pattern not in self.application_multiregex_patterns + for multiregex_pattern in system_multiregex_patterns + if multiregex_pattern not in application_multiregex_patterns ] @classmethod - def from_mapping(cls, cache_mapping): - return cls(**cache_mapping) - - @staticmethod def load_or_build( + cls, 
packagedcode_cache_dir=packagedcode_cache_dir, scancode_cache_dir=scancode_cache_dir, force=False, @@ -94,7 +96,6 @@ def load_or_build( print(str(e)) print(traceback.format_exc()) - from scancode import lockfile lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) @@ -109,29 +110,31 @@ def load_or_build( application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( datafile_handlers=application_package_datafile_handlers, ) - package_cache = PkgManifestPatternsCache( + all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns( + application_multiregex_patterns, system_multiregex_patterns, + ) + system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns) + application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns) + all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher) + package_cache = cls( handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, - system_multiregex_patterns=system_multiregex_patterns, - application_multiregex_patterns=application_multiregex_patterns, + system_package_matcher=system_package_matcher, + application_package_matcher=application_package_matcher, + all_package_matcher=all_package_matcher, ) package_cache.dump(cache_file) return package_cache except lockfile.LockTimeout: # TODO: handle unable to lock in a nicer way - raise + raise def dump(self, cache_file): """ - Dump this package cache on disk at ``cache_file``. + Dump this license cache on disk at ``cache_file``. 
""" - package_cache = { - "handler_by_regex": self.handler_by_regex, - "system_multiregex_patterns": self.system_multiregex_patterns, - "application_multiregex_patterns": self.application_multiregex_patterns, - } - with open(cache_file, 'w') as f: - json.dump(package_cache, f) + with open(cache_file, 'wb') as fn: + pickle.dump(self, fn, protocol=PICKLE_PROTOCOL) def get_prematchers_from_glob_pattern(pattern): @@ -203,20 +206,16 @@ def get_cache( def load_cache_file(cache_file): """ - Return a PkgManifestPatternsCache loaded from JSON ``cache_file``. + Return a PkgManifestPatternsCache loaded from ``cache_file``. """ - with open(cache_file) as f: - cache = json.load(f) - - # convert multiregex patterns from list to tuples while loading - cache_transformed = {"handler_by_regex": cache.get("handler_by_regex")} - cache_transformed["system_multiregex_patterns"] = [ - tuple(multiregex_pattern) - for multiregex_pattern in cache.get("system_multiregex_patterns") - ] - cache_transformed["application_multiregex_patterns"] = [ - tuple(multiregex_pattern) - for multiregex_pattern in cache.get("application_multiregex_patterns") - ] - - return PkgManifestPatternsCache.from_mapping(cache_transformed) + with open(cache_file, 'rb') as lfc: + try: + return pickle.load(lfc) + except Exception as e: + msg = ( + 'ERROR: Failed to load package cache (the file may be corrupted ?).\n' + f'Please delete "{cache_file}" and retry.\n' + 'If the problem persists, copy this error message ' + 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' + ) + raise Exception(msg) from e diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index bc3704fb64d..26c5f0d702b 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -10,8 +10,6 @@ import os import sys -import multiregex - from commoncode import filetype from commoncode.fileutils import as_posixpath @@ -87,13 +85,12 @@ def _parse( assert application or system or package_only 
if package_only or (application and system): - multiregex_patterns = package_patterns.all_multiregex_patterns + package_matcher = package_patterns.all_package_matcher elif application: - multiregex_patterns = package_patterns.application_multiregex_patterns + package_matcher = package_patterns.application_package_matcher elif system: - multiregex_patterns = package_patterns.system_multiregex_patterns + package_matcher = package_patterns.system_package_matcher - package_matcher = multiregex.RegexMatcher(multiregex_patterns) matched_patterns = package_matcher.match(package_path) datafile_handlers = [] @@ -103,19 +100,14 @@ def _parse( if TRACE: logger_debug(f'_parse:.handler_ids: {handler_ids}') - datafile_handlers = [ + datafile_handlers.extend([ HANDLER_BY_DATASOURCE_ID.get(handler_id) for handler_id in handler_ids - ] + ]) if not datafile_handlers: - if BINARY_HANDLERS_PRESENT: - datafile_handlers = BINARY_PACKAGE_DATAFILE_HANDLERS - else: - if TRACE: - logger_debug(f'_parse: no package datafile detected at {package_path}') - - return + if TRACE: + logger_debug(f'_parse: no package datafile detected at {package_path}') for handler in datafile_handlers: if TRACE: diff --git a/tests/packagedcode/data/cache/.gitignore b/tests/packagedcode/data/cache/.gitignore new file mode 100644 index 00000000000..a738fbc8f7f --- /dev/null +++ b/tests/packagedcode/data/cache/.gitignore @@ -0,0 +1 @@ +/package_patterns_index/ \ No newline at end of file diff --git a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache deleted file mode 100644 index 2c820bcff1c..00000000000 --- a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache +++ /dev/null @@ -1 +0,0 @@ -{"handler_by_regex": {"(?s:.*\\.ABOUT)\\Z": ["about_file"]}, "system_multiregex_patterns": [], "application_multiregex_patterns": [["(?s:.*\\.ABOUT)\\Z", [".about"]]]} \ 
No newline at end of file diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py index 84614d17e59..98951d9fc8f 100644 --- a/tests/packagedcode/test_cache.py +++ b/tests/packagedcode/test_cache.py @@ -10,6 +10,8 @@ import os.path from packagedcode import cache +from commoncode.fileutils import as_posixpath + from packages_test_utils import PackageTester from scancode_config import REGEN_TEST_FIXTURES from scancode.cli_test_utils import run_scan_click @@ -33,17 +35,27 @@ def test_build_mappings_and_multiregex_patterns_works(self): def test_build_package_cache_works(self): from packagedcode.about import AboutFileHandler + from packagedcode.bower import BowerJsonHandler - package_cache_dir = self.get_test_loc('cache/package_patterns_index') + package_cache_dir = self.get_test_loc('cache/') package_cache = cache.PkgManifestPatternsCache.load_or_build( packagedcode_cache_dir=package_cache_dir, application_package_datafile_handlers=[AboutFileHandler], - system_package_datafile_handlers=[], + system_package_datafile_handlers=[BowerJsonHandler], force=True, ) + test_path = "scancode-toolkit.ABOUT" + + assert not package_cache.system_package_matcher.match(test_path) + assert package_cache.application_package_matcher.match(test_path) - assert not package_cache.system_multiregex_patterns - assert len(package_cache.application_multiregex_patterns) == 1 - assert '(?s:.*\\.ABOUT)\\Z' in package_cache.handler_by_regex + regex, _match = package_cache.all_package_matcher.match(test_path).pop() + assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id + + def check_empty_file_scan_works(self): + test_file = self.get_test_loc('cache/.gitignore') + package_path = as_posixpath(test_file) + package_matcher = cache.get_cache() + assert not package_matcher.match(package_path) diff --git a/tests/packagedcode/test_recognize.py b/tests/packagedcode/test_recognize.py index f7736aeeb61..98a50164321 100644 --- 
a/tests/packagedcode/test_recognize.py +++ b/tests/packagedcode/test_recognize.py @@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self): packages = recognize_package_data(test_file, system=True) assert packages assert isinstance(packages[0], models.PackageData) + + def test_recognize_non_package_manifest_file(self): + test_file = self.get_test_loc('cache/.gitignore') + packages = recognize_package_data(test_file) + assert not packages From e0460ef7add32caba7f4ff721add2f305bd0bb3b Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 18 Nov 2025 18:27:52 +0530 Subject: [PATCH 06/13] Restore binary package manifest scanning Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/recognize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index 26c5f0d702b..1c9e85ec570 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -106,7 +106,9 @@ def _parse( ]) if not datafile_handlers: - if TRACE: + if BINARY_HANDLERS_PRESENT: + datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) + elif TRACE: logger_debug(f'_parse: no package datafile detected at {package_path}') for handler in datafile_handlers: From dde6bc9710ad0f16e1cfde69e3e7b29eb2ea057d Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 19 Nov 2025 15:00:10 +0530 Subject: [PATCH 07/13] Only scan for binary packages optionally Introduce a new option --binary-package which looks for package/dependency data in binaries. 
Signed-off-by: Ayan Sinha Mahapatra --- docs/source/rst_snippets/basic_options.rst | 5 ++++ src/packagedcode/__init__.py | 7 +---- src/packagedcode/plugin_package.py | 32 ++++++++++++++++++---- src/packagedcode/recognize.py | 27 ++++++++++++------ src/scancode/api.py | 22 +++++++++++++-- tests/packagedcode/test_cargo.py | 2 +- tests/scancode/data/help/help.txt | 17 +++++++----- tests/scancode/data/help/help_linux.txt | 21 ++++++++------ 8 files changed, 94 insertions(+), 39 deletions(-) diff --git a/docs/source/rst_snippets/basic_options.rst b/docs/source/rst_snippets/basic_options.rst index d01fbf72a6c..83caf28f406 100644 --- a/docs/source/rst_snippets/basic_options.rst +++ b/docs/source/rst_snippets/basic_options.rst @@ -33,6 +33,11 @@ documenting a program's options. For example: --system-package Scan ```` for installed system package databases. +-b, --binary-package Scan for package and dependency related + data in binaries. Note that looking for packages + in binaries makes package scan slower. + Currently supported binaries: Go, Rust. + --package-only Scan ```` for system and application only for package metadata, without license/ copyright detection and package assembly. 
diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index d65e535bee6..8626fcf7ff6 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -251,24 +251,19 @@ # detect these binaries instead of datafile path patterns # as these are optionally installed, we can skip checking # for filetype if these are not available -BINARY_HANDLERS_PRESENT = False BINARY_PACKAGE_DATAFILE_HANDLERS = [] try: from go_inspector.binary import get_go_binary_handler handler = get_go_binary_handler() - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) - BINARY_HANDLERS_PRESENT = True except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler handler = get_rust_binary_handler() - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) - BINARY_HANDLERS_PRESENT = True except ImportError: pass @@ -276,7 +271,7 @@ APPLICATION_PACKAGE_DATAFILE_HANDLERS + [ p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS - ] + ] + BINARY_PACKAGE_DATAFILE_HANDLERS ) # registry of all handler classes keyed by datasource_id diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index 8dc993e3b7a..dc55e0053fb 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -170,6 +170,20 @@ class PackageScanner(ScanPlugin): help_group=SCAN_GROUP, sort_order=21, ), + PluggableCommandLineOption( + ( + '-b', + '--binary-package', + ), + is_flag=True, + default=False, + help=( + 'Scan for package and dependency related data in binaries. ' + 'Currently supported binaries: Go, Rust.' + ), + help_group=SCAN_GROUP, + sort_order=22, + ), PluggableCommandLineOption( ( '--package-only', @@ -182,7 +196,7 @@ class PackageScanner(ScanPlugin): 'license/copyright detection and top-level package creation.' 
), help_group=SCAN_GROUP, - sort_order=22, + sort_order=23, ), PluggableCommandLineOption( ('--list-packages',), @@ -195,10 +209,17 @@ class PackageScanner(ScanPlugin): ), ] - def is_enabled(self, package, system_package, package_only, **kwargs): - return package or system_package or package_only + def is_enabled(self, package, system_package, binary_package, package_only, **kwargs): + return package or system_package or binary_package or package_only - def get_scanner(self, package=True, system_package=False, package_only=False, **kwargs): + def get_scanner( + self, + package=True, + system_package=False, + binary_package=False, + package_only=False, + **kwargs + ): """ Return a scanner callable to scan a file for package data. """ @@ -208,6 +229,7 @@ def get_scanner(self, package=True, system_package=False, package_only=False, ** get_package_data, application=package, system=system_package, + binary=binary_package, package_only=package_only, ) @@ -464,7 +486,7 @@ def get_package_and_deps(codebase, package_adder=add_to_package, strip_root=Fals resource.scan_errors.append(msg) resource.save(codebase) - if TRACE: + if TRACE_ASSEMBLY: raise Exception(msg) from e return packages, dependencies diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index 1c9e85ec570..f60107c6904 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -14,7 +14,6 @@ from commoncode.fileutils import as_posixpath from packagedcode import HANDLER_BY_DATASOURCE_ID -from packagedcode import BINARY_HANDLERS_PRESENT from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS from packagedcode import models from packagedcode.cache import get_cache @@ -47,6 +46,7 @@ def recognize_package_data( location, application=True, system=False, + binary=False, package_only=False, ): """ @@ -61,9 +61,10 @@ def recognize_package_data( return list(_parse( location=location, - package_only=package_only, application=application, system=system, + binary=binary, + 
package_only=package_only, )) @@ -71,6 +72,7 @@ def _parse( location, application=True, system=False, + binary=False, package_only=False, ): """ @@ -83,7 +85,8 @@ def _parse( package_path = as_posixpath(location) package_patterns = get_cache() - assert application or system or package_only + has_patterns = application or system or package_only + assert has_patterns or binary if package_only or (application and system): package_matcher = package_patterns.all_package_matcher elif application: @@ -91,22 +94,30 @@ def _parse( elif system: package_matcher = package_patterns.system_package_matcher - matched_patterns = package_matcher.match(package_path) + matched_patterns = [] + if has_patterns: + matched_patterns = package_matcher.match(package_path) - datafile_handlers = [] + all_handler_ids = [] for matched_pattern in matched_patterns: regex, _match = matched_pattern handler_ids = package_patterns.handler_by_regex.get(regex.pattern) if TRACE: logger_debug(f'_parse:.handler_ids: {handler_ids}') - datafile_handlers.extend([ - HANDLER_BY_DATASOURCE_ID.get(handler_id) + all_handler_ids.extend([ + handler_id for handler_id in handler_ids + if handler_id not in all_handler_ids ]) + datafile_handlers = [ + HANDLER_BY_DATASOURCE_ID.get(handler_id) + for handler_id in all_handler_ids + ] + if not datafile_handlers: - if BINARY_HANDLERS_PRESENT: + if binary: datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) elif TRACE: logger_debug(f'_parse: no package datafile detected at {package_path}') diff --git a/src/scancode/api.py b/src/scancode/api.py index 94592e20ce1..d06af7dcf45 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -256,20 +256,28 @@ def get_licenses( SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) -def _get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def _get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): 
""" Return a mapping of package manifest information detected in the file at ``location``. Include ``application`` packages (such as pypi) and/or ``system`` packages. Note that all exceptions are caught if there are any errors while parsing a package manifest. """ - assert application or system or package_only + assert application or system or binary or package_only from packagedcode.recognize import recognize_package_data try: return recognize_package_data( location=location, application=application, system=system, + binary=binary, package_only=package_only, ) or [] @@ -300,7 +308,14 @@ def get_package_info(location, **kwargs): return dict(packages=[p.to_dict() for p in packages]) -def get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): """ Return a mapping of package manifest information detected in the file at `location`. @@ -313,6 +328,7 @@ def get_package_data(location, application=True, system=False, package_only=Fals location=location, application=application, system=system, + binary=binary, package_only=package_only, **kwargs, ) or [] diff --git a/tests/packagedcode/test_cargo.py b/tests/packagedcode/test_cargo.py index b71634aa8a1..5b22b69e193 100644 --- a/tests/packagedcode/test_cargo.py +++ b/tests/packagedcode/test_cargo.py @@ -159,7 +159,7 @@ def test_scan_works_on_rust_binary_with_inspector(self): test_file = self.get_test_loc('cargo/binary/cargo_dependencies') expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json') result_file = self.get_temp_file('results.json') - run_scan_click(['--package', test_file, '--json', result_file]) + run_scan_click(['--binary-package', test_file, '--json', result_file]) check_json_scan( expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES ) diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt 
index 8a486871b5d..2c45a354b31 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -8,13 +8,16 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. other scans: -i, --info Scan for file information (size, checksums, etc). diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 9ca1d26d68a..5d7b1dfed92 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -8,15 +8,18 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. - --go-symbol Collect Go symbols. - --rust-symbol Collect Rust symbols from rust binaries. + -l, --license Scan for licenses. 
+ -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. + --go-symbol Collect Go symbols. + --rust-symbol Collect Rust symbols from rust binaries. other scans: -i, --info Scan for file information (size, checksums, etc). From 5cd5f74bc27b6da1ff8371b8198455696d654533 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 19 Nov 2025 16:15:02 +0530 Subject: [PATCH 08/13] Do not setup license index on --package-only We do not need the license index in a --package-only scan as this is designed to do a fast package detection only scan which skips the license detection. As license index loading takes a couple seconds in each case, this makes the package only scan much faster. Signed-off-by: Ayan Sinha Mahapatra --- src/licensedcode/plugin_license.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py index 5c42f96760a..717253c4baa 100644 --- a/src/licensedcode/plugin_license.py +++ b/src/licensedcode/plugin_license.py @@ -152,6 +152,9 @@ def setup(self, **kwargs): This is a cache warmup such that child process inherit from the loaded index. 
""" + if kwargs.get("package_only"): + return + from licensedcode.cache import populate_cache populate_cache() From 6b6a79b8a1c0b9789a466df4c5623ab723890a76 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Dec 2025 19:45:46 +0530 Subject: [PATCH 09/13] Add a new console script to build the package patterns cache Signed-off-by: Ayan Sinha Mahapatra --- Dockerfile | 6 ++++-- etc/release/scancode-create-pypi-wheel.sh | 1 + etc/release/scancode-create-release-app-linux.sh | 1 + etc/release/scancode-create-release-app-macos.sh | 1 + .../scancode-create-release-app-windows.sh | 1 + setup-mini.cfg | 1 + setup.cfg | 1 + src/packagedcode/cache.py | 15 +++++++++++++++ 8 files changed, 25 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d4c641d7a2f..17e28ad0930 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,9 +38,11 @@ WORKDIR /scancode-toolkit COPY . /scancode-toolkit # Initial configuration using ./configure, scancode-reindex-licenses to build -# the base license index +# the base license index and scancode-cache-package-patterns to build the +# package patterns cache RUN ./configure \ - && ./venv/bin/scancode-reindex-licenses + && ./venv/bin/scancode-reindex-licenses \ + && ./venv/bin/scancode-cache-package-patterns # Add scancode to path ENV PATH=/scancode-toolkit:$PATH diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh index 5ab2fe8e988..4915695bae8 100755 --- a/etc/release/scancode-create-pypi-wheel.sh +++ b/etc/release/scancode-create-pypi-wheel.sh @@ -19,6 +19,7 @@ set -e ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" ) diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh index fbe5951a937..ab6a4314d6d 100755 --- a/etc/release/scancode-create-release-app-linux.sh +++ 
b/etc/release/scancode-create-release-app-linux.sh @@ -65,6 +65,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh index 5f34bf88f28..41c804137bb 100755 --- a/etc/release/scancode-create-release-app-macos.sh +++ b/etc/release/scancode-create-release-app-macos.sh @@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh index 03a22d7117a..e4dba1b9b2f 100755 --- a/etc/release/scancode-create-release-app-windows.sh +++ b/etc/release/scancode-create-release-app-windows.sh @@ -62,6 +62,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/setup-mini.cfg b/setup-mini.cfg index 7251d59715a..bfb24e2dd33 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -157,6 +157,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/setup.cfg b/setup.cfg index c02d1ce9f13..f156833e463 100644 --- 
a/setup.cfg +++ b/setup.cfg @@ -159,6 +159,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index 163efd18f56..92320379887 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -13,7 +13,9 @@ import multiregex import attr +import click +from commoncode.cliutils import PluggableCommandLineOption from commoncode.fileutils import create_dir from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS @@ -219,3 +221,16 @@ def load_cache_file(cache_file): 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' ) raise Exception(msg) from e + + +@click.command(name='scancode-cache-package-patterns') +@click.help_option('-h', '--help') +def cache_package_patterns(*args, **kwargs): + """Create scancode package manifest patterns cache and exit""" + click.echo('Rebuilding the package cache patterns...') + get_cache(force=True) + click.echo('Done.') + + +if __name__ == '__main__': + cache_package_patterns() From 7926aaf065cffaaad5361211f800b4aa11b87a18 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 29 Dec 2025 15:17:15 +0530 Subject: [PATCH 10/13] Address review feedback Signed-off-by: Ayan Sinha Mahapatra --- Dockerfile | 4 +- docs/source/rst_snippets/basic_options.rst | 9 +-- etc/release/scancode-create-pypi-wheel.sh | 2 +- .../scancode-create-release-app-linux.sh | 2 +- .../scancode-create-release-app-macos.sh | 2 +- .../scancode-create-release-app-windows.sh | 2 +- setup-mini.cfg | 2 +- setup.cfg | 2 +- 
src/licensedcode/cache.py | 1 - src/packagedcode/__init__.py | 10 +-- src/packagedcode/cache.py | 69 +++++++++++++++---- src/packagedcode/plugin_package.py | 15 ++-- src/packagedcode/recognize.py | 14 ++-- src/scancode/api.py | 10 +-- tests/packagedcode/test_cache.py | 23 ++++--- tests/packagedcode/test_cargo.py | 2 +- tests/scancode/data/help/help.txt | 5 +- tests/scancode/data/help/help_linux.txt | 5 +- 18 files changed, 113 insertions(+), 66 deletions(-) diff --git a/Dockerfile b/Dockerfile index 17e28ad0930..4a3ff20516e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,11 +38,11 @@ WORKDIR /scancode-toolkit COPY . /scancode-toolkit # Initial configuration using ./configure, scancode-reindex-licenses to build -# the base license index and scancode-cache-package-patterns to build the +# the base license index and scancode-reindex-package-patterns to build the # package patterns cache RUN ./configure \ && ./venv/bin/scancode-reindex-licenses \ - && ./venv/bin/scancode-cache-package-patterns + && ./venv/bin/scancode-reindex-package-patterns # Add scancode to path ENV PATH=/scancode-toolkit:$PATH diff --git a/docs/source/rst_snippets/basic_options.rst b/docs/source/rst_snippets/basic_options.rst index 83caf28f406..a2da40d39bf 100644 --- a/docs/source/rst_snippets/basic_options.rst +++ b/docs/source/rst_snippets/basic_options.rst @@ -33,10 +33,11 @@ documenting a program's options. For example: --system-package Scan ```` for installed system package databases. --b, --binary-package Scan for package and dependency related - data in binaries. Note that looking for packages - in binaries makes package scan slower. - Currently supported binaries: Go, Rust. +--package-in-compiled Scan compiled executable binaries such as ELF, + WinpE and Mach-O files, looking for structured + package and dependency metadata. Note that looking for + packages in binaries makes package scan slower. + Currently supported compiled binaries: Go, Rust. 
--package-only Scan ```` for system and application only for package metadata, without license/ diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh index 4915695bae8..4c27868c9cf 100755 --- a/etc/release/scancode-create-pypi-wheel.sh +++ b/etc/release/scancode-create-pypi-wheel.sh @@ -19,7 +19,7 @@ set -e ./configure --dev venv/bin/scancode-reindex-licenses -venv/bin/scancode-cache-package-patterns +venv/bin/scancode-reindex-package-patterns python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" ) diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh index ab6a4314d6d..93fb37dc0ce 100755 --- a/etc/release/scancode-create-release-app-linux.sh +++ b/etc/release/scancode-create-release-app-linux.sh @@ -65,7 +65,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses -venv/bin/scancode-cache-package-patterns +venv/bin/scancode-reindex-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh index 41c804137bb..7bcd8b7b270 100755 --- a/etc/release/scancode-create-release-app-macos.sh +++ b/etc/release/scancode-create-release-app-macos.sh @@ -63,7 +63,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses -venv/bin/scancode-cache-package-patterns +venv/bin/scancode-reindex-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh index e4dba1b9b2f..7a8b8ab87d8 100755 --- a/etc/release/scancode-create-release-app-windows.sh +++ b/etc/release/scancode-create-release-app-windows.sh @@ -62,7 +62,7 
@@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses -venv/bin/scancode-cache-package-patterns +venv/bin/scancode-reindex-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/setup-mini.cfg b/setup-mini.cfg index bfb24e2dd33..8a45d58de4b 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -157,7 +157,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses - scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns + scancode-reindex-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/setup.cfg b/setup.cfg index f156833e463..53b666297da 100644 --- a/setup.cfg +++ b/setup.cfg @@ -159,7 +159,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses - scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns + scancode-reindex-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/src/licensedcode/cache.py b/src/licensedcode/cache.py index 92998a2bb41..65b0fe59ce1 100644 --- a/src/licensedcode/cache.py +++ b/src/licensedcode/cache.py @@ -34,7 +34,6 @@ LICENSE_INDEX_DIR = 'license_index' LICENSE_INDEX_FILENAME = 'index_cache' LICENSE_LOCKFILE_NAME = 'scancode_license_index_lockfile' -LICENSE_CHECKSUM_FILE = 'scancode_license_index_tree_checksums' class LicenseCache: diff --git 
a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index 8626fcf7ff6..d3c48b6e259 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -248,22 +248,22 @@ # These handlers are special as they use filetype to -# detect these binaries instead of datafile path patterns +# detect these compiled binaries instead of datafile path patterns # as these are optionally installed, we can skip checking # for filetype if these are not available -BINARY_PACKAGE_DATAFILE_HANDLERS = [] +PACKAGE_IN_COMPILED_DATAFILE_HANDLERS = [] try: from go_inspector.binary import get_go_binary_handler handler = get_go_binary_handler() - BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + PACKAGE_IN_COMPILED_DATAFILE_HANDLERS.append(handler) except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler handler = get_rust_binary_handler() - BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + PACKAGE_IN_COMPILED_DATAFILE_HANDLERS.append(handler) except ImportError: pass @@ -271,7 +271,7 @@ APPLICATION_PACKAGE_DATAFILE_HANDLERS + [ p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS - ] + BINARY_PACKAGE_DATAFILE_HANDLERS + ] + PACKAGE_IN_COMPILED_DATAFILE_HANDLERS ) # registry of all handler classes keyed by datasource_id diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index 92320379887..52703717e39 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -15,7 +15,8 @@ import attr import click -from commoncode.cliutils import PluggableCommandLineOption +from collections import defaultdict + from commoncode.fileutils import create_dir from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS @@ -35,11 +36,10 @@ # This is the Pickle protocol we use, which was added in Python 3.4. 
PICKLE_PROTOCOL = 4 -PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 +PACKAGE_INDEX_LOCK_TIMEOUT = 60 PACKAGE_INDEX_DIR = 'package_patterns_index' PACKAGE_INDEX_FILENAME = 'index_cache' PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile' -PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums' @attr.s @@ -106,20 +106,25 @@ def load_or_build( # acquire lock and wait until timeout to get a lock or die with lockfile.FileLock(lock_file).locked(timeout=timeout): - system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( + system_multiregexes = build_mappings_and_multiregex_patterns( datafile_handlers=system_package_datafile_handlers, ) - application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( + application_multiregexes = build_mappings_and_multiregex_patterns( datafile_handlers=application_package_datafile_handlers, ) all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns( - application_multiregex_patterns, system_multiregex_patterns, + application_multiregex_patterns=application_multiregexes.patterns, + system_multiregex_patterns=system_multiregexes.patterns, ) - system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns) - application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns) + system_package_matcher = multiregex.RegexMatcher(system_multiregexes.patterns) + application_package_matcher = multiregex.RegexMatcher(application_multiregexes.patterns) all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher) + handler_by_regex = ( + system_multiregexes.handler_by_regex | + application_multiregexes.handler_by_regex + ) package_cache = cls( - handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, + handler_by_regex=handler_by_regex, system_package_matcher=system_package_matcher, application_package_matcher=application_package_matcher, all_package_matcher=all_package_matcher, @@ -140,19 
+145,48 @@ def dump(self, cache_file): def get_prematchers_from_glob_pattern(pattern): + """ + Get a list of prematchers required to initialize the + multiregex matchers for a package manifest pattern. + + Prematchers are words that must be present for a pattern to + be matched, and this acts as a pre-matching filter for fast + matching. + >>> get_prematchers_from_glob_pattern('*pyproject.toml') + ['pyproject.toml'] + """ return [ prematcher.lower().lstrip("/") for prematcher in pattern.split("*") if prematcher ] +@attr.s +class AcceleratedPattern(): + regex :str = attr.ib(default=None) # regular expression string + prematchers :list[str] = attr.ib(default=[]) # list of prematcher strinsg for this regex + handler_datasource_ids :list[str] = attr.ib(default=[]) # handler + + +@attr.s +class MultiRegexPatternsandMappings: + multiregex_patterns :list[AcceleratedPattern] = attr.ib(default=[]) + handler_by_regex :dict = attr.ib(default={}) + + @property + def patterns(self): + return [ + (pattern.regex, pattern.prematchers) + for pattern in self.multiregex_patterns + ] + def build_mappings_and_multiregex_patterns(datafile_handlers): """ Return a mapping of regex patterns to datafile handler IDs and multiregex patterns consisting of regex patterns and prematchers. 
""" - handler_by_regex = {} + handler_by_regex = defaultdict(list) multiregex_patterns = [] if not datafile_handlers: @@ -178,11 +212,18 @@ def build_mappings_and_multiregex_patterns(datafile_handlers): else: handler_by_regex[regex_pattern]= [handler.datasource_id] - for regex in handler_by_regex.keys(): - regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) + for regex, handler_ids in handler_by_regex.items(): + regex_and_prematcher = AcceleratedPattern( + regex=regex, + prematchers=prematchers_by_regex.get(regex, []), + handler_datasource_ids=handler_ids, + ) multiregex_patterns.append(regex_and_prematcher) - return multiregex_patterns, handler_by_regex + return MultiRegexPatternsandMappings( + handler_by_regex=handler_by_regex, + multiregex_patterns=multiregex_patterns, + ) def get_cache( @@ -223,7 +264,7 @@ def load_cache_file(cache_file): raise Exception(msg) from e -@click.command(name='scancode-cache-package-patterns') +@click.command(name='scancode-reindex-package-patterns') @click.help_option('-h', '--help') def cache_package_patterns(*args, **kwargs): """Create scancode package manifest patterns cache and exit""" diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index dc55e0053fb..e887ebc860b 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -172,14 +172,13 @@ class PackageScanner(ScanPlugin): ), PluggableCommandLineOption( ( - '-b', - '--binary-package', + '--package-in-compiled', ), is_flag=True, default=False, help=( - 'Scan for package and dependency related data in binaries. ' - 'Currently supported binaries: Go, Rust.' + 'Scan for package and dependency related data in compiled binaries. ' + 'Currently supported compiled binaries: Go, Rust.' 
), help_group=SCAN_GROUP, sort_order=22, @@ -209,14 +208,14 @@ class PackageScanner(ScanPlugin): ), ] - def is_enabled(self, package, system_package, binary_package, package_only, **kwargs): - return package or system_package or binary_package or package_only + def is_enabled(self, package, system_package, package_in_compiled, package_only, **kwargs): + return package or system_package or package_in_compiled or package_only def get_scanner( self, package=True, system_package=False, - binary_package=False, + package_in_compiled=False, package_only=False, **kwargs ): @@ -229,7 +228,7 @@ def get_scanner( get_package_data, application=package, system=system_package, - binary=binary_package, + binary=package_in_compiled, package_only=package_only, ) diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index f60107c6904..7744a550eff 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -14,7 +14,7 @@ from commoncode.fileutils import as_posixpath from packagedcode import HANDLER_BY_DATASOURCE_ID -from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS +from packagedcode import PACKAGE_IN_COMPILED_DATAFILE_HANDLERS from packagedcode import models from packagedcode.cache import get_cache @@ -46,7 +46,7 @@ def recognize_package_data( location, application=True, system=False, - binary=False, + compiled=False, package_only=False, ): """ @@ -63,7 +63,7 @@ def recognize_package_data( location=location, application=application, system=system, - binary=binary, + compiled=compiled, package_only=package_only, )) @@ -72,7 +72,7 @@ def _parse( location, application=True, system=False, - binary=False, + compiled=False, package_only=False, ): """ @@ -86,7 +86,7 @@ def _parse( package_patterns = get_cache() has_patterns = application or system or package_only - assert has_patterns or binary + assert has_patterns or compiled if package_only or (application and system): package_matcher = package_patterns.all_package_matcher elif 
application: @@ -117,8 +117,8 @@ def _parse( ] if not datafile_handlers: - if binary: - datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) + if compiled: + datafile_handlers.extend(PACKAGE_IN_COMPILED_DATAFILE_HANDLERS) elif TRACE: logger_debug(f'_parse: no package datafile detected at {package_path}') diff --git a/src/scancode/api.py b/src/scancode/api.py index d06af7dcf45..71382f4a6a0 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -260,7 +260,7 @@ def _get_package_data( location, application=True, system=False, - binary=False, + compiled=False, package_only=False, **kwargs ): @@ -270,14 +270,14 @@ def _get_package_data( Note that all exceptions are caught if there are any errors while parsing a package manifest. """ - assert application or system or binary or package_only + assert application or system or compiled or package_only from packagedcode.recognize import recognize_package_data try: return recognize_package_data( location=location, application=application, system=system, - binary=binary, + compiled=compiled, package_only=package_only, ) or [] @@ -312,7 +312,7 @@ def get_package_data( location, application=True, system=False, - binary=False, + compiled=False, package_only=False, **kwargs ): @@ -328,7 +328,7 @@ def get_package_data( location=location, application=application, system=system, - binary=binary, + compiled=compiled, package_only=package_only, **kwargs, ) or [] diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py index 98951d9fc8f..27ff079219c 100644 --- a/tests/packagedcode/test_cache.py +++ b/tests/packagedcode/test_cache.py @@ -7,15 +7,13 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -import os.path +import fnmatch +import os from packagedcode import cache from commoncode.fileutils import as_posixpath from packages_test_utils import PackageTester -from scancode_config import REGEN_TEST_FIXTURES -from scancode.cli_test_utils import run_scan_click -from scancode.cli_test_utils import check_json_scan TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') @@ -27,11 +25,11 @@ class TestMultiregexPatterns(PackageTester): def test_build_mappings_and_multiregex_patterns_works(self): from packagedcode.about import AboutFileHandler - multiregex_patterns, handler_by_regex = cache.build_mappings_and_multiregex_patterns( + multiregexes = cache.build_mappings_and_multiregex_patterns( datafile_handlers=[AboutFileHandler], ) - assert multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])] - assert handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']} + assert multiregexes.patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])] + assert multiregexes.handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']} def test_build_package_cache_works(self): from packagedcode.about import AboutFileHandler @@ -52,10 +50,17 @@ def test_build_package_cache_works(self): regex, _match = package_cache.all_package_matcher.match(test_path).pop() assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id - def check_empty_file_scan_works(self): + def test_empty_file_scan_works(self): test_file = self.get_test_loc('cache/.gitignore') package_path = as_posixpath(test_file) package_matcher = cache.get_cache() - assert not package_matcher.match(package_path) + assert not package_matcher.all_package_matcher.match(package_path) + + def test_get_prematchers_from_glob_pattern(self): + + from packagedcode.pypi import PyprojectTomlHandler + + prematchers = cache.get_prematchers_from_glob_pattern(PyprojectTomlHandler.path_patterns[0]) + assert "pyproject.toml" in prematchers diff --git a/tests/packagedcode/test_cargo.py 
b/tests/packagedcode/test_cargo.py index 5b22b69e193..0ca7b3d99f3 100644 --- a/tests/packagedcode/test_cargo.py +++ b/tests/packagedcode/test_cargo.py @@ -159,7 +159,7 @@ def test_scan_works_on_rust_binary_with_inspector(self): test_file = self.get_test_loc('cargo/binary/cargo_dependencies') expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json') result_file = self.get_temp_file('results.json') - run_scan_click(['--binary-package', test_file, '--json', result_file]) + run_scan_click(['--package-in-compiled', test_file, '--json', result_file]) check_json_scan( expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES ) diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index 2c45a354b31..f2ccbb6dc33 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -12,8 +12,9 @@ Options: -p, --package Scan for application package and dependency manifests, lockfiles and related data. --system-package Scan for installed system package databases. - -b, --binary-package Scan for package and dependency related data in - binaries. Currently supported binaries: Go, Rust. + --package-in-compiled Scan for package and dependency related data in + compiled binaries. Currently supported compiled + binaries:Go, Rust. --package-only Scan for system and application package data and skip license/copyright detection and top-level package creation. diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 5d7b1dfed92..a54e2f4fd11 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -12,8 +12,9 @@ Options: -p, --package Scan for application package and dependency manifests, lockfiles and related data. --system-package Scan for installed system package databases. - -b, --binary-package Scan for package and dependency related data in - binaries. Currently supported binaries: Go, Rust. 
+ --package-in-compiled Scan for package and dependency related data in + compiled binaries. Currently supported compiled + binaries: Go, Rust. --package-only Scan for system and application package data and skip license/copyright detection and top-level package creation. From af566f258147bca9fef4a427098814c405bfe754 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 29 Dec 2025 15:28:07 +0530 Subject: [PATCH 11/13] Remove deprecated macos runners Signed-off-by: Ayan Sinha Mahapatra --- azure-pipelines.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b7fb0baac4a..2dc0bc4acc5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -145,14 +145,6 @@ jobs: test_suites: all: venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2 - - template: etc/ci/azure-posix.yml - parameters: - job_name: macos13_cpython - image_name: macOS-13 - python_versions: ['3.10', '3.11', '3.12', '3.13'] - test_suites: - all: venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2 - - template: etc/ci/azure-win.yml parameters: job_name: win2025_cpython @@ -220,14 +212,6 @@ jobs: test_suites: all: venv/bin/pip install --upgrade-strategy eager --force-reinstall --upgrade -e .[testing] && venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py - - template: etc/ci/azure-posix.yml - parameters: - job_name: macos13_cpython_latest_from_pip - image_name: macos-13 - python_versions: ['3.10', '3.11', '3.12', '3.13'] - test_suites: - all: venv/bin/pip install --upgrade-strategy eager --force-reinstall --upgrade -e .[testing] && venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py - - template: etc/ci/azure-win.yml parameters: job_name: win2019_cpython_latest_from_pip From 63d72cfe507b42b15532bda0964c8be886e2008d Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 7 Jan 2026 14:09:03 +0530 Subject: [PATCH 12/13] Fix test failures Signed-off-by: Ayan Sinha Mahapatra --- requirements.txt | 2 +- 
tests/scancode/data/help/help.txt | 20 ++++++++++---------- tests/scancode/data/help/help_linux.txt | 24 ++++++++++++------------ tests/scancode/test_cli.py | 11 +++-------- 4 files changed, 26 insertions(+), 31 deletions(-) diff --git a/requirements.txt b/requirements.txt index e7d5c43e483..165afc9d3b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ charset-normalizer==3.4.2 click==8.3.0;python_version>='3.10' click==8.1.7;python_version<'3.10' colorama==0.4.6 -commoncode==32.4.0 +commoncode==32.4.2 construct==2.10.70 container-inspector==33.0.0 cryptography==45.0.4 diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index f2ccbb6dc33..e725888ead4 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -8,17 +8,17 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. --package-in-compiled Scan for package and dependency related data in - compiled binaries. Currently supported compiled - binaries:Go, Rust. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package - creation. - -c, --copyright Scan for copyrights. + compiled binaries. Currently supported compiled + binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. other scans: -i, --info Scan for file information (size, checksums, etc). 
diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index a54e2f4fd11..6794b19d602 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -8,19 +8,19 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. --package-in-compiled Scan for package and dependency related data in - compiled binaries. Currently supported compiled - binaries: Go, Rust. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package - creation. - -c, --copyright Scan for copyrights. - --go-symbol Collect Go symbols. - --rust-symbol Collect Rust symbols from rust binaries. + compiled binaries. Currently supported compiled + binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. + --go-symbol Collect Go symbols. + --rust-symbol Collect Rust symbols from rust binaries. other scans: -i, --info Scan for file information (size, checksums, etc). 
diff --git a/tests/scancode/test_cli.py b/tests/scancode/test_cli.py index 0f543a274a9..d83487b09c9 100644 --- a/tests/scancode/test_cli.py +++ b/tests/scancode/test_cli.py @@ -19,8 +19,6 @@ from commoncode.system import on_mac from commoncode.system import on_macos_14_or_higher from commoncode.system import on_windows -from commoncode.system import py36 -from commoncode.system import py37 from scancode.cli_test_utils import check_json_scan from scancode.cli_test_utils import load_json_result @@ -901,9 +899,6 @@ def test_check_error_count(): assert str(error_files) == str(error_count) -on_mac_new_py = on_mac and not (py36 or py37) - - def test_scan_keep_temp_files_is_false_by_default(): test_file = test_env.get_test_loc('tempfiles/samples') result_file = test_env.get_temp_file('json') @@ -919,7 +914,7 @@ def test_scan_keep_temp_files_is_false_by_default(): # the SCANCODE_TEMP dir is not deleted, but it should be empty assert os.path.exists(temp_directory) # this does not make sense but that's what is seen in practice - if on_mac_new_py: + if on_mac: expected = 4 elif on_windows: expected = 2 @@ -945,8 +940,8 @@ def test_scan_keep_temp_files_keeps_files(): # the SCANCODE_TEMP dir is not deleted, but it should not be empty assert os.path.exists(temp_directory) # this does not make sense but that's what is seen in practice - expected = 8 if (on_windows or on_mac_new_py) else 7 - if on_mac_new_py: + expected = 8 if (on_windows or on_mac) else 7 + if on_mac: expected = 10 elif on_windows: expected = 8 From db6d1a4f72d78f8e28cd6f32f8badc17613ac93d Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 8 Jan 2026 18:45:04 +0530 Subject: [PATCH 13/13] Fix misc test failures Signed-off-by: Ayan Sinha Mahapatra --- README.rst | 9 ++++++--- src/packagedcode/plugin_package.py | 2 +- src/packagedcode/win_reg.py | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 9da7b7c5cbf..c35b142d8a2 100644 --- a/README.rst +++ 
b/README.rst @@ -2,12 +2,15 @@ ScanCode Toolkit ================ -ScanCode Toolkit is a set of code scanning tools that detect the origin (copyrights), license and vulnerabilities of code, packages and dependencies in a codebase. ScanCode Toolkit is an `AboutCode project `_. +ScanCode Toolkit is a set of code scanning tools that detect the origin (copyrights), license and vulnerabilities of code, +packages and dependencies in a codebase. ScanCode Toolkit is an `AboutCode project `_. Why Use ScanCode Toolkit? ========================= -ScanCode Toolkit is the leading tool in scanning depth and accuracy, used by hundreds of software teams. You can use ScanCode Toolkit as a command line tool or as a library. +ScanCode Toolkit is the leading tool in scanning depth and accuracy, +used by hundreds of software teams. You can use ScanCode Toolkit +as a command line tool or as a library. Getting Started =============== @@ -84,7 +87,7 @@ Benefits of ScanCode Support ======= -If you have a specific problem, suggestion or bug, please submit a +If you have a specific problem, suggestion or bug, please submit a `GitHub issue `_. For quick questions or socializing, join the AboutCode community discussions on `Slack `_. 
diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index e887ebc860b..027d89427b0 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -228,7 +228,7 @@ def get_scanner( get_package_data, application=package, system=system_package, - binary=package_in_compiled, + compiled=package_in_compiled, package_only=package_only, ) diff --git a/src/packagedcode/win_reg.py b/src/packagedcode/win_reg.py index 8e8327f06e8..228260f1715 100644 --- a/src/packagedcode/win_reg.py +++ b/src/packagedcode/win_reg.py @@ -13,6 +13,8 @@ from pathlib import Path from pathlib import PureWindowsPath +from dataclasses import asdict + import attr try: @@ -68,7 +70,7 @@ def get_registry_tree(registry_location, registry_path): if not name_key_entry: return [] return [ - attr.asdict(entry) for entry in registry_hive.recurse_subkeys(name_key_entry, as_json=True) + asdict(entry) for entry in registry_hive.recurse_subkeys(name_key_entry, as_json=True) ]