Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Next release
- Replace unmaintained ``toml`` library with ``tomllib`` / ``tomli``.
https://github.com/aboutcode-org/scancode-toolkit/issues/4532

- Support directories and glob patterns in ``referenced_filenames`` for rules.
https://github.com/nexB/scancode-toolkit/issues/4276

v32.4.1 - 2025-07-23
--------------------

Expand Down
112 changes: 74 additions & 38 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import uuid
from enum import Enum
from hashlib import sha1
from fnmatch import fnmatch

import attr
from collections import defaultdict
Expand Down Expand Up @@ -1900,22 +1901,23 @@ def has_resolved_referenced_file(license_matches):

def find_referenced_resource_from_package(referenced_filename, resource, codebase, **kwargs):
"""
Return a Resource matching the ``referenced_filename`` path or filename
Return a list of Resources matching the ``referenced_filename`` path or filename
given a ``resource`` in ``codebase``.

To find the `referenced_filename` the sibling files are searched beside all the
package manifest paths, for all the packages which the resource is a part of,
to resolve references to files in package ecosystem specific locations.

Return None if the ``referenced_filename`` cannot be found in the same
Return an empty list if the ``referenced_filename`` cannot be found in the same
directory as the base ``resource``, or at the codebase ``root``.

``referenced_filename`` is the path or filename referenced in a
LicenseMatch detected at ``resource``,
"""
matches = []
codebase_packages = codebase.attributes.packages
if not (resource and codebase_packages):
return
return matches

datafile_paths_by_package_uid = {}
for package in codebase_packages:
Expand Down Expand Up @@ -1945,43 +1947,72 @@ def find_referenced_resource_from_package(referenced_filename, resource, codebas
referenced_path = posixpath.join(parent_path, referenced_filename)
referenced_resource = codebase.get_resource(path=referenced_path)
if referenced_resource:
return referenced_resource
matches.append(referenced_resource)

return matches


def find_referenced_resource(referenced_filename, resource, codebase, **kwargs):
    """
    Return a list of Resources matching the ``referenced_filename`` path, glob
    pattern or filename given a ``resource`` in ``codebase``.

    The ``referenced_filename`` is looked up relative to the parent directory
    of ``resource`` first, and then relative to the ``codebase`` root:

    - a glob pattern (containing any of ``*?[]``) is matched with ``fnmatch``
      against the path of every Resource of the codebase. Note that with
      ``fnmatch`` a ``*`` also matches the ``/`` path separator.
    - a path to a directory returns the direct children of this directory.
    - any other path returns the Resource found at this exact path.

    Each Resource is returned only once, in the order it was first found.

    Return an empty list if ``resource`` is empty, if it has no parent path, or
    if nothing is found for ``referenced_filename``.

    ``referenced_filename`` is the path, pattern or filename referenced in a
    LicenseMatch detected at ``resource``.
    """
    # local import: only needed to escape the base directories of a pattern
    from glob import escape as escape_glob

    matches = []
    if not resource:
        return matches

    parent_path = resource.parent_path()
    if not parent_path:
        return matches

    # this can be a path, a glob pattern or a plain name
    referenced_filename = clean_path(referenced_filename)

    # Only the reference itself decides if this is a pattern: a base directory
    # that happens to contain a glob character must not turn a plain filename
    # lookup into a pattern match.
    is_glob = any(char in referenced_filename for char in '*?[]')

    seen_paths = set()

    def add_match(found):
        # keep unique matches only, in first-seen order
        if found.path not in seen_paths:
            seen_paths.add(found.path)
            matches.append(found)

    # Search beside the resource first, then at the codebase root. These two
    # are the same directory for a resource at the root: dict.fromkeys() drops
    # the duplicate while keeping the order, which avoids a second walk.
    base_paths = dict.fromkeys([parent_path, codebase.root.path])

    for base_path in base_paths:
        if is_glob:
            # The base directory is literal: escape it so that only the
            # referenced pattern is interpreted by fnmatch.
            pattern = posixpath.join(escape_glob(base_path), referenced_filename)
            # NOTE: this walks the whole codebase; limiting the walk to the
            # base_path subtree would be a possible optimization.
            for candidate in codebase.walk(topdown=True):
                if fnmatch(candidate.path, pattern):
                    add_match(candidate)
            continue

        exact_path = posixpath.join(base_path, referenced_filename)
        found = codebase.get_resource(path=exact_path)
        if not found:
            continue

        if found.is_dir:
            # a reference to a directory stands for its direct children
            for child in found.children(codebase):
                add_match(child)
        else:
            add_match(found)

    return matches


def update_expressions_from_license_detections(resource, codebase):
Expand Down Expand Up @@ -2041,25 +2072,30 @@ def update_detection_from_referenced_files(
referenced_detections = []
referenced_resources = []
for referenced_filename in referenced_filenames:
referenced_resource = find_referenced_resource_func(
found_resources = find_referenced_resource_func(
referenced_filename=referenced_filename,
resource=resource,
codebase=codebase,
)

if referenced_resource and referenced_resource.license_detections:
referenced_detections.extend(
referenced_resource.license_detections
)
referenced_resources.append(referenced_resource)

# For LicenseMatches with different resources as origin, add the
# resource path to these matches as origin info
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
) or []

# Ensure we work with a list even if the func returns a single item (legacy support)
if hasattr(found_resources, 'path'):
found_resources = [found_resources]

for referenced_resource in found_resources:
if referenced_resource and referenced_resource.license_detections:
referenced_detections.extend(
referenced_resource.license_detections
)
referenced_resources.append(referenced_resource)

# For LicenseMatches with different resources as origin, add the
# resource path to these matches as origin info
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
)

if not referenced_detections:
return False
Expand All @@ -2080,7 +2116,7 @@ def update_detection_from_referenced_files(
if TRACE_REFERENCE and referenced_resources:
paths = [
resource.path
for resource in referenced_resource
for resource in referenced_resources
]
logger_debug(
f'use_referenced_license_expression: False for '
Expand All @@ -2092,7 +2128,7 @@ def update_detection_from_referenced_files(
if TRACE_REFERENCE and referenced_resources:
paths = [
resource.path
for resource in referenced_resource
for resource in referenced_resources
]
logger_debug(
f'use_referenced_license_expression: True for '
Expand Down
78 changes: 78 additions & 0 deletions tests/licensedcode/test_issue_4276.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@

# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import os
from commoncode.testcase import FileDrivenTesting
from commoncode.resource import Codebase
from licensedcode.detection import find_referenced_resource

class TestIssue4276(FileDrivenTesting):
    """
    Check that ``find_referenced_resource`` resolves a glob pattern and a
    directory reference to a list of Resources.
    """

    def _build_codebase(self, readme_text):
        """
        Create a temp directory with a ``README.txt`` containing
        ``readme_text`` and a ``licenses/MIT.txt`` file. Return a tuple of
        (codebase, readme) where ``readme`` is the ``README.txt`` Resource.
        """
        test_dir = self.get_temp_dir()
        os.makedirs(os.path.join(test_dir, 'licenses'))
        with open(os.path.join(test_dir, 'README.txt'), 'w') as f:
            f.write(readme_text)
        with open(os.path.join(test_dir, 'licenses', 'MIT.txt'), 'w') as f:
            f.write('MIT License')

        codebase = Codebase(test_dir)
        readme = next(
            (res for res in codebase.walk() if res.name == 'README.txt'),
            None,
        )
        assert readme is not None, "README.txt resource not found in codebase"
        return codebase, readme

    def test_find_referenced_resource_with_glob(self):
        codebase, readme = self._build_codebase('See licenses/*.txt')

        result = find_referenced_resource('licenses/*.txt', readme, codebase)

        # a glob reference always returns a list of matching resources
        assert isinstance(result, list)
        found_paths = [r.path for r in result]
        assert found_paths
        assert all(p.endswith('licenses/MIT.txt') for p in found_paths)

    def test_find_referenced_resource_with_directory(self):
        codebase, readme = self._build_codebase('See licenses/')

        # referencing a directory should return all files in it
        result = find_referenced_resource('licenses/', readme, codebase)

        assert isinstance(result, list)
        # We expect it to find the file inside the directory
        found_paths = [r.path for r in result]
        assert any(p.endswith('licenses/MIT.txt') for p in found_paths)
8 changes: 6 additions & 2 deletions tests/licensedcode/test_plugin_license_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,9 @@ def test_find_referenced_resource():
codebase = VirtualCodebase(scan_loc)
resource = codebase.get_resource(path='scan-ref/license-notice.txt')
result = find_referenced_resource(referenced_filename='LICENSE', resource=resource, codebase=codebase)
assert result.path == 'scan-ref/LICENSE'
assert isinstance(result, list)
assert len(result) == 1
assert result[0].path == 'scan-ref/LICENSE'


def test_find_referenced_resource_does_not_find_based_file_name_suffix():
Expand All @@ -346,7 +348,9 @@ def test_find_referenced_resource_does_not_find_based_file_name_suffix():
codebase = VirtualCodebase(scan_loc)
resource = codebase.get_resource(path='scan-ref-dupe-name-suffix/license-notice.txt')
result = find_referenced_resource(referenced_filename='LICENSE', resource=resource, codebase=codebase)
assert result.path == 'scan-ref-dupe-name-suffix/LICENSE'
assert isinstance(result, list)
assert len(result) == 1
assert result[0].path == 'scan-ref-dupe-name-suffix/LICENSE'


def test_match_reference_license():
Expand Down
Loading