From 65b443797d01faa1538d5bc26afce10f12afd4d7 Mon Sep 17 00:00:00 2001 From: tdruez Date: Tue, 16 Dec 2025 08:42:17 +0400 Subject: [PATCH 1/9] Add option to infer_download_urls on Product importers Signed-off-by: tdruez --- product_portfolio/forms.py | 11 ++++++++++ product_portfolio/importers.py | 21 ++++++++++++++++++- ...014_scancodeproject_infer_download_urls.py | 18 ++++++++++++++++ product_portfolio/models.py | 4 ++++ 4 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 product_portfolio/migrations/0014_scancodeproject_infer_download_urls.py diff --git a/product_portfolio/forms.py b/product_portfolio/forms.py index 216bfa6d..164b9091 100644 --- a/product_portfolio/forms.py +++ b/product_portfolio/forms.py @@ -641,6 +641,15 @@ class BaseProductImportFormView(forms.Form): "for all of the packages assigned to your product." ), ) + infer_download_urls = forms.BooleanField( + label=_("Infer missing download URLs"), + required=False, + initial=True, + help_text=_( + "When a download URL is missing from the input data, attempt to infer it " + "from the Package URL (purl). A download URL is required for package scanning." 
+ ), + ) @property def helper(self): @@ -652,6 +661,7 @@ def helper(self): Fieldset( None, "input_file", + "infer_download_urls", "update_existing_packages", "scan_all_packages", StrictSubmit("submit", _("Import"), css_class="btn-success col-2"), @@ -667,6 +677,7 @@ def submit(self, product, user): input_file=self.cleaned_data.get("input_file"), update_existing_packages=self.cleaned_data.get("update_existing_packages"), scan_all_packages=self.cleaned_data.get("scan_all_packages"), + infer_download_urls=self.cleaned_data.get("infer_download_urls"), created_by=user, ) diff --git a/product_portfolio/importers.py b/product_portfolio/importers.py index 699c746d..5b88eb49 100644 --- a/product_portfolio/importers.py +++ b/product_portfolio/importers.py @@ -26,6 +26,7 @@ from component_catalog.models import PACKAGE_URL_FIELDS from component_catalog.models import Component from component_catalog.models import Package +from dejacode_toolkit import download from dejacode_toolkit.scancodeio import ScanCodeIO from dje.copier import copy_object from dje.importers import BaseImporter @@ -649,7 +650,15 @@ class ImportPackageFromScanCodeIO: "filename", ] - def __init__(self, user, project_uuid, product, update_existing=False, scan_all_packages=False): + def __init__( + self, + user, + project_uuid, + product, + update_existing=False, + scan_all_packages=False, + infer_download_urls=False, + ): self.licensing = Licensing() self.created = defaultdict(list) self.existing = defaultdict(list) @@ -662,6 +671,7 @@ def __init__(self, user, project_uuid, product, update_existing=False, scan_all_ self.product = product self.update_existing = update_existing self.scan_all_packages = scan_all_packages + self.infer_download_urls = infer_download_urls scancodeio = ScanCodeIO(user.dataspace) self.packages = scancodeio.fetch_project_packages(self.project_uuid) @@ -696,6 +706,15 @@ def import_package(self, package_data): # Check if the package already exists to prevent duplication. 
package = self.look_for_existing_package(package_data) + # Infer a download URL from the Package URL + if ( + self.infer_download_urls + and not package_data.get("download_url") + and (purl := package_data.get("purl")) + and (download_url := download.infer_download_url(purl)) + ): + package_data["download_url"] = download_url + if license_expression := package_data.get("declared_license_expression"): license_expression = str(self.licensing.dedup(license_expression)) package_data["license_expression"] = license_expression diff --git a/product_portfolio/migrations/0014_scancodeproject_infer_download_urls.py b/product_portfolio/migrations/0014_scancodeproject_infer_download_urls.py new file mode 100644 index 00000000..4bcd43b4 --- /dev/null +++ b/product_portfolio/migrations/0014_scancodeproject_infer_download_urls.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.8 on 2025-12-16 04:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("product_portfolio", "0013_productstatus_is_locked_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="scancodeproject", + name="infer_download_urls", + field=models.BooleanField(default=False), + ), + ] diff --git a/product_portfolio/models.py b/product_portfolio/models.py index 0fa97dae..b66882b5 100644 --- a/product_portfolio/models.py +++ b/product_portfolio/models.py @@ -1555,6 +1555,9 @@ class Status(models.TextChoices): scan_all_packages = models.BooleanField( default=False, ) + infer_download_urls = models.BooleanField( + default=False, + ) status = models.CharField( max_length=10, choices=Status.choices, @@ -1615,6 +1618,7 @@ def import_data_from_scancodeio(self): product=self.product, update_existing=self.update_existing_packages, scan_all_packages=self.scan_all_packages, + infer_download_urls=self.infer_download_urls, ) created, existing, errors = importer.save() From f143fe26123db4814ca3afdc8b473a1f084b92bb Mon Sep 17 00:00:00 2001 From: tdruez 
Date: Tue, 16 Dec 2025 08:46:53 +0400 Subject: [PATCH 2/9] Add StaticResolver,DynamicResolver steps to the resolve_dependencies Signed-off-by: tdruez --- product_portfolio/forms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/product_portfolio/forms.py b/product_portfolio/forms.py index 164b9091..7a71e5f3 100644 --- a/product_portfolio/forms.py +++ b/product_portfolio/forms.py @@ -727,7 +727,7 @@ class LoadSBOMsForm(BaseProductImportFormView): class ImportManifestsForm(BaseProductImportFormView): project_type = ScanCodeProject.ProjectType.IMPORT_FROM_MANIFEST - pipeline_name = "resolve_dependencies" + pipeline_name = "resolve_dependencies:StaticResolver,DynamicResolver" input_file = SmartFileField( label=_("Manifest file or zip archive"), From 019f8bea7c09feb5f8d2d2d87ef40f87f6d459e0 Mon Sep 17 00:00:00 2001 From: tdruez Date: Tue, 16 Dec 2025 10:08:03 +0400 Subject: [PATCH 3/9] Add infer_download_urls option in REST API Signed-off-by: tdruez --- product_portfolio/api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/product_portfolio/api.py b/product_portfolio/api.py index 9f6fa0af..d24c5076 100644 --- a/product_portfolio/api.py +++ b/product_portfolio/api.py @@ -227,6 +227,12 @@ class LoadSBOMsFormSerializer(serializers.Serializer): required=True, help_text=LoadSBOMsForm.base_fields["input_file"].label, ) + infer_download_urls = serializers.BooleanField( + required=False, + initial=True, + default=True, + help_text=LoadSBOMsForm.base_fields["infer_download_urls"].help_text, + ) update_existing_packages = serializers.BooleanField( required=False, default=False, @@ -246,6 +252,12 @@ class ImportManifestsFormSerializer(serializers.Serializer): required=True, help_text=ImportManifestsForm.base_fields["input_file"].label, ) + infer_download_urls = serializers.BooleanField( + required=False, + initial=True, + default=True, + help_text=ImportManifestsForm.base_fields["infer_download_urls"].help_text, + ) update_existing_packages = 
serializers.BooleanField( required=False, default=False, From a2b2f2d963885c9122fc7f428930a154e76b81f0 Mon Sep 17 00:00:00 2001 From: tdruez Date: Tue, 16 Dec 2025 10:08:55 +0400 Subject: [PATCH 4/9] Refine the Exception case in ImportPackageFromScanCodeIO Signed-off-by: tdruez --- product_portfolio/importers.py | 4 ++-- product_portfolio/tests/test_importers.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/product_portfolio/importers.py b/product_portfolio/importers.py index 5b88eb49..7b931bcf 100644 --- a/product_portfolio/importers.py +++ b/product_portfolio/importers.py @@ -675,9 +675,9 @@ def __init__( scancodeio = ScanCodeIO(user.dataspace) self.packages = scancodeio.fetch_project_packages(self.project_uuid) - if not self.packages: - raise Exception("Packages could not be fetched from ScanCode.io") self.dependencies = scancodeio.fetch_project_dependencies(self.project_uuid) + if not self.packages and not self.dependencies: + raise Exception("Packages could not be fetched from ScanCode.io") def save(self): self.import_packages() diff --git a/product_portfolio/tests/test_importers.py b/product_portfolio/tests/test_importers.py index a5d03cf2..2b0144ee 100644 --- a/product_portfolio/tests/test_importers.py +++ b/product_portfolio/tests/test_importers.py @@ -1153,13 +1153,13 @@ def test_product_portfolio_import_packages_from_scio_importer_look_for_existing_ package = importer.look_for_existing_package(package_data) self.assertEqual(package1, package) - # 2 packages are matched, cannot defined the one that should be used + # 2 packages are matched, cannot define the one that should be used package1.update(download_url=download_url) package = importer.look_for_existing_package(package_data) self.assertIsNone(package) # If the package data does not include a download_url value: - # Attemp to find an existing package using purl-only match. + # Attempt to find an existing package using purl-only match. 
package2.delete() package = importer.look_for_existing_package(package_data) self.assertEqual(package1, package) From 8d465a8c95aea78b99cface5eca42ef7bf3d2e7e Mon Sep 17 00:00:00 2001 From: tdruez Date: Tue, 16 Dec 2025 14:42:13 +0400 Subject: [PATCH 5/9] Add the infer_download_urls in scan_all_packages_task Signed-off-by: tdruez --- component_catalog/models.py | 10 +++++++- product_portfolio/models.py | 25 +++++++++++++++---- .../modals/scan_all_packages_modal.html | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/component_catalog/models.py b/component_catalog/models.py index a8a4ced2..ddabd900 100644 --- a/component_catalog/models.py +++ b/component_catalog/models.py @@ -1719,6 +1719,10 @@ def has_package_url(self): """Return objects with Package URL defined.""" return self.filter(~models.Q(type="") & ~models.Q(name="")) + def has_download_url(self): + """Return objects with download URL defined.""" + return self.filter(~models.Q(download_url="")) + def annotate_sortable_identifier(self): """ Annotate the QuerySet with a `sortable_identifier` value that combines @@ -2036,9 +2040,13 @@ def package_url_filename(self): @property def inferred_repo_url(self): - """Return the URL deduced from the information available in a Package URL (purl).""" + """Return the repo URL deduced from the Package URL (purl).""" return purl2url.get_repo_url(self.package_url) + def infer_download_url(self): + """Infer the download URL deduced from the Package URL (purl).""" + return download.infer_download_url(self.package_url) + def get_url(self, name, params=None, include_identifier=False): if not params: params = [self.dataspace.name, quote_plus(str(self.uuid))] diff --git a/product_portfolio/models.py b/product_portfolio/models.py index b66882b5..c82db0b4 100644 --- a/product_portfolio/models.py +++ b/product_portfolio/models.py @@ -557,25 +557,40 @@ def assign_objects(self, related_objects, user, replace_version=False): return created_count, updated_count, 
unchanged_count - def scan_all_packages_task(self, user): + def scan_all_packages_task(self, user, infer_download_urls=False): """ Submit a Scan request to ScanCode.io for each package assigned to this Product. Only packages with a proper download URL are sent. """ - package_urls = [ + if infer_download_urls: + self.improve_packages_from_purl() + + package_download_urls = [ package.download_url - for package in self.all_packages + for package in self.all_packages.has_download_url() if package.download_url.startswith(("http", "https")) ] tasks.scancodeio_submit_scan.delay( - uris=package_urls, + uris=package_download_urls, user_uuid=user.uuid, dataspace_uuid=user.dataspace.uuid, ) + def improve_packages_from_purl(self): + """Infer missing packages download URL using the Package URL when possible.""" + updated_packages = [] + + packages = self.all_packages.has_package_url().filter(models.Q(download_url="")) + for package in packages: + if download_url := package.infer_download_url(): + package.update(download_url=download_url) + updated_packages.append(package) + + return updated_packages + def improve_packages_from_purldb(self, user): - """Update all Packages assigned to the Product using PurlDB data.""" + """Update all packages assigned to the Product using PurlDB data.""" updated_packages = [] for package in self.packages.all(): updated_fields = package.update_from_purldb(user) diff --git a/product_portfolio/templates/product_portfolio/modals/scan_all_packages_modal.html b/product_portfolio/templates/product_portfolio/modals/scan_all_packages_modal.html index 4f06c9f5..55a1272e 100644 --- a/product_portfolio/templates/product_portfolio/modals/scan_all_packages_modal.html +++ b/product_portfolio/templates/product_portfolio/modals/scan_all_packages_modal.html @@ -6,7 +6,7 @@