Skip to content

Commit

Permalink
Use the PurlDB collect endpoint for the enrich pipeline #1328
Browse files Browse the repository at this point in the history
Signed-off-by: tdruez <[email protected]>
  • Loading branch information
tdruez committed Jul 23, 2024
1 parent 90d0292 commit 1df1b6e
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 27 deletions.
76 changes: 62 additions & 14 deletions scanpipe/pipes/purldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,27 @@ class PurlDBException(Exception):
# This key can be used for filtering
ENRICH_EXTRA_DATA_KEY = "enrich_with_purldb"

# Subset of fields kept when multiple entries are found in the PurlDB.
# ``enrich_package`` filters the first PurlDB entry down to these keys when more
# than one entry is returned, so that fields specific to a given version are not
# set on the package.
CROSS_VERSION_COMMON_FIELDS = [
    "primary_language",
    "description",
    "parties",
    "keywords",
    "homepage_url",
    "bug_tracking_url",
    "code_view_url",
    "vcs_url",
    "repository_homepage_url",
    "copyright",
    "holder",
    "declared_license_expression",
    "declared_license_expression_spdx",
    "other_license_expression",
    "other_license_expression_spdx",
    "extracted_license_statement",
    "notice_text",
]


def is_configured():
"""Return True if the required PurlDB settings have been set."""
Expand Down Expand Up @@ -370,13 +391,16 @@ def get_packages_for_purl(package_url):
return find_packages(payload)


def get_package_by_purl(package_url):
    """
    Return a single Package details entry for the provided ``package_url``.

    The lookup is delegated to ``get_packages_for_purl``. If multiple entries
    are found in the PurlDB, the first one is returned, which is expected to be
    the most recent version. ``None`` is returned when nothing is found.
    """
    matches = get_packages_for_purl(package_url)
    if not matches:
        return None
    return matches[0]
def collect_data_for_purl(package_url):
    """
    Query the PurlDB ``collect/`` endpoint for the provided ``package_url``.

    The request is sent with ``sort=-version``; presumably this puts the most
    recent version first (to be confirmed against the PurlDB API).
    Return the response data when it is not empty, ``None`` otherwise.
    """
    endpoint_url = f"{PURLDB_API_URL}collect/"
    query = {"purl": str(package_url), "sort": "-version"}
    # Any empty response (``None``, ``[]``, ``{}``) is normalized to ``None``.
    return request_get(endpoint_url, payload=query) or None


def get_next_download_url(timeout=DEFAULT_TIMEOUT, api_url=PURLDB_API_URL):
Expand Down Expand Up @@ -476,17 +500,41 @@ def get_run_status(run, **kwargs):

def enrich_package(package):
    """
    Enrich the provided ``package`` with the PurlDB data.

    The PurlDB entries are collected for ``package.package_url`` using
    ``collect_data_for_purl``:

    - No entries: the package is left untouched and ``None`` is returned.
    - A single entry: all of its data is used to enrich the package.
    - Multiple entries: a warning is added on the package project and only the
      ``CROSS_VERSION_COMMON_FIELDS`` of the first entry are used, so that
      fields specific to a given version are not set on the package.

    When ``package.update_from_data()`` reports updated fields, those are
    stored in the package ``extra_data`` under ``ENRICH_EXTRA_DATA_KEY`` and
    returned. ``None`` is returned when nothing was updated.
    """
    package_url = package.package_url
    project = package.project

    purldb_entries = collect_data_for_purl(package_url)
    if not purldb_entries:
        return

    if len(purldb_entries) == 1:
        # Single match, all the PurlDB data are used to enrich the package.
        purldb_entry = purldb_entries[0]
    else:
        project.add_warning(
            model="PurlDB",
            description=(
                f"Multiple entries found in the PurlDB for {package_url}\n"
                "Using data from the most recent version."
            ),
            details={"package_url": package_url, "uuid": package.uuid},
        )
        # Do not set fields specific to a given version.
        purldb_entry = {
            field: value
            for field, value in purldb_entries[0].items()
            if field in CROSS_VERSION_COMMON_FIELDS
        }

    package_data = _clean_package_data(purldb_entry)
    if updated_fields := package.update_from_data(package_data):
        package.update_extra_data({ENRICH_EXTRA_DATA_KEY: updated_fields})
        return updated_fields


def enrich_discovered_packages(project, logger=logger.info):
"""Enrich all project discovered packages with the PurlDB data."""
packages = project.discoveredpackages.all()
packages = project.discoveredpackages.all().select_related("project")

updated_package_count = 0
for package in packages:
Expand Down
18 changes: 8 additions & 10 deletions scanpipe/tests/pipes/test_purldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,17 +237,17 @@ def test_scanpipe_pipes_purldb_create_project_name(self):
project_name = purldb.create_project_name(download_url, scannable_uri_uuid)
self.assertEqual("httpsregistrynpmjsorgasdf-asdf-101tgz-52b2930d", project_name)

@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
def test_scanpipe_pipes_purldb_enrich_package(self, mock_get_package_by_purl):
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
def test_scanpipe_pipes_purldb_enrich_package(self, mock_collect_data):
package1 = make_package(self.project1, package_url="pkg:npm/[email protected]")

mock_get_package_by_purl.return_value = {}
mock_collect_data.return_value = []
updated_fields = purldb.enrich_package(package=package1)
self.assertIsNone(updated_fields)

purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
purldb_entry = json.loads(purldb_entry_file.read_text())
mock_get_package_by_purl.return_value = purldb_entry
mock_collect_data.return_value = [purldb_entry]
updated_fields = purldb.enrich_package(package=package1)
self.assertTrue(updated_fields)
self.assertIn("homepage_url", updated_fields)
Expand All @@ -258,13 +258,11 @@ def test_scanpipe_pipes_purldb_enrich_package(self, mock_get_package_by_purl):
self.assertEqual(purldb_entry.get("sha256"), package1.sha256)
self.assertEqual(purldb_entry.get("copyright"), package1.copyright)

@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
def test_scanpipe_pipes_purldb_enrich_discovered_packages(
self, mock_get_package_by_purl
):
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
def test_scanpipe_pipes_purldb_enrich_discovered_packages(self, mock_collect_data):
package1 = make_package(self.project1, package_url="pkg:npm/[email protected]")

mock_get_package_by_purl.return_value = {}
mock_collect_data.return_value = []
buffer = io.StringIO()
updated_package_count = purldb.enrich_discovered_packages(
project=self.project1,
Expand All @@ -276,7 +274,7 @@ def test_scanpipe_pipes_purldb_enrich_discovered_packages(

purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
purldb_entry = json.loads(purldb_entry_file.read_text())
mock_get_package_by_purl.return_value = purldb_entry
mock_collect_data.return_value = [purldb_entry]
buffer = io.StringIO()
updated_package_count = purldb.enrich_discovered_packages(
project=self.project1,
Expand Down
6 changes: 3 additions & 3 deletions scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -1709,9 +1709,9 @@ def test_scanpipe_collect_symbols_tree_sitter_pipeline_integration(self):

@mock.patch("scanpipe.pipes.purldb.is_available")
@mock.patch("scanpipe.pipes.purldb.is_configured")
@mock.patch("scanpipe.pipes.purldb.get_package_by_purl")
@mock.patch("scanpipe.pipes.purldb.collect_data_for_purl")
def test_scanpipe_enrich_with_purldb_pipeline_integration(
self, mock_get_package, mock_is_configured, mock_is_available
self, mock_collect_data, mock_is_configured, mock_is_available
):
pipeline_name = "enrich_with_purldb"
project1 = Project.objects.create(name="Analysis")
Expand All @@ -1722,7 +1722,7 @@ def test_scanpipe_enrich_with_purldb_pipeline_integration(

purldb_entry_file = self.data / "purldb" / "csvtojson-2.0.10.json"
purldb_entry = json.loads(purldb_entry_file.read_text())
mock_get_package.return_value = purldb_entry
mock_collect_data.return_value = [purldb_entry]

run = project1.add_pipeline(pipeline_name)
pipeline = run.make_pipeline_instance()
Expand Down

0 comments on commit 1df1b6e

Please sign in to comment.