Skip to content

Commit

Permalink
Merge pull request #235 from nexB/store_scancode_scans_locally
Browse files Browse the repository at this point in the history
Store scancode scans locally
  • Loading branch information
pombredanne authored Jul 22, 2024
2 parents d4094f5 + a746356 commit 14115c0
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 5 deletions.
13 changes: 8 additions & 5 deletions clearcode/cdutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
'nuget': 'nuget',
'pypi': 'pypi',
'gem': 'gem',
'npm': 'npm',
'go': 'golang',
}


Expand Down Expand Up @@ -104,7 +106,7 @@ class Coordinate(object):
ClearlyDefined coordinates are used to identify any tracked component.
"""

base_api_url = 'https://api.clearlydefined.io'
base_api_url = 'https://dev-api.clearlydefined.io'

type = attr.ib()
provider = attr.ib()
Expand Down Expand Up @@ -236,15 +238,15 @@ def to_purl(self):
>>> expected = 'pkg:maven/io.dropwizard/[email protected]'
>>> test = Coordinate('maven', 'mavencentral', 'io.dropwizard', 'dropwizard', '2.0.0-rc13').to_purl()
>>> assert expected == test
>>> assert expected == str(test)
>>> expected = 'pkg:maven/io.dropwizard/[email protected]?classifier=sources'
>>> test = Coordinate('sourcearchive', 'mavencentral', 'io.dropwizard', 'dropwizard', '2.0.0-rc13').to_purl()
>>> assert expected == test
>>> assert expected == str(test)
>>> expected = 'pkg:deb/debian/[email protected]?arch=source'
>>> test = Coordinate('debsrc', 'debian', '', 'gedit-plugins', '3.34.0-3').to_purl()
>>> assert expected == test
>>> assert expected == str(test)
"""
converted_package_type = PACKAGE_TYPES_BY_CD_TYPE[self.type]

Expand All @@ -265,7 +267,7 @@ def to_purl(self):
name=self.name,
version=self.revision,
qualifiers=qualifiers,
).to_string()
)

@classmethod
def from_purl(cls, purl):
Expand Down Expand Up @@ -519,6 +521,7 @@ def str2coord(s):
URN: "urn:gem:rubygems:-:mocha:revision:1.7.0:tool:scancode:3.1.0"
plain: /gem/rubygems/foo/mocha/1.7.0"
"""
#TODO: Add doctest
is_urn = s.startswith('urn')
is_url = s.startswith('cd:')
splitter = ':' if is_urn else '/'
Expand Down
22 changes: 22 additions & 0 deletions clearcode/management/commands/store_scans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from clearcode.store_scans import store_scancode_scans_from_cd_items
from minecode.management.commands import VerboseCommand

class Command(VerboseCommand):
help = 'Store scancode scans in git repositories'

def add_arguments(self, parser):
parser.add_argument('work_dir', type=str)
parser.add_argument('--github_org', type=str, default="")
parser.add_argument('--count', type=int, default=0)

def handle(self, *args, **options):
store_scancode_scans_from_cd_items(work_dir=options['work_dir'], github_org=options['github_org'], count=options['count'])
238 changes: 238 additions & 0 deletions clearcode/store_scans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
#
# ClearCode is a free software tool from nexB Inc. and others.
# Visit https://github.com/nexB/clearcode-toolkit/ for support and download.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import defaultdict
from clearcode.models import CDitem
from clearcode.cdutils import Coordinate
from clearcode.cdutils import str2coord
from django.db.models import Q
from hashlib import sha512
import json
import requests
from packageurl import PackageURL
from pathlib import Path
from git import Repo
import os
"""
The input is a bunch of scans from ClearlyDefined and
the output is a bunch of git repositories with commited and
pushed scans such that we balance the scans roughly evenly accross
different repositories.
The primary reason for multiple repositories is size of a single
repo. There is a size limit of 5 GB at GitHub and it's difficult
to work with repositories with million files.
Therefore the approach is to use hashing as a way to name git
repositories and directories. We compute hash on the purl of the scanned
package and use the first few layers of this hash for the repo and
directory names.
Initial processing steps are:
- We collect a list of scan and purl.
- For each we compute a hash and determine the repo and directory.
- If needed we create the repo or pull it.
- Then we store the scan using the purl hash and purl as path.
- Finally commit and push! : )
Because it's not practical to process many repos at once, we organize the
processing one repo a time. For this, we iterate over a bunch of records get or compute
the purl hash and process the records that share the same hash.
We are using a short hash that is three characters long using hexadecimal encoding.
Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about
25k scan files, if we were to store 100 million scans (which is a high mark).
For reference one scan should use less than a 100k on average when compressed
with gzip or git based on looking at 15 million scans. Each repo should be roughly
couple hundred mega bytes big, based on 15 million scans.
"""

# Create hex values of integers and ignore the 0x prefix
repo_names = [hex(hash)[2:].zfill(3) for hash in range(4096)]

def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0):
"""
Iterate over CDItem objects with scancode scans.
Save and commit them in git repositories in work dir.
Process a maximum of count items and process all items if
count is 0
"""
cd_items = CDitem.objects.filter(~Q(content=b''), path__contains="tool/scancode")
if count:
cd_items = cd_items[:count]
for purl_hash, cd_items in get_cd_item_by_purl_hash(cd_items=cd_items).items():
commit_count = 0
for cd_item in cd_items:
data = cd_item.data
cd_url = data.get("_metadata", {}).get("url")
coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
if not is_valid_coordinate(coordinate):
print(f"Invalid coordinate {coordinate}")
continue
scancode_scan = data.get("content")
if not scancode_scan:
continue
repo = get_or_init_repo(repo_name=purl_hash, work_dir=work_dir, repo_namespace=github_org, user_name=github_org, pull=False)
purl = coordinate.to_purl()
if add_scancode_scan(scancode_scan=scancode_scan, purl=purl, repo=repo):
commit_count += 1
if commit_count % 10 == 0:
print(".", end="")
origin = repo.remote(name='origin')
origin.push()


def get_cd_item_by_purl_hash(cd_items):
"""
Return a mapping of {purl_hash: [CDItem,....]}
"""
cd_item_by_purl_hash = defaultdict(list)
for cd_item in cd_items:
data = cd_item.data
cd_url = data.get("_metadata", {}).get("url")
coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
if not is_valid_coordinate(coordinate):
print(f"Invalid coordinate {cd_url}")
continue
purl = coordinate.to_purl()
purl_hash = get_purl_hash(purl=purl)
cd_item_by_purl_hash[purl_hash].append(cd_item)
return cd_item_by_purl_hash


def add_scancode_scan(repo, purl, scancode_scan):
"""
Save and commit scancode scan for purl to git repo.
Return true if we commited else false
"""
purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo)
scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json"
with open(scancode_scan_path, "w") as f:
json.dump(scancode_scan,f,indent=2)

if repo.is_dirty():
repo.index.add([scancode_scan_path])
repo.index.commit(message=f"Add scancode-toolkit scan for {purl}")
return True


def is_valid_coordinate(coordinate):
return coordinate.type and coordinate.name and coordinate.version


def get_or_create_dir_for_purl(purl, repo):
"""
Return a path to a directory for this purl,
in this git repo.
"""
purl_dir = repo.working_dir / get_purl_path(purl)
purl_dir.mkdir(parents=True, exist_ok=True)
return purl_dir

def get_purl_path(purl):
purl_path = Path(purl.type)
if purl.namespace:
purl_path = purl_path / purl.namespace
return purl_path / purl.name / purl.version


def get_purl_hash(purl: PackageURL, length: int=3) -> str:
"""
Return a short lower cased hash of a purl.
"""
# This function takes a PackageURL object and an optional length parameter.
# It returns a short hash of the purl. The length of the hash is determined by the length parameter.
# The default length is 3. The function first converts the purl to bytes and then computes the sha512 hash of the purl.
# It then takes the first 'length' characters of the hash and returns it in lower case.

purl_bytes = str(purl).encode("utf-8")
short_hash = sha512(purl_bytes).hexdigest()[:length]
return short_hash.lower()


def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", user_name: str = "", pull=False):
"""
Return a repo object for repo name and namespace
and store it in the work dir. Clone if it does not
exist optionally take the latest pull if it does exist.
"""
# TODO: Manage org repo name
# MAYBE: CREATE ALL THE REPOS AT A TIME AND CLONE THEM LOCALLY
if repo_name not in get_github_repos(user_name=user_name):
repo_url = create_github_repo(repo_name=repo_name)
repo_path = work_dir / repo_name
if repo_path.exists():
repo = Repo(repo_path)
if pull:
repo.origin.pull()
else:
repo = Repo.clone_from(repo_url, repo_path)
return repo


def get_scan_download_url(namespace:str, purl:str, scan_file_name: str = "scancode-toolkit-scan.json"):
purl_hash = get_purl_hash(purl=purl)
purl_path = get_purl_path(purl)
return f"https://raw.githubusercontent.com/{namespace}/{purl_hash}/main/{purl_path}/{scan_file_name}"


def create_github_repo(repo_name, token=os.getenv("GH_TOKEN")):
headers = {
'Authorization': f'token {token}',
'Accept': 'application/vnd.github.v3+json'
}

data = {
'name': repo_name,
}

url = 'https://api.github.com/user/repos'

response = requests.post(url, headers=headers, json=data)

if response.status_code == 201:
print(f"Repository '{repo_name}' created successfully!")
else:
print(f"Failed to create repository. Status code: {response.status_code}")
print(response.text)


def get_github_repos(user_name, token=os.getenv("GH_TOKEN")):
"""
Yield full repo names for a user or org name, use the optional ``token`` if provided.
Full repo name is in the form user or org name / repo name
"""
headers = {
'Accept': 'application/vnd.github.v3+json'
}
if token:
headers['Authorization'] = f'token {token}'

url = f'https://api.github.com/users/{user_name}/repos'
response = requests.get(url, headers=headers)

# TODO: We need have a way to handle failures from GH API
if not response.status_code == 200:
raise Exception(f"HTTP {response.status_code}: Failed to get repos for {user_name}")

data = response.json()
for repo_data in data:
full_repo_name = repo_data.get("full_name")
if full_repo_name:
yield full_repo_name
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ install_requires =
purl2vcs == 1.0.2
univers == 30.11.0
scancodeio @ git+https://github.com/nexB/scancode.io.git@07b48c0224f5c2ad1b2972b693702ef685f16c98
gitpython == 3.1.43
setup_requires = setuptools_scm[toml] >= 4

python_requires = >=3.8
Expand Down

0 comments on commit 14115c0

Please sign in to comment.