Skip to content

Commit 14115c0

Browse files
authored
Merge pull request #235 from nexB/store_scancode_scans_locally
Store scancode scans locally
2 parents d4094f5 + a746356 commit 14115c0

File tree

4 files changed

+269
-5
lines changed

4 files changed

+269
-5
lines changed

clearcode/cdutils.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@
5959
'nuget': 'nuget',
6060
'pypi': 'pypi',
6161
'gem': 'gem',
62+
'npm': 'npm',
63+
'go': 'golang',
6264
}
6365

6466

@@ -104,7 +106,7 @@ class Coordinate(object):
104106
ClearlyDefined coordinates are used to identify any tracked component.
105107
"""
106108

107-
base_api_url = 'https://api.clearlydefined.io'
109+
base_api_url = 'https://dev-api.clearlydefined.io'
108110

109111
type = attr.ib()
110112
provider = attr.ib()
@@ -236,15 +238,15 @@ def to_purl(self):
236238
237239
>>> expected = 'pkg:maven/io.dropwizard/[email protected]'
238240
>>> test = Coordinate('maven', 'mavencentral', 'io.dropwizard', 'dropwizard', '2.0.0-rc13').to_purl()
239-
>>> assert expected == test
241+
>>> assert expected == str(test)
240242
241243
>>> expected = 'pkg:maven/io.dropwizard/[email protected]?classifier=sources'
242244
>>> test = Coordinate('sourcearchive', 'mavencentral', 'io.dropwizard', 'dropwizard', '2.0.0-rc13').to_purl()
243-
>>> assert expected == test
245+
>>> assert expected == str(test)
244246
245247
>>> expected = 'pkg:deb/debian/[email protected]?arch=source'
246248
>>> test = Coordinate('debsrc', 'debian', '', 'gedit-plugins', '3.34.0-3').to_purl()
247-
>>> assert expected == test
249+
>>> assert expected == str(test)
248250
"""
249251
converted_package_type = PACKAGE_TYPES_BY_CD_TYPE[self.type]
250252

@@ -265,7 +267,7 @@ def to_purl(self):
265267
name=self.name,
266268
version=self.revision,
267269
qualifiers=qualifiers,
268-
).to_string()
270+
)
269271

270272
@classmethod
271273
def from_purl(cls, purl):
@@ -519,6 +521,7 @@ def str2coord(s):
519521
URN: "urn:gem:rubygems:-:mocha:revision:1.7.0:tool:scancode:3.1.0"
520522
plain: /gem/rubygems/foo/mocha/1.7.0"
521523
"""
524+
#TODO: Add doctest
522525
is_urn = s.startswith('urn')
523526
is_url = s.startswith('cd:')
524527
splitter = ':' if is_urn else '/'
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
from clearcode.store_scans import store_scancode_scans_from_cd_items
11+
from minecode.management.commands import VerboseCommand
12+
13+
class Command(VerboseCommand):
    """
    Management command that stores ScanCode scans in git repositories
    located under a work directory.
    """

    help = 'Store scancode scans in git repositories'

    def add_arguments(self, parser):
        """
        Declare the positional ``work_dir`` argument and the optional
        ``--github_org`` and ``--count`` arguments.
        """
        parser.add_argument('work_dir', type=str)
        parser.add_argument('--github_org', default="", type=str)
        parser.add_argument('--count', default=0, type=int)

    def handle(self, *args, **options):
        """
        Forward the parsed command line options to
        ``store_scancode_scans_from_cd_items``.
        """
        store_scancode_scans_from_cd_items(
            work_dir=options['work_dir'],
            github_org=options['github_org'],
            count=options['count'],
        )

clearcode/store_scans.py

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) nexB Inc. and others. All rights reserved.
4+
#
5+
# ClearCode is a free software tool from nexB Inc. and others.
6+
# Visit https://github.com/nexB/clearcode-toolkit/ for support and download.
7+
#
8+
# Licensed under the Apache License, Version 2.0 (the "License");
9+
# you may not use this file except in compliance with the License.
10+
# You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing, software
15+
# distributed under the License is distributed on an "AS IS" BASIS,
16+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
# See the License for the specific language governing permissions and
18+
# limitations under the License.
19+
20+
from collections import defaultdict
21+
from clearcode.models import CDitem
22+
from clearcode.cdutils import Coordinate
23+
from clearcode.cdutils import str2coord
24+
from django.db.models import Q
25+
from hashlib import sha512
26+
import json
27+
import requests
28+
from packageurl import PackageURL
29+
from pathlib import Path
30+
from git import Repo
31+
import os
32+
"""
33+
The input is a bunch of scans from ClearlyDefined and
34+
the output is a bunch of git repositories with committed and
35+
pushed scans such that we balance the scans roughly evenly across
36+
different repositories.
37+
38+
The primary reason for multiple repositories is size of a single
39+
repo. There is a size limit of 5 GB at GitHub and it's difficult
40+
to work with repositories with millions of files.
41+
42+
Therefore the approach is to use hashing as a way to name git
43+
repositories and directories. We compute hash on the purl of the scanned
44+
package and use the first few layers of this hash for the repo and
45+
directory names.
46+
47+
Initial processing steps are:
48+
- We collect a list of scan and purl.
49+
- For each we compute a hash and determine the repo and directory.
50+
- If needed we create the repo or pull it.
51+
- Then we store the scan using the purl hash and purl as path.
52+
- Finally commit and push! : )
53+
54+
Because it's not practical to process many repos at once, we organize the
55+
processing one repo at a time. For this, we iterate over a bunch of records, get or compute
56+
the purl hash and process the records that share the same hash.
57+
58+
We are using a short hash that is three characters long using hexadecimal encoding.
59+
Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about
60+
25k scan files, if we were to store 100 million scans (which is a high mark).
61+
For reference one scan should use less than a 100k on average when compressed
62+
with gzip or git based on looking at 15 million scans. Each repo should be roughly
63+
a couple hundred megabytes big, based on 15 million scans.
64+
"""
65+
66+
# All possible repository names: the integers 0..4095 rendered as lowercase
# hexadecimal without the 0x prefix and zero-padded to three characters.
repo_names = [format(number, "03x") for number in range(4096)]
68+
69+
def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0):
    """
    Iterate over CDitem objects with scancode scans.
    Save and commit them in git repositories in ``work_dir``, using one
    repository per purl hash, and push the commits to the ``origin`` remote.
    Process a maximum of ``count`` items and process all items if
    ``count`` is 0.

    ``github_org`` is passed to ``get_or_init_repo`` both as the repository
    namespace and as the user name.
    """
    # Number of new commits to accumulate before pushing.
    push_every = 10

    queryset = CDitem.objects.filter(~Q(content=b''), path__contains="tool/scancode")
    if count:
        queryset = queryset[:count]

    items_by_purl_hash = get_cd_item_by_purl_hash(cd_items=queryset)
    for purl_hash, hash_cd_items in items_by_purl_hash.items():
        # The repo is the same for every item sharing this hash: fetch it
        # lazily and only once, since getting it calls the GitHub API.
        repo = None
        unpushed_commits = 0

        for cd_item in hash_cd_items:
            data = cd_item.data
            cd_url = data.get("_metadata", {}).get("url")
            if not cd_url:
                # Without a URL there is no coordinate to parse.
                continue

            coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
            if not is_valid_coordinate(coordinate):
                print(f"Invalid coordinate {coordinate}")
                continue

            scancode_scan = data.get("content")
            if not scancode_scan:
                continue

            if repo is None:
                repo = get_or_init_repo(
                    repo_name=purl_hash,
                    work_dir=work_dir,
                    repo_namespace=github_org,
                    user_name=github_org,
                    pull=False,
                )

            purl = coordinate.to_purl()
            if not add_scancode_scan(scancode_scan=scancode_scan, purl=purl, repo=repo):
                continue

            unpushed_commits += 1
            if unpushed_commits >= push_every:
                # Progress marker: one dot per pushed batch.
                print(".", end="")
                repo.remote(name='origin').push()
                unpushed_commits = 0

        # Push the trailing commits that did not fill a complete batch.
        if repo is not None and unpushed_commits:
            repo.remote(name='origin').push()
99+
100+
101+
def get_cd_item_by_purl_hash(cd_items):
    """
    Return a mapping of {purl_hash: [CDItem,....]}

    Items whose data has no ``_metadata.url`` or whose coordinate is not
    valid are reported on stdout and left out of the mapping.
    """
    cd_item_by_purl_hash = defaultdict(list)
    for cd_item in cd_items:
        data = cd_item.data
        cd_url = data.get("_metadata", {}).get("url")
        if not cd_url:
            # str2coord() calls string methods on its argument and would
            # fail on a missing URL.
            print(f"Invalid coordinate {cd_url}")
            continue

        coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
        if not is_valid_coordinate(coordinate):
            print(f"Invalid coordinate {cd_url}")
            continue

        purl = coordinate.to_purl()
        purl_hash = get_purl_hash(purl=purl)
        cd_item_by_purl_hash[purl_hash].append(cd_item)
    return cd_item_by_purl_hash
117+
118+
119+
def add_scancode_scan(repo, purl, scancode_scan):
    """
    Save and commit scancode scan for purl to git repo.
    Return True if we committed else False.

    The scan is written as indented JSON to ``scancode-toolkit-scan.json``
    in the directory of this purl. Nothing is committed when the written
    file is identical to the one already committed.
    """
    purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo)
    scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json"
    with open(scancode_scan_path, "w", encoding="utf-8") as scan_file:
        json.dump(scancode_scan, scan_file, indent=2)

    # A scan stored for the first time is an untracked file, and is_dirty()
    # ignores untracked files unless asked otherwise. Restrict the check to
    # the scan file so unrelated changes do not trigger an empty commit.
    scan_location = str(scancode_scan_path)
    if not repo.is_dirty(untracked_files=True, path=scan_location):
        return False

    repo.index.add([scan_location])
    repo.index.commit(message=f"Add scancode-toolkit scan for {purl}")
    return True
133+
134+
135+
def is_valid_coordinate(coordinate):
    """
    Return True if ``coordinate`` has a type, a name and a version.

    The version is read from the ``revision`` attribute, falling back to a
    ``version`` attribute when there is no revision.
    """
    # NOTE(review): Coordinate.to_purl() builds the purl version from
    # ``self.revision``; ``version`` is kept as a fallback in case some
    # coordinate-like objects expose that name instead -- confirm.
    version = getattr(coordinate, "revision", None) or getattr(coordinate, "version", None)
    return bool(coordinate.type and coordinate.name and version)
137+
138+
139+
def get_or_create_dir_for_purl(purl, repo):
    """
    Ensure that the directory for this purl exists under the working
    directory of this git repo, creating missing parents as needed,
    and return its path.
    """
    working_dir = repo.working_dir
    target_dir = working_dir / get_purl_path(purl)
    target_dir.mkdir(exist_ok=True, parents=True)
    return target_dir
147+
148+
def get_purl_path(purl):
    """
    Return the relative path ``type/[namespace/]name/version`` for this
    purl. The namespace segment is left out when the purl has none.
    """
    segments = [purl.type]
    if purl.namespace:
        segments.append(purl.namespace)
    segments.extend([purl.name, purl.version])
    return Path(*segments)
153+
154+
155+
def get_purl_hash(purl: PackageURL, length: int=3) -> str:
    """
    Return a short lower cased hash of a purl.

    The purl is converted to its string form, encoded as UTF-8 and hashed
    with SHA-512. The first ``length`` characters of the hexadecimal digest
    are returned; ``length`` defaults to 3.
    """
    hex_digest = sha512(str(purl).encode("utf-8")).hexdigest()
    return hex_digest[:length].lower()
167+
168+
169+
def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", user_name: str = "", pull=False):
    """
    Return a repo object for repo name and namespace
    and store it in the work dir. Clone if it does not
    exist optionally take the latest pull if it does exist.

    ``work_dir`` may be a ``Path`` or a string path. The GitHub owner of the
    repository is ``repo_namespace``, falling back to ``user_name``.
    A repository missing at GitHub is created before cloning.

    Raise a ValueError if neither ``repo_namespace`` nor ``user_name`` is
    provided.
    """
    # TODO: Manage org repo name
    # MAYBE: CREATE ALL THE REPOS AT A TIME AND CLONE THEM LOCALLY
    owner = repo_namespace or user_name
    if not owner:
        raise ValueError("A repo_namespace or user_name is required to locate the repository")

    # get_github_repos() yields full names in the "owner/name" form, so the
    # comparison must use the full name too. Compare case-insensitively.
    full_repo_name = f"{owner}/{repo_name}".lower()
    existing_repo_names = {
        name.lower() for name in get_github_repos(user_name=user_name or owner)
    }
    if full_repo_name not in existing_repo_names:
        create_github_repo(repo_name=repo_name)

    # Accept a plain string work dir, as provided on the command line.
    repo_path = Path(work_dir) / repo_name
    if repo_path.exists():
        repo = Repo(repo_path)
        if pull:
            repo.remote(name='origin').pull()
        return repo

    # NOTE(review): assumes cloning over HTTPS from github.com with
    # credentials configured in the environment -- confirm.
    repo_url = f"https://github.com/{owner}/{repo_name}.git"
    return Repo.clone_from(repo_url, repo_path)
187+
188+
189+
def get_scan_download_url(namespace:str, purl:str, scan_file_name: str = "scancode-toolkit-scan.json"):
    """
    Return the raw.githubusercontent.com URL of the scan file named
    ``scan_file_name`` stored for ``purl`` on the ``main`` branch of the
    repository named after the purl hash, in the GitHub ``namespace``.

    ``purl`` is either a purl string or a PackageURL object.
    """
    # The hash and the path are computed from a PackageURL object: parse a
    # purl string first so both match what was used when storing the scan.
    if isinstance(purl, str):
        purl = PackageURL.from_string(purl)

    purl_hash = get_purl_hash(purl=purl)
    # URLs always use forward slashes, whatever the local OS separator is.
    purl_path = get_purl_path(purl).as_posix()
    return f"https://raw.githubusercontent.com/{namespace}/{purl_hash}/main/{purl_path}/{scan_file_name}"
193+
194+
195+
def create_github_repo(repo_name, token=None, timeout=30):
    """
    Create a GitHub repository named ``repo_name`` by posting to the
    ``/user/repos`` endpoint of the GitHub API.
    Return the ``clone_url`` reported by the API when the repository was
    created (HTTP 201). Otherwise print the failure and return None.

    Use the optional ``token`` if provided, or else the ``GH_TOKEN``
    environment variable read at call time. ``timeout`` is the number of
    seconds to wait for the API before giving up.
    """
    # Read the environment at call time rather than at import time.
    token = token or os.getenv("GH_TOKEN")

    headers = {
        'Accept': 'application/vnd.github.v3+json'
    }
    if token:
        headers['Authorization'] = f'token {token}'

    data = {
        'name': repo_name,
    }

    url = 'https://api.github.com/user/repos'

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code == 201:
        print(f"Repository '{repo_name}' created successfully!")
        return response.json().get("clone_url")

    print(f"Failed to create repository. Status code: {response.status_code}")
    print(response.text)
    return None
214+
215+
216+
def get_github_repos(user_name, token=None, timeout=30):
    """
    Yield full repo names for a user or org name, use the optional ``token`` if provided.
    Full repo name is in the form user or org name / repo name

    When ``token`` is not provided, the ``GH_TOKEN`` environment variable is
    read at call time. All result pages are fetched, 100 repositories per
    page. ``timeout`` is the number of seconds to wait for each API request.

    Raise an Exception if the GitHub API does not answer with HTTP 200.
    """
    per_page = 100

    headers = {
        'Accept': 'application/vnd.github.v3+json'
    }
    # Read the environment at call time rather than at import time.
    token = token or os.getenv("GH_TOKEN")
    if token:
        headers['Authorization'] = f'token {token}'

    url = f'https://api.github.com/users/{user_name}/repos'

    page = 1
    while True:
        params = {'per_page': per_page, 'page': page}
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        # TODO: We need have a way to handle failures from GH API
        if response.status_code != 200:
            raise Exception(f"HTTP {response.status_code}: Failed to get repos for {user_name}")

        repos_data = response.json()
        for repo_data in repos_data:
            full_repo_name = repo_data.get("full_name")
            if full_repo_name:
                yield full_repo_name

        # A page that is not full is the last one.
        if len(repos_data) < per_page:
            break
        page += 1

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ install_requires =
6363
purl2vcs == 1.0.2
6464
univers == 30.11.0
6565
scancodeio @ git+https://github.com/nexB/scancode.io.git@07b48c0224f5c2ad1b2972b693702ef685f16c98
66+
gitpython == 3.1.43
6667
setup_requires = setuptools_scm[toml] >= 4
6768

6869
python_requires = >=3.8

0 commit comments

Comments
 (0)