# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
#
# ClearCode is a free software tool from nexB Inc. and others.
# Visit https://github.com/nexB/clearcode-toolkit/ for support and download.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections import defaultdict
from hashlib import sha512
from pathlib import Path

import requests
from django.db.models import Q
from git import Repo
from packageurl import PackageURL

from clearcode.cdutils import Coordinate
from clearcode.cdutils import str2coord
from clearcode.models import CDitem

"""
The input is a bunch of scans from ClearlyDefined and the output is a bunch of
git repositories with committed and pushed scans, such that the scans are
balanced roughly evenly across the different repositories.

The primary reason for using multiple repositories is the size of a single
repo: GitHub has a 5 GB size limit and repositories with millions of files are
difficult to work with.

The approach is therefore to use hashing as a way to name git repositories and
directories. We compute a hash on the purl of the scanned package and use the
first few characters of this hash for the repo name, and the purl path for the
directory names.

The initial processing steps are (see the usage sketch after this docstring):
- We collect a list of scans and their purls.
- For each, we compute a hash and determine the repo and directory.
- If needed, we create the repo or pull it.
- Then we store the scan using the purl hash as repo name and the purl as path.
- Finally we commit and push! : )

Because it is not practical to process many repos at once, we organize the
processing one repo at a time. For this, we iterate over a bunch of records,
get or compute the purl hash, and process the records that share the same hash.

We are using a short hash that is three hexadecimal characters long. Therefore
we can have 16*16*16 = 4096 repositories, where each repo would contain about
25k scan files if we were to store 100 million scans (which is a high mark).
For reference, one scan should use less than 100 KB on average when compressed
with gzip or git, based on looking at 15 million scans. Each repo should then
be roughly a couple hundred megabytes in size, based on 15 million scans.
"""
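
# A rough usage sketch (not part of the module API): the work directory and the
# GitHub org/user name below are illustrative placeholders, not real values.
#
#   from pathlib import Path
#   store_scancode_scans_from_cd_items(
#       work_dir=Path("/var/tmp/clearcode-scans"),
#       github_org="some-github-org-or-user",
#       count=1000,
#   )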

# Hex names for the possible repos, zero-padded to three characters and without
# the 0x prefix: "000" through "fff".
repo_names = [hex(i)[2:].zfill(3) for i in range(4096)]
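
# Back-of-the-envelope check of the docstring numbers: 100M scans / 4096 repos
# is about 24,400 scans per repo; at under ~100 KB per compressed scan that is
# at most ~2.4 GB per repo, below GitHub's 5 GB limit. The 15M scans observed
# so far work out to roughly 3,700 scans and a few hundred MB per repo.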


def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0):
    """
    Iterate over CDitem objects with scancode scans, then save and commit them
    in git repositories under ``work_dir``. Process at most ``count`` items, or
    all items if ``count`` is 0.
    """
    cd_items = CDitem.objects.filter(~Q(content=b''), path__contains="tool/scancode")
    if count:
        cd_items = cd_items[:count]
    for purl_hash, items in get_cd_item_by_purl_hash(cd_items=cd_items).items():
        commit_count = 0
        repo = None
        for cd_item in items:
            data = cd_item.data
            cd_url = data.get("_metadata", {}).get("url")
            coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
            if not is_valid_coordinate(coordinate):
                print(f"Invalid coordinate {coordinate}")
                continue
            scancode_scan = data.get("content")
            if not scancode_scan:
                continue
            if repo is None:
                # Init the repo lazily, once per purl hash group.
                repo = get_or_init_repo(
                    repo_name=purl_hash,
                    work_dir=work_dir,
                    repo_namespace=github_org,
                    user_name=github_org,
                    pull=False,
                )
            purl = coordinate.to_purl()
            if add_scancode_scan(scancode_scan=scancode_scan, purl=purl, repo=repo):
                commit_count += 1
                if commit_count % 10 == 0:
                    print(".", end="")
        # Push once per repo, and only if we actually committed something.
        if repo and commit_count:
            origin = repo.remote(name="origin")
            origin.push()


def get_cd_item_by_purl_hash(cd_items):
    """
    Return a mapping of {purl_hash: [CDitem, ...]} for ``cd_items``.
    """
    cd_item_by_purl_hash = defaultdict(list)
    for cd_item in cd_items:
        data = cd_item.data
        cd_url = data.get("_metadata", {}).get("url")
        coordinate = Coordinate.from_dict(coords=str2coord(cd_url))
        if not is_valid_coordinate(coordinate):
            print(f"Invalid coordinate {cd_url}")
            continue
        purl = coordinate.to_purl()
        purl_hash = get_purl_hash(purl=purl)
        cd_item_by_purl_hash[purl_hash].append(cd_item)
    return cd_item_by_purl_hash


def add_scancode_scan(repo, purl, scancode_scan):
    """
    Save and commit the scancode scan for ``purl`` to the git ``repo``.
    Return True if a commit was made, False otherwise.
    """
    purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo)
    scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json"
    with open(scancode_scan_path, "w") as f:
        json.dump(scancode_scan, f, indent=2)

    # untracked_files=True is needed so that a newly written scan file (which
    # is untracked) is detected and committed.
    if repo.is_dirty(untracked_files=True):
        repo.index.add([str(scancode_scan_path)])
        repo.index.commit(message=f"Add scancode-toolkit scan for {purl}")
        return True
    return False


def is_valid_coordinate(coordinate):
    """
    Return True if ``coordinate`` has a type, a name and a version.
    """
    return bool(coordinate.type and coordinate.name and coordinate.version)


def get_or_create_dir_for_purl(purl, repo):
    """
    Return the path to the directory for this ``purl`` in this git ``repo``,
    creating it if needed.
    """
    # repo.working_dir is a plain string in GitPython, so wrap it in a Path.
    purl_dir = Path(repo.working_dir) / get_purl_path(purl)
    purl_dir.mkdir(parents=True, exist_ok=True)
    return purl_dir


def get_purl_path(purl):
    """
    Return the relative directory path for ``purl`` as type/namespace/name/version,
    omitting the namespace segment when there is none.
    For example, pkg:pypi/django@4.2 maps to pypi/django/4.2.
    """
    purl_path = Path(purl.type)
    if purl.namespace:
        purl_path = purl_path / purl.namespace
    return purl_path / purl.name / purl.version


def get_purl_hash(purl: PackageURL, length: int = 3) -> str:
    """
    Return a short lowercased hash of ``purl``: the first ``length`` characters
    of the hex-encoded sha512 of the purl string (3 characters by default).
    """
    purl_bytes = str(purl).encode("utf-8")
    short_hash = sha512(purl_bytes).hexdigest()[:length]
    return short_hash.lower()


def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str = "", user_name: str = "", pull=False):
    """
    Return a Repo object for ``repo_name`` under ``user_name``, stored in
    ``work_dir``. Create the GitHub repo if it does not exist, clone it if
    there is no local copy yet, and optionally pull the latest changes.
    """
    # TODO: Manage org repo name
    # MAYBE: CREATE ALL THE REPOS AT A TIME AND CLONE THEM LOCALLY
    full_repo_name = f"{user_name}/{repo_name}"
    repo_url = f"https://github.com/{full_repo_name}"
    if full_repo_name not in get_github_repos(user_name=user_name):
        # Prefer the clone URL returned by the API when we create the repo.
        repo_url = create_github_repo(repo_name=repo_name) or repo_url
    repo_path = work_dir / repo_name
    if repo_path.exists():
        repo = Repo(repo_path)
        if pull:
            repo.remotes.origin.pull()
    else:
        repo = Repo.clone_from(repo_url, repo_path)
    return repo


def get_scan_download_url(namespace: str, purl: PackageURL, scan_file_name: str = "scancode-toolkit-scan.json"):
    """
    Return the raw.githubusercontent.com URL from which the scan stored for
    ``purl`` under the GitHub ``namespace`` can be downloaded.
    """
    purl_hash = get_purl_hash(purl=purl)
    purl_path = get_purl_path(purl)
    return f"https://raw.githubusercontent.com/{namespace}/{purl_hash}/main/{purl_path}/{scan_file_name}"
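
# For example (schematic; "<purl-hash>" stands for whatever get_purl_hash()
# returns for the purl, not a real value), the scan for pkg:pypi/django@4.2
# stored under a namespace would be downloadable from:
#   https://raw.githubusercontent.com/<namespace>/<purl-hash>/main/pypi/django/4.2/scancode-toolkit-scan.json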


def create_github_repo(repo_name, token=os.getenv("GH_TOKEN")):
    """
    Create a GitHub repo named ``repo_name`` for the authenticated user and
    return its clone URL, or None if the creation failed.
    """
    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3+json',
    }

    data = {
        'name': repo_name,
    }

    url = 'https://api.github.com/user/repos'

    response = requests.post(url, headers=headers, json=data)

    if response.status_code == 201:
        print(f"Repository '{repo_name}' created successfully!")
        return response.json().get("clone_url")

    print(f"Failed to create repository. Status code: {response.status_code}")
    print(response.text)


def get_github_repos(user_name, token=os.getenv("GH_TOKEN")):
    """
    Yield full repo names for a user or org name, using the optional ``token``
    if provided. A full repo name has the form "user or org name/repo name".
    """
    headers = {
        'Accept': 'application/vnd.github.v3+json',
    }
    if token:
        headers['Authorization'] = f'token {token}'

    url = f'https://api.github.com/users/{user_name}/repos'
    response = requests.get(url, headers=headers)

    # TODO: We need a way to handle failures from the GitHub API, and the
    # results are paginated: only the first page of repos is fetched here.
    if response.status_code != 200:
        raise Exception(f"HTTP {response.status_code}: Failed to get repos for {user_name}")

    for repo_data in response.json():
        full_repo_name = repo_data.get("full_name")
        if full_repo_name:
            yield full_repo_name