Address style and format issues #512 #515
Signed-off-by: Jono Yang <[email protected]>
JonoYang committed Aug 13, 2024
1 parent 57f9cc0 commit f1d05f8
Showing 74 changed files with 219 additions and 309 deletions.
4 changes: 1 addition & 3 deletions clearcode/load.py
@@ -71,9 +71,7 @@ def walk_and_load_from_filesystem(input_dir, cd_root_dir):

# Save to DB
try:
cditem = models.CDitem.objects.create(
path=cditem_rel_path, content=content
)
models.CDitem.objects.create(path=cditem_rel_path, content=content)
except IntegrityError:
# skip if we already have it in the DB
continue
28 changes: 14 additions & 14 deletions clearcode/store_scans.py
@@ -33,18 +33,18 @@
from clearcode.models import CDitem

"""
The input is a bunch of scans from ClearlyDefined and
the output is a bunch of git repositories with committed and
pushed scans such that we balance the scans roughly evenly across
different repositories.
The primary reason for multiple repositories is the size of a single
repo. There is a 5 GB size limit at GitHub, and it is difficult
to work with repositories with millions of files.
Therefore the approach is to use hashing as a way to name git
repositories and directories. We compute a hash on the purl of the scanned
package and use the first few characters of this hash for the repo and
directory names.
Initial processing steps are:
@@ -54,15 +54,15 @@
- Then we store the scan using the purl hash and purl as path.
- Finally commit and push! : )
Because it's not practical to process many repos at once, we organize the
processing one repo at a time. For this, we iterate over a bunch of records,
get or compute the purl hash, and process the records that share the same hash.
We are using a short hash that is three characters long using hexadecimal encoding.
Therefore we can have 16*16*16 = 4096 repositories, where each repo would contain about
25k scan files, if we were to store 100 million scans (which is a high mark).
For reference, one scan should use less than 100 KB on average when compressed
with gzip or git, based on looking at 15 million scans. Each repo should be roughly
a couple hundred megabytes, based on 15 million scans.
"""

2 changes: 1 addition & 1 deletion clearcode/sync.py
@@ -350,7 +350,7 @@ def is_unchanged_remotely(self, url, session=session):
remote_etag = response.headers.get("etag")
if remote_etag and self.etags_cache.get(url) == remote_etag:
return True
except:
except Exception:
return False

def is_fetched(self, checksum, url):
21 changes: 13 additions & 8 deletions clearcode/tests/test_models.py
@@ -39,7 +39,7 @@ def test_known_package_types(self):
# This path starts with npm, which is known
cditem_1 = CDitem.objects.create(path="npm/name/version")
# asdf is not a proper type
cditem_2 = CDitem.objects.create(path="asdf/name/version")
CDitem.objects.create(path="asdf/name/version")
cditems = list(CDitem.objects.known_package_types())
self.assertEqual(1, len(cditems))
cditem = cditems[0]
@@ -50,7 +50,7 @@ def test_definitions(self):
path="composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json"
)
# harvest should not be in cditems
harvest = CDitem.objects.create(
CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json"
)
cditems = list(CDitem.objects.definitions())
@@ -63,7 +63,7 @@ def test_scancode_harvests(self):
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json"
)
# unexpected_harvest should not be in cditems
unexpected_harvest = CDitem.objects.create(
CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json"
)
harvests = list(CDitem.objects.scancode_harvests())
@@ -75,7 +75,8 @@ def test_mappable(self):
definition_1 = CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json"
)
definition_2 = CDitem.objects.create(
# This should not be mappable
CDitem.objects.create(
path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json",
last_map_date=timezone.now(),
map_error="error",
@@ -92,12 +93,14 @@ def test_mappable_definitions(self):
definition_1 = CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json"
)
definition_2 = CDitem.objects.create(
# This should not be mappable
CDitem.objects.create(
path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json",
last_map_date=timezone.now(),
map_error="error",
)
harvest = CDitem.objects.create(
# This should not be mappable
CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json"
)
mappables = list(CDitem.objects.mappable_definitions())
@@ -109,12 +112,14 @@ def test_mappable_scancode_harvests(self):
harvest_1 = CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json"
)
harvest_2 = CDitem.objects.create(
# This should not be mappable
CDitem.objects.create(
path="sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json",
last_map_date=timezone.now(),
map_error="error",
)
definition_1 = CDitem.objects.create(
# This should not be mappable
CDitem.objects.create(
path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json"
)
mappables = list(CDitem.objects.mappable_scancode_harvests())
8 changes: 2 additions & 6 deletions clearindex/harvest.py
@@ -94,9 +94,7 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=""):

download_url = package_data.get("download_url")
if not download_url:
logger.error(
f"Null `download_url` value for `package_data`: {package_data}"
)
logger.error(f"Null `download_url` value for `package_data`: {package_data}")
return

# This ugly block is needed until https://github.com/nexB/packagedb/issues/14
@@ -115,9 +113,7 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=""):
merge_packages(
existing_package=package, new_package_data=package_data, replace=False
)
package.append_to_history(
f"Updated package from CDitem harvest: {cditem_path}"
)
package.append_to_history(f"Updated package from CDitem harvest: {cditem_path}")

logger.info(f"Merged package data from scancode harvest: {package}")

4 changes: 0 additions & 4 deletions etc/scripts/clearcode-api-backup.py
@@ -151,10 +151,6 @@ def run_api_backup(api_root_url, extra_payload=None):
objects = get_all_objects_from_endpoint(endpoint_url, extra_payload=extra_payload)
print('{} {} collected.'.format(len(objects), endpoint_name))

collect_extra_conditions = [
extra_payload.get('last_modified_date'),
]

results[endpoint_name] += objects

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
6 changes: 3 additions & 3 deletions etc/scripts/utils_thirdparty.py
@@ -845,7 +845,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False):
if TRACE:
print(f"Fetched license from remote: {lic_url}")

except:
except Exception:
try:
# try licensedb second
lic_url = f"{LICENSEDB_API_URL}/{filename}"
@@ -858,7 +858,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False):
if TRACE:
print(f"Fetched license from licensedb: {lic_url}")

except:
except Exception:
msg = f'No text for license {filename} in expression "{self.license_expression}" from {self}'
print(msg)
errors.append(msg)
@@ -1290,7 +1290,7 @@ def is_pure(self):
def is_pure_wheel(filename):
try:
return Wheel.from_filename(filename).is_pure()
except:
except Exception:
return False


6 changes: 3 additions & 3 deletions matchcode/tests/test_match.py
@@ -34,7 +34,7 @@ class MatchPackagesTestCase(MatchcodeTestCase):
def setUp(self):
# Execute the superclass' setUp method before creating our own
# DB objects
super(MatchPackagesTestCase, self).setUp()
super().setUp()

self.test_package1, _ = Package.objects.get_or_create(
filename="abbot-0.12.3.jar",
@@ -158,7 +158,7 @@ class MatchNestedPackagesTestCase(MatchcodeTestCase):
def setUp(self):
# Execute the superclass' setUp method before creating our own
# DB objects
super(MatchNestedPackagesTestCase, self).setUp()
super().setUp()

self.test_package1, _ = Package.objects.get_or_create(
filename="plugin-request-2.4.1.tgz",
@@ -219,7 +219,7 @@ class DirectoryMatchingTestCase(MatchcodeTestCase):
maxDiff = None

def setUp(self):
super(DirectoryMatchingTestCase, self).setUp()
super().setUp()

self.test_package1, _ = Package.objects.get_or_create(
filename="abbrev-1.0.3.tgz",
2 changes: 1 addition & 1 deletion matchcode/tests/test_models.py
@@ -41,7 +41,7 @@ class BaseModelTest(MatchcodeTestCase):
maxDiff = None

def setUp(self):
super(BaseModelTest, self).setUp()
super().setUp()

self.test_package1, _ = Package.objects.get_or_create(
filename="abbot-0.12.3.jar",
10 changes: 2 additions & 8 deletions matchcode_pipeline/tests/test_api.py
@@ -141,11 +141,8 @@ def test_matching_pipeline_api_matching_create(self, mock_execute_pipeline_task)
self.assertEqual("matching", response.data["runs"][0]["pipeline_name"])
mock_execute_pipeline_task.assert_called_once()

created_matching_project_detail_url = response.data["url"]
matching_project_uuid = response.data["uuid"]
results_url = reverse("matching-results", args=[matching_project_uuid])

# Check that the file was uploaded
created_matching_project_detail_url = response.data["url"]
response = self.csrf_client.get(created_matching_project_detail_url)
self.assertEqual("test-out.json", response.data["input_sources"][0]["filename"])

@@ -165,11 +162,8 @@ def test_matching_pipeline_api_matching_create_multiple_input_urls(
self.assertEqual("matching", response.data["runs"][0]["pipeline_name"])
mock_execute_pipeline_task.assert_called_once()

created_matching_project_detail_url = response.data["url"]
matching_project_uuid = response.data["uuid"]
results_url = reverse("matching-results", args=[matching_project_uuid])

# Check that the file was uploaded
created_matching_project_detail_url = response.data["url"]
response = self.csrf_client.get(created_matching_project_detail_url)
input_sources = response.data["input_sources"]
self.assertEqual(2, len(input_sources))
2 changes: 1 addition & 1 deletion minecode/api.py
@@ -201,7 +201,7 @@ def index_package_scan(request, key):

user_id = signing.loads(key)
User = get_user_model()
user = get_object_or_404(User, id=user_id)
get_object_or_404(User, id=user_id)

results = json_data.get("results")
summary = json_data.get("summary")
4 changes: 1 addition & 3 deletions minecode/collectors/debian.py
@@ -521,9 +521,7 @@ def get_vcs_repo(description):
repos.append((vcs_tool, vcs_repo))

if len(repos) > 1:
raise TypeError(
"Debian description with more than one Vcs repos: %(repos)r" % locals()
)
raise TypeError(f"Debian description with more than one Vcs repos: {repos}")

if repos:
vcs_tool, vcs_repo = repos[0]
6 changes: 3 additions & 3 deletions minecode/collectors/maven.py
@@ -498,17 +498,17 @@ def process_request(purl_str, **kwargs):

def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
"""Return True if `file_name` is in `links`"""
return any(l.endswith(file_name) for l in links)
return any(link.endswith(file_name) for link in links)


def check_if_page_has_pom_files(links, **kwargs):
"""Return True of any entry in `links` ends with .pom."""
return any(l.endswith(".pom") for l in links)
return any(link.endswith(".pom") for link in links)


def check_if_page_has_directories(links, **kwargs):
"""Return True if any entry, excluding "../", ends with /."""
return any(l.endswith("/") for l in links if l != "../")
return any(link.endswith("/") for link in links if link != "../")


def check_if_package_version_page(links, **kwargs):
32 changes: 16 additions & 16 deletions minecode/filter.py
@@ -39,28 +39,28 @@ def sf_net(input_file, output):
writer = csv.writer(fo, quoting=csv.QUOTE_ALL)
with open(input_file) as fi:
reader = csv.reader(fi)
for i, l in enumerate(reader):
for i, row in enumerate(reader):
if i == 0:
# add headers on first row
l.extend(new_headers)
if not l:
row.extend(new_headers)
if not row:
continue
project_id = l[0]
name = l[1]
version_column = l[2]
project_id = row[0]
name = row[1]
version_column = row[2]
sep = ": released on "
if sep not in version_column:
# write as is if we do not have a file release date
# separator
writer.writerow(l)
writer.writerow(row)
continue
filename, release_date_ts = version_column.split(sep, 1)
found_version = version.version_hint(filename)
l.append(found_version or "")
l.append(release_date_ts or "")
l.append(download_url_template % locals())
l.append("") # reviewed
l.append("") # curated name
row.append(found_version or "")
row.append(release_date_ts or "")
row.append(download_url_template % locals())
row.append("") # reviewed
row.append("") # curated name
excluded_reason = ""
if "." in project_id:
excluded_reason = "mirror or special project"
@@ -70,10 +70,10 @@
excluded_reason = "special chars in name"
elif not good_filename(project_id, filename, name):
excluded_reason = "multi component possible"
l.append(excluded_reason)
l.append("") # curated_owner
l.append("") # owner_type
writer.writerow(l)
row.append(excluded_reason)
row.append("") # curated_owner
row.append("") # owner_type
writer.writerow(row)


def good_name(s):
4 changes: 3 additions & 1 deletion minecode/indexing.py
@@ -100,7 +100,9 @@ def index_package(
declared_license_expression = summary_data.get("declared_license_expression")
other_license_expressions = summary_data.get("other_license_expressions", [])
other_license_expressions = [
l["value"] for l in other_license_expressions if l["value"]
license_expression["value"]
for license_expression in other_license_expressions
if license_expression["value"]
]
other_license_expression = combine_expressions(other_license_expressions)

3 changes: 1 addition & 2 deletions minecode/management/commands/check_licenses.py
@@ -99,8 +99,7 @@ def find_ambiguous_packages(
)
license_filter = reduce(operator.or_, filter_expression)

for package in Package.objects.filter(type__in=types).filter(license_filter):
yield package
yield from Package.objects.filter(type__in=types).filter(license_filter)


def dump(packages, json_location):
4 changes: 3 additions & 1 deletion minecode/management/commands/import_queue.py
@@ -92,7 +92,9 @@ def handle(self, *args, **options):
try:
errors = process_request(importable_uri)
except Exception as e:
errors = f"Error: Failed to process ImportableURI: {repr(importable_uri)}\n"
errors = (
f"Error: Failed to process ImportableURI: {repr(importable_uri)}\n"
)
errors += get_error_message(e)
finally:
if errors:
