137 changes: 137 additions & 0 deletions minecode_pipelines/miners/cargo.py
@@ -0,0 +1,137 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import json
from datetime import datetime
from pathlib import Path

from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes

from minecode_pipelines import VERSION
from minecode_pipelines.pipes import get_changed_files
from minecode_pipelines.pipes import get_checkpoint_from_file
from minecode_pipelines.pipes import get_commit_at_distance_ahead
from minecode_pipelines.pipes import update_checkpoints_in_github
from minecode_pipelines.pipes.cargo import store_cargo_packages


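# Number of mined index files to accumulate before committing and pushing to the
# data repo, and the number of index commits to advance per mining batch.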
PACKAGE_BATCH_SIZE = 500
COMMIT_BATCH_SIZE = 10

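# Repo-relative path of the Cargo mining checkpoint file in the pipelines config repo.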
CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"


def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logger):
"""
Process Cargo index files commit by commit.
Push changes to fed_repo after:
- every `commit_batch` commits, OR when reaching HEAD.
"""

base_path = Path(cargo_index_repo.working_tree_dir)

while True:
cargo_checkpoints = get_checkpoint_from_file(
cloned_repo=config_repo, path=CARGO_CHECKPOINT_PATH
)

checkpoints_last_commit = cargo_checkpoints.get("last_commit")

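        # Advance the mining window by COMMIT_BATCH_SIZE commits on the index's
        # master branch; a ValueError means there are not enough new commits yet.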
try:
next_commit = get_commit_at_distance_ahead(
cargo_index_repo,
checkpoints_last_commit,
num_commits_ahead=COMMIT_BATCH_SIZE,
branch_name="master",
)
except ValueError as e:
logger(str(e))
break

if next_commit == checkpoints_last_commit:
logger("No new commits to mine")
break

changed_files = get_changed_files(
cargo_index_repo, commit_x=checkpoints_last_commit, commit_y=next_commit
)
logger(f"Found {len(changed_files)} changed files in Cargo index.")

file_counter = 0
purl_files = []
purls = []
for idx, rel_path in enumerate(changed_files):
file_path = base_path / rel_path
logger(f"Found {file_path}.")

if not file_path.is_file() or file_path.name in {
"config.json",
"README.md",
"update-dl-url.yml",
}:
continue

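            # A crates.io index file is newline-delimited JSON: one object per
            # published version of a single crate.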
packages = []
with open(file_path, encoding="utf-8") as f:
for line in f:
if line.strip():
try:
packages.append(json.loads(line))
except json.JSONDecodeError as e:
logger(f"Skipping invalid JSON in {file_path}: {e}")

file_counter += 1

# Commit and push after each full batch or when processing the last file
commit_and_push = (file_counter % PACKAGE_BATCH_SIZE == 0) or (
idx == len(changed_files) - 1
)

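            # Write the crate's packageURL list into the cloned data repo; returns
            # None when the index file yielded no parseable packages.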
result_store = store_cargo_packages(packages, cloned_data_repo)
if result_store:
purl_file, base_purl = result_store
logger(f"writing packageURLs for package: {base_purl} at: {purl_file}")

purl_files.append(purl_file)
purls.append(str(base_purl))

if not commit_and_push:
continue

commit_changes(
repo=cloned_data_repo,
files_to_commit=purl_files,
purls=purls,
mine_type="packageURL",
tool_name="pkg:pypi/minecode-pipelines",
tool_version=VERSION,
)

push_changes(repo=cloned_data_repo)
purl_files = []
purls = []

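        # The whole commit window is processed; record the new index position so
        # the next batch resumes from next_commit.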
        if logger:
            logger(
                f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {next_commit}"
            )

if next_commit != checkpoints_last_commit:
settings_data = {
"date": str(datetime.now()),
"last_commit": next_commit,
}

update_checkpoints_in_github(
checkpoint=settings_data,
cloned_repo=config_repo,
path=CARGO_CHECKPOINT_PATH,
)

logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")
84 changes: 84 additions & 0 deletions minecode_pipelines/pipelines/mine_cargo.py
@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from minecode_pipelines import pipes
from minecode_pipelines.miners import cargo
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO

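# Destination repo for mined Cargo packageURLs (override via the
# MINECODE_DATA_CARGO_REPO environment variable) and the upstream crates.io
# index repo that is mined.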
MINECODE_DATA_CARGO_REPO = os.environ.get(
"MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
)
MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"


class MineCargo(Pipeline):
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""

@classmethod
def steps(cls):
return (
cls.check_federatedcode_eligibility,
cls.clone_cargo_repos,
cls.mine_and_publish_cargo_packageurls,
cls.delete_cloned_repos,
)

def check_federatedcode_eligibility(self):
"""
Check if the project fulfills the following criteria for
pushing the project result to FederatedCode.
"""
federatedcode.check_federatedcode_configured_and_available(logger=self.log)

def clone_cargo_repos(self):
"""
Clone the Cargo-related repositories (index, data, and pipelines config)
and store their Repo objects in the corresponding instance variables.
"""
self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)
self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

if self.log:
self.log(
f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}"
)
self.log(
f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
)
self.log(
f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {self.cloned_config_repo.working_dir}"
)

def mine_and_publish_cargo_packageurls(self):
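        """Mine packageURLs from the Cargo index and publish them to the data repo."""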
cargo.process_cargo_packages(
self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo, self.log
)

def delete_cloned_repos(self):
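        """Delete the local clones of the index, data, and config repositories."""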
pipes.delete_cloned_repos(
repos=[self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo],
logger=self.log,
)
67 changes: 66 additions & 1 deletion minecode_pipelines/pipes/__init__.py
@@ -15,6 +15,8 @@
import saneyaml

from aboutcode.hashid import PURLS_FILENAME
from git import Repo

from scanpipe.pipes.federatedcode import delete_local_clone
from scanpipe.pipes.federatedcode import commit_and_push_changes

@@ -34,7 +36,7 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
)
response = requests.get(checkpoints_file)
if not response.ok:
return
return {}

checkpoint_data = json.loads(response.text)
return checkpoint_data
@@ -112,3 +114,66 @@ def delete_cloned_repos(repos, logger=None):
if logger:
logger(f"Deleting local clone at: {repo.working_dir}")
delete_local_clone(repo)


def get_changed_files(repo: Repo, commit_x: str = None, commit_y: str = None):
"""
Return a list of files changed between two commits using GitPython.
Includes added, modified, deleted, and renamed files.
- commit_x: base commit (or the empty tree hash for the first commit)
- commit_y: target commit (defaults to HEAD if not provided)
"""
EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

if commit_y is None:
commit_y = repo.head.commit.hexsha
commit_y_obj = repo.commit(commit_y)

if commit_x is None or commit_x == EMPTY_TREE_HASH:
# First commit case: diff against empty tree
diff_index = commit_y_obj.diff(EMPTY_TREE_HASH, R=True)
else:
commit_x_obj = repo.commit(commit_x)
diff_index = commit_x_obj.diff(commit_y_obj, R=True)

changed_files = {item.a_path or item.b_path for item in diff_index}
return list(changed_files)


def get_last_commit(repo, ecosystem):
"""
Retrieve the last mined commit for a given ecosystem.
This function reads a JSON checkpoint file from the repository, which stores
mining progress. Each checkpoint contains the "last_commit" from the package
index (e.g., PyPI) that was previously mined.
https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
https://github.com/ziadhany/cargo-test/blob/main/minecode_checkpoints/cargo.json
"""

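    # Checkpoint files hold a small JSON object like
    # {"date": "...", "last_commit": "<index commit sha>"}.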
last_commit_file_path = (
Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
)
try:
with open(last_commit_file_path) as f:
settings_data = json.load(f)
except FileNotFoundError:
return
return settings_data.get("last_commit")


def get_commit_at_distance_ahead(
repo: Repo,
current_commit: str,
num_commits_ahead: int = 10,
branch_name: str = "master",
) -> str:
"""
Return the commit hash that is `num_commits_ahead` commits ahead of `current_commit`
on the given branch.
"""
    if current_commit:
        # Exclude everything reachable from the current checkpoint commit.
        revs = repo.git.rev_list(f"^{current_commit}", branch_name).splitlines()
    else:
        # No checkpoint yet: the empty-tree hash is not a commit and cannot be
        # excluded in rev-list, so walk the full branch history instead.
        revs = repo.git.rev_list(branch_name).splitlines()
if len(revs) < num_commits_ahead:
raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")
return revs[-num_commits_ahead]
32 changes: 32 additions & 0 deletions minecode_pipelines/pipes/cargo.py
@@ -0,0 +1,32 @@
from pathlib import Path

from aboutcode import hashid
from aboutcode.hashid import get_core_purl
from packageurl import PackageURL

from minecode_pipelines.pipes import write_data_to_yaml_file


def store_cargo_packages(packages, repo):
"""Collect Cargo package versions into purls and write them to the repo."""

if not packages:
return

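    # All records in one index file describe versions of the same crate; derive
    # the versionless base purl from the first record.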
first_pkg = packages[0]
name = first_pkg.get("name")
version = first_pkg.get("vers")
purl = PackageURL(type="cargo", name=name, version=version)
base_purl = get_core_purl(purl)

updated_purls = []
for package in packages:
name = package.get("name")
version = package.get("vers")
purl = PackageURL(type="cargo", name=name, version=version).to_string()
updated_purls.append(purl)

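    # hashid maps the base purl to the repo-relative purls.yml path used in the
    # FederatedCode data repo layout.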
ppath = hashid.get_package_purls_yml_file_path(base_purl)
purl_file_full_path = Path(repo.working_dir) / ppath
write_data_to_yaml_file(path=purl_file_full_path, data=updated_purls)
return purl_file_full_path, base_purl
1 change: 1 addition & 0 deletions minecode_pipelines/pipes/pypi.py
@@ -45,6 +45,7 @@
from aboutcode.hashid import get_package_base_dir
from packageurl import PackageURL
from scanpipe.pipes.federatedcode import clone_repository

from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
