Skip to content

Commit 03bf5d2

Browse files
committed
Refactored my Cargo to work with the newly updated common functions
Signed-off-by: ziad hany <[email protected]>
1 parent cac353a commit 03bf5d2

File tree

5 files changed

+172
-70
lines changed

5 files changed

+172
-70
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 69 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,40 +6,58 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9-
from minecode_pipelines.pipes import get_last_commit
9+
from datetime import datetime
10+
11+
from minecode_pipelines.pipes import fetch_checkpoint_from_github
12+
from minecode_pipelines.pipes import update_checkpoints_in_github
13+
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
1014
from minecode_pipelines.pipes import get_changed_files
11-
from minecode_pipelines.pipes import update_last_commit
1215
from minecode_pipelines.pipes.cargo import store_cargo_packages
16+
from scanpipe.pipes.federatedcode import commit_changes
17+
from scanpipe.pipes.federatedcode import push_changes
18+
from minecode_pipelines import VERSION
19+
1320
import json
1421
from pathlib import Path
1522

1623
from minecode_pipelines.utils import get_next_x_commit
1724

25+
PACKAGE_BATCH_SIZE = 500
26+
CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"
1827

19-
def process_cargo_packages(cargo_repo, fed_repo, fed_conf_repo, logger):
28+
29+
def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logger):
2030
"""
2131
Process Cargo index files commit by commit.
2232
Push changes to fed_repo after:
23-
- every `commit_batch` commits, OR
24-
- when reaching HEAD.
33+
- every `commit_batch` commits, OR when reaching HEAD.
2534
"""
2635

27-
base_path = Path(cargo_repo.working_tree_dir)
36+
base_path = Path(cargo_index_repo.working_tree_dir)
2837

2938
while True:
30-
setting_last_commit = get_last_commit(fed_conf_repo, "cargo")
31-
next_commit = get_next_x_commit(cargo_repo, setting_last_commit, x=10, branch="master")
39+
cargo_checkpoints = (
40+
fetch_checkpoint_from_github(MINECODE_PIPELINES_CONFIG_REPO, CARGO_CHECKPOINT_PATH)
41+
or {}
42+
)
43+
checkpoints_last_commit = cargo_checkpoints.get("last_commit")
3244

33-
if next_commit == setting_last_commit:
45+
next_commit = get_next_x_commit(
46+
cargo_index_repo, checkpoints_last_commit, x=10, branch="master"
47+
)
48+
49+
if next_commit == checkpoints_last_commit:
3450
logger("No new commits to mine")
3551
break
3652

3753
changed_files = get_changed_files(
38-
cargo_repo, commit_x=setting_last_commit, commit_y=next_commit
54+
cargo_index_repo, commit_x=checkpoints_last_commit, commit_y=next_commit
3955
)
4056
logger(f"Found {len(changed_files)} changed files in Cargo index.")
4157

4258
file_counter = 0
59+
purl_files = []
60+
purls = []
4361
for idx, rel_path in enumerate(changed_files):
4462
file_path = base_path / rel_path
4563
logger(f"Found {file_path}.")
@@ -57,8 +75,45 @@ def process_cargo_packages(cargo_repo, fed_repo, fed_conf_repo, logger):
5775
packages.append(json.loads(line))
5876

5977
file_counter += 1
60-
push_commit = (file_counter % 1000 == 0) or (idx == len(changed_files))
61-
store_cargo_packages(packages, fed_repo, push_commit)
78+
commit_and_push = (file_counter % PACKAGE_BATCH_SIZE == 0) or (
79+
idx == len(changed_files)
80+
)
81+
purl_file, base_purl = store_cargo_packages(packages, cloned_data_repo)
82+
logger(f"writing packageURLs for package: {base_purl} at: {purl_file}")
83+
84+
purl_files.append(purl_file)
85+
purls.append(str(base_purl))
86+
if not commit_and_push:
87+
continue
88+
89+
commit_changes(
90+
repo=cloned_data_repo,
91+
files_to_commit=purl_files,
92+
purls=purls,
93+
mine_type="packageURL",
94+
tool_name="pkg:cargo/minecode-pipelines",
95+
tool_version=VERSION,
96+
)
97+
98+
# Push changes to remote repository
99+
push_changes(repo=cloned_data_repo)
100+
purl_files = []
101+
purls = []
102+
103+
if logger:
104+
logger(
105+
f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {checkpoints_last_commit}"
106+
)
107+
108+
settings_data = {
109+
"date": str(datetime.now()),
110+
"last_commit": next_commit,
111+
}
112+
113+
update_checkpoints_in_github(
114+
checkpoint=settings_data,
115+
cloned_repo=config_repo,
116+
path=CARGO_CHECKPOINT_PATH,
117+
)
62118

63-
update_last_commit(next_commit, fed_conf_repo, "cargo")
64-
logger(f"Pushed batch for commit range {setting_last_commit}:{next_commit}.")
119+
logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,18 @@
1919
#
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22-
import os
2322

24-
from git.repo.base import Repo
25-
from scanpipe.pipes.federatedcode import delete_local_clone
26-
from minecode_pipelines.utils import get_temp_file
23+
import os
2724
from scanpipe.pipelines import Pipeline
2825
from scanpipe.pipes import federatedcode
2926
from minecode_pipelines.miners import cargo
27+
from minecode_pipelines import pipes
28+
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
3029

31-
FEDERATEDCODE_CARGO_GIT_URL = os.environ.get(
32-
"FEDERATEDCODE_CARGO_GIT_URL", "https://github.com/ziadhany/cargo-test"
33-
)
34-
35-
FEDERATEDCODE_CONFIG_GIT_URL = os.environ.get(
36-
"FEDERATEDCODE_CONFIG_GIT_URL", "https://github.com/ziadhany/federatedcode-config"
30+
MINECODE_DATA_CARGO_REPO = os.environ.get(
31+
"MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
3732
)
33+
MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"
3834

3935

4036
class MineandPublishCargoPURLs(Pipeline):
@@ -45,39 +41,43 @@ def steps(cls):
4541
return (
4642
cls.check_federatedcode_eligibility,
4743
cls.clone_cargo_repo,
48-
cls.collect_packages_from_cargo,
49-
cls.clean_cargo_repo,
44+
cls.mine_and_publish_cargo_packageurls,
45+
cls.delete_cloned_repos,
5046
)
5147

5248
def check_federatedcode_eligibility(self):
5349
"""
5450
Check if the project fulfills the following criteria for
5551
pushing the project result to FederatedCode.
5652
"""
57-
federatedcode.check_federatedcode_configured_and_available(project=self.project)
53+
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
5854

5955
def clone_cargo_repo(self):
6056
"""
6157
Clone the repo at repo_url and return the Repo object
6258
"""
63-
conan_repo_url = "https://github.com/rust-lang/crates.io-index"
64-
65-
self.fed_repo = federatedcode.clone_repository(FEDERATEDCODE_CARGO_GIT_URL)
66-
self.fed_conf_repo = federatedcode.clone_repository(FEDERATEDCODE_CONFIG_GIT_URL)
67-
self.cargo_repo = Repo.clone_from(conan_repo_url, get_temp_file())
59+
self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
60+
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)
61+
self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)
6862

69-
def collect_packages_from_cargo(self):
70-
cargo.process_cargo_packages(self.cargo_repo, self.fed_repo, self.fed_conf_repo, self.log)
63+
if self.log:
64+
self.log(
65+
f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}"
66+
)
67+
self.log(
68+
f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
69+
)
70+
self.log(
71+
f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {self.cloned_config_repo.working_dir}"
72+
)
7173

72-
def clean_cargo_repo(self):
73-
"""
74-
Delete the federatedcode repository if it exists, and also delete the Cargo repository if it exists.
75-
"""
76-
if self.cargo_repo:
77-
delete_local_clone(self.cargo_repo)
78-
79-
if self.fed_repo:
80-
delete_local_clone(self.fed_repo)
74+
def mine_and_publish_cargo_packageurls(self):
75+
cargo.process_cargo_packages(
76+
self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo, self.log
77+
)
8178

82-
if self.fed_conf_repo:
83-
delete_local_clone(self.fed_repo)
79+
def delete_cloned_repos(self):
80+
pipes.delete_cloned_repos(
81+
repos=[self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo],
82+
logger=self.log,
83+
)

minecode_pipelines/pipes/__init__.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pathlib import Path
1616

1717
from aboutcode.hashid import PURLS_FILENAME
18+
from git import Repo
1819

1920
from scanpipe.pipes.federatedcode import delete_local_clone
2021
from scanpipe.pipes.federatedcode import commit_and_push_changes
@@ -106,3 +107,60 @@ def delete_cloned_repos(repos, logger=None):
106107
if logger:
107108
logger(f"Deleting local clone at: {repo.working_dir}")
108109
delete_local_clone(repo)
110+
111+
112+
def get_changed_files(
    repo: "Repo", commit_x: "str | None" = None, commit_y: "str | None" = None
):
    """
    Return a sorted list of the file paths changed between two commits.

    The diff is computed with GitPython and every diff entry contributes one
    path: its ``a_path`` when set, otherwise its ``b_path``.

    - repo: repository object exposing ``head.commit.hexsha`` and ``commit(rev)``
    - commit_x: base commit; ``None`` or the empty tree hash means that
      ``commit_y`` is diffed against the empty tree (first commit case)
    - commit_y: target commit (defaults to HEAD if not provided)

    The paths are de-duplicated and returned in sorted order so that repeated
    runs over the same commit range process files in the same order.
    """
    EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    if commit_y is None:
        commit_y = repo.head.commit.hexsha
    commit_y_obj = repo.commit(commit_y)

    # NOTE(review): R=True is forwarded to GitPython's diff unchanged;
    # presumably it swaps the two sides of the diff -- confirm against the
    # GitPython documentation before relying on which side a_path refers to.
    if commit_x is None or commit_x == EMPTY_TREE_HASH:
        # First commit case: diff against empty tree
        diff_index = commit_y_obj.diff(EMPTY_TREE_HASH, R=True)
    else:
        diff_index = repo.commit(commit_x).diff(commit_y_obj, R=True)

    # A set removes duplicates; entries with no path on either side are
    # skipped because callers join each result onto a base directory.
    changed_files = {item.a_path or item.b_path for item in diff_index}
    return sorted(path for path in changed_files if path)
134+
135+
136+
def get_last_commit(repo, ecosystem):
    """
    Retrieve the last mined commit for a given ecosystem.

    This function reads a JSON checkpoint file from the repository, which stores
    mining progress. Each checkpoint contains the "last_commit" from the package
    index (e.g., PyPI) that was previously mined.
    https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
    https://github.com/ziadhany/cargo-test/blob/main/minecode_checkpoints/cargo.json

    - repo: repository object whose ``working_tree_dir`` is the checkout root
    - ecosystem: checkpoint name; the file read is
      ``minecode_checkpoints/<ecosystem>.json`` under the checkout root

    Return the value stored under "last_commit", or None when the checkpoint
    file does not exist or has no such key.
    """
    last_commit_file_path = (
        Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
    )
    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the locale default encoding of the host.
        with open(last_commit_file_path, encoding="utf-8") as f:
            settings_data = json.load(f)
    except FileNotFoundError:
        # No checkpoint yet: nothing has been mined for this ecosystem.
        return None
    return settings_data.get("last_commit")
155+
156+
157+
def get_next_x_commit(
    repo: "Repo", current_commit: str, x: int = 10, branch: str = "master"
) -> str:
    """
    Get the x-th next commit after the current commit in the specified branch.

    - repo: repository object exposing ``git.rev_list(...)``
    - current_commit: commit to start after; a falsy value is replaced by the
      empty tree hash
    - x: how many commits to step forward; must be at least 1
    - branch: branch whose history is listed

    Raise ValueError when ``x`` is smaller than 1 or when fewer than ``x``
    commits are listed.

    NOTE(review): because a ValueError is raised whenever fewer than ``x``
    commits are listed (including none at all), this function never returns
    ``current_commit`` itself; callers that compare the result with
    ``current_commit`` to detect "nothing new" should be checked.
    """
    EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    # revs[-0] is revs[0] and a negative x would index from the wrong end, so
    # a non-positive step is rejected instead of returning a wrong commit.
    if x < 1:
        raise ValueError(f"x must be a positive integer, got {x}.")

    if not current_commit:
        current_commit = EMPTY_TREE_HASH

    # Lists the commits of `branch` excluding those reachable from
    # `current_commit`. Assumes the listing is newest first, so the x-th next
    # commit is counted from the end -- TODO confirm.
    revs = repo.git.rev_list(f"^{current_commit}", branch).splitlines()
    if len(revs) < x:
        raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")
    return revs[-x]

minecode_pipelines/pipes/cargo.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1+
from aboutcode import hashid
12
from packageurl import PackageURL
23
from aboutcode.hashid import get_core_purl
3-
from minecode_pipelines.pipes import write_purls_to_repo
44

5+
from minecode_pipelines.miners import write_packageurls_to_file
56

6-
def store_cargo_packages(packages, fed_repo, push_commit=False):
7+
8+
def store_cargo_packages(packages, repo):
79
"""Collect Cargo package versions into purls and write them to the repo."""
810

911
if not packages:
10-
raise ValueError("No packages found")
12+
return
1113

1214
first_pkg = packages[0]
1315
name = first_pkg.get("name")
@@ -22,4 +24,5 @@ def store_cargo_packages(packages, fed_repo, push_commit=False):
2224
purl = PackageURL(type="cargo", name=name, version=version).to_string()
2325
updated_purls.append(purl)
2426

25-
write_purls_to_repo(fed_repo, base_purl, updated_purls, push_commit)
27+
ppath = hashid.get_package_purls_yml_file_path(base_purl)
28+
return write_packageurls_to_file(repo, ppath, updated_purls), base_purl

minecode_pipelines/tests/pipes/test_cargo_pipes.py

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,15 @@
55
from unittest.mock import Mock, patch
66
import saneyaml
77
from django.test import TestCase
8-
from packageurl import PackageURL
98

10-
from minecode_pipelines.pipes import git_stage_purls
9+
from minecode_pipelines.miners import write_packageurls_to_file
1110
from minecode_pipelines.pipes.cargo import store_cargo_packages
1211

1312
DATA_DIR = Path(__file__).parent.parent / "test_data" / "cargo"
1413

1514

1615
class CargoPipelineTests(TestCase):
17-
def _get_temp_dir(self):
18-
import tempfile
19-
20-
return tempfile.mkdtemp()
21-
22-
@patch("minecode_pipelines.pipes.cargo.write_purls_to_repo")
16+
@patch("minecode_pipelines.pipes.cargo.write_packageurls_to_file")
2317
def test_collect_packages_from_cargo_calls_write(self, mock_write):
2418
packages_file = DATA_DIR / "c5store"
2519
expected_file = DATA_DIR / "c5store-expected.yaml"
@@ -34,22 +28,16 @@ def test_collect_packages_from_cargo_calls_write(self, mock_write):
3428
expected = saneyaml.load(f)
3529

3630
repo = Mock()
37-
result = store_cargo_packages(packages, repo)
38-
self.assertIsNone(result)
31+
store_cargo_packages(packages, repo)
3932

4033
mock_write.assert_called_once()
4134
args, kwargs = mock_write.call_args
42-
called_repo, base_purl, written_packages, push_commit = args
35+
called_repo, base_purl, written_packages = args
4336

4437
self.assertEqual(called_repo, repo)
4538

46-
first_pkg = packages[0]
47-
expected_base_purl = PackageURL(
48-
type="cargo",
49-
name=first_pkg["name"],
50-
)
39+
expected_base_purl = 'aboutcode-packages-cargo-0/cargo/c5store/purls.yml'
5140
self.assertEqual(str(base_purl), str(expected_base_purl))
52-
5341
self.assertEqual(written_packages, expected)
5442

5543
def test_add_purl_result_with_mock_repo(self):
@@ -64,13 +52,11 @@ def test_add_purl_result_with_mock_repo(self):
6452

6553
purls_file = repo_dir / "purls.yaml"
6654

67-
relative_path = git_stage_purls(purls, mock_repo, purls_file)
55+
relative_path = write_packageurls_to_file(mock_repo, purls_file, purls)
6856

6957
written_file = repo_dir / relative_path
7058
self.assertTrue(written_file.exists())
7159

7260
with open(written_file, encoding="utf-8") as f:
7361
content = saneyaml.load(f)
74-
self.assertEqual(content, purls)
75-
76-
mock_repo.index.add.assert_called_once_with([relative_path])
62+
self.assertEqual(content, purls)

0 commit comments

Comments
 (0)