Skip to content

Commit 45d7d06

Browse files
Update pypi packageURL mining
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent fba0889 commit 45d7d06

File tree

1 file changed

+42
-26
lines changed

1 file changed

+42
-26
lines changed

minecode_pipelines/pipes/pypi.py

Lines changed: 42 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,11 @@
5252
from scanpipe.pipes.federatedcode import push_changes
5353

5454

55+
# If True, show full details on fetching packageURL for
56+
# a package name present in the index
57+
LOG_PACKAGEURL_DETAILS = False
58+
59+
5560
PACKAGE_FILE_NAME = "PypiPackages.json"
5661
PYPI_SIMPLE_CHECKPOINT_PATH = "pypi/simple_index/" + PACKAGE_FILE_NAME
5762
PYPI_CHECKPOINT_PATH = "pypi/checkpoints.json"
@@ -63,7 +68,7 @@
6368

6469

6570
# Number of packages
66-
PACKAGE_BATCH_SIZE = 500
71+
PACKAGE_BATCH_SIZE = 1000
6772

6873

6974
def mine_pypi_packages(logger=None):
@@ -154,12 +159,12 @@ def update_pypi_checkpoints(
154159
cloned_repo,
155160
checkpoint_path=PYPI_CHECKPOINT_PATH,
156161
):
157-
settings_data = {
162+
checkpoint = {
158163
"date": str(datetime.now()),
159164
"last_serial": last_serial,
160165
}
161166
update_checkpoints_in_github(
162-
checkpoint=settings_data,
167+
checkpoint=checkpoint,
163168
cloned_repo=cloned_repo,
164169
path=checkpoint_path,
165170
)
@@ -191,30 +196,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
191196
if not packages:
192197
return
193198

199+
synced_packages = get_mined_packages_from_checkpoint(
200+
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
201+
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
202+
)
194203
if not state:
195204
if logger:
196205
logger("Initializing package mining:")
197206
packages_to_sync = packages
198-
synced_packages = []
199207

200208
elif state == PERIODIC_SYNC_STATE:
201209
# We are all synced up from the index
202210
if last_serial == last_serial_fetched:
203211
return
204212

205213
packages_to_sync = [
206-
package for package in packages if last_serial < package.get("_last-serial")
214+
package
215+
for package in packages
216+
if last_serial_fetched < package.get("_last-serial")
217+
and package.get("name") not in synced_packages
207218
]
208219
if logger:
209220
logger(
210-
f"Starting periodic package mining for {len(packages_to_sync)} packages, which has been released after serial: {last_serial}"
221+
f"Starting periodic package mining for {len(packages_to_sync)} packages, "
222+
f"which has been released after serial: {last_serial_fetched}"
211223
)
212224

213225
elif state == INITIAL_SYNC_STATE:
214-
synced_packages = get_mined_packages_from_checkpoint(
215-
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
216-
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
217-
)
218226
packages_to_sync = [
219227
package for package in packages if package.get("name") not in synced_packages
220228
]
@@ -235,7 +243,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
235243
purls = []
236244
purl_files = []
237245

238-
if logger:
246+
if logger and LOG_PACKAGEURL_DETAILS:
239247
logger("Starting package mining for a batch of packages")
240248

241249
for package in package_batch:
@@ -244,20 +252,23 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
244252

245253
# fetch packageURLs for package
246254
name = package.get("name")
247-
if logger:
255+
if logger and LOG_PACKAGEURL_DETAILS:
248256
logger(f"getting packageURLs for package: {name}")
249257

250258
packageurls = get_pypi_packageurls(name)
251259
if not packageurls:
252-
if logger:
253-
logger(f"Could not fetch package versions for package: {name}")
260+
if logger and LOG_PACKAGEURL_DETAILS:
261+
logger(f"Package versions not present for package: {name}")
262+
263+
# We don't want to try fetching versions for these again
264+
packages_mined.append(name)
254265
continue
255266

256267
# get repo and path for package
257268
base_purl = PackageURL(type=PYPI_TYPE, name=name).to_string()
258269
package_base_dir = get_package_base_dir(purl=base_purl)
259270

260-
if logger:
271+
if logger and LOG_PACKAGEURL_DETAILS:
261272
logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
262273
purls_string = " ".join(packageurls)
263274
logger(f"packageURLs: {purls_string}")
@@ -291,28 +302,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
291302
# Push changes to remote repository
292303
push_changes(repo=cloned_data_repo)
293304

294-
# If we are mining the packages initially to sync with the index,
295305
# we need to update mined packages checkpoint for every batch
296-
if state != PERIODIC_SYNC_STATE:
297-
if logger:
298-
logger("Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
299-
300-
packages_checkpoint = packages_mined + synced_packages
301-
update_mined_packages_in_checkpoint(
302-
packages=packages_checkpoint,
303-
cloned_repo=cloned_config_repo,
304-
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
305-
)
306+
if logger:
307+
logger(f"Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
308+
309+
update_mined_packages_in_checkpoint(
310+
packages=packages_mined,
311+
cloned_repo=cloned_config_repo,
312+
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
313+
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
314+
)
306315

307316
# If we are finished mining all the packages in the initial sync, we can now
308317
# periodically sync the packages from latest
309318
if state == INITIAL_SYNC_STATE:
310319
if logger:
311320
logger(f"{INITIAL_SYNC_STATE} completed. starting: {PERIODIC_SYNC_STATE}")
321+
312322
update_checkpoint_state(
313323
cloned_repo=cloned_config_repo,
314324
state=PERIODIC_SYNC_STATE,
315325
)
326+
# refresh packages checkpoint once to only checkpoint new packages
327+
update_checkpoints_in_github(
328+
checkpoint={"packages_mined": []},
329+
cloned_repo=cloned_config_repo,
330+
path=PYPI_PACKAGES_CHECKPOINT_PATH,
331+
)
316332

317333
# update last_serial to minecode checkpoints whenever we finish mining
318334
# either from checkpoints or from the latest pypi

0 commit comments

Comments
 (0)