 from scanpipe.pipes.federatedcode import push_changes


+# If True, show full details on fetching packageURLs for
+# a package name present in the index
+LOG_PACKAGEURL_DETAILS = False
+
+
 PACKAGE_FILE_NAME = "PypiPackages.json"
 PYPI_SIMPLE_CHECKPOINT_PATH = "pypi/simple_index/" + PACKAGE_FILE_NAME
 PYPI_CHECKPOINT_PATH = "pypi/checkpoints.json"


 # Number of packages to mine per batch
-PACKAGE_BATCH_SIZE = 500
+PACKAGE_BATCH_SIZE = 1000


 def mine_pypi_packages(logger=None):
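Note on the batch-size bump above: PACKAGE_BATCH_SIZE controls how many packages are mined, committed, and checkpointed per cycle. A minimal sketch of how such a constant is typically consumed, assuming a simple slicing helper (batch_packages() below is hypothetical, not part of minecode_pipelines):

# Hypothetical illustration only; minecode_pipelines may use its own batching utility.
PACKAGE_BATCH_SIZE = 1000

def batch_packages(packages, size=PACKAGE_BATCH_SIZE):
    """Yield successive lists of at most `size` packages."""
    for start in range(0, len(packages), size):
        yield packages[start:start + size]

# Each yielded batch would then be mined, pushed, and checkpointed before the next one.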
@@ -154,12 +159,12 @@ def update_pypi_checkpoints(
     cloned_repo,
     checkpoint_path=PYPI_CHECKPOINT_PATH,
 ):
-    settings_data = {
+    checkpoint = {
         "date": str(datetime.now()),
         "last_serial": last_serial,
     }
     update_checkpoints_in_github(
-        checkpoint=settings_data,
+        checkpoint=checkpoint,
         cloned_repo=cloned_repo,
         path=checkpoint_path,
     )
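With the rename above, the data written to pypi/checkpoints.json is just the small dict built in update_pypi_checkpoints; serialized it would look roughly like this (values illustrative):

checkpoint = {
    "date": "2025-01-01 12:00:00.000000",  # str(datetime.now()) at mining time
    "last_serial": 20000000,               # illustrative serial from the PyPI index
}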
@@ -191,30 +196,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
     if not packages:
         return

+    synced_packages = get_mined_packages_from_checkpoint(
+        config_repo=MINECODE_PIPELINES_CONFIG_REPO,
+        checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
+    )
     if not state:
         if logger:
             logger("Initializing package mining:")
         packages_to_sync = packages
-        synced_packages = []

     elif state == PERIODIC_SYNC_STATE:
         # We are all synced up from the index
         if last_serial == last_serial_fetched:
             return

         packages_to_sync = [
-            package for package in packages if last_serial < package.get("_last-serial")
+            package
+            for package in packages
+            if last_serial_fetched < package.get("_last-serial")
+            and package.get("name") not in synced_packages
         ]
         if logger:
             logger(
-                f"Starting periodic package mining for {len(packages_to_sync)} packages, which has been released after serial: {last_serial}"
+                f"Starting periodic package mining for {len(packages_to_sync)} packages, "
+                f"which have been released after serial: {last_serial_fetched}"
             )

     elif state == INITIAL_SYNC_STATE:
-        synced_packages = get_mined_packages_from_checkpoint(
-            config_repo=MINECODE_PIPELINES_CONFIG_REPO,
-            checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
-        )
         packages_to_sync = [
             package for package in packages if package.get("name") not in synced_packages
         ]
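The new periodic filter keeps only projects whose PyPI _last-serial is newer than the last mined serial and whose name is not already in the checkpoint. A toy, self-contained example of the same predicate (data invented for illustration):

last_serial_fetched = 100
synced_packages = {"requests"}
packages = [
    {"name": "requests", "_last-serial": 120},
    {"name": "idna", "_last-serial": 90},
    {"name": "yarl", "_last-serial": 130},
]
packages_to_sync = [
    package
    for package in packages
    if last_serial_fetched < package.get("_last-serial")
    and package.get("name") not in synced_packages
]
# packages_to_sync == [{"name": "yarl", "_last-serial": 130}]
# "requests" is already synced and "idna" predates the serial, so both are skipped.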
@@ -235,7 +243,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
         purls = []
         purl_files = []

-        if logger:
+        if logger and LOG_PACKAGEURL_DETAILS:
             logger("Starting package mining for a batch of packages")

         for package in package_batch:
@@ -244,20 +252,23 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):

             # fetch packageURLs for package
             name = package.get("name")
-            if logger:
+            if logger and LOG_PACKAGEURL_DETAILS:
                 logger(f"getting packageURLs for package: {name}")

             packageurls = get_pypi_packageurls(name)
             if not packageurls:
-                if logger:
-                    logger(f"Could not fetch package versions for package: {name}")
+                if logger and LOG_PACKAGEURL_DETAILS:
+                    logger(f"Package versions not present for package: {name}")
+
+                # We don't want to try fetching versions for these again
+                packages_mined.append(name)
                 continue

             # get repo and path for package
             base_purl = PackageURL(type=PYPI_TYPE, name=name).to_string()
             package_base_dir = get_package_base_dir(purl=base_purl)

-            if logger:
+            if logger and LOG_PACKAGEURL_DETAILS:
                 logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
                 purls_string = " ".join(packageurls)
                 logger(f"packageURLs: {purls_string}")
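For context on get_pypi_packageurls(): a minimal sketch of fetching the version purls for one project name, assuming the public PyPI JSON API (https://pypi.org/pypi/<name>/json); the real helper in minecode_pipelines may use a different client, caching, or error handling:

import requests
from packageurl import PackageURL

def fetch_pypi_packageurls(name):
    """Sketch only: return pkg:pypi purls for every released version of `name`."""
    response = requests.get(f"https://pypi.org/pypi/{name}/json", timeout=30)
    if response.status_code != 200:
        return []
    releases = response.json().get("releases", {})
    return [
        PackageURL(type="pypi", name=name, version=version).to_string()
        for version in releases
    ]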
@@ -291,28 +302,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
         # Push changes to remote repository
         push_changes(repo=cloned_data_repo)

-        # If we are mining the packages initially to sync with the index,
         # we need to update mined packages checkpoint for every batch
-        if state != PERIODIC_SYNC_STATE:
-            if logger:
-                logger("Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
-
-            packages_checkpoint = packages_mined + synced_packages
-            update_mined_packages_in_checkpoint(
-                packages=packages_checkpoint,
-                cloned_repo=cloned_config_repo,
-                checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
-            )
+        if logger:
+            logger(f"Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
+
+        update_mined_packages_in_checkpoint(
+            packages=packages_mined,
+            cloned_repo=cloned_config_repo,
+            config_repo=MINECODE_PIPELINES_CONFIG_REPO,
+            checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
+        )

     # If we are finished mining all the packages in the initial sync, we can now
     # periodically sync the packages from latest
     if state == INITIAL_SYNC_STATE:
         if logger:
             logger(f"{INITIAL_SYNC_STATE} completed. starting: {PERIODIC_SYNC_STATE}")
+
         update_checkpoint_state(
             cloned_repo=cloned_config_repo,
             state=PERIODIC_SYNC_STATE,
         )
+        # refresh packages checkpoint once to only checkpoint new packages
+        update_checkpoints_in_github(
+            checkpoint={"packages_mined": []},
+            cloned_repo=cloned_config_repo,
+            path=PYPI_PACKAGES_CHECKPOINT_PATH,
+        )

     # update last_serial to minecode checkpoints whenever we finish mining
     # either from checkpoints or from the latest pypi
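The checkpoint refresh added above rewrites the packages checkpoint with an empty list so that, once the initial sync is done, only packages mined afterwards are tracked; based on the call above, the checkpoint JSON presumably goes from a grown list of mined names back to:

{"packages_mined": []}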