5050from scanpipe .pipes .federatedcode import push_changes
5151
5252
53+ # If True, log full per-package details while fetching packageURLs for
54+ # each package name present in the index
55+ LOG_PACKAGEURL_DETAILS = False
56+
57+
5358PACKAGE_FILE_NAME = "PypiPackages.json"
5459PYPI_SIMPLE_CHECKPOINT_PATH = "pypi/simple_index/" + PACKAGE_FILE_NAME
5560PYPI_CHECKPOINT_PATH = "pypi/checkpoints.json"
6166
6267
6368# Number of packages processed per batch
64- PACKAGE_BATCH_SIZE = 500
69+ PACKAGE_BATCH_SIZE = 1000
6570
6671
6772def mine_pypi_packages (logger = None ):
@@ -140,6 +145,7 @@ def update_checkpoint_state(
140145 checkpoint_path = checkpoint_path ,
141146 )
142147 checkpoint ["state" ] = state
148+ checkpoint ["last_updated" ] = str (datetime .now ())
143149 update_checkpoints_in_github (
144150 checkpoint = checkpoint ,
145151 cloned_repo = cloned_repo ,
@@ -149,15 +155,17 @@ def update_checkpoint_state(
149155
150156def update_pypi_checkpoints (
151157 last_serial ,
158+ state ,
152159 cloned_repo ,
153160 checkpoint_path = PYPI_CHECKPOINT_PATH ,
154161):
155- settings_data = {
156- "date" : str (datetime .now ()),
162+ checkpoint = {
163+ "last_updated" : str (datetime .now ()),
164+ "state" : state ,
157165 "last_serial" : last_serial ,
158166 }
159167 update_checkpoints_in_github (
160- checkpoint = settings_data ,
168+ checkpoint = checkpoint ,
161169 cloned_repo = cloned_repo ,
162170 path = checkpoint_path ,
163171 )
@@ -189,30 +197,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
189197 if not packages :
190198 return
191199
200+ synced_packages = get_mined_packages_from_checkpoint (
201+ config_repo = MINECODE_PIPELINES_CONFIG_REPO ,
202+ checkpoint_path = PYPI_PACKAGES_CHECKPOINT_PATH ,
203+ )
192204 if not state :
193205 if logger :
194206 logger ("Initializing package mining:" )
195207 packages_to_sync = packages
196- synced_packages = []
197208
198209 elif state == PERIODIC_SYNC_STATE :
199210 # We are all synced up from the index
200211 if last_serial == last_serial_fetched :
201212 return
202213
203214 packages_to_sync = [
204- package for package in packages if last_serial < package .get ("_last-serial" )
215+ package
216+ for package in packages
217+ if last_serial_fetched < package .get ("_last-serial" )
218+ and package .get ("name" ) not in synced_packages
205219 ]
206220 if logger :
207221 logger (
208- f"Starting periodic package mining for { len (packages_to_sync )} packages, which has been released after serial: { last_serial } "
222+ f"Starting periodic package mining for { len (packages_to_sync )} packages, "
223+ f"which has been released after serial: { last_serial_fetched } "
209224 )
210225
211226 elif state == INITIAL_SYNC_STATE :
212- synced_packages = get_mined_packages_from_checkpoint (
213- config_repo = MINECODE_PIPELINES_CONFIG_REPO ,
214- checkpoint_path = PYPI_PACKAGES_CHECKPOINT_PATH ,
215- )
216227 packages_to_sync = [
217228 package for package in packages if package .get ("name" ) not in synced_packages
218229 ]
@@ -233,7 +244,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
233244 purls = []
234245 purl_files = []
235246
236- if logger :
247+ if logger and LOG_PACKAGEURL_DETAILS :
237248 logger ("Starting package mining for a batch of packages" )
238249
239250 for package in package_batch :
@@ -242,20 +253,23 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
242253
243254 # fetch packageURLs for package
244255 name = package .get ("name" )
245- if logger :
256+ if logger and LOG_PACKAGEURL_DETAILS :
246257 logger (f"getting packageURLs for package: { name } " )
247258
248259 packageurls = get_pypi_packageurls (name )
249260 if not packageurls :
250- if logger :
251- logger (f"Could not fetch package versions for package: { name } " )
261+ if logger and LOG_PACKAGEURL_DETAILS :
262+ logger (f"Package versions not present for package: { name } " )
263+
264+ # We don't want to try fetching versions for these again
265+ packages_mined .append (name )
252266 continue
253267
254268 # get repo and path for package
255269 base_purl = PackageURL (type = PYPI_TYPE , name = name ).to_string ()
256270 package_base_dir = get_package_base_dir (purl = base_purl )
257271
258- if logger :
272+ if logger and LOG_PACKAGEURL_DETAILS :
259273 logger (f"writing packageURLs for package: { base_purl } at: { package_base_dir } " )
260274 purls_string = " " .join (packageurls )
261275 logger (f"packageURLs: { purls_string } " )
@@ -289,34 +303,40 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
289303 # Push changes to remote repository
290304 push_changes (repo = cloned_data_repo )
291305
292- # If we are mining the packages initially to sync with the index,
293306 # We need to update the mined packages checkpoint for every batch
294- if state != PERIODIC_SYNC_STATE :
295- if logger :
296- logger ("Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}" )
297-
298- packages_checkpoint = packages_mined + synced_packages
299- update_mined_packages_in_checkpoint (
300- packages = packages_checkpoint ,
301- cloned_repo = cloned_config_repo ,
302- checkpoint_path = PYPI_PACKAGES_CHECKPOINT_PATH ,
303- )
307+ if logger :
308+ logger (f"Checkpointing processed packages to: { PYPI_PACKAGES_CHECKPOINT_PATH } " )
309+
310+ update_mined_packages_in_checkpoint (
311+ packages = packages_mined ,
312+ cloned_repo = cloned_config_repo ,
313+ config_repo = MINECODE_PIPELINES_CONFIG_REPO ,
314+ checkpoint_path = PYPI_PACKAGES_CHECKPOINT_PATH ,
315+ )
304316
305317 # If we are finished mining all the packages in the initial sync, we can now
306318 # periodically sync the packages from latest
307319 if state == INITIAL_SYNC_STATE :
308320 if logger :
309321 logger (f"{ INITIAL_SYNC_STATE } completed. starting: { PERIODIC_SYNC_STATE } " )
322+
323+ state = PERIODIC_SYNC_STATE
310324 update_checkpoint_state (
311325 cloned_repo = cloned_config_repo ,
312- state = PERIODIC_SYNC_STATE ,
326+ state = state ,
327+ )
328+ # reset the packages checkpoint to an empty list once, so that from now on it only tracks newly mined packages
329+ update_checkpoints_in_github (
330+ checkpoint = {"packages_mined" : []},
331+ cloned_repo = cloned_config_repo ,
332+ path = PYPI_PACKAGES_CHECKPOINT_PATH ,
313333 )
314334
315335 # update last_serial to minecode checkpoints whenever we finish mining
316336 # either from checkpoints or from the latest pypi
317337 if logger :
318338 logger (f"Updating checkpoint at: { PYPI_CHECKPOINT_PATH } with last serial: { last_serial } " )
319- update_pypi_checkpoints (last_serial = last_serial , cloned_repo = cloned_config_repo )
339+ update_pypi_checkpoints (last_serial = last_serial , state = state , cloned_repo = cloned_config_repo )
320340
321341 repos_to_clean = [cloned_data_repo , cloned_config_repo ]
322342 return repos_to_clean
0 commit comments