Skip to content

Commit c468a76

Browse files
Merge pull request #710 from aboutcode-org/minecode-pipeline-pypi
Update pypi packageURL mining
2 parents e389935 + 8ab027e commit c468a76

File tree

4 files changed

+55
-35
lines changed

4 files changed

+55
-35
lines changed

.github/workflows/pypi-release-minecode-pipeline.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ on:
44
workflow_dispatch:
55
push:
66
tags:
7-
- "minecode-pipeline/*"
7+
- "minecode-pipelines/*"
88

99
jobs:
1010
build-and-publish:
11+
permissions:
12+
id-token: write
1113
name: Build and publish library to PyPI
1214
runs-on: ubuntu-22.04
1315

@@ -28,8 +30,6 @@ jobs:
2830
- name: Publish to PyPI
2931
if: startsWith(github.ref, 'refs/tags')
3032
uses: pypa/gh-action-pypi-publish@release/v1
31-
with:
32-
password: ${{ secrets.PYPI_API_TOKEN_MINECODE_PIPELINES }}
3333

3434
- name: Upload built archives
3535
uses: actions/upload-artifact@v4

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
VERSION = "0.0.1b2"
10+
VERSION = "0.0.1b3"

minecode_pipelines/pipes/pypi.py

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@
5050
from scanpipe.pipes.federatedcode import push_changes
5151

5252

53+
# If True, show full details on fetching packageURL for
54+
# a package name present in the index
55+
LOG_PACKAGEURL_DETAILS = False
56+
57+
5358
PACKAGE_FILE_NAME = "PypiPackages.json"
5459
PYPI_SIMPLE_CHECKPOINT_PATH = "pypi/simple_index/" + PACKAGE_FILE_NAME
5560
PYPI_CHECKPOINT_PATH = "pypi/checkpoints.json"
@@ -61,7 +66,7 @@
6166

6267

6368
# Number of packages to process in each batch
64-
PACKAGE_BATCH_SIZE = 500
69+
PACKAGE_BATCH_SIZE = 1000
6570

6671

6772
def mine_pypi_packages(logger=None):
@@ -140,6 +145,7 @@ def update_checkpoint_state(
140145
checkpoint_path=checkpoint_path,
141146
)
142147
checkpoint["state"] = state
148+
checkpoint["last_updated"] = str(datetime.now())
143149
update_checkpoints_in_github(
144150
checkpoint=checkpoint,
145151
cloned_repo=cloned_repo,
@@ -149,15 +155,17 @@ def update_checkpoint_state(
149155

150156
def update_pypi_checkpoints(
151157
last_serial,
158+
state,
152159
cloned_repo,
153160
checkpoint_path=PYPI_CHECKPOINT_PATH,
154161
):
155-
settings_data = {
156-
"date": str(datetime.now()),
162+
checkpoint = {
163+
"last_updated": str(datetime.now()),
164+
"state": state,
157165
"last_serial": last_serial,
158166
}
159167
update_checkpoints_in_github(
160-
checkpoint=settings_data,
168+
checkpoint=checkpoint,
161169
cloned_repo=cloned_repo,
162170
path=checkpoint_path,
163171
)
@@ -189,30 +197,33 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
189197
if not packages:
190198
return
191199

200+
synced_packages = get_mined_packages_from_checkpoint(
201+
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
202+
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
203+
)
192204
if not state:
193205
if logger:
194206
logger("Initializing package mining:")
195207
packages_to_sync = packages
196-
synced_packages = []
197208

198209
elif state == PERIODIC_SYNC_STATE:
199210
# We are all synced up from the index
200211
if last_serial == last_serial_fetched:
201212
return
202213

203214
packages_to_sync = [
204-
package for package in packages if last_serial < package.get("_last-serial")
215+
package
216+
for package in packages
217+
if last_serial_fetched < package.get("_last-serial")
218+
and package.get("name") not in synced_packages
205219
]
206220
if logger:
207221
logger(
208-
f"Starting periodic package mining for {len(packages_to_sync)} packages, which has been released after serial: {last_serial}"
222+
f"Starting periodic package mining for {len(packages_to_sync)} packages, "
223+
f"which has been released after serial: {last_serial_fetched}"
209224
)
210225

211226
elif state == INITIAL_SYNC_STATE:
212-
synced_packages = get_mined_packages_from_checkpoint(
213-
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
214-
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
215-
)
216227
packages_to_sync = [
217228
package for package in packages if package.get("name") not in synced_packages
218229
]
@@ -233,7 +244,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
233244
purls = []
234245
purl_files = []
235246

236-
if logger:
247+
if logger and LOG_PACKAGEURL_DETAILS:
237248
logger("Starting package mining for a batch of packages")
238249

239250
for package in package_batch:
@@ -242,20 +253,23 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
242253

243254
# fetch packageURLs for package
244255
name = package.get("name")
245-
if logger:
256+
if logger and LOG_PACKAGEURL_DETAILS:
246257
logger(f"getting packageURLs for package: {name}")
247258

248259
packageurls = get_pypi_packageurls(name)
249260
if not packageurls:
250-
if logger:
251-
logger(f"Could not fetch package versions for package: {name}")
261+
if logger and LOG_PACKAGEURL_DETAILS:
262+
logger(f"Package versions not present for package: {name}")
263+
264+
# We don't want to try fetching versions for these again
265+
packages_mined.append(name)
252266
continue
253267

254268
# get repo and path for package
255269
base_purl = PackageURL(type=PYPI_TYPE, name=name).to_string()
256270
package_base_dir = get_package_base_dir(purl=base_purl)
257271

258-
if logger:
272+
if logger and LOG_PACKAGEURL_DETAILS:
259273
logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}")
260274
purls_string = " ".join(packageurls)
261275
logger(f"packageURLs: {purls_string}")
@@ -289,34 +303,40 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None):
289303
# Push changes to remote repository
290304
push_changes(repo=cloned_data_repo)
291305

292-
# If we are mining the packages initially to sync with the index,
293306
# We need to update the mined packages checkpoint for every batch
294-
if state != PERIODIC_SYNC_STATE:
295-
if logger:
296-
logger("Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
297-
298-
packages_checkpoint = packages_mined + synced_packages
299-
update_mined_packages_in_checkpoint(
300-
packages=packages_checkpoint,
301-
cloned_repo=cloned_config_repo,
302-
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
303-
)
307+
if logger:
308+
logger(f"Checkpointing processed packages to: {PYPI_PACKAGES_CHECKPOINT_PATH}")
309+
310+
update_mined_packages_in_checkpoint(
311+
packages=packages_mined,
312+
cloned_repo=cloned_config_repo,
313+
config_repo=MINECODE_PIPELINES_CONFIG_REPO,
314+
checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH,
315+
)
304316

305317
# If we are finished mining all the packages in the initial sync, we can now
306318
# periodically sync the packages from latest
307319
if state == INITIAL_SYNC_STATE:
308320
if logger:
309321
logger(f"{INITIAL_SYNC_STATE} completed. starting: {PERIODIC_SYNC_STATE}")
322+
323+
state = PERIODIC_SYNC_STATE
310324
update_checkpoint_state(
311325
cloned_repo=cloned_config_repo,
312-
state=PERIODIC_SYNC_STATE,
326+
state=state,
327+
)
328+
# refresh packages checkpoint once to only checkpoint new packages
329+
update_checkpoints_in_github(
330+
checkpoint={"packages_mined": []},
331+
cloned_repo=cloned_config_repo,
332+
path=PYPI_PACKAGES_CHECKPOINT_PATH,
313333
)
314334

315335
# update last_serial to minecode checkpoints whenever we finish mining
316336
# either from checkpoints or from the latest pypi
317337
if logger:
318338
logger(f"Updating checkpoint at: {PYPI_CHECKPOINT_PATH} with last serial: {last_serial}")
319-
update_pypi_checkpoints(last_serial=last_serial, cloned_repo=cloned_config_repo)
339+
update_pypi_checkpoints(last_serial=last_serial, state=state, cloned_repo=cloned_config_repo)
320340

321341
repos_to_clean = [cloned_data_repo, cloned_config_repo]
322342
return repos_to_clean

pyproject-minecode_pipeline.toml renamed to pyproject-minecode_pipelines.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "flot.buildapi"
44

55
[project]
66
name = "minecode_pipelines"
7-
version = "0.0.1b2"
7+
version = "0.0.1b3"
88
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
99
readme = "minecode_pipelines/README.rst"
1010
license = { text = "Apache-2.0" }
@@ -51,7 +51,7 @@ mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo"
5151
mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian"
5252

5353
[tool.bumpversion]
54-
current_version = "0.0.1b1"
54+
current_version = "0.0.1b3"
5555
allow_dirty = true
5656

5757
files = [

0 commit comments

Comments
 (0)