Skip to content

Commit

Permalink
fix: add script to normalize barcodes in DB
Browse files Browse the repository at this point in the history
  • Loading branch information
raphael0202 committed Oct 8, 2024
1 parent 1730489 commit 4adeb6b
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 0 deletions.
119 changes: 119 additions & 0 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,5 +1020,124 @@ def launch_spellcheck_batch_job(
)


@app.command()
def launch_normalize_barcode_job():
    """Normalize the barcode and image path of rows stored in the database.

    The `Prediction`, `ProductInsight` and `ImageModel` tables are scanned in
    batches. For every row, the barcode is passed through
    `normalize_barcode` and `source_image` is regenerated with
    `generate_image_path` from the normalized barcode and the stem of the
    current image path. Rows are saved only when one of the two values
    changed, and the transaction is committed after each batch.
    """
    from pathlib import Path

    from openfoodfacts.images import generate_image_path
    from peewee import fn

    from robotoff.models import ImageModel, Prediction, ProductInsight, db
    from robotoff.off import normalize_barcode
    from robotoff.utils import get_logger

    logger = get_logger()
    logger.info("Starting barcode normalization job")

    batch_size = 10_000

    def normalize_table(model, cursor_name: str, label: str) -> None:
        """Normalize every row of `model`, paginating on `cursor_name`.

        :param model: the model class whose rows are normalized
        :param cursor_name: name of the model field used to order and
            paginate the rows (keyset pagination with a `>=` filter)
        :param label: plural name of the rows, only used in log messages
        """
        cursor_field = getattr(model, cursor_name)
        updated = 0
        # MIN() yields None on an empty table; the loop is then skipped
        # instead of failing on a comparison with None.
        cursor = model.select(fn.MIN(cursor_field)).scalar()

        with db.atomic() as tsx:
            while cursor is not None:
                batch_count = 0
                row = None
                # Rows whose cursor column is NULL never match the `>=`
                # filter and are therefore not visited.
                for row in (
                    model.select()
                    .where(cursor_field >= cursor)
                    .order_by(cursor_field.asc())
                    .limit(batch_size)
                ):
                    batch_count += 1
                    barcode = normalize_barcode(row.barcode)
                    # The path is rebuilt from the *normalized* barcode so
                    # that both columns stay consistent with each other.
                    # A missing image path is left as None.
                    source_image = (
                        generate_image_path(barcode, Path(row.source_image).stem)
                        if row.source_image
                        else None
                    )
                    if barcode != row.barcode or source_image != row.source_image:
                        row.barcode = barcode
                        row.source_image = source_image
                        row.save()
                        updated += 1

                tsx.commit()
                logger.info("Current ID: %s, Updated %d %s", cursor, updated, label)

                # A short (or empty) batch means the end of the table was
                # reached.
                if batch_count < batch_size:
                    break

                next_cursor = getattr(row, cursor_name)
                if next_cursor == cursor:
                    # A full batch sharing a single cursor value cannot be
                    # paginated with `>=`: stop rather than loop forever.
                    logger.warning(
                        "Cursor %s did not advance for %s, stopping", cursor, label
                    )
                    break
                # The boundary row is fetched again by the next batch; it is
                # already normalized at that point, so it is not re-saved.
                cursor = next_cursor

        logger.info("Updated %d %s", updated, label)

    with db.connection_context():
        normalize_table(Prediction, "id", "predictions")
        normalize_table(ProductInsight, "timestamp", "insights")
        normalize_table(ImageModel, "id", "images")


def main() -> None:
    """Run the command-line application by invoking `app`."""
    app()
19 changes: 19 additions & 0 deletions robotoff/off.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,25 @@ def is_valid_image(product_id: ProductIdentifier, image_id: str) -> bool:
return image_id in images


def normalize_barcode(barcode: str) -> str:
    """Return the normalized form of a barcode.

    Leading zeros are stripped first. The remaining digits are then
    left-padded with zeros to 8 characters, or to 13 characters when more
    than 8 characters remain after stripping. Values longer than 13
    characters are returned without padding.

    :param barcode: the barcode to normalize
    :return: the normalized barcode
    """
    significant = barcode.lstrip("0")
    # Short codes are padded to 8 characters, longer ones to 13.
    target_width = 13 if len(significant) > 8 else 8
    return significant.zfill(target_width)


def off_credentials() -> dict[str, str]:
    """Return the user id and password read from `settings`.

    :return: a dict with the keys `user_id` and `password`
    """
    credentials = {
        "user_id": settings._off_user,
        "password": settings._off_password,
    }
    return credentials

Expand Down

0 comments on commit 4adeb6b

Please sign in to comment.