Skip to content

Commit

Permalink
fix: drop unused indices (#1364)
Browse files Browse the repository at this point in the history
Drop unused indices:
    - logo_annotation_nearest_neighbors (8447 MB in prod)
    - image_prediction_data (unused, 3286 MB in prod)
    - product_insight_data (almost unused, 3047 MB in prod)
    - product_insight_predictor_version (unused, 517 MB in prod)
    - logo_annotation_bounding_box (unused, 1633 MB in prod)
    - image_prediction_max_confidence (useless, 280 MB in prod)
    - image_width (unused, 324 MB in prod)
    - image_height (unused, 310 MB in prod)

The `logo_annotation_nearest_neighbors` index currently (as of
2024-07-11) takes 8447 MB of space in production, just so that we can
once in a while get logo annotations without nearest neighbors
(nearest_neighbors is NULL).
  • Loading branch information
raphael0202 authored Jul 11, 2024
1 parent e6f8533 commit f0c348f
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 7 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -290,4 +290,7 @@ robotoff-cli: guard-args

# apply DB migrations
migrate-db:
${DOCKER_COMPOSE} run --rm --no-deps api python -m robotoff migrate-db
${DOCKER_COMPOSE} run --rm --no-deps api python -m robotoff migrate-db

create-migration: guard-args
${DOCKER_COMPOSE} run --rm --no-deps api python -m robotoff create-migration ${args}
52 changes: 52 additions & 0 deletions migrations/004_drop_indices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Peewee migrations -- 004_drop_indices.py.
"""

import peewee as pw
from peewee_migrate import Migrator


def migrate(migrator: Migrator, database: pw.Database, *, fake=False):
    """Drop indices that are unused (or nearly so) but expensive to store.

    Indices removed, with their size in production:
        - logo_annotation_nearest_neighbors (8447 MB in prod)
        - image_prediction_data (unused, 3286 MB in prod)
        - product_insight_data (almost unused, 3047 MB in prod)
        - product_insight_predictor_version (unused, 517 MB in prod)
        - logo_annotation_bounding_box (unused, 1633 MB in prod)
        - image_prediction_max_confidence (useless, 280 MB in prod)
        - image_width (unused, 324 MB in prod)
        - image_height (unused, 310 MB in prod)

    The `logo_annotation_nearest_neighbors` index currently (as of 2024-07-11)
    takes 8447 MB of space in production, just so that we can once in a while
    get logo annotations without nearest neighbors (nearest_neighbors is NULL).

    Args:
        migrator: migrator whose ``sql`` method receives each ``DROP INDEX``
            statement.
        database: target database; not used by this migration.
        fake: not used by this migration.
    """
    # The name of an index may differ depending on how it was created, so
    # every known spelling is listed. `IF EXISTS` lets the statement be issued
    # for a name that is not present.
    index_name_variants = (
        ("logo_annotation_nearest_neighbors", "logoannotation_nearest_neighbors"),
        ("image_prediction_data", "imageprediction_data"),
        ("logo_annotation_bounding_box", "logoannotation_bounding_box"),
        ("product_insight_data", "productinsight_data"),
        ("image_prediction_max_confidence", "imageprediction_max_confidence"),
        ("image_width", "imagemodel_width"),
        ("image_height", "imagemodel_height"),
        ("product_insight_predictor_version", "productinsight_predictor_version"),
    )

    for variants in index_name_variants:
        for index_name in variants:
            # Names are hard-coded constants above, never external input.
            migrator.sql(f"DROP INDEX IF EXISTS {index_name}")


def rollback(migrator: Migrator, database: pw.Database, *, fake=False):
    """Intentionally do nothing.

    The dropped indices take too long to build from a migration script, so
    rolling this migration back must be done manually.

    Args:
        migrator: not used.
        database: not used.
        fake: not used.
    """
3 changes: 3 additions & 0 deletions robotoff/app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,11 +1120,14 @@ def on_post(self, req: falcon.Request, resp: falcon.Response, logo_id: int):
insights_deleted = (
ProductInsight.delete()
.where(
# Speed-up filtering by providing additional filters
ProductInsight.barcode == logo.barcode,
ProductInsight.type == annotation_type,
# never delete annotated insights
ProductInsight.annotation.is_null(),
ProductInsight.predictor == "universal-logo-detector",
# We don't have an index on data, but the number of
# rows should be small enough to not be a problem
ProductInsight.data["logo_id"] == str(logo_id),
)
.execute()
Expand Down
11 changes: 5 additions & 6 deletions robotoff/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class ProductInsight(BaseModel):
# NOTE: there is no 1:1 mapping between the type and the JSON format
# provided here, for example for type==label, the data here could be:
# {"logo_id":X,"bounding_box":Y}, or {"text":X,"notify":Y}
data = BinaryJSONField(index=True, default=dict)
data = BinaryJSONField(default=dict)

# Timestamp is the timestamp of when this insight was imported into the DB.
timestamp = peewee.DateTimeField(null=True, index=True)
Expand Down Expand Up @@ -187,7 +187,7 @@ class ProductInsight(BaseModel):

# Predictor version is used to know what the version of the predictor
# that generated the prediction. It can be either a digit or a model name
predictor_version = peewee.CharField(max_length=100, null=True, index=True)
predictor_version = peewee.CharField(max_length=100, null=True)

# annotation campaigns enable contributors to focus their efforts (on
# Hunger Games) on a subset of products. Each product have 0+ campaign
Expand Down Expand Up @@ -262,8 +262,8 @@ class ImageModel(BaseModel):
# The complete image path can be constructed with
# robotoff.settings.OFF_IMAGE_BASE_URL + source_image.
source_image = peewee.TextField(null=False, index=True)
width = peewee.IntegerField(null=False, index=True)
height = peewee.IntegerField(null=False, index=True)
width = peewee.IntegerField(null=False)
height = peewee.IntegerField(null=False)
deleted = peewee.BooleanField(null=False, index=True, default=False)
server_type = peewee.CharField(null=True, max_length=10, index=True)
# Perceptual hash of the image, used to find near-duplicates
Expand Down Expand Up @@ -299,12 +299,11 @@ class ImagePrediction(BaseModel):
type = peewee.CharField(max_length=256)
model_name = peewee.CharField(max_length=100, null=False, index=True)
model_version = peewee.CharField(max_length=256, null=False, index=True)
data = BinaryJSONField(index=True)
data = BinaryJSONField()
timestamp = peewee.DateTimeField(null=True)
image = peewee.ForeignKeyField(ImageModel, null=False, backref="predictions")
max_confidence = peewee.FloatField(
null=True,
index=True,
help_text="for object detection models, confidence of the highest confident"
"object detected, null if no object was detected",
)
Expand Down

0 comments on commit f0c348f

Please sign in to comment.