Skip to content

Commit

Permalink
fix: fix the absence of matching synonyms during category insight import (#1497)
Browse files Browse the repository at this point in the history

* fix: download newer version of taxonomy if available

* fix: fix the absence of matching synonyms during category insight import

- use create_taxonomy_mapping from SDK
- use taxonomy_mapping for categories to match the canonical ID
  • Loading branch information
raphael0202 authored Dec 12, 2024
1 parent fb6c5dc commit 66c5322
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 130 deletions.
4 changes: 2 additions & 2 deletions data/taxonomies/categories.full.json.gz
Git LFS file not shown
17 changes: 8 additions & 9 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ lark = "~1.1.4"
h5py = "~3.8.0"
opencv-python-headless = "~4.10.0.84"
toml = "~0.10.2"
openfoodfacts = "2.4.0"
openfoodfacts = "2.5.0"
imagehash = "~4.3.1"
peewee-migrate = "~1.12.2"
diskcache = "~5.6.3"
Expand Down
27 changes: 21 additions & 6 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,16 +835,31 @@ def generate_candidates(
predictions: list[Prediction],
product_id: ProductIdentifier,
) -> Iterator[ProductInsight]:
candidates = [
prediction
for prediction in predictions
if cls.is_prediction_valid(product, prediction.value_tag) # type: ignore
]
taxonomy = get_taxonomy(InsightType.category.name)
# Keep only predictions whose category tag resolves to a canonical taxonomy
# ID and that are still valid for this product.
selected_candidates = []
for prediction in predictions:
    if prediction.value_tag is None:
        logger.warning(
            "Unexpected None `value_tag` (prediction: %s)", prediction
        )
        continue

    # Hold on to the raw tag: `match_taxonomized_value` returns None when
    # nothing matches, and the warning below must report the tag that
    # failed to match rather than that None result.
    raw_value_tag = prediction.value_tag
    canonical_value_tag = match_taxonomized_value(
        raw_value_tag, TaxonomyType.category.name
    )
    if canonical_value_tag is None:
        logger.warning("Could not match %s (category)", raw_value_tag)
        continue

    # Only overwrite the tag once a canonical ID is known, so unmatched
    # predictions are left untouched instead of being clobbered with None.
    prediction.value_tag = canonical_value_tag
    if cls.is_prediction_valid(product, canonical_value_tag):
        selected_candidates.append(prediction)

yield from (
ProductInsight(**candidate.to_dict())
for candidate in select_deepest_taxonomized_candidates(candidates, taxonomy)
for candidate in select_deepest_taxonomized_candidates(
selected_candidates, taxonomy
)
)

@staticmethod
Expand Down
36 changes: 17 additions & 19 deletions robotoff/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import collections
from typing import Optional

from cachetools.func import ttl_cache
from openfoodfacts.taxonomy import Taxonomy
from openfoodfacts.taxonomy import (
Taxonomy,
create_brand_taxonomy_mapping,
create_taxonomy_mapping,
)
from openfoodfacts.taxonomy import get_taxonomy as _get_taxonomy
from openfoodfacts.types import TaxonomyType

from robotoff import settings
from robotoff.utils import get_logger
from robotoff.utils.cache import function_cache_register
from robotoff.utils.text import get_tag

logger = get_logger(__name__)

Expand Down Expand Up @@ -65,6 +67,7 @@ def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Ta
return _get_taxonomy(
taxonomy_type_enum,
force_download=False,
download_newer=True,
cache_dir=settings.DATA_DIR / "taxonomies",
)

Expand All @@ -84,29 +87,24 @@ def get_taxonomy_mapping(taxonomy_type: str) -> dict[str, str]:
"""
logger.debug("Loading taxonomy mapping %s...", taxonomy_type)
taxonomy = get_taxonomy(taxonomy_type)
ids: dict[str, str] = {}

for key in taxonomy.keys():
if taxonomy_type == TaxonomyType.brand.name:
unprefixed_key = key
if is_prefixed_value(key):
unprefixed_key = key[3:]
ids[unprefixed_key] = taxonomy[key].names["en"]
else:
for lang, name in taxonomy[key].names.items():
tag = get_tag(name)
ids[f"{lang}:{tag}"] = key
if taxonomy_type == TaxonomyType.brand.name:
return create_brand_taxonomy_mapping(taxonomy)
else:
return create_taxonomy_mapping(taxonomy)

return ids


def match_taxonomized_value(value_tag: str, taxonomy_type: str) -> Optional[str]:
def match_taxonomized_value(value_tag: str, taxonomy_type: str) -> str | None:
"""Return the canonical taxonomized value of a `value_tag` (if any) or
return None if no match was found or if the type is unsupported.
Currently it only works for brand and label.
Currently it only works for brand, label and category taxonomies.
"""
if taxonomy_type not in (TaxonomyType.brand.name, TaxonomyType.label.name):
if taxonomy_type not in (
TaxonomyType.brand.name,
TaxonomyType.label.name,
TaxonomyType.category.name,
):
return None

taxonomy = get_taxonomy(taxonomy_type)
Expand Down
7 changes: 7 additions & 0 deletions tests/unit/insights/test_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,13 @@ def test_is_parent_category(self, category, to_check_categories, expected, mocke
Product({"code": DEFAULT_BARCODE, "categories_tags": ["en:meats"]}),
[],
),
(
[
Prediction(PredictionType.category, value_tag="en:shelled-almonds"),
],
Product({"code": DEFAULT_BARCODE, "categories_tags": []}),
["en:almonds-shelled"],
),
(
[
Prediction(
Expand Down
97 changes: 4 additions & 93 deletions tests/unit/test_taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,6 @@
import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy, TaxonomyType, match_taxonomized_value

label_taxonomy = Taxonomy.from_path(settings.TAXONOMY_PATHS["label"])
category_taxonomy = Taxonomy.from_path(settings.TAXONOMY_PATHS["category"])


class TestTaxonomy:
@pytest.mark.parametrize(
"taxonomy,item,candidates,output",
[
(label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
(label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
(label_taxonomy, "en:fr-bio-01", [], False),
(label_taxonomy, "en:organic", {"en:gluten-free"}, False),
(
label_taxonomy,
"en:organic",
{"en:gluten-free", "en:no-additives", "en:vegan"},
False,
),
(
label_taxonomy,
"en:organic",
{"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
True,
),
],
)
def test_is_child_of_any(
self, taxonomy: Taxonomy, item: str, candidates: list, output: bool
):
assert taxonomy.is_parent_of_any(item, candidates) is output

def test_is_child_of_any_unknwon_item(self):
with pytest.raises(ValueError):
label_taxonomy.is_parent_of_any("unknown-id", set())

@pytest.mark.parametrize(
"taxonomy,item,output",
[
(category_taxonomy, "en:plant-based-foods-and-beverages", set()),
(
category_taxonomy,
"en:plant-based-foods",
{"en:plant-based-foods-and-beverages"},
),
(
category_taxonomy,
"en:brown-rices",
{
"en:rices",
"en:cereal-grains",
"en:cereals-and-their-products",
"en:cereals-and-potatoes",
"en:plant-based-foods",
"en:plant-based-foods-and-beverages",
"en:seeds",
},
),
],
)
def test_get_parents_hierarchy(
self, taxonomy: Taxonomy, item: str, output: set[str]
):
node = taxonomy[item]
parents = node.get_parents_hierarchy()
assert set((x.id for x in parents)) == output

@pytest.mark.parametrize(
"taxonomy,items,output",
[
(category_taxonomy, [], []),
(category_taxonomy, ["en:brown-rices"], ["en:brown-rices"]),
(category_taxonomy, ["en:brown-rices", "en:rices"], ["en:brown-rices"]),
(
category_taxonomy,
["en:brown-rices", "en:rices", "en:cereal-grains"],
["en:brown-rices"],
),
(
category_taxonomy,
["en:brown-rices", "en:teas", "en:cereal-grains"],
["en:brown-rices", "en:teas"],
),
],
)
def test_find_deepest_nodes(
self, taxonomy: Taxonomy, items: list[str], output: list[str]
):
item_nodes = [taxonomy[item] for item in items]
output_nodes = [taxonomy[o] for o in output]
assert taxonomy.find_deepest_nodes(item_nodes) == output_nodes
from robotoff.taxonomy import TaxonomyType, match_taxonomized_value


@pytest.mark.parametrize(
Expand All @@ -114,6 +22,9 @@ def test_find_deepest_nodes(
(TaxonomyType.label.name, "unknown-label", None),
(TaxonomyType.label.name, "fr:viande-bovine-francaise", "en:french-beef"),
(TaxonomyType.ingredient.name, "text", None), # unsupported taxonomy
# en:almonds-shelled is the canonical ID, we check here that synonyms are
# matched
(TaxonomyType.category.name, "en:shelled-almonds", "en:almonds-shelled"),
],
)
def test_match_taxonomized_value(taxonomy_type, value, expected):
Expand Down

0 comments on commit 66c5322

Please sign in to comment.