diff --git a/Makefile b/Makefile index 875fb8cb68..c67e87874c 100644 --- a/Makefile +++ b/Makefile @@ -267,6 +267,10 @@ prune_cache: clean: goodbye hdown prune prune_cache +# clean tests, remove containers and volume (useful if you changed env variables, etc.) +clean_tests: + ${DOCKER_COMPOSE_TEST} down -v --remove-orphans + #-----------# # Utilities # #-----------# diff --git a/data/grammars/terminal_allergen_de.lark b/data/grammars/terminal_allergen_de.lark new file mode 100644 index 0000000000..1ddd8bc83c --- /dev/null +++ b/data/grammars/terminal_allergen_de.lark @@ -0,0 +1,302 @@ +// This file has been generated automatically, DO NOT EDIT! +ALLERGEN_DE.1: /\bgluten enthaltendes getreide\b/ // "en:gluten" + | /\bgluten-enthaltendes-getreide\b/ // "en:gluten" + | /\bvollmilchschokolade[üu]berzug\b/ // "en:milk" + | /\bschwefeldioxid und sulfite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bmagermilchjoghurtpulver\b/ // "en:milk" + | /\banderen schalenfr[üu]chten\b/ // "en:nuts" + | /\bh[üu]hnerei-trockeneiwei(ß|ss)\b/ // "en:eggs" + | /\bh[üu]hnerei-eiweisspulver\b/ // "en:eggs" + | /\bh[üu]hnertrockeneieiweiss\b/ // "en:eggs" + | /\bmagermilchpulverzusatz\b/ // "en:milk" + | /\bvollkorn-weizenflocken\b/ // "en:gluten" + | /\bweitere schalenfr[üu]chte\b/ // "en:nuts" + | /\bh[üu]hnerei-eiwei(ß|ss)pulver\b/ // "en:eggs" + | /\bh[üu]hnertrockeneieiwei(ß|ss)\b/ // "en:eggs" + | /\bsojaproteinhydrolysat\b/ // "en:soybeans" + | /\bsojaeiweisskonzentrat\b/ // "en:soybeans" + | /\bmilcheiweisserzeugnis\b/ // "en:milk" + | /\bvollkorn-haferflocken\b/ // "en:gluten" + | /\bweizenvollkornflocken\b/ // "en:gluten" + | /\broggenvollkornflocken\b/ // "en:gluten" + | /\bdinkelvollkornflocken\b/ // "en:gluten" + | /\balaska-seelachs-filet\b/ // "en:fish" + | /\bh[üu]hnereieiwei(ß|ss)pulver\b/ // "en:eggs" + | /\bh[üu]hnertrockeneiweiss\b/ // "en:eggs" + | /\bsojaeiwei(ß|ss)konzentrat\b/ // "en:soybeans" + | /\bmagermilchkonzentrat\b/ // "en:milk" + | /\bmilcheiwei(ß|ss)erzeugnis\b/ // "en:milk" + | /\bhafervollkornflocken\b/ // "en:gluten" + | /\bvollkornhaferflocken\b/ // "en:gluten" + | /\broggenvollkornschrot\b/ // "en:gluten" + | /\bweizenvollkornschrot\b/ // "en:gluten" + | /\bdinkelvollkornschrot\b/ // "en:gluten" + | /\balaska-seelachsfilet\b/ // "en:fish" + | /\bh[üu]hnertrockeneiwei(ß|ss)\b/ // "en:eggs" + | /\bh[üu]hnereiweisspulver\b/ // "en:eggs" + | /\bvollmilchschokolade\b/ // "en:milk" + | /\bgerstenvollkornmehl\b/ // "en:gluten" + | /\bvollkornhaferschrot\b/ // "en:gluten" + | /\bh[üu]hnervolleipulver\b/ // "en:eggs" + | /\bh[üu]hnereigelbpulver\b/ // "en:eggs" + | /\bh[üu]hnereiklarpulver\b/ // "en:eggs" + | /\bh[üu]hnereiwei(ß|ss)pulver\b/ // "en:eggs" + | /\bhafer-vollkornmehl\b/ // "en:gluten" + | /\bvollkorn-hafermehl\b/ // "en:gluten" + | /\bvollkornweizenmehl\b/ // "en:gluten" + | /\bweizenvollkornmehl\b/ // "en:gluten" + | /\bgerstenmalzextrakt\b/ // "en:gluten" + | /\broggenvollkornmehl\b/ // "en:gluten" + | /\bdinkelvollkornmehl\b/ // "en:gluten" + | /\bweizenrostmalzmehl\b/ // "en:gluten" + | /\bsojaproteinisolat\b/ // "en:soybeans" + | /\bbuttermilchpulver\b/ // "en:milk" + | /\bmilcherzeugnissen\b/ // "en:milk" + | /\bmagermilchjoghurt\b/ // "en:milk" + | /\bkondensmagermilch\b/ // "en:milk" + | /\bhafervollkornmehl\b/ // "en:gluten" + | /\bvollkornhafermehl\b/ // "en:gluten" + | /\bweizenspeisekleie\b/ // "en:gluten" + | /\bsojasossenpulver\b/ // "en:soybeans" + | /\bsojaeiwei(ß|ss)isolat\b/ // "en:soybeans" + | /\bblauschimmelk[äa]se\b/ // "en:milk" + | 
/\bmilcherzeugnisse\b/ // "en:milk" + | /\bmagermilchpulver\b/ // "en:milk" + | /\bmolkeneiwei(ß|ss)k[äa]se\b/ // "en:milk" + | /\bs[üu]ssmolkenpulver\b/ // "en:milk" + | /\bjoghurterzeugnis\b/ // "en:milk" + | /\barachis hypogaea\b/ // "en:peanuts" + | /\bhartweizengriess\b/ // "en:gluten" + | /\bh[üu]hnerei-eiwei(ß|ss)\b/ // "en:eggs" + | /\bsojaso(ß|ss)enpulver\b/ // "en:soybeans" + | /\bmolkenerzeugnis\b/ // "en:milk" + | /\bmilchschokolade\b/ // "en:milk" + | /\bsauermilchquark\b/ // "en:milk" + | /\bsauerrahmpulver\b/ // "en:milk" + | /\bs[üu](ß|ss)molkenpulver\b/ // "en:milk" + | /\bvollmilchpulver\b/ // "en:milk" + | /\bs[üu]ssmolkepulver\b/ // "en:milk" + | /\bweizensauerteig\b/ // "en:gluten" + | /\bhartweizengrie(ß|ss)\b/ // "en:gluten" + | /\bgerstenmalzmehl\b/ // "en:gluten" + | /\bweizenquellmehl\b/ // "en:gluten" + | /\bweichweizenmehl\b/ // "en:gluten" + | /\bschalenfr[üu]chten\b/ // "en:nuts" + | /\bmandelst[üu]ckchen\b/ // "en:nuts" + | /\bqueenslandn[üu]sse\b/ // "en:nuts" + | /\bhaselnussst[üu]cke\b/ // "en:nuts" + | /\bstaudensellerie\b/ // "en:celery" + | /\bstangensellerie\b/ // "en:celery" + | /\bknollensellerie\b/ // "en:celery" + | /\bselleriebl[äa]tter\b/ // "en:celery" + | /\bsellerieextrakt\b/ // "en:celery" + | /\balaska-seelachs\b/ // "en:fish" + | /\bh[üu]hnereieiwei(ß|ss)\b/ // "en:eggs" + | /\bh[üu]hnereieigelb\b/ // "en:eggs" + | /\bsoja-lecithine\b/ // "en:soybeans" + | /\bbutterreinfett\b/ // "en:milk" + | /\bs[üu](ß|ss)molkepulver\b/ // "en:milk" + | /\bvollkorn-hafer\b/ // "en:gluten" + | /\bweizenmalzmehl\b/ // "en:gluten" + | /\bgerstenflocken\b/ // "en:gluten" + | /\bvollkornweizen\b/ // "en:gluten" + | /\bschalenfr[üu]chte\b/ // "en:nuts" + | /\bhaselnussmasse\b/ // "en:nuts" + | /\bhaselnusskerne\b/ // "en:nuts" + | /\bhaselnusspaste\b/ // "en:nuts" + | /\bmacadamian[üu]sse\b/ // "en:nuts" + | /\bschwefeldioxid\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bs[üu](ß|ss)lupinenmehl\b/ // "en:lupin" + | /\bmakrelenfilets\b/ // "en:fish" + | /\bsojalecithine\b/ // "en:soybeans" + | /\bsojaerzeugnis\b/ // "en:soybeans" + | /\bjoghurtpulver\b/ // "en:milk" + | /\bmolkeneiweiss\b/ // "en:milk" + | /\bmolkenprotein\b/ // "en:milk" + | /\bbuttereinfett\b/ // "en:milk" + | /\bvollkornhafer\b/ // "en:gluten" + | /\bweizenflocken\b/ // "en:gluten" + | /\broggenflocken\b/ // "en:gluten" + | /\bdinkelflocken\b/ // "en:gluten" + | /\bhaselnussmark\b/ // "en:nuts" + | /\bparanusskerne\b/ // "en:nuts" + | /\bsellerieblatt\b/ // "en:celery" + | /\bselleriesamen\b/ // "en:celery" + | /\bsesambr[öo]tchen\b/ // "en:sesame-seeds" + | /\bheringsfilets\b/ // "en:fish" + | /\bmakrelenfilet\b/ // "en:fish" + | /\beigelbpulver\b/ // "en:eggs" + | /\beiklarpulver\b/ // "en:eggs" + | /\bh[üu]hnereiwei(ß|ss)\b/ // "en:eggs" + | /\bh[üu]hnervollei\b/ // "en:eggs" + | /\bvolleipulver\b/ // "en:eggs" + | /\bh[üu]hnereigelb\b/ // "en:eggs" + | /\bh[üu]hnereiklar\b/ // "en:eggs" + | /\bsojalecithin\b/ // "en:soybeans" + | /\bsojabohnen[öo]l\b/ // "en:soybeans" + | /\bkondensmilch\b/ // "en:milk" + | /\bkuhvollmilch\b/ // "en:milk" + | /\bmolkenpulver\b/ // "en:milk" + | /\bsahnejoghurt\b/ // "en:milk" + | /\bk[äa]sereimilch\b/ // "en:milk" + | /\bmilchprotein\b/ // "en:milk" + | /\bmolkeneiwei(ß|ss)\b/ // "en:milk" + | /\berdnusskerne\b/ // "en:peanuts" + | /\bhaferflocken\b/ // "en:gluten" + | /\bweizengluten\b/ // "en:gluten" + | /\bweizenst[äa]rke\b/ // "en:gluten" + | /\bweizenkleber\b/ // "en:gluten" + | /\bweizengriess\b/ // "en:gluten" + | /\broggenschrot\b/ // "en:gluten" 
+ | /\bmandelst[üu]cke\b/ // "en:nuts" + | /\bwalnusskerne\b/ // "en:nuts" + | /\bandere n[üu]sse\b/ // "en:nuts" + | /\bselleriesaft\b/ // "en:celery" + | /\bheringsfilet\b/ // "en:fish" + | /\bsojaprotein\b/ // "en:soybeans" + | /\bsojaflocken\b/ // "en:soybeans" + | /\bbuttermilch\b/ // "en:milk" + | /\bedamer-k[äa]se\b/ // "en:milk" + | /\bmilchzucker\b/ // "en:milk" + | /\bmilcheiwei(ß|ss)\b/ // "en:milk" + | /\bmilchpulver\b/ // "en:milk" + | /\bspeisequark\b/ // "en:milk" + | /\bsahnepulver\b/ // "en:milk" + | /\brahmjoghurt\b/ // "en:milk" + | /\bschlagsahne\b/ // "en:milk" + | /\bziegenmilch\b/ // "en:milk" + | /\bschafsmilch\b/ // "en:milk" + | /\bmolkepulver\b/ // "en:milk" + | /\bgerstenmehl\b/ // "en:gluten" + | /\bgerstenmalz\b/ // "en:gluten" + | /\bweizengrie(ß|ss)\b/ // "en:gluten" + | /\bweizenkleie\b/ // "en:gluten" + | /\bmandelkerne\b/ // "en:nuts" + | /\bhaseln[üu]ssen\b/ // "en:nuts" + | /\bkaschun[üu]sse\b/ // "en:nuts" + | /\bcashewkerne\b/ // "en:nuts" + | /\bcashewn[üu]sse\b/ // "en:nuts" + | /\bsesamk[öo]rner\b/ // "en:sesame-seeds" + | /\blupinenmehl\b/ // "en:lupin" + | /\blachsfilets\b/ // "en:fish" + | /\bkrebstieren\b/ // "en:crustaceans" + | /\bh[üu]hnereier\b/ // "en:eggs" + | /\bsojabohnen\b/ // "en:soybeans" + | /\bsojaeiwei(ß|ss)\b/ // "en:soybeans" + | /\bsojaschrot\b/ // "en:soybeans" + | /\bemmentaler\b/ // "en:milk" + | /\bfrischk[äa]se\b/ // "en:milk" + | /\bgorgonzola\b/ // "en:milk" + | /\bk[äa]sepulver\b/ // "en:milk" + | /\bmagermilch\b/ // "en:milk" + | /\bmagerquark\b/ // "en:milk" + | /\bmozzarella\b/ // "en:milk" + | /\bsauermilch\b/ // "en:milk" + | /\bbutterfett\b/ // "en:milk" + | /\bschafmilch\b/ // "en:milk" + | /\bsauermolke\b/ // "en:milk" + | /\bhaferkleie\b/ // "en:gluten" + | /\bweizenmehl\b/ // "en:gluten" + | /\bhartweizen\b/ // "en:gluten" + | /\broggenmehl\b/ // "en:gluten" + | /\bdinkelmehl\b/ // "en:gluten" + | /\bweizenmalz\b/ // "en:gluten" + | /\bhaferfaser\b/ // "en:gluten" + | /\bhaseln[üu]sse\b/ // "en:nuts" + | /\bpecann[üu]sse\b/ // "en:nuts" + | /\bsesamsamen\b/ // "en:sesame-seeds" + | /\bsesampaste\b/ // "en:sesame-seeds" + | /\bsenfk[öo]rner\b/ // "en:mustard" + | /\bsenfsaaten\b/ // "en:mustard" + | /\bsenfschrot\b/ // "en:mustard" + | /\blachsfilet\b/ // "en:fish" + | /\bkrebstiere\b/ // "en:crustaceans" + | /\bweichtiere\b/ // "en:molluscs" + | /\bsojamilch\b/ // "en:soybeans" + | /\bsojakerne\b/ // "en:soybeans" + | /\bsojakleie\b/ // "en:soybeans" + | /\bsojasauce\b/ // "en:soybeans" + | /\bsojagrie(ß|ss)\b/ // "en:soybeans" + | /\bsojasosse\b/ // "en:soybeans" + | /\bcamembert\b/ // "en:milk" + | /\bsauerrahm\b/ // "en:milk" + | /\bvollmilch\b/ // "en:milk" + | /\bs[üu]ssmolke\b/ // "en:milk" + | /\bmilchfett\b/ // "en:milk" + | /\berdn[üu]ssen\b/ // "en:peanuts" + | /\bhafermehl\b/ // "en:gluten" + | /\bwalnuss[öo]l\b/ // "en:nuts" + | /\bparan[üu]sse\b/ // "en:nuts" + | /\bpistazien\b/ // "en:nuts" + | /\bhaselnuss\b/ // "en:nuts" + | /\berdnuss[öo]l\b/ // "en:nuts" + | /\bsesamkorn\b/ // "en:sesame-seeds" + | /\bsenfsamen\b/ // "en:mustard" + | /\bthunfisch\b/ // "en:fish" + | /\beipulver\b/ // "en:eggs" + | /\bh[üu]hnerei\b/ // "en:eggs" + | /\bfrischei\b/ // "en:eggs" + | /\bsojafett\b/ // "en:soybeans" + | /\bsojamehl\b/ // "en:soybeans" + | /\bsojaso(ß|ss)e\b/ // "en:soybeans" + | /\bbergk[äa]se\b/ // "en:milk" + | /\bhartk[äa]se\b/ // "en:milk" + | /\bkuhmilch\b/ // "en:milk" + | /\brohmilch\b/ // "en:milk" + | /\bpecorino\b/ // "en:milk" + | /\bs[üu](ß|ss)molke\b/ // "en:milk" + | 
/\berdn[üu]sse\b/ // "en:peanuts" + | /\bmalzmehl\b/ // "en:gluten" + | /\bwaln[üu]sse\b/ // "en:nuts" + | /\bsulphite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bsellerie\b/ // "en:celery" + | /\bsenfmehl\b/ // "en:mustard" + | /\bsenfsaat\b/ // "en:mustard" + | /\bsenfkorn\b/ // "en:mustard" + | /\beiweiss\b/ // "en:eggs" + | /\bcheddar\b/ // "en:milk" + | /\bjoghurt\b/ // "en:milk" + | /\blaktose\b/ // "en:milk" + | /\bricotta\b/ // "en:milk" + | /\berdnuss\b/ // "en:peanuts" + | /\bgersten\b/ // "en:gluten" + | /\bmandeln\b/ // "en:nuts" + | /\bsulfite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bsesam[öo]l\b/ // "en:sesame-seeds" + | /\blupinen\b/ // "en:lupin" + | /\bmakrele\b/ // "en:fish" + | /\bfisch[öo]l\b/ // "en:fish" + | /\beigelb\b/ // "en:eggs" + | /\beiklar\b/ // "en:eggs" + | /\bvollei\b/ // "en:eggs" + | /\beiwei(ß|ss)\b/ // "en:eggs" + | /\bsoja[öo]l\b/ // "en:soybeans" + | /\bbutter\b/ // "en:milk" + | /\bedamer\b/ // "en:milk" + | /\bgluten\b/ // "en:gluten" + | /\bweizen\b/ // "en:gluten" + | /\broggen\b/ // "en:gluten" + | /\bgerste\b/ // "en:gluten" + | /\bdinkel\b/ // "en:gluten" + | /\bmandel\b/ // "en:nuts" + | /\bfische\b/ // "en:fish" + | /\bhering\b/ // "en:fish" + | /\beiern\b/ // "en:eggs" + | /\bmilch\b/ // "en:milk" + | /\bgouda\b/ // "en:milk" + | /\bquark\b/ // "en:milk" + | /\bsahne\b/ // "en:milk" + | /\bmolke\b/ // "en:milk" + | /\bhafer\b/ // "en:gluten" + | /\bkamut\b/ // "en:gluten" + | /\bn[üu]sse\b/ // "en:nuts" + | /\bsesam\b/ // "en:sesame-seeds" + | /\bfisch\b/ // "en:fish" + | /\blachs\b/ // "en:fish" + | /\beier\b/ // "en:eggs" + | /\bsoja\b/ // "en:soybeans" + | /\bk[äa]se\b/ // "en:milk" + | /\brahm\b/ // "en:milk" + | /\bsenf\b/ // "en:mustard" + | /\bei\b/ // "en:eggs" + diff --git a/data/grammars/terminal_allergen_en.lark b/data/grammars/terminal_allergen_en.lark new file mode 100644 index 0000000000..15124c43a0 --- /dev/null +++ b/data/grammars/terminal_allergen_en.lark @@ -0,0 +1,126 @@ +// This file has been generated automatically, DO NOT EDIT! 
+ALLERGEN_EN.1: /\bother cereals containing gluten\b/ // "en:gluten" + | /\bsulphur dioxide and sulphites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bcereals containing gluten\b/ // "en:gluten" + | /\bmilk chocolate coating\b/ // "en:milk" + | /\bmalted barley extract\b/ // "en:gluten" + | /\bsoy protein isolate\b/ // "en:soybeans" + | /\bparmigiano reggiano\b/ // "en:milk" + | /\bmalted barley flour\b/ // "en:gluten" + | /\bbarley malt flour\b/ // "en:gluten" + | /\barachis hypogaea\b/ // "en:peanuts" + | /\bqueensland nuts\b/ // "en:nuts" + | /\bsulphur dioxide\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bsoya lecithins\b/ // "en:soybeans" + | /\bsoy lecithines\b/ // "en:soybeans" + | /\bwheat semolina\b/ // "en:gluten" + | /\bpistachio nuts\b/ // "en:nuts" + | /\bmacadamia nuts\b/ // "en:nuts" + | /\bsoy lecithins\b/ // "en:soybeans" + | /\bsoya lecithin\b/ // "en:soybeans" + | /\bsoya products\b/ // "en:soybeans" + | /\bmalted barley\b/ // "en:gluten" + | /\bsoy lecithin\b/ // "en:soybeans" + | /\bgrana padano\b/ // "en:milk" + | /\bmilk protein\b/ // "en:milk" + | /\bsesame seeds\b/ // "en:sesame-seeds" + | /\bmilk powder\b/ // "en:milk" + | /\bwheat flour\b/ // "en:gluten" + | /\bbrazil nuts\b/ // "en:nuts" + | /\bcrustaceans\b/ // "en:crustaceans" + | /\begg whites\b/ // "en:eggs" + | /\bwhole eggs\b/ // "en:eggs" + | /\bsoya flour\b/ // "en:soybeans" + | /\bbuttermilk\b/ // "en:milk" + | /\bspeltflour\b/ // "en:gluten" + | /\bwheatflour\b/ // "en:gluten" + | /\bpecan nuts\b/ // "en:nuts" + | /\bother nuts\b/ // "en:nuts" + | /\bcuttlefish\b/ // "en:molluscs" + | /\begg white\b/ // "en:eggs" + | /\begg yolks\b/ // "en:eggs" + | /\bwhole egg\b/ // "en:eggs" + | /\bsoya bean\b/ // "en:soybeans" + | /\bsoy flour\b/ // "en:soybeans" + | /\brye flour\b/ // "en:gluten" + | /\boat fiber\b/ // "en:gluten" + | /\bhazelnuts\b/ // "en:nuts" + | /\bpistachio\b/ // "en:nuts" + | /\bmacadamia\b/ // "en:nuts" + | /\btree nuts\b/ // "en:nuts" + | /\bsulphites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bbarn egg\b/ // "en:eggs" + | /\begg yolk\b/ // "en:eggs" + | /\bsoybeans\b/ // "en:soybeans" + | /\btreenuts\b/ // "en:nuts" + | /\bsulfites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bceleriac\b/ // "en:celery" + | /\bbrassica\b/ // "en:mustard" + | /\bmackerel\b/ // "en:fish" + | /\bflounder\b/ // "en:fish" + | /\bsardines\b/ // "en:fish" + | /\bcrayfish\b/ // "en:crustaceans" + | /\bmolluscs\b/ // "en:molluscs" + | /\bmollusks\b/ // "en:molluscs" + | /\bscallops\b/ // "en:molluscs" + | /\blactose\b/ // "en:milk" + | /\byoghurt\b/ // "en:milk" + | /\bpeanuts\b/ // "en:peanuts" + | /\balmonds\b/ // "en:nuts" + | /\bwalnuts\b/ // "en:nuts" + | /\bcashews\b/ // "en:nuts" + | /\bmustard\b/ // "en:mustard" + | /\bhalibut\b/ // "en:fish" + | /\bhaddock\b/ // "en:fish" + | /\bsardine\b/ // "en:fish" + | /\blobster\b/ // "en:crustaceans" + | /\bmollusc\b/ // "en:molluscs" + | /\bmollusk\b/ // "en:molluscs" + | /\boysters\b/ // "en:molluscs" + | /\bmussels\b/ // "en:molluscs" + | /\bscallop\b/ // "en:molluscs" + | /\bbutter\b/ // "en:milk" + | /\byogurt\b/ // "en:milk" + | /\bcheese\b/ // "en:milk" + | /\bpeanut\b/ // "en:peanuts" + | /\bgluten\b/ // "en:gluten" + | /\bbarley\b/ // "en:gluten" + | /\bcashew\b/ // "en:nuts" + | /\bcelery\b/ // "en:celery" + | /\bsesame\b/ // "en:sesame-seeds" + | /\blupine\b/ // "en:lupin" + | /\bfishes\b/ // "en:fish" + | /\bturbot\b/ // "en:fish" + | /\bsalmon\b/ // "en:fish" + | /\bshrimp\b/ // "en:crustaceans" + | /\boyster\b/ // "en:molluscs" + | 
/\bmussel\b/ // "en:molluscs" + | /\bdairy\b/ // "en:milk" + | /\bcream\b/ // "en:milk" + | /\bkamut\b/ // "en:gluten" + | /\bspelt\b/ // "en:gluten" + | /\bwheat\b/ // "en:gluten" + | /\bpecan\b/ // "en:nuts" + | /\blupin\b/ // "en:lupin" + | /\btrout\b/ // "en:fish" + | /\bprawn\b/ // "en:crustaceans" + | /\bsquid\b/ // "en:molluscs" + | /\bclams\b/ // "en:molluscs" + | /\beggs\b/ // "en:eggs" + | /\bsoya\b/ // "en:soybeans" + | /\bsoja\b/ // "en:soybeans" + | /\bsoia\b/ // "en:soybeans" + | /\bmilk\b/ // "en:milk" + | /\bwhey\b/ // "en:milk" + | /\boats\b/ // "en:gluten" + | /\bnuts\b/ // "en:nuts" + | /\bfish\b/ // "en:fish" + | /\bsole\b/ // "en:fish" + | /\btuna\b/ // "en:fish" + | /\bcrab\b/ // "en:crustaceans" + | /\bclam\b/ // "en:molluscs" + | /\begg\b/ // "en:eggs" + | /\bsoy\b/ // "en:soybeans" + | /\brye\b/ // "en:gluten" + | /\bcod\b/ // "en:fish" + diff --git a/data/grammars/terminal_allergen_en_map.json b/data/grammars/terminal_allergen_en_map.json new file mode 100644 index 0000000000..1fa265fc04 --- /dev/null +++ b/data/grammars/terminal_allergen_en_map.json @@ -0,0 +1 @@ +{"other cereals containing gluten":["en:gluten"],"sulphur dioxide and sulphites":["en:sulphur-dioxide-and-sulphites"],"cereals containing gluten":["en:gluten"],"milk chocolate coating":["en:milk"],"malted barley extract":["en:gluten"],"soy protein isolate":["en:soybeans"],"parmigiano reggiano":["en:milk"],"malted barley flour":["en:gluten"],"barley malt flour":["en:gluten"],"arachis hypogaea":["en:peanuts"],"queensland nuts":["en:nuts"],"sulphur dioxide":["en:sulphur-dioxide-and-sulphites"],"soya lecithins":["en:soybeans"],"soy lecithines":["en:soybeans"],"wheat semolina":["en:gluten"],"pistachio nuts":["en:nuts"],"macadamia nuts":["en:nuts"],"soy lecithins":["en:soybeans"],"soya lecithin":["en:soybeans"],"soya products":["en:soybeans"],"malted barley":["en:gluten"],"soy lecithin":["en:soybeans"],"grana padano":["en:milk"],"milk protein":["en:milk"],"sesame seeds":["en:sesame-seeds"],"milk powder":["en:milk"],"wheat flour":["en:gluten"],"brazil nuts":["en:nuts"],"crustaceans":["en:crustaceans"],"egg whites":["en:eggs"],"whole eggs":["en:eggs"],"soya flour":["en:soybeans"],"buttermilk":["en:milk"],"speltflour":["en:gluten"],"wheatflour":["en:gluten"],"pecan nuts":["en:nuts"],"other nuts":["en:nuts"],"cuttlefish":["en:molluscs"],"egg white":["en:eggs"],"egg yolks":["en:eggs"],"whole egg":["en:eggs"],"soya bean":["en:soybeans"],"soy flour":["en:soybeans"],"rye flour":["en:gluten"],"oat fiber":["en:gluten"],"hazelnuts":["en:nuts"],"pistachio":["en:nuts"],"macadamia":["en:nuts"],"tree nuts":["en:nuts"],"sulphites":["en:sulphur-dioxide-and-sulphites"],"barn egg":["en:eggs"],"egg 
yolk":["en:eggs"],"soybeans":["en:soybeans"],"treenuts":["en:nuts"],"sulfites":["en:sulphur-dioxide-and-sulphites"],"celeriac":["en:celery"],"brassica":["en:mustard"],"mackerel":["en:fish"],"flounder":["en:fish"],"sardines":["en:fish"],"crayfish":["en:crustaceans"],"molluscs":["en:molluscs"],"mollusks":["en:molluscs"],"scallops":["en:molluscs"],"lactose":["en:milk"],"yoghurt":["en:milk"],"peanuts":["en:peanuts"],"almonds":["en:nuts"],"walnuts":["en:nuts"],"cashews":["en:nuts"],"mustard":["en:mustard"],"halibut":["en:fish"],"haddock":["en:fish"],"sardine":["en:fish"],"lobster":["en:crustaceans"],"mollusc":["en:molluscs"],"mollusk":["en:molluscs"],"oysters":["en:molluscs"],"mussels":["en:molluscs"],"scallop":["en:molluscs"],"butter":["en:milk"],"yogurt":["en:milk"],"cheese":["en:milk"],"peanut":["en:peanuts"],"gluten":["en:gluten"],"barley":["en:gluten"],"cashew":["en:nuts"],"celery":["en:celery"],"sesame":["en:sesame-seeds"],"lupine":["en:lupin"],"fishes":["en:fish"],"turbot":["en:fish"],"salmon":["en:fish"],"shrimp":["en:crustaceans"],"oyster":["en:molluscs"],"mussel":["en:molluscs"],"dairy":["en:milk"],"cream":["en:milk"],"kamut":["en:gluten"],"spelt":["en:gluten"],"wheat":["en:gluten"],"pecan":["en:nuts"],"lupin":["en:lupin"],"trout":["en:fish"],"prawn":["en:crustaceans"],"squid":["en:molluscs"],"clams":["en:molluscs"],"eggs":["en:eggs"],"soya":["en:soybeans"],"soja":["en:soybeans"],"soia":["en:soybeans"],"milk":["en:milk"],"whey":["en:milk"],"oats":["en:gluten"],"nuts":["en:nuts"],"fish":["en:fish"],"sole":["en:fish"],"tuna":["en:fish"],"crab":["en:crustaceans"],"clam":["en:molluscs"],"egg":["en:eggs"],"soy":["en:soybeans"],"rye":["en:gluten"],"cod":["en:fish"]} \ No newline at end of file diff --git a/data/grammars/terminal_allergen_es.lark b/data/grammars/terminal_allergen_es.lark new file mode 100644 index 0000000000..febad6d40b --- /dev/null +++ b/data/grammars/terminal_allergen_es.lark @@ -0,0 +1,106 @@ +// This file has been generated automatically, DO NOT EDIT! 
+ALLERGEN_ES.1: /\bcereales que contienen gluten\b/ // "en:gluten" + | /\botros frutos secos de c[áa]scara\b/ // "en:nuts" + | /\bfrutos de c[áa]scara y derivados\b/ // "en:nuts" + | /\bproductos derivados de huevo\b/ // "en:eggs" + | /\bdi[óo]xido de azufre y sulfitos\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bharina de trigo integral\b/ // "en:gluten" + | /\bfrutos secos de c[áa]scara\b/ // "en:nuts" + | /\botros frutos de c[áa]scara\b/ // "en:nuts" + | /\btrigo de joras[áa]n kamut\b/ // "en:gluten" + | /\bleche y sus derivados\b/ // "en:milk" + | /\bsemillas de ajonjol[íi]\b/ // "en:sesame-seeds" + | /\baceite de cacahuete\b/ // "en:peanuts" + | /\baceite de cacahuate\b/ // "en:peanuts" + | /\bcereales con gluten\b/ // "en:gluten" + | /\bnueces del amazonas\b/ // "en:nuts" + | /\bnueces de macadamia\b/ // "en:nuts" + | /\bnueces de australia\b/ // "en:nuts" + | /\bsemillas de mostaza\b/ // "en:mustard" + | /\bfrutos con c[áa]scara\b/ // "en:nuts" + | /\botros frutos secos\b/ // "en:nuts" + | /\bsemillas de s[ée]samo\b/ // "en:sesame-seeds" + | /\bgranos de ajonjol[íi]\b/ // "en:sesame-seeds" + | /\bderivados l[áa]cteos\b/ // "en:milk" + | /\bproductos l[áa]cteos\b/ // "en:milk" + | /\bleche y derivados\b/ // "en:milk" + | /\bprote[íi]na de leche\b/ // "en:milk" + | /\bfrutos de c[áa]scara\b/ // "en:nuts" + | /\bnueces de [áa]rboles\b/ // "en:nuts" + | /\bdi[óo]xido de azufre\b/ // "en:sulphur-dioxide-and-sulphites" + | /\blecitina de soja\b/ // "en:soybeans" + | /\blecitina de soya\b/ // "en:soybeans" + | /\barachis hypogaea\b/ // "en:peanuts" + | /\bnueces de brasil\b/ // "en:nuts" + | /\bnueces macadamia\b/ // "en:nuts" + | /\bgranos de s[ée]samo\b/ // "en:sesame-seeds" + | /\bharina de trigo\b/ // "en:gluten" + | /\bcangrejo de r[íi]o\b/ // "en:crustaceans" + | /\bnuez de brasil\b/ // "en:nuts" + | /\bhabas de soja\b/ // "en:soybeans" + | /\bhabas de soya\b/ // "en:soybeans" + | /\bfrutos secos\b/ // "en:nuts" + | /\botros nueces\b/ // "en:nuts" + | /\bcacahuetes\b/ // "en:peanuts" + | /\bcacahuates\b/ // "en:peanuts" + | /\balf[óo]ncigos\b/ // "en:nuts" + | /\baltramuces\b/ // "en:lupin" + | /\bcrust[áa]ceos\b/ // "en:crustaceans" + | /\blangostino\b/ // "en:crustaceans" + | /\bmejillones\b/ // "en:molluscs" + | /\bcacahuete\b/ // "en:peanuts" + | /\bcacahuate\b/ // "en:peanuts" + | /\balmendras\b/ // "en:nuts" + | /\bavellanas\b/ // "en:nuts" + | /\banacardos\b/ // "en:nuts" + | /\bpistachos\b/ // "en:nuts" + | /\bescal[óo]pas\b/ // "en:molluscs" + | /\bcaracoles\b/ // "en:molluscs" + | /\bcereales\b/ // "en:gluten" + | /\bcoquitos\b/ // "en:nuts" + | /\bpistacho\b/ // "en:nuts" + | /\bcastañas\b/ // "en:nuts" + | /\bsulfitos\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bajonjol[íi]\b/ // "en:sesame-seeds" + | /\baltramuz\b/ // "en:lupin" + | /\bcangrejo\b/ // "en:crustaceans" + | /\blangosta\b/ // "en:crustaceans" + | /\bmoluscos\b/ // "en:molluscs" + | /\bmejill[óo]n\b/ // "en:molluscs" + | /\blactosa\b/ // "en:milk" + | /\bl[áa]cteos\b/ // "en:milk" + | /\bcenteno\b/ // "en:gluten" + | /\bespelta\b/ // "en:gluten" + | /\bpacanas\b/ // "en:nuts" + | /\bcoquito\b/ // "en:nuts" + | /\bpiñones\b/ // "en:nuts" + | /\bcastaña\b/ // "en:nuts" + | /\bmostaza\b/ // "en:mustard" + | /\bpescado\b/ // "en:fish" + | /\bcamar[óo]n\b/ // "en:crustaceans" + | /\bmolusco\b/ // "en:molluscs" + | /\balmejas\b/ // "en:molluscs" + | /\bcalamar\b/ // "en:molluscs" + | /\bcaracol\b/ // "en:molluscs" + | /\bhuevos\b/ // "en:eggs" + | /\bgluten\b/ // "en:gluten" + | /\bcebada\b/ // "en:gluten" + 
| /\bnueces\b/ // "en:nuts" + | /\bs[ée]samo\b/ // "en:sesame-seeds" + | /\bostras\b/ // "en:molluscs" + | /\bvieras\b/ // "en:molluscs" + | /\bhuevo\b/ // "en:eggs" + | /\bleche\b/ // "en:milk" + | /\bqueso\b/ // "en:milk" + | /\btrigo\b/ // "en:gluten" + | /\bkamut\b/ // "en:gluten" + | /\bpiñ[óo]n\b/ // "en:nuts" + | /\bgamba\b/ // "en:crustaceans" + | /\bsepia\b/ // "en:molluscs" + | /\bostra\b/ // "en:molluscs" + | /\bviera\b/ // "en:molluscs" + | /\bsoja\b/ // "en:soybeans" + | /\bsoya\b/ // "en:soybeans" + | /\bnuez\b/ // "en:nuts" + | /\bapio\b/ // "en:celery" + diff --git a/data/grammars/terminal_allergen_es_map.json b/data/grammars/terminal_allergen_es_map.json new file mode 100644 index 0000000000..a5c80ad87d --- /dev/null +++ b/data/grammars/terminal_allergen_es_map.json @@ -0,0 +1 @@ +{"cereales que contienen gluten":["en:gluten"],"otros frutos secos de cáscara":["en:nuts"],"frutos de cáscara y derivados":["en:nuts"],"productos derivados de huevo":["en:eggs"],"dióxido de azufre y sulfitos":["en:sulphur-dioxide-and-sulphites"],"harina de trigo integral":["en:gluten"],"frutos secos de cáscara":["en:nuts"],"otros frutos de cáscara":["en:nuts"],"trigo de jorasán kamut":["en:gluten"],"leche y sus derivados":["en:milk"],"semillas de ajonjolí":["en:sesame-seeds"],"aceite de cacahuete":["en:peanuts"],"aceite de cacahuate":["en:peanuts"],"cereales con gluten":["en:gluten"],"nueces del amazonas":["en:nuts"],"nueces de macadamia":["en:nuts"],"nueces de australia":["en:nuts"],"semillas de mostaza":["en:mustard"],"frutos con cáscara":["en:nuts"],"otros frutos secos":["en:nuts"],"semillas de sésamo":["en:sesame-seeds"],"granos de ajonjolí":["en:sesame-seeds"],"derivados lácteos":["en:milk"],"productos lácteos":["en:milk"],"leche y derivados":["en:milk"],"proteína de leche":["en:milk"],"frutos de cáscara":["en:nuts"],"nueces de árboles":["en:nuts"],"dióxido de azufre":["en:sulphur-dioxide-and-sulphites"],"lecitina de soja":["en:soybeans"],"lecitina de soya":["en:soybeans"],"arachis hypogaea":["en:peanuts"],"nueces de brasil":["en:nuts"],"nueces macadamia":["en:nuts"],"granos de sésamo":["en:sesame-seeds"],"harina de trigo":["en:gluten"],"cangrejo de río":["en:crustaceans"],"nuez de brasil":["en:nuts"],"habas de soja":["en:soybeans"],"habas de soya":["en:soybeans"],"frutos secos":["en:nuts"],"otros 
nueces":["en:nuts"],"cacahuetes":["en:peanuts"],"cacahuates":["en:peanuts"],"alfóncigos":["en:nuts"],"altramuces":["en:lupin"],"crustáceos":["en:crustaceans"],"langostino":["en:crustaceans"],"mejillones":["en:molluscs"],"cacahuete":["en:peanuts"],"cacahuate":["en:peanuts"],"almendras":["en:nuts"],"avellanas":["en:nuts"],"anacardos":["en:nuts"],"pistachos":["en:nuts"],"escalópas":["en:molluscs"],"caracoles":["en:molluscs"],"cereales":["en:gluten"],"coquitos":["en:nuts"],"pistacho":["en:nuts"],"castañas":["en:nuts"],"sulfitos":["en:sulphur-dioxide-and-sulphites"],"ajonjolí":["en:sesame-seeds"],"altramuz":["en:lupin"],"cangrejo":["en:crustaceans"],"langosta":["en:crustaceans"],"moluscos":["en:molluscs"],"mejillón":["en:molluscs"],"lactosa":["en:milk"],"lácteos":["en:milk"],"centeno":["en:gluten"],"espelta":["en:gluten"],"pacanas":["en:nuts"],"coquito":["en:nuts"],"piñones":["en:nuts"],"castaña":["en:nuts"],"mostaza":["en:mustard"],"pescado":["en:fish"],"camarón":["en:crustaceans"],"molusco":["en:molluscs"],"almejas":["en:molluscs"],"calamar":["en:molluscs"],"caracol":["en:molluscs"],"huevos":["en:eggs"],"gluten":["en:gluten"],"cebada":["en:gluten"],"nueces":["en:nuts"],"sésamo":["en:sesame-seeds"],"ostras":["en:molluscs"],"vieras":["en:molluscs"],"huevo":["en:eggs"],"leche":["en:milk"],"queso":["en:milk"],"trigo":["en:gluten"],"kamut":["en:gluten"],"piñón":["en:nuts"],"gamba":["en:crustaceans"],"sepia":["en:molluscs"],"ostra":["en:molluscs"],"viera":["en:molluscs"],"soja":["en:soybeans"],"soya":["en:soybeans"],"nuez":["en:nuts"],"apio":["en:celery"]} \ No newline at end of file diff --git a/data/grammars/terminal_allergen_fr.lark b/data/grammars/terminal_allergen_fr.lark new file mode 100644 index 0000000000..db8b12d9b6 --- /dev/null +++ b/data/grammars/terminal_allergen_fr.lark @@ -0,0 +1,212 @@ +// This file has been generated automatically, DO NOT EDIT! +ALLERGEN_FR.1: /\blaits et d[ée]riv[ée]s y compris lactose\b/ // "en:milk" + | /\banhydride sulfureux et sulfites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bnoix de coquilles saint-jacques\b/ // "en:molluscs" + | /\bproduits laitiers et d[ée]riv[ée]es\b/ // "en:milk" + | /\bproduits laitiers et d[ée]riv[ée]s\b/ // "en:milk" + | /\bc[ée]r[ée]ales contenant du gluten\b/ // "en:gluten" + | /\bnoix de coquilles st-jacques\b/ // "en:molluscs" + | /\bcoquilles? saint jacques\b/ // "en:molluscs" + | /\bautres? fruits? à coque\b/ // "en:nuts" + | /\bnoix de saint-jacques\b/ // "en:molluscs" + | /\bprot[ée]ines laiti[èe]res\b/ // "en:milk" + | /\bfruits? secs? à coque\b/ // "en:nuts" + | /\bfruits? à coque dure\b/ // "en:nuts" + | /\banhydride sulfureux\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bgraines? de moutarde\b/ // "en:mustard" + | /\bcoquilles? st jacques\b/ // "en:molluscs" + | /\bferments lactiques\b/ // "en:milk" + | /\bnoix du queensland\b/ // "en:nuts" + | /\bautres? fruits? secs?\b/ // "en:nuts" + | /\bnoix de st-jacques\b/ // "en:molluscs" + | /\bl[ée]cithines? de soja\b/ // "en:soybeans" + | /\bfromages? de ch[èe]vre\b/ // "en:milk" + | /\bprot[ée]ines? de lait\b/ // "en:milk" + | /\bproduits? laitiers?\b/ // "en:milk" + | /\bfarine de froment\b/ // "en:gluten" + | /\bfarine d\'[ée]peautre\b/ // "en:gluten" + | /\bnoix de macadamia\b/ // "en:nuts" + | /\bgraines? 
de s[ée]same\b/ // "en:sesame-seeds" + | /\bbeurre patissier\b/ // "en:milk" + | /\bfromage de vache\b/ // "en:milk" + | /\bd[ée]riv[ée]s laitiers\b/ // "en:milk" + | /\bbeurre concentr[ée]\b/ // "en:milk" + | /\blait demi-[ée]cr[ée]m[ée]\b/ // "en:milk" + | /\barachis hypogaea\b/ // "en:peanuts" + | /\bgraines? de s[ée]same\b/ // "en:sesame-seeds" + | /\bgraines? de soja\b/ // "en:soybeans" + | /\bpoudres? de lait\b/ // "en:milk" + | /\blait de ch[èe]vre\b/ // "en:milk" + | /\blait de brebis\b/ // "en:milk" + | /\bpetit [ée]peautre\b/ // "en:gluten" + | /\bgrand [ée]peautre\b/ // "en:gluten" + | /\bfruits? à coque\b/ // "en:nuts" + | /\bnoix du br[ée]sil\b/ // "en:nuts" + | /\bmetabisulphite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bcolin d\'alaska\b/ // "en:fish" + | /\bcr[èe]me de lait\b/ // "en:milk" + | /\bcr[èe]me fraiche\b/ // "en:milk" + | /\bfromage blanc\b/ // "en:milk" + | /\bfromage fondu\b/ // "en:milk" + | /\bfromage frais\b/ // "en:milk" + | /\blait de vache\b/ // "en:milk" + | /\bgluten de bl[ée]\b/ // "en:gluten" + | /\bfarine de bl[ée]\b/ // "en:gluten" + | /\bnoix de cajou\b/ // "en:nuts" + | /\bnoix de p[ée]can\b/ // "en:nuts" + | /\bmetabisulfite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bsaint-jacques\b/ // "en:molluscs" + | /\bblanc d\'oeuf\b/ // "en:eggs" + | /\bjaune d\'oeuf\b/ // "en:eggs" + | /\bblanc d\'œufs\b/ // "en:eggs" + | /\bjaune d\'œufs\b/ // "en:eggs" + | /\bfibres? de bl[ée]\b/ // "en:gluten" + | /\bblanc d\'œuf\b/ // "en:eggs" + | /\bjaune d\'œuf\b/ // "en:eggs" + | /\boeufs frais\b/ // "en:eggs" + | /\bson de soja\b/ // "en:soybeans" + | /\blait [ée]cr[ée]m[ée]\b/ // "en:milk" + | /\blait entier\b/ // "en:milk" + | /\bcacahou[èe]tes\b/ // "en:peanuts" + | /\bbl[ée] complet\b/ // "en:gluten" + | /\bmalt d\'orge\b/ // "en:gluten" + | /\bfruits? 
secs?\b/ // "en:nuts" + | /\bc[ée]leri-rave\b/ // "en:celery" + | /\bsaumon fum[ée]\b/ // "en:fish" + | /\blangoustine\b/ // "en:crustaceans" + | /\boeuf frais\b/ // "en:eggs" + | /\bœufs frais\b/ // "en:eggs" + | /\bbas-beurre\b/ // "en:milk" + | /\bpetit-lait\b/ // "en:milk" + | /\blactoserum\b/ // "en:milk" + | /\blactos[ée]rum\b/ // "en:milk" + | /\bmozzarella\b/ // "en:milk" + | /\bmascarpone\b/ // "en:milk" + | /\bgorgonzola\b/ // "en:milk" + | /\blait frais\b/ // "en:milk" + | /\bcacahu[èe]tes\b/ // "en:peanuts" + | /\bcacahou[èe]te\b/ // "en:peanuts" + | /\bson de bl[ée]\b/ // "en:gluten" + | /\bdisulfites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\b[ée]crevisses\b/ // "en:crustaceans" + | /\bmollusques\b/ // "en:molluscs" + | /\bst-jacques\b/ // "en:molluscs" + | /\bœuf frais\b/ // "en:eggs" + | /\blactiques\b/ // "en:milk" + | /\breblochon\b/ // "en:milk" + | /\bmimolette\b/ // "en:milk" + | /\bcas[ée]inate\b/ // "en:milk" + | /\broquefort\b/ // "en:milk" + | /\barachides\b/ // "en:peanuts" + | /\bcacahu[èe]te\b/ // "en:peanuts" + | /\bnoisettes?\b/ // "en:nuts" + | /\bpistaches\b/ // "en:nuts" + | /\bsulphites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bbisulfite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bcabillaud\b/ // "en:fish" + | /\bmaquereau\b/ // "en:fish" + | /\bcrustac[ée]s\b/ // "en:crustaceans" + | /\blangouste\b/ // "en:crustaceans" + | /\bcrevettes\b/ // "en:crustaceans" + | /\b[ée]crevisse\b/ // "en:crustaceans" + | /\bmollusque\b/ // "en:molluscs" + | /\bescargots\b/ // "en:molluscs" + | /\bp[ée]toncles\b/ // "en:molluscs" + | /\blaitiere\b/ // "en:milk" + | /\blaitiers\b/ // "en:milk" + | /\bbabeurre\b/ // "en:milk" + | /\blait cru\b/ // "en:milk" + | /\bemmental\b/ // "en:milk" + | /\blactique\b/ // "en:milk" + | /\bparmesan\b/ // "en:milk" + | /\braclette\b/ // "en:milk" + | /\barachide\b/ // "en:peanuts" + | /\b[ée]peautre\b/ // "en:gluten" + | /\bboulgour\b/ // "en:gluten" + | /\bsulfites\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bsulphite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bmoutarde\b/ // "en:mustard" + | /\bpoissons\b/ // "en:fish" + | /\baiglefin\b/ // "en:fish" + | /\bsardines\b/ // "en:fish" + | /\bcrustac[ée]\b/ // "en:crustaceans" + | /\bcrevette\b/ // "en:crustaceans" + | /\bencornet\b/ // "en:molluscs" + | /\bescargot\b/ // "en:molluscs" + | /\blactose\b/ // "en:milk" + | /\blaitier\b/ // "en:milk" + | /\bfromage\b/ // "en:milk" + | /\bcheddar\b/ // "en:milk" + | /\bricotta\b/ // "en:milk" + | /\bcas[ée]ine\b/ // "en:milk" + | /\bgruy[èe]re\b/ // "en:milk" + | /\bfroment\b/ // "en:gluten" + | /\bbl[ée] dur\b/ // "en:gluten" + | /\bamandes?\b/ // "en:nuts" + | /\bsulfite\b/ // "en:sulphur-dioxide-and-sulphites" + | /\bpoisson\b/ // "en:fish" + | /\bhaddock\b/ // "en:fish" + | /\bsardine\b/ // "en:fish" + | /\banchois\b/ // "en:fish" + | /\bbrochet\b/ // "en:fish" + | /\bharengs\b/ // "en:fish" + | /\blimande\b/ // "en:fish" + | /\bhomards\b/ // "en:crustaceans" + | /\bcalamar\b/ // "en:molluscs" + | /\bhuitres\b/ // "en:molluscs" + | /\bbeurre\b/ // "en:milk" + | /\byaourt\b/ // "en:milk" + | /\bgluten\b/ // "en:gluten" + | /\bseigle\b/ // "en:gluten" + | /\bc[ée]leri\b/ // "en:celery" + | /\bs[ée]same\b/ // "en:sesame-seeds" + | /\bfl[ée]tan\b/ // "en:fish" + | /\bturbot\b/ // "en:fish" + | /\bsaumon\b/ // "en:fish" + | /\btruite\b/ // "en:fish" + | /\bhareng\b/ // "en:fish" + | /\bmerlan\b/ // "en:fish" + | /\brouget\b/ // "en:fish" + | /\btacaud\b/ // "en:fish" + | /\bcrabes\b/ // "en:crustaceans" + | /\bhomard\b/ // 
"en:crustaceans" + | /\bgambas\b/ // "en:crustaceans" + | /\bpoulpe\b/ // "en:molluscs" + | /\bseiche\b/ // "en:molluscs" + | /\bcalmar\b/ // "en:molluscs" + | /\bhuitre\b/ // "en:molluscs" + | /\bmoules\b/ // "en:molluscs" + | /\boeufs\b/ // "en:eggs" + | /\bsojas\b/ // "en:soybeans" + | /\btonyu\b/ // "en:soybeans" + | /\bcr[èe]me\b/ // "en:milk" + | /\bm[ée]ton\b/ // "en:milk" + | /\bcomte\b/ // "en:milk" + | /\bgouda\b/ // "en:milk" + | /\bkamut\b/ // "en:gluten" + | /\blupin\b/ // "en:lupin" + | /\bmorue\b/ // "en:fish" + | /\bcolin\b/ // "en:fish" + | /\bmerlu\b/ // "en:fish" + | /\bcrabe\b/ // "en:crustaceans" + | /\bmoule\b/ // "en:molluscs" + | /\bœufs\b/ // "en:eggs" + | /\boeuf\b/ // "en:eggs" + | /\bsoja\b/ // "en:soybeans" + | /\bsoya\b/ // "en:soybeans" + | /\bsoia\b/ // "en:soybeans" + | /\btofu\b/ // "en:soybeans" + | /\blait\b/ // "en:milk" + | /\bbleu\b/ // "en:milk" + | /\bedam\b/ // "en:milk" + | /\bfeta\b/ // "en:milk" + | /\borge\b/ // "en:gluten" + | /\bmalt\b/ // "en:gluten" + | /\bnoix\b/ // "en:nuts" + | /\bsole\b/ // "en:fish" + | /\bthon\b/ // "en:fish" + | /\bloup\b/ // "en:fish" + | /\blieu\b/ // "en:fish" + | /\bœuf\b/ // "en:eggs" + | /\bbl[ée]\b/ // "en:gluten" + diff --git a/data/grammars/terminal_allergen_fr_map.json b/data/grammars/terminal_allergen_fr_map.json new file mode 100644 index 0000000000..f2be743a45 --- /dev/null +++ b/data/grammars/terminal_allergen_fr_map.json @@ -0,0 +1 @@ +{"laits et dérivés y compris lactose":["en:milk"],"anhydride sulfureux et sulfites":["en:sulphur-dioxide-and-sulphites"],"noix de coquilles saint-jacques":["en:molluscs"],"produits laitiers et dérivées":["en:milk"],"produits laitiers et dérivés":["en:milk"],"céréales contenant du gluten":["en:gluten"],"noix de coquilles st-jacques":["en:molluscs"],"coquille saint jacques":["en:molluscs"],"autres fruits à coque":["en:nuts"],"noix de saint-jacques":["en:molluscs"],"protéines laitières":["en:milk"],"fruits secs à coque":["en:nuts"],"fruits à coque dure":["en:nuts"],"anhydride sulfureux":["en:sulphur-dioxide-and-sulphites"],"graines de moutarde":["en:mustard"],"coquille st jacques":["en:molluscs"],"ferments lactiques":["en:milk"],"noix du queensland":["en:nuts"],"autres fruits secs":["en:nuts"],"noix de st-jacques":["en:molluscs"],"lécithine de soja":["en:soybeans"],"fromage de chèvre":["en:milk"],"protéines de lait":["en:milk"],"produits laitiers":["en:milk"],"farine de froment":["en:gluten"],"farine d'épeautre":["en:gluten"],"noix de macadamia":["en:nuts"],"graines de sésame":["en:sesame-seeds"],"beurre patissier":["en:milk"],"fromage de vache":["en:milk"],"dérivés laitiers":["en:milk"],"beurre concentré":["en:milk"],"lait demi-écrémé":["en:milk"],"arachis hypogaea":["en:peanuts"],"graine de sésame":["en:sesame-seeds"],"graines de soja":["en:soybeans"],"graine de soja":["en:soybeans"],"poudre de lait":["en:milk"],"lait de chèvre":["en:milk"],"lait de brebis":["en:milk"],"petit épeautre":["en:gluten"],"grand épeautre":["en:gluten"],"fruits à coque":["en:nuts"],"noix du brésil":["en:nuts"],"metabisulphite":["en:sulphur-dioxide-and-sulphites"],"colin d'alaska":["en:fish"],"crème de lait":["en:milk"],"crème fraiche":["en:milk"],"fromage blanc":["en:milk"],"fromage fondu":["en:milk"],"fromage frais":["en:milk"],"lait de vache":["en:milk"],"gluten de blé":["en:gluten"],"farine de blé":["en:gluten"],"noix de cajou":["en:nuts"],"noix de pécan":["en:nuts"],"metabisulfite":["en:sulphur-dioxide-and-sulphites"],"saint-jacques":["en:molluscs"],"blanc d'oeuf":["en:eggs"],"jaune 
d'oeuf":["en:eggs"],"blanc d'œufs":["en:eggs"],"jaune d'œufs":["en:eggs"],"fibre de blé":["en:gluten"],"blanc d'œuf":["en:eggs"],"jaune d'œuf":["en:eggs"],"oeufs frais":["en:eggs"],"son de soja":["en:soybeans"],"lait écrémé":["en:milk"],"lait entier":["en:milk"],"cacahouètes":["en:peanuts"],"blé complet":["en:gluten"],"malt d'orge":["en:gluten"],"fruits secs":["en:nuts"],"céleri-rave":["en:celery"],"saumon fumé":["en:fish"],"langoustine":["en:crustaceans"],"oeuf frais":["en:eggs"],"œufs frais":["en:eggs"],"bas-beurre":["en:milk"],"petit-lait":["en:milk"],"lactoserum":["en:milk"],"lactosérum":["en:milk"],"mozzarella":["en:milk"],"mascarpone":["en:milk"],"gorgonzola":["en:milk"],"lait frais":["en:milk"],"cacahuètes":["en:peanuts"],"cacahouète":["en:peanuts"],"son de blé":["en:gluten"],"disulfites":["en:sulphur-dioxide-and-sulphites"],"écrevisses":["en:crustaceans"],"mollusques":["en:molluscs"],"st-jacques":["en:molluscs"],"œuf frais":["en:eggs"],"lactiques":["en:milk"],"reblochon":["en:milk"],"mimolette":["en:milk"],"caséinate":["en:milk"],"roquefort":["en:milk"],"arachides":["en:peanuts"],"cacahuète":["en:peanuts"],"noisettes":["en:nuts"],"pistaches":["en:nuts"],"sulphites":["en:sulphur-dioxide-and-sulphites"],"bisulfite":["en:sulphur-dioxide-and-sulphites"],"cabillaud":["en:fish"],"maquereau":["en:fish"],"crustacés":["en:crustaceans"],"langouste":["en:crustaceans"],"crevettes":["en:crustaceans"],"écrevisse":["en:crustaceans"],"mollusque":["en:molluscs"],"escargots":["en:molluscs"],"pétoncles":["en:molluscs"],"laitiere":["en:milk"],"laitiers":["en:milk"],"babeurre":["en:milk"],"lait cru":["en:milk"],"emmental":["en:milk"],"lactique":["en:milk"],"parmesan":["en:milk"],"raclette":["en:milk"],"arachide":["en:peanuts"],"épeautre":["en:gluten"],"boulgour":["en:gluten"],"sulfites":["en:sulphur-dioxide-and-sulphites"],"sulphite":["en:sulphur-dioxide-and-sulphites"],"moutarde":["en:mustard"],"poissons":["en:fish"],"aiglefin":["en:fish"],"sardines":["en:fish"],"crustacé":["en:crustaceans"],"crevette":["en:crustaceans"],"encornet":["en:molluscs"],"escargot":["en:molluscs"],"lactose":["en:milk"],"laitier":["en:milk"],"fromage":["en:milk"],"cheddar":["en:milk"],"ricotta":["en:milk"],"caséine":["en:milk"],"gruyère":["en:milk"],"froment":["en:gluten"],"blé 
dur":["en:gluten"],"amandes":["en:nuts"],"sulfite":["en:sulphur-dioxide-and-sulphites"],"poisson":["en:fish"],"haddock":["en:fish"],"sardine":["en:fish"],"anchois":["en:fish"],"brochet":["en:fish"],"harengs":["en:fish"],"limande":["en:fish"],"homards":["en:crustaceans"],"calamar":["en:molluscs"],"huitres":["en:molluscs"],"beurre":["en:milk"],"yaourt":["en:milk"],"gluten":["en:gluten"],"seigle":["en:gluten"],"céleri":["en:celery"],"sésame":["en:sesame-seeds"],"flétan":["en:fish"],"turbot":["en:fish"],"saumon":["en:fish"],"truite":["en:fish"],"hareng":["en:fish"],"merlan":["en:fish"],"rouget":["en:fish"],"tacaud":["en:fish"],"crabes":["en:crustaceans"],"homard":["en:crustaceans"],"gambas":["en:crustaceans"],"poulpe":["en:molluscs"],"seiche":["en:molluscs"],"calmar":["en:molluscs"],"huitre":["en:molluscs"],"moules":["en:molluscs"],"oeufs":["en:eggs"],"sojas":["en:soybeans"],"tonyu":["en:soybeans"],"crème":["en:milk"],"méton":["en:milk"],"comte":["en:milk"],"gouda":["en:milk"],"kamut":["en:gluten"],"lupin":["en:lupin"],"morue":["en:fish"],"colin":["en:fish"],"merlu":["en:fish"],"crabe":["en:crustaceans"],"moule":["en:molluscs"],"œufs":["en:eggs"],"oeuf":["en:eggs"],"soja":["en:soybeans"],"soya":["en:soybeans"],"soia":["en:soybeans"],"tofu":["en:soybeans"],"lait":["en:milk"],"bleu":["en:milk"],"edam":["en:milk"],"feta":["en:milk"],"orge":["en:gluten"],"malt":["en:gluten"],"noix":["en:nuts"],"sole":["en:fish"],"thon":["en:fish"],"loup":["en:fish"],"lieu":["en:fish"],"œuf":["en:eggs"],"blé":["en:gluten"]} \ No newline at end of file diff --git a/data/grammars/traces.lark b/data/grammars/traces.lark new file mode 100644 index 0000000000..b35c2787fc --- /dev/null +++ b/data/grammars/traces.lark @@ -0,0 +1,59 @@ +// match any non-whitespace word that is not detected by other rule/terminal +// it has priority 0, lower than packaging related terminals that have priority +// 1, so it matches words that were not detected by other terminals +OTHER: /[^\s]+/ + + +// ## FR ## + +OF_FR.1: /\bdes?\b/ + | /\bd\'/ + | /\bd\b/ + | /\bdu\b/ + +POSSIBLE_FR.1: /\b[ée]ventuelles?\b/ + | /\bpossibles?\b/ + +PRODUCTED_FR.1: /\bfabriqu[ée]\b/ + | /\bélabor[ée]\b/ + +THAT_USES_FR.1: /\bqui utilise\b/ + | /\butilisant\b/ + +// Peut contenir des traces de fruits à coque, de cacahuete de sésame, de sulfites et de gluten +manufactured_in_fr: ("produit"i WS)? PRODUCTED_FR WS "dans" WS "un" WS "atelier" WS THAT_USES_FR WS? (":" WS)? trace_list_fr +can_contain_fr: ("peut"i WS "contenir" WS "des" WS)? "traces"i WS (POSSIBLE_FR? WS)? ("de"? WS? ":" WS?)? trace_list_fr +can_contain_2_fr: "peut"i WS "contenir" WS? (":" WS?)? trace_list_fr +contains_fr: "contient"i (WS "naturellement")? WS trace_list_fr + +trace_list_fr: (OF_FR WS?)? ALLERGEN_FR (WS? ("," WS?)? (("et" WS)? (OF_FR WS?)?)? ALLERGEN_FR)* +traces_fr: can_contain_fr | can_contain_2_fr | contains_fr | manufactured_in_fr + +// ## EN ## + +// It may contain traces of nuts, peanuts, sesame, sulphites and gluten. +can_contain_en: ("it"i WS)? "may" WS "contain" WS ("traces" WS "of" WS)? trace_list_en +contain_en: "contains"i (WS "traces" WS "of")? (WS? ":")? WS? trace_list_en +manufactured_in_en: "prepared"i WS "in" WS "premises" WS "where" WS "traces" WS "of" WS trace_list_en WS "are" WS "used" +trace_list_en: ALLERGEN_EN (WS? ("," WS)? (("and" WS)? ("of" WS)?)? ALLERGEN_EN)* +traces_en: can_contain_en | contain_en | manufactured_in_en + +// ## ES ## + +// PUEDE CONTENER LECHE +can_contain_es: ("este" WS "producto" WS)? "puede" WS "contener"i (WS? ":")? WS? 
trace_list_es +// Contiene leche +contain_es: ("este" WS "producto" WS)? "contiene"i (WS? ":")? WS? trace_list_es +trace_list_es: ALLERGEN_ES (WS? ("," WS)? (("y" WS)?)? ALLERGEN_ES)* +traces_es: contain_es | can_contain_es + + +traces: traces_fr | traces_en | traces_es +start: (traces | junk | WS)+ +// all other words +junk: OTHER+ + +%import common.WS +%import .terminal_allergen_fr.ALLERGEN_FR +%import .terminal_allergen_en.ALLERGEN_EN +%import .terminal_allergen_es.ALLERGEN_ES diff --git a/robotoff/models.py b/robotoff/models.py index ff0a5fb330..2a8db5e571 100644 --- a/robotoff/models.py +++ b/robotoff/models.py @@ -132,7 +132,7 @@ class ProductInsight(BaseModel): # the annotator (or first annotator, if multiple votes were cast). username = peewee.TextField(index=True, null=True) - # Stores the list of counties that are associated with the product. + # Stores the list of countries that are associated with the product. # E.g. possible values are "en:united-states" or "en:france". countries = BinaryJSONField(null=True, index=True, default=list) @@ -171,7 +171,7 @@ class ProductInsight(BaseModel): null=True, max_length=10, help_text="project associated with the insight, " - "one of 'off', 'obf', 'opff', 'opf'", + "one of 'off', 'obf', 'opff', 'opf', 'off-pro'", index=True, ) diff --git a/robotoff/off.py b/robotoff/off.py index 5d67228ea6..b55eb93b1e 100644 --- a/robotoff/off.py +++ b/robotoff/off.py @@ -806,6 +806,78 @@ def send_image( return r +def parse_ingredients(text: str, lang: str, timeout: int = 10) -> list[JSONType]: + """Parse ingredients text using the Product Opener API. + + It is only available for the `off` flavor (food). + + The result is a list of ingredients; each ingredient is a dict with the + following keys: + + - id: the ingredient ID. Having an ID does not mean that the ingredient + is recognized, you must check if it exists in the taxonomy.
+ - text: the ingredient text (as it appears in the input ingredients list) + - percent_min: the minimum percentage of the ingredient in the product + - percent_max: the maximum percentage of the ingredient in the product + - percent_estimate: the estimated percentage of the ingredient in the + product + - vegan (bool): optional key indicating if the ingredient is vegan + - vegetarian (bool): optional key indicating if the ingredient is + vegetarian + + + :param text: the ingredients text to parse + :param lang: the language of the text (used for parsing) as a 2-letter code + :param timeout: the request timeout in seconds, defaults to 10s + :raises ValueError: if `text` is empty + :raises RuntimeError: a RuntimeError is raised if the parsing fails + :return: the list of parsed ingredients + """ + base_url = settings.BaseURLProvider.world(ServerType.off) + # by using "test" as code, we don't save any information to the database + # This endpoint is specifically designed for testing purposes + url = f"{base_url}/api/v3/product/test" + + if len(text) == 0: + raise ValueError("text must be a non-empty string") + + try: + r = http_session.patch( + url, + auth=settings._off_request_auth, + json={ + "fields": "ingredients", + "lc": lang, + "tags_lc": lang, + "product": { + "lang": lang, + f"ingredients_text_{lang}": text, + }, + }, + timeout=timeout, + ) + except ( + requests.exceptions.ConnectionError, + requests.exceptions.SSLError, + requests.exceptions.Timeout, + ) as e: + raise RuntimeError( + f"Unable to parse ingredients: error during HTTP request: {e}" + ) + + if not r.ok: + raise RuntimeError( + f"Unable to parse ingredients (non-200 status code): {r.status_code}, {r.text}" + ) + + response_data = r.json() + + if response_data.get("status") != "success": + raise RuntimeError(f"Unable to parse ingredients: {response_data}") + + return response_data["product"]["ingredients"] + + def normalize_tag(value, lowercase=True): """Given a value normalize it to a tag (as in taxonomies). diff --git a/robotoff/prediction/ingredient_list/__init__.py b/robotoff/prediction/ingredient_list/__init__.py index 555cb470d3..0eb55789a7 100644 --- a/robotoff/prediction/ingredient_list/__init__.py +++ b/robotoff/prediction/ingredient_list/__init__.py @@ -9,17 +9,21 @@ from tritonclient.grpc import service_pb2 from robotoff import settings +from robotoff.prediction.ingredient_list.postprocess import detect_additional_mentions from robotoff.prediction.langid import LanguagePrediction, predict_lang_batch from robotoff.triton import get_triton_inference_stub from robotoff.utils import http_session -from .postprocess import AggregationStrategy, TokenClassificationPipeline +from .transformers_pipeline import AggregationStrategy, TokenClassificationPipeline # The tokenizer assets are stored in the model directory INGREDIENT_NER_MODEL_DIR = settings.TRITON_MODELS_DIR / "ingredient-ner/1/model.onnx" INGREDIENT_ID2LABEL = {0: "O", 1: "B-ING", 2: "I-ING"} +MODEL_NAME = "ingredient-detection" +MODEL_VERSION = "ingredient-detection-1.0" + @dataclasses.dataclass class IngredientPredictionAggregatedEntity: @@ -27,9 +31,12 @@ class IngredientPredictionAggregatedEntity: start: int # character end index of the entity end: int + # character end index of the entity, before postprocessing (i.e.
+ # before adding organic or allergen mentions) + raw_end: int # confidence score score: float - # entity text + # entity text (with organic or allergen mentions, if any) # language prediction of the entity text lang: Optional[LanguagePrediction] = None @@ -175,13 +182,16 @@ def predict_batch( agg_entities = [] for output in pipeline_output: start = int(output["start"]) - end = int(output["end"]) + raw_end = int(output["end"]) + end = detect_additional_mentions(sentence, raw_end) + text = sentence[start:end] agg_entities.append( IngredientPredictionAggregatedEntity( start=start, end=end, + raw_end=raw_end, score=float(output["score"]), - text=sentence[start:end], + text=text, ), ) if predict_lang: diff --git a/robotoff/prediction/ingredient_list/postprocess.py b/robotoff/prediction/ingredient_list/postprocess.py index 047c83e27d..37f8fd6988 100644 --- a/robotoff/prediction/ingredient_list/postprocess.py +++ b/robotoff/prediction/ingredient_list/postprocess.py @@ -1,372 +1,136 @@ -""" -This file has been copied and adapted from -https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/pipelines/token_classification.py - -The code is under Apache-2.0 license: -https://github.com/huggingface/transformers/blob/main/LICENSE - -We use Triton to serve the request, but still need NER prediction -post-processing, and HuggingFace transformers library provide this feature -nicely using `TokenClassificationPipeline`. - -Most of the code was kept unchanged, the only modifications that were made -were the following: - -- accept numpy array as input instead of Tensorflow/Pytorch tensors -- remove unnecessary code (everything that is not related to post-processing) -- `postprocess` now accepts a single sample (instead of a batched sample of - size 1) -""" -import enum -import warnings -from typing import List, Optional, Tuple - -import numpy as np - - -class AggregationStrategy(enum.Enum): - """All the valid aggregation strategies for TokenClassificationPipeline""" - - NONE = "none" - SIMPLE = "simple" - FIRST = "first" - AVERAGE = "average" - MAX = "max" - - -class TokenClassificationPipeline: - default_input_names = "sequences" - - def __init__(self, tokenizer, id2label): - self.tokenizer = tokenizer - self.id2label = id2label - - def _sanitize_parameters( - self, - ignore_labels=None, - grouped_entities: Optional[bool] = None, - ignore_subwords: Optional[bool] = None, - aggregation_strategy: Optional[AggregationStrategy] = None, - offset_mapping: Optional[List[Tuple[int, int]]] = None, - ): - - preprocess_params = {} - if offset_mapping is not None: - preprocess_params["offset_mapping"] = offset_mapping - - postprocess_params = {} - if grouped_entities is not None or ignore_subwords is not None: - if grouped_entities and ignore_subwords: - aggregation_strategy = AggregationStrategy.FIRST - elif grouped_entities and not ignore_subwords: - aggregation_strategy = AggregationStrategy.SIMPLE +import functools +import re + +from lark import Discard, Lark, Transformer + +from robotoff import settings + +ASTERISK_SYMBOL = r"((\* ?=?|\(¹\)|\") ?)" +FROM_ORGANIC_FARMING_FR = r"issus? de l'agriculture (biologique|bio|durable)" +ORGANIC_MENTIONS_RE = re.compile( + rf"{ASTERISK_SYMBOL}?ingr[ée]dients?( agricoles?)? {FROM_ORGANIC_FARMING_FR}" + rf"|{ASTERISK_SYMBOL}?produits? {FROM_ORGANIC_FARMING_FR}" + rf"|{ASTERISK_SYMBOL}?{FROM_ORGANIC_FARMING_FR}" + rf"|{ASTERISK_SYMBOL}organic( farming)?"
+ rf"|{ASTERISK_SYMBOL}?aus biologischer landwirtschaft" + rf"|{ASTERISK_SYMBOL}?procedentes del cultivo ecol[óo]gico" + rf"|{ASTERISK_SYMBOL}?de cultivo ecol[óo]gico certificado" + rf"|{ASTERISK_SYMBOL}?ingredientes? ecol[óo]gicos?", + re.I, +) + + +def detect_additional_mentions(text: str, end_idx: int) -> int: + """Detect additional mentions that are relevant to include in the + ingredient list (such as organic/fair trade or allergen mentions) but + that are not currently detected by the model (as the model was trained + not to include them in the ingredient list). + + :param text: the full text to process + :param end_idx: the end character index of the current ingredient list + :return: the new end index of the ingredient list, if any mention was + detected. Return the initial end index otherwise. + """ + initial_end_idx = end_idx + last_updated = True + matched = False + + while last_updated: + last_updated = False + lookup_end_idx = end_idx + candidate = text[lookup_end_idx:] + + for char in candidate: + if char.isspace() or char in (".", ","): + lookup_end_idx += 1 else: - aggregation_strategy = AggregationStrategy.NONE - - if grouped_entities is not None: - warnings.warn( - "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to" - f' `aggregation_strategy="{aggregation_strategy}"` instead.' - ) - if ignore_subwords is not None: - warnings.warn( - "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to" - f' `aggregation_strategy="{aggregation_strategy}"` instead.' - ) - - if aggregation_strategy is not None: - if isinstance(aggregation_strategy, str): - aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()] - if ( - aggregation_strategy - in { - AggregationStrategy.FIRST, - AggregationStrategy.MAX, - AggregationStrategy.AVERAGE, - } - and not self.tokenizer.is_fast - ): - raise ValueError( - "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option" - 'to `"simple"` or use a fast tokenizer.' 
- ) - postprocess_params["aggregation_strategy"] = aggregation_strategy - if ignore_labels is not None: - postprocess_params["ignore_labels"] = ignore_labels - return preprocess_params, {}, postprocess_params + break - def postprocess( - self, - model_outputs, - aggregation_strategy=AggregationStrategy.NONE, - ignore_labels=None, - ): - if ignore_labels is None: - ignore_labels = ["O"] - logits = model_outputs["logits"] - sentence = model_outputs["sentence"] - input_ids = model_outputs["input_ids"] - offset_mapping = ( - model_outputs["offset_mapping"] - if model_outputs["offset_mapping"] is not None - else None - ) - special_tokens_mask = model_outputs["special_tokens_mask"] + candidate = text[lookup_end_idx:] - maxes = np.max(logits, axis=-1, keepdims=True) - shifted_exp = np.exp(logits - maxes) - scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + if (match := ORGANIC_MENTIONS_RE.search(candidate)) is not None: + if match.start() == 0: + matched = True + last_updated = True + lookup_end_idx += match.end() + end_idx = lookup_end_idx - pre_entities = self.gather_pre_entities( - sentence, - input_ids, - scores, - offset_mapping, - special_tokens_mask, - aggregation_strategy, - ) - grouped_entities = self.aggregate(pre_entities, aggregation_strategy) - # Filter anything that is in self.ignore_labels - entities = [ - entity - for entity in grouped_entities - if entity.get("entity", None) not in ignore_labels - and entity.get("entity_group", None) not in ignore_labels - ] - return entities + if ( + new_end_idx := detect_trace_mention(text, lookup_end_idx) + ) != lookup_end_idx: + matched = True + lookup_end_idx = new_end_idx + end_idx = new_end_idx + last_updated = True - def gather_pre_entities( - self, - sentence: str, - input_ids: np.ndarray, - scores: np.ndarray, - offset_mapping: Optional[List[Tuple[int, int]]], - special_tokens_mask: np.ndarray, - aggregation_strategy: AggregationStrategy, - ) -> List[dict]: - """Fuse various numpy arrays into dicts with all the information - needed for aggregation""" - pre_entities = [] - for idx, token_scores in enumerate(scores): - # Filter special_tokens, they should only occur - # at the sentence boundaries since we're not encoding pairs of - # sentences so we don't have to keep track of those. - if special_tokens_mask[idx]: - continue + # If a mention was detected, return the new end index + if matched: + return end_idx - word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) - if offset_mapping is not None: - start_ind, end_ind = offset_mapping[idx] - word_ref = sentence[start_ind:end_ind] - if getattr( - self.tokenizer._tokenizer.model, "continuing_subword_prefix", None - ): - # This is a BPE, word aware tokenizer, there is a correct - # way to fuse tokens - is_subword = len(word) != len(word_ref) - else: - # This is a fallback heuristic. This will fail most likely - # on any kind of text + punctuation mixtures that will be - # considered "words". Non word aware models cannot do - # better than this unfortunately. 
+
+
+@functools.cache
+def load_trace_grammar() -> Lark:
+    return Lark.open(
+        str(settings.GRAMMARS_DIR / "traces.lark"),
+        start="start",
+        # include start and end positions in the parse tree
+        propagate_positions=True,
+    )
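A side note on the decorator above: `functools.cache` memoizes the zero-argument call, so the grammar file is read and compiled only once per worker process, and every later call returns the same `Lark` instance (a small sketch relying only on standard `functools` semantics):

    parser_a = load_trace_grammar()
    parser_b = load_trace_grammar()
    assert parser_a is parser_b  # cached: no second disk read or grammar build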
+ """ - Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be - rewritten with first strategy as microsoft| - company| B-ENT I-ENT - """ - if aggregation_strategy in { - AggregationStrategy.NONE, - AggregationStrategy.SIMPLE, - }: - raise ValueError( - "NONE and SIMPLE strategies are invalid for word aggregation" - ) + def start(self, items: list): + if items: + return items[0] + return None, None - word_entities = [] - word_group = None - for entity in entities: - if word_group is None: - word_group = [entity] - elif entity["is_subword"]: - word_group.append(entity) - else: - word_entities.append( - self.aggregate_word(word_group, aggregation_strategy) - ) - word_group = [entity] - # Last item - word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) # type: ignore - return word_entities + def value(self, items: list): + return items - def group_sub_entities(self, entities: List[dict]) -> dict: - """ - Group together the adjacent tokens with the same entity predicted. + def traces(self, items): + item = items[0] + return item.meta.start_pos, item.meta.end_pos - Args: - entities (`dict`): The entities predicted by the pipeline. - """ - # Get the first entity in the entity group - entity = entities[0]["entity"].split("-")[-1] - scores = np.nanmean([entity["score"] for entity in entities]) - tokens = [entity["word"] for entity in entities] + def WS(self, token): + return Discard - entity_group = { - "entity_group": entity, - "score": np.mean(scores), - "word": self.tokenizer.convert_tokens_to_string(tokens), - "start": entities[0]["start"], - "end": entities[-1]["end"], - } - return entity_group + def OTHER(self, token): + return Discard - def get_tag(self, entity_name: str) -> Tuple[str, str]: - if entity_name.startswith("B-"): - bi = "B" - tag = entity_name[2:] - elif entity_name.startswith("I-"): - bi = "I" - tag = entity_name[2:] - else: - # It's not in B-, I- format - # Default to I- for continuation. - bi = "I" - tag = entity_name - return bi, tag + def junk(self, items): + return Discard - def group_entities(self, entities: List[dict]) -> List[dict]: - """ - Find and group together the adjacent tokens with the same entity - predicted. - Args: - entities (`dict`): The entities predicted by the pipeline. - """ +def detect_trace_mention(text: str, end_idx: int) -> int: + """Detect trace mentions that are relevant to include in the ingredient + list. 
- entity_groups = [] - entity_group_disagg: list[dict] = [] + :param text: the full text to process + :param end_idx: the end character index of the current ingredient list + :return: the new end index of the ingredient list, if any mention was + detected, or the initial end index otherwise + """ + if not text[end_idx:]: + return end_idx - for entity in entities: - if not entity_group_disagg: - entity_group_disagg.append(entity) - continue + initial_end_idx = end_idx + grammar = load_trace_grammar() + t = grammar.parse(text[end_idx:].lower()) + start_idx, end_idx_offset = TraceDetectionTransformer().transform(t) - # If the current entity is similar and adjacent to the previous - # entity, append it to the disaggregated entity group - # The split is meant to account for the "B" and "I" prefixes - # Shouldn't merge if both entities are B-type - bi, tag = self.get_tag(entity["entity"]) - last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) - - if tag == last_tag and bi != "B": - # Modify subword type to be previous_type - entity_group_disagg.append(entity) - else: - # If the current entity is different from the previous entity - # aggregate the disaggregated entity group - entity_groups.append(self.group_sub_entities(entity_group_disagg)) - entity_group_disagg = [entity] - if entity_group_disagg: - # it's the last entity, add it to the entity groups - entity_groups.append(self.group_sub_entities(entity_group_disagg)) + if start_idx != 0: + return initial_end_idx - return entity_groups + end_idx += end_idx_offset + return end_idx diff --git a/robotoff/prediction/ingredient_list/transformers_pipeline.py b/robotoff/prediction/ingredient_list/transformers_pipeline.py new file mode 100644 index 0000000000..047c83e27d --- /dev/null +++ b/robotoff/prediction/ingredient_list/transformers_pipeline.py @@ -0,0 +1,372 @@ +""" +This file has been copied and adapted from +https://github.com/huggingface/transformers/blob/v4.25.1/src/transformers/pipelines/token_classification.py + +The code is under Apache-2.0 license: +https://github.com/huggingface/transformers/blob/main/LICENSE + +We use Triton to serve the request, but still need NER prediction +post-processing, and HuggingFace transformers library provide this feature +nicely using `TokenClassificationPipeline`. 
+
+Most of the code was kept unchanged; the only modifications made were the
+following:
+
+- accept numpy arrays as input instead of Tensorflow/Pytorch tensors
+- remove unnecessary code (everything that is not related to post-processing)
+- `postprocess` now accepts a single sample (instead of a batched sample of
+  size 1)
+"""
+import enum
+import warnings
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+
+class AggregationStrategy(enum.Enum):
+    """All the valid aggregation strategies for TokenClassificationPipeline"""
+
+    NONE = "none"  # one entity per token, no grouping
+    SIMPLE = "simple"  # group adjacent tokens, no subword conflict resolution
+    FIRST = "first"  # on subword disagreement, use the first subword's label
+    AVERAGE = "average"  # average subword scores, then take the best label
+    MAX = "max"  # use the label of the subword with the highest score
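To make the last three strategies concrete, here is a small worked example with toy scores (assuming two labels, ["O", "B-ENT"], and one word split into the subword tokens "micro" + "soft"); it mirrors the logic of `aggregate_word` further below:

    import numpy as np

    # Per-subword label scores; columns are ["O", "B-ENT"].
    scores = [np.array([0.1, 0.9]), np.array([0.8, 0.2])]

    scores[0].argmax()                           # FIRST   -> "B-ENT" (0.9)
    max(scores, key=lambda s: s.max()).argmax()  # MAX     -> "B-ENT" (0.9)
    np.nanmean(np.stack(scores), axis=0)         # AVERAGE -> [0.45, 0.55],
                                                 # i.e. "B-ENT" (0.55)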
+
+
+class TokenClassificationPipeline:
+    default_input_names = "sequences"
+
+    def __init__(self, tokenizer, id2label):
+        self.tokenizer = tokenizer
+        self.id2label = id2label
+
+    def _sanitize_parameters(
+        self,
+        ignore_labels=None,
+        grouped_entities: Optional[bool] = None,
+        ignore_subwords: Optional[bool] = None,
+        aggregation_strategy: Optional[AggregationStrategy] = None,
+        offset_mapping: Optional[List[Tuple[int, int]]] = None,
+    ):
+        preprocess_params = {}
+        if offset_mapping is not None:
+            preprocess_params["offset_mapping"] = offset_mapping
+
+        postprocess_params = {}
+        if grouped_entities is not None or ignore_subwords is not None:
+            if grouped_entities and ignore_subwords:
+                aggregation_strategy = AggregationStrategy.FIRST
+            elif grouped_entities and not ignore_subwords:
+                aggregation_strategy = AggregationStrategy.SIMPLE
+            else:
+                aggregation_strategy = AggregationStrategy.NONE
+
+            if grouped_entities is not None:
+                warnings.warn(
+                    "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
+                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+                )
+            if ignore_subwords is not None:
+                warnings.warn(
+                    "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
+                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+                )
+
+        if aggregation_strategy is not None:
+            if isinstance(aggregation_strategy, str):
+                aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
+            if (
+                aggregation_strategy
+                in {
+                    AggregationStrategy.FIRST,
+                    AggregationStrategy.MAX,
+                    AggregationStrategy.AVERAGE,
+                }
+                and not self.tokenizer.is_fast
+            ):
+                raise ValueError(
+                    "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option "
+                    'to `"simple"` or use a fast tokenizer.'
+                )
+            postprocess_params["aggregation_strategy"] = aggregation_strategy
+        if ignore_labels is not None:
+            postprocess_params["ignore_labels"] = ignore_labels
+        return preprocess_params, {}, postprocess_params
+
+    def postprocess(
+        self,
+        model_outputs,
+        aggregation_strategy=AggregationStrategy.NONE,
+        ignore_labels=None,
+    ):
+        if ignore_labels is None:
+            ignore_labels = ["O"]
+        logits = model_outputs["logits"]
+        sentence = model_outputs["sentence"]
+        input_ids = model_outputs["input_ids"]
+        offset_mapping = model_outputs["offset_mapping"]  # may be None
+        special_tokens_mask = model_outputs["special_tokens_mask"]
+
+        # Numerically stable softmax over the label dimension
+        maxes = np.max(logits, axis=-1, keepdims=True)
+        shifted_exp = np.exp(logits - maxes)
+        scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+        pre_entities = self.gather_pre_entities(
+            sentence,
+            input_ids,
+            scores,
+            offset_mapping,
+            special_tokens_mask,
+            aggregation_strategy,
+        )
+        grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
+        # Filter anything that is in self.ignore_labels
+        entities = [
+            entity
+            for entity in grouped_entities
+            if entity.get("entity", None) not in ignore_labels
+            and entity.get("entity_group", None) not in ignore_labels
+        ]
+        return entities
+
+    def gather_pre_entities(
+        self,
+        sentence: str,
+        input_ids: np.ndarray,
+        scores: np.ndarray,
+        offset_mapping: Optional[List[Tuple[int, int]]],
+        special_tokens_mask: np.ndarray,
+        aggregation_strategy: AggregationStrategy,
+    ) -> List[dict]:
+        """Fuse various numpy arrays into dicts with all the information
+        needed for aggregation"""
+        pre_entities = []
+        for idx, token_scores in enumerate(scores):
+            # Filter special_tokens, they should only occur
+            # at the sentence boundaries since we're not encoding pairs of
+            # sentences so we don't have to keep track of those.
+            if special_tokens_mask[idx]:
+                continue
+
+            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
+            if offset_mapping is not None:
+                start_ind, end_ind = offset_mapping[idx]
+                word_ref = sentence[start_ind:end_ind]
+                if getattr(
+                    self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
+                ):
+                    # This is a BPE, word aware tokenizer, there is a correct
+                    # way to fuse tokens
+                    is_subword = len(word) != len(word_ref)
+                else:
+                    # This is a fallback heuristic. This will fail most likely
+                    # on any kind of text + punctuation mixtures that will be
+                    # considered "words". Non word aware models cannot do
+                    # better than this unfortunately.
+ if aggregation_strategy in { + AggregationStrategy.FIRST, + AggregationStrategy.AVERAGE, + AggregationStrategy.MAX, + }: + warnings.warn( + "Tokenizer does not support real words, using fallback heuristic", + UserWarning, + ) + is_subword = ( + start_ind > 0 + and " " not in sentence[start_ind - 1 : start_ind + 1] + ) + + if int(input_ids[idx]) == self.tokenizer.unk_token_id: + word = word_ref + is_subword = False + else: + start_ind = None + end_ind = None + is_subword = False + + pre_entity = { + "word": word, + "scores": token_scores, + "start": start_ind, + "end": end_ind, + "index": idx, + "is_subword": is_subword, + } + pre_entities.append(pre_entity) + return pre_entities + + def aggregate( + self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> List[dict]: + if aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + entities = [] + for pre_entity in pre_entities: + entity_idx = pre_entity["scores"].argmax() + score = pre_entity["scores"][entity_idx] + entity = { + "entity": self.id2label[entity_idx], + "score": score, + "index": pre_entity["index"], + "word": pre_entity["word"], + "start": pre_entity["start"], + "end": pre_entity["end"], + } + entities.append(entity) + else: + entities = self.aggregate_words(pre_entities, aggregation_strategy) + + if aggregation_strategy == AggregationStrategy.NONE: + return entities + return self.group_entities(entities) + + def aggregate_word( + self, entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> dict: + word = self.tokenizer.convert_tokens_to_string( + [entity["word"] for entity in entities] + ) + if aggregation_strategy == AggregationStrategy.FIRST: + scores = entities[0]["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.id2label[idx] + elif aggregation_strategy == AggregationStrategy.MAX: + max_entity = max(entities, key=lambda entity: entity["scores"].max()) + scores = max_entity["scores"] + idx = scores.argmax() + score = scores[idx] + entity = self.id2label[idx] + elif aggregation_strategy == AggregationStrategy.AVERAGE: + scores = np.stack([entity["scores"] for entity in entities]) + average_scores = np.nanmean(scores, axis=0) + entity_idx = average_scores.argmax() + entity = self.id2label[entity_idx] + score = average_scores[entity_idx] + else: + raise ValueError("Invalid aggregation_strategy") + new_entity = { + "entity": entity, + "score": score, + "word": word, + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return new_entity + + def aggregate_words( + self, entities: List[dict], aggregation_strategy: AggregationStrategy + ) -> List[dict]: + """ + Override tokens from a given word that disagree to force agreement on + word boundaries. 
+ + Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be + rewritten with first strategy as microsoft| + company| B-ENT I-ENT + """ + if aggregation_strategy in { + AggregationStrategy.NONE, + AggregationStrategy.SIMPLE, + }: + raise ValueError( + "NONE and SIMPLE strategies are invalid for word aggregation" + ) + + word_entities = [] + word_group = None + for entity in entities: + if word_group is None: + word_group = [entity] + elif entity["is_subword"]: + word_group.append(entity) + else: + word_entities.append( + self.aggregate_word(word_group, aggregation_strategy) + ) + word_group = [entity] + # Last item + word_entities.append(self.aggregate_word(word_group, aggregation_strategy)) # type: ignore + return word_entities + + def group_sub_entities(self, entities: List[dict]) -> dict: + """ + Group together the adjacent tokens with the same entity predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. + """ + # Get the first entity in the entity group + entity = entities[0]["entity"].split("-")[-1] + scores = np.nanmean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + "start": entities[0]["start"], + "end": entities[-1]["end"], + } + return entity_group + + def get_tag(self, entity_name: str) -> Tuple[str, str]: + if entity_name.startswith("B-"): + bi = "B" + tag = entity_name[2:] + elif entity_name.startswith("I-"): + bi = "I" + tag = entity_name[2:] + else: + # It's not in B-, I- format + # Default to I- for continuation. + bi = "I" + tag = entity_name + return bi, tag + + def group_entities(self, entities: List[dict]) -> List[dict]: + """ + Find and group together the adjacent tokens with the same entity + predicted. + + Args: + entities (`dict`): The entities predicted by the pipeline. 
+ """ + + entity_groups = [] + entity_group_disagg: list[dict] = [] + + for entity in entities: + if not entity_group_disagg: + entity_group_disagg.append(entity) + continue + + # If the current entity is similar and adjacent to the previous + # entity, append it to the disaggregated entity group + # The split is meant to account for the "B" and "I" prefixes + # Shouldn't merge if both entities are B-type + bi, tag = self.get_tag(entity["entity"]) + last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"]) + + if tag == last_tag and bi != "B": + # Modify subword type to be previous_type + entity_group_disagg.append(entity) + else: + # If the current entity is different from the previous entity + # aggregate the disaggregated entity group + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + entity_group_disagg = [entity] + if entity_group_disagg: + # it's the last entity, add it to the entity groups + entity_groups.append(self.group_sub_entities(entity_group_disagg)) + + return entity_groups diff --git a/robotoff/prediction/ocr/grammar.py b/robotoff/prediction/ocr/grammar.py index 326528177c..2645d8cc9f 100644 --- a/robotoff/prediction/ocr/grammar.py +++ b/robotoff/prediction/ocr/grammar.py @@ -131,7 +131,7 @@ def generate_terminal_symbols_text( """ ignore_ids = ignore_ids or set() texts = [] - taxonomy = get_taxonomy(taxonomy_type.name, offline=True) + taxonomy = get_taxonomy(taxonomy_type.name, offline=False) seen_set: dict[str, str] = {} node_id_names = extract_taxonomy_names( diff --git a/robotoff/settings.py b/robotoff/settings.py index 3282b69403..809701115f 100644 --- a/robotoff/settings.py +++ b/robotoff/settings.py @@ -155,6 +155,8 @@ def event_api() -> str: + "/data/taxonomies/packaging_materials.full.json", "packaging_recycling": BaseURLProvider.static(ServerType.off) + "/data/taxonomies/packaging_recycling.full.json", + "allergen": BaseURLProvider.static(ServerType.off) + + "/data/taxonomies/allergen.full.json", } _off_password = os.environ.get("OFF_PASSWORD", "") diff --git a/robotoff/taxonomy.py b/robotoff/taxonomy.py index 24ef7b1a02..fd5e9143a9 100644 --- a/robotoff/taxonomy.py +++ b/robotoff/taxonomy.py @@ -40,7 +40,7 @@ def generate_category_hierarchy( @cachetools.cached(cache=cachetools.TTLCache(maxsize=100, ttl=12 * 60 * 60)) # 12h -def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy: +def get_taxonomy(taxonomy_type: TaxonomyType | str, offline: bool = False) -> Taxonomy: """Return the taxonomy of type `taxonomy_type`. The taxonomy is cached in memory and locally on disk. 
 Every 12h, we check
@@ -57,8 +57,11 @@ def get_taxonomy(taxonomy_type: str, offline: bool = False) -> Taxonomy:
     if offline:
         return Taxonomy.from_path(str(settings.TAXONOMY_PATHS[taxonomy_type]))
 
+    taxonomy_type_enum = (
+        TaxonomyType[taxonomy_type] if isinstance(taxonomy_type, str) else taxonomy_type
+    )
     return _get_taxonomy(
-        TaxonomyType[taxonomy_type],
+        taxonomy_type_enum,
         force_download=False,
         cache_dir=settings.DATA_DIR / "taxonomies",
     )
diff --git a/robotoff/workers/tasks/import_image.py b/robotoff/workers/tasks/import_image.py
index 9a108ae307..f570e35b0a 100644
--- a/robotoff/workers/tasks/import_image.py
+++ b/robotoff/workers/tasks/import_image.py
@@ -1,3 +1,4 @@
+import dataclasses
 import datetime
 from pathlib import Path
 from typing import Optional
@@ -5,6 +6,7 @@
 import elasticsearch
 from elasticsearch.helpers import BulkIndexError
 from openfoodfacts import OCRResult
+from openfoodfacts.types import TaxonomyType
 from PIL import Image
 
 from robotoff import settings
@@ -32,10 +34,12 @@
     db,
     with_db,
 )
-from robotoff.off import generate_image_url, get_source_from_url
+from robotoff.off import generate_image_url, get_source_from_url, parse_ingredients
+from robotoff.prediction import ingredient_list
 from robotoff.prediction.upc_image import UPCImageType, find_image_is_upc
 from robotoff.products import get_product_store
 from robotoff.slack import NotifierFactory
+from robotoff.taxonomy import get_taxonomy
 from robotoff.triton import generate_clip_embedding
 from robotoff.types import (
     JSONType,
@@ -115,6 +119,18 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
         image_url=image_url,
         ocr_url=ocr_url,
     )
+    # Only extract ingredient lists for food products, as the model was not
+    # trained on non-food products
+    enqueue_job(
+        extract_ingredients_job,
+        get_high_queue(product_id),
+        # We add a higher timeout, as we request Product Opener to
+        # parse ingredient lists, which may take a while depending on
+        # the number of ingredient lists (~1s per ingredient list)
+        job_kwargs={"result_ttl": 0, "timeout": "2m"},
+        product_id=product_id,
+        ocr_url=ocr_url,
+    )
     # We make sure there is no concurrent insight processing by sending
     # the job to the same queue. The queue is selected based on the product
     # barcode. See `get_high_queue` documentation for more details.
@@ -573,3 +589,85 @@ def add_image_fingerprint_job(image_model_id: int):
         return
 
     add_image_fingerprint(image_model)
+
+
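Before the job itself, a quick illustration of the counting logic it applies (toy data; in the real code, `parse_ingredients` asks Product Opener to parse the detected text and returns one dict per ingredient, as in the integration test further below):

    ingredient_taxonomy = {"en:water", "en:salt"}  # stand-in for the real taxonomy
    parsed_ingredients = [
        {"id": "en:water", "text": "water"},
        {"id": "en:salt", "text": "salt"},
        {"id": "en:unknownium", "text": "unknownium"},  # made-up ingredient
    ]
    for ingredient in parsed_ingredients:
        ingredient["in_taxonomy"] = ingredient["id"] in ingredient_taxonomy
    known = sum(i["in_taxonomy"] for i in parsed_ingredients)
    # ingredients_n=3, known_ingredients_n=2, unknown_ingredients_n=1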
+ """ + source_image = get_source_from_url(ocr_url) + + with db: + image_model = ImageModel.get_or_none( + source_image=source_image, server_type=product_id.server_type.name + ) + + if not image_model: + logger.info("Missing image in DB for image %s", source_image) + return + + # Stop the job here if the image has already been processed + if ( + ImagePrediction.get_or_none( + image=image_model, model_name=ingredient_list.MODEL_NAME + ) + ) is not None: + return + + output = ingredient_list.predict_from_ocr(ocr_url) + logger.warning("predict_from_ocr output: %s", output) + entities: list[ + ingredient_list.IngredientPredictionAggregatedEntity + ] = output.entities # type: ignore + # (we know it's an aggregated entity, so we can ignore the type) + + image_prediction_data = dataclasses.asdict(output) + ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient) + + for entity in image_prediction_data["entities"]: + # This is just an extra check, we should have lang information + # available + if entity["lang"]: + lang_id = entity["lang"]["lang"] + try: + # Parse ingredients using Product Opener ingredient parser, + # and add it to the entity data + parsed_ingredients = parse_ingredients(entity["text"], lang_id) + except RuntimeError as e: + logger.info( + "Error while parsing ingredients, skipping " + "to the next ingredient list", + exc_info=e, + ) + continue + + known_ingredients_n = 0 + ingredients_n = len(parsed_ingredients) + for ingredient_data in parsed_ingredients: + ingredient_id = ingredient_data["id"] + ingredient_data["in_taxonomy"] = ( + ingredient_id in ingredient_taxonomy + ) + known_ingredients_n += int(ingredient_data["in_taxonomy"]) + + # We use the same terminology as Product Opener + entity["ingredients_n"] = ingredients_n + entity["known_ingredients_n"] = known_ingredients_n + entity["unknown_ingredients_n"] = ingredients_n - known_ingredients_n + entity["ingredients"] = parsed_ingredients + + ImagePrediction.create( + image=image_model, + type="ner", + model_name=ingredient_list.MODEL_NAME, + model_version=ingredient_list.MODEL_VERSION, + data=image_prediction_data, + timestamp=datetime.datetime.utcnow(), + max_confidence=max(entity.score for entity in entities), + ) + logger.info("create image prediction (ingredient detection) from %s", ocr_url) diff --git a/tests/integration/workers/__init__.py b/tests/integration/workers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/workers/tasks/__init__.py b/tests/integration/workers/tasks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/workers/tasks/test_import_image.py b/tests/integration/workers/tasks/test_import_image.py new file mode 100644 index 0000000000..aa98a0f7ed --- /dev/null +++ b/tests/integration/workers/tasks/test_import_image.py @@ -0,0 +1,168 @@ +import pytest + +from robotoff.models import ImagePrediction +from robotoff.prediction.ingredient_list import ( + IngredientPredictionAggregatedEntity, + IngredientPredictionOutput, +) +from robotoff.prediction.langid import LanguagePrediction +from robotoff.types import ProductIdentifier, ServerType +from robotoff.workers.tasks.import_image import extract_ingredients_job + +from ...models_utils import ImageModelFactory, ImagePredictionFactory, clean_db + + +@pytest.fixture(autouse=True) +def _set_up_and_tear_down(peewee_db): + with peewee_db: + clean_db() + # Run the test case. 
+ yield + + with peewee_db: + clean_db() + + +def test_extract_ingredients_job(mocker, peewee_db): + full_text = "Best product ever!\ningredients: water, salt, sugar." + entities = [ + IngredientPredictionAggregatedEntity( + start=19, + end=51, + raw_end=51, + score=0.9, + text="water, salt, sugar.", + lang=LanguagePrediction(lang="en", confidence=0.9), + ) + ] + parsed_ingredients = [ + { + "ciqual_food_code": "18066", + "id": "en:water", + "percent_estimate": 66.6666666666667, + "percent_max": 100, + "percent_min": 33.3333333333333, + "text": "water", + "vegan": "yes", + "vegetarian": "yes", + }, + { + "ciqual_food_code": "11058", + "id": "en:salt", + "percent_estimate": 16.6666666666667, + "percent_max": 50, + "percent_min": 0, + "text": "salt", + "vegan": "yes", + "vegetarian": "yes", + }, + { + "id": "en:sugar", + "percent_estimate": 16.6666666666667, + "percent_max": 33.3333333333333, + "percent_min": 0, + "text": "sugar", + "vegan": "yes", + "vegetarian": "yes", + }, + ] + ingredient_list_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.ingredient_list" + ) + parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients", + return_value=parsed_ingredients, + ) + ingredient_list_mocker.predict_from_ocr.return_value = IngredientPredictionOutput( + entities=entities, text=full_text + ) + ingredient_list_mocker.MODEL_NAME = "ingredient-detection" + ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0" + + barcode = "1234567890123" + ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" + + with peewee_db: + image = ImageModelFactory( + barcode=barcode, server_type=ServerType.off, image_id="1" + ) + extract_ingredients_job( + ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url + ) + ingredient_list_mocker.predict_from_ocr.assert_called_once_with(ocr_url) + parse_ingredients_mocker.assert_called_once_with("water, salt, sugar.", "en") + image_prediction = ImagePrediction.get_or_none( + ImagePrediction.model_name == "ingredient-detection", + ImagePrediction.image_id == image.id, + ) + assert image_prediction is not None + assert image_prediction.data == { + "text": full_text, + "entities": [ + { + "end": 51, + "lang": {"lang": "en", "confidence": 0.9}, + "text": "water, salt, sugar.", + "score": 0.9, + "start": 19, + "raw_end": 51, + "ingredients_n": 3, + "known_ingredients_n": 3, + "unknown_ingredients_n": 0, + "ingredients": [ + {"in_taxonomy": True, **ingredient} + for ingredient in parsed_ingredients + ], + } + ], + } + assert image_prediction.max_confidence == 0.9 + assert image_prediction.type == "ner" + assert image_prediction.model_name == "ingredient-detection" + assert image_prediction.model_version == "ingredient-detection-1.0" + + +def test_extract_ingredients_job_missing_image(mocker, peewee_db): + ingredient_list_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.ingredient_list" + ) + parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients" + ) + barcode = "1234567890123" + ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" + + with peewee_db: + extract_ingredients_job( + ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url + ) + ingredient_list_mocker.predict_from_ocr.assert_not_called() + parse_ingredients_mocker.assert_not_called() + + +def test_extract_ingredients_job_existing_image_prediction(mocker, peewee_db): + ingredient_list_mocker = mocker.patch( + 
"robotoff.workers.tasks.import_image.ingredient_list" + ) + parse_ingredients_mocker = mocker.patch( + "robotoff.workers.tasks.import_image.parse_ingredients" + ) + ingredient_list_mocker.MODEL_NAME = "ingredient-detection" + ingredient_list_mocker.MODEL_VERSION = "ingredient-detection-1.0" + barcode = "1234567890123" + ocr_url = "https://images.openfoodfacts.org/images/products/123/456/789/0123/1.json" + + with peewee_db: + image = ImageModelFactory( + barcode=barcode, server_type=ServerType.off, image_id="1" + ) + ImagePredictionFactory( + image=image, + model_name="ingredient-detection", + model_version="ingredient-detection-1.0", + ) + extract_ingredients_job( + ProductIdentifier(barcode, ServerType.off), ocr_url=ocr_url + ) + ingredient_list_mocker.predict_from_ocr.assert_not_called() + parse_ingredients_mocker.assert_not_called() diff --git a/tests/unit/prediction/ingredient_list/test_postprocess.py b/tests/unit/prediction/ingredient_list/test_postprocess.py new file mode 100644 index 0000000000..59216bbd90 --- /dev/null +++ b/tests/unit/prediction/ingredient_list/test_postprocess.py @@ -0,0 +1,115 @@ +import pytest + +from robotoff.prediction.ingredient_list.postprocess import ( + ORGANIC_MENTIONS_RE, + detect_additional_mentions, + detect_trace_mention, +) + + +@pytest.mark.parametrize( + "text,match", + [ + ("Ingrédients issus de l'agriculture biologique", True), + ("*Ingrédients agricoles issus de l'agriculture biologique", True), + ("*issu de l'agriculture biologique", True), + ("issu de l'agriculture biologique", True), + ("*Produits issus de l'agriculture biologique", True), + ("Produit issu de l'agriculture biologique", True), + ("\"produit issu de l'agriculture durable", True), + ("*= produits issus de l'agriculture biologique", True), + ("* = ingrédients issus de l'agriculture durable", True), + ("* Produit issu de l'Agriculture Biologique", True), + ("*organic", True), + ('"aus biologischer Landwirtschaft', True), + ("*de cultivo ecologico certificado", True), + ("organic", False), + ("agriculture biologique", False), + ("produit issu", False), + ], +) +def test_organic_mention_detection(text: str, match: bool): + assert (ORGANIC_MENTIONS_RE.match(text) is not None) is match + + +@pytest.mark.parametrize( + "text, initial_end_idx, new_end_idx", + [ + (", *ingrédients issus de l'agriculture biologique", 0, 48), + ( + "Eau, poireaux*, carottes*, navet*. *= produits issus de l'agriculture durable. Valeurs nutritionnelles", + 33, + 77, + ), + ( + "Eau, poireaux*, carottes*, navet*, *ingrédients issus de l'agriculture bio. Valeurs nutritionnelles", + 33, + 74, + ), + ( + "Eau, poireaux*, carottes*, navet*, *ingrédients issus de l'agriculture bio. Peut contenir des traces de noix. Valeurs nutritionnelles", + 33, + 108, + ), + ( + "Eau, poireaux*, carottes*, navet*. Peut contenir des traces de noix. *ingrédients issus de l'agriculture bio. Valeurs nutritionnelles", + 33, + 108, + ), + # If no mention was detected, reset the end index to its initial value + ( + "Eau, poireaux*, carottes*, navet*, ", + 33, + 33, + ), + ], +) +def test_detect_additional_mentions(text: str, initial_end_idx, new_end_idx: int): + assert detect_additional_mentions(text, initial_end_idx) == new_end_idx + + +@pytest.mark.parametrize( + "text, new_end_idx", + [ + ("Peut contenir des traces de fruit à coque.", 41), + ( + "Peut contenir des traces de soja, lait, sésame, amande, noisette, noix de cajou et arachide !", + 91, + ), + ("Eau, banane", 0), + ("peut contenir des traces d'arachides et de cacahuètes. 
Attention", 53), + ( + "produit élaboré dans un atelier utilisant du lait demi-écrémé et du gorgonzola. OTHER", + 78, + ), + # This should not match, as the string does not start with the + # allergen mention + ("OTHER. Peut contenir des traces d'arachides et de cacahuètes", 0), + ("contient naturellement du jaune d'oeuf. Info nutritionnelles", 38), + # This should not match, as the first word is "acontient" and not + # "contient" (we check for word boundaries) + ("acontient naturellement du jaune d'oeuf. Info nutritionnelles", 0), + # EN + ("contains wheat", 14), + ], +) +def test_detect_trace_mention(text: str, new_end_idx: int): + assert detect_trace_mention(text, end_idx=0) == new_end_idx + + +@pytest.mark.parametrize( + "text", + [ + # FR + "Peut contenir des traces de fruit à coque", + # ES + "CONTIENE LECHE", + "Contiene lecitina de soya", + "Este producto contiene espelta, trigo y gluten", + "PUEDE CONTENER LECHE", + ], +) +def test_detect_trace_mention_full_match(text: str): + """Test that the trace mention detection works (only full matches are + tested here).""" + assert detect_trace_mention(text, end_idx=0) == len(text)