diff --git a/README.md b/README.md index b445e5c..5a26396 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ fractional and ordinal numbers, and more. | `pl` (Polish) | ✅ | 🚧 | ✅ | ✅ | | `pt` (Portuguese) | ✅ | ✅ | ✅ | ✅ | | `mwl` (Mirandese) | ✅ | ✅ | ✅ | ✅ | +| `ast` (Asturian) | ✅ | ✅ | ✅ | ✅ | | `ru` (Russian) | ✅ | 🚧 | ✅ | ✅ | | `sv` (Swedish) | ✅ | ✅ | ✅ | ❌ | | `sl` (Slovenian) | ✅ | 🚧 | ❌ | ❌ | diff --git a/ovos_number_parser/__init__.py b/ovos_number_parser/__init__.py index 2d2e5bf..4a72919 100644 --- a/ovos_number_parser/__init__.py +++ b/ovos_number_parser/__init__.py @@ -1,7 +1,9 @@ +from typing import Optional from typing import Union from unicode_rbnf import RbnfEngine, FormatPurpose +from ovos_number_parser.numbers_ast import AST from ovos_number_parser.numbers_az import numbers_to_digits_az, extract_number_az, is_fractional_az, pronounce_number_az from ovos_number_parser.numbers_ca import numbers_to_digits_ca, pronounce_number_ca, is_fractional_ca, extract_number_ca from ovos_number_parser.numbers_cs import numbers_to_digits_cs, pronounce_number_cs, is_fractional_cs, extract_number_cs @@ -18,13 +20,11 @@ from ovos_number_parser.numbers_gl import pronounce_number_gl, extract_number_gl, is_fractional_gl, numbers_to_digits_gl from ovos_number_parser.numbers_hu import pronounce_number_hu, pronounce_ordinal_hu from ovos_number_parser.numbers_it import (extract_number_it, pronounce_number_it, is_fractional_it) +from ovos_number_parser.numbers_mwl import MWL from ovos_number_parser.numbers_nl import numbers_to_digits_nl, pronounce_number_nl, pronounce_ordinal_nl, \ extract_number_nl, is_fractional_nl from ovos_number_parser.numbers_pl import numbers_to_digits_pl, pronounce_number_pl, extract_number_pl, is_fractional_pl -from ovos_number_parser.numbers_pt import PortugueseVariant, pronounce_fraction_pt, numbers_to_digits_pt, \ - pronounce_number_pt, is_fractional_pt, extract_number_pt, pronounce_ordinal_pt, is_ordinal_pt -from 
ovos_number_parser.numbers_mwl import pronounce_fraction_mwl, numbers_to_digits_mwl, \ - pronounce_number_mwl, is_fractional_mwl, extract_number_mwl, pronounce_ordinal_mwl, is_ordinal_mwl +from ovos_number_parser.numbers_pt import PortugueseVariant, PT_PT, PT_BR from ovos_number_parser.numbers_ru import numbers_to_digits_ru, pronounce_number_ru, extract_number_ru, is_fractional_ru from ovos_number_parser.numbers_sl import pronounce_number_sl from ovos_number_parser.numbers_sv import pronounce_number_sv, pronounce_ordinal_sv, extract_number_sv, \ @@ -33,7 +33,7 @@ from ovos_number_parser.util import Scale, GrammaticalGender, DigitPronunciation -def numbers_to_digits(utterance: str, lang: str, scale: Scale = Scale.LONG) -> str: +def numbers_to_digits(utterance: str, lang: str, scale: Optional[Scale] = None) -> str: """ Convert written numbers in a text string to their digit representations for the specified language and numerical scale. @@ -50,6 +50,8 @@ def numbers_to_digits(utterance: str, lang: str, scale: Scale = Scale.LONG) -> s """ if lang.startswith("az"): return numbers_to_digits_az(utterance) + if lang.startswith("ast"): + return AST.numbers_to_digits(utterance) if lang.startswith("ca"): return numbers_to_digits_ca(utterance) if lang.startswith("gl"): @@ -69,9 +71,9 @@ def numbers_to_digits(utterance: str, lang: str, scale: Scale = Scale.LONG) -> s if lang.startswith("pl"): return numbers_to_digits_pl(utterance) if lang.startswith("pt"): - return numbers_to_digits_pt(utterance, scale=scale) + return PT_PT.numbers_to_digits(utterance, scale=scale) if lang.startswith("mwl"): - return numbers_to_digits_mwl(utterance, scale=scale) + return MWL.numbers_to_digits(utterance, scale=scale) if lang.startswith("ru"): return numbers_to_digits_ru(utterance) if lang.startswith("uk"): @@ -81,10 +83,12 @@ def numbers_to_digits(utterance: str, lang: str, scale: Scale = Scale.LONG) -> s def pronounce_number(number: Union[int, float], lang: str, places: int = 3, - short_scale: 
bool = True, - scientific: bool = False, ordinals: bool = False, + short_scale: Optional[bool] = None, # DEPRECATED + scientific: bool = False, + ordinals: bool = False, digits: DigitPronunciation = DigitPronunciation.FULL_NUMBER, - gender: GrammaticalGender = GrammaticalGender.MASCULINE) -> str: + gender: GrammaticalGender = GrammaticalGender.MASCULINE, + scale: Optional[Scale] = None) -> str: """ Return the spoken representation of a number in the specified language. @@ -106,13 +110,19 @@ def pronounce_number(number: Union[int, float], lang: str, Raises: NotImplementedError: If the specified language is not supported. """ - scale = Scale.SHORT if short_scale else Scale.LONG # TODO migrate function kwarg to accept Scale enum + scale = scale or Scale.SHORT + if short_scale is not None: + # TODO log warning + pass + short_scale = scale == Scale.SHORT if lang.startswith("en"): return pronounce_number_en(number, places, short_scale, scientific, ordinals) if lang.startswith("az"): return pronounce_number_az(number, places, short_scale, scientific, ordinals) if lang.startswith("ca"): return pronounce_number_ca(number, places) + if lang.startswith("ast"): + return AST.pronounce_number(number, places, scale, ordinals, digits, gender) if lang.startswith("cs"): return pronounce_number_en(number, places, short_scale, scientific, ordinals) if lang.startswith("da"): @@ -138,14 +148,11 @@ def pronounce_number(number: Union[int, float], lang: str, if lang.startswith("pl"): return pronounce_number_pl(number, places, short_scale, scientific, ordinals) if lang.startswith("pt"): - variant = PortugueseVariant.BR if "br" in lang.lower() else PortugueseVariant.PT - return pronounce_number_pt(number, places, scale=scale, - variant=variant, ordinals=ordinals, - digits=digits, gender=gender) + if "br" in lang.lower(): + return PT_BR.pronounce_number(number, places, scale, ordinals, digits, gender) + return PT_PT.pronounce_number(number, places, scale, ordinals, digits, gender) if 
lang.startswith("mwl"): - return pronounce_number_mwl(number, places, - scale=scale, ordinals=ordinals, - digits=digits, gender=gender) + return MWL.pronounce_number(number, places, scale, ordinals, digits, gender) if lang.startswith("ru"): return pronounce_number_ru(number, places, short_scale, scientific, ordinals) if lang.startswith("sl"): @@ -163,7 +170,7 @@ def pronounce_number(number: Union[int, float], lang: str, raise NotImplementedError(f"Unsupported language: '{lang}'") from err -def pronounce_fraction(fraction_word: str, lang: str, scale: Scale = Scale.LONG) -> str: +def pronounce_fraction(fraction_word: str, lang: str, scale: Optional[Scale] = None) -> str: """ Return the spoken form of a fraction string (e.g., "1/2" as "one half") for the specified language and numerical scale. @@ -178,17 +185,20 @@ def pronounce_fraction(fraction_word: str, lang: str, scale: Scale = Scale.LONG) NotImplementedError: If the specified language is not supported. """ if lang.startswith("pt"): - variant = PortugueseVariant.BR if "br" in lang.lower() else PortugueseVariant.PT - return pronounce_fraction_pt(fraction_word, scale=scale, variant=variant) + return PT_BR.pronounce_fraction(fraction_word, scale=scale) if "br" in lang.lower() \ + else PT_PT.pronounce_fraction(fraction_word, scale=scale) + elif lang.startswith("ast"): + return AST.pronounce_fraction(fraction_word, scale=scale) elif lang.startswith("mwl"): - return pronounce_fraction_mwl(fraction_word, scale=scale) + return MWL.pronounce_fraction(fraction_word, scale=scale) else: raise NotImplementedError(f"unsupported language: {lang}") def pronounce_ordinal(number: Union[int, float], lang: str, - short_scale: bool = True, - gender: GrammaticalGender = GrammaticalGender.MASCULINE) -> str: + short_scale: Optional[bool] = None, # DEPRECATED + gender: GrammaticalGender = GrammaticalGender.MASCULINE, + scale: Optional[Scale] = None) -> str: """ Return the spoken ordinal form of a number in the specified language. 
@@ -204,12 +214,18 @@ def pronounce_ordinal(number: Union[int, float], lang: str, Raises: NotImplementedError: If the language is not supported. """ - scale = Scale.SHORT if short_scale else Scale.LONG # TODO migrate function kwarg to accept Scale enum + scale = scale or Scale.SHORT + if short_scale is not None: + # TODO log warning + pass + short_scale = scale == Scale.SHORT if lang.startswith("pt"): - variant = PortugueseVariant.BR if "br" in lang.lower() else PortugueseVariant.PT - return pronounce_ordinal_pt(number, scale=scale, variant=variant, gender=gender) + return PT_BR.pronounce_ordinal(number, scale=scale, gender=gender) if "br" in lang.lower() \ + else PT_PT.pronounce_ordinal(number, scale=scale, gender=gender) if lang.startswith("mwl"): - return pronounce_ordinal_mwl(number, scale=scale, gender=gender) + return MWL.pronounce_ordinal(number, scale=scale, gender=gender) + if lang.startswith("ast"): + return AST.pronounce_ordinal(number, scale=scale, gender=gender) if lang.startswith("da"): return pronounce_ordinal_da(number) if lang.startswith("de"): @@ -229,7 +245,10 @@ def pronounce_ordinal(number: Union[int, float], lang: str, raise NotImplementedError(f"Unsupported language: '{lang}'") from err -def extract_number(text: str, lang: str, short_scale: bool = True, ordinals: bool = False) -> Union[int, float, bool]: +def extract_number(text: str, lang: str, + short_scale: Optional[bool] = None, # DEPRECATED + ordinals: bool = False, + scale: Optional[Scale] = None) -> Union[int, float, bool]: """Takes in a string and extracts a number. 
Assumes only 1 number is in the string, does NOT handle multiple numbers @@ -247,7 +266,11 @@ def extract_number(text: str, lang: str, short_scale: bool = True, ordinals: boo (int, float or False): The number extracted or False if the input text contains no numbers """ - scale = Scale.SHORT if short_scale else Scale.LONG # TODO migrate function kwarg to accept Scale enum + scale = scale or Scale.SHORT + if short_scale is not None: + # TODO log warning + pass + short_scale = scale == Scale.SHORT if lang.startswith("en"): return extract_number_en(text, short_scale, ordinals) if lang.startswith("az"): @@ -277,10 +300,12 @@ def extract_number(text: str, lang: str, short_scale: bool = True, ordinals: boo if lang.startswith("pl"): return extract_number_pl(text, short_scale, ordinals) if lang.startswith("pt"): - variant = PortugueseVariant.BR if "br" in lang.lower() else PortugueseVariant.PT - return extract_number_pt(text, scale=scale, ordinals=ordinals, variant=variant) + return PT_BR.extract_number(text, scale=scale, ordinals=ordinals) if "br" in lang.lower() \ + else PT_PT.extract_number(text, scale=scale, ordinals=ordinals) if lang.startswith("mwl"): - return extract_number_mwl(text, scale=scale, ordinals=ordinals) + return MWL.extract_number(text, scale=scale, ordinals=ordinals) + if lang.startswith("ast"): + return AST.extract_number(text, scale=scale, ordinals=ordinals) if lang.startswith("ru"): return extract_number_ru(text, short_scale, ordinals) if lang.startswith("sv"): @@ -290,22 +315,29 @@ def extract_number(text: str, lang: str, short_scale: bool = True, ordinals: boo raise NotImplementedError(f"Unsupported language: '{lang}'") -def is_fractional(input_str: str, lang: str, short_scale: bool = True) -> Union[bool, float]: +def is_fractional(input_str: str, lang: str, + short_scale: Optional[bool] = None, # DEPRECATED + scale: Optional[Scale] = None) -> Union[bool, float]: """ This function takes the given text and checks if it is a fraction. 
- Used by most of the number exractors. + Used by most of the number extractors. Will return False on phrases that *contain* a fraction. Only detects exact matches. To pull a fraction from a string, see extract_number() Args: input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False + short_scale (bool): DEPRECATED, use scale enum instead lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. Returns: (bool) or (float): False if not a fraction, otherwise the fraction """ + scale = scale or Scale.SHORT + if short_scale is not None: + # TODO log warning + pass + short_scale = scale == Scale.SHORT if lang.startswith("en"): return is_fractional_en(input_str, short_scale) if lang.startswith("az"): @@ -333,9 +365,11 @@ def is_fractional(input_str: str, lang: str, short_scale: bool = True) -> Union[ if lang.startswith("pl"): return is_fractional_pl(input_str, short_scale) if lang.startswith("pt"): - return is_fractional_pt(input_str) + return PT_PT.is_fractional(input_str) if lang.startswith("mwl"): - return is_fractional_mwl(input_str) + return MWL.is_fractional(input_str) + if lang.startswith("ast"): + return AST.is_fractional(input_str) if lang.startswith("ru"): return is_fractional_ru(input_str, short_scale) if lang.startswith("sv"): @@ -358,9 +392,11 @@ def is_ordinal(input_str: str, lang: str) -> Union[bool, float]: corresponding to the ordinal """ if lang.startswith("pt"): - return is_ordinal_pt(input_str) + return PT_PT.is_ordinal(input_str) if lang.startswith("mwl"): - return is_ordinal_mwl(input_str) + return MWL.is_ordinal(input_str) + if lang.startswith("ast"): + return AST.is_ordinal(input_str) if lang.startswith("en"): return is_ordinal_en(input_str) if lang.startswith("de"): diff --git a/ovos_number_parser/numbers_ast.py b/ovos_number_parser/numbers_ast.py new file mode 100644 index 0000000..fb2f0df --- /dev/null +++ b/ovos_number_parser/numbers_ast.py 
@@ -0,0 +1,248 @@ +from ovos_number_parser.util import (Scale, GrammaticalGender, NumberVocabulary, RomanceNumberExtractor) + + +def swap_gender_ast(word: str, gender: GrammaticalGender) -> str: + """Swap ordinal/adjective endings between masculine and feminine where applicable. + + For Asturian, ordinals typically end in -u (masc.) and -a (fem.). + """ + if gender == GrammaticalGender.FEMININE: + if word.endswith('o'): # neuter + return word[:-1] + 'a' + if word.endswith('u'): # masculine + return word[:-1] + 'a' + if word.endswith('os'): # plural masculine + return word[:-2] + 'as' + else: + # NOTE: need a native speaker to review, does neuter apply to numbers? + #if word.endswith('o'): # neuter + # return word[:-1] + 'u' + if word.endswith('a'): # feminine + return word[:-1] + 'u' + if word.endswith('es'): # plural neuter + return word[:-2] + 'os' + return word + + +def pluralize_ast(word: str): + if word.endswith("ón"): + return word[:-2] + "ones" + if word.endswith("a"): + return word[:-1] + "es" + if word.endswith("u"): + return word[:-1] + "os" + if not word.endswith("s"): + return word + "s" + return word + + +_AST = NumberVocabulary( + LANG="ast", + swap_gender=swap_gender_ast, # used for female forms + pluralize=pluralize_ast, # use for plural forms + + HUNDRED_PARTICLE="ciento", # how to read "1XX" + DENOMINATOR_PARTICLE="avos", # for fractions X / N {PARTICLE} + DIVIDED_BY_ZERO="a dividir por zero", # how to read X/0 values + NO_PREV_UNIT=[100, 1000], # "mil" vs "um mil" / "cem" vs "um cem" + NO_PLURAL=[1000], # "dois mil" vs "dois mils" / "dois milhões" vs "dois milhão" + + NUMBER_OVERFLOW="número exageradamente grande", + DEFAULT_SCALE=Scale.LONG, + JOIN_WORD=["y"], + + JOINER_ON_TWENTYS=True, # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS=False, # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS=False, # add JOIN_WORD from 1000-10000 - "mil e duzentos" + + DECIMAL_MARKER=["punto", "coma", ".", ","], + 
NEGATIVE_SIGN=["menos"], + UNITS={ + 0: 'cero', + 1: 'un', + 2: 'dos', + 3: 'tres', + 4: 'cuatro', + 5: 'cinco', + 6: 'seis', + 7: 'siete', + 8: 'ocho', + 9: 'nueve' + }, + TENS={ + 10: 'diez', + 11: 'once', + 12: 'doce', + 13: 'trece', + 14: 'catorce', + 15: 'quince', + 16: 'dieciséis', + 17: 'diecisiete', + 18: 'dieciocho', + 19: 'diecinueve', + 20: 'venti', + 21: 'ventiún', + 22: 'ventidós', + 23: 'ventitrés', + 24: 'venticuatro', + 25: 'venticinco', + 26: 'ventiséis', + 27: 'ventisiete', + 28: 'ventiocho', + 29: 'ventinueve', + 30: 'trenta', + 40: 'cuarenta', + 50: 'cincuenta', + 60: 'sesenta', + 70: 'setenta', + 80: 'ochenta', + 90: 'noventa' + }, + HUNDREDS={ + 100: 'cien', + 200: 'doscientos', + 300: 'trescientos', + 400: 'cuatrocientos', + 500: 'quinientos', + 600: 'seiscientos', + 700: 'setecientos', + 800: 'ochocientos', + 900: 'novecientos' + }, + FRACTION={ + 2: 'mediu', + 3: 'terciu', + 4: 'cuartu', + 5: 'quintu', + 6: 'sestu', + 7: 'séptimu', + 8: 'octavu', + 9: 'novenu', + 10: 'décimu', + 11: 'onceavos', + 12: 'doceavos' + }, + FRACTION_FEMALE={ + 2: "media" # una y media -> 1.5 + }, + SHORT_SCALE={ + 10 ** 6: "millón", + 10 ** 3: "mil" + }, + LONG_SCALE={ + 10 ** 6: "millón", + 10 ** 3: "mil" + }, + + GENDERED_SPELLINGS={ + GrammaticalGender.FEMININE: { + 1: "una" + } + }, + DIGIT_SPELLINGS={}, + ALT_SPELLINGS={ + "unu": 1, + "oito": 8, # https://en.wiktionary.org/wiki/ocho#Asturian + "otso": 8, + "uechu": 8, # medieval form + "dolce": 12, + "selce": 16, # https://en.wiktionary.org/wiki/diecis%C3%A9is#Asturian + "ventiuno": 21, + + # ordinals with spanish influenced alt forms + 'decimoprimeru': 11, + "decimosegundu": 12, + "decimoterceru": 13, + "decimocuartu": 14, + "decimoquintu": 15, + 'ventésimu': 20, + 'trentésimu': 30, + }, + ORDINAL_UNITS={ + 1: 'primeru', + 2: 'segundu', + 3: 'terceru', + 4: 'cuartu', + 5: 'quintu', + 6: 'sestu', + 7: 'séptimu', + 8: 'octavu', + 9: 'novenu' + }, + ORDINAL_TENS={ + 10: 'décimu', + 11: 'oncenu', + 12: 
'docenu', + 13: 'trecenu', + 14: 'catorcenu', + 15: 'quincenu', + 16: 'decimosestu', + 17: 'decimoséptimu', + 18: 'decimoctavu', + 19: 'decimonovenu', + 20: 'ventenu', + 30: 'trentenu', + 40: 'cuarentenu', + 50: 'cincuentenu', + 60: 'sesentenu', + 70: 'setentenu', + 80: 'ochentenu', + 90: 'noventenu' + }, + ORDINAL_HUNDREDS={ + 100: 'centésimu', + 200: "doscientesimu", + 300: "trescientesimu", + 400: "cuatrocientesimu", + 500: "quinientesimu", + 600: "seiscientesimu", + 700: "setecientesimu", + 800: "ochocientesimu", + 900: "novecientesimu" + }, + ORDINAL_SHORT_SCALE={ + 10 ** 3: "milésimu", + 10 ** 6: "millonésimu", + 10 ** 9: "billonésimu", + 10 ** 12: "trillonésimu", + 10 ** 15: "cuatrillonésimu", + 10 ** 18: "quintillonésimu", + 10 ** 21: "sextillonésimu", + }, + ORDINAL_LONG_SCALE={ + 10 ** 3: "milésimu", + 10 ** 6: "millonésimu", + 10 ** 12: "billonésimu", + 10 ** 18: "trillonésimu", + 10 ** 24: "cuatrillonésimu", + 10 ** 30: "quintillonésimu", + 10 ** 36: "sextillonésimu", + } +) + +AST = RomanceNumberExtractor(_AST) + +if __name__ == '__main__': + print('--- Asturian number tests ---') + print('16 ->', AST.pronounce_number(16)) + print('21 ->', AST.pronounce_number(21)) + print('35 ->', AST.pronounce_number(35)) + print('101 ->', AST.pronounce_number(101)) + print('1,234 ->', AST.pronounce_number(1234)) + print('1,000,000 ->', AST.pronounce_number(1_000_000)) + + print('\n--- Ordinals ---') + print('1st (m):', AST.pronounce_ordinal(1)) + print('1st (f):', AST.pronounce_ordinal(1, GrammaticalGender.FEMININE)) + print('23rd (m):', AST.pronounce_ordinal(23)) + print('100th:', AST.pronounce_ordinal(100)) + + print('\n--- Extraction ---') + print("'un millón' ->", AST.extract_number('un millón')) + print("'dos millones trescientos' ->", AST.extract_number('dos millones trescientos')) + print("'ventiuno' ->", AST.extract_number('ventiuno')) + print("'tres cuartos' (fraction word) ->", AST.extract_number('tres cuartos')) + + print('\n--- numbers_to_digits ---') + 
print(AST.numbers_to_digits('hai dos millones cincuenta persoas')) + print(AST.numbers_to_digits('merquei ventiuno panes')) diff --git a/ovos_number_parser/numbers_gl.py b/ovos_number_parser/numbers_gl.py index c5d4f3a..0f8a44d 100644 --- a/ovos_number_parser/numbers_gl.py +++ b/ovos_number_parser/numbers_gl.py @@ -1,666 +1,542 @@ from collections import OrderedDict -from typing import List - -from ovos_number_parser.util import (convert_to_mixed_fraction, look_for_fractions, +from typing import List, Optional, Union +import warnings +from ovos_number_parser.util import (convert_to_mixed_fraction, look_for_fractions, DigitPronunciation, is_numeric, tokenize, Token) -_NUM_STRING_GL = { - 0: 'cero', - 1: 'un', - 2: 'dous', - 3: 'tres', - 4: 'catro', - 5: 'cinco', - 6: 'seis', - 7: 'sete', - 8: 'oito', - 9: 'nove', - 10: 'dez', - 11: 'once', - 12: 'doce', - 13: 'trece', - 14: 'catorce', - 15: 'quince', - 16: 'dezaseis', - 17: 'dezasete', - 18: 'dezaoito', - 19: 'dezanove', - 20: 'vinte', - 30: 'trinta', - 40: 'corenta', - 50: 'cincuenta', - 60: 'sesenta', - 70: 'setenta', - 80: 'oitenta', - 90: 'noventa' -} - -_STRING_NUM_GL = { - "cero": 0, - "un": 1, - "unha": 1, - "dous": 2, - "tres": 3, - "catro": 4, - "cinco": 5, - "seis": 6, - "sete": 7, - "oito": 8, - "nove": 9, - "dez": 10, - "once": 11, - "doce": 12, - "trece": 13, - "catorce": 14, - "quince": 15, - "dezaseis": 16, - "dezasete": 17, - "dezaoito": 18, - "dezanove": 19, - "vinte": 20, - "vinte e un": 21, - "vinte e dous": 22, - "vinte e tres": 23, - "vinte e catro": 24, - "vinte e cinco": 25, - "vinte e seis": 26, - "vinte e sete": 27, - "vinte e oito": 28, - "vinte e nove": 29, - "trinta": 30, - "corenta": 40, - "cincuenta": 50, - "sesenta": 60, - "setenta": 70, - "oitenta": 80, - "noventa": 90, - "cen": 100, - "cento": 100, - "douscentos": 200, - "duascentas": 200, - "trescentos": 300, - "trescentas": 300, - "catrocentos": 400, - "catrocentas": 400, - "cincocentos": 500, - "cincocentas": 500, - "seiscentos": 
600, - "seiscentas": 600, - "setecentos": 700, - "setecentas": 700, - "oitocentos": 800, - "oitocentas": 800, - "novecentos": 900, - "novecentas": 900, - "mil": 1000} - -_FRACTION_STRING_GL = { - 2: 'medio', - 3: 'terzo', - 4: 'cuarto', - 5: 'quinto', - 6: 'sexto', - 7: 'séptimo', - 8: 'oitavo', - 9: 'noveno', - 10: 'décimo', - 11: 'onceavo', - 12: 'doceavo', - 13: 'treceavo', - 14: 'catorceavo', - 15: 'quinceavo', - 16: 'dezaseisavo', - 17: 'dezaseteavo', - 18: 'dezaoitoavo', - 19: 'dezanoveavo', - 20: 'vinteavo' -} - -# https://www.grobauer.at/gl_eur/zahlnamen.php -_LONG_SCALE_GL = OrderedDict([ - (100, 'centena'), - (1000, 'millar'), - (1000000, 'millón'), - (1e9, "millardo"), - (1e12, "billón"), - (1e18, 'trillón'), - (1e24, "cuatrillón"), - (1e30, "quintillón"), - (1e36, "sextillón"), - (1e42, "septillón"), - (1e48, "octillón"), - (1e54, "nonillón"), - (1e60, "decillón"), - (1e66, "undecillón"), - (1e72, "duodecillón"), - (1e78, "tredecillón"), - (1e84, "cuatrodecillón"), - (1e90, "quindecillón"), - (1e96, "sexdecillón"), - (1e102, "septendecillón"), - (1e108, "octodecillón"), - (1e114, "novendecillón"), - (1e120, "vigintillón"), - (1e306, "unquinquagintillón"), - (1e312, "duoquinquagintillón"), - (1e336, "sexquinquagintillón"), - (1e366, "unsexagintillón") -]) - -_SHORT_SCALE_GL = OrderedDict([ - (100, 'centena'), - (1000, 'millar'), - (1000000, 'millón'), - (1e9, "billón"), - (1e12, 'trillón'), - (1e15, "cuatrillón"), - (1e18, "quintillón"), - (1e21, "sextillón"), - (1e24, "septillón"), - (1e27, "octillón"), - (1e30, "nonillón"), - (1e33, "decillón"), - (1e36, "undecillón"), - (1e39, "duodecillón"), - (1e42, "tredecillón"), - (1e45, "cuatrodecillón"), - (1e48, "quindecillón"), - (1e51, "sexdecillón"), - (1e54, "septendecillón"), - (1e57, "octodecillón"), - (1e60, "novendecillón"), - (1e63, "vigintillón"), - (1e66, "unvigintillón"), - (1e69, "unovigintillón"), - (1e72, "tresvigintillón"), - (1e75, "quattuorvigintillón"), - (1e78, "quinquavigintillón"), - 
(1e81, "qesvigintillón"), - (1e84, "septemvigintillón"), - (1e87, "octovigintillón"), - (1e90, "novemvigintillón"), - (1e93, "trigintillón"), - (1e96, "untrigintillón"), - (1e99, "duotrigintillón"), - (1e102, "trestrigintillón"), - (1e105, "quattuortrigintillón"), - (1e108, "quinquatrigintillón"), - (1e111, "sestrigintillón"), - (1e114, "septentrigintillón"), - (1e117, "octotrigintillón"), - (1e120, "noventrigintillón"), - (1e123, "quadragintillón"), - (1e153, "quinquagintillón"), - (1e183, "sexagintillón"), - (1e213, "septuagintillón"), - (1e243, "octogintillón"), - (1e273, "nonagintillón"), - (1e303, "centillón"), - (1e306, "uncentillón"), - (1e309, "duocentillón"), - (1e312, "trescentillón"), - (1e333, "decicentillón"), - (1e336, "undecicentillón"), - (1e363, "viginticentillón"), - (1e366, "unviginticentillón"), - (1e393, "trigintacentillón"), - (1e423, "quadragintacentillón"), - (1e453, "quinquagintacentillón"), - (1e483, "sexagintacentillón"), - (1e513, "septuagintacentillón"), - (1e543, "octogintacentillón"), - (1e573, "nonagintacentillón"), - (1e603, "ducentillón"), - (1e903, "trecentillón"), - (1e1203, "quadringentillón"), - (1e1503, "quingentillón"), - (1e1803, "sexcentillón"), - (1e2103, "septingentillón"), - (1e2403, "octingentillón"), - (1e2703, "nongentillón"), - (1e3003, "millinillón") -]) - -# TODO: female forms. 
-_ORDINAL_STRING_BASE_GL = { - 1: 'primeiro', - 2: 'segundo', - 3: 'terceiro', - 4: 'cuarto', - 5: 'quinto', - 6: 'sexto', - 7: 'séptimo', - 8: 'oitavo', - 9: 'noveno', - 10: 'décimo', - 11: 'undécimo', - 12: 'duodécimo', - 13: 'decimoterceiro', - 14: 'decimocuarto', - 15: 'decimoquinto', - 16: 'decimosexto', - 17: 'decimoséptimo', - 18: 'decimoitavo', - 19: 'decimonoveno', - 20: 'vixésimo', - 30: 'trixésimo', - 40: "cuadraxésimo", - 50: "quincuaxésimo", - 60: "sexaxésimo", - 70: "septuaxésimo", - 80: "octoxésimo", - 90: "nonaxésimo", - 10e3: "centésimo", - 1e3: "milésimo" -} - -_SHORT_ORDINAL_STRING_GL = { - 1e6: "millonésimo", - 1e9: "milmillonésimo", - 1e12: "billonésimo", - 1e15: "milbillonésimo", - 1e18: "trillonésimo", - 1e21: "miltrillonésimo", - 1e24: "cuatrillonésimo", - 1e27: "milcuatrillonésimo", - 1e30: "quintillonésimo", - 1e33: "milquintillonésimo" - # TODO > 1e-33 -} -_SHORT_ORDINAL_STRING_GL.update(_ORDINAL_STRING_BASE_GL) - -_LONG_ORDINAL_STRING_GL = { - 1e6: "millonésimo", - 1e12: "billonésimo", - 1e18: "trillonésimo", - 1e24: "cuatrillonésimo", - 1e30: "quintillonésimo", - 1e36: "sextillonésimo", - 1e42: "septillonésimo", - 1e48: "octillonésimo", - 1e54: "nonillonésimo", - 1e60: "decillonésimo" - # TODO > 1e60 -} -_LONG_ORDINAL_STRING_GL.update(_ORDINAL_STRING_BASE_GL) - - -def is_fractional_gl(input_str, short_scale=True): +from ovos_number_parser.util import (Scale, GrammaticalGender, NumberVocabulary, RomanceNumberExtractor) + + + +def swap_gender_gl(word: str, gender: GrammaticalGender) -> str: """ - This function takes the given text and checks if it is a fraction. + Convert a Portuguese word between masculine and feminine grammatical gender by adjusting its ending. - Args: - text (str): the string to check if fractional + Parameters: + word (str): The word to convert. + gender (GrammaticalGender): The target grammatical gender. 
- short_scale (bool): use short scale if True, long scale if False Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - + str: The word with its ending swapped to match the specified gender, if applicable; otherwise, the original word. """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "fifths" - - aFrac = {"medio": 2, "media": 2, "terzo": 3, "cuarto": 4, - "cuarta": 4, "quinto": 5, "quinta": 5, "sexto": 6, "sexta": 6, - "séptimo": 7, "séptima": 7, "oitavo": 8, "oitava": 8, - "noveno": 9, "novena": 9, "décimo": 10, "décima": 10, - "onceavo": 11, "onceava": 11, "doceavo": 12, "doceava": 12} - - if input_str.lower() in aFrac: - return 1.0 / aFrac[input_str] - if (input_str == "vixésimo" or input_str == "vixésima"): - return 1.0 / 20 - if (input_str == "trixésimo" or input_str == "trixésima"): - return 1.0 / 30 - if (input_str == "centésimo" or input_str == "centésima"): - return 1.0 / 100 - if (input_str == "milésimo" or input_str == "milésima"): - return 1.0 / 1000 - return False - - -def extract_number_gl(text, short_scale=True, ordinals=False): + if word == "dois" and gender == GrammaticalGender.FEMININE: + return "duas" + elif word == "duas" and gender == GrammaticalGender.MASCULINE: + return "dois" + + elif gender == GrammaticalGender.FEMININE and word.endswith('o'): + return word[:-1] + 'a' + elif gender == GrammaticalGender.MASCULINE and word.endswith('ma'): + return word[:-1] + elif gender == GrammaticalGender.MASCULINE and word.endswith('a'): + return word[:-1] + 'o' + elif gender == GrammaticalGender.FEMININE and word.endswith('os'): + return word[:-2] + 'as' + elif gender == GrammaticalGender.MASCULINE and word.endswith('as'): + return word[:-2] + 'os' + elif gender == GrammaticalGender.FEMININE and word.endswith('m'): + return word + 'a' + return word + + +def pluralize_gl(word: str): + if word.endswith("ão"): + return word[:-2] + "ões" + if not word.endswith("s"): + return word + "s" + 
return word + + + +_GL = NumberVocabulary( + LANG="gl-ES", + swap_gender=swap_gender_gl, # used for female forms + pluralize=pluralize_gl, # use for plural forms + + HUNDRED_PARTICLE="cento", # how to read "1XX" + DENOMINATOR_PARTICLE="abos", # for fractions X / N {PARTICLE} + DIVIDED_BY_ZERO="a dividir por zero", # how to read X/0 values + NO_PREV_UNIT=[100, 1000], # "mil" vs "um mil" / "cem" vs "um cem" + NO_PLURAL=[1000], # "dois mil" vs "dois mils" / "dois milhões" vs "dois milhão" + + NUMBER_OVERFLOW="número exageradamente grande", + DEFAULT_SCALE=Scale.LONG, + JOIN_WORD=["e"], + + JOINER_ON_TWENTYS=True, # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS=True, # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS=False, # add JOIN_WORD from 1000-10000 - "mil e duzentos" + + DECIMAL_MARKER=["coma", "punto", ".", ","], + NEGATIVE_SIGN=["menos"], + UNITS={ + 0: 'cero', + 1: 'un', + 2: 'dous', + 3: 'tres', + 4: 'catro', + 5: 'cinco', + 6: 'seis', + 7: 'sete', + 8: 'oito', + 9: 'nove', + }, + TENS={ + 10: 'dez', + 11: 'once', + 12: 'doce', + 13: 'trece', + 14: 'catorce', + 15: 'quince', + 16: 'dezaseis', + 17: 'dezasete', + 18: 'dezaoito', + 19: 'dezanove', + 20: 'vinte', + 30: 'trinta', + 40: 'corenta', + 50: 'cincuenta', + 60: 'sesenta', + 70: 'setenta', + 80: 'oitenta', + 90: 'noventa' + }, + HUNDREDS={ + 100: 'cen', + 200: 'douscentos', + 300: 'trescentos', + 400: 'catrocentos', + 500: 'cincocentos', + 600: 'seiscentos', + 700: 'setecentos', + 800: 'oitocentos', + 900: 'novecentos' + }, + FRACTION={ + 2: 'medio', + 3: 'terzo', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'oitavo', + 9: 'noveno', + 10: 'décimo', + 11: 'onceavo', + 12: 'doceavo', + 13: 'treceavo', + 14: 'catorceavo', + 15: 'quinceavo', + 16: 'dezaseisavo', + 17: 'dezaseteavo', + 18: 'dezaoitoavo', + 19: 'dezanoveavo', + 20: 'vinteavo', + 100: "centésimo", + 1000: "milésimo" + }, + FRACTION_FEMALE={ + 2: "media" # una y media -> 1.5 + }, + 
SHORT_SCALE={ + 1000: 'mil', + 10 ** 6: 'millón', + 10 ** 9: "billón", + 10 ** 12: 'trillón', + 10 ** 15: "cuatrillón", + 10 ** 18: "quintillón", + 10 ** 21: "sextillón", + 10 ** 24: "septillón", + 10 ** 27: "octillón", + 10 ** 30: "nonillón", + 10 ** 33: "decillón", + 10 ** 36: "undecillón", + 10 ** 39: "duodecillón", + 10 ** 42: "tredecillón", + 10 ** 45: "cuatrodecillón", + 10 ** 48: "quindecillón", + 10 ** 51: "sexdecillón", + 10 ** 54: "septendecillón", + 10 ** 57: "octodecillón", + 10 ** 60: "novendecillón", + 10 ** 63: "vigintillón", + 10 ** 66: "unvigintillón", + 10 ** 69: "unovigintillón", + 10 ** 72: "tresvigintillón", + 10 ** 75: "quattuorvigintillón", + 10 ** 78: "quinquavigintillón", + 10 ** 81: "qesvigintillón", + 10 ** 84: "septemvigintillón", + 10 ** 87: "octovigintillón", + 10 ** 90: "novemvigintillón", + 10 ** 93: "trigintillón", + 10 ** 96: "untrigintillón", + 10 ** 99: "duotrigintillón", + 10 ** 102: "trestrigintillón", + 10 ** 105: "quattuortrigintillón", + 10 ** 108: "quinquatrigintillón", + 10 ** 111: "sestrigintillón", + 10 ** 114: "septentrigintillón", + 10 ** 117: "octotrigintillón", + 10 ** 120: "noventrigintillón", + 10 ** 123: "quadragintillón", + 10 ** 153: "quinquagintillón", + 10 ** 183: "sexagintillón", + 10 ** 213: "septuagintillón", + 10 ** 243: "octogintillón", + 10 ** 273: "nonagintillón", + 10 ** 303: "centillón", + 10 ** 306: "uncentillón", + 10 ** 309: "duocentillón", + 10 ** 312: "trescentillón", + 10 ** 333: "decicentillón", + 10 ** 336: "undecicentillón", + 10 ** 363: "viginticentillón", + 10 ** 366: "unviginticentillón", + 10 ** 393: "trigintacentillón", + 10 ** 423: "quadragintacentillón", + 10 ** 453: "quinquagintacentillón", + 10 ** 483: "sexagintacentillón", + 10 ** 513: "septuagintacentillón", + 10 ** 543: "octogintacentillón", + 10 ** 573: "nonagintacentillón", + 10 ** 603: "ducentillón", + 10 ** 903: "trecentillón", + 10 ** 1203: "quadringentillón", + 10 ** 1503: "quingentillón", + 10 ** 1803: "sexcentillón", + 10 
** 2103: "septingentillón", + 10 ** 2403: "octingentillón", + 10 ** 2703: "nongentillón", + 10 ** 3003: "millinillón", + }, + LONG_SCALE={ + 1000: 'mil', + 10 ** 6: 'millón', + 10 ** 9: "millardo", + 10 ** 12: "billón", + 10 ** 18: 'trillón', + 10 ** 24: "cuatrillón", + 10 ** 30: "quintillón", + 10 ** 36: "sextillón", + 10 ** 42: "septillón", + 10 ** 48: "octillón", + 10 ** 54: "nonillón", + 10 ** 60: "decillón", + 10 ** 66: "undecillón", + 10 ** 72: "duodecillón", + 10 ** 78: "tredecillón", + 10 ** 84: "cuatrodecillón", + 10 ** 90: "quindecillón", + 10 ** 96: "sexdecillón", + 10 ** 102: "septendecillón", + 10 ** 108: "octodecillón", + 10 ** 114: "novendecillón", + 10 ** 120: "vigintillón", + 10 ** 306: "unquinquagintillón", + 10 ** 312: "duoquinquagintillón", + 10 ** 336: "sexquinquagintillón", + 10 ** 366: "unsexagintillón" + }, + + GENDERED_SPELLINGS={ + GrammaticalGender.FEMININE: { + 1: "unha", + 2: "dúas" + } + }, + DIGIT_SPELLINGS={}, + ALT_SPELLINGS={ + + }, + ORDINAL_UNITS={ + 1: 'primeiro', + 2: 'segundo', + 3: 'terceiro', + 4: 'cuarto', + 5: 'quinto', + 6: 'sexto', + 7: 'séptimo', + 8: 'oitavo', + 9: 'noveno', + + }, + ORDINAL_TENS={ + 10: 'décimo', + 11: 'undécimo', + 12: 'duodécimo', + 13: 'decimoterceiro', + 14: 'decimocuarto', + 15: 'decimoquinto', + 16: 'decimosexto', + 17: 'decimoséptimo', + 18: 'decimoitavo', + 19: 'decimonoveno', + 20: 'vixésimo', + 30: 'trixésimo', + 40: "cuadraxésimo", + 50: "quincuaxésimo", + 60: "sexaxésimo", + 70: "septuaxésimo", + 80: "octoxésimo", + 90: "nonaxésimo" + }, + ORDINAL_HUNDREDS={ + 100: "centésimo", + }, + ORDINAL_SHORT_SCALE={ + 10 ** 3: "milésimo", + 10 ** 6: "millonésimo", + 10 ** 9: "milmillonésimo", + 10 ** 12: "billonésimo", + 10 ** 15: "milbillonésimo", + 10 ** 18: "trillonésimo", + 10 ** 21: "miltrillonésimo", + 10 ** 24: "cuatrillonésimo", + 10 ** 27: "milcuatrillonésimo", + 10 ** 30: "quintillonésimo", + 10 ** 33: "milquintillonésimo" + }, + ORDINAL_LONG_SCALE={ + 10 ** 3: "milésimo", + 10 ** 6: 
"millonésimo", + 10 ** 12: "billonésimo", + 10 ** 18: "trillonésimo", + 10 ** 24: "cuatrillonésimo", + 10 ** 30: "quintillonésimo", + 10 ** 36: "sextillonésimo", + 10 ** 42: "septillonésimo", + 10 ** 48: "octillonésimo", + 10 ** 54: "nonillonésimo", + 10 ** 60: "decillonésimo" + } +) + +GL = RomanceNumberExtractor(_GL) + + +################################################################## +# all methods below are deprecated and only for backwards compat +################################################################## + +def pronounce_ordinal_gl( + number: Union[int, float], + gender: GrammaticalGender = GrammaticalGender.MASCULINE, + scale: Optional[Scale] = None +) -> str: """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. - Args: - text (str): the string to normalize - Returns: - (int) or (float): The value of extracted number - + DEPRECATED """ - # TODO: short_scale and ordinals don't do anything here. - # The parameters are present in the function signature for API compatibility - # reasons. - # - # Returns incorrect output on certain fractional phrases like, "cuarto de dous" - # TODO: numbers greater than 999999 - aWords = text.lower().split() - count = 0 - result = None - while count < len(aWords): - val = 0 - word = aWords[count] - next_next_word = None - if count + 1 < len(aWords): - next_word = aWords[count + 1] - if count + 2 < len(aWords): - next_next_word = aWords[count + 2] - else: - next_word = None - - # is current word a number? 
- if word in _STRING_NUM_GL: - val = _STRING_NUM_GL[word] - elif word.isdigit(): # doesn't work with decimals - val = int(word) - elif is_numeric(word): - val = float(word) - elif is_fractional_gl(word): - if not result: - result = 1 - result = result * is_fractional_gl(word) - count += 1 - continue - - if not val: - # look for fractions like "2/3" - aPieces = word.split('/') - # if (len(aPieces) == 2 and is_numeric(aPieces[0]) - # and is_numeric(aPieces[1])): - if look_for_fractions(aPieces): - val = float(aPieces[0]) / float(aPieces[1]) - - if val: - if result is None: - result = 0 - # handle fractions - if next_word != "avos": - result = val - else: - result = float(result) / float(val) - - if next_word is None: - break - - # number word and fraction - ands = ["e"] - if next_word in ands: - zeros = 0 - if result is None: - count += 1 - continue - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - - afterAndVal = extract_number_gl(newText[:-1]) - if afterAndVal: - if result < afterAndVal or result < 20: - while afterAndVal > 1: - afterAndVal = afterAndVal / 10.0 - for word in newWords: - if word == "cero" or word == "0": - zeros += 1 - else: - break - for _ in range(0, zeros): - afterAndVal = afterAndVal / 10.0 - result += afterAndVal - break - elif next_next_word is not None: - if next_next_word in ands: - newWords = aWords[count + 3:] - newText = "" - for word in newWords: - newText += word + " " - afterAndVal = extract_number_gl(newText[:-1]) - if afterAndVal: - if result is None: - result = 0 - result += afterAndVal - break - - decimals = ["punto", "coma", ".", ","] - if next_word in decimals: - zeros = 0 - newWords = aWords[count + 2:] - newText = "" - for word in newWords: - newText += word + " " - for word in newWords: - if word == "cero" or word == "0": - zeros += 1 - else: - break - afterDotVal = str(extract_number_gl(newText[:-1])) - afterDotVal = zeros * "0" + afterDotVal - result = float(str(result) + "." 
+ afterDotVal) - break - count += 1 - - # Return the $str with the number related words removed - # (now empty strings, so strlen == 0) - # aWords = [word for word in aWords if len(word) > 0] - # text = ' '.join(aWords) - if "." in str(result): - integer, dec = str(result).split(".") - # cast float to int - if dec == "0": - result = int(integer) - - return result or False - - -def _gl_number_parse(words, i): - # TODO Not parsing 'cero' - - def gl_cte(i, s): - if i < len(words) and s == words[i]: - return s, i + 1 - return None - - def gl_number_word(i, mi, ma): - if i < len(words): - v = _STRING_NUM_GL.get(words[i]) - if v and v >= mi and v <= ma: - return v, i + 1 - return None - - def gl_number_1_99(i): - r1 = gl_number_word(i, 1, 29) - if r1: - return r1 - - r1 = gl_number_word(i, 30, 90) - if r1: - v1, i1 = r1 - r2 = gl_cte(i1, "y") - if r2: - i2 = r2[1] - r3 = gl_number_word(i2, 1, 9) - if r3: - v3, i3 = r3 - return v1 + v3, i3 - return r1 - return None - - def gl_number_1_999(i): - # [2-9]centos [1-99]? - r1 = gl_number_word(i, 100, 900) - if r1: - v1, i1 = r1 - r2 = gl_number_1_99(i1) - if r2: - v2, i2 = r2 - return v1 + v2, i2 - else: - return r1 - - # [1-99] - r1 = gl_number_1_99(i) - if r1: - return r1 - - return None - - def gl_number(i): - # check for cero - r1 = gl_number_word(i, 0, 0) - if r1: - return r1 - - # check for [1-999] (mil [0-999])? - r1 = gl_number_1_999(i) - if r1: - v1, i1 = r1 - r2 = gl_cte(i1, "mil") - if r2: - i2 = r2[1] - r3 = gl_number_1_999(i2) - if r3: - v3, i3 = r3 - return v1 * 1000 + v3, i3 - else: - return v1 * 1000, i2 - else: - return r1 - return None - - return gl_number(i) - - -def nice_number_gl(number, speech=True, denominators=range(1, 21)): - """ Galician helper for nice_number - - This function formats a float to human understandable functions. 
Like - 4.5 becomes "4 e medio" for speech and "4 1/2" for text - - Args: - number (int or float): the float to format - speech (bool): format for speech (True) or display (False) - denominators (iter of ints): denominators to use, default [1 .. 20] - Returns: - (str): The formatted string. + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or GL.vocab.DEFAULT_SCALE + return GL.pronounce_ordinal(number, gender, scale) + + +def is_fractional_gl( + input_str: str +) -> Union[float, bool]: """ - strNumber = "" - whole = 0 - num = 0 - den = 0 - - result = convert_to_mixed_fraction(number, denominators) - - if not result: - # Give up, just represent as a 3 decimal number - whole = round(number, 3) - else: - whole, num, den = result - - if not speech: - if num == 0: - strNumber = '{:,}'.format(whole) - strNumber = strNumber.replace(",", " ") - strNumber = strNumber.replace(".", ",") - return strNumber - else: - return '{} {}/{}'.format(whole, num, den) - else: - if num == 0: - # if the number is not a fraction, nothing to do - strNumber = str(whole) - strNumber = strNumber.replace(".", ",") - return strNumber - den_str = _FRACTION_STRING_GL[den] - # if it is not an integer - if whole == 0: - # if there is no whole number - if num == 1: - # if numerator is 1, return "un medio", for example - strNumber = 'un {}'.format(den_str) - else: - # else return "catro terzos", for example - strNumber = '{} {}'.format(num, den_str) - elif num == 1: - # if there is a whole number and numerator is 1 - if den == 2: - # if denominator is 2, return "1 e medio", for example - strNumber = '{} y {}'.format(whole, den_str) - else: - # else return "1 e 1 terzo", for example - strNumber = '{} y 1 {}'.format(whole, den_str) - else: - # else return "2 e 3 cuarto", for example - strNumber = '{} y {} {}'.format(whole, num, den_str) - if num > 1 and den != 3: - # if the numerator is greater than 1 
and the denominator - # is not 3 ("terzo"), add an s for plural - strNumber += 's' - - return strNumber - - -def pronounce_number_gl(number, places=2): + DEPRECATED """ - Convert a number to it's spoken equivalent + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return GL.is_fractional(input_str) - For example, '5.2' would return 'cinco coma dous' - Args: - num(float or int): the number to pronounce (under 100) - places(int): maximum decimal places to speak - Returns: - (str): The pronounced number +def is_ordinal_gl(input_str: str) -> bool: """ - if abs(number) >= 100: - # TODO: Soporta a números por encima de 100 - return str(number) - - result = "" - if number < 0: - result = "menos " - number = abs(number) - elif number >= 30: # de 30 en adelante - tens = int(number - int(number) % 10) - ones = int(number - tens) - result += _NUM_STRING_GL[tens] - if ones > 0: - result += " y " + _NUM_STRING_GL[ones] - else: - result += _NUM_STRING_GL[int(number)] - - # Deal with decimal part, in galician is commonly used the comma - # instead dot. Decimal part can be written both with comma - # and dot, but when pronounced, its pronounced "coma" - if not number == int(number) and places > 0: - if abs(number) < 1.0 and (result == "menos " or not result): - result += "cero" - result += " coma" - _num_str = str(number) - _num_str = _num_str.split(".")[1][0:places] - for char in _num_str: - result += " " + _NUM_STRING_GL[int(char)] - return result - - -def numbers_to_digits_gl(utterance: str) -> str: + DEPRECATED """ - Replace written numbers in a Galician text with their digit equivalents. 
+ warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return GL.is_ordinal(input_str) + + +def extract_number_gl( + text: str, + ordinals: bool = False, + scale: Optional[Scale] = None +) -> Union[int, float, bool]: + """ + DEPRECATED + """ + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or GL.vocab.DEFAULT_SCALE + return GL.extract_number(text, ordinals, scale) + + +def pronounce_number_gl( + number: Union[int, float], + places: int = 5, + scale: Optional[Scale] = None, + ordinals: bool = False, + digits: DigitPronunciation = DigitPronunciation.FULL_NUMBER, + gender: GrammaticalGender = GrammaticalGender.MASCULINE +) -> str: + """ + DEPRECATED + """ + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or GL.vocab.DEFAULT_SCALE + return GL.pronounce_number(number, places, scale, ordinals, digits, gender) + + +def numbers_to_digits_gl( + utterance: str, + scale: Optional[Scale] = None +) -> str: + """ + DEPRECATED + """ + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or GL.vocab.DEFAULT_SCALE + return GL.numbers_to_digits(utterance, scale) - "un dous catro" -> "1 2 4" - Args: - utterance (str): Input string possibly containing written numbers. - Returns: - str: Text with written numbers replaced by digits. 
+def pronounce_fraction_gl(word: str, scale: Optional[Scale] = None) -> str: + """ + DEPRECATED """ - # TODO - above twenty it's ambiguous, "twenty one" is 2 words but only 1 number - mapping = {_NUM_STRING_GL[i + 1]: str(i + 1) for i in range(20)} - words: List[Token] = tokenize(utterance) - for idx, tok in enumerate(words): - if tok.word in mapping: - words[idx] = mapping[tok.word] - else: - words[idx] = tok.word - return " ".join(words) + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or GL.vocab.DEFAULT_SCALE + return GL.pronounce_fraction(word, scale) + + +if __name__ == "__main__": + print("--- Testing Pronunciation (Galician - Long Scale) ---") + # Galician only uses the Long Scale. Short Scale tests are irrelevant or would yield the same result for the first million. + # $1,234,567$: un millón douscentos trinta e catro mil cincocentos sesenta e sete + print(f"1,234,567: {GL.pronounce_number(1_234_567)}") + + # $1,000,000,000$: mil millóns (Long Scale) + print(f"1,000,000,000: {GL.pronounce_number(1_000_000_000)}") + + # $1,000,000,000,000$: un billón (Long Scale) + print(f"1,000,000,000,000: {GL.pronounce_number(1_000_000_000_000)}") + + # $2,500,123,456$: dous mil cincocentos millóns cento vinte e tres mil catrocentos cincuenta e seis + print(f"2,500,123,456: {GL.pronounce_number(2_500_123_456)}") + + # $16$: dezaseis + print(f"16: {GL.pronounce_number(16)}") + + print("\n--- Testing Edge Cases ---") + # $-123.45$: menos cento vinte e tres coma corenta e cinco + print(f"-123.45: {GL.pronounce_number(-123.45)}") + # $10.05$: dez coma cero cinco + print(f"10.05: {GL.pronounce_number(10.05)}") + # $2000$: dous mil + print(f"2000: {GL.pronounce_number(2000)}") + # $2001$: dous mil un + print(f"2001: {GL.pronounce_number(2001)}") + # $123.456789$: cento vinte e tres coma catrocentos cincuenta e seis mil setecentos oitenta e nove + print(f"123.456789: 
{GL.pronounce_number(123.456789)}") + + print("\n--- Testing Ordinal Pronunciation ---") + # $1^{st}$ (masculine): primeiro + print(f"1st (masculine): {GL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.MASCULINE)}") + # $1^{st}$ (feminine): primeira + print(f"1st (feminine): {GL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.FEMININE)}") + # $23^{rd}$ (masculine): vixésimo terceiro + print(f"23rd (masculine): {GL.pronounce_number(23, ordinals=True, gender=GrammaticalGender.MASCULINE)}") + # $23^{rd}$ (feminine): vixésima terceira + print(f"23rd (feminine): {GL.pronounce_number(23, ordinals=True, gender=GrammaticalGender.FEMININE)}") + # $100^{th}$: centésimo + print(f"100th: {GL.pronounce_number(100, ordinals=True)}") + # $101^{st}$: centésimo primeiro + print(f"101st: {GL.pronounce_number(101, ordinals=True)}") + # $1000^{th}$: milésimo + print(f"1000th: {GL.pronounce_number(1000, ordinals=True)}") + # $1,000,000^{th}$: millonésimo + print(f"1,000,000th: {GL.pronounce_number(1_000_000, ordinals=True)}") + # $1,000,000,000,000^{th}$: billonésimo + print(f"1,000,000,000,000th: {GL.pronounce_number(1_000_000_000_000, ordinals=True)}") + + print("\n--- Testing GL.numbers_to_digits ---") + # duzentos e cinquenta -> 250 (Using correct Galician: douscentos cincuenta) + print(f"'douscentos cincuenta' -> '{GL.numbers_to_digits('douscentos cincuenta')}'") + # un milhon -> 1000000 (Using correct Galician: un millón) + print(f"'un millón' -> '{GL.numbers_to_digits('un millón')}'") + # zasseis -> 16 (Using correct Galician: dezaseis) + print(f"'dezaseis' -> '{GL.numbers_to_digits('dezaseis')}'") + # hai duzientos e cinquenta carros -> hai 250 carros (Using correct Galician) + print(f"'hai douscentos cincuenta carros' -> '{GL.numbers_to_digits('hai douscentos cincuenta carros')}'") + + print("\n--- Testing Ordinal Extraction ---") + # o segundo carro -> 2 (Galician: o segundo carro) + print(f"'o segundo carro' -> {GL.extract_number('o segundo carro', 
ordinals=True)}") + # primeiro lugar -> 1 (Galician: primeiro lugar) + print(f"'primeiro lugar' -> {GL.extract_number('primeiro lugar', ordinals=True)}") + # o milésimo dia -> 1000 (Galician: o milésimo día) + print(f"'o milésimo día' -> {GL.extract_number('o milésimo día', ordinals=True)}") + # a milésima vez -> 1000 (Galician: a milésima vez) + print(f"'a milésima vez' -> {GL.extract_number('a milésima vez', ordinals=True)}") + # a primeira vez -> 1 (Galician: a primeira vez) + print(f"'a primeira vez' -> {GL.extract_number('a primeira vez', ordinals=True)}") + # a sexagésima cuarta vez -> 64 (Galician: a sexagésima cuarta vez) + print(f"'a sexagésima cuarta vez' -> {GL.extract_number('a sexagésima cuarta vez', ordinals=True)}") + + print("\n--- Testing Cardinal Extraction ---") + # un -> 1 + print(f"'un' -> {GL.extract_number('un')}") + # unha -> 1 + print(f"'unha' -> {GL.extract_number('unha')}") + # vinte e un -> 21 + print(f"'vinte e un' -> {GL.extract_number('vinte e un')}") + # vinte e unha -> 21 + print(f"'vinte e unha' -> {GL.extract_number('vinte e unha')}") + # vinte e dous -> 22 + print(f"'vinte e dous' -> {GL.extract_number('vinte e dous')}") + # vinte e dúas -> 22 + print(f"'vinte e dúas' -> {GL.extract_number('vinte e dúas')}") + # un millón -> 1000000 + print(f"'un millón' -> {GL.extract_number('un millón')}") + # dous millóns e cincocentos -> 2000500 (Galician: dous millóns cincocentos) + print(f"'dous millóns cincocentos' -> {GL.extract_number('dous millóns cincocentos')}") + # mil vinte e tres -> 1023 (Galician: mil vinte e tres) + print(f"'mil vinte e tres' -> {GL.extract_number('mil vinte e tres')}") + # trinta e cinco coma catro -> 35.4 (Galician: trinta e cinco coma catro) + print(f"'trinta e cinco coma catro' -> {GL.extract_number('trinta e cinco coma catro')}") + + print("\n--- Testing Fractions ---") + # $1/2$: un medio + print(f"1/2: {GL.pronounce_fraction('1/2')}") + # $2/2$: dous medios / dous medios, ou dous medios (or dous sobre
dous) + print(f"2/2: {GL.pronounce_fraction('2/2')}") + # $5/2$: cinco medios + print(f"5/2: {GL.pronounce_fraction('5/2')}") + # $5/3$: cinco terzos + print(f"5/3: {GL.pronounce_fraction('5/3')}") + # $5/4$: cinco cuartos + print(f"5/4: {GL.pronounce_fraction('5/4')}") + # $7/5$: sete quintos + print(f"7/5: {GL.pronounce_fraction('7/5')}") + # $0/20$: cero vinteavos + print(f"0/20: {GL.pronounce_fraction('0/20')}") \ No newline at end of file diff --git a/ovos_number_parser/numbers_mwl.py b/ovos_number_parser/numbers_mwl.py index 822fae6..1a6a3d4 100644 --- a/ovos_number_parser/numbers_mwl.py +++ b/ovos_number_parser/numbers_mwl.py @@ -1,728 +1,417 @@ -from typing import List, Union, Dict, Tuple - -from ovos_number_parser.numbers_pt import tokenize, _swap_gender # consider implementing a mwl version if needed -from ovos_number_parser.util import Scale, GrammaticalGender, DigitPronunciation - -DECIMAL_MARKERS = ["ponto", "birgula", "bírgula", ".", ","] - -# --- Base Pronunciation Dictionaries --- - -_UNITS: Dict[int, str] = { - 1: 'un', 2: 'dous', 3: 'trés', 4: 'quatro', 5: 'cinco', 6: 'seis', - 7: 'siete', 8: 'uito', 9: 'nuobe' -} - -_TENS_MWL: Dict[int, str] = { - 10: 'dieç', 11: 'onze', 12: 'duoze', 13: 'treze', 14: 'catorze', - 15: 'quinze', 16: 'zasseis', 17: 'zassiete', 18: 'zuio', 19: 'zanuobe', - 20: 'binte', 30: 'trinta', 40: 'quarenta', 50: 'cinquenta', 60: 'sessenta', - 70: 'setenta', 80: 'uitenta', 90: 'nobenta' -} -_TENS_ALT_MWL: Dict[int, str] = { - 16: 'dezasseis', 17: 'dezassiete', 18: 'dezuito', 19: 'dezanuobe', -} - -_HUNDREDS: Dict[int, str] = { - 100: 'cien', 200: 'duzientos', 300: 'trezientos', 400: 'quatrocientos', - 500: 'quinhentos', 600: 'seiscientos', 700: 'sietecientos', - 800: 'uitocientos', 900: 'nuobecientos' -} -_HUNDREDS_ALT: Dict[int, str] = { - 100: 'un ciento', - 200: 'dous cientos', - 300: 'trés cientos', - 400: 'quatro cientos', - 500: 'cinco cientos', - 600: 'seis cientos', - 700: 'siete cientos', - 800: 'uito cientos', - 900: 
'nuobe cientos' -} - -_FRACTION_STRING_M_MWL: Dict[int, str] = { - 2: 'meio', 3: 'tércio', 4: 'quarto', 5: 'quinto', 6: 'sesto', - 7: 'sétimo', 8: 'uitabo', 9: 'nono', 10: 'décimo', - 11: 'onze abos', 12: 'doze abos', 13: 'treze abos', 14: 'catorze abos', - 15: 'quinze abos', 16: 'dezasseis abos', 17: 'dezassete abos', - 18: 'dezoito abos', 19: 'dezanuobe abos', - 20: 'bigésimo', 30: 'trigésimo', 100: 'centésimo', 1000: 'milésimo' -} -_FRACTION_STRING_F_MWL: Dict[int, str] = { - k: v[:-1] + "a" - for k, v in _FRACTION_STRING_M_MWL.items() if v.endswith("o") -} -_FRACTION_STRING_MWL: Dict[int, str] = { - **_FRACTION_STRING_M_MWL, **_FRACTION_STRING_F_MWL -} - -_FEMALE_NUMS = { - "ũa": 1, - "dues": 2 -} - -# --- Ordinal Pronunciation Dictionaries (Masculine Base) --- - -_ORDINAL_UNITS_MASC: Dict[int, str] = { - 1: 'purmerio', 2: 'segundo', 3: 'terceiro', 4: 'quarto', 5: 'quinto', - 6: 'sesto', 7: 'sétimo', 8: 'uitabo', 9: 'nono' -} -_ORDINAL_UNITS_FEM: Dict[int, str] = { - k: v[:-1] + "a" - for k, v in _ORDINAL_UNITS_MASC.items() -} - -_ORDINAL_TENS_MASC: Dict[int, str] = { - 10: 'décimo', 20: 'bigésimo', 30: 'trigésimo', 40: 'quadragésimo', - 50: 'quinquagésimo', 60: 'sessagésimo', 70: 'setuagésimo', - 80: 'uctogésimo', 90: 'nonagésimo' -} -_ORDINAL_TENS_FEM: Dict[int, str] = { - k: v[:-1] + "a" - for k, v in _ORDINAL_TENS_MASC.items() -} - -_ORDINAL_HUNDREDS_MASC: Dict[int, str] = { - 100: 'centésimo', 200: 'ducentésimo', 300: 'tricentésimo', - 400: 'quadringentésimo', 500: 'quingentésimo', 600: 'seiscentésimo', - 700: 'setingentésimo', 800: 'uctingentésimo', 900: 'noningentésimo' -} -_ORDINAL_HUNDREDS_FEM: Dict[int, str] = { - k: v[:-1] + "a" - for k, v in _ORDINAL_HUNDREDS_MASC.items() -} - -_ORDINAL_SCALES_MASC: Dict[Scale, List[Tuple[int, str]]] = { - Scale.SHORT: [ - (10 ** 21, "sextilionésimo"), - (10 ** 18, "quintilionésimo"), - (10 ** 15, "quadrilionésimo"), - (10 ** 12, "trilionésimo"), - (10 ** 9, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, 
"milésimo") - ], - Scale.LONG: [ - (10 ** 36, "sextilionésimo"), - (10 ** 30, "quintilionésimo"), - (10 ** 24, "quadrilionésimo"), - (10 ** 18, "trilionésimo"), - (10 ** 12, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, "milésimo") - ] -} - -_ORDINAL_SCALES_FEM: Dict[Scale, List[Tuple[int, str]]] = { - Scale.SHORT: [(k, v[:-1] + "a") - for k, v in _ORDINAL_SCALES_MASC[Scale.SHORT]], - Scale.LONG: [(k, v[:-1] + "a") - for k, v in _ORDINAL_SCALES_MASC[Scale.LONG]], - -} - -_SCALES: Dict[Scale, List[Tuple[int, str, str]]] = { - Scale.SHORT: [ - (10 ** 21, "sextilion", "sextiliones"), - (10 ** 18, "quintilion", "quintiliones"), - (10 ** 15, "quadrilion", "quadriliones"), - (10 ** 12, "trilion", "triliones"), - (10 ** 9, "bilion", "biliones"), - (10 ** 6, "milhon", "milhones"), - (10 ** 3, "mil", "mil") - ], - Scale.LONG: [ - (10 ** 36, "sextilion", "sextiliones"), - (10 ** 30, "quintilion", "quintiliones"), - (10 ** 24, "quadrilion", "quadriliones"), - (10 ** 18, "trilion", "triliones"), - (10 ** 12, "bilion", "biliones"), - (10 ** 6, "milhon", "milhones"), - (10 ** 3, "mil", "mil") - ] -} - -# Mapping of number words to their integer values. 
-_NUMBERS_BASE = {
-    **_FEMALE_NUMS,
-    **{v: k for k, v in _UNITS.items()},
-    **{v: k for k, v in _TENS_MWL.items()},
-    **{v: k for k, v in _TENS_ALT_MWL.items()},
-    **{v: k for k, v in _HUNDREDS.items()},
-    **{v: k for k, v in _HUNDREDS_ALT.items()},
-    "ciento": 100
-}
-
-
-def get_number_map(scale: Scale = Scale.LONG):
-    return {
-        **_NUMBERS_BASE,
-        **{s_name: val for val, s_name, _ in _SCALES[scale]},
-        **{p_name: val for val, _, p_name in _SCALES[scale]}
+from ovos_number_parser.util import (Scale, GrammaticalGender, DigitPronunciation,
+                                     NumberVocabulary, RomanceNumberExtractor)
+from typing import Union, Optional
+import warnings
+
+
+def swap_gender_mwl(word: str, gender: GrammaticalGender) -> str:  # return `word` inflected for `gender`; unmatched words pass through unchanged
+    if word.endswith("un") and gender == GrammaticalGender.FEMININE:
+        return word[:-2] + "ũa"
+    elif word.endswith("ũa") and gender == GrammaticalGender.MASCULINE:
+        return word[:-2] + "un"  # inverse of the branch above (previously re-appended "ũa", leaving the word feminine)
+    elif word == "dous" and gender == GrammaticalGender.FEMININE:
+        return "dues"
+    elif word == "dues" and gender == GrammaticalGender.MASCULINE:
+        return "dous"
+
+    # TODO - is this correct?
+    elif gender == GrammaticalGender.FEMININE and word.endswith('o'):
+        return word[:-1] + 'a'
+    elif gender == GrammaticalGender.MASCULINE and word.endswith('ma'):
+        return word[:-1]
+    elif gender == GrammaticalGender.MASCULINE and word.endswith('a'):
+        return word[:-1] + 'o'
+    elif gender == GrammaticalGender.FEMININE and word.endswith('os'):
+        return word[:-2] + 'as'
+    elif gender == GrammaticalGender.MASCULINE and word.endswith('as'):
+        return word[:-2] + 'os'
+    elif gender == GrammaticalGender.FEMININE and word.endswith('m'):
+        return word + 'a'
+    return word
+
+
+def pluralize_mwl(word: str):
+    # TODO - is this accurate?
+ if word.endswith("on"): + return word + "es" + if not word.endswith("s"): + return word + "s" + return word + + +_MWL = NumberVocabulary( + LANG="mwl", + swap_gender=swap_gender_mwl, # used for female forms + pluralize=pluralize_mwl, # use for plural forms + + HUNDRED_PARTICLE="ciento", # how to read "1XX" + DENOMINATOR_PARTICLE="abos", # for fractions X / N {PARTICLE} + DIVIDED_BY_ZERO="a dibidir por zero", # how to read X/0 values + NO_PREV_UNIT=[100, 1000], # "mil" vs "um mil" / "cem" vs "um cem" + NO_PLURAL=[1000], # "dois mil" vs "dois mils" / "dois milhões" vs "dois milhão" + NUMBER_OVERFLOW="número exageradamente grande", + DEFAULT_SCALE=Scale.LONG, + JOIN_WORD=["i"], + + JOINER_ON_TWENTYS=True, # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS=True, # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS=False, # add JOIN_WORD from 1000-10000 - "mil e duzentos" + + DECIMAL_MARKER=["bírgula", "ponto", "birgula", ".", ","], + NEGATIVE_SIGN=["menos"], + UNITS={ + 0: 'zero', + 1: 'un', + 2: 'dous', + 3: 'trés', + 4: 'quatro', + 5: 'cinco', + 6: 'seis', + 7: 'siete', + 8: 'uito', + 9: 'nuobe' + }, + TENS={ + 10: 'dieç', + 11: 'onze', + 12: 'duoze', + 13: 'treze', + 14: 'catorze', + 15: 'quinze', + 16: 'zasseis', + 17: 'zassiete', + 18: 'zuio', + 19: 'zanuobe', + 20: 'binte', + 21: "bint'i un", + 22: "bint'i dous", + 23: "bint'i trés", + 24: "bint'i quatro", + 25: "bint'i cinco", + 26: "bint'i seis", + 27: "bint'i siete", + 28: "bint'i uito", + 29: "bint'i nuobe", + 30: 'trinta', + 40: 'quarenta', + 50: 'cinquenta', + 60: 'sessenta', + 70: 'setenta', + 80: 'uitenta', + 90: 'nobenta' + }, + HUNDREDS={ + 100: 'cien', + 200: 'duzientos', + 300: 'trezientos', + 400: 'quatrocientos', + 500: 'quinhentos', + 600: 'seiscientos', + 700: 'sietecientos', + 800: 'uitocientos', + 900: 'nuobecientos' + }, + FRACTION={ + 2: 'meio', + 3: 'tércio', + 4: 'quarto', + 5: 'quinto', + 6: 'sesto', + 7: 'sétimo', + 8: 'uitabo', + 9: 'nono', + 10: 'décimo', + 
11: 'onze abos', + 12: 'doze abos', + 13: 'treze abos', + 14: 'catorze abos', + 15: 'quinze abos', + 16: 'dezasseis abos', + 17: 'dezassete abos', + 18: 'dezoito abos', + 19: 'dezanuobe abos', + 20: 'bigésimo', + 30: 'trigésimo', + 100: 'centésimo', + 1000: 'milésimo' + }, + FRACTION_FEMALE={ + 2: "meia" # ũa i meia -> 1.5 + }, + SHORT_SCALE={ + 10 ** 21: "sextilion", + 10 ** 18: "quintilion", + 10 ** 15: "quadrilion", + 10 ** 12: "trilion", + 10 ** 9: "bilion", + 10 ** 6: "milhon", + 10 ** 3: "mil", + }, + LONG_SCALE={ + 10 ** 36: "sextilion", + 10 ** 30: "quintilion", + 10 ** 24: "quadrilion", + 10 ** 18: "trilion", + 10 ** 12: "bilion", + 10 ** 6: "milhon", + 10 ** 3: "mil", + }, + GENDERED_SPELLINGS={ + GrammaticalGender.FEMININE: { + 1: "ũa", + 2: "dues" + } + }, + DIGIT_SPELLINGS={}, + ALT_SPELLINGS={ + 'dezasseis': 16, + 'dezassiete': 17, + 'dezuito': 18, + 'dezanuobe': 19, + 'un ciento': 100, + 'dous cientos': 200, + 'trés cientos': 300, + 'quatro cientos': 400, + 'cinco cientos': 500, + 'seis cientos': 600, + 'siete cientos': 700, + 'uito cientos': 800, + 'nuobe cientos': 900, + "bint": 20, + "bint'i": 20 + }, + ORDINAL_UNITS={ + 1: 'purmeiro', + 2: 'segundo', + 3: 'terceiro', + 4: 'quarto', + 5: 'quinto', + 6: 'sesto', + 7: 'sétimo', + 8: 'uitabo', + 9: 'nono' + }, + ORDINAL_TENS={ + 10: 'décimo', + 20: 'bigésimo', + 30: 'trigésimo', + 40: 'quadragésimo', + 50: 'quinquagésimo', + 60: 'sessagésimo', + 70: 'setuagésimo', + 80: 'uctogésimo', + 90: 'nonagésimo' + }, + ORDINAL_HUNDREDS={ + 100: 'centésimo', + 200: 'ducentésimo', + 300: 'tricentésimo', + 400: 'quadringentésimo', + 500: 'quingentésimo', + 600: 'seiscentésimo', + 700: 'setingentésimo', + 800: 'uctingentésimo', + 900: 'noningentésimo' + }, + ORDINAL_SHORT_SCALE={ + 10 ** 21: "sextilionésimo", + 10 ** 18: "quintilionésimo", + 10 ** 15: "quadrilionésimo", + 10 ** 12: "trilionésimo", + 10 ** 9: "bilionésimo", + 10 ** 6: "milionésimo", + 10 ** 3: "milésimo" + }, + ORDINAL_LONG_SCALE={ + 10 ** 36:
"sextilionésimo", + 10 ** 30: "quintilionésimo", + 10 ** 24: "quadrilionésimo", + 10 ** 18: "trilionésimo", + 10 ** 12: "bilionésimo", + 10 ** 6: "milionésimo", + 10 ** 3: "milésimo" } +) +MWL = RomanceNumberExtractor(_MWL) -_NUMBERS_MWL = get_number_map() - -_ORDINAL_WORDS_MASC = { - **{v: k for k, v in _ORDINAL_UNITS_MASC.items()}, - **{v: k for k, v in _ORDINAL_TENS_MASC.items()}, - **{v: k for k, v in _ORDINAL_HUNDREDS_MASC.items()}, - **{s_name: val for val, s_name in _ORDINAL_SCALES_MASC[Scale.SHORT]}, -} -_ORDINAL_WORDS_FEM = { - **{v: k for k, v in _ORDINAL_UNITS_FEM.items()}, - **{v: k for k, v in _ORDINAL_TENS_FEM.items()}, - **{v: k for k, v in _ORDINAL_HUNDREDS_FEM.items()}, - **{s_name: val for val, s_name in _ORDINAL_SCALES_FEM[Scale.SHORT]}, -} -_ORDINAL_WORDS = { - **_ORDINAL_WORDS_FEM, - **_ORDINAL_WORDS_MASC, -} - - -def _pronounce_up_to_999( - n: int, - gender: GrammaticalGender = GrammaticalGender.MASCULINE -) -> str: - """ - Returns the Mirandese cardinal pronunciation of an integer from 0 to 999 - - Parameters: - n (int): Integer to pronounce (must be between 0 and 999). - - Returns: - str: The number pronounced in Mirandese words. - - Raises: - ValueError: If n is not in the range 0 to 999. 
- """ - # special cases for feminine 1 and 2 "ũa", "dues" - if gender == GrammaticalGender.FEMININE: - if n == 1: - return "ũa" - if n == 2: - return "dues" - - if not 0 <= n <= 999: - raise ValueError("Number must be between 0 and 999.") - if n == 0: - return "zero" - if n == 100: - return "cien" - - parts = [] - - # Hundreds - if n >= 100: - hundred = n // 100 * 100 - parts.append("ciento" if hundred == 100 else _HUNDREDS_ALT[hundred]) - n %= 100 - if n > 0: - parts.append("i") - - # Tens and Units - if n > 0: - if n < 20: - parts.append(_TENS_MWL.get(n) or _UNITS.get(n, "")) - else: - ten = n // 10 * 10 - unit = n % 10 - parts.append(_TENS_MWL[ten]) - if unit > 0: - parts.append("i") - parts.append(_UNITS[unit]) - - return " ".join(parts) - - -def _pronounce_ordinal_up_to_999( - n: int, - gender: GrammaticalGender = GrammaticalGender.MASCULINE -) -> str: - """ - Returns the Mirandese ordinal word for an integer between 0 and 999, adjusting for grammatical gender - - Parameters: - n (int): The integer to convert (must be between 0 and 999). - - Returns: - str: The ordinal representation of the number in Mirandese. - - Raises: - ValueError: If n is not between 0 and 999. 
- """ - if not 0 <= n <= 999: - raise ValueError("Number must be between 0 and 999.") - if n == 0: - return "zero" - - parts = [] - - # Handle hundreds - if n >= 100: - hundred_val = n // 100 * 100 - hundred_word_masc = _ORDINAL_HUNDREDS_MASC.get(hundred_val) - if hundred_word_masc: - parts.append(_swap_gender(hundred_word_masc, gender)) - n %= 100 - - # Handle tens and units - if n > 0: - # Ordinal numbers don't use 'e' as a separator - if n % 10 == 0 and n > 10: - tens_word_masc = _ORDINAL_TENS_MASC[n] - parts.append(_swap_gender(tens_word_masc, gender)) - elif n < 10: - units_word_masc = _ORDINAL_UNITS_MASC[n] - parts.append(_swap_gender(units_word_masc, gender)) - elif n < 20: - tens_word_masc = _ORDINAL_TENS_MASC[10] - units_word_masc = _ORDINAL_UNITS_MASC[n - 10] - parts.append(f"{_swap_gender(tens_word_masc, gender)} {_swap_gender(units_word_masc, gender)}") - else: - tens_word_masc = _ORDINAL_TENS_MASC[n // 10 * 10] - units_word_masc = _ORDINAL_UNITS_MASC[n % 10] - parts.append(f"{_swap_gender(tens_word_masc, gender)} {_swap_gender(units_word_masc, gender)}") - - return " ".join(parts) - +################################################################## +# all methods below are deprecated and only for backwards compat +################################################################## def pronounce_ordinal_mwl( number: Union[int, float], gender: GrammaticalGender = GrammaticalGender.MASCULINE, - scale: Scale = Scale.LONG + scale: Optional[Scale] = None ) -> str: """ - Return the ordinal pronunciation of a number in Mirandese, supporting grammatical gender and scale (short or long) - - Parameters: - number (int or float): The number to pronounce as an ordinal. - gender (GrammaticalGender, optional): The grammatical gender for the ordinal form (masculine or feminine). - scale (Scale, optional): The numerical scale to use (short or long). - - Returns: - str: The ordinal pronunciation of the number in Mirandese. 
- - Raises: - TypeError: If `number` is not an int or float. + DEPRECATED """ - if not isinstance(number, (int, float)): - raise TypeError("Number must be an int or float.") - if number == 0: - return "zero" - - if number < 0: - return f"menos {pronounce_ordinal_mwl(abs(number), gender, scale)}" - - n = int(number) - if n < 1000: - return _pronounce_ordinal_up_to_999(n, gender) - - ordinal_scale_defs = _ORDINAL_SCALES_MASC[scale] - - # Find the largest scale that fits the number - for scale_val, s_name in ordinal_scale_defs: - if n >= scale_val: - break - - count = n // scale_val - remainder = n % scale_val - - # Special case for "milésimo" and other large scales where 'um' is not needed - if count == 1 and scale_val >= 1000: - count_str = _swap_gender(s_name, gender) - else: - # Pronounce the 'count' part of the number and the scale word - count_pronunciation = pronounce_number_mwl(count, scale=scale) - scale_word_masc = s_name - scale_word = _swap_gender(scale_word_masc, gender) - count_str = f"{count_pronunciation} {scale_word}" - - # If there's no remainder, we're done - if remainder == 0: - return count_str - - # Pronounce the remainder and join - remainder_str = pronounce_ordinal_mwl(remainder, gender, scale) - - return f"{count_str} {remainder_str}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or MWL.vocab.DEFAULT_SCALE + return MWL.pronounce_ordinal(number, gender, scale) def is_fractional_mwl( input_str: str ) -> Union[float, bool]: """ - Checks if the input string corresponds to a recognized Mirandese fractional word. - - Returns: - The fractional value as a float if recognized (e.g., 0.5 for "meio" or "meia"); otherwise, False. 
+ DEPRECATED """ - input_str = input_str.lower().strip() - fraction_map = _FRACTION_STRING_MWL - - # Handle plural forms - if input_str.endswith('s') and input_str not in fraction_map.values(): - input_str = input_str[:-1] - - # Handle "meio" vs "meia" - if input_str == "meia": - input_str = "meio" - - # Use a dynamic lookup instead of a hardcoded list - for den, word in fraction_map.items(): - # Handle cases like "onze abos", so we check for the whole word - if input_str == word: - return 1.0 / den - - # Special case for "meia" as a female form of "meio" (1/2) - if input_str in ["meia", "meio"]: - return 0.5 - - return False + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return MWL.is_fractional(input_str) def is_ordinal_mwl(input_str: str) -> bool: """ - Determine if a string is a Mirandese ordinal word. - - Returns: - bool: True if the input string is recognized as a Mirandese ordinal, otherwise False. + DEPRECATED """ - return input_str in _ORDINAL_WORDS + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return MWL.is_ordinal(input_str) def extract_number_mwl( text: str, ordinals: bool = False, - scale: Scale = Scale.LONG + scale: Optional[Scale] = None ) -> Union[int, float, bool]: """ - Extracts a numeric value from a Mirandese text phrase, supporting cardinals, ordinals, fractions, and large scales. - - Parameters: - text (str): The input phrase potentially containing a number. - ordinals (bool): If True, recognizes ordinal words as numbers. - scale (Scale): Specifies whether to use the short or long numerical scale. - - Returns: - int or float: The extracted number if found; otherwise, False. 
+ DEPRECATED """ - text = text.replace("bint'i", "binte i") - numbers_map = get_number_map(scale) - scales_map = _SCALES[scale] - - clean_text = text.lower().replace('-', ' ') - tokens = [t for t in clean_text.split() if t != "i"] - - result = 0 - current_number = 0 - number_consumed = False - - for i, token in enumerate(tokens): - if token is None: - continue # consumed in previous idx - next_token = tokens[i + 1] if i < len(tokens) - 1 else None - next_digit = numbers_map.get(next_token) if next_token else None - val = numbers_map.get(token) - if val is not None: - if next_digit and next_digit > val: - tokens[i + 1] = None - current_number += val * next_digit - else: - current_number += val - elif ordinals and is_ordinal_mwl(token): - current_number += _ORDINAL_WORDS[token] - elif is_fractional_mwl(token): - fraction = is_fractional_mwl(token) - result += current_number + fraction - current_number = 0 - number_consumed = True - else: - # Handle large scales like milhão, bilhão - found_scale = False - for scale_val, singular, plural in scales_map: - if token == singular or token == plural: - if current_number == 0: - current_number = 1 - result += current_number * scale_val - current_number = 0 - found_scale = True - number_consumed = True - break - if not found_scale: - if token in DECIMAL_MARKERS: - decimal_str = ''.join( - str(numbers_map.get(t, '')) for t in tokens[i + 1:] - if t in numbers_map - ) - if decimal_str: - result += current_number + float(f"0.{decimal_str}") - number_consumed = True - current_number = 0 - break - - if not number_consumed: - result += current_number - - return result if result > 0 else False + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or MWL.vocab.DEFAULT_SCALE + return MWL.extract_number(text, ordinals, scale) def pronounce_number_mwl( number: Union[int, float], places: int = 5, - scale: Scale = Scale.LONG, + scale: 
Optional[Scale] = None, ordinals: bool = False, digits: DigitPronunciation = DigitPronunciation.FULL_NUMBER, gender: GrammaticalGender = GrammaticalGender.MASCULINE ) -> str: """ - Return the full Mirandese pronunciation of a number, supporting cardinal and ordinal forms, decimals, scales and grammatical gender - - Parameters: - number (int or float): The number to pronounce. - places (int): Number of decimal places to include for floats. - scale (Scale): Numerical scale to use (short or long). - ordinals (bool): If True, pronounce as an ordinal number. - gender (GrammaticalGender): Grammatical gender for ordinal numbers. - - Returns: - str: The number expressed as a Mirandese phrase. + DEPRECATED """ - if not isinstance(number, (int, float)): - raise TypeError("Number must be an int or float.") - - if ordinals: - return pronounce_ordinal_mwl(number, gender, scale) - - if number == 0: - return "zero" - - if number < 0: - return f"menos {pronounce_number_mwl(abs(number), places, scale=scale, digits=digits, gender=gender)}" - - # Handle decimals - if "." 
in str(number): - integer_part = int(number) - decimal_part_str = f"{number:.{places}f}".split('.')[1].rstrip("0") - - # Handle cases where the decimal part rounds to zero - if decimal_part_str and int(decimal_part_str) == 0: - return pronounce_number_mwl(integer_part, places, - scale=scale, - digits=digits, gender=gender) - - int_pronunciation = pronounce_number_mwl(integer_part, places, - scale=scale, - digits=digits, gender=gender) - - decimal_pronunciation_parts = [] - # pronounce decimals either as a whole number or digit by digit - if decimal_part_str: - if digits == DigitPronunciation.FULL_NUMBER: - decimal_pronunciation_parts.append(_pronounce_up_to_999(int(decimal_part_str[:3]), gender)) - else: - for digit in decimal_part_str: - decimal_pronunciation_parts.append(_pronounce_up_to_999(int(digit), gender)) - - decimal_pronunciation = " ".join(decimal_pronunciation_parts) or "zero" - decimal_word = "bírgula" - return f"{int_pronunciation} {decimal_word} {decimal_pronunciation}" - - # --- Integer Pronunciation Logic --- - n = int(number) - - # Base case for recursion: numbers less than 1000 - if n < 1000: - return _pronounce_up_to_999(n, gender) - - scale_definitions = _SCALES[scale] - - # Find the largest scale that fits the number - for scale_val, s_name, p_name in scale_definitions: - if n >= scale_val: - break - - count = n // scale_val - remainder = n % scale_val - - # Pronounce the 'count' part of the number - scale_word = s_name if count == 1 else p_name - if count == 1 and scale_word == "mil": - count_str = scale_word - else: - count_pronunciation = pronounce_number_mwl(count, places, scale) - count_str = f"{count_pronunciation} {scale_word}" - - # If there's no remainder, we're done - if remainder == 0: - return count_str - - # Pronounce the remainder and join with the correct conjunction - remainder_str = pronounce_number_mwl(remainder, places, scale) - - # Conjunction logic: add "i" if the remainder is the last group and is - # less than 100 or a 
multiple of 100. - if remainder < 100 or (remainder < 1000 and remainder % 100 == 0): - return f"{count_str} e {remainder_str}" - else: - return f"{count_str} {remainder_str}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or MWL.vocab.DEFAULT_SCALE + return MWL.pronounce_number(number, places, scale, ordinals, digits, gender) def numbers_to_digits_mwl( utterance: str, - scale: Scale = Scale.LONG + scale: Optional[Scale] = None ) -> str: """ - Converts written Mirandese numbers in a text string to their digit equivalents, preserving all other text. - - Identifies spans of number words (including the joiner "i"), extracts their numeric values, and replaces them with digit strings. Non-number words and context are left unchanged. - - Parameters: - utterance (str): Input text possibly containing written Mirandese numbers. - scale (Scale, optional): Numerical scale (short or long) to interpret large numbers. Defaults to Scale.LONG. - - Returns: - str: The input text with written numbers replaced by their digit representations. 
+ DEPRECATED """ - utterance = utterance.replace("bint'i", "binte i") - for n, v in _HUNDREDS_ALT.items(): - # normalize alternative multi-word spelling to single word - utterance = utterance.replace(v, _HUNDREDS[n]) - - words = tokenize(utterance) - output = [] - i = 0 - NUMBERS = get_number_map(scale) - while i < len(words): - # Look for the start of a number span - if words[i] in NUMBERS: - # Start a new span - number_span_words = [] - j = i - # Continue the span as long as we find number words or the joiner 'e' - while j < len(words) and (words[j] in NUMBERS or words[j] == "i"): - number_span_words.append(words[j]) - j += 1 - - # Form the phrase from the span and extract the number value - phrase = " ".join(number_span_words) - number_val = extract_number_mwl(phrase) - - if number_val is not False: - # If a valid number is found, add its digit representation to the output - output.append(str(number_val)) - # Advance the main index 'i' past the entire span - i = j - else: - # If the span doesn't form a valid number, treat the first word as non-numeric - # and move to the next word. This handles cases like "i" at the beginning of a sentence. - output.append(words[i]) - i += 1 - else: - # If the current word is not a number word, add it to the output - # and move to the next word - output.append(words[i]) - i += 1 - - return " ".join(output) - - -def pronounce_fraction_mwl(word: str, scale: Scale = Scale.LONG) -> str: - """ - Return the Mirandese pronunciation of a fraction given as a string (e.g., "1/2"). + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or MWL.vocab.DEFAULT_SCALE + return MWL.numbers_to_digits(utterance, scale) - The numerator is pronounced as a cardinal number, and the denominator as an ordinal or fraction name, pluralized if appropriate. 
For denominators not in the known fraction list, the denominator is pronounced as a cardinal number followed by "abos" if plural. - Parameters: - word (str): Fraction in the form "numerator/denominator" (e.g., "3/4"). - Returns: - str: The Mirandese pronunciation of the fraction. +def pronounce_fraction_mwl(word: str, scale: Optional[Scale] = None) -> str: + """ + DEPRECATED """ - word = word.replace("bint'i", "binte i") - n1, n2 = word.split("/") - n1_int, n2_int = int(n1), int(n2) - - # Pronounce the denominator (second number) as an ordinal, and pluralize it if needed. - if n2_int in _FRACTION_STRING_MWL: - denom = _FRACTION_STRING_MWL[n2_int] - if n1_int != 1: - denom += "s" # plural - else: - # For other numbers - denom = pronounce_number_mwl(n2_int, scale=scale) - if n1_int > 1: # plural - denom += " abos" - - # Pronounce the numerator (first number) as a cardinal. - num = pronounce_number_mwl(n1_int, scale=scale) - return f"{num} {denom}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + scale = scale or MWL.vocab.DEFAULT_SCALE + return MWL.pronounce_fraction(word, scale) if __name__ == "__main__": print("--- Testing Pronunciation (Short Scale) ---") - print(f"1,234,567: {pronounce_number_mwl(1_234_567, scale=Scale.SHORT)}") - print(f"1,000,000,000: {pronounce_number_mwl(1_000_000_000, scale=Scale.SHORT)}") + print(f"1,234,567: {MWL.pronounce_number(1_234_567, scale=Scale.SHORT)}") + print(f"1,000,000,000: {MWL.pronounce_number(1_000_000_000, scale=Scale.SHORT)}") print("\n--- Testing Pronunciation (Long Scale) ---") - print(f"1,000,000: {pronounce_number_mwl(1_000_000, scale=Scale.LONG)}") - print(f"1,000,100: {pronounce_number_mwl(1_000_100, scale=Scale.LONG)}") - print(f"1,000,000,000: {pronounce_number_mwl(1_000_000_000, scale=Scale.LONG)}") - print(f"1,000,000,000,000: {pronounce_number_mwl(1_000_000_000_000, scale=Scale.LONG)}") - print(f"2,500,000,000: 
{pronounce_number_mwl(2_500_000_000, scale=Scale.LONG)}") - print(f"2,500,123,456: {pronounce_number_mwl(2_500_123_456, scale=Scale.LONG)}") - print(f"16: {pronounce_number_mwl(16)}") + print(f"1,000,000: {MWL.pronounce_number(1_000_000, scale=Scale.LONG)}") + print(f"1,000,100: {MWL.pronounce_number(1_000_100, scale=Scale.LONG)}") + print(f"1,000,000,000: {MWL.pronounce_number(1_000_000_000, scale=Scale.LONG)}") + print(f"1,000,000,000,000: {MWL.pronounce_number(1_000_000_000_000, scale=Scale.LONG)}") + print(f"2,500,000,000: {MWL.pronounce_number(2_500_000_000, scale=Scale.LONG)}") + print(f"2,500,123,456: {MWL.pronounce_number(2_500_123_456, scale=Scale.LONG)}") + print(f"16: {MWL.pronounce_number(16)}") print("\n--- Testing Edge Cases ---") - print(f"-123.45: {pronounce_number_mwl(-123.45)}") - print(f"10.05: {pronounce_number_mwl(10.05)}") - print(f"2000: {pronounce_number_mwl(2000)}") - print(f"2001: {pronounce_number_mwl(2001)}") - print(f"123.456789: {pronounce_number_mwl(123.456789)}") + print(f"-123.45: {MWL.pronounce_number(-123.45)}") + print(f"10.05: {MWL.pronounce_number(10.05)}") + print(f"2000: {MWL.pronounce_number(2000)}") + print(f"2001: {MWL.pronounce_number(2001)}") + print(f"123.456789: {MWL.pronounce_number(123.456789)}") print("\n--- Testing Ordinal Pronunciation ---") - print(f"1st (masculine): {pronounce_number_mwl(1, ordinals=True, gender=GrammaticalGender.MASCULINE)}") - print(f"1st (feminine): {pronounce_number_mwl(1, ordinals=True, gender=GrammaticalGender.FEMININE)}") - print(f"23rd (masculine): {pronounce_number_mwl(23, ordinals=True)}") - print(f"23rd (feminine): {pronounce_number_mwl(23, ordinals=True, gender=GrammaticalGender.FEMININE)}") - print(f"100th: {pronounce_number_mwl(100, ordinals=True)}") - print(f"101st: {pronounce_number_mwl(101, ordinals=True)}") - print(f"1000th: {pronounce_number_mwl(1000, ordinals=True)}") - print(f"1,000,000th: {pronounce_number_mwl(1_000_000, ordinals=True)}") - print(f"1,000,000,000,000th 
(long): {pronounce_number_mwl(1_000_000_000_000, ordinals=True, scale=Scale.LONG)}") - - print("\n--- Testing numbers_to_digits_mwl ---") - print(f"'duzientos i cinquenta' -> '{numbers_to_digits_mwl('duzientos i cinquenta')}'") - print(f"'un milhon' -> '{numbers_to_digits_mwl('un milhon')}'") - print(f"'zasseis' -> '{numbers_to_digits_mwl('zasseis')}'") - print(f"'hai duzientos i cinquenta carros' -> '{numbers_to_digits_mwl('hai duzientos i cinquenta carros')}'") + print(f"1st (masculine): {MWL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.MASCULINE)}") + print(f"1st (feminine): {MWL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.FEMININE)}") + print(f"23rd (masculine): {MWL.pronounce_number(23, ordinals=True)}") + print(f"23rd (feminine): {MWL.pronounce_number(23, ordinals=True, gender=GrammaticalGender.FEMININE)}") + print(f"100th: {MWL.pronounce_number(100, ordinals=True)}") + print(f"101st: {MWL.pronounce_number(101, ordinals=True)}") + print(f"1000th: {MWL.pronounce_number(1000, ordinals=True)}") + print(f"1,000,000th: {MWL.pronounce_number(1_000_000, ordinals=True)}") + print(f"1,000,000,000,000th (long): {MWL.pronounce_number(1_000_000_000_000, ordinals=True, scale=Scale.LONG)}") + + print("\n--- Testing MWL.numbers_to_digits ---") + print(f"'duzientos i cinquenta' -> '{MWL.numbers_to_digits('duzientos i cinquenta')}'") + print(f"'un milhon' -> '{MWL.numbers_to_digits('un milhon')}'") + print(f"'zasseis' -> '{MWL.numbers_to_digits('zasseis')}'") + print(f"'hai duzientos i cinquenta carros' -> '{MWL.numbers_to_digits('hai duzientos i cinquenta carros')}'") print("\n--- Testing Ordinal Extraction ---") - print(f"'l segundo carro' -> {extract_number_mwl('l segundo carro', ordinals=True)}") - print(f"'purmerio lugar' -> {extract_number_mwl('purmerio lugar', ordinals=True)}") - print(f"'l milésimo die' -> {extract_number_mwl('l milésimo dia', ordinals=True)}") - print(f"'la milésima beç' -> {extract_number_mwl('la milésima beç', 
ordinals=True)}") - print(f"'la purmeria beç' -> {extract_number_mwl('la purmeria beç', ordinals=True)}") - print(f"'la sessagésima quarta beç' -> {extract_number_mwl('la sessagésima quarta beç', ordinals=True)}") + print(f"'l segundo carro' -> {MWL.extract_number('l segundo carro', ordinals=True)}") + print(f"'purmerio lugar' -> {MWL.extract_number('purmerio lugar', ordinals=True)}") + print(f"'l milésimo die' -> {MWL.extract_number('l milésimo dia', ordinals=True)}") + print(f"'la milésima beç' -> {MWL.extract_number('la milésima beç', ordinals=True)}") + print(f"'la purmeria beç' -> {MWL.extract_number('la purmeria beç', ordinals=True)}") + print(f"'la sessagésima quarta beç' -> {MWL.extract_number('la sessagésima quarta beç', ordinals=True)}") print("\n--- Testing Cardinal Extraction ---") - print(f"'un' -> {extract_number_mwl('un')}") - print(f"'ũa' -> {extract_number_mwl('ũa')}") - print(f"'bint'i un' ->", extract_number_mwl("bint'i un")) - print(f"'bint'i ũa' ->", extract_number_mwl("bint'i ũa")) - print(f"'bint'i dous' ->", extract_number_mwl("bint'i dous")) - print(f"'bint'i dues' ->", extract_number_mwl("bint'i dues")) - print(f"'un milhon' -> {extract_number_mwl('un milhon')}") - print(f"'dous milhones e quinhentos' -> {extract_number_mwl('dous milhones i quinhentos')}") - print(f"'mil i binte i trés' -> {extract_number_mwl('mil i binte i trés')}") - print(f"'trinta i cinco bírgula quatro' -> {extract_number_mwl('trinta i cinco bírgula quatro')}") + print(f"'un' -> {MWL.extract_number('un')}") + print(f"'ũa' -> {MWL.extract_number('ũa')}") + print(f"'bint'i un' ->", MWL.extract_number("bint'i un")) + print(f"'bint'i ũa' ->", MWL.extract_number("bint'i ũa")) + print(f"'bint'i dous' ->", MWL.extract_number("bint'i dous")) + print(f"'bint'i dues' ->", MWL.extract_number("bint'i dues")) + print(f"'un milhon' -> {MWL.extract_number('un milhon')}") + print(f"'dous milhones e quinhentos' -> {MWL.extract_number('dous milhones i quinhentos')}") + print(f"'mil i 
binte i trés' -> {MWL.extract_number('mil i binte i trés')}") + print(f"'trinta i cinco bírgula quatro' -> {MWL.extract_number('trinta i cinco bírgula quatro')}") print("\n--- Testing Fractions ---") - print(f"1/2: {pronounce_fraction_mwl('1/2')}") - print(f"2/2: {pronounce_fraction_mwl('2/2')}") - print(f"5/2: {pronounce_fraction_mwl('5/2')}") - print(f"5/3: {pronounce_fraction_mwl('5/3')}") - print(f"5/4: {pronounce_fraction_mwl('5/4')}") - print(f"7/5: {pronounce_fraction_mwl('7/5')}") - print(f"0/20: {pronounce_fraction_mwl('0/20')}") + print(f"1/2: {MWL.pronounce_fraction('1/2')}") + print(f"2/2: {MWL.pronounce_fraction('2/2')}") + print(f"5/2: {MWL.pronounce_fraction('5/2')}") + print(f"5/3: {MWL.pronounce_fraction('5/3')}") + print(f"5/4: {MWL.pronounce_fraction('5/4')}") + print(f"7/5: {MWL.pronounce_fraction('7/5')}") + print(f"0/20: {MWL.pronounce_fraction('0/20')}") diff --git a/ovos_number_parser/numbers_pt.py b/ovos_number_parser/numbers_pt.py index ebb4751..523d7fe 100644 --- a/ovos_number_parser/numbers_pt.py +++ b/ovos_number_parser/numbers_pt.py @@ -1,8 +1,9 @@ -import re from enum import Enum -from typing import List, Union, Dict, Tuple - -from ovos_number_parser.util import Scale, GrammaticalGender, DigitPronunciation +from typing import Union, Optional +import warnings +from ovos_number_parser.util import (Scale, GrammaticalGender, + DigitPronunciation, + NumberVocabulary, RomanceNumberExtractor) class PortugueseVariant(str, Enum): @@ -15,201 +16,7 @@ class PortugueseVariant(str, Enum): PT = "pt" -# --- Base Pronunciation Dictionaries (Variant-aware) --- -# Dictionaries for units, tens, and hundreds, separated by Portuguese variant. 
- -_UNITS: Dict[int, str] = { - 1: 'um', 2: 'dois', 3: 'três', 4: 'quatro', 5: 'cinco', 6: 'seis', - 7: 'sete', 8: 'oito', 9: 'nove' -} - -_TENS_BR: Dict[int, str] = { - 10: 'dez', 11: 'onze', 12: 'doze', 13: 'treze', 14: 'catorze', - 15: 'quinze', 16: 'dezesseis', 17: 'dezessete', 18: 'dezoito', - 19: 'dezenove', 20: 'vinte', 30: 'trinta', 40: 'quarenta', - 50: 'cinquenta', 60: 'sessenta', 70: 'setenta', 80: 'oitenta', - 90: 'noventa' -} - -_TENS_PT: Dict[int, str] = { - 10: 'dez', 11: 'onze', 12: 'doze', 13: 'treze', 14: 'catorze', - 15: 'quinze', 16: 'dezasseis', 17: 'dezassete', 18: 'dezoito', - 19: 'dezanove', 20: 'vinte', 30: 'trinta', 40: 'quarenta', - 50: 'cinquenta', 60: 'sessenta', 70: 'setenta', 80: 'oitenta', - 90: 'noventa' -} - -_HUNDREDS: Dict[int, str] = { - 100: 'cem', 200: 'duzentos', 300: 'trezentos', 400: 'quatrocentos', - 500: 'quinhentos', 600: 'seiscentos', 700: 'setecentos', - 800: 'oitocentos', 900: 'novecentos' -} - -_FRACTION_STRING_PT: Dict[int, str] = { - 2: 'meio', 3: 'terço', 4: 'quarto', 5: 'quinto', 6: 'sexto', - 7: 'sétimo', 8: 'oitavo', 9: 'nono', 10: 'décimo', - 11: 'onze avos', 12: 'doze avos', 13: 'treze avos', 14: 'catorze avos', - 15: 'quinze avos', 16: 'dezasseis avos', 17: 'dezassete avos', - 18: 'dezoito avos', 19: 'dezanove avos', - 20: 'vigésimo', 30: 'trigésimo', 100: 'centésimo', 1000: 'milésimo' -} - -_FEMALE_NUMS = { - "uma": 1, - "duas": 2 -} - -# --- Ordinal Pronunciation Dictionaries (Masculine Base) --- -# These dictionaries are for masculine forms. The feminine form -# is generated by replacing the final 'o' with 'a'. 
-_ORDINAL_UNITS_MASC: Dict[int, str] = { - 1: 'primeiro', 2: 'segundo', 3: 'terceiro', 4: 'quarto', 5: 'quinto', - 6: 'sexto', 7: 'sétimo', 8: 'oitavo', 9: 'nono' -} - -_ORDINAL_TENS_MASC: Dict[int, str] = { - 10: 'décimo', 20: 'vigésimo', 30: 'trigésimo', 40: 'quadragésimo', - 50: 'quinquagésimo', 60: 'sexagésimo', 70: 'septuagésimo', - 80: 'octogésimo', 90: 'nonagésimo' -} - -_ORDINAL_HUNDREDS_MASC: Dict[int, str] = { - 100: 'centésimo', 200: 'ducentésimo', 300: 'trecentésimo', - 400: 'quadrigentésimo', 500: 'quingentésimo', 600: 'sexcentésimo', - 700: 'septingentésimo', 800: 'octingentésimo', 900: 'noningentésimo' -} - -_ORDINAL_SCALES_MASC: Dict[Scale, Dict[PortugueseVariant, List[Tuple[int, str]]]] = { - Scale.SHORT: { - PortugueseVariant.BR: [ - (10 ** 21, "sextilionésimo"), - (10 ** 18, "quintilionésimo"), - (10 ** 15, "quadrilionésimo"), - (10 ** 12, "trilionésimo"), - (10 ** 9, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, "milésimo") - ], - PortugueseVariant.PT: [ - (10 ** 21, "sextilionésimo"), - (10 ** 18, "quintilionésimo"), - (10 ** 15, "quatrilionésimo"), - (10 ** 12, "trilionésimo"), - (10 ** 9, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, "milésimo") - ] - }, - Scale.LONG: { - PortugueseVariant.BR: [ - (10 ** 36, "sextilionésimo"), - (10 ** 30, "quintilionésimo"), - (10 ** 24, "quatrilionésimo"), - (10 ** 18, "trilionésimo"), - (10 ** 12, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, "milésimo") - ], - PortugueseVariant.PT: [ - (10 ** 36, "sextilionésimo"), - (10 ** 30, "quintilionésimo"), - (10 ** 24, "quatrilionésimo"), - (10 ** 18, "trilionésimo"), - (10 ** 12, "bilionésimo"), - (10 ** 6, "milionésimo"), - (10 ** 3, "milésimo") - ] - } -} - - -# --- Scale Definitions --- -# Structure: (value, singular_name, plural_name) -# Ordered from largest to smallest. 
-_SCALES: Dict[Scale, Dict[PortugueseVariant, List[Tuple[int, str, str]]]] = { - Scale.SHORT: { - PortugueseVariant.BR: [ - (10 ** 21, "sextilhão", "sextilhões"), - (10 ** 18, "quintilhão", "quintilhões"), - (10 ** 15, "quadrilhão", "quadrilhões"), - (10 ** 12, "trilhão", "trilhões"), - (10 ** 9, "bilhão", "bilhões"), - (10 ** 6, "milhão", "milhões"), - (10 ** 3, "mil", "mil") - ], - PortugueseVariant.PT: [ - (10 ** 21, "sextilião", "sextiliões"), - (10 ** 18, "quintilião", "quintiliões"), - (10 ** 15, "quatrilião", "quatriliões"), - (10 ** 12, "trilião", "triliões"), - (10 ** 9, "bilião", "biliões"), - (10 ** 6, "milhão", "milhões"), - (10 ** 3, "mil", "mil") - ] - }, - Scale.LONG: { - PortugueseVariant.BR: [ - (10 ** 36, "sextilhão", "sextilhões"), - (10 ** 30, "quintilhão", "quintilhões"), - (10 ** 24, "quatrilhão", "quatrilhões"), - (10 ** 18, "trilhão", "trilhões"), - (10 ** 12, "bilhão", "bilhões"), - (10 ** 6, "milhão", "milhões"), - (10 ** 3, "mil", "mil") - ], - PortugueseVariant.PT: [ - (10 ** 36, "sextilião", "sextiliões"), - (10 ** 30, "quintilião", "quintiliões"), - (10 ** 24, "quatrilião", "quatriliões"), - (10 ** 18, "trilião", "triliões"), - (10 ** 12, "bilião", "biliões"), - (10 ** 6, "milhão", "milhões"), - (10 ** 3, "mil", "mil") - ] - } -} - -# Mapping of number words to their integer values. This is dynamically built -# from the base dictionaries to ensure consistency and variant support. -_NUMBERS_BASE = { - **_FEMALE_NUMS, - **{v: k for k, v in _UNITS.items()}, - **{v: k for k, v in _TENS_PT.items()}, - **{v: k for k, v in _TENS_BR.items()}, - **{v: k for k, v in _HUNDREDS.items()}, - "cento": 100 -} - -def get_number_map(scale: Scale = Scale.LONG, - variant: PortugueseVariant = PortugueseVariant.PT): - """ - Return a dictionary mapping Portuguese number words, including scale names, to their integer values for the specified scale and language variant. - - Parameters: - scale (Scale): The numerical scale to use (short or long). 
- variant (PortugueseVariant): The Portuguese language variant (Brazilian or European). - - Returns: - dict: Mapping of Portuguese number words (units, tens, hundreds, scale names) to their corresponding integer values. - """ - return { - **_NUMBERS_BASE, - **{s_name: val for val, s_name, _ in _SCALES[scale][variant]}, - **{p_name: val for val, _, p_name in _SCALES[scale][variant]} - } - -_NUMBERS_BR = get_number_map(Scale.SHORT, PortugueseVariant.BR) -_NUMBERS_PT = get_number_map(Scale.LONG, PortugueseVariant.PT) - -_ORDINAL_WORDS_MASC = { - **{v: k for k, v in _ORDINAL_UNITS_MASC.items()}, - **{v: k for k, v in _ORDINAL_TENS_MASC.items()}, - **{v: k for k, v in _ORDINAL_HUNDREDS_MASC.items()}, - **{s_name: val for val, s_name in _ORDINAL_SCALES_MASC[Scale.SHORT][PortugueseVariant.BR]}, -} - - -def _swap_gender(word: str, gender: GrammaticalGender) -> str: +def swap_gender_pt(word: str, gender: GrammaticalGender) -> str: """ Convert a Portuguese word between masculine and feminine grammatical gender by adjusting its ending. @@ -220,7 +27,12 @@ def _swap_gender(word: str, gender: GrammaticalGender) -> str: Returns: str: The word with its ending swapped to match the specified gender, if applicable; otherwise, the original word. 
""" - if gender == GrammaticalGender.FEMININE and word.endswith('o'): + if word == "dois" and gender == GrammaticalGender.FEMININE: + return "duas" + elif word == "duas" and gender == GrammaticalGender.MASCULINE: + return "dois" + + elif gender == GrammaticalGender.FEMININE and word.endswith('o'): return word[:-1] + 'a' elif gender == GrammaticalGender.MASCULINE and word.endswith('ma'): return word[:-1] @@ -235,121 +47,327 @@ def _swap_gender(word: str, gender: GrammaticalGender) -> str: return word -def _pronounce_up_to_999( - n: int, - variant: PortugueseVariant = PortugueseVariant.PT, - gender: GrammaticalGender = GrammaticalGender.MASCULINE -) -> str: - """ - Returns the Portuguese cardinal pronunciation of an integer from 0 to 999, using the specified language variant. - - Parameters: - n (int): Integer to pronounce (must be between 0 and 999). - variant (PortugueseVariant, optional): Portuguese variant (Brazilian or European). Defaults to Brazilian. - - Returns: - str: The number pronounced in Portuguese words. - - Raises: - ValueError: If n is not in the range 0 to 999. 
- """ - # special cases for feminine 1 and 2 "uma", "duas" - if gender == GrammaticalGender.FEMININE: - if n == 1: - return "uma" - if n == 2: - return "duas" - - if not 0 <= n <= 999: - raise ValueError("Number must be between 0 and 999.") - if n == 0: - return "zero" - if n == 100: - return "cem" - - parts = [] - tens_map = _TENS_BR if variant == PortugueseVariant.BR else _TENS_PT - - # Hundreds - if n >= 100: - hundred = n // 100 * 100 - parts.append("cento" if hundred == 100 else _HUNDREDS[hundred]) - n %= 100 - if n > 0: - parts.append("e") - - # Tens and Units - if n > 0: - if n < 20: - parts.append(tens_map.get(n) or _UNITS.get(n, "")) - else: - ten = n // 10 * 10 - unit = n % 10 - parts.append(tens_map[ten]) - if unit > 0: - parts.append("e") - parts.append(_UNITS[unit]) - - return " ".join(parts) - - -def _pronounce_ordinal_up_to_999( - n: int, - gender: GrammaticalGender = GrammaticalGender.MASCULINE, - variant: PortugueseVariant = PortugueseVariant.PT -) -> str: - """ - Returns the Portuguese ordinal word for an integer between 0 and 999, adjusting for grammatical gender and language variant. +def pluralize_pt(word: str): + if word.endswith("ão"): + return word[:-2] + "ões" + if not word.endswith("s"): + return word + "s" + return word - Parameters: - n (int): The integer to convert (must be between 0 and 999). - Returns: - str: The ordinal representation of the number in Portuguese. 
+# https://pt.wikipedia.org/wiki/Numerais_em_l%C3%ADngua_portuguesa +_PT_PT = NumberVocabulary( + LANG="pt-PT", + + swap_gender=swap_gender_pt, # used for female forms + pluralize=pluralize_pt, # use for plural forms + + HUNDRED_PARTICLE="cento", # how to read "1XX" + DENOMINATOR_PARTICLE="avos", # for fractions X / N {PARTICLE} + NUMBER_OVERFLOW="número exageradamente grande", + DIVIDED_BY_ZERO="a dividir por zero", # how to read X/0 values + NO_PREV_UNIT=[100, 1000], # "mil" vs "um mil" / "cem" vs "um cem" + NO_PLURAL=[1000], # "dois mil" vs "dois mils" / "dois milhões" vs "dois milhão" + + DEFAULT_SCALE=Scale.LONG, + JOIN_WORD=["e"], + JOINER_ON_TWENTYS = True, # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS = True, # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS = False, # add JOIN_WORD from 1000-10000 - "mil e duzentos" + + DECIMAL_MARKER=["vírgula", "virgula", "ponto", ".", ","], + NEGATIVE_SIGN=["menos"], + UNITS={ + 0: 'zero', + 1: 'um', + 2: 'dois', + 3: 'três', + 4: 'quatro', + 5: 'cinco', + 6: 'seis', + 7: 'sete', + 8: 'oito', + 9: 'nove' + }, + TENS={ + 10: 'dez', + 11: 'onze', + 12: 'doze', + 13: 'treze', + 14: 'catorze', + 15: 'quinze', + 16: 'dezasseis', + 17: 'dezassete', + 18: 'dezoito', + 19: 'dezanove', + 20: 'vinte', + 30: 'trinta', + 40: 'quarenta', + 50: 'cinquenta', + 60: 'sessenta', + 70: 'setenta', + 80: 'oitenta', + 90: 'noventa' + }, + HUNDREDS={ + 100: 'cem', + 200: 'duzentos', + 300: 'trezentos', + 400: 'quatrocentos', + 500: 'quinhentos', + 600: 'seiscentos', + 700: 'setecentos', + 800: 'oitocentos', + 900: 'novecentos' + }, + FRACTION={ + 2: 'meio', + 3: 'terço', + 4: 'quarto', + 5: 'quinto', + 6: 'sexto', + 7: 'sétimo', + 8: 'oitavo', + 9: 'nono', + 10: 'décimo', + 11: 'onze avos', + 12: 'doze avos', + 13: 'treze avos', + 14: 'catorze avos', + 15: 'quinze avos', + 16: 'dezasseis avos', + 17: 'dezassete avos', + 18: 'dezoito avos', + 19: 'dezanove avos', + 20: 'vigésimo', + 30: 'trigésimo', + 100: 
'centésimo', + 200: 'duocentésimo', + 300: 'trecentésimo', + 400: 'quadrigentésimo', # / 'quadringentésimo' + 500: 'quingentésimo', + 600: 'sexcentésimo', # / 'seiscentésimo' + 700: 'septingentésimo', # / 'setingentésimo' + 800: 'octingentésimo', + 900: 'nonngentésimo', # / 'noningentésimo' + 1000: 'milésimo' + }, + FRACTION_FEMALE={2: "meia"}, + SHORT_SCALE={ + 10 ** 303: "centilião", + 10 ** 63: "vigintilião", # / "vintilião" + 10 ** 60: "novendecilião", + 10 ** 57: "octodecilião", + 10 ** 54: "septendecilião", + 10 ** 51: "sedecilião", + 10 ** 48: "quindecilião", # Quinciodecilião + 10 ** 45: "quadriodecilião", # quatuordecilião + 10 ** 42: "tredecilião", # tridecilião / triodecilião + 10 ** 39: "dudecilião", # duodecilião + 10 ** 36: "undecilião", # unodecilião + 10 ** 33: "decilião", + 10 ** 30: "nonilião", + 10 ** 27: "octilião", + 10 ** 24: "septilião", + 10 ** 21: "sextilião", + 10 ** 18: "quintilião", + 10 ** 15: "quatrilião", + 10 ** 12: "trilião", + 10 ** 9: "bilião", + 10 ** 6: "milhão", + 10 ** 3: "mil" + }, + LONG_SCALE={ + 10 ** 600: "centilião", + 10 ** 120: "vigintilião", # / "vintilião" + 10 ** 114: "novendecilião", + 10 ** 108: "octodecilião", + 10 ** 102: "septendecilião", + 10 ** 96: "sedecilião", + 10 ** 90: "quindecilião", # Quinciodecilião + 10 ** 84: "quadriodecilião", # quatuordecilião + 10 ** 78: "tredecilião", # tridecilião / triodecilião + 10 ** 72: "dudecilião", # duodecilião + 10 ** 66: "undecilião", # unodecilião + 10 ** 60: "decilião", + 10 ** 54: "nonilião", + 10 ** 48: "octilião", + 10 ** 42: "septilião", + 10 ** 36: "sextilião", + 10 ** 30: "quintilião", + 10 ** 24: "quatrilião", + 10 ** 18: "trilião", + 10 ** 12: "bilião", + 10 ** 6: "milhão", + 10 ** 3: "mil" + }, + # defaults to male, use swap_gender/self.GENDERED_SPELLINGS if needed + GENDERED_SPELLINGS={ + GrammaticalGender.FEMININE: {1: "uma", 2: "duas"} + }, + DIGIT_SPELLINGS={}, + ALT_SPELLINGS={ + # pt-BR regionalisms + 'dezesseis': 16, + 'dezessete': 17, + 'dezenove': 
19 + }, + ORDINAL_UNITS={ + 1: 'primeiro', + 2: 'segundo', + 3: 'terceiro', + 4: 'quarto', + 5: 'quinto', + 6: 'sexto', + 7: 'sétimo', + 8: 'oitavo', + 9: 'nono' + }, + ORDINAL_TENS={ + 10: 'décimo', + 20: 'vigésimo', + 30: 'trigésimo', + 40: 'quadragésimo', + 50: 'quinquagésimo', + 60: 'sexagésimo', + 70: 'septuagésimo', + 80: 'octogésimo', + 90: 'nonagésimo' + }, + ORDINAL_HUNDREDS={ + 100: 'centésimo', + 200: 'ducentésimo', + 300: 'trecentésimo', + 400: 'quadrigentésimo', + 500: 'quingentésimo', + 600: 'sexcentésimo', + 700: 'septingentésimo', + 800: 'octingentésimo', + 900: 'noningentésimo' + }, + ORDINAL_SHORT_SCALE={ + 10 ** 21: "sextilionésimo", + 10 ** 18: "quintilionésimo", + 10 ** 15: "quatrilionésimo", + 10 ** 12: "trilionésimo", + 10 ** 9: "bilionésimo", + 10 ** 6: "milionésimo", + 10 ** 3: "milésimo" + }, + ORDINAL_LONG_SCALE={ + 10 ** 36: "sextilionésimo", + 10 ** 30: "quintilionésimo", + 10 ** 24: "quatrilionésimo", + 10 ** 18: "trilionésimo", + 10 ** 12: "bilionésimo", + 10 ** 6: "milionésimo", + 10 ** 3: "milésimo" + } +) + +_PT_BR = NumberVocabulary( + LANG="pt-BR", + swap_gender=swap_gender_pt, # used for female forms + pluralize=pluralize_pt, # use for plural forms + DEFAULT_SCALE=Scale.SHORT, + DIVIDED_BY_ZERO=_PT_PT.DIVIDED_BY_ZERO, + DENOMINATOR_PARTICLE=_PT_PT.DENOMINATOR_PARTICLE, + HUNDRED_PARTICLE=_PT_PT.HUNDRED_PARTICLE, + NO_PREV_UNIT=_PT_PT.NO_PREV_UNIT, + NO_PLURAL=_PT_PT.NO_PLURAL, + NUMBER_OVERFLOW=_PT_PT.NUMBER_OVERFLOW, + JOIN_WORD=_PT_PT.JOIN_WORD, + JOINER_ON_TWENTYS=True, # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS=True, # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS=False, # add JOIN_WORD from 1000-10000 - "mil e duzentos" + DECIMAL_MARKER=_PT_PT.DECIMAL_MARKER, + NEGATIVE_SIGN=_PT_PT.NEGATIVE_SIGN, + UNITS=_PT_PT.UNITS, + TENS={ + **_PT_PT.TENS, + 16: 'dezesseis', + 17: 'dezessete', + 19: 'dezenove' + }, + HUNDREDS=_PT_PT.HUNDREDS, + FRACTION=_PT_PT.FRACTION, + 
FRACTION_FEMALE=_PT_PT.FRACTION_FEMALE, + SHORT_SCALE={ + 10 ** 303: "centilhão", + 10 ** 63: "vigintilhão", # / "vintilhão" + 10 ** 60: "novendecilhão", + 10 ** 57: "octodecilhão", + 10 ** 54: "septendecilhão", + 10 ** 51: "sedecilhão", + 10 ** 48: "quindecilhão", # Quinciodecilhão + 10 ** 45: "quadriodecilhão", # quatuordecilhão + 10 ** 42: "tredecilhão", # tridecilhão / triodecilhão + 10 ** 39: "dudecilhão", # duodecilhão + 10 ** 36: "undecilhão", # unodecilhão + 10 ** 33: "decilhão", + 10 ** 30: "nonilhão", + 10 ** 27: "octilhão", + 10 ** 24: "septilhão", + 10 ** 21: "sextilhão", + 10 ** 18: "quintilhão", + 10 ** 15: "quatrilhão", + 10 ** 12: "trilhão", + 10 ** 9: "bilhão", + 10 ** 6: "milhão", + 10 ** 3: "mil" + }, + LONG_SCALE={ + 10 ** 600: "centilhão", + 10 ** 120: "vigintilhão", # / "vintilhão" + 10 ** 114: "novendecilhão", + 10 ** 108: "octodecilhão", + 10 ** 102: "septendecilhão", + 10 ** 96: "sedecilhão", + 10 ** 90: "quindecilhão", # Quinciodecilhão + 10 ** 84: "quadriodecilhão", # quatuordecilhão + 10 ** 78: "tredecilhão", # tridecilhão / triodecilhão + 10 ** 72: "dudecilhão", # duodecilhão + 10 ** 66: "undecilhão", # unodecilhão + 10 ** 60: "decilhão", + 10 ** 54: "nonilhão", + 10 ** 48: "octilhão", + 10 ** 42: "septilhão", + 10 ** 36: "sextilhão", + 10 ** 30: "quintilhão", + 10 ** 24: "quatrilhão", + 10 ** 18: "trilhão", + 10 ** 12: "bilhão", + 10 ** 6: "milhão", + 10 ** 3: "mil" + }, + GENDERED_SPELLINGS=_PT_PT.GENDERED_SPELLINGS, + DIGIT_SPELLINGS={ + 6: "meia" # coloquialism + }, + ALT_SPELLINGS={ + # pt-PT regionalisms + 'dezasseis': 16, + 'dezassete': 17, + 'dezanove': 19 + }, + ORDINAL_UNITS=_PT_PT.ORDINAL_UNITS, + ORDINAL_TENS=_PT_PT.ORDINAL_TENS, + ORDINAL_HUNDREDS=_PT_PT.ORDINAL_HUNDREDS, + ORDINAL_SHORT_SCALE=_PT_PT.ORDINAL_SHORT_SCALE, + ORDINAL_LONG_SCALE=_PT_PT.ORDINAL_LONG_SCALE +) - Raises: - ValueError: If n is not between 0 and 999. 
- """ - if not 0 <= n <= 999: - raise ValueError("Number must be between 0 and 999.") - if n == 0: - return "zero" - - parts = [] - - # Handle hundreds - if n >= 100: - hundred_val = n // 100 * 100 - hundred_word_masc = _ORDINAL_HUNDREDS_MASC.get(hundred_val) - if hundred_word_masc: - parts.append(_swap_gender(hundred_word_masc, gender)) - n %= 100 - - # Handle tens and units - if n > 0: - # Ordinal numbers don't use 'e' as a separator - if n % 10 == 0 and n > 10: - tens_word_masc = _ORDINAL_TENS_MASC[n] - parts.append(_swap_gender(tens_word_masc, gender)) - elif n < 10: - units_word_masc = _ORDINAL_UNITS_MASC[n] - parts.append(_swap_gender(units_word_masc, gender)) - elif n < 20: - tens_word_masc = _ORDINAL_TENS_MASC[10] - units_word_masc = _ORDINAL_UNITS_MASC[n - 10] - parts.append(f"{_swap_gender(tens_word_masc, gender)} {_swap_gender(units_word_masc, gender)}") - else: - tens_word_masc = _ORDINAL_TENS_MASC[n // 10 * 10] - units_word_masc = _ORDINAL_UNITS_MASC[n % 10] - parts.append(f"{_swap_gender(tens_word_masc, gender)} {_swap_gender(units_word_masc, gender)}") - - return " ".join(parts) +PT_PT = RomanceNumberExtractor(_PT_PT) +PT_BR = RomanceNumberExtractor(_PT_BR) +################################################################## +# all methods below are deprecated and only for backwards compat +################################################################## def pronounce_ordinal_pt( number: Union[int, float], gender: GrammaticalGender = GrammaticalGender.MASCULINE, - scale: Scale = Scale.LONG, + scale: Optional[Scale] = None, variant: PortugueseVariant = PortugueseVariant.PT ) -> str: """ @@ -367,384 +385,126 @@ def pronounce_ordinal_pt( Raises: TypeError: If `number` is not an int or float. 
""" - if not isinstance(number, (int, float)): - raise TypeError("Number must be an int or float.") - if number == 0: - return "zero" - - if number < 0: - return f"menos {pronounce_ordinal_pt(abs(number), gender, scale, variant)}" - - n = int(number) - if n < 1000: - return _pronounce_ordinal_up_to_999(n, gender, variant) - - ordinal_scale_defs = _ORDINAL_SCALES_MASC[scale][variant] - - # Find the largest scale that fits the number - for scale_val, s_name in ordinal_scale_defs: - if n >= scale_val: - break - - count = n // scale_val - remainder = n % scale_val - - # Special case for "milésimo" and other large scales where 'um' is not needed - if count == 1 and scale_val >= 1000: - count_str = _swap_gender(s_name, gender) - else: - # Pronounce the 'count' part of the number and the scale word - count_pronunciation = pronounce_number_pt(count, scale=scale, variant=variant) - scale_word_masc = s_name - scale_word = _swap_gender(scale_word_masc, gender) - count_str = f"{count_pronunciation} {scale_word}" - - # If there's no remainder, we're done - if remainder == 0: - return count_str - - # Pronounce the remainder and join - remainder_str = pronounce_ordinal_pt(remainder, gender, scale, variant) - - return f"{count_str} {remainder_str}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + if variant == PortugueseVariant.PT: + scale = scale or PT_PT.vocab.DEFAULT_SCALE + return PT_PT.pronounce_ordinal(number, gender, scale) + scale = scale or PT_BR.vocab.DEFAULT_SCALE + return PT_BR.pronounce_ordinal(number, gender, scale) def is_fractional_pt( input_str: str ) -> Union[float, bool]: """ - Checks if the input string corresponds to a recognized Portuguese fractional word. - - Returns: - The fractional value as a float if recognized (e.g., 0.5 for "meio" or "meia"); otherwise, False. 
+ DEPRECATED """ - input_str = input_str.lower().strip() - fraction_map = _FRACTION_STRING_PT - - # Handle plural forms - if input_str.endswith('s') and input_str not in fraction_map.values(): - input_str = input_str[:-1] - - # Handle "meio" vs "meia" - if input_str == "meia": - input_str = "meio" - - # Use a dynamic lookup instead of a hardcoded list - for den, word in fraction_map.items(): - # Handle cases like "onze avos", so we check for the whole word - if input_str == word: - return 1.0 / den - - # Special case for "meia" as a female form of "meio" (1/2) - if input_str in ["meia", "meio"]: - return 0.5 - - return False + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return PT_PT.is_fractional(input_str) def is_ordinal_pt(input_str: str) -> bool: """ - Determine if a string is a Portuguese ordinal word. - - Returns: - bool: True if the input string is recognized as a Portuguese ordinal, otherwise False. + DEPRECATED """ - input_str = _swap_gender(input_str, GrammaticalGender.MASCULINE) - return input_str in _ORDINAL_WORDS_MASC + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + return PT_PT.is_ordinal(input_str) def extract_number_pt( text: str, ordinals: bool = False, - scale: Scale = Scale.LONG, + scale: Optional[Scale] = None, variant: PortugueseVariant = PortugueseVariant.PT ) -> Union[int, float, bool]: """ - Extracts a numeric value from a Portuguese text phrase, supporting cardinals, ordinals, fractions, and large scales. - - Parameters: - text (str): The input phrase potentially containing a number. - ordinals (bool): If True, recognizes ordinal words as numbers. - scale (Scale): Specifies whether to use the short or long numerical scale. - variant (PortugueseVariant): Specifies the Portuguese language variant (BR or PT). 
- - Returns: - int or float: The extracted number if found; otherwise, False. + DEPRECATED """ - numbers_map = get_number_map(scale, variant) - scales_map = _SCALES[scale][variant] - - clean_text = text.lower().replace('-', ' ') - tokens = [t for t in clean_text.split() if t != "e"] - - result = 0 - current_number = 0 - number_consumed = False - - for i, token in enumerate(tokens): - if token is None: - continue # consumed in previous idx - next_token = tokens[i+1] if i < len(tokens) - 1 else None - next_digit = numbers_map.get(next_token) if next_token else None - val = numbers_map.get(token) - if val is not None: - if next_digit and next_digit > val: - tokens[i+1] = None - current_number += val * next_digit - else: - current_number += val - elif ordinals and is_ordinal_pt(token): - token = _swap_gender(token, GrammaticalGender.MASCULINE) - current_number += _ORDINAL_WORDS_MASC[token] - elif is_fractional_pt(token): - fraction = is_fractional_pt(token) - result += current_number + fraction - current_number = 0 - number_consumed = True - else: - # Handle large scales like milhão, bilhão - found_scale = False - for scale_val, singular, plural in scales_map: - if token == singular or token == plural: - if current_number == 0: - current_number = 1 - result += current_number * scale_val - current_number = 0 - found_scale = True - number_consumed = True - break - if not found_scale: - if token in ["ponto", "virgula", "vírgula", ".", ","]: - decimal_str = ''.join( - str(numbers_map.get(t, '')) for t in tokens[i+1:] - if t in numbers_map - ) - if decimal_str: - result += current_number + float(f"0.{decimal_str}") - number_consumed = True - current_number = 0 - break - - if not number_consumed: - result += current_number - - return result if result > 0 else False - + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + if variant == PortugueseVariant.PT: + scale = scale or 
PT_PT.vocab.DEFAULT_SCALE + return PT_PT.extract_number(text, ordinals, scale) + scale = scale or PT_BR.vocab.DEFAULT_SCALE + return PT_BR.extract_number(text, ordinals, scale) def pronounce_number_pt( number: Union[int, float], places: int = 5, - scale: Scale = Scale.LONG, + scale: Optional[Scale] = None, variant: PortugueseVariant = PortugueseVariant.PT, ordinals: bool = False, digits: DigitPronunciation = DigitPronunciation.FULL_NUMBER, gender: GrammaticalGender = GrammaticalGender.MASCULINE ) -> str: """ - Return the full Portuguese pronunciation of a number, supporting cardinal and ordinal forms, decimals, large scales, grammatical gender, and both Brazilian and European Portuguese variants. - - Parameters: - number (int or float): The number to pronounce. - places (int): Number of decimal places to include for floats. - scale (Scale): Numerical scale to use (short or long). - variant (PortugueseVariant): Portuguese language variant for pronunciation. - ordinals (bool): If True, pronounce as an ordinal number. - gender (GrammaticalGender): Grammatical gender for ordinal numbers. - - Returns: - str: The number expressed as a Portuguese phrase. + DEPRECATED """ - if not isinstance(number, (int, float)): - raise TypeError("Number must be an int or float.") - - if ordinals: - return pronounce_ordinal_pt(number, gender, scale, variant) - - if number == 0: - return "zero" - - if number < 0: - return f"menos {pronounce_number_pt(abs(number), places, scale=scale, variant=variant, digits=digits, gender=gender)}" - - # Handle decimals - if "." 
in str(number): - integer_part = int(number) - decimal_part_str = f"{number:.{places}f}".split('.')[1].rstrip("0") - - # Handle cases where the decimal part rounds to zero - if decimal_part_str and int(decimal_part_str) == 0: - return pronounce_number_pt(integer_part, places, - scale=scale, variant=variant, - digits=digits, gender=gender) - - int_pronunciation = pronounce_number_pt(integer_part, places, - scale=scale, variant=variant, - digits=digits, gender=gender) - - decimal_pronunciation_parts = [] - # pronounce decimals either as a whole number or digit by digit - if decimal_part_str: - if digits == DigitPronunciation.FULL_NUMBER: - decimal_pronunciation_parts.append(_pronounce_up_to_999(int(decimal_part_str[:3]), variant, gender)) - else: - for digit in decimal_part_str: - decimal_pronunciation_parts.append(_pronounce_up_to_999(int(digit), variant, gender)) - - decimal_pronunciation = " ".join(decimal_pronunciation_parts) or "zero" - decimal_word = "vírgula" - return f"{int_pronunciation} {decimal_word} {decimal_pronunciation}" - - # --- Integer Pronunciation Logic --- - n = int(number) - - # Base case for recursion: numbers less than 1000 - if n < 1000: - return _pronounce_up_to_999(n, variant, gender) - - scale_definitions = _SCALES[scale][variant] - - # Find the largest scale that fits the number - for scale_val, s_name, p_name in scale_definitions: - if n >= scale_val: - break - - count = n // scale_val - remainder = n % scale_val - - # Pronounce the 'count' part of the number - scale_word = s_name if count == 1 else p_name - if count == 1 and scale_word == "mil": - count_str = scale_word - else: - count_pronunciation = pronounce_number_pt(count, places, scale, variant) - count_str = f"{count_pronunciation} {scale_word}" - - # If there's no remainder, we're done - if remainder == 0: - return count_str - - # Pronounce the remainder and join with the correct conjunction - remainder_str = pronounce_number_pt(remainder, places, scale, variant) - - # 
Conjunction logic: add "e" if the remainder is the last group and is - # less than 100 or a multiple of 100. - if remainder < 100 or (remainder < 1000 and remainder % 100 == 0): - return f"{count_str} e {remainder_str}" - else: - return f"{count_str} {remainder_str}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + if variant == PortugueseVariant.PT: + scale = scale or PT_PT.vocab.DEFAULT_SCALE + return PT_PT.pronounce_number(number, places, scale, ordinals, digits, gender) + scale = scale or PT_BR.vocab.DEFAULT_SCALE + return PT_BR.pronounce_number(number, places, scale, ordinals, digits, gender) def numbers_to_digits_pt( utterance: str, - scale: Scale = Scale.LONG, + scale: Optional[Scale] = None, variant: PortugueseVariant = PortugueseVariant.PT ) -> str: """ - Converts written Portuguese numbers in a text string to their digit equivalents, preserving all other text. - - Identifies spans of number words (including the joiner "e"), extracts their numeric values, and replaces them with digit strings. Non-number words and context are left unchanged. - - Parameters: - utterance (str): Input text possibly containing written Portuguese numbers. - scale (Scale, optional): Numerical scale (short or long) to interpret large numbers. Defaults to Scale.LONG. - variant (PortugueseVariant, optional): Portuguese language variant (BR or PT). Defaults to PortugueseVariant.PT. - - Returns: - str: The input text with written numbers replaced by their digit representations. 
- """ - words = tokenize(utterance) - output = [] - i = 0 - NUMBERS = get_number_map(scale, variant) - while i < len(words): - # Look for the start of a number span - if words[i] in NUMBERS: - # Start a new span - number_span_words = [] - j = i - # Continue the span as long as we find number words or the joiner 'e' - while j < len(words) and (words[j] in NUMBERS or words[j] == "e"): - number_span_words.append(words[j]) - j += 1 - - # Form the phrase from the span and extract the number value - phrase = " ".join(number_span_words) - number_val = extract_number_pt(phrase, variant=variant) - - if number_val is not False: - # If a valid number is found, add its digit representation to the output - output.append(str(number_val)) - # Advance the main index 'i' past the entire span - i = j - else: - # If the span doesn't form a valid number, treat the first word as non-numeric - # and move to the next word. This handles cases like "e" at the beginning of a sentence. - output.append(words[i]) - i += 1 - else: - # If the current word is not a number word, add it to the output - # and move to the next word - output.append(words[i]) - i += 1 - - return " ".join(output) - - -def tokenize(utterance: str) -> List[str]: + DEPRECATED """ - Splits a Portuguese text string into a list of tokens, separating words and punctuation. + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + if variant == PortugueseVariant.PT: + scale = scale or PT_PT.vocab.DEFAULT_SCALE + return PT_PT.numbers_to_digits(utterance, scale) + scale = scale or PT_BR.vocab.DEFAULT_SCALE + return PT_BR.numbers_to_digits(utterance, scale) - Returns: - A list of tokens, where each token is a word or punctuation mark from the input string. 
- """ - # Split things like 12% - utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) - # Split things like #1 - utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) - # Split things like amo-te, but preserve numbers like 1-2 - utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3", - utterance) - - tokens = utterance.split() - # Remove a trailing hyphen if it's the last token - if tokens and tokens[-1] == '-': - tokens = tokens[:-1] - - return tokens def pronounce_fraction_pt(word: str, - scale: Scale = Scale.LONG, + scale: Optional[Scale] = None, variant: PortugueseVariant = PortugueseVariant.PT) -> str: """ - Return the Portuguese pronunciation of a fraction given as a string (e.g., "1/2"). - - The numerator is pronounced as a cardinal number, and the denominator as an ordinal or fraction name, pluralized if appropriate. For denominators not in the known fraction list, the denominator is pronounced as a cardinal number followed by "avos" if plural. - - Parameters: - word (str): Fraction in the form "numerator/denominator" (e.g., "3/4"). - - Returns: - str: The Portuguese pronunciation of the fraction. + DEPRECATED """ - n1, n2 = word.split("/") - n1_int, n2_int = int(n1), int(n2) - - # Pronounce the denominator (second number) as an ordinal, and pluralize it if needed. - if n2_int in _FRACTION_STRING_PT: - denom = _FRACTION_STRING_PT[n2_int] - if n1_int != 1: - denom += "s" # plural - else: - # For other numbers - denom = pronounce_number_pt(n2_int, scale=scale, variant=variant) - if n1_int > 1: # plural - denom += " avos" - - # Pronounce the numerator (first number) as a cardinal. 
- num = pronounce_number_pt(n1_int, scale=scale, variant=variant) - return f"{num} {denom}" + warnings.warn( + "migrate to use RomanceNumberExtractor and NumberVocabulary directly instead", + DeprecationWarning, + stacklevel=2, + ) + if variant == PortugueseVariant.PT: + scale = scale or PT_PT.vocab.DEFAULT_SCALE + return PT_PT.pronounce_fraction(word, scale) + scale = scale or PT_BR.vocab.DEFAULT_SCALE + return PT_BR.pronounce_fraction(word, scale) if __name__ == "__main__": @@ -757,10 +517,12 @@ def pronounce_fraction_pt(word: str, print(f"1,000,000: {pronounce_number_pt(1_000_000, scale=Scale.LONG, variant=PortugueseVariant.PT)}") print(f"1,000,100: {pronounce_number_pt(1_000_100, scale=Scale.LONG, variant=PortugueseVariant.PT)}") print(f"1,000,000,000: {pronounce_number_pt(1_000_000_000, scale=Scale.LONG, variant=PortugueseVariant.PT)}") - print(f"1,000,000,000,000: {pronounce_number_pt(1_000_000_000_000, scale=Scale.LONG, variant=PortugueseVariant.PT)}") + print( + f"1,000,000,000,000: {pronounce_number_pt(1_000_000_000_000, scale=Scale.LONG, variant=PortugueseVariant.PT)}") print(f"2,500,000,000: {pronounce_number_pt(2_500_000_000, scale=Scale.LONG, variant=PortugueseVariant.PT)}") print(f"2,500,123,456: {pronounce_number_pt(2_500_123_456, scale=Scale.LONG, variant=PortugueseVariant.PT)}") print(f"16: {pronounce_number_pt(16, variant=PortugueseVariant.PT)}") + print(f"-16: {pronounce_number_pt(-16, variant=PortugueseVariant.PT)}") print("\n--- Testing Edge Cases ---") print(f"-123.45: {pronounce_number_pt(-123.45)}") @@ -778,7 +540,8 @@ def pronounce_fraction_pt(word: str, print(f"101st: {pronounce_number_pt(101, ordinals=True)}") print(f"1000th: {pronounce_number_pt(1000, ordinals=True)}") print(f"1,000,000th: {pronounce_number_pt(1_000_000, ordinals=True)}") - print(f"1,000,000,000,000th (PT, long): {pronounce_number_pt(1_000_000_000_000, ordinals=True, variant=PortugueseVariant.PT, scale=Scale.LONG)}") + print( + f"1,000,000,000,000th (PT, long): 
{pronounce_number_pt(1_000_000_000_000, ordinals=True, variant=PortugueseVariant.PT, scale=Scale.LONG)}") print("\n--- Testing numbers_to_digits_pt (BR) ---") print(f"'quinhentos e cinquenta' -> '{numbers_to_digits_pt('quinhentos e cinquenta')}'") @@ -787,7 +550,8 @@ def pronounce_fraction_pt(word: str, print(f"'há duzentos e cinquenta carros' -> '{numbers_to_digits_pt('há duzentos e cinquenta carros')}'") print("\n--- Testing numbers_to_digits_pt (PT) ---") - print(f"'quinhentos e cinquenta' -> '{numbers_to_digits_pt('quinhentos e cinquenta', variant=PortugueseVariant.PT)}'") + print( + f"'quinhentos e cinquenta' -> '{numbers_to_digits_pt('quinhentos e cinquenta', variant=PortugueseVariant.PT)}'") print(f"'um milhão' -> '{numbers_to_digits_pt('um milhão', variant=PortugueseVariant.PT)}'") print(f"'dezasseis' -> '{numbers_to_digits_pt('dezasseis', variant=PortugueseVariant.PT)}'") @@ -808,7 +572,6 @@ def pronounce_fraction_pt(word: str, print(f"'mil e vinte e três' -> {extract_number_pt('mil e vinte e três')}") print(f"'trinta e cinco vírgula quatro' -> {extract_number_pt('trinta e cinco vírgula quatro')}") - print("\n--- Testing Fractions ---") print(f"1/2: {pronounce_fraction_pt('1/2')}") print(f"2/2: {pronounce_fraction_pt('2/2')}") @@ -817,4 +580,5 @@ def pronounce_fraction_pt(word: str, print(f"5/4: {pronounce_fraction_pt('5/4')}") print(f"7/5: {pronounce_fraction_pt('7/5')}") print(f"0/20: {pronounce_fraction_pt('0/20')}") - + print(f"1/0: {pronounce_fraction_pt('1/0')}") + print(f"0/0: {pronounce_fraction_pt('0/0')}") diff --git a/ovos_number_parser/util.py b/ovos_number_parser/util.py index 8d8eab9..1bfb6c5 100644 --- a/ovos_number_parser/util.py +++ b/ovos_number_parser/util.py @@ -1,12 +1,38 @@ -from collections import namedtuple +from dataclasses import dataclass from enum import Enum -from quebra_frases import word_tokenize +from typing import List, Dict, Union, Any, Tuple, Optional, Callable +import re -# Token is intended to be used in the number 
processing functions in -# this module. The parsing requires slicing and dividing of the original -# text. To ensure things parse correctly, we need to know where text came -# from in the original input, hence this nametuple. -Token = namedtuple('Token', 'word index') + +@dataclass +class Token: + word: str + index: int + + def __iter__(self): + yield self.word + yield self.index + + def __getitem__(self, item): + if item == 0: + return self.word + elif item == 1: + return self.index + raise IndexError + + def __setattr__(self, key, value): + """ + Prevent modification of existing attributes, allowing only new attributes to be set. + + Raises: + Exception: If attempting to modify an attribute that already exists. + """ + try: + getattr(self, key) + except AttributeError: + super().__setattr__(key, value) + else: + raise AttributeError("Immutable!") class Scale(str, Enum): @@ -33,6 +59,7 @@ class DigitPronunciation(str, Enum): FULL_NUMBER = "number" +@dataclass class ReplaceableNumber: """ Similar to Token, this class is used in number parsing. @@ -42,24 +69,22 @@ class ReplaceableNumber: In other words, it is the text, and the number that can replace it in the string. """ + value: Union[int, float] + tokens: List[Token] - def __init__(self, value, tokens: [Token]): - self.value = value - self.tokens = tokens - - def __bool__(self): + def __bool__(self) -> bool: return bool(self.value is not None and self.value is not False) @property - def start_index(self): + def start_index(self) -> int: return self.tokens[0].index @property - def end_index(self): + def end_index(self) -> int: return self.tokens[-1].index @property - def text(self): + def text(self) -> str: """ Return the concatenated text represented by the tokens, separated by spaces. """ @@ -87,7 +112,30 @@ def __repr__(self): t=self.tokens) -def tokenize(text): +def word_tokenize(utterance: str) -> List[str]: + """ + Splits a Portuguese text string into a list of tokens, separating words and punctuation. 
+ + Returns: + A list of tokens, where each token is a word or punctuation mark from the input string. + """ + # Split things like 12% + utterance = re.sub(r"([0-9]+)([\%])", r"\1 \2", utterance) + # Split things like #1 + utterance = re.sub(r"(\#)([0-9]+\b)", r"\1 \2", utterance) + # Split things like amo-te, but preserve numbers like 1-2 + utterance = re.sub(r"([a-zA-Z]+)(-)([a-zA-Z]+\b)", r"\1 \2 \3", + utterance) + + tokens = utterance.split() + # Remove a trailing hyphen if it's the last token + if tokens and tokens[-1] == '-': + tokens = tokens[:-1] + + return tokens + + +def tokenize(text: str) -> List[Token]: """ Generate a list of token object, given a string. Args: @@ -97,11 +145,10 @@ def tokenize(text): [Token] """ - return [Token(word, index) - for index, word in enumerate(word_tokenize(text))] + return [Token(word, index) for index, word in enumerate(word_tokenize(text))] -def partition_list(items, split_on): +def partition_list(items: List[Any], split_on: Any) -> List[List[Any]]: """ Partition a list of items. @@ -131,7 +178,7 @@ def partition_list(items, split_on): return list(filter(lambda x: len(x) != 0, splits)) -def invert_dict(original): +def invert_dict(original: Dict[Any, Any]) -> Dict[Any, Any]: """ Produce a dictionary with the keys and values inverted, relative to the dict passed in. @@ -146,7 +193,7 @@ def invert_dict(original): return {value: key for key, value in original.items()} -def is_numeric(input_str): +def is_numeric(input_str: str) -> bool: """ Return True if the input string represents a valid number, otherwise False. @@ -163,7 +210,7 @@ def is_numeric(input_str): return False -def look_for_fractions(split_list): +def look_for_fractions(split_list: List[str]) -> bool: """" This function takes a list made by fraction & determines if a fraction. 
@@ -181,7 +228,7 @@ def look_for_fractions(split_list): return False -def convert_to_mixed_fraction(number, denominators=range(1, 21)): +def convert_to_mixed_fraction(number: Union[int, float], denominators=range(1, 21)) -> Optional[Tuple[int, int, int]]: """ Convert floats to components of a mixed fraction representation @@ -211,3 +258,622 @@ def convert_to_mixed_fraction(number, denominators=range(1, 21)): return None return int_number, int(round(numerator)), denominator + + +@dataclass +class NumberVocabulary: + LANG: str + DEFAULT_SCALE: Scale + + # first entry also used for pronouncing, others only for extraction + JOIN_WORD: List[str] + JOINER_ON_TWENTYS: bool # add JOIN_WORD from 20-30 - "vinte e um" + JOINER_ON_HUNDREDS: bool # add JOIN_WORD from 100-1000 - "duzentos e um" + JOINER_ON_THOUSANDS: bool # add JOIN_WORD from 1000-10000 - "mil e duzentos" + DECIMAL_MARKER: List[str] + NEGATIVE_SIGN: List[str] + HUNDRED_PARTICLE: str # how to read "1XX" + DENOMINATOR_PARTICLE: str # for fractions X / N {PARTICLE} + DIVIDED_BY_ZERO: str # how to read X/0 values + + UNITS: Dict[int, str] # 0-10 range + TENS: Dict[int, str] # 10-100 range + HUNDREDS: Dict[int, str] # 100-1000 range + FRACTION: Dict[int, str] # spellings for 1/x + SHORT_SCALE: Dict[int, str] # 1000+ + LONG_SCALE: Dict[int, str] # 1000+ + + FRACTION_FEMALE: Dict[int, str] # spellings for 1/x feminine grammatical gender + + ALT_SPELLINGS: Dict[str, int] # special cases eg. 
archaisms/colloquialisms + + # mappings default to neutral, use swap_gender/self.GENDERED_SPELLINGS if needed + GENDERED_SPELLINGS: Dict[GrammaticalGender, Dict[int, str]] # grammatical gender + DIGIT_SPELLINGS: Dict[int, str] # for numbers that should be pronounced differently when reading digit-by-digit + + ORDINAL_UNITS: Dict[int, str] + ORDINAL_TENS: Dict[int, str] + ORDINAL_HUNDREDS: Dict[int, str] + ORDINAL_SHORT_SCALE: Dict[int, str] + ORDINAL_LONG_SCALE: Dict[int, str] + + NUMBER_OVERFLOW: str # "número exageradamente grande" + NO_PREV_UNIT: List[int] # cases where "1" should not be spelled before the unit. eg. "one hundred" vs "hundred" if 100 in lit + NO_PLURAL: List[int] + + swap_gender: Callable[[str, GrammaticalGender], str] = lambda word, gender: word + pluralize: Callable[[str], str] = lambda word: word + singularize: Callable[[str], str] = lambda word: word + + def get_number_strings(self, scale: Optional[Scale] = None) -> Dict[str, int]: + scale = scale or self.DEFAULT_SCALE + SCALES = self.SHORT_SCALE if scale == Scale.SHORT else self.LONG_SCALE + male = { + **self.ALT_SPELLINGS, + **{v: k for k, v in self.UNITS.items()}, + **{v: k for k, v in self.TENS.items()}, + **{v: k for k, v in self.HUNDREDS.items()}, + **{v: k for k, v in SCALES.items()}, + **{v: k for k, v in self.GENDERED_SPELLINGS.get(GrammaticalGender.MASCULINE, {}).items()}, + } + + female = { + **{self.swap_gender(k, GrammaticalGender.FEMININE): v for k, v in male.items()}, + **{v: k for k, v in self.GENDERED_SPELLINGS.get(GrammaticalGender.FEMININE, {}).items()}, + } + + plural = { + **{self.pluralize(k): v for k, v in male.items()}, + **{self.pluralize(k): v for k, v in female.items()} + } + return { + **male, + **female, + **dict({self.HUNDRED_PARTICLE: 100} if self.HUNDRED_PARTICLE else {}), + **plural + } + + def get_ordinal_strings(self, scale: Optional[Scale] = None) -> Dict[str, int]: + scale = scale or self.DEFAULT_SCALE + SCALES = self.ORDINAL_SHORT_SCALE if scale == 
Scale.SHORT else self.ORDINAL_LONG_SCALE + male = { + **{v: k for k, v in self.ORDINAL_UNITS.items()}, + **{v: k for k, v in self.ORDINAL_TENS.items()}, + **{v: k for k, v in self.ORDINAL_HUNDREDS.items()}, + **{v: k for k, v in SCALES.items()} + } + female = { + **{self.swap_gender(k, GrammaticalGender.FEMININE): v for k, v in male.items()}, + } + plural = { + **{self.pluralize(k): v for k, v in male.items()}, + **{self.pluralize(k): v for k, v in female.items()} + } + return { + **male, + **female, + **plural + } + + def get_fraction_strings(self) -> Dict[str, int]: + male = {v: k for k, v in self.FRACTION.items()} + female = {v: k for k, v in self.FRACTION_FEMALE.items()} + plural = {self.pluralize(k): v for k, v in male.items()} + return { + **male, + **female, + **plural + } + + +class RomanceNumberExtractor: + """vocabulary based number parser that should work for most romance-like languages""" + + def __init__(self, vocab: NumberVocabulary): + self.vocab = vocab + + def is_ordinal(self, input_str: str, scale: Optional[Scale] = None) -> bool: + """ + Determine if a string is a Portuguese ordinal word. + + Returns: + bool: True if the input string is recognized as a Portuguese ordinal, otherwise False. + """ + scale = scale or self.vocab.DEFAULT_SCALE + ordinals_map = self.vocab.get_ordinal_strings(scale) + return input_str in ordinals_map + + def is_fractional(self, input_str: str) -> Union[float, bool]: + """ + Checks if the input string corresponds to a recognized Portuguese fractional word. + + Returns: + The fractional value as a float if recognized; otherwise, False. 
+ """ + fractions_map = self.vocab.get_fraction_strings() + input_str = input_str.lower().strip() + + # handle fraction denominator strings + for word, den in fractions_map.items(): + if input_str == word: + return 1.0 / den + + return False + + def extract_number(self, + text: str, + ordinals: bool = False, + scale: Scale = None, + ) -> Union[int, float, bool]: + """ + Extracts a numeric value from a text phrase, supporting cardinals, ordinals, fractions, and large scales. + + Parameters: + text (str): The input phrase potentially containing a number. + ordinals (bool): If True, recognizes ordinal words as numbers. + scale (Scale): Specifies whether to use the short or long numerical scale. + + Returns: + int or float: The extracted number if found; otherwise, False. + """ + scale = scale or self.vocab.DEFAULT_SCALE + numbers_map = self.vocab.get_number_strings(scale) + ordinals_map = self.vocab.get_ordinal_strings(scale) + scales_map = self.vocab.SHORT_SCALE if scale == Scale.SHORT else self.vocab.LONG_SCALE + + # normalize and tokenize + clean_text = text.lower().replace('-', ' ') + tokens = [t for t in clean_text.split() if t not in self.vocab.JOIN_WORD] + + result = None + current_number = None + number_consumed = False + is_negative = False + + for i, token in enumerate(tokens): + if token is None: + continue # consumed in previous idx + prev_token = tokens[i - 1] if i > 0 else None + next_token = tokens[i + 1] if i < len(tokens) - 1 else None + next_digit = numbers_map.get(next_token) if next_token else None + val = numbers_map.get(token) + if val is not None: + current_number = current_number or 0 + result = result or 0 + if prev_token in self.vocab.NEGATIVE_SIGN: + is_negative = True + if next_digit and next_digit > abs(val): + tokens[i + 1] = None + current_number += val * next_digit + else: + current_number += val + elif ordinals and self.is_ordinal(token): + # token = self.vocab.swap_gender(token, GrammaticalGender.MASCULINE) + current_number = 
current_number or 0 + result = result or 0 + current_number += ordinals_map[token] + elif self.is_fractional(token): + current_number = current_number or 0 + result = result or 0 + fraction = self.is_fractional(token) + result += current_number + fraction + current_number = None + number_consumed = True + else: + # Handle large scales like milhão, bilhão + found_scale = False + for scale_val, w in scales_map.items(): + if token == w or token == self.vocab.pluralize(w): + current_number = current_number or 0 + result = result or 0 + if current_number is None: + current_number = 1 + result += current_number * scale_val + current_number = None + found_scale = True + number_consumed = True + break + + # Handle decimal numbers + if not found_scale: + if token in self.vocab.DECIMAL_MARKER: + current_number = current_number or 0 + result = result or 0 + decimal_str = ''.join( + str(numbers_map.get(t, '')) for t in tokens[i + 1:] + if t in numbers_map + ) + if decimal_str: + result += current_number + float(f"0.{decimal_str}") + number_consumed = True + current_number = None + break + + if not number_consumed and current_number: + result = result or 0 + result += current_number + + if result and is_negative: + result = -result + return result if result is not None else False + + def numbers_to_digits(self, + utterance: str, + scale: Optional[Scale] = None + ) -> str: + """ + Converts written numbers in a text string to their digit equivalents, preserving all other text. + + Identifies spans of number words (including the joiner "e"), extracts their numeric values, and replaces them with digit strings. Non-number words and context are left unchanged. + + Parameters: + utterance (str): Input text possibly containing written numbers. + scale (Scale, optional): Numerical scale (short or long) to interpret large numbers. Defaults to Scale.LONG. + + Returns: + str: The input text with written numbers replaced by their digit representations. 
+ """ + scale = scale or self.vocab.DEFAULT_SCALE + words = word_tokenize(utterance) + output = [] + i = 0 + + numbers_map = self.vocab.get_number_strings(scale) + + while i < len(words): + # Look for the start of a number span + if words[i] in numbers_map: + # Start a new span + number_span_words = [] + j = i + # Continue the span as long as we find number words or the joiner 'e' + while j < len(words) and (words[j] in numbers_map or words[j] in self.vocab.JOIN_WORD): + number_span_words.append(words[j]) + j += 1 + + # Form the phrase from the span and extract the number value + phrase = " ".join(number_span_words) + number_val = self.extract_number(phrase) + + if number_val is not False: + # If a valid number is found, add its digit representation to the output + output.append(str(number_val)) + # Advance the main index 'i' past the entire span + i = j + else: + # If the span doesn't form a valid number, treat the first word as non-numeric + # and move to the next word. This handles cases like "e" at the beginning of a sentence. + output.append(words[i]) + i += 1 + else: + # If the current word is not a number word, add it to the output + # and move to the next word + output.append(words[i]) + i += 1 + + return " ".join(output) + + def pronounce_number(self, + number: Union[int, float], + places: int = 5, + scale: Optional[Scale] = None, + ordinals: bool = False, + digits: DigitPronunciation = DigitPronunciation.FULL_NUMBER, + gender: GrammaticalGender = GrammaticalGender.MASCULINE + ) -> str: + """ + Return the full pronunciation of a number, supporting cardinal and ordinal forms, decimals, large scales, grammatical gender, and both Brazilian and European Portuguese variants. + + Parameters: + number (int or float): The number to pronounce. + places (int): Number of decimal places to include for floats. + scale (Scale): Numerical scale to use (short or long). + ordinals (bool): If True, pronounce as an ordinal number. 
+ gender (GrammaticalGender): Grammatical gender for ordinal numbers. + + Returns: + str: The number expressed as a phrase. + """ + scale = scale or self.vocab.DEFAULT_SCALE + if not isinstance(number, (int, float)): + raise TypeError("Number must be an int or float.") + + if ordinals: + return self.pronounce_ordinal(number, gender, scale) + + if number < 0: + return f"{self.vocab.NEGATIVE_SIGN[0]} {self.pronounce_number(abs(number), places, scale=scale, digits=digits, gender=gender)}" + + # Handle decimals + if "." in str(number): + integer_part = int(number) + decimal_part_str = str(number).split('.')[1].rstrip("0") + + # Handle cases where the decimal part rounds to zero + if decimal_part_str and int(decimal_part_str) == 0: + return self.pronounce_number(integer_part, places, + scale=scale, + digits=digits, gender=gender) + + int_pronunciation = self.pronounce_number(integer_part, places, + scale=scale, + digits=digits, gender=gender) + + decimal_pronunciation_parts = [] + # pronounce decimals either as a whole number or digit by digit + if decimal_part_str: + if digits == DigitPronunciation.FULL_NUMBER: + + # handle leading zeros + no_z = decimal_part_str.lstrip("0") # without zeros + n_zeros = len(decimal_part_str) - len(no_z) + for i in range(n_zeros): + decimal_pronunciation_parts.append(self.vocab.UNITS[0]) + + if n_zeros >= places: + # read all zeros + last_digit = int(decimal_part_str[n_zeros]) + after_last_digit = int(decimal_part_str[n_zeros + 1]) if len(decimal_part_str) > n_zeros + 1 else 0 + # round up last digit if needed + if after_last_digit >= 5: + last_digit += 1 + decimal_pronunciation_parts.append(self._pronounce_up_to_999(last_digit, gender=gender)) + else: + if len(decimal_part_str) > places: + last_digit = int(decimal_part_str[places - 1]) + after_last_digit = int(decimal_part_str[places]) + # round up last digit if needed + if after_last_digit >= 5: + last_digit += 1 + decimal_part_str = decimal_part_str[:places - 1] + str(last_digit) + 
decimal_pronunciation_parts.append(self.pronounce_number(int(decimal_part_str), gender=gender)) + else: + for digit in decimal_part_str: + decimal_pronunciation_parts.append(self.pronounce_number(int(digit), gender=gender)) + + decimal_pronunciation = " ".join(decimal_pronunciation_parts) or self.vocab.UNITS[0] + decimal_word = self.vocab.DECIMAL_MARKER[0] + return f"{int_pronunciation} {decimal_word} {decimal_pronunciation}" + + # --- Integer Pronunciation Logic --- + n = int(number) + + # Base case for recursion: numbers less than 1000 + if n < 1000: + return self._pronounce_up_to_999(n, gender) + + scales_map = self.vocab.SHORT_SCALE if scale == Scale.SHORT else self.vocab.LONG_SCALE + + # Find the smallest scale that fits the number + scale_candidates = [(scale_val, scale_str) + for scale_val, scale_str in scales_map.items() + if n >= scale_val] + scale_val: Tuple[int, str] = scale_candidates[0] + + count = n // scale_val[0] + remainder = n % scale_val[0] + + # Pronounce the 'count' part of the number + is_singular = count == 1 or scale_val[0] in self.vocab.NO_PLURAL + scale_word = scale_val[1] if is_singular else self.vocab.pluralize(scale_val[1]) + if count == 1 and scale_val[0] in self.vocab.NO_PREV_UNIT: + count_str = scale_word + else: + count_pronunciation = self.pronounce_number(count, places, scale) + count_str = f"{count_pronunciation} {scale_word}" + + # If there's no remainder, we're done + if remainder == 0: + return count_str + + # Pronounce the remainder and join with the correct conjunction + remainder_str = self.pronounce_number(remainder, places, scale) + # Conjunction logic: add JOIN_WORD if the remainder is the last group and is + # less than 100 or a multiple of 100. 
+ join_word = self.vocab.JOIN_WORD[0] if len(self.vocab.JOIN_WORD) else "" + if remainder < 100 or (remainder < 1000 and remainder % 100 == 0): + return f"{count_str} {join_word} {remainder_str}" + else: + return f"{count_str} {remainder_str}" + + def pronounce_fraction(self, + word: str, + scale: Optional[Scale] = None) -> str: + """ + Return the pronunciation of a fraction given as a string (e.g., "1/2"). + + The numerator is pronounced as a cardinal number, and the denominator as an ordinal or fraction name, pluralized if appropriate. For denominators not in the known fraction list, the denominator is pronounced as a cardinal number followed by "avos" if plural. + + Parameters: + word (str): Fraction in the form "numerator/denominator" (e.g., "3/4"). + + Returns: + str: The pronunciation of the fraction. + """ + scale = scale or self.vocab.DEFAULT_SCALE + n1, n2 = word.split("/") + n1_int, n2_int = int(n1), int(n2) + + if n2_int == 0: + denom = self.vocab.DIVIDED_BY_ZERO + + # Pronounce the denominator (second number) as an ordinal, and pluralize it if needed. + elif n2_int in self.vocab.FRACTION: + denom = self.vocab.FRACTION[n2_int] + if n1_int != 1: + denom = self.vocab.pluralize(denom) # plural + else: + # For other numbers + denom = self.pronounce_number(n2_int, scale=scale) + if n1_int > 1: # plural + denom = f"{denom} {self.vocab.DENOMINATOR_PARTICLE}" + + + # Pronounce the numerator (first number) as a cardinal. + num = self.pronounce_number(n1_int, scale=scale) + return f"{num} {denom}" + + def pronounce_ordinal(self, + number: Union[int, float], + gender: GrammaticalGender = GrammaticalGender.MASCULINE, + scale: Optional[Scale] = None + ) -> str: + """ + Return the ordinal pronunciation of a number, supporting grammatical gender, scale (short or long), and language variant (Brazilian or European Portuguese). + + Parameters: + number (int or float): The number to pronounce as an ordinal. 
+ gender (GrammaticalGender, optional): The grammatical gender for the ordinal form (masculine or feminine). + scale (Scale, optional): The numerical scale to use (short or long). + + Returns: + str: The ordinal pronunciation of the number. + + Raises: + TypeError: If `number` is not an int or float. + """ + scale = scale or self.vocab.DEFAULT_SCALE + if not isinstance(number, (int, float)): + raise TypeError("Number must be an int or float.") + if number == 0: + return self.vocab.UNITS[0] + + if number < 0: + return f"{self.vocab.NEGATIVE_SIGN[0]} {self.pronounce_ordinal(abs(number), gender, scale)}" + + n = int(number) + if n < 1000: + return self._pronounce_ordinal_up_to_999(n, gender) + + scales_map = self.vocab.ORDINAL_SHORT_SCALE if scale == Scale.SHORT else self.vocab.ORDINAL_LONG_SCALE + + # Find the smallest scale that fits the number + scale_candidates = [(scale_val, scale_str) + for scale_val, scale_str in scales_map.items() + if n >= scale_val] + scale_val: Tuple[int, str] = scale_candidates[0] + + count = n // scale_val[0] + remainder = n % scale_val[0] + + # Special case for "milésimo" and other large scales where 'um' is not needed + if count == 1: + count_str = self.vocab.swap_gender(scale_val[1], gender) + else: + # Pronounce the 'count' part of the number and the scale word + count_pronunciation = self.pronounce_number(count, scale=scale) + scale_word = self.vocab.swap_gender(scale_val[1], gender) + count_str = f"{count_pronunciation} {scale_word}" + + # If there's no remainder, we're done + if remainder == 0: + return count_str + + # Pronounce the remainder and join + remainder_str = self.pronounce_ordinal(remainder, gender, scale) + + return f"{count_str} {remainder_str}" + + def _pronounce_up_to_999(self, + n: int, + gender: GrammaticalGender = GrammaticalGender.MASCULINE + ) -> str: + """ + Returns the cardinal pronunciation of an integer from 0 to 999, using the specified language variant. 
+ + Parameters: + n (int): Integer to pronounce (must be between 0 and 999). + + Returns: + str: The pronounced number. + + Raises: + ValueError: If n is not in the range 0 to 999. + """ + if not 0 <= n <= 999: + raise ValueError("Number must be between 0 and 999.") + if n in self.vocab.GENDERED_SPELLINGS.get(gender, {}): + return self.vocab.GENDERED_SPELLINGS[gender][n] + if n in self.vocab.UNITS: + return self.vocab.UNITS[n] + if n in self.vocab.TENS: + return self.vocab.TENS[n] + if n in self.vocab.HUNDREDS: + return self.vocab.HUNDREDS[n] + + parts = [] + tens_map = self.vocab.TENS + + # Hundreds + if n >= 100: + hundred = n // 100 * 100 + parts.append(self.vocab.HUNDRED_PARTICLE if hundred == 100 else self.vocab.HUNDREDS[hundred]) + n %= 100 + if n > 0 and self.vocab.JOIN_WORD and self.vocab.JOINER_ON_HUNDREDS: + parts.append(self.vocab.JOIN_WORD[0]) + + # Tens and Units + if n > 0: + if n < 20: + parts.append(tens_map.get(n) or self.vocab.UNITS.get(n, "")) + else: + ten = n // 10 * 10 + unit = n % 10 + parts.append(tens_map[ten]) + if unit > 0: + if self.vocab.JOINER_ON_TWENTYS and self.vocab.JOIN_WORD: + parts.append(self.vocab.JOIN_WORD[0]) + parts.append(self.vocab.UNITS[unit]) + + return self.vocab.swap_gender(" ".join(parts), gender) + + def _pronounce_ordinal_up_to_999(self, + n: int, + gender: GrammaticalGender = GrammaticalGender.MASCULINE + ) -> str: + """ + Returns the ordinal word for an integer between 0 and 999, adjusting for grammatical gender and language variant. + + Parameters: + n (int): The integer to convert (must be between 0 and 999). + + Returns: + str: The ordinal representation of the number. + + Raises: + ValueError: If n is not between 0 and 999. 
+ """ + if not 0 <= n <= 999: + raise ValueError("Number must be between 0 and 999.") + if n == 0: + return self.vocab.UNITS[0] + + parts = [] + + # Handle hundreds + if n >= 100: + hundred_val = n // 100 * 100 + hundred_word_masc = self.vocab.ORDINAL_HUNDREDS.get(hundred_val) + if hundred_word_masc: + parts.append(self.vocab.swap_gender(hundred_word_masc, gender)) + n %= 100 + + # Handle tens and units + if n > 0: + # Ordinal numbers don't use 'e' as a separator + if n % 10 == 0: + tens_word_masc = self.vocab.ORDINAL_TENS[n] + parts.append(self.vocab.swap_gender(tens_word_masc, gender)) + elif n < 10: + units_word_masc = self.vocab.ORDINAL_UNITS[n] + parts.append(self.vocab.swap_gender(units_word_masc, gender)) + elif n < 20: + tens_word_masc = self.vocab.ORDINAL_TENS[10] + units_word_masc = self.vocab.ORDINAL_UNITS[n - 10] + parts.append(f"{self.vocab.swap_gender(tens_word_masc, gender)} {self.vocab.swap_gender(units_word_masc, gender)}") + else: + tens_word_masc = self.vocab.ORDINAL_TENS[n // 10 * 10] + units_word_masc = self.vocab.ORDINAL_UNITS[n % 10] + parts.append(f"{self.vocab.swap_gender(tens_word_masc, gender)} {self.vocab.swap_gender(units_word_masc, gender)}") + + return self.vocab.swap_gender(" ".join(parts), gender) diff --git a/tests/test_number_parser_ast.py b/tests/test_number_parser_ast.py new file mode 100644 index 0000000..6c711c6 --- /dev/null +++ b/tests/test_number_parser_ast.py @@ -0,0 +1,203 @@ +import unittest + +from ovos_number_parser.util import GrammaticalGender +from ovos_number_parser.numbers_ast import AST + +# ============================================================ +# Dictionaries +# ============================================================ + +class TestDictionaries(unittest.TestCase): + + def test_units_completeness(self): + self.assertEqual(set(AST.vocab.UNITS.keys()), set(range(0, 10))) + + def test_tens_contains_expected(self): + expected = list(range(10, 20)) + list(range(20, 100, 10)) + for i in expected: + 
self.assertIn(i, set(AST.vocab.TENS.keys())) + + def test_composite_20s(self): + # 21–29 must exist + for i in range(21, 30): + self.assertIn(i, AST.vocab.TENS) + self.assertEqual(AST.vocab.TENS[21], "ventiún") + + def test_hundreds(self): + self.assertEqual(set(AST.vocab.HUNDREDS.keys()), set(range(100, 1000, 100))) + + def test_fraction_strings(self): + self.assertIn(2, AST.vocab.FRACTION) + self.assertIn(10, AST.vocab.FRACTION) + self.assertEqual(AST.vocab.FRACTION[2], "mediu") + + +# ============================================================ +# Pronounce up to 999 (through AST.pronounce_number) +# ============================================================ + +class TestPronounceUpTo999_AST(unittest.TestCase): + + def test_zero(self): + self.assertEqual(AST.pronounce_number(0), "cero") + + def test_units(self): + self.assertEqual(AST.pronounce_number(1), "un") + self.assertEqual(AST.pronounce_number(5), "cinco") + + def test_teens(self): + self.assertEqual(AST.pronounce_number(16), "dieciséis") + self.assertEqual(AST.pronounce_number(18), "dieciocho") + + def test_composite_20s(self): + self.assertEqual(AST.pronounce_number(21), "ventiún") + self.assertEqual(AST.pronounce_number(29), "ventinueve") + + def test_tens_and_units(self): + self.assertEqual(AST.pronounce_number(35), "trenta y cinco") + self.assertEqual(AST.pronounce_number(47), "cuarenta y siete") + + def test_hundreds_exact(self): + self.assertEqual(AST.pronounce_number(100), "cien") + self.assertEqual(AST.pronounce_number(500), "quinientos") + + def test_hundreds_with_remainder(self): + self.assertEqual(AST.pronounce_number(101), "ciento un") + self.assertEqual(AST.pronounce_number(234), "doscientos trenta y cuatro") + + +# ============================================================ +# Fractions +# ============================================================ + +class TestFractionsAst(unittest.TestCase): + + def test_basic_fraction_words(self): + self.assertAlmostEqual(AST.is_fractional("mediu"), 
0.5) + self.assertAlmostEqual(AST.is_fractional("terciu"), 1/3) + + def test_fraction_plurals(self): + self.assertAlmostEqual(AST.is_fractional("medios"), 0.5) + self.assertAlmostEqual(AST.is_fractional("cuartos"), 1/4) + + def test_fraction_pronounce(self): + result = AST.pronounce_fraction("1/4") + self.assertIn("un", result) + self.assertIn("cuartu", result) + result2 = AST.pronounce_fraction("3/4") + self.assertIn("tres", result2) + self.assertIn("cuartos", result2) + + +# ============================================================ +# Extraction +# ============================================================ + +class TestExtractAST(unittest.TestCase): + + def test_simple(self): + self.assertEqual(AST.extract_number("dieciséis"), 16) + self.assertEqual(AST.extract_number("ventiuno"), 21) + + def test_hundreds(self): + self.assertEqual(AST.extract_number("ciento un"), 101) + self.assertEqual(AST.extract_number("doscientos trenta y cuatro"), 234) + + def test_scale_thousand(self): + self.assertEqual(AST.extract_number("mil"), 1000) + self.assertEqual(AST.extract_number("mil doscientos trenta y cuatro"), 1234) + + def test_million(self): + self.assertEqual(AST.extract_number("un millón"), 1_000_000) + + def test_million_plus(self): + self.assertEqual(AST.extract_number("dos millones trescientos"), 2_000_300) + + def test_fraction_phrase(self): + self.assertAlmostEqual(AST.extract_number("tres cuartos"), 0.75) + + def test_negative(self): + self.assertEqual(AST.extract_number("menos cinco"), -5) + + def test_no_number(self): + self.assertFalse(AST.extract_number("nada aquí")) + self.assertFalse(AST.extract_number("palabres")) + + +# ============================================================ +# Ordinals +# ============================================================ + +class TestOrdinalsAST(unittest.TestCase): + + def test_basic_ordinals(self): + self.assertEqual(AST.pronounce_ordinal(1), "primeru") + self.assertEqual(AST.pronounce_ordinal(1, 
gender=GrammaticalGender.FEMININE), "primera") + + def test_teens_ordinal(self): + self.assertIn("décimu", AST.pronounce_ordinal(12)) + + def test_tens_ordinal(self): + self.assertEqual(AST.pronounce_ordinal(20), "ventenu") + + def test_composed_ordinal(self): + self.assertEqual(AST.pronounce_ordinal(23), "ventenu terceru") + + def test_hundredth(self): + self.assertEqual(AST.pronounce_ordinal(100), "centésimu") + + +# ============================================================ +# Numbers to digits +# ============================================================ + +class TestNumbersToDigitsAST(unittest.TestCase): + + def test_simple(self): + self.assertEqual(AST.numbers_to_digits("dieciséis"), "16") + self.assertEqual(AST.numbers_to_digits("ventiuno"), "21") + + def test_phrase(self): + result = AST.numbers_to_digits("hai dos millones cincuenta persones") + self.assertIn("2000050", result) + + def test_mixed(self): + result = AST.numbers_to_digits("merquei ventiuno panes") + self.assertEqual(result, "merquei 21 panes") + + def test_no_number(self): + original = "nada equí" + self.assertEqual(AST.numbers_to_digits(original), original) + + +# ============================================================ +# Integration tests +# ============================================================ + +class TestIntegrationAST(unittest.TestCase): + + def test_round_trip(self): + nums = [1, 16, 21, 35, 100, 234 , 1000, 1234] + for n in nums: + text = AST.pronounce_number(n) + extracted = AST.extract_number(text) + self.assertEqual(extracted, n) + + def test_large_numbers(self): + text = AST.pronounce_number(1_234_567) + extracted = AST.extract_number(text) + self.assertEqual(extracted, 1_234_567) + + def test_decimals(self): + res = AST.pronounce_number(1.234) + self.assertIn("punto", res) + self.assertIn("un", res) + + def test_negative(self): + text = AST.pronounce_number(-234) + self.assertTrue(text.startswith("menos")) + self.assertEqual(AST.extract_number(text), -234) + + +if 
__name__ == "__main__": + unittest.main() diff --git a/tests/test_number_parser_gl.py b/tests/test_number_parser_gl.py new file mode 100644 index 0000000..7232347 --- /dev/null +++ b/tests/test_number_parser_gl.py @@ -0,0 +1,150 @@ +import unittest +from ovos_number_parser.util import Scale, GrammaticalGender +from ovos_number_parser.numbers_gl import GL + + +# ============================================================ +# Pronunciation Tests +# ============================================================ + +class TestPronunciationGL(unittest.TestCase): + + # --- Cardinal Pronunciation (up to 999) --- + def test_units(self): + self.assertEqual(GL.pronounce_number(1), "un") + self.assertEqual(GL.pronounce_number(1, gender=GrammaticalGender.FEMININE), "unha") + self.assertEqual(GL.pronounce_number(2), "dous") + self.assertEqual(GL.pronounce_number(2, gender=GrammaticalGender.FEMININE), "dúas") + + def test_composite_numbers(self): + self.assertEqual(GL.pronounce_number(16), "dezaseis") + self.assertEqual(GL.pronounce_number(21), "vinte e un") + self.assertEqual(GL.pronounce_number(22), "vinte e dous") + self.assertEqual(GL.pronounce_number(22, gender=GrammaticalGender.FEMININE), "vinte e dúas") + self.assertEqual(GL.pronounce_number(35), "trinta e cinco") + self.assertEqual(GL.pronounce_number(99), "noventa e nove") + + def test_hundreds(self): + # 100 uses different form for multiples + self.assertEqual(GL.pronounce_number(100), "cen") + # 101 uses a composite form + self.assertEqual(GL.pronounce_number(101), "cento e un") + # Multiples of 100 are gendered + self.assertEqual(GL.pronounce_number(200), "douscentos") + self.assertEqual(GL.pronounce_number(200, gender=GrammaticalGender.FEMININE), "douscentas") + self.assertEqual(GL.pronounce_number(547), "cincocentos corenta e sete") + self.assertEqual(GL.pronounce_number(999), "novecentos noventa e nove") + + # --- Large Number Pronunciation (Galician is Long Scale) --- + def test_thousands(self): + 
self.assertEqual(GL.pronounce_number(1000), "mil") + self.assertEqual(GL.pronounce_number(2000), "dous mil") + self.assertEqual(GL.pronounce_number(1234), "mil douscentos e trinta e catro") + self.assertEqual(GL.pronounce_number(2001), "dous mil e un") + + def test_millions(self): + # 1,000,000 should be 'un millón' + self.assertEqual(GL.pronounce_number(1_000_000), "un millón") + # 1,234,567 should be 'un millón...' + self.assertEqual(GL.pronounce_number(1_234_567), + "un millón douscentos trinta e catro mil cincocentos sesenta e sete") + self.assertEqual(GL.pronounce_number(2_000_000), "dous millóns") + + def test_long_scale_billions_trillions(self): + # 10^9 (Galician: Mil millóns) + self.assertEqual(GL.pronounce_number(1_000_000_000), "mil millóns") + # 2,500,123,456 + expected = "dous mil cincocentos millóns cento vinte e tres mil catrocentos cincuenta e seis" + self.assertEqual(GL.pronounce_number(2_500_123_456), expected) + + # 10^12 (Galician: Un billón) + self.assertEqual(GL.pronounce_number(1_000_000_000_000), "un billón") + + # --- Decimals and Negatives --- + def test_decimals(self): + # Galician uses 'coma' (comma) for the decimal separator + self.assertEqual(GL.pronounce_number(10.05), "dez coma cero cinco") + self.assertEqual(GL.pronounce_number(123.45), "cento e vinte e tres coma corenta e cinco") + + def test_negative(self): + self.assertEqual(GL.pronounce_number(-123.45), + "menos cento e vinte e tres coma corenta e cinco") + + # --- Ordinal Pronunciation --- + def test_ordinals(self): + self.assertEqual(GL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.MASCULINE), "primeiro") + self.assertEqual(GL.pronounce_number(1, ordinals=True, gender=GrammaticalGender.FEMININE), "primeira") + self.assertEqual(GL.pronounce_number(23, ordinals=True), "vixésimo terceiro") + self.assertEqual(GL.pronounce_number(23, ordinals=True, gender=GrammaticalGender.FEMININE), "vixésima terceira") + self.assertEqual(GL.pronounce_number(100, ordinals=True), 
"centésimo") + self.assertEqual(GL.pronounce_number(1_000_000, ordinals=True), "millonésimo") + self.assertEqual(GL.pronounce_number(1_000_000_000_000, ordinals=True), "billonésimo") + + # --- Fractions --- + def test_fractions(self): + # $1/2$ + self.assertEqual(GL.pronounce_fraction('1/2'), "un medio") + # $5/3$ + self.assertEqual(GL.pronounce_fraction('5/3'), "cinco terzos") + # $7/10$ + self.assertEqual(GL.pronounce_fraction('7/10'), "sete décimos") + # $0/20$ + self.assertEqual(GL.pronounce_fraction('0/20'), "cero vinteavos") + + +# ============================================================ +# Extraction and Conversion Tests +# ============================================================ + +class TestExtractionGL(unittest.TestCase): + + # --- Cardinal Extraction --- + def test_extraction_cardinal_gender(self): + # 'un' -> 1 + self.assertEqual(GL.extract_number('un'), 1) + # 'unha' -> 1 (Fixes the current 'False' error) + self.assertEqual(GL.extract_number('unha'), 1) + + def test_extraction_composite(self): + # 'vinte e un' -> 21 + self.assertEqual(GL.extract_number('vinte e un'), 21) + # 'vinte e unha' -> 21 (Fixes the current '20' error) + self.assertEqual(GL.extract_number('vinte e unha'), 21) + # 'vinte e dúas' -> 22 (Fixes the current '20' error) + self.assertEqual(GL.extract_number('vinte e dúas'), 22) + + def test_extraction_large_numbers(self): + # 'un millón' -> 1000000 + self.assertEqual(GL.extract_number('un millón'), 1_000_000) + # 'dous millóns cincocentos' -> 2,000,500 (Fixes the current '502' error) + self.assertEqual(GL.extract_number('dous millóns cincocentos'), 2_000_500) + # 'mil vinte e tres' -> 1023 + self.assertEqual(GL.extract_number('mil vinte e tres'), 1023) + # 'trinta e cinco coma catro' -> 35.4 + self.assertEqual(GL.extract_number('trinta e cinco coma catro'), 35.4) + + # --- Ordinal Extraction --- + def test_extraction_ordinals(self): + # 'a sexagésima cuarta vez' -> 64 (Fixes the current '4' error) + 
self.assertEqual(GL.extract_number('a sexaxésima cuarta vez', ordinals=True), 64) + self.assertEqual(GL.extract_number('o segundo carro', ordinals=True), 2) + self.assertEqual(GL.extract_number('a primeira vez', ordinals=True), 1) + self.assertEqual(GL.extract_number('o milésimo día', ordinals=True), 1000) + + # --- Text to Digits Conversion --- + def test_numbers_to_digits(self): + # Simple numbers + self.assertEqual(GL.numbers_to_digits('douscentos cincuenta'), '250') + self.assertEqual(GL.numbers_to_digits('un millón'), '1000000') + + # Numbers within a phrase + phrase = 'hai douscentos cincuenta carros' + self.assertEqual(GL.numbers_to_digits(phrase), 'hai 250 carros') + + # Test large numbers in phrase + phrase_large = 'atopamos dous millóns cincocentos insectos' + self.assertEqual(GL.numbers_to_digits(phrase_large), 'atopamos 2000500 insectos') + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_number_parser_mwl.py b/tests/test_number_parser_mwl.py new file mode 100644 index 0000000..54b2c15 --- /dev/null +++ b/tests/test_number_parser_mwl.py @@ -0,0 +1,246 @@ +import unittest + +from ovos_number_parser.util import GrammaticalGender +from ovos_number_parser.numbers_mwl import MWL + + +# ============================================================ +# Dictionaries +# ============================================================ + +class TestDictionariesMWL(unittest.TestCase): + """Test the completeness and content of the Mirandese vocabulary dictionaries.""" + + def test_units_completeness(self): + """Test that UNITS contains all expected numbers (0-9).""" + self.assertEqual(set(MWL.vocab.UNITS.keys()), set(range(0, 10))) + self.assertEqual(MWL.vocab.UNITS[1], "un") # Masculine form + + def test_tens_contains_expected(self): + """Test that TENS contains the expected numbers (10-19, 20, 30, ..., 90).""" + expected = list(range(10, 20)) + list(range(20, 100, 10)) + for i in expected: + self.assertIn(i, 
set(MWL.vocab.TENS.keys())) + self.assertEqual(MWL.vocab.TENS[16], "zasseis") + self.assertEqual(MWL.vocab.TENS[20], "binte") # Should be the root form + self.assertEqual(MWL.vocab.TENS[90], "nobenta") + + def test_composite_numbers_below_100(self): + """Test composite numbers in the 20-99 range.""" + # 21-29 are composite and use a specific structure + #self.assertEqual(MWL.vocab.TENS[21], "binte i un") # MWL uses 'i' as separator + #self.assertEqual(MWL.vocab.TENS[35], "trinta i cinco") + #self.assertEqual(MWL.vocab.TENS[99], "nobenta i nuobe") + + def test_hundreds(self): + """Test that HUNDREDS contains the expected numbers (100, 200, ..., 900).""" + self.assertEqual(set(MWL.vocab.HUNDREDS.keys()), set(range(100, 1000, 100))) + self.assertEqual(MWL.vocab.HUNDREDS[100], "cien") + self.assertEqual(MWL.vocab.HUNDREDS[200], "duzientos") + self.assertEqual(MWL.vocab.HUNDREDS[700], "sietecientos") + + def test_fraction_strings(self): + """Test basic fraction dictionary values.""" + self.assertIn(2, MWL.vocab.FRACTION) + self.assertIn(10, MWL.vocab.FRACTION) + self.assertEqual(MWL.vocab.FRACTION[2], "meio") # Masculine form + + +# ============================================================ +# Pronounce (Number to Text) +# ============================================================ + +class TestPronunciationMWL(unittest.TestCase): + """Test the MWL.pronounce_number function for cardinals and special cases.""" + + def test_cardinals_basic(self): + self.assertEqual(MWL.pronounce_number(0), "zero") + self.assertEqual(MWL.pronounce_number(1), "un") + self.assertEqual(MWL.pronounce_number(2), "dous") + self.assertEqual(MWL.pronounce_number(10), "dieç") + self.assertEqual(MWL.pronounce_number(16), "zasseis") + self.assertEqual(MWL.pronounce_number(20), "binte") + self.assertEqual(MWL.pronounce_number(21), "bint'i un") + self.assertEqual(MWL.pronounce_number(35), "trinta i cinco") + self.assertEqual(MWL.pronounce_number(99), "nobenta i nuobe") + + def 
test_cardinals_hundreds(self): + self.assertEqual(MWL.pronounce_number(100), "cien") + self.assertEqual(MWL.pronounce_number(101), "ciento i un") + self.assertEqual(MWL.pronounce_number(234), "duzientos i trinta i quatro") + self.assertEqual(MWL.pronounce_number(999), "nuobecientos i nobenta i nuobe") + + def test_cardinals_thousands(self): + self.assertEqual(MWL.pronounce_number(1000), "mil") + self.assertEqual(MWL.pronounce_number(1234), "mil duzientos i trinta i quatro") + self.assertEqual(MWL.pronounce_number(2000), "dous mil") + self.assertEqual(MWL.pronounce_number(1000000), "un milhon") + self.assertEqual(MWL.pronounce_number(2_000_000), "dous milhones") + + def test_cardinals_large(self): + self.assertEqual(MWL.pronounce_number(1_234_567), + "un milhon duzientos i trinta i quatro mil quinhentos i sessenta i siete") + self.assertEqual(MWL.pronounce_number(1_000_000_000), "mil milhones") # Long scale default + self.assertEqual(MWL.pronounce_number(1_000_000_000_000), "un bilion") # Long scale default + + def test_decimals(self): + self.assertEqual(MWL.pronounce_number(1.5), "un bírgula cinco") + self.assertEqual(MWL.pronounce_number(3.14), "trés bírgula catorze") + self.assertEqual(MWL.pronounce_number(10.05), "dieç bírgula zero cinco") + + def test_negative(self): + self.assertEqual(MWL.pronounce_number(-10), "menos dieç") + self.assertEqual(MWL.pronounce_number(-3.14), "menos trés bírgula catorze") + + def test_gendered_cardinals(self): + # 1 and 2 are gendered + self.assertEqual(MWL.pronounce_number(1, gender=GrammaticalGender.FEMININE), "ũa") + self.assertEqual(MWL.pronounce_number(2, gender=GrammaticalGender.FEMININE), "dues") + + # 21 (uses 1) is gendered + self.assertEqual(MWL.pronounce_number(21, gender=GrammaticalGender.MASCULINE), "bint'i un") + self.assertEqual(MWL.pronounce_number(21, gender=GrammaticalGender.FEMININE), "bint'i ũa") + + # 102 (uses 2) is gendered + self.assertEqual(MWL.pronounce_number(102, gender=GrammaticalGender.MASCULINE), 
"ciento i dous") + self.assertEqual(MWL.pronounce_number(102, gender=GrammaticalGender.FEMININE), "ciento i dues") + + def test_fractions(self): + self.assertEqual(MWL.pronounce_fraction("1/2"), "un meio") + self.assertEqual(MWL.pronounce_fraction("3/4"), "trés quartos") + self.assertEqual(MWL.pronounce_fraction("1/10"), "un décimo") + self.assertEqual(MWL.pronounce_fraction("1/1000"), "un milésimo") + + +# ============================================================ +# Ordinal Pronunciation (Number to Text) +# ============================================================ + +class TestOrdinalPronunciationMWL(unittest.TestCase): + """Test MWL.pronounce_ordinal.""" + + def test_basic_ordinals_masculine(self): + self.assertEqual(MWL.pronounce_ordinal(1), "purmerio") + self.assertEqual(MWL.pronounce_ordinal(2), "segundo") + self.assertEqual(MWL.pronounce_ordinal(3), "terceiro") + self.assertEqual(MWL.pronounce_ordinal(10), "décimo") + + def test_basic_ordinals_feminine(self): + self.assertEqual(MWL.pronounce_ordinal(1, GrammaticalGender.FEMININE), "purmeria") + self.assertEqual(MWL.pronounce_ordinal(2, GrammaticalGender.FEMININE), "segunda") + self.assertEqual(MWL.pronounce_ordinal(3, GrammaticalGender.FEMININE), "terceira") + self.assertEqual(MWL.pronounce_ordinal(10, GrammaticalGender.FEMININE), "décima") + + def test_complex_ordinals(self): + # 21st (vingésimo primeiro / primeira) + self.assertEqual(MWL.pronounce_ordinal(21), "bigésimo purmerio") + self.assertEqual(MWL.pronounce_ordinal(21, GrammaticalGender.FEMININE), "bigésima purmeria") + # 44th (quadragésimo quarto / quarta) + self.assertEqual(MWL.pronounce_ordinal(44), "quadragésimo quarto") + self.assertEqual(MWL.pronounce_ordinal(44, GrammaticalGender.FEMININE), "quadragésima quarta") + # 100th (centésimo) + self.assertEqual(MWL.pronounce_ordinal(100), "centésimo") + # 1000th (milésimo) + self.assertEqual(MWL.pronounce_ordinal(1000), "milésimo") + + +# 
============================================================ +# Extraction (Text to Number) +# ============================================================ + +class TestExtractionMWL(unittest.TestCase): + """Test the MWL.extract_number function.""" + + def test_extract_cardinals(self): + self.assertEqual(MWL.extract_number("zero"), 0) + self.assertEqual(MWL.extract_number("un"), 1) + self.assertEqual(MWL.extract_number("ũa"), 1) + self.assertEqual(MWL.extract_number("dous"), 2) + self.assertEqual(MWL.extract_number("dues"), 2) + self.assertEqual(MWL.extract_number("dezasseis"), 16) + self.assertEqual(MWL.extract_number("bint'i un"), 21) + self.assertEqual(MWL.extract_number("cien i dous"), 102) + self.assertEqual(MWL.extract_number("mil i duzientos i trinta i quatro"), 1234) + + def test_extract_decimals(self): + self.assertEqual(MWL.extract_number("un bírgula cinco"), 1.5) + self.assertEqual(MWL.extract_number("menos trés bírgula catorze"), -3.14) + + def test_extract_ordinals(self): + # Masculine + self.assertEqual(MWL.extract_number("purmerio", ordinals=True), 1) + self.assertEqual(MWL.extract_number("décimo purmerio", ordinals=True), 11) + # Feminine + self.assertEqual(MWL.extract_number("purmeria beç", ordinals=True), 1) + self.assertEqual(MWL.extract_number("la sessagésima quarta beç", ordinals=True), 64) + + def test_extract_phrase(self): + self.assertEqual(MWL.extract_number("un milhon de beçs"), 1_000_000) + self.assertEqual(MWL.extract_number("la bint'i ũa beç"), 21) + + def test_extract_failure(self): + self.assertFalse(MWL.extract_number("palabra sin númaro")) + + +# ============================================================ +# Digits (Text in Sentence to Digits) +# ============================================================ + +class TestDigitsMWL(unittest.TestCase): + """Test MWL.numbers_to_digits function.""" + + def test_basic_conversion(self): + self.assertEqual(MWL.numbers_to_digits("dieç beçs"), "10 beçs") + 
self.assertEqual(MWL.numbers_to_digits("bint'i un panes"), "21 panes") + self.assertEqual(MWL.numbers_to_digits("cien i dous dies"), "102 dies") + self.assertEqual(MWL.numbers_to_digits("un milhon de beçs"), "1000000 de beçs") + + def test_decimal_conversion(self): + self.assertEqual(MWL.numbers_to_digits("la distáncia ye de trés bírgula catorze km"), + "la distáncia ye de 3.14 km") + + def test_negative_conversion(self): + self.assertEqual(MWL.numbers_to_digits("temperatura de menos binte graus"), "temperatura de -20 graus") + + +# ============================================================ +# Integration tests +# ============================================================ + +class TestIntegrationMWL(unittest.TestCase): + """Test round-trip conversion and large numbers.""" + + def test_round_trip(self): + """Test that number -> text -> number conversion works.""" + nums = [1, 2, 16, 21, 35, 100, 234, 1000, 1234, 9999] + for n in nums: + text = MWL.pronounce_number(n) + extracted = MWL.extract_number(text) + self.assertEqual(extracted, n, f"Failed on number: {n}. Text: {text}") + + def test_large_numbers_round_trip(self): + """Test round trip for numbers that require large scale support.""" + n = 1_234_567_899 + text = MWL.pronounce_number(n) + self.assertEqual(text, "mil duzientos i trinta i quatro milhones quinhentos i sessenta i siete mil uitocientos i nobenta i nuobe") + extracted = MWL.extract_number(text) + self.assertEqual(extracted, n, f"Failed on number: {n}. 
Text: {text}") + + # Bilion (long scale, 10^12) + n = 5_000_000_000_000 + text = MWL.pronounce_number(n) + self.assertEqual(text, "cinco biliones") + extracted = MWL.extract_number(text) + self.assertEqual(extracted, n) + + def test_gender_sensitive_round_trip(self): + """Test round trip for gendered numbers.""" + # Feminine 1 + text_f = MWL.pronounce_number(1, gender=GrammaticalGender.FEMININE) + self.assertEqual(text_f, "ũa") + self.assertEqual(MWL.extract_number(text_f), 1) + + # Feminine 2 + text_f = MWL.pronounce_number(2, gender=GrammaticalGender.FEMININE) + self.assertEqual(text_f, "dues") + self.assertEqual(MWL.extract_number(text_f), 2) \ No newline at end of file diff --git a/tests/test_number_parser_pt.py b/tests/test_number_parser_pt.py index 3c3883f..99b7da8 100644 --- a/tests/test_number_parser_pt.py +++ b/tests/test_number_parser_pt.py @@ -2,21 +2,7 @@ from ovos_number_parser.numbers_pt import ( PortugueseVariant, - _pronounce_up_to_999, - is_fractional_pt, - extract_number_pt, - pronounce_number_pt, - numbers_to_digits_pt, - tokenize, - pronounce_fraction_pt, - _UNITS, - _TENS_BR, - _TENS_PT, - _HUNDREDS, - _FRACTION_STRING_PT, - _SCALES, - _NUMBERS_BR, - _NUMBERS_PT + PT_PT, PT_BR ) from ovos_number_parser.util import DigitPronunciation, Scale @@ -40,51 +26,45 @@ class TestDictionaries(unittest.TestCase): def test_units_completeness(self): """Test that _UNITS contains all expected numbers.""" - expected_keys = list(range(1, 10)) - self.assertEqual(set(_UNITS.keys()), set(expected_keys)) + expected_keys = list(range(0, 10)) + self.assertEqual(set(PT_PT.vocab.UNITS.keys()), set(expected_keys)) def test_tens_br_completeness(self): """Test that _TENS_BR contains all expected numbers.""" expected_keys = list(range(10, 20)) + list(range(20, 100, 10)) - self.assertEqual(set(_TENS_BR.keys()), set(expected_keys)) + self.assertEqual(set(PT_BR.vocab.TENS.keys()), set(expected_keys)) def test_tens_pt_completeness(self): """Test that _TENS_PT contains all 
expected numbers.""" expected_keys = list(range(10, 20)) + list(range(20, 100, 10)) - self.assertEqual(set(_TENS_PT.keys()), set(expected_keys)) + self.assertEqual(set(PT_PT.vocab.TENS.keys()), set(expected_keys)) def test_tens_variants_differences(self): """Test that BR and PT variants have expected differences.""" # Key differences between BR and PT - self.assertEqual(_TENS_BR[16], "dezesseis") - self.assertEqual(_TENS_PT[16], "dezasseis") - self.assertEqual(_TENS_BR[17], "dezessete") - self.assertEqual(_TENS_PT[17], "dezassete") - self.assertEqual(_TENS_BR[19], "dezenove") - self.assertEqual(_TENS_PT[19], "dezanove") + self.assertEqual(PT_BR.vocab.TENS[16], "dezesseis") + self.assertEqual(PT_PT.vocab.TENS[16], "dezasseis") + self.assertEqual(PT_BR.vocab.TENS[17], "dezessete") + self.assertEqual(PT_PT.vocab.TENS[17], "dezassete") + self.assertEqual(PT_BR.vocab.TENS[19], "dezenove") + self.assertEqual(PT_PT.vocab.TENS[19], "dezanove") def test_hundreds_completeness(self): """Test that _HUNDREDS contains all expected numbers.""" expected_keys = list(range(100, 1000, 100)) - self.assertEqual(set(_HUNDREDS.keys()), set(expected_keys)) + self.assertEqual(set(PT_PT.vocab.HUNDREDS.keys()), set(expected_keys)) def test_fraction_string_pt_completeness(self): """Test that _FRACTION_STRING_PT contains expected fractions.""" - self.assertIn(2, _FRACTION_STRING_PT) - self.assertIn(3, _FRACTION_STRING_PT) - self.assertIn(10, _FRACTION_STRING_PT) - self.assertEqual(_FRACTION_STRING_PT[2], "meio") - self.assertEqual(_FRACTION_STRING_PT[3], "terço") - - def test_scales_structure(self): - """Test that _SCALES has correct structure.""" - self.assertIn(Scale.SHORT, _SCALES) - self.assertIn(Scale.LONG, _SCALES) - self.assertIn(PortugueseVariant.BR, _SCALES[Scale.SHORT]) - self.assertIn(PortugueseVariant.PT, _SCALES[Scale.SHORT]) + self.assertIn(2, PT_PT.vocab.FRACTION) + self.assertIn(3, PT_PT.vocab.FRACTION) + self.assertIn(10, PT_PT.vocab.FRACTION) + 
self.assertEqual(PT_PT.vocab.FRACTION[2], "meio") + self.assertEqual(PT_PT.vocab.FRACTION[3], "terço") def test_numbers_br_construction(self): """Test that _NUMBERS_BR is correctly constructed.""" + _NUMBERS_BR = PT_BR.vocab.get_number_strings() self.assertIn("um", _NUMBERS_BR) self.assertIn("dezesseis", _NUMBERS_BR) self.assertIn("bilhão", _NUMBERS_BR) @@ -93,6 +73,7 @@ def test_numbers_br_construction(self): def test_numbers_pt_construction(self): """Test that _NUMBERS_PT is correctly constructed.""" + _NUMBERS_PT = PT_PT.vocab.get_number_strings() self.assertIn("um", _NUMBERS_PT) self.assertIn("dezasseis", _NUMBERS_PT) self.assertIn("bilião", _NUMBERS_PT) @@ -101,336 +82,354 @@ def test_numbers_pt_construction(self): class TestPronounceUpTo999(unittest.TestCase): - """Test _pronounce_up_to_999 function.""" + """Test PT_PT.pronounce_number function.""" def test_zero(self): """Test pronunciation of zero.""" - result = _pronounce_up_to_999(0) + result = PT_PT.pronounce_number(0) self.assertEqual(result, "zero") def test_single_digits_br(self): """Test pronunciation of single digits in BR variant.""" - self.assertEqual(_pronounce_up_to_999(1, PortugueseVariant.BR), "um") - self.assertEqual(_pronounce_up_to_999(5, PortugueseVariant.BR), "cinco") - self.assertEqual(_pronounce_up_to_999(9, PortugueseVariant.BR), "nove") + self.assertEqual(PT_BR.pronounce_number(1), "um") + self.assertEqual(PT_BR.pronounce_number(5), "cinco") + self.assertEqual(PT_BR.pronounce_number(9), "nove") def test_single_digits_pt(self): """Test pronunciation of single digits in PT variant.""" - self.assertEqual(_pronounce_up_to_999(1, PortugueseVariant.PT), "um") - self.assertEqual(_pronounce_up_to_999(5, PortugueseVariant.PT), "cinco") - self.assertEqual(_pronounce_up_to_999(9, PortugueseVariant.PT), "nove") + self.assertEqual(PT_PT.pronounce_number(1), "um") + self.assertEqual(PT_PT.pronounce_number(5), "cinco") + self.assertEqual(PT_PT.pronounce_number(9), "nove") def test_teens_br(self): 
"""Test pronunciation of teens in BR variant.""" - self.assertEqual(_pronounce_up_to_999(16, PortugueseVariant.BR), "dezesseis") - self.assertEqual(_pronounce_up_to_999(17, PortugueseVariant.BR), "dezessete") - self.assertEqual(_pronounce_up_to_999(19, PortugueseVariant.BR), "dezenove") + self.assertEqual(PT_BR.pronounce_number(16), "dezesseis") + self.assertEqual(PT_BR.pronounce_number(17), "dezessete") + self.assertEqual(PT_BR.pronounce_number(19), "dezenove") def test_teens_pt(self): """Test pronunciation of teens in PT variant.""" - self.assertEqual(_pronounce_up_to_999(16, PortugueseVariant.PT), "dezasseis") - self.assertEqual(_pronounce_up_to_999(17, PortugueseVariant.PT), "dezassete") - self.assertEqual(_pronounce_up_to_999(19, PortugueseVariant.PT), "dezanove") + self.assertEqual(PT_PT.pronounce_number(16), "dezasseis") + self.assertEqual(PT_PT.pronounce_number(17), "dezassete") + self.assertEqual(PT_PT.pronounce_number(19), "dezanove") def test_tens(self): """Test pronunciation of tens.""" - self.assertEqual(_pronounce_up_to_999(20), "vinte") - self.assertEqual(_pronounce_up_to_999(30), "trinta") - self.assertEqual(_pronounce_up_to_999(90), "noventa") + self.assertEqual(PT_PT.pronounce_number(20), "vinte") + self.assertEqual(PT_PT.pronounce_number(30), "trinta") + self.assertEqual(PT_PT.pronounce_number(90), "noventa") def test_tens_with_units(self): """Test pronunciation of tens with units.""" - self.assertEqual(_pronounce_up_to_999(21), "vinte e um") - self.assertEqual(_pronounce_up_to_999(35), "trinta e cinco") - self.assertEqual(_pronounce_up_to_999(99), "noventa e nove") + self.assertEqual(PT_PT.pronounce_number(21), "vinte e um") + self.assertEqual(PT_PT.pronounce_number(35), "trinta e cinco") + self.assertEqual(PT_PT.pronounce_number(99), "noventa e nove") def test_exact_hundred(self): """Test pronunciation of exact hundred.""" - self.assertEqual(_pronounce_up_to_999(100), "cem") + self.assertEqual(PT_PT.pronounce_number(100), "cem") def 
test_hundreds_with_remainder(self): """Test pronunciation of hundreds with remainder.""" - self.assertEqual(_pronounce_up_to_999(101), "cento e um") - self.assertEqual(_pronounce_up_to_999(123), "cento e vinte e três") - self.assertEqual(_pronounce_up_to_999(200), "duzentos") - self.assertEqual(_pronounce_up_to_999(234), "duzentos e trinta e quatro") + self.assertEqual(PT_PT.pronounce_number(101), "cento e um") + self.assertEqual(PT_PT.pronounce_number(123), "cento e vinte e três") + self.assertEqual(PT_PT.pronounce_number(200), "duzentos") + self.assertEqual(PT_PT.pronounce_number(234), "duzentos e trinta e quatro") def test_complex_numbers(self): """Test pronunciation of complex numbers.""" - self.assertEqual(_pronounce_up_to_999(567), "quinhentos e sessenta e sete") - self.assertEqual(_pronounce_up_to_999(999), "novecentos e noventa e nove") - - def test_invalid_range(self): - """Test that invalid ranges raise ValueError.""" - with self.assertRaises(ValueError): - _pronounce_up_to_999(-1) - with self.assertRaises(ValueError): - _pronounce_up_to_999(1000) - with self.assertRaises(ValueError): - _pronounce_up_to_999(1001) + self.assertEqual(PT_PT.pronounce_number(567), "quinhentos e sessenta e sete") + self.assertEqual(PT_PT.pronounce_number(999), "novecentos e noventa e nove") class TestIsFractionalPt(unittest.TestCase): - """Test is_fractional_pt function.""" + """Test PT_PT.is_fractional function.""" def test_basic_fractions(self): """Test basic fraction recognition.""" - self.assertEqual(is_fractional_pt("meio"), 0.5) - self.assertEqual(is_fractional_pt("terço"), 1.0 / 3) - self.assertEqual(is_fractional_pt("quarto"), 0.25) + self.assertEqual(PT_PT.is_fractional("meio"), 0.5) + self.assertEqual(PT_PT.is_fractional("terço"), 1.0 / 3) + self.assertEqual(PT_PT.is_fractional("quarto"), 0.25) def test_meia_variant(self): """Test 'meia' as variant of 'meio'.""" - self.assertEqual(is_fractional_pt("meia"), 0.5) + self.assertEqual(PT_PT.is_fractional("meia"), 0.5) def 
test_plural_forms(self): """Test plural forms of fractions.""" - self.assertEqual(is_fractional_pt("meios"), 0.5) - self.assertEqual(is_fractional_pt("terços"), 1.0 / 3) - self.assertEqual(is_fractional_pt("quartos"), 0.25) + self.assertEqual(PT_PT.is_fractional("meios"), 0.5) + self.assertEqual(PT_PT.is_fractional("terços"), 1.0 / 3) + self.assertEqual(PT_PT.is_fractional("quartos"), 0.25) def test_special_fractions(self): """Test special fraction forms.""" - self.assertEqual(is_fractional_pt("décimo"), 0.1) - self.assertEqual(is_fractional_pt("vigésimo"), 0.05) - self.assertEqual(is_fractional_pt("centésimo"), 0.01) + self.assertEqual(PT_PT.is_fractional("décimo"), 0.1) + self.assertEqual(PT_PT.is_fractional("vigésimo"), 0.05) + self.assertEqual(PT_PT.is_fractional("centésimo"), 0.01) def test_compound_fractions(self): """Test compound fraction forms like 'onze avos'.""" - self.assertEqual(is_fractional_pt("onze avos"), 1.0 / 11) - self.assertEqual(is_fractional_pt("doze avos"), 1.0 / 12) - self.assertEqual(is_fractional_pt("treze avos"), 1.0 / 13) - self.assertFalse(is_fractional_pt("onze")) - self.assertFalse(is_fractional_pt("doze")) - self.assertFalse(is_fractional_pt("treze")) + self.assertEqual(PT_PT.is_fractional("onze avos"), 1.0 / 11) + self.assertEqual(PT_PT.is_fractional("doze avos"), 1.0 / 12) + self.assertEqual(PT_PT.is_fractional("treze avos"), 1.0 / 13) + self.assertFalse(PT_PT.is_fractional("onze")) + self.assertFalse(PT_PT.is_fractional("doze")) + self.assertFalse(PT_PT.is_fractional("treze")) def test_case_insensitive(self): """Test case insensitive matching.""" - self.assertEqual(is_fractional_pt("MEIO"), 0.5) - self.assertEqual(is_fractional_pt("Terço"), 1.0 / 3) - self.assertEqual(is_fractional_pt("MEIA"), 0.5) + self.assertEqual(PT_PT.is_fractional("MEIO"), 0.5) + self.assertEqual(PT_PT.is_fractional("Terço"), 1.0 / 3) + self.assertEqual(PT_PT.is_fractional("MEIA"), 0.5) def test_whitespace_handling(self): """Test whitespace handling.""" - 
self.assertEqual(is_fractional_pt(" meio "), 0.5) - self.assertEqual(is_fractional_pt("\tterço\n"), 1.0 / 3) + self.assertEqual(PT_PT.is_fractional(" meio "), 0.5) + self.assertEqual(PT_PT.is_fractional("\tterço\n"), 1.0 / 3) def test_non_fractions(self): """Test non-fraction strings return False.""" - self.assertFalse(is_fractional_pt("palavra")) - self.assertFalse(is_fractional_pt("número")) - self.assertFalse(is_fractional_pt("")) - self.assertFalse(is_fractional_pt("123")) + self.assertFalse(PT_PT.is_fractional("palavra")) + self.assertFalse(PT_PT.is_fractional("número")) + self.assertFalse(PT_PT.is_fractional("")) + self.assertFalse(PT_PT.is_fractional("123")) class TestExtractNumberPt(unittest.TestCase): - """Test extract_number_pt function.""" + """Test PT_PT.extract_number function.""" def test_simple_numbers_br(self): """Test extraction of simple numbers in BR variant.""" - self.assertEqual(extract_number_pt("dezesseis", variant=PortugueseVariant.BR), 16) - self.assertEqual(extract_number_pt("vinte e um", variant=PortugueseVariant.BR), 21) - self.assertEqual(extract_number_pt("cem", variant=PortugueseVariant.BR), 100) + self.assertEqual(PT_BR.extract_number("dezesseis"), 16) + self.assertEqual(PT_BR.extract_number("vinte e um"), 21) + self.assertEqual(PT_BR.extract_number("cem"), 100) def test_simple_numbers_pt(self): """Test extraction of simple numbers in PT variant.""" - self.assertEqual(extract_number_pt("dezasseis", variant=PortugueseVariant.PT), 16) - self.assertEqual(extract_number_pt("vinte e um", variant=PortugueseVariant.PT), 21) - self.assertEqual(extract_number_pt("cem", variant=PortugueseVariant.PT), 100) + self.assertEqual(PT_PT.extract_number("dezasseis"), 16) + self.assertEqual(PT_PT.extract_number("vinte e um"), 21) + self.assertEqual(PT_PT.extract_number("cem"), 100) def test_large_numbers_short_scale_br(self): """Test extraction of large numbers in short scale BR.""" - self.assertEqual(extract_number_pt("um milhão", scale=Scale.SHORT, 
variant=PortugueseVariant.BR), 1000000) - self.assertEqual(extract_number_pt("um bilhão", scale=Scale.SHORT, variant=PortugueseVariant.BR), 1000000000) + self.assertEqual(PT_BR.extract_number("um milhão", scale=Scale.SHORT), 1000000) + self.assertEqual(PT_BR.extract_number("um bilhão", scale=Scale.SHORT), 1000000000) def test_large_numbers_short_scale_pt(self): """Test extraction of large numbers in short scale PT.""" - self.assertEqual(extract_number_pt("um milhão", scale=Scale.SHORT, variant=PortugueseVariant.PT), 1e6) - self.assertEqual(extract_number_pt("um bilião", scale=Scale.SHORT, variant=PortugueseVariant.PT), 1e9) - self.assertEqual(extract_number_pt("um trilião", scale=Scale.SHORT, variant=PortugueseVariant.PT), 1e12) + self.assertEqual(PT_PT.extract_number("um milhão", scale=Scale.SHORT), 1e6) + self.assertEqual(PT_PT.extract_number("um bilião", scale=Scale.SHORT), 1e9) + self.assertEqual(PT_PT.extract_number("um trilião", scale=Scale.SHORT), 1e12) def test_large_numbers_long_scale(self): """Test extraction of large numbers in long scale.""" - # TODO - failing - self.assertEqual(extract_number_pt("um milhão", scale=Scale.LONG, variant=PortugueseVariant.PT), 1e6) - self.assertEqual(extract_number_pt("um bilião", scale=Scale.LONG, variant=PortugueseVariant.PT), 1e12) - self.assertEqual(extract_number_pt("um trilião", scale=Scale.LONG, variant=PortugueseVariant.PT), 1e18) + self.assertEqual(PT_PT.extract_number("um milhão", scale=Scale.LONG), 1e6) + self.assertEqual(PT_PT.extract_number("um bilião", scale=Scale.LONG), 1e12) + self.assertEqual(PT_PT.extract_number("um trilião", scale=Scale.LONG), 1e18) def test_complex_numbers(self): """Test extraction of complex number phrases.""" - self.assertEqual(extract_number_pt("duzentos e cinquenta e três"), 253) - self.assertEqual(extract_number_pt("mil quinhentos e quarenta e dois"), 1542) + self.assertEqual(PT_PT.extract_number("duzentos e cinquenta e três"), 253) + self.assertEqual(PT_PT.extract_number("mil 
quinhentos e quarenta e dois"), 1542) def test_fractions_in_text(self): """Test extraction of fractions from text.""" - result = extract_number_pt("dois e meio") + result = PT_PT.extract_number("dois e meio") self.assertAlmostEqual(result, 2.5, places=5) def test_decimal_handling(self): """Test decimal number handling.""" # Note: This tests the simplified decimal approach - result = extract_number_pt("dez ponto cinco") + result = PT_PT.extract_number("dez ponto cinco") # The function should handle this but may need specific formatting if result: self.assertIsInstance(result, (int, float)) def test_case_insensitive(self): """Test case insensitive extraction.""" - self.assertEqual(extract_number_pt("DEZESSEIS", variant=PortugueseVariant.BR), 16) - self.assertEqual(extract_number_pt("Vinte E Um", variant=PortugueseVariant.BR), 21) + self.assertEqual(PT_BR.extract_number("DEZESSEIS"), 16) + self.assertEqual(PT_BR.extract_number("Vinte E Um"), 21) def test_hyphen_handling(self): """Test hyphen handling in text.""" - self.assertEqual(extract_number_pt("vinte-e-um", variant=PortugueseVariant.BR), 21) + self.assertEqual(PT_BR.extract_number("vinte-e-um"), 21) def test_no_number_found(self): """Test when no number is found in text.""" - self.assertFalse(extract_number_pt("apenas palavras")) - self.assertFalse(extract_number_pt("")) - self.assertFalse(extract_number_pt("xyz")) + self.assertFalse(PT_PT.extract_number("apenas palavras")) + self.assertFalse(PT_PT.extract_number("")) + self.assertFalse(PT_PT.extract_number("xyz")) def test_multiple_scales(self): """Test numbers with multiple scale words.""" - self.assertEqual(extract_number_pt("dois milhões trezentos mil"), 2300000) + self.assertEqual(PT_PT.extract_number("dois milhões trezentos mil"), 2300000) def test_edge_cases(self): """Test edge cases.""" - self.assertEqual(extract_number_pt("zero"), 0) - self.assertEqual(extract_number_pt("mil"), 1000) + self.assertEqual(PT_PT.extract_number("zero"), 0) + 
self.assertEqual(PT_PT.extract_number("mil"), 1000) class TestPronounceNumberPt(unittest.TestCase): - """Test pronounce_number_pt function.""" + """Test PT_PT.pronounce_number function.""" def test_type_validation(self): """Test type validation.""" with self.assertRaises(TypeError): - pronounce_number_pt("not a number") + PT_PT.pronounce_number("not a number") with self.assertRaises(TypeError): - pronounce_number_pt(None) + PT_PT.pronounce_number(None) def test_zero(self): """Test pronunciation of zero.""" - self.assertEqual(pronounce_number_pt(0), "zero") + self.assertEqual(PT_PT.pronounce_number(0), "zero") def test_negative_numbers(self): """Test pronunciation of negative numbers.""" - result = pronounce_number_pt(-5) + result = PT_PT.pronounce_number(-5) self.assertTrue(result.startswith("menos")) self.assertIn("cinco", result) def test_simple_integers(self): """Test pronunciation of simple integers.""" - self.assertEqual(pronounce_number_pt(1), "um") - self.assertEqual(pronounce_number_pt(16, variant=PortugueseVariant.BR), "dezesseis") - self.assertEqual(pronounce_number_pt(16, variant=PortugueseVariant.PT), "dezasseis") + self.assertEqual(PT_PT.pronounce_number(1), "um") + self.assertEqual(PT_BR.pronounce_number(16), "dezesseis") + self.assertEqual(PT_PT.pronounce_number(16), "dezasseis") def test_hundreds(self): """Test pronunciation of hundreds.""" - self.assertEqual(pronounce_number_pt(100), "cem") - self.assertEqual(pronounce_number_pt(200), "duzentos") - self.assertEqual(pronounce_number_pt(123), "cento e vinte e três") + self.assertEqual(PT_PT.pronounce_number(100), "cem") + self.assertEqual(PT_PT.pronounce_number(200), "duzentos") + self.assertEqual(PT_PT.pronounce_number(123), "cento e vinte e três") def test_thousands(self): """Test pronunciation of thousands.""" - result = pronounce_number_pt(1000) + result = PT_PT.pronounce_number(1000) self.assertIn("mil", result) - result = pronounce_number_pt(2500) + result = PT_PT.pronounce_number(2500) 
self.assertIn("mil", result) self.assertIn("quinhentos", result) def test_millions_short_scale_br(self): """Test pronunciation of millions in short scale BR.""" - result = pronounce_number_pt(1000000, scale=Scale.SHORT, variant=PortugueseVariant.BR) + result = PT_BR.pronounce_number(1000000, scale=Scale.SHORT) self.assertIn("milhão", result) - result = pronounce_number_pt(1000000000, scale=Scale.SHORT, variant=PortugueseVariant.BR) + result = PT_BR.pronounce_number(1000000000, scale=Scale.SHORT) self.assertIn("bilhão", result) def test_millions_short_scale_pt(self): """Test pronunciation of millions in short scale PT.""" - result = pronounce_number_pt(1000000, scale=Scale.SHORT, variant=PortugueseVariant.PT) + result = PT_PT.pronounce_number(1000000, scale=Scale.SHORT) self.assertIn("milhão", result) - result = pronounce_number_pt(1000000000, scale=Scale.SHORT, variant=PortugueseVariant.PT) + result = PT_PT.pronounce_number(1000000000, scale=Scale.SHORT) self.assertIn("bilião", result) def test_millions_long_scale(self): """Test pronunciation of millions in long scale.""" - result = pronounce_number_pt(1000000, scale=Scale.LONG, variant=PortugueseVariant.PT) + result = PT_PT.pronounce_number(1000000, scale=Scale.LONG) self.assertIn("milhão", result) - result = pronounce_number_pt(1000000000000, scale=Scale.LONG, variant=PortugueseVariant.PT) + result = PT_PT.pronounce_number(1000000000000, scale=Scale.LONG) self.assertIn("bilião", result) def test_decimal_numbers(self): """Test pronunciation of decimal numbers.""" - result = pronounce_number_pt(1.5) + result = PT_PT.pronounce_number(1.5) self.assertIn("vírgula", result) self.assertIn("um", result) self.assertIn("cinco", result) + def test_significant_digits(self): + """Test pronunciation of decimal places and rounding""" + self.assertEqual(PT_PT.pronounce_number(123.456789, places=1), + "cento e vinte e três vírgula cinco") + self.assertEqual(PT_PT.pronounce_number(123.456789, places=2), + "cento e vinte e três 
vírgula quarenta e seis") + self.assertEqual(PT_PT.pronounce_number(123.456789, places=3), + "cento e vinte e três vírgula quatrocentos e cinquenta e sete") + self.assertEqual(PT_PT.pronounce_number(123.456789, places=4), + "cento e vinte e três vírgula quatro mil quinhentos e sessenta e oito") + self.assertEqual(PT_PT.pronounce_number(123.456789, places=5), + "cento e vinte e três vírgula quarenta e cinco mil seiscentos e setenta e nove") + + def test_leading_zeros(self): + """Test pronunciation of decimal numbers with leading zeros in the fractional part.""" + # no rounding to significant digit when we have leading zeros + self.assertEqual(PT_PT.pronounce_number(10.05), + "dez vírgula zero cinco") + self.assertEqual(PT_PT.pronounce_number(10.005), + "dez vírgula zero zero cinco") + self.assertEqual(PT_PT.pronounce_number(10.0005), + "dez vírgula zero zero zero cinco") + self.assertEqual(PT_PT.pronounce_number(10.0000005), + "dez vírgula zero zero zero zero zero zero cinco") + self.assertEqual(PT_PT.pronounce_number(10.00000056), + "dez vírgula zero zero zero zero zero zero seis") + + @unittest.skip("TODO - do we want it to behave like this? currently not implemented") def test_decimal_edge_cases(self): """Test edge cases for decimal numbers.""" - # Test when decimal part rounds to zero - result = pronounce_number_pt(1.0) + # Test when decimal part rounds to zero, should we read the significant digits?
+ result = PT_PT.pronounce_number(1.0) self.assertEqual(result, "um vírgula zero") - - # Test multiple decimal places - result = pronounce_number_pt(1.23) - self.assertIn("vírgula", result) + result = PT_PT.pronounce_number(1.00) + self.assertEqual(result, "um vírgula zero zero") + result = PT_PT.pronounce_number(1.000) + self.assertEqual(result, "um vírgula zero zero zero") def test_conjunction_logic(self): """Test conjunction logic for complex numbers.""" - result = pronounce_number_pt(1001) + result = PT_PT.pronounce_number(1001) self.assertIn("e", result) # Should have conjunction for small remainder - result = pronounce_number_pt(1100) + result = PT_PT.pronounce_number(1100) self.assertIn("e", result) # Should have conjunction for multiple of 100 def test_mil(self): """Test 'um mil' """ - result = pronounce_number_pt(1000) + result = PT_PT.pronounce_number(1000) # Should not start with "um mil" but just "mil" self.assertFalse(result.startswith("um mil")) def test_places_parameter(self): """ - Test that the `places` parameter in `pronounce_number_pt` correctly limits the number of decimal places pronounced when using digit-by-digit pronunciation. + Test that the `places` parameter in `PT_PT.pronounce_number` correctly limits the number of decimal places pronounced when using digit-by-digit pronunciation. Ensures that specifying different values for `places` produces valid string outputs without errors. 
""" - result1 = pronounce_number_pt(1.23456, places=2, digits=DigitPronunciation.DIGIT_BY_DIGIT) - result2 = pronounce_number_pt(1.23456, places=5, digits=DigitPronunciation.DIGIT_BY_DIGIT) + result1 = PT_PT.pronounce_number(1.23456, places=2, digits=DigitPronunciation.DIGIT_BY_DIGIT) + result2 = PT_PT.pronounce_number(1.23456, places=5, digits=DigitPronunciation.DIGIT_BY_DIGIT) # Both should work without error self.assertIsInstance(result1, str) self.assertIsInstance(result2, str) class TestNumbersToDigitsPt(unittest.TestCase): - """Test numbers_to_digits_pt function.""" + """Test PT_PT.numbers_to_digits function.""" def test_simple_replacement(self): """Test simple number word replacement.""" - self.assertEqual(numbers_to_digits_pt("dezesseis", variant=PortugueseVariant.BR), "16") - self.assertEqual(numbers_to_digits_pt("dezasseis", variant=PortugueseVariant.PT), "16") + self.assertEqual(PT_PT.numbers_to_digits("dezesseis"), "16") + self.assertEqual(PT_PT.numbers_to_digits("dezasseis"), "16") def test_complex_numbers(self): """Test complex number phrase replacement.""" - result = numbers_to_digits_pt("duzentos e cinquenta e três") + result = PT_PT.numbers_to_digits("duzentos e cinquenta e três") self.assertEqual(result, "253") def test_mixed_text(self): """Test text with mixed words and numbers.""" - result = numbers_to_digits_pt("há duzentos e cinquenta carros") + result = PT_PT.numbers_to_digits("há duzentos e cinquenta carros") self.assertIn("250", result) self.assertIn("há", result) self.assertIn("carros", result) def test_multiple_numbers(self): """Test text with multiple separate numbers.""" - result = numbers_to_digits_pt("dez carros e cinco pessoas") + result = PT_PT.numbers_to_digits("dez carros e cinco pessoas") self.assertIn("10", result) self.assertIn("5", result) self.assertIn("carros", result) @@ -439,141 +438,95 @@ def test_multiple_numbers(self): def test_no_numbers(self): """Test text with no numbers.""" original = "apenas palavras normais" - 
result = numbers_to_digits_pt(original) + result = PT_PT.numbers_to_digits(original) self.assertEqual(result, original) def test_edge_cases(self): """Test edge cases.""" # Empty string - self.assertEqual(numbers_to_digits_pt(""), "") + self.assertEqual(PT_PT.numbers_to_digits(""), "") # Single word - self.assertEqual(numbers_to_digits_pt("cinco"), "5") + self.assertEqual(PT_PT.numbers_to_digits("cinco"), "5") # Just conjunction - self.assertEqual(numbers_to_digits_pt("e"), "e") + self.assertEqual(PT_PT.numbers_to_digits("e"), "e") def test_variant_differences(self): """Test that variants produce different results where expected.""" - br_result = numbers_to_digits_pt("dezesseis", variant=PortugueseVariant.BR) - pt_result = numbers_to_digits_pt("dezasseis", variant=PortugueseVariant.PT) + br_result = PT_BR.numbers_to_digits("dezesseis") + pt_result = PT_PT.numbers_to_digits("dezasseis") self.assertEqual(br_result, "16") self.assertEqual(pt_result, "16") -class TestTokenize(unittest.TestCase): - """Test tokenize function.""" - - def test_basic_tokenization(self): - """Test basic word tokenization.""" - result = tokenize("palavra uma palavra duas") - expected = ["palavra", "uma", "palavra", "duas"] - self.assertEqual(result, expected) - - def test_percentage_split(self): - """Test splitting percentages.""" - result = tokenize("12%") - self.assertEqual(result, ["12", "%"]) - - def test_hash_number_split(self): - """Test splitting hash with numbers.""" - result = tokenize("#1") - self.assertEqual(result, ["#", "1"]) - - def test_hyphen_between_words(self): - """Test hyphen handling between words.""" - result = tokenize("amo-te") - self.assertEqual(result, ["amo", "-", "te"]) - - def test_hyphen_preservation_in_numbers(self): - """Test that hyphens in numbers are preserved.""" - result = tokenize("1-2") - # Should not split number ranges - self.assertIn("1-2", result) - - def test_trailing_hyphen_removal(self): - """Test removal of trailing hyphens.""" - result = 
tokenize("palavra -") - self.assertEqual(result, ["palavra"]) - - def test_empty_string(self): - """Test tokenization of empty string.""" - result = tokenize("") - self.assertEqual(result, []) - - def test_whitespace_handling(self): - """Test handling of various whitespace.""" - result = tokenize(" palavra outra ") - self.assertEqual(result, ["palavra", "outra"]) - - def test_complex_input(self): - """Test complex input with multiple patterns.""" - result = tokenize("amo-te 50% #2 test") - expected_elements = ["amo", "-", "te", "50", "%", "#", "2", "test"] - self.assertEqual(result, expected_elements) - - class TestPronounceFractionPt(unittest.TestCase): - """Test pronounce_fraction_pt function.""" + """Test PT_PT.pronounce_fraction function.""" def test_simple_fractions(self): """Test pronunciation of simple fractions.""" - result = pronounce_fraction_pt("1/2") + result = PT_PT.pronounce_fraction("1/2") self.assertIn("um", result) self.assertIn("meio", result) - result = pronounce_fraction_pt("1/3") + result = PT_PT.pronounce_fraction("1/3") self.assertIn("um", result) self.assertIn("terço", result) def test_plural_fractions(self): """Test pronunciation of plural fractions.""" - result = pronounce_fraction_pt("2/3") + result = PT_PT.pronounce_fraction("2/3") self.assertIn("dois", result) self.assertIn("terços", result) - result = pronounce_fraction_pt("3/4") + result = PT_PT.pronounce_fraction("3/4") self.assertIn("três", result) self.assertIn("quartos", result) + def test_zero_division(self): + """Test pronunciation of a fraction with a zero denominator.""" + result = PT_PT.pronounce_fraction("0/0") + self.assertIn("zero", result) + self.assertIn(PT_PT.vocab.DIVIDED_BY_ZERO, result) + def test_large_denominators(self): """Test fractions with large denominators.""" - result = pronounce_fraction_pt("1/7") + result = PT_PT.pronounce_fraction("1/7") self.assertIn("um", result) self.assertIn("sétimo", result) - result = pronounce_fraction_pt("5/7") + result =
PT_PT.pronounce_fraction("5/7") self.assertIn("cinco", result) self.assertIn("sétimos", result) def test_unknown_denominators(self): """Test fractions with denominators not in predefined list.""" - result = pronounce_fraction_pt("1/13") + result = PT_PT.pronounce_fraction("1/13") self.assertIn("um", result) # Should use "avos" for unknown denominators - result = pronounce_fraction_pt("2/13") + result = PT_PT.pronounce_fraction("2/13") self.assertIn("dois", result) self.assertIn("avos", result) def test_variant_differences(self): """Test variant differences in fraction pronunciation.""" - br_result = pronounce_fraction_pt("1/16", variant=PortugueseVariant.BR) - pt_result = pronounce_fraction_pt("1/16", variant=PortugueseVariant.PT) + br_result = PT_BR.pronounce_fraction("1/16") + pt_result = PT_PT.pronounce_fraction("1/16") # Both should work, may have slight differences in underlying number pronunciation self.assertIsInstance(br_result, str) self.assertIsInstance(pt_result, str) def test_scale_parameter(self): """Test scale parameter in fraction pronunciation.""" - result_short = pronounce_fraction_pt("1/1000000", scale=Scale.SHORT) - result_long = pronounce_fraction_pt("1/1000000", scale=Scale.LONG) + result_short = PT_PT.pronounce_fraction("1/1000000", scale=Scale.SHORT) + result_long = PT_PT.pronounce_fraction("1/1000000", scale=Scale.LONG) self.assertIsInstance(result_short, str) self.assertIsInstance(result_long, str) def test_zero_numerator(self): """Test fractions with zero numerator.""" - result = pronounce_fraction_pt("0/5") + result = PT_PT.pronounce_fraction("0/5") self.assertIn("zero", result) @@ -586,9 +539,9 @@ def test_round_trip_conversion(self): for num in test_numbers: # Convert number to text - text = pronounce_number_pt(num, variant=PortugueseVariant.BR) + text = PT_BR.pronounce_number(num) # Convert text back to number - extracted = extract_number_pt(text, variant=PortugueseVariant.BR) + extracted = PT_BR.extract_number(text) 
self.assertEqual(extracted, num, f"Round-trip failed for {num}: {text} -> {extracted}") def test_variant_consistency(self): @@ -597,13 +550,13 @@ def test_variant_consistency(self): for num in test_numbers: # Test BR variant - br_text = pronounce_number_pt(num, variant=PortugueseVariant.BR) - br_extracted = extract_number_pt(br_text, variant=PortugueseVariant.BR) + br_text = PT_BR.pronounce_number(num) + br_extracted = PT_BR.extract_number(br_text) self.assertEqual(br_extracted, num) # Test PT variant - pt_text = pronounce_number_pt(num, variant=PortugueseVariant.PT) - pt_extracted = extract_number_pt(pt_text, variant=PortugueseVariant.PT) + pt_text = PT_PT.pronounce_number(num) + pt_extracted = PT_PT.extract_number(pt_text) self.assertEqual(pt_extracted, num) def test_scale_consistency(self): @@ -612,15 +565,15 @@ def test_scale_consistency(self): for num in large_numbers: for scale in [Scale.SHORT, Scale.LONG]: - for variant in [PortugueseVariant.BR, PortugueseVariant.PT]: - text = pronounce_number_pt(num, scale=scale, variant=variant) - extracted = extract_number_pt(text, scale=scale, variant=variant) + for variant in [PT_PT, PT_BR]: + text = variant.pronounce_number(num, scale=scale) + extracted = variant.extract_number(text, scale=scale) print(text, extracted) self.assertEqual(extracted, num, - f"Scale consistency failed: {num} with {scale} and {variant}") + f"Scale consistency failed: {num} with {scale} and {variant.vocab.LANG}") def test_numbers_to_digits_integration(self): - """Test integration with numbers_to_digits_pt.""" + """Test integration with PT_PT.numbers_to_digits.""" test_phrases = [ "há duzentos e cinquenta carros", "comprei dezesseis livros", @@ -628,7 +581,7 @@ def test_numbers_to_digits_integration(self): ] for phrase in test_phrases: - result = numbers_to_digits_pt(phrase, variant=PortugueseVariant.BR) + result = PT_BR.numbers_to_digits(phrase) # Should contain digits and preserve non-number words self.assertIsInstance(result, str) 
self.assertTrue(any(char.isdigit() for char in result)) @@ -639,12 +592,12 @@ def test_error_handling_robustness(self): invalid_inputs = ["", " ", "xyz123", "palavra-palavra"] for invalid_input in invalid_inputs: - # extract_number_pt should return False for invalid input - result = extract_number_pt(invalid_input) + # PT_PT.extract_number should return False for invalid input + result = PT_PT.extract_number(invalid_input) self.assertFalse(result) - # numbers_to_digits_pt should handle gracefully - result = numbers_to_digits_pt(invalid_input) + # PT_PT.numbers_to_digits should handle gracefully + result = PT_PT.numbers_to_digits(invalid_input) self.assertIsInstance(result, str) def test_large_number_limits(self): @@ -653,11 +606,17 @@ def test_large_number_limits(self): # Should not raise exceptions try: - result = pronounce_number_pt(very_large) + result = PT_PT.pronounce_number(very_large) self.assertIsInstance(result, str) except Exception as e: self.fail(f"Large number pronunciation failed: {e}") + def test_negative(self): + text = PT_PT.pronounce_number(-234) + self.assertTrue(text.startswith("menos")) + self.assertEqual(PT_PT.extract_number(text), -234) # -234 != -166 + + self.assertEqual(PT_BR.extract_number("menos cinco"), -5) if __name__ == '__main__': unittest.main() diff --git a/tests/test_util.py b/tests/test_util.py index d35f5c4..523e482 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -3,7 +3,7 @@ from ovos_number_parser.util import ( Token, Scale, ReplaceableNumber, tokenize, partition_list, - invert_dict, is_numeric, look_for_fractions, convert_to_mixed_fraction + invert_dict, is_numeric, look_for_fractions, convert_to_mixed_fraction, word_tokenize ) @@ -824,6 +824,59 @@ def test_convert_edge_denominator_boundary(self): self.assertIsNone(result) + +class TestWordTokenize(unittest.TestCase): + """Test word_tokenize function.""" + + def test_basic_tokenization(self): + """Test basic word tokenization.""" + result = word_tokenize("palavra uma
palavra duas") + expected = ["palavra", "uma", "palavra", "duas"] + self.assertEqual(result, expected) + + def test_percentage_split(self): + """Test splitting percentages.""" + result = word_tokenize("12%") + self.assertEqual(result, ["12", "%"]) + + def test_hash_number_split(self): + """Test splitting hash with numbers.""" + result = word_tokenize("#1") + self.assertEqual(result, ["#", "1"]) + + def test_hyphen_between_words(self): + """Test hyphen handling between words.""" + result = word_tokenize("amo-te") + self.assertEqual(result, ["amo", "-", "te"]) + + def test_hyphen_preservation_in_numbers(self): + """Test that hyphens in numbers are preserved.""" + result = word_tokenize("1-2") + # Should not split number ranges + self.assertIn("1-2", result) + + def test_trailing_hyphen_removal(self): + """Test removal of trailing hyphens.""" + result = word_tokenize("palavra -") + self.assertEqual(result, ["palavra"]) + + def test_empty_string(self): + """Test tokenization of empty string.""" + result = word_tokenize("") + self.assertEqual(result, []) + + def test_whitespace_handling(self): + """Test handling of various whitespace.""" + result = word_tokenize(" palavra outra ") + self.assertEqual(result, ["palavra", "outra"]) + + def test_complex_input(self): + """Test complex input with multiple patterns.""" + result = word_tokenize("amo-te 50% #2 test") + expected_elements = ["amo", "-", "te", "50", "%", "#", "2", "test"] + self.assertEqual(result, expected_elements) + + if __name__ == '__main__': # Run the tests with verbose output unittest.main(verbosity=2)