Use tokenizer for fast transcribe function.

mideind · Aug 28, 2023 · a3a9055 · a3a9055
1 parent 312a3ea
commit a3a9055
Show file tree

Hide file tree

Showing 4 changed files with 188 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
+[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 [![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/)
 [![tests](https://github.com/mideind/Icespeak/actions/workflows/main.yml/badge.svg)]()
-[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 

diff --git a/src/icespeak/__init__.py b/src/icespeak/__init__.py
@@ -20,11 +20,15 @@
 """
 
 from .parser import GreynirSSMLParser
-from .transcribe import gssml
+from .transcribe import DefaultTranscriber, gssml
 from .tts import text_to_speech
 
+fast_transcribe = DefaultTranscriber.token_transcribe
+
 __all__ = (
     "GreynirSSMLParser",
     "text_to_speech",
-    "gssml"
+    "gssml",
+    "DefaultTranscriber",
+    "fast_transcribe",
 )
diff --git a/src/icespeak/settings.py b/src/icespeak/settings.py
@@ -122,7 +122,7 @@ class Keys(BaseModel):
 
     azure: Optional[AzureKey] = Field(default=None, description="Azure API key.")
     aws: Optional[AWSPollyKey] = Field(default=None, description="AWS Polly API key.")
-    google: Optional[dict[Any,Any]] = Field(
+    google: Optional[dict[Any, Any]] = Field(
         default=None, description="Path to Google API key file."
     )
 

diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py
@@ -32,11 +32,20 @@
 from re import Match
 
 from islenska.basics import ALL_CASES, ALL_GENDERS, ALL_NUMBERS
-from reynir import TOK, Greynir, Tok
+from reynir import Greynir
 from reynir.bindb import GreynirBin
 from reynir.simpletree import SimpleTree
-from tokenizer import Abbreviations
-from tokenizer.definitions import HYPHENS
+from tokenizer import TOK, Abbreviations, Tok, detokenize, tokenize
+from tokenizer.definitions import (
+    HYPHENS,
+    AmountTuple,
+    # CURRENCY_ABBREV,
+    # SI_UNITS,
+    # CURRENCY_SYMBOLS,
+    CurrencyTuple,
+    MeasurementTuple,
+    PunctuationTuple,
+)
 
 from .num import (
     ROMAN_NUMERALS,
@@ -99,6 +108,23 @@ def gssml(data: Any = None, *, type: str, **kwargs: Union[str, float]) -> str:
     )
 
 
+# Grammatical gender of the Icelandic names of currencies which are
+# NOT feminine ("kvk"). Any currency code missing from this table
+# is treated as feminine by _currency_to_gender().
+# Defined once at module level so it isn't rebuilt on every call.
+_NON_KVK_CURRENCIES: Mapping[str, GenderType] = {
+    # KK (masculine)
+    "USD": "kk",
+    "CHF": "kk",
+    "CAD": "kk",
+    # HK (neuter)
+    "GBP": "hk",
+    "JPY": "hk",
+    "PLN": "hk",
+    "CNY": "hk",
+    "RMB": "hk",
+    "ZAR": "hk",
+}
+
+
+def _currency_to_gender(code: str) -> GenderType:
+    """
+    Return the grammatical gender used when voicing
+    an amount in the currency with the given code.
+
+    The lookup is an exact (case-sensitive) match on `code`.
+    Codes not listed in _NON_KVK_CURRENCIES yield "kvk".
+    """
+    return _NON_KVK_CURRENCIES.get(code, "kvk")
+
+
 # Spell out how character names are pronounced in Icelandic
 _CHAR_PRONUNCIATION: Mapping[str, str] = {
     "a": "a",
@@ -176,24 +202,50 @@ def gssml(data: Any = None, *, type: str, **kwargs: Union[str, float]) -> str:
     "nóvember",
     "desember",
 )
-_DATE_REGEXES = (
-    # Matches e.g. "1986-03-07"
-    re.compile(r"(?P<year>\d{1,4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})"),
-    # Matches e.g. "1/4/2001"
-    re.compile(r"(?P<day>\d{1,2})/(?P<month>\d{1,2})/(?P<year>\d{1,4})"),
-    # Matches e.g. "25. janúar 1999" or "25 des."
-    re.compile(
-        r"(?P<day>\d{1,2})\.? ?"
-        + r"(?P<month>(jan(úar|\.)?|feb(rúar|\.)?|mar(s|\.)?|"
-        + r"apr(íl|\.)?|maí\.?|jún(í|\.)?|"
-        + r"júl(í|\.)?|ágú(st|\.)?|sep(tember|\.)?|"
-        + r"okt(óber|\.)?|nóv(ember|\.)?|des(ember|\.)?))"  # 'month' capture group ends
-        + r"( (?P<year>\d{1,4}))?",  # Optional
-        flags=re.IGNORECASE,
+_DATE_REGEX = re.compile(
+    "|".join(
+        (  # TODO: This matches incorrect dates such as 1999-88-63 or 43/67/1999
+            # Matches e.g. "1986-03-07"
+            r"(?P<year1>\d{1,4})-(?P<month1>\d{1,2})-(?P<day1>\d{1,2})",
+            # Matches e.g. "1/4/2001"
+            r"(?P<day2>\d{1,2})/(?P<month2>\d{1,2})/(?P<year2>\d{1,4})",
+            # Matches e.g. "25. janúar 1999" or "25 des."
+            r"(?P<day3>\d{1,2})\.? ?"
+            + r"(?P<month3>(jan(úar|\.)?|feb(rúar|\.)?|mar(s|\.)?|"
+            + r"apr(íl|\.)?|maí\.?|jún(í|\.)?|"
+            + r"júl(í|\.)?|ágú(st|\.)?|sept?(ember|\.)?|"
+            + r"okt(óber|\.)?|nóv(ember|\.)?|des(ember|\.)?))"  # 'month' capture group ends
+            + r"( (?P<year3>\d{1,4}))?",  # Optional
+        )
     ),
+    flags=re.IGNORECASE,
 )
 
 
+def _date_from_match(match: re.Match[str], case: CaseType = "nf") -> str:
+    """
+    Voicify the date captured by a _DATE_REGEX match.
+
+    Every alternative in _DATE_REGEX has its own numbered capture groups
+    (day1/month1/year1, day2/..., day3/...), so the day, month and year
+    are taken from whichever alternative produced the match.
+
+    `case` is the declension passed on for the day ordinal.
+    Returns "<day> <month> <year>", or "<day> <month>"
+    if no year was captured.
+    """
+    gd = match.groupdict()
+
+    def _first(name: str) -> Optional[str]:
+        # First non-empty group among name1, name2, name3
+        return next(filter(None, (gd[f"{name}{i}"] for i in range(1, 4))), None)
+
+    day_val = _first("day")
+    month_val = _first("month")
+    year_val = _first("year")
+    if day_val is None or month_val is None:
+        # Can't happen for a _DATE_REGEX match (all alternatives
+        # require a day and a month), but fail explicitly if it does
+        raise ValueError(f"No day/month in date match: {match.group(0)!r}")
+
+    day = number_to_ordinal(day_val, gender="kk", case=case, number="et")
+    # Month names don't change in different declensions
+    if month_val.isdecimal():
+        # DD/MM/YYYY or YYYY-MM-DD specification
+        # NOTE: Out-of-range months are not validated here,
+        # see the TODO at the definition of _DATE_REGEX
+        month = _MONTH_NAMES[int(month_val) - 1]
+    else:
+        # Month written out or abbreviated.
+        # _DATE_REGEX is case-insensitive, so fold the three letter
+        # prefix to lowercase before looking it up
+        # (otherwise e.g. "Jan" or "DES" isn't found)
+        month = _MONTH_NAMES[_MONTH_ABBREVS.index(month_val[:3].lower())]
+
+    if year_val:
+        return f"{day} {month} {year_to_text(year_val)}"
+    return f"{day} {month}"
+
+
 def _split_substring_types(t: str) -> Iterable[str]:
     """
     Split text into alphabetic, decimal or
@@ -203,11 +255,8 @@ def _split_substring_types(t: str) -> Iterable[str]:
         list(_split_substring_types("hello world,123"))
         -> ["hello", " ", "world", ",", "123"]
     """
-
-    def f(c: str) -> int:
-        return c.isalpha() + 2 * c.isdecimal()
-
-    return ("".join(g) for _, g in itertools.groupby(t, key=f))
+    chartype2val: Callable[[str], int] = lambda c: c.isalpha() + 2 * c.isdecimal()
+    return ("".join(g) for _, g in itertools.groupby(t, key=chartype2val))
 
 
 # Matches letter followed by period or
@@ -221,7 +270,9 @@ def f(c: str) -> int:
 
 # Terms common in sentences which refer to results from sports
 _SPORTS_LEMMAS: frozenset[str] = frozenset(("leikur", "vinna", "tapa", "sigra"))
-
+_IGNORED_TOKENS = frozenset(
+    (TOK.PUNCTUATION, TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN)
+)
 _HYPHEN_SYMBOLS = frozenset(HYPHENS)
 
 _StrBool = Union[str, bool]
@@ -408,6 +459,7 @@ def digits(cls, txt: str) -> str:
     @_empty_str
     def phone(cls, txt: str) -> str:
         """Spell out a phone number."""
+        # TODO: "plús" for e.g. +354-5588-5522
         return cls.digits(txt)
 
     @classmethod
@@ -479,28 +531,13 @@ def _time_fmt(match: Match[str]) -> str:
     @_empty_str
     def date(cls, txt: str, case: CaseType = "nf") -> str:
         """Voicifies a date"""
-        for r in _DATE_REGEXES:
-            match = r.search(txt)
-            if match:
-                # Found match
-                start, end = match.span()
-                gd = match.groupdict()
-                day = number_to_ordinal(gd["day"], gender="kk", case=case, number="et")
-                mon: str = gd["month"]
-                # Month names don't change in different declensions
-                month = (
-                    _MONTH_NAMES[int(mon) - 1]  # DD/MM/YYYY specification
-                    if mon.isdecimal()
-                    else _MONTH_NAMES[_MONTH_ABBREVS.index(mon[:3])]  # Non-decimal
-                )
-                fmt_date = (
-                    f"{day} {month} {year_to_text(gd['year'])}"
-                    if gd["year"]
-                    else f"{day} {month}"
-                )
-                # Only replace date part, leave rest of string intact
-                txt = txt[:start] + fmt_date + txt[end:]
-                break
+        match = _DATE_REGEX.search(txt)
+        if match:
+            # Found match
+            start, end = match.span()
+            fmt_date = _date_from_match(match, case=case)
+            # Only replace date part, leave rest of string intact
+            txt = txt[:start] + fmt_date + txt[end:]
         return txt
 
     @classmethod
@@ -1087,3 +1124,101 @@ def paragraph(cls, txt: str) -> str:
     def sentence(cls, txt: str) -> str:
         """Sentence delimiter for speech synthesis."""
         return f"<s>{txt}</s>"
+
+    @classmethod
+    def token_transcribe(cls, text: str) -> str:
+        """
+        Quick transcription of Icelandic text for TTS.
+        Utilizes the tokenizer library.
+
+        The text is tokenized, the text of each token whose kind
+        has a handler below is replaced with its voicified form,
+        and the tokens are then joined back together with detokenize().
+        Tokens of a kind in _IGNORED_TOKENS, or of a kind
+        without a handler, are passed through unchanged.
+        """
+        # Materialize the tokens, they are modified in place
+        # before being handed to detokenize()
+        tokens: list[Tok] = list(tokenize(text))
+        for token in tokens:
+            kind = token.kind
+            if kind in _IGNORED_TOKENS:
+                continue
+
+            if kind == TOK.TIME:
+                token.txt = cls.time(token.txt)
+
+            elif kind in (TOK.DATE, TOK.DATEABS, TOK.DATEREL):
+                token.txt = cls.date(token.txt, case="þf")
+
+            elif kind in (TOK.TIMESTAMPABS, TOK.TIMESTAMPREL):
+                # Voicify the date part first, then the time part
+                token.txt = cls.time(cls.date(token.txt, case="þf"))
+
+            elif kind == TOK.YEAR:
+                token.txt = cls.year(token.integer)
+
+            elif kind == TOK.NUMBER:
+                token.txt = cls.float(token.number, case="nf", gender="hk")
+
+            elif kind == TOK.ORDINAL:
+                token.txt = cls.ordinal(token.ordinal, case="nf", gender="kk")
+
+            elif kind == TOK.NUMWLETTER:
+                # NOTE(review): PunctuationTuple is only used here as the
+                # type of the (number, letter) pair - confirm it matches
+                # the value the tokenizer stores for this token kind
+                num, letter = cast(PunctuationTuple, token.val)
+                token.txt = (
+                    cls.number(num, case="nf", gender="hk") + " " + cls.spell(letter)
+                )
+
+            elif kind == TOK.CURRENCY:
+                curr, _, _ = cast(CurrencyTuple, token.val)
+                token.txt = cls.currency(curr)
+
+            elif kind == TOK.AMOUNT:
+                num, curr, _, _ = cast(AmountTuple, token.val)
+                # The number agrees in gender with the currency name
+                token.txt = (
+                    cls.float(num, case="nf", gender=_currency_to_gender(curr))
+                    + " "
+                    + cls.currency(curr)
+                )
+
+            elif kind == TOK.PERCENT:
+                pass  # TODO
+
+            elif kind == TOK.MEASUREMENT:
+                # TODO: Not voicified yet, token text is left as-is
+                # TOK.MEASUREMENT: lambda tok, term: tok.txt, SI_UNITS in tokenizer
+                _unit, _num = cast(MeasurementTuple, token.val)
+
+            elif kind == TOK.TELNO:
+                token.txt = cls.phone(token.txt)
+
+            elif kind in (TOK.SSN, TOK.SERIALNUMBER):
+                token.txt = cls.digits(token.txt)
+
+            elif kind == TOK.URL:
+                protocol, _, domain = token.txt.partition("://")
+                # URLs without a "://" separator are left unchanged
+                if domain:
+                    # NOTE(review): the two parts are joined without
+                    # a separator - confirm this reads correctly
+                    token.txt = cls.spell(protocol) + cls.domain(domain)
+
+            elif kind == TOK.DOMAIN:
+                token.txt = cls.domain(token.txt)
+
+            elif kind == TOK.EMAIL:
+                token.txt = cls.email(token.txt)
+
+            elif kind == TOK.USERNAME:
+                token.txt = cls.username(token.txt)
+
+            elif kind == TOK.HASHTAG:
+                token.txt = f"myllumerki {token.txt.lstrip('#')}"
+
+            elif kind == TOK.MOLECULE:
+                token.txt = cls.molecule(token.txt)
+
+            elif kind == TOK.COMPANY:
+                token.txt = cls.entity(token.txt)
+
+        return detokenize(tokens)