Add TranscriptionOptions and fast_transcribe wrapper

mideind · Aug 30, 2023 · e4a4d76 · e4a4d76
1 parent 8bd373e
commit e4a4d76
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 50 deletions.
diff --git a/src/icespeak/__init__.py b/src/icespeak/__init__.py
@@ -20,7 +20,7 @@
 """
 
 from .parser import GreynirSSMLParser
-from .transcribe import DefaultTranscriber, gssml
+from .transcribe import DefaultTranscriber, TranscriptionOptions, fast_transcribe, gssml
 from .tts import AVAILABLE_VOICES, text_to_speech
 
 __all__ = (
@@ -29,4 +29,6 @@
     "gssml",
     "DefaultTranscriber",
     "AVAILABLE_VOICES",
+    "TranscriptionOptions",
+    "fast_transcribe",
 )
diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py
@@ -31,6 +31,7 @@
 from re import Match
 
 from islenska.basics import ALL_CASES, ALL_GENDERS, ALL_NUMBERS
+from pydantic import BaseModel, Extra, Field
 from reynir import Greynir
 from reynir.bindb import GreynirBin
 from tokenizer import TOK, Abbreviations, Tok, detokenize, tokenize
@@ -44,6 +45,9 @@
     PunctuationTuple,
 )
 
+from icespeak.settings import SETTINGS
+from icespeak.tts import AVAILABLE_VOICES
+
 from .num import (
     ROMAN_NUMERALS,
     CaseType,
@@ -110,6 +114,37 @@ def gssml(data: Any = None, *, type: str, **kwargs: str | float) -> str:
     )
 
 
+class TranscriptionOptions(BaseModel):
+    """
+    Settings for transcription.
+
+    Each field is a boolean switch for one category of token.
+    `token_transcribe` consults the matching switch before rewriting
+    a token of that category; when the switch is off, the token text
+    is left as it is.
+
+    Instances are frozen and unknown option names are forbidden
+    (see `model_config`).
+    """
+
+    # Use the literal "forbid" rather than the `Extra` enum: `model_config`
+    # is pydantic v2 configuration, and `Extra` is deprecated there
+    # (accessing `Extra.forbid` emits a deprecation warning).
+    model_config = {"frozen": True, "extra": "forbid"}
+
+    emails: bool = Field(default=True, description="Whether to transcribe emails.")
+    dates: bool = Field(default=True, description="Whether to transcribe dates.")
+    years: bool = Field(default=True, description="Whether to transcribe years.")
+    domains: bool = Field(default=True, description="Whether to transcribe domains.")
+    urls: bool = Field(default=True, description="Whether to transcribe URLs.")
+    amounts: bool = Field(
+        default=True,
+        description="Whether to transcribe amounts (number with currency).",
+    )
+    measurements: bool = Field(
+        default=True,
+        description="Whether to transcribe measurements "
+        + "(number with unit of measurement).",
+    )
+    percentages: bool = Field(
+        default=True, description="Whether to transcribe percentages."
+    )
+    # These are experimental, turned off by default
+    numbers: bool = Field(
+        default=False, description="Whether to transcribe (cardinal) numbers."
+    )
+    ordinals: bool = Field(
+        default=False, description="Whether to transcribe ordinal numbers."
+    )
+
+
 def _currency_to_gender(code: str) -> GenderType:
     non_kvk_currencies: Mapping[str, GenderType] = {
         # KK
@@ -706,7 +741,7 @@ def years(cls, txt: str) -> str:
         "*": "stjarna",
         "(": "vinstri svigi",
         ")": "hægri svigi",
-        "-": "bandstrik",  # TODO: or "mínus"?
+        "-": "bandstrik",  # in some cases this should be "mínus"
         "_": "niðurstrik",
         "=": "jafnt og merki",
         "+": "plús",
@@ -1343,11 +1378,14 @@ def sentence(cls, txt: str) -> str:
         return f"<s>{txt}</s>"
 
     @classmethod
-    def token_transcribe(cls, text: str):
+    def token_transcribe(
+        cls, text: str, options: TranscriptionOptions | None = None
+    ) -> str:
         """
         Quick transcription of Icelandic text for TTS.
         Utilizes the tokenizer library.
         """
+        opt: TranscriptionOptions = options if options else TranscriptionOptions()
         tokens: list[Tok] = list(tokenize(text))
         for token in tokens:
             if (
@@ -1367,19 +1405,19 @@ def token_transcribe(cls, text: str):
                 h, m, s = cast(DateTimeTuple, token.val)
                 token.txt = _time_to_text(h, m, s or None)
 
-            elif token.kind == TOK.DATE:
+            elif token.kind == TOK.DATE and opt.dates:
                 token.txt = cls.date(token.txt, case="þf")
 
-            elif token.kind == TOK.YEAR:
+            elif token.kind == TOK.YEAR and opt.years:
                 token.txt = cls.year(token.integer)
 
-            elif token.kind == TOK.NUMBER:
+            elif token.kind == TOK.NUMBER and opt.numbers:
                 token.txt = cls.float(token.number, case="nf", gender="hk")
 
             elif token.kind == TOK.TELNO:
                 token.txt = cls.phone(token.txt)
 
-            elif token.kind == TOK.PERCENT:
+            elif token.kind == TOK.PERCENT and opt.percentages:
                 percent, _, _ = cast(NumberTuple, token.val)
                 if "%" in token.txt:
                     token.txt = cls.float(percent, case="nf", gender="hk") + " prósent"
@@ -1389,19 +1427,19 @@ def token_transcribe(cls, text: str):
                 elif "‰" in token.txt:
                     token.txt = cls.float(percent, case="nf", gender="hk") + " prómill"
 
-            elif token.kind == TOK.URL:
+            elif token.kind == TOK.URL and opt.urls:
                 protocol, _, domain = token.txt.partition("://")
                 if domain:
                     token.txt = cls.spell(protocol) + cls.domain(domain)
 
-            elif token.kind == TOK.ORDINAL:
+            elif token.kind == TOK.ORDINAL and opt.ordinals:
                 token.txt = cls.ordinal(token.ordinal, case="þf", gender="kk")
 
             elif token.kind == TOK.CURRENCY:
                 curr, _, _ = cast(CurrencyTuple, token.val)
                 token.txt = cls.currency(curr)
 
-            elif token.kind == TOK.AMOUNT:
+            elif token.kind == TOK.AMOUNT and opt.amounts:
                 num, curr, _, _ = cast(AmountTuple, token.val)
                 curr = CURRENCY_SYMBOLS.get(curr, curr)
                 token.txt = (
@@ -1410,13 +1448,13 @@ def token_transcribe(cls, text: str):
                     + cls.currency(curr, number="ft" if _is_plural(num) else "et")
                 )
 
-            elif token.kind == TOK.EMAIL:
+            elif token.kind == TOK.EMAIL and opt.emails:
                 token.txt = cls.email(token.txt)
 
-            elif token.kind == TOK.DATEABS:
+            elif token.kind == TOK.DATEABS and opt.dates:
                 token.txt = cls.date(token.txt, case="þf")
 
-            elif token.kind == TOK.DATEREL:
+            elif token.kind == TOK.DATEREL and opt.dates:
                 token.txt = cls.date(token.txt, case="þf")
 
             elif token.kind == TOK.TIMESTAMPABS:
@@ -1425,7 +1463,7 @@ def token_transcribe(cls, text: str):
             elif token.kind == TOK.TIMESTAMPREL:
                 token.txt = cls.time(cls.date(token.txt, case="þf"))
 
-            elif token.kind == TOK.MEASUREMENT:
+            elif token.kind == TOK.MEASUREMENT and opt.measurements:
                 # We can't use token.val here because
                 # the tokenization converts everything to SI units :/
                 # unit, num = cast(MeasurementTuple, token.val)
@@ -1452,11 +1490,11 @@ def token_transcribe(cls, text: str):
                     cls.number(num, case="nf", gender="hk") + " " + cls.spell(letter)
                 )
 
-            elif token.kind == TOK.DOMAIN:
+            elif token.kind == TOK.DOMAIN and opt.domains:
                 token.txt = cls.domain(token.txt)
 
             elif token.kind == TOK.HASHTAG:
-                token.txt = f"myllumerki {token.txt.lstrip('#')}"
+                token.txt = "myllumerki " + token.txt.lstrip("#")
 
             elif token.kind == TOK.MOLECULE:
                 token.txt = cls.molecule(token.txt)
@@ -1474,3 +1512,16 @@ def token_transcribe(cls, text: str):
                 token.txt = cls.entity(token.txt)
 
         return detokenize(tokens)
+
+
+def fast_transcribe(
+    text: str,
+    voice: str = SETTINGS.DEFAULT_VOICE,
+    options: TranscriptionOptions | None = None,
+) -> str:
+    """
+    Simple wrapper for token-based transcription
+    of text for a specific TTS voice.
+
+    Looks up `voice` in AVAILABLE_VOICES and calls `token_transcribe`
+    on that entry's "Transcriber", falling back to DefaultTranscriber
+    when the entry's "Transcriber" value is falsy.
+    `options` is passed through unchanged.
+
+    NOTE: the default for `voice` is read from SETTINGS.DEFAULT_VOICE
+    once, when this function is defined, not on each call.
+    NOTE(review): a voice name missing from AVAILABLE_VOICES presumably
+    raises KeyError (assuming a plain mapping) - confirm.
+    """
+    # Voice-specific transcriber if one is registered, else the default one
+    t = AVAILABLE_VOICES[voice]["Transcriber"] or DefaultTranscriber
+    return t.token_transcribe(text, options)
diff --git a/tests/test_speech.py b/tests/test_speech.py
@@ -32,6 +32,7 @@
 from icespeak import text_to_speech
 from icespeak.settings import API_KEYS
 from icespeak.transcribe import DefaultTranscriber as DT
+from icespeak.transcribe import TranscriptionOptions
 from icespeak.tts import AVAILABLE_VOICES
 
 
@@ -969,12 +970,14 @@ def test_voice_breaks() -> None:
         assert n == f'<break strength="{s}" />'
 
 
-def test_token_transcribe() -> None:
+# Strip both ends of the text, then collapse every newline together with
+# the whitespace that follows it into a single space, so that the test
+# sentences below can be written as indented multi-line strings.
+_ws_re = re.compile(r"\n\s+")
+_fix_ws: Callable[[str], str] = lambda t: _ws_re.sub(" ", t.strip())
+
+
+def test_token_transcribe_basic() -> None:
     # Replace whitespace with single space in text
     # stretching over multiple lines
-    ws_re = re.compile(r"\n\s+")
-    ws_to_space: Callable[[str], str] = lambda t: ws_re.sub(" ", t.strip())
-    t = ws_to_space(
+    t = _fix_ws(
         """
         Frétt skrifuð þann 27. ágúst 2023 kl. 20:20.
         """
@@ -984,7 +987,7 @@ def test_token_transcribe() -> None:
         "tuttugasta og sjöunda ágúst tvö þúsund tuttugu og þrjú klukkan tuttugu tuttugu"
         in n
     )
-    t = ws_to_space(
+    t = _fix_ws(
         """
         t.d. var 249% munur á ódýrstu og dýrustu rauðrófunum,
         118% munur milli bökunarkartafla, 291% munur á grænum eplum,
@@ -998,35 +1001,53 @@ def test_token_transcribe() -> None:
         and "tvö hundruð níutíu og eitt prósent" in n
         and "tvö til þrjú prósent"
     )
+    t = _fix_ws(
+        """
+        Sendu tölvupóst á [email protected] og [email protected].
+        Kíktu svo á síðurnar is.wikipedia.org, ruv.is og bull.co.net.
+        """
+    )
+    n = DT.token_transcribe(t)
+    assert "@" not in n
+    assert "jon.gudm" not in n
+    assert " punktur " in n
+    assert " is " in n
+    assert "com" not in n
+    t = _fix_ws("Hvað eru 0,67cm í tommum?")
+    n = DT.token_transcribe(t)
+    assert "núll komma sextíu og sjö sentimetrar" in n
+
+
+def test_token_transcribe_experimental():
+    t_opts = TranscriptionOptions(numbers=True, ordinals=True)
     n = DT.token_transcribe(
         "sagðist hún vona að á næstu 10-20 árum "
-        + "yrði farið að nýta tæknina 9,2-5,3 prósent meira."
-    )
-    assert (
-        "tíu bandstrik tuttugu árum" in n
-        and "níu komma tvö bandstrik fimm komma þrjú prósent" in n
+        + "yrði farið að nýta tæknina 9,2-5,3 prósent meira.",
+        t_opts,
     )
-    t = ws_to_space(
+    assert "tíu bandstrik tuttugu árum" in n
+    assert "níu komma tvö bandstrik fimm komma þrjú prósent" in n
+    t = _fix_ws(
         """
         Frakkland - Marókkó á HM.
         Leikurinn var bráðfjörugur en það voru Frakkar
         sem voru sterkari og unnu þeir leikinn 2-0.
         """
     )
-    n = DT.token_transcribe(t)
+    n = DT.token_transcribe(t, t_opts)
     assert "Frakkland bandstrik Marókkó" in n and "tvö bandstrik núll" in n
-    t = ws_to_space(
+    t = _fix_ws(
         """
         2 eru slasaðir og um 1.500 fiskar dauðir eftir að um
         16 metra hátt fiskabúr í miðju Radisson hóteli
         í Berlín sprakk snemma í morgun.
         """
     )
-    n = DT.token_transcribe(t)
+    n = DT.token_transcribe(t, t_opts)
     # assert "tveir" in n
     assert "eitt þúsund og fimm hundruð" in n
     assert "sextán" in n
-    t = ws_to_space(
+    t = _fix_ws(
         """
         Dæmi eru um að nauðsynjavörur hafi nær tvöfaldast í verði á síðustu tveimur árum
         og enn hækka sumar vörur þrátt fyrir minni verðbólgu og sterkara gengi.
@@ -1051,12 +1072,12 @@ def test_token_transcribe() -> None:
         Hér er upphæð í eintölu: 21 kr.
         """
     )
-    n = DT.token_transcribe(t)
+    n = DT.token_transcribe(t, t_opts)
     assert "%" not in n and "prósent" in n
     assert not any(c.isdecimal() for c in n)
     assert "níu þúsund níu hundruð áttatíu og þrjár krónur" in n
     assert "tuttugu og ein króna" in n
-    t = ws_to_space(
+    t = _fix_ws(
         """
         Norðmaðurinn Jakob Ingebrigtsen átti stórkostlegan endasprett
         og tryggði sér heimsmeistaratitilinn í 5.000m hlaupi.
@@ -1066,25 +1087,10 @@ def test_token_transcribe() -> None:
         úr sekúndum á undan þeim spænska í mark. Jacob Krop frá Kenýa tók bronsið á 13:12.28.
         """
     )
-    n = DT.token_transcribe(t)
+    n = DT.token_transcribe(t, t_opts)
     assert "fimm þúsund metra" in n and "ellefu komma þrj" in n
-    t = ws_to_space(
-        """
-        Sendu tölvupóst á [email protected] og [email protected].
-        Kíktu svo á síðurnar is.wikipedia.org, ruv.is og bull.co.net.
-        """
-    )
-    n = DT.token_transcribe(t)
-    assert "@" not in n
-    assert "jon.gudm" not in n
-    assert " punktur " in n
-    assert " is " in n
-    assert "com" not in n
-    t = ws_to_space("Hvað eru 0,67cm í tommum?")
-    n = DT.token_transcribe(t)
-    assert "núll komma sextíu og sjö sentimetrar" in n
     t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi"
-    n = DT.token_transcribe(t)
+    n = DT.token_transcribe(t, t_opts)
     assert "Í fyrsta" in n
     # assert "öðru" in n
     assert "þriðja" in n