Skip to content

Commit

Permalink
Add TranscriptionOptions and fast_transcribe wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
sultur committed Aug 30, 2023
1 parent 8bd373e commit e4a4d76
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 50 deletions.
4 changes: 3 additions & 1 deletion src/icespeak/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""

from .parser import GreynirSSMLParser
from .transcribe import DefaultTranscriber, gssml
from .transcribe import DefaultTranscriber, TranscriptionOptions, fast_transcribe, gssml
from .tts import AVAILABLE_VOICES, text_to_speech

__all__ = (
Expand All @@ -29,4 +29,6 @@
"gssml",
"DefaultTranscriber",
"AVAILABLE_VOICES",
"TranscriptionOptions",
"fast_transcribe",
)
81 changes: 66 additions & 15 deletions src/icespeak/transcribe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from re import Match

from islenska.basics import ALL_CASES, ALL_GENDERS, ALL_NUMBERS
from pydantic import BaseModel, Extra, Field
from reynir import Greynir
from reynir.bindb import GreynirBin
from tokenizer import TOK, Abbreviations, Tok, detokenize, tokenize
Expand All @@ -44,6 +45,9 @@
PunctuationTuple,
)

from icespeak.settings import SETTINGS
from icespeak.tts import AVAILABLE_VOICES

from .num import (
ROMAN_NUMERALS,
CaseType,
Expand Down Expand Up @@ -110,6 +114,37 @@ def gssml(data: Any = None, *, type: str, **kwargs: str | float) -> str:
)


class TranscriptionOptions(BaseModel):
    """
    Settings for transcription.

    Each boolean field switches transcription of one category of token
    on or off. Instances are immutable (frozen), and unknown option
    names are rejected instead of being silently ignored.
    """

    # Use the literal "forbid" rather than the `pydantic.Extra` enum:
    # the enum is deprecated in Pydantic v2 and the literal is equivalent.
    model_config = {"frozen": True, "extra": "forbid"}

    # E-mail addresses
    emails: bool = Field(default=True, description="Whether to transcribe emails.")
    # Dates
    dates: bool = Field(default=True, description="Whether to transcribe dates.")
    # Years
    years: bool = Field(default=True, description="Whether to transcribe years.")
    # Domain names
    domains: bool = Field(default=True, description="Whether to transcribe domains.")
    # URLs
    urls: bool = Field(default=True, description="Whether to transcribe URLs.")
    # Amounts, i.e. a number together with a currency
    amounts: bool = Field(
        default=True,
        description="Whether to transcribe amounts (number with currency).",
    )
    # Measurements, i.e. a number together with a unit of measurement
    measurements: bool = Field(
        default=True,
        description="Whether to transcribe measurements "
        + "(number with unit of measurement).",
    )
    # Percentages
    percentages: bool = Field(
        default=True, description="Whether to transcribe percentages."
    )
    # These are experimental, turned off by default
    numbers: bool = Field(
        default=False, description="Whether to transcribe (cardinal) numbers."
    )
    ordinals: bool = Field(
        default=False, description="Whether to transcribe ordinal numbers."
    )


def _currency_to_gender(code: str) -> GenderType:
non_kvk_currencies: Mapping[str, GenderType] = {
# KK
Expand Down Expand Up @@ -706,7 +741,7 @@ def years(cls, txt: str) -> str:
"*": "stjarna",
"(": "vinstri svigi",
")": "hægri svigi",
"-": "bandstrik", # TODO: or "mínus"?
"-": "bandstrik", # in some cases this should be "mínus"
"_": "niðurstrik",
"=": "jafnt og merki",
"+": "plús",
Expand Down Expand Up @@ -1343,11 +1378,14 @@ def sentence(cls, txt: str) -> str:
return f"<s>{txt}</s>"

@classmethod
def token_transcribe(cls, text: str):
def token_transcribe(
cls, text: str, options: TranscriptionOptions | None = None
) -> str:
"""
Quick transcription of Icelandic text for TTS.
Utilizes the tokenizer library.
"""
opt: TranscriptionOptions = options if options else TranscriptionOptions()
tokens: list[Tok] = list(tokenize(text))
for token in tokens:
if (
Expand All @@ -1367,19 +1405,19 @@ def token_transcribe(cls, text: str):
h, m, s = cast(DateTimeTuple, token.val)
token.txt = _time_to_text(h, m, s or None)

elif token.kind == TOK.DATE:
elif token.kind == TOK.DATE and opt.dates:
token.txt = cls.date(token.txt, case="þf")

elif token.kind == TOK.YEAR:
elif token.kind == TOK.YEAR and opt.years:
token.txt = cls.year(token.integer)

elif token.kind == TOK.NUMBER:
elif token.kind == TOK.NUMBER and opt.numbers:
token.txt = cls.float(token.number, case="nf", gender="hk")

elif token.kind == TOK.TELNO:
token.txt = cls.phone(token.txt)

elif token.kind == TOK.PERCENT:
elif token.kind == TOK.PERCENT and opt.percentages:
percent, _, _ = cast(NumberTuple, token.val)
if "%" in token.txt:
token.txt = cls.float(percent, case="nf", gender="hk") + " prósent"
Expand All @@ -1389,19 +1427,19 @@ def token_transcribe(cls, text: str):
elif "‰" in token.txt:
token.txt = cls.float(percent, case="nf", gender="hk") + " prómill"

elif token.kind == TOK.URL:
elif token.kind == TOK.URL and opt.urls:
protocol, _, domain = token.txt.partition("://")
if domain:
token.txt = cls.spell(protocol) + cls.domain(domain)

elif token.kind == TOK.ORDINAL:
elif token.kind == TOK.ORDINAL and opt.ordinals:
token.txt = cls.ordinal(token.ordinal, case="þf", gender="kk")

elif token.kind == TOK.CURRENCY:
curr, _, _ = cast(CurrencyTuple, token.val)
token.txt = cls.currency(curr)

elif token.kind == TOK.AMOUNT:
elif token.kind == TOK.AMOUNT and opt.amounts:
num, curr, _, _ = cast(AmountTuple, token.val)
curr = CURRENCY_SYMBOLS.get(curr, curr)
token.txt = (
Expand All @@ -1410,13 +1448,13 @@ def token_transcribe(cls, text: str):
+ cls.currency(curr, number="ft" if _is_plural(num) else "et")
)

elif token.kind == TOK.EMAIL:
elif token.kind == TOK.EMAIL and opt.emails:
token.txt = cls.email(token.txt)

elif token.kind == TOK.DATEABS:
elif token.kind == TOK.DATEABS and opt.dates:
token.txt = cls.date(token.txt, case="þf")

elif token.kind == TOK.DATEREL:
elif token.kind == TOK.DATEREL and opt.dates:
token.txt = cls.date(token.txt, case="þf")

elif token.kind == TOK.TIMESTAMPABS:
Expand All @@ -1425,7 +1463,7 @@ def token_transcribe(cls, text: str):
elif token.kind == TOK.TIMESTAMPREL:
token.txt = cls.time(cls.date(token.txt, case="þf"))

elif token.kind == TOK.MEASUREMENT:
elif token.kind == TOK.MEASUREMENT and opt.measurements:
# We can't use token.val here because
# the tokenization converts everything to SI units :/
# unit, num = cast(MeasurementTuple, token.val)
Expand All @@ -1452,11 +1490,11 @@ def token_transcribe(cls, text: str):
cls.number(num, case="nf", gender="hk") + " " + cls.spell(letter)
)

elif token.kind == TOK.DOMAIN:
elif token.kind == TOK.DOMAIN and opt.domains:
token.txt = cls.domain(token.txt)

elif token.kind == TOK.HASHTAG:
token.txt = f"myllumerki {token.txt.lstrip('#')}"
token.txt = "myllumerki " + token.txt.lstrip("#")

elif token.kind == TOK.MOLECULE:
token.txt = cls.molecule(token.txt)
Expand All @@ -1474,3 +1512,16 @@ def token_transcribe(cls, text: str):
token.txt = cls.entity(token.txt)

return detokenize(tokens)


def fast_transcribe(
    text: str,
    voice: str = SETTINGS.DEFAULT_VOICE,
    options: TranscriptionOptions | None = None,
) -> str:
    """
    Simple wrapper for token-based transcription
    of text for a specific TTS voice.

    Looks up the entry for `voice` in AVAILABLE_VOICES and uses its
    "Transcriber" value, falling back to DefaultTranscriber when that
    value is falsy. The work is delegated to the chosen transcriber's
    `token_transcribe` method.

    Args:
        text: The text to transcribe.
        voice: Name of the TTS voice; used as a key into AVAILABLE_VOICES.
        options: Transcription options, passed through unchanged.
            None is forwarded as-is to `token_transcribe`.

    Returns:
        The transcribed text.

    Raises:
        KeyError: Presumably, when `voice` is not a key of AVAILABLE_VOICES
            (assuming it is a plain mapping; the lookup error is not
            caught here).
    """
    transcriber = AVAILABLE_VOICES[voice]["Transcriber"] or DefaultTranscriber
    return transcriber.token_transcribe(text, options)
74 changes: 40 additions & 34 deletions tests/test_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from icespeak import text_to_speech
from icespeak.settings import API_KEYS
from icespeak.transcribe import DefaultTranscriber as DT
from icespeak.transcribe import TranscriptionOptions
from icespeak.tts import AVAILABLE_VOICES


Expand Down Expand Up @@ -969,12 +970,14 @@ def test_voice_breaks() -> None:
assert n == f'<break strength="{s}" />'


def test_token_transcribe() -> None:
_ws_re = re.compile(r"\n\s+")
_fix_ws: Callable[[str], str] = lambda t: _ws_re.sub(" ", t.strip())


def test_token_transcribe_basic() -> None:
# Replace whitespace with single space in text
# stretching over multiple lines
ws_re = re.compile(r"\n\s+")
ws_to_space: Callable[[str], str] = lambda t: ws_re.sub(" ", t.strip())
t = ws_to_space(
t = _fix_ws(
"""
Frétt skrifuð þann 27. ágúst 2023 kl. 20:20.
"""
Expand All @@ -984,7 +987,7 @@ def test_token_transcribe() -> None:
"tuttugasta og sjöunda ágúst tvö þúsund tuttugu og þrjú klukkan tuttugu tuttugu"
in n
)
t = ws_to_space(
t = _fix_ws(
"""
t.d. var 249% munur á ódýrstu og dýrustu rauðrófunum,
118% munur milli bökunarkartafla, 291% munur á grænum eplum,
Expand All @@ -998,35 +1001,53 @@ def test_token_transcribe() -> None:
and "tvö hundruð níutíu og eitt prósent" in n
and "tvö til þrjú prósent"
)
t = _fix_ws(
"""
Sendu tölvupóst á [email protected] og [email protected].
Kíktu svo á síðurnar is.wikipedia.org, ruv.is og bull.co.net.
"""
)
n = DT.token_transcribe(t)
assert "@" not in n
assert "jon.gudm" not in n
assert " punktur " in n
assert " is " in n
assert "com" not in n
t = _fix_ws("Hvað eru 0,67cm í tommum?")
n = DT.token_transcribe(t)
assert "núll komma sextíu og sjö sentimetrar" in n


def test_token_transcribe_experimental():
t_opts = TranscriptionOptions(numbers=True, ordinals=True)
n = DT.token_transcribe(
"sagðist hún vona að á næstu 10-20 árum "
+ "yrði farið að nýta tæknina 9,2-5,3 prósent meira."
)
assert (
"tíu bandstrik tuttugu árum" in n
and "níu komma tvö bandstrik fimm komma þrjú prósent" in n
+ "yrði farið að nýta tæknina 9,2-5,3 prósent meira.",
t_opts,
)
t = ws_to_space(
assert "tíu bandstrik tuttugu árum" in n
assert "níu komma tvö bandstrik fimm komma þrjú prósent" in n
t = _fix_ws(
"""
Frakkland - Marókkó á HM.
Leikurinn var bráðfjörugur en það voru Frakkar
sem voru sterkari og unnu þeir leikinn 2-0.
"""
)
n = DT.token_transcribe(t)
n = DT.token_transcribe(t, t_opts)
assert "Frakkland bandstrik Marókkó" in n and "tvö bandstrik núll" in n
t = ws_to_space(
t = _fix_ws(
"""
2 eru slasaðir og um 1.500 fiskar dauðir eftir að um
16 metra hátt fiskabúr í miðju Radisson hóteli
í Berlín sprakk snemma í morgun.
"""
)
n = DT.token_transcribe(t)
n = DT.token_transcribe(t, t_opts)
# assert "tveir" in n
assert "eitt þúsund og fimm hundruð" in n
assert "sextán" in n
t = ws_to_space(
t = _fix_ws(
"""
Dæmi eru um að nauðsynjavörur hafi nær tvöfaldast í verði á síðustu tveimur árum
og enn hækka sumar vörur þrátt fyrir minni verðbólgu og sterkara gengi.
Expand All @@ -1051,12 +1072,12 @@ def test_token_transcribe() -> None:
Hér er upphæð í eintölu: 21 kr.
"""
)
n = DT.token_transcribe(t)
n = DT.token_transcribe(t, t_opts)
assert "%" not in n and "prósent" in n
assert not any(c.isdecimal() for c in n)
assert "níu þúsund níu hundruð áttatíu og þrjár krónur" in n
assert "tuttugu og ein króna" in n
t = ws_to_space(
t = _fix_ws(
"""
Norðmaðurinn Jakob Ingebrigtsen átti stórkostlegan endasprett
og tryggði sér heimsmeistaratitilinn í 5.000m hlaupi.
Expand All @@ -1066,25 +1087,10 @@ def test_token_transcribe() -> None:
úr sekúndum á undan þeim spænska í mark. Jacob Krop frá Kenýa tók bronsið á 13:12.28.
"""
)
n = DT.token_transcribe(t)
n = DT.token_transcribe(t, t_opts)
assert "fimm þúsund metra" in n and "ellefu komma þrj" in n
t = ws_to_space(
"""
Sendu tölvupóst á [email protected] og [email protected].
Kíktu svo á síðurnar is.wikipedia.org, ruv.is og bull.co.net.
"""
)
n = DT.token_transcribe(t)
assert "@" not in n
assert "jon.gudm" not in n
assert " punktur " in n
assert " is " in n
assert "com" not in n
t = ws_to_space("Hvað eru 0,67cm í tommum?")
n = DT.token_transcribe(t)
assert "núll komma sextíu og sjö sentimetrar" in n
t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi"
n = DT.token_transcribe(t)
n = DT.token_transcribe(t, t_opts)
assert "Í fyrsta" in n
# assert "öðru" in n
assert "þriðja" in n
Expand Down

0 comments on commit e4a4d76

Please sign in to comment.