Merged

Test #217

31 changes: 20 additions & 11 deletions doc/hooks.md
@@ -114,6 +114,15 @@ after the hook function is executed. Possible return values are defined below
for each hook. Some special return values, such as `BREAK` and `CONT`, are
registered as constants under `scriptshifter.exceptions`.

### Note on running multiple functions on a hook

Currently, if multiple functions are defined for a hook, they are executed
in the order specified in the configuration. There is no way to skip a function
implicitly based on the outcome of the previous one. The only state passed
around in this context is the `ctx` instance of the `Transliterator` class.
This may change in the future as specific needs arise.
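
As a minimal sketch of this behavior (the function names, the `src_is_empty`
attribute, and the use of `ctx.src` are assumptions for illustration only),
two functions bound to the same hook can coordinate only through `ctx`:

```python
from scriptshifter.exceptions import BREAK


def detect_empty_post_config(ctx):
    # First function configured on the hook: record a finding on the
    # shared context for later functions on the same hook to use.
    ctx.src_is_empty = not ctx.src.strip()


def short_circuit_post_config(ctx):
    # Second function on the hook: it is never skipped implicitly, but it
    # can act on whatever the previous function left on `ctx`.
    if getattr(ctx, "src_is_empty", False):
        ctx.dest = ""
        return BREAK
```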


### Always available context members

The following members of the context object are available in all the hooks:
@@ -191,7 +200,7 @@ ignore term and when or when not to trigger a match.
at every character iteration. See "Cursor Flags" below.
- `ctx.dest_ls`: destination token list.

#### Output
#### Return

`CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
current ignore token. `BREAK` stops looking up ignore tokens for the current
@@ -217,7 +226,7 @@ scanning for more ignore tokens past the match.
- `ctx.ignoring`: whether an ignore token matched. If set to `False`, the rest
of the workflow will assume a non-match.

#### Output
#### Return

`CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
on looking up the ignore list. `BREAK` stops looking up ignore tokens for the
@@ -242,7 +251,7 @@ number of characters, and/or exit the text scanning loop altogether.
- `ctx.src_tk`: the input token being looked up.
- `ctx.dest_tk`: the transliterated string associated with the current token.

#### Output
#### Return

`CONT`, `BREAK`, or `None`. `CONT` skips the checks on the
current token. `BREAK` stops looking up all tokens for the current
@@ -269,7 +278,7 @@ also inject additional conditions and logic for the match, and revoke the
- `ctx.match`: whether there was a match. If set to `False`, the rest of the
workflow will assume a non-match.

#### Output
#### Return

`CONT`, `BREAK`, or `None`. `CONT` voids the match and keeps
on looking up the token list. `BREAK` stops looking up tokens for the
@@ -292,7 +301,7 @@ cursor position to the destination list, verbatim.
at every character iteration. See "Cursor Flags" below.
- `ctx.dest_ls`: destination token list.

#### Output
#### Return

`CONT`, `BREAK`, or `None`. `CONT` skips to the next position in the input
text. In this case, the function **must** advance the cursor. `BREAK` stops all
@@ -311,10 +320,10 @@ bypass any further output handling.

- `ctx.dest_ls`: destination token list.

#### Output
#### Return

A string or `None`. If the output is a string, the transliteration function
returns this string immediately; otherwise it proceeds with standard
`BREAK` or `None`. If `BREAK`, the content of `ctx.dest`, which should be set
by the function, is returned immediately; otherwise it proceeds with standard
adjustments and assembly of the output list.
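
A hypothetical sketch of this convention (the function name is invented;
`ctx.dest_ls` is the destination token list listed above):

```python
from scriptshifter.exceptions import BREAK


def join_verbatim_pre_assembly(ctx):
    # Assemble the output directly from the destination token list and,
    # by returning BREAK, have `ctx.dest` returned in place of the
    # standard assembly and adjustments.
    ctx.dest = "".join(ctx.dest_ls)
    return BREAK
```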

### `post_assembly`
@@ -333,9 +342,9 @@ and return it before any further default processing is done.

#### Output

String or `None`. If a string, the transliteration function returns that
immediately; otherwise it proceeds with standard adjustments of the output
string before returning.
`BREAK` or `None`. If `BREAK`, the transliteration function returns the content
of `ctx.dest` immediately; otherwise it proceeds with standard adjustments of
the output string before returning.
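
For example, `capitalize_post_assembly` and `normalize_spacing_post_assembly`
in `scriptshifter.hooks.general` (both updated in this diff) follow this
convention: they rewrite `ctx.dest` in place and return `None`, so standard
processing continues. A reduced sketch of the same pattern, with an invented
function name and substitution:

```python
def simplify_apostrophes_post_assembly(ctx):
    # Rewrite the assembled output string in place. Returning None
    # (implicitly) lets the standard adjustments run on the modified
    # `ctx.dest`.
    ctx.dest = ctx.dest.replace("\u02bb", "'")
```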

## Cursor flags

4 changes: 4 additions & 0 deletions scriptshifter/hooks/arabic/arabic_romanizer.py
@@ -1,5 +1,6 @@
from os import path
from sys import path as syspath
from unicodedata import normalize as precomp_normalize

from scriptshifter import APP_ROOT
from scriptshifter.exceptions import BREAK
@@ -28,4 +29,7 @@ def s2r_post_config(ctx):
loc_mappings,
loc_exceptional)

# TODO create separate hook.
ctx.dest = precomp_normalize("NFD", ctx.dest)

return BREAK
16 changes: 5 additions & 11 deletions scriptshifter/hooks/chinese/__init__.py
@@ -4,8 +4,6 @@
from logging import getLogger
from re import I, compile, search, sub

from scriptshifter.hooks.general import normalize_spacing_post_assembly


logger = getLogger(__name__)

Expand All @@ -21,7 +19,7 @@ def parse_numerals_pre_assembly(ctx):
tk_ct = len(ctx.dest_ls)
token_ptn = compile(r"^([A-Za-z]+)#([0-9]*)(\s*)$")

output = ""
output = []

# Use manual loop as i is manipulated inside it.
i = 0
@@ -36,7 +34,7 @@
# characters representing numbers are converted to Arabic
# numerals. When a non-numerical token (or end of string) is
# encountered, the string of numerical tokens is evaluated to
# determine which version should be used in the output string.
# determine which version should be used in the output.
# The outer loop then continues where the inner loop left off.
logger.debug(f"Match number: {tk_i}.")
text_v = num_v = ""
@@ -96,7 +94,7 @@
while search("[0-9] [0-9]", num_v):
num_v = sub("([0-9]) ([0-9])", r"\1\2", num_v)

output += num_v if use_num_v else text_v
output.append(num_v if use_num_v else text_v)

# if the end of the string is not reached, backtrack to the
# delimiter after the last numerical token (i.e. two tokens
@@ -117,16 +115,12 @@

else:
logger.debug(f"No numeric match: adding {tk_i}.")
output += tk_i
output.append(tk_i)

i += 1

logger.debug(f"Use num version: {use_num_v}")
ctx.dest = output

# Skip main transliterate function joining.

return normalize_spacing_post_assembly(ctx)
ctx.dest_ls = output


def person_name_pre_assembly(ctx):
12 changes: 8 additions & 4 deletions scriptshifter/hooks/general/__init__.py
@@ -5,13 +5,14 @@
from logging import getLogger
from re import compile

from scriptshifter.trans import MULTI_WS_RE

# Match multiple spaces.
MULTI_WS_RE = compile(r"(\s){2,}")

# Punctuation and brackets.
# TODO add angled brackets, opening and closing quotes, etc.
NORM1_RE = compile(r"\s([.,;:\)\]}])")
NORM2_RE = compile(r"([.,;:\)\]}])(\S)")
NORM2_RE = compile(r"([,;\)\]}])(\S)")
NORM3_RE = compile(r"([\(\[\{])\s")
NORM4_RE = compile(r"(\S)([\(\[\{])")

@@ -42,12 +43,15 @@ def capitalize_post_assembly(ctx):

dest_ls = _capitalize(dest_ls, ctx.options.get("capitalize"))

return " ".join(dest_ls)
ctx.dest = " ".join(dest_ls)


def normalize_spacing_post_assembly(ctx):
"""
Remove duplicate and unwanted whitespace around punctuation.

NOTE: This is called by default by transliterate() immediately after the
`post_assembly` hook.
"""
# De-duplicate whitespace.
logger.debug(f"Dest pre manipulation: {ctx.dest}")
@@ -70,7 +74,7 @@ def normalize_spacing_post_assembly(ctx):
# Remove multiple white space characters.
# norm = NORM8_RE.sub(r"\1\2", norm)

return norm
ctx.dest = norm


def _capitalize(src, which):
2 changes: 1 addition & 1 deletion scriptshifter/hooks/hebrew/dicta_api.py
@@ -27,6 +27,6 @@ def s2r_post_config(ctx):

ctx.dest = rsp.json().get("transliteration")
if ctx.dest:
ctx.dest = capitalize_post_assembly(ctx)
capitalize_post_assembly(ctx)

return BREAK
4 changes: 2 additions & 2 deletions scriptshifter/hooks/korean/romanizer.py
@@ -66,7 +66,7 @@ def s2r_nonames_post_config(ctx):
# FKR042: Capitalize all first letters
# FKR043: Capitalize the first letter
logger.debug(f"Before capitalization: {ctx.dest}")
ctx.dest = capitalize_post_assembly(ctx)
capitalize_post_assembly(ctx)

return BREAK

@@ -84,7 +84,7 @@ def s2r_names_post_config(ctx):
# FKR042: Capitalize all first letters
# FKR043: Capitalize the first letter
logger.debug(f"Before capitalization: {ctx.dest}")
ctx.dest = capitalize_post_assembly(ctx)
capitalize_post_assembly(ctx)

return BREAK

8 changes: 8 additions & 0 deletions scriptshifter/tables/__init__.py
@@ -362,6 +362,10 @@ def load_table(tname):
parents = tdata.get("general", {}).get("parents", [])

if "script_to_roman" in tdata:
# s2r and r2s sections may be empty, but here they need to be converted
# to empty dicts.
if tdata["script_to_roman"] is None:
tdata["script_to_roman"] = {}
if "double_cap" in tdata["script_to_roman"]:
tdata["script_to_roman"]["double_cap"] = tuple(
tdata["script_to_roman"]["double_cap"])
@@ -411,6 +415,10 @@ def load_table(tname):
tname, tdata["script_to_roman"])

if "roman_to_script" in tdata:
# s2r and r2s sections may be empty, but here they need to be converted
# to empty dicts.
if tdata["roman_to_script"] is None:
tdata["roman_to_script"] = {}
tokens = {}
for parent in parents:
parent_tdata = load_table(parent)