Skip to content

Commit

Permalink
Updates for current dictionary file
Browse files (browse the repository at this point in the history)
Mostly added some new Unicode characters and fixed some special cases
  • Loading branch information
haukex committed Feb 12, 2025
1 parent ee83f27 commit b4d9cb1
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 22 deletions.
27 changes: 17 additions & 10 deletions dict-check.pl
Original file line number Diff line number Diff line change
Expand Up @@ -138,17 +138,21 @@ ($url, $file)
'LATIN SMALL LETTER C WITH CEDILLA',
'LATIN CAPITAL LETTER S WITH CARON',
'LATIN SMALL LETTER A WITH RING ABOVE',
'LATIN SMALL LETTER L WITH STROKE',
'LATIN SMALL LETTER AE',
'GREEK SMALL LETTER ALPHA',
'GREEK SMALL LETTER LAMDA',
'GREEK CAPITAL LETTER OMEGA',
),
# ##### ##### Digits ##### #####
# Note subscript digits happen to be in sequence in Unicode (U+2080 - U+2089), but superscripts aren't!
'0-9',
"\N{SUBSCRIPT ZERO}-\N{SUBSCRIPT NINE}",
"\N{SUPERSCRIPT TWO}",
"\N{SUPERSCRIPT THREE}",
# Note subscript digits happen to be in sequence in Unicode, but superscripts aren't!
"\N{SUBSCRIPT ZERO}-\N{SUBSCRIPT NINE}", # U+2080 - U+2089
"\N{SUPERSCRIPT ZERO}", # U+2070
"\N{SUPERSCRIPT ONE}", # U+00B9
"\N{SUPERSCRIPT TWO}", # U+00B2
"\N{SUPERSCRIPT THREE}", # U+00B3
"\N{SUPERSCRIPT FOUR}-\N{SUPERSCRIPT NINE}", # U+2074 - U+2079
],
special => [
# ##### ##### Special Characters ##### #####
Expand All @@ -164,9 +168,12 @@ ($url, $file)
'MICRO SIGN',
'VULGAR FRACTION ONE HALF',
'MULTIPLICATION SIGN',
'DOT OPERATOR',
'EURO SIGN',
'POUND SIGN',
'REGISTERED SIGN',
'SUPERSCRIPT MINUS',
'MODIFIER LETTER REVERSED COMMA',
)
],
} );
Expand All @@ -186,11 +193,10 @@ ($url, $file)
# ##### ##### Special Sequences ##### #####
# characters we would otherwise treat specially
| (?> / \x20 \( \x20 / ) # "left parenthesis / ( /"
| (?> / [ ) [\] <> {} ] / ) # "left square bracket /[/" etc.
| (?> \( [<>] \) ) # "greater-than sign (>)" etc.
| (?> [<>] \x20* [0-9] ) # greater/less than a number
| (?> /:-\)/ ) # "Smiley"
| (?> / [ () [\] <> {} ] / ) # "left square bracket /[/" etc.
| (?> \( [<>] \) ) # "greater-than sign (>)" etc.
| (?> [<>] \x20* [0-9] (?!-) ) # greater/less than a number (special case "four column design <4-column>")
| (?> /:-\)/ ) # "Smiley"
# special characters occurring only once
| (?> / [ \\ \N{ACUTE ACCENT} ] / )
| (?> \( [ @ \N{CENT SIGN} \N{YEN SIGN} \N{COPYRIGHT SIGN} ] \) )
Expand Down Expand Up @@ -259,13 +265,14 @@ ($url, $file)
while (my $line = <$fh>) {
chomp($line); # remove trailing newline
next if $line=~/^\s*#/ || $line!~/\S/; # skip comments and blank lines
$line =~ s/\Q{f} [auch {pl}]/{f, auch pl}/; # special case: adjust b/c brackets don't normally contain braces
$line =~ s/hydro\K\xAD(?=magnetics)/-/; # special case: fix a soft hyphen
if ( $line =~ $LINE_GRAMMAR ) { # parse the line
my ($de, $en) = ($+{LEFT}, $+{RIGHT});
my @des = split m/\|/, $de;
my @ens = split m/\|/, $en;
@des == @ens or die "Did not get the same number of sub-entries in ".pp($line)."\n";
#say pp \@des, \@ens; # debugging, helps visualize runaway regex
# use the same regex as JS uses to get "annotations":
}
else {
warn "Failed to parse ".pp($line)."\n";
Expand Down
4 changes: 2 additions & 2 deletions src/workers/alphabet.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"_comment" : "DO NOT EDIT - this file is generated by dict-check.pl",
"re" : {
"special" : "[\\ \\!\\$\\%\\&\\+\\,\\-\\.\\/\\:\\=\\?\\~\\'\u2019\u2013\u00b0\u00a7\u2026\u00b5\u00bd\u00d7\u20ac\u00a3\u00ae]",
"word" : "[a-zA-Z\u00e4\u00eb\u00ef\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u00c1\u00c9\u00ce\u00e1\u00e9\u00ed\u00f3\u00e0\u00e8\u00ec\u00f2\u00e2\u00ea\u00ee\u00f4\u00fb\u00e3\u00f1\u012b\u00e7\u0160\u00e5\u00e6\u03b1\u03bb\u03a90-9\u2080-\u2089\u00b2\u00b3]"
"special" : "[\\ \\!\\$\\%\\&\\+\\,\\-\\.\\/\\:\\=\\?\\~\\'\u2019\u2013\u00b0\u00a7\u2026\u00b5\u00bd\u00d7\u22c5\u20ac\u00a3\u00ae\u207b\u02bd]",
"word" : "[a-zA-Z\u00e4\u00eb\u00ef\u00f6\u00fc\u00c4\u00d6\u00dc\u00df\u00c1\u00c9\u00ce\u00e1\u00e9\u00ed\u00f3\u00e0\u00e8\u00ec\u00f2\u00e2\u00ea\u00ee\u00f4\u00fb\u00e3\u00f1\u012b\u00e7\u0160\u00e5\u0142\u00e6\u03b1\u03bb\u03a90-9\u2080-\u2089\u2070\u00b9\u00b2\u00b3\u2074-\u2079]"
}
}
21 changes: 11 additions & 10 deletions src/workers/equiv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,26 +81,27 @@ const EQUIV :[string[], string[]][] = [
[ ['S'], ['Š'] ],
[ ['a'], ['å'] ],
[ ['ae'], ['æ'] ],
[ ['l'], ['ł'] ],
// greek letters
[ ['alpha'], ['α'] ],
[ ['lambda', 'lamda'], ['λ'] ],
[ ['omega', 'ohm'], ['Ω'] ],
// special chars
[ ['\'', '’'], [] ],
[ ['-', '–'], [] ],
[ ['\'', '’', 'ʽ'], [] ],
[ ['-', '–', '⁻'], [] ],
[ ['...', '…'], [] ],
[ ['"', '“', '”', '„'], [] ],
// other special sequences
[ ['0'], ['₀'] ],
[ ['1'], ['₁'] ],
[ ['0'], ['₀','⁰'] ],
[ ['1'], ['₁','¹'] ],
[ ['2'], ['₂','²'] ],
[ ['3'] , ['₃','³'] ],
[ ['4'], ['₄'] ],
[ ['5'], ['₅'] ],
[ ['6'], ['₆'] ],
[ ['7'], ['₇'] ],
[ ['8'], ['₈'] ],
[ ['9'], ['₉'] ],
[ ['4'], ['₄','⁴'] ],
[ ['5'], ['₅','⁵'] ],
[ ['6'], ['₆','⁶'] ],
[ ['7'], ['₇','⁷'] ],
[ ['8'], ['₈','⁸'] ],
[ ['9'], ['₉','⁹'] ],
[ ['1/2'], ['½'] ],
[ ['x'], ['×'] ],
[ ['(R)'], ['®'] ],
Expand Down

0 comments on commit b4d9cb1

Please sign in to comment.