
Commit

disambiguate based on stanza output
reynoldsnlp committed Jul 9, 2020
1 parent f045c0e commit b55a7d9
Showing 20 changed files with 234 additions and 81 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -143,8 +143,6 @@ print(phonetic_doc1)
| text | `str` | The original text of this token |
| misc | `str` | Miscellaneous annotations with regard to this token |
| lemmas | `Set[str]` | All possible lemmas, based on remaining readings |
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| readings | `List[Reading]` | List of readings not removed by the Constraint Grammar |
| removed\_readings | `List[Reading]` | List of readings removed by the Constraint Grammar |
| head | `int` | The id of the syntactic head of this token in the sentence, 1-based (0 is reserved for an artificial symbol that represents the root of the syntactic tree). |
| deprel | `str` | The dependency relation between this word and its syntactic head. Example: ‘nmod’. |
@@ -154,6 +152,8 @@ print(phonetic_doc1)
| stresses | `Set[str]` | All possible stressed wordforms, based on remaining readings |
| stressed | `str` | The original text of the sentence with stress marks |
| phonetic | `str` | The original text converted to phonetic transcription |
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| transliterate | `str` | The original text converted to Romanized Cyrillic (default=Scholarly) |
| cg3\_str | `str` | Analysis stream in the [VISL-CG3 format](https://visl.sdu.dk/cg3/single/#stream-vislcg) |
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
@@ -175,6 +175,7 @@ print(phonetic_doc1)
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
| generate | `str` | Generate the wordform from this reading |
| replace\_tag | `None` | Replace a tag in this reading |
| does\_not\_conflict | `bool` | Determine whether reading from external tagset (e.g. Universal Dependencies) conflicts with this reading |

### `Subreading` object

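The new `does_not_conflict` row documents the method this commit uses in `udar/conversion/external2udar.py` (below) to drop readings that clash with an external analysis. A minimal sketch of that pattern, assuming `Document` is importable from the top-level package; the sentence and tag values are illustrative, not taken from the repo:

```python
# Sketch only: mirrors the r.does_not_conflict(oc_tok_tags, 'OC') call in
# udar/conversion/external2udar.py below.
import udar

doc = udar.Document('Мы читаем книги.')
for tok in doc:
    oc_tags = {'NOUN', 'sing'}  # tags from an external (OpenCorpora) analysis
    compatible = [r for r in tok.readings
                  if r.does_not_conflict(oc_tags, 'OC')]
```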
8 changes: 8 additions & 0 deletions dev/qa.sh
@@ -16,8 +16,16 @@ echo "Versions with which tests passed for this commit:" \
hfst-tokenize --version | grep hfst >> hfst_vislcg3_versions.txt
vislcg3 --version | grep VISL >> hfst_vislcg3_versions.txt

echo "Checking for unnecessary noqa's..."
egrep "^.{,76}[^\"]{3}# noqa: E501" test/*.py udar/**/*.py

echo "Running flake8..."
flake8 *.py test/**/*.py udar/**/*.py

echo "Running mypy..."
mypy udar

echo "Running pytest..."
pytest --cov=udar --cov-append --cov-report term-missing --doctest-modules

rm .coverage # can conflict with tox
1 change: 1 addition & 0 deletions docs/conf.py
@@ -28,6 +28,7 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.napoleon',
'sphinx_autodoc_typehints']
autoclass_content = 'both'
2 changes: 1 addition & 1 deletion test/test_convenience.py
@@ -81,7 +81,7 @@ def test_noun_distractors_NotImplementedError():
def test_readability():
d1 = Document('Афанасий сотрудничает со смешными корреспондентами.')
assert all(len(tok.readings) == 1 for tok in d1), d1.hfst_str()
r1 = convenience.readability_measures(d1)[1] # noqa: E501
r1 = convenience.readability_measures(d1)[1]
assert len(r1) == 6, r1
assert r1.matskovskij == 3.2248
assert r1.oborneva == 18.830000000000002
2 changes: 1 addition & 1 deletion test/test_features.py
@@ -47,7 +47,7 @@ def test_feature_keywords_declared_in_alphabetical_order():
def test_feature_keywords_are_exhaustive_for_dependencies():
"""Ensure that all arguments of dependent functions can be overridden."""
for name, feat in ALL.items():
ignore_keywords = {'has_tag', 'n'}
ignore_keywords = {'has_tag', 'n', 'method'}
parent_keywords = set(feat.default_kwargs).union(ignore_keywords)
posterity_keywords = _get_all_dependent_keyword_arguments(name)
assert name and posterity_keywords.issubset(parent_keywords)
30 changes: 30 additions & 0 deletions udar/conversion/OC_conflicts.py
@@ -0,0 +1,30 @@
OC_conflicts = {
# OC_tag: set of conflicting udar tags
'ADJF': {'N'},
'ADJS': {'N'},
'ADVB': {'A', 'CS', 'Pcle'},
'CONJ': {'Pron', 'N', 'Interj', 'Adv'},
'NOUN': {'A', 'Pr'},
'NPRO': {'N', 'Pred', 'Pcle', 'Det'},
'PREP': {'Interj', 'N', 'V'},
'VERB': {'A', 'Pred'},
'masc': {'Neu', 'Fem'},
'femn': {'Msc'},
'neut': {'Msc'},
'sing': {'Pl'},
'plur': {'Sg'},
'nomn': {'Acc', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'gent': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'datv': {'Nom', 'Acc', 'Gen', 'Loc', 'Ins', 'Voc'},
'accs': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'ablt': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Voc'},
'loct': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'voct': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Ins'},
'gen1': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'gen2': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'acc2': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'loc1': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'loc2': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'tran': {'IV'},
'intr': {'TV'},
}
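These set-based tables replace the regex constraints deleted from the conversion script below. A hedged sketch of the lookup such a table supports; the helper is hypothetical and not part of this commit (udar's actual entry point is `Reading.does_not_conflict`):

```python
# Hypothetical helper, not from this commit: returns True if any
# OpenCorpora tag rules out any udar tag under the table above.
def conflicts(oc_tags, udar_tags):
    return any(u_tag in OC_conflicts.get(oc_tag, set())
               for oc_tag in oc_tags
               for u_tag in udar_tags)

conflicts({'NOUN'}, {'A', 'Sg'})  # True: udar 'A' conflicts with OC 'NOUN'
conflicts({'NOUN'}, {'N', 'Sg'})  # False: a noun reading is compatible
```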
File renamed without changes.
30 changes: 30 additions & 0 deletions udar/conversion/UD_conflicts.py
@@ -0,0 +1,30 @@
UD_conflicts = {
# UD_tag: set of conflicting udar tags
'ADJ': {'N'},
'ADV': {'A', 'CS', 'Pcle'},
'CCONJ': {'Pron', 'N', 'Interj', 'Adv', 'CS'},
'SCONJ': {'Pron', 'N', 'Interj', 'Adv', 'CC'},
'NOUN': {'A', 'Pr'},
'PRON': {'N', 'Pred', 'Pcle', 'Det'},
'ADP': {'Interj', 'N', 'V'},
'VERB': {'A', 'Pred'},
'Masc': {'Neu', 'Fem'},
'Fem': {'Msc'},
'Neut': {'Msc'},
'Sing': {'Pl'},
'Plur': {'Sg'},
'Nom': {'Acc', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'Gen': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'Dat': {'Nom', 'Acc', 'Gen', 'Loc', 'Ins', 'Voc'},
'Acc': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'Ins': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Voc'},
'Loc': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'Voc': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Ins'},
# 'gen1': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'gen2': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'acc2': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'loc1': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'}, ??
# 'loc2': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'}, ??
'Tran': {'IV'},
'Intr': {'TV'},
}
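Given the commit title, the UD table presumably lets stanza's Universal Dependencies output drive the same reading filter. A sketch under that assumption; the `'UD'` tagset label is inferred from the parallel `'OC'` call in `external2udar.py` and is not shown in this commit:

```python
# Assumption-laden sketch: the 'UD' label and the feats provenance are
# inferred, not confirmed by this diff.
def filter_by_ud(tok, ud_tags):
    """Keep readings of a udar Token compatible with a stanza/UD analysis."""
    return [r for r in tok.readings
            if r.does_not_conflict(ud_tags, 'UD')]

# e.g. ud_tags = {'NOUN', 'Sing', 'Nom'} from stanza's upos + feats
```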
Empty file added udar/conversion/__init__.py
44 changes: 6 additions & 38 deletions udar/util/OC2udar.py → udar/conversion/external2udar.py
@@ -3,7 +3,6 @@
import argparse
from glob import glob
import os
import re
import sys

from bs4 import BeautifulSoup # type: ignore
@@ -18,9 +17,12 @@
HOME = os.path.expanduser('~')

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tagset', type=str,
help='Which tagset is used in the input. Must be either '
'OC (opencorpora) or UD (universal dependencies)')
parser.add_argument('-i', '--input', type=str,
default=f'{HOME}/corpora/opencorpora/annot.opcorpora.no_ambig_strict.xml', # noqa: E501
help="path to opencorpora xml file")
help='path to opencorpora xml file')
parser.add_argument('-o', '--output-dir', type=str, default='corp/OC',
help='path to output directory. If it already exists, all '
'*.out files therein will be deleted.')
@@ -37,41 +39,8 @@ def readable_sent(sentence):
for t in sentence.tokens.find_all('token'))


OC2udar_constraints = {
# OC_tag: regex to match incompatible udar tag
'ADJF': r'\+N\+',
'ADJS': r'\+N\+',
'ADVB': r'\+A\+|\+CS|\+Pcle',
'CONJ': r'\+Pron|\+N\+|\+Interj|\+Adv',
'NOUN': r'\+A\+|\+Pr(?:$|\+)',
'NPRO': r'\+N\+|\+Pred(?:$|\s)|\+Pcle|\+Det',
'PREP': r'\+Interj|\+N\+|\+V\+',
'VERB': r'\+A\+|\+Pred(?:$|\s)',
'masc': r'\+Neu|\+Fem',
'femn': r'\+Msc',
'neut': r'\+Msc',
'sing': r'\+Pl',
'plur': r'\+Sg',
'nomn': r'\+Acc|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'gent': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'datv': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Ins|\+Voc',
'accs': r'\+Nom|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'ablt': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Dat|\+Voc',
'loct': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'voct': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Dat|\+Ins',
'gen1': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'gen2': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'acc2': r'\+Nom|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'loc1': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'loc2': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'tran': r'\+IV',
'intr': r'\+TV',
}


if __name__ == '__main__':
args = parser.parse_args()

out_dir = args.output_dir + '/'
mistoken_dir = args.output_dir + '/mistoken/'
for d in (out_dir, mistoken_dir):
@@ -134,13 +103,12 @@ def readable_sent(sentence):
# print('\t', r.lemma, r, file=sys.stderr)
# for g in oc_tok_tags:
# print('\t\t', g,
# re.search(OC2udar_constraints[g],
# re.search(constraints[g],
# r.hfst_str()),
# file=sys.stderr)
new_readings = [r for r in u_tok.readings
if # r.lemma == oc_tok_lem and
all([not re.search(OC2udar_constraints.get(g, '#%@!&' * 99), r.hfst_str()) # noqa: E501
for g in oc_tok_tags])
r.does_not_conflict(oc_tok_tags, 'OC')
and 'Der' not in r and 'Lxc' not in r]
# assert len(new_readings) > 0, f'{u_tok}\n{oc_tok}\n{new_readings}' # noqa: E501
len_new_readings = len(new_readings)
7 changes: 4 additions & 3 deletions udar/features/absolute_length.py
@@ -7,6 +7,7 @@
from .features import add_to_ALL
from .features import ALL
from .features import MAX_SYLL
from .features import MOST_LIKELY
from .features import ms_feats
from .features import safe_ms_feat_name
from .features import safe_tag_name
@@ -164,17 +165,17 @@ def num_types(doc: Document, lower=True, rmv_punc=False) -> int:

@add_to_ALL('num_lemma_types', category='Absolute length')
def num_lemma_types(doc: Document, has_tag='', lower=False,
rmv_punc=False) -> int:
method=MOST_LIKELY, rmv_punc=False) -> int:
"""Count number of unique lemmas in a Document."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
if lower:
return len(set([lem.lower()
for t in toks
for lem in t.most_likely_lemmas]))
for lem in t.most_likely_lemmas(method=MOST_LIKELY)])) # noqa: E501
else:
return len(set([lem
for t in toks
for lem in t.most_likely_lemmas]))
for lem in t.most_likely_lemmas(method=MOST_LIKELY)])) # noqa: E501


def num_types_Tag(tag: str, doc: Document, lower=True, rmv_punc=False) -> int:
1 change: 1 addition & 0 deletions udar/features/features.py
@@ -11,6 +11,7 @@
from .feature_extractor import FeatureExtractor

MAX_SYLL = 8
MOST_LIKELY = 'stanza' # `method` argument to Token.most_likely_reading()
NaN = float('nan')
punc_re = r'[\\!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~]+'
vowel_re = r'[аэоуыяеёюиaeiou]' # TODO make latin vowels optional?
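The `MOST_LIKELY` constant threads a `method` argument through the feature modules changed below; per the inline comment it selects stanza-based disambiguation. A minimal usage sketch, assuming a built `Document`; the full set of accepted `method` values is not shown in this commit:

```python
# Sketch: 'stanza' is the MOST_LIKELY value defined above; the sentence
# is illustrative.
import udar

doc = udar.Document('Мы читаем книги.')
for tok in doc:
    reading = tok.most_likely_reading(method='stanza')
    lemmas = tok.most_likely_lemmas(method='stanza')
```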
5 changes: 3 additions & 2 deletions udar/features/lexical_familiarity.py
@@ -13,6 +13,7 @@
from .features import _get_kelly_dict
from .features import add_to_ALL
from .features import ALL
from .features import MOST_LIKELY
from .features import NaN
from .features import warn_about_irrelevant_argument

@@ -26,7 +27,7 @@ def num_words_at_lexmin_level(level, doc: Document) -> int:
lexmin_dict = _get_lexmin_dict()
return len([1 for tok in doc
if any(lexmin_dict.get(lem) == level
for lem in tok.most_likely_lemmas)]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for level in ['A1', 'A2', 'B1', 'B2']: # noqa: E305
name = f'num_words_at_lexmin_{level}'
this_partial = partial(num_words_at_lexmin_level, level) # type: ignore
@@ -69,7 +70,7 @@ def num_words_at_kelly_level(level, doc: Document) -> int:
kelly_dict = _get_kelly_dict()
return len([1 for tok in doc
if any(kelly_dict.get(lem) == level
for lem in tok.most_likely_lemmas)]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for level in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']: # noqa: E305
name = f'num_words_at_kelly_{level}'
this_partial = partial(num_words_at_kelly_level, level) # type: ignore
5 changes: 3 additions & 2 deletions udar/features/morphology.py
@@ -7,6 +7,7 @@
from .features import add_to_ALL
from .features import ALL
from .features import ms_feats
from .features import MOST_LIKELY
from .features import NaN
from .features import safe_ms_feat_name
from .features import safe_tag_name
@@ -24,7 +25,7 @@ def num_types_ms_feat(ms_feat: str, doc: Document, rmv_punc=False) -> int:
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
counter = 0
for tok in toks:
for tag in tok.most_likely_reading.grouped_tags:
for tag in tok.most_likely_reading(method=MOST_LIKELY).grouped_tags:
if tag.ms_feat == ms_feat:
counter += 1
break
@@ -45,7 +46,7 @@ def num_abstract_nouns(doc: Document, rmv_punc=True) -> int:
abstract_re = r'(?:ье|ие|ство|ация|ость|изм|изна|ота|ина|ика|ива)[¹²³⁴⁵⁶⁷⁸⁹⁰⁻]*$' # noqa: E501
return len([t for t in toks
if any(re.search(abstract_re, lem)
for lem in t.most_likely_lemmas)])
for lem in t.most_likely_lemmas(method=MOST_LIKELY))])


def tag_ms_feat_ratio_Tag(tag: str, doc: Document, rmv_punc=False,
5 changes: 3 additions & 2 deletions udar/features/normalized_length.py
@@ -8,6 +8,7 @@
from .features import _get_tix_morph_count_dict
from .features import add_to_ALL
from .features import ALL
from .features import MOST_LIKELY
from .features import NaN
from .features import vowel_re
from .features import warn_about_irrelevant_argument
@@ -155,7 +156,7 @@ def morphs_per_word(doc: Document, has_tag='', lower=False, rmv_punc=True,
try:
return mean(tix_morph_count_dict[lem]
for tok in toks
for lem in tok.most_likely_lemmas
for lem in tok.most_likely_lemmas(method=MOST_LIKELY)
if lem in tix_morph_count_dict)
except StatisticsError:
return zero_div_val
@@ -173,7 +174,7 @@ def max_morphs_per_word(doc: Document, has_tag='', lower=False, rmv_punc=True,
try:
return max(tix_morph_count_dict[lem]
for tok in toks
for lem in tok.most_likely_lemmas
for lem in tok.most_likely_lemmas(method=MOST_LIKELY)
if lem in tix_morph_count_dict)
except ValueError:
return zero_div_val
11 changes: 7 additions & 4 deletions udar/features/priors.py
@@ -13,6 +13,7 @@
from .features import _get_Sharoff_lem_freq_rank_dict
from .features import ALL
from .features import add_to_ALL
from .features import MOST_LIKELY
from .features import punc_re

side_effects = None # import this and get all the side effects for free!
@@ -72,10 +73,12 @@ def _filter_toks(doc: Document,
if has_tag:
if isinstance(has_tag, str) or isinstance(has_tag, Tag):
toks = [t for t in toks
if t.has_tag_in_most_likely_reading(has_tag)]
if t.has_tag_in_most_likely_reading(has_tag,
method=MOST_LIKELY)]
elif isinstance(has_tag, tuple):
toks = [t for t in toks
if any(t.has_tag_in_most_likely_reading(tag)
if any(t.has_tag_in_most_likely_reading(tag,
method=MOST_LIKELY)
for tag in has_tag)]
else:
raise NotImplementedError('has_tag argument must be a str or Tag, '
@@ -94,7 +97,7 @@ def _lemma_frequencies(doc: Document,
Sharoff_lem_freq_dict = _get_Sharoff_lem_freq_dict()
return [Sharoff_lem_freq_dict.get(lem, 0)
for t in toks
for lem in t.most_likely_lemmas]
for lem in t.most_likely_lemmas(method=MOST_LIKELY)]


@add_to_ALL('_lemma_frequency_ranks', category='_prior')
@@ -106,7 +109,7 @@ def _lemma_frequency_ranks(doc: Document,
Sharoff_lem_freq_rank_dict = _get_Sharoff_lem_freq_rank_dict()
return [Sharoff_lem_freq_rank_dict.get(lem, 0)
for t in toks
for lem in t.most_likely_lemmas]
for lem in t.most_likely_lemmas(method=MOST_LIKELY)]


@add_to_ALL('_token_frequencies', category='_prior')
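`_filter_toks` now forwards `method=MOST_LIKELY` for both shapes of `has_tag`: a single tag or a tuple of alternatives. A sketch of the two call shapes, per the `isinstance` checks above; `doc` is an already-built udar Document, and the tag values are illustrative:

```python
# 'N' and 'A' are udar tags as listed in the conflict tables earlier
# in this commit; the call pattern matches its use in the feature modules.
nouns = ALL['_filter_toks'](doc, has_tag='N', rmv_punc=True)
nominals = ALL['_filter_toks'](doc, has_tag=('N', 'A'), rmv_punc=True)
```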
2 changes: 2 additions & 0 deletions udar/misc.py
@@ -12,6 +12,8 @@

import stanza # type: ignore

# This module should not import anything from udar. Modules that need to
# import from udar should either be in convenience.py or in util/

__all__ = ['StressParams', 'Result', 'result_names', 'destress',
'compute_metrics', 'unspace_punct']