
Commit

disambiguate based on stanza output
reynoldsnlp committed Jul 9, 2020
1 parent f045c0e commit b55a7d9
Showing 20 changed files with 234 additions and 81 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -143,8 +143,6 @@ print(phonetic_doc1)
| text | `str` | The original text of this token |
| misc | `str` | Miscellaneous annotations with regard to this token |
| lemmas | `Set[str]` | All possible lemmas, based on remaining readings |
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| readings | `List[Reading]` | List of readings not removed by the Constraint Grammar |
| removed\_readings | `List[Reading]` | List of readings removed by the Constraint Grammar |
| head | `int` | The id of the syntactic head of this token in the sentence, 1-based (0 is reserved for an artificial symbol that represents the root of the syntactic tree). |
| deprel | `str` | The dependency relation between this word and its syntactic head. Example: ‘nmod’. |
@@ -154,6 +152,8 @@ print(phonetic_doc1)
| stresses | `Set[str]` | All possible stressed wordforms, based on remaining readings |
| stressed | `str` | The original text of the sentence with stress marks |
| phonetic | `str` | The original text converted to phonetic transcription |
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| transliterate | `str` | The original text converted to Romanized Cyrillic (default=Scholarly) |
| cg3\_str | `str` | Analysis stream in the [VISL-CG3 format](https://visl.sdu.dk/cg3/single/#stream-vislcg) |
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
@@ -175,6 +175,7 @@ print(phonetic_doc1)
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |
| generate | `str` | Generate the wordform from this reading |
| replace\_tag | `None` | Replace a tag in this reading |
| does\_not\_conflict | `bool` | Determine whether reading from external tagset (e.g. Universal Dependencies) conflicts with this reading |

### `Subreading` object

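The new `does_not_conflict` row documents the method this commit uses in `udar/conversion/external2udar.py` (below) to drop readings that clash with an external analysis. A minimal sketch of that pattern, assuming `Document` is importable from the top-level package; the sentence and tag values are illustrative, not taken from the repo:

```python
# Sketch only: mirrors the r.does_not_conflict(oc_tok_tags, 'OC') call in
# udar/conversion/external2udar.py below.
import udar

doc = udar.Document('Мы читаем книги.')
for tok in doc:
    oc_tags = {'NOUN', 'sing'}  # tags from an external (OpenCorpora) analysis
    compatible = [r for r in tok.readings
                  if r.does_not_conflict(oc_tags, 'OC')]
```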
8 changes: 8 additions & 0 deletions dev/qa.sh
@@ -16,8 +16,16 @@ echo "Versions with which tests passed for this commit:" \
hfst-tokenize --version | grep hfst >> hfst_vislcg3_versions.txt
vislcg3 --version | grep VISL >> hfst_vislcg3_versions.txt

echo "Checking for unnecessary noqa's..."
egrep "^.{,76}[^\"]{3}# noqa: E501" test/*.py udar/**/*.py

echo "Running flake8..."
flake8 *.py test/**/*.py udar/**/*.py

echo "Running mypy..."
mypy udar

echo "Running pytest..."
pytest --cov=udar --cov-append --cov-report term-missing --doctest-modules

rm .coverage # can conflict with tox
1 change: 1 addition & 0 deletions docs/conf.py
@@ -28,6 +28,7 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.napoleon',
'sphinx_autodoc_typehints']
autoclass_content = 'both'
2 changes: 1 addition & 1 deletion test/test_convenience.py
@@ -81,7 +81,7 @@ def test_noun_distractors_NotImplementedError():
def test_readability():
d1 = Document('Афанасий сотрудничает со смешными корреспондентами.')
assert all(len(tok.readings) == 1 for tok in d1), d1.hfst_str()
r1 = convenience.readability_measures(d1)[1] # noqa: E501
r1 = convenience.readability_measures(d1)[1]
assert len(r1) == 6, r1
assert r1.matskovskij == 3.2248
assert r1.oborneva == 18.830000000000002
2 changes: 1 addition & 1 deletion test/test_features.py
@@ -47,7 +47,7 @@ def test_feature_keywords_declared_in_alphabetical_order():
def test_feature_keywords_are_exhaustive_for_dependencies():
"""Ensure that all arguments of dependent functions can be overridden."""
for name, feat in ALL.items():
ignore_keywords = {'has_tag', 'n'}
ignore_keywords = {'has_tag', 'n', 'method'}
parent_keywords = set(feat.default_kwargs).union(ignore_keywords)
posterity_keywords = _get_all_dependent_keyword_arguments(name)
assert name and posterity_keywords.issubset(parent_keywords)
30 changes: 30 additions & 0 deletions udar/conversion/OC_conflicts.py
@@ -0,0 +1,30 @@
OC_conflicts = {
# OC_tag: set of conflicting udar tags
'ADJF': {'N'},
'ADJS': {'N'},
'ADVB': {'A', 'CS', 'Pcle'},
'CONJ': {'Pron', 'N', 'Interj', 'Adv'},
'NOUN': {'A', 'Pr'},
'NPRO': {'N', 'Pred', 'Pcle', 'Det'},
'PREP': {'Interj', 'N', 'V'},
'VERB': {'A', 'Pred'},
'masc': {'Neu', 'Fem'},
'femn': {'Msc'},
'neut': {'Msc'},
'sing': {'Pl'},
'plur': {'Sg'},
'nomn': {'Acc', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'gent': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'datv': {'Nom', 'Acc', 'Gen', 'Loc', 'Ins', 'Voc'},
'accs': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'ablt': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Voc'},
'loct': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'voct': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Ins'},
'gen1': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'gen2': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'acc2': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'loc1': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'loc2': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'tran': {'IV'},
'intr': {'TV'},
}
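These set-based tables replace the regex constraints deleted from the conversion script below. A hedged sketch of the lookup such a table supports; the helper is hypothetical and not part of this commit (udar's actual entry point is `Reading.does_not_conflict`):

```python
# Hypothetical helper, not from this commit: returns True if any
# OpenCorpora tag rules out any udar tag under the table above.
def conflicts(oc_tags, udar_tags):
    return any(u_tag in OC_conflicts.get(oc_tag, set())
               for oc_tag in oc_tags
               for u_tag in udar_tags)

conflicts({'NOUN'}, {'A', 'Sg'})  # True: udar 'A' conflicts with OC 'NOUN'
conflicts({'NOUN'}, {'N', 'Sg'})  # False: a noun reading is compatible
```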
File renamed without changes.
30 changes: 30 additions & 0 deletions udar/conversion/UD_conflicts.py
@@ -0,0 +1,30 @@
UD_conflicts = {
# UD_tag: set of conflicting udar tags
'ADJ': {'N'},
'ADV': {'A', 'CS', 'Pcle'},
'CCONJ': {'Pron', 'N', 'Interj', 'Adv', 'CS'},
'SCONJ': {'Pron', 'N', 'Interj', 'Adv', 'CC'},
'NOUN': {'A', 'Pr'},
'PRON': {'N', 'Pred', 'Pcle', 'Det'},
'ADP': {'Interj', 'N', 'V'},
'VERB': {'A', 'Pred'},
'Masc': {'Neu', 'Fem'},
'Fem': {'Msc'},
'Neut': {'Msc'},
'Sing': {'Pl'},
'Plur': {'Sg'},
'Nom': {'Acc', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'Gen': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'},
'Dat': {'Nom', 'Acc', 'Gen', 'Loc', 'Ins', 'Voc'},
'Acc': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'},
'Ins': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Voc'},
'Loc': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'},
'Voc': {'Nom', 'Acc', 'Gen', 'Loc', 'Dat', 'Ins'},
# 'gen1': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'gen2': {'Nom', 'Acc', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'acc2': {'Nom', 'Gen', 'Loc', 'Dat', 'Ins', 'Voc'}, ??
# 'loc1': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'}, ??
# 'loc2': {'Nom', 'Acc', 'Gen', 'Dat', 'Ins', 'Voc'}, ??
'Tran': {'IV'},
'Intr': {'TV'},
}
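Given the commit title, the UD table presumably lets stanza's Universal Dependencies output drive the same reading filter. A sketch under that assumption; the `'UD'` tagset label is inferred from the parallel `'OC'` call in `external2udar.py` and is not shown in this commit:

```python
# Assumption-laden sketch: the 'UD' label and the feats provenance are
# inferred, not confirmed by this diff.
def filter_by_ud(tok, ud_tags):
    """Keep readings of a udar Token compatible with a stanza/UD analysis."""
    return [r for r in tok.readings
            if r.does_not_conflict(ud_tags, 'UD')]

# e.g. ud_tags = {'NOUN', 'Sing', 'Nom'} from stanza's upos + feats
```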
Empty file added udar/conversion/__init__.py
44 changes: 6 additions & 38 deletions udar/util/OC2udar.py → udar/conversion/external2udar.py
@@ -3,7 +3,6 @@
import argparse
from glob import glob
import os
import re
import sys

from bs4 import BeautifulSoup # type: ignore
@@ -18,9 +17,12 @@
HOME = os.path.expanduser('~')

parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tagset', type=str,
help='Which tagset is used in the input. Must be either '
'OC (opencorpora) or UD (universal dependencies)')
parser.add_argument('-i', '--input', type=str,
default=f'{HOME}/corpora/opencorpora/annot.opcorpora.no_ambig_strict.xml', # noqa: E501
help="path to opencorpora xml file")
help='path to opencorpora xml file')
parser.add_argument('-o', '--output-dir', type=str, default='corp/OC',
help='path to output directory. If it already exists, all '
'*.out files therein will be deleted.')
@@ -37,41 +39,8 @@ def readable_sent(sentence):
for t in sentence.tokens.find_all('token'))


OC2udar_constraints = {
# OC_tag: regex to match incompatible udar tag
'ADJF': r'\+N\+',
'ADJS': r'\+N\+',
'ADVB': r'\+A\+|\+CS|\+Pcle',
'CONJ': r'\+Pron|\+N\+|\+Interj|\+Adv',
'NOUN': r'\+A\+|\+Pr(?:$|\+)',
'NPRO': r'\+N\+|\+Pred(?:$|\s)|\+Pcle|\+Det',
'PREP': r'\+Interj|\+N\+|\+V\+',
'VERB': r'\+A\+|\+Pred(?:$|\s)',
'masc': r'\+Neu|\+Fem',
'femn': r'\+Msc',
'neut': r'\+Msc',
'sing': r'\+Pl',
'plur': r'\+Sg',
'nomn': r'\+Acc|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'gent': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'datv': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Ins|\+Voc',
'accs': r'\+Nom|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'ablt': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Dat|\+Voc',
'loct': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'voct': r'\+Nom|\+Acc|\+Gen|\+Loc|\+Dat|\+Ins',
'gen1': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'gen2': r'\+Nom|\+Acc|\+Loc|\+Dat|\+Ins|\+Voc',
'acc2': r'\+Nom|\+Gen|\+Loc|\+Dat|\+Ins|\+Voc',
'loc1': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'loc2': r'\+Nom|\+Acc|\+Gen|\+Dat|\+Ins|\+Voc',
'tran': r'\+IV',
'intr': r'\+TV',
}


if __name__ == '__main__':
args = parser.parse_args()

out_dir = args.output_dir + '/'
mistoken_dir = args.output_dir + '/mistoken/'
for d in (out_dir, mistoken_dir):
@@ -134,13 +103,12 @@ def readable_sent(sentence):
# print('\t', r.lemma, r, file=sys.stderr)
# for g in oc_tok_tags:
# print('\t\t', g,
# re.search(OC2udar_constraints[g],
# re.search(constraints[g],
# r.hfst_str()),
# file=sys.stderr)
new_readings = [r for r in u_tok.readings
if # r.lemma == oc_tok_lem and
all([not re.search(OC2udar_constraints.get(g, '#%@!&' * 99), r.hfst_str()) # noqa: E501
for g in oc_tok_tags])
r.does_not_conflict(oc_tok_tags, 'OC')
and 'Der' not in r and 'Lxc' not in r]
# assert len(new_readings) > 0, f'{u_tok}\n{oc_tok}\n{new_readings}' # noqa: E501
len_new_readings = len(new_readings)
7 changes: 4 additions & 3 deletions udar/features/absolute_length.py
@@ -7,6 +7,7 @@
from .features import add_to_ALL
from .features import ALL
from .features import MAX_SYLL
from .features import MOST_LIKELY
from .features import ms_feats
from .features import safe_ms_feat_name
from .features import safe_tag_name
@@ -164,17 +165,17 @@ def num_types(doc: Document, lower=True, rmv_punc=False) -> int:

@add_to_ALL('num_lemma_types', category='Absolute length')
def num_lemma_types(doc: Document, has_tag='', lower=False,
rmv_punc=False) -> int:
method=MOST_LIKELY, rmv_punc=False) -> int:
"""Count number of unique lemmas in a Document."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
if lower:
return len(set([lem.lower()
for t in toks
for lem in t.most_likely_lemmas]))
for lem in t.most_likely_lemmas(method=MOST_LIKELY)])) # noqa: E501
else:
return len(set([lem
for t in toks
for lem in t.most_likely_lemmas]))
for lem in t.most_likely_lemmas(method=MOST_LIKELY)])) # noqa: E501


def num_types_Tag(tag: str, doc: Document, lower=True, rmv_punc=False) -> int:
1 change: 1 addition & 0 deletions udar/features/features.py
@@ -11,6 +11,7 @@
from .feature_extractor import FeatureExtractor

MAX_SYLL = 8
MOST_LIKELY = 'stanza' # `method` argument to Token.most_likely_reading()
NaN = float('nan')
punc_re = r'[\\!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~]+'
vowel_re = r'[аэоуыяеёюиaeiou]' # TODO make latin vowels optional?
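The `MOST_LIKELY` constant threads a `method` argument through the feature modules changed below; per the inline comment it selects stanza-based disambiguation. A minimal usage sketch, assuming a built `Document`; the full set of accepted `method` values is not shown in this commit:

```python
# Sketch: 'stanza' is the MOST_LIKELY value defined above; the sentence
# is illustrative.
import udar

doc = udar.Document('Мы читаем книги.')
for tok in doc:
    reading = tok.most_likely_reading(method='stanza')
    lemmas = tok.most_likely_lemmas(method='stanza')
```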
5 changes: 3 additions & 2 deletions udar/features/lexical_familiarity.py
@@ -13,6 +13,7 @@
from .features import _get_kelly_dict
from .features import add_to_ALL
from .features import ALL
from .features import MOST_LIKELY
from .features import NaN
from .features import warn_about_irrelevant_argument

@@ -26,7 +27,7 @@ def num_words_at_lexmin_level(level, doc: Document) -> int:
lexmin_dict = _get_lexmin_dict()
return len([1 for tok in doc
if any(lexmin_dict.get(lem) == level
for lem in tok.most_likely_lemmas)]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for level in ['A1', 'A2', 'B1', 'B2']: # noqa: E305
name = f'num_words_at_lexmin_{level}'
this_partial = partial(num_words_at_lexmin_level, level) # type: ignore
@@ -69,7 +70,7 @@ def num_words_at_kelly_level(level, doc: Document) -> int:
kelly_dict = _get_kelly_dict()
return len([1 for tok in doc
if any(kelly_dict.get(lem) == level
for lem in tok.most_likely_lemmas)]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for level in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']: # noqa: E305
name = f'num_words_at_kelly_{level}'
this_partial = partial(num_words_at_kelly_level, level) # type: ignore
5 changes: 3 additions & 2 deletions udar/features/morphology.py
@@ -7,6 +7,7 @@
from .features import add_to_ALL
from .features import ALL
from .features import ms_feats
from .features import MOST_LIKELY
from .features import NaN
from .features import safe_ms_feat_name
from .features import safe_tag_name
@@ -24,7 +25,7 @@ def num_types_ms_feat(ms_feat: str, doc: Document, rmv_punc=False) -> int:
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
counter = 0
for tok in toks:
for tag in tok.most_likely_reading.grouped_tags:
for tag in tok.most_likely_reading(method=MOST_LIKELY).grouped_tags:
if tag.ms_feat == ms_feat:
counter += 1
break
@@ -45,7 +46,7 @@ def num_abstract_nouns(doc: Document, rmv_punc=True) -> int:
abstract_re = r'(?:ье|ие|ство|ация|ость|изм|изна|ота|ина|ика|ива)[¹²³⁴⁵⁶⁷⁸⁹⁰⁻]*$' # noqa: E501
return len([t for t in toks
if any(re.search(abstract_re, lem)
for lem in t.most_likely_lemmas)])
for lem in t.most_likely_lemmas(method=MOST_LIKELY))])


def tag_ms_feat_ratio_Tag(tag: str, doc: Document, rmv_punc=False,
5 changes: 3 additions & 2 deletions udar/features/normalized_length.py
@@ -8,6 +8,7 @@
from .features import _get_tix_morph_count_dict
from .features import add_to_ALL
from .features import ALL
from .features import MOST_LIKELY
from .features import NaN
from .features import vowel_re
from .features import warn_about_irrelevant_argument
@@ -155,7 +156,7 @@ def morphs_per_word(doc: Document, has_tag='', lower=False, rmv_punc=True,
try:
return mean(tix_morph_count_dict[lem]
for tok in toks
for lem in tok.most_likely_lemmas
for lem in tok.most_likely_lemmas(method=MOST_LIKELY)
if lem in tix_morph_count_dict)
except StatisticsError:
return zero_div_val
@@ -173,7 +174,7 @@ def max_morphs_per_word(doc: Document, has_tag='', lower=False, rmv_punc=True,
try:
return max(tix_morph_count_dict[lem]
for tok in toks
for lem in tok.most_likely_lemmas
for lem in tok.most_likely_lemmas(method=MOST_LIKELY)
if lem in tix_morph_count_dict)
except ValueError:
return zero_div_val
11 changes: 7 additions & 4 deletions udar/features/priors.py
@@ -13,6 +13,7 @@
from .features import _get_Sharoff_lem_freq_rank_dict
from .features import ALL
from .features import add_to_ALL
from .features import MOST_LIKELY
from .features import punc_re

side_effects = None # import this and get all the side effects for free!
@@ -72,10 +73,12 @@ def _filter_toks(doc: Document,
if has_tag:
if isinstance(has_tag, str) or isinstance(has_tag, Tag):
toks = [t for t in toks
if t.has_tag_in_most_likely_reading(has_tag)]
if t.has_tag_in_most_likely_reading(has_tag,
method=MOST_LIKELY)]
elif isinstance(has_tag, tuple):
toks = [t for t in toks
if any(t.has_tag_in_most_likely_reading(tag)
if any(t.has_tag_in_most_likely_reading(tag,
method=MOST_LIKELY)
for tag in has_tag)]
else:
raise NotImplementedError('has_tag argument must be a str or Tag, '
@@ -94,7 +97,7 @@ def _lemma_frequencies(doc: Document,
Sharoff_lem_freq_dict = _get_Sharoff_lem_freq_dict()
return [Sharoff_lem_freq_dict.get(lem, 0)
for t in toks
for lem in t.most_likely_lemmas]
for lem in t.most_likely_lemmas(method=MOST_LIKELY)]


@add_to_ALL('_lemma_frequency_ranks', category='_prior')
@@ -106,7 +109,7 @@ def _lemma_frequency_ranks(doc: Document,
Sharoff_lem_freq_rank_dict = _get_Sharoff_lem_freq_rank_dict()
return [Sharoff_lem_freq_rank_dict.get(lem, 0)
for t in toks
for lem in t.most_likely_lemmas]
for lem in t.most_likely_lemmas(method=MOST_LIKELY)]


@add_to_ALL('_token_frequencies', category='_prior')
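`_filter_toks` now forwards `method=MOST_LIKELY` for both shapes of `has_tag`: a single tag or a tuple of alternatives. A sketch of the two call shapes, per the `isinstance` checks above; `doc` is an already-built udar Document, and the tag values are illustrative:

```python
# 'N' and 'A' are udar tags as listed in the conflict tables earlier
# in this commit; the call pattern matches its use in the feature modules.
nouns = ALL['_filter_toks'](doc, has_tag='N', rmv_punc=True)
nominals = ALL['_filter_toks'](doc, has_tag=('N', 'A'), rmv_punc=True)
```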
2 changes: 2 additions & 0 deletions udar/misc.py
@@ -12,6 +12,8 @@

import stanza # type: ignore

# This module should not import anything from udar. Modules that need to
# import from udar should either be in convenience.py or in util/

__all__ = ['StressParams', 'Result', 'result_names', 'destress',
'compute_metrics', 'unspace_punct']