Merge pull request #1 from lapras-inc/topic-refactoring

preprocessor の compose インターフェースを実装
lapras-inc · Mar 12, 2021 · 43125c1
2 parents c19024c + 4ce9a4f
commit 43125c1
Show file tree

Hide file tree

Showing 18 changed files with 203 additions and 85 deletions.
diff --git a/naivesearch/__init__.py b/naivesearch/__init__.py
@@ -1,7 +1,5 @@
-from .indexer import InvertedIndex
 from .main import naivesearch
 
 __all__ = [
-    'InvertedIndex'
     'naivesearch'
 ]
diff --git a/naivesearch/indexer/__init__.py b/naivesearch/indexer/__init__.py
@@ -1,8 +1,11 @@
-from .inverted_index import Chunker, InvertedIndex, Reader
+from .inverted_index import InvertedIndex, Reader
+from .preprocess_composer import Formatter, Chunker, compose_preprocessors
 
 
 __all__ = [
     'Chunker',
+    'compose_preprocessors',
+    'Formatter',
     'InvertedIndex',
     'Reader',
 ]
diff --git a/naivesearch/indexer/chunker.py b/naivesearch/indexer/chunker.py
diff --git a/naivesearch/indexer/formatter.py b/naivesearch/indexer/formatter.py
diff --git a/naivesearch/indexer/inverted_index.py b/naivesearch/indexer/inverted_index.py
@@ -1,8 +1,8 @@
 import logging
-
 from collections import defaultdict
-from typing import Dict, List, Protocol, Iterable
-from .chunker import Chunker
+from typing import Dict, List, Protocol, Iterable, Union
+
+from .preprocess_composer import Chunker, Converter
 
 
 logger = logging.getLogger(__name__)
@@ -13,9 +13,13 @@ class Reader(Iterable[str]):
 
 
 class InvertedIndex:
-    chunkers: List[Chunker]
+    chunkers: List[Converter]
 
-    def __init__(self, reader: Reader, chunkers: List[Chunker]):
+    def __init__(
+            self,
+            reader: Reader,
+            chunkers: List[Union[Chunker, Converter]]
+    ):
         self.index: Dict[str, List[str]] = defaultdict(list)
         self.chunkers = chunkers
 
@@ -26,7 +30,7 @@ def __init__(self, reader: Reader, chunkers: List[Chunker]):
                     self.index[chunk].append(d)
         logger.info('Done indexing.')
 
-    def __getitem__(self, q):
+    def __getitem__(self, q) -> List[str]:
         chunks = []
         for chunker in self.chunkers:
             for chunk in chunker(q):

diff --git a/naivesearch/indexer/preprocess_composer.py b/naivesearch/indexer/preprocess_composer.py
@@ -0,0 +1,36 @@
+from typing import Callable, List, Optional, Protocol, Type
+
+
+class Formatter(Protocol):
+    def __init__(self, other: Optional['Formatter'] = None, **kwargs):
+        ...
+
+    def __call__(self, x: str) -> str:
+        ...
+
+
+class Chunker(Protocol):
+    def __init__(self, formatter: Optional[Formatter]):
+        ...
+
+    def __call__(self, x: str) -> List[str]:
+        ...
+
+
+class Converter(Protocol):
+    def __init__(self, chunker: Chunker):
+        ...
+
+    def __call__(self, x: str) -> List[str]:
+        ...
+
+
+def compose_preprocessors(
+        converter: Type[Converter],
+        chunker: Type[Chunker],
+        *formatters: Callable[..., Formatter],
+) -> Converter:
+    result = None
+    for x in reversed(formatters):
+        result = x(result)
+    return converter(chunker(result))
diff --git a/naivesearch/indexer/test_formatter.py b/naivesearch/indexer/test_formatter.py
diff --git a/naivesearch/indexer/test_inverted_index.py b/naivesearch/indexer/test_inverted_index.py
@@ -1,6 +1,10 @@
-from naivesearch import InvertedIndex
 from typing import Callable, List, Optional
 
+from naivesearch.indexer import InvertedIndex
+from naivesearch.preprocessors import UnicodeNormalizer, LowerCaseNormalizer
+from naivesearch.preprocessors import BigramConverter
+from naivesearch.preprocessors import CharacterChunker
+
 
 class TestInvertedIndex:
     def test_getitem(self):
@@ -34,3 +38,29 @@ def dummy_chunker2(x):
         assert 'good bye world' in index['good b']
         assert 'good morning world' not in index['good b']
 
+    def test_instantiate_composed_formatters(self):
+        jinkou1 = '人口'
+        jinkou2 = '⼈⼝'
+        upper = 'UPPER'
+        lower = 'upper'
+
+        assert jinkou1 != jinkou2
+
+        def reader():
+            yield jinkou1
+            yield upper
+            yield 'hello world'
+            yield 'good bye world'
+            yield 'good morning world'
+
+        index = InvertedIndex(
+            reader(),
+            [
+                BigramConverter(CharacterChunker(
+                    UnicodeNormalizer(LowerCaseNormalizer()),
+                ))
+            ]
+        )
+
+        assert jinkou1 in index[jinkou2]
+        assert upper in index[lower]
diff --git a/naivesearch/main.py b/naivesearch/main.py
@@ -1,23 +1,27 @@
-from naivesearch.indexer import InvertedIndex
-from naivesearch.indexer.formatter import UnicodeNormalizer, LowerCaseNormalizer
-from naivesearch.indexer.converter import BigramConverter
-from naivesearch.indexer.chunker import CharacterChunker
+from returns.curry import partial
 
+from naivesearch.indexer import compose_preprocessors, InvertedIndex
+from naivesearch.preprocessors import UnicodeNormalizer, LowerCaseNormalizer
+from naivesearch.preprocessors import BigramConverter
+from naivesearch.preprocessors import CharacterChunker
 
-def naivesearch(filepath: str):
-
-    def file_reader(filepath):
-        with open(filepath) as f:
-            for line in f.readlines():
-                yield line.strip()
 
+def naivesearch(filepath: str):
     index = InvertedIndex(
         file_reader(filepath),
         [
-            BigramConverter(CharacterChunker([
-                UnicodeNormalizer(),
-                LowerCaseNormalizer(),
-            ]))
+            compose_preprocessors(
+                BigramConverter,
+                CharacterChunker,
+                LowerCaseNormalizer,
+                partial(UnicodeNormalizer, form='NFKC'),
+            )
         ]
     )
     return index
+
+
+def file_reader(filepath):
+    with open(filepath) as f:
+        for line in f.readlines():
+            yield line.strip()
diff --git a/naivesearch/preprocessors/__init__.py b/naivesearch/preprocessors/__init__.py
@@ -0,0 +1,11 @@
+from .chunker import CharacterChunker
+from .converter import BigramConverter
+from .formatter import UnicodeNormalizer, LowerCaseNormalizer
+
+
+__all__ = [
+    'BigramConverter',
+    'CharacterChunker',
+    'LowerCaseNormalizer',
+    'UnicodeNormalizer',
+]
diff --git a/naivesearch/preprocessors/chunker.py b/naivesearch/preprocessors/chunker.py
@@ -0,0 +1,11 @@
+from typing import List, Optional
+
+from naivesearch.indexer import Formatter
+
+
+class CharacterChunker:
+    def __init__(self, formatter: Optional[Formatter]):
+        self.formatter = formatter
+
+    def __call__(self, x: str) -> List[str]:
+        return list(self.formatter(x) if self.formatter else x)
diff --git a/naivesearch/indexer/converter.py → naivesearch/preprocessors/converter.py b/naivesearch/indexer/converter.py → naivesearch/preprocessors/converter.py
@@ -1,10 +1,12 @@
+from typing import List
+
 from naivesearch.indexer import Chunker
 
 
 class BigramConverter:
     def __init__(self, chunker: Chunker):
         self.chuker = chunker
 
-    def __call__(self, x: str):
+    def __call__(self, x: str) -> List[str]:
         s = self.chuker(x)
         return s + [''.join(z) for z in zip(s[0:], s[1:])]
diff --git a/naivesearch/preprocessors/formatter.py b/naivesearch/preprocessors/formatter.py
@@ -0,0 +1,23 @@
+import unicodedata
+from typing import Optional
+
+from naivesearch.indexer import Formatter
+
+
+class UnicodeNormalizer:
+    def __init__(self, other: Optional[Formatter] = None, form: str = 'NFKC'):
+        self.other = other
+        self.form = form
+
+    def __call__(self, x: str) -> str:
+        x = self.other(x) if self.other else x
+        return unicodedata.normalize(self.form, x)
+
+
+class LowerCaseNormalizer:
+    def __init__(self, other: Optional[Formatter] = None):
+        self.other = other
+
+    def __call__(self, x: str) -> str:
+        x = self.other(x) if self.other else x
+        return x.lower()
diff --git a/naivesearch/indexer/test_chunker.py → naivesearch/preprocessors/test_chunker.py b/naivesearch/indexer/test_chunker.py → naivesearch/preprocessors/test_chunker.py
@@ -1,15 +1,11 @@
-from naivesearch import InvertedIndex
-from typing import Callable, List, Optional
-
 from .chunker import CharacterChunker
 from .formatter import LowerCaseNormalizer, UnicodeNormalizer
 
 
 class TestCharacterChunker:
     def test(self):
-        format_chunker = CharacterChunker([
-            UnicodeNormalizer(),
-            LowerCaseNormalizer(),
-        ])
+        format_chunker = CharacterChunker(
+            UnicodeNormalizer(LowerCaseNormalizer()),
+        )
         result = format_chunker('Hello')
         assert result == ['h', 'e', 'l', 'l', 'o']
diff --git a/naivesearch/indexer/test_converter.py → naivesearch/preprocessors/test_converter.py b/naivesearch/indexer/test_converter.py → naivesearch/preprocessors/test_converter.py
@@ -1,6 +1,3 @@
-from naivesearch import InvertedIndex
-from typing import Callable, List, Optional
-
 from .converter import BigramConverter
 from .formatter import UnicodeNormalizer
 

diff --git a/naivesearch/preprocessors/test_formatter.py b/naivesearch/preprocessors/test_formatter.py
@@ -0,0 +1,35 @@
+from unittest.mock import Mock
+
+from .formatter import UnicodeNormalizer, LowerCaseNormalizer
+
+
+class TestFormatter:
+    def test_unicode_normalizer(self):
+        formatter = UnicodeNormalizer()
+        lhs = '人口'
+        rhs = '⼈⼝'
+        assert lhs != rhs
+        assert formatter(lhs) == formatter(rhs)
+
+    def test_unicode_normalizer_instanciate_with_another_formatter(self):
+        lhs = '人口'
+        rhs = '⼈⼝'
+        another = Mock(return_value='人口')
+        formatter = UnicodeNormalizer(another)
+        assert lhs != rhs
+        assert formatter(lhs) == formatter(rhs)
+        assert another.called
+
+    def test_lower_case_normalizer(self):
+        formatter = LowerCaseNormalizer()
+        lhs = 'UPPER'
+        rhs = 'upper'
+        assert formatter(lhs) == formatter(rhs)
+
+    def test_lower_case_normalizer_instanciate_with_another_formatter(self):
+        lhs = 'UPPER'
+        rhs = 'upper'
+        another = Mock(return_value=lhs)
+        formatter = LowerCaseNormalizer(another)
+        assert formatter(lhs) == formatter(rhs)
+        assert another.called
diff --git a/poetry.lock b/poetry.lock