Normalizerを複数指定できるようにした！勝利！！！

lapras-inc · Feb 12, 2021 · c19024c
1 parent 71fe521
commit c19024c
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 9 deletions.
diff --git a/naivesearch/indexer/chunker.py b/naivesearch/indexer/chunker.py
@@ -6,9 +6,10 @@
 
 
 class CharacterChunker:
-    def __init__(self, formatter: Formatter):
-        self.formatter = formatter
+    def __init__(self, formatters: List[Formatter]):
+        self.formatters = formatters
 
     def __call__(self, x: str) -> List[str]:
-        s = self.formatter(x)
-        return list(s)
+        for formatter in self.formatters:
+            x = formatter(x)
+        return list(x)
diff --git a/naivesearch/indexer/test_chunker.py b/naivesearch/indexer/test_chunker.py
@@ -2,11 +2,14 @@
 from typing import Callable, List, Optional
 
 from .chunker import CharacterChunker
-from .formatter import UnicodeNormalizer
+from .formatter import LowerCaseNormalizer, UnicodeNormalizer
 
 
 class TestCharacterChunker:
     def test(self):
-        format_chunker = CharacterChunker(UnicodeNormalizer())
-        result = format_chunker('hello')
+        format_chunker = CharacterChunker([
+            UnicodeNormalizer(),
+            LowerCaseNormalizer(),
+        ])
+        result = format_chunker('Hello')
         assert result == ['h', 'e', 'l', 'l', 'o']
diff --git a/naivesearch/main.py b/naivesearch/main.py
@@ -14,8 +14,10 @@ def file_reader(filepath):
     index = InvertedIndex(
         file_reader(filepath),
         [
-            # BigramConverter(CharacterChunker(UnicodeNormalizer())),
-            BigramConverter(CharacterChunker(LowerCaseNormalizer()))
+            BigramConverter(CharacterChunker([
+                UnicodeNormalizer(),
+                LowerCaseNormalizer(),
+            ]))
         ]
     )
     return index