-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from lapras-inc/topic-refactoring
preprocessor の compose インターフェースを実装
- Loading branch information
Showing
18 changed files
with
203 additions
and
85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,5 @@ | ||
from .indexer import InvertedIndex | ||
from .main import naivesearch | ||
|
||
# Names exported by ``from <package> import *``.
# Each entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently produce one bogus name.
__all__ = [
    'InvertedIndex',
    'naivesearch',
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,11 @@ | ||
from .inverted_index import Chunker, InvertedIndex, Reader | ||
from .inverted_index import InvertedIndex, Reader | ||
from .preprocess_composer import Formatter, Chunker, compose_preprocessors | ||
|
||
|
||
__all__ = [ | ||
'Chunker', | ||
'compose_preprocessors', | ||
'Formatter', | ||
'InvertedIndex', | ||
'Reader', | ||
] |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from typing import Callable, List, Optional, Protocol, Type | ||
|
||
|
||
class Formatter(Protocol):
    """Structural type for a string-to-string preprocessing step.

    A formatter is constructed with an optional other formatter plus
    arbitrary keyword options, and is called with a string to produce a
    string.
    """

    def __init__(self, other: Optional['Formatter'] = None, **kwargs):
        """Accept an optional other formatter and keyword options."""

    def __call__(self, x: str) -> str:
        """Return the formatted form of ``x``."""
|
||
|
||
class Chunker(Protocol):
    """Structural type for a step that splits a string into a list of strings.

    A chunker is constructed with a formatter (or ``None``) and is called
    with a string to produce a list of strings.
    """

    def __init__(self, formatter: Optional['Formatter']):
        """Accept the formatter (possibly ``None``) given to this chunker."""

    def __call__(self, x: str) -> List[str]:
        """Return the chunks of ``x`` as a list of strings."""
... | ||
|
||
|
||
class Converter(Protocol):
    """Structural type for a step that turns a string into a list of strings.

    A converter is constructed with a chunker and is called with a string
    to produce a list of strings.
    """

    def __init__(self, chunker: 'Chunker'):
        """Accept the chunker given to this converter."""

    def __call__(self, x: str) -> List[str]:
        """Return the converted form of ``x`` as a list of strings."""
... | ||
|
||
|
||
def compose_preprocessors(
    converter: Type['Converter'],
    chunker: Type['Chunker'],
    *formatters: Callable[..., 'Formatter'],
) -> 'Converter':
    """Assemble one converter from a converter, a chunker and formatters.

    The formatter factories are chained from last to first: the last
    factory is called with ``None``, and every earlier factory is called
    with the formatter built so far. The final formatter (``None`` when no
    factories are given) is passed to ``chunker``, and the resulting chunker
    is passed to ``converter``.

    Args:
        converter: Callable that builds a converter from a chunker.
        chunker: Callable that builds a chunker from a formatter or ``None``.
        *formatters: Callables that each build a formatter from the
            previously built formatter (or ``None`` for the last one).

    Returns:
        Whatever ``converter`` returns for the assembled chunker.
    """
    chained = None
    for make_formatter in formatters[::-1]:
        chained = make_formatter(chained)
    assembled_chunker = chunker(chained)
    return converter(assembled_chunker)
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,23 +1,27 @@ | ||
from naivesearch.indexer import InvertedIndex | ||
from naivesearch.indexer.formatter import UnicodeNormalizer, LowerCaseNormalizer | ||
from naivesearch.indexer.converter import BigramConverter | ||
from naivesearch.indexer.chunker import CharacterChunker | ||
from returns.curry import partial | ||
|
||
from naivesearch.indexer import compose_preprocessors, InvertedIndex | ||
from naivesearch.preprocessors import UnicodeNormalizer, LowerCaseNormalizer | ||
from naivesearch.preprocessors import BigramConverter | ||
from naivesearch.preprocessors import CharacterChunker | ||
|
||
def naivesearch(filepath: str): | ||
|
||
def file_reader(filepath):
    """Yield every line of a text file with surrounding whitespace removed.

    The file is opened in text mode with ``open``'s default encoding and is
    closed once the generator is exhausted or closed.

    Args:
        filepath: Path of the file to read.

    Yields:
        Each line of the file, in order, after ``str.strip()``.
    """
    with open(filepath) as f:
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole file into a list first.
        for line in f:
            yield line.strip()
|
||
def naivesearch(filepath: str):
    """Build and return an ``InvertedIndex`` for the file at ``filepath``.

    The index receives the generator returned by ``file_reader(filepath)``
    and a one-element list holding a preprocessor composed from
    ``BigramConverter``, ``CharacterChunker``, ``LowerCaseNormalizer`` and
    ``UnicodeNormalizer`` (with ``form='NFKC'``).
    """
    lines = file_reader(filepath)
    preprocessor = compose_preprocessors(
        BigramConverter,
        CharacterChunker,
        LowerCaseNormalizer,
        partial(UnicodeNormalizer, form='NFKC'),
    )
    return InvertedIndex(lines, [preprocessor])
|
||
|
||
def file_reader(filepath): | ||
with open(filepath) as f: | ||
for line in f.readlines(): | ||
yield line.strip() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from .chunker import CharacterChunker | ||
from .converter import BigramConverter | ||
from .formatter import UnicodeNormalizer, LowerCaseNormalizer | ||
|
||
|
||
__all__ = [ | ||
'BigramConverter', | ||
'CharacterChunker', | ||
'LowerCaseNormalizer', | ||
'UnicodeNormalizer', | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from typing import List, Optional | ||
|
||
from naivesearch.indexer import Formatter | ||
|
||
|
||
class CharacterChunker:
    """Chunker that splits a string into a list of its characters."""

    def __init__(self, formatter: Optional['Formatter']):
        """Store the formatter to apply before splitting (may be ``None``)."""
        self.formatter = formatter

    def __call__(self, x: str) -> List[str]:
        """Return the characters of ``x``, formatted first if a formatter is set."""
        if self.formatter:
            x = self.formatter(x)
        return list(x)
4 changes: 3 additions & 1 deletion
4
naivesearch/indexer/converter.py → naivesearch/preprocessors/converter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,12 @@ | ||
from typing import List | ||
|
||
from naivesearch.indexer import Chunker | ||
|
||
|
||
class BigramConverter:
    """Converter that emits the chunks of a string followed by their bigrams."""

    def __init__(self, chunker: 'Chunker'):
        """Store the chunker used to split input strings."""
        # Attribute name corrected from the misspelled ``chuker``.
        self.chunker = chunker

    def __call__(self, x: str) -> List[str]:
        """Return the chunks of ``x`` plus every pair of adjacent chunks joined.

        The chunker's result is concatenated with a list, so it is expected
        to be a list. Inputs yielding fewer than two chunks produce no
        bigrams.
        """
        s = self.chunker(x)
        # Pair each chunk with its successor; zip stops at the shorter input.
        return s + [''.join(pair) for pair in zip(s, s[1:])]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import unicodedata | ||
from typing import Optional | ||
|
||
from naivesearch.indexer import Formatter | ||
|
||
|
||
class UnicodeNormalizer:
    """Formatter that applies Unicode normalization to a string."""

    def __init__(self, other: Optional['Formatter'] = None, form: str = 'NFKC'):
        """Store the optional inner formatter and the normalization form."""
        self.form = form
        self.other = other

    def __call__(self, x: str) -> str:
        """Run the inner formatter on ``x`` if one is set, then normalize."""
        if self.other:
            x = self.other(x)
        return unicodedata.normalize(self.form, x)
|
||
|
||
class LowerCaseNormalizer:
    """Formatter that lower-cases a string."""

    def __init__(self, other: Optional['Formatter'] = None):
        """Store the optional inner formatter."""
        self.other = other

    def __call__(self, x: str) -> str:
        """Run the inner formatter on ``x`` if one is set, then lower-case."""
        if self.other:
            x = self.other(x)
        return x.lower()
10 changes: 3 additions & 7 deletions
10
naivesearch/indexer/test_chunker.py → naivesearch/preprocessors/test_chunker.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,11 @@ | ||
from naivesearch import InvertedIndex | ||
from typing import Callable, List, Optional | ||
|
||
from .chunker import CharacterChunker | ||
from .formatter import LowerCaseNormalizer, UnicodeNormalizer | ||
|
||
|
||
class TestCharacterChunker:
    """Tests for ``CharacterChunker``."""

    def test(self):
        """A chunker given nested formatters returns lower-cased characters."""
        inner = LowerCaseNormalizer()
        format_chunker = CharacterChunker(UnicodeNormalizer(inner))
        assert format_chunker('Hello') == ['h', 'e', 'l', 'l', 'o']
3 changes: 0 additions & 3 deletions
3
naivesearch/indexer/test_converter.py → naivesearch/preprocessors/test_converter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
from unittest.mock import Mock | ||
|
||
from .formatter import UnicodeNormalizer, LowerCaseNormalizer | ||
|
||
|
||
class TestFormatter:
    """Tests for ``UnicodeNormalizer`` and ``LowerCaseNormalizer``."""

    def test_unicode_normalizer(self):
        """Two distinct but look-alike strings format to the same result."""
        normalize = UnicodeNormalizer()
        plain, lookalike = '人口', '⼈⼝'
        assert plain != lookalike
        assert normalize(plain) == normalize(lookalike)

    def test_unicode_normalizer_instanciate_with_another_formatter(self):
        """A formatter passed at construction is called during formatting."""
        plain, lookalike = '人口', '⼈⼝'
        inner = Mock(return_value='人口')
        normalize = UnicodeNormalizer(inner)
        assert plain != lookalike
        assert normalize(plain) == normalize(lookalike)
        assert inner.called

    def test_lower_case_normalizer(self):
        """Upper- and lower-case spellings format to the same result."""
        normalize = LowerCaseNormalizer()
        upper, lower = 'UPPER', 'upper'
        assert normalize(upper) == normalize(lower)

    def test_lower_case_normalizer_instanciate_with_another_formatter(self):
        """A formatter passed at construction is called during formatting."""
        upper, lower = 'UPPER', 'upper'
        inner = Mock(return_value=upper)
        normalize = LowerCaseNormalizer(inner)
        assert normalize(upper) == normalize(lower)
        assert inner.called
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.