Skip to content

Commit

Permalink
Merge pull request #1 from lapras-inc/topic-refactoring
Browse files Browse the repository at this point in the history
preprocessor の compose インターフェースを実装
  • Loading branch information
Chanmoro authored Mar 12, 2021
2 parents c19024c + 4ce9a4f commit 43125c1
Show file tree
Hide file tree
Showing 18 changed files with 203 additions and 85 deletions.
2 changes: 0 additions & 2 deletions naivesearch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from .indexer import InvertedIndex
from .main import naivesearch

__all__ = [
'InvertedIndex'
'naivesearch'
]
5 changes: 4 additions & 1 deletion naivesearch/indexer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from .inverted_index import Chunker, InvertedIndex, Reader
from .inverted_index import InvertedIndex, Reader
from .preprocess_composer import Formatter, Chunker, compose_preprocessors


__all__ = [
'Chunker',
'compose_preprocessors',
'Formatter',
'InvertedIndex',
'Reader',
]
15 changes: 0 additions & 15 deletions naivesearch/indexer/chunker.py

This file was deleted.

14 changes: 0 additions & 14 deletions naivesearch/indexer/formatter.py

This file was deleted.

16 changes: 10 additions & 6 deletions naivesearch/indexer/inverted_index.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging

from collections import defaultdict
from typing import Dict, List, Protocol, Iterable
from .chunker import Chunker
from typing import Dict, List, Protocol, Iterable, Union

from .preprocess_composer import Chunker, Converter


logger = logging.getLogger(__name__)
Expand All @@ -13,9 +13,13 @@ class Reader(Iterable[str]):


class InvertedIndex:
chunkers: List[Chunker]
chunkers: List[Converter]

def __init__(self, reader: Reader, chunkers: List[Chunker]):
def __init__(
self,
reader: Reader,
chunkers: List[Union[Chunker, Converter]]
):
self.index: Dict[str, List[str]] = defaultdict(list)
self.chunkers = chunkers

Expand All @@ -26,7 +30,7 @@ def __init__(self, reader: Reader, chunkers: List[Chunker]):
self.index[chunk].append(d)
logger.info('Done indexing.')

def __getitem__(self, q):
def __getitem__(self, q) -> List[str]:
chunks = []
for chunker in self.chunkers:
for chunk in chunker(q):
Expand Down
36 changes: 36 additions & 0 deletions naivesearch/indexer/preprocess_composer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Callable, List, Optional, Protocol, Type


class Formatter(Protocol):
    """Structural type for a string-to-string preprocessing step.

    A formatter is constructed with an optional other formatter (plus
    arbitrary keyword options) and is then called with a string, returning
    a string.
    """

    def __init__(self, other: Optional['Formatter'] = None, **kwargs):
        """Accept an optional other formatter and keyword options."""
        ...

    def __call__(self, x: str) -> str:
        """Return the formatted version of ``x``."""
        ...


class Chunker(Protocol):
    """Structural type for a step that splits a string into chunks.

    A chunker is constructed with an optional formatter and is then called
    with a string, returning a list of strings.
    """

    def __init__(self, formatter: Optional[Formatter]):
        """Accept the formatter (or ``None``) this chunker is built with."""
        ...

    def __call__(self, x: str) -> List[str]:
        """Return the list of chunks produced from ``x``."""
        ...


class Converter(Protocol):
    """Structural type for a step built on top of a chunker.

    A converter is constructed with a chunker and is then called with a
    string, returning a list of strings.
    """

    def __init__(self, chunker: Chunker):
        """Accept the chunker this converter is built with."""
        ...

    def __call__(self, x: str) -> List[str]:
        """Return the list of strings produced from ``x``."""
        ...


def compose_preprocessors(
    converter: Type[Converter],
    chunker: Type[Chunker],
    *formatters: Callable[..., Formatter],
) -> Converter:
    """Build a converter wrapping a chunker wrapping a chain of formatters.

    The formatter factories are instantiated from the last one to the first.
    Each factory is called with the previously built formatter as its single
    positional argument; the last factory receives ``None``.

    Args:
        converter: Factory called with the built chunker.
        chunker: Factory called with the outermost formatter, i.e. the one
            built from the first entry of ``formatters``, or ``None`` when
            no formatter factories are given.
        *formatters: Formatter factories, outermost first.

    Returns:
        The object returned by ``converter``.
    """
    chain = None
    # Walk the factories back to front so the first one ends up outermost.
    for factory in formatters[::-1]:
        chain = factory(chain)
    built_chunker = chunker(chain)
    return converter(built_chunker)
19 changes: 0 additions & 19 deletions naivesearch/indexer/test_formatter.py

This file was deleted.

32 changes: 31 additions & 1 deletion naivesearch/indexer/test_inverted_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from naivesearch import InvertedIndex
from typing import Callable, List, Optional

from naivesearch.indexer import InvertedIndex
from naivesearch.preprocessors import UnicodeNormalizer, LowerCaseNormalizer
from naivesearch.preprocessors import BigramConverter
from naivesearch.preprocessors import CharacterChunker


class TestInvertedIndex:
def test_getitem(self):
Expand Down Expand Up @@ -34,3 +38,29 @@ def dummy_chunker2(x):
assert 'good bye world' in index['good b']
assert 'good morning world' not in index['good b']

def test_instantiate_composed_formatters(self):
jinkou1 = '人口'
jinkou2 = '⼈⼝'
upper = 'UPPER'
lower = 'upper'

assert jinkou1 != jinkou2

def reader():
yield jinkou1
yield upper
yield 'hello world'
yield 'good bye world'
yield 'good morning world'

index = InvertedIndex(
reader(),
[
BigramConverter(CharacterChunker(
UnicodeNormalizer(LowerCaseNormalizer()),
))
]
)

assert jinkou1 in index[jinkou2]
assert upper in index[lower]
32 changes: 18 additions & 14 deletions naivesearch/main.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
from naivesearch.indexer import InvertedIndex
from naivesearch.indexer.formatter import UnicodeNormalizer, LowerCaseNormalizer
from naivesearch.indexer.converter import BigramConverter
from naivesearch.indexer.chunker import CharacterChunker
from returns.curry import partial

from naivesearch.indexer import compose_preprocessors, InvertedIndex
from naivesearch.preprocessors import UnicodeNormalizer, LowerCaseNormalizer
from naivesearch.preprocessors import BigramConverter
from naivesearch.preprocessors import CharacterChunker

def naivesearch(filepath: str):

def file_reader(filepath):
with open(filepath) as f:
for line in f.readlines():
yield line.strip()

def naivesearch(filepath: str):
index = InvertedIndex(
file_reader(filepath),
[
BigramConverter(CharacterChunker([
UnicodeNormalizer(),
LowerCaseNormalizer(),
]))
compose_preprocessors(
BigramConverter,
CharacterChunker,
LowerCaseNormalizer,
partial(UnicodeNormalizer, form='NFKC'),
)
]
)
return index


def file_reader(filepath):
with open(filepath) as f:
for line in f.readlines():
yield line.strip()
11 changes: 11 additions & 0 deletions naivesearch/preprocessors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .chunker import CharacterChunker
from .converter import BigramConverter
from .formatter import UnicodeNormalizer, LowerCaseNormalizer


__all__ = [
'BigramConverter',
'CharacterChunker',
'LowerCaseNormalizer',
'UnicodeNormalizer',
]
11 changes: 11 additions & 0 deletions naivesearch/preprocessors/chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from typing import List, Optional

from naivesearch.indexer import Formatter


class CharacterChunker:
    """Split a string into a list of its individual characters.

    If a formatter is supplied, the input is passed through it first and
    the formatter's result is what gets split.
    """

    def __init__(self, formatter: Optional[Formatter] = None):
        """Store the optional formatter.

        Args:
            formatter: Callable applied to the input before splitting.
                Defaults to ``None`` (no formatting), matching the optional
                upstream argument of the formatter classes.
        """
        self.formatter = formatter

    def __call__(self, x: str) -> List[str]:
        """Return the characters of ``x`` (after formatting, if any)."""
        # Compare against None explicitly: a formatter object that happens
        # to be falsy (defines __bool__/__len__) must still be applied.
        text = x if self.formatter is None else self.formatter(x)
        return list(text)
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import List

from naivesearch.indexer import Chunker


class BigramConverter:
    """Extend a chunker's output with the bigrams of adjacent chunks."""

    def __init__(self, chunker: Chunker):
        """Store the chunker whose output is converted.

        Args:
            chunker: Callable that turns a string into a list of chunks.
        """
        self.chunker = chunker

    def __call__(self, x: str) -> List[str]:
        """Return the chunks of ``x`` followed by each adjacent pair joined.

        For chunks ``['a', 'b', 'c']`` the result is
        ``['a', 'b', 'c', 'ab', 'bc']``. Fewer than two chunks yield no
        bigrams, so the chunk list is returned as is.
        """
        chunks = self.chunker(x)
        # Pair every chunk with its successor; zip stops at the shorter
        # sequence, so the last chunk has no partner.
        bigrams = [''.join(pair) for pair in zip(chunks, chunks[1:])]
        return chunks + bigrams
23 changes: 23 additions & 0 deletions naivesearch/preprocessors/formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import unicodedata
from typing import Optional

from naivesearch.indexer import Formatter


class UnicodeNormalizer:
    """Formatter that applies Unicode normalization to a string.

    If another formatter is supplied, it is applied to the input first and
    its result is then normalized.
    """

    def __init__(self, other: Optional[Formatter] = None, form: str = 'NFKC'):
        """Store the optional upstream formatter and the normalization form.

        Args:
            other: Formatter applied before normalization, or ``None``.
            form: Normalization form passed to ``unicodedata.normalize``.
        """
        self.other = other
        self.form = form

    def __call__(self, x: str) -> str:
        """Return ``x`` (after the upstream formatter, if any) normalized."""
        # Compare against None explicitly: a formatter object that happens
        # to be falsy (defines __bool__/__len__) must still be applied.
        if self.other is not None:
            x = self.other(x)
        return unicodedata.normalize(self.form, x)


class LowerCaseNormalizer:
    """Formatter that lower-cases a string.

    If another formatter is supplied, it is applied to the input first and
    its result is then lower-cased.
    """

    def __init__(self, other: Optional[Formatter] = None):
        """Store the optional upstream formatter.

        Args:
            other: Formatter applied before lower-casing, or ``None``.
        """
        self.other = other

    def __call__(self, x: str) -> str:
        """Return ``x`` (after the upstream formatter, if any) lower-cased."""
        # Compare against None explicitly: a formatter object that happens
        # to be falsy (defines __bool__/__len__) must still be applied.
        if self.other is not None:
            x = self.other(x)
        return x.lower()
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
from naivesearch import InvertedIndex
from typing import Callable, List, Optional

from .chunker import CharacterChunker
from .formatter import LowerCaseNormalizer, UnicodeNormalizer


class TestCharacterChunker:
def test(self):
format_chunker = CharacterChunker([
UnicodeNormalizer(),
LowerCaseNormalizer(),
])
format_chunker = CharacterChunker(
UnicodeNormalizer(LowerCaseNormalizer()),
)
result = format_chunker('Hello')
assert result == ['h', 'e', 'l', 'l', 'o']
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from naivesearch import InvertedIndex
from typing import Callable, List, Optional

from .converter import BigramConverter
from .formatter import UnicodeNormalizer

Expand Down
35 changes: 35 additions & 0 deletions naivesearch/preprocessors/test_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from unittest.mock import Mock

from .formatter import UnicodeNormalizer, LowerCaseNormalizer


class TestFormatter:
    """Tests for the UnicodeNormalizer and LowerCaseNormalizer formatters."""

    def test_unicode_normalizer(self):
        """Two unequal strings compare equal once normalized."""
        plain, variant = '人口', '⼈⼝'
        normalize = UnicodeNormalizer()
        assert plain != variant
        assert normalize(plain) == normalize(variant)

    def test_unicode_normalizer_instanciate_with_another_formatter(self):
        """A formatter handed to the normalizer is invoked when it is called."""
        plain, variant = '人口', '⼈⼝'
        upstream = Mock(return_value='人口')
        normalize = UnicodeNormalizer(upstream)
        assert plain != variant
        assert normalize(plain) == normalize(variant)
        assert upstream.called

    def test_lower_case_normalizer(self):
        """Upper-case and lower-case spellings format to the same string."""
        shouted, quiet = 'UPPER', 'upper'
        to_lower = LowerCaseNormalizer()
        assert to_lower(shouted) == to_lower(quiet)

    def test_lower_case_normalizer_instanciate_with_another_formatter(self):
        """A formatter handed to the lower-caser is invoked when it is called."""
        shouted, quiet = 'UPPER', 'upper'
        upstream = Mock(return_value=shouted)
        to_lower = LowerCaseNormalizer(upstream)
        assert to_lower(shouted) == to_lower(quiet)
        assert upstream.called
19 changes: 17 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 43125c1

Please sign in to comment.