Skip to content

handling .z files as gzip #191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions ir_datasets/formats/trec.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,20 +126,20 @@ def docs_iter(self):

def _docs_iter(self, path):
    """Yield parsed documents from a file, or recursively from a directory.

    The decoder is chosen from the file suffix:

    * ``.gz`` in any letter case, or lowercase ``.z``: gzip.
    * uppercase ``.Z``, ``.0Z``, ``.1Z``, ``.2Z``: unix "compress" (LZW),
      decoded through the lazily loaded ``unlzw3`` library.
    * anything else: read as-is.

    Directories are walked recursively with children visited in sorted
    order. A path that is neither a file nor a directory yields nothing.
    """
    target = Path(path)
    if target.is_file():
        suffix = target.suffix
        # Only '.gz' is matched case-insensitively. The case of '.z' is
        # significant: lowercase is gzip, uppercase is unix "compress".
        if suffix.lower() == '.gz' or suffix == '.z':
            with gzip.open(target, 'rb') as f:
                yield from self._parser(f)
        elif suffix in ['.Z', '.0Z', '.1Z', '.2Z']:
            # unix "compress" command encoding
            unlzw3 = ir_datasets.lazy_libs.unlzw3()
            with io.BytesIO(unlzw3.unlzw(target)) as f:
                yield from self._parser(f)
        else:
            with open(target, 'rb') as f:
                yield from self._parser(f)
    elif target.is_dir():
        # Sort so the document order does not depend on the filesystem's
        # directory listing order.
        for child in sorted(target.iterdir()):
            yield from self._docs_iter(child)

def _parser_bs(self, stream):
Expand Down
Binary file added test/dummy/trecdocs/compress_uc_0z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_0z/F00.0Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_0z/F01.0Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z/F00.Z
Binary file not shown.
Binary file added test/dummy/trecdocs/compress_uc_z/F01.Z
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz/F00.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_gz/F01.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz/F00.GZ
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_uc_gz/F01.GZ
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z.tar.gz
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z/F00.z
Binary file not shown.
Binary file added test/dummy/trecdocs/gzip_z/F01.z
Binary file not shown.
Binary file added test/dummy/trecdocs/plaintext_noext.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_noext/F00
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_noext/F01
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
Binary file added test/dummy/trecdocs/plaintext_txt.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_txt/F00.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_txt/F01.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
Binary file added test/dummy/trecdocs/plaintext_uc_txt.tar.gz
Binary file not shown.
29 changes: 29 additions & 0 deletions test/dummy/trecdocs/plaintext_uc_txt/F00.TXT
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<DOC>
<DOCNO> D100A </DOCNO>
<PARENT> Something </PARENT>
<HT> Some text </HT>

<HEADLINE>
<AU> Header Text </AU>
Daily Report

</HEADLINE>

<TEXT>
Main body text
on multiple lines

with <F P=102> some markup
</F> here. Also, some invalid <T> markup &amp;.
</TEXT>

</DOC>

<DOC>
<DOCNO> 101 </DOCNO>

<TEXT>
More body text
</TEXT>

</DOC>
11 changes: 11 additions & 0 deletions test/dummy/trecdocs/plaintext_uc_txt/F01.TXT
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<DOC>
<DOCNO> D102 </DOCNO>
<HT> more text </HT>

<TEXT>
some very <F P=102> fun text
<!-- commented out --> markup &AMP;

</TEXT>

</DOC>
39 changes: 38 additions & 1 deletion test/formats/test_trec.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,22 @@
import os
import shutil
import unittest
import contextlib
from ir_datasets.formats import TrecQrel, TrecQrels, TrecQuery, TrecQueries, TrecDoc, TrecDocs
from ir_datasets.util import StringFile
from ir_datasets.util import StringFile, RelativePath


class File:
    """Test helper that wraps a local filesystem path.

    Provides ``path()`` and ``stream()``; instances are passed to
    ``TrecDocs`` by the tests in this module.
    """

    def __init__(self, path):
        self._path = path

    def path(self, force=True):
        """Return the wrapped path. ``force`` is accepted but ignored."""
        return self._path

    @contextlib.contextmanager
    def stream(self):
        """Yield the file opened for binary reading; close it on exit."""
        # Open inside ``with`` so the handle is closed when the caller's
        # block ends (including on error) instead of being leaked.
        with open(self._path, 'rb') as f:
            yield f



class TestTrec(unittest.TestCase):
Expand Down Expand Up @@ -127,6 +141,29 @@ def test_docs(self):
self.assertEqual(docs.docs_path(), 'MOCK')
self.assertEqual(list(docs.docs_iter()), expected_results)


def test_docs_formats(self):
    """Check that every fixture layout yields the same three documents.

    Each source directory under ``test/dummy/trecdocs`` is read twice:
    once without ``path_globs`` and once with ``path_globs=['F*']``.
    For the plaintext and ``gzip_gz`` sources, the matching ``.tar.gz``
    archive is read as well. Every variant runs in its own ``subTest``
    so one failing variant does not hide the results of the others.
    """
    expected_results = [
        TrecDoc(doc_id='D100A', text='\n\n Header Text \nDaily Report \n\n\n\nMain body text\non multiple lines\n\nwith some markup\n here. Also, some invalid markup &. \n\n', marked_up_doc='<HEADLINE>\n<AU> Header Text </AU>\nDaily Report \n\n</HEADLINE>\n<TEXT>\nMain body text\non multiple lines\n\nwith <F P=102> some markup\n</F> here. Also, some invalid <T> markup &amp;. \n</TEXT>\n'),
        TrecDoc(doc_id='101', text='\n\nMore body text\n\n', marked_up_doc='<TEXT>\nMore body text\n</TEXT>\n'),
        TrecDoc(doc_id='D102', text='\n\nsome very fun text\n markup &AMP;\n\n\n', marked_up_doc='<TEXT>\nsome very <F P=102> fun text\n<!-- commented out --> markup &AMP;\n\n</TEXT>\n'),
    ]

    sources = ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz', 'gzip_z', 'gzip_uc_gz', 'compress_uc_z', 'compress_uc_0z']
    # Only these sources are also read from their .tar.gz archive.
    tar_sources = ['plaintext_noext', 'plaintext_txt', 'plaintext_uc_txt', 'gzip_gz']

    for source in sources:
        # The subTest labels replace the former debug print() calls.
        with self.subTest(source=source, mode='no paths'):
            docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')))
            self.assertEqual(list(docs.docs_iter()), expected_results)

        with self.subTest(source=source, mode='paths'):
            docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}')), path_globs=['F*'])
            self.assertEqual(list(docs.docs_iter()), expected_results)

        if source in tar_sources:
            with self.subTest(source=source, mode='tarfile'):
                docs = TrecDocs(File(os.path.abspath(f'test/dummy/trecdocs/{source}.tar.gz')), path_globs=['*/F*'])
                self.assertEqual(list(docs.docs_iter()), expected_results)

def tearDown(self):
if os.path.exists('MOCK.pklz4'):
shutil.rmtree('MOCK.pklz4')
Expand Down