Skip to content

Commit 92be4eb

Browse files
authored
bugfix/fix ndjson detection (#3905)
### Description NDJSON files were being detected as JSON because both formats share the same MIME type. This adds logic to skip MIME-type-based detection when the file extension is `.ndjson`.
1 parent 723c074 commit 92be4eb

File tree

6 files changed

+57
-15
lines changed

6 files changed

+57
-15
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.21-dev2
1+
## 0.16.21-dev3
22

33
### Enhancements
44

@@ -8,6 +8,8 @@
88

99
### Fixes
1010

11+
- **Fix file type detection for NDJSON files** NDJSON files were being detected as JSON due to having the same mime-type.
12+
1113
## 0.16.20
1214

1315
### Enhancements

test_unstructured/file_utils/test_filetype.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc
7777
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
7878
(FileType.HTML, "example-10k-1p.html", "text/html"),
7979
(FileType.JPG, "img/example.jpg", "image/jpeg"),
80-
(FileType.JSON, "spring-weather.html.json", "application/json"),
8180
(FileType.MD, "README.md", "text/markdown"),
8281
(FileType.ORG, "README.org", "text/org"),
8382
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -116,7 +115,6 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
116115
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
117116
(FileType.HTML, "example-10k-1p.html", "text/html"),
118117
(FileType.JPG, "img/example.jpg", "image/jpeg"),
119-
(FileType.JSON, "spring-weather.html.json", "application/json"),
120118
(FileType.MD, "README.md", "text/markdown"),
121119
(FileType.ORG, "README.org", "text/org"),
122120
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -154,10 +152,10 @@ def test_it_identifies_NDJSON_for_file_like_object_with_no_name_but_NDJSON_conte
154152
assert detect_filetype(file=file, content_type=FileType.NDJSON.mime_type) == FileType.NDJSON
155153

156154

157-
# TODO: ideally this test should pass, currently fails
158-
# def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
159-
# file_path = example_doc_path("simple.ndjson")
160-
# assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON
155+
def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type():
156+
file_path = example_doc_path("simple.ndjson")
157+
assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON
158+
161159

162160
# ================================================================================================
163161
# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
@@ -268,7 +266,6 @@ def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mi
268266
(FileType.UNK, "stanley-cups.csv"),
269267
(FileType.UNK, "eml/fake-email.eml"),
270268
(FileType.UNK, "example-10k-1p.html"),
271-
(FileType.UNK, "spring-weather.html.json"),
272269
(FileType.UNK, "README.md"),
273270
(FileType.UNK, "README.org"),
274271
(FileType.UNK, "README.rst"),
@@ -333,6 +330,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
333330
(FileType.TXT, "norwich-city.txt"),
334331
(FileType.WAV, "CantinaBand3.wav"),
335332
(FileType.XML, "factbook.xml"),
333+
(FileType.NDJSON, "simple.ndjson"),
336334
],
337335
)
338336
def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type(
@@ -395,6 +393,27 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi
395393
assert file_type is FileType.HTML
396394

397395

396+
@pytest.mark.parametrize(
397+
("expected_value", "file_name"),
398+
[(FileType.NDJSON, "simple.ndjson"), (FileType.JSON, "spring-weather.html.json")],
399+
)
400+
def test_it_detects_correct_json_type_without_extension(expected_value: FileType, file_name: str):
401+
with open(example_doc_path(file_name), "rb") as f:
402+
file = io.BytesIO(f.read())
403+
404+
filetype = detect_filetype(file=file)
405+
assert filetype == expected_value
406+
407+
408+
@pytest.mark.parametrize(
409+
("expected_value", "file_name"),
410+
[(FileType.NDJSON, "simple.ndjson"), (FileType.JSON, "spring-weather.html.json")],
411+
)
412+
def test_it_detects_correct_json_type_with_extension(expected_value: FileType, file_name: str):
413+
filetype = detect_filetype(file_path=example_doc_path(file_name))
414+
assert filetype == expected_value
415+
416+
398417
@pytest.mark.parametrize(
399418
("mime_type", "file_name"),
400419
[

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.21-dev2" # pragma: no cover
1+
__version__ = "0.16.21-dev3" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
4747
from unstructured.file_utils.model import FileType
4848
from unstructured.logger import logger
49-
from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
49+
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
5050
from unstructured.partition.common.common import add_element_metadata, exactly_one
5151
from unstructured.partition.common.metadata import set_element_hierarchy
5252
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
@@ -140,8 +140,7 @@ def is_ndjson_processable(
140140
file_text = _FileTypeDetectionContext.new(
141141
file_path=filename, file=file, encoding=encoding
142142
).text_head
143-
144-
return re.match(DICT_PATTERN, file_text) is not None
143+
return file_text.lstrip().startswith("{")
145144

146145

147146
class _FileTypeDetector:
@@ -179,7 +178,11 @@ def _file_type(self) -> FileType:
179178
if file_type := self._file_type_from_file_extension:
180179
return file_type
181180

182-
# -- strategy 5: give up and report FileType.UNK --
181+
# -- strategy 5: edge case of JSON/NDJSON content without a file extension --
182+
if file_type := self._disambiguate_json_file_type:
183+
return file_type
184+
185+
# -- strategy 6: give up and report FileType.UNK --
183186
return FileType.UNK
184187

185188
# == STRATEGIES ============================================================
@@ -210,6 +213,20 @@ def _file_type_from_content_type(self) -> FileType | None:
210213
# -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
211214
return FileType.from_mime_type(self._ctx.content_type)
212215

216+
@property
217+
def _disambiguate_json_file_type(self) -> FileType | None:
218+
"""Disambiguate JSON/NDJSON file-type based on file contents.
219+
220+
This method applies when the content-type is either not provided or `application/json` and the file is not empty.
221+
"""
222+
if self._ctx.content_type is not None and self._ctx.content_type != "application/json":
223+
return None
224+
if is_json_processable(file_text=self._ctx.text_head):
225+
return FileType.JSON
226+
if is_ndjson_processable(file_text=self._ctx.text_head):
227+
return FileType.NDJSON
228+
return None
229+
213230
@property
214231
def _file_type_from_guessed_mime_type(self) -> FileType | None:
215232
"""FileType based on auto-detection of MIME-type by libmagic.
@@ -240,6 +257,9 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None:
240257
if mime_type.endswith("empty"):
241258
return FileType.EMPTY
242259

260+
if mime_type.endswith("json") and self._ctx.extension == ".ndjson":
261+
return FileType.NDJSON
262+
243263
# -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
244264
file_type = FileType.from_mime_type(mime_type)
245265
return file_type if file_type != FileType.UNK else None

unstructured/file_utils/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ def from_mime_type(cls, mime_type: str | None) -> FileType | None:
8282
Returns `None` when `mime_type` is `None` or does not map to the canonical MIME-type of a
8383
`FileType` member or one of its alias MIME-types.
8484
"""
85-
if mime_type is None:
85+
if mime_type is None or mime_type == "application/json":
86+
# application/json is ambiguous as it may point to either the JSON or the NDJSON file type
8687
return None
8788
# -- not super efficient but plenty fast enough for once-or-twice-per-file use and avoids
8889
# -- limitations on defining a class variable on an Enum.

unstructured/nlp/patterns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@
120120
# format for document elements
121121
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
122122

123-
DICT_PATTERN = r"\A\s*{?"
124123

125124
# (?s) dot all (including newline characters)
126125
# \{(?=.*:) opening brace and at least one colon
@@ -133,6 +132,7 @@
133132
# or the closing bracket to handle cases where the JSON array is cut off
134133
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
135134

135+
136136
# taken from https://stackoverflow.com/a/3845829/12406158
137137
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
138138

0 commit comments

Comments
 (0)