forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_issue3962.py
120 lines (103 loc) · 3.46 KB
/
test_issue3962.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: utf8
from __future__ import unicode_literals
import pytest
from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
    """Return a hand-annotated parse of a single sentence.

    The words are taken from ``en_tokenizer``'s output for the sentence; the
    heads and dependency labels are hard-coded below and handed to
    ``get_doc`` together with the tokenizer's vocab.
    """
    sentence = "He jests at scars, that never felt a wound."
    # One (head offset relative to the token, dependency label) per token.
    annotations = [
        (1, "nsubj"),
        (6, "ccomp"),
        (-1, "prep"),
        (-1, "pobj"),
        (3, "punct"),
        (2, "nsubj"),
        (1, "neg"),
        (0, "ROOT"),
        (1, "det"),
        (-2, "dobj"),
        (-3, "punct"),
    ]
    head_offsets = [offset for offset, _ in annotations]
    dep_labels = [label for _, label in annotations]
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    return get_doc(tokenized.vocab, words=words, heads=head_offsets, deps=dep_labels)
def test_issue3962(doc):
    """Check that ``as_doc`` on a slice never leaves a head outside the new doc.

    A token whose head would fall outside the slice is expected to be
    re-attached inside it (to itself as a new artificial root, or to a token
    still in the slice) and to carry the generic "dep" label.
    """

    def assert_parse(sub_doc, expected):
        # Index explicitly so a too-short doc raises instead of being skipped.
        for position, (head_text, dep_label) in enumerate(expected):
            assert sub_doc[position].head.text == head_text
            assert sub_doc[position].dep_ == dep_label

    first = doc[1:5].as_doc()  # "jests at scars ,"
    first_json = first.to_json()
    assert first_json
    assert_parse(
        first,
        [
            ("jests", "dep"),  # head set to itself, being the new artificial root
            ("jests", "prep"),
            ("at", "pobj"),
            ("jests", "dep"),  # head set to the new artificial root
        ],
    )
    # Re-rooting must not split the slice: still exactly one sentence.
    assert len(list(first.sents)) == 1

    second = doc[6:9].as_doc()  # "never felt a"
    second_json = second.to_json()
    assert second_json
    assert_parse(
        second,
        [
            ("felt", "neg"),
            ("felt", "ROOT"),
            ("felt", "dep"),  # head set to ancestor
        ],
    )
    # Still one sentence, as "a" can be attached to "felt" instead of "wound".
    assert len(list(second.sents)) == 1
@pytest.fixture
def two_sent_doc(en_tokenizer):
    """Return a hand-annotated parse of a two-sentence text.

    Each sentence has its own "ROOT" token. Words come from
    ``en_tokenizer``; heads and dependency labels are hard-coded and passed
    to ``get_doc`` along with the tokenizer's vocab.
    """
    raw_text = "He jests at scars. They never felt a wound."
    # Head offsets are relative to each token's own position (0 marks a root).
    relative_heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    labels = ["nsubj", "ROOT", "prep", "pobj", "punct"]
    labels += ["nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    tokenized = en_tokenizer(raw_text)
    return get_doc(
        tokenized.vocab,
        words=[token.text for token in tokenized],
        heads=relative_heads,
        deps=labels,
    )
def test_issue3962_long(two_sent_doc):
    """Ensure that as_doc does not result in out-of-bound access of tokens.

    This is achieved by setting the head to itself if it would lie out of the
    span otherwise. The span used here crosses a sentence boundary, so the
    test also checks that both sentences survive the conversion.
    """
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    # Sentence 1: every head already lies inside the span.
    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"

    # Sentence 2: the original root "felt" is outside the span.
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    # Check the last token ("never") as well; these two assertions previously
    # repeated the doc2[4] checks, leaving doc2[5] unverified.
    assert (
        doc2[5].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[5].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"