Skip to content

Commit

Permalink
[recognize entities] fix skipping entities beyond a certain index in …
Browse files Browse the repository at this point in the history
…the query (#1351)

There was a bug in how the n-grams used to recognize entities were limited:
beyond a certain number of words in the query, the rest of the query was
not being used in recognition.
  • Loading branch information
chejennifer authored Apr 22, 2024
1 parent 497f50d commit c6bdf1b
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 2 deletions.
47 changes: 47 additions & 0 deletions internal/server/recon/golden/recognize_entities/result.json
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,53 @@
"span": "jose are chirpy"
}
]
},
"what genes are associated with the rs13317 , rs1826962 , rs790314 , rs2801952 , rs1814149": {
"items": [
{
"span": "what genes are associated with the"
},
{
"span": "rs13317",
"entities": [
{
"dcid": "bio/rs13317"
}
]
},
{
"span": ", rs1826962 ,",
"entities": [
{
"dcid": "bio/rs1826962"
}
]
},
{
"span": "rs790314",
"entities": [
{
"dcid": "bio/rs790314"
}
]
},
{
"span": ", rs2801952 ,",
"entities": [
{
"dcid": "bio/rs2801952"
}
]
},
{
"span": "rs1814149",
"entities": [
{
"dcid": "bio/rs1814149"
}
]
}
]
}
}
}
1 change: 1 addition & 0 deletions internal/server/recon/golden/recognize_entities_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ func TestRecognizeEntities(t *testing.T) {
// should not recognize the first "me" but should recognize "gene me"
// and "mesh descriptor genes"
"tell me about the gene me and the MeSH descriptor genes",
"What genes are associated with the rs13317 , rs1826962 , rs790314 , rs2801952 , rs1814149",
},
"result.json",
},
Expand Down
7 changes: 5 additions & 2 deletions internal/server/recon/recognize.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,12 @@ func getId2Span(query string) map[string]map[string]struct{} {
spanTokens := strings.Split(query, " ")
for i := range spanTokens {
span := ""
maxNGramLength := int(math.Min(float64(len(spanTokens)), reconNGramLimit+1))
// This is the (exclusive) index in the list of tokens at which to stop when
// making n-grams that start at token i: either the end of the list of tokens
// or i plus the n-gram limit, whichever comes first.
maxNGramIdx := int(math.Min(float64(len(spanTokens)), float64(reconNGramLimit+i)))
// make n-grams from the span tokens
for j := i; j < maxNGramLength; j++ {
for j := i; j < maxNGramIdx; j++ {
span = span + " " + spanTokens[j]
span = strings.TrimSpace(span)
id := getReconName(span)
Expand Down

0 comments on commit c6bdf1b

Please sign in to comment.