Skip to content

Commit c877611

Browse files
committed
make sure we don't cut off matching passage after clean-up
1 parent cc686ba commit c877611

File tree

1 file changed

+8
-6
lines changed

1 file changed

+8
-6
lines changed

lib/core/src/compareNgrams.go

+8 -6
Original file line number | Diff line number | Diff line change
@@ -909,17 +909,17 @@ func writeAligments(combinedAlignments *CombinedAlignments, sourceDocID *string,
909909

910910
// Returns three passages: the context before, the match itself, and the context after
911911
func alignmentToText(alignment *position, filename string, config *matchingParams) []string {
912-
beforeContext := getText(&filename, alignment.startByte-int32(config.contextSize), alignment.startByte)
912+
beforeContext := getText(&filename, alignment.startByte-int32(config.contextSize), alignment.startByte, "before")
913913
beforeContext = cleanStart.ReplaceAllString(beforeContext, "") // avoid truncation at beginning
914-
matchingPassage := getText(&filename, alignment.startByte, alignment.endByte)
915-
afterContext := getText(&filename, alignment.endByte, alignment.endByte+int32(config.contextSize))
914+
matchingPassage := getText(&filename, alignment.startByte, alignment.endByte, "match")
915+
afterContext := getText(&filename, alignment.endByte, alignment.endByte+int32(config.contextSize), "after")
916916
afterContext = cleanEnd.ReplaceAllString(afterContext, "") // avoid truncation at the end
917917
passages := []string{beforeContext, matchingPassage, afterContext}
918918
return passages
919919
}
920920

921921
// Get text passages using file location and start and end byte
922-
func getText(fileLocation *string, startByte int32, endByte int32) string {
922+
func getText(fileLocation *string, startByte int32, endByte int32, passageType string) string {
923923
f, err := os.Open(*fileLocation)
924924
checkErr(err, fmt.Sprintf("getText (opening %s)", *fileLocation))
925925
if startByte < 0 {
@@ -935,8 +935,10 @@ func getText(fileLocation *string, startByte int32, endByte int32) string {
935935
passage = bytes.Replace(passage, []byte("\xc2\xa0"), []byte(" "), -1) // remove non-breaking spaces
936936
text := string(passage)
937937
text = tags.ReplaceAllString(text, "")
938-
text = brokenBeginTags.ReplaceAllString(text, "")
939-
text = brokenEndTags.ReplaceAllString(text, "")
938+
if passageType != "match" {
939+
text = brokenBeginTags.ReplaceAllString(text, "")
940+
text = brokenEndTags.ReplaceAllString(text, "")
941+
}
940942
text = html.UnescapeString(text)
941943
text = strings.Replace(text, "\\n", "\n", -1)
942944
text = strings.Replace(text, "\\t", "\t", -1)

0 commit comments

Comments
 (0)