diff --git a/main/src/main/scala/org/clulab/processors/Document.scala b/main/src/main/scala/org/clulab/processors/Document.scala index 140bc2f49..3b71d5813 100644 --- a/main/src/main/scala/org/clulab/processors/Document.scala +++ b/main/src/main/scala/org/clulab/processors/Document.scala @@ -184,7 +184,36 @@ class Document(val sentences: Array[Sentence]) extends Serializable { } } }) + } + + protected def replaceSentences(sentences: Array[Sentence]): Document = { + val newDocument = new Document(sentences) + + newDocument.id = id + newDocument.text = text + + require(newDocument.coreferenceChains.isEmpty) + require(coreferenceChains.isEmpty) + + getAttachmentKeys.foreach { attachmentKey => + require(newDocument.getAttachment(attachmentKey).forall(_ == getAttachment(attachmentKey).get)) + newDocument.addAttachment(attachmentKey, getAttachment(attachmentKey).get) + } + val dctOpt = getDCT + dctOpt.foreach(newDocument.setDCT) + + newDocument + } + + def offset(offset: Int): Document = { + if (offset == 0) this + else { + val offsetSentences = sentences.map(_.offset(offset)) + val newDocument = replaceSentences(offsetSentences) + + newDocument + } } } diff --git a/main/src/main/scala/org/clulab/processors/Processor.scala b/main/src/main/scala/org/clulab/processors/Processor.scala index 6980a9904..52af1288b 100644 --- a/main/src/main/scala/org/clulab/processors/Processor.scala +++ b/main/src/main/scala/org/clulab/processors/Processor.scala @@ -12,6 +12,69 @@ trait Processor { /** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */ def mkDocument (text:String, keepText:Boolean = false): Document + // The documents here were created with Processor.mkDocument, which could have created a subclassed + // Document or documents with certain fields already filled in. 
This implementation only handles + // known document fields and then only performs rudimentary requirement checks to make sure that + // the documents are compatible for combination. In more complicated situations it would be necessary + // to override this method in the Processor subclass. + protected def combineDocuments(documents: IndexedSeq[Document], combinedTextOpt: Option[String]): Document = { + require(documents.length > 1) + val headDocument = documents.head + val tailDocuments = documents.tail + val combinedSentences = documents.flatMap(_.sentences).toArray + val combinedDocument = new Document(combinedSentences) + + val headId = headDocument.id + require(tailDocuments.forall(_.id == headId)) + combinedDocument.id = headId + + require(combinedDocument.text.isEmpty) + combinedDocument.text = combinedTextOpt + + // Coreference chains involve Mentions that include references to documents. The Mentions are being + // moved to a new Document and it would be infeasible to move the chains. 
+ require(combinedDocument.coreferenceChains.isEmpty) + require(documents.forall(_.coreferenceChains.isEmpty)) + + documents.foreach { document => + document.getAttachmentKeys.foreach { attachmentKey => + require(combinedDocument.getAttachment(attachmentKey).forall(_ == document.getAttachment(attachmentKey).get)) + combinedDocument.addAttachment(attachmentKey, document.getAttachment(attachmentKey).get) + } + } + + val headDctOpt = headDocument.getDCT + require(documents.tail.forall(_.getDCT == headDctOpt)) + headDctOpt.foreach(combinedDocument.setDCT) + combinedDocument + } + + def mkCombinedDocument(texts: IndexedSeq[String], trailers: IndexedSeq[String], keepText: Boolean = false): Document = { + require(texts.length == trailers.length) + texts.length match { + case 0 => mkDocument("", keepText) + case 1 => mkDocument(texts.head, keepText) + case _ => + val documents = texts.map(mkDocument(_, keepText)) + val offsets = texts.zip(trailers).scanLeft(0) { case (offset, (text, trailer)) => offset + text.length + trailer.length } + val offsetDocuments = documents.zip(offsets).map { case (document, offset) => + document.offset(offset) + } + val combinedTextOpt = + if (keepText) { + val combinedText = texts.zip(trailers).foldLeft(new StringBuilder) { case (stringBuilder, (text, separator)) => + stringBuilder.append(text).append(separator) + }.toString + + Some(combinedText) + } + else None + val combinedDocument = combineDocuments(offsetDocuments, combinedTextOpt) + + combinedDocument + } + } + /** Constructs a document of tokens from an array of untokenized sentences. 
*/ def mkDocumentFromSentences (sentences:Iterable[String], keepText:Boolean = false, diff --git a/main/src/main/scala/org/clulab/processors/Sentence.scala b/main/src/main/scala/org/clulab/processors/Sentence.scala index b5f8f4581..1787ba1a1 100644 --- a/main/src/main/scala/org/clulab/processors/Sentence.scala +++ b/main/src/main/scala/org/clulab/processors/Sentence.scala @@ -5,7 +5,6 @@ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree} import org.clulab.struct.GraphMap._ import org.clulab.utils.SeqUtils -import scala.collection.immutable.Range import scala.collection.mutable import scala.util.hashing.MurmurHash3._ @@ -174,6 +173,25 @@ class Sentence( reverted } + + def offset(offset: Int): Sentence = { + if (offset == 0) this + else { + val newStartOffsets = startOffsets.map(_ + offset).toArray + val newEndOffsets = endOffsets.map(_ + offset).toArray + val newSentence = Sentence(raw, newStartOffsets, newEndOffsets, words) + + newSentence.tags = tags + newSentence.lemmas = lemmas + newSentence.entities = entities + newSentence.norms = norms + newSentence.chunks = chunks + newSentence.syntacticTree = syntacticTree + newSentence.graphs = graphs + newSentence.relations = relations + newSentence + } + } } object Sentence { diff --git a/main/src/test/resources/org/clulab/processors/sentences10.txt b/main/src/test/resources/org/clulab/processors/sentences10.txt new file mode 100644 index 000000000..d4aa03bec --- /dev/null +++ b/main/src/test/resources/org/clulab/processors/sentences10.txt @@ -0,0 +1,10 @@ +Needed lines of action will be decided on by representatives of some 50 nations . +Scarcity , not only of foodstuffs but of lumber and other forest products , textiles , seeds , fertilizers , draught power , and farm equipment will continue throughout most of Europe and Asia during the coming year . +Hopes of continued recovery in Europe 's indigenous food supplies were checked by last winter 's bad weather . 
+Diets in Western and Central Europe will be still lower next year , and in Asia they will remain at present very low levels , unless imports can be increased . +Even to hold the present line will require drastic action . +Minimum import needs for Europe , North Africa , and Asia in 1947/48 may be estimated at 34 to 38 million tons without allowing for any improvement in bread rations , any additional livestock feeding , or any increase in working reserves . +Against this need , supplies of grain available for export from the surplus countries may be tentatively estimated at 30 to 34 million tons . +Even with somewhat larger supplies of certain other foods particularly potatoes , sugar , and fats the situation will continue to be grim . +Cessation of UNRRA activities and accumulated foreign exchange difficulties worsen the problem for nations in a weak bargaining position . +Every delay in improving this situation further impairs the working ability of labour , slows up reconstruction , adds to the physical damage caused by prolonged undernourishment , and accelerates social unrest . 
diff --git a/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala new file mode 100644 index 000000000..3ce3f6edc --- /dev/null +++ b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala @@ -0,0 +1,130 @@ +package org.clulab.processors + +import org.clulab.processors.clu.CluProcessor +import org.clulab.serialization.DocumentSerializer +import org.clulab.utils.Closer.AutoCloser +import org.clulab.utils.{Sourcer, Test} + +import java.io.{PrintWriter, StringWriter} + +class TestMkCombinedDocument extends Test { + val sentences = Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt").autoClose { source => + source.getLines.toArray + } + val manySentenceLengths = Array( + Array(1, 9), + Array(9, 1), + Array(1, 1, 8), + Array(1, 8, 1), + Array(8, 1, 1), + Array(5, 5), + Array(2, 2, 2, 2, 2), + Array(1, 2, 3, 4), + Array(4, 3, 2, 1), + Array(0, 5, 0, 5) + ) + val separator = " " + val documentSerializer = new DocumentSerializer() + val processor = new CluProcessor() + + def toString(document: Document): String = { + val stringWriter = new StringWriter() + + new PrintWriter(stringWriter).autoClose { printWriter => + documentSerializer.save(document, printWriter, keepText = true) + } + stringWriter.toString + } + + behavior of "mkCombinedDocument" + + def test(sentenceLengths: Array[Int], expectedResult: String): Unit = { + val label = sentenceLengths.mkString("[", ", ", "]") + + it should s"combine $label" in { + val sentenceStarts = sentenceLengths.scanLeft(0) { case (start, split) => start + split } + assert(sentenceStarts.last == 10) + val sentenceGroups = sentenceStarts.zip(sentenceLengths).map { case (start, length) => + sentences.slice(start, start + length).mkString(separator) + } + // + val trailers = sentenceGroups.zipWithIndex.map { case (sentenceGroup, index) => + if (sentenceGroup.isEmpty || index == 
sentenceGroups.indices.last) "" + else separator + } + val document = processor.mkCombinedDocument(sentenceGroups, trailers, keepText = true) + val actualResult = toString(document) + + actualResult should be(expectedResult) + } + } + + { + val document = processor.mkDocument(sentences.mkString(separator), keepText = true) + val expectedResult = toString(document) + + manySentenceLengths.foreach { sentenceLengths => + test(sentenceLengths, expectedResult) + } + } + + behavior of "dynamically separated texts" + + it should "include separators in both text and words" in { + val text = "I found this text\non a web page." + val separator = "\n" + val texts = text.split(separator) + val dirtyTexts = texts.zipWithIndex.map { case (text, index) => + if (index != texts.indices.last) text + separator + else text + } + val indices = texts.indices + val trailers = indices.map { _ => "" } + val document = processor.mkCombinedDocument(dirtyTexts, trailers, keepText = true) + + document.text.get should be (text) + document.sentences.length should be (indices.length) + + document.sentences.zipWithIndex.foreach { case (sentence, index) => + if (index != indices.last) + sentence.words should contain (separator) + else + sentence.words should not contain (separator) + } + } + + // This is thought to be the standard case. + it should "include separators in text but not words" in { + val text = "I found this text\non a web page." + val separator = "\n" + val texts = text.split(separator) + val indices = texts.indices + val trailers = indices.map { index => if (index != indices.last) separator else "" } + val document = processor.mkCombinedDocument(texts, trailers, keepText = true) + + document.text.get should be (text) + document.sentences.length should be (indices.length) + + document.sentences.foreach { sentence => + sentence.words should not contain(separator) + } + } + + it should "include separators in neither text nor words" in { + val text = "I found this text\non a web page." + val separator = "\n" + val cleanSeparator = " " + val cleanText = text.replace(separator, cleanSeparator) + val texts = text.split(separator) + val indices = texts.indices + val trailers = indices.map { index => if (index != indices.last) cleanSeparator else "" } + val document = processor.mkCombinedDocument(texts, trailers, keepText = true) + + document.text.get should be(cleanText) + document.sentences.length should be(indices.length) + + document.sentences.foreach { sentence => + sentence.words should not contain (separator) + } + } +}