diff --git a/main/src/main/scala/org/clulab/processors/Document.scala b/main/src/main/scala/org/clulab/processors/Document.scala
index 140bc2f49..3b71d5813 100644
--- a/main/src/main/scala/org/clulab/processors/Document.scala
+++ b/main/src/main/scala/org/clulab/processors/Document.scala
@@ -184,7 +184,36 @@ class Document(val sentences: Array[Sentence]) extends Serializable {
}
}
})
+ }
+
+ protected def replaceSentences(sentences: Array[Sentence]): Document = {
+ val newDocument = new Document(sentences)
+
+ newDocument.id = id
+ newDocument.text = text
+
+ require(newDocument.coreferenceChains.isEmpty)
+ require(coreferenceChains.isEmpty)
+
+ getAttachmentKeys.foreach { attachmentKey =>
+ require(newDocument.getAttachment(attachmentKey).forall(_ == getAttachment(attachmentKey).get))
+ newDocument.addAttachment(attachmentKey, getAttachment(attachmentKey).get)
+ }
+ val dctOpt = getDCT
+ dctOpt.foreach(newDocument.setDCT)
+
+ newDocument
+ }
+
+ def offset(offset: Int): Document = {
+ if (offset == 0) this
+ else {
+ val offsetSentences = sentences.map(_.offset(offset))
+ val newDocument = replaceSentences(offsetSentences)
+
+ newDocument
+ }
}
}
diff --git a/main/src/main/scala/org/clulab/processors/Processor.scala b/main/src/main/scala/org/clulab/processors/Processor.scala
index 6980a9904..52af1288b 100644
--- a/main/src/main/scala/org/clulab/processors/Processor.scala
+++ b/main/src/main/scala/org/clulab/processors/Processor.scala
@@ -12,6 +12,69 @@ trait Processor {
/** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */
def mkDocument (text:String, keepText:Boolean = false): Document
+ // The documents here were created with Processor.mkDocument, which could have created a subclassed
+ // Document or documents with certain fields already filled in. This implementation only handles
+ // known document fields and then only performs rudimentary requirement checks to make sure that
+ // the documents are compatible for combination. In more complicated situations it would be necessary
+ // to override this method in the Processor subclass.
+ protected def combineDocuments(documents: IndexedSeq[Document], combinedTextOpt: Option[String]): Document = {
+ require(documents.length > 1)
+ val headDocument = documents.head
+ val tailDocuments = documents.tail
+ val combinedSentences = documents.flatMap(_.sentences).toArray
+ val combinedDocument = new Document(combinedSentences)
+
+ val headId = headDocument.id
+ require(tailDocuments.forall(_.id == headId))
+ combinedDocument.id = headId
+
+ require(combinedDocument.text.isEmpty)
+ combinedDocument.text = combinedTextOpt
+
+ // Coreference chains involve Mentions that include references to documents. The Mentions are being
+ // moved to a new Document and it would be infeasible to move the chains.
+ require(combinedDocument.coreferenceChains.isEmpty)
+ require(documents.forall(_.coreferenceChains.isEmpty))
+
+ documents.foreach { document =>
+ document.getAttachmentKeys.foreach { attachmentKey =>
+ require(combinedDocument.getAttachment(attachmentKey).forall(_ == document.getAttachment(attachmentKey).get))
+ combinedDocument.addAttachment(attachmentKey, document.getAttachment(attachmentKey).get)
+ }
+ }
+
+ val headDctOpt = headDocument.getDCT
+ require(documents.tail.forall(_.getDCT == headDctOpt))
+ headDctOpt.foreach(combinedDocument.setDCT)
+ combinedDocument
+ }
+
+ def mkCombinedDocument(texts: IndexedSeq[String], trailers: IndexedSeq[String], keepText: Boolean = false): Document = {
+ require(texts.length == trailers.length)
+ texts.length match {
+ case 0 => mkDocument("", keepText)
+ case 1 => mkDocument(texts.head, keepText)
+ case _ =>
+ val documents = texts.map(mkDocument(_, keepText))
+ val offsets = texts.zip(trailers).scanLeft(0) { case (offset, (text, trailer)) => offset + text.length + trailer.length }
+ val offsetDocuments = documents.zip(offsets).map { case (document, offset) =>
+ document.offset(offset)
+ }
+ val combinedTextOpt =
+ if (keepText) {
+ val combinedText = texts.zip(trailers).foldLeft(new StringBuilder) { case (stringBuilder, (text, separator)) =>
+ stringBuilder.append(text).append(separator)
+ }.toString
+
+ Some(combinedText)
+ }
+ else None
+ val combinedDocument = combineDocuments(offsetDocuments, combinedTextOpt)
+
+ combinedDocument
+ }
+ }
+
/** Constructs a document of tokens from an array of untokenized sentences. */
def mkDocumentFromSentences (sentences:Iterable[String],
keepText:Boolean = false,
diff --git a/main/src/main/scala/org/clulab/processors/Sentence.scala b/main/src/main/scala/org/clulab/processors/Sentence.scala
index b5f8f4581..1787ba1a1 100644
--- a/main/src/main/scala/org/clulab/processors/Sentence.scala
+++ b/main/src/main/scala/org/clulab/processors/Sentence.scala
@@ -5,7 +5,6 @@ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree}
import org.clulab.struct.GraphMap._
import org.clulab.utils.SeqUtils
-import scala.collection.immutable.Range
import scala.collection.mutable
import scala.util.hashing.MurmurHash3._
@@ -174,6 +173,25 @@ class Sentence(
reverted
}
+
+ def offset(offset: Int): Sentence = {
+ if (offset == 0) this
+ else {
+ val newStartOffsets = startOffsets.map(_ + offset).toArray
+ val newEndOffsets = endOffsets.map(_ + offset).toArray
+ val newSentence = Sentence(raw, newStartOffsets, newEndOffsets, words)
+
+ newSentence.tags = tags
+ newSentence.lemmas = lemmas
+ newSentence.entities = entities
+ newSentence.norms = norms
+ newSentence.chunks = chunks
+ newSentence.syntacticTree = syntacticTree
+ newSentence.graphs = graphs
+ newSentence.relations = relations
+ newSentence
+ }
+ }
}
object Sentence {
diff --git a/main/src/test/resources/org/clulab/processors/sentences10.txt b/main/src/test/resources/org/clulab/processors/sentences10.txt
new file mode 100644
index 000000000..d4aa03bec
--- /dev/null
+++ b/main/src/test/resources/org/clulab/processors/sentences10.txt
@@ -0,0 +1,10 @@
+Needed lines of action will be decided on by representatives of some 50 nations .
+Scarcity , not only of foodstuffs but of lumber and other forest products , textiles , seeds , fertilizers , draught power , and farm equipment will continue throughout most of Europe and Asia during the coming year .
+Hopes of continued recovery in Europe 's indigenous food supplies were checked by last winter 's bad weather .
+Diets in Western and Central Europe will be still lower next year , and in Asia they will remain at present very low levels , unless imports can be increased .
+Even to hold the present line will require drastic action .
+Minimum import needs for Europe , North Africa , and Asia in 1947/48 may be estimated at 34 to 38 million tons without allowing for any improvement in bread rations , any additional livestock feeding , or any increase in working reserves .
+Against this need , supplies of grain available for export from the surplus countries may be tentatively estimated at 30 to 34 million tons .
+Even with somewhat larger supplies of certain other foods particularly potatoes , sugar , and fats the situation will continue to be grim .
+Cessation of UNRRA activities and accumulated foreign exchange difficulties worsen the problem for nations in a weak bargaining position .
+Every delay in improving this situation further impairs the working ability of labour , slows up reconstruction , adds to the physical damage caused by prolonged undernourishment , and accelerates social unrest .
diff --git a/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala
new file mode 100644
index 000000000..3ce3f6edc
--- /dev/null
+++ b/main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala
@@ -0,0 +1,130 @@
+package org.clulab.processors
+
+import org.clulab.processors.clu.CluProcessor
+import org.clulab.serialization.DocumentSerializer
+import org.clulab.utils.Closer.AutoCloser
+import org.clulab.utils.{Sourcer, Test}
+
+import java.io.{PrintWriter, StringWriter}
+
+class TestMkCombinedDocument extends Test {
+ val sentences = Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt").autoClose { source =>
+ source.getLines.toArray
+ }
+ val manySentenceLengths = Array(
+ Array(1, 9),
+ Array(9, 1),
+ Array(1, 1, 8),
+ Array(1, 8, 1),
+ Array(8, 1, 1),
+ Array(5, 5),
+ Array(2, 2, 2, 2, 2),
+ Array(1, 2, 3, 4),
+ Array(4, 3, 2, 1),
+ Array(0, 5, 0, 5)
+ )
+ val separator = " "
+ val documentSerializer = new DocumentSerializer()
+ val processor = new CluProcessor()
+
+ def toString(document: Document): String = {
+ val stringWriter = new StringWriter()
+
+ new PrintWriter(stringWriter).autoClose { printWriter =>
+ documentSerializer.save(document, printWriter, keepText = true)
+ }
+ stringWriter.toString
+ }
+
+ behavior of "mkCombinedDocument"
+
+ def test(sentenceLengths: Array[Int], expectedResult: String): Unit = {
+ val label = sentenceLengths.mkString("[", ", ", "]")
+
+ it should s"combine $label" in {
+ val sentenceStarts = sentenceLengths.scanLeft(0) { case (start, split) => start + split }
+ assert(sentenceStarts.last == 10)
+ val sentenceGroups = sentenceStarts.zip(sentenceLengths).map { case (start, length) =>
+ sentences.slice(start, start + length).mkString(separator)
+ }
+      // Append a separator after every non-empty group except the last, so the
+      // recombined text matches the original single-document text exactly.
+ val trailers = sentenceGroups.zipWithIndex.map { case (sentenceGroup, index) =>
+ if (sentenceGroup.isEmpty || index == sentenceGroups.indices.last) ""
+ else separator
+ }
+ val document = processor.mkCombinedDocument(sentenceGroups, trailers, keepText = true)
+ val actualResult = toString(document)
+
+ actualResult should be(expectedResult)
+ }
+ }
+
+ {
+ val document = processor.mkDocument(sentences.mkString(separator), keepText = true)
+ val expectedResult = toString(document)
+
+ manySentenceLengths.foreach { sentenceLengths =>
+ test(sentenceLengths, expectedResult)
+ }
+ }
+
+ behavior of "dynamically separated texts"
+
+ it should "include separators in both text and words" in {
+    val text = "I found this text\non a web page."
+    val separator = "\n"
+ val texts = text.split(separator)
+ val dirtyTexts = texts.zipWithIndex.map { case (text, index) =>
+ if (index != texts.indices.last) text + separator
+ else text
+ }
+ val indices = texts.indices
+ val trailers = indices.map { _ => "" }
+ val document = processor.mkCombinedDocument(dirtyTexts, trailers, keepText = true)
+
+ document.text.get should be (text)
+ document.sentences.length should be (indices.length)
+
+ document.sentences.zipWithIndex.foreach { case (sentence, index) =>
+ if (index != indices.last)
+ sentence.words should contain (separator)
+ else
+ sentence.words should not contain (separator)
+ }
+ }
+
+ // This is thought to be the standard case.
+ it should "include separators in text but not words" in {
+    val text = "I found this text\non a web page."
+    val separator = "\n"
+ val texts = text.split(separator)
+ val indices = texts.indices
+ val trailers = indices.map { index => if (index != indices.last) separator else "" }
+ val document = processor.mkCombinedDocument(texts, trailers, keepText = true)
+
+ document.text.get should be (text)
+ document.sentences.length should be (indices.length)
+
+ document.sentences.foreach { sentence =>
+ sentence.words should not contain(separator)
+ }
+ }
+
+ it should "include separators in neither text nor words" in {
+    val text = "I found this text\non a web page."
+    val separator = "\n"
+ val cleanSeparator = " "
+ val cleanText = text.replace(separator, cleanSeparator)
+ val texts = text.split(separator)
+ val indices = texts.indices
+ val trailers = indices.map { index => if (index != indices.last) cleanSeparator else "" }
+ val document = processor.mkCombinedDocument(texts, trailers, keepText = true)
+
+ document.text.get should be(cleanText)
+ document.sentences.length should be(indices.length)
+
+ document.sentences.foreach { sentence =>
+ sentence.words should not contain (separator)
+ }
+ }
+}