Commit 10d188a

#397 Skip empty lines when reading record sequence text files

1 parent 050be67

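In short: TextRecordExtractor now pre-fetches the next record and skips lines that consist only of CR/LF bytes, IndexGenerator stops indexing as soon as the underlying stream is exhausted, and a new regression test covers ASCII CRLF text files that contain empty lines.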
File tree

3 files changed: 208 additions, 37 deletions

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/extractors/raw/TextRecordExtractor.scala

68 additions, 35 deletions:
```diff
@@ -26,78 +26,111 @@ import java.util
   */
 class TextRecordExtractor(ctx: RawRecordContext) extends Serializable with RawRecordExtractor {
   private val maxRecordSize = ctx.copybook.getRecordSize + 2
-  private val bytes = new Array[Byte](maxRecordSize)
-  private var bytesSize = 0
+  private val pendingBytes = new Array[Byte](maxRecordSize)
+  private var pendingBytesSize = 0
+  private var recordBytes: Option[Array[Byte]] = None
+  private var curRecordSize = 0
   private var lastFooterSize = 1
 
-  override def hasNext: Boolean = !ctx.inputStream.isEndOfStream || bytesSize > 0
+  override def hasNext: Boolean = {
+    if (recordBytes.isEmpty) {
+      ensureBytesRead(maxRecordSize)
+      fetchNextRecord()
+    }
+
+    recordBytes.get.length > 0
+  }
 
   override def next(): Array[Byte] = {
     if (!hasNext) {
       throw new NoSuchElementException
     }
-    ensureBytesRead(maxRecordSize)
-    findEol()
+    val bytesToReturn = recordBytes.get
+    curRecordSize = 0
+    recordBytes = None
+    bytesToReturn
   }
 
-  override def offset: Long = ctx.inputStream.offset - bytesSize
+  override def offset: Long = ctx.inputStream.offset - pendingBytesSize - curRecordSize
+
+  private def ensureBytesRead(numOfBytes: Int): Unit = {
+    val bytesToRead = numOfBytes - pendingBytesSize
+    if (bytesToRead > 0) {
+      val newBytes = ctx.inputStream.next(bytesToRead)
+      if (newBytes.length > 0) {
+        System.arraycopy(newBytes, 0, pendingBytes, pendingBytesSize, newBytes.length)
+        pendingBytesSize = pendingBytesSize + newBytes.length
+      }
+    }
+  }
 
-  private def findEol(): Array[Byte] = {
+  private def skipEmptyLines(): Unit = {
+    var i = 0
+    while (i < pendingBytesSize && (pendingBytes(i) == 0x0D || pendingBytes(i) == 0x0A)) {
+      i += 1
+    }
+    if (i > 0) {
+      advanceArray(i)
+      ensureBytesRead(maxRecordSize)
+    }
+  }
+
+  private def findNextNonEmptyRecord(): (Int, Int) = {
     var recordLength = 0
     var recordPayload = 0
-
     var i = 0
-    while (recordLength == 0 && i < bytesSize) {
-      if (bytes(i) == 0x0D) {
-        if (i + 1 < maxRecordSize && bytes(i + 1) == 0x0A) {
+
+    while (recordLength == 0 && i < pendingBytesSize) {
+      if (pendingBytes(i) == 0x0D) {
+        if (i + 1 < maxRecordSize && pendingBytes(i + 1) == 0x0A) {
           recordLength = i + 2
           recordPayload = i
         }
-      } else if (bytes(i) == 0x0A) {
+      } else if (pendingBytes(i) == 0x0A) {
         recordLength = i + 1
         recordPayload = i
       }
       i += 1
     }
+    (recordLength, recordPayload)
+  }
 
-    val record = if (recordLength > 0) {
-      bytes.take(recordPayload)
+  private def fetchNextRecord(): Unit = {
+    skipEmptyLines()
+
+    var (recordLength, recordPayload) = findNextNonEmptyRecord()
+
+    recordBytes = if (recordLength > 0) {
+      curRecordSize = recordLength
+      Some(pendingBytes.take(recordPayload))
     } else {
       // Last record or a record is too large?
       // In the latter case
       if (ctx.inputStream.isEndOfStream) {
         // Last record
-        recordLength = bytesSize
-        recordPayload = bytesSize
+        recordLength = pendingBytesSize
+        recordPayload = pendingBytesSize
       } else {
         // This is an errors situation - no line breaks between records
         // Return a record worth of data minus line break.
-        recordLength = bytesSize - lastFooterSize
-        recordPayload = bytesSize - lastFooterSize
+        recordLength = pendingBytesSize - lastFooterSize
+        recordPayload = pendingBytesSize - lastFooterSize
       }
-      bytes.take(recordLength)
-    }
-
-    if (bytesSize > recordLength) {
-      System.arraycopy(bytes, recordLength, bytes, 0, bytesSize - recordLength)
+      curRecordSize = recordLength
+      Some(pendingBytes.take(recordLength))
     }
-    bytesSize -= recordLength
 
-    util.Arrays.fill(bytes, bytesSize, maxRecordSize, 0.toByte)
+    advanceArray(recordLength)
 
     lastFooterSize = recordLength - recordPayload
-
-    record
   }
 
-  private def ensureBytesRead(numOfBytes: Int): Unit = {
-    val bytesToRead = numOfBytes - bytesSize
-    if (bytesToRead > 0) {
-      val newBytes = ctx.inputStream.next(bytesToRead)
-      if (newBytes.length > 0) {
-        System.arraycopy(newBytes, 0, bytes, bytesSize, newBytes.length)
-        bytesSize = numOfBytes
-      }
+  private def advanceArray(recordLength: Int): Unit = {
+    if (pendingBytesSize > recordLength) {
+      System.arraycopy(pendingBytes, recordLength, pendingBytes, 0, pendingBytesSize - recordLength)
     }
+    pendingBytesSize -= recordLength
+
+    util.Arrays.fill(pendingBytes, pendingBytesSize, maxRecordSize, 0.toByte)
   }
 }
```
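To make the new control flow easier to follow, here is a minimal, self-contained sketch (not taken from the commit) of the core idea behind skipEmptyLines: leading CR (0x0D) and LF (0x0A) bytes are consumed before the extractor searches for the next record's line ending. The names `buf` and `size` are illustrative stand-ins for the extractor's pendingBytes and pendingBytesSize.

```scala
// Illustrative sketch of the skip-empty-lines scan; not the actual Cobrix API.
object SkipEmptyLinesSketch {
  /** Returns how many leading bytes belong to empty lines (bare CR/LF runs). */
  def countLeadingLineBreaks(buf: Array[Byte], size: Int): Int = {
    var i = 0
    while (i < size && (buf(i) == 0x0D || buf(i) == 0x0A)) {
      i += 1
    }
    i
  }

  def main(args: Array[String]): Unit = {
    // Two empty CRLF lines followed by the record "fd" and its CRLF terminator.
    val buf = Array[Byte](0x0D, 0x0A, 0x0D, 0x0A, 'f'.toByte, 'd'.toByte, 0x0D, 0x0A)
    println(countLeadingLineBreaks(buf, buf.length)) // prints 4
  }
}
```

In the extractor itself, the skipped bytes are then dropped from the front of the buffer (advanceArray) and the buffer is refilled (ensureBytesRead) so that a full record is always available for the line-break search.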

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/reader/index/IndexGenerator.scala

2 additions, 2 deletions:
```diff
@@ -80,10 +80,10 @@ object IndexGenerator {
         record = dataStream.next(recordMetadata.recordLength)
       }
       val recordSize = dataStream.offset - byteIndex
-      val hasMoreRecords = recordSize > 0
+      val hasMoreRecords = recordSize > 0 && !dataStream.isEndOfStream
       (recordSize, recordMetadata.isValid, hasMoreRecords)
     }
-    if (dataStream.isEndOfStream || !hasMoreRecords) {
+    if (!hasMoreRecords) {
       endOfFileReached = true
     } else {
       if (isValid) {
```
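The indexer change is subtle: with a pre-fetching extractor, the underlying stream can already be at end-of-stream when the final record is produced, so `recordSize > 0` alone is no longer a reliable "more records follow" signal. A rough sketch of the loop shape, against a hypothetical stream trait (not Cobrix's actual classes):

```scala
// Hypothetical stream interface used only for this sketch.
trait ByteStreamLike {
  def next(numberOfBytes: Int): Array[Byte]
  def offset: Long
  def isEndOfStream: Boolean
}

object IndexLoopSketch {
  def countIndexEntries(stream: ByteStreamLike, recordLength: Int): Int = {
    var entries = 0
    var hasMoreRecords = true
    while (hasMoreRecords) {
      val byteIndex = stream.offset
      stream.next(recordLength)
      val recordSize = stream.offset - byteIndex
      if (recordSize > 0) entries += 1
      // Mirrors the fixed condition: the pass that consumed the stream's last
      // bytes is treated as the final one instead of probing the exhausted
      // stream once more.
      hasMoreRecords = recordSize > 0 && !stream.isEndOfStream
    }
    entries
  }
}
```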
spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test13AsciiCrLfText.scala

138 additions, 0 deletions (new file):
```scala
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.cobrix.spark.cobol.source.regression

import org.scalatest.WordSpec
import org.slf4j.{Logger, LoggerFactory}
import za.co.absa.cobrix.spark.cobol.source.base.{SimpleComparisonBase, SparkTestBase}
import za.co.absa.cobrix.spark.cobol.source.fixtures.BinaryFileFixture

class Test13AsciiCrLfText extends WordSpec with SparkTestBase with BinaryFileFixture with SimpleComparisonBase {

  private implicit val logger: Logger = LoggerFactory.getLogger(this.getClass)

  private val copybook =
    """         01  ENTITY.
           05  A    PIC X(2).
    """

  val binFileContents: Array[Byte] = Array[Byte](
    // 0
    0x66.toByte, 0x64.toByte, 0x0D.toByte, 0x0A.toByte,
    // 1
    0x68.toByte, 0x64.toByte, 0x0D.toByte, 0x0A.toByte,
    // 2 - empty line
    0x0D.toByte, 0x0A.toByte,
    // 3
    0x73.toByte, 0x64.toByte, 0x0D.toByte, 0x0A.toByte,
    // 4 - empty line
    0x0D.toByte, 0x0A.toByte
  )

  val emptyFileContents: Array[Byte] = Array[Byte](
    // 0 - empty line
    0x0D.toByte, 0x0A.toByte,
    // 1 - empty line
    0x0D.toByte, 0x0A.toByte
  )

  "Test ASCII CRLF text file" should {
    "correctly identify empty lines when read as a text file" in {
      withTempBinFile("crlf", ".dat", binFileContents) { tmpFileName =>
        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("is_text", "true")
          .option("encoding", "ascii")
          .option("schema_retention_policy", "collapse_root")
          .load(tmpFileName)

        val expected = """[{"A":"fd"},{"A":"hd"},{"A":"sd"}]"""

        val count = df.count()
        val actual = df.toJSON.collect().mkString("[", ",", "]")

        assert(count == 3)
        assertEqualsMultiline(actual, expected)
      }
    }

    "correctly identify empty lines when read as a record sequence" in {
      withTempBinFile("crlf", ".dat", binFileContents) { tmpFileName =>
        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("is_record_sequence", "true")
          .option("is_text", "true")
          .option("encoding", "ascii")
          .option("schema_retention_policy", "collapse_root")
          .load(tmpFileName)

        val expected = """[{"A":"fd"},{"A":"hd"},{"A":"sd"}]"""

        val count = df.count()
        val actual = df.toJSON.collect().mkString("[", ",", "]")

        assert(count == 3)
        assertEqualsMultiline(actual, expected)
      }
    }
  }

  "Test empty ASCII CRLF text file" should {
    "correctly identify empty lines when read as a text file" in {
      withTempBinFile("crlf_empty", ".dat", emptyFileContents) { tmpFileName =>
        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("is_text", "true")
          .option("encoding", "ascii")
          .option("schema_retention_policy", "collapse_root")
          .load(tmpFileName)

        val count = df.count()

        assert(count == 0)
      }
    }

    "correctly identify empty lines when read as a record sequence" in {
      withTempBinFile("crlf_empty", ".dat", emptyFileContents) { tmpFileName =>
        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("is_record_sequence", "true")
          .option("is_text", "true")
          .option("encoding", "ascii")
          .option("schema_retention_policy", "collapse_root")
          .load(tmpFileName)

        val count = df.count()

        assert(count == 0)
      }
    }
  }
}
```
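For context, a standalone job would read such a file with the same options the test exercises. A minimal sketch, assuming a local SparkSession, spark-cobol on the classpath, and an illustrative file path:

```scala
import org.apache.spark.sql.SparkSession

object ReadCrLfTextExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ReadCrLfTextExample")
      .master("local[*]") // illustrative; any master works
      .getOrCreate()

    val copybook =
      """         01  ENTITY.
        |           05  A    PIC X(2).
        |""".stripMargin

    val df = spark.read
      .format("cobol")
      .option("copybook_contents", copybook)
      .option("is_record_sequence", "true")
      .option("is_text", "true")
      .option("encoding", "ascii")
      .option("schema_retention_policy", "collapse_root")
      .load("/path/to/crlf_text.dat") // illustrative path

    // With this commit, empty CRLF lines are skipped instead of being
    // parsed as blank records.
    df.show()
  }
}
```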
