Commit cd34692

#484 Add unit tests for ASCII indexing and partial record parsing.
1 parent 59de02b

File tree

1 file changed (+119, −1)

spark-cobol/src/test/scala/za/co/absa/cobrix/spark/cobol/source/regression/Test18AsciiNulChars.scala

Lines changed: 119 additions & 1 deletion
@@ -189,7 +189,6 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFix
 
     "allow partial records" in {
       withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
-
         val df = spark
           .read
           .format("cobol")
@@ -207,5 +206,124 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFix
         assert(count == 21)
       }
     }
+
+    "allow partial records with indexing" in {
+      withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
+        val expected =
+          """[ {
+            |  "Record_Id" : 0,
+            |  "A" : "1",
+            |  "B" : ""
+            |}, {
+            |  "Record_Id" : 1,
+            |  "A" : "1",
+            |  "B" : "2"
+            |}, {
+            |  "Record_Id" : 2,
+            |  "A" : "1",
+            |  "B" : "23"
+            |}, {
+            |  "Record_Id" : 3,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 4,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 5,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 6,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 7,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 8,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 9,
+            |  "A" : "1",
+            |  "B" : "234"
+            |}, {
+            |  "Record_Id" : 10,
+            |  "A" : "5",
+            |  "B" : "678"
+            |} ]
+            |""".stripMargin
+
+        val df = spark
+          .read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("pedantic", "true")
+          .option("record_format", "D")
+          .option("input_split_records", 2)
+          .option("encoding", "ascii")
+          .option("string_trimming_policy", "keep_all")
+          .option("generate_record_id", "true")
+          .load(tmpFileName)
+          .select("Record_Id", "A", "B")
+          .orderBy("Record_Id")
+
+        val count = df.count()
+        val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))
+
+        assert(count == 11)
+        assertEqualsMultiline(actual, expected)
+      }
+    }
+
+    "don't lose any records" in {
+      val copybook =
+        """      01 ENTITY.
+                   05 A PIC X(3).
+                   05 B PIC X(3).
+        """
+
+      val expected =
+        """[ {
+          |  "Record_Id" : 0,
+          |  "A" : "123",
+          |  "B" : "456"
+          |}, {
+          |  "Record_Id" : 1,
+          |  "A" : "567",
+          |  "B" : "890"
+          |}, {
+          |  "Record_Id" : 2,
+          |  "A" : "123",
+          |  "B" : "456"
+          |}, {
+          |  "Record_Id" : 3,
+          |  "A" : "7"
+          |} ]""".stripMargin
+
+      val text = "123456\n567890\n123456\n7"
+
+      withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
+        val df = spark
+          .read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("pedantic", "true")
+          .option("record_format", "D")
+          .option("input_split_records", 3)
+          .option("generate_record_id", "true")
+          .load(tmpFileName)
+          .select("Record_Id", "A", "B")
+          .orderBy("Record_Id")
+
+        val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))
+
+        assertEqualsMultiline(actual, expected)
+      }
+    }
   }
 }
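For context, the spark-cobol options these tests exercise: "record_format" = "D" treats the input as ASCII text with one record per line, "input_split_records" forces a sparse-index split every N records (so even a tiny file gets multiple partitions, which is what the indexing test relies on), and "generate_record_id" adds the sequential Record_Id column the tests sort on. A minimal standalone read in the same style might look like the sketch below; the object name, input path, and copybook are illustrative assumptions, not part of the commit.

import org.apache.spark.sql.SparkSession

// A minimal sketch of the read pattern under test (hypothetical path and copybook).
object AsciiPartialRecordsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    // Two text fields, mirroring the shape of the test copybooks.
    val copybook =
      """      01 ENTITY.
        |        05 A PIC X(3).
        |        05 B PIC X(3).
        |""".stripMargin

    val df = spark
      .read
      .format("cobol")
      .option("copybook_contents", copybook)
      .option("record_format", "D")            // ASCII text, one record per line
      .option("input_split_records", 3)        // build a sparse index entry every 3 records
      .option("string_trimming_policy", "keep_all")
      .option("generate_record_id", "true")    // adds a sequential Record_Id column
      .load("/data/ascii_records.dat")         // hypothetical input path

    df.orderBy("Record_Id").show()
  }
}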
