@@ -189,7 +189,6 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFix
 
     "allow partial records" in {
       withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
-
         val df = spark
           .read
           .format("cobol")
@@ -207,5 +206,124 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFix
         assert(count == 21)
       }
     }
+
210+ " allow partial records with indexing" in {
211+ withTempTextFile(" ascii_nul" , " .dat" , StandardCharsets .UTF_8 , text) { tmpFileName =>
212+ val expected =
213+ """ [ {
214+ | "Record_Id" : 0,
215+ | "A" : "1",
216+ | "B" : ""
217+ |}, {
218+ | "Record_Id" : 1,
219+ | "A" : "1",
220+ | "B" : "2"
221+ |}, {
222+ | "Record_Id" : 2,
223+ | "A" : "1",
224+ | "B" : "23"
225+ |}, {
226+ | "Record_Id" : 3,
227+ | "A" : "1",
228+ | "B" : "234"
229+ |}, {
230+ | "Record_Id" : 4,
231+ | "A" : "1",
232+ | "B" : "234"
233+ |}, {
234+ | "Record_Id" : 5,
235+ | "A" : "1",
236+ | "B" : "234"
237+ |}, {
238+ | "Record_Id" : 6,
239+ | "A" : "1",
240+ | "B" : "234"
241+ |}, {
242+ | "Record_Id" : 7,
243+ | "A" : "1",
244+ | "B" : "234"
245+ |}, {
246+ | "Record_Id" : 8,
247+ | "A" : "1",
248+ | "B" : "234"
249+ |}, {
250+ | "Record_Id" : 9,
251+ | "A" : "1",
252+ | "B" : "234"
253+ |}, {
254+ | "Record_Id" : 10,
255+ | "A" : "5",
256+ | "B" : "678"
257+ |} ]
258+ |""" .stripMargin
259+
260+ val df = spark
261+ .read
262+ .format(" cobol" )
263+ .option(" copybook_contents" , copybook)
264+ .option(" pedantic" , " true" )
265+ .option(" record_format" , " D" )
266+ .option(" input_split_records" , 2 )
267+ .option(" encoding" , " ascii" )
268+ .option(" string_trimming_policy" , " keep_all" )
269+ .option(" generate_record_id" , " true" )
270+ .load(tmpFileName)
271+ .select(" Record_Id" , " A" , " B" )
272+ .orderBy(" Record_Id" )
273+
274+ val count = df.count()
275+ val actual = SparkUtils .prettyJSON(df.toJSON.collect().mkString(" [" , " ," , " ]" ))
276+
277+ assert(count == 11 )
278+ assertEqualsMultiline(actual, expected)
279+ }
280+ }
+
+    "don't lose any records" in {
+      val copybook =
+        """      01 ENTITY.
+                 05 A PIC X(3).
+                 05 B PIC X(3).
+        """
+
+      val expected =
+        """[ {
+          |  "Record_Id" : 0,
+          |  "A" : "123",
+          |  "B" : "456"
+          |}, {
+          |  "Record_Id" : 1,
+          |  "A" : "567",
+          |  "B" : "890"
+          |}, {
+          |  "Record_Id" : 2,
+          |  "A" : "123",
+          |  "B" : "456"
+          |}, {
+          |  "Record_Id" : 3,
+          |  "A" : "7"
+          |} ]""".stripMargin
+
+      val text = "123456\n567890\n123456\n7"
+
+      withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
+        val df = spark
+          .read
+          .format("cobol")
+          .option("copybook_contents", copybook)
+          .option("pedantic", "true")
+          .option("record_format", "D")
+          .option("input_split_records", 3)
+          .option("generate_record_id", "true")
+          .load(tmpFileName)
+          .select("Record_Id", "A", "B")
+          .orderBy("Record_Id")
+
+        val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))
+
+        assertEqualsMultiline(actual, expected)
+      }
+    }
+
   }
 }