Skip to content

Commit 050be67

Browse files
authored
Merge pull request #400 from AbsaOSS/feature/372-add-improve-null-detection-option
Feature/372 add improve null detection option
2 parents fc3bbd9 + 14c1439 commit 050be67

File tree

21 files changed

+473
-277
lines changed

21 files changed

+473
-277
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,7 @@ Again, the full example is available at
11131113
| .option("floating_point_format", "IBM") | Specifies a floating-point format. Available options: `IBM` (default), `IEEE754`, `IBM_little_endian`, `IEEE754_little_endian`. |
11141114
| .option("variable_size_occurs", "false") | If `false` (default) fields that have `OCCURS 0 TO 100 TIMES DEPENDING ON` clauses always have the same size corresponding to the maximum array size (e.g. 100 in this example). If set to `true` the size of the field will shrink for each field that has less actual elements. |
11151115
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
1116+
| .option("improved_null_detection", "false") | If `true`, values that contain only 0x0 for DISPLAY strings and numbers will be considered `null`s instead of empty strings. |
11161117

11171118
##### Modifier options
11181119

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala

Lines changed: 62 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -78,21 +78,22 @@ object CopybookParser {
7878
/**
7979
* Tokenizes a Cobol Copybook contents and returns the AST.
8080
*
81-
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
82-
* @param copyBookContents A string containing all lines of a copybook
83-
* @param dropGroupFillers Drop groups marked as fillers from the output AST
84-
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
85-
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
86-
* resolving segment redefines.
87-
* @param fieldParentMap A segment fields parent mapping
88-
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
89-
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
90-
* @param ebcdicCodePage A code page for EBCDIC encoded data
91-
* @param asciiCharset A charset for ASCII encoded data
92-
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
93-
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
94-
* @param nonTerminals A list of non-terminals that should be extracted as strings
95-
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
81+
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
82+
* @param copyBookContents A string containing all lines of a copybook
83+
* @param dropGroupFillers Drop groups marked as fillers from the output AST
84+
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
85+
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
86+
* resolving segment redefines.
87+
* @param fieldParentMap A segment fields parent mapping
88+
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
89+
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
90+
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
91+
* @param ebcdicCodePage A code page for EBCDIC encoded data
92+
* @param asciiCharset A charset for ASCII encoded data
93+
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
94+
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
95+
* @param nonTerminals A list of non-terminals that should be extracted as strings
96+
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
9697
* @return Seq[Group] where a group is a record inside the copybook
9798
*/
9899
def parse(copyBookContents: String,
@@ -103,6 +104,7 @@ object CopybookParser {
103104
fieldParentMap: Map[String, String] = HashMap[String, String](),
104105
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
105106
commentPolicy: CommentPolicy = CommentPolicy(),
107+
improvedNullDetection: Boolean = false,
106108
ebcdicCodePage: CodePage = new CodePageCommon,
107109
asciiCharset: Charset = StandardCharsets.US_ASCII,
108110
isUtf16BigEndian: Boolean = true,
@@ -118,6 +120,7 @@ object CopybookParser {
118120
fieldParentMap,
119121
stringTrimmingPolicy,
120122
commentPolicy,
123+
improvedNullDetection,
121124
ebcdicCodePage,
122125
asciiCharset,
123126
isUtf16BigEndian,
@@ -130,19 +133,20 @@ object CopybookParser {
130133
/**
131134
* Tokenizes a Cobol Copybook contents and returns the AST.
132135
*
133-
* @param copyBookContents A string containing all lines of a copybook
134-
* @param dropGroupFillers Drop groups marked as fillers from the output AST
135-
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
136-
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
137-
* @param fieldParentMap A segment fields parent mapping
138-
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
139-
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
140-
* @param ebcdicCodePage A code page for EBCDIC encoded data
141-
* @param asciiCharset A charset for ASCII encoded data
142-
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
143-
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
144-
* @param nonTerminals A list of non-terminals that should be extracted as strings
145-
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
136+
* @param copyBookContents A string containing all lines of a copybook
137+
* @param dropGroupFillers Drop groups marked as fillers from the output AST
138+
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
139+
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
140+
* @param fieldParentMap A segment fields parent mapping
141+
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
142+
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
143+
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
144+
* @param ebcdicCodePage A code page for EBCDIC encoded data
145+
* @param asciiCharset A charset for ASCII encoded data
146+
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
147+
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
148+
* @param nonTerminals A list of non-terminals that should be extracted as strings
149+
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
146150
* @return Seq[Group] where a group is a record inside the copybook
147151
*/
148152
def parseTree(copyBookContents: String,
@@ -152,6 +156,7 @@ object CopybookParser {
152156
fieldParentMap: Map[String, String] = HashMap[String, String](),
153157
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
154158
commentPolicy: CommentPolicy = CommentPolicy(),
159+
improvedNullDetection: Boolean = false,
155160
ebcdicCodePage: CodePage = new CodePageCommon,
156161
asciiCharset: Charset = StandardCharsets.US_ASCII,
157162
isUtf16BigEndian: Boolean = true,
@@ -167,6 +172,7 @@ object CopybookParser {
167172
fieldParentMap,
168173
stringTrimmingPolicy,
169174
commentPolicy,
175+
improvedNullDetection,
170176
ebcdicCodePage,
171177
asciiCharset,
172178
isUtf16BigEndian,
@@ -179,21 +185,22 @@ object CopybookParser {
179185
/**
180186
* Tokenizes a Cobol Copybook contents and returns the AST.
181187
*
182-
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
183-
* @param copyBookContents A string containing all lines of a copybook
184-
* @param dropGroupFillers Drop groups marked as fillers from the output AST
185-
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
186-
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
187-
* resolving segment redefines.
188-
* @param fieldParentMap A segment fields parent mapping
189-
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
190-
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
191-
* @param ebcdicCodePage A code page for EBCDIC encoded data
192-
* @param asciiCharset A charset for ASCII encoded data
193-
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
194-
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
195-
* @param nonTerminals A list of non-terminals that should be extracted as strings
196-
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
188+
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
189+
* @param copyBookContents A string containing all lines of a copybook
190+
* @param dropGroupFillers Drop groups marked as fillers from the output AST
191+
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
192+
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
193+
* resolving segment redefines.
194+
* @param fieldParentMap A segment fields parent mapping
195+
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
196+
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
197+
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
198+
* @param ebcdicCodePage A code page for EBCDIC encoded data
199+
* @param asciiCharset A charset for ASCII encoded data
200+
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
201+
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
202+
* @param nonTerminals A list of non-terminals that should be extracted as strings
203+
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
197204
* @return Seq[Group] where a group is a record inside the copybook
198205
*/
199206
@throws(classOf[SyntaxErrorException])
@@ -205,6 +212,7 @@ object CopybookParser {
205212
fieldParentMap: Map[String, String],
206213
stringTrimmingPolicy: StringTrimmingPolicy,
207214
commentPolicy: CommentPolicy,
215+
improvedNullDetection: Boolean,
208216
ebcdicCodePage: CodePage,
209217
asciiCharset: Charset,
210218
isUtf16BigEndian: Boolean,
@@ -213,7 +221,7 @@ object CopybookParser {
213221
occursHandlers: Map[String, Map[String, Int]],
214222
debugFieldsPolicy: DebugFieldsPolicy): Copybook = {
215223

216-
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
224+
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, improvedNullDetection, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
217225

218226
val nonTerms: Set[String] = (for (id <- nonTerminals)
219227
yield transformIdentifier(id)
@@ -232,7 +240,7 @@ object CopybookParser {
232240
processGroupFillers(
233241
markDependeeFields(
234242
addNonTerminals(
235-
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
243+
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
236244
occursHandlers
237245
), dropValueFillers
238246
), dropGroupFillers, dropValueFillers
@@ -249,7 +257,7 @@ object CopybookParser {
249257
renameGroupFillers(
250258
markDependeeFields(
251259
addNonTerminals(
252-
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
260+
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
253261
occursHandlers
254262
),
255263
dropGroupFillers, dropValueFillers
@@ -267,7 +275,8 @@ object CopybookParser {
267275
ebcdicCodePage: CodePage,
268276
asciiCharset: Charset,
269277
isUtf16BigEndian: Boolean,
270-
floatingPointFormat: FloatingPointFormat
278+
floatingPointFormat: FloatingPointFormat,
279+
improvedNullDetection: Boolean
271280
): CopybookAST = {
272281

273282
def getNonTerminalName(name: String, parent: Group): String = {
@@ -292,11 +301,11 @@ object CopybookParser {
292301
case g: Group =>
293302
if (nonTerminals contains g.name) {
294303
newChildren.append(
295-
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat).copy(isRedefined = true)(g.parent)
304+
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection).copy(isRedefined = true)(g.parent)
296305
)
297306
val sz = g.binaryProperties.actualSize
298307
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
299-
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
308+
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
300309
val newName = getNonTerminalName(g.name, g.parent.get)
301310
newChildren.append(
302311
Primitive(
@@ -310,7 +319,7 @@ object CopybookParser {
310319
}
311320
else
312321
newChildren.append(
313-
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
322+
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
314323
)
315324
}
316325
}
@@ -840,7 +849,7 @@ object CopybookParser {
840849
* <li>Remove all groups that don't have child nodes.</li>
841850
* </ul>
842851
*
843-
* @param ast An AST as a set of copybook records
852+
* @param ast An AST as a set of copybook records
844853
* @param dropValueFillers is there intention to drop primitive fields fillers
845854
* @return The same AST with group fillers processed
846855
*/
@@ -919,8 +928,8 @@ object CopybookParser {
919928
val newGrp = processGroup(grp)
920929
newChildren += newGrp
921930
case st: Primitive =>
922-
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
923-
newChildren += getDebugField(st)
931+
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
932+
newChildren += getDebugField(st)
924933
}
925934
group.withUpdatedChildren(newChildren)
926935
}

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,12 @@ object ANTLRParser {
5656
enc: Encoding,
5757
stringTrimmingPolicy: StringTrimmingPolicy,
5858
commentPolicy: CommentPolicy,
59+
improvedNullDetection: Boolean,
5960
ebcdicCodePage: CodePage,
6061
asciiCharset: Charset,
6162
isUtf16BigEndian: Boolean,
6263
floatingPointFormat: FloatingPointFormat): CopybookAST = {
63-
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
64+
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
6465

6566
val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
6667
line =>

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class ParserVisitor(enc: Encoding,
4545
ebcdicCodePage: CodePage,
4646
asciiCharset: Charset,
4747
isUtf16BigEndian: Boolean,
48-
floatingPointFormat: FloatingPointFormat) extends copybookParserBaseVisitor[Expr] {
48+
floatingPointFormat: FloatingPointFormat,
49+
improvedNullDetection: Boolean) extends copybookParserBaseVisitor[Expr] {
4950
/* expressions */
5051
case class IdentifierExpr(value: String) extends Expr
5152
case class OccursExpr(m: Int, M: Option[Int], dep: Option[String]) extends Expr
@@ -812,7 +813,7 @@ class ParserVisitor(enc: Encoding,
812813
Map(),
813814
isDependee = false,
814815
identifier.toUpperCase() == Constants.FILLER,
815-
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
816+
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
816817
) (Some(parent))
817818

818819
parent.children.append(prim)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/AsciiStringDecoderWrapper.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ import scala.collection.mutable.ArrayBuffer
2929
* @param asciiCharsetName A charset name of input strings
3030
* @return A string representation of the binary data
3131
*/
32-
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) extends Serializable with (Array[Byte] => Any) {
32+
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String, improvedNullDetection: Boolean) extends Serializable with (Array[Byte] => Any) {
3333
import StringDecoders._
34+
import StringTools._
3435

3536
lazy val charset: Charset = Charset.forName(asciiCharsetName)
3637

@@ -41,6 +42,9 @@ class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) ext
4142
* @return A string representation of the binary data
4243
*/
4344
def apply(bytes: Array[Byte]): String = {
45+
if (improvedNullDetection && isArrayNull(bytes))
46+
return null
47+
4448
var i = 0
4549

4650
// Filter out all special characters

0 commit comments

Comments
 (0)