
Commit f374649

#678 Implement strict integral precision feature.
1 parent 9f57161 commit f374649

19 files changed: +419 additions, -161 deletions


README.md

Lines changed: 1 addition & 0 deletions
@@ -1517,6 +1517,7 @@ The output looks like this:
  | .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
  | .option("strict_sign_overpunching", "true") | If `true` (default), sign overpunching will only be allowed for signed numbers. If `false`, overpunched positive sign will be allowed for unsigned numbers, but negative sign will result in null. |
  | .option("improved_null_detection", "true") | If `true` (default), values that contain only 0x0 for DISPLAY strings and numbers will be considered `null`s instead of empty strings. |
+ | .option("strict_integral_precision", "true") | If `true`, Cobrix will not generate `short`/`integer`/`long` Spark data types, and always use `decimal(n)` with the exact precision that matches the copybook. |
  | .option("binary_as_hex", "false") | By default fields that have `PIC X` and `USAGE COMP` are converted to `binary` Spark data type. If this option is set to `true`, such fields will be strings in HEX encoding. |

##### Modifier options
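
For reference, the new option is applied like any other spark-cobol reader option. A minimal usage sketch (the copybook and data paths below are hypothetical placeholders, not part of this commit):

    // Minimal sketch: reading an EBCDIC file with strict integral precision enabled.
    // "strict_integral_precision" is the option added by this commit; the paths are made up.
    val df = spark.read
      .format("cobol")
      .option("copybook", "/path/to/copybook.cpy")
      .option("strict_integral_precision", "true")   // PIC 9(n) fields map to decimal(n) instead of short/int/long
      .load("/path/to/data")

    df.printSchema()   // integral fields now appear as decimal(n) with the copybook's exact precision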

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/CopybookParser.scala

Lines changed: 10 additions & 3 deletions
@@ -87,12 +87,14 @@ object CopybookParser extends Logging {
                  dropGroupFillers: Boolean = false,
                  dropValueFillers: Boolean = true,
                  commentPolicy: CommentPolicy = CommentPolicy(),
-                 dropFillersFromAst: Boolean = false
+                 dropFillersFromAst: Boolean = false,
+                 strictIntegralPrecision: Boolean = false
                 ): Copybook = {
     val copybook = parse(copyBookContents = copyBookContents,
       dropGroupFillers = dropGroupFillers,
       dropValueFillers = dropValueFillers,
-      commentPolicy = commentPolicy)
+      commentPolicy = commentPolicy,
+      strictIntegralPrecision = strictIntegralPrecision)

     if (dropFillersFromAst && (dropGroupFillers || dropValueFillers)) {
       copybook.dropFillers(dropGroupFillers, dropValueFillers)
@@ -135,6 +137,7 @@ object CopybookParser extends Logging {
             commentPolicy: CommentPolicy = CommentPolicy(),
             strictSignOverpunch: Boolean = true,
             improvedNullDetection: Boolean = false,
+            strictIntegralPrecision: Boolean = false,
             decodeBinaryAsHex: Boolean = false,
             ebcdicCodePage: CodePage = new CodePageCommon,
             asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -155,6 +158,7 @@ object CopybookParser extends Logging {
       commentPolicy,
       strictSignOverpunch,
       improvedNullDetection,
+      strictIntegralPrecision,
       decodeBinaryAsHex,
       ebcdicCodePage,
       asciiCharset,
@@ -197,6 +201,7 @@ object CopybookParser extends Logging {
             commentPolicy: CommentPolicy = CommentPolicy(),
             strictSignOverpunch: Boolean = true,
             improvedNullDetection: Boolean = false,
+            strictIntegralPrecision: Boolean = false,
             decodeBinaryAsHex: Boolean = false,
             ebcdicCodePage: CodePage = new CodePageCommon,
             asciiCharset: Charset = StandardCharsets.US_ASCII,
@@ -217,6 +222,7 @@ object CopybookParser extends Logging {
       commentPolicy,
       strictSignOverpunch,
       improvedNullDetection,
+      strictIntegralPrecision,
       decodeBinaryAsHex,
       ebcdicCodePage,
       asciiCharset,
@@ -262,6 +268,7 @@ object CopybookParser extends Logging {
             commentPolicy: CommentPolicy,
             strictSignOverpunch: Boolean,
             improvedNullDetection: Boolean,
+            strictIntegralPrecision: Boolean,
             decodeBinaryAsHex: Boolean,
             ebcdicCodePage: CodePage,
             asciiCharset: Charset,
@@ -272,7 +279,7 @@ object CopybookParser extends Logging {
             debugFieldsPolicy: DebugFieldsPolicy,
             fieldCodePageMap: Map[String, String]): Copybook = {

-    val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)
+    val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, fieldCodePageMap)

     val nonTerms: Set[String] = (for (id <- nonTerminals)
       yield transformIdentifier(id)
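
A small sketch of how the extended parser API could be called directly, assuming the simple parseTree overload shown in the first hunk; the copybook text and field names are made up for illustration:

    import za.co.absa.cobrix.cobol.parser.CopybookParser

    // Hypothetical copybook used only to illustrate the new flag.
    val copybookText =
      """        01  RECORD.
        |           05  SEGMENT-ID   PIC 9(7).
        |           05  AMOUNT       PIC S9(12) COMP-3.
        |""".stripMargin

    // With strictIntegralPrecision = true the integral fields are kept as exact-precision
    // decimals (decimal(7) and decimal(12)) instead of Integer/Long.
    val copybook = CopybookParser.parseTree(copybookText, strictIntegralPrecision = true)
    println(copybook.generateRecordLayoutPositions())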

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ANTLRParser.scala

Lines changed: 2 additions & 1 deletion
@@ -57,13 +57,14 @@ object ANTLRParser extends Logging {
            commentPolicy: CommentPolicy,
            strictSignOverpunch: Boolean,
            improvedNullDetection: Boolean,
+           strictIntegralPrecision: Boolean,
            decodeBinaryAsHex: Boolean,
            ebcdicCodePage: CodePage,
            asciiCharset: Charset,
            isUtf16BigEndian: Boolean,
            floatingPointFormat: FloatingPointFormat,
            fieldCodePageMap: Map[String, String]): CopybookAST = {
-    val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, decodeBinaryAsHex, fieldCodePageMap)
+    val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision, decodeBinaryAsHex, fieldCodePageMap)

     val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
       line =>

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/antlr/ParserVisitor.scala

Lines changed: 2 additions & 1 deletion
@@ -47,6 +47,7 @@ class ParserVisitor(enc: Encoding,
                    floatingPointFormat: FloatingPointFormat,
                    strictSignOverpunch: Boolean,
                    improvedNullDetection: Boolean,
+                   strictIntegralPrecision: Boolean,
                    decodeBinaryAsHex: Boolean,
                    fieldCodePageMap: Map[String, String]) extends copybookParserBaseVisitor[Expr] {
   /* expressions */
@@ -853,7 +854,7 @@ class ParserVisitor(enc: Encoding,
         Map(),
         isDependee = false,
         identifier.toUpperCase() == Constants.FILLER,
-        DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
+        DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
       ) (Some(parent))

       parent.children.append(prim)

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/decoders/DecoderSelector.scala

Lines changed: 52 additions & 40 deletions
@@ -42,13 +42,15 @@ object DecoderSelector {
    * <li> Integral types are represented as boxed integers and longs. Larger integral numbers are represented as BigDecimal </li>
    * </ul>
    *
-   * @param dataType             A daatype of a copybook field
-   * @param stringTrimmingPolicy Specifies how the decoder should handle string types
-   * @param ebcdicCodePage       Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
-   * @param asciiCharset         A charset for ASCII encoded data
-   * @param isUtf16BigEndian     If true UTF-16 strings are considered big-endian.
-   * @param floatingPointFormat  Specifies a floating point format (IBM or IEEE754)
-   * @param strictSignOverpunch  if true, sign overpunching is not allowed for positive numbers.
+   * @param dataType                A datatype of a copybook field
+   * @param stringTrimmingPolicy    Specifies how the decoder should handle string types
+   * @param ebcdicCodePage          Specifies a code page to use for EBCDIC to ASCII/Unicode conversion
+   * @param asciiCharset            A charset for ASCII encoded data
+   * @param isUtf16BigEndian        If true UTF-16 strings are considered big-endian.
+   * @param floatingPointFormat     Specifies a floating point format (IBM or IEEE754)
+   * @param strictSignOverpunch     if true, sign overpunching is not allowed for positive numbers.
+   * @param improvedNullDetection   If true, string values that contain only zero bytes (0x0) will be considered null.
+   * @param strictIntegralPrecision If true, Cobrix will not generate short/integer/long Spark data types, and always use decimal(n) with the exact precision that matches the copybook.
    * @return A function that converts an array of bytes to the target data type.
    */
   def getDecoder(dataType: CobolType,
@@ -58,11 +60,12 @@ object DecoderSelector {
                  isUtf16BigEndian: Boolean = true,
                  floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
                  strictSignOverpunch: Boolean = false,
-                 improvedNullDetection: Boolean = false): Decoder = {
+                 improvedNullDetection: Boolean = false,
+                 strictIntegralPrecision: Boolean = false): Decoder = {
     val decoder = dataType match {
       case alphaNumeric: AlphaNumeric => getStringDecoder(alphaNumeric.enc.getOrElse(EBCDIC), stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, improvedNullDetection)
       case decimalType: Decimal => getDecimalDecoder(decimalType, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
-      case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection)
+      case integralType: Integral => getIntegralDecoder(integralType, strictSignOverpunch, improvedNullDetection, strictIntegralPrecision)
       case _ => throw new IllegalStateException("Unknown AST object")
     }
     decoder
@@ -186,7 +189,8 @@ object DecoderSelector {
   /** Gets a decoder function for an integral data type. A direct conversion from array of bytes to the target type is used where possible. */
   private def getIntegralDecoder(integralType: Integral,
                                  strictSignOverpunch: Boolean,
-                                 improvedNullDetection: Boolean): Decoder = {
+                                 improvedNullDetection: Boolean,
+                                 strictIntegralPrecision: Boolean): Decoder = {
     val encoding = integralType.enc.getOrElse(EBCDIC)

     val isEbcidic = encoding match {
@@ -198,7 +202,12 @@ object DecoderSelector {

     integralType.compact match {
       case None =>
-        if (integralType.precision <= Constants.maxIntegerPrecision) {
+        if (strictIntegralPrecision) {
+          if (isEbcidic)
+            StringDecoders.decodeEbcdicBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
+          else
+            StringDecoders.decodeAsciiBigNumber(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
+        } else if (integralType.precision <= Constants.maxIntegerPrecision) {
           if (isEbcidic)
             StringDecoders.decodeEbcdicInt(_, !isSigned, isSigned || !strictSignOverpunch, improvedNullDetection)
           else
@@ -223,57 +232,60 @@ object DecoderSelector {
         throw new IllegalStateException("Unexpected error. COMP-2 (double) is incorrect for an integral number.")
       case Some(COMP3()) =>
         // COMP-3 aka BCD-encoded number
-        getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true)
+        getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = true, strictIntegralPrecision)
       case Some(COMP3U()) =>
         // COMP-3U aka Unsigned BCD-encoded number aka Unsigned Packed
-        getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false)
+        getBCDIntegralDecoder(integralType.precision, mandatorySignNibble = false, strictIntegralPrecision)
       case Some(COMP4()) =>
         // COMP aka BINARY encoded number
-        getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true)
+        getBinaryEncodedIntegralDecoder(Some(COMP4()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
       case Some(COMP5()) =>
         // COMP aka BINARY encoded number
-        getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true)
+        getBinaryEncodedIntegralDecoder(Some(COMP5()), integralType.precision, integralType.signPosition, isBigEndian = true, strictIntegralPrecision)
       case Some(COMP9()) =>
         // COMP aka BINARY encoded number
-        getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false)
+        getBinaryEncodedIntegralDecoder(Some(COMP9()), integralType.precision, integralType.signPosition, isBigEndian = false, strictIntegralPrecision)
       case _ =>
         throw new IllegalStateException(s"Unknown number compression format (${integralType.compact.get}).")
     }
   }

   /** Gets a decoder function for a binary encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
-  private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean): Decoder = {
+  private def getBinaryEncodedIntegralDecoder(compact: Option[Usage], precision: Int, signPosition: Option[Position] = None, isBigEndian: Boolean, strictIntegralPrecision: Boolean): Decoder = {
     val isSigned = signPosition.nonEmpty
-    val isSignLeft = signPosition.forall(sp => if (sp == za.co.absa.cobrix.cobol.parser.position.Left) true else false)

     val numOfBytes = BinaryUtils.getBytesCount(compact, precision, isSigned, isExplicitDecimalPt = false, isSignSeparate = false)
-    val decoder = (isSigned, isBigEndian, numOfBytes) match {
-      case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
-      case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
-      case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
-      case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
-      case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
-      case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
-      case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
-      case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
-      case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
-      case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
-      case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
-      case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
-      case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
-      case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
-      case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
-      case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
-      case _ =>
-        (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+    val decoder = if (strictIntegralPrecision) {
+      (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+    } else {
+      (isSigned, isBigEndian, numOfBytes) match {
+        case (true, true, 1) => BinaryNumberDecoders.decodeSignedByte _
+        case (true, true, 2) => BinaryNumberDecoders.decodeBinarySignedShortBigEndian _
+        case (true, true, 4) => BinaryNumberDecoders.decodeBinarySignedIntBigEndian _
+        case (true, true, 8) => BinaryNumberDecoders.decodeBinarySignedLongBigEndian _
+        case (true, false, 1) => BinaryNumberDecoders.decodeSignedByte _
+        case (true, false, 2) => BinaryNumberDecoders.decodeBinarySignedShortLittleEndian _
+        case (true, false, 4) => BinaryNumberDecoders.decodeBinarySignedIntLittleEndian _
+        case (true, false, 8) => BinaryNumberDecoders.decodeBinarySignedLongLittleEndian _
+        case (false, true, 1) => BinaryNumberDecoders.decodeUnsignedByte _
+        case (false, true, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortBigEndian _
+        case (false, true, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntBigEndian _
+        case (false, true, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongBigEndian _
+        case (false, false, 1) => BinaryNumberDecoders.decodeUnsignedByte _
+        case (false, false, 2) => BinaryNumberDecoders.decodeBinaryUnsignedShortLittleEndian _
+        case (false, false, 4) => BinaryNumberDecoders.decodeBinaryUnsignedIntLittleEndian _
+        case (false, false, 8) => BinaryNumberDecoders.decodeBinaryUnsignedLongLittleEndian _
+        case _ =>
+          (a: Array[Byte]) => BinaryNumberDecoders.decodeBinaryAribtraryPrecision(a, isBigEndian, isSigned)
+      }
     }
     decoder // 999 999 999
   }

   /** Gets a decoder function for a BCD-encoded integral data type. A direct conversion from array of bytes to the target type is used where possible. */
-  private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean): Decoder = {
+  private def getBCDIntegralDecoder(precision: Int, mandatorySignNibble: Boolean, strictIntegralPrecision: Boolean): Decoder = {
     val decoder =
-      if (precision <= Constants.maxIntegerPrecision) {
+      if (precision <= Constants.maxIntegerPrecision && !strictIntegralPrecision) {
        a: Array[Byte] => {
          val num = BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
          if (num != null) {
@@ -282,7 +294,7 @@ object DecoderSelector {
            null
          }
        }
-      } else if (precision <= Constants.maxLongPrecision) {
+      } else if (precision <= Constants.maxLongPrecision && !strictIntegralPrecision) {
        a: Array[Byte] => BCDNumberDecoders.decodeBCDIntegralNumber(a, mandatorySignNibble)
      } else {
        a: Array[Byte] =>
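
To make the new decoder branch concrete, a short sketch of what the strict-precision path produces for a 4-byte binary (COMP) value; it mirrors the arbitrary-precision fallback used in the diff above, and the byte values are invented for illustration:

    import za.co.absa.cobrix.cobol.parser.decoders.BinaryNumberDecoders

    // A 4-byte big-endian signed binary value: 0x000F4240 = 1000000.
    val bytes = Array(0x00, 0x0F, 0x42, 0x40).map(_.toByte)

    // Without strict precision a 4-byte signed COMP field is decoded by
    // decodeBinarySignedIntBigEndian and surfaces as an integer column.
    // With strictIntegralPrecision = true the selector always falls back to the
    // arbitrary-precision decoder, so the value comes back as an exact decimal
    // and the Spark schema keeps decimal(n) with the copybook's precision.
    // Arguments are positional, matching the call in the diff: (bytes, isBigEndian, isSigned).
    val value = BinaryNumberDecoders.decodeBinaryAribtraryPrecision(bytes, true, true)
    println(value)   // 1000000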
