@@ -7,6 +7,7 @@ const parquet_shredder = require('./shred')
77const parquet_util = require ( './util' )
88const parquet_codec = require ( './codec' )
99const parquet_compression = require ( './compression' )
10+ const parquet_types = require ( './types' ) ;
1011
1112/**
1213 * Parquet File Magic String
@@ -293,6 +294,27 @@ function encodeValues(type, encoding, values, opts) {
293294 return parquet_codec [ encoding ] . encodeValues ( type , values , opts ) ;
294295}
295296
/**
 * Serialize a single statistics value (a min_value or max_value) so it can be
 * stored in a thrift Statistics struct.
 *
 * The value is first lowered to its primitive representation when the column
 * declares an original (logical) type. Non-BYTE_ARRAY primitives are then
 * PLAIN-encoded as a single-element buffer; BYTE_ARRAY values are returned
 * as-is, since they are already raw bytes.
 */
function encodeStatisticsValue(value, column) {
  const primitive = column.originalType
    ? parquet_types.toPrimitive(column.originalType, value)
    : value;

  if (column.primitiveType === 'BYTE_ARRAY') {
    return primitive;
  }

  return encodeValues(column.primitiveType, 'PLAIN', [primitive], column);
}
306+
/**
 * Build a thrift Statistics struct from a plain statistics object
 * ({ min_value, max_value, null_count, distinct_count, ... }).
 *
 * The input object is not mutated: a shallow copy is taken, min_value and
 * max_value are serialized per the column's type, and the deprecated
 * min/max fields are populated with the same serialized values for
 * compatibility with older readers.
 */
function encodeStatistics(statistics, column) {
  const encoded = Object.assign({}, statistics);

  encoded.min_value = encodeStatisticsValue(encoded.min_value, column);
  encoded.max_value = encodeStatisticsValue(encoded.max_value, column);

  // Deprecated min/max mirror the new min_value/max_value fields.
  encoded.min = encoded.min_value;
  encoded.max = encoded.max_value;

  return new parquet_thrift.Statistics(encoded);
}
317+
296318function encodePages ( schema , rowBuffer , opts ) {
297319 if ( ! rowBuffer . pageRowCount ) {
298320 return ;
@@ -305,6 +327,23 @@ function encodePages(schema, rowBuffer, opts) {
305327
306328 let page ;
307329 const values = rowBuffer . columnData [ field . path ] ;
330+
331+ let statistics ;
332+
333+ if ( field . statistics !== false ) {
334+ statistics = { } ;
335+ [ ...values . distinct_values ] . forEach ( ( v , i ) => {
336+ if ( i === 0 || v > statistics . max_value ) {
337+ statistics . max_value = v ;
338+ }
339+ if ( i === 0 || v < statistics . min_value ) {
340+ statistics . min_value = v ;
341+ }
342+ } ) ;
343+
344+ statistics . null_count = values . count - values . values . length ;
345+ statistics . distinct_count = values . distinct_values . size ;
346+ }
308347
309348 if ( opts . useDataPageV2 ) {
310349 page = encodeDataPageV2 (
@@ -313,18 +352,27 @@ function encodePages(schema, rowBuffer, opts) {
313352 rowBuffer . pageRowCount ,
314353 values . values ,
315354 values . rlevels ,
316- values . dlevels ) ;
355+ values . dlevels ,
356+ statistics ) ;
317357 } else {
318358 page = encodeDataPage (
319359 field ,
320360 values . count ,
321361 values . values ,
322362 values . rlevels ,
323- values . dlevels ) ;
363+ values . dlevels ,
364+ statistics ) ;
324365 }
325366
326- rowBuffer . pages [ field . path ] . push ( { page, count : values . values . length } ) ;
367+ rowBuffer . pages [ field . path ] . push ( {
368+ page,
369+ statistics,
370+ distinct_values : values . distinct_values ,
371+ count : values . values . length
372+ } ) ;
373+
327374
375+ values . distinct_values = new Set ( ) ;
328376 values . values = [ ] ;
329377 values . rlevels = [ ] ;
330378 values . dlevels = [ ] ;
@@ -337,7 +385,7 @@ function encodePages(schema, rowBuffer, opts) {
337385/**
338386 * Encode a parquet data page
339387 */
340- function encodeDataPage ( column , valueCount , values , rlevels , dlevels ) {
388+ function encodeDataPage ( column , valueCount , values , rlevels , dlevels , statistics ) {
341389 /* encode values */
342390 let valuesBuf = encodeValues (
343391 column . primitiveType ,
@@ -374,6 +422,9 @@ function encodeDataPage(column, valueCount, values, rlevels, dlevels) {
374422 pageHeader . compressed_page_size = pageBody . length ;
375423 pageHeader . data_page_header = new parquet_thrift . DataPageHeader ( ) ;
376424 pageHeader . data_page_header . num_values = rlevels . length ;
425+ if ( column . statistics !== false ) {
426+ pageHeader . data_page_header . statistics = encodeStatistics ( statistics , column ) ;
427+ }
377428
378429 pageHeader . data_page_header . encoding = parquet_thrift . Encoding [ column . encoding ] ;
379430 pageHeader . data_page_header . definition_level_encoding =
@@ -388,7 +439,7 @@ function encodeDataPage(column, valueCount, values, rlevels, dlevels) {
388439/**
389440 * Encode a parquet data page (v2)
390441 */
391- function encodeDataPageV2 ( column , valueCount , rowCount , values , rlevels , dlevels ) {
442+ function encodeDataPageV2 ( column , valueCount , rowCount , values , rlevels , dlevels , statistics ) {
392443 /* encode values */
393444 let valuesBuf = encodeValues (
394445 column . primitiveType ,
@@ -433,6 +484,10 @@ function encodeDataPageV2(column, valueCount, rowCount, values, rlevels, dlevels
433484 pageHeader . data_page_header_v2 . num_nulls = valueCount - values . length ;
434485 pageHeader . data_page_header_v2 . num_rows = valueCount ;
435486
487+ if ( column . statistics !== false ) {
488+ pageHeader . data_page_header_v2 . statistics = encodeStatistics ( statistics , column ) ;
489+ }
490+
436491 pageHeader . uncompressed_page_size =
437492 rLevelsBuf . length + dLevelsBuf . length + valuesBuf . length ;
438493
@@ -477,6 +532,34 @@ function encodeColumnChunk(pages, opts) {
477532 metadata . codec = parquet_thrift . CompressionCodec [
478533 opts . useDataPageV2 ? opts . column . compression : 'UNCOMPRESSED' ] ;
479534
535+ /* compile statistics */
536+ let statistics = { } ;
537+ let distinct_values = new Set ( ) ;
538+ statistics . null_count = 0 ;
539+ statistics . distinct_count = 0 ;
540+
541+
542+ for ( let i = 0 ; i < pages . length ; i ++ ) {
543+ let page = pages [ i ] ;
544+
545+ if ( opts . column . statistics !== false ) {
546+
547+ if ( page . statistics . max_value > statistics . max_value || i == 0 ) {
548+ statistics . max_value = page . statistics . max_value ;
549+ }
550+ if ( page . statistics . min_value < statistics . min_value || i == 0 ) {
551+ statistics . min_value = page . statistics . min_value ;
552+ }
553+ statistics . null_count += page . statistics . null_count ;
554+ page . distinct_values . forEach ( value => distinct_values . add ( value ) ) ;
555+ }
556+ }
557+
558+ if ( opts . column . statistics !== false ) {
559+ statistics . distinct_count = distinct_values . size ;
560+ metadata . statistics = encodeStatistics ( statistics , opts . column ) ;
561+ }
562+
480563 /* list encodings */
481564 let encodingsSet = { } ;
482565 encodingsSet [ PARQUET_RDLVL_ENCODING ] = true ;
0 commit comments