@@ -7,6 +7,7 @@ const parquet_shredder = require('./shred')
77const parquet_util = require ( './util' )
88const parquet_codec = require ( './codec' )
99const parquet_compression = require ( './compression' )
10+ const parquet_types = require ( './types' ) ;
1011
1112/**
1213 * Parquet File Magic String
@@ -293,6 +294,30 @@ function encodeValues(type, encoding, values, opts) {
293294 return parquet_codec [ encoding ] . encodeValues ( type , values , opts ) ;
294295}
295296
/**
 * Encode a single statistics value (e.g. a min or max) for a column.
 *
 * @param {*} value - the raw statistics value; `undefined` means "no value"
 * @param {Object} column - the column definition; `originalType` and
 *   `primitiveType` are read from it, and the whole object is forwarded as
 *   the options argument of `encodeValues`
 * @returns {*} an empty Buffer when `value` is undefined; the (possibly
 *   primitive-converted) value itself for BYTE_ARRAY columns; otherwise the
 *   result of PLAIN-encoding the value via `encodeValues`
 */
function encodeStatisticsValue(value, column) {
  if (value === undefined) {
    // Buffer.alloc replaces the deprecated `new Buffer(size)` constructor.
    return Buffer.alloc(0);
  }

  let encoded = value;

  // Logical (original) types are first converted to their primitive form.
  if (column.originalType) {
    encoded = parquet_types.toPrimitive(column.originalType, encoded);
  }

  // BYTE_ARRAY values are passed through untouched; every other primitive
  // type is PLAIN-encoded as a single-element value list.
  if (column.primitiveType !== 'BYTE_ARRAY') {
    encoded = encodeValues(column.primitiveType, 'PLAIN', [encoded], column);
  }

  return encoded;
}
309+
/**
 * Build a thrift Statistics struct from a plain statistics object.
 *
 * The input object is not modified: its properties are copied onto a fresh
 * object, `min_value` / `max_value` are replaced by their encoded forms, and
 * the same encoded values are also stored under `min` / `max`.
 *
 * @param {Object} statistics - plain statistics object (may carry
 *   `min_value`, `max_value` and any other properties, which are copied as-is)
 * @param {Object} column - column definition forwarded to
 *   `encodeStatisticsValue`
 * @returns {Object} a new `parquet_thrift.Statistics` instance
 */
function encodeStatistics(statistics, column) {
  const source = Object.assign({}, statistics);

  // Encode min first, then max, each exactly once.
  const encodedMin = encodeStatisticsValue(source.min_value, column);
  const encodedMax = encodeStatisticsValue(source.max_value, column);

  const encodedFields = Object.assign(source, {
    min_value: encodedMin,
    max_value: encodedMax,
    max: encodedMax,
    min: encodedMin
  });

  return new parquet_thrift.Statistics(encodedFields);
}
320+
296321function encodePages ( schema , rowBuffer , opts ) {
297322 if ( ! rowBuffer . pageRowCount ) {
298323 return ;
@@ -305,6 +330,23 @@ function encodePages(schema, rowBuffer, opts) {
305330
306331 let page ;
307332 const values = rowBuffer . columnData [ field . path ] ;
333+
334+ let statistics ;
335+
336+ if ( field . statistics !== false ) {
337+ statistics = { } ;
338+ [ ...values . distinct_values ] . forEach ( ( v , i ) => {
339+ if ( i === 0 || v > statistics . max_value ) {
340+ statistics . max_value = v ;
341+ }
342+ if ( i === 0 || v < statistics . min_value ) {
343+ statistics . min_value = v ;
344+ }
345+ } ) ;
346+
347+ statistics . null_count = values . count - values . values . length ;
348+ statistics . distinct_count = values . distinct_values . size ;
349+ }
308350
309351 if ( opts . useDataPageV2 ) {
310352 page = encodeDataPageV2 (
@@ -313,18 +355,27 @@ function encodePages(schema, rowBuffer, opts) {
313355 rowBuffer . pageRowCount ,
314356 values . values ,
315357 values . rlevels ,
316- values . dlevels ) ;
358+ values . dlevels ,
359+ statistics ) ;
317360 } else {
318361 page = encodeDataPage (
319362 field ,
320363 values . count ,
321364 values . values ,
322365 values . rlevels ,
323- values . dlevels ) ;
366+ values . dlevels ,
367+ statistics ) ;
324368 }
325369
326- rowBuffer . pages [ field . path ] . push ( { page, count : values . values . length } ) ;
370+ rowBuffer . pages [ field . path ] . push ( {
371+ page,
372+ statistics,
373+ distinct_values : values . distinct_values ,
374+ count : values . values . length
375+ } ) ;
376+
327377
378+ values . distinct_values = new Set ( ) ;
328379 values . values = [ ] ;
329380 values . rlevels = [ ] ;
330381 values . dlevels = [ ] ;
@@ -337,7 +388,7 @@ function encodePages(schema, rowBuffer, opts) {
337388/**
338389 * Encode a parquet data page
339390 */
340- function encodeDataPage ( column , valueCount , values , rlevels , dlevels ) {
391+ function encodeDataPage ( column , valueCount , values , rlevels , dlevels , statistics ) {
341392 /* encode values */
342393 let valuesBuf = encodeValues (
343394 column . primitiveType ,
@@ -374,6 +425,9 @@ function encodeDataPage(column, valueCount, values, rlevels, dlevels) {
374425 pageHeader . compressed_page_size = pageBody . length ;
375426 pageHeader . data_page_header = new parquet_thrift . DataPageHeader ( ) ;
376427 pageHeader . data_page_header . num_values = rlevels . length ;
428+ if ( column . statistics !== false ) {
429+ pageHeader . data_page_header . statistics = encodeStatistics ( statistics , column ) ;
430+ }
377431
378432 pageHeader . data_page_header . encoding = parquet_thrift . Encoding [ column . encoding ] ;
379433 pageHeader . data_page_header . definition_level_encoding =
@@ -388,7 +442,7 @@ function encodeDataPage(column, valueCount, values, rlevels, dlevels) {
388442/**
389443 * Encode a parquet data page (v2)
390444 */
391- function encodeDataPageV2 ( column , valueCount , rowCount , values , rlevels , dlevels ) {
445+ function encodeDataPageV2 ( column , valueCount , rowCount , values , rlevels , dlevels , statistics ) {
392446 /* encode values */
393447 let valuesBuf = encodeValues (
394448 column . primitiveType ,
@@ -433,6 +487,10 @@ function encodeDataPageV2(column, valueCount, rowCount, values, rlevels, dlevels
433487 pageHeader . data_page_header_v2 . num_nulls = valueCount - values . length ;
434488 pageHeader . data_page_header_v2 . num_rows = valueCount ;
435489
490+ if ( column . statistics !== false ) {
491+ pageHeader . data_page_header_v2 . statistics = encodeStatistics ( statistics , column ) ;
492+ }
493+
436494 pageHeader . uncompressed_page_size =
437495 rLevelsBuf . length + dLevelsBuf . length + valuesBuf . length ;
438496
@@ -477,6 +535,34 @@ function encodeColumnChunk(pages, opts) {
477535 metadata . codec = parquet_thrift . CompressionCodec [
478536 opts . useDataPageV2 ? opts . column . compression : 'UNCOMPRESSED' ] ;
479537
538+ /* compile statistics */
539+ let statistics = { } ;
540+ let distinct_values = new Set ( ) ;
541+ statistics . null_count = 0 ;
542+ statistics . distinct_count = 0 ;
543+
544+
545+ for ( let i = 0 ; i < pages . length ; i ++ ) {
546+ let page = pages [ i ] ;
547+
548+ if ( opts . column . statistics !== false ) {
549+
550+ if ( page . statistics . max_value > statistics . max_value || i == 0 ) {
551+ statistics . max_value = page . statistics . max_value ;
552+ }
553+ if ( page . statistics . min_value < statistics . min_value || i == 0 ) {
554+ statistics . min_value = page . statistics . min_value ;
555+ }
556+ statistics . null_count += page . statistics . null_count ;
557+ page . distinct_values . forEach ( value => distinct_values . add ( value ) ) ;
558+ }
559+ }
560+
561+ if ( opts . column . statistics !== false ) {
562+ statistics . distinct_count = distinct_values . size ;
563+ metadata . statistics = encodeStatistics ( statistics , opts . column ) ;
564+ }
565+
480566 /* list encodings */
481567 let encodingsSet = { } ;
482568 encodingsSet [ PARQUET_RDLVL_ENCODING ] = true ;
0 commit comments