Skip to content

Commit 4894e62

Browse files
committed
feat!: support SAS XPORT and portable SPSS files
1 parent 8c87a5b commit 4894e62

File tree

6 files changed

+91
-6
lines changed

6 files changed

+91
-6
lines changed

CMakeLists.txt

+9
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,18 @@ set(EXTENSION_SOURCES
4444
third_party/readstat/readstat_value.c
4545
third_party/readstat/readstat_variable.c
4646
third_party/readstat/readstat_writer.c
47+
third_party/readstat/sas/ieee.c
4748
third_party/readstat/sas/readstat_sas.c
4849
third_party/readstat/sas/readstat_sas_rle.c
4950
third_party/readstat/sas/readstat_sas7bdat_read.c
51+
third_party/readstat/sas/readstat_xport_parse_format.c
52+
third_party/readstat/sas/readstat_xport_read.c
53+
third_party/readstat/sas/readstat_xport_write.c
54+
third_party/readstat/sas/readstat_xport.c
55+
third_party/readstat/spss/readstat_por_parse.c
56+
third_party/readstat/spss/readstat_por_read.c
57+
third_party/readstat/spss/readstat_por_write.c
58+
third_party/readstat/spss/readstat_por.c
5059
third_party/readstat/spss/readstat_sav_compress.c
5160
third_party/readstat/spss/readstat_sav_parse_timestamp.c
5261
third_party/readstat/spss/readstat_sav_parse.c

README.md

+11-5
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,29 @@ in a DuckDB instance near you.
1717
The extension adds a single DuckDB table function, `read_stat`, which you use as follows:
1818

1919
```SQL
20-
-- Read a SAS `.sas7bdat` file
20+
-- Read a SAS `.sas7bdat` or `.xpt` file
2121
FROM read_stat('sas_data.sas7bdat');
22-
-- Read an SPSS `.sav` or `.zsav` file
22+
FROM read_stat('sas_data.xpt');
23+
-- Read an SPSS `.sav`, `.zsav`, or `.por` file
2324
FROM read_stat('spss_data.sav');
2425
FROM read_stat('compressed_spss_data.zsav');
26+
FROM read_stat('portable_spss_data.por');
2527
-- Read a Stata `.dta` file
2628
FROM read_stat('stata_data.dta');
2729
```
2830

29-
If the file extension is not `.sas7bdat`, `.sav`, `.zsav`, or `.dta`,
31+
If the file extension is not `.sas7bdat`, `.xpt`, `.sav`, `.zsav`, `.por`, or `.dta`,
3032
call `read_stat` with the `format` parameter set to the right file type:
3133

3234
```SQL
3335
FROM read_stat('sas_data.other_extension', format = 'sas7bdat');
36+
FROM read_stat('sas_data.other_extension', format = 'xpt');
3437
-- SPSS `.sav` and `.zsav` can both be read through the format `'sav'`
3538
FROM read_stat(
3639
'spss_data_possibly_compressed.other_extension',
3740
format = 'sav'
3841
);
42+
FROM read_stat('portable_spss_data.other_extension', format = 'por');
3943
FROM read_stat('stata_data.other_extension', format = 'dta');
4044
```
4145

@@ -48,11 +52,13 @@ FROM read_stat('latin1_encoded.sas7bdat', encoding = 'iso-8859-1');
4852
If your files have the proper file extensions and you do not need to override their character encodings, a [replacement scan](<https://duckdb.org/docs/stable/guides/glossary.html#replacement-scan>) is also available:
4953

5054
```SQL
51-
-- Read a SAS `.sas7bdat` file
55+
-- Read a SAS `.sas7bdat` or `.xpt` file
5256
FROM 'sas_data.sas7bdat';
53-
-- Read an SPSS `.sav` or `.zsav` file
57+
FROM 'sas_data.xpt';
58+
-- Read an SPSS `.sav`, `.zsav`, or `.por` file
5459
FROM 'spss_data.sav';
5560
FROM 'compressed_spss_data.zsav';
61+
FROM 'portable_spss_data.por';
5662
-- Read a Stata .dta file
5763
FROM 'stata_data.dta';
5864
```

src/duckdb_read_stat.c

+37-1
Original file line numberDiff line numberDiff line change
@@ -144,11 +144,21 @@ void duckdb_read_stat_bind(duckdb_bind_info info)
144144
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SAS;
145145
error = readstat_parse_sas7bdat(parser, path, data);
146146
}
147+
if (!strcmp(format, "xpt"))
148+
{
149+
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SAS;
150+
error = readstat_parse_xport(parser, path, data);
151+
}
147152
else if (!strcmp(format, "sav") || !strcmp(format, "zsav"))
148153
{
149154
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SPSS;
150155
error = readstat_parse_sav(parser, path, data);
151156
}
157+
else if (!strcmp(format, "por"))
158+
{
159+
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SPSS;
160+
error = readstat_parse_por(parser, path, data);
161+
}
152162
else if (!strcmp(format, "dta"))
153163
{
154164
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_STATA;
@@ -160,11 +170,21 @@ void duckdb_read_stat_bind(duckdb_bind_info info)
160170
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SAS;
161171
error = readstat_parse_sas7bdat(parser, path, data);
162172
}
173+
else if (duckdb_read_stat_ends_with(path, ".xpt"))
174+
{
175+
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SAS;
176+
error = readstat_parse_xport(parser, path, data);
177+
}
163178
else if (duckdb_read_stat_ends_with(path, ".sav") || duckdb_read_stat_ends_with(path, ".zsav"))
164179
{
165180
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SPSS;
166181
error = readstat_parse_sav(parser, path, data);
167182
}
183+
else if (duckdb_read_stat_ends_with(path, ".por"))
184+
{
185+
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_SPSS;
186+
error = readstat_parse_por(parser, path, data);
187+
}
168188
else if (duckdb_read_stat_ends_with(path, ".dta"))
169189
{
170190
data->file_format = DUCKDB_READ_STAT_FILE_FORMAT_STATA;
@@ -472,10 +492,18 @@ void duckdb_read_stat_function(duckdb_function_info info, duckdb_data_chunk outp
472492
{
473493
error = readstat_parse_sas7bdat(parser, bind_data->path, context);
474494
}
495+
else if (!strcasecmp(bind_data->format, "xpt"))
496+
{
497+
error = readstat_parse_xport(parser, bind_data->path, context);
498+
}
475499
else if (!strcasecmp(bind_data->format, "sav") || !strcasecmp(bind_data->format, "zsav"))
476500
{
477501
error = readstat_parse_sav(parser, bind_data->path, context);
478502
}
503+
else if (!strcasecmp(bind_data->format, "por"))
504+
{
505+
error = readstat_parse_por(parser, bind_data->path, context);
506+
}
479507
else if (!strcasecmp(bind_data->format, "dta"))
480508
{
481509
error = readstat_parse_dta(parser, bind_data->path, context);
@@ -485,10 +513,18 @@ void duckdb_read_stat_function(duckdb_function_info info, duckdb_data_chunk outp
485513
{
486514
error = readstat_parse_sas7bdat(parser, bind_data->path, context);
487515
}
516+
else if (duckdb_read_stat_ends_with(bind_data->path, ".xpt"))
517+
{
518+
error = readstat_parse_xport(parser, bind_data->path, context);
519+
}
488520
else if (duckdb_read_stat_ends_with(bind_data->path, ".sav") || duckdb_read_stat_ends_with(bind_data->path, ".zsav"))
489521
{
490522
error = readstat_parse_sav(parser, bind_data->path, context);
491523
}
524+
else if (duckdb_read_stat_ends_with(bind_data->path, ".por"))
525+
{
526+
error = readstat_parse_por(parser, bind_data->path, context);
527+
}
492528
else if (duckdb_read_stat_ends_with(bind_data->path, ".dta"))
493529
{
494530
error = readstat_parse_dta(parser, bind_data->path, context);
@@ -536,7 +572,7 @@ void duckdb_read_stat_register_read_stat_function(duckdb_connection connection)
536572

537573
void duckdb_read_stat_replacement_scan(duckdb_replacement_scan_info info, const char *table_name, void *data)
538574
{
539-
if (duckdb_read_stat_ends_with(table_name, ".sas7bdat") || duckdb_read_stat_ends_with(table_name, ".sav") || duckdb_read_stat_ends_with(table_name, ".zsav") || duckdb_read_stat_ends_with(table_name, ".dta"))
575+
if (duckdb_read_stat_ends_with(table_name, ".sas7bdat") || duckdb_read_stat_ends_with(table_name, ".xpt") || duckdb_read_stat_ends_with(table_name, ".sav") || duckdb_read_stat_ends_with(table_name, ".zsav") || duckdb_read_stat_ends_with(table_name, ".por") || duckdb_read_stat_ends_with(table_name, ".dta"))
540576
{
541577
duckdb_replacement_scan_set_function_name(info, "read_stat");
542578
duckdb_replacement_scan_add_parameter(info, duckdb_create_varchar(table_name));

test/sql/sample.por

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
ÁâÃÉÉ@â×ââ@×ÖÙã@ÆÉÓÅ@@@@@@@@@@@@@@@@@@@@ASCII SPSS PORT FILE
2+
00000-0000-0000-0000--------------------!3#))0303300/240&),%00000000000000000000
3+
0200002'220'&)3000#0000000000000000000000000000000000000000000000000000000000000
4+
0000000000000000000000000123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst
5+
uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'="000000~000000000000000000000{}\0000000000000
6+
00000000000000000000000000000000000000000000000000000000SPSSPORTA8/201812166/172
7+
8211O/IBM SPSS Statistics 25.047/5B/71/6/MYCHAR1/1/0/1/1/0/C9/character70/5/MYNU
8+
M5/8/2/5/8/2/C7/numeric70/6/MYDATE40/A/0/40/A/0/C4/date70/5/DTIME3E/K/0/3E/K/0/C
9+
8/datetime70/6/MYLABL5/8/2/5/8/2/C7/labeled70/5/MYORD5/8/2/5/8/2/C7/ordinal70/6/
10+
MYTIME3D/8/0/3D/8/0/C4/timeD1/6/MYLABL2/1/4/Male2/6/FemaleD1/5/MYORD3/1/3/low2/6
11+
/medium3/4/highE4/N/some test text as notesO/ (Entered 15-Aug-2018)J/some othe
12+
r commentsO/ (Entered 15-Aug-2018)F1/a1.3/IPJ2+3/IPJ3AKA/1/1/1AKA/1/b1.6/CQCMC
13+
+2/CQCNMKA/2/2/32KA/1/c-13A.9/G9Q+4/G9Q+4/1/3/0/1/d-1.C/8CO+2/8CO+2/2/1/24KA/1/e
14+
13A.9/*.*.1/1/*.ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ

test/sql/sample.test

+20
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@ c -1000.3 1960-01-01 1960-01-01 00:00:00 1.0 3.0 00:00:00
2222
d -1.4 1583-01-01 1583-01-01 00:00:00 2.0 1.0 16:10:10
2323
e 1000.3 NULL NULL 1.0 1.0 NULL
2424

25+
query IIIIIII
26+
SELECT mychar, mynum, mydate, dtime, mylabl, myord, mytime
27+
FROM './test/sql/sample.xpt';
28+
----
29+
a 1.1 2018-05-06 2018-05-06 10:10:10 1.0 1.0 10:10:10
30+
b 1.2 1880-05-06 1880-05-06 10:10:10 2.0 2.0 23:10:10
31+
c -1000.3 1960-01-01 1960-01-01 00:00:00 1.0 3.0 00:00:00
32+
d -1.4 1583-01-01 1583-01-01 00:00:00 2.0 1.0 16:10:10
33+
e 1000.3 NULL NULL 1.0 1.0 NULL
34+
2535
query IIIIIII
2636
SELECT mychar, mynum, mydate, dtime, mylabl, myord, mytime
2737
FROM './test/sql/sample.sav';
@@ -42,6 +52,16 @@ c -1000.3 1960-01-01 1960-01-01 00:00:00 1.0 3.0 00:00:00
4252
d -1.4 1583-01-01 1583-01-01 00:00:00 2.0 1.0 16:10:10
4353
e 1000.3 NULL NULL 1.0 1.0 NULL
4454

55+
query IIIIIII
56+
SELECT mychar, mynum, mydate, dtime, mylabl, myord, mytime
57+
FROM './test/sql/sample.por';
58+
----
59+
a 1.1 2018-05-06 2018-05-06 10:10:10 1.0 1.0 10:10:10
60+
b 1.2 1880-05-06 1880-05-06 10:10:10 2.0 2.0 23:10:10
61+
c -1000.3 1960-01-01 1960-01-01 00:00:00 1.0 3.0 00:00:00
62+
d -1.4 1583-01-01 1583-01-01 00:00:00 2.0 1.0 16:10:10
63+
e 1000.3 NULL NULL 1.0 1.0 NULL
64+
4565
query IIIIIII
4666
SELECT mychar, mynum, mydate, dtime, mylabl, myord, mytime
4767
FROM './test/sql/sample.dta';

test/sql/sample.xpt

2.03 KB
Binary file not shown.

0 commit comments

Comments
 (0)