Skip to content

Commit

Permalink
Fix csv parsing (#531)
Browse files Browse the repository at this point in the history
* change CSV parser to directly parse into a Tensor

Otherwise we would have to copy the `seq` we parse into for
memory-copyable types after parsing.

* [io] replace CSVParser based line counter by memfiles counter

* [io] add simple readCsv test & add note to docstring about #530

* [io] extend tests with semicolon example, fixup empty line test

* [io] remove TODO note

* fix line counting for quoted fields in CSV files

* [tests] add test case for quoted field in CSV file
  • Loading branch information
Vindaar authored Oct 27, 2021
1 parent 66372d2 commit 649e42b
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 27 deletions.
86 changes: 59 additions & 27 deletions src/arraymancer/io/io_csv.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,69 +15,101 @@

import os, parsecsv, streams, strutils, sequtils, algorithm,
../tensor
from memfiles as mf import nil

proc countLinesAndCols(file: string, sep: char, quote: char,
                       skipHeader: bool): tuple[rows: int, cols: int] =
  ## Counts the number of logical rows and columns in the given CSV `file`.
  ##
  ## This uses the `memfiles` interface for performance reasons, to avoid
  ## unnecessary overhead purely for counting lines. Ideally, the actual
  ## CSV parsing would also use the same interface.
  ##
  ## A `quote` character toggles the "inside a quoted field" state so that
  ## - separators inside quoted fields do not inflate the column count and
  ## - line breaks inside quoted fields do not inflate the row count.
  ## A logical row only ends on a physical line that closes all quotes.
  var memf = mf.open(file)
  defer: mf.close(memf)
  var countedCols = false
  var nCols = 1 # at least 1 column
  var nRows = 0
  var cstr: cstring
  var quoted = false
  for slice in mf.memSlices(memf):
    if slice.size == 0: continue # only count non-empty lines
    cstr = cast[cstring](slice.data)
    if unlikely(not countedCols):
      # Still inside the first logical row: count columns, honoring quotes.
      for idx in 0 ..< slice.size: # need to be careful to only access to `size`
        if cstr[idx] == quote:
          quoted = not quoted
        elif cstr[idx] == sep and not quoted: # a separator means another column
          inc nCols
      if not quoted: # first logical row is complete only once quotes close
        inc nRows
        countedCols = true
    else:
      # Subsequent rows: only track quote state to find logical row ends.
      for idx in 0 ..< slice.size:
        if cstr[idx] == quote:
          quoted = not quoted
      if not quoted:
        inc nRows
  if skipHeader:
    dec nRows
  result = (rows: nRows, cols: nCols)

proc read_csv*[T: SomeNumber|bool|string](
  csvPath: string,
  skipHeader = false,
  separator = ',',
  quote = '\"'
): Tensor[T] {.noInit.} =
  ## Load a csv into a Tensor. All values must be of the same type.
  ##
  ## If there is a header row, it can be skipped.
  ##
  ## The reading of CSV files currently ``does not`` handle parsing a tensor
  ## created with `toCsv`. This is because the dimensional information becomes
  ## part of the CSV output and the parser has no option to reconstruct the
  ## correct tensor shape.
  ## Instead of a NxMx...xZ tensor we always construct a NxM tensor, where N-1
  ## is the rank of the original tensor and M is the total size (total number of
  ## elements) of the original tensor!
  ##
  ## Input:
  ##   - csvPath: a path to the csvfile
  ##   - skipHeader: should read_csv skip the first row
  ##   - separator: a char, default ','
  ##   - quote: a char, default '\"' (single and double quotes must be escaped).
  ##     Separators inside quoted strings are ignored, for example:
  ##     `"foo", "bar, baz"` corresponds to 2 columns not 3.

  # Select the string -> T conversion proc matching the requested element type.
  var parser: proc(x: string): T {.nimcall.}
  when T is SomeSignedInt:
    parser = proc(x: string): T = x.parseInt.T
  elif T is SomeUnsignedInt:
    parser = proc(x: string): T = x.parseUInt.T
  elif T is SomeFloat:
    parser = proc(x: string): T = x.parseFloat.T
  elif T is bool:
    parser = parseBool
  elif T is string:
    parser = proc(x: string): string = shallowCopy(result, x) # no-op

  # 1. count number of lines and columns using the memfile interface, so we
  #    can allocate the result tensor up front instead of growing a `seq`.
  let (numRows, numCols) = countLinesAndCols(csvPath, separator, quote, skipHeader)

  # 2. prepare the CSV parser
  var csv: CsvParser
  let stream = newFileStream(csvPath, mode = fmRead)
  csv.open(stream, csvPath,
           separator = separator,
           quote = quote,
           skipInitialSpace = true)
  defer: csv.close

  # 3. possibly skip the header
  if skipHeader:
    csv.readHeaderRow()

  # 4. allocate the result and parse every row directly into it
  result = newTensorUninit[T]([numRows, numCols])
  var curRow = 0
  while csv.readRow:
    for i, val in csv.row:
      result[curRow, i] = parser val
    inc curRow

proc to_csv*[T](
tensor: Tensor[T],
Expand Down
66 changes: 66 additions & 0 deletions tests/io/test_csv.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,45 @@ proc main() =
1;1;4.0
"""

let csv_empty_lines = """
dimension_1,value
0,1
1,2
2,3
3,4
4,5
"""

let csv_semicolon_short = """dimension_1;value
0;1
1;2
2;3
3;4
4;5
"""

let csv_with_quoted = """dimension_1,value
0,A
1,B
2,"hello, this is a string
with a line break, ugh
"
3,D
4,E
"""



let test_file_path = getTempDir() / "arraymancer_test.csv"

## NOTE: the reading of CSV files in arraymancer currently ``does not`` handle parsing its own
## CSV files as the dimensional information becomes part of the CSV output. I.e. instead of constructing
## a NxMx...xZ tensor we always construct a NxM tensor, where N-1 is the rank of the original tensor
## and M is the total size (total number of elements) of the original.
suite "[IO] CSV support":

test "Should export 1d Tensor":
Expand All @@ -62,6 +99,11 @@ proc main() =
let content = readFile(test_file_path)
check content == expected_output_1d

test "Read 1D serialized tensor":
let tRead = readCsv[int](test_file_path, skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "Should export 2d Tensor":
let t = @[@[1, 2, 3], @[4, 5, 6]].toTensor()
t.to_csv(test_file_path)
Expand All @@ -80,5 +122,29 @@ proc main() =
let content = readFile(test_file_path)
check content == expected_output_semicolon

test "CSV parsing ignores empty lines":
writeFile(test_file_path, csv_empty_lines)
let tRead = readCsv[int](test_file_path, skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "CSV parsing of different (semicolon) separators works":
writeFile(test_file_path, csv_semicolon_short)
let tRead = readCsv[int](test_file_path, separator = ';', skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "CSV parsing of file with quoted content works":
writeFile(test_file_path, csv_with_quoted)
let tRead = readCsv[string](test_file_path, quote = '\"', skipHeader = true)
let tExp = @[@["0", "A"],
@["1", "B"],
@["2", """hello, this is a string
with a line break, ugh
"""],
@["3", "D"],
@["4", "E"]].toTensor()
check tExp == tRead

main()
GC_fullCollect()

0 comments on commit 649e42b

Please sign in to comment.