Skip to content

Commit

Permalink
Fix csv parsing (#531)
Browse files Browse the repository at this point in the history
* change CSV parser to directly parse into a Tensor

Otherwise we would have to copy the `seq` we parse into for
memory-copyable types after parsing.

* [io] replace CSVParser based line counter by memfiles counter

* [io] add simple readCsv test & add note to docstring about #530

* [io] extend tests with semicolon example, fixup empty line test

* [io] remove TODO note

* fix line counting for quoted fields in CSV files

* [tests] add test case for quoted field in CSV file
  • Loading branch information
Vindaar authored Oct 27, 2021
1 parent 66372d2 commit 649e42b
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 27 deletions.
86 changes: 59 additions & 27 deletions src/arraymancer/io/io_csv.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,69 +15,101 @@

import os, parsecsv, streams, strutils, sequtils, algorithm,
../tensor
from memfiles as mf import nil

proc countLinesAndCols(file: string, sep: char, quote: char,
                       skipHeader: bool): tuple[rows: int, cols: int] =
  ## Counts the number of logical rows and columns in the given CSV `file`.
  ##
  ## This uses the `memfiles` interface for performance reasons, to avoid
  ## unnecessary overhead purely for counting lines. Ideally, the actual
  ## CSV parsing would also use the same interface.
  ##
  ## A `quote` character toggles the "inside a quoted field" state so that
  ## - separators inside quoted fields do not inflate the column count and
  ## - line breaks inside quoted fields do not inflate the row count.
  ## A logical row only ends on a physical line that closes all quotes.
  var memf = mf.open(file)
  defer: mf.close(memf)
  var countedCols = false
  var nCols = 1 # at least 1 column
  var nRows = 0
  var cstr: cstring
  var quoted = false
  for slice in mf.memSlices(memf):
    if slice.size == 0: continue # only count non-empty lines
    cstr = cast[cstring](slice.data)
    if unlikely(not countedCols):
      # Still inside the first logical row: count columns, honoring quotes.
      for idx in 0 ..< slice.size: # need to be careful to only access to `size`
        if cstr[idx] == quote:
          quoted = not quoted
        elif cstr[idx] == sep and not quoted: # a separator means another column
          inc nCols
      if not quoted: # first logical row is complete only once quotes close
        inc nRows
        countedCols = true
    else:
      # Subsequent rows: only track quote state to find logical row ends.
      for idx in 0 ..< slice.size:
        if cstr[idx] == quote:
          quoted = not quoted
      if not quoted:
        inc nRows
  if skipHeader:
    dec nRows
  result = (rows: nRows, cols: nCols)

proc read_csv*[T: SomeNumber|bool|string](
  csvPath: string,
  skipHeader = false,
  separator = ',',
  quote = '\"'
): Tensor[T] {.noInit.} =
  ## Load a csv into a Tensor. All values must be of the same type.
  ##
  ## If there is a header row, it can be skipped.
  ##
  ## The reading of CSV files currently ``does not`` handle parsing a tensor
  ## created with `toCsv`. This is because the dimensional information becomes
  ## part of the CSV output and the parser has no option to reconstruct the
  ## correct tensor shape.
  ## Instead of a NxMx...xZ tensor we always construct a NxM tensor, where N-1
  ## is the rank of the original tensor and M is the total size (total number of
  ## elements) of the original tensor!
  ##
  ## Input:
  ##   - csvPath: a path to the csvfile
  ##   - skipHeader: should read_csv skip the first row
  ##   - separator: a char, default ','
  ##   - quote: a char, default '\"' (single and double quotes must be escaped).
  ##     Separators inside quoted strings are ignored, for example:
  ##     `"foo", "bar, baz"` corresponds to 2 columns not 3.

  # Select the string -> T conversion proc matching the requested element type.
  var parser: proc(x: string): T {.nimcall.}
  when T is SomeSignedInt:
    parser = proc(x: string): T = x.parseInt.T
  elif T is SomeUnsignedInt:
    parser = proc(x: string): T = x.parseUInt.T
  elif T is SomeFloat:
    parser = proc(x: string): T = x.parseFloat.T
  elif T is bool:
    parser = parseBool
  elif T is string:
    parser = proc(x: string): string = shallowCopy(result, x) # no-op

  # 1. count number of lines and columns using the memfile interface, so we
  #    can allocate the result tensor up front instead of growing a `seq`.
  let (numRows, numCols) = countLinesAndCols(csvPath, separator, quote, skipHeader)

  # 2. prepare the CSV parser
  var csv: CsvParser
  let stream = newFileStream(csvPath, mode = fmRead)
  csv.open(stream, csvPath,
           separator = separator,
           quote = quote,
           skipInitialSpace = true)
  defer: csv.close

  # 3. possibly skip the header
  if skipHeader:
    csv.readHeaderRow()

  # 4. allocate the result and parse every row directly into it
  result = newTensorUninit[T]([numRows, numCols])
  var curRow = 0
  while csv.readRow:
    for i, val in csv.row:
      result[curRow, i] = parser val
    inc curRow

proc to_csv*[T](
tensor: Tensor[T],
Expand Down
66 changes: 66 additions & 0 deletions tests/io/test_csv.nim
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,45 @@ proc main() =
1;1;4.0
"""

let csv_empty_lines = """
dimension_1,value
0,1
1,2
2,3
3,4
4,5
"""

let csv_semicolon_short = """dimension_1;value
0;1
1;2
2;3
3;4
4;5
"""

let csv_with_quoted = """dimension_1,value
0,A
1,B
2,"hello, this is a string
with a line break, ugh
"
3,D
4,E
"""



let test_file_path = getTempDir() / "arraymancer_test.csv"

## NOTE: the reading of CSV files in arraymancer currently ``does not`` handle parsing its own
## CSV files as the dimensional information becomes part of the CSV output. I.e. instead of constructing
## a NxMx...xZ tensor we always construct a NxM tensor, where N-1 is the rank of the original tensor
## and M is the total size (total number of elements) of the original.
suite "[IO] CSV support":

test "Should export 1d Tensor":
Expand All @@ -62,6 +99,11 @@ proc main() =
let content = readFile(test_file_path)
check content == expected_output_1d

test "Read 1D serialized tensor":
let tRead = readCsv[int](test_file_path, skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "Should export 2d Tensor":
let t = @[@[1, 2, 3], @[4, 5, 6]].toTensor()
t.to_csv(test_file_path)
Expand All @@ -80,5 +122,29 @@ proc main() =
let content = readFile(test_file_path)
check content == expected_output_semicolon

test "CSV parsing ignores empty lines":
writeFile(test_file_path, csv_empty_lines)
let tRead = readCsv[int](test_file_path, skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "CSV parsing of different (semicolon) separators works":
writeFile(test_file_path, csv_semicolon_short)
let tRead = readCsv[int](test_file_path, separator = ';', skipHeader = true)
let tExp = @[@[0, 1], @[1, 2], @[2, 3], @[3, 4], @[4, 5]].toTensor()
check tExp == tRead

test "CSV parsing of file with quoted content works":
writeFile(test_file_path, csv_with_quoted)
let tRead = readCsv[string](test_file_path, quote = '\"', skipHeader = true)
let tExp = @[@["0", "A"],
@["1", "B"],
@["2", """hello, this is a string
with a line break, ugh
"""],
@["3", "D"],
@["4", "E"]].toTensor()
check tExp == tRead

main()
GC_fullCollect()

0 comments on commit 649e42b

Please sign in to comment.