From 296e35ce6cf4db12dd18b3c83ca21b59a9c60c6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kevin=20L=C3=A4ufer?= <laeufer@cornell.edu>
Date: Thu, 31 Oct 2024 13:13:59 -0400
Subject: [PATCH] wip: port VCD parser to use BufRead trait

---
 wellen/src/simple.rs       |  13 +++
 wellen/src/vcd.rs          | 203 ++++++++++++++++++++-----------------
 wellen/tests/diff_tests.rs |  36 +++++--
 3 files changed, 150 insertions(+), 102 deletions(-)
diff --git a/wellen/src/simple.rs b/wellen/src/simple.rs
index c396544..e38629f 100644
--- a/wellen/src/simple.rs
+++ b/wellen/src/simple.rs
@@ -10,6 +10,7 @@ use crate::{
 };
 use std::collections::HashMap;
 use std::fmt::{Debug, Formatter};
+use std::io::{BufRead, Seek};
 
 /// Read a waveform file with the default options. Reads in header and body at once.
 pub fn read<P: AsRef<std::path::Path>>(filename: P) -> Result<Waveform> {
@@ -30,6 +31,18 @@ pub fn read_with_options<P: AsRef<std::path::Path>>(
     ))
 }
 
+/// Read a waveform (header and body at once) from a buffered, seekable reader with default options.
+pub fn read_from_reader<R: BufRead + Seek + Send + Sync + 'static>(input: R) -> Result<Waveform> {
+    let options = LoadOptions::default();
+    let header = viewers::read_header(input, &options)?;
+    let body = viewers::read_body(header.body, &header.hierarchy, None)?;
+    Ok(Waveform::new(
+        header.hierarchy,
+        body.source,
+        body.time_table,
+    ))
+}
+
 /// Provides file format independent access to a waveform file.
 pub struct Waveform {
     hierarchy: Hierarchy,
diff --git a/wellen/src/vcd.rs b/wellen/src/vcd.rs
index f0a1d8a..9cb56fa 100644
--- a/wellen/src/vcd.rs
+++ b/wellen/src/vcd.rs
@@ -117,7 +117,26 @@ pub fn read_body<R: BufRead + Seek>(
     progress: Option<ProgressCount>,
 ) -> Result<(SignalSource, TimeTable)> {
     let (source, time_table) = match data.input {
-        Input::Reader(input) => todo!("parse VCD from reader"),
+        Input::Reader(mut input) => {
+            // determine body length: the distance from the current position to the end
+            let start = input.stream_position()?;
+            let end = input.seek(SeekFrom::End(0))?; // `seek` returns the new position
+            input.seek(SeekFrom::Start(start))?; // rewind to where the body begins
+            // NOTE(review): presumably the reader is positioned right after the header here
+            let input_len = (end - start) as usize;
+
+            // encode signals
+            let encoder = read_single_stream_of_values(
+                &mut input,
+                input_len.saturating_sub(1), // an empty body must not underflow
+                true,
+                true,
+                hierarchy,
+                &data.lookup,
+                progress,
+            )?;
+            encoder.finish()
+        }
         Input::Mmap(mmap) => read_values(
             &mmap[data.header_len..],
             data.multi_thread,
@@ -963,7 +982,7 @@ fn read_values(
 ) -> Result<(SignalSource, TimeTable)> {
     if multi_thread {
         let chunks = determine_thread_chunks(input.len());
-        let encoders: Vec<crate::wavemem::Encoder> = chunks
+        let encoders: Result<Vec<crate::wavemem::Encoder>> = chunks
             .par_iter()
             .map(|(start, len)| {
                 let is_first = *start == 0;
@@ -975,8 +994,9 @@ fn read_values(
                     // TODO: deal with \n\r
                     before == b'\n'
                 };
+                let mut inp = std::io::Cursor::new(&input[*start..]);
                 read_single_stream_of_values(
-                    &input[*start..],
+                    &mut inp,
                     *len - 1,
                     is_first,
                     starts_on_new_line,
@@ -986,6 +1006,7 @@ fn read_values(
                 )
             })
             .collect();
+        let encoders = encoders?;
 
         // combine encoders
         let mut encoder_iter = encoders.into_iter();
@@ -995,42 +1016,43 @@ fn read_values(
         }
         Ok(encoder.finish())
     } else {
+        let mut inp = std::io::Cursor::new(input);
         let encoder = read_single_stream_of_values(
-            input,
+            &mut inp,
             input.len() - 1,
             true,
             true,
             hierarchy,
             lookup,
             progress,
-        );
+        )?;
         Ok(encoder.finish())
     }
 }
 
-fn read_single_stream_of_values(
-    input: &[u8],
+fn read_single_stream_of_values<R: BufRead + Seek>(
+    input: &mut R,
     stop_pos: usize,
     is_first: bool,
     starts_on_new_line: bool,
     hierarchy: &Hierarchy,
     lookup: &IdLookup,
     progress: Option<ProgressCount>,
-) -> crate::wavemem::Encoder {
+) -> Result<crate::wavemem::Encoder> {
     let mut encoder = crate::wavemem::Encoder::new(hierarchy);
 
-    let (input2, offset) = if starts_on_new_line {
-        (input, 0)
-    } else {
-        advance_to_first_newline(input)
-    };
-    let mut reader = BodyReader::new(input2);
+    if !starts_on_new_line {
+        // if we start in the middle of a line, we need to skip it
+        let mut dummy = Vec::new();
+        input.read_until(b'\n', &mut dummy)?;
+    }
+    let mut reader = BodyReader::new(input);
     // We only start recording once we have encountered our first time step
     let mut found_first_time_step = false;
 
     // progress tracking
     let mut last_reported_pos = 0;
-    let report_increments = std::cmp::max(input2.len() as u64 / 1000, 512);
+    let report_increments = std::cmp::max(stop_pos as u64 / 1000, 512);
 
     loop {
         if let Some((pos, cmd)) = reader.next() {
@@ -1094,105 +1116,101 @@ fn advance_to_first_newline(input: &[u8]) -> (&[u8], usize) {
     (&[], 0) // no whitespaces found
 }
 
-struct BodyReader<'a> {
-    input: &'a [u8],
+struct BodyReader<'a, R: BufRead> {
+    input: &'a mut R, // must be mutable: every `BufRead` read method takes `&mut self`
     // state
     pos: usize,
+    token: Vec<u8>,
+    prev_token: Vec<u8>,
     // statistics
     lines_read: usize,
 }
 
 const ASCII_ZERO: &[u8] = b"0";
 
-impl<'a> BodyReader<'a> {
-    fn new(input: &'a [u8]) -> Self {
+impl<'a, R: BufRead> BodyReader<'a, R> {
+    fn new(input: &'a mut R) -> Self {
         BodyReader {
             input,
             pos: 0,
+            token: Vec::with_capacity(64),
+            prev_token: Vec::with_capacity(64),
             lines_read: 0,
         }
     }
 
     #[inline]
-    fn try_finish_token(
-        &mut self,
-        pos: usize,
-        token_start: &mut Option<usize>,
-        prev_token: &mut Option<&'a [u8]>,
-        search_for_end: &mut bool,
-    ) -> Option<BodyCmd<'a>> {
-        match *token_start {
-            None => None,
-            Some(start) => {
-                let token = &self.input[start..pos];
-                if token.is_empty() {
-                    return None;
-                }
-                if *search_for_end {
-                    *search_for_end = token != b"$end";
-                    // consume token and return
-                    *token_start = None;
-                    return None;
+    fn try_finish_token(&mut self, pos: usize, search_for_end: &mut bool) -> Option<BodyCmd<'a>> {
+        // no token means that there is nothing to do
+        if self.token.is_empty() {
+            return None;
+        }
+
+        // if we are looking for the $end token, we discard everything else
+        if *search_for_end {
+            // did we find the end token?
+            *search_for_end = self.token != b"$end";
+            // consume token and return
+            self.token.clear();
+            return None;
+        }
+
+        // if there was no previous token
+        if self.prev_token.is_empty() {
+            if self.token.len() == 1 {
+                // too short, wait for more input
+                return None;
+            }
+
+            // 1-token commands are binary changes or time commands
+            match self.token[0] {
+                b'#' => Some(BodyCmd::Time(&self.token[1..])),
+                b'0' | b'1' | b'z' | b'Z' | b'x' | b'X' | b'h' | b'H' | b'u' | b'U' | b'w'
+                | b'W' | b'l' | b'L' | b'-' => {
+                    Some(BodyCmd::Value(&self.token[0..1], &self.token[1..]))
                 }
-                let ret = match *prev_token {
-                    None => {
-                        if token.len() == 1 {
-                            // too short
-                            return None;
+                _ => {
+                    // parse command tokens
+                    match self.token.as_slice() {
+                        b"$dumpall" => {
+                            // interpret dumpall as indicating timestep zero
+                            self.token.clear();
+                            return Some(BodyCmd::Time(ASCII_ZERO));
                         }
-                        // 1-token commands are binary changes or time commands
-                        match token[0] {
-                            b'#' => Some(BodyCmd::Time(&token[1..])),
-                            b'0' | b'1' | b'z' | b'Z' | b'x' | b'X' | b'h' | b'H' | b'u' | b'U'
-                            | b'w' | b'W' | b'l' | b'L' | b'-' => {
-                                Some(BodyCmd::Value(&token[0..1], &token[1..]))
-                            }
-                            _ => {
-                                if token == b"$dumpall" {
-                                    // interpret dumpall as indicating timestep zero
-                                    return Some(BodyCmd::Time(ASCII_ZERO));
-                                }
-                                if token == b"$comment" {
-                                    // drop token, but start searching for $end in order to skip the comment
-                                    *search_for_end = true;
-                                } else if token != b"$dumpvars"
-                                    && token != b"$end"
-                                    && token != b"$dumpoff"
-                                    && token != b"$dumpon"
-                                {
-                                    // ignore dumpvars, dumpoff, dumpon, and end command
-                                    *prev_token = Some(token);
-                                }
-                                None
-                            }
+                        b"$comment" => {
+                            // drop token, but start searching for $end in order to skip the comment
+                            *search_for_end = true;
                         }
+                        b"$dumpvars" | b"$end" | b"$dumpoff" | b"$dumpon" => {
+                            // ignore dumpvars, dumpoff, dumpon, and end command
+                        }
+                        // anything else is the first token of a two-token value change: keep it
+                        _ => self.prev_token.extend_from_slice(&self.token),
                     }
-                    Some(first) => {
-                        let cmd = match first[0] {
-                            b'b' | b'B' | b'r' | b'R' | b's' | b'S' => {
-                                BodyCmd::Value(&first[0..], token)
-                            }
-                            _ => {
-                                panic!(
-                                    "Unexpected tokens: `{}` and `{}` ({} lines after header)",
-                                    String::from_utf8_lossy(first),
-                                    String::from_utf8_lossy(token),
-                                    self.lines_read
-                                );
-                            }
-                        };
-                        *prev_token = None;
-                        Some(cmd)
-                    }
-                };
-                *token_start = None;
-                ret
+                    // wait for more input
+                    None
+                }
             }
+        } else {
+            let cmd = match self.prev_token[0] {
+                b'b' | b'B' | b'r' | b'R' | b's' | b'S' => {
+                    BodyCmd::Value(&self.prev_token[0..], self.token.as_slice())
+                }
+                _ => {
+                    panic!(
+                        "Unexpected tokens: `{}` and `{}` ({} lines after header)",
+                        String::from_utf8_lossy(self.prev_token.as_slice()),
+                        String::from_utf8_lossy(self.token.as_slice()),
+                        self.lines_read
+                    );
+                }
+            };
+            Some(cmd)
         }
     }
 }
 
-impl<'a> Iterator for BodyReader<'a> {
+impl<'a, R: BufRead> Iterator for BodyReader<'a, R> {
     type Item = (usize, BodyCmd<'a>);
 
     /// returns the starting position and the body of the command
@@ -1253,12 +1271,7 @@ impl<'a> Iterator for BodyReader<'a> {
         // update final position
         self.pos = self.input.len();
         // check to see if there is a final token at the end
-        match self.try_finish_token(
-            self.pos,
-            &mut token_start,
-            &mut prev_token,
-            &mut search_for_end,
-        ) {
+        match self.try_finish_token(self.pos, &mut search_for_end) {
             None => {}
             Some(cmd) => {
                 return Some((start_pos, cmd));
diff --git a/wellen/tests/diff_tests.rs b/wellen/tests/diff_tests.rs
index 99158ac..64c30f2 100644
--- a/wellen/tests/diff_tests.rs
+++ b/wellen/tests/diff_tests.rs
@@ -8,27 +8,32 @@ use wellen::simple::*;
 use wellen::*;
 
 fn run_diff_test(vcd_filename: &str, fst_filename: &str) {
-    run_diff_test_internal(vcd_filename, Some(fst_filename), false);
+    run_diff_test_internal(vcd_filename, Some(fst_filename), false, false);
+}
+
+fn run_diff_test_from_bytes(vcd_filename: &str, fst_filename: &str) {
+    run_diff_test_internal(vcd_filename, Some(fst_filename), false, true);
 }
 
 fn run_diff_test_vcd_only(vcd_filename: &str) {
-    run_diff_test_internal(vcd_filename, None, false);
+    run_diff_test_internal(vcd_filename, None, false, false);
 }
 
 /// Skips trying to load the content with the `vcd` library. This is important for files
 /// with 9-state values since these cannot be read by the `vcd` library.
 fn run_load_test(vcd_filename: &str, fst_filename: &str) {
-    run_diff_test_internal(vcd_filename, Some(fst_filename), true);
+    run_diff_test_internal(vcd_filename, Some(fst_filename), true, false);
 }
 
 fn run_load_test_vcd(vcd_filename: &str) {
-    run_diff_test_internal(vcd_filename, None, true);
+    run_diff_test_internal(vcd_filename, None, true, false);
 }
 
 fn run_diff_test_internal(
     vcd_filename: &str,
     fst_filename: Option<&str>,
     skip_content_comparison: bool,
+    load_from_bytes_instead_of_file: bool,
 ) {
     {
         let single_thread = LoadOptions {
@@ -39,13 +44,25 @@ fn run_diff_test_internal(
             .expect("Failed to load VCD with a single thread");
         diff_test_one(vcd_filename, wave, skip_content_comparison);
     }
-    {
+    if load_from_bytes_instead_of_file {
+        let bytes = std::io::Cursor::new(std::fs::read(vcd_filename).expect("failed"));
+        let wave =
+            read_from_reader(bytes).expect("Failed to load VCD with multiple threads from bytes");
+        diff_test_one(vcd_filename, wave, skip_content_comparison);
+    } else {
         let wave = read(vcd_filename).expect("Failed to load VCD with multiple threads");
         diff_test_one(vcd_filename, wave, skip_content_comparison);
     }
     if let Some(fst_filename) = fst_filename {
-        let wave = read(fst_filename).expect("Failed to load FST");
-        diff_test_one(vcd_filename, wave, skip_content_comparison);
+        let wave = if load_from_bytes_instead_of_file {
+            let bytes = std::io::Cursor::new(std::fs::read(fst_filename).expect("failed"));
+            read_from_reader(bytes).expect("Failed to load FST with multiple threads from bytes")
+        } else {
+            read(fst_filename).expect("Failed to load FST")
+        };
+        // Both loading paths hand `diff_test_one` the VCD path, exactly like the
+        // file-based FST check did before; only the way `wave` is loaded differs.
+        diff_test_one(vcd_filename, wave, skip_content_comparison);
     }
 }
 
@@ -470,6 +487,11 @@ fn diff_icarus_test1() {
     run_diff_test("inputs/icarus/test1.vcd", "inputs/icarus/test1.vcd.fst");
 }
 
+#[test]
+fn diff_icarus_test1_from_bytes() {
+    run_diff_test_from_bytes("inputs/icarus/test1.vcd", "inputs/icarus/test1.vcd.fst");
+}
+
 #[test]
 fn diff_model_sim_clkdiv2n_tb() {
     run_diff_test(