From 209ba679bd7cb5aee4237567aebb7e19598ff3bc Mon Sep 17 00:00:00 2001 From: hellovai Date: Tue, 11 Jun 2024 12:05:06 -0700 Subject: [PATCH] Updated the deserializer for parsing inline objects (#664) * LLMs can sometimes spit out: {"key": value, "key2": val... This works when the object is parseable, but when it's not yet parseable (aka streaming), this causes some bugs. We capture that now. --- engine/Cargo.lock | 11 + engine/baml-lib/jsonish/Cargo.toml | 1 + .../deserializer/coercer/coerce_primitive.rs | 6 + .../parser/fixing_parser/json_parse_state.rs | 29 +++ engine/baml-lib/jsonish/src/tests/macros.rs | 9 +- engine/baml-lib/jsonish/src/tests/mod.rs | 2 +- .../jsonish/src/tests/test_partials.rs | 234 ++++++++++++++++++ 7 files changed, 285 insertions(+), 7 deletions(-) create mode 100644 engine/baml-lib/jsonish/src/tests/test_partials.rs diff --git a/engine/Cargo.lock b/engine/Cargo.lock index 18db7e862..e1968bed2 100644 --- a/engine/Cargo.lock +++ b/engine/Cargo.lock @@ -149,6 +149,16 @@ dependencies = [ "nom", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -1945,6 +1955,7 @@ name = "jsonish" version = "0.33.0" dependencies = [ "anyhow", + "assert-json-diff", "baml-types", "colored", "either", diff --git a/engine/baml-lib/jsonish/Cargo.toml b/engine/baml-lib/jsonish/Cargo.toml index 7ad800489..874345536 100644 --- a/engine/baml-lib/jsonish/Cargo.toml +++ b/engine/baml-lib/jsonish/Cargo.toml @@ -23,3 +23,4 @@ anyhow.workspace = true either = "1.10.0" test-log = "0.2.16" regex.workspace = true +assert-json-diff = "2.0.2" diff --git a/engine/baml-lib/jsonish/src/deserializer/coercer/coerce_primitive.rs b/engine/baml-lib/jsonish/src/deserializer/coercer/coerce_primitive.rs index 5b55cc416..4f6d8a753 
100644 --- a/engine/baml-lib/jsonish/src/deserializer/coercer/coerce_primitive.rs +++ b/engine/baml-lib/jsonish/src/deserializer/coercer/coerce_primitive.rs @@ -23,6 +23,12 @@ impl TypeCoercer for TypeValue { scope = ctx.display_scope(), current = value.map(|v| v.r#type()).unwrap_or("".into()) ); + log::trace!( + "content: {}", + value + .map(|v| v.to_string()) + .unwrap_or_else(|| "".into()) + ); match self { TypeValue::String => coerce_string(ctx, target, value), diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs index 97f42ea44..6ccc8ad3f 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs @@ -170,11 +170,40 @@ impl JsonParseState { '\n' => { return Some(idx); } + ' ' => { + // If after the space we have "//" or "/*" or the beginning of a key, we'll close the string + while let Some((_, c)) = next.next() { + match c { + ' ' => {} + '\n' => { + return Some(idx); + } + '/' => match next.peek() { + Some((_, '/')) => { + return Some(idx); + } + Some((_, '*')) => { + return Some(idx); + } + _ => { + let _ = self.consume(c); + } + }, + '"' => { + return Some(idx); + } + x => { + let _ = self.consume(x); + } + } + } + } _ => { let _ = self.consume(c); } } } else { + // Don't include the comma return Some(idx); } } diff --git a/engine/baml-lib/jsonish/src/tests/macros.rs b/engine/baml-lib/jsonish/src/tests/macros.rs index 8fb7a2a25..5ca56c10c 100644 --- a/engine/baml-lib/jsonish/src/tests/macros.rs +++ b/engine/baml-lib/jsonish/src/tests/macros.rs @@ -1,4 +1,3 @@ -#[macro_use] macro_rules! test_failing_deserializer { ($name:ident, $file_content:expr, $raw_string:expr, $target_type:expr) => { #[test_log::test] @@ -13,7 +12,6 @@ macro_rules! test_failing_deserializer { }; } -#[macro_use] macro_rules! 
test_deserializer { ($name:ident, $file_content:expr, $raw_string:expr, $target_type:expr, $($json:tt)+) => { #[test_log::test] @@ -38,12 +36,11 @@ macro_rules! test_deserializer { let expected = serde_json::json!($($json)+); - assert_eq!(json_value, expected, "Expected: {:#}, got: {:#?}", expected, value); + assert_json_diff::assert_json_eq!(json_value, expected); } }; } -#[macro_use] macro_rules! test_partial_deserializer { ($name:ident, $file_content:expr, $raw_string:expr, $target_type:expr, $($json:tt)+) => { #[test_log::test] @@ -61,13 +58,13 @@ macro_rules! test_partial_deserializer { assert!(result.is_ok(), "Failed to parse: {:?}", result); let value = result.unwrap(); - println!("{}", value); let value: BamlValue = value.into(); + println!("{:#?}", value); let json_value = json!(value); let expected = serde_json::json!($($json)+); - assert_eq!(json_value, expected, "Expected: {:#}, got: {:#?}", expected, value); + assert_json_diff::assert_json_eq!(json_value, expected); } }; } diff --git a/engine/baml-lib/jsonish/src/tests/mod.rs b/engine/baml-lib/jsonish/src/tests/mod.rs index 16af404bb..f02f6084b 100644 --- a/engine/baml-lib/jsonish/src/tests/mod.rs +++ b/engine/baml-lib/jsonish/src/tests/mod.rs @@ -7,11 +7,11 @@ pub mod macros; mod test_class; mod test_enum; mod test_lists; +mod test_partials; mod test_unions; use std::{ collections::{HashMap, HashSet}, - env, path::PathBuf, }; diff --git a/engine/baml-lib/jsonish/src/tests/test_partials.rs b/engine/baml-lib/jsonish/src/tests/test_partials.rs new file mode 100644 index 000000000..63082e05f --- /dev/null +++ b/engine/baml-lib/jsonish/src/tests/test_partials.rs @@ -0,0 +1,234 @@ +use super::*; + +const BAML_FILE: &str = r###" +class Score { + year int @description(#" + The year you're giving the score for. 
+ "#) + score int @description(#" + 1 to 100 + "#) +} + +class PopularityOverTime { + bookName string + scores Score[] +} + +class WordCount { + bookName string + count int +} + +class Ranking { + bookName string + score int @description(#" + 1 to 100 of your own personal score of this book + "#) +} + +class BookAnalysis { + bookNames string[] @description(#" + The list of book names provided + "#) + popularityOverTime PopularityOverTime[] @description(#" + Print the popularity of EACH BOOK over time. + Make sure you add datapoints up to the current year. Try to use a max of 10 datapoints to + represent the whole timeline for all books (so 10 handpicked years). + "#) @alias(popularityData) + popularityRankings Ranking[] @description(#" + A list of the book's popularity rankings over time. + The first element is the top ranking. + "#) + wordCounts WordCount[] +} +"###; + +test_partial_deserializer!( + test_partial_analysis_1, + BAML_FILE, + r#" + ```json + { + "bookNames": [ + "brave new world", + "the lord of the rings", + "three body problem", + "stormlight archive" + ], + "popularityData": [ + { + "bookName": "brave new world", + "scores": [ + {"year": 1950, "score": 70}, + {"year": 1960, "score": 75}, + {"year": 1970, "score": 80}, + {"year": 1980, "score": 85}, + {"year": 1990, "score": 85}, + {"year": 2000, "score": 90}, + {"year": 2010, "score": 95}, + {"year": 2020, "score": 97}, + {"year": 2023, "score": 98} + ] + }, + { + "bookName": "the lord of the rings", + "scores": [ + {"year": 1954, "score": 60}, + {"year": 1960, "score": 75}, + {"year": 1970, "score": 85}, + {"year": 1980, "score": 90}, + {"year": 1990, "score": 92}, + {"year": 2000, "score": 95}, + {"year": 2010, "score": 96}, + {"year": 2020, "score": 98}, + {"year": 2023, "score": 99} + ] + }, + { + "bookName": "three body problem", + "scores": [ + {"year": 2008, "score": 50}, + {"year": 2010, "score": 60}, + {"year": 2015, "score": 70}, + {"year": 2020, "score": 80}, + {"year": 2023, "score": 
85} + ] + }, + { + "bookName": "stormlight archive", + "scores": [ + {"year": 2010, "score": 55}, + {"year": 2014, "score": 65}, + {"year": 2017, "score": 75}, + {"year": 2020, "score": 80}, + {"year": 2023, "score": 85} + ] + } + ], + "popularityRankings": [ + {"bookName": "the lord of the rings", "score": 99}, + {"bookName": "brave new world", "score": 97}, + {"bookName": "stormlight archive", "score": 85}, + {"bookName": "three body problem", "score": 85} + ], + "wordCounts": [ + {"bookName": "brave new world", "count": 64000}, + {"bookName": "the lord of the rings", "count": 470000}, + {"bookName": "three body problem", "count": 150000}, + {"bookName": "stormlight archive", "count": 400000} + ] + } + ``` + "#, + FieldType::Class("BookAnalysis".to_string()), + { + "bookNames": [ + "brave new world", + "the lord of the rings", + "three body problem", + "stormlight archive" + ], + "popularityOverTime": [ + { + "bookName": "brave new world", + "scores": [ + {"year": 1950, "score": 70}, + {"year": 1960, "score": 75}, + {"year": 1970, "score": 80}, + {"year": 1980, "score": 85}, + {"year": 1990, "score": 85}, + {"year": 2000, "score": 90}, + {"year": 2010, "score": 95}, + {"year": 2020, "score": 97}, + {"year": 2023, "score": 98} + ] + }, + { + "bookName": "the lord of the rings", + "scores": [ + {"year": 1954, "score": 60}, + {"year": 1960, "score": 75}, + {"year": 1970, "score": 85}, + {"year": 1980, "score": 90}, + {"year": 1990, "score": 92}, + {"year": 2000, "score": 95}, + {"year": 2010, "score": 96}, + {"year": 2020, "score": 98}, + {"year": 2023, "score": 99} + ] + }, + { + "bookName": "three body problem", + "scores": [ + {"year": 2008, "score": 50}, + {"year": 2010, "score": 60}, + {"year": 2015, "score": 70}, + {"year": 2020, "score": 80}, + {"year": 2023, "score": 85} + ] + }, + { + "bookName": "stormlight archive", + "scores": [ + {"year": 2010, "score": 55}, + {"year": 2014, "score": 65}, + {"year": 2017, "score": 75}, + {"year": 2020, "score": 80}, + 
{"year": 2023, "score": 85} + ] + } + ], + "popularityRankings": [ + {"bookName": "the lord of the rings", "score": 99}, + {"bookName": "brave new world", "score": 97}, + {"bookName": "stormlight archive", "score": 85}, + {"bookName": "three body problem", "score": 85} + ], + "wordCounts": [ + {"bookName": "brave new world", "count": 64000}, + {"bookName": "the lord of the rings", "count": 470000}, + {"bookName": "three body problem", "count": 150000}, + {"bookName": "stormlight archive", "count": 400000} + ] + } +); + +test_partial_deserializer!( + test_partial_analysis_2, + BAML_FILE, + r#" + ```json + { + "bookNames": [ + "brave new world", + "the lord of the rings", + "three body problem", + "stormlight archive" + ], + "popularityData": [ + { + "bookName": "brave new world", + "scores": [ + {"year": 1950, "score": 70}, + "#, + FieldType::Class("BookAnalysis".to_string()), + { + "bookNames": [ + "brave new world", + "the lord of the rings", + "three body problem", + "stormlight archive" + ], + "popularityOverTime": [ + { + "bookName": "brave new world", + "scores": [ + {"year": 1950, "score": 70} + ] + } + ], + "popularityRankings": [], + "wordCounts": [] + } +);