Skip to content

Commit 3baf9c4

Browse files
authored
Provide codec config schemas (#1)
* Prototype codec config schemas
* Derive codec config schemas for static codecs
* Test PyCodecClass schema polyfill
* Fix dict item iteration
* Check for Python __doc__ is None
* Improve PyCodec schema generation
* Refactor the PyCodec schema polyfill implementation
* Draft the docs extraction from a json schema, enums not yet supported
* Inline subschemas for static codecs
* Add enum schema and schema-derived PyCodec signature support
* Implement PyCodec[Class] downcast support
* Improvements to codec docs for enum configs
* Fix config parsing for SZ3 and ZFP
* Pin schemars to the latest alpha
* Refactor the schema parameter extraction
1 parent de9e850 commit 3baf9c4

File tree

40 files changed

+1264
-260
lines changed

40 files changed

+1264
-260
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1+
/miniconda3
2+
13
/target
24
/Cargo.lock

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ postcard = { version = "1.0", default-features = false }
5454
pyo3 = { version = "0.21", default-features = false }
5555
pythonize = { version = "0.21", default-features = false }
5656
rand = { version = "0.8", default-features = false }
57+
schemars = { version = "=1.0.0-alpha.9", default-features = false }
5758
serde = { version = "1.0", default-features = false }
5859
serde-transcode = { version = "1.1", default-features = false }
5960
serde_json = { version = "1.0", default-features = false }

codecs/bit-round/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ keywords = ["bit-rounding", "numcodecs", "compression", "encoding"]
1717
[dependencies]
1818
ndarray = { workspace = true }
1919
numcodecs = { workspace = true }
20+
schemars = { workspace = true, features = ["derive", "preserve_order"] }
2021
serde = { workspace = true, features = ["std", "derive"] }
2122
thiserror = { workspace = true }
2223

codecs/bit-round/src/lib.rs

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,24 @@
1919
2020
use ndarray::{Array, ArrayBase, Data, Dimension};
2121
use numcodecs::{
22-
serialize_codec_config_with_id, AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView,
23-
AnyArrayViewMut, AnyCowArray, Codec, StaticCodec,
22+
AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
23+
Codec, StaticCodec, StaticCodecConfig,
2424
};
25-
use serde::{Deserialize, Deserializer, Serialize, Serializer};
25+
use schemars::JsonSchema;
26+
use serde::{Deserialize, Serialize};
2627
use thiserror::Error;
2728

28-
#[derive(Clone, Serialize, Deserialize)]
29-
/// Codec providing floating-point [`bit_round`]ing.
29+
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
30+
#[serde(deny_unknown_fields)]
31+
/// Codec providing floating-point bit rounding.
32+
///
33+
/// Drops the specified number of bits from the floating point mantissa,
34+
/// leaving an array that is more amenable to compression. The number of
35+
/// bits to keep should be determined by information analysis of the data
36+
/// to be compressed.
37+
///
38+
/// The approach is based on the paper by Klöwer et al. 2021
39+
/// (<https://www.nature.com/articles/s43588-021-00156-2>).
3040
pub struct BitRoundCodec {
3141
/// The number of bits of the mantissa to keep.
3242
///
@@ -67,17 +77,19 @@ impl Codec for BitRoundCodec {
6777

6878
Ok(decoded.assign(&encoded)?)
6979
}
70-
71-
fn get_config<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
72-
serialize_codec_config_with_id(self, self, serializer)
73-
}
7480
}
7581

7682
impl StaticCodec for BitRoundCodec {
7783
const CODEC_ID: &'static str = "bit-round";
7884

79-
fn from_config<'de, D: Deserializer<'de>>(config: D) -> Result<Self, D::Error> {
80-
Self::deserialize(config)
85+
type Config<'de> = Self;
86+
87+
fn from_config(config: Self::Config<'_>) -> Self {
88+
config
89+
}
90+
91+
fn get_config(&self) -> StaticCodecConfig<Self> {
92+
StaticCodecConfig::from(self)
8193
}
8294
}
8395

@@ -104,15 +116,8 @@ pub enum BitRoundCodecError {
104116
},
105117
}
106118

107-
/// Floating-point bit rounding.
108-
///
109-
/// Drops the specified number of bits from the floating point mantissa,
110-
/// leaving an array that is more amenable to compression. The number of
111-
/// bits to keep should be determined by information analysis of the data
112-
/// to be compressed.
113-
///
114-
/// The approach is based on the paper by Klöwer et al. 2021
115-
/// (<https://www.nature.com/articles/s43588-021-00156-2>).
119+
/// Floating-point bit rounding, which drops the specified number of bits from
120+
/// the floating point mantissa.
116121
///
117122
/// See <https://github.com/milankl/BitInformation.jl> for the original
118123
/// implementation in Julia.

codecs/identity/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ keywords = ["identity", "numcodecs", "compression", "encoding"]
1616

1717
[dependencies]
1818
numcodecs = { workspace = true }
19+
schemars = { workspace = true, features = ["derive", "preserve_order"] }
1920
serde = { workspace = true, features = ["std", "derive"] }
2021
thiserror = { workspace = true }
2122

codecs/identity/src/lib.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@
1818
//! Identity codec implementation for the [`numcodecs`] API.
1919
2020
use numcodecs::{
21-
serialize_codec_config_with_id, AnyArray, AnyArrayAssignError, AnyArrayView, AnyArrayViewMut,
22-
AnyCowArray, Codec, StaticCodec,
21+
AnyArray, AnyArrayAssignError, AnyArrayView, AnyArrayViewMut, AnyCowArray, Codec, StaticCodec,
22+
StaticCodecConfig,
2323
};
24-
use serde::{Deserialize, Deserializer, Serialize, Serializer};
24+
use schemars::JsonSchema;
25+
use serde::{Deserialize, Serialize};
2526
use thiserror::Error;
2627

27-
#[derive(Clone, Serialize, Deserialize)]
28+
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
29+
#[serde(deny_unknown_fields)]
2830
/// Identity codec which applies the identity function, i.e. passes through the
2931
/// input unchanged during encoding and decoding.
3032
pub struct IdentityCodec {
@@ -49,17 +51,19 @@ impl Codec for IdentityCodec {
4951
) -> Result<(), Self::Error> {
5052
Ok(decoded.assign(&encoded)?)
5153
}
52-
53-
fn get_config<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
54-
serialize_codec_config_with_id(self, self, serializer)
55-
}
5654
}
5755

5856
impl StaticCodec for IdentityCodec {
5957
const CODEC_ID: &'static str = "identity";
6058

61-
fn from_config<'de, D: Deserializer<'de>>(config: D) -> Result<Self, D::Error> {
62-
Self::deserialize(config)
59+
type Config<'de> = Self;
60+
61+
fn from_config(config: Self::Config<'_>) -> Self {
62+
config
63+
}
64+
65+
fn get_config(&self) -> StaticCodecConfig<Self> {
66+
StaticCodecConfig::from(self)
6367
}
6468
}
6569

codecs/linear-quantize/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ keywords = ["linear", "quantization", "numcodecs", "compression", "encoding"]
1818
ndarray = { workspace = true, features = ["std"] }
1919
numcodecs = { workspace = true }
2020
postcard = { workspace = true }
21+
schemars = { workspace = true, features = ["derive", "preserve_order"] }
2122
serde = { workspace = true, features = ["std", "derive"] }
2223
serde_repr = { workspace = true }
2324
thiserror = { workspace = true }

codecs/linear-quantize/src/lib.rs

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,17 @@ use std::{borrow::Cow, fmt};
2323

2424
use ndarray::{Array, Array1, ArrayBase, ArrayD, ArrayViewMutD, Data, Dimension, ShapeError};
2525
use numcodecs::{
26-
serialize_codec_config_with_id, AnyArray, AnyArrayDType, AnyArrayView, AnyArrayViewMut,
27-
AnyCowArray, Codec, StaticCodec,
26+
AnyArray, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray, Codec, StaticCodec,
27+
StaticCodecConfig,
2828
};
29-
use serde::{de::DeserializeOwned, Deserialize, Deserializer, Serialize, Serializer};
29+
use schemars::{JsonSchema, JsonSchema_repr};
30+
use serde::{de::DeserializeOwned, Deserialize, Serialize};
3031
use serde_repr::{Deserialize_repr, Serialize_repr};
3132
use thiserror::Error;
3233
use twofloat::TwoFloat;
3334

34-
#[derive(Clone, Serialize, Deserialize)]
35+
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
36+
#[serde(deny_unknown_fields)]
3537
/// Lossy codec to reduce the precision of floating point data.
3638
///
3739
/// The data is quantized to unsigned integers of the best-fitting type.
@@ -44,7 +46,8 @@ pub struct LinearQuantizeCodec {
4446
}
4547

4648
/// Data types which the [`LinearQuantizeCodec`] can quantize
47-
#[derive(Copy, Clone, Debug, serde::Serialize, serde::Deserialize)]
49+
#[derive(Copy, Clone, Debug, Serialize, Deserialize, JsonSchema)]
50+
#[schemars(extend("enum" = ["f32", "float32", "f64", "float64"]))]
4851
#[allow(missing_docs)]
4952
pub enum LinearQuantizeDType {
5053
#[serde(rename = "f32", alias = "float32")]
@@ -67,7 +70,7 @@ impl fmt::Display for LinearQuantizeDType {
6770
/// The binary `#[repr(u8)]` value of each variant is equivalent to the binary
6871
/// logarithm of the number of bins, i.e. the binary precision or the number of
6972
/// bits used.
70-
#[derive(Copy, Clone, Serialize_repr, Deserialize_repr)]
73+
#[derive(Copy, Clone, Serialize_repr, Deserialize_repr, JsonSchema_repr)]
7174
#[repr(u8)]
7275
#[rustfmt::skip]
7376
#[allow(missing_docs)]
@@ -382,17 +385,19 @@ impl Codec for LinearQuantizeCodec {
382385

383386
Ok(())
384387
}
385-
386-
fn get_config<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
387-
serialize_codec_config_with_id(self, self, serializer)
388-
}
389388
}
390389

391390
impl StaticCodec for LinearQuantizeCodec {
392391
const CODEC_ID: &'static str = "linear-quantize";
393392

394-
fn from_config<'de, D: Deserializer<'de>>(config: D) -> Result<Self, D::Error> {
395-
Self::deserialize(config)
393+
type Config<'de> = Self;
394+
395+
fn from_config(config: Self::Config<'_>) -> Self {
396+
config
397+
}
398+
399+
fn get_config(&self) -> StaticCodecConfig<Self> {
400+
StaticCodecConfig::from(self)
396401
}
397402
}
398403

@@ -714,7 +719,7 @@ impl Unsigned for u64 {
714719
const ZERO: Self = 0;
715720
}
716721

717-
#[derive(serde::Serialize, serde::Deserialize)]
722+
#[derive(Serialize, Deserialize)]
718723
#[serde(bound = "")]
719724
struct CompressionHeader<'a, T: Float> {
720725
#[serde(borrow)]

codecs/log/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ keywords = ["log", "numcodecs", "compression", "encoding"]
1717
[dependencies]
1818
ndarray = { workspace = true }
1919
numcodecs = { workspace = true }
20+
schemars = { workspace = true, features = ["derive", "preserve_order"] }
2021
serde = { workspace = true, features = ["std", "derive"] }
2122
thiserror = { workspace = true }
2223

codecs/log/src/lib.rs

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@
1919
2020
use ndarray::{Array, ArrayBase, ArrayView, ArrayViewMut, Data, Dimension};
2121
use numcodecs::{
22-
serialize_codec_config_with_id, AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView,
23-
AnyArrayViewMut, AnyCowArray, Codec, StaticCodec,
22+
AnyArray, AnyArrayAssignError, AnyArrayDType, AnyArrayView, AnyArrayViewMut, AnyCowArray,
23+
Codec, StaticCodec, StaticCodecConfig,
2424
};
25-
use serde::{Deserialize, Deserializer, Serialize, Serializer};
25+
use schemars::JsonSchema;
26+
use serde::{Deserialize, Serialize};
2627
use thiserror::Error;
2728

28-
#[derive(Clone, Serialize, Deserialize)]
29+
#[derive(Clone, Serialize, Deserialize, JsonSchema)]
30+
#[serde(deny_unknown_fields)]
2931
/// Log codec which calculates `c = log(1+x)` on encoding and `d = exp(c)-1` on
3032
/// decoding.
3133
///
@@ -76,17 +78,19 @@ impl Codec for LogCodec {
7678
(encoded, _decoded) => Err(LogCodecError::UnsupportedDtype(encoded.dtype())),
7779
}
7880
}
79-
80-
fn get_config<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
81-
serialize_codec_config_with_id(self, self, serializer)
82-
}
8381
}
8482

8583
impl StaticCodec for LogCodec {
8684
const CODEC_ID: &'static str = "log";
8785

88-
fn from_config<'de, D: Deserializer<'de>>(config: D) -> Result<Self, D::Error> {
89-
Self::deserialize(config)
86+
type Config<'de> = Self;
87+
88+
fn from_config(config: Self::Config<'_>) -> Self {
89+
config
90+
}
91+
92+
fn get_config(&self) -> StaticCodecConfig<Self> {
93+
StaticCodecConfig::from(self)
9094
}
9195
}
9296

0 commit comments

Comments
 (0)