|
/**
 * Dtype system — immutable singleton descriptors for all pandas-equivalent dtypes.
 *
 * Mirrors pandas' dtype hierarchy: numeric (int, uint, float), bool, string,
 * object, datetime, timedelta, and category. Each Dtype is a flyweight (cached
 * singleton keyed by name) so identity comparisons (`===`) work correctly.
 */
| 8 | + |
| 9 | +import type { DtypeName, Scalar } from "../types.ts"; |
| 10 | + |
/** Kinds whose values are plain numbers (signed, unsigned, floating point). */
type NumericKind = "int" | "uint" | "float";

/** Kinds that describe points in time or durations. */
type TemporalKind = "datetime" | "timedelta";

/** Broad family ("kind") that a dtype is classified into. */
export type DtypeKind =
  | NumericKind
  | "bool"
  | "string"
  | "object"
  | TemporalKind
  | "category";

/** Width of one element in bytes; `0` marks a variable or unknown width. */
export type ItemSize = 0 | 1 | 2 | 4 | 8;
| 25 | + |
/** Module-level flyweight cache: at most one `Dtype` instance per dtype name. */
const _registry = new Map<DtypeName, Dtype>();

/**
 * Accumulator filled while scanning values in `Dtype.inferFrom`.
 *
 * Every `all*` flag starts out `true` and is switched off as soon as a value
 * incompatible with that category is seen.
 */
interface InferFlags {
  allBool: boolean;
  allInt: boolean;
  allFloat: boolean;
  allDate: boolean;
  allString: boolean;
  /** True once at least one non-null, non-undefined value has been scanned. */
  sawValue: boolean;
}

/**
 * An immutable descriptor for a data type.
 *
 * Instances are created only through `Dtype.from` (the constructor is
 * private) and cached by name, so two descriptors with the same name are the
 * same object and can be compared with `===`.
 *
 * @example
 * ```ts
 * const dt = Dtype.float64;
 * dt.isNumeric; // true
 * dt.itemsize; // 8
 * Dtype.from("float64") === dt; // true — singletons
 * ```
 */
export class Dtype {
  /** Canonical dtype name, also the cache key. */
  readonly name: DtypeName;
  /** Broad family the dtype belongs to. */
  readonly kind: DtypeKind;
  /** Element width in bytes (0 = variable / unknown). */
  readonly itemsize: ItemSize;

  private constructor(name: DtypeName, kind: DtypeKind, itemsize: ItemSize) {
    this.name = name;
    this.kind = kind;
    this.itemsize = itemsize;
  }

  // ─── singleton factory ──────────────────────────────────────────

  /**
   * Return the singleton for `name`, creating and caching it on first use.
   *
   * @throws TypeError when `name` is not a known dtype name (only reachable
   *   from untyped callers or unchecked casts).
   */
  static from(name: DtypeName): Dtype {
    let dt = _registry.get(name);
    if (dt === undefined) {
      // build() throws for unknown names, so nothing invalid is ever cached.
      dt = Dtype.build(name);
      _registry.set(name, dt);
    }
    return dt;
  }

  /** Construct a fresh descriptor for `name`; callers must go through `from`. */
  private static build(name: DtypeName): Dtype {
    switch (name) {
      case "int8":
        return new Dtype("int8", "int", 1);
      case "int16":
        return new Dtype("int16", "int", 2);
      case "int32":
        return new Dtype("int32", "int", 4);
      case "int64":
        return new Dtype("int64", "int", 8);
      case "uint8":
        return new Dtype("uint8", "uint", 1);
      case "uint16":
        return new Dtype("uint16", "uint", 2);
      case "uint32":
        return new Dtype("uint32", "uint", 4);
      case "uint64":
        return new Dtype("uint64", "uint", 8);
      case "float32":
        return new Dtype("float32", "float", 4);
      case "float64":
        return new Dtype("float64", "float", 8);
      case "bool":
        return new Dtype("bool", "bool", 1);
      case "string":
        return new Dtype("string", "string", 0);
      case "object":
        return new Dtype("object", "object", 0);
      case "datetime":
        return new Dtype("datetime", "datetime", 8);
      case "timedelta":
        return new Dtype("timedelta", "timedelta", 8);
      case "category":
        return new Dtype("category", "category", 0);
      default:
        // Without this, an unexpected runtime string would yield `undefined`
        // and `from` would cache it.
        throw new TypeError(`Unknown dtype name: ${String(name)}`);
    }
  }

  // ─── named singletons ───────────────────────────────────────────

  static readonly int8 = Dtype.from("int8");
  static readonly int16 = Dtype.from("int16");
  static readonly int32 = Dtype.from("int32");
  static readonly int64 = Dtype.from("int64");
  static readonly uint8 = Dtype.from("uint8");
  static readonly uint16 = Dtype.from("uint16");
  static readonly uint32 = Dtype.from("uint32");
  static readonly uint64 = Dtype.from("uint64");
  static readonly float32 = Dtype.from("float32");
  static readonly float64 = Dtype.from("float64");
  static readonly bool = Dtype.from("bool");
  static readonly string = Dtype.from("string");
  static readonly object = Dtype.from("object");
  static readonly datetime = Dtype.from("datetime");
  static readonly timedelta = Dtype.from("timedelta");
  static readonly category = Dtype.from("category");

  // ─── type predicates ────────────────────────────────────────────

  /** True for signed/unsigned integer and floating-point dtypes. */
  get isNumeric(): boolean {
    return this.kind === "int" || this.kind === "uint" || this.kind === "float";
  }

  /** True for signed and unsigned integer dtypes. */
  get isInteger(): boolean {
    return this.kind === "int" || this.kind === "uint";
  }

  get isSignedInteger(): boolean {
    return this.kind === "int";
  }

  get isUnsignedInteger(): boolean {
    return this.kind === "uint";
  }

  get isFloat(): boolean {
    return this.kind === "float";
  }

  get isBool(): boolean {
    return this.kind === "bool";
  }

  get isString(): boolean {
    return this.kind === "string";
  }

  get isDatetime(): boolean {
    return this.kind === "datetime";
  }

  get isTimedelta(): boolean {
    return this.kind === "timedelta";
  }

  get isCategory(): boolean {
    return this.kind === "category";
  }

  get isObject(): boolean {
    return this.kind === "object";
  }

  // ─── casting / promotion ────────────────────────────────────────

  /**
   * True when values of `this` dtype can be cast to `target` under numpy's
   * "safe" casting rules.
   *
   * Accepted casts:
   * - identical dtypes;
   * - numeric → numeric as decided by `isSafeNumericCast`;
   * - bool → any numeric dtype;
   * - string → object.
   *
   * Everything else is rejected.
   */
  canCastTo(target: Dtype): boolean {
    if (this === target) {
      return true;
    }
    if (this.isNumeric && target.isNumeric) {
      return Dtype.isSafeNumericCast(this, target);
    }
    if (this.isBool && target.isNumeric) {
      return true;
    }
    return this.isString && target.isObject;
  }

  /**
   * Safe-cast decision for two distinct numeric dtypes.
   *
   * The decision depends on kind and width, not on a position in a flat
   * list: a flat ordering wrongly allows e.g. int64 → uint8 or int64 → float32.
   */
  private static isSafeNumericCast(from: Dtype, to: Dtype): boolean {
    if (to.isFloat) {
      if (from.isFloat) {
        return to.itemsize >= from.itemsize;
      }
      // Integer → float: float64 accepts every integer width (numpy treats
      // 64-bit integers → float64 as safe); float32 only accepts integers of
      // at most 16 bits.
      return to.itemsize === 8 || from.itemsize <= 2;
    }
    if (from.isFloat) {
      // Float → integer always drops the fractional part.
      return false;
    }
    if (from.kind === to.kind) {
      // Same signedness: widening only.
      return to.itemsize >= from.itemsize;
    }
    // Mixed signedness: unsigned → signed needs a strictly wider target;
    // signed → unsigned can never hold negative values.
    return from.isUnsignedInteger && to.itemsize > from.itemsize;
  }

  /**
   * Return the smallest dtype that can represent both `a` and `b` without loss.
   *
   * If one side safely casts to the other, the wider side is returned. Two
   * numeric dtypes that cannot hold each other (e.g. int8 and uint8) resolve
   * to the narrowest numeric dtype both cast to (int16 in that example).
   * Any other combination falls back to `object`.
   */
  static commonType(a: Dtype, b: Dtype): Dtype {
    if (a === b) {
      return a;
    }
    if (a.canCastTo(b)) {
      return b;
    }
    if (b.canCastTo(a)) {
      return a;
    }
    if (a.isNumeric && b.isNumeric) {
      // Narrowest-first ladder; the first rung both sides can reach wins.
      const ladder: readonly DtypeName[] = [
        "int8",
        "uint8",
        "int16",
        "uint16",
        "int32",
        "uint32",
        "int64",
        "uint64",
        "float32",
        "float64",
      ];
      for (const name of ladder) {
        const candidate = Dtype.from(name);
        if (a.canCastTo(candidate) && b.canCastTo(candidate)) {
          return candidate;
        }
      }
      // Every numeric dtype casts to float64, so this is only a safety net.
      return Dtype.float64;
    }
    // bool + numeric is already covered by canCastTo above.
    return Dtype.object;
  }

  // ─── inference ──────────────────────────────────────────────────

  /**
   * Infer the most specific dtype from an array of scalar values.
   *
   * `null` and `undefined` entries are ignored while scanning.
   *
   * Rules (in priority order):
   * 1. Empty array, or only null/undefined entries → float64.
   * 2. All booleans → bool.
   * 3. All integral numbers (no NaN/Infinity) → int64. Booleans do not
   *    disqualify this rule, so a mix of booleans and integers is int64.
   * 4. All numbers (including NaN/Infinity), possibly mixed with booleans → float64.
   * 5. All Date objects → datetime.
   * 6. All strings → string.
   * 7. Otherwise → object.
   */
  static inferFrom(values: readonly Scalar[]): Dtype {
    if (values.length === 0) {
      return Dtype.float64;
    }
    return Dtype.flagsToDtype(Dtype.scanFlags(values));
  }

  /** Scan `values`, skipping null/undefined, and collect the category flags. */
  private static scanFlags(values: readonly Scalar[]): InferFlags {
    const flags: InferFlags = {
      allBool: true,
      allInt: true,
      allFloat: true,
      allDate: true,
      allString: true,
      sawValue: false,
    };
    for (const v of values) {
      if (v === null || v === undefined) {
        continue;
      }
      flags.sawValue = true;
      Dtype.updateFlags(flags, v);
    }
    return flags;
  }

  /** Switch off every flag that the single value `v` contradicts. */
  private static updateFlags(flags: InferFlags, v: NonNullable<Scalar>): void {
    if (typeof v === "boolean") {
      // Booleans leave the numeric flags untouched on purpose.
      flags.allString = false;
      flags.allDate = false;
      return;
    }
    flags.allBool = false;
    if (typeof v === "number") {
      flags.allString = false;
      flags.allDate = false;
      // Number.isInteger is false for NaN and ±Infinity as well.
      if (!Number.isInteger(v)) {
        flags.allInt = false;
      }
      return;
    }
    flags.allInt = false;
    flags.allFloat = false;
    if (v instanceof Date) {
      flags.allString = false;
      return;
    }
    flags.allDate = false;
    if (typeof v !== "string") {
      // Any other runtime type can only be represented as object.
      flags.allString = false;
    }
  }

  /** Map collected flags to a dtype using the priority order of `inferFrom`. */
  private static flagsToDtype(f: InferFlags): Dtype {
    if (!f.sawValue) {
      // Only missing values were seen: treat like an empty input instead of
      // letting the untouched `allBool` flag win.
      return Dtype.float64;
    }
    if (f.allBool) {
      return Dtype.bool;
    }
    if (f.allInt) {
      return Dtype.int64;
    }
    if (f.allFloat) {
      return Dtype.float64;
    }
    if (f.allDate) {
      return Dtype.datetime;
    }
    if (f.allString) {
      return Dtype.string;
    }
    return Dtype.object;
  }

  // ─── misc ───────────────────────────────────────────────────────

  /** The dtype's name, e.g. `"float64"`. */
  toString(): string {
    return this.name;
  }

  /** Equality: dtypes are singletons, so reference equality suffices. */
  equals(other: Dtype): boolean {
    return this === other;
  }
}
0 commit comments