Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,15 @@ Datatypes
| -- | --------- | --- |----------------------- |
| 1 | String | Variable | Size defined in column definition at `0xA6` |
| 2 | Date | 4 | Days -1 since [AD 1, Jan 0](https://en.wikipedia.org/wiki/List_of_non-standard_dates#January_0) |
| 3 | BLOB | ? | Not yet supported. BLOBs are stored in a separate `.blb` file. The data in the `.dat` file is likely an address for the `.blb` file. |
| 3 | BLOB | 8 | Block index in the `.blb` file. The actual content is stored in blocks with 18-byte headers. |
| 4 | Boolean | 1 | Missing the trailing `\x01` marker |
| 5 | Short Int | 2 | |
| 6 | Int | 4 | |
| 7 | Double | 8 | IEEE-754 |
| 11 | Timestamp | 8 | IEEE-754, milliseconds since [AD 1, Jan 0](https://en.wikipedia.org/wiki/List_of_non-standard_dates#January_0) |
| 5383 | Currency | 8 | IEEE-754 |
| 7430 | Autoincrement | 4 | Int |
| 7431 | MEMO | 8 | Similar to BLOB, stores text content in the `.blb` file |


Row Definition
Expand All @@ -66,3 +67,24 @@ A row in the actual data section of the database has a 26-byte row header. The m
| `0x9` | 16 | Checksum (MD5?) |
Copy link
Copy Markdown

@X-Coder X-Coder Jun 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In all file headers, this is the engine's checksum and not a file checksum. This needs to match the reading engine, otherwise the engine is unable to read the file. It doesn't matter for this project, it is only used by the engine.

| `0x19` | 2 | Trailing `\x01` marker |
| `0x20` | | Start of first field |

Blob Format
-----------

Blobs are stored in a separate `.blb` file and use a block-based structure. Each block has a header followed by content.

Block Header (18 bytes):
| Offset | Size<br>(bytes) | Description |
| ------: | ---- | ------------------------------ |
| `0x0` | 4 | Previous block index |
| `0x4` | 4 | Next block index |
| `0x8` | 2 | Length of block content |
| `0xA` | 4 | Unknown index |
Copy link
Copy Markdown

@X-Coder X-Coder Jun 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`0xA` is the row index from the main table file; it lets the engine link a blob block back to a specific row. A single `.blb` file contains the data for all BLOB- and MEMO-type columns.

| `0xE` | 4 | Total length (on first block) |

The blocks form a linked list structure where:
- Each block points to the next block using the next block index
- The chain ends when next block index is 0
- The block at index 0 is always empty; an index of 0 never refers to content
- Content is stored after the header in each block
- The total length field in the first block indicates the complete blob size
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ PyDBISAM includes a simple CLI that can be used to dump the table structure or e
# pydbisam --dump-csv path/to/file.dat
```

When exporting to CSV, BLOB fields are handled specially:
- The actual blob content is stored in a `blobs` directory
- Only the MD5 hash of the blob content is written to the CSV
- The blob files are named using their MD5 hashes


Code Usage
----------
Expand Down
11 changes: 11 additions & 0 deletions pydbisam/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import binascii
import os
import sys
from .blob import Blob


class PyDBISAM:
Expand All @@ -23,6 +26,7 @@ def __init__(self, path=None, data: bytes = None):
self._total_rows = 0
self._description = None
self._user_version = None
self._blob = None

if path:
with open(path, mode="rb") as file:
Expand All @@ -39,6 +43,13 @@ def __init__(self, path=None, data: bytes = None):
self._read_file_header()
self._read_field_subheader()

blob_path = path.replace(".dat", ".blb")
if os.path.exists(blob_path):
with open(blob_path, mode="rb") as blob_file:
self._blob = Blob(blob_file.read(), self._blob_block_size);
else:
self._blob_bytes = b""

def __enter__(self):
return self

Expand Down
73 changes: 73 additions & 0 deletions pydbisam/blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import struct
import sys
import hashlib
import os


class Blob:
    """
    Decodes blob/memo content from a DBISAM ``.blb`` companion file.

    The ``.blb`` file is a sequence of fixed-size blocks. Each block starts
    with an 18-byte header (little-endian ``<IIHII``: previous block index,
    next block index, content length, row index, total blob length) followed
    by the block content. The blocks of one blob form a linked list; only
    the first block of a chain carries a meaningful total length.

    `_data` is a memoryview over the raw ``.blb`` file contents.

    `_block_size` is the size in bytes of one block (header + content area).
    """

    _BLOCK_HEADER_SIZE = 18
    _BLOB_DIR = "blobs"

    def __init__(self, content, block_size):
        # memoryview so get_blob() can slice blocks without copying the file.
        self._data = memoryview(content)
        self._block_size = block_size

    def get_blob(self, block_index):
        """Return the full blob content (bytearray) whose chain starts at
        ``block_index``.

        Follows the next-block links, concatenating each block's content
        until the announced total length has been collected. Returns an
        empty bytearray for index 0 or when the file is corrupt/truncated.
        """
        content = bytearray()
        if block_index == 0:
            # Index 0 is never a content block; it also terminates chains.
            return content

        offset = block_index * self._block_size
        remaining_length = 0
        seen_offsets = set()  # guards against infinite loops on corrupt chains
        try:
            while offset not in seen_offsets:
                seen_offsets.add(offset)
                (
                    _prev_block,
                    next_block,
                    length_of_block,
                    _row_index,
                    total_length,
                ) = struct.unpack_from("<IIHII", self._data, offset)
                # Only the first block of a chain carries the total length;
                # ignore zero values on subsequent blocks.
                if total_length > 0:
                    remaining_length = total_length
                content_offset = offset + self._BLOCK_HEADER_SIZE
                # Extend directly from the memoryview slice; no copy needed.
                content.extend(
                    self._data[content_offset : content_offset + length_of_block]
                )

                remaining_length -= length_of_block
                if remaining_length <= 0:
                    break
                offset = next_block * self._block_size

        except struct.error as e:
            # Best effort: a truncated or corrupt file yields an empty blob.
            print("ERROR decoding block", e, "offset", offset, file=sys.stderr)
            return bytearray()

        return content

    def write_blob_to_content_hash(self, content):
        """Write ``content`` to ``blobs/<md5>`` and return the md5 hex digest.

        The file name is the content hash, so identical blobs dedupe to a
        single file on disk.
        """
        content_hash = self._hash(content)
        self._write_blob_to_file(content, os.path.join(self._BLOB_DIR, content_hash))
        return content_hash

    @staticmethod
    def _hash(content):
        # MD5 is used for content addressing only, not for security.
        return hashlib.md5(content).hexdigest()

    @staticmethod
    def _write_blob_to_file(content, path):
        # Create the target directory lazily on first write.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as file:
            file.write(content)

10 changes: 8 additions & 2 deletions pydbisam/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from pydbisam import PyDBISAM


def quote_string(s):
    """Format a single CSV cell.

    Strings are wrapped in double quotes, with any embedded double quote
    doubled (standard CSV escaping). Any other value is rendered via
    ``str()`` unquoted.
    """
    if not isinstance(s, str):
        return str(s)
    return '"{}"'.format(s.replace('"', '""'))

def main():
parser = argparse.ArgumentParser(
description="Extract data from DBISAM database tables."
Expand All @@ -25,9 +31,9 @@ def main():

with PyDBISAM(args.path) as db:
if args.dump_csv:
print(", ".join(db.fields()))
print(",".join(map(quote_string, db.fields())))
for row in db.rows():
print(", ".join(map(str, row)))
print(",".join(map(quote_string, row)))
exit()

db.dump_structure()
Expand Down
7 changes: 6 additions & 1 deletion pydbisam/extract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import struct
import sys
from ctypes import create_string_buffer

from .field import Field
Expand All @@ -25,6 +26,10 @@ def _read_file_header(self):
else:
self._description = None

# Size of the blocks in the blob file, default is 512
self._blob_block_size = struct.unpack_from("<I", self._data, 0x33)[0]
# print("blob_block_size", self._blob_block_size, file=sys.stderr)

u_major = struct.unpack_from("<H", self._data, 0xC1)[0]
u_minor = struct.unpack_from("<B", self._data, 0xC3)[0]
self._user_version = f"{u_major}.{u_minor}"
Expand Down Expand Up @@ -148,4 +153,4 @@ def row(self, index, extract_deleted=False):
# row_idx_b,
# ]

return [field.decode_from_row(row_data) for field in self._columns]
return [field.decode_from_row(row_data, self._blob) for field in self._columns]
23 changes: 16 additions & 7 deletions pydbisam/field.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import struct
import sys
from enum import Enum, unique


Expand All @@ -16,7 +17,7 @@ class FieldType(int, Enum):

STRING = (1, 0)
DATE = (2, 4)
BLOB = (3, -1)
BLOB = (3, 8)
BOOLEAN = (4, 1)
SHORT_INTEGER = (5, 2)
INTEGER = (6, 4)
Expand All @@ -25,6 +26,7 @@ class FieldType(int, Enum):
TIMESTAMP = (11, 8)

CURRENCY = (5383, 8)
MEMO = (5635, 8)
AUTOINCREMET = (7430, 4)

def __new__(cls, type_id, size):
Expand Down Expand Up @@ -95,7 +97,7 @@ def index(self):
def row_offset(self):
return self._row_offset

def decode_from_row(self, row_data):
def decode_from_row(self, row_data, blob):
field_data = row_data[self.row_offset : self.row_offset + self.size]

# Bool doesn't have a field marking that its empty.
Expand All @@ -109,12 +111,19 @@ def decode_from_row(self, row_data):
return None

if self._type is FieldType.STRING:
return (
bytearray(field_data).decode("cp1252", errors="replace").rstrip("\x00")
)
# Read zero-terminated string
bytes = bytearray(field_data)
string_len = bytes.find(b"\x00")
return bytes[:string_len].decode("cp1252", errors="replace")

if self._type is FieldType.BLOB:
# Value is likely an address within the separate blob file.
return None
(block_index, _unknown) = struct.unpack("<II", field_data)
content = blob.get_blob(block_index)
content_hash = blob.write_blob_to_content_hash(content)
return content_hash
elif self._type is FieldType.MEMO:
(block_index, _unknown) = struct.unpack("<II", field_data)
return blob.get_blob(block_index).decode("cp1252", errors="replace")
elif self._type is FieldType.DATE:
days = struct.unpack("<i", field_data)[0]
if days == 0:
Expand Down