Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,15 @@ Datatypes
| -- | --------- | --- |----------------------- |
| 1 | String | Variable | Size defined in column definition at `0xA6` |
| 2 | Date | 4 | Days -1 since [AD 1, Jan 0](https://en.wikipedia.org/wiki/List_of_non-standard_dates#January_0) |
| 3 | BLOB | ? | Not yet supported. BLOBs are stored in a separate `.blb` file. The data in the `.dat` file is likely an address for the `.blb` file. |
| 3 | BLOB | 8 | Block index in the `.blb` file. The actual content is stored in blocks with 18-byte headers. |
| 4 | Boolean | 1 | Missing the trailing `\x01` marker |
| 5 | Short Int | 2 | |
| 6 | Int | 4 | |
| 7 | Double | 8 | IEEE-754 |
| 11 | Timestamp | 8 | IEEE-754, milliseconds since [AD 1, Jan 0](https://en.wikipedia.org/wiki/List_of_non-standard_dates#January_0) |
| 5383 | Currency | 8 | IEEE-754 |
| 7430 | Autoincrement | 4 | Int |
| 7431 | MEMO | 8 | Similar to BLOB, stores text content in the `.blb` file |


Row Definition
Expand All @@ -66,3 +67,24 @@ A row in the actual data section of the database has a 26-byte row header. The m
| `0x9` | 16 | Checksum (MD5?) |
Copy link
Copy Markdown

@X-Coder X-Coder Jun 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In all file headers, this is the engine's checksum and not a file checksum. This needs to match the reading engine, otherwise the engine is unable to read the file. It doesn't matter for this project, it is only used by the engine.

| `0x19` | 2 | Trailing `\x01` marker |
| `0x20` | | Start of first field |

Blob Format
-----------

Blobs are stored in a separate `.blb` file and use a block-based structure. Each block has a header followed by content.

Block Header (18 bytes):
| Offset | Size<br>(bytes) | Description |
| ------: | ---- | ------------------------------ |
| `0x0` | 4 | Previous block index |
| `0x4` | 4 | Next block index |
| `0x8` | 2 | Length of block content |
| `0xA` | 4 | Unknown index |
Copy link
Copy Markdown

@X-Coder X-Coder Jun 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`0xA` is the row index from the main table file; it lets the engine link a blob block back to a specific row. A single `.blb` file contains the data for all BLOB- and MEMO-type columns.

| `0xE` | 4 | Total length (on first block) |

The blocks form a linked list structure where:
- Each block points to the next block using the next block index
- The chain ends when next block index is 0
- The block at index 0 is always empty; an index of 0 never refers to content
- Content is stored after the header in each block
- The total length field in the first block indicates the complete blob size
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ PyDBISAM includes a simple CLI that can be used to dump the table structure or e
# pydbisam --dump-csv path/to/file.dat
```

When exporting to CSV, BLOB fields are handled specially:
- The actual blob content is stored in a `blobs` directory
- Only the MD5 hash of the blob content is written to the CSV
- The blob files are named using their MD5 hashes


Code Usage
----------
Expand Down
11 changes: 11 additions & 0 deletions pydbisam/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import binascii
import os
import sys
from .blob import Blob


class PyDBISAM:
Expand All @@ -23,6 +26,7 @@ def __init__(self, path=None, data: bytes = None):
self._total_rows = 0
self._description = None
self._user_version = None
self._blob = None

if path:
with open(path, mode="rb") as file:
Expand All @@ -39,6 +43,13 @@ def __init__(self, path=None, data: bytes = None):
self._read_file_header()
self._read_field_subheader()

blob_path = path.replace(".dat", ".blb")
if os.path.exists(blob_path):
with open(blob_path, mode="rb") as blob_file:
self._blob = Blob(blob_file.read(), self._blob_block_size);
else:
self._blob_bytes = b""

def __enter__(self):
return self

Expand Down
73 changes: 73 additions & 0 deletions pydbisam/blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import struct
import sys
import hashlib
import os


class Blob:
    """
    Decodes blob/memo content from a DBISAM ``.blb`` companion file.

    The ``.blb`` file is a sequence of fixed-size blocks. Each block starts
    with an 18-byte header (little-endian ``<IIHII``: previous block index,
    next block index, content length, row index, total blob length) followed
    by the block content. The blocks of one blob form a linked list; only
    the first block of a chain carries a meaningful total length.

    `_data` is a memoryview over the raw ``.blb`` file contents.

    `_block_size` is the size in bytes of one block (header + content area).
    """

    _BLOCK_HEADER_SIZE = 18
    _BLOB_DIR = "blobs"

    def __init__(self, content, block_size):
        # memoryview so get_blob() can slice blocks without copying the file.
        self._data = memoryview(content)
        self._block_size = block_size

    def get_blob(self, block_index):
        """Return the full blob content (bytearray) whose chain starts at
        ``block_index``.

        Follows the next-block links, concatenating each block's content
        until the announced total length has been collected. Returns an
        empty bytearray for index 0 or when the file is corrupt/truncated.
        """
        content = bytearray()
        if block_index == 0:
            # Index 0 is never a content block; it also terminates chains.
            return content

        offset = block_index * self._block_size
        remaining_length = 0
        seen_offsets = set()  # guards against infinite loops on corrupt chains
        try:
            while offset not in seen_offsets:
                seen_offsets.add(offset)
                (
                    _prev_block,
                    next_block,
                    length_of_block,
                    _row_index,
                    total_length,
                ) = struct.unpack_from("<IIHII", self._data, offset)
                # Only the first block of a chain carries the total length;
                # ignore zero values on subsequent blocks.
                if total_length > 0:
                    remaining_length = total_length
                content_offset = offset + self._BLOCK_HEADER_SIZE
                # Extend directly from the memoryview slice; no copy needed.
                content.extend(
                    self._data[content_offset : content_offset + length_of_block]
                )

                remaining_length -= length_of_block
                if remaining_length <= 0:
                    break
                offset = next_block * self._block_size

        except struct.error as e:
            # Best effort: a truncated or corrupt file yields an empty blob.
            print("ERROR decoding block", e, "offset", offset, file=sys.stderr)
            return bytearray()

        return content

    def write_blob_to_content_hash(self, content):
        """Write ``content`` to ``blobs/<md5>`` and return the md5 hex digest.

        The file name is the content hash, so identical blobs dedupe to a
        single file on disk.
        """
        content_hash = self._hash(content)
        self._write_blob_to_file(content, os.path.join(self._BLOB_DIR, content_hash))
        return content_hash

    @staticmethod
    def _hash(content):
        # MD5 is used for content addressing only, not for security.
        return hashlib.md5(content).hexdigest()

    @staticmethod
    def _write_blob_to_file(content, path):
        # Create the target directory lazily on first write.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as file:
            file.write(content)

10 changes: 8 additions & 2 deletions pydbisam/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from pydbisam import PyDBISAM


def quote_string(s):
    """Format a single CSV cell.

    Strings are wrapped in double quotes, with any embedded double quote
    doubled (standard CSV escaping). Any other value is rendered via
    ``str()`` unquoted.
    """
    if not isinstance(s, str):
        return str(s)
    return '"{}"'.format(s.replace('"', '""'))

def main():
parser = argparse.ArgumentParser(
description="Extract data from DBISAM database tables."
Expand All @@ -25,9 +31,9 @@ def main():

with PyDBISAM(args.path) as db:
if args.dump_csv:
print(", ".join(db.fields()))
print(",".join(map(quote_string, db.fields())))
for row in db.rows():
print(", ".join(map(str, row)))
print(",".join(map(quote_string, row)))
exit()

db.dump_structure()
Expand Down
7 changes: 6 additions & 1 deletion pydbisam/extract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import struct
import sys
from ctypes import create_string_buffer

from .field import Field
Expand All @@ -25,6 +26,10 @@ def _read_file_header(self):
else:
self._description = None

# Size of the blocks in the blob file, default is 512
self._blob_block_size = struct.unpack_from("<I", self._data, 0x33)[0]
# print("blob_block_size", self._blob_block_size, file=sys.stderr)

u_major = struct.unpack_from("<H", self._data, 0xC1)[0]
u_minor = struct.unpack_from("<B", self._data, 0xC3)[0]
self._user_version = f"{u_major}.{u_minor}"
Expand Down Expand Up @@ -148,4 +153,4 @@ def row(self, index, extract_deleted=False):
# row_idx_b,
# ]

return [field.decode_from_row(row_data) for field in self._columns]
return [field.decode_from_row(row_data, self._blob) for field in self._columns]
23 changes: 16 additions & 7 deletions pydbisam/field.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import struct
import sys
from enum import Enum, unique


Expand All @@ -16,7 +17,7 @@ class FieldType(int, Enum):

STRING = (1, 0)
DATE = (2, 4)
BLOB = (3, -1)
BLOB = (3, 8)
BOOLEAN = (4, 1)
SHORT_INTEGER = (5, 2)
INTEGER = (6, 4)
Expand All @@ -25,6 +26,7 @@ class FieldType(int, Enum):
TIMESTAMP = (11, 8)

CURRENCY = (5383, 8)
MEMO = (5635, 8)
AUTOINCREMET = (7430, 4)

def __new__(cls, type_id, size):
Expand Down Expand Up @@ -95,7 +97,7 @@ def index(self):
def row_offset(self):
return self._row_offset

def decode_from_row(self, row_data):
def decode_from_row(self, row_data, blob):
field_data = row_data[self.row_offset : self.row_offset + self.size]

# Bool doesn't have a field marking that its empty.
Expand All @@ -109,12 +111,19 @@ def decode_from_row(self, row_data):
return None

if self._type is FieldType.STRING:
return (
bytearray(field_data).decode("cp1252", errors="replace").rstrip("\x00")
)
# Read zero-terminated string
bytes = bytearray(field_data)
string_len = bytes.find(b"\x00")
return bytes[:string_len].decode("cp1252", errors="replace")

if self._type is FieldType.BLOB:
# Value is likely an address within the separate blob file.
return None
(block_index, _unknown) = struct.unpack("<II", field_data)
content = blob.get_blob(block_index)
content_hash = blob.write_blob_to_content_hash(content)
return content_hash
elif self._type is FieldType.MEMO:
(block_index, _unknown) = struct.unpack("<II", field_data)
return blob.get_blob(block_index).decode("cp1252", errors="replace")
elif self._type is FieldType.DATE:
days = struct.unpack("<i", field_data)[0]
if days == 0:
Expand Down