feat: Handle extra mdoc fields and use pydantic alias for FrameDosesAndNumbers

sumslogs · sumslogs · commit 774cf29c1f23 · 2026-01-27T13:44:28.000-08:00
diff --git a/src/mdocfile/data_models.py b/src/mdocfile/data_models.py
@@ -1,16 +1,21 @@
+import logging
 import pandas as pd
-from pydantic import field_validator, BaseModel
+from pydantic import field_validator, model_validator, BaseModel, ConfigDict, Field
 from pathlib import Path, PureWindowsPath
 from typing import List, Optional, Tuple, Union, Sequence
 
 from mdocfile.utils import find_section_entries, find_title_entries
 
+log = logging.getLogger('mdocfile')
+
 
 class MdocGlobalData(BaseModel):
     """Data model for global data in a SerialEM mdoc file.
 
     https://bio3d.colorado.edu/SerialEM/hlp/html/about_formats.htm
     """
+    model_config = ConfigDict(extra='allow')
+
     DataMode: Optional[int] = None
     ImageSize: Optional[Tuple[int, int]] = None
     Montage: Optional[bool] = None
@@ -69,6 +74,10 @@ class MdocSectionData(BaseModel):
 
     https://bio3d.colorado.edu/SerialEM/hlp/html/about_formats.htm
     """
+    model_config = ConfigDict(extra='allow', # keep unknown keys instead of dropping them
+                              validate_by_name=True) # aliased fields may also be populated by their field name
+                              # NOTE(review): serialize_by_alias is not enabled; model_dump() keys are the field names
+
     # headers
     ZValue: Optional[int] = None
     MontSection: Optional[int] = None
@@ -111,7 +120,9 @@ class MdocSectionData(BaseModel):
         Union[Tuple[float, float], Tuple[float, float, float]]] = None
     SubFramePath: Optional[Union[PureWindowsPath, Path]] = None
     NumSubFrames: Optional[int] = None
-    FrameDosesAndNumbers: Optional[Sequence[Tuple[float, int]]] = None
+    FrameDosesAndNumbers: Optional[Sequence[Tuple[float, int]]] = Field(
+        default=None, validation_alias='FrameDosesAndNumber'  # the singular key is also accepted on input
+    )
     DateTime: Optional[str] = None
     NavigatorLabel: Optional[str] = None
     FilterSlitAndLoss: Optional[Tuple[float, float]] = None
@@ -120,6 +131,23 @@ class MdocSectionData(BaseModel):
     CameraPixelSize: Optional[float] = None
     Voltage: Optional[float] = None
 
+    @model_validator(mode='before')
+    @classmethod
+    def warn_on_aliases(cls, data):
+        """Log a warning for every input key that is a field's validation alias.
+
+        Runs before field validation and returns ``data`` unchanged. Input
+        that is not a dict is not inspected.
+        """
+        if isinstance(data, dict):
+            for field_name, field_info in cls.model_fields.items():
+                alias = field_info.validation_alias
+                # Only plain string aliases name an input key directly;
+                # AliasPath/AliasChoices objects are skipped.
+                if isinstance(alias, str) and alias in data:
+                    log.warning("'%s' mapped to '%s'", alias, field_name)
+        return data
+
     @field_validator(
         'PieceCoordinates',
         'SuperMontCoords',
@@ -133,7 +154,6 @@ class MdocSectionData(BaseModel):
         'StageOffsets',
         'AlignedPieceCoords',
         'AlignedPieceCoordsVS',
-        'FrameDosesAndNumbers',
         'FilterSlitAndLoss',
         'MultiShotHoleAndPosition',
         mode="before")
@@ -143,28 +163,51 @@ def multi_number_string_to_tuple(cls, value: str):
             value = tuple(value.split())
         return value
 
+    @field_validator('FrameDosesAndNumbers', mode="before")
+    @classmethod
+    def parse_frame_doses_and_numbers(cls, value: str):
+        """Parse 'dose1 num1 dose2 num2 ...' into [(dose1, num1), ...].
+
+        Non-string input is returned unchanged. An unpaired trailing token
+        is dropped with a warning.
+        """
+        if not isinstance(value, str):
+            return value
+        parts = value.split()
+        if len(parts) % 2:
+            # A lone trailing token cannot form a (dose, number) pair.
+            log.warning("FrameDosesAndNumbers has an unpaired trailing value: %r", parts[-1])
+        return [(float(d), int(n)) for d, n in zip(parts[::2], parts[1::2])]
+
     @classmethod
     def from_lines(cls, lines: List[str]):
-        lines = [line.strip('[]')
-                 for line
-                 in lines
-                 if len(line) > 0]
-        key_value_pairs = [line.split('=') for line in lines]
-        key_value_pairs = [
-            (k.strip(), v.strip())
-            for k, v
-            in key_value_pairs
-        ]
-        lines = {k: v for k, v in key_value_pairs}
-        return cls(**lines)
+        """Build a section from the raw lines of one mdoc section.
+
+        Surrounding whitespace and square brackets are stripped from each
+        line; lines without '=' (including blank ones) are skipped and the
+        rest are split on the first '=' into stripped key/value strings.
+        """
+        data = {}
+        for line in lines:
+            key, sep, value = line.strip().strip('[]').partition('=')
+            if not sep:
+                continue
+            data[key.strip()] = value.strip()
+        return cls(**data)
     
     @classmethod
     def from_dataframe(cls, series: pd.Series):
-        section = {}
-        for k in cls.model_fields.keys():
-            if k in series.index.tolist():
-                section[k] = series[k]
-        return cls(**section)
+        """Build a section from one dataframe row.
+
+        All entries are passed to the model except declared MdocGlobalData
+        fields and 'titles'.
+        """
+        # NOTE(review): global *extra* fields are not listed in
+        # MdocGlobalData.model_fields; if they arrive as columns they would
+        # be kept as section extras here - confirm against the caller.
+        skip = set(MdocGlobalData.model_fields) | {'titles'}
+        data = {k: v for k, v in series.items() if k not in skip}
+        return cls(**data)
 
     def to_string(self):
         data = self.model_dump()
@@ -173,6 +196,8 @@ def to_string(self):
         for k, v in data.items():
             if v is None:
                 continue
+            elif k == 'FrameDosesAndNumbers' and isinstance(v, (list, tuple)):
+                v = ' '.join(f'{d} {n}' for d, n in v)  # (dose, number) pairs -> 'd1 n1 d2 n2 ...'
             elif isinstance(v, tuple):
                 v = ' '.join(str(el) for el in v)
             elif v == 'nan':
@@ -213,19 +238,35 @@ def from_lines(cls, file_lines: List[str]) -> 'Mdoc':
             for start_idx, end_idx
             in zip(split_idxs, split_idxs[1:])
         ]
+
+        # Warn once, listing every unrecognised key found in the global or section data
+        extra_fields = set(global_data.model_extra)
+        for s in section_data:
+            extra_fields.update(s.model_extra)
+        if extra_fields:
+            log.warning("Unknown fields will be preserved: %s", sorted(extra_fields))
+
         return cls(titles=titles, global_data=global_data, section_data=section_data)
     
     def to_dataframe(self) -> pd.DataFrame:
         """
         Convert an Mdoc object to a pandas DataFrame
         """
         global_data = self.global_data.model_dump()
-        section_data = {
-            k: [section.model_dump()[k] for section in self.section_data]
-            for k
-            in self.section_data[0].model_dump().keys()
-        }
-        df = pd.DataFrame(data=section_data)
+        # Include extra fields from global_data
+        global_data.update(self.global_data.model_extra)
+
+        # Collect all keys from all sections (including extras), in first-seen order
+        all_keys = {}  # dict used as an ordered set so the column order is deterministic
+        section_dicts = []
+        for section in self.section_data:
+            d = section.model_dump()
+            d.update(section.model_extra)
+            section_dicts.append(d)
+            all_keys.update(dict.fromkeys(d))
+
+        # Build one column per key, with None where a section lacks that key
+        df = pd.DataFrame(data={k: [d.get(k) for d in section_dicts] for k in all_keys})
 
         # add duplicate copies of global data and mdoc file titles to each row of
         # the dataframe - tidy data is easier to analyse
diff --git a/tests/test_data_models.py b/tests/test_data_models.py
@@ -83,6 +83,7 @@ def test_to_string_is_valid_mdoc(tilt_series_mdoc_file):
     mdoc = Mdoc.from_file(tilt_series_mdoc_file)
     with NamedTemporaryFile() as tmp:
         tmp.write(mdoc.to_string().encode())
+        tmp.flush()
         mdoc2 = Mdoc.from_file(tmp.name)
     mdoc_dict = mdoc.section_data[0].model_dump()
     mdoc2_dict = mdoc2.section_data[0].model_dump()
@@ -91,4 +92,96 @@ def test_to_string_is_valid_mdoc(tilt_series_mdoc_file):
         assert k1 == k2
 
 def test_section_data_from_path():
-    section = MdocSectionData(SubFramePath=Path('bla.tif'))
+    some_path = Path('bla.tif')
+    section = MdocSectionData(SubFramePath=some_path)
+    assert section.SubFramePath == some_path
+    assert f'SubFramePath = {some_path}' in section.to_string()
+
+def test_fieldname_alias_mapping():
+    """Test that aliased field names are mapped to canonical names."""
+    lines = """[ZValue = 0]
+TiltAngle = 5.0
+FrameDosesAndNumber = 2.5 10 3.0 20
+""".split('\n')
+
+    section = MdocSectionData.from_lines(lines)
+
+    # Should be accessible via canonical name
+    assert section.FrameDosesAndNumbers is not None
+    assert section.FrameDosesAndNumbers == [(2.5, 10), (3.0, 20)]
+
+    # to_string should output the canonical name (FrameDosesAndNumbers)
+    output = section.to_string()
+    assert 'FrameDosesAndNumbers = 2.5 10 3.0 20' in output
+    # Original aliased name should not appear
+    assert 'FrameDosesAndNumber =' not in output
+
+
+def test_extra_fields_round_trip():
+    """Test that unknown fields are preserved through full Mdoc round-trip."""
+    mdoc_str = """DataMode = 1
+ImageFile = test.mrc
+
+[ZValue = 0]
+TiltAngle = 5.0
+CountsPerElectron = 42.0
+UnknownCustomField = some_value
+"""
+
+    mdoc = Mdoc.from_string(mdoc_str)
+
+    # Extra fields stored in model_extra
+    assert mdoc.section_data[0].model_extra['CountsPerElectron'] == '42.0'
+    assert mdoc.section_data[0].model_extra['UnknownCustomField'] == 'some_value'
+
+    # Round-trip preserves extra fields
+    mdoc2 = Mdoc.from_string(mdoc.to_string())
+    assert mdoc2.section_data[0].model_extra['CountsPerElectron'] == '42.0'
+    assert mdoc2.section_data[0].model_extra['UnknownCustomField'] == 'some_value'
+
+
+def test_dataframe_alias_mapping():
+    """Test that aliased field names work through dataframe round-trip."""
+    mdoc_str = """DataMode = 1
+ImageFile = test.mrc
+
+[ZValue = 0]
+TiltAngle = 5.0
+FrameDosesAndNumber = 2.5 10 3.0 20
+"""
+
+    mdoc = Mdoc.from_string(mdoc_str)
+    df = mdoc.to_dataframe()
+
+    # Dataframe should have canonical name
+    assert 'FrameDosesAndNumbers' in df.columns
+
+    # Round-trip through dataframe
+    mdoc2 = Mdoc.from_dataframe(df)
+    assert mdoc2.section_data[0].FrameDosesAndNumbers == [(2.5, 10), (3.0, 20)]
+
+
+def test_dataframe_extra_fields_round_trip():
+    """Test that extra fields survive dataframe round-trip."""
+    mdoc_str = """DataMode = 1
+ImageFile = test.mrc
+
+[ZValue = 0]
+TiltAngle = 5.0
+CountsPerElectron = 42.0
+UnknownCustomField = some_value
+"""
+
+    mdoc = Mdoc.from_string(mdoc_str)
+    df = mdoc.to_dataframe()
+
+    # Extra fields should be columns in dataframe
+    assert 'CountsPerElectron' in df.columns
+    assert 'UnknownCustomField' in df.columns
+    assert df['CountsPerElectron'].iloc[0] == '42.0'
+    assert df['UnknownCustomField'].iloc[0] == 'some_value'
+
+    # Round-trip through dataframe preserves extra fields
+    mdoc2 = Mdoc.from_dataframe(df)
+    assert mdoc2.section_data[0].model_extra['CountsPerElectron'] == '42.0'
+    assert mdoc2.section_data[0].model_extra['UnknownCustomField'] == 'some_value'
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -22,25 +22,30 @@ def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string):
 def test_read_montage_section_mdoc(montage_section_mdoc_file):
     df = read(montage_section_mdoc_file)
     assert isinstance(df, pd.DataFrame)
-    assert df.shape == (63, 37)
+    assert df.shape[0] == 63  # row count
+    assert df.shape[1] >= 37  # at least this many columns (extra fields preserved)
+    assert 'TiltAngle' in df.columns
 
 
 def test_read_montage_section_multiple_mdoc(montage_section_multiple_mdoc_file):
     df = read(montage_section_multiple_mdoc_file)
     assert isinstance(df, pd.DataFrame)
-    assert df.shape == (100, 36)
+    assert df.shape[0] == 100  # row count
+    assert df.shape[1] >= 36  # at least this many columns (extra fields preserved)
 
 
 def test_read_frame_set_single_mdoc(frame_set_single_mdoc_file):
     df = read(frame_set_single_mdoc_file)
     assert isinstance(df, pd.DataFrame)
-    assert df.shape == (1, 26)
+    assert df.shape[0] == 1  # row count
+    assert df.shape[1] >= 26  # at least this many columns (extra fields preserved)
 
 
 def test_read_frame_set_multiple_mdoc(frame_set_multiple_mdoc_file):
     df = read(frame_set_multiple_mdoc_file)
     assert isinstance(df, pd.DataFrame)
-    assert df.shape == (21, 28)
+    assert df.shape[0] == 21  # row count
+    assert df.shape[1] >= 28  # at least this many columns (extra fields preserved)
 
 
 def test_write_tilt_series_mdoc(tilt_series_mdoc_file):