1+ import logging
12import pandas as pd
2- from pydantic import field_validator , BaseModel
3+ from pydantic import field_validator , model_validator , BaseModel , ConfigDict , Field
34from pathlib import Path , PureWindowsPath
45from typing import List , Optional , Tuple , Union , Sequence
56
67from mdocfile .utils import find_section_entries , find_title_entries
78
9+ log = logging .getLogger ('mdocfile' )
10+
811
912class MdocGlobalData (BaseModel ):
1013 """Data model for global data in a SerialEM mdoc file.
1114
1215 https://bio3d.colorado.edu/SerialEM/hlp/html/about_formats.htm
1316 """
17+ model_config = ConfigDict (extra = 'allow' )
18+
1419 DataMode : Optional [int ] = None
1520 ImageSize : Optional [Tuple [int , int ]] = None
1621 Montage : Optional [bool ] = None
@@ -69,6 +74,10 @@ class MdocSectionData(BaseModel):
6974
7075 https://bio3d.colorado.edu/SerialEM/hlp/html/about_formats.htm
7176 """
77+ model_config = ConfigDict (extra = 'allow' , # keep extra field data
78+ validate_by_name = True ) # use our validations for aliased fields
79+ # serialize_by_alias=True) # use the version of the fieldname the file arrived as
80+
7281 # headers
7382 ZValue : Optional [int ] = None
7483 MontSection : Optional [int ] = None
@@ -111,7 +120,9 @@ class MdocSectionData(BaseModel):
111120 Union [Tuple [float , float ], Tuple [float , float , float ]]] = None
112121 SubFramePath : Optional [Union [PureWindowsPath , Path ]] = None
113122 NumSubFrames : Optional [int ] = None
114- FrameDosesAndNumbers : Optional [Sequence [Tuple [float , int ]]] = None
123+ FrameDosesAndNumbers : Optional [Sequence [Tuple [float , int ]]] = Field (
124+ default = None , validation_alias = 'FrameDosesAndNumber'
125+ )
115126 DateTime : Optional [str ] = None
116127 NavigatorLabel : Optional [str ] = None
117128 FilterSlitAndLoss : Optional [Tuple [float , float ]] = None
@@ -120,6 +131,16 @@ class MdocSectionData(BaseModel):
120131 CameraPixelSize : Optional [float ] = None
121132 Voltage : Optional [float ] = None
122133
@model_validator(mode='before')
@classmethod
def warn_on_aliases(cls, data):
    """Log a warning for each input key that matches a field's validation alias.

    Runs in 'before' mode, i.e. on the raw input. Input that is not a dict
    is passed through untouched, and ``data`` itself is never modified.

    Returns:
        The ``data`` object that was passed in.
    """
    if isinstance(data, dict):
        for field_name, field_info in cls.model_fields.items():
            alias = field_info.validation_alias
            # Only a plain string alias can be a key of the raw input dict.
            # pydantic also accepts AliasPath / AliasChoices objects as a
            # validation_alias; those are skipped here instead of being used
            # in a dict membership test.
            if isinstance(alias, str) and alias and alias in data:
                log.warning(f"'{ alias } ' mapped to '{ field_name } '")
    return data
143+
123144 @field_validator (
124145 'PieceCoordinates' ,
125146 'SuperMontCoords' ,
@@ -133,7 +154,6 @@ class MdocSectionData(BaseModel):
133154 'StageOffsets' ,
134155 'AlignedPieceCoords' ,
135156 'AlignedPieceCoordsVS' ,
136- 'FrameDosesAndNumbers' ,
137157 'FilterSlitAndLoss' ,
138158 'MultiShotHoleAndPosition' ,
139159 mode = "before" )
@@ -143,28 +163,31 @@ def multi_number_string_to_tuple(cls, value: str):
143163 value = tuple (value .split ())
144164 return value
145165
@field_validator('FrameDosesAndNumbers', mode="before")
@classmethod
def parse_frame_doses_and_numbers(cls, value: str):
    """Turn a 'dose1 num1 dose2 num2 ...' string into [(dose1, num1), ...].

    The string is split on whitespace and consumed two tokens at a time:
    the first of each pair is converted with ``float`` and the second with
    ``int``. A trailing token without a partner is ignored. Anything that
    is not a string is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    tokens = value.split()
    # even-indexed tokens are doses, odd-indexed tokens are frame counts;
    # zip stops at the shorter slice, dropping an unpaired last token
    doses, counts = tokens[0::2], tokens[1::2]
    return [(float(dose), int(count)) for dose, count in zip(doses, counts)]
174+
@classmethod
def from_lines(cls, lines: List[str]):
    """Build an instance from 'key = value' text lines.

    Surrounding whitespace and then surrounding square brackets are removed
    from each line. Lines that are empty afterwards, or that contain no '=',
    are skipped. The rest are split on the first '=' only, and both halves
    are stripped of whitespace. If a key occurs more than once, the last
    value is used.
    """
    fields = {}
    for raw in lines:
        key, sep, val = raw.strip().strip('[]').partition('=')
        if not sep:
            # no '=' on this line (this also covers empty lines)
            continue
        fields[key.strip()] = val.strip()
    return cls(**fields)
160185
@classmethod
def from_dataframe(cls, series: pd.Series):
    """Build an instance from a single pandas Series.

    Every entry of the series is passed to the constructor as a keyword
    argument, except entries labelled 'titles' or labelled with the name of
    an MdocGlobalData field.
    """
    excluded = {'titles', *MdocGlobalData.model_fields}
    kwargs = {}
    for label in series.index:
        if label in excluded:
            continue
        kwargs[label] = series[label]
    return cls(**kwargs)
168191
169192 def to_string (self ):
170193 data = self .model_dump ()
@@ -173,6 +196,8 @@ def to_string(self):
173196 for k , v in data .items ():
174197 if v is None :
175198 continue
199+ elif k == 'FrameDosesAndNumbers' and isinstance (v , list ):
200+ v = ' ' .join (f'{ d } { n } ' for d , n in v )
176201 elif isinstance (v , tuple ):
177202 v = ' ' .join (str (el ) for el in v )
178203 elif v == 'nan' :
@@ -213,19 +238,35 @@ def from_lines(cls, file_lines: List[str]) -> 'Mdoc':
213238 for start_idx , end_idx
214239 in zip (split_idxs , split_idxs [1 :])
215240 ]
241+
242+ # Warn about extra fields
243+ extra_fields = set (global_data .model_extra .keys ())
244+ for s in section_data :
245+ extra_fields .update (s .model_extra .keys ())
246+ if extra_fields :
247+ log .warning (f"Unknown fields will be preserved: { extra_fields } " )
248+
216249 return cls (titles = titles , global_data = global_data , section_data = section_data )
217250
218251 def to_dataframe (self ) -> pd .DataFrame :
219252 """
220253 Convert an Mdoc object to a pandas DataFrame
221254 """
222255 global_data = self .global_data .model_dump ()
223- section_data = {
224- k : [section .model_dump ()[k ] for section in self .section_data ]
225- for k
226- in self .section_data [0 ].model_dump ().keys ()
227- }
228- df = pd .DataFrame (data = section_data )
256+ # Include extra fields from global_data
257+ global_data .update (self .global_data .model_extra )
258+
259+ # Collect all keys from all sections (including extras)
260+ all_keys = set ()
261+ section_dicts = []
262+ for section in self .section_data :
263+ d = section .model_dump ()
264+ d .update (section .model_extra )
265+ section_dicts .append (d )
266+ all_keys .update (d .keys ())
267+
268+ # Build section_data dict with None for missing keys
269+ df = pd .DataFrame (data = dict ((k , [d .get (k ) for d in section_dicts ]) for k in all_keys ))
229270
230271 # add duplicate copies of global data and mdoc file titles to each row of
231272 # the dataframe - tidy data is easier to analyse
0 commit comments