feat(serialization): introduce dedicated serialization module#118
feat(serialization): introduce dedicated serialization module#118
Conversation
Separates serialization logic from API clients (OpenAI, GoogleGenAI) and makes it more customizable per model or provider. - Introduced a unified serialization pipeline through the BaseSerializer and its implementations: OpenAICompletionSerializer, ModelProxyOpenAISerializer, and GenAiSerializer. - Removed inline message serialization loops (like _get_raw_messages) from LLM classes, delegating it directly to the corresponding serializer instances.
351b090 to
51feb95
Compare
dolaameng
left a comment
There was a problem hiding this comment.
Thanks for the improvement! Left some comments regarding differences from regression tests.
There was a problem hiding this comment.
nit: Unfortunately I have a "serialization" module that does different things: src/kaggle_benchmarks/kaggle/serialization.py.
Shall we rename one or the other to make it less confusing? It's probably easier to just rename "src/kaggle_benchmarks/kaggle/serialization.py".
There was a problem hiding this comment.
I'm fine with keeping this as is for now. We should discuss a better overall structure, as the current one causes frequent cyclic import issues. One idea:
src/kb/
- openai/
- models.py / or client.py
- serializers.py
- google/
...
- kaggle
- model_proxy
| yield { | ||
| "role": self.get_role(message.sender), | ||
| "content": [ | ||
| {"type": "video_url", "video_url": {"url": video.url}}, |
There was a problem hiding this comment.
All test_video_url golden tests failed. I don't think "video_url" will work for MP. It should still be "content": [{"type": "image_url", "image_url": {"url": video.url}}] .
@mohami2000 can you confirm?
There was a problem hiding this comment.
You are right, as per VideoContent.get_payload
My key didn't let me test it.
| yield { | ||
| "role": self.get_role(message.sender), | ||
| "content": [ | ||
| {"type": "text", "text": video.url}, |
There was a problem hiding this comment.
I am not familiar with this, can you confirm "text" is the right type here?
| }, | ||
| ], | ||
| } | ||
| ] |
There was a problem hiding this comment.
Some regression tests to show the difference; not all are needed, but we should fix the failing ones.
| ] | |
| ] | |
| # --------------------------------------------------------------------------- | |
| # Regression tests for old-vs-new serialization differences | |
| # --------------------------------------------------------------------------- | |
| import dataclasses | |
| import pydantic | |
| class _SamplePydanticModel(pydantic.BaseModel): | |
| name: str | |
| value: int | |
| @dataclasses.dataclass | |
| class _SampleDataclass: | |
| name: str | |
| value: int | |
| class TestModelProxyImageWithCaption: | |
| """Images with captions should include both the caption text and image data.""" | |
| def test_image_without_caption_includes_image_data(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| image = ImageBase64(b64_string="abc123", mime_type="image/png") | |
| msg = messages.Message(content=image, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| content = result[0]["content"] | |
| types_in_content = [item["type"] for item in content] | |
| assert "image_url" in types_in_content | |
| # FIXME: Operator precedence bug in dump_image — the `else [] + [image_part]` | |
| # binds as `else ([] + [image_part])`, so when a caption IS present the image | |
| # data is silently dropped and only the caption text is emitted. | |
| def test_image_with_caption_includes_both_caption_and_image_data(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| image = ImageBase64(b64_string="abc123", mime_type="image/png", caption="A cat") | |
| msg = messages.Message(content=image, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| content = result[0]["content"] | |
| types_in_content = [item["type"] for item in content] | |
| # Both caption text and image data must be present | |
| assert "text" in types_in_content | |
| assert "image_url" in types_in_content | |
| class TestModelProxyVideoSerialization: | |
| """Videos should use image_url as a generic file URL carrier for Model Proxy. | |
| Model Proxy doesn't support the video_url content type and rejects it | |
| with a 400 error. The old code used image_url as a carrier, which worked. | |
| """ | |
| # FIXME: dump_video uses {"type": "video_url", ...} but Model Proxy only | |
| # accepts {"type": "image_url", ...} as a generic file URL carrier. | |
| # The old code used VideoContent.get_payload() which produced image_url. | |
| def test_video_uses_image_url_carrier(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| video = videos.VideoURL(url="https://youtube.com/watch?v=dummy") | |
| msg = messages.Message(content=video, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| content = result[0]["content"] | |
| assert content == [ | |
| { | |
| "type": "image_url", | |
| "image_url": {"url": "https://youtube.com/watch?v=dummy"}, | |
| } | |
| ] | |
| class TestModelProxyDictContent: | |
| """Dict content should be serialized as a JSON string.""" | |
| # FIXME: dump_json_message calls message.copy() which doesn't exist on | |
| # the Message dataclass. Should use dataclasses.replace() or similar. | |
| def test_dict_content_serialized_as_json(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| msg = messages.Message(content={"key": "value"}, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| assert result == [{"role": "user", "content": '{"key": "value"}'}] | |
| class TestModelProxyPydanticContent: | |
| """Pydantic model content should be serialized as a JSON string.""" | |
| # FIXME: Pydantic models go through base.dump_message → copy.copy(msg) + | |
| # model_dump() → dump_json_message → message.copy() crash (same root cause | |
| # as dict content above). | |
| def test_pydantic_model_serialized_as_json(self): | |
| import json | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| model = _SamplePydanticModel(name="test", value=42) | |
| msg = messages.Message(content=model, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| assert result[0]["role"] == "user" | |
| # Content should be a valid JSON string matching the model | |
| parsed = json.loads(result[0]["content"]) | |
| assert parsed == {"name": "test", "value": 42} | |
| class TestModelProxyDataclassContent: | |
| """Dataclass content should be serialized as a JSON string, not Python repr.""" | |
| # FIXME: Dataclass content is not matched by any isinstance check in | |
| # dump_message, so it falls through to _dump_message which uses str(content) | |
| # producing Python repr instead of JSON. | |
| def test_dataclass_serialized_as_json(self): | |
| import json | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| dc = _SampleDataclass(name="test", value=42) | |
| msg = messages.Message(content=dc, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| assert result[0]["role"] == "user" | |
| # Content should be a valid JSON string, not Python repr | |
| parsed = json.loads(result[0]["content"]) | |
| assert parsed == {"name": "test", "value": 42} | |
| class TestModelProxyToolInvocationResult: | |
| """Standalone ToolInvocationResult should be serialized as text content. | |
| Model Proxy doesn't support native tool calls, so the result should be | |
| rendered as a human-readable text message rather than silently dropped. | |
| """ | |
| # FIXME: ModelProxyOpenAISerializer._dump_invocation is a noop (`if False: yield`), | |
| # so standalone ToolInvocationResult messages are silently dropped. | |
| def test_standalone_tool_result_serialized_as_text(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer(roles_mapping={}) | |
| tool_result = ToolInvocationResult( | |
| name="calc", arguments={"a": 1}, call_id="c1", output="42" | |
| ) | |
| msg = messages.Message(content=tool_result, sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| # Should produce at least one message, not silently drop the content | |
| assert len(result) > 0 | |
| assert result[0]["role"] == "user" | |
| class TestModelProxyRoleMapping: | |
| """Verifies that the production role mapping {"tool": "system"} works correctly.""" | |
| def test_tool_role_maps_to_system(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer( | |
| roles_mapping={"tool": "system"} | |
| ) | |
| tool_actor = actors.Actor(name="tool_actor", role="tool") | |
| msg = messages.Message(content="tool output", sender=tool_actor) | |
| result = list(serializer.dump_message(msg)) | |
| assert result[0]["role"] == "system" | |
| def test_user_role_passes_through(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer( | |
| roles_mapping={"tool": "system"} | |
| ) | |
| msg = messages.Message(content="hello", sender=actors.user) | |
| result = list(serializer.dump_message(msg)) | |
| assert result[0]["role"] == "user" | |
| def test_assistant_role_passes_through(self): | |
| serializer = openai_serializer.ModelProxyOpenAISerializer( | |
| roles_mapping={"tool": "system"} | |
| ) | |
| assistant = actors.Actor(name="assistant", role="assistant") | |
| msg = messages.Message(content="response", sender=assistant) | |
| result = list(serializer.dump_message(msg)) | |
| assert result[0]["role"] == "assistant" | |
| def dump_json_message(self, message: msg.Message[dict]): | ||
| """Serializes a JSON dictionary message by stringifying it as text by default.""" | ||
| yield from self.dump_text_message( | ||
| message.copy(new_content=json.dumps(message.content)) |
There was a problem hiding this comment.
Leftover from an intermediate state. I opted not to implement Message.copy.
| ], | ||
| } | ||
|
|
||
| def _dump_invocation(self, tool): |
There was a problem hiding this comment.
@s-alexey I think we can just remove this because MP should support tool calling. wdyt?
dolaameng
left a comment
There was a problem hiding this comment.
LGTM. Thanks for the improvement!
|
Golden test and new unit tests passed. Commit it now. |
Separates serialization logic from API clients (OpenAI, GoogleGenAI) and makes it more customizable per model or provider.