Skip to content

Commit 86c4c6f

Browse files
authored
Migrate pickled data & change XCom value type to JSON (#44166)
follow-up of #43905 Changes: - Changed `XCom.value` column to JSON for all dbs. - Archived pickled XCom data to `_xcom_archive` and removed it from the `xcom` table. - Removed encoded string in XCom serialization and deserialization logic. - Updated logic for `XComObjectStorageBackend` to make it compatible for AF 2 & 3
1 parent 39042c8 commit 86c4c6f

File tree

11 files changed

+222
-20
lines changed

11 files changed

+222
-20
lines changed
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing,
13+
# software distributed under the License is distributed on an
14+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
# KIND, either express or implied. See the License for the
16+
# specific language governing permissions and limitations
17+
# under the License.
18+
19+
"""
20+
Remove pickled data from xcom table.
21+
22+
Revision ID: eed27faa34e3
23+
Revises: 9fc3fc5de720
24+
Create Date: 2024-11-18 18:41:50.849514
25+
26+
"""
27+
28+
from __future__ import annotations
29+
30+
import sqlalchemy as sa
31+
from alembic import op
32+
from sqlalchemy import text
33+
from sqlalchemy.dialects.mysql import LONGBLOB
34+
35+
from airflow.migrations.db_types import TIMESTAMP, StringID
36+
37+
revision = "eed27faa34e3"
38+
down_revision = "9fc3fc5de720"
39+
branch_labels = None
40+
depends_on = None
41+
airflow_version = "3.0.0"
42+
43+
44+
def upgrade():
45+
"""Apply Remove pickled data from xcom table."""
46+
# Summary of the change:
47+
# 1. Create an archived table (`_xcom_archive`) to store the current "pickled" data in the xcom table
48+
# 2. Extract and archive the pickled data using the condition
49+
# 3. Delete the pickled data from the xcom table so that we can update the column type
50+
# 4. Update the XCom.value column type to JSON from LargeBinary/LongBlob
51+
52+
conn = op.get_bind()
53+
dialect = conn.dialect.name
54+
55+
# Create an archived table to store the current data
56+
op.create_table(
57+
"_xcom_archive",
58+
sa.Column("dag_run_id", sa.Integer(), nullable=False, primary_key=True),
59+
sa.Column("task_id", StringID(length=250), nullable=False, primary_key=True),
60+
sa.Column("map_index", sa.Integer(), nullable=False, server_default=sa.text("-1"), primary_key=True),
61+
sa.Column("key", StringID(length=512), nullable=False, primary_key=True),
62+
sa.Column("dag_id", StringID(length=250), nullable=False),
63+
sa.Column("run_id", StringID(length=250), nullable=False),
64+
sa.Column("value", sa.LargeBinary().with_variant(LONGBLOB, "mysql"), nullable=True),
65+
sa.Column("timestamp", TIMESTAMP(), nullable=False),
66+
sa.PrimaryKeyConstraint("dag_run_id", "task_id", "map_index", "key"),
67+
if_not_exists=True,
68+
)
69+
70+
# Condition to detect pickled data for different databases
71+
condition_templates = {
72+
"postgresql": "get_byte(value, 0) = 128",
73+
"mysql": "HEX(SUBSTRING(value, 1, 1)) = '80'",
74+
"sqlite": "substr(value, 1, 1) = char(128)",
75+
}
76+
77+
condition = condition_templates.get(dialect)
78+
if not condition:
79+
raise RuntimeError(f"Unsupported dialect: {dialect}")
80+
81+
# Key is a reserved keyword in MySQL, so we need to quote it
82+
quoted_key = conn.dialect.identifier_preparer.quote("key")
83+
84+
# Archive pickled data using the condition
85+
conn.execute(
86+
text(
87+
f"""
88+
INSERT INTO _xcom_archive (dag_run_id, task_id, map_index, {quoted_key}, dag_id, run_id, value, timestamp)
89+
SELECT dag_run_id, task_id, map_index, {quoted_key}, dag_id, run_id, value, timestamp
90+
FROM xcom
91+
WHERE value IS NOT NULL AND {condition}
92+
"""
93+
)
94+
)
95+
96+
# Delete the pickled data from the xcom table so that we can update the column type
97+
conn.execute(text(f"DELETE FROM xcom WHERE value IS NOT NULL AND {condition}"))
98+
99+
# Update the value column type to JSON
100+
if dialect == "postgresql":
101+
op.execute(
102+
"""
103+
ALTER TABLE xcom
104+
ALTER COLUMN value TYPE JSONB
105+
USING CASE
106+
WHEN value IS NOT NULL THEN CAST(CONVERT_FROM(value, 'UTF8') AS JSONB)
107+
ELSE NULL
108+
END
109+
"""
110+
)
111+
elif dialect == "mysql":
112+
op.add_column("xcom", sa.Column("value_json", sa.JSON(), nullable=True))
113+
op.execute("UPDATE xcom SET value_json = CAST(value AS CHAR CHARACTER SET utf8mb4)")
114+
op.drop_column("xcom", "value")
115+
op.alter_column("xcom", "value_json", existing_type=sa.JSON(), new_column_name="value")
116+
elif dialect == "sqlite":
117+
# Rename the existing `value` column to `value_old`
118+
with op.batch_alter_table("xcom", schema=None) as batch_op:
119+
batch_op.alter_column("value", new_column_name="value_old")
120+
121+
# Add the new `value` column with JSON type
122+
with op.batch_alter_table("xcom", schema=None) as batch_op:
123+
batch_op.add_column(sa.Column("value", sa.JSON(), nullable=True))
124+
125+
# Migrate data from `value_old` to `value`
126+
conn.execute(
127+
text(
128+
"""
129+
UPDATE xcom
130+
SET value = json(CAST(value_old AS TEXT))
131+
WHERE value_old IS NOT NULL
132+
"""
133+
)
134+
)
135+
136+
# Drop the old `value_old` column
137+
with op.batch_alter_table("xcom", schema=None) as batch_op:
138+
batch_op.drop_column("value_old")
139+
140+
141+
def downgrade():
142+
"""Unapply Remove pickled data from xcom table."""
143+
conn = op.get_bind()
144+
dialect = conn.dialect.name
145+
146+
# Revert the value column back to LargeBinary
147+
if dialect == "postgresql":
148+
op.execute(
149+
"""
150+
ALTER TABLE xcom
151+
ALTER COLUMN value TYPE BYTEA
152+
USING CASE
153+
WHEN value IS NOT NULL THEN CONVERT_TO(value::TEXT, 'UTF8')
154+
ELSE NULL
155+
END
156+
"""
157+
)
158+
elif dialect == "mysql":
159+
op.add_column("xcom", sa.Column("value_blob", LONGBLOB, nullable=True))
160+
op.execute("UPDATE xcom SET value_blob = CAST(value AS BINARY);")
161+
op.drop_column("xcom", "value")
162+
op.alter_column("xcom", "value_blob", existing_type=LONGBLOB, new_column_name="value")
163+
164+
elif dialect == "sqlite":
165+
with op.batch_alter_table("xcom", schema=None) as batch_op:
166+
batch_op.alter_column("value", new_column_name="value_old")
167+
168+
with op.batch_alter_table("xcom", schema=None) as batch_op:
169+
batch_op.add_column(sa.Column("value", sa.LargeBinary, nullable=True))
170+
171+
conn.execute(
172+
text(
173+
"""
174+
UPDATE xcom
175+
SET value = CAST(value_old AS BLOB)
176+
WHERE value_old IS NOT NULL
177+
"""
178+
)
179+
)
180+
181+
with op.batch_alter_table("xcom", schema=None) as batch_op:
182+
batch_op.drop_column("value_old")

airflow/models/xcom.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,17 @@
2323
from typing import TYPE_CHECKING, Any, Iterable, cast
2424

2525
from sqlalchemy import (
26+
JSON,
2627
Column,
2728
ForeignKeyConstraint,
2829
Index,
2930
Integer,
30-
LargeBinary,
3131
PrimaryKeyConstraint,
3232
String,
3333
delete,
3434
select,
3535
text,
3636
)
37-
from sqlalchemy.dialects.mysql import LONGBLOB
3837
from sqlalchemy.ext.associationproxy import association_proxy
3938
from sqlalchemy.orm import Query, reconstructor, relationship
4039

@@ -80,7 +79,7 @@ class BaseXCom(TaskInstanceDependencies, LoggingMixin):
8079
dag_id = Column(String(ID_LEN, **COLLATION_ARGS), nullable=False)
8180
run_id = Column(String(ID_LEN, **COLLATION_ARGS), nullable=False)
8281

83-
value = Column(LargeBinary().with_variant(LONGBLOB, "mysql"))
82+
value = Column(JSON)
8483
timestamp = Column(UtcDateTime, default=timezone.utcnow, nullable=False)
8584

8685
__table_args__ = (
@@ -453,9 +452,12 @@ def serialize_value(
453452
dag_id: str | None = None,
454453
run_id: str | None = None,
455454
map_index: int | None = None,
456-
) -> Any:
455+
) -> str:
457456
"""Serialize XCom value to JSON str."""
458-
return json.dumps(value, cls=XComEncoder).encode("UTF-8")
457+
try:
458+
return json.dumps(value, cls=XComEncoder)
459+
except (ValueError, TypeError):
460+
raise ValueError("XCom value must be JSON serializable")
459461

460462
@staticmethod
461463
def _deserialize_value(result: XCom, orm: bool) -> Any:
@@ -466,7 +468,7 @@ def _deserialize_value(result: XCom, orm: bool) -> Any:
466468
if result.value is None:
467469
return None
468470

469-
return json.loads(result.value.decode("UTF-8"), cls=XComDecoder, object_hook=object_hook)
471+
return json.loads(result.value, cls=XComDecoder, object_hook=object_hook)
470472

471473
@staticmethod
472474
def deserialize_value(result: XCom) -> Any:

airflow/utils/db.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class MappedClassProtocol(Protocol):
9797
"2.9.2": "686269002441",
9898
"2.10.0": "22ed7efa9da2",
9999
"2.10.3": "5f2621c13b39",
100-
"3.0.0": "9fc3fc5de720",
100+
"3.0.0": "eed27faa34e3",
101101
}
102102

103103

airflow/www/views.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3861,7 +3861,7 @@ class XComModelView(AirflowModelView):
38613861
permissions.ACTION_CAN_ACCESS_MENU,
38623862
]
38633863

3864-
search_columns = ["key", "value", "timestamp", "dag_id", "task_id", "run_id", "logical_date"]
3864+
search_columns = ["key", "timestamp", "dag_id", "task_id", "run_id", "logical_date"]
38653865
list_columns = ["key", "value", "timestamp", "dag_id", "task_id", "run_id", "map_index", "logical_date"]
38663866
base_order = ("dag_run_id", "desc")
38673867

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
028d2fec22a15bbf5794e2fc7522eaf880a8b6293ead484780ef1a14e6cd9b48
1+
7748eec981f977cc97b852d1fe982aebe24ec2d090ae8493a65cea101f9d42a5

docs/apache-airflow/img/airflow_erd.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/apache-airflow/migrations-ref.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ Here's the list of all the Database Migrations that are executed via when you ru
3939
+-------------------------+------------------+-------------------+--------------------------------------------------------------+
4040
| Revision ID | Revises ID | Airflow Version | Description |
4141
+=========================+==================+===================+==============================================================+
42-
| ``9fc3fc5de720`` (head) | ``2b47dc6bc8df`` | ``3.0.0`` | Add references between assets and triggers. |
42+
| ``eed27faa34e3`` (head) | ``9fc3fc5de720`` | ``3.0.0`` | Remove pickled data from xcom table. |
43+
+-------------------------+------------------+-------------------+--------------------------------------------------------------+
44+
| ``9fc3fc5de720`` | ``2b47dc6bc8df`` | ``3.0.0`` | Add references between assets and triggers. |
4345
+-------------------------+------------------+-------------------+--------------------------------------------------------------+
4446
| ``2b47dc6bc8df`` | ``d03e4a635aa3`` | ``3.0.0`` | add dag versioning. |
4547
+-------------------------+------------------+-------------------+--------------------------------------------------------------+

newsfragments/aip-72.significant.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,7 @@ As part of this change the following breaking changes have occurred:
2727

2828
If you still need to use pickling, you can use a custom XCom backend that stores references in the metadata DB and
2929
the pickled data can be stored in a separate storage like S3.
30+
31+
The ``value`` field in the XCom table has been changed to a ``JSON`` type via DB migration. The XCom records that
32+
contains pickled data are archived in the ``_xcom_archive`` table. You can safely drop this table if you don't need
33+
the data anymore.

providers/src/airflow/providers/common/io/xcom/backend.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
from urllib.parse import urlsplit
2626

2727
import fsspec.utils
28+
from packaging.version import Version
2829

30+
from airflow import __version__ as airflow_version
2931
from airflow.configuration import conf
3032
from airflow.io.path import ObjectStoragePath
3133
from airflow.models.xcom import BaseXCom
@@ -41,6 +43,10 @@
4143
SECTION = "common.io"
4244

4345

46+
AIRFLOW_VERSION = Version(airflow_version)
47+
AIRFLOW_V_3_0_PLUS = Version(AIRFLOW_VERSION.base_version) >= Version("3.0.0")
48+
49+
4450
def _get_compression_suffix(compression: str) -> str:
4551
"""
4652
Return the compression suffix for the given compression.
@@ -103,7 +109,7 @@ def _get_full_path(data: str) -> ObjectStoragePath:
103109
raise ValueError(f"Not a valid url: {data}")
104110

105111
@staticmethod
106-
def serialize_value(
112+
def serialize_value( # type: ignore[override]
107113
value: T,
108114
*,
109115
key: str | None = None,
@@ -114,16 +120,22 @@ def serialize_value(
114120
) -> bytes | str:
115121
# we will always serialize ourselves and not by BaseXCom as the deserialize method
116122
# from BaseXCom accepts only XCom objects and not the value directly
117-
s_val = json.dumps(value, cls=XComEncoder).encode("utf-8")
123+
s_val = json.dumps(value, cls=XComEncoder)
124+
s_val_encoded = s_val.encode("utf-8")
118125

119126
if compression := _get_compression():
120127
suffix = f".{_get_compression_suffix(compression)}"
121128
else:
122129
suffix = ""
123130

124131
threshold = _get_threshold()
125-
if threshold < 0 or len(s_val) < threshold: # Either no threshold or value is small enough.
126-
return s_val
132+
if threshold < 0 or len(s_val_encoded) < threshold: # Either no threshold or value is small enough.
133+
if AIRFLOW_V_3_0_PLUS:
134+
return s_val
135+
else:
136+
# TODO: Remove this branch once we drop support for Airflow 2
137+
# This is for Airflow 2.10 where the value is expected to be bytes
138+
return s_val_encoded
127139

128140
base_path = _get_base_path()
129141
while True: # Safeguard against collisions.
@@ -138,7 +150,7 @@ def serialize_value(
138150
p.parent.mkdir(parents=True, exist_ok=True)
139151

140152
with p.open(mode="wb", compression=compression) as f:
141-
f.write(s_val)
153+
f.write(s_val_encoded)
142154
return BaseXCom.serialize_value(str(p))
143155

144156
@staticmethod

tests/api_connexion/endpoints/test_xcom_endpoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,7 @@ def test_handle_limit_offset(self, query_params, expected_xcom_ids):
640640
xcom = XCom(
641641
dag_run_id=dagrun.id,
642642
key=f"TEST_XCOM_KEY{i}",
643-
value=b"null",
643+
value="null",
644644
run_id=self.run_id,
645645
task_id=self.task_id,
646646
dag_id=self.dag_id,

0 commit comments

Comments
 (0)