Skip to content

Commit 481e836

Browse files
committed
Refactor dump_db script
- Add fake data for non-allowlisted columns
- Remove debug and print statements
1 parent 80955fb commit 481e836

1 file changed

Lines changed: 38 additions & 34 deletions

File tree

jobserver/jobs/hourly/dump_db.py

Lines changed: 38 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,7 @@
1616

1717
# Temporary schema to hold safe copies
1818
TEMP_SCHEMA = "safe_dump"
19-
# OUTPUT_PATH = pathlib.Path("/storage/jobserver.dump")
20-
OUTPUT_PATH = pathlib.Path("./jobserver.dump")
21-
# ALLOWLIST_ENV = "DUMP_DB_ALLOWLIST_FILE"
22-
ALLOWLIST_SETTING = "DUMP_DB_ALLOWLIST_FILE"
19+
OUTPUT_PATH = pathlib.Path("/storage/jobserver.dump")
2320

2421

2522
class Job(HourlyJob):
@@ -28,52 +25,60 @@ class Job(HourlyJob):
2825
@monitor(monitor_slug="dump_db", monitor_config=monitor_config("0 * * * *"))
2926
def execute(self):
3027
db = settings.DATABASES["default"]
31-
32-
# allowlist_path = os.environ.get(ALLOWLIST_ENV) or getattr(settings, ALLOWLIST_SETTING, None)
3328
allowlist_path = pathlib.Path(__file__).with_name("allow_list.json")
3429
allowlist = self._load_allowlist(allowlist_path)
35-
# print(f"Loaded allowlist for {len(allowlist)} tables")
36-
37-
# If allowlist empty -> conservative schema-only dump
38-
dump_rows = bool(allowlist)
39-
# print("dump_rows:", dump_rows)
30+
allowlist_exists = bool(allowlist)
4031

41-
# Ensure output directory exists
4232
out_dir = OUTPUT_PATH.parent
4333
if not out_dir.is_dir():
4434
print(f"Unknown output directory: {out_dir}", file=sys.stderr)
4535
sys.exit(1)
4636

47-
# Temporary output file (atomic replace later)
4837
with tempfile.NamedTemporaryFile(
4938
prefix="jobserver-", dir=str(out_dir), delete=False
5039
) as tmp:
5140
tmp_name = tmp.name
52-
# print(tmp_name)
5341

5442
try:
55-
if dump_rows:
43+
if allowlist_exists:
5644
self._create_safe_schema_and_copy(allowlist)
5745
try:
5846
self._run_pg_dump_for_schema(tmp_name, TEMP_SCHEMA, db)
5947
finally:
60-
# Always drop the temp schema
6148
self._drop_temp_schema()
6249
else:
63-
# No allowlist -> schema-only dump (no row data)
6450
self._run_pg_dump_schema_only(tmp_name, db)
6551

6652
os.chmod(tmp_name, 0o600)
6753
os.replace(tmp_name, OUTPUT_PATH)
6854
except Exception:
69-
# Cleanup on error
7055
try:
7156
if os.path.exists(tmp_name):
7257
os.remove(tmp_name)
7358
except Exception:
7459
pass
7560
raise
7661

62+
def _fake_expression(self, table: str, col: str, meta: dict) -> str:
63+
dtype = (meta.get("data_type") or "").lower()
64+
65+
if "char" in dtype or "text" in dtype:
66+
return f"'fake_{table}_{col}_' || id::text"
67+
68+
if "boolean" in dtype:
69+
return "false"
70+
71+
if "integer" in dtype or "bigint" in dtype or "smallint" in dtype:
72+
return "0"
73+
74+
if "timestamp" in dtype or "date" in dtype:
75+
return "now()"
76+
77+
if "json" in dtype:
78+
return "'{}'::jsonb"
79+
80+
return "NULL"
81+
7782
def _load_allowlist(self, path: str | None) -> dict[str, list[str]]:
7883
if not path:
7984
return {}
@@ -97,15 +102,9 @@ def _load_allowlist(self, path: str | None) -> dict[str, list[str]]:
97102
return {}
98103

99104
def _create_safe_schema_and_copy(self, allowlist: dict[str, list[str]]):
100-
"""Create TEMP_SCHEMA and copy data while preserving full table schema.
101-
102-
- TEMP_SCHEMA tables are created using LIKE source_table INCLUDING ALL, so all
103-
columns and constraints are present.
104-
- Columns listed in the allowlist are populated from the source.
105-
- Columns not listed in the allowlist remain in the schema but are populated as NULL.
106-
"""
107105
with connection.cursor() as cur:
108-
cur.execute(f"CREATE SCHEMA IF NOT EXISTS {TEMP_SCHEMA};")
106+
cur.execute(f"DROP SCHEMA IF EXISTS {TEMP_SCHEMA} CASCADE;")
107+
cur.execute(f"CREATE SCHEMA {TEMP_SCHEMA};")
109108

110109
for table_name, columns in allowlist.items():
111110
if not columns:
@@ -129,21 +128,24 @@ def _valid_ident(x: str) -> bool:
129128
if not _valid_ident(short_table) or not _valid_ident(schema_name):
130129
raise ValueError(f"Invalid table name in allowlist: {table_name}")
131130

132-
# Get existing columns for the source table
133131
cur.execute(
134132
"""
135-
SELECT column_name
133+
SELECT column_name, is_nullable, data_type
136134
FROM information_schema.columns
137135
WHERE table_schema = %s AND table_name = %s;
138136
""",
139137
[schema_name, short_table],
140138
)
141-
existing_cols = [row[0] for row in cur.fetchall()]
142-
143-
if not existing_cols:
139+
rows = cur.fetchall()
140+
if not rows:
144141
continue
145142

146-
# Validate allowlisted columns and build lookup set
143+
existing_cols = [row[0] for row in rows]
144+
col_meta = {
145+
row[0]: {"is_nullable": row[1], "data_type": row[2]} for row in rows
146+
}
147+
148+
# validate allowlisted columns and build lookup set
147149
allowed_set: set[str] = set()
148150
for col in columns:
149151
if not _valid_ident(col):
@@ -167,13 +169,15 @@ def _valid_ident(x: str) -> bool:
167169
except Exception as exc:
168170
raise RuntimeError(f"Failed to create table {dst_table_q}: {exc}")
169171

170-
# Build SELECT list: allowed columns as real values, others as NULL
171172
select_exprs: list[str] = []
172173
for col in existing_cols:
174+
meta = col_meta[col]
175+
173176
if col in allowed_set:
174177
select_exprs.append(f'"{col}"')
175178
else:
176-
select_exprs.append(f'NULL AS "{col}"')
179+
expr = self._fake_expression(short_table, col, meta)
180+
select_exprs.append(f'{expr} AS "{col}"')
177181

178182
select_list = ", ".join(select_exprs)
179183
quoted_all_cols = ", ".join(f'"{c}"' for c in existing_cols)

0 commit comments

Comments
 (0)