1616
1717# Temporary schema to hold safe copies
1818TEMP_SCHEMA = "safe_dump"
19- # OUTPUT_PATH = pathlib.Path("/storage/jobserver.dump")
20- OUTPUT_PATH = pathlib .Path ("./jobserver.dump" )
21- # ALLOWLIST_ENV = "DUMP_DB_ALLOWLIST_FILE"
22- ALLOWLIST_SETTING = "DUMP_DB_ALLOWLIST_FILE"
19+ OUTPUT_PATH = pathlib .Path ("/storage/jobserver.dump" )
2320
2421
2522class Job (HourlyJob ):
@@ -28,52 +25,60 @@ class Job(HourlyJob):
@monitor(monitor_slug="dump_db", monitor_config=monitor_config("0 * * * *"))
def execute(self):
    """Produce a database dump at OUTPUT_PATH, replacing any previous dump.

    The allowlist is loaded from ``allow_list.json`` next to this module.
    When it is non-empty, the data is first copied into TEMP_SCHEMA and that
    schema is dumped; otherwise a schema-only dump is taken.  The dump is
    written to a temporary file in the output directory and moved into place
    with ``os.replace`` so readers never see a partially written file.

    Exits the process with status 1 if the output directory does not exist.
    Any error raised while dumping is re-raised after the temporary file has
    been removed.
    """
    db = settings.DATABASES["default"]
    allowlist_path = pathlib.Path(__file__).with_name("allow_list.json")
    allowlist = self._load_allowlist(allowlist_path)
    allowlist_exists = bool(allowlist)

    out_dir = OUTPUT_PATH.parent
    if not out_dir.is_dir():
        print(f"Unknown output directory: {out_dir}", file=sys.stderr)
        sys.exit(1)

    # delete=False: only a unique path in the target directory is needed
    # here; the dump helpers receive that path (presumably pg_dump writes to
    # it -- confirm in the helpers).  Same directory keeps os.replace on one
    # filesystem.
    with tempfile.NamedTemporaryFile(
        prefix="jobserver-", dir=str(out_dir), delete=False
    ) as tmp:
        tmp_name = tmp.name

    try:
        if allowlist_exists:
            self._create_safe_schema_and_copy(allowlist)
            try:
                self._run_pg_dump_for_schema(tmp_name, TEMP_SCHEMA, db)
            finally:
                # The temporary schema must not outlive the dump.
                self._drop_temp_schema()
        else:
            self._run_pg_dump_schema_only(tmp_name, db)

        os.chmod(tmp_name, 0o600)
        os.replace(tmp_name, OUTPUT_PATH)
    except BaseException:
        # BaseException (not just Exception) so that an interrupted run
        # (KeyboardInterrupt / SystemExit) does not leave a stray temp file
        # behind.  Cleanup stays best-effort: a failure to remove the file
        # must not mask the original error.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
        raise
7661
62+ def _fake_expression (self , table : str , col : str , meta : dict ) -> str :
63+ dtype = (meta .get ("data_type" ) or "" ).lower ()
64+
65+ if "char" in dtype or "text" in dtype :
66+ return f"'fake_{ table } _{ col } _' || id::text"
67+
68+ if "boolean" in dtype :
69+ return "false"
70+
71+ if "integer" in dtype or "bigint" in dtype or "smallint" in dtype :
72+ return "0"
73+
74+ if "timestamp" in dtype or "date" in dtype :
75+ return "now()"
76+
77+ if "json" in dtype :
78+ return "'{}'::jsonb"
79+
80+ return "NULL"
81+
7782 def _load_allowlist (self , path : str | None ) -> dict [str , list [str ]]:
7883 if not path :
7984 return {}
@@ -97,15 +102,9 @@ def _load_allowlist(self, path: str | None) -> dict[str, list[str]]:
97102 return {}
98103
99104 def _create_safe_schema_and_copy (self , allowlist : dict [str , list [str ]]):
100- """Create TEMP_SCHEMA and copy data while preserving full table schema.
101-
102- - TEMP_SCHEMA tables are created using LIKE source_table INCLUDING ALL, so all
103- columns and constraints are present.
104- - Columns listed in the allowlist are populated from the source.
105- - Columns not listed in the allowlist remain in the schema but are populated as NULL.
106- """
107105 with connection .cursor () as cur :
108- cur .execute (f"CREATE SCHEMA IF NOT EXISTS { TEMP_SCHEMA } ;" )
106+ cur .execute (f"DROP SCHEMA IF EXISTS { TEMP_SCHEMA } CASCADE;" )
107+ cur .execute (f"CREATE SCHEMA { TEMP_SCHEMA } ;" )
109108
110109 for table_name , columns in allowlist .items ():
111110 if not columns :
@@ -129,21 +128,24 @@ def _valid_ident(x: str) -> bool:
129128 if not _valid_ident (short_table ) or not _valid_ident (schema_name ):
130129 raise ValueError (f"Invalid table name in allowlist: { table_name } " )
131130
132- # Get existing columns for the source table
133131 cur .execute (
134132 """
135- SELECT column_name
133+ SELECT column_name, is_nullable, data_type
136134 FROM information_schema.columns
137135 WHERE table_schema = %s AND table_name = %s;
138136 """ ,
139137 [schema_name , short_table ],
140138 )
141- existing_cols = [row [0 ] for row in cur .fetchall ()]
142-
143- if not existing_cols :
139+ rows = cur .fetchall ()
140+ if not rows :
144141 continue
145142
146- # Validate allowlisted columns and build lookup set
143+ existing_cols = [row [0 ] for row in rows ]
144+ col_meta = {
145+ row [0 ]: {"is_nullable" : row [1 ], "data_type" : row [2 ]} for row in rows
146+ }
147+
148+ # validate allowlisted columns and build lookup set
147149 allowed_set : set [str ] = set ()
148150 for col in columns :
149151 if not _valid_ident (col ):
@@ -167,13 +169,15 @@ def _valid_ident(x: str) -> bool:
167169 except Exception as exc :
168170 raise RuntimeError (f"Failed to create table { dst_table_q } : { exc } " )
169171
170- # Build SELECT list: allowed columns as real values, others as NULL
171172 select_exprs : list [str ] = []
172173 for col in existing_cols :
174+ meta = col_meta [col ]
175+
173176 if col in allowed_set :
174177 select_exprs .append (f'"{ col } "' )
175178 else :
176- select_exprs .append (f'NULL AS "{ col } "' )
179+ expr = self ._fake_expression (short_table , col , meta )
180+ select_exprs .append (f'{ expr } AS "{ col } "' )
177181
178182 select_list = ", " .join (select_exprs )
179183 quoted_all_cols = ", " .join (f'"{ c } "' for c in existing_cols )
0 commit comments