Skip to content

Commit 037275b

Browse files
authored
Merge pull request #488 from Police-Data-Accessibility-Project/dev
Dev
2 parents 5de5f98 + cc581bf commit 037275b

File tree

1,518 files changed

+32505
-11730
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,518 files changed

+32505
-11730
lines changed

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev
1414
# Must call from the root directory because uv does not add playwright to path
1515
RUN playwright install-deps chromium
1616
RUN playwright install chromium
17+
# Download Spacy Model
18+
RUN python -m spacy download en_core_web_sm
1719

1820
# Copy project files
1921
COPY src ./src

ENV.md

Lines changed: 110 additions & 19 deletions
Large diffs are not rendered by default.

alembic/env.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
import logging
21
from datetime import datetime
32
from logging.config import fileConfig
43

54
from alembic import context
65
from sqlalchemy import engine_from_config
76
from sqlalchemy import pool
87

9-
from src.db.helpers import get_postgres_connection_string
10-
from src.db.models.templates import Base
8+
from src.db.helpers.connect import get_postgres_connection_string
9+
from src.db.models.templates_.base import Base
1110

1211
# this is the Alembic Config object, which provides
1312
# access to the values within the .ini file in use.
Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
"""Setup for sync data sources task

Revision ID: 59d2af1bab33
Revises: 9552d354ccf4
Create Date: 2025-07-21 06:37:51.043504

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB

from src.util.alembic_helpers import switch_enum_type, id_column

# revision identifiers, used by Alembic.
revision: str = '59d2af1bab33'
down_revision: Union[str, None] = '9552d354ccf4'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Names of the tables this revision creates, renames, or alters.
SYNC_STATE_TABLE_NAME = "data_sources_sync_state"
URL_DATA_SOURCES_METADATA_TABLE_NAME = "url_data_sources_metadata"

CONFIRMED_AGENCY_TABLE_NAME = "confirmed_url_agency"
LINK_URLS_AGENCIES_TABLE_NAME = "link_urls_agencies"
CHANGE_LOG_TABLE_NAME = "change_log"

AGENCIES_TABLE_NAME = "agencies"

# Tables that receive a change-log trigger (see _add_table_change_log_triggers).
TABLES_TO_LOG = [
    LINK_URLS_AGENCIES_TABLE_NAME,
    "urls",
    "url_data_sources",
    "agencies",
]

# Enum backing the change_log.operation_type column.
OperationTypeEnum = sa.Enum("UPDATE", "DELETE", "INSERT", name="operation_type")
39+
40+
41+
def upgrade() -> None:
    """Apply this revision.

    Adds the data-sources sync state table and task type, renames the
    URL/agency link table, and installs the change-log table plus its
    supporting SQL function and per-table triggers.
    """
    # Sync-state table plus the new 'Sync Data Sources' task type.
    _create_data_sources_sync_state_table()
    _create_data_sources_sync_task()

    # Change-log infrastructure.
    _rename_confirmed_url_agency_to_link_urls_agencies()
    _create_change_log_table()
    _add_jsonb_diff_val_function()
    _create_log_table_changes_trigger()

    _add_table_change_log_triggers()
    _add_agency_id_column()
53+
54+
55+
56+
def downgrade() -> None:
    """Revert this revision, undoing upgrade() step by step."""
    # Remove the sync task type and its state table.
    _drop_data_sources_sync_task()
    _drop_data_sources_sync_state_table()

    # Tear down the change-log infrastructure.
    _drop_change_log_table()
    _drop_table_change_log_triggers()
    _drop_jsonb_diff_val_function()
    _drop_log_table_changes_trigger()

    _rename_link_urls_agencies_to_confirmed_url_agency()

    # The enum is only safe to drop once change_log is gone.
    OperationTypeEnum.drop(op.get_bind())
    _drop_agency_id_column()
68+
69+
70+
71+
def _add_jsonb_diff_val_function() -> None:
    """Create (or replace) the SQL function ``jsonb_diff_val(val1, val2)``.

    Starting from ``val1``, the function removes every key whose value is
    identical in ``val2``, keeps keys whose values differ, and records keys
    present only in ``val2`` with the string ``'null'``.
    """
    sql = """
        CREATE OR REPLACE FUNCTION jsonb_diff_val(val1 JSONB, val2 JSONB)
        RETURNS JSONB AS
        $$
        DECLARE
          result JSONB;
          v RECORD;
        BEGIN
           result = val1;
           FOR v IN SELECT * FROM jsonb_each(val2)
           LOOP
             IF result @> jsonb_build_object(v.key, v.value)
                THEN result = result - v.key;
             ELSIF result ? v.key THEN CONTINUE;
             ELSE
                -- NOTE(review): appends the *string* 'null', not a JSON null;
                -- presumably intentional -- confirm before changing.
                result = result || jsonb_build_object(v.key, 'null');
             END IF;
           END LOOP;
           RETURN result;
        END;
        $$ LANGUAGE plpgsql;
        """
    op.execute(sql)
98+
99+
def _drop_jsonb_diff_val_function() -> None:
    """Drop jsonb_diff_val if it exists (safe to repeat)."""
    op.execute("DROP FUNCTION IF EXISTS jsonb_diff_val(val1 JSONB, val2 JSONB)")
101+
102+
def _create_log_table_changes_trigger() -> None:
    """Create (or replace) the trigger *function* public.log_table_changes().

    Writes one change_log row per DML event: DELETE stores the whole OLD
    row, UPDATE stores only the columns that changed (via jsonb_diff_val),
    INSERT stores the whole NEW row. NOTE(review): despite the name, this
    defines only the function; the per-table triggers that call it are
    attached in _add_table_change_log_triggers().
    """
    op.execute(
        f"""
        CREATE OR REPLACE FUNCTION public.log_table_changes()
            RETURNS trigger
            LANGUAGE 'plpgsql'
            COST 100
            VOLATILE NOT LEAKPROOF
        AS $BODY$
        DECLARE
            old_values JSONB;
            new_values JSONB;
            old_to_new JSONB;
            new_to_old JSONB;
        BEGIN
            -- Handle DELETE operations (store entire OLD row since all data is lost)
            IF (TG_OP = 'DELETE') THEN
                old_values = row_to_json(OLD)::jsonb;

                INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data)
                VALUES ('DELETE', TG_TABLE_NAME, OLD.id, old_values);

                RETURN OLD;

            -- Handle UPDATE operations (only log the changed columns)
            ELSIF (TG_OP = 'UPDATE') THEN
                old_values = row_to_json(OLD)::jsonb;
                new_values = row_to_json(NEW)::jsonb;
                new_to_old = jsonb_diff_val(old_values, new_values);
                old_to_new = jsonb_diff_val(new_values, old_values);

                -- Skip logging if both old_to_new and new_to_old are NULL or empty JSON objects
                IF (new_to_old IS NOT NULL AND new_to_old <> '{{}}') OR
                   (old_to_new IS NOT NULL AND old_to_new <> '{{}}') THEN
                    INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, old_data, new_data)
                    VALUES ('UPDATE', TG_TABLE_NAME, OLD.id, new_to_old, old_to_new);
                END IF;

                RETURN NEW;

            -- Handle INSERT operations
            ELSIF (TG_OP = 'INSERT') THEN
                new_values = row_to_json(NEW)::jsonb;

                -- Skip logging if new_values is NULL or an empty JSON object
                IF new_values IS NOT NULL AND new_values <> '{{}}' THEN
                    INSERT INTO {CHANGE_LOG_TABLE_NAME} (operation_type, table_name, affected_id, new_data)
                    VALUES ('INSERT', TG_TABLE_NAME, NEW.id, new_values);
                END IF;

                RETURN NEW;
            END IF;
        END;
        $BODY$;
        """
    )
158+
159+
def _drop_log_table_changes_trigger() -> None:
    """Drop the trigger named log_table_changes on url_data_sources_metadata.

    NOTE(review): this revision never creates a trigger by this name on
    that table (the created triggers are ``log_<table>_changes`` on
    TABLES_TO_LOG, and the companion create step defines a trigger
    *function*), so with IF EXISTS this is effectively a no-op -- and the
    function public.log_table_changes() is never dropped on downgrade.
    Confirm intent.
    """
    op.execute(
        f"DROP TRIGGER IF EXISTS log_table_changes "
        f"ON {URL_DATA_SOURCES_METADATA_TABLE_NAME}"
    )
161+
162+
def _create_data_sources_sync_state_table() -> None:
    """Create data_sources_sync_state and seed it with one empty row."""
    sync_state = op.create_table(
        SYNC_STATE_TABLE_NAME,
        id_column(),
        sa.Column('last_full_sync_at', sa.DateTime(), nullable=True),
        sa.Column('current_cutoff_date', sa.Date(), nullable=True),
        sa.Column('current_page', sa.Integer(), nullable=True),
    )
    # Seed the singleton state row; every field starts out unset.
    seed_row = {
        "last_full_sync_at": None,
        "current_cutoff_date": None,
        "current_page": None,
    }
    op.bulk_insert(sync_state, [seed_row])
181+
182+
def _drop_data_sources_sync_state_table() -> None:
    """Remove the data_sources_sync_state table on downgrade."""
    op.drop_table(SYNC_STATE_TABLE_NAME)
184+
185+
def _create_data_sources_sync_task() -> None:
    """Extend the task_type enum with the 'Sync Data Sources' value."""
    values_with_sync = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_with_sync,
    )
203+
204+
def _drop_data_sources_sync_task() -> None:
    """Restore the task_type enum without the 'Sync Data Sources' value."""
    values_without_sync = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_without_sync,
    )
221+
222+
def _create_change_log_table() -> None:
    """Create the change_log audit table written to by log_table_changes()."""
    op.create_table(
        CHANGE_LOG_TABLE_NAME,
        id_column(),
        # What happened, on which table, and to which row.
        sa.Column("operation_type", OperationTypeEnum, nullable=False),
        sa.Column("table_name", sa.String(), nullable=False),
        sa.Column("affected_id", sa.Integer(), nullable=False),
        # Row payloads; either side may be absent depending on the operation.
        sa.Column("old_data", JSONB, nullable=True),
        sa.Column("new_data", JSONB, nullable=True),
        sa.Column(
            "created_at", sa.DateTime(), server_default=sa.func.now(), nullable=False
        ),
    )
236+
237+
def _drop_change_log_table() -> None:
    """Remove the change_log table on downgrade."""
    op.drop_table(CHANGE_LOG_TABLE_NAME)
239+
240+
def _rename_confirmed_url_agency_to_link_urls_agencies() -> None:
    """Rename confirmed_url_agency to link_urls_agencies (upgrade direction)."""
    op.rename_table(CONFIRMED_AGENCY_TABLE_NAME, LINK_URLS_AGENCIES_TABLE_NAME)
242+
243+
def _rename_link_urls_agencies_to_confirmed_url_agency() -> None:
    """Rename link_urls_agencies back to confirmed_url_agency (downgrade)."""
    op.rename_table(LINK_URLS_AGENCIES_TABLE_NAME, CONFIRMED_AGENCY_TABLE_NAME)
245+
246+
def _add_table_change_log_triggers() -> None:
    """Attach a log_<table>_changes trigger to every table in TABLES_TO_LOG.

    Each trigger fires per row before INSERT/UPDATE/DELETE and delegates
    to public.log_table_changes().
    """
    for table in TABLES_TO_LOG:
        op.execute(
            f"""
            CREATE OR REPLACE TRIGGER log_{table}_changes
            BEFORE INSERT OR DELETE OR UPDATE
            ON public.{table}
            FOR EACH ROW
            EXECUTE FUNCTION public.log_table_changes();
            """
        )
261+
262+
def _drop_table_change_log_triggers() -> None:
    """Drop the per-table change-log triggers created by this revision.

    Uses DROP TRIGGER IF EXISTS (the original omitted IF EXISTS) so a
    repeated or partially applied downgrade does not abort on a trigger
    that is already gone — consistent with the other drop helpers here.
    """
    for table_name in TABLES_TO_LOG:
        op.execute(
            f"""
            DROP TRIGGER IF EXISTS log_{table_name}_changes
            ON public.{table_name}
            """
        )
273+
274+
def _add_agency_id_column() -> None:
    """Add the standard id column to the agencies table.

    Adds the ``-> None`` annotation for consistency with every other
    helper in this revision. NOTE(review): id_column() presumably yields
    the project's standard integer primary-key column — confirm that
    op.add_column() on an existing table produces the intended constraint.
    """
    op.add_column(
        AGENCIES_TABLE_NAME,
        id_column(),
    )
279+
280+
281+
def _drop_agency_id_column() -> None:
    """Remove the id column added to agencies by this revision.

    Annotated ``-> None`` for consistency with the sibling helpers.
    """
    op.drop_column(
        AGENCIES_TABLE_NAME,
        'id',
    )
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Setup for upload to huggingface task

Revision ID: 637de6eaa3ab
Revises: 59d2af1bab33
Create Date: 2025-07-26 08:30:37.940091

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import id_column, switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '637de6eaa3ab'
down_revision: Union[str, None] = '59d2af1bab33'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Table tracking when the last Hugging Face upload happened.
TABLE_NAME = "huggingface_upload_state"
22+
23+
24+
def upgrade() -> None:
    """Create huggingface_upload_state and register the new task type."""
    op.create_table(
        TABLE_NAME,
        id_column(),
        sa.Column(
            "last_upload_at",
            sa.DateTime(),
            nullable=False,
        ),
    )

    # Extend the task_type enum with 'Push to Hugging Face'.
    values_with_push = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_with_push,
    )
53+
54+
55+
def downgrade() -> None:
    """Drop huggingface_upload_state and remove the new task type."""
    op.drop_table(TABLE_NAME)

    # Restore the task_type enum without 'Push to Hugging Face'.
    values_without_push = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_without_push,
    )

0 commit comments

Comments
 (0)