-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbigquery_sampling.sql
More file actions
95 lines (80 loc) · 3.37 KB
/
Copy pathbigquery_sampling.sql
File metadata and controls
95 lines (80 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
-- BigQuery Sampling SQL for A/B/C/D/E Experiment
-- Groups: A=1.00%, B=1.25%, C=1.25%, D=1.25%, E=1.25%
-- Uses deterministic sampling with seeds for reproducibility
-- Replace these variables with your actual values:
-- {PROJECT_ID}, {BQ_DATASET}, {FEATURE_TABLE}
-- Group A: 1.00% of dataset
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_a_sample` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.{FEATURE_TABLE}`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'seed_a_2024'))), 10000) < 100
AND address IS NOT NULL;
-- Group B: 1.25% of dataset
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_b_sample` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.{FEATURE_TABLE}`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'seed_b_2024'))), 10000) < 125
AND address IS NOT NULL;
-- Group C: 1.25% of dataset
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_c_sample` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.{FEATURE_TABLE}`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'seed_c_2024'))), 10000) < 125
AND address IS NOT NULL;
-- Group D: 1.25% of dataset
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_d_sample` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.{FEATURE_TABLE}`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'seed_d_2024'))), 10000) < 125
AND address IS NOT NULL;
-- Group E: 1.25% of dataset
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_e_sample` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.{FEATURE_TABLE}`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'seed_e_2024'))), 10000) < 125
AND address IS NOT NULL;
-- Validation/Holdout split for each group (20% of each group)
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_a_validation` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.group_a_sample`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'validation_seed'))), 100) < 20;
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_b_validation` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.group_b_sample`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'validation_seed'))), 100) < 20;
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_c_validation` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.group_c_sample`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'validation_seed'))), 100) < 20;
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_d_validation` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.group_d_sample`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'validation_seed'))), 100) < 20;
CREATE OR REPLACE TABLE `{PROJECT_ID}.{BQ_DATASET}.group_e_validation` AS
SELECT *
FROM `{PROJECT_ID}.{BQ_DATASET}.group_e_sample`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(address, 'validation_seed'))), 100) < 20;
-- Audit decisions table for logging all API decisions
CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{BQ_DATASET}.audit_decisions` (
timestamp TIMESTAMP,
request_id STRING,
to_address STRING,
amount_btc FLOAT64,
model_group STRING,
model_version STRING,
yhat FLOAT64,
sigma FLOAT64,
z_score FLOAT64,
decision STRING,
reason STRING,
explain_json JSON,
signed_decision_blob STRING
)
PARTITION BY DATE(timestamp);
-- Example usage with specific seeds:
-- To change sampling percentages, modify the comparison values:
-- Group A: < 100 (1.00%) -> < 50 for 0.50%
-- Group B-E: < 125 (1.25%) -> < 250 for 2.50%
--
-- To change seeds, modify the seed strings:
-- 'seed_a_2024' -> 'seed_a_2024_v2' for new random sampling