actively_read_tables_with_partitioning_clustering_info.sql
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * This script creates a table named actively_read_tables_with_part_clust_info
 * that lists the most frequently read tables together with their partitioning
 * and clustering information, so you can spot actively read tables that are:
 * - not partitioned
 * - not clustered
 * - neither partitioned nor clustered
 */
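-- Projects to scan: every project that appears in optimization_workshop.table_read_patterns
-- (assumed to be populated by a companion script before this one runs).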
DECLARE projects ARRAY<STRING> DEFAULT (
  SELECT ARRAY_AGG(DISTINCT project_id)
  FROM optimization_workshop.table_read_patterns
);
CREATE SCHEMA IF NOT EXISTS optimization_workshop;
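-- Destination table: one row per actively read table, combining read-pattern stats,
-- the columns used in filter predicates, partitioning/clustering metadata, and logical storage size.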
CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info
(
  project_id STRING,
  dataset_id STRING,
  table_id STRING,
  total_slot_ms FLOAT64,
  total_jobs INT64,
  max_slot_ms_job_url STRING,
  num_days_queried INT64,
  predicate_columns STRING,
  partitioning_column STRING,
  clustering_columns STRING,
  table_url STRING,
  logical_gigabytes FLOAT64,
  logical_terabytes FLOAT64
);
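-- INFORMATION_SCHEMA.COLUMNS is scoped to a single project, so loop over every project
-- seen in table_read_patterns and splice each project ID into the query with EXECUTE IMMEDIATE.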
FOR p IN (
  SELECT project_id
  FROM UNNEST(projects) project_id
)
DO
BEGIN
  EXECUTE IMMEDIATE FORMAT("""
    INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info
    SELECT
      rp.* EXCEPT(predicates),
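      -- Summarize how often each column appears in this table's filter predicates,
      -- rendered as a comma-separated 'column:count' string.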
      ARRAY_TO_STRING(ARRAY_CONCAT_AGG((
        SELECT
          ARRAY_AGG(predicate_column_counts.column || ':' || CAST(predicate_column_counts.cnt AS STRING))
        FROM (
          SELECT
            STRUCT(predicate.column, COUNT(predicate.column) AS cnt) AS predicate_column_counts
          FROM UNNEST(predicates) predicate
          GROUP BY predicate.column
        ))), ', ') AS predicate_columns,
      partitioning_column,
      clustering_columns,
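      -- Console link for the table, built with the bqutil.fn.table_url community UDF.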
      bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' || rp.table_id) AS table_url,
      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
    FROM
      `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
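    -- Per-table read statistics aggregated from table_read_patterns: total slot-ms,
    -- distinct jobs, the costliest job's URL, and the number of days the table was queried.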
    JOIN (
      SELECT
        project_id,
        dataset_id,
        table_id,
        ARRAY_CONCAT_AGG(predicates) AS predicates,
        SUM(stage_slot_ms) AS total_slot_ms,
        COUNT(DISTINCT job_id) AS total_jobs,
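        -- ANY_VALUE ... HAVING MAX returns the job_url from the row with the largest total_slot_ms.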
        ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url,
        COUNT(DISTINCT date) AS num_days_queried,
      FROM optimization_workshop.table_read_patterns
      GROUP BY
        project_id,
        dataset_id,
        table_id
    ) rp
      ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id)
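    -- Partitioning and clustering metadata from this project's INFORMATION_SCHEMA.COLUMNS:
    -- the partitioning column (if any) and the clustering columns in ordinal order.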
    JOIN (
      SELECT
        table_catalog,
        table_schema,
        table_name,
        STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
        STRING_AGG(
          CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
          ORDER BY CLUSTERING_ORDINAL_POSITION
        ) AS clustering_columns
      FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
      GROUP BY 1, 2, 3
    ) c
      ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
    GROUP BY
      project_id,
      dataset_id,
      table_id,
      table_url,
      total_slot_ms,
      total_jobs,
      max_slot_ms_job_url,
      num_days_queried,
      partitioning_column,
      clustering_columns;
""",
p.project_id);
EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors
END;
END FOR;
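
-- Example follow-up (a sketch, not part of the script): once the table is built, a query like
-- the one below surfaces the most heavily read tables that are neither partitioned nor clustered,
-- which are usually the best candidates for optimization.
--
-- SELECT table_url, total_slot_ms, total_jobs, num_days_queried, predicate_columns
-- FROM optimization_workshop.actively_read_tables_with_part_clust_info
-- WHERE partitioning_column IS NULL
--   AND clustering_columns IS NULL
-- ORDER BY total_slot_ms DESC
-- LIMIT 100;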