Skip to content

Commit a8ead6a

Browse files
committed
pg_duckdb: Bring pg_duckdb-indexed scripts up to date
1 parent 9639642 commit a8ead6a

File tree

4 files changed

+77
-21
lines changed

4 files changed

+77
-21
lines changed

pg_duckdb-indexed/benchmark.sh

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,73 @@
11
#!/bin/bash
22

3-
set -ex
3+
set -eux
44

55
#sudo apt-get update
66
#sudo apt-get install -y docker.io
77
#sudo apt-get install -y postgresql-client
88

99
wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz'
1010
gzip -d hits.tsv.gz
11-
sudo chmod 777 *
1211

13-
sudo docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb -v ./:/tmp/files pgduckdb/pgduckdb:16-main
12+
memory=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
13+
threads=$(nproc)
14+
cpus=$(($threads / 2))
15+
# Shared buffers is set to 25% of memory in AWS RDS by default. We do the same.
16+
# https://docs.aws.amazon.com/prescriptive-guidance/latest/tuning-postgresql-parameters/shared-buffers.html
17+
shared_buffers=$(($memory / 4))
18+
# Effective cache size does not need to be perfect, but it should be somewhat
19+
# close to the total memory minus what is expected to be used for queries.
20+
# https://www.cybertec-postgresql.com/en/effective_cache_size-what-it-means-in-postgresql/
21+
effective_cache_size=$(($memory - ($memory / 4)))
22+
# By default, max_worker_processes is set to 8 in postgres. We want to be able to
23+
# use all the threads for parallel workers so we increase it. We also add a
24+
# small buffer of 15 for any other background workers that might be created.
25+
max_worker_processes=$(($threads + 15))
26+
# We also give DuckDB 25% of the memory to work with.
27+
duckdb_memory=$(($memory / 4))
28+
# Below we make sure to configure the rest of the parallel worker settings to
29+
# match the number of cpu cores:
30+
# https://www.crunchydata.com/blog/postgres-tuning-and-performance-for-analytics-data
31+
#
32+
# We also increase work_mem because we are doing an analytics workload to allow
33+
# some more memory for sorting, aggregations, etc.
34+
#
35+
# It's necessary to increase max_wal_size to make the data load not take very
36+
# long. With the default value it's constantly checkpointing, and the PG logs
37+
# warn you about that and tell you to increase max_wal_size.
38+
39+
sudo docker run -d --name pgduck -p 5432:5432 -e POSTGRES_PASSWORD=duckdb pgduckdb/pgduckdb:17-v0.3.1
40+
41+
sleep 2
1442

1543
sudo docker exec -it pgduck bash -c "
16-
sed -i -e '
17-
s/shared_buffers = 128MB/shared_buffers = 8GB/;
18-
s/#max_parallel_workers = 8/max_parallel_workers = 16/;
19-
s/#max_parallel_workers_per_gather = 2/max_parallel_workers_per_gather = 8/;
20-
s/max_wal_size = 1GB/max_wal_size = 32GB/;
21-
' /var/lib/postgresql/data/postgresql.conf
44+
cat >> /var/lib/postgresql/data/postgresql.conf <<'EOF'
45+
shared_buffers=${shared_buffers}kB
46+
max_worker_processes=${max_worker_processes}
47+
max_parallel_workers=${threads}
48+
max_parallel_maintenance_workers=${cpus}
49+
max_parallel_workers_per_gather=${cpus}
50+
duckdb.max_workers_per_postgres_scan=${cpus}
51+
max_wal_size=32GB
52+
work_mem=64MB
53+
effective_cache_size=${effective_cache_size}kB
54+
duckdb.max_memory=${duckdb_memory}kB
55+
EOF
2256
"
2357

2458
docker restart pgduck
2559

60+
export PGUSER=postgres
61+
export PGPASSWORD=duckdb
62+
2663
sleep 5
27-
docker exec -i pgduck psql -U postgres -c 'CREATE DATABASE test;'
28-
docker exec -i pgduck psql -U postgres -d test -c 'CREATE EXTENSION IF NOT EXISTS pg_duckdb;'
29-
docker exec -i pgduck psql -U postgres -d test -f /tmp/files/create.sql
30-
time docker exec -i pgduck split /tmp/files/hits.tsv --verbose -n r/$(( $(nproc)/2 )) --filter='psql -U postgres -d test -t -c "\\copy hits (WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) FROM STDIN"'
31-
docker exec -i pgduck psql -U postgres -d test -f /tmp/files/index.sql
32-
docker exec -i pgduck du -bcs /var/lib/postgresql/data
64+
psql -t <create.sql
65+
time ./load.sh
3366

34-
docker exec -i pgduck psql -U postgres -d test -c "ALTER DATABASE test SET duckdb.force_execution = true;"
67+
psql -c "ALTER DATABASE postgres SET duckdb.force_execution = true;"
3568
./run.sh 2>&1 | tee log.txt
3669

70+
docker exec -i pgduck du -bcs /var/lib/postgresql/data
71+
3772
cat log.txt | grep -oP 'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' |
3873
awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }'

pg_duckdb-indexed/index.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ CREATE INDEX regionuser on hits (RegionID,UserID);
1717
CREATE INDEX mobile2 on hits (mobilephonemodel) WHERE mobilephonemodel <> ''::text;
1818
CREATE INDEX search2 on hits (searchphrase) WHERE searchphrase <> ''::text;
1919

20-
CREATE EXTENSION IF NOT EXISTS pg_trgm;
20+
2121
CREATE INDEX trgm_idx_title ON hits USING gin (title gin_trgm_ops);
2222
CREATE INDEX trgm_idx_url ON hits USING gin (url gin_trgm_ops);
2323

pg_duckdb-indexed/load.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
3+
set -eux
4+
5+
threads=$(nproc)
6+
cpus=$(($threads / 2))
7+
8+
export PGUSER=postgres
9+
export PGPASSWORD=duckdb
10+
11+
# Using COPY with explicit column mapping to ensure correct alignment.
12+
time split hits.tsv --verbose -n r/$cpus --filter='psql -t -c "\\copy hits (WatchID, JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID, UserID, CounterClass, OS, UserAgent, URL, Referer, IsRefresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, FUniqID, OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, RefererHash, URLHash, CLID) FROM STDIN"'
13+
14+
psql -t -c 'CREATE EXTENSION pg_trgm;'
15+
time psql -t <index.sql
16+
17+
time psql -t -c 'VACUUM ANALYZE hits'

pg_duckdb-indexed/run.sh

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22

33
TRIES=3
44

5+
export PGUSER=postgres
6+
export PGPASSWORD=duckdb
7+
58
cat queries.sql | while read -r query; do
69
sync
710
echo 3 | sudo tee /proc/sys/vm/drop_caches
811

912
echo "$query"
10-
for i in $(seq 1 $TRIES); do
11-
echo -e "\\\timing\n$query" | docker exec -i pgduck psql -U postgres -d test -t | grep 'Time'
12-
done
13-
done
13+
(
14+
echo '\timing'
15+
yes "$query" | head -n $TRIES
16+
) | psql --no-psqlrc --tuples-only | grep 'Time'
17+
done

0 commit comments

Comments
 (0)