forked from ZcashFoundation/zebra
-
Notifications
You must be signed in to change notification settings - Fork 1
722 lines (660 loc) · 31.7 KB
/
sub-deploy-integration-tests-gcp.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
name: Deploy Tests to GCP
on:
workflow_call:
inputs:
# Status and logging
test_id:
required: true
type: string
description: 'Unique identifier for the test'
test_description:
required: true
type: string
description: 'Explains what the test does'
height_grep_text:
required: false
type: string
description: 'Regular expression to find the tip height in test logs, and add it to newly created cached state image metadata'
# Test selection and parameters
test_variables:
required: true
type: string
description: 'Environmental variables used to select and configure the test'
network:
required: false
type: string
default: Mainnet
description: 'Zcash network to test against'
is_long_test:
required: false
type: boolean
default: false
description: 'Does this test need multiple run jobs? (Does it run longer than 6 hours?)'
# Cached state
#
zebra_state_dir:
required: false
type: string
default: '/var/cache/zebrad-cache'
description: 'Zebra cached state directory and input image prefix to search in GCP'
lwd_state_dir:
required: false
type: string
default: '/var/cache/lwd-cache'
description: 'Lightwalletd cached state directory and input image prefix to search in GCP'
disk_prefix:
required: false
type: string
default: 'zebrad-cache'
description: 'Image name prefix, and `zebra_state_dir` name for newly created cached states'
disk_suffix:
required: false
type: string
default: 'tip'
description: 'Image name suffix'
needs_zebra_state:
required: true
type: boolean
description: 'Does the test use Zebra cached state?'
needs_lwd_state:
required: false
type: boolean
description: 'Does the test use Lightwalletd and Zebra cached state?'
# main branch states can be outdated and slower, but they can also be more reliable
prefer_main_cached_state:
required: false
type: boolean
default: false
description: 'Does the test prefer to use a main branch cached state?'
saves_to_disk:
required: true
type: boolean
description: 'Can this test create new or updated cached state disks?'
force_save_to_disk:
required: false
type: boolean
default: false
description: 'Force this test to create a new or updated cached state disk'
app_name:
required: false
type: string
default: 'zebra'
description: 'Application name, used to work out when a job is an update job'
env:
# How many previous log lines we show at the start of each new log job.
# Increase this number if some log lines are skipped between jobs
#
# We want to show all the logs since the last job finished,
# but we don't know how long it will be between jobs.
# 200 lines is about 6-15 minutes of sync logs, or one panic log.
EXTRA_LOG_LINES: 200
# How many blocks to wait before creating an updated cached state image.
# 1 day is approximately 1152 blocks.
CACHED_STATE_UPDATE_LIMIT: 576
jobs:
# Find a cached state disk for ${{ inputs.test_id }}, matching all of:
# - disk cached state prefix -> zebrad-cache or lwd-cache
# - state version (from the source code) - v{N}
# - network (network) - mainnet or testnet
# - disk target height kind (disk_suffix) - checkpoint or tip
#
# If the test needs a lightwalletd state (needs_lwd_state) set the input disk_prefix accordingly
# - To lwd-cache if needed
# - To zebrad-cache if not
#
# Passes the disk name to subsequent jobs using `cached_disk_name` output
# Passes the state version to subsequent jobs using `state_version` output
#
get-disk-name:
name: Get disk name
uses: ./.github/workflows/sub-find-cached-disks.yml
if: ${{ inputs.needs_zebra_state || inputs.needs_lwd_state }}
with:
network: ${{ inputs.network || vars.ZCASH_NETWORK }}
disk_prefix: ${{ inputs.needs_lwd_state && 'lwd-cache' || inputs.needs_zebra_state && 'zebrad-cache' }}
disk_suffix: ${{ inputs.disk_suffix }}
prefer_main_cached_state: ${{ inputs.prefer_main_cached_state }}
test_id: ${{ inputs.test_id }}
# Show all the test logs, then follow the logs of the test we just launched, until it finishes.
# Then check the result of the test.
#
# If `inputs.is_long_test` is `true`, the timeout is 5 days, otherwise it's 3 hours.
test-result:
name: Run ${{ inputs.test_id }} test
runs-on: zfnd-runners
needs: [ get-disk-name ]
if: ${{ !cancelled() && !failure() }}
timeout-minutes: ${{ inputs.is_long_test && 7200 || 180 }}
outputs:
cached_disk_name: ${{ needs.get-disk-name.outputs.cached_disk_name }}
state_version: ${{ needs.get-disk-name.outputs.state_version }}
env:
CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/[email protected]
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/[email protected]
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4
with:
short-length: 7
- name: Downcase network name for disks and labels
run: |
NETWORK_CAPS="${{ inputs.network }}"
echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
# Install our SSH secret
- name: Install private SSH key
uses: shimataro/[email protected]
with:
key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
name: google_compute_engine
known_hosts: unnecessary
- name: Generate public SSH key
run: |
sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/[email protected]
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/[email protected]
# Create a Compute Engine virtual machine and attach a cached state disk using the
# $CACHED_DISK_NAME env as the source image to populate the disk cached state
# if the test needs it.
- name: Create ${{ inputs.test_id }} GCP compute instance
id: create-instance
shell: /usr/bin/bash -x {0}
run: |
NAME="${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }}"
DISK_PARAMS="size=400GB,type=pd-balanced,name=${NAME},device-name=${NAME}"
if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
fi
gcloud compute instances create-with-container "${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \
--boot-disk-size=50GB \
--boot-disk-type=pd-ssd \
--image-project=cos-cloud \
--image-family=cos-stable \
--create-disk="${DISK_PARAMS}" \
--container-image=gcr.io/google-containers/busybox \
--machine-type ${{ inputs.is_long_test && vars.GCP_LARGE_MACHINE || vars.GCP_SMALL_MACHINE }} \
--network-interface=subnet=${{ vars.GCP_SUBNETWORK }} \
--scopes cloud-platform \
--metadata=google-monitoring-enabled=TRUE,google-logging-enabled=TRUE \
--metadata-from-file=startup-script=.github/workflows/scripts/gcp-vm-startup-script.sh \
--labels=app=${{ inputs.app_name }},environment=test,network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }},test=${{ inputs.test_id }} \
--tags ${{ inputs.app_name }} \
--zone ${{ vars.GCP_ZONE }}
# Format the mounted disk if the test doesn't use a cached state.
- name: Format ${{ inputs.test_id }} volume
if: ${{ !inputs.needs_zebra_state && !inputs.needs_lwd_state }}
shell: /usr/bin/bash -ex {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
set -ex;
# Extract the correct disk name based on the device-name
DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-);
sudo mkfs.ext4 -v /dev/$DISK_NAME \
'
# Launch the test with the previously created disk or cached state.
#
# This step uses a $MOUNT_FLAGS variable to mount the disk to the docker container.
# If the test needs Lightwalletd state, we add the Lightwalletd state mount to the $MOUNT_FLAGS variable.
#
# SSH into the just created VM, and create a Docker container to run the incoming test
# from ${{ inputs.test_id }}, then mount the sudo docker volume created in the previous job.
#
# In this step we're using the same disk for simplicity, as mounting multiple disks to the
# VM and to the container might require more steps in this workflow, and additional
# considerations.
#
# The disk mounted in the VM is located at /dev/$DISK_NAME, we mount the root `/` of this disk to the docker
# container, and might have two different paths (if lightwalletd state is needed):
# - ${{ inputs.zebra_state_dir }} and ${{ inputs.lwd_state_dir }}
#
# Currently we do this by mounting the same disk at both paths.
#
# This doesn't cause any path conflicts, because Zebra and lightwalletd create different
# subdirectories for their data. (But Zebra, lightwalletd, and the test harness must not
# delete the whole cache directory.)
#
# These paths must match the variables used by the tests in Rust, which are also set in
# `ci-unit-tests-docker.yml` to be able to run this tests.
#
# Although we're mounting the disk root to both directories, Zebra and Lightwalletd, tests
# will only respect the values from $ZEBRA_CACHED_STATE_DIR and $LIGHTWALLETD_DATA_DIR,
# the inputs like ${{ inputs.zebra_state_dir }} and ${{ inputs.lwd_state_dir }}
# are only used to match those variables paths.
- name: Launch ${{ inputs.test_id }} test
id: launch-test
shell: /usr/bin/bash -x {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
# Extract the correct disk name based on the device-name
DISK_NAME=$(ls -l /dev/disk/by-id | grep -oE "google-${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} -> ../../[^ ]+" | grep -oE "/[^/]+$" | cut -c 2-)
MOUNT_FLAGS="--mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.zebra_state_dir }}"
# Check if we need to mount for Lightwalletd state
# lightwalletd-full-sync reads Zebra and writes lwd, so it is handled specially.
if [[ "${{ inputs.needs_lwd_state }}" == "true" || "${{ inputs.test_id }}" == "lwd-full-sync" ]]; then
MOUNT_FLAGS="$MOUNT_FLAGS --mount type=volume,volume-driver=local,volume-opt=device=/dev/$DISK_NAME,volume-opt=type=ext4,dst=${{ inputs.lwd_state_dir }}"
fi
sudo docker run \
--name ${{ inputs.test_id }} \
--tty \
--detach \
${{ inputs.test_variables }} \
${MOUNT_FLAGS} \
${{ vars.GAR_BASE }}/${{ vars.CI_IMAGE_NAME }}:sha-${{ env.GITHUB_SHA_SHORT }} \
'
# Show debug logs if previous job failed
- name: Show debug logs if previous job failed
if: ${{ failure() }}
shell: /usr/bin/bash -x {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
lsblk;
sudo lsof /dev/$DISK_NAME;
sudo dmesg;
sudo journalctl -b \
'
# Show all the logs since the container launched,
# following until we see zebrad startup messages.
#
# This check limits the number of log lines, so tests running on the wrong network don't
# run until the job timeout. If Zebra does a complete recompile, there are a few hundred log
# lines before the startup logs. So that's what we use here.
#
# The log pipeline ignores the exit status of `docker logs`.
# It also ignores the expected 'broken pipe' error from `tee`,
# which happens when `grep` finds a matching output and moves on to the next job.
#
# Errors in the tests are caught by the final test status job.
- name: Check startup logs for ${{ inputs.test_id }}
shell: /usr/bin/bash -x {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
sudo docker logs \
--tail all \
--follow \
${{ inputs.test_id }} | \
head -700 | \
tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
"Zcash network: ${{ inputs.network }}"; \
'
# Check that the container executed at least 1 Rust test harness test, and that all tests passed.
# Then wait for the container to finish, and exit with the test's exit status.
# Also shows all the test logs.
#
# If the container has already finished, `docker wait` should return its status.
# But sometimes this doesn't work, so we use `docker inspect` as a fallback.
#
# `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
# with that status.
# (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
- name: Result of ${{ inputs.test_id }} test
shell: /usr/bin/bash -x {0}
run: |
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
sudo docker logs \
--tail all \
--follow \
${{ inputs.test_id }} | \
tee --output-error=exit-nopipe /dev/stderr | \
grep --max-count=1 --extended-regexp --color=always \
"test result: .*ok.* [1-9][0-9]* passed.*finished in";
LOGS_EXIT_STATUS=$?;
EXIT_STATUS=$(sudo docker wait ${{ inputs.test_id }} || echo "Error retrieving exit status");
echo "sudo docker exit status: $EXIT_STATUS";
# If grep found the pattern, exit with the Docker container"s exit status
if [ $LOGS_EXIT_STATUS -eq 0 ]; then
exit $EXIT_STATUS;
fi
# Handle other potential errors here
echo "An error occurred while processing the logs.";
exit 1; \
'
# create a state image from the instance's state disk, if requested by the caller
create-state-image:
name: Create ${{ inputs.test_id }} cached state image
runs-on: ubuntu-latest
needs: [ test-result ]
# We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
# Normally, if a job is skipped, all the jobs that depend on it are also skipped.
# So we need to override the default success() check to make this job run.
if: ${{ !cancelled() && !failure() && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
env:
STATE_VERSION: ${{ needs.test-result.outputs.state_version }}
CACHED_DISK_NAME: ${{ needs.test-result.outputs.cached_disk_name }}
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/[email protected]
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/[email protected]
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4
with:
short-length: 7
# Performs formatting on disk name components.
#
# Disk images in GCP are required to be in lowercase, but the blockchain network
# uses sentence case, so we need to downcase ${{ inputs.network }}.
#
# Disk image names in GCP are limited to 63 characters, so we need to limit
# branch names to 12 characters.
#
# Passes ${{ inputs.network }} to subsequent steps using $NETWORK env variable.
# Passes ${{ env.GITHUB_REF_SLUG_URL }} to subsequent steps using $SHORT_GITHUB_REF env variable.
- name: Format network name and branch name for disks
run: |
NETWORK_CAPS="${{ inputs.network }}"
echo "NETWORK=${NETWORK_CAPS,,}" >> "$GITHUB_ENV"
LONG_GITHUB_REF="${{ env.GITHUB_REF_SLUG_URL }}"
echo "SHORT_GITHUB_REF=${LONG_GITHUB_REF:0:12}" >> "$GITHUB_ENV"
# Install our SSH secret
- name: Install private SSH key
uses: shimataro/[email protected]
with:
key: ${{ secrets.GCP_SSH_PRIVATE_KEY }}
name: google_compute_engine
known_hosts: unnecessary
- name: Generate public SSH key
run: |
sudo apt-get update && sudo apt-get -qq install -y --no-install-recommends openssh-client
ssh-keygen -y -f ~/.ssh/google_compute_engine > ~/.ssh/google_compute_engine.pub
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/[email protected]
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/[email protected]
# Sets the $UPDATE_SUFFIX env var to "-u" if updating a previous cached state,
# and the empty string otherwise.
#
# Also sets a unique date and time suffix $TIME_SUFFIX.
- name: Set update and time suffixes
run: |
UPDATE_SUFFIX=""
if [[ "${{ inputs.needs_zebra_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "zebrad" ]]; then
UPDATE_SUFFIX="-u"
fi
# TODO: find a better logic for the lwd-full-sync case
if [[ "${{ inputs.needs_lwd_state }}" == "true" ]] && [[ "${{ inputs.app_name }}" == "lightwalletd" ]] && [[ "${{ inputs.test_id }}" != 'lwd-full-sync' ]]; then
UPDATE_SUFFIX="-u"
fi
# We're going to delete old images after a few days, so we only need the time here
TIME_SUFFIX=$(date '+%H%M%S' --utc)
echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> "$GITHUB_ENV"
echo "TIME_SUFFIX=$TIME_SUFFIX" >> "$GITHUB_ENV"
# Get the full initial and running database versions from the test logs.
# These versions are used as part of the disk description and labels.
#
# If these versions are missing from the logs, the job fails.
#
# Typically, the database versions are around line 20 in the logs..
# But we check the first 1000 log lines, just in case the test harness recompiles all the
# dependencies before running the test. (This can happen if the cache is invalid.)
#
# Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
# $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
- name: Get database versions from logs
shell: /usr/bin/bash -x {0}
run: |
INITIAL_DISK_DB_VERSION=""
RUNNING_DB_VERSION=""
DB_VERSION_SUMMARY=""
DOCKER_LOGS=$( \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
sudo docker logs ${{ inputs.test_id }} | head -1000 \
')
# either a semantic version or "creating new database"
INITIAL_DISK_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$INITIAL_DISK_DB_VERSION" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
# Fail the tests, because Zebra didn't log the initial disk database version,
# or the regex in this step is wrong.
false
fi
if [[ "$INITIAL_DISK_DB_VERSION" = "creating.new.database" ]]; then
INITIAL_DISK_DB_VERSION="new"
else
INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
fi
echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"
RUNNING_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
grep --extended-regexp --only-matching '[0-9\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$RUNNING_DB_VERSION" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing running database version in logs: $RUNNING_DB_VERSION"
# Fail the tests, because Zebra didn't log the running database version,
# or the regex in this step is wrong.
false
fi
RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
echo "Found running database version in logs: $RUNNING_DB_VERSION"
echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"
if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
else
DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
fi
echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"
# Get the sync height from the test logs, which is later used as part of the
# disk description and labels.
#
# The regex used to grep the sync height is provided by ${{ inputs.height_grep_text }},
# this allows to dynamically change the height as needed by different situations or
# based on the logs output from different tests.
#
# If the sync height is missing from the logs, the job fails.
#
# Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
- name: Get sync height from logs
shell: /usr/bin/bash -x {0}
run: |
SYNC_HEIGHT=""
DOCKER_LOGS=$( \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=' \
sudo docker logs ${{ inputs.test_id }} --tail 200 \
')
SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
grep --extended-regexp --only-matching '[0-9]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$SYNC_HEIGHT" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing sync height in logs: $SYNC_HEIGHT"
# Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
# or the CI workflow sync height regex is wrong.
false
fi
echo "Found sync height in logs: $SYNC_HEIGHT"
echo "SYNC_HEIGHT=$SYNC_HEIGHT" >> "$GITHUB_ENV"
# Get the original cached state height from google cloud.
#
# If the height is missing from the image labels, uses zero instead.
#
# TODO: fail the job if needs_zebra_state but the height is missing
# we can make this change after all the old images have been deleted, this should happen around 15 September 2022
# we'll also need to do a manual checkpoint rebuild before opening the PR for this change
#
# Passes the original height to subsequent steps using $ORIGINAL_HEIGHT env variable.
- name: Get original cached state height from google cloud
run: |
ORIGINAL_HEIGHT="0"
ORIGINAL_DISK_NAME="${{ format('{0}', env.CACHED_DISK_NAME) }}"
if [[ -n "$ORIGINAL_DISK_NAME" ]]; then
ORIGINAL_HEIGHT=$(gcloud compute images list --filter="status=READY AND name=$ORIGINAL_DISK_NAME" --format="value(labels.height)")
ORIGINAL_HEIGHT=${ORIGINAL_HEIGHT:-0}
echo "$ORIGINAL_DISK_NAME height: $ORIGINAL_HEIGHT"
else
ORIGINAL_DISK_NAME="new-disk"
echo "newly created disk, original height set to 0"
fi
echo "ORIGINAL_HEIGHT=$ORIGINAL_HEIGHT" >> "$GITHUB_ENV"
echo "ORIGINAL_DISK_NAME=$ORIGINAL_DISK_NAME" >> "$GITHUB_ENV"
# Create an image from the state disk, which will be used for any tests that start
# after it is created. These tests can be in the same workflow, or in a different PR.
#
# Using the newest image makes future jobs faster, because it is closer to the chain tip.
#
# Skips creating updated images if the original image is less than $CACHED_STATE_UPDATE_LIMIT behind the current tip.
# Full sync images are always created.
#
# The image can contain:
# - Zebra cached state, or
# - Zebra + lightwalletd cached state.
# Which cached state is being saved to the disk is defined by ${{ inputs.disk_prefix }}.
#
# Google Cloud doesn't have an atomic image replacement operation.
# We don't want to delete and re-create the image, because that causes a ~5 minute
# window where might be no recent image. So we add an extra image with a unique name,
# which gets selected because it has a later creation time.
# This also simplifies the process of deleting old images,
# because we don't have to worry about accidentally deleting all the images.
#
# The timestamp makes images from the same commit unique,
# as long as they don't finish in the same second.
# (This is unlikely, because each image created by a workflow has a different name.)
#
# The image name must also be 63 characters or less.
# More info: https://cloud.google.com/compute/docs/naming-resources#resource-name-format
#
# Force the image creation (--force) as the disk is still attached even though is not being
# used by the container.
- name: Create image from state disk
run: |
MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then
gcloud compute images create \
"${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
--force \
--source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
--source-disk-zone=${{ vars.GCP_ZONE }} \
--storage-location=us \
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},updated-from-disk=${ORIGINAL_DISK_NAME},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
else
echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT of $ORIGINAL_DISK_NAME"
fi
# delete the Google Cloud instance for this test
delete-instance:
name: Delete ${{ inputs.test_id }} instance
runs-on: ubuntu-latest
needs: [ create-state-image ]
# If a disk generation step timeouts (+6 hours) the previous job (creating the image) will be skipped.
# Even if the instance continues running, no image will be created, so it's better to delete it.
if: always()
continue-on-error: true
permissions:
contents: 'read'
id-token: 'write'
steps:
- uses: actions/[email protected]
with:
persist-credentials: false
fetch-depth: '2'
- uses: r7kamura/[email protected]
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4
with:
short-length: 7
# Setup gcloud CLI
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/[email protected]
with:
workload_identity_provider: '${{ vars.GCP_WIF }}'
service_account: '${{ vars.GCP_DEPLOYMENTS_SA }}'
- name: Set up Cloud SDK
uses: google-github-actions/[email protected]
# Deletes the instances that has been recently deployed in the actual commit after all
# previous jobs have run, no matter the outcome of the job.
- name: Delete test instance
continue-on-error: true
run: |
INSTANCE=$(gcloud compute instances list --filter=${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} --format='value(NAME)')
if [ -z "${INSTANCE}" ]; then
echo "No instance to delete"
else
gcloud compute instances delete "${INSTANCE}" --zone "${{ vars.GCP_ZONE }}" --delete-disks all --quiet
fi