From 06e867a6a7d6a941453e4790fe3b3e96f7a295b7 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Sun, 12 Jan 2025 22:22:10 +0000 Subject: [PATCH 1/9] issue-2831: dump iolog for fio_index mount_kikimr tests --- build/ext_mapping.conf.json | 2 +- .../tests/fio_index/mount-kikimr-test/test.py | 3 ++- .../core/tools/testing/fio/lib/__init__.py | 19 ++++++++++++++++--- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/build/ext_mapping.conf.json b/build/ext_mapping.conf.json index 214e6fdaa09..7eee50da573 100644 --- a/build/ext_mapping.conf.json +++ b/build/ext_mapping.conf.json @@ -5,7 +5,7 @@ "5074323334": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/rootfs-compressed.img", "4399070549": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/blockstore-plugin-stable-23-1.tar.gz", "3240550068": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/blockstore-plugin-stable-22-2.tar.gz", - "2326530962": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/fio-static.tgz", + "2326530962": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/fio-3.38-static.tgz", "4449551218": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/qemu-static.tgz", "3307630510": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/rootfs.img", "4556399018": "https://storage.eu-north1.nebius.cloud/nbs-oss-resources/virtiofs-server", diff --git a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py index d8048e72b11..8cb400ffa18 100644 --- a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py +++ b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py @@ -13,4 +13,5 @@ def test_fio(name): mount_dir = get_filestore_mount_path() dir_name = fio.get_dir_name(mount_dir, name) - fio.run_index_test(dir_name, TESTS[name], fail_on_errors=True) + # TODO(#2831): remove this debug information + fio.run_index_test(dir_name, TESTS[name], fail_on_errors=True, verbose=True) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 7c5e36b8629..20fb7840831 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -175,12 +175,25 @@ def get_fio_cmd(self, fio_bin, file_name): cmd += ["--filename", file_name] return cmd - def get_index_fio_cmd(self, fio_bin, directory): + def get_index_fio_cmd(self, fio_bin, directory, verbose=False): cmd = self.get_common_fio_cmd(fio_bin) cmd += [ "--directory", directory, "--numjobs", str(self.numjobs) ] + if verbose: + cmd += [ + "--write_iolog", + os.path.join(common.output_path(), f"{self.name}.iolog"), + "--log_issue_time", + "1", + "--write_lat_log", + os.path.join(common.output_path(), f"{self.name}.lat.log"), + "--write_bw_log", + os.path.join(common.output_path(), f"{self.name}.bw.log"), + "--log_offset", + "1", + ] if self.fsync > 0: cmd += ["--fsync", str(self.fsync)] if self.fdatasync > 0: @@ -311,7 +324,7 @@ def run_test(file_name, test, fail_on_errors=False): return _execute_command(cmd, fail_on_errors) -def run_index_test(directory, test, fail_on_errors=False): +def run_index_test(directory, test, fail_on_errors=False, verbose=False): # fio lays out the test file using the job blocksize, which may exhaust the # run time limit, so do it ourselves logger.info("laying out files in directory " + directory) @@ -319,6 +332,6 @@ def run_index_test(directory, test, fail_on_errors=False): logger.info("laid out") fio_bin = _get_fio_bin() - cmd = test.get_index_fio_cmd(fio_bin, directory) + cmd = test.get_index_fio_cmd(fio_bin, directory, verbose) return _execute_command(cmd, fail_on_errors) From e16b597fcb5a3168de1806240c122d8ad7bdecf6 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Mon, 20 Jan 2025 02:17:57 +0100 Subject: [PATCH 2/9] add ps --- .../core/tools/testing/fio/lib/__init__.py | 68 +++++++++++++++++-- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 20fb7840831..931b8578854 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -3,6 +3,8 @@ import logging import os import subprocess +import threading +import time import uuid import yatest.common as common @@ -294,7 +296,7 @@ def _lay_out_files(directory, name, jobs, size): def _execute_command(cmd, fail_on_errors): - logger.info("execute " + " ".join(cmd)) + print("execute " + " ".join(cmd)) ex = common.execute( cmd, stdout=subprocess.PIPE, @@ -314,9 +316,9 @@ def _execute_command(cmd, fail_on_errors): def run_test(file_name, test, fail_on_errors=False): # fio lays out the test file using the job blocksize, which may exhaust the # run time limit, so do it ourselves - logger.info("laying out file " + file_name) + print("laying out file " + file_name) _lay_out_file(file_name, test.size) - logger.info("laid out") + print("laid out") fio_bin = _get_fio_bin() cmd = test.get_fio_cmd(fio_bin, file_name) @@ -327,11 +329,67 @@ def run_test(file_name, test, fail_on_errors=False): def run_index_test(directory, test, fail_on_errors=False, verbose=False): # fio lays out the test file using the job blocksize, which may exhaust the # run time limit, so do it ourselves - logger.info("laying out files in directory " + directory) + print("laying out files in directory " + directory) _lay_out_files(directory, test.name, test.numjobs, test.size) - logger.info("laid out") + print("laid out") fio_bin = _get_fio_bin() cmd = test.get_index_fio_cmd(fio_bin, directory, verbose) + parent_pids = {str(os.getpid())} + + def monitor_fio_progress(): + nonlocal cmd, parent_pids + period_sec = 0.001 + pid_seen = False + while True: + # use pgrep to find the fio process + + pgrep_process = subprocess.Popen( + ["pgrep", "-P", ",".join(list(parent_pids))], + stdout=subprocess.PIPE, + ) + + stdout = pgrep_process.stdout.read() if pgrep_process.stdout else "" + stdout = stdout.decode("utf-8") + if len(stdout) > 0: + fio_pids = list(map(int, stdout.split())) + logging.info( + "Fio process is still running with PIDs: {}".format( + fio_pids + ) + ) + # os.system("ps -p {} -o pid,ppid,cmd,%cpu,%mem,etime".format(",".join(map(str, fio_pids)))) + status_process = subprocess.Popen( + [ + "ps", + "-F", + "-p", + ",".join(map(str, fio_pids)), + ], + stdout=subprocess.PIPE, + ) + status_process.wait() + logging.info( + "Fio process status: " + + status_process.stdout.read().decode("utf-8") + if status_process.stdout + else "" + ) + pid_seen = True + parent_pids = parent_pids.union(set(map(str, fio_pids))) + else: + if pid_seen: + logging.info("Fio process has finished") + break + else: + logging.info("Fio process has not started yet") + + time.sleep(period_sec) + + # Monitoring process to report the progress of the test + monitoring_thread = threading.Thread(target=monitor_fio_progress, args=()) + monitoring_thread.start() + + # This will call popen and wait for the process to finish return _execute_command(cmd, fail_on_errors) From a98fd93449b8afd6a9b80ef54912a965b366f25d Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Mon, 20 Jan 2025 14:40:55 +0100 Subject: [PATCH 3/9] add dmesg and strace --- .../core/tools/testing/fio/lib/__init__.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 931b8578854..a40b5e5cf3a 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -342,6 +342,8 @@ def monitor_fio_progress(): nonlocal cmd, parent_pids period_sec = 0.001 pid_seen = False + start_time = time.time() + timout = 60 while True: # use pgrep to find the fio process @@ -378,6 +380,34 @@ def monitor_fio_progress(): ) pid_seen = True parent_pids = parent_pids.union(set(map(str, fio_pids))) + + if time.time() - start_time > timout: + logging.error("Fio process has timed out") + dmesg = subprocess.Popen( + ["sudo", "dmesg"], + stdout=subprocess.PIPE, + ) + dmesg.wait() + logging.error( + "dmesg output: " + + dmesg.stdout.read().decode("utf-8") + if dmesg.stdout + else "" + ) + for pid in fio_pids: + strace = subprocess.Popen( + ["sudo", "strace", "-p", str(pid)], + stdout=subprocess.PIPE, + ) + strace.wait() + logging.error( + "strace output for pid {}: ".format(pid) + + strace.stdout.read().decode("utf-8") + if strace.stdout + else "" + ) + break + else: if pid_seen: logging.info("Fio process has finished") From 060adcdf01d0d9572ff8388ad97568b993f5990a Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Mon, 20 Jan 2025 16:47:09 +0100 Subject: [PATCH 4/9] fix dmesg and strace --- .../core/tools/testing/fio/lib/__init__.py | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index a40b5e5cf3a..2f88b732884 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -383,29 +383,36 @@ def monitor_fio_progress(): if time.time() - start_time > timout: logging.error("Fio process has timed out") - dmesg = subprocess.Popen( - ["sudo", "dmesg"], - stdout=subprocess.PIPE, - ) - dmesg.wait() - logging.error( - "dmesg output: " - + dmesg.stdout.read().decode("utf-8") - if dmesg.stdout - else "" - ) - for pid in fio_pids: - strace = subprocess.Popen( - ["sudo", "strace", "-p", str(pid)], - stdout=subprocess.PIPE, - ) - strace.wait() - logging.error( - "strace output for pid {}: ".format(pid) - + strace.stdout.read().decode("utf-8") - if strace.stdout - else "" + try: + with open( + common.output_path() + "/dmesg.txt", "w" + ) as dmesg_output: + subprocess.run( + ["sudo", "-n", "dmesg", "-T"], + stdout=dmesg_output, + stderr=dmesg_output, + timeout=10, + ) + except Exception as dmesg_error: + logging.info( + f"Failed to save dmesg output: {dmesg_error}" ) + + for pid in fio_pids: + with open( + common.output_path() + f"/strace_{pid}.txt", "w" + ) as strace_output: + try: + subprocess.run( + ["sudo", "-n", "strace", "-p", str(pid)], + stdout=strace_output, + stderr=strace_output, + timeout=10, + ) + except Exception as strace_error: + logging.info( + f"Failed to save strace output: {strace_error}" + ) break else: From af45edf98e92c9cf59e721d6587aa8f738cd0584 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Mon, 20 Jan 2025 16:49:23 +0100 Subject: [PATCH 5/9] fix dmesg and strace --- cloud/storage/core/tools/testing/fio/lib/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 2f88b732884..5077bc5085b 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -393,6 +393,7 @@ def monitor_fio_progress(): stderr=dmesg_output, timeout=10, ) + logging.info("Saved dmesg output to dmesg.txt") except Exception as dmesg_error: logging.info( f"Failed to save dmesg output: {dmesg_error}" @@ -404,11 +405,14 @@ def monitor_fio_progress(): ) as strace_output: try: subprocess.run( - ["sudo", "-n", "strace", "-p", str(pid)], + ["strace", "-p", str(pid)], stdout=strace_output, stderr=strace_output, timeout=10, ) + logging.info( + f"Saved strace output for PID {pid} to strace_{pid}.txt" + ) except Exception as strace_error: logging.info( f"Failed to save strace output: {strace_error}" From a50550a9e6d30e1d880142389d3e258880c18da9 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Tue, 21 Jan 2025 17:55:56 +0100 Subject: [PATCH 6/9] fix dmesg and strace --- cloud/storage/core/tools/testing/fio/lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 5077bc5085b..5c9d46a430b 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -405,7 +405,7 @@ def monitor_fio_progress(): ) as strace_output: try: subprocess.run( - ["strace", "-p", str(pid)], + ["sudo", "strace", "-p", str(pid)], stdout=strace_output, stderr=strace_output, timeout=10, From e185f4a9800e54ed12e067e53756f6fb7fd9306b Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Tue, 21 Jan 2025 19:27:14 +0100 Subject: [PATCH 7/9] fix dmesg and strace --- cloud/filestore/bin/nfs/nfs-storage.txt | 3 ++- cloud/filestore/tests/fio_index/mount-kikimr-test/test.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cloud/filestore/bin/nfs/nfs-storage.txt b/cloud/filestore/bin/nfs/nfs-storage.txt index fcdfd74412d..171c4f64665 100644 --- a/cloud/filestore/bin/nfs/nfs-storage.txt +++ b/cloud/filestore/bin/nfs/nfs-storage.txt @@ -1,2 +1,3 @@ SchemeShardDir: "/Root/NFS" -MultiTabletForwardingEnabled: true \ No newline at end of file +MultiTabletForwardingEnabled: true +DirectoryCreationInShardsEnabled: true \ No newline at end of file diff --git a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py index 8cb400ffa18..aff9e43aa0c 100644 --- a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py +++ b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py @@ -15,3 +15,4 @@ def test_fio(name): # TODO(#2831): remove this debug information fio.run_index_test(dir_name, TESTS[name], fail_on_errors=True, verbose=True) + assert False From 6822b1f09a590037112f989bc536defca83d0c22 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Wed, 22 Jan 2025 17:25:11 +0100 Subject: [PATCH 8/9] dump id --- cloud/filestore/tests/fio_index/mount-kikimr-test/test.py | 2 +- cloud/storage/core/tools/testing/fio/lib/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py index aff9e43aa0c..3ef8329cb9f 100644 --- a/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py +++ b/cloud/filestore/tests/fio_index/mount-kikimr-test/test.py @@ -15,4 +15,4 @@ def test_fio(name): # TODO(#2831): remove this debug information fio.run_index_test(dir_name, TESTS[name], fail_on_errors=True, verbose=True) - assert False + # assert False diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 5c9d46a430b..326a468de6c 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -398,6 +398,8 @@ def monitor_fio_progress(): logging.info( f"Failed to save dmesg output: {dmesg_error}" ) + os.system("id") + os.system("sudo id") for pid in fio_pids: with open( From 4d36f8b3ea8d12b719ce37dc936cd6f8455d9699 Mon Sep 17 00:00:00 2001 From: Maxim Deb Natkh Date: Fri, 24 Jan 2025 14:14:07 +0100 Subject: [PATCH 9/9] =?UTF-8?q?add=20more=20strace=20logs=20=F0=9F=98=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cloud/storage/core/tools/testing/fio/lib/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cloud/storage/core/tools/testing/fio/lib/__init__.py b/cloud/storage/core/tools/testing/fio/lib/__init__.py index 326a468de6c..c4aebc2745b 100644 --- a/cloud/storage/core/tools/testing/fio/lib/__init__.py +++ b/cloud/storage/core/tools/testing/fio/lib/__init__.py @@ -295,7 +295,10 @@ def _lay_out_files(directory, name, jobs, size): _lay_out_file('{}/{}.{}.0'.format(directory, name, i), size) -def _execute_command(cmd, fail_on_errors): +def _execute_command(cmd, fail_on_errors, debug=False): + if debug: + strace_file = os.path.join(common.output_path(), "strace_" + str(uuid.uuid4()) + ".log") + cmd = ["strace", "-f", "-o", strace_file] + cmd print("execute " + " ".join(cmd)) ex = common.execute( cmd, @@ -435,4 +438,4 @@ def monitor_fio_progress(): monitoring_thread.start() # This will call popen and wait for the process to finish - return _execute_command(cmd, fail_on_errors) + return _execute_command(cmd, fail_on_errors, debug=verbose)