Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check that the ceph osd df tree weight and size are equal and reflect the current osd size #10641

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions ocs_ci/helpers/osd_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
get_deviceset_count,
resize_osd,
)
from ocs_ci.ocs.cluster import check_ceph_osd_tree, CephCluster
from ocs_ci.ocs.cluster import check_ceph_osd_tree, CephCluster, check_ceph_osd_df_tree
from ocs_ci.ocs.ui.page_objects.page_navigator import PageNavigator
from ocs_ci.utility.utils import (
ceph_health_check,
Expand Down Expand Up @@ -60,6 +60,7 @@ def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pv
old_osd_pvs (list): The old osd PV objects before resizing the osd

Raises:
StorageSizeNotReflectedException: If the OSD pods failed to restart
ResourceWrongStatusException: If the following occurs:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this "Raises" entry is not used, can we remove it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean to remove the line:

  1. The OSD pods failed to reach the status Terminated or to be deleted

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

1. The OSD pods failed to reach the status Terminated or to be deleted
2. The old PVC and PV names are not equal to the current PVC and PV names
Expand All @@ -75,7 +76,7 @@ def check_resources_state_post_resize_osd(old_osd_pods, old_osd_pvcs, old_osd_pv
sleep=20,
)
if not res:
raise ResourceWrongStatusException(
raise StorageSizeNotReflectedException(
"The OSD pods failed to reach the status Terminated or to be deleted"
)

Expand Down Expand Up @@ -227,6 +228,10 @@ def check_ceph_state_post_resize_osd():
raise CephHealthException(ex)
if not check_ceph_osd_tree():
raise CephHealthException("The ceph osd tree checks didn't finish successfully")
if not check_ceph_osd_df_tree():
raise CephHealthException(
"The ceph osd df tree output is not formatted correctly"
)


def base_ceph_verification_steps_post_resize_osd(
Expand Down
139 changes: 139 additions & 0 deletions ocs_ci/ocs/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3602,3 +3602,142 @@ def bring_down_mds_memory_usage_gradually():
assert (
time_elapsed <= 1800
), "Memory usage remained high for more than 30 minutes. Failed to bring down the memory usage of MDS"


def parse_ceph_table_output(raw_output: str) -> pd.DataFrame:
    """
    Parse the Ceph command table output and extract the data into a pandas DataFrame.
    The function assumes that the first row contains the header, with at least two spaces
    separating each column name. Data rows are split on any run of whitespace, so a
    blank cell makes the remaining values of that row shift one column to the left.

    Args:
        raw_output (str): The raw output string from any Ceph command that provides tabular output.

    Returns:
        pd.DataFrame: A pandas DataFrame containing the parsed data, where the columns are
            derived from the header row. All values are kept as strings. Rows with more
            values than the header are truncated, rows that are exactly one value short
            are padded with None, and shorter rows (and blank lines) are skipped.

    """
    # Join size values with their units (e.g., '894 GiB' -> '894GiB', '0 B' -> '0B'),
    # so that a size stays a single token when the row is split on whitespace.
    # Only spaces/tabs are allowed between the number and the unit, so the join can
    # never glue the end of one line to the start of the next one. The bare 'B' unit
    # needs a word boundary to avoid matching the first letter of an unrelated word.
    raw_output = re.sub(r"(\d+)[ \t]+([KMGTPE]iB|B\b)", r"\1\2", raw_output)

    # Split the raw output into lines and use the first line as the header
    lines = raw_output.strip().split("\n")
    header = re.split(r"\s{2,}", lines[0].strip())
    logger.info(f"Extracted Header: {header}")

    num_columns = len(header)
    data = []
    for line in lines[1:]:
        stripped_line = line.strip()
        if not stripped_line:
            # A blank line carries no data; never turn it into a row
            continue

        parts = stripped_line.split()
        if len(parts) < num_columns - 1:
            logger.warning(
                f"Skipping line due to mismatch in number of columns: {line}"
            )
            continue

        row = parts[:num_columns]
        # Pad explicitly: pandas only pads ragged rows when at least one row has the
        # full width, and raises ValueError when every row is shorter than the header.
        row.extend([None] * (num_columns - len(row)))
        data.append(row)

    return pd.DataFrame(data, columns=header)


def get_ceph_osd_df_tree_weight_and_size():
    """
    Run 'ceph osd df tree' on the ceph tools pod and collect the 'ID', 'WEIGHT',
    and 'SIZE' values of every parsed row.

    Returns:
        list: A list of dictionaries where each dictionary contains 'ID', 'WEIGHT', and 'SIZE'.

    """
    tools_pod = storage_cluster.get_ceph_tools_pod()
    raw_table = tools_pod.exec_ceph_cmd(
        ceph_cmd="ceph osd df tree", format=False, out_yaml_format=False
    )
    logger.info(f"ceph osd df tree output = {raw_table}")
    table = parse_ceph_table_output(raw_table)

    def _weight_and_size(row):
        # A '-' in the WEIGHT column means the real values sit one column to the left.
        # NOTE(review): presumably these are the rows with an empty CLASS cell, whose
        # values shift left when the row is split on whitespace - confirm.
        if row["WEIGHT"] == "-":
            return row["CLASS"], row["REWEIGHT"]
        return row["WEIGHT"], row["SIZE"]

    entries = []
    for _, row in table.iterrows():
        weight, size = _weight_and_size(row)
        entries.append({"ID": row["ID"], "WEIGHT": weight, "SIZE": size})

    return entries


def check_ceph_osd_df_tree():
    """
    Check that the ceph osd df tree output values are correct

    For every entry of the 'ceph osd df tree' output, the weight (scaled to the unit
    of the reported size) must be equal to the size within 4%. For entries whose ID
    does not start with '-', the configured storage size must also be equal to the
    reported size within 2%.

    Returns:
        bool: True, if the ceph osd df tree output values are correct. False, otherwise
            (including the case where a weight or size value cannot be parsed).

    """
    logger.info("Verify ceph osd df tree values")
    storage_size_param = storage_cluster.get_storage_size()
    logger.info(f"storage size = {storage_size_param}")
    ceph_output_lines = get_ceph_osd_df_tree_weight_and_size()
    logger.info(f"ceph output lines = {ceph_output_lines}")

    # Regular expression to match the numeric part and the unit of a size (e.g. '2TiB')
    size_pattern = re.compile(r"([0-9.]+)([a-zA-Z]+)")

    for line in ceph_output_lines:
        osd_id = line["ID"]
        raw_weight = line["WEIGHT"]
        raw_size = line["SIZE"]

        # A size without a unit suffix (or a missing value) does not match the pattern.
        # Report it as a failed check instead of crashing on 'None.group()'.
        match = size_pattern.match(raw_size) if isinstance(raw_size, str) else None
        if match is None:
            logger.warning(
                f"Failed to parse the OSD size '{raw_size}' for OSD ID {osd_id}"
            )
            return False
        try:
            weight = float(raw_weight)
            size = float(match.group(1))
        except (TypeError, ValueError):
            logger.warning(
                f"Failed to parse the OSD weight '{raw_weight}' or the OSD size "
                f"'{raw_size}' for OSD ID {osd_id}"
            )
            return False

        # Bring the expected storage size and the weight to the unit of the reported
        # size. The weight is left as is for 'Ti' and scaled by 1024 per unit step below.
        units = match.group(2)
        if units.startswith("Ti"):
            storage_size = convert_device_size(storage_size_param, "TB", 1024)
        elif units.startswith("Gi"):
            storage_size = convert_device_size(storage_size_param, "GB", 1024)
            weight = weight * 1024
        elif units.startswith("Mi"):
            storage_size = convert_device_size(storage_size_param, "MB", 1024)
            weight = weight * (1024**2)
        else:
            # Unknown unit: use the numeric part of the storage size parameter as is
            storage_size = float(storage_size_param[0:-2])

        logger.info(f"OSD size = {size}, weight = {weight}")
        # Check if the weight and size are equal ignoring a small diff
        diff = size * 0.04
        if not (size - diff <= weight <= size + diff):
            logger.warning(
                f"OSD weight {weight} (converted) does not match the OSD size {size} "
                f"for OSD ID {osd_id}. Expected OSD weight within [{size - diff}, {size + diff}]"
            )
            return False
        # If it's a regular OSD entry, check if the expected osd size
        # and the current size are equal ignoring a small diff
        diff = size * 0.02
        if not osd_id.startswith("-") and not (
            size - diff <= storage_size <= size + diff
        ):
            logger.warning(
                f"The storage size {storage_size} does not match the OSD size {size} "
                f"for OSD ID {osd_id}. Expected storage size within [{size - diff}, {size + diff}]"
            )
            return False

    return True
Loading