Skip to content

Commit 6ea3a3f

Browse files
AAP-54269 - Milan finish the full anonymized test (#247)
* Finish the anonymized test
* `out` no longer hardcoded
* Data assertions
* Data assertions with approx
* Handle missing start and filter not-finished jobs
* Add prepare call into tests; remove the setting of start to finished — it would corrupt analysis results
* Safe tarball open
* Pytest approx
* Remove saving the rollups — disable that part of the code
1 parent 2e9e826 commit 6ea3a3f

File tree

8 files changed

+241
-18
lines changed

8 files changed

+241
-18
lines changed

metrics_utility/anonymized_rollups/anonymized_rollups.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,22 +104,26 @@ def anonymize_rollups(events_modules_rollup, execution_environments_rollup, jobs
104104
return data
105105

106106

107-
def compute_anonymized_rollup_from_raw_data(salt, year, month, day):
108-
jobs = load_anonymized_rollup_data(JobsAnonymizedRollup(), './out', year, month, day)
107+
def compute_anonymized_rollup_from_raw_data(salt, year, month, day, base_path, save_rollups: bool = True):
108+
jobs = load_anonymized_rollup_data(JobsAnonymizedRollup(), base_path, year, month, day)
109109
jobs_result = JobsAnonymizedRollup().base(jobs)
110-
JobsAnonymizedRollup().save_rollup(jobs_result['rollup'], './out', year, month, day)
110+
if save_rollups:
111+
JobsAnonymizedRollup().save_rollup(jobs_result['rollup'], base_path, year, month, day)
111112

112-
job_host_summary = load_anonymized_rollup_data(JobHostSummaryAnonymizedRollup(), './out', year, month, day)
113+
job_host_summary = load_anonymized_rollup_data(JobHostSummaryAnonymizedRollup(), base_path, year, month, day)
113114
job_host_summary_result = JobHostSummaryAnonymizedRollup().base(job_host_summary)
114-
JobHostSummaryAnonymizedRollup().save_rollup(job_host_summary_result['rollup'], './out', year, month, day)
115+
if save_rollups:
116+
JobHostSummaryAnonymizedRollup().save_rollup(job_host_summary_result['rollup'], base_path, year, month, day)
115117

116-
events_modules = load_anonymized_rollup_data(EventModulesAnonymizedRollup(), './out', year, month, day)
118+
events_modules = load_anonymized_rollup_data(EventModulesAnonymizedRollup(), base_path, year, month, day)
117119
events_modules_result = EventModulesAnonymizedRollup().base(events_modules)
118-
EventModulesAnonymizedRollup().save_rollup(events_modules_result['rollup'], './out', year, month, day)
120+
if save_rollups:
121+
EventModulesAnonymizedRollup().save_rollup(events_modules_result['rollup'], base_path, year, month, day)
119122

120-
execution_environments = load_anonymized_rollup_data(ExecutionEnvironmentsAnonymizedRollup(), './out', year, month, day)
123+
execution_environments = load_anonymized_rollup_data(ExecutionEnvironmentsAnonymizedRollup(), base_path, year, month, day)
121124
execution_environments_result = ExecutionEnvironmentsAnonymizedRollup().base(execution_environments)
122-
ExecutionEnvironmentsAnonymizedRollup().save_rollup(execution_environments_result['rollup'], './out', year, month, day)
125+
if save_rollups:
126+
ExecutionEnvironmentsAnonymizedRollup().save_rollup(execution_environments_result['rollup'], base_path, year, month, day)
123127

124128
anonymized_rollup = anonymize_rollups(
125129
events_modules_result['json'], execution_environments_result['json'], jobs_result['json'], job_host_summary_result['json'], 'salt'

metrics_utility/anonymized_rollups/jobs_anonymized_rollup.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ class JobsAnonymizedRollup(BaseAnonymizedRollup):
88
Collector - unified_jobs collector data
99
"""
1010

11+
def prepare(self, dataframe):
12+
# filter out jobs that are not finished
13+
dataframe = dataframe[dataframe['finished'].notna()]
14+
return dataframe
15+
1116
def __init__(self):
1217
super().__init__('jobs')
1318
self.collector_names = ['unified_jobs']
@@ -52,9 +57,6 @@ def base(self, dataframe):
5257
if 'failed' in dataframe.columns:
5358
dataframe['failed'] = dataframe['failed'].replace({'t': True, 'f': False}).fillna(False).astype(bool)
5459

55-
# create view from dataframe where finished is not null and started is not null
56-
dataframe = dataframe[dataframe['finished'].notna() & dataframe['started'].notna()]
57-
5860
# compute job duration in seconds, .dt.total_seconds()
5961
dataframe['job_duration_seconds'] = (dataframe['finished'] - dataframe['started']).dt.total_seconds()
6062
dataframe['job_waiting_time_seconds'] = (dataframe['started'] - dataframe['created']).dt.total_seconds()

metrics_utility/anonymized_rollups/task_anonymized_rollups.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from metrics_utility.test.util import run_gather_int
55

66

7-
def task_anonymized_rollups(salt, year, month, day, ship_path):
7+
def task_anonymized_rollups(salt, year, month, day, ship_path, save_rollups: bool = True):
88
env_vars = {
99
'METRICS_UTILITY_SHIP_PATH': ship_path,
1010
'METRICS_UTILITY_SHIP_TARGET': 'directory',
@@ -22,6 +22,6 @@ def task_anonymized_rollups(salt, year, month, day, ship_path):
2222
run_gather_int(env_vars, {'ship': True, 'force': True, 'since': since_param, 'until': until_param})
2323

2424
# load data for each collector
25-
json_data = compute_anonymized_rollup_from_raw_data(salt, year, month, day)
25+
json_data = compute_anonymized_rollup_from_raw_data(salt, year, month, day, ship_path, save_rollups)
2626

2727
return json_data

metrics_utility/test/test_anonymized_rollups/test_events_modules_anonymized_rollups.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ def test_events_modules_aggregations_basic():
308308
# provide default event_data for ignore_errors lookup in prepare_data
309309
df['event_data'] = [{}] * len(df)
310310
events_modules_anonymized_rollup = EventModulesAnonymizedRollup()
311-
prepared = events_modules_anonymized_rollup.prepare(df.copy())
311+
prepared = events_modules_anonymized_rollup.prepare(df)
312312
result = events_modules_anonymized_rollup.base(prepared)
313313
result = result['json']
314314

metrics_utility/test/test_anonymized_rollups/test_execution_environments_anonymized_rollups.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def test_base_counts():
1717
df = pd.DataFrame(execution_environments)
1818

1919
execution_environments_anonymized_rollup = ExecutionEnvironmentsAnonymizedRollup()
20+
df = execution_environments_anonymized_rollup.prepare(df)
2021
result = execution_environments_anonymized_rollup.base(df)
2122
result = result['json']
2223

metrics_utility/test/test_anonymized_rollups/test_from_gather_to_json.py

Lines changed: 217 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,229 @@ def cleanup_glob():
2727

2828

2929
def test_empty_data(cleanup_glob):
30-
compute_anonymized_rollup_from_raw_data('salt', 2025, 6, 13)
30+
compute_anonymized_rollup_from_raw_data('salt', 2025, 6, 13, './out')
3131

3232

3333
def test_from_gather_to_json(cleanup_glob):
3434
# run gather
35-
json_data = task_anonymized_rollups('salt', 2025, 6, 13, './out')
35+
json_data = task_anonymized_rollups('salt', 2025, 6, 13, './out', save_rollups=False)
3636

3737
print(json_data)
3838

3939
# save as json inside rollups/2025/06/13/anonymized.json
40-
with open(f'./out/rollups/{2025}/06/13/anonymized.json', 'w') as f:
40+
json_path = f'./out/rollups/{2025}/06/13/anonymized.json'
41+
42+
# create the dir
43+
os.makedirs(os.path.dirname(json_path), exist_ok=True)
44+
45+
with open(json_path, 'w') as f:
4146
json.dump(json_data, f, indent=4)
47+
48+
# ========== Validate the json_data that are containing what they should ==========
49+
50+
# Validate top-level structure
51+
assert 'events_modules' in json_data, "Missing 'events_modules' in json_data"
52+
assert 'execution_environments' in json_data, "Missing 'execution_environments' in json_data"
53+
assert 'jobs' in json_data, "Missing 'jobs' in json_data"
54+
assert 'job_host_summary' in json_data, "Missing 'job_host_summary' in json_data"
55+
56+
# Validate events_modules structure
57+
events_modules = json_data['events_modules']
58+
assert isinstance(events_modules, dict), 'events_modules should be a dictionary'
59+
assert 'list_of_modules_used_to_automate' in events_modules
60+
assert 'modules_used_to_automate_total' in events_modules
61+
assert 'avg_number_of_modules_used_in_a_playbooks' in events_modules
62+
assert 'modules_used_per_playbook_total' in events_modules
63+
assert 'module_stats' in events_modules
64+
assert 'collection_name_stats' in events_modules
65+
assert 'total_hosts_automated' in events_modules
66+
67+
# Validate events_modules data types
68+
assert isinstance(events_modules['list_of_modules_used_to_automate'], list)
69+
assert isinstance(events_modules['modules_used_to_automate_total'], int)
70+
assert isinstance(events_modules['avg_number_of_modules_used_in_a_playbooks'], (int, float))
71+
assert isinstance(events_modules['modules_used_per_playbook_total'], dict)
72+
assert isinstance(events_modules['module_stats'], list)
73+
assert isinstance(events_modules['collection_name_stats'], list)
74+
assert isinstance(events_modules['total_hosts_automated'], int)
75+
76+
# Validate modules have required fields
77+
if events_modules['list_of_modules_used_to_automate']:
78+
for module in events_modules['list_of_modules_used_to_automate']:
79+
assert 'module_name' in module
80+
assert 'collection_source' in module
81+
assert 'collection_name' in module
82+
83+
# Validate module_stats have required fields
84+
if events_modules['module_stats']:
85+
for module_stat in events_modules['module_stats']:
86+
assert 'module_name' in module_stat
87+
assert 'collection_source' in module_stat
88+
assert 'collection_name' in module_stat
89+
assert 'jobs_total' in module_stat
90+
assert 'hosts_total' in module_stat
91+
92+
# Validate execution_environments structure
93+
execution_envs = json_data['execution_environments']
94+
assert isinstance(execution_envs, dict), 'execution_environments should be a dictionary'
95+
assert 'total_EE' in execution_envs
96+
assert 'default_EE' in execution_envs
97+
assert 'custom_EE' in execution_envs
98+
assert isinstance(execution_envs['total_EE'], int)
99+
assert isinstance(execution_envs['default_EE'], int)
100+
assert isinstance(execution_envs['custom_EE'], int)
101+
102+
# Validate jobs structure
103+
jobs = json_data['jobs']
104+
assert isinstance(jobs, list), 'jobs should be a list'
105+
if jobs:
106+
for job in jobs:
107+
assert 'job_template_name' in job
108+
assert 'number_of_jobs_executed' in job
109+
assert 'number_of_jobs_failed' in job
110+
assert 'job_duration_average_in_seconds' in job
111+
assert 'job_waiting_time_average_in_seconds' in job
112+
113+
# Validate job_host_summary structure
114+
job_host_summary = json_data['job_host_summary']
115+
assert isinstance(job_host_summary, list), 'job_host_summary should be a list'
116+
if job_host_summary:
117+
for jhs in job_host_summary:
118+
assert 'job_template_name' in jhs
119+
assert 'jobs_total' in jhs
120+
assert 'hosts_total' in jhs
121+
assert 'ok_total' in jhs
122+
123+
# Validate anonymization occurred (check for hashed values)
124+
# Job template names should be hashed (128-character hex strings)
125+
if jobs:
126+
for job in jobs:
127+
job_template_name = job['job_template_name']
128+
assert len(job_template_name) == 128, f'Job template name should be hashed (128 chars): {job_template_name}'
129+
assert all(c in '0123456789abcdef' for c in job_template_name), 'Job template name should be hex string'
130+
131+
# ========== Validate actual data values and relationships ==========
132+
133+
# Validate events_modules actual values
134+
print('\n--- Validating events_modules data values ---')
135+
assert events_modules['modules_used_to_automate_total'] == 2, 'Should have 2 modules'
136+
assert events_modules['total_hosts_automated'] == 2, 'Should have 2 hosts automated'
137+
assert len(events_modules['list_of_modules_used_to_automate']) == 2, 'Should have 2 modules in list'
138+
assert len(events_modules['module_stats']) == 2, 'Should have 2 module stats'
139+
assert len(events_modules['collection_name_stats']) == 2, 'Should have 2 collection stats'
140+
141+
# Validate first module is the unencrypted community module
142+
first_module = events_modules['list_of_modules_used_to_automate'][0]
143+
assert first_module['module_name'] == 'a10.acos_axapi.a10_slb_virtual_server', 'First module should be a10_slb_virtual_server'
144+
assert first_module['collection_source'] == 'community', 'First module should be from community'
145+
assert first_module['collection_name'] == 'a10.acos_axapi', 'First module should be from a10.acos_axapi collection'
146+
147+
# Validate second module is hashed (encrypted)
148+
second_module = events_modules['list_of_modules_used_to_automate'][1]
149+
assert len(second_module['module_name']) == 128, 'Second module name should be hashed (128 chars)'
150+
assert second_module['collection_source'] == 'Unknown', 'Second module should have Unknown source'
151+
assert len(second_module['collection_name']) == 128, 'Second module collection should be hashed (128 chars)'
152+
153+
# Validate module_stats actual values
154+
print('--- Validating module_stats data values ---')
155+
first_module_stats = events_modules['module_stats'][0]
156+
assert first_module_stats['module_name'] == 'a10.acos_axapi.a10_slb_virtual_server', 'Module stats should match module'
157+
assert first_module_stats['jobs_total'] == 3, 'Should have 3 jobs using this module'
158+
assert first_module_stats['hosts_total'] == 2, 'Should have 2 hosts for this module'
159+
assert first_module_stats['task_clean_success_total'] == 6, 'Should have 6 successful tasks (3 jobs × 2 hosts)'
160+
assert first_module_stats['task_success_with_reruns_total'] == 0, 'Should have 0 reruns'
161+
assert first_module_stats['task_failed_total'] == 0, 'Should have 0 failures'
162+
assert first_module_stats['avg_hosts_per_job'] == pytest.approx(2.0, rel=1e-6), 'Should average 2 hosts per job'
163+
164+
# Validate second module stats
165+
second_module_stats = events_modules['module_stats'][1]
166+
assert second_module_stats['jobs_total'] == 3, 'Second module should also have 3 jobs'
167+
assert second_module_stats['hosts_total'] == 2, 'Second module should have 2 hosts'
168+
assert second_module_stats['task_clean_success_total'] == 0, 'Second module should have 0 clean successes'
169+
170+
# Validate collection_name_stats
171+
print('--- Validating collection_name_stats data values ---')
172+
first_collection_stats = events_modules['collection_name_stats'][0]
173+
assert first_collection_stats['collection_name'] == 'a10.acos_axapi', 'Collection name should match'
174+
assert first_collection_stats['collection_source'] == 'community', 'Collection should be from community'
175+
assert first_collection_stats['jobs_total'] == 3, 'Collection should have 3 jobs'
176+
assert first_collection_stats['hosts_total'] == 2, 'Collection should have 2 hosts'
177+
assert first_collection_stats['task_clean_success_total'] == 6, 'Collection should have 6 successful tasks'
178+
179+
# Validate modules_used_per_playbook_total structure and values
180+
print('--- Validating modules_used_per_playbook_total ---')
181+
assert len(events_modules['modules_used_per_playbook_total']) == 1, 'Should have 1 playbook'
182+
playbook_module_count = list(events_modules['modules_used_per_playbook_total'].values())[0]
183+
assert playbook_module_count == 2, 'Playbook should use 2 modules'
184+
185+
# Validate avg_number_of_modules_used_in_a_playbooks calculation
186+
total_modules_across_playbooks = sum(events_modules['modules_used_per_playbook_total'].values())
187+
num_playbooks = len(events_modules['modules_used_per_playbook_total'])
188+
expected_avg = total_modules_across_playbooks / num_playbooks if num_playbooks > 0 else 0
189+
assert events_modules['avg_number_of_modules_used_in_a_playbooks'] == pytest.approx(expected_avg, rel=1e-6), (
190+
f'Average should be {expected_avg}, got {events_modules["avg_number_of_modules_used_in_a_playbooks"]}'
191+
)
192+
193+
# Validate execution_environments actual values
194+
print('--- Validating execution_environments data values ---')
195+
assert execution_envs['total_EE'] == 2, 'Should have 2 total execution environments'
196+
assert execution_envs['default_EE'] == 1, 'Should have 1 default execution environment'
197+
assert execution_envs['custom_EE'] == 1, 'Should have 1 custom execution environment'
198+
# Validate that total = default + custom
199+
assert execution_envs['total_EE'] == execution_envs['default_EE'] + execution_envs['custom_EE'], 'Total EE should equal default + custom'
200+
201+
# Validate jobs actual values
202+
print('--- Validating jobs data values ---')
203+
assert len(jobs) == 1, 'Should have 1 job template'
204+
job = jobs[0]
205+
assert job['number_of_jobs_executed'] == 3, 'Job template should have 3 executions'
206+
assert job['number_of_jobs_failed'] == 0, 'Should have 0 failed jobs'
207+
assert job['number_of_jobs_succeeded'] == 3, 'Should have 3 succeeded jobs'
208+
assert job['number_of_jobs_succeeded'] + job['number_of_jobs_failed'] == job['number_of_jobs_executed'], (
209+
'Succeeded + failed should equal total executed'
210+
)
211+
212+
# Validate job duration fields are non-negative
213+
assert job['job_duration_average_in_seconds'] >= 0, 'Job duration average should be non-negative'
214+
assert job['job_duration_total_in_seconds'] >= 0, 'Job duration total should be non-negative'
215+
assert job['job_duration_maximum_in_seconds'] >= job['job_duration_minimum_in_seconds'], 'Max duration should be >= min duration'
216+
217+
# Validate job waiting time fields are non-negative
218+
assert job['job_waiting_time_average_in_seconds'] >= 0, 'Job waiting time average should be non-negative'
219+
assert job['job_waiting_time_total_in_seconds'] >= 0, 'Job waiting time total should be non-negative'
220+
221+
# Validate job_host_summary actual values
222+
print('--- Validating job_host_summary data values ---')
223+
assert len(job_host_summary) == 1, 'Should have 1 job template in summary'
224+
jhs = job_host_summary[0]
225+
assert jhs['jobs_total'] == 3, 'Should have 3 jobs in summary'
226+
assert jhs['hosts_total'] == 2, 'Should have 2 hosts in summary'
227+
assert jhs['ok_total'] == 6, 'Should have 6 ok tasks (3 jobs × 2 hosts)'
228+
assert jhs['dark_total'] == 0, 'Should have 0 dark (unreachable) hosts'
229+
assert jhs['failures_total'] == 0, 'Should have 0 failures'
230+
assert jhs['skipped_total'] == 0, 'Should have 0 skipped tasks'
231+
assert jhs['ignored_total'] == 0, 'Should have 0 ignored failures'
232+
assert jhs['rescued_total'] == 0, 'Should have 0 rescued tasks'
233+
234+
# Validate cross-section data consistency
235+
print('--- Validating cross-section data consistency ---')
236+
assert events_modules['total_hosts_automated'] == jhs['hosts_total'], 'Total hosts automated should match hosts in job_host_summary'
237+
238+
# Validate that module stats hosts match the total automated hosts
239+
for module_stat in events_modules['module_stats']:
240+
assert module_stat['hosts_total'] <= events_modules['total_hosts_automated'], (
241+
f'Module {module_stat["module_name"][:50]} hosts should not exceed total automated hosts'
242+
)
243+
244+
print('✅ All data value assertions passed!')
245+
246+
# Verify data directory exists and contains raw data tarballs
247+
data_path = './out/data/2025/06/13'
248+
assert os.path.exists(data_path), f'Data directory should exist at {data_path}'
249+
250+
# Check that raw data tarballs were created
251+
data_tarballs = [f for f in os.listdir(data_path) if f.endswith('.tar.gz')]
252+
assert len(data_tarballs) > 0, 'Should have raw data tarballs in data directory'
253+
print(f'Found {len(data_tarballs)} raw data tarballs')
254+
255+
print('\n✅ All assertions passed!')

metrics_utility/test/test_anonymized_rollups/test_jobhostsummary_anonymized_rollups.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def test_jobhostsummary_anonymized():
9999
df = pd.DataFrame(jobhostsummary)
100100

101101
jobhostsummary_anonymized_rollup = JobHostSummaryAnonymizedRollup()
102+
df = jobhostsummary_anonymized_rollup.prepare(df)
102103
result = jobhostsummary_anonymized_rollup.base(df)
103104
result = result['json']
104105

metrics_utility/test/test_anonymized_rollups/test_jobs_anonymized_rollups.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def test_jobs_anonymized_rollups_base_aggregation():
8888

8989
df = pd.DataFrame(jobs)
9090
jobs_anonymized_rollup = JobsAnonymizedRollup()
91+
df = jobs_anonymized_rollup.prepare(df)
9192
result = jobs_anonymized_rollup.base(df)
9293
result = result['json']
9394

0 commit comments

Comments
 (0)