From c9fa8939000f715523ca97e7dba18147b1a16f99 Mon Sep 17 00:00:00 2001 From: Alex Krzos Date: Thu, 18 Jun 2026 15:08:28 -0400 Subject: [PATCH] Add per-phase Prometheus analysis and consistent logging/reporting to load tools Extract launch_prometheus_analysis() into shared utils/analysis.py module used by both acm-deploy-load.py and acm-telco-core-load.py. Add phased workload structure (Idle Baseline, Cluster Deployment, Soak Baseline) with automatic background Prometheus analysis at phase boundaries for resource consumption comparison testing. Align logging, report generation, and section naming across both load tools. - Add -k/--kubeconfig and --no-prometheus-analysis CLI args to acm-deploy-load.py - Run 3 background analyses: phase1-idle-baseline, phase2-cluster-deployment, phase3-soak-baseline - Add periodic countdown logging during idle and soak baseline phases - Consolidate workload parameter logging into structured block matching telco-core pattern - Move telco-core inline report to generate_telco_core_load_report() in utils/output.py - Rename generate_report() to generate_deploy_load_report() for clarity - Rename "Deployed Cluster Orchestration" to "Workload Parameters" in report - Align report section names: Workload Parameters, Workload Duration Results, Workload Phases - Enrich deploy-load report Workload Parameters with phase durations, wait configs, and cluster details Co-Authored-By: Claude Opus 4.6 --- README.md | 12 +- acm-deploy-load/acm-deploy-load.py | 170 ++++++++++++------ acm-deploy-load/acm-telco-core-load.py | 232 ++++++++++--------------- acm-deploy-load/utils/analysis.py | 62 +++++++ acm-deploy-load/utils/common_ocp.py | 9 + acm-deploy-load/utils/output.py | 169 ++++++++++++++++-- acm-deploy-load/utils/ztp_monitor.py | 15 +- scripts/interval-ztp-install-all.sh | 20 ++- 8 files changed, 469 insertions(+), 220 deletions(-) create mode 100644 acm-deploy-load/utils/analysis.py diff --git a/README.md b/README.md index 6d2eb3ff..17c32c66 100644 --- a/README.md +++ 
b/README.md @@ -91,10 +91,14 @@ Deploys SNO, Compact, or Standard clusters via Assisted Installer or Image-Based **Workload Phases:** -1. Deploy Phase — Apply manifests or push to GitOps to deploy clusters -2. Wait for Cluster Install Completion -3. Wait for DU Profile Completion (optional) -4. Report Card / Graphing +1. Phase 1 / Idle Baseline — Pre-deployment delay for baseline resource measurements (`--start-delay`) +2. Phase 2 / Cluster Deployment — Apply manifests or push to GitOps to deploy clusters + - Wait for Cluster Install Completion + - Wait for DU Profile Completion (optional) + - Wait for Playbook Completion (optional) +3. Phase 3 / Soak Baseline — Post-deployment delay for steady-state resource measurements (`--end-delay`) + +Optional per-phase Prometheus analysis runs automatically at phase boundaries (disable with `--no-prometheus-analysis`). ### acm-telco-core-load.py diff --git a/acm-deploy-load/acm-deploy-load.py b/acm-deploy-load/acm-deploy-load.py index 1f9c12dd..5d183287 100755 --- a/acm-deploy-load/acm-deploy-load.py +++ b/acm-deploy-load/acm-deploy-load.py @@ -21,9 +21,10 @@ from datetime import datetime, timedelta, timezone import glob from jinja2 import Template -from utils.common_ocp import detect_aap_install +from utils.analysis import launch_prometheus_analysis +from utils.common_ocp import detect_aap_install, validate_kubeconfig from utils.command import command -from utils.output import generate_report +from utils.output import generate_deploy_load_report from utils.output import phase_break from utils.ztp_monitor import ZTPMonitor from utils.talm import detect_talm_minor @@ -237,10 +238,12 @@ def main(): description="Tool to load ACM with Cluster deployments via manifests or GitOps ZTP", prog="acm-deploy-load.py", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-m", "--method", choices=install_methods, default="ai-siteconfig-gitops", + parser.add_argument("-m", "--method", choices=install_methods, 
default="ibi-clusterinstance-gitops", help="The method of cluster install, ai - Assisted-Installer, ibi - Image-Based-Installer") # "Global" args + parser.add_argument("-k", "--kubeconfig", type=str, default="/root/mno/kubeconfig", + help="Changes which kubeconfig to connect to the hub cluster") parser.add_argument("-cm", "--cluster-manifests", type=str, default="/root/hv-vm/", help="The location of the cluster manifests, siteconfigs and resource files") parser.add_argument("-a", "--argocd-directory", type=str, @@ -252,9 +255,9 @@ def main(): parser.add_argument("-n", "--no-shuffle", action="store_true", default=False, help="Do not shuffle the list of discovered installable clusters") parser.add_argument("--start-delay", type=int, default=15, - help="Delay to starting deploys, allowing monitor thread to gather data (seconds)") + help="Phase 1 / Idle baseline delay before starting deploys (seconds)") parser.add_argument("--end-delay", type=int, default=120, - help="Delay on end, allows monitor thread to gather additional data points (seconds)") + help="Phase 3 / Soak baseline delay after deploys complete (seconds)") parser.add_argument("--clusters-per-app", type=int, default=100, help="Maximum number of clusters per cluster application") parser.add_argument("--wait-cluster-max", type=int, default=10800, @@ -291,6 +294,10 @@ def main(): parser.add_argument("-d", "--debug", action="store_true", default=False, help="Set log level debug") parser.add_argument("--dry-run", action="store_true", default=False, help="Echos commands instead of executing them") + # Prometheus analysis options + parser.add_argument("--no-prometheus-analysis", action="store_true", default=False, + help="Do not run analyze-prometheus.py in background post each phase") + subparsers = parser.add_subparsers(dest="rate") parser_interval = subparsers.add_parser("interval", help="Interval rate method of deploying clusters", @@ -323,20 +330,21 @@ def main(): phase_break() logger.debug("CLI Args: 
{}".format(cliargs)) + # Validate kubeconfig + validate_kubeconfig(cliargs.kubeconfig) + # Detect TALM version talm_minor = int(detect_talm_minor(cliargs.talm_version, cliargs.dry_run)) logger.info("Using TALM cgu monitoring based on TALM minor version: {}".format(talm_minor)) # Detect AAP install - if detect_aap_install(dry_run=cliargs.dry_run): + if detect_aap_install(cliargs.kubeconfig, cliargs.dry_run): logger.info("AAP install detected, waiting for playbook completion") cliargs.wait_playbook = True else: logger.info("AAP install not detected") - # Validate parameters and display rate and method plan - logger.info("Deploying Clusters rate: {}".format(cliargs.rate)) - logger.info("Deploying Clusters method: {}".format(cliargs.method)) + # Validate parameters if (cliargs.start < 0): logger.error("Cluster start index must be equal to or greater than 0") sys.exit(1) @@ -356,29 +364,6 @@ def main(): if not (cliargs.interval >= 0): logger.error("Interval must be equal to or greater than 0") sys.exit(1) - logger.info(" * {} Cluster(s) per {}s interval".format(cliargs.batch, cliargs.interval)) - logger.info(" * Start Index: {}, End Index: {}".format(cliargs.start, cliargs.end)) - if cliargs.skip_wait_install: - logger.info(" * Skip waiting for cluster install completion") - else: - if cliargs.wait_cluster_max > 0: - logger.info(" * Wait for cluster install completion (Max {}s)".format(cliargs.wait_cluster_max)) - else: - logger.info(" * Wait for cluster install completion (Infinite wait)") - if not cliargs.wait_du_profile: - logger.info(" * Skip waiting for DU Profile completion") - else: - if cliargs.wait_du_profile_max > 0: - logger.info(" * Wait for DU Profile completion (Max {}s)".format(cliargs.wait_du_profile_max)) - else: - logger.info(" * Wait for DU Profile completion (Infinite wait)") - if not cliargs.wait_playbook: - logger.info(" * Skip waiting for Playbook completion") - else: - if cliargs.wait_playbook_max > 0: - logger.info(" * Wait for Playbook 
completion (Max {}s)".format(cliargs.wait_playbook_max)) - else: - logger.info(" * Wait for Playbook completion (Infinite wait)") # Determine where the report directory will be located base_dir = os.path.dirname(os.path.realpath(sys.argv[0])) @@ -386,14 +371,9 @@ def main(): base_dir_results = os.path.join(base_dir_down, "results") report_dir_name = "{}-{}-{}".format(datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y%m%d-%H%M%S"), cliargs.method, cliargs.results_dir_suffix) report_dir = os.path.join(base_dir_results, report_dir_name) - logger.info("Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) monitor_data_csv_file = "{}/monitor_data.csv".format(report_dir) - logger.info("Monitoring data captured to: {}".format("/".join(monitor_data_csv_file.split("/")[-3:]))) - logger.info(" * Monitoring interval: {}".format(cliargs.monitor_interval)) - phase_break() - # Get starting data and list directories for manifests/siteconfigs/cluster applications available_clusters = 0 cluster_list = [] @@ -457,8 +437,54 @@ def main(): random.shuffle(cluster_list) logger.debug("Randomized the cluster order: {}".format(cluster_list)) + # Display workload parameters + phase_break() + logger.info("Workload Parameters") + logger.info(" * Method: {}".format(cliargs.method)) + logger.info(" * Rate: {}".format(cliargs.rate)) + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment):") + if cliargs.rate == "interval": + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval, str(timedelta(seconds=cliargs.interval)))) + logger.info(" * Available clusters: {}".format(available_clusters)) + logger.info(" * Cluster range: {} to {}".format(cliargs.start, cliargs.end)) + logger.info(" * Clusters per ZTP argoCD application: {}".format(cliargs.clusters_per_app)) + if 
cliargs.skip_wait_install: + logger.info(" * Skip waiting for cluster install completion") + else: + if cliargs.wait_cluster_max > 0: + logger.info(" * Wait for cluster install completion (Max {}s :: {})".format( + cliargs.wait_cluster_max, str(timedelta(seconds=cliargs.wait_cluster_max)))) + else: + logger.info(" * Wait for cluster install completion (Infinite wait)") + if not cliargs.wait_du_profile: + logger.info(" * Skip waiting for DU Profile completion") + else: + if cliargs.wait_du_profile_max > 0: + logger.info(" * Wait for DU Profile completion (Max {}s :: {})".format( + cliargs.wait_du_profile_max, str(timedelta(seconds=cliargs.wait_du_profile_max)))) + else: + logger.info(" * Wait for DU Profile completion (Infinite wait)") + if not cliargs.wait_playbook: + logger.info(" * Skip waiting for Playbook completion") + else: + if cliargs.wait_playbook_max > 0: + logger.info(" * Wait for Playbook completion (Max {}s :: {})".format( + cliargs.wait_playbook_max, str(timedelta(seconds=cliargs.wait_playbook_max)))) + else: + logger.info(" * Wait for Playbook completion (Infinite wait)") + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Monitor interval: {}s".format(cliargs.monitor_interval)) + logger.info(" * Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) + phase_break() + # Create the results directory to store data into - logger.info("Creating report directory: {}".format(report_dir)) + logger.debug("Creating report directory: {}".format(report_dir)) os.mkdir(report_dir) ############################################################################# @@ -484,13 +510,35 @@ def main(): "playbook_running": 0, "playbook_completed": 0 } - monitor_thread = ZTPMonitor(cliargs.method, talm_minor, monitor_data, 
monitor_data_csv_file, cliargs.dry_run, cliargs.monitor_interval) + monitor_thread = ZTPMonitor(cliargs.method, talm_minor, monitor_data, monitor_data_csv_file, cliargs.dry_run, cliargs.monitor_interval, cliargs.kubeconfig) monitor_thread.start() + + ############################################################################# + # Phase 1: Idle Baseline + ############################################################################# if cliargs.start_delay > 0: phase_break() - logger.info("Sleeping {}s for start delay".format(cliargs.start_delay)) - time.sleep(cliargs.start_delay) + logger.info("Phase 1: Idle Baseline - Sleeping {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + remaining_start_delay = cliargs.start_delay + while remaining_start_delay > 300: + time.sleep(300) + remaining_start_delay -= 300 + logger.info("{}s :: {} remaining in idle baseline".format( + remaining_start_delay, str(timedelta(seconds=remaining_start_delay)))) + time.sleep(remaining_start_delay) deploy_start_time = time.time() + + # Phase 1 Prometheus analysis: idle baseline window + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase1-idle-baseline", + start_time, deploy_start_time, + cliargs.kubeconfig, base_dir) + + ############################################################################# + # Phase 2: Cluster Deployment + ############################################################################# if cliargs.rate == "interval": phase_break() logger.info("Starting interval based cluster deployment rate - {}".format(int(time.time() * 1000))) @@ -518,7 +566,7 @@ def main(): # Apply the clusters for cluster in cluster_list[start_cluster_index:end_cluster_index]: monitor_data["cluster_applied_committed"] += 1 - oc_cmd = ["oc", "apply", "-f", cluster] + oc_cmd = ["oc", "--kubeconfig", cliargs.kubeconfig, "apply", "-f", cluster] # Might need to add retries and have method to count retries rc, output = 
command(oc_cmd, cliargs.dry_run) if rc != 0: @@ -659,13 +707,37 @@ def main(): wait_logger = 0 wait_playbook_end_time = time.time() - end_time = time.time() + # Phase 2 Prometheus analysis: cluster deployment window (deploy through all wait phases) + soak_start_time = time.time() + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase2-cluster-deployment", + deploy_start_time, soak_start_time, + cliargs.kubeconfig, base_dir) - # End of Workload delay + ############################################################################# + # Phase 3: Soak Baseline + ############################################################################# if cliargs.end_delay > 0: phase_break() - logger.info("Sleeping {}s for end delay".format(cliargs.end_delay)) - time.sleep(cliargs.end_delay) + logger.info("Phase 3: Soak Baseline - Sleeping {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + remaining_end_delay = cliargs.end_delay + while remaining_end_delay > 300: + time.sleep(300) + remaining_end_delay -= 300 + logger.info("{}s :: {} remaining in soak baseline".format( + remaining_end_delay, str(timedelta(seconds=remaining_end_delay)))) + time.sleep(remaining_end_delay) + + end_time = time.time() + + # Phase 3 Prometheus analysis: soak baseline window + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase3-soak-baseline", + soak_start_time, end_time, + cliargs.kubeconfig, base_dir) # Stop monitoring thread logger.info("Stopping monitoring thread may take up to: {}".format(cliargs.monitor_interval)) @@ -675,10 +747,10 @@ def main(): ############################################################################# # Report Card / Graph Phase ############################################################################# - generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, + generate_deploy_load_report(start_time, end_time, 
deploy_start_time, deploy_end_time, wait_cluster_start_time, wait_cluster_end_time, wait_du_profile_start_time, wait_du_profile_end_time, - wait_playbook_start_time, wait_playbook_end_time, available_clusters, monitor_data, - cliargs, total_intervals, report_dir) + wait_playbook_start_time, wait_playbook_end_time, soak_start_time, + available_clusters, monitor_data, cliargs, total_intervals, report_dir) if __name__ == "__main__": sys.exit(main()) diff --git a/acm-deploy-load/acm-telco-core-load.py b/acm-deploy-load/acm-telco-core-load.py index 8c1f3b95..16bf8b46 100755 --- a/acm-deploy-load/acm-telco-core-load.py +++ b/acm-deploy-load/acm-telco-core-load.py @@ -22,13 +22,15 @@ import glob from math import ceil from jinja2 import Template +from utils.analysis import launch_prometheus_analysis from utils.command import command +from utils.common_ocp import validate_kubeconfig +from utils.output import generate_telco_core_load_report from utils.output import log_write from utils.output import phase_break import logging import os import shutil -import subprocess import sys import time @@ -111,46 +113,6 @@ def update_policy_cm(policy_ns, cm_name, policy_keys, policy_dir, hub_kc): logger.debug(output.strip()) -def launch_prometheus_analysis(report_dir, phase_name, start_ts, end_ts, kubeconfig, base_dir): - """Launch analyze-prometheus.py in the background for the given time window.""" - analyzer_script = os.path.join(base_dir, "analyze-prometheus.py") - if not os.path.isfile(analyzer_script): - logger.warning("analyze-prometheus.py not found at {}, skipping phase {}".format(analyzer_script, phase_name)) - return - start_str = datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - end_str = datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - duration_seconds = round(end_ts - start_ts) - if duration_seconds < 900: - logger.warning("Skipping prometheus analysis phase {}: window {}s < 15 minutes".format(phase_name, 
duration_seconds)) - return - # No buffer time since script is running against end time that is less than 5 minutes from now - cmd = [ - sys.executable, - analyzer_script, - "-k", kubeconfig, - "-s", start_str, - "-e", end_str, - "-b", "0", - "-p", phase_name, - report_dir, - ] - logger.info("Prometheus analysis command: {}".format(" ".join(cmd))) - log_file = os.path.join(report_dir, "pa-{}.log".format(phase_name)) - try: - with open(log_file, "w") as f: - proc = subprocess.Popen( - cmd, - stdout=f, - stderr=subprocess.STDOUT, - cwd=base_dir, - start_new_session=True, - ) - logger.info("Launched prometheus analysis phase '{}' in background (pid {}, log: {})".format( - phase_name, proc.pid, os.path.basename(log_file))) - except Exception as e: - logger.warning("Failed to launch prometheus analysis for phase {}: {}".format(phase_name, e)) - - def main(): start_time = time.time() @@ -186,9 +148,10 @@ def main(): parser.add_argument("--max-policy-intervals", type=int, default=10, help="Maximum number of policy intervals to run (Used with --no-deploy only)") - # Delay args are idle time before and after the workload - parser.add_argument("-s", "--start-delay", type=int, default=120, help="Delay on start of script") - parser.add_argument("-e", "--end-delay", type=int, default=120, help="Delay on end of script") + parser.add_argument("-s", "--start-delay", type=int, default=120, + help="Phase 1 / Idle baseline delay before starting deploys (seconds)") + parser.add_argument("-e", "--end-delay", type=int, default=120, + help="Phase 3 / Soak baseline delay after deploys complete (seconds)") parser.add_argument("-t", "--results-dir-suffix", type=str, default="test-00", help="Suffix to be appended to results directory name") @@ -207,6 +170,8 @@ def main(): parser.error("Cannot set both --no-deploy and --no-policy. 
Modes are: Deploy+Policy (default), " "Deploy only (--no-policy), or Policy only (--no-deploy).") + validate_kubeconfig(cliargs.kubeconfig) + phase_break() logger.info("ACM Telco Core Load") phase_break() @@ -222,6 +187,7 @@ def main(): logger.info("Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) clusterinstance_files = [] + deploy_batch_count = 0 if cliargs.no_deploy == False: # Detect all clusterinstance file manifests to be deployed logger.info("Checking {}clusterinstance/ for cluster instance manifests".format(cliargs.cluster_manifests)) @@ -238,39 +204,58 @@ def main(): phase_break() logger.info("Workload Parameters") - if cliargs.no_deploy == False: + if cliargs.no_deploy == False and cliargs.no_policy == False: # Subtract 1 from deploy batch count to account that on last batch, the phase will wait for last_deploy_runtime instead of interval_deploy expected_run_time = cliargs.start_delay + (deploy_batch_count - 1) * cliargs.interval_deploy + cliargs.last_deploy_runtime + cliargs.end_delay - if cliargs.no_policy == False: - logger.info("* Mode: Deploy+Policy") - else: - logger.info("* Mode: Deploy Clusters only") - logger.info(f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - logger.info(f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - logger.info(f" * Available clusters: {len(clusterinstance_files)}") - logger.info(f" * Total batches: {deploy_batch_count}") - logger.info(f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - if cliargs.no_policy == False: - logger.info(f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - else: - logger.info(f" * No policy updates") - logger.info(f" * End delay: {cliargs.end_delay}s :: 
{str(timedelta(seconds=cliargs.end_delay))}") + logger.info(" * Mode: Deploy+Policy") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment + Policy Updates):") + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + logger.info(" * Available clusters: {}".format(len(clusterinstance_files))) + logger.info(" * Total batches: {}".format(deploy_batch_count)) + logger.info(" * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + logger.info(" * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) if not cliargs.no_prometheus_analysis: logger.info(" * Run analyze-prometheus.py in background at phase boundaries") - logger.info(f"* Expected run time: {expected_run_time}s :: {str(timedelta(seconds=expected_run_time))}") - elif cliargs.no_deploy == True: + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) + elif cliargs.no_deploy == False and cliargs.no_policy == True: + expected_run_time = cliargs.start_delay + (deploy_batch_count - 1) * cliargs.interval_deploy + cliargs.last_deploy_runtime + cliargs.end_delay + logger.info(" * Mode: Deploy Clusters only") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment):") + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, 
cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + logger.info(" * Available clusters: {}".format(len(clusterinstance_files))) + logger.info(" * Total batches: {}".format(deploy_batch_count)) + logger.info(" * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) + elif cliargs.no_deploy == True and cliargs.no_policy == False: # Subtract 1 from max policy intervals to account that on last interval, loop ends immediately expected_run_time = cliargs.start_delay + (cliargs.max_policy_intervals - 1) * cliargs.interval_policy + cliargs.end_delay - logger.info("* Mode: Policy configmap updates only") - logger.info(f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - logger.info(f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - logger.info(f" * Maximum number of policy intervals to run: {cliargs.max_policy_intervals}") - logger.info(f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - logger.info(f"* Expected run time: {expected_run_time}s :: {str(timedelta(seconds=expected_run_time))}") - else: - # Should not occur due to cliargs check above - logger.error("* Invalid mode.") - sys.exit(1) + logger.info(" * Mode: Policy configmap updates only") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Policy 
Updates):") + logger.info(" * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + logger.info(" * Maximum number of policy intervals to run: {}".format(cliargs.max_policy_intervals)) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) phase_break() # Detect a policy configmap @@ -295,33 +280,33 @@ def main(): os.mkdir(report_dir) os.mkdir(policy_dir) - ################################### - # Phase 1 of workload: Start delay - ################################### - # Start of workload with start delay + ############################################################################# + # Phase 1: Idle Baseline + ############################################################################# workload_start_time = time.time() if cliargs.start_delay > 0: phase_break() - logger.info("Sleeping {}s :: {} for start delay".format(cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) - total_start_delay = cliargs.start_delay - while(total_start_delay > 300): + logger.info("Phase 1: Idle Baseline - Sleeping {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + remaining_start_delay = cliargs.start_delay + while remaining_start_delay > 300: time.sleep(300) - total_start_delay -= 300 - logger.info("{}s :: {} remaining in start delay".format(total_start_delay, str(timedelta(seconds=total_start_delay)))) - # Sleep remaining less than 5 minutes time - time.sleep(total_start_delay) + remaining_start_delay -= 300 + logger.info("{}s :: {} remaining in idle baseline".format( + remaining_start_delay, 
str(timedelta(seconds=remaining_start_delay)))) + time.sleep(remaining_start_delay) start_delay_complete_ts = time.time() - # Phase 1 Prometheus analysis: start delay window + # Phase 1 Prometheus analysis: idle baseline window if not cliargs.no_prometheus_analysis and cliargs.no_deploy == False: launch_prometheus_analysis( - report_dir, "phase1-start-delay", + report_dir, "phase1-idle-baseline", workload_start_time, start_delay_complete_ts, cliargs.kubeconfig, base_dir) - ################################### - # Phase 2 of workload: Deploy clusters and/or update policy configmap - ################################### + ############################################################################# + # Phase 2: Cluster Deployment and/or Policy Updates + ############################################################################# total_clusters_deployed = 0 total_policy_cm_updates = 0 deployed_clusters = [] @@ -334,7 +319,7 @@ def main(): next_policy_time = next_deploy_time last_logged = start_delay_complete_ts phase_break() - logger.info("Begin Telco Core ACM Load - {}".format(int(time.time() * 1000))) + logger.info("Phase 2: Begin Telco Core ACM Load - {}".format(int(time.time() * 1000))) phase_break() current_time = time.time() while True: @@ -425,72 +410,37 @@ def main(): current_time = time.time() # End run loop - ################################### - # Phase 3 of workload: End delay - ################################### + ############################################################################# + # Phase 3: Soak Baseline + ############################################################################# end_delay_start_ts = time.time() if cliargs.end_delay > 0: phase_break() - logger.info("Sleeping {}s :: {} for end delay".format(cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) - total_end_delay = cliargs.end_delay - while(total_end_delay > 300): + logger.info("Phase 3: Soak Baseline - Sleeping {}s :: {}".format( + cliargs.end_delay, 
str(timedelta(seconds=cliargs.end_delay)))) + remaining_end_delay = cliargs.end_delay + while remaining_end_delay > 300: time.sleep(300) - total_end_delay -= 300 - logger.info("{}s :: {} remaining in end delay".format(total_end_delay, str(timedelta(seconds=total_end_delay)))) - # Sleep remaining less than 5 minutes time - time.sleep(total_end_delay) + remaining_end_delay -= 300 + logger.info("{}s :: {} remaining in soak baseline".format( + remaining_end_delay, str(timedelta(seconds=remaining_end_delay)))) + time.sleep(remaining_end_delay) end_time = time.time() - # Phase 3 Prometheus analysis: end delay window + # Phase 3 Prometheus analysis: soak baseline window if not cliargs.no_prometheus_analysis and cliargs.no_deploy == False: launch_prometheus_analysis( - report_dir, "phase3-end-delay", + report_dir, "phase3-soak-baseline", end_delay_start_ts, end_time, cliargs.kubeconfig, base_dir) - total_elapsed_time = round(end_time - workload_start_time) - # Make a report card - with open("{}/report.txt".format(report_dir), "w") as report: - phase_break(True, report) - log_write(report, "acm-telco-core-load Report Card") - phase_break(True, report) - log_write(report, "Workload Parameters") - if cliargs.no_deploy == False and cliargs.no_policy == False: - log_write(report, "* Mode: Deploy+Policy") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - log_write(report, f" * Available clusters: {len(clusterinstance_files)}") - log_write(report, f" * Total batches: {deploy_batch_count}") - log_write(report, f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - log_write(report, f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per 
{cliargs.interval_policy}s interval") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - elif cliargs.no_deploy == False and cliargs.no_policy == True: - log_write(report, "* Mode: Deploy Clusters only") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - log_write(report, f" * Available clusters: {len(clusterinstance_files)}") - log_write(report, f" * Total batches: {deploy_batch_count}") - log_write(report, f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - elif cliargs.no_deploy == True and cliargs.no_policy == False: - log_write(report, "* Mode: Policy configmap updates only") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - log_write(report, f" * Maximum number of policy intervals to run: {cliargs.max_policy_intervals}") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - log_write(report, "Workload Results") - log_write(report, " * Total elapsed time: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) - log_write(report, " * Total cluster(s) deployed: {}".format(total_clusters_deployed)) - log_write(report, " * Total policy cm updates: {}".format(total_policy_cm_updates)) - log_write(report, "Workload Timestamps") - log_write(report, " * Start Time: {} 
{}".format(datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(workload_start_time * 1000))) - log_write(report, " * Start Delay Complete Time: {}".format(datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - for i, ts in enumerate(cluster_deployed_timestamps): - log_write(report, " * Cluster(s) Batch {} deployed: {}".format(i, datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - log_write(report, " * End Delay Start Time: {}".format(datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - log_write(report, " * End Time: {} {}".format(datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(end_time * 1000))) + generate_telco_core_load_report(workload_start_time, end_time, start_delay_complete_ts, + end_delay_start_ts, cluster_deployed_timestamps, total_clusters_deployed, + total_policy_cm_updates, len(clusterinstance_files), deploy_batch_count, + cliargs, report_dir) + total_elapsed_time = round(end_time - workload_start_time) logger.info("Took {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) diff --git a/acm-deploy-load/utils/analysis.py b/acm-deploy-load/utils/analysis.py new file mode 100644 index 00000000..033ed924 --- /dev/null +++ b/acm-deploy-load/utils/analysis.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2026 Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from datetime import datetime, timezone +import logging +import os +import subprocess +import sys + +logger = logging.getLogger("acm-deploy-load") + + +def launch_prometheus_analysis(report_dir, phase_name, start_ts, end_ts, kubeconfig, base_dir): + """Launch analyze-prometheus.py in the background for the given time window.""" + analyzer_script = os.path.join(base_dir, "analyze-prometheus.py") + if not os.path.isfile(analyzer_script): + logger.warning("analyze-prometheus.py not found at {}, skipping phase {}".format(analyzer_script, phase_name)) + return + start_str = datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + end_str = datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + duration_seconds = round(end_ts - start_ts) + if duration_seconds < 900: + logger.warning("Skipping prometheus analysis phase {}: window {}s < 15 minutes".format(phase_name, duration_seconds)) + return + # No buffer time since script is running against end time that is less than 5 minutes from now + cmd = [ + sys.executable, + analyzer_script, + "-k", kubeconfig, + "-s", start_str, + "-e", end_str, + "-b", "0", + "-p", phase_name, + report_dir, + ] + logger.info("Prometheus analysis command: {}".format(" ".join(cmd))) + log_file = os.path.join(report_dir, "pa-{}.log".format(phase_name)) + try: + with open(log_file, "w") as f: + proc = subprocess.Popen( + cmd, + stdout=f, + stderr=subprocess.STDOUT, + cwd=base_dir, + start_new_session=True, + ) + logger.info("Launched prometheus analysis phase '{}' in background (pid {}, log: {})".format( + phase_name, proc.pid, os.path.basename(log_file))) + except Exception as e: + logger.warning("Failed to launch prometheus analysis for phase {}: {}".format(phase_name, e)) diff --git a/acm-deploy-load/utils/common_ocp.py b/acm-deploy-load/utils/common_ocp.py index 1e7149d1..d943078d 
100644 --- a/acm-deploy-load/utils/common_ocp.py +++ b/acm-deploy-load/utils/common_ocp.py @@ -204,3 +204,12 @@ def get_thanos_querier_route(kubeconfig): else: logger.error("Failed to find route for thanos-querier") return "" + + +def validate_kubeconfig(kubeconfig): + oc_cmd = ["oc", "--kubeconfig", kubeconfig, "whoami"] + rc, output = command(oc_cmd, False, no_log=True) + if rc != 0: + logger.error("Kubeconfig validation failed (oc whoami rc: {}): {}".format(rc, kubeconfig)) + sys.exit(1) + logger.info("Kubeconfig validated, connected as: {}".format(output.strip())) diff --git a/acm-deploy-load/utils/output.py b/acm-deploy-load/utils/output.py index 494dedbc..9847756a 100644 --- a/acm-deploy-load/utils/output.py +++ b/acm-deploy-load/utils/output.py @@ -46,15 +46,26 @@ def assemble_stats(the_list, seconds=True): return "{} :: {} :: {} :: {} :: {} :: {}".format(stats_min, stats_avg, stats_p50, stats_p95, stats_p99, stats_max) -def generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, +def generate_deploy_load_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, wait_cluster_end_time, wait_du_profile_start_time, wait_du_profile_end_time, wait_playbook_start_time, - wait_playbook_end_time, available_clusters, monitor_data, cliargs, total_intervals, report_dir): + wait_playbook_end_time, soak_start_time, available_clusters, monitor_data, cliargs, total_intervals, report_dir): + # Timestamps define three workload phases: + # Phase 1 (Idle Baseline): start_time -> deploy_start_time + # Phase 2 (Cluster Deployment): deploy_start_time -> soak_start_time + # - Manifest apply: deploy_start_time -> deploy_end_time + # - Wait cluster install: wait_cluster_start_time -> wait_cluster_end_time + # - Wait DU profile: wait_du_profile_start_time -> wait_du_profile_end_time + # - Wait playbook: wait_playbook_start_time -> wait_playbook_end_time + # Phase 3 (Soak Baseline): soak_start_time -> end_time 
# Determine result data + total_idle_baseline_time = round(deploy_start_time - start_time) + total_phase2_time = round(soak_start_time - deploy_start_time) total_deploy_time = round(deploy_end_time - deploy_start_time) total_cluster_install_time = round(wait_cluster_end_time - wait_cluster_start_time) total_duprofile_time = round(wait_du_profile_end_time - wait_du_profile_start_time) total_playbook_time = round(wait_playbook_end_time - wait_playbook_start_time) + total_soak_baseline_time = round(end_time - soak_start_time) total_time = round(end_time - start_time) success_cluster_percent = 0 failed_cluster_percent = 0 @@ -132,29 +143,157 @@ def generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wa log_write(report, " * Overall Success (DU Compliant / Deployed): {} / {}".format(monitor_data["policy_compliant"], monitor_data["cluster_applied_committed"])) log_write(report, " * Overall Success Percent: {}%".format(success_overall_percent)) log_write(report, " * Overall Failed Percent: {}%".format(failed_overall_percent)) - log_write(report, "Deployed Cluster Orchestration") + log_write(report, "Workload Parameters") log_write(report, " * Method: {}".format(cliargs.method)) log_write(report, " * Rate: {}".format(cliargs.rate)) - log_write(report, " * Cluster Start: {} End: {}".format(cliargs.start, cliargs.end)) - log_write(report, " * {} cluster(s) per ZTP argoCD application".format(cliargs.clusters_per_app)) + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 (Cluster Deployment):") if cliargs.rate == "interval": - log_write(report, " * {} cluster(s) per {}s interval".format(cliargs.batch, cliargs.interval)) - log_write(report, " * Actual Intervals: {}".format(total_intervals)) - log_write(report, " * Wan Emulation: {}".format(cliargs.wan_emulation)) - log_write(report, "Workload Duration Results") + 
log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval, str(timedelta(seconds=cliargs.interval)))) + log_write(report, " * Cluster range: {} to {}".format(cliargs.start, cliargs.end)) + log_write(report, " * Clusters per ZTP argoCD application: {}".format(cliargs.clusters_per_app)) + log_write(report, " * Actual intervals: {}".format(total_intervals)) + if cliargs.skip_wait_install: + log_write(report, " * Skip waiting for cluster install completion") + else: + if cliargs.wait_cluster_max > 0: + log_write(report, " * Wait for cluster install completion (Max {}s :: {})".format( + cliargs.wait_cluster_max, str(timedelta(seconds=cliargs.wait_cluster_max)))) + else: + log_write(report, " * Wait for cluster install completion (Infinite wait)") + if not cliargs.wait_du_profile: + log_write(report, " * Skip waiting for DU Profile completion") + else: + if cliargs.wait_du_profile_max > 0: + log_write(report, " * Wait for DU Profile completion (Max {}s :: {})".format( + cliargs.wait_du_profile_max, str(timedelta(seconds=cliargs.wait_du_profile_max)))) + else: + log_write(report, " * Wait for DU Profile completion (Infinite wait)") + if not cliargs.wait_playbook: + log_write(report, " * Skip waiting for Playbook completion") + else: + if cliargs.wait_playbook_max > 0: + log_write(report, " * Wait for Playbook completion (Max {}s :: {})".format( + cliargs.wait_playbook_max, str(timedelta(seconds=cliargs.wait_playbook_max)))) + else: + log_write(report, " * Wait for Playbook completion (Infinite wait)") + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + log_write(report, " * Monitor interval: {}s".format(cliargs.monitor_interval)) + if cliargs.wan_emulation: + log_write(report, " * Wan Emulation: {}".format(cliargs.wan_emulation)) + log_write(report, "Workload Phases") log_write(report, " * Start Time: {} {}".format( 
datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(start_time * 1000))) log_write(report, " * End Time: {} {}".format( datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(end_time * 1000))) - log_write(report, " * Cluster Deploying duration: {}s :: {}".format(total_deploy_time, str(timedelta(seconds=total_deploy_time)))) + log_write(report, " * Total duration: {}s :: {}".format(total_time, str(timedelta(seconds=total_time)))) + log_write(report, " * Phase 1 (Idle Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(deploy_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_idle_baseline_time, str(timedelta(seconds=total_idle_baseline_time)))) + log_write(report, " * Phase 2 (Cluster Deployment): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(deploy_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(soak_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_phase2_time, str(timedelta(seconds=total_phase2_time)))) + log_write(report, " * Cluster Deploying duration: {}s :: {}".format(total_deploy_time, str(timedelta(seconds=total_deploy_time)))) if not cliargs.skip_wait_install: - log_write(report, " * Cluster Install wait duration: {}s :: {}".format(total_cluster_install_time, str(timedelta(seconds=total_cluster_install_time)))) + log_write(report, " * Cluster Install wait duration: {}s :: {}".format(total_cluster_install_time, str(timedelta(seconds=total_cluster_install_time)))) if cliargs.wait_du_profile: - log_write(report, " * DU Profile wait duration: {}s :: {}".format(total_duprofile_time, str(timedelta(seconds=total_duprofile_time)))) + log_write(report, " * DU Profile wait duration: {}s :: {}".format(total_duprofile_time, str(timedelta(seconds=total_duprofile_time)))) if cliargs.wait_playbook: - log_write(report, " * 
Playbook wait duration: {}s :: {}".format(total_playbook_time, str(timedelta(seconds=total_playbook_time)))) - log_write(report, " * Total duration: {}s :: {}".format(total_time, str(timedelta(seconds=total_time)))) - # Done outputing the report card + log_write(report, " * Playbook wait duration: {}s :: {}".format(total_playbook_time, str(timedelta(seconds=total_playbook_time)))) + log_write(report, " * Phase 3 (Soak Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(soak_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_soak_baseline_time, str(timedelta(seconds=total_soak_baseline_time)))) + + +def generate_telco_core_load_report(workload_start_time, end_time, start_delay_complete_ts, + end_delay_start_ts, cluster_deployed_timestamps, total_clusters_deployed, + total_policy_cm_updates, available_clusters, deploy_batch_count, cliargs, report_dir): + + total_elapsed_time = round(end_time - workload_start_time) + total_idle_baseline_time = round(start_delay_complete_ts - workload_start_time) + total_phase2_time = round(end_delay_start_ts - start_delay_complete_ts) + total_soak_baseline_time = round(end_time - end_delay_start_ts) + + with open("{}/report.txt".format(report_dir), "w") as report: + phase_break(True, report) + log_write(report, "acm-telco-core-load Report Card") + phase_break(True, report) + log_write(report, "Workload Parameters") + if cliargs.no_deploy == False and cliargs.no_policy == False: + phase2_label = "Cluster Deployment + Policy Updates" + log_write(report, " * Mode: Deploy+Policy") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 ({}):".format(phase2_label)) + log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, 
str(timedelta(seconds=cliargs.interval_deploy)))) + log_write(report, " * Available clusters: {}".format(available_clusters)) + log_write(report, " * Total batches: {}".format(deploy_batch_count)) + log_write(report, " * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + log_write(report, " * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + elif cliargs.no_deploy == False and cliargs.no_policy == True: + phase2_label = "Cluster Deployment" + log_write(report, " * Mode: Deploy Clusters only") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 ({}):".format(phase2_label)) + log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + log_write(report, " * Available clusters: {}".format(available_clusters)) + log_write(report, " * Total batches: {}".format(deploy_batch_count)) + log_write(report, " * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + elif cliargs.no_deploy == True and cliargs.no_policy == False: + phase2_label = "Policy Updates" + log_write(report, " * Mode: Policy configmap updates only") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 
({}):".format(phase2_label)) + log_write(report, " * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + log_write(report, " * Maximum number of policy intervals to run: {}".format(cliargs.max_policy_intervals)) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + log_write(report, "Workload Results") + log_write(report, " * Total elapsed time: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) + log_write(report, " * Total cluster(s) deployed: {}".format(total_clusters_deployed)) + log_write(report, " * Total policy cm updates: {}".format(total_policy_cm_updates)) + log_write(report, "Workload Phases") + log_write(report, " * Start Time: {} {}".format( + datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + int(workload_start_time * 1000))) + log_write(report, " * End Time: {} {}".format( + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + int(end_time * 1000))) + log_write(report, " * Total duration: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) + log_write(report, " * Phase 1 (Idle Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_idle_baseline_time, str(timedelta(seconds=total_idle_baseline_time)))) + log_write(report, " * Phase 2 ({}): {} to {} :: {}s :: {}".format(phase2_label, + datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_phase2_time, 
str(timedelta(seconds=total_phase2_time)))) + for i, ts in enumerate(cluster_deployed_timestamps): + log_write(report, " * Cluster(s) Batch {} deployed: {}".format( + i, datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) + log_write(report, " * Phase 3 (Soak Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_soak_baseline_time, str(timedelta(seconds=total_soak_baseline_time)))) + def log_write(file, message): logger.info(message) diff --git a/acm-deploy-load/utils/ztp_monitor.py b/acm-deploy-load/utils/ztp_monitor.py index 9f70e79d..468f9192 100644 --- a/acm-deploy-load/utils/ztp_monitor.py +++ b/acm-deploy-load/utils/ztp_monitor.py @@ -28,7 +28,7 @@ class ZTPMonitor(Thread): - def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_interval): + def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_interval, kubeconfig): super(ZTPMonitor, self).__init__() if method in ["ai-manifest", "ai-clusterinstance", "ai-clusterinstance-gitops", "ai-siteconfig-gitops"]: self.method = "agent" @@ -39,6 +39,7 @@ def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_i self.csv_file = csv_file self.dry_run = dry_run self.sample_interval = sample_interval + self.kubeconfig = kubeconfig self.signal = True def _real_run(self): @@ -52,7 +53,7 @@ def _real_run(self): if self.method == "agent": # Get agentclusterinstall data - oc_cmd = ["oc", "get", "agentclusterinstall", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "agentclusterinstall", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get agentclusterinstall rc: {}".format(rc)) @@ -66,7 +67,7 @@ def _real_run(self): logger.warning("aci 
JSONDecodeError: {}".format(output[:2500])) elif self.method == "image": # Get imageclusterinstall data - oc_cmd = ["oc", "get", "imageclusterinstall", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "imageclusterinstall", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get imageclusterinstall rc: {}".format(rc)) @@ -80,7 +81,7 @@ def _real_run(self): logger.warning("ici JSONDecodeError: {}".format(output[:2500])) # Get baremetalhost data - oc_cmd = ["oc", "get", "baremetalhost", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "baremetalhost", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get baremetalhost rc: {}".format(rc)) @@ -95,7 +96,7 @@ def _real_run(self): if self.method == "agent": # Get agent data - oc_cmd = ["oc", "get", "agent", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "agent", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get agent rc: {}".format(rc)) @@ -112,7 +113,7 @@ def _real_run(self): agent_data = {"items": []} # Get managedcluster data - oc_cmd = ["oc", "get", "managedcluster", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "managedcluster", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get managedcluster rc: {}".format(rc)) @@ -126,7 +127,7 @@ def _real_run(self): logger.warning("mc JSONDecodeError: {}".format(output[:2500])) # Get clustergroupupgrades data - oc_cmd = ["oc", "get", "clustergroupupgrades", "-n", "ztp-install", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "clustergroupupgrades", "-n", "ztp-install", "-o", "json"] rc, output = 
command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get clustergroupupgrades rc: {}".format(rc)) diff --git a/scripts/interval-ztp-install-all.sh b/scripts/interval-ztp-install-all.sh index 258b4103..bf5eba2c 100755 --- a/scripts/interval-ztp-install-all.sh +++ b/scripts/interval-ztp-install-all.sh @@ -6,20 +6,31 @@ set -o pipefail iteration=1 # Method to deploy clusters (AI = Assisted Installer, IBI = Image Based Installer) -# method="ai-siteconfig-gitops" # method="ai-clusterinstance-gitops" method="ibi-clusterinstance-gitops" -# Rate 500/30m +# Phase 1 (Idle baseline) delay in seconds +start_delay=15 + +# Phase 2 (Cluster deployment) rate in clusters per interval +# Rate 500 clusters every 30 minutes interval_period=1800 batch=500 -# Rate 80/5m +# Rate 80 clusters every 5 minutes # interval_period=300 # batch=80 +# Phase 3 (Soak baseline) delay in seconds +end_delay=120 + # SNO or Mixed SNOs and MNOs clusters_per_app=100 +# Prometheus analysis per phase (uncomment to enable) +# Use with longer idle and soak baselines to produce capacity guideline measurements +prometheus_analysis_arg="--no-prometheus-analysis" +# prometheus_analysis_arg="" + # WAN Emulation can only be run with SNOs wan_em="(None)" # wan_em="(50ms/0.02)" @@ -41,7 +52,7 @@ hub_ocp=$(oc version -o json | jq -r '.openshiftVersion') # grep will cause error code 141 since it prints only the first match cluster_ocp=$(cat /root/hv-vm/*/*/*.yml | grep "clusterImageSetNameRef:" -m 1 | awk '{print $NF}' | sed 's/openshift-//' || if [[ $? 
-eq 141 ]]; then true; else exit $?; fi) -time ./acm-deploy-load/acm-deploy-load.py --acm-version "${acm_ver}" --aap-version "${aap_csv}" --test-version "${test_ver}" --hub-version "${hub_ocp}" --deploy-version "${cluster_ocp}" --wan-emulation "${wan_em}" -m "${method}" --clusters-per-app ${clusters_per_app} ${argocd_arg} -w -i 60 -t ${clusters_per_app}cpa-${batch}b-${interval_period}i-${iteration} interval -b ${batch} -i ${interval_period} 2>&1 | tee ${log_file} +time ./acm-deploy-load/acm-deploy-load.py --acm-version "${acm_ver}" --aap-version "${aap_csv}" --test-version "${test_ver}" --hub-version "${hub_ocp}" --deploy-version "${cluster_ocp}" --wan-emulation "${wan_em}" -m "${method}" --clusters-per-app ${clusters_per_app} ${argocd_arg} --start-delay ${start_delay} --end-delay ${end_delay} ${prometheus_analysis_arg} -w -i 60 -t ${clusters_per_app}cpa-${batch}b-${interval_period}i-${iteration} interval -b ${batch} -i ${interval_period} 2>&1 | tee ${log_file} results_dir=$(grep "Results data captured in:" $log_file | awk '{print $NF}') @@ -79,6 +90,7 @@ time ./acm-deploy-load/analyze-ansiblejobs.py ${results_dir} 2>&1 | tee -a ${log echo "################################################################################" 2>&1 | tee -a ${log_file} +# Complete Prometheus analysis for entire workload period start_time=$(grep "Start Time:" ${results_dir}/report.txt | awk '{print $4}') end_time=$(grep "End Time:" ${results_dir}/report.txt | awk '{print $4}') echo "time ./acm-deploy-load/analyze-prometheus.py -p deploy-pa -s ${start_time} -e ${end_time} ${results_dir}" | tee -a ${log_file}