diff --git a/README.md b/README.md index 6d2eb3ff..17c32c66 100644 --- a/README.md +++ b/README.md @@ -91,10 +91,14 @@ Deploys SNO, Compact, or Standard clusters via Assisted Installer or Image-Based **Workload Phases:** -1. Deploy Phase — Apply manifests or push to GitOps to deploy clusters -2. Wait for Cluster Install Completion -3. Wait for DU Profile Completion (optional) -4. Report Card / Graphing +1. Phase 1 / Idle Baseline — Pre-deployment delay for baseline resource measurements (`--start-delay`) +2. Phase 2 / Cluster Deployment — Apply manifests or push to GitOps to deploy clusters + - Wait for Cluster Install Completion + - Wait for DU Profile Completion (optional) + - Wait for Playbook Completion (optional) +3. Phase 3 / Soak Baseline — Post-deployment delay for steady-state resource measurements (`--end-delay`) + +Optional per-phase Prometheus analysis runs automatically at phase boundaries (disable with `--no-prometheus-analysis`). ### acm-telco-core-load.py diff --git a/acm-deploy-load/acm-deploy-load.py b/acm-deploy-load/acm-deploy-load.py index 1f9c12dd..5d183287 100755 --- a/acm-deploy-load/acm-deploy-load.py +++ b/acm-deploy-load/acm-deploy-load.py @@ -21,9 +21,10 @@ from datetime import datetime, timedelta, timezone import glob from jinja2 import Template -from utils.common_ocp import detect_aap_install +from utils.analysis import launch_prometheus_analysis +from utils.common_ocp import detect_aap_install, validate_kubeconfig from utils.command import command -from utils.output import generate_report +from utils.output import generate_deploy_load_report from utils.output import phase_break from utils.ztp_monitor import ZTPMonitor from utils.talm import detect_talm_minor @@ -237,10 +238,12 @@ def main(): description="Tool to load ACM with Cluster deployments via manifests or GitOps ZTP", prog="acm-deploy-load.py", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-m", "--method", choices=install_methods, 
default="ai-siteconfig-gitops", + parser.add_argument("-m", "--method", choices=install_methods, default="ibi-clusterinstance-gitops", help="The method of cluster install, ai - Assisted-Installer, ibi - Image-Based-Installer") # "Global" args + parser.add_argument("-k", "--kubeconfig", type=str, default="/root/mno/kubeconfig", + help="Changes which kubeconfig to connect to the hub cluster") parser.add_argument("-cm", "--cluster-manifests", type=str, default="/root/hv-vm/", help="The location of the cluster manifests, siteconfigs and resource files") parser.add_argument("-a", "--argocd-directory", type=str, @@ -252,9 +255,9 @@ def main(): parser.add_argument("-n", "--no-shuffle", action="store_true", default=False, help="Do not shuffle the list of discovered installable clusters") parser.add_argument("--start-delay", type=int, default=15, - help="Delay to starting deploys, allowing monitor thread to gather data (seconds)") + help="Phase 1 / Idle baseline delay before starting deploys (seconds)") parser.add_argument("--end-delay", type=int, default=120, - help="Delay on end, allows monitor thread to gather additional data points (seconds)") + help="Phase 3 / Soak baseline delay after deploys complete (seconds)") parser.add_argument("--clusters-per-app", type=int, default=100, help="Maximum number of clusters per cluster application") parser.add_argument("--wait-cluster-max", type=int, default=10800, @@ -291,6 +294,10 @@ def main(): parser.add_argument("-d", "--debug", action="store_true", default=False, help="Set log level debug") parser.add_argument("--dry-run", action="store_true", default=False, help="Echos commands instead of executing them") + # Prometheus analysis options + parser.add_argument("--no-prometheus-analysis", action="store_true", default=False, + help="Do not run analyze-prometheus.py in background post each phase") + subparsers = parser.add_subparsers(dest="rate") parser_interval = subparsers.add_parser("interval", help="Interval rate method of 
deploying clusters", @@ -323,20 +330,21 @@ def main(): phase_break() logger.debug("CLI Args: {}".format(cliargs)) + # Validate kubeconfig + validate_kubeconfig(cliargs.kubeconfig) + # Detect TALM version talm_minor = int(detect_talm_minor(cliargs.talm_version, cliargs.dry_run)) logger.info("Using TALM cgu monitoring based on TALM minor version: {}".format(talm_minor)) # Detect AAP install - if detect_aap_install(dry_run=cliargs.dry_run): + if detect_aap_install(cliargs.kubeconfig, cliargs.dry_run): logger.info("AAP install detected, waiting for playbook completion") cliargs.wait_playbook = True else: logger.info("AAP install not detected") - # Validate parameters and display rate and method plan - logger.info("Deploying Clusters rate: {}".format(cliargs.rate)) - logger.info("Deploying Clusters method: {}".format(cliargs.method)) + # Validate parameters if (cliargs.start < 0): logger.error("Cluster start index must be equal to or greater than 0") sys.exit(1) @@ -356,29 +364,6 @@ def main(): if not (cliargs.interval >= 0): logger.error("Interval must be equal to or greater than 0") sys.exit(1) - logger.info(" * {} Cluster(s) per {}s interval".format(cliargs.batch, cliargs.interval)) - logger.info(" * Start Index: {}, End Index: {}".format(cliargs.start, cliargs.end)) - if cliargs.skip_wait_install: - logger.info(" * Skip waiting for cluster install completion") - else: - if cliargs.wait_cluster_max > 0: - logger.info(" * Wait for cluster install completion (Max {}s)".format(cliargs.wait_cluster_max)) - else: - logger.info(" * Wait for cluster install completion (Infinite wait)") - if not cliargs.wait_du_profile: - logger.info(" * Skip waiting for DU Profile completion") - else: - if cliargs.wait_du_profile_max > 0: - logger.info(" * Wait for DU Profile completion (Max {}s)".format(cliargs.wait_du_profile_max)) - else: - logger.info(" * Wait for DU Profile completion (Infinite wait)") - if not cliargs.wait_playbook: - logger.info(" * Skip waiting for Playbook 
completion") - else: - if cliargs.wait_playbook_max > 0: - logger.info(" * Wait for Playbook completion (Max {}s)".format(cliargs.wait_playbook_max)) - else: - logger.info(" * Wait for Playbook completion (Infinite wait)") # Determine where the report directory will be located base_dir = os.path.dirname(os.path.realpath(sys.argv[0])) @@ -386,14 +371,9 @@ def main(): base_dir_results = os.path.join(base_dir_down, "results") report_dir_name = "{}-{}-{}".format(datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y%m%d-%H%M%S"), cliargs.method, cliargs.results_dir_suffix) report_dir = os.path.join(base_dir_results, report_dir_name) - logger.info("Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) monitor_data_csv_file = "{}/monitor_data.csv".format(report_dir) - logger.info("Monitoring data captured to: {}".format("/".join(monitor_data_csv_file.split("/")[-3:]))) - logger.info(" * Monitoring interval: {}".format(cliargs.monitor_interval)) - phase_break() - # Get starting data and list directories for manifests/siteconfigs/cluster applications available_clusters = 0 cluster_list = [] @@ -457,8 +437,54 @@ def main(): random.shuffle(cluster_list) logger.debug("Randomized the cluster order: {}".format(cluster_list)) + # Display workload parameters + phase_break() + logger.info("Workload Parameters") + logger.info(" * Method: {}".format(cliargs.method)) + logger.info(" * Rate: {}".format(cliargs.rate)) + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment):") + if cliargs.rate == "interval": + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval, str(timedelta(seconds=cliargs.interval)))) + logger.info(" * Available clusters: {}".format(available_clusters)) + logger.info(" * Cluster range: {} to {}".format(cliargs.start, cliargs.end)) + logger.info(" * 
Clusters per ZTP argoCD application: {}".format(cliargs.clusters_per_app)) + if cliargs.skip_wait_install: + logger.info(" * Skip waiting for cluster install completion") + else: + if cliargs.wait_cluster_max > 0: + logger.info(" * Wait for cluster install completion (Max {}s :: {})".format( + cliargs.wait_cluster_max, str(timedelta(seconds=cliargs.wait_cluster_max)))) + else: + logger.info(" * Wait for cluster install completion (Infinite wait)") + if not cliargs.wait_du_profile: + logger.info(" * Skip waiting for DU Profile completion") + else: + if cliargs.wait_du_profile_max > 0: + logger.info(" * Wait for DU Profile completion (Max {}s :: {})".format( + cliargs.wait_du_profile_max, str(timedelta(seconds=cliargs.wait_du_profile_max)))) + else: + logger.info(" * Wait for DU Profile completion (Infinite wait)") + if not cliargs.wait_playbook: + logger.info(" * Skip waiting for Playbook completion") + else: + if cliargs.wait_playbook_max > 0: + logger.info(" * Wait for Playbook completion (Max {}s :: {})".format( + cliargs.wait_playbook_max, str(timedelta(seconds=cliargs.wait_playbook_max)))) + else: + logger.info(" * Wait for Playbook completion (Infinite wait)") + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Monitor interval: {}s".format(cliargs.monitor_interval)) + logger.info(" * Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) + phase_break() + # Create the results directory to store data into - logger.info("Creating report directory: {}".format(report_dir)) + logger.debug("Creating report directory: {}".format(report_dir)) os.mkdir(report_dir) ############################################################################# @@ -484,13 +510,35 @@ def main(): "playbook_running": 0, "playbook_completed": 0 } - 
monitor_thread = ZTPMonitor(cliargs.method, talm_minor, monitor_data, monitor_data_csv_file, cliargs.dry_run, cliargs.monitor_interval) + monitor_thread = ZTPMonitor(cliargs.method, talm_minor, monitor_data, monitor_data_csv_file, cliargs.dry_run, cliargs.monitor_interval, cliargs.kubeconfig) monitor_thread.start() + + ############################################################################# + # Phase 1: Idle Baseline + ############################################################################# if cliargs.start_delay > 0: phase_break() - logger.info("Sleeping {}s for start delay".format(cliargs.start_delay)) - time.sleep(cliargs.start_delay) + logger.info("Phase 1: Idle Baseline - Sleeping {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + remaining_start_delay = cliargs.start_delay + while remaining_start_delay > 300: + time.sleep(300) + remaining_start_delay -= 300 + logger.info("{}s :: {} remaining in idle baseline".format( + remaining_start_delay, str(timedelta(seconds=remaining_start_delay)))) + time.sleep(remaining_start_delay) deploy_start_time = time.time() + + # Phase 1 Prometheus analysis: idle baseline window + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase1-idle-baseline", + start_time, deploy_start_time, + cliargs.kubeconfig, base_dir) + + ############################################################################# + # Phase 2: Cluster Deployment + ############################################################################# if cliargs.rate == "interval": phase_break() logger.info("Starting interval based cluster deployment rate - {}".format(int(time.time() * 1000))) @@ -518,7 +566,7 @@ def main(): # Apply the clusters for cluster in cluster_list[start_cluster_index:end_cluster_index]: monitor_data["cluster_applied_committed"] += 1 - oc_cmd = ["oc", "apply", "-f", cluster] + oc_cmd = ["oc", "--kubeconfig", cliargs.kubeconfig, "apply", "-f", cluster] # Might need to 
add retries and have method to count retries rc, output = command(oc_cmd, cliargs.dry_run) if rc != 0: @@ -659,13 +707,37 @@ def main(): wait_logger = 0 wait_playbook_end_time = time.time() - end_time = time.time() + # Phase 2 Prometheus analysis: cluster deployment window (deploy through all wait phases) + soak_start_time = time.time() + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase2-cluster-deployment", + deploy_start_time, soak_start_time, + cliargs.kubeconfig, base_dir) - # End of Workload delay + ############################################################################# + # Phase 3: Soak Baseline + ############################################################################# if cliargs.end_delay > 0: phase_break() - logger.info("Sleeping {}s for end delay".format(cliargs.end_delay)) - time.sleep(cliargs.end_delay) + logger.info("Phase 3: Soak Baseline - Sleeping {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + remaining_end_delay = cliargs.end_delay + while remaining_end_delay > 300: + time.sleep(300) + remaining_end_delay -= 300 + logger.info("{}s :: {} remaining in soak baseline".format( + remaining_end_delay, str(timedelta(seconds=remaining_end_delay)))) + time.sleep(remaining_end_delay) + + end_time = time.time() + + # Phase 3 Prometheus analysis: soak baseline window + if not cliargs.no_prometheus_analysis: + launch_prometheus_analysis( + report_dir, "phase3-soak-baseline", + soak_start_time, end_time, + cliargs.kubeconfig, base_dir) # Stop monitoring thread logger.info("Stopping monitoring thread may take up to: {}".format(cliargs.monitor_interval)) @@ -675,10 +747,10 @@ def main(): ############################################################################# # Report Card / Graph Phase ############################################################################# - generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, + 
generate_deploy_load_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, wait_cluster_end_time, wait_du_profile_start_time, wait_du_profile_end_time, - wait_playbook_start_time, wait_playbook_end_time, available_clusters, monitor_data, - cliargs, total_intervals, report_dir) + wait_playbook_start_time, wait_playbook_end_time, soak_start_time, + available_clusters, monitor_data, cliargs, total_intervals, report_dir) if __name__ == "__main__": sys.exit(main()) diff --git a/acm-deploy-load/acm-telco-core-load.py b/acm-deploy-load/acm-telco-core-load.py index 8c1f3b95..16bf8b46 100755 --- a/acm-deploy-load/acm-telco-core-load.py +++ b/acm-deploy-load/acm-telco-core-load.py @@ -22,13 +22,15 @@ import glob from math import ceil from jinja2 import Template +from utils.analysis import launch_prometheus_analysis from utils.command import command +from utils.common_ocp import validate_kubeconfig +from utils.output import generate_telco_core_load_report from utils.output import log_write from utils.output import phase_break import logging import os import shutil -import subprocess import sys import time @@ -111,46 +113,6 @@ def update_policy_cm(policy_ns, cm_name, policy_keys, policy_dir, hub_kc): logger.debug(output.strip()) -def launch_prometheus_analysis(report_dir, phase_name, start_ts, end_ts, kubeconfig, base_dir): - """Launch analyze-prometheus.py in the background for the given time window.""" - analyzer_script = os.path.join(base_dir, "analyze-prometheus.py") - if not os.path.isfile(analyzer_script): - logger.warning("analyze-prometheus.py not found at {}, skipping phase {}".format(analyzer_script, phase_name)) - return - start_str = datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - end_str = datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - duration_seconds = round(end_ts - start_ts) - if duration_seconds < 900: - logger.warning("Skipping prometheus analysis phase 
{}: window {}s < 15 minutes".format(phase_name, duration_seconds)) - return - # No buffer time since script is running against end time that is less than 5 minutes from now - cmd = [ - sys.executable, - analyzer_script, - "-k", kubeconfig, - "-s", start_str, - "-e", end_str, - "-b", "0", - "-p", phase_name, - report_dir, - ] - logger.info("Prometheus analysis command: {}".format(" ".join(cmd))) - log_file = os.path.join(report_dir, "pa-{}.log".format(phase_name)) - try: - with open(log_file, "w") as f: - proc = subprocess.Popen( - cmd, - stdout=f, - stderr=subprocess.STDOUT, - cwd=base_dir, - start_new_session=True, - ) - logger.info("Launched prometheus analysis phase '{}' in background (pid {}, log: {})".format( - phase_name, proc.pid, os.path.basename(log_file))) - except Exception as e: - logger.warning("Failed to launch prometheus analysis for phase {}: {}".format(phase_name, e)) - - def main(): start_time = time.time() @@ -186,9 +148,10 @@ def main(): parser.add_argument("--max-policy-intervals", type=int, default=10, help="Maximum number of policy intervals to run (Used with --no-deploy only)") - # Delay args are idle time before and after the workload - parser.add_argument("-s", "--start-delay", type=int, default=120, help="Delay on start of script") - parser.add_argument("-e", "--end-delay", type=int, default=120, help="Delay on end of script") + parser.add_argument("-s", "--start-delay", type=int, default=120, + help="Phase 1 / Idle baseline delay before starting deploys (seconds)") + parser.add_argument("-e", "--end-delay", type=int, default=120, + help="Phase 3 / Soak baseline delay after deploys complete (seconds)") parser.add_argument("-t", "--results-dir-suffix", type=str, default="test-00", help="Suffix to be appended to results directory name") @@ -207,6 +170,8 @@ def main(): parser.error("Cannot set both --no-deploy and --no-policy. 
Modes are: Deploy+Policy (default), " "Deploy only (--no-policy), or Policy only (--no-deploy).") + validate_kubeconfig(cliargs.kubeconfig) + phase_break() logger.info("ACM Telco Core Load") phase_break() @@ -222,6 +187,7 @@ def main(): logger.info("Results data captured in: {}".format("/".join(report_dir.split("/")[-2:]))) clusterinstance_files = [] + deploy_batch_count = 0 if cliargs.no_deploy == False: # Detect all clusterinstance file manifests to be deployed logger.info("Checking {}clusterinstance/ for cluster instance manifests".format(cliargs.cluster_manifests)) @@ -238,39 +204,58 @@ def main(): phase_break() logger.info("Workload Parameters") - if cliargs.no_deploy == False: + if cliargs.no_deploy == False and cliargs.no_policy == False: # Subtract 1 from deploy batch count to account that on last batch, the phase will wait for last_deploy_runtime instead of interval_deploy expected_run_time = cliargs.start_delay + (deploy_batch_count - 1) * cliargs.interval_deploy + cliargs.last_deploy_runtime + cliargs.end_delay - if cliargs.no_policy == False: - logger.info("* Mode: Deploy+Policy") - else: - logger.info("* Mode: Deploy Clusters only") - logger.info(f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - logger.info(f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - logger.info(f" * Available clusters: {len(clusterinstance_files)}") - logger.info(f" * Total batches: {deploy_batch_count}") - logger.info(f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - if cliargs.no_policy == False: - logger.info(f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - else: - logger.info(f" * No policy updates") - logger.info(f" * End delay: {cliargs.end_delay}s :: 
{str(timedelta(seconds=cliargs.end_delay))}") + logger.info(" * Mode: Deploy+Policy") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment + Policy Updates):") + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + logger.info(" * Available clusters: {}".format(len(clusterinstance_files))) + logger.info(" * Total batches: {}".format(deploy_batch_count)) + logger.info(" * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + logger.info(" * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) if not cliargs.no_prometheus_analysis: logger.info(" * Run analyze-prometheus.py in background at phase boundaries") - logger.info(f"* Expected run time: {expected_run_time}s :: {str(timedelta(seconds=expected_run_time))}") - elif cliargs.no_deploy == True: + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) + elif cliargs.no_deploy == False and cliargs.no_policy == True: + expected_run_time = cliargs.start_delay + (deploy_batch_count - 1) * cliargs.interval_deploy + cliargs.last_deploy_runtime + cliargs.end_delay + logger.info(" * Mode: Deploy Clusters only") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Cluster Deployment):") + logger.info(" * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, 
cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + logger.info(" * Available clusters: {}".format(len(clusterinstance_files))) + logger.info(" * Total batches: {}".format(deploy_batch_count)) + logger.info(" * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) + elif cliargs.no_deploy == True and cliargs.no_policy == False: # Subtract 1 from max policy intervals to account that on last interval, loop ends immediately expected_run_time = cliargs.start_delay + (cliargs.max_policy_intervals - 1) * cliargs.interval_policy + cliargs.end_delay - logger.info("* Mode: Policy configmap updates only") - logger.info(f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - logger.info(f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - logger.info(f" * Maximum number of policy intervals to run: {cliargs.max_policy_intervals}") - logger.info(f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - logger.info(f"* Expected run time: {expected_run_time}s :: {str(timedelta(seconds=expected_run_time))}") - else: - # Should not occur due to cliargs check above - logger.error("* Invalid mode.") - sys.exit(1) + logger.info(" * Mode: Policy configmap updates only") + logger.info(" * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + logger.info(" * Phase 2 (Policy 
Updates):") + logger.info(" * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + logger.info(" * Maximum number of policy intervals to run: {}".format(cliargs.max_policy_intervals)) + logger.info(" * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + if not cliargs.no_prometheus_analysis: + logger.info(" * Run analyze-prometheus.py in background at phase boundaries") + logger.info(" * Expected run time: {}s :: {}".format(expected_run_time, str(timedelta(seconds=expected_run_time)))) phase_break() # Detect a policy configmap @@ -295,33 +280,33 @@ def main(): os.mkdir(report_dir) os.mkdir(policy_dir) - ################################### - # Phase 1 of workload: Start delay - ################################### - # Start of workload with start delay + ############################################################################# + # Phase 1: Idle Baseline + ############################################################################# workload_start_time = time.time() if cliargs.start_delay > 0: phase_break() - logger.info("Sleeping {}s :: {} for start delay".format(cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) - total_start_delay = cliargs.start_delay - while(total_start_delay > 300): + logger.info("Phase 1: Idle Baseline - Sleeping {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + remaining_start_delay = cliargs.start_delay + while remaining_start_delay > 300: time.sleep(300) - total_start_delay -= 300 - logger.info("{}s :: {} remaining in start delay".format(total_start_delay, str(timedelta(seconds=total_start_delay)))) - # Sleep remaining less than 5 minutes time - time.sleep(total_start_delay) + remaining_start_delay -= 300 + logger.info("{}s :: {} remaining in idle baseline".format( + remaining_start_delay, 
str(timedelta(seconds=remaining_start_delay)))) + time.sleep(remaining_start_delay) start_delay_complete_ts = time.time() - # Phase 1 Prometheus analysis: start delay window + # Phase 1 Prometheus analysis: idle baseline window if not cliargs.no_prometheus_analysis and cliargs.no_deploy == False: launch_prometheus_analysis( - report_dir, "phase1-start-delay", + report_dir, "phase1-idle-baseline", workload_start_time, start_delay_complete_ts, cliargs.kubeconfig, base_dir) - ################################### - # Phase 2 of workload: Deploy clusters and/or update policy configmap - ################################### + ############################################################################# + # Phase 2: Cluster Deployment and/or Policy Updates + ############################################################################# total_clusters_deployed = 0 total_policy_cm_updates = 0 deployed_clusters = [] @@ -334,7 +319,7 @@ def main(): next_policy_time = next_deploy_time last_logged = start_delay_complete_ts phase_break() - logger.info("Begin Telco Core ACM Load - {}".format(int(time.time() * 1000))) + logger.info("Phase 2: Begin Telco Core ACM Load - {}".format(int(time.time() * 1000))) phase_break() current_time = time.time() while True: @@ -425,72 +410,37 @@ def main(): current_time = time.time() # End run loop - ################################### - # Phase 3 of workload: End delay - ################################### + ############################################################################# + # Phase 3: Soak Baseline + ############################################################################# end_delay_start_ts = time.time() if cliargs.end_delay > 0: phase_break() - logger.info("Sleeping {}s :: {} for end delay".format(cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) - total_end_delay = cliargs.end_delay - while(total_end_delay > 300): + logger.info("Phase 3: Soak Baseline - Sleeping {}s :: {}".format( + cliargs.end_delay, 
str(timedelta(seconds=cliargs.end_delay)))) + remaining_end_delay = cliargs.end_delay + while remaining_end_delay > 300: time.sleep(300) - total_end_delay -= 300 - logger.info("{}s :: {} remaining in end delay".format(total_end_delay, str(timedelta(seconds=total_end_delay)))) - # Sleep remaining less than 5 minutes time - time.sleep(total_end_delay) + remaining_end_delay -= 300 + logger.info("{}s :: {} remaining in soak baseline".format( + remaining_end_delay, str(timedelta(seconds=remaining_end_delay)))) + time.sleep(remaining_end_delay) end_time = time.time() - # Phase 3 Prometheus analysis: end delay window + # Phase 3 Prometheus analysis: soak baseline window if not cliargs.no_prometheus_analysis and cliargs.no_deploy == False: launch_prometheus_analysis( - report_dir, "phase3-end-delay", + report_dir, "phase3-soak-baseline", end_delay_start_ts, end_time, cliargs.kubeconfig, base_dir) - total_elapsed_time = round(end_time - workload_start_time) - # Make a report card - with open("{}/report.txt".format(report_dir), "w") as report: - phase_break(True, report) - log_write(report, "acm-telco-core-load Report Card") - phase_break(True, report) - log_write(report, "Workload Parameters") - if cliargs.no_deploy == False and cliargs.no_policy == False: - log_write(report, "* Mode: Deploy+Policy") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - log_write(report, f" * Available clusters: {len(clusterinstance_files)}") - log_write(report, f" * Total batches: {deploy_batch_count}") - log_write(report, f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - log_write(report, f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per 
{cliargs.interval_policy}s interval") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - elif cliargs.no_deploy == False and cliargs.no_policy == True: - log_write(report, "* Mode: Deploy Clusters only") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Deploy {cliargs.batch} cluster(s) per {cliargs.interval_deploy}s :: {str(timedelta(seconds=cliargs.interval_deploy))} interval") - log_write(report, f" * Available clusters: {len(clusterinstance_files)}") - log_write(report, f" * Total batches: {deploy_batch_count}") - log_write(report, f" * Last deploy runtime: {cliargs.last_deploy_runtime}s :: {str(timedelta(seconds=cliargs.last_deploy_runtime))}") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - elif cliargs.no_deploy == True and cliargs.no_policy == False: - log_write(report, "* Mode: Policy configmap updates only") - log_write(report, f" * Start delay: {cliargs.start_delay}s :: {str(timedelta(seconds=cliargs.start_delay))}") - log_write(report, f" * Update policy configmap ({cliargs.hub_policy_cm_keys} keys) in namespace {cliargs.hub_policy_namespace} per {cliargs.interval_policy}s interval") - log_write(report, f" * Maximum number of policy intervals to run: {cliargs.max_policy_intervals}") - log_write(report, f" * End delay: {cliargs.end_delay}s :: {str(timedelta(seconds=cliargs.end_delay))}") - log_write(report, "Workload Results") - log_write(report, " * Total elapsed time: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) - log_write(report, " * Total cluster(s) deployed: {}".format(total_clusters_deployed)) - log_write(report, " * Total policy cm updates: {}".format(total_policy_cm_updates)) - log_write(report, "Workload Timestamps") - log_write(report, " * Start Time: {} 
{}".format(datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(workload_start_time * 1000))) - log_write(report, " * Start Delay Complete Time: {}".format(datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - for i, ts in enumerate(cluster_deployed_timestamps): - log_write(report, " * Cluster(s) Batch {} deployed: {}".format(i, datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - log_write(report, " * End Delay Start Time: {}".format(datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) - log_write(report, " * End Time: {} {}".format(datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(end_time * 1000))) + generate_telco_core_load_report(workload_start_time, end_time, start_delay_complete_ts, + end_delay_start_ts, cluster_deployed_timestamps, total_clusters_deployed, + total_policy_cm_updates, len(clusterinstance_files), deploy_batch_count, + cliargs, report_dir) + total_elapsed_time = round(end_time - workload_start_time) logger.info("Took {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) diff --git a/acm-deploy-load/utils/analysis.py b/acm-deploy-load/utils/analysis.py new file mode 100644 index 00000000..033ed924 --- /dev/null +++ b/acm-deploy-load/utils/analysis.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# Copyright 2026 Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime, timezone
import logging
import os
import subprocess
import sys

logger = logging.getLogger("acm-deploy-load")


def launch_prometheus_analysis(report_dir, phase_name, start_ts, end_ts, kubeconfig, base_dir,
                               min_window_seconds=900):
    """Launch analyze-prometheus.py in the background for the given time window.

    The analyzer is started as a detached child process (its own session) so
    the caller does not block on it. Its combined stdout/stderr is written to
    ``pa-<phase_name>.log`` inside ``report_dir``. Launching is best-effort:
    every failure is logged as a warning and never raised to the caller.

    Args:
        report_dir: Directory passed to the analyzer and used for its log file.
        phase_name: Phase label, passed via ``-p`` and used in the log file name.
        start_ts: Window start as a POSIX timestamp (seconds).
        end_ts: Window end as a POSIX timestamp (seconds).
        kubeconfig: Kubeconfig path passed to the analyzer via ``-k``.
        base_dir: Directory containing analyze-prometheus.py; also the child's
            working directory.
        min_window_seconds: Smallest window (in seconds) worth analyzing.
            Shorter windows are skipped. Defaults to 900 (15 minutes).

    Returns:
        The ``subprocess.Popen`` handle of the launched analyzer, or ``None``
        when the launch was skipped or failed. Callers may ignore it.
    """
    analyzer_script = os.path.join(base_dir, "analyze-prometheus.py")
    if not os.path.isfile(analyzer_script):
        logger.warning("analyze-prometheus.py not found at {}, skipping phase {}".format(analyzer_script, phase_name))
        return None

    duration_seconds = round(end_ts - start_ts)
    if duration_seconds < min_window_seconds:
        # "{:g}" keeps the default message identical to the previous hard-coded
        # "15 minutes" while still being correct for other thresholds.
        logger.warning("Skipping prometheus analysis phase {}: window {}s < {:g} minutes".format(
            phase_name, duration_seconds, min_window_seconds / 60))
        return None

    # Only format the timestamps once we know the analyzer will actually run.
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    start_str = datetime.fromtimestamp(start_ts, tz=timezone.utc).strftime(time_format)
    end_str = datetime.fromtimestamp(end_ts, tz=timezone.utc).strftime(time_format)

    # No buffer time since script is running against end time that is less than 5 minutes from now
    # Every argument is coerced to str so path-like values (e.g. pathlib.Path)
    # neither break the join() below nor the process launch.
    cmd = [str(arg) for arg in (
        sys.executable,
        analyzer_script,
        "-k", kubeconfig,
        "-s", start_str,
        "-e", end_str,
        "-b", "0",
        "-p", phase_name,
        report_dir,
    )]
    logger.info("Prometheus analysis command: {}".format(" ".join(cmd)))
    log_file = os.path.join(report_dir, "pa-{}.log".format(phase_name))
    try:
        # The parent's handle on the log file is closed when the with-block
        # exits; the child keeps its own inherited descriptor.
        with open(log_file, "w") as f:
            proc = subprocess.Popen(
                cmd,
                stdin=subprocess.DEVNULL,  # detached child must not read the parent's stdin
                stdout=f,
                stderr=subprocess.STDOUT,
                cwd=base_dir,
                start_new_session=True,
            )
    except Exception as e:
        # Deliberately broad: the analysis is optional and must never abort the workload.
        logger.warning("Failed to launch prometheus analysis for phase {}: {}".format(phase_name, e))
        return None
    logger.info("Launched prometheus analysis phase '{}' in background (pid {}, log: {})".format(
        phase_name, proc.pid, os.path.basename(log_file)))
    return proc

# Patch header of the next file in this diff, preserved from the collapsed line:
# diff --git a/acm-deploy-load/utils/common_ocp.py b/acm-deploy-load/utils/common_ocp.py index 1e7149d1..d943078d
100644 --- a/acm-deploy-load/utils/common_ocp.py +++ b/acm-deploy-load/utils/common_ocp.py @@ -204,3 +204,12 @@ def get_thanos_querier_route(kubeconfig): else: logger.error("Failed to find route for thanos-querier") return "" + + +def validate_kubeconfig(kubeconfig): + oc_cmd = ["oc", "--kubeconfig", kubeconfig, "whoami"] + rc, output = command(oc_cmd, False, no_log=True) + if rc != 0: + logger.error("Kubeconfig validation failed (oc whoami rc: {}): {}".format(rc, kubeconfig)) + sys.exit(1) + logger.info("Kubeconfig validated, connected as: {}".format(output.strip())) diff --git a/acm-deploy-load/utils/output.py b/acm-deploy-load/utils/output.py index 494dedbc..9847756a 100644 --- a/acm-deploy-load/utils/output.py +++ b/acm-deploy-load/utils/output.py @@ -46,15 +46,26 @@ def assemble_stats(the_list, seconds=True): return "{} :: {} :: {} :: {} :: {} :: {}".format(stats_min, stats_avg, stats_p50, stats_p95, stats_p99, stats_max) -def generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, +def generate_deploy_load_report(start_time, end_time, deploy_start_time, deploy_end_time, wait_cluster_start_time, wait_cluster_end_time, wait_du_profile_start_time, wait_du_profile_end_time, wait_playbook_start_time, - wait_playbook_end_time, available_clusters, monitor_data, cliargs, total_intervals, report_dir): + wait_playbook_end_time, soak_start_time, available_clusters, monitor_data, cliargs, total_intervals, report_dir): + # Timestamps define three workload phases: + # Phase 1 (Idle Baseline): start_time -> deploy_start_time + # Phase 2 (Cluster Deployment): deploy_start_time -> soak_start_time + # - Manifest apply: deploy_start_time -> deploy_end_time + # - Wait cluster install: wait_cluster_start_time -> wait_cluster_end_time + # - Wait DU profile: wait_du_profile_start_time -> wait_du_profile_end_time + # - Wait playbook: wait_playbook_start_time -> wait_playbook_end_time + # Phase 3 (Soak Baseline): soak_start_time -> end_time 
# Determine result data + total_idle_baseline_time = round(deploy_start_time - start_time) + total_phase2_time = round(soak_start_time - deploy_start_time) total_deploy_time = round(deploy_end_time - deploy_start_time) total_cluster_install_time = round(wait_cluster_end_time - wait_cluster_start_time) total_duprofile_time = round(wait_du_profile_end_time - wait_du_profile_start_time) total_playbook_time = round(wait_playbook_end_time - wait_playbook_start_time) + total_soak_baseline_time = round(end_time - soak_start_time) total_time = round(end_time - start_time) success_cluster_percent = 0 failed_cluster_percent = 0 @@ -132,29 +143,157 @@ def generate_report(start_time, end_time, deploy_start_time, deploy_end_time, wa log_write(report, " * Overall Success (DU Compliant / Deployed): {} / {}".format(monitor_data["policy_compliant"], monitor_data["cluster_applied_committed"])) log_write(report, " * Overall Success Percent: {}%".format(success_overall_percent)) log_write(report, " * Overall Failed Percent: {}%".format(failed_overall_percent)) - log_write(report, "Deployed Cluster Orchestration") + log_write(report, "Workload Parameters") log_write(report, " * Method: {}".format(cliargs.method)) log_write(report, " * Rate: {}".format(cliargs.rate)) - log_write(report, " * Cluster Start: {} End: {}".format(cliargs.start, cliargs.end)) - log_write(report, " * {} cluster(s) per ZTP argoCD application".format(cliargs.clusters_per_app)) + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 (Cluster Deployment):") if cliargs.rate == "interval": - log_write(report, " * {} cluster(s) per {}s interval".format(cliargs.batch, cliargs.interval)) - log_write(report, " * Actual Intervals: {}".format(total_intervals)) - log_write(report, " * Wan Emulation: {}".format(cliargs.wan_emulation)) - log_write(report, "Workload Duration Results") + 
log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval, str(timedelta(seconds=cliargs.interval)))) + log_write(report, " * Cluster range: {} to {}".format(cliargs.start, cliargs.end)) + log_write(report, " * Clusters per ZTP argoCD application: {}".format(cliargs.clusters_per_app)) + log_write(report, " * Actual intervals: {}".format(total_intervals)) + if cliargs.skip_wait_install: + log_write(report, " * Skip waiting for cluster install completion") + else: + if cliargs.wait_cluster_max > 0: + log_write(report, " * Wait for cluster install completion (Max {}s :: {})".format( + cliargs.wait_cluster_max, str(timedelta(seconds=cliargs.wait_cluster_max)))) + else: + log_write(report, " * Wait for cluster install completion (Infinite wait)") + if not cliargs.wait_du_profile: + log_write(report, " * Skip waiting for DU Profile completion") + else: + if cliargs.wait_du_profile_max > 0: + log_write(report, " * Wait for DU Profile completion (Max {}s :: {})".format( + cliargs.wait_du_profile_max, str(timedelta(seconds=cliargs.wait_du_profile_max)))) + else: + log_write(report, " * Wait for DU Profile completion (Infinite wait)") + if not cliargs.wait_playbook: + log_write(report, " * Skip waiting for Playbook completion") + else: + if cliargs.wait_playbook_max > 0: + log_write(report, " * Wait for Playbook completion (Max {}s :: {})".format( + cliargs.wait_playbook_max, str(timedelta(seconds=cliargs.wait_playbook_max)))) + else: + log_write(report, " * Wait for Playbook completion (Infinite wait)") + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + log_write(report, " * Monitor interval: {}s".format(cliargs.monitor_interval)) + if cliargs.wan_emulation: + log_write(report, " * Wan Emulation: {}".format(cliargs.wan_emulation)) + log_write(report, "Workload Phases") log_write(report, " * Start Time: {} {}".format( 
datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(start_time * 1000))) log_write(report, " * End Time: {} {}".format( datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), int(end_time * 1000))) - log_write(report, " * Cluster Deploying duration: {}s :: {}".format(total_deploy_time, str(timedelta(seconds=total_deploy_time)))) + log_write(report, " * Total duration: {}s :: {}".format(total_time, str(timedelta(seconds=total_time)))) + log_write(report, " * Phase 1 (Idle Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(deploy_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_idle_baseline_time, str(timedelta(seconds=total_idle_baseline_time)))) + log_write(report, " * Phase 2 (Cluster Deployment): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(deploy_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(soak_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_phase2_time, str(timedelta(seconds=total_phase2_time)))) + log_write(report, " * Cluster Deploying duration: {}s :: {}".format(total_deploy_time, str(timedelta(seconds=total_deploy_time)))) if not cliargs.skip_wait_install: - log_write(report, " * Cluster Install wait duration: {}s :: {}".format(total_cluster_install_time, str(timedelta(seconds=total_cluster_install_time)))) + log_write(report, " * Cluster Install wait duration: {}s :: {}".format(total_cluster_install_time, str(timedelta(seconds=total_cluster_install_time)))) if cliargs.wait_du_profile: - log_write(report, " * DU Profile wait duration: {}s :: {}".format(total_duprofile_time, str(timedelta(seconds=total_duprofile_time)))) + log_write(report, " * DU Profile wait duration: {}s :: {}".format(total_duprofile_time, str(timedelta(seconds=total_duprofile_time)))) if cliargs.wait_playbook: - log_write(report, " * 
Playbook wait duration: {}s :: {}".format(total_playbook_time, str(timedelta(seconds=total_playbook_time)))) - log_write(report, " * Total duration: {}s :: {}".format(total_time, str(timedelta(seconds=total_time)))) - # Done outputing the report card + log_write(report, " * Playbook wait duration: {}s :: {}".format(total_playbook_time, str(timedelta(seconds=total_playbook_time)))) + log_write(report, " * Phase 3 (Soak Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(soak_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_soak_baseline_time, str(timedelta(seconds=total_soak_baseline_time)))) + + +def generate_telco_core_load_report(workload_start_time, end_time, start_delay_complete_ts, + end_delay_start_ts, cluster_deployed_timestamps, total_clusters_deployed, + total_policy_cm_updates, available_clusters, deploy_batch_count, cliargs, report_dir): + + total_elapsed_time = round(end_time - workload_start_time) + total_idle_baseline_time = round(start_delay_complete_ts - workload_start_time) + total_phase2_time = round(end_delay_start_ts - start_delay_complete_ts) + total_soak_baseline_time = round(end_time - end_delay_start_ts) + + with open("{}/report.txt".format(report_dir), "w") as report: + phase_break(True, report) + log_write(report, "acm-telco-core-load Report Card") + phase_break(True, report) + log_write(report, "Workload Parameters") + if cliargs.no_deploy == False and cliargs.no_policy == False: + phase2_label = "Cluster Deployment + Policy Updates" + log_write(report, " * Mode: Deploy+Policy") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 ({}):".format(phase2_label)) + log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, 
str(timedelta(seconds=cliargs.interval_deploy)))) + log_write(report, " * Available clusters: {}".format(available_clusters)) + log_write(report, " * Total batches: {}".format(deploy_batch_count)) + log_write(report, " * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + log_write(report, " * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + elif cliargs.no_deploy == False and cliargs.no_policy == True: + phase2_label = "Cluster Deployment" + log_write(report, " * Mode: Deploy Clusters only") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 ({}):".format(phase2_label)) + log_write(report, " * Deploy {} cluster(s) per {}s :: {} interval".format( + cliargs.batch, cliargs.interval_deploy, str(timedelta(seconds=cliargs.interval_deploy)))) + log_write(report, " * Available clusters: {}".format(available_clusters)) + log_write(report, " * Total batches: {}".format(deploy_batch_count)) + log_write(report, " * Last deploy runtime: {}s :: {}".format( + cliargs.last_deploy_runtime, str(timedelta(seconds=cliargs.last_deploy_runtime)))) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + elif cliargs.no_deploy == True and cliargs.no_policy == False: + phase2_label = "Policy Updates" + log_write(report, " * Mode: Policy configmap updates only") + log_write(report, " * Phase 1 / Idle Baseline (Start delay): {}s :: {}".format( + cliargs.start_delay, str(timedelta(seconds=cliargs.start_delay)))) + log_write(report, " * Phase 2 
({}):".format(phase2_label)) + log_write(report, " * Update policy configmap ({} keys) in namespace {} per {}s interval".format( + cliargs.hub_policy_cm_keys, cliargs.hub_policy_namespace, cliargs.interval_policy)) + log_write(report, " * Maximum number of policy intervals to run: {}".format(cliargs.max_policy_intervals)) + log_write(report, " * Phase 3 / Soak Baseline (End delay): {}s :: {}".format( + cliargs.end_delay, str(timedelta(seconds=cliargs.end_delay)))) + log_write(report, "Workload Results") + log_write(report, " * Total elapsed time: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) + log_write(report, " * Total cluster(s) deployed: {}".format(total_clusters_deployed)) + log_write(report, " * Total policy cm updates: {}".format(total_policy_cm_updates)) + log_write(report, "Workload Phases") + log_write(report, " * Start Time: {} {}".format( + datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + int(workload_start_time * 1000))) + log_write(report, " * End Time: {} {}".format( + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + int(end_time * 1000))) + log_write(report, " * Total duration: {}s :: {}".format(total_elapsed_time, str(timedelta(seconds=total_elapsed_time)))) + log_write(report, " * Phase 1 (Idle Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(workload_start_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_idle_baseline_time, str(timedelta(seconds=total_idle_baseline_time)))) + log_write(report, " * Phase 2 ({}): {} to {} :: {}s :: {}".format(phase2_label, + datetime.fromtimestamp(start_delay_complete_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_phase2_time, 
str(timedelta(seconds=total_phase2_time)))) + for i, ts in enumerate(cluster_deployed_timestamps): + log_write(report, " * Cluster(s) Batch {} deployed: {}".format( + i, datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"))) + log_write(report, " * Phase 3 (Soak Baseline): {} to {} :: {}s :: {}".format( + datetime.fromtimestamp(end_delay_start_ts, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + datetime.fromtimestamp(end_time, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + total_soak_baseline_time, str(timedelta(seconds=total_soak_baseline_time)))) + def log_write(file, message): logger.info(message) diff --git a/acm-deploy-load/utils/ztp_monitor.py b/acm-deploy-load/utils/ztp_monitor.py index 9f70e79d..468f9192 100644 --- a/acm-deploy-load/utils/ztp_monitor.py +++ b/acm-deploy-load/utils/ztp_monitor.py @@ -28,7 +28,7 @@ class ZTPMonitor(Thread): - def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_interval): + def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_interval, kubeconfig): super(ZTPMonitor, self).__init__() if method in ["ai-manifest", "ai-clusterinstance", "ai-clusterinstance-gitops", "ai-siteconfig-gitops"]: self.method = "agent" @@ -39,6 +39,7 @@ def __init__(self, method, talm_minor, monitor_data, csv_file, dry_run, sample_i self.csv_file = csv_file self.dry_run = dry_run self.sample_interval = sample_interval + self.kubeconfig = kubeconfig self.signal = True def _real_run(self): @@ -52,7 +53,7 @@ def _real_run(self): if self.method == "agent": # Get agentclusterinstall data - oc_cmd = ["oc", "get", "agentclusterinstall", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "agentclusterinstall", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get agentclusterinstall rc: {}".format(rc)) @@ -66,7 +67,7 @@ def _real_run(self): logger.warning("aci 
JSONDecodeError: {}".format(output[:2500])) elif self.method == "image": # Get imageclusterinstall data - oc_cmd = ["oc", "get", "imageclusterinstall", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "imageclusterinstall", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get imageclusterinstall rc: {}".format(rc)) @@ -80,7 +81,7 @@ def _real_run(self): logger.warning("ici JSONDecodeError: {}".format(output[:2500])) # Get baremetalhost data - oc_cmd = ["oc", "get", "baremetalhost", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "baremetalhost", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get baremetalhost rc: {}".format(rc)) @@ -95,7 +96,7 @@ def _real_run(self): if self.method == "agent": # Get agent data - oc_cmd = ["oc", "get", "agent", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "agent", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get agent rc: {}".format(rc)) @@ -112,7 +113,7 @@ def _real_run(self): agent_data = {"items": []} # Get managedcluster data - oc_cmd = ["oc", "get", "managedcluster", "-A", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "managedcluster", "-A", "-o", "json"] rc, output = command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get managedcluster rc: {}".format(rc)) @@ -126,7 +127,7 @@ def _real_run(self): logger.warning("mc JSONDecodeError: {}".format(output[:2500])) # Get clustergroupupgrades data - oc_cmd = ["oc", "get", "clustergroupupgrades", "-n", "ztp-install", "-o", "json"] + oc_cmd = ["oc", "--kubeconfig", self.kubeconfig, "get", "clustergroupupgrades", "-n", "ztp-install", "-o", "json"] rc, output = 
command(oc_cmd, self.dry_run, retries=3, no_log=True) if rc != 0: logger.error("acm-deploy-load, oc get clustergroupupgrades rc: {}".format(rc)) diff --git a/scripts/interval-ztp-install-all.sh b/scripts/interval-ztp-install-all.sh index 258b4103..bf5eba2c 100755 --- a/scripts/interval-ztp-install-all.sh +++ b/scripts/interval-ztp-install-all.sh @@ -6,20 +6,31 @@ set -o pipefail iteration=1 # Method to deploy clusters (AI = Assisted Installer, IBI = Image Based Installer) -# method="ai-siteconfig-gitops" # method="ai-clusterinstance-gitops" method="ibi-clusterinstance-gitops" -# Rate 500/30m +# Phase 1 (Idle baseline) delay in seconds +start_delay=15 + +# Phase 2 (Cluster deployment) rate in clusters per interval +# Rate 500 clusters every 30 minutes interval_period=1800 batch=500 -# Rate 80/5m +# Rate 80 clusters every 5 minutes # interval_period=300 # batch=80 +# Phase 3 (Soak baseline) delay in seconds +end_delay=120 + # SNO or Mixed SNOs and MNOs clusters_per_app=100 +# Prometheus analysis per phase (uncomment to enable) +# Use with longer idle and soak baselines to produce capacity guideline measurements +prometheus_analysis_arg="--no-prometheus-analysis" +# prometheus_analysis_arg="" + # WAN Emulation can only be run with SNOs wan_em="(None)" # wan_em="(50ms/0.02)" @@ -41,7 +52,7 @@ hub_ocp=$(oc version -o json | jq -r '.openshiftVersion') # grep will cause error code 141 since it prints only the first match cluster_ocp=$(cat /root/hv-vm/*/*/*.yml | grep "clusterImageSetNameRef:" -m 1 | awk '{print $NF}' | sed 's/openshift-//' || if [[ $? 
-eq 141 ]]; then true; else exit $?; fi) -time ./acm-deploy-load/acm-deploy-load.py --acm-version "${acm_ver}" --aap-version "${aap_csv}" --test-version "${test_ver}" --hub-version "${hub_ocp}" --deploy-version "${cluster_ocp}" --wan-emulation "${wan_em}" -m "${method}" --clusters-per-app ${clusters_per_app} ${argocd_arg} -w -i 60 -t ${clusters_per_app}cpa-${batch}b-${interval_period}i-${iteration} interval -b ${batch} -i ${interval_period} 2>&1 | tee ${log_file} +time ./acm-deploy-load/acm-deploy-load.py --acm-version "${acm_ver}" --aap-version "${aap_csv}" --test-version "${test_ver}" --hub-version "${hub_ocp}" --deploy-version "${cluster_ocp}" --wan-emulation "${wan_em}" -m "${method}" --clusters-per-app ${clusters_per_app} ${argocd_arg} --start-delay ${start_delay} --end-delay ${end_delay} ${prometheus_analysis_arg} -w -i 60 -t ${clusters_per_app}cpa-${batch}b-${interval_period}i-${iteration} interval -b ${batch} -i ${interval_period} 2>&1 | tee ${log_file} results_dir=$(grep "Results data captured in:" $log_file | awk '{print $NF}') @@ -79,6 +90,7 @@ time ./acm-deploy-load/analyze-ansiblejobs.py ${results_dir} 2>&1 | tee -a ${log echo "################################################################################" 2>&1 | tee -a ${log_file} +# Complete Prometheus analysis for entire workload period start_time=$(grep "Start Time:" ${results_dir}/report.txt | awk '{print $4}') end_time=$(grep "End Time:" ${results_dir}/report.txt | awk '{print $4}') echo "time ./acm-deploy-load/analyze-prometheus.py -p deploy-pa -s ${start_time} -e ${end_time} ${results_dir}" | tee -a ${log_file}