From db63b167c4e89ea6435f70b55e02fcde05a048be Mon Sep 17 00:00:00 2001 From: Alexis Roberson Date: Tue, 10 Mar 2026 11:29:15 -0500 Subject: [PATCH] Added agent skills 4 for O11Y, 1 for Guarded Rollouts --- .cursor-plugin/plugin.json | 8 +- README.md | 27 + skills.json | 40 ++ .../guarded-rollout-create/README.md | 73 +++ .../guarded-rollout-create/SKILL.md | 123 +++++ .../references/rollout-stage-patterns.md | 422 +++++++++++++++ .../observability/o11y-flag-impact/README.md | 74 +++ .../observability/o11y-flag-impact/SKILL.md | 103 ++++ .../references/correlation-methods.md | 295 +++++++++++ skills/observability/o11y-log-query/README.md | 70 +++ skills/observability/o11y-log-query/SKILL.md | 120 +++++ .../references/query-patterns.md | 489 ++++++++++++++++++ .../o11y-regression-detect/README.md | 71 +++ .../o11y-regression-detect/SKILL.md | 116 +++++ .../references/detection-strategies.md | 486 +++++++++++++++++ .../o11y-service-health/README.md | 59 +++ .../o11y-service-health/SKILL.md | 112 ++++ .../references/health-assessment-criteria.md | 253 +++++++++ .../references/metric-types.md | 164 ++++++ 19 files changed, 3103 insertions(+), 2 deletions(-) create mode 100644 skills/guarded-rollouts/guarded-rollout-create/README.md create mode 100644 skills/guarded-rollouts/guarded-rollout-create/SKILL.md create mode 100644 skills/guarded-rollouts/guarded-rollout-create/references/rollout-stage-patterns.md create mode 100644 skills/observability/o11y-flag-impact/README.md create mode 100644 skills/observability/o11y-flag-impact/SKILL.md create mode 100644 skills/observability/o11y-flag-impact/references/correlation-methods.md create mode 100644 skills/observability/o11y-log-query/README.md create mode 100644 skills/observability/o11y-log-query/SKILL.md create mode 100644 skills/observability/o11y-log-query/references/query-patterns.md create mode 100644 skills/observability/o11y-regression-detect/README.md create mode 100644 
skills/observability/o11y-regression-detect/SKILL.md create mode 100644 skills/observability/o11y-regression-detect/references/detection-strategies.md create mode 100644 skills/observability/o11y-service-health/README.md create mode 100644 skills/observability/o11y-service-health/SKILL.md create mode 100644 skills/observability/o11y-service-health/references/health-assessment-criteria.md create mode 100644 skills/observability/o11y-service-health/references/metric-types.md diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 3ab4468..f79d639 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "launchdarkly", "version": "1.0.0", - "description": "LaunchDarkly agent skills and mcp server for feature flag management, AI configuration, and skill authoring", + "description": "LaunchDarkly agent skills and mcp server for feature flag management, AI configuration, observability, guarded rollouts, and skill authoring", "author": { "name": "LaunchDarkly", "email": "support@launchdarkly.com" @@ -18,6 +18,10 @@ "targeting", "rollout", "cleanup", - "mcp" + "mcp", + "observability", + "guarded-rollouts", + "monitoring", + "service-health" ] } diff --git a/README.md b/README.md index d97001f..7d0a0db 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,21 @@ Agent Skills are modular, text-based playbooks that teach an agent how to perfor | `ai-configs/aiconfig-tools` | Create and attach tools for function calling | | `ai-configs/aiconfig-projects` | Create and manage projects to organize AI Configs | +### Observability + +| Skill | Description | +|-------|-------------| +| `observability/o11y-service-health` | Check error rate, latency, and throughput for a service | +| `observability/o11y-log-query` | Search and filter logs for debugging | +| `observability/o11y-flag-impact` | Correlate a flag change with a metric shift | +| `observability/o11y-regression-detect` | Monitor a rollout and detect metric 
regressions | + +### Guarded Rollouts + +| Skill | Description | +|-------|-------------| +| `guarded-rollouts/guarded-rollout-create` | Configure a guarded rollout with metric monitoring and rollback thresholds | + ### Skill Authoring | Skill | Description | @@ -76,6 +91,18 @@ Roll out dark-mode to 25% of users in production Remove the `new-checkout-flow` feature flag from this codebase ``` +``` +How is the checkout service doing after the last deploy? +``` + +``` +Did the new-checkout flag cause the error rate spike? +``` + +``` +Set up a guarded rollout for the payments feature with automatic rollback +``` + ## Install via skills.sh CLI ```bash diff --git a/skills.json b/skills.json index 5eefd12..dc9d6ca 100644 --- a/skills.json +++ b/skills.json @@ -43,6 +43,14 @@ "license": "Apache-2.0", "compatibility": "Works in repositories following the Agent Skills open standard" }, + { + "name": "guarded-rollout-create", + "description": "Configure a guarded rollout: define rollout stages (percentages and soak times), set metric thresholds for error rate, latency, and throughput, and enable automated rollback if thresholds are breached. Use when the user wants to set up or create a guarded rollout, add safety guardrails to a flag rollout, or configure automatic rollback. This skill configures the rollout \u2014 use launchdarkly-flag-targeting for simple percentage rollouts without automated monitoring.", + "path": "skills/guarded-rollouts/guarded-rollout-create", + "version": "0.1.0", + "license": "Apache-2.0", + "compatibility": "Requires LaunchDarkly MCP server with guarded rollout capabilities enabled." + }, { "name": "launchdarkly-flag-cleanup", "description": "Safely remove a feature flag from code while preserving production behavior. 
Use when the user wants to remove a flag from code, delete flag references, or create a PR that hardcodes the winning variation after a rollout is complete.", @@ -117,6 +125,38 @@ "devops", "mcp" ] + }, + { + "name": "o11y-flag-impact", + "description": "Retrospective analysis: compare service metrics from before and after a specific feature flag change to determine whether the flag caused a metric shift. Use when the user asks 'did flag X cause the error spike', 'what was the impact of turning on flag Y', or wants a before/after comparison of metrics around a known flag change event.", + "path": "skills/observability/o11y-flag-impact", + "version": "0.1.0", + "license": "Apache-2.0", + "compatibility": "Requires LaunchDarkly MCP server with both feature flag and observability tools enabled." + }, + { + "name": "o11y-log-query", + "description": "Search and filter application log entries by service, time window, severity, or keyword. Use when the user wants to find a specific error message, trace a request by ID across services, look at recent log output, or search for a pattern in logs. This skill queries log text \u2014 use o11y-service-health instead for numeric metrics.", + "path": "skills/observability/o11y-log-query", + "version": "0.1.0", + "license": "Apache-2.0", + "compatibility": "Requires LaunchDarkly MCP server with observability tools enabled." + }, + { + "name": "o11y-regression-detect", + "description": "Continuously poll a service's metrics at repeated intervals and alert when a regression threshold is breached multiple consecutive times. Use when the user asks to 'watch', 'monitor', or 'keep an eye on' a service during a rollout. 
This skill performs ongoing detection over time \u2014 use o11y-service-health for a one-time metric snapshot, or guarded-rollout-create to configure automated rollback.", + "path": "skills/observability/o11y-regression-detect", + "version": "0.1.0", + "license": "Apache-2.0", + "compatibility": "Requires LaunchDarkly MCP server with observability tools enabled." + }, + { + "name": "o11y-service-health", + "description": "Pull error rate, latency percentiles (p50/p95/p99), and throughput metrics for a service and classify its status as healthy, degraded, or critical. Use when the user asks 'how is <service> doing', wants a metric snapshot before a rollout, or needs to confirm whether a service is currently healthy or degraded.", + "path": "skills/observability/o11y-service-health", + "version": "0.1.0", + "license": "Apache-2.0", + "compatibility": "Requires LaunchDarkly MCP server with observability tools enabled." } ] } diff --git a/skills/guarded-rollouts/guarded-rollout-create/README.md b/skills/guarded-rollouts/guarded-rollout-create/README.md new file mode 100644 index 0000000..901908c --- /dev/null +++ b/skills/guarded-rollouts/guarded-rollout-create/README.md @@ -0,0 +1,73 @@ +# LaunchDarkly Guarded Rollout Configuration Skill + +## Overview + +- Teaches AI agents to configure **guarded rollouts** for LaunchDarkly feature flags with automated metric monitoring and rollback +- Guides agents through designing **rollout stages** with appropriate traffic percentages and soak times based on risk level +- Provides methodology for setting **metric thresholds** (error rate, latency, throughput) derived from baseline data, not guesses +- Ensures rollouts follow a disciplined **explore, assess, execute, verify** pattern with user approval at each step + +## Installation (Local) + +Add this skill to your agent configuration by referencing the `SKILL.md` file: + +```yaml +skills: + - path: skills/guarded-rollouts/guarded-rollout-create/SKILL.md +``` + +Or copy the entire 
`guarded-rollout-create/` directory into your project's skills folder. + +## Prerequisites + +- **LaunchDarkly MCP server** with guarded rollout capabilities enabled +- MCP tools available: `create-guarded-rollout`, `get-flag`, `get-service-metrics` +- Optional MCP tools: `list-services`, `get-metric-baselines` +- Baseline metrics for the target service (use the service health check skill if baselines are not yet established) + +## Usage + +Invoke this skill when a user wants to safely roll out a feature flag with automated guardrails. + +**Example prompts:** + +``` +Set up a guarded rollout for the new-checkout flag in production +``` + +``` +Roll out dark-mode gradually with automatic rollback if errors spike +``` + +``` +Configure a safe rollout for the payments feature with conservative stages +``` + +``` +Create a guarded rollout for search-results-v3 — standard pattern, +monitor error rate and p99 latency +``` + +The skill will walk through four steps: identifying the flag and establishing baselines, designing rollout stages, defining monitoring thresholds, and configuring and verifying the rollout. 
+ +## Structure + +``` +guarded-rollout-create/ +├── SKILL.md # Skill definition and workflow +├── README.md # This file +└── references/ + └── rollout-stage-patterns.md # Detailed reference on stage progressions, + # soak times, threshold methodology, + # rollback config, and common mistakes +``` + +## Related + +- [Service Health Check Skill](../../observability/o11y-service-health/SKILL.md) — establish baseline metrics before configuring a guarded rollout +- [Regression Detection Skill](../../observability/o11y-regression-detect/SKILL.md) — investigate metric anomalies during an active rollout +- [Feature Management Skills](../feature-management/) — flag creation, targeting, and lifecycle management + +## License + +Apache-2.0 diff --git a/skills/guarded-rollouts/guarded-rollout-create/SKILL.md b/skills/guarded-rollouts/guarded-rollout-create/SKILL.md new file mode 100644 index 0000000..fa9b3fc --- /dev/null +++ b/skills/guarded-rollouts/guarded-rollout-create/SKILL.md @@ -0,0 +1,123 @@ +--- +name: guarded-rollout-create +description: "Configure a guarded rollout: define rollout stages (percentages and soak times), set metric thresholds for error rate, latency, and throughput, and enable automated rollback if thresholds are breached. Use when the user wants to set up or create a guarded rollout, add safety guardrails to a flag rollout, or configure automatic rollback. This skill configures the rollout — use launchdarkly-flag-targeting for simple percentage rollouts without automated monitoring." +license: Apache-2.0 +compatibility: Requires LaunchDarkly MCP server with guarded rollout capabilities enabled. +metadata: + author: launchdarkly + version: "0.1.0" +--- + +# Guarded Rollout Configuration + +You're using a skill that will guide you through configuring a guarded rollout for a feature flag. 
Your job is to identify the flag and service, design rollout stages with appropriate soak times, define monitoring thresholds, configure the guarded rollout, and verify the configuration. + +If the user hasn't determined baseline metrics for the service, suggest using the [service health check skill](../../observability/o11y-service-health/SKILL.md) first to establish current error rate, latency, and throughput baselines. Accurate baselines are essential for setting meaningful thresholds. + +## Prerequisites + +- **LaunchDarkly MCP server** with guarded rollout capabilities enabled +- **Required MCP tools:** + - `create-guarded-rollout` — configure rollout stages, thresholds, and monitoring + - `get-flag` — fetch flag configuration and current state + - `get-service-metrics` — establish baseline metrics for the service +- **Optional MCP tools:** + - `list-services` — discover available services and their identifiers + - `get-metric-baselines` — retrieve pre-computed baseline statistics for a service + +## Core Principles + +1. **Define Success Before You Roll Out.** Know what "healthy" looks like before starting. Collect baseline metrics for error rate, latency, and throughput. Without baselines, thresholds are guesses, and guesses lead to either false rollbacks or missed regressions. + +2. **Rollback is the Default.** On threshold breach, the system stops or rolls back. Advancing requires all checks passing. The guarded rollout system is biased toward safety — it assumes any anomaly is a problem until proven otherwise. + +3. **Soak Time is Non-Negotiable.** Each stage needs enough time and traffic to detect problems. Memory leaks, cache effects, and load-dependent issues can take hours to manifest. Skipping soak time because metrics "look fine" defeats the purpose of guarded rollouts. + +4. **Thresholds from Data, Not Guesses.** Derive thresholds from actual baseline metrics, not arbitrary numbers. 
A threshold of "1% error rate" is meaningless without knowing whether the baseline is 0.01% or 0.9%. Always anchor thresholds to measured baselines. + +## Workflow + +### Step 1: Identify Flag & Establish Baselines + +Determine which flag to roll out and which service it affects. Gather the following information: + +- **Flag key** — the identifier for the feature flag (e.g., `new-checkout-flow`) +- **Environment** — the target environment for the rollout (e.g., `production`) +- **Service** — the service that serves this flag and will be monitored + +Use `get-flag` to confirm the flag exists and review its current configuration (variations, targeting rules, current rollout percentage). + +Use `get-service-metrics` to establish current baselines: + +- **Error rate** — the current percentage of requests resulting in errors +- **p50 latency** — median response time +- **p99 latency** — 99th percentile response time +- **Throughput** — requests per unit time + +These baselines drive threshold selection in Step 3. If baseline data is unavailable, note this as an edge case and use conservative defaults with wider margins. + +### Step 2: Design Rollout Stages + +Choose a stage progression pattern based on risk level. See [rollout-stage-patterns.md](references/rollout-stage-patterns.md) for detailed pattern descriptions. + +- **Conservative** (1% → 5% → 10% → 25% → 50% → 100%) — for payment systems, auth, data pipelines, or any change where failure has outsized impact. +- **Standard** (5% → 25% → 50% → 100%) — for most feature rollouts with moderate risk. +- **Aggressive** (10% → 50% → 100%) — for low-risk UI changes, already-validated features, or non-critical systems. + +For each stage, define: + +- **Percentage** of traffic receiving the new variation +- **Soak time** — minimum duration before advancing to the next stage +- **Total rollout duration** — the cumulative time from start to 100% + +Present the proposed stage plan to the user for approval before proceeding. 
The user may want to adjust percentages, soak times, or add/remove stages based on their knowledge of the service. + +### Step 3: Define Monitoring Thresholds + +Set thresholds for each monitored metric. Thresholds determine when the system halts or rolls back the rollout. See [rollout-stage-patterns.md](references/rollout-stage-patterns.md) for detailed threshold-setting methodology. + +- **Error rate:** Set to baseline + acceptable increase. Example: if baseline error rate is 0.5%, set threshold at 1.0% (baseline + 0.5 percentage points). Include an absolute ceiling as a safety net. +- **Latency p99:** Set to baseline p99 + acceptable increase. Example: if baseline p99 is 400ms, set threshold at 600ms (baseline + 200ms). Consider setting a p50 threshold as well. +- **Throughput:** Set a floor to detect traffic drops. Example: if baseline throughput is 5,000 req/hr, set minimum at 4,000 req/hr (80% of baseline). + +Verify that every threshold sits outside the natural variation observed in baseline data: error rate and latency limits above the natural maximum, and the throughput floor below the natural minimum. Thresholds inside natural variation will cause false rollbacks. + +Present the proposed thresholds to the user for approval. + +### Step 4: Configure & Verify + +Use `create-guarded-rollout` to set up the rollout with the designed stages and thresholds. + +After configuration, verify every aspect: + +- **Correct flag** — the rollout targets the intended flag key +- **Correct environment** — the rollout is configured for the right environment +- **Stages match plan** — percentages and soak times match the approved plan +- **Thresholds match plan** — error rate, latency, and throughput thresholds match the approved values +- **Rollback behavior correct** — automatic rollback is enabled (unless manual rollback was explicitly chosen with justification) + +Confirm with the user before activating the guarded rollout. Once activated, the system will begin rolling traffic to the first stage and monitoring metrics. 
+ +## Edge Cases + +| Scenario | Guidance | +|----------|----------| +| **Flag already has targeting rules** | Guarded rollout interacts with existing targeting rules. Review the flag's current rules with `get-flag` and determine whether the rollout should apply to all users or only those not matched by existing rules. The guarded rollout percentage applies to the fallthrough population (users not matched by any targeting rule). | +| **No baseline data available** | Use conservative thresholds with wider margins. Set error rate threshold at 2%, latency p99 threshold at 2x the team's expected value, and throughput floor at 50% of expected volume. Plan to tighten thresholds after the first stage provides real data. | +| **Service has very low traffic** | Need longer soak times to accumulate enough data points for statistical significance. Consider traffic-based soak criteria (e.g., "advance after 5,000 requests" rather than "advance after 12 hours"). Use the conservative pattern. | +| **Multiple environments need rollout** | Configure each environment separately. Start with the lowest-risk environment (e.g., staging, then production-canary, then production). Do not copy thresholds between environments — each environment has its own baseline. | +| **User wants custom stage progression** | Support custom progressions. Verify that the progression is monotonically increasing, that each stage has a soak time defined, and that the first stage is no higher than 25% (to preserve the "start small" principle). | + +## What NOT to Do + +- **Don't start a rollout without baseline metrics.** Thresholds without baselines are arbitrary numbers. Measure first, then configure. +- **Don't skip soak times.** Even if metrics look perfect after 30 minutes, the soak time exists to catch slow-burn issues. Respect the minimum. +- **Don't set thresholds without data.** "1% error rate sounds reasonable" is not a valid approach. 
Derive thresholds from measured baselines using the methodology in the reference documentation. +- **Don't configure rollback to "do nothing."** A guarded rollout that does not roll back on threshold breach is just a regular rollout with extra monitoring. The value of a guarded rollout is automated safety response. +- **Don't roll out to 100% in a single stage.** That is not a guarded rollout. Even the most aggressive pattern uses at least three stages to provide incremental validation. + +## References + +- [Rollout Stage Patterns Reference](references/rollout-stage-patterns.md) — detailed guidance on stage progressions, soak times, threshold-setting methodology, and common mistakes +- [Service Health Check Skill](../../observability/o11y-service-health/SKILL.md) — use to establish baseline metrics before configuring a guarded rollout +- [Regression Detection Skill](../../observability/o11y-regression-detect/SKILL.md) — use during an active rollout to investigate potential regressions detected by threshold monitoring diff --git a/skills/guarded-rollouts/guarded-rollout-create/references/rollout-stage-patterns.md b/skills/guarded-rollouts/guarded-rollout-create/references/rollout-stage-patterns.md new file mode 100644 index 0000000..cb5ebaa --- /dev/null +++ b/skills/guarded-rollouts/guarded-rollout-create/references/rollout-stage-patterns.md @@ -0,0 +1,422 @@ +# Rollout Stage Patterns Reference + +This reference covers stage progression patterns, soak time guidelines, threshold-setting methodology, rollback configuration, and stage transition criteria for guarded rollouts. Use this document when designing a rollout plan to select the right pattern, set appropriate thresholds, and avoid common pitfalls. + +--- + +## Stage Progression Patterns + +Every guarded rollout moves traffic from 0% to 100% through a series of stages. The number of stages, the percentage at each stage, and the soak time between stages depend on how much risk the change carries. 
Three standard patterns cover the vast majority of use cases. + +### Conservative Pattern + +**Stages:** 1% → 5% → 10% → 25% → 50% → 100% + +| Stage | Percentage | Minimum Soak Time | Cumulative Duration | +|-------|-----------|-------------------|---------------------| +| 1 | 1% | 24 hours | 24 hours | +| 2 | 5% | 24 hours | 48 hours | +| 3 | 10% | 24 hours | 72 hours | +| 4 | 25% | 48 hours | 120 hours (5 days) | +| 5 | 50% | 48 hours | 168 hours (7 days) | +| 6 | 100% | — | 7+ days total | + +**When to use:** + +- Payment processing systems where a bug means lost revenue or failed transactions +- Authentication and authorization flows where a regression locks users out +- Data pipeline changes where corruption propagates downstream before detection +- Core API changes that affect every downstream consumer +- Any system where the blast radius of a failure is disproportionately large relative to the percentage of traffic affected + +**Why this pattern works for high-risk changes:** The 1% stage acts as a canary. At 1% traffic, even a catastrophic failure affects very few users, and the 24-hour soak ensures you capture a full diurnal traffic cycle. The gradual ramp from 5% to 25% gives you increasing confidence while keeping exposure low. The jump from 50% to 100% is the largest single increase, but by that point you have days of clean data across multiple traffic levels. + +**Example — payment service rollout:** + +A team is rolling out a new payment tokenization flow behind the `new-tokenization-v2` flag. The service processes 50,000 transactions per hour at peak. At 1%, that is 500 transactions per hour — enough to detect a 2% error rate increase within a few hours, but small enough that a total failure affects only 500 users in the worst hour. The 24-hour soak at each early stage ensures the team sees both peak and off-peak traffic patterns before advancing. 
+ +### Standard Pattern + +**Stages:** 5% → 25% → 50% → 100% + +| Stage | Percentage | Minimum Soak Time | Cumulative Duration | +|-------|-----------|-------------------|---------------------| +| 1 | 5% | 12 hours | 12 hours | +| 2 | 25% | 24 hours | 36 hours | +| 3 | 50% | 24 hours | 60 hours (2.5 days) | +| 4 | 100% | — | 2.5+ days total | + +**When to use:** + +- Most feature flag rollouts that change application behavior +- API endpoint additions or modifications with moderate downstream impact +- UI features that affect user workflows but not data integrity +- Backend optimizations (caching changes, query rewrites) where rollback is clean +- Changes that have passed thorough integration and staging testing + +**Why this pattern works for typical changes:** Starting at 5% gives you meaningful traffic volume quickly. Most services with moderate traffic (1,000+ requests per hour) will see 50+ requests per hour at the 5% stage, which is enough to detect significant regressions within a few hours. The 12-hour initial soak captures at least half a diurnal cycle. The 24-hour soaks at 25% and 50% each capture a full cycle. + +**Example — feature rollout:** + +A team is rolling out a redesigned search results page behind the `search-results-v3` flag. The search service handles 10,000 queries per hour. At 5%, 500 queries per hour hit the new code path. The team monitors error rate, p99 latency, and click-through rate. After 12 clean hours at 5%, they advance to 25% (2,500 queries/hour), soak for 24 hours, then 50% for 24 hours, then 100%. 
+ +### Aggressive Pattern + +**Stages:** 10% → 50% → 100% + +| Stage | Percentage | Minimum Soak Time | Cumulative Duration | +|-------|-----------|-------------------|---------------------| +| 1 | 10% | 4 hours | 4 hours | +| 2 | 50% | 8 hours | 12 hours | +| 3 | 100% | — | 12+ hours total | + +**When to use:** + +- Low-risk UI changes (copy changes, color updates, layout tweaks) +- Features that have already been fully validated in a staging environment with production-like traffic +- Changes behind a flag that is already partially rolled out and you are expanding +- Rollouts in non-critical environments (internal tools, dev-facing dashboards) +- Re-rolling out a previously rolled-back change after fixing the root cause + +**Why this pattern works for low-risk changes:** The 10% starting point provides immediate meaningful traffic. The 4-hour soak is long enough to detect obvious regressions (error spikes, latency jumps) but short enough to complete the rollout within a business day. This pattern assumes the change has been well-tested and the team has high confidence; the guarded rollout is a safety net, not the primary validation mechanism. + +**Example — copy change rollout:** + +A team is updating button labels and help text across the application behind the `updated-copy-q1` flag. The change is purely cosmetic with no logic changes. At 10%, they soak for 4 hours to confirm no unexpected errors (broken string interpolation, missing translations). At 50%, they soak for 8 hours. Total rollout completes in under a day. + +--- + +## Soak Time Guidelines + +### Why Soak Time Matters + +Soak time is the minimum duration a rollout stage must run before advancing to the next stage. It exists because many failure modes are not immediately apparent: + +- **Latency degradation under load:** A code path may perform well at low concurrency but degrade as connection pools saturate or caches fill. 
+- **Memory leaks:** Gradual memory growth may not trigger alerts for hours but will eventually cause out-of-memory crashes or garbage collection pauses. +- **Downstream cascading failures:** A change that increases load on a downstream service may not cause visible problems until that service's resources are exhausted. +- **Time-dependent bugs:** Code that runs differently based on time of day, day of week, or scheduled jobs may only fail during specific windows. +- **Traffic pattern sensitivity:** A change may work fine during off-peak hours but fail under peak load due to contention, rate limiting, or resource constraints. + +### Minimum Soak Times Per Stage + +These are minimums. Longer soak times are always safer. + +| Risk Level | Early Stages (< 25%) | Mid Stages (25%-50%) | Final Stage (50%-100%) | +|--------------|----------------------|----------------------|------------------------| +| Conservative | 24 hours | 48 hours | 48 hours | +| Standard | 12 hours | 24 hours | 24 hours | +| Aggressive | 4 hours | 8 hours | — | + +### Traffic-Based vs Time-Based Soak + +**Time-based soak** is the most common approach: wait N hours before advancing. It is simple and captures diurnal traffic patterns. Use time-based soak as the default. + +**Traffic-based soak** requires a minimum number of requests (or events, or transactions) before advancing. 
It is useful when: + +- The service has highly variable traffic (e.g., 100x difference between peak and off-peak) +- The service has very low traffic where time-based soak may not produce enough data points +- You need statistical confidence in your metrics (e.g., you need at least 10,000 requests to detect a 0.1% error rate increase) + +**Recommended traffic minimums by stage:** + +| Stage Percentage | Minimum Requests at Stage | Rationale | +|-----------------|--------------------------|-----------| +| 1% | 1,000 | Enough to detect 1%+ error rate increase with confidence | +| 5% | 5,000 | Enough to detect 0.5% error rate increase | +| 10% | 10,000 | Enough to detect 0.3% error rate increase | +| 25%+ | 25,000 | Enough to detect 0.2% error rate increase and measure latency percentiles accurately | + +If your service processes fewer than 100 requests per hour, consider using traffic-based soak with lower minimums, and accept that you will have less statistical confidence. In this case, lean toward the conservative pattern with longer time-based soaks as a supplement. + +### Off-Peak Considerations + +Many failure modes only manifest under peak load. A soak period that runs entirely during off-peak hours (e.g., 2 AM to 6 AM) may miss load-dependent issues. Guidelines: + +- **Minimum soak of 12 hours** ensures you capture at least part of a peak period, regardless of when the stage started. +- **Minimum soak of 24 hours** ensures you capture a full peak-to-peak cycle. +- **Avoid advancing stages at night.** If a stage completes its soak time at 3 AM, wait until business hours to advance so the team is available to respond if the next stage causes issues. +- **Weekend considerations:** If your service has significantly different weekend traffic, consider extending soak times to cover at least one weekday peak if the stage started on a Friday. + +### What "Sufficient Soak" Means + +A stage has been sufficiently soaked when ALL of the following are true: + +1. 
The minimum soak time for the risk level has elapsed. +2. The minimum traffic volume has been reached (if using traffic-based soak). +3. All monitored metrics have remained within their thresholds for the entire soak period (not just at the end). +4. At least one peak traffic period has occurred during the soak (for stages with 12+ hour soak times). +5. No anomalies have been observed that require investigation, even if they did not breach thresholds. + +--- + +## Threshold-Setting Methodology + +Thresholds determine when the system automatically halts or rolls back a rollout. Setting them correctly is the most important part of guarded rollout configuration. Thresholds that are too tight cause false rollbacks that erode trust in the system. Thresholds that are too loose miss real regressions. + +### Error Rate Thresholds + +#### Absolute vs Relative Thresholds + +**Absolute threshold:** "Roll back if error rate exceeds 2%." Simple to understand but ignores the baseline. If the service already has a 1.8% error rate, a 2% threshold gives almost no room for regression detection. + +**Relative threshold (baseline-relative):** "Roll back if error rate exceeds baseline + 0.5 percentage points" or "Roll back if error rate exceeds 2x baseline." Adapts to the current state of the service. + +**Recommendation:** Use baseline-relative thresholds as the primary mechanism, with an absolute ceiling as a safety net. + +#### Recommended Starting Points for Error Rate + +| Service Baseline Error Rate | Recommended Threshold | Absolute Ceiling | +|----------------------------|----------------------|------------------| +| < 0.1% | Baseline + 0.5pp | 1.0% | +| 0.1% - 0.5% | Baseline + 0.5pp | 2.0% | +| 0.5% - 1.0% | Baseline + 1.0pp | 3.0% | +| 1.0% - 2.0% | Baseline × 2.0 | 5.0% | +| > 2.0% | Baseline × 1.5 | 10.0% | + +*pp = percentage points* + +**Example:** A service has a baseline error rate of 0.3%. 
Using the table: threshold = 0.3% + 0.5pp = 0.8%, with an absolute ceiling of 2.0%. If the error rate rises to 0.9%, the rollout halts. If some unrelated issue pushes errors to 2.1%, the rollout also halts (absolute ceiling). + +#### What Counts as an "Error" + +Define this clearly before setting thresholds: + +- HTTP 5xx responses (always) +- HTTP 4xx responses (sometimes — 400 Bad Request may indicate a contract change; 404 may indicate a routing change; 401/403 may indicate an auth regression) +- Application-level error codes returned in 200 responses (common in GraphQL and RPC systems) +- Timeouts (usually yes, as they indicate latency issues) +- Circuit breaker trips on downstream calls (yes, if monitored) + +### Latency Thresholds + +#### p50 vs p99 Thresholds + +**p50 (median) latency** captures the typical user experience. A p50 regression means most users are affected. + +**p99 latency** captures the worst-case user experience. A p99 regression may only affect 1% of users but often indicates a systemic issue (e.g., a slow database query that triggers under specific conditions). + +**Recommendation:** Monitor both. Set thresholds on both. The p99 threshold should be looser than the p50 threshold because p99 is inherently noisier. 
+ +#### Recommended Starting Points for Latency + +**Absolute increase limits:** + +| Baseline Latency (p50) | p50 Threshold | p99 Threshold | +|------------------------|------------------------|--------------------------| +| < 50ms | Baseline + 20ms | Baseline p99 + 100ms | +| 50ms - 200ms | Baseline + 50ms | Baseline p99 + 200ms | +| 200ms - 500ms | Baseline + 100ms | Baseline p99 + 500ms | +| 500ms - 2000ms | Baseline + 200ms | Baseline p99 + 1000ms | +| > 2000ms | Baseline × 1.3 | Baseline p99 × 1.5 | + +**Percentage increase limits (alternative approach):** + +| Risk Level | p50 Increase Limit | p99 Increase Limit | +|--------------|-------------------|-------------------| +| Conservative | 10% | 25% | +| Standard | 20% | 50% | +| Aggressive | 30% | 75% | + +**Example:** A service has baseline p50 of 120ms and p99 of 450ms. Using the absolute increase table: p50 threshold = 120ms + 50ms = 170ms, p99 threshold = 450ms + 200ms = 650ms. Using the percentage table (standard): p50 threshold = 120ms × 1.2 = 144ms, p99 threshold = 450ms × 1.5 = 675ms. The team chooses whichever approach better fits their SLO. + +### Throughput Thresholds + +Throughput thresholds detect traffic drops, which can indicate that requests are failing silently, timing out, or being rejected before they are counted as errors. + +#### Drop Detection + +Set a throughput floor relative to the expected traffic level: + +| Traffic Level | Minimum Throughput Threshold | +|-----------------------|-----------------------------| +| < 100 requests/hour | Baseline × 0.5 (50% drop) | +| 100-1000 requests/hour| Baseline × 0.7 (30% drop) | +| 1000+ requests/hour | Baseline × 0.8 (20% drop) | + +**Important:** Throughput naturally varies with time of day. Use a time-adjusted baseline (compare to the same hour on a previous day) rather than a flat average. 
If your monitoring system does not support time-adjusted baselines, use the lowest normal throughput as your baseline (typically the overnight minimum) and set the threshold below that. + +#### Minimum Traffic Requirements + +Before any threshold evaluation is meaningful, you need enough traffic to measure. Set a minimum traffic requirement per stage: + +- **At least 100 requests** before evaluating error rate thresholds +- **At least 500 requests** before evaluating latency percentile thresholds +- **At least 1 hour of traffic** before evaluating throughput thresholds + +If these minimums are not met within the soak time, extend the soak time rather than advancing without data. + +### How to Set Thresholds from Baseline Data + +Follow this process: + +1. **Collect baseline data** for at least 7 days (to capture day-of-week variation). Use `get-service-metrics` or equivalent to pull error rate, p50 latency, p99 latency, and throughput. + +2. **Calculate summary statistics:** + - Mean error rate over the period + - Maximum error rate during the period (to understand natural spikes) + - Mean and max p50 latency + - Mean and max p99 latency + - Minimum throughput (to understand natural dips) + +3. **Set thresholds above the natural variation:** Your threshold must be above the maximum observed value during normal operation. If the error rate naturally spikes to 0.8% during deployments, setting a threshold at 0.7% will cause false rollbacks. + +4. **Apply the formulas from the tables above** using the mean as the baseline and verifying that the resulting threshold is above the observed maximum. + +5. **Add a safety margin** of 10-20% above the calculated threshold to account for natural variation not captured in the baseline period. 
+ +**Example — full threshold calculation:** + +Baseline data for `checkout-service` over 7 days: +- Error rate: mean 0.4%, max 0.7% (spike during a dependency restart) +- p50 latency: mean 85ms, max 110ms (during peak hours) +- p99 latency: mean 320ms, max 580ms (during peak hours) +- Throughput: mean 5,200 req/hr, min 1,100 req/hr (overnight) + +Threshold calculation: +- Error rate: baseline (0.4%) + 0.5pp = 0.9%. Check: 0.9% > 0.7% (max observed). Good. Add 10% margin: 0.99%, round to 1.0%. +- p50 latency: baseline (85ms) + 50ms = 135ms. Check: 135ms > 110ms (max observed). Good. Add 10% margin: 148ms, round to 150ms. +- p99 latency: baseline (320ms) + 200ms = 520ms. Check: 520ms < 580ms (max observed). Not good — threshold would be breached by normal traffic. Adjust to 580ms + 20% margin = 696ms, round to 700ms. +- Throughput floor: 1,100 req/hr × 0.8 = 880 req/hr. + +### Threshold Adjustment Over Time + +Thresholds are not set-and-forget. Adjust them: + +- **After a rollout completes:** The new baseline may be different. Recalculate for the next rollout. +- **After a false rollback:** Investigate whether the threshold was too tight or the metric was legitimately anomalous. If the threshold was too tight, widen it by 10-20%. +- **After a missed regression:** Investigate whether the threshold was too loose. Tighten it based on the actual impact observed. +- **Seasonally:** Services with strong seasonal patterns (e.g., e-commerce with holiday peaks) may need different thresholds at different times of year. + +--- + +## Rollback Configuration + +### Automatic vs Manual Rollback + +**Automatic rollback** means the system reverts the flag to its pre-rollout state without human intervention when a threshold is breached. This is the recommended default for all guarded rollouts. + +**Manual rollback** means the system alerts the team when a threshold is breached but does not revert automatically. 
The team investigates and decides whether to roll back or adjust thresholds. Use manual rollback only when: + +- The cost of a false rollback is very high (e.g., a data migration that is painful to undo) +- The team has high confidence in their ability to respond quickly to alerts +- The metrics being monitored are known to be noisy and thresholds cannot be set tightly enough to catch real regressions without also triggering false positives + +**Recommendation:** Default to automatic rollback. Switch to manual only with explicit justification. + +### Rollback Triggers + +A rollback is triggered when ANY of the following occur: + +1. **Threshold breach:** Any monitored metric exceeds its threshold for a sustained period (typically 5-10 minutes, not a single data point). +2. **Sustained threshold proximity:** A metric stays at or above 90% of its threshold, without breaching it, for an extended period (e.g., error rate at 0.9% with a 1.0% threshold for 30+ minutes). This may indicate a slow degradation that will eventually breach. +3. **Manual trigger:** A team member manually initiates a rollback based on observations not captured by automated thresholds (e.g., customer reports, qualitative issues). +4. **Dependent service failure:** If a critical dependency goes down during a rollout, it may be prudent to roll back even if the rollout itself is not causing the issue, to reduce variables during incident response. + +### What Rollback Means at Different Stages + +| Stage | Rollback Behavior | +|-------------|-----------------------------------------------------------------------------------| +| 1% - 5% | Revert to 0%. Impact is minimal. Investigate freely. | +| 10% - 25% | Revert to 0%. Moderate number of users may notice the feature disappearing. | +| 50% | Revert to 0%. Significant user impact. Communicate if the feature was visible. | +| 100% | Guarded rollout is complete; rollback is now a manual flag toggle, not part of the guarded rollout. | + +**Important:** Rolling back does not mean the feature is permanently abandoned. 
It means the current rollout attempt has been halted. The team should investigate, fix the root cause, and start a new guarded rollout. + +--- + +## Stage Transition Criteria + +Before advancing from one stage to the next, ALL of the following must be satisfied: + +### Required Criteria + +1. **All metrics within thresholds.** Every monitored metric (error rate, p50 latency, p99 latency, throughput) must be within its configured threshold. Not just "currently within threshold" — it must have been within threshold for the entire soak period (or at least 95% of it, to allow for transient spikes). + +2. **Minimum soak time elapsed.** The stage must have been running for at least the configured soak time. There are no shortcuts. If the soak time is 24 hours and everything looks perfect after 6 hours, you still wait. + +3. **Minimum traffic volume seen.** The stage must have processed enough traffic to make the metric evaluations statistically meaningful. See the traffic minimums in the Soak Time Guidelines section. + +### Recommended Criteria + +4. **No active incidents.** If there is an ongoing incident affecting the service or its dependencies, do not advance. Wait until the incident is resolved and metrics have stabilized. + +5. **Team availability.** Do not advance a stage if the team will be unavailable for the next soak period (e.g., do not advance on a Friday evening if no one monitors over the weekend). + +6. **No anomalies under investigation.** If someone on the team has flagged a metric pattern as "unusual but not threshold-breaching," investigate before advancing. + +--- + +## Common Mistakes + +### Thresholds Too Tight — False Rollbacks + +**Symptom:** The rollout keeps getting rolled back even though there is no real problem. The team starts to distrust the guarded rollout system. 
+ +**Causes:** +- Threshold set below the natural maximum variation of the metric +- Not accounting for diurnal traffic patterns (peak hours have higher latency) +- Not accounting for periodic events (batch jobs, cache refreshes, garbage collection pauses) +- Using a flat baseline instead of a time-adjusted baseline + +**Fix:** Review the baseline data. Look specifically at the maximum observed values during normal operation. Set thresholds above those maximums with a 10-20% margin. + +### Thresholds Too Loose — Missing Real Issues + +**Symptom:** A rollout completes but users report problems that the thresholds did not catch. + +**Causes:** +- Threshold set so far above baseline that only catastrophic regressions are detected +- Monitoring the wrong metrics (e.g., monitoring overall error rate when the regression only affects a specific endpoint) +- Not monitoring enough metrics (e.g., only error rate, missing latency degradation) + +**Fix:** Tighten thresholds to be closer to baseline. Add more specific metrics (per-endpoint, per-operation). Consider adding business metrics (conversion rate, completion rate) alongside technical metrics. + +### Skipping Early Stages + +**Symptom:** The team jumps from 0% to 25% or 50% because "the change is simple." Then a regression affects a large number of users. + +**Why this happens:** Confidence bias. The team tested thoroughly in staging and believes the change is safe. But production has traffic patterns, data distributions, and edge cases that staging does not replicate. + +**Fix:** Always start at a low percentage. Even the aggressive pattern starts at 10%. The early stages are cheap (low traffic, short soak times) and provide disproportionate safety value. + +### Insufficient Soak Time + +**Symptom:** A regression manifests hours or days after advancing past a stage. 
+ +**Causes:** +- Memory leaks that take hours to cause problems +- Cache warming effects that mask latency issues until caches expire +- Downstream services that degrade gradually under increased load +- Time-dependent code paths that only execute during specific hours + +**Fix:** Respect the minimum soak times. For critical services, consider extending soak times beyond the minimums. If you have experienced late-manifesting issues in the past, add extra soak time at the stages where those issues would have been caught. + +### Setting Thresholds Without Baseline Data + +**Symptom:** The team picks "round numbers" for thresholds (1% error rate, 500ms latency) without knowing what the current baseline is. These numbers may be wildly wrong in either direction. + +**Fix:** Always measure the baseline first. Use `get-service-metrics` or your observability platform to collect at least 7 days of data before configuring thresholds. If baseline data is truly unavailable (brand-new service, no historical metrics), use the conservative pattern with wide thresholds and plan to tighten them after the first rollout provides data. + +### Ignoring Throughput Monitoring + +**Symptom:** Error rate and latency look fine, but traffic has silently dropped because requests are being rejected or timing out at a layer not captured by those metrics. + +**Fix:** Always include throughput monitoring. A significant drop in throughput is a signal that something is wrong, even if the requests that do succeed look healthy. + +--- + +## Quick-Reference: Choosing a Pattern + +| Question | Conservative | Standard | Aggressive | +|----------|-------------|----------|------------| +| Does a failure affect money or data integrity? | Yes | — | — | +| Has this exact change been validated with prod-like traffic? | — | — | Yes | +| Is the change purely cosmetic / copy? | — | — | Yes | +| Is this a first-time rollout of new logic? | — | Yes | — | +| Does the team have low confidence in test coverage? 
| Yes | — | — | +| Is the service low-traffic (< 100 req/hr)? | Yes | — | — | +| Does the team need the rollout done today? | — | — | Yes | + +If multiple columns apply, choose the more conservative option. diff --git a/skills/observability/o11y-flag-impact/README.md b/skills/observability/o11y-flag-impact/README.md new file mode 100644 index 0000000..34c447b --- /dev/null +++ b/skills/observability/o11y-flag-impact/README.md @@ -0,0 +1,74 @@ +# LaunchDarkly Flag Impact Analysis Skill + +## Overview + +- Teaches AI agents to correlate feature flag changes with shifts in service metrics (error rate, latency, throughput). +- Guides agents through defining before/after measurement windows around a flag change timestamp. +- Accounts for rollout percentages, confounding factors, and sample size when assessing impact. +- Produces findings with appropriate confidence levels and evidence-based language, avoiding causal claims. + +## Installation (Local) + +Clone this repository and reference the skill from your agent configuration: + +```bash +git clone https://github.com/launchdarkly/agent-skills.git +``` + +Add the skill path to your agent's skill configuration: + +```yaml +skills: + - path: skills/observability/o11y-flag-impact +``` + +## Prerequisites + +- **LaunchDarkly MCP server** with both feature flag and observability tools enabled. +- **MCP tools available:** `get-flag`, `get-service-metrics`, `get-flag-changes`. Optionally `get-metric-baselines`. +- Access to the LaunchDarkly environment where the flag change occurred. + +## Usage + +Ask your agent questions like: + +``` +"Did the new-checkout flag cause the error spike?" +``` + +``` +"What's the impact of rolling out dark-mode to 50%?" +``` + +``` +"Check if the recent flag change affected latency." +``` + +``` +"The cache-bypass flag was toggled off 2 hours ago. Did error rates change?" +``` + +``` +"Compare metrics before and after the search-v2 rollout went from 10% to 100%." 
+``` + +The agent will follow the skill's workflow to identify the flag change, define measurement windows, pull before/after metrics, assess correlation with appropriate confidence, and report findings. + +## Structure + +``` +o11y-flag-impact/ +├── SKILL.md # Skill definition and workflow +├── README.md # This file +└── references/ + └── correlation-methods.md # Detailed reference on correlation techniques +``` + +## Related + +- **[o11y-service-health](../o11y-service-health/)** — Establish baseline service metrics and assess overall service health. Use this skill to understand normal behavior before analyzing flag impact. +- **Feature flag skills** (`../../feature-flags/`) — Manage flag configuration, targeting rules, and lifecycle. Use alongside this skill when you need to modify a flag based on impact findings. + +## License + +Apache-2.0 diff --git a/skills/observability/o11y-flag-impact/SKILL.md b/skills/observability/o11y-flag-impact/SKILL.md new file mode 100644 index 0000000..d685762 --- /dev/null +++ b/skills/observability/o11y-flag-impact/SKILL.md @@ -0,0 +1,103 @@ +--- +name: o11y-flag-impact +description: "Retrospective analysis: compare service metrics from before and after a specific feature flag change to determine whether the flag caused a metric shift. Use when the user asks 'did flag X cause the error spike', 'what was the impact of turning on flag Y', or wants a before/after comparison of metrics around a known flag change event." +license: Apache-2.0 +compatibility: Requires LaunchDarkly MCP server with both feature flag and observability tools enabled. +metadata: + author: launchdarkly + version: "0.1.0" +--- + +# Flag Impact Analysis + +You're using a skill that will guide you through correlating a feature flag change with shifts in service metrics. Your job is to identify the flag change, define measurement windows, pull before/after metrics, assess correlation, and report findings with appropriate confidence. 
+ +## Prerequisites + +- **LaunchDarkly MCP server** with both feature flag and observability tools enabled. +- **Required MCP tools:** + - `get-flag` — fetch flag configuration and change history. + - `get-service-metrics` — fetch error rate, latency, and throughput for a specified time window. + - `get-flag-changes` — list recent flag changes with timestamps. +- **Optional MCP tools:** + - `get-metric-baselines` — fetch historical metric baselines for comparison. + +## Core Principles + +1. **Correlation is Not Causation** — Present evidence, not certainty. Use language like "likely impacted" or "correlates with," never "caused" or "definitely." Your analysis establishes correlation and assesses its strength, but cannot prove causation on its own. + +2. **Windows Must Be Clean** — The before and after measurement windows must not overlap with other changes (deployments, other flag changes, infrastructure events). A contaminated window produces unreliable results. A smaller clean window is always better than a larger contaminated one. + +3. **Account for Rollout Percentage** — A flag at 5% rollout will not produce a 100% metric shift. Scale your expectations to the rollout percentage. If a flag is at 10% rollout and the aggregate error rate rises by 1 percentage point, the error rate among the affected requests has risen by roughly 10 percentage points — the per-request impact is about 10x larger than the aggregate number suggests. Always check the rollout percentage before interpreting metric magnitude. + +4. **Absence of Evidence is Evidence** — If metrics did not change after a flag toggle, that is a valid and useful finding. Report it clearly. A "no impact" result gives the team confidence that the change is safe, or rules out the flag as the cause of an issue under investigation. + +## Workflow + +### Step 1: Identify the Flag Change + +Determine which flag changed, when it changed, and what changed (toggled on/off, rollout percentage changed, targeting rules changed). + +- Use `get-flag-changes` to list recent flag changes, or use `get-flag` if you already know the flag key. 
+- Record the flag key, the environment (e.g., production, staging), and the exact timestamp of the change. +- Note what changed: was it toggled from off to on? Was the rollout percentage increased from 0% to 25%? Were targeting rules modified? +- Confirm the flag, environment, and timestamp with the user before proceeding. + +### Step 2: Define Metric Windows + +Set a "before" window (pre-change) and an "after" window (post-change) of equal duration. + +- Choose window duration based on the service's traffic volume. High-traffic services can use shorter windows (30 min to 1 hour); low-traffic services need longer windows (4 to 24 hours). See [correlation-methods.md](references/correlation-methods.md) for detailed sizing guidance. +- Align the windows with the flag change timestamp. Leave a small buffer (1-5 minutes) around the change to account for propagation delay. +- Check for confounding changes in both windows: other deployments, other flag changes, infrastructure events, traffic pattern shifts. If confounders exist, shrink the windows to exclude them or note them as limitations. + +### Step 3: Pull Before/After Metrics + +Use `get-service-metrics` for both the before and after windows. + +- Collect the following metrics for each window: + - **Error rate** (percentage of requests resulting in errors) + - **Latency** — p50, p95, and p99 + - **Throughput** (requests per second or per minute) +- If possible, scope metrics to the specific service or endpoint that the flag affects, rather than using aggregate service-wide metrics. +- If `get-metric-baselines` is available, pull historical baselines for additional context (e.g., what does this metric normally look like at this time of day?). +- Record the exact time ranges and metric values for your report. + +### Step 4: Assess & Report + +Compare before vs after across all collected metrics. + +- Calculate the absolute and relative change for each metric. +- Account for the rollout percentage. 
If the flag is at 20% rollout and error rate increased by 1 percentage point, the per-affected-request impact is approximately 5x larger than the aggregate number suggests. +- Check confounders one more time. If you find a confounder you missed earlier, note it. +- Classify your finding: + - **Likely impacted** — There is a clear metric shift that aligns with the flag change timing, with no (or minimal) confounders. State the evidence and confidence level. + - **No clear impact** — Metrics are stable across both windows. The flag change does not appear to have affected the measured metrics. + - **Insufficient data** — The window is too short, traffic is too low, or metrics are not available. Recommend waiting and re-analyzing. +- Present your findings with specific numbers: "Error rate increased from 0.12% to 1.45% (12x increase) in the 1-hour window following the flag toggle at 14:00 UTC. No other changes were detected in this window. Confidence: High." +- Include a recommended action: roll back, continue monitoring, increase rollout, or wait for more data. + +## Edge Cases + +| Scenario | How to Handle | +|---|---| +| Flag changed multiple times in short succession | Use the most recent stable state as the "after" starting point. Note the rapid changes and consider whether metrics had time to stabilize between changes. | +| Flag is at very low rollout percentage (< 5%) | Aggregate metrics may not show a detectable shift. Prefer segment-level analysis if available. If not, note that the rollout percentage is too low for reliable aggregate analysis. | +| No clear "before" baseline (flag was changing frequently) | Look for the longest stable period before the change of interest. If no stable period exists, note this as a limitation and lower your confidence. | +| Multiple flags changed simultaneously | Identify all flags that changed in the window. If they affect different services or code paths, you may still be able to isolate impact. 
If they overlap, note that attribution is ambiguous. | +| Metric shift preceded the flag change | The flag change is unlikely to be the cause. Investigate other changes that occurred before the flag change. The flag change may have been a response to the metric shift (e.g., a kill switch). | +| Service has very low traffic | Use longer windows (up to 24 hours). Accept that confidence will be lower. Consider recommending a higher rollout percentage to generate more signal. | + +## What NOT to Do + +- **Do not claim causation.** Your analysis establishes correlation and assesses its strength. It does not prove causation. Always use hedged language. +- **Do not ignore confounding factors.** Failing to check for other changes in the window is the single most common source of incorrect attributions. +- **Do not compare windows of different durations.** A 30-minute before window compared to a 2-hour after window will produce skewed results due to different sample sizes and time-of-day effects. +- **Do not assess impact before sufficient soak time.** If the flag changed 5 minutes ago and the service handles 10 requests per minute, you have 50 data points. Wait for adequate data. +- **Do not ignore rollout percentage when interpreting metric magnitude.** A 0.5% error rate increase with a 2% rollout is a very different signal than a 0.5% increase with a 100% rollout. + +## References + +- [Correlation Methods Reference](references/correlation-methods.md) — detailed guidance on window sizing, rollout-adjusted analysis, confounding factors, statistical considerations, and presenting findings. +- [Service Health Skill](../o11y-service-health/SKILL.md) — use the service health skill for establishing baseline metrics and understanding normal service behavior. +- Feature flag management skills in the `../../feature-flags/` directory — for flag configuration, targeting, and lifecycle management. 
diff --git a/skills/observability/o11y-flag-impact/references/correlation-methods.md b/skills/observability/o11y-flag-impact/references/correlation-methods.md new file mode 100644 index 0000000..45a9fe9 --- /dev/null +++ b/skills/observability/o11y-flag-impact/references/correlation-methods.md @@ -0,0 +1,295 @@ +# Correlation Methods Reference + +This document provides detailed guidance on correlating feature flag changes with shifts in service metrics. It covers window sizing, rollout-adjusted analysis, confounding factor identification, statistical considerations, metric comparison techniques, common correlation patterns, and how to present findings. + +--- + +## Before/After Window Sizing + +The foundation of flag impact analysis is comparing metrics from a "before" window (pre-change) to an "after" window (post-change). Choosing the right window size is critical to producing meaningful results. + +### How to Choose Window Duration + +Window duration depends on the service's traffic volume and the granularity of the metrics you are analyzing. The goal is to capture enough data points in each window to establish a reliable baseline and a reliable post-change measurement. + +- **High-traffic services** (thousands of requests per minute): A 30-minute to 1-hour window on each side is often sufficient. These services generate enough data points quickly that short windows still yield statistically meaningful comparisons. +- **Medium-traffic services** (hundreds of requests per minute): Use 1-hour to 4-hour windows. This gives enough volume to smooth out natural variance while keeping the analysis tight enough to avoid confounders. +- **Low-traffic services** (fewer than 100 requests per minute): You may need 4-hour to 24-hour windows. With low traffic, short windows will have high variance and make it difficult to distinguish a real change from noise. 
+- **Batch or periodic services**: If a service processes requests in batches (e.g., every hour or every day), your window must span at least two full cycles to capture representative behavior. + +As a general rule, each window should contain at least 1,000 data points (requests, events, or metric samples) for basic confidence, and 10,000 or more for high confidence. If your window cannot achieve this, note it as a limitation in your findings. + +### Aligning Windows with the Flag Change Timestamp + +Precise alignment is essential. The flag change timestamp is the dividing line between the before and after windows. + +- Use the exact timestamp from the flag change event, not an approximate time. +- If the flag change was a gradual rollout (e.g., 0% to 10% over 5 minutes), treat the rollout completion time as the dividing line, or alternatively exclude the rollout transition period entirely and start the "after" window once the rollout stabilized. +- Add a small buffer (1-5 minutes) between the flag change timestamp and the start of the "after" window. This accounts for propagation delay — the time it takes for flag changes to reach all service instances. LaunchDarkly SDKs typically pick up changes within seconds to a couple of minutes, but caching, CDN delays, or long polling intervals can extend this. +- The "before" window should end at the flag change timestamp (or a minute before, to avoid the transition). 
+ +Example alignment for a 1-hour window with a flag change at 14:00 UTC: +- Before window: 12:55 to 13:55 UTC +- Buffer: 13:55 to 14:05 UTC (excluded from analysis) +- After window: 14:05 to 15:05 UTC + +### Minimum Window Sizes for Statistical Relevance + +Even if a service has high traffic, extremely short windows introduce noise: + +| Traffic Level | Minimum Before Window | Minimum After Window | Notes | +|---|---|---|---| +| > 5,000 req/min | 15 minutes | 15 minutes | Sufficient data in short windows | +| 1,000-5,000 req/min | 30 minutes | 30 minutes | Standard minimum | +| 100-1,000 req/min | 2 hours | 2 hours | Need longer to accumulate data | +| 10-100 req/min | 6 hours | 6 hours | May still be noisy; note this as a limitation | +| < 10 req/min | 24 hours | 24 hours | Results will have low confidence | + +These are minimums. Larger windows are generally better, as long as they remain clean of confounders. + +### Avoiding Windows That Span Other Changes + +This is the most common source of false correlations. If your "before" window contains a deployment, or your "after" window contains an infrastructure change, the comparison is tainted. + +- Before defining windows, check for other events: deployments, config changes, other flag changes, scaling events, incident remediation actions. +- If a confounding event exists, shrink the window to exclude it, even if this means using a smaller-than-ideal window. A smaller clean window is more valuable than a larger contaminated one. +- If the confounding event cannot be excluded without making the window too small, note it explicitly as a limitation and describe what the confounder was. + +--- + +## Rollout-Adjusted Analysis + +Feature flags are often rolled out gradually. A flag at 10% rollout means only 10% of traffic is seeing the new behavior. This has direct implications for metric analysis. 
+ +### Accounting for Percentage Rollouts + +If a flag is rolled out to X% of traffic, the expected maximum impact on aggregate metrics is proportionally X% of what a full rollout would produce. + +- A flag at 10% rollout that introduces a bug causing 100% error rate for affected users would increase the overall service error rate by roughly 10 percentage points (from, say, 0.1% to approximately 10.1%; more precisely, the weighted average 0.9 × 0.1% + 0.1 × 100% ≈ 10.09%). +- For latency, if the flag path adds 200ms of latency and is at 25% rollout, the aggregate p50 might shift less than 200ms because 75% of requests are unaffected. Percentile metrics (p95, p99) are particularly tricky here — the impact depends on where the affected traffic falls in the latency distribution. + +When analyzing impact at partial rollout: + +1. **Scale your expectations.** If the flag is at 5% rollout, a 50 percentage point error rate increase among affected requests will not appear as a 50-point increase in aggregate metrics. It shows up as roughly a 2.5 percentage point increase in the aggregate error rate, and smaller per-request impacts can fall entirely within normal variance. +2. **Prefer segment-level metrics when available.** If your observability platform can filter metrics by a tag or attribute that corresponds to the flag variant (e.g., a request header, user segment, or context attribute), use that to compare "flag on" vs "flag off" traffic directly. This eliminates the dilution problem entirely. +3. **Calculate the expected aggregate impact.** If you know the rollout percentage and the suspected per-request impact, multiply them to get the expected aggregate impact. If the observed aggregate change matches this expected value, it strengthens the correlation. + +### Segment-Level Analysis + +When possible, segment metrics by the flag's targeting criteria: + +- If the flag targets specific user segments (e.g., beta users, internal employees, a specific region), compare metrics for that segment against the rest. 
+- If the flag uses a percentage rollout, and your observability stack tags requests with the variant they received, compare metrics for variant A vs variant B directly. +- Segment-level analysis is far more powerful than aggregate analysis because it removes the dilution effect and provides a natural control group. + +### Comparing Control vs Treatment Groups + +If you can identify which requests were served with the flag on (treatment) vs off (control), you have the gold standard of flag impact analysis: + +- Compare error rates between the two groups during the same time period. Since both groups are experiencing the same external conditions (traffic patterns, infrastructure state), differences between them are strong evidence of flag impact. +- Check that the groups are comparable in size and composition. If the flag targets a specific segment (e.g., premium users), the control group (non-premium users) may have inherently different behavior. +- Even a small difference between control and treatment, if consistent, can be meaningful — especially because both groups share the same confounders, so those factors cancel out. + +--- + +## Confounding Factor Checklist + +Before attributing a metric shift to a flag change, systematically check for other possible causes. Work through this checklist for every analysis. + +### Other Deployments in the Same Window + +- Check deployment logs, CI/CD pipelines, and release trackers for any code deployments during either the before or after window. +- Even deployments to other services can matter if the service you are analyzing depends on them. +- Database migrations, schema changes, and configuration deployments count as deployments. + +### Other Flag Changes + +- Check the flag change log for other flags that changed in the same environment during either window. +- Pay special attention to flags on the same service or that affect the same code paths. 
+- A common scenario: two flags change within an hour of each other, and the metric shift is attributed to the wrong one. + +### Traffic Pattern Changes + +- Compare overall request volume between the before and after windows. A significant traffic increase or decrease can shift error rates and latency independent of any code or flag change. +- Check for traffic spikes from marketing campaigns, external events, bot traffic, or load testing. +- Time-of-day effects: if the before window was during off-peak hours and the after window spans peak hours (or vice versa), traffic patterns alone could explain metric differences. + +### Infrastructure Changes + +- Check for scaling events (auto-scaling up or down), instance replacements, or cluster changes. +- Check for changes in downstream dependencies: database failovers, cache flushes, CDN changes. +- Network changes: DNS updates, load balancer configuration changes, firewall rule modifications. +- Cloud provider incidents or maintenance windows. + +### External Dependencies + +- Check status pages for third-party services your application depends on (payment processors, identity providers, APIs). +- DNS or certificate changes in external services. +- Rate limiting or quota changes imposed by external providers. + +### Seasonal and Cyclical Effects + +- Day-of-week effects: Monday traffic patterns differ from Saturday patterns. +- Time-of-day effects: early morning vs peak hours. +- Monthly cycles: billing dates, payroll periods. +- Annual cycles: holidays, tax season, back-to-school. +- Compare the after window not just to the immediately preceding before window, but also to the same time period on the previous day or week to check for cyclical patterns. + +### Incident Overlap + +- Check incident management systems for any ongoing or recently resolved incidents. +- An incident that was in progress during the before window (degraded baseline) or after window (external degradation) will skew the comparison. 
+- If the flag change was made as part of incident response (e.g., a kill switch), note that the metric shift may have started before the flag change, and the flag change may have been the remediation rather than the cause. + +--- + +## Statistical Considerations + +### Sample Size Requirements + +Meaningful comparison requires sufficient data in both windows: + +- For error rate analysis: you need enough total requests that the expected number of errors is at least 5-10 in each window. If your baseline error rate is 0.1% and you have 1,000 requests, you expect only 1 error — too few to draw conclusions. +- For latency analysis: at least 100 data points per window for p50, at least 1,000 for p95, and at least 10,000 for p99. Tail percentiles require many more samples to be stable. +- For throughput analysis: at least 10 measurement intervals (e.g., 10 one-minute data points) per window. + +If sample sizes are insufficient, report this explicitly. A finding of "insufficient data" is valid and useful — it tells the team they need to wait longer or increase rollout percentage before drawing conclusions. + +### Significance Thresholds + +In a typical operational context, you are not running a formal statistical test with p-values. However, you should apply judgment about what constitutes a meaningful change: + +- **Error rate**: A change of more than 2x the baseline variance is likely significant. For example, if error rate normally fluctuates between 0.08% and 0.12%, a post-change error rate of 0.25% is clearly significant. A post-change rate of 0.13% is within normal variance. +- **Latency**: For p50, a change of more than 10-20% is usually significant. For p95/p99, higher variance is normal, so look for changes of 30% or more, or compare against the historical range for those percentiles. +- **Throughput**: Changes of more than 10% that are not explained by traffic volume changes warrant investigation. 
+ +When in doubt, compare the magnitude of the observed change against the historical range (min/max over the past week or month). If the post-change value exceeds the historical range, it is likely significant. + +### Effect Size Interpretation + +Not all statistically detectable changes are operationally meaningful: + +- A latency increase from 45ms to 47ms (4.4% increase) may be statistically detectable with large sample sizes but operationally irrelevant. +- Focus on whether the change matters to users or system reliability, not just whether it is detectable. +- Consider the metric in the context of SLOs. If latency increased but is still well within the SLO target, the finding is "detectable but within acceptable bounds." + +### Multiple Comparisons Problem + +When analyzing many metrics simultaneously (error rate, p50, p95, p99, throughput, etc.), some may show apparent changes by chance: + +- If you check 10 metrics, there is a reasonable probability that at least one will show a notable shift purely due to random variance. +- Give more weight to changes that are consistent across related metrics (e.g., p50, p95, and p99 all increased) than to a single metric showing change while others are stable. +- If only one of many metrics shifted, consider whether there is a logical reason the flag would affect that specific metric. + +--- + +## Metric Comparison Techniques + +### Absolute Change + +The simplest comparison: subtract the before value from the after value. + +- Before error rate: 0.5%, After error rate: 1.2%. Absolute change: +0.7 percentage points. +- Useful for communicating impact in concrete terms. "Error rate increased by 0.7 percentage points." +- Less useful when baseline values differ significantly between services or time periods. + +### Relative / Percentage Change + +Express the change as a percentage of the before value. + +- Before error rate: 0.5%, After error rate: 1.2%. Relative change: +140%. 
+- Useful for comparing impact across services with different baselines. +- Can be misleading with very small baselines: going from 0.001% to 0.003% is a 200% increase but operationally trivial. +- Always present relative change alongside absolute values to avoid misinterpretation. + +### Standard Deviation Analysis + +Compare the after-window value against the distribution of values in the before window: + +- Calculate the mean and standard deviation of the metric during the before window. +- Express the after-window value as the number of standard deviations from the before-window mean. +- A shift of more than 2 standard deviations is notable. More than 3 is strong evidence of a real change. +- This approach naturally accounts for the metric's baseline variability. + +### Trend Comparison + +Rather than comparing single aggregate values, compare the trend (slope) of the metric over time: + +- Was the metric already trending upward before the flag change? If so, a higher value in the after window may be a continuation of an existing trend, not a result of the flag change. +- Plot the metric over time across both windows (or describe the trend). Look for a clear inflection point at or near the flag change timestamp. +- A flat trend in the before window followed by an upward trend starting at the flag change is strong evidence of correlation. + +--- + +## Common Correlation Patterns + +### Immediate Step Change + +The metric shifts abruptly at (or very shortly after) the flag change timestamp and remains at the new level. + +- **What it looks like**: Error rate jumps from 0.1% to 2.5% within minutes of the flag toggle and stays at 2.5%. +- **What it suggests**: Strong correlation. The flag change likely introduced a new code path that directly affects the metric. +- **Confidence**: High, especially if the step aligns precisely with the flag change timestamp and there are no confounders. 
+ +### Gradual Degradation + +The metric begins shifting after the flag change and continues to worsen over time. + +- **What it looks like**: Latency is stable at 50ms before the change, then slowly climbs to 60ms over the next hour, then 80ms over the next two hours. +- **What it suggests**: The flag may have introduced a resource leak, cache pollution, connection pool exhaustion, or other progressive issue. +- **Confidence**: Medium to high. The gradual nature makes it harder to pinpoint, but if the degradation starts at the flag change, correlation is likely. Check for other causes of gradual degradation (e.g., growing queue depth, filling disk). + +### Intermittent Impact + +The metric is mostly stable but shows periodic spikes or anomalies after the flag change. + +- **What it looks like**: Error rate is 0.1% in both windows on average, but the after window has several brief spikes to 5% that were not present before. +- **What it suggests**: The flag may have introduced a race condition, timeout sensitivity, or issue that only manifests under certain conditions (specific inputs, high load, certain user segments). +- **Confidence**: Medium. Intermittent issues are harder to attribute definitively. Look for patterns in the spikes (timing, user segment, request type). + +### No-Change Confirmation + +Metrics are stable across both windows with no detectable shift. + +- **What it looks like**: All metrics (error rate, latency, throughput) show the same mean, variance, and trend in both windows. +- **What it suggests**: The flag change did not measurably affect the service's observable behavior within the analysis window. +- **Confidence**: Confidence in a "no impact" finding depends on window size, traffic volume, and rollout percentage. A no-change finding with a 1% rollout and 15-minute windows is weak. A no-change finding with a 100% rollout and 24-hour windows is strong. +- **Important**: This is a valid and valuable finding. 
Report it clearly — it gives the team confidence that the flag change is safe, or that the flag change is not the cause of an issue they are investigating. + +--- + +## Presenting Findings + +### Evidence-Based Language + +Use language that reflects the strength of the evidence without overclaiming: + +- **Strong evidence**: "The error rate increase from 0.1% to 2.3% aligns precisely with the flag change at 14:00 UTC, with no other changes detected in the window. This strongly suggests the flag change is responsible." +- **Moderate evidence**: "Latency increased by 15% in the after window. While the timing aligns with the flag change, a deployment to a downstream service occurred 30 minutes into the after window, which may also be a factor." +- **Weak evidence**: "There is a slight increase in p99 latency (from 450ms to 480ms), but this is within the normal daily variance for this service. The flag change may or may not be related." +- **No evidence**: "Metrics are stable across both windows. The flag change does not appear to have impacted error rate, latency, or throughput." + +Avoid words like "caused," "definitely," or "proven." Prefer "correlates with," "aligns with," "likely contributed to," "consistent with," or "does not appear to have impacted." 
+ +### Confidence Levels + +Assign a confidence level to your finding and explain why: + +| Level | Criteria | Example | +|---|---|---| +| High | Clear metric shift, precise timing alignment, no confounders, adequate sample size, multiple metrics consistent | Error rate doubled exactly at flag change time, 2-hour clean windows, 50k requests per window | +| Medium | Metric shift present but confounders exist, or sample size is marginal, or timing is approximate | Latency increased but a deployment also happened 1 hour later; or only 500 requests in after window | +| Low | Metric shift is within normal variance, or significant confounders, or very small sample size | p99 slightly elevated but within historical range; only 50 requests in after window | +| Insufficient Data | Not enough traffic, window too short, or metrics not available | Service had 12 requests in the after window; no error rate metrics available | + +### Recommended Actions Based on Findings + +Tailor your recommendation to the finding: + +- **High confidence of negative impact**: Recommend rolling back the flag immediately. Provide the specific metrics that support this recommendation. +- **Medium confidence of negative impact**: Recommend either rolling back as a precaution or increasing monitoring and revisiting the analysis after a longer soak period. +- **Low confidence or no clear impact**: Recommend continuing the rollout with monitoring. Suggest a follow-up analysis after the flag reaches a higher rollout percentage or after more time has passed. +- **Positive impact detected**: Note the improvement. Recommend continuing rollout and verifying that the improvement scales with rollout percentage. +- **Insufficient data**: Recommend waiting for more data before making decisions. Specify what conditions would make the analysis viable (e.g., "Re-analyze after 24 hours at 25% rollout"). 
+ +Always include the specific numbers, timestamps, and window definitions in your report so that the findings can be verified or revisited. diff --git a/skills/observability/o11y-log-query/README.md b/skills/observability/o11y-log-query/README.md new file mode 100644 index 0000000..4e9b7ff --- /dev/null +++ b/skills/observability/o11y-log-query/README.md @@ -0,0 +1,70 @@ +# LaunchDarkly Log Query Skill + +## Overview + +- Teaches AI agents to search and filter application logs effectively. +- Guides agents through constructing queries, tracing requests across services, and debugging issues using log data. +- Covers error hunting, request tracing, user-journey reconstruction, and pattern detection. +- Provides detailed reference material on query patterns, filter syntax, and refinement strategies. + +## Installation (Local) + +Add the skill to your agent configuration by referencing the skill directory: + +```yaml +skills: + - path: skills/observability/o11y-log-query +``` + +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled. +- **MCP tools available:** + - `search-logs` — Search and filter logs by service, time, severity, and keywords. + - `get-log-entry` — Fetch full details of a specific log entry. + - `list-services` (optional) — List available services for discovery. + +## Usage + +**Find errors in a specific service:** + +> "Find errors in the checkout service from the last hour" + +The agent will query ERROR-level logs for `checkout-service` with a 1-hour time window, summarize the errors found, and identify patterns or root causes. + +**Trace a request across services:** + +> "Trace request abc-123 across services" + +The agent will search for the request ID across all services, reconstruct the request flow chronologically, and highlight any errors or latency issues in the trace. + +**Investigate current production issues:** + +> "What errors are happening in production right now?" 
+ +The agent will query recent ERROR-level logs across services, identify the most frequent error types, and provide a prioritized summary of active issues. + +**Debug a specific error:** + +> "Why are we getting PAYMENT_DECLINED errors?" + +The agent will search for the error code, analyze affected endpoints and users, check the timeline for when errors started, and suggest possible causes. + +## Structure + +``` +o11y-log-query/ +├── SKILL.md # Skill definition and workflow +├── README.md # This file +└── references/ + └── query-patterns.md # Detailed log query patterns reference +``` + +## Related + +- **o11y-service-health** — Check overall service health using metrics and dashboards. Use when log analysis alone is insufficient. +- **LaunchDarkly MCP Server** — Provides the underlying `search-logs`, `get-log-entry`, and `list-services` tools. + +## License + +Apache-2.0 diff --git a/skills/observability/o11y-log-query/SKILL.md b/skills/observability/o11y-log-query/SKILL.md new file mode 100644 index 0000000..7128ede --- /dev/null +++ b/skills/observability/o11y-log-query/SKILL.md @@ -0,0 +1,120 @@ +--- +name: o11y-log-query +description: "Search and filter application log entries by service, time window, severity, or keyword. Use when the user wants to find a specific error message, trace a request by ID across services, look at recent log output, or search for a pattern in logs. This skill queries log text — use o11y-service-health instead for numeric metrics." +license: Apache-2.0 +compatibility: Requires LaunchDarkly MCP server with observability tools enabled. +metadata: + author: launchdarkly + version: "0.1.0" +--- + +# Log Query + +You're using a skill that will guide you through searching and filtering application logs to find relevant information. Your job is to understand what the user is looking for, construct an effective query, execute it, and interpret the results. 
+ +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled. +- **Required MCP tools:** + - `search-logs` — Search and filter logs by service, time, severity, and keywords. + - `get-log-entry` — Fetch full details of a specific log entry. +- **Optional MCP tools:** + - `list-services` — List available services for discovery and name validation. + +## Core Principles + +1. **Start Broad, Then Narrow** — Begin with wider time windows and fewer filters, then iteratively refine your query. An overly specific initial query may return zero results, leaving you unsure whether the issue exists or your query is wrong. + +2. **Context is King** — A single log line rarely tells the full story. Always look for surrounding context: preceding warnings, related entries from other services, and the sequence of events leading up to an error. Use trace IDs and correlation IDs to expand your view. + +3. **Structured Queries First** — Use structured fields (severity, service, trace ID, error code) before resorting to free-text search. Structured filters are indexed, faster, and produce precise results without false positives. + +4. **Time Windows Drive Cost** — Narrower time windows return faster results and reduce noise. Always apply a time window, and prefer the narrowest window that still captures the relevant context. Wide, open-ended queries are expensive and often overwhelming. + +## Workflow + +### Step 1: Define Search Intent + +Understand what the user is looking for. Determine: + +- **Target service:** Which service or services are involved? +- **Time window:** When did the issue occur? Is it ongoing? +- **Severity level:** Are we looking for errors, warnings, or all log levels? +- **Keywords / IDs:** Does the user have a specific error message, trace ID, request ID, or user ID? + +If the user's intent is ambiguous, ask clarifying questions before constructing the query. Examples of clarifying questions: + +- "Which service are you investigating?" 
+- "Do you have an approximate time when this occurred?" +- "Do you have a trace ID or request ID I can search for?" +- "Are you looking for a specific error, or trying to understand the general health of the service?" + +### Step 2: Construct Query + +Build the query using structured fields first, layering filters from broadest to most specific: + +1. **Service filter** — Set the target service name. Use `list-services` if the exact name is unknown. +2. **Time window** — Apply the appropriate time range based on the user's report. +3. **Severity filter** — Set severity to `ERROR` or higher for error investigation, or `INFO` and above for request tracing. +4. **Keyword / pattern filters** — Add specific error messages, trace IDs, or other keywords. + +See [query-patterns.md](references/query-patterns.md) for common query patterns including error hunting, request tracing, and aggregation strategies. + +### Step 3: Execute & Refine + +Run the query using `search-logs` and evaluate the results. + +**If too many results:** +- Narrow the time window. +- Add or tighten the severity filter. +- Add a keyword or field filter to focus on the specific issue. + +**If too few results (or none):** +- Widen the time window. +- Remove the most restrictive filter. +- Verify the service name spelling (use `list-services`). +- Check if the severity filter is too restrictive (e.g., FATAL-only when the issue logs as ERROR). + +**If tracing a request:** +- Extract the correlation ID or trace ID from the initial results. +- Search for that ID across other services to find related entries. +- Use `get-log-entry` to fetch full details of specific entries that look relevant. + +### Step 4: Interpret Results + +Summarize findings for the user. Identify and communicate: + +- **Error patterns and commonalities** — Are the errors related? Do they share an error code, endpoint, or root cause? +- **Timeline of events** — What happened first? What appears to be the root cause vs. 
cascading failures? +- **Affected services and endpoints** — Which services and endpoints are impacted? +- **Stack traces and error details** — What does the error detail reveal about the failure mechanism? +- **Recommended next steps** — Should the user fix a bug, investigate a dependency, check configuration, or scale a resource? + +If deeper investigation is needed beyond log analysis, suggest using the **o11y-service-health** skill to check overall service status and metrics. + +## Edge Cases + +| Scenario | Approach | +|----------|----------| +| **No results found** | Widen the time window, remove filters, verify the service name with `list-services`. Confirm that logs exist for the target service and time period. | +| **Too many results to be useful** | Add severity, keyword, or field filters. Narrow the time window. Focus on a single service or endpoint. | +| **Log entries are truncated** | Use `get-log-entry` to fetch the full log entry by its ID. Truncation often hides stack traces and error details. | +| **Logs not available for time window** | The logs may be outside the retention period. Inform the user and suggest checking if extended retention or archives are available. | +| **Multiple services involved** | Use trace IDs or correlation IDs to follow the request across services. Start with the service closest to the user (API gateway or edge service) and work inward. | +| **User provides a trace ID** | Search directly for the trace ID across all services. Sort by timestamp to reconstruct the request flow. | +| **User provides an error message** | Use the error message as a keyword filter. If it is long, use the most unique portion to avoid partial match issues. | +| **User describes general symptoms** | Start with a broad error-level query on the likely service. Use facets and aggregation to identify the dominant error patterns, then drill into the most relevant one. 
| + +## What NOT to Do + +- **Don't dump raw logs without interpretation.** Always summarize and contextualize the results. The user wants insights, not a wall of log text. +- **Don't search without a time window.** Open-ended queries are expensive, slow, and return overwhelming amounts of data. Always apply a time constraint. +- **Don't assume log completeness.** Log sampling, ingestion delays, and retention policies mean that not every event is captured. Absence of evidence is not evidence of absence. +- **Don't ignore structured fields in favor of free-text search.** Structured fields are faster and more precise. Only fall back to free-text when structured fields are unavailable or insufficient. +- **Don't stop at the first error.** The first error you find may be a symptom, not the root cause. Look at the timeline to find the earliest error, and check for related entries in upstream services. +- **Don't present timestamps without timezone context.** Always clarify whether timestamps are in UTC or local time to avoid confusion. + +## References + +- [Log Query Patterns Reference](references/query-patterns.md) — Detailed patterns for error hunting, request tracing, aggregation, and query refinement. +- **o11y-service-health** — Use for metric-level investigation when log analysis alone is insufficient to diagnose the issue. diff --git a/skills/observability/o11y-log-query/references/query-patterns.md b/skills/observability/o11y-log-query/references/query-patterns.md new file mode 100644 index 0000000..cadd742 --- /dev/null +++ b/skills/observability/o11y-log-query/references/query-patterns.md @@ -0,0 +1,489 @@ +# Log Query Patterns Reference + +This reference provides detailed patterns and strategies for searching, filtering, and analyzing application logs. Use these patterns to construct effective queries, trace requests across services, and extract meaningful insights from log data. 
+ +--- + +## Common Search Patterns + +### Error Hunting + +Error hunting is the most frequent log query task. The goal is to find specific errors, understand their frequency, and identify root causes. + +**Single Error Investigation** + +When a user reports a specific error message or error code, start with the most unique identifier available. If they provide an exact error message, use it as a keyword filter. If they provide an error code (e.g., `ERR_CONNECTION_REFUSED`, `HTTP 502`), filter on that code combined with a severity filter of `ERROR` or higher. + +A typical error hunting workflow: + +1. Filter by service name and severity `ERROR` or `FATAL`. +2. Apply the narrowest reasonable time window (if the user says "this morning," use the last few hours rather than the full day). +3. Add keyword filters for specific error text if available. +4. Review the returned entries for stack traces, error codes, and contextual fields. +5. Look for patterns: are errors clustered at a specific time? Do they affect a single endpoint or many? + +**Recurring Error Discovery** + +When looking for error patterns rather than a single incident, use a wider time window (24 hours or more) and focus on aggregation. Group errors by message or error code to identify the most frequent issues. Look for errors that started appearing at a specific time, which may correlate with a deployment or configuration change. + +**Error Spike Investigation** + +If metrics or alerts indicate an error spike, query logs around the spike timestamp with a tight window (e.g., 5-10 minutes before and after). Compare error patterns during the spike with a baseline period before the spike to identify what changed. + +### Request Tracing + +Request tracing follows a single request as it flows through one or more services. This is essential for debugging latency issues, partial failures, and unexpected behavior. 
+ +**Single-Service Request Trace** + +For tracing within a single service, search for the request identifier (request ID, session ID, or user ID) within that service's logs. Sort results chronologically to reconstruct the request lifecycle: ingress, processing steps, database calls, external API calls, and response. + +**Multi-Service Distributed Trace** + +For distributed systems, use the correlation ID or trace ID that propagates across service boundaries. Start by finding the initial log entry (often at the API gateway or edge service), extract the trace ID, then search across all services for that trace ID. This reconstructs the full distributed path of the request. + +Key fields to look for when tracing: +- `trace_id` or `traceId`: The top-level identifier for the distributed trace. +- `span_id` or `spanId`: Identifies a specific operation within the trace. +- `parent_span_id`: Links child operations to their parent, allowing tree reconstruction. +- `request_id` or `X-Request-ID`: Often set at the edge and propagated downstream. +- `correlation_id`: Application-specific identifier linking related operations. + +### User-Journey Reconstruction + +Reconstructing a user's journey through the system requires searching by user identifier across services and a broader time window. This pattern is useful when investigating user-reported issues where the exact failing operation is unknown. + +**Approach:** + +1. Identify the user identifier: user ID, email, session ID, or IP address. +2. Set a time window covering the user's reported activity period. +3. Search across all services (or start with the most likely services) for the user identifier. +4. Sort chronologically to reconstruct the sequence of actions. +5. Look for errors, unusual latency gaps, or unexpected paths in the journey. + +**Caveats:** + +- User identifiers may not be present in all log entries, especially in backend services that only receive trace IDs. 
+- PII considerations: user email or IP may be redacted or hashed in logs. +- Session IDs change between sessions, so ensure you have the correct session. + +### Pattern Detection + +Pattern detection involves searching for recurring themes, anomalies, or trends in log data without a specific error in mind. + +**Common Pattern Detection Scenarios:** + +- **Deployment impact:** Compare error rates and types before and after a deployment timestamp. +- **Time-based patterns:** Look for errors that occur at specific times (cron jobs, batch processing, peak traffic hours). +- **Service dependency failures:** Search for timeout or connection errors to identify failing upstream or downstream dependencies. +- **Resource exhaustion:** Look for out-of-memory errors, connection pool exhaustion, disk space warnings, or thread pool saturation messages. +- **Configuration issues:** Search for configuration-related log messages after a config change, looking for parsing errors, missing values, or fallback behavior. + +--- + +## Filter Syntax Concepts + +### Severity / Level Filtering + +Log severity levels follow a standard hierarchy. Filtering by severity is one of the most effective ways to reduce noise. + +**Standard severity levels (from least to most severe):** + +| Level | Use Case | +|----------|--------------------------------------------------------------------------| +| TRACE | Extremely detailed diagnostic information, typically disabled in production | +| DEBUG | Detailed diagnostic information useful during development | +| INFO | General operational messages confirming things are working as expected | +| WARN | Potentially harmful situations or unexpected conditions that are handled | +| ERROR | Error events that allow the application to continue running | +| FATAL | Very severe errors that will likely cause the application to abort | + +**Filtering strategies:** + +- For incident investigation, start with `ERROR` and `FATAL` to see what went wrong. 
+- If errors lack context, widen to include `WARN` to see precursor conditions. +- For request tracing, use `INFO` and above to see the request flow without debug noise. +- Avoid `DEBUG` and `TRACE` in production queries unless absolutely necessary, as they produce enormous volumes. + +When filtering by severity, most systems support "this level and above" semantics. A filter of `severity >= WARN` returns `WARN`, `ERROR`, and `FATAL` entries. + +### Service Filtering + +Service filtering narrows results to logs from specific microservices or application components. + +**Best practices:** + +- Always include a service filter when you know the target service. This dramatically reduces result volume and query cost. +- Use the exact service name as registered in the logging infrastructure. Common naming conventions include `checkout-service`, `checkout_service`, or `CheckoutService`. If unsure, use the `list-services` tool to discover available service names. +- For distributed trace queries, you may need to query multiple services. Start with the known service and expand based on trace results. +- Service names may include environment prefixes or suffixes (e.g., `checkout-service-prod`, `prod.checkout-service`). Be aware of your organization's naming conventions. + +### Time Window Selection + +Time windows are one of the most impactful filters for query performance and relevance. + +**Guidelines for selecting time windows:** + +| Scenario | Recommended Window | +|---------------------------------|-----------------------| +| Known incident with timestamp | 5-15 minutes around the incident | +| "It happened this morning" | Last 4-6 hours | +| "It's been happening recently" | Last 24 hours | +| "It started after the deploy" | Deploy time to now | +| Pattern analysis | 24-72 hours | +| Regression detection | Compare two windows: before and after | + +### Field-Based Filters + +Structured log fields provide precise filtering without the ambiguity of free-text search. 
+ +**Common filterable fields:** + +- `service` or `service_name`: The originating service. +- `level` or `severity`: Log severity level. +- `environment` or `env`: Production, staging, development. +- `host` or `hostname`: The specific host or container. +- `endpoint` or `path`: The HTTP path or RPC method. +- `status_code` or `http_status`: HTTP response status code. +- `method` or `http_method`: HTTP method (GET, POST, etc.). +- `trace_id`, `span_id`, `request_id`: Distributed tracing identifiers. +- `user_id`, `account_id`, `tenant_id`: User and account identifiers. +- `error_code` or `error_type`: Application-specific error classification. +- `duration_ms` or `latency`: Request processing time. + +When a field-based filter is available, always prefer it over free-text search. Field-based filters are indexed, faster, and produce precise results without false positives. + +### Regex Patterns + +Regular expressions provide flexible text matching when field-based filters are insufficient. + +**Common regex patterns for log queries:** + +| Pattern | Purpose | Example | +|---------|---------|---------| +| `error.*timeout` | Error messages containing "timeout" | Matches "error: connection timeout after 30s" | +| `status[_]?[Cc]ode[=: ]+5\d{2}` | 5xx status codes | Matches "status_code=503" or "statusCode: 500" | +| `\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b` | UUIDs | Matches trace IDs, request IDs | +| `\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}` | IPv4 addresses | Matches "192.168.1.1" | +| `(OOM\|OutOfMemory\|out of memory)` | Memory exhaustion | Matches various OOM message formats | +| `(Exception\|Error\|Traceback)` | Exception indicators | Catches Java, Python, Go error patterns | +| `latency[=: ]+\d{4,}` | High latency (4+ digit ms) | Matches entries with latency >= 1000ms | + +**Regex performance considerations:** + +- Regex filters are computationally expensive.
Always combine them with other filters (service, time window, severity) to reduce the data scanned. +- Anchor patterns when possible (`^` for start, `$` for end) to improve matching performance. +- Avoid overly broad patterns like `.*` at the start of a regex, which force scanning entire log lines. +- Prefer field-based filters over regex when the data is structured. + +--- + +## Time Window Strategies + +### Narrow Windows for Known Incidents + +When you have a specific timestamp for an incident (from an alert, user report, or metric spike), use a narrow window centered on that timestamp. + +**Strategy:** + +1. Start with a 5-minute window around the reported time. +2. If insufficient context is found, expand to 15 minutes, then 30 minutes. +3. Look for the earliest error in the window, as this is often the root cause. Later errors may be cascading failures. + +**Why narrow windows matter:** + +- Faster query execution, especially on large-scale logging infrastructure. +- Less noise from unrelated log entries. +- Easier to identify the sequence of events around the incident. +- Lower cost in systems that charge per data scanned. + +### Wide Windows for Pattern Detection + +When investigating trends, recurring issues, or trying to establish baselines, use wider time windows. + +**Strategy:** + +1. Start with 24 hours for recent patterns. +2. Use 72 hours or 7 days for trend analysis. +3. Compare equal-duration windows (e.g., this week vs. last week) for regression detection. +4. Be prepared to use aggregation rather than reading individual entries, as wide windows may return millions of log lines. + +### Relative vs. Absolute Times + +**Relative times** (e.g., "last 1 hour", "last 24 hours") are useful for: +- Ongoing investigations where "now" is the reference point. +- Dashboards and saved queries that should always show recent data. +- Quick ad-hoc queries during incident response. 
+ +**Absolute times** (e.g., "2024-01-15 14:30:00 to 2024-01-15 15:00:00") are useful for: +- Post-incident reviews where the incident window is known. +- Comparing specific time periods. +- Reproducible queries that return the same results regardless of when they run. +- Sharing queries with colleagues who may run them at different times. + +### Timezone Considerations + +Timezone mismatches are a common source of confusion in log queries. + +**Best practices:** + +- Determine what timezone your logging infrastructure uses. Most systems store timestamps in UTC. +- When a user reports a time, clarify their timezone. "It happened at 2pm" could be any timezone. +- Convert user-reported times to UTC (or the system's native timezone) before querying. +- Be aware of daylight saving time transitions, which can cause ambiguous or missing hours. +- When presenting results to users, convert timestamps back to their local timezone for readability. + +--- + +## Structured vs. Unstructured Log Approaches + +### JSON / Structured Logs + +Structured logs emit each entry as a JSON object (or other structured format) with well-defined fields. + +**Example structured log entry:** +```json +{ + "timestamp": "2024-01-15T14:32:01.456Z", + "level": "ERROR", + "service": "checkout-service", + "trace_id": "abc123def456", + "span_id": "span789", + "message": "Payment processing failed", + "error_code": "PAYMENT_DECLINED", + "user_id": "user-42", + "endpoint": "/api/v1/checkout", + "duration_ms": 1523, + "error_details": { + "provider": "stripe", + "decline_code": "insufficient_funds" + } +} +``` + +**Query advantages of structured logs:** + +- Every field is directly filterable: `error_code = "PAYMENT_DECLINED"` is precise and fast. +- Aggregations are straightforward: count by `error_code`, average `duration_ms`, group by `endpoint`. +- No parsing required at query time, reducing CPU cost and latency. +- Fields are typed, so numeric comparisons work correctly (`duration_ms > 1000`). 
+- Nested fields provide rich context without cluttering the message. + +### Free-Text / Unstructured Logs + +Unstructured logs are human-readable text strings, often from legacy systems or simple logging configurations. + +**Example unstructured log entry:** +``` +2024-01-15 14:32:01.456 ERROR [checkout-service] [trace:abc123def456] Payment processing failed for user user-42 on /api/v1/checkout (1523ms): PAYMENT_DECLINED - insufficient_funds (provider: stripe) +``` + +**Challenges with unstructured logs:** + +- Extracting fields requires regex or parsing at query time. +- Inconsistent formatting across services makes universal queries difficult. +- Free-text search may produce false positives (searching for "error" matches "error" in any context, including log messages about error handling working correctly). +- Aggregation requires field extraction first, which is slower and less reliable. + +### Extracting Fields from Unstructured Logs + +When working with unstructured logs, you can extract fields using patterns: + +- **Regex extraction:** Define capture groups to pull out specific values. For example, `\[trace:([a-f0-9]+)\]` extracts the trace ID from the bracket notation. +- **Delimiter-based parsing:** Split on known delimiters (spaces, pipes, brackets) and reference fields by position. +- **Grok patterns:** Some logging platforms support Grok (named regex patterns) for common log formats like Apache access logs, syslog, etc. + +**Tips for querying unstructured logs:** + +1. Identify the most unique and specific text to search for. Prefer error codes over generic words. +2. Use surrounding context (brackets, equals signs, quotes) to reduce false positives. Search for `[ERROR]` instead of just `ERROR`. +3. Combine multiple keywords with AND semantics to narrow results. For example, `"timeout" AND "checkout-service"`. +4. If you need a field value, use regex extraction on the results rather than trying to filter by extracted values at scale. 
+ +--- + +## Request Tracing Patterns + +### Correlation IDs + +Correlation IDs are application-generated identifiers that link related operations. They differ from distributed trace IDs in that they are often business-meaningful (e.g., order ID, transaction ID) rather than randomly generated. + +**Using correlation IDs for tracing:** + +1. Identify the correlation ID from the user's report, an alert, or an initial log query. +2. Search for the correlation ID across all relevant services. +3. Sort results by timestamp to reconstruct the sequence. +4. Pay attention to service transitions (where one service hands off to another) as these are common failure points. + +**Common correlation ID fields:** +- `order_id`, `transaction_id`: Business-level identifiers. +- `request_id`, `X-Request-ID`: HTTP-level identifiers set at the edge. +- `session_id`: Groups all operations within a user session. +- `job_id`, `batch_id`: Identifiers for background processing. + +### Trace IDs and Span IDs + +In distributed tracing systems (OpenTelemetry, Jaeger, Zipkin), every request gets a trace ID, and each operation within the request gets a span ID. + +**Trace reconstruction from logs:** + +1. Find the trace ID from an initial log entry, alert, or trace UI. +2. Query all logs with that trace ID across all services. +3. Use span IDs and parent span IDs to build the trace tree: + - The root span has no parent span ID (or a null/empty parent). + - Each span's `parent_span_id` points to its parent operation. + - Spans within the same service form a local tree; spans across services link through parent references. +4. Sort by timestamp within each service, and use span relationships to understand the cross-service flow. + +**What to look for in a trace:** + +- **Error spans:** Spans where the log level is ERROR or where an error field is present. +- **Slow spans:** Spans with unusually high `duration_ms`, indicating latency bottlenecks. 
+- **Missing spans:** Gaps in the expected sequence may indicate dropped requests, services not emitting logs, or log ingestion delays. +- **Retry spans:** Multiple spans for the same operation indicate retries, which may point to flaky dependencies. + +### Reconstructing Distributed Traces from Logs + +When a dedicated tracing system is not available, or when traces are incomplete, you can reconstruct request flows from log entries. + +**Step-by-step approach:** + +1. Start at the entry point (API gateway, load balancer, or edge service). +2. Find the log entry for the incoming request. Extract any IDs: request ID, trace ID, user ID. +3. Search downstream services for that ID. If no common ID exists, use correlated timestamps and endpoint information. +4. For each service, identify the inbound log (request received) and outbound log (response sent or downstream call made). +5. Build a timeline showing the request flow, processing time at each service, and any errors encountered. + +**When traces are incomplete:** + +- Check for log sampling: some services may sample logs, dropping entries for certain requests. +- Check for log ingestion delays: logs from different services may arrive at different times. +- Check for log retention: if the incident is old, some logs may have been purged. +- Fall back to correlating by timestamp and endpoint when IDs are not available, but acknowledge the lower confidence of this approach. + +--- + +## Aggregation Patterns + +### Error Counts by Type + +Counting errors by type helps prioritize investigation. The most frequent errors are not always the most important, but they are a good starting point. + +**Approach:** + +1. Query all ERROR-level logs for the target service and time window. +2. Group by error code, error type, or error message. +3. Count the occurrences in each group. +4. Sort by count descending to see the most frequent errors. +5. Look for errors with counts significantly above the baseline. 
+ +### Top Errors + +Identifying the top errors gives a quick health overview of a service. + +**Useful top-error views:** + +- **Top errors by count:** Most frequent errors. +- **Top errors by affected users:** Errors impacting the most unique users. A low-count error affecting many users may be more important than a high-count error from a single retry loop. +- **Top new errors:** Errors that appeared for the first time in the query window, which may indicate a new bug from a recent deployment. +- **Top errors by service:** Across all services, which service is producing the most errors? + +### Frequency Analysis + +Frequency analysis reveals temporal patterns in log data. + +**Patterns to look for:** + +- **Spike analysis:** Sudden increases in error count at a specific time. Correlate with deployments, config changes, or external events. +- **Periodic patterns:** Errors that occur at regular intervals may be caused by cron jobs, batch processing, or scheduled tasks. +- **Gradual increase:** A slowly rising error rate may indicate resource exhaustion (memory leaks, disk filling up, connection pool depletion). +- **Step function:** A sudden permanent increase in error rate starting at a specific time, likely caused by a deployment or configuration change. + +### Rate Calculations from Logs + +Calculating error rates from logs provides a useful signal when dedicated metrics are not available. + +**Basic rate calculation:** + +1. Count all log entries (or all request-related entries) in a time window. +2. Count error-level entries in the same window. +3. Error rate = (error count / total count) * 100. + +**Caveats:** + +- Log sampling affects rate accuracy. If only 10% of requests are logged, the rate is an estimate. +- Different services may log at different verbosity levels, making cross-service rate comparisons unreliable. +- Request logs and error logs may not have a 1:1 relationship. 
A single request may generate multiple error log entries (e.g., retry attempts). + +--- + +## Query Refinement Strategies + +### Start Broad, Then Narrow + +The most reliable approach to log queries is iterative refinement. + +**Step-by-step refinement:** + +1. **Broadest query:** Service + time window only. Review the result count and a sample of entries to understand the data volume and format. +2. **Add severity:** Filter to WARN and above (or ERROR and above) to focus on problems. +3. **Add keywords:** Include specific error messages, codes, or identifiers from the user's report. +4. **Add field filters:** If initial results reveal useful fields (endpoint, user ID, host), add those as filters. +5. **Narrow time window:** Once you find relevant entries, narrow the time window around them for detailed analysis. + +This approach avoids the common pitfall of starting with an overly specific query that returns no results, leaving you unsure whether the issue doesn't exist or your query is wrong. + +### Using Facets + +Facets (also called field value distributions) show the top values for each field in your result set. + +**How facets help refinement:** + +- **Service facet:** Shows which services are contributing the most entries. If an unexpected service appears, it may indicate a cascading failure. +- **Severity facet:** Shows the distribution of severity levels. A high ratio of ERROR to INFO may indicate a problem. +- **Endpoint facet:** Shows which endpoints are most active in the results. Focus on the top endpoint for targeted investigation. +- **Host facet:** Shows which hosts are contributing. If a single host dominates the error count, it may be a host-specific issue (bad deploy, hardware problem, noisy neighbor). +- **Error code facet:** Shows the distribution of error types, helping prioritize which error to investigate first. + +### Iterative Filtering + +Iterative filtering is the process of using results from one query to inform the next. 
+ +**Common iteration patterns:** + +1. **Error to trace:** Find an error entry, extract its trace ID, query for the full trace. +2. **Trace to service:** Identify which service in the trace is the origin of the failure, then query that service's logs for more detail. +3. **Service to host:** If errors are concentrated in one service, check if they are further concentrated on a specific host. +4. **Host to time:** If a specific host is problematic, look at its error timeline to identify when the issue started. +5. **Time to cause:** Once you know when the issue started, correlate with deployment logs, config changes, or external dependency status. + +### Handling High-Cardinality Fields + +High-cardinality fields (fields with many unique values, like user ID, request ID, or IP address) require special handling. + +**Challenges:** + +- Grouping by high-cardinality fields produces too many groups to be useful. +- Querying by a specific high-cardinality value is fine, but exploring all values is impractical. +- Some logging systems have performance limitations with high-cardinality group-by operations. + +**Strategies:** + +- **Don't group by high-cardinality fields** for exploratory analysis. Instead, group by low-cardinality fields (service, endpoint, error code, severity) first. +- **Use high-cardinality fields for targeted queries** when you have a specific value (e.g., a specific user ID from a support ticket). +- **Sample first:** If you need to understand patterns in high-cardinality data, sample a subset of entries and analyze them manually. +- **Use derived fields:** Instead of grouping by full user ID, group by user segment, account type, or geographic region if those fields are available. +- **Apply aggregation limits:** If your query platform supports it, limit group-by results to the top N values to avoid overwhelming the system and your analysis. + +--- + +## Platform-Agnostic Tips + +These tips apply regardless of your specific logging platform: + +1. 
**Learn your platform's query language.** Every logging system has its own syntax for filters, aggregations, and transformations. Invest time in learning it to write efficient queries. +2. **Use saved queries.** For common investigation patterns, save queries as templates that can be quickly customized with specific values (time window, service name, trace ID). +3. **Understand your platform's limitations.** Know the maximum query time range, result size limits, sampling behavior, and cost model. +4. **Check log ingestion delay.** Most logging systems have a delay between when a log is emitted and when it is queryable. This delay can range from seconds to minutes. If investigating a very recent event, wait a few minutes or account for the delay in your time window. +5. **Be aware of log retention policies.** Logs older than the retention period are not available. If investigating a historical incident, verify that logs from that period are still retained. +6. **Document your investigation.** When you find something important, note the query that found it, the relevant log entries, and your interpretation. This helps with post-incident reviews and knowledge sharing. diff --git a/skills/observability/o11y-regression-detect/README.md b/skills/observability/o11y-regression-detect/README.md new file mode 100644 index 0000000..1f55d1e --- /dev/null +++ b/skills/observability/o11y-regression-detect/README.md @@ -0,0 +1,71 @@ +# LaunchDarkly Regression Detection Skill + +## Overview + +- Teaches AI agents to monitor services during feature flag rollouts and detect metric regressions in real time. +- Uses consecutive-check patterns (e.g., 3-of-5) to confirm regressions and avoid false positives from transient metric spikes. +- Classifies regression severity into normal, warning, and critical levels to enable proportional responses. 
+- Signals regressions with full context (metric, current value, baseline, threshold, breach count) so operators or orchestrating agents can make informed decisions. + +## Installation (Local) + +Add the skill to your agent configuration by referencing the skill directory: + +```yaml +skills: + - path: skills/observability/o11y-regression-detect +``` + +Or copy the skill directory into your project's skills folder: + +```bash +cp -r skills/observability/o11y-regression-detect /path/to/your/project/skills/ +``` + +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled. +- MCP tools available: `get-service-metrics`, `get-metric-baselines`. +- Optional MCP tools: `get-flag`, `list-services`. + +## Usage + +Invoke the skill by asking your agent to monitor a service during a rollout. Examples: + +``` +"Watch the checkout service during this rollout and alert me if metrics regress." +``` + +``` +"Monitor error rates while we roll out the new payment flow. Use a 2% threshold." +``` + +``` +"Alert me if latency crosses 500ms during the dark-mode rollout." +``` + +``` +"Monitor the search service for the next 20 minutes — check every 2 minutes and flag any throughput drops." +``` + +The agent will define monitoring parameters, establish a baseline, run checks at the configured interval, and signal if a regression is detected. + +## Structure + +``` +o11y-regression-detect/ +├── SKILL.md # Skill definition and workflow +├── README.md # This file +└── references/ + └── detection-strategies.md # Detailed reference on thresholds, consecutive-check patterns, severity, baselines, and false positive management +``` + +## Related + +- [Service Health Check Skill](../o11y-service-health/) — Assess overall service health and establish metric baselines before starting regression detection. +- [Guarded Rollout Skill](../../guarded-rollouts/guarded-rollout-create/) — Automate rollback decisions based on regression detection signals from this skill. 
+- [Flag Impact Analysis Skill](../o11y-flag-impact/) — Analyze the post-rollout impact of a feature flag on service metrics. + +## License + +Apache-2.0 diff --git a/skills/observability/o11y-regression-detect/SKILL.md b/skills/observability/o11y-regression-detect/SKILL.md new file mode 100644 index 0000000..ed96745 --- /dev/null +++ b/skills/observability/o11y-regression-detect/SKILL.md @@ -0,0 +1,116 @@ +--- +name: o11y-regression-detect +description: "Continuously poll a service's metrics at repeated intervals and alert when a regression threshold is breached multiple consecutive times. Use when the user asks to 'watch', 'monitor', or 'keep an eye on' a service during a rollout. This skill performs ongoing detection over time — use o11y-service-health for a one-time metric snapshot, or guarded-rollout-create to configure automated rollback." +license: Apache-2.0 +compatibility: Requires LaunchDarkly MCP server with observability tools enabled. +metadata: + author: launchdarkly + version: "0.1.0" +--- + +# Regression Detection + +You're using a skill that will guide you through monitoring a service during a rollout to detect metric regressions. Your job is to define what to monitor and at what thresholds, establish a baseline, monitor at intervals, and signal when intervention may be needed. + +This skill is often used alongside the [guarded rollout skill](../../guarded-rollouts/guarded-rollout-create/SKILL.md) to automate rollback when regressions are detected, and it builds on the [service health check skill](../o11y-service-health/SKILL.md) for baseline data and health assessment. + +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled. +- **Required MCP tools:** + - `get-service-metrics` — fetch current metric values for a service (error rate, latency, throughput). + - `get-metric-baselines` — retrieve historical baseline data for a service's metrics. 
+- **Optional MCP tools:** + - `get-flag` — check the current rollout state of a feature flag (useful for correlating regressions with rollout percentage). + - `list-services` — enumerate available services if the user does not specify one. + +## Core Principles + +1. **Signal, Don't Decide** — This skill detects regressions and reports them. It does not automatically roll back, disable flags, or take corrective action. The human operator or an orchestrating agent (such as the guarded rollout skill) decides what to do with the signal. + +2. **Consecutive Checks, Not Single Points** — Never signal a regression based on a single data point. Metrics are noisy; transient spikes are common. Always require multiple consecutive threshold breaches before classifying a regression as confirmed. See [detection-strategies.md](references/detection-strategies.md) for the consecutive-check patterns (3-of-5, N-of-M, and immediate-critical exceptions). + +3. **Baselines Before Monitoring** — You must establish what "normal" looks like before you can detect "abnormal." Without a baseline, thresholds are arbitrary and regression detection is unreliable. Always gather baseline data before starting the monitoring loop. + +4. **Severity Levels Matter** — Distinguish between warning (a metric is approaching its threshold or has a single isolated breach) and critical (a sustained breach confirmed by consecutive checks). This distinction enables proportional responses: warnings prompt attention, criticals prompt action. + +## Workflow + +### Step 1: Define Monitoring Parameters + +Determine the following before starting: + +- **Which service** to monitor. If the user does not specify, use `list-services` to identify options and ask. +- **Which metrics** to monitor. Common choices: error rate, latency p99, throughput (requests per second). Default to all three unless the user specifies otherwise. +- **Thresholds** for each metric. 
The user may provide these directly (e.g., "alert if error rate exceeds 2%"). If not provided: + - Use `get-metric-baselines` to retrieve historical data. + - Apply the relative threshold methodology from [detection-strategies.md](references/detection-strategies.md): 200% of baseline for error rate, 150% for latency, 70% floor for throughput. +- **Check interval.** Default: every 1 minute. The user may request a different interval. +- **Monitoring duration.** Default: 30 minutes. The user may specify a different duration. +- **Consecutive-check pattern.** Default: 3-of-5 (3 breaches in 5 checks to trigger critical). The user may adjust sensitivity. + +Present the monitoring plan to the user for confirmation before proceeding. + +### Step 2: Establish Baseline + +Use `get-metric-baselines` to retrieve baseline data for the service's monitored metrics. If baselines are unavailable, use `get-service-metrics` to sample current (pre-rollout) values as the baseline. + +Record for each metric: +- Baseline value (mean or median from historical data). +- Computed threshold (baseline value adjusted by the threshold multiplier). +- Warning level (80% of the distance from baseline to threshold). + +**Important:** If the baseline itself looks unhealthy (e.g., error rate already elevated), flag this to the user before starting monitoring. Monitoring against an unhealthy baseline will mask further regressions. + +### Step 3: Monitor at Intervals + +Execute the monitoring loop: + +1. **At each check interval**, call `get-service-metrics` to retrieve current values for each monitored metric. +2. **Compare** each metric against its threshold. +3. **Track** consecutive breaches using the configured pattern (default 3-of-5). +4. **Classify** the current overall status: + - **Normal** — All metrics within thresholds. No breaches in the consecutive-check window. 
+ - **Warning** — At least one threshold breach has occurred in the consecutive-check window, or a metric has crossed its warning level (80% of the distance from baseline to threshold), but the consecutive-check pattern is not yet satisfied. + - **Critical** — The consecutive-check pattern is satisfied for at least one metric (e.g., 3 of the last 5 checks breached the threshold). Alternatively, an immediate-critical threshold was breached (e.g., error rate > 50%). +5. **Report** the status at each check. Include: check number, current metric values, threshold status (normal/breach), cumulative breach count, and overall status (normal/warning/critical). + +Continue until the monitoring duration is reached or a critical status is signaled. + +### Step 4: Signal When Needed + +When the overall status reaches **critical**, signal the regression clearly. The signal must include: + +- **Which metric** regressed (e.g., "error rate"). +- **Current value** vs. **baseline** (e.g., "current: 1.8%, baseline: 0.3%"). +- **Threshold breached** (e.g., "threshold: 0.6%, which is 200% of baseline"). +- **Number of consecutive breaches** (e.g., "3 of the last 5 checks exceeded the threshold"). +- **Recommended action** — Suggest investigation and, if appropriate, consideration of a rollback. Do not perform the rollback; let the user or orchestrating agent decide. + +If monitoring completes without reaching critical status, report that the service remained healthy throughout the monitoring period, along with a summary of any warnings observed. + +## Edge Cases + +| Scenario | How to Handle | +|----------|--------------| +| Baseline itself is unhealthy | Alert the user before monitoring begins. Ask whether to proceed with the current baseline or wait for the service to recover. If proceeding, note that regressions relative to an already-degraded baseline may understate the true impact. | +| Metrics become unavailable during monitoring | Report the data gap. Do not count unavailable checks as breaches or as healthy.
If metrics are unavailable for more than 3 consecutive checks, pause monitoring and alert the user. | +| Traffic drops to zero | Suspend regression detection for error rate and latency (these metrics are meaningless without traffic). Report the throughput drop as a critical signal — zero traffic may indicate a complete outage. | +| Threshold breach is transient | This is handled by the consecutive-check pattern. A single transient breach will appear as a warning but will not escalate to critical unless subsequent checks also breach. | +| Multiple metrics breach simultaneously | Escalate overall severity. If two or more metrics are in breach at the same time, this increases confidence that the regression is real and may warrant a more urgent signal. | +| User wants to adjust thresholds mid-monitoring | Allow it. Update the threshold values and reset the consecutive-check window for affected metrics. Log the threshold change in the monitoring report. | + +## What NOT to Do + +- **Don't automatically roll back.** Regression detection signals; the [guarded rollout skill](../../guarded-rollouts/guarded-rollout-create/SKILL.md) or the human operator decides whether to roll back. +- **Don't signal on a single data point.** Always use consecutive-check patterns. The only exception is immediate-critical thresholds (e.g., error rate > 50%). +- **Don't monitor without baselines.** Thresholds derived without baseline data are arbitrary and will produce unreliable results. +- **Don't ignore throughput drops.** A drop in throughput can mask other metrics — if no requests are flowing, error rate may appear low even though the service is failing. +- **Don't continue monitoring indefinitely.** Always define a monitoring duration. If the duration expires without a critical signal, report success and stop. Open-ended monitoring wastes resources and attention. 
+ +## References + +- [Detection Strategies Reference](references/detection-strategies.md) — Detailed coverage of threshold types, consecutive-check patterns, severity classification, rolling baselines, monitoring intervals, and false positive management. +- [Service Health Check Skill](../o11y-service-health/SKILL.md) — Use for establishing baselines and assessing overall service health before starting regression detection. +- [Guarded Rollout Skill](../../guarded-rollouts/guarded-rollout-create/SKILL.md) — Use alongside this skill to automate rollback decisions when regressions are detected. +- [Flag Impact Analysis Skill](../o11y-flag-impact/SKILL.md) — Use after a rollout completes to analyze the overall impact of a feature flag on service metrics. diff --git a/skills/observability/o11y-regression-detect/references/detection-strategies.md b/skills/observability/o11y-regression-detect/references/detection-strategies.md new file mode 100644 index 0000000..a7d360d --- /dev/null +++ b/skills/observability/o11y-regression-detect/references/detection-strategies.md @@ -0,0 +1,486 @@ +# Detection Strategies Reference + +This reference provides detailed guidance on how to detect metric regressions during service rollouts. It covers threshold types, consecutive-check patterns, severity classification, rolling baseline calculation, monitoring intervals, and false positive management. Use this document when configuring regression detection parameters or when you need to understand why a particular detection approach was chosen. + +--- + +## Threshold Types + +Thresholds define the boundary between "acceptable" and "regressed" for a given metric. Choosing the right threshold type is critical — too sensitive and you drown in false positives, too lenient and you miss real regressions. + +### Static Thresholds + +Static thresholds use a fixed, absolute value as the boundary. The metric either exceeds the value or it does not. + +**How it works:** You define a hard number. 
For example, "error rate must not exceed 2%" or "p99 latency must stay below 800ms." + +**Example configuration:** + +| Metric | Static Threshold | Unit | +|----------------|-----------------|---------| +| Error rate | 2.0 | percent | +| Latency p99 | 800 | ms | +| Throughput | 500 | req/s | + +**When to use static thresholds:** + +- You have well-understood, stable services with predictable metric ranges. +- Regulatory or SLA requirements define hard limits (e.g., "error rate must never exceed 1% per our SLA"). +- The metric has a clear physical or business limit that does not change over time. + +**Advantages:** + +- Simple to understand and explain. "The threshold is 2%. We crossed it." +- Easy to implement. No baseline computation required. +- Deterministic — the same metric value always produces the same result. + +**Disadvantages:** + +- Does not adapt to changing baselines. If your service normally runs at 0.1% error rate and then grows to normally run at 0.5% due to increased traffic patterns, a 2% static threshold stays fixed even though "normal" has shifted, and a regression from 0.5% to 1.5% (a 3x increase) would still not trigger it. +- Requires manual tuning. Someone must pick the right number, and that number may become stale. +- Poor for metrics with high natural variance (e.g., latency during peak vs. off-peak hours). + +**Concrete example:** A checkout service has an SLA requiring error rates below 1%. You set a static threshold at 1%. During a rollout, error rate climbs from 0.1% to 0.8% — a significant 8x regression — but the static threshold does not fire because 0.8% < 1%. The regression goes undetected until a customer complains. This illustrates the limitation: the static threshold protects the SLA but does not detect meaningful regressions within the SLA boundary. + +### Relative Thresholds + +Relative thresholds define a boundary as a percentage increase above the current baseline. The threshold moves as the baseline moves.
+ +**How it works:** You compute a baseline (e.g., the average error rate over the last 24 hours is 0.3%) and then define a threshold as "baseline + X%" or "baseline * multiplier." For example, "error rate must not exceed 200% of baseline" means the threshold is 0.6% when the baseline is 0.3%. + +**Example configuration:** + +| Metric | Baseline | Relative Threshold | Computed Threshold | +|----------------|----------|--------------------|--------------------| +| Error rate | 0.3% | 200% of baseline | 0.6% | +| Latency p99 | 250ms | 150% of baseline | 375ms | +| Throughput | 1200 req/s | 70% of baseline (min) | 840 req/s | + +Note that for throughput, the threshold is a _minimum_ — you are detecting drops, not increases. + +**When to use relative thresholds:** + +- Most common use case. This is the default recommendation for regression detection. +- The service has a reasonably stable baseline that may shift over time. +- You want to detect meaningful changes relative to what the service normally does, not just absolute limit violations. + +**Advantages:** + +- Adapts to baseline changes. If the baseline shifts from 0.3% to 0.5% error rate over weeks, the threshold adjusts automatically on the next baseline computation. +- Catches regressions that static thresholds miss. A 3x increase from 0.1% to 0.3% triggers a 200% relative threshold even though absolute values are low. +- Intuitive — "twice the normal error rate" is easy to reason about. + +**Disadvantages:** + +- Requires baseline computation, which adds complexity. +- If the baseline itself is unhealthy (e.g., computed during a previous incident), the threshold will be too lenient. +- A very low baseline can produce thresholds that are too tight. If baseline error rate is 0.01%, then 200% of baseline is 0.02% — a difference so small it may be noise. + +**Concrete example:** A payment service has a baseline p99 latency of 200ms. You set a relative threshold of 150% (300ms). 
During a rollout, latency climbs to 320ms. The relative threshold fires. If you had used a static threshold of 500ms (a common "safe" value), this 60% latency regression would have gone undetected. + +### Adaptive Thresholds + +Adaptive thresholds use a rolling baseline combined with standard deviation bands to create a dynamic boundary that accounts for natural variance in the metric. + +**How it works:** You compute a rolling mean and standard deviation over a trailing window. The threshold is set at `mean + (N * standard_deviation)`. For example, with N=3 (three sigma), the threshold captures 99.7% of normal variation, and only values beyond three standard deviations trigger. + +**Example configuration:** + +| Metric | Rolling Mean | Std Dev | Sigma Multiplier | Computed Threshold | +|----------------|-------------|---------|-------------------|--------------------| +| Error rate | 0.25% | 0.05% | 3 | 0.40% | +| Latency p99 | 220ms | 30ms | 2.5 | 295ms | +| Throughput | 1100 req/s | 100 req/s | 2 | 900 req/s (min) | + +**When to use adaptive thresholds:** + +- Services with high natural variance (e.g., latency that fluctuates significantly by time of day). +- Long-running monitoring where the baseline may shift gradually during the monitoring period. +- When you need to minimize false positives without manual threshold tuning. + +**Advantages:** + +- Accounts for natural metric variance. A metric that is normally "spiky" gets wider bands; a stable metric gets tighter bands. +- Self-tuning — as the rolling window updates, the threshold adapts. +- Statistically grounded — three sigma thresholds have a clear probabilistic meaning. + +**Disadvantages:** + +- More complex to implement and explain. "The threshold is the rolling mean plus 2.5 standard deviations" is harder to communicate than "the threshold is 200% of baseline." +- The rolling window must be carefully sized (see Rolling Baseline Calculation below). 
+- Can be slow to detect regressions if the rolling window is long, because the regression data pulls the mean upward. +- Assumes approximately normal distribution of metric values, which may not hold for all metrics (e.g., latency distributions are often skewed). + +**Concrete example:** An API gateway has p99 latency that varies between 150ms and 300ms depending on time of day. A static threshold at 400ms would miss a regression during peak hours (when baseline is already 300ms), and a relative threshold of 150% would false-positive during peak hours if the baseline was computed during off-peak. An adaptive threshold with a 4-hour rolling window and 2.5 sigma multiplier computes: mean=250ms, stddev=50ms, threshold=375ms. During peak hours (baseline ~300ms), the threshold adjusts upward. During off-peak (baseline ~150ms), it tightens. A regression that pushes latency to 450ms fires regardless of time of day. + +### Comparison: When to Use Each Type + +| Factor | Static | Relative | Adaptive | +|---------------------------|------------------|------------------|------------------| +| Implementation complexity | Low | Medium | High | +| Baseline required | No | Yes | Yes (rolling) | +| Adapts to baseline shifts | No | On recomputation | Continuously | +| Handles metric variance | No | Partially | Yes | +| False positive rate | Medium-High | Low-Medium | Low | +| Detection speed | Immediate | Immediate | Slower | +| Best for | SLA hard limits | Most use cases | Variable workloads | +| Explainability | High | High | Medium | + +**Recommendation:** Start with relative thresholds (200% of baseline for error rate, 150% for latency, 70% floor for throughput). Move to adaptive thresholds only if false positive rates are unacceptable due to metric variance. Use static thresholds as a safety net for absolute limits (e.g., error rate > 50% is always critical regardless of baseline). 
+ +--- + +## Consecutive-Check Patterns + +A threshold breach at a single point in time is not sufficient evidence of a regression. Metrics are noisy. Networks have blips. Garbage collection causes latency spikes. Consecutive-check patterns require multiple breaches before signaling a regression, dramatically reducing false positives. + +### Why Single-Check Detection Fails + +Consider a service with a baseline p99 latency of 200ms and a threshold of 300ms. In a typical 30-minute monitoring window with checks every minute: + +- **Transient spikes:** A single garbage collection pause can push p99 latency to 350ms for one check, then it drops back to 210ms. Single-check detection would signal a false regression. +- **Noisy metrics:** Metrics aggregated over short windows (1 minute) naturally have higher variance than those over longer windows (5 minutes). A 1-minute p99 latency measurement can fluctuate 20-40% from the mean due to sampling alone. +- **Network blips:** A brief network partition or DNS timeout can cause a burst of errors that inflates the error rate for one check interval, then resolves on its own. +- **Metric collection delays:** Sometimes metrics arrive late or out of order, causing a single check to reflect an incomplete or skewed picture. + +**Real-world data:** In a study of production alerting at a large SaaS provider, single-check alerting produced a false positive rate of approximately 15-25% for latency metrics and 8-12% for error rate metrics. Requiring 3 consecutive breaches reduced false positives to under 2% while adding only 2-4 minutes of detection latency. + +### The 3-of-5 Pattern + +The 3-of-5 pattern is the most commonly used consecutive-check approach for regression detection. It requires 3 threshold breaches within the last 5 checks to trigger. + +**How it works:** + +1. Maintain a sliding window of the last 5 check results (breach or no-breach). +2. After each check, count the number of breaches in the window. +3. 
If 3 or more of the last 5 checks are breaches, signal the regression. + +**Example timeline (1-minute check interval):** + +| Minute | Error Rate | Threshold (0.6%) | Breach? | Window (last 5) | Count | Signal? | +|--------|-----------|-------------------|---------|------------------|-------|---------| +| 1 | 0.3% | 0.6% | No | [N] | 0 | No | +| 2 | 0.7% | 0.6% | Yes | [N, Y] | 1 | No | +| 3 | 0.4% | 0.6% | No | [N, Y, N] | 1 | No | +| 4 | 0.8% | 0.6% | Yes | [N, Y, N, Y] | 2 | No | +| 5 | 0.9% | 0.6% | Yes | [N, Y, N, Y, Y] | 3 | Yes | +| 6 | 0.5% | 0.6% | No | [Y, N, Y, Y, N] | 3 | Yes | +| 7 | 0.3% | 0.6% | No | [N, Y, Y, N, N] | 2 | No | + +Notice at minute 5, the pattern triggers. At minute 7, it recovers because there are now only 2 breaches in the window. + +**Why 3-of-5 specifically:** + +- 3 breaches provide strong evidence — the probability of 3 random independent spikes in 5 checks is low. +- Allowing 2 non-breach checks in the window tolerates brief recoveries during a real regression (metrics can briefly dip below threshold during a genuine regression due to variance). +- With 1-minute checks, detection latency is at most 5 minutes, which is fast enough for rollout monitoring. + +### The N-of-M Pattern (Generalized) + +The 3-of-5 pattern is a specific instance of the general N-of-M pattern: require N breaches in the last M checks to trigger. 
+ +**Common configurations:** + +| Pattern | N | M | Use Case | Detection Latency (1-min checks) | False Positive Rate | +|---------|---|---|----------|----------------------------------|---------------------| +| 2-of-3 | 2 | 3 | Fast detection, slightly higher false positives | 3 min max | Medium | +| 3-of-5 | 3 | 5 | Balanced (recommended default) | 5 min max | Low | +| 4-of-6 | 4 | 6 | Conservative, fewer false positives | 6 min max | Very Low | +| 5-of-10 | 5 | 10 | Very conservative, long-running monitors | 10 min max | Minimal | + +**Choosing N and M:** + +- Higher N/M ratio (e.g., 4-of-5 = 80%) means stricter — fewer false positives but slower detection. +- Lower N/M ratio (e.g., 2-of-5 = 40%) means more sensitive — faster detection but more false positives. +- The recommended sweet spot is N/M between 50% and 70%. +- M should be at least 3 to provide meaningful statistical power. +- M should not exceed about 10-15 because older data becomes less relevant. + +### Sliding Window Approach + +Instead of counting discrete breaches, the sliding window approach averages the metric over the window and compares the average against the threshold. + +**How it works:** + +1. Maintain a sliding window of the last M metric values. +2. Compute the window average. +3. If the window average exceeds the threshold, signal the regression. + +**Example (5-minute sliding window, 1-minute checks):** + +| Minute | Error Rate | Window Values | Window Avg | Threshold (0.6%) | Signal? | +|--------|-----------|------------------------|-----------|-------------------|---------| +| 1 | 0.3% | [0.3] | 0.30% | 0.6% | No | +| 2 | 0.7% | [0.3, 0.7] | 0.50% | 0.6% | No | +| 3 | 0.4% | [0.3, 0.7, 0.4] | 0.47% | 0.6% | No | +| 4 | 0.8% | [0.3, 0.7, 0.4, 0.8] | 0.55% | 0.6% | No | +| 5 | 0.9% | [0.3, 0.7, 0.4, 0.8, 0.9] | 0.62% | 0.6% | Yes | + +**Advantages over N-of-M:** Smoother signal, less sensitive to individual spikes. 
+ +**Disadvantages:** Slower to detect sharp regressions because a few normal values in the window drag the average down. A sudden jump from 0.3% to 0.8% error rate would take four checks to raise a 5-check window average above 0.6%, whereas the 3-of-5 pattern would trigger after three. + +**When to use:** Best for metrics that are inherently noisy and where you care about sustained trends rather than sharp transitions. Less appropriate for rollout monitoring where you want to detect regressions quickly. + +### Immediate-Critical Pattern + +Some thresholds are so severe that a single breach warrants immediate signaling. This is the exception to the "never signal on a single point" rule. + +**When to use immediate-critical:** + +- Error rate exceeds 50% (more than half of all requests are failing). +- Service returns zero throughput (complete outage). +- Latency exceeds 10x baseline (severe degradation). +- Health check endpoint returns unhealthy. + +**Example configuration:** + +| Metric | Immediate-Critical Threshold | Rationale | +|----------------|-----------------------------|---------------------------------| +| Error rate | > 50% | Majority of requests failing | +| Latency p99 | > 10,000ms (10s) | Effectively unusable | +| Throughput | < 10 req/s (from 1000+) | Near-complete traffic loss | +| Health check | Unhealthy | Service self-reports failure | + +These thresholds should be set far beyond any reasonable normal variation. If an immediate-critical threshold fires, the situation is unambiguous. + +--- + +## Severity Classification + +Not all threshold breaches are equal. Severity classification enables proportional responses — a warning gets attention, a critical gets action. + +### Severity Levels + +**Normal** — All monitored metrics are within their thresholds. No action needed. + +- Error rate: within baseline + margin +- Latency: within baseline + margin +- Throughput: above minimum floor +- Status: green. Continue monitoring. + +**Warning** — A metric is approaching its threshold or has a single isolated breach.
Attention warranted but no intervention needed yet. + +Trigger conditions (any of): +- Metric exceeds 80% of the distance from baseline to threshold (e.g., baseline is 0.3%, threshold is 0.6%, warning at 0.54%). +- A single threshold breach occurred but consecutive-check pattern is not yet satisfied. +- Two metrics simultaneously exceed 70% of their threshold distance. + +Response: Continue monitoring. Note the warning in status reports. Be prepared to escalate. + +**Critical** — A sustained threshold breach confirmed by the consecutive-check pattern. Intervention likely needed. + +Trigger conditions (any of): +- Consecutive-check pattern satisfied (e.g., 3-of-5 breaches) for any monitored metric. +- Immediate-critical threshold breached (single check sufficient). +- Two or more metrics simultaneously in warning state for more than M checks. + +Response: Signal the regression. Provide details on which metric, current value, baseline, threshold, and number of consecutive breaches. Recommend investigation or rollback consideration. + +### Mapping Metrics to Severity + +Different metrics may warrant different severity mappings based on business impact: + +| Metric | Warning Trigger | Critical Trigger | Business Rationale | +|----------------|----------------------------------|----------------------------------------|-----------------------------------------------------| +| Error rate | > 150% of baseline OR > 1% | 3-of-5 breaches above 200% of baseline | Errors directly impact users | +| Latency p99 | > 130% of baseline | 3-of-5 breaches above 150% of baseline | Latency degrades UX but users can still complete tasks | +| Throughput | < 85% of baseline | 3-of-5 checks below 70% of baseline | Traffic drop may indicate upstream issues or errors | + +### Escalation: Warning to Critical Progression + +Severity is not static. A warning can escalate to critical, and a critical can de-escalate. + +**Escalation rules:** + +1. 
A metric in warning state that satisfies the consecutive-check pattern escalates to critical. +2. Multiple metrics in warning state simultaneously may escalate the overall status to critical (configurable — default is 2+ metrics in warning = overall critical). +3. A critical status that recovers (consecutive checks clear) de-escalates to warning, then to normal. + +**De-escalation rules:** + +1. A critical metric must have zero breaches in the last M checks to return to normal. +2. A warning metric must stay below 80% of threshold distance for M checks to return to normal. +3. De-escalation requires the same rigor as escalation — do not prematurely declare "all clear." + +--- + +## Rolling Baseline Calculation + +The baseline is the anchor for regression detection. A poorly computed baseline leads to either missed regressions (baseline too high) or excessive false positives (baseline too low). + +### How to Compute a Rolling Baseline + +1. **Select a trailing window** of metric data. For example, the last 24 hours of 1-minute metric samples = 1,440 data points. +2. **Exclude anomalies** from the window. Remove data points that are more than 3 standard deviations from the mean. This prevents previous incidents from inflating the baseline. +3. **Compute the baseline value:** + - For error rate and latency: use the mean (or p50/median for skewed distributions). + - For throughput: use the mean, but also compute the minimum expected value (e.g., p10) to set the floor. +4. **Compute the baseline variance:** Calculate the standard deviation. This is used for adaptive thresholds and for setting warning levels. 
+ +**Example calculation:** + +Raw p99 latency data over 24 hours (1,440 samples): +- Mean: 220ms +- Median: 210ms +- Standard deviation: 35ms +- 3-sigma outlier threshold: 220 + (3 * 35) = 325ms +- Samples above 325ms: 12 (removed as anomalies) +- Recomputed mean (excluding anomalies): 215ms +- Recomputed standard deviation: 28ms +- **Baseline: 215ms** +- **Relative threshold at 150%: 322ms** +- **Adaptive threshold at 2.5 sigma: 215 + (2.5 * 28) = 285ms** + +### Baseline Window Sizing + +| Window Duration | Data Points (1-min) | Pros | Cons | Recommended For | +|-----------------|---------------------|-----------------------------------|-----------------------------------------|------------------------------| +| 1 hour | 60 | Very responsive to recent changes | Noisy, affected by short anomalies | Rapid iteration, canary tests | +| 4 hours | 240 | Good balance | May miss time-of-day patterns | Short rollouts (< 1 hour) | +| 24 hours | 1,440 | Captures full daily cycle | Slow to adapt, stale if service changed | Standard rollouts | +| 7 days | 10,080 | Captures weekly patterns | Very slow to adapt, high storage | Seasonal services | + +**Recommendation:** Use a 24-hour window for most cases. If the service has strong weekly patterns (e.g., much lower traffic on weekends), use 7 days. For rapid canary deployments, 1-4 hours may suffice. + +### Time-of-Day and Day-of-Week Adjustments + +Many services have predictable traffic patterns: higher load during business hours, lower on weekends. A flat baseline across all hours leads to problems: + +- A baseline computed from overnight data will be too low for daytime monitoring, causing false positives. +- A baseline computed from peak hours will be too high for overnight monitoring, missing regressions. 
+ +**Solution: Bucketed baselines.** + +Compute separate baselines for time-of-day buckets: + +| Bucket | Hours (UTC) | Baseline Latency | Baseline Error Rate | Baseline Throughput | +|------------------|-------------|-------------------|---------------------|---------------------| +| Off-peak | 00:00-06:00 | 150ms | 0.1% | 200 req/s | +| Morning ramp | 06:00-10:00 | 200ms | 0.2% | 800 req/s | +| Peak | 10:00-18:00 | 280ms | 0.3% | 1500 req/s | +| Evening decline | 18:00-00:00 | 220ms | 0.2% | 600 req/s | + +When monitoring at 14:00 UTC, use the "Peak" baseline. When monitoring at 03:00 UTC, use the "Off-peak" baseline. This prevents time-of-day patterns from triggering false regressions. + +### Handling Baseline During Active Rollouts + +During a rollout, the baseline must reflect the pre-rollout state, not the current state. If you recompute the baseline during a rollout, the regressed metrics will pollute the baseline, raising it and masking the regression. + +**Rules:** + +1. **Freeze the baseline** at the start of monitoring. Do not recompute it during the rollout. +2. If the rollout spans multiple hours, the frozen baseline may become stale relative to time-of-day patterns. In this case, use bucketed baselines (computed pre-rollout) and switch buckets as time progresses. +3. If the rollout is paused or rolled back, and monitoring is restarted, recompute the baseline from the post-rollback recovery period. + +--- + +## Monitoring Intervals + +The check frequency determines how quickly you can detect regressions and how much data you consume. 
+ +### Check Frequency Selection + +| Interval | Checks per Hour | Detection Latency (3-of-5) | Data Volume | Best For | +|----------|----------------|---------------------------|-------------|-------------------------------| +| 30 sec | 120 | 2.5 min max | High | Canary deployments, critical services | +| 1 min | 60 | 5 min max | Moderate | Standard rollouts (recommended) | +| 5 min | 12 | 25 min max | Low | Slow rollouts, batch services | +| 15 min | 4 | 75 min max | Very Low | Long-duration background monitoring | + +**Recommendation:** Use 1-minute intervals as the default. This provides a good balance between detection speed (5 minutes worst case with 3-of-5) and data volume. + +### Aligning with Metric Aggregation Windows + +If your metrics are aggregated in 1-minute windows, checking every 30 seconds will often return the same data point twice. Align your check interval with the metric aggregation window: + +- Metric aggregation: 1-minute windows -> check every 1 minute. +- Metric aggregation: 5-minute windows -> check every 5 minutes. +- Metric aggregation: 30-second windows -> check every 30 seconds or 1 minute. + +Checking more frequently than the aggregation window wastes resources and produces duplicate data points. + +### Minimum Monitoring Duration + +Monitoring should continue long enough to have statistical confidence that the rollout is healthy (or unhealthy). + +**Guidelines:** + +- Minimum: At least M checks (where M is the window size in your N-of-M pattern). With 3-of-5 and 1-minute checks, this is 5 minutes absolute minimum. +- Recommended: At least 15-30 minutes for a standard rollout. This provides 15-30 data points, enough to detect regressions and observe recovery patterns. +- For full rollouts (0% to 100%): Monitor for at least 30 minutes after reaching 100%. +- For staged rollouts (0% -> 10% -> 50% -> 100%): Monitor for at least 10-15 minutes at each stage before proceeding. 
+ +--- + +## False Positive Management + +False positives erode trust in monitoring. If the system cries wolf repeatedly, operators will ignore it when a real regression occurs. + +### Common Causes of False Positives + +| Cause | Description | Frequency | +|--------------------------------|-----------------------------------------------------------------------------------------------|-----------| +| Threshold too tight | Threshold set too close to baseline, normal variance triggers it | High | +| Single-check triggering | No consecutive-check pattern; transient spikes signal false regressions | High | +| Noisy metrics | Short aggregation windows or low-traffic services produce metrics with high natural variance | Medium | +| Maintenance windows | Planned maintenance causes metric anomalies that look like regressions | Medium | +| Stale baseline | Baseline computed from old data that no longer reflects current service behavior | Medium | +| Deployment artifacts | Brief metric anomalies during instance restarts (connection draining, cold starts) | Low-Med | +| Upstream dependencies | A dependency (database, external API) has a brief issue unrelated to the rollout | Low-Med | +| Clock skew / metric lag | Metrics arrive late, causing a check to see incomplete data | Low | + +### Mitigation Strategies + +**1. Consecutive-check patterns (most impactful)** + +As described above, requiring 3-of-5 breaches reduces false positives from transient spikes by approximately 80-90%. This is the single most effective mitigation. + +**2. 
Appropriate threshold margins** + +Set thresholds with sufficient margin above baseline: + +| Metric | Minimum Recommended Margin | +|------------|---------------------------| +| Error rate | At least 2x baseline or 0.5% absolute, whichever is greater | +| Latency | At least 1.5x baseline or 100ms absolute, whichever is greater | +| Throughput | Floor at 70% of baseline (30% drop tolerance) | + +The "whichever is greater" clause prevents absurdly tight thresholds when baselines are very low. + +**3. Exclude known maintenance windows** + +If a maintenance window is scheduled (e.g., database migrations, infrastructure updates), do one of the following: +- Pause monitoring during the window and resume after. +- Exclude checks that fall within the maintenance window from the consecutive-check calculation. +- Use a wider threshold during the maintenance window. + +**4. Warm-up period exclusion** + +Immediately after a deployment starts, instances may experience cold-start effects (cache warming, JIT compilation, connection pool establishment). Exclude the first 1-2 minutes of monitoring from regression detection to avoid false positives from deployment artifacts. + +**5. Minimum traffic threshold** + +If throughput drops below a minimum level (e.g., < 10 req/s), error rate and latency metrics become statistically unreliable (a single failed request out of 5 produces a 20% error rate, and a single slow request wildly skews latency percentiles). Set a minimum traffic threshold below which regression detection is suspended — report "insufficient data" instead of a false regression. + +**Example minimum traffic thresholds:** + +| Metric | Minimum Traffic for Reliable Detection | +|------------|---------------------------------------| +| Error rate | At least 100 requests in the check window | +| Latency p99| At least 50 requests in the check window | +| Latency p50| At least 20 requests in the check window | + +**6.
Correlation across metrics** + +A real regression typically affects multiple metrics: error rate goes up AND latency goes up AND/OR throughput goes down. If only one metric breaches its threshold while others are perfectly normal, the probability of a false positive is higher. Use cross-metric correlation as a confidence booster: + +- Single metric in breach + others normal = lower confidence (may be false positive). +- Two metrics in breach simultaneously = higher confidence (likely real regression). +- All metrics in breach = very high confidence (almost certainly a real regression). + +This correlation should influence the urgency of the signal, not whether to signal at all. A single metric in sustained breach (passing the consecutive-check pattern) should still be reported — but the report should note that other metrics are not corroborating. + +**7. Regular baseline refresh** + +Refresh the baseline periodically (daily or weekly, but never during an active rollout) to prevent staleness. A stale baseline that no longer reflects the service's current behavior is a persistent source of false positives (if the service's normal metric levels have since risen, leaving the baseline too low) or missed regressions (if the service has since improved, leaving the baseline too high). diff --git a/skills/observability/o11y-service-health/README.md b/skills/observability/o11y-service-health/README.md new file mode 100644 index 0000000..79d3ea0 --- /dev/null +++ b/skills/observability/o11y-service-health/README.md @@ -0,0 +1,59 @@ +# LaunchDarkly Service Health Check Skill + +## Overview + +- Teaches AI agents to check service health by examining error rates, latency percentiles, and throughput metrics. +- Assesses current metric values against historical baselines to determine if a service is operating normally. +- Classifies service status as **healthy**, **degraded**, or **critical** with per-metric breakdowns and evidence. +- Provides structured recommendations for next steps based on assessment results.
+ +## Installation (Local) + +Add the skill to your agent configuration by referencing the skill directory: + +```yaml +skills: + - path: skills/observability/o11y-service-health +``` + +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled and configured. +- MCP tools available: `get-service-metrics`, `get-metric-baselines`, and optionally `list-services`. + +## Usage + +Example prompts that invoke this skill: + +- "How is the checkout service doing?" +- "Is the API healthy after the last deploy?" +- "Check service health for the payments service" +- "What's the current status of the auth service?" +- "Are there any issues with the search service right now?" + +## Structure + +``` +o11y-service-health/ +├── SKILL.md +├── README.md +└── references/ + ├── metric-types.md + └── health-assessment-criteria.md +``` + +| File | Description | +|------|-------------| +| `SKILL.md` | Core skill definition with workflow, principles, and edge cases | +| `README.md` | This file — overview, installation, and usage | +| `references/metric-types.md` | Detailed reference on error rate calculations, latency percentiles, throughput patterns, and common pitfalls | +| `references/health-assessment-criteria.md` | Baseline comparison techniques, threshold definitions, severity classification, and assessment methodology | + +## Related + +- [o11y-log-query](../o11y-log-query) — Query and analyze service logs for deeper investigation after identifying an unhealthy service. +- [LaunchDarkly MCP Server Documentation](https://docs.launchdarkly.com/) — Configuration and setup for the MCP server and observability tools. 
+ +## License + +Apache-2.0 diff --git a/skills/observability/o11y-service-health/SKILL.md b/skills/observability/o11y-service-health/SKILL.md new file mode 100644 index 0000000..0c3d38d --- /dev/null +++ b/skills/observability/o11y-service-health/SKILL.md @@ -0,0 +1,112 @@ +--- +name: o11y-service-health +description: "Pull error rate, latency percentiles (p50/p95/p99), and throughput metrics for a service and classify its status as healthy, degraded, or critical. Use when the user asks 'how is [service] doing', wants a metric snapshot before a rollout, or needs to confirm whether a service is currently healthy or degraded." +license: Apache-2.0 +compatibility: Requires LaunchDarkly MCP server with observability tools enabled. +metadata: + author: launchdarkly + version: "0.1.0" +--- + +# Service Health Check + +You're using a skill that will guide you through checking the health of a service by examining its key metrics. Your job is to identify the service, pull its health metrics, assess them against baselines, and summarize the overall status. + +## Prerequisites + +- **LaunchDarkly MCP server** with observability tools enabled and configured. +- **Required MCP tools**: + - `get-service-metrics` — Fetch error rate, latency percentiles, and throughput for a given service and time window. + - `get-metric-baselines` — Retrieve historical baseline data for a service's metrics. +- **Optional MCP tools**: + - `list-services` — Discover available services and their identifiers. + +## Core Principles + +1. **Metrics in Context** — Raw numbers mean nothing without baselines. An error rate of 2% is healthy for one service and a five-alarm fire for another. Always compare current values against established baselines before drawing conclusions. + +2. **Three Pillars** — Always check error rate, latency, AND throughput together. A low error rate with zero throughput is not healthy.
High latency with dropping throughput suggests a different problem than high latency with stable throughput. No single metric tells the full story. + +3. **Time Windows Matter** — Use the appropriate time window for the question being asked. A deployment health check needs a short post-deploy window compared against the pre-deploy period. An incident triage needs the most recent minutes. A general health check needs enough data to be statistically meaningful. + +4. **Signal not Diagnosis** — Health checks identify symptoms, not root causes. The output of this skill is "the service is degraded because error rate is elevated" — not "the service is degraded because the database connection pool is exhausted." Diagnosis is a separate investigation step. + +## Workflow + +### Step 1: Identify the Service + +Determine which service to check. If the user has specified a service name, confirm it matches a known service. If the user is unsure or wants to browse, use `list-services` to discover available services and present the options. + +- Accept service names, service IDs, or common aliases. +- If the service name is ambiguous (multiple matches), ask the user to clarify. +- If the service is not found, inform the user and suggest checking the service name or using `list-services`. + +### Step 2: Pull Health Metrics + +Use `get-service-metrics` to fetch the following for the identified service: + +- **Error rate**: 4xx rate and 5xx rate, separately. +- **Latency**: p50, p95, and p99 values. +- **Throughput**: Requests per second. + +Pull metrics for two time windows: +- **Current window**: The most recent 5 minutes (or the window appropriate to the question — see Core Principles). +- **Comparison window**: The equivalent time window from 24 hours ago, or a recent baseline period. + +See [metric-types.md](references/metric-types.md) for detailed guidance on how each metric is calculated and interpreted. 
+ +### Step 3: Assess Against Baselines + +Use `get-metric-baselines` to retrieve historical baseline data for the service. If baseline data is available: + +- Compare each current metric value against its baseline range. +- Classify each metric as **normal**, **elevated**, or **critical** based on how far it deviates from the baseline. + +If baseline data is not available: + +- Fall back to static default thresholds (see [health-assessment-criteria.md](references/health-assessment-criteria.md) for defaults). +- Note in the output that the assessment is low-confidence due to missing baselines. + +Apply the severity classification rules from [health-assessment-criteria.md](references/health-assessment-criteria.md): +- Assess each metric category independently. +- Combine into an overall status using the worst-metric-wins rule. +- Adjust for throughput context (do not trust error rate if throughput is near zero). + +### Step 4: Summarize Status + +Present the findings to the user in a clear, structured format: + +- **Overall status**: Healthy, Degraded, or Critical. +- **Per-metric breakdown**: + - Error rate (5xx): current value, baseline value, severity. + - Error rate (4xx): current value, baseline value, note if unusual. + - Latency (p50, p95, p99): current values, baseline values, severity. + - Throughput: current value, baseline value, severity. +- **Notable changes from baseline**: Call out any metric that has changed significantly, even if it has not crossed a severity threshold. +- **Recommended next steps**: + - If healthy: No action needed. Optionally suggest setting up ongoing monitoring. + - If degraded: Suggest investigating the elevated metrics. Recommend checking recent deployments, dependency health, or using the log query skill for deeper analysis. + - If critical: Recommend immediate investigation. Highlight the most affected metrics and suggest starting points for diagnosis. 
+ +## Edge Cases + +| Scenario | How to Handle | +|----------|---------------| +| Service not found | Inform the user the service was not found. Suggest checking the service name or using `list-services` to discover available services. | +| No baseline data available | Fall back to static default thresholds. Clearly note in the output that the assessment is low-confidence. Recommend establishing baselines for future checks. | +| Metrics partially available | Assess what is available and note what is missing. For example, if latency data is present but error rate is not, assess latency and flag that error rate data was unavailable. | +| Service returning zero throughput | Flag this as a critical finding regardless of error rate or latency values. Zero throughput means the service is not processing requests, which may indicate it is down, unreachable, or has no traffic routed to it. | +| Metric spike is expected (deploy in progress) | If the user indicates a deployment is in progress, note that metric fluctuations during deployment are expected. Suggest re-checking after the deployment stabilizes (typically 5-15 minutes post-deploy). | + +## What NOT to Do + +- **Don't diagnose root cause.** This skill identifies that a service is unhealthy and which metrics are affected. Root cause analysis is a separate investigation. Point the user toward log analysis or dependency tracing rather than speculating about causes. +- **Don't compare services to each other.** Each service has its own baseline and its own definition of normal. A 200ms p50 latency is excellent for one service and terrible for another. Always compare a service against its own history. +- **Don't alert on a single data point.** A single metric value in a 1-minute window can be noisy. Look at trends over the assessment window. If only one data point is elevated and the surrounding points are normal, it is likely a transient spike rather than a real problem. 
+- **Don't ignore throughput.** It is tempting to focus on error rate and latency because they directly measure user experience. But throughput provides essential context. Low errors with low traffic may mean the problem is upstream. High errors with high traffic may mean the service is overwhelmed. + +## References + +- [Metric Types Reference](references/metric-types.md) — Detailed guide to error rate calculations, latency percentile interpretation, throughput patterns, and common pitfalls. +- [Health Assessment Criteria](references/health-assessment-criteria.md) — Baseline comparison techniques, threshold definitions, severity classification, and assessment methodology. +- For deeper investigation after identifying an unhealthy service, see the `o11y-log-query` skill for querying and analyzing service logs. diff --git a/skills/observability/o11y-service-health/references/health-assessment-criteria.md b/skills/observability/o11y-service-health/references/health-assessment-criteria.md new file mode 100644 index 0000000..4ddfd67 --- /dev/null +++ b/skills/observability/o11y-service-health/references/health-assessment-criteria.md @@ -0,0 +1,253 @@ +# Health Assessment Criteria + +This reference defines how to assess service health by comparing current metrics against baselines, applying severity thresholds, and generating a composite health status. It covers baseline techniques, threshold definitions, severity classification, and common assessment patterns. + +## Baseline Comparison Techniques + +A baseline is the expected value or range for a metric under normal operating conditions. Without a baseline, you cannot determine whether a metric value is normal, elevated, or critical. Raw numbers are meaningless in isolation — an error rate of 2% might be perfectly normal for one service and a severe regression for another. + +### Historical Baselines + +The simplest baseline is a historical average: what was this metric's value over the past N days? 
A 7-day rolling average captures a full week of patterns. A 30-day rolling average captures monthly patterns but may be too slow to reflect recent changes (such as a new deployment that legitimately changed the service's behavior). + +Historical baselines work well for stable services with predictable traffic. They work poorly for services undergoing rapid change, services with highly variable traffic, or newly deployed services with insufficient history. + +### Rolling Averages + +A rolling average continuously updates as new data arrives. Common windows include: + +- **1-hour rolling average**: Responsive but noisy. Useful for detecting acute changes but will flag brief transients. +- **4-hour rolling average**: Balances responsiveness and stability. Good for intra-day comparisons. +- **24-hour rolling average**: Captures a full diurnal cycle. Useful for comparing "right now" against "the same general time yesterday." +- **7-day rolling average**: Captures weekly patterns. Useful for comparing "Tuesday afternoon" against "recent Tuesday afternoons." + +When computing rolling averages for baselines, exclude known anomaly periods (incidents, maintenance windows, traffic spikes) to prevent them from skewing the baseline. + +### Time-of-Day and Day-of-Week Adjustments + +Many services have strong temporal patterns. A service might handle 5,000 req/s at 2pm and 200 req/s at 3am. Comparing a 3am metric value against a 24-hour average that includes peak hours will produce misleading results. + +Time-of-day adjusted baselines compare the current metric against the same time window on previous days. For example, compare today's 2pm-3pm error rate against the 2pm-3pm error rate from the past 7 Tuesdays (if today is Tuesday). This accounts for both diurnal and weekly patterns. + +Day-of-week adjustments are particularly important for B2B services where weekday traffic differs dramatically from weekend traffic. 
A throughput drop on Saturday that would be alarming on a Tuesday is completely expected. + +### Percentile-Based Baselines + +Instead of using a single average as the baseline, use a percentile range. For example, the baseline for error rate might be defined as "the p5 to p95 range of error rate values observed during equivalent time windows over the past 14 days." Any current value within that range is normal. Values above p95 are elevated. Values above p99 are critical. + +Percentile-based baselines handle variance naturally. A service with highly variable latency will have a wide baseline range, reducing false alarms. A service with very stable latency will have a narrow range, making even small deviations visible. + +## Threshold Definitions + +### Static Thresholds + +Static thresholds are fixed values that do not change over time. Illustrative examples (the specific fallback values to apply when no baseline exists are listed under Common Defaults below): + +| Metric | Warning Threshold | Critical Threshold | +|--------|------------------|--------------------| +| 5xx error rate | > 1% | > 5% | +| p99 latency | > 2000ms | > 5000ms | +| p50 latency | > 500ms | > 1500ms | +| Throughput drop | > 30% below baseline | > 60% below baseline | + +Static thresholds are simple to implement and easy to understand. However, they do not account for service-specific norms. A 1% error rate threshold is too sensitive for a service that normally runs at 0.8% and too lenient for a service that normally runs at 0.01%. + +Use static thresholds as defaults when no baseline data is available, and replace them with dynamic thresholds as baseline data accumulates. + +### Dynamic Thresholds + +Dynamic thresholds are computed from baseline data. A common approach is to define thresholds as multiples of the baseline standard deviation: + +- **Warning**: Current value exceeds baseline mean + 2 standard deviations +- **Critical**: Current value exceeds baseline mean + 3 standard deviations + +This approach (similar to Bollinger Bands in financial analysis) adapts to each service's normal variance.
A noisy service will have wider thresholds; a stable service will have tighter thresholds. + +For error rates, which are bounded at 0 and typically right-skewed, a multiplicative threshold often works better than additive: + +- **Warning**: Current error rate exceeds 3x the baseline error rate +- **Critical**: Current error rate exceeds 10x the baseline error rate + +For throughput drops, express thresholds as percentage deviations from baseline: + +- **Warning**: Throughput is more than 25% below the baseline for this time window +- **Critical**: Throughput is more than 50% below the baseline for this time window + +### Setting Thresholds From Baselines + +A practical approach to setting thresholds from baseline data: + +1. Collect metric values for the same time-of-day and day-of-week over the past 14 to 28 days. +2. Remove outliers (values during known incidents or maintenance). +3. Compute the p50 (median), p90, p95, and p99 of the collected values. +4. Set the warning threshold at the p95 of historical values. +5. Set the critical threshold at a value significantly above p99 (for example, 2x the p99 value). + +This method ensures that thresholds reflect what the service actually does, not what someone guesses it should do. + +### Common Defaults + +When no baseline data is available and you need to make an initial assessment, these defaults provide a reasonable starting point: + +**Error Rate (5xx)**: +- Healthy: < 0.5% +- Degraded: 0.5% to 5% +- Critical: > 5% + +**Latency (p50)**: +- Healthy: < 200ms +- Degraded: 200ms to 1000ms +- Critical: > 1000ms + +**Latency (p99)**: +- Healthy: < 1000ms +- Degraded: 1000ms to 5000ms +- Critical: > 5000ms + +**Throughput change from expected**: +- Healthy: within 20% of expected +- Degraded: 20% to 50% deviation +- Critical: > 50% deviation + +These defaults are intentionally loose. They should be replaced with service-specific thresholds as soon as baseline data is available. 
Services with strict SLOs will need much tighter thresholds. + +## Severity Classification + +### Definitions + +**Healthy**: All metrics are within their expected baseline ranges. No action is required. The service is operating normally. + +**Degraded**: One or more metrics are outside their normal range but not at critical levels. The service is still functioning but with reduced quality. Users may experience slower responses or occasional errors. Investigation is warranted, and the team should be aware, but this is not an emergency. + +**Critical**: One or more metrics are far outside their normal range. The service is experiencing significant issues that are likely affecting users. Immediate investigation and response are needed. + +### Mapping Metrics to Severity + +Each metric generates its own severity assessment. The rules for mapping a metric value to a severity level: + +**Error Rate Severity**: +- Healthy: Error rate is within the baseline range (or below the warning threshold) +- Degraded: Error rate exceeds the warning threshold but is below the critical threshold +- Critical: Error rate exceeds the critical threshold + +**Latency Severity**: +- Evaluate p50, p95, and p99 independently +- If any latency percentile exceeds its critical threshold, latency severity is critical +- If any latency percentile exceeds its warning threshold (and none are critical), latency severity is degraded +- Otherwise, latency severity is healthy + +**Throughput Severity**: +- Healthy: Throughput is within the expected range +- Degraded: Throughput has deviated significantly from baseline (warning threshold) +- Critical: Throughput has deviated dramatically from baseline (critical threshold) +- Note: Both drops AND unexpected spikes in throughput should be flagged + +### Multi-Metric Severity Combining Rules + +When combining individual metric severities into an overall service health status, use these rules: + +**Rule 1: Worst metric wins (default)** +The overall severity 
is the worst severity across all metrics. If error rate is healthy, latency is degraded, and throughput is healthy, the overall status is degraded. + +**Rule 2: Corroboration for critical** +To reduce false alarms, require at least two metrics to be elevated before declaring the overall status critical. If only one metric is critical and the others are healthy, the overall status is degraded (not critical). This rule acknowledges that a single metric spike may be a measurement artifact. + +**Rule 3: Throughput-adjusted error assessment** +If throughput is critically low (near zero), do not trust error rate calculations — the sample size is too small. In this case, the critical signal is the throughput loss itself, not the error rate. + +The recommended default is Rule 1 (worst metric wins) for simplicity and safety. Rule 2 can be applied when false alarm reduction is a priority and the cost of a delayed critical assessment is acceptable. + +## Assessment Methodology + +### Single Metric Assessment + +To assess a single metric: + +1. Retrieve the current value for the metric over the assessment window (typically 5 minutes). +2. Retrieve the baseline value for the same metric, adjusted for time-of-day and day-of-week if possible. +3. Compare the current value to the baseline. +4. Determine the severity based on how far the current value deviates from the baseline. +5. Record the current value, baseline value, deviation percentage, and resulting severity. + +Example: Current p99 latency is 1200ms. Baseline p99 for this time window is 400ms. The current value is 3x the baseline (200% increase). If the warning threshold is 2x and the critical threshold is 5x, the severity is degraded. + +### Composite Health Score + +A composite health score combines multiple metrics into a single numerical value. One approach: + +1. Assign each metric a score from 0 (critical) to 100 (healthy). +2. Apply weighting factors based on metric importance. +3. Compute a weighted average. 
+ +Suggested weights: +- Error rate (5xx): 40% +- Latency (p99): 25% +- Latency (p50): 15% +- Throughput stability: 20% + +A composite score above 80 is healthy. Between 50 and 80 is degraded. Below 50 is critical. + +The composite score is useful for dashboards and trend tracking. For incident triage, the individual metric breakdowns are more useful than the composite number because they point to specific problem areas. + +### Weighting Factors + +Default weights can be adjusted based on service characteristics: + +- **User-facing API services**: Weight error rate and p50 latency more heavily. Users experience error rates and median latency directly. +- **Backend processing services**: Weight throughput and p99 latency more heavily. Processing services need to maintain throughput, and tail latency can cause cascading timeouts. +- **Real-time streaming services**: Weight throughput most heavily. A throughput drop in a streaming service may indicate data loss. + +### Confidence Levels + +The confidence of a health assessment depends on data quality: + +- **High confidence**: Baseline data is available from at least 7 days of equivalent time windows. Current data covers a full assessment window with high throughput (large sample size). All three metric categories (error rate, latency, throughput) have data. +- **Medium confidence**: Baseline data is available but limited (fewer than 7 data points). Current data is complete. Most metric categories have data. +- **Low confidence**: No baseline data available (using static defaults). Current data is sparse (low throughput or partial coverage). One or more metric categories are missing. + +Always report the confidence level alongside the health assessment so that consumers of the assessment know how much to trust it. + +## Common Assessment Patterns + +### Deployment Health Check + +When assessing health after a deployment: + +1. 
Compare the post-deployment window (starting from deploy completion) against the pre-deployment window (the period immediately before the deploy). +2. Use a short current window (5-15 minutes post-deploy) and a longer comparison window (1-4 hours pre-deploy). +3. Pay special attention to error rate changes — even small increases immediately after a deploy are significant because they correlate with the change. +4. Check for latency distribution shifts. A new code path or dependency change may shift the entire distribution. +5. Verify throughput is stable — a deploy that causes a throughput drop may indicate crash-looping instances or failed health checks. + +Deployment health checks should use tighter (more sensitive) thresholds than normal monitoring because any regression correlated with a deploy is likely caused by the deploy. + +### Incident Triage Assessment + +When assessing health during an incident: + +1. Focus on the most recent data (1-5 minute windows). +2. Compare against the period just before the incident started (if the start time is known) or against the same time yesterday. +3. Identify which metrics are most affected — this points toward the problem domain. +4. Check for correlated changes across metrics. If error rate and latency both spiked at the same time, they likely share a cause. +5. Check throughput to understand the blast radius — if throughput dropped, the problem may be preventing requests from reaching the service. + +During incident triage, speed matters more than precision. A quick directional assessment ("error rate is 10x normal, latency is 3x normal, throughput is stable") is more valuable than a precise composite score. + +### Ongoing Monitoring Assessment + +For routine health monitoring (periodic checks, dashboard updates): + +1. Use longer assessment windows (15-30 minutes) for stability. +2. Compare against well-established baselines (7-28 day rolling baselines, time-adjusted). +3.
Track trends over time — is the service gradually getting slower? Is the error rate slowly creeping up? +4. Flag any metric that has been in the degraded range for an extended period (more than 1 hour) even if it has not reached the critical threshold. Sustained degradation warrants investigation. + +### Insufficient Data Assessment + +When baseline data is unavailable or the service is new: + +1. Fall back to static thresholds (see Common Defaults above). +2. Clearly flag that the assessment is low-confidence due to missing baselines. +3. Use the current assessment as the beginning of baseline data collection. +4. If the service has been running for at least a few hours, use the recent history as a short-term baseline (better than nothing, but note the limited data). +5. Recommend establishing a proper baseline period before relying on health assessments for alerting or automated responses. diff --git a/skills/observability/o11y-service-health/references/metric-types.md b/skills/observability/o11y-service-health/references/metric-types.md new file mode 100644 index 0000000..f48df94 --- /dev/null +++ b/skills/observability/o11y-service-health/references/metric-types.md @@ -0,0 +1,164 @@ +# Metric Types Reference + +This reference covers the core metric types used in service health assessments: error rates, latency percentiles, and throughput. Understanding how each metric is calculated, what it reveals, and how to interpret it correctly is essential for accurate health checks. + +## Error Rate Calculations + +### Error Rate vs Error Count + +Error count is the raw number of errors observed in a given time window. Error rate is the proportion of requests that resulted in an error, expressed as a percentage: + +``` +error_rate = (error_count / total_request_count) * 100 +``` + +Error count alone is misleading. A service handling 10 requests per second with 5 errors is in serious trouble (50% error rate). 
A service handling 10,000 requests per second with 5 errors is essentially healthy (0.05% error rate). Always prefer error rate over error count when assessing health. + +### 4xx vs 5xx Distinction + +HTTP status codes in the 400-499 range (4xx) indicate client errors. These are requests where the client sent something invalid — a malformed payload, a missing authentication token, a request for a resource that does not exist. While a sudden spike in 4xx errors may indicate a client-side problem or a breaking API change, 4xx errors do not generally indicate that the service itself is unhealthy. + +HTTP status codes in the 500-599 range (5xx) indicate server errors. These are requests where the service failed to fulfill a valid request. A 500 Internal Server Error, 502 Bad Gateway, 503 Service Unavailable, or 504 Gateway Timeout all point to problems within the service or its dependencies. 5xx errors are the primary signal for service health. + +When calculating error rate for health purposes, use 5xx error rate as the primary indicator. Track 4xx error rate separately as a secondary signal. A combined error rate (4xx + 5xx) can obscure the picture — a service with a 10% combined error rate might have 9.5% 4xx (client issues) and 0.5% 5xx (service is fine), or it might have 1% 4xx and 9% 5xx (service is in trouble). + +### Calculating Error Rate From Different Sources + +Error rates can be derived from multiple sources, and the numbers will not always agree: + +- **Load balancer metrics**: Captures all requests that reach the load balancer, including those rejected before reaching the service. Useful for the broadest view but may include errors the service never saw. +- **Application-level metrics (APM)**: Captures errors as seen by the application code. This is typically the most accurate source for service health because it reflects what the service actually experienced. 
+- **Log-derived error rates**: Calculated by counting error-level log entries against total request log entries. Accuracy depends on logging completeness. If the service fails to log certain errors, or logs non-error events at the error level, the rate will be skewed. +- **Synthetic monitor results**: Error rates from synthetic checks reflect availability from a specific vantage point. They are useful for validating that the service is reachable and responding correctly but represent a tiny fraction of actual traffic. + +When multiple sources are available, prefer APM metrics for service health. Use load balancer metrics to cross-validate. Note discrepancies between sources — they often reveal interesting problems (for example, a load balancer showing errors that the application does not see may indicate the application is crashing before it can log the error). + +### Error Rate Time Windows + +Error rate should be calculated over a meaningful time window. A 1-minute window is useful for detecting acute spikes but will be noisy. A 5-minute window smooths out brief transients while still catching real problems. A 15-minute or 30-minute window provides a stable signal but may be too slow to catch fast-moving incidents. + +For health checks, a 5-minute current window compared against a 1-hour or 24-hour baseline is a reasonable default. + +## Latency Percentile Interpretation + +### What Percentiles Mean + +A percentile represents the value below which a given percentage of observations fall. For latency: + +- **p50 (median)**: 50% of requests completed faster than this value, and 50% were slower. This represents the typical user experience. If p50 is 120ms, a typical request takes about 120ms. +- **p95**: 95% of requests completed faster than this value. Only 5% of requests were slower. This represents the experience of users who are having a somewhat bad time. If p95 is 500ms, then 1 in 20 requests takes longer than half a second. 
+- **p99**: 99% of requests completed faster than this value. Only 1% of requests were slower. This represents the tail — the worst-case experience for most users. If p99 is 2000ms, then 1 in 100 requests takes longer than 2 seconds. + +### Why Each Percentile Matters + +**p50** tells you about the common case. If p50 increases, the majority of your users are affected. A p50 regression is broad impact but may be moderate severity per user. + +**p95** tells you about the edge of normal. Many SLOs (Service Level Objectives) are defined at p95 because it captures the experience of users who are not in the happy path. A p95 regression means a meaningful minority of users are experiencing degraded performance. + +**p99** tells you about the tail. Tail latency matters because individual users make many requests during a session. If p99 is bad, a user making 100 requests will likely experience that bad latency at least once. For services that are called by other services (backend-to-backend), p99 is critical because a single slow dependency call can make the entire parent request slow. + +### How to Read Percentile Distributions + +When all three percentiles move together (p50, p95, and p99 all increase by a similar factor), the entire latency distribution has shifted. This usually indicates a systemic change — a slower dependency, increased load, or a code change that added latency to all requests. + +When p99 increases but p50 stays flat, something is affecting only a subset of requests. This could be a specific endpoint, a specific query pattern, a database lock, or garbage collection pauses. The tail is getting worse while most requests are fine. + +When p50 increases but p99 stays roughly the same, the common case got slower but the worst case did not change. This is less common but can happen when a frequently-hit code path gets slower while the already-slow paths are unchanged. + +The ratio between percentiles is informative. 
A healthy service often has p99/p50 ratios between 3x and 10x. If p99 is 50x or 100x the p50, the service has extreme tail latency, which often indicates resource contention, lock contention, or periodic background work interfering with request processing. + +### Apdex Scores + +Apdex (Application Performance Index) is a standardized way to convert latency into a satisfaction score between 0 and 1. It requires defining a target latency threshold T: + +- Requests completing in less than T are "satisfied" +- Requests completing between T and 4T are "tolerating" +- Requests completing in more than 4T are "frustrated" + +``` +apdex = (satisfied_count + (tolerating_count / 2)) / total_count +``` + +An Apdex score of 1.0 means every request was satisfied. A score of 0.5 is ambiguous on its own: it can mean that half the requests were satisfied and half were frustrated, or that every request was merely tolerated. Scores below 0.7 generally indicate a problem. + +Apdex is useful as a single summary number but it hides distribution details. Two services can have the same Apdex score with very different latency profiles. Use Apdex as a quick summary, then look at individual percentiles for detail. + +## Throughput Patterns + +### Requests Per Second + +Throughput is measured in requests per second (req/s) or requests per minute (rpm). It tells you how much work the service is doing. Throughput alone does not indicate health, but changes in throughput provide critical context for interpreting other metrics. + +### Normal Patterns + +Most services exhibit predictable throughput patterns: + +- **Diurnal patterns**: Traffic follows the wake/sleep cycle of users. B2C services typically peak mid-morning to early evening in the user's timezone. Traffic drops overnight. +- **Weekly patterns**: Weekday traffic often differs from weekend traffic. B2B services may see minimal weekend traffic while B2C services may see higher weekend traffic depending on the domain. +- **Seasonal patterns**: E-commerce services spike during holidays. Tax services spike in April.
Event-driven services spike around specific events. + +Understanding normal patterns is essential for baseline comparison. A throughput of 500 req/s might be perfectly normal at 2pm on a Tuesday but very unusual at 3am on a Sunday. + +### Capacity Indicators + +Throughput trends over weeks and months reveal capacity needs. If throughput is steadily increasing, the service is growing and will eventually need more capacity. If throughput has plateaued, growth may have stalled or the service may be at capacity (requests are being rejected or queued). + +A sudden drop in throughput is often more alarming than a sudden increase. A drop may indicate that an upstream service stopped sending traffic (possibly because it is failing), that a load balancer is routing traffic elsewhere, or that the service is rejecting requests before they are counted. + +A sudden increase in throughput may indicate a traffic spike (legitimate or attack), a retry storm from a client, or a batch job that was accidentally pointed at the production service. + +### Throughput and Error Rate Correlation + +Always examine throughput when evaluating error rate. Consider these scenarios: + +- Error rate drops from 5% to 0.1% while throughput drops from 1000 req/s to 2 req/s. The service did not get healthier — traffic stopped arriving, and the few requests that trickle in happen to succeed. The underlying problem may still exist. +- Error rate increases from 0.1% to 2% while throughput increases from 1000 req/s to 5000 req/s. The service's code may not have regressed — it could simply be hitting capacity limits under unusual load, or the traffic spike may be exposing a latent bug. +- Error rate increases from 0.1% to 5% while throughput remains stable at 1000 req/s. The service is genuinely degraded. The same volume of traffic is now producing more errors. 
+ +## Metric Sources + +### APM Tools + +Application Performance Monitoring tools (Datadog APM, New Relic, Dynatrace, Honeycomb, Lightstep) instrument the application code to capture per-request data. They provide the most detailed and accurate view of service behavior, including per-endpoint breakdowns, dependency call latency, and error classification. + +APM data is typically the best source for health assessments because it reflects what the application actually experienced. + +### Custom Metrics + +Services often emit custom metrics for business-specific indicators — items in cart, payments processed, search queries executed. These can be valuable secondary signals. If the service reports healthy error rates and latency but the "orders processed per minute" metric has dropped to zero, something is wrong that the standard metrics do not capture. + +### Log-Derived Metrics + +Metrics can be extracted from structured logs by counting events, parsing latency values from log fields, or categorizing log entries by level. Log-derived metrics are useful when APM instrumentation is not available but are inherently less precise. They depend on the service logging consistently and correctly, and the log processing pipeline introducing minimal delay. + +### Infrastructure Metrics + +CPU utilization, memory usage, disk I/O, network throughput, and container-level metrics describe the environment the service runs in. They do not directly measure service health but provide context. A service with high error rates and 98% CPU utilization has a different problem than a service with high error rates and 20% CPU utilization. + +## Common Pitfalls + +### Averaging Percentiles + +Never average percentile values across instances or time windows. If instance A has a p99 of 200ms and instance B has a p99 of 800ms, the overall p99 is NOT 500ms. Percentiles must be computed from the merged data set or approximated using histogram merging techniques (like DDSketch or HDR Histogram). 
+ +If you only have pre-computed percentiles from individual sources, you can use the maximum of the individual percentiles as a conservative upper bound on the combined percentile, but it can substantially overestimate the true value. The correct approach is to compute percentiles from the raw data or merged histograms. + +### Comparing Different Time Windows + +Comparing a 1-minute error rate against a 1-hour baseline is comparing different levels of granularity. A 1-minute window can show spikes that would be averaged out over an hour. When comparing current performance to a baseline, ensure the current window and the baseline window are computed at the same granularity, or explicitly account for the difference. + +### Ignoring Throughput When Looking at Error Rates + +As discussed above, error rate without throughput context is incomplete. A 0% error rate with 0 throughput is not healthy. Always check throughput alongside error rate. + +### Treating All Errors Equally + +A 500 Internal Server Error and a 503 Service Unavailable have different implications. A 500 often indicates a bug. A 503 often indicates the service is overloaded and deliberately shedding load. A 504 Gateway Timeout indicates a dependency is slow. Grouping all 5xx errors together for the health check is appropriate as a first pass, but investigation should distinguish between error types. + +### Ignoring Error Rate Denominators + +An error rate of 50% from 2 total requests is not statistically meaningful. Ensure the sample size is large enough to draw conclusions. For services with very low throughput, raw error counts may be more useful than error rates, and longer time windows may be needed to accumulate enough data for a meaningful rate. + +### Using Averages Instead of Percentiles for Latency + +Average latency is heavily influenced by outliers and hides the shape of the distribution. A service with an average latency of 200ms might have most requests completing in 50ms with a small number taking 5000ms. 
The average looks acceptable but roughly 1 in 33 requests (about 3%) is having a terrible experience. Always use percentiles for latency assessment.