From b68c584dc624d965465d654589ac4f5ebc79fad3 Mon Sep 17 00:00:00 2001 From: Ming Wen Date: Mon, 8 Jun 2026 07:37:20 +0800 Subject: [PATCH] fix(server): pin dp-manager REST clients to HTTP/1.1 (#535) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DP→dp-manager REST clients (budget_check, heartbeat, telemetry) all began failing at the transport layer ("error sending request") after the object_store observability sink landed. object_store pulls reqwest with its `http2` feature, and Cargo feature unification enables http2 on the single shared workspace reqwest — including the mTLS client these REST calls use. dp-manager multiplexes kine/etcd gRPC and the /dp/* REST API on one TLS port via cmux: HTTP/2 streams carrying `content-type: application/grpc` go to kine, everything else to an HTTP/1.1-only gin server. With http2 enabled the REST client negotiates ALPN h2, cmux routes the non-grpc h2 connection to the gin listener, which can't parse the h2 preface, so every budget_check / heartbeat / telemetry request dies before a response. The etcd/kine path is unaffected because it speaks real gRPC. Pin these clients to HTTP/1.1 with `.http1_only()` so they advertise only http/1.1 in ALPN regardless of which reqwest features other deps unify in. Add regression tests asserting the CP-REST ClientHello advertises http/1.1 and not h2. --- crates/aisix-server/src/heartbeat.rs | 76 +++++++++++++++++++++++++++- crates/aisix-server/src/telemetry.rs | 65 +++++++++++++++++++++++- 2 files changed, 139 insertions(+), 2 deletions(-) diff --git a/crates/aisix-server/src/heartbeat.rs b/crates/aisix-server/src/heartbeat.rs index 7a108352..63050439 100644 --- a/crates/aisix-server/src/heartbeat.rs +++ b/crates/aisix-server/src/heartbeat.rs @@ -283,7 +283,20 @@ fn build_client(mtls: &MtlsBundle) -> anyhow::Result { .user_agent(format!("aisix-dp/{}", env!("CARGO_PKG_VERSION"))) .identity(identity) .add_root_certificate(ca) - .use_rustls_tls(); + .use_rustls_tls() + // Pin HTTP/1.1 for the dp-manager REST surface. dp-manager + // multiplexes kine/etcd gRPC and the /dp/* REST API on one TLS + // port via cmux: HTTP/2 streams carrying + // `content-type: application/grpc` go to kine, everything else to + // an HTTP/1.1-only gin server. If this client negotiates ALPN h2 + // — which it does whenever reqwest's `http2` feature is enabled, + // and an observability-sink dependency (object_store) unifies that + // feature onto the shared workspace reqwest — every budget_check / + // heartbeat / telemetry request lands on the gin listener, which + // can't parse the h2 preface, and fails at the transport layer. + // This line is load-bearing; do not remove without making the CP + // REST listener h2-capable. See issue #535. + .http1_only(); // Operator-supplied extra root (e2e / on-prem). Covers the dp- // manager server cert when it's signed by a CA distinct from the // cert-manager-issued client-cert CA. @@ -553,4 +566,65 @@ mod tests { }; build_client(&mtls).expect("build_client must tolerate PEM without trailing newline"); } + + /// Regression for #535: the dp-manager REST client must advertise + /// **only** `http/1.1` in its TLS ALPN — never `h2`. dp-manager + /// multiplexes kine/etcd gRPC and the `/dp/*` REST API on a single + /// TLS port via cmux; an h2 REST connection is misrouted to the + /// HTTP/1.1-only gin listener and every budget_check / heartbeat / + /// telemetry request then fails at the transport layer. The + /// `.http1_only()` in `build_client` pins this. We inspect the raw + /// ClientHello (sent in the clear before the handshake) instead of + /// completing a TLS session, so the test needs no server cert and + /// can't flake on handshake details. + #[tokio::test] + async fn cp_rest_client_advertises_only_http1_alpn() { + use tokio::io::AsyncReadExt; + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let capture = tokio::spawn(async move { + let (mut sock, _) = listener.accept().await.unwrap(); + let mut buf = vec![0u8; 4096]; + let n = sock.read(&mut buf).await.unwrap_or(0); + buf.truncate(n); + buf + }); + + let dir = tempfile::tempdir().unwrap(); + let mtls = write_test_bundle(dir.path()); + let client = build_client(&mtls).expect("build mTLS client"); + + // One request makes the client emit a ClientHello; we never + // reply, so it errors out — only the ClientHello matters. + let _ = tokio::time::timeout( + Duration::from_secs(2), + client + .get(format!("https://127.0.0.1:{}/dp/budget_check", addr.port())) + .send(), + ) + .await; + + let hello = tokio::time::timeout(Duration::from_secs(2), capture) + .await + .expect("ClientHello capture timed out") + .expect("capture task panicked"); + + // ALPN entries are length-prefixed: 0x08 "http/1.1", 0x02 "h2". + // Find the http/1.1 entry (9 bytes — no realistic random + // collision), then assert the 3 bytes right before it are NOT + // the h2 entry, which is exactly where an [h2, http/1.1] list + // would place it. + let http11 = b"\x08http/1.1"; + let pos = hello + .windows(http11.len()) + .position(|w| w == http11) + .expect("ClientHello must advertise http/1.1 ALPN"); + let preceded_by_h2 = pos >= 3 && &hello[pos - 3..pos] == b"\x02h2"; + assert!( + !preceded_by_h2, + "CP-REST ClientHello must NOT advertise h2 ALPN — dp-manager's \ + cmux would misroute the h2 REST connection (#535)", + ); + } } diff --git a/crates/aisix-server/src/telemetry.rs b/crates/aisix-server/src/telemetry.rs index 892efa34..6608a079 100644 --- a/crates/aisix-server/src/telemetry.rs +++ b/crates/aisix-server/src/telemetry.rs @@ -244,7 +244,13 @@ fn build_client(mtls: &MtlsBundle) -> anyhow::Result { .user_agent(format!("aisix-dp/{}", env!("CARGO_PKG_VERSION"))) .identity(identity) .add_root_certificate(ca) - .use_rustls_tls(); + .use_rustls_tls() + // Pin HTTP/1.1 for the dp-manager REST surface — load-bearing; see + // the full rationale in heartbeat::build_client. Without it, an + // obs-sink dependency unifying reqwest's `http2` feature pushes + // telemetry onto ALPN h2, which dp-manager's cmux routes to its + // HTTP/1.1-only gin listener, failing every request. See issue #535. + .http1_only(); // Mirror heartbeat::build_client — pick up the operator-supplied // extra trust root (managed.cp_ca_cert_file) when set. if let Some(extra) = mtls.extra_ca_pem.as_ref() { @@ -417,4 +423,61 @@ mod tests { }; build_client(&mtls).expect("build_client must tolerate PEM without trailing newline"); } + + /// Regression for #535: the telemetry client (a separate copy of the + /// CP-REST mTLS builder) must advertise **only** `http/1.1` in its + /// TLS ALPN — never `h2`. dp-manager multiplexes kine/etcd gRPC and + /// the `/dp/*` REST API on a single TLS port via cmux; an h2 REST + /// connection is misrouted to the HTTP/1.1-only gin listener and the + /// request fails at the transport layer. `.http1_only()` pins this. + /// We inspect the raw ClientHello rather than completing a TLS + /// session, so the test needs no server cert and can't flake. + #[tokio::test] + async fn cp_rest_client_advertises_only_http1_alpn() { + use tokio::io::AsyncReadExt; + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let capture = tokio::spawn(async move { + let (mut sock, _) = listener.accept().await.unwrap(); + let mut buf = vec![0u8; 4096]; + let n = sock.read(&mut buf).await.unwrap_or(0); + buf.truncate(n); + buf + }); + + let dir = tempfile::tempdir().unwrap(); + let mtls = write_test_bundle(dir.path()); + let client = build_client(&mtls).expect("build mTLS client"); + + // One request makes the client emit a ClientHello; we never + // reply, so it errors out — only the ClientHello matters. + let _ = tokio::time::timeout( + Duration::from_secs(2), + client + .get(format!("https://127.0.0.1:{}/dp/telemetry", addr.port())) + .send(), + ) + .await; + + let hello = tokio::time::timeout(Duration::from_secs(2), capture) + .await + .expect("ClientHello capture timed out") + .expect("capture task panicked"); + + // ALPN entries are length-prefixed: 0x08 "http/1.1", 0x02 "h2". + // Find the http/1.1 entry, then assert the 3 bytes right before + // it are NOT the h2 entry (where an [h2, http/1.1] list puts it). + let http11 = b"\x08http/1.1"; + let pos = hello + .windows(http11.len()) + .position(|w| w == http11) + .expect("ClientHello must advertise http/1.1 ALPN"); + let preceded_by_h2 = pos >= 3 && &hello[pos - 3..pos] == b"\x02h2"; + assert!( + !preceded_by_h2, + "telemetry ClientHello must NOT advertise h2 ALPN — dp-manager's \ + cmux would misroute the h2 REST connection (#535)", + ); + } }