diff --git a/.changes/next-release/bugfix-AWSCRTHTTPClient-fae98a2.json b/.changes/next-release/bugfix-AWSCRTHTTPClient-fae98a2.json new file mode 100644 index 000000000000..6d2a88889f8d --- /dev/null +++ b/.changes/next-release/bugfix-AWSCRTHTTPClient-fae98a2.json @@ -0,0 +1,6 @@ +{ + "type": "bugfix", + "category": "AWS CRT HTTP Client", + "contributor": "", + "description": "Wrap connection pool acquire timeout and other transient HTTP errors in IOException so the SDK retry layer treats them as retryable." +} diff --git a/http-clients/aws-crt-client/src/main/java/software/amazon/awssdk/http/crt/internal/CrtUtils.java b/http-clients/aws-crt-client/src/main/java/software/amazon/awssdk/http/crt/internal/CrtUtils.java index f4ced975f90b..8e1650c56057 100644 --- a/http-clients/aws-crt-client/src/main/java/software/amazon/awssdk/http/crt/internal/CrtUtils.java +++ b/http-clients/aws-crt-client/src/main/java/software/amazon/awssdk/http/crt/internal/CrtUtils.java @@ -15,6 +15,7 @@ package software.amazon.awssdk.http.crt.internal; +import static java.util.Collections.unmodifiableSet; import static software.amazon.awssdk.http.HttpMetric.AVAILABLE_CONCURRENCY; import static software.amazon.awssdk.http.HttpMetric.CONCURRENCY_ACQUIRE_DURATION; import static software.amazon.awssdk.http.HttpMetric.LEASED_CONCURRENCY; @@ -25,6 +26,8 @@ import java.io.IOException; import java.net.ConnectException; import java.time.Duration; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.CompletionException; import javax.net.ssl.SSLHandshakeException; import software.amazon.awssdk.annotations.SdkInternalApi; @@ -43,7 +46,29 @@ public final class CrtUtils { */ public static final int CRT_SOCKET_TIMEOUT = 1048; - public static final int HEALTH_CHECK_FAILURE_ERROR_CODE = 2073; + // HTTP error codes that the native CRT classifier (CRT.awsIsTransientError) does NOT mark as transient + // but that the SDK considers recoverable. See enum aws_http_errors in aws-c-http/include/aws/http/http.h + // for symbolic names. + private static final Set ADDITIONAL_RETRYABLE_ERROR_CODES; + + static { + Set codes = new HashSet<>(); + // AWS_ERROR_HTTP_CHANNEL_THROUGHPUT_FAILURE (health check failure) + codes.add(2073); + // AWS_ERROR_HTTP_GOAWAY_RECEIVED + codes.add(2076); + // AWS_ERROR_HTTP_MAX_CONCURRENT_STREAMS_EXCEEDED + codes.add(2085); + // AWS_ERROR_HTTP_STREAM_MANAGER_CONNECTION_ACQUIRE_FAILURE + codes.add(2087); + // AWS_ERROR_HTTP_RESPONSE_FIRST_BYTE_TIMEOUT + codes.add(2092); + // AWS_ERROR_HTTP_CONNECTION_MANAGER_ACQUISITION_TIMEOUT + codes.add(2093); + // AWS_ERROR_HTTP_CONNECTION_MANAGER_MAX_PENDING_ACQUISITIONS_EXCEEDED + codes.add(2094); + ADDITIONAL_RETRYABLE_ERROR_CODES = unmodifiableSet(codes); + } private CrtUtils() { @@ -54,7 +79,7 @@ public static Throwable wrapWithIoExceptionIfRetryable(HttpException httpExcepti int errorCode = httpException.getErrorCode(); if (CRT.awsIsTransientError(errorCode) || - errorCode == HEALTH_CHECK_FAILURE_ERROR_CODE) { + ADDITIONAL_RETRYABLE_ERROR_CODES.contains(errorCode)) { toThrow = new IOException(httpException); } return toThrow; diff --git a/http-clients/aws-crt-client/src/test/java/software/amazon/awssdk/http/crt/internal/CrtUtilsTest.java b/http-clients/aws-crt-client/src/test/java/software/amazon/awssdk/http/crt/internal/CrtUtilsTest.java new file mode 100644 index 000000000000..5cee0bb4770a --- /dev/null +++ b/http-clients/aws-crt-client/src/test/java/software/amazon/awssdk/http/crt/internal/CrtUtilsTest.java @@ -0,0 +1,73 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package software.amazon.awssdk.http.crt.internal; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import software.amazon.awssdk.crt.http.HttpException; + +public class CrtUtilsTest { + + /** + * Locks in retryability of HTTP error codes that are NOT classified as transient by the native + * {@code CRT.awsIsTransientError} but should be wrapped in {@link IOException} so the SDK retry + * layer treats them as recoverable. Codes correspond to entries in {@code enum aws_http_errors} + * in aws-c-http. + */ + @ParameterizedTest + @ValueSource(ints = { + 2073, // AWS_ERROR_HTTP_CHANNEL_THROUGHPUT_FAILURE (health check failure) + 2076, // AWS_ERROR_HTTP_GOAWAY_RECEIVED + 2085, // AWS_ERROR_HTTP_MAX_CONCURRENT_STREAMS_EXCEEDED + 2087, // AWS_ERROR_HTTP_STREAM_MANAGER_CONNECTION_ACQUIRE_FAILURE + 2092, // AWS_ERROR_HTTP_RESPONSE_FIRST_BYTE_TIMEOUT + 2093, // AWS_ERROR_HTTP_CONNECTION_MANAGER_ACQUISITION_TIMEOUT + 2094 // AWS_ERROR_HTTP_CONNECTION_MANAGER_MAX_PENDING_ACQUISITIONS_EXCEEDED + }) + public void wrapWithIoExceptionIfRetryable_retryableCode_wrapsInIOException(int errorCode) { + HttpException httpException = new HttpException(errorCode); + + Throwable result = CrtUtils.wrapWithIoExceptionIfRetryable(httpException); + + assertThat(result).isInstanceOf(IOException.class).hasCause(httpException); + } + + /** + * Locks in non-retryability for codes that were deliberately considered and rejected during the + * allowlist analysis (PR #6812 retry-classifier regression follow-up). Each code below has a + * specific reason it must NOT auto-retry; without this guard a future contributor could re-add one + * without revisiting the rationale. + */ + @ParameterizedTest + @ValueSource(ints = { + 2074, // AWS_ERROR_HTTP_PROTOCOL_ERROR - on-wire parse failure; deterministic for a misbehaving peer + 2077, // AWS_ERROR_HTTP_RST_STREAM_RECEIVED - H2 stream reset; existing H2ErrorTest pins non-retry + 2078, // AWS_ERROR_HTTP_RST_STREAM_SENT - symmetric to 2077 + 2079, // AWS_ERROR_HTTP_STREAM_NOT_ACTIVATED - API misuse on manual write + 2080, // AWS_ERROR_HTTP_STREAM_HAS_COMPLETED - API misuse or secondary signal during teardown + 2095 // AWS_ERROR_HTTP_STREAM_CANCELLED - explicit cancel by SDK abort path; not server-induced + }) + public void wrapWithIoExceptionIfRetryable_nonRetryableCode_returnsHttpExceptionUnchanged(int errorCode) { + HttpException httpException = new HttpException(errorCode); + + Throwable result = CrtUtils.wrapWithIoExceptionIfRetryable(httpException); + + assertThat(result).isSameAs(httpException); + } +} diff --git a/http-clients/netty-nio-client/src/test/java/software/amazon/awssdk/http/nio/netty/NettyAsyncHttpClientLongRunningRequestTest.java b/http-clients/netty-nio-client/src/test/java/software/amazon/awssdk/http/nio/netty/NettyAsyncHttpClientLongRunningRequestTest.java index e5e11fd9e631..dcafdf261014 100644 --- a/http-clients/netty-nio-client/src/test/java/software/amazon/awssdk/http/nio/netty/NettyAsyncHttpClientLongRunningRequestTest.java +++ b/http-clients/netty-nio-client/src/test/java/software/amazon/awssdk/http/nio/netty/NettyAsyncHttpClientLongRunningRequestTest.java @@ -15,7 +15,19 @@ package software.amazon.awssdk.http.nio.netty; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging; + +import java.net.URI; +import java.util.concurrent.CompletableFuture; +import org.junit.jupiter.api.Test; +import software.amazon.awssdk.http.HttpTestUtils; import software.amazon.awssdk.http.SdkAsyncHttpClientLongRunningRequestTestSuite; +import software.amazon.awssdk.http.SdkHttpConfigurationOption; +import software.amazon.awssdk.http.SdkHttpFullRequest; +import software.amazon.awssdk.http.SdkHttpMethod; import software.amazon.awssdk.http.async.SdkAsyncHttpClient; import software.amazon.awssdk.utils.AttributeMap; @@ -25,4 +37,43 @@ public class NettyAsyncHttpClientLongRunningRequestTest extends SdkAsyncHttpClie protected SdkAsyncHttpClient createSdkAsyncHttpClient(AttributeMap config) { return NettyNioAsyncHttpClient.builder().buildWithDefaults(config); } + + // TODO: NettyUtils.decorateException wraps connection-pool acquire timeouts in a plain Throwable + // (see NettyUtils.java around the AcquireTimeoutException handling) instead of an IOException, so + // the SDK retry layer treats them as non-transient. The body below is the suite's scenario minus + // the IOException cause-chain assertion, so the timing-bound contract is still verified for Netty. + @Test + @Override + public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeoutBound() throws Exception { + stubHanging(mockServer); + + SdkAsyncHttpClient client = createSdkAsyncHttpClient(AttributeMap.builder() + .put(SdkHttpConfigurationOption.READ_TIMEOUT, + HANG_DELAY.plusMinutes(1)) + .put(SdkHttpConfigurationOption.MAX_CONNECTIONS, 1) + .put(SdkHttpConfigurationOption + .CONNECTION_ACQUIRE_TIMEOUT, + CONFIGURED_TIMEOUT) + .build()); + try { + CompletableFuture firstRequest = sendRequest(client); + Thread.sleep(500); + + assertFailsWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT); + + firstRequest.cancel(true); + } finally { + client.close(); + } + } + + private CompletableFuture sendRequest(SdkAsyncHttpClient client) { + URI uri = URI.create("http://localhost:" + mockServer.getPort()); + SdkHttpFullRequest request = SdkHttpFullRequest.builder() + .uri(uri) + .method(SdkHttpMethod.GET) + .putHeader("Host", uri.getHost()) + .build(); + return HttpTestUtils.sendRequest(client, request); + } } diff --git a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/LongRunningRequestTestSupport.java b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/LongRunningRequestTestSupport.java index 723869ad8525..1149f4930a6a 100644 --- a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/LongRunningRequestTestSupport.java +++ b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/LongRunningRequestTestSupport.java @@ -20,6 +20,7 @@ import static com.github.tomakehurst.wiremock.client.WireMock.urlPathEqualTo; import com.github.tomakehurst.wiremock.junit5.WireMockExtension; +import java.io.IOException; import java.time.Duration; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; @@ -90,4 +91,62 @@ public static void assertFailsWithinTimeBound(CompletableFuture future, Durat // expected } } + + /** + * Same time-bound check as {@link #assertFailsWithinTimeBound} but also asserts that the failure cause chain + * contains an {@link IOException}. The SDK retry layer relies on IOException-wrapping to classify failures as + * transient, so HTTP clients must surface acquire/connect failures as (or wrapping) {@code IOException}. + */ + public static void assertFailsWithIoExceptionWithinTimeBound(CompletableFuture future, Duration expectedTimeout) { + Duration maxWait = expectedTimeout.plus(TIME_BOUND_SAFETY_MARGIN); + + try { + future.get(maxWait.toMillis(), TimeUnit.MILLISECONDS); + throw new AssertionError("Expected request to throw an exception but it completed successfully"); + } catch (TimeoutException e) { + future.cancel(true); + throw new AssertionError( + "Expected request to fail within " + maxWait + " but it was still running - client appears to hang", + e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new AssertionError("Unexpected interruption while waiting for request to fail", e); + } catch (ExecutionException e) { + if (!causeChainContains(e, IOException.class)) { + throw new AssertionError( + "Expected failure cause chain to contain IOException so the SDK retry layer treats the error as " + + "transient, but found: " + describeCauseChain(e), e); + } + } + } + + private static boolean causeChainContains(Throwable t, Class type) { + Throwable current = t; + while (current != null) { + if (type.isInstance(current)) { + return true; + } + if (current.getCause() == current) { + return false; + } + current = current.getCause(); + } + return false; + } + + private static String describeCauseChain(Throwable t) { + StringBuilder sb = new StringBuilder(); + Throwable current = t; + while (current != null) { + if (sb.length() > 0) { + sb.append(" -> "); + } + sb.append(current.getClass().getName()); + if (current.getCause() == current) { + break; + } + current = current.getCause(); + } + return sb.toString(); + } } diff --git a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkAsyncHttpClientLongRunningRequestTestSuite.java b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkAsyncHttpClientLongRunningRequestTestSuite.java index a7a78eb76b58..d6cc72a6be4b 100644 --- a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkAsyncHttpClientLongRunningRequestTestSuite.java +++ b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkAsyncHttpClientLongRunningRequestTestSuite.java @@ -18,6 +18,7 @@ import static com.github.tomakehurst.wiremock.core.WireMockConfiguration.wireMockConfig; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithIoExceptionWithinTimeBound; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubLongPolling; @@ -90,7 +91,7 @@ public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeou CompletableFuture firstRequest = sendRequest(client); Thread.sleep(500); - assertFailsWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT); + assertFailsWithIoExceptionWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT); firstRequest.cancel(true); } finally { diff --git a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkHttpClientLongRunningRequestTestSuite.java b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkHttpClientLongRunningRequestTestSuite.java index 0e52eb19eb49..c10b40209503 100644 --- a/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkHttpClientLongRunningRequestTestSuite.java +++ b/test/http-client-tests/src/main/java/software/amazon/awssdk/http/SdkHttpClientLongRunningRequestTestSuite.java @@ -18,6 +18,7 @@ import static com.github.tomakehurst.wiremock.core.WireMockConfiguration.wireMockConfig; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY; +import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithIoExceptionWithinTimeBound; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging; import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubLongPolling; @@ -92,7 +93,7 @@ public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeou CompletableFuture firstRequest = executeAsync(client); Thread.sleep(500); - assertFailsWithinTimeBound(executeAsync(client), CONFIGURED_TIMEOUT); + assertFailsWithIoExceptionWithinTimeBound(executeAsync(client), CONFIGURED_TIMEOUT); firstRequest.cancel(true); } finally {