Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/next-release/bugfix-AWSCRTHTTPClient-fae98a2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"type": "bugfix",
"category": "AWS CRT HTTP Client",
"contributor": "",
"description": "Wrap connection pool acquire timeout and other transient HTTP errors in IOException so the SDK retry layer treats them as retryable."
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

package software.amazon.awssdk.http.crt.internal;

import static java.util.Collections.unmodifiableSet;
import static software.amazon.awssdk.http.HttpMetric.AVAILABLE_CONCURRENCY;
import static software.amazon.awssdk.http.HttpMetric.CONCURRENCY_ACQUIRE_DURATION;
import static software.amazon.awssdk.http.HttpMetric.LEASED_CONCURRENCY;
Expand All @@ -25,6 +26,8 @@
import java.io.IOException;
import java.net.ConnectException;
import java.time.Duration;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CompletionException;
import javax.net.ssl.SSLHandshakeException;
import software.amazon.awssdk.annotations.SdkInternalApi;
Expand All @@ -43,7 +46,29 @@ public final class CrtUtils {
*/
public static final int CRT_SOCKET_TIMEOUT = 1048;

public static final int HEALTH_CHECK_FAILURE_ERROR_CODE = 2073;
// HTTP error codes that the native CRT classifier (CRT.awsIsTransientError) does NOT mark as transient
// but that the SDK considers recoverable. See enum aws_http_errors in aws-c-http/include/aws/http/http.h
// for symbolic names.
private static final Set<Integer> ADDITIONAL_RETRYABLE_ERROR_CODES;

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will remove the list once those error codes get added to CRT

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did we derive these seven errors?


static {
Set<Integer> codes = new HashSet<>();
// AWS_ERROR_HTTP_CHANNEL_THROUGHPUT_FAILURE (health check failure)
codes.add(2073);
// AWS_ERROR_HTTP_GOAWAY_RECEIVED
codes.add(2076);
// AWS_ERROR_HTTP_MAX_CONCURRENT_STREAMS_EXCEEDED
codes.add(2085);
// AWS_ERROR_HTTP_STREAM_MANAGER_CONNECTION_ACQUIRE_FAILURE
codes.add(2087);
// AWS_ERROR_HTTP_RESPONSE_FIRST_BYTE_TIMEOUT
codes.add(2092);
// AWS_ERROR_HTTP_CONNECTION_MANAGER_ACQUISITION_TIMEOUT
codes.add(2093);
// AWS_ERROR_HTTP_CONNECTION_MANAGER_MAX_PENDING_ACQUISITIONS_EXCEEDED
codes.add(2094);
ADDITIONAL_RETRYABLE_ERROR_CODES = unmodifiableSet(codes);
}


private CrtUtils() {
Expand All @@ -54,7 +79,7 @@ public static Throwable wrapWithIoExceptionIfRetryable(HttpException httpExcepti
int errorCode = httpException.getErrorCode();

if (CRT.awsIsTransientError(errorCode) ||
errorCode == HEALTH_CHECK_FAILURE_ERROR_CODE) {
ADDITIONAL_RETRYABLE_ERROR_CODES.contains(errorCode)) {
toThrow = new IOException(httpException);
}
return toThrow;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://aws.amazon.com/apache2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package software.amazon.awssdk.http.crt.internal;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import software.amazon.awssdk.crt.http.HttpException;

public class CrtUtilsTest {

/**
* Locks in retryability of HTTP error codes that are NOT classified as transient by the native
* {@code CRT.awsIsTransientError} but should be wrapped in {@link IOException} so the SDK retry
* layer treats them as recoverable. Codes correspond to entries in {@code enum aws_http_errors}
* in aws-c-http.
*/
@ParameterizedTest
@ValueSource(ints = {
2073, // AWS_ERROR_HTTP_CHANNEL_THROUGHPUT_FAILURE (health check failure)
2076, // AWS_ERROR_HTTP_GOAWAY_RECEIVED
2085, // AWS_ERROR_HTTP_MAX_CONCURRENT_STREAMS_EXCEEDED
2087, // AWS_ERROR_HTTP_STREAM_MANAGER_CONNECTION_ACQUIRE_FAILURE
2092, // AWS_ERROR_HTTP_RESPONSE_FIRST_BYTE_TIMEOUT
2093, // AWS_ERROR_HTTP_CONNECTION_MANAGER_ACQUISITION_TIMEOUT
2094 // AWS_ERROR_HTTP_CONNECTION_MANAGER_MAX_PENDING_ACQUISITIONS_EXCEEDED
})
public void wrapWithIoExceptionIfRetryable_retryableCode_wrapsInIOException(int errorCode) {
HttpException httpException = new HttpException(errorCode);

Throwable result = CrtUtils.wrapWithIoExceptionIfRetryable(httpException);

assertThat(result).isInstanceOf(IOException.class).hasCause(httpException);
}

/**
* Locks in non-retryability for codes that were deliberately considered and rejected during the
* allowlist analysis (PR #6812 retry-classifier regression follow-up). Each code below has a
* specific reason it must NOT auto-retry; without this guard a future contributor could re-add one
* without revisiting the rationale.
*/
@ParameterizedTest
@ValueSource(ints = {
2074, // AWS_ERROR_HTTP_PROTOCOL_ERROR - on-wire parse failure; deterministic for a misbehaving peer
2077, // AWS_ERROR_HTTP_RST_STREAM_RECEIVED - H2 stream reset; existing H2ErrorTest pins non-retry
2078, // AWS_ERROR_HTTP_RST_STREAM_SENT - symmetric to 2077
2079, // AWS_ERROR_HTTP_STREAM_NOT_ACTIVATED - API misuse on manual write
2080, // AWS_ERROR_HTTP_STREAM_HAS_COMPLETED - API misuse or secondary signal during teardown
2095 // AWS_ERROR_HTTP_STREAM_CANCELLED - explicit cancel by SDK abort path; not server-induced
})
public void wrapWithIoExceptionIfRetryable_nonRetryableCode_returnsHttpExceptionUnchanged(int errorCode) {
HttpException httpException = new HttpException(errorCode);

Throwable result = CrtUtils.wrapWithIoExceptionIfRetryable(httpException);

assertThat(result).isSameAs(httpException);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,19 @@

package software.amazon.awssdk.http.nio.netty;

import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging;

import java.net.URI;
import java.util.concurrent.CompletableFuture;
import org.junit.jupiter.api.Test;
import software.amazon.awssdk.http.HttpTestUtils;
import software.amazon.awssdk.http.SdkAsyncHttpClientLongRunningRequestTestSuite;
import software.amazon.awssdk.http.SdkHttpConfigurationOption;
import software.amazon.awssdk.http.SdkHttpFullRequest;
import software.amazon.awssdk.http.SdkHttpMethod;
import software.amazon.awssdk.http.async.SdkAsyncHttpClient;
import software.amazon.awssdk.utils.AttributeMap;

Expand All @@ -25,4 +37,43 @@ public class NettyAsyncHttpClientLongRunningRequestTest extends SdkAsyncHttpClie
protected SdkAsyncHttpClient createSdkAsyncHttpClient(AttributeMap config) {
return NettyNioAsyncHttpClient.builder().buildWithDefaults(config);
}

// TODO: NettyUtils.decorateException wraps connection-pool acquire timeouts in a plain Throwable
// (see NettyUtils.java around the AcquireTimeoutException handling) instead of an IOException, so
// the SDK retry layer treats them as non-transient. The body below is the suite's scenario minus
// the IOException cause-chain assertion, so the timing-bound contract is still verified for Netty.
@Test
@Override
public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeoutBound() throws Exception {
stubHanging(mockServer);

SdkAsyncHttpClient client = createSdkAsyncHttpClient(AttributeMap.builder()
.put(SdkHttpConfigurationOption.READ_TIMEOUT,
HANG_DELAY.plusMinutes(1))
.put(SdkHttpConfigurationOption.MAX_CONNECTIONS, 1)
.put(SdkHttpConfigurationOption
.CONNECTION_ACQUIRE_TIMEOUT,
CONFIGURED_TIMEOUT)
.build());
try {
CompletableFuture<?> firstRequest = sendRequest(client);
Thread.sleep(500);

assertFailsWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT);

firstRequest.cancel(true);
} finally {
client.close();
}
}

private CompletableFuture<byte[]> sendRequest(SdkAsyncHttpClient client) {
URI uri = URI.create("http://localhost:" + mockServer.getPort());
SdkHttpFullRequest request = SdkHttpFullRequest.builder()
.uri(uri)
.method(SdkHttpMethod.GET)
.putHeader("Host", uri.getHost())
.build();
return HttpTestUtils.sendRequest(client, request);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static com.github.tomakehurst.wiremock.client.WireMock.urlPathEqualTo;

import com.github.tomakehurst.wiremock.junit5.WireMockExtension;
import java.io.IOException;
import java.time.Duration;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
Expand Down Expand Up @@ -90,4 +91,62 @@ public static void assertFailsWithinTimeBound(CompletableFuture<?> future, Durat
// expected
}
}

/**
* Same time-bound check as {@link #assertFailsWithinTimeBound} but also asserts that the failure cause chain
* contains an {@link IOException}. The SDK retry layer relies on IOException-wrapping to classify failures as
* transient, so HTTP clients must surface acquire/connect failures as (or wrapping) {@code IOException}.
*/
public static void assertFailsWithIoExceptionWithinTimeBound(CompletableFuture<?> future, Duration expectedTimeout) {
Duration maxWait = expectedTimeout.plus(TIME_BOUND_SAFETY_MARGIN);

try {
future.get(maxWait.toMillis(), TimeUnit.MILLISECONDS);
throw new AssertionError("Expected request to throw an exception but it completed successfully");
} catch (TimeoutException e) {
future.cancel(true);
throw new AssertionError(
"Expected request to fail within " + maxWait + " but it was still running - client appears to hang",
e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new AssertionError("Unexpected interruption while waiting for request to fail", e);
} catch (ExecutionException e) {
if (!causeChainContains(e, IOException.class)) {
throw new AssertionError(
"Expected failure cause chain to contain IOException so the SDK retry layer treats the error as "
+ "transient, but found: " + describeCauseChain(e), e);
}
}
}

private static boolean causeChainContains(Throwable t, Class<? extends Throwable> type) {
Throwable current = t;
while (current != null) {
if (type.isInstance(current)) {
return true;
}
if (current.getCause() == current) {
return false;
}
current = current.getCause();
}
return false;
}

private static String describeCauseChain(Throwable t) {
StringBuilder sb = new StringBuilder();
Throwable current = t;
while (current != null) {
if (sb.length() > 0) {
sb.append(" -> ");
}
sb.append(current.getClass().getName());
if (current.getCause() == current) {
break;
}
current = current.getCause();
}
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import static com.github.tomakehurst.wiremock.core.WireMockConfiguration.wireMockConfig;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithIoExceptionWithinTimeBound;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubLongPolling;
Expand Down Expand Up @@ -90,7 +91,7 @@ public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeou
CompletableFuture<?> firstRequest = sendRequest(client);
Thread.sleep(500);

assertFailsWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT);
assertFailsWithIoExceptionWithinTimeBound(sendRequest(client), CONFIGURED_TIMEOUT);

firstRequest.cancel(true);
} finally {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import static com.github.tomakehurst.wiremock.core.WireMockConfiguration.wireMockConfig;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.CONFIGURED_TIMEOUT;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.HANG_DELAY;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithIoExceptionWithinTimeBound;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.assertFailsWithinTimeBound;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubHanging;
import static software.amazon.awssdk.http.LongRunningRequestTestSupport.stubLongPolling;
Expand Down Expand Up @@ -92,7 +93,7 @@ public void executeWhenConnectionAcquireTimeoutAndPoolExhaustedFailsWithinTimeou
CompletableFuture<?> firstRequest = executeAsync(client);
Thread.sleep(500);

assertFailsWithinTimeBound(executeAsync(client), CONFIGURED_TIMEOUT);
assertFailsWithIoExceptionWithinTimeBound(executeAsync(client), CONFIGURED_TIMEOUT);

firstRequest.cancel(true);
} finally {
Expand Down
Loading