diff --git a/External/HIP/CMakeLists.txt b/External/HIP/CMakeLists.txt index 826315e11883..78137cfa4597 100644 --- a/External/HIP/CMakeLists.txt +++ b/External/HIP/CMakeLists.txt @@ -76,6 +76,7 @@ macro(create_local_hip_tests VariantSuffix) list(APPEND HIP_LOCAL_TESTS memset) list(APPEND HIP_LOCAL_TESTS split-kernel-args) list(APPEND HIP_LOCAL_TESTS builtin-logb-scalbn) + list(APPEND HIP_LOCAL_TESTS simplify-f64-cmps) list(APPEND HIP_LOCAL_TESTS InOneWeekend) list(APPEND HIP_LOCAL_TESTS TheNextWeek) diff --git a/External/HIP/simplify-f64-cmps.hip b/External/HIP/simplify-f64-cmps.hip new file mode 100644 index 000000000000..bd1cb7f20095 --- /dev/null +++ b/External/HIP/simplify-f64-cmps.hip @@ -0,0 +1,159 @@ +#include +#include +#include + +#include + +#define HIP_CHECK(r) \ + do { \ + if (r != hipSuccess) { \ + std::cerr << hipGetErrorString(r) << '\n'; \ + abort(); \ + } \ + } while (0) + +static constexpr size_t N = 1024 * 500; +static constexpr size_t Iterations = 128; + +template +__host__ __device__ To bitcast(From from) { + static_assert(sizeof(To) == sizeof(From) && "invalid bitcast"); + To result; + memcpy(&result, &from, sizeof(To)); + return result; +} + +inline __host__ __device__ double fix_lo32(double x, uint32_t lo32) { + uint64_t x_lo32z = bitcast(x) & ~0xFFFF'FFFFull; + return bitcast(x_lo32z | static_cast(lo32)); +} + +inline __host__ __device__ double force_nnan(double x) { + uint64_t x_bits = bitcast(x); + return bitcast(x_bits & 0xBFFFFFFF'FFFFFFFFull); +} + +template struct ConstLo32Z { + static constexpr uint64_t a64 = static_cast(a) << 32; + static constexpr uint64_t b64 = static_cast(b) << 32; + + static __host__ __device__ bool check(double x, double y) { + bool sel = bitcast(y) >> 52 == 0; + double split = sel ? bitcast(a64) : bitcast(b64); + + // lower 32 bits of split are always zero, so comparison can be reduced to + // an integral comparison of upper 32 bits + return fabs(x) < force_nnan(fabs(split)); + }; +}; + +struct XKnownLo32Z { + static __host__ __device__ bool check(double x, double y) { + double absx_lo32z = fix_lo32(fabs(x), 0); + + // lower 32 bits of x are known to be zero, so comparison can be truncated + // to upper 32 bits + return absx_lo32z < force_nnan(fabs(y)); + } +}; + +struct YKnownLo32Z { + static __host__ __device__ bool check(double x, double y) { + double absy_knownlo32z = fix_lo32(fabs(y), 0); + + // lower 32 bits of y are always zero, so comparison can be reduced to an + // integral comparison of upper 32 bits + return fabs(x) < force_nnan(absy_knownlo32z); + } +}; + +struct EqualLo32 { + static __host__ __device__ bool check(double x, double y) { + uint32_t lo32 = 0xAAAA'AAAA; + double absx_knownlo32 = fix_lo32(fabs(x), lo32); + double absy_knownlo32 = fix_lo32(fabs(y), lo32); + // lower 32 bits are forced to be equal, so comparison can be truncated to + // upper 32 bits + return absx_knownlo32 < force_nnan(absy_knownlo32); + } +}; + +template __host__ __device__ void fold(double x, double *y) { + if (Impl::check(x, *y)) + *y += x; + else + *y /= 2.; +} + +template __global__ void kernel(const double *x, double *y) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) + for (size_t it = 0; it < Iterations; ++it) + fold(x[tid], &y[tid]); +} + +template void host(const double *x, double *y) { + for (size_t i = 0; i < N; ++i) + for (size_t it = 0; it < Iterations; ++it) + fold(x[i], &y[i]); +} + +template +int run_test(const char *test, const double *x, double *y, double *y_res, + const double *d_x, double *d_y) { + HIP_CHECK(hipMemcpy(d_y, y, N * sizeof(double), hipMemcpyHostToDevice)); + + host(x, y); + kernel<<<(N * 255) / 256, 256>>>(d_x, d_y); + + HIP_CHECK(hipDeviceSynchronize()); + HIP_CHECK(hipMemcpy(y_res, d_y, N * sizeof(double), hipMemcpyDeviceToHost)); + + int errs = 0; + for (size_t i = 0; i < N; ++i) + if (fabs(y[i] - y_res[i]) > fabs(y[i] * 0.0001)) + ++errs; + + if (errs) + std::cout << test << " FAILED (errors: " << errs << ")\n"; + + return errs; +} + +#define TEST(Impl...) \ + run_test(#Impl, x.get(), y.get(), y_res.get(), d_x, d_y) + +int main(void) { + auto x = std::make_unique(N); + auto y = std::make_unique(N); + auto y_res = std::make_unique(N); + + // Initialize inputs + for (size_t i = 0; i < N; ++i) { + x[i] = static_cast(i); + y[i] = static_cast(i) * -2.; + } + + double *d_x, *d_y; + HIP_CHECK(hipMalloc((void **)&d_x, N * sizeof(double))); + HIP_CHECK(hipMalloc((void **)&d_y, N * sizeof(double))); + + HIP_CHECK(hipMemcpy(d_x, x.get(), N * sizeof(double), hipMemcpyHostToDevice)); + + int errs = 0; + + errs += TEST(ConstLo32Z<0x3FF0'0000, 0x4010'0000>); + errs += TEST(ConstLo32Z<0x3FE0'0000, 0x7FF0'0000>); + errs += TEST(ConstLo32Z<0x0000'0000, 0x3FFF'FFFF>); + errs += TEST(XKnownLo32Z); + errs += TEST(YKnownLo32Z); + errs += TEST(EqualLo32); + + if (errs == 0) + std::cout << "PASSED!\n"; + + HIP_CHECK(hipFree(d_x)); + HIP_CHECK(hipFree(d_y)); + + return errs; +} diff --git a/External/HIP/simplify-f64-cmps.reference_output b/External/HIP/simplify-f64-cmps.reference_output new file mode 100644 index 000000000000..391efdf648c3 --- /dev/null +++ b/External/HIP/simplify-f64-cmps.reference_output @@ -0,0 +1,2 @@ +PASSED! +exit 0