From 4b78a6d44c58f54e84b6cc46c8103b41fddc6cb0 Mon Sep 17 00:00:00 2001 From: expend20 <36543551+expend20@users.noreply.github.com> Date: Mon, 26 Jan 2026 00:02:59 -0500 Subject: [PATCH 1/3] Add Windows ARM64 build support The changes include: * ARM64 architecture detection in CMakeLists.txt * ARM64 Windows SEH trampoline in translate-all.c * Native ARM64 setjmp/longjmp assembly wrappers * MSVC compatibility fixes for aarch64 TCG backend * Disabled x86-specific intrinsics on ARM64 --- CMakeLists.txt | 37 +++++-- qemu/accel/tcg/translate-all.c | 94 ++++++++++++++++- qemu/include/qemu/atomic128.h | 4 +- qemu/include/sysemu/os-win32.h | 7 ++ qemu/include/tcg/tcg-opc.h | 4 +- qemu/target/i386/cpu.c | 7 +- qemu/tcg/aarch64/tcg-target.inc.c | 32 ++++-- qemu/util/cacheinfo.c | 2 +- qemu/util/setjmp-wrapper-win32-arm64.asm | 124 +++++++++++++++++++++++ 9 files changed, 285 insertions(+), 26 deletions(-) create mode 100644 qemu/util/setjmp-wrapper-win32-arm64.asm diff --git a/CMakeLists.txt b/CMakeLists.txt index 033db0965d..1802155321 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,12 +121,21 @@ endif() # we do this manually by adding flags. 
set(ATOMIC_LINKAGE_FIX FALSE) if(MSVC) - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(MSVC_FLAG -D__x86_64__) + # Detect target architecture using CMAKE_SYSTEM_PROCESSOR or compiler checks + if(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64") + set(MSVC_FLAG -D__aarch64__) + set(MSVC_TCG_PATH /I${CMAKE_CURRENT_SOURCE_DIR}/qemu/tcg/aarch64) + set(MSVC_TARGET_ARCH "aarch64") + elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(MSVC_FLAG -D__x86_64__) + set(MSVC_TCG_PATH /I${CMAKE_CURRENT_SOURCE_DIR}/qemu/tcg/i386) + set(MSVC_TARGET_ARCH "x86_64") elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(MSVC_FLAG -D__i386__) + set(MSVC_FLAG -D__i386__) + set(MSVC_TCG_PATH /I${CMAKE_CURRENT_SOURCE_DIR}/qemu/tcg/i386) + set(MSVC_TARGET_ARCH "i386") else() - message(FATAL_ERROR "Neither WIN64 or WIN32!") + message(FATAL_ERROR "Unsupported architecture!") endif() add_compile_options( @@ -135,7 +144,7 @@ if(MSVC) -D_CRT_SECURE_NO_WARNINGS -DWIN32_LEAN_AND_MEAN ${MSVC_FLAG} - /I${CMAKE_CURRENT_SOURCE_DIR}/qemu/tcg/i386 + ${MSVC_TCG_PATH} ) # Disable some warnings @@ -1214,7 +1223,16 @@ if(MSVC) qemu/util/oslib-win32.c qemu/util/qemu-thread-win32.c ) - if(CMAKE_SIZEOF_VOID_P EQUAL 8) + if(MSVC_TARGET_ARCH STREQUAL "aarch64") + # ARM64 uses armasm64 for assembly + add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/setjmp-wrapper-win32-arm64.obj" + COMMAND armasm64 -o "${CMAKE_CURRENT_BINARY_DIR}/setjmp-wrapper-win32-arm64.obj" "${CMAKE_CURRENT_SOURCE_DIR}/qemu/util/setjmp-wrapper-win32-arm64.asm" + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/qemu/util/setjmp-wrapper-win32-arm64.asm" + COMMENT "Building ARM64 setjmp wrapper" + ) + set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/setjmp-wrapper-win32-arm64.obj" PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) + set(UNICORN_COMMON_SRCS ${UNICORN_COMMON_SRCS} "${CMAKE_CURRENT_BINARY_DIR}/setjmp-wrapper-win32-arm64.obj") + elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) if(MSVC_VERSION LESS 1600 AND MSVC_IDE) add_custom_command(OUTPUT 
"${CMAKE_CURRENT_SOURCE_DIR}/build/setjmp-wrapper-win32.dir/setjmp-wrapper-win32.obj" COMMAND ml64 /c /nologo /Fo"${CMAKE_CURRENT_SOURCE_DIR}/build/setjmp-wrapper-win32.dir/setjmp-wrapper-win32.obj" /W3 /errorReport:prompt /Ta"${CMAKE_CURRENT_SOURCE_DIR}/qemu/util/setjmp-wrapper-win32.asm" @@ -1467,6 +1485,13 @@ if (UNICORN_LEGACY_STATIC_ARCHIVE) set_target_properties(unicorn PROPERTIES OUTPUT_NAME "unicorn-static") bundle_static_library(unicorn unicorn_archive unicorn) endif() + # Merge ARM64 setjmp wrapper into the archive (bundle_static_library doesn't include external objects) + if(MSVC AND MSVC_TARGET_ARCH STREQUAL "aarch64") + add_custom_command(TARGET unicorn_archive POST_BUILD + COMMAND lib /NOLOGO "$" "${CMAKE_CURRENT_BINARY_DIR}/setjmp-wrapper-win32-arm64.obj" /OUT:"$" + COMMENT "Merging ARM64 setjmp wrapper into unicorn archive" + ) + endif() endif() if(UNICORN_FUZZ) diff --git a/qemu/accel/tcg/translate-all.c b/qemu/accel/tcg/translate-all.c index 0d65cfe2a1..ca6267ea2d 100644 --- a/qemu/accel/tcg/translate-all.c +++ b/qemu/accel/tcg/translate-all.c @@ -927,7 +927,99 @@ static inline void *alloc_code_gen_buffer(struct uc_struct *uc) uc->seh_closure = closure; data = closure + CLOSURE_SIZE /2; -#ifdef _WIN64 +#if defined(_WIN64) && defined(_M_ARM64) + /* + * ARM64 Windows trampoline. + * On ARM64 Windows, the calling convention passes first arg in x0, second in x1. + * We need to pass the uc pointer as the second argument to the handler. + * + * ARM64 instructions are 4 bytes each. + * We build a simple trampoline that: + * 1. Saves x1 and lr to data area + * 2. Loads uc pointer into x1 + * 3. Loads handler address and calls it (using blr) + * 4. Restores x1 and lr + * 5. 
Returns + */ + { + uint32_t *code = (uint32_t *)closure; + uint64_t data_addr = (uint64_t)data; + uint64_t handler_addr = (uint64_t)handler; + uint64_t uc_addr = (uint64_t)uc; + + /* Store uc and handler pointers in data area */ + memcpy(data + 0x00, &uc_addr, 8); /* data[0x00]: uc pointer */ + memcpy(data + 0x08, &handler_addr, 8); /* data[0x08]: handler pointer */ + /* data[0x10]: saved x1 */ + /* data[0x18]: saved lr */ + + /* + * Generate ARM64 code: + * We use x9 as scratch register (caller-saved, safe to clobber) + * + * Layout: + * code[0]: ldr x9, [pc, #offset] ; load &data from literal pool + * code[1]: str x1, [x9, #0x10] ; save x1 to data[0x10] + * code[2]: str lr, [x9, #0x18] ; save lr to data[0x18] + * code[3]: ldr x1, [x9, #0x00] ; load uc ptr into x1 + * code[4]: ldr x9, [x9, #0x08] ; load handler ptr + * code[5]: blr x9 ; call handler (clobbers lr) + * code[6]: ldr x9, [pc, #offset] ; reload &data + * code[7]: ldr x1, [x9, #0x10] ; restore x1 + * code[8]: ldr lr, [x9, #0x18] ; restore lr + * code[9]: ret ; return via restored lr + * code[10]: nop ; padding for alignment + * code[11-12]: data_addr ; 64-bit literal pool + */ + + int literal_offset; + + /* code[0]: ldr x9, [pc, #offset] - load data pointer from literal pool */ + /* LDR (literal) encoding: 0x58000000 | (imm19 << 5) | Rt */ + literal_offset = (11 - 0) * 4; /* offset from code[0] to code[11] = 44 bytes */ + code[0] = 0x58000009 | ((literal_offset / 4) << 5); + + /* code[1]: str x1, [x9, #0x10] - save x1 */ + /* STR (unsigned offset): 0xF9000000 | (imm12 << 10) | (Rn << 5) | Rt */ + /* imm12 = offset/8, Rn=9, Rt=1 */ + code[1] = 0xF9000121 | ((0x10 / 8) << 10); + + /* code[2]: str lr, [x9, #0x18] - save lr (x30) */ + /* Rn=9, Rt=30 (lr) */ + code[2] = 0xF900013E | ((0x18 / 8) << 10); + + /* code[3]: ldr x1, [x9, #0x00] - load uc pointer into x1 */ + code[3] = 0xF9400121; + + /* code[4]: ldr x9, [x9, #0x08] - load handler pointer into x9 */ + code[4] = 0xF9400129 | ((0x08 / 8) << 10); + + 
/* code[5]: blr x9 - call the handler */ + code[5] = 0xD63F0120; + + /* code[6]: ldr x9, [pc, #offset] - reload data pointer */ + literal_offset = (11 - 6) * 4; /* offset from code[6] to code[11] = 20 bytes */ + code[6] = 0x58000009 | ((literal_offset / 4) << 5); + + /* code[7]: ldr x1, [x9, #0x10] - restore x1 */ + code[7] = 0xF9400121 | ((0x10 / 8) << 10); + + /* code[8]: ldr lr, [x9, #0x18] - restore lr (x30) */ + code[8] = 0xF940013E | ((0x18 / 8) << 10); + + /* code[9]: ret - return via lr */ + code[9] = 0xD65F03C0; + + /* code[10]: nop - padding for 8-byte alignment of literal pool */ + code[10] = 0xD503201F; + + /* code[11-12]: Literal pool - data address (64-bit) */ + memcpy(&code[11], &data_addr, 8); + + /* Flush instruction cache for the generated code */ + FlushInstructionCache(GetCurrentProcess(), closure, 13 * 4); + } +#elif defined(_WIN64) ptr = closure; *ptr = 0x48; // REX.w ptr += 1; diff --git a/qemu/include/qemu/atomic128.h b/qemu/include/qemu/atomic128.h index 4183863d11..6337dcfcae 100644 --- a/qemu/include/qemu/atomic128.h +++ b/qemu/include/qemu/atomic128.h @@ -108,8 +108,8 @@ static inline void atomic16_set(Int128 *ptr, Int128 val) } # define HAVE_ATOMIC128 1 -#elif defined(__aarch64__) -/* We can do better than cmpxchg for AArch64. */ +#elif defined(__aarch64__) && !defined(_MSC_VER) +/* We can do better than cmpxchg for AArch64 (GCC/Clang only - uses inline asm). 
*/ static inline Int128 atomic16_read(Int128 *ptr) { uint64_t l, h; diff --git a/qemu/include/sysemu/os-win32.h b/qemu/include/sysemu/os-win32.h index d77d0fbaea..5aa099886d 100644 --- a/qemu/include/sysemu/os-win32.h +++ b/qemu/include/sysemu/os-win32.h @@ -42,6 +42,13 @@ extern int _setjmp_wrapper(jmp_buf); #undef setjmp #define setjmp(env) _setjmp_wrapper(env) +#if defined(_M_ARM64) +// On ARM64, we also need a custom longjmp to avoid unwinding issues +extern __declspec(noreturn) void _longjmp_wrapper(jmp_buf, int); +#undef longjmp +#define longjmp(env, val) _longjmp_wrapper(env, val) +#endif + #else // MingW #undef setjmp diff --git a/qemu/include/tcg/tcg-opc.h b/qemu/include/tcg/tcg-opc.h index 22033870bf..f79ea2782e 100644 --- a/qemu/include/tcg/tcg-opc.h +++ b/qemu/include/tcg/tcg-opc.h @@ -117,7 +117,7 @@ DEF(nor_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_nor_i32)) DEF(clz_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_clz_i32)) DEF(ctz_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_ctz_i32)) -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(_M_ARM64) DEF(ctpop_i32, 1, 1, 0, 0) #else DEF(ctpop_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ctpop_i32)) @@ -199,7 +199,7 @@ DEF(nor_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_nor_i64)) DEF(clz_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_clz_i64)) DEF(ctz_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_ctz_i64)) -#ifdef _MSC_VER +#if defined(_MSC_VER) && !defined(_M_ARM64) DEF(ctpop_i64, 1, 1, 0, IMPL64) #else DEF(ctpop_i64, 1, 1, 0, IMPL64 | IMPL(TCG_TARGET_HAS_ctpop_i64)) diff --git a/qemu/target/i386/cpu.c b/qemu/target/i386/cpu.c index 86103b09e3..5811c62c8c 100644 --- a/qemu/target/i386/cpu.c +++ b/qemu/target/i386/cpu.c @@ -1282,10 +1282,10 @@ void host_cpuid(uint32_t function, uint32_t count, { uint32_t vec[4]; -#ifdef _MSC_VER +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) + /* MSVC on x86/x64 */ __cpuidex((int*)vec, function, count); -#else -#ifdef __x86_64__ +#elif defined(__x86_64__) asm volatile("cpuid" : "=a"(vec[0]), "=b"(vec[1]), 
"=c"(vec[2]), "=d"(vec[3]) @@ -1303,7 +1303,6 @@ void host_cpuid(uint32_t function, uint32_t count, #else abort(); #endif -#endif // _MSC_VER if (eax) *eax = vec[0]; diff --git a/qemu/tcg/aarch64/tcg-target.inc.c b/qemu/tcg/aarch64/tcg-target.inc.c index 50c9e595bb..13a701254a 100644 --- a/qemu/tcg/aarch64/tcg-target.inc.c +++ b/qemu/tcg/aarch64/tcg-target.inc.c @@ -1080,9 +1080,9 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, opc = I3405_MOVZ; } s0 = ctz64(t0) & (63 & -16); - t1 = t0 & ~(0xffffUL << s0); + t1 = t0 & ~(0xffffULL << s0); s1 = ctz64(t1) & (63 & -16); - t2 = t1 & ~(0xffffUL << s1); + t2 = t1 & ~(0xffffULL << s1); if (t2 == 0) { tcg_out_insn_3405(s, opc, type, rd, t0 >> s0, s0); if (t1 != 0) { @@ -1503,14 +1503,22 @@ static inline void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl, static inline void tcg_out_mb(TCGContext *s, TCGArg a0) { - static const uint32_t sync[] = { - [0 ... TCG_MO_ALL] = DMB_ISH | DMB_LD | DMB_ST, - [TCG_MO_ST_ST] = DMB_ISH | DMB_ST, - [TCG_MO_LD_LD] = DMB_ISH | DMB_LD, - [TCG_MO_LD_ST] = DMB_ISH | DMB_LD, - [TCG_MO_LD_ST | TCG_MO_LD_LD] = DMB_ISH | DMB_LD, - }; - tcg_out32(s, sync[a0 & TCG_MO_ALL]); + /* Use switch instead of array range initializers for MSVC compatibility */ + uint32_t sync_val; + switch (a0 & TCG_MO_ALL) { + case TCG_MO_ST_ST: + sync_val = DMB_ISH | DMB_ST; + break; + case TCG_MO_LD_LD: + case TCG_MO_LD_ST: + case TCG_MO_LD_ST | TCG_MO_LD_LD: + sync_val = DMB_ISH | DMB_LD; + break; + default: + sync_val = DMB_ISH | DMB_LD | DMB_ST; + break; + } + tcg_out32(s, sync_val); } static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d, @@ -2895,6 +2903,9 @@ typedef struct { uint8_t fde_reg_ofs[24]; } DebugFrame; +#if !defined(__ELF__) + /* Host machine without ELF. 
*/ +#else #define ELF_HOST_MACHINE EM_AARCH64 static const DebugFrame debug_frame = { @@ -2933,3 +2944,4 @@ void tcg_register_jit(TCGContext *s, void *buf, size_t buf_size) { tcg_register_jit_int(s, buf, buf_size, &debug_frame, sizeof(debug_frame)); } +#endif /* __ELF__ */ diff --git a/qemu/util/cacheinfo.c b/qemu/util/cacheinfo.c index f29f720601..335fd6f672 100644 --- a/qemu/util/cacheinfo.c +++ b/qemu/util/cacheinfo.c @@ -108,7 +108,7 @@ static void sys_cache_info(int *isize, int *dsize) * Architecture (+ OS) specific detection mechanisms. */ -#if defined(__aarch64__) +#if defined(__aarch64__) && !defined(_MSC_VER) static void arch_cache_info(int *isize, int *dsize) { diff --git a/qemu/util/setjmp-wrapper-win32-arm64.asm b/qemu/util/setjmp-wrapper-win32-arm64.asm new file mode 100644 index 0000000000..78cbd472ac --- /dev/null +++ b/qemu/util/setjmp-wrapper-win32-arm64.asm @@ -0,0 +1,124 @@ +; setjmp/longjmp wrapper for Windows ARM64 (MSVC/armasm64) +; +; This is a native assembly implementation that doesn't create a stack frame, +; avoiding the stack corruption issues that occur with a C wrapper. +; +; On ARM64 Windows, wrapping setjmp in a C function doesn't work because: +; 1. The C wrapper creates its own stack frame +; 2. setjmp saves the state inside the wrapper +; 3. After setjmp returns, the wrapper's stack frame is released +; 4. Other code runs and reuses that stack space +; 5. When longjmp is called, it tries to return to corrupted stack +; +; This assembly implementation: +; 1. Saves all callee-saved registers directly to jmp_buf +; 2. Sets Frame=0 to disable stack unwinding +; 3. Returns 0 directly without calling CRT +; 4. 
longjmp restores registers and returns to original caller +; +; jmp_buf layout for ARM64 Windows (per MSVC _JUMP_BUFFER): +; Offset 0x00: Frame (set to 0 to disable unwinding) +; Offset 0x08: Reserved +; Offset 0x10: X19 +; Offset 0x18: X20 +; Offset 0x20: X21 +; Offset 0x28: X22 +; Offset 0x30: X23 +; Offset 0x38: X24 +; Offset 0x40: X25 +; Offset 0x48: X26 +; Offset 0x50: X27 +; Offset 0x58: X28 +; Offset 0x60: Fp (X29) +; Offset 0x68: Lr (X30) +; Offset 0x70: Sp +; Offset 0x78: Fpcr (4 bytes) + Fpsr (4 bytes) = 8 bytes combined +; Offset 0x80: D8-D15 (8 doubles = 64 bytes) +; +; Total size: 0xC0 = 192 bytes, which exactly matches jmp_buf size. + + AREA |.text|, CODE, READONLY + + EXPORT _setjmp_wrapper + EXPORT _longjmp_wrapper + +; int _setjmp_wrapper(jmp_buf env) +; x0 = pointer to jmp_buf +; Returns 0 on initial call, non-zero on longjmp return +_setjmp_wrapper PROC + ; Set Frame and Reserved to 0 (disables stack unwinding) + str xzr, [x0, #0] + str xzr, [x0, #8] + + ; Save callee-saved general purpose registers x19-x28 + stp x19, x20, [x0, #0x10] + stp x21, x22, [x0, #0x20] + stp x23, x24, [x0, #0x30] + stp x25, x26, [x0, #0x40] + stp x27, x28, [x0, #0x50] + + ; Save frame pointer (x29) and link register (x30) + stp x29, x30, [x0, #0x60] + + ; Save stack pointer + mov x1, sp + str x1, [x0, #0x70] + + ; Save FPCR and FPSR (combined into one 64-bit value) + mrs x1, fpcr + mrs x2, fpsr + orr x1, x1, x2, lsl #32 + str x1, [x0, #0x78] + + ; Save callee-saved SIMD registers d8-d15 (64 bytes total at offset 0x80) + stp d8, d9, [x0, #0x80] + stp d10, d11, [x0, #0x90] + stp d12, d13, [x0, #0xA0] + stp d14, d15, [x0, #0xB0] + + ; Return 0 (initial setjmp call) + mov w0, #0 + ret + ENDP + +; void _longjmp_wrapper(jmp_buf env, int val) +; x0 = pointer to jmp_buf +; x1 = return value (0 is converted to 1) +; Does not return - jumps to saved context +_longjmp_wrapper PROC + ; Ensure return value is at least 1 + cmp w1, #0 + csinc w2, w1, wzr, ne ; w2 = (w1 != 0) ? 
w1 : 1 + + ; Restore callee-saved SIMD registers d8-d15 + ldp d8, d9, [x0, #0x80] + ldp d10, d11, [x0, #0x90] + ldp d12, d13, [x0, #0xA0] + ldp d14, d15, [x0, #0xB0] + + ; Restore FPCR and FPSR + ldr x3, [x0, #0x78] + msr fpcr, x3 + lsr x3, x3, #32 + msr fpsr, x3 + + ; Restore stack pointer + ldr x3, [x0, #0x70] + mov sp, x3 + + ; Restore frame pointer (x29) and link register (x30) + ldp x29, x30, [x0, #0x60] + + ; Restore callee-saved general purpose registers x19-x28 + ldp x19, x20, [x0, #0x10] + ldp x21, x22, [x0, #0x20] + ldp x23, x24, [x0, #0x30] + ldp x25, x26, [x0, #0x40] + ldp x27, x28, [x0, #0x50] + + ; Return with the specified value + mov w0, w2 + ret + ENDP + + END From 926c89f1338a24ace618043e1766970c6f8cb2e5 Mon Sep 17 00:00:00 2001 From: expend20 <36543551+expend20@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:33:27 -0500 Subject: [PATCH 2/3] Windows ARM: use musl's setjmp and longjmp implementations --- qemu/util/setjmp-wrapper-win32-arm64.asm | 144 ++++++++--------------- qemu/util/setjmp-wrapper-win32-arm64.md | 54 +++++++++ 2 files changed, 100 insertions(+), 98 deletions(-) create mode 100644 qemu/util/setjmp-wrapper-win32-arm64.md diff --git a/qemu/util/setjmp-wrapper-win32-arm64.asm b/qemu/util/setjmp-wrapper-win32-arm64.asm index 78cbd472ac..e17784bff4 100644 --- a/qemu/util/setjmp-wrapper-win32-arm64.asm +++ b/qemu/util/setjmp-wrapper-win32-arm64.asm @@ -1,41 +1,26 @@ -; setjmp/longjmp wrapper for Windows ARM64 (MSVC/armasm64) +; setjmp/longjmp for Windows ARM64 (MSVC armasm64) ; -; This is a native assembly implementation that doesn't create a stack frame, -; avoiding the stack corruption issues that occur with a C wrapper. +; Based on musl libc aarch64 setjmp/longjmp implementation. +; https://git.musl-libc.org/cgit/musl/tree/src/setjmp/aarch64/setjmp.s +; https://git.musl-libc.org/cgit/musl/tree/src/setjmp/aarch64/longjmp.s ; -; On ARM64 Windows, wrapping setjmp in a C function doesn't work because: -; 1. 
The C wrapper creates its own stack frame -; 2. setjmp saves the state inside the wrapper -; 3. After setjmp returns, the wrapper's stack frame is released -; 4. Other code runs and reuses that stack space -; 5. When longjmp is called, it tries to return to corrupted stack +; Custom implementations are needed because the CRT longjmp calls +; RtlUnwind for stack unwinding, which crashes when JIT-generated code +; frames (with no SEH unwind metadata) are on the stack. +; See setjmp-wrapper-win32-arm64.md for details. ; -; This assembly implementation: -; 1. Saves all callee-saved registers directly to jmp_buf -; 2. Sets Frame=0 to disable stack unwinding -; 3. Returns 0 directly without calling CRT -; 4. longjmp restores registers and returns to original caller -; -; jmp_buf layout for ARM64 Windows (per MSVC _JUMP_BUFFER): -; Offset 0x00: Frame (set to 0 to disable unwinding) -; Offset 0x08: Reserved -; Offset 0x10: X19 -; Offset 0x18: X20 -; Offset 0x20: X21 -; Offset 0x28: X22 -; Offset 0x30: X23 -; Offset 0x38: X24 -; Offset 0x40: X25 -; Offset 0x48: X26 -; Offset 0x50: X27 -; Offset 0x58: X28 -; Offset 0x60: Fp (X29) -; Offset 0x68: Lr (X30) -; Offset 0x70: Sp -; Offset 0x78: Fpcr (4 bytes) + Fpsr (4 bytes) = 8 bytes combined -; Offset 0x80: D8-D15 (8 doubles = 64 bytes) -; -; Total size: 0xC0 = 192 bytes, which exactly matches jmp_buf size. 
+; jmp_buf layout (AAPCS64 callee-saved registers): +; Offset 0x00: X19, X20 +; Offset 0x10: X21, X22 +; Offset 0x20: X23, X24 +; Offset 0x30: X25, X26 +; Offset 0x40: X27, X28 +; Offset 0x50: X29 (FP), X30 (LR) +; Offset 0x68: SP +; Offset 0x70: D8, D9 +; Offset 0x80: D10, D11 +; Offset 0x90: D12, D13 +; Offset 0xA0: D14, D15 AREA |.text|, CODE, READONLY @@ -46,37 +31,18 @@ ; x0 = pointer to jmp_buf ; Returns 0 on initial call, non-zero on longjmp return _setjmp_wrapper PROC - ; Set Frame and Reserved to 0 (disables stack unwinding) - str xzr, [x0, #0] - str xzr, [x0, #8] - - ; Save callee-saved general purpose registers x19-x28 - stp x19, x20, [x0, #0x10] - stp x21, x22, [x0, #0x20] - stp x23, x24, [x0, #0x30] - stp x25, x26, [x0, #0x40] - stp x27, x28, [x0, #0x50] - - ; Save frame pointer (x29) and link register (x30) - stp x29, x30, [x0, #0x60] - - ; Save stack pointer - mov x1, sp - str x1, [x0, #0x70] - - ; Save FPCR and FPSR (combined into one 64-bit value) - mrs x1, fpcr - mrs x2, fpsr - orr x1, x1, x2, lsl #32 - str x1, [x0, #0x78] - - ; Save callee-saved SIMD registers d8-d15 (64 bytes total at offset 0x80) - stp d8, d9, [x0, #0x80] - stp d10, d11, [x0, #0x90] - stp d12, d13, [x0, #0xA0] - stp d14, d15, [x0, #0xB0] - - ; Return 0 (initial setjmp call) + stp x19, x20, [x0, #0x00] + stp x21, x22, [x0, #0x10] + stp x23, x24, [x0, #0x20] + stp x25, x26, [x0, #0x30] + stp x27, x28, [x0, #0x40] + stp x29, x30, [x0, #0x50] + mov x2, sp + str x2, [x0, #0x68] + stp d8, d9, [x0, #0x70] + stp d10, d11, [x0, #0x80] + stp d12, d13, [x0, #0x90] + stp d14, d15, [x0, #0xA0] mov w0, #0 ret ENDP @@ -86,39 +52,21 @@ _setjmp_wrapper PROC ; x1 = return value (0 is converted to 1) ; Does not return - jumps to saved context _longjmp_wrapper PROC - ; Ensure return value is at least 1 + ldp x19, x20, [x0, #0x00] + ldp x21, x22, [x0, #0x10] + ldp x23, x24, [x0, #0x20] + ldp x25, x26, [x0, #0x30] + ldp x27, x28, [x0, #0x40] + ldp x29, x30, [x0, #0x50] + ldr x2, [x0, #0x68] + mov 
sp, x2 + ldp d8, d9, [x0, #0x70] + ldp d10, d11, [x0, #0x80] + ldp d12, d13, [x0, #0x90] + ldp d14, d15, [x0, #0xA0] cmp w1, #0 - csinc w2, w1, wzr, ne ; w2 = (w1 != 0) ? w1 : 1 - - ; Restore callee-saved SIMD registers d8-d15 - ldp d8, d9, [x0, #0x80] - ldp d10, d11, [x0, #0x90] - ldp d12, d13, [x0, #0xA0] - ldp d14, d15, [x0, #0xB0] - - ; Restore FPCR and FPSR - ldr x3, [x0, #0x78] - msr fpcr, x3 - lsr x3, x3, #32 - msr fpsr, x3 - - ; Restore stack pointer - ldr x3, [x0, #0x70] - mov sp, x3 - - ; Restore frame pointer (x29) and link register (x30) - ldp x29, x30, [x0, #0x60] - - ; Restore callee-saved general purpose registers x19-x28 - ldp x19, x20, [x0, #0x10] - ldp x21, x22, [x0, #0x20] - ldp x23, x24, [x0, #0x30] - ldp x25, x26, [x0, #0x40] - ldp x27, x28, [x0, #0x50] - - ; Return with the specified value - mov w0, w2 - ret + csinc w0, w1, wzr, ne + br x30 ENDP END diff --git a/qemu/util/setjmp-wrapper-win32-arm64.md b/qemu/util/setjmp-wrapper-win32-arm64.md new file mode 100644 index 0000000000..3fe8c96901 --- /dev/null +++ b/qemu/util/setjmp-wrapper-win32-arm64.md @@ -0,0 +1,54 @@ +# Why setjmp/longjmp wrappers are needed on Windows ARM64 + +## Background + +Unicorn's QEMU TCG uses `setjmp`/`longjmp` to exit from JIT-generated code back +to the CPU execution loop (e.g. on HLT, exceptions, or memory faults). On +Windows ARM64, both the CRT `setjmp` and `longjmp` are unsuitable for this +because they interact with Windows Structured Exception Handling (SEH) stack +unwinding, which cannot traverse JIT-generated code frames that lack unwind +metadata. + +## Why `_setjmp_wrapper` is needed + +The CRT `_setjmp` on ARM64 saves frame information into `jmp_buf` that +`longjmp` later uses to drive `RtlUnwind`. The wrapper uses a simple +musl-libc-style layout that only saves callee-saved registers (x19-x28, x29, +x30, sp, d8-d15) without any SEH frame data. 
+ +Without it: **linker error** — `os-win32.h` declares +`extern int _setjmp_wrapper(jmp_buf)` and the `setjmp` macro expands to it, so +all three callsites (`cpu-exec.c`, `translate-all.c`, `translate.c`) reference +this symbol. + +## Why `_longjmp_wrapper` is needed + +Even with a correct `jmp_buf` (Frame=0), the CRT `longjmp` still calls +`__longjmp_internal` → `RtlUnwind` → `RtlUnwindEx`, which attempts to walk the +stack. When JIT-generated code is on the stack (no SEH unwind info), this fails +with exception `0xC00000FF`. + +The wrapper bypasses the CRT entirely — it restores callee-saved registers +directly from the `jmp_buf` with `ldp`/`ldr` instructions and branches (`br x30`) to the +saved return address. + +Without it: **runtime crash** — confirmed via debugger: + +``` +ntdll!RtlRaiseStatus (exception 0xC00000FF) +ntdll!RtlUnwindEx ← stack unwinding fails here +ntdll!RtlUnwind +VCRUNTIME140!__longjmp_internal +VCRUNTIME140!longjmp +unicorn!cpu_loop_exit_x86_64 +unicorn!helper_hlt_x86_64 +0x000001b480000184 ← JIT code (no unwind info) +``` + +46 of 54 x86 tests crash with exit code `0xC00000FF`. + +## Why x64 doesn't need a longjmp wrapper + +On x64, `_setjmp` accepts a second parameter (frame pointer). Passing `NULL` +disables stack unwinding in `longjmp`. ARM64's CRT has no equivalent mechanism, +so both wrappers are required. 
From a3f97aaf9ee98839fbdc4711df262411f1906343 Mon Sep 17 00:00:00 2001 From: expend20 <36543551+expend20@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:48:40 -0500 Subject: [PATCH 3/3] Windows ARM64 CI build --- .github/workflows/build-uc2.yml | 63 +++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/.github/workflows/build-uc2.yml b/.github/workflows/build-uc2.yml index c6093e0c76..7b10794373 100644 --- a/.github/workflows/build-uc2.yml +++ b/.github/workflows/build-uc2.yml @@ -240,6 +240,69 @@ jobs: path: ./${{ matrix.config.artifact }} name: ${{ matrix.config.artifact }} + Windows-ARM64: + runs-on: windows-11-arm + name: ${{ matrix.config.name }} + strategy: + fail-fast: false + matrix: + config: + - { + name: 'windows-arm64 MSVC shared', + shared: 'yes', + artifact: 'windows-msvc-arm64-shared.7z', + archiver: '7z a', + generators: 'Ninja' + } + - { + name: 'windows-arm64 MSVC static', + shared: 'no', + artifact: 'windows-msvc-arm64-static.7z', + archiver: '7z a', + generators: 'Ninja' + } + steps: + - uses: actions/checkout@v4 + + - name: '🛠️ Win ARM64 MSVC setup' + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: arm64 + + - name: '🚧 Win ARM64 build' + shell: bash + run: | + choco install ninja + ninja --version + cmake --version + mkdir build + mkdir instdir + cmake \ + -S . \ + -B . \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -G "${{ matrix.config.generators }}" \ + -DCMAKE_INSTALL_PREFIX:PATH=instdir \ + -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} + cmake --build . --config ${{ env.BUILD_TYPE }} + cmake --install . --strip --config ${{ env.BUILD_TYPE }} + ctest -VV -C ${{ env.BUILD_TYPE }} + + - name: '📦 Pack artifact' + if: always() + shell: bash + working-directory: instdir + run: | + ls -laR + ${{ matrix.config.archiver }} ../${{ matrix.config.artifact }} . 
../test* + + - name: '📤 Upload artifact' + if: always() + uses: actions/upload-artifact@v4 + with: + path: ./${{ matrix.config.artifact }} + name: ${{ matrix.config.artifact }} + Macos: runs-on: ${{ matrix.config.os }} name: ${{ matrix.config.name }} - ${{ matrix.compiler }}