diff --git a/libs/cua-driver/rust/crates/cua-driver-core/src/recording.rs b/libs/cua-driver/rust/crates/cua-driver-core/src/recording.rs index 43ed3ca97..c75eb874c 100644 --- a/libs/cua-driver/rust/crates/cua-driver-core/src/recording.rs +++ b/libs/cua-driver/rust/crates/cua-driver-core/src/recording.rs @@ -18,6 +18,7 @@ use std::time::Instant; use serde_json::Value; use crate::cursor_sampler::CursorSampler; +use crate::recording_loader::parse_screen_from_summary; use crate::video::{self, VideoBackend, VideoMetadata}; // ── Platform screenshot callback ───────────────────────────────────────────── @@ -57,6 +58,22 @@ pub fn set_click_marker_fn(f: impl Fn(&[u8], f64, f64) -> Option> + Send let _ = CLICK_MARKER_FN.set(Box::new(f)); } +// ── Platform window-origin callback ────────────────────────────────────────── +// +// Resolves the screen-space origin of the screenshot bitmap for a window. +// Element-indexed click results report screen-absolute coordinates, while +// click.png markers are drawn in screenshot-local pixels. + +type WindowBitmapOriginFnBox = Box, Option) -> Option<(f64, f64)> + Send + Sync>; +static WINDOW_BITMAP_ORIGIN_FN: OnceLock = OnceLock::new(); + +/// Register the platform-specific screenshot-origin resolver. Call once at startup. +pub fn set_window_bitmap_origin_fn( + f: impl Fn(Option, Option) -> Option<(f64, f64)> + Send + Sync + 'static, +) { + let _ = WINDOW_BITMAP_ORIGIN_FN.set(Box::new(f)); +} + // ── Platform AX-snapshot callback ──────────────────────────────────────────── // // Takes (window_id, pid) and returns JSON bytes for `app_state.json` (the @@ -370,21 +387,22 @@ fn write_turn( let pid = args.opt_i64("pid"); let element_index = args.opt_u64("element_index"); - // Extract click point for click-family tools. Falls back to the - // platform element_index → window-local-pixels resolver when the call - // used `element_index` instead of explicit `x, y`, so click.png is - // written for AX-indexed clicks too. + // Extract click point for click-family tools. Pixel-addressed clicks + // already use screenshot-local `x, y`. Element-indexed clicks report + // screen coords in the result summary, so convert those back into the + // screenshot bitmap's local coordinate space before drawing click.png. let click_point: Option<(f64, f64)> = if matches!( tool_name, "click" | "double_click" | "right_click" ) { match (args.opt_f64("x"), args.opt_f64("y")) { (Some(x), Some(y)) => Some((x, y)), - _ => match (window_id, pid, element_index, ELEMENT_BOUNDS_FN.get()) { - (Some(wid), Some(p), Some(idx), Some(f)) => { - u32::try_from(idx).ok().and_then(|idx32| f(wid, p, idx32)) - } - _ => None, - }, + _ => click_point_from_result_summary(result_text, window_id, pid) + .or_else(|| match (window_id, pid, element_index, ELEMENT_BOUNDS_FN.get()) { + (Some(wid), Some(p), Some(idx), Some(f)) => { + u32::try_from(idx).ok().and_then(|idx32| f(wid, p, idx32)) + } + _ => None, + }), } } else { None @@ -429,6 +447,21 @@ fn write_turn( Ok(()) } +fn click_point_from_result_summary( + result_text: &str, + window_id: Option, + pid: Option, +) -> Option<(f64, f64)> { + let screen = parse_screen_from_summary(result_text)?; + let origin_fn = WINDOW_BITMAP_ORIGIN_FN.get()?; + let origin = origin_fn(window_id, pid)?; + Some(screen_to_window_local(screen, origin)) +} + +fn screen_to_window_local(screen: (f64, f64), bitmap_origin: (f64, f64)) -> (f64, f64) { + (screen.0 - bitmap_origin.0, screen.1 - bitmap_origin.1) +} + fn write_json_atomic(path: &Path, value: &Value) -> anyhow::Result<()> { let tmp = path.with_extension("tmp"); std::fs::write(&tmp, serde_json::to_string_pretty(value)?)?; @@ -491,3 +524,88 @@ fn expand_tilde(path: &str) -> PathBuf { } PathBuf::from(path) } + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use serde_json::json; + + use super::*; + + #[test] + fn result_summary_screen_coords_are_converted_to_window_local_click_point() { + assert_eq!( + parse_screen_from_summary("Performed UIA Invoke on [28] (screen (745,679))."), + Some((745.0, 679.0)), + ); + assert_eq!(screen_to_window_local((745.0, 679.0), (701.0, 601.0)), (44.0, 78.0)); + } + + #[test] + fn write_turn_marks_element_index_clicks_from_result_summary() { + let marker_calls = Arc::new(Mutex::new(Vec::<(f64, f64)>::new())); + let calls = Arc::clone(&marker_calls); + + set_screenshot_fn(|window_id, pid| { + assert_eq!(window_id, Some(7)); + assert_eq!(pid, Some(42)); + Some(b"not-a-real-png".to_vec()) + }); + set_window_bitmap_origin_fn(|window_id, pid| { + assert_eq!(window_id, Some(7)); + assert_eq!(pid, Some(42)); + Some((701.0, 601.0)) + }); + set_click_marker_fn(move |_png_bytes, cx, cy| { + calls.lock().unwrap().push((cx, cy)); + Some(format!("{cx},{cy}").into_bytes()) + }); + + let root = std::env::temp_dir().join(format!( + "cua-driver-core-recording-test-{}-{}", + std::process::id(), + now_ms(), + )); + let summary_dir = root.join("turn-summary"); + write_turn( + &summary_dir, + "click", + &json!({"pid": 42, "window_id": 7, "element_index": 28}), + "Performed UIA Invoke on [28] (screen (745,679)).", + 1_100, + 1_000, + ).unwrap(); + + let action: Value = serde_json::from_str( + &std::fs::read_to_string(summary_dir.join("action.json")).unwrap(), + ).unwrap(); + assert_eq!(action["click_point"], json!({"x": 44.0, "y": 78.0})); + assert_eq!(std::fs::read(summary_dir.join("click.png")).unwrap(), b"44,78"); + + let pixel_dir = root.join("turn-pixel"); + write_turn( + &pixel_dir, + "click", + &json!({"pid": 42, "window_id": 7, "x": 12.0, "y": 34.0}), + "Performed click (screen (745,679)).", + 1_200, + 1_000, + ).unwrap(); + assert_eq!(std::fs::read(pixel_dir.join("click.png")).unwrap(), b"12,34"); + + let unparsed_dir = root.join("turn-unparsed"); + write_turn( + &unparsed_dir, + "click", + &json!({"pid": 42, "window_id": 7, "element_index": 29}), + "Performed UIA Invoke on [29].", + 1_300, + 1_000, + ).unwrap(); + assert!(!unparsed_dir.join("click.png").exists()); + + assert_eq!(*marker_calls.lock().unwrap(), vec![(44.0, 78.0), (12.0, 34.0)]); + let _ = std::fs::remove_dir_all(root); + } +} diff --git a/libs/cua-driver/rust/crates/cua-driver-core/src/recording_loader.rs b/libs/cua-driver/rust/crates/cua-driver-core/src/recording_loader.rs index 59ada94f7..bfacb2655 100644 --- a/libs/cua-driver/rust/crates/cua-driver-core/src/recording_loader.rs +++ b/libs/cua-driver/rust/crates/cua-driver-core/src/recording_loader.rs @@ -225,10 +225,10 @@ fn parse_click_turn(action_path: &Path) -> Option { } /// Pull "(screen (X,Y))" out of a tool-result text. Returns `(x, y)` -/// when the pattern is present and parses as numbers. Used as last- -/// resort coord recovery for element-indexed clicks whose action.json -/// doesn't yet carry click_point. -fn parse_screen_from_summary(summary: &str) -> Option<(f64, f64)> { +/// when the pattern is present and parses as numbers. Used by the +/// recorder and as last-resort coord recovery for element-indexed clicks +/// whose action.json doesn't yet carry click_point. +pub(crate) fn parse_screen_from_summary(summary: &str) -> Option<(f64, f64)> { // Looking for: "...(screen (123,456))..." — find the inner parens // right after "screen ". let idx = summary.find("(screen (")?; diff --git a/libs/cua-driver/rust/crates/cua-driver/src/main.rs b/libs/cua-driver/rust/crates/cua-driver/src/main.rs index 28a177f00..b0ba55969 100644 --- a/libs/cua-driver/rust/crates/cua-driver/src/main.rs +++ b/libs/cua-driver/rust/crates/cua-driver/src/main.rs @@ -650,6 +650,9 @@ fn build_registry(cursor_cfg: cursor_overlay::CursorConfig) -> cua_driver_core:: cua_driver_core::recording::set_element_bounds_fn(|wid, pid, idx| { platform_windows::recording_hooks::element_window_local_xy(wid, pid, idx) }); + cua_driver_core::recording::set_window_bitmap_origin_fn(|window_id, pid| { + platform_windows::recording_hooks::window_bitmap_origin_xy(window_id, pid) + }); cua_driver_core::video::set_video_backend_factory( Box::new(cua_driver_core::video_ffmpeg::FfmpegVideoBackendFactory), ); @@ -715,6 +718,9 @@ fn build_registry_no_cursor() -> cua_driver_core::tool::ToolRegistry { cua_driver_core::recording::set_element_bounds_fn(|wid, pid, idx| { platform_windows::recording_hooks::element_window_local_xy(wid, pid, idx) }); + cua_driver_core::recording::set_window_bitmap_origin_fn(|window_id, pid| { + platform_windows::recording_hooks::window_bitmap_origin_xy(window_id, pid) + }); cua_driver_core::video::set_video_backend_factory( Box::new(cua_driver_core::video_ffmpeg::FfmpegVideoBackendFactory), ); diff --git a/libs/cua-driver/rust/crates/platform-windows/src/recording_hooks.rs b/libs/cua-driver/rust/crates/platform-windows/src/recording_hooks.rs index f587d311b..927bb852f 100644 --- a/libs/cua-driver/rust/crates/platform-windows/src/recording_hooks.rs +++ b/libs/cua-driver/rust/crates/platform-windows/src/recording_hooks.rs @@ -5,6 +5,8 @@ //! - `element_window_local_xy` — resolves `element_index` to a click point in //! window-local screenshot-pixel coordinates so `click.png` is also written //! on UIA/MSAA-indexed clicks (not just pixel-addressed ones). +//! - `window_bitmap_origin_xy` — resolves the screenshot bitmap's top-left in +//! screen coordinates so result-summary screen points can be marker-local. #[cfg(target_os = "windows")] use std::sync::{Arc, OnceLock}; @@ -43,15 +45,52 @@ pub fn element_window_local_xy(window_id: u64, pid: i64, element_index: u32) -> let cache = ELEMENT_CACHE.get()?; let pid_u32 = u32::try_from(pid).ok()?; let (sx, sy) = cache.get_element_center(pid_u32, window_id, element_index as usize)?; - // The cached center is in SCREEN coords. Convert to window-local pixel - // coords by subtracting the window's screen origin (GetWindowRect-equivalent - // in WindowInfo). Windows captures at logical pixels so no scale factor. - let wins = crate::win32::list_windows(Some(pid_u32)); - let win = wins.iter().find(|w| w.hwnd == window_id)?; - Some(((sx - win.x) as f64, (sy - win.y) as f64)) + // The cached center is in SCREEN coords. Convert to window-local + // screenshot pixels using the same bitmap origin as pixel clicks. + let (ox, oy) = window_bitmap_origin_xy(Some(window_id), Some(pid))?; + Some((sx as f64 - ox, sy as f64 - oy)) +} + +#[cfg(target_os = "windows")] +pub fn window_bitmap_origin_xy(window_id: Option, pid: Option) -> Option<(f64, f64)> { + use windows::Win32::Foundation::{HWND, RECT}; + use windows::Win32::Graphics::Dwm::{DwmGetWindowAttribute, DWMWA_EXTENDED_FRAME_BOUNDS}; + use windows::Win32::UI::WindowsAndMessaging::GetWindowRect; + + let hwnd = match window_id { + Some(w) => w, + None => { + let pid_u32 = u32::try_from(pid?).ok()?; + crate::win32::list_windows(Some(pid_u32)).first().map(|w| w.hwnd)? + } + }; + + // Keep this constant in sync with `capture::DWM_CROP_INSET_PX` and + // `tools::impl_::bitmap_to_screen`. + const DWM_CROP_INSET_PX: i32 = 1; + let h = HWND(hwnd as *mut _); + unsafe { + let mut dwm = RECT::default(); + let hr = DwmGetWindowAttribute( + h, + DWMWA_EXTENDED_FRAME_BOUNDS, + &mut dwm as *mut _ as *mut _, + std::mem::size_of::() as u32, + ); + if hr.is_ok() { + return Some(((dwm.left + DWM_CROP_INSET_PX) as f64, + (dwm.top + DWM_CROP_INSET_PX) as f64)); + } + + let mut wr = RECT::default(); + GetWindowRect(h, &mut wr).ok()?; + Some((wr.left as f64, wr.top as f64)) + } } #[cfg(not(target_os = "windows"))] pub fn app_state_json_for(_window_id: Option, _pid: Option) -> Option> { None } #[cfg(not(target_os = "windows"))] pub fn element_window_local_xy(_window_id: u64, _pid: i64, _element_index: u32) -> Option<(f64, f64)> { None } +#[cfg(not(target_os = "windows"))] +pub fn window_bitmap_origin_xy(_window_id: Option, _pid: Option) -> Option<(f64, f64)> { None }