Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 128 additions & 10 deletions libs/cua-driver/rust/crates/cua-driver-core/src/recording.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::time::Instant;
use serde_json::Value;

use crate::cursor_sampler::CursorSampler;
use crate::recording_loader::parse_screen_from_summary;
use crate::video::{self, VideoBackend, VideoMetadata};

// ── Platform screenshot callback ─────────────────────────────────────────────
Expand Down Expand Up @@ -57,6 +58,22 @@ pub fn set_click_marker_fn(f: impl Fn(&[u8], f64, f64) -> Option<Vec<u8>> + Send
let _ = CLICK_MARKER_FN.set(Box::new(f));
}

// ── Platform window-origin callback ──────────────────────────────────────────
//
// Resolves the screen-space origin of the screenshot bitmap for a window.
// Element-indexed click results report screen-absolute coordinates, while
// click.png markers are drawn in screenshot-local pixels.

type WindowBitmapOriginFnBox = Box<dyn Fn(Option<u64>, Option<i64>) -> Option<(f64, f64)> + Send + Sync>;
static WINDOW_BITMAP_ORIGIN_FN: OnceLock<WindowBitmapOriginFnBox> = OnceLock::new();

/// Register the platform-specific screenshot-origin resolver. Call once at startup.
pub fn set_window_bitmap_origin_fn(
f: impl Fn(Option<u64>, Option<i64>) -> Option<(f64, f64)> + Send + Sync + 'static,
) {
let _ = WINDOW_BITMAP_ORIGIN_FN.set(Box::new(f));
}

// ── Platform AX-snapshot callback ────────────────────────────────────────────
//
// Takes (window_id, pid) and returns JSON bytes for `app_state.json` (the
Expand Down Expand Up @@ -370,21 +387,22 @@ fn write_turn(
let pid = args.opt_i64("pid");
let element_index = args.opt_u64("element_index");

// Extract click point for click-family tools. Falls back to the
// platform element_index → window-local-pixels resolver when the call
// used `element_index` instead of explicit `x, y`, so click.png is
// written for AX-indexed clicks too.
// Extract click point for click-family tools. Pixel-addressed clicks
// already use screenshot-local `x, y`. Element-indexed clicks report
// screen coords in the result summary, so convert those back into the
// screenshot bitmap's local coordinate space before drawing click.png.
let click_point: Option<(f64, f64)> = if matches!(
tool_name, "click" | "double_click" | "right_click"
) {
match (args.opt_f64("x"), args.opt_f64("y")) {
(Some(x), Some(y)) => Some((x, y)),
_ => match (window_id, pid, element_index, ELEMENT_BOUNDS_FN.get()) {
(Some(wid), Some(p), Some(idx), Some(f)) => {
u32::try_from(idx).ok().and_then(|idx32| f(wid, p, idx32))
}
_ => None,
},
_ => click_point_from_result_summary(result_text, window_id, pid)
.or_else(|| match (window_id, pid, element_index, ELEMENT_BOUNDS_FN.get()) {
(Some(wid), Some(p), Some(idx), Some(f)) => {
u32::try_from(idx).ok().and_then(|idx32| f(wid, p, idx32))
}
_ => None,
}),
}
} else {
None
Expand Down Expand Up @@ -429,6 +447,21 @@ fn write_turn(
Ok(())
}

fn click_point_from_result_summary(
result_text: &str,
window_id: Option<u64>,
pid: Option<i64>,
) -> Option<(f64, f64)> {
let screen = parse_screen_from_summary(result_text)?;
let origin_fn = WINDOW_BITMAP_ORIGIN_FN.get()?;
let origin = origin_fn(window_id, pid)?;
Some(screen_to_window_local(screen, origin))
}

fn screen_to_window_local(screen: (f64, f64), bitmap_origin: (f64, f64)) -> (f64, f64) {
(screen.0 - bitmap_origin.0, screen.1 - bitmap_origin.1)
}

fn write_json_atomic(path: &Path, value: &Value) -> anyhow::Result<()> {
let tmp = path.with_extension("tmp");
std::fs::write(&tmp, serde_json::to_string_pretty(value)?)?;
Expand Down Expand Up @@ -491,3 +524,88 @@ fn expand_tilde(path: &str) -> PathBuf {
}
PathBuf::from(path)
}

#[cfg(test)]
mod tests {
use std::sync::{Arc, Mutex};

use serde_json::json;

use super::*;

#[test]
fn result_summary_screen_coords_are_converted_to_window_local_click_point() {
assert_eq!(
parse_screen_from_summary("Performed UIA Invoke on [28] (screen (745,679))."),
Some((745.0, 679.0)),
);
assert_eq!(screen_to_window_local((745.0, 679.0), (701.0, 601.0)), (44.0, 78.0));
}

#[test]
fn write_turn_marks_element_index_clicks_from_result_summary() {
let marker_calls = Arc::new(Mutex::new(Vec::<(f64, f64)>::new()));
let calls = Arc::clone(&marker_calls);

set_screenshot_fn(|window_id, pid| {
assert_eq!(window_id, Some(7));
assert_eq!(pid, Some(42));
Some(b"not-a-real-png".to_vec())
});
set_window_bitmap_origin_fn(|window_id, pid| {
assert_eq!(window_id, Some(7));
assert_eq!(pid, Some(42));
Some((701.0, 601.0))
});
set_click_marker_fn(move |_png_bytes, cx, cy| {
calls.lock().unwrap().push((cx, cy));
Some(format!("{cx},{cy}").into_bytes())
});

let root = std::env::temp_dir().join(format!(
"cua-driver-core-recording-test-{}-{}",
std::process::id(),
now_ms(),
));
let summary_dir = root.join("turn-summary");
write_turn(
&summary_dir,
"click",
&json!({"pid": 42, "window_id": 7, "element_index": 28}),
"Performed UIA Invoke on [28] (screen (745,679)).",
1_100,
1_000,
).unwrap();

let action: Value = serde_json::from_str(
&std::fs::read_to_string(summary_dir.join("action.json")).unwrap(),
).unwrap();
assert_eq!(action["click_point"], json!({"x": 44.0, "y": 78.0}));
assert_eq!(std::fs::read(summary_dir.join("click.png")).unwrap(), b"44,78");

let pixel_dir = root.join("turn-pixel");
write_turn(
&pixel_dir,
"click",
&json!({"pid": 42, "window_id": 7, "x": 12.0, "y": 34.0}),
"Performed click (screen (745,679)).",
1_200,
1_000,
).unwrap();
assert_eq!(std::fs::read(pixel_dir.join("click.png")).unwrap(), b"12,34");

let unparsed_dir = root.join("turn-unparsed");
write_turn(
&unparsed_dir,
"click",
&json!({"pid": 42, "window_id": 7, "element_index": 29}),
"Performed UIA Invoke on [29].",
1_300,
1_000,
).unwrap();
assert!(!unparsed_dir.join("click.png").exists());

assert_eq!(*marker_calls.lock().unwrap(), vec![(44.0, 78.0), (12.0, 34.0)]);
let _ = std::fs::remove_dir_all(root);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,10 @@ fn parse_click_turn(action_path: &Path) -> Option<ClickEvent> {
}

/// Pull "(screen (X,Y))" out of a tool-result text. Returns `(x, y)`
/// when the pattern is present and parses as numbers. Used as last-
/// resort coord recovery for element-indexed clicks whose action.json
/// doesn't yet carry click_point.
fn parse_screen_from_summary(summary: &str) -> Option<(f64, f64)> {
/// when the pattern is present and parses as numbers. Used by the
/// recorder and as last-resort coord recovery for element-indexed clicks
/// whose action.json doesn't yet carry click_point.
pub(crate) fn parse_screen_from_summary(summary: &str) -> Option<(f64, f64)> {
// Looking for: "...(screen (123,456))..." — find the inner parens
// right after "screen ".
let idx = summary.find("(screen (")?;
Expand Down
6 changes: 6 additions & 0 deletions libs/cua-driver/rust/crates/cua-driver/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,9 @@ fn build_registry(cursor_cfg: cursor_overlay::CursorConfig) -> cua_driver_core::
cua_driver_core::recording::set_element_bounds_fn(|wid, pid, idx| {
platform_windows::recording_hooks::element_window_local_xy(wid, pid, idx)
});
cua_driver_core::recording::set_window_bitmap_origin_fn(|window_id, pid| {
platform_windows::recording_hooks::window_bitmap_origin_xy(window_id, pid)
});
cua_driver_core::video::set_video_backend_factory(
Box::new(cua_driver_core::video_ffmpeg::FfmpegVideoBackendFactory),
);
Expand Down Expand Up @@ -715,6 +718,9 @@ fn build_registry_no_cursor() -> cua_driver_core::tool::ToolRegistry {
cua_driver_core::recording::set_element_bounds_fn(|wid, pid, idx| {
platform_windows::recording_hooks::element_window_local_xy(wid, pid, idx)
});
cua_driver_core::recording::set_window_bitmap_origin_fn(|window_id, pid| {
platform_windows::recording_hooks::window_bitmap_origin_xy(window_id, pid)
});
cua_driver_core::video::set_video_backend_factory(
Box::new(cua_driver_core::video_ffmpeg::FfmpegVideoBackendFactory),
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
//! - `element_window_local_xy` — resolves `element_index` to a click point in
//! window-local screenshot-pixel coordinates so `click.png` is also written
//! on UIA/MSAA-indexed clicks (not just pixel-addressed ones).
//! - `window_bitmap_origin_xy` — resolves the screenshot bitmap's top-left in
//! screen coordinates so result-summary screen points can be marker-local.

#[cfg(target_os = "windows")]
use std::sync::{Arc, OnceLock};
Expand Down Expand Up @@ -43,15 +45,52 @@ pub fn element_window_local_xy(window_id: u64, pid: i64, element_index: u32) ->
let cache = ELEMENT_CACHE.get()?;
let pid_u32 = u32::try_from(pid).ok()?;
let (sx, sy) = cache.get_element_center(pid_u32, window_id, element_index as usize)?;
// The cached center is in SCREEN coords. Convert to window-local pixel
// coords by subtracting the window's screen origin (GetWindowRect-equivalent
// in WindowInfo). Windows captures at logical pixels so no scale factor.
let wins = crate::win32::list_windows(Some(pid_u32));
let win = wins.iter().find(|w| w.hwnd == window_id)?;
Some(((sx - win.x) as f64, (sy - win.y) as f64))
// The cached center is in SCREEN coords. Convert to window-local
// screenshot pixels using the same bitmap origin as pixel clicks.
let (ox, oy) = window_bitmap_origin_xy(Some(window_id), Some(pid))?;
Some((sx as f64 - ox, sy as f64 - oy))
}

#[cfg(target_os = "windows")]
pub fn window_bitmap_origin_xy(window_id: Option<u64>, pid: Option<i64>) -> Option<(f64, f64)> {
use windows::Win32::Foundation::{HWND, RECT};
use windows::Win32::Graphics::Dwm::{DwmGetWindowAttribute, DWMWA_EXTENDED_FRAME_BOUNDS};
use windows::Win32::UI::WindowsAndMessaging::GetWindowRect;

let hwnd = match window_id {
Some(w) => w,
None => {
let pid_u32 = u32::try_from(pid?).ok()?;
crate::win32::list_windows(Some(pid_u32)).first().map(|w| w.hwnd)?
}
};

// Keep this constant in sync with `capture::DWM_CROP_INSET_PX` and
// `tools::impl_::bitmap_to_screen`.
const DWM_CROP_INSET_PX: i32 = 1;
let h = HWND(hwnd as *mut _);
unsafe {
let mut dwm = RECT::default();
let hr = DwmGetWindowAttribute(
h,
DWMWA_EXTENDED_FRAME_BOUNDS,
&mut dwm as *mut _ as *mut _,
std::mem::size_of::<RECT>() as u32,
);
if hr.is_ok() {
return Some(((dwm.left + DWM_CROP_INSET_PX) as f64,
(dwm.top + DWM_CROP_INSET_PX) as f64));
}

let mut wr = RECT::default();
GetWindowRect(h, &mut wr).ok()?;
Some((wr.left as f64, wr.top as f64))
}
}

#[cfg(not(target_os = "windows"))]
pub fn app_state_json_for(_window_id: Option<u64>, _pid: Option<i64>) -> Option<Vec<u8>> { None }
#[cfg(not(target_os = "windows"))]
pub fn element_window_local_xy(_window_id: u64, _pid: i64, _element_index: u32) -> Option<(f64, f64)> { None }
#[cfg(not(target_os = "windows"))]
pub fn window_bitmap_origin_xy(_window_id: Option<u64>, _pid: Option<i64>) -> Option<(f64, f64)> { None }