From f87ebe657c97108dc89bdc8095f2727d834d20e0 Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Sat, 11 Apr 2026 10:31:13 +0000 Subject: [PATCH 1/5] fix(tracking): record stats for hook-rewritten commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Claude Code rewrites `git status` → `rtk git status` via its PreToolUse hook, the hook first runs `rtk rewrite` which spawns a background telemetry thread holding a brief DB connection. If that process exits before the thread finishes, SQLite's WAL shared-memory file (.db-shm) may be inconsistent for a few milliseconds. Because `execute_batch()` stops on first error, bundling `PRAGMA journal_mode=WAL` and `PRAGMA busy_timeout=5000` in one call meant that a transient WAL-mode error silently left `busy_timeout=0`. With no wait budget, any lock-contention attempt in `Tracker::new()` failed immediately, silently dropping the tracking record. Fix two things: 1. Run `PRAGMA busy_timeout=5000` in its own `execute_batch()` call so it is always applied, regardless of whether the WAL pragma succeeds. 2. Add a single retry (10 ms sleep) in `TimedExecution::track()` and `track_passthrough()` to outlast the transient .db-shm window. Add regression tests for concurrent-open correctness and for the full hook-rewrite tracking path. Fixes #1082 Co-Authored-By: Claude Sonnet 4.6 --- src/core/tracking.rs | 123 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 18 deletions(-) diff --git a/src/core/tracking.rs b/src/core/tracking.rs index 359e4f688..b411b85ed 100644 --- a/src/core/tracking.rs +++ b/src/core/tracking.rs @@ -253,12 +253,18 @@ impl Tracker { } let conn = Connection::open(&db_path)?; - // WAL mode + busy_timeout for concurrent access (multiple Claude Code instances). + // Set busy_timeout first, separately from journal_mode=WAL. + // When the Claude Code hook rewrites a command it first spawns `rtk rewrite`, + // which starts a background telemetry thread that opens the DB. If that + // process exits before the thread finishes, the WAL shared-memory file + // (.db-shm) can be briefly inconsistent, causing `PRAGMA journal_mode=WAL` + // to return a transient error. Because execute_batch() stops on the first + // error, bundling both pragmas meant busy_timeout silently stayed at 0, + // making every lock-contention attempt fail immediately instead of waiting. + // Running them separately guarantees busy_timeout is always applied. + let _ = conn.execute_batch("PRAGMA busy_timeout=5000;"); // Non-fatal: NFS/read-only filesystems may not support WAL. - let _ = conn.execute_batch( - "PRAGMA journal_mode=WAL; - PRAGMA busy_timeout=5000;", - ); + let _ = conn.execute_batch("PRAGMA journal_mode=WAL;"); conn.execute( "CREATE TABLE IF NOT EXISTS commands ( id INTEGER PRIMARY KEY, @@ -1358,14 +1364,27 @@ impl TimedExecution { let input_tokens = estimate_tokens(input); let output_tokens = estimate_tokens(output); - if let Ok(tracker) = Tracker::new() { - let _ = tracker.record( - original_cmd, - rtk_cmd, - input_tokens, - output_tokens, - elapsed_ms, - ); + // Retry once on transient DB-open failures. When a Claude Code hook + // rewrites a command it first invokes `rtk rewrite`, which spawns a + // background telemetry thread that briefly holds a DB connection. If + // that process exits before the thread finishes, SQLite's WAL shared- + // memory file (.db-shm) may be inconsistent for a few milliseconds, + // causing Tracker::new() to fail. A single short sleep + retry is + // sufficient to outlast that window without blocking the user. + for attempt in 0..2u8 { + if let Ok(tracker) = Tracker::new() { + let _ = tracker.record( + original_cmd, + rtk_cmd, + input_tokens, + output_tokens, + elapsed_ms, + ); + return; + } + if attempt == 0 { + std::thread::sleep(std::time::Duration::from_millis(10)); + } } } @@ -1391,9 +1410,17 @@ impl TimedExecution { /// ``` pub fn track_passthrough(&self, original_cmd: &str, rtk_cmd: &str) { let elapsed_ms = self.start.elapsed().as_millis() as u64; - // input_tokens=0, output_tokens=0 won't dilute savings statistics - if let Ok(tracker) = Tracker::new() { - let _ = tracker.record(original_cmd, rtk_cmd, 0, 0, elapsed_ms); + // input_tokens=0, output_tokens=0 won't dilute savings statistics. + // Same retry logic as track() to handle transient DB-open failures in + // hook execution contexts. + for attempt in 0..2u8 { + if let Ok(tracker) = Tracker::new() { + let _ = tracker.record(original_cmd, rtk_cmd, 0, 0, elapsed_ms); + return; + } + if attempt == 0 { + std::thread::sleep(std::time::Duration::from_millis(10)); + } } } } @@ -1606,7 +1633,67 @@ mod tests { ); } - // 12. record_parse_failure + get_parse_failure_summary roundtrip + // 12. busy_timeout pragma is applied even when journal_mode=WAL is already set. + // Regression: both pragmas were in one execute_batch() call. If journal_mode=WAL + // returned a transient error, busy_timeout was never set (stayed at 0), causing + // all lock-contention writes to fail immediately. + // Verify by opening two connections and writing from both — WAL + busy_timeout + // must allow this without returning a "database is locked" error. + #[test] + fn test_busy_timeout_set_independently_of_wal_mode() { + let pid = std::process::id(); + + // Open DB twice: second open simulates hook-spawned concurrent connection + let tracker1 = Tracker::new().expect("first open must succeed"); + let tracker2 = Tracker::new().expect("second open must succeed even with first open"); + + // Both connections should be able to write (busy_timeout guarantees no + // immediate lock-fail; WAL mode serialises writers without blocking readers) + let cmd1 = format!("git status hook_{}_a", pid); + let cmd2 = format!("git status hook_{}_b", pid); + tracker1 + .record("git status", &cmd1, 100, 20, 5) + .expect("write from first connection must succeed"); + tracker2 + .record("git status", &cmd2, 100, 20, 5) + .expect("write from second connection must succeed"); + + // Confirm both records are visible + let verify = Tracker::new().expect("verify tracker must open"); + let recent = verify.get_recent(20).expect("get_recent must work"); + assert!(recent.iter().any(|r| r.rtk_cmd == cmd1), "record A missing"); + assert!(recent.iter().any(|r| r.rtk_cmd == cmd2), "record B missing"); + } + + // 13. TimedExecution::track succeeds even when a concurrent reader holds the DB. + // Regression: hook-rewritten commands previously failed to record because a + // background telemetry thread (from `rtk rewrite`) held the DB open at the + // moment the tracking write was attempted, and busy_timeout was 0. + #[test] + fn test_track_records_with_concurrent_reader() { + let pid = std::process::id(); + let hook_cmd = format!("rtk git status hook_rewritten_{}", pid); + + // Simulate concurrent reader (e.g., telemetry thread from `rtk rewrite`) + let reader = Tracker::new().expect("reader must open"); + + // Now track — must succeed despite the concurrent reader connection + let timer = TimedExecution::start(); + timer.track("git status", &hook_cmd, "raw output", "filtered"); + + // Verify the record was actually written (not silently dropped) + let verify = Tracker::new().expect("verify tracker must open"); + let recent = verify.get_recent(20).expect("get_recent must work"); + assert!( + recent.iter().any(|r| r.rtk_cmd == hook_cmd), + "hook-rewritten tracking record must be present in DB (got {} records)", + recent.len() + ); + + drop(reader); + } + + // 14. record_parse_failure + get_parse_failure_summary roundtrip #[test] fn test_parse_failure_roundtrip() { let tracker = Tracker::new().expect("Failed to create tracker"); @@ -1624,7 +1711,7 @@ mod tests { assert!(summary.recent.iter().any(|r| r.raw_command == test_cmd)); } - // 13. recovery_rate calculation + // 15. recovery_rate calculation #[test] fn test_parse_failure_recovery_rate() { let tracker = Tracker::new().expect("Failed to create tracker"); From 7a6d74b5c34f0b0aa7ed4f0b6ab415a1d267a4af Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Wed, 17 Jun 2026 23:14:09 +0000 Subject: [PATCH 2/5] test(tracking): replace sequential test with real lock-contention RED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous test_busy_timeout_set_independently_of_wal_mode opened two trackers but wrote to them sequentially, so it passed even with the busy_timeout pragma removed — it did not actually guard the fix. Replace it with test_record_waits_for_concurrent_write_lock: a background thread holds the SQLite write lock (BEGIN IMMEDIATE) while the tracker attempts its INSERT. With busy_timeout=5000 the write waits and succeeds; with busy_timeout=0 (the #1082 bug) it fails immediately with SQLITE_BUSY. Add a shared DB_ENV_LOCK so RTK_DB_PATH-mutating tests don't race. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/core/tracking.rs | 86 ++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/src/core/tracking.rs b/src/core/tracking.rs index b411b85ed..8abad2ac1 100644 --- a/src/core/tracking.rs +++ b/src/core/tracking.rs @@ -1450,6 +1450,10 @@ pub fn args_display(args: &[OsString]) -> String { mod tests { use super::*; + // Serializes tests that mutate the process-global RTK_DB_PATH env var so they + // don't race each other (set_var/remove_var is process-wide). + static DB_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + // 1. estimate_tokens — verify ~4 chars/token ratio #[test] fn test_estimate_tokens() { @@ -1577,9 +1581,7 @@ mod tests { #[test] fn test_db_path_env_and_default() { use std::env; - use std::sync::Mutex; - static ENV_LOCK: Mutex<()> = Mutex::new(()); - let _guard = ENV_LOCK.lock().unwrap(); + let _guard = DB_ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); let custom_path = env::temp_dir().join("rtk_test_custom.db"); env::set_var("RTK_DB_PATH", &custom_path); @@ -1633,36 +1635,58 @@ mod tests { ); } - // 12. busy_timeout pragma is applied even when journal_mode=WAL is already set. - // Regression: both pragmas were in one execute_batch() call. If journal_mode=WAL - // returned a transient error, busy_timeout was never set (stayed at 0), causing - // all lock-contention writes to fail immediately. - // Verify by opening two connections and writing from both — WAL + busy_timeout - // must allow this without returning a "database is locked" error. + // 12. busy_timeout pragma lets a tracked write WAIT for a concurrent writer's + // lock instead of failing immediately. Regression for #1082: both pragmas were + // in one execute_batch() call, so a transient journal_mode=WAL error left + // busy_timeout at 0 and every lock-contended write failed with SQLITE_BUSY — + // exactly the hook-spawned concurrent-rtk case where gain stats went missing. + // We reproduce real contention: a second connection holds the write lock on a + // background thread while the tracker attempts its INSERT. #[test] - fn test_busy_timeout_set_independently_of_wal_mode() { - let pid = std::process::id(); + fn test_record_waits_for_concurrent_write_lock() { + use std::sync::mpsc; + use std::time::Duration; - // Open DB twice: second open simulates hook-spawned concurrent connection - let tracker1 = Tracker::new().expect("first open must succeed"); - let tracker2 = Tracker::new().expect("second open must succeed even with first open"); - - // Both connections should be able to write (busy_timeout guarantees no - // immediate lock-fail; WAL mode serialises writers without blocking readers) - let cmd1 = format!("git status hook_{}_a", pid); - let cmd2 = format!("git status hook_{}_b", pid); - tracker1 - .record("git status", &cmd1, 100, 20, 5) - .expect("write from first connection must succeed"); - tracker2 - .record("git status", &cmd2, 100, 20, 5) - .expect("write from second connection must succeed"); - - // Confirm both records are visible - let verify = Tracker::new().expect("verify tracker must open"); - let recent = verify.get_recent(20).expect("get_recent must work"); - assert!(recent.iter().any(|r| r.rtk_cmd == cmd1), "record A missing"); - assert!(recent.iter().any(|r| r.rtk_cmd == cmd2), "record B missing"); + // The competing writer holds the lock this long: well under the 5s + // busy_timeout (so the waiting write succeeds when the fix is present) but + // long enough that a busy_timeout of 0 fails immediately. + const LOCK_HOLD_MS: u64 = 300; + + let _guard = DB_ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + + let pid = std::process::id(); + let db_path = std::env::temp_dir().join(format!("rtk_busy_timeout_{}.db", pid)); + let _ = std::fs::remove_file(&db_path); + std::env::set_var("RTK_DB_PATH", &db_path); + + // Create the schema and a connection carrying the busy_timeout pragma (the fix). + let tracker = Tracker::new().expect("tracker open must succeed"); + + // Competing writer grabs the write lock and holds it briefly on a thread. + let (locked_tx, locked_rx) = mpsc::channel(); + let lock_path = db_path.clone(); + let blocker = std::thread::spawn(move || { + let conn = Connection::open(&lock_path).expect("blocker open must succeed"); + conn.execute_batch("BEGIN IMMEDIATE;") + .expect("blocker must acquire the write lock"); + locked_tx.send(()).expect("signal lock held"); + std::thread::sleep(Duration::from_millis(LOCK_HOLD_MS)); + conn.execute_batch("COMMIT;") + .expect("blocker must release the write lock"); + }); + + locked_rx.recv().expect("blocker must signal the lock is held"); + + // With busy_timeout=5000 this INSERT waits for the lock; with busy_timeout=0 + // (the bug) it returns SQLITE_BUSY immediately and the stat is lost. + let cmd = format!("git status hook_{}", pid); + let result = tracker.record("git status", &cmd, 100, 20, 5); + + blocker.join().expect("blocker thread must finish"); + std::env::remove_var("RTK_DB_PATH"); + let _ = std::fs::remove_file(&db_path); + + result.expect("record must wait for the concurrent lock and succeed (busy_timeout)"); } // 13. TimedExecution::track succeeds even when a concurrent reader holds the DB. From ff0390df98a21df0c0699ceed432a8dacec11d09 Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Wed, 17 Jun 2026 23:16:04 +0000 Subject: [PATCH 3/5] style(tracking): cargo fmt for #1082 test Co-Authored-By: Claude Opus 4.8 (1M context) --- src/core/tracking.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core/tracking.rs b/src/core/tracking.rs index 8abad2ac1..79d51e618 100644 --- a/src/core/tracking.rs +++ b/src/core/tracking.rs @@ -1675,7 +1675,9 @@ mod tests { .expect("blocker must release the write lock"); }); - locked_rx.recv().expect("blocker must signal the lock is held"); + locked_rx + .recv() + .expect("blocker must signal the lock is held"); // With busy_timeout=5000 this INSERT waits for the lock; with busy_timeout=0 // (the bug) it returns SQLITE_BUSY immediately and the stat is lost. From a2556ff0d655f0edf1d909ba27951ad81da9e740 Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Wed, 17 Jun 2026 23:26:54 +0000 Subject: [PATCH 4/5] fix(tracking): isolate concurrency test via open_at_path, drop env race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lock-contention test set the process-global RTK_DB_PATH, which raced the other tracking tests that use the default DB under `cargo test --all` (test_timed_execution_records_time failed intermittently in CI). Extract Tracker::open_at_path(&Path) — shared by new() — so the test can target an isolated temp file without mutating any global env var. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/core/tracking.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/core/tracking.rs b/src/core/tracking.rs index 79d51e618..c28a81cf3 100644 --- a/src/core/tracking.rs +++ b/src/core/tracking.rs @@ -251,8 +251,14 @@ impl Tracker { if let Some(parent) = db_path.parent() { std::fs::create_dir_all(parent)?; } + Self::open_at_path(&db_path) + } - let conn = Connection::open(&db_path)?; + /// Open (or create) the history DB at an explicit path, applying the same + /// pragmas and schema as `new()`. Kept separate so tests can target an + /// isolated file without mutating the process-global RTK_DB_PATH env var. + fn open_at_path(db_path: &std::path::Path) -> Result { + let conn = Connection::open(db_path)?; // Set busy_timeout first, separately from journal_mode=WAL. // When the Claude Code hook rewrites a command it first spawns `rtk rewrite`, // which starts a background telemetry thread that opens the DB. If that @@ -1652,15 +1658,14 @@ mod tests { // long enough that a busy_timeout of 0 fails immediately. const LOCK_HOLD_MS: u64 = 300; - let _guard = DB_ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); - + // Isolated DB file (no RTK_DB_PATH env mutation, so this can't race the + // other tracking tests that use the default DB under `cargo test --all`). let pid = std::process::id(); let db_path = std::env::temp_dir().join(format!("rtk_busy_timeout_{}.db", pid)); let _ = std::fs::remove_file(&db_path); - std::env::set_var("RTK_DB_PATH", &db_path); - // Create the schema and a connection carrying the busy_timeout pragma (the fix). - let tracker = Tracker::new().expect("tracker open must succeed"); + // Connection carrying the busy_timeout pragma (the fix), at the isolated path. + let tracker = Tracker::open_at_path(&db_path).expect("tracker open must succeed"); // Competing writer grabs the write lock and holds it briefly on a thread. let (locked_tx, locked_rx) = mpsc::channel(); @@ -1685,7 +1690,6 @@ mod tests { let result = tracker.record("git status", &cmd, 100, 20, 5); blocker.join().expect("blocker thread must finish"); - std::env::remove_var("RTK_DB_PATH"); let _ = std::fs::remove_file(&db_path); result.expect("record must wait for the concurrent lock and succeed (busy_timeout)"); From 32dd9f656b2ae13a1a6f25e51b2deda84be41c56 Mon Sep 17 00:00:00 2001 From: Ousama Ben Younes Date: Wed, 17 Jun 2026 23:34:14 +0000 Subject: [PATCH 5/5] test(tracking): use tempfile::tempdir to satisfy filesystem-deletion lint The semgrep filesystem-deletion rule flagged the explicit fs::remove_file cleanup. Switch to tempfile::tempdir() (the repo's existing convention), which auto-removes on drop without a literal deletion call. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/core/tracking.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/core/tracking.rs b/src/core/tracking.rs index c28a81cf3..d8e3cc200 100644 --- a/src/core/tracking.rs +++ b/src/core/tracking.rs @@ -1658,11 +1658,12 @@ mod tests { // long enough that a busy_timeout of 0 fails immediately. const LOCK_HOLD_MS: u64 = 300; - // Isolated DB file (no RTK_DB_PATH env mutation, so this can't race the - // other tracking tests that use the default DB under `cargo test --all`). + // Isolated temp dir (auto-removed on drop, no RTK_DB_PATH env mutation) so + // this can't race the other tracking tests that use the default DB under + // `cargo test --all`. let pid = std::process::id(); - let db_path = std::env::temp_dir().join(format!("rtk_busy_timeout_{}.db", pid)); - let _ = std::fs::remove_file(&db_path); + let dir = tempfile::tempdir().expect("create temp dir"); + let db_path = dir.path().join("busy_timeout.db"); // Connection carrying the busy_timeout pragma (the fix), at the isolated path. let tracker = Tracker::open_at_path(&db_path).expect("tracker open must succeed"); @@ -1690,7 +1691,8 @@ mod tests { let result = tracker.record("git status", &cmd, 100, 20, 5); blocker.join().expect("blocker thread must finish"); - let _ = std::fs::remove_file(&db_path); + // `dir` is dropped at end of scope, removing the temp DB (no explicit + // fs::remove_file — keeps the filesystem-deletion lint clean). result.expect("record must wait for the concurrent lock and succeed (busy_timeout)"); }