From d68c64dd7816f9a14b260451dd4ed80c83a9917a Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:37:54 -0700 Subject: [PATCH 1/5] xdiff: support external hunks via xpparam_t Add two new xpparam_t fields (external_hunks, external_hunks_nr) that let callers supply pre-computed hunks. When set, xdl_diff() populates the changed[] arrays from these hunks instead of running the diff algorithm, then continues through compaction and emission as usual. Validate supplied hunks before use: reject out-of-bounds line numbers, overlapping or out-of-order hunks, negative counts, and violations of the synchronization invariant (unchanged line counts must match between files). On validation failure, fall back to the builtin diff algorithm. Skip trim_common_tail() in xdi_diff() when external hunks are present, since external hunks reference line numbers in the original content. Signed-off-by: Michael Montalbo --- xdiff-interface.c | 7 +++- xdiff/xdiff.h | 13 +++++++ xdiff/xdiffi.c | 92 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/xdiff-interface.c b/xdiff-interface.c index f043330f2a12a0..9542c0bcc20f37 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -124,7 +124,12 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE) return -1; - if (!xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) + /* + * External hunks reference line numbers in the original content; + * trimming the tail would change line counts and invalidate them. 
+ */ + if (!xpp->external_hunks && + !xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) trim_common_tail(&a, &b); return xdl_diff(&a, &b, xpp, xecfg, xecb); diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index dc370712e92860..2ee6f1aae3635e 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -78,6 +78,15 @@ typedef struct s_mmbuffer { long size; } mmbuffer_t; +/* + * Hunk descriptor for externally computed diffs. + * Line numbers are 1-based, matching unified diff convention. + */ +struct xdl_hunk { + long old_start, old_count; + long new_start, new_count; +}; + typedef struct s_xpparam { unsigned long flags; @@ -88,6 +97,10 @@ typedef struct s_xpparam { /* See Documentation/diff-options.adoc. */ char **anchors; size_t anchors_nr; + + /* Externally computed hunks: bypass the diff algorithm. */ + const struct xdl_hunk *external_hunks; + size_t external_hunks_nr; } xpparam_t; typedef struct s_xdemitcb { diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 5455b4690d38ff..232f4a8de8f8a1 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -1085,16 +1085,104 @@ static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe, } } +/* + * Populate the changed[] arrays from externally supplied hunks, + * bypassing the diff algorithm. Validates that hunks are in order, + * non-overlapping, and within bounds. + * + * Returns 0 on success, -1 on validation failure. + */ +static int xdl_populate_hunks_from_external(xdfenv_t *xe, + const struct xdl_hunk *hunks, + size_t nr_hunks) +{ + size_t i; + long j, prev_old_end = 0, prev_new_end = 0; + long total_old = 0, total_new = 0; + + /* + * Clear changed[] arrays including sentinels. + * xdl_prepare_env() may have dirtied them via + * xdl_cleanup_records(), and xdl_change_compact() reads + * the sentinel at changed[-1] during backward scans. 
+	 */
+	memset(xe->xdf1.changed - 1, 0,
+	       (xe->xdf1.nrec + 2) * sizeof(bool));
+	memset(xe->xdf2.changed - 1, 0,
+	       (xe->xdf2.nrec + 2) * sizeof(bool));
+
+	for (i = 0; i < nr_hunks; i++) {
+		const struct xdl_hunk *h = &hunks[i];
+
+		if (h->old_count < 0 || h->new_count < 0)
+			return -1;
+
+		/* Bounds check (1-based); phrased so the sum cannot overflow */
+		if (h->old_count > 0 &&
+		    (h->old_start < 1 || h->old_start > (long)xe->xdf1.nrec ||
+		     h->old_count > (long)xe->xdf1.nrec - h->old_start + 1))
+			return -1;
+		if (h->new_count > 0 &&
+		    (h->new_start < 1 || h->new_start > (long)xe->xdf2.nrec ||
+		     h->new_count > (long)xe->xdf2.nrec - h->new_start + 1))
+			return -1;
+
+		/* Zero-count hunks: start must still be in [1, nrec+1] */
+		if (h->old_count == 0 &&
+		    (h->old_start < 1 || h->old_start > xe->xdf1.nrec + 1))
+			return -1;
+		if (h->new_count == 0 &&
+		    (h->new_start < 1 || h->new_start > xe->xdf2.nrec + 1))
+			return -1;
+
+		/* Ordering: no overlap with previous hunk */
+		if (h->old_start < prev_old_end ||
+		    h->new_start < prev_new_end)
+			return -1;
+
+		for (j = 0; j < h->old_count; j++)
+			xe->xdf1.changed[h->old_start - 1 + j] = true;
+		for (j = 0; j < h->new_count; j++)
+			xe->xdf2.changed[h->new_start - 1 + j] = true;
+
+		prev_old_end = h->old_start + h->old_count;
+		prev_new_end = h->new_start + h->new_count;
+		total_old += h->old_count;
+		total_new += h->new_count;
+	}
+
+	/*
+	 * Synchronization invariant: unchanged line counts must match.
+	 * Otherwise xdl_build_script() would walk off one array.
+	 */
+	if ((long)xe->xdf1.nrec - total_old !=
+	    (long)xe->xdf2.nrec - total_new)
+		return -1;
+
+	return 0;
+}
+
 int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
 	xdchange_t *xscr;
 	xdfenv_t xe;
 	emit_func_t ef = xecfg->hunk_func ?
xdl_call_hunk_func : xdl_emit_diff; - if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) { + if (xpp->external_hunks) { + if (xdl_prepare_env(mf1, mf2, xpp, &xe) < 0) + return -1; + if (xdl_populate_hunks_from_external(&xe, + xpp->external_hunks, + xpp->external_hunks_nr) == 0) + goto diff_done; + xdl_free_env(&xe); + } + if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) return -1; - } + +diff_done: + if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 || xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 || xdl_build_script(&xe, &xscr) < 0) { From 202b529becef0be983d74df84ca829fc7fe85cf7 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:37:59 -0700 Subject: [PATCH 2/5] userdiff: add diff..process config Add a new per-driver configuration key that specifies the command for a long-running diff process. The name follows filter..process: a long-running subprocess that stays alive across files within a single git invocation. Signed-off-by: Michael Montalbo --- userdiff.c | 7 +++++++ userdiff.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/userdiff.c b/userdiff.c index fe710a68bfdfa6..81c0bebcce65e0 100644 --- a/userdiff.c +++ b/userdiff.c @@ -499,6 +499,13 @@ int userdiff_config(const char *k, const char *v) drv->algorithm = drv->algorithm_owned; return ret; } + if (!strcmp(type, "process")) { + int ret; + FREE_AND_NULL(drv->process_owned); + ret = git_config_string(&drv->process_owned, k, v); + drv->process = drv->process_owned; + return ret; + } return 0; } diff --git a/userdiff.h b/userdiff.h index 827361b0bc9569..51c26e0d4190e5 100644 --- a/userdiff.h +++ b/userdiff.h @@ -31,6 +31,8 @@ struct userdiff_driver { char *textconv_owned; struct notes_cache *textconv_cache; int textconv_want_cache; + const char *process; + char *process_owned; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, From a7b65a4303759b202339bca890eb1b631690569b Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:38:21 -0700 Subject: 
[PATCH 3/5] diff: add long-running diff process via diff..process Add support for external diff processes that communicate via the long-running process protocol (pkt-line over stdin/stdout). A diff process is configured per userdiff driver: [diff "cdiff"] process = /path/to/diff-tool The tool provides custom line-matching: it receives file pairs and returns hunks that reference original line numbers. Unlike textconv, which transforms the displayed content, the diff output shows the actual file while the tool controls which lines are marked as changed. The handshake negotiates version=1 and capability=hunks. Per-file requests send command=hunks, pathname, and both file contents as packetized data. The tool responds with hunk lines and a status packet. On error, git falls back to the builtin diff algorithm with a warning. Zero hunks with status=success means the tool considers the files equivalent. Git skips diff output for that file. Signed-off-by: Michael Montalbo --- Documentation/config/diff.adoc | 8 + Documentation/gitattributes.adoc | 40 ++++ Makefile | 1 + diff-process.c | 206 +++++++++++++++++++ diff-process.h | 28 +++ diff.c | 23 +++ t/t4080-diff-process.sh | 338 +++++++++++++++++++++++++++++++ 7 files changed, 644 insertions(+) create mode 100644 diff-process.c create mode 100644 diff-process.h create mode 100755 t/t4080-diff-process.sh diff --git a/Documentation/config/diff.adoc b/Documentation/config/diff.adoc index 1135a62a0ad3de..4ab5f60df685d0 100644 --- a/Documentation/config/diff.adoc +++ b/Documentation/config/diff.adoc @@ -218,6 +218,14 @@ endif::git-diff[] Set this option to `true` to make the diff driver cache the text conversion outputs. See linkgit:gitattributes[5] for details. +`diff..process`:: + The command to run as a long-running diff process. + The tool communicates via the pkt-line protocol and returns + hunks that are fed into Git's diff and blame pipelines. 
+	If the tool returns zero hunks, the file is treated as
+	unchanged for both diff output and blame attribution.
+	See linkgit:gitattributes[5] for details.
+
 `diff.indentHeuristic`::
 	Set this option to `false` to disable the default heuristics
 	that shift diff hunk boundaries to make patches easier to read.
diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc
index f20041a323d174..cc724f8c636671 100644
--- a/Documentation/gitattributes.adoc
+++ b/Documentation/gitattributes.adoc
@@ -821,6 +821,46 @@ NOTE: If `diff.<name>.command` is defined for path with the
 (see above), and adding `diff.<name>.algorithm` has no effect, as the
 algorithm is not passed to the external diff driver.
 
+Using an external diff process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An external tool can provide content-aware line matching by
+setting `diff.<name>.process` to the command that runs
+the tool. The tool is a long-running process that communicates via
+the pkt-line protocol (see
+linkgit:gitprotocol-long-running-process[5]).
+
+------------------------
+*.c diff=cdiff
+------------------------
+
+----------------------------------------------------------------
+[diff "cdiff"]
+	process = /path/to/diff-process-tool
+----------------------------------------------------------------
+
+The tool receives file pairs and returns hunk descriptors indicating
+which lines changed. Git feeds these hunks into its standard diff
+pipeline, so all output features (word diff, function context,
+color) work normally.
+
+If the tool fails or reports an error, Git falls back to the builtin
+diff algorithm; `git diff` prints a warning when this happens. Invalid
+hunks (out of bounds, overlapping) trigger the same fallback silently.
+
+The handshake negotiates `version=1` and `capability=hunks`.
+Per-file requests send `command=hunks` and `pathname=<path>`,
+followed by the old and new file content as packetized data.
+The tool responds with lines of the form
+`hunk <old_start> <old_count> <new_start> <new_count>`
+(1-based line numbers), a flush packet, and `status=success`.
+
+If the tool returns zero hunks with `status=success`, Git treats
+the file as having no changes and produces no diff output.
+
+Tools should ignore unknown keys in the per-file request to
+remain forward-compatible.
+
 Defining a custom hunk-header
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/Makefile b/Makefile
index cedc234173e377..22900368dd59b5 100644
--- a/Makefile
+++ b/Makefile
@@ -1142,6 +1142,7 @@ LIB_OBJS += diff-delta.o
 LIB_OBJS += diff-merges.o
 LIB_OBJS += diff-lib.o
 LIB_OBJS += diff-no-index.o
+LIB_OBJS += diff-process.o
 LIB_OBJS += diff.o
 LIB_OBJS += diffcore-break.o
 LIB_OBJS += diffcore-delta.o
diff --git a/diff-process.c b/diff-process.c
new file mode 100644
index 00000000000000..801ac9e22e41bc
--- /dev/null
+++ b/diff-process.c
@@ -0,0 +1,223 @@
+/*
+ * Diff process backend: communicates with a long-running external
+ * tool via the pkt-line protocol to obtain custom line-matching
+ * results. Unlike textconv, which transforms the displayed content,
+ * hunks from a diff process reference original line numbers and
+ * the display shows the actual file content.
+ *
+ * Protocol: pkt-line over stdin/stdout, following the pattern of
+ * the long-running filter process protocol (see convert.c).
+ *
+ * Handshake:
+ *   git> git-diff-client / version=1 / flush
+ *   tool< git-diff-server / version=1 / flush
+ *   git> capability=hunks / flush
+ *   tool< capability=hunks / flush
+ *
+ * Per-file:
+ *   git> command=hunks / pathname=<path> / flush
+ *   git> <old content> / flush
+ *   git> <new content> / flush
+ *   tool< hunk <old_start> <old_count> <new_start> <new_count>
+ *   tool< ... / flush
+ *   tool< status=success / flush
+ *
+ * Zero hunks with status=success means the tool considers the
+ * files equivalent. Git will skip the diff for that file.
+ */
+
+#include "git-compat-util.h"
+#include "diff-process.h"
+#include "userdiff.h"
+#include "sub-process.h"
+#include "pkt-line.h"
+#include "sigchain.h"
+#include "strbuf.h"
+#include "xdiff/xdiff.h"
+
+#define CAP_HUNKS (1u << 0)
+
+struct diff_subprocess {
+	struct subprocess_entry subprocess;
+	unsigned int supported_capabilities;
+};
+
+static int subprocess_map_initialized;
+static struct hashmap subprocess_map;
+
+static int start_diff_process_fn(struct subprocess_entry *subprocess)
+{
+	static int versions[] = { 1, 0 };
+	static struct subprocess_capability capabilities[] = {
+		{ "hunks", CAP_HUNKS },
+		{ NULL, 0 }
+	};
+	struct diff_subprocess *entry =
+		(struct diff_subprocess *)subprocess;
+
+	/* Uses dying pkt-line variant, same as convert.c filters. */
+	return subprocess_handshake(subprocess, "git-diff",
+				    versions, NULL,
+				    capabilities,
+				    &entry->supported_capabilities);
+}
+
+static struct diff_subprocess *find_or_start_process(const char *cmd)
+{
+	struct diff_subprocess *entry;
+
+	if (!subprocess_map_initialized) {
+		subprocess_map_initialized = 1;
+		hashmap_init(&subprocess_map, cmd2process_cmp, NULL, 0);
+	}
+
+	entry = (struct diff_subprocess *)
+		subprocess_find_entry(&subprocess_map, cmd);
+	if (entry)
+		return entry;
+
+	entry = xcalloc(1, sizeof(*entry));
+	if (subprocess_start(&subprocess_map, &entry->subprocess,
+			     cmd, start_diff_process_fn)) {
+		free(entry);
+		return NULL;
+	}
+
+	return entry;
+}
+
+static int send_file_content(int fd, const char *buf, long size)
+{
+	int ret;
+
+	if (size > 0)
+		ret = write_packetized_from_buf_no_flush(buf, size, fd);
+	else
+		ret = 0;
+	if (ret)
+		return ret;
+	return packet_flush_gently(fd);
+}
+
+/*
+ * Parse one decimal field of a hunk line. The number must be
+ * followed by `terminator` and must fit in a long. On success,
+ * store it in *out and leave *linep pointing at the terminator.
+ */
+static int parse_hunk_field(const char **linep, char terminator, long *out)
+{
+	char *end;
+
+	errno = 0;
+	*out = strtol(*linep, &end, 10);
+	if (end == *linep || *end != terminator || errno == ERANGE)
+		return -1;
+	*linep = end;
+	return 0;
+}
+
+static int parse_hunk_line(const char *line, struct xdl_hunk *hunk)
+{
+	/* Format: "hunk <old_start> <old_count> <new_start> <new_count>" */
+	if (!skip_prefix(line, "hunk ", &line))
+		return -1;
+
+	if (parse_hunk_field(&line, ' ', &hunk->old_start) ||
+	    parse_hunk_field(&line, ' ', &hunk->old_count) ||
+	    parse_hunk_field(&line, ' ', &hunk->new_start) ||
+	    parse_hunk_field(&line, '\0', &hunk->new_count))
+		return -1;
+
+	return 0;
+}
+
+int diff_process_get_hunks(struct userdiff_driver *drv,
+			   const char *path,
+			   const char *old_buf, long old_size,
+			   const char *new_buf, long new_size,
+			   struct xdl_hunk **hunks_out,
+			   size_t *nr_hunks_out)
+{
+	struct diff_subprocess *backend;
+	struct child_process *process;
+	int fd_in, fd_out;
+	struct strbuf status = STRBUF_INIT;
+	struct xdl_hunk *hunks = NULL;
+	struct xdl_hunk hunk;
+	size_t nr_hunks = 0, alloc_hunks = 0;
+	int len;
+	char *line;
+
+	if (!drv || !drv->process)
+		return -1;
+
+	backend = find_or_start_process(drv->process);
+	if (!backend)
+		return -1;
+
+	if (!(backend->supported_capabilities & CAP_HUNKS))
+		return -1;
+
+	process = subprocess_get_child_process(&backend->subprocess);
+	fd_in = process->in;
+	fd_out = process->out;
+
+	/* A tool that died must yield a write error, not kill git. */
+	sigchain_push(SIGPIPE, SIG_IGN);
+
+	/* Send request */
+	if (packet_write_fmt_gently(fd_in, "command=hunks\n") ||
+	    packet_write_fmt_gently(fd_in, "pathname=%s\n", path) ||
+	    packet_flush_gently(fd_in))
+		goto protocol_error;
+
+	/* Send old file content */
+	if (send_file_content(fd_in, old_buf, old_size))
+		goto protocol_error;
+
+	/* Send new file content */
+	if (send_file_content(fd_in, new_buf, new_size))
+		goto protocol_error;
+
+	/* Read hunks until flush packet */
+	while ((len = packet_read_line_gently(fd_out, NULL, &line)) >= 0 &&
+	       line) {
+		if (parse_hunk_line(line, &hunk) < 0)
+			goto protocol_error;
+		ALLOC_GROW(hunks, nr_hunks + 1, alloc_hunks);
+		hunks[nr_hunks++] = hunk;
+	}
+	if (len < 0)
+		goto protocol_error;
+
+	/* Read status */
+	if (subprocess_read_status(fd_out, &status))
+		goto protocol_error;
+
+	sigchain_pop(SIGPIPE);
+
+	if (strcmp(status.buf, "success")) {
+		if (!strcmp(status.buf, "abort"))
+			backend->supported_capabilities &= ~CAP_HUNKS;
+		goto error;
+	}
+
+	*hunks_out = hunks;
+	*nr_hunks_out = nr_hunks;
+	strbuf_release(&status);
+	return 0;
+
+protocol_error:
+	sigchain_pop(SIGPIPE);
+	/*
+	 * The exchange stopped part-way, so unread packets may still be
+	 * queued on the pipe. Stop the process (as convert.c does for a
+	 * failed filter) so the next request cannot read a stale reply.
+	 */
+	subprocess_stop(&subprocess_map, &backend->subprocess);
+	free(backend);
+error:
+	free(hunks);
+	strbuf_release(&status);
+	return -1;
+}
diff --git a/diff-process.h b/diff-process.h
new file mode 100644
index 00000000000000..4c84951e0273fa
--- /dev/null
+++ b/diff-process.h
@@ -0,0 +1,28 @@
+#ifndef DIFF_PROCESS_H
+#define DIFF_PROCESS_H
+
+struct userdiff_driver;
+struct xdl_hunk;
+
+/*
+ * Query a diff process for hunks describing the changes
+ * between old_buf and new_buf.
+ *
+ * The backend is a long-running subprocess configured via
+ * diff.<name>.process. It receives file content via
+ * pkt-line and returns hunks with 1-based line numbers.
+ *
+ * On success, sets *hunks_out and *nr_hunks_out to a newly allocated
+ * array (caller must free) and returns 0.
+ *
+ * On failure, returns -1. The caller should fall back to the
+ * builtin diff algorithm.
+ */
+int diff_process_get_hunks(struct userdiff_driver *drv,
+			   const char *path,
+			   const char *old_buf, long old_size,
+			   const char *new_buf, long new_size,
+			   struct xdl_hunk **hunks_out,
+			   size_t *nr_hunks_out);
+
+#endif /* DIFF_PROCESS_H */
diff --git a/diff.c b/diff.c
index 397e38b41cc6fa..1aeb0f319ebae0 100644
--- a/diff.c
+++ b/diff.c
@@ -25,6 +25,7 @@
 #include "utf8.h"
 #include "odb.h"
 #include "userdiff.h"
+#include "diff-process.h"
 #include "submodule.h"
 #include "hashmap.h"
 #include "mem-pool.h"
@@ -3991,6 +3992,7 @@ static void builtin_diff(const char *name_a,
 		xpparam_t xpp;
 		xdemitconf_t xecfg;
 		struct emit_callback ecbdata;
+		struct xdl_hunk *ext_hunks = NULL;
 		unsigned ws_rule;
 		const struct userdiff_funcname *pe;
 
@@ -4031,6 +4033,26 @@ static void builtin_diff(const char *name_a,
 		xpp.ignore_regex_nr = o->ignore_regex_nr;
 		xpp.anchors = o->anchors;
 		xpp.anchors_nr = o->anchors_nr;
+
+		if (!o->ignore_driver_algorithm &&
+		    one->driver && one->driver->process) {
+			size_t ext_hunks_nr = 0;
+			if (!diff_process_get_hunks(
+				    one->driver, name_a,
+				    mf1.ptr, mf1.size,
+				    mf2.ptr, mf2.size,
+				    &ext_hunks, &ext_hunks_nr)) {
+				if (!ext_hunks_nr)
+					goto
free_ab_and_return; + xpp.external_hunks = ext_hunks; + xpp.external_hunks_nr = ext_hunks_nr; + } else { + warning(_("diff process failed for '%s'," + " falling back to builtin diff"), + name_a); + } + } + xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_FUNCNAMES; @@ -4111,6 +4133,7 @@ static void builtin_diff(const char *name_a, } else if (xdi_diff_outf(&mf1, &mf2, NULL, fn_out_consume, &ecbdata, &xpp, &xecfg)) die("unable to generate diff for %s", one->path); + free(ext_hunks); if (o->word_diff) free_diff_words_data(&ecbdata); if (textconv_one) diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh new file mode 100755 index 00000000000000..6f49f4e66bb8e3 --- /dev/null +++ b/t/t4080-diff-process.sh @@ -0,0 +1,338 @@ +#!/bin/sh + +test_description='diff process via long-running process' + +. ./test-lib.sh + +if test_have_prereq PYTHON +then + PYTHON_PATH=$(command -v python3) || PYTHON_PATH=$(command -v python) +fi + +# +# A single parametric diff process. 
+# Usage: diff-process-backend --mode= [--log=] +# +# Modes: +# whole-file - report all lines as changed (default) +# fixed-hunk - always report hunk 5 2 5 2 +# bad-hunk - report out-of-bounds hunk 999 1 999 1 +# zero-hunk - return zero hunks (files considered equivalent) +# error - return status=error for every request +# abort - return status=abort for every request +# crash - read one request then exit without responding +# +setup_backend () { + cat >"$TRASH_DIRECTORY/diff-process-backend.py" <<-\PYEOF + import sys, os + + def read_pkt(): + hdr = sys.stdin.buffer.read(4) + if len(hdr) < 4: return None + length = int(hdr, 16) + if length == 0: return "" + data = sys.stdin.buffer.read(length - 4) + return data.decode().rstrip("\n") + + def write_pkt(line): + data = (line + "\n").encode() + sys.stdout.buffer.write(f"{len(data)+4:04x}".encode() + data) + sys.stdout.buffer.flush() + + def write_flush(): + sys.stdout.buffer.write(b"0000") + sys.stdout.buffer.flush() + + def read_content(): + chunks = [] + while True: + hdr = sys.stdin.buffer.read(4) + if len(hdr) < 4: break + length = int(hdr, 16) + if length == 0: break + chunks.append(sys.stdin.buffer.read(length - 4)) + return b"".join(chunks) + + mode = "whole-file" + logfile = None + for arg in sys.argv[1:]: + if arg.startswith("--mode="): + mode = arg[7:] + elif arg.startswith("--log="): + logfile = open(arg[6:], "a") + + def log(msg): + if logfile: + logfile.write(msg + "\n") + logfile.flush() + + # Handshake + assert read_pkt() == "git-diff-client" + assert read_pkt() == "version=1" + read_pkt() + write_pkt("git-diff-server") + write_pkt("version=1") + write_flush() + while True: + p = read_pkt() + if p == "": break + write_pkt("capability=hunks") + write_flush() + + log("ready") + + while True: + cmd = None + pathname = None + while True: + p = read_pkt() + if p is None: sys.exit(0) + if p == "": break + if p.startswith("command="): cmd = p.split("=",1)[1] + if p.startswith("pathname="): pathname = 
p.split("=",1)[1]
+	    if cmd is None: sys.exit(0)
+	    old = read_content()
+	    new = read_content()
+	    log(f"command={cmd} pathname={pathname}")
+
+	    if mode == "error":
+	        write_flush()
+	        write_pkt("status=error")
+	        write_flush()
+	        continue
+
+	    if mode == "abort":
+	        write_flush()
+	        write_pkt("status=abort")
+	        write_flush()
+	        continue
+
+	    if mode == "crash":
+	        sys.exit(1)
+
+	    if cmd == "hunks":
+	        if mode == "fixed-hunk":
+	            write_pkt("hunk 5 2 5 2")
+	        elif mode == "bad-hunk":
+	            write_pkt("hunk 999 1 999 1")
+	        elif mode == "zero-hunk":
+	            pass
+	        else:
+	            # A trailing newline terminates the last line; it does not start another.
+	            ol, nl = [b.count(b"\n") + bool(b and not b.endswith(b"\n")) for b in (old, new)]
+	            write_pkt(f"hunk 1 {ol} 1 {nl}")
+	        write_flush()
+	        write_pkt("status=success")
+	        write_flush()
+	    else:
+	        write_flush()
+	        write_pkt("status=error")
+	        write_flush()
+	PYEOF
+	write_script diff-process-backend <<-SHEOF
+	exec "$PYTHON_PATH" "$TRASH_DIRECTORY/diff-process-backend.py" "\$@"
+	SHEOF
+}
+
+BACKEND="./diff-process-backend"
+
+test_expect_success PYTHON 'setup' '
+	setup_backend &&
+	echo "*.c diff=cdiff" >.gitattributes &&
+	git add .gitattributes &&
+	git commit -m "initial"
+'
+
+test_expect_success PYTHON 'diff process hunk boundaries affect output' '
+	cat >boundary.c <<-\EOF &&
+	line1
+	line2
+	line3
+	line4
+	OLD5
+	OLD6
+	line7
+	line8
+	OLD9
+	OLD10
+	EOF
+	git add boundary.c &&
+	git commit -m "add boundary.c" &&
+
+	cat >boundary.c <<-\EOF &&
+	line1
+	line2
+	line3
+	line4
+	NEW5
+	NEW6
+	line7
+	line8
+	NEW9
+	NEW10
+	EOF
+
+	# The file has changes at lines 5-6 and 9-10, but fixed-hunk
+	# only reports lines 5-6 as changed. Lines 9-10 should not
+	# appear as changed in the output.
+	git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \
+		diff boundary.c >actual &&
+	grep "^-OLD5" actual &&
+	grep "^-OLD6" actual &&
+	grep "^+NEW5" actual &&
+	grep "^+NEW6" actual &&
+	! grep "^-OLD9" actual &&
+	! grep "^-OLD10" actual &&
+	! grep "^+NEW9" actual &&
+	!
grep "^+NEW10" actual +' + +test_expect_success PYTHON 'diff process fallback on tool error status' ' + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff boundary.c >actual && + # Fallback produces the full builtin diff (both change regions). + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual && + # Tool was contacted (it replied with error, not crash). + grep "command=hunks pathname=boundary.c" backend.log +' + +test_expect_success PYTHON 'diff process fallback on bad hunks' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-hunk" \ + diff boundary.c >actual && + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual +' + +test_expect_success PYTHON 'diff process fallback on tool crash' ' + git -c diff.cdiff.process="$BACKEND --mode=crash" \ + diff boundary.c >actual && + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual +' + +test_expect_success PYTHON 'diff process abort disables for session' ' + cat >abort1.c <<-\EOF && + int first(void) { return 1; } + EOF + cat >abort2.c <<-\EOF && + int second(void) { return 2; } + EOF + git add abort1.c abort2.c && + git commit -m "add abort files" && + + cat >abort1.c <<-\EOF && + int first(void) { return 10; } + EOF + cat >abort2.c <<-\EOF && + int second(void) { return 20; } + EOF + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --mode=abort --log=backend.log" \ + diff -- abort1.c abort2.c >actual && + # Both files should still produce diff output via fallback. + grep "return 10" actual && + grep "return 20" actual && + # The tool aborts on the first file and git clears its + # capability. The second file never contacts the tool, + # so the log should have exactly one entry, not two. 
+ grep "command=hunks" backend.log >matches && + test_line_count = 1 matches +' + +test_expect_success PYTHON 'diff process handles multiple files' ' + cat >multi1.c <<-\EOF && + int one(void) { return 1; } + EOF + cat >multi2.c <<-\EOF && + int two(void) { return 2; } + EOF + git add multi1.c multi2.c && + git commit -m "add multi files" && + + cat >multi1.c <<-\EOF && + int one(void) { return 10; } + EOF + cat >multi2.c <<-\EOF && + int two(void) { return 20; } + EOF + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- multi1.c multi2.c >actual && + grep "return 10" actual && + grep "return 20" actual && + grep "pathname=multi1.c" backend.log && + grep "pathname=multi2.c" backend.log +' + +test_expect_success PYTHON 'diff process with --word-diff' ' + cat >worddiff.c <<-\EOF && + int value(void) { return 1; } + EOF + git add worddiff.c && + git commit -m "add worddiff.c" && + + cat >worddiff.c <<-\EOF && + int value(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND" \ + diff --word-diff worddiff.c >actual && + grep "\[-1;-\]" actual && + grep "{+999;+}" actual +' + +test_expect_success PYTHON 'diff process bypassed by --diff-algorithm' ' + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --diff-algorithm=patience worddiff.c >actual && + grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success PYTHON 'diff process works with git log -p' ' + cat >logtest.c <<-\EOF && + int logfunc(void) { return 1; } + EOF + git add logtest.c && + git commit -m "add logtest.c" && + + cat >logtest.c <<-\EOF && + int logfunc(void) { return 2; } + EOF + git add logtest.c && + git commit -m "change logtest.c" && + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + log -1 -p -- logtest.c >actual && + grep "return 2" actual && + grep "command=hunks pathname=logtest.c" backend.log +' + +test_expect_success PYTHON 'diff process 
zero hunks suppresses diff output' ' + cat >zerohunk.c <<-\EOF && + int zero(void) { return 0; } + EOF + git add zerohunk.c && + git commit -m "add zerohunk.c" && + + cat >zerohunk.c <<-\EOF && + int zero(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND --mode=zero-hunk" \ + diff zerohunk.c >actual && + test_must_be_empty actual +' + +test_done From bc8a331f47ca1541cf41d8586d2ac54abc2469c5 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:38:49 -0700 Subject: [PATCH 4/5] blame: consult diff process for zero-hunk detection When a diff process is configured via diff..process, consult it during blame's per-commit diffing. If the process returns zero hunks for a commit's changes to a file, treat the commit as having no changes, causing blame to attribute lines to earlier commits. The subprocess is long-running (one startup cost amortized across the blame traversal), but each commit in the file's history incurs a round-trip to the tool. Signed-off-by: Michael Montalbo --- Documentation/gitattributes.adoc | 3 +++ blame.c | 43 +++++++++++++++++++++++++++++--- t/t4080-diff-process.sh | 32 ++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index cc724f8c636671..7d66fa3aa12e31 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -857,6 +857,9 @@ The tool responds with lines of the form If the tool returns zero hunks with `status=success`, Git treats the file as having no changes and produces no diff output. +`git blame` also consults the diff process and skips commits +where it reports zero hunks, attributing lines to earlier commits +instead. Tools should ignore unknown keys in the per-file request to remain forward-compatible. 
diff --git a/blame.c b/blame.c index a3c49d132e4ae1..8a5f14db7a0f87 100644 --- a/blame.c +++ b/blame.c @@ -19,6 +19,8 @@ #include "tag.h" #include "trace2.h" #include "blame.h" +#include "diff-process.h" +#include "userdiff.h" #include "alloc.h" #include "commit-slab.h" #include "bloom.h" @@ -315,16 +317,47 @@ static struct commit *fake_working_tree_commit(struct repository *r, static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, - xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) + xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, + int xdl_opts, struct index_state *istate, + const char *path) { xpparam_t xpp = {0}; xdemitconf_t xecfg = {0}; xdemitcb_t ecb = {NULL}; + struct xdl_hunk *ext_hunks = NULL; + int ret; xpp.flags = xdl_opts; xecfg.hunk_func = hunk_func; ecb.priv = cb_data; - return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + + if (path && istate) { + struct userdiff_driver *drv; + drv = userdiff_find_by_path(istate, path); + if (drv && drv->process) { + size_t nr = 0; + if (!diff_process_get_hunks(drv, path, + file_a->ptr, file_a->size, + file_b->ptr, file_b->size, + &ext_hunks, &nr)) { + if (!nr) { + /* + * Zero hunks: the diff process + * considers these files equivalent. + * Skip so blame looks past this + * commit. 
+ */ + return 0; + } + xpp.external_hunks = ext_hunks; + xpp.external_hunks_nr = nr; + } + } + } + + ret = xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + free(ext_hunks); + return ret; } static const char *get_next_line(const char *start, const char *end) @@ -1961,7 +1994,8 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, &sb->num_read_blob, ignore_diffs); sb->num_get_patch++; - if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts)) + if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts, + sb->revs->diffopt.repo->index, target->path)) die("unable to generate diff (%s -> %s)", oid_to_hex(&parent->commit->object.oid), oid_to_hex(&target->commit->object.oid)); @@ -2114,7 +2148,8 @@ static void find_copy_in_blob(struct blame_scoreboard *sb, * file_p partially may match that image. */ memset(split, 0, sizeof(struct blame_entry [3])); - if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts)) + if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts, + NULL, NULL)) die("unable to generate diff (%s)", oid_to_hex(&parent->commit->object.oid)); /* remainder, if any, all match the preimage */ diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index 6f49f4e66bb8e3..5ed644b786a9c0 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -335,4 +335,36 @@ test_expect_success PYTHON 'diff process zero hunks suppresses diff output' ' test_must_be_empty actual ' +test_expect_success PYTHON 'blame skips commits with zero hunks from diff process' ' + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "add blame.c" && + + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "reformat blame.c" && + BLAME_COMMIT=$(git rev-parse --short HEAD) && + + # Without zero-hunk mode, blame attributes the change. 
+ git blame blame.c >without && + grep "$BLAME_COMMIT" without && + + # With zero-hunk mode, the process considers the files equivalent + # and blame skips the reformat commit. + git -c diff.cdiff.process="$BACKEND --mode=zero-hunk" \ + blame blame.c >with && + ! grep "$BLAME_COMMIT" with +' + + test_done From 981859864320bfce309314f95f41cb3a1ddfbb3c Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:39:25 -0700 Subject: [PATCH 5/5] diff-process-normalize: add built-in whitespace normalizer Add git diff-process-normalize as a minimal reference implementation of the diff process protocol. It compares files line by line using xdiff_compare_lines() with XDF_IGNORE_WHITESPACE (same logic as "git diff -w"). If all lines match, it returns zero hunks; otherwise it returns an error so git falls back to the builtin diff algorithm. [diff "cdiff"] process = git diff-process-normalize Update documentation to describe the diff process protocol and the built-in normalize tool. Signed-off-by: Michael Montalbo --- Documentation/config/diff.adoc | 11 +-- Documentation/gitattributes.adoc | 16 ++++ Makefile | 1 + builtin.h | 1 + builtin/diff-process-normalize.c | 143 +++++++++++++++++++++++++++++++ git.c | 1 + t/t4080-diff-process.sh | 60 +++++++++++++ 7 files changed, 228 insertions(+), 5 deletions(-) create mode 100644 builtin/diff-process-normalize.c diff --git a/Documentation/config/diff.adoc b/Documentation/config/diff.adoc index 4ab5f60df685d0..15396aa238086e 100644 --- a/Documentation/config/diff.adoc +++ b/Documentation/config/diff.adoc @@ -219,11 +219,12 @@ endif::git-diff[] conversion outputs. See linkgit:gitattributes[5] for details. `diff.<driver>.process`:: - The command to run as a long-running diff process. - The tool communicates via the pkt-line protocol and returns - hunks that are fed into Git's diff and blame pipelines. - If the tool returns zero hunks, the file is treated as - unchanged for both diff output and blame attribution. 
+ The command to run as a long-running diff process that + provides custom line-matching. Unlike `textconv`, the + display shows original content; the tool controls which + lines are marked as changed. If the tool returns zero + hunks, the file is treated as unchanged for diff output + and blame attribution. See linkgit:gitattributes[5] for details. `diff.indentHeuristic`:: diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index 7d66fa3aa12e31..9717235a1ad15c 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -861,6 +861,22 @@ the file as having no changes and produces no diff output. where it reports zero hunks, attributing lines to earlier commits instead. +As a minimal reference implementation, Git ships with +`git diff-process-normalize`, which detects whitespace-only +changes. Files whose only differences are whitespace produce +zero hunks; files with non-whitespace changes fall back to +the builtin diff algorithm. To use it: + +---------------------------------------------------------------- +[diff "cdiff"] + process = git diff-process-normalize +---------------------------------------------------------------- + +This is useful after running a code formatter: `git diff` shows +no output for files that only had whitespace changes, and +`git blame` skips whitespace-only commits automatically without +requiring a `.git-blame-ignore-revs` file. + Tools should ignore unknown keys in the per-file request to remain forward-compatible. 
diff --git a/Makefile b/Makefile index 22900368dd59b5..01acfaf7b80764 100644 --- a/Makefile +++ b/Makefile @@ -1409,6 +1409,7 @@ BUILTIN_OBJS += builtin/diagnose.o BUILTIN_OBJS += builtin/diff-files.o BUILTIN_OBJS += builtin/diff-index.o BUILTIN_OBJS += builtin/diff-pairs.o +BUILTIN_OBJS += builtin/diff-process-normalize.o BUILTIN_OBJS += builtin/diff-tree.o BUILTIN_OBJS += builtin/diff.o BUILTIN_OBJS += builtin/difftool.o diff --git a/builtin.h b/builtin.h index 235c51f30e5380..c713a0417f615f 100644 --- a/builtin.h +++ b/builtin.h @@ -178,6 +178,7 @@ int cmd_diff_files(int argc, const char **argv, const char *prefix, struct repos int cmd_diff_index(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff_pairs(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_diff_process_normalize(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff_tree(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_difftool(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_env__helper(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/diff-process-normalize.c b/builtin/diff-process-normalize.c new file mode 100644 index 00000000000000..7cb4430ad455be --- /dev/null +++ b/builtin/diff-process-normalize.c @@ -0,0 +1,143 @@ +/* + * Minimal reference implementation of a diff process. Returns + * zero hunks for files whose only differences are whitespace, + * and status=error otherwise (falling back to the builtin diff). + * + * Uses xdiff_compare_lines() with XDF_IGNORE_WHITESPACE, + * giving the same whitespace handling as "git diff -w". + */ + +#include "builtin.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "xdiff-interface.h" + +/* + * Read a single pkt-line. 
Returns 1 for data, 0 for flush, -1 for EOF. + */ +static int read_pkt(int fd, struct strbuf *line) +{ + int len; + char *data; + + if (packet_read_line_gently(fd, &len, &data) < 0) + return -1; + if (!data || !len) + return 0; /* flush */ + strbuf_reset(line); + strbuf_add(line, data, len); + strbuf_rtrim(line); + return 1; +} + +/* + * Read packetized content until a flush packet. + */ +static int read_content(int fd, struct strbuf *out) +{ + strbuf_reset(out); + if (read_packetized_to_strbuf(fd, out, PACKET_READ_GENTLE_ON_EOF) < 0) + return -1; + return 0; +} + +/* + * Compare two buffers line by line using xdiff_compare_lines() with + * XDF_IGNORE_WHITESPACE (same logic as "git diff -w"). + * Returns 1 if all lines match, 0 otherwise. + */ +static int whitespace_equivalent(const char *a, long size_a, + const char *b, long size_b) +{ + const char *ea = a + size_a; + const char *eb = b + size_b; + + while (a < ea && b < eb) { + const char *eol_a = memchr(a, '\n', ea - a); + const char *eol_b = memchr(b, '\n', eb - b); + long len_a = (eol_a ? eol_a : ea) - a; + long len_b = (eol_b ? eol_b : eb) - b; + + if (!xdiff_compare_lines(a, len_a, b, len_b, + XDF_IGNORE_WHITESPACE)) + return 0; + + a += len_a + (eol_a ? 1 : 0); + b += len_b + (eol_b ? 
1 : 0); + } + + /* Both sides must be exhausted */ + return a >= ea && b >= eb; +} + +int cmd_diff_process_normalize(int argc UNUSED, const char **argv UNUSED, + const char *prefix UNUSED, + struct repository *repo UNUSED) +{ + struct strbuf line = STRBUF_INIT; + struct strbuf old_content = STRBUF_INIT; + struct strbuf new_content = STRBUF_INIT; + int ret; + + /* Handshake: read client greeting */ + ret = read_pkt(0, &line); + if (ret <= 0 || strcmp(line.buf, "git-diff-client")) + return 1; + ret = read_pkt(0, &line); + if (ret <= 0 || strcmp(line.buf, "version=1")) + return 1; + read_pkt(0, &line); /* flush */ + + /* Send server greeting */ + packet_write_fmt(1, "git-diff-server\n"); + packet_write_fmt(1, "version=1\n"); + packet_flush(1); + + /* Read client capabilities until flush */ + while ((ret = read_pkt(0, &line)) > 0) + ; /* consume */ + + /* Send our capabilities */ + packet_write_fmt(1, "capability=hunks\n"); + packet_flush(1); + + /* Main loop: process file pairs */ + for (;;) { + int have_command = 0; + + /* Read request headers until flush */ + while ((ret = read_pkt(0, &line)) > 0) { + if (starts_with(line.buf, "command=")) + have_command = 1; + } + if (ret < 0) + break; /* EOF: client closed connection */ + if (!have_command) + break; + + /* Read old file content */ + if (read_content(0, &old_content) < 0) + break; + /* Read new file content */ + if (read_content(0, &new_content) < 0) + break; + + if (whitespace_equivalent(old_content.buf, old_content.len, + new_content.buf, new_content.len)) { + /* Whitespace-only differences */ + packet_flush(1); /* zero hunks */ + packet_write_fmt(1, "status=success\n"); + packet_flush(1); + } else { + /* Non-whitespace differences: fall back */ + packet_flush(1); + packet_write_fmt(1, "status=error\n"); + packet_flush(1); + } + } + + strbuf_release(&line); + strbuf_release(&old_content); + strbuf_release(&new_content); + return 0; +} diff --git a/git.c b/git.c index 5a40eab8a26a66..6239240b021f1d 100644 --- 
a/git.c +++ b/git.c @@ -568,6 +568,7 @@ static struct cmd_struct commands[] = { { "diff-files", cmd_diff_files, RUN_SETUP | NEED_WORK_TREE | NO_PARSEOPT }, { "diff-index", cmd_diff_index, RUN_SETUP | NO_PARSEOPT }, { "diff-pairs", cmd_diff_pairs, RUN_SETUP | NO_PARSEOPT }, + { "diff-process-normalize", cmd_diff_process_normalize, NO_PARSEOPT }, { "diff-tree", cmd_diff_tree, RUN_SETUP | NO_PARSEOPT }, { "difftool", cmd_difftool, RUN_SETUP_GENTLY }, { "fast-export", cmd_fast_export, RUN_SETUP }, diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index 5ed644b786a9c0..a6fa1df456d7f2 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -366,5 +366,65 @@ test_expect_success PYTHON 'blame skips commits with zero hunks from diff proces ! grep "$BLAME_COMMIT" with ' +NORMALIZE="git diff-process-normalize" + +test_expect_success 'diff-process-normalize setup' ' + echo "*.c diff=cdiff" >.gitattributes && + git add .gitattributes && + test_commit normalize-base +' + +test_expect_success 'diff-process-normalize suppresses whitespace-only changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add ws.c && + git commit -m "add ws.c" && + + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + test_must_be_empty actual +' + +test_expect_success 'diff-process-normalize falls back on non-whitespace changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + + int added_function(void) + { + return 99; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + grep "added_function" actual +' + +test_expect_success 'diff-process-normalize falls back on mixed whitespace and real changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 42; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + grep "return 42" actual +' test_done