diff --git a/cmd/msgvault/cmd/account_scope.go b/cmd/msgvault/cmd/account_scope.go new file mode 100644 index 00000000..b345c905 --- /dev/null +++ b/cmd/msgvault/cmd/account_scope.go @@ -0,0 +1,104 @@ +package cmd + +import ( + "errors" + "fmt" + + "github.com/wesm/msgvault/internal/store" +) + +// AccountScope is the result of resolving a user-supplied --account +// flag against the store. +type AccountScope struct { + Input string + Source *store.Source + Collection *store.CollectionWithSources +} + +// IsEmpty reports whether the scope resolved to nothing. +func (s AccountScope) IsEmpty() bool { + return s.Source == nil && s.Collection == nil +} + +// IsCollection reports whether the scope refers to a collection. +func (s AccountScope) IsCollection() bool { + return s.Collection != nil +} + +// SourceIDs returns the source IDs that this scope expands to. +func (s AccountScope) SourceIDs() []int64 { + switch { + case s.Collection != nil: + return append([]int64(nil), s.Collection.SourceIDs...) + case s.Source != nil: + return []int64{s.Source.ID} + } + return nil +} + +// DisplayName returns a human-readable label for the scope. +func (s AccountScope) DisplayName() string { + switch { + case s.Collection != nil: + return s.Collection.Name + case s.Source != nil: + return s.Source.Identifier + } + return "" +} + +// ResolveAccount resolves a user-supplied --account string against +// the store. Collections are checked first, then sources. +func ResolveAccount( + st *store.Store, input string, +) (AccountScope, error) { + scope := AccountScope{Input: input} + if input == "" { + return scope, nil + } + + // Try collection first. + coll, err := st.GetCollectionByName(input) + switch { + case err == nil: + scope.Collection = coll + return scope, nil + case errors.Is(err, store.ErrCollectionNotFound): + // Fall through to source lookup. + default: + return scope, fmt.Errorf( + "look up collection %q: %w", input, err, + ) + } + + // Source lookup. 
+ sources, err := st.GetSourcesByIdentifierOrDisplayName(input) + if err != nil { + return scope, fmt.Errorf( + "look up source for %q: %w", input, err, + ) + } + if len(sources) == 0 { + return scope, fmt.Errorf( + "no collection or source found for %q "+ + "(try 'msgvault collections list' or "+ + "'msgvault list-accounts')", + input, + ) + } + if len(sources) > 1 { + names := make([]string, 0, len(sources)) + for _, s := range sources { + names = append(names, fmt.Sprintf( + "%s (%s, id=%d)", + s.Identifier, s.SourceType, s.ID, + )) + } + return scope, fmt.Errorf( + "ambiguous account %q matches multiple sources: %v", + input, names, + ) + } + scope.Source = sources[0] + return scope, nil +} diff --git a/cmd/msgvault/cmd/build_cache.go b/cmd/msgvault/cmd/build_cache.go index 6f25a87a..65265780 100644 --- a/cmd/msgvault/cmd/build_cache.go +++ b/cmd/msgvault/cmd/build_cache.go @@ -680,7 +680,7 @@ func setupSQLiteSource(duckDB *sql.DB, dbPath string) (cleanup func(), err error query string typeOverrides string // DuckDB types parameter for read_csv_auto (empty = infer all) }{ - {"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type FROM messages WHERE sent_at IS NOT NULL", + {"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type FROM messages WHERE sent_at IS NOT NULL AND deleted_at IS NULL", "types={'sent_at': 'TIMESTAMP', 'deleted_from_source_at': 'TIMESTAMP'}"}, {"message_recipients", "SELECT message_id, participant_id, recipient_type, display_name FROM message_recipients", ""}, {"message_labels", "SELECT message_id, label_id FROM message_labels", ""}, diff --git a/cmd/msgvault/cmd/build_cache_test.go b/cmd/msgvault/cmd/build_cache_test.go index e4e74ab0..6b1ff257 100644 --- 
a/cmd/msgvault/cmd/build_cache_test.go +++ b/cmd/msgvault/cmd/build_cache_test.go @@ -55,6 +55,7 @@ func setupTestSQLite(t *testing.T) (string, func()) { deleted_from_source_at TIMESTAMP, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email', + deleted_at DATETIME, UNIQUE(source_id, source_message_id) ); @@ -1133,7 +1134,7 @@ func TestBuildCache_EmptyDatabase(t *testing.T) { db, _ := sql.Open("sqlite3", dbPath) _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email'); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email', deleted_at DATETIME); CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT, domain TEXT, display_name TEXT, phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); @@ -1333,7 +1334,7 @@ func BenchmarkBuildCache(b *testing.B) { // Create schema _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT 
NULL DEFAULT 'email'); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email', deleted_at DATETIME); CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT, phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); @@ -1427,6 +1428,7 @@ func setupTestSQLiteEmpty(t *testing.T) (string, func()) { deleted_from_source_at TIMESTAMP, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email', + deleted_at DATETIME, UNIQUE(source_id, source_message_id) ); CREATE TABLE participants ( @@ -1991,7 +1993,7 @@ func BenchmarkBuildCacheIncremental(b *testing.B) { // Create schema and initial data (10000 messages) _, _ = db.Exec(` CREATE TABLE sources (id INTEGER PRIMARY KEY, identifier TEXT); - CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email'); + CREATE TABLE messages (id INTEGER PRIMARY KEY, source_id INTEGER, source_message_id TEXT, sent_at TIMESTAMP, size_estimate INTEGER, has_attachments BOOLEAN, subject TEXT, snippet TEXT, conversation_id INTEGER, deleted_from_source_at TIMESTAMP, attachment_count INTEGER DEFAULT 0, sender_id INTEGER, message_type TEXT NOT NULL DEFAULT 'email', deleted_at DATETIME); CREATE TABLE participants (id INTEGER PRIMARY KEY, email_address TEXT UNIQUE, domain TEXT, display_name TEXT, 
phone_number TEXT); CREATE TABLE message_recipients (message_id INTEGER, participant_id INTEGER, recipient_type TEXT, display_name TEXT); CREATE TABLE labels (id INTEGER PRIMARY KEY, name TEXT); diff --git a/cmd/msgvault/cmd/collections.go b/cmd/msgvault/cmd/collections.go new file mode 100644 index 00000000..f5a2caf7 --- /dev/null +++ b/cmd/msgvault/cmd/collections.go @@ -0,0 +1,238 @@ +package cmd + +import ( + "fmt" + "os" + "strconv" + "strings" + "text/tabwriter" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/store" +) + +var collectionsCmd = &cobra.Command{ + Use: "collections", + Short: "Manage named groups of accounts", + Long: `Collections are named groupings of accounts that let you view and +deduplicate across multiple sources as one unified archive. + +A default "All" collection is created automatically and includes +every account.`, +} + +var collectionsCreateCmd = &cobra.Command{ + Use: "create --accounts ", + Short: "Create a new collection", + Args: cobra.ExactArgs(1), + RunE: runCollectionsCreate, +} + +var collectionsListCmd = &cobra.Command{ + Use: "list", + Short: "List all collections", + RunE: runCollectionsList, +} + +var collectionsShowCmd = &cobra.Command{ + Use: "show ", + Short: "Show collection details", + Args: cobra.ExactArgs(1), + RunE: runCollectionsShow, +} + +var collectionsAddCmd = &cobra.Command{ + Use: "add --accounts ", + Short: "Add accounts to a collection", + Args: cobra.ExactArgs(1), + RunE: runCollectionsAdd, +} + +var collectionsRemoveCmd = &cobra.Command{ + Use: "remove --accounts ", + Short: "Remove accounts from a collection", + Args: cobra.ExactArgs(1), + RunE: runCollectionsRemove, +} + +var collectionsDeleteCmd = &cobra.Command{ + Use: "delete ", + Short: "Delete a collection (sources and messages are untouched)", + Args: cobra.ExactArgs(1), + RunE: runCollectionsDelete, +} + +var collectionsAccounts string + +func runCollectionsCreate(_ *cobra.Command, args []string) error { + st, err := 
openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + name := args[0] + sourceIDs, err := resolveAccountList(st, collectionsAccounts) + if err != nil { + return err + } + + coll, err := st.CreateCollection(name, "", sourceIDs) + if err != nil { + return err + } + fmt.Printf("Created collection %q with %d source(s).\n", + coll.Name, len(sourceIDs)) + return nil +} + +func runCollectionsList(_ *cobra.Command, _ []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + collections, err := st.ListCollections() + if err != nil { + return err + } + if len(collections) == 0 { + fmt.Println("No collections.") + return nil + } + + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + _, _ = fmt.Fprintln(w, "NAME\tSOURCES\tMESSAGES") + for _, c := range collections { + _, _ = fmt.Fprintf(w, "%s\t%d\t%s\n", + c.Name, len(c.SourceIDs), + formatCount(c.MessageCount)) + } + _ = w.Flush() + return nil +} + +func runCollectionsShow(_ *cobra.Command, args []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + coll, err := st.GetCollectionByName(args[0]) + if err != nil { + return err + } + + fmt.Printf("Collection: %s\n", coll.Name) + if coll.Description != "" { + fmt.Printf("Description: %s\n", coll.Description) + } + fmt.Printf("Sources: %d\n", len(coll.SourceIDs)) + fmt.Printf("Messages: %s\n", formatCount(coll.MessageCount)) + fmt.Printf("Created: %s\n", coll.CreatedAt.Format("2006-01-02 15:04")) + + if len(coll.SourceIDs) > 0 { + fmt.Println("\nMember source IDs:", coll.SourceIDs) + } + return nil +} + +func runCollectionsAdd(_ *cobra.Command, args []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + sourceIDs, err := resolveAccountList(st, collectionsAccounts) + if err != nil { + return err + } + + if err := 
st.AddSourcesToCollection(args[0], sourceIDs); err != nil { + return err + } + fmt.Printf("Added %d source(s) to %q.\n", len(sourceIDs), args[0]) + return nil +} + +func runCollectionsRemove(_ *cobra.Command, args []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + sourceIDs, err := resolveAccountList(st, collectionsAccounts) + if err != nil { + return err + } + + if err := st.RemoveSourcesFromCollection(args[0], sourceIDs); err != nil { + return err + } + fmt.Printf("Removed %d source(s) from %q.\n", len(sourceIDs), args[0]) + return nil +} + +func runCollectionsDelete(_ *cobra.Command, args []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + if err := st.DeleteCollection(args[0]); err != nil { + return err + } + fmt.Printf("Deleted collection %q.\n", args[0]) + return nil +} + +func resolveAccountList(st *store.Store, accounts string) ([]int64, error) { + if accounts == "" { + return nil, fmt.Errorf("--accounts is required") + } + parts := strings.Split(accounts, ",") + var ids []int64 + for _, p := range parts { + p = strings.TrimSpace(p) + if p == "" { + continue + } + // Try as numeric ID first + if id, err := strconv.ParseInt(p, 10, 64); err == nil { + ids = append(ids, id) + continue + } + // Resolve by identifier + scope, err := ResolveAccount(st, p) + if err != nil { + return nil, err + } + ids = append(ids, scope.SourceIDs()...) 
+ } + if len(ids) == 0 { + return nil, fmt.Errorf("no valid accounts in --accounts") + } + return ids, nil +} + +func init() { + rootCmd.AddCommand(collectionsCmd) + collectionsCmd.AddCommand(collectionsCreateCmd) + collectionsCmd.AddCommand(collectionsListCmd) + collectionsCmd.AddCommand(collectionsShowCmd) + collectionsCmd.AddCommand(collectionsAddCmd) + collectionsCmd.AddCommand(collectionsRemoveCmd) + collectionsCmd.AddCommand(collectionsDeleteCmd) + + collectionsCreateCmd.Flags().StringVar(&collectionsAccounts, + "accounts", "", "Comma-separated account emails or source IDs") + collectionsAddCmd.Flags().StringVar(&collectionsAccounts, + "accounts", "", "Comma-separated account emails or source IDs") + collectionsRemoveCmd.Flags().StringVar(&collectionsAccounts, + "accounts", "", "Comma-separated account emails or source IDs") +} diff --git a/cmd/msgvault/cmd/deduplicate.go b/cmd/msgvault/cmd/deduplicate.go new file mode 100644 index 00000000..fd9a2b71 --- /dev/null +++ b/cmd/msgvault/cmd/deduplicate.go @@ -0,0 +1,414 @@ +package cmd + +import ( + "bufio" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "time" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/dedup" + "github.com/wesm/msgvault/internal/store" +) + +var deduplicateCmd = &cobra.Command{ + Use: "deduplicate", + Short: "Find and merge duplicate messages within an account", + Long: `Find and merge duplicate messages that were ingested through multiple paths +for the same account (for example, Gmail API sync plus an mbox export of the +same mailbox, or an IMAP sync plus an emlx import). + +Duplicates are grouped by the RFC822 Message-ID header. For each group the +engine selects a survivor, unions the labels from every copy onto the +survivor, and hides the pruned copies in the msgvault database. + +By default, deduplicate ONLY modifies the msgvault database. Your original +source files and remote servers are never modified. 
Hidden rows can be +restored with --undo, so a dedup run is fully reversible. + +Use --dry-run to scan and report without writing anything. +Use --content-hash to also group messages by normalized raw MIME when +Message-ID matching is insufficient. +Use --undo to reverse a previous dedup run.`, + RunE: runDeduplicate, +} + +var ( + dedupDryRun bool + dedupNoBackup bool + dedupPrefer string + dedupContentHash bool + dedupUndo []string + dedupAccount string + dedupDeleteFromSourceSrvr bool + dedupYes bool +) + +func runDeduplicate(cmd *cobra.Command, _ []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + dbPath := cfg.DatabaseDSN() + + deletionsDir := filepath.Join(cfg.Data.DataDir, "deletions") + + preference := dedup.DefaultSourcePreference + if dedupPrefer != "" { + preference = strings.Split(dedupPrefer, ",") + known := make(map[string]bool, len(dedup.DefaultSourcePreference)) + for _, t := range dedup.DefaultSourcePreference { + known[t] = true + } + for i := range preference { + preference[i] = strings.TrimSpace(preference[i]) + if !known[preference[i]] { + fmt.Fprintf(os.Stderr, "Warning: unknown source type in --prefer: %q\n", preference[i]) + } + } + } + + var ( + accountSourceIDs []int64 + canonicalAccount string + ) + if dedupAccount != "" { + scope, err := ResolveAccount(st, dedupAccount) + if err != nil { + return err + } + accountSourceIDs = scope.SourceIDs() + if len(accountSourceIDs) == 0 { + return fmt.Errorf("--account %q resolved to zero sources", dedupAccount) + } + canonicalAccount = scope.DisplayName() + } + + identityAddrs := cfg.IdentityAddressSet() + if len(identityAddrs) > 0 { + logger.Info("dedup identity addresses loaded", + "count", len(identityAddrs)) + } + + config := dedup.Config{ + SourcePreference: preference, + ContentHashFallback: dedupContentHash, + DryRun: dedupDryRun, + AccountSourceIDs: accountSourceIDs, + Account: canonicalAccount, + 
DeleteDupsFromSourceServer: dedupDeleteFromSourceSrvr, + DeletionsDir: deletionsDir, + IdentityAddresses: identityAddrs, + } + + engine := dedup.NewEngine(st, config, logger) + + if len(dedupUndo) > 0 { + var allStillRunning []string + for _, batchID := range dedupUndo { + restored, stillRunning, err := engine.Undo(batchID) + // Undo is best-effort: database rows may have been restored + // even if cancelling pending manifests failed. Always report + // the restored count and any still-running manifests before + // returning the error so the user isn't left thinking the + // undo did nothing. + fmt.Printf("Restored %d messages from batch %q.\n", + restored, batchID) + allStillRunning = append(allStillRunning, stillRunning...) + if err != nil { + printStillRunningWarning(allStillRunning) + fmt.Fprintf(os.Stderr, + "\nError cancelling one or more pending manifests "+ + "for batch %q:\n %v\n", batchID, err) + return fmt.Errorf("undo dedup %q: %w", batchID, err) + } + } + printStillRunningWarning(allStillRunning) + return nil + } + + if len(accountSourceIDs) == 0 { + return runDeduplicatePerSource(cmd, st, dbPath, config) + } + + return runDeduplicateOnce(cmd, st, dbPath, config, engine) +} + +func runDeduplicatePerSource( + cmd *cobra.Command, + st *store.Store, + dbPath string, + cfgBase dedup.Config, +) error { + sources, err := st.ListSources("") + if err != nil { + return fmt.Errorf("list sources: %w", err) + } + if len(sources) == 0 { + fmt.Println("No sources found.") + return nil + } + + fmt.Println( + "No --account specified; deduping each source independently.", + ) + fmt.Println() + + backedUp := false + anyRan := false + var executedBatches []string + for _, src := range sources { + cfgScoped := cfgBase + cfgScoped.AccountSourceIDs = []int64{src.ID} + cfgScoped.Account = src.Identifier + engineScoped := dedup.NewEngine(st, cfgScoped, logger) + + fmt.Printf("--- %s (%s) ---\n", src.Identifier, src.SourceType) + report, err := engineScoped.Scan(cmd.Context()) 
+ if err != nil { + return fmt.Errorf("scan %s: %w", src.Identifier, err) + } + if report.DuplicateGroups == 0 { + fmt.Println(" No duplicates.") + fmt.Println() + continue + } + + anyRan = true + fmt.Print(engineScoped.FormatReport(report)) + if cfgScoped.DryRun { + fmt.Println() + continue + } + + if !dedupYes { + fmt.Printf( + "\nProceed with deduplication for %s? "+ + "This will hide %d duplicates "+ + "(reversible with --undo). [y/N]: ", + src.Identifier, report.DuplicateMessages, + ) + ok, err := readDedupYesNo(cmd) + if err != nil { + return err + } + if !ok { + fmt.Println("Skipped.") + continue + } + } + + if !backedUp && !dedupNoBackup { + backedUp = true + backupPath := fmt.Sprintf( + "%s.dedup-backup-%s", dbPath, + time.Now().Format("20060102-150405"), + ) + fmt.Printf("Backing up database to %s...\n", + filepath.Base(backupPath)) + if err := backupDatabase(st, backupPath); err != nil { + return fmt.Errorf("backup database: %w", err) + } + } + + batchID := fmt.Sprintf( + "dedup-%s-%d-%s", time.Now().Format("20060102-150405"), src.ID, dedup.SanitizeFilenameComponent(src.Identifier), + ) + summary, err := engineScoped.Execute( + cmd.Context(), report, batchID, + ) + if err != nil { + return fmt.Errorf("execute %s: %w", src.Identifier, err) + } + executedBatches = append(executedBatches, summary.BatchID) + printDedupSummary(summary) + fmt.Println() + } + + if cfgBase.DryRun { + fmt.Println("\nDry run complete. 
No changes made.") + } else if !anyRan { + fmt.Println("No duplicates found in any source.") + } else if len(executedBatches) > 1 { + var b strings.Builder + b.WriteString("\nTo undo all of the above:\n msgvault deduplicate") + for _, id := range executedBatches { + fmt.Fprintf(&b, " --undo %s", id) + } + b.WriteString("\n") + fmt.Print(b.String()) + } + return nil +} + +func runDeduplicateOnce( + cmd *cobra.Command, + st *store.Store, + dbPath string, + cfgScoped dedup.Config, + engine *dedup.Engine, +) error { + fmt.Println("Scanning for duplicate messages...") + report, err := engine.Scan(cmd.Context()) + if err != nil { + return fmt.Errorf("scan: %w", err) + } + + fmt.Print(engine.FormatMethodology()) + fmt.Print(engine.FormatReport(report)) + + if cfgScoped.DryRun { + fmt.Println("\nDry run complete. No changes made.") + return nil + } + if report.DuplicateGroups == 0 { + fmt.Println("\nNo duplicates found.") + return nil + } + + if !dedupYes { + fmt.Printf( + "\nProceed with deduplication? This will hide %d "+ + "duplicates (reversible with --undo). 
[y/N]: ", + report.DuplicateMessages, + ) + ok, err := readDedupYesNo(cmd) + if err != nil { + return err + } + if !ok { + fmt.Println("Aborted.") + return nil + } + } + + if !dedupNoBackup { + backupPath := fmt.Sprintf( + "%s.dedup-backup-%s", dbPath, + time.Now().Format("20060102-150405"), + ) + fmt.Printf("Backing up database to %s...\n", + filepath.Base(backupPath)) + if err := backupDatabase(st, backupPath); err != nil { + return fmt.Errorf("backup database: %w", err) + } + } + + batchID := fmt.Sprintf( + "dedup-%s-run-%s", + time.Now().Format("20060102-150405"), + randomBatchToken(), + ) + fmt.Println("Merging duplicates...") + summary, err := engine.Execute(cmd.Context(), report, batchID) + if err != nil { + return fmt.Errorf("execute: %w", err) + } + + printDedupSummary(summary) + fmt.Println("\nTo update analytics cache: " + + "msgvault build-cache --full-rebuild") + return nil +} + +func printDedupSummary(summary *dedup.ExecutionSummary) { + fmt.Printf("\n=== Deduplication Complete ===\n") + fmt.Printf("Batch ID: %s\n", summary.BatchID) + fmt.Printf("Groups merged: %d\n", summary.GroupsMerged) + fmt.Printf("Messages pruned: %d\n", summary.MessagesRemoved) + fmt.Printf("Labels transferred: %d\n", summary.LabelsTransferred) + fmt.Printf("Raw MIME backfilled: %d\n", summary.RawMIMEBackfilled) + + if len(summary.StagedManifests) > 0 { + fmt.Println("\nStaged deletion manifests (pending):") + for _, m := range summary.StagedManifests { + fmt.Printf(" %s [%s] %d messages (%s)\n", + m.ManifestID, m.SourceType, m.MessageCount, m.Account) + } + fmt.Println( + "\nRun 'msgvault delete-staged' to remove the " + + "duplicates from the remote server.", + ) + } + fmt.Printf("\nTo undo: msgvault deduplicate --undo %s\n", + summary.BatchID) +} + +func readDedupYesNo(cmd *cobra.Command) (bool, error) { + reader := bufio.NewReader(cmd.InOrStdin()) + response, err := reader.ReadString('\n') + if err != nil && !errors.Is(err, io.EOF) { + return false, fmt.Errorf("read 
confirmation: %w", err) + } + response = strings.TrimSpace(strings.ToLower(response)) + return response == "y" || response == "yes", nil +} + +// randomBatchToken returns a short random hex token used to disambiguate +// single-run dedup batch IDs from per-source batch IDs that may have been +// generated in the same second. +func randomBatchToken() string { + var b [4]byte + if _, err := rand.Read(b[:]); err != nil { + return fmt.Sprintf("%08x", time.Now().UnixNano()&0xffffffff) + } + return hex.EncodeToString(b[:]) +} + +// backupDatabase writes a point-in-time consistent copy of the SQLite +// database to dst using VACUUM INTO. Unlike a file-system copy of the +// main/-wal/-shm triple, this is atomic and handles uncheckpointed WAL +// pages without any external coordination. +func backupDatabase(st *store.Store, dst string) error { + if _, err := os.Stat(dst); err == nil { + return fmt.Errorf("backup target already exists: %s", dst) + } + if _, err := st.DB().Exec("VACUUM INTO ?", dst); err != nil { + return fmt.Errorf("vacuum into %s: %w", dst, err) + } + return nil +} + +func printStillRunningWarning(ids []string) { + if len(ids) == 0 { + return + } + fmt.Printf( + "\nWarning: the following deletion manifests are already in " + + "progress\nand cannot be cancelled:\n", + ) + for _, id := range ids { + fmt.Printf(" - %s\n", id) + } +} + +func init() { + rootCmd.AddCommand(deduplicateCmd) + deduplicateCmd.Flags().BoolVar(&dedupDryRun, "dry-run", false, + "Scan and report only; do not modify data") + deduplicateCmd.Flags().BoolVar(&dedupNoBackup, "no-backup", false, + "Skip database backup before merging") + deduplicateCmd.Flags().StringVar(&dedupPrefer, "prefer", "", + "Comma-separated source type preference order "+ + "(default: gmail,imap,mbox,emlx,hey)") + deduplicateCmd.Flags().BoolVar(&dedupContentHash, "content-hash", false, + "Also detect duplicates by normalized raw MIME content") + deduplicateCmd.Flags().StringArrayVar(&dedupUndo, "undo", nil, + "Undo 
a previous dedup run by batch ID "+ + "(repeat to undo multiple batches)") + deduplicateCmd.Flags().StringVar(&dedupAccount, "account", "", + "Dedup across all sources for this account") + deduplicateCmd.Flags().BoolVar(&dedupDeleteFromSourceSrvr, + "delete-dups-from-source-server", false, + "DESTRUCTIVE: stage pruned duplicates for remote deletion") + deduplicateCmd.Flags().BoolVarP(&dedupYes, "yes", "y", false, + "Skip confirmation prompt") +} diff --git a/cmd/msgvault/cmd/import_imessage.go b/cmd/msgvault/cmd/import_imessage.go index b537846f..6009f388 100644 --- a/cmd/msgvault/cmd/import_imessage.go +++ b/cmd/msgvault/cmd/import_imessage.go @@ -120,6 +120,12 @@ func runImportImessage(cmd *cobra.Command, _ []string) error { func openStoreAndInit() (*store.Store, error) { dbPath := cfg.DatabaseDSN() + if _, err := os.Stat(dbPath); os.IsNotExist(err) { + return nil, fmt.Errorf( + "database not found: %s\nRun 'msgvault init-db' first", + dbPath, + ) + } s, err := store.Open(dbPath) if err != nil { return nil, fmt.Errorf("open database: %w", err) diff --git a/cmd/msgvault/cmd/list_identities.go b/cmd/msgvault/cmd/list_identities.go new file mode 100644 index 00000000..32c278e0 --- /dev/null +++ b/cmd/msgvault/cmd/list_identities.go @@ -0,0 +1,204 @@ +package cmd + +import ( + "bytes" + "fmt" + "os" + "regexp" + "strings" + "text/tabwriter" + "time" + + "github.com/BurntSushi/toml" + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/store" +) + +var ( + identitiesAccount string + identitiesJSON bool + identitiesTOML bool + identitiesMinCount int64 + identitiesMatch string +) + +var listIdentitiesCmd = &cobra.Command{ + Use: "list-identities", + Short: "List every email address you've likely sent from", + Long: `List every email address that the archive considers a likely "me" +identity, ranked by the number of sent messages attributable to each address. 
+ +Three independent signals are used for detection: + is_from_me messages.is_from_me set at ingest time + sent-label message carries a SENT label + account-match From: address matches the source identifier + +Use --toml to generate a ready-to-paste [identity] config block for +deduplicate's sent-copy detection.`, + RunE: runListIdentities, +} + +func runListIdentities(_ *cobra.Command, _ []string) error { + st, err := openStoreAndInit() + if err != nil { + return err + } + defer func() { _ = st.Close() }() + + var scopeIDs []int64 + scopeLabel := "all" + if identitiesAccount != "" { + scope, err := ResolveAccount(st, identitiesAccount) + if err != nil { + return err + } + scopeIDs = scope.SourceIDs() + if len(scopeIDs) == 0 { + return fmt.Errorf("--account %q resolved to zero sources", identitiesAccount) + } + scopeLabel = scope.DisplayName() + } + + var matcher *regexp.Regexp + if identitiesMatch != "" { + pattern := identitiesMatch + if !strings.HasPrefix(pattern, "(?i)") { + pattern = "(?i)" + pattern + } + matcher, err = regexp.Compile(pattern) + if err != nil { + return fmt.Errorf( + "invalid --match regex %q: %w", + identitiesMatch, err, + ) + } + } + + started := time.Now() + candidates, err := st.ListLikelyIdentities(scopeIDs...) 
+ if err != nil { + return err + } + + if identitiesMinCount > 0 || matcher != nil { + trimmed := make( + []store.IdentityCandidate, 0, len(candidates), + ) + for _, c := range candidates { + if c.MessageCount < identitiesMinCount { + continue + } + if matcher != nil && !matcher.MatchString(c.Email) { + continue + } + trimmed = append(trimmed, c) + } + candidates = trimmed + } + + logger.Info("list-identities", + "scope", scopeLabel, + "count", len(candidates), + "duration_ms", time.Since(started).Milliseconds()) + + if identitiesJSON && identitiesTOML { + return fmt.Errorf("--json and --toml are mutually exclusive") + } + switch { + case identitiesJSON: + return writeIdentitiesJSON(candidates) + case identitiesTOML: + return writeIdentitiesTOML(candidates) + default: + return writeIdentitiesTable(candidates) + } +} + +func writeIdentitiesTOML(candidates []store.IdentityCandidate) error { + if len(candidates) == 0 { + fmt.Println("# no candidates — nothing to paste") + return nil + } + + addrs := make([]string, 0, len(candidates)) + for _, c := range candidates { + addrs = append(addrs, c.Email) + } + + cfg := struct { + Identity struct { + Addresses []string `toml:"addresses"` + } `toml:"identity"` + }{} + cfg.Identity.Addresses = addrs + + var buf bytes.Buffer + if err := toml.NewEncoder(&buf).Encode(cfg); err != nil { + return fmt.Errorf("encode TOML: %w", err) + } + fmt.Print(buf.String()) + return nil +} + +func writeIdentitiesTable(candidates []store.IdentityCandidate) error { + if len(candidates) == 0 { + fmt.Println("No likely sent-from addresses found.") + return nil + } + + w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0) + _, _ = fmt.Fprintln(w, "ADDRESS\tMESSAGES\tSOURCES\tSIGNALS") + for _, c := range candidates { + _, _ = fmt.Fprintf(w, "%s\t%s\t%d\t%s\n", + c.Email, + formatCount(c.MessageCount), + len(c.SourceIDs), + c.Signals.String(), + ) + } + _ = w.Flush() + fmt.Printf("\n%d candidate address(es)\n", len(candidates)) + return nil +} + +func 
writeIdentitiesJSON(candidates []store.IdentityCandidate) error { + payload := make([]map[string]any, 0, len(candidates)) + for _, c := range candidates { + entry := map[string]any{ + "email": c.Email, + "message_count": c.MessageCount, + "source_ids": c.SourceIDs, + "signals": splitSignals(c.Signals), + } + payload = append(payload, entry) + } + return printJSON(payload) +} + +func splitSignals(s store.IdentitySignal) []string { + var out []string + if s&store.SignalFromMe != 0 { + out = append(out, "is_from_me") + } + if s&store.SignalSentLabel != 0 { + out = append(out, "sent_label") + } + if s&store.SignalAccountMatch != 0 { + out = append(out, "account_match") + } + return out +} + +func init() { + listIdentitiesCmd.Flags().StringVar(&identitiesAccount, + "account", "", "Restrict to a single account") + listIdentitiesCmd.Flags().BoolVar(&identitiesJSON, + "json", false, "Output as JSON") + listIdentitiesCmd.Flags().BoolVar(&identitiesTOML, + "toml", false, "Output a ready-to-paste [identity] config block") + listIdentitiesCmd.Flags().Int64Var(&identitiesMinCount, + "min-count", 0, "Drop addresses with fewer than N messages") + listIdentitiesCmd.Flags().StringVar(&identitiesMatch, + "match", "", "Filter by Go regex (case-insensitive)") + rootCmd.AddCommand(listIdentitiesCmd) +} diff --git a/docs/superpowers/plans/2026-04-20-deduplication.md b/docs/superpowers/plans/2026-04-20-deduplication.md new file mode 100644 index 00000000..482de428 --- /dev/null +++ b/docs/superpowers/plans/2026-04-20-deduplication.md @@ -0,0 +1,404 @@ +# Identities, Collections, and Deduplication -- Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **START HERE:** Read the spec first: `docs/superpowers/specs/2026-04-20-deduplicate-command.md` (in this repo). 
Read `CLAUDE.md` for project conventions. Read the prior art table below -- each entry references code on a local git branch that should be ported, not written from scratch. Use `git show <branch>:<path>` to read those files.
+
+**Goal:** Implement the three concepts from the proposal (wesm/msgvault#278): identities tied to accounts, collections of accounts with a default "All" collection, and deduplication at account and collection level.
+
+**Architecture:** Four layers, bottom up:
+1. **Store** -- SQL queries for dedup, identities, and collections (SQLite via mattn/go-sqlite3)
+2. **Engine** -- Dedup scan/execute/undo logic, survivor selection, content-hash fallback
+3. **Query** -- SourceIDs multi-filter and soft-delete exclusion across all query paths, source_filter helper
+4. **CLI + TUI** -- deduplicate command, list-identities, collections CRUD, TUI account selector with collections
+
+**Tech Stack:** Go, SQLite, cobra CLI, Bubble Tea TUI, existing deletion staging (`internal/deletion/`)
+
+**Spec:** `docs/superpowers/specs/2026-04-20-deduplicate-command.md` (in this repo, read it before starting)
+
+**Prior art on existing branches:**
+
+| File | Branch | Lines | What it does |
+|------|--------|-------|-------------|
+| `internal/store/dedup.go` | jesse/dedupe-integration | 521 | FindDuplicatesByRFC822ID, GetDuplicateGroupMessages, MergeDuplicates, UndoDedup, BackfillRFC822IDs, StreamMessageRaw |
+| `internal/store/dedup_test.go` | jesse/dedupe-integration | ~150 | Store dedup tests |
+| `internal/dedup/dedup.go` | jesse/dedupe-integration | 1154 | Engine: Scan, Execute, Undo, survivor selection, content-hash, deletion staging, FormatReport, FormatMethodology |
+| `internal/dedup/dedup_test.go` | jesse/dedupe-integration | 439 | Engine tests |
+| `internal/dedup/dedup_identity_test.go` | jesse/dedupe-integration | 238 | Identity address match tests |
+| `cmd/msgvault/cmd/deduplicate.go` | jesse/dedupe-integration | 462 | CLI with safety defaults 
(--delete-dups-from-source-server) | +| `cmd/msgvault/cmd/deduplicate_test.go` | add-deduplicate-command | 499 | CLI tests | +| `cmd/msgvault/cmd/account_scope.go` | jesse/dedupe-integration | 127 | ResolveAccount helper | +| `internal/store/identities.go` | jesse/dedupe-integration | 215 | ListLikelyIdentities with signal bitmask | +| `internal/store/identities_test.go` | jesse/dedupe-integration | 246 | Identity discovery tests | +| `cmd/msgvault/cmd/list_identities.go` | jesse/dedupe-integration | 303 | list-identities command with --toml, --json, --match | +| `internal/store/merged_accounts.go` | jesse/dedupe-integration | 327 | CRUD for merged accounts + sources | +| `internal/store/merged_accounts_test.go` | jesse/dedupe-integration | 217 | Merged account tests | +| `cmd/msgvault/cmd/merge_accounts.go` | jesse/dedupe-integration | 389 | merge-accounts CLI (create/list/show/add/remove/delete) | +| `internal/query/source_filter.go` | jesse/dedupe-integration | 39 | appendSourceFilter helper for SourceIDs | +| `internal/query/models.go` | jesse/dedupe-integration | diff | SourceIDs on MessageFilter, AggregateOptions, StatsOptions, MergedAccountInfo | +| Soft-delete exclusion | jesse/dedupe-integration (bf49689) | diff | deleted_at IS NULL in all query paths | +| TUI merged accounts | jesse/dedupe-integration (5ac2a58) | diff | Account selector, SourceIDs propagation | + +**What's already on main:** Schema has `deleted_at`, `delete_batch_id`, `rfc822_message_id`, `is_from_me` on messages table. No dedup/identity/collection code exists. No query paths filter on `deleted_at`. + +--- + +## Phase 1: Deduplication + +### Task 1: Store Layer -- Duplicate Finding and Merging + +Port `internal/store/dedup.go` and tests from `jesse/dedupe-integration`. 
+ +**Files:** +- Create: `internal/store/dedup.go` +- Create: `internal/store/dedup_test.go` +- Reference: `git show jesse/dedupe-integration:internal/store/dedup.go` +- Reference: `git show jesse/dedupe-integration:internal/store/dedup_test.go` + +- [ ] **Step 1: Write failing tests for FindDuplicatesByRFC822ID** + +Two messages same rfc822_message_id = one group. After MergeDuplicates, group disappears. + +Run: `go test ./internal/store/ -run TestStore_FindDuplicatesByRFC822ID -v` + +- [ ] **Step 2: Write failing tests for GetDuplicateGroupMessages** + +Sent-label detection, is_from_me, from_email extraction, source scoping. + +- [ ] **Step 3: Write failing tests for MergeDuplicates** + +Label union, raw MIME backfill, soft-delete with batch ID. + +- [ ] **Step 4: Implement store/dedup.go** + +Types: DuplicateGroupKey, DuplicateMessageRow, MergeResult, ContentHashCandidate. Methods: FindDuplicatesByRFC822ID, GetDuplicateGroupMessages, MergeDuplicates, UndoDedup, CountActiveMessages, CountMessagesWithoutRFC822ID, BackfillRFC822IDs, StreamMessageRaw, GetAllRawMIMECandidates. + +Run: `go test ./internal/store/ -run "TestStore_(FindDuplicates|GetDuplicateGroup|MergeDuplicates)" -v` + +- [ ] **Step 5: Write and run tests for UndoDedup and CountActiveMessages** + +- [ ] **Step 6: Format, vet, commit** + +--- + +### Task 2: Dedup Engine -- Scan, Execute, Undo + +Port core engine from `jesse/dedupe-integration:internal/dedup/dedup.go`. + +**Files:** +- Create: `internal/dedup/dedup.go` +- Create: `internal/dedup/dedup_test.go` +- Reference: `git show jesse/dedupe-integration:internal/dedup/dedup.go` +- Reference: `git show jesse/dedupe-integration:internal/dedup/dedup_test.go` + +- [ ] **Step 1: Write failing tests for Scan (RFC822 grouping) and survivor selection** + +Gmail preferred over mbox. Raw MIME tiebreaker. Sent-copy constraint. Three-signal IsSentCopy. 
+ +- [ ] **Step 2: Implement Config, types, NewEngine, Scan, selectSurvivor, isBetter** + +Include three-signal IsSentCopy (SENT label, is_from_me, identity address match). Skip content-hash for now. + +- [ ] **Step 3: Write failing tests for Execute and Undo** + +Execute soft-deletes, transfers labels, backfills MIME. Undo restores. + +- [ ] **Step 4: Implement Execute, Undo, deletion staging (same-source_id guard)** + +- [ ] **Step 5: Implement FormatReport and FormatMethodology** + +- [ ] **Step 6: Format, vet, commit** + +--- + +### Task 3: Dedup Engine -- Content-Hash Fallback + +Port normalized content-hash scanning for header-rewritten duplicates. + +**Files:** +- Modify: `internal/dedup/dedup.go` +- Modify: `internal/dedup/dedup_test.go` + +- [ ] **Step 1: Write failing test for content-hash detection** + +Two messages different rfc822_message_id, identical normalized body. ContentHashFallback=true finds them. + +- [ ] **Step 2: Implement scanNormalizedHashGroups, normalizeRawMIME, worker pool** + +- [ ] **Step 3: Format, vet, commit** + +--- + +### Task 4: Soft-Delete Exclusion Across All Query Paths + +Add `deleted_at IS NULL` to every query path. Port from commit `bf49689`. + +**Files:** +- Modify: `internal/query/sqlite.go` -- optsToFilterConditions, buildFilterJoinsAndConditions, buildSearchQueryParts, GetTotalStats, GetGmailIDsByFilter +- Modify: `internal/query/duckdb.go` -- Search fallback +- Modify: `cmd/msgvault/cmd/build_cache.go` -- messages export query + +- [ ] **Step 1: Add `deleted_at IS NULL` to all SQLite query builders** + +One-line addition in each: `conditions = append(conditions, prefix+"deleted_at IS NULL")` + +- [ ] **Step 2: Add to DuckDB search fallback** + +- [ ] **Step 3: Add `AND deleted_at IS NULL` to Parquet export query** + +- [ ] **Step 4: Run full test suite** -- `go test ./... 
-count=1` + +- [ ] **Step 5: Format, vet, commit** + +--- + +### Task 5: Account Resolution Helper and Deduplicate Command + +Port CLI from `jesse/dedupe-integration`. + +**Files:** +- Create: `cmd/msgvault/cmd/account_scope.go` +- Create: `cmd/msgvault/cmd/deduplicate.go` +- Create: `cmd/msgvault/cmd/deduplicate_test.go` +- Reference: `git show jesse/dedupe-integration:cmd/msgvault/cmd/account_scope.go` +- Reference: `git show jesse/dedupe-integration:cmd/msgvault/cmd/deduplicate.go` +- Reference: `git show add-deduplicate-command:cmd/msgvault/cmd/deduplicate_test.go` + +- [ ] **Step 1: Implement AccountScope and ResolveAccount** + +Skip merged-account lookup for now (stub returns ErrMergedAccountNotFound). Structure so collection lookup slots in later. + +- [ ] **Step 2: Implement deduplicate command** + +Flags: --account, --dry-run, --prefer, --content-hash, --delete-dups-from-source-server, --no-backup, --undo, --yes. Two modes: per-source (default) and scoped (--account). + +- [ ] **Step 3: Port CLI tests, adapt flag names** + +- [ ] **Step 4: Build and smoke test** -- `make build && ./msgvault deduplicate --help` + +- [ ] **Step 5: Format, vet, commit** + +--- + +## Phase 2: Identities + +### Task 6: Identity Discovery Store Layer + +Port `internal/store/identities.go` from `jesse/dedupe-integration`. + +**Files:** +- Create: `internal/store/identities.go` +- Create: `internal/store/identities_test.go` +- Reference: `git show jesse/dedupe-integration:internal/store/identities.go` +- Reference: `git show jesse/dedupe-integration:internal/store/identities_test.go` + +- [ ] **Step 1: Write failing tests for ListLikelyIdentities** + +Test each signal in isolation (is_from_me alone, SENT label without is_from_me, account-identifier match). Negative case: stranger address excluded. Count ordering. Source scoping. Soft-delete exclusion. 
+ +- [ ] **Step 2: Implement IdentityCandidate, IdentitySignal, ListLikelyIdentities** + +- [ ] **Step 3: Format, vet, commit** + +--- + +### Task 7: list-identities Command + +Port `cmd/msgvault/cmd/list_identities.go` from `jesse/dedupe-integration`. + +**Files:** +- Create: `cmd/msgvault/cmd/list_identities.go` +- Reference: `git show jesse/dedupe-integration:cmd/msgvault/cmd/list_identities.go` + +- [ ] **Step 1: Implement list-identities command** + +Flags: --account, --min-count, --match (regex), --json, --toml. Human table output: ADDRESS, MESSAGES, SOURCES, SIGNALS. --toml output produces paste-ready `[identity]` config block. + +- [ ] **Step 2: Build and smoke test** -- `make build && ./msgvault list-identities --help` + +- [ ] **Step 3: Format, vet, commit** + +--- + +### Task 8: Identity Config Integration + +Port `[identity].addresses` config from `jesse/dedupe-integration`. + +**Files:** +- Modify: `internal/config/config.go` -- Add IdentityConfig struct, IdentityAddressSet() method +- Modify: `cmd/msgvault/cmd/deduplicate.go` -- Load identity addresses, pass to engine +- Reference: `git show jesse/dedupe-integration:internal/config/config.go` + +- [ ] **Step 1: Add IdentityConfig to config.go** + +New `[identity]` section with `addresses []string`. `IdentityAddressSet()` returns normalized `map[string]bool`. + +- [ ] **Step 2: Wire into deduplicate command** + +Load `cfg.IdentityAddressSet()`, pass to `dedup.Config.IdentityAddresses`. Log count when loaded. + +- [ ] **Step 3: Port identity-match dedup tests** + +Reference: `git show jesse/dedupe-integration:internal/dedup/dedup_identity_test.go` + +- [ ] **Step 4: Format, vet, commit** + +--- + +## Phase 3: Collections + +### Task 9: Collections Store Layer + +Port and rename from `jesse/dedupe-integration:internal/store/merged_accounts.go`. Rename tables from `merged_accounts` to `collections` and `merged_account_sources` to `collection_sources`. Add `collection_identities` table. 
+ +**Files:** +- Create: `internal/store/collections.go` +- Create: `internal/store/collections_test.go` +- Reference: `git show jesse/dedupe-integration:internal/store/merged_accounts.go` +- Reference: `git show jesse/dedupe-integration:internal/store/merged_accounts_test.go` + +- [ ] **Step 1: Write failing tests for CRUD** + +Create, list, show (with source IDs + message count), add sources, remove sources, delete. Rejection of missing sources and duplicate names. Idempotent add/remove. Deletion leaves sources and messages intact. + +- [ ] **Step 2: Implement collections store** + +Schema: `collections` (id, name, description, created_at), `collection_sources` (collection_id, source_id), `collection_identities` (collection_id, address). Methods: CreateCollection, ListCollections, GetCollectionByName, AddSourcesToCollection, RemoveSourcesFromCollection, DeleteCollection, GetCollectionIdentities, SetCollectionIdentities. + +- [ ] **Step 3: Implement default "All" collection** + +`EnsureDefaultCollection()` creates "All" if it doesn't exist and adds all sources. Called during InitSchema. New sources auto-join "All" on creation (hook in source creation path). + +- [ ] **Step 4: Format, vet, commit** + +--- + +### Task 10: Query Layer -- SourceIDs Multi-Filter + +Port source_filter helper and SourceIDs field from `jesse/dedupe-integration`. + +**Files:** +- Create: `internal/query/source_filter.go` +- Modify: `internal/query/models.go` -- Add SourceIDs to MessageFilter, AggregateOptions, StatsOptions. Add CollectionInfo type (renamed from MergedAccountInfo). 
+- Modify: `internal/query/sqlite.go` -- Use appendSourceFilter everywhere SourceID is checked +- Modify: `internal/query/duckdb.go` -- Same +- Reference: `git show jesse/dedupe-integration:internal/query/source_filter.go` +- Reference: `git diff jesse/active-work..jesse/dedupe-integration -- internal/query/models.go` + +- [ ] **Step 1: Create source_filter.go with appendSourceFilter helper** + +- [ ] **Step 2: Add SourceIDs to MessageFilter, AggregateOptions, StatsOptions** + +Add CollectionInfo type. Update MessageFilter.Clone() to deep-copy SourceIDs. + +- [ ] **Step 3: Update SQLite and DuckDB engines to use appendSourceFilter** + +Replace every manual SourceID check with the shared helper. + +- [ ] **Step 4: Add ListCollections to Engine interface** + +Returns collection rows. DuckDB delegates to embedded SQLite. Remote returns empty. + +- [ ] **Step 5: Run full test suite** + +- [ ] **Step 6: Format, vet, commit** + +--- + +### Task 11: Collections CLI + +Adapt from `jesse/dedupe-integration:cmd/msgvault/cmd/merge_accounts.go`, renamed to `collections`. + +**Files:** +- Create: `cmd/msgvault/cmd/collections.go` +- Modify: `cmd/msgvault/cmd/account_scope.go` -- Add collection lookup to ResolveAccount +- Reference: `git show jesse/dedupe-integration:cmd/msgvault/cmd/merge_accounts.go` + +- [ ] **Step 1: Implement collections command** + +Subcommands: create, list, show, add, remove, delete. `create` runs identity discovery + confirmation + dedup. `add` triggers dedup of new sources against existing set. + +- [ ] **Step 2: Update ResolveAccount to check collections** + +Collections take precedence over source identifiers (same pattern as merged accounts). Update error messages to point at `collections list`. + +- [ ] **Step 3: Update deduplicate --account to resolve collections** + +`--account` now resolves to collections too. Add `--collection` as an explicit alias. 
+ +- [ ] **Step 4: Build and smoke test** + +- [ ] **Step 5: Format, vet, commit** + +--- + +### Task 12: Collections Export to Account + +**Files:** +- Modify: `cmd/msgvault/cmd/collections.go` -- Add `export` subcommand +- Modify: `internal/store/collections.go` -- Add ExportCollectionToAccount method + +- [ ] **Step 1: Implement ExportCollectionToAccount in store** + +Create a new source from the deduplicated, identity-resolved contents of a collection. Copy survivor messages with unified labels. New source gets source_type "collection_export". + +- [ ] **Step 2: Add `collections export --as ` subcommand** + +- [ ] **Step 3: Format, vet, commit** + +--- + +### Task 13: TUI -- Collections in Account Selector + +Port TUI integration from `jesse/dedupe-integration` commit `5ac2a58`, renamed from merged accounts to collections. + +**Files:** +- Modify: `internal/tui/model.go` -- Add collectionAccounts field, loadAccounts fetches collections, account selector shows collections, SourceIDs propagation +- Modify: `internal/tui/keys.go` -- Collection selection in account modal +- Modify: `internal/tui/view.go` -- Render collections section in selector, header bar shows collection name +- Create: `internal/tui/nav_modal_test.go` -- Test collection selection and SourceIDs propagation +- Reference: `git diff jesse/active-work..jesse/dedupe-integration -- internal/tui/` + +- [ ] **Step 1: Add collections to model state** + +`collections []query.CollectionInfo`, `accountFilterSourceIDs []int64`, `accountFilterLabel string`. `accountFilterSourceIDsCopy()` for defensive copies. + +- [ ] **Step 2: Update loadAccounts to fetch collections** + +Best-effort: failure to load collections downgrades silently. + +- [ ] **Step 3: Update account selector modal** + +Second section below individual accounts. Cursor math, selection, round-trip restoration. 
+ +- [ ] **Step 4: Propagate SourceIDs through all query builders** + +loadData, loadStats, buildMessageFilter all pass SourceIDs from model. + +- [ ] **Step 5: Port tests** + +- [ ] **Step 6: Format, vet, commit** + +--- + +## Phase 4: Verification + +### Task 14: Full Build and Integration + +- [ ] **Step 1: Run full test suite** -- `go test ./... -count=1` + +- [ ] **Step 2: Run linter** -- `make lint` + +- [ ] **Step 3: Build** -- `make build` + +- [ ] **Step 4: Smoke test CLI** + +``` +./msgvault deduplicate --help +./msgvault deduplicate --dry-run +./msgvault list-identities --help +./msgvault collections --help +./msgvault collections list +``` + +- [ ] **Step 5: Final commit if needed** diff --git a/docs/superpowers/plans/2026-04-21-dedup-test-coverage.md b/docs/superpowers/plans/2026-04-21-dedup-test-coverage.md new file mode 100644 index 00000000..eca47eac --- /dev/null +++ b/docs/superpowers/plans/2026-04-21-dedup-test-coverage.md @@ -0,0 +1,655 @@ +# Dedup Test Coverage Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Fill the critical test gaps in the dedup/identities/collections features to protect against silent data loss and incorrect survivor selection. + +**Architecture:** Pure test additions — no production code changes. Tests are organized by package: `internal/dedup/`, `internal/store/`, `internal/query/`. Each task adds tests to existing `_test.go` files or creates new ones. + +**Tech Stack:** Go, table-driven tests, `storetest.Fixture`, `testutil.MustNoErr` + +**Test run command:** `go test ./internal/dedup/ ./internal/store/ ./internal/query/ -v -count=1` + +--- + +## Task 1: normalizeRawMIME Unit Tests + +The most dangerous untested function. It determines which headers are stripped before hashing. 
A bug causes false-positive merges (data loss) or missed duplicates. + +**Files:** +- Create: `internal/dedup/normalize_test.go` + +Since `normalizeRawMIME` is unexported, tests go in `package dedup` (internal test package). + +- [ ] **Step 1: Write table-driven tests** + +```go +// internal/dedup/normalize_test.go +package dedup + +import ( + "bytes" + "testing" +) + +func TestNormalizeRawMIME(t *testing.T) { + tests := []struct { + name string + input []byte + wantSame bool // true if output should equal input + contains string // substring the output must contain (empty = skip) + excludes string // substring the output must NOT contain (empty = skip) + }{ + { + name: "strips Received header (CRLF)", + input: []byte("Received: from mx1.google.com\r\nFrom: alice@example.com\r\nSubject: Hi\r\n\r\nBody"), + contains: "From: alice@example.com", + excludes: "Received", + }, + { + name: "strips multiple transport headers", + input: []byte("Delivered-To: bob@example.com\r\nX-Gmail-Labels: INBOX\r\nAuthentication-Results: spf=pass\r\nFrom: alice@example.com\r\nSubject: Test\r\n\r\nBody"), + contains: "From: alice@example.com", + excludes: "Delivered-To", + }, + { + name: "preserves non-transport headers", + input: []byte("From: alice@example.com\r\nTo: bob@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nBody text"), + contains: "Subject: Meeting", + }, + { + name: "handles LF-only line endings", + input: []byte("Received: from mx1\nFrom: alice@example.com\nSubject: Test\n\nBody with LF"), + contains: "From: alice@example.com", + excludes: "Received", + }, + { + name: "no header/body separator returns raw unchanged", + input: []byte("This is just a blob of text with no headers"), + wantSame: true, + }, + { + name: "malformed headers return raw unchanged", + input: []byte("Not a header at all\r\n\r\nBody"), + wantSame: true, + }, + { + name: "empty body preserved", + input: []byte("From: alice@example.com\r\nSubject: Empty\r\n\r\n"), + 
contains: "Subject: Empty", + }, + { + name: "preserves body content exactly", + input: []byte("Received: from mx1\r\nFrom: a@b.com\r\n\r\nExact body content here."), + contains: "Exact body content here.", + }, + { + name: "does not mutate input buffer", + input: []byte("Received: from mx1\r\nFrom: a@b.com\r\nSubject: Test\r\n\r\nBody"), + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Copy input to detect mutation + inputCopy := make([]byte, len(tt.input)) + copy(inputCopy, tt.input) + + result := normalizeRawMIME(tt.input) + + // Verify input was not mutated + if !bytes.Equal(tt.input, inputCopy) { + t.Error("normalizeRawMIME mutated its input buffer") + } + + if tt.wantSame { + if !bytes.Equal(result, tt.input) { + t.Errorf("expected unchanged output, got:\n%s", result) + } + return + } + if tt.contains != "" && !bytes.Contains(result, []byte(tt.contains)) { + t.Errorf("output missing %q:\n%s", tt.contains, result) + } + if tt.excludes != "" && bytes.Contains(result, []byte(tt.excludes)) { + t.Errorf("output should not contain %q:\n%s", tt.excludes, result) + } + }) + } +} + +func TestNormalizeRawMIME_DeterministicOutput(t *testing.T) { + // Same logical message with different transport headers must hash identically + raw1 := []byte("Received: from mx1.google.com\r\nFrom: sender@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet at 3pm.") + raw2 := []byte("Received: from mx2.google.com\r\nDelivered-To: other@example.com\r\nFrom: sender@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet at 3pm.") + + norm1 := normalizeRawMIME(raw1) + norm2 := normalizeRawMIME(raw2) + + hash1 := sha256Hex(norm1) + hash2 := sha256Hex(norm2) + if hash1 != hash2 { + t.Errorf("same message with different transport headers produced different hashes:\n raw1 normalized:\n%s\n raw2 normalized:\n%s", norm1, norm2) + } +} +``` + +- [ ] **Step 2: Run tests** + +Run: `go test 
./internal/dedup/ -run TestNormalizeRawMIME -v`
+
+- [ ] **Step 3: Format, vet, commit**
+
+---
+
+## Task 2: selectSurvivor Tiebreaker Chain Tests
+
+Only source-type preference and sent-copy are tested. The HasRawMIME > LabelCount > ArchivedAt > ID tiebreaker chain has no coverage. If someone reorders conditions, nothing catches it.
+
+**Files:**
+- Modify: `internal/dedup/dedup_test.go`
+
+- [ ] **Step 1: Write tiebreaker tests**
+
+Append to `internal/dedup/dedup_test.go`:
+
+```go
+func TestSelectSurvivor_Tiebreakers(t *testing.T) {
+	// selectSurvivor is unexported, so we test it indirectly through
+	// Scan: each subtest below creates its own fixture and engine,
+	// inserts two messages with the same rfc822_message_id, varies
+	// one tiebreaker attribute, and lets the engine pick.
+	// (Deliberately no shared store/engine declared here: the
+	// per-subtest declarations in the loop shadow them, which would
+	// leave top-level variables declared-and-unused and break
+	// compilation.)
+ + tests := []struct { + name string + messages []struct { + srcMsgID string + hasRaw bool + labelCount int + isFromMe bool + } + wantSurvivorIdx int // index into messages (0 or 1) + }{ + { + name: "raw MIME wins over no raw MIME", + messages: []struct { + srcMsgID string + hasRaw bool + labelCount int + isFromMe bool + }{ + {"no-raw", false, 1, false}, + {"has-raw", true, 1, false}, + }, + wantSurvivorIdx: 1, + }, + { + name: "more labels wins when raw MIME is equal", + messages: []struct { + srcMsgID string + hasRaw bool + labelCount int + isFromMe bool + }{ + {"few-labels", false, 1, false}, + {"many-labels", false, 3, false}, + }, + wantSurvivorIdx: 1, + }, + { + name: "lower ID wins as final tiebreaker", + messages: []struct { + srcMsgID string + hasRaw bool + labelCount int + isFromMe bool + }{ + {"first", false, 1, false}, + {"second", false, 1, false}, + }, + wantSurvivorIdx: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + f := storetest.New(t) + rfc822ID := "rfc-" + tt.name + + var msgIDs []int64 + for _, m := range tt.messages { + id := addMessage(t, f.Store, f.Source, m.srcMsgID+"-"+tt.name, rfc822ID, m.isFromMe) + if m.hasRaw { + testutil.MustNoErr(t, + f.Store.UpsertMessageRaw(id, []byte("Subject: test\r\n\r\nBody")), + "UpsertMessageRaw", + ) + } + for i := 0; i < m.labelCount; i++ { + lid, err := f.Store.EnsureLabel( + f.Source.ID, + fmt.Sprintf("LBL-%s-%d", m.srcMsgID, i), + fmt.Sprintf("Label %d", i), + "user", + ) + testutil.MustNoErr(t, err, "EnsureLabel") + testutil.MustNoErr(t, + f.Store.LinkMessageLabel(id, lid), + "LinkMessageLabel", + ) + } + msgIDs = append(msgIDs, id) + } + + eng := dedup.NewEngine(f.Store, dedup.Config{ + AccountSourceIDs: []int64{f.Source.ID}, + Account: "test", + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + + group := report.Groups[0] + 
survivor := group.Messages[group.Survivor]
+			wantID := msgIDs[tt.wantSurvivorIdx]
+			if survivor.ID != wantID {
+				t.Errorf("survivor = %d, want %d (index %d)",
+					survivor.ID, wantID, tt.wantSurvivorIdx)
+			}
+		})
+	}
+}
+```
+
+- [ ] **Step 2: Run tests**
+
+Run: `go test ./internal/dedup/ -run TestSelectSurvivor_Tiebreakers -v`
+
+- [ ] **Step 3: Format, vet, commit**
+
+---
+
+## Task 3: BackfillRFC822IDs and MergeDuplicates Raw MIME Tests
+
+Two store-layer write paths with zero coverage on their non-trivial branches.
+
+**Files:**
+- Modify: `internal/store/dedup_test.go`
+
+- [ ] **Step 1: Write BackfillRFC822IDs test with real MIME data**
+
+Append to `internal/store/dedup_test.go`:
+
+```go
+func TestStore_BackfillRFC822IDs_ParsesFromRawMIME(t *testing.T) {
+	f := storetest.New(t)
+
+	// Insert a message WITHOUT an rfc822_message_id but WITH raw MIME
+	// that contains a Message-ID header.
+	id := newRFC822Message(t, f, "needs-backfill", "") // empty rfc822 ID
+
+	rawMIME := []byte("From: alice@example.com\r\nTo: bob@example.com\r\nMessage-ID: <unique-123@example.com>\r\nSubject: Backfill test\r\n\r\nBody text")
+	testutil.MustNoErr(t,
+		f.Store.UpsertMessageRaw(id, rawMIME),
+		"UpsertMessageRaw",
+	)
+
+	// Verify it needs backfill.
+	count, err := f.Store.CountMessagesWithoutRFC822ID()
+	testutil.MustNoErr(t, err, "CountMessagesWithoutRFC822ID")
+	if count != 1 {
+		t.Fatalf("count without rfc822 = %d, want 1", count)
+	}
+
+	// Run backfill.
+	updated, err := f.Store.BackfillRFC822IDs(nil)
+	testutil.MustNoErr(t, err, "BackfillRFC822IDs")
+	if updated != 1 {
+		t.Fatalf("updated = %d, want 1", updated)
+	}
+
+	// Verify the rfc822_message_id was set correctly (angle brackets stripped). 
+ var rfc822ID string + err = f.Store.DB().QueryRow( + "SELECT rfc822_message_id FROM messages WHERE id = ?", id, + ).Scan(&rfc822ID) + testutil.MustNoErr(t, err, "scan rfc822_message_id") + if rfc822ID != "unique-123@example.com" { + t.Errorf("rfc822_message_id = %q, want unique-123@example.com", rfc822ID) + } + + // Verify count is now zero. + count, err = f.Store.CountMessagesWithoutRFC822ID() + testutil.MustNoErr(t, err, "CountMessagesWithoutRFC822ID after backfill") + if count != 0 { + t.Errorf("count after backfill = %d, want 0", count) + } +} +``` + +- [ ] **Step 2: Write MergeDuplicates raw MIME backfill test** + +```go +func TestStore_MergeDuplicates_BackfillsRawMIME(t *testing.T) { + f := storetest.New(t) + + // Survivor has NO raw MIME, duplicate HAS raw MIME. + idSurvivor := newRFC822Message(t, f, "survivor", "rfc822-mime-backfill") + idDuplicate := newRFC822Message(t, f, "duplicate", "rfc822-mime-backfill") + + rawData := []byte("From: alice@example.com\r\nSubject: Test\r\n\r\nBody") + testutil.MustNoErr(t, + f.Store.UpsertMessageRaw(idDuplicate, rawData), + "UpsertMessageRaw on duplicate", + ) + + // Verify survivor has no raw MIME before merge. + _, err := f.Store.GetMessageRaw(idSurvivor) + if err == nil { + t.Fatal("survivor should not have raw MIME before merge") + } + + result, err := f.Store.MergeDuplicates( + idSurvivor, []int64{idDuplicate}, "batch-mime", + ) + testutil.MustNoErr(t, err, "MergeDuplicates") + if result.RawMIMEBackfilled != 1 { + t.Errorf("RawMIMEBackfilled = %d, want 1", result.RawMIMEBackfilled) + } + + // Verify survivor now has raw MIME. 
+ got, err := f.Store.GetMessageRaw(idSurvivor) + testutil.MustNoErr(t, err, "GetMessageRaw survivor after merge") + if len(got) == 0 { + t.Error("survivor raw MIME should not be empty after backfill") + } +} +``` + +- [ ] **Step 3: Run tests** + +Run: `go test ./internal/store/ -run "TestStore_(BackfillRFC822IDs_Parses|MergeDuplicates_Backfills)" -v` + +- [ ] **Step 4: Format, vet, commit** + +--- + +## Task 4: appendSourceFilter Unit Tests + +Pure function used by every scoped query. No test file exists. + +**Files:** +- Create: `internal/query/source_filter_test.go` + +- [ ] **Step 1: Write table-driven tests** + +```go +package query + +import ( + "testing" +) + +func TestAppendSourceFilter(t *testing.T) { + id42 := int64(42) + + tests := []struct { + name string + singleID *int64 + multiIDs []int64 + prefix string + wantConditions int // number of conditions added (0 or 1) + wantArgs int // number of args added + wantCondition string + }{ + { + name: "neither single nor multi", + singleID: nil, + multiIDs: nil, + prefix: "m.", + wantConditions: 0, + wantArgs: 0, + }, + { + name: "single ID", + singleID: &id42, + multiIDs: nil, + prefix: "m.", + wantConditions: 1, + wantArgs: 1, + wantCondition: "m.source_id = ?", + }, + { + name: "multi IDs", + singleID: nil, + multiIDs: []int64{1, 2, 3}, + prefix: "msg.", + wantConditions: 1, + wantArgs: 3, + wantCondition: "msg.source_id IN (?,?,?)", + }, + { + name: "multi IDs take precedence over single", + singleID: &id42, + multiIDs: []int64{10, 20}, + prefix: "", + wantConditions: 1, + wantArgs: 2, + wantCondition: "source_id IN (?,?)", + }, + { + name: "empty prefix works", + singleID: &id42, + multiIDs: nil, + prefix: "", + wantConditions: 1, + wantArgs: 1, + wantCondition: "source_id = ?", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + conditions, args := appendSourceFilter( + nil, nil, tt.prefix, tt.singleID, tt.multiIDs, + ) + if len(conditions) != tt.wantConditions { + 
t.Errorf("conditions = %d, want %d: %v", + len(conditions), tt.wantConditions, conditions) + } + if len(args) != tt.wantArgs { + t.Errorf("args = %d, want %d", len(args), tt.wantArgs) + } + if tt.wantCondition != "" && len(conditions) > 0 { + if conditions[0] != tt.wantCondition { + t.Errorf("condition = %q, want %q", + conditions[0], tt.wantCondition) + } + } + }) + } +} +``` + +- [ ] **Step 2: Run tests** + +Run: `go test ./internal/query/ -run TestAppendSourceFilter -v` + +- [ ] **Step 3: Format, vet, commit** + +--- + +## Task 5: Collections Edge Case Tests + +Missing validation paths: bad input, idempotent operations, default collection incremental behavior. + +**Files:** +- Modify: `internal/store/collections_test.go` + +- [ ] **Step 1: Write validation and edge case tests** + +Append to `internal/store/collections_test.go`: + +```go +func TestCollections_Validation(t *testing.T) { + f := storetest.New(t) + st := f.Store + + t.Run("empty name rejected", func(t *testing.T) { + _, err := st.CreateCollection("", "", []int64{f.Source.ID}) + if err == nil { + t.Fatal("expected error for empty name") + } + }) + + t.Run("zero sources rejected", func(t *testing.T) { + _, err := st.CreateCollection("empty", "", nil) + if err == nil { + t.Fatal("expected error for zero sources") + } + }) + + t.Run("nonexistent source rejected", func(t *testing.T) { + _, err := st.CreateCollection("bad", "", []int64{99999}) + if err == nil { + t.Fatal("expected error for nonexistent source") + } + }) + + t.Run("delete nonexistent returns error", func(t *testing.T) { + err := st.DeleteCollection("nonexistent") + if err != store.ErrCollectionNotFound { + t.Fatalf("expected ErrCollectionNotFound, got %v", err) + } + }) +} + +func TestCollections_Idempotent(t *testing.T) { + f := storetest.New(t) + st := f.Store + + _, err := st.CreateCollection("idem", "", []int64{f.Source.ID}) + testutil.MustNoErr(t, err, "CreateCollection") + + t.Run("add same source twice is no-op", func(t *testing.T) { 
+ err := st.AddSourcesToCollection("idem", []int64{f.Source.ID}) + testutil.MustNoErr(t, err, "AddSourcesToCollection (dupe)") + coll, err := st.GetCollectionByName("idem") + testutil.MustNoErr(t, err, "GetCollectionByName") + if len(coll.SourceIDs) != 1 { + t.Fatalf("sourceIDs = %d, want 1", len(coll.SourceIDs)) + } + }) + + t.Run("remove absent source is no-op", func(t *testing.T) { + src2, err := st.GetOrCreateSource("mbox", "other@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource") + err = st.RemoveSourcesFromCollection("idem", []int64{src2.ID}) + testutil.MustNoErr(t, err, "RemoveSourcesFromCollection (absent)") + }) +} + +func TestCollections_DefaultAllIncremental(t *testing.T) { + f := storetest.New(t) + st := f.Store + + // First call creates "All" with the existing source. + testutil.MustNoErr(t, st.EnsureDefaultCollection(), "EnsureDefaultCollection 1") + coll, err := st.GetCollectionByName("All") + testutil.MustNoErr(t, err, "GetCollectionByName") + initialCount := len(coll.SourceIDs) + + // Add a new source. + _, err = st.GetOrCreateSource("mbox", "new@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource") + + // Second call should add the new source to "All". + testutil.MustNoErr(t, st.EnsureDefaultCollection(), "EnsureDefaultCollection 2") + coll, err = st.GetCollectionByName("All") + testutil.MustNoErr(t, err, "GetCollectionByName after add") + if len(coll.SourceIDs) != initialCount+1 { + t.Errorf("sourceIDs = %d, want %d", len(coll.SourceIDs), initialCount+1) + } +} +``` + +- [ ] **Step 2: Run tests** + +Run: `go test ./internal/store/ -run "TestCollections_(Validation|Idempotent|DefaultAllIncremental)" -v` + +- [ ] **Step 3: Format, vet, commit** + +--- + +## Task 6: Identity Signal Combination and Case Sensitivity Tests + +No test verifies all three signals firing simultaneously, and no test checks case-insensitive address matching. 
+ +**Files:** +- Modify: `internal/store/identities_test.go` + +- [ ] **Step 1: Write combined signal and case tests** + +Append to `internal/store/identities_test.go`: + +```go +func TestListLikelyIdentities_AllThreeSignals(t *testing.T) { + f := storetest.New(t) + // Source identifier is "test@example.com" (matches the fixture). + // Insert a message: From: test@example.com, is_from_me=true, has SENT label. + mid := addMessageFromParticipant( + t, f, f.Source, "m1", "test@example.com", true, + ) + lid, err := f.Store.EnsureLabel(f.Source.ID, "SENT", "Sent", "system") + testutil.MustNoErr(t, err, "EnsureLabel") + testutil.MustNoErr(t, f.Store.LinkMessageLabel(mid, lid), "LinkMessageLabel") + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + + got := ids[0] + want := store.SignalFromMe | store.SignalSentLabel | store.SignalAccountMatch + if got.Signals != want { + t.Errorf("signals = %v, want all three: %v", got.Signals, want) + } +} + +func TestListLikelyIdentities_CaseInsensitive(t *testing.T) { + f := storetest.New(t) + // Insert with mixed-case From address, is_from_me=true. 
+ addMessageFromParticipant( + t, f, f.Source, "m1", "Alice@Example.COM", true, + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + if ids[0].Email != "alice@example.com" { + t.Errorf("email = %q, want lower-cased alice@example.com", ids[0].Email) + } +} +``` + +- [ ] **Step 2: Run tests** + +Run: `go test ./internal/store/ -run "TestListLikelyIdentities_(AllThreeSignals|CaseInsensitive)" -v` + +- [ ] **Step 3: Format, vet, commit** diff --git a/docs/superpowers/specs/2026-04-20-deduplicate-command.md b/docs/superpowers/specs/2026-04-20-deduplicate-command.md new file mode 100644 index 00000000..2a6b366b --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-deduplicate-command.md @@ -0,0 +1,149 @@ +# Identities, Collections, and Deduplication + +Our communications are scattered across decades of email accounts, chat apps, phone backups, and meeting recordings. Getting it all into one place is the first step. msgvault handles that -- it accepts Gmail API syncs, old mbox exports, Apple Mail folders from a retired laptop, IMAP backups, Facebook Messenger dumps, SMS exports, WhatsApp histories, meeting recordings, call logs. Throw everything at it. + +The mess comes after. The same email appears three times because it arrived via Gmail sync, an old mbox export, and an Apple Mail import. Message counts are wrong. Search returns duplicates. There's no way to tell what was sent vs received in an old mbox dump that lost its metadata. And every import sits in its own silo -- there's no unified view of "all my communications." + +msgvault already does the hard part: ingesting data from anywhere. What it lacks are the tools to properly organize that pile so it can be used and reused. + +## Proposal + +### I'm proposing three concepts that turn a pile of imports into an authoritative archive: + +1. **Identities** (tied to accounts). 
"Who am I in this data?" A set of email addresses and phone numbers attached to an account. Identity determines what was sent vs received, powers sender exchange analysis, and recovers metadata that old imports lost. +2. **Collections** (of accounts). A named grouping of accounts -- "personal," "work," "everything." Collections are how the user sees their communications as one unified archive across multiple accounts. +3. **Deduplication** (of accounts, and of collections). Collapse redundant copies so counts and search results reflect reality. No data destroyed, ever. Runs at import time, at collection creation, or on demand. + +None of these require a specific order. Dedup a single account today. Import more data next month. Create a collection once you've figured out what you have. Dedup again. The system works in whatever sequence the user needs. + +## Identities + +An identity is a set of addresses and phone numbers that represent "me" for a given account. Identity answers the most basic question about any message: did I send this or receive it? + +Old imports lose this signal. A Gmail sync knows which messages are sent (SENT label, is_from_me flag). An mbox dump from 2012 does not. Without identity, there's no way to recover it. + +Identities are tied to accounts. An account's identity is the set of addresses the user sent from through that account. The same address can appear in multiple accounts' identities -- jesse@gmail.com might be an identity on both a Gmail sync account and an old mbox import of the same mailbox. This is expected and correct. + +**Discovery:** `msgvault list-identities` scans an account for likely "me" addresses, ranked by message count with evidence signals (is_from_me, sent-label, account-identifier match). The user reviews, prunes, and confirms. 
+
+Identity drives:
+
+- **Sent-copy detection** for dedup survivor selection
+- **Inbound/outbound classification** for every message in the account
+- **Sender exchange analysis** in the TUI
+
+## Collections
+
+A collection is a named grouping of accounts. It is the unit of "all my communications" in msgvault -- email, messages, transcripts, across multiple accounts, all in one view.
+
+A collection's identity is the union of its accounts' identities.
+
+#### Why "Collection"
+
+In my prototype I called this a "Merged account" and then realized that only mattered when I was creating it, not once I was using it. The user doesn't think "I'm merging accounts." They think "this is all my work email" or "this is everything."
+
+I propose **Collection** as the right name because:
+
+- It describes what the user has, not how they assembled it. "My personal collection." "My work collection."
+- It includes accounts without being limited to them. A collection might span a Gmail account, an old mbox import, and a Messenger export. Calling that an "account" is a stretch. Calling it a collection is accurate.
+- It's natural at every scale. "All" is a collection. "Work" is a collection.
+- Accounts are the individual archives. Collections are the user's organization of them.
+- Accounts can be created from exported collections.
+
+### The Default Collection
+
+I propose that by default an "All" collection exists from the start and automatically includes every account. When a user imports an account, it joins "All" without any extra steps.
+
+For users who want more granularity, additional collections can be created:
+
+```shell
+msgvault collections create work --accounts alice@company.com,old-imap-backup
+```
+
+1. Groups the named accounts
+2. Inherits identity from each account (union of all identity addresses)
+3. Runs dedup across the set
+
+Accounts can belong to multiple collections (e.g., an account in both "work" and "All"). 
Adding accounts later (`collections add work --accounts old-backup`) triggers dedup of the new account against the existing set. + +### Exporting a Collection to an Account + +A collection can be exported to a single, clean account. This produces a deduplicated, identity-resolved view of the collection as a standalone account -- no duplicates, correct sent/received classification, and unified labels. + +```shell +msgvault collections export personal --as personal-archive +``` + +This is the path from "I assembled a pile of imports" to "I have one authoritative account I trust." The original accounts and collection remain intact. + +## Deduplication + +### Account-Level Dedup + +- `msgvault deduplicate --account alice@gmail.com` -- dedup within a single account +- Runs automatically when importing into an existing account (`--into`) +- Can be run at any time afterward + +### Collection-Level Dedup + +- `msgvault deduplicate --collection personal` -- dedup across all accounts in the collection +- Runs automatically when creating a collection or adding accounts to one +- Can be run at any time afterward + +### Unscoped Dedup + +- `msgvault deduplicate` (no flags) -- scans each account independently. Catches re-imports of the same file. + +Dedup only operates within the boundary the user specifies. It never implicitly reaches across accounts or collections. Creating a collection that combines multiple accounts is how cross-account dedup works -- the user declares "these are all mine, deduplicate them together." + +### Detection + +Primary grouping is by RFC822 Message-ID. The engine backfills this header from stored raw MIME for messages ingested before the field was captured. A secondary content-hash pass catches duplicates where re-export rewrote headers (normalized body hash, parallelized). + +### Survivor Selection + +One message survives per duplicate group, chosen by: + +1. Source type preference: gmail > imap > mbox > emlx > hey (configurable) +2. 
Has raw MIME +3. More labels +4. Earlier archived timestamp +5. Lower row ID + +The survivor inherits all labels and raw MIME from pruned copies. Pruned copies are soft-deleted and hidden from all query paths. Every run gets a batch ID; `--undo` reverses it. + +### Sent-Message Safety + +When any message in a duplicate group looks like a sent copy, only sent copies are eligible survivors. This preserves the "I sent this" signal. Detection uses three signals, OR-combined: Gmail SENT label, is_from_me flag, and identity address match. + +### Importing with Dedup + +Importing into an existing account or collection deduplicates at ingest time. Messages are hashed against the existing corpus; duplicates are skipped before they enter the database. + +```shell +# Import into an existing account (dedup against that account's messages) +msgvault import-mbox --into alice@gmail.com /path/to/old-backup.mbox + +# Import into a collection (dedup against all accounts in the collection) +msgvault import-mbox --into personal /path/to/another-backup.mbox +``` + +Only new messages land. No separate dedup step needed. When `--into` is not specified, import creates a new account as it does today. The user can add it to a collection later. + +## Safety + +Dedup is a progression from identification to hiding to deletion. Each step is deliberate: + +1. **Scan.** Find duplicates, report what would change. No data touched. +2. **Soft-delete.** Pruned copies are hidden from all queries. The data stays in the archive. `--undo` restores everything. +3. **Delete.** Eventually, the user can permanently remove redundant data from the local archive. This is a separate, explicit action -- never automatic, never triggered by dedup itself. +4. **Remote deletion.** Deleting duplicates from source servers (Gmail, IMAP) is yet another separate decision. Defaults to trash (~30-day recovery). Permanent deletion requires explicit opt-in and interactive confirmation. + +The user controls every step. 
The system never escalates from one level to the next without being told to.
+
+Attachment dedup is already handled -- attachments use content-hash addressed storage, so identical files are stored once regardless of how many messages reference them.
+
+## Out of Scope
+
+- Implicit cross-collection dedup (the user must explicitly create a collection to combine accounts)
+- Automatic identity inference without user confirmation
diff --git a/internal/config/config.go b/internal/config/config.go
index c5b2531c..200016c1 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -70,6 +70,11 @@ type RemoteConfig struct {
 }
 
+// IdentityConfig holds the user's curated identity addresses.
+type IdentityConfig struct {
+	Addresses []string `toml:"addresses"`
+}
+
 // Config represents the msgvault configuration.
 type Config struct {
 	Data     DataConfig    `toml:"data"`
 	Log      LogConfig     `toml:"log"`
@@ -80,6 +85,7 @@ type Config struct {
 	Server   ServerConfig    `toml:"server"`
 	Remote   RemoteConfig    `toml:"remote"`
 	Vector   vector.Config   `toml:"vector"`
+	Identity IdentityConfig  `toml:"identity"`
 	Accounts []AccountSchedule `toml:"accounts"`
 
 	// Computed paths (not from config file)
@@ -121,6 +127,21 @@ type LogConfig struct {
 	SQLTrace bool `toml:"sql_trace"`
 }
 
+// IdentityAddressSet returns the configured identity addresses as a
+// normalized (lower-cased) set for O(1) lookup. Returns an empty map
+// (not nil) when no addresses are configured.
+func (c *Config) IdentityAddressSet() map[string]bool {
+	out := make(map[string]bool, len(c.Identity.Addresses))
+	for _, addr := range c.Identity.Addresses {
+		a := strings.ToLower(strings.TrimSpace(addr))
+		if a == "" {
+			continue
+		}
+		out[a] = true
+	}
+	return out
+}
+
 // DataConfig holds data storage configuration.
type DataConfig struct { DataDir string `toml:"data_dir"` diff --git a/internal/dedup/dedup.go b/internal/dedup/dedup.go new file mode 100644 index 00000000..ea8474cf --- /dev/null +++ b/internal/dedup/dedup.go @@ -0,0 +1,1124 @@ +// Package dedup provides cross-source duplicate detection and merging. +// +// # Scoping rules +// +// Dedup is always run against a single logical account. Without explicit +// account scoping, dedup operates on one source at a time (intra-source), +// which means a duplicate group can only contain messages that were +// ingested twice into the same source (for example, re-importing the +// same mbox twice). When the caller supplies an account, dedup operates +// on every source belonging to that account at once (for example, a +// Gmail API sync plus a mbox export of the same mailbox). +// +// Dedup intentionally never merges messages across different accounts. +// This is critical for sent messages: a message alice sends to bob is +// one logical message but it has a legitimate copy in alice's "Sent" +// collection and a legitimate copy in bob's "Inbox". Both copies share +// the same RFC822 Message-ID. If both accounts are archived in +// msgvault, they must be preserved independently because deleting one +// would change the other user's view of history. Sent-message handling +// is covered in more detail by FormatMethodology. +package dedup + +import ( + "bufio" + "bytes" + "compress/zlib" + "context" + "crypto/sha256" + "encoding/hex" + "errors" + "fmt" + "io" + "log/slog" + "net/textproto" + "path/filepath" + "runtime" + "sort" + "strings" + "sync" + "time" + + "github.com/wesm/msgvault/internal/deletion" + "github.com/wesm/msgvault/internal/store" +) + +// Config controls the dedup engine behaviour. +type Config struct { + // SourcePreference orders source types when picking a survivor + // inside a duplicate group. Earlier entries win. 
+ SourcePreference []string + + // DryRun reports what would happen without mutating the database + // or writing deletion manifests. + DryRun bool + + // ContentHashFallback also groups messages by normalized raw MIME + // content after the RFC822 Message-ID pass. This is slower, but can + // catch duplicates where Message-ID is missing or transport headers + // are the only difference between copies. + ContentHashFallback bool + + // AccountSourceIDs restricts dedup to the listed sources and + // allows cross-source grouping between them. Callers that want + // strict per-source dedup should leave this empty. + AccountSourceIDs []int64 + + // Account is the canonical identifier for the scoped account + // (for example, "alice@gmail.com"). It is used when building + // deletion manifests and in the methodology output. + Account string + + // DeleteDupsFromSourceServer, when true, writes pending + // deletion manifests for pruned duplicates that meet ALL of: + // 1. the pruned copy lives in a remote source (gmail/imap), + // 2. the surviving copy is in the SAME source_id (i.e. the + // very same remote mailbox holds the winner). + // + // This second rule is load-bearing: it guarantees that a + // merged-pile dedup run can never cause deletions from the + // user's authoritative Gmail/IMAP account just because a + // duplicate was found in a local archive. Only true + // intra-mailbox duplicates are ever proposed for remote + // deletion. + // + // Even with this rule, the field defaults to false so that + // destructive side effects never happen without an explicit + // --delete-dups-from-source-server opt-in at the CLI layer. + DeleteDupsFromSourceServer bool + + // DeletionsDir is the directory where staged deletion manifests + // are written. Required when DeleteDupsFromSourceServer is true. + DeletionsDir string + + // IdentityAddresses lists lower-cased email addresses the + // user considers "me". 
When a pruned candidate's From: + // matches any of them, the survivor-selection rule treats + // the message as a sent copy — in addition to the existing + // Gmail SENT label and messages.is_from_me signals. + IdentityAddresses map[string]bool +} + +// DefaultSourcePreference is the default source-type authority order. +var DefaultSourcePreference = []string{ + "gmail", "imap", "mbox", "emlx", "hey", +} + +// remoteSourceTypes lists source types whose messages can be deleted +// via the deletion-staging machinery. +var remoteSourceTypes = map[string]bool{ + "gmail": true, + "imap": true, +} + +// Engine orchestrates duplicate detection and merging. +type Engine struct { + store *store.Store + config Config + logger *slog.Logger +} + +// NewEngine creates a new dedup engine. +func NewEngine(st *store.Store, cfg Config, logger *slog.Logger) *Engine { + if len(cfg.SourcePreference) == 0 { + cfg.SourcePreference = DefaultSourcePreference + } + if logger == nil { + logger = slog.Default() + } + return &Engine{store: st, config: cfg, logger: logger} +} + +// DuplicateGroup represents a set of messages that are duplicates of +// each other (share the same RFC822 Message-ID in the scoped sources). +type DuplicateGroup struct { + Key string // RFC822 Message-ID or normalized hash + KeyType string // "message-id" or "normalized-hash" + Messages []DuplicateMessage // all messages in the group + Survivor int // index into Messages of the chosen survivor +} + +// DuplicateMessage holds metadata for a single message in a duplicate +// group, including sent-message signals for safety checks. 
+type DuplicateMessage struct { + ID int64 + SourceID int64 + SourceType string + SourceIdentifier string + SourceMessageID string + Subject string + SentAt time.Time + HasRawMIME bool + LabelCount int + ArchivedAt time.Time + IsFromMe bool + HasSentLabel bool + FromEmail string + MatchedIdentity bool +} + +// IsSentCopy reports whether this message appears to be the sender-side +// copy of an outbound email. Three independent signals (OR-combined): +// - Gmail SENT system label on the message +// - messages.is_from_me set at ingest time +// - From: address matches a configured identity address +func (m DuplicateMessage) IsSentCopy() bool { + return m.HasSentLabel || m.IsFromMe || m.MatchedIdentity +} + +// Report summarises the results of a dedup scan. +type Report struct { + TotalMessages int64 + DuplicateGroups int + DuplicateMessages int // messages that would be pruned + BySourcePair map[string]int // "gmail+mbox" -> groups + SampleGroups []DuplicateGroup + Groups []DuplicateGroup + BackfilledCount int64 + ContentHashGroups int + SkippedDecompressionErrors int +} + +// ExecutionSummary summarises the results of dedup execution. +type ExecutionSummary struct { + GroupsMerged int + MessagesRemoved int + LabelsTransferred int + RawMIMEBackfilled int + BatchID string + StagedManifests []StagedManifest +} + +// StagedManifest records a single deletion manifest created by dedup. +type StagedManifest struct { + Account string + SourceType string + ManifestID string + MessageCount int +} + +// remoteKey groups remote source IDs by the (account, source_type) pair so +// that a user with multiple remote sources sharing the same account +// identifier (e.g. gmail + imap for the same address) gets one manifest per +// source type rather than a single manifest whose SourceType label reflects +// only the first contributor. +type remoteKey struct { + Account string + SourceType string +} + +// Scan finds all duplicate groups that dedup would prune. 
+// AccountSourceIDs must be non-empty to prevent accidental cross-account +// grouping; the CLI ensures this by iterating sources one at a time when +// no explicit --account is given. +func (e *Engine) Scan(ctx context.Context) (*Report, error) { + if len(e.config.AccountSourceIDs) == 0 { + return nil, fmt.Errorf("AccountSourceIDs must be non-empty; use per-source iteration for unscoped dedup") + } + + count, err := e.store.CountMessagesWithoutRFC822ID( + e.config.AccountSourceIDs..., + ) + if err != nil { + return nil, fmt.Errorf("count messages without rfc822 id: %w", err) + } + + var backfilledCount int64 + if count > 0 && e.config.DryRun { + e.logger.Info( + "dry-run: backfill needed before dedup can run — "+ + "messages missing rfc822_message_id will be skipped", + "count", count) + backfilledCount = -count // negative signals "needed but skipped" + } else if count > 0 { + e.logger.Info("backfilling rfc822_message_id from stored MIME", + "count", count) + var backfillFailed int64 + backfilledCount, backfillFailed, err = e.store.BackfillRFC822IDs( + e.config.AccountSourceIDs, + func(done, total int64) { + e.logger.Info("backfill progress", + "done", done, "total", total) + }, + ) + if err != nil { + return nil, fmt.Errorf("backfill rfc822 ids: %w", err) + } + if backfilledCount > 0 { + e.logger.Info("backfilled rfc822_message_id", + "count", backfilledCount) + } + if backfillFailed > 0 { + e.logger.Warn("backfill: some messages could not be parsed", + "failed", backfillFailed) + } + } + + totalMessages, err := e.store.CountActiveMessages( + e.config.AccountSourceIDs..., + ) + if err != nil { + return nil, fmt.Errorf("count active messages: %w", err) + } + + storeGroups, err := e.store.FindDuplicatesByRFC822ID( + e.config.AccountSourceIDs..., + ) + if err != nil { + return nil, fmt.Errorf("find duplicates: %w", err) + } + + report := &Report{ + TotalMessages: totalMessages, + BySourcePair: make(map[string]int), + BackfilledCount: backfilledCount, + } + + for 
_, sg := range storeGroups { + if ctx.Err() != nil { + return nil, ctx.Err() + } + msgs, err := e.store.GetDuplicateGroupMessages( + sg.RFC822MessageID, e.config.AccountSourceIDs..., + ) + if err != nil { + return nil, fmt.Errorf( + "get group messages for %s: %w", + sg.RFC822MessageID, err, + ) + } + if len(msgs) < 2 { + continue + } + + group := DuplicateGroup{ + Key: sg.RFC822MessageID, + KeyType: "message-id", + } + for _, m := range msgs { + matched := false + if len(e.config.IdentityAddresses) > 0 && + m.FromEmail != "" { + matched = e.config.IdentityAddresses[m.FromEmail] + } + group.Messages = append(group.Messages, DuplicateMessage{ + ID: m.ID, + SourceID: m.SourceID, + SourceType: m.SourceType, + SourceIdentifier: m.SourceIdentifier, + SourceMessageID: m.SourceMessageID, + Subject: m.Subject, + SentAt: m.SentAt, + HasRawMIME: m.HasRawMIME, + LabelCount: m.LabelCount, + ArchivedAt: m.ArchivedAt, + IsFromMe: m.IsFromMe, + HasSentLabel: m.HasSentLabel, + FromEmail: m.FromEmail, + MatchedIdentity: matched, + }) + } + + e.selectSurvivor(&group) + report.Groups = append(report.Groups, group) + report.BySourcePair[sourcePairKey(group.Messages)]++ + } + + if e.config.ContentHashFallback { + excludeIDs := make(map[int64]bool, len(report.Groups)*2) + for _, g := range report.Groups { + for _, m := range g.Messages { + excludeIDs[m.ID] = true + } + } + + contentHashGroups, skipped, err := e.scanNormalizedHashGroups(excludeIDs) + if err != nil { + return nil, fmt.Errorf( + "scan normalized content hashes: %w", err, + ) + } + report.SkippedDecompressionErrors = skipped + for _, g := range contentHashGroups { + report.Groups = append(report.Groups, g) + report.ContentHashGroups++ + report.BySourcePair[sourcePairKey(g.Messages)]++ + } + } + + report.DuplicateGroups = len(report.Groups) + for _, g := range report.Groups { + report.DuplicateMessages += len(g.Messages) - 1 + } + + maxSamples := min(10, len(report.Groups)) + report.SampleGroups = append( + 
[]DuplicateGroup(nil), report.Groups[:maxSamples]..., + ) + + return report, nil +} + +// rawWorkItem carries one compressed raw-MIME blob to a worker. +type rawWorkItem struct { + candidate store.ContentHashCandidate + rawData []byte + compress string +} + +// hashResult carries the normalized hash plus message metadata. +type hashResult struct { + hash string + msg DuplicateMessage + skipped bool +} + +// scanNormalizedHashGroups hashes raw MIME after stripping transport-specific +// headers. It skips messages already matched by the primary Message-ID pass. +// Returns the duplicate groups plus a count of candidates skipped due to +// zlib decompression failure. +func (e *Engine) scanNormalizedHashGroups( + excludeIDs map[int64]bool, +) ([]DuplicateGroup, int, error) { + candidates, err := e.store.GetAllRawMIMECandidates( + e.config.AccountSourceIDs..., + ) + if err != nil { + return nil, 0, err + } + + candidateMap := make(map[int64]store.ContentHashCandidate, len(candidates)) + for _, c := range candidates { + if !excludeIDs[c.ID] { + candidateMap[c.ID] = c + } + } + if len(candidateMap) == 0 { + return nil, 0, nil + } + + ids := make([]int64, 0, len(candidateMap)) + for id := range candidateMap { + ids = append(ids, id) + } + sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) + + numWorkers := runtime.NumCPU() + if numWorkers > 16 { + numWorkers = 16 + } + if numWorkers > len(ids) { + numWorkers = len(ids) + } + if numWorkers < 1 { + numWorkers = 1 + } + + work := make(chan rawWorkItem, numWorkers*4) + results := make(chan hashResult, numWorkers*4) + + var wg sync.WaitGroup + for i := 0; i < numWorkers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for item := range work { + raw := item.rawData + if item.compress == "zlib" { + r, err := zlib.NewReader(bytes.NewReader(raw)) + if err != nil { + e.logger.Warn("content-hash: zlib open failed", + "message_id", item.candidate.ID, "err", err) + results <- hashResult{skipped: true} + continue + } + 
decompressed, err := io.ReadAll(r) + _ = r.Close() + if err != nil { + e.logger.Warn("content-hash: zlib read failed", + "message_id", item.candidate.ID, "err", err) + results <- hashResult{skipped: true} + continue + } + raw = decompressed + } + + matched := false + if len(e.config.IdentityAddresses) > 0 && + item.candidate.FromEmail != "" { + matched = e.config.IdentityAddresses[item.candidate.FromEmail] + } + + results <- hashResult{ + hash: sha256Hex(normalizeRawMIME(raw)), + msg: DuplicateMessage{ + ID: item.candidate.ID, + SourceID: item.candidate.SourceID, + SourceType: item.candidate.SourceType, + SourceIdentifier: item.candidate.SourceIdentifier, + SourceMessageID: item.candidate.SourceMessageID, + Subject: item.candidate.Subject, + SentAt: item.candidate.SentAt, + HasRawMIME: true, + LabelCount: item.candidate.LabelCount, + ArchivedAt: item.candidate.ArchivedAt, + IsFromMe: item.candidate.IsFromMe, + HasSentLabel: item.candidate.HasSentLabel, + FromEmail: item.candidate.FromEmail, + MatchedIdentity: matched, + }, + } + } + }() + } + + type hashEntry struct { + msgs []DuplicateMessage + } + hashMap := make(map[string]*hashEntry) + skipped := 0 + collectDone := make(chan struct{}) + go func() { + for r := range results { + if r.skipped { + skipped++ + continue + } + if entry, ok := hashMap[r.hash]; ok { + entry.msgs = append(entry.msgs, r.msg) + } else { + hashMap[r.hash] = &hashEntry{msgs: []DuplicateMessage{r.msg}} + } + } + close(collectDone) + }() + + readErr := e.store.StreamMessageRaw( + ids, + func(messageID int64, rawData []byte, compression string) { + c, ok := candidateMap[messageID] + if !ok { + return + } + dataCopy := make([]byte, len(rawData)) + copy(dataCopy, rawData) + work <- rawWorkItem{ + candidate: c, + rawData: dataCopy, + compress: compression, + } + }, + ) + close(work) + wg.Wait() + close(results) + <-collectDone + + if readErr != nil { + return nil, skipped, fmt.Errorf("stream message raw: %w", readErr) + } + + var groups 
[]DuplicateGroup + for hash, entry := range hashMap { + if len(entry.msgs) < 2 { + continue + } + g := DuplicateGroup{ + Key: hash, + KeyType: "normalized-hash", + Messages: entry.msgs, + } + e.selectSurvivor(&g) + groups = append(groups, g) + } + return groups, skipped, nil +} + +// transportHeaders vary across otherwise-identical copies of the same email. +var transportHeaders = map[string]bool{ + "Received": true, + "Delivered-To": true, + "Return-Path": true, + "X-Received": true, + "X-Gmail-Labels": true, + "X-Gmail-Received": true, + "X-Google-Smtp-Source": true, + "X-Gm-Message-State": true, + "Authentication-Results": true, + "Dkim-Signature": true, + "Arc-Seal": true, + "Arc-Message-Signature": true, + "Arc-Authentication-Results": true, + "X-Google-Dkim-Signature": true, + "X-Forwarded-To": true, + "X-Forwarded-For": true, + "X-Original-To": true, + "X-Apple-Mail-Labels": true, +} + +// normalizeRawMIME strips transport/export-specific headers before hashing. +func normalizeRawMIME(raw []byte) []byte { + crlfEnd := bytes.Index(raw, []byte("\r\n\r\n")) + lfEnd := bytes.Index(raw, []byte("\n\n")) + headerEnd := -1 + switch { + case crlfEnd >= 0 && lfEnd >= 0: + headerEnd = min(crlfEnd, lfEnd) + case crlfEnd >= 0: + headerEnd = crlfEnd + case lfEnd >= 0: + headerEnd = lfEnd + } + if headerEnd == -1 { + return raw + } + + headerSection := raw[:headerEnd] + // Find the start of the actual body after the blank line. + var bodyStart int + if bytes.HasPrefix(raw[headerEnd:], []byte("\r\n\r\n")) { + bodyStart = headerEnd + 4 + } else { + bodyStart = headerEnd + 2 // "\n\n" + } + body := raw[bodyStart:] + + // Copy headerSection before appending to avoid mutating the + // underlying raw buffer (headerSection is a sub-slice of raw). 
+ hdrBuf := make([]byte, len(headerSection)+4) + copy(hdrBuf, headerSection) + copy(hdrBuf[len(headerSection):], "\r\n\r\n") + reader := textproto.NewReader(bufio.NewReader(bytes.NewReader(hdrBuf))) + mimeHeader, err := reader.ReadMIMEHeader() + if err != nil { + return raw + } + + var kept []string + for key := range mimeHeader { + if !transportHeaders[textproto.CanonicalMIMEHeaderKey(key)] { + kept = append(kept, key) + } + } + sort.Strings(kept) + + var buf bytes.Buffer + for _, key := range kept { + for _, val := range mimeHeader[key] { + fmt.Fprintf(&buf, "%s: %s\n", key, val) + } + } + buf.WriteString("\n") // canonical header/body separator + buf.Write(body) + return buf.Bytes() +} + +func sha256Hex(data []byte) string { + h := sha256.Sum256(data) + return hex.EncodeToString(h[:]) +} + +// selectSurvivor picks the best message to keep in a duplicate group. +func (e *Engine) selectSurvivor(group *DuplicateGroup) { + if len(group.Messages) <= 1 { + group.Survivor = 0 + return + } + + priorityMap := make(map[string]int) + for i, st := range e.config.SourcePreference { + priorityMap[st] = i + } + + candidates := allIndexes(len(group.Messages)) + var sentIdxs []int + for _, i := range candidates { + if group.Messages[i].IsSentCopy() { + sentIdxs = append(sentIdxs, i) + } + } + if len(sentIdxs) > 0 { + candidates = sentIdxs + } + + best := candidates[0] + for _, i := range candidates[1:] { + if e.isBetter( + group.Messages[i], group.Messages[best], priorityMap, + ) { + best = i + } + } + group.Survivor = best +} + +func allIndexes(n int) []int { + out := make([]int, n) + for i := range out { + out[i] = i + } + return out +} + +// isBetter returns true if candidate is a better survivor than current. 
+func (e *Engine) isBetter( + candidate, current DuplicateMessage, priorityMap map[string]int, +) bool { + candPri := sourcePriority(candidate.SourceType, priorityMap) + currPri := sourcePriority(current.SourceType, priorityMap) + if candPri != currPri { + return candPri < currPri + } + if candidate.HasRawMIME != current.HasRawMIME { + return candidate.HasRawMIME + } + if candidate.LabelCount != current.LabelCount { + return candidate.LabelCount > current.LabelCount + } + if !candidate.ArchivedAt.IsZero() && !current.ArchivedAt.IsZero() { + return candidate.ArchivedAt.Before(current.ArchivedAt) + } + return candidate.ID < current.ID +} + +func sourcePriority(sourceType string, priorityMap map[string]int) int { + if p, ok := priorityMap[sourceType]; ok { + return p + } + return len(priorityMap) +} + +// Execute merges every duplicate group: unions labels onto the +// survivor, soft-deletes the pruned duplicates, and — when +// DeleteDupsFromSourceServer is enabled AND a pruned copy shares a +// source_id with its survivor — writes a deletion manifest. 
func (e *Engine) Execute(
	ctx context.Context, report *Report, batchID string,
) (*ExecutionSummary, error) {
	summary := &ExecutionSummary{BatchID: batchID}

	// Remote source_message_ids collected for server-side deletion,
	// grouped by (account, source type). Only populated under the
	// opt-in DeleteDupsFromSourceServer flag.
	remoteByKey := make(map[remoteKey][]string)

	for i, group := range report.Groups {
		// Honour cancellation between groups; summary reflects the
		// work completed so far, so callers can report partial merges.
		if ctx.Err() != nil {
			return summary, ctx.Err()
		}

		survivor := group.Messages[group.Survivor]
		survivorID := survivor.ID
		var dupIDs []int64
		for j, m := range group.Messages {
			if j == group.Survivor {
				continue
			}
			dupIDs = append(dupIDs, m.ID)

			// Remote staging is deliberately narrow: opt-in flag,
			// remote-capable source types only, and only when the
			// pruned copy lives in the SAME source as its survivor.
			if !e.config.DeleteDupsFromSourceServer {
				continue
			}
			if !remoteSourceTypes[m.SourceType] {
				continue
			}
			if m.SourceID != survivor.SourceID {
				continue
			}
			// Fall back to the configured account when the message
			// carries no source identifier of its own.
			acct := m.SourceIdentifier
			if acct == "" {
				acct = e.config.Account
			}
			key := remoteKey{Account: acct, SourceType: m.SourceType}
			remoteByKey[key] = append(
				remoteByKey[key], m.SourceMessageID,
			)
		}

		mergeResult, err := e.store.MergeDuplicates(
			survivorID, dupIDs, batchID,
		)
		if err != nil {
			return summary, fmt.Errorf(
				"merge group %d (%s): %w", i, group.Key, err,
			)
		}

		summary.GroupsMerged++
		summary.MessagesRemoved += len(dupIDs)
		summary.LabelsTransferred += mergeResult.LabelsTransferred
		summary.RawMIMEBackfilled += mergeResult.RawMIMEBackfilled
	}

	// Manifests are staged only after all merges succeed, so a failed
	// merge never leaves deletion manifests for unmerged groups.
	if e.config.DeleteDupsFromSourceServer && len(remoteByKey) > 0 {
		staged, err := e.stageDeletionManifests(batchID, remoteByKey)
		if err != nil {
			return summary, err
		}
		summary.StagedManifests = staged
	}

	return summary, nil
}

// stageDeletionManifests writes one pending deletion manifest per
// (account, source type) key, in deterministic sorted order. It
// returns the manifests staged so far even on error, so callers can
// report partial progress.
func (e *Engine) stageDeletionManifests(
	batchID string,
	byKey map[remoteKey][]string,
) ([]StagedManifest, error) {
	// This is a programmer-facing misconfiguration, not a user error:
	// staging was requested without a place to put manifests.
	if e.config.DeletionsDir == "" {
		return nil, fmt.Errorf(
			"deletions dir not configured but " +
				"DeleteDupsFromSourceServer is true",
		)
	}

	mgr, err := deletion.NewManager(e.config.DeletionsDir)
	if err != nil {
		return nil, fmt.Errorf("open deletion manager: %w", err)
	}

	// Sort keys (account, then source type) so manifest files are
	// created in a stable order across runs.
	keys := make([]remoteKey, 0, len(byKey))
	for k := range byKey {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		if keys[i].Account != keys[j].Account {
			return keys[i].Account < keys[j].Account
		}
		return keys[i].SourceType < keys[j].SourceType
	})

	// Single-type accounts keep the original manifest ID (no source-type
	// suffix) so existing consumers — and test fixtures — don't see a
	// rename. Only accounts contributing duplicates from more than one
	// source type need disambiguation.
	typesPerAccount := make(map[string]int)
	for k := range byKey {
		typesPerAccount[k.Account]++
	}

	var staged []StagedManifest
	for _, k := range keys {
		ids := dedupStrings(byKey[k])
		if len(ids) == 0 {
			continue
		}

		description := fmt.Sprintf("Dedup pruned duplicates (%s)", batchID)
		manifest := deletion.NewManifest(description, ids)
		if typesPerAccount[k.Account] > 1 {
			manifest.ID = manifestIDFor(batchID, k.Account+"-"+k.SourceType)
		} else {
			manifest.ID = manifestIDFor(batchID, k.Account)
		}
		manifest.CreatedBy = "dedup"
		manifest.Filters.Account = k.Account

		path := filepath.Join(
			mgr.PendingDir(), manifest.ID+".json",
		)
		if err := manifest.Save(path); err != nil {
			return staged, fmt.Errorf(
				"save manifest for %s: %w", k.Account, err,
			)
		}
		staged = append(staged, StagedManifest{
			Account:      k.Account,
			SourceType:   k.SourceType,
			ManifestID:   manifest.ID,
			MessageCount: len(ids),
		})
	}
	return staged, nil
}

// manifestIDFor builds a deterministic manifest ID from the batch ID
// and a filename-safe form of the account label.
func manifestIDFor(batchID, account string) string {
	return fmt.Sprintf("%s-%s", batchID, SanitizeFilenameComponent(account))
}

// SanitizeFilenameComponent strips or replaces characters that are unsafe
// for use in filenames, ensuring the result contains only alphanumeric,
// hyphens, and underscores (with @ and . replaced by hyphens).
+func SanitizeFilenameComponent(a string) string { + var b strings.Builder + for _, r := range a { + switch { + case (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || + (r >= '0' && r <= '9') || r == '-' || r == '_': + b.WriteRune(r) + case r == '@' || r == '.': + b.WriteRune('-') + } + } + s := b.String() + if s == "" { + s = "account" + } + if len(s) > 40 { + sum := sha256.Sum256([]byte(a)) + s = s[:31] + "-" + hex.EncodeToString(sum[:4]) + } + return s +} + +func dedupStrings(in []string) []string { + seen := make(map[string]bool, len(in)) + out := make([]string, 0, len(in)) + for _, s := range in { + if seen[s] { + continue + } + seen[s] = true + out = append(out, s) + } + sort.Strings(out) + return out +} + +// Undo restores every message with the given batch ID and cancels any +// pending deletion manifests that dedup created for that batch. +// +// Manifest cancellation is best-effort: if cancelling one manifest +// fails, the remaining manifests are still attempted, and any errors +// are joined into a single returned error alongside the restored row +// count and the list of manifests already in progress. 
+func (e *Engine) Undo(batchID string) (int64, []string, error) { + restored, err := e.store.UndoDedup(batchID) + if err != nil { + return 0, nil, err + } + + if e.config.DeletionsDir == "" { + return restored, nil, nil + } + + mgr, err := deletion.NewManager(e.config.DeletionsDir) + if err != nil { + return restored, nil, fmt.Errorf("open deletion manager: %w", err) + } + pending, err := mgr.ListPending() + if err != nil { + return restored, nil, fmt.Errorf("list pending: %w", err) + } + inProgress, err := mgr.ListInProgress() + if err != nil { + return restored, nil, fmt.Errorf("list in-progress: %w", err) + } + + var stillExecuting []string + var cancelErrs []error + prefix := batchID + "-" + for _, m := range pending { + if !strings.HasPrefix(m.ID, prefix) && m.ID != batchID { + continue + } + if err := mgr.CancelManifest(m.ID); err != nil { + cancelErrs = append(cancelErrs, fmt.Errorf( + "cancel manifest %s: %w", m.ID, err, + )) + } + } + for _, m := range inProgress { + if !strings.HasPrefix(m.ID, prefix) && m.ID != batchID { + continue + } + stillExecuting = append(stillExecuting, m.ID) + } + if len(cancelErrs) > 0 { + return restored, stillExecuting, errors.Join(cancelErrs...) + } + return restored, stillExecuting, nil +} + +// FormatReport renders a human-readable report of the scan results. 
+func (e *Engine) FormatReport(r *Report) string { + var sb strings.Builder + sb.WriteString("\n=== Deduplication Report ===\n\n") + + if r.BackfilledCount < 0 { + fmt.Fprintf(&sb, + "Note: %d messages need RFC822 Message-ID backfill "+ + "from stored MIME (skipped in dry-run).\n"+ + "These messages will be backfilled and included "+ + "when you run with --apply.\n\n", + -r.BackfilledCount) + } else if r.BackfilledCount > 0 { + fmt.Fprintf(&sb, + "Backfilled %d messages with RFC822 Message-ID "+ + "from stored MIME.\n\n", + r.BackfilledCount) + } + + if r.DuplicateGroups == 0 { + sb.WriteString("No duplicates found.\n") + return sb.String() + } + + fmt.Fprintf(&sb, "Duplicate groups found: %d\n", r.DuplicateGroups) + fmt.Fprintf(&sb, "Messages to prune: %d\n", r.DuplicateMessages) + if r.ContentHashGroups > 0 { + fmt.Fprintf(&sb, "Content-hash groups: %d\n", r.ContentHashGroups) + } + if r.SkippedDecompressionErrors > 0 { + fmt.Fprintf(&sb, + "Skipped (decompression error): %d "+ + "(see log for per-message details)\n", + r.SkippedDecompressionErrors) + } + + if len(r.BySourcePair) > 0 { + sb.WriteString("\nBreakdown by source pair:\n") + pairs := make([]string, 0, len(r.BySourcePair)) + for k := range r.BySourcePair { + pairs = append(pairs, k) + } + sort.Strings(pairs) + for _, pair := range pairs { + fmt.Fprintf(&sb, " %-20s %d groups\n", + pair, r.BySourcePair[pair]) + } + } + + sentGroups := 0 + for _, g := range r.Groups { + for _, m := range g.Messages { + if m.IsSentCopy() { + sentGroups++ + break + } + } + } + if sentGroups > 0 { + fmt.Fprintf(&sb, + "\nSent-copy groups detected: %d "+ + "(survivor forced to a sent copy)\n", + sentGroups) + } + + if len(r.SampleGroups) > 0 { + sb.WriteString("\nSample duplicate groups:\n") + for i, g := range r.SampleGroups { + label := g.Key + if g.KeyType != "" && g.KeyType != "message-id" { + label = fmt.Sprintf("%s (%s)", g.Key, g.KeyType) + } + fmt.Fprintf(&sb, "\n Group %d: %s\n", i+1, label) + for j, m := range 
g.Messages { + marker := " " + if j == g.Survivor { + marker = "* " + } + sent := "" + if m.IsSentCopy() { + sent = " [sent]" + } + fmt.Fprintf(&sb, + " %s[%s:%s]%s %s "+ + "(labels: %d, raw: %v)\n", + marker, m.SourceType, m.SourceIdentifier, + sent, m.Subject, m.LabelCount, m.HasRawMIME, + ) + } + } + } + + return sb.String() +} + +// FormatMethodology returns a detailed explanation of how dedup works. +func (e *Engine) FormatMethodology() string { + var sb strings.Builder + sb.WriteString("\n=== Deduplication Methodology ===\n\n") + + sb.WriteString("Scope:\n") + if e.config.Account != "" { + fmt.Fprintf(&sb, + " Scoped to account: %s (%d source(s)). "+ + "Cross-source dedup is enabled\n"+ + " within this account.\n", + e.config.Account, len(e.config.AccountSourceIDs)) + } else if len(e.config.AccountSourceIDs) > 0 { + fmt.Fprintf(&sb, + " Scoped to %d source(s). Cross-source dedup is "+ + "enabled within that set.\n", + len(e.config.AccountSourceIDs)) + } else { + sb.WriteString( + " No account specified — only messages that appear " + + "twice in the\n" + + " SAME source are eligible. Rerun with " + + "--account to dedup\n" + + " across sources that belong to one mailbox " + + "(e.g. 
Gmail sync +\n" + + " mbox import of the same account).\n", + ) + } + sb.WriteString("\n") + + sb.WriteString("Detection:\n") + sb.WriteString(" Messages are grouped by the RFC822 Message-ID " + + "header.\n") + sb.WriteString(" Messages missing that header are backfilled " + + "from stored MIME\n") + sb.WriteString(" before the scan runs.") + if e.config.ContentHashFallback { + sb.WriteString(" Messages still without an ID are then compared via\n") + sb.WriteString(" a normalized raw-MIME hash that strips transport " + + "headers such as\n") + sb.WriteString(" Received, Delivered-To, X-Gmail-Labels, and " + + "DKIM/ARC traces.\n") + sb.WriteString(" The hash is byte-sensitive below the header " + + "boundary, so two\n") + sb.WriteString(" messages whose bodies differ only in line-ending " + + "style (CRLF vs LF)\n") + sb.WriteString(" will not match via content-hash.\n\n") + } else { + sb.WriteString(" Messages still without an ID are ignored.\n\n") + } + + sb.WriteString("Survivor selection:\n") + for i, st := range e.config.SourcePreference { + fmt.Fprintf(&sb, " %d. %s\n", i+1, st) + } + sb.WriteString(" Tiebreakers: has raw MIME > more labels > " + + "earlier archived_at > lower id.\n\n") + + sb.WriteString("Sent messages:\n") + sb.WriteString( + " Dedup NEVER merges messages across different " + + "accounts. A message that\n" + + " alice sent to bob is two distinct mailbox " + + "copies — one in alice's\n" + + " Sent folder and one in bob's Inbox. 
Both are " + + "preserved independently\n" + + " because deleting either would alter the other " + + "user's archive.\n\n", + ) + + sb.WriteString("Merge behaviour:\n") + sb.WriteString(" - Labels from every copy are unioned onto " + + "the survivor.\n") + sb.WriteString(" - Raw MIME is backfilled onto the survivor " + + "if it lacks it.\n") + sb.WriteString(" - Only raw MIME is backfilled; parsed " + + "message_bodies are not.\n") + sb.WriteString(" If a survivor is missing text for display, run\n") + sb.WriteString(" 'msgvault repair-encoding' or " + + "'msgvault build-cache --full-rebuild'.\n") + sb.WriteString(" - Pruned duplicates are hidden in the msgvault " + + "database (reversible via --undo).\n") + sb.WriteString(" - Remote mailboxes (Gmail, IMAP) are NEVER " + + "modified by default.\n") + + return sb.String() +} + +func sourcePairKey(msgs []DuplicateMessage) string { + types := make(map[string]bool) + for _, m := range msgs { + types[m.SourceType] = true + } + sorted := make([]string, 0, len(types)) + for t := range types { + sorted = append(sorted, t) + } + sort.Strings(sorted) + return strings.Join(sorted, "+") +} diff --git a/internal/dedup/dedup_test.go b/internal/dedup/dedup_test.go new file mode 100644 index 00000000..3baf2f33 --- /dev/null +++ b/internal/dedup/dedup_test.go @@ -0,0 +1,474 @@ +package dedup_test + +import ( + "context" + "database/sql" + "path/filepath" + "strings" + "testing" + + "github.com/wesm/msgvault/internal/dedup" + "github.com/wesm/msgvault/internal/deletion" + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/testutil" + "github.com/wesm/msgvault/internal/testutil/storetest" +) + +func addMessage( + t *testing.T, + st *store.Store, + source *store.Source, + srcMsgID, rfc822ID string, + fromMe bool, +) int64 { + t.Helper() + convID, err := st.EnsureConversation( + source.ID, "thread-"+srcMsgID, "Subject", + ) + testutil.MustNoErr(t, err, "EnsureConversation") + id, err := 
st.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: source.ID, + SourceMessageID: srcMsgID, + RFC822MessageID: sql.NullString{ + String: rfc822ID, Valid: rfc822ID != "", + }, + MessageType: "email", + IsFromMe: fromMe, + SizeEstimate: 1000, + }) + testutil.MustNoErr(t, err, "UpsertMessage") + return id +} + +func assertSoftDeleted( + t *testing.T, st *store.Store, msgID int64, wantDeleted bool, +) { + t.Helper() + var deletedAt sql.NullTime + err := st.DB().QueryRow( + "SELECT deleted_at FROM messages WHERE id = ?", msgID, + ).Scan(&deletedAt) + testutil.MustNoErr(t, err, "query deleted_at") + if wantDeleted && !deletedAt.Valid { + t.Errorf("message %d: deleted_at should be set", msgID) + } + if !wantDeleted && deletedAt.Valid { + t.Errorf("message %d: deleted_at should be NULL", msgID) + } +} + +func linkLabel( + t *testing.T, + st *store.Store, + sourceID, msgID int64, + sourceLabelID, name, typ string, +) { + t.Helper() + lid, err := st.EnsureLabel(sourceID, sourceLabelID, name, typ) + testutil.MustNoErr(t, err, "EnsureLabel "+sourceLabelID) + testutil.MustNoErr(t, + st.LinkMessageLabel(msgID, lid), + "LinkMessageLabel "+sourceLabelID, + ) +} + +func TestEngine_Scan_UnionsLabelsOnSurvivor(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + mbox, err := st.GetOrCreateSource("mbox", "test@example.com-mbox") + testutil.MustNoErr(t, err, "GetOrCreateSource mbox") + + idGmail := addMessage(t, st, gmail, "gmail-1", "rfc-union", false) + idMbox := addMessage(t, st, mbox, "mbox-1", "rfc-union", false) + + linkLabel(t, st, gmail.ID, idGmail, "INBOX", "Inbox", "system") + linkLabel(t, st, mbox.ID, idMbox, "Archive", "Archive", "user") + linkLabel(t, st, mbox.ID, idMbox, "Work", "Work", "user") + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID, mbox.ID}, + Account: "test@example.com", + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if 
report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + if report.DuplicateMessages != 1 { + t.Fatalf("prune count = %d, want 1", report.DuplicateMessages) + } + + group := report.Groups[0] + survivor := group.Messages[group.Survivor] + if survivor.ID != idGmail { + t.Errorf("survivor = %d, want %d (gmail)", survivor.ID, idGmail) + } + + summary, err := eng.Execute( + context.Background(), report, "batch-union", + ) + testutil.MustNoErr(t, err, "Execute") + if summary.GroupsMerged != 1 { + t.Errorf("groupsMerged = %d, want 1", summary.GroupsMerged) + } + + f.AssertLabelCount(idGmail, 3) + assertSoftDeleted(t, st, idMbox, true) +} + +func TestEngine_Scan_RejectsEmptyAccountSourceIDs(t *testing.T) { + f := storetest.New(t) + st := f.Store + + cases := []struct { + name string + ids []int64 + }{ + {"nil", nil}, + {"empty slice", []int64{}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: tc.ids, + }, nil) + _, err := eng.Scan(context.Background()) + if err == nil { + t.Fatal("expected error for empty AccountSourceIDs") + } + if !strings.Contains(err.Error(), "AccountSourceIDs must be non-empty") { + t.Errorf("unexpected error: %v", err) + } + }) + } +} + +func TestEngine_SurvivorFavorsSentCopy(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + idInbox := addMessage(t, st, gmail, "inbox-sent", "rfc-sent", false) + idSent := addMessage(t, st, gmail, "sent-sent", "rfc-sent", true) + + linkLabel(t, st, gmail.ID, idInbox, "INBOX", "Inbox", "system") + linkLabel(t, st, gmail.ID, idSent, "SENT", "Sent", "system") + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID}, + Account: "test@example.com", + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + + 
group := report.Groups[0] + survivor := group.Messages[group.Survivor] + if survivor.ID != idSent { + t.Errorf("survivor = %d, want sent copy %d", + survivor.ID, idSent) + } + if !survivor.IsSentCopy() { + t.Errorf("survivor should be a sent copy") + } +} + +func TestEngine_DefaultConfig_NeverStagesRemote(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + _ = addMessage(t, st, gmail, "g-1", "rfc-default", false) + _ = addMessage(t, st, gmail, "g-2", "rfc-default", false) + + deletionsDir := filepath.Join(t.TempDir(), "deletions") + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID}, + Account: "test@example.com", + DeletionsDir: deletionsDir, + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + summary, err := eng.Execute( + context.Background(), report, "batch-default", + ) + testutil.MustNoErr(t, err, "Execute") + + if summary.MessagesRemoved != 1 { + t.Errorf("messagesRemoved = %d, want 1", summary.MessagesRemoved) + } + if len(summary.StagedManifests) != 0 { + t.Errorf("stagedManifests = %d, want 0", len(summary.StagedManifests)) + } + + mgr, err := deletion.NewManager(deletionsDir) + testutil.MustNoErr(t, err, "NewManager") + pending, err := mgr.ListPending() + testutil.MustNoErr(t, err, "ListPending") + if len(pending) != 0 { + t.Errorf("pending manifests = %d, want 0", len(pending)) + } +} + +func TestEngine_OptIn_StagesOnlyWithinSameSourceID(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + otherGmail, err := st.GetOrCreateSource("gmail", "other@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource otherGmail") + mbox, err := st.GetOrCreateSource("mbox", "local.mbox") + testutil.MustNoErr(t, err, "GetOrCreateSource mbox") + + idWinner := addMessage(t, st, gmail, "g-1", "rfc-opt", false) + idLoser := addMessage(t, st, gmail, "g-2", "rfc-opt", false) + idOther := addMessage(t, st, otherGmail, "g-3", "rfc-opt", false) + 
idMbox := addMessage(t, st, mbox, "m-1", "rfc-opt", false) + + deletionsDir := filepath.Join(t.TempDir(), "deletions") + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID, otherGmail.ID, mbox.ID}, + Account: "pile", + DeleteDupsFromSourceServer: true, + DeletionsDir: deletionsDir, + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + summary, err := eng.Execute( + context.Background(), report, "batch-opt", + ) + testutil.MustNoErr(t, err, "Execute") + + if summary.MessagesRemoved != 3 { + t.Errorf("messagesRemoved = %d, want 3", summary.MessagesRemoved) + } + assertSoftDeleted(t, st, idWinner, false) + assertSoftDeleted(t, st, idLoser, true) + assertSoftDeleted(t, st, idOther, true) + assertSoftDeleted(t, st, idMbox, true) + + if len(summary.StagedManifests) != 1 { + t.Fatalf("stagedManifests = %d, want 1", len(summary.StagedManifests)) + } + sm := summary.StagedManifests[0] + if sm.Account != "test@example.com" { + t.Errorf("staged account = %q, want test@example.com", sm.Account) + } + if sm.MessageCount != 1 { + t.Errorf("staged count = %d, want 1", sm.MessageCount) + } + + mgr, err := deletion.NewManager(deletionsDir) + testutil.MustNoErr(t, err, "NewManager") + pending, err := mgr.ListPending() + testutil.MustNoErr(t, err, "ListPending") + if len(pending) != 1 { + t.Fatalf("pending = %d, want 1", len(pending)) + } + if len(pending[0].GmailIDs) != 1 || pending[0].GmailIDs[0] != "g-2" { + t.Errorf("manifest GmailIDs = %v, want [g-2]", pending[0].GmailIDs) + } + + restored, stillExec, err := eng.Undo("batch-opt") + testutil.MustNoErr(t, err, "Undo") + if restored != 3 { + t.Errorf("restored = %d, want 3", restored) + } + if len(stillExec) != 0 { + t.Errorf("stillExec = %v, want empty", stillExec) + } + pending, err = mgr.ListPending() + testutil.MustNoErr(t, err, "ListPending after undo") + if len(pending) != 0 { + t.Errorf("pending after undo = %d, want 0", len(pending)) + } +} + +func 
TestEngine_ScopedToSingleSource_IgnoresCrossAccount(t *testing.T) { + f := storetest.New(t) + st := f.Store + alice := f.Source + + bob, err := st.GetOrCreateSource("gmail", "bob@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource bob") + + addMessage(t, st, alice, "a-1", "rfc-cross", true) + addMessage(t, st, bob, "b-1", "rfc-cross", false) + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{alice.ID}, + Account: "test@example.com", + }, nil) + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 0 { + t.Errorf("cross-account dedup happened: groups = %d", + report.DuplicateGroups) + } +} + +func TestEngine_ContentHashFallbackFindsNormalizedDuplicates(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + mbox, err := st.GetOrCreateSource("mbox", "test@example.com-mbox") + testutil.MustNoErr(t, err, "GetOrCreateSource mbox") + + id1 := addMessage(t, st, gmail, "hash-1", "", false) + id2 := addMessage(t, st, mbox, "hash-2", "", false) + + raw1 := []byte("Received: from mx1.google.com\r\nDelivered-To: one@example.com\r\nX-Gmail-Labels: INBOX\r\nFrom: sender@example.com\r\nSubject: Meeting tomorrow\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet tomorrow at 3pm.") + raw2 := []byte("Received: from mx2.google.com\r\nDelivered-To: two@example.com\r\nX-Gmail-Labels: SENT\r\nAuthentication-Results: spf=pass\r\nFrom: sender@example.com\r\nSubject: Meeting tomorrow\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet tomorrow at 3pm.") + testutil.MustNoErr(t, st.UpsertMessageRaw(id1, raw1), "UpsertMessageRaw id1") + testutil.MustNoErr(t, st.UpsertMessageRaw(id2, raw2), "UpsertMessageRaw id2") + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID, mbox.ID}, + Account: "test@example.com", + ContentHashFallback: true, + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if 
report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + if report.ContentHashGroups != 1 { + t.Fatalf("contentHashGroups = %d, want 1", report.ContentHashGroups) + } + if got := report.Groups[0].KeyType; got != "normalized-hash" { + t.Fatalf("keyType = %q, want normalized-hash", got) + } +} + +func TestEngine_ContentHashFallbackDisabledByDefault(t *testing.T) { + f := storetest.New(t) + st := f.Store + gmail := f.Source + + mbox, err := st.GetOrCreateSource("mbox", "test@example.com-mbox") + testutil.MustNoErr(t, err, "GetOrCreateSource mbox") + + id1 := addMessage(t, st, gmail, "hash-off-1", "", false) + id2 := addMessage(t, st, mbox, "hash-off-2", "", false) + raw := []byte("Subject: No Message-ID\r\n\r\nIdentical body") + testutil.MustNoErr(t, st.UpsertMessageRaw(id1, raw), "UpsertMessageRaw id1") + testutil.MustNoErr(t, st.UpsertMessageRaw(id2, raw), "UpsertMessageRaw id2") + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{gmail.ID, mbox.ID}, + Account: "test@example.com", + }, nil) + + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 0 { + t.Fatalf("groups = %d, want 0", report.DuplicateGroups) + } +} + +func TestEngine_FormatMethodology_MentionsSentPolicy(t *testing.T) { + f := storetest.New(t) + eng := dedup.NewEngine(f.Store, dedup.Config{ + Account: "test@example.com", + AccountSourceIDs: []int64{f.Source.ID}, + }, nil) + out := eng.FormatMethodology() + if !strings.Contains( + strings.ToLower(out), + "never merges messages across different", + ) { + t.Errorf("methodology missing cross-account guarantee") + } +} + +func TestEngine_SurvivorTiebreakers(t *testing.T) { + t.Run("raw MIME wins over no raw MIME", func(t *testing.T) { + f := storetest.New(t) + st := f.Store + + idNoRaw := addMessage(t, st, f.Source, "no-raw", "rfc-raw-tie", false) + idHasRaw := addMessage(t, st, f.Source, "has-raw", "rfc-raw-tie", false) + 
testutil.MustNoErr(t, + st.UpsertMessageRaw(idHasRaw, []byte("Subject: test\r\n\r\nBody")), + "UpsertMessageRaw", + ) + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{f.Source.ID}, + Account: "test", + }, nil) + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + survivor := report.Groups[0].Messages[report.Groups[0].Survivor] + if survivor.ID != idHasRaw { + t.Errorf("survivor = %d, want %d (has raw)", survivor.ID, idHasRaw) + } + _ = idNoRaw + }) + + t.Run("more labels wins when raw MIME is equal", func(t *testing.T) { + f := storetest.New(t) + st := f.Store + + idFew := addMessage(t, st, f.Source, "few", "rfc-label-tie", false) + idMany := addMessage(t, st, f.Source, "many", "rfc-label-tie", false) + + lid1, _ := st.EnsureLabel(f.Source.ID, "L1", "Label1", "user") + lid2, _ := st.EnsureLabel(f.Source.ID, "L2", "Label2", "user") + lid3, _ := st.EnsureLabel(f.Source.ID, "L3", "Label3", "user") + _ = st.LinkMessageLabel(idFew, lid1) + _ = st.LinkMessageLabel(idMany, lid1) + _ = st.LinkMessageLabel(idMany, lid2) + _ = st.LinkMessageLabel(idMany, lid3) + + eng := dedup.NewEngine(st, dedup.Config{ + AccountSourceIDs: []int64{f.Source.ID}, + Account: "test", + }, nil) + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + survivor := report.Groups[0].Messages[report.Groups[0].Survivor] + if survivor.ID != idMany { + t.Errorf("survivor = %d, want %d (more labels)", survivor.ID, idMany) + } + }) + + t.Run("lower ID wins as final tiebreaker", func(t *testing.T) { + f := storetest.New(t) + st := f.Store + + idFirst := addMessage(t, st, f.Source, "first", "rfc-id-tie", false) + _ = addMessage(t, st, f.Source, "second", "rfc-id-tie", false) + + eng := dedup.NewEngine(st, dedup.Config{ + 
AccountSourceIDs: []int64{f.Source.ID}, + Account: "test", + }, nil) + report, err := eng.Scan(context.Background()) + testutil.MustNoErr(t, err, "Scan") + if report.DuplicateGroups != 1 { + t.Fatalf("groups = %d, want 1", report.DuplicateGroups) + } + survivor := report.Groups[0].Messages[report.Groups[0].Survivor] + if survivor.ID != idFirst { + t.Errorf("survivor = %d, want %d (lower ID)", survivor.ID, idFirst) + } + }) +} diff --git a/internal/dedup/normalize_test.go b/internal/dedup/normalize_test.go new file mode 100644 index 00000000..e79609aa --- /dev/null +++ b/internal/dedup/normalize_test.go @@ -0,0 +1,97 @@ +package dedup + +import ( + "bytes" + "testing" +) + +func TestNormalizeRawMIME(t *testing.T) { + tests := []struct { + name string + input []byte + wantSame bool // true if output should equal input + contains string // substring the output must contain + excludes string // substring the output must NOT contain + }{ + { + name: "strips Received header (CRLF)", + input: []byte("Received: from mx1.google.com\r\nFrom: alice@example.com\r\nSubject: Hi\r\n\r\nBody"), + contains: "From: alice@example.com", + excludes: "Received", + }, + { + name: "strips multiple transport headers", + input: []byte("Delivered-To: bob@example.com\r\nX-Gmail-Labels: INBOX\r\nAuthentication-Results: spf=pass\r\nFrom: alice@example.com\r\nSubject: Test\r\n\r\nBody"), + contains: "From: alice@example.com", + excludes: "Delivered-To", + }, + { + name: "preserves non-transport headers", + input: []byte("From: alice@example.com\r\nTo: bob@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nBody text"), + contains: "Subject: Meeting", + }, + { + name: "handles LF-only line endings", + input: []byte("Received: from mx1\nFrom: alice@example.com\nSubject: Test\n\nBody with LF"), + contains: "From: alice@example.com", + excludes: "Received", + }, + { + name: "no header/body separator returns raw unchanged", + input: []byte("This is just a blob of text with 
no headers"), + wantSame: true, + }, + { + name: "empty body preserved", + input: []byte("From: alice@example.com\r\nSubject: Empty\r\n\r\n"), + contains: "Subject: Empty", + }, + { + name: "preserves body content exactly", + input: []byte("Received: from mx1\r\nFrom: a@b.com\r\n\r\nExact body content here."), + contains: "Exact body content here.", + }, + { + name: "LF headers with CRLF in body uses earliest boundary", + input: []byte("From: a@b.com\nSubject: Test\n\nBody has \r\n\r\n inside"), + contains: "Body has \r\n\r\n inside", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + inputCopy := make([]byte, len(tt.input)) + copy(inputCopy, tt.input) + + result := normalizeRawMIME(tt.input) + + if !bytes.Equal(tt.input, inputCopy) { + t.Error("normalizeRawMIME mutated its input buffer") + } + + if tt.wantSame { + if !bytes.Equal(result, tt.input) { + t.Errorf("expected unchanged output, got:\n%s", result) + } + return + } + if tt.contains != "" && !bytes.Contains(result, []byte(tt.contains)) { + t.Errorf("output missing %q:\n%s", tt.contains, result) + } + if tt.excludes != "" && bytes.Contains(result, []byte(tt.excludes)) { + t.Errorf("output should not contain %q:\n%s", tt.excludes, result) + } + }) + } +} + +func TestNormalizeRawMIME_DeterministicOutput(t *testing.T) { + raw1 := []byte("Received: from mx1.google.com\r\nFrom: sender@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet at 3pm.") + raw2 := []byte("Received: from mx2.google.com\r\nDelivered-To: other@example.com\r\nFrom: sender@example.com\r\nSubject: Meeting\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\n\r\nLet's meet at 3pm.") + + hash1 := sha256Hex(normalizeRawMIME(raw1)) + hash2 := sha256Hex(normalizeRawMIME(raw2)) + if hash1 != hash2 { + t.Errorf("same message with different transport headers produced different hashes") + } +} diff --git a/internal/logging/logging.go b/internal/logging/logging.go index 623abc47..02a5f111 100644 
--- a/internal/logging/logging.go +++ b/internal/logging/logging.go @@ -211,10 +211,17 @@ func BuildHandler(opts Options) (*Result, error) { _ = f.Close() }) default: + target := path + if target == "" { + target = opts.FilePath + } + if target == "" { + target = opts.LogsDir + } _, _ = fmt.Fprintf(stderr, "warning: could not open msgvault log file in %s: %v "+ "(continuing with stderr-only logging)\n", - opts.LogsDir, err, + target, err, ) } } diff --git a/internal/query/duckdb.go b/internal/query/duckdb.go index 08981f34..b69525ce 100644 --- a/internal/query/duckdb.go +++ b/internal/query/duckdb.go @@ -297,6 +297,11 @@ func (e *DuckDBEngine) parquetCTEs() string { } else { msgExtra = append(msgExtra, "'' AS message_type") } + if e.hasCol("messages", "deleted_at") { + msgReplace = append(msgReplace, "TRY_CAST(deleted_at AS TIMESTAMP) AS deleted_at") + } else { + msgExtra = append(msgExtra, "NULL::TIMESTAMP AS deleted_at") + } msgCTE := fmt.Sprintf("SELECT * REPLACE (\n\t\t\t\t%s\n\t\t\t)", strings.Join(msgReplace, ",\n\t\t\t\t")) if len(msgExtra) > 0 { msgCTE += ", " + strings.Join(msgExtra, ", ") @@ -648,10 +653,8 @@ func (e *DuckDBEngine) buildWhereClause(opts AggregateOptions, keyColumns ...str // message_type IS NULL and '' handle old data without the column. conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") - if opts.SourceID != nil { - conditions = append(conditions, "msg.source_id = ?") - args = append(args, *opts.SourceID) - } + conditions = append(conditions, "msg.deleted_at IS NULL") + conditions, args = appendSourceFilter(conditions, args, "msg.", opts.SourceID, opts.SourceIDs) if opts.After != nil { conditions = append(conditions, "msg.sent_at >= CAST(? AS TIMESTAMP)") @@ -854,10 +857,8 @@ func (e *DuckDBEngine) buildFilterConditions(filter MessageFilter) (string, []in // message_type IS NULL and '' handle old data without the column. 
conditions = append(conditions, "(msg.message_type = 'email' OR msg.message_type IS NULL OR msg.message_type = '')") - if filter.SourceID != nil { - conditions = append(conditions, "msg.source_id = ?") - args = append(args, *filter.SourceID) - } + conditions = append(conditions, "msg.deleted_at IS NULL") + conditions, args = appendSourceFilter(conditions, args, "msg.", filter.SourceID, filter.SourceIDs) if filter.ConversationID != nil { conditions = append(conditions, "msg.conversation_id = ?") @@ -1117,10 +1118,8 @@ func (e *DuckDBEngine) GetTotalStats(ctx context.Context, opts StatsOptions) (*T // Restrict to email messages only; NULL and '' handle pre-message_type data. conditions = append(conditions, emailOnlyFilterMsg) - if opts.SourceID != nil { - conditions = append(conditions, "msg.source_id = ?") - args = append(args, *opts.SourceID) - } + conditions = append(conditions, "msg.deleted_at IS NULL") + conditions, args = appendSourceFilter(conditions, args, "msg.", opts.SourceID, opts.SourceIDs) if opts.WithAttachmentsOnly { conditions = append(conditions, "msg.has_attachments = 1") @@ -1470,7 +1469,8 @@ func (e *DuckDBEngine) Search(ctx context.Context, q *search.Query, limit, offse var args []interface{} var joins []string - // Include all messages (deleted messages shown with indicator in TUI) + // Exclude rows soft-deleted by deduplicate (sqlite_scan path). + conditions = append(conditions, "m.deleted_at IS NULL") // From filter if len(q.FromAddrs) > 0 { @@ -1670,16 +1670,9 @@ func (e *DuckDBEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi var args []interface{} // Always exclude deleted messages + conditions = append(conditions, "msg.deleted_at IS NULL") conditions = append(conditions, "msg.deleted_from_source_at IS NULL") - - // Gmail scoping is handled by JOIN src in the query below — this function - // is used for Gmail-specific deletion/staging workflows and must not - // return WhatsApp or other source IDs. 
- - if filter.SourceID != nil { - conditions = append(conditions, "msg.source_id = ?") - args = append(args, *filter.SourceID) - } + conditions, args = appendSourceFilter(conditions, args, "msg.", filter.SourceID, filter.SourceIDs) // Use EXISTS subqueries for filtering (becomes semi-joins, no duplicates) if filter.Sender != "" { @@ -2319,10 +2312,8 @@ func (e *DuckDBEngine) buildSearchConditions(q *search.Query, filter MessageFilt conditions = append(conditions, emailOnlyFilterMsg) // Apply basic filter conditions (ignoring join flags for search - we handle those differently) - if filter.SourceID != nil { - conditions = append(conditions, "msg.source_id = ?") - args = append(args, *filter.SourceID) - } + conditions = append(conditions, "msg.deleted_at IS NULL") + conditions, args = appendSourceFilter(conditions, args, "msg.", filter.SourceID, filter.SourceIDs) if filter.After != nil { conditions = append(conditions, "msg.sent_at >= CAST(? AS TIMESTAMP)") args = append(args, filter.After.Format("2006-01-02 15:04:05")) diff --git a/internal/query/models.go b/internal/query/models.go index 06d07691..f21454fc 100644 --- a/internal/query/models.go +++ b/internal/query/models.go @@ -211,7 +211,8 @@ type MessageFilter struct { TimeRange TimeRange // Account filter - SourceID *int64 // nil means all accounts + SourceID *int64 // nil means all accounts + SourceIDs []int64 // multi-source filter (collections); overrides SourceID // Date range After *time.Time @@ -281,13 +282,17 @@ func (f MessageFilter) Clone() MessageFilter { clone.EmptyValueTargets[k] = v } } + if f.SourceIDs != nil { + clone.SourceIDs = append([]int64(nil), f.SourceIDs...) + } return clone } // AggregateOptions configures an aggregate query. 
type AggregateOptions struct { // Account filter - SourceID *int64 // nil means all accounts + SourceID *int64 // nil means all accounts + SourceIDs []int64 // multi-source filter (collections) // Date range After *time.Time @@ -332,6 +337,7 @@ type AccountInfo struct { // StatsOptions configures a stats query. type StatsOptions struct { SourceID *int64 // nil means all accounts + SourceIDs []int64 // multi-source filter (collections) WithAttachmentsOnly bool // only count messages with attachments HideDeletedFromSource bool // exclude messages where deleted_from_source_at IS NOT NULL SearchQuery string // when set, stats reflect only messages matching this search diff --git a/internal/query/source_filter.go b/internal/query/source_filter.go new file mode 100644 index 00000000..501419c6 --- /dev/null +++ b/internal/query/source_filter.go @@ -0,0 +1,37 @@ +package query + +import ( + "fmt" + "strings" +) + +// appendSourceFilter returns conditions/args updated with a source-id +// filter drawn from either SourceIDs (multi) or SourceID (single). +// SourceIDs takes precedence when both are provided. A non-nil but +// empty multiIDs slice produces a 1=0 (match-nothing) condition. +func appendSourceFilter( + conditions []string, args []any, + prefix string, singleID *int64, multiIDs []int64, +) ([]string, []any) { + if multiIDs != nil && len(multiIDs) == 0 { + conditions = append(conditions, "1=0") + return conditions, args + } + if len(multiIDs) > 0 { + placeholders := make([]string, len(multiIDs)) + for i, id := range multiIDs { + placeholders[i] = "?" 
+ args = append(args, id) + } + conditions = append(conditions, fmt.Sprintf( + "%ssource_id IN (%s)", + prefix, strings.Join(placeholders, ","), + )) + return conditions, args + } + if singleID != nil { + conditions = append(conditions, prefix+"source_id = ?") + args = append(args, *singleID) + } + return conditions, args +} diff --git a/internal/query/source_filter_test.go b/internal/query/source_filter_test.go new file mode 100644 index 00000000..3887fd78 --- /dev/null +++ b/internal/query/source_filter_test.go @@ -0,0 +1,112 @@ +package query + +import ( + "testing" +) + +func TestAppendSourceFilter(t *testing.T) { + id42 := int64(42) + + tests := []struct { + name string + singleID *int64 + multiIDs []int64 + prefix string + wantConditions int + wantArgs int + wantCondition string + }{ + { + name: "neither single nor multi", + singleID: nil, + multiIDs: nil, + prefix: "m.", + wantConditions: 0, + wantArgs: 0, + }, + { + name: "single ID", + singleID: &id42, + multiIDs: nil, + prefix: "m.", + wantConditions: 1, + wantArgs: 1, + wantCondition: "m.source_id = ?", + }, + { + name: "empty multi IDs matches nothing", + singleID: nil, + multiIDs: []int64{}, + prefix: "m.", + wantConditions: 1, + wantArgs: 0, + wantCondition: "1=0", + }, + { + name: "empty multi IDs overrides singleID", + singleID: &id42, + multiIDs: []int64{}, + prefix: "m.", + wantConditions: 1, + wantArgs: 0, + wantCondition: "1=0", + }, + { + name: "single multi ID", + singleID: nil, + multiIDs: []int64{7}, + prefix: "m.", + wantConditions: 1, + wantArgs: 1, + wantCondition: "m.source_id IN (?)", + }, + { + name: "multi IDs", + singleID: nil, + multiIDs: []int64{1, 2, 3}, + prefix: "msg.", + wantConditions: 1, + wantArgs: 3, + wantCondition: "msg.source_id IN (?,?,?)", + }, + { + name: "multi IDs take precedence over single", + singleID: &id42, + multiIDs: []int64{10, 20}, + prefix: "", + wantConditions: 1, + wantArgs: 2, + wantCondition: "source_id IN (?,?)", + }, + { + name: "empty prefix", + 
singleID: &id42, + multiIDs: nil, + prefix: "", + wantConditions: 1, + wantArgs: 1, + wantCondition: "source_id = ?", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + conditions, args := appendSourceFilter( + nil, nil, tt.prefix, tt.singleID, tt.multiIDs, + ) + if len(conditions) != tt.wantConditions { + t.Errorf("conditions = %d, want %d: %v", + len(conditions), tt.wantConditions, conditions) + } + if len(args) != tt.wantArgs { + t.Errorf("args = %d, want %d", len(args), tt.wantArgs) + } + if tt.wantCondition != "" && len(conditions) > 0 { + if conditions[0] != tt.wantCondition { + t.Errorf("condition = %q, want %q", + conditions[0], tt.wantCondition) + } + } + }) + } +} diff --git a/internal/query/sqlite.go b/internal/query/sqlite.go index 54977fbc..1a46ac5c 100644 --- a/internal/query/sqlite.go +++ b/internal/query/sqlite.go @@ -189,10 +189,12 @@ func optsToFilterConditions(opts AggregateOptions, prefix string) ([]string, []i // message_type IS NULL and '' handle old data without the column. conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") - if opts.SourceID != nil { - conditions = append(conditions, prefix+"source_id = ?") - args = append(args, *opts.SourceID) - } + // Always exclude rows soft-deleted by deduplicate. + conditions = append(conditions, prefix+"deleted_at IS NULL") + + conditions, args = appendSourceFilter( + conditions, args, prefix, opts.SourceID, opts.SourceIDs, + ) if opts.After != nil { conditions = append(conditions, prefix+"sent_at >= ?") args = append(args, opts.After.Format("2006-01-02 15:04:05")) @@ -261,10 +263,12 @@ func buildFilterJoinsAndConditions(filter MessageFilter, tableAlias string) (str // message_type IS NULL and '' handle old data without the column. 
conditions = append(conditions, "("+prefix+"message_type = 'email' OR "+prefix+"message_type IS NULL OR "+prefix+"message_type = '')") - if filter.SourceID != nil { - conditions = append(conditions, prefix+"source_id = ?") - args = append(args, *filter.SourceID) - } + // Always exclude rows soft-deleted by deduplicate. + conditions = append(conditions, prefix+"deleted_at IS NULL") + + conditions, args = appendSourceFilter( + conditions, args, prefix, filter.SourceID, filter.SourceIDs, + ) if filter.ConversationID != nil { conditions = append(conditions, prefix+"conversation_id = ?") @@ -881,11 +885,11 @@ func (e *SQLiteEngine) GetTotalStats(ctx context.Context, opts StatsOptions) (*T var args []interface{} // Restrict to email messages only; NULL and '' handle pre-message_type data. conditions = append(conditions, emailOnlyFilterM) - // Include all messages (deleted messages shown with indicator in TUI) - if opts.SourceID != nil { - conditions = append(conditions, "m.source_id = ?") - args = append(args, *opts.SourceID) - } + // Exclude rows soft-deleted by deduplicate. + conditions = append(conditions, "m.deleted_at IS NULL") + conditions, args = appendSourceFilter( + conditions, args, "m.", opts.SourceID, opts.SourceIDs, + ) if opts.WithAttachmentsOnly { conditions = append(conditions, "m.has_attachments = 1") } @@ -999,13 +1003,11 @@ func (e *SQLiteEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi var conditions []string var args []interface{} - // Always exclude deleted messages + // Exclude remote-deleted and dedup-soft-deleted messages. 
conditions = append(conditions, "m.deleted_from_source_at IS NULL") + conditions = append(conditions, "m.deleted_at IS NULL") - if filter.SourceID != nil { - conditions = append(conditions, "m.source_id = ?") - args = append(args, *filter.SourceID) - } + conditions, args = appendSourceFilter(conditions, args, "m.", filter.SourceID, filter.SourceIDs) // Build JOIN clauses based on filter type var joins []string @@ -1139,6 +1141,8 @@ func (e *SQLiteEngine) GetGmailIDsByFilter(ctx context.Context, filter MessageFi func (e *SQLiteEngine) buildSearchQueryParts(ctx context.Context, q *search.Query) (conditions []string, args []interface{}, joins []string, ftsJoin string) { // Restrict to email messages only; NULL and '' handle pre-message_type data. conditions = append(conditions, emailOnlyFilterM) + // Exclude rows soft-deleted by deduplicate. + conditions = append(conditions, "m.deleted_at IS NULL") // From filter - uses EXISTS to avoid join multiplication in aggregates. // Handles both exact addresses and @domain patterns. diff --git a/internal/store/collections.go b/internal/store/collections.go new file mode 100644 index 00000000..b6b111af --- /dev/null +++ b/internal/store/collections.go @@ -0,0 +1,410 @@ +package store + +import ( + "database/sql" + "errors" + "fmt" + "strings" + "time" +) + +// Collection is a named grouping of sources that should be treated as +// a single logical archive. +type Collection struct { + ID int64 + Name string + Description string + CreatedAt time.Time +} + +// CollectionWithSources bundles a Collection with its member source +// IDs and a message-count aggregate. +type CollectionWithSources struct { + Collection + SourceIDs []int64 + MessageCount int64 +} + +// ErrCollectionNotFound is returned when a collection lookup has no hits. +var ErrCollectionNotFound = errors.New("collection not found") + +// ensureCollectionSchema creates the collections tables on demand. 
+func (s *Store) ensureCollectionSchema() error { + stmts := []string{ + `CREATE TABLE IF NOT EXISTS collections ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + description TEXT NOT NULL DEFAULT '', + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP + )`, + `CREATE TABLE IF NOT EXISTS collection_sources ( + collection_id INTEGER NOT NULL REFERENCES collections(id) ON DELETE CASCADE, + source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE, + PRIMARY KEY (collection_id, source_id) + )`, + `CREATE INDEX IF NOT EXISTS idx_collection_sources_source_id + ON collection_sources(source_id)`, + } + for _, stmt := range stmts { + if _, err := s.db.Exec(stmt); err != nil { + return fmt.Errorf("create collections schema: %w", err) + } + } + return nil +} + +// EnsureDefaultCollection creates the "All" collection if it doesn't +// exist and adds all current sources to it. Safe to call on every +// schema init. +func (s *Store) EnsureDefaultCollection() error { + if err := s.ensureCollectionSchema(); err != nil { + return err + } + + var id int64 + err := s.db.QueryRow( + `SELECT id FROM collections WHERE name = 'All'`, + ).Scan(&id) + if errors.Is(err, sql.ErrNoRows) { + res, err := s.db.Exec( + `INSERT INTO collections (name, description) + VALUES ('All', 'All accounts')`, + ) + if err != nil { + return fmt.Errorf("create default collection: %w", err) + } + id, _ = res.LastInsertId() + } else if err != nil { + return fmt.Errorf("check default collection: %w", err) + } + + // Add all sources not already in it. + _, err = s.db.Exec( + `INSERT OR IGNORE INTO collection_sources (collection_id, source_id) + SELECT ?, id FROM sources`, + id, + ) + return err +} + +// CreateCollection inserts a new collection with the given name, +// description, and member source IDs. 
+func (s *Store) CreateCollection( + name, description string, sourceIDs []int64, +) (*Collection, error) { + if err := s.ensureCollectionSchema(); err != nil { + return nil, err + } + + name = strings.TrimSpace(name) + if name == "" { + return nil, fmt.Errorf("collection name is required") + } + if len(sourceIDs) == 0 { + return nil, fmt.Errorf( + "collection %q needs at least one source", name, + ) + } + + unique := uniqueInt64s(sourceIDs) + if err := s.validateSourceIDs(unique); err != nil { + return nil, err + } + + var created *Collection + err := s.withTx(func(tx *loggedTx) error { + res, err := tx.Exec( + `INSERT INTO collections (name, description) + VALUES (?, ?)`, + name, description, + ) + if err != nil { + if isSQLiteError(err, "UNIQUE constraint failed") { + return fmt.Errorf( + "collection %q already exists", name, + ) + } + return fmt.Errorf("insert collection: %w", err) + } + id, err := res.LastInsertId() + if err != nil { + return fmt.Errorf("last insert id: %w", err) + } + + for _, sid := range unique { + if _, err := tx.Exec( + `INSERT INTO collection_sources + (collection_id, source_id) + VALUES (?, ?)`, + id, sid, + ); err != nil { + return fmt.Errorf("link source %d: %w", sid, err) + } + } + + row := tx.QueryRow( + `SELECT id, name, description, created_at + FROM collections WHERE id = ?`, id, + ) + c, scanErr := scanCollection(row) + if scanErr != nil { + return scanErr + } + created = c + return nil + }) + if err != nil { + return nil, err + } + return created, nil +} + +// GetCollectionByName returns the collection with the given name and +// its member source IDs. 
+func (s *Store) GetCollectionByName( + name string, +) (*CollectionWithSources, error) { + if err := s.ensureCollectionSchema(); err != nil { + return nil, err + } + + row := s.db.QueryRow( + `SELECT id, name, description, created_at + FROM collections WHERE name = ?`, name, + ) + c, err := scanCollection(row) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrCollectionNotFound + } + return nil, err + } + return s.hydrateCollection(c) +} + +// ListCollections returns every collection with source IDs and +// message counts. +func (s *Store) ListCollections() ([]*CollectionWithSources, error) { + if err := s.ensureCollectionSchema(); err != nil { + return nil, err + } + + rows, err := s.db.Query( + `SELECT id, name, description, created_at + FROM collections ORDER BY name`, + ) + if err != nil { + return nil, fmt.Errorf("list collections: %w", err) + } + defer func() { _ = rows.Close() }() + + var collections []*Collection + for rows.Next() { + c, scanErr := scanCollection(rows) + if scanErr != nil { + return nil, scanErr + } + collections = append(collections, c) + } + if err := rows.Err(); err != nil { + return nil, err + } + + result := make([]*CollectionWithSources, 0, len(collections)) + for _, c := range collections { + hydrated, err := s.hydrateCollection(c) + if err != nil { + return nil, err + } + result = append(result, hydrated) + } + return result, nil +} + +// getCollectionID looks up a collection ID by name without hydrating. +func (s *Store) getCollectionID(name string) (int64, error) { + var id int64 + err := s.db.QueryRow( + `SELECT id FROM collections WHERE name = ?`, name, + ).Scan(&id) + if errors.Is(err, sql.ErrNoRows) { + return 0, ErrCollectionNotFound + } + if err != nil { + return 0, err + } + return id, nil +} + +// AddSourcesToCollection attaches sources to a collection. Idempotent. 
+func (s *Store) AddSourcesToCollection(name string, sourceIDs []int64) error { + if err := s.ensureCollectionSchema(); err != nil { + return err + } + if err := s.validateSourceIDs(sourceIDs); err != nil { + return err + } + collID, err := s.getCollectionID(name) + if err != nil { + return err + } + return s.withTx(func(tx *loggedTx) error { + for _, sid := range sourceIDs { + if _, err := tx.Exec( + `INSERT OR IGNORE INTO collection_sources + (collection_id, source_id) + VALUES (?, ?)`, + collID, sid, + ); err != nil { + return fmt.Errorf("add source %d: %w", sid, err) + } + } + return nil + }) +} + +// RemoveSourcesFromCollection detaches sources. Idempotent. +func (s *Store) RemoveSourcesFromCollection(name string, sourceIDs []int64) error { + if err := s.ensureCollectionSchema(); err != nil { + return err + } + if err := s.validateSourceIDs(sourceIDs); err != nil { + return err + } + collID, err := s.getCollectionID(name) + if err != nil { + return err + } + return s.withTx(func(tx *loggedTx) error { + for _, sid := range sourceIDs { + if _, err := tx.Exec( + `DELETE FROM collection_sources + WHERE collection_id = ? AND source_id = ?`, + collID, sid, + ); err != nil { + return fmt.Errorf("remove source %d: %w", sid, err) + } + } + return nil + }) +} + +// DeleteCollection drops the collection. Sources and messages untouched. +func (s *Store) DeleteCollection(name string) error { + if err := s.ensureCollectionSchema(); err != nil { + return err + } + res, err := s.db.Exec( + `DELETE FROM collections WHERE name = ?`, name, + ) + if err != nil { + return fmt.Errorf("delete collection: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return ErrCollectionNotFound + } + return nil +} + +func (s *Store) hydrateCollection( + c *Collection, +) (*CollectionWithSources, error) { + rows, err := s.db.Query( + `SELECT source_id FROM collection_sources + WHERE collection_id = ? 
+ ORDER BY source_id`, + c.ID, + ) + if err != nil { + return nil, fmt.Errorf("load sources for %s: %w", c.Name, err) + } + var sourceIDs []int64 + for rows.Next() { + var sid int64 + if err := rows.Scan(&sid); err != nil { + _ = rows.Close() + return nil, err + } + sourceIDs = append(sourceIDs, sid) + } + _ = rows.Close() + if err := rows.Err(); err != nil { + return nil, err + } + + var count int64 + if len(sourceIDs) > 0 { + count, err = s.CountActiveMessages(sourceIDs...) + if err != nil { + return nil, err + } + } + + return &CollectionWithSources{ + Collection: *c, + SourceIDs: sourceIDs, + MessageCount: count, + }, nil +} + +func scanCollection(row interface { + Scan(dest ...any) error +}) (*Collection, error) { + var c Collection + if err := row.Scan( + &c.ID, &c.Name, &c.Description, &c.CreatedAt, + ); err != nil { + return nil, err + } + return &c, nil +} + +func (s *Store) validateSourceIDs(ids []int64) error { + if len(ids) == 0 { + return nil + } + placeholders := make([]string, len(ids)) + args := make([]any, len(ids)) + for i, id := range ids { + placeholders[i] = "?" + args[i] = id + } + query := "SELECT id FROM sources WHERE id IN (" + + strings.Join(placeholders, ",") + ")" + rows, err := s.db.Query(query, args...) 
+ if err != nil { + return fmt.Errorf("validate source IDs: %w", err) + } + defer func() { _ = rows.Close() }() + found := make(map[int64]bool, len(ids)) + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + return err + } + found[id] = true + } + if err := rows.Err(); err != nil { + return err + } + for _, id := range ids { + if !found[id] { + return fmt.Errorf("source %d not found", id) + } + } + return nil +} + +func uniqueInt64s(in []int64) []int64 { + seen := make(map[int64]bool, len(in)) + out := make([]int64, 0, len(in)) + for _, v := range in { + if seen[v] { + continue + } + seen[v] = true + out = append(out, v) + } + return out +} diff --git a/internal/store/collections_test.go b/internal/store/collections_test.go new file mode 100644 index 00000000..ee13a956 --- /dev/null +++ b/internal/store/collections_test.go @@ -0,0 +1,190 @@ +package store_test + +import ( + "testing" + + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/testutil" + "github.com/wesm/msgvault/internal/testutil/storetest" +) + +func TestCollections_CRUD(t *testing.T) { + f := storetest.New(t) + st := f.Store + + src2, err := st.GetOrCreateSource("mbox", "backup@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource") + + // Create + coll, err := st.CreateCollection("work", "Work emails", []int64{f.Source.ID, src2.ID}) + testutil.MustNoErr(t, err, "CreateCollection") + if coll.Name != "work" { + t.Fatalf("name = %q, want work", coll.Name) + } + + // List — includes the auto-created "All" collection plus "work" + list, err := st.ListCollections() + testutil.MustNoErr(t, err, "ListCollections") + if len(list) != 2 { + t.Fatalf("list = %d, want 2", len(list)) + } + // Find "work" in the list and verify its sources. 
+ var workColl *store.CollectionWithSources + for _, c := range list { + if c.Name == "work" { + workColl = c + break + } + } + if workColl == nil { + t.Fatal("expected 'work' collection in list") + } + if len(workColl.SourceIDs) != 2 { + t.Fatalf("sourceIDs = %d, want 2", len(workColl.SourceIDs)) + } + + // Get by name + got, err := st.GetCollectionByName("work") + testutil.MustNoErr(t, err, "GetCollectionByName") + if got.Name != "work" { + t.Fatalf("got name = %q", got.Name) + } + + // Not found + _, err = st.GetCollectionByName("nonexistent") + if err != store.ErrCollectionNotFound { + t.Fatalf("expected ErrCollectionNotFound, got %v", err) + } + + // Duplicate name rejected + _, err = st.CreateCollection("work", "", []int64{f.Source.ID}) + if err == nil { + t.Fatal("expected error for duplicate name") + } + + // Remove source + err = st.RemoveSourcesFromCollection("work", []int64{src2.ID}) + testutil.MustNoErr(t, err, "RemoveSourcesFromCollection") + got, err = st.GetCollectionByName("work") + testutil.MustNoErr(t, err, "GetCollectionByName after remove") + if len(got.SourceIDs) != 1 { + t.Fatalf("sourceIDs after remove = %d, want 1", len(got.SourceIDs)) + } + + // Add source back + err = st.AddSourcesToCollection("work", []int64{src2.ID}) + testutil.MustNoErr(t, err, "AddSourcesToCollection") + got, err = st.GetCollectionByName("work") + testutil.MustNoErr(t, err, "GetCollectionByName after add") + if len(got.SourceIDs) != 2 { + t.Fatalf("sourceIDs after add = %d, want 2", len(got.SourceIDs)) + } + + // Delete + err = st.DeleteCollection("work") + testutil.MustNoErr(t, err, "DeleteCollection") + _, err = st.GetCollectionByName("work") + if err != store.ErrCollectionNotFound { + t.Fatalf("expected not found after delete, got %v", err) + } +} + +func TestCollections_DefaultAll(t *testing.T) { + f := storetest.New(t) + st := f.Store + + err := st.EnsureDefaultCollection() + testutil.MustNoErr(t, err, "EnsureDefaultCollection") + + coll, err := 
st.GetCollectionByName("All") + testutil.MustNoErr(t, err, "GetCollectionByName All") + if coll.Name != "All" { + t.Fatalf("name = %q, want All", coll.Name) + } + // Should include the fixture's source + if len(coll.SourceIDs) < 1 { + t.Fatalf("All collection should have at least 1 source") + } + + // Idempotent + err = st.EnsureDefaultCollection() + testutil.MustNoErr(t, err, "EnsureDefaultCollection (2nd call)") +} + +func TestCollections_Validation(t *testing.T) { + f := storetest.New(t) + st := f.Store + + t.Run("empty name rejected", func(t *testing.T) { + _, err := st.CreateCollection("", "", []int64{f.Source.ID}) + if err == nil { + t.Fatal("expected error for empty name") + } + }) + + t.Run("zero sources rejected", func(t *testing.T) { + _, err := st.CreateCollection("empty", "", nil) + if err == nil { + t.Fatal("expected error for zero sources") + } + }) + + t.Run("nonexistent source rejected", func(t *testing.T) { + _, err := st.CreateCollection("bad", "", []int64{99999}) + if err == nil { + t.Fatal("expected error for nonexistent source") + } + }) + + t.Run("delete nonexistent returns error", func(t *testing.T) { + err := st.DeleteCollection("nonexistent") + if err != store.ErrCollectionNotFound { + t.Fatalf("expected ErrCollectionNotFound, got %v", err) + } + }) +} + +func TestCollections_Idempotent(t *testing.T) { + f := storetest.New(t) + st := f.Store + + _, err := st.CreateCollection("idem", "", []int64{f.Source.ID}) + testutil.MustNoErr(t, err, "CreateCollection") + + t.Run("add same source twice is no-op", func(t *testing.T) { + err := st.AddSourcesToCollection("idem", []int64{f.Source.ID}) + testutil.MustNoErr(t, err, "AddSourcesToCollection (dupe)") + coll, err := st.GetCollectionByName("idem") + testutil.MustNoErr(t, err, "GetCollectionByName") + if len(coll.SourceIDs) != 1 { + t.Fatalf("sourceIDs = %d, want 1", len(coll.SourceIDs)) + } + }) + + t.Run("remove absent source is no-op", func(t *testing.T) { + src2, err := 
st.GetOrCreateSource("mbox", "other@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource") + err = st.RemoveSourcesFromCollection("idem", []int64{src2.ID}) + testutil.MustNoErr(t, err, "RemoveSourcesFromCollection (absent)") + }) +} + +func TestCollections_DefaultAllIncremental(t *testing.T) { + f := storetest.New(t) + st := f.Store + + testutil.MustNoErr(t, st.EnsureDefaultCollection(), "EnsureDefaultCollection 1") + coll, err := st.GetCollectionByName("All") + testutil.MustNoErr(t, err, "GetCollectionByName") + initialCount := len(coll.SourceIDs) + + _, err = st.GetOrCreateSource("mbox", "new@example.com") + testutil.MustNoErr(t, err, "GetOrCreateSource") + + testutil.MustNoErr(t, st.EnsureDefaultCollection(), "EnsureDefaultCollection 2") + coll, err = st.GetCollectionByName("All") + testutil.MustNoErr(t, err, "GetCollectionByName after add") + if len(coll.SourceIDs) != initialCount+1 { + t.Errorf("sourceIDs = %d, want %d", len(coll.SourceIDs), initialCount+1) + } +} diff --git a/internal/store/dedup.go b/internal/store/dedup.go new file mode 100644 index 00000000..2e3094dc --- /dev/null +++ b/internal/store/dedup.go @@ -0,0 +1,494 @@ +package store + +import ( + "database/sql" + "fmt" + "strings" + "time" + + "github.com/wesm/msgvault/internal/mime" +) + +// DuplicateGroupKey identifies a group of messages sharing the same +// RFC822 Message-ID. Lightweight return type for the store layer. +type DuplicateGroupKey struct { + RFC822MessageID string + Count int +} + +// DuplicateMessageRow holds metadata needed to select the survivor in a +// duplicate group. Lightweight return type for the store layer. 
+type DuplicateMessageRow struct { + ID int64 + SourceID int64 + SourceType string + SourceIdentifier string + SourceMessageID string + Subject string + SentAt time.Time + ArchivedAt time.Time + HasRawMIME bool + LabelCount int + IsFromMe bool + HasSentLabel bool // true if the message has the Gmail SENT label + FromEmail string // lower-cased From: address (for identity-match sent detection) +} + +// MergeResult holds the counts from a MergeDuplicates operation. +type MergeResult struct { + LabelsTransferred int + RawMIMEBackfilled int +} + +// ContentHashCandidate holds message metadata for raw-MIME hash scans. +type ContentHashCandidate struct { + ID int64 + SourceID int64 + SourceType string + SourceIdentifier string + SourceMessageID string + Subject string + SentAt time.Time + ArchivedAt time.Time + LabelCount int + IsFromMe bool + HasSentLabel bool + FromEmail string +} + +func (s *Store) FindDuplicatesByRFC822ID(sourceIDs ...int64) ([]DuplicateGroupKey, error) { + query := ` + SELECT rfc822_message_id, COUNT(*) AS cnt + FROM messages + WHERE rfc822_message_id IS NOT NULL + AND rfc822_message_id != '' + AND deleted_at IS NULL` + var args []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" + args = append(args, id) + } + query += " AND source_id IN (" + strings.Join(placeholders, ",") + ")" + } + query += ` + GROUP BY rfc822_message_id + HAVING cnt > 1` + + rows, err := s.db.Query(query, args...) 
+ if err != nil { + return nil, fmt.Errorf("find duplicates by rfc822 id: %w", err) + } + defer func() { _ = rows.Close() }() + + var groups []DuplicateGroupKey + for rows.Next() { + var g DuplicateGroupKey + if err := rows.Scan(&g.RFC822MessageID, &g.Count); err != nil { + return nil, err + } + groups = append(groups, g) + } + return groups, rows.Err() +} + +func (s *Store) GetDuplicateGroupMessages( + rfc822ID string, sourceIDs ...int64, +) ([]DuplicateMessageRow, error) { + query := ` + SELECT m.id, m.source_id, s.source_type, s.identifier, + m.source_message_id, + COALESCE(m.subject, ''), m.sent_at, m.archived_at, + (CASE WHEN mr.message_id IS NOT NULL THEN 1 ELSE 0 END) AS has_raw, + (SELECT COUNT(*) FROM message_labels ml + WHERE ml.message_id = m.id) AS label_count, + COALESCE(m.is_from_me, 0) AS is_from_me, + CAST(EXISTS ( + SELECT 1 FROM message_labels ml2 + JOIN labels l ON l.id = ml2.label_id + WHERE ml2.message_id = m.id + AND (l.source_label_id = 'SENT' OR UPPER(l.name) = 'SENT') + ) AS INTEGER) AS has_sent_label, + COALESCE(( + SELECT LOWER(p_from.email_address) + FROM message_recipients mr_from + JOIN participants p_from + ON p_from.id = mr_from.participant_id + WHERE mr_from.message_id = m.id + AND mr_from.recipient_type = 'from' + LIMIT 1 + ), '') AS from_email + FROM messages m + JOIN sources s ON s.id = m.source_id + LEFT JOIN message_raw mr ON mr.message_id = m.id + WHERE m.rfc822_message_id = ? AND m.deleted_at IS NULL` + args := []any{rfc822ID} + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" + args = append(args, id) + } + query += " AND m.source_id IN (" + strings.Join(placeholders, ",") + ")" + } + query += " ORDER BY m.id" + + rows, err := s.db.Query(query, args...) 
+ if err != nil { + return nil, fmt.Errorf("get duplicate group messages: %w", err) + } + defer func() { _ = rows.Close() }() + + var msgs []DuplicateMessageRow + for rows.Next() { + var dm DuplicateMessageRow + var sentAt, archivedAt sql.NullTime + var hasRaw, isFromMe, hasSent int + if err := rows.Scan( + &dm.ID, &dm.SourceID, &dm.SourceType, &dm.SourceIdentifier, + &dm.SourceMessageID, &dm.Subject, &sentAt, &archivedAt, + &hasRaw, &dm.LabelCount, &isFromMe, &hasSent, + &dm.FromEmail, + ); err != nil { + return nil, err + } + if sentAt.Valid { + dm.SentAt = sentAt.Time + } + if archivedAt.Valid { + dm.ArchivedAt = archivedAt.Time + } + dm.HasRawMIME = hasRaw == 1 + dm.IsFromMe = isFromMe == 1 + dm.HasSentLabel = hasSent == 1 + msgs = append(msgs, dm) + } + return msgs, rows.Err() +} + +func (s *Store) MergeDuplicates( + survivorID int64, duplicateIDs []int64, batchID string, +) (*MergeResult, error) { + if len(duplicateIDs) == 0 { + return &MergeResult{}, nil + } + + result := &MergeResult{} + unionLabelsSQL := s.dialect.InsertOrIgnore(`INSERT OR IGNORE INTO message_labels (message_id, label_id) + SELECT ?, label_id FROM message_labels WHERE message_id = ?`) + backfillRawSQL := s.dialect.InsertOrIgnore(`INSERT OR IGNORE INTO message_raw + (message_id, raw_data, raw_format, compression) + SELECT ?, raw_data, raw_format, compression + FROM message_raw WHERE message_id = ?`) + softDeleteSQL := fmt.Sprintf(`UPDATE messages + SET deleted_at = %s, delete_batch_id = ? 
+ WHERE id = ?`, s.dialect.Now()) + + err := s.withTx(func(tx *loggedTx) error { + for _, dupID := range duplicateIDs { + res, err := tx.Exec(unionLabelsSQL, survivorID, dupID) + if err != nil { + return fmt.Errorf("union labels from %d: %w", dupID, err) + } + affected, _ := res.RowsAffected() + result.LabelsTransferred += int(affected) + } + + var survivorHasRaw int + if err := tx.QueryRow( + `SELECT COUNT(*) FROM message_raw WHERE message_id = ?`, + survivorID, + ).Scan(&survivorHasRaw); err != nil { + return fmt.Errorf("check survivor raw MIME: %w", err) + } + if survivorHasRaw == 0 { + for _, dupID := range duplicateIDs { + res, err := tx.Exec(backfillRawSQL, survivorID, dupID) + if err != nil { + return fmt.Errorf("backfill raw MIME from %d: %w", dupID, err) + } + affected, _ := res.RowsAffected() + if affected > 0 { + result.RawMIMEBackfilled++ + break + } + } + } + + for _, dupID := range duplicateIDs { + if _, err := tx.Exec(softDeleteSQL, batchID, dupID); err != nil { + return fmt.Errorf("soft-delete duplicate %d: %w", dupID, err) + } + } + return nil + }) + return result, err +} + +func (s *Store) GetAllRawMIMECandidates( + sourceIDs ...int64, +) ([]ContentHashCandidate, error) { + query := ` + SELECT m.id, m.source_id, s.source_type, s.identifier, + m.source_message_id, + COALESCE(m.subject, ''), m.sent_at, m.archived_at, + (SELECT COUNT(*) FROM message_labels ml + WHERE ml.message_id = m.id) AS label_count, + COALESCE(m.is_from_me, 0) AS is_from_me, + CAST(EXISTS ( + SELECT 1 FROM message_labels ml2 + JOIN labels l ON l.id = ml2.label_id + WHERE ml2.message_id = m.id + AND (l.source_label_id = 'SENT' OR UPPER(l.name) = 'SENT') + ) AS INTEGER) AS has_sent_label, + COALESCE(( + SELECT LOWER(p_from.email_address) + FROM message_recipients mr_from + JOIN participants p_from + ON p_from.id = mr_from.participant_id + WHERE mr_from.message_id = m.id + AND mr_from.recipient_type = 'from' + LIMIT 1 + ), '') AS from_email + FROM messages m + JOIN sources s ON 
s.id = m.source_id + JOIN message_raw mr ON mr.message_id = m.id + WHERE m.deleted_at IS NULL` + var args []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" + args = append(args, id) + } + query += " AND m.source_id IN (" + strings.Join(placeholders, ",") + ")" + } + query += " ORDER BY m.id" + + rows, err := s.db.Query(query, args...) + if err != nil { + return nil, fmt.Errorf("get all raw MIME candidates: %w", err) + } + defer func() { _ = rows.Close() }() + + var candidates []ContentHashCandidate + for rows.Next() { + var c ContentHashCandidate + var sentAt, archivedAt sql.NullTime + var isFromMe, hasSent int + if err := rows.Scan( + &c.ID, &c.SourceID, &c.SourceType, &c.SourceIdentifier, + &c.SourceMessageID, &c.Subject, &sentAt, &archivedAt, + &c.LabelCount, &isFromMe, &hasSent, &c.FromEmail, + ); err != nil { + return nil, err + } + if sentAt.Valid { + c.SentAt = sentAt.Time + } + if archivedAt.Valid { + c.ArchivedAt = archivedAt.Time + } + c.IsFromMe = isFromMe == 1 + c.HasSentLabel = hasSent == 1 + candidates = append(candidates, c) + } + return candidates, rows.Err() +} + +func (s *Store) StreamMessageRaw( + messageIDs []int64, + fn func(messageID int64, rawData []byte, compression string), +) error { + const chunkSize = 500 + for start := 0; start < len(messageIDs); start += chunkSize { + end := min(start+chunkSize, len(messageIDs)) + chunk := messageIDs[start:end] + + placeholders := make([]string, len(chunk)) + args := make([]any, len(chunk)) + for i, id := range chunk { + placeholders[i] = "?" + args[i] = id + } + + query := "SELECT message_id, raw_data, compression FROM message_raw WHERE message_id IN (" + + strings.Join(placeholders, ",") + ")" + rows, err := s.db.Query(query, args...) 
+ if err != nil { + return fmt.Errorf("stream message raw: %w", err) + } + + for rows.Next() { + var msgID int64 + var rawData []byte + var compression sql.NullString + if err := rows.Scan(&msgID, &rawData, &compression); err != nil { + _ = rows.Close() + return err + } + comp := "" + if compression.Valid { + comp = compression.String + } + fn(msgID, rawData, comp) + } + if err := rows.Err(); err != nil { + _ = rows.Close() + return err + } + _ = rows.Close() + } + return nil +} + +// UndoDedup restores soft-deleted duplicates from a dedup batch by +// clearing deleted_at and delete_batch_id. Merge side effects (labels +// copied to survivors, raw MIME backfilled onto survivors) are not +// reversed — those changes are additive enrichment that leaves +// survivors strictly better off. +func (s *Store) UndoDedup(batchID string) (int64, error) { + result, err := s.db.Exec(` + UPDATE messages + SET deleted_at = NULL, delete_batch_id = NULL + WHERE delete_batch_id = ? + `, batchID) + if err != nil { + return 0, fmt.Errorf("undo dedup: %w", err) + } + return result.RowsAffected() +} + +func (s *Store) CountActiveMessages(sourceIDs ...int64) (int64, error) { + query := "SELECT COUNT(*) FROM messages WHERE deleted_at IS NULL" + var args []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" 
+ args = append(args, id) + } + query += " AND source_id IN (" + strings.Join(placeholders, ",") + ")" + } + var count int64 + err := s.db.QueryRow(query, args...).Scan(&count) + return count, err +} + +func (s *Store) CountMessagesWithoutRFC822ID(sourceIDs ...int64) (int64, error) { + q := `SELECT COUNT(*) FROM messages m + JOIN message_raw mr ON mr.message_id = m.id + WHERE (m.rfc822_message_id IS NULL OR m.rfc822_message_id = '') + AND m.deleted_at IS NULL` + var args []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" + args = append(args, id) + } + q += " AND m.source_id IN (" + strings.Join(placeholders, ",") + ")" + } + var count int64 + err := s.db.QueryRow(q, args...).Scan(&count) + return count, err +} + +func (s *Store) BackfillRFC822IDs( + sourceIDs []int64, + progress func(done, total int64), +) (updated int64, failed int64, err error) { + scopeClause := "" + var scopeArgs []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" + scopeArgs = append(scopeArgs, id) + } + scopeClause = " AND m.source_id IN (" + strings.Join(placeholders, ",") + ")" + } + + var total int64 + countQ := `SELECT COUNT(*) FROM messages m + JOIN message_raw mr ON mr.message_id = m.id + WHERE (m.rfc822_message_id IS NULL OR m.rfc822_message_id = '') + AND m.deleted_at IS NULL` + scopeClause + err = s.db.QueryRow(countQ, scopeArgs...).Scan(&total) + if err != nil { + return 0, 0, fmt.Errorf("count backfill candidates: %w", err) + } + if total == 0 { + return 0, 0, nil + } + + const batchSize = 1000 + lastID := int64(0) + + for { + batchQ := `SELECT m.id FROM messages m + JOIN message_raw mr ON mr.message_id = m.id + WHERE (m.rfc822_message_id IS NULL OR m.rfc822_message_id = '') + AND m.deleted_at IS NULL + AND m.id > ?` + scopeClause + ` + ORDER BY m.id + LIMIT ?` + batchArgs := append([]any{lastID}, scopeArgs...) 
+ batchArgs = append(batchArgs, batchSize) + rows, err := s.db.Query(batchQ, batchArgs...) + if err != nil { + return updated, failed, fmt.Errorf("fetch backfill batch: %w", err) + } + + var batchIDs []int64 + for rows.Next() { + var id int64 + if err := rows.Scan(&id); err != nil { + _ = rows.Close() + return updated, failed, err + } + batchIDs = append(batchIDs, id) + } + _ = rows.Close() + if err := rows.Err(); err != nil { + return updated, failed, err + } + if len(batchIDs) == 0 { + break + } + + for _, id := range batchIDs { + raw, err := s.GetMessageRaw(id) + if err != nil { + failed++ + continue + } + parsed, err := mime.Parse(raw) + if err != nil || parsed.MessageID == "" { + failed++ + continue + } + normalizedID := strings.TrimSpace(parsed.MessageID) + normalizedID = strings.Trim(normalizedID, "<>") + if normalizedID == "" { + failed++ + continue + } + if _, err := s.db.Exec( + "UPDATE messages SET rfc822_message_id = ? WHERE id = ?", + normalizedID, id, + ); err != nil { + failed++ + continue + } + updated++ + } + + lastID = batchIDs[len(batchIDs)-1] + if progress != nil { + progress(updated, total) + } + } + return updated, failed, nil +} diff --git a/internal/store/dedup_test.go b/internal/store/dedup_test.go new file mode 100644 index 00000000..75cc44b6 --- /dev/null +++ b/internal/store/dedup_test.go @@ -0,0 +1,250 @@ +package store_test + +import ( + "database/sql" + "testing" + + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/testutil" + "github.com/wesm/msgvault/internal/testutil/storetest" +) + +func newRFC822Message( + t *testing.T, f *storetest.Fixture, sourceMessageID, rfc822ID string, +) int64 { + t.Helper() + id, err := f.Store.UpsertMessage(&store.Message{ + ConversationID: f.ConvID, + SourceID: f.Source.ID, + SourceMessageID: sourceMessageID, + RFC822MessageID: sql.NullString{ + String: rfc822ID, Valid: rfc822ID != "", + }, + MessageType: "email", + SizeEstimate: 1000, + }) + testutil.MustNoErr(t, err, 
"UpsertMessage") + return id +} + +func TestStore_FindDuplicatesByRFC822ID(t *testing.T) { + f := storetest.New(t) + idA := newRFC822Message(t, f, "src-a", "rfc822-shared") + idB := newRFC822Message(t, f, "src-b", "rfc822-shared") + _ = newRFC822Message(t, f, "src-c", "rfc822-unique") + + groups, err := f.Store.FindDuplicatesByRFC822ID() + testutil.MustNoErr(t, err, "FindDuplicatesByRFC822ID") + if len(groups) != 1 { + t.Fatalf("groups = %d, want 1", len(groups)) + } + if groups[0].RFC822MessageID != "rfc822-shared" { + t.Errorf("key = %q, want rfc822-shared", groups[0].RFC822MessageID) + } + if groups[0].Count != 2 { + t.Errorf("count = %d, want 2", groups[0].Count) + } + + _, err = f.Store.MergeDuplicates(idA, []int64{idB}, "batch-test") + testutil.MustNoErr(t, err, "MergeDuplicates") + + groups, err = f.Store.FindDuplicatesByRFC822ID() + testutil.MustNoErr(t, err, "FindDuplicatesByRFC822ID after merge") + if len(groups) != 0 { + t.Errorf("groups after merge = %d, want 0", len(groups)) + } +} + +func TestStore_GetDuplicateGroupMessages_SentLabel(t *testing.T) { + f := storetest.New(t) + idInbox := newRFC822Message(t, f, "inbox-copy", "rfc822-sent") + idSent := newRFC822Message(t, f, "sent-copy", "rfc822-sent") + + labels := f.EnsureLabels( + map[string]string{"SENT": "Sent", "INBOX": "Inbox"}, "system", + ) + testutil.MustNoErr(t, f.Store.LinkMessageLabel(idInbox, labels["INBOX"]), "link INBOX") + testutil.MustNoErr(t, f.Store.LinkMessageLabel(idSent, labels["SENT"]), "link SENT") + + rows, err := f.Store.GetDuplicateGroupMessages("rfc822-sent") + testutil.MustNoErr(t, err, "GetDuplicateGroupMessages") + if len(rows) != 2 { + t.Fatalf("rows = %d, want 2", len(rows)) + } + + var sentRow, inboxRow *store.DuplicateMessageRow + for i := range rows { + switch rows[i].ID { + case idSent: + sentRow = &rows[i] + case idInbox: + inboxRow = &rows[i] + } + } + if sentRow == nil || inboxRow == nil { + t.Fatalf("missing rows: sent=%v inbox=%v", sentRow, inboxRow) + } + if 
!sentRow.HasSentLabel { + t.Errorf("sent row: HasSentLabel = false, want true") + } + if inboxRow.HasSentLabel { + t.Errorf("inbox row: HasSentLabel = true, want false") + } +} + +func TestStore_MergeDuplicates_UnionsLabels(t *testing.T) { + f := storetest.New(t) + idKeep := newRFC822Message(t, f, "keep", "rfc822-merge") + idDrop := newRFC822Message(t, f, "drop", "rfc822-merge") + + labels := f.EnsureLabels( + map[string]string{"INBOX": "Inbox", "IMPORTANT": "Important", "WORK": "Work"}, "user", + ) + testutil.MustNoErr(t, f.Store.LinkMessageLabel(idKeep, labels["INBOX"]), "link INBOX to keep") + testutil.MustNoErr(t, f.Store.LinkMessageLabel(idDrop, labels["IMPORTANT"]), "link IMPORTANT to drop") + testutil.MustNoErr(t, f.Store.LinkMessageLabel(idDrop, labels["WORK"]), "link WORK to drop") + + result, err := f.Store.MergeDuplicates(idKeep, []int64{idDrop}, "batch-labels") + testutil.MustNoErr(t, err, "MergeDuplicates") + if result.LabelsTransferred != 2 { + t.Errorf("labelsTransferred = %d, want 2", result.LabelsTransferred) + } + + f.AssertLabelCount(idKeep, 3) + assertDedupDeleted(t, f.Store, idDrop, true) + + restored, err := f.Store.UndoDedup("batch-labels") + testutil.MustNoErr(t, err, "UndoDedup") + if restored != 1 { + t.Errorf("restored = %d, want 1", restored) + } + assertDedupDeleted(t, f.Store, idDrop, false) +} + +func assertDedupDeleted( + t *testing.T, st *store.Store, msgID int64, wantDeleted bool, +) { + t.Helper() + var deletedAt sql.NullTime + err := st.DB().QueryRow( + "SELECT deleted_at FROM messages WHERE id = ?", msgID, + ).Scan(&deletedAt) + testutil.MustNoErr(t, err, "query deleted_at") + if wantDeleted && !deletedAt.Valid { + t.Errorf("message %d: deleted_at should be set", msgID) + } + if !wantDeleted && deletedAt.Valid { + t.Errorf("message %d: deleted_at should be NULL", msgID) + } +} + +func TestStore_BackfillRFC822IDs_EmptyTable(t *testing.T) { + f := storetest.New(t) + count, err := f.Store.CountMessagesWithoutRFC822ID() + 
testutil.MustNoErr(t, err, "CountMessagesWithoutRFC822ID") + if count != 0 { + t.Errorf("empty-table count = %d, want 0", count) + } + + updated, _, err := f.Store.BackfillRFC822IDs(nil, nil) + testutil.MustNoErr(t, err, "BackfillRFC822IDs") + if updated != 0 { + t.Errorf("updated = %d, want 0", updated) + } +} + +func TestStore_CountActiveMessages(t *testing.T) { + f := storetest.New(t) + _ = newRFC822Message(t, f, "a", "id-a") + idB := newRFC822Message(t, f, "b", "id-b") + + total, err := f.Store.CountActiveMessages() + testutil.MustNoErr(t, err, "CountActiveMessages") + if total != 2 { + t.Errorf("active = %d, want 2", total) + } + + _, err = f.Store.MergeDuplicates( + newRFC822Message(t, f, "c", "id-c"), + []int64{idB}, + "batch-count", + ) + testutil.MustNoErr(t, err, "MergeDuplicates") + + total, err = f.Store.CountActiveMessages() + testutil.MustNoErr(t, err, "CountActiveMessages after merge") + if total != 2 { + t.Errorf("active after merge = %d, want 2", total) + } +} + +func TestStore_BackfillRFC822IDs_ParsesFromRawMIME(t *testing.T) { + f := storetest.New(t) + + id := newRFC822Message(t, f, "needs-backfill", "") + + rawMIME := []byte("From: alice@example.com\r\nTo: bob@example.com\r\nMessage-ID: \r\nSubject: Backfill test\r\n\r\nBody text") + testutil.MustNoErr(t, + f.Store.UpsertMessageRaw(id, rawMIME), + "UpsertMessageRaw", + ) + + count, err := f.Store.CountMessagesWithoutRFC822ID() + testutil.MustNoErr(t, err, "CountMessagesWithoutRFC822ID") + if count != 1 { + t.Fatalf("count without rfc822 = %d, want 1", count) + } + + updated, _, err := f.Store.BackfillRFC822IDs(nil, nil) + testutil.MustNoErr(t, err, "BackfillRFC822IDs") + if updated != 1 { + t.Fatalf("updated = %d, want 1", updated) + } + + var rfc822ID string + err = f.Store.DB().QueryRow( + "SELECT rfc822_message_id FROM messages WHERE id = ?", id, + ).Scan(&rfc822ID) + testutil.MustNoErr(t, err, "scan rfc822_message_id") + if rfc822ID != "unique-123@example.com" { + t.Errorf("rfc822_message_id 
= %q, want unique-123@example.com", rfc822ID) + } + + count, err = f.Store.CountMessagesWithoutRFC822ID() + testutil.MustNoErr(t, err, "CountMessagesWithoutRFC822ID after backfill") + if count != 0 { + t.Errorf("count after backfill = %d, want 0", count) + } +} + +func TestStore_MergeDuplicates_BackfillsRawMIME(t *testing.T) { + f := storetest.New(t) + + idSurvivor := newRFC822Message(t, f, "survivor", "rfc822-mime-backfill") + idDuplicate := newRFC822Message(t, f, "duplicate", "rfc822-mime-backfill") + + rawData := []byte("From: alice@example.com\r\nSubject: Test\r\n\r\nBody") + testutil.MustNoErr(t, + f.Store.UpsertMessageRaw(idDuplicate, rawData), + "UpsertMessageRaw on duplicate", + ) + + _, err := f.Store.GetMessageRaw(idSurvivor) + if err == nil { + t.Fatal("survivor should not have raw MIME before merge") + } + + result, err := f.Store.MergeDuplicates( + idSurvivor, []int64{idDuplicate}, "batch-mime", + ) + testutil.MustNoErr(t, err, "MergeDuplicates") + if result.RawMIMEBackfilled != 1 { + t.Errorf("RawMIMEBackfilled = %d, want 1", result.RawMIMEBackfilled) + } + + got, err := f.Store.GetMessageRaw(idSurvivor) + testutil.MustNoErr(t, err, "GetMessageRaw survivor after merge") + if len(got) == 0 { + t.Error("survivor raw MIME should not be empty after backfill") + } +} diff --git a/internal/store/identities.go b/internal/store/identities.go new file mode 100644 index 00000000..67372717 --- /dev/null +++ b/internal/store/identities.go @@ -0,0 +1,173 @@ +package store + +import ( + "fmt" + "strconv" + "strings" +) + +// IdentityCandidate is a single "likely me" email address discovered +// in the archive. +type IdentityCandidate struct { + Email string + MessageCount int64 + Signals IdentitySignal + SourceIDs []int64 +} + +// IdentitySignal is a bitmask describing which evidence types +// support an address being "me". 
+type IdentitySignal uint8 + +const ( + // Each signal is an independent bit (values 1, 2, 4); iota is stated + // once and carries through the remaining lines per Go convention. + SignalFromMe IdentitySignal = 1 << iota + SignalSentLabel + SignalAccountMatch +) + +// String renders a short human-readable label for an IdentitySignal bitmask. +func (s IdentitySignal) String() string { + if s == 0 { + return "" + } + parts := make([]string, 0, 3) + if s&SignalFromMe != 0 { + parts = append(parts, "is_from_me") + } + if s&SignalSentLabel != 0 { + parts = append(parts, "sent-label") + } + if s&SignalAccountMatch != 0 { + parts = append(parts, "account-match") + } + return strings.Join(parts, ",") +} + +// ListLikelyIdentities returns every From: address that the archive +// considers a candidate "me" identity, ranked by total sent count. +// +// Three independent signals contribute: +// 1. messages.is_from_me = 1 +// 2. Message carries a SENT label +// 3. Address equals a source identifier within scope +func (s *Store) ListLikelyIdentities( + sourceIDs ...int64, +) ([]IdentityCandidate, error) { + scopeClause := "" + var scopeArgs []any + if len(sourceIDs) > 0 { + placeholders := make([]string, len(sourceIDs)) + for i, id := range sourceIDs { + placeholders[i] = "?" 
+ scopeArgs = append(scopeArgs, id) + } + scopeClause = " AND m.source_id IN (" + + strings.Join(placeholders, ",") + ")" + } + + query := ` + WITH sent_messages AS ( + SELECT + m.id, + m.source_id, + m.is_from_me, + EXISTS ( + SELECT 1 + FROM message_labels ml + JOIN labels l ON l.id = ml.label_id + WHERE ml.message_id = m.id + AND (l.source_label_id = 'SENT' + OR UPPER(l.name) = 'SENT') + ) AS has_sent_label, + LOWER(p_from.email_address) AS email, + LOWER(src.identifier) AS src_identifier + FROM messages m + JOIN message_recipients mr_from + ON mr_from.message_id = m.id + AND mr_from.recipient_type = 'from' + JOIN participants p_from + ON p_from.id = mr_from.participant_id + JOIN sources src ON src.id = m.source_id + WHERE m.deleted_at IS NULL + AND p_from.email_address IS NOT NULL + AND p_from.email_address != ''` + + scopeClause + ` + ) + SELECT + email, + COUNT(*) AS sent_count, + MAX(CASE WHEN is_from_me = 1 THEN 1 ELSE 0 END) AS sig_from_me, + MAX(CASE WHEN has_sent_label THEN 1 ELSE 0 END) AS sig_sent_label, + MAX(CASE WHEN email = src_identifier THEN 1 ELSE 0 END) AS sig_account_match, + -- GROUP_CONCAT is SQLite-specific. When the Postgres + -- dialect lands, switch to string_agg(DISTINCT source_id::text, ','). + -- Result order is unspecified; callers treat SourceIDs as a set. + GROUP_CONCAT(DISTINCT source_id) AS source_ids + FROM sent_messages + WHERE (is_from_me = 1 + OR has_sent_label + OR email = src_identifier) + GROUP BY email + ORDER BY sent_count DESC, email ASC + ` + + rows, err := s.db.Query(query, scopeArgs...) 
+ if err != nil { + return nil, fmt.Errorf("list likely identities: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []IdentityCandidate + for rows.Next() { + var ( + email string + sentCount int64 + sigFromMe int + sigSent int + sigAccount int + sourceIDsList string + ) + if err := rows.Scan( + &email, &sentCount, + &sigFromMe, &sigSent, &sigAccount, + &sourceIDsList, + ); err != nil { + return nil, err + } + + var sigs IdentitySignal + if sigFromMe == 1 { + sigs |= SignalFromMe + } + if sigSent == 1 { + sigs |= SignalSentLabel + } + if sigAccount == 1 { + sigs |= SignalAccountMatch + } + + out = append(out, IdentityCandidate{ + Email: email, + MessageCount: sentCount, + Signals: sigs, + SourceIDs: parseInt64CSV(sourceIDsList), + }) + } + return out, rows.Err() +} + +func parseInt64CSV(s string) []int64 { + if s == "" { + return nil + } + parts := strings.Split(s, ",") + out := make([]int64, 0, len(parts)) + for _, p := range parts { + id, err := strconv.ParseInt(strings.TrimSpace(p), 10, 64) + if err != nil { + continue + } + out = append(out, id) + } + return out +} diff --git a/internal/store/identities_test.go b/internal/store/identities_test.go new file mode 100644 index 00000000..91488512 --- /dev/null +++ b/internal/store/identities_test.go @@ -0,0 +1,260 @@ +package store_test + +import ( + "database/sql" + "testing" + + "github.com/wesm/msgvault/internal/store" + "github.com/wesm/msgvault/internal/testutil" + "github.com/wesm/msgvault/internal/testutil/storetest" +) + +func addMessageFromParticipant( + t *testing.T, f *storetest.Fixture, + source *store.Source, + srcMessageID, fromEmail string, + isFromMe bool, +) int64 { + t.Helper() + pid, err := f.Store.EnsureParticipant(fromEmail, "", "") + testutil.MustNoErr(t, err, "EnsureParticipant "+fromEmail) + + convID, err := f.Store.EnsureConversation( + source.ID, "conv-"+srcMessageID, "Subject", + ) + testutil.MustNoErr(t, err, "EnsureConversation") + + mid, err := 
f.Store.UpsertMessage(&store.Message{ + ConversationID: convID, + SourceID: source.ID, + SourceMessageID: srcMessageID, + MessageType: "email", + IsFromMe: isFromMe, + SizeEstimate: 1000, + }) + testutil.MustNoErr(t, err, "UpsertMessage") + + testutil.MustNoErr(t, + f.Store.ReplaceMessageRecipients( + mid, "from", []int64{pid}, []string{""}, + ), + "ReplaceMessageRecipients", + ) + return mid +} + +func TestListLikelyIdentities_SignalsFromMe(t *testing.T) { + f := storetest.New(t) + for i := 1; i <= 3; i++ { + addMessageFromParticipant( + t, f, f.Source, + "m"+string(rune('0'+i)), + "alice@example.com", + true, + ) + } + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + got := ids[0] + if got.Email != "alice@example.com" { + t.Errorf("email = %q, want alice@example.com", got.Email) + } + if got.MessageCount != 3 { + t.Errorf("count = %d, want 3", got.MessageCount) + } + if got.Signals&store.SignalFromMe == 0 { + t.Errorf("SignalFromMe not set: %v", got.Signals) + } +} + +func TestListLikelyIdentities_SentLabelWithoutIsFromMe(t *testing.T) { + f := storetest.New(t) + mid := addMessageFromParticipant( + t, f, f.Source, "m1", "alice@example.com", false, + ) + + lid, err := f.Store.EnsureLabel( + f.Source.ID, "SENT", "Sent", "system", + ) + testutil.MustNoErr(t, err, "EnsureLabel SENT") + testutil.MustNoErr(t, + f.Store.LinkMessageLabel(mid, lid), + "LinkMessageLabel", + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + got := ids[0] + if got.Signals&store.SignalSentLabel == 0 { + t.Errorf("SignalSentLabel not set: %v", got.Signals) + } + if got.Signals&store.SignalFromMe != 0 { + t.Errorf("SignalFromMe should not fire: %v", got.Signals) + } +} + +func TestListLikelyIdentities_AccountIdentifierMatch(t *testing.T) { 
+ f := storetest.New(t) + addMessageFromParticipant( + t, f, f.Source, "m1", "test@example.com", false, + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + got := ids[0] + if got.Signals != store.SignalAccountMatch { + t.Errorf("signals = %v, want SignalAccountMatch only", got.Signals) + } + if got.Email != "test@example.com" { + t.Errorf("email = %q, want test@example.com", got.Email) + } +} + +func TestListLikelyIdentities_ExcludesOtherPeople(t *testing.T) { + f := storetest.New(t) + addMessageFromParticipant( + t, f, f.Source, "m1", "stranger@elsewhere.org", false, + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + + if len(ids) != 0 { + t.Errorf("got %d candidates, want 0: %v", len(ids), ids) + } +} + +func TestListLikelyIdentities_RankedByCount(t *testing.T) { + f := storetest.New(t) + + for i := 1; i <= 3; i++ { + addMessageFromParticipant( + t, f, f.Source, + "a"+string(rune('0'+i)), + "alice@example.com", + true, + ) + } + addMessageFromParticipant( + t, f, f.Source, "b1", "bob@example.com", true, + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 2 { + t.Fatalf("got %d candidates, want 2", len(ids)) + } + if ids[0].Email != "alice@example.com" { + t.Errorf("first = %q, want alice@example.com", ids[0].Email) + } + if ids[0].MessageCount != 3 { + t.Errorf("alice count = %d, want 3", ids[0].MessageCount) + } + if ids[1].Email != "bob@example.com" { + t.Errorf("second = %q, want bob@example.com", ids[1].Email) + } +} + +func TestListLikelyIdentities_ScopedToSources(t *testing.T) { + f := storetest.New(t) + + addMessageFromParticipant( + t, f, f.Source, "m1", "alice@example.com", true, + ) + + src2, err := f.Store.GetOrCreateSource("gmail", "bob@other.com") + testutil.MustNoErr(t, err, "GetOrCreateSource bob") 
+ addMessageFromParticipant( + t, f, src2, "m2", "bob@other.com", true, + ) + + ids, err := f.Store.ListLikelyIdentities(f.Source.ID) + testutil.MustNoErr(t, err, "ListLikelyIdentities scoped") + if len(ids) != 1 || ids[0].Email != "alice@example.com" { + t.Errorf("scoped result = %v, want [alice@example.com]", ids) + } + + ids, err = f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities all") + if len(ids) != 2 { + t.Errorf("unscoped count = %d, want 2", len(ids)) + } +} + +func TestListLikelyIdentities_ExcludesSoftDeleted(t *testing.T) { + f := storetest.New(t) + mid := addMessageFromParticipant( + t, f, f.Source, "m1", "alice@example.com", true, + ) + _, err := f.Store.DB().Exec( + "UPDATE messages SET deleted_at = datetime('now') WHERE id = ?", + mid, + ) + testutil.MustNoErr(t, err, "soft-delete") + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 0 { + t.Errorf("soft-deleted message should be excluded: %v", ids) + } + + var deletedAt sql.NullTime + err = f.Store.DB().QueryRow( + "SELECT deleted_at FROM messages WHERE id = ?", mid, + ).Scan(&deletedAt) + testutil.MustNoErr(t, err, "scan deleted_at") + if !deletedAt.Valid { + t.Error("deleted_at should be set") + } +} + +func TestListLikelyIdentities_AllThreeSignals(t *testing.T) { + f := storetest.New(t) + mid := addMessageFromParticipant( + t, f, f.Source, "m1", "test@example.com", true, + ) + lid, err := f.Store.EnsureLabel(f.Source.ID, "SENT", "Sent", "system") + testutil.MustNoErr(t, err, "EnsureLabel") + testutil.MustNoErr(t, f.Store.LinkMessageLabel(mid, lid), "LinkMessageLabel") + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + + got := ids[0] + want := store.SignalFromMe | store.SignalSentLabel | store.SignalAccountMatch + if got.Signals != want { + t.Errorf("signals = %v, want all 
three: %v", got.Signals, want) + } +} + +func TestListLikelyIdentities_CaseInsensitive(t *testing.T) { + f := storetest.New(t) + addMessageFromParticipant( + t, f, f.Source, "m1", "Alice@Example.COM", true, + ) + + ids, err := f.Store.ListLikelyIdentities() + testutil.MustNoErr(t, err, "ListLikelyIdentities") + if len(ids) != 1 { + t.Fatalf("got %d candidates, want 1", len(ids)) + } + if ids[0].Email != "alice@example.com" { + t.Errorf("email = %q, want lower-cased alice@example.com", ids[0].Email) + } +} diff --git a/internal/store/store.go b/internal/store/store.go index ae5f0f9b..4b8933a4 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -456,14 +456,20 @@ func (s *Store) InitSchema() error { if !s.dialect.IsNoSuchModuleError(err) { return fmt.Errorf("init FTS schema: %w", err) } - // Module not compiled in; availability stays false. - return nil + // Module not compiled in; availability stays false. Fall + // through so the rest of schema init still runs. } } // Probe availability through the dialect so it works uniformly for // backends that carry FTS inside their main schema. s.fts5Available = s.dialect.FTSAvailable(s.db.DB) + + // Ensure the default "All" collection exists and contains every source. + if err := s.EnsureDefaultCollection(); err != nil { + return fmt.Errorf("ensure default collection: %w", err) + } + return nil } diff --git a/internal/store/sync.go b/internal/store/sync.go index 9d5e21ae..ee082299 100644 --- a/internal/store/sync.go +++ b/internal/store/sync.go @@ -321,6 +321,13 @@ func (s *Store) GetOrCreateSource(sourceType, identifier string) (*Source, error } newSource.ID, _ = result.LastInsertId() + // Add to the default "All" collection if it exists. + _, _ = s.db.Exec( + `INSERT OR IGNORE INTO collection_sources (collection_id, source_id) + SELECT id, ? FROM collections WHERE name = 'All'`, + newSource.ID, + ) + return newSource, nil }