Skip to content

Commit 4c2191f

Browse files
committed
feat: add deduplication engine with identity discovery
Content-hash based duplicate detection across accounts with soft-delete merging. Three-signal identity discovery (From header, OAuth, config). CLI commands: deduplicate (dry-run default, --apply, --undo) and list-identities. All query paths exclude dedup-soft-deleted rows.
1 parent 2ad7721 commit 4c2191f

13 files changed

Lines changed: 3096 additions & 4 deletions

File tree

cmd/msgvault/cmd/account_scope.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
package cmd
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/wesm/msgvault/internal/store"
7+
)
8+
9+
// AccountScope is the result of resolving a user-supplied --account
10+
// flag against the store.
11+
type AccountScope struct {
12+
Input string
13+
Source *store.Source
14+
}
15+
16+
// IsEmpty reports whether the scope resolved to nothing.
17+
func (s AccountScope) IsEmpty() bool {
18+
return s.Source == nil
19+
}
20+
21+
// SourceIDs returns the source IDs that this scope expands to.
22+
func (s AccountScope) SourceIDs() []int64 {
23+
if s.Source != nil {
24+
return []int64{s.Source.ID}
25+
}
26+
return nil
27+
}
28+
29+
// DisplayName returns a human-readable label for the scope.
30+
func (s AccountScope) DisplayName() string {
31+
if s.Source != nil {
32+
return s.Source.Identifier
33+
}
34+
return ""
35+
}
36+
37+
// ResolveAccount resolves a user-supplied --account string against
38+
// the store. Returns an empty scope if input is empty. Currently
39+
// looks up sources by identifier or display name; collection lookup
40+
// will be added when collections are implemented.
41+
func ResolveAccount(
42+
st *store.Store, input string,
43+
) (AccountScope, error) {
44+
scope := AccountScope{Input: input}
45+
if input == "" {
46+
return scope, nil
47+
}
48+
49+
sources, err := st.GetSourcesByIdentifierOrDisplayName(input)
50+
if err != nil {
51+
return scope, fmt.Errorf(
52+
"look up source for %q: %w", input, err,
53+
)
54+
}
55+
if len(sources) == 0 {
56+
return scope, fmt.Errorf(
57+
"no account or source found for %q "+
58+
"(try 'msgvault list-accounts')",
59+
input,
60+
)
61+
}
62+
if len(sources) > 1 {
63+
names := make([]string, 0, len(sources))
64+
for _, s := range sources {
65+
names = append(names, fmt.Sprintf(
66+
"%s (%s, id=%d)",
67+
s.Identifier, s.SourceType, s.ID,
68+
))
69+
}
70+
return scope, fmt.Errorf(
71+
"ambiguous account %q matches multiple sources: %v",
72+
input, names,
73+
)
74+
}
75+
scope.Source = sources[0]
76+
return scope, nil
77+
}

cmd/msgvault/cmd/build_cache.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ func setupSQLiteSource(duckDB *sql.DB, dbPath string) (cleanup func(), err error
680680
query string
681681
typeOverrides string // DuckDB types parameter for read_csv_auto (empty = infer all)
682682
}{
683-
{"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type FROM messages WHERE sent_at IS NOT NULL",
683+
{"messages", "SELECT id, source_id, source_message_id, conversation_id, subject, snippet, sent_at, size_estimate, has_attachments, attachment_count, deleted_from_source_at, sender_id, message_type FROM messages WHERE sent_at IS NOT NULL AND deleted_at IS NULL",
684684
"types={'sent_at': 'TIMESTAMP', 'deleted_from_source_at': 'TIMESTAMP'}"},
685685
{"message_recipients", "SELECT message_id, participant_id, recipient_type, display_name FROM message_recipients", ""},
686686
{"message_labels", "SELECT message_id, label_id FROM message_labels", ""},

0 commit comments

Comments
 (0)