diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 00000000..40da6dda --- /dev/null +++ b/PLAN.md @@ -0,0 +1,466 @@ +# Fix Plan: PST Import Code Review Issues + +11 issues found during code review of the `import-pst` branch. Grouped by file to minimize context switching. + +--- + +## Group 1: `internal/pst/mime.go` + `internal/pst/mime_test.go` (Issues 1-4) + +### Issue 1 — Header injection via unsanitized MAPI properties (HIGH) + +**Problem:** `writeSynthesizedHeaders` (line 178-228) writes PST fields directly into RFC 5322 headers. Email addresses are not sanitized — a `SenderEmail` containing `\r\n` could inject arbitrary headers (e.g. `evil@bad.com\r\nBcc: victim@example.com`). + +**Fix:** + +1. Add a `sanitizeHeaderValue` helper near the other utility functions (after `formatDisplayList`): + +```go +// sanitizeHeaderValue strips CR and LF characters to prevent header injection. +func sanitizeHeaderValue(s string) string { + return strings.Map(func(r rune) rune { + if r == '\r' || r == '\n' { + return -1 + } + return r + }, s) +} +``` + +2. In `formatAddr`, sanitize the `email` parameter (the only unsanitized value — `name` already goes through `mime.QEncoding.Encode`): + +```go +func formatAddr(name, email string) string { + email = sanitizeHeaderValue(email) + // ... rest unchanged +} +``` + +3. In `writeSynthesizedHeaders`, sanitize `msg.MessageID`, `msg.InReplyTo`, and `msg.References` before use: + +- Lines 207-209: change to `mid := sanitizeHeaderValue(msg.MessageID)` +- Lines 215-218: change to `irt := sanitizeHeaderValue(msg.InReplyTo)` +- Line 222: change to `writeHeader(buf, "References", sanitizeHeaderValue(msg.References))` + +Note: `msg.Subject`, `msg.DisplayTo/Cc/Bcc`, and `msg.SenderName` all go through `mime.QEncoding.Encode` which is safe. + +4. 
Add tests in `mime_test.go`:

```go
func TestSanitizeHeaderValue(t *testing.T) {
	tests := []struct{ in, want string }{
		{"normal@example.com", "normal@example.com"},
		{"evil@example.com\r\nBcc: victim@evil.com", "evil@example.comBcc: victim@evil.com"},
		{"has\nnewline", "hasnewline"},
		{"has\rreturn", "hasreturn"},
	}
	for _, tt := range tests {
		got := sanitizeHeaderValue(tt.in)
		if got != tt.want {
			t.Errorf("sanitizeHeaderValue(%q) = %q, want %q", tt.in, got, tt.want)
		}
	}
}

func TestBuildRFC5322_HeaderInjection(t *testing.T) {
	msg := &MessageEntry{
		EntryID: "1",
		SenderEmail: "evil@example.com\r\nBcc: victim@evil.com",
		Subject: "Test",
		BodyText: "body",
	}
	raw, err := BuildRFC5322(msg, nil)
	if err != nil {
		t.Fatalf("BuildRFC5322: %v", err)
	}
	// Note: the sanitizer strips CR/LF, so the residue "Bcc:" still appears
	// *inline* within the From header value ("evil@example.comBcc: ...").
	// A header is only injected if "Bcc:" starts a new line, so check for a
	// line-leading occurrence rather than any substring match.
	if strings.Contains(string(raw), "\r\nBcc:") || strings.Contains(string(raw), "\nBcc:") {
		t.Error("header injection: Bcc header was injected via SenderEmail")
	}
}
```

---

### Issue 2 — No filename sanitization on attachments (MEDIUM)

**Problem:** Attachment filenames from PST (line 82-90) are used directly in Content-Type and Content-Disposition headers without sanitization. A malicious PST could embed `../../etc/passwd` or null bytes.

**Fix:**

1. Add `"path/filepath"` to the import block.

2. Add a `sanitizeFilename` helper:

```go
// sanitizeFilename strips path components and dangerous characters from
// attachment filenames sourced from PST data. Backslashes are handled
// explicitly because filepath.Base only splits on the host OS separator,
// and PST attachment names are typically Windows-style paths.
func sanitizeFilename(name string) string {
	if i := strings.LastIndexByte(name, '\\'); i >= 0 {
		name = name[i+1:]
	}
	name = filepath.Base(name)
	if name == "." {
		return ""
	}
	return strings.Map(func(r rune) rune {
		if r < 0x20 || r == 0x7f {
			return -1
		}
		return r
	}, name)
}
```

3. In `BuildRFC5322`, in the attachment loop (line 74), after `att := &attachments[i]`, add:

```go
fname := sanitizeFilename(att.Filename)
```

Then use `fname` instead of `att.Filename` on lines 82, 88, and 90.

4. 
Add test in `mime_test.go`:

```go
func TestSanitizeFilename(t *testing.T) {
	tests := []struct{ in, want string }{
		{"report.pdf", "report.pdf"},
		{"../../etc/passwd", "passwd"},
		{`C:\Users\evil\payload.exe`, "payload.exe"},
		{"file\x00name.txt", "filename.txt"},
		{"normal.doc", "normal.doc"},
		{"", ""},
	}
	for _, tt := range tests {
		got := sanitizeFilename(tt.in)
		if got != tt.want {
			t.Errorf("sanitizeFilename(%q) = %q, want %q", tt.in, got, tt.want)
		}
	}
}
```

---

### Issue 3 — ContentID not sanitized (MEDIUM)

**Problem:** `att.ContentID` (line 87) is wrapped in `<` and `>` without sanitizing. If the ContentID contains `>` or newlines, it could break MIME structure.

**Fix:**

1. Add a `sanitizeContentID` helper:

```go
// sanitizeContentID strips characters that could break the Content-Id
// angle-bracket wrapper.
func sanitizeContentID(s string) string {
	return strings.Map(func(r rune) rune {
		switch r {
		case '<', '>', '\r', '\n':
			return -1
		default:
			return r
		}
	}, s)
}
```

2. At line 86-87, change to:

```go
if att.ContentID != "" {
	cid := sanitizeContentID(att.ContentID)
	ah.Set("Content-Id", "<"+cid+">")
```

Use `cid` also in the condition for Content-Disposition (inline vs attachment) — or keep using `att.ContentID != ""` for the condition, which is fine since we only sanitize the *value*.

3. Add test in `mime_test.go`:

```go
func TestSanitizeContentID(t *testing.T) {
	tests := []struct{ in, want string }{
		{"abc123@example.com", "abc123@example.com"},
		{"injected\r\nheader", "injectedheader"},
	}
	for _, tt := range tests {
		got := sanitizeContentID(tt.in)
		if got != tt.want {
			t.Errorf("sanitizeContentID(%q) = %q, want %q", tt.in, got, tt.want)
		}
	}
}
```

---

### Issue 4 — QP encoder trailing whitespace not encoded (MEDIUM)

**Problem:** `writeQP` (line 268-305) doesn't encode trailing spaces at end of lines before `\n`. 
RFC 2045 section 6.7 requires trailing whitespace on a line to be encoded. + +**Fix:** + +Replace the `writeQP` function with this corrected version: + +```go +func writeQP(dst interface{ Write([]byte) (int, error) }, s string) { + const maxLine = 76 + var line strings.Builder + + encodeTrailingWS := func() { + str := line.String() + if len(str) == 0 { + return + } + last := str[len(str)-1] + if last == ' ' || last == '\t' { + line.Reset() + line.WriteString(str[:len(str)-1]) + line.WriteString(fmt.Sprintf("=%02X", last)) + } + } + + flush := func(soft bool) { + if !soft { + encodeTrailingWS() + } + if soft { + _, _ = dst.Write([]byte(line.String() + "=\r\n")) + } else { + _, _ = dst.Write([]byte(line.String() + "\r\n")) + } + line.Reset() + } + + for _, b := range []byte(s) { + var encoded string + switch { + case b == '\r': + continue + case b == '\n': + flush(false) + continue + case b == '=': + encoded = "=3D" + case b < 32 || b > 126: + encoded = fmt.Sprintf("=%02X", b) + default: + encoded = string(rune(b)) + } + + if line.Len()+len(encoded) > maxLine { + flush(true) + } + line.WriteString(encoded) + } + if line.Len() > 0 { + flush(false) + } +} +``` + +Add test in `mime_test.go` (add `"bytes"` to the test file imports): + +```go +func TestWriteQP_TrailingSpace(t *testing.T) { + var buf bytes.Buffer + writeQP(&buf, "hello \nworld") + got := buf.String() + if !strings.Contains(got, "hello=20\r\n") { + t.Errorf("trailing space not encoded: got %q", got) + } +} +``` + +--- + +## Group 2: `internal/pst/reader.go` (Issues 7, 8, 10) + +### Issue 8 — strings.Builder for binary attachment data (LOW) + +**Problem:** `ReadAttachments` (line 247-265) streams binary attachment content into `strings.Builder` then converts via `[]byte(buf.String())`, causing unnecessary copies. Also requires the `stringBuilderWriter` adapter type. + +**Fix:** + +1. Add `"bytes"` to the import block. + +2. 
In `ReadAttachments`, change lines 247-248 from: + +```go +var buf strings.Builder +written, err := att.WriteTo((*stringBuilderWriter)(&buf)) +``` + +To: + +```go +var buf bytes.Buffer +written, err := att.WriteTo(&buf) +``` + +3. Change line 264 from `[]byte(buf.String())` to `buf.Bytes()`. + +4. Delete the `stringBuilderWriter` type and its `Write` method (lines 293-297): + +```go +// stringBuilderWriter adapts strings.Builder to io.Writer. +type stringBuilderWriter strings.Builder + +func (w *stringBuilderWriter) Write(p []byte) (int, error) { + return (*strings.Builder)(w).Write(p) +} +``` + +Note: `"strings"` import is still needed by `isExchangeDN`, `extractCN`, etc. + +--- + +### Issue 7 — Single large attachment can exceed memory limit (MEDIUM) + +**Problem:** `ReadAttachments` reads each full attachment into memory *before* checking `maxBytes`. A single 128 MiB attachment is fully buffered before the limit triggers. + +**Fix:** + +In `ReadAttachments`, after the `mimeType` assignment (line 241) and *before* the `var buf bytes.Buffer` line (which will be the new line after Issue 8 fix), add a pre-check: + +```go + // Pre-check: skip remaining attachments if this one would exceed the limit. + if maxBytes > 0 { + estimatedSize := int64(att.GetAttachSize()) + if estimatedSize > 0 && totalBytes+estimatedSize > maxBytes { + break + } + } +``` + +Keep the existing post-read check (lines 253-256) as a safety net since `GetAttachSize()` may be inaccurate or zero. + +--- + +### Issue 10 — windowsFiletimeToTime no validation on negative values (LOW) + +**Problem:** Negative FILETIME values (corrupted PST data) produce nonsensical dates via the arithmetic. + +**Fix:** + +1. In `windowsFiletimeToTime` (line 27), change `if ft == 0` to `if ft <= 0`. + +2. 
Add a test case to the existing `TestWindowsFiletimeToTime` in `mime_test.go`: + +```go +{ + name: "negative", + ft: -1, + want: time.Time{}, +}, +``` + +--- + +## Group 3: `internal/importer/pst_import.go` (Issues 5, 6, 9) + +### Issue 9 — flushPending takes unused labelIDs parameter (LOW) + +**Problem:** `flushPending` (line 281) accepts `labelIDs map[string]int64` but never reads it. + +**Fix:** + +1. Change line 281 from: + +```go +flushPending := func(labelIDs map[string]int64) (stop bool) { +``` + +To: + +```go +flushPending := func() (stop bool) { +``` + +2. Update call site at line ~503 from `flushPending(labelCache)` to `flushPending()`. + +3. Update call site at line ~518 from `flushPending(labelCache)` to `flushPending()`. + +--- + +### Issue 5 — checkpointBlocked never resets (MEDIUM) + +**Problem:** Once `checkpointBlocked` is set `true` after an ingest error (line 359), it never resets. No further checkpoints are saved for the rest of the import. If interrupted after that, all progress since the last checkpoint is lost. + +**Fix:** + +At lines 377-380 (the cleanup block at end of `flushPending`), add a reset before `return false`: + +```go + clear(pending) + pending = pending[:0] + pendingBytes = 0 + // Reset checkpoint blocking so future successful batches can checkpoint. + // checkpointBlocked is set when an ingest error occurs within a batch; + // once the batch completes, we allow checkpointing again. + checkpointBlocked = false + return false +``` + +--- + +### Issue 6 — Resume by folder index is fragile (MEDIUM) + +**Problem:** Resume saves `FolderIndex` (position in the folders slice) but doesn't validate that the folder at that index is still the same folder. If folder ordering changes between runs, the resume skips the wrong folder. + +**Fix:** + +After line 244 (`summary.FoldersTotal = len(folders)`) and before line 247 (the `const` block), add: + +```go + // Validate resume folder still matches. 
+ if summary.WasResumed && resume.FolderIndex > 0 { + if resume.FolderIndex >= len(folders) { + log.Warn("resume folder index out of range; restarting from beginning", + "saved_index", resume.FolderIndex, + "folder_count", len(folders), + ) + resume.FolderIndex = 0 + resume.MsgIndex = 0 + } else if folders[resume.FolderIndex].Entry.Path != resume.FolderPath { + log.Warn("resume folder path mismatch; restarting from beginning", + "saved_path", resume.FolderPath, + "actual_path", folders[resume.FolderIndex].Entry.Path, + ) + resume.FolderIndex = 0 + resume.MsgIndex = 0 + } + } +``` + +--- + +## Group 4: `cmd/msgvault/cmd/import_pst.go` (Issue 11) + +### Issue 11 — os.Exit(130) bypasses deferred cleanup (LOW — document only) + +**Problem:** The second SIGINT calls `os.Exit(130)` which skips all deferred functions (`st.Close()`, `pstFile.Close()`). This is a deliberate UX trade-off but not documented. + +**Fix:** + +Add a comment above `os.Exit(130)` at line 85: + +```go + // NOTE: os.Exit bypasses all deferred cleanup (db.Close, + // pstFile.Close, etc.). This is deliberate: the first + // Ctrl+C already triggered graceful shutdown with checkpoint + // saving via context cancellation. SQLite WAL journaling + // ensures database consistency even on hard exit. + fmt.Fprintln(cmd.ErrOrStderr(), "Interrupted again. Exiting immediately.") + os.Exit(130) +``` + +--- + +## Verification + +After all changes, run: + +```bash +go fmt ./... +go vet ./... +go test ./internal/pst/ ./internal/importer/ +``` + +All existing tests must still pass, plus the new tests added above. 
diff --git a/cmd/msgvault/cmd/import_pst.go b/cmd/msgvault/cmd/import_pst.go new file mode 100644 index 00000000..04ab4062 --- /dev/null +++ b/cmd/msgvault/cmd/import_pst.go @@ -0,0 +1,164 @@ +package cmd + +import ( + "context" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/spf13/cobra" + "github.com/wesm/msgvault/internal/importer" + "github.com/wesm/msgvault/internal/store" +) + +var ( + importPstSourceType string + importPstSkipFolders []string + importPstNoResume bool + importPstCheckpointInterval int + importPstNoAttachments bool +) + +var importPstCmd = &cobra.Command{ + Use: "import-pst ", + Short: "Import a PST (Outlook) archive into msgvault", + Long: `Import a Microsoft Outlook PST file into msgvault. + +All email messages are imported. Calendar items, contacts, tasks, and notes +are skipped automatically. The PST folder structure is preserved as labels +(e.g. the Inbox folder becomes the "Inbox" label). + +The import is resumable: if interrupted with Ctrl+C, rerunning with the same +arguments will continue from where it left off. Use --no-resume to start fresh. + +Examples: + msgvault init-db + msgvault import-pst you@company.com /path/to/archive.pst + msgvault import-pst you@outlook.com backup.pst --skip-folder "Deleted Items" + msgvault import-pst you@outlook.com backup.pst --no-resume +`, + Args: cobra.ExactArgs(2), + SilenceUsage: true, + RunE: func(cmd *cobra.Command, args []string) error { + identifier := args[0] + pstPath := args[1] + + // Graceful Ctrl+C: first signal saves checkpoint, second exits immediately. 
+ ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + sigChan := make(chan os.Signal, 2) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + done := make(chan struct{}) + defer func() { + close(done) + signal.Stop(sigChan) + for { + select { + case <-sigChan: + default: + return + } + } + }() + + go func() { + signals := 0 + for { + select { + case <-done: + return + case <-sigChan: + select { + case <-done: + return + default: + } + signals++ + if signals == 1 { + fmt.Fprintln(cmd.ErrOrStderr(), "\nInterrupted. Saving checkpoint...") + cancel() + continue + } + // NOTE: os.Exit bypasses all deferred cleanup (db.Close, + // pstFile.Close, etc.). This is deliberate: the first + // Ctrl+C already triggered graceful shutdown with checkpoint + // saving via context cancellation. SQLite WAL journaling + // ensures database consistency even on hard exit. + fmt.Fprintln(cmd.ErrOrStderr(), "Interrupted again. Exiting immediately.") + os.Exit(130) + } + } + }() + + dbPath := cfg.DatabaseDSN() + st, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer st.Close() + + if err := st.InitSchema(); err != nil { + return fmt.Errorf("init schema: %w", err) + } + + attachmentsDir := cfg.AttachmentsDir() + if importPstNoAttachments { + attachmentsDir = "" + } + + summary, err := importer.ImportPst(ctx, st, pstPath, importer.PstImportOptions{ + SourceType: importPstSourceType, + Identifier: identifier, + SkipFolders: importPstSkipFolders, + NoResume: importPstNoResume, + CheckpointInterval: importPstCheckpointInterval, + AttachmentsDir: attachmentsDir, + Logger: logger, + }) + if err != nil { + return err + } + + out := cmd.OutOrStdout() + switch { + case ctx.Err() != nil: + fmt.Fprintln(out, "Import interrupted. 
Run again to resume.") + case summary.HardErrors: + fmt.Fprintln(out, "Import complete (with errors).") + default: + fmt.Fprintln(out, "Import complete.") + } + + if summary.WasResumed { + fmt.Fprintln(out, " Resumed from checkpoint.") + } + fmt.Fprintf(out, " File: %s\n", pstPath) + fmt.Fprintf(out, " Folders: %d / %d\n", summary.FoldersImported, summary.FoldersTotal) + fmt.Fprintf(out, " Processed: %d messages\n", summary.MessagesProcessed) + fmt.Fprintf(out, " Added: %d messages\n", summary.MessagesAdded) + fmt.Fprintf(out, " Updated: %d messages\n", summary.MessagesUpdated) + fmt.Fprintf(out, " Skipped: %d messages\n", summary.MessagesSkipped) + fmt.Fprintf(out, " Errors: %d\n", summary.Errors) + fmt.Fprintf(out, " Duration: %s\n", summary.Duration.Round(1e9)) + + if ctx.Err() != nil { + return context.Canceled + } + if summary.HardErrors { + return fmt.Errorf("import completed with %d errors", summary.Errors) + } + return nil + }, +} + +func init() { + rootCmd.AddCommand(importPstCmd) + + importPstCmd.Flags().StringVar(&importPstSourceType, "source-type", "pst", "Source type recorded in the database") + importPstCmd.Flags().StringArrayVar(&importPstSkipFolders, "skip-folder", nil, "Folder name to skip (repeatable, case-insensitive)") + importPstCmd.Flags().BoolVar(&importPstNoResume, "no-resume", false, "Do not resume from an interrupted import") + importPstCmd.Flags().IntVar(&importPstCheckpointInterval, "checkpoint-interval", 200, "Save progress every N messages") + importPstCmd.Flags().BoolVar(&importPstNoAttachments, "no-attachments", false, "Do not store attachments to disk (messages are still imported)") +} diff --git a/go.mod b/go.mod index 34894997..d64bab82 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/charmbracelet/x/ansi v0.11.7 github.com/coreos/go-oidc/v3 v3.18.0 github.com/emersion/go-imap/v2 v2.0.0-beta.8 + github.com/emersion/go-message v0.18.2 github.com/emersion/go-sasl v0.0.0-20241020182733-b788ff22d5a6 
github.com/go-chi/chi/v5 v5.2.5 github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f @@ -22,8 +23,10 @@ require ( github.com/mattn/go-isatty v0.0.21 github.com/mattn/go-runewidth v0.0.23 github.com/mattn/go-sqlite3 v1.14.42 + github.com/mooijtech/go-pst/v6 v6.0.2 github.com/muesli/termenv v0.16.0 github.com/robfig/cron/v3 v3.0.1 + github.com/rotisserie/eris v0.5.4 github.com/spf13/cobra v1.10.2 golang.org/x/mod v0.35.0 golang.org/x/net v0.53.0 @@ -49,11 +52,11 @@ require ( github.com/clipperhouse/displaywidth v0.11.0 // indirect github.com/clipperhouse/uax29/v2 v2.7.0 // indirect github.com/dustin/go-humanize v1.0.1 // indirect - github.com/emersion/go-message v0.18.2 // indirect github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect github.com/go-jose/go-jose/v4 v4.1.4 // indirect github.com/go-viper/mapstructure/v2 v2.3.0 // indirect github.com/goccy/go-json v0.10.5 // indirect + github.com/godzie44/go-uring v0.0.0-20220926161041-69611e8b13d5 // indirect github.com/google/flatbuffers v25.1.24+incompatible // indirect github.com/google/jsonschema-go v0.4.2 // indirect github.com/google/uuid v1.6.0 // indirect @@ -61,18 +64,22 @@ require ( github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 // indirect github.com/klauspost/compress v1.17.11 // indirect github.com/klauspost/cpuid/v2 v2.2.9 // indirect + github.com/libp2p/go-sockaddr v0.1.1 // indirect github.com/lucasb-eyer/go-colorful v1.4.0 // indirect github.com/mattn/go-localereader v0.0.1 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect github.com/olekukonko/tablewriter v0.0.5 // indirect + github.com/philhofer/fwd v1.1.2 // indirect github.com/pierrec/lz4/v4 v4.1.22 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/spf13/cast v1.7.1 // indirect github.com/spf13/pflag v1.0.9 
// indirect github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect + github.com/tidwall/btree v1.6.0 // indirect + github.com/tinylib/msgp v1.1.8 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/yosida95/uritemplate/v3 v3.0.2 // indirect github.com/zeebo/xxh3 v1.0.2 // indirect @@ -80,4 +87,5 @@ require ( golang.org/x/telemetry v0.0.0-20260311193753-579e4da9a98c // indirect golang.org/x/tools v0.43.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect + google.golang.org/protobuf v1.36.1 // indirect ) diff --git a/go.sum b/go.sum index f22e4ae2..9041daad 100644 --- a/go.sum +++ b/go.sum @@ -83,6 +83,8 @@ github.com/go-viper/mapstructure/v2 v2.3.0 h1:27XbWsHIqhbdR5TIC911OfYvgSaW93HM+d github.com/go-viper/mapstructure/v2 v2.3.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4= github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/godzie44/go-uring v0.0.0-20220926161041-69611e8b13d5 h1:5zELAgnSz0gqmr4Q5DWCoOzNHoeBAxVUXB7LS1eG+sw= +github.com/godzie44/go-uring v0.0.0-20220926161041-69611e8b13d5/go.mod h1:ermjEDUoT/fS+3Ona5Vd6t6mZkw1eHp99ILO5jGRBkM= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= @@ -112,6 +114,8 @@ github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/libp2p/go-sockaddr v0.1.1 h1:yD80l2ZOdGksnOyHrhxDdTDFrf7Oy+v3FMVArIRgZxQ= 
+github.com/libp2p/go-sockaddr v0.1.1/go.mod h1:syPvOmNs24S3dFVGJA1/mrqdeijPxLV2Le3BRLKd68k= github.com/lucasb-eyer/go-colorful v1.4.0 h1:UtrWVfLdarDgc44HcS7pYloGHJUjHV/4FwW4TvVgFr4= github.com/lucasb-eyer/go-colorful v1.4.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/marcboeker/go-duckdb v1.8.5 h1:tkYp+TANippy0DaIOP5OEfBEwbUINqiFqgwMQ44jME0= @@ -133,6 +137,8 @@ github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8D github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4= github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE= +github.com/mooijtech/go-pst/v6 v6.0.2 h1:mXzOgcSZ30UPuCWpz4DAQCTm0ttOmiejOuF/CN32C2Q= +github.com/mooijtech/go-pst/v6 v6.0.2/go.mod h1:pF4o5rQwF33uLJQ0c+CZICeK4GwcKTpGVq6yVOHrvkY= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= @@ -141,6 +147,8 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= +github.com/philhofer/fwd v1.1.2 h1:bnDivRJ1EWPjUIRXV5KfORO897HTbpFAQddBdE8t7Gw= +github.com/philhofer/fwd v1.1.2/go.mod h1:qkPdfjR2SIEbspLqpe1tO4n5yICnr2DY7mqEx2tUTP0= github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/errors 
v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -153,6 +161,8 @@ github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/rotisserie/eris v0.5.4 h1:Il6IvLdAapsMhvuOahHWiBnl1G++Q0/L5UIkI5mARSk= +github.com/rotisserie/eris v0.5.4/go.mod h1:Z/kgYTJiJtocxCbFfvRmO+QejApzG6zpyky9G1A4g9s= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= @@ -164,6 +174,10 @@ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cma github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tidwall/btree v1.6.0 h1:LDZfKfQIBHGHWSwckhXI0RPSXzlo+KYdjK7FWSqOzzg= +github.com/tidwall/btree v1.6.0/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY= +github.com/tinylib/msgp v1.1.8 h1:FCXC1xanKO4I8plpHGH2P7koL/RzZs12l/+r7vakfm0= +github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= @@ -179,12 +193,14 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y 
golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa h1:t2QcU6V556bFjYgu4L6C+6VrCPyJZ+eyRsABUPs1mz4= golang.org/x/exp v0.0.0-20250218142911-aa4b98e5adaa/go.mod h1:BHOTPb3L19zxehTsLoJXVaTktb06DFgmdW6Wb9s8jqk= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= @@ -196,11 +212,13 @@ golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190228124157-a34e9553db1e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys 
v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= @@ -208,10 +226,12 @@ golang.org/x/telemetry v0.0.0-20260311193753-579e4da9a98c h1:6a8FdnNk6bTXBjR4AGK golang.org/x/telemetry v0.0.0-20260311193753-579e4da9a98c/go.mod h1:TpUTTEp9frx7rTdLpC9gFG9kdI7zVLFTFFlqaH2Cncw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= @@ -221,6 +241,7 @@ golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools 
v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.4.0/go.mod h1:UE5sM2OK9E/d67R0ANs2xJizIymRP5gJU295PvKXxjQ= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= @@ -229,6 +250,8 @@ golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhS golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= +google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= +google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/internal/importer/pst_import.go b/internal/importer/pst_import.go new file mode 100644 index 00000000..2c27d0b2 --- /dev/null +++ b/internal/importer/pst_import.go @@ -0,0 +1,579 @@ +package importer + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "time" + + pstlib "github.com/mooijtech/go-pst/v6/pkg" + pstreader "github.com/wesm/msgvault/internal/pst" + "github.com/wesm/msgvault/internal/store" +) + +// PstImportOptions configures a PST import operation. +type PstImportOptions struct { + // SourceType is the sources.source_type value. Defaults to "pst". 
+ SourceType string + + // Identifier is the email address for this source (required). + Identifier string + + // SkipFolders is a list of folder names to skip (case-insensitive). + // E.g. []string{"Deleted Items", "Junk Email", "Trash"}. + SkipFolders []string + + // NoResume forces a fresh import even if an active sync run exists. + NoResume bool + + // CheckpointInterval controls how often (in messages) to save progress. + // Defaults to 200. + CheckpointInterval int + + // AttachmentsDir controls where attachment files are written. + // Empty string disables disk storage (messages still imported). + AttachmentsDir string + + // MaxMessageBytes limits the total byte size (body + attachments) read + // per message. Defaults to 128 MiB. + MaxMessageBytes int64 + + // IngestFunc allows tests to override message ingestion. + IngestFunc func( + ctx context.Context, st *store.Store, + sourceID int64, identifier, attachmentsDir string, + labelIDs []int64, sourceMsgID, rawHash string, + raw []byte, fallbackDate time.Time, + log *slog.Logger, + ) error + + // Logger defaults to slog.Default(). + Logger *slog.Logger +} + +// PstImportSummary reports the results of a PST import. +type PstImportSummary struct { + WasResumed bool + Duration time.Duration + + FoldersTotal int + FoldersImported int + + MessagesProcessed int64 + MessagesAdded int64 + MessagesUpdated int64 + MessagesSkipped int64 + Errors int64 + HardErrors bool +} + +// pstCheckpoint tracks resume state for PST imports. +type pstCheckpoint struct { + File string `json:"file"` + FolderIndex int `json:"folder_index"` + FolderPath string `json:"folder_path"` + MsgIndex int64 `json:"msg_index"` +} + +const defaultMaxPstMessageBytes int64 = 128 << 20 // 128 MiB + +// ImportPst imports all email messages from a PST file into the msgvault database. +// +// Folder structure is preserved as labels. Non-email items (calendar, contacts, +// tasks) are skipped automatically. 
The import is resumable: if interrupted, +// rerunning with the same arguments continues from where it left off. +func ImportPst(ctx context.Context, st *store.Store, pstPath string, opts PstImportOptions) (*PstImportSummary, error) { + if opts.SourceType == "" { + opts.SourceType = "pst" + } + if opts.Identifier == "" { + return nil, fmt.Errorf("identifier is required") + } + if opts.CheckpointInterval <= 0 { + opts.CheckpointInterval = 200 + } + if opts.MaxMessageBytes <= 0 { + opts.MaxMessageBytes = defaultMaxPstMessageBytes + } + + ingestFn := opts.IngestFunc + if ingestFn == nil { + ingestFn = IngestRawMessage + } + + log := opts.Logger + if log == nil { + log = slog.Default() + } + + start := time.Now() + summary := &PstImportSummary{} + + absPath, err := filepath.Abs(pstPath) + if err != nil { + return nil, fmt.Errorf("abs path: %w", err) + } + cpFile := absPath + if resolved, err := filepath.EvalSymlinks(absPath); err == nil { + cpFile = resolved + } + + // Build skip-folder set (case-insensitive). + skipFolders := make(map[string]bool, len(opts.SkipFolders)) + for _, f := range opts.SkipFolders { + skipFolders[strings.ToLower(f)] = true + } + + // Get or create source. + src, err := st.GetOrCreateSource(opts.SourceType, opts.Identifier) + if err != nil { + return nil, fmt.Errorf("get/create source: %w", err) + } + + // Set display name to the PST filename so it appears in list-accounts / get_stats. + pstBase := filepath.Base(absPath) + if !src.DisplayName.Valid || src.DisplayName.String == "" { + if err := st.UpdateSourceDisplayName(src.ID, pstBase); err != nil { + log.Warn("failed to set source display name", "error", err) + } else { + src.DisplayName.Valid = true + src.DisplayName.String = pstBase + } + } + + // Resume or start sync. 
+ var ( + syncID int64 + cp store.Checkpoint + resume pstCheckpoint + ) + + if !opts.NoResume { + active, err := st.GetActiveSync(src.ID) + if err != nil { + return nil, fmt.Errorf("check active sync: %w", err) + } + if active != nil { + syncID = active.ID + cp.MessagesProcessed = active.MessagesProcessed + cp.MessagesAdded = active.MessagesAdded + cp.MessagesUpdated = active.MessagesUpdated + cp.ErrorsCount = active.ErrorsCount + if active.CursorBefore.Valid && active.CursorBefore.String != "" { + var saved pstCheckpoint + if err := json.Unmarshal([]byte(active.CursorBefore.String), &saved); err == nil { + sameFile := saved.File == absPath || saved.File == cpFile + if !sameFile && saved.File != "" { + if curInfo, err := os.Stat(absPath); err == nil { + if cpInfo, err := os.Stat(saved.File); err == nil && os.SameFile(curInfo, cpInfo) { + sameFile = true + } + } + } + if sameFile { + resume = saved + summary.WasResumed = true + log.Info("resuming pst import", + "file", absPath, + "folder_index", resume.FolderIndex, + "msg_index", resume.MsgIndex, + ) + } else if saved.File != "" { + return nil, fmt.Errorf("active pst import is for %q, not %q; rerun with --no-resume", saved.File, absPath) + } + } + } + } + } + + if syncID == 0 { + syncID, err = st.StartSync(src.ID, "import-pst") + if err != nil { + return nil, fmt.Errorf("start sync: %w", err) + } + } + + failSync := func(msg string) { + if fsErr := st.FailSync(syncID, msg); fsErr != nil { + log.Warn("failed to record sync failure", "error", fsErr) + } + } + + // Save initial checkpoint only for new syncs; resuming preserves the existing cursor. + if !summary.WasResumed { + if err := savePstCheckpoint(st, syncID, cpFile, 0, "", 0, &cp); err != nil { + log.Warn("failed to save initial checkpoint", "error", err) + } + } + + // Open PST file. 
+ pstFile, err := pstreader.Open(absPath) + if err != nil { + failSync(err.Error()) + return nil, fmt.Errorf("open pst: %w", err) + } + defer pstFile.Close() + + // Collect all email folders in traversal order. + type folderRecord struct { + Entry pstreader.FolderEntry + Folder *pstlib.Folder + } + var folders []folderRecord + + if err := pstFile.WalkFolders(func(entry pstreader.FolderEntry, folder *pstlib.Folder) error { + if skipFolders[strings.ToLower(entry.Name)] { + log.Debug("skipping folder", "path", entry.Path) + return nil + } + // Only collect folders that may contain messages. + if folder.MessageCount > 0 { + f := *folder // copy value + folders = append(folders, folderRecord{Entry: entry, Folder: &f}) + } + return nil + }); err != nil { + failSync(err.Error()) + return nil, fmt.Errorf("walk folders: %w", err) + } + + summary.FoldersTotal = len(folders) + + // Validate resume folder still matches. Check whenever FolderPath is set, + // including index 0, so a path change in the first folder is caught. + if summary.WasResumed && resume.FolderPath != "" { + if resume.FolderIndex >= len(folders) { + log.Warn("resume folder index out of range; restarting from beginning", + "saved_index", resume.FolderIndex, + "folder_count", len(folders), + ) + resume.FolderIndex = 0 + resume.MsgIndex = 0 + } else if folders[resume.FolderIndex].Entry.Path != resume.FolderPath { + log.Warn("resume folder path mismatch; restarting from beginning", + "saved_path", resume.FolderPath, + "actual_path", folders[resume.FolderIndex].Entry.Path, + ) + resume.FolderIndex = 0 + resume.MsgIndex = 0 + } + } + + // Batching constants (same as MBOX/EMLX importers). 
+ const ( + batchSize = 200 + batchBytes = 32 << 20 // 32 MiB + ) + + type pendingPstMessage struct { + Raw []byte + RawHash string + SourceMsgID string + FallbackDate time.Time + LabelID int64 + FolderIndex int + FolderPath string + MsgIndex int64 + } + + var ( + pending []pendingPstMessage + pendingBytes int64 + checkpointBlocked bool + hardErrors bool + currentMsgIdx int64 + ) + + saveCp := func(fi int, fp string, mi int64) { + if err := savePstCheckpoint(st, syncID, cpFile, fi, fp, mi, &cp); err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("failed to save checkpoint", "error", err) + } + } + + flushPending := func() (stop bool) { + if len(pending) == 0 { + return false + } + + ids := make([]string, len(pending)) + for i, p := range pending { + ids[i] = p.SourceMsgID + } + + existingWithRaw, errWithRaw := st.MessageExistsWithRawBatch(src.ID, ids) + if errWithRaw != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("existence check failed", "error", errWithRaw) + } + + existingAny, errAny := st.MessageExistsBatch(src.ID, ids) + if errAny != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("existence check (any) failed", "error", errAny) + } + + for _, p := range pending { + if ctx.Err() != nil { + saveCp(p.FolderIndex, p.FolderPath, p.MsgIndex) + summary.Duration = time.Since(start) + return true + } + + cp.MessagesProcessed++ + summary.MessagesProcessed++ + + // Deduplicate: if exists with raw, just ensure labels are applied. + if errWithRaw == nil { + if msgID, exists := existingWithRaw[p.SourceMsgID]; exists { + summary.MessagesSkipped++ + if p.LabelID != 0 { + if err := st.AddMessageLabels(msgID, []int64{p.LabelID}); err != nil { + log.Warn("add labels to existing message", "error", err) + } + } + if !checkpointBlocked && cp.MessagesProcessed%int64(opts.CheckpointInterval) == 0 { + saveCp(p.FolderIndex, p.FolderPath, p.MsgIndex) + } + continue + } + } else { + // Fall back to individual check. 
+ one, err := st.MessageExistsWithRawBatch(src.ID, []string{p.SourceMsgID}) + if err == nil { + if msgID, exists := one[p.SourceMsgID]; exists { + summary.MessagesSkipped++ + if p.LabelID != 0 { + _ = st.AddMessageLabels(msgID, []int64{p.LabelID}) + } + continue + } + } + } + + alreadyExists := false + if errAny == nil { + _, alreadyExists = existingAny[p.SourceMsgID] + } + + lblIDs := []int64{} + if p.LabelID != 0 { + lblIDs = []int64{p.LabelID} + } + + if err := ingestFn(ctx, st, src.ID, opts.Identifier, opts.AttachmentsDir, + lblIDs, p.SourceMsgID, p.RawHash, p.Raw, p.FallbackDate, log, + ); err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("failed to ingest message", "source_msg_id", p.SourceMsgID, "error", err) + checkpointBlocked = true + hardErrors = true + continue + } + + if alreadyExists { + cp.MessagesUpdated++ + summary.MessagesUpdated++ + } else { + cp.MessagesAdded++ + summary.MessagesAdded++ + } + + if !checkpointBlocked && cp.MessagesProcessed%int64(opts.CheckpointInterval) == 0 { + saveCp(p.FolderIndex, p.FolderPath, p.MsgIndex) + } + } + + clear(pending) + pending = pending[:0] + pendingBytes = 0 + // Reset checkpoint blocking so future successful batches can checkpoint. + // checkpointBlocked is set when an ingest error occurs within a batch; + // once the batch completes, we allow checkpointing again. + checkpointBlocked = false + return false + } + + // Process each folder. + labelCache := make(map[string]int64) // path → label ID + + for fi, fr := range folders { + if ctx.Err() != nil { + break + } + + // Resume: skip folders we've already completed. + if summary.WasResumed && fi < resume.FolderIndex { + continue + } + currentMsgIdx = 0 + + entry := fr.Entry + folder := fr.Folder + + log.Debug("processing folder", "path", entry.Path, "count", entry.MsgCount) + + // Ensure label for this folder. 
+ labelID, ok := labelCache[entry.Path] + if !ok { + lid, err := st.EnsureLabel(src.ID, entry.Path, entry.Name, "user") + if err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("ensure label failed", "path", entry.Path, "error", err) + lid = 0 + } + labelCache[entry.Path] = lid + labelID = lid + } + + msgIter, err := folder.GetMessageIterator() + if err != nil { + // ErrMessagesNotFound is expected for empty folders. + cp.ErrorsCount++ + summary.Errors++ + log.Warn("get message iterator failed", "path", entry.Path, "error", err) + continue + } + + summary.FoldersImported++ + + for msgIter.Next() { + if ctx.Err() != nil { + break + } + + currentMsgIdx++ + + // Resume: skip messages already processed in the resumed folder. + if summary.WasResumed && fi == resume.FolderIndex && currentMsgIdx <= resume.MsgIndex { + continue + } + + msg := msgIter.Value() + + entry := pstreader.ExtractMessage(msg, folder.Name) + if entry == nil { + // Not an email (calendar, contact, task, etc.) — skip silently. + continue + } + + attachments, err := pstreader.ReadAttachments(msg, opts.MaxMessageBytes) + if err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("read attachments failed", "entry_id", entry.EntryID, "error", err) + // Continue without attachments rather than skipping the message. 
+ attachments = nil + } + + raw, err := pstreader.BuildRFC5322(entry, attachments) + if err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("build RFC5322 failed", "entry_id", entry.EntryID, "error", err) + continue + } + + if opts.MaxMessageBytes > 0 && int64(len(raw)) > opts.MaxMessageBytes { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("message exceeds size limit; skipping", + "entry_id", entry.EntryID, + "size", len(raw), + "limit", opts.MaxMessageBytes, + ) + continue + } + + sum := sha256.Sum256(raw) + rawHash := hex.EncodeToString(sum[:]) + // Use the PST entry ID as the stable dedup key so that re-importing + // the same PST file always skips already-imported messages, even when + // the MIME reconstruction produces different bytes (e.g. random + // multipart boundaries). rawHash is still passed to IngestRawMessage + // as a fallback for thread ID generation. + sourceMsgID := "pst-" + entry.EntryID + + fallbackDate := entry.SentAt + if fallbackDate.IsZero() { + fallbackDate = entry.ReceivedAt + } + if fallbackDate.IsZero() { + fallbackDate = entry.CreationTime + } + + pending = append(pending, pendingPstMessage{ + Raw: raw, + RawHash: rawHash, + SourceMsgID: sourceMsgID, + FallbackDate: fallbackDate, + LabelID: labelID, + FolderIndex: fi, + FolderPath: fr.Entry.Path, + MsgIndex: currentMsgIdx, + }) + pendingBytes += int64(len(raw)) + + if len(pending) >= batchSize || pendingBytes >= batchBytes { + if stop := flushPending(); stop { + summary.HardErrors = hardErrors + return summary, nil + } + } + } + + if err := msgIter.Err(); err != nil { + cp.ErrorsCount++ + summary.Errors++ + log.Warn("message iterator error", "path", fr.Entry.Path, "error", err) + } + } + + // Flush any remaining messages. 
+ if stop := flushPending(); stop { + summary.HardErrors = hardErrors + return summary, nil + } + + summary.Duration = time.Since(start) + summary.HardErrors = hardErrors + + finalMsg := fmt.Sprintf("folders:%d messages:%d", summary.FoldersImported, summary.MessagesProcessed) + if hardErrors { + if err := st.FailSync(syncID, fmt.Sprintf("completed with %d errors", cp.ErrorsCount)); err != nil { + return summary, fmt.Errorf("fail sync: %w", err) + } + return summary, nil + } + + if err := st.CompleteSync(syncID, finalMsg); err != nil { + return summary, fmt.Errorf("complete sync: %w", err) + } + + return summary, nil +} + +func savePstCheckpoint(st *store.Store, syncID int64, file string, folderIndex int, folderPath string, msgIndex int64, cp *store.Checkpoint) error { + b, err := json.Marshal(pstCheckpoint{ + File: file, + FolderIndex: folderIndex, + FolderPath: folderPath, + MsgIndex: msgIndex, + }) + if err != nil { + return fmt.Errorf("marshal checkpoint: %w", err) + } + cp.PageToken = string(b) + return st.UpdateSyncCheckpoint(syncID, cp) +} diff --git a/internal/importer/pst_import_test.go b/internal/importer/pst_import_test.go new file mode 100644 index 00000000..e2f88fcd --- /dev/null +++ b/internal/importer/pst_import_test.go @@ -0,0 +1,163 @@ +package importer + +import ( + "context" + "encoding/json" + "log/slog" + "path/filepath" + "testing" + "time" + + "github.com/wesm/msgvault/internal/store" +) + +// mockIngestFunc records IngestRawMessage calls for inspection in tests. 
type mockIngestFunc struct {
	// calls accumulates one entry per ingest invocation, in call order.
	calls []mockIngestCall
	// err is returned from every fn call, letting tests simulate ingest failure.
	err error
}

// mockIngestCall captures the arguments of a single ingest invocation.
type mockIngestCall struct {
	SourceID     int64
	Identifier   string
	SourceMsgID  string
	RawHash      string
	LabelIDs     []int64
	FallbackDate time.Time
	// RawLen records only the length of raw; the bytes themselves are not retained.
	RawLen int
}

// fn matches the PstImportOptions.IngestFunc signature. It records the call
// and returns m.err; ctx, st, attachmentsDir, and log are accepted to satisfy
// the signature but are otherwise unused.
func (m *mockIngestFunc) fn(
	ctx context.Context, st *store.Store,
	sourceID int64, identifier, attachmentsDir string,
	labelIDs []int64, sourceMsgID, rawHash string,
	raw []byte, fallbackDate time.Time,
	log *slog.Logger,
) error {
	m.calls = append(m.calls, mockIngestCall{
		SourceID:     sourceID,
		Identifier:   identifier,
		SourceMsgID:  sourceMsgID,
		RawHash:      rawHash,
		// Copy labelIDs so later mutation by the caller cannot alter the record.
		LabelIDs:     append([]int64(nil), labelIDs...),
		FallbackDate: fallbackDate,
		RawLen:       len(raw),
	})
	return m.err
}

// openTestStorePst creates a fresh on-disk store in a per-test temp directory,
// initializes the schema, and registers cleanup to close it.
func openTestStorePst(t *testing.T) *store.Store {
	t.Helper()
	tmp := t.TempDir()
	dbPath := filepath.Join(tmp, "msgvault.db")
	st, err := store.Open(dbPath)
	if err != nil {
		t.Fatalf("open store: %v", err)
	}
	t.Cleanup(func() { _ = st.Close() })
	if err := st.InitSchema(); err != nil {
		t.Fatalf("init schema: %v", err)
	}
	return st
}

// TestImportPst_MissingFile verifies that ImportPst returns an error for a
// non-existent PST file without panicking or corrupting the database.
func TestImportPst_MissingFile(t *testing.T) {
	st := openTestStorePst(t)
	mock := &mockIngestFunc{}

	_, err := ImportPst(context.Background(), st, "/nonexistent/path.pst", PstImportOptions{
		Identifier: "user@example.com",
		NoResume:   true,
		IngestFunc: mock.fn,
	})
	if err == nil {
		t.Fatal("expected error for non-existent PST file, got nil")
	}
	// The file could not be opened, so ingestion must never have started.
	if len(mock.calls) != 0 {
		t.Errorf("expected 0 ingest calls, got %d", len(mock.calls))
	}
}

// TestImportPst_RequiresIdentifier verifies that ImportPst rejects an empty identifier.
func TestImportPst_RequiresIdentifier(t *testing.T) {
	st := openTestStorePst(t)
	// Validation happens before the file is touched, so "any.pst" need not exist.
	_, err := ImportPst(context.Background(), st, "any.pst", PstImportOptions{
		Identifier: "",
	})
	if err == nil {
		t.Fatal("expected error for empty identifier")
	}
}

// TestPstCheckpoint_RoundTrip verifies that savePstCheckpoint stores a checkpoint
// that can be decoded back to the original values.
func TestPstCheckpoint_RoundTrip(t *testing.T) {
	st := openTestStorePst(t)
	src, err := st.GetOrCreateSource("pst", "user@example.com")
	if err != nil {
		t.Fatalf("get/create source: %v", err)
	}

	syncID, err := st.StartSync(src.ID, "import-pst")
	if err != nil {
		t.Fatalf("start sync: %v", err)
	}

	cp := &store.Checkpoint{
		MessagesProcessed: 42,
		MessagesAdded:     40,
	}
	if err := savePstCheckpoint(st, syncID, "/path/to/file.pst", 3, "Inbox/Archive", 100, cp); err != nil {
		t.Fatalf("savePstCheckpoint: %v", err)
	}

	active, err := st.GetActiveSync(src.ID)
	if err != nil {
		t.Fatalf("get active sync: %v", err)
	}
	if active == nil {
		t.Fatal("expected active sync, got nil")
	}
	// NOTE(review): savePstCheckpoint writes the JSON into cp.PageToken, yet it
	// is read back here via CursorBefore — presumably the store maps PageToken
	// onto sync_runs.cursor_before; confirm against store.UpdateSyncCheckpoint.
	if !active.CursorBefore.Valid {
		t.Fatal("expected cursor_before to be set")
	}

	var saved pstCheckpoint
	if err := json.Unmarshal([]byte(active.CursorBefore.String), &saved); err != nil {
		t.Fatalf("unmarshal checkpoint: %v", err)
	}

	if saved.File != "/path/to/file.pst" {
		t.Errorf("File = %q, want %q", saved.File, "/path/to/file.pst")
	}
	if saved.FolderIndex != 3 {
		t.Errorf("FolderIndex = %d, want 3", saved.FolderIndex)
	}
	if saved.FolderPath != "Inbox/Archive" {
		t.Errorf("FolderPath = %q, want %q", saved.FolderPath, "Inbox/Archive")
	}
	if saved.MsgIndex != 100 {
		t.Errorf("MsgIndex = %d, want 100", saved.MsgIndex)
	}
}

// TestImportPst_ContextCancelledBeforeOpen ensures that context cancellation
// before the PST file is opened is handled gracefully.
func TestImportPst_ContextCancelledBeforeOpen(t *testing.T) {
	st := openTestStorePst(t)

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately

	// Use a non-existent path so Open fails fast.
	_, err := ImportPst(ctx, st, "/nonexistent.pst", PstImportOptions{
		Identifier: "user@example.com",
		NoResume:   true,
	})
	// Either ctx error or open error is acceptable — we just must not hang.
	if err == nil {
		t.Error("expected error (either ctx or open), got nil")
	}
}
diff --git a/internal/importer/pst_integration_test.go b/internal/importer/pst_integration_test.go
new file mode 100644
index 00000000..ee12f617
--- /dev/null
+++ b/internal/importer/pst_integration_test.go
@@ -0,0 +1,185 @@
package importer

import (
	"context"
	"path/filepath"
	"testing"

	"github.com/wesm/msgvault/internal/store"
)

// pstTestdataDir points at the shared PST fixtures in internal/pst/testdata.
const pstTestdataDir = "../pst/testdata"

// openIntegrationStore creates a fresh on-disk store in a per-test temp
// directory, initializes the schema, and registers cleanup to close it.
func openIntegrationStore(t *testing.T) *store.Store {
	t.Helper()
	tmp := t.TempDir()
	st, err := store.Open(filepath.Join(tmp, "msgvault.db"))
	if err != nil {
		t.Fatalf("open store: %v", err)
	}
	t.Cleanup(func() { _ = st.Close() })
	if err := st.InitSchema(); err != nil {
		t.Fatalf("init schema: %v", err)
	}
	return st
}

// TestImportPst_SupportPST imports the real support.pst fixture and asserts
// the expected message counts and deduplication behaviour.
func TestImportPst_SupportPST(t *testing.T) {
	st := openIntegrationStore(t)
	pstPath := filepath.Join(pstTestdataDir, "support.pst")

	summary, err := ImportPst(context.Background(), st, pstPath, PstImportOptions{
		Identifier: "support@hackingteam.com",
		NoResume:   true,
	})
	if err != nil {
		t.Fatalf("ImportPst: %v", err)
	}

	// support.pst has 17 email messages across Drafts (6) and Sent Messages (11).
	if summary.MessagesProcessed != 17 {
		t.Errorf("MessagesProcessed = %d, want 17", summary.MessagesProcessed)
	}
	if summary.MessagesAdded != 17 {
		t.Errorf("MessagesAdded = %d, want 17", summary.MessagesAdded)
	}
	if summary.MessagesSkipped != 0 {
		t.Errorf("MessagesSkipped = %d, want 0 on first import", summary.MessagesSkipped)
	}
	if summary.HardErrors {
		t.Error("HardErrors = true, want false")
	}
	if summary.FoldersImported == 0 {
		t.Error("FoldersImported = 0, expected > 0")
	}
}

// TestImportPst_SupportPST_Idempotent verifies that re-importing the same PST
// skips all messages (entry-ID deduplication: ImportPst keys messages on
// "pst-"+EntryID rather than on the content hash, so rebuilt MIME bytes with
// fresh multipart boundaries still dedupe).
func TestImportPst_SupportPST_Idempotent(t *testing.T) {
	st := openIntegrationStore(t)
	pstPath := filepath.Join(pstTestdataDir, "support.pst")
	opts := PstImportOptions{
		Identifier: "support@hackingteam.com",
		NoResume:   true,
	}

	// First import.
	first, err := ImportPst(context.Background(), st, pstPath, opts)
	if err != nil {
		t.Fatalf("first ImportPst: %v", err)
	}
	if first.MessagesAdded == 0 {
		t.Fatal("first import added no messages")
	}

	// Second import — everything should be skipped.
	second, err := ImportPst(context.Background(), st, pstPath, opts)
	if err != nil {
		t.Fatalf("second ImportPst: %v", err)
	}
	if second.MessagesSkipped != first.MessagesAdded {
		t.Errorf("second import: skipped=%d, want %d (all from first)",
			second.MessagesSkipped, first.MessagesAdded)
	}
	if second.MessagesAdded != 0 {
		t.Errorf("second import: added=%d, want 0", second.MessagesAdded)
	}
}

// TestImportPst_SupportPST_CrossFolderLabels verifies that duplicate messages
// (same content in Drafts and Sent Messages) get both folder labels applied
// rather than being ingested twice.
+func TestImportPst_SupportPST_CrossFolderLabels(t *testing.T) { + st := openIntegrationStore(t) + pstPath := filepath.Join(pstTestdataDir, "support.pst") + + summary, err := ImportPst(context.Background(), st, pstPath, PstImportOptions{ + Identifier: "support@hackingteam.com", + NoResume: true, + }) + if err != nil { + t.Fatalf("ImportPst: %v", err) + } + + // support.pst has 17 raw items but some subjects appear in both Drafts and + // Sent Messages (duplicates). The total processed should equal all items. + // Added + Skipped should equal processed (no items dropped). + if summary.MessagesAdded+summary.MessagesSkipped+summary.MessagesUpdated != summary.MessagesProcessed { + t.Errorf("accounting mismatch: added(%d)+skipped(%d)+updated(%d) != processed(%d)", + summary.MessagesAdded, summary.MessagesSkipped, summary.MessagesUpdated, summary.MessagesProcessed) + } +} + +// TestImportPst_SupportPST_SkipFolder verifies that --skip-folder correctly +// excludes the specified folder from import. +func TestImportPst_SupportPST_SkipFolder(t *testing.T) { + st := openIntegrationStore(t) + pstPath := filepath.Join(pstTestdataDir, "support.pst") + + // Skip Drafts (6 messages) — should only import Sent Messages (11). + summary, err := ImportPst(context.Background(), st, pstPath, PstImportOptions{ + Identifier: "support@hackingteam.com", + SkipFolders: []string{"Drafts"}, + NoResume: true, + }) + if err != nil { + t.Fatalf("ImportPst: %v", err) + } + + // With Drafts skipped we process fewer messages. Some "Sent Messages" subjects + // also appear in Drafts — but those aren't processed since Drafts is skipped. + // At minimum we should have processed fewer than all 17. 
+ if summary.MessagesProcessed >= 17 { + t.Errorf("MessagesProcessed = %d with Drafts skipped; expected < 17", summary.MessagesProcessed) + } + if summary.MessagesProcessed == 0 { + t.Error("MessagesProcessed = 0; Sent Messages should still be imported") + } +} + +// TestImportPst_SupportPST_ContextCancelled verifies that cancelling mid-import +// saves a checkpoint and returns cleanly (no panic, no hang). +func TestImportPst_SupportPST_ContextCancelled(t *testing.T) { + st := openIntegrationStore(t) + pstPath := filepath.Join(pstTestdataDir, "support.pst") + + // Cancel immediately — this should cause ImportPst to abort early. + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + summary, _ := ImportPst(ctx, st, pstPath, PstImportOptions{ + Identifier: "support@hackingteam.com", + NoResume: true, + }) + + // Must not panic and must return a (possibly zero) summary. + if summary == nil { + t.Fatal("ImportPst returned nil summary") + } +} + +// TestImportPst_32BitPST verifies that a 32-bit format PST is handled +// gracefully. go-pst may fail to read sub-folder metadata in 32-bit files; +// the importer skips those branches and completes without error. +func TestImportPst_32BitPST(t *testing.T) { + st := openIntegrationStore(t) + pstPath := filepath.Join(pstTestdataDir, "32-bit.pst") + + summary, err := ImportPst(context.Background(), st, pstPath, PstImportOptions{ + Identifier: "user@example.com", + NoResume: true, + }) + if err != nil { + t.Fatalf("ImportPst: %v", err) + } + // 32-bit.pst has no readable email messages. 
+ if summary.MessagesProcessed != 0 { + t.Errorf("MessagesProcessed = %d, want 0", summary.MessagesProcessed) + } + if summary.HardErrors { + t.Error("HardErrors = true, want false") + } +} diff --git a/internal/pst/mime.go b/internal/pst/mime.go new file mode 100644 index 00000000..d79df2ce --- /dev/null +++ b/internal/pst/mime.go @@ -0,0 +1,369 @@ +package pst + +import ( + "bytes" + "encoding/base64" + "fmt" + "mime" + "mime/multipart" + "net/textproto" + "path/filepath" + "strings" + "time" +) + +// BuildRFC5322 constructs raw RFC 5322/MIME bytes from a MessageEntry and its +// attachments. +// +// Strategy: +// 1. If TransportHeaders is non-empty, use those verbatim as the message +// headers (stripping any existing MIME content headers we'll replace). +// 2. Otherwise, synthesize headers from MAPI properties. +// 3. Build the body as multipart/alternative when both text and HTML are +// present, or as a simple text/plain or text/html part when only one +// exists. +// 4. When attachments are present, wrap the body in multipart/mixed. +func BuildRFC5322(msg *MessageEntry, attachments []AttachmentEntry) ([]byte, error) { + // headerBuf holds all RFC 5322 header lines (without the trailing blank line). + // bodyBuf holds the MIME body (MIME-Version, Content-Type, blank line, body). + var headerBuf, bodyBuf bytes.Buffer + + // Write message-identifying headers. + if msg.TransportHeaders != "" { + writeTransportHeaders(&headerBuf, msg.TransportHeaders) + } else { + writeSynthesizedHeaders(&headerBuf, msg) + } + + hasText := msg.BodyText != "" + hasHTML := msg.BodyHTML != "" + hasAtts := len(attachments) > 0 + + switch { + case hasAtts: + // Wrap body and attachments in multipart/mixed. + mw := multipart.NewWriter(&bodyBuf) + fmt.Fprintf(&headerBuf, "MIME-Version: 1.0\r\nContent-Type: multipart/mixed;\r\n\tboundary=%q\r\n\r\n", mw.Boundary()) + + // Body sub-part. 
+ bh := make(textproto.MIMEHeader) + if hasText && hasHTML { + // Build the multipart/alternative sub-part into a buffer so we know + // the boundary before writing the outer part header. + var innerBuf bytes.Buffer + altW := multipart.NewWriter(&innerBuf) + writeTextPart(altW, msg.BodyText) + writeHTMLPart(altW, msg.BodyHTML) + _ = altW.Close() + + bh.Set("Content-Type", fmt.Sprintf(`multipart/alternative; boundary="%s"`, altW.Boundary())) + pw, _ := mw.CreatePart(bh) + _, _ = pw.Write(innerBuf.Bytes()) + } else if hasHTML { + bh.Set("Content-Type", "text/html; charset=utf-8") + bh.Set("Content-Transfer-Encoding", "quoted-printable") + pw, _ := mw.CreatePart(bh) + writeQP(pw, msg.BodyHTML) + } else { + bh.Set("Content-Type", "text/plain; charset=utf-8") + bh.Set("Content-Transfer-Encoding", "quoted-printable") + pw, _ := mw.CreatePart(bh) + writeQP(pw, msg.BodyText) + } + + // Attachment parts. + for i := range attachments { + att := &attachments[i] + ah := make(textproto.MIMEHeader) + ct := att.MIMEType + if ct == "" { + ct = "application/octet-stream" + } else if _, _, err := mime.ParseMediaType(ct); err != nil { + ct = "application/octet-stream" + } + fname := sanitizeFilename(att.Filename) + if fname != "" { + ah.Set("Content-Type", mime.FormatMediaType(ct, map[string]string{"name": fname})) + } else { + ah.Set("Content-Type", ct) + } + if att.ContentID != "" { + cid := sanitizeContentID(att.ContentID) + ah.Set("Content-Id", "<"+cid+">") + ah.Set("Content-Disposition", fmt.Sprintf("inline; filename=%q", fname)) + } else { + ah.Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", fname)) + } + ah.Set("Content-Transfer-Encoding", "base64") + pw, _ := mw.CreatePart(ah) + enc := base64.NewEncoder(base64.StdEncoding, pw) + _, _ = enc.Write(att.Content) + _ = enc.Close() + } + _ = mw.Close() + + case hasText && hasHTML: + // multipart/alternative with no attachments. 
+ mw := multipart.NewWriter(&bodyBuf) + fmt.Fprintf(&headerBuf, "MIME-Version: 1.0\r\nContent-Type: multipart/alternative;\r\n\tboundary=%q\r\n\r\n", mw.Boundary()) + writeTextPart(mw, msg.BodyText) + writeHTMLPart(mw, msg.BodyHTML) + _ = mw.Close() + + case hasHTML: + headerBuf.WriteString("MIME-Version: 1.0\r\nContent-Type: text/html; charset=utf-8\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n") + writeQP(&bodyBuf, msg.BodyHTML) + + default: + // text/plain only, or empty body. + headerBuf.WriteString("MIME-Version: 1.0\r\nContent-Type: text/plain; charset=utf-8\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n") + writeQP(&bodyBuf, msg.BodyText) + } + + return append(headerBuf.Bytes(), bodyBuf.Bytes()...), nil +} + +// writeTextPart writes a text/plain MIME part to mw. +func writeTextPart(mw *multipart.Writer, text string) { + th := make(textproto.MIMEHeader) + th.Set("Content-Type", "text/plain; charset=utf-8") + th.Set("Content-Transfer-Encoding", "quoted-printable") + pw, _ := mw.CreatePart(th) + writeQP(pw, text) +} + +// writeHTMLPart writes a text/html MIME part to mw. +func writeHTMLPart(mw *multipart.Writer, html string) { + th := make(textproto.MIMEHeader) + th.Set("Content-Type", "text/html; charset=utf-8") + th.Set("Content-Transfer-Encoding", "quoted-printable") + pw, _ := mw.CreatePart(th) + writeQP(pw, html) +} + +// writeTransportHeaders writes the original transport headers to buf, +// stripping any existing MIME content headers that we will replace. +func writeTransportHeaders(buf *bytes.Buffer, headers string) { + // Normalise line endings. + headers = strings.ReplaceAll(headers, "\r\n", "\n") + headers = strings.ReplaceAll(headers, "\r", "\n") + + lines := strings.Split(headers, "\n") + + skipContinuation := false + for _, line := range lines { + if line == "" { + // End of headers. + break + } + // Folded header continuation lines start with whitespace. 
+ if len(line) > 0 && (line[0] == ' ' || line[0] == '\t') { + if !skipContinuation { + buf.WriteString(line) + buf.WriteString("\r\n") + } + continue + } + // New header field — strip MIME content headers we'll rebuild. + lower := strings.ToLower(line) + if strings.HasPrefix(lower, "mime-version:") || + strings.HasPrefix(lower, "content-type:") || + strings.HasPrefix(lower, "content-transfer-encoding:") { + skipContinuation = true + continue + } + skipContinuation = false + buf.WriteString(line) + buf.WriteString("\r\n") + } +} + +// writeSynthesizedHeaders writes RFC 5322 headers synthesized from MAPI +// properties when TransportMessageHeaders is absent. +func writeSynthesizedHeaders(buf *bytes.Buffer, msg *MessageEntry) { + writeHeader(buf, "From", formatAddr(msg.SenderName, msg.SenderEmail)) + + if msg.DisplayTo != "" { + writeHeader(buf, "To", formatDisplayList(msg.DisplayTo)) + } + if msg.DisplayCc != "" { + writeHeader(buf, "Cc", formatDisplayList(msg.DisplayCc)) + } + if msg.DisplayBcc != "" { + writeHeader(buf, "Bcc", formatDisplayList(msg.DisplayBcc)) + } + + t := msg.SentAt + if t.IsZero() { + t = msg.ReceivedAt + } + if t.IsZero() { + t = msg.CreationTime + } + if !t.IsZero() { + writeHeader(buf, "Date", t.Format(time.RFC1123Z)) + } + + if msg.Subject != "" { + writeHeader(buf, "Subject", mime.QEncoding.Encode("utf-8", msg.Subject)) + } + + if msg.MessageID != "" { + mid := sanitizeHeaderValue(msg.MessageID) + if !strings.HasPrefix(mid, "<") { + mid = "<" + mid + ">" + } + writeHeader(buf, "Message-Id", mid) + } + + if msg.InReplyTo != "" { + irt := sanitizeHeaderValue(msg.InReplyTo) + if !strings.HasPrefix(irt, "<") { + irt = "<" + irt + ">" + } + writeHeader(buf, "In-Reply-To", irt) + } + + if msg.References != "" { + writeHeader(buf, "References", sanitizeHeaderValue(msg.References)) + } + + writeHeader(buf, "X-Msgvault-Source", "pst") + writeHeader(buf, "X-Msgvault-Synthesized", "true") +} + +func writeHeader(buf *bytes.Buffer, name, value string) { 
+ buf.WriteString(name) + buf.WriteString(": ") + buf.WriteString(value) + buf.WriteString("\r\n") +} + +// sanitizeHeaderValue strips CR and LF characters to prevent header injection. +func sanitizeHeaderValue(s string) string { + return strings.Map(func(r rune) rune { + if r == '\r' || r == '\n' { + return -1 + } + return r + }, s) +} + +// sanitizeFilename strips path components and dangerous characters from +// attachment filenames sourced from PST data. Handles both Unix and Windows +// path separators since PST data originates on Windows. +func sanitizeFilename(name string) string { + // Normalize Windows backslash separators before calling filepath.Base. + name = strings.ReplaceAll(name, `\`, "/") + name = filepath.Base(name) + if name == "." { + return "" + } + return strings.Map(func(r rune) rune { + if r < 0x20 || r == 0x7f { + return -1 + } + return r + }, name) +} + +// sanitizeContentID strips characters that could break the Content-Id +// angle-bracket wrapper. +func sanitizeContentID(s string) string { + return strings.Map(func(r rune) rune { + switch r { + case '<', '>', '\r', '\n': + return -1 + default: + return r + } + }, s) +} + +// formatAddr formats a display name + email as "Name <email>" or just email. +func formatAddr(name, email string) string { + email = sanitizeHeaderValue(email) + if name == "" && email == "" { + return "" + } + if name == "" { + return email + } + if email == "" { + return mime.QEncoding.Encode("utf-8", name) + } + return fmt.Sprintf("%s <%s>", mime.QEncoding.Encode("utf-8", name), email) +} + +// formatDisplayList converts a semicolon-separated PST display list to a +// comma-separated header value. PST DisplayTo/Cc/Bcc fields contain display +// names only (not email addresses), so we emit them without angle brackets.
+func formatDisplayList(display string) string { + parts := strings.Split(display, ";") + cleaned := make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + cleaned = append(cleaned, mime.QEncoding.Encode("utf-8", p)) + } + } + return strings.Join(cleaned, ", ") +} + +// writeQP writes s as quoted-printable text to dst. +// Lines longer than 76 characters are soft-wrapped with "=\r\n". +// Trailing whitespace before line breaks is encoded per RFC 2045 §6.7. +func writeQP(dst interface{ Write([]byte) (int, error) }, s string) { + const maxLine = 76 + var line strings.Builder + + encodeTrailingWS := func() { + str := line.String() + if len(str) == 0 { + return + } + last := str[len(str)-1] + if last == ' ' || last == '\t' { + line.Reset() + line.WriteString(str[:len(str)-1]) + line.WriteString(fmt.Sprintf("=%02X", last)) + } + } + + flush := func(soft bool) { + if !soft { + encodeTrailingWS() + } + if soft { + _, _ = dst.Write([]byte(line.String() + "=\r\n")) + } else { + _, _ = dst.Write([]byte(line.String() + "\r\n")) + } + line.Reset() + } + + for _, b := range []byte(s) { + var encoded string + switch { + case b == '\r': + continue + case b == '\n': + flush(false) + continue + case b == '=': + encoded = "=3D" + case b < 32 || b > 126: + encoded = fmt.Sprintf("=%02X", b) + default: + encoded = string(rune(b)) + } + + if line.Len()+len(encoded) > maxLine { + flush(true) + } + line.WriteString(encoded) + } + if line.Len() > 0 { + flush(false) + } +} diff --git a/internal/pst/mime_test.go b/internal/pst/mime_test.go new file mode 100644 index 00000000..42ae244d --- /dev/null +++ b/internal/pst/mime_test.go @@ -0,0 +1,307 @@ +package pst + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestWindowsFiletimeToTime(t *testing.T) { + tests := []struct { + name string + ft int64 + want time.Time + }{ + { + name: "zero", + ft: 0, + want: time.Time{}, + }, + { + name: "unix epoch", + // 1970-01-01 00:00:00 UTC in 
Windows FILETIME + ft: 116444736000000000, + want: time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC), + }, + { + name: "2024-01-15 10:30:00 UTC", + // (2024-01-15T10:30:00 UTC - 1601-01-01) in 100ns intervals + ft: 133497882000000000, + want: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC), + }, + { + name: "negative", + ft: -1, + want: time.Time{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := windowsFiletimeToTime(tt.ft) + if !got.Equal(tt.want) { + t.Errorf("windowsFiletimeToTime(%d) = %v, want %v", tt.ft, got, tt.want) + } + }) + } +} + +func TestExtractCN(t *testing.T) { + tests := []struct { + dn string + want string + }{ + {"/O=CORP/OU=EXCHANGE/CN=RECIPIENTS/CN=JSMITH", "JSMITH"}, + {"/o=Contoso/ou=Exchange/cn=Recipients/cn=jdoe", "jdoe"}, + {"user@example.com", "user@example.com"}, // not a DN + {"", ""}, + } + for _, tt := range tests { + got := extractCN(tt.dn) + if got != tt.want { + t.Errorf("extractCN(%q) = %q, want %q", tt.dn, got, tt.want) + } + } +} + +func TestIsExchangeDN(t *testing.T) { + if !isExchangeDN("/O=CORP/OU=EXCH/CN=user") { + t.Error("expected true for /O= DN") + } + if !isExchangeDN("/o=corp/cn=user") { + t.Error("expected true for /o= DN") + } + if isExchangeDN("user@example.com") { + t.Error("expected false for SMTP address") + } +} + +func TestBuildRFC5322_SynthesizedHeaders(t *testing.T) { + msg := &MessageEntry{ + EntryID: "12345", + FolderPath: "Inbox", + Subject: "Hello World", + BodyText: "This is a test message.", + SenderName: "Alice", + SenderEmail: "alice@example.com", + DisplayTo: "Bob", + MessageID: "", + SentAt: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC), + } + + raw, err := BuildRFC5322(msg, nil) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + + s := string(raw) + if !strings.Contains(s, "From:") { + t.Error("missing From header") + } + if !strings.Contains(s, "alice@example.com") { + t.Error("missing sender email") + } + if !strings.Contains(s, "Subject:") { + 
t.Error("missing Subject header") + } + if !strings.Contains(s, "Message-Id:") { + t.Error("missing Message-Id header") + } + if !strings.Contains(s, "X-Msgvault-Synthesized: true") { + t.Error("missing X-Msgvault-Synthesized header") + } + if !strings.Contains(s, "text/plain") { + t.Error("missing text/plain content type") + } + if !strings.Contains(s, "This is a test message") { + t.Error("body text not found in output") + } +} + +func TestBuildRFC5322_TransportHeaders(t *testing.T) { + transportHeaders := "From: alice@example.com\r\nTo: bob@example.com\r\nSubject: Test\r\nMessage-ID: <orig-42@example.com>\r\nDate: Mon, 15 Jan 2024 10:30:00 +0000\r\n" + + msg := &MessageEntry{ + EntryID: "99", + TransportHeaders: transportHeaders, + BodyText: "Body text here.", + BodyHTML: "

<html><body>Body HTML here.</body></html>

", + } + + raw, err := BuildRFC5322(msg, nil) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + + s := string(raw) + // Original headers should be present. + if !strings.Contains(s, "From: alice@example.com") { + t.Error("missing original From header") + } + if !strings.Contains(s, "Message-ID: ") { + t.Error("missing original Message-ID header") + } + // Should NOT have synthesized header. + if strings.Contains(s, "X-Msgvault-Synthesized") { + t.Error("should not have X-Msgvault-Synthesized when transport headers present") + } + // Both text and HTML → multipart/alternative. + if !strings.Contains(s, "multipart/alternative") { + t.Error("expected multipart/alternative for text+html body") + } +} + +func TestBuildRFC5322_WithAttachments(t *testing.T) { + msg := &MessageEntry{ + EntryID: "42", + Subject: "With attachment", + BodyText: "See attached.", + SenderEmail: "sender@example.com", + } + attachments := []AttachmentEntry{ + { + Filename: "report.pdf", + MIMEType: "application/pdf", + Content: []byte("%PDF-1.4 test"), + }, + } + + raw, err := BuildRFC5322(msg, attachments) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + + s := string(raw) + if !strings.Contains(s, "multipart/mixed") { + t.Error("expected multipart/mixed for message with attachments") + } + if !strings.Contains(s, "report.pdf") { + t.Error("attachment filename not found") + } + if !strings.Contains(s, "application/pdf") { + t.Error("attachment content type not found") + } +} + +func TestBuildRFC5322_EmptyBody(t *testing.T) { + msg := &MessageEntry{ + EntryID: "1", + SenderEmail: "a@b.com", + Subject: "No body", + } + + raw, err := BuildRFC5322(msg, nil) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + + s := string(raw) + if !strings.Contains(s, "text/plain") { + t.Error("expected text/plain even for empty body") + } +} + +func TestSanitizeHeaderValue(t *testing.T) { + tests := []struct{ in, want string }{ + {"normal@example.com", "normal@example.com"}, + 
{"evil@example.com\r\nBcc: victim@evil.com", "evil@example.comBcc: victim@evil.com"}, + {"has\nnewline", "hasnewline"}, + {"has\rreturn", "hasreturn"}, + } + for _, tt := range tests { + got := sanitizeHeaderValue(tt.in) + if got != tt.want { + t.Errorf("sanitizeHeaderValue(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} + +func TestBuildRFC5322_HeaderInjection(t *testing.T) { + msg := &MessageEntry{ + EntryID: "1", + SenderEmail: "evil@example.com\r\nBcc: victim@evil.com", + Subject: "Test", + BodyText: "body", + } + raw, err := BuildRFC5322(msg, nil) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + // Check that "Bcc:" does not appear as a separate header line (the actual + // injection vector). A sanitized value may still contain "Bcc:" as a + // substring within the From address, but not as a new header line. + if strings.Contains(string(raw), "\r\nBcc:") { + t.Error("header injection: Bcc header was injected via SenderEmail") + } +} + +func TestSanitizeFilename(t *testing.T) { + tests := []struct{ in, want string }{ + {"report.pdf", "report.pdf"}, + {"../../etc/passwd", "passwd"}, + {`C:\Users\evil\payload.exe`, "payload.exe"}, + {"file\x00name.txt", "filename.txt"}, + {"normal.doc", "normal.doc"}, + {"", ""}, + } + for _, tt := range tests { + got := sanitizeFilename(tt.in) + if got != tt.want { + t.Errorf("sanitizeFilename(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} + +func TestSanitizeContentID(t *testing.T) { + tests := []struct{ in, want string }{ + {"abc123@example.com", "abc123@example.com"}, + {"<injected>header\r\n", "injectedheader"}, + } + for _, tt := range tests { + got := sanitizeContentID(tt.in) + if got != tt.want { + t.Errorf("sanitizeContentID(%q) = %q, want %q", tt.in, got, tt.want) + } + } +} + +func TestWriteQP_TrailingSpace(t *testing.T) { + var buf bytes.Buffer + writeQP(&buf, "hello \nworld") + got := buf.String() + if !strings.Contains(got, "hello=20\r\n") { + t.Errorf("trailing space not encoded: got %q", got) + } +} + 
+func TestBuildRFC5322_TransportHeadersStripMIME(t *testing.T) { + // Transport headers that include MIME headers — these should be stripped. + transportHeaders := "From: alice@example.com\r\nMIME-Version: 1.0\r\nContent-Type: text/plain; charset=us-ascii\r\nContent-Transfer-Encoding: 7bit\r\nSubject: Old MIME\r\n" + + msg := &MessageEntry{ + TransportHeaders: transportHeaders, + BodyText: "Hello.", + } + + raw, err := BuildRFC5322(msg, nil) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + + s := string(raw) + // From and Subject should be present. + if !strings.Contains(s, "From: alice@example.com") { + t.Error("From header missing") + } + if !strings.Contains(s, "Subject: Old MIME") { + t.Error("Subject header missing") + } + // The old Content-Type from transport headers should not appear verbatim. + // (Our rebuilt MIME-Version and Content-Type replaces it.) + // We expect exactly one Content-Type occurrence (ours, for text/plain). + count := strings.Count(s, "Content-Type:") + if count != 1 { + t.Errorf("expected 1 Content-Type header, got %d", count) + } +} diff --git a/internal/pst/reader.go b/internal/pst/reader.go new file mode 100644 index 00000000..69a5e8e9 --- /dev/null +++ b/internal/pst/reader.go @@ -0,0 +1,322 @@ +package pst + +import ( + "bytes" + "fmt" + "io" + "os" + "strings" + "time" + + charsets "github.com/emersion/go-message/charset" + pstlib "github.com/mooijtech/go-pst/v6/pkg" + "github.com/mooijtech/go-pst/v6/pkg/properties" + "github.com/rotisserie/eris" + "golang.org/x/text/encoding" +) + +func init() { + // Register extended charsets so go-pst can decode non-UTF-8 encoded PST files. + pstlib.ExtendCharsets(func(name string, enc encoding.Encoding) { + charsets.RegisterEncoding(name, enc) + }) +} + +// windowsFiletimeToTime converts a Windows FILETIME value (100-nanosecond +// intervals since 1601-01-01 UTC) to a time.Time. Returns zero time if ft == 0. 
+func windowsFiletimeToTime(ft int64) time.Time { + if ft <= 0 { + return time.Time{} + } + // 11644473600 seconds between Windows epoch (1601-01-01) and Unix epoch (1970-01-01). + const epochDiff int64 = 11644473600 + secs := ft/10_000_000 - epochDiff + ns := (ft % 10_000_000) * 100 + return time.Unix(secs, ns).UTC() +} + +// File wraps a PST file for reading. +type File struct { + pstFile *pstlib.File + closer io.Closer +} + +// Open opens a PST file at path for reading. +func Open(path string) (*File, error) { + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open: %w", err) + } + pstFile, err := pstlib.New(f) + if err != nil { + _ = f.Close() + return nil, fmt.Errorf("parse pst: %w", err) + } + return &File{pstFile: pstFile, closer: f}, nil +} + +// Close releases all resources held by the File. +func (f *File) Close() error { + f.pstFile.Cleanup() + return f.closer.Close() +} + +// FolderEntry holds metadata about a PST folder. +type FolderEntry struct { + Name string + Path string // Full slash-separated path, e.g. "Personal Folders/Inbox/Archive" + MsgCount int32 +} + +// MessageEntry holds the extracted email properties from a PST message. +type MessageEntry struct { + // EntryID is the PST node identifier, used as part of the dedup key. + EntryID string + + // FolderPath is the slash-separated folder path this message came from. + FolderPath string + + // Subject is the message subject. + Subject string + + // TransportMessageHeaders contains the original RFC 5322 headers as stored + // by the PST, available for most internet-delivered messages. Empty for + // drafts or messages sent via Exchange-only paths. + TransportHeaders string + + // Body content. + BodyText string + BodyHTML string + + // Sender fields. + SenderName string + SenderEmail string + SenderAddressType string // "SMTP" or "EX" (Exchange DN) + + // DisplayTo/Cc/Bcc are semicolon-separated display names from MAPI. 
+ // Email addresses must be parsed from TransportHeaders when available. + DisplayTo string + DisplayCc string + DisplayBcc string + + // Threading headers. + MessageID string + InReplyTo string + References string + + // Timestamps. + SentAt time.Time // ClientSubmitTime + ReceivedAt time.Time // MessageDeliveryTime + CreationTime time.Time // PidTagCreationTime +} + +// AttachmentEntry holds metadata and content for a PST attachment. +type AttachmentEntry struct { + Filename string + MIMEType string + ContentID string + Size int32 + Content []byte +} + +// WalkFolderFunc is called for each folder during WalkFolders. The raw pstlib +// folder is provided so callers can iterate messages. +type WalkFolderFunc func(entry FolderEntry, folder *pstlib.Folder) error + +// WalkFolders walks all folders in the PST file recursively, building +// slash-separated folder paths. Search folders are skipped automatically. +// Returns the first non-nil error returned by fn. +func (f *File) WalkFolders(fn WalkFolderFunc) error { + rootFolder, err := f.pstFile.GetRootFolder() + if err != nil { + return fmt.Errorf("get root folder: %w", err) + } + return walkFoldersRecursive(&rootFolder, "", fn) +} + +func walkFoldersRecursive(folder *pstlib.Folder, parentPath string, fn WalkFolderFunc) error { + path := folder.Name + if parentPath != "" { + path = parentPath + "/" + folder.Name + } + + entry := FolderEntry{ + Name: folder.Name, + Path: path, + MsgCount: folder.MessageCount, + } + + if err := fn(entry, folder); err != nil { + return err + } + + if !folder.HasSubFolders { + return nil + } + + subFolders, err := folder.GetSubFolders() + if err != nil { + // Some PST variants (e.g. 32-bit) can fail to read sub-folder + // metadata; log and continue rather than aborting the walk. 
+ return nil + } + for i := range subFolders { + if err := walkFoldersRecursive(&subFolders[i], path, fn); err != nil { + return err + } + } + return nil +} + +// ExtractMessage extracts email properties from a pstlib.Message. +// Returns nil if the message is not an email (e.g. calendar, contact, task). +func ExtractMessage(msg *pstlib.Message, folderPath string) *MessageEntry { + props, ok := msg.Properties.(*properties.Message) + if !ok { + return nil + } + + subject := props.GetSubject() + if subject == "" { + subject = props.GetNormalizedSubject() + } + if subject == "" { + subject = props.GetInternetSubject() + } + + senderEmail := props.GetSenderEmailAddress() + // Exchange Distinguished Names start with /O= — try to resolve to SMTP. + if isExchangeDN(senderEmail) { + if smtp := props.GetSmtpAddress(); smtp != "" { + senderEmail = smtp + } else { + senderEmail = extractCN(senderEmail) + } + } + + return &MessageEntry{ + EntryID: fmt.Sprintf("%d", msg.Identifier), + FolderPath: folderPath, + Subject: subject, + TransportHeaders: props.GetTransportMessageHeaders(), + BodyText: props.GetBody(), + BodyHTML: props.GetBodyHtml(), + SenderName: props.GetSenderName(), + SenderEmail: senderEmail, + SenderAddressType: props.GetSenderAddressType(), + DisplayTo: props.GetDisplayTo(), + DisplayCc: props.GetDisplayCc(), + DisplayBcc: props.GetDisplayBcc(), + MessageID: props.GetInternetMessageId(), + InReplyTo: props.GetInReplyToId(), + References: props.GetInternetReferences(), + SentAt: windowsFiletimeToTime(props.GetClientSubmitTime()), + ReceivedAt: windowsFiletimeToTime(props.GetMessageDeliveryTime()), + CreationTime: windowsFiletimeToTime(props.GetCreationTime()), + } +} + +var errAttachmentTooLarge = fmt.Errorf("attachment exceeds size limit") + +// limitWriter wraps an io.Writer and returns errAttachmentTooLarge once the +// remaining byte budget is exhausted. 
+type limitWriter struct { + w io.Writer + remaining int64 +} + +func (lw *limitWriter) Write(p []byte) (int, error) { + if int64(len(p)) > lw.remaining { + return 0, errAttachmentTooLarge + } + n, err := lw.w.Write(p) + lw.remaining -= int64(n) + return n, err +} + +// ReadAttachments reads all attachments from a pstlib.Message into memory. +// Returns an empty slice (not an error) when there are no attachments. +// Individual attachment read errors are returned as a non-nil error. +func ReadAttachments(msg *pstlib.Message, maxBytes int64) ([]AttachmentEntry, error) { + iter, err := msg.GetAttachmentIterator() + if eris.Is(err, pstlib.ErrAttachmentsNotFound) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("get attachment iterator: %w", err) + } + + var attachments []AttachmentEntry + var totalBytes int64 + + for iter.Next() { + att := iter.Value() + + filename := att.GetAttachLongFilename() + if filename == "" { + filename = att.GetAttachFilename() + } + + mimeType := att.GetAttachMimeTag() + if mimeType == "" { + mimeType = "application/octet-stream" + } + + // Pre-check using the reported size when available; the bounded writer below + // enforces the limit unconditionally, covering the size==0 case. + if maxBytes > 0 { + estimatedSize := int64(att.GetAttachSize()) + if estimatedSize > 0 && totalBytes+estimatedSize > maxBytes { + break + } + } + + // Stream attachment content through a bounded writer so a corrupted or + // malicious PST reporting size 0 cannot exhaust memory. + var buf bytes.Buffer + w := io.Writer(&buf) + if maxBytes > 0 { + w = &limitWriter{w: &buf, remaining: maxBytes - totalBytes} + } + written, err := att.WriteTo(w) + if err != nil { + // ErrAttachmentTooLarge means we hit the cap; stop reading further attachments. 
+ if err == errAttachmentTooLarge { + break + } + return nil, fmt.Errorf("read attachment %q: %w", filename, err) + } + totalBytes += written + + attachments = append(attachments, AttachmentEntry{ + Filename: filename, + MIMEType: mimeType, + ContentID: att.GetAttachContentId(), + Size: att.GetAttachSize(), + Content: buf.Bytes(), + }) + } + + if err := iter.Err(); err != nil { + return attachments, fmt.Errorf("attachment iterator: %w", err) + } + return attachments, nil +} + +// isExchangeDN reports whether s looks like an Exchange Distinguished Name. +func isExchangeDN(s string) bool { + return strings.HasPrefix(s, "/O=") || strings.HasPrefix(s, "/o=") +} + +// extractCN extracts the last CN= component from an Exchange DN. +// E.g. "/O=CORP/OU=EXCHANGE/CN=RECIPIENTS/CN=JSMITH" → "JSMITH". +func extractCN(dn string) string { + parts := strings.Split(dn, "/") + for i := len(parts) - 1; i >= 0; i-- { + p := parts[i] + if upper := strings.ToUpper(p); strings.HasPrefix(upper, "CN=") { + return p[3:] + } + } + return dn +} diff --git a/internal/pst/reader_test.go b/internal/pst/reader_test.go new file mode 100644 index 00000000..d4b66079 --- /dev/null +++ b/internal/pst/reader_test.go @@ -0,0 +1,347 @@ +package pst_test + +import ( + "path/filepath" + "strings" + "testing" + + pstlib "github.com/mooijtech/go-pst/v6/pkg" + pstreader "github.com/wesm/msgvault/internal/pst" +) + +const testdataDir = "testdata" + +func supportPST(t *testing.T) string { + t.Helper() + return filepath.Join(testdataDir, "support.pst") +} + +func bit32PST(t *testing.T) string { + t.Helper() + return filepath.Join(testdataDir, "32-bit.pst") +} + +// TestOpen_SupportPST verifies that a real 64-bit PST file opens without error. +func TestOpen_SupportPST(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() +} + +// TestOpen_32BitPST verifies that a 32-bit PST file opens without error. 
+func TestOpen_32BitPST(t *testing.T) { + f, err := pstreader.Open(bit32PST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() +} + +// TestOpen_NonExistent verifies a clear error for missing files. +func TestOpen_NonExistent(t *testing.T) { + _, err := pstreader.Open("/nonexistent/path.pst") + if err == nil { + t.Fatal("expected error, got nil") + } +} + +// TestWalkFolders_SupportPST verifies that WalkFolders visits the known folders +// and builds correct slash-separated paths. +func TestWalkFolders_SupportPST(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + seen := make(map[string]int32) // path → message count + if err := f.WalkFolders(func(entry pstreader.FolderEntry, _ *pstlib.Folder) error { + seen[entry.Path] = entry.MsgCount + return nil + }); err != nil { + t.Fatalf("WalkFolders: %v", err) + } + + if len(seen) == 0 { + t.Fatal("WalkFolders visited no folders") + } + + // The support PST has at least these two message-bearing folders. + wantFolders := []string{"Drafts", "Sent Messages"} + for _, want := range wantFolders { + found := false + for path := range seen { + // Path may be "Root/Drafts" or just "Drafts" depending on hierarchy. + if path == want || strings.HasSuffix(path, "/"+want) { + found = true + break + } + } + if !found { + t.Errorf("folder %q not found in: %v", want, keys(seen)) + } + } +} + +// TestWalkFolders_PathsAreSlashSeparated verifies that nested folders produce +// slash-separated paths (e.g. "Personal Folders/Inbox/Archive"). 
+func TestWalkFolders_PathsAreSlashSeparated(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + depth2 := false + _ = f.WalkFolders(func(entry pstreader.FolderEntry, _ *pstlib.Folder) error { + if strings.Count(entry.Path, "/") >= 1 { + depth2 = true + } + return nil + }) + if !depth2 { + t.Error("expected at least one folder path with depth >= 2 (slash-separated)") + } +} + +// TestExtractMessages_SupportPST verifies that email messages are extracted +// with the expected properties from support.pst. +func TestExtractMessages_SupportPST(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + var emails []*pstreader.MessageEntry + if err := f.WalkFolders(func(entry pstreader.FolderEntry, folder *pstlib.Folder) error { + iter, err := folder.GetMessageIterator() + if err != nil { + return nil // ErrMessagesNotFound or empty folder + } + for iter.Next() { + msg := iter.Value() + if e := pstreader.ExtractMessage(msg, entry.Path); e != nil { + emails = append(emails, e) + } + } + return iter.Err() + }); err != nil { + t.Fatalf("WalkFolders: %v", err) + } + + // support.pst contains exactly 17 email messages. + if len(emails) != 17 { + t.Errorf("got %d email messages, want 17", len(emails)) + } + + // Find the first known message by subject. 
+ var found *pstreader.MessageEntry + for _, e := range emails { + if e.Subject == "Desktop exploits suspension notice" { + found = e + break + } + } + if found == nil { + t.Fatal("could not find message with subject 'Desktop exploits suspension notice'") + } + + if found.SenderEmail != "support@hackingteam.com" { + t.Errorf("SenderEmail = %q, want %q", found.SenderEmail, "support@hackingteam.com") + } + if found.SenderName != "RCS Support" { + t.Errorf("SenderName = %q, want %q", found.SenderName, "RCS Support") + } + if found.TransportHeaders == "" { + t.Error("expected TransportHeaders to be non-empty for an internet-delivered message") + } + if found.MessageID == "" { + t.Error("expected MessageID to be non-empty") + } + if found.SentAt.IsZero() { + t.Error("expected SentAt to be non-zero") + } +} + +// TestExtractMessages_NonEmailsSkipped verifies that non-email items (contacts, +// calendar, tasks) do not appear in the extracted message list. +func TestExtractMessages_NonEmailsSkipped(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + var total, emails int + _ = f.WalkFolders(func(_ pstreader.FolderEntry, folder *pstlib.Folder) error { + iter, err := folder.GetMessageIterator() + if err != nil { + return nil + } + for iter.Next() { + total++ + msg := iter.Value() + if pstreader.ExtractMessage(msg, "") != nil { + emails++ + } + } + return nil + }) + // All items in support.pst should be email messages. + if total != emails { + t.Errorf("total items=%d, emails=%d: %d non-email items unexpectedly extracted", + total, emails, emails-total) + } +} + +// TestReadAttachments_SupportPST verifies that the message with known attachments +// returns non-empty attachment content. 
+func TestReadAttachments_SupportPST(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + found := false + _ = f.WalkFolders(func(entry pstreader.FolderEntry, folder *pstlib.Folder) error { + iter, err := folder.GetMessageIterator() + if err != nil { + return nil + } + for iter.Next() { + msg := iter.Value() + e := pstreader.ExtractMessage(msg, entry.Path) + if e == nil || e.Subject != "IMPORTANT: Support portal downtime for maintenance" { + continue + } + found = true + atts, err := pstreader.ReadAttachments(msg, 0) + if err != nil { + t.Errorf("ReadAttachments: %v", err) + return nil + } + if len(atts) != 2 { + t.Errorf("got %d attachments, want 2", len(atts)) + return nil + } + for i, att := range atts { + if len(att.Content) == 0 { + t.Errorf("attachment %d (%q) has empty content", i, att.Filename) + } + } + } + return nil + }) + + if !found { + t.Error("could not find the message with 2 attachments") + } +} + +// TestBuildRFC5322_RoundTrip verifies that MIME built from a real PST message +// can be successfully re-parsed by the msgvault MIME parser. 
+func TestBuildRFC5322_RoundTrip(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + var rawMIME []byte + _ = f.WalkFolders(func(entry pstreader.FolderEntry, folder *pstlib.Folder) error { + if rawMIME != nil { + return nil // already found one + } + iter, err := folder.GetMessageIterator() + if err != nil { + return nil + } + for iter.Next() { + msg := iter.Value() + e := pstreader.ExtractMessage(msg, entry.Path) + if e == nil { + continue + } + atts, _ := pstreader.ReadAttachments(msg, 0) + raw, err := pstreader.BuildRFC5322(e, atts) + if err != nil { + t.Errorf("BuildRFC5322: %v", err) + return nil + } + rawMIME = raw + return nil + } + return nil + }) + + if rawMIME == nil { + t.Fatal("no messages found to test MIME round-trip") + } + if len(rawMIME) == 0 { + t.Fatal("BuildRFC5322 returned empty bytes") + } + + // Verify the output is valid enough for our MIME parser. + s := string(rawMIME) + if !strings.Contains(s, "MIME-Version: 1.0") { + t.Error("MIME output missing MIME-Version header") + } + if !strings.Contains(s, "Content-Type:") { + t.Error("MIME output missing Content-Type header") + } +} + +// TestBuildRFC5322_WithAttachments_RoundTrip verifies the attachment message +// produces valid multipart/mixed MIME. 
+func TestBuildRFC5322_WithAttachments_RoundTrip(t *testing.T) { + f, err := pstreader.Open(supportPST(t)) + if err != nil { + t.Fatalf("Open: %v", err) + } + defer f.Close() + + _ = f.WalkFolders(func(entry pstreader.FolderEntry, folder *pstlib.Folder) error { + iter, err := folder.GetMessageIterator() + if err != nil { + return nil + } + for iter.Next() { + msg := iter.Value() + e := pstreader.ExtractMessage(msg, entry.Path) + if e == nil || e.Subject != "IMPORTANT: Support portal downtime for maintenance" { + continue + } + atts, err := pstreader.ReadAttachments(msg, 0) + if err != nil { + t.Fatalf("ReadAttachments: %v", err) + } + raw, err := pstreader.BuildRFC5322(e, atts) + if err != nil { + t.Fatalf("BuildRFC5322: %v", err) + } + s := string(raw) + if !strings.Contains(s, "multipart/mixed") { + t.Error("expected multipart/mixed for message with attachments") + } + // Both attachments in this message have ContentIDs so they + // render as inline; check for Content-Disposition regardless. + if !strings.Contains(s, "Content-Disposition:") { + t.Error("expected Content-Disposition header in attachment MIME") + } + } + return nil + }) +} + +// keys returns map keys for error messages. +func keys[K comparable, V any](m map[K]V) []K { + out := make([]K, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/internal/pst/testdata/32-bit.pst b/internal/pst/testdata/32-bit.pst new file mode 100644 index 00000000..8800186a Binary files /dev/null and b/internal/pst/testdata/32-bit.pst differ diff --git a/internal/pst/testdata/support.pst b/internal/pst/testdata/support.pst new file mode 100644 index 00000000..869d214d Binary files /dev/null and b/internal/pst/testdata/support.pst differ