package cli

import (
	"encoding/json"
	"os"
	"path/filepath"
	"strings"
	"testing"

	"gitea.maximumdirect.net/eric/seriatim/internal/model"
	"gitea.maximumdirect.net/eric/seriatim/internal/report"
)

// TestMergeWritesMergedOutputAndReport runs a two-input merge and checks the
// written transcript (metadata, segment order, trimmed text, absence of
// internal fields) as well as the order of modules recorded in the report.
func TestMergeWritesMergedOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{
  "segments": [
    {"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored"}]},
    {"start": 1, "end": 2, "text": "first a"}
  ]
}`)
	inputB := writeJSONFile(t, dir, "b.json", `{
  "segments": [
    {"start": 5, "end": 6, "text": "first b"}
  ]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["a.json"]
  - speaker: Bob
    match: ["b.json"]
`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	// Inputs are passed in reverse order on purpose so the sorted-metadata
	// assertion below is meaningful.
	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// The raw bytes are inspected too, because some expectations concern the
	// serialized form rather than the decoded struct.
	outputBytes, err := os.ReadFile(output)
	if err != nil {
		t.Fatalf("read output bytes: %v", err)
	}
	outputJSON := string(outputBytes)
	if !strings.Contains(outputJSON, `"overlap_groups": []`) {
		t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
	}

	if transcript.Metadata.Application != "seriatim" {
		t.Fatalf("unexpected application metadata: %q", transcript.Metadata.Application)
	}
	if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
		t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
	}
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
	}

	assertSegment(t, transcript.Segments[0], 1, inputA, 1, "Alice", 1, 2, "first a")
	assertSegment(t, transcript.Segments[1], 2, inputB, 0, "Bob", 5, 6, "first b")
	assertSegment(t, transcript.Segments[2], 3, inputA, 0, "Alice", 10, 11, "second a")

	if strings.Contains(outputJSON, "internal_ref") {
		t.Fatalf("did not expect internal_ref in output:\n%s", outputJSON)
	}
	if strings.Contains(outputJSON, "words") {
		t.Fatalf("did not expect words in output:\n%s", outputJSON)
	}
	if len(transcript.OverlapGroups) != 0 {
		t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	gotModules := make([]string, 0, len(rpt.Events))
	for _, event := range rpt.Events {
		gotModules = append(gotModules, event.Module)
	}
	wantModules := []string{
		"json-files",
		"validate-raw",
		"normalize-speakers",
		"trim-text",
		"placeholder-merger",
		"detect-overlaps",
		"resolve-overlaps",
		"autocorrect",
		"assign-ids",
		"validate-output",
		"json",
	}
	if !equalStrings(gotModules, wantModules) {
		t.Fatalf("report event order mismatch:\ngot %v\nwant %v", gotModules, wantModules)
	}
}

// TestMergeTieBreakOrder checks the output order of segments that share a
// start time, and that segment IDs are assigned sequentially from 1.
func TestMergeTieBreakOrder(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{
  "segments": [
    {"start": 1, "end": 4, "text": "a-late-end"},
    {"start": 1, "end": 2, "text": "a-index-one"}
  ]
}`)
	inputB := writeJSONFile(t, dir, "b.json", `{
  "segments": [
    {"start": 1, "end": 2, "text": "b-same-time"}
  ]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["a.json"]
  - speaker: Bob
    match: ["b.json"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Guard the fixed-index accesses below so a short result is reported as a
	// test failure instead of an index-out-of-range panic.
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
	}

	got := []string{
		transcript.Segments[0].Text,
		transcript.Segments[1].Text,
		transcript.Segments[2].Text,
	}
	want := []string{"a-index-one", "b-same-time", "a-late-end"}
	if !equalStrings(got, want) {
		t.Fatalf("tie-break order mismatch: got %v want %v", got, want)
	}

	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d has id %d; want %d", index, segment.ID, index+1)
		}
	}
}

// TestMergeDetectsOverlapGroups feeds three mutually chained overlapping
// segments and expects a single overlap group covering all of them, plus a
// matching detect-overlaps event in the report.
func TestMergeDetectsOverlapGroups(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{
  "segments": [
    {"start": 1, "end": 5, "text": "alice long"},
    {"start": 2, "end": 3, "text": "alice nested"}
  ]
}`)
	inputB := writeJSONFile(t, dir, "b.json", `{
  "segments": [
    {"start": 4, "end": 6, "text": "bob overlap"}
  ]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["a.json"]
  - speaker: Bob
    match: ["b.json"]
`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	group := transcript.OverlapGroups[0]
	if group.ID != 1 {
		t.Fatalf("group ID = %d, want 1", group.ID)
	}
	if group.Start != 1 || group.End != 6 {
		t.Fatalf("group bounds = %f-%f, want 1-6", group.Start, group.End)
	}
	wantRefs := []string{inputA + "#0", inputA + "#1", inputB + "#0"}
	if !equalStrings(group.Segments, wantRefs) {
		t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs)
	}
	if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) {
		t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers)
	}
	if group.Class != "unknown" || group.Resolution != "unresolved" {
		t.Fatalf("unexpected group class/resolution: %q/%q", group.Class, group.Resolution)
	}

	// Without this count check the per-segment loop would pass vacuously on an
	// empty segment list.
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
	}
	for index, segment := range transcript.Segments {
		if segment.OverlapGroupID != 1 {
			t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID)
		}
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "detect-overlaps", "detected 1 overlap group(s)") {
		t.Fatal("expected detect-overlaps report event")
	}
}

// TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive expects the first
// speaker rule whose pattern matches the input name (here "adam" against
// "...Adam_Rakestraw.json") to win over a later, more specific rule.
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{
  "segments": [
    {"start": 1, "end": 2, "text": "hello"}
  ]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: First Match
    match: ["adam"]
  - speaker: Later Match
    match: ["Adam_Rakestraw"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) == 0 {
		t.Fatal("expected merged output to contain at least one segment")
	}
	if got, want := transcript.Segments[0].Speaker, "First Match"; got != want {
		t.Fatalf("speaker = %q, want %q", got, want)
	}
}

// TestUnknownModulesFailDuringValidation checks that naming an unregistered
// input reader or pipeline module yields an error identifying that name.
func TestUnknownModulesFailDuringValidation(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
	output := filepath.Join(dir, "merged.json")

	tests := []struct {
		name string
		args []string
		want string
	}{
		{
			name: "input reader",
			args: []string{"--input-reader", "missing-reader"},
			want: `unknown input reader "missing-reader"`,
		},
		{
			name: "preprocessing",
			args: []string{"--preprocessing-modules", "validate-raw,missing-module"},
			want: `unknown preprocessing module "missing-module"`,
		},
		{
			name: "postprocessing",
			args: []string{"--postprocessing-modules", "missing-module"},
			want: `unknown postprocessing module "missing-module"`,
		},
		{
			name: "output",
			args: []string{"--output-modules", "missing-module"},
			want: `unknown output module "missing-module"`,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			args := []string{
				"--input-file", input,
				"--speakers", speakers,
				"--output-file", output,
			}
			args = append(args, test.args...)

			err := executeMerge(args...)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %q", test.want, err.Error())
			}
		})
	}
}

// TestInvalidPreprocessingOrderFails expects an error mentioning the required
// "canonical" state when trim-text is listed before validate-raw.
func TestInvalidPreprocessingOrderFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--preprocessing-modules", "trim-text,validate-raw",
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), `requires state "canonical"`) {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMissingInputFileFailsBeforePipelineExecution expects merge to fail with
// an error mentioning --input-file when the given input path does not exist.
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
	dir := t.TempDir()
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["missing.json"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", filepath.Join(dir, "missing.json"),
		"--speakers", speakers,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "--input-file") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestDefaultMergeWorksWithoutSpeakersOrAutocorrect runs merge with neither
// --speakers nor --autocorrect and expects the input basename as speaker,
// untouched text, and report events describing both fallbacks.
func TestDefaultMergeWorksWithoutSpeakersOrAutocorrect(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) == 0 {
		t.Fatal("expected merged output to contain at least one segment")
	}
	if got, want := transcript.Segments[0].Speaker, "input.json"; got != want {
		t.Fatalf("speaker = %q, want %q", got, want)
	}
	if got, want := transcript.Segments[0].Text, "Frank"; got != want {
		t.Fatalf("text = %q, want %q", got, want)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "preprocessing", "normalize-speakers", "using input basenames") {
		t.Fatal("expected normalize-speakers fallback report event")
	}
	if !hasReportEvent(rpt, "postprocessing", "autocorrect", "skipped autocorrect") {
		t.Fatal("expected autocorrect skip report event")
	}
}

// TestPreprocessingAutocorrectIsUnknownModule expects "autocorrect" to be
// rejected as an unknown module when listed under --preprocessing-modules.
func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
  - target: Hrank
    match: ["Frank"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--autocorrect", autocorrect,
		"--output-file", output,
		"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), `unknown preprocessing module "autocorrect"`) {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestPostprocessingAutocorrectUpdatesOutputAndReport applies three
// autocorrect rules and checks the rewritten segment text as well as the
// replacement count reported by the autocorrect module. The expected text
// leaves "Franklin" and "FRANK" unchanged.
func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{
  "segments": [
    {"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."},
    {"start": 3, "end": 4, "text": "God-free and FRANK stayed."}
  ]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
  - target: Hrank
    match: ["Frank"]
  - target: Mike Brown
    match: ["Mike Pat"]
  - target: Godfrey
    match: ["God-free"]
`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--autocorrect", autocorrect,
		"--output-file", output,
		"--report-file", reportPath,
		"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if got, want := len(transcript.Segments), 2; got < want {
		t.Fatalf("expected merged output to contain at least %d segments, got %d", want, got)
	}
	if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want {
		t.Fatalf("segment 0 text = %q, want %q", got, want)
	}
	if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want {
		t.Fatalf("segment 1 text = %q, want %q", got, want)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	found := false
	for _, event := range rpt.Events {
		if event.Stage == "postprocessing" && event.Module == "autocorrect" {
			found = true
			if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") {
				t.Fatalf("unexpected autocorrect report message: %q", event.Message)
			}
		}
	}
	if !found {
		t.Fatal("expected autocorrect report event")
	}
}

// TestInvalidAutocorrectFileFailsWhenProvided expects a rule with an empty
// target to be rejected with a "must include target" error.
func TestInvalidAutocorrectFileFailsWhenProvided(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`)
	output := filepath.Join(dir, "merged.json")
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
  - target: ""
    match: ["Frank"]
`)

	err := executeMerge(
		"--input-file", input,
		"--autocorrect", autocorrect,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "must include target") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestOutputJSONIsByteStable runs the same merge twice into different output
// files and requires the two files to be byte-for-byte identical.
func TestOutputJSONIsByteStable(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
	inputB := writeJSONFile(t, dir, "b.json", `{"segments":[{"start":1,"end":2,"text":"b"}]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["a.json"]
  - speaker: Bob
    match: ["b.json"]
`)
	outputA := filepath.Join(dir, "merged-a.json")
	outputB := filepath.Join(dir, "merged-b.json")

	args := []string{
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
	}

	// args is copied before each append so the two invocations never share a
	// backing array.
	err := executeMerge(append(append([]string(nil), args...), "--output-file", outputA)...)
	if err != nil {
		t.Fatalf("first merge failed: %v", err)
	}
	err = executeMerge(append(append([]string(nil), args...), "--output-file", outputB)...)
	if err != nil {
		t.Fatalf("second merge failed: %v", err)
	}

	first, err := os.ReadFile(outputA)
	if err != nil {
		t.Fatalf("read first output: %v", err)
	}
	second, err := os.ReadFile(outputB)
	if err != nil {
		t.Fatalf("read second output: %v", err)
	}
	if string(first) != string(second) {
		t.Fatalf("expected byte-stable output\nfirst:\n%s\nsecond:\n%s", first, second)
	}
}

// TestMissingSpeakerMappingFails expects an error naming the input basename
// when a speaker map is supplied but none of its rules match the input.
func TestMissingSpeakerMappingFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["other.json"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), `speaker map has no match for input basename "input.json"`) {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMalformedJSONFails expects a "parse input file" error for truncated
// JSON input.
func TestMalformedJSONFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "parse input file") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMissingTopLevelSegmentsFails expects an error when the input JSON
// object has no top-level "segments" array.
func TestMissingTopLevelSegmentsFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "must contain top-level segments array") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestInvalidSegmentFieldsFailWithSourceAndIndex checks that missing or
// wrongly typed segment fields produce an error containing both the source
// file path and a message that names the segment index.
func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) {
	tests := []struct {
		name string
		json string
		want string
	}{
		{
			name: "missing start",
			json: `{"segments":[{"end":1,"text":"x"}]}`,
			want: "segment 0 missing numeric start",
		},
		{
			name: "wrong typed end",
			json: `{"segments":[{"start":0,"end":"1","text":"x"}]}`,
			want: "segment 0 end must be numeric",
		},
		{
			name: "wrong typed text",
			json: `{"segments":[{"start":0,"end":1,"text":7}]}`,
			want: "segment 0 text must be a string",
		},
		{
			name: "null text",
			json: `{"segments":[{"start":0,"end":1,"text":null}]}`,
			want: "segment 0 missing string text",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dir := t.TempDir()
			input := writeJSONFile(t, dir, "input.json", test.json)
			speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
			output := filepath.Join(dir, "merged.json")

			err := executeMerge(
				"--input-file", input,
				"--speakers", speakers,
				"--output-file", output,
			)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), input) {
				t.Fatalf("expected error to contain source path %q, got %v", input, err)
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %v", test.want, err)
			}
		})
	}
}

// TestInvalidTimingFails checks that a negative start time and an end time
// earlier than the start time are each rejected with a specific message.
func TestInvalidTimingFails(t *testing.T) {
	tests := []struct {
		name string
		json string
		want string
	}{
		{
			name: "negative start",
			json: `{"segments":[{"start":-1,"end":1,"text":"x"}]}`,
			want: "segment 0 has negative start",
		},
		{
			name: "end before start",
			json: `{"segments":[{"start":2,"end":1,"text":"x"}]}`,
			want: "segment 0 has end before start",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dir := t.TempDir()
			input := writeJSONFile(t, dir, "input.json", test.json)
			speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
`)
			output := filepath.Join(dir, "merged.json")

			err := executeMerge(
				"--input-file", input,
				"--speakers", speakers,
				"--output-file", output,
			)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %v", test.want, err)
			}
		})
	}
}

// executeMerge builds a fresh root command, invokes its "merge" subcommand
// with args, and returns the error from Execute.
func executeMerge(args ...string) error {
	cmd := NewRootCommand()
	cmd.SetArgs(append([]string{"merge"}, args...))
	return cmd.Execute()
}

// writeJSONFile writes content followed by a trailing newline to dir/name
// with mode 0o600 and returns the full path. It fails the test on write error.
func writeJSONFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()
	path := filepath.Join(dir, name)
	if err := os.WriteFile(path, []byte(content+"\n"), 0o600); err != nil {
		t.Fatalf("write file: %v", err)
	}
	return path
}

// writeYAMLFile writes content verbatim to dir/name with mode 0o600 and
// returns the full path. It fails the test on write error.
func writeYAMLFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()
	path := filepath.Join(dir, name)
	if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
		t.Fatalf("write file: %v", err)
	}
	return path
}

// readJSON reads the file at path and unmarshals it into target, failing the
// test if either step returns an error.
func readJSON(t *testing.T, path string, target any) {
	t.Helper()
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("read %s: %v", path, err)
	}
	if err := json.Unmarshal(data, target); err != nil {
		t.Fatalf("unmarshal %s: %v", path, err)
	}
}

// equalStrings reports whether left and right have the same length and the
// same elements in the same order.
func equalStrings(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for index := range left {
		if left[index] != right[index] {
			return false
		}
	}
	return true
}

// hasReportEvent reports whether rpt contains an event with the given stage
// and module whose message contains messageSubstring.
func hasReportEvent(rpt report.Report, stage string, module string, messageSubstring string) bool {
	for _, event := range rpt.Events {
		if event.Stage == stage && event.Module == module && strings.Contains(event.Message, messageSubstring) {
			return true
		}
	}
	return false
}

// assertSegment fails the test at the first field of segment that differs
// from the expected ID, source, source index, speaker, bounds, or text.
func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) {
	t.Helper()
	if segment.ID != id {
		t.Fatalf("segment ID = %d, want %d", segment.ID, id)
	}
	if segment.Source != source {
		t.Fatalf("segment source = %q, want %q", segment.Source, source)
	}
	if segment.SourceSegmentIndex != sourceIndex {
		t.Fatalf("segment source index = %d, want %d", segment.SourceSegmentIndex, sourceIndex)
	}
	if segment.Speaker != speaker {
		t.Fatalf("segment speaker = %q, want %q", segment.Speaker, speaker)
	}
	if segment.Start != start {
		t.Fatalf("segment start = %f, want %f", segment.Start, start)
	}
	if segment.End != end {
		t.Fatalf("segment end = %f, want %f", segment.End, end)
	}
	if segment.Text != text {
		t.Fatalf("segment text = %q, want %q", segment.Text, text)
	}
}