package cli

import (
	"encoding/json"
	"os"
	"path/filepath"
	"reflect"
	"strings"
	"testing"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/model"
	"gitea.maximumdirect.net/eric/seriatim/internal/report"
	"gitea.maximumdirect.net/eric/seriatim/schema"
)

// TestMergeWritesMergedOutputAndReport merges two input files with a speaker
// map and checks the merged transcript (metadata, segment order, absence of
// internal fields) as well as the exact order of module events in the report.
func TestMergeWritesMergedOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored", "start": 10.1, "end": 10.2}]}, {"start": 1, "end": 2, "text": "first a"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 5, "end": 6, "text": "first b"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	// Inputs are deliberately passed B before A; the metadata assertion below
	// expects them back as [inputA, inputB].
	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// The raw bytes are inspected too, because some expectations concern the
	// serialized form rather than the decoded struct.
	outputBytes, err := os.ReadFile(output)
	if err != nil {
		t.Fatalf("read output bytes: %v", err)
	}
	outputJSON := string(outputBytes)
	if !strings.Contains(outputJSON, `"overlap_groups": []`) {
		t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
	}

	if transcript.Metadata.Application != "seriatim" {
		t.Fatalf("unexpected application metadata: %q", transcript.Metadata.Application)
	}
	if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
		t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
	}
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
	}

	assertSegment(t, transcript.Segments[0], 1, inputA, 1, "Alice", 1, 2, "first a")
	assertSegment(t, transcript.Segments[1], 2, inputB, 0, "Bob", 5, 6, "first b")
	assertSegment(t, transcript.Segments[2], 3, inputA, 0, "Alice", 10, 11, "second a")

	if strings.Contains(outputJSON, "internal_ref") {
		t.Fatalf("did not expect internal_ref in output:\n%s", outputJSON)
	}
	if strings.Contains(outputJSON, "words") {
		t.Fatalf("did not expect words in output:\n%s", outputJSON)
	}
	if len(transcript.OverlapGroups) != 0 {
		t.Fatalf("expected output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	gotModules := make([]string, 0, len(rpt.Events))
	for _, event := range rpt.Events {
		gotModules = append(gotModules, event.Module)
	}
	wantModules := []string{
		"json-files",
		"validate-raw",
		"normalize-speakers",
		"trim-text",
		"chronological-merge",
		"detect-overlaps",
		"resolve-overlaps",
		"backchannel",
		"filler",
		"resolve-danglers",
		"coalesce",
		"detect-overlaps",
		"autocorrect",
		"assign-ids",
		"validate-output",
		"json",
	}
	if !equalStrings(gotModules, wantModules) {
		t.Fatalf("report event order mismatch:\ngot %v\nwant %v", gotModules, wantModules)
	}
	if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 3 output segment(s)") {
		t.Fatal("expected validate-output report event")
	}
	if !hasReportEvent(rpt, "merge", "chronological-merge", "merged 2 canonical transcript(s) into 3 segment(s)") {
		t.Fatal("expected chronological-merge report event")
	}
}

// TestMergeWritesMinimalOutputSchema runs a merge with --output-schema minimal
// and checks the decoded schema.MinimalTranscript, sequential IDs, and that the
// serialized output contains none of the full-schema field names.
func TestMergeWritesMinimalOutputSchema(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 2, "text": " Yeah. "}, {"start": 8, "end": 9, "text": " next "} ] }`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--output-schema", "minimal",
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript schema.MinimalTranscript
	readJSON(t, output, &transcript)
	if transcript.Metadata.Application != "seriatim" {
		t.Fatalf("application = %q, want seriatim", transcript.Metadata.Application)
	}
	if transcript.Metadata.OutputSchema != "minimal" {
		t.Fatalf("output_schema = %q, want minimal", transcript.Metadata.OutputSchema)
	}
	if got, want := len(transcript.Segments), 2; got != want {
		t.Fatalf("segment count = %d, want %d", got, want)
	}
	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d id = %d, want %d", index, segment.ID, index+1)
		}
	}
	// No --speakers file is given, so the expected speaker is the input basename.
	if transcript.Segments[0].Speaker != "input.json" || transcript.Segments[0].Text != "Yeah." {
		t.Fatalf("first segment = %#v", transcript.Segments[0])
	}

	outputBytes, err := os.ReadFile(output)
	if err != nil {
		t.Fatalf("read output: %v", err)
	}
	for _, forbidden := range []string{"overlap_groups", "categories", "source", "derived_from"} {
		if strings.Contains(string(outputBytes), forbidden) {
			t.Fatalf("minimal output contains %q:\n%s", forbidden, outputBytes)
		}
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 2 output segment(s)") {
		t.Fatal("expected validate-output report event")
	}
}

// TestMergeTieBreakOrder feeds three segments that all start at the same time
// and checks the resulting order of segment texts and the sequential IDs.
func TestMergeTieBreakOrder(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 4, "text": "a-late-end"}, {"start": 1, "end": 2, "text": "a-index-one"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 1, "end": 2, "text": "b-same-time"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Guard the fixed-index reads below so an unexpected segment count fails
	// with a message instead of an index-out-of-range panic.
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("segment count = %d, want %d", got, want)
	}

	got := []string{
		transcript.Segments[0].Text,
		transcript.Segments[1].Text,
		transcript.Segments[2].Text,
	}
	want := []string{"a-index-one", "b-same-time", "a-late-end"}
	if !equalStrings(got, want) {
		t.Fatalf("tie-break order mismatch: got %v want %v", got, want)
	}
	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d has id %d; want %d", index, segment.ID, index+1)
		}
	}
}

// TestMergeValidateOutputBeforeAssignIDsFails runs validate-output ahead of
// assign-ids and expects the merge to fail with the validate-output ID error.
func TestMergeValidateOutputBeforeAssignIDsFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 2, "text": "hello"} ] }`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--postprocessing-modules", "validate-output,assign-ids",
	)
	if err == nil {
		t.Fatal("expected validation error")
	}
	if !strings.Contains(err.Error(), "validate-output: segment 0 has id 0; want 1") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMergeValidateMinimalOutputBeforeAssignIDsFails is the minimal-schema
// variant of the validate-output-before-assign-ids failure case.
func TestMergeValidateMinimalOutputBeforeAssignIDsFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 2, "text": "hello"} ] }`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--output-schema", "minimal",
		"--postprocessing-modules", "validate-output,assign-ids",
	)
	if err == nil {
		t.Fatal("expected validation error")
	}
	if !strings.Contains(err.Error(), "validate-output: segment 0 has id 0; want 1") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMergeDetectsOverlapGroups merges overlapping segments that carry no word
// timings and checks the single reported overlap group, the per-segment group
// annotation, the coalesced first segment, and the detect-overlaps report event.
func TestMergeDetectsOverlapGroups(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 5, "text": "alice long"}, {"start": 2, "end": 3, "text": "alice nested"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 4, "end": 6, "text": "bob overlap"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	group := transcript.OverlapGroups[0]
	if group.ID != 1 {
		t.Fatalf("group ID = %d, want 1", group.ID)
	}
	if group.Start != 1 || group.End != 6 {
		t.Fatalf("group bounds = %f-%f, want 1-6", group.Start, group.End)
	}
	wantRefs := []string{"coalesce:1", inputB + "#0"}
	if !equalStrings(group.Segments, wantRefs) {
		t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs)
	}
	if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) {
		t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers)
	}
	if group.Class != "unknown" || group.Resolution != "unresolved" {
		t.Fatalf("unexpected group class/resolution: %q/%q", group.Class, group.Resolution)
	}

	for index, segment := range transcript.Segments {
		if segment.OverlapGroupID != 1 {
			t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID)
		}
	}
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].SourceRef != "coalesce:1" {
		t.Fatalf("coalesced source_ref = %q, want coalesce:1", transcript.Segments[0].SourceRef)
	}
	if !equalStrings(transcript.Segments[0].DerivedFrom, []string{inputA + "#0", inputA + "#1"}) {
		t.Fatalf("coalesced derived_from = %v", transcript.Segments[0].DerivedFrom)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "detect-overlaps", "detected 1 overlap group(s)") {
		t.Fatal("expected detect-overlaps report event")
	}
}

// TestMergeResolvesOverlapGroupsWithWordRuns merges two overlapping segments
// that carry word timings and expects them to be replaced by three word-run
// segments with no overlap groups left in the output.
func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
	// The reorder window is set through the environment for this test only;
	// t.Setenv restores the previous value when the test ends.
	t.Setenv(config.WordRunReorderWindowEnv, "0.4")

	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 5, "text": "alice original", "words": [ {"word": "outside", "start": 0.5, "end": 1.0}, {"word": "hello", "start": 1.1, "end": 1.2, "score": 0.98, "speaker": "SPEAKER_00"}, {"word": "there", "start": 1.8, "end": 2.0}, {"word": "later", "start": 3.0, "end": 3.1} ] } ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.5, "end": 4, "text": "bob original", "words": [ {"word": "bob", "start": 2.2, "end": 2.3}, {"word": "reply", "start": 2.4, "end": 2.5} ] } ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.OverlapGroups) != 0 {
		t.Fatalf("overlap groups = %#v, want none", transcript.OverlapGroups)
	}
	if got, want := len(transcript.Segments), 3; got != want {
		t.Fatalf("segment count = %d, want %d", got, want)
	}

	// Parallel expectation slices, indexed by output segment position.
	wantTexts := []string{"outside hello there", "bob reply", "later"}
	wantSpeakers := []string{"Alice", "Bob", "Alice"}
	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1", "word-run:1:1:2"}
	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d id = %d, want %d", index, segment.ID, index+1)
		}
		if segment.Text != wantTexts[index] {
			t.Fatalf("segment %d text = %q, want %q", index, segment.Text, wantTexts[index])
		}
		if segment.Speaker != wantSpeakers[index] {
			t.Fatalf("segment %d speaker = %q, want %q", index, segment.Speaker, wantSpeakers[index])
		}
		if segment.SourceRef != wantRefs[index] {
			t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index])
		}
		if segment.SourceSegmentIndex != nil {
			t.Fatalf("segment %d source_segment_index = %d, want nil", index, *segment.SourceSegmentIndex)
		}
		if segment.OverlapGroupID != 0 {
			t.Fatalf("segment %d overlap_group_id = %d, want 0", index, segment.OverlapGroupID)
		}
	}
	if !equalStrings(transcript.Segments[0].DerivedFrom, []string{inputA + "#0"}) {
		t.Fatalf("segment 0 derived_from = %v", transcript.Segments[0].DerivedFrom)
	}

	outputBytes, err := os.ReadFile(output)
	if err != nil {
		t.Fatalf("read output bytes: %v", err)
	}
	if strings.Contains(string(outputBytes), "words") {
		t.Fatalf("did not expect word timing in output:\n%s", outputBytes)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "resolve-overlaps", "processed 1 overlap group(s); changed 1; removed 2 original segment(s); created 3 replacement segment(s)") {
		t.Fatal("expected resolve-overlaps summary report event")
	}
}

// TestMergeDetectsResidualOverlapsAfterResolution uses word timings that still
// overlap after word-run resolution and checks that one overlap group remains,
// that both output segments are annotated with it, and that detect-overlaps
// reported twice.
func TestMergeDetectsResidualOverlapsAfterResolution(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 4, "text": "alice residual", "words": [ {"word": "alice", "start": 1.0, "end": 2.0} ] } ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.5, "end": 3, "text": "bob residual", "words": [ {"word": "bob", "start": 1.5, "end": 2.5} ] } ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	group := transcript.OverlapGroups[0]
	if group.ID != 1 {
		t.Fatalf("group ID = %d, want 1", group.ID)
	}
	if group.Start != 1 || group.End != 2.5 {
		t.Fatalf("group bounds = %f-%f, want 1-2.5", group.Start, group.End)
	}
	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"}
	if !equalStrings(group.Segments, wantRefs) {
		t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs)
	}
	if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) {
		t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers)
	}

	// wantRefs is indexed by segment position below; check the count first so
	// extra segments cannot panic and missing segments cannot pass silently.
	if got, want := len(transcript.Segments), len(wantRefs); got != want {
		t.Fatalf("segment count = %d, want %d", got, want)
	}
	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d ID = %d, want %d", index, segment.ID, index+1)
		}
		if segment.OverlapGroupID != 1 {
			t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID)
		}
		if segment.SourceRef != wantRefs[index] {
			t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index])
		}
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	detectMessages := make([]string, 0)
	for _, event := range rpt.Events {
		if event.Stage == "postprocessing" && event.Module == "detect-overlaps" {
			detectMessages = append(detectMessages, event.Message)
		}
	}
	wantMessages := []string{"detected 1 overlap group(s)", "detected 1 overlap group(s)"}
	if !equalStrings(detectMessages, wantMessages) {
		t.Fatalf("detect-overlaps messages = %v, want %v", detectMessages, wantMessages)
	}
}

// TestMergeOrdersNearStartWordRunsShorterFirst checks that, of two word runs
// starting 0.2s apart, the shorter one ("bob-short") is emitted first and that
// both segments keep the residual overlap annotation.
func TestMergeOrdersNearStartWordRunsShorterFirst(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 4, "text": "alice long", "words": [ {"word": "alice-long", "start": 1.0, "end": 2.0} ] } ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.1, "end": 3, "text": "bob short", "words": [ {"word": "bob-short", "start": 1.2, "end": 1.3} ] } ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Text != "bob-short" || transcript.Segments[0].ID != 1 {
		t.Fatalf("first segment = %#v, want bob-short with ID 1", transcript.Segments[0])
	}
	if transcript.Segments[1].Text != "alice-long" || transcript.Segments[1].ID != 2 {
		t.Fatalf("second segment = %#v, want alice-long with ID 2", transcript.Segments[1])
	}
	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	if transcript.Segments[0].OverlapGroupID != 1 || transcript.Segments[1].OverlapGroupID != 1 {
		t.Fatalf("segments should retain residual overlap annotation: %#v", transcript.Segments)
	}
	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"}
	if !equalStrings(transcript.OverlapGroups[0].Segments, wantRefs) {
		t.Fatalf("overlap refs = %v, want %v", transcript.OverlapGroups[0].Segments, wantRefs)
	}
}

// TestMergeCoalescesSameSpeakerSegmentsBeforeFinalOverlapDetection checks that
// Alice's two segments are coalesced into one and that the final overlap group
// refers to the coalesced segment rather than to the originals.
func TestMergeCoalescesSameSpeakerSegmentsBeforeFinalOverlapDetection(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 2, "text": "first"}, {"start": 4, "end": 5, "text": "second"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 4.5, "end": 6, "text": "bob"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	alice := transcript.Segments[0]
	if alice.ID != 1 || alice.Text != "first second" || alice.SourceRef != "coalesce:1" {
		t.Fatalf("unexpected coalesced Alice segment: %#v", alice)
	}
	if alice.SourceSegmentIndex != nil {
		t.Fatalf("coalesced segment source_segment_index = %d, want nil", *alice.SourceSegmentIndex)
	}
	if !equalStrings(alice.DerivedFrom, []string{inputA + "#0", inputA + "#1"}) {
		t.Fatalf("derived_from = %v", alice.DerivedFrom)
	}
	if transcript.Segments[1].ID != 2 || transcript.Segments[1].Text != "bob" {
		t.Fatalf("unexpected Bob segment: %#v", transcript.Segments[1])
	}
	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	group := transcript.OverlapGroups[0]
	if !equalStrings(group.Segments, []string{"coalesce:1", inputB + "#0"}) {
		t.Fatalf("group refs = %v", group.Segments)
	}
	if alice.OverlapGroupID != 1 || transcript.Segments[1].OverlapGroupID != 1 {
		t.Fatalf("expected final overlap annotation after coalesce: %#v", transcript.Segments)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "coalesce", "merged 2 original segment(s) into 1 coalesced segment(s)") {
		t.Fatal("expected coalesce report event")
	}
}

// TestMergeCoalesceGapOverridePreventsMerge passes --coalesce-gap 1 for two
// same-file segments separated by a two-second gap and expects both segments to
// stay separate.
func TestMergeCoalesceGapOverridePreventsMerge(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 2, "text": "first"}, {"start": 4, "end": 5, "text": "second"} ] }`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--coalesce-gap", "1",
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Text != "first" || transcript.Segments[1].Text != "second" {
		t.Fatalf("segments were unexpectedly coalesced: %#v", transcript.Segments)
	}
}

// TestMergeResolveDanglersMergesDanglingEnd checks that Alice's first output
// segment carries the full text "main tail.", that IDs stay sequential, and
// that resolve-danglers reports one merged dangling segment.
func TestMergeResolveDanglersMergesDanglingEnd(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 4, "text": "main tail.", "words": [ {"word": "main", "start": 1, "end": 1.1}, {"word": "tail.", "start": 3, "end": 3.1} ] } ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.5, "end": 2, "text": "interruption", "words": [ {"word": "interruption", "start": 1.5, "end": 2} ] } ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Segments[0] and Segments[1] are read below; fail with a message rather
	// than panic when the output is shorter than that.
	if len(transcript.Segments) < 2 {
		t.Fatalf("segment count = %d, want at least 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "main tail." {
		t.Fatalf("first segment = %#v, want Alice merged dangling end", transcript.Segments[0])
	}
	if transcript.Segments[0].ID != 1 || transcript.Segments[1].ID != 2 {
		t.Fatalf("ids not sequential after resolve-danglers: %#v", transcript.Segments)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "resolve-danglers", "merged 1 dangling segment(s) into 1 target segment(s)") {
		t.Fatal("expected resolve-danglers report event")
	}
}

// TestMergeResolveDanglersMergesDanglingStart checks that some output segment
// spoken by Alice carries the full text "start target words".
func TestMergeResolveDanglersMergesDanglingStart(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 4, "text": "start target words", "words": [ {"word": "start", "start": 1, "end": 1.1}, {"word": "target", "start": 3, "end": 3.1}, {"word": "words", "start": 3.2, "end": 3.3} ] } ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.5, "end": 2, "text": "interruption", "words": [ {"word": "interruption", "start": 1.5, "end": 2} ] } ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	found := false
	for _, segment := range transcript.Segments {
		if segment.Speaker == "Alice" && segment.Text == "start target words" {
			found = true
		}
	}
	if !found {
		t.Fatalf("expected resolved dangling start in output, got %#v", transcript.Segments)
	}
}

// TestMergeTagsBackchannelSegments checks that the " Yeah. " segment is tagged
// with the "backchannel" category, that the other segment has no categories,
// and that the backchannel module reports one tagged segment.
func TestMergeTagsBackchannelSegments(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 1.5, "text": " Yeah. "}, {"start": 6, "end": 7, "text": "not a backchannel"} ] }`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if !equalStrings(transcript.Segments[0].Categories, []string{"backchannel"}) {
		t.Fatalf("segment categories = %v, want [backchannel]", transcript.Segments[0].Categories)
	}
	if len(transcript.Segments[1].Categories) != 0 {
		t.Fatalf("unexpected categories = %v", transcript.Segments[1].Categories)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "backchannel", "tagged 1 backchannel segment(s)") {
		t.Fatal("expected backchannel report event")
	}
}

// TestMergeTagsFillerSegments checks that the " Um uh " segment is tagged with
// the "filler" category, that the other segment has no categories, and that the
// filler module reports one tagged segment.
func TestMergeTagsFillerSegments(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 1.5, "text": " Um uh "}, {"start": 6, "end": 7, "text": "not filler"} ] }`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if !equalStrings(transcript.Segments[0].Categories, []string{"filler"}) {
		t.Fatalf("segment categories = %v, want [filler]", transcript.Segments[0].Categories)
	}
	if len(transcript.Segments[1].Categories) != 0 {
		t.Fatalf("unexpected categories = %v", transcript.Segments[1].Categories)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "postprocessing", "filler", "tagged 1 filler segment(s)") {
		t.Fatal("expected filler report event")
	}
}

// TestMergeCoalescesSameSpeakerBackchannelWithFollowingSegment checks that
// "That makes sense." is coalesced with the same-file segment that follows it
// 0.02s later, and that the coalesced segment carries no categories.
func TestMergeCoalescesSameSpeakerBackchannelWithFollowingSegment(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "zach.json", `{ "segments": [ {"start": 1, "end": 1.7, "text": "That makes sense."}, {"start": 1.72, "end": 4, "text": "So, like, next thought."} ] }`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 1 {
		t.Fatalf("segment count = %d, want 1", len(transcript.Segments))
	}
	segment := transcript.Segments[0]
	if segment.Text != "That makes sense. So, like, next thought." {
		t.Fatalf("text = %q", segment.Text)
	}
	if segment.SourceRef != "coalesce:1" {
		t.Fatalf("source_ref = %q, want coalesce:1", segment.SourceRef)
	}
	if !equalStrings(segment.DerivedFrom, []string{input + "#0", input + "#1"}) {
		t.Fatalf("derived_from = %v", segment.DerivedFrom)
	}
	if len(segment.Categories) != 0 {
		t.Fatalf("categories = %v, want none", segment.Categories)
	}
}

// TestMergeCoalescesBackchannelAfterDifferentSpeakerIntoFollowingSameSpeakerSegment
// checks that Zach's "That makes sense.", spoken right after Mike, is coalesced
// into Zach's following segment, and that Mike's segment is left untouched.
func TestMergeCoalescesBackchannelAfterDifferentSpeakerIntoFollowingSameSpeakerSegment(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "mike.json", `{ "segments": [ {"start": 1, "end": 2, "text": "previous speaker"} ] }`)
	inputB := writeJSONFile(t, dir, "zach.json", `{ "segments": [ {"start": 2.5, "end": 3, "text": "That makes sense."}, {"start": 3.02, "end": 6, "text": "So, like, next thought."} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Mike match: ["mike.json"] - speaker: Zach match: ["zach.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Speaker != "Mike" || transcript.Segments[0].Text != "previous speaker" {
		t.Fatalf("first segment = %#v, want Mike original", transcript.Segments[0])
	}
	if transcript.Segments[1].Speaker != "Zach" || transcript.Segments[1].Text != "That makes sense. So, like, next thought." {
		t.Fatalf("second segment = %#v, want coalesced Zach", transcript.Segments[1])
	}
	if !equalStrings(transcript.Segments[1].DerivedFrom, []string{inputB + "#0", inputB + "#1"}) {
		t.Fatalf("derived_from = %v", transcript.Segments[1].DerivedFrom)
	}
	if len(transcript.Segments[1].Categories) != 0 {
		t.Fatalf("categories = %v, want none", transcript.Segments[1].Categories)
	}
}

// TestMergeCoalescesAroundDifferentSpeakerBackchannel checks that Alice's two
// segments are coalesced across Bob's interjected "yeah", which is kept as its
// own segment tagged "backchannel".
func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 2, "text": "first"}, {"start": 3, "end": 4, "text": "second"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 2.2, "end": 2.5, "text": "yeah"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "first second" {
		t.Fatalf("first segment = %#v, want coalesced Alice", transcript.Segments[0])
	}
	if len(transcript.Segments[0].Categories) != 0 {
		t.Fatalf("coalesced segment categories = %v, want none", transcript.Segments[0].Categories)
	}
	if transcript.Segments[1].Speaker != "Bob" || !equalStrings(transcript.Segments[1].Categories, []string{"backchannel"}) {
		t.Fatalf("second segment = %#v, want Bob backchannel", transcript.Segments[1])
	}
}

// TestMergeCoalescesAroundDifferentSpeakerFiller is the filler counterpart of
// the backchannel case: Bob's interjection is "um" and is expected to be tagged
// "filler".
func TestMergeCoalescesAroundDifferentSpeakerFiller(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ {"start": 1, "end": 2, "text": "first"}, {"start": 3, "end": 4, "text": "second"} ] }`)
	inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ {"start": 2.2, "end": 2.5, "text": "um"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	if len(transcript.Segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
	}
	if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "first second" {
		t.Fatalf("first segment = %#v, want coalesced Alice", transcript.Segments[0])
	}
	if len(transcript.Segments[0].Categories) != 0 {
		t.Fatalf("coalesced segment categories = %v, want none", transcript.Segments[0].Categories)
	}
	if transcript.Segments[1].Speaker != "Bob" || !equalStrings(transcript.Segments[1].Categories, []string{"filler"}) {
		t.Fatalf("second segment = %#v, want Bob filler", transcript.Segments[1])
	}
}

// TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive supplies two rules
// that both match the input file name (the first only when case is ignored) and
// expects the first rule's speaker to be used.
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ "segments": [ {"start": 1, "end": 2, "text": "hello"} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: First Match match: ["adam"] - speaker: Later Match match: ["Adam_Rakestraw"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--output-file", output,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Segments[0] is read below; report empty output as a failure, not a panic.
	if len(transcript.Segments) == 0 {
		t.Fatal("expected at least one output segment")
	}
	if got, want := transcript.Segments[0].Speaker, "First Match"; got != want {
		t.Fatalf("speaker = %q, want %q", got, want)
	}
}

// TestUnknownModulesFailDuringValidation is a table-driven test: each case adds
// one flag naming a nonexistent reader or module and expects the merge error to
// contain the matching "unknown ..." message.
func TestUnknownModulesFailDuringValidation(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `)
	output := filepath.Join(dir, "merged.json")

	tests := []struct {
		name string
		args []string
		want string
	}{
		{
			name: "input reader",
			args: []string{"--input-reader", "missing-reader"},
			want: `unknown input reader "missing-reader"`,
		},
		{
			name: "preprocessing",
			args: []string{"--preprocessing-modules", "validate-raw,missing-module"},
			want: `unknown preprocessing module "missing-module"`,
		},
		{
			name: "postprocessing",
			args: []string{"--postprocessing-modules", "missing-module"},
			want: `unknown postprocessing module "missing-module"`,
		},
		{
			name: "output",
			args: []string{"--output-modules", "missing-module"},
			want: `unknown output module "missing-module"`,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Shared base arguments, followed by the case-specific flag.
			args := []string{
				"--input-file", input,
				"--speakers", speakers,
				"--output-file", output,
			}
			args = append(args, test.args...)

			err := executeMerge(args...)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %q", test.want, err.Error())
			}
		})
	}
}

// TestUnknownOutputSchemaFailsDuringValidation passes --output-schema compact
// and expects an error that mentions the allowed --output-schema values.
func TestUnknownOutputSchemaFailsDuringValidation(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--output-schema", "compact",
	)
	if err == nil {
		t.Fatal("expected output schema error")
	}
	if !strings.Contains(err.Error(), "--output-schema must be one of") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestInvalidPreprocessingOrderFails lists trim-text before validate-raw and
// expects an error stating that the "canonical" state is required.
func TestInvalidPreprocessingOrderFails(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--preprocessing-modules", "trim-text,validate-raw",
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), `requires state "canonical"`) {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestMissingInputFileFailsBeforePipelineExecution points --input-file at a
// path that was never created and expects an error mentioning --input-file.
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
	dir := t.TempDir()
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["missing.json"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", filepath.Join(dir, "missing.json"),
		"--speakers", speakers,
		"--output-file", output,
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), "--input-file") {
		t.Fatalf("unexpected error: %v", err)
	}
}

// TestDefaultMergeWorksWithoutSpeakersOrAutocorrect runs a merge with neither
// --speakers nor --autocorrect and expects the speaker to be the input
// basename, the text to be unchanged, and both fallbacks to appear in the report.
func TestDefaultMergeWorksWithoutSpeakersOrAutocorrect(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Segments[0] is read below; report empty output as a failure, not a panic.
	if len(transcript.Segments) == 0 {
		t.Fatal("expected at least one output segment")
	}
	if got, want := transcript.Segments[0].Speaker, "input.json"; got != want {
		t.Fatalf("speaker = %q, want %q", got, want)
	}
	if got, want := transcript.Segments[0].Text, "Frank"; got != want {
		t.Fatalf("text = %q, want %q", got, want)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	if !hasReportEvent(rpt, "preprocessing", "normalize-speakers", "using input basenames") {
		t.Fatal("expected normalize-speakers fallback report event")
	}
	if !hasReportEvent(rpt, "postprocessing", "autocorrect", "skipped autocorrect") {
		t.Fatal("expected autocorrect skip report event")
	}
}

// TestPreprocessingAutocorrectIsUnknownModule lists autocorrect among the
// preprocessing modules and expects it to be rejected as an unknown
// preprocessing module.
func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `)
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect: - target: Hrank match: ["Frank"] `)
	output := filepath.Join(dir, "merged.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--autocorrect", autocorrect,
		"--output-file", output,
		"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
	)
	if err == nil {
		t.Fatal("expected error")
	}
	if !strings.Contains(err.Error(), `unknown preprocessing module "autocorrect"`) {
		t.Fatalf("unexpected error: %v", err)
	}
}

func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{ "segments": [ {"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."}, {"start": 3, "end": 4, "text": "God-free and FRANK stayed."} ] }`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `)
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect: - target: Hrank match: ["Frank"] - target: Mike Brown match: ["Mike Pat"] - target: Godfrey
match: ["God-free"] `) output := filepath.Join(dir, "merged.json") reportPath := filepath.Join(dir, "report.json") err := executeMerge( "--input-file", input, "--speakers", speakers, "--autocorrect", autocorrect, "--output-file", output, "--report-file", reportPath, "--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output", ) if err != nil { t.Fatalf("merge failed: %v", err) } var transcript model.FinalTranscript readJSON(t, output, &transcript) if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want { t.Fatalf("segment 0 text = %q, want %q", got, want) } if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want { t.Fatalf("segment 1 text = %q, want %q", got, want) } var rpt report.Report readJSON(t, reportPath, &rpt) found := false for _, event := range rpt.Events { if event.Stage == "postprocessing" && event.Module == "autocorrect" { found = true if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") { t.Fatalf("unexpected autocorrect report message: %q", event.Message) } } } if !found { t.Fatal("expected autocorrect report event") } } func TestInvalidAutocorrectFileFailsWhenProvided(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`) output := filepath.Join(dir, "merged.json") autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect: - target: "" match: ["Frank"] `) err := executeMerge( "--input-file", input, "--autocorrect", autocorrect, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), "must include target") { t.Fatalf("unexpected error: %v", err) } } func TestOutputJSONIsByteStable(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`) inputB := writeJSONFile(t, dir, "b.json", 
`{"segments":[{"start":1,"end":2,"text":"b"}]}`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `) outputA := filepath.Join(dir, "merged-a.json") outputB := filepath.Join(dir, "merged-b.json") args := []string{ "--input-file", inputB, "--input-file", inputA, "--speakers", speakers, } err := executeMerge(append(append([]string(nil), args...), "--output-file", outputA)...) if err != nil { t.Fatalf("first merge failed: %v", err) } err = executeMerge(append(append([]string(nil), args...), "--output-file", outputB)...) if err != nil { t.Fatalf("second merge failed: %v", err) } first, err := os.ReadFile(outputA) if err != nil { t.Fatalf("read first output: %v", err) } second, err := os.ReadFile(outputB) if err != nil { t.Fatalf("read second output: %v", err) } if string(first) != string(second) { t.Fatalf("expected byte-stable output\nfirst:\n%s\nsecond:\n%s", first, second) } } func TestMissingSpeakerMappingFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["other.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", input, "--speakers", speakers, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), `speaker map has no match for input basename "input.json"`) { t.Fatalf("unexpected error: %v", err) } } func TestMalformedJSONFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", input, "--speakers", speakers, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), "parse input file") { 
t.Fatalf("unexpected error: %v", err) } } func TestMissingTopLevelSegmentsFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{}`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", input, "--speakers", speakers, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), "must contain top-level segments array") { t.Fatalf("unexpected error: %v", err) } } func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) { tests := []struct { name string json string want string }{ { name: "missing start", json: `{"segments":[{"end":1,"text":"x"}]}`, want: "segment 0 missing numeric start", }, { name: "wrong typed end", json: `{"segments":[{"start":0,"end":"1","text":"x"}]}`, want: "segment 0 end must be numeric", }, { name: "wrong typed text", json: `{"segments":[{"start":0,"end":1,"text":7}]}`, want: "segment 0 text must be a string", }, { name: "null text", json: `{"segments":[{"start":0,"end":1,"text":null}]}`, want: "segment 0 missing string text", }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", test.json) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["input.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", input, "--speakers", speakers, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), input) { t.Fatalf("expected error to contain source path %q, got %v", input, err) } if !strings.Contains(err.Error(), test.want) { t.Fatalf("expected error to contain %q, got %v", test.want, err) } }) } } func TestInvalidWordFieldsFailWithSourceAndIndex(t *testing.T) { tests := []struct { name string json string want string }{ { name: "words not array", 
json: `{"segments":[{"start":0,"end":1,"text":"x","words":{}}]}`, want: "segment 0 words must be an array", }, { name: "missing word", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"start":0,"end":0.1}]}]}`, want: "segment 0 word 0 missing string word", }, { name: "wrong typed word", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":7,"start":0,"end":0.1}]}]}`, want: "segment 0 word 0 word must be a string", }, { name: "wrong typed start", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":"0","end":0.1}]}]}`, want: "segment 0 word 0 start must be numeric", }, { name: "wrong typed end", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":"0.1"}]}]}`, want: "segment 0 word 0 end must be numeric", }, { name: "wrong typed score", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":0.1,"score":"good"}]}]}`, want: "segment 0 word 0 score must be numeric", }, { name: "wrong typed speaker", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":0.1,"speaker":7}]}]}`, want: "segment 0 word 0 speaker must be a string", }, { name: "negative start", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":-0.1,"end":0.1}]}]}`, want: "segment 0 word 0 has negative start", }, { name: "end before start", json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0.2,"end":0.1}]}]}`, want: "segment 0 word 0 has end before start", }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", test.json) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", input, "--output-file", output, ) if err == nil { t.Fatal("expected error") } if !strings.Contains(err.Error(), input) { t.Fatalf("expected error to contain source path %q, got %v", input, err) } if !strings.Contains(err.Error(), 
test.want) { t.Fatalf("expected error to contain %q, got %v", test.want, err) } }) } } func TestUntimedWordsAreAcceptedAndReported(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{ "segments": [ { "start": 1, "end": 2, "text": "about 13", "words": [ {"word": "about", "start": 1.1, "end": 1.2}, {"word": "13"} ] } ] }`) output := filepath.Join(dir, "merged.json") reportPath := filepath.Join(dir, "report.json") err := executeMerge( "--input-file", input, "--output-file", output, "--report-file", reportPath, ) if err != nil { t.Fatalf("merge failed: %v", err) } var rpt report.Report readJSON(t, reportPath, &rpt) if !hasReportEvent(rpt, "input", "json-files", `segment 0 word 1 "13" has no complete timing`) { t.Fatal("expected untimed word warning report event") } foundWarning := false for _, event := range rpt.Events { if event.Stage == "input" && event.Module == "json-files" && strings.Contains(event.Message, `"13" has no complete timing`) { foundWarning = event.Severity == report.SeverityWarning } } if !foundWarning { t.Fatal("expected untimed word event to use warning severity") } } func TestMergeResolutionPreservesUntimedWordText(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 1, "end": 3, "text": "about 13 and a half", "words": [ {"word": "about", "start": 1.1, "end": 1.2}, {"word": "13"}, {"word": "and", "start": 1.24, "end": 1.3}, {"word": "a", "start": 1.32, "end": 1.34}, {"word": "half", "start": 1.36, "end": 1.5} ] } ] }`) inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 1.15, "end": 2, "text": "bob overlap", "words": [ {"word": "bob", "start": 1.6, "end": 1.7}, {"word": "overlap", "start": 1.6, "end": 1.8} ] } ] }`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", inputA, "--input-file", 
inputB, "--speakers", speakers, "--output-file", output, ) if err != nil { t.Fatalf("merge failed: %v", err) } var transcript model.FinalTranscript readJSON(t, output, &transcript) if len(transcript.OverlapGroups) != 0 { t.Fatalf("expected overlap group to be resolved, got %#v", transcript.OverlapGroups) } found := false for _, segment := range transcript.Segments { if segment.Speaker == "Alice" && segment.Text == "about 13 and a half" { found = true if segment.Start != 1.1 || segment.End != 1.5 { t.Fatalf("Alice replacement bounds = %f-%f, want 1.1-1.5", segment.Start, segment.End) } } } if !found { t.Fatalf("expected Alice replacement to preserve untimed word text, got %#v", transcript.Segments) } } func TestMergeResolveOverlapsAbsorbsNearbyContext(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 9, "end": 9.95, "text": "before", "words": [ {"word": "before", "start": 9.7, "end": 9.9} ] }, { "start": 10, "end": 11, "text": "inside", "words": [ {"word": "inside", "start": 10.5, "end": 10.7} ] }, { "start": 11.1, "end": 12, "text": "after", "words": [ {"word": "after", "start": 11.2, "end": 11.3} ] } ] }`) inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 10.2, "end": 11, "text": "bob", "words": [ {"word": "bob", "start": 10.4, "end": 10.6} ] } ] }`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", inputA, "--input-file", inputB, "--speakers", speakers, "--output-file", output, ) if err != nil { t.Fatalf("merge failed: %v", err) } var transcript model.FinalTranscript readJSON(t, output, &transcript) var aliceSegments []model.Segment for _, segment := range transcript.Segments { if segment.Speaker == "Alice" { aliceSegments = append(aliceSegments, segment) } } if len(aliceSegments) != 1 { t.Fatalf("Alice segment count = %d, want 1: 
%#v", len(aliceSegments), aliceSegments) } if aliceSegments[0].Text != "before inside after" { t.Fatalf("Alice text = %q", aliceSegments[0].Text) } if !reflect.DeepEqual(aliceSegments[0].DerivedFrom, []string{inputA + "#0", inputA + "#1", inputA + "#2"}) { t.Fatalf("Alice derived_from = %v", aliceSegments[0].DerivedFrom) } } func TestMergeResolveOverlapsPreservesAbsorbedContextPrefix(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ { "start": 7, "end": 9.95, "text": "full context prefix near", "words": [ {"word": "full", "start": 7.1, "end": 7.2}, {"word": "context", "start": 7.3, "end": 7.4}, {"word": "prefix", "start": 7.5, "end": 7.6}, {"word": "near", "start": 9.7, "end": 9.9} ] }, { "start": 10, "end": 11, "text": "inside", "words": [ {"word": "inside", "start": 10.5, "end": 10.7} ] } ] }`) inputB := writeJSONFile(t, dir, "b.json", `{ "segments": [ { "start": 10.2, "end": 11, "text": "bob", "words": [ {"word": "bob", "start": 10.4, "end": 10.6} ] } ] }`) speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - speaker: Alice match: ["a.json"] - speaker: Bob match: ["b.json"] `) output := filepath.Join(dir, "merged.json") err := executeMerge( "--input-file", inputA, "--input-file", inputB, "--speakers", speakers, "--output-schema", "minimal", "--output-file", output, ) if err != nil { t.Fatalf("merge failed: %v", err) } var transcript schema.MinimalTranscript readJSON(t, output, &transcript) aliceText := make([]string, 0) for _, segment := range transcript.Segments { if segment.Speaker == "Alice" { aliceText = append(aliceText, segment.Text) } } if strings.Join(aliceText, " ") != "full context prefix near inside" { t.Fatalf("expected full absorbed context prefix in Alice output, got %#v", transcript.Segments) } } func TestInvalidTimingFails(t *testing.T) { tests := []struct { name string json string want string }{ { name: "negative start", json: `{"segments":[{"start":-1,"end":1,"text":"x"}]}`, want: "segment 0 
// writeJSONFile creates the file name inside dir containing content followed
// by one trailing newline, with mode 0o600, and returns the file's path. A
// write error aborts the calling test.
func writeJSONFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()

	target := filepath.Join(dir, name)
	payload := []byte(content + "\n")
	err := os.WriteFile(target, payload, 0o600)
	if err != nil {
		t.Fatalf("write file: %v", err)
	}
	return target
}

// writeYAMLFile creates the file name inside dir containing exactly content
// (nothing is appended), with mode 0o600, and returns the file's path. A write
// error aborts the calling test.
func writeYAMLFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()

	target := filepath.Join(dir, name)
	err := os.WriteFile(target, []byte(content), 0o600)
	if err != nil {
		t.Fatalf("write file: %v", err)
	}
	return target
}

// readJSON loads the file at path and unmarshals its JSON into target, which
// is handed to json.Unmarshal as-is. A read or decode error aborts the calling
// test with a message that includes path.
func readJSON(t *testing.T, path string, target any) {
	t.Helper()

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		t.Fatalf("read %s: %v", path, readErr)
	}

	decodeErr := json.Unmarshal(raw, target)
	if decodeErr != nil {
		t.Fatalf("unmarshal %s: %v", path, decodeErr)
	}
}

// equalStrings reports whether left and right have the same length and the
// same element at every position. A nil slice and an empty slice compare
// equal.
func equalStrings(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for i, want := range right {
		if left[i] != want {
			return false
		}
	}
	return true
}
{ return true } } return false } func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) { t.Helper() if segment.ID != id { t.Fatalf("segment ID = %d, want %d", segment.ID, id) } if segment.Source != source { t.Fatalf("segment source = %q, want %q", segment.Source, source) } if segment.SourceSegmentIndex == nil { t.Fatalf("segment source index = nil, want %d", sourceIndex) } if *segment.SourceSegmentIndex != sourceIndex { t.Fatalf("segment source index = %d, want %d", *segment.SourceSegmentIndex, sourceIndex) } if segment.Speaker != speaker { t.Fatalf("segment speaker = %q, want %q", segment.Speaker, speaker) } if segment.Start != start { t.Fatalf("segment start = %f, want %f", segment.Start, start) } if segment.End != end { t.Fatalf("segment end = %f, want %f", segment.End, end) } if segment.Text != text { t.Fatalf("segment text = %q, want %q", segment.Text, text) } }