diff --git a/README.md b/README.md index fd9da0d..456ed75 100644 --- a/README.md +++ b/README.md @@ -263,7 +263,7 @@ The default pipeline runs `coalesce` before the second overlap detection pass. I Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`. -Different-speaker backchannel and filler segments do not block coalescing of surrounding same-speaker segments. When same-speaker segments are coalesced, any `backchannel` or `filler` category from the merged inputs is dropped from the coalesced segment. +Different-speaker backchannel and filler segments do not block coalescing of surrounding same-speaker segments. Same-speaker backchannel and filler segments are merged normally when they are within `--coalesce-gap`. When same-speaker segments are coalesced, any `backchannel` or `filler` category from the merged inputs is dropped from the coalesced segment. ## Autocorrect diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 9b4da8a..98df1cc 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -665,6 +665,94 @@ func TestMergeTagsFillerSegments(t *testing.T) { } } +func TestMergeCoalescesSameSpeakerBackchannelWithFollowingSegment(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "zach.json", `{ + "segments": [ + {"start": 1, "end": 1.7, "text": "That makes sense."}, + {"start": 1.72, "end": 4, "text": "So, like, next thought."} + ] + }`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(transcript.Segments)) + } + segment := transcript.Segments[0] + if segment.Text != "That makes sense. So, like, next thought." { + t.Fatalf("text = %q", segment.Text) + } + if segment.SourceRef != "coalesce:1" { + t.Fatalf("source_ref = %q, want coalesce:1", segment.SourceRef) + } + if !equalStrings(segment.DerivedFrom, []string{input + "#0", input + "#1"}) { + t.Fatalf("derived_from = %v", segment.DerivedFrom) + } + if len(segment.Categories) != 0 { + t.Fatalf("categories = %v, want none", segment.Categories) + } +} + +func TestMergeCoalescesBackchannelAfterDifferentSpeakerIntoFollowingSameSpeakerSegment(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "mike.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": "previous speaker"} + ] + }`) + inputB := writeJSONFile(t, dir, "zach.json", `{ + "segments": [ + {"start": 2.5, "end": 3, "text": "That makes sense."}, + {"start": 3.02, "end": 6, "text": "So, like, next thought."} + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Mike + match: ["mike.json"] + - speaker: Zach + match: ["zach.json"] +`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + "--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(transcript.Segments)) + } + if transcript.Segments[0].Speaker != "Mike" || transcript.Segments[0].Text != "previous speaker" { + t.Fatalf("first segment = %#v, want Mike original", transcript.Segments[0]) + } + if transcript.Segments[1].Speaker != "Zach" || transcript.Segments[1].Text != "That makes sense. So, like, next thought." { + t.Fatalf("second segment = %#v, want coalesced Zach", transcript.Segments[1]) + } + if !equalStrings(transcript.Segments[1].DerivedFrom, []string{inputB + "#0", inputB + "#1"}) { + t.Fatalf("derived_from = %v", transcript.Segments[1].DerivedFrom) + } + if len(transcript.Segments[1].Categories) != 0 { + t.Fatalf("categories = %v, want none", transcript.Segments[1].Categories) + } +} + func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ diff --git a/internal/coalesce/coalesce.go b/internal/coalesce/coalesce.go index 70ac074..7925d75 100644 --- a/internal/coalesce/coalesce.go +++ b/internal/coalesce/coalesce.go @@ -29,16 +29,22 @@ func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summ current := newRun(in.Segments[0]) pendingSkipped := make([]model.Segment, 0) for _, segment := range in.Segments[1:] { - if current.canMerge(segment, gap) { - current.add(segment) - continue - } - if segment.Speaker != current.speaker() && hasAnyCategory(segment, "backchannel", "filler") { + if segment.Speaker == current.speaker() { + if current.canMerge(segment, gap) { + current.add(segment) + continue + } + } else if isSkippableInterjection(segment) { pendingSkipped = append(pendingSkipped, segment) continue } coalescedID = appendRun(&out, current, coalescedID, &summary) + if seeded, remaining, ok := seedRunFromPending(pendingSkipped, segment, gap); ok { + pendingSkipped = remaining + current = seeded + continue + } out.Segments = append(out.Segments, pendingSkipped...) pendingSkipped = pendingSkipped[:0] current = newRun(segment) @@ -72,6 +78,37 @@ func (r *run) add(segment model.Segment) { r.segments = append(r.segments, segment) } +func seedRunFromPending(pending []model.Segment, segment model.Segment, gap float64) (run, []model.Segment, bool) { + for start := range pending { + if pending[start].Speaker != segment.Speaker { + continue + } + + candidate := newRun(pending[start]) + selected := map[int]struct{}{start: {}} + for index := start + 1; index < len(pending); index++ { + if pending[index].Speaker == segment.Speaker && candidate.canMerge(pending[index], gap) { + candidate.add(pending[index]) + selected[index] = struct{}{} + } + } + if !candidate.canMerge(segment, gap) { + continue + } + + candidate.add(segment) + remaining := make([]model.Segment, 0, len(pending)-len(selected)) + for index, skipped := range pending { + if _, ok := selected[index]; ok { + continue + } + remaining = append(remaining, skipped) + } + return candidate, remaining, true + } + return run{}, pending, false +} + func appendRun(out *model.MergedTranscript, current run, coalescedID int, summary *Summary) int { if len(current.segments) == 1 { out.Segments = append(out.Segments, current.segments[0]) @@ -129,6 +166,10 @@ func segmentRef(segment model.Segment) string { return segment.Source } +func isSkippableInterjection(segment model.Segment) bool { + return hasAnyCategory(segment, "backchannel", "filler") +} + func hasAnyCategory(segment model.Segment, categories ...string) bool { for _, existing := range segment.Categories { for _, category := range categories { diff --git a/internal/coalesce/coalesce_test.go b/internal/coalesce/coalesce_test.go index e004813..380e3f7 100644 --- a/internal/coalesce/coalesce_test.go +++ b/internal/coalesce/coalesce_test.go @@ -165,6 +165,119 @@ func TestApplyDropsFillerCategoryFromMergedSameSpeakerRun(t *testing.T) { } } +func TestApplyMergesSameSpeakerBackchannelIntoDerivedRun(t *testing.T) { + first := segment("zach.json", 110, "Zach", 7811.778, 7812.478, "That makes sense.") + first.Categories = []string{"backchannel"} + second := model.Segment{ + Source: "zach.json", + SourceRef: "coalesce:347", + DerivedFrom: []string{"zach.json#111", "zach.json#112"}, + Speaker: "Zach", + Start: 7812.498, + End: 7824.045, + Text: "So, like, I'm above the silence field.", + } + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3) + if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + if got.Segments[0].Text != "That makes sense. So, like, I'm above the silence field." { + t.Fatalf("text = %q", got.Segments[0].Text) + } + if got.Segments[0].SourceRef != "coalesce:1" { + t.Fatalf("source_ref = %q, want coalesce:1", got.Segments[0].SourceRef) + } + if !reflect.DeepEqual(got.Segments[0].DerivedFrom, []string{"zach.json#110", "coalesce:347"}) { + t.Fatalf("derived_from = %v", got.Segments[0].DerivedFrom) + } + if got.Segments[0].Categories != nil { + t.Fatalf("categories = %v, want nil", got.Segments[0].Categories) + } +} + +func TestApplyMergesSameSpeakerFillerIntoDerivedRun(t *testing.T) { + first := segment("zach.json", 110, "Zach", 1, 1.5, "um") + first.Categories = []string{"filler"} + second := model.Segment{ + Source: "zach.json", + SourceRef: "coalesce:12", + DerivedFrom: []string{"zach.json#111", "zach.json#112"}, + Speaker: "Zach", + Start: 1.6, + End: 4, + Text: "next thought", + } + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3) + if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + if got.Segments[0].Text != "um next thought" { + t.Fatalf("text = %q", got.Segments[0].Text) + } + if !reflect.DeepEqual(got.Segments[0].DerivedFrom, []string{"zach.json#110", "coalesce:12"}) { + t.Fatalf("derived_from = %v", got.Segments[0].DerivedFrom) + } + if got.Segments[0].Categories != nil { + t.Fatalf("categories = %v, want nil", got.Segments[0].Categories) + } +} + +func TestApplyUsesSkippedBackchannelToSeedNextSameSpeakerRun(t *testing.T) { + mike := segment("mike.json", 367, "Mike", 7803.57, 7810.719, "It's very easy to notice.") + backchannel := segment("zach.json", 110, "Zach", 7811.778, 7812.478, "That makes sense.") + backchannel.Categories = []string{"backchannel"} + next := segment("zach.json", 111, "Zach", 7812.498, 7820, "So, like, next thought.") + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{mike, backchannel, next}}, 3) + if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[0].Text != "It's very easy to notice." { + t.Fatalf("first text = %q", got.Segments[0].Text) + } + if got.Segments[1].Speaker != "Zach" || got.Segments[1].Text != "That makes sense. So, like, next thought." { + t.Fatalf("second segment = %#v", got.Segments[1]) + } + if !reflect.DeepEqual(got.Segments[1].DerivedFrom, []string{"zach.json#110", "zach.json#111"}) { + t.Fatalf("derived_from = %v", got.Segments[1].DerivedFrom) + } + if got.Segments[1].Categories != nil { + t.Fatalf("categories = %v, want nil", got.Segments[1].Categories) + } +} + +func TestApplyUsesSkippedFillerToSeedNextSameSpeakerRun(t *testing.T) { + alice := segment("alice.json", 0, "Alice", 1, 2, "first") + filler := segment("bob.json", 0, "Bob", 2.1, 2.3, "um") + filler.Categories = []string{"filler"} + bob := segment("bob.json", 1, "Bob", 2.4, 4, "actual thought") + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{alice, filler, bob}}, 3) + if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[1].Text != "um actual thought" { + t.Fatalf("second text = %q", got.Segments[1].Text) + } + if got.Segments[1].Categories != nil { + t.Fatalf("categories = %v, want nil", got.Segments[1].Categories) + } +} + func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) { first := segment("a.json", 0, "Alice", 1, 2, "first") backchannel := segment("b.json", 0, "Bob", 2.2, 2.5, "yeah")