Files
seriatim/internal/coalesce/coalesce_test.go

399 lines
14 KiB
Go

package coalesce
import (
"reflect"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// TestApplyMergesConsecutiveSameSpeakerWithinGap passes Apply two consecutive
// Alice segments (ending at 2 and starting at 4) with a gap argument of 3 and
// expects a single output segment: whitespace-trimmed, space-joined text, the
// outer time bounds, the shared source, a "coalesce:1" source ref, no source
// segment index, and provenance that lists both inputs.
func TestApplyMergesConsecutiveSameSpeakerWithinGap(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 1, 2, " first "),
			segment("a.json", 1, "Alice", 4, 5, "second"),
		},
	}

	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}

	// Named "only" rather than "segment" so the helper is not shadowed.
	only := result.Segments[0]
	if only.Text != "first second" {
		t.Fatalf("text = %q", only.Text)
	}
	if !(only.Start == 1 && only.End == 5) {
		t.Fatalf("bounds = %f-%f, want 1-5", only.Start, only.End)
	}
	if only.Source != "a.json" {
		t.Fatalf("source = %q, want a.json", only.Source)
	}
	if only.SourceRef != "coalesce:1" {
		t.Fatalf("source_ref = %q, want coalesce:1", only.SourceRef)
	}
	if idx := only.SourceSegmentIndex; idx != nil {
		t.Fatalf("source_segment_index = %d, want nil", *idx)
	}
	wantDerived := []string{"a.json#0", "a.json#1"}
	if !reflect.DeepEqual(only.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", only.DerivedFrom)
	}
}
// TestApplyDoesNotMergeSameSpeakerBeyondGap checks that two Alice segments
// (ending at 2 and starting at 5.1) are returned unchanged when the gap
// argument is 3, and that the summary reports no merging.
func TestApplyDoesNotMergeSameSpeakerBeyondGap(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 1, 2, "first"),
			segment("a.json", 1, "Alice", 5.1, 6, "second"),
		},
	}

	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 0 && stats.CoalescedSegments == 0) {
		t.Fatalf("summary = %#v", stats)
	}
	if reflect.DeepEqual(result.Segments, input.Segments) {
		return
	}
	t.Fatalf("segments changed:\ngot %#v\nwant %#v", result.Segments, input.Segments)
}
// TestApplyDoesNotMergeAcrossDifferentSpeaker places an uncategorized Bob
// segment between two Alice segments and expects all three segments to come
// back with a summary that reports no merging.
func TestApplyDoesNotMergeAcrossDifferentSpeaker(t *testing.T) {
	segments := []model.Segment{
		segment("a.json", 0, "Alice", 1, 2, "first"),
		segment("b.json", 0, "Bob", 2.5, 3, "bob"),
		segment("a.json", 1, "Alice", 3.5, 4, "second"),
	}

	result, stats := Apply(model.MergedTranscript{Segments: segments}, 3)
	if !(stats.OriginalSegmentsMerged == 0 && stats.CoalescedSegments == 0) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 3 {
		t.Fatalf("segment count = %d, want 3", n)
	}
}
// TestApplyMergesNegativeGapOverlap checks that two overlapping Alice
// segments (1-4 and 3-5) are merged even when the gap argument is 0, and that
// the merged segment spans 1-5.
func TestApplyMergesNegativeGapOverlap(t *testing.T) {
	merged := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 1, 4, "first"),
			segment("a.json", 1, "Alice", 3, 5, "second"),
		},
	}
	got, summary := Apply(merged, 0)
	if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 {
		t.Fatalf("summary = %#v", summary)
	}
	// Guard the index below so an unexpected result is reported as a test
	// failure rather than an index-out-of-range panic.
	if len(got.Segments) != 1 {
		t.Fatalf("segment count = %d, want 1", len(got.Segments))
	}
	coalesced := got.Segments[0]
	if coalesced.Start != 1 || coalesced.End != 5 {
		t.Fatalf("bounds = %f-%f, want 1-5", coalesced.Start, coalesced.End)
	}
}
// TestApplyHonorsCurrentOrder supplies two Alice segments whose slice order
// (10-11, then 1-2) is the reverse of their time order. It expects them to be
// merged with text joined in slice order and bounds covering 1-11.
func TestApplyHonorsCurrentOrder(t *testing.T) {
	merged := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 10, 11, "later"),
			segment("a.json", 1, "Alice", 1, 2, "earlier"),
		},
	}
	got, summary := Apply(merged, 3)
	if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 {
		t.Fatalf("summary = %#v", summary)
	}
	// Guard the index below so an unexpected result is reported as a test
	// failure rather than an index-out-of-range panic.
	if len(got.Segments) != 1 {
		t.Fatalf("segment count = %d, want 1", len(got.Segments))
	}
	coalesced := got.Segments[0]
	if coalesced.Text != "later earlier" {
		t.Fatalf("text = %q, want current-order merge", coalesced.Text)
	}
	if coalesced.Start != 1 || coalesced.End != 11 {
		t.Fatalf("bounds = %f-%f, want 1-11", coalesced.Start, coalesced.End)
	}
}
// TestApplyUsesEffectiveRunEndForReorderedSegments supplies three Alice
// segments in the order 10-20, 1-2, 22-23. The third starts within 3 of the
// run's latest end (20) but not of the immediately preceding segment's end
// (2); the test expects all three to be merged into one segment spanning 1-23.
func TestApplyUsesEffectiveRunEndForReorderedSegments(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 10, 20, "long"),
			segment("a.json", 1, "Alice", 1, 2, "early"),
			segment("a.json", 2, "Alice", 22, 23, "after long"),
		},
	}

	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 3 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}

	run := result.Segments[0]
	if run.Text != "long early after long" {
		t.Fatalf("text = %q", run.Text)
	}
	if !(run.Start == 1 && run.End == 23) {
		t.Fatalf("bounds = %f-%f, want 1-23", run.Start, run.End)
	}
}
// TestApplyDoesNotMergeBeyondEffectiveRunEndGap supplies three Alice segments
// in the order 10-20, 1-2, 23.1-24. The third starts more than 3 after the
// run's latest end (20), so the test expects the first two to merge and the
// third to remain a separate segment.
func TestApplyDoesNotMergeBeyondEffectiveRunEndGap(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segment("a.json", 0, "Alice", 10, 20, "long"),
			segment("a.json", 1, "Alice", 1, 2, "early"),
			segment("a.json", 2, "Alice", 23.1, 24, "too late"),
		},
	}

	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	head, tail := result.Segments[0], result.Segments[1]
	if !(head.Text == "long early" && tail.Text == "too late") {
		t.Fatalf("segments = %#v", result.Segments)
	}
}
// TestApplyDerivedProvenanceForMixedSourcesAndDerivedInputs merges an
// original Alice segment from a.json with an already-derived Alice segment
// from b.json (it has a SourceRef and DerivedFrom but no source segment
// index). It expects the result's Source to be "derived" and DerivedFrom to
// hold the first input's "source#index" reference followed by the second
// input's SourceRef.
func TestApplyDerivedProvenanceForMixedSourcesAndDerivedInputs(t *testing.T) {
	first := segment("a.json", 0, "Alice", 1, 2, "first")
	second := model.Segment{
		Source:      "b.json",
		SourceRef:   "word-run:1:1:1",
		DerivedFrom: []string{"b.json#0"},
		Speaker:     "Alice",
		Start:       2.5,
		End:         3,
		Text:        "second",
	}
	got, _ := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3)
	// Guard the index below so an unexpected result is reported as a test
	// failure rather than an index-out-of-range panic.
	if len(got.Segments) != 1 {
		t.Fatalf("segment count = %d, want 1", len(got.Segments))
	}
	// Named "coalesced" rather than "segment" so the helper is not shadowed.
	coalesced := got.Segments[0]
	if coalesced.Source != "derived" {
		t.Fatalf("source = %q, want derived", coalesced.Source)
	}
	if !reflect.DeepEqual(coalesced.DerivedFrom, []string{"a.json#0", "word-run:1:1:1"}) {
		t.Fatalf("derived_from = %v", coalesced.DerivedFrom)
	}
}
// TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun merges an Alice
// segment categorized as "backchannel" with a following uncategorized Alice
// segment and expects the single result to have nil Categories.
func TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun(t *testing.T) {
	tagged := segment("a.json", 0, "Alice", 1, 2, "yeah")
	tagged.Categories = []string{"backchannel"}
	plain := segment("a.json", 1, "Alice", 2.5, 3, "more")

	input := model.MergedTranscript{Segments: []model.Segment{tagged, plain}}
	result, _ := Apply(input, 3)
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}
	if cats := result.Segments[0].Categories; cats != nil {
		t.Fatalf("categories = %v, want nil", cats)
	}
}
// TestApplyDropsFillerCategoryFromMergedSameSpeakerRun merges an Alice
// segment categorized as "filler" with a following uncategorized Alice
// segment and expects the single result to have nil Categories.
func TestApplyDropsFillerCategoryFromMergedSameSpeakerRun(t *testing.T) {
	tagged := segment("a.json", 0, "Alice", 1, 2, "um")
	tagged.Categories = []string{"filler"}
	plain := segment("a.json", 1, "Alice", 2.5, 3, "more")

	input := model.MergedTranscript{Segments: []model.Segment{tagged, plain}}
	result, _ := Apply(input, 3)
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}
	if cats := result.Segments[0].Categories; cats != nil {
		t.Fatalf("categories = %v, want nil", cats)
	}
}
// TestApplyMergesSameSpeakerBackchannelIntoDerivedRun merges a Zach
// "backchannel" segment with a following Zach segment that is itself a
// previous coalesce result (SourceRef "coalesce:347"). It expects one output
// segment with joined text, a "coalesce:1" source ref, provenance made of the
// first input's "source#index" reference plus the second input's SourceRef,
// and nil Categories.
func TestApplyMergesSameSpeakerBackchannelIntoDerivedRun(t *testing.T) {
	backchannel := segment("zach.json", 110, "Zach", 7811.778, 7812.478, "That makes sense.")
	backchannel.Categories = []string{"backchannel"}
	derived := model.Segment{
		Source:      "zach.json",
		SourceRef:   "coalesce:347",
		DerivedFrom: []string{"zach.json#111", "zach.json#112"},
		Speaker:     "Zach",
		Start:       7812.498,
		End:         7824.045,
		Text:        "So, like, I'm above the silence field.",
	}

	input := model.MergedTranscript{Segments: []model.Segment{backchannel, derived}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}

	run := result.Segments[0]
	if run.Text != "That makes sense. So, like, I'm above the silence field." {
		t.Fatalf("text = %q", run.Text)
	}
	if run.SourceRef != "coalesce:1" {
		t.Fatalf("source_ref = %q, want coalesce:1", run.SourceRef)
	}
	wantDerived := []string{"zach.json#110", "coalesce:347"}
	if !reflect.DeepEqual(run.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", run.DerivedFrom)
	}
	if run.Categories != nil {
		t.Fatalf("categories = %v, want nil", run.Categories)
	}
}
// TestApplyMergesSameSpeakerFillerIntoDerivedRun merges a Zach "filler"
// segment with a following Zach segment that is itself a previous coalesce
// result (SourceRef "coalesce:12"). It expects one output segment with joined
// text, provenance made of the first input's "source#index" reference plus
// the second input's SourceRef, and nil Categories.
func TestApplyMergesSameSpeakerFillerIntoDerivedRun(t *testing.T) {
	filler := segment("zach.json", 110, "Zach", 1, 1.5, "um")
	filler.Categories = []string{"filler"}
	derived := model.Segment{
		Source:      "zach.json",
		SourceRef:   "coalesce:12",
		DerivedFrom: []string{"zach.json#111", "zach.json#112"},
		Speaker:     "Zach",
		Start:       1.6,
		End:         4,
		Text:        "next thought",
	}

	input := model.MergedTranscript{Segments: []model.Segment{filler, derived}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}

	run := result.Segments[0]
	if run.Text != "um next thought" {
		t.Fatalf("text = %q", run.Text)
	}
	wantDerived := []string{"zach.json#110", "coalesce:12"}
	if !reflect.DeepEqual(run.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", run.DerivedFrom)
	}
	if run.Categories != nil {
		t.Fatalf("categories = %v, want nil", run.Categories)
	}
}
// TestApplyUsesSkippedBackchannelToSeedNextSameSpeakerRun supplies a Mike
// segment, then a Zach "backchannel" segment, then another Zach segment. It
// expects Mike's segment to stay as is and the two Zach segments to merge
// into one with joined text, both inputs in DerivedFrom, and nil Categories.
func TestApplyUsesSkippedBackchannelToSeedNextSameSpeakerRun(t *testing.T) {
	speakerTurn := segment("mike.json", 367, "Mike", 7803.57, 7810.719, "It's very easy to notice.")
	ack := segment("zach.json", 110, "Zach", 7811.778, 7812.478, "That makes sense.")
	ack.Categories = []string{"backchannel"}
	followUp := segment("zach.json", 111, "Zach", 7812.498, 7820, "So, like, next thought.")

	input := model.MergedTranscript{Segments: []model.Segment{speakerTurn, ack, followUp}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	if leading := result.Segments[0]; leading.Text != "It's very easy to notice." {
		t.Fatalf("first text = %q", leading.Text)
	}

	zachRun := result.Segments[1]
	if !(zachRun.Speaker == "Zach" && zachRun.Text == "That makes sense. So, like, next thought.") {
		t.Fatalf("second segment = %#v", zachRun)
	}
	wantDerived := []string{"zach.json#110", "zach.json#111"}
	if !reflect.DeepEqual(zachRun.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", zachRun.DerivedFrom)
	}
	if zachRun.Categories != nil {
		t.Fatalf("categories = %v, want nil", zachRun.Categories)
	}
}
// TestApplyUsesSkippedFillerToSeedNextSameSpeakerRun supplies an Alice
// segment, then a Bob "filler" segment, then another Bob segment. It expects
// two output segments, the second being the merged Bob run with joined text
// and nil Categories.
func TestApplyUsesSkippedFillerToSeedNextSameSpeakerRun(t *testing.T) {
	opening := segment("alice.json", 0, "Alice", 1, 2, "first")
	hesitation := segment("bob.json", 0, "Bob", 2.1, 2.3, "um")
	hesitation.Categories = []string{"filler"}
	thought := segment("bob.json", 1, "Bob", 2.4, 4, "actual thought")

	input := model.MergedTranscript{Segments: []model.Segment{opening, hesitation, thought}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	bobRun := result.Segments[1]
	if bobRun.Text != "um actual thought" {
		t.Fatalf("second text = %q", bobRun.Text)
	}
	if bobRun.Categories != nil {
		t.Fatalf("categories = %v, want nil", bobRun.Categories)
	}
}
// TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker places a Bob
// "backchannel" segment between two Alice segments. It expects the Alice
// segments to merge into the first output segment and Bob's segment to follow
// it with its text and category intact.
func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) {
	before := segment("a.json", 0, "Alice", 1, 2, "first")
	interjection := segment("b.json", 0, "Bob", 2.2, 2.5, "yeah")
	interjection.Categories = []string{"backchannel"}
	after := segment("a.json", 1, "Alice", 3, 4, "second")

	input := model.MergedTranscript{Segments: []model.Segment{before, interjection, after}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	aliceRun, bobSeg := result.Segments[0], result.Segments[1]
	if aliceRun.Text != "first second" {
		t.Fatalf("first output text = %q, want first second", aliceRun.Text)
	}
	if !(bobSeg.Text == "yeah" && reflect.DeepEqual(bobSeg.Categories, []string{"backchannel"})) {
		t.Fatalf("second output segment = %#v", bobSeg)
	}
}
// TestApplySkipsDifferentSpeakerFillerAsMergeBlocker places a Bob "filler"
// segment between two Alice segments. It expects the Alice segments to merge
// into the first output segment and Bob's segment to follow it with its text
// and category intact.
func TestApplySkipsDifferentSpeakerFillerAsMergeBlocker(t *testing.T) {
	before := segment("a.json", 0, "Alice", 1, 2, "first")
	hesitation := segment("b.json", 0, "Bob", 2.2, 2.5, "um")
	hesitation.Categories = []string{"filler"}
	after := segment("a.json", 1, "Alice", 3, 4, "second")

	input := model.MergedTranscript{Segments: []model.Segment{before, hesitation, after}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 2 && stats.CoalescedSegments == 1) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	aliceRun, bobSeg := result.Segments[0], result.Segments[1]
	if aliceRun.Text != "first second" {
		t.Fatalf("first output text = %q, want first second", aliceRun.Text)
	}
	if !(bobSeg.Text == "um" && reflect.DeepEqual(bobSeg.Categories, []string{"filler"})) {
		t.Fatalf("second output segment = %#v", bobSeg)
	}
}
// TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge places an
// uncategorized Bob segment between two Alice segments and expects no
// merging: the summary counts stay at zero and all three segments come back.
func TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge(t *testing.T) {
	before := segment("a.json", 0, "Alice", 1, 2, "first")
	interruption := segment("b.json", 0, "Bob", 2.2, 2.5, "interruption")
	after := segment("a.json", 1, "Alice", 3, 4, "second")

	input := model.MergedTranscript{Segments: []model.Segment{before, interruption, after}}
	result, stats := Apply(input, 3)
	if !(stats.OriginalSegmentsMerged == 0 && stats.CoalescedSegments == 0) {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 3 {
		t.Fatalf("segment count = %d, want 3", n)
	}
}
// segment builds a model.Segment test fixture from the given source name,
// source segment index, speaker, time bounds, and text. The segment gets a
// single timed word that repeats the segment's text and bounds; all other
// fields are left at their zero values.
func segment(source string, sourceIndex int, speaker string, start float64, end float64, text string) model.Segment {
	word := model.Word{Text: text, Start: start, End: end, Timed: true}

	var seg model.Segment
	seg.Source = source
	seg.SourceSegmentIndex = intPtr(sourceIndex)
	seg.Speaker = speaker
	seg.Start = start
	seg.End = end
	seg.Text = text
	seg.Words = []model.Word{word}
	return seg
}
// intPtr returns a pointer to a newly allocated int holding value.
func intPtr(value int) *int {
	p := new(int)
	*p = value
	return p
}