// Files: seriatim/internal/overlap/resolve_test.go
// 594 lines, 18 KiB, Go

package overlap
import (
"reflect"
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// TestResolveNoOverlapGroupsIsNoOp feeds Resolve a transcript that has no
// overlap groups and expects the transcript back unchanged, with both the
// GroupsProcessed and GroupsChanged counters left at zero.
func TestResolveNoOverlapGroupsIsNoOp(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 2, word("hello", 1.1, 1.2)),
		},
	}

	result, stats, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if unchanged := reflect.DeepEqual(result, input); !unchanged {
		t.Fatalf("expected no-op result:\ngot %#v\nwant %#v", result, input)
	}

	// Neither counter may move when there is nothing to resolve.
	if !(stats.GroupsProcessed == 0 && stats.GroupsChanged == 0) {
		t.Fatalf("unexpected summary: %#v", stats)
	}
}
// TestResolveCreatesChronologicalWordRunSegments resolves a single overlap
// group shared by two speakers and verifies:
//   - the summary counters (1 group processed and changed, 2 originals
//     removed, 3 replacements created),
//   - that the resolved group no longer appears in the output,
//   - the replacement texts, in output order,
//   - that every output segment has a zero ID, a nil source segment index, a
//     zero overlap group ID and a non-empty source ref.
func TestResolveCreatesChronologicalWordRunSegments(t *testing.T) {
	merged := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 5, word("A1", 1.1, 1.2), word("A2", 1.8, 2.0)),
			segmentWithWords("b.json", 0, "Bob", 1.5, 4, word("B1", 1.55, 1.7), word("B2", 2.6, 2.8)),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}
	merged.Segments[0].OverlapGroupID = 1
	merged.Segments[1].OverlapGroupID = 1

	got, summary, err := Resolve(merged, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}
	if summary.GroupsProcessed != 1 || summary.GroupsChanged != 1 || summary.OriginalsRemoved != 2 || summary.ReplacementsCreated != 3 {
		t.Fatalf("unexpected summary: %#v", summary)
	}
	if len(got.OverlapGroups) != 0 {
		t.Fatalf("expected resolved group to be removed, got %#v", got.OverlapGroups)
	}

	wantTexts := []string{"A1 A2", "B1", "B2"}
	// Check the count before comparing texts so that a short result fails with
	// a readable message instead of an index-out-of-range panic.
	if len(got.Segments) != len(wantTexts) {
		t.Fatalf("segment count = %d, want %d", len(got.Segments), len(wantTexts))
	}
	// Named "texts" rather than "gotTexts" so the package-level gotTexts helper
	// is not shadowed inside this test.
	texts := make([]string, 0, len(got.Segments))
	for _, seg := range got.Segments {
		texts = append(texts, seg.Text)
	}
	if !reflect.DeepEqual(texts, wantTexts) {
		t.Fatalf("texts = %v, want %v", texts, wantTexts)
	}

	for _, seg := range got.Segments {
		if seg.ID != 0 {
			t.Fatalf("replacement segment has ID %d, want 0", seg.ID)
		}
		if seg.SourceSegmentIndex != nil {
			t.Fatalf("replacement segment source index = %d, want nil", *seg.SourceSegmentIndex)
		}
		if seg.OverlapGroupID != 0 {
			t.Fatalf("replacement segment overlap group ID = %d, want 0", seg.OverlapGroupID)
		}
		if seg.SourceRef == "" {
			t.Fatal("replacement segment missing source_ref")
		}
	}
}
// TestResolveIncludesWordsByIntervalIntersection uses one segment whose words
// sit around the edges of a 10-20 overlap group. The expected replacement text
// keeps the words that straddle an edge or lie inside the group, and drops
// "before" (ends exactly at 10) and "after" (starts exactly at 20).
func TestResolveIncludesWordsByIntervalIntersection(t *testing.T) {
	aliceWords := []model.Word{
		word("before", 9.5, 10),
		word("left-edge", 9.9, 10.1),
		word("inside", 11, 11.2),
		word("right-edge", 19.9, 20.1),
		word("after", 20, 20.2),
	}
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 9, 21, aliceWords...),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 20, []string{"a.json#0"}, []string{"Alice"}),
		},
	}
	input.Segments[0].OverlapGroupID = 1

	out, _, err := Resolve(input, 10, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if count := len(out.Segments); count != 1 {
		t.Fatalf("segment count = %d, want 1", count)
	}
	if text := out.Segments[0].Text; text != "left-edge inside right-edge" {
		t.Fatalf("text = %q", text)
	}
}
// TestResolveIncludesContextWordsAroundOverlapWindow places Alice segments
// just before and just after a 10-12 overlap group and expects their words to
// be folded into Alice's replacement: all four originals removed, two
// replacements created, Alice's bounds stretched to 8.5-13.2, and DerivedFrom
// listing all three of her source segments.
// NOTE(review): the final Resolve argument (3) is presumably the size of the
// context window; confirm against Resolve's signature.
func TestResolveIncludesContextWordsAroundOverlapWindow(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 7.5, 9.5, word("before", 8.5, 8.7)),
			segmentWithWords("a.json", 1, "Alice", 10, 12, word("inside", 10.5, 10.7)),
			segmentWithWords("a.json", 2, "Alice", 12.5, 13.5, word("after", 13, 13.2)),
			segmentWithWords("b.json", 0, "Bob", 10.2, 11.2, word("bob", 10.4, 10.6)),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 12, []string{"a.json#1", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}
	// Only the two segments listed in the group carry its ID.
	for _, i := range []int{1, 3} {
		input.Segments[i].OverlapGroupID = 1
	}

	out, stats, err := Resolve(input, 10, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if !(stats.GroupsChanged == 1 && stats.OriginalsRemoved == 4 && stats.ReplacementsCreated == 2) {
		t.Fatalf("unexpected summary: %#v", stats)
	}
	if texts := gotTexts(out.Segments); texts != "before inside after,bob" {
		t.Fatalf("segment texts = %s", texts)
	}

	first := out.Segments[0]
	if first.Start != 8.5 || first.End != 13.2 {
		t.Fatalf("context bounds = %f-%f, want 8.5-13.2", first.Start, first.End)
	}
	wantDerived := []string{"a.json#0", "a.json#1", "a.json#2"}
	if !reflect.DeepEqual(first.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", first.DerivedFrom)
	}
}
// TestResolveDoesNotIncludeContextOutsideWindow puts an Alice segment well
// before the 10-12 overlap group and expects it to come through untouched:
// it stays first in the output with its original text and still carries
// source segment index 0.
func TestResolveDoesNotIncludeContextOutsideWindow(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 5, 6.9, word("outside", 6, 6.2)),
		segmentWithWords("a.json", 1, "Alice", 10, 12, word("inside", 10.5, 10.7)),
		segmentWithWords("b.json", 0, "Bob", 10.2, 11.2, word("bob", 10.4, 10.6)),
	}
	input := model.MergedTranscript{
		Segments: segments,
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 12, []string{"a.json#1", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}

	out, _, err := Resolve(input, 10, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if texts := gotTexts(out.Segments); texts != "Alice,bob,inside" {
		t.Fatalf("segment texts = %s", texts)
	}

	// The untouched original must still point at its source segment.
	sourceIndex := out.Segments[0].SourceSegmentIndex
	if sourceIndex == nil || *sourceIndex != 0 {
		t.Fatalf("outside context segment was not preserved: %#v", out.Segments[0])
	}
}
// TestResolveDoesNotIncludeNearbyNonGroupSpeakerContext adds a Carol segment
// shortly after an Alice/Bob overlap group. Carol is not one of the group's
// speakers, so her segment is expected to stay as an original (text "Carol",
// source segment index 0) at the end of the output.
func TestResolveDoesNotIncludeNearbyNonGroupSpeakerContext(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 10, 12, word("alice", 10.5, 10.7)),
		segmentWithWords("b.json", 0, "Bob", 10.2, 11.2, word("bob", 10.4, 10.6)),
		segmentWithWords("c.json", 0, "Carol", 12.5, 13.5, word("carol", 13, 13.2)),
	}
	input := model.MergedTranscript{
		Segments: segments,
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 12, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}

	out, _, err := Resolve(input, 10, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if texts := gotTexts(out.Segments); texts != "bob,alice,Carol" {
		t.Fatalf("segment texts = %s", texts)
	}

	carol := out.Segments[2]
	if carol.SourceSegmentIndex == nil || *carol.SourceSegmentIndex != 0 {
		t.Fatalf("non-group speaker context segment was not preserved: %#v", out.Segments[2])
	}
}
// TestResolveRemovesIncludedContextSegmentsForReplacedSpeaker covers a group
// where Alice has words and Bob has none. Alice's nearby context segment is
// expected to be merged into her single replacement ("before inside"), while
// Bob's original is kept with its overlap group ID cleared.
func TestResolveRemovesIncludedContextSegmentsForReplacedSpeaker(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 8, 9, word("before", 8.5, 8.7)),
			segmentWithWords("a.json", 1, "Alice", 10, 12, word("inside", 10.5, 10.7)),
			segmentWithWords("b.json", 0, "Bob", 10.2, 11.2),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 12, []string{"a.json#1", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}
	for _, i := range []int{1, 2} {
		input.Segments[i].OverlapGroupID = 1
	}

	out, stats, err := Resolve(input, 10, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if !(stats.OriginalsRemoved == 2 && stats.ReplacementsCreated == 1) {
		t.Fatalf("unexpected summary: %#v", stats)
	}
	if texts := gotTexts(out.Segments); texts != "before inside,Bob" {
		t.Fatalf("segment texts = %s", texts)
	}

	// Bob's surviving original must no longer reference the resolved group.
	if groupID := out.Segments[1].OverlapGroupID; groupID != 0 {
		t.Fatalf("kept original group annotation = %d, want 0", groupID)
	}
}
// TestResolveSkipsContextSegmentReferencedByAnotherOverlapGroup calls
// resolveGroup directly for group 1 while a nearby Alice segment (a.json#0)
// belongs to group 2. That segment must not be pulled in as context: only
// group 1's own refs are expected in removeRefs, and the replacements are
// expected to be "bob" and "inside".
func TestResolveSkipsContextSegmentReferencedByAnotherOverlapGroup(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 8, 9, word("other-group", 8.5, 8.7)),
			segmentWithWords("a.json", 1, "Alice", 10, 12, word("inside", 10.5, 10.7)),
			segmentWithWords("b.json", 0, "Bob", 10.2, 11.2, word("bob", 10.4, 10.6)),
			segmentWithWords("c.json", 0, "Carol", 8.5, 9.5),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 10, 12, []string{"a.json#1", "b.json#0"}, []string{"Alice", "Bob"}),
			group(2, 8, 9.5, []string{"a.json#0", "c.json#0"}, []string{"Alice", "Carol"}),
		},
	}
	// Group membership per segment position: 2, 1, 1, 2.
	for i, groupID := range []int{2, 1, 1, 2} {
		input.Segments[i].OverlapGroupID = groupID
	}

	// Lookup from segment ref to its position in the transcript.
	refToIndex := make(map[string]int, len(input.Segments))
	for i := range input.Segments {
		refToIndex[SegmentRef(input.Segments[i])] = i
	}

	// Every ref that is claimed by some overlap group.
	overlapRefs := make(map[string]struct{}, 4)
	for _, ref := range []string{"a.json#0", "a.json#1", "b.json#0", "c.json#0"} {
		overlapRefs[ref] = struct{}{}
	}

	resolved, err := resolveGroup(input, input.OverlapGroups[0], refToIndex, overlapRefs, 10, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	wantRemoved := []string{"a.json#1", "b.json#0"}
	if !reflect.DeepEqual(resolved.removeRefs, wantRemoved) {
		t.Fatalf("remove refs = %v", resolved.removeRefs)
	}
	if texts := gotTexts(resolved.replacements); texts != "bob,inside" {
		t.Fatalf("replacement texts = %s", texts)
	}
}
// TestResolveWordRunGapThreshold resolves one segment with three words and
// expects two replacements: "one two" (gap of 0.75 between the words) and
// "three" (gap of 0.8 after "two").
// NOTE(review): 0.75 is presumably the word-run gap threshold argument of
// Resolve; confirm against its signature.
func TestResolveWordRunGapThreshold(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 4, word("one", 1, 1.1), word("two", 1.85, 2), word("three", 2.8, 3)),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 4, []string{"a.json#0"}, []string{"Alice"}),
		},
	}
	input.Segments[0].OverlapGroupID = 1

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if count := len(out.Segments); count != 2 {
		t.Fatalf("segment count = %d, want 2", count)
	}
	if !(out.Segments[0].Text == "one two" && out.Segments[1].Text == "three") {
		t.Fatalf("unexpected replacement texts: %#v", out.Segments)
	}
}
// TestResolvePartialResolutionKeepsNoWordSpeakerOriginals resolves a group in
// which only Alice has words. Alice's segment is expected to be replaced by
// "hello"; Bob's word-less original is expected to survive with its source
// segment index intact and its overlap group ID cleared, and the group itself
// is expected to be removed.
func TestResolvePartialResolutionKeepsNoWordSpeakerOriginals(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 5, word("hello", 1.2, 1.4)),
			segmentWithWords("b.json", 0, "Bob", 2, 4),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}
	for i := range input.Segments {
		input.Segments[i].OverlapGroupID = 1
	}

	out, stats, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if !(stats.OriginalsRemoved == 1 && stats.ReplacementsCreated == 1) {
		t.Fatalf("unexpected summary: %#v", stats)
	}
	if len(out.OverlapGroups) != 0 {
		t.Fatalf("expected changed group to be removed, got %#v", out.OverlapGroups)
	}
	if count := len(out.Segments); count != 2 {
		t.Fatalf("segment count = %d, want 2", count)
	}
	if !(out.Segments[0].Text == "hello" && out.Segments[1].Text == "Bob") {
		t.Fatalf("unexpected segment texts: %#v", out.Segments)
	}

	// The second output segment is Bob's kept original.
	kept := out.Segments[1]
	if kept.SourceSegmentIndex == nil {
		t.Fatal("kept original should retain source_segment_index")
	}
	if kept.OverlapGroupID != 0 {
		t.Fatalf("kept original overlap group ID = %d, want 0", kept.OverlapGroupID)
	}
}
// TestResolveGroupWithNoUsableWordsRemainsUnchanged resolves a group whose
// segments carry no words at all and expects the transcript to come back
// deep-equal to the input, with no group changed, no original removed and no
// replacement created.
func TestResolveGroupWithNoUsableWordsRemainsUnchanged(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 5),
			segmentWithWords("b.json", 0, "Bob", 2, 4),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}
	for i := range input.Segments {
		input.Segments[i].OverlapGroupID = 1
	}

	out, stats, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if !(stats.GroupsChanged == 0 && stats.OriginalsRemoved == 0 && stats.ReplacementsCreated == 0) {
		t.Fatalf("unexpected summary: %#v", stats)
	}
	if same := reflect.DeepEqual(out, input); !same {
		t.Fatalf("expected unchanged transcript:\ngot %#v\nwant %#v", out, input)
	}
}
// TestResolveReplacementProvenanceIsDeterministic lists two Alice segments in
// reverse source order, both in the transcript and in the group's refs, and
// expects a single replacement whose SourceRef is "word-run:1:1:1" and whose
// DerivedFrom is sorted as a.json#0, a.json#1.
func TestResolveReplacementProvenanceIsDeterministic(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 1, "Alice", 1, 3, word("second", 1.5, 1.6)),
			segmentWithWords("a.json", 0, "Alice", 1, 3, word("first", 1.1, 1.2)),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#1", "a.json#0"}, []string{"Alice"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}
	if count := len(out.Segments); count != 1 {
		t.Fatalf("segment count = %d, want 1", count)
	}

	replacement := out.Segments[0]
	if replacement.SourceRef != "word-run:1:1:1" {
		t.Fatalf("source_ref = %q", replacement.SourceRef)
	}
	wantDerived := []string{"a.json#0", "a.json#1"}
	if !reflect.DeepEqual(replacement.DerivedFrom, wantDerived) {
		t.Fatalf("derived_from = %v", replacement.DerivedFrom)
	}
}
// TestResolveIncludesUntimedWordsInTextWithoutChangingBounds mixes untimed
// words before, between and after two timed words. The replacement text is
// expected to contain all five words in order, while Start/End are expected
// to come from the timed words only (1.1-1.5).
func TestResolveIncludesUntimedWordsInTextWithoutChangingBounds(t *testing.T) {
	aliceWords := []model.Word{
		untimedWord("pre"),
		word("one", 1.1, 1.2),
		untimedWord("middle"),
		word("two", 1.4, 1.5),
		untimedWord("post"),
	}
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 3, aliceWords...),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#0"}, []string{"Alice"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}
	if count := len(out.Segments); count != 1 {
		t.Fatalf("segment count = %d, want 1", count)
	}

	replacement := out.Segments[0]
	if replacement.Text != "pre one middle two post" {
		t.Fatalf("text = %q", replacement.Text)
	}
	if !(replacement.Start == 1.1 && replacement.End == 1.5) {
		t.Fatalf("bounds = %f-%f, want 1.1-1.5", replacement.Start, replacement.End)
	}
}
// TestResolveUntimedWordsDoNotBridgeWordRunGap puts an untimed word between
// two timed words that are 0.9 apart. The run is still expected to split in
// two ("one middle" and "two"), and the untimed word must not move the first
// run's End (1.1) or the second run's Start (2).
func TestResolveUntimedWordsDoNotBridgeWordRunGap(t *testing.T) {
	aliceWords := []model.Word{
		word("one", 1, 1.1),
		untimedWord("middle"),
		word("two", 2, 2.1),
	}
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 4, aliceWords...),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 4, []string{"a.json#0"}, []string{"Alice"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}
	if count := len(out.Segments); count != 2 {
		t.Fatalf("segment count = %d, want 2", count)
	}

	if !(out.Segments[0].Text == "one middle" && out.Segments[1].Text == "two") {
		t.Fatalf("unexpected texts: %#v", out.Segments)
	}
	if !(out.Segments[0].End == 1.1 && out.Segments[1].Start == 2) {
		t.Fatalf("untimed word changed bounds: %#v", out.Segments)
	}
}
// TestResolveSpeakerWithOnlyUntimedWordsIsNotReplaced resolves a group whose
// only segment has a single untimed word. No group is expected to change and
// the transcript is expected to come back deep-equal to the input.
func TestResolveSpeakerWithOnlyUntimedWordsIsNotReplaced(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 3, untimedWord("hello")),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#0"}, []string{"Alice"}),
		},
	}
	input.Segments[0].OverlapGroupID = 1

	out, stats, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if changed := stats.GroupsChanged; changed != 0 {
		t.Fatalf("unexpected summary: %#v", stats)
	}
	if same := reflect.DeepEqual(out, input); !same {
		t.Fatalf("expected unchanged transcript:\ngot %#v\nwant %#v", out, input)
	}
}
// TestResolveReordersNearStartWordRunsByDuration has two runs that start 0.2
// apart: a long one (1-2) and a short one (1.2-1.3). The short run is expected
// to be emitted first with its bounds untouched, and the long run second with
// SourceRef "word-run:1:1:1" and its text intact.
// NOTE(review): 0.4 is presumably the near-start window argument of Resolve;
// confirm against its signature.
func TestResolveReordersNearStartWordRunsByDuration(t *testing.T) {
	input := model.MergedTranscript{
		Segments: []model.Segment{
			segmentWithWords("a.json", 0, "Alice", 1, 3, word("long", 1, 2)),
			segmentWithWords("b.json", 0, "Bob", 1, 3, word("short", 1.2, 1.3)),
		},
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if order := gotTexts(out.Segments); order != "short,long" {
		t.Fatalf("segment order = %s, want short,long", order)
	}

	shortRun, longRun := out.Segments[0], out.Segments[1]
	if !(shortRun.Start == 1.2 && shortRun.End == 1.3) {
		t.Fatalf("short segment bounds changed: %#v", shortRun)
	}
	if !(longRun.SourceRef == "word-run:1:1:1" && longRun.Text == "long") {
		t.Fatalf("long segment provenance/text changed: %#v", longRun)
	}
}
// TestResolveDoesNotReorderWordRunsOutsideWindow has two runs whose starts are
// 0.5 apart (1 and 1.5). They are expected to keep their start-time order,
// long before short.
func TestResolveDoesNotReorderWordRunsOutsideWindow(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 1, 3, word("long", 1, 2)),
		segmentWithWords("b.json", 0, "Bob", 1, 3, word("short", 1.5, 1.6)),
	}
	input := model.MergedTranscript{
		Segments: segments,
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if order := gotTexts(out.Segments); order != "long,short" {
		t.Fatalf("segment order = %s, want long,short", order)
	}
}
// TestResolveReordersTransitiveNearStartClustersByDuration chains three runs
// whose starts are 1, 1.3 and 1.65: each neighbour pair is within 0.4 of each
// other, but the first and last are not. All three are expected to be ordered
// by duration: short, medium, long.
func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 1, 3, word("long", 1, 2)),
		segmentWithWords("b.json", 0, "Bob", 1, 3, word("medium", 1.3, 1.8)),
		segmentWithWords("c.json", 0, "Carol", 1, 3, word("short", 1.65, 1.75)),
	}
	input := model.MergedTranscript{
		Segments: segments,
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"a.json#0", "b.json#0", "c.json#0"}, []string{"Alice", "Bob", "Carol"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if order := gotTexts(out.Segments); order != "short,medium,long" {
		t.Fatalf("segment order = %s, want short,medium,long", order)
	}
}
// TestResolveReorderFallsBackToDeterministicOrderForEqualDurations has two
// runs of equal length (0.5) that start 0.2 apart. With no duration difference
// to sort on, the expected order is bob (starts at 1) then alice (starts at
// 1.2).
func TestResolveReorderFallsBackToDeterministicOrderForEqualDurations(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("b.json", 0, "Bob", 1, 3, word("bob", 1, 1.5)),
		segmentWithWords("a.json", 0, "Alice", 1, 3, word("alice", 1.2, 1.7)),
	}
	input := model.MergedTranscript{
		Segments: segments,
		OverlapGroups: []model.OverlapGroup{
			group(1, 1, 3, []string{"b.json#0", "a.json#0"}, []string{"Bob", "Alice"}),
		},
	}

	out, _, err := Resolve(input, 0.75, 0.4, 0)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	if order := gotTexts(out.Segments); order != "bob,alice" {
		t.Fatalf("segment order = %s, want bob,alice", order)
	}
}
// segmentWithWords builds a segment through the segment helper and attaches
// the given words to it.
func segmentWithWords(source string, sourceIndex int, speaker string, start float64, end float64, words ...model.Word) model.Segment {
	built := segment(source, sourceIndex, speaker, start, end)
	built.Words = words
	return built
}
// word returns a model.Word with the given text and start/end times, marked
// as timed.
func word(text string, start float64, end float64) model.Word {
	var w model.Word
	w.Text = text
	w.Start = start
	w.End = end
	w.Timed = true
	return w
}
// gotTexts returns the Text of every segment, in order, joined by commas.
// An empty slice yields the empty string.
func gotTexts(segments []model.Segment) string {
	var joined strings.Builder
	for i := range segments {
		if i > 0 {
			joined.WriteString(",")
		}
		joined.WriteString(segments[i].Text)
	}
	return joined.String()
}
// untimedWord returns a model.Word that carries only text; every other field
// keeps its zero value.
func untimedWord(text string) model.Word {
	var w model.Word
	w.Text = text
	return w
}
// group returns a model.OverlapGroup with the given ID, time range, segment
// refs and speakers, using the package's defaultClass and defaultResolution.
func group(id int, start float64, end float64, refs []string, speakers []string) model.OverlapGroup {
	var g model.OverlapGroup
	g.ID = id
	g.Start = start
	g.End = end
	g.Segments = refs
	g.Speakers = speakers
	g.Class = defaultClass
	g.Resolution = defaultResolution
	return g
}