Recompute overlap groups during trim

This commit is contained in:
2026-05-08 14:47:52 +00:00
parent 52f7729100
commit 1c0e4438ae
2 changed files with 524 additions and 12 deletions

View File

@@ -210,6 +210,212 @@ func TestApplyPreservesRetainedSegmentFieldsAndClearsOverlapIDs(t *testing.T) {
}
}
// TestApplyFullSchemaRemovesStaleOverlapGroups keeps segments 1 and 3 of the
// overlap fixture, whose time ranges (1-4 and 10-11) do not intersect, and
// expects the trimmed transcript to carry no overlap groups and a zero
// overlap group ID on every retained segment.
func TestApplyFullSchemaRemovesStaleOverlapGroups(t *testing.T) {
	fixture := overlapTranscriptFixture()
	keep := mustParseSelector(t, "1,3")

	got, err := Apply(fixture, Options{Mode: ModeKeep, Selector: keep})
	if err != nil {
		t.Fatalf("apply failed: %v", err)
	}

	transcript := got.Transcript
	if n := len(transcript.OverlapGroups); n != 0 {
		t.Fatalf("overlap_groups count = %d, want 0", n)
	}
	for i := range transcript.Segments {
		// The fixture tags these segments with IDs 99 and 100; neither may survive.
		if id := transcript.Segments[i].OverlapGroupID; id != 0 {
			t.Fatalf("segment %d overlap_group_id = %d, want 0", i, id)
		}
	}
}
// TestApplyFullSchemaRecomputesOverlapGroup keeps segments 1 and 2 of the
// overlap fixture and expects exactly one rebuilt overlap group: ID 1,
// spanning 1-4, listing both source refs and both speakers. Both retained
// segments must reference that group instead of the fixture's ID 99.
func TestApplyFullSchemaRecomputesOverlapGroup(t *testing.T) {
	fixture := overlapTranscriptFixture()
	keep := mustParseSelector(t, "1,2")

	got, err := Apply(fixture, Options{Mode: ModeKeep, Selector: keep})
	if err != nil {
		t.Fatalf("apply failed: %v", err)
	}

	segments := got.Transcript.Segments
	assertSegmentIDs(t, segments, []int{1, 2})
	groupIDs := []int{segments[0].OverlapGroupID, segments[1].OverlapGroupID}
	assertIntSlice(t, groupIDs, []int{1, 1})

	groups := got.Transcript.OverlapGroups
	if len(groups) != 1 {
		t.Fatalf("overlap_groups count = %d, want 1", len(groups))
	}

	group := groups[0]
	if group.ID != 1 {
		t.Fatalf("group ID = %d, want 1", group.ID)
	}
	if !(group.Start == 1 && group.End == 4) {
		t.Fatalf("group times = %.3f-%.3f, want 1.000-4.000", group.Start, group.End)
	}

	wantRefs := []string{"a.json#10", "b.json#20"}
	if !equalStringSlices(group.Segments, wantRefs) {
		t.Fatalf("group segments = %v, want %v", group.Segments, wantRefs)
	}

	wantSpeakers := []string{"Alice", "Bob"}
	if !equalStringSlices(group.Speakers, wantSpeakers) {
		t.Fatalf("group speakers = %v, want %v", group.Speakers, wantSpeakers)
	}
}
// TestApplyFullSchemaDropsGroupWhenFewerThanTwoSpeakersRemain keeps only
// segment 1 of the overlap fixture and expects a single retained segment
// with no overlap group ID and no overlap groups in the output.
func TestApplyFullSchemaDropsGroupWhenFewerThanTwoSpeakersRemain(t *testing.T) {
	fixture := overlapTranscriptFixture()
	keep := mustParseSelector(t, "1")

	got, err := Apply(fixture, Options{Mode: ModeKeep, Selector: keep})
	if err != nil {
		t.Fatalf("apply failed: %v", err)
	}

	groups := got.Transcript.OverlapGroups
	segments := got.Transcript.Segments

	// Cases are evaluated top to bottom, so the segment is only indexed once
	// the count has been confirmed.
	switch {
	case len(groups) != 0:
		t.Fatalf("overlap_groups count = %d, want 0", len(groups))
	case len(segments) != 1:
		t.Fatalf("segment count = %d, want 1", len(segments))
	case segments[0].OverlapGroupID != 0:
		t.Fatalf("segment overlap_group_id = %d, want 0", segments[0].OverlapGroupID)
	}
}
// TestApplyFullSchemaHandlesTransitiveOverlaps keeps all three segments of
// the transitive fixture and expects them to share one overlap group
// (group ID 1 on every segment) spanning 10-15.
//
// In the fixture Bob (12-13) and Carol (13.5-15) do not intersect each
// other; each intersects Alice (10-14), so a single group is only produced
// when overlaps are chained through a shared segment.
func TestApplyFullSchemaHandlesTransitiveOverlaps(t *testing.T) {
	input := transitiveOverlapFixture()
	selector := mustParseSelector(t, "1-3")
	result, err := Apply(input, Options{
		Mode:     ModeKeep,
		Selector: selector,
	})
	if err != nil {
		t.Fatalf("apply failed: %v", err)
	}
	if len(result.Transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap_groups count = %d, want 1", len(result.Transcript.OverlapGroups))
	}

	// Verify the segment count before indexing so that a regression which
	// drops a segment fails with a message rather than an index-out-of-range
	// panic.
	segments := result.Transcript.Segments
	if len(segments) != 3 {
		t.Fatalf("segment count = %d, want 3", len(segments))
	}
	assertIntSlice(t, []int{
		segments[0].OverlapGroupID,
		segments[1].OverlapGroupID,
		segments[2].OverlapGroupID,
	}, []int{1, 1, 1})

	group := result.Transcript.OverlapGroups[0]
	if group.Start != 10 || group.End != 15 {
		t.Fatalf("group times = %.3f-%.3f, want 10.000-15.000", group.Start, group.End)
	}
}
// TestApplyFullSchemaBoundaryTouchingNotGrouped keeps both segments of the
// boundary fixture, where one segment ends at 2 and the next starts at 2.
// It expects no overlap groups and a zero overlap group ID on both
// segments, i.e. touching endpoints do not count as an overlap.
func TestApplyFullSchemaBoundaryTouchingNotGrouped(t *testing.T) {
	input := boundaryFixture()
	selector := mustParseSelector(t, "1-2")
	result, err := Apply(input, Options{
		Mode:     ModeKeep,
		Selector: selector,
	})
	if err != nil {
		t.Fatalf("apply failed: %v", err)
	}
	if len(result.Transcript.OverlapGroups) != 0 {
		t.Fatalf("overlap_groups count = %d, want 0", len(result.Transcript.OverlapGroups))
	}

	// Verify the segment count before indexing so that a regression which
	// drops a segment fails with a message rather than an index-out-of-range
	// panic.
	segments := result.Transcript.Segments
	if len(segments) != 2 {
		t.Fatalf("segment count = %d, want 2", len(segments))
	}
	assertIntSlice(t, []int{
		segments[0].OverlapGroupID,
		segments[1].OverlapGroupID,
	}, []int{0, 0})
}
// TestApplyIntermediateDoesNotIncludeOverlapGroups trims a two-segment
// intermediate transcript down to segment 1 and expects one segment with
// ID 1 that still passes schema.ValidateIntermediateTranscript.
//
// NOTE(review): nothing here inspects overlap groups directly; presumably
// the validator covers that — confirm.
func TestApplyIntermediateDoesNotIncludeOverlapGroups(t *testing.T) {
	segments := []schema.IntermediateSegment{
		{ID: 1, Start: 1, End: 3, Speaker: "Alice", Text: "alpha", Categories: []string{"word-run"}},
		{ID: 2, Start: 2, End: 4, Speaker: "Bob", Text: "beta", Categories: []string{"filler"}},
	}
	fixture := schema.IntermediateTranscript{
		Metadata: schema.IntermediateMetadata{
			Application:  "seriatim",
			Version:      "v-test",
			OutputSchema: "seriatim-intermediate",
		},
		Segments: segments,
	}
	keep := mustParseSelector(t, "1")

	got, err := ApplyIntermediate(fixture, Options{Mode: ModeKeep, Selector: keep})
	if err != nil {
		t.Fatalf("apply intermediate failed: %v", err)
	}

	kept := got.Transcript.Segments
	if len(kept) != 1 {
		t.Fatalf("segment count = %d, want 1", len(kept))
	}
	if id := kept[0].ID; id != 1 {
		t.Fatalf("segment id = %d, want 1", id)
	}
	if verr := schema.ValidateIntermediateTranscript(got.Transcript); verr != nil {
		t.Fatalf("intermediate output should remain valid: %v", verr)
	}
}
// TestApplyMinimalDoesNotIncludeOverlapGroups trims a two-segment minimal
// transcript down to segment 2 and expects one segment that passes
// schema.ValidateMinimalTranscript. The retained segment is expected to
// carry ID 1 even though input segment 2 was selected.
//
// NOTE(review): the ID expectation suggests output IDs are renumbered from
// 1, and nothing here inspects overlap groups directly — confirm both
// against ApplyMinimal.
func TestApplyMinimalDoesNotIncludeOverlapGroups(t *testing.T) {
	segments := []schema.MinimalSegment{
		{ID: 1, Start: 1, End: 3, Speaker: "Alice", Text: "alpha"},
		{ID: 2, Start: 2, End: 4, Speaker: "Bob", Text: "beta"},
	}
	fixture := schema.MinimalTranscript{
		Metadata: schema.MinimalMetadata{
			Application:  "seriatim",
			Version:      "v-test",
			OutputSchema: "seriatim-minimal",
		},
		Segments: segments,
	}
	keep := mustParseSelector(t, "2")

	got, err := ApplyMinimal(fixture, Options{Mode: ModeKeep, Selector: keep})
	if err != nil {
		t.Fatalf("apply minimal failed: %v", err)
	}

	kept := got.Transcript.Segments
	if len(kept) != 1 {
		t.Fatalf("segment count = %d, want 1", len(kept))
	}
	if id := kept[0].ID; id != 1 {
		t.Fatalf("segment id = %d, want 1", id)
	}
	if verr := schema.ValidateMinimalTranscript(got.Transcript); verr != nil {
		t.Fatalf("minimal output should remain valid: %v", verr)
	}
}
func TestApplyOutputInvariantsValidAfterRenumberAndOverlapRecompute(t *testing.T) {
input := overlapTranscriptFixture()
selector := mustParseSelector(t, "2,1")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if err := schema.ValidateTranscript(result.Transcript); err != nil {
t.Fatalf("trim output should remain valid: %v", err)
}
}
func mustParseSelector(t *testing.T, value string) Selector {
t.Helper()
selector, err := ParseSelector(value)
@@ -303,6 +509,104 @@ func fullTranscriptFixture() schema.Transcript {
}
}
// overlapTranscriptFixture returns a three-segment full-schema transcript:
// Alice (1-4) and Bob (2-3) share time, while Carol (10-11) stands apart.
//
// The overlap annotations do not match the segments: the segments carry
// group IDs 99, 99 and 100, and the only group (ID 99, 0-100) lists
// placeholder "stale" refs and speakers. Presumably this is deliberate so
// tests can tell recomputed data from carried-over data.
func overlapTranscriptFixture() schema.Transcript {
	idxA, idxB, idxC := 10, 20, 30

	meta := schema.Metadata{
		Application:           "seriatim",
		Version:               "v-test",
		InputReader:           "json-files",
		InputFiles:            []string{"a.json", "b.json", "c.json"},
		PreprocessingModules:  []string{"validate-raw"},
		PostprocessingModules: []string{"detect-overlaps"},
		OutputModules:         []string{"json"},
	}

	alice := schema.Segment{
		ID:                 1,
		Source:             "a.json",
		SourceSegmentIndex: &idxA,
		SourceRef:          "a.json#10",
		Speaker:            "Alice",
		Start:              1,
		End:                4,
		Text:               "a",
		OverlapGroupID:     99,
	}
	bob := schema.Segment{
		ID:                 2,
		Source:             "b.json",
		SourceSegmentIndex: &idxB,
		SourceRef:          "b.json#20",
		Speaker:            "Bob",
		Start:              2,
		End:                3,
		Text:               "b",
		OverlapGroupID:     99,
	}
	carol := schema.Segment{
		ID:                 3,
		Source:             "c.json",
		SourceSegmentIndex: &idxC,
		SourceRef:          "c.json#30",
		Speaker:            "Carol",
		Start:              10,
		End:                11,
		Text:               "c",
		OverlapGroupID:     100,
	}

	staleGroup := schema.OverlapGroup{
		ID:         99,
		Start:      0,
		End:        100,
		Segments:   []string{"stale#1", "stale#2"},
		Speakers:   []string{"stale"},
		Class:      "unknown",
		Resolution: "unresolved",
	}

	return schema.Transcript{
		Metadata:      meta,
		Segments:      []schema.Segment{alice, bob, carol},
		OverlapGroups: []schema.OverlapGroup{staleGroup},
	}
}
// transitiveOverlapFixture returns a three-segment transcript in which
// Alice (10-14) intersects both Bob (12-13) and Carol (13.5-15), while Bob
// and Carol do not intersect each other. The input carries one overlap
// group with only its ID (77) set, and no segment references it.
func transitiveOverlapFixture() schema.Transcript {
	idxA, idxB, idxC := 1, 2, 3

	segments := []schema.Segment{
		{ID: 1, Source: "a.json", SourceSegmentIndex: &idxA, Speaker: "Alice", Start: 10, End: 14, Text: "a"},
		{ID: 2, Source: "b.json", SourceSegmentIndex: &idxB, Speaker: "Bob", Start: 12, End: 13, Text: "b"},
		{ID: 3, Source: "c.json", SourceSegmentIndex: &idxC, Speaker: "Carol", Start: 13.5, End: 15, Text: "c"},
	}

	return schema.Transcript{
		Metadata: schema.Metadata{
			Application: "seriatim",
			Version:     "v-test",
		},
		Segments:      segments,
		OverlapGroups: []schema.OverlapGroup{{ID: 77}},
	}
}
// boundaryFixture returns a two-segment transcript in which Alice (1-2)
// ends at the same instant Bob (2-3) starts. Both segments are tagged with
// overlap group 7, and the input lists that group as spanning 1-3.
func boundaryFixture() schema.Transcript {
	idxA, idxB := 1, 2

	segments := []schema.Segment{
		{ID: 1, Source: "a.json", SourceSegmentIndex: &idxA, Speaker: "Alice", Start: 1, End: 2, Text: "a", OverlapGroupID: 7},
		{ID: 2, Source: "b.json", SourceSegmentIndex: &idxB, Speaker: "Bob", Start: 2, End: 3, Text: "b", OverlapGroupID: 7},
	}

	return schema.Transcript{
		Metadata: schema.Metadata{
			Application: "seriatim",
			Version:     "v-test",
		},
		Segments:      segments,
		OverlapGroups: []schema.OverlapGroup{{ID: 7, Start: 1, End: 3}},
	}
}
func assertSegmentIDs(t *testing.T, segments []schema.Segment, want []int) {
t.Helper()
got := make([]int, len(segments))