Minor updates to overlap detection and segment coalescing logic

This commit is contained in:
2026-04-28 14:11:38 -05:00
parent 28c2eea340
commit a3ca6665a9
14 changed files with 662 additions and 95 deletions

View File

@@ -57,17 +57,20 @@ func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summ
// run is a group of segments collected for merging into a single coalesced
// segment, together with the time bounds of the group.
type run struct {
	// segments holds the members in the order they were added.
	segments []model.Segment
	// start and end are the bounds of the run: seeded from the first segment
	// and widened as further segments are added.
	start, end float64
}
// newRun starts a run containing only segment; the run's bounds are taken
// directly from that segment's Start and End.
func newRun(segment model.Segment) run {
	var r run
	r.segments = []model.Segment{segment}
	r.start, r.end = segment.Start, segment.End
	return r
}
// canMerge reports whether next may be appended to the run: it must have the
// run's speaker and start no more than gap after the run's effective end.
//
// The comparison deliberately uses r.end (the latest End among the segments
// in the run) rather than the End of the most recently appended segment. With
// out-of-order input the last segment can finish well before an earlier,
// longer one, and measuring the gap from it would wrongly reject a segment
// that follows closely after the run's real end.
func (r run) canMerge(next model.Segment, gap float64) bool {
	return r.speaker() == next.Speaker && next.Start-r.end <= gap
}
func (r run) speaker() string {
@@ -76,6 +79,12 @@ func (r run) speaker() string {
// add appends segment to the run and widens the run's bounds when the segment
// starts before the current start or ends after the current end.
func (r *run) add(segment model.Segment) {
	if r.start > segment.Start {
		r.start = segment.Start
	}
	if r.end < segment.End {
		r.end = segment.End
	}
	r.segments = append(r.segments, segment)
}
func seedRunFromPending(pending []model.Segment, segment model.Segment, gap float64) (run, []model.Segment, bool) {
@@ -129,19 +138,13 @@ func (r run) coalescedSegment(id int) model.Segment {
SourceRef: fmt.Sprintf("coalesce:%d", id),
DerivedFrom: make([]string, 0, len(r.segments)),
Speaker: first.Speaker,
Start: first.Start,
End: first.End,
Start: r.start,
End: r.end,
Words: make([]model.Word, 0),
}
text := make([]string, 0, len(r.segments))
for _, segment := range r.segments {
if segment.Start < merged.Start {
merged.Start = segment.Start
}
if segment.End > merged.End {
merged.End = segment.End
}
if segment.Source != merged.Source {
merged.Source = "derived"
}

View File

@@ -115,6 +115,51 @@ func TestApplyHonorsCurrentOrder(t *testing.T) {
}
}
// TestApplyUsesEffectiveRunEndForReorderedSegments feeds Apply a long segment
// (10-20) followed by an earlier short one (1-2) and then a segment starting
// at 22. With a gap of 3 the third segment is within range of the run's latest
// end (20) but not of the last-added segment's end (2), so all three are
// expected to collapse into one segment spanning 1-23.
func TestApplyUsesEffectiveRunEndForReorderedSegments(t *testing.T) {
	segments := []model.Segment{
		segment("a.json", 0, "Alice", 10, 20, "long"),
		segment("a.json", 1, "Alice", 1, 2, "early"),
		segment("a.json", 2, "Alice", 22, 23, "after long"),
	}
	input := model.MergedTranscript{Segments: segments}

	result, stats := Apply(input, 3)

	if stats.OriginalSegmentsMerged != 3 || stats.CoalescedSegments != 1 {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 1 {
		t.Fatalf("segment count = %d, want 1", n)
	}

	// Safe to index: the length was verified just above.
	only := result.Segments[0]
	if only.Text != "long early after long" {
		t.Fatalf("text = %q", only.Text)
	}
	if only.Start != 1 || only.End != 23 {
		t.Fatalf("bounds = %f-%f, want 1-23", only.Start, only.End)
	}
}
// TestApplyDoesNotMergeBeyondEffectiveRunEndGap is the negative counterpart of
// the reordered-segment case: the third segment starts at 23.1, which is 3.1
// past the run's latest end (20) and therefore outside the gap of 3. Only the
// first two segments are expected to merge; the third must remain separate.
func TestApplyDoesNotMergeBeyondEffectiveRunEndGap(t *testing.T) {
	segments := []model.Segment{
		segment("a.json", 0, "Alice", 10, 20, "long"),
		segment("a.json", 1, "Alice", 1, 2, "early"),
		segment("a.json", 2, "Alice", 23.1, 24, "too late"),
	}
	input := model.MergedTranscript{Segments: segments}

	result, stats := Apply(input, 3)

	if stats.OriginalSegmentsMerged != 2 || stats.CoalescedSegments != 1 {
		t.Fatalf("summary = %#v", stats)
	}
	if n := len(result.Segments); n != 2 {
		t.Fatalf("segment count = %d, want 2", n)
	}

	// Safe to index: exactly two segments were verified just above.
	if result.Segments[0].Text != "long early" || result.Segments[1].Text != "too late" {
		t.Fatalf("segments = %#v", result.Segments)
	}
}
func TestApplyDerivedProvenanceForMixedSourcesAndDerivedInputs(t *testing.T) {
first := segment("a.json", 0, "Alice", 1, 2, "first")
second := model.Segment{