Implemented an overlap detection module in the postprocessing chain

This commit is contained in:
2026-04-26 20:39:49 -05:00
parent f9ca80f2e8
commit e42a2326e8
8 changed files with 464 additions and 4 deletions

118
internal/overlap/detect.go Normal file
View File

@@ -0,0 +1,118 @@
package overlap
import (
"fmt"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// defaultClass is the Class value assigned to every newly detected overlap
// group.
const defaultClass = "unknown"

// defaultResolution is the Resolution value assigned to every newly detected
// overlap group.
const defaultResolution = "unresolved"
// Detect annotates overlapping segment groups in an already sorted merged transcript.
//
// Any overlap annotations already present on in are discarded first. The
// segments are then swept in slice order: a segment whose Start lies strictly
// before the running end of the current candidate joins that candidate,
// otherwise the candidate is finalized and a new one begins. The sweep only
// compares each segment against the running end, so it relies on the segments
// being ordered by Start.
//
// The returned transcript carries the new annotations. The caller's Segments
// slice is left untouched.
func Detect(in model.MergedTranscript) model.MergedTranscript {
	// in is a copy of the caller's struct, but its Segments slice still shares
	// the caller's backing array. Detach it before any OverlapGroupID is
	// written so the caller's transcript is not modified as a side effect.
	// Empty and nil slices are left alone so their nil-ness is preserved.
	if len(in.Segments) > 0 {
		segments := make([]model.Segment, len(in.Segments))
		copy(segments, in.Segments)
		in.Segments = segments
	}

	clearExisting(&in)
	if len(in.Segments) < 2 {
		// A single segment cannot overlap with anything.
		return in
	}

	var groupID int
	var candidate overlapCandidate
	for index := range in.Segments {
		segment := in.Segments[index]
		switch {
		case !candidate.active:
			// First segment: seed the initial candidate.
			candidate = newCandidate(index, segment)
		case segment.Start < candidate.end:
			// Starts before the candidate's running end: same candidate.
			candidate.add(index, segment)
		default:
			// Gap reached: close the current candidate and start a new one.
			groupID = finalizeCandidate(&in, candidate, groupID)
			candidate = newCandidate(index, segment)
		}
	}

	// The last candidate is still open when the loop ends. Its returned ID is
	// not needed because no further groups follow.
	finalizeCandidate(&in, candidate, groupID)
	return in
}
// overlapCandidate accumulates a run of segments that may form an overlap
// group while the transcript is being swept.
type overlapCandidate struct {
	// active is true once the candidate has been seeded with a segment; the
	// zero value is an inactive candidate.
	active bool
	// indices holds the positions of the member segments in the transcript's
	// Segments slice, in the order they were added.
	indices []int
	// start is the Start of the seeding segment; end is the largest End seen
	// among the members so far.
	start, end float64
}
// newCandidate returns an active candidate seeded with a single segment: index
// becomes its only member and the segment's Start and End become the
// candidate's time span.
func newCandidate(index int, segment model.Segment) overlapCandidate {
	var seeded overlapCandidate
	seeded.active = true
	seeded.indices = []int{index}
	seeded.start, seeded.end = segment.Start, segment.End
	return seeded
}
// add appends index to the candidate's members and extends the candidate's end
// when the segment finishes later than every member seen so far. The
// candidate's start is left unchanged.
func (c *overlapCandidate) add(index int, segment model.Segment) {
	if finish := segment.End; finish > c.end {
		c.end = finish
	}
	c.indices = append(c.indices, index)
}
// finalizeCandidate turns candidate into an overlap group on in when it
// qualifies, and returns the most recently used group ID.
//
// A candidate qualifies only when it is active, has at least two member
// segments, and those members have at least two distinct speakers. When it
// does not qualify, in is left untouched and currentGroupID is returned as is.
// Otherwise every member segment is stamped with the new ID
// (currentGroupID + 1), a matching OverlapGroup with the default class and
// resolution is appended to in.OverlapGroups, and the new ID is returned.
func finalizeCandidate(in *model.MergedTranscript, candidate overlapCandidate, currentGroupID int) int {
	members := candidate.indices
	if !candidate.active || len(members) < 2 {
		return currentGroupID
	}

	speakers := distinctSpeakers(in.Segments, members)
	if len(speakers) < 2 {
		// Every member has the same speaker, so no group is recorded.
		return currentGroupID
	}

	nextID := currentGroupID + 1
	refs := make([]string, len(members))
	for pos, segIndex := range members {
		// Work through a pointer so the ID is written to the transcript's
		// segment, not to a copy.
		seg := &in.Segments[segIndex]
		seg.OverlapGroupID = nextID
		refs[pos] = segmentRef(*seg)
	}

	group := model.OverlapGroup{
		ID:         nextID,
		Start:      candidate.start,
		End:        candidate.end,
		Segments:   refs,
		Speakers:   speakers,
		Class:      defaultClass,
		Resolution: defaultResolution,
	}
	in.OverlapGroups = append(in.OverlapGroups, group)
	return nextID
}
// distinctSpeakers returns the Speaker values of the segments selected by
// indices, with duplicates removed and first-occurrence order preserved.
func distinctSpeakers(segments []model.Segment, indices []int) []string {
	unique := make([]string, 0, len(indices))
	known := make(map[string]struct{}, len(indices))
	for _, i := range indices {
		name := segments[i].Speaker
		if _, dup := known[name]; !dup {
			known[name] = struct{}{}
			unique = append(unique, name)
		}
	}
	return unique
}
// segmentRef builds the reference string for a segment in the form
// "<Source>#<SourceSegmentIndex>".
func segmentRef(segment model.Segment) string {
	source, position := segment.Source, segment.SourceSegmentIndex
	return fmt.Sprintf("%s#%d", source, position)
}
// clearExisting discards any overlap annotations already on the transcript:
// every segment's OverlapGroupID is reset to 0 and OverlapGroups is replaced
// with an empty, non-nil slice.
func clearExisting(in *model.MergedTranscript) {
	for i := range in.Segments {
		in.Segments[i].OverlapGroupID = 0
	}
	in.OverlapGroups = []model.OverlapGroup{}
}