Implemented an overlap detection module in the postprocessing chain
This commit is contained in:
118
internal/overlap/detect.go
Normal file
118
internal/overlap/detect.go
Normal file
@@ -0,0 +1,118 @@
|
||||
package overlap
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
// Placeholder annotations stamped onto every overlap group at detection time.
const (
	// defaultClass is the Class value given to a newly recorded overlap group.
	defaultClass = "unknown"

	// defaultResolution is the Resolution value given to a newly recorded
	// overlap group.
	defaultResolution = "unresolved"
)
|
||||
|
||||
// Detect annotates overlapping segment groups in an already sorted merged transcript.
|
||||
func Detect(in model.MergedTranscript) model.MergedTranscript {
|
||||
clearExisting(&in)
|
||||
if len(in.Segments) < 2 {
|
||||
return in
|
||||
}
|
||||
|
||||
var groupID int
|
||||
var candidate overlapCandidate
|
||||
for index := range in.Segments {
|
||||
segment := in.Segments[index]
|
||||
if !candidate.active {
|
||||
candidate = newCandidate(index, segment)
|
||||
continue
|
||||
}
|
||||
|
||||
if segment.Start < candidate.end {
|
||||
candidate.add(index, segment)
|
||||
continue
|
||||
}
|
||||
|
||||
groupID = finalizeCandidate(&in, candidate, groupID)
|
||||
candidate = newCandidate(index, segment)
|
||||
}
|
||||
|
||||
finalizeCandidate(&in, candidate, groupID)
|
||||
return in
|
||||
}
|
||||
|
||||
// overlapCandidate accumulates a run of segments that may turn out to form an
// overlap group.
type overlapCandidate struct {
	// active is false only for the zero value; newCandidate sets it to true.
	active bool
	// indices lists the positions of the run's segments, in the order added.
	indices []int
	// start is the Start of the segment that opened the run; end is the
	// largest End among the segments added so far.
	start, end float64
}
|
||||
|
||||
func newCandidate(index int, segment model.Segment) overlapCandidate {
|
||||
return overlapCandidate{
|
||||
active: true,
|
||||
indices: []int{index},
|
||||
start: segment.Start,
|
||||
end: segment.End,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *overlapCandidate) add(index int, segment model.Segment) {
|
||||
c.indices = append(c.indices, index)
|
||||
if segment.End > c.end {
|
||||
c.end = segment.End
|
||||
}
|
||||
}
|
||||
|
||||
func finalizeCandidate(in *model.MergedTranscript, candidate overlapCandidate, currentGroupID int) int {
|
||||
if !candidate.active || len(candidate.indices) < 2 {
|
||||
return currentGroupID
|
||||
}
|
||||
|
||||
speakers := distinctSpeakers(in.Segments, candidate.indices)
|
||||
if len(speakers) < 2 {
|
||||
return currentGroupID
|
||||
}
|
||||
|
||||
groupID := currentGroupID + 1
|
||||
refs := make([]string, 0, len(candidate.indices))
|
||||
for _, index := range candidate.indices {
|
||||
in.Segments[index].OverlapGroupID = groupID
|
||||
refs = append(refs, segmentRef(in.Segments[index]))
|
||||
}
|
||||
|
||||
in.OverlapGroups = append(in.OverlapGroups, model.OverlapGroup{
|
||||
ID: groupID,
|
||||
Start: candidate.start,
|
||||
End: candidate.end,
|
||||
Segments: refs,
|
||||
Speakers: speakers,
|
||||
Class: defaultClass,
|
||||
Resolution: defaultResolution,
|
||||
})
|
||||
return groupID
|
||||
}
|
||||
|
||||
func distinctSpeakers(segments []model.Segment, indices []int) []string {
|
||||
seen := make(map[string]struct{}, len(indices))
|
||||
speakers := make([]string, 0, len(indices))
|
||||
for _, index := range indices {
|
||||
speaker := segments[index].Speaker
|
||||
if _, exists := seen[speaker]; exists {
|
||||
continue
|
||||
}
|
||||
seen[speaker] = struct{}{}
|
||||
speakers = append(speakers, speaker)
|
||||
}
|
||||
return speakers
|
||||
}
|
||||
|
||||
func segmentRef(segment model.Segment) string {
|
||||
return fmt.Sprintf("%s#%d", segment.Source, segment.SourceSegmentIndex)
|
||||
}
|
||||
|
||||
func clearExisting(in *model.MergedTranscript) {
|
||||
in.OverlapGroups = make([]model.OverlapGroup, 0)
|
||||
for index := range in.Segments {
|
||||
in.Segments[index].OverlapGroupID = 0
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user