package coalesce

import (
	"fmt"
	"strings"

	"gitea.maximumdirect.net/eric/seriatim/internal/model"
)

// Summary reports what a single coalesce pass did.
type Summary struct {
	// OriginalSegmentsMerged is the number of input segments that were folded
	// into a coalesced segment.
	OriginalSegmentsMerged int
	// CoalescedSegments is the number of merged segments that were emitted.
	CoalescedSegments int
}

// Apply walks in.Segments in their existing order and collapses consecutive
// segments from the same speaker into one segment whenever the next segment
// starts no more than gap after the previous one ends (gap is expressed in
// the same units as Segment.Start and Segment.End).
//
// A segment from a different speaker that is categorized as "backchannel" or
// "filler" does not end the run in progress: it is set aside and emitted
// immediately after that run once the run is closed.
//
// Transcripts with fewer than two segments are returned unchanged together
// with a zero Summary. Otherwise the returned transcript is a new value that
// carries the rebuilt Segments and the input's OverlapGroups.
func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summary) {
	var summary Summary
	if len(in.Segments) < 2 {
		return in, summary
	}

	out := model.MergedTranscript{
		Segments:      make([]model.Segment, 0, len(in.Segments)),
		OverlapGroups: in.OverlapGroups,
	}

	nextID := 0
	active := newRun(in.Segments[0])
	var deferred []model.Segment

	// flush emits the active run followed by every interjection that was set
	// aside while the run was open, then empties the set-aside list.
	flush := func() {
		nextID = appendRun(&out, active, nextID, &summary)
		out.Segments = append(out.Segments, deferred...)
		deferred = deferred[:0]
	}

	for i := 1; i < len(in.Segments); i++ {
		seg := in.Segments[i]
		switch {
		case active.canMerge(seg, gap):
			active.add(seg)
		case seg.Speaker != active.speaker() && hasAnyCategory(seg, "backchannel", "filler"):
			// Another speaker's interjection: hold it back so it does not
			// split the run in progress.
			deferred = append(deferred, seg)
		default:
			flush()
			active = newRun(seg)
		}
	}
	flush()

	return out, summary
}

// run is a sequence of segments that are candidates for being merged into a
// single segment. It always holds at least one segment when built by newRun.
type run struct {
	segments []model.Segment
}

// newRun starts a run containing only segment.
func newRun(segment model.Segment) run {
	seed := []model.Segment{segment}
	return run{segments: seed}
}

// canMerge reports whether next has the same speaker as the last segment in
// the run and starts no more than gap after that segment ends.
func (r run) canMerge(next model.Segment, gap float64) bool {
	tail := r.segments[len(r.segments)-1]
	if tail.Speaker != next.Speaker {
		return false
	}
	return next.Start-tail.End <= gap
}

// speaker returns the speaker of the run's first segment.
func (r run) speaker() string {
	head := r.segments[0]
	return head.Speaker
}

// add appends segment to the end of the run.
func (r *run) add(segment model.Segment) {
	r.segments = append(r.segments, segment)
}

// appendRun writes current to out.Segments and returns the most recently used
// coalesced ID. A run holding exactly one segment is emitted as-is and leaves
// both the ID and summary untouched; any other run is emitted as one
// coalesced segment under the next ID, and summary is updated accordingly.
func appendRun(out *model.MergedTranscript, current run, coalescedID int, summary *Summary) int {
	if n := len(current.segments); n != 1 {
		id := coalescedID + 1
		out.Segments = append(out.Segments, current.coalescedSegment(id))
		summary.OriginalSegmentsMerged += n
		summary.CoalescedSegments++
		return id
	}

	out.Segments = append(out.Segments, current.segments[0])
	return coalescedID
}

// coalescedSegment builds the single segment that stands in for the whole
// run. Its SourceRef is "coalesce:<id>", its time span covers every member,
// its Text is the members' non-blank trimmed texts joined by single spaces,
// its Words are the members' words concatenated in order, and DerivedFrom
// lists a reference to each member. Source is the first member's source
// unless a member disagrees with it, in which case it becomes "derived".
func (r run) coalescedSegment(id int) model.Segment {
	head := r.segments[0]
	count := len(r.segments)

	source := head.Source
	start, end := head.Start, head.End
	refs := make([]string, 0, count)
	parts := make([]string, 0, count)
	words := make([]model.Word, 0)

	for _, seg := range r.segments {
		if seg.Start < start {
			start = seg.Start
		}
		if seg.End > end {
			end = seg.End
		}
		if seg.Source != source {
			source = "derived"
		}
		if body := strings.TrimSpace(seg.Text); body != "" {
			parts = append(parts, body)
		}
		words = append(words, seg.Words...)
		refs = append(refs, segmentRef(seg))
	}

	return model.Segment{
		Source:      source,
		SourceRef:   fmt.Sprintf("coalesce:%d", id),
		DerivedFrom: refs,
		Speaker:     head.Speaker,
		Start:       start,
		End:         end,
		Text:        strings.Join(parts, " "),
		Words:       words,
	}
}

// segmentRef returns the identifier recorded in DerivedFrom for segment:
// "<Source>#<index>" when SourceSegmentIndex is set, otherwise SourceRef when
// it is non-empty, otherwise the bare Source.
func segmentRef(segment model.Segment) string {
	switch {
	case segment.SourceSegmentIndex != nil:
		return fmt.Sprintf("%s#%d", segment.Source, *segment.SourceSegmentIndex)
	case segment.SourceRef != "":
		return segment.SourceRef
	default:
		return segment.Source
	}
}

// hasAnyCategory reports whether segment carries at least one of the given
// categories.
func hasAnyCategory(segment model.Segment, categories ...string) bool {
	for _, wanted := range categories {
		for _, have := range segment.Categories {
			if have == wanted {
				return true
			}
		}
	}
	return false
}