// Package coalesce merges adjacent same-speaker transcript segments into
// single derived segments.
package coalesce

import (
	"fmt"
	"strings"

	"gitea.maximumdirect.net/eric/seriatim/internal/model"
)

// Summary reports what a single Apply call did.
type Summary struct {
	// OriginalSegmentsMerged is the total number of input segments that were
	// folded into multi-segment runs.
	OriginalSegmentsMerged int
	// CoalescedSegments is the number of merged segments that were emitted.
	CoalescedSegments int
}

// Apply walks in.Segments in their existing order and collapses runs of
// segments spoken by the same speaker into one segment each.
//
// A segment joins the active run when it has the run's speaker and its Start
// is at most gap past the run's current end. A segment from a different
// speaker that is categorized as "backchannel" or "filler" is held back rather
// than ending the run; held segments are written out after the run they
// interrupted, unless they end up seeding the next run (see
// seedRunFromPending).
//
// Transcripts with fewer than two segments are returned unchanged with a zero
// Summary. Otherwise the result carries the rewritten segments plus the
// input's OverlapGroups.
func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summary) {
	var summary Summary
	if len(in.Segments) < 2 {
		return in, summary
	}

	out := model.MergedTranscript{
		Segments:      make([]model.Segment, 0, len(in.Segments)),
		OverlapGroups: in.OverlapGroups,
	}

	nextID := 0
	active := newRun(in.Segments[0])
	var held []model.Segment

	for i := 1; i < len(in.Segments); i++ {
		seg := in.Segments[i]
		sameSpeaker := seg.Speaker == active.speaker()

		switch {
		case sameSpeaker && active.canMerge(seg, gap):
			active.add(seg)
			continue
		case !sameSpeaker && isSkippableInterjection(seg):
			held = append(held, seg)
			continue
		}

		// The active run is over: either the same speaker resumed too late,
		// or a different speaker said something that is not an interjection.
		nextID = appendRun(&out, active, nextID, &summary)

		seeded, rest, ok := seedRunFromPending(held, seg, gap)
		if !ok {
			// Nothing held can start a run with seg, so release the held
			// interjections and start a fresh run.
			out.Segments = append(out.Segments, held...)
			held = held[:0]
			active = newRun(seg)
			continue
		}
		held, active = rest, seeded
	}

	appendRun(&out, active, nextID, &summary)
	out.Segments = append(out.Segments, held...)
	return out, summary
}

// run accumulates the segments of one speaker that will be emitted together,
// along with the smallest Start and largest End seen among them.
type run struct {
	segments []model.Segment
	start    float64
	end      float64
}

// newRun starts a run that contains only segment.
func newRun(segment model.Segment) run {
	initial := []model.Segment{segment}
	return run{segments: initial, start: segment.Start, end: segment.End}
}

// canMerge reports whether next has the run's speaker and starts no more than
// gap after the run's current end.
func (r run) canMerge(next model.Segment, gap float64) bool {
	if next.Speaker != r.speaker() {
		return false
	}
	return next.Start-r.end <= gap
}

// speaker returns the speaker of the run's first segment.
func (r run) speaker() string {
	return r.segments[0].Speaker
}

// add appends segment to the run and widens the run's time span to cover it.
func (r *run) add(segment model.Segment) {
	if segment.End > r.end {
		r.end = segment.End
	}
	if segment.Start < r.start {
		r.start = segment.Start
	}
	r.segments = append(r.segments, segment)
}

// seedRunFromPending tries to build a run that begins with held-back segments
// and ends with segment.
//
// Each pending entry with segment's speaker is tried, in order, as the head of
// a candidate run. Later pending entries from that speaker are absorbed while
// they remain mergeable. The first candidate that can also merge segment wins:
// it is returned with segment added, together with the pending entries it did
// not absorb (in their original order) and true. If no candidate works, the
// result is an empty run, the untouched pending slice, and false.
func seedRunFromPending(pending []model.Segment, segment model.Segment, gap float64) (run, []model.Segment, bool) {
	for start, head := range pending {
		if head.Speaker != segment.Speaker {
			continue
		}

		candidate := newRun(head)
		taken := make([]bool, len(pending))
		taken[start] = true
		takenCount := 1

		for i := start + 1; i < len(pending); i++ {
			later := pending[i]
			if later.Speaker != segment.Speaker || !candidate.canMerge(later, gap) {
				continue
			}
			candidate.add(later)
			taken[i] = true
			takenCount++
		}

		if !candidate.canMerge(segment, gap) {
			continue
		}
		candidate.add(segment)

		remaining := make([]model.Segment, 0, len(pending)-takenCount)
		for i, skipped := range pending {
			if !taken[i] {
				remaining = append(remaining, skipped)
			}
		}
		return candidate, remaining, true
	}
	return run{}, pending, false
}

// appendRun writes current to out.Segments and returns the most recently used
// coalesced ID.
//
// A run of exactly one segment is passed through untouched and does not
// consume an ID. Any other run is emitted as one coalesced segment under the
// next ID, and summary is updated to account for it.
func appendRun(out *model.MergedTranscript, current run, coalescedID int, summary *Summary) int {
	count := len(current.segments)
	if count == 1 {
		out.Segments = append(out.Segments, current.segments[0])
		return coalescedID
	}

	id := coalescedID + 1
	out.Segments = append(out.Segments, current.coalescedSegment(id))
	summary.CoalescedSegments++
	summary.OriginalSegmentsMerged += count
	return id
}

// coalescedSegment builds the single segment that stands in for the whole run.
//
// Speaker comes from the first segment and the time span from the run. Words
// are concatenated in run order, Text is the space-joined non-blank trimmed
// texts, and DerivedFrom lists one reference per member. Source is the first
// segment's Source unless some member differs from it, in which case it is
// "derived".
func (r run) coalescedSegment(id int) model.Segment {
	memberCount := len(r.segments)
	head := r.segments[0]

	source := head.Source
	words := make([]model.Word, 0)
	refs := make([]string, 0, memberCount)
	parts := make([]string, 0, memberCount)

	for _, member := range r.segments {
		if member.Source != source {
			source = "derived"
		}
		words = append(words, member.Words...)
		refs = append(refs, segmentRef(member))
		if trimmed := strings.TrimSpace(member.Text); trimmed != "" {
			parts = append(parts, trimmed)
		}
	}

	return model.Segment{
		Source:      source,
		SourceRef:   fmt.Sprintf("coalesce:%d", id),
		DerivedFrom: refs,
		Speaker:     head.Speaker,
		Start:       r.start,
		End:         r.end,
		Words:       words,
		Text:        strings.Join(parts, " "),
	}
}

// segmentRef returns the reference string recorded in DerivedFrom for segment:
// "<Source>#<index>" when SourceSegmentIndex is set, otherwise SourceRef when
// it is non-empty, otherwise Source alone.
func segmentRef(segment model.Segment) string {
	switch {
	case segment.SourceSegmentIndex != nil:
		return fmt.Sprintf("%s#%d", segment.Source, *segment.SourceSegmentIndex)
	case segment.SourceRef != "":
		return segment.SourceRef
	default:
		return segment.Source
	}
}

// isSkippableInterjection reports whether segment is categorized as
// "backchannel" or "filler".
func isSkippableInterjection(segment model.Segment) bool {
	return hasAnyCategory(segment, "backchannel", "filler")
}

// hasAnyCategory reports whether segment.Categories contains at least one of
// the given categories.
func hasAnyCategory(segment model.Segment, categories ...string) bool {
	for _, want := range categories {
		for _, have := range segment.Categories {
			if have == want {
				return true
			}
		}
	}
	return false
}