// Package danglers merges tiny derived transcript fragments back into nearby
// segments that share at least one DerivedFrom reference with them.
package danglers

import (
	"fmt"
	"sort"
	"strings"
	"unicode"
	"unicode/utf8"

	"gitea.maximumdirect.net/eric/seriatim/internal/model"
)

// Summary records deterministic counters for a resolve-danglers pass.
type Summary struct {
	// DanglersMerged is the number of fragments folded into another segment.
	DanglersMerged int
	// TargetsChanged is the number of segments that absorbed a fragment.
	TargetsChanged int
}

// Apply merges tiny derived fragments back into matching provenance targets.
//
// Dangling ends are folded into an earlier segment first; the result of that
// pass is then scanned for dangling starts, which are folded into a later
// segment. Transcripts with fewer than two segments are returned unchanged.
//
// The returned Summary reports the total number of fragments merged by both
// passes and the number of segments in the final output whose SourceRef
// carries the "resolve-danglers:" prefix.
//
// NOTE(review): each pass numbers its targets from 1, so two different
// segments may end up with the same SourceRef value across passes. Confirm
// whether consumers rely on SourceRef being unique.
func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	if len(in.Segments) < 2 {
		return in, Summary{}
	}

	afterEnds, endSummary := resolveDanglingEnds(in)
	afterStarts, startSummary := resolveDanglingStarts(afterEnds)

	return afterStarts, Summary{
		DanglersMerged: endSummary.DanglersMerged + startSummary.DanglersMerged,
		TargetsChanged: countResolvedTargets(afterStarts),
	}
}

// countResolvedTargets counts the segments whose SourceRef starts with
// "resolve-danglers:".
func countResolvedTargets(in model.MergedTranscript) int {
	count := 0
	for _, segment := range in.Segments {
		if strings.HasPrefix(segment.SourceRef, "resolve-danglers:") {
			count++
		}
	}
	return count
}

// resolveDanglingEnds folds every dangling end into the nearest earlier,
// not-yet-consumed segment that shares a DerivedFrom reference with it.
//
// Segments are visited front to back. A chosen target always lies before the
// current index and has therefore already been visited, so it can never be
// consumed after fragments were attached to it.
func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	consumed := make([]bool, len(in.Segments))
	builders := make(map[int]*builder)
	merged := 0

	for index, segment := range in.Segments {
		if consumed[index] || !isDanglingEnd(segment) {
			continue
		}
		target := nearestPriorMatch(in.Segments, consumed, index)
		if target < 0 {
			continue
		}
		builderFor(builders, target, in.Segments[target]).appendEnd(segment)
		consumed[index] = true
		merged++
	}

	return buildResult(in, consumed, builders, merged)
}

// resolveDanglingStarts folds every dangling start into the nearest later,
// not-yet-consumed segment that shares a DerivedFrom reference with it.
//
// Segments are visited back to front, mirroring resolveDanglingEnds. This
// ordering matters for two reasons:
//
//   - A chosen target always lies after the current index and has already
//     been visited, so it cannot be consumed later. Walking forward would let
//     a fragment attach to a target that is itself merged away afterwards,
//     silently dropping the fragment from the output.
//   - Fragments sharing a target are recorded nearest-first, which is the
//     order builder.segment expects when it restores transcript order.
func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	consumed := make([]bool, len(in.Segments))
	builders := make(map[int]*builder)
	merged := 0

	for index := len(in.Segments) - 1; index >= 0; index-- {
		segment := in.Segments[index]
		if consumed[index] || !isDanglingStart(segment) {
			continue
		}
		target := nearestSubsequentMatch(in.Segments, consumed, index)
		if target < 0 {
			continue
		}
		builderFor(builders, target, in.Segments[target]).prependStart(segment)
		consumed[index] = true
		merged++
	}

	return buildResult(in, consumed, builders, merged)
}

// buildResult assembles the output transcript of a single pass.
//
// Consumed segments are dropped, segments with a builder are replaced by the
// builder's merged segment, and all other segments are copied through. Targets
// are numbered in ascending index order and tagged with a
// "resolve-danglers:<n>" SourceRef, starting at 1. When nothing was merged the
// input is returned unchanged with a zero Summary.
//
// NOTE(review): only Segments and OverlapGroups are carried over to the new
// transcript; confirm model.MergedTranscript has no other fields that should
// survive.
func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) {
	if merged == 0 {
		return in, Summary{}
	}

	// Map iteration order is random; sort the target indexes so the numbering
	// is deterministic.
	builderIDs := make([]int, 0, len(builders))
	for index := range builders {
		builderIDs = append(builderIDs, index)
	}
	sort.Ints(builderIDs)
	for id, index := range builderIDs {
		builders[index].sourceRef = fmt.Sprintf("resolve-danglers:%d", id+1)
	}

	out := model.MergedTranscript{
		Segments:      make([]model.Segment, 0, len(in.Segments)-merged),
		OverlapGroups: in.OverlapGroups,
	}
	for index, segment := range in.Segments {
		if consumed[index] {
			continue
		}
		if b, exists := builders[index]; exists {
			out.Segments = append(out.Segments, b.segment())
			continue
		}
		out.Segments = append(out.Segments, segment)
	}

	return out, Summary{
		DanglersMerged: merged,
		TargetsChanged: len(builders),
	}
}

// builder accumulates the fragments that are being folded into one target
// segment.
type builder struct {
	// target is the segment that absorbs the fragments.
	target model.Segment
	// prefixes holds fragments that precede the target, nearest first.
	prefixes []model.Segment
	// suffixes holds fragments that follow the target, nearest first.
	suffixes []model.Segment
	// sourceRef is the SourceRef assigned to the merged segment.
	sourceRef string
}

// builderFor returns the builder registered for index, creating one around
// target if none exists yet.
func builderFor(builders map[int]*builder, index int, target model.Segment) *builder {
	if existing, exists := builders[index]; exists {
		return existing
	}
	created := &builder{target: target}
	builders[index] = created
	return created
}

// appendEnd records segment as a fragment that follows the target.
func (b *builder) appendEnd(segment model.Segment) {
	b.suffixes = append(b.suffixes, segment)
}

// prependStart records segment as a fragment that precedes the target.
// Callers must add fragments nearest-first; segment reverses them.
func (b *builder) prependStart(segment model.Segment) {
	b.prefixes = append(b.prefixes, segment)
}

// segment produces the merged segment for this builder.
//
// The parts are ordered as prefixes (reversed), target, suffixes. The merged
// segment takes its Speaker and a copy of its Categories from the target,
// spans from the smallest Start to the largest End of all parts, joins the
// trimmed non-empty texts with single spaces, concatenates all Words, and
// lists the sorted union of the parts' DerivedFrom references. Source is the
// parts' common Source, or "derived" when they disagree.
func (b *builder) segment() model.Segment {
	parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes))
	// Prefixes were recorded nearest-first; walk them backwards to restore
	// transcript order.
	for index := len(b.prefixes) - 1; index >= 0; index-- {
		parts = append(parts, b.prefixes[index])
	}
	parts = append(parts, b.target)
	parts = append(parts, b.suffixes...)

	merged := model.Segment{
		Source:      parts[0].Source,
		SourceRef:   b.sourceRef,
		DerivedFrom: unionDerivedFrom(parts),
		Speaker:     b.target.Speaker,
		Start:       parts[0].Start,
		End:         parts[0].End,
		Categories:  append([]string(nil), b.target.Categories...),
		Words:       make([]model.Word, 0),
	}

	text := make([]string, 0, len(parts))
	for _, part := range parts {
		if part.Source != merged.Source {
			merged.Source = "derived"
		}
		if part.Start < merged.Start {
			merged.Start = part.Start
		}
		if part.End > merged.End {
			merged.End = part.End
		}
		if trimmed := strings.TrimSpace(part.Text); trimmed != "" {
			text = append(text, trimmed)
		}
		merged.Words = append(merged.Words, part.Words...)
	}
	merged.Text = strings.Join(text, " ")

	return merged
}

// nearestPriorMatch returns the index of the closest unconsumed segment before
// index that shares a DerivedFrom reference with segments[index], or -1 if
// there is none.
func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int {
	for candidate := index - 1; candidate >= 0; candidate-- {
		if consumed[candidate] {
			continue
		}
		if sharesDerivedFrom(segments[index], segments[candidate]) {
			return candidate
		}
	}
	return -1
}

// nearestSubsequentMatch returns the index of the closest unconsumed segment
// after index that shares a DerivedFrom reference with segments[index], or -1
// if there is none.
func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int {
	for candidate := index + 1; candidate < len(segments); candidate++ {
		if consumed[candidate] {
			continue
		}
		if sharesDerivedFrom(segments[index], segments[candidate]) {
			return candidate
		}
	}
	return -1
}

// isDanglingEnd reports whether segment is a derived fragment of at most two
// words whose text ends with a punctuation character.
func isDanglingEnd(segment model.Segment) bool {
	return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 && endsWithPunctuation(segment.Text)
}

// isDanglingStart reports whether segment is a derived fragment of at most two
// words. Segments with empty text also qualify.
func isDanglingStart(segment model.Segment) bool {
	return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2
}

// hasDerivedFrom reports whether segment lists at least one DerivedFrom
// reference.
func hasDerivedFrom(segment model.Segment) bool {
	return len(segment.DerivedFrom) > 0
}

// wordCount returns the number of whitespace-separated words in text.
func wordCount(text string) int {
	// strings.Fields already ignores leading and trailing whitespace.
	return len(strings.Fields(text))
}

// endsWithPunctuation reports whether the last rune of text, ignoring
// surrounding whitespace, is a Unicode punctuation character. Empty text and
// text ending in an invalid UTF-8 sequence yield false.
func endsWithPunctuation(text string) bool {
	text = strings.TrimSpace(text)
	if text == "" {
		return false
	}
	r, _ := utf8.DecodeLastRuneInString(text)
	return r != utf8.RuneError && unicode.IsPunct(r)
}

// sharesDerivedFrom reports whether left and right have at least one
// DerivedFrom reference in common. It is false when either list is empty.
func sharesDerivedFrom(left model.Segment, right model.Segment) bool {
	if len(left.DerivedFrom) == 0 || len(right.DerivedFrom) == 0 {
		return false
	}

	seen := make(map[string]struct{}, len(left.DerivedFrom))
	for _, ref := range left.DerivedFrom {
		seen[ref] = struct{}{}
	}
	for _, ref := range right.DerivedFrom {
		if _, exists := seen[ref]; exists {
			return true
		}
	}
	return false
}

// unionDerivedFrom returns the distinct DerivedFrom references of segments in
// ascending order. The result is never nil.
func unionDerivedFrom(segments []model.Segment) []string {
	seen := make(map[string]struct{})
	refs := make([]string, 0)
	for _, segment := range segments {
		for _, ref := range segment.DerivedFrom {
			if _, exists := seen[ref]; exists {
				continue
			}
			seen[ref] = struct{}{}
			refs = append(refs, ref)
		}
	}
	sort.Strings(refs)
	return refs
}