Implemented a new internal/danglers package with deterministic two-pass dangling-end then dangling-start resolution

This commit is contained in:
2026-04-28 15:38:16 -05:00
parent 47b6727973
commit f1ce35dfc3
8 changed files with 602 additions and 8 deletions

View File

@@ -0,0 +1,261 @@
package danglers
import (
"fmt"
"sort"
"strings"
"unicode"
"unicode/utf8"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// Summary records deterministic counters for a resolve-danglers pass.
type Summary struct {
// DanglersMerged is the number of fragment segments that were folded into
// another segment and therefore removed from the output.
DanglersMerged int
// TargetsChanged is the number of target segments reported as rewritten.
TargetsChanged int
}
// Apply runs both resolution passes over in: dangling ends are folded into
// earlier segments first, then dangling starts are folded into later segments
// of that intermediate transcript.
//
// A transcript with fewer than two segments is returned untouched together
// with a zero Summary. In the returned Summary, DanglersMerged is the sum of
// both passes and TargetsChanged is the number of segments in the final
// transcript whose SourceRef begins with "resolve-danglers:".
//
// NOTE(review): that count also includes segments that already carried such a
// SourceRef on input, and each pass numbers its own refs starting from 1 —
// confirm both are intended.
func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	if len(in.Segments) < 2 {
		return in, Summary{}
	}

	endsResolved, endStats := resolveDanglingEnds(in)
	result, startStats := resolveDanglingStarts(endsResolved)

	stats := Summary{
		DanglersMerged: endStats.DanglersMerged + startStats.DanglersMerged,
		TargetsChanged: countResolvedTargets(result),
	}
	return result, stats
}
// countResolvedTargets returns how many segments of in carry a SourceRef that
// starts with "resolve-danglers:".
func countResolvedTargets(in model.MergedTranscript) int {
	total := 0
	for i := range in.Segments {
		if !strings.HasPrefix(in.Segments[i].SourceRef, "resolve-danglers:") {
			continue
		}
		total++
	}
	return total
}
// resolveDanglingEnds folds every dangling-end fragment into the nearest
// earlier, not-yet-consumed segment that shares provenance with it.
//
// Segments are visited front to back, so a chosen target has always been
// visited already and cannot be consumed later in the same pass. Fragments
// without a matching earlier segment are left in place.
func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	var (
		consumed = make([]bool, len(in.Segments))
		builders = make(map[int]*builder)
		merged   int
	)

	for i := range in.Segments {
		if consumed[i] {
			continue
		}
		fragment := in.Segments[i]
		if !isDanglingEnd(fragment) {
			continue
		}

		target := nearestPriorMatch(in.Segments, consumed, i)
		if target < 0 {
			continue
		}

		dst := builderFor(builders, target, in.Segments[target])
		dst.appendEnd(fragment)
		consumed[i] = true
		merged++
	}

	return buildResult(in, consumed, builders, merged)
}
// resolveDanglingStarts folds every dangling-start fragment into the nearest
// later, not-yet-consumed segment that shares provenance with it.
//
// Segments are visited from last to first. This mirrors resolveDanglingEnds
// and matters for two reasons:
//
//   - A chosen target always has a higher index than the fragment, so it has
//     already been visited. If it was itself folded away it is marked consumed
//     and skipped by nearestSubsequentMatch, so a target that owns a builder is
//     never consumed afterwards. Visiting front to back would let fragment A
//     pick fragment B as its target and then consume B, and buildResult would
//     drop B's builder together with A's content.
//   - Fragments reach the builder nearest-first. builder.segment walks the
//     prefixes in reverse, so the merged segment lists them in transcript
//     order.
//
// Fragments without a matching later segment are left in place.
func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) {
	consumed := make([]bool, len(in.Segments))
	builders := make(map[int]*builder)
	merged := 0
	for index := len(in.Segments) - 1; index >= 0; index-- {
		segment := in.Segments[index]
		if !isDanglingStart(segment) {
			continue
		}
		target := nearestSubsequentMatch(in.Segments, consumed, index)
		if target < 0 {
			continue
		}
		builderFor(builders, target, in.Segments[target]).prependStart(segment)
		consumed[index] = true
		merged++
	}
	return buildResult(in, consumed, builders, merged)
}
// buildResult assembles the transcript produced by one resolution pass.
//
// consumed marks the input indexes that were folded into another segment and
// are omitted from the output. builders maps a target index to the builder
// holding its absorbed fragments. merged is the number of consumed fragments.
//
// When merged is zero the input is returned unchanged with a zero Summary.
// Otherwise every builder receives a SourceRef of the form
// "resolve-danglers:N", with N assigned from 1 in ascending target-index order,
// and each surviving target is replaced by its builder's merged segment.
// OverlapGroups is carried over from the input as-is.
func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) {
	if merged == 0 {
		return in, Summary{}
	}

	// Map iteration order is random; sort the target indexes so the numbering
	// is reproducible.
	targets := make([]int, 0, len(builders))
	for idx := range builders {
		targets = append(targets, idx)
	}
	sort.Ints(targets)
	for ordinal, idx := range targets {
		builders[idx].sourceRef = fmt.Sprintf("resolve-danglers:%d", ordinal+1)
	}

	kept := make([]model.Segment, 0, len(in.Segments)-merged)
	for idx := range in.Segments {
		if consumed[idx] {
			continue
		}
		next := in.Segments[idx]
		if b, ok := builders[idx]; ok {
			next = b.segment()
		}
		kept = append(kept, next)
	}

	result := model.MergedTranscript{
		Segments:      kept,
		OverlapGroups: in.OverlapGroups,
	}
	stats := Summary{
		DanglersMerged: merged,
		TargetsChanged: len(builders),
	}
	return result, stats
}
// builder accumulates the fragments being folded into one target segment.
type builder struct {
// target is the segment that absorbs the fragments.
target model.Segment
// prefixes holds fragments placed before target, in the order they were
// added; segment() walks this slice backwards.
prefixes []model.Segment
// suffixes holds fragments placed after target, in the order they were added.
suffixes []model.Segment
// sourceRef is copied into the merged segment's SourceRef; buildResult sets it.
sourceRef string
}
// builderFor returns the builder registered under index, creating and
// registering one around target on first use. An existing builder is returned
// as-is; target is ignored in that case.
func builderFor(builders map[int]*builder, index int, target model.Segment) *builder {
	b, ok := builders[index]
	if !ok {
		b = &builder{target: target}
		builders[index] = b
	}
	return b
}
// appendEnd records segment as a fragment that follows the target. Suffixes
// are emitted by segment() in the order they were added.
func (b *builder) appendEnd(segment model.Segment) {
	b.suffixes = append(b.suffixes, segment)
}
// prependStart records segment as a fragment that precedes the target.
// Prefixes are stored in call order and segment() walks them backwards, so the
// most recently added prefix ends up first in the merged segment.
func (b *builder) prependStart(segment model.Segment) {
	b.prefixes = append(b.prefixes, segment)
}
// segment materialises the merged segment for this builder.
//
// The parts are laid out as: prefixes in reverse insertion order, the target,
// then suffixes in insertion order. From those parts:
//   - Source is the first part's Source, or "derived" once any part disagrees;
//   - Start/End span the smallest Start and largest End of all parts;
//   - Text joins the non-blank, whitespace-trimmed texts with single spaces;
//   - Words concatenates every part's Words in part order;
//   - DerivedFrom is the union computed by unionDerivedFrom.
//
// Speaker and a copy of Categories come from the target alone.
func (b builder) segment() model.Segment {
	parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes))
	for i := range b.prefixes {
		parts = append(parts, b.prefixes[len(b.prefixes)-1-i])
	}
	parts = append(parts, b.target)
	parts = append(parts, b.suffixes...)

	first := parts[0]
	source, start, end := first.Source, first.Start, first.End
	words := make([]model.Word, 0)
	texts := make([]string, 0, len(parts))

	for _, part := range parts {
		if part.Source != source {
			source = "derived"
		}
		if part.Start < start {
			start = part.Start
		}
		if part.End > end {
			end = part.End
		}
		if trimmed := strings.TrimSpace(part.Text); trimmed != "" {
			texts = append(texts, trimmed)
		}
		words = append(words, part.Words...)
	}

	return model.Segment{
		Source:      source,
		SourceRef:   b.sourceRef,
		DerivedFrom: unionDerivedFrom(parts),
		Speaker:     b.target.Speaker,
		Start:       start,
		End:         end,
		Text:        strings.Join(texts, " "),
		Categories:  append([]string(nil), b.target.Categories...),
		Words:       words,
	}
}
// nearestPriorMatch scans backwards from index-1 and returns the index of the
// closest segment that is not consumed and shares a DerivedFrom reference with
// segments[index]. It returns -1 when no such segment exists.
func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int {
	for at := index - 1; at >= 0; at-- {
		if !consumed[at] && sharesDerivedFrom(segments[index], segments[at]) {
			return at
		}
	}
	return -1
}
// nearestSubsequentMatch scans forwards from index+1 and returns the index of
// the closest segment that is not consumed and shares a DerivedFrom reference
// with segments[index]. It returns -1 when no such segment exists.
func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int {
	for at := index + 1; at < len(segments); at++ {
		if !consumed[at] && sharesDerivedFrom(segments[index], segments[at]) {
			return at
		}
	}
	return -1
}
// isDanglingEnd reports whether segment has provenance, contains at most two
// words, and its text ends with a punctuation character.
func isDanglingEnd(segment model.Segment) bool {
	if !hasDerivedFrom(segment) {
		return false
	}
	if wordCount(segment.Text) > 2 {
		return false
	}
	return endsWithPunctuation(segment.Text)
}
// isDanglingStart reports whether segment has provenance and contains at most
// two words. Unlike isDanglingEnd it places no condition on punctuation.
func isDanglingStart(segment model.Segment) bool {
	if !hasDerivedFrom(segment) {
		return false
	}
	return wordCount(segment.Text) <= 2
}
// hasDerivedFrom reports whether segment lists at least one DerivedFrom
// reference.
func hasDerivedFrom(segment model.Segment) bool {
	return len(segment.DerivedFrom) != 0
}
// wordCount returns the number of whitespace-separated words in text. Leading
// and trailing whitespace is ignored, so blank input yields zero.
func wordCount(text string) int {
	// strings.Fields already discards leading and trailing whitespace.
	words := strings.Fields(text)
	return len(words)
}
// endsWithPunctuation reports whether the last rune of text, after surrounding
// whitespace is trimmed, is a Unicode punctuation character. Blank text and
// text ending in an undecodable byte sequence yield false.
func endsWithPunctuation(text string) bool {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	if last == utf8.RuneError {
		return false
	}
	return unicode.IsPunct(last)
}
// sharesDerivedFrom reports whether left and right have at least one
// DerivedFrom reference in common. A segment with no references never matches.
func sharesDerivedFrom(left model.Segment, right model.Segment) bool {
	if len(left.DerivedFrom) == 0 {
		return false
	}
	if len(right.DerivedFrom) == 0 {
		return false
	}

	known := make(map[string]struct{}, len(left.DerivedFrom))
	for i := range left.DerivedFrom {
		known[left.DerivedFrom[i]] = struct{}{}
	}
	for i := range right.DerivedFrom {
		if _, ok := known[right.DerivedFrom[i]]; ok {
			return true
		}
	}
	return false
}
// unionDerivedFrom returns every distinct DerivedFrom reference found across
// segments, sorted ascending. The result is an empty, non-nil slice when there
// are no references.
func unionDerivedFrom(segments []model.Segment) []string {
	unique := make(map[string]struct{})
	for i := range segments {
		for _, ref := range segments[i].DerivedFrom {
			unique[ref] = struct{}{}
		}
	}

	refs := make([]string, 0, len(unique))
	for ref := range unique {
		refs = append(refs, ref)
	}
	// Map iteration order is random; sorting makes the output deterministic.
	sort.Strings(refs)
	return refs
}