Implemented a new internal/danglers package with deterministic two-pass dangling-end then dangling-start resolution
This commit is contained in:
261
internal/danglers/danglers.go
Normal file
261
internal/danglers/danglers.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package danglers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
// Summary reports the counters produced by a resolve-danglers pass.
type Summary struct {
	// DanglersMerged is the number of fragment segments that were folded
	// into another segment and removed from the transcript.
	DanglersMerged int

	// TargetsChanged is the number of segments that absorbed at least one
	// fragment.
	TargetsChanged int
}
|
||||
|
||||
// Apply merges tiny derived fragments back into matching provenance targets.
|
||||
func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
if len(in.Segments) < 2 {
|
||||
return in, Summary{}
|
||||
}
|
||||
|
||||
afterEnds, endSummary := resolveDanglingEnds(in)
|
||||
afterStarts, startSummary := resolveDanglingStarts(afterEnds)
|
||||
return afterStarts, Summary{
|
||||
DanglersMerged: endSummary.DanglersMerged + startSummary.DanglersMerged,
|
||||
TargetsChanged: countResolvedTargets(afterStarts),
|
||||
}
|
||||
}
|
||||
|
||||
func countResolvedTargets(in model.MergedTranscript) int {
|
||||
count := 0
|
||||
for _, segment := range in.Segments {
|
||||
if strings.HasPrefix(segment.SourceRef, "resolve-danglers:") {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
consumed := make([]bool, len(in.Segments))
|
||||
builders := make(map[int]*builder)
|
||||
merged := 0
|
||||
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] || !isDanglingEnd(segment) {
|
||||
continue
|
||||
}
|
||||
target := nearestPriorMatch(in.Segments, consumed, index)
|
||||
if target < 0 {
|
||||
continue
|
||||
}
|
||||
builderFor(builders, target, in.Segments[target]).appendEnd(segment)
|
||||
consumed[index] = true
|
||||
merged++
|
||||
}
|
||||
|
||||
return buildResult(in, consumed, builders, merged)
|
||||
}
|
||||
|
||||
func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
consumed := make([]bool, len(in.Segments))
|
||||
builders := make(map[int]*builder)
|
||||
merged := 0
|
||||
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] || !isDanglingStart(segment) {
|
||||
continue
|
||||
}
|
||||
target := nearestSubsequentMatch(in.Segments, consumed, index)
|
||||
if target < 0 {
|
||||
continue
|
||||
}
|
||||
builderFor(builders, target, in.Segments[target]).prependStart(segment)
|
||||
consumed[index] = true
|
||||
merged++
|
||||
}
|
||||
|
||||
return buildResult(in, consumed, builders, merged)
|
||||
}
|
||||
|
||||
func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) {
|
||||
if merged == 0 {
|
||||
return in, Summary{}
|
||||
}
|
||||
|
||||
builderIDs := make([]int, 0, len(builders))
|
||||
for index := range builders {
|
||||
builderIDs = append(builderIDs, index)
|
||||
}
|
||||
sort.Ints(builderIDs)
|
||||
for id, index := range builderIDs {
|
||||
builders[index].sourceRef = fmt.Sprintf("resolve-danglers:%d", id+1)
|
||||
}
|
||||
|
||||
out := model.MergedTranscript{
|
||||
Segments: make([]model.Segment, 0, len(in.Segments)-merged),
|
||||
OverlapGroups: in.OverlapGroups,
|
||||
}
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] {
|
||||
continue
|
||||
}
|
||||
if builder, exists := builders[index]; exists {
|
||||
out.Segments = append(out.Segments, builder.segment())
|
||||
continue
|
||||
}
|
||||
out.Segments = append(out.Segments, segment)
|
||||
}
|
||||
|
||||
return out, Summary{
|
||||
DanglersMerged: merged,
|
||||
TargetsChanged: len(builders),
|
||||
}
|
||||
}
|
||||
|
||||
type builder struct {
|
||||
target model.Segment
|
||||
prefixes []model.Segment
|
||||
suffixes []model.Segment
|
||||
sourceRef string
|
||||
}
|
||||
|
||||
func builderFor(builders map[int]*builder, index int, target model.Segment) *builder {
|
||||
if existing, exists := builders[index]; exists {
|
||||
return existing
|
||||
}
|
||||
builders[index] = &builder{target: target}
|
||||
return builders[index]
|
||||
}
|
||||
|
||||
func (b *builder) appendEnd(segment model.Segment) {
|
||||
b.suffixes = append(b.suffixes, segment)
|
||||
}
|
||||
|
||||
func (b *builder) prependStart(segment model.Segment) {
|
||||
b.prefixes = append(b.prefixes, segment)
|
||||
}
|
||||
|
||||
func (b builder) segment() model.Segment {
|
||||
parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes))
|
||||
for index := len(b.prefixes) - 1; index >= 0; index-- {
|
||||
parts = append(parts, b.prefixes[index])
|
||||
}
|
||||
parts = append(parts, b.target)
|
||||
parts = append(parts, b.suffixes...)
|
||||
|
||||
merged := model.Segment{
|
||||
Source: parts[0].Source,
|
||||
SourceRef: b.sourceRef,
|
||||
DerivedFrom: unionDerivedFrom(parts),
|
||||
Speaker: b.target.Speaker,
|
||||
Start: parts[0].Start,
|
||||
End: parts[0].End,
|
||||
Categories: append([]string(nil), b.target.Categories...),
|
||||
Words: make([]model.Word, 0),
|
||||
}
|
||||
|
||||
text := make([]string, 0, len(parts))
|
||||
for _, part := range parts {
|
||||
if part.Source != merged.Source {
|
||||
merged.Source = "derived"
|
||||
}
|
||||
if part.Start < merged.Start {
|
||||
merged.Start = part.Start
|
||||
}
|
||||
if part.End > merged.End {
|
||||
merged.End = part.End
|
||||
}
|
||||
if trimmed := strings.TrimSpace(part.Text); trimmed != "" {
|
||||
text = append(text, trimmed)
|
||||
}
|
||||
merged.Words = append(merged.Words, part.Words...)
|
||||
}
|
||||
merged.Text = strings.Join(text, " ")
|
||||
return merged
|
||||
}
|
||||
|
||||
func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int {
|
||||
for candidate := index - 1; candidate >= 0; candidate-- {
|
||||
if consumed[candidate] {
|
||||
continue
|
||||
}
|
||||
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
||||
return candidate
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int {
|
||||
for candidate := index + 1; candidate < len(segments); candidate++ {
|
||||
if consumed[candidate] {
|
||||
continue
|
||||
}
|
||||
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
||||
return candidate
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func isDanglingEnd(segment model.Segment) bool {
|
||||
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 && endsWithPunctuation(segment.Text)
|
||||
}
|
||||
|
||||
func isDanglingStart(segment model.Segment) bool {
|
||||
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2
|
||||
}
|
||||
|
||||
func hasDerivedFrom(segment model.Segment) bool {
|
||||
return len(segment.DerivedFrom) > 0
|
||||
}
|
||||
|
||||
// wordCount returns the number of whitespace-separated words in text.
// strings.Fields already ignores leading and trailing white space, so no
// separate trimming step is needed.
func wordCount(text string) int {
	words := strings.Fields(text)
	return len(words)
}
|
||||
|
||||
// endsWithPunctuation reports whether the last rune of text, ignoring
// surrounding white space, is in Unicode category P (punctuation). Empty or
// all-blank text and text whose last rune decodes to utf8.RuneError yield
// false.
func endsWithPunctuation(text string) bool {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	if last == utf8.RuneError {
		return false
	}
	return unicode.IsPunct(last)
}
|
||||
|
||||
func sharesDerivedFrom(left model.Segment, right model.Segment) bool {
|
||||
if len(left.DerivedFrom) == 0 || len(right.DerivedFrom) == 0 {
|
||||
return false
|
||||
}
|
||||
seen := make(map[string]struct{}, len(left.DerivedFrom))
|
||||
for _, ref := range left.DerivedFrom {
|
||||
seen[ref] = struct{}{}
|
||||
}
|
||||
for _, ref := range right.DerivedFrom {
|
||||
if _, exists := seen[ref]; exists {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func unionDerivedFrom(segments []model.Segment) []string {
|
||||
seen := make(map[string]struct{})
|
||||
refs := make([]string, 0)
|
||||
for _, segment := range segments {
|
||||
for _, ref := range segment.DerivedFrom {
|
||||
if _, exists := seen[ref]; exists {
|
||||
continue
|
||||
}
|
||||
seen[ref] = struct{}{}
|
||||
refs = append(refs, ref)
|
||||
}
|
||||
}
|
||||
sort.Strings(refs)
|
||||
return refs
|
||||
}
|
||||
Reference in New Issue
Block a user