262 lines
6.5 KiB
Go
262 lines
6.5 KiB
Go
package danglers
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
|
)
|
|
|
|
// Summary holds the counters reported by a resolve-danglers pass.
type Summary struct {
	// DanglersMerged is the number of fragment segments that were folded
	// into another segment and removed from the transcript.
	DanglersMerged int
	// TargetsChanged is the number of segments that were rewritten because
	// they absorbed at least one fragment.
	TargetsChanged int
}
|
|
|
|
// Apply merges tiny derived fragments back into matching provenance targets.
|
|
func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
|
if len(in.Segments) < 2 {
|
|
return in, Summary{}
|
|
}
|
|
|
|
afterEnds, endSummary := resolveDanglingEnds(in)
|
|
afterStarts, startSummary := resolveDanglingStarts(afterEnds)
|
|
return afterStarts, Summary{
|
|
DanglersMerged: endSummary.DanglersMerged + startSummary.DanglersMerged,
|
|
TargetsChanged: countResolvedTargets(afterStarts),
|
|
}
|
|
}
|
|
|
|
func countResolvedTargets(in model.MergedTranscript) int {
|
|
count := 0
|
|
for _, segment := range in.Segments {
|
|
if strings.HasPrefix(segment.SourceRef, "resolve-danglers:") {
|
|
count++
|
|
}
|
|
}
|
|
return count
|
|
}
|
|
|
|
func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
|
consumed := make([]bool, len(in.Segments))
|
|
builders := make(map[int]*builder)
|
|
merged := 0
|
|
|
|
for index, segment := range in.Segments {
|
|
if consumed[index] || !isDanglingEnd(segment) {
|
|
continue
|
|
}
|
|
target := nearestPriorMatch(in.Segments, consumed, index)
|
|
if target < 0 {
|
|
continue
|
|
}
|
|
builderFor(builders, target, in.Segments[target]).appendEnd(segment)
|
|
consumed[index] = true
|
|
merged++
|
|
}
|
|
|
|
return buildResult(in, consumed, builders, merged)
|
|
}
|
|
|
|
func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
|
consumed := make([]bool, len(in.Segments))
|
|
builders := make(map[int]*builder)
|
|
merged := 0
|
|
|
|
for index, segment := range in.Segments {
|
|
if consumed[index] || !isDanglingStart(segment) {
|
|
continue
|
|
}
|
|
target := nearestSubsequentMatch(in.Segments, consumed, index)
|
|
if target < 0 {
|
|
continue
|
|
}
|
|
builderFor(builders, target, in.Segments[target]).prependStart(segment)
|
|
consumed[index] = true
|
|
merged++
|
|
}
|
|
|
|
return buildResult(in, consumed, builders, merged)
|
|
}
|
|
|
|
func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) {
|
|
if merged == 0 {
|
|
return in, Summary{}
|
|
}
|
|
|
|
builderIDs := make([]int, 0, len(builders))
|
|
for index := range builders {
|
|
builderIDs = append(builderIDs, index)
|
|
}
|
|
sort.Ints(builderIDs)
|
|
for id, index := range builderIDs {
|
|
builders[index].sourceRef = fmt.Sprintf("resolve-danglers:%d", id+1)
|
|
}
|
|
|
|
out := model.MergedTranscript{
|
|
Segments: make([]model.Segment, 0, len(in.Segments)-merged),
|
|
OverlapGroups: in.OverlapGroups,
|
|
}
|
|
for index, segment := range in.Segments {
|
|
if consumed[index] {
|
|
continue
|
|
}
|
|
if builder, exists := builders[index]; exists {
|
|
out.Segments = append(out.Segments, builder.segment())
|
|
continue
|
|
}
|
|
out.Segments = append(out.Segments, segment)
|
|
}
|
|
|
|
return out, Summary{
|
|
DanglersMerged: merged,
|
|
TargetsChanged: len(builders),
|
|
}
|
|
}
|
|
|
|
type builder struct {
|
|
target model.Segment
|
|
prefixes []model.Segment
|
|
suffixes []model.Segment
|
|
sourceRef string
|
|
}
|
|
|
|
func builderFor(builders map[int]*builder, index int, target model.Segment) *builder {
|
|
if existing, exists := builders[index]; exists {
|
|
return existing
|
|
}
|
|
builders[index] = &builder{target: target}
|
|
return builders[index]
|
|
}
|
|
|
|
func (b *builder) appendEnd(segment model.Segment) {
|
|
b.suffixes = append(b.suffixes, segment)
|
|
}
|
|
|
|
func (b *builder) prependStart(segment model.Segment) {
|
|
b.prefixes = append(b.prefixes, segment)
|
|
}
|
|
|
|
func (b builder) segment() model.Segment {
|
|
parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes))
|
|
for index := len(b.prefixes) - 1; index >= 0; index-- {
|
|
parts = append(parts, b.prefixes[index])
|
|
}
|
|
parts = append(parts, b.target)
|
|
parts = append(parts, b.suffixes...)
|
|
|
|
merged := model.Segment{
|
|
Source: parts[0].Source,
|
|
SourceRef: b.sourceRef,
|
|
DerivedFrom: unionDerivedFrom(parts),
|
|
Speaker: b.target.Speaker,
|
|
Start: parts[0].Start,
|
|
End: parts[0].End,
|
|
Categories: append([]string(nil), b.target.Categories...),
|
|
Words: make([]model.Word, 0),
|
|
}
|
|
|
|
text := make([]string, 0, len(parts))
|
|
for _, part := range parts {
|
|
if part.Source != merged.Source {
|
|
merged.Source = "derived"
|
|
}
|
|
if part.Start < merged.Start {
|
|
merged.Start = part.Start
|
|
}
|
|
if part.End > merged.End {
|
|
merged.End = part.End
|
|
}
|
|
if trimmed := strings.TrimSpace(part.Text); trimmed != "" {
|
|
text = append(text, trimmed)
|
|
}
|
|
merged.Words = append(merged.Words, part.Words...)
|
|
}
|
|
merged.Text = strings.Join(text, " ")
|
|
return merged
|
|
}
|
|
|
|
func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int {
|
|
for candidate := index - 1; candidate >= 0; candidate-- {
|
|
if consumed[candidate] {
|
|
continue
|
|
}
|
|
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
|
return candidate
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int {
|
|
for candidate := index + 1; candidate < len(segments); candidate++ {
|
|
if consumed[candidate] {
|
|
continue
|
|
}
|
|
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
|
return candidate
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
func isDanglingEnd(segment model.Segment) bool {
|
|
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 && endsWithPunctuation(segment.Text)
|
|
}
|
|
|
|
func isDanglingStart(segment model.Segment) bool {
|
|
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2
|
|
}
|
|
|
|
func hasDerivedFrom(segment model.Segment) bool {
|
|
return len(segment.DerivedFrom) > 0
|
|
}
|
|
|
|
// wordCount returns the number of whitespace-separated words in text.
func wordCount(text string) int {
	// strings.Fields already ignores leading and trailing whitespace.
	words := strings.Fields(text)
	return len(words)
}
|
|
|
|
// endsWithPunctuation reports whether the last rune of text, after trimming
// surrounding whitespace, is a Unicode punctuation character. Empty text and
// text ending in an undecodable byte yield false.
func endsWithPunctuation(text string) bool {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return false
	}
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	if last == utf8.RuneError {
		return false
	}
	return unicode.IsPunct(last)
}
|
|
|
|
func sharesDerivedFrom(left model.Segment, right model.Segment) bool {
|
|
if len(left.DerivedFrom) == 0 || len(right.DerivedFrom) == 0 {
|
|
return false
|
|
}
|
|
seen := make(map[string]struct{}, len(left.DerivedFrom))
|
|
for _, ref := range left.DerivedFrom {
|
|
seen[ref] = struct{}{}
|
|
}
|
|
for _, ref := range right.DerivedFrom {
|
|
if _, exists := seen[ref]; exists {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func unionDerivedFrom(segments []model.Segment) []string {
|
|
seen := make(map[string]struct{})
|
|
refs := make([]string, 0)
|
|
for _, segment := range segments {
|
|
for _, ref := range segment.DerivedFrom {
|
|
if _, exists := seen[ref]; exists {
|
|
continue
|
|
}
|
|
seen[ref] = struct{}{}
|
|
refs = append(refs, ref)
|
|
}
|
|
}
|
|
sort.Strings(refs)
|
|
return refs
|
|
}
|