Fixed a text duplication bug in the resolve-overlaps module

This commit is contained in:
2026-04-29 07:00:18 -05:00
parent cc80a123ef
commit cc02a7a01e
2 changed files with 60 additions and 3 deletions

View File

@@ -36,6 +36,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
overlapRefs[ref] = struct{}{} overlapRefs[ref] = struct{}{}
} }
} }
claimedContextRefs := make(map[string]struct{})
removeRefs := make(map[string]struct{}) removeRefs := make(map[string]struct{})
clearAnnotationRefs := make(map[string]struct{}) clearAnnotationRefs := make(map[string]struct{})
@@ -44,7 +45,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
replacementOrder := make(map[string]replacementOrder) replacementOrder := make(map[string]replacementOrder)
for _, group := range in.OverlapGroups { for _, group := range in.OverlapGroups {
resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, wordRunGap, wordRunReorderWindow, contextWindow) resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, claimedContextRefs, wordRunGap, wordRunReorderWindow, contextWindow)
if err != nil { if err != nil {
return model.MergedTranscript{}, ResolutionSummary{}, err return model.MergedTranscript{}, ResolutionSummary{}, err
} }
@@ -58,6 +59,9 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
for sourceRef, order := range resolved.replacementOrder { for sourceRef, order := range resolved.replacementOrder {
replacementOrder[sourceRef] = order replacementOrder[sourceRef] = order
} }
for _, ref := range resolved.contextRefs {
claimedContextRefs[ref] = struct{}{}
}
for _, ref := range group.Segments { for _, ref := range group.Segments {
clearAnnotationRefs[ref] = struct{}{} clearAnnotationRefs[ref] = struct{}{}
@@ -107,6 +111,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
type resolvedGroup struct { type resolvedGroup struct {
removeRefs []string removeRefs []string
contextRefs []string
replacements []model.Segment replacements []model.Segment
replacementOrder map[string]replacementOrder replacementOrder map[string]replacementOrder
} }
@@ -131,9 +136,10 @@ type wordRun struct {
end float64 end float64
} }
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) { func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, claimedContextRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) {
segmentsBySpeaker := make(map[string][]model.Segment) segmentsBySpeaker := make(map[string][]model.Segment)
refsBySpeaker := make(map[string][]string) refsBySpeaker := make(map[string][]string)
contextRefs := make([]string, 0)
groupRefs := make(map[string]struct{}, len(group.Segments)) groupRefs := make(map[string]struct{}, len(group.Segments))
groupSpeakers := make(map[string]struct{}) groupSpeakers := make(map[string]struct{})
for _, ref := range group.Segments { for _, ref := range group.Segments {
@@ -154,6 +160,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
if _, exists := overlapRefs[ref]; exists { if _, exists := overlapRefs[ref]; exists {
continue continue
} }
if _, exists := claimedContextRefs[ref]; exists {
continue
}
if _, exists := groupSpeakers[segment.Speaker]; !exists { if _, exists := groupSpeakers[segment.Speaker]; !exists {
continue continue
} }
@@ -166,6 +175,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
} }
segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment) segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment)
refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref) refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref)
if _, exists := groupRefs[ref]; !exists {
contextRefs = append(contextRefs, ref)
}
} }
speakers := groupSpeakerOrder(group, segmentsBySpeaker) speakers := groupSpeakerOrder(group, segmentsBySpeaker)
@@ -187,6 +199,7 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run)) resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run))
} }
} }
resolved.contextRefs = uniqueStrings(contextRefs)
resolved.replacements, resolved.replacementOrder = reorderReplacementSegments(group.ID, resolved.replacements, wordRunReorderWindow) resolved.replacements, resolved.replacementOrder = reorderReplacementSegments(group.ID, resolved.replacements, wordRunReorderWindow)
return resolved, nil return resolved, nil
@@ -466,3 +479,16 @@ func uniqueSortedStrings(values []string) []string {
sort.Strings(unique) sort.Strings(unique)
return unique return unique
} }
// uniqueStrings returns the distinct entries of values in the order each one
// first appears. The result is always a non-nil slice, even for empty input.
func uniqueStrings(values []string) []string {
	result := make([]string, 0, len(values))
	visited := make(map[string]struct{}, len(values))
	for i := range values {
		candidate := values[i]
		if _, duplicate := visited[candidate]; !duplicate {
			// First sighting: record it and keep it in encounter order.
			visited[candidate] = struct{}{}
			result = append(result, candidate)
		}
	}
	return result
}

View File

@@ -333,7 +333,7 @@ func TestResolveSkipsContextSegmentReferencedByAnotherOverlapGroup(t *testing.T)
"c.json#0": {}, "c.json#0": {},
} }
resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, 10, 0.4, 3) resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, map[string]struct{}{}, 10, 0.4, 3)
if err != nil { if err != nil {
t.Fatalf("resolve failed: %v", err) t.Fatalf("resolve failed: %v", err)
} }
@@ -597,6 +597,37 @@ func TestResolveDoesNotReorderWordRunsOutsideWindow(t *testing.T) {
} }
} }
// TestResolveDoesNotReuseContextSegmentAcrossGroups runs Resolve over two
// overlap groups that both list Alice as a speaker, with an Alice segment
// ("shared", a.json#1) that belongs to neither group's segment list. It then
// checks that exactly one segment with the text "shared" is present in the
// resolved transcript.
func TestResolveDoesNotReuseContextSegmentAcrossGroups(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 1.0, 1.1, word("alpha", 1.0, 1.05)),
		segmentWithWords("b.json", 0, "Bob", 1.05, 1.15, word("beta", 1.05, 1.1)),
		// Not a member of either overlap group below.
		segmentWithWords("a.json", 1, "Alice", 2.0, 2.1, word("shared", 2.0, 2.05)),
		segmentWithWords("c.json", 0, "Carol", 4.0, 4.1, word("gamma", 4.0, 4.05)),
		segmentWithWords("a.json", 2, "Alice", 4.2, 4.3, word("delta", 4.2, 4.25)),
	}
	overlaps := []model.OverlapGroup{
		group(1, 1.0, 1.15, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		group(2, 4.0, 4.3, []string{"c.json#0", "a.json#2"}, []string{"Carol", "Alice"}),
	}
	merged := model.MergedTranscript{
		Segments:      segments,
		OverlapGroups: overlaps,
	}

	got, _, err := Resolve(merged, 0.75, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	// Count how many output segments carry the "shared" text.
	occurrences := 0
	for i := range got.Segments {
		if got.Segments[i].Text == "shared" {
			occurrences++
		}
	}
	if occurrences != 1 {
		t.Fatalf("shared context segment was reused %d time(s); want 1", occurrences)
	}
}
func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) { func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) {
merged := model.MergedTranscript{ merged := model.MergedTranscript{
Segments: []model.Segment{ Segments: []model.Segment{