Fix text duplication in the resolve-overlaps module: a context segment could be pulled into more than one overlap group

This commit is contained in:
2026-04-29 07:00:18 -05:00
parent cc80a123ef
commit cc02a7a01e
2 changed files with 60 additions and 3 deletions

View File

@@ -36,6 +36,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
overlapRefs[ref] = struct{}{}
}
}
claimedContextRefs := make(map[string]struct{})
removeRefs := make(map[string]struct{})
clearAnnotationRefs := make(map[string]struct{})
@@ -44,7 +45,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
replacementOrder := make(map[string]replacementOrder)
for _, group := range in.OverlapGroups {
resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, wordRunGap, wordRunReorderWindow, contextWindow)
resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, claimedContextRefs, wordRunGap, wordRunReorderWindow, contextWindow)
if err != nil {
return model.MergedTranscript{}, ResolutionSummary{}, err
}
@@ -58,6 +59,9 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
for sourceRef, order := range resolved.replacementOrder {
replacementOrder[sourceRef] = order
}
for _, ref := range resolved.contextRefs {
claimedContextRefs[ref] = struct{}{}
}
for _, ref := range group.Segments {
clearAnnotationRefs[ref] = struct{}{}
@@ -107,6 +111,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
// resolvedGroup is the result resolveGroup hands back to Resolve for a single
// overlap group; Resolve folds its fields into its own per-transcript maps.
type resolvedGroup struct {
// removeRefs is a list of segment refs.
// NOTE(review): presumably the refs Resolve drops from the transcript in
// favour of the replacements — the consuming code is not visible here; confirm.
removeRefs []string
// contextRefs holds, without duplicates, the refs of segments that
// resolveGroup accepted but that are not listed in group.Segments. Resolve
// copies them into claimedContextRefs so that groups resolved later skip them.
contextRefs []string
// replacements are the segments built by replacementSegment (one per
// speaker word run) after reorderReplacementSegments has been applied.
replacements []model.Segment
// replacementOrder is the ordering map returned by
// reorderReplacementSegments; Resolve merges it into its own map by key.
replacementOrder map[string]replacementOrder
}
@@ -131,9 +136,10 @@ type wordRun struct {
end float64
}
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) {
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, claimedContextRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) {
segmentsBySpeaker := make(map[string][]model.Segment)
refsBySpeaker := make(map[string][]string)
contextRefs := make([]string, 0)
groupRefs := make(map[string]struct{}, len(group.Segments))
groupSpeakers := make(map[string]struct{})
for _, ref := range group.Segments {
@@ -154,6 +160,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
if _, exists := overlapRefs[ref]; exists {
continue
}
if _, exists := claimedContextRefs[ref]; exists {
continue
}
if _, exists := groupSpeakers[segment.Speaker]; !exists {
continue
}
@@ -166,6 +175,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
}
segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment)
refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref)
if _, exists := groupRefs[ref]; !exists {
contextRefs = append(contextRefs, ref)
}
}
speakers := groupSpeakerOrder(group, segmentsBySpeaker)
@@ -187,6 +199,7 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run))
}
}
resolved.contextRefs = uniqueStrings(contextRefs)
resolved.replacements, resolved.replacementOrder = reorderReplacementSegments(group.ID, resolved.replacements, wordRunReorderWindow)
return resolved, nil
@@ -466,3 +479,16 @@ func uniqueSortedStrings(values []string) []string {
sort.Strings(unique)
return unique
}
// uniqueStrings returns the distinct entries of values in the order in which
// each one first appears. The input slice is not modified, and the result is
// always a non-nil slice (empty when values is empty or nil).
func uniqueStrings(values []string) []string {
	result := make([]string, 0, len(values))
	visited := make(map[string]struct{}, len(values))
	for i := range values {
		candidate := values[i]
		if _, duplicate := visited[candidate]; !duplicate {
			// First sighting: remember it and keep its position.
			visited[candidate] = struct{}{}
			result = append(result, candidate)
		}
	}
	return result
}

View File

@@ -333,7 +333,7 @@ func TestResolveSkipsContextSegmentReferencedByAnotherOverlapGroup(t *testing.T)
"c.json#0": {},
}
resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, 10, 0.4, 3)
resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, map[string]struct{}{}, 10, 0.4, 3)
if err != nil {
t.Fatalf("resolve failed: %v", err)
}
@@ -597,6 +597,37 @@ func TestResolveDoesNotReorderWordRunsOutsideWindow(t *testing.T) {
}
}
// TestResolveDoesNotReuseContextSegmentAcrossGroups runs Resolve on a
// transcript with two overlap groups that both involve Alice, with a
// non-overlapping Alice segment ("shared") sitting between them. The resolved
// transcript must contain the text "shared" exactly once.
func TestResolveDoesNotReuseContextSegmentAcrossGroups(t *testing.T) {
	segments := []model.Segment{
		segmentWithWords("a.json", 0, "Alice", 1.0, 1.1, word("alpha", 1.0, 1.05)),
		segmentWithWords("b.json", 0, "Bob", 1.05, 1.15, word("beta", 1.05, 1.1)),
		// Belongs to neither overlap group.
		segmentWithWords("a.json", 1, "Alice", 2.0, 2.1, word("shared", 2.0, 2.05)),
		segmentWithWords("c.json", 0, "Carol", 4.0, 4.1, word("gamma", 4.0, 4.05)),
		segmentWithWords("a.json", 2, "Alice", 4.2, 4.3, word("delta", 4.2, 4.25)),
	}
	overlapGroups := []model.OverlapGroup{
		group(1, 1.0, 1.15, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
		group(2, 4.0, 4.3, []string{"c.json#0", "a.json#2"}, []string{"Carol", "Alice"}),
	}
	input := model.MergedTranscript{
		Segments:      segments,
		OverlapGroups: overlapGroups,
	}

	output, _, err := Resolve(input, 0.75, 0.4, 3)
	if err != nil {
		t.Fatalf("resolve failed: %v", err)
	}

	occurrences := 0
	for i := range output.Segments {
		if output.Segments[i].Text != "shared" {
			continue
		}
		occurrences++
	}
	if occurrences != 1 {
		t.Fatalf("shared context segment was reused %d time(s); want 1", occurrences)
	}
}
func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) {
merged := model.MergedTranscript{
Segments: []model.Segment{