Fixed a text duplication bug in the resolve-overlaps module
This commit is contained in:
@@ -36,6 +36,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
|
|||||||
overlapRefs[ref] = struct{}{}
|
overlapRefs[ref] = struct{}{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
claimedContextRefs := make(map[string]struct{})
|
||||||
|
|
||||||
removeRefs := make(map[string]struct{})
|
removeRefs := make(map[string]struct{})
|
||||||
clearAnnotationRefs := make(map[string]struct{})
|
clearAnnotationRefs := make(map[string]struct{})
|
||||||
@@ -44,7 +45,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
|
|||||||
replacementOrder := make(map[string]replacementOrder)
|
replacementOrder := make(map[string]replacementOrder)
|
||||||
|
|
||||||
for _, group := range in.OverlapGroups {
|
for _, group := range in.OverlapGroups {
|
||||||
resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, wordRunGap, wordRunReorderWindow, contextWindow)
|
resolved, err := resolveGroup(in, group, refToIndex, overlapRefs, claimedContextRefs, wordRunGap, wordRunReorderWindow, contextWindow)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return model.MergedTranscript{}, ResolutionSummary{}, err
|
return model.MergedTranscript{}, ResolutionSummary{}, err
|
||||||
}
|
}
|
||||||
@@ -58,6 +59,9 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
|
|||||||
for sourceRef, order := range resolved.replacementOrder {
|
for sourceRef, order := range resolved.replacementOrder {
|
||||||
replacementOrder[sourceRef] = order
|
replacementOrder[sourceRef] = order
|
||||||
}
|
}
|
||||||
|
for _, ref := range resolved.contextRefs {
|
||||||
|
claimedContextRefs[ref] = struct{}{}
|
||||||
|
}
|
||||||
|
|
||||||
for _, ref := range group.Segments {
|
for _, ref := range group.Segments {
|
||||||
clearAnnotationRefs[ref] = struct{}{}
|
clearAnnotationRefs[ref] = struct{}{}
|
||||||
@@ -107,6 +111,7 @@ func Resolve(in model.MergedTranscript, wordRunGap float64, wordRunReorderWindow
|
|||||||
|
|
||||||
type resolvedGroup struct {
|
type resolvedGroup struct {
|
||||||
removeRefs []string
|
removeRefs []string
|
||||||
|
contextRefs []string
|
||||||
replacements []model.Segment
|
replacements []model.Segment
|
||||||
replacementOrder map[string]replacementOrder
|
replacementOrder map[string]replacementOrder
|
||||||
}
|
}
|
||||||
@@ -131,9 +136,10 @@ type wordRun struct {
|
|||||||
end float64
|
end float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) {
|
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, overlapRefs map[string]struct{}, claimedContextRefs map[string]struct{}, wordRunGap float64, wordRunReorderWindow float64, contextWindow float64) (resolvedGroup, error) {
|
||||||
segmentsBySpeaker := make(map[string][]model.Segment)
|
segmentsBySpeaker := make(map[string][]model.Segment)
|
||||||
refsBySpeaker := make(map[string][]string)
|
refsBySpeaker := make(map[string][]string)
|
||||||
|
contextRefs := make([]string, 0)
|
||||||
groupRefs := make(map[string]struct{}, len(group.Segments))
|
groupRefs := make(map[string]struct{}, len(group.Segments))
|
||||||
groupSpeakers := make(map[string]struct{})
|
groupSpeakers := make(map[string]struct{})
|
||||||
for _, ref := range group.Segments {
|
for _, ref := range group.Segments {
|
||||||
@@ -154,6 +160,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
|
|||||||
if _, exists := overlapRefs[ref]; exists {
|
if _, exists := overlapRefs[ref]; exists {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if _, exists := claimedContextRefs[ref]; exists {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if _, exists := groupSpeakers[segment.Speaker]; !exists {
|
if _, exists := groupSpeakers[segment.Speaker]; !exists {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -166,6 +175,9 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
|
|||||||
}
|
}
|
||||||
segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment)
|
segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment)
|
||||||
refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref)
|
refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref)
|
||||||
|
if _, exists := groupRefs[ref]; !exists {
|
||||||
|
contextRefs = append(contextRefs, ref)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
speakers := groupSpeakerOrder(group, segmentsBySpeaker)
|
speakers := groupSpeakerOrder(group, segmentsBySpeaker)
|
||||||
@@ -187,6 +199,7 @@ func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToInde
|
|||||||
resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run))
|
resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
resolved.contextRefs = uniqueStrings(contextRefs)
|
||||||
|
|
||||||
resolved.replacements, resolved.replacementOrder = reorderReplacementSegments(group.ID, resolved.replacements, wordRunReorderWindow)
|
resolved.replacements, resolved.replacementOrder = reorderReplacementSegments(group.ID, resolved.replacements, wordRunReorderWindow)
|
||||||
return resolved, nil
|
return resolved, nil
|
||||||
@@ -466,3 +479,16 @@ func uniqueSortedStrings(values []string) []string {
|
|||||||
sort.Strings(unique)
|
sort.Strings(unique)
|
||||||
return unique
|
return unique
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func uniqueStrings(values []string) []string {
|
||||||
|
seen := make(map[string]struct{}, len(values))
|
||||||
|
unique := make([]string, 0, len(values))
|
||||||
|
for _, value := range values {
|
||||||
|
if _, exists := seen[value]; exists {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[value] = struct{}{}
|
||||||
|
unique = append(unique, value)
|
||||||
|
}
|
||||||
|
return unique
|
||||||
|
}
|
||||||
|
|||||||
@@ -333,7 +333,7 @@ func TestResolveSkipsContextSegmentReferencedByAnotherOverlapGroup(t *testing.T)
|
|||||||
"c.json#0": {},
|
"c.json#0": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, 10, 0.4, 3)
|
resolved, err := resolveGroup(merged, merged.OverlapGroups[0], refToIndex, overlapRefs, map[string]struct{}{}, 10, 0.4, 3)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("resolve failed: %v", err)
|
t.Fatalf("resolve failed: %v", err)
|
||||||
}
|
}
|
||||||
@@ -597,6 +597,37 @@ func TestResolveDoesNotReorderWordRunsOutsideWindow(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveDoesNotReuseContextSegmentAcrossGroups(t *testing.T) {
|
||||||
|
merged := model.MergedTranscript{
|
||||||
|
Segments: []model.Segment{
|
||||||
|
segmentWithWords("a.json", 0, "Alice", 1.0, 1.1, word("alpha", 1.0, 1.05)),
|
||||||
|
segmentWithWords("b.json", 0, "Bob", 1.05, 1.15, word("beta", 1.05, 1.1)),
|
||||||
|
segmentWithWords("a.json", 1, "Alice", 2.0, 2.1, word("shared", 2.0, 2.05)),
|
||||||
|
segmentWithWords("c.json", 0, "Carol", 4.0, 4.1, word("gamma", 4.0, 4.05)),
|
||||||
|
segmentWithWords("a.json", 2, "Alice", 4.2, 4.3, word("delta", 4.2, 4.25)),
|
||||||
|
},
|
||||||
|
OverlapGroups: []model.OverlapGroup{
|
||||||
|
group(1, 1.0, 1.15, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}),
|
||||||
|
group(2, 4.0, 4.3, []string{"c.json#0", "a.json#2"}, []string{"Carol", "Alice"}),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, _, err := Resolve(merged, 0.75, 0.4, 3)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolve failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
sharedCount := 0
|
||||||
|
for _, segment := range got.Segments {
|
||||||
|
if segment.Text == "shared" {
|
||||||
|
sharedCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if sharedCount != 1 {
|
||||||
|
t.Fatalf("shared context segment was reused %d time(s); want 1", sharedCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) {
|
func TestResolveReordersTransitiveNearStartClustersByDuration(t *testing.T) {
|
||||||
merged := model.MergedTranscript{
|
merged := model.MergedTranscript{
|
||||||
Segments: []model.Segment{
|
Segments: []model.Segment{
|
||||||
|
|||||||
Reference in New Issue
Block a user