Implemented a new internal/danglers package with deterministic two-pass dangling-end then dangling-start resolution
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/backchannel"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/coalesce"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/danglers"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/filler"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/overlap"
|
||||
@@ -169,6 +170,27 @@ func (coalescePostprocessor) Process(ctx context.Context, in model.MergedTranscr
|
||||
}, nil
|
||||
}
|
||||
|
||||
type resolveDanglersPostprocessor struct{}
|
||||
|
||||
func (resolveDanglersPostprocessor) Name() string {
|
||||
return "resolve-danglers"
|
||||
}
|
||||
|
||||
func (resolveDanglersPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
out, summary := danglers.Apply(in)
|
||||
return out, []report.Event{
|
||||
report.Info(
|
||||
"postprocessing",
|
||||
"resolve-danglers",
|
||||
fmt.Sprintf("merged %d dangling segment(s) into %d target segment(s)", summary.DanglersMerged, summary.TargetsChanged),
|
||||
),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type autocorrectPostprocessor struct{}
|
||||
|
||||
func (autocorrectPostprocessor) Name() string {
|
||||
|
||||
@@ -16,6 +16,7 @@ func NewRegistry() *pipeline.Registry {
|
||||
registry.RegisterPostprocessor(backchannelPostprocessor{})
|
||||
registry.RegisterPostprocessor(fillerPostprocessor{})
|
||||
registry.RegisterPostprocessor(coalescePostprocessor{})
|
||||
registry.RegisterPostprocessor(resolveDanglersPostprocessor{})
|
||||
registry.RegisterPostprocessor(assignIDs{})
|
||||
registry.RegisterPostprocessor(validateOutput{})
|
||||
registry.RegisterPostprocessor(autocorrectPostprocessor{})
|
||||
|
||||
@@ -95,6 +95,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
||||
"resolve-overlaps",
|
||||
"backchannel",
|
||||
"filler",
|
||||
"resolve-danglers",
|
||||
"coalesce",
|
||||
"detect-overlaps",
|
||||
"autocorrect",
|
||||
@@ -701,6 +702,128 @@ func TestMergeCoalesceGapOverridePreventsMerge(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeResolveDanglersMergesDanglingEnd(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
"segments": [
|
||||
{
|
||||
"start": 1,
|
||||
"end": 4,
|
||||
"text": "main tail.",
|
||||
"words": [
|
||||
{"word": "main", "start": 1, "end": 1.1},
|
||||
{"word": "tail.", "start": 3, "end": 3.1}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||
"segments": [
|
||||
{
|
||||
"start": 1.5,
|
||||
"end": 2,
|
||||
"text": "interruption",
|
||||
"words": [
|
||||
{"word": "interruption", "start": 1.5, "end": 2}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||
- speaker: Alice
|
||||
match: ["a.json"]
|
||||
- speaker: Bob
|
||||
match: ["b.json"]
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
reportPath := filepath.Join(dir, "report.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", inputA,
|
||||
"--input-file", inputB,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
"--report-file", reportPath,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "main tail." {
|
||||
t.Fatalf("first segment = %#v, want Alice merged dangling end", transcript.Segments[0])
|
||||
}
|
||||
if transcript.Segments[0].ID != 1 || transcript.Segments[1].ID != 2 {
|
||||
t.Fatalf("ids not sequential after resolve-danglers: %#v", transcript.Segments)
|
||||
}
|
||||
|
||||
var rpt report.Report
|
||||
readJSON(t, reportPath, &rpt)
|
||||
if !hasReportEvent(rpt, "postprocessing", "resolve-danglers", "merged 1 dangling segment(s) into 1 target segment(s)") {
|
||||
t.Fatal("expected resolve-danglers report event")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeResolveDanglersMergesDanglingStart(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
"segments": [
|
||||
{
|
||||
"start": 1,
|
||||
"end": 4,
|
||||
"text": "start target words",
|
||||
"words": [
|
||||
{"word": "start", "start": 1, "end": 1.1},
|
||||
{"word": "target", "start": 3, "end": 3.1},
|
||||
{"word": "words", "start": 3.2, "end": 3.3}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||
"segments": [
|
||||
{
|
||||
"start": 1.5,
|
||||
"end": 2,
|
||||
"text": "interruption",
|
||||
"words": [
|
||||
{"word": "interruption", "start": 1.5, "end": 2}
|
||||
]
|
||||
}
|
||||
]
|
||||
}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||
- speaker: Alice
|
||||
match: ["a.json"]
|
||||
- speaker: Bob
|
||||
match: ["b.json"]
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", inputA,
|
||||
"--input-file", inputB,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
found := false
|
||||
for _, segment := range transcript.Segments {
|
||||
if segment.Speaker == "Alice" && segment.Text == "start target words" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("expected resolved dangling start in output, got %#v", transcript.Segments)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeTagsBackchannelSegments(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "input.json", `{
|
||||
|
||||
@@ -15,7 +15,7 @@ const (
|
||||
DefaultOutputModules = "json"
|
||||
DefaultOutputSchema = OutputSchemaSeriatim
|
||||
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
|
||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,resolve-danglers,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
|
||||
DefaultOverlapWordRunGap = 0.75
|
||||
DefaultWordRunReorderWindow = 1.0
|
||||
DefaultCoalesceGap = 3.0
|
||||
|
||||
261
internal/danglers/danglers.go
Normal file
261
internal/danglers/danglers.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package danglers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
// Summary carries the counters produced by a resolve-danglers run:
// DanglersMerged is the number of segments folded into another segment, and
// TargetsChanged is the number of segments that absorbed at least one of them.
type Summary struct {
	DanglersMerged, TargetsChanged int
}
|
||||
|
||||
// Apply merges tiny derived fragments back into matching provenance targets.
|
||||
func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
if len(in.Segments) < 2 {
|
||||
return in, Summary{}
|
||||
}
|
||||
|
||||
afterEnds, endSummary := resolveDanglingEnds(in)
|
||||
afterStarts, startSummary := resolveDanglingStarts(afterEnds)
|
||||
return afterStarts, Summary{
|
||||
DanglersMerged: endSummary.DanglersMerged + startSummary.DanglersMerged,
|
||||
TargetsChanged: countResolvedTargets(afterStarts),
|
||||
}
|
||||
}
|
||||
|
||||
func countResolvedTargets(in model.MergedTranscript) int {
|
||||
count := 0
|
||||
for _, segment := range in.Segments {
|
||||
if strings.HasPrefix(segment.SourceRef, "resolve-danglers:") {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
consumed := make([]bool, len(in.Segments))
|
||||
builders := make(map[int]*builder)
|
||||
merged := 0
|
||||
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] || !isDanglingEnd(segment) {
|
||||
continue
|
||||
}
|
||||
target := nearestPriorMatch(in.Segments, consumed, index)
|
||||
if target < 0 {
|
||||
continue
|
||||
}
|
||||
builderFor(builders, target, in.Segments[target]).appendEnd(segment)
|
||||
consumed[index] = true
|
||||
merged++
|
||||
}
|
||||
|
||||
return buildResult(in, consumed, builders, merged)
|
||||
}
|
||||
|
||||
func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) {
|
||||
consumed := make([]bool, len(in.Segments))
|
||||
builders := make(map[int]*builder)
|
||||
merged := 0
|
||||
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] || !isDanglingStart(segment) {
|
||||
continue
|
||||
}
|
||||
target := nearestSubsequentMatch(in.Segments, consumed, index)
|
||||
if target < 0 {
|
||||
continue
|
||||
}
|
||||
builderFor(builders, target, in.Segments[target]).prependStart(segment)
|
||||
consumed[index] = true
|
||||
merged++
|
||||
}
|
||||
|
||||
return buildResult(in, consumed, builders, merged)
|
||||
}
|
||||
|
||||
func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) {
|
||||
if merged == 0 {
|
||||
return in, Summary{}
|
||||
}
|
||||
|
||||
builderIDs := make([]int, 0, len(builders))
|
||||
for index := range builders {
|
||||
builderIDs = append(builderIDs, index)
|
||||
}
|
||||
sort.Ints(builderIDs)
|
||||
for id, index := range builderIDs {
|
||||
builders[index].sourceRef = fmt.Sprintf("resolve-danglers:%d", id+1)
|
||||
}
|
||||
|
||||
out := model.MergedTranscript{
|
||||
Segments: make([]model.Segment, 0, len(in.Segments)-merged),
|
||||
OverlapGroups: in.OverlapGroups,
|
||||
}
|
||||
for index, segment := range in.Segments {
|
||||
if consumed[index] {
|
||||
continue
|
||||
}
|
||||
if builder, exists := builders[index]; exists {
|
||||
out.Segments = append(out.Segments, builder.segment())
|
||||
continue
|
||||
}
|
||||
out.Segments = append(out.Segments, segment)
|
||||
}
|
||||
|
||||
return out, Summary{
|
||||
DanglersMerged: merged,
|
||||
TargetsChanged: len(builders),
|
||||
}
|
||||
}
|
||||
|
||||
type builder struct {
|
||||
target model.Segment
|
||||
prefixes []model.Segment
|
||||
suffixes []model.Segment
|
||||
sourceRef string
|
||||
}
|
||||
|
||||
func builderFor(builders map[int]*builder, index int, target model.Segment) *builder {
|
||||
if existing, exists := builders[index]; exists {
|
||||
return existing
|
||||
}
|
||||
builders[index] = &builder{target: target}
|
||||
return builders[index]
|
||||
}
|
||||
|
||||
func (b *builder) appendEnd(segment model.Segment) {
|
||||
b.suffixes = append(b.suffixes, segment)
|
||||
}
|
||||
|
||||
func (b *builder) prependStart(segment model.Segment) {
|
||||
b.prefixes = append(b.prefixes, segment)
|
||||
}
|
||||
|
||||
func (b builder) segment() model.Segment {
|
||||
parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes))
|
||||
for index := len(b.prefixes) - 1; index >= 0; index-- {
|
||||
parts = append(parts, b.prefixes[index])
|
||||
}
|
||||
parts = append(parts, b.target)
|
||||
parts = append(parts, b.suffixes...)
|
||||
|
||||
merged := model.Segment{
|
||||
Source: parts[0].Source,
|
||||
SourceRef: b.sourceRef,
|
||||
DerivedFrom: unionDerivedFrom(parts),
|
||||
Speaker: b.target.Speaker,
|
||||
Start: parts[0].Start,
|
||||
End: parts[0].End,
|
||||
Categories: append([]string(nil), b.target.Categories...),
|
||||
Words: make([]model.Word, 0),
|
||||
}
|
||||
|
||||
text := make([]string, 0, len(parts))
|
||||
for _, part := range parts {
|
||||
if part.Source != merged.Source {
|
||||
merged.Source = "derived"
|
||||
}
|
||||
if part.Start < merged.Start {
|
||||
merged.Start = part.Start
|
||||
}
|
||||
if part.End > merged.End {
|
||||
merged.End = part.End
|
||||
}
|
||||
if trimmed := strings.TrimSpace(part.Text); trimmed != "" {
|
||||
text = append(text, trimmed)
|
||||
}
|
||||
merged.Words = append(merged.Words, part.Words...)
|
||||
}
|
||||
merged.Text = strings.Join(text, " ")
|
||||
return merged
|
||||
}
|
||||
|
||||
func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int {
|
||||
for candidate := index - 1; candidate >= 0; candidate-- {
|
||||
if consumed[candidate] {
|
||||
continue
|
||||
}
|
||||
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
||||
return candidate
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int {
|
||||
for candidate := index + 1; candidate < len(segments); candidate++ {
|
||||
if consumed[candidate] {
|
||||
continue
|
||||
}
|
||||
if sharesDerivedFrom(segments[index], segments[candidate]) {
|
||||
return candidate
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func isDanglingEnd(segment model.Segment) bool {
|
||||
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 && endsWithPunctuation(segment.Text)
|
||||
}
|
||||
|
||||
func isDanglingStart(segment model.Segment) bool {
|
||||
return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2
|
||||
}
|
||||
|
||||
func hasDerivedFrom(segment model.Segment) bool {
|
||||
return len(segment.DerivedFrom) > 0
|
||||
}
|
||||
|
||||
// wordCount returns the number of whitespace-separated fields in text.
// Leading and trailing whitespace never produce a field, so blank text
// counts as zero words.
func wordCount(text string) int {
	fields := strings.Fields(text)
	return len(fields)
}
|
||||
|
||||
// endsWithPunctuation reports whether the last rune of text, ignoring
// surrounding whitespace, is a Unicode punctuation character. Blank text and
// text whose final rune decodes to utf8.RuneError yield false.
func endsWithPunctuation(text string) bool {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return false
	}

	last, _ := utf8.DecodeLastRuneInString(trimmed)
	if last == utf8.RuneError {
		return false
	}
	return unicode.IsPunct(last)
}
|
||||
|
||||
func sharesDerivedFrom(left model.Segment, right model.Segment) bool {
|
||||
if len(left.DerivedFrom) == 0 || len(right.DerivedFrom) == 0 {
|
||||
return false
|
||||
}
|
||||
seen := make(map[string]struct{}, len(left.DerivedFrom))
|
||||
for _, ref := range left.DerivedFrom {
|
||||
seen[ref] = struct{}{}
|
||||
}
|
||||
for _, ref := range right.DerivedFrom {
|
||||
if _, exists := seen[ref]; exists {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func unionDerivedFrom(segments []model.Segment) []string {
|
||||
seen := make(map[string]struct{})
|
||||
refs := make([]string, 0)
|
||||
for _, segment := range segments {
|
||||
for _, ref := range segment.DerivedFrom {
|
||||
if _, exists := seen[ref]; exists {
|
||||
continue
|
||||
}
|
||||
seen[ref] = struct{}{}
|
||||
refs = append(refs, ref)
|
||||
}
|
||||
}
|
||||
sort.Strings(refs)
|
||||
return refs
|
||||
}
|
||||
178
internal/danglers/danglers_test.go
Normal file
178
internal/danglers/danglers_test.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package danglers
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
func TestApplyMergesDanglingEndIntoNearestPriorSharedDerivedFrom(t *testing.T) {
|
||||
got, summary := Apply(transcript(
|
||||
segment("a", "Alice", 1, 2, "target", []string{"source#1"}),
|
||||
segment("b", "Bob", 2, 3, "middle", []string{"other#1"}),
|
||||
segment("a", "Alice", 3, 4, "end.", []string{"source#1"}),
|
||||
))
|
||||
|
||||
if summary.DanglersMerged != 1 || summary.TargetsChanged != 1 {
|
||||
t.Fatalf("summary = %#v", summary)
|
||||
}
|
||||
if len(got.Segments) != 2 {
|
||||
t.Fatalf("segment count = %d, want 2", len(got.Segments))
|
||||
}
|
||||
assertSegment(t, got.Segments[0], "resolve-danglers:1", "target end.", 1, 4, []string{"source#1"})
|
||||
if got.Segments[0].SourceSegmentIndex != nil || got.Segments[0].OverlapGroupID != 0 || got.Segments[0].ID != 0 {
|
||||
t.Fatalf("stale fields not cleared: %#v", got.Segments[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyMergesDanglingStartIntoNearestSubsequentSharedDerivedFrom(t *testing.T) {
|
||||
got, summary := Apply(transcript(
|
||||
segment("a", "Alice", 1, 2, "start", []string{"source#1"}),
|
||||
segment("b", "Bob", 2, 3, "middle", []string{"other#1"}),
|
||||
segment("a", "Alice", 3, 4, "target", []string{"source#1"}),
|
||||
))
|
||||
|
||||
if summary.DanglersMerged != 1 || summary.TargetsChanged != 1 {
|
||||
t.Fatalf("summary = %#v", summary)
|
||||
}
|
||||
if len(got.Segments) != 2 {
|
||||
t.Fatalf("segment count = %d, want 2", len(got.Segments))
|
||||
}
|
||||
assertSegment(t, got.Segments[1], "resolve-danglers:1", "start target", 1, 4, []string{"source#1"})
|
||||
}
|
||||
|
||||
func TestApplyUsesAnyDerivedFromIntersection(t *testing.T) {
|
||||
got, _ := Apply(transcript(
|
||||
segment("a", "Alice", 1, 2, "target", []string{"source#1", "source#2"}),
|
||||
segment("a", "Alice", 3, 4, "end.", []string{"source#2", "source#3"}),
|
||||
))
|
||||
|
||||
assertSegment(t, got.Segments[0], "resolve-danglers:1", "target end.", 1, 4, []string{"source#1", "source#2", "source#3"})
|
||||
}
|
||||
|
||||
func TestApplyDoesNotMergeWithoutSharedProvenance(t *testing.T) {
|
||||
in := transcript(
|
||||
segment("a", "Alice", 1, 2, "target", []string{"source#1"}),
|
||||
segment("a", "Alice", 3, 4, "end.", []string{"source#2"}),
|
||||
)
|
||||
|
||||
got, summary := Apply(in)
|
||||
if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) {
|
||||
t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyDoesNotMergeLongDanglers(t *testing.T) {
|
||||
in := transcript(
|
||||
segment("a", "Alice", 1, 2, "target words here", []string{"source#1"}),
|
||||
segment("a", "Alice", 3, 4, "three word end.", []string{"source#1"}),
|
||||
)
|
||||
|
||||
got, summary := Apply(in)
|
||||
if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) {
|
||||
t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyDanglingEndRequiresPunctuation(t *testing.T) {
|
||||
in := transcript(
|
||||
segment("a", "Alice", 1, 2, "target", []string{"source#1"}),
|
||||
segment("a", "Alice", 3, 4, "end", []string{"source#1"}),
|
||||
)
|
||||
|
||||
resolved, _ := resolveDanglingEnds(in)
|
||||
if !reflect.DeepEqual(resolved, in) {
|
||||
t.Fatalf("punctuation-free end should not merge backward:\ngot %#v\nwant %#v", resolved, in)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyDanglingStartDoesNotRequirePunctuation(t *testing.T) {
|
||||
got, summary := Apply(transcript(
|
||||
segment("a", "Alice", 1, 2, "start", []string{"source#1"}),
|
||||
segment("a", "Alice", 3, 4, "target words", []string{"source#1"}),
|
||||
))
|
||||
|
||||
if summary.DanglersMerged != 1 {
|
||||
t.Fatalf("summary = %#v", summary)
|
||||
}
|
||||
assertSegment(t, got.Segments[0], "resolve-danglers:1", "start target words", 1, 4, []string{"source#1"})
|
||||
}
|
||||
|
||||
func TestApplyMergesMultipleDanglersIntoOneTarget(t *testing.T) {
|
||||
got, summary := Apply(transcript(
|
||||
segment("a", "Alice", 1, 2, "prefix", []string{"source#1"}),
|
||||
segment("a", "Alice", 3, 4, "target", []string{"source#1"}),
|
||||
segment("a", "Alice", 5, 6, "tail.", []string{"source#1"}),
|
||||
))
|
||||
|
||||
if summary.DanglersMerged != 2 || summary.TargetsChanged != 1 {
|
||||
t.Fatalf("summary = %#v", summary)
|
||||
}
|
||||
if len(got.Segments) != 1 {
|
||||
t.Fatalf("segment count = %d, want 1", len(got.Segments))
|
||||
}
|
||||
assertSegment(t, got.Segments[0], "resolve-danglers:1", "prefix target tail.", 1, 6, []string{"source#1"})
|
||||
}
|
||||
|
||||
func TestApplyMergedSegmentShape(t *testing.T) {
|
||||
sourceIndex := 1
|
||||
target := segment("a", "Alice", 2, 3, "target", []string{"a#1"})
|
||||
target.ID = 99
|
||||
target.SourceSegmentIndex = &sourceIndex
|
||||
target.OverlapGroupID = 7
|
||||
target.Categories = []string{"manual"}
|
||||
target.Words = []model.Word{{Text: "target", Start: 2, End: 3, Timed: true}}
|
||||
|
||||
dangler := segment("b", "Alice", 1, 1.5, "start", []string{"a#1", "b#2"})
|
||||
dangler.Categories = []string{"dangler"}
|
||||
dangler.Words = []model.Word{{Text: "start", Start: 1, End: 1.5, Timed: true}}
|
||||
|
||||
got, _ := Apply(transcript(dangler, target))
|
||||
merged := got.Segments[0]
|
||||
if merged.Source != "derived" {
|
||||
t.Fatalf("source = %q, want derived", merged.Source)
|
||||
}
|
||||
if !reflect.DeepEqual(merged.Categories, []string{"manual"}) {
|
||||
t.Fatalf("categories = %v, want target categories only", merged.Categories)
|
||||
}
|
||||
if gotWords := []string{merged.Words[0].Text, merged.Words[1].Text}; !reflect.DeepEqual(gotWords, []string{"start", "target"}) {
|
||||
t.Fatalf("word order = %v", gotWords)
|
||||
}
|
||||
assertSegment(t, merged, "resolve-danglers:1", "start target", 1, 3, []string{"a#1", "b#2"})
|
||||
if merged.ID != 0 || merged.SourceSegmentIndex != nil || merged.OverlapGroupID != 0 {
|
||||
t.Fatalf("stale fields not cleared: %#v", merged)
|
||||
}
|
||||
}
|
||||
|
||||
func transcript(segments ...model.Segment) model.MergedTranscript {
|
||||
return model.MergedTranscript{Segments: segments}
|
||||
}
|
||||
|
||||
func segment(source string, speaker string, start float64, end float64, text string, derivedFrom []string) model.Segment {
|
||||
return model.Segment{
|
||||
Source: source,
|
||||
SourceRef: source + "-ref",
|
||||
DerivedFrom: append([]string(nil), derivedFrom...),
|
||||
Speaker: speaker,
|
||||
Start: start,
|
||||
End: end,
|
||||
Text: text,
|
||||
}
|
||||
}
|
||||
|
||||
func assertSegment(t *testing.T, segment model.Segment, sourceRef string, text string, start float64, end float64, derivedFrom []string) {
|
||||
t.Helper()
|
||||
if segment.SourceRef != sourceRef {
|
||||
t.Fatalf("source_ref = %q, want %q", segment.SourceRef, sourceRef)
|
||||
}
|
||||
if segment.Text != text {
|
||||
t.Fatalf("text = %q, want %q", segment.Text, text)
|
||||
}
|
||||
if segment.Start != start || segment.End != end {
|
||||
t.Fatalf("bounds = %f-%f, want %f-%f", segment.Start, segment.End, start, end)
|
||||
}
|
||||
if !reflect.DeepEqual(segment.DerivedFrom, derivedFrom) {
|
||||
t.Fatalf("derived_from = %v, want %v", segment.DerivedFrom, derivedFrom)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user