From f1ce35dfc3d41a9faedf49a99d310b8a0246f0d3 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Tue, 28 Apr 2026 15:38:16 -0500 Subject: [PATCH] Implemented a new internal/danglers package with deterministic two-pass dangling-end then dangling-start resolution --- README.md | 17 +- architecture.md | 6 +- internal/builtin/postprocess.go | 22 +++ internal/builtin/registry.go | 1 + internal/cli/merge_test.go | 123 ++++++++++++++ internal/config/config.go | 2 +- internal/danglers/danglers.go | 261 +++++++++++++++++++++++++++++ internal/danglers/danglers_test.go | 178 ++++++++++++++++++++ 8 files changed, 602 insertions(+), 8 deletions(-) create mode 100644 internal/danglers/danglers.go create mode 100644 internal/danglers/danglers_test.go diff --git a/README.md b/README.md index 2287a36..670a976 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ Global flags: | `--output-modules` | No | `json` | Comma-separated output modules. | | `--output-schema` | No | `seriatim` | JSON output contract. Allowed values are `seriatim` and `minimal`. | | `--preprocessing-modules` | No | `validate-raw,normalize-speakers,trim-text` | Comma-separated preprocessing modules, evaluated in order. | -| `--postprocessing-modules` | No | `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output` | Comma-separated postprocessing modules, evaluated in order. | +| `--postprocessing-modules` | No | `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,resolve-danglers,detect-overlaps,autocorrect,assign-ids,validate-output` | Comma-separated postprocessing modules, evaluated in order. | | `--coalesce-gap` | No | `3.0` | Maximum same-speaker gap in seconds for `coalesce`; also used as the `resolve-overlaps` context window. Must be a non-negative float. 
| Environment variables: @@ -169,7 +169,7 @@ The default `seriatim` schema uses the full seriatim envelope: "input_reader": "json-files", "input_files": ["eric.json", "mike.json"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], - "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "filler", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], + "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "filler", "coalesce", "resolve-danglers", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], "output_modules": ["json"] }, "segments": [ @@ -265,7 +265,7 @@ Overlap behavior: ## Overlap Resolution -The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `filler`, then `coalesce`, then a second `detect-overlaps` pass. +The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `filler`, then `coalesce`, then `resolve-danglers`, then a second `detect-overlaps` pass. For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: @@ -311,12 +311,21 @@ Filler matching is case-insensitive, ignores punctuation for matching and word-c ## Coalescing -The default pipeline runs `coalesce` before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`. +The default pipeline runs `coalesce` before `resolve-danglers` and the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`. Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`. 
Different-speaker backchannel and filler segments do not block coalescing of surrounding same-speaker segments. Same-speaker backchannel and filler segments are merged normally when they are within `--coalesce-gap`. When same-speaker segments are coalesced, any `backchannel` or `filler` category from the merged inputs is dropped from the coalesced segment. +## Dangler Resolution + +The default pipeline runs `resolve-danglers` after `coalesce` and before the second overlap detection pass. It repairs short derived fragments when they share provenance with a nearby segment: + +- Dangling-end fragments have no more than two words and end in punctuation. +- Dangling-start fragments have no more than two words. +- Matching uses any shared `derived_from` value. +- Merged segments use `source_ref` values such as `resolve-danglers:1`, keep the target segment's transcript position, and union `derived_from`. + ## Autocorrect Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report. diff --git a/architecture.md b/architecture.md index 70ebeb2..7eca34e 100644 --- a/architecture.md +++ b/architecture.md @@ -146,7 +146,7 @@ The postprocessing stage applies zero or more modules to the merged transcript. Postprocessing modules are selected at runtime with a comma-separated list of canonical module names: ```text ---postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output +--postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,resolve-danglers,detect-overlaps,autocorrect,assign-ids,validate-output ``` Modules run in the exact order provided. Unknown module names are configuration errors. 
@@ -214,7 +214,7 @@ seriatim merge \ --speakers speakers.yml \ --autocorrect autocorrect.yml \ --preprocessing-modules validate-raw,normalize-speakers,trim-text \ - --postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output \ + --postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,resolve-danglers,detect-overlaps,autocorrect,assign-ids,validate-output \ --output-modules json \ --output-schema seriatim \ --output-file merged.json \ @@ -435,7 +435,7 @@ validate-raw,normalize-speakers,trim-text Recommended default postprocessing modules: ```text -detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output +detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,resolve-danglers,detect-overlaps,autocorrect,assign-ids,validate-output ``` The default output module is: diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go index 887f69c..f4d056a 100644 --- a/internal/builtin/postprocess.go +++ b/internal/builtin/postprocess.go @@ -9,6 +9,7 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/backchannel" "gitea.maximumdirect.net/eric/seriatim/internal/coalesce" "gitea.maximumdirect.net/eric/seriatim/internal/config" + "gitea.maximumdirect.net/eric/seriatim/internal/danglers" "gitea.maximumdirect.net/eric/seriatim/internal/filler" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/overlap" @@ -169,6 +170,27 @@ func (coalescePostprocessor) Process(ctx context.Context, in model.MergedTranscr }, nil } +type resolveDanglersPostprocessor struct{} + +func (resolveDanglersPostprocessor) Name() string { + return "resolve-danglers" +} + +func (resolveDanglersPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) { + if err := ctx.Err(); err != nil 
{ + return model.MergedTranscript{}, nil, err + } + + out, summary := danglers.Apply(in) + return out, []report.Event{ + report.Info( + "postprocessing", + "resolve-danglers", + fmt.Sprintf("merged %d dangling segment(s) into %d target segment(s)", summary.DanglersMerged, summary.TargetsChanged), + ), + }, nil +} + type autocorrectPostprocessor struct{} func (autocorrectPostprocessor) Name() string { diff --git a/internal/builtin/registry.go b/internal/builtin/registry.go index 29a73fa..f7b59b2 100644 --- a/internal/builtin/registry.go +++ b/internal/builtin/registry.go @@ -16,6 +16,7 @@ func NewRegistry() *pipeline.Registry { registry.RegisterPostprocessor(backchannelPostprocessor{}) registry.RegisterPostprocessor(fillerPostprocessor{}) registry.RegisterPostprocessor(coalescePostprocessor{}) + registry.RegisterPostprocessor(resolveDanglersPostprocessor{}) registry.RegisterPostprocessor(assignIDs{}) registry.RegisterPostprocessor(validateOutput{}) registry.RegisterPostprocessor(autocorrectPostprocessor{}) diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 6c66cd4..64b289d 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -95,6 +95,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { "resolve-overlaps", "backchannel", "filler", "coalesce", + "resolve-danglers", "detect-overlaps", "autocorrect", @@ -701,6 +702,128 @@ func TestMergeCoalesceGapOverridePreventsMerge(t *testing.T) { } } +func TestMergeResolveDanglersMergesDanglingEnd(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + { + "start": 1, + "end": 4, + "text": "main tail.", + "words": [ + {"word": "main", "start": 1, "end": 1.1}, + {"word": "tail.", "start": 3, "end": 3.1} + ] + } + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + { + "start": 1.5, + "end": 2, + "text": "interruption", + "words": [ + {"word": "interruption", "start": 1.5, "end": 2} + ] + } + ] + }`) + speakers 
:= writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + "--output-file", output, + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "main tail." { + t.Fatalf("first segment = %#v, want Alice merged dangling end", transcript.Segments[0]) + } + if transcript.Segments[0].ID != 1 || transcript.Segments[1].ID != 2 { + t.Fatalf("ids not sequential after resolve-danglers: %#v", transcript.Segments) + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "postprocessing", "resolve-danglers", "merged 1 dangling segment(s) into 1 target segment(s)") { + t.Fatal("expected resolve-danglers report event") + } +} + +func TestMergeResolveDanglersMergesDanglingStart(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + { + "start": 1, + "end": 4, + "text": "start target words", + "words": [ + {"word": "start", "start": 1, "end": 1.1}, + {"word": "target", "start": 3, "end": 3.1}, + {"word": "words", "start": 3.2, "end": 3.3} + ] + } + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + { + "start": 1.5, + "end": 2, + "text": "interruption", + "words": [ + {"word": "interruption", "start": 1.5, "end": 2} + ] + } + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + 
 "--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + found := false + for _, segment := range transcript.Segments { + if segment.Speaker == "Alice" && segment.Text == "start target words" { + found = true + } + } + if !found { + t.Fatalf("expected resolved dangling start in output, got %#v", transcript.Segments) + } +} + func TestMergeTagsBackchannelSegments(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{ diff --git a/internal/config/config.go b/internal/config/config.go index f154c60..3a5a12a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -15,7 +15,7 @@ const ( DefaultOutputModules = "json" DefaultOutputSchema = OutputSchemaSeriatim DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" - DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output" + DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,resolve-danglers,detect-overlaps,autocorrect,assign-ids,validate-output" DefaultOverlapWordRunGap = 0.75 DefaultWordRunReorderWindow = 1.0 DefaultCoalesceGap = 3.0 diff --git a/internal/danglers/danglers.go b/internal/danglers/danglers.go new file mode 100644 index 0000000..a47916a --- /dev/null +++ b/internal/danglers/danglers.go @@ -0,0 +1,261 @@ +package danglers + +import ( + "fmt" + "sort" + "strings" + "unicode" + "unicode/utf8" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +// Summary records deterministic counters for a resolve-danglers pass. +type Summary struct { + DanglersMerged int + TargetsChanged int +} + +// Apply merges tiny derived fragments back into matching provenance targets. 
+func Apply(in model.MergedTranscript) (model.MergedTranscript, Summary) { + if len(in.Segments) < 2 { + return in, Summary{} + } + + afterEnds, endSummary := resolveDanglingEnds(in) + afterStarts, startSummary := resolveDanglingStarts(afterEnds) + return afterStarts, Summary{ + DanglersMerged: endSummary.DanglersMerged + startSummary.DanglersMerged, + TargetsChanged: countResolvedTargets(afterStarts), + } +} + +func countResolvedTargets(in model.MergedTranscript) int { + count := 0 + for _, segment := range in.Segments { + if strings.HasPrefix(segment.SourceRef, "resolve-danglers:") { + count++ + } + } + return count +} + +func resolveDanglingEnds(in model.MergedTranscript) (model.MergedTranscript, Summary) { + consumed := make([]bool, len(in.Segments)) + builders := make(map[int]*builder) + merged := 0 + + for index, segment := range in.Segments { + if consumed[index] || !isDanglingEnd(segment) { + continue + } + target := nearestPriorMatch(in.Segments, consumed, index) + if target < 0 { + continue + } + builderFor(builders, target, in.Segments[target]).appendEnd(segment) + consumed[index] = true + merged++ + } + + return buildResult(in, consumed, builders, merged) +} + +func resolveDanglingStarts(in model.MergedTranscript) (model.MergedTranscript, Summary) { + consumed := make([]bool, len(in.Segments)) + builders := make(map[int]*builder) + merged := 0 + + for index, segment := range in.Segments { + if consumed[index] || !isDanglingStart(segment) { + continue + } + target := nearestSubsequentMatch(in.Segments, consumed, index) + if target < 0 { + continue + } + builderFor(builders, target, in.Segments[target]).prependStart(segment) + consumed[index] = true + merged++ + } + + return buildResult(in, consumed, builders, merged) +} + +func buildResult(in model.MergedTranscript, consumed []bool, builders map[int]*builder, merged int) (model.MergedTranscript, Summary) { + if merged == 0 { + return in, Summary{} + } + + builderIDs := make([]int, 0, len(builders)) + for 
index := range builders { + builderIDs = append(builderIDs, index) + } + sort.Ints(builderIDs) + for id, index := range builderIDs { + builders[index].sourceRef = fmt.Sprintf("resolve-danglers:%d", id+1) + } + + out := model.MergedTranscript{ + Segments: make([]model.Segment, 0, len(in.Segments)-merged), + OverlapGroups: in.OverlapGroups, + } + for index, segment := range in.Segments { + if consumed[index] { + continue + } + if builder, exists := builders[index]; exists { + out.Segments = append(out.Segments, builder.segment()) + continue + } + out.Segments = append(out.Segments, segment) + } + + return out, Summary{ + DanglersMerged: merged, + TargetsChanged: len(builders), + } +} + +type builder struct { + target model.Segment + prefixes []model.Segment + suffixes []model.Segment + sourceRef string +} + +func builderFor(builders map[int]*builder, index int, target model.Segment) *builder { + if existing, exists := builders[index]; exists { + return existing + } + builders[index] = &builder{target: target} + return builders[index] +} + +func (b *builder) appendEnd(segment model.Segment) { + b.suffixes = append(b.suffixes, segment) +} + +func (b *builder) prependStart(segment model.Segment) { + b.prefixes = append(b.prefixes, segment) +} + +func (b builder) segment() model.Segment { + parts := make([]model.Segment, 0, len(b.prefixes)+1+len(b.suffixes)) + for index := len(b.prefixes) - 1; index >= 0; index-- { + parts = append(parts, b.prefixes[index]) + } + parts = append(parts, b.target) + parts = append(parts, b.suffixes...) 
+ + merged := model.Segment{ + Source: parts[0].Source, + SourceRef: b.sourceRef, + DerivedFrom: unionDerivedFrom(parts), + Speaker: b.target.Speaker, + Start: parts[0].Start, + End: parts[0].End, + Categories: append([]string(nil), b.target.Categories...), + Words: make([]model.Word, 0), + } + + text := make([]string, 0, len(parts)) + for _, part := range parts { + if part.Source != merged.Source { + merged.Source = "derived" + } + if part.Start < merged.Start { + merged.Start = part.Start + } + if part.End > merged.End { + merged.End = part.End + } + if trimmed := strings.TrimSpace(part.Text); trimmed != "" { + text = append(text, trimmed) + } + merged.Words = append(merged.Words, part.Words...) + } + merged.Text = strings.Join(text, " ") + return merged +} + +func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int { + for candidate := index - 1; candidate >= 0; candidate-- { + if consumed[candidate] { + continue + } + if sharesDerivedFrom(segments[index], segments[candidate]) { + return candidate + } + } + return -1 +} + +func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int { + for candidate := index + 1; candidate < len(segments); candidate++ { + if consumed[candidate] { + continue + } + if sharesDerivedFrom(segments[index], segments[candidate]) { + return candidate + } + } + return -1 +} + +func isDanglingEnd(segment model.Segment) bool { + return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 && endsWithPunctuation(segment.Text) +} + +func isDanglingStart(segment model.Segment) bool { + return hasDerivedFrom(segment) && wordCount(segment.Text) <= 2 +} + +func hasDerivedFrom(segment model.Segment) bool { + return len(segment.DerivedFrom) > 0 +} + +func wordCount(text string) int { + return len(strings.Fields(strings.TrimSpace(text))) +} + +func endsWithPunctuation(text string) bool { + text = strings.TrimSpace(text) + if text == "" { + return false + } + r, _ := 
utf8.DecodeLastRuneInString(text) + return r != utf8.RuneError && unicode.IsPunct(r) +} + +func sharesDerivedFrom(left model.Segment, right model.Segment) bool { + if len(left.DerivedFrom) == 0 || len(right.DerivedFrom) == 0 { + return false + } + seen := make(map[string]struct{}, len(left.DerivedFrom)) + for _, ref := range left.DerivedFrom { + seen[ref] = struct{}{} + } + for _, ref := range right.DerivedFrom { + if _, exists := seen[ref]; exists { + return true + } + } + return false +} + +func unionDerivedFrom(segments []model.Segment) []string { + seen := make(map[string]struct{}) + refs := make([]string, 0) + for _, segment := range segments { + for _, ref := range segment.DerivedFrom { + if _, exists := seen[ref]; exists { + continue + } + seen[ref] = struct{}{} + refs = append(refs, ref) + } + } + sort.Strings(refs) + return refs +} diff --git a/internal/danglers/danglers_test.go b/internal/danglers/danglers_test.go new file mode 100644 index 0000000..3e513bb --- /dev/null +++ b/internal/danglers/danglers_test.go @@ -0,0 +1,178 @@ +package danglers + +import ( + "reflect" + "testing" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +func TestApplyMergesDanglingEndIntoNearestPriorSharedDerivedFrom(t *testing.T) { + got, summary := Apply(transcript( + segment("a", "Alice", 1, 2, "target", []string{"source#1"}), + segment("b", "Bob", 2, 3, "middle", []string{"other#1"}), + segment("a", "Alice", 3, 4, "end.", []string{"source#1"}), + )) + + if summary.DanglersMerged != 1 || summary.TargetsChanged != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + assertSegment(t, got.Segments[0], "resolve-danglers:1", "target end.", 1, 4, []string{"source#1"}) + if got.Segments[0].SourceSegmentIndex != nil || got.Segments[0].OverlapGroupID != 0 || got.Segments[0].ID != 0 { + t.Fatalf("stale fields not cleared: %#v", got.Segments[0]) + } +} + +func 
TestApplyMergesDanglingStartIntoNearestSubsequentSharedDerivedFrom(t *testing.T) { + got, summary := Apply(transcript( + segment("a", "Alice", 1, 2, "start", []string{"source#1"}), + segment("b", "Bob", 2, 3, "middle", []string{"other#1"}), + segment("a", "Alice", 3, 4, "target", []string{"source#1"}), + )) + + if summary.DanglersMerged != 1 || summary.TargetsChanged != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + assertSegment(t, got.Segments[1], "resolve-danglers:1", "start target", 1, 4, []string{"source#1"}) +} + +func TestApplyUsesAnyDerivedFromIntersection(t *testing.T) { + got, _ := Apply(transcript( + segment("a", "Alice", 1, 2, "target", []string{"source#1", "source#2"}), + segment("a", "Alice", 3, 4, "end.", []string{"source#2", "source#3"}), + )) + + assertSegment(t, got.Segments[0], "resolve-danglers:1", "target end.", 1, 4, []string{"source#1", "source#2", "source#3"}) +} + +func TestApplyDoesNotMergeWithoutSharedProvenance(t *testing.T) { + in := transcript( + segment("a", "Alice", 1, 2, "target", []string{"source#1"}), + segment("a", "Alice", 3, 4, "end.", []string{"source#2"}), + ) + + got, summary := Apply(in) + if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) { + t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in) + } +} + +func TestApplyDoesNotMergeLongDanglers(t *testing.T) { + in := transcript( + segment("a", "Alice", 1, 2, "target words here", []string{"source#1"}), + segment("a", "Alice", 3, 4, "three word end.", []string{"source#1"}), + ) + + got, summary := Apply(in) + if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) { + t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in) + } +} + +func TestApplyDanglingEndRequiresPunctuation(t *testing.T) { + in := transcript( + segment("a", "Alice", 1, 2, "target", []string{"source#1"}), + segment("a", "Alice", 3, 4, "end", []string{"source#1"}), + ) + + resolved, _ := 
resolveDanglingEnds(in) + if !reflect.DeepEqual(resolved, in) { + t.Fatalf("punctuation-free end should not merge backward:\ngot %#v\nwant %#v", resolved, in) + } +} + +func TestApplyDanglingStartDoesNotRequirePunctuation(t *testing.T) { + got, summary := Apply(transcript( + segment("a", "Alice", 1, 2, "start", []string{"source#1"}), + segment("a", "Alice", 3, 4, "target words", []string{"source#1"}), + )) + + if summary.DanglersMerged != 1 { + t.Fatalf("summary = %#v", summary) + } + assertSegment(t, got.Segments[0], "resolve-danglers:1", "start target words", 1, 4, []string{"source#1"}) +} + +func TestApplyMergesMultipleDanglersIntoOneTarget(t *testing.T) { + got, summary := Apply(transcript( + segment("a", "Alice", 1, 2, "prefix", []string{"source#1"}), + segment("a", "Alice", 3, 4, "target", []string{"source#1"}), + segment("a", "Alice", 5, 6, "tail.", []string{"source#1"}), + )) + + if summary.DanglersMerged != 2 || summary.TargetsChanged != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + assertSegment(t, got.Segments[0], "resolve-danglers:1", "prefix target tail.", 1, 6, []string{"source#1"}) +} + +func TestApplyMergedSegmentShape(t *testing.T) { + sourceIndex := 1 + target := segment("a", "Alice", 2, 3, "target", []string{"a#1"}) + target.ID = 99 + target.SourceSegmentIndex = &sourceIndex + target.OverlapGroupID = 7 + target.Categories = []string{"manual"} + target.Words = []model.Word{{Text: "target", Start: 2, End: 3, Timed: true}} + + dangler := segment("b", "Alice", 1, 1.5, "start", []string{"a#1", "b#2"}) + dangler.Categories = []string{"dangler"} + dangler.Words = []model.Word{{Text: "start", Start: 1, End: 1.5, Timed: true}} + + got, _ := Apply(transcript(dangler, target)) + merged := got.Segments[0] + if merged.Source != "derived" { + t.Fatalf("source = %q, want derived", merged.Source) + } + if !reflect.DeepEqual(merged.Categories, []string{"manual"}) { + 
t.Fatalf("categories = %v, want target categories only", merged.Categories) + } + if gotWords := []string{merged.Words[0].Text, merged.Words[1].Text}; !reflect.DeepEqual(gotWords, []string{"start", "target"}) { + t.Fatalf("word order = %v", gotWords) + } + assertSegment(t, merged, "resolve-danglers:1", "start target", 1, 3, []string{"a#1", "b#2"}) + if merged.ID != 0 || merged.SourceSegmentIndex != nil || merged.OverlapGroupID != 0 { + t.Fatalf("stale fields not cleared: %#v", merged) + } +} + +func transcript(segments ...model.Segment) model.MergedTranscript { + return model.MergedTranscript{Segments: segments} +} + +func segment(source string, speaker string, start float64, end float64, text string, derivedFrom []string) model.Segment { + return model.Segment{ + Source: source, + SourceRef: source + "-ref", + DerivedFrom: append([]string(nil), derivedFrom...), + Speaker: speaker, + Start: start, + End: end, + Text: text, + } +} + +func assertSegment(t *testing.T, segment model.Segment, sourceRef string, text string, start float64, end float64, derivedFrom []string) { + t.Helper() + if segment.SourceRef != sourceRef { + t.Fatalf("source_ref = %q, want %q", segment.SourceRef, sourceRef) + } + if segment.Text != text { + t.Fatalf("text = %q, want %q", segment.Text, text) + } + if segment.Start != start || segment.End != end { + t.Fatalf("bounds = %f-%f, want %f-%f", segment.Start, segment.End, start, end) + } + if !reflect.DeepEqual(segment.DerivedFrom, derivedFrom) { + t.Fatalf("derived_from = %v, want %v", segment.DerivedFrom, derivedFrom) + } +}