From 13d972cb246b574b9bb86c8d305c55d0947fc5f2 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Mon, 27 Apr 2026 18:17:40 -0500 Subject: [PATCH] Update the default postprocessing pipeline to run detect-overlaps twice --- README.md | 10 ++-- internal/cli/merge_test.go | 100 +++++++++++++++++++++++++++++++++++-- internal/config/config.go | 2 +- 3 files changed, 102 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6953450..79badd9 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Optional flags: - `--input-reader`: input reader module. Default: `json-files`. - `--output-modules`: comma-separated output modules. Default: `json`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. -- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output`. +- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output`. ## Input JSON Format @@ -150,7 +150,7 @@ The merged output uses the current seriatim envelope: "input_reader": "json-files", "input_files": ["eric.json", "mike.json"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], - "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "autocorrect", "assign-ids", "validate-output"], + "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], "output_modules": ["json"] }, "segments": [ @@ -214,7 +214,7 @@ Overlap behavior: ## Overlap Resolution -The default postprocessing pipeline runs `resolve-overlaps` after `detect-overlaps`. +The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then a second `detect-overlaps` pass. For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: @@ -227,8 +227,8 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word - Replacement segment text is built by joining word text with single spaces. - Replacement segments include `source_ref` and `derived_from`. - Replacement segments omit `source_segment_index` because they are derived from one or more original segments. -- Resolved overlap groups are removed from `overlap_groups`. -- Replacement segments are left without `overlap_group_id`; future passes can detect any remaining overlap. +- Resolved overlap groups are removed before the second detection pass. +- Replacement segments are left without `overlap_group_id` until the second detection pass annotates any remaining overlap. - If a speaker has no usable word timing in a group, that speaker's original segment is kept. - If no speakers in a group have usable word timing, the original group and annotations remain unchanged. diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index c166d54..c66327b 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { "placeholder-merger", "detect-overlaps", "resolve-overlaps", + "detect-overlaps", "autocorrect", "assign-ids", "validate-output", @@ -241,8 +242,8 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) { "end": 4, "text": "bob original", "words": [ - {"word": "bob", "start": 1.55, "end": 1.7}, - {"word": "reply", "start": 2.0, "end": 2.2} + {"word": "bob", "start": 2.2, "end": 2.3}, + {"word": "reply", "start": 2.4, "end": 2.5} ] } ] @@ -318,6 +319,97 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) { } } +func TestMergeDetectsResidualOverlapsAfterResolution(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + { + "start": 1, + "end": 4, + "text": "alice residual", + "words": [ + {"word": "alice", "start": 1.0, "end": 2.0} + ] + } + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + { + "start": 1.5, + "end": 3, + "text": "bob residual", + "words": [ + {"word": "bob", "start": 1.5, "end": 2.5} + ] + } + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + "--output-file", output, + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.OverlapGroups) != 1 { + t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups)) + } + group := transcript.OverlapGroups[0] + if group.ID != 1 { + t.Fatalf("group ID = %d, want 1", group.ID) + } + if group.Start != 1 || group.End != 2.5 { + t.Fatalf("group bounds = %f-%f, want 1-2.5", group.Start, group.End) + } + wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"} + if !equalStrings(group.Segments, wantRefs) { + t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs) + } + if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) { + t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers) + } + for index, segment := range transcript.Segments { + if segment.ID != index+1 { + t.Fatalf("segment %d ID = %d, want %d", index, segment.ID, index+1) + } + if segment.OverlapGroupID != 1 { + t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID) + } + if segment.SourceRef != wantRefs[index] { + t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index]) + } + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + detectMessages := make([]string, 0) + for _, event := range rpt.Events { + if event.Stage == "postprocessing" && event.Module == "detect-overlaps" { + detectMessages = append(detectMessages, event.Message) + } + } + wantMessages := []string{"detected 1 overlap group(s)", "detected 1 overlap group(s)"} + if !equalStrings(detectMessages, wantMessages) { + t.Fatalf("detect-overlaps messages = %v, want %v", detectMessages, wantMessages) + } +} + func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ @@ -895,8 +987,8 @@ func TestMergeResolutionPreservesUntimedWordText(t *testing.T) { "end": 2, "text": "bob overlap", "words": [ - {"word": "bob", "start": 1.16, "end": 1.25}, - {"word": "overlap", "start": 1.3, "end": 1.5} + {"word": "bob", "start": 1.6, "end": 1.7}, + {"word": "overlap", "start": 1.6, "end": 1.8} ] } ] diff --git a/internal/config/config.go b/internal/config/config.go index 9c64dea..ebad56b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,7 +14,7 @@ const ( DefaultInputReader = "json-files" DefaultOutputModules = "json" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" - DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output" + DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output" DefaultOverlapWordRunGap = 0.75 OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP" )