Update the default postprocessing pipeline to run detect-overlaps twice

This commit is contained in:
2026-04-27 18:17:40 -05:00
parent 1b9f4bd922
commit 13d972cb24
3 changed files with 102 additions and 10 deletions

View File

@@ -44,7 +44,7 @@ Optional flags:
- `--input-reader`: input reader module. Default: `json-files`. - `--input-reader`: input reader module. Default: `json-files`.
- `--output-modules`: comma-separated output modules. Default: `json`. - `--output-modules`: comma-separated output modules. Default: `json`.
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output`. - `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output`.
## Input JSON Format ## Input JSON Format
@@ -150,7 +150,7 @@ The merged output uses the current seriatim envelope:
"input_reader": "json-files", "input_reader": "json-files",
"input_files": ["eric.json", "mike.json"], "input_files": ["eric.json", "mike.json"],
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "autocorrect", "assign-ids", "validate-output"], "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
"output_modules": ["json"] "output_modules": ["json"]
}, },
"segments": [ "segments": [
@@ -214,7 +214,7 @@ Overlap behavior:
## Overlap Resolution ## Overlap Resolution
The default postprocessing pipeline runs `resolve-overlaps` after `detect-overlaps`. The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then a second `detect-overlaps` pass.
For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments:
@@ -227,8 +227,8 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word
- Replacement segment text is built by joining word text with single spaces. - Replacement segment text is built by joining word text with single spaces.
- Replacement segments include `source_ref` and `derived_from`. - Replacement segments include `source_ref` and `derived_from`.
- Replacement segments omit `source_segment_index` because they are derived from one or more original segments. - Replacement segments omit `source_segment_index` because they are derived from one or more original segments.
- Resolved overlap groups are removed from `overlap_groups`. - Resolved overlap groups are removed before the second detection pass.
- Replacement segments are left without `overlap_group_id`; future passes can detect any remaining overlap. - Replacement segments are left without `overlap_group_id` until the second detection pass annotates any remaining overlap.
- If a speaker has no usable word timing in a group, that speaker's original segment is kept. - If a speaker has no usable word timing in a group, that speaker's original segment is kept.
- If no speakers in a group have usable word timing, the original group and annotations remain unchanged. - If no speakers in a group have usable word timing, the original group and annotations remain unchanged.

View File

@@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
"placeholder-merger", "placeholder-merger",
"detect-overlaps", "detect-overlaps",
"resolve-overlaps", "resolve-overlaps",
"detect-overlaps",
"autocorrect", "autocorrect",
"assign-ids", "assign-ids",
"validate-output", "validate-output",
@@ -241,8 +242,8 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
"end": 4, "end": 4,
"text": "bob original", "text": "bob original",
"words": [ "words": [
{"word": "bob", "start": 1.55, "end": 1.7}, {"word": "bob", "start": 2.2, "end": 2.3},
{"word": "reply", "start": 2.0, "end": 2.2} {"word": "reply", "start": 2.4, "end": 2.5}
] ]
} }
] ]
@@ -318,6 +319,97 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
} }
} }
// TestMergeDetectsResidualOverlapsAfterResolution runs a merge with the default
// module configuration and checks that overlap which is still present after
// resolution is reported in the final transcript.
//
// Alice's only timed word spans 1.0-2.0 and Bob's spans 1.5-2.5, so the two
// segments the test expects in the output (source refs "word-run:1:1:1" and
// "word-run:1:2:1") still overlap. The test requires exactly one overlap group
// covering 1-2.5 that references both segments, and two detect-overlaps report
// events that each announce one group.
func TestMergeDetectsResidualOverlapsAfterResolution(t *testing.T) {
	dir := t.TempDir()
	inputA := writeJSONFile(t, dir, "a.json", `{
		"segments": [
			{
				"start": 1,
				"end": 4,
				"text": "alice residual",
				"words": [
					{"word": "alice", "start": 1.0, "end": 2.0}
				]
			}
		]
	}`)
	inputB := writeJSONFile(t, dir, "b.json", `{
		"segments": [
			{
				"start": 1.5,
				"end": 3,
				"text": "bob residual",
				"words": [
					{"word": "bob", "start": 1.5, "end": 2.5}
				]
			}
		]
	}`)
	// Each rule maps one input file to a speaker name; the nesting under the
	// top-level "match" key is significant to the YAML parser.
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["a.json"]
  - speaker: Bob
    match: ["b.json"]
`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	// No module flags are passed, so the merge uses its default pipelines.
	err := executeMerge(
		"--input-file", inputA,
		"--input-file", inputB,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	if len(transcript.OverlapGroups) != 1 {
		t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
	}
	group := transcript.OverlapGroups[0]
	if group.ID != 1 {
		t.Fatalf("group ID = %d, want 1", group.ID)
	}
	if group.Start != 1 || group.End != 2.5 {
		t.Fatalf("group bounds = %f-%f, want 1-2.5", group.Start, group.End)
	}
	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"}
	if !equalStrings(group.Segments, wantRefs) {
		t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs)
	}
	if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) {
		t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers)
	}

	// Guard the loop below: without this check an extra segment would panic on
	// wantRefs[index], and an empty segment list would pass without checking
	// anything.
	if len(transcript.Segments) != len(wantRefs) {
		t.Fatalf("segment count = %d, want %d", len(transcript.Segments), len(wantRefs))
	}
	for index, segment := range transcript.Segments {
		if segment.ID != index+1 {
			t.Fatalf("segment %d ID = %d, want %d", index, segment.ID, index+1)
		}
		if segment.OverlapGroupID != 1 {
			t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID)
		}
		if segment.SourceRef != wantRefs[index] {
			t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index])
		}
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)

	// Collect, in report order, the messages emitted by every detect-overlaps
	// run in the postprocessing stage.
	var detectMessages []string
	for _, event := range rpt.Events {
		if event.Stage == "postprocessing" && event.Module == "detect-overlaps" {
			detectMessages = append(detectMessages, event.Message)
		}
	}
	wantMessages := []string{"detected 1 overlap group(s)", "detected 1 overlap group(s)"}
	if !equalStrings(detectMessages, wantMessages) {
		t.Fatalf("detect-overlaps messages = %v, want %v", detectMessages, wantMessages)
	}
}
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{
@@ -895,8 +987,8 @@ func TestMergeResolutionPreservesUntimedWordText(t *testing.T) {
"end": 2, "end": 2,
"text": "bob overlap", "text": "bob overlap",
"words": [ "words": [
{"word": "bob", "start": 1.16, "end": 1.25}, {"word": "bob", "start": 1.6, "end": 1.7},
{"word": "overlap", "start": 1.3, "end": 1.5} {"word": "overlap", "start": 1.6, "end": 1.8}
] ]
} }
] ]

View File

@@ -14,7 +14,7 @@ const (
DefaultInputReader = "json-files" DefaultInputReader = "json-files"
DefaultOutputModules = "json" DefaultOutputModules = "json"
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output" DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output"
DefaultOverlapWordRunGap = 0.75 DefaultOverlapWordRunGap = 0.75
OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP" OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP"
) )