Update the default postprocessing pipeline to run detect-overlaps twice
This commit is contained in:
10
README.md
10
README.md
@@ -44,7 +44,7 @@ Optional flags:
|
|||||||
- `--input-reader`: input reader module. Default: `json-files`.
|
- `--input-reader`: input reader module. Default: `json-files`.
|
||||||
- `--output-modules`: comma-separated output modules. Default: `json`.
|
- `--output-modules`: comma-separated output modules. Default: `json`.
|
||||||
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
||||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output`.
|
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output`.
|
||||||
|
|
||||||
## Input JSON Format
|
## Input JSON Format
|
||||||
|
|
||||||
@@ -150,7 +150,7 @@ The merged output uses the current seriatim envelope:
|
|||||||
"input_reader": "json-files",
|
"input_reader": "json-files",
|
||||||
"input_files": ["eric.json", "mike.json"],
|
"input_files": ["eric.json", "mike.json"],
|
||||||
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
|
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
|
||||||
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "autocorrect", "assign-ids", "validate-output"],
|
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
|
||||||
"output_modules": ["json"]
|
"output_modules": ["json"]
|
||||||
},
|
},
|
||||||
"segments": [
|
"segments": [
|
||||||
@@ -214,7 +214,7 @@ Overlap behavior:
|
|||||||
|
|
||||||
## Overlap Resolution
|
## Overlap Resolution
|
||||||
|
|
||||||
The default postprocessing pipeline runs `resolve-overlaps` after `detect-overlaps`.
|
The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then a second `detect-overlaps` pass.
|
||||||
|
|
||||||
For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments:
|
For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments:
|
||||||
|
|
||||||
@@ -227,8 +227,8 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word
|
|||||||
- Replacement segment text is built by joining word text with single spaces.
|
- Replacement segment text is built by joining word text with single spaces.
|
||||||
- Replacement segments include `source_ref` and `derived_from`.
|
- Replacement segments include `source_ref` and `derived_from`.
|
||||||
- Replacement segments omit `source_segment_index` because they are derived from one or more original segments.
|
- Replacement segments omit `source_segment_index` because they are derived from one or more original segments.
|
||||||
- Resolved overlap groups are removed from `overlap_groups`.
|
- Resolved overlap groups are removed before the second detection pass.
|
||||||
- Replacement segments are left without `overlap_group_id`; future passes can detect any remaining overlap.
|
- Replacement segments are left without `overlap_group_id` until the second detection pass annotates any remaining overlap.
|
||||||
- If a speaker has no usable word timing in a group, that speaker's original segment is kept.
|
- If a speaker has no usable word timing in a group, that speaker's original segment is kept.
|
||||||
- If no speakers in a group have usable word timing, the original group and annotations remain unchanged.
|
- If no speakers in a group have usable word timing, the original group and annotations remain unchanged.
|
||||||
|
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
|||||||
"placeholder-merger",
|
"placeholder-merger",
|
||||||
"detect-overlaps",
|
"detect-overlaps",
|
||||||
"resolve-overlaps",
|
"resolve-overlaps",
|
||||||
|
"detect-overlaps",
|
||||||
"autocorrect",
|
"autocorrect",
|
||||||
"assign-ids",
|
"assign-ids",
|
||||||
"validate-output",
|
"validate-output",
|
||||||
@@ -241,8 +242,8 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
|
|||||||
"end": 4,
|
"end": 4,
|
||||||
"text": "bob original",
|
"text": "bob original",
|
||||||
"words": [
|
"words": [
|
||||||
{"word": "bob", "start": 1.55, "end": 1.7},
|
{"word": "bob", "start": 2.2, "end": 2.3},
|
||||||
{"word": "reply", "start": 2.0, "end": 2.2}
|
{"word": "reply", "start": 2.4, "end": 2.5}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -318,6 +319,97 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMergeDetectsResidualOverlapsAfterResolution(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||||
|
"segments": [
|
||||||
|
{
|
||||||
|
"start": 1,
|
||||||
|
"end": 4,
|
||||||
|
"text": "alice residual",
|
||||||
|
"words": [
|
||||||
|
{"word": "alice", "start": 1.0, "end": 2.0}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||||
|
"segments": [
|
||||||
|
{
|
||||||
|
"start": 1.5,
|
||||||
|
"end": 3,
|
||||||
|
"text": "bob residual",
|
||||||
|
"words": [
|
||||||
|
{"word": "bob", "start": 1.5, "end": 2.5}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||||
|
- speaker: Alice
|
||||||
|
match: ["a.json"]
|
||||||
|
- speaker: Bob
|
||||||
|
match: ["b.json"]
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
reportPath := filepath.Join(dir, "report.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", inputA,
|
||||||
|
"--input-file", inputB,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
"--report-file", reportPath,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("merge failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var transcript model.FinalTranscript
|
||||||
|
readJSON(t, output, &transcript)
|
||||||
|
if len(transcript.OverlapGroups) != 1 {
|
||||||
|
t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
|
||||||
|
}
|
||||||
|
group := transcript.OverlapGroups[0]
|
||||||
|
if group.ID != 1 {
|
||||||
|
t.Fatalf("group ID = %d, want 1", group.ID)
|
||||||
|
}
|
||||||
|
if group.Start != 1 || group.End != 2.5 {
|
||||||
|
t.Fatalf("group bounds = %f-%f, want 1-2.5", group.Start, group.End)
|
||||||
|
}
|
||||||
|
wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"}
|
||||||
|
if !equalStrings(group.Segments, wantRefs) {
|
||||||
|
t.Fatalf("group refs = %v, want %v", group.Segments, wantRefs)
|
||||||
|
}
|
||||||
|
if !equalStrings(group.Speakers, []string{"Alice", "Bob"}) {
|
||||||
|
t.Fatalf("group speakers = %v, want [Alice Bob]", group.Speakers)
|
||||||
|
}
|
||||||
|
for index, segment := range transcript.Segments {
|
||||||
|
if segment.ID != index+1 {
|
||||||
|
t.Fatalf("segment %d ID = %d, want %d", index, segment.ID, index+1)
|
||||||
|
}
|
||||||
|
if segment.OverlapGroupID != 1 {
|
||||||
|
t.Fatalf("segment %d overlap group ID = %d, want 1", index, segment.OverlapGroupID)
|
||||||
|
}
|
||||||
|
if segment.SourceRef != wantRefs[index] {
|
||||||
|
t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var rpt report.Report
|
||||||
|
readJSON(t, reportPath, &rpt)
|
||||||
|
detectMessages := make([]string, 0)
|
||||||
|
for _, event := range rpt.Events {
|
||||||
|
if event.Stage == "postprocessing" && event.Module == "detect-overlaps" {
|
||||||
|
detectMessages = append(detectMessages, event.Message)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wantMessages := []string{"detected 1 overlap group(s)", "detected 1 overlap group(s)"}
|
||||||
|
if !equalStrings(detectMessages, wantMessages) {
|
||||||
|
t.Fatalf("detect-overlaps messages = %v, want %v", detectMessages, wantMessages)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
|
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{
|
input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{
|
||||||
@@ -895,8 +987,8 @@ func TestMergeResolutionPreservesUntimedWordText(t *testing.T) {
|
|||||||
"end": 2,
|
"end": 2,
|
||||||
"text": "bob overlap",
|
"text": "bob overlap",
|
||||||
"words": [
|
"words": [
|
||||||
{"word": "bob", "start": 1.16, "end": 1.25},
|
{"word": "bob", "start": 1.6, "end": 1.7},
|
||||||
{"word": "overlap", "start": 1.3, "end": 1.5}
|
{"word": "overlap", "start": 1.6, "end": 1.8}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ const (
|
|||||||
DefaultInputReader = "json-files"
|
DefaultInputReader = "json-files"
|
||||||
DefaultOutputModules = "json"
|
DefaultOutputModules = "json"
|
||||||
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
||||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output"
|
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,detect-overlaps,autocorrect,assign-ids,validate-output"
|
||||||
DefaultOverlapWordRunGap = 0.75
|
DefaultOverlapWordRunGap = 0.75
|
||||||
OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP"
|
OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user