Implemented an autocorrect module at the postprocessing stage

This commit is contained in:
2026-04-26 19:33:23 -05:00
parent 99d0c425d6
commit 3928e0c4a7
7 changed files with 482 additions and 6 deletions

View File

@@ -304,7 +304,7 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
"--input-file", input,
"--speakers", speakers,
"--output-file", output,
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
)
if err == nil {
t.Fatal("expected error")
@@ -314,6 +314,94 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
}
}
// TestPreprocessingAutocorrectIsUnknownModule checks that naming "autocorrect"
// in --preprocessing-modules makes the merge fail with an unknown-module
// error, even when an autocorrect file is supplied.
func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
	const wantErr = `unknown preprocessing module "autocorrect"`

	tmp := t.TempDir()
	inputPath := writeJSONFile(t, tmp, "input.json", `{"segments":[]}`)
	speakersPath := writeYAMLFile(t, tmp, "speakers.yml", `match:
- speaker: Alice
match: ["input.json"]
`)
	autocorrectPath := writeYAMLFile(t, tmp, "autocorrect.yml", `autocorrect:
- target: Hrank
match: ["Frank"]
`)
	outputPath := filepath.Join(tmp, "merged.json")

	mergeErr := executeMerge(
		"--input-file", inputPath,
		"--speakers", speakersPath,
		"--autocorrect", autocorrectPath,
		"--output-file", outputPath,
		"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
	)

	// The merge must fail, and the failure must name the rejected module.
	switch {
	case mergeErr == nil:
		t.Fatal("expected error")
	case !strings.Contains(mergeErr.Error(), wantErr):
		t.Fatalf("unexpected error: %v", mergeErr)
	}
}
// TestPostprocessingAutocorrectUpdatesOutputAndReport runs a merge with
// "autocorrect" listed in --postprocessing-modules and verifies two things:
// the segment text written to the output file has the configured replacements
// applied, and the report file contains a postprocessing/autocorrect event
// that mentions three replacements.
func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	input := writeJSONFile(t, dir, "input.json", `{
"segments": [
{"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."},
{"start": 3, "end": 4, "text": "God-free and FRANK stayed."}
]
}`)
	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
- speaker: Alice
match: ["input.json"]
`)
	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
- target: Hrank
match: ["Frank"]
- target: Mike Brown
match: ["Mike Pat"]
- target: Godfrey
match: ["God-free"]
`)
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	err := executeMerge(
		"--input-file", input,
		"--speakers", speakers,
		"--autocorrect", autocorrect,
		"--output-file", output,
		"--report-file", reportPath,
		"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)

	// Guard the indexing below so a short output produces a readable failure
	// instead of an index-out-of-range panic.
	if got := len(transcript.Segments); got < 2 {
		t.Fatalf("got %d segment(s), want at least 2", got)
	}

	// The expected text keeps "Franklin" and "FRANK" untouched while "Frank",
	// "Mike Pat" and "God-free" are replaced by their targets.
	if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want {
		t.Fatalf("segment 0 text = %q, want %q", got, want)
	}
	if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want {
		t.Fatalf("segment 1 text = %q, want %q", got, want)
	}

	var rpt report.Report
	readJSON(t, reportPath, &rpt)

	// Every postprocessing/autocorrect event must carry the replacement
	// count, and at least one such event must exist.
	found := false
	for _, event := range rpt.Events {
		if event.Stage == "postprocessing" && event.Module == "autocorrect" {
			found = true
			if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") {
				t.Fatalf("unexpected autocorrect report message: %q", event.Message)
			}
		}
	}
	if !found {
		t.Fatal("expected autocorrect report event")
	}
}
func TestOutputJSONIsByteStable(t *testing.T) {
dir := t.TempDir()
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)