Implemented an autocorrect module at the postprocessing stage

This commit is contained in:
2026-04-26 19:33:23 -05:00
parent 99d0c425d6
commit 3928e0c4a7
7 changed files with 482 additions and 6 deletions

View File

@@ -46,7 +46,7 @@ Optional flags:
- `--output-modules`: comma-separated output modules. Default: `json`. - `--output-modules`: comma-separated output modules. Default: `json`.
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`. - `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`.
- `--autocorrect`: autocorrect rules file. Reserved for the `autocorrect` module; not part of the default pipeline. - `--autocorrect`: autocorrect rules file. Required when the postprocessing `autocorrect` module is enabled.
## Input JSON Format ## Input JSON Format
@@ -163,9 +163,46 @@ Segments are sorted deterministically by:
Final segment IDs are assigned after sorting and start at `1`. Final segment IDs are assigned after sorting and start at `1`.
## Autocorrect
Autocorrect is an opt-in postprocessing module. It is not part of the default pipeline.
Enable it by adding `autocorrect` to `--postprocessing-modules` and passing `--autocorrect`:
```sh
go run ./cmd/seriatim merge \
--input-file input.json \
--speakers speakers.yml \
--autocorrect autocorrect.yml \
--postprocessing-modules detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output \
--output-file merged.json
```
`autocorrect.yml` format:
```yaml
autocorrect:
  - target: "Hrank"
    match:
      - "hrank"
      - "Frank"
  - target: "Mike Brown"
    match:
      - "Mike Pat"
```
Matching behavior:
- Matching is case-sensitive.
- Matches apply only to whole tokens, not substrings inside larger words.
- Punctuation and whitespace can surround a match.
- Multi-word and hyphenated matches are supported.
- Duplicate match strings are invalid, including duplicates across separate rules.
## Current Limitations ## Current Limitations
- Only JSON input is supported. - Only JSON input is supported.
- Word-level timing data is not preserved yet. - Word-level timing data is not preserved yet.
- Overlap detection and overlap resolution are currently no-op modules. - Overlap detection and overlap resolution are currently no-op modules.
- Autocorrect, coalescing, and alternate output formats are not implemented yet. - Coalescing and alternate output formats are not implemented yet.

View File

@@ -0,0 +1,132 @@
package autocorrect
import (
	"fmt"
	"os"
	"strings"
	"unicode"
	"unicode/utf8"

	"gopkg.in/yaml.v3"
)
// Rules is the validated, ordered set of autocorrect rules produced by Load.
// Apply walks the rules in the order they are stored here.
type Rules struct {
// rules holds the individual replacement rules in application order.
rules []Rule
}
// Rule describes one replacement: every string listed in Match is rewritten
// to Target. Match entries are tried in the order they are listed.
type Rule struct {
// Target is the canonical replacement text; Load trims surrounding
// whitespace and rejects an empty value.
Target string `yaml:"target"`
// Match lists the strings to replace with Target; Load trims each entry
// and rejects empty or duplicate entries.
Match []string `yaml:"match"`
}
// fileSchema mirrors the top-level layout of an autocorrect YAML file: a
// single "autocorrect" key holding the list of rules.
type fileSchema struct {
// Autocorrect is the raw, not yet validated rule list decoded from the file.
Autocorrect []Rule `yaml:"autocorrect"`
}
// Load reads the YAML file at path, validates it, and returns the resulting
// rule set.
//
// Validation fails when the file cannot be read or parsed, when it contains
// no rules, when a rule has an empty target or no match strings, when a match
// string is empty after trimming, or when a match string is repeated either
// inside one rule or across two rules. Targets and match strings are stored
// with surrounding whitespace removed. Rule and match positions reported in
// error messages are zero-based.
func Load(path string) (Rules, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return Rules{}, err
	}

	var doc fileSchema
	if err := yaml.Unmarshal(raw, &doc); err != nil {
		return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
	}
	if len(doc.Autocorrect) == 0 {
		return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
	}

	// owner remembers which rule first declared each match string so that a
	// clash between two rules can name both of them.
	owner := make(map[string]int)
	validated := make([]Rule, 0, len(doc.Autocorrect))

	for i := range doc.Autocorrect {
		current := doc.Autocorrect[i]
		current.Target = strings.TrimSpace(current.Target)

		switch {
		case current.Target == "":
			return Rules{}, fmt.Errorf("autocorrect rule %d must include target", i)
		case len(current.Match) == 0:
			return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", i, current.Target)
		}

		// inRule tracks duplicates inside this rule; it is checked before the
		// cross-rule map so a repeat within one rule gets the more specific error.
		inRule := make(map[string]struct{}, len(current.Match))
		for j, candidate := range current.Match {
			trimmed := strings.TrimSpace(candidate)
			if trimmed == "" {
				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", i, current.Target, j)
			}
			if _, repeated := inRule[trimmed]; repeated {
				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", i, current.Target, trimmed)
			}
			inRule[trimmed] = struct{}{}

			if earlier, taken := owner[trimmed]; taken {
				return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", trimmed, earlier, i)
			}
			owner[trimmed] = i

			current.Match[j] = trimmed
		}

		validated = append(validated, current)
	}

	return Rules{rules: validated}, nil
}
// Apply rewrites text with every configured rule and reports how many
// replacements were made in total.
//
// Rules are visited in stored order and, within a rule, match strings in
// listed order. Each replacement pass runs on the output of the previous one.
func (r Rules) Apply(text string) (string, int) {
	replaced := 0
	for ruleIdx := range r.rules {
		canonical := r.rules[ruleIdx].Target
		for _, variant := range r.rules[ruleIdx].Match {
			updated, n := replaceWholeToken(text, variant, canonical)
			text = updated
			replaced += n
		}
	}
	return text, replaced
}
// replaceWholeToken substitutes target for every occurrence of match in text
// that has a token boundary on both sides, and returns the rewritten text
// together with the number of substitutions. When nothing is replaced, or
// when text or match is empty, the input text is returned unchanged.
func replaceWholeToken(text string, match string, target string) (string, int) {
	if len(text) == 0 || len(match) == 0 {
		return text, 0
	}

	var out strings.Builder
	count := 0
	copied := 0 // everything before this offset has already been written to out

	for cursor := 0; ; {
		offset := strings.Index(text[cursor:], match)
		if offset < 0 {
			break
		}
		start := cursor + offset
		stop := start + len(match)

		if !isTokenBoundary(text, start-1) || !isTokenBoundary(text, stop) {
			// Embedded in a larger token: resume one byte further on so a later,
			// properly delimited occurrence is still found.
			cursor = start + 1
			continue
		}

		out.WriteString(text[copied:start])
		out.WriteString(target)
		count++
		copied, cursor = stop, stop
	}

	if count == 0 {
		return text, 0
	}
	out.WriteString(text[copied:])
	return out.String(), count
}
// isTokenBoundary reports whether the position index in text delimits a token.
//
// Positions outside the string (index < 0 or index >= len(text)) are always
// boundaries. Otherwise the character that contains the byte at index is
// examined: letters, digits, combining marks and '_' are word characters, and
// everything else is a boundary. index may point at any byte of a multi-byte
// UTF-8 sequence, so callers can pass "byte before the match" and "byte after
// the match" without knowing rune widths. Bytes that are not part of a valid
// UTF-8 sequence are treated as boundaries.
func isTokenBoundary(text string, index int) bool {
	if index < 0 || index >= len(text) {
		return true
	}

	// Walk back to the first byte of the rune containing index. A UTF-8
	// sequence is at most utf8.UTFMax bytes long, so never back up further.
	start := index
	for start > 0 && index-start < utf8.UTFMax-1 && !utf8.RuneStart(text[start]) {
		start--
	}

	r, size := utf8.DecodeRuneInString(text[start:])
	if start+size <= index {
		// The decoded rune ends before index, so the byte at index is a stray
		// continuation byte rather than part of a character.
		return true
	}

	return !(r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r))
}

View File

@@ -0,0 +1,190 @@
package autocorrect
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestLoadValidRules writes a one-rule autocorrect file, loads it, and checks
// that both listed match strings are replaced by the target and counted.
func TestLoadValidRules(t *testing.T) {
dir := t.TempDir()
path := writeAutocorrect(t, dir, `autocorrect:
- target: "Hrank"
match:
- "Frank"
- "frank"
`)
// Loading a well-formed file must succeed.
rules, err := Load(path)
if err != nil {
t.Fatalf("load rules: %v", err)
}
// Both spellings in the input map to the same target, giving two replacements.
got, count := rules.Apply("Frank and frank")
if got != "Hrank and Hrank" {
t.Fatalf("text = %q, want %q", got, "Hrank and Hrank")
}
if count != 2 {
t.Fatalf("count = %d, want 2", count)
}
}
// TestLoadValidation feeds Load a series of invalid autocorrect files and
// checks that each one is rejected with an error containing the expected text.
func TestLoadValidation(t *testing.T) {
// Each case pairs file content with a substring the resulting error must contain.
tests := []struct {
name string
content string
want string
}{
{
name: "missing top-level autocorrect",
content: `other: []`,
want: "must contain at least one autocorrect rule",
},
{
name: "empty rules list",
content: `autocorrect: []`,
want: "must contain at least one autocorrect rule",
},
{
name: "empty target",
content: `autocorrect:
- target: ""
match: ["Frank"]
`,
want: "must include target",
},
{
name: "empty match list",
content: `autocorrect:
- target: "Hrank"
match: []
`,
want: "must include at least one match string",
},
{
name: "empty match string",
content: `autocorrect:
- target: "Hrank"
match: [" "]
`,
want: "contains empty match string",
},
{
name: "duplicate match across rules",
content: `autocorrect:
- target: "Hrank"
match: ["Frank"]
- target: "Other"
match: ["Frank"]
`,
want: `appears in both rule 0 and rule 1`,
},
{
name: "duplicate match within rule",
content: `autocorrect:
- target: "Hrank"
match: ["Frank", "Frank"]
`,
want: `contains duplicate match string "Frank"`,
},
}
// Every case runs as its own subtest against a fresh temporary directory.
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
dir := t.TempDir()
path := writeAutocorrect(t, dir, test.content)
_, err := Load(path)
if err == nil {
t.Fatal("expected error")
}
if !strings.Contains(err.Error(), test.want) {
t.Fatalf("expected error to contain %q, got %v", test.want, err)
}
})
}
}
// TestApplyReplacementBehavior exercises Apply on a hand-built rule set and
// checks case sensitivity, punctuation boundaries, rejection of matches
// embedded in larger tokens, multi-word matches, and hyphen handling.
func TestApplyReplacementBehavior(t *testing.T) {
	type scenario struct {
		name      string
		input     string
		want      string
		wantCount int
	}

	// The rule set is constructed directly, bypassing Load.
	subject := Rules{rules: []Rule{
		{Target: "Hrank", Match: []string{"Frank"}},
		{Target: "Mike Brown", Match: []string{"Mike Pat"}},
		{Target: "Godfrey", Match: []string{"God-free"}},
	}}

	scenarios := []scenario{
		{name: "case sensitive", input: "Frank and FRANK", want: "Hrank and FRANK", wantCount: 1},
		{name: "punctuation boundary", input: "Frank, are you there?", want: "Hrank, are you there?", wantCount: 1},
		{name: "no substring in larger token", input: "Franklin and xFrank Frank_y Frank2", want: "Franklin and xFrank Frank_y Frank2", wantCount: 0},
		{name: "multi word match", input: "Hello Mike Pat.", want: "Hello Mike Brown.", wantCount: 1},
		{name: "hyphenated match", input: "God-free is here.", want: "Godfrey is here.", wantCount: 1},
		{name: "hyphen outside match is boundary", input: "x-Frank-y", want: "x-Hrank-y", wantCount: 1},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			text, n := subject.Apply(sc.input)
			if text != sc.want {
				t.Fatalf("text = %q, want %q", text, sc.want)
			}
			if n != sc.wantCount {
				t.Fatalf("count = %d, want %d", n, sc.wantCount)
			}
		})
	}
}
// writeAutocorrect stores content as "autocorrect.yml" inside dir with
// owner-only permissions and returns the file's path. A write failure aborts
// the calling test.
func writeAutocorrect(t *testing.T, dir string, content string) string {
	t.Helper()

	target := filepath.Join(dir, "autocorrect.yml")
	err := os.WriteFile(target, []byte(content), 0o600)
	if err != nil {
		t.Fatalf("write autocorrect file: %v", err)
	}
	return target
}

View File

@@ -2,7 +2,9 @@ package builtin
import ( import (
"context" "context"
"fmt"
"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
"gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/report" "gitea.maximumdirect.net/eric/seriatim/internal/report"
@@ -45,3 +47,31 @@ func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg con
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"), report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
}, nil }, nil
} }
// autocorrectPostprocessor is the postprocessing module that rewrites segment
// text using the rules file named in the configuration.
type autocorrectPostprocessor struct{}

// Name returns the identifier under which this module is registered.
func (autocorrectPostprocessor) Name() string {
	return "autocorrect"
}

// Process loads the autocorrect rules from cfg.AutocorrectFile, applies them
// to the text of every segment of in, and returns the transcript together
// with one informational report event stating the total replacement count.
//
// It returns early with the context's error if ctx is already done, and with
// the load error if the rules file cannot be loaded. Segment text is updated
// in place on in.Segments.
func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
	if ctxErr := ctx.Err(); ctxErr != nil {
		return model.MergedTranscript{}, nil, ctxErr
	}

	ruleSet, loadErr := autocorrect.Load(cfg.AutocorrectFile)
	if loadErr != nil {
		return model.MergedTranscript{}, nil, loadErr
	}

	total := 0
	for i := range in.Segments {
		corrected, n := ruleSet.Apply(in.Segments[i].Text)
		in.Segments[i].Text = corrected
		total += n
	}

	summary := fmt.Sprintf("applied %d autocorrect replacement(s)", total)
	events := []report.Event{
		report.Info("postprocessing", "autocorrect", summary),
	}
	return in, events, nil
}

View File

@@ -10,13 +10,12 @@ func NewRegistry() *pipeline.Registry {
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw}) registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
registry.RegisterPreprocessor(normalizeSpeakers{}) registry.RegisterPreprocessor(normalizeSpeakers{})
registry.RegisterPreprocessor(trimText{}) registry.RegisterPreprocessor(trimText{})
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
registry.RegisterMerger(placeholderMerger{}) registry.RegisterMerger(placeholderMerger{})
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"}) registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"}) registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
registry.RegisterPostprocessor(assignIDs{}) registry.RegisterPostprocessor(assignIDs{})
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"}) registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"}) registry.RegisterPostprocessor(autocorrectPostprocessor{})
registry.RegisterOutputWriter(jsonOutputWriter{}) registry.RegisterOutputWriter(jsonOutputWriter{})
return registry return registry

View File

@@ -304,7 +304,7 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
"--input-file", input, "--input-file", input,
"--speakers", speakers, "--speakers", speakers,
"--output-file", output, "--output-file", output,
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect", "--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
) )
if err == nil { if err == nil {
t.Fatal("expected error") t.Fatal("expected error")
@@ -314,6 +314,94 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
} }
} }
// TestPreprocessingAutocorrectIsUnknownModule checks that listing
// "autocorrect" in --preprocessing-modules fails with an unknown-module
// error, even when a valid --autocorrect file is supplied.
func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
- speaker: Alice
match: ["input.json"]
`)
autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
- target: Hrank
match: ["Frank"]
`)
output := filepath.Join(dir, "merged.json")
// Request autocorrect at the preprocessing stage.
err := executeMerge(
"--input-file", input,
"--speakers", speakers,
"--autocorrect", autocorrect,
"--output-file", output,
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
)
if err == nil {
t.Fatal("expected error")
}
// The failure must identify autocorrect as an unknown preprocessing module.
if !strings.Contains(err.Error(), `unknown preprocessing module "autocorrect"`) {
t.Fatalf("unexpected error: %v", err)
}
}
// TestPostprocessingAutocorrectUpdatesOutputAndReport runs a merge with the
// postprocessing autocorrect module enabled and checks both the corrected
// segment text in the output file and the replacement count recorded in the
// report file.
func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{
"segments": [
{"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."},
{"start": 3, "end": 4, "text": "God-free and FRANK stayed."}
]
}`)
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
- speaker: Alice
match: ["input.json"]
`)
autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
- target: Hrank
match: ["Frank"]
- target: Mike Brown
match: ["Mike Pat"]
- target: Godfrey
match: ["God-free"]
`)
output := filepath.Join(dir, "merged.json")
reportPath := filepath.Join(dir, "report.json")
// autocorrect is placed between resolve-overlaps and assign-ids in the module list.
err := executeMerge(
"--input-file", input,
"--speakers", speakers,
"--autocorrect", autocorrect,
"--output-file", output,
"--report-file", reportPath,
"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
)
if err != nil {
t.Fatalf("merge failed: %v", err)
}
// "Franklin" and "FRANK" must be left untouched; the other three matches are replaced.
var transcript model.FinalTranscript
readJSON(t, output, &transcript)
if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want {
t.Fatalf("segment 0 text = %q, want %q", got, want)
}
if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want {
t.Fatalf("segment 1 text = %q, want %q", got, want)
}
// The report must contain an autocorrect event stating three replacements.
var rpt report.Report
readJSON(t, reportPath, &rpt)
found := false
for _, event := range rpt.Events {
if event.Stage == "postprocessing" && event.Module == "autocorrect" {
found = true
if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") {
t.Fatalf("unexpected autocorrect report message: %q", event.Message)
}
}
}
if !found {
t.Fatal("expected autocorrect report event")
}
}
func TestOutputJSONIsByteStable(t *testing.T) { func TestOutputJSONIsByteStable(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`) inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)

View File

@@ -111,7 +111,7 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
} }
} }
if contains(cfg.PreprocessingModules, "autocorrect") || contains(cfg.PostprocessingModules, "autocorrect") { if contains(cfg.PostprocessingModules, "autocorrect") {
if cfg.AutocorrectFile == "" { if cfg.AutocorrectFile == "" {
return Config{}, errors.New("--autocorrect is required when autocorrect is enabled") return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
} }