Implemented an autocorrect module at the postprocessing stage

2026-04-26 19:33:23 -05:00
parent 99d0c425d6
commit 3928e0c4a7
7 changed files with 482 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Optional flags:
 - `--output-modules`: comma-separated output modules. Default: `json`.
 - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
 - `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`.
- `--autocorrect`: autocorrect rules file. Reserved for the `autocorrect` module; not part of the default pipeline.
+- `--autocorrect`: autocorrect rules file. Required when the postprocessing `autocorrect` module is enabled.
 ## Input JSON Format
@@ -163,9 +163,46 @@ Segments are sorted deterministically by:
 Final segment IDs are assigned after sorting and start at `1`.
 ## Autocorrect
 Autocorrect is an opt-in postprocessing module. It is not part of the default pipeline.
 Enable it by adding `autocorrect` to `--postprocessing-modules` and passing `--autocorrect`:
 ```sh
 go run ./cmd/seriatim merge \
  --input-file input.json \
  --speakers speakers.yml \
  --autocorrect autocorrect.yml \
  --postprocessing-modules detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output \
  --output-file merged.json
 ```
 `autocorrect.yml` format:
 ```yaml
 autocorrect:
  - target: "Hrank"
    match:
      - "hrank"
      - "Frank"
  - target: "Mike Brown"
    match:
      - "Mike Pat"
 ```
 Matching behavior:
 - Matching is case-sensitive.
 - Matches apply only to whole tokens, not substrings inside larger words.
 - Punctuation and whitespace can surround a match.
 - Multi-word and hyphenated matches are supported.
 - Duplicate match strings are invalid, including duplicates across separate rules.
 ## Current Limitations
 - Only JSON input is supported.
 - Word-level timing data is not preserved yet.
 - Overlap detection and overlap resolution are currently no-op modules.
- Autocorrect, coalescing, and alternate output formats are not implemented yet.
+- Coalescing and alternate output formats are not implemented yet.
--- a/internal/autocorrect/autocorrect.go
+++ b/internal/autocorrect/autocorrect.go
@@ -0,0 +1,132 @@
 package autocorrect
 import (
 	"fmt"
 	"os"
 	"strings"
 	"gopkg.in/yaml.v3"
 )
 // Rules stores ordered autocorrect replacement rules.
 type Rules struct {
 	rules []Rule
 }
 // Rule replaces ordered match strings with a canonical target.
 type Rule struct {
 	Target string   `yaml:"target"`
 	Match  []string `yaml:"match"`
 }
 type fileSchema struct {
 	Autocorrect []Rule `yaml:"autocorrect"`
 }
 // Load parses and validates an autocorrect.yml file.
 func Load(path string) (Rules, error) {
 	data, err := os.ReadFile(path)
 	if err != nil {
 		return Rules{}, err
 	}
 	var parsed fileSchema
 	if err := yaml.Unmarshal(data, &parsed); err != nil {
 		return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
 	}
 	if len(parsed.Autocorrect) == 0 {
 		return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
 	}
 	seenMatches := make(map[string]int)
 	rules := make([]Rule, 0, len(parsed.Autocorrect))
 	for ruleIndex, rule := range parsed.Autocorrect {
 		rule.Target = strings.TrimSpace(rule.Target)
 		if rule.Target == "" {
 			return Rules{}, fmt.Errorf("autocorrect rule %d must include target", ruleIndex)
 		}
 		if len(rule.Match) == 0 {
 			return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", ruleIndex, rule.Target)
 		}
 		localMatches := make(map[string]struct{}, len(rule.Match))
 		for matchIndex, match := range rule.Match {
 			match = strings.TrimSpace(match)
 			if match == "" {
 				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", ruleIndex, rule.Target, matchIndex)
 			}
 			if _, exists := localMatches[match]; exists {
 				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", ruleIndex, rule.Target, match)
 			}
 			localMatches[match] = struct{}{}
 			if previousRuleIndex, exists := seenMatches[match]; exists {
 				return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", match, previousRuleIndex, ruleIndex)
 			}
 			seenMatches[match] = ruleIndex
 			rule.Match[matchIndex] = match
 		}
 		rules = append(rules, rule)
 	}
 	return Rules{rules: rules}, nil
 }
 // Apply replaces configured whole-token matches and returns the updated text and replacement count.
 func (r Rules) Apply(text string) (string, int) {
 	total := 0
 	for _, rule := range r.rules {
 		for _, match := range rule.Match {
 			var count int
 			text, count = replaceWholeToken(text, match, rule.Target)
 			total += count
 		}
 	}
 	return text, total
 }
 func replaceWholeToken(text string, match string, target string) (string, int) {
 	if text == "" || match == "" {
 		return text, 0
 	}
 	var builder strings.Builder
 	replacements := 0
 	searchStart := 0
 	writeStart := 0
 	for {
 		index := strings.Index(text[searchStart:], match)
 		if index == -1 {
 			break
 		}
 		index += searchStart
 		end := index + len(match)
 		if isTokenBoundary(text, index-1) && isTokenBoundary(text, end) {
 			builder.WriteString(text[writeStart:index])
 			builder.WriteString(target)
 			replacements++
 			writeStart = end
 			searchStart = end
 			continue
 		}
 		searchStart = index + 1
 	}
 	if replacements == 0 {
 		return text, 0
 	}
 	builder.WriteString(text[writeStart:])
 	return builder.String(), replacements
 }
 func isTokenBoundary(text string, index int) bool {
 	if index < 0 || index >= len(text) {
 		return true
 	}
 	char := text[index]
 	return !((char >= 'A' && char <= 'Z') || (char >= 'a' && char <= 'z') || (char >= '0' && char <= '9') || char == '_')
 }
--- a/internal/autocorrect/autocorrect_test.go
+++ b/internal/autocorrect/autocorrect_test.go
@@ -0,0 +1,190 @@
 package autocorrect
 import (
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 )
 func TestLoadValidRules(t *testing.T) {
 	dir := t.TempDir()
 	path := writeAutocorrect(t, dir, `autocorrect:
  - target: "Hrank"
    match:
      - "Frank"
      - "frank"
 `)
 	rules, err := Load(path)
 	if err != nil {
 		t.Fatalf("load rules: %v", err)
 	}
 	got, count := rules.Apply("Frank and frank")
 	if got != "Hrank and Hrank" {
 		t.Fatalf("text = %q, want %q", got, "Hrank and Hrank")
 	}
 	if count != 2 {
 		t.Fatalf("count = %d, want 2", count)
 	}
 }
 func TestLoadValidation(t *testing.T) {
 	tests := []struct {
 		name    string
 		content string
 		want    string
 	}{
 		{
 			name:    "missing top-level autocorrect",
 			content: `other: []`,
 			want:    "must contain at least one autocorrect rule",
 		},
 		{
 			name:    "empty rules list",
 			content: `autocorrect: []`,
 			want:    "must contain at least one autocorrect rule",
 		},
 		{
 			name: "empty target",
 			content: `autocorrect:
  - target: ""
    match: ["Frank"]
 `,
 			want: "must include target",
 		},
 		{
 			name: "empty match list",
 			content: `autocorrect:
  - target: "Hrank"
    match: []
 `,
 			want: "must include at least one match string",
 		},
 		{
 			name: "empty match string",
 			content: `autocorrect:
  - target: "Hrank"
    match: [" "]
 `,
 			want: "contains empty match string",
 		},
 		{
 			name: "duplicate match across rules",
 			content: `autocorrect:
  - target: "Hrank"
    match: ["Frank"]
  - target: "Other"
    match: ["Frank"]
 `,
 			want: `appears in both rule 0 and rule 1`,
 		},
 		{
 			name: "duplicate match within rule",
 			content: `autocorrect:
  - target: "Hrank"
    match: ["Frank", "Frank"]
 `,
 			want: `contains duplicate match string "Frank"`,
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			dir := t.TempDir()
 			path := writeAutocorrect(t, dir, test.content)
 			_, err := Load(path)
 			if err == nil {
 				t.Fatal("expected error")
 			}
 			if !strings.Contains(err.Error(), test.want) {
 				t.Fatalf("expected error to contain %q, got %v", test.want, err)
 			}
 		})
 	}
 }
 func TestApplyReplacementBehavior(t *testing.T) {
 	rules := Rules{rules: []Rule{
 		{
 			Target: "Hrank",
 			Match:  []string{"Frank"},
 		},
 		{
 			Target: "Mike Brown",
 			Match:  []string{"Mike Pat"},
 		},
 		{
 			Target: "Godfrey",
 			Match:  []string{"God-free"},
 		},
 	}}
 	tests := []struct {
 		name      string
 		input     string
 		want      string
 		wantCount int
 	}{
 		{
 			name:      "case sensitive",
 			input:     "Frank and FRANK",
 			want:      "Hrank and FRANK",
 			wantCount: 1,
 		},
 		{
 			name:      "punctuation boundary",
 			input:     "Frank, are you there?",
 			want:      "Hrank, are you there?",
 			wantCount: 1,
 		},
 		{
 			name:      "no substring in larger token",
 			input:     "Franklin and xFrank Frank_y Frank2",
 			want:      "Franklin and xFrank Frank_y Frank2",
 			wantCount: 0,
 		},
 		{
 			name:      "multi word match",
 			input:     "Hello Mike Pat.",
 			want:      "Hello Mike Brown.",
 			wantCount: 1,
 		},
 		{
 			name:      "hyphenated match",
 			input:     "God-free is here.",
 			want:      "Godfrey is here.",
 			wantCount: 1,
 		},
 		{
 			name:      "hyphen outside match is boundary",
 			input:     "x-Frank-y",
 			want:      "x-Hrank-y",
 			wantCount: 1,
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			got, count := rules.Apply(test.input)
 			if got != test.want {
 				t.Fatalf("text = %q, want %q", got, test.want)
 			}
 			if count != test.wantCount {
 				t.Fatalf("count = %d, want %d", count, test.wantCount)
 			}
 		})
 	}
 }
 func writeAutocorrect(t *testing.T, dir string, content string) string {
 	t.Helper()
 	path := filepath.Join(dir, "autocorrect.yml")
 	if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
 		t.Fatalf("write autocorrect file: %v", err)
 	}
 	return path
 }
--- a/internal/builtin/postprocess.go
+++ b/internal/builtin/postprocess.go
@@ -2,7 +2,9 @@ package builtin
 import (
 	"context"
 	"fmt"
 	"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
 	"gitea.maximumdirect.net/eric/seriatim/internal/config"
 	"gitea.maximumdirect.net/eric/seriatim/internal/model"
 	"gitea.maximumdirect.net/eric/seriatim/internal/report"
@@ -45,3 +47,31 @@ func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg con
 		report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
 	}, nil
 }
 type autocorrectPostprocessor struct{}
 func (autocorrectPostprocessor) Name() string {
 	return "autocorrect"
 }
 func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
 	if err := ctx.Err(); err != nil {
 		return model.MergedTranscript{}, nil, err
 	}
 	rules, err := autocorrect.Load(cfg.AutocorrectFile)
 	if err != nil {
 		return model.MergedTranscript{}, nil, err
 	}
 	replacements := 0
 	for index := range in.Segments {
 		var count int
 		in.Segments[index].Text, count = rules.Apply(in.Segments[index].Text)
 		replacements += count
 	}
 	return in, []report.Event{
 		report.Info("postprocessing", "autocorrect", fmt.Sprintf("applied %d autocorrect replacement(s)", replacements)),
 	}, nil
 }
--- a/internal/builtin/registry.go
+++ b/internal/builtin/registry.go
@@ -10,13 +10,12 @@ func NewRegistry() *pipeline.Registry {
 	registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
 	registry.RegisterPreprocessor(normalizeSpeakers{})
 	registry.RegisterPreprocessor(trimText{})
 	registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
 	registry.RegisterMerger(placeholderMerger{})
 	registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
 	registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
 	registry.RegisterPostprocessor(assignIDs{})
 	registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
-	registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"})
+	registry.RegisterPostprocessor(autocorrectPostprocessor{})
 	registry.RegisterOutputWriter(jsonOutputWriter{})
 	return registry
--- a/internal/cli/merge_test.go
+++ b/internal/cli/merge_test.go
@@ -304,7 +304,7 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
 		"--input-file", input,
 		"--speakers", speakers,
 		"--output-file", output,
-		"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
+		"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
 	)
 	if err == nil {
 		t.Fatal("expected error")
@@ -314,6 +314,94 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
 	}
 }
 func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
 	dir := t.TempDir()
 	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
 	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
 `)
 	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
  - target: Hrank
    match: ["Frank"]
 `)
 	output := filepath.Join(dir, "merged.json")
 	err := executeMerge(
 		"--input-file", input,
 		"--speakers", speakers,
 		"--autocorrect", autocorrect,
 		"--output-file", output,
 		"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
 	)
 	if err == nil {
 		t.Fatal("expected error")
 	}
 	if !strings.Contains(err.Error(), `unknown preprocessing module "autocorrect"`) {
 		t.Fatalf("unexpected error: %v", err)
 	}
 }
 func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
 	dir := t.TempDir()
 	input := writeJSONFile(t, dir, "input.json", `{
 		"segments": [
 			{"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."},
 			{"start": 3, "end": 4, "text": "God-free and FRANK stayed."}
 		]
 	}`)
 	speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
  - speaker: Alice
    match: ["input.json"]
 `)
 	autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
  - target: Hrank
    match: ["Frank"]
  - target: Mike Brown
    match: ["Mike Pat"]
  - target: Godfrey
    match: ["God-free"]
 `)
 	output := filepath.Join(dir, "merged.json")
 	reportPath := filepath.Join(dir, "report.json")
 	err := executeMerge(
 		"--input-file", input,
 		"--speakers", speakers,
 		"--autocorrect", autocorrect,
 		"--output-file", output,
 		"--report-file", reportPath,
 		"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
 	)
 	if err != nil {
 		t.Fatalf("merge failed: %v", err)
 	}
 	var transcript model.FinalTranscript
 	readJSON(t, output, &transcript)
 	if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want {
 		t.Fatalf("segment 0 text = %q, want %q", got, want)
 	}
 	if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want {
 		t.Fatalf("segment 1 text = %q, want %q", got, want)
 	}
 	var rpt report.Report
 	readJSON(t, reportPath, &rpt)
 	found := false
 	for _, event := range rpt.Events {
 		if event.Stage == "postprocessing" && event.Module == "autocorrect" {
 			found = true
 			if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") {
 				t.Fatalf("unexpected autocorrect report message: %q", event.Message)
 			}
 		}
 	}
 	if !found {
 		t.Fatal("expected autocorrect report event")
 	}
 }
 func TestOutputJSONIsByteStable(t *testing.T) {
 	dir := t.TempDir()
 	inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -111,7 +111,7 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
 		}
 	}
-	if contains(cfg.PreprocessingModules, "autocorrect") || contains(cfg.PostprocessingModules, "autocorrect") {
+	if contains(cfg.PostprocessingModules, "autocorrect") {
 		if cfg.AutocorrectFile == "" {
 			return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
 		}