Implemented an autocorrect module at the postprocessing stage

This commit is contained in:
2026-04-26 19:33:23 -05:00
parent 99d0c425d6
commit 3928e0c4a7
7 changed files with 482 additions and 6 deletions

View File

@@ -0,0 +1,132 @@
package autocorrect
import (
	"fmt"
	"os"
	"strings"
	"unicode"
	"unicode/utf8"

	"gopkg.in/yaml.v3"
)
// Rules is the validated, ordered set of autocorrect rules returned by Load.
// Apply walks the rules in slice order, so earlier rules run before later ones.
type Rules struct {
// rules holds the rules in the order Load read them from the file.
rules []Rule
}
// Rule maps one or more match strings onto a single canonical replacement.
// Both fields are decoded from YAML using the struct tags below.
type Rule struct {
// Target is the text substituted for every matched string.
Target string `yaml:"target"`
// Match lists the strings to replace; Apply tries them in slice order.
Match []string `yaml:"match"`
}
// fileSchema mirrors the top-level layout of an autocorrect YAML file: a
// single "autocorrect" key holding the list of rules.
type fileSchema struct {
Autocorrect []Rule `yaml:"autocorrect"`
}
// Load reads the YAML file at path and returns its validated autocorrect rules.
//
// Targets and match strings are whitespace-trimmed before validation. Load
// returns an error when the file cannot be read or parsed, when it holds no
// rules, when a rule has an empty target or no match strings, when a match
// string is empty, or when a match string is repeated within a rule or across
// rules. Rule and match indices in error messages are zero-based.
func Load(path string) (Rules, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return Rules{}, err
	}

	var schema fileSchema
	if err := yaml.Unmarshal(raw, &schema); err != nil {
		return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
	}
	if len(schema.Autocorrect) == 0 {
		return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
	}

	// owners remembers which rule first claimed each match string so that a
	// later rule reusing it can be reported with both rule indices.
	owners := make(map[string]int)
	validated := make([]Rule, 0, len(schema.Autocorrect))
	for i, entry := range schema.Autocorrect {
		entry.Target = strings.TrimSpace(entry.Target)
		switch {
		case entry.Target == "":
			return Rules{}, fmt.Errorf("autocorrect rule %d must include target", i)
		case len(entry.Match) == 0:
			return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", i, entry.Target)
		}

		inRule := make(map[string]struct{}, len(entry.Match))
		for j := range entry.Match {
			trimmed := strings.TrimSpace(entry.Match[j])
			if trimmed == "" {
				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", i, entry.Target, j)
			}
			// The per-rule check runs first so that a repeat inside one rule
			// is reported as a duplicate rather than as a cross-rule clash.
			if _, repeated := inRule[trimmed]; repeated {
				return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", i, entry.Target, trimmed)
			}
			if owner, taken := owners[trimmed]; taken {
				return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", trimmed, owner, i)
			}
			inRule[trimmed] = struct{}{}
			owners[trimmed] = i
			entry.Match[j] = trimmed
		}
		validated = append(validated, entry)
	}

	return Rules{rules: validated}, nil
}
// Apply runs every rule over text and returns the rewritten text together with
// the total number of replacements made.
//
// Rules, and the match strings within each rule, are processed in order; each
// pass operates on the output of the previous one.
func (r Rules) Apply(text string) (string, int) {
	replaced := 0
	for i := range r.rules {
		current := r.rules[i]
		for _, pattern := range current.Match {
			updated, n := replaceWholeToken(text, pattern, current.Target)
			text = updated
			replaced += n
		}
	}
	return text, replaced
}
// replaceWholeToken substitutes target for every occurrence of match in text
// whose neighbouring positions on both sides are token boundaries, as judged
// by isTokenBoundary. It returns the resulting text and the number of
// substitutions; when nothing is replaced the original text is returned as is.
func replaceWholeToken(text string, match string, target string) (string, int) {
	if len(text) == 0 || len(match) == 0 {
		return text, 0
	}

	var out strings.Builder
	count := 0
	copied := 0 // text[:copied] has already been emitted to out
	for from := 0; ; {
		offset := strings.Index(text[from:], match)
		if offset < 0 {
			break
		}
		start := from + offset
		stop := start + len(match)

		if !isTokenBoundary(text, start-1) || !isTokenBoundary(text, stop) {
			// Embedded in a larger token: resume one byte further on so
			// that overlapping candidates are still examined.
			from = start + 1
			continue
		}

		out.WriteString(text[copied:start])
		out.WriteString(target)
		count++
		copied, from = stop, stop
	}

	if count == 0 {
		return text, 0
	}
	out.WriteString(text[copied:])
	return out.String(), count
}
// isTokenBoundary reports whether the character covering byte position index
// in text separates tokens, i.e. is not a word character.
//
// Positions outside text (index < 0 or index >= len(text)) count as
// boundaries. Word characters are letters, digits, combining marks and '_'.
// index may point at any byte of a multi-byte UTF-8 sequence: the rune that
// contains that byte is decoded and classified as a whole, so a match next to
// a non-ASCII letter (for example "é" or "日") is correctly seen as part of a
// larger token. Bytes that are not valid UTF-8 count as boundaries.
func isTokenBoundary(text string, index int) bool {
	if index < 0 || index >= len(text) {
		return true
	}

	char := text[index]
	if char < utf8.RuneSelf {
		isWord := char == '_' ||
			(char >= 'A' && char <= 'Z') ||
			(char >= 'a' && char <= 'z') ||
			(char >= '0' && char <= '9')
		return !isWord
	}

	// Callers pass the byte just before a match, which for a multi-byte rune
	// is a continuation byte; back up (at most UTFMax-1 bytes) to its lead byte.
	start := index
	for start > 0 && index-start < utf8.UTFMax-1 && !utf8.RuneStart(text[start]) {
		start--
	}
	r, size := utf8.DecodeRuneInString(text[start:])
	if start+size <= index {
		// The decoded sequence does not reach index, so the byte at index is
		// a stray continuation byte: malformed input, treat as a boundary.
		return true
	}
	return !(unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r))
}

View File

@@ -0,0 +1,190 @@
package autocorrect
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestLoadValidRules loads a well-formed rules file and checks that the
// resulting rules replace every configured match string.
func TestLoadValidRules(t *testing.T) {
	dir := t.TempDir()
	// The match list must be nested under its rule item; at column zero
	// "match" would become a top-level key and the rule would have no matches.
	path := writeAutocorrect(t, dir, `autocorrect:
  - target: "Hrank"
    match:
      - "Frank"
      - "frank"
`)
	rules, err := Load(path)
	if err != nil {
		t.Fatalf("load rules: %v", err)
	}
	got, count := rules.Apply("Frank and frank")
	if got != "Hrank and Hrank" {
		t.Fatalf("text = %q, want %q", got, "Hrank and Hrank")
	}
	if count != 2 {
		t.Fatalf("count = %d, want 2", count)
	}
}
// TestLoadValidation feeds Load a series of malformed rule files and checks
// that each is rejected with the expected error message fragment.
func TestLoadValidation(t *testing.T) {
	// In every fixture the "match" key is indented under its rule item so the
	// file parses as a list of rules and reaches the validation being tested.
	tests := []struct {
		name    string
		content string
		want    string
	}{
		{
			name:    "missing top-level autocorrect",
			content: `other: []`,
			want:    "must contain at least one autocorrect rule",
		},
		{
			name:    "empty rules list",
			content: `autocorrect: []`,
			want:    "must contain at least one autocorrect rule",
		},
		{
			name: "empty target",
			content: `autocorrect:
  - target: ""
    match: ["Frank"]
`,
			want: "must include target",
		},
		{
			name: "empty match list",
			content: `autocorrect:
  - target: "Hrank"
    match: []
`,
			want: "must include at least one match string",
		},
		{
			name: "empty match string",
			content: `autocorrect:
  - target: "Hrank"
    match: [" "]
`,
			want: "contains empty match string",
		},
		{
			name: "duplicate match across rules",
			content: `autocorrect:
  - target: "Hrank"
    match: ["Frank"]
  - target: "Other"
    match: ["Frank"]
`,
			want: `appears in both rule 0 and rule 1`,
		},
		{
			name: "duplicate match within rule",
			content: `autocorrect:
  - target: "Hrank"
    match: ["Frank", "Frank"]
`,
			want: `contains duplicate match string "Frank"`,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			dir := t.TempDir()
			path := writeAutocorrect(t, dir, test.content)
			_, err := Load(path)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %v", test.want, err)
			}
		})
	}
}
// TestApplyReplacementBehavior exercises Apply against hand-built rules,
// covering case sensitivity, punctuation and hyphen boundaries, multi-word
// and hyphenated match strings, and matches embedded in larger tokens.
func TestApplyReplacementBehavior(t *testing.T) {
	fixture := Rules{rules: []Rule{
		{Target: "Hrank", Match: []string{"Frank"}},
		{Target: "Mike Brown", Match: []string{"Mike Pat"}},
		{Target: "Godfrey", Match: []string{"God-free"}},
	}}

	type scenario struct {
		name      string
		input     string
		want      string
		wantCount int
	}
	scenarios := []scenario{
		{name: "case sensitive", input: "Frank and FRANK", want: "Hrank and FRANK", wantCount: 1},
		{name: "punctuation boundary", input: "Frank, are you there?", want: "Hrank, are you there?", wantCount: 1},
		{name: "no substring in larger token", input: "Franklin and xFrank Frank_y Frank2", want: "Franklin and xFrank Frank_y Frank2", wantCount: 0},
		{name: "multi word match", input: "Hello Mike Pat.", want: "Hello Mike Brown.", wantCount: 1},
		{name: "hyphenated match", input: "God-free is here.", want: "Godfrey is here.", wantCount: 1},
		{name: "hyphen outside match is boundary", input: "x-Frank-y", want: "x-Hrank-y", wantCount: 1},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			text, n := fixture.Apply(sc.input)
			if text != sc.want {
				t.Fatalf("text = %q, want %q", text, sc.want)
			}
			if n != sc.wantCount {
				t.Fatalf("count = %d, want %d", n, sc.wantCount)
			}
		})
	}
}
// writeAutocorrect creates autocorrect.yml inside dir with the given content
// and returns the file's path. It fails the calling test immediately if the
// file cannot be written.
func writeAutocorrect(t *testing.T, dir string, content string) string {
	t.Helper()

	target := filepath.Join(dir, "autocorrect.yml")
	err := os.WriteFile(target, []byte(content), 0o600)
	if err != nil {
		t.Fatalf("write autocorrect file: %v", err)
	}
	return target
}