Implemented an autocorrect module at the postprocessing stage
This commit is contained in:
132
internal/autocorrect/autocorrect.go
Normal file
132
internal/autocorrect/autocorrect.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package autocorrect
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Rules stores ordered autocorrect replacement rules.
//
// The rule list is unexported: within this file it is filled in by Load and
// consumed by Apply, which walks the rules in slice order.
type Rules struct {
	// rules holds the validated rules in the order Load appended them.
	rules []Rule
}
|
||||
|
||||
// Rule replaces ordered match strings with a canonical target.
//
// Both fields are decoded from YAML through the struct tags below.
type Rule struct {
	// Target is the replacement text, read from the "target" key.
	Target string `yaml:"target"`
	// Match lists the strings that are replaced by Target, read from the
	// "match" key.
	Match []string `yaml:"match"`
}
|
||||
|
||||
// fileSchema mirrors the top-level layout of the YAML document that Load
// decodes: a single "autocorrect" key holding the list of rules.
type fileSchema struct {
	// Autocorrect is the rule list read from the "autocorrect" key.
	Autocorrect []Rule `yaml:"autocorrect"`
}
|
||||
|
||||
// Load parses and validates an autocorrect.yml file.
|
||||
func Load(path string) (Rules, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return Rules{}, err
|
||||
}
|
||||
|
||||
var parsed fileSchema
|
||||
if err := yaml.Unmarshal(data, &parsed); err != nil {
|
||||
return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
|
||||
}
|
||||
if len(parsed.Autocorrect) == 0 {
|
||||
return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
|
||||
}
|
||||
|
||||
seenMatches := make(map[string]int)
|
||||
rules := make([]Rule, 0, len(parsed.Autocorrect))
|
||||
for ruleIndex, rule := range parsed.Autocorrect {
|
||||
rule.Target = strings.TrimSpace(rule.Target)
|
||||
if rule.Target == "" {
|
||||
return Rules{}, fmt.Errorf("autocorrect rule %d must include target", ruleIndex)
|
||||
}
|
||||
if len(rule.Match) == 0 {
|
||||
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", ruleIndex, rule.Target)
|
||||
}
|
||||
|
||||
localMatches := make(map[string]struct{}, len(rule.Match))
|
||||
for matchIndex, match := range rule.Match {
|
||||
match = strings.TrimSpace(match)
|
||||
if match == "" {
|
||||
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", ruleIndex, rule.Target, matchIndex)
|
||||
}
|
||||
if _, exists := localMatches[match]; exists {
|
||||
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", ruleIndex, rule.Target, match)
|
||||
}
|
||||
localMatches[match] = struct{}{}
|
||||
|
||||
if previousRuleIndex, exists := seenMatches[match]; exists {
|
||||
return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", match, previousRuleIndex, ruleIndex)
|
||||
}
|
||||
seenMatches[match] = ruleIndex
|
||||
rule.Match[matchIndex] = match
|
||||
}
|
||||
|
||||
rules = append(rules, rule)
|
||||
}
|
||||
|
||||
return Rules{rules: rules}, nil
|
||||
}
|
||||
|
||||
// Apply replaces configured whole-token matches and returns the updated text and replacement count.
|
||||
func (r Rules) Apply(text string) (string, int) {
|
||||
total := 0
|
||||
for _, rule := range r.rules {
|
||||
for _, match := range rule.Match {
|
||||
var count int
|
||||
text, count = replaceWholeToken(text, match, rule.Target)
|
||||
total += count
|
||||
}
|
||||
}
|
||||
return text, total
|
||||
}
|
||||
|
||||
func replaceWholeToken(text string, match string, target string) (string, int) {
|
||||
if text == "" || match == "" {
|
||||
return text, 0
|
||||
}
|
||||
|
||||
var builder strings.Builder
|
||||
replacements := 0
|
||||
searchStart := 0
|
||||
writeStart := 0
|
||||
for {
|
||||
index := strings.Index(text[searchStart:], match)
|
||||
if index == -1 {
|
||||
break
|
||||
}
|
||||
index += searchStart
|
||||
end := index + len(match)
|
||||
|
||||
if isTokenBoundary(text, index-1) && isTokenBoundary(text, end) {
|
||||
builder.WriteString(text[writeStart:index])
|
||||
builder.WriteString(target)
|
||||
replacements++
|
||||
writeStart = end
|
||||
searchStart = end
|
||||
continue
|
||||
}
|
||||
|
||||
searchStart = index + 1
|
||||
}
|
||||
|
||||
if replacements == 0 {
|
||||
return text, 0
|
||||
}
|
||||
|
||||
builder.WriteString(text[writeStart:])
|
||||
return builder.String(), replacements
|
||||
}
|
||||
|
||||
// isTokenBoundary reports whether byte position index in text separates
// tokens. Positions outside text (before the start or at/after the end) count
// as boundaries. Inside text, only ASCII letters, ASCII digits, and '_' are
// token characters; every other byte, including each byte of a multi-byte
// UTF-8 sequence, is treated as a boundary.
func isTokenBoundary(text string, index int) bool {
	if index < 0 || index >= len(text) {
		return true
	}

	switch c := text[index]; {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9',
		c == '_':
		return false
	default:
		return true
	}
}
|
||||
Reference in New Issue
Block a user