133 lines
3.5 KiB
Go
133 lines
3.5 KiB
Go
package autocorrect
|
|
|
|
import (
	"fmt"
	"os"
	"strings"
	"unicode"
	"unicode/utf8"

	"gopkg.in/yaml.v3"
)
|
|
|
|
// Rules stores ordered autocorrect replacement rules.
|
|
type Rules struct {
|
|
rules []Rule
|
|
}
|
|
|
|
// Rule describes a single correction: every string listed in Match is
// rewritten to Target. Match entries are tried in the order they are listed.
type Rule struct {
	// Target is the canonical replacement text (YAML key "target").
	Target string `yaml:"target"`
	// Match lists the strings that should be replaced by Target
	// (YAML key "match").
	Match []string `yaml:"match"`
}
|
|
|
|
type fileSchema struct {
|
|
Autocorrect []Rule `yaml:"autocorrect"`
|
|
}
|
|
|
|
// Load parses and validates an autocorrect.yml file.
|
|
func Load(path string) (Rules, error) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return Rules{}, err
|
|
}
|
|
|
|
var parsed fileSchema
|
|
if err := yaml.Unmarshal(data, &parsed); err != nil {
|
|
return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
|
|
}
|
|
if len(parsed.Autocorrect) == 0 {
|
|
return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
|
|
}
|
|
|
|
seenMatches := make(map[string]int)
|
|
rules := make([]Rule, 0, len(parsed.Autocorrect))
|
|
for ruleIndex, rule := range parsed.Autocorrect {
|
|
rule.Target = strings.TrimSpace(rule.Target)
|
|
if rule.Target == "" {
|
|
return Rules{}, fmt.Errorf("autocorrect rule %d must include target", ruleIndex)
|
|
}
|
|
if len(rule.Match) == 0 {
|
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", ruleIndex, rule.Target)
|
|
}
|
|
|
|
localMatches := make(map[string]struct{}, len(rule.Match))
|
|
for matchIndex, match := range rule.Match {
|
|
match = strings.TrimSpace(match)
|
|
if match == "" {
|
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", ruleIndex, rule.Target, matchIndex)
|
|
}
|
|
if _, exists := localMatches[match]; exists {
|
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", ruleIndex, rule.Target, match)
|
|
}
|
|
localMatches[match] = struct{}{}
|
|
|
|
if previousRuleIndex, exists := seenMatches[match]; exists {
|
|
return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", match, previousRuleIndex, ruleIndex)
|
|
}
|
|
seenMatches[match] = ruleIndex
|
|
rule.Match[matchIndex] = match
|
|
}
|
|
|
|
rules = append(rules, rule)
|
|
}
|
|
|
|
return Rules{rules: rules}, nil
|
|
}
|
|
|
|
// Apply replaces configured whole-token matches and returns the updated text and replacement count.
|
|
func (r Rules) Apply(text string) (string, int) {
|
|
total := 0
|
|
for _, rule := range r.rules {
|
|
for _, match := range rule.Match {
|
|
var count int
|
|
text, count = replaceWholeToken(text, match, rule.Target)
|
|
total += count
|
|
}
|
|
}
|
|
return text, total
|
|
}
|
|
|
|
func replaceWholeToken(text string, match string, target string) (string, int) {
|
|
if text == "" || match == "" {
|
|
return text, 0
|
|
}
|
|
|
|
var builder strings.Builder
|
|
replacements := 0
|
|
searchStart := 0
|
|
writeStart := 0
|
|
for {
|
|
index := strings.Index(text[searchStart:], match)
|
|
if index == -1 {
|
|
break
|
|
}
|
|
index += searchStart
|
|
end := index + len(match)
|
|
|
|
if isTokenBoundary(text, index-1) && isTokenBoundary(text, end) {
|
|
builder.WriteString(text[writeStart:index])
|
|
builder.WriteString(target)
|
|
replacements++
|
|
writeStart = end
|
|
searchStart = end
|
|
continue
|
|
}
|
|
|
|
searchStart = index + 1
|
|
}
|
|
|
|
if replacements == 0 {
|
|
return text, 0
|
|
}
|
|
|
|
builder.WriteString(text[writeStart:])
|
|
return builder.String(), replacements
|
|
}
|
|
|
|
// isTokenBoundary reports whether the byte position index in text lies
// outside a word, so that a match starting right after it or ending right
// before it counts as a whole token.
//
// Positions before the start or past the end of text are boundaries. For any
// other position the character covering that byte is examined: ASCII letters,
// ASCII digits and '_' are word characters, and so are non-ASCII letters,
// digits and combining marks. Everything else, including bytes that are not
// valid UTF-8, is a boundary.
//
// index may point at any byte of a multi-byte UTF-8 sequence. Callers pass the
// byte just before a match, which is the last byte of the preceding rune, so
// the rune is located by backing up to its lead byte. Looking at the raw byte
// alone would classify every non-ASCII byte as a boundary and let a match such
// as "ber" be replaced inside "über".
func isTokenBoundary(text string, index int) bool {
	if index < 0 || index >= len(text) {
		return true
	}

	// ASCII fast path: a single byte is the whole character.
	if c := text[index]; c < utf8.RuneSelf {
		isWord := c == '_' ||
			(c >= '0' && c <= '9') ||
			(c >= 'A' && c <= 'Z') ||
			(c >= 'a' && c <= 'z')
		return !isWord
	}

	// Back up (at most UTFMax-1 bytes) to the lead byte of the rune that
	// contains text[index].
	start := index
	for start > 0 && index-start < utf8.UTFMax-1 && !utf8.RuneStart(text[start]) {
		start--
	}

	r, size := utf8.DecodeRuneInString(text[start:])
	if (r == utf8.RuneError && size <= 1) || start+size <= index {
		// Malformed input, or the decoded rune does not actually cover
		// index: there is no word character here.
		return true
	}

	return !(unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r))
}
|