Implemented an autocorrect module at the postprocessing stage

This commit is contained in:
2026-04-26 19:33:23 -05:00
parent 99d0c425d6
commit 3928e0c4a7
7 changed files with 482 additions and 6 deletions

View File

@@ -2,7 +2,9 @@ package builtin
import (
"context"
"fmt"
"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
@@ -45,3 +47,31 @@ func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg con
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
}, nil
}
// autocorrectPostprocessor is the postprocessing module that runs autocorrect
// rules over the text of a merged transcript. It carries no state.
type autocorrectPostprocessor struct{}

// Name returns the identifier of this postprocessor, "autocorrect".
func (autocorrectPostprocessor) Name() string {
	const moduleName = "autocorrect"
	return moduleName
}
// Process applies the autocorrect rules loaded from cfg.AutocorrectFile to the
// Text of every segment in the merged transcript.
//
// If ctx already reports an error, that error is returned together with an
// empty transcript and no events. An error from autocorrect.Load is returned
// unchanged in the same way. On success the transcript is returned along with
// a single informational event stating the total number of replacements.
//
// Segment text is overwritten by index on in.Segments.
// NOTE(review): if Segments is a slice, the caller's backing array is updated
// in place as well — confirm callers do not rely on the old text.
func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
	ctxErr := ctx.Err()
	if ctxErr != nil {
		return model.MergedTranscript{}, nil, ctxErr
	}

	rules, loadErr := autocorrect.Load(cfg.AutocorrectFile)
	if loadErr != nil {
		return model.MergedTranscript{}, nil, loadErr
	}

	// Rewrite each segment's text and keep a running total of replacements.
	total := 0
	for i := range in.Segments {
		segment := &in.Segments[i]
		corrected, n := rules.Apply(segment.Text)
		segment.Text = corrected
		total += n
	}

	summary := fmt.Sprintf("applied %d autocorrect replacement(s)", total)
	events := []report.Event{
		report.Info("postprocessing", "autocorrect", summary),
	}
	return in, events, nil
}

View File

@@ -10,13 +10,12 @@ func NewRegistry() *pipeline.Registry {
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
registry.RegisterPreprocessor(normalizeSpeakers{})
registry.RegisterPreprocessor(trimText{})
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
registry.RegisterMerger(placeholderMerger{})
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
registry.RegisterPostprocessor(assignIDs{})
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"})
registry.RegisterPostprocessor(autocorrectPostprocessor{})
registry.RegisterOutputWriter(jsonOutputWriter{})
return registry