Implemented an autocorrect module at the postprocessing stage
This commit is contained in:
41
README.md
41
README.md
@@ -46,7 +46,7 @@ Optional flags:
|
|||||||
- `--output-modules`: comma-separated output modules. Default: `json`.
|
- `--output-modules`: comma-separated output modules. Default: `json`.
|
||||||
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
||||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`.
|
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`.
|
||||||
- `--autocorrect`: autocorrect rules file. Reserved for the `autocorrect` module; not part of the default pipeline.
|
- `--autocorrect`: autocorrect rules file. Required when the postprocessing `autocorrect` module is enabled.
|
||||||
|
|
||||||
## Input JSON Format
|
## Input JSON Format
|
||||||
|
|
||||||
@@ -163,9 +163,46 @@ Segments are sorted deterministically by:
|
|||||||
|
|
||||||
Final segment IDs are assigned after sorting and start at `1`.
|
Final segment IDs are assigned after sorting and start at `1`.
|
||||||
|
|
||||||
|
## Autocorrect
|
||||||
|
|
||||||
|
Autocorrect is an opt-in postprocessing module. It is not part of the default pipeline.
|
||||||
|
|
||||||
|
Enable it by adding `autocorrect` to `--postprocessing-modules` and passing `--autocorrect`:
|
||||||
|
|
||||||
|
```sh
go run ./cmd/seriatim merge \
  --input-file input.json \
  --speakers speakers.yml \
  --autocorrect autocorrect.yml \
  --postprocessing-modules detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output \
  --output-file merged.json
```
|
||||||
|
|
||||||
|
`autocorrect.yml` format:
|
||||||
|
|
||||||
|
```yaml
autocorrect:
  - target: "Hrank"
    match:
      - "hrank"
      - "Frank"

  - target: "Mike Brown"
    match:
      - "Mike Pat"
```
|
||||||
|
|
||||||
|
Matching behavior:
|
||||||
|
|
||||||
|
- Matching is case-sensitive.
|
||||||
|
- Matches apply only to whole tokens, not substrings inside larger words.
|
||||||
|
- Punctuation and whitespace can surround a match.
|
||||||
|
- Multi-word and hyphenated matches are supported.
|
||||||
|
- Duplicate match strings are invalid, including duplicates across separate rules.
|
||||||
|
|
||||||
## Current Limitations
|
## Current Limitations
|
||||||
|
|
||||||
- Only JSON input is supported.
|
- Only JSON input is supported.
|
||||||
- Word-level timing data is not preserved yet.
|
- Word-level timing data is not preserved yet.
|
||||||
- Overlap detection and overlap resolution are currently no-op modules.
|
- Overlap detection and overlap resolution are currently no-op modules.
|
||||||
- Autocorrect, coalescing, and alternate output formats are not implemented yet.
|
- Coalescing and alternate output formats are not implemented yet.
|
||||||
|
|||||||
132
internal/autocorrect/autocorrect.go
Normal file
132
internal/autocorrect/autocorrect.go
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
package autocorrect
|
||||||
|
|
||||||
|
import (
	"fmt"
	"os"
	"strings"
	"unicode"
	"unicode/utf8"

	"gopkg.in/yaml.v3"
)
|
||||||
|
|
||||||
|
type (
	// Rules is a validated, ordered collection of autocorrect rules. The
	// rule list is unexported; within this package it is populated by Load.
	Rules struct {
		rules []Rule
	}

	// Rule maps one or more match strings onto a single canonical target.
	// Both fields are read from the YAML keys named in their struct tags.
	Rule struct {
		Target string   `yaml:"target"`
		Match  []string `yaml:"match"`
	}

	// fileSchema mirrors the top-level layout of the rules file: a single
	// "autocorrect" key holding the list of rules.
	fileSchema struct {
		Autocorrect []Rule `yaml:"autocorrect"`
	}
)
|
||||||
|
|
||||||
|
// Load parses and validates an autocorrect.yml file.
|
||||||
|
func Load(path string) (Rules, error) {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return Rules{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var parsed fileSchema
|
||||||
|
if err := yaml.Unmarshal(data, &parsed); err != nil {
|
||||||
|
return Rules{}, fmt.Errorf("parse autocorrect file %q: %w", path, err)
|
||||||
|
}
|
||||||
|
if len(parsed.Autocorrect) == 0 {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect file %q must contain at least one autocorrect rule", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
seenMatches := make(map[string]int)
|
||||||
|
rules := make([]Rule, 0, len(parsed.Autocorrect))
|
||||||
|
for ruleIndex, rule := range parsed.Autocorrect {
|
||||||
|
rule.Target = strings.TrimSpace(rule.Target)
|
||||||
|
if rule.Target == "" {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect rule %d must include target", ruleIndex)
|
||||||
|
}
|
||||||
|
if len(rule.Match) == 0 {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q must include at least one match string", ruleIndex, rule.Target)
|
||||||
|
}
|
||||||
|
|
||||||
|
localMatches := make(map[string]struct{}, len(rule.Match))
|
||||||
|
for matchIndex, match := range rule.Match {
|
||||||
|
match = strings.TrimSpace(match)
|
||||||
|
if match == "" {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains empty match string at index %d", ruleIndex, rule.Target, matchIndex)
|
||||||
|
}
|
||||||
|
if _, exists := localMatches[match]; exists {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect rule %d for target %q contains duplicate match string %q", ruleIndex, rule.Target, match)
|
||||||
|
}
|
||||||
|
localMatches[match] = struct{}{}
|
||||||
|
|
||||||
|
if previousRuleIndex, exists := seenMatches[match]; exists {
|
||||||
|
return Rules{}, fmt.Errorf("autocorrect match string %q appears in both rule %d and rule %d", match, previousRuleIndex, ruleIndex)
|
||||||
|
}
|
||||||
|
seenMatches[match] = ruleIndex
|
||||||
|
rule.Match[matchIndex] = match
|
||||||
|
}
|
||||||
|
|
||||||
|
rules = append(rules, rule)
|
||||||
|
}
|
||||||
|
|
||||||
|
return Rules{rules: rules}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply replaces configured whole-token matches and returns the updated text and replacement count.
|
||||||
|
func (r Rules) Apply(text string) (string, int) {
|
||||||
|
total := 0
|
||||||
|
for _, rule := range r.rules {
|
||||||
|
for _, match := range rule.Match {
|
||||||
|
var count int
|
||||||
|
text, count = replaceWholeToken(text, match, rule.Target)
|
||||||
|
total += count
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return text, total
|
||||||
|
}
|
||||||
|
|
||||||
|
func replaceWholeToken(text string, match string, target string) (string, int) {
|
||||||
|
if text == "" || match == "" {
|
||||||
|
return text, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
var builder strings.Builder
|
||||||
|
replacements := 0
|
||||||
|
searchStart := 0
|
||||||
|
writeStart := 0
|
||||||
|
for {
|
||||||
|
index := strings.Index(text[searchStart:], match)
|
||||||
|
if index == -1 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
index += searchStart
|
||||||
|
end := index + len(match)
|
||||||
|
|
||||||
|
if isTokenBoundary(text, index-1) && isTokenBoundary(text, end) {
|
||||||
|
builder.WriteString(text[writeStart:index])
|
||||||
|
builder.WriteString(target)
|
||||||
|
replacements++
|
||||||
|
writeStart = end
|
||||||
|
searchStart = end
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
searchStart = index + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if replacements == 0 {
|
||||||
|
return text, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.WriteString(text[writeStart:])
|
||||||
|
return builder.String(), replacements
|
||||||
|
}
|
||||||
|
|
||||||
|
// isTokenBoundary reports whether the byte offset index in text lies on a
// token boundary, i.e. a whole-token match may start right after it or end
// right before it.
//
// Offsets outside text (negative, or at/after len(text)) are boundaries.
// Otherwise the rune containing the byte at index is decoded and counts as
// part of a token when it is an underscore, a Unicode letter, a Unicode
// decimal digit, or a combining mark. Any other rune, and any byte that is
// not part of a valid UTF-8 sequence, is a boundary.
//
// Decoding the whole rune rather than inspecting a single byte keeps matches
// from firing inside words that contain non-ASCII letters such as "Franké".
// Results for pure-ASCII text are the same as a byte-wise [A-Za-z0-9_] test.
func isTokenBoundary(text string, index int) bool {
	if index < 0 || index >= len(text) {
		return true
	}

	// index may point at a continuation byte (callers pass start-1 for the
	// preceding character), so back up to the first byte of its rune. The
	// walk is capped at the longest possible encoding.
	start := index
	for start > 0 && index-start < utf8.UTFMax-1 && !utf8.RuneStart(text[start]) {
		start--
	}

	r, size := utf8.DecodeRuneInString(text[start:])
	if start+size <= index {
		// The decoded rune does not cover index: the byte there is a stray
		// continuation byte, which cannot be part of a word.
		return true
	}

	return !(r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r))
}
|
||||||
190
internal/autocorrect/autocorrect_test.go
Normal file
190
internal/autocorrect/autocorrect_test.go
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
package autocorrect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLoadValidRules(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := writeAutocorrect(t, dir, `autocorrect:
|
||||||
|
- target: "Hrank"
|
||||||
|
match:
|
||||||
|
- "Frank"
|
||||||
|
- "frank"
|
||||||
|
`)
|
||||||
|
|
||||||
|
rules, err := Load(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("load rules: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, count := rules.Apply("Frank and frank")
|
||||||
|
if got != "Hrank and Hrank" {
|
||||||
|
t.Fatalf("text = %q, want %q", got, "Hrank and Hrank")
|
||||||
|
}
|
||||||
|
if count != 2 {
|
||||||
|
t.Fatalf("count = %d, want 2", count)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestLoadValidation(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
content string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "missing top-level autocorrect",
|
||||||
|
content: `other: []`,
|
||||||
|
want: "must contain at least one autocorrect rule",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty rules list",
|
||||||
|
content: `autocorrect: []`,
|
||||||
|
want: "must contain at least one autocorrect rule",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty target",
|
||||||
|
content: `autocorrect:
|
||||||
|
- target: ""
|
||||||
|
match: ["Frank"]
|
||||||
|
`,
|
||||||
|
want: "must include target",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty match list",
|
||||||
|
content: `autocorrect:
|
||||||
|
- target: "Hrank"
|
||||||
|
match: []
|
||||||
|
`,
|
||||||
|
want: "must include at least one match string",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "empty match string",
|
||||||
|
content: `autocorrect:
|
||||||
|
- target: "Hrank"
|
||||||
|
match: [" "]
|
||||||
|
`,
|
||||||
|
want: "contains empty match string",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "duplicate match across rules",
|
||||||
|
content: `autocorrect:
|
||||||
|
- target: "Hrank"
|
||||||
|
match: ["Frank"]
|
||||||
|
- target: "Other"
|
||||||
|
match: ["Frank"]
|
||||||
|
`,
|
||||||
|
want: `appears in both rule 0 and rule 1`,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "duplicate match within rule",
|
||||||
|
content: `autocorrect:
|
||||||
|
- target: "Hrank"
|
||||||
|
match: ["Frank", "Frank"]
|
||||||
|
`,
|
||||||
|
want: `contains duplicate match string "Frank"`,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := writeAutocorrect(t, dir, test.content)
|
||||||
|
|
||||||
|
_, err := Load(path)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), test.want) {
|
||||||
|
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyReplacementBehavior(t *testing.T) {
|
||||||
|
rules := Rules{rules: []Rule{
|
||||||
|
{
|
||||||
|
Target: "Hrank",
|
||||||
|
Match: []string{"Frank"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Target: "Mike Brown",
|
||||||
|
Match: []string{"Mike Pat"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Target: "Godfrey",
|
||||||
|
Match: []string{"God-free"},
|
||||||
|
},
|
||||||
|
}}
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
input string
|
||||||
|
want string
|
||||||
|
wantCount int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "case sensitive",
|
||||||
|
input: "Frank and FRANK",
|
||||||
|
want: "Hrank and FRANK",
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "punctuation boundary",
|
||||||
|
input: "Frank, are you there?",
|
||||||
|
want: "Hrank, are you there?",
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no substring in larger token",
|
||||||
|
input: "Franklin and xFrank Frank_y Frank2",
|
||||||
|
want: "Franklin and xFrank Frank_y Frank2",
|
||||||
|
wantCount: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "multi word match",
|
||||||
|
input: "Hello Mike Pat.",
|
||||||
|
want: "Hello Mike Brown.",
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "hyphenated match",
|
||||||
|
input: "God-free is here.",
|
||||||
|
want: "Godfrey is here.",
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "hyphen outside match is boundary",
|
||||||
|
input: "x-Frank-y",
|
||||||
|
want: "x-Hrank-y",
|
||||||
|
wantCount: 1,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
got, count := rules.Apply(test.input)
|
||||||
|
if got != test.want {
|
||||||
|
t.Fatalf("text = %q, want %q", got, test.want)
|
||||||
|
}
|
||||||
|
if count != test.wantCount {
|
||||||
|
t.Fatalf("count = %d, want %d", count, test.wantCount)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeAutocorrect writes content to a file named autocorrect.yml inside dir
// with mode 0600 and returns the file's path. A write failure aborts the
// calling test.
func writeAutocorrect(t *testing.T, dir string, content string) string {
	t.Helper()

	target := filepath.Join(dir, "autocorrect.yml")
	err := os.WriteFile(target, []byte(content), 0o600)
	if err != nil {
		t.Fatalf("write autocorrect file: %v", err)
	}
	return target
}
|
||||||
@@ -2,7 +2,9 @@ package builtin
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||||
@@ -45,3 +47,31 @@ func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg con
|
|||||||
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
|
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type autocorrectPostprocessor struct{}
|
||||||
|
|
||||||
|
func (autocorrectPostprocessor) Name() string {
|
||||||
|
return "autocorrect"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return model.MergedTranscript{}, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
rules, err := autocorrect.Load(cfg.AutocorrectFile)
|
||||||
|
if err != nil {
|
||||||
|
return model.MergedTranscript{}, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
replacements := 0
|
||||||
|
for index := range in.Segments {
|
||||||
|
var count int
|
||||||
|
in.Segments[index].Text, count = rules.Apply(in.Segments[index].Text)
|
||||||
|
replacements += count
|
||||||
|
}
|
||||||
|
|
||||||
|
return in, []report.Event{
|
||||||
|
report.Info("postprocessing", "autocorrect", fmt.Sprintf("applied %d autocorrect replacement(s)", replacements)),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -10,13 +10,12 @@ func NewRegistry() *pipeline.Registry {
|
|||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
||||||
registry.RegisterPreprocessor(normalizeSpeakers{})
|
registry.RegisterPreprocessor(normalizeSpeakers{})
|
||||||
registry.RegisterPreprocessor(trimText{})
|
registry.RegisterPreprocessor(trimText{})
|
||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
|
||||||
registry.RegisterMerger(placeholderMerger{})
|
registry.RegisterMerger(placeholderMerger{})
|
||||||
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
||||||
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
|
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
|
||||||
registry.RegisterPostprocessor(assignIDs{})
|
registry.RegisterPostprocessor(assignIDs{})
|
||||||
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
|
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
|
||||||
registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"})
|
registry.RegisterPostprocessor(autocorrectPostprocessor{})
|
||||||
registry.RegisterOutputWriter(jsonOutputWriter{})
|
registry.RegisterOutputWriter(jsonOutputWriter{})
|
||||||
|
|
||||||
return registry
|
return registry
|
||||||
|
|||||||
@@ -304,7 +304,7 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
|||||||
"--input-file", input,
|
"--input-file", input,
|
||||||
"--speakers", speakers,
|
"--speakers", speakers,
|
||||||
"--output-file", output,
|
"--output-file", output,
|
||||||
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
|
"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
t.Fatal("expected error")
|
t.Fatal("expected error")
|
||||||
@@ -314,6 +314,94 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestPreprocessingAutocorrectIsUnknownModule(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||||
|
- speaker: Alice
|
||||||
|
match: ["input.json"]
|
||||||
|
`)
|
||||||
|
autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
|
||||||
|
- target: Hrank
|
||||||
|
match: ["Frank"]
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--autocorrect", autocorrect,
|
||||||
|
"--output-file", output,
|
||||||
|
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), `unknown preprocessing module "autocorrect"`) {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{
|
||||||
|
"segments": [
|
||||||
|
{"start": 1, "end": 2, "text": "Frank met Mike Pat, not Franklin."},
|
||||||
|
{"start": 3, "end": 4, "text": "God-free and FRANK stayed."}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||||
|
- speaker: Alice
|
||||||
|
match: ["input.json"]
|
||||||
|
`)
|
||||||
|
autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
|
||||||
|
- target: Hrank
|
||||||
|
match: ["Frank"]
|
||||||
|
- target: Mike Brown
|
||||||
|
match: ["Mike Pat"]
|
||||||
|
- target: Godfrey
|
||||||
|
match: ["God-free"]
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
reportPath := filepath.Join(dir, "report.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--autocorrect", autocorrect,
|
||||||
|
"--output-file", output,
|
||||||
|
"--report-file", reportPath,
|
||||||
|
"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("merge failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var transcript model.FinalTranscript
|
||||||
|
readJSON(t, output, &transcript)
|
||||||
|
if got, want := transcript.Segments[0].Text, "Hrank met Mike Brown, not Franklin."; got != want {
|
||||||
|
t.Fatalf("segment 0 text = %q, want %q", got, want)
|
||||||
|
}
|
||||||
|
if got, want := transcript.Segments[1].Text, "Godfrey and FRANK stayed."; got != want {
|
||||||
|
t.Fatalf("segment 1 text = %q, want %q", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
var rpt report.Report
|
||||||
|
readJSON(t, reportPath, &rpt)
|
||||||
|
found := false
|
||||||
|
for _, event := range rpt.Events {
|
||||||
|
if event.Stage == "postprocessing" && event.Module == "autocorrect" {
|
||||||
|
found = true
|
||||||
|
if !strings.Contains(event.Message, "applied 3 autocorrect replacement(s)") {
|
||||||
|
t.Fatalf("unexpected autocorrect report message: %q", event.Message)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatal("expected autocorrect report event")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestOutputJSONIsByteStable(t *testing.T) {
|
func TestOutputJSONIsByteStable(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if contains(cfg.PreprocessingModules, "autocorrect") || contains(cfg.PostprocessingModules, "autocorrect") {
|
if contains(cfg.PostprocessingModules, "autocorrect") {
|
||||||
if cfg.AutocorrectFile == "" {
|
if cfg.AutocorrectFile == "" {
|
||||||
return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
|
return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user