Simplify the CLI interface and update documentation accordingly
This commit is contained in:
20
README.md
20
README.md
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
`seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order.
|
`seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order.
|
||||||
|
|
||||||
The current implementation supports the `merge` command. It reads one or more input JSON files, maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, assigns consecutive numeric `id` values, and writes a merged JSON artifact.
|
The current implementation supports the `merge` command. It reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, assigns consecutive numeric `id` values, and writes a merged JSON artifact.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@@ -12,7 +12,6 @@ Run from source:
|
|||||||
go run ./cmd/seriatim merge \
|
go run ./cmd/seriatim merge \
|
||||||
--input-file samples/raw/2026-04-19-Eric_Rakestraw.json \
|
--input-file samples/raw/2026-04-19-Eric_Rakestraw.json \
|
||||||
--input-file samples/raw/2026-04-19-Mike_Brown.json \
|
--input-file samples/raw/2026-04-19-Mike_Brown.json \
|
||||||
--speakers samples/speakers.yml \
|
|
||||||
--output-file merged.json
|
--output-file merged.json
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -22,7 +21,6 @@ Optional report output:
|
|||||||
go run ./cmd/seriatim merge \
|
go run ./cmd/seriatim merge \
|
||||||
--input-file eric.json \
|
--input-file eric.json \
|
||||||
--input-file mike.json \
|
--input-file mike.json \
|
||||||
--speakers speakers.yml \
|
|
||||||
--output-file merged.json \
|
--output-file merged.json \
|
||||||
--report-file report.json
|
--report-file report.json
|
||||||
```
|
```
|
||||||
@@ -36,17 +34,17 @@ seriatim merge [flags]
|
|||||||
Required flags for the default pipeline:
|
Required flags for the default pipeline:
|
||||||
|
|
||||||
- `--input-file`: input transcript JSON file. Repeat once per speaker/input file.
|
- `--input-file`: input transcript JSON file. Repeat once per speaker/input file.
|
||||||
- `--speakers`: speaker map YAML file. Required because `normalize-speakers` is enabled by default.
|
|
||||||
- `--output-file`: merged transcript JSON output path.
|
- `--output-file`: merged transcript JSON output path.
|
||||||
|
|
||||||
Optional flags:
|
Optional flags:
|
||||||
|
|
||||||
- `--report-file`: write a JSON report with pipeline events.
|
- `--report-file`: write a JSON report with pipeline events.
|
||||||
|
- `--speakers`: speaker map YAML file. When omitted, each input file's basename (including its extension) is used as the speaker label.
|
||||||
|
- `--autocorrect`: autocorrect rules file. When omitted, the default `autocorrect` module is a no-op: it leaves transcript text unchanged.
|
||||||
- `--input-reader`: input reader module. Default: `json-files`.
|
- `--input-reader`: input reader module. Default: `json-files`.
|
||||||
- `--output-modules`: comma-separated output modules. Default: `json`.
|
- `--output-modules`: comma-separated output modules. Default: `json`.
|
||||||
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
||||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`.
|
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output`.
|
||||||
- `--autocorrect`: autocorrect rules file. Required when the postprocessing `autocorrect` module is enabled.
|
|
||||||
|
|
||||||
## Input JSON Format
|
## Input JSON Format
|
||||||
|
|
||||||
@@ -76,6 +74,8 @@ Other WhisperX fields, including `words` and raw diarization speaker labels, are
|
|||||||
|
|
||||||
`speakers.yml` maps input files to canonical speaker names using ordered substring rules:
|
`speakers.yml` maps input files to canonical speaker names using ordered substring rules:
|
||||||
|
|
||||||
|
This file is optional. If `--speakers` is omitted, `seriatim` uses each input file's basename, including its extension (for example, `input.json`), as the segment speaker label.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
match:
|
match:
|
||||||
- speaker: "Eric Rakestraw"
|
- speaker: "Eric Rakestraw"
|
||||||
@@ -137,7 +137,7 @@ The merged output uses the current seriatim envelope:
|
|||||||
"input_reader": "json-files",
|
"input_reader": "json-files",
|
||||||
"input_files": ["eric.json", "mike.json"],
|
"input_files": ["eric.json", "mike.json"],
|
||||||
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
|
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
|
||||||
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "assign-ids", "validate-output"],
|
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "autocorrect", "assign-ids", "validate-output"],
|
||||||
"output_modules": ["json"]
|
"output_modules": ["json"]
|
||||||
},
|
},
|
||||||
"segments": [
|
"segments": [
|
||||||
@@ -165,16 +165,14 @@ Final segment IDs are assigned after sorting and start at `1`.
|
|||||||
|
|
||||||
## Autocorrect
|
## Autocorrect
|
||||||
|
|
||||||
Autocorrect is an opt-in postprocessing module. It is not part of the default pipeline.
|
Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report.
|
||||||
|
|
||||||
Enable it by adding `autocorrect` to `--postprocessing-modules` and passing `--autocorrect`:
|
Enable corrections by passing `--autocorrect`:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
go run ./cmd/seriatim merge \
|
go run ./cmd/seriatim merge \
|
||||||
--input-file input.json \
|
--input-file input.json \
|
||||||
--speakers speakers.yml \
|
|
||||||
--autocorrect autocorrect.yml \
|
--autocorrect autocorrect.yml \
|
||||||
--postprocessing-modules detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output \
|
|
||||||
--output-file merged.json
|
--output-file merged.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -58,6 +58,11 @@ func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTran
|
|||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return model.MergedTranscript{}, nil, err
|
return model.MergedTranscript{}, nil, err
|
||||||
}
|
}
|
||||||
|
if cfg.AutocorrectFile == "" {
|
||||||
|
return in, []report.Event{
|
||||||
|
report.Info("postprocessing", "autocorrect", "skipped autocorrect because no autocorrect file was supplied"),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
rules, err := autocorrect.Load(cfg.AutocorrectFile)
|
rules, err := autocorrect.Load(cfg.AutocorrectFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package builtin
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
@@ -99,17 +100,26 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat
|
|||||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
||||||
}
|
}
|
||||||
|
|
||||||
speakers, err := speaker.LoadMap(cfg.SpeakersFile)
|
var speakers speaker.Map
|
||||||
|
useSpeakerMap := cfg.SpeakersFile != ""
|
||||||
|
if useSpeakerMap {
|
||||||
|
var err error
|
||||||
|
speakers, err = speaker.LoadMap(cfg.SpeakersFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return pipeline.PreprocessState{}, nil, err
|
return pipeline.PreprocessState{}, nil, err
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
||||||
for _, raw := range in.Raw {
|
for _, raw := range in.Raw {
|
||||||
canonicalSpeaker, err := speakers.SpeakerForSource(raw.Source)
|
canonicalSpeaker := filepath.Base(raw.Source)
|
||||||
|
if useSpeakerMap {
|
||||||
|
var err error
|
||||||
|
canonicalSpeaker, err = speakers.SpeakerForSource(raw.Source)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return pipeline.PreprocessState{}, nil, err
|
return pipeline.PreprocessState{}, nil, err
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
segments := make([]model.Segment, 0, len(raw.Segments))
|
segments := make([]model.Segment, 0, len(raw.Segments))
|
||||||
for index, rawSegment := range raw.Segments {
|
for index, rawSegment := range raw.Segments {
|
||||||
@@ -129,11 +139,16 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
message := "created canonical transcript(s) from raw input"
|
||||||
|
if !useSpeakerMap {
|
||||||
|
message = "created canonical transcript(s) using input basenames as speaker labels"
|
||||||
|
}
|
||||||
|
|
||||||
return pipeline.PreprocessState{
|
return pipeline.PreprocessState{
|
||||||
State: pipeline.StateCanonical,
|
State: pipeline.StateCanonical,
|
||||||
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
||||||
Canonical: canonical,
|
Canonical: canonical,
|
||||||
}, []report.Event{
|
}, []report.Event{
|
||||||
report.Info("preprocessing", "normalize-speakers", "created canonical transcript(s) from raw input"),
|
report.Info("preprocessing", "normalize-speakers", message),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
|||||||
"placeholder-merger",
|
"placeholder-merger",
|
||||||
"detect-overlaps",
|
"detect-overlaps",
|
||||||
"resolve-overlaps",
|
"resolve-overlaps",
|
||||||
|
"autocorrect",
|
||||||
"assign-ids",
|
"assign-ids",
|
||||||
"validate-output",
|
"validate-output",
|
||||||
"json",
|
"json",
|
||||||
@@ -274,43 +275,37 @@ func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
func TestDefaultMergeWorksWithoutSpeakersOrAutocorrect(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
reportPath := filepath.Join(dir, "report.json")
|
||||||
|
|
||||||
err := executeMerge(
|
err := executeMerge(
|
||||||
"--input-file", input,
|
"--input-file", input,
|
||||||
"--output-file", output,
|
"--output-file", output,
|
||||||
|
"--report-file", reportPath,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err != nil {
|
||||||
t.Fatal("expected error")
|
t.Fatalf("merge failed: %v", err)
|
||||||
}
|
}
|
||||||
if !strings.Contains(err.Error(), "--speakers is required") {
|
|
||||||
t.Fatalf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
var transcript model.FinalTranscript
|
||||||
dir := t.TempDir()
|
readJSON(t, output, &transcript)
|
||||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
if got, want := transcript.Segments[0].Speaker, "input.json"; got != want {
|
||||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
t.Fatalf("speaker = %q, want %q", got, want)
|
||||||
- speaker: Alice
|
|
||||||
match: ["input.json"]
|
|
||||||
`)
|
|
||||||
output := filepath.Join(dir, "merged.json")
|
|
||||||
|
|
||||||
err := executeMerge(
|
|
||||||
"--input-file", input,
|
|
||||||
"--speakers", speakers,
|
|
||||||
"--output-file", output,
|
|
||||||
"--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output",
|
|
||||||
)
|
|
||||||
if err == nil {
|
|
||||||
t.Fatal("expected error")
|
|
||||||
}
|
}
|
||||||
if !strings.Contains(err.Error(), "--autocorrect is required") {
|
if got, want := transcript.Segments[0].Text, "Frank"; got != want {
|
||||||
t.Fatalf("unexpected error: %v", err)
|
t.Fatalf("text = %q, want %q", got, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
var rpt report.Report
|
||||||
|
readJSON(t, reportPath, &rpt)
|
||||||
|
if !hasReportEvent(rpt, "preprocessing", "normalize-speakers", "using input basenames") {
|
||||||
|
t.Fatal("expected normalize-speakers fallback report event")
|
||||||
|
}
|
||||||
|
if !hasReportEvent(rpt, "postprocessing", "autocorrect", "skipped autocorrect") {
|
||||||
|
t.Fatal("expected autocorrect skip report event")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -402,6 +397,28 @@ func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestInvalidAutocorrectFileFailsWhenProvided(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect:
|
||||||
|
- target: ""
|
||||||
|
match: ["Frank"]
|
||||||
|
`)
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--autocorrect", autocorrect,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "must include target") {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestOutputJSONIsByteStable(t *testing.T) {
|
func TestOutputJSONIsByteStable(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
||||||
@@ -658,6 +675,15 @@ func equalStrings(left []string, right []string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func hasReportEvent(rpt report.Report, stage string, module string, messageSubstring string) bool {
|
||||||
|
for _, event := range rpt.Events {
|
||||||
|
if event.Stage == stage && event.Module == module && strings.Contains(event.Message, messageSubstring) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) {
|
func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ const (
|
|||||||
DefaultInputReader = "json-files"
|
DefaultInputReader = "json-files"
|
||||||
DefaultOutputModules = "json"
|
DefaultOutputModules = "json"
|
||||||
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
||||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,assign-ids,validate-output"
|
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output"
|
||||||
)
|
)
|
||||||
|
|
||||||
// MergeOptions captures raw CLI option values before validation.
|
// MergeOptions captures raw CLI option values before validation.
|
||||||
@@ -98,27 +98,13 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
|
|||||||
cfg.AutocorrectFile = ""
|
cfg.AutocorrectFile = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
if contains(cfg.PreprocessingModules, "normalize-speakers") {
|
if cfg.SpeakersFile != "" {
|
||||||
if cfg.SpeakersFile == "" {
|
|
||||||
return Config{}, errors.New("--speakers is required when normalize-speakers is enabled")
|
|
||||||
}
|
|
||||||
if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil {
|
|
||||||
return Config{}, err
|
|
||||||
}
|
|
||||||
} else if cfg.SpeakersFile != "" {
|
|
||||||
if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil {
|
if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil {
|
||||||
return Config{}, err
|
return Config{}, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if contains(cfg.PostprocessingModules, "autocorrect") {
|
if cfg.AutocorrectFile != "" {
|
||||||
if cfg.AutocorrectFile == "" {
|
|
||||||
return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
|
|
||||||
}
|
|
||||||
if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil {
|
|
||||||
return Config{}, err
|
|
||||||
}
|
|
||||||
} else if cfg.AutocorrectFile != "" {
|
|
||||||
if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil {
|
if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil {
|
||||||
return Config{}, err
|
return Config{}, err
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user