diff --git a/README.md b/README.md index 3074f58..f823379 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order. -The current implementation supports the `merge` command. It reads one or more input JSON files, maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, assigns consecutive numeric `id` values, and writes a merged JSON artifact. +The current implementation supports the `merge` command. It reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, assigns consecutive numeric `id` values, and writes a merged JSON artifact. ## Usage @@ -12,7 +12,6 @@ Run from source: go run ./cmd/seriatim merge \ --input-file samples/raw/2026-04-19-Eric_Rakestraw.json \ --input-file samples/raw/2026-04-19-Mike_Brown.json \ - --speakers samples/speakers.yml \ --output-file merged.json ``` @@ -22,7 +21,6 @@ Optional report output: go run ./cmd/seriatim merge \ --input-file eric.json \ --input-file mike.json \ - --speakers speakers.yml \ --output-file merged.json \ --report-file report.json ``` @@ -36,17 +34,17 @@ seriatim merge [flags] Required flags for the default pipeline: - `--input-file`: input transcript JSON file. Repeat once per speaker/input file. -- `--speakers`: speaker map YAML file. Required because `normalize-speakers` is enabled by default. - `--output-file`: merged transcript JSON output path. Optional flags: - `--report-file`: write a JSON report with pipeline events. +- `--speakers`: speaker map YAML file. When omitted, input file basenames are used as speaker labels. +- `--autocorrect`: autocorrect rules file. When omitted, the default `autocorrect` module no-ops. - `--input-reader`: input reader module. Default: `json-files`. - `--output-modules`: comma-separated output modules. 
Default: `json`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. -- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,assign-ids,validate-output`. -- `--autocorrect`: autocorrect rules file. Required when the postprocessing `autocorrect` module is enabled. +- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output`. ## Input JSON Format @@ -76,6 +74,8 @@ Other WhisperX fields, including `words` and raw diarization speaker labels, are `speakers.yml` maps input files to canonical speaker names using ordered substring rules: +This file is optional. If `--speakers` is omitted, `seriatim` uses each input file's basename as the segment speaker label. When the file is supplied, it has the following form: + ```yaml match: - speaker: "Eric Rakestraw" @@ -137,7 +137,7 @@ The merged output uses the current seriatim envelope: "input_reader": "json-files", "input_files": ["eric.json", "mike.json"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], - "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "assign-ids", "validate-output"], + "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "autocorrect", "assign-ids", "validate-output"], "output_modules": ["json"] }, "segments": [ @@ -165,16 +165,14 @@ Final segment IDs are assigned after sorting and start at `1`. ## Autocorrect -Autocorrect is an opt-in postprocessing module. It is not part of the default pipeline. +Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report. 
-Enable it by adding `autocorrect` to `--postprocessing-modules` and passing `--autocorrect`: +Enable corrections by passing `--autocorrect`: ```sh go run ./cmd/seriatim merge \ --input-file input.json \ - --speakers speakers.yml \ --autocorrect autocorrect.yml \ - --postprocessing-modules detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output \ --output-file merged.json ``` diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go index ac1d585..78c09b5 100644 --- a/internal/builtin/postprocess.go +++ b/internal/builtin/postprocess.go @@ -58,6 +58,11 @@ func (autocorrectPostprocessor) Process(ctx context.Context, in model.MergedTran if err := ctx.Err(); err != nil { return model.MergedTranscript{}, nil, err } + if cfg.AutocorrectFile == "" { + return in, []report.Event{ + report.Info("postprocessing", "autocorrect", "skipped autocorrect because no autocorrect file was supplied"), + }, nil + } rules, err := autocorrect.Load(cfg.AutocorrectFile) if err != nil { diff --git a/internal/builtin/preprocess.go b/internal/builtin/preprocess.go index 47770cf..3fbbcfd 100644 --- a/internal/builtin/preprocess.go +++ b/internal/builtin/preprocess.go @@ -3,6 +3,7 @@ package builtin import ( "context" "fmt" + "path/filepath" "strings" "gitea.maximumdirect.net/eric/seriatim/internal/config" @@ -99,16 +100,25 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State) } - speakers, err := speaker.LoadMap(cfg.SpeakersFile) - if err != nil { - return pipeline.PreprocessState{}, nil, err + var speakers speaker.Map + useSpeakerMap := cfg.SpeakersFile != "" + if useSpeakerMap { + var err error + speakers, err = speaker.LoadMap(cfg.SpeakersFile) + if err != nil { + return pipeline.PreprocessState{}, nil, err + } } canonical := make([]model.CanonicalTranscript, 0, 
len(in.Raw)) for _, raw := range in.Raw { - canonicalSpeaker, err := speakers.SpeakerForSource(raw.Source) - if err != nil { - return pipeline.PreprocessState{}, nil, err + canonicalSpeaker := filepath.Base(raw.Source) + if useSpeakerMap { + var err error + canonicalSpeaker, err = speakers.SpeakerForSource(raw.Source) + if err != nil { + return pipeline.PreprocessState{}, nil, err + } } segments := make([]model.Segment, 0, len(raw.Segments)) @@ -129,11 +139,16 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat }) } + message := "created canonical transcript(s) from raw input" + if !useSpeakerMap { + message = "created canonical transcript(s) using input basenames as speaker labels" + } + return pipeline.PreprocessState{ State: pipeline.StateCanonical, Raw: append([]model.RawTranscript(nil), in.Raw...), Canonical: canonical, }, []report.Event{ - report.Info("preprocessing", "normalize-speakers", "created canonical transcript(s) from raw input"), + report.Info("preprocessing", "normalize-speakers", message), }, nil } diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 64aa351..05146ee 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { "placeholder-merger", "detect-overlaps", "resolve-overlaps", + "autocorrect", "assign-ids", "validate-output", "json", @@ -274,43 +275,37 @@ func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) { } } -func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) { +func TestDefaultMergeWorksWithoutSpeakersOrAutocorrect(t *testing.T) { dir := t.TempDir() - input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) + input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`) output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") err := executeMerge( "--input-file", input, "--output-file", 
output, + "--report-file", reportPath, ) - if err == nil { - t.Fatal("expected error") + if err != nil { + t.Fatalf("merge failed: %v", err) } - if !strings.Contains(err.Error(), "--speakers is required") { - t.Fatalf("unexpected error: %v", err) - } -} -func TestAutocorrectRequiresAutocorrectFile(t *testing.T) { - dir := t.TempDir() - input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `match: - - speaker: Alice - match: ["input.json"] -`) - output := filepath.Join(dir, "merged.json") - - err := executeMerge( - "--input-file", input, - "--speakers", speakers, - "--output-file", output, - "--postprocessing-modules", "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output", - ) - if err == nil { - t.Fatal("expected error") + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if got, want := transcript.Segments[0].Speaker, "input.json"; got != want { + t.Fatalf("speaker = %q, want %q", got, want) } - if !strings.Contains(err.Error(), "--autocorrect is required") { - t.Fatalf("unexpected error: %v", err) + if got, want := transcript.Segments[0].Text, "Frank"; got != want { + t.Fatalf("text = %q, want %q", got, want) + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "preprocessing", "normalize-speakers", "using input basenames") { + t.Fatal("expected normalize-speakers fallback report event") + } + if !hasReportEvent(rpt, "postprocessing", "autocorrect", "skipped autocorrect") { + t.Fatal("expected autocorrect skip report event") } } @@ -402,6 +397,28 @@ func TestPostprocessingAutocorrectUpdatesOutputAndReport(t *testing.T) { } } +func TestInvalidAutocorrectFileFailsWhenProvided(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"text":"Frank"}]}`) + output := filepath.Join(dir, "merged.json") + autocorrect := writeYAMLFile(t, dir, "autocorrect.yml", `autocorrect: 
+ - target: "" + match: ["Frank"] +`) + + err := executeMerge( + "--input-file", input, + "--autocorrect", autocorrect, + "--output-file", output, + ) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), "must include target") { + t.Fatalf("unexpected error: %v", err) + } +} + func TestOutputJSONIsByteStable(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`) @@ -658,6 +675,15 @@ func equalStrings(left []string, right []string) bool { return true } +func hasReportEvent(rpt report.Report, stage string, module string, messageSubstring string) bool { + for _, event := range rpt.Events { + if event.Stage == stage && event.Module == module && strings.Contains(event.Message, messageSubstring) { + return true + } + } + return false +} + func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) { t.Helper() diff --git a/internal/config/config.go b/internal/config/config.go index 791cb93..59c6f89 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -13,7 +13,7 @@ const ( DefaultInputReader = "json-files" DefaultOutputModules = "json" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" - DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,assign-ids,validate-output" + DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output" ) // MergeOptions captures raw CLI option values before validation. 
@@ -98,27 +98,13 @@ func NewMergeConfig(opts MergeOptions) (Config, error) { cfg.AutocorrectFile = "" } - if contains(cfg.PreprocessingModules, "normalize-speakers") { - if cfg.SpeakersFile == "" { - return Config{}, errors.New("--speakers is required when normalize-speakers is enabled") - } - if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil { - return Config{}, err - } - } else if cfg.SpeakersFile != "" { + if cfg.SpeakersFile != "" { if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil { return Config{}, err } } - if contains(cfg.PostprocessingModules, "autocorrect") { - if cfg.AutocorrectFile == "" { - return Config{}, errors.New("--autocorrect is required when autocorrect is enabled") - } - if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil { - return Config{}, err - } - } else if cfg.AutocorrectFile != "" { + if cfg.AutocorrectFile != "" { if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil { return Config{}, err }