Cleaned up documentation and development artifacts in advance of release
This commit is contained in:
47
README.md
47
README.md
@@ -31,21 +31,34 @@ go run ./cmd/seriatim merge \
|
|||||||
seriatim merge [flags]
|
seriatim merge [flags]
|
||||||
```
|
```
|
||||||
|
|
||||||
Required flags for the default pipeline:
|
Global flags:
|
||||||
|
|
||||||
- `--input-file`: input transcript JSON file. Repeat once per speaker/input file.
|
| Flag | Description |
|
||||||
- `--output-file`: merged transcript JSON output path.
|
| --- | --- |
|
||||||
|
| `--help` | Show command help. |
|
||||||
|
| `--version` | Show application version. Local builds default to `dev`; release builds inject the release version. |
|
||||||
|
|
||||||
Optional flags:
|
`merge` flags:
|
||||||
|
|
||||||
- `--report-file`: write a JSON report with pipeline events.
|
| Flag | Required | Default | Description |
|
||||||
- `--speakers`: speaker map YAML file. When omitted, input file basenames are used as speaker labels.
|
| --- | --- | --- | --- |
|
||||||
- `--autocorrect`: autocorrect rules file. When omitted, the default `autocorrect` module no-ops.
|
| `--input-file` | Yes | none | Input transcript JSON file. Repeat once per speaker/input file. |
|
||||||
- `--input-reader`: input reader module. Default: `json-files`.
|
| `--output-file` | Yes | none | Merged transcript JSON output path. |
|
||||||
- `--output-modules`: comma-separated output modules. Default: `json`.
|
| `--report-file` | No | none | Optional report JSON output path. |
|
||||||
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
| `--speakers` | No | none | Speaker map YAML file. When omitted, input file basenames are used as speaker labels. |
|
||||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`.
|
| `--autocorrect` | No | none | Autocorrect rules YAML file. When omitted, the default `autocorrect` module leaves text unchanged. |
|
||||||
- `--coalesce-gap`: maximum same-speaker gap in seconds for `coalesce`. Default: `3.0`.
|
| `--input-reader` | No | `json-files` | Input reader module. |
|
||||||
|
| `--output-modules` | No | `json` | Comma-separated output modules. |
|
||||||
|
| `--preprocessing-modules` | No | `validate-raw,normalize-speakers,trim-text` | Comma-separated preprocessing modules, evaluated in order. |
|
||||||
|
| `--postprocessing-modules` | No | `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output` | Comma-separated postprocessing modules, evaluated in order. |
|
||||||
|
| `--coalesce-gap` | No | `3.0` | Maximum same-speaker gap in seconds for `coalesce`. Must be a non-negative float. |
|
||||||
|
|
||||||
|
Environment variables:
|
||||||
|
|
||||||
|
| Environment Variable | Default | Description |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| `SERIATIM_OVERLAP_WORD_RUN_GAP` | `0.75` | Maximum gap in seconds between adjacent timed words when `resolve-overlaps` builds word-run replacement segments. Must be a positive float. |
|
||||||
|
| `SERIATIM_OVERLAP_WORD_RUN_REORDER_WINDOW` | `0.4` | Near-start window in seconds for ordering replacement word runs shortest-first. Must be a positive float. |
|
||||||
|
|
||||||
## Input JSON Format
|
## Input JSON Format
|
||||||
|
|
||||||
@@ -312,4 +325,12 @@ Matching behavior:
|
|||||||
|
|
||||||
- Only JSON input is supported.
|
- Only JSON input is supported.
|
||||||
- Overlap resolution depends on WhisperX word timing; groups without usable word timing remain unresolved.
|
- Overlap resolution depends on WhisperX word timing; groups without usable word timing remain unresolved.
|
||||||
- Coalescing and alternate output formats are not implemented yet.
|
- Alternate output formats are not implemented yet.
|
||||||
|
|
||||||
|
## Release Builds
|
||||||
|
|
||||||
|
Local builds record version metadata as `dev`. Release builds should inject the release version with `ldflags`:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
go build -ldflags "-X gitea.maximumdirect.net/eric/seriatim/internal/buildinfo.Version=v1.0.0" ./cmd/seriatim
|
||||||
|
```
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
The initial use case is merging independently transcribed speaker audio tracks from the same recorded session, such as a weekly tabletop RPG session. The architecture should also support meetings, podcasts, interviews, and other multi-speaker events.
|
The initial use case is merging independently transcribed speaker audio tracks from the same recorded session, such as a weekly tabletop RPG session. The architecture should also support meetings, podcasts, interviews, and other multi-speaker events.
|
||||||
|
|
||||||
`seriatim` will be implemented in Go.
|
`seriatim` is implemented in Go.
|
||||||
|
|
||||||
## Goals
|
## Goals
|
||||||
|
|
||||||
@@ -21,19 +21,19 @@ The initial use case is merging independently transcribed speaker audio tracks f
|
|||||||
9. Emit one or more output artifacts through output writers.
|
9. Emit one or more output artifacts through output writers.
|
||||||
10. Produce report data for validation findings, corrections, and transformations.
|
10. Produce report data for validation findings, corrections, and transformations.
|
||||||
|
|
||||||
## Non-goals for the MVP
|
## Non-goals
|
||||||
|
|
||||||
The MVP should not attempt to:
|
The 1.0 release does not attempt to:
|
||||||
|
|
||||||
- Perform transcription.
|
- Perform transcription.
|
||||||
- Perform audio diarization.
|
- Perform audio diarization.
|
||||||
- Use an LLM.
|
- Use an LLM.
|
||||||
- Summarize transcript content.
|
- Summarize transcript content.
|
||||||
- Infer speaker identity from audio or text.
|
- Infer speaker identity from audio or text.
|
||||||
- Fully resolve crosstalk.
|
- Fully resolve every crosstalk case.
|
||||||
- Load arbitrary third-party code as dynamic plugins.
|
- Load arbitrary third-party code as dynamic plugins.
|
||||||
|
|
||||||
The MVP should support runtime composition of built-in modules by canonical module name. Arbitrary external plugin loading can be considered later.
|
The application supports runtime composition of built-in modules by canonical module name. Arbitrary external plugin loading can be considered later.
|
||||||
|
|
||||||
## Core Assumption
|
## Core Assumption
|
||||||
|
|
||||||
@@ -78,7 +78,7 @@ The configuration stage produces an application config value that is passed thro
|
|||||||
|
|
||||||
The input stage converts external inputs into raw transcript documents with source metadata.
|
The input stage converts external inputs into raw transcript documents with source metadata.
|
||||||
|
|
||||||
The MVP input method is one or more JSON files passed with repeated `--input-file` flags:
|
The current input method is one or more JSON files passed with repeated `--input-file` flags:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
seriatim merge --input-file eric.json --input-file mike.json --output-file merged.json
|
seriatim merge --input-file eric.json --input-file mike.json --output-file merged.json
|
||||||
@@ -107,7 +107,7 @@ Preprocessing starts with raw transcript documents from input readers and must e
|
|||||||
Preprocessing modules are selected at runtime with a comma-separated list of canonical module names:
|
Preprocessing modules are selected at runtime with a comma-separated list of canonical module names:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
--preprocessing-modules validate-raw,normalize-speakers,trim-text,autocorrect
|
--preprocessing-modules validate-raw,normalize-speakers,trim-text
|
||||||
```
|
```
|
||||||
|
|
||||||
Modules run in the exact order provided. Unknown module names are configuration errors.
|
Modules run in the exact order provided. Unknown module names are configuration errors.
|
||||||
@@ -120,7 +120,6 @@ Potential preprocessing modules include:
|
|||||||
- Speaker name normalization based on input filename.
|
- Speaker name normalization based on input filename.
|
||||||
- Timing validation and deterministic correction.
|
- Timing validation and deterministic correction.
|
||||||
- Text trimming.
|
- Text trimming.
|
||||||
- Word replacement from `autocorrect.yml`.
|
|
||||||
|
|
||||||
Preprocessing should not depend on global chronological ordering across speakers. Modules that need the globally merged transcript belong in postprocessing.
|
Preprocessing should not depend on global chronological ordering across speakers. Modules that need the globally merged transcript belong in postprocessing.
|
||||||
|
|
||||||
@@ -147,7 +146,7 @@ The postprocessing stage applies zero or more modules to the merged transcript.
|
|||||||
Postprocessing modules are selected at runtime with a comma-separated list of canonical module names:
|
Postprocessing modules are selected at runtime with a comma-separated list of canonical module names:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
--postprocessing-modules detect-overlaps,resolve-overlaps,coalesce,assign-ids,validate-output
|
--postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output
|
||||||
```
|
```
|
||||||
|
|
||||||
Modules run in the exact order provided. Unknown module names are configuration errors.
|
Modules run in the exact order provided. Unknown module names are configuration errors.
|
||||||
@@ -168,7 +167,7 @@ Any module that can reorder, split, merge, drop, or create segments must run bef
|
|||||||
|
|
||||||
The output stage emits one or more artifacts from the final transcript and report model.
|
The output stage emits one or more artifacts from the final transcript and report model.
|
||||||
|
|
||||||
The MVP output format is JSON, specified with:
|
The current output format is JSON, specified with:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
--output-file merged.json
|
--output-file merged.json
|
||||||
@@ -202,7 +201,7 @@ This classification should guide Go interfaces and package boundaries. It should
|
|||||||
|
|
||||||
## Runtime Module Composition
|
## Runtime Module Composition
|
||||||
|
|
||||||
The MVP should support runtime composition of built-in modules.
|
The application supports runtime composition of built-in modules.
|
||||||
|
|
||||||
Module names are canonical strings registered at startup. CLI flags refer to those names. The configuration stage resolves names into module instances before the pipeline runs.
|
Module names are canonical strings registered at startup. CLI flags refer to those names. The configuration stage resolves names into module instances before the pipeline runs.
|
||||||
|
|
||||||
@@ -214,9 +213,9 @@ seriatim merge \
|
|||||||
--input-file mike.json \
|
--input-file mike.json \
|
||||||
--speakers speakers.yml \
|
--speakers speakers.yml \
|
||||||
--autocorrect autocorrect.yml \
|
--autocorrect autocorrect.yml \
|
||||||
--preprocessing-modules validate-raw,normalize-speakers,trim-text,autocorrect \
|
--preprocessing-modules validate-raw,normalize-speakers,trim-text \
|
||||||
--postprocessing-modules detect-overlaps,resolve-overlaps,coalesce,assign-ids,validate-output \
|
--postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output \
|
||||||
--output-module json \
|
--output-modules json \
|
||||||
--output-file merged.json \
|
--output-file merged.json \
|
||||||
--report-file report.json
|
--report-file report.json
|
||||||
```
|
```
|
||||||
@@ -358,7 +357,7 @@ Initial classifications may include:
|
|||||||
- `backchannel`
|
- `backchannel`
|
||||||
- `crosstalk`
|
- `crosstalk`
|
||||||
|
|
||||||
The MVP `resolve-overlaps` module may be a stub that marks groups as unresolved. This preserves the architecture for future word-level crosstalk serialization without complicating the initial implementation.
|
The `resolve-overlaps` module uses preserved word-level timing to replace detected overlap-group segments with smaller word-run segments when usable timing is available. Groups without usable word timing remain unresolved for later passes or human review.
|
||||||
|
|
||||||
Overlap resolution should be non-destructive. Original segment text, timing, and source metadata must remain recoverable.
|
Overlap resolution should be non-destructive. Original segment text, timing, and source metadata must remain recoverable.
|
||||||
|
|
||||||
@@ -403,31 +402,28 @@ To support this:
|
|||||||
- Record application version in output metadata.
|
- Record application version in output metadata.
|
||||||
- Record enabled module names and module order in output metadata or report data.
|
- Record enabled module names and module order in output metadata or report data.
|
||||||
|
|
||||||
## Suggested Go Package Layout
|
## Go Package Layout
|
||||||
|
|
||||||
```text
|
```text
|
||||||
cmd/seriatim/ CLI entrypoint
|
cmd/seriatim/ CLI entrypoint
|
||||||
internal/config/ CLI/env/config loading and validation
|
internal/config/ CLI/env/config loading and validation
|
||||||
internal/pipeline/ Pipeline orchestration and module registry
|
internal/pipeline/ Pipeline orchestration and module registry
|
||||||
internal/input/ Input readers
|
internal/builtin/ Built-in pipeline modules
|
||||||
internal/raw/ Raw transcript structs
|
internal/artifact/ Conversion from internal model to public output schema
|
||||||
internal/schema/ Schema loading and validation helpers
|
internal/buildinfo/ Build-time version metadata
|
||||||
internal/speaker/ Speaker map parsing and lookup
|
internal/speaker/ Speaker map parsing and lookup
|
||||||
internal/model/ Canonical and merged transcript models
|
internal/model/ Canonical and merged transcript models
|
||||||
internal/preprocess/ Preprocessing modules
|
|
||||||
internal/merge/ Deterministic merge logic
|
|
||||||
internal/postprocess/ Postprocessing modules
|
|
||||||
internal/overlap/ Overlap detection and refinement helpers
|
internal/overlap/ Overlap detection and refinement helpers
|
||||||
internal/autocorrect/ Word replacement rules
|
internal/autocorrect/ Word replacement rules
|
||||||
internal/report/ Report model and event accumulation
|
internal/report/ Report model and event accumulation
|
||||||
internal/output/ Output writers
|
schema/ Public output contract and JSON Schema validation
|
||||||
```
|
```
|
||||||
|
|
||||||
Package boundaries should follow data ownership. Shared models belong in `internal/model`; stage-specific behavior belongs in the relevant stage package.
|
Package boundaries should follow data ownership. Shared models belong in `internal/model`; stage-specific behavior belongs in the relevant stage package.
|
||||||
|
|
||||||
## MVP Defaults
|
## Default Modules
|
||||||
|
|
||||||
The MVP should define documented defaults equivalent to explicit module lists.
|
The default pipeline is equivalent to explicit module lists.
|
||||||
|
|
||||||
Recommended default preprocessing modules:
|
Recommended default preprocessing modules:
|
||||||
|
|
||||||
@@ -438,7 +434,11 @@ validate-raw,normalize-speakers,trim-text
|
|||||||
Recommended default postprocessing modules:
|
Recommended default postprocessing modules:
|
||||||
|
|
||||||
```text
|
```text
|
||||||
detect-overlaps,resolve-overlaps,assign-ids,validate-output
|
detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output
|
||||||
```
|
```
|
||||||
|
|
||||||
Optional modules such as `autocorrect` and `coalesce` should be opt-in until their behavior is thoroughly specified and tested.
|
The default output module is:
|
||||||
|
|
||||||
|
```text
|
||||||
|
json
|
||||||
|
```
|
||||||
|
|||||||
@@ -1,15 +1,13 @@
|
|||||||
package artifact
|
package artifact
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/schema"
|
"gitea.maximumdirect.net/eric/seriatim/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const ApplicationName = "seriatim"
|
||||||
ApplicationName = "seriatim"
|
|
||||||
Version = "dev"
|
|
||||||
)
|
|
||||||
|
|
||||||
// FromMerged converts the internal merged transcript model into the public
|
// FromMerged converts the internal merged transcript model into the public
|
||||||
// serialized output contract.
|
// serialized output contract.
|
||||||
@@ -47,7 +45,7 @@ func FromMerged(cfg config.Config, merged model.MergedTranscript) schema.Transcr
|
|||||||
return schema.Transcript{
|
return schema.Transcript{
|
||||||
Metadata: schema.Metadata{
|
Metadata: schema.Metadata{
|
||||||
Application: ApplicationName,
|
Application: ApplicationName,
|
||||||
Version: Version,
|
Version: buildinfo.Version,
|
||||||
InputReader: cfg.InputReader,
|
InputReader: cfg.InputReader,
|
||||||
InputFiles: append([]string(nil), cfg.InputFiles...),
|
InputFiles: append([]string(nil), cfg.InputFiles...),
|
||||||
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
||||||
|
|||||||
22
internal/artifact/transcript_test.go
Normal file
22
internal/artifact/transcript_test.go
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package artifact
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFromMergedUsesBuildVersion(t *testing.T) {
|
||||||
|
original := buildinfo.Version
|
||||||
|
t.Cleanup(func() {
|
||||||
|
buildinfo.Version = original
|
||||||
|
})
|
||||||
|
buildinfo.Version = "v1.0.0-test"
|
||||||
|
|
||||||
|
transcript := FromMerged(config.Config{}, model.MergedTranscript{})
|
||||||
|
if transcript.Metadata.Version != "v1.0.0-test" {
|
||||||
|
t.Fatalf("version = %q, want v1.0.0-test", transcript.Metadata.Version)
|
||||||
|
}
|
||||||
|
}
|
||||||
7
internal/buildinfo/buildinfo.go
Normal file
7
internal/buildinfo/buildinfo.go
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
package buildinfo
|
||||||
|
|
||||||
|
// Version is the application version recorded in output artifacts and CLI
|
||||||
|
// metadata. Release builds can override it with:
|
||||||
|
//
|
||||||
|
// go build -ldflags "-X gitea.maximumdirect.net/eric/seriatim/internal/buildinfo.Version=v1.0.0"
|
||||||
|
var Version = "dev"
|
||||||
@@ -2,6 +2,7 @@ package builtin
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
@@ -9,13 +10,13 @@ import (
|
|||||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||||
)
|
)
|
||||||
|
|
||||||
type placeholderMerger struct{}
|
type chronologicalMerger struct{}
|
||||||
|
|
||||||
func (placeholderMerger) Name() string {
|
func (chronologicalMerger) Name() string {
|
||||||
return "placeholder-merger"
|
return "chronological-merge"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
func (chronologicalMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return model.MergedTranscript{}, nil, err
|
return model.MergedTranscript{}, nil, err
|
||||||
}
|
}
|
||||||
@@ -33,6 +34,6 @@ func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscri
|
|||||||
Segments: segments,
|
Segments: segments,
|
||||||
OverlapGroups: nil,
|
OverlapGroups: nil,
|
||||||
}, []report.Event{
|
}, []report.Event{
|
||||||
report.Info("merge", "placeholder-merger", "merged placeholder canonical transcript(s)"),
|
report.Info("merge", "chronological-merge", fmt.Sprintf("merged %d canonical transcript(s) into %d segment(s)", len(in), len(segments))),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,24 +16,6 @@ import (
|
|||||||
"gitea.maximumdirect.net/eric/seriatim/schema"
|
"gitea.maximumdirect.net/eric/seriatim/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
type noopPostprocessor struct {
|
|
||||||
name string
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p noopPostprocessor) Name() string {
|
|
||||||
return p.name
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p noopPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
|
||||||
if err := ctx.Err(); err != nil {
|
|
||||||
return model.MergedTranscript{}, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return in, []report.Event{
|
|
||||||
report.Info("postprocessing", p.name, "completed no-op postprocessing module"),
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type assignIDs struct{}
|
type assignIDs struct{}
|
||||||
|
|
||||||
func (assignIDs) Name() string {
|
func (assignIDs) Name() string {
|
||||||
|
|||||||
@@ -13,35 +13,58 @@ import (
|
|||||||
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
|
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
|
||||||
)
|
)
|
||||||
|
|
||||||
type noopPreprocessor struct {
|
type validateRaw struct{}
|
||||||
name string
|
|
||||||
requires pipeline.ModelState
|
func (validateRaw) Name() string {
|
||||||
produces pipeline.ModelState
|
return "validate-raw"
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p noopPreprocessor) Name() string {
|
func (validateRaw) Requires() pipeline.ModelState {
|
||||||
return p.name
|
return pipeline.StateRaw
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p noopPreprocessor) Requires() pipeline.ModelState {
|
func (validateRaw) Produces() pipeline.ModelState {
|
||||||
return p.requires
|
return pipeline.StateRaw
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p noopPreprocessor) Produces() pipeline.ModelState {
|
func (validateRaw) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||||
return p.produces
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return pipeline.PreprocessState{}, nil, err
|
return pipeline.PreprocessState{}, nil, err
|
||||||
}
|
}
|
||||||
if in.State != p.requires {
|
if in.State != pipeline.StateRaw {
|
||||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, p.requires, in.State)
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "validate-raw", pipeline.StateRaw, in.State)
|
||||||
|
}
|
||||||
|
if len(in.Raw) == 0 {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: no raw transcript(s) to validate")
|
||||||
|
}
|
||||||
|
|
||||||
|
for transcriptIndex, transcript := range in.Raw {
|
||||||
|
if strings.TrimSpace(transcript.Source) == "" {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %d has empty source", transcriptIndex)
|
||||||
|
}
|
||||||
|
for segmentIndex, segment := range transcript.Segments {
|
||||||
|
if segment.Start < 0 {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has negative start", transcript.Source, segmentIndex)
|
||||||
|
}
|
||||||
|
if segment.End < segment.Start {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has end before start", transcript.Source, segmentIndex)
|
||||||
|
}
|
||||||
|
for wordIndex, word := range segment.Words {
|
||||||
|
if !word.Timed {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if word.Start < 0 {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has negative start", transcript.Source, segmentIndex, wordIndex)
|
||||||
|
}
|
||||||
|
if word.End < word.Start {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has end before start", transcript.Source, segmentIndex, wordIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
in.State = p.produces
|
|
||||||
return in, []report.Event{
|
return in, []report.Event{
|
||||||
report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
|
report.Info("preprocessing", "validate-raw", fmt.Sprintf("validated %d raw transcript(s)", len(in.Raw))),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
153
internal/builtin/preprocess_test.go
Normal file
153
internal/builtin/preprocess_test.go
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
package builtin
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestValidateRawAcceptsValidRawTranscripts(t *testing.T) {
|
||||||
|
state := pipeline.PreprocessState{
|
||||||
|
State: pipeline.StateRaw,
|
||||||
|
Raw: []model.RawTranscript{
|
||||||
|
{
|
||||||
|
Source: "input.json",
|
||||||
|
Segments: []model.RawSegment{
|
||||||
|
{
|
||||||
|
Start: 1,
|
||||||
|
End: 2,
|
||||||
|
Text: "hello",
|
||||||
|
Words: []model.Word{
|
||||||
|
{Text: "hello", Start: 1, End: 1.5, Timed: true},
|
||||||
|
{Text: "untimed"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, events, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("validate raw: %v", err)
|
||||||
|
}
|
||||||
|
if got.State != pipeline.StateRaw || len(got.Raw) != 1 {
|
||||||
|
t.Fatalf("unexpected state: %#v", got)
|
||||||
|
}
|
||||||
|
if len(events) != 1 || !strings.Contains(events[0].Message, "validated 1 raw transcript(s)") {
|
||||||
|
t.Fatalf("events = %#v", events)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRawRejectsInvalidState(t *testing.T) {
|
||||||
|
state := pipeline.PreprocessState{State: pipeline.StateCanonical}
|
||||||
|
|
||||||
|
_, _, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
assertPreprocessError(t, err, `requires state "raw"`)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRawRejectsNoRawTranscripts(t *testing.T) {
|
||||||
|
state := pipeline.PreprocessState{State: pipeline.StateRaw}
|
||||||
|
|
||||||
|
_, _, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
assertPreprocessError(t, err, "no raw transcript(s)")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRawRejectsEmptySource(t *testing.T) {
|
||||||
|
state := validRawState()
|
||||||
|
state.Raw[0].Source = " "
|
||||||
|
|
||||||
|
_, _, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
assertPreprocessError(t, err, "raw transcript 0 has empty source")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRawRejectsInvalidSegmentTiming(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
mutate func(*model.RawSegment)
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "negative start",
|
||||||
|
mutate: func(segment *model.RawSegment) {
|
||||||
|
segment.Start = -1
|
||||||
|
},
|
||||||
|
want: "segment 0 has negative start",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "end before start",
|
||||||
|
mutate: func(segment *model.RawSegment) {
|
||||||
|
segment.Start = 2
|
||||||
|
segment.End = 1
|
||||||
|
},
|
||||||
|
want: "segment 0 has end before start",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
state := validRawState()
|
||||||
|
test.mutate(&state.Raw[0].Segments[0])
|
||||||
|
|
||||||
|
_, _, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
assertPreprocessError(t, err, test.want)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateRawRejectsInvalidTimedWordTiming(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
word model.Word
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "negative start",
|
||||||
|
word: model.Word{Text: "bad", Start: -1, End: 1, Timed: true},
|
||||||
|
want: "word 0 has negative start",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "end before start",
|
||||||
|
word: model.Word{Text: "bad", Start: 2, End: 1, Timed: true},
|
||||||
|
want: "word 0 has end before start",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
state := validRawState()
|
||||||
|
state.Raw[0].Segments[0].Words = []model.Word{test.word}
|
||||||
|
|
||||||
|
_, _, err := validateRaw{}.Process(context.Background(), state, config.Config{})
|
||||||
|
assertPreprocessError(t, err, test.want)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validRawState() pipeline.PreprocessState {
|
||||||
|
return pipeline.PreprocessState{
|
||||||
|
State: pipeline.StateRaw,
|
||||||
|
Raw: []model.RawTranscript{
|
||||||
|
{
|
||||||
|
Source: "input.json",
|
||||||
|
Segments: []model.RawSegment{
|
||||||
|
{Start: 1, End: 2, Text: ""},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func assertPreprocessError(t *testing.T, err error, want string) {
|
||||||
|
t.Helper()
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("expected error containing %q", want)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), want) {
|
||||||
|
t.Fatalf("expected error containing %q, got %v", want, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2,15 +2,15 @@ package builtin
|
|||||||
|
|
||||||
import "gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
import "gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||||
|
|
||||||
// NewRegistry registers the MVP built-in modules.
|
// NewRegistry registers the built-in modules.
|
||||||
func NewRegistry() *pipeline.Registry {
|
func NewRegistry() *pipeline.Registry {
|
||||||
registry := pipeline.NewRegistry()
|
registry := pipeline.NewRegistry()
|
||||||
|
|
||||||
registry.RegisterInputReader(jsonFilesReader{})
|
registry.RegisterInputReader(jsonFilesReader{})
|
||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
registry.RegisterPreprocessor(validateRaw{})
|
||||||
registry.RegisterPreprocessor(normalizeSpeakers{})
|
registry.RegisterPreprocessor(normalizeSpeakers{})
|
||||||
registry.RegisterPreprocessor(trimText{})
|
registry.RegisterPreprocessor(trimText{})
|
||||||
registry.RegisterMerger(placeholderMerger{})
|
registry.RegisterMerger(chronologicalMerger{})
|
||||||
registry.RegisterPostprocessor(detectOverlaps{})
|
registry.RegisterPostprocessor(detectOverlaps{})
|
||||||
registry.RegisterPostprocessor(resolveOverlaps{})
|
registry.RegisterPostprocessor(resolveOverlaps{})
|
||||||
registry.RegisterPostprocessor(backchannelPostprocessor{})
|
registry.RegisterPostprocessor(backchannelPostprocessor{})
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
|||||||
t.Fatalf("did not expect words in output:\n%s", outputJSON)
|
t.Fatalf("did not expect words in output:\n%s", outputJSON)
|
||||||
}
|
}
|
||||||
if len(transcript.OverlapGroups) != 0 {
|
if len(transcript.OverlapGroups) != 0 {
|
||||||
t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
t.Fatalf("expected output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
||||||
}
|
}
|
||||||
|
|
||||||
var rpt report.Report
|
var rpt report.Report
|
||||||
@@ -87,7 +87,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
|||||||
"validate-raw",
|
"validate-raw",
|
||||||
"normalize-speakers",
|
"normalize-speakers",
|
||||||
"trim-text",
|
"trim-text",
|
||||||
"placeholder-merger",
|
"chronological-merge",
|
||||||
"detect-overlaps",
|
"detect-overlaps",
|
||||||
"resolve-overlaps",
|
"resolve-overlaps",
|
||||||
"backchannel",
|
"backchannel",
|
||||||
@@ -105,6 +105,9 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
|||||||
if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 3 output segment(s)") {
|
if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 3 output segment(s)") {
|
||||||
t.Fatal("expected validate-output report event")
|
t.Fatal("expected validate-output report event")
|
||||||
}
|
}
|
||||||
|
if !hasReportEvent(rpt, "merge", "chronological-merge", "merged 2 canonical transcript(s) into 3 segment(s)") {
|
||||||
|
t.Fatal("expected chronological-merge report event")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMergeTieBreakOrder(t *testing.T) {
|
func TestMergeTieBreakOrder(t *testing.T) {
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ package cli
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
|
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
|
||||||
)
|
)
|
||||||
|
|
||||||
// NewRootCommand builds the seriatim command tree.
|
// NewRootCommand builds the seriatim command tree.
|
||||||
@@ -9,6 +11,7 @@ func NewRootCommand() *cobra.Command {
|
|||||||
cmd := &cobra.Command{
|
cmd := &cobra.Command{
|
||||||
Use: "seriatim",
|
Use: "seriatim",
|
||||||
Short: "Merge per-speaker transcripts into a chronological transcript",
|
Short: "Merge per-speaker transcripts into a chronological transcript",
|
||||||
|
Version: buildinfo.Version,
|
||||||
SilenceErrors: true,
|
SilenceErrors: true,
|
||||||
SilenceUsage: true,
|
SilenceUsage: true,
|
||||||
}
|
}
|
||||||
|
|||||||
10
internal/cli/root_test.go
Normal file
10
internal/cli/root_test.go
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
package cli
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestRootCommandDefaultsVersionToDev(t *testing.T) {
|
||||||
|
cmd := NewRootCommand()
|
||||||
|
if cmd.Version != "dev" {
|
||||||
|
t.Fatalf("version = %q, want dev", cmd.Version)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
|
"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||||
@@ -13,7 +14,6 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
applicationName = artifact.ApplicationName
|
applicationName = artifact.ApplicationName
|
||||||
version = artifact.Version
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Run validates module composition, executes the pipeline, and emits outputs.
|
// Run validates module composition, executes the pipeline, and emits outputs.
|
||||||
@@ -149,7 +149,7 @@ func finalizeReport(cfg config.Config, events []report.Event) report.Report {
|
|||||||
return report.Report{
|
return report.Report{
|
||||||
Metadata: report.Metadata{
|
Metadata: report.Metadata{
|
||||||
Application: applicationName,
|
Application: applicationName,
|
||||||
Version: version,
|
Version: buildinfo.Version,
|
||||||
InputReader: cfg.InputReader,
|
InputReader: cfg.InputReader,
|
||||||
InputFiles: append([]string(nil), cfg.InputFiles...),
|
InputFiles: append([]string(nil), cfg.InputFiles...),
|
||||||
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
||||||
|
|||||||
21
internal/pipeline/runner_test.go
Normal file
21
internal/pipeline/runner_test.go
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFinalizeReportUsesBuildVersion(t *testing.T) {
|
||||||
|
original := buildinfo.Version
|
||||||
|
t.Cleanup(func() {
|
||||||
|
buildinfo.Version = original
|
||||||
|
})
|
||||||
|
buildinfo.Version = "v1.0.0-test"
|
||||||
|
|
||||||
|
rpt := finalizeReport(config.Config{}, nil)
|
||||||
|
if rpt.Metadata.Version != "v1.0.0-test" {
|
||||||
|
t.Fatalf("version = %q, want v1.0.0-test", rpt.Metadata.Version)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user