From 9cca88280f7bfc922891ab533e78e9bac599bbf1 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Tue, 28 Apr 2026 14:39:00 -0500 Subject: [PATCH] Added support for a minimal JSON output schema --- README.md | 30 +++++- architecture.md | 5 +- internal/artifact/transcript.go | 35 +++++++ internal/artifact/transcript_test.go | 53 ++++++++++ internal/builtin/output.go | 3 +- internal/builtin/postprocess.go | 13 ++- internal/builtin/postprocess_test.go | 50 +++++++++ internal/cli/merge.go | 1 + internal/cli/merge_test.go | 100 ++++++++++++++++++ internal/config/config.go | 21 ++++ internal/config/config_test.go | 65 ++++++++++++ internal/pipeline/interfaces.go | 3 +- internal/pipeline/runner.go | 5 +- schema/minimal-output.schema.json | 38 +++++++ schema/output.go | 131 +++++++++++++++++------ schema/output_test.go | 149 +++++++++++++++++++++++++++ 16 files changed, 658 insertions(+), 44 deletions(-) create mode 100644 schema/minimal-output.schema.json diff --git a/README.md b/README.md index fc66720..58a5b5b 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ Global flags: | `--autocorrect` | No | none | Autocorrect rules YAML file. When omitted, the default `autocorrect` module leaves text unchanged. | | `--input-reader` | No | `json-files` | Input reader module. | | `--output-modules` | No | `json` | Comma-separated output modules. | +| `--output-schema` | No | `seriatim` | JSON output contract. Allowed values are `seriatim` and `minimal`. | | `--preprocessing-modules` | No | `validate-raw,normalize-speakers,trim-text` | Comma-separated preprocessing modules, evaluated in order. | | `--postprocessing-modules` | No | `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output` | Comma-separated postprocessing modules, evaluated in order. | | `--coalesce-gap` | No | `3.0` | Maximum same-speaker gap in seconds for `coalesce`; also used as the `resolve-overlaps` context window. Must be a non-negative float. | @@ -156,7 +157,9 @@ The old `inputs:` direct mapping format is no longer supported. ## Output JSON Format -The merged output uses the current seriatim envelope: +`--output-modules json` controls the writer. `--output-schema` controls the JSON contract that writer serializes. + +The default `seriatim` schema uses the full seriatim envelope: ```json { @@ -206,6 +209,29 @@ The merged output uses the current seriatim envelope: } ``` +The `minimal` schema emits minimal metadata and compact ordered segments: + +```json +{ + "metadata": { + "application": "seriatim", + "version": "dev", + "output_schema": "minimal" + }, + "segments": [ + { + "id": 1, + "start": 1.25, + "end": 3.5, + "speaker": "Eric Rakestraw", + "text": "Hello there." + } + ] +} +``` + +Minimal output intentionally omits overlap groups, categories, source/provenance fields, and pipeline configuration metadata. + Segments are sorted deterministically by: ```text @@ -220,7 +246,7 @@ The public Go output contract is available from: import "gitea.maximumdirect.net/eric/seriatim/schema" ``` -The same package embeds the machine-readable JSON Schema in `schema/output.schema.json`. The default `validate-output` postprocessor validates the output shape and verifies final segment IDs are present, sequential, and start at `1`. +The same package embeds machine-readable JSON Schemas in `schema/output.schema.json` and `schema/minimal-output.schema.json`. The default `validate-output` postprocessor validates the selected output shape and verifies final segment IDs are present, sequential, and start at `1`. ## Overlap Detection diff --git a/architecture.md b/architecture.md index 4e996db..70ebeb2 100644 --- a/architecture.md +++ b/architecture.md @@ -216,6 +216,7 @@ seriatim merge \ --preprocessing-modules validate-raw,normalize-speakers,trim-text \ --postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output \ --output-modules json \ + --output-schema seriatim \ --output-file merged.json \ --report-file report.json ``` @@ -260,7 +261,7 @@ type Postprocessor interface { type OutputWriter interface { Name() string - Write(ctx context.Context, out FinalTranscript, report Report, cfg Config) ([]ReportEvent, error) + Write(ctx context.Context, out any, report Report, cfg Config) ([]ReportEvent, error) } ``` @@ -386,7 +387,7 @@ A valid merged transcript should satisfy: - Any `overlap_group_id` on a segment refers to an existing overlap group. - Every overlap group references at least two segments. - Every referenced segment exists. -- Output validates against the output schema. +- Output validates against the selected output schema. ## Determinism Requirements diff --git a/internal/artifact/transcript.go b/internal/artifact/transcript.go index ac5750d..1e8b5e2 100644 --- a/internal/artifact/transcript.go +++ b/internal/artifact/transcript.go @@ -57,6 +57,41 @@ func FromMerged(cfg config.Config, merged model.MergedTranscript) schema.Transcr } } +// MinimalFromMerged converts the internal merged transcript model into the +// compact public serialized output contract. +func MinimalFromMerged(cfg config.Config, merged model.MergedTranscript) schema.MinimalTranscript { + segments := make([]schema.MinimalSegment, len(merged.Segments)) + for index, segment := range merged.Segments { + segments[index] = schema.MinimalSegment{ + ID: segment.ID, + Start: segment.Start, + End: segment.End, + Speaker: segment.Speaker, + Text: segment.Text, + } + } + + return schema.MinimalTranscript{ + Metadata: schema.MinimalMetadata{ + Application: ApplicationName, + Version: buildinfo.Version, + OutputSchema: config.OutputSchemaMinimal, + }, + Segments: segments, + } +} + +// SelectedFromMerged converts the internal merged transcript model into the +// runtime-selected public output contract. +func SelectedFromMerged(cfg config.Config, merged model.MergedTranscript) any { + switch cfg.OutputSchema { + case config.OutputSchemaMinimal: + return MinimalFromMerged(cfg, merged) + default: + return FromMerged(cfg, merged) + } +} + func copyIntPtr(value *int) *int { if value == nil { return nil diff --git a/internal/artifact/transcript_test.go b/internal/artifact/transcript_test.go index 864fec9..087b3d6 100644 --- a/internal/artifact/transcript_test.go +++ b/internal/artifact/transcript_test.go @@ -1,11 +1,13 @@ package artifact import ( + "reflect" "testing" "gitea.maximumdirect.net/eric/seriatim/internal/buildinfo" "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" + "gitea.maximumdirect.net/eric/seriatim/schema" ) func TestFromMergedUsesBuildVersion(t *testing.T) { @@ -20,3 +22,54 @@ func TestFromMergedUsesBuildVersion(t *testing.T) { t.Fatalf("version = %q, want v1.0.0-test", transcript.Metadata.Version) } } + +func TestSelectedFromMergedDefaultsToSeriatimTranscript(t *testing.T) { + got := SelectedFromMerged(config.Config{}, model.MergedTranscript{}) + if _, ok := got.(schema.Transcript); !ok { + t.Fatalf("selected artifact type = %T, want schema.Transcript", got) + } +} + +func TestMinimalFromMergedEmitsOnlyMinimalShape(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + { + ID: 1, + Source: "input.json", + SourceRef: "word-run:1:1:1", + DerivedFrom: []string{"input.json#0"}, + Speaker: "Alice", + Start: 1, + End: 2, + Text: "hello", + Categories: []string{"backchannel"}, + OverlapGroupID: 1, + }, + }, + OverlapGroups: []model.OverlapGroup{ + {ID: 1, Start: 1, End: 2, Segments: []string{"input.json#0"}, Speakers: []string{"Alice"}, Class: "unknown", Resolution: "unresolved"}, + }, + } + + got := MinimalFromMerged(config.Config{OutputSchema: config.OutputSchemaMinimal}, merged) + want := schema.MinimalTranscript{ + Metadata: schema.MinimalMetadata{ + Application: ApplicationName, + Version: buildinfo.Version, + OutputSchema: config.OutputSchemaMinimal, + }, + Segments: []schema.MinimalSegment{ + {ID: 1, Start: 1, End: 2, Speaker: "Alice", Text: "hello"}, + }, + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("minimal transcript = %#v, want %#v", got, want) + } +} + +func TestSelectedFromMergedUsesMinimalWhenConfigured(t *testing.T) { + got := SelectedFromMerged(config.Config{OutputSchema: config.OutputSchemaMinimal}, model.MergedTranscript{}) + if _, ok := got.(schema.MinimalTranscript); !ok { + t.Fatalf("selected artifact type = %T, want schema.MinimalTranscript", got) + } +} diff --git a/internal/builtin/output.go b/internal/builtin/output.go index 9d451d9..2b46881 100644 --- a/internal/builtin/output.go +++ b/internal/builtin/output.go @@ -7,7 +7,6 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/report" - "gitea.maximumdirect.net/eric/seriatim/schema" ) type jsonOutputWriter struct{} @@ -16,7 +15,7 @@ func (jsonOutputWriter) Name() string { return "json" } -func (jsonOutputWriter) Write(ctx context.Context, out schema.Transcript, rpt report.Report, cfg config.Config) ([]report.Event, error) { +func (jsonOutputWriter) Write(ctx context.Context, out any, rpt report.Report, cfg config.Config) ([]report.Event, error) { if err := ctx.Err(); err != nil { return nil, err } diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go index 358b9c3..887f69c 100644 --- a/internal/builtin/postprocess.go +++ b/internal/builtin/postprocess.go @@ -47,8 +47,17 @@ func (validateOutput) Process(ctx context.Context, in model.MergedTranscript, cf return model.MergedTranscript{}, nil, err } - transcript := artifact.FromMerged(cfg, in) - if err := schema.ValidateTranscript(transcript); err != nil { + selected := artifact.SelectedFromMerged(cfg, in) + var err error + switch transcript := selected.(type) { + case schema.MinimalTranscript: + err = schema.ValidateMinimalTranscript(transcript) + case schema.Transcript: + err = schema.ValidateTranscript(transcript) + default: + err = fmt.Errorf("unsupported output artifact type %T", selected) + } + if err != nil { return model.MergedTranscript{}, nil, fmt.Errorf("validate-output: %w", err) } diff --git a/internal/builtin/postprocess_test.go b/internal/builtin/postprocess_test.go index 91bab46..afbaaa6 100644 --- a/internal/builtin/postprocess_test.go +++ b/internal/builtin/postprocess_test.go @@ -49,6 +49,56 @@ func TestValidateOutputFailsBeforeAssignIDs(t *testing.T) { } } +func TestValidateOutputUsesMinimalSchemaWhenConfigured(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + { + ID: 1, + Source: "input.json", + SourceRef: "word-run:1:1:1", + DerivedFrom: []string{"input.json#0"}, + Speaker: "Alice", + Start: 1, + End: 2, + Text: "hello", + Categories: []string{"backchannel"}, + OverlapGroupID: 1, + }, + }, + } + + cfg := testConfig() + cfg.OutputSchema = config.OutputSchemaMinimal + got, events, err := validateOutput{}.Process(context.Background(), merged, cfg) + if err != nil { + t.Fatalf("validate output: %v", err) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + if len(events) != 1 || !strings.Contains(events[0].Message, "validated 1 output segment(s)") { + t.Fatalf("events = %#v", events) + } +} + +func TestValidateOutputMinimalFailsBeforeAssignIDs(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + {Source: "input.json", Speaker: "Alice", Start: 1, End: 2, Text: "hello"}, + }, + } + + cfg := testConfig() + cfg.OutputSchema = config.OutputSchemaMinimal + _, _, err := validateOutput{}.Process(context.Background(), merged, cfg) + if err == nil { + t.Fatal("expected validation error") + } + if !strings.Contains(err.Error(), "segment 0 has id 0; want 1") { + t.Fatalf("unexpected error: %v", err) + } +} + func testConfig() config.Config { return config.Config{ InputReader: config.DefaultInputReader, diff --git a/internal/cli/merge.go b/internal/cli/merge.go index 7d7553b..9e0a0f2 100644 --- a/internal/cli/merge.go +++ b/internal/cli/merge.go @@ -32,6 +32,7 @@ func newMergeCommand() *cobra.Command { flags.StringVar(&opts.AutocorrectFile, "autocorrect", "", "autocorrect rules file") flags.StringVar(&opts.InputReader, "input-reader", config.DefaultInputReader, "input reader module") flags.StringVar(&opts.OutputModules, "output-modules", config.DefaultOutputModules, "comma-separated output modules") + flags.StringVar(&opts.OutputSchema, "output-schema", config.DefaultOutputSchema, "output JSON schema: seriatim or minimal") flags.StringVar(&opts.PreprocessingModules, "preprocessing-modules", config.DefaultPreprocessingModules, "comma-separated preprocessing modules") flags.StringVar(&opts.PostprocessingModules, "postprocessing-modules", config.DefaultPostprocessingModules, "comma-separated postprocessing modules") flags.StringVar(&opts.CoalesceGap, "coalesce-gap", config.DefaultCoalesceGapValue, "maximum same-speaker gap in seconds for coalesce") diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index a0e6cd5..4f527a3 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -10,6 +10,7 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" + "gitea.maximumdirect.net/eric/seriatim/schema" ) func TestMergeWritesMergedOutputAndReport(t *testing.T) { @@ -111,6 +112,64 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { } } +func TestMergeWritesMinimalOutputSchema(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": " Yeah. "}, + {"start": 8, "end": 9, "text": " next "} + ] + }`) + output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--output-schema", "minimal", + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript schema.MinimalTranscript + readJSON(t, output, &transcript) + if transcript.Metadata.Application != "seriatim" { + t.Fatalf("application = %q, want seriatim", transcript.Metadata.Application) + } + if transcript.Metadata.OutputSchema != "minimal" { + t.Fatalf("output_schema = %q, want minimal", transcript.Metadata.OutputSchema) + } + if got, want := len(transcript.Segments), 2; got != want { + t.Fatalf("segment count = %d, want %d", got, want) + } + for index, segment := range transcript.Segments { + if segment.ID != index+1 { + t.Fatalf("segment %d id = %d, want %d", index, segment.ID, index+1) + } + } + if transcript.Segments[0].Speaker != "input.json" || transcript.Segments[0].Text != "Yeah." { + t.Fatalf("first segment = %#v", transcript.Segments[0]) + } + + outputBytes, err := os.ReadFile(output) + if err != nil { + t.Fatalf("read output: %v", err) + } + for _, forbidden := range []string{"overlap_groups", "categories", "source", "derived_from"} { + if strings.Contains(string(outputBytes), forbidden) { + t.Fatalf("minimal output contains %q:\n%s", forbidden, outputBytes) + } + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 2 output segment(s)") { + t.Fatal("expected validate-output report event") + } +} + func TestMergeTieBreakOrder(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ @@ -182,6 +241,29 @@ func TestMergeValidateOutputBeforeAssignIDsFails(t *testing.T) { } } +func TestMergeValidateMinimalOutputBeforeAssignIDsFails(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": "hello"} + ] + }`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--output-schema", "minimal", + "--postprocessing-modules", "validate-output,assign-ids", + ) + if err == nil { + t.Fatal("expected validation error") + } + if !strings.Contains(err.Error(), "validate-output: segment 0 has id 0; want 1") { + t.Fatalf("unexpected error: %v", err) + } +} + func TestMergeDetectsOverlapGroups(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ @@ -963,6 +1045,24 @@ func TestUnknownModulesFailDuringValidation(t *testing.T) { } } +func TestUnknownOutputSchemaFailsDuringValidation(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--output-schema", "compact", + ) + if err == nil { + t.Fatal("expected output schema error") + } + if !strings.Contains(err.Error(), "--output-schema must be one of") { + t.Fatalf("unexpected error: %v", err) + } +} + func TestInvalidPreprocessingOrderFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) diff --git a/internal/config/config.go b/internal/config/config.go index 88b86db..bd350b2 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -13,6 +13,7 @@ import ( const ( DefaultInputReader = "json-files" DefaultOutputModules = "json" + DefaultOutputSchema = OutputSchemaSeriatim DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output" DefaultOverlapWordRunGap = 0.75 @@ -25,6 +26,8 @@ const ( WordRunReorderWindowEnv = "SERIATIM_OVERLAP_WORD_RUN_REORDER_WINDOW" BackchannelMaxDurationEnv = "SERIATIM_BACKCHANNEL_MAX_DURATION" FillerMaxDurationEnv = "SERIATIM_FILLER_MAX_DURATION" + OutputSchemaSeriatim = "seriatim" + OutputSchemaMinimal = "minimal" ) // MergeOptions captures raw CLI option values before validation. @@ -36,6 +39,7 @@ type MergeOptions struct { AutocorrectFile string InputReader string OutputModules string + OutputSchema string PreprocessingModules string PostprocessingModules string CoalesceGap string @@ -50,6 +54,7 @@ type Config struct { AutocorrectFile string InputReader string OutputModules []string + OutputSchema string PreprocessingModules []string PostprocessingModules []string OverlapWordRunGap float64 @@ -64,6 +69,7 @@ func NewMergeConfig(opts MergeOptions) (Config, error) { cfg := Config{ InputReader: strings.TrimSpace(opts.InputReader), OutputModules: nil, + OutputSchema: strings.TrimSpace(opts.OutputSchema), PreprocessingModules: nil, PostprocessingModules: nil, OverlapWordRunGap: DefaultOverlapWordRunGap, @@ -76,6 +82,12 @@ func NewMergeConfig(opts MergeOptions) (Config, error) { if cfg.InputReader == "" { return Config{}, errors.New("--input-reader is required") } + if cfg.OutputSchema == "" { + cfg.OutputSchema = DefaultOutputSchema + } + if err := validateOutputSchema(cfg.OutputSchema); err != nil { + return Config{}, err + } var err error cfg.OutputModules, err = parseModuleList(opts.OutputModules) @@ -174,6 +186,15 @@ func parseModuleList(value string) ([]string, error) { return names, nil } +func validateOutputSchema(value string) error { + switch value { + case OutputSchemaSeriatim, OutputSchemaMinimal: + return nil + default: + return fmt.Errorf("--output-schema must be one of %q or %q", OutputSchemaSeriatim, OutputSchemaMinimal) + } +} + func normalizeInputFiles(paths []string) ([]string, error) { if len(paths) == 0 { return nil, errors.New("at least one --input-file is required") diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 20b521f..3033392 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -46,6 +46,71 @@ func TestDuplicateInputFilesFailValidation(t *testing.T) { } } +func TestOutputSchemaDefaultsToSeriatim(t *testing.T) { + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + cfg, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err != nil { + t.Fatalf("config failed: %v", err) + } + if cfg.OutputSchema != DefaultOutputSchema { + t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, DefaultOutputSchema) + } +} + +func TestOutputSchemaAcceptsMinimal(t *testing.T) { + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + cfg, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + OutputSchema: OutputSchemaMinimal, + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err != nil { + t.Fatalf("config failed: %v", err) + } + if cfg.OutputSchema != OutputSchemaMinimal { + t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaMinimal) + } +} + +func TestOutputSchemaRejectsUnknownValue(t *testing.T) { + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + _, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + OutputSchema: "compact", + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err == nil { + t.Fatal("expected output schema error") + } + if !strings.Contains(err.Error(), "--output-schema must be one of") { + t.Fatalf("unexpected error: %v", err) + } +} + func TestOverlapWordRunGapDefaultsTo075(t *testing.T) { t.Setenv(OverlapWordRunGapEnv, "") dir := t.TempDir() diff --git a/internal/pipeline/interfaces.go b/internal/pipeline/interfaces.go index 1227eb7..79f1d46 100644 --- a/internal/pipeline/interfaces.go +++ b/internal/pipeline/interfaces.go @@ -6,7 +6,6 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" - "gitea.maximumdirect.net/eric/seriatim/schema" ) // ModelState identifies which representation a preprocessing module consumes. @@ -53,5 +52,5 @@ type Postprocessor interface { // OutputWriter emits final artifacts. type OutputWriter interface { Name() string - Write(ctx context.Context, out schema.Transcript, rpt report.Report, cfg config.Config) ([]report.Event, error) + Write(ctx context.Context, out any, rpt report.Report, cfg config.Config) ([]report.Event, error) } diff --git a/internal/pipeline/runner.go b/internal/pipeline/runner.go index 6c7e1df..965ce92 100644 --- a/internal/pipeline/runner.go +++ b/internal/pipeline/runner.go @@ -9,7 +9,6 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" - "gitea.maximumdirect.net/eric/seriatim/schema" ) const ( @@ -141,8 +140,8 @@ func validatePreprocessors(modules []Preprocessor) error { return nil } -func finalizeTranscript(cfg config.Config, merged model.MergedTranscript) schema.Transcript { - return artifact.FromMerged(cfg, merged) +func finalizeTranscript(cfg config.Config, merged model.MergedTranscript) any { + return artifact.SelectedFromMerged(cfg, merged) } func finalizeReport(cfg config.Config, events []report.Event) report.Report { diff --git a/schema/minimal-output.schema.json b/schema/minimal-output.schema.json new file mode 100644 index 0000000..ca6fa24 --- /dev/null +++ b/schema/minimal-output.schema.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/minimal-output.schema.json", + "title": "seriatim minimal output transcript", + "type": "object", + "additionalProperties": false, + "required": ["metadata", "segments"], + "properties": { + "metadata": { + "type": "object", + "additionalProperties": false, + "required": ["application", "version", "output_schema"], + "properties": { + "application": { "type": "string" }, + "version": { "type": "string" }, + "output_schema": { "type": "string", "const": "minimal" } + } + }, + "segments": { + "type": "array", + "items": { "$ref": "#/$defs/segment" } + } + }, + "$defs": { + "segment": { + "type": "object", + "additionalProperties": false, + "required": ["id", "start", "end", "speaker", "text"], + "properties": { + "id": { "type": "integer", "minimum": 1 }, + "start": { "type": "number" }, + "end": { "type": "number" }, + "speaker": { "type": "string" }, + "text": { "type": "string" } + } + } + } +} diff --git a/schema/output.go b/schema/output.go index ea988cb..4510563 100644 --- a/schema/output.go +++ b/schema/output.go @@ -10,15 +10,18 @@ import ( "github.com/santhosh-tekuri/jsonschema/v6" ) -//go:embed output.schema.json +//go:embed *.schema.json var schemaFS embed.FS -const outputSchemaPath = "output.schema.json" +const ( + outputSchemaPath = "output.schema.json" + minimalOutputSchemaPath = "minimal-output.schema.json" +) var ( - compiledOutputSchema *jsonschema.Schema - compileOnce sync.Once - compileErr error + compiledSchemas = make(map[string]*jsonschema.Schema) + compileErrs = make(map[string]error) + compileMu sync.Mutex ) // Transcript is seriatim's public JSON output contract. @@ -28,6 +31,12 @@ type Transcript struct { OverlapGroups []OverlapGroup `json:"overlap_groups"` } +// MinimalTranscript is seriatim's compact public JSON output contract. +type MinimalTranscript struct { + Metadata MinimalMetadata `json:"metadata"` + Segments []MinimalSegment `json:"segments"` +} + // Metadata records the pipeline configuration that produced an artifact. type Metadata struct { Application string `json:"application"` @@ -39,6 +48,13 @@ type Metadata struct { OutputModules []string `json:"output_modules"` } +// MinimalMetadata records minimal artifact identity. +type MinimalMetadata struct { + Application string `json:"application"` + Version string `json:"version"` + OutputSchema string `json:"output_schema"` +} + // Segment is the public transcript segment shape. type Segment struct { ID int `json:"id"` @@ -54,6 +70,15 @@ type Segment struct { OverlapGroupID int `json:"overlap_group_id,omitempty"` } +// MinimalSegment is the compact public transcript segment shape. +type MinimalSegment struct { + ID int `json:"id"` + Start float64 `json:"start"` + End float64 `json:"end"` + Speaker string `json:"speaker"` + Text string `json:"text"` +} + // OverlapGroup describes a detected overlapping speech region. type OverlapGroup struct { ID int `json:"id"` @@ -79,8 +104,32 @@ func ValidateTranscript(transcript Transcript) error { return ValidateJSON(data) } +// ValidateMinimalTranscript validates a minimal transcript against the minimal +// JSON schema and seriatim-specific semantic rules. +func ValidateMinimalTranscript(transcript MinimalTranscript) error { + if err := validateMinimalSemantics(transcript); err != nil { + return err + } + + data, err := json.Marshal(transcript) + if err != nil { + return fmt.Errorf("marshal minimal transcript for schema validation: %w", err) + } + return ValidateMinimalJSON(data) +} + // ValidateJSON validates serialized output JSON against the public schema. func ValidateJSON(data []byte) error { + return validateJSONWithSchema(data, outputSchemaPath) +} + +// ValidateMinimalJSON validates serialized minimal output JSON against the +// minimal public schema. +func ValidateMinimalJSON(data []byte) error { + return validateJSONWithSchema(data, minimalOutputSchemaPath) +} + +func validateJSONWithSchema(data []byte, schemaPath string) error { var value any decoder := json.NewDecoder(bytes.NewReader(data)) decoder.UseNumber() @@ -88,7 +137,7 @@ func ValidateJSON(data []byte) error { return fmt.Errorf("decode output JSON for schema validation: %w", err) } - compiled, err := outputSchema() + compiled, err := outputSchema(schemaPath) if err != nil { return err } @@ -98,32 +147,39 @@ func ValidateJSON(data []byte) error { return nil } -func outputSchema() (*jsonschema.Schema, error) { - compileOnce.Do(func() { - data, err := schemaFS.ReadFile(outputSchemaPath) - if err != nil { - compileErr = fmt.Errorf("read embedded output schema: %w", err) - return - } - var schemaDocument any - decoder := json.NewDecoder(bytes.NewReader(data)) - decoder.UseNumber() - if err := decoder.Decode(&schemaDocument); err != nil { - compileErr = fmt.Errorf("decode embedded output schema: %w", err) - return - } +func outputSchema(schemaPath string) (*jsonschema.Schema, error) { + compileMu.Lock() + defer compileMu.Unlock() - compiler := jsonschema.NewCompiler() - if err := compiler.AddResource(outputSchemaPath, schemaDocument); err != nil { - compileErr = fmt.Errorf("load embedded output schema: %w", err) - return - } - compiledOutputSchema, compileErr = compiler.Compile(outputSchemaPath) - if compileErr != nil { - compileErr = fmt.Errorf("compile embedded output schema: %w", compileErr) - } - }) - return compiledOutputSchema, compileErr + if compiled, exists := compiledSchemas[schemaPath]; exists { + return compiled, compileErrs[schemaPath] + } + + data, err := schemaFS.ReadFile(schemaPath) + if err != nil { + compileErrs[schemaPath] = fmt.Errorf("read embedded output schema: %w", err) + return nil, compileErrs[schemaPath] + } + var schemaDocument any + decoder := json.NewDecoder(bytes.NewReader(data)) + decoder.UseNumber() + if err := decoder.Decode(&schemaDocument); err != nil { + compileErrs[schemaPath] = fmt.Errorf("decode embedded output schema: %w", err) + return nil, compileErrs[schemaPath] + } + + compiler := jsonschema.NewCompiler() + if err := compiler.AddResource(schemaPath, schemaDocument); err != nil { + compileErrs[schemaPath] = fmt.Errorf("load embedded output schema: %w", err) + return nil, compileErrs[schemaPath] + } + compiled, err := compiler.Compile(schemaPath) + if err != nil { + compileErrs[schemaPath] = fmt.Errorf("compile embedded output schema: %w", err) + return nil, compileErrs[schemaPath] + } + compiledSchemas[schemaPath] = compiled + return compiled, nil } func validateSemantics(transcript Transcript) error { @@ -143,3 +199,16 @@ func validateSemantics(transcript Transcript) error { } return nil } + +func validateMinimalSemantics(transcript MinimalTranscript) error { + for index, segment := range transcript.Segments { + wantID := index + 1 + if segment.ID != wantID { + return fmt.Errorf("segment %d has id %d; want %d", index, segment.ID, wantID) + } + if segment.End < segment.Start { + return fmt.Errorf("segment %d has end %.3f before start %.3f", index, segment.End, segment.Start) + } + } + return nil +} diff --git a/schema/output_test.go b/schema/output_test.go index d7467a9..11ca295 100644 --- a/schema/output_test.go +++ b/schema/output_test.go @@ -13,6 +13,96 @@ func TestValidateTranscriptAcceptsValidTranscript(t *testing.T) { } } +func TestValidateMinimalTranscriptAcceptsValidTranscript(t *testing.T) { + transcript := validMinimalTranscript() + + if err := ValidateMinimalTranscript(transcript); err != nil { + t.Fatalf("validate minimal transcript: %v", err) + } +} + +func TestValidateMinimalJSONRejectsMissingRequiredField(t *testing.T) { + err := ValidateMinimalJSON([]byte(`{ + "metadata": { + "application": "seriatim", + "version": "dev", + "output_schema": "minimal" + } + }`)) + assertErrorContains(t, err, "segments") +} + +func TestValidateMinimalJSONRejectsWrongFieldType(t *testing.T) { + err := ValidateMinimalJSON([]byte(`{ + "metadata": { + "application": "seriatim", + "version": "dev", + "output_schema": "minimal" + }, + "segments": [ + { + "id": "1", + "start": 1, + "end": 2, + "speaker": "Alice", + "text": "hello" + } + ] + }`)) + assertErrorContains(t, err, "id") +} + +func TestValidateMinimalJSONRejectsUnexpectedFields(t *testing.T) { + tests := []struct { + name string + json string + }{ + { + name: "top-level overlap groups", + json: `{ + "metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"}, + "segments": [], + "overlap_groups": [] + }`, + }, + { + name: "segment source", + json: `{ + "metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"}, + "segments": [{"id": 1, "source": "input.json", "start": 1, "end": 2, "speaker": "Alice", "text": "hello"}] + }`, + }, + { + name: "segment categories", + json: `{ + "metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"}, + "segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "categories": ["backchannel"]}] + }`, + }, + { + name: "segment derived from", + json: `{ + "metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"}, + "segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "derived_from": ["input.json#0"]}] + }`, + }, + { + name: "segment words", + json: `{ + "metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"}, + "segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "words": []}] + }`, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + err := ValidateMinimalJSON([]byte(test.json)) + assertErrorContains(t, err, "additional properties") + }) + } +} + func TestValidateJSONRejectsMissingRequiredField(t *testing.T) { err := ValidateJSON([]byte(`{ "metadata": { @@ -135,6 +225,46 @@ func TestValidateTranscriptRejectsInvalidTiming(t *testing.T) { assertErrorContains(t, err, "segment 0 has end") } +func TestValidateMinimalTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T) { + tests := []struct { + name string + ids []int + want string + }{ + {name: "missing zero id", ids: []int{0}, want: "segment 0 has id 0; want 1"}, + {name: "does not start at one", ids: []int{2}, want: "segment 0 has id 2; want 1"}, + {name: "gap", ids: []int{1, 3}, want: "segment 1 has id 3; want 2"}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + transcript := validMinimalTranscript() + transcript.Segments = transcript.Segments[:0] + for index, id := range test.ids { + transcript.Segments = append(transcript.Segments, MinimalSegment{ + ID: id, + Start: float64(index), + End: float64(index) + 1, + Speaker: "Alice", + Text: "hello", + }) + } + + err := ValidateMinimalTranscript(transcript) + assertErrorContains(t, err, test.want) + }) + } +} + +func TestValidateMinimalTranscriptRejectsInvalidTiming(t *testing.T) { + transcript := validMinimalTranscript() + transcript.Segments[0].Start = 2 + transcript.Segments[0].End = 1 + + err := ValidateMinimalTranscript(transcript) + assertErrorContains(t, err, "segment 0 has end") +} + func TestValidateTranscriptRejectsInvalidOverlapGroupTiming(t *testing.T) { transcript := validTranscript() transcript.OverlapGroups = []OverlapGroup{ @@ -153,6 +283,25 @@ func TestValidateTranscriptRejectsInvalidOverlapGroupTiming(t *testing.T) { assertErrorContains(t, err, "overlap_group 0 has end") } +func validMinimalTranscript() MinimalTranscript { + return MinimalTranscript{ + Metadata: MinimalMetadata{ + Application: "seriatim", + Version: "dev", + OutputSchema: "minimal", + }, + Segments: []MinimalSegment{ + { + ID: 1, + Start: 1, + End: 2, + Speaker: "Alice", + Text: "hello", + }, + }, + } +} + func validTranscript() Transcript { sourceIndex := 0 return Transcript{