From 6cb739be55404095fde817d5ae99fb51fb07e76a Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Mon, 27 Apr 2026 21:27:19 -0500 Subject: [PATCH] Add output validation against a defined JSON schema --- README.md | 8 ++ go.mod | 2 + go.sum | 6 + internal/artifact/transcript.go | 68 ++++++++++ internal/builtin/output.go | 6 +- internal/builtin/postprocess.go | 23 ++++ internal/builtin/postprocess_test.go | 60 +++++++++ internal/builtin/registry.go | 2 +- internal/cli/merge_test.go | 25 ++++ internal/pipeline/interfaces.go | 3 +- internal/pipeline/runner.go | 31 +---- schema/output.go | 145 ++++++++++++++++++++ schema/output.schema.json | 98 ++++++++++++++ schema/output_test.go | 191 +++++++++++++++++++++++++++ 14 files changed, 638 insertions(+), 30 deletions(-) create mode 100644 internal/artifact/transcript.go create mode 100644 internal/builtin/postprocess_test.go create mode 100644 schema/output.go create mode 100644 schema/output.schema.json create mode 100644 schema/output_test.go diff --git a/README.md b/README.md index 456ed75..bfbd759 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,14 @@ Segments are sorted deterministically by: Final segment IDs are assigned after sorting and start at `1`. +The public Go output contract is available from: + +```go +import "gitea.maximumdirect.net/eric/seriatim/schema" +``` + +The same package embeds the machine-readable JSON Schema in `schema/output.schema.json`. The default `validate-output` postprocessor validates the output shape and verifies final segment IDs are present, sequential, and start at `1`. + ## Overlap Detection The default postprocessing pipeline detects overlapping segment groups. 
diff --git a/go.mod b/go.mod index 7cf9fde..ffe0575 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module gitea.maximumdirect.net/eric/seriatim go 1.25 require ( + github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 github.com/spf13/cobra v1.10.1 gopkg.in/yaml.v3 v3.0.1 ) @@ -10,4 +11,5 @@ require ( require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/spf13/pflag v1.0.9 // indirect + golang.org/x/text v0.14.0 // indirect ) diff --git a/go.sum b/go.sum index 7af0519..aa91b04 100644 --- a/go.sum +++ b/go.sum @@ -1,11 +1,17 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI= +github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEVZGK7IN2kJkjTuQ= +github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s= github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0= github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
diff --git a/internal/artifact/transcript.go b/internal/artifact/transcript.go
new file mode 100644
index 0000000..1e8d322
--- /dev/null
+++ b/internal/artifact/transcript.go
@@ -0,0 +1,84 @@
+package artifact
+
+import (
+	"gitea.maximumdirect.net/eric/seriatim/internal/config"
+	"gitea.maximumdirect.net/eric/seriatim/internal/model"
+	"gitea.maximumdirect.net/eric/seriatim/schema"
+)
+
+const (
+	ApplicationName = "seriatim"
+	Version         = "dev"
+)
+
+// FromMerged converts the internal merged transcript model into the public
+// serialized output contract.
+//
+// Slices and the SourceSegmentIndex pointer are copied rather than aliased, so
+// later changes to cfg or merged do not show through the returned value.
+// Array fields the output schema lists as required are always non-nil:
+// encoding/json writes a nil slice as null, which a "type": "array" schema
+// rejects, so an empty module or speaker list must serialize as [].
+func FromMerged(cfg config.Config, merged model.MergedTranscript) schema.Transcript {
+	segments := make([]schema.Segment, len(merged.Segments))
+	for index, segment := range merged.Segments {
+		segments[index] = schema.Segment{
+			ID:                 segment.ID,
+			Source:             segment.Source,
+			SourceSegmentIndex: copyIntPtr(segment.SourceSegmentIndex),
+			SourceRef:          segment.SourceRef,
+			DerivedFrom:        append([]string(nil), segment.DerivedFrom...),
+			Speaker:            segment.Speaker,
+			Start:              segment.Start,
+			End:                segment.End,
+			Text:               segment.Text,
+			Categories:         append([]string(nil), segment.Categories...),
+			OverlapGroupID:     segment.OverlapGroupID,
+		}
+	}
+
+	overlapGroups := make([]schema.OverlapGroup, len(merged.OverlapGroups))
+	for index, group := range merged.OverlapGroups {
+		overlapGroups[index] = schema.OverlapGroup{
+			ID:         group.ID,
+			Start:      group.Start,
+			End:        group.End,
+			Segments:   cloneStrings(group.Segments),
+			Speakers:   cloneStrings(group.Speakers),
+			Class:      group.Class,
+			Resolution: group.Resolution,
+		}
+	}
+
+	return schema.Transcript{
+		Metadata: schema.Metadata{
+			Application:           ApplicationName,
+			Version:               Version,
+			InputReader:           cfg.InputReader,
+			InputFiles:            cloneStrings(cfg.InputFiles),
+			PreprocessingModules:  cloneStrings(cfg.PreprocessingModules),
+			PostprocessingModules: cloneStrings(cfg.PostprocessingModules),
+			OutputModules:         cloneStrings(cfg.OutputModules),
+		},
+		Segments:      segments,
+		OverlapGroups: overlapGroups,
+	}
+}
+
+// cloneStrings returns an independent, never-nil copy of values, so an empty
+// or nil input marshals to [] instead of null.
+func cloneStrings(values []string) []string {
+	cloned := make([]string, len(values))
+	copy(cloned, values)
+	return cloned
+}
+
+// copyIntPtr returns a pointer to a fresh copy of *value, or nil when value
+// is nil.
+func copyIntPtr(value *int) *int {
+	if value == nil {
+		return nil
+	}
+	copied := *value
+	return &copied
+}
diff --git a/internal/builtin/output.go b/internal/builtin/output.go
index 4151e6c..9d451d9 100644
--- a/internal/builtin/output.go
+++ b/internal/builtin/output.go
@@ -6,8 +6,8 @@ import (
 	"os"
 
 	"gitea.maximumdirect.net/eric/seriatim/internal/config"
-	"gitea.maximumdirect.net/eric/seriatim/internal/model"
 	"gitea.maximumdirect.net/eric/seriatim/internal/report"
+	"gitea.maximumdirect.net/eric/seriatim/schema"
 )
 
 type jsonOutputWriter struct{}
@@ -16,7 +16,7 @@ func (jsonOutputWriter) Name() string {
 	return "json"
 }
 
-func (jsonOutputWriter) Write(ctx context.Context, out model.FinalTranscript, rpt report.Report, cfg config.Config) ([]report.Event, error) {
+func (jsonOutputWriter) Write(ctx context.Context, out schema.Transcript, rpt report.Report, cfg config.Config) ([]report.Event, error) {
 	if err := ctx.Err(); err != nil {
 		return nil, err
 	}
@@ -34,6 +34,6 @@ func (jsonOutputWriter) Write(ctx context.Context, out model.FinalTranscript, rp
 	}
 
 	return []report.Event{
-		report.Info("output", "json", "wrote placeholder transcript JSON"),
+		report.Info("output", "json", "wrote transcript JSON"),
 	}, nil
 }
diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go
index 1d76278..0a476df 100644
--- a/internal/builtin/postprocess.go
+++ b/internal/builtin/postprocess.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 
+	"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
 	"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
 	"gitea.maximumdirect.net/eric/seriatim/internal/backchannel"
 	"gitea.maximumdirect.net/eric/seriatim/internal/coalesce"
@@ -12,6 +13,7 @@ import (
 	"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/overlap" "gitea.maximumdirect.net/eric/seriatim/internal/report" + "gitea.maximumdirect.net/eric/seriatim/schema" ) type noopPostprocessor struct { @@ -52,6 +54,27 @@ func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg con }, nil } +type validateOutput struct{} + +func (validateOutput) Name() string { + return "validate-output" +} + +func (validateOutput) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) { + if err := ctx.Err(); err != nil { + return model.MergedTranscript{}, nil, err + } + + transcript := artifact.FromMerged(cfg, in) + if err := schema.ValidateTranscript(transcript); err != nil { + return model.MergedTranscript{}, nil, fmt.Errorf("validate-output: %w", err) + } + + return in, []report.Event{ + report.Info("postprocessing", "validate-output", fmt.Sprintf("validated %d output segment(s)", len(in.Segments))), + }, nil +} + type detectOverlaps struct{} func (detectOverlaps) Name() string { diff --git a/internal/builtin/postprocess_test.go b/internal/builtin/postprocess_test.go new file mode 100644 index 0000000..91bab46 --- /dev/null +++ b/internal/builtin/postprocess_test.go @@ -0,0 +1,60 @@ +package builtin + +import ( + "context" + "strings" + "testing" + + "gitea.maximumdirect.net/eric/seriatim/internal/config" + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +func TestValidateOutputSucceedsAfterAssignIDs(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + {Source: "input.json", Speaker: "Alice", Start: 1, End: 2, Text: "hello"}, + {Source: "input.json", Speaker: "Alice", Start: 3, End: 4, Text: "again"}, + }, + } + + withIDs, _, err := assignIDs{}.Process(context.Background(), merged, testConfig()) + if err != nil { + t.Fatalf("assign IDs: %v", err) + } + got, events, err := validateOutput{}.Process(context.Background(), withIDs, testConfig()) + if err != nil 
{ + t.Fatalf("validate output: %v", err) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if len(events) != 1 || !strings.Contains(events[0].Message, "validated 2 output segment(s)") { + t.Fatalf("events = %#v", events) + } +} + +func TestValidateOutputFailsBeforeAssignIDs(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + {Source: "input.json", Speaker: "Alice", Start: 1, End: 2, Text: "hello"}, + }, + } + + _, _, err := validateOutput{}.Process(context.Background(), merged, testConfig()) + if err == nil { + t.Fatal("expected validation error") + } + if !strings.Contains(err.Error(), "segment 0 has id 0; want 1") { + t.Fatalf("unexpected error: %v", err) + } +} + +func testConfig() config.Config { + return config.Config{ + InputReader: config.DefaultInputReader, + InputFiles: []string{"input.json"}, + PreprocessingModules: []string{"validate-raw", "normalize-speakers", "trim-text"}, + PostprocessingModules: []string{"assign-ids", "validate-output"}, + OutputModules: []string{"json"}, + } +} diff --git a/internal/builtin/registry.go b/internal/builtin/registry.go index e58a642..25815cb 100644 --- a/internal/builtin/registry.go +++ b/internal/builtin/registry.go @@ -17,7 +17,7 @@ func NewRegistry() *pipeline.Registry { registry.RegisterPostprocessor(fillerPostprocessor{}) registry.RegisterPostprocessor(coalescePostprocessor{}) registry.RegisterPostprocessor(assignIDs{}) - registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"}) + registry.RegisterPostprocessor(validateOutput{}) registry.RegisterPostprocessor(autocorrectPostprocessor{}) registry.RegisterOutputWriter(jsonOutputWriter{}) diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 98df1cc..1d8d2ad 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -102,6 +102,9 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { if !equalStrings(gotModules, wantModules) { 
t.Fatalf("report event order mismatch:\ngot %v\nwant %v", gotModules, wantModules) } + if !hasReportEvent(rpt, "postprocessing", "validate-output", "validated 3 output segment(s)") { + t.Fatal("expected validate-output report event") + } } func TestMergeTieBreakOrder(t *testing.T) { @@ -153,6 +156,28 @@ func TestMergeTieBreakOrder(t *testing.T) { } } +func TestMergeValidateOutputBeforeAssignIDsFails(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": "hello"} + ] + }`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--postprocessing-modules", "validate-output,assign-ids", + ) + if err == nil { + t.Fatal("expected validation error") + } + if !strings.Contains(err.Error(), "validate-output: segment 0 has id 0; want 1") { + t.Fatalf("unexpected error: %v", err) + } +} + func TestMergeDetectsOverlapGroups(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ diff --git a/internal/pipeline/interfaces.go b/internal/pipeline/interfaces.go index 3f34187..1227eb7 100644 --- a/internal/pipeline/interfaces.go +++ b/internal/pipeline/interfaces.go @@ -6,6 +6,7 @@ import ( "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" + "gitea.maximumdirect.net/eric/seriatim/schema" ) // ModelState identifies which representation a preprocessing module consumes. @@ -52,5 +53,5 @@ type Postprocessor interface { // OutputWriter emits final artifacts. 
type OutputWriter interface { Name() string - Write(ctx context.Context, out model.FinalTranscript, rpt report.Report, cfg config.Config) ([]report.Event, error) + Write(ctx context.Context, out schema.Transcript, rpt report.Report, cfg config.Config) ([]report.Event, error) } diff --git a/internal/pipeline/runner.go b/internal/pipeline/runner.go index 1944e34..cba1d3d 100644 --- a/internal/pipeline/runner.go +++ b/internal/pipeline/runner.go @@ -4,14 +4,16 @@ import ( "context" "fmt" + "gitea.maximumdirect.net/eric/seriatim/internal/artifact" "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" + "gitea.maximumdirect.net/eric/seriatim/schema" ) const ( - applicationName = "seriatim" - version = "dev" + applicationName = artifact.ApplicationName + version = artifact.Version ) // Run validates module composition, executes the pipeline, and emits outputs. @@ -139,29 +141,8 @@ func validatePreprocessors(modules []Preprocessor) error { return nil } -func finalizeTranscript(cfg config.Config, merged model.MergedTranscript) model.FinalTranscript { - segments := make([]model.Segment, len(merged.Segments)) - copy(segments, merged.Segments) - for index := range segments { - segments[index].Words = nil - segments[index].DerivedFrom = append([]string(nil), segments[index].DerivedFrom...) 
-	}
-	overlapGroups := make([]model.OverlapGroup, len(merged.OverlapGroups))
-	copy(overlapGroups, merged.OverlapGroups)
-
-	return model.FinalTranscript{
-		Metadata: model.OutputMetadata{
-			Application:           applicationName,
-			Version:               version,
-			InputReader:           cfg.InputReader,
-			InputFiles:            append([]string(nil), cfg.InputFiles...),
-			PreprocessingModules:  append([]string(nil), cfg.PreprocessingModules...),
-			PostprocessingModules: append([]string(nil), cfg.PostprocessingModules...),
-			OutputModules:         append([]string(nil), cfg.OutputModules...),
-		},
-		Segments:      segments,
-		OverlapGroups: overlapGroups,
-	}
+func finalizeTranscript(cfg config.Config, merged model.MergedTranscript) schema.Transcript {
+	return artifact.FromMerged(cfg, merged)
 }
 
 func finalizeReport(cfg config.Config, events []report.Event) report.Report {
diff --git a/schema/output.go b/schema/output.go
new file mode 100644
index 0000000..ea988cb
--- /dev/null
+++ b/schema/output.go
@@ -0,0 +1,172 @@
+package schema
+
+import (
+	"bytes"
+	"embed"
+	"encoding/json"
+	"fmt"
+	"sync"
+
+	"github.com/santhosh-tekuri/jsonschema/v6"
+)
+
+//go:embed output.schema.json
+var schemaFS embed.FS
+
+const outputSchemaPath = "output.schema.json"
+
+// Cached outcome of compiling the embedded schema. outputSchema fills these
+// in exactly once, guarded by compileOnce.
+var (
+	compiledOutputSchema *jsonschema.Schema
+	compileOnce          sync.Once
+	compileErr           error
+)
+
+// Transcript is seriatim's public JSON output contract.
+//
+// Slice fields tagged without omitempty should be non-nil (an empty slice is
+// fine): encoding/json writes a nil slice as null, which the schema's
+// "type": "array" constraint rejects.
+type Transcript struct {
+	Metadata      Metadata       `json:"metadata"`
+	Segments      []Segment      `json:"segments"`
+	OverlapGroups []OverlapGroup `json:"overlap_groups"`
+}
+
+// Metadata records the pipeline configuration that produced an artifact.
+type Metadata struct {
+	Application           string   `json:"application"`
+	Version               string   `json:"version"`
+	InputReader           string   `json:"input_reader"`
+	InputFiles            []string `json:"input_files"`
+	PreprocessingModules  []string `json:"preprocessing_modules"`
+	PostprocessingModules []string `json:"postprocessing_modules"`
+	OutputModules         []string `json:"output_modules"`
+}
+
+// Segment is the public transcript segment shape.
+type Segment struct {
+	ID                 int      `json:"id"`
+	Source             string   `json:"source"`
+	SourceSegmentIndex *int     `json:"source_segment_index,omitempty"`
+	SourceRef          string   `json:"source_ref,omitempty"`
+	DerivedFrom        []string `json:"derived_from,omitempty"`
+	Speaker            string   `json:"speaker"`
+	Start              float64  `json:"start"`
+	End                float64  `json:"end"`
+	Text               string   `json:"text"`
+	Categories         []string `json:"categories,omitempty"`
+	OverlapGroupID     int      `json:"overlap_group_id,omitempty"`
+}
+
+// OverlapGroup describes a detected overlapping speech region.
+type OverlapGroup struct {
+	ID         int      `json:"id"`
+	Start      float64  `json:"start"`
+	End        float64  `json:"end"`
+	Segments   []string `json:"segments"`
+	Speakers   []string `json:"speakers"`
+	Class      string   `json:"class"`
+	Resolution string   `json:"resolution"`
+}
+
+// ValidateTranscript validates a typed transcript against the public JSON
+// schema and seriatim-specific semantic rules.
+//
+// The semantic rules run first, so ID and timing problems are reported with
+// seriatim's own messages; only then is the transcript marshaled and checked
+// against the schema via ValidateJSON.
+func ValidateTranscript(transcript Transcript) error {
+	if err := validateSemantics(transcript); err != nil {
+		return err
+	}
+
+	data, err := json.Marshal(transcript)
+	if err != nil {
+		return fmt.Errorf("marshal transcript for schema validation: %w", err)
+	}
+	return ValidateJSON(data)
+}
+
+// ValidateJSON validates serialized output JSON against the public schema.
+//
+// data must hold exactly one JSON value, optionally surrounded by JSON
+// whitespace. Numbers are decoded as json.Number rather than float64. An
+// error is returned when data is not valid JSON, when anything other than
+// whitespace follows the top-level value, when the embedded schema cannot be
+// compiled, or when the document violates the schema.
+func ValidateJSON(data []byte) error {
+	var value any
+	decoder := json.NewDecoder(bytes.NewReader(data))
+	decoder.UseNumber()
+	if err := decoder.Decode(&value); err != nil {
+		return fmt.Errorf("decode output JSON for schema validation: %w", err)
+	}
+	// Decode stops after the first complete value. Without this check a
+	// document followed by stray bytes (for example two concatenated
+	// transcripts) would be reported as valid.
+	if rest := bytes.TrimLeft(data[decoder.InputOffset():], " \t\r\n"); len(rest) != 0 {
+		return fmt.Errorf("decode output JSON for schema validation: unexpected data after top-level value")
+	}
+
+	compiled, err := outputSchema()
+	if err != nil {
+		return err
+	}
+	if err := compiled.Validate(value); err != nil {
+		return fmt.Errorf("output schema validation failed: %w", err)
+	}
+	return nil
+}
+
+// outputSchema returns the compiled embedded schema, reading and compiling it
+// on the first call and returning the cached schema or error afterwards.
+func outputSchema() (*jsonschema.Schema, error) {
+	compileOnce.Do(func() {
+		data, err := schemaFS.ReadFile(outputSchemaPath)
+		if err != nil {
+			compileErr = fmt.Errorf("read embedded output schema: %w", err)
+			return
+		}
+		var schemaDocument any
+		decoder := json.NewDecoder(bytes.NewReader(data))
+		decoder.UseNumber()
+		if err := decoder.Decode(&schemaDocument); err != nil {
+			compileErr = fmt.Errorf("decode embedded output schema: %w", err)
+			return
+		}
+
+		compiler := jsonschema.NewCompiler()
+		if err := compiler.AddResource(outputSchemaPath, schemaDocument); err != nil {
+			compileErr = fmt.Errorf("load embedded output schema: %w", err)
+			return
+		}
+		compiledOutputSchema, compileErr = compiler.Compile(outputSchemaPath)
+		if compileErr != nil {
+			compileErr = fmt.Errorf("compile embedded output schema: %w", compileErr)
+		}
+	})
+	return compiledOutputSchema, compileErr
+}
+
+// validateSemantics enforces rules checked in Go rather than by the schema:
+// segment IDs must equal their 1-based position in the slice, and no segment
+// or overlap group may end before it starts. It returns the first violation.
+func validateSemantics(transcript Transcript) error {
+	for index, segment := range transcript.Segments {
+		wantID := index + 1
+		if segment.ID != wantID {
+			return fmt.Errorf("segment %d has id %d; want %d", index, segment.ID, wantID)
+		}
+		if segment.End < segment.Start {
+			return fmt.Errorf("segment %d has end %.3f before start %.3f", index, segment.End, segment.Start)
+		}
+	}
+	for index, group := range transcript.OverlapGroups {
+		if group.End < group.Start {
+			return fmt.Errorf("overlap_group %d has end %.3f before start %.3f", index, group.End, group.Start)
+		}
+	}
+	return nil
+}
diff --git a/schema/output.schema.json
b/schema/output.schema.json new file mode 100644 index 0000000..4266233 --- /dev/null +++ b/schema/output.schema.json @@ -0,0 +1,98 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/output.schema.json", + "title": "seriatim output transcript", + "type": "object", + "additionalProperties": false, + "required": ["metadata", "segments", "overlap_groups"], + "properties": { + "metadata": { + "type": "object", + "additionalProperties": false, + "required": [ + "application", + "version", + "input_reader", + "input_files", + "preprocessing_modules", + "postprocessing_modules", + "output_modules" + ], + "properties": { + "application": { "type": "string" }, + "version": { "type": "string" }, + "input_reader": { "type": "string" }, + "input_files": { + "type": "array", + "items": { "type": "string" } + }, + "preprocessing_modules": { + "type": "array", + "items": { "type": "string" } + }, + "postprocessing_modules": { + "type": "array", + "items": { "type": "string" } + }, + "output_modules": { + "type": "array", + "items": { "type": "string" } + } + } + }, + "segments": { + "type": "array", + "items": { "$ref": "#/$defs/segment" } + }, + "overlap_groups": { + "type": "array", + "items": { "$ref": "#/$defs/overlap_group" } + } + }, + "$defs": { + "segment": { + "type": "object", + "additionalProperties": false, + "required": ["id", "source", "speaker", "start", "end", "text"], + "properties": { + "id": { "type": "integer", "minimum": 1 }, + "source": { "type": "string" }, + "source_segment_index": { "type": "integer", "minimum": 0 }, + "source_ref": { "type": "string" }, + "derived_from": { + "type": "array", + "items": { "type": "string" } + }, + "speaker": { "type": "string" }, + "start": { "type": "number" }, + "end": { "type": "number" }, + "text": { "type": "string" }, + "categories": { + "type": "array", + "items": { "type": "string" } + }, + "overlap_group_id": { "type": "integer", 
"minimum": 1 } + } + }, + "overlap_group": { + "type": "object", + "additionalProperties": false, + "required": ["id", "start", "end", "segments", "speakers", "class", "resolution"], + "properties": { + "id": { "type": "integer", "minimum": 1 }, + "start": { "type": "number" }, + "end": { "type": "number" }, + "segments": { + "type": "array", + "items": { "type": "string" } + }, + "speakers": { + "type": "array", + "items": { "type": "string" } + }, + "class": { "type": "string" }, + "resolution": { "type": "string" } + } + } + } +} diff --git a/schema/output_test.go b/schema/output_test.go new file mode 100644 index 0000000..d7467a9 --- /dev/null +++ b/schema/output_test.go @@ -0,0 +1,191 @@ +package schema + +import ( + "strings" + "testing" +) + +func TestValidateTranscriptAcceptsValidTranscript(t *testing.T) { + transcript := validTranscript() + + if err := ValidateTranscript(transcript); err != nil { + t.Fatalf("validate transcript: %v", err) + } +} + +func TestValidateJSONRejectsMissingRequiredField(t *testing.T) { + err := ValidateJSON([]byte(`{ + "metadata": { + "application": "seriatim", + "version": "dev", + "input_reader": "json-files", + "input_files": [], + "preprocessing_modules": [], + "postprocessing_modules": [], + "output_modules": [] + }, + "segments": [] + }`)) + assertErrorContains(t, err, "overlap_groups") +} + +func TestValidateJSONRejectsWrongFieldType(t *testing.T) { + err := ValidateJSON([]byte(`{ + "metadata": { + "application": "seriatim", + "version": "dev", + "input_reader": "json-files", + "input_files": [], + "preprocessing_modules": [], + "postprocessing_modules": [], + "output_modules": [] + }, + "segments": [ + { + "id": "1", + "source": "input.json", + "speaker": "Alice", + "start": 1, + "end": 2, + "text": "hello" + } + ], + "overlap_groups": [] + }`)) + assertErrorContains(t, err, "id") +} + +func TestValidateJSONRejectsUnexpectedInternalFields(t *testing.T) { + tests := []struct { + name string + field string + }{ + {name: 
"internal ref", field: `"internal_ref": "internal-1",`}, + {name: "words", field: `"words": [],`}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + err := ValidateJSON([]byte(`{ + "metadata": { + "application": "seriatim", + "version": "dev", + "input_reader": "json-files", + "input_files": [], + "preprocessing_modules": [], + "postprocessing_modules": [], + "output_modules": [] + }, + "segments": [ + { + "id": 1, + ` + test.field + ` + "source": "input.json", + "speaker": "Alice", + "start": 1, + "end": 2, + "text": "hello" + } + ], + "overlap_groups": [] + }`)) + assertErrorContains(t, err, "additional properties") + }) + } +} + +func TestValidateTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T) { + tests := []struct { + name string + ids []int + want string + }{ + {name: "missing zero id", ids: []int{0}, want: "segment 0 has id 0; want 1"}, + {name: "does not start at one", ids: []int{2}, want: "segment 0 has id 2; want 1"}, + {name: "gap", ids: []int{1, 3}, want: "segment 1 has id 3; want 2"}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + transcript := validTranscript() + transcript.Segments = transcript.Segments[:0] + for index, id := range test.ids { + transcript.Segments = append(transcript.Segments, Segment{ + ID: id, + Source: "input.json", + Speaker: "Alice", + Start: float64(index), + End: float64(index) + 1, + Text: "hello", + }) + } + + err := ValidateTranscript(transcript) + assertErrorContains(t, err, test.want) + }) + } +} + +func TestValidateTranscriptRejectsInvalidTiming(t *testing.T) { + transcript := validTranscript() + transcript.Segments[0].Start = 2 + transcript.Segments[0].End = 1 + + err := ValidateTranscript(transcript) + assertErrorContains(t, err, "segment 0 has end") +} + +func TestValidateTranscriptRejectsInvalidOverlapGroupTiming(t *testing.T) { + transcript := validTranscript() + transcript.OverlapGroups = []OverlapGroup{ + { + ID: 1, + Start: 3, + End: 2, + 
Segments: []string{"input.json#0"}, + Speakers: []string{"Alice"}, + Class: "unknown", + Resolution: "unresolved", + }, + } + + err := ValidateTranscript(transcript) + assertErrorContains(t, err, "overlap_group 0 has end") +} + +func validTranscript() Transcript { + sourceIndex := 0 + return Transcript{ + Metadata: Metadata{ + Application: "seriatim", + Version: "dev", + InputReader: "json-files", + InputFiles: []string{"input.json"}, + PreprocessingModules: []string{"validate-raw", "normalize-speakers", "trim-text"}, + PostprocessingModules: []string{"assign-ids", "validate-output"}, + OutputModules: []string{"json"}, + }, + Segments: []Segment{ + { + ID: 1, + Source: "input.json", + SourceSegmentIndex: &sourceIndex, + Speaker: "Alice", + Start: 1, + End: 2, + Text: "hello", + }, + }, + OverlapGroups: []OverlapGroup{}, + } +} + +func assertErrorContains(t *testing.T, err error, want string) { + t.Helper() + if err == nil { + t.Fatalf("expected error containing %q", want) + } + if !strings.Contains(err.Error(), want) { + t.Fatalf("expected error containing %q, got %v", want, err) + } +}