Added support for a minimal JSON output schema

This commit is contained in:
2026-04-28 14:39:00 -05:00
parent a3ca6665a9
commit 9cca88280f
16 changed files with 658 additions and 44 deletions

View File

@@ -10,6 +10,7 @@ import (
"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
func TestMergeWritesMergedOutputAndReport(t *testing.T) {
@@ -111,6 +112,64 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
}
}
// TestMergeWritesMinimalOutputSchema runs a merge with --output-schema minimal
// and checks the written transcript: metadata values, segment count, 1-based
// sequential segment IDs, the first segment's speaker and text, the absence of
// several field names in the raw output, and the validate-output report event.
func TestMergeWritesMinimalOutputSchema(t *testing.T) {
	tmp := t.TempDir()
	inputPath := writeJSONFile(t, tmp, "input.json", `{
"segments": [
{"start": 1, "end": 2, "text": " Yeah. "},
{"start": 8, "end": 9, "text": " next "}
]
}`)
	mergedPath := filepath.Join(tmp, "merged.json")
	reportPath := filepath.Join(tmp, "report.json")

	if err := executeMerge(
		"--input-file", inputPath,
		"--output-file", mergedPath,
		"--output-schema", "minimal",
		"--report-file", reportPath,
	); err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	// Decode the output using the minimal transcript type.
	var got schema.MinimalTranscript
	readJSON(t, mergedPath, &got)

	meta := got.Metadata
	if meta.Application != "seriatim" {
		t.Fatalf("application = %q, want seriatim", meta.Application)
	}
	if meta.OutputSchema != "minimal" {
		t.Fatalf("output_schema = %q, want minimal", meta.OutputSchema)
	}

	const wantSegments = 2
	if n := len(got.Segments); n != wantSegments {
		t.Fatalf("segment count = %d, want %d", n, wantSegments)
	}

	// Each segment's ID must equal its position in the slice plus one.
	for i := range got.Segments {
		wantID := i + 1
		if id := got.Segments[i].ID; id != wantID {
			t.Fatalf("segment %d id = %d, want %d", i, id, wantID)
		}
	}

	// The first segment is expected to carry the input file name as its
	// speaker and the text without the surrounding spaces from the fixture.
	first := got.Segments[0]
	if !(first.Speaker == "input.json" && first.Text == "Yeah.") {
		t.Fatalf("first segment = %#v", first)
	}

	// Scan the raw bytes so that field names the typed decode above would
	// silently ignore are still detected.
	raw, err := os.ReadFile(mergedPath)
	if err != nil {
		t.Fatalf("read output: %v", err)
	}
	body := string(raw)
	forbiddenKeys := []string{"overlap_groups", "categories", "source", "derived_from"}
	for _, key := range forbiddenKeys {
		if strings.Contains(body, key) {
			t.Fatalf("minimal output contains %q:\n%s", key, body)
		}
	}

	var mergeReport report.Report
	readJSON(t, reportPath, &mergeReport)
	if found := hasReportEvent(mergeReport, "postprocessing", "validate-output", "validated 2 output segment(s)"); !found {
		t.Fatal("expected validate-output report event")
	}
}
func TestMergeTieBreakOrder(t *testing.T) {
dir := t.TempDir()
inputA := writeJSONFile(t, dir, "a.json", `{
@@ -182,6 +241,29 @@ func TestMergeValidateOutputBeforeAssignIDsFails(t *testing.T) {
}
}
// TestMergeValidateMinimalOutputBeforeAssignIDsFails runs a minimal-schema
// merge with the postprocessing modules ordered "validate-output,assign-ids"
// and expects an error whose message reports that segment 0 has id 0.
func TestMergeValidateMinimalOutputBeforeAssignIDsFails(t *testing.T) {
	tmp := t.TempDir()
	inputPath := writeJSONFile(t, tmp, "input.json", `{
"segments": [
{"start": 1, "end": 2, "text": "hello"}
]
}`)
	mergedPath := filepath.Join(tmp, "merged.json")

	// Substring the returned error must contain.
	const wantMsg = "validate-output: segment 0 has id 0; want 1"

	err := executeMerge(
		"--input-file", inputPath,
		"--output-file", mergedPath,
		"--output-schema", "minimal",
		"--postprocessing-modules", "validate-output,assign-ids",
	)
	switch {
	case err == nil:
		t.Fatal("expected validation error")
	case !strings.Contains(err.Error(), wantMsg):
		t.Fatalf("unexpected error: %v", err)
	}
}
func TestMergeDetectsOverlapGroups(t *testing.T) {
dir := t.TempDir()
inputA := writeJSONFile(t, dir, "a.json", `{
@@ -963,6 +1045,24 @@ func TestUnknownModulesFailDuringValidation(t *testing.T) {
}
}
// TestUnknownOutputSchemaFailsDuringValidation passes the value "compact" to
// --output-schema and expects the merge to return an error whose message
// contains "--output-schema must be one of".
func TestUnknownOutputSchemaFailsDuringValidation(t *testing.T) {
	tmp := t.TempDir()
	inputPath := writeJSONFile(t, tmp, "input.json", `{"segments":[]}`)
	mergedPath := filepath.Join(tmp, "merged.json")

	// Substring the returned error must contain.
	const wantMsg = "--output-schema must be one of"

	err := executeMerge(
		"--input-file", inputPath,
		"--output-file", mergedPath,
		"--output-schema", "compact",
	)
	switch {
	case err == nil:
		t.Fatal("expected output schema error")
	case !strings.Contains(err.Error(), wantMsg):
		t.Fatalf("unexpected error: %v", err)
	}
}
func TestInvalidPreprocessingOrderFails(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)