From e6d3b4a46e52b47c46a2e7e76932de52223b9301 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Fri, 8 May 2026 15:00:46 +0000 Subject: [PATCH] Harden trim integration --- internal/cli/trim_test.go | 309 +++++++++++++++++++++++++++++++++ internal/trim/artifact.go | 5 + internal/trim/artifact_test.go | 138 +++++++++++++++ 3 files changed, 452 insertions(+) create mode 100644 internal/trim/artifact_test.go diff --git a/internal/cli/trim_test.go b/internal/cli/trim_test.go index e32fbfe..196c5ea 100644 --- a/internal/cli/trim_test.go +++ b/internal/cli/trim_test.go @@ -166,6 +166,247 @@ func TestTrimExplicitOutputSchemaChangesOutputSchema(t *testing.T) { assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID}) } +func TestTrimExplicitOutputSchemaConvertsMinimalToIntermediate(t *testing.T) { + dir := t.TempDir() + input := writeTrimMinimalFixture(t, dir, "input-minimal.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1-2", + "--output-schema", config.OutputSchemaIntermediate, + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.IntermediateTranscript + readJSON(t, output, &transcript) + if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate { + t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate) + } + if len(transcript.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(transcript.Segments)) + } + assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID}) +} + +func TestTrimIntermediateInputPreservesIntermediateOutputAndCategories(t *testing.T) { + dir := t.TempDir() + input := writeTrimIntermediateFixture(t, dir, "input-intermediate.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "2", + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.IntermediateTranscript + readJSON(t, output, &transcript) + if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate { + t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate) + } + if len(transcript.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(transcript.Segments)) + } + if transcript.Segments[0].ID != 1 { + t.Fatalf("segment ID = %d, want 1", transcript.Segments[0].ID) + } + assertIntSliceEqual(t, []int{len(transcript.Segments[0].Categories)}, []int{2}) + if transcript.Segments[0].Categories[0] != "filler" || transcript.Segments[0].Categories[1] != "backchannel" { + t.Fatalf("categories = %v, want [filler backchannel]", transcript.Segments[0].Categories) + } +} + +func TestTrimFullInputPreservesFullShapeAndRecomputesOverlapGroups(t *testing.T) { + dir := t.TempDir() + input := writeTrimFullOverlapFixture(t, dir, "input-full-overlap.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1,2", + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.Transcript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(transcript.Segments)) + } + assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID}) + if len(transcript.OverlapGroups) != 1 { + t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups)) + } + if transcript.OverlapGroups[0].ID != 1 { + t.Fatalf("overlap group id = %d, want 1", transcript.OverlapGroups[0].ID) + } + if transcript.Segments[0].OverlapGroupID != 1 || transcript.Segments[1].OverlapGroupID != 1 { + t.Fatalf("segment overlap IDs = %d,%d, want 1,1", transcript.Segments[0].OverlapGroupID, transcript.Segments[1].OverlapGroupID) + } +} + +func TestTrimMalformedSelectorFailsWithClearError(t *testing.T) { + dir := t.TempDir() + input := writeTrimFullFixture(t, dir, "input.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1-", + ) + if err == nil { + t.Fatal("expected malformed selector error") + } + if !strings.Contains(err.Error(), "invalid selector") || !strings.Contains(err.Error(), "malformed element") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestTrimMalformedInputArtifactFailsClearly(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "broken.json", `{"metadata":`) + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1", + ) + if err == nil { + t.Fatal("expected malformed artifact error") + } + if !strings.Contains(err.Error(), "input JSON is malformed") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestTrimDuplicateInputSegmentIDsFail(t *testing.T) { + dir := t.TempDir() + input := writeTrimMinimalWithIDsFixture(t, dir, "input-dup.json", []int{1, 1}) + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1", + ) + if err == nil { + t.Fatal("expected duplicate segment ID failure") + } + if !strings.Contains(err.Error(), "not a valid seriatim output artifact") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestTrimNonSequentialInputSegmentIDsFail(t *testing.T) { + dir := t.TempDir() + input := writeTrimMinimalWithIDsFixture(t, dir, "input-nonseq.json", []int{1, 3}) + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1", + ) + if err == nil { + t.Fatal("expected non-sequential segment ID failure") + } + if !strings.Contains(err.Error(), "not a valid seriatim output artifact") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestTrimKeepSelectorWithOverlappingRanges(t *testing.T) { + dir := t.TempDir() + input := writeTrimFullFixture(t, dir, "input.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "1-3,2-4", + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.Transcript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 4 { + t.Fatalf("segment count = %d, want 4", len(transcript.Segments)) + } + assertSequentialIDs(t, []int{ + transcript.Segments[0].ID, + transcript.Segments[1].ID, + transcript.Segments[2].ID, + transcript.Segments[3].ID, + }) +} + +func TestTrimRemoveSelectorWithOverlappingRanges(t *testing.T) { + dir := t.TempDir() + input := writeTrimFullFixture(t, dir, "input.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--remove", "2-3,3-4", + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.Transcript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(transcript.Segments)) + } + if transcript.Segments[0].Text != "one" { + t.Fatalf("remaining segment = %#v, want one", transcript.Segments[0]) + } +} + +func TestTrimSelectorOrderDoesNotAffectTranscriptOrder(t *testing.T) { + dir := t.TempDir() + input := writeTrimFullFixture(t, dir, "input.json") + output := filepath.Join(dir, "trimmed.json") + + err := executeTrim( + "--input-file", input, + "--output-file", output, + "--keep", "4,1,3", + ) + if err != nil { + t.Fatalf("trim failed: %v", err) + } + + var transcript schema.Transcript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 3 { + t.Fatalf("segment count = %d, want 3", len(transcript.Segments)) + } + got := []string{ + transcript.Segments[0].Text, + transcript.Segments[1].Text, + transcript.Segments[2].Text, + } + want := []string{"one", "three", "four"} + if got[0] != want[0] || got[1] != want[1] || got[2] != want[2] { + t.Fatalf("segment text order = %v, want %v", got, want) + } +} + func TestTrimAllowEmptyBehavior(t *testing.T) { dir := t.TempDir() input := writeTrimFullFixture(t, dir, "input.json") @@ -396,6 +637,74 @@ func writeTrimMinimalFixture(t *testing.T, dir string, name string) string { return writeTrimArtifactFile(t, dir, name, value) } +func writeTrimIntermediateFixture(t *testing.T, dir string, name string) string { + t.Helper() + + value := schema.IntermediateTranscript{ + Metadata: schema.IntermediateMetadata{ + Application: "seriatim", + Version: "v-test", + OutputSchema: config.OutputSchemaIntermediate, + }, + Segments: []schema.IntermediateSegment{ + {ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one", Categories: []string{"word-run"}}, + {ID: 2, Start: 2, End: 3, Speaker: "B", Text: "two", Categories: []string{"filler", "backchannel"}}, + }, + } + + return writeTrimArtifactFile(t, dir, name, value) +} + +func writeTrimMinimalWithIDsFixture(t *testing.T, dir string, name string, ids []int) string { + t.Helper() + + if len(ids) < 2 { + t.Fatalf("need at least two IDs, got %d", len(ids)) + } + value := schema.MinimalTranscript{ + Metadata: schema.MinimalMetadata{ + Application: "seriatim", + Version: "v-test", + OutputSchema: config.OutputSchemaMinimal, + }, + Segments: []schema.MinimalSegment{ + {ID: ids[0], Start: 1, End: 2, Speaker: "A", Text: "one"}, + {ID: ids[1], Start: 2, End: 3, Speaker: "B", Text: "two"}, + }, + } + + return writeTrimArtifactFile(t, dir, name, value) +} + +func writeTrimFullOverlapFixture(t *testing.T, dir string, name string) string { + t.Helper() + + first := 10 + second := 20 + third := 30 + value := schema.Transcript{ + Metadata: schema.Metadata{ + Application: "seriatim", + Version: "v-test", + InputReader: "json-files", + InputFiles: []string{"a.json"}, + PreprocessingModules: []string{"validate-raw"}, + PostprocessingModules: []string{"detect-overlaps", "assign-ids"}, + OutputModules: []string{"json"}, + }, + Segments: []schema.Segment{ + {ID: 1, Source: "a.json", SourceSegmentIndex: &first, SourceRef: "a.json#10", Speaker: "A", Start: 1, End: 3, Text: "one", OverlapGroupID: 5}, + {ID: 2, Source: "a.json", SourceSegmentIndex: &second, SourceRef: "a.json#20", Speaker: "B", Start: 2, End: 4, Text: "two", OverlapGroupID: 5}, + {ID: 3, Source: "a.json", SourceSegmentIndex: &third, SourceRef: "a.json#30", Speaker: "C", Start: 6, End: 7, Text: "three", OverlapGroupID: 6}, + }, + OverlapGroups: []schema.OverlapGroup{ + {ID: 99, Start: 0, End: 100, Segments: []string{"stale"}, Speakers: []string{"stale"}, Class: "unknown", Resolution: "unresolved"}, + }, + } + + return writeTrimArtifactFile(t, dir, name, value) +} + func writeTrimArtifactFile(t *testing.T, dir string, name string, value any) string { t.Helper() diff --git a/internal/trim/artifact.go b/internal/trim/artifact.go index b8e0c27..cfe1e0d 100644 --- a/internal/trim/artifact.go +++ b/internal/trim/artifact.go @@ -31,6 +31,11 @@ type ApplyArtifactResult struct { // ParseArtifactJSON parses and validates a serialized seriatim output artifact. func ParseArtifactJSON(data []byte) (Artifact, error) { + var decoded any + if err := json.Unmarshal(data, &decoded); err != nil { + return Artifact{}, fmt.Errorf("input JSON is malformed: %w", err) + } + var full schema.Transcript if err := json.Unmarshal(data, &full); err == nil { if err := schema.ValidateTranscript(full); err == nil { diff --git a/internal/trim/artifact_test.go b/internal/trim/artifact_test.go new file mode 100644 index 0000000..a00c048 --- /dev/null +++ b/internal/trim/artifact_test.go @@ -0,0 +1,138 @@ +package trim + +import ( + "encoding/json" + "strings" + "testing" + + "gitea.maximumdirect.net/eric/seriatim/schema" +) + +func TestParseArtifactJSONRejectsMalformedJSON(t *testing.T) { + _, err := ParseArtifactJSON([]byte(`{"metadata":`)) + if err == nil { + t.Fatal("expected malformed JSON error") + } + if !strings.Contains(err.Error(), "input JSON is malformed") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestParseArtifactJSONRejectsDuplicateSegmentIDs(t *testing.T) { + first := 10 + second := 20 + value := schema.Transcript{ + Metadata: schema.Metadata{ + Application: "seriatim", + Version: "v-test", + }, + Segments: []schema.Segment{ + {ID: 1, Source: "a.json", SourceSegmentIndex: &first, Speaker: "A", Start: 1, End: 2, Text: "one"}, + {ID: 1, Source: "a.json", SourceSegmentIndex: &second, Speaker: "B", Start: 2, End: 3, Text: "two"}, + }, + OverlapGroups: []schema.OverlapGroup{}, + } + data := mustMarshalJSON(t, value) + + _, err := ParseArtifactJSON(data) + if err == nil { + t.Fatal("expected invalid artifact error") + } + if !strings.Contains(err.Error(), "not a valid seriatim output artifact") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestParseArtifactJSONRejectsNonSequentialSegmentIDs(t *testing.T) { + first := 10 + second := 20 + value := schema.Transcript{ + Metadata: schema.Metadata{ + Application: "seriatim", + Version: "v-test", + }, + Segments: []schema.Segment{ + {ID: 1, Source: "a.json", SourceSegmentIndex: &first, Speaker: "A", Start: 1, End: 2, Text: "one"}, + {ID: 3, Source: "a.json", SourceSegmentIndex: &second, Speaker: "B", Start: 2, End: 3, Text: "two"}, + }, + OverlapGroups: []schema.OverlapGroup{}, + } + data := mustMarshalJSON(t, value) + + _, err := ParseArtifactJSON(data) + if err == nil { + t.Fatal("expected invalid artifact error") + } + if !strings.Contains(err.Error(), "not a valid seriatim output artifact") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestConvertArtifactMinimalToIntermediate(t *testing.T) { + value := schema.MinimalTranscript{ + Metadata: schema.MinimalMetadata{ + Application: "seriatim", + Version: "v-test", + OutputSchema: SchemaMinimal, + }, + Segments: []schema.MinimalSegment{ + {ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one"}, + {ID: 2, Start: 2, End: 3, Speaker: "B", Text: "two"}, + }, + } + artifact := Artifact{ + Schema: SchemaMinimal, + Minimal: &value, + } + + converted, err := ConvertArtifact(artifact, SchemaIntermediate) + if err != nil { + t.Fatalf("convert failed: %v", err) + } + if converted.Schema != SchemaIntermediate { + t.Fatalf("schema = %q, want %q", converted.Schema, SchemaIntermediate) + } + if converted.Intermediate == nil { + t.Fatal("expected intermediate artifact") + } + if len(converted.Intermediate.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(converted.Intermediate.Segments)) + } + if converted.Intermediate.Segments[0].ID != 1 || converted.Intermediate.Segments[1].ID != 2 { + t.Fatalf("unexpected IDs: %#v", converted.Intermediate.Segments) + } +} + +func TestConvertArtifactMinimalToFullFails(t *testing.T) { + value := schema.MinimalTranscript{ + Metadata: schema.MinimalMetadata{ + Application: "seriatim", + Version: "v-test", + OutputSchema: SchemaMinimal, + }, + Segments: []schema.MinimalSegment{ + {ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one"}, + }, + } + artifact := Artifact{ + Schema: SchemaMinimal, + Minimal: &value, + } + + _, err := ConvertArtifact(artifact, SchemaFull) + if err == nil { + t.Fatal("expected conversion error") + } + if !strings.Contains(err.Error(), "cannot emit") { + t.Fatalf("unexpected error: %v", err) + } +} + +func mustMarshalJSON(t *testing.T, value any) []byte { + t.Helper() + data, err := json.Marshal(value) + if err != nil { + t.Fatalf("marshal: %v", err) + } + return data +}