diff --git a/internal/cli/normalize_test.go b/internal/cli/normalize_test.go
index 65cbb59..7917802 100644
--- a/internal/cli/normalize_test.go
+++ b/internal/cli/normalize_test.go
@@ -1,9 +1,13 @@
 package cli
 
 import (
+	"os"
 	"path/filepath"
 	"strings"
 	"testing"
+
+	"gitea.maximumdirect.net/eric/seriatim/internal/config"
+	"gitea.maximumdirect.net/eric/seriatim/schema"
 )
 
 func TestNormalizeCommandIsRecognized(t *testing.T) {
@@ -80,7 +84,171 @@ func TestNormalizeInvalidOutputModuleFails(t *testing.T) {
 	}
 }
 
-func TestNormalizeValidFlagsReachNotImplementedBoundary(t *testing.T) {
+func TestNormalizeDefaultOutputSchemaIsIntermediate(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `{
+		"segments": [
+			{"id": 99, "start": 5, "end": 6, "speaker": "Bob", "text": "second", "categories": ["filler"]},
+			{"id": 10, "start": 1, "end": 2, "speaker": "Alice", "text": "first", "categories": ["backchannel"]}
+		]
+	}`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.IntermediateTranscript
+	readJSON(t, output, &transcript)
+	if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate {
+		t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate)
+	}
+	if len(transcript.Segments) != 2 {
+		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
+	}
+	if transcript.Segments[0].ID != 1 || transcript.Segments[1].ID != 2 {
+		t.Fatalf("segment IDs = %d,%d, want 1,2", transcript.Segments[0].ID, transcript.Segments[1].ID)
+	}
+	if transcript.Segments[0].Text != "first" || transcript.Segments[1].Text != "second" {
+		t.Fatalf("unexpected sort order: %#v", transcript.Segments)
+	}
+	if len(transcript.Segments[0].Categories) != 1 || transcript.Segments[0].Categories[0] != "backchannel" {
+		t.Fatalf("expected categories preserved on first segment, got %#v", transcript.Segments[0].Categories)
+	}
+}
+
+func TestNormalizeBareArrayInputToIntermediateOutput(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `[
+		{"start": 2, "end": 3, "speaker": "Bob", "text": "second"},
+		{"start": 1, "end": 2, "speaker": "Alice", "text": "first"}
+	]`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+		"--output-schema", config.OutputSchemaIntermediate,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.IntermediateTranscript
+	readJSON(t, output, &transcript)
+	if len(transcript.Segments) != 2 {
+		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
+	}
+	if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[1].Speaker != "Bob" {
+		t.Fatalf("unexpected sorted speakers: %#v", transcript.Segments)
+	}
+}
+
+func TestNormalizeInputIndexTieBreakerIsDeterministic(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `[
+		{"start": 1, "end": 2, "speaker": "Zulu", "text": "first in"},
+		{"start": 1, "end": 2, "speaker": "Alpha", "text": "second in"}
+	]`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.IntermediateTranscript
+	readJSON(t, output, &transcript)
+	if transcript.Segments[0].Speaker != "Zulu" || transcript.Segments[1].Speaker != "Alpha" {
+		t.Fatalf("tie-break order mismatch: %#v", transcript.Segments)
+	}
+}
+
+func TestNormalizeMinimalSchemaOmitsCategories(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `{
+		"segments": [
+			{"start": 1, "end": 2, "speaker": "Alice", "text": "first", "categories": ["filler"]}
+		]
+	}`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+		"--output-schema", config.OutputSchemaMinimal,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.MinimalTranscript
+	readJSON(t, output, &transcript)
+	if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
+		t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
+	}
+	if len(transcript.Segments) != 1 || transcript.Segments[0].ID != 1 {
+		t.Fatalf("unexpected minimal output: %#v", transcript.Segments)
+	}
+	bytes, readErr := os.ReadFile(output)
+	if readErr != nil {
+		t.Fatalf("read output: %v", readErr)
+	}
+	if strings.Contains(string(bytes), "categories") {
+		t.Fatalf("minimal output unexpectedly contains categories:\n%s", string(bytes))
+	}
+}
+
+func TestNormalizeFullSchemaOutputValidatesAndHasProvenanceFallback(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `[
+		{"start": 1, "end": 2, "speaker": "Alice", "text": "first"},
+		{"start": 3, "end": 4, "speaker": "Bob", "text": "second", "source":"custom.json", "source_segment_index": 7}
+	]`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+		"--output-schema", config.OutputSchemaFull,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.Transcript
+	readJSON(t, output, &transcript)
+	if err := schema.ValidateTranscript(transcript); err != nil {
+		t.Fatalf("full output should validate: %v", err)
+	}
+	if len(transcript.Segments) != 2 {
+		t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
+	}
+	if transcript.Segments[0].Source != filepath.Base(input) {
+		t.Fatalf("source fallback = %q, want %q", transcript.Segments[0].Source, filepath.Base(input))
+	}
+	if transcript.Segments[0].SourceSegmentIndex == nil || *transcript.Segments[0].SourceSegmentIndex != 0 {
+		t.Fatalf("source_segment_index fallback = %v, want 0", transcript.Segments[0].SourceSegmentIndex)
+	}
+	if transcript.Segments[1].Source != "custom.json" {
+		t.Fatalf("explicit source preserved = %q, want custom.json", transcript.Segments[1].Source)
+	}
+	if transcript.Segments[1].SourceSegmentIndex == nil || *transcript.Segments[1].SourceSegmentIndex != 7 {
+		t.Fatalf("explicit source_segment_index preserved = %v, want 7", transcript.Segments[1].SourceSegmentIndex)
+	}
+	if transcript.OverlapGroups == nil || len(transcript.OverlapGroups) != 0 {
+		t.Fatalf("overlap_groups = %#v, want empty array", transcript.OverlapGroups)
+	}
+}
+
+func TestNormalizeEmptySegmentsArrayProducesValidOutput(t *testing.T) {
 	dir := t.TempDir()
 	input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
 	output := filepath.Join(dir, "normalized.json")
@@ -89,11 +257,38 @@ func TestNormalizeValidFlagsReachNotImplementedBoundary(t *testing.T) {
 		"--input-file", input,
 		"--output-file", output,
 	)
-	if err == nil {
-		t.Fatal("expected not implemented error")
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
 	}
-	if !strings.Contains(err.Error(), "not implemented") {
-		t.Fatalf("unexpected error: %v", err)
+
+	var transcript schema.IntermediateTranscript
+	readJSON(t, output, &transcript)
+	if len(transcript.Segments) != 0 {
+		t.Fatalf("segment count = %d, want 0", len(transcript.Segments))
+	}
+	if err := schema.ValidateIntermediateTranscript(transcript); err != nil {
+		t.Fatalf("intermediate output should validate: %v", err)
+	}
+}
+
+func TestNormalizeSelectedOutputSchemaIsHonored(t *testing.T) {
+	dir := t.TempDir()
+	input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"speaker":"A","text":"one"}]}`)
+	output := filepath.Join(dir, "normalized.json")
+
+	err := executeNormalize(
+		"--input-file", input,
+		"--output-file", output,
+		"--output-schema", config.OutputSchemaMinimal,
+	)
+	if err != nil {
+		t.Fatalf("normalize failed: %v", err)
+	}
+
+	var transcript schema.MinimalTranscript
+	readJSON(t, output, &transcript)
+	if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
+		t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
 	}
 }
 
diff --git a/internal/normalize/build.go b/internal/normalize/build.go
new file mode 100644
index 0000000..c0ec4a7
--- /dev/null
+++ b/internal/normalize/build.go
@@ -0,0 +1,163 @@
+package normalize
+
+import (
+	"fmt"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
+	"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
+	"gitea.maximumdirect.net/eric/seriatim/internal/config"
+	"gitea.maximumdirect.net/eric/seriatim/schema"
+)
+
+// Build converts parsed normalize input into a selected seriatim output schema.
+func Build(parsed ParsedTranscript, cfg config.NormalizeConfig) (any, error) {
+	ordered := sortedSegments(parsed.Segments)
+
+	switch cfg.OutputSchema {
+	case config.OutputSchemaMinimal:
+		output := buildMinimal(ordered)
+		if err := schema.ValidateMinimalTranscript(output); err != nil {
+			return nil, fmt.Errorf("validate normalize output: %w", err)
+		}
+		return output, nil
+	case config.OutputSchemaIntermediate:
+		output := buildIntermediate(ordered)
+		if err := schema.ValidateIntermediateTranscript(output); err != nil {
+			return nil, fmt.Errorf("validate normalize output: %w", err)
+		}
+		return output, nil
+	case config.OutputSchemaFull:
+		output := buildFull(ordered, cfg)
+		if err := schema.ValidateTranscript(output); err != nil {
+			return nil, fmt.Errorf("validate normalize output: %w", err)
+		}
+		return output, nil
+	default:
+		return nil, fmt.Errorf("unsupported output schema %q", cfg.OutputSchema)
+	}
+}
+
+// sortedSegments returns a copy of input ordered by start, end, then original
+// input position, keeping ties deterministic without mutating the caller's slice.
+func sortedSegments(input []InputSegment) []InputSegment {
+	ordered := make([]InputSegment, len(input))
+	copy(ordered, input)
+	sort.SliceStable(ordered, func(i, j int) bool {
+		left := ordered[i]
+		right := ordered[j]
+		if left.Start != right.Start {
+			return left.Start < right.Start
+		}
+		if left.End != right.End {
+			return left.End < right.End
+		}
+		if left.InputIndex != right.InputIndex {
+			return left.InputIndex < right.InputIndex
+		}
+		return left.Speaker < right.Speaker
+	})
+	return ordered
+}
+
+// buildMinimal renumbers the ordered segments from 1 and emits the minimal schema.
+func buildMinimal(segments []InputSegment) schema.MinimalTranscript {
+	outputSegments := make([]schema.MinimalSegment, len(segments))
+	for index, segment := range segments {
+		outputSegments[index] = schema.MinimalSegment{
+			ID:      index + 1,
+			Start:   segment.Start,
+			End:     segment.End,
+			Speaker: segment.Speaker,
+			Text:    segment.Text,
+		}
+	}
+
+	return schema.MinimalTranscript{
+		Metadata: schema.MinimalMetadata{
+			Application:  artifact.ApplicationName,
+			Version:      buildinfo.Version,
+			OutputSchema: config.OutputSchemaMinimal,
+		},
+		Segments: outputSegments,
+	}
+}
+
+// buildIntermediate renumbers the ordered segments and emits the intermediate schema.
+func buildIntermediate(segments []InputSegment) schema.IntermediateTranscript {
+	outputSegments := make([]schema.IntermediateSegment, len(segments))
+	for index, segment := range segments {
+		outputSegments[index] = schema.IntermediateSegment{
+			ID:         index + 1,
+			Start:      segment.Start,
+			End:        segment.End,
+			Speaker:    segment.Speaker,
+			Text:       segment.Text,
+			Categories: append([]string(nil), segment.Categories...),
+		}
+	}
+
+	return schema.IntermediateTranscript{
+		Metadata: schema.IntermediateMetadata{
+			Application:  artifact.ApplicationName,
+			Version:      buildinfo.Version,
+			OutputSchema: config.OutputSchemaIntermediate,
+		},
+		Segments: outputSegments,
+	}
+}
+
+// buildFull emits the full schema, defaulting missing provenance (source,
+// source_segment_index) from the input file name and original input position.
+func buildFull(segments []InputSegment, cfg config.NormalizeConfig) schema.Transcript {
+	defaultSource := filepath.Base(cfg.InputFile)
+	outputSegments := make([]schema.Segment, len(segments))
+	for index, segment := range segments {
+		source := strings.TrimSpace(segment.Source)
+		if source == "" {
+			source = defaultSource
+		}
+
+		sourceSegmentIndex := copyIntPtr(segment.SourceSegmentIndex)
+		if sourceSegmentIndex == nil {
+			fallback := segment.InputIndex
+			sourceSegmentIndex = &fallback
+		}
+
+		outputSegments[index] = schema.Segment{
+			ID:                 index + 1,
+			Source:             source,
+			SourceSegmentIndex: sourceSegmentIndex,
+			SourceRef:          segment.SourceRef,
+			DerivedFrom:        append([]string(nil), segment.DerivedFrom...),
+			Speaker:            segment.Speaker,
+			Start:              segment.Start,
+			End:                segment.End,
+			Text:               segment.Text,
+			Categories:         append([]string(nil), segment.Categories...),
+		}
+	}
+
+	return schema.Transcript{
+		Metadata: schema.Metadata{
+			Application:           artifact.ApplicationName,
+			Version:               buildinfo.Version,
+			InputReader:           "normalize-input",
+			InputFiles:            []string{cfg.InputFile},
+			PreprocessingModules:  []string{},
+			PostprocessingModules: []string{},
+			OutputModules:         append([]string(nil), cfg.OutputModules...),
+		},
+		Segments:      outputSegments,
+		OverlapGroups: []schema.OverlapGroup{},
+	}
+}
+
+// copyIntPtr returns a pointer to a fresh copy of *value, or nil for nil input.
+func copyIntPtr(value *int) *int {
+	if value == nil {
+		return nil
+	}
+	copied := *value
+	return &copied
+}
diff --git a/internal/normalize/normalize.go b/internal/normalize/normalize.go
index 48465a1..5c57212 100644
--- a/internal/normalize/normalize.go
+++ b/internal/normalize/normalize.go
@@ -2,7 +2,9 @@ package normalize
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"os"
 
 	"gitea.maximumdirect.net/eric/seriatim/internal/config"
 )
@@ -14,10 +16,39 @@ func Run(ctx context.Context, cfg config.NormalizeConfig) error {
 		return err
 	}
 
-	if _, err := ParseFile(cfg.InputFile); err != nil {
+	parsed, err := ParseFile(cfg.InputFile)
+	if err != nil {
 		return err
 	}
 
-	// TODO: Implement transcript normalization transformation.
-	return fmt.Errorf("normalize command is not implemented yet")
+	output, err := Build(parsed, cfg)
+	if err != nil {
+		return err
+	}
+
+	if err := writeOutputJSON(cfg.OutputFile, output); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// writeOutputJSON writes value to path as indented JSON. Close is checked
+// explicitly so buffered-write failures are not silently dropped.
+func writeOutputJSON(path string, value any) error {
+	file, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+
+	encoder := json.NewEncoder(file)
+	encoder.SetIndent("", "  ")
+	if err := encoder.Encode(value); err != nil {
+		file.Close()
+		return fmt.Errorf("encode normalize output JSON: %w", err)
+	}
+	if err := file.Close(); err != nil {
+		return fmt.Errorf("close normalize output %q: %w", path, err)
+	}
+	return nil
 }