// InputShape names the top-level JSON layout a normalize input was written in.
type InputShape string

// Top-level layouts reported in ParsedTranscript.Shape.
const (
	// ShapeObjectWithSegments marks input that was a JSON object holding a
	// "segments" field.
	ShapeObjectWithSegments InputShape = "object_with_segments"
	// ShapeBareSegmentsArray marks input that was a JSON array of segments.
	ShapeBareSegmentsArray InputShape = "bare_segments_array"
)

type (
	// ParsedTranscript is the validated result of parsing normalize input.
	ParsedTranscript struct {
		// Shape records which top-level layout the input used.
		Shape InputShape
		// Segments holds the validated segments in input order.
		Segments []InputSegment
	}

	// InputSegment is one segment that passed validation.
	InputSegment struct {
		// InputIndex is the segment's zero-based position in the input array.
		InputIndex int
		// OriginalID is the input "id" value, or nil when none was given.
		OriginalID *int
		// Start and End are taken from the input "start" and "end" fields.
		Start float64
		End   float64
		// Speaker is the input "speaker" with surrounding whitespace removed.
		Speaker string
		// Text is the input "text" exactly as given.
		Text string
		// Categories is a copy of the input "categories" list.
		Categories []string
		// Source, SourceSegmentIndex and SourceRef are carried over from the
		// input "source", "source_segment_index" and "source_ref" fields.
		Source             string
		SourceSegmentIndex *int
		SourceRef          string
		// DerivedFrom is a copy of the input "derived_from" list.
		DerivedFrom []string
		// OverlapGroupID is the input "overlap_group_id", or nil when absent.
		OverlapGroupID *int
	}

	// inputSegmentPayload is the raw JSON form of a segment. The required
	// fields are pointers so that a missing key stays nil and can be told
	// apart from a zero value.
	inputSegmentPayload struct {
		ID                 *int     `json:"id"`
		Start              *float64 `json:"start"`
		End                *float64 `json:"end"`
		Speaker            *string  `json:"speaker"`
		Text               *string  `json:"text"`
		Categories         []string `json:"categories"`
		Source             string   `json:"source"`
		SourceSegmentIndex *int     `json:"source_segment_index"`
		SourceRef          string   `json:"source_ref"`
		DerivedFrom        []string `json:"derived_from"`
		OverlapGroupID     *int     `json:"overlap_group_id"`
	}
)
+func ParseFile(path string) (ParsedTranscript, error) { + file, err := os.Open(path) + if err != nil { + return ParsedTranscript{}, err + } + defer file.Close() + + return ParseReader(file) +} + +// ParseReader parses normalize input JSON from a reader. +func ParseReader(reader io.Reader) (ParsedTranscript, error) { + var raw json.RawMessage + decoder := json.NewDecoder(reader) + decoder.UseNumber() + if err := decoder.Decode(&raw); err != nil { + return ParsedTranscript{}, fmt.Errorf("decode normalize input JSON: %w", err) + } + if err := ensureSingleValue(decoder); err != nil { + return ParsedTranscript{}, err + } + + trimmed := bytes.TrimSpace(raw) + if len(trimmed) == 0 { + return ParsedTranscript{}, fmt.Errorf("normalize input is empty") + } + + switch trimmed[0] { + case '{': + return parseObjectShape(trimmed) + case '[': + segments, err := parseSegmentsArray(trimmed) + if err != nil { + return ParsedTranscript{}, err + } + return ParsedTranscript{ + Shape: ShapeBareSegmentsArray, + Segments: segments, + }, nil + default: + return ParsedTranscript{}, fmt.Errorf("normalize input must be a top-level object with \"segments\" or a top-level segment array") + } +} + +func ensureSingleValue(decoder *json.Decoder) error { + var extra json.RawMessage + err := decoder.Decode(&extra) + if err == io.EOF { + return nil + } + if err == nil { + return fmt.Errorf("normalize input must contain exactly one top-level JSON value") + } + return fmt.Errorf("decode normalize input JSON: %w", err) +} + +func parseObjectShape(raw []byte) (ParsedTranscript, error) { + var object map[string]json.RawMessage + if err := json.Unmarshal(raw, &object); err != nil { + return ParsedTranscript{}, fmt.Errorf("decode normalize object input: %w", err) + } + + segmentsRaw, exists := object["segments"] + if !exists { + return ParsedTranscript{}, fmt.Errorf("normalize object input must contain a \"segments\" field") + } + + segments, err := parseSegmentsArray(segmentsRaw) + if err != nil { + 
return ParsedTranscript{}, err + } + + return ParsedTranscript{ + Shape: ShapeObjectWithSegments, + Segments: segments, + }, nil +} + +func parseSegmentsArray(raw []byte) ([]InputSegment, error) { + var segmentValues []json.RawMessage + if err := json.Unmarshal(raw, &segmentValues); err != nil { + return nil, fmt.Errorf("normalize input \"segments\" must be an array") + } + + segments := make([]InputSegment, len(segmentValues)) + for index, segmentRaw := range segmentValues { + segment, err := parseSegment(index, segmentRaw) + if err != nil { + return nil, err + } + segments[index] = segment + } + return segments, nil +} + +func parseSegment(index int, raw []byte) (InputSegment, error) { + var payload inputSegmentPayload + if err := json.Unmarshal(raw, &payload); err != nil { + return InputSegment{}, fmt.Errorf("segment %d: invalid segment object: %w", index, err) + } + + if payload.Start == nil { + return InputSegment{}, fmt.Errorf("segment %d is missing required field \"start\"", index) + } + if payload.End == nil { + return InputSegment{}, fmt.Errorf("segment %d is missing required field \"end\"", index) + } + if payload.Speaker == nil { + return InputSegment{}, fmt.Errorf("segment %d is missing required field \"speaker\"", index) + } + if payload.Text == nil { + return InputSegment{}, fmt.Errorf("segment %d is missing required field \"text\"", index) + } + + if *payload.Start < 0 { + return InputSegment{}, fmt.Errorf("segment %d has start %v; start must be >= 0", index, *payload.Start) + } + if *payload.End < *payload.Start { + return InputSegment{}, fmt.Errorf("segment %d has end %v before start %v", index, *payload.End, *payload.Start) + } + + speaker := strings.TrimSpace(*payload.Speaker) + if speaker == "" { + return InputSegment{}, fmt.Errorf("segment %d has empty \"speaker\"; speaker must be non-empty", index) + } + + return InputSegment{ + InputIndex: index, + OriginalID: payload.ID, + Start: *payload.Start, + End: *payload.End, + Speaker: speaker, + 
Text: *payload.Text, + Categories: append([]string(nil), payload.Categories...), + Source: payload.Source, + SourceSegmentIndex: payload.SourceSegmentIndex, + SourceRef: payload.SourceRef, + DerivedFrom: append([]string(nil), payload.DerivedFrom...), + OverlapGroupID: payload.OverlapGroupID, + }, nil +} diff --git a/internal/normalize/parse_test.go b/internal/normalize/parse_test.go new file mode 100644 index 0000000..3a3bb1a --- /dev/null +++ b/internal/normalize/parse_test.go @@ -0,0 +1,181 @@ +package normalize + +import ( + "strings" + "testing" +) + +func TestParseReaderObjectWithSegmentsParses(t *testing.T) { + input := `{ + "segments": [ + {"start": 1.0, "end": 2.0, "speaker": " Alice ", "text": "hello", "id": 100} + ] + }` + + parsed, err := ParseReader(strings.NewReader(input)) + if err != nil { + t.Fatalf("parse failed: %v", err) + } + if parsed.Shape != ShapeObjectWithSegments { + t.Fatalf("shape = %q, want %q", parsed.Shape, ShapeObjectWithSegments) + } + if len(parsed.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(parsed.Segments)) + } + segment := parsed.Segments[0] + if segment.Speaker != "Alice" { + t.Fatalf("speaker = %q, want %q", segment.Speaker, "Alice") + } + if segment.OriginalID == nil || *segment.OriginalID != 100 { + t.Fatalf("original id = %v, want 100", segment.OriginalID) + } +} + +func TestParseReaderBareSegmentArrayParses(t *testing.T) { + input := `[ + {"start": 1.0, "end": 2.0, "speaker": "Alice", "text": "hello"}, + {"start": 3.0, "end": 4.0, "speaker": "Bob", "text": "world"} + ]` + + parsed, err := ParseReader(strings.NewReader(input)) + if err != nil { + t.Fatalf("parse failed: %v", err) + } + if parsed.Shape != ShapeBareSegmentsArray { + t.Fatalf("shape = %q, want %q", parsed.Shape, ShapeBareSegmentsArray) + } + if len(parsed.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(parsed.Segments)) + } +} + +func TestParseReaderInvalidJSONFails(t *testing.T) { + _, err := 
ParseReader(strings.NewReader(`{"segments":`)) + if err == nil { + t.Fatal("expected parse error") + } + if !strings.Contains(err.Error(), "decode normalize input JSON") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestParseReaderObjectMissingSegmentsFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`{"items":[]}`)) + if err == nil { + t.Fatal("expected missing segments error") + } + if !strings.Contains(err.Error(), "must contain a \"segments\" field") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestParseReaderSegmentsNotArrayFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`{"segments": {}}`)) + if err == nil { + t.Fatal("expected segments not array error") + } + if !strings.Contains(err.Error(), "\"segments\" must be an array") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestParseReaderTopLevelScalarShapesFail(t *testing.T) { + tests := []string{`"text"`, `42`, `null`} + for _, input := range tests { + _, err := ParseReader(strings.NewReader(input)) + if err == nil { + t.Fatalf("expected top-level shape error for %s", input) + } + if !strings.Contains(err.Error(), "top-level object") { + t.Fatalf("unexpected error for %s: %v", input, err) + } + } +} + +func TestParseReaderMissingStartFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"end":2,"speaker":"A","text":"t"}]`)) + assertContains(t, err, `missing required field "start"`) +} + +func TestParseReaderMissingEndFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":1,"speaker":"A","text":"t"}]`)) + assertContains(t, err, `missing required field "end"`) +} + +func TestParseReaderMissingSpeakerFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"text":"t"}]`)) + assertContains(t, err, `missing required field "speaker"`) +} + +func TestParseReaderEmptySpeakerFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":" ","text":"t"}]`)) + 
assertContains(t, err, `speaker must be non-empty`) +} + +func TestParseReaderMissingTextFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":"A"}]`)) + assertContains(t, err, `missing required field "text"`) +} + +func TestParseReaderEndBeforeStartFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":3,"end":2,"speaker":"A","text":"t"}]`)) + assertContains(t, err, "before start") +} + +func TestParseReaderNegativeStartFails(t *testing.T) { + _, err := ParseReader(strings.NewReader(`[{"start":-1,"end":2,"speaker":"A","text":"t"}]`)) + assertContains(t, err, "start must be >= 0") +} + +func TestParseReaderEmptySegmentsArrayAccepted(t *testing.T) { + parsed, err := ParseReader(strings.NewReader(`{"segments":[]}`)) + if err != nil { + t.Fatalf("parse failed: %v", err) + } + if len(parsed.Segments) != 0 { + t.Fatalf("segment count = %d, want 0", len(parsed.Segments)) + } +} + +func TestParseReaderCategoriesPreservedWhenValid(t *testing.T) { + parsed, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":"A","text":"t","categories":["filler","backchannel"]}]`)) + if err != nil { + t.Fatalf("parse failed: %v", err) + } + if len(parsed.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(parsed.Segments)) + } + if len(parsed.Segments[0].Categories) != 2 { + t.Fatalf("categories length = %d, want 2", len(parsed.Segments[0].Categories)) + } + if parsed.Segments[0].Categories[0] != "filler" || parsed.Segments[0].Categories[1] != "backchannel" { + t.Fatalf("categories = %v", parsed.Segments[0].Categories) + } +} + +func TestParseReaderOriginalInputIndexPreserved(t *testing.T) { + input := `[ + {"start":1,"end":2,"speaker":"A","text":"one"}, + {"start":2,"end":3,"speaker":"B","text":"two"}, + {"start":3,"end":4,"speaker":"C","text":"three"} + ]` + parsed, err := ParseReader(strings.NewReader(input)) + if err != nil { + t.Fatalf("parse failed: %v", err) + } + for index, segment := range 
// assertContains fails the test immediately unless err is non-nil and its
// message contains fragment. It is marked as a helper so failures are
// attributed to the calling test.
func assertContains(t *testing.T, err error, fragment string) {
	t.Helper()
	switch {
	case err == nil:
		t.Fatalf("expected error containing %q", fragment)
	case !strings.Contains(err.Error(), fragment):
		t.Fatalf("error = %q, want substring %q", err.Error(), fragment)
	}
}