package normalize import ( "bytes" "encoding/json" "fmt" "io" "os" "strings" ) // InputShape identifies which top-level input shape was parsed. type InputShape string const ( ShapeObjectWithSegments InputShape = "object_with_segments" ShapeBareSegmentsArray InputShape = "bare_segments_array" ) // ParsedTranscript is the validated normalize input model. type ParsedTranscript struct { Shape InputShape Segments []InputSegment } // InputSegment is a validated segment from normalize input. type InputSegment struct { InputIndex int OriginalID *int Start float64 End float64 Speaker string Text string Categories []string Source string SourceSegmentIndex *int SourceRef string DerivedFrom []string OverlapGroupID *int } type inputSegmentPayload struct { ID *int `json:"id"` Start *float64 `json:"start"` End *float64 `json:"end"` Speaker *string `json:"speaker"` Text *string `json:"text"` Categories []string `json:"categories"` Source string `json:"source"` SourceSegmentIndex *int `json:"source_segment_index"` SourceRef string `json:"source_ref"` DerivedFrom []string `json:"derived_from"` OverlapGroupID *int `json:"overlap_group_id"` } // ParseFile parses normalize input JSON from file path. func ParseFile(path string) (ParsedTranscript, error) { file, err := os.Open(path) if err != nil { return ParsedTranscript{}, err } defer file.Close() return ParseReader(file) } // ParseReader parses normalize input JSON from a reader. func ParseReader(reader io.Reader) (ParsedTranscript, error) { var raw json.RawMessage decoder := json.NewDecoder(reader) decoder.UseNumber() if err := decoder.Decode(&raw); err != nil { return ParsedTranscript{}, fmt.Errorf("decode normalize input JSON: %w", err) } if err := ensureSingleValue(decoder); err != nil { return ParsedTranscript{}, err } trimmed := bytes.TrimSpace(raw) if len(trimmed) == 0 { return ParsedTranscript{}, fmt.Errorf("normalize input is empty") } switch trimmed[0] { case '{': return parseObjectShape(trimmed) case '[': segments, err := parseSegmentsArray(trimmed) if err != nil { return ParsedTranscript{}, err } return ParsedTranscript{ Shape: ShapeBareSegmentsArray, Segments: segments, }, nil default: return ParsedTranscript{}, fmt.Errorf("normalize input must be a top-level object with \"segments\" or a top-level segment array") } } func ensureSingleValue(decoder *json.Decoder) error { var extra json.RawMessage err := decoder.Decode(&extra) if err == io.EOF { return nil } if err == nil { return fmt.Errorf("normalize input must contain exactly one top-level JSON value") } return fmt.Errorf("decode normalize input JSON: %w", err) } func parseObjectShape(raw []byte) (ParsedTranscript, error) { var object map[string]json.RawMessage if err := json.Unmarshal(raw, &object); err != nil { return ParsedTranscript{}, fmt.Errorf("decode normalize object input: %w", err) } segmentsRaw, exists := object["segments"] if !exists { return ParsedTranscript{}, fmt.Errorf("normalize object input must contain a \"segments\" field") } segments, err := parseSegmentsArray(segmentsRaw) if err != nil { return ParsedTranscript{}, err } return ParsedTranscript{ Shape: ShapeObjectWithSegments, Segments: segments, }, nil } func parseSegmentsArray(raw []byte) ([]InputSegment, error) { var segmentValues []json.RawMessage if err := json.Unmarshal(raw, &segmentValues); err != nil { return nil, fmt.Errorf("normalize input \"segments\" must be an array") } segments := make([]InputSegment, len(segmentValues)) for index, segmentRaw := range segmentValues { segment, err := parseSegment(index, segmentRaw) if err != nil { return nil, err } segments[index] = segment } return segments, nil } func parseSegment(index int, raw []byte) (InputSegment, error) { var payload inputSegmentPayload if err := json.Unmarshal(raw, &payload); err != nil { return InputSegment{}, fmt.Errorf("segment %d: invalid segment object: %w", index, err) } if payload.Start == nil { return InputSegment{}, fmt.Errorf("segment %d is missing required field \"start\"", index) } if payload.End == nil { return InputSegment{}, fmt.Errorf("segment %d is missing required field \"end\"", index) } if payload.Speaker == nil { return InputSegment{}, fmt.Errorf("segment %d is missing required field \"speaker\"", index) } if payload.Text == nil { return InputSegment{}, fmt.Errorf("segment %d is missing required field \"text\"", index) } if *payload.Start < 0 { return InputSegment{}, fmt.Errorf("segment %d has start %v; start must be >= 0", index, *payload.Start) } if *payload.End < *payload.Start { return InputSegment{}, fmt.Errorf("segment %d has end %v before start %v", index, *payload.End, *payload.Start) } speaker := strings.TrimSpace(*payload.Speaker) if speaker == "" { return InputSegment{}, fmt.Errorf("segment %d has empty \"speaker\"; speaker must be non-empty", index) } return InputSegment{ InputIndex: index, OriginalID: payload.ID, Start: *payload.Start, End: *payload.End, Speaker: speaker, Text: *payload.Text, Categories: append([]string(nil), payload.Categories...), Source: payload.Source, SourceSegmentIndex: payload.SourceSegmentIndex, SourceRef: payload.SourceRef, DerivedFrom: append([]string(nil), payload.DerivedFrom...), OverlapGroupID: payload.OverlapGroupID, }, nil }