Add normalize input parsing
This commit is contained in:
197
internal/normalize/parse.go
Normal file
197
internal/normalize/parse.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package normalize
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// InputShape names the top-level JSON layout that a parsed normalize input
// used.
type InputShape string
|
||||
|
||||
const (
|
||||
ShapeObjectWithSegments InputShape = "object_with_segments"
|
||||
ShapeBareSegmentsArray InputShape = "bare_segments_array"
|
||||
)
|
||||
|
||||
// ParsedTranscript is the validated normalize input model.
|
||||
type ParsedTranscript struct {
|
||||
Shape InputShape
|
||||
Segments []InputSegment
|
||||
}
|
||||
|
||||
// InputSegment is one segment of a normalize input after validation.
type InputSegment struct {
	// InputIndex is the zero-based position of the segment in the input array.
	InputIndex int

	// OriginalID is the input "id" value; nil when the input had none.
	OriginalID *int

	// Start is the input "start" value; validated to be >= 0.
	Start float64

	// End is the input "end" value; validated to be >= Start.
	End float64

	// Speaker is the input "speaker" with surrounding whitespace removed;
	// validated to be non-empty.
	Speaker string

	// Text is the input "text", stored exactly as given.
	Text string

	// Categories is a copy of the input "categories".
	Categories []string

	// Source is the input "source".
	Source string

	// SourceSegmentIndex is the input "source_segment_index"; nil when the
	// input had none.
	SourceSegmentIndex *int

	// SourceRef is the input "source_ref".
	SourceRef string

	// DerivedFrom is a copy of the input "derived_from".
	DerivedFrom []string

	// OverlapGroupID is the input "overlap_group_id"; nil when the input had
	// none.
	OverlapGroupID *int
}
|
||||
|
||||
// inputSegmentPayload mirrors the JSON form of a single input segment.
//
// The fields that parseSegment treats as required (start, end, speaker, text)
// are pointers so that a missing or null value stays nil and can be told
// apart from a zero value.
type inputSegmentPayload struct {
	// Optional identifier.
	ID *int `json:"id"`

	// Required fields; nil when absent from the JSON.
	Start   *float64 `json:"start"`
	End     *float64 `json:"end"`
	Speaker *string  `json:"speaker"`
	Text    *string  `json:"text"`

	// Optional fields, passed through to InputSegment by parseSegment.
	Categories         []string `json:"categories"`
	Source             string   `json:"source"`
	SourceSegmentIndex *int     `json:"source_segment_index"`
	SourceRef          string   `json:"source_ref"`
	DerivedFrom        []string `json:"derived_from"`
	OverlapGroupID     *int     `json:"overlap_group_id"`
}
|
||||
|
||||
// ParseFile parses normalize input JSON from file path.
|
||||
func ParseFile(path string) (ParsedTranscript, error) {
|
||||
file, err := os.Open(path)
|
||||
if err != nil {
|
||||
return ParsedTranscript{}, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
return ParseReader(file)
|
||||
}
|
||||
|
||||
// ParseReader parses normalize input JSON from a reader.
|
||||
func ParseReader(reader io.Reader) (ParsedTranscript, error) {
|
||||
var raw json.RawMessage
|
||||
decoder := json.NewDecoder(reader)
|
||||
decoder.UseNumber()
|
||||
if err := decoder.Decode(&raw); err != nil {
|
||||
return ParsedTranscript{}, fmt.Errorf("decode normalize input JSON: %w", err)
|
||||
}
|
||||
if err := ensureSingleValue(decoder); err != nil {
|
||||
return ParsedTranscript{}, err
|
||||
}
|
||||
|
||||
trimmed := bytes.TrimSpace(raw)
|
||||
if len(trimmed) == 0 {
|
||||
return ParsedTranscript{}, fmt.Errorf("normalize input is empty")
|
||||
}
|
||||
|
||||
switch trimmed[0] {
|
||||
case '{':
|
||||
return parseObjectShape(trimmed)
|
||||
case '[':
|
||||
segments, err := parseSegmentsArray(trimmed)
|
||||
if err != nil {
|
||||
return ParsedTranscript{}, err
|
||||
}
|
||||
return ParsedTranscript{
|
||||
Shape: ShapeBareSegmentsArray,
|
||||
Segments: segments,
|
||||
}, nil
|
||||
default:
|
||||
return ParsedTranscript{}, fmt.Errorf("normalize input must be a top-level object with \"segments\" or a top-level segment array")
|
||||
}
|
||||
}
|
||||
|
||||
// ensureSingleValue verifies that decoder has no further JSON value to yield.
//
// It attempts one more Decode: io.EOF means the input is exhausted and nil is
// returned; a successful decode means a second top-level value exists and is
// reported as an error; any other decode failure is wrapped and returned.
func ensureSingleValue(decoder *json.Decoder) error {
	var trailing json.RawMessage
	switch err := decoder.Decode(&trailing); err {
	case io.EOF:
		return nil
	case nil:
		return fmt.Errorf("normalize input must contain exactly one top-level JSON value")
	default:
		return fmt.Errorf("decode normalize input JSON: %w", err)
	}
}
|
||||
|
||||
func parseObjectShape(raw []byte) (ParsedTranscript, error) {
|
||||
var object map[string]json.RawMessage
|
||||
if err := json.Unmarshal(raw, &object); err != nil {
|
||||
return ParsedTranscript{}, fmt.Errorf("decode normalize object input: %w", err)
|
||||
}
|
||||
|
||||
segmentsRaw, exists := object["segments"]
|
||||
if !exists {
|
||||
return ParsedTranscript{}, fmt.Errorf("normalize object input must contain a \"segments\" field")
|
||||
}
|
||||
|
||||
segments, err := parseSegmentsArray(segmentsRaw)
|
||||
if err != nil {
|
||||
return ParsedTranscript{}, err
|
||||
}
|
||||
|
||||
return ParsedTranscript{
|
||||
Shape: ShapeObjectWithSegments,
|
||||
Segments: segments,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func parseSegmentsArray(raw []byte) ([]InputSegment, error) {
|
||||
var segmentValues []json.RawMessage
|
||||
if err := json.Unmarshal(raw, &segmentValues); err != nil {
|
||||
return nil, fmt.Errorf("normalize input \"segments\" must be an array")
|
||||
}
|
||||
|
||||
segments := make([]InputSegment, len(segmentValues))
|
||||
for index, segmentRaw := range segmentValues {
|
||||
segment, err := parseSegment(index, segmentRaw)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
segments[index] = segment
|
||||
}
|
||||
return segments, nil
|
||||
}
|
||||
|
||||
func parseSegment(index int, raw []byte) (InputSegment, error) {
|
||||
var payload inputSegmentPayload
|
||||
if err := json.Unmarshal(raw, &payload); err != nil {
|
||||
return InputSegment{}, fmt.Errorf("segment %d: invalid segment object: %w", index, err)
|
||||
}
|
||||
|
||||
if payload.Start == nil {
|
||||
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"start\"", index)
|
||||
}
|
||||
if payload.End == nil {
|
||||
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"end\"", index)
|
||||
}
|
||||
if payload.Speaker == nil {
|
||||
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"speaker\"", index)
|
||||
}
|
||||
if payload.Text == nil {
|
||||
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"text\"", index)
|
||||
}
|
||||
|
||||
if *payload.Start < 0 {
|
||||
return InputSegment{}, fmt.Errorf("segment %d has start %v; start must be >= 0", index, *payload.Start)
|
||||
}
|
||||
if *payload.End < *payload.Start {
|
||||
return InputSegment{}, fmt.Errorf("segment %d has end %v before start %v", index, *payload.End, *payload.Start)
|
||||
}
|
||||
|
||||
speaker := strings.TrimSpace(*payload.Speaker)
|
||||
if speaker == "" {
|
||||
return InputSegment{}, fmt.Errorf("segment %d has empty \"speaker\"; speaker must be non-empty", index)
|
||||
}
|
||||
|
||||
return InputSegment{
|
||||
InputIndex: index,
|
||||
OriginalID: payload.ID,
|
||||
Start: *payload.Start,
|
||||
End: *payload.End,
|
||||
Speaker: speaker,
|
||||
Text: *payload.Text,
|
||||
Categories: append([]string(nil), payload.Categories...),
|
||||
Source: payload.Source,
|
||||
SourceSegmentIndex: payload.SourceSegmentIndex,
|
||||
SourceRef: payload.SourceRef,
|
||||
DerivedFrom: append([]string(nil), payload.DerivedFrom...),
|
||||
OverlapGroupID: payload.OverlapGroupID,
|
||||
}, nil
|
||||
}
|
||||
Reference in New Issue
Block a user