package builtin import ( "context" "encoding/json" "fmt" "os" "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/report" ) type jsonFilesReader struct{} func (jsonFilesReader) Name() string { return "json-files" } func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.RawTranscript, []report.Event, error) { if err := ctx.Err(); err != nil { return nil, nil, err } raw := make([]model.RawTranscript, 0, len(cfg.InputFiles)) events := make([]report.Event, 0, len(cfg.InputFiles)+1) for _, inputFile := range cfg.InputFiles { transcript, newEvents, err := readRawTranscript(inputFile) if err != nil { return nil, nil, err } raw = append(raw, transcript) events = append(events, newEvents...) } events = append(events, report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw)))) return raw, events, nil } type rawTranscriptFile struct { Segments json.RawMessage `json:"segments"` } type rawSegmentFile struct { Start json.RawMessage `json:"start"` End json.RawMessage `json:"end"` Text json.RawMessage `json:"text"` Words json.RawMessage `json:"words"` } type rawWordFile struct { Word json.RawMessage `json:"word"` Start json.RawMessage `json:"start"` End json.RawMessage `json:"end"` Score json.RawMessage `json:"score"` Speaker json.RawMessage `json:"speaker"` } func readRawTranscript(path string) (model.RawTranscript, []report.Event, error) { data, err := os.ReadFile(path) if err != nil { return model.RawTranscript{}, nil, fmt.Errorf("read input file %q: %w", path, err) } var parsed rawTranscriptFile if err := json.Unmarshal(data, &parsed); err != nil { return model.RawTranscript{}, nil, fmt.Errorf("parse input file %q: %w", path, err) } if parsed.Segments == nil || isJSONNull(parsed.Segments) { return model.RawTranscript{}, nil, fmt.Errorf("input file %q must contain top-level segments array", path) } var rawSegments []rawSegmentFile if err := json.Unmarshal(parsed.Segments, &rawSegments); err != nil { return model.RawTranscript{}, nil, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err) } segments := make([]model.RawSegment, 0, len(rawSegments)) events := make([]report.Event, 0) for index, segment := range rawSegments { if segment.Start == nil || isJSONNull(segment.Start) { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric start", path, index) } if segment.End == nil || isJSONNull(segment.End) { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric end", path, index) } if segment.Text == nil || isJSONNull(segment.Text) { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing string text", path, index) } var start float64 if err := json.Unmarshal(segment.Start, &start); err != nil { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d start must be numeric", path, index) } var end float64 if err := json.Unmarshal(segment.End, &end); err != nil { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d end must be numeric", path, index) } var text string if err := json.Unmarshal(segment.Text, &text); err != nil { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d text must be a string", path, index) } if start < 0 { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has negative start", path, index) } if end < start { return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has end before start", path, index) } words, newEvents, err := parseRawWords(path, index, segment.Words) if err != nil { return model.RawTranscript{}, nil, err } events = append(events, newEvents...) segments = append(segments, model.RawSegment{ Start: start, End: end, Text: text, Words: words, }) } return model.RawTranscript{ Source: path, Segments: segments, }, events, nil } func parseRawWords(path string, segmentIndex int, raw json.RawMessage) ([]model.Word, []report.Event, error) { if raw == nil || isJSONNull(raw) { return nil, nil, nil } var rawWords []rawWordFile if err := json.Unmarshal(raw, &rawWords); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d words must be an array: %w", path, segmentIndex, err) } words := make([]model.Word, 0, len(rawWords)) events := make([]report.Event, 0) for wordIndex, rawWord := range rawWords { if rawWord.Word == nil || isJSONNull(rawWord.Word) { return nil, nil, fmt.Errorf("input file %q segment %d word %d missing string word", path, segmentIndex, wordIndex) } var text string if err := json.Unmarshal(rawWord.Word, &text); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d word %d word must be a string", path, segmentIndex, wordIndex) } word := model.Word{ Text: text, } hasStart := rawWord.Start != nil && !isJSONNull(rawWord.Start) hasEnd := rawWord.End != nil && !isJSONNull(rawWord.End) var start float64 var end float64 if hasStart { if err := json.Unmarshal(rawWord.Start, &start); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d word %d start must be numeric", path, segmentIndex, wordIndex) } if start < 0 { return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative start", path, segmentIndex, wordIndex) } } if hasEnd { if err := json.Unmarshal(rawWord.End, &end); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d word %d end must be numeric", path, segmentIndex, wordIndex) } if end < 0 { return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative end", path, segmentIndex, wordIndex) } } if hasStart && hasEnd { if end < start { return nil, nil, fmt.Errorf("input file %q segment %d word %d has end before start", path, segmentIndex, wordIndex) } word.Start = start word.End = end word.Timed = true } else { events = append(events, report.Warning( "input", "json-files", fmt.Sprintf("input file %q segment %d word %d %q has no complete timing and will not anchor overlap resolution", path, segmentIndex, wordIndex, text), )) } if rawWord.Score != nil && !isJSONNull(rawWord.Score) { if err := json.Unmarshal(rawWord.Score, &word.Score); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d word %d score must be numeric", path, segmentIndex, wordIndex) } } if rawWord.Speaker != nil && !isJSONNull(rawWord.Speaker) { if err := json.Unmarshal(rawWord.Speaker, &word.Speaker); err != nil { return nil, nil, fmt.Errorf("input file %q segment %d word %d speaker must be a string", path, segmentIndex, wordIndex) } } words = append(words, word) } return words, events, nil } func isJSONNull(value json.RawMessage) bool { return string(value) == "null" }