Implemented an initial transcript merge stage

This commit is contained in:
2026-04-26 13:57:13 -05:00
parent c32349a017
commit 18f1873776
8 changed files with 535 additions and 28 deletions

View File

@@ -2,7 +2,9 @@ package builtin
import (
"context"
"encoding/json"
"fmt"
"os"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
@@ -22,10 +24,92 @@ func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.Raw
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
for _, inputFile := range cfg.InputFiles {
raw = append(raw, model.RawTranscript{Source: inputFile})
transcript, err := readRawTranscript(inputFile)
if err != nil {
return nil, nil, err
}
raw = append(raw, transcript)
}
return raw, []report.Event{
report.Info("input", "json-files", fmt.Sprintf("accepted %d input file(s)", len(raw))),
report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))),
}, nil
}
// rawTranscriptFile is the decode target for the top level of an input file.
// The "segments" value is kept as undecoded JSON so that the reader can tell a
// missing or null value apart from one that is present but not an array.
type rawTranscriptFile struct {
	Segments json.RawMessage `json:"segments"`
}
// rawSegmentFile is the decode target for one element of the "segments" array.
// Each field is kept as undecoded JSON so that presence (absent or null) and
// type (number or string) can be validated as separate steps.
type rawSegmentFile struct {
	Start json.RawMessage `json:"start"`
	End   json.RawMessage `json:"end"`
	Text  json.RawMessage `json:"text"`
}
// readRawTranscript loads the JSON file at path and converts its top-level
// "segments" array into a model.RawTranscript whose Source is path.
//
// Each segment must carry non-null "start", "end" and "text" values. Start and
// end must decode as JSON numbers and text as a JSON string; start must not be
// negative and end must not be smaller than start. The first violation stops
// the read and is reported as an error naming the file and, for per-segment
// problems, the zero-based segment index. On any error the returned transcript
// is the zero value.
func readRawTranscript(path string) (model.RawTranscript, error) {
	var empty model.RawTranscript

	contents, err := os.ReadFile(path)
	if err != nil {
		return empty, fmt.Errorf("read input file %q: %w", path, err)
	}

	var file rawTranscriptFile
	if err = json.Unmarshal(contents, &file); err != nil {
		return empty, fmt.Errorf("parse input file %q: %w", path, err)
	}
	if file.Segments == nil || isJSONNull(file.Segments) {
		return empty, fmt.Errorf("input file %q must contain top-level segments array", path)
	}

	var entries []rawSegmentFile
	if err = json.Unmarshal(file.Segments, &entries); err != nil {
		return empty, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
	}

	decoded := make([]model.RawSegment, 0, len(entries))
	for i := range entries {
		entry := &entries[i]

		// Presence of all three fields is verified before any of them is
		// decoded, so a missing field is reported ahead of a wrongly typed one.
		switch {
		case entry.Start == nil || isJSONNull(entry.Start):
			return empty, fmt.Errorf("input file %q segment %d missing numeric start", path, i)
		case entry.End == nil || isJSONNull(entry.End):
			return empty, fmt.Errorf("input file %q segment %d missing numeric end", path, i)
		case entry.Text == nil || isJSONNull(entry.Text):
			return empty, fmt.Errorf("input file %q segment %d missing string text", path, i)
		}

		var (
			begin, finish float64
			body          string
		)
		if json.Unmarshal(entry.Start, &begin) != nil {
			return empty, fmt.Errorf("input file %q segment %d start must be numeric", path, i)
		}
		if json.Unmarshal(entry.End, &finish) != nil {
			return empty, fmt.Errorf("input file %q segment %d end must be numeric", path, i)
		}
		if json.Unmarshal(entry.Text, &body) != nil {
			return empty, fmt.Errorf("input file %q segment %d text must be a string", path, i)
		}

		// Range checks run only once every field has decoded cleanly.
		switch {
		case begin < 0:
			return empty, fmt.Errorf("input file %q segment %d has negative start", path, i)
		case finish < begin:
			return empty, fmt.Errorf("input file %q segment %d has end before start", path, i)
		}

		decoded = append(decoded, model.RawSegment{Start: begin, End: finish, Text: body})
	}

	return model.RawTranscript{Source: path, Segments: decoded}, nil
}
// isJSONNull reports whether value is exactly the JSON literal null. The
// comparison is byte-for-byte: surrounding whitespace, different casing, a nil
// or an empty value all yield false.
func isJSONNull(value json.RawMessage) bool {
	const nullLiteral = "null"
	return string(value) == nullLiteral
}