|
|
|
|
@@ -23,17 +23,18 @@ func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.Raw
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
|
|
|
|
|
events := make([]report.Event, 0, len(cfg.InputFiles)+1)
|
|
|
|
|
for _, inputFile := range cfg.InputFiles {
|
|
|
|
|
transcript, err := readRawTranscript(inputFile)
|
|
|
|
|
transcript, newEvents, err := readRawTranscript(inputFile)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, nil, err
|
|
|
|
|
}
|
|
|
|
|
raw = append(raw, transcript)
|
|
|
|
|
events = append(events, newEvents...)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return raw, []report.Event{
|
|
|
|
|
report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))),
|
|
|
|
|
}, nil
|
|
|
|
|
events = append(events, report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))))
|
|
|
|
|
return raw, events, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type rawTranscriptFile struct {
|
|
|
|
|
@@ -44,70 +45,163 @@ type rawSegmentFile struct {
|
|
|
|
|
Start json.RawMessage `json:"start"`
|
|
|
|
|
End json.RawMessage `json:"end"`
|
|
|
|
|
Text json.RawMessage `json:"text"`
|
|
|
|
|
Words json.RawMessage `json:"words"`
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func readRawTranscript(path string) (model.RawTranscript, error) {
|
|
|
|
|
// rawWordFile mirrors one entry of a segment's "words" array as it appears in
// an input file. Every field is kept as json.RawMessage so that the decoder
// can distinguish an absent or null value from one that has the wrong JSON
// type before committing to a concrete Go type.
type rawWordFile struct {
	// Word is the raw JSON of the "word" key (the word's text).
	Word json.RawMessage `json:"word"`
	// Start is the raw JSON of the "start" key.
	Start json.RawMessage `json:"start"`
	// End is the raw JSON of the "end" key.
	End json.RawMessage `json:"end"`
	// Score is the raw JSON of the "score" key.
	Score json.RawMessage `json:"score"`
	// Speaker is the raw JSON of the "speaker" key.
	Speaker json.RawMessage `json:"speaker"`
}
|
|
|
|
|
|
|
|
|
|
func readRawTranscript(path string) (model.RawTranscript, []report.Event, error) {
|
|
|
|
|
data, err := os.ReadFile(path)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("read input file %q: %w", path, err)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("read input file %q: %w", path, err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var parsed rawTranscriptFile
|
|
|
|
|
if err := json.Unmarshal(data, &parsed); err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("parse input file %q: %w", path, err)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("parse input file %q: %w", path, err)
|
|
|
|
|
}
|
|
|
|
|
if parsed.Segments == nil || isJSONNull(parsed.Segments) {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q must contain top-level segments array", path)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q must contain top-level segments array", path)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var rawSegments []rawSegmentFile
|
|
|
|
|
if err := json.Unmarshal(parsed.Segments, &rawSegments); err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
segments := make([]model.RawSegment, 0, len(rawSegments))
|
|
|
|
|
events := make([]report.Event, 0)
|
|
|
|
|
for index, segment := range rawSegments {
|
|
|
|
|
if segment.Start == nil || isJSONNull(segment.Start) {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric start", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric start", path, index)
|
|
|
|
|
}
|
|
|
|
|
if segment.End == nil || isJSONNull(segment.End) {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric end", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric end", path, index)
|
|
|
|
|
}
|
|
|
|
|
if segment.Text == nil || isJSONNull(segment.Text) {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing string text", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing string text", path, index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var start float64
|
|
|
|
|
if err := json.Unmarshal(segment.Start, &start); err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d start must be numeric", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d start must be numeric", path, index)
|
|
|
|
|
}
|
|
|
|
|
var end float64
|
|
|
|
|
if err := json.Unmarshal(segment.End, &end); err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d end must be numeric", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d end must be numeric", path, index)
|
|
|
|
|
}
|
|
|
|
|
var text string
|
|
|
|
|
if err := json.Unmarshal(segment.Text, &text); err != nil {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d text must be a string", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d text must be a string", path, index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if start < 0 {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has negative start", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has negative start", path, index)
|
|
|
|
|
}
|
|
|
|
|
if end < start {
|
|
|
|
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has end before start", path, index)
|
|
|
|
|
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has end before start", path, index)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
words, newEvents, err := parseRawWords(path, index, segment.Words)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return model.RawTranscript{}, nil, err
|
|
|
|
|
}
|
|
|
|
|
events = append(events, newEvents...)
|
|
|
|
|
|
|
|
|
|
segments = append(segments, model.RawSegment{
|
|
|
|
|
Start: start,
|
|
|
|
|
End: end,
|
|
|
|
|
Text: text,
|
|
|
|
|
Words: words,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return model.RawTranscript{
|
|
|
|
|
Source: path,
|
|
|
|
|
Segments: segments,
|
|
|
|
|
}, nil
|
|
|
|
|
}, events, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func parseRawWords(path string, segmentIndex int, raw json.RawMessage) ([]model.Word, []report.Event, error) {
|
|
|
|
|
if raw == nil || isJSONNull(raw) {
|
|
|
|
|
return nil, nil, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var rawWords []rawWordFile
|
|
|
|
|
if err := json.Unmarshal(raw, &rawWords); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d words must be an array: %w", path, segmentIndex, err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
words := make([]model.Word, 0, len(rawWords))
|
|
|
|
|
events := make([]report.Event, 0)
|
|
|
|
|
for wordIndex, rawWord := range rawWords {
|
|
|
|
|
if rawWord.Word == nil || isJSONNull(rawWord.Word) {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d missing string word", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var text string
|
|
|
|
|
if err := json.Unmarshal(rawWord.Word, &text); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d word must be a string", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
word := model.Word{
|
|
|
|
|
Text: text,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hasStart := rawWord.Start != nil && !isJSONNull(rawWord.Start)
|
|
|
|
|
hasEnd := rawWord.End != nil && !isJSONNull(rawWord.End)
|
|
|
|
|
var start float64
|
|
|
|
|
var end float64
|
|
|
|
|
if hasStart {
|
|
|
|
|
if err := json.Unmarshal(rawWord.Start, &start); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d start must be numeric", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
if start < 0 {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative start", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if hasEnd {
|
|
|
|
|
if err := json.Unmarshal(rawWord.End, &end); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d end must be numeric", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
if end < 0 {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative end", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if hasStart && hasEnd {
|
|
|
|
|
if end < start {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d has end before start", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
word.Start = start
|
|
|
|
|
word.End = end
|
|
|
|
|
word.Timed = true
|
|
|
|
|
} else {
|
|
|
|
|
events = append(events, report.Warning(
|
|
|
|
|
"input",
|
|
|
|
|
"json-files",
|
|
|
|
|
fmt.Sprintf("input file %q segment %d word %d %q has no complete timing and will not anchor overlap resolution", path, segmentIndex, wordIndex, text),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
if rawWord.Score != nil && !isJSONNull(rawWord.Score) {
|
|
|
|
|
if err := json.Unmarshal(rawWord.Score, &word.Score); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d score must be numeric", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if rawWord.Speaker != nil && !isJSONNull(rawWord.Speaker) {
|
|
|
|
|
if err := json.Unmarshal(rawWord.Speaker, &word.Speaker); err != nil {
|
|
|
|
|
return nil, nil, fmt.Errorf("input file %q segment %d word %d speaker must be a string", path, segmentIndex, wordIndex)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
words = append(words, word)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return words, events, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func isJSONNull(value json.RawMessage) bool {
|
|
|
|
|
|