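// Package model declares the raw, canonical, merged, and final transcript
// types, together with the deterministic ordering applied to segments.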
package model
// RawTranscript is a loaded input document before canonical normalization.
|
|
type RawTranscript struct {
|
|
Source string `json:"source"`
|
|
Segments []RawSegment `json:"segments"`
|
|
}
// RawSegment is the supported WhisperX segment subset.
|
|
type RawSegment struct {
|
|
Start float64 `json:"start"`
|
|
End float64 `json:"end"`
|
|
Text string `json:"text"`
|
|
Words []Word `json:"words,omitempty"`
|
|
}
// CanonicalTranscript is a per-speaker transcript in seriatim's internal model.
|
|
type CanonicalTranscript struct {
|
|
Source string `json:"source"`
|
|
Segments []Segment `json:"segments"`
|
|
}
// MergedTranscript is the globally merged in-memory transcript.
|
|
type MergedTranscript struct {
|
|
Segments []Segment `json:"segments"`
|
|
OverlapGroups []OverlapGroup `json:"overlap_groups"`
|
|
}
// FinalTranscript is the serialized transcript artifact.
|
|
type FinalTranscript struct {
|
|
Metadata OutputMetadata `json:"metadata"`
|
|
Segments []Segment `json:"segments"`
|
|
OverlapGroups []OverlapGroup `json:"overlap_groups"`
|
|
}
// OutputMetadata captures the pipeline configuration under which an artifact
// was produced. None of the fields use omitempty, so every key is always
// written; nil slices encode as null.
type OutputMetadata struct {
	// Application is encoded under "application".
	// NOTE(review): presumably the producing program's name — confirm.
	Application string `json:"application"`
	// Version is encoded under "version".
	// NOTE(review): presumably the producing program's version — confirm.
	Version string `json:"version"`
	// InputReader names the reader used for the inputs.
	InputReader string `json:"input_reader"`
	// InputFiles lists the input files.
	InputFiles []string `json:"input_files"`
	// PreprocessingModules lists the preprocessing modules.
	PreprocessingModules []string `json:"preprocessing_modules"`
	// PostprocessingModules lists the postprocessing modules.
	PostprocessingModules []string `json:"postprocessing_modules"`
	// OutputModules lists the output modules.
	OutputModules []string `json:"output_modules"`
}
// Segment is the canonical transcript segment shape used by the framework.
|
|
type Segment struct {
|
|
ID int `json:"id,omitempty"`
|
|
InternalRef string `json:"internal_ref,omitempty"`
|
|
Source string `json:"source"`
|
|
SourceSegmentIndex *int `json:"source_segment_index,omitempty"`
|
|
SourceRef string `json:"source_ref,omitempty"`
|
|
DerivedFrom []string `json:"derived_from,omitempty"`
|
|
Speaker string `json:"speaker"`
|
|
Start float64 `json:"start"`
|
|
End float64 `json:"end"`
|
|
Text string `json:"text"`
|
|
Categories []string `json:"categories,omitempty"`
|
|
Words []Word `json:"words,omitempty"`
|
|
OverlapGroupID int `json:"overlap_group_id,omitempty"`
|
|
}
// Word carries optional word-level timing information.
type Word struct {
	// Text is the word's text.
	Text string `json:"text"`
	// Start is the word's start time.
	Start float64 `json:"start"`
	// End is the word's end time.
	End float64 `json:"end"`
	// Score is left out of the JSON output when zero.
	// NOTE(review): presumably a confidence value from the recognizer — confirm.
	Score float64 `json:"score,omitempty"`
	// Speaker is left out of the JSON output when empty.
	Speaker string `json:"speaker,omitempty"`
	// Timed is excluded from JSON encoding and decoding (tag "-").
	// NOTE(review): presumably records whether Start/End were present in the input — confirm.
	Timed bool `json:"-"`
}
// OverlapGroup describes a region in which overlapping speech was detected.
// None of the fields use omitempty, so every key is always written; nil
// slices encode as null.
type OverlapGroup struct {
	// ID is the group's identifier.
	ID int `json:"id"`
	// Start is the region's start time.
	Start float64 `json:"start"`
	// End is the region's end time.
	End float64 `json:"end"`
	// Segments lists the group's segments as strings.
	// NOTE(review): presumably segment references rather than text — confirm.
	Segments []string `json:"segments"`
	// Speakers lists the speakers in the group.
	Speakers []string `json:"speakers"`
	// Class is encoded under "class"; its allowed values are not defined here.
	Class string `json:"class"`
	// Resolution is encoded under "resolution"; its allowed values are not
	// defined here.
	Resolution string `json:"resolution"`
}
// SegmentLess defines the deterministic chronological ordering used by merge
|
|
// and postprocessing modules.
|
|
func SegmentLess(left Segment, right Segment) bool {
|
|
if left.Start != right.Start {
|
|
return left.Start < right.Start
|
|
}
|
|
if left.End != right.End {
|
|
return left.End < right.End
|
|
}
|
|
if left.Source != right.Source {
|
|
return left.Source < right.Source
|
|
}
|
|
if left.SourceSegmentIndex != nil && right.SourceSegmentIndex != nil && *left.SourceSegmentIndex != *right.SourceSegmentIndex {
|
|
return *left.SourceSegmentIndex < *right.SourceSegmentIndex
|
|
}
|
|
if left.SourceSegmentIndex == nil && right.SourceSegmentIndex != nil {
|
|
return false
|
|
}
|
|
if left.SourceSegmentIndex != nil && right.SourceSegmentIndex == nil {
|
|
return true
|
|
}
|
|
if left.SourceRef != right.SourceRef {
|
|
return left.SourceRef < right.SourceRef
|
|
}
|
|
return left.Speaker < right.Speaker
|
|
}