Added initial segment overlap resolution logic

This commit is contained in:
2026-04-27 15:52:53 -05:00
parent e42a2326e8
commit 1b9f4bd922
16 changed files with 1357 additions and 59 deletions

View File

@@ -23,17 +23,18 @@ func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.Raw
}
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
events := make([]report.Event, 0, len(cfg.InputFiles)+1)
for _, inputFile := range cfg.InputFiles {
transcript, err := readRawTranscript(inputFile)
transcript, newEvents, err := readRawTranscript(inputFile)
if err != nil {
return nil, nil, err
}
raw = append(raw, transcript)
events = append(events, newEvents...)
}
return raw, []report.Event{
report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))),
}, nil
events = append(events, report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))))
return raw, events, nil
}
type rawTranscriptFile struct {
@@ -44,70 +45,163 @@ type rawSegmentFile struct {
Start json.RawMessage `json:"start"`
End json.RawMessage `json:"end"`
Text json.RawMessage `json:"text"`
Words json.RawMessage `json:"words"`
}
func readRawTranscript(path string) (model.RawTranscript, error) {
// rawWordFile mirrors one entry of a segment's "words" array as it appears in
// an input JSON file. Every field is kept as json.RawMessage so that
// parseRawWords can tell absent/null values apart from present ones and report
// a field-specific error when a value has the wrong JSON type.
type rawWordFile struct {
// Word is the word text; parseRawWords rejects entries where it is missing,
// null, or not a JSON string.
Word json.RawMessage `json:"word"`
// Start is optional; when present it must decode as a non-negative number.
Start json.RawMessage `json:"start"`
// End is optional; when present it must decode as a non-negative number.
End json.RawMessage `json:"end"`
// Score is optional; when present it is decoded into model.Word.Score.
Score json.RawMessage `json:"score"`
// Speaker is optional; when present it is decoded into model.Word.Speaker.
Speaker json.RawMessage `json:"speaker"`
}
func readRawTranscript(path string) (model.RawTranscript, []report.Event, error) {
data, err := os.ReadFile(path)
if err != nil {
return model.RawTranscript{}, fmt.Errorf("read input file %q: %w", path, err)
return model.RawTranscript{}, nil, fmt.Errorf("read input file %q: %w", path, err)
}
var parsed rawTranscriptFile
if err := json.Unmarshal(data, &parsed); err != nil {
return model.RawTranscript{}, fmt.Errorf("parse input file %q: %w", path, err)
return model.RawTranscript{}, nil, fmt.Errorf("parse input file %q: %w", path, err)
}
if parsed.Segments == nil || isJSONNull(parsed.Segments) {
return model.RawTranscript{}, fmt.Errorf("input file %q must contain top-level segments array", path)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q must contain top-level segments array", path)
}
var rawSegments []rawSegmentFile
if err := json.Unmarshal(parsed.Segments, &rawSegments); err != nil {
return model.RawTranscript{}, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
}
segments := make([]model.RawSegment, 0, len(rawSegments))
events := make([]report.Event, 0)
for index, segment := range rawSegments {
if segment.Start == nil || isJSONNull(segment.Start) {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric start", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric start", path, index)
}
if segment.End == nil || isJSONNull(segment.End) {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric end", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric end", path, index)
}
if segment.Text == nil || isJSONNull(segment.Text) {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing string text", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing string text", path, index)
}
var start float64
if err := json.Unmarshal(segment.Start, &start); err != nil {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d start must be numeric", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d start must be numeric", path, index)
}
var end float64
if err := json.Unmarshal(segment.End, &end); err != nil {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d end must be numeric", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d end must be numeric", path, index)
}
var text string
if err := json.Unmarshal(segment.Text, &text); err != nil {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d text must be a string", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d text must be a string", path, index)
}
if start < 0 {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has negative start", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has negative start", path, index)
}
if end < start {
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has end before start", path, index)
return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has end before start", path, index)
}
words, newEvents, err := parseRawWords(path, index, segment.Words)
if err != nil {
return model.RawTranscript{}, nil, err
}
events = append(events, newEvents...)
segments = append(segments, model.RawSegment{
Start: start,
End: end,
Text: text,
Words: words,
})
}
return model.RawTranscript{
Source: path,
Segments: segments,
}, nil
}, events, nil
}
// parseRawWords decodes the optional "words" array of a single segment.
//
// path and segmentIndex are used only to build error and warning messages.
// When raw is absent or JSON null the function returns (nil, nil, nil).
// Otherwise each entry must carry a string "word"; "start" and "end" are
// optional but, when present, must be non-negative numbers, and when both are
// present end must not precede start. A word that lacks either bound is kept
// untimed and a warning event is emitted for it. "score" and "speaker" are
// decoded only when present. The first validation failure aborts parsing and
// is returned as the error, with nil words and nil events.
func parseRawWords(path string, segmentIndex int, raw json.RawMessage) ([]model.Word, []report.Event, error) {
	// present reports whether a raw JSON field was supplied with a non-null value.
	present := func(field json.RawMessage) bool {
		return field != nil && !isJSONNull(field)
	}

	if !present(raw) {
		return nil, nil, nil
	}

	var entries []rawWordFile
	if err := json.Unmarshal(raw, &entries); err != nil {
		return nil, nil, fmt.Errorf("input file %q segment %d words must be an array: %w", path, segmentIndex, err)
	}

	parsed := make([]model.Word, 0, len(entries))
	warnings := []report.Event{}

	for wordIndex := range entries {
		entry := entries[wordIndex]

		if !present(entry.Word) {
			return nil, nil, fmt.Errorf("input file %q segment %d word %d missing string word", path, segmentIndex, wordIndex)
		}
		var token string
		if err := json.Unmarshal(entry.Word, &token); err != nil {
			return nil, nil, fmt.Errorf("input file %q segment %d word %d word must be a string", path, segmentIndex, wordIndex)
		}

		var word model.Word
		word.Text = token

		// Each bound is validated on its own, even if the other one is missing.
		startGiven, endGiven := present(entry.Start), present(entry.End)
		var begin, finish float64
		if startGiven {
			if err := json.Unmarshal(entry.Start, &begin); err != nil {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d start must be numeric", path, segmentIndex, wordIndex)
			}
			if begin < 0 {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative start", path, segmentIndex, wordIndex)
			}
		}
		if endGiven {
			if err := json.Unmarshal(entry.End, &finish); err != nil {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d end must be numeric", path, segmentIndex, wordIndex)
			}
			if finish < 0 {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative end", path, segmentIndex, wordIndex)
			}
		}

		switch {
		case !startGiven || !endGiven:
			// Incomplete timing is tolerated: the word stays untimed and is reported.
			warnings = append(warnings, report.Warning(
				"input",
				"json-files",
				fmt.Sprintf("input file %q segment %d word %d %q has no complete timing and will not anchor overlap resolution", path, segmentIndex, wordIndex, token),
			))
		case finish < begin:
			return nil, nil, fmt.Errorf("input file %q segment %d word %d has end before start", path, segmentIndex, wordIndex)
		default:
			word.Start = begin
			word.End = finish
			word.Timed = true
		}

		if present(entry.Score) {
			if err := json.Unmarshal(entry.Score, &word.Score); err != nil {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d score must be numeric", path, segmentIndex, wordIndex)
			}
		}
		if present(entry.Speaker) {
			if err := json.Unmarshal(entry.Speaker, &word.Speaker); err != nil {
				return nil, nil, fmt.Errorf("input file %q segment %d word %d speaker must be a string", path, segmentIndex, wordIndex)
			}
		}

		parsed = append(parsed, word)
	}

	return parsed, warnings, nil
}
func isJSONNull(value json.RawMessage) bool {

View File

@@ -26,21 +26,7 @@ func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscri
}
sort.SliceStable(segments, func(i, j int) bool {
left := segments[i]
right := segments[j]
if left.Start != right.Start {
return left.Start < right.Start
}
if left.End != right.End {
return left.End < right.End
}
if left.Source != right.Source {
return left.Source < right.Source
}
if left.SourceSegmentIndex != right.SourceSegmentIndex {
return left.SourceSegmentIndex < right.SourceSegmentIndex
}
return left.Speaker < right.Speaker
return model.SegmentLess(segments[i], segments[j])
})
return model.MergedTranscript{

View File

@@ -66,6 +66,37 @@ func (detectOverlaps) Process(ctx context.Context, in model.MergedTranscript, cf
}, nil
}
// resolveOverlaps is the stateless pipeline stage that hands a merged
// transcript to overlap.Resolve and reports what that call changed.
type resolveOverlaps struct{}

// Name returns the identifier of this stage, "resolve-overlaps".
func (resolveOverlaps) Name() string {
	return "resolve-overlaps"
}

// Process runs overlap.Resolve on in, passing cfg.OverlapWordRunGap, and
// returns the resolved transcript together with a single info event that
// summarises the counts reported by the resolver.
//
// If ctx is already cancelled or expired, or if overlap.Resolve fails, the
// zero MergedTranscript, nil events, and the corresponding error are returned.
func (resolveOverlaps) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
	if ctxErr := ctx.Err(); ctxErr != nil {
		return model.MergedTranscript{}, nil, ctxErr
	}

	out, stats, resolveErr := overlap.Resolve(in, cfg.OverlapWordRunGap)
	if resolveErr != nil {
		return model.MergedTranscript{}, nil, resolveErr
	}

	message := fmt.Sprintf(
		"processed %d overlap group(s); changed %d; removed %d original segment(s); created %d replacement segment(s)",
		stats.GroupsProcessed,
		stats.GroupsChanged,
		stats.OriginalsRemoved,
		stats.ReplacementsCreated,
	)
	events := []report.Event{
		report.Info("postprocessing", "resolve-overlaps", message),
	}
	return out, events, nil
}
type autocorrectPostprocessor struct{}
func (autocorrectPostprocessor) Name() string {

View File

@@ -123,13 +123,15 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat
segments := make([]model.Segment, 0, len(raw.Segments))
for index, rawSegment := range raw.Segments {
sourceSegmentIndex := index
segments = append(segments, model.Segment{
Source: raw.Source,
SourceSegmentIndex: index,
SourceSegmentIndex: &sourceSegmentIndex,
Speaker: canonicalSpeaker,
Start: rawSegment.Start,
End: rawSegment.End,
Text: rawSegment.Text,
Words: append([]model.Word(nil), rawSegment.Words...),
})
}

View File

@@ -12,7 +12,7 @@ func NewRegistry() *pipeline.Registry {
registry.RegisterPreprocessor(trimText{})
registry.RegisterMerger(placeholderMerger{})
registry.RegisterPostprocessor(detectOverlaps{})
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
registry.RegisterPostprocessor(resolveOverlaps{})
registry.RegisterPostprocessor(assignIDs{})
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
registry.RegisterPostprocessor(autocorrectPostprocessor{})