180 lines
5.8 KiB
Go
180 lines
5.8 KiB
Go
package builtin
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
|
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
|
|
)
|
|
|
|
// validateRaw is the stateless preprocessing module registered under the
// name "validate-raw".
type validateRaw struct{}

// Name returns the module identifier, "validate-raw".
func (validateRaw) Name() string {
	return "validate-raw"
}
|
|
|
|
func (validateRaw) Requires() pipeline.ModelState {
|
|
return pipeline.StateRaw
|
|
}
|
|
|
|
func (validateRaw) Produces() pipeline.ModelState {
|
|
return pipeline.StateRaw
|
|
}
|
|
|
|
func (validateRaw) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
|
if err := ctx.Err(); err != nil {
|
|
return pipeline.PreprocessState{}, nil, err
|
|
}
|
|
if in.State != pipeline.StateRaw {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "validate-raw", pipeline.StateRaw, in.State)
|
|
}
|
|
if len(in.Raw) == 0 {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: no raw transcript(s) to validate")
|
|
}
|
|
|
|
for transcriptIndex, transcript := range in.Raw {
|
|
if strings.TrimSpace(transcript.Source) == "" {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %d has empty source", transcriptIndex)
|
|
}
|
|
for segmentIndex, segment := range transcript.Segments {
|
|
if segment.Start < 0 {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has negative start", transcript.Source, segmentIndex)
|
|
}
|
|
if segment.End < segment.Start {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has end before start", transcript.Source, segmentIndex)
|
|
}
|
|
for wordIndex, word := range segment.Words {
|
|
if !word.Timed {
|
|
continue
|
|
}
|
|
if word.Start < 0 {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has negative start", transcript.Source, segmentIndex, wordIndex)
|
|
}
|
|
if word.End < word.Start {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has end before start", transcript.Source, segmentIndex, wordIndex)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return in, []report.Event{
|
|
report.Info("preprocessing", "validate-raw", fmt.Sprintf("validated %d raw transcript(s)", len(in.Raw))),
|
|
}, nil
|
|
}
|
|
|
|
// trimText is the stateless preprocessing module registered under the name
// "trim-text".
type trimText struct{}

// Name returns the module identifier, "trim-text".
func (trimText) Name() string {
	return "trim-text"
}
|
|
|
|
func (trimText) Requires() pipeline.ModelState {
|
|
return pipeline.StateCanonical
|
|
}
|
|
|
|
func (trimText) Produces() pipeline.ModelState {
|
|
return pipeline.StateCanonical
|
|
}
|
|
|
|
func (trimText) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
|
if err := ctx.Err(); err != nil {
|
|
return pipeline.PreprocessState{}, nil, err
|
|
}
|
|
if in.State != pipeline.StateCanonical {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "trim-text", pipeline.StateCanonical, in.State)
|
|
}
|
|
|
|
for transcriptIndex := range in.Canonical {
|
|
for segmentIndex := range in.Canonical[transcriptIndex].Segments {
|
|
in.Canonical[transcriptIndex].Segments[segmentIndex].Text = strings.TrimSpace(in.Canonical[transcriptIndex].Segments[segmentIndex].Text)
|
|
}
|
|
}
|
|
|
|
return in, []report.Event{
|
|
report.Info("preprocessing", "trim-text", "trimmed canonical segment text"),
|
|
}, nil
|
|
}
|
|
|
|
// normalizeSpeakers is the stateless preprocessing module registered under
// the name "normalize-speakers".
type normalizeSpeakers struct{}

// Name returns the module identifier, "normalize-speakers".
func (normalizeSpeakers) Name() string {
	return "normalize-speakers"
}
|
|
|
|
func (normalizeSpeakers) Requires() pipeline.ModelState {
|
|
return pipeline.StateRaw
|
|
}
|
|
|
|
func (normalizeSpeakers) Produces() pipeline.ModelState {
|
|
return pipeline.StateCanonical
|
|
}
|
|
|
|
func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
|
if err := ctx.Err(); err != nil {
|
|
return pipeline.PreprocessState{}, nil, err
|
|
}
|
|
if in.State != pipeline.StateRaw {
|
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
|
}
|
|
|
|
var speakers speaker.Map
|
|
useSpeakerMap := cfg.SpeakersFile != ""
|
|
if useSpeakerMap {
|
|
var err error
|
|
speakers, err = speaker.LoadMap(cfg.SpeakersFile)
|
|
if err != nil {
|
|
return pipeline.PreprocessState{}, nil, err
|
|
}
|
|
}
|
|
|
|
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
|
for _, raw := range in.Raw {
|
|
canonicalSpeaker := filepath.Base(raw.Source)
|
|
if useSpeakerMap {
|
|
var err error
|
|
canonicalSpeaker, err = speakers.SpeakerForSource(raw.Source)
|
|
if err != nil {
|
|
return pipeline.PreprocessState{}, nil, err
|
|
}
|
|
}
|
|
|
|
segments := make([]model.Segment, 0, len(raw.Segments))
|
|
for index, rawSegment := range raw.Segments {
|
|
sourceSegmentIndex := index
|
|
segments = append(segments, model.Segment{
|
|
Source: raw.Source,
|
|
SourceSegmentIndex: &sourceSegmentIndex,
|
|
Speaker: canonicalSpeaker,
|
|
Start: rawSegment.Start,
|
|
End: rawSegment.End,
|
|
Text: rawSegment.Text,
|
|
Words: append([]model.Word(nil), rawSegment.Words...),
|
|
})
|
|
}
|
|
|
|
canonical = append(canonical, model.CanonicalTranscript{
|
|
Source: raw.Source,
|
|
Segments: segments,
|
|
})
|
|
}
|
|
|
|
message := "created canonical transcript(s) from raw input"
|
|
if !useSpeakerMap {
|
|
message = "created canonical transcript(s) using input basenames as speaker labels"
|
|
}
|
|
|
|
return pipeline.PreprocessState{
|
|
State: pipeline.StateCanonical,
|
|
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
|
Canonical: canonical,
|
|
}, []report.Event{
|
|
report.Info("preprocessing", "normalize-speakers", message),
|
|
}, nil
|
|
}
|