Files
seriatim/internal/builtin/preprocess.go

140 lines
4.1 KiB
Go

package builtin
import (
"context"
"fmt"
"strings"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
)
// noopPreprocessor is a preprocessing module that leaves transcript data
// untouched and only moves the pipeline from one model state to another.
type noopPreprocessor struct {
	// name is the module name reported by Name and used in events and errors.
	name string
	// requires is the state the input must be in; produces is the state
	// stamped onto the output.
	requires, produces pipeline.ModelState
}
// Name returns the module name stored in the preprocessor's name field.
func (p noopPreprocessor) Name() string { return p.name }
// Requires returns the model state this module expects its input to be in.
func (p noopPreprocessor) Requires() pipeline.ModelState { return p.requires }
// Produces returns the model state this module sets on its output.
func (p noopPreprocessor) Produces() pipeline.ModelState { return p.produces }
// Process validates the incoming state and relabels it without touching any
// transcript data.
//
// It returns an empty state and an error when ctx is already done or when
// in.State differs from the module's required state. On success the input is
// returned with State set to the module's produced state, together with a
// single informational event. cfg is not used.
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	// Every failure path hands back the zero state.
	var zero pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return zero, nil, err
	}
	if got, want := in.State, p.requires; got != want {
		return zero, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, want, got)
	}

	out := in
	out.State = p.produces
	events := []report.Event{
		report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
	}
	return out, events, nil
}
// trimText is a stateless preprocessing module that trims surrounding
// whitespace from canonical segment text.
type trimText struct{}

// Name returns the fixed module name "trim-text".
func (trimText) Name() string { return "trim-text" }
// Requires reports that this module needs its input in the canonical state.
func (trimText) Requires() pipeline.ModelState { return pipeline.StateCanonical }
// Produces reports that this module leaves the pipeline in the canonical
// state.
func (trimText) Produces() pipeline.ModelState { return pipeline.StateCanonical }
// Process trims leading and trailing whitespace from the text of every
// canonical segment.
//
// The input must already be in the canonical state; the state is left
// unchanged. It returns an empty state and an error when ctx is already done
// or when in.State is not canonical. The transcript and segment slices are
// copied before trimming, so the backing arrays behind the caller's
// in.Canonical are not modified. cfg is not used.
func (trimText) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	if err := ctx.Err(); err != nil {
		return pipeline.PreprocessState{}, nil, err
	}
	if in.State != pipeline.StateCanonical {
		return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "trim-text", pipeline.StateCanonical, in.State)
	}

	// in is passed by value, but its slices still alias the caller's backing
	// arrays. Appending onto a zero-length, zero-capacity reslice clones the
	// elements while keeping a nil slice nil and an empty slice empty.
	canonical := append(in.Canonical[:0:0], in.Canonical...)
	for i := range canonical {
		segments := append(canonical[i].Segments[:0:0], canonical[i].Segments...)
		for j := range segments {
			segments[j].Text = strings.TrimSpace(segments[j].Text)
		}
		canonical[i].Segments = segments
	}
	in.Canonical = canonical

	return in, []report.Event{
		report.Info("preprocessing", "trim-text", "trimmed canonical segment text"),
	}, nil
}
// normalizeSpeakers is a stateless preprocessing module that builds canonical
// transcripts from raw ones, assigning each source its mapped speaker.
type normalizeSpeakers struct{}

// Name returns the fixed module name "normalize-speakers".
func (normalizeSpeakers) Name() string { return "normalize-speakers" }
// Requires reports that this module needs its input in the raw state.
func (normalizeSpeakers) Requires() pipeline.ModelState { return pipeline.StateRaw }
// Produces reports that this module moves the pipeline to the canonical
// state.
func (normalizeSpeakers) Produces() pipeline.ModelState { return pipeline.StateCanonical }
// Process builds one canonical transcript per raw transcript.
//
// The speaker map is loaded from cfg.SpeakersFile, and each raw transcript's
// source is resolved to a speaker that is stamped on all of its segments.
// Each canonical segment also records its source and its index within the raw
// transcript, and carries over the raw start, end and text values.
//
// It returns an empty state and an error when ctx is already done, when
// in.State is not raw, when the speaker map cannot be loaded, or when a source
// cannot be resolved to a speaker. On success the returned state is canonical,
// holds a copy of the in.Raw slice alongside the new canonical transcripts,
// and comes with a single informational event.
func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	// Every failure path hands back the zero state.
	var failed pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return failed, nil, err
	}
	if in.State != pipeline.StateRaw {
		return failed, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
	}

	speakerMap, err := speaker.LoadMap(cfg.SpeakersFile)
	if err != nil {
		return failed, nil, err
	}

	transcripts := make([]model.CanonicalTranscript, len(in.Raw))
	for t, raw := range in.Raw {
		who, err := speakerMap.SpeakerForSource(raw.Source)
		if err != nil {
			return failed, nil, err
		}

		segs := make([]model.Segment, 0, len(raw.Segments))
		for position, seg := range raw.Segments {
			segs = append(segs, model.Segment{
				Source:             raw.Source,
				SourceSegmentIndex: position,
				Speaker:            who,
				Start:              seg.Start,
				End:                seg.End,
				Text:               seg.Text,
			})
		}

		transcripts[t] = model.CanonicalTranscript{
			Source:   raw.Source,
			Segments: segs,
		}
	}

	out := pipeline.PreprocessState{
		State: pipeline.StateCanonical,
		// Copy the raw slice so the result does not share it with the input.
		Raw:       append([]model.RawTranscript(nil), in.Raw...),
		Canonical: transcripts,
	}
	events := []report.Event{
		report.Info("preprocessing", "normalize-speakers", "created canonical transcript(s) from raw input"),
	}
	return out, events, nil
}