package builtin

import (
	"context"
	"fmt"
	"path/filepath"
	"strings"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/model"
	"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
	"gitea.maximumdirect.net/eric/seriatim/internal/report"
	"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
)

// validateRaw is the "validate-raw" preprocessing module. It inspects raw
// transcripts for empty sources and inconsistent timings and passes the state
// through unchanged.
type validateRaw struct{}

// Name returns the module identifier "validate-raw".
func (validateRaw) Name() string { return "validate-raw" }

// Requires reports that the module expects raw-state input.
func (validateRaw) Requires() pipeline.ModelState { return pipeline.StateRaw }

// Produces reports that the module emits raw-state output.
func (validateRaw) Produces() pipeline.ModelState { return pipeline.StateRaw }

// Process validates every raw transcript in the incoming state.
//
// It fails when ctx already carries an error, when the state is not
// pipeline.StateRaw, when there are no raw transcripts, or when the first
// invalid transcript, segment, or timed word is found. On success the input
// state is returned as-is together with one informational event stating how
// many transcripts were validated. cfg is not consulted.
func (v validateRaw) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	// Zero-valued state handed back alongside every error.
	var none pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return none, nil, err
	}

	switch {
	case in.State != pipeline.StateRaw:
		return none, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", v.Name(), pipeline.StateRaw, in.State)
	case len(in.Raw) == 0:
		return none, nil, fmt.Errorf("validate-raw: no raw transcript(s) to validate")
	}

	for position, transcript := range in.Raw {
		if err := v.checkTranscript(position, transcript); err != nil {
			return none, nil, err
		}
	}

	summary := fmt.Sprintf("validated %d raw transcript(s)", len(in.Raw))
	events := []report.Event{
		report.Info("preprocessing", v.Name(), summary),
	}
	return in, events, nil
}

// checkTranscript returns the first validation failure found in transcript, or
// nil when it is well-formed. position is the transcript's index in the input
// and is used only in the empty-source message.
func (validateRaw) checkTranscript(position int, transcript model.RawTranscript) error {
	if strings.TrimSpace(transcript.Source) == "" {
		return fmt.Errorf("validate-raw: raw transcript %d has empty source", position)
	}

	for s, segment := range transcript.Segments {
		switch {
		case segment.Start < 0:
			return fmt.Errorf("validate-raw: raw transcript %q segment %d has negative start", transcript.Source, s)
		case segment.End < segment.Start:
			return fmt.Errorf("validate-raw: raw transcript %q segment %d has end before start", transcript.Source, s)
		}

		for w, word := range segment.Words {
			switch {
			case !word.Timed:
				// Words not marked as timed are exempt from the timing checks.
			case word.Start < 0:
				return fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has negative start", transcript.Source, s, w)
			case word.End < word.Start:
				return fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has end before start", transcript.Source, s, w)
			}
		}
	}
	return nil
}

// trimText is the "trim-text" preprocessing module. It strips leading and
// trailing whitespace from the text of every canonical segment.
type trimText struct{}

// Name returns the module identifier "trim-text".
func (trimText) Name() string { return "trim-text" }

// Requires reports that the module expects canonical-state input.
func (trimText) Requires() pipeline.ModelState { return pipeline.StateCanonical }

// Produces reports that the module emits canonical-state output.
func (trimText) Produces() pipeline.ModelState { return pipeline.StateCanonical }

// Process trims the Text field of every segment in in.Canonical.
//
// The segments are rewritten in place, so the slices reachable from the
// caller's state observe the trimmed text as well. It fails when ctx already
// carries an error or when the state is not pipeline.StateCanonical. cfg is
// not consulted.
func (t trimText) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	var none pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return none, nil, err
	}
	if in.State != pipeline.StateCanonical {
		return none, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", t.Name(), pipeline.StateCanonical, in.State)
	}

	for i := range in.Canonical {
		// The local slice shares its backing array with the transcript, so
		// writes through it update the transcript's own segments.
		segments := in.Canonical[i].Segments
		for j := range segments {
			segments[j].Text = strings.TrimSpace(segments[j].Text)
		}
	}

	events := []report.Event{
		report.Info("preprocessing", t.Name(), "trimmed canonical segment text"),
	}
	return in, events, nil
}

// normalizeSpeakers is the "normalize-speakers" preprocessing module. It
// converts raw transcripts into canonical transcripts and assigns a speaker
// label to every segment.
type normalizeSpeakers struct{}

// Name returns the module identifier "normalize-speakers".
func (normalizeSpeakers) Name() string { return "normalize-speakers" }

// Requires reports that the module expects raw-state input.
func (normalizeSpeakers) Requires() pipeline.ModelState { return pipeline.StateRaw }

// Produces reports that the module emits canonical-state output.
func (normalizeSpeakers) Produces() pipeline.ModelState { return pipeline.StateCanonical }

// Process builds one canonical transcript per raw transcript.
//
// When cfg.SpeakersFile is set, the speaker map is loaded from that file and
// each source is resolved through it; otherwise the base name of the source
// path is used as the speaker label. Every canonical segment records its
// source, its index within the raw transcript, the speaker label, the raw
// timings and text, and a copy of the raw words.
//
// It fails when ctx already carries an error, when the state is not
// pipeline.StateRaw, or when loading the map or resolving a source fails. On
// success the returned state is canonical and carries a copy of the raw
// transcript slice next to the new canonical transcripts.
func (n normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	var none pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return none, nil, err
	}
	if in.State != pipeline.StateRaw {
		return none, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", n.Name(), pipeline.StateRaw, in.State)
	}

	var speakers speaker.Map
	mapped := cfg.SpeakersFile != ""
	if mapped {
		loaded, err := speaker.LoadMap(cfg.SpeakersFile)
		if err != nil {
			return none, nil, err
		}
		speakers = loaded
	}

	canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
	for _, raw := range in.Raw {
		var label string
		if mapped {
			resolved, err := speakers.SpeakerForSource(raw.Source)
			if err != nil {
				return none, nil, err
			}
			label = resolved
		} else {
			label = filepath.Base(raw.Source)
		}

		segments := make([]model.Segment, len(raw.Segments))
		for i, rawSegment := range raw.Segments {
			// Each segment needs its own index variable to point at.
			position := i
			segments[i] = model.Segment{
				Source:             raw.Source,
				SourceSegmentIndex: &position,
				Speaker:            label,
				Start:              rawSegment.Start,
				End:                rawSegment.End,
				Text:               rawSegment.Text,
				// Copy the words so the canonical segment does not alias the raw slice.
				Words: append([]model.Word(nil), rawSegment.Words...),
			}
		}

		canonical = append(canonical, model.CanonicalTranscript{
			Source:   raw.Source,
			Segments: segments,
		})
	}

	message := "created canonical transcript(s) using input basenames as speaker labels"
	if mapped {
		message = "created canonical transcript(s) from raw input"
	}

	out := pipeline.PreprocessState{
		State:     pipeline.StateCanonical,
		Raw:       append([]model.RawTranscript(nil), in.Raw...),
		Canonical: canonical,
	}
	events := []report.Event{
		report.Info("preprocessing", n.Name(), message),
	}
	return out, events, nil
}