// seriatim/internal/builtin/preprocess.go

package builtin

import (
	"context"
	"fmt"
	"path/filepath"
	"strings"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/model"
	"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
	"gitea.maximumdirect.net/eric/seriatim/internal/report"
	"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
)

// noopPreprocessor is a preprocessing module that leaves the transcript data
// untouched and only moves the pipeline from one model state to another.
//
// name is the identifier returned by Name and embedded in the module's events
// and error messages. requires is the state Process insists the input is in,
// and produces is the state Process assigns to its output.
type noopPreprocessor struct {
	name     string
	requires pipeline.ModelState
	produces pipeline.ModelState
}

// Name returns the identifier this module was configured with.
func (p noopPreprocessor) Name() string {
	return p.name
}

// Requires returns the model state the input must be in for Process to
// accept it.
func (p noopPreprocessor) Requires() pipeline.ModelState {
	return p.requires
}

// Produces returns the model state that Process assigns to its output.
func (p noopPreprocessor) Produces() pipeline.ModelState {
	return p.produces
}

// Process relabels the incoming state as p.produces without touching any
// transcript data, and reports a single informational event.
//
// It fails with ctx.Err() when the context is already done on entry, and with
// a descriptive error when in.State differs from p.requires. On failure the
// returned state is the zero value and no events are returned. cfg is not
// consulted.
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	var failed pipeline.PreprocessState

	// Guards are evaluated top to bottom: cancellation wins over a state mismatch.
	switch err := ctx.Err(); {
	case err != nil:
		return failed, nil, err
	case in.State != p.requires:
		return failed, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, p.requires, in.State)
	}

	out := in
	out.State = p.produces

	events := []report.Event{
		report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
	}
	return out, events, nil
}

// trimText is a stateless preprocessing module that strips leading and
// trailing whitespace from the text of every canonical segment.
type trimText struct{}

// Name returns the module identifier "trim-text".
func (trimText) Name() string {
	return "trim-text"
}

// Requires reports that the module only accepts input in the canonical state.
func (trimText) Requires() pipeline.ModelState {
	return pipeline.StateCanonical
}

// Produces reports that the module's output remains in the canonical state.
func (trimText) Produces() pipeline.ModelState {
	return pipeline.StateCanonical
}

// Process returns the canonical state with leading and trailing whitespace
// removed from the Text of every segment, plus one informational event.
//
// It fails with ctx.Err() when the context is already done on entry, and with
// a descriptive error when in.State is not pipeline.StateCanonical. On failure
// the returned state is the zero value and no events are returned. cfg is not
// consulted.
//
// Although in is passed by value, its Canonical slice and each transcript's
// Segments slice share backing arrays with the caller's copy. Both levels are
// therefore cloned before trimming, so the caller's data is never modified as
// a side effect; only the returned state carries the trimmed text.
func (trimText) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	const moduleName = "trim-text"

	if err := ctx.Err(); err != nil {
		return pipeline.PreprocessState{}, nil, err
	}
	if in.State != pipeline.StateCanonical {
		return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", moduleName, pipeline.StateCanonical, in.State)
	}

	// Appending to a zero-length, zero-capacity reslice always allocates fresh
	// storage when there is anything to copy, and keeps a nil slice nil.
	transcripts := append(in.Canonical[:0:0], in.Canonical...)
	for ti := range transcripts {
		segments := append(transcripts[ti].Segments[:0:0], transcripts[ti].Segments...)
		for si := range segments {
			segments[si].Text = strings.TrimSpace(segments[si].Text)
		}
		transcripts[ti].Segments = segments
	}
	in.Canonical = transcripts

	return in, []report.Event{
		report.Info("preprocessing", moduleName, "trimmed canonical segment text"),
	}, nil
}

// normalizeSpeakers is a stateless preprocessing module that converts raw
// transcripts into canonical transcripts, labeling every segment with a
// speaker.
type normalizeSpeakers struct{}

// Name returns the module identifier "normalize-speakers".
func (normalizeSpeakers) Name() string {
	return "normalize-speakers"
}

// Requires reports that the module only accepts input in the raw state.
func (normalizeSpeakers) Requires() pipeline.ModelState {
	return pipeline.StateRaw
}

// Produces reports that the module's output is in the canonical state.
func (normalizeSpeakers) Produces() pipeline.ModelState {
	return pipeline.StateCanonical
}

// Process builds one canonical transcript per raw transcript in in.Raw and
// returns a new state in pipeline.StateCanonical, plus one informational
// event describing how speaker labels were chosen.
//
// Speaker labels come from the speaker map loaded from cfg.SpeakersFile when
// that path is non-empty; otherwise each transcript is labeled with the
// basename of its source. Every canonical segment records its source, its
// index within the raw transcript, and its own copy of the raw segment's
// words.
//
// It fails with ctx.Err() when the context is already done on entry, with a
// descriptive error when in.State is not pipeline.StateRaw, and with the
// unmodified error from loading the speaker map or resolving a source. On
// failure the returned state is the zero value and no events are returned.
func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
	var failed pipeline.PreprocessState

	if err := ctx.Err(); err != nil {
		return failed, nil, err
	}
	if got := in.State; got != pipeline.StateRaw {
		return failed, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, got)
	}

	// Decide once where labels come from; the summary message follows that choice.
	var lookup speaker.Map
	summary := "created canonical transcript(s) using input basenames as speaker labels"
	mapped := cfg.SpeakersFile != ""
	if mapped {
		var err error
		if lookup, err = speaker.LoadMap(cfg.SpeakersFile); err != nil {
			return failed, nil, err
		}
		summary = "created canonical transcript(s) from raw input"
	}

	transcripts := make([]model.CanonicalTranscript, len(in.Raw))
	for ti, raw := range in.Raw {
		var label string
		switch {
		case mapped:
			var err error
			if label, err = lookup.SpeakerForSource(raw.Source); err != nil {
				return failed, nil, err
			}
		default:
			label = filepath.Base(raw.Source)
		}

		converted := make([]model.Segment, len(raw.Segments))
		for si, seg := range raw.Segments {
			// Each segment needs its own int to point at.
			position := si
			converted[si] = model.Segment{
				Source:             raw.Source,
				SourceSegmentIndex: &position,
				Speaker:            label,
				Start:              seg.Start,
				End:                seg.End,
				Text:               seg.Text,
				// Copy the words so canonical segments do not alias raw storage.
				Words: append([]model.Word(nil), seg.Words...),
			}
		}

		transcripts[ti] = model.CanonicalTranscript{
			Source:   raw.Source,
			Segments: converted,
		}
	}

	out := pipeline.PreprocessState{
		State:     pipeline.StateCanonical,
		Raw:       append([]model.RawTranscript(nil), in.Raw...),
		Canonical: transcripts,
	}
	events := []report.Event{
		report.Info("preprocessing", "normalize-speakers", summary),
	}
	return out, events, nil
}