Cleaned up documentation and development artifacts in advance of release

This commit is contained in:
2026-04-27 21:48:04 -05:00
parent 6cb739be55
commit 28c2eea340
15 changed files with 336 additions and 92 deletions

View File

@@ -2,6 +2,7 @@ package builtin
import (
"context"
"fmt"
"sort"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
@@ -9,13 +10,13 @@ import (
"gitea.maximumdirect.net/eric/seriatim/internal/report"
)
type placeholderMerger struct{}
type chronologicalMerger struct{}
func (placeholderMerger) Name() string {
return "placeholder-merger"
func (chronologicalMerger) Name() string {
return "chronological-merge"
}
func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
func (chronologicalMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
if err := ctx.Err(); err != nil {
return model.MergedTranscript{}, nil, err
}
@@ -33,6 +34,6 @@ func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscri
Segments: segments,
OverlapGroups: nil,
}, []report.Event{
report.Info("merge", "placeholder-merger", "merged placeholder canonical transcript(s)"),
report.Info("merge", "chronological-merge", fmt.Sprintf("merged %d canonical transcript(s) into %d segment(s)", len(in), len(segments))),
}, nil
}

View File

@@ -16,24 +16,6 @@ import (
"gitea.maximumdirect.net/eric/seriatim/schema"
)
type noopPostprocessor struct {
name string
}
func (p noopPostprocessor) Name() string {
return p.name
}
func (p noopPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
if err := ctx.Err(); err != nil {
return model.MergedTranscript{}, nil, err
}
return in, []report.Event{
report.Info("postprocessing", p.name, "completed no-op postprocessing module"),
}, nil
}
type assignIDs struct{}
func (assignIDs) Name() string {

View File

@@ -13,35 +13,58 @@ import (
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
)
type noopPreprocessor struct {
name string
requires pipeline.ModelState
produces pipeline.ModelState
type validateRaw struct{}
func (validateRaw) Name() string {
return "validate-raw"
}
func (p noopPreprocessor) Name() string {
return p.name
func (validateRaw) Requires() pipeline.ModelState {
return pipeline.StateRaw
}
func (p noopPreprocessor) Requires() pipeline.ModelState {
return p.requires
func (validateRaw) Produces() pipeline.ModelState {
return pipeline.StateRaw
}
func (p noopPreprocessor) Produces() pipeline.ModelState {
return p.produces
}
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
func (validateRaw) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
if err := ctx.Err(); err != nil {
return pipeline.PreprocessState{}, nil, err
}
if in.State != p.requires {
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, p.requires, in.State)
if in.State != pipeline.StateRaw {
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "validate-raw", pipeline.StateRaw, in.State)
}
if len(in.Raw) == 0 {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: no raw transcript(s) to validate")
}
for transcriptIndex, transcript := range in.Raw {
if strings.TrimSpace(transcript.Source) == "" {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %d has empty source", transcriptIndex)
}
for segmentIndex, segment := range transcript.Segments {
if segment.Start < 0 {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has negative start", transcript.Source, segmentIndex)
}
if segment.End < segment.Start {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d has end before start", transcript.Source, segmentIndex)
}
for wordIndex, word := range segment.Words {
if !word.Timed {
continue
}
if word.Start < 0 {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has negative start", transcript.Source, segmentIndex, wordIndex)
}
if word.End < word.Start {
return pipeline.PreprocessState{}, nil, fmt.Errorf("validate-raw: raw transcript %q segment %d word %d has end before start", transcript.Source, segmentIndex, wordIndex)
}
}
}
}
in.State = p.produces
return in, []report.Event{
report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
report.Info("preprocessing", "validate-raw", fmt.Sprintf("validated %d raw transcript(s)", len(in.Raw))),
}, nil
}

View File

@@ -0,0 +1,153 @@
package builtin
import (
"context"
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
)
func TestValidateRawAcceptsValidRawTranscripts(t *testing.T) {
state := pipeline.PreprocessState{
State: pipeline.StateRaw,
Raw: []model.RawTranscript{
{
Source: "input.json",
Segments: []model.RawSegment{
{
Start: 1,
End: 2,
Text: "hello",
Words: []model.Word{
{Text: "hello", Start: 1, End: 1.5, Timed: true},
{Text: "untimed"},
},
},
},
},
},
}
got, events, err := validateRaw{}.Process(context.Background(), state, config.Config{})
if err != nil {
t.Fatalf("validate raw: %v", err)
}
if got.State != pipeline.StateRaw || len(got.Raw) != 1 {
t.Fatalf("unexpected state: %#v", got)
}
if len(events) != 1 || !strings.Contains(events[0].Message, "validated 1 raw transcript(s)") {
t.Fatalf("events = %#v", events)
}
}
// TestValidateRawRejectsInvalidState expects an error mentioning the required
// raw state when the incoming state is canonical instead.
func TestValidateRawRejectsInvalidState(t *testing.T) {
	var input pipeline.PreprocessState
	input.State = pipeline.StateCanonical

	_, _, err := validateRaw{}.Process(context.Background(), input, config.Config{})

	assertPreprocessError(t, err, `requires state "raw"`)
}
// TestValidateRawRejectsNoRawTranscripts expects an error when the state is
// raw but carries no raw transcripts at all.
func TestValidateRawRejectsNoRawTranscripts(t *testing.T) {
	var input pipeline.PreprocessState
	input.State = pipeline.StateRaw

	_, _, err := validateRaw{}.Process(context.Background(), input, config.Config{})

	assertPreprocessError(t, err, "no raw transcript(s)")
}
// TestValidateRawRejectsEmptySource blanks the first transcript's source to a
// single space and expects an error that names transcript index 0.
func TestValidateRawRejectsEmptySource(t *testing.T) {
	input := validRawState()
	first := &input.Raw[0]
	first.Source = " "

	_, _, err := validateRaw{}.Process(context.Background(), input, config.Config{})

	assertPreprocessError(t, err, "raw transcript 0 has empty source")
}
// TestValidateRawRejectsInvalidSegmentTiming runs one subtest per bad segment
// timing: each case mutates the first segment of an otherwise valid state and
// expects an error containing the case's message fragment.
func TestValidateRawRejectsInvalidSegmentTiming(t *testing.T) {
	type segmentCase struct {
		name   string
		mutate func(*model.RawSegment)
		want   string
	}

	cases := []segmentCase{
		{
			name:   "negative start",
			mutate: func(seg *model.RawSegment) { seg.Start = -1 },
			want:   "segment 0 has negative start",
		},
		{
			name: "end before start",
			mutate: func(seg *model.RawSegment) {
				seg.Start = 2
				seg.End = 1
			},
			want: "segment 0 has end before start",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			// Start from a fresh valid state so cases cannot affect each other.
			input := validRawState()
			tc.mutate(&input.Raw[0].Segments[0])

			_, _, err := validateRaw{}.Process(context.Background(), input, config.Config{})

			assertPreprocessError(t, err, tc.want)
		})
	}
}
// TestValidateRawRejectsInvalidTimedWordTiming runs one subtest per bad timed
// word: each case replaces the first segment's words with a single offending
// word and expects an error containing the case's message fragment.
func TestValidateRawRejectsInvalidTimedWordTiming(t *testing.T) {
	type wordCase struct {
		name string
		word model.Word
		want string
	}

	cases := []wordCase{
		{
			name: "negative start",
			word: model.Word{Text: "bad", Start: -1, End: 1, Timed: true},
			want: "word 0 has negative start",
		},
		{
			name: "end before start",
			word: model.Word{Text: "bad", Start: 2, End: 1, Timed: true},
			want: "word 0 has end before start",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			// Start from a fresh valid state so cases cannot affect each other.
			input := validRawState()
			segment := &input.Raw[0].Segments[0]
			segment.Words = []model.Word{tc.word}

			_, _, err := validateRaw{}.Process(context.Background(), input, config.Config{})

			assertPreprocessError(t, err, tc.want)
		})
	}
}
// validRawState builds a fresh raw-state fixture holding one transcript
// ("input.json") with a single segment spanning 1 to 2, empty text, and no
// words. Each call returns newly allocated slices, so callers may mutate the
// result freely.
func validRawState() pipeline.PreprocessState {
	segment := model.RawSegment{Start: 1, End: 2, Text: ""}
	transcript := model.RawTranscript{
		Source:   "input.json",
		Segments: []model.RawSegment{segment},
	}

	var state pipeline.PreprocessState
	state.State = pipeline.StateRaw
	state.Raw = []model.RawTranscript{transcript}
	return state
}
// assertPreprocessError fails the test immediately unless err is non-nil and
// its message contains want. It is marked as a helper so failures are
// reported at the caller's line.
func assertPreprocessError(t *testing.T, err error, want string) {
	t.Helper()

	switch {
	case err == nil:
		t.Fatalf("expected error containing %q", want)
	case !strings.Contains(err.Error(), want):
		t.Fatalf("expected error containing %q, got %v", want, err)
	}
}

View File

@@ -2,15 +2,15 @@ package builtin
import "gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
// NewRegistry registers the MVP built-in modules.
// NewRegistry registers the built-in modules.
func NewRegistry() *pipeline.Registry {
registry := pipeline.NewRegistry()
registry.RegisterInputReader(jsonFilesReader{})
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
registry.RegisterPreprocessor(validateRaw{})
registry.RegisterPreprocessor(normalizeSpeakers{})
registry.RegisterPreprocessor(trimText{})
registry.RegisterMerger(placeholderMerger{})
registry.RegisterMerger(chronologicalMerger{})
registry.RegisterPostprocessor(detectOverlaps{})
registry.RegisterPostprocessor(resolveOverlaps{})
registry.RegisterPostprocessor(backchannelPostprocessor{})