Implemented the initial Go framework
This commit is contained in:
31
internal/builtin/input.go
Normal file
31
internal/builtin/input.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// jsonFilesReader is the built-in input reader that accepts JSON file paths
// from the configuration.
type jsonFilesReader struct{}

// Name reports the module identifier used for registry lookup and reporting.
func (r jsonFilesReader) Name() string {
	return "json-files"
}
|
||||
|
||||
func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.RawTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
|
||||
for _, inputFile := range cfg.InputFiles {
|
||||
raw = append(raw, model.RawTranscript{Source: inputFile})
|
||||
}
|
||||
|
||||
return raw, []report.Event{
|
||||
report.Info("input", "json-files", fmt.Sprintf("accepted %d input file(s)", len(raw))),
|
||||
}, nil
|
||||
}
|
||||
52
internal/builtin/merge.go
Normal file
52
internal/builtin/merge.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// placeholderMerger is the MVP merger; it concatenates and orders segments
// without any overlap analysis.
type placeholderMerger struct{}

// Name reports the module identifier used for registry lookup and reporting.
func (m placeholderMerger) Name() string {
	return "placeholder-merger"
}
|
||||
|
||||
func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
segments := make([]model.Segment, 0)
|
||||
for _, transcript := range in {
|
||||
segments = append(segments, transcript.Segments...)
|
||||
}
|
||||
|
||||
sort.SliceStable(segments, func(i, j int) bool {
|
||||
left := segments[i]
|
||||
right := segments[j]
|
||||
if left.Start != right.Start {
|
||||
return left.Start < right.Start
|
||||
}
|
||||
if left.End != right.End {
|
||||
return left.End < right.End
|
||||
}
|
||||
if left.Source != right.Source {
|
||||
return left.Source < right.Source
|
||||
}
|
||||
if left.SourceSegmentIndex != right.SourceSegmentIndex {
|
||||
return left.SourceSegmentIndex < right.SourceSegmentIndex
|
||||
}
|
||||
return left.Speaker < right.Speaker
|
||||
})
|
||||
|
||||
return model.MergedTranscript{
|
||||
Segments: segments,
|
||||
OverlapGroups: nil,
|
||||
}, []report.Event{
|
||||
report.Info("merge", "placeholder-merger", "merged placeholder canonical transcript(s)"),
|
||||
}, nil
|
||||
}
|
||||
39
internal/builtin/output.go
Normal file
39
internal/builtin/output.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// jsonOutputWriter serializes the final transcript to an indented JSON file.
type jsonOutputWriter struct{}

// Name reports the module identifier used for registry lookup and reporting.
func (w jsonOutputWriter) Name() string {
	return "json"
}
|
||||
|
||||
func (jsonOutputWriter) Write(ctx context.Context, out model.FinalTranscript, rpt report.Report, cfg config.Config) ([]report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file, err := os.Create(cfg.OutputFile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
enc := json.NewEncoder(file)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return []report.Event{
|
||||
report.Info("output", "json", "wrote placeholder transcript JSON"),
|
||||
}, nil
|
||||
}
|
||||
47
internal/builtin/postprocess.go
Normal file
47
internal/builtin/postprocess.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// noopPostprocessor is a stand-in postprocessing module that performs no
// transformation; its name distinguishes the pipeline slots it fills.
type noopPostprocessor struct {
	name string // module identifier reported via Name()
}

// Name reports the module identifier used for registry lookup and reporting.
func (p noopPostprocessor) Name() string {
	return p.name
}
|
||||
|
||||
func (p noopPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
return in, []report.Event{
|
||||
report.Info("postprocessing", p.name, "completed no-op postprocessing module"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// assignIDs is the postprocessing module that stamps each merged segment
// with its final 1-based identifier.
type assignIDs struct{}

// Name reports the module identifier used for registry lookup and reporting.
func (a assignIDs) Name() string {
	return "assign-ids"
}
|
||||
|
||||
func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
for index := range in.Segments {
|
||||
in.Segments[index].ID = index + 1
|
||||
}
|
||||
|
||||
return in, []report.Event{
|
||||
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
|
||||
}, nil
|
||||
}
|
||||
82
internal/builtin/preprocess.go
Normal file
82
internal/builtin/preprocess.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
type noopPreprocessor struct {
|
||||
name string
|
||||
requires pipeline.ModelState
|
||||
produces pipeline.ModelState
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Name() string {
|
||||
return p.name
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Requires() pipeline.ModelState {
|
||||
return p.requires
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Produces() pipeline.ModelState {
|
||||
return p.produces
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pipeline.PreprocessState{}, nil, err
|
||||
}
|
||||
if in.State != p.requires {
|
||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, p.requires, in.State)
|
||||
}
|
||||
|
||||
in.State = p.produces
|
||||
return in, []report.Event{
|
||||
report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type normalizeSpeakers struct{}
|
||||
|
||||
func (normalizeSpeakers) Name() string {
|
||||
return "normalize-speakers"
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Requires() pipeline.ModelState {
|
||||
return pipeline.StateRaw
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Produces() pipeline.ModelState {
|
||||
return pipeline.StateCanonical
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pipeline.PreprocessState{}, nil, err
|
||||
}
|
||||
if in.State != pipeline.StateRaw {
|
||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
||||
}
|
||||
|
||||
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
||||
for _, raw := range in.Raw {
|
||||
canonical = append(canonical, model.CanonicalTranscript{
|
||||
Source: raw.Source,
|
||||
Segments: nil,
|
||||
})
|
||||
}
|
||||
|
||||
return pipeline.PreprocessState{
|
||||
State: pipeline.StateCanonical,
|
||||
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
||||
Canonical: canonical,
|
||||
}, []report.Event{
|
||||
report.Info("preprocessing", "normalize-speakers", "created placeholder canonical transcript(s)"),
|
||||
}, nil
|
||||
}
|
||||
23
internal/builtin/registry.go
Normal file
23
internal/builtin/registry.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package builtin
|
||||
|
||||
import "gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
|
||||
// NewRegistry registers the MVP built-in modules.
|
||||
func NewRegistry() *pipeline.Registry {
|
||||
registry := pipeline.NewRegistry()
|
||||
|
||||
registry.RegisterInputReader(jsonFilesReader{})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
||||
registry.RegisterPreprocessor(normalizeSpeakers{})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "trim-text", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
||||
registry.RegisterMerger(placeholderMerger{})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
|
||||
registry.RegisterPostprocessor(assignIDs{})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"})
|
||||
registry.RegisterOutputWriter(jsonOutputWriter{})
|
||||
|
||||
return registry
|
||||
}
|
||||
39
internal/cli/merge.go
Normal file
39
internal/cli/merge.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/builtin"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
)
|
||||
|
||||
func newMergeCommand() *cobra.Command {
|
||||
var opts config.MergeOptions
|
||||
|
||||
cmd := &cobra.Command{
|
||||
Use: "merge",
|
||||
Short: "Run the transcript merge pipeline",
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
cfg, err := config.NewMergeConfig(opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return pipeline.Run(cmd.Context(), cfg, builtin.NewRegistry())
|
||||
},
|
||||
}
|
||||
|
||||
flags := cmd.Flags()
|
||||
flags.StringArrayVar(&opts.InputFiles, "input-file", nil, "input transcript file; may be repeated")
|
||||
flags.StringVar(&opts.OutputFile, "output-file", "", "output transcript JSON file")
|
||||
flags.StringVar(&opts.ReportFile, "report-file", "", "optional report JSON file")
|
||||
flags.StringVar(&opts.SpeakersFile, "speakers", "", "speaker map file")
|
||||
flags.StringVar(&opts.AutocorrectFile, "autocorrect", "", "autocorrect rules file")
|
||||
flags.StringVar(&opts.InputReader, "input-reader", config.DefaultInputReader, "input reader module")
|
||||
flags.StringVar(&opts.OutputModules, "output-modules", config.DefaultOutputModules, "comma-separated output modules")
|
||||
flags.StringVar(&opts.PreprocessingModules, "preprocessing-modules", config.DefaultPreprocessingModules, "comma-separated preprocessing modules")
|
||||
flags.StringVar(&opts.PostprocessingModules, "postprocessing-modules", config.DefaultPostprocessingModules, "comma-separated postprocessing modules")
|
||||
|
||||
return cmd
|
||||
}
|
||||
282
internal/cli/merge_test.go
Normal file
282
internal/cli/merge_test.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// TestMergeWritesPlaceholderOutputAndReport runs a full merge over two empty
// placeholder inputs and checks the serialized transcript (empty arrays, not
// null; deterministic input ordering) and the report's module event order.
func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
	dir := t.TempDir()
	inputA := writeFile(t, dir, "a.json")
	inputB := writeFile(t, dir, "b.json")
	speakers := writeFile(t, dir, "speakers.yml")
	output := filepath.Join(dir, "merged.json")
	reportPath := filepath.Join(dir, "report.json")

	// Inputs are passed B-before-A on purpose: the config layer must sort them.
	err := executeMerge(
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
		"--output-file", output,
		"--report-file", reportPath,
	)
	if err != nil {
		t.Fatalf("merge failed: %v", err)
	}

	var transcript model.FinalTranscript
	readJSON(t, output, &transcript)
	outputBytes, err := os.ReadFile(output)
	if err != nil {
		t.Fatalf("read output bytes: %v", err)
	}
	outputJSON := string(outputBytes)
	// Raw-byte checks: empty collections must encode as [] (nil would be null).
	if !strings.Contains(outputJSON, `"segments": []`) {
		t.Fatalf("expected segments to serialize as an empty array, got:\n%s", outputJSON)
	}
	if !strings.Contains(outputJSON, `"overlap_groups": []`) {
		t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
	}
	if transcript.Metadata.Application != "seriatim" {
		t.Fatalf("unexpected application metadata: %q", transcript.Metadata.Application)
	}
	// Metadata must list the inputs sorted (A before B) despite flag order.
	if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
		t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
	}
	if len(transcript.Segments) != 0 {
		t.Fatalf("expected placeholder output to contain no segments, got %d", len(transcript.Segments))
	}
	if len(transcript.OverlapGroups) != 0 {
		t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
	}

	// The report must contain exactly one event per default module, in
	// pipeline execution order.
	var rpt report.Report
	readJSON(t, reportPath, &rpt)
	gotModules := make([]string, 0, len(rpt.Events))
	for _, event := range rpt.Events {
		gotModules = append(gotModules, event.Module)
	}
	wantModules := []string{
		"json-files",
		"validate-raw",
		"normalize-speakers",
		"trim-text",
		"placeholder-merger",
		"detect-overlaps",
		"resolve-overlaps",
		"assign-ids",
		"validate-output",
		"json",
	}
	if !equalStrings(gotModules, wantModules) {
		t.Fatalf("report event order mismatch:\ngot %v\nwant %v", gotModules, wantModules)
	}
}
|
||||
|
||||
// TestUnknownModulesFailDuringValidation checks that naming a nonexistent
// module in any of the four module flags fails up front with a message
// identifying both the flag category and the unknown name.
func TestUnknownModulesFailDuringValidation(t *testing.T) {
	dir := t.TempDir()
	input := writeFile(t, dir, "input.json")
	speakers := writeFile(t, dir, "speakers.yml")
	output := filepath.Join(dir, "merged.json")

	tests := []struct {
		name string   // subtest label
		args []string // extra flags that introduce the unknown module
		want string   // required substring of the returned error
	}{
		{
			name: "input reader",
			args: []string{"--input-reader", "missing-reader"},
			want: `unknown input reader "missing-reader"`,
		},
		{
			name: "preprocessing",
			args: []string{"--preprocessing-modules", "validate-raw,missing-module"},
			want: `unknown preprocessing module "missing-module"`,
		},
		{
			name: "postprocessing",
			args: []string{"--postprocessing-modules", "missing-module"},
			want: `unknown postprocessing module "missing-module"`,
		},
		{
			name: "output",
			args: []string{"--output-modules", "missing-module"},
			want: `unknown output module "missing-module"`,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// Valid baseline flags; the per-case args make one flag invalid.
			args := []string{
				"--input-file", input,
				"--speakers", speakers,
				"--output-file", output,
			}
			args = append(args, test.args...)

			err := executeMerge(args...)
			if err == nil {
				t.Fatal("expected error")
			}
			if !strings.Contains(err.Error(), test.want) {
				t.Fatalf("expected error to contain %q, got %q", test.want, err.Error())
			}
		})
	}
}
|
||||
|
||||
func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--output-file", output,
|
||||
"--preprocessing-modules", "trim-text,validate-raw",
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), `requires state "canonical"`) {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", filepath.Join(dir, "missing.json"),
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--input-file") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--speakers is required") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--autocorrect is required") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestOutputJSONIsByteStable runs the identical merge twice into two output
// files and requires the serialized bytes to match exactly, proving the
// pipeline's output is deterministic.
func TestOutputJSONIsByteStable(t *testing.T) {
	dir := t.TempDir()
	inputA := writeFile(t, dir, "a.json")
	inputB := writeFile(t, dir, "b.json")
	speakers := writeFile(t, dir, "speakers.yml")
	outputA := filepath.Join(dir, "merged-a.json")
	outputB := filepath.Join(dir, "merged-b.json")

	args := []string{
		"--input-file", inputB,
		"--input-file", inputA,
		"--speakers", speakers,
	}

	// Copy args before appending so the two runs don't share a backing array.
	err := executeMerge(append(append([]string(nil), args...), "--output-file", outputA)...)
	if err != nil {
		t.Fatalf("first merge failed: %v", err)
	}
	err = executeMerge(append(append([]string(nil), args...), "--output-file", outputB)...)
	if err != nil {
		t.Fatalf("second merge failed: %v", err)
	}

	first, err := os.ReadFile(outputA)
	if err != nil {
		t.Fatalf("read first output: %v", err)
	}
	second, err := os.ReadFile(outputB)
	if err != nil {
		t.Fatalf("read second output: %v", err)
	}
	if string(first) != string(second) {
		t.Fatalf("expected byte-stable output\nfirst:\n%s\nsecond:\n%s", first, second)
	}
}
|
||||
|
||||
func executeMerge(args ...string) error {
|
||||
cmd := NewRootCommand()
|
||||
cmd.SetArgs(append([]string{"merge"}, args...))
|
||||
return cmd.Execute()
|
||||
}
|
||||
|
||||
func writeFile(t *testing.T, dir string, name string) string {
|
||||
t.Helper()
|
||||
|
||||
path := filepath.Join(dir, name)
|
||||
if err := os.WriteFile(path, []byte("{}\n"), 0o600); err != nil {
|
||||
t.Fatalf("write file: %v", err)
|
||||
}
|
||||
return path
|
||||
}
|
||||
|
||||
func readJSON(t *testing.T, path string, target any) {
|
||||
t.Helper()
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read %s: %v", path, err)
|
||||
}
|
||||
if err := json.Unmarshal(data, target); err != nil {
|
||||
t.Fatalf("unmarshal %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
|
||||
// equalStrings reports whether two string slices have identical length and
// elements in the same order.
func equalStrings(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for i, v := range left {
		if v != right[i] {
			return false
		}
	}
	return true
}
|
||||
18
internal/cli/root.go
Normal file
18
internal/cli/root.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// NewRootCommand builds the seriatim command tree.
|
||||
func NewRootCommand() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "seriatim",
|
||||
Short: "Merge per-speaker transcripts into a chronological transcript",
|
||||
SilenceErrors: true,
|
||||
SilenceUsage: true,
|
||||
}
|
||||
|
||||
cmd.AddCommand(newMergeCommand())
|
||||
return cmd
|
||||
}
|
||||
206
internal/config/config.go
Normal file
206
internal/config/config.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Default module selections used when the corresponding flag is omitted.
const (
	DefaultInputReader           = "json-files"
	DefaultOutputModules         = "json"
	DefaultPreprocessingModules  = "validate-raw,normalize-speakers,trim-text"
	DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,assign-ids,validate-output"
)

// MergeOptions captures raw CLI option values before validation.
// Module lists are still the comma-separated flag strings here; they are
// parsed and checked by NewMergeConfig.
type MergeOptions struct {
	InputFiles            []string
	OutputFile            string
	ReportFile            string
	SpeakersFile          string
	AutocorrectFile       string
	InputReader           string
	OutputModules         string
	PreprocessingModules  string
	PostprocessingModules string
}

// Config is the validated runtime configuration for a merge invocation.
// Paths are cleaned, input files are sorted and verified to exist, and the
// module lists have been split into individual (trimmed) names.
type Config struct {
	InputFiles            []string
	OutputFile            string
	ReportFile            string
	SpeakersFile          string
	AutocorrectFile       string
	InputReader           string
	OutputModules         []string
	PreprocessingModules  []string
	PostprocessingModules []string
}
|
||||
|
||||
// NewMergeConfig validates raw merge options and returns normalized config.
|
||||
func NewMergeConfig(opts MergeOptions) (Config, error) {
|
||||
cfg := Config{
|
||||
InputReader: strings.TrimSpace(opts.InputReader),
|
||||
OutputModules: nil,
|
||||
PreprocessingModules: nil,
|
||||
PostprocessingModules: nil,
|
||||
}
|
||||
|
||||
if cfg.InputReader == "" {
|
||||
return Config{}, errors.New("--input-reader is required")
|
||||
}
|
||||
|
||||
var err error
|
||||
cfg.OutputModules, err = parseModuleList(opts.OutputModules)
|
||||
if err != nil {
|
||||
return Config{}, fmt.Errorf("--output-modules: %w", err)
|
||||
}
|
||||
cfg.PreprocessingModules, err = parseModuleList(opts.PreprocessingModules)
|
||||
if err != nil {
|
||||
return Config{}, fmt.Errorf("--preprocessing-modules: %w", err)
|
||||
}
|
||||
cfg.PostprocessingModules, err = parseModuleList(opts.PostprocessingModules)
|
||||
if err != nil {
|
||||
return Config{}, fmt.Errorf("--postprocessing-modules: %w", err)
|
||||
}
|
||||
if len(cfg.OutputModules) == 0 {
|
||||
return Config{}, errors.New("--output-modules must include at least one module")
|
||||
}
|
||||
|
||||
cfg.InputFiles, err = normalizeInputFiles(opts.InputFiles)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
cfg.OutputFile, err = normalizeOutputPath(opts.OutputFile, "--output-file")
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
if opts.ReportFile != "" {
|
||||
cfg.ReportFile, err = normalizeOutputPath(opts.ReportFile, "--report-file")
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
}
|
||||
|
||||
cfg.SpeakersFile = filepath.Clean(strings.TrimSpace(opts.SpeakersFile))
|
||||
if opts.SpeakersFile == "" {
|
||||
cfg.SpeakersFile = ""
|
||||
}
|
||||
cfg.AutocorrectFile = filepath.Clean(strings.TrimSpace(opts.AutocorrectFile))
|
||||
if opts.AutocorrectFile == "" {
|
||||
cfg.AutocorrectFile = ""
|
||||
}
|
||||
|
||||
if contains(cfg.PreprocessingModules, "normalize-speakers") {
|
||||
if cfg.SpeakersFile == "" {
|
||||
return Config{}, errors.New("--speakers is required when normalize-speakers is enabled")
|
||||
}
|
||||
if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
} else if cfg.SpeakersFile != "" {
|
||||
if err := requireFile(cfg.SpeakersFile, "--speakers"); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
}
|
||||
|
||||
if contains(cfg.PreprocessingModules, "autocorrect") || contains(cfg.PostprocessingModules, "autocorrect") {
|
||||
if cfg.AutocorrectFile == "" {
|
||||
return Config{}, errors.New("--autocorrect is required when autocorrect is enabled")
|
||||
}
|
||||
if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
} else if cfg.AutocorrectFile != "" {
|
||||
if err := requireFile(cfg.AutocorrectFile, "--autocorrect"); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
}
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// parseModuleList splits a comma-separated module list into trimmed names.
// An empty or all-whitespace input yields nil; a blank entry between commas
// is an error.
func parseModuleList(value string) ([]string, error) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return nil, nil
	}

	var names []string
	for _, piece := range strings.Split(trimmed, ",") {
		name := strings.TrimSpace(piece)
		if name == "" {
			return nil, errors.New("module names cannot be empty")
		}
		names = append(names, name)
	}
	return names, nil
}
|
||||
|
||||
func normalizeInputFiles(paths []string) ([]string, error) {
|
||||
if len(paths) == 0 {
|
||||
return nil, errors.New("at least one --input-file is required")
|
||||
}
|
||||
|
||||
normalized := make([]string, 0, len(paths))
|
||||
for _, path := range paths {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
return nil, errors.New("--input-file cannot be empty")
|
||||
}
|
||||
|
||||
clean := filepath.Clean(path)
|
||||
if err := requireFile(clean, "--input-file"); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalized = append(normalized, clean)
|
||||
}
|
||||
sort.Strings(normalized)
|
||||
return normalized, nil
|
||||
}
|
||||
|
||||
func normalizeOutputPath(path string, flag string) (string, error) {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
return "", fmt.Errorf("%s is required", flag)
|
||||
}
|
||||
|
||||
clean := filepath.Clean(path)
|
||||
parent := filepath.Dir(clean)
|
||||
stat, err := os.Stat(parent)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("%s parent directory %q: %w", flag, parent, err)
|
||||
}
|
||||
if !stat.IsDir() {
|
||||
return "", fmt.Errorf("%s parent path %q is not a directory", flag, parent)
|
||||
}
|
||||
return clean, nil
|
||||
}
|
||||
|
||||
func requireFile(path string, flag string) error {
|
||||
stat, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %q: %w", flag, path, err)
|
||||
}
|
||||
if stat.IsDir() {
|
||||
return fmt.Errorf("%s %q is a directory, not a file", flag, path)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// contains reports whether target appears anywhere in values.
func contains(values []string, target string) bool {
	for i := range values {
		if values[i] == target {
			return true
		}
	}
	return false
}
|
||||
35
internal/config/config_test.go
Normal file
35
internal/config/config_test.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestOmittingNormalizeSpeakersDoesNotRequireSpeakers(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeTempFile(t, dir, "input.json")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
_, err := NewMergeConfig(MergeOptions{
|
||||
InputFiles: []string{input},
|
||||
OutputFile: output,
|
||||
InputReader: DefaultInputReader,
|
||||
OutputModules: DefaultOutputModules,
|
||||
PreprocessingModules: "validate-raw",
|
||||
PostprocessingModules: DefaultPostprocessingModules,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("expected speakers file to be optional when normalize-speakers is omitted, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func writeTempFile(t *testing.T, dir string, name string) string {
|
||||
t.Helper()
|
||||
|
||||
path := filepath.Join(dir, name)
|
||||
if err := os.WriteFile(path, []byte("{}\n"), 0o600); err != nil {
|
||||
t.Fatalf("write temp file: %v", err)
|
||||
}
|
||||
return path
|
||||
}
|
||||
70
internal/model/model.go
Normal file
70
internal/model/model.go
Normal file
@@ -0,0 +1,70 @@
|
||||
package model
|
||||
|
||||
// RawTranscript is a loaded input document before canonical normalization.
|
||||
type RawTranscript struct {
|
||||
Source string `json:"source"`
|
||||
}
|
||||
|
||||
// CanonicalTranscript is a per-speaker transcript in seriatim's internal model.
|
||||
type CanonicalTranscript struct {
|
||||
Source string `json:"source"`
|
||||
Segments []Segment `json:"segments"`
|
||||
}
|
||||
|
||||
// MergedTranscript is the globally merged in-memory transcript.
|
||||
type MergedTranscript struct {
|
||||
Segments []Segment `json:"segments"`
|
||||
OverlapGroups []OverlapGroup `json:"overlap_groups"`
|
||||
}
|
||||
|
||||
// FinalTranscript is the serialized transcript artifact.
|
||||
type FinalTranscript struct {
|
||||
Metadata OutputMetadata `json:"metadata"`
|
||||
Segments []Segment `json:"segments"`
|
||||
OverlapGroups []OverlapGroup `json:"overlap_groups"`
|
||||
}
|
||||
|
||||
// OutputMetadata records the pipeline configuration that produced an artifact.
|
||||
type OutputMetadata struct {
|
||||
Application string `json:"application"`
|
||||
Version string `json:"version"`
|
||||
InputReader string `json:"input_reader"`
|
||||
InputFiles []string `json:"input_files"`
|
||||
PreprocessingModules []string `json:"preprocessing_modules"`
|
||||
PostprocessingModules []string `json:"postprocessing_modules"`
|
||||
OutputModules []string `json:"output_modules"`
|
||||
}
|
||||
|
||||
// Segment is the canonical transcript segment shape used by the framework.
|
||||
type Segment struct {
|
||||
ID int `json:"id,omitempty"`
|
||||
InternalRef string `json:"internal_ref,omitempty"`
|
||||
Source string `json:"source"`
|
||||
SourceSegmentIndex int `json:"source_segment_index"`
|
||||
Speaker string `json:"speaker"`
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Words []Word `json:"words,omitempty"`
|
||||
OverlapGroupID int `json:"overlap_group_id,omitempty"`
|
||||
}
|
||||
|
||||
// Word preserves optional word-level timing data.
|
||||
type Word struct {
|
||||
Text string `json:"text"`
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Score float64 `json:"score,omitempty"`
|
||||
Speaker string `json:"speaker,omitempty"`
|
||||
}
|
||||
|
||||
// OverlapGroup describes a detected overlapping speech region.
type OverlapGroup struct {
	// ID identifies the group; Segment.OverlapGroupID refers back to it.
	ID int `json:"id"`
	// Start is the overlap region's start time.
	Start float64 `json:"start"`
	// End is the overlap region's end time.
	End float64 `json:"end"`
	// Segments lists the identifiers of the segments participating in the
	// overlap (presumably Segment.InternalRef values — TODO confirm).
	Segments []string `json:"segments"`
	// Speakers lists the speakers involved in the overlap.
	Speakers []string `json:"speakers"`
	// Class categorizes the overlap; valid values are not visible here.
	Class string `json:"class"`
	// Resolution records how the overlap was (or should be) resolved.
	Resolution string `json:"resolution"`
}
|
||||
56
internal/pipeline/interfaces.go
Normal file
56
internal/pipeline/interfaces.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// ModelState identifies which representation a preprocessing module consumes.
type ModelState string

const (
	// StateRaw marks transcript data still in its as-read source form.
	StateRaw ModelState = "raw"
	// StateCanonical marks transcript data normalized to model.CanonicalTranscript.
	StateCanonical ModelState = "canonical"
)
|
||||
|
||||
// PreprocessState carries transcript data as it moves from raw to canonical.
// State records which of Raw or Canonical is currently authoritative; the
// runner requires State == StateCanonical once the preprocessing chain ends.
type PreprocessState struct {
	// State indicates the current representation of the data.
	State ModelState
	// Raw holds the transcripts while State == StateRaw.
	Raw []model.RawTranscript
	// Canonical holds the transcripts once State == StateCanonical.
	Canonical []model.CanonicalTranscript
}
|
||||
|
||||
// InputReader loads external input specs into raw transcript documents.
type InputReader interface {
	// Name returns the module's canonical registry name.
	Name() string
	// Read loads the inputs described by cfg into raw transcripts, returning
	// any report events generated along the way.
	Read(ctx context.Context, cfg config.Config) ([]model.RawTranscript, []report.Event, error)
}
|
||||
|
||||
// Preprocessor transforms preprocessing state. Modules are chained in
// configuration order; the runner verifies each module's Requires() matches
// the state produced by its predecessor (see validatePreprocessors).
type Preprocessor interface {
	// Name returns the module's canonical registry name.
	Name() string
	// Requires reports the state the module expects as input.
	Requires() ModelState
	// Produces reports the state the module leaves the data in.
	Produces() ModelState
	// Process transforms the preprocessing state, returning the new state
	// and any report events generated.
	Process(ctx context.Context, in PreprocessState, cfg config.Config) (PreprocessState, []report.Event, error)
}
|
||||
|
||||
// Merger combines canonical transcripts into one merged transcript.
type Merger interface {
	// Name returns the module's canonical registry name.
	Name() string
	// Merge combines the canonical transcripts into a single merged
	// transcript, returning any report events generated.
	Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error)
}
|
||||
|
||||
// Postprocessor transforms a merged transcript. Modules are chained in
// configuration order by the runner.
type Postprocessor interface {
	// Name returns the module's canonical registry name.
	Name() string
	// Process transforms the merged transcript, returning the new transcript
	// and any report events generated.
	Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error)
}
|
||||
|
||||
// OutputWriter emits final artifacts.
type OutputWriter interface {
	// Name returns the module's canonical registry name.
	Name() string
	// Write emits the final transcript and report. Note: the report passed
	// here does not include events produced by the output stage itself —
	// the runner rebuilds the report after all writers run.
	Write(ctx context.Context, out model.FinalTranscript, rpt report.Report, cfg config.Config) ([]report.Event, error)
}
|
||||
98
internal/pipeline/registry.go
Normal file
98
internal/pipeline/registry.go
Normal file
@@ -0,0 +1,98 @@
|
||||
package pipeline
|
||||
|
||||
import "fmt"
|
||||
|
||||
// Registry stores all built-in modules addressable by canonical name.
// There is exactly one merger slot; readers, preprocessors, postprocessors,
// and output writers are keyed by module name. Registry is not safe for
// concurrent registration.
type Registry struct {
	// inputReaders maps reader name -> InputReader.
	inputReaders map[string]InputReader
	// preprocessors maps module name -> Preprocessor.
	preprocessors map[string]Preprocessor
	// postprocessors maps module name -> Postprocessor.
	postprocessors map[string]Postprocessor
	// outputWriters maps module name -> OutputWriter.
	outputWriters map[string]OutputWriter
	// merger is the single registered merger implementation (nil if unset).
	merger Merger
}
|
||||
|
||||
// NewRegistry creates an empty module registry.
|
||||
func NewRegistry() *Registry {
|
||||
return &Registry{
|
||||
inputReaders: make(map[string]InputReader),
|
||||
preprocessors: make(map[string]Preprocessor),
|
||||
postprocessors: make(map[string]Postprocessor),
|
||||
outputWriters: make(map[string]OutputWriter),
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterInputReader registers an input reader by name.
|
||||
func (r *Registry) RegisterInputReader(module InputReader) {
|
||||
r.inputReaders[module.Name()] = module
|
||||
}
|
||||
|
||||
// RegisterPreprocessor registers a preprocessing module by name.
|
||||
func (r *Registry) RegisterPreprocessor(module Preprocessor) {
|
||||
r.preprocessors[module.Name()] = module
|
||||
}
|
||||
|
||||
// RegisterPostprocessor registers a postprocessing module by name.
|
||||
func (r *Registry) RegisterPostprocessor(module Postprocessor) {
|
||||
r.postprocessors[module.Name()] = module
|
||||
}
|
||||
|
||||
// RegisterOutputWriter registers an output writer by name.
|
||||
func (r *Registry) RegisterOutputWriter(module OutputWriter) {
|
||||
r.outputWriters[module.Name()] = module
|
||||
}
|
||||
|
||||
// RegisterMerger registers the merger implementation. Only one merger is
// held at a time; a later call silently replaces the earlier one.
func (r *Registry) RegisterMerger(module Merger) {
	r.merger = module
}
|
||||
|
||||
// resolveInputReader looks up the input reader registered under name,
// returning an error naming the unknown module if none is registered.
func (r *Registry) resolveInputReader(name string) (InputReader, error) {
	module, ok := r.inputReaders[name]
	if !ok {
		return nil, fmt.Errorf("unknown input reader %q", name)
	}
	return module, nil
}
|
||||
|
||||
// resolvePreprocessors looks up each named preprocessing module in
// configuration order, failing fast on the first unknown name.
func (r *Registry) resolvePreprocessors(names []string) ([]Preprocessor, error) {
	modules := make([]Preprocessor, 0, len(names))
	for _, name := range names {
		module, ok := r.preprocessors[name]
		if !ok {
			return nil, fmt.Errorf("unknown preprocessing module %q", name)
		}
		modules = append(modules, module)
	}
	return modules, nil
}
|
||||
|
||||
// resolvePostprocessors looks up each named postprocessing module in
// configuration order, failing fast on the first unknown name.
func (r *Registry) resolvePostprocessors(names []string) ([]Postprocessor, error) {
	modules := make([]Postprocessor, 0, len(names))
	for _, name := range names {
		module, ok := r.postprocessors[name]
		if !ok {
			return nil, fmt.Errorf("unknown postprocessing module %q", name)
		}
		modules = append(modules, module)
	}
	return modules, nil
}
|
||||
|
||||
// resolveOutputWriters looks up each named output writer in configuration
// order, failing fast on the first unknown name.
func (r *Registry) resolveOutputWriters(names []string) ([]OutputWriter, error) {
	modules := make([]OutputWriter, 0, len(names))
	for _, name := range names {
		module, ok := r.outputWriters[name]
		if !ok {
			return nil, fmt.Errorf("unknown output module %q", name)
		}
		modules = append(modules, module)
	}
	return modules, nil
}
|
||||
|
||||
// resolveMerger returns the registered merger, or an error if none was
// registered via RegisterMerger.
func (r *Registry) resolveMerger() (Merger, error) {
	if r.merger == nil {
		return nil, fmt.Errorf("no merger registered")
	}
	return r.merger, nil
}
|
||||
176
internal/pipeline/runner.go
Normal file
176
internal/pipeline/runner.go
Normal file
@@ -0,0 +1,176 @@
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
const (
	// applicationName is stamped into output and report metadata.
	applicationName = "seriatim"
	// version is stamped into metadata. "dev" looks like a placeholder —
	// presumably overridden at build time (e.g. via -ldflags); TODO confirm.
	version = "dev"
)
|
||||
|
||||
// Run validates module composition, executes the pipeline, and emits outputs.
//
// Stages run in order: input read -> preprocessing chain -> merge ->
// postprocessing chain -> output writers -> optional report file. Events
// from every stage accumulate into the final report. Any stage error aborts
// the run immediately.
//
// NOTE(review): on error the report file is never written, so events
// collected before the failure are lost — confirm that is intended.
func Run(ctx context.Context, cfg config.Config, registry *Registry) error {
	// Resolve and validate the full module composition up front so a
	// misconfiguration fails before any work is done.
	plan, err := resolvePlan(cfg, registry)
	if err != nil {
		return err
	}

	events := make([]report.Event, 0)

	raw, newEvents, err := plan.inputReader.Read(ctx, cfg)
	if err != nil {
		return err
	}
	events = append(events, newEvents...)

	// Thread the data through the preprocessing chain, starting in raw state.
	state := PreprocessState{
		State: StateRaw,
		Raw:   raw,
	}
	for _, module := range plan.preprocessors {
		state, newEvents, err = module.Process(ctx, state, cfg)
		if err != nil {
			return err
		}
		events = append(events, newEvents...)
	}
	// Runtime re-check of what validatePreprocessors verified statically:
	// modules must actually have produced canonical data.
	if state.State != StateCanonical {
		return fmt.Errorf("preprocessing ended in state %q; expected %q", state.State, StateCanonical)
	}

	merged, newEvents, err := plan.merger.Merge(ctx, state.Canonical, cfg)
	if err != nil {
		return err
	}
	events = append(events, newEvents...)

	for _, module := range plan.postprocessors {
		merged, newEvents, err = module.Process(ctx, merged, cfg)
		if err != nil {
			return err
		}
		events = append(events, newEvents...)
	}

	final := finalizeTranscript(cfg, merged)
	// This report snapshot excludes the output writers' own events; writers
	// receive the pipeline's state as of the end of postprocessing.
	rpt := finalizeReport(cfg, events)

	for _, module := range plan.outputWriters {
		newEvents, err = module.Write(ctx, final, rpt, cfg)
		if err != nil {
			return err
		}
		events = append(events, newEvents...)
	}

	// Rebuild the report to also include events emitted by the writers
	// before persisting it.
	rpt = finalizeReport(cfg, events)
	if cfg.ReportFile != "" {
		if err := report.WriteJSON(cfg.ReportFile, rpt); err != nil {
			return err
		}
	}

	return nil
}
|
||||
|
||||
// executionPlan is the fully resolved, validated set of modules for one run,
// in execution order. Built by resolvePlan; consumed by Run.
type executionPlan struct {
	inputReader    InputReader
	preprocessors  []Preprocessor
	merger         Merger
	postprocessors []Postprocessor
	outputWriters  []OutputWriter
}
|
||||
|
||||
// resolvePlan resolves every configured module name against the registry and
// statically validates the preprocessing chain's state transitions. It fails
// fast on the first unknown module or invalid chain, so Run never starts
// with a broken composition.
func resolvePlan(cfg config.Config, registry *Registry) (executionPlan, error) {
	inputReader, err := registry.resolveInputReader(cfg.InputReader)
	if err != nil {
		return executionPlan{}, err
	}

	preprocessors, err := registry.resolvePreprocessors(cfg.PreprocessingModules)
	if err != nil {
		return executionPlan{}, err
	}
	// Check Requires()/Produces() compatibility before any module runs.
	if err := validatePreprocessors(preprocessors); err != nil {
		return executionPlan{}, err
	}

	merger, err := registry.resolveMerger()
	if err != nil {
		return executionPlan{}, err
	}

	postprocessors, err := registry.resolvePostprocessors(cfg.PostprocessingModules)
	if err != nil {
		return executionPlan{}, err
	}

	outputWriters, err := registry.resolveOutputWriters(cfg.OutputModules)
	if err != nil {
		return executionPlan{}, err
	}

	return executionPlan{
		inputReader:    inputReader,
		preprocessors:  preprocessors,
		merger:         merger,
		postprocessors: postprocessors,
		outputWriters:  outputWriters,
	}, nil
}
|
||||
|
||||
func validatePreprocessors(modules []Preprocessor) error {
|
||||
state := StateRaw
|
||||
for _, module := range modules {
|
||||
if module.Requires() != state {
|
||||
return fmt.Errorf("preprocessing module %q requires state %q but current state is %q", module.Name(), module.Requires(), state)
|
||||
}
|
||||
state = module.Produces()
|
||||
}
|
||||
if state != StateCanonical {
|
||||
return fmt.Errorf("preprocessing chain ends in state %q; expected %q", state, StateCanonical)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func finalizeTranscript(cfg config.Config, merged model.MergedTranscript) model.FinalTranscript {
|
||||
segments := make([]model.Segment, len(merged.Segments))
|
||||
copy(segments, merged.Segments)
|
||||
overlapGroups := make([]model.OverlapGroup, len(merged.OverlapGroups))
|
||||
copy(overlapGroups, merged.OverlapGroups)
|
||||
|
||||
return model.FinalTranscript{
|
||||
Metadata: model.OutputMetadata{
|
||||
Application: applicationName,
|
||||
Version: version,
|
||||
InputReader: cfg.InputReader,
|
||||
InputFiles: append([]string(nil), cfg.InputFiles...),
|
||||
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
||||
PostprocessingModules: append([]string(nil), cfg.PostprocessingModules...),
|
||||
OutputModules: append([]string(nil), cfg.OutputModules...),
|
||||
},
|
||||
Segments: segments,
|
||||
OverlapGroups: overlapGroups,
|
||||
}
|
||||
}
|
||||
|
||||
func finalizeReport(cfg config.Config, events []report.Event) report.Report {
|
||||
return report.Report{
|
||||
Metadata: report.Metadata{
|
||||
Application: applicationName,
|
||||
Version: version,
|
||||
InputReader: cfg.InputReader,
|
||||
InputFiles: append([]string(nil), cfg.InputFiles...),
|
||||
PreprocessingModules: append([]string(nil), cfg.PreprocessingModules...),
|
||||
PostprocessingModules: append([]string(nil), cfg.PostprocessingModules...),
|
||||
OutputModules: append([]string(nil), cfg.OutputModules...),
|
||||
},
|
||||
Events: append([]report.Event(nil), events...),
|
||||
}
|
||||
}
|
||||
64
internal/report/report.go
Normal file
64
internal/report/report.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
)
|
||||
|
||||
// Severity classifies report events.
type Severity string

const (
	// SeverityInfo marks an informational event (normal pipeline activity).
	SeverityInfo Severity = "info"
	// SeverityWarning marks a finding that did not stop the pipeline.
	SeverityWarning Severity = "warning"
	// SeverityCorrected marks a finding that was automatically corrected.
	SeverityCorrected Severity = "corrected"
	// SeverityError marks an error-level finding.
	SeverityError Severity = "error"
)
|
||||
|
||||
// Event records a validation finding, correction, or pipeline action.
type Event struct {
	// Severity classifies the event (info, warning, corrected, error).
	Severity Severity `json:"severity"`
	// Stage names the pipeline stage that emitted the event (e.g. "input").
	Stage string `json:"stage"`
	// Module names the module that emitted the event.
	Module string `json:"module"`
	// Message is the human-readable description of the event.
	Message string `json:"message"`
}
|
||||
|
||||
// Report is the deterministic report artifact emitted by the framework.
type Report struct {
	// Metadata records the pipeline configuration that produced the report.
	Metadata Metadata `json:"metadata"`
	// Events lists all pipeline events in the order they were emitted.
	Events []Event `json:"events"`
}
|
||||
|
||||
// Metadata records the pipeline configuration that produced the report.
// NOTE(review): this mirrors model.OutputMetadata field-for-field; consider
// sharing one definition if the two are meant to stay in lockstep.
type Metadata struct {
	// Application is the producing application's name.
	Application string `json:"application"`
	// Version is the producing application's version string.
	Version string `json:"version"`
	// InputReader names the input reader module used for the run.
	InputReader string `json:"input_reader"`
	// InputFiles lists the input file specs supplied to the reader.
	InputFiles []string `json:"input_files"`
	// PreprocessingModules lists preprocessing module names in execution order.
	PreprocessingModules []string `json:"preprocessing_modules"`
	// PostprocessingModules lists postprocessing module names in execution order.
	PostprocessingModules []string `json:"postprocessing_modules"`
	// OutputModules lists output writer module names.
	OutputModules []string `json:"output_modules"`
}
|
||||
|
||||
// Info constructs an informational report event.
|
||||
func Info(stage string, module string, message string) Event {
|
||||
return Event{
|
||||
Severity: SeverityInfo,
|
||||
Stage: stage,
|
||||
Module: module,
|
||||
Message: message,
|
||||
}
|
||||
}
|
||||
|
||||
// WriteJSON writes a deterministic JSON report.
|
||||
func WriteJSON(path string, rpt Report) error {
|
||||
file, err := os.Create(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
enc := json.NewEncoder(file)
|
||||
enc.SetIndent("", " ")
|
||||
return enc.Encode(rpt)
|
||||
}
|
||||
Reference in New Issue
Block a user