Implemented the initial Go framework
This commit is contained in:
39
internal/cli/merge.go
Normal file
39
internal/cli/merge.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/builtin"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
)
|
||||
|
||||
func newMergeCommand() *cobra.Command {
|
||||
var opts config.MergeOptions
|
||||
|
||||
cmd := &cobra.Command{
|
||||
Use: "merge",
|
||||
Short: "Run the transcript merge pipeline",
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
cfg, err := config.NewMergeConfig(opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return pipeline.Run(cmd.Context(), cfg, builtin.NewRegistry())
|
||||
},
|
||||
}
|
||||
|
||||
flags := cmd.Flags()
|
||||
flags.StringArrayVar(&opts.InputFiles, "input-file", nil, "input transcript file; may be repeated")
|
||||
flags.StringVar(&opts.OutputFile, "output-file", "", "output transcript JSON file")
|
||||
flags.StringVar(&opts.ReportFile, "report-file", "", "optional report JSON file")
|
||||
flags.StringVar(&opts.SpeakersFile, "speakers", "", "speaker map file")
|
||||
flags.StringVar(&opts.AutocorrectFile, "autocorrect", "", "autocorrect rules file")
|
||||
flags.StringVar(&opts.InputReader, "input-reader", config.DefaultInputReader, "input reader module")
|
||||
flags.StringVar(&opts.OutputModules, "output-modules", config.DefaultOutputModules, "comma-separated output modules")
|
||||
flags.StringVar(&opts.PreprocessingModules, "preprocessing-modules", config.DefaultPreprocessingModules, "comma-separated preprocessing modules")
|
||||
flags.StringVar(&opts.PostprocessingModules, "postprocessing-modules", config.DefaultPostprocessingModules, "comma-separated postprocessing modules")
|
||||
|
||||
return cmd
|
||||
}
|
||||
282
internal/cli/merge_test.go
Normal file
282
internal/cli/merge_test.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeFile(t, dir, "a.json")
|
||||
inputB := writeFile(t, dir, "b.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
reportPath := filepath.Join(dir, "report.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", inputB,
|
||||
"--input-file", inputA,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
"--report-file", reportPath,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
outputBytes, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
t.Fatalf("read output bytes: %v", err)
|
||||
}
|
||||
outputJSON := string(outputBytes)
|
||||
if !strings.Contains(outputJSON, `"segments": []`) {
|
||||
t.Fatalf("expected segments to serialize as an empty array, got:\n%s", outputJSON)
|
||||
}
|
||||
if !strings.Contains(outputJSON, `"overlap_groups": []`) {
|
||||
t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
|
||||
}
|
||||
if transcript.Metadata.Application != "seriatim" {
|
||||
t.Fatalf("unexpected application metadata: %q", transcript.Metadata.Application)
|
||||
}
|
||||
if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
|
||||
t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
|
||||
}
|
||||
if len(transcript.Segments) != 0 {
|
||||
t.Fatalf("expected placeholder output to contain no segments, got %d", len(transcript.Segments))
|
||||
}
|
||||
if len(transcript.OverlapGroups) != 0 {
|
||||
t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
||||
}
|
||||
|
||||
var rpt report.Report
|
||||
readJSON(t, reportPath, &rpt)
|
||||
gotModules := make([]string, 0, len(rpt.Events))
|
||||
for _, event := range rpt.Events {
|
||||
gotModules = append(gotModules, event.Module)
|
||||
}
|
||||
wantModules := []string{
|
||||
"json-files",
|
||||
"validate-raw",
|
||||
"normalize-speakers",
|
||||
"trim-text",
|
||||
"placeholder-merger",
|
||||
"detect-overlaps",
|
||||
"resolve-overlaps",
|
||||
"assign-ids",
|
||||
"validate-output",
|
||||
"json",
|
||||
}
|
||||
if !equalStrings(gotModules, wantModules) {
|
||||
t.Fatalf("report event order mismatch:\ngot %v\nwant %v", gotModules, wantModules)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
args []string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "input reader",
|
||||
args: []string{"--input-reader", "missing-reader"},
|
||||
want: `unknown input reader "missing-reader"`,
|
||||
},
|
||||
{
|
||||
name: "preprocessing",
|
||||
args: []string{"--preprocessing-modules", "validate-raw,missing-module"},
|
||||
want: `unknown preprocessing module "missing-module"`,
|
||||
},
|
||||
{
|
||||
name: "postprocessing",
|
||||
args: []string{"--postprocessing-modules", "missing-module"},
|
||||
want: `unknown postprocessing module "missing-module"`,
|
||||
},
|
||||
{
|
||||
name: "output",
|
||||
args: []string{"--output-modules", "missing-module"},
|
||||
want: `unknown output module "missing-module"`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
args := []string{
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
}
|
||||
args = append(args, test.args...)
|
||||
|
||||
err := executeMerge(args...)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), test.want) {
|
||||
t.Fatalf("expected error to contain %q, got %q", test.want, err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--output-file", output,
|
||||
"--preprocessing-modules", "trim-text,validate-raw",
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), `requires state "canonical"`) {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", filepath.Join(dir, "missing.json"),
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--input-file") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--speakers is required") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
"--preprocessing-modules", "validate-raw,normalize-speakers,autocorrect",
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "--autocorrect is required") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutputJSONIsByteStable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeFile(t, dir, "a.json")
|
||||
inputB := writeFile(t, dir, "b.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
outputA := filepath.Join(dir, "merged-a.json")
|
||||
outputB := filepath.Join(dir, "merged-b.json")
|
||||
|
||||
args := []string{
|
||||
"--input-file", inputB,
|
||||
"--input-file", inputA,
|
||||
"--speakers", speakers,
|
||||
}
|
||||
|
||||
err := executeMerge(append(append([]string(nil), args...), "--output-file", outputA)...)
|
||||
if err != nil {
|
||||
t.Fatalf("first merge failed: %v", err)
|
||||
}
|
||||
err = executeMerge(append(append([]string(nil), args...), "--output-file", outputB)...)
|
||||
if err != nil {
|
||||
t.Fatalf("second merge failed: %v", err)
|
||||
}
|
||||
|
||||
first, err := os.ReadFile(outputA)
|
||||
if err != nil {
|
||||
t.Fatalf("read first output: %v", err)
|
||||
}
|
||||
second, err := os.ReadFile(outputB)
|
||||
if err != nil {
|
||||
t.Fatalf("read second output: %v", err)
|
||||
}
|
||||
if string(first) != string(second) {
|
||||
t.Fatalf("expected byte-stable output\nfirst:\n%s\nsecond:\n%s", first, second)
|
||||
}
|
||||
}
|
||||
|
||||
func executeMerge(args ...string) error {
|
||||
cmd := NewRootCommand()
|
||||
cmd.SetArgs(append([]string{"merge"}, args...))
|
||||
return cmd.Execute()
|
||||
}
|
||||
|
||||
// writeFile creates the file name inside dir with the contents "{}\n" and
// mode 0o600, then returns the joined path. Any write error fails the test
// immediately.
func writeFile(t *testing.T, dir string, name string) string {
	t.Helper()

	target := filepath.Join(dir, name)
	contents := []byte("{}\n")

	writeErr := os.WriteFile(target, contents, 0o600)
	if writeErr != nil {
		t.Fatalf("write file: %v", writeErr)
	}

	return target
}
|
||||
|
||||
// readJSON reads the file at path and unmarshals its JSON contents into
// target. A failure to read or to decode fails the test immediately, with the
// path included in the message.
func readJSON(t *testing.T, path string, target any) {
	t.Helper()

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		t.Fatalf("read %s: %v", path, readErr)
	}

	decodeErr := json.Unmarshal(raw, target)
	if decodeErr != nil {
		t.Fatalf("unmarshal %s: %v", path, decodeErr)
	}
}
|
||||
|
||||
// equalStrings reports whether left and right have the same length and the
// same string at every index. A nil slice and an empty slice compare equal.
func equalStrings(left []string, right []string) bool {
	differs := len(left) != len(right)
	// The loop only runs while no difference has been found, so right is
	// never indexed when the lengths disagree.
	for i := 0; !differs && i < len(left); i++ {
		differs = left[i] != right[i]
	}
	return !differs
}
|
||||
18
internal/cli/root.go
Normal file
18
internal/cli/root.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// NewRootCommand builds the seriatim command tree.
|
||||
func NewRootCommand() *cobra.Command {
|
||||
cmd := &cobra.Command{
|
||||
Use: "seriatim",
|
||||
Short: "Merge per-speaker transcripts into a chronological transcript",
|
||||
SilenceErrors: true,
|
||||
SilenceUsage: true,
|
||||
}
|
||||
|
||||
cmd.AddCommand(newMergeCommand())
|
||||
return cmd
|
||||
}
|
||||
Reference in New Issue
Block a user