Implemented an initial transcript merge stage
This commit is contained in:
5
go.mod
5
go.mod
@@ -2,7 +2,10 @@ module gitea.maximumdirect.net/eric/seriatim
|
|||||||
|
|
||||||
go 1.25
|
go 1.25
|
||||||
|
|
||||||
require github.com/spf13/cobra v1.10.1
|
require (
|
||||||
|
github.com/spf13/cobra v1.10.1
|
||||||
|
gopkg.in/yaml.v3 v3.0.1
|
||||||
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -6,5 +6,7 @@ github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s=
|
|||||||
github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0=
|
github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0=
|
||||||
github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
|
github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
|
||||||
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
@@ -2,7 +2,9 @@ package builtin
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
@@ -22,10 +24,92 @@ func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.Raw
|
|||||||
|
|
||||||
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
|
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
|
||||||
for _, inputFile := range cfg.InputFiles {
|
for _, inputFile := range cfg.InputFiles {
|
||||||
raw = append(raw, model.RawTranscript{Source: inputFile})
|
transcript, err := readRawTranscript(inputFile)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
raw = append(raw, transcript)
|
||||||
}
|
}
|
||||||
|
|
||||||
return raw, []report.Event{
|
return raw, []report.Event{
|
||||||
report.Info("input", "json-files", fmt.Sprintf("accepted %d input file(s)", len(raw))),
|
report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rawTranscriptFile is the decode target for the top level of an input JSON
// document. Only the "segments" member is captured, and it is held as raw
// JSON so that it can be inspected before being decoded further.
type rawTranscriptFile struct {
	// Segments holds the undecoded value of the top-level "segments" member;
	// it stays nil when the member is absent from the document.
	Segments json.RawMessage `json:"segments"`
}
|
||||||
|
|
||||||
|
// rawSegmentFile is the decode target for one element of the "segments"
// array. Each member is held as raw JSON so that its presence and type can be
// checked individually; members not listed here are not captured.
type rawSegmentFile struct {
	// Start is the undecoded "start" member.
	Start json.RawMessage `json:"start"`
	// End is the undecoded "end" member.
	End json.RawMessage `json:"end"`
	// Text is the undecoded "text" member.
	Text json.RawMessage `json:"text"`
}
|
||||||
|
|
||||||
|
func readRawTranscript(path string) (model.RawTranscript, error) {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("read input file %q: %w", path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var parsed rawTranscriptFile
|
||||||
|
if err := json.Unmarshal(data, &parsed); err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("parse input file %q: %w", path, err)
|
||||||
|
}
|
||||||
|
if parsed.Segments == nil || isJSONNull(parsed.Segments) {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q must contain top-level segments array", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
var rawSegments []rawSegmentFile
|
||||||
|
if err := json.Unmarshal(parsed.Segments, &rawSegments); err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
segments := make([]model.RawSegment, 0, len(rawSegments))
|
||||||
|
for index, segment := range rawSegments {
|
||||||
|
if segment.Start == nil || isJSONNull(segment.Start) {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric start", path, index)
|
||||||
|
}
|
||||||
|
if segment.End == nil || isJSONNull(segment.End) {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric end", path, index)
|
||||||
|
}
|
||||||
|
if segment.Text == nil || isJSONNull(segment.Text) {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing string text", path, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
var start float64
|
||||||
|
if err := json.Unmarshal(segment.Start, &start); err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d start must be numeric", path, index)
|
||||||
|
}
|
||||||
|
var end float64
|
||||||
|
if err := json.Unmarshal(segment.End, &end); err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d end must be numeric", path, index)
|
||||||
|
}
|
||||||
|
var text string
|
||||||
|
if err := json.Unmarshal(segment.Text, &text); err != nil {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d text must be a string", path, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
if start < 0 {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has negative start", path, index)
|
||||||
|
}
|
||||||
|
if end < start {
|
||||||
|
return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has end before start", path, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
segments = append(segments, model.RawSegment{
|
||||||
|
Start: start,
|
||||||
|
End: end,
|
||||||
|
Text: text,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return model.RawTranscript{
|
||||||
|
Source: path,
|
||||||
|
Segments: segments,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// isJSONNull reports whether value is exactly the four bytes of the JSON
// literal null. No whitespace trimming is performed, and a nil or empty value
// is not considered null.
func isJSONNull(value json.RawMessage) bool {
	switch string(value) {
	case "null":
		return true
	default:
		return false
	}
}
|
||||||
|
|||||||
@@ -3,11 +3,13 @@ package builtin
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||||
|
"gitea.maximumdirect.net/eric/seriatim/internal/speaker"
|
||||||
)
|
)
|
||||||
|
|
||||||
type noopPreprocessor struct {
|
type noopPreprocessor struct {
|
||||||
@@ -42,6 +44,39 @@ func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessSta
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type trimText struct{}
|
||||||
|
|
||||||
|
func (trimText) Name() string {
|
||||||
|
return "trim-text"
|
||||||
|
}
|
||||||
|
|
||||||
|
func (trimText) Requires() pipeline.ModelState {
|
||||||
|
return pipeline.StateCanonical
|
||||||
|
}
|
||||||
|
|
||||||
|
func (trimText) Produces() pipeline.ModelState {
|
||||||
|
return pipeline.StateCanonical
|
||||||
|
}
|
||||||
|
|
||||||
|
func (trimText) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return pipeline.PreprocessState{}, nil, err
|
||||||
|
}
|
||||||
|
if in.State != pipeline.StateCanonical {
|
||||||
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "trim-text", pipeline.StateCanonical, in.State)
|
||||||
|
}
|
||||||
|
|
||||||
|
for transcriptIndex := range in.Canonical {
|
||||||
|
for segmentIndex := range in.Canonical[transcriptIndex].Segments {
|
||||||
|
in.Canonical[transcriptIndex].Segments[segmentIndex].Text = strings.TrimSpace(in.Canonical[transcriptIndex].Segments[segmentIndex].Text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return in, []report.Event{
|
||||||
|
report.Info("preprocessing", "trim-text", "trimmed canonical segment text"),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
type normalizeSpeakers struct{}
|
type normalizeSpeakers struct{}
|
||||||
|
|
||||||
func (normalizeSpeakers) Name() string {
|
func (normalizeSpeakers) Name() string {
|
||||||
@@ -64,11 +99,33 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat
|
|||||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
speakers, err := speaker.LoadMap(cfg.SpeakersFile)
|
||||||
|
if err != nil {
|
||||||
|
return pipeline.PreprocessState{}, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
||||||
for _, raw := range in.Raw {
|
for _, raw := range in.Raw {
|
||||||
|
canonicalSpeaker, err := speakers.SpeakerForSource(raw.Source)
|
||||||
|
if err != nil {
|
||||||
|
return pipeline.PreprocessState{}, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
segments := make([]model.Segment, 0, len(raw.Segments))
|
||||||
|
for index, rawSegment := range raw.Segments {
|
||||||
|
segments = append(segments, model.Segment{
|
||||||
|
Source: raw.Source,
|
||||||
|
SourceSegmentIndex: index,
|
||||||
|
Speaker: canonicalSpeaker,
|
||||||
|
Start: rawSegment.Start,
|
||||||
|
End: rawSegment.End,
|
||||||
|
Text: rawSegment.Text,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
canonical = append(canonical, model.CanonicalTranscript{
|
canonical = append(canonical, model.CanonicalTranscript{
|
||||||
Source: raw.Source,
|
Source: raw.Source,
|
||||||
Segments: nil,
|
Segments: segments,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,6 +134,6 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat
|
|||||||
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
||||||
Canonical: canonical,
|
Canonical: canonical,
|
||||||
}, []report.Event{
|
}, []report.Event{
|
||||||
report.Info("preprocessing", "normalize-speakers", "created placeholder canonical transcript(s)"),
|
report.Info("preprocessing", "normalize-speakers", "created canonical transcript(s) from raw input"),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ func NewRegistry() *pipeline.Registry {
|
|||||||
registry.RegisterInputReader(jsonFilesReader{})
|
registry.RegisterInputReader(jsonFilesReader{})
|
||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
||||||
registry.RegisterPreprocessor(normalizeSpeakers{})
|
registry.RegisterPreprocessor(normalizeSpeakers{})
|
||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "trim-text", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
registry.RegisterPreprocessor(trimText{})
|
||||||
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
||||||
registry.RegisterMerger(placeholderMerger{})
|
registry.RegisterMerger(placeholderMerger{})
|
||||||
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
||||||
|
|||||||
@@ -11,11 +11,25 @@ import (
|
|||||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
inputA := writeFile(t, dir, "a.json")
|
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||||
inputB := writeFile(t, dir, "b.json")
|
"segments": [
|
||||||
speakers := writeFile(t, dir, "speakers.yml")
|
{"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored"}]},
|
||||||
|
{"start": 1, "end": 2, "text": "first a"}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||||
|
"segments": [
|
||||||
|
{"start": 5, "end": 6, "text": "first b"}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
a.json:
|
||||||
|
speaker: Alice
|
||||||
|
b.json:
|
||||||
|
speaker: Bob
|
||||||
|
`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
reportPath := filepath.Join(dir, "report.json")
|
reportPath := filepath.Join(dir, "report.json")
|
||||||
|
|
||||||
@@ -37,9 +51,6 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
|||||||
t.Fatalf("read output bytes: %v", err)
|
t.Fatalf("read output bytes: %v", err)
|
||||||
}
|
}
|
||||||
outputJSON := string(outputBytes)
|
outputJSON := string(outputBytes)
|
||||||
if !strings.Contains(outputJSON, `"segments": []`) {
|
|
||||||
t.Fatalf("expected segments to serialize as an empty array, got:\n%s", outputJSON)
|
|
||||||
}
|
|
||||||
if !strings.Contains(outputJSON, `"overlap_groups": []`) {
|
if !strings.Contains(outputJSON, `"overlap_groups": []`) {
|
||||||
t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
|
t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
|
||||||
}
|
}
|
||||||
@@ -49,8 +60,17 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
|||||||
if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
|
if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
|
||||||
t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
|
t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
|
||||||
}
|
}
|
||||||
if len(transcript.Segments) != 0 {
|
if got, want := len(transcript.Segments), 3; got != want {
|
||||||
t.Fatalf("expected placeholder output to contain no segments, got %d", len(transcript.Segments))
|
t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
|
||||||
|
}
|
||||||
|
assertSegment(t, transcript.Segments[0], 1, inputA, 1, "Alice", 1, 2, "first a")
|
||||||
|
assertSegment(t, transcript.Segments[1], 2, inputB, 0, "Bob", 5, 6, "first b")
|
||||||
|
assertSegment(t, transcript.Segments[2], 3, inputA, 0, "Alice", 10, 11, "second a")
|
||||||
|
if strings.Contains(outputJSON, "internal_ref") {
|
||||||
|
t.Fatalf("did not expect internal_ref in output:\n%s", outputJSON)
|
||||||
|
}
|
||||||
|
if strings.Contains(outputJSON, "words") {
|
||||||
|
t.Fatalf("did not expect words in output:\n%s", outputJSON)
|
||||||
}
|
}
|
||||||
if len(transcript.OverlapGroups) != 0 {
|
if len(transcript.OverlapGroups) != 0 {
|
||||||
t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
||||||
@@ -79,10 +99,62 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMergeTieBreakOrder(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||||
|
"segments": [
|
||||||
|
{"start": 1, "end": 4, "text": "a-late-end"},
|
||||||
|
{"start": 1, "end": 2, "text": "a-index-one"}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||||
|
"segments": [
|
||||||
|
{"start": 1, "end": 2, "text": "b-same-time"}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
a.json:
|
||||||
|
speaker: Alice
|
||||||
|
b.json:
|
||||||
|
speaker: Bob
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", inputB,
|
||||||
|
"--input-file", inputA,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("merge failed: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var transcript model.FinalTranscript
|
||||||
|
readJSON(t, output, &transcript)
|
||||||
|
got := []string{
|
||||||
|
transcript.Segments[0].Text,
|
||||||
|
transcript.Segments[1].Text,
|
||||||
|
transcript.Segments[2].Text,
|
||||||
|
}
|
||||||
|
want := []string{"a-index-one", "b-same-time", "a-late-end"}
|
||||||
|
if !equalStrings(got, want) {
|
||||||
|
t.Fatalf("tie-break order mismatch: got %v want %v", got, want)
|
||||||
|
}
|
||||||
|
for index, segment := range transcript.Segments {
|
||||||
|
if segment.ID != index+1 {
|
||||||
|
t.Fatalf("segment %d has id %d; want %d", index, segment.ID, index+1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeFile(t, dir, "input.json")
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
speakers := writeFile(t, dir, "speakers.yml")
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
@@ -134,7 +206,7 @@ func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
|||||||
|
|
||||||
func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeFile(t, dir, "input.json")
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
err := executeMerge(
|
err := executeMerge(
|
||||||
@@ -152,7 +224,10 @@ func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
|||||||
|
|
||||||
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
speakers := writeFile(t, dir, "speakers.yml")
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
missing.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
err := executeMerge(
|
err := executeMerge(
|
||||||
@@ -170,7 +245,7 @@ func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
|||||||
|
|
||||||
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeFile(t, dir, "input.json")
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
err := executeMerge(
|
err := executeMerge(
|
||||||
@@ -187,8 +262,11 @@ func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
|||||||
|
|
||||||
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
input := writeFile(t, dir, "input.json")
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
speakers := writeFile(t, dir, "speakers.yml")
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
output := filepath.Join(dir, "merged.json")
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
err := executeMerge(
|
err := executeMerge(
|
||||||
@@ -207,9 +285,14 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
|||||||
|
|
||||||
func TestOutputJSONIsByteStable(t *testing.T) {
|
func TestOutputJSONIsByteStable(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
inputA := writeFile(t, dir, "a.json")
|
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
||||||
inputB := writeFile(t, dir, "b.json")
|
inputB := writeJSONFile(t, dir, "b.json", `{"segments":[{"start":1,"end":2,"text":"b"}]}`)
|
||||||
speakers := writeFile(t, dir, "speakers.yml")
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
a.json:
|
||||||
|
speaker: Alice
|
||||||
|
b.json:
|
||||||
|
speaker: Bob
|
||||||
|
`)
|
||||||
outputA := filepath.Join(dir, "merged-a.json")
|
outputA := filepath.Join(dir, "merged-a.json")
|
||||||
outputB := filepath.Join(dir, "merged-b.json")
|
outputB := filepath.Join(dir, "merged-b.json")
|
||||||
|
|
||||||
@@ -241,17 +324,192 @@ func TestOutputJSONIsByteStable(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMissingSpeakerMappingFails(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
other.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), `speaker map has no entry for input basename "input.json"`) {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMalformedJSONFails(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{"segments":[`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "parse input file") {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMissingTopLevelSegmentsFails(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", `{}`)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "must contain top-level segments array") {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
json string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "missing start",
|
||||||
|
json: `{"segments":[{"end":1,"text":"x"}]}`,
|
||||||
|
want: "segment 0 missing numeric start",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "wrong typed end",
|
||||||
|
json: `{"segments":[{"start":0,"end":"1","text":"x"}]}`,
|
||||||
|
want: "segment 0 end must be numeric",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "wrong typed text",
|
||||||
|
json: `{"segments":[{"start":0,"end":1,"text":7}]}`,
|
||||||
|
want: "segment 0 text must be a string",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "null text",
|
||||||
|
json: `{"segments":[{"start":0,"end":1,"text":null}]}`,
|
||||||
|
want: "segment 0 missing string text",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", test.json)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), input) {
|
||||||
|
t.Fatalf("expected error to contain source path %q, got %v", input, err)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), test.want) {
|
||||||
|
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInvalidTimingFails(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
json string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "negative start",
|
||||||
|
json: `{"segments":[{"start":-1,"end":1,"text":"x"}]}`,
|
||||||
|
want: "segment 0 has negative start",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "end before start",
|
||||||
|
json: `{"segments":[{"start":2,"end":1,"text":"x"}]}`,
|
||||||
|
want: "segment 0 has end before start",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
t.Run(test.name, func(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
input := writeJSONFile(t, dir, "input.json", test.json)
|
||||||
|
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||||
|
input.json:
|
||||||
|
speaker: Alice
|
||||||
|
`)
|
||||||
|
output := filepath.Join(dir, "merged.json")
|
||||||
|
|
||||||
|
err := executeMerge(
|
||||||
|
"--input-file", input,
|
||||||
|
"--speakers", speakers,
|
||||||
|
"--output-file", output,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), test.want) {
|
||||||
|
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func executeMerge(args ...string) error {
|
func executeMerge(args ...string) error {
|
||||||
cmd := NewRootCommand()
|
cmd := NewRootCommand()
|
||||||
cmd.SetArgs(append([]string{"merge"}, args...))
|
cmd.SetArgs(append([]string{"merge"}, args...))
|
||||||
return cmd.Execute()
|
return cmd.Execute()
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeFile(t *testing.T, dir string, name string) string {
|
// writeJSONFile writes content followed by a newline to dir/name with mode
// 0o600 and returns the resulting path. A write failure aborts the calling
// test via t.Fatalf.
func writeJSONFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()

	target := filepath.Join(dir, name)
	payload := []byte(content + "\n")
	err := os.WriteFile(target, payload, 0o600)
	if err != nil {
		t.Fatalf("write file: %v", err)
	}
	return target
}
|
||||||
|
|
||||||
|
func writeYAMLFile(t *testing.T, dir string, name string, content string) string {
|
||||||
|
t.Helper()
|
||||||
|
|
||||||
|
path := filepath.Join(dir, name)
|
||||||
|
if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
|
||||||
t.Fatalf("write file: %v", err)
|
t.Fatalf("write file: %v", err)
|
||||||
}
|
}
|
||||||
return path
|
return path
|
||||||
@@ -280,3 +538,29 @@ func equalStrings(left []string, right []string) bool {
|
|||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) {
|
||||||
|
t.Helper()
|
||||||
|
|
||||||
|
if segment.ID != id {
|
||||||
|
t.Fatalf("segment ID = %d, want %d", segment.ID, id)
|
||||||
|
}
|
||||||
|
if segment.Source != source {
|
||||||
|
t.Fatalf("segment source = %q, want %q", segment.Source, source)
|
||||||
|
}
|
||||||
|
if segment.SourceSegmentIndex != sourceIndex {
|
||||||
|
t.Fatalf("segment source index = %d, want %d", segment.SourceSegmentIndex, sourceIndex)
|
||||||
|
}
|
||||||
|
if segment.Speaker != speaker {
|
||||||
|
t.Fatalf("segment speaker = %q, want %q", segment.Speaker, speaker)
|
||||||
|
}
|
||||||
|
if segment.Start != start {
|
||||||
|
t.Fatalf("segment start = %f, want %f", segment.Start, start)
|
||||||
|
}
|
||||||
|
if segment.End != end {
|
||||||
|
t.Fatalf("segment end = %f, want %f", segment.End, end)
|
||||||
|
}
|
||||||
|
if segment.Text != text {
|
||||||
|
t.Fatalf("segment text = %q, want %q", segment.Text, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,6 +3,14 @@ package model
|
|||||||
// RawTranscript is a loaded input document before canonical normalization.
|
// RawTranscript is a loaded input document before canonical normalization.
|
||||||
type RawTranscript struct {
|
type RawTranscript struct {
|
||||||
Source string `json:"source"`
|
Source string `json:"source"`
|
||||||
|
Segments []RawSegment `json:"segments"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RawSegment is the supported WhisperX segment subset: a start and end value
// plus the segment text.
type RawSegment struct {
	// Start is the segment's "start" value.
	Start float64 `json:"start"`
	// End is the segment's "end" value.
	End float64 `json:"end"`
	// Text is the segment's "text" value.
	Text string `json:"text"`
}
|
||||||
|
|
||||||
// CanonicalTranscript is a per-speaker transcript in seriatim's internal model.
|
// CanonicalTranscript is a per-speaker transcript in seriatim's internal model.
|
||||||
|
|||||||
69
internal/speaker/map.go
Normal file
69
internal/speaker/map.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package speaker
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Map resolves input file basenames to canonical speaker names.
|
||||||
|
type Map struct {
|
||||||
|
inputs map[string]Input
|
||||||
|
}
|
||||||
|
|
||||||
|
// Input describes one input entry in speakers.yml.
type Input struct {
	// Speaker is the entry's "speaker" value.
	Speaker string `yaml:"speaker"`
}
|
||||||
|
|
||||||
|
type fileSchema struct {
|
||||||
|
Inputs map[string]Input `yaml:"inputs"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadMap parses a speakers.yml file.
|
||||||
|
func LoadMap(path string) (Map, error) {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return Map{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var parsed fileSchema
|
||||||
|
if err := yaml.Unmarshal(data, &parsed); err != nil {
|
||||||
|
return Map{}, fmt.Errorf("parse speaker map %q: %w", path, err)
|
||||||
|
}
|
||||||
|
if len(parsed.Inputs) == 0 {
|
||||||
|
return Map{}, fmt.Errorf("speaker map %q must contain at least one inputs entry", path)
|
||||||
|
}
|
||||||
|
|
||||||
|
inputs := make(map[string]Input, len(parsed.Inputs))
|
||||||
|
for key, input := range parsed.Inputs {
|
||||||
|
basename := filepath.Base(strings.TrimSpace(key))
|
||||||
|
if basename == "." || basename == "" {
|
||||||
|
return Map{}, fmt.Errorf("speaker map %q contains an empty input key", path)
|
||||||
|
}
|
||||||
|
if _, exists := inputs[basename]; exists {
|
||||||
|
return Map{}, fmt.Errorf("speaker map %q contains duplicate basename mapping for %q", path, basename)
|
||||||
|
}
|
||||||
|
|
||||||
|
input.Speaker = strings.TrimSpace(input.Speaker)
|
||||||
|
if input.Speaker == "" {
|
||||||
|
return Map{}, fmt.Errorf("speaker map entry %q must include speaker", basename)
|
||||||
|
}
|
||||||
|
inputs[basename] = input
|
||||||
|
}
|
||||||
|
|
||||||
|
return Map{inputs: inputs}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SpeakerForSource returns the canonical speaker for a transcript source path.
|
||||||
|
func (m Map) SpeakerForSource(source string) (string, error) {
|
||||||
|
basename := filepath.Base(source)
|
||||||
|
input, ok := m.inputs[basename]
|
||||||
|
if !ok {
|
||||||
|
return "", fmt.Errorf("speaker map has no entry for input basename %q", basename)
|
||||||
|
}
|
||||||
|
return input.Speaker, nil
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user