Implemented an initial transcript merge stage
This commit is contained in:
@@ -11,11 +11,25 @@ import (
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
||||
func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeFile(t, dir, "a.json")
|
||||
inputB := writeFile(t, dir, "b.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
"segments": [
|
||||
{"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored"}]},
|
||||
{"start": 1, "end": 2, "text": "first a"}
|
||||
]
|
||||
}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||
"segments": [
|
||||
{"start": 5, "end": 6, "text": "first b"}
|
||||
]
|
||||
}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
a.json:
|
||||
speaker: Alice
|
||||
b.json:
|
||||
speaker: Bob
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
reportPath := filepath.Join(dir, "report.json")
|
||||
|
||||
@@ -37,9 +51,6 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
||||
t.Fatalf("read output bytes: %v", err)
|
||||
}
|
||||
outputJSON := string(outputBytes)
|
||||
if !strings.Contains(outputJSON, `"segments": []`) {
|
||||
t.Fatalf("expected segments to serialize as an empty array, got:\n%s", outputJSON)
|
||||
}
|
||||
if !strings.Contains(outputJSON, `"overlap_groups": []`) {
|
||||
t.Fatalf("expected overlap_groups to serialize as an empty array, got:\n%s", outputJSON)
|
||||
}
|
||||
@@ -49,8 +60,17 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
||||
if got, want := transcript.Metadata.InputFiles, []string{inputA, inputB}; !equalStrings(got, want) {
|
||||
t.Fatalf("input files not sorted deterministically: got %v want %v", got, want)
|
||||
}
|
||||
if len(transcript.Segments) != 0 {
|
||||
t.Fatalf("expected placeholder output to contain no segments, got %d", len(transcript.Segments))
|
||||
if got, want := len(transcript.Segments), 3; got != want {
|
||||
t.Fatalf("expected merged output to contain %d segments, got %d", want, got)
|
||||
}
|
||||
assertSegment(t, transcript.Segments[0], 1, inputA, 1, "Alice", 1, 2, "first a")
|
||||
assertSegment(t, transcript.Segments[1], 2, inputB, 0, "Bob", 5, 6, "first b")
|
||||
assertSegment(t, transcript.Segments[2], 3, inputA, 0, "Alice", 10, 11, "second a")
|
||||
if strings.Contains(outputJSON, "internal_ref") {
|
||||
t.Fatalf("did not expect internal_ref in output:\n%s", outputJSON)
|
||||
}
|
||||
if strings.Contains(outputJSON, "words") {
|
||||
t.Fatalf("did not expect words in output:\n%s", outputJSON)
|
||||
}
|
||||
if len(transcript.OverlapGroups) != 0 {
|
||||
t.Fatalf("expected placeholder output to contain no overlap groups, got %d", len(transcript.OverlapGroups))
|
||||
@@ -79,10 +99,62 @@ func TestMergeWritesPlaceholderOutputAndReport(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeTieBreakOrder(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
"segments": [
|
||||
{"start": 1, "end": 4, "text": "a-late-end"},
|
||||
{"start": 1, "end": 2, "text": "a-index-one"}
|
||||
]
|
||||
}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||
"segments": [
|
||||
{"start": 1, "end": 2, "text": "b-same-time"}
|
||||
]
|
||||
}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
a.json:
|
||||
speaker: Alice
|
||||
b.json:
|
||||
speaker: Bob
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", inputB,
|
||||
"--input-file", inputA,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
got := []string{
|
||||
transcript.Segments[0].Text,
|
||||
transcript.Segments[1].Text,
|
||||
transcript.Segments[2].Text,
|
||||
}
|
||||
want := []string{"a-index-one", "b-same-time", "a-late-end"}
|
||||
if !equalStrings(got, want) {
|
||||
t.Fatalf("tie-break order mismatch: got %v want %v", got, want)
|
||||
}
|
||||
for index, segment := range transcript.Segments {
|
||||
if segment.ID != index+1 {
|
||||
t.Fatalf("segment %d has id %d; want %d", index, segment.ID, index+1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
tests := []struct {
|
||||
@@ -134,7 +206,7 @@ func TestUnknownModulesFailDuringValidation(t *testing.T) {
|
||||
|
||||
func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
@@ -152,7 +224,10 @@ func TestInvalidPreprocessingOrderFails(t *testing.T) {
|
||||
|
||||
func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
missing.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
@@ -170,7 +245,7 @@ func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) {
|
||||
|
||||
func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
@@ -187,8 +262,11 @@ func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) {
|
||||
|
||||
func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeFile(t, dir, "input.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
@@ -207,9 +285,14 @@ func TestAutocorrectRequiresAutocorrectFile(t *testing.T) {
|
||||
|
||||
func TestOutputJSONIsByteStable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeFile(t, dir, "a.json")
|
||||
inputB := writeFile(t, dir, "b.json")
|
||||
speakers := writeFile(t, dir, "speakers.yml")
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{"segments":[{"start":1,"end":2,"text":"b"}]}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
a.json:
|
||||
speaker: Alice
|
||||
b.json:
|
||||
speaker: Bob
|
||||
`)
|
||||
outputA := filepath.Join(dir, "merged-a.json")
|
||||
outputB := filepath.Join(dir, "merged-b.json")
|
||||
|
||||
@@ -241,17 +324,192 @@ func TestOutputJSONIsByteStable(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestMissingSpeakerMappingFails verifies that merge returns an error when the
// speakers file has no entry for the input file's basename. The error text
// must mention the missing basename "input.json".
func TestMissingSpeakerMappingFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
|
||||
// The speakers file only lists other.json, so input.json is unmapped.
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
other.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), `speaker map has no entry for input basename "input.json"`) {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMalformedJSONFails verifies that merge returns an error containing
// "parse input file" when the input file holds truncated JSON.
func TestMalformedJSONFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// The JSON document is cut off after the opening bracket of the segments array.
input := writeJSONFile(t, dir, "input.json", `{"segments":[`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "parse input file") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMissingTopLevelSegmentsFails verifies that merge rejects an input whose
// JSON object has no "segments" key, with an error containing
// "must contain top-level segments array".
func TestMissingTopLevelSegmentsFails(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// Valid JSON, but an empty object: the segments array is absent.
input := writeJSONFile(t, dir, "input.json", `{}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "must contain top-level segments array") {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInvalidSegmentFieldsFailWithSourceAndIndex is a table-driven test for
// segments whose start, end, or text field is missing or has the wrong JSON
// type. Each case runs merge on a single-segment input and expects an error
// that contains both the input file path and a message naming segment 0 and
// the offending field.
func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) {
|
||||
// name labels the subtest, json is the input file content, and want is the
// substring the error must contain.
tests := []struct {
|
||||
name string
|
||||
json string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "missing start",
|
||||
json: `{"segments":[{"end":1,"text":"x"}]}`,
|
||||
want: "segment 0 missing numeric start",
|
||||
},
|
||||
{
|
||||
name: "wrong typed end",
|
||||
json: `{"segments":[{"start":0,"end":"1","text":"x"}]}`,
|
||||
want: "segment 0 end must be numeric",
|
||||
},
|
||||
{
|
||||
name: "wrong typed text",
|
||||
json: `{"segments":[{"start":0,"end":1,"text":7}]}`,
|
||||
want: "segment 0 text must be a string",
|
||||
},
|
||||
// A JSON null for text is expected to be reported as missing rather than
// as a wrong-typed value.
{
|
||||
name: "null text",
|
||||
json: `{"segments":[{"start":0,"end":1,"text":null}]}`,
|
||||
want: "segment 0 missing string text",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "input.json", test.json)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
// The error must identify which input file the bad segment came from.
if !strings.Contains(err.Error(), input) {
|
||||
t.Fatalf("expected error to contain source path %q, got %v", input, err)
|
||||
}
|
||||
// The error must also carry the case-specific segment/field message.
if !strings.Contains(err.Error(), test.want) {
|
||||
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestInvalidTimingFails is a table-driven test for segments with invalid
// timing values: a negative start, and an end earlier than the start. Each
// case runs merge on a single-segment input and expects an error containing
// the case's message for segment 0.
func TestInvalidTimingFails(t *testing.T) {
|
||||
// name labels the subtest, json is the input file content, and want is the
// substring the error must contain.
tests := []struct {
|
||||
name string
|
||||
json string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "negative start",
|
||||
json: `{"segments":[{"start":-1,"end":1,"text":"x"}]}`,
|
||||
want: "segment 0 has negative start",
|
||||
},
|
||||
{
|
||||
name: "end before start",
|
||||
json: `{"segments":[{"start":2,"end":1,"text":"x"}]}`,
|
||||
want: "segment 0 has end before start",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "input.json", test.json)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs:
|
||||
input.json:
|
||||
speaker: Alice
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), test.want) {
|
||||
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func executeMerge(args ...string) error {
|
||||
cmd := NewRootCommand()
|
||||
cmd.SetArgs(append([]string{"merge"}, args...))
|
||||
return cmd.Execute()
|
||||
}
|
||||
|
||||
func writeFile(t *testing.T, dir string, name string) string {
|
||||
// writeJSONFile writes content followed by a newline to name inside dir, using
// file mode 0o600, and returns the resulting path. A write error aborts the
// calling test.
func writeJSONFile(t *testing.T, dir string, name string, content string) string {
	t.Helper()

	target := filepath.Join(dir, name)
	payload := []byte(content + "\n")
	err := os.WriteFile(target, payload, 0o600)
	if err != nil {
		t.Fatalf("write file: %v", err)
	}
	return target
}
|
||||
|
||||
func writeYAMLFile(t *testing.T, dir string, name string, content string) string {
|
||||
t.Helper()
|
||||
|
||||
path := filepath.Join(dir, name)
|
||||
if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
|
||||
t.Fatalf("write file: %v", err)
|
||||
}
|
||||
return path
|
||||
@@ -280,3 +538,29 @@ func equalStrings(left []string, right []string) bool {
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func assertSegment(t *testing.T, segment model.Segment, id int, source string, sourceIndex int, speaker string, start float64, end float64, text string) {
|
||||
t.Helper()
|
||||
|
||||
if segment.ID != id {
|
||||
t.Fatalf("segment ID = %d, want %d", segment.ID, id)
|
||||
}
|
||||
if segment.Source != source {
|
||||
t.Fatalf("segment source = %q, want %q", segment.Source, source)
|
||||
}
|
||||
if segment.SourceSegmentIndex != sourceIndex {
|
||||
t.Fatalf("segment source index = %d, want %d", segment.SourceSegmentIndex, sourceIndex)
|
||||
}
|
||||
if segment.Speaker != speaker {
|
||||
t.Fatalf("segment speaker = %q, want %q", segment.Speaker, speaker)
|
||||
}
|
||||
if segment.Start != start {
|
||||
t.Fatalf("segment start = %f, want %f", segment.Start, start)
|
||||
}
|
||||
if segment.End != end {
|
||||
t.Fatalf("segment end = %f, want %f", segment.End, end)
|
||||
}
|
||||
if segment.Text != text {
|
||||
t.Fatalf("segment text = %q, want %q", segment.Text, text)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user