From 1b9f4bd922466512f54c35551a39234c54995992 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Mon, 27 Apr 2026 15:52:53 -0500 Subject: [PATCH] Added initial segment overlap resolution logic --- README.md | 56 ++++- internal/builtin/input.go | 130 ++++++++++-- internal/builtin/merge.go | 16 +- internal/builtin/postprocess.go | 31 +++ internal/builtin/preprocess.go | 4 +- internal/builtin/registry.go | 2 +- internal/cli/merge_test.go | 298 +++++++++++++++++++++++++- internal/config/config.go | 26 +++ internal/config/config_test.go | 80 +++++++ internal/model/model.go | 51 ++++- internal/overlap/detect.go | 13 +- internal/overlap/detect_test.go | 6 +- internal/overlap/resolve.go | 344 ++++++++++++++++++++++++++++++ internal/overlap/resolve_test.go | 345 +++++++++++++++++++++++++++++++ internal/pipeline/runner.go | 4 + internal/report/report.go | 10 + 16 files changed, 1357 insertions(+), 59 deletions(-) create mode 100644 internal/overlap/resolve.go create mode 100644 internal/overlap/resolve_test.go diff --git a/README.md b/README.md index a13598f..6953450 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order. -The current implementation supports the `merge` command. It reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, assigns consecutive numeric `id` values, and writes a merged JSON artifact. +The current implementation supports the `merge` command. It reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, detects and resolves overlaps when word-level timing is available, assigns consecutive numeric `id` values, and writes a merged JSON artifact. 
## Usage @@ -56,7 +56,11 @@ Each input file must be valid JSON with a top-level `segments` array. The curren { "start": 1.25, "end": 3.5, - "text": "Hello there." + "text": "Hello there.", + "words": [ + {"word": "Hello", "start": 1.25, "end": 1.55, "score": 0.98}, + {"word": "there.", "start": 1.7, "end": 2.0} + ] } ] } @@ -68,7 +72,16 @@ Required segment fields: - `end`: number, must be `>= start`. - `text`: string. -Other WhisperX fields, including `words` and raw diarization speaker labels, are ignored for now. +Optional word fields: + +- `words`: array of word timing objects. +- `words[].word`: string. +- `words[].start`: optional number, must be `>= 0` when present. +- `words[].end`: optional number, must be `>= start` when present with `start`. +- `words[].score`: optional number. +- `words[].speaker`: optional raw speaker label string. + +Word-level timing is preserved internally for overlap resolution. If a word is missing `start` or `end`, seriatim keeps the word text, emits a warning in the optional report, and does not use that word as a timing anchor. Word timing is not emitted in the final JSON artifact. ## Speaker Map Format @@ -150,6 +163,16 @@ The merged output uses the current seriatim envelope: "end": 3.5, "text": "Hello there.", "overlap_group_id": 1 + }, + { + "id": 2, + "source": "eric.json", + "source_ref": "word-run:1:1:1", + "derived_from": ["eric.json#0"], + "speaker": "Eric Rakestraw", + "start": 2.0, + "end": 2.5, + "text": "Resolved word run" } ], "overlap_groups": [ @@ -169,7 +192,7 @@ The merged output uses the current seriatim envelope: Segments are sorted deterministically by: ```text -(start, end, source, source_segment_index, speaker) +(start, end, source, source_segment_index/source_ref, speaker) ``` Final segment IDs are assigned after sorting and start at `1`. @@ -187,7 +210,27 @@ Overlap behavior: - Segments in detected groups receive `overlap_group_id`. 
- `overlap_groups[].segments` contains stable references in `source#source_segment_index` format. - `class` is currently `unknown`. -- `resolution` is currently `unresolved`; overlap resolution is still a no-op. +- `resolution` is `unresolved` until `resolve-overlaps` replaces the group. + +## Overlap Resolution + +The default postprocessing pipeline runs `resolve-overlaps` after `detect-overlaps`. + +For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: + +- Words are included when their interval intersects the overlap window: `word.end > group.start && word.start < group.end`. +- Untimed words are included in replacement text in original word order when nearby timed words create a replacement run. +- Untimed words do not affect replacement segment start/end times or word-run gap splitting. +- Words for the same speaker are merged into one run when the gap between adjacent words is no greater than `SERIATIM_OVERLAP_WORD_RUN_GAP`. +- The default word-run gap is `0.75` seconds. +- Set `SERIATIM_OVERLAP_WORD_RUN_GAP` to a positive number of seconds to override the default. +- Replacement segment text is built by joining word text with single spaces. +- Replacement segments include `source_ref` and `derived_from`. +- Replacement segments omit `source_segment_index` because they are derived from one or more original segments. +- Resolved overlap groups are removed from `overlap_groups`. +- Replacement segments are left without `overlap_group_id`; future passes can detect any remaining overlap. +- If a speaker has no usable word timing in a group, that speaker's original segment is kept. +- If no speakers in a group have usable word timing, the original group and annotations remain unchanged. ## Autocorrect @@ -227,6 +270,5 @@ Matching behavior: ## Current Limitations - Only JSON input is supported. -- Word-level timing data is not preserved yet. 
-- Overlap resolution is currently a no-op module. +- Overlap resolution depends on WhisperX word timing; groups without usable word timing remain unresolved. - Coalescing and alternate output formats are not implemented yet. diff --git a/internal/builtin/input.go b/internal/builtin/input.go index d9bef18..1f5e034 100644 --- a/internal/builtin/input.go +++ b/internal/builtin/input.go @@ -23,17 +23,18 @@ func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.Raw } raw := make([]model.RawTranscript, 0, len(cfg.InputFiles)) + events := make([]report.Event, 0, len(cfg.InputFiles)+1) for _, inputFile := range cfg.InputFiles { - transcript, err := readRawTranscript(inputFile) + transcript, newEvents, err := readRawTranscript(inputFile) if err != nil { return nil, nil, err } raw = append(raw, transcript) + events = append(events, newEvents...) } - return raw, []report.Event{ - report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw))), - }, nil + events = append(events, report.Info("input", "json-files", fmt.Sprintf("decoded %d input file(s)", len(raw)))) + return raw, events, nil } type rawTranscriptFile struct { @@ -44,70 +45,163 @@ type rawSegmentFile struct { Start json.RawMessage `json:"start"` End json.RawMessage `json:"end"` Text json.RawMessage `json:"text"` + Words json.RawMessage `json:"words"` } -func readRawTranscript(path string) (model.RawTranscript, error) { +type rawWordFile struct { + Word json.RawMessage `json:"word"` + Start json.RawMessage `json:"start"` + End json.RawMessage `json:"end"` + Score json.RawMessage `json:"score"` + Speaker json.RawMessage `json:"speaker"` +} + +func readRawTranscript(path string) (model.RawTranscript, []report.Event, error) { data, err := os.ReadFile(path) if err != nil { - return model.RawTranscript{}, fmt.Errorf("read input file %q: %w", path, err) + return model.RawTranscript{}, nil, fmt.Errorf("read input file %q: %w", path, err) } var parsed rawTranscriptFile if err := 
json.Unmarshal(data, &parsed); err != nil { - return model.RawTranscript{}, fmt.Errorf("parse input file %q: %w", path, err) + return model.RawTranscript{}, nil, fmt.Errorf("parse input file %q: %w", path, err) } if parsed.Segments == nil || isJSONNull(parsed.Segments) { - return model.RawTranscript{}, fmt.Errorf("input file %q must contain top-level segments array", path) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q must contain top-level segments array", path) } var rawSegments []rawSegmentFile if err := json.Unmarshal(parsed.Segments, &rawSegments); err != nil { - return model.RawTranscript{}, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q top-level segments must be an array: %w", path, err) } segments := make([]model.RawSegment, 0, len(rawSegments)) + events := make([]report.Event, 0) for index, segment := range rawSegments { if segment.Start == nil || isJSONNull(segment.Start) { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric start", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric start", path, index) } if segment.End == nil || isJSONNull(segment.End) { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing numeric end", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing numeric end", path, index) } if segment.Text == nil || isJSONNull(segment.Text) { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d missing string text", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d missing string text", path, index) } var start float64 if err := json.Unmarshal(segment.Start, &start); err != nil { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d start must be numeric", path, index) + return model.RawTranscript{}, nil, 
fmt.Errorf("input file %q segment %d start must be numeric", path, index) } var end float64 if err := json.Unmarshal(segment.End, &end); err != nil { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d end must be numeric", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d end must be numeric", path, index) } var text string if err := json.Unmarshal(segment.Text, &text); err != nil { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d text must be a string", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d text must be a string", path, index) } if start < 0 { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has negative start", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has negative start", path, index) } if end < start { - return model.RawTranscript{}, fmt.Errorf("input file %q segment %d has end before start", path, index) + return model.RawTranscript{}, nil, fmt.Errorf("input file %q segment %d has end before start", path, index) } + words, newEvents, err := parseRawWords(path, index, segment.Words) + if err != nil { + return model.RawTranscript{}, nil, err + } + events = append(events, newEvents...) 
+ segments = append(segments, model.RawSegment{ Start: start, End: end, Text: text, + Words: words, }) } return model.RawTranscript{ Source: path, Segments: segments, - }, nil + }, events, nil +} + +func parseRawWords(path string, segmentIndex int, raw json.RawMessage) ([]model.Word, []report.Event, error) { + if raw == nil || isJSONNull(raw) { + return nil, nil, nil + } + + var rawWords []rawWordFile + if err := json.Unmarshal(raw, &rawWords); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d words must be an array: %w", path, segmentIndex, err) + } + + words := make([]model.Word, 0, len(rawWords)) + events := make([]report.Event, 0) + for wordIndex, rawWord := range rawWords { + if rawWord.Word == nil || isJSONNull(rawWord.Word) { + return nil, nil, fmt.Errorf("input file %q segment %d word %d missing string word", path, segmentIndex, wordIndex) + } + + var text string + if err := json.Unmarshal(rawWord.Word, &text); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d word %d word must be a string", path, segmentIndex, wordIndex) + } + + word := model.Word{ + Text: text, + } + + hasStart := rawWord.Start != nil && !isJSONNull(rawWord.Start) + hasEnd := rawWord.End != nil && !isJSONNull(rawWord.End) + var start float64 + var end float64 + if hasStart { + if err := json.Unmarshal(rawWord.Start, &start); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d word %d start must be numeric", path, segmentIndex, wordIndex) + } + if start < 0 { + return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative start", path, segmentIndex, wordIndex) + } + } + if hasEnd { + if err := json.Unmarshal(rawWord.End, &end); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d word %d end must be numeric", path, segmentIndex, wordIndex) + } + if end < 0 { + return nil, nil, fmt.Errorf("input file %q segment %d word %d has negative end", path, segmentIndex, wordIndex) + } + } + if hasStart && hasEnd { + 
if end < start { + return nil, nil, fmt.Errorf("input file %q segment %d word %d has end before start", path, segmentIndex, wordIndex) + } + word.Start = start + word.End = end + word.Timed = true + } else { + events = append(events, report.Warning( + "input", + "json-files", + fmt.Sprintf("input file %q segment %d word %d %q has no complete timing and will not anchor overlap resolution", path, segmentIndex, wordIndex, text), + )) + } + if rawWord.Score != nil && !isJSONNull(rawWord.Score) { + if err := json.Unmarshal(rawWord.Score, &word.Score); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d word %d score must be numeric", path, segmentIndex, wordIndex) + } + } + if rawWord.Speaker != nil && !isJSONNull(rawWord.Speaker) { + if err := json.Unmarshal(rawWord.Speaker, &word.Speaker); err != nil { + return nil, nil, fmt.Errorf("input file %q segment %d word %d speaker must be a string", path, segmentIndex, wordIndex) + } + } + words = append(words, word) + } + + return words, events, nil } func isJSONNull(value json.RawMessage) bool { diff --git a/internal/builtin/merge.go b/internal/builtin/merge.go index cfc7f06..c3c2dcf 100644 --- a/internal/builtin/merge.go +++ b/internal/builtin/merge.go @@ -26,21 +26,7 @@ func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscri } sort.SliceStable(segments, func(i, j int) bool { - left := segments[i] - right := segments[j] - if left.Start != right.Start { - return left.Start < right.Start - } - if left.End != right.End { - return left.End < right.End - } - if left.Source != right.Source { - return left.Source < right.Source - } - if left.SourceSegmentIndex != right.SourceSegmentIndex { - return left.SourceSegmentIndex < right.SourceSegmentIndex - } - return left.Speaker < right.Speaker + return model.SegmentLess(segments[i], segments[j]) }) return model.MergedTranscript{ diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go index 64b7d9e..b0c80f7 100644 --- 
a/internal/builtin/postprocess.go +++ b/internal/builtin/postprocess.go @@ -66,6 +66,37 @@ func (detectOverlaps) Process(ctx context.Context, in model.MergedTranscript, cf }, nil } +type resolveOverlaps struct{} + +func (resolveOverlaps) Name() string { + return "resolve-overlaps" +} + +func (resolveOverlaps) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) { + if err := ctx.Err(); err != nil { + return model.MergedTranscript{}, nil, err + } + + resolved, summary, err := overlap.Resolve(in, cfg.OverlapWordRunGap) + if err != nil { + return model.MergedTranscript{}, nil, err + } + + return resolved, []report.Event{ + report.Info( + "postprocessing", + "resolve-overlaps", + fmt.Sprintf( + "processed %d overlap group(s); changed %d; removed %d original segment(s); created %d replacement segment(s)", + summary.GroupsProcessed, + summary.GroupsChanged, + summary.OriginalsRemoved, + summary.ReplacementsCreated, + ), + ), + }, nil +} + type autocorrectPostprocessor struct{} func (autocorrectPostprocessor) Name() string { diff --git a/internal/builtin/preprocess.go b/internal/builtin/preprocess.go index 3fbbcfd..e54898c 100644 --- a/internal/builtin/preprocess.go +++ b/internal/builtin/preprocess.go @@ -123,13 +123,15 @@ func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessStat segments := make([]model.Segment, 0, len(raw.Segments)) for index, rawSegment := range raw.Segments { + sourceSegmentIndex := index segments = append(segments, model.Segment{ Source: raw.Source, - SourceSegmentIndex: index, + SourceSegmentIndex: &sourceSegmentIndex, Speaker: canonicalSpeaker, Start: rawSegment.Start, End: rawSegment.End, Text: rawSegment.Text, + Words: append([]model.Word(nil), rawSegment.Words...), }) } diff --git a/internal/builtin/registry.go b/internal/builtin/registry.go index fcadc8a..b4dbf4e 100644 --- a/internal/builtin/registry.go +++ b/internal/builtin/registry.go @@ -12,7 
+12,7 @@ func NewRegistry() *pipeline.Registry { registry.RegisterPreprocessor(trimText{}) registry.RegisterMerger(placeholderMerger{}) registry.RegisterPostprocessor(detectOverlaps{}) - registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"}) + registry.RegisterPostprocessor(resolveOverlaps{}) registry.RegisterPostprocessor(assignIDs{}) registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"}) registry.RegisterPostprocessor(autocorrectPostprocessor{}) diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 19401f2..c166d54 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -15,7 +15,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{ "segments": [ - {"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored"}]}, + {"start": 10, "end": 11, "text": " second a ", "words": [{"word": "ignored", "start": 10.1, "end": 10.2}]}, {"start": 1, "end": 2, "text": "first a"} ] }`) @@ -217,6 +217,107 @@ func TestMergeDetectsOverlapGroups(t *testing.T) { } } +func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + { + "start": 1, + "end": 5, + "text": "alice original", + "words": [ + {"word": "outside", "start": 0.5, "end": 1.0}, + {"word": "hello", "start": 1.1, "end": 1.2, "score": 0.98, "speaker": "SPEAKER_00"}, + {"word": "there", "start": 1.8, "end": 2.0}, + {"word": "later", "start": 3.0, "end": 3.1} + ] + } + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + { + "start": 1.5, + "end": 4, + "text": "bob original", + "words": [ + {"word": "bob", "start": 1.55, "end": 1.7}, + {"word": "reply", "start": 2.0, "end": 2.2} + ] + } + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := 
filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", inputB, + "--input-file", inputA, + "--speakers", speakers, + "--output-file", output, + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.OverlapGroups) != 0 { + t.Fatalf("overlap groups = %#v, want none", transcript.OverlapGroups) + } + if got, want := len(transcript.Segments), 3; got != want { + t.Fatalf("segment count = %d, want %d", got, want) + } + + wantTexts := []string{"hello there", "bob reply", "later"} + wantSpeakers := []string{"Alice", "Bob", "Alice"} + wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1", "word-run:1:1:2"} + for index, segment := range transcript.Segments { + if segment.ID != index+1 { + t.Fatalf("segment %d id = %d, want %d", index, segment.ID, index+1) + } + if segment.Text != wantTexts[index] { + t.Fatalf("segment %d text = %q, want %q", index, segment.Text, wantTexts[index]) + } + if segment.Speaker != wantSpeakers[index] { + t.Fatalf("segment %d speaker = %q, want %q", index, segment.Speaker, wantSpeakers[index]) + } + if segment.SourceRef != wantRefs[index] { + t.Fatalf("segment %d source_ref = %q, want %q", index, segment.SourceRef, wantRefs[index]) + } + if segment.SourceSegmentIndex != nil { + t.Fatalf("segment %d source_segment_index = %d, want nil", index, *segment.SourceSegmentIndex) + } + if segment.OverlapGroupID != 0 { + t.Fatalf("segment %d overlap_group_id = %d, want 0", index, segment.OverlapGroupID) + } + } + if !equalStrings(transcript.Segments[0].DerivedFrom, []string{inputA + "#0"}) { + t.Fatalf("segment 0 derived_from = %v", transcript.Segments[0].DerivedFrom) + } + + outputBytes, err := os.ReadFile(output) + if err != nil { + t.Fatalf("read output bytes: %v", err) + } + if strings.Contains(string(outputBytes), "words") { + t.Fatalf("did not expect 
word timing in output:\n%s", outputBytes) + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "postprocessing", "resolve-overlaps", "processed 1 overlap group(s); changed 1; removed 2 original segment(s); created 3 replacement segment(s)") { + t.Fatal("expected resolve-overlaps summary report event") + } +} + func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ @@ -650,6 +751,194 @@ func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) { } } +func TestInvalidWordFieldsFailWithSourceAndIndex(t *testing.T) { + tests := []struct { + name string + json string + want string + }{ + { + name: "words not array", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":{}}]}`, + want: "segment 0 words must be an array", + }, + { + name: "missing word", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"start":0,"end":0.1}]}]}`, + want: "segment 0 word 0 missing string word", + }, + { + name: "wrong typed word", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":7,"start":0,"end":0.1}]}]}`, + want: "segment 0 word 0 word must be a string", + }, + { + name: "wrong typed start", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":"0","end":0.1}]}]}`, + want: "segment 0 word 0 start must be numeric", + }, + { + name: "wrong typed end", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":"0.1"}]}]}`, + want: "segment 0 word 0 end must be numeric", + }, + { + name: "wrong typed score", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":0.1,"score":"good"}]}]}`, + want: "segment 0 word 0 score must be numeric", + }, + { + name: "wrong typed speaker", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0,"end":0.1,"speaker":7}]}]}`, + want: "segment 0 word 0 
speaker must be a string", + }, + { + name: "negative start", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":-0.1,"end":0.1}]}]}`, + want: "segment 0 word 0 has negative start", + }, + { + name: "end before start", + json: `{"segments":[{"start":0,"end":1,"text":"x","words":[{"word":"x","start":0.2,"end":0.1}]}]}`, + want: "segment 0 word 0 has end before start", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", test.json) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + ) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), input) { + t.Fatalf("expected error to contain source path %q, got %v", input, err) + } + if !strings.Contains(err.Error(), test.want) { + t.Fatalf("expected error to contain %q, got %v", test.want, err) + } + }) + } +} + +func TestUntimedWordsAreAcceptedAndReported(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{ + "segments": [ + { + "start": 1, + "end": 2, + "text": "about 13", + "words": [ + {"word": "about", "start": 1.1, "end": 1.2}, + {"word": "13"} + ] + } + ] + }`) + output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "input", "json-files", `segment 0 word 1 "13" has no complete timing`) { + t.Fatal("expected untimed word warning report event") + } + foundWarning := false + for _, event := range rpt.Events { + if event.Stage == "input" && event.Module == "json-files" && strings.Contains(event.Message, `"13" has no complete timing`) { + foundWarning = event.Severity == 
report.SeverityWarning + } + } + if !foundWarning { + t.Fatal("expected untimed word event to use warning severity") + } +} + +func TestMergeResolutionPreservesUntimedWordText(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + { + "start": 1, + "end": 3, + "text": "about 13 and a half", + "words": [ + {"word": "about", "start": 1.1, "end": 1.2}, + {"word": "13"}, + {"word": "and", "start": 1.24, "end": 1.3}, + {"word": "a", "start": 1.32, "end": 1.34}, + {"word": "half", "start": 1.36, "end": 1.5} + ] + } + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + { + "start": 1.15, + "end": 2, + "text": "bob overlap", + "words": [ + {"word": "bob", "start": 1.16, "end": 1.25}, + {"word": "overlap", "start": 1.3, "end": 1.5} + ] + } + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + "--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.OverlapGroups) != 0 { + t.Fatalf("expected overlap group to be resolved, got %#v", transcript.OverlapGroups) + } + + found := false + for _, segment := range transcript.Segments { + if segment.Speaker == "Alice" && segment.Text == "about 13 and a half" { + found = true + if segment.Start != 1.1 || segment.End != 1.5 { + t.Fatalf("Alice replacement bounds = %f-%f, want 1.1-1.5", segment.Start, segment.End) + } + } + } + if !found { + t.Fatalf("expected Alice replacement to preserve untimed word text, got %#v", transcript.Segments) + } +} + func TestInvalidTimingFails(t *testing.T) { tests := []struct { name string @@ -761,8 +1050,11 @@ func assertSegment(t *testing.T, segment model.Segment, id int, 
source string, s if segment.Source != source { t.Fatalf("segment source = %q, want %q", segment.Source, source) } - if segment.SourceSegmentIndex != sourceIndex { - t.Fatalf("segment source index = %d, want %d", segment.SourceSegmentIndex, sourceIndex) + if segment.SourceSegmentIndex == nil { + t.Fatalf("segment source index = nil, want %d", sourceIndex) + } + if *segment.SourceSegmentIndex != sourceIndex { + t.Fatalf("segment source index = %d, want %d", *segment.SourceSegmentIndex, sourceIndex) } if segment.Speaker != speaker { t.Fatalf("segment speaker = %q, want %q", segment.Speaker, speaker) diff --git a/internal/config/config.go b/internal/config/config.go index b7a54c3..9c64dea 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -6,6 +6,7 @@ import ( "os" "path/filepath" "sort" + "strconv" "strings" ) @@ -14,6 +15,8 @@ const ( DefaultOutputModules = "json" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,autocorrect,assign-ids,validate-output" + DefaultOverlapWordRunGap = 0.75 + OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP" ) // MergeOptions captures raw CLI option values before validation. @@ -40,6 +43,7 @@ type Config struct { OutputModules []string PreprocessingModules []string PostprocessingModules []string + OverlapWordRunGap float64 } // NewMergeConfig validates raw merge options and returns normalized config. 
@@ -49,6 +53,7 @@ func NewMergeConfig(opts MergeOptions) (Config, error) { OutputModules: nil, PreprocessingModules: nil, PostprocessingModules: nil, + OverlapWordRunGap: DefaultOverlapWordRunGap, } if cfg.InputReader == "" { @@ -110,6 +115,11 @@ func NewMergeConfig(opts MergeOptions) (Config, error) { } } + cfg.OverlapWordRunGap, err = parseOverlapWordRunGap() + if err != nil { + return Config{}, err + } + return cfg, nil } @@ -187,6 +197,22 @@ func requireFile(path string, flag string) error { return nil } +func parseOverlapWordRunGap() (float64, error) { + value := strings.TrimSpace(os.Getenv(OverlapWordRunGapEnv)) + if value == "" { + return DefaultOverlapWordRunGap, nil + } + + gap, err := strconv.ParseFloat(value, 64) + if err != nil { + return 0, fmt.Errorf("%s must be a positive number of seconds: %w", OverlapWordRunGapEnv, err) + } + if gap <= 0 { + return 0, fmt.Errorf("%s must be positive", OverlapWordRunGapEnv) + } + return gap, nil +} + func contains(values []string, target string) bool { for _, value := range values { if value == target { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 66b5c0b..c38ff46 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -46,6 +46,86 @@ func TestDuplicateInputFilesFailValidation(t *testing.T) { } } +func TestOverlapWordRunGapDefaultsTo075(t *testing.T) { + t.Setenv(OverlapWordRunGapEnv, "") + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + cfg, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err != nil { + t.Fatalf("config failed: %v", err) + } + if cfg.OverlapWordRunGap != DefaultOverlapWordRunGap { + t.Fatalf("gap = %f, want %f", cfg.OverlapWordRunGap, 
DefaultOverlapWordRunGap) + } +} + +func TestOverlapWordRunGapUsesValidEnvOverride(t *testing.T) { + t.Setenv(OverlapWordRunGapEnv, "1.25") + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + cfg, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err != nil { + t.Fatalf("config failed: %v", err) + } + if cfg.OverlapWordRunGap != 1.25 { + t.Fatalf("gap = %f, want 1.25", cfg.OverlapWordRunGap) + } +} + +func TestOverlapWordRunGapRejectsInvalidEnvOverride(t *testing.T) { + tests := []struct { + name string + value string + want string + }{ + {name: "non-numeric", value: "fast", want: "must be a positive number"}, + {name: "zero", value: "0", want: "must be positive"}, + {name: "negative", value: "-0.1", want: "must be positive"}, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + t.Setenv(OverlapWordRunGapEnv, test.value) + dir := t.TempDir() + input := writeTempFile(t, dir, "input.json") + output := filepath.Join(dir, "merged.json") + + _, err := NewMergeConfig(MergeOptions{ + InputFiles: []string{input}, + OutputFile: output, + InputReader: DefaultInputReader, + OutputModules: DefaultOutputModules, + PreprocessingModules: DefaultPreprocessingModules, + PostprocessingModules: DefaultPostprocessingModules, + }) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), test.want) { + t.Fatalf("expected error to contain %q, got %v", test.want, err) + } + }) + } +} + func writeTempFile(t *testing.T, dir string, name string) string { t.Helper() diff --git a/internal/model/model.go b/internal/model/model.go index 2360397..9c3c8f1 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -11,6 +11,7 @@ type 
// Segment is the canonical transcript segment shape used by the framework.
type Segment struct {
	// ID is the numeric segment identifier; omitted from JSON while zero.
	ID int `json:"id,omitempty"`
	// InternalRef is an optional internal reference; omitted from JSON when empty.
	InternalRef string `json:"internal_ref,omitempty"`
	// Source names the input the segment came from. Replacement segments
	// built by overlap resolution use the shared source of their words, or
	// "derived" when their words come from more than one source.
	Source string `json:"source"`
	// SourceSegmentIndex is the segment's index within Source. It is a
	// pointer so that segments without an original index (such as word-run
	// replacements) can leave it nil and have it omitted from JSON.
	SourceSegmentIndex *int `json:"source_segment_index,omitempty"`
	// SourceRef identifies a segment that has no source index; overlap
	// resolution sets it to "word-run:<group>:<speaker>:<run>".
	SourceRef string `json:"source_ref,omitempty"`
	// DerivedFrom lists the references of the segments a replacement
	// segment was built from.
	DerivedFrom []string `json:"derived_from,omitempty"`
	// Speaker is the speaker label attached to the segment.
	Speaker string `json:"speaker"`
	// Start and End bound the segment in time.
	// NOTE(review): presumably seconds, matching the input format — confirm.
	Start float64 `json:"start"`
	End float64 `json:"end"`
	// Text is the segment's transcript text.
	Text string `json:"text"`
	// Words carries optional word-level timing for the segment.
	Words []Word `json:"words,omitempty"`
	// OverlapGroupID is the ID of the overlap group the segment belongs to,
	// or zero (omitted from JSON) when it belongs to none.
	OverlapGroupID int `json:"overlap_group_id,omitempty"`
}
+func SegmentLess(left Segment, right Segment) bool { + if left.Start != right.Start { + return left.Start < right.Start + } + if left.End != right.End { + return left.End < right.End + } + if left.Source != right.Source { + return left.Source < right.Source + } + if left.SourceSegmentIndex != nil && right.SourceSegmentIndex != nil && *left.SourceSegmentIndex != *right.SourceSegmentIndex { + return *left.SourceSegmentIndex < *right.SourceSegmentIndex + } + if left.SourceSegmentIndex == nil && right.SourceSegmentIndex != nil { + return false + } + if left.SourceSegmentIndex != nil && right.SourceSegmentIndex == nil { + return true + } + if left.SourceRef != right.SourceRef { + return left.SourceRef < right.SourceRef + } + return left.Speaker < right.Speaker +} diff --git a/internal/overlap/detect.go b/internal/overlap/detect.go index 86e6a7b..3eb96eb 100644 --- a/internal/overlap/detect.go +++ b/internal/overlap/detect.go @@ -77,7 +77,7 @@ func finalizeCandidate(in *model.MergedTranscript, candidate overlapCandidate, c refs := make([]string, 0, len(candidate.indices)) for _, index := range candidate.indices { in.Segments[index].OverlapGroupID = groupID - refs = append(refs, segmentRef(in.Segments[index])) + refs = append(refs, SegmentRef(in.Segments[index])) } in.OverlapGroups = append(in.OverlapGroups, model.OverlapGroup{ @@ -106,8 +106,15 @@ func distinctSpeakers(segments []model.Segment, indices []int) []string { return speakers } -func segmentRef(segment model.Segment) string { - return fmt.Sprintf("%s#%d", segment.Source, segment.SourceSegmentIndex) +// SegmentRef returns the stable overlap reference for a segment. 
+func SegmentRef(segment model.Segment) string { + if segment.SourceSegmentIndex != nil { + return fmt.Sprintf("%s#%d", segment.Source, *segment.SourceSegmentIndex) + } + if segment.SourceRef != "" { + return segment.SourceRef + } + return segment.Source } func clearExisting(in *model.MergedTranscript) { diff --git a/internal/overlap/detect_test.go b/internal/overlap/detect_test.go index cb4975b..2d7a1b6 100644 --- a/internal/overlap/detect_test.go +++ b/internal/overlap/detect_test.go @@ -132,7 +132,7 @@ func TestDetectIsIdempotent(t *testing.T) { func segment(source string, sourceIndex int, speaker string, start float64, end float64) model.Segment { return model.Segment{ Source: source, - SourceSegmentIndex: sourceIndex, + SourceSegmentIndex: intPtr(sourceIndex), Speaker: speaker, Start: start, End: end, @@ -140,6 +140,10 @@ func segment(source string, sourceIndex int, speaker string, start float64, end } } +func intPtr(value int) *int { + return &value +} + func assertGroup(t *testing.T, merged model.MergedTranscript, groupIndex int, id int, start float64, end float64, refs []string, speakers []string) { t.Helper() if len(merged.OverlapGroups) <= groupIndex { diff --git a/internal/overlap/resolve.go b/internal/overlap/resolve.go new file mode 100644 index 0000000..6508cb5 --- /dev/null +++ b/internal/overlap/resolve.go @@ -0,0 +1,344 @@ +package overlap + +import ( + "fmt" + "sort" + "strings" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +// ResolutionSummary records deterministic counters for a resolve-overlaps pass. +type ResolutionSummary struct { + GroupsProcessed int + GroupsChanged int + OriginalsRemoved int + ReplacementsCreated int +} + +// Resolve replaces detected overlap-group segments with word-run segments when +// word-level timing is available. 
// Resolve replaces detected overlap-group segments with word-run segments when
// word-level timing is available.
//
// Each overlap group in the input is handed to resolveGroup. A group that
// yields at least one replacement is dropped from the returned OverlapGroups,
// the originals of the speakers that were replaced are removed, and every
// other segment the group referenced has its OverlapGroupID reset to zero.
// Groups that yield no replacement are left untouched. The resulting segments
// are re-sorted with model.SegmentLess.
//
// The input is returned as-is when it has no overlap groups or when no group
// changed. The summary counts groups processed, groups changed, distinct
// originals removed, and replacements created. An error from resolveGroup
// aborts the pass and returns zero values.
func Resolve(in model.MergedTranscript, wordRunGap float64) (model.MergedTranscript, ResolutionSummary, error) {
	summary := ResolutionSummary{
		GroupsProcessed: len(in.OverlapGroups),
	}
	if len(in.OverlapGroups) == 0 {
		return in, summary, nil
	}

	// Index segments by their overlap reference so group refs can be resolved.
	refToIndex := make(map[string]int, len(in.Segments))
	for index, segment := range in.Segments {
		refToIndex[SegmentRef(segment)] = index
	}

	removeRefs := make(map[string]struct{})          // originals to drop
	clearAnnotationRefs := make(map[string]struct{}) // segments whose group ID is reset
	removeGroupIDs := make(map[int]struct{})         // groups that were resolved
	replacements := make([]model.Segment, 0)

	for _, group := range in.OverlapGroups {
		resolved, err := resolveGroup(in, group, refToIndex, wordRunGap)
		if err != nil {
			return model.MergedTranscript{}, ResolutionSummary{}, err
		}
		if len(resolved.replacements) == 0 {
			// Nothing usable for this group: keep it and its segments as they are.
			continue
		}

		summary.GroupsChanged++
		removeGroupIDs[group.ID] = struct{}{}
		replacements = append(replacements, resolved.replacements...)

		// Every member of a changed group loses its annotation, including
		// originals that are kept because their speaker had no timed words.
		for _, ref := range group.Segments {
			clearAnnotationRefs[ref] = struct{}{}
		}
		for _, ref := range resolved.removeRefs {
			// Count each removed original once, even if it is listed again.
			if _, exists := removeRefs[ref]; !exists {
				summary.OriginalsRemoved++
			}
			removeRefs[ref] = struct{}{}
		}
		summary.ReplacementsCreated += len(resolved.replacements)
	}

	if summary.GroupsChanged == 0 {
		return in, summary, nil
	}

	segments := make([]model.Segment, 0, len(in.Segments)-len(removeRefs)+len(replacements))
	for _, segment := range in.Segments {
		ref := SegmentRef(segment)
		if _, remove := removeRefs[ref]; remove {
			continue
		}
		if _, clear := clearAnnotationRefs[ref]; clear {
			// segment is the loop's copy, so the input slice is not modified.
			segment.OverlapGroupID = 0
		}
		segments = append(segments, segment)
	}
	segments = append(segments, replacements...)
	sort.SliceStable(segments, func(i, j int) bool {
		return model.SegmentLess(segments[i], segments[j])
	})

	overlapGroups := make([]model.OverlapGroup, 0, len(in.OverlapGroups)-len(removeGroupIDs))
	for _, group := range in.OverlapGroups {
		if _, remove := removeGroupIDs[group.ID]; remove {
			continue
		}
		overlapGroups = append(overlapGroups, group)
	}

	return model.MergedTranscript{
		Segments:      segments,
		OverlapGroups: overlapGroups,
	}, summary, nil
}

// resolvedGroup is the outcome of resolving one overlap group: the references
// of the originals to remove and the segments that replace them.
type resolvedGroup struct {
	removeRefs   []string
	replacements []model.Segment
}

// resolutionWord is a word together with the provenance needed to rebuild a
// segment from it.
type resolutionWord struct {
	word     model.Word
	source   string // Source of the segment the word came from
	ref      string // SegmentRef of the segment the word came from
	sequence int    // encounter order of the word within its speaker's segments
}

// wordRun is a group of one speaker's words that become one replacement
// segment. Only timed words contribute to start and end.
type wordRun struct {
	timedWords   []resolutionWord
	untimedWords []resolutionWord
	start        float64
	end          float64
}

// resolveGroup builds the replacement segments for a single overlap group.
//
// The group's segments are bucketed by speaker. For each speaker, in the
// order given by groupSpeakerOrder, the timed words inside the group window
// are split into runs using wordRunGap, untimed words are attached to those
// runs, and one replacement segment is produced per run. Speakers with no
// timed words in the window are skipped, so their originals are not listed
// for removal. Speaker and run indices in the replacement are 1-based.
//
// An error is returned when the group references a segment that is not in
// refToIndex.
func resolveGroup(in model.MergedTranscript, group model.OverlapGroup, refToIndex map[string]int, wordRunGap float64) (resolvedGroup, error) {
	segmentsBySpeaker := make(map[string][]model.Segment)
	refsBySpeaker := make(map[string][]string)
	for _, ref := range group.Segments {
		index, exists := refToIndex[ref]
		if !exists {
			return resolvedGroup{}, fmt.Errorf("overlap group %d references missing segment %q", group.ID, ref)
		}
		segment := in.Segments[index]
		segmentsBySpeaker[segment.Speaker] = append(segmentsBySpeaker[segment.Speaker], segment)
		refsBySpeaker[segment.Speaker] = append(refsBySpeaker[segment.Speaker], ref)
	}

	speakers := groupSpeakerOrder(group, segmentsBySpeaker)
	resolved := resolvedGroup{}
	for speakerIndex, speaker := range speakers {
		timedWords, untimedWords := gatherResolutionWords(segmentsBySpeaker[speaker], group.Start, group.End)
		if len(timedWords) == 0 {
			// Without timing anchors the speaker's originals stay in place.
			continue
		}

		runs := buildWordRuns(timedWords, wordRunGap)
		if len(runs) == 0 {
			continue
		}
		attachUntimedWords(runs, untimedWords)

		resolved.removeRefs = append(resolved.removeRefs, refsBySpeaker[speaker]...)
		for runIndex, run := range runs {
			resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run))
		}
	}

	return resolved, nil
}
+ for runIndex, run := range runs { + resolved.replacements = append(resolved.replacements, replacementSegment(group.ID, speakerIndex+1, runIndex+1, speaker, run)) + } + } + + return resolved, nil +} + +func groupSpeakerOrder(group model.OverlapGroup, segmentsBySpeaker map[string][]model.Segment) []string { + seen := make(map[string]struct{}, len(group.Speakers)) + speakers := make([]string, 0, len(group.Speakers)) + for _, speaker := range group.Speakers { + if _, exists := segmentsBySpeaker[speaker]; !exists { + continue + } + if _, exists := seen[speaker]; exists { + continue + } + seen[speaker] = struct{}{} + speakers = append(speakers, speaker) + } + + extra := make([]string, 0) + for speaker := range segmentsBySpeaker { + if _, exists := seen[speaker]; exists { + continue + } + extra = append(extra, speaker) + } + sort.Strings(extra) + speakers = append(speakers, extra...) + return speakers +} + +func gatherResolutionWords(segments []model.Segment, groupStart float64, groupEnd float64) ([]resolutionWord, []resolutionWord) { + timedWords := make([]resolutionWord, 0) + untimedWords := make([]resolutionWord, 0) + sequence := 0 + for _, segment := range segments { + ref := SegmentRef(segment) + for _, word := range segment.Words { + candidate := resolutionWord{ + word: word, + source: segment.Source, + ref: ref, + sequence: sequence, + } + sequence++ + if !word.Timed { + untimedWords = append(untimedWords, candidate) + continue + } + if word.End <= groupStart || word.Start >= groupEnd { + continue + } + timedWords = append(timedWords, candidate) + } + } + + sort.SliceStable(timedWords, func(i, j int) bool { + left := timedWords[i].word + right := timedWords[j].word + if left.Start != right.Start { + return left.Start < right.Start + } + if left.End != right.End { + return left.End < right.End + } + return left.Text < right.Text + }) + return timedWords, untimedWords +} + +func buildWordRuns(words []resolutionWord, wordRunGap float64) []wordRun { + if len(words) 
== 0 { + return nil + } + + runs := make([]wordRun, 0) + current := newWordRun(words[0]) + previousEnd := words[0].word.End + for _, word := range words[1:] { + if word.word.Start-previousEnd <= wordRunGap { + current.add(word) + } else { + runs = append(runs, current.finish()) + current = newWordRun(word) + } + previousEnd = word.word.End + } + runs = append(runs, current.finish()) + return runs +} + +func newWordRun(word resolutionWord) wordRun { + return wordRun{ + timedWords: []resolutionWord{word}, + start: word.word.Start, + end: word.word.End, + } +} + +func (r *wordRun) add(word resolutionWord) { + r.timedWords = append(r.timedWords, word) + if word.word.Start < r.start { + r.start = word.word.Start + } + if word.word.End > r.end { + r.end = word.word.End + } +} + +func (r wordRun) finish() wordRun { + return r +} + +func attachUntimedWords(runs []wordRun, untimedWords []resolutionWord) { + if len(runs) == 0 || len(untimedWords) == 0 { + return + } + + for _, word := range untimedWords { + target := 0 + for index, run := range runs { + if word.sequence < run.firstSequence() { + if index == 0 { + target = 0 + } else { + target = index - 1 + } + break + } + target = index + } + runs[target].untimedWords = append(runs[target].untimedWords, word) + } +} + +func (r wordRun) firstSequence() int { + first := r.timedWords[0].sequence + for _, word := range r.timedWords[1:] { + if word.sequence < first { + first = word.sequence + } + } + return first +} + +func (r wordRun) allWordsInTextOrder() []resolutionWord { + words := make([]resolutionWord, 0, len(r.timedWords)+len(r.untimedWords)) + words = append(words, r.timedWords...) + words = append(words, r.untimedWords...) 
+ sort.SliceStable(words, func(i, j int) bool { + return words[i].sequence < words[j].sequence + }) + return words +} + +func replacementSegment(groupID int, speakerIndex int, runIndex int, speaker string, run wordRun) model.Segment { + orderedWords := run.allWordsInTextOrder() + words := make([]model.Word, 0, len(orderedWords)) + text := make([]string, 0, len(orderedWords)) + refs := make([]string, 0, len(orderedWords)) + source := "" + for _, word := range orderedWords { + words = append(words, word.word) + text = append(text, word.word.Text) + refs = append(refs, word.ref) + if source == "" { + source = word.source + } else if source != word.source { + source = "derived" + } + } + + return model.Segment{ + Source: source, + SourceRef: fmt.Sprintf("word-run:%d:%d:%d", groupID, speakerIndex, runIndex), + DerivedFrom: uniqueSortedStrings(refs), + Speaker: speaker, + Start: run.start, + End: run.end, + Text: strings.Join(text, " "), + Words: words, + } +} + +func uniqueSortedStrings(values []string) []string { + seen := make(map[string]struct{}, len(values)) + unique := make([]string, 0, len(values)) + for _, value := range values { + if _, exists := seen[value]; exists { + continue + } + seen[value] = struct{}{} + unique = append(unique, value) + } + sort.Strings(unique) + return unique +} diff --git a/internal/overlap/resolve_test.go b/internal/overlap/resolve_test.go new file mode 100644 index 0000000..e184997 --- /dev/null +++ b/internal/overlap/resolve_test.go @@ -0,0 +1,345 @@ +package overlap + +import ( + "reflect" + "testing" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +func TestResolveNoOverlapGroupsIsNoOp(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 0, "Alice", 1, 2, word("hello", 1.1, 1.2)), + }, + } + + got, summary, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if !reflect.DeepEqual(got, merged) { + t.Fatalf("expected no-op 
result:\ngot %#v\nwant %#v", got, merged) + } + if summary.GroupsProcessed != 0 || summary.GroupsChanged != 0 { + t.Fatalf("unexpected summary: %#v", summary) + } +} + +func TestResolveCreatesChronologicalWordRunSegments(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 0, "Alice", 1, 5, word("A1", 1.1, 1.2), word("A2", 1.8, 2.0)), + segmentWithWords("b.json", 0, "Bob", 1.5, 4, word("B1", 1.55, 1.7), word("B2", 2.6, 2.8)), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}), + }, + } + merged.Segments[0].OverlapGroupID = 1 + merged.Segments[1].OverlapGroupID = 1 + + got, summary, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if summary.GroupsProcessed != 1 || summary.GroupsChanged != 1 || summary.OriginalsRemoved != 2 || summary.ReplacementsCreated != 3 { + t.Fatalf("unexpected summary: %#v", summary) + } + if len(got.OverlapGroups) != 0 { + t.Fatalf("expected resolved group to be removed, got %#v", got.OverlapGroups) + } + + gotTexts := []string{got.Segments[0].Text, got.Segments[1].Text, got.Segments[2].Text} + wantTexts := []string{"A1 A2", "B1", "B2"} + if !reflect.DeepEqual(gotTexts, wantTexts) { + t.Fatalf("texts = %v, want %v", gotTexts, wantTexts) + } + for _, segment := range got.Segments { + if segment.ID != 0 { + t.Fatalf("replacement segment has ID %d, want 0", segment.ID) + } + if segment.SourceSegmentIndex != nil { + t.Fatalf("replacement segment source index = %d, want nil", *segment.SourceSegmentIndex) + } + if segment.OverlapGroupID != 0 { + t.Fatalf("replacement segment overlap group ID = %d, want 0", segment.OverlapGroupID) + } + if segment.SourceRef == "" { + t.Fatal("replacement segment missing source_ref") + } + } +} + +func TestResolveIncludesWordsByIntervalIntersection(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords( + 
"a.json", + 0, + "Alice", + 9, + 21, + word("before", 9.5, 10), + word("left-edge", 9.9, 10.1), + word("inside", 11, 11.2), + word("right-edge", 19.9, 20.1), + word("after", 20, 20.2), + ), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 10, 20, []string{"a.json#0"}, []string{"Alice"}), + }, + } + merged.Segments[0].OverlapGroupID = 1 + + got, _, err := Resolve(merged, 10) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + if got.Segments[0].Text != "left-edge inside right-edge" { + t.Fatalf("text = %q", got.Segments[0].Text) + } +} + +func TestResolveWordRunGapThreshold(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 0, "Alice", 1, 4, word("one", 1, 1.1), word("two", 1.85, 2), word("three", 2.8, 3)), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 4, []string{"a.json#0"}, []string{"Alice"}), + }, + } + merged.Segments[0].OverlapGroupID = 1 + + got, _, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[0].Text != "one two" || got.Segments[1].Text != "three" { + t.Fatalf("unexpected replacement texts: %#v", got.Segments) + } +} + +func TestResolvePartialResolutionKeepsNoWordSpeakerOriginals(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 0, "Alice", 1, 5, word("hello", 1.2, 1.4)), + segmentWithWords("b.json", 0, "Bob", 2, 4), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}), + }, + } + merged.Segments[0].OverlapGroupID = 1 + merged.Segments[1].OverlapGroupID = 1 + + got, summary, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if summary.OriginalsRemoved != 1 || 
summary.ReplacementsCreated != 1 { + t.Fatalf("unexpected summary: %#v", summary) + } + if len(got.OverlapGroups) != 0 { + t.Fatalf("expected changed group to be removed, got %#v", got.OverlapGroups) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[0].Text != "hello" || got.Segments[1].Text != "Bob" { + t.Fatalf("unexpected segment texts: %#v", got.Segments) + } + if got.Segments[1].SourceSegmentIndex == nil { + t.Fatal("kept original should retain source_segment_index") + } + if got.Segments[1].OverlapGroupID != 0 { + t.Fatalf("kept original overlap group ID = %d, want 0", got.Segments[1].OverlapGroupID) + } +} + +func TestResolveGroupWithNoUsableWordsRemainsUnchanged(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 0, "Alice", 1, 5), + segmentWithWords("b.json", 0, "Bob", 2, 4), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 5, []string{"a.json#0", "b.json#0"}, []string{"Alice", "Bob"}), + }, + } + merged.Segments[0].OverlapGroupID = 1 + merged.Segments[1].OverlapGroupID = 1 + + got, summary, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if summary.GroupsChanged != 0 || summary.OriginalsRemoved != 0 || summary.ReplacementsCreated != 0 { + t.Fatalf("unexpected summary: %#v", summary) + } + if !reflect.DeepEqual(got, merged) { + t.Fatalf("expected unchanged transcript:\ngot %#v\nwant %#v", got, merged) + } +} + +func TestResolveReplacementProvenanceIsDeterministic(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords("a.json", 1, "Alice", 1, 3, word("second", 1.5, 1.6)), + segmentWithWords("a.json", 0, "Alice", 1, 3, word("first", 1.1, 1.2)), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 3, []string{"a.json#1", "a.json#0"}, []string{"Alice"}), + }, + } + + got, _, err := Resolve(merged, 0.75) + if err != nil { + 
t.Fatalf("resolve failed: %v", err) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + segment := got.Segments[0] + if segment.SourceRef != "word-run:1:1:1" { + t.Fatalf("source_ref = %q", segment.SourceRef) + } + if !reflect.DeepEqual(segment.DerivedFrom, []string{"a.json#0", "a.json#1"}) { + t.Fatalf("derived_from = %v", segment.DerivedFrom) + } +} + +func TestResolveIncludesUntimedWordsInTextWithoutChangingBounds(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords( + "a.json", + 0, + "Alice", + 1, + 3, + untimedWord("pre"), + word("one", 1.1, 1.2), + untimedWord("middle"), + word("two", 1.4, 1.5), + untimedWord("post"), + ), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 3, []string{"a.json#0"}, []string{"Alice"}), + }, + } + + got, _, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + segment := got.Segments[0] + if segment.Text != "pre one middle two post" { + t.Fatalf("text = %q", segment.Text) + } + if segment.Start != 1.1 || segment.End != 1.5 { + t.Fatalf("bounds = %f-%f, want 1.1-1.5", segment.Start, segment.End) + } +} + +func TestResolveUntimedWordsDoNotBridgeWordRunGap(t *testing.T) { + merged := model.MergedTranscript{ + Segments: []model.Segment{ + segmentWithWords( + "a.json", + 0, + "Alice", + 1, + 4, + word("one", 1, 1.1), + untimedWord("middle"), + word("two", 2, 2.1), + ), + }, + OverlapGroups: []model.OverlapGroup{ + group(1, 1, 4, []string{"a.json#0"}, []string{"Alice"}), + }, + } + + got, _, err := Resolve(merged, 0.75) + if err != nil { + t.Fatalf("resolve failed: %v", err) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[0].Text != "one middle" || got.Segments[1].Text != "two" { + t.Fatalf("unexpected texts: %#v", got.Segments) + } + if 
// segmentWithWords builds a test segment via the shared segment helper and
// attaches the given words to it.
func segmentWithWords(source string, sourceIndex int, speaker string, start float64, end float64, words ...model.Word) model.Segment {
	// The local deliberately shadows the segment helper after calling it.
	segment := segment(source, sourceIndex, speaker, start, end)
	segment.Words = words
	return segment
}

// word builds a timed test word with the given text and bounds.
func word(text string, start float64, end float64) model.Word {
	return model.Word{
		Text:  text,
		Start: start,
		End:   end,
		Timed: true,
	}
}

// untimedWord builds a test word that carries text only; Timed stays false.
func untimedWord(text string) model.Word {
	return model.Word{
		Text: text,
	}
}

// group builds a test overlap group with the package's default class and
// resolution values.
func group(id int, start float64, end float64, refs []string, speakers []string) model.OverlapGroup {
	return model.OverlapGroup{
		ID:         id,
		Start:      start,
		End:        end,
		Segments:   refs,
		Speakers:   speakers,
		Class:      defaultClass,
		Resolution: defaultResolution,
	}
}
+ for index := range segments { + segments[index].Words = nil + segments[index].DerivedFrom = append([]string(nil), segments[index].DerivedFrom...) + } overlapGroups := make([]model.OverlapGroup, len(merged.OverlapGroups)) copy(overlapGroups, merged.OverlapGroups) diff --git a/internal/report/report.go b/internal/report/report.go index 2a8a21e..b9c8141 100644 --- a/internal/report/report.go +++ b/internal/report/report.go @@ -50,6 +50,16 @@ func Info(stage string, module string, message string) Event { } } +// Warning constructs a warning report event. +func Warning(stage string, module string, message string) Event { + return Event{ + Severity: SeverityWarning, + Stage: stage, + Module: module, + Message: message, + } +} + // WriteJSON writes a deterministic JSON report. func WriteJSON(path string, rpt Report) error { file, err := os.Create(path)