From bbfb8aba443e4f72aad6cf55c9b534f2d3428801 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Mon, 27 Apr 2026 19:49:25 -0500 Subject: [PATCH] Implemented a module to detect backchannel segments, and updated the coalesce module to ignore them when coalescing same-speaker turns --- README.md | 21 ++++- internal/backchannel/backchannel.go | 60 +++++++++++++ internal/backchannel/backchannel_test.go | 104 +++++++++++++++++++++++ internal/builtin/postprocess.go | 18 ++++ internal/builtin/registry.go | 1 + internal/cli/merge_test.go | 87 +++++++++++++++++++ internal/coalesce/coalesce.go | 23 ++++- internal/coalesce/coalesce_test.go | 49 +++++++++++ internal/config/config.go | 2 +- internal/model/model.go | 1 + 10 files changed, 360 insertions(+), 6 deletions(-) create mode 100644 internal/backchannel/backchannel.go create mode 100644 internal/backchannel/backchannel_test.go diff --git a/README.md b/README.md index e4b056e..aca79dc 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Optional flags: - `--input-reader`: input reader module. Default: `json-files`. - `--output-modules`: comma-separated output modules. Default: `json`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. -- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`. +- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`. - `--coalesce-gap`: maximum same-speaker gap in seconds for `coalesce`. Default: `3.0`. ## Input JSON Format @@ -151,7 +151,7 @@ The merged output uses the current seriatim envelope: "input_reader": "json-files", "input_files": ["eric.json", "mike.json"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], - "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], + "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], "output_modules": ["json"] }, "segments": [ @@ -173,7 +173,8 @@ The merged output uses the current seriatim envelope: "speaker": "Eric Rakestraw", "start": 2.0, "end": 2.5, - "text": "Resolved word run" + "text": "Resolved word run", + "categories": ["backchannel"] } ], "overlap_groups": [ @@ -215,7 +216,7 @@ Overlap behavior: ## Overlap Resolution -The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `coalesce`, then a second `detect-overlaps` pass. +The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `coalesce`, then a second `detect-overlaps` pass. For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: @@ -236,12 +237,24 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word - If a speaker has no usable word timing in a group, that speaker's original segment is kept. - If no speakers in a group have usable word timing, the original group and annotations remain unchanged. +## Backchannels + +The default pipeline runs `backchannel` before `coalesce`. It tags short acknowledgement segments with: + +```json +"categories": ["backchannel"] +``` + +Backchannel matching is case-insensitive, trims surrounding whitespace, and requires a matching acknowledgement phrase, no more than three whitespace-delimited words, and duration no greater than `1.0` second. + ## Coalescing The default pipeline runs `coalesce` before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`. Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`. +Different-speaker backchannel segments do not block coalescing of surrounding same-speaker segments. When same-speaker segments are coalesced, any `backchannel` category from the merged inputs is dropped from the coalesced segment. + ## Autocorrect Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report. diff --git a/internal/backchannel/backchannel.go b/internal/backchannel/backchannel.go new file mode 100644 index 0000000..5a897e2 --- /dev/null +++ b/internal/backchannel/backchannel.go @@ -0,0 +1,60 @@ +package backchannel + +import ( + "regexp" + "strings" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +const Category = "backchannel" + +var patterns = []*regexp.Regexp{ + regexp.MustCompile(`(?i)^(yeah|yep|yes|right|okay|ok|sure|mm+h?m+|uh[- ]huh|mhm|mm-hmm)\.?$`), + regexp.MustCompile(`(?i)^(yeah|yep|right|okay|ok)([,.\s]+(yeah|yep|right|okay|ok))*\.?$`), + regexp.MustCompile(`(?i)^(i see|got it|makes sense|that makes sense|fair enough|sounds good)\.?$`), +} + +// Apply tags matching short acknowledgement segments. +func Apply(in model.MergedTranscript) (model.MergedTranscript, int) { + tagged := 0 + for index := range in.Segments { + if !matches(in.Segments[index]) { + continue + } + if hasCategory(in.Segments[index], Category) { + continue + } + in.Segments[index].Categories = append(in.Segments[index].Categories, Category) + tagged++ + } + return in, tagged +} + +func matches(segment model.Segment) bool { + text := strings.TrimSpace(segment.Text) + if text == "" { + return false + } + if len(strings.Fields(text)) > 3 { + return false + } + if segment.End-segment.Start > 1.0 { + return false + } + for _, pattern := range patterns { + if pattern.MatchString(text) { + return true + } + } + return false +} + +func hasCategory(segment model.Segment, category string) bool { + for _, existing := range segment.Categories { + if existing == category { + return true + } + } + return false +} diff --git a/internal/backchannel/backchannel_test.go b/internal/backchannel/backchannel_test.go new file mode 100644 index 0000000..291cbbb --- /dev/null +++ b/internal/backchannel/backchannel_test.go @@ -0,0 +1,104 @@ +package backchannel + +import ( + "reflect" + "testing" + + "gitea.maximumdirect.net/eric/seriatim/internal/model" +) + +func TestApplyTagsVerySafeBackchannels(t *testing.T) { + for _, text := range []string{"yeah", "Yep.", "mmhm", "uh-huh", "mm-hmm"} { + t.Run(text, func(t *testing.T) { + got, tagged := Apply(transcript(segment(text, 1, 1.5))) + if tagged != 1 { + t.Fatalf("tagged = %d, want 1", tagged) + } + assertCategories(t, got.Segments[0], []string{Category}) + }) + } +} + +func TestApplyTagsRepeatedBackchannels(t *testing.T) { + got, tagged := Apply(transcript(segment("Yeah, okay yep.", 1, 1.8))) + if tagged != 1 { + t.Fatalf("tagged = %d, want 1", tagged) + } + assertCategories(t, got.Segments[0], []string{Category}) +} + +func TestApplyTagsShortAcknowledgements(t *testing.T) { + for _, text := range []string{"i see", "Got it.", "sounds good"} { + t.Run(text, func(t *testing.T) { + got, tagged := Apply(transcript(segment(text, 1, 1.8))) + if tagged != 1 { + t.Fatalf("tagged = %d, want 1", tagged) + } + assertCategories(t, got.Segments[0], []string{Category}) + }) + } +} + +func TestApplyMatchesTrimAwareCaseInsensitive(t *testing.T) { + got, tagged := Apply(transcript(segment(" YES. ", 1, 1.2))) + if tagged != 1 { + t.Fatalf("tagged = %d, want 1", tagged) + } + assertCategories(t, got.Segments[0], []string{Category}) +} + +func TestApplyDoesNotTagNonMatches(t *testing.T) { + got, tagged := Apply(transcript(segment("yeah I think so", 1, 1.5))) + if tagged != 0 { + t.Fatalf("tagged = %d, want 0", tagged) + } + assertCategories(t, got.Segments[0], nil) +} + +func TestApplyRejectsWordCountOverThree(t *testing.T) { + got, tagged := Apply(transcript(segment("that makes sense okay", 1, 1.5))) + if tagged != 0 { + t.Fatalf("tagged = %d, want 0", tagged) + } + assertCategories(t, got.Segments[0], nil) +} + +func TestApplyRejectsDurationOverOneSecond(t *testing.T) { + got, tagged := Apply(transcript(segment("yeah", 1, 2.1))) + if tagged != 0 { + t.Fatalf("tagged = %d, want 0", tagged) + } + assertCategories(t, got.Segments[0], nil) +} + +func TestApplyPreservesExistingCategoriesAndAvoidsDuplicate(t *testing.T) { + existing := segment("yeah", 1, 1.2) + existing.Categories = []string{"manual", Category} + + got, tagged := Apply(transcript(existing)) + if tagged != 0 { + t.Fatalf("tagged = %d, want 0", tagged) + } + assertCategories(t, got.Segments[0], []string{"manual", Category}) +} + +func transcript(segments ...model.Segment) model.MergedTranscript { + return model.MergedTranscript{Segments: segments} +} + +func segment(text string, start float64, end float64) model.Segment { + return model.Segment{ + Source: "input.json", + Speaker: "Alice", + Start: start, + End: end, + Text: text, + } +} + +func assertCategories(t *testing.T, segment model.Segment, want []string) { + t.Helper() + if !reflect.DeepEqual(segment.Categories, want) { + t.Fatalf("categories = %v, want %v", segment.Categories, want) + } +} diff --git a/internal/builtin/postprocess.go b/internal/builtin/postprocess.go index 41e0c74..41913c5 100644 --- a/internal/builtin/postprocess.go +++ b/internal/builtin/postprocess.go @@ -5,6 +5,7 @@ import ( "fmt" "gitea.maximumdirect.net/eric/seriatim/internal/autocorrect" + "gitea.maximumdirect.net/eric/seriatim/internal/backchannel" "gitea.maximumdirect.net/eric/seriatim/internal/coalesce" "gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/model" @@ -98,6 +99,23 @@ func (resolveOverlaps) Process(ctx context.Context, in model.MergedTranscript, c }, nil } +type backchannelPostprocessor struct{} + +func (backchannelPostprocessor) Name() string { + return "backchannel" +} + +func (backchannelPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) { + if err := ctx.Err(); err != nil { + return model.MergedTranscript{}, nil, err + } + + out, tagged := backchannel.Apply(in) + return out, []report.Event{ + report.Info("postprocessing", "backchannel", fmt.Sprintf("tagged %d backchannel segment(s)", tagged)), + }, nil +} + type coalescePostprocessor struct{} func (coalescePostprocessor) Name() string { diff --git a/internal/builtin/registry.go b/internal/builtin/registry.go index c1794f5..7faf232 100644 --- a/internal/builtin/registry.go +++ b/internal/builtin/registry.go @@ -13,6 +13,7 @@ func NewRegistry() *pipeline.Registry { registry.RegisterMerger(placeholderMerger{}) registry.RegisterPostprocessor(detectOverlaps{}) registry.RegisterPostprocessor(resolveOverlaps{}) + registry.RegisterPostprocessor(backchannelPostprocessor{}) registry.RegisterPostprocessor(coalescePostprocessor{}) registry.RegisterPostprocessor(assignIDs{}) registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"}) diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index f8d636e..132e997 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { "placeholder-merger", "detect-overlaps", "resolve-overlaps", + "backchannel", "coalesce", "detect-overlaps", "autocorrect", @@ -585,6 +586,92 @@ func TestMergeCoalesceGapOverridePreventsMerge(t *testing.T) { } } +func TestMergeTagsBackchannelSegments(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "input.json", `{ + "segments": [ + {"start": 1, "end": 1.5, "text": " Yeah. "}, + {"start": 6, "end": 7, "text": "not a backchannel"} + ] + }`) + output := filepath.Join(dir, "merged.json") + reportPath := filepath.Join(dir, "report.json") + + err := executeMerge( + "--input-file", input, + "--output-file", output, + "--report-file", reportPath, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(transcript.Segments)) + } + if !equalStrings(transcript.Segments[0].Categories, []string{"backchannel"}) { + t.Fatalf("segment categories = %v, want [backchannel]", transcript.Segments[0].Categories) + } + if len(transcript.Segments[1].Categories) != 0 { + t.Fatalf("unexpected categories = %v", transcript.Segments[1].Categories) + } + + var rpt report.Report + readJSON(t, reportPath, &rpt) + if !hasReportEvent(rpt, "postprocessing", "backchannel", "tagged 1 backchannel segment(s)") { + t.Fatal("expected backchannel report event") + } +} + +func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) { + dir := t.TempDir() + inputA := writeJSONFile(t, dir, "a.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": "first"}, + {"start": 3, "end": 4, "text": "second"} + ] + }`) + inputB := writeJSONFile(t, dir, "b.json", `{ + "segments": [ + {"start": 2.2, "end": 2.5, "text": "yeah"} + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] +`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", inputA, + "--input-file", inputB, + "--speakers", speakers, + "--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if len(transcript.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(transcript.Segments)) + } + if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "first second" { + t.Fatalf("first segment = %#v, want coalesced Alice", transcript.Segments[0]) + } + if len(transcript.Segments[0].Categories) != 0 { + t.Fatalf("coalesced segment categories = %v, want none", transcript.Segments[0].Categories) + } + if transcript.Segments[1].Speaker != "Bob" || !equalStrings(transcript.Segments[1].Categories, []string{"backchannel"}) { + t.Fatalf("second segment = %#v, want Bob backchannel", transcript.Segments[1]) + } +} + func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ diff --git a/internal/coalesce/coalesce.go b/internal/coalesce/coalesce.go index dac65db..52c12f2 100644 --- a/internal/coalesce/coalesce.go +++ b/internal/coalesce/coalesce.go @@ -27,16 +27,24 @@ func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summ coalescedID := 0 current := newRun(in.Segments[0]) + pendingBackchannels := make([]model.Segment, 0) for _, segment := range in.Segments[1:] { if current.canMerge(segment, gap) { current.add(segment) continue } + if segment.Speaker != current.speaker() && hasCategory(segment, "backchannel") { + pendingBackchannels = append(pendingBackchannels, segment) + continue + } coalescedID = appendRun(&out, current, coalescedID, &summary) + out.Segments = append(out.Segments, pendingBackchannels...) + pendingBackchannels = pendingBackchannels[:0] current = newRun(segment) } - appendRun(&out, current, coalescedID, &summary) + coalescedID = appendRun(&out, current, coalescedID, &summary) + out.Segments = append(out.Segments, pendingBackchannels...) return out, summary } @@ -56,6 +64,10 @@ func (r run) canMerge(next model.Segment, gap float64) bool { return current.Speaker == next.Speaker && next.Start-current.End <= gap } +func (r run) speaker() string { + return r.segments[0].Speaker +} + func (r *run) add(segment model.Segment) { r.segments = append(r.segments, segment) } @@ -116,3 +128,12 @@ func segmentRef(segment model.Segment) string { } return segment.Source } + +func hasCategory(segment model.Segment, category string) bool { + for _, existing := range segment.Categories { + if existing == category { + return true + } + } + return false +} diff --git a/internal/coalesce/coalesce_test.go b/internal/coalesce/coalesce_test.go index 33cbda4..cff3ebf 100644 --- a/internal/coalesce/coalesce_test.go +++ b/internal/coalesce/coalesce_test.go @@ -137,6 +137,55 @@ func TestApplyDerivedProvenanceForMixedSourcesAndDerivedInputs(t *testing.T) { } } +func TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun(t *testing.T) { + first := segment("a.json", 0, "Alice", 1, 2, "yeah") + first.Categories = []string{"backchannel"} + second := segment("a.json", 1, "Alice", 2.5, 3, "more") + + got, _ := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3) + if len(got.Segments) != 1 { + t.Fatalf("segment count = %d, want 1", len(got.Segments)) + } + if got.Segments[0].Categories != nil { + t.Fatalf("categories = %v, want nil", got.Segments[0].Categories) + } +} + +func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) { + first := segment("a.json", 0, "Alice", 1, 2, "first") + backchannel := segment("b.json", 0, "Bob", 2.2, 2.5, "yeah") + backchannel.Categories = []string{"backchannel"} + second := segment("a.json", 1, "Alice", 3, 4, "second") + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, backchannel, second}}, 3) + if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 2 { + t.Fatalf("segment count = %d, want 2", len(got.Segments)) + } + if got.Segments[0].Text != "first second" { + t.Fatalf("first output text = %q, want first second", got.Segments[0].Text) + } + if got.Segments[1].Text != "yeah" || !reflect.DeepEqual(got.Segments[1].Categories, []string{"backchannel"}) { + t.Fatalf("second output segment = %#v", got.Segments[1]) + } +} + +func TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge(t *testing.T) { + first := segment("a.json", 0, "Alice", 1, 2, "first") + bob := segment("b.json", 0, "Bob", 2.2, 2.5, "interruption") + second := segment("a.json", 1, "Alice", 3, 4, "second") + + got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, bob, second}}, 3) + if summary.OriginalSegmentsMerged != 0 || summary.CoalescedSegments != 0 { + t.Fatalf("summary = %#v", summary) + } + if len(got.Segments) != 3 { + t.Fatalf("segment count = %d, want 3", len(got.Segments)) + } +} + func segment(source string, sourceIndex int, speaker string, start float64, end float64, text string) model.Segment { return model.Segment{ Source: source, diff --git a/internal/config/config.go b/internal/config/config.go index 80818aa..4ebc641 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,7 +14,7 @@ const ( DefaultInputReader = "json-files" DefaultOutputModules = "json" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" - DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output" + DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output" DefaultOverlapWordRunGap = 0.75 DefaultWordRunReorderWindow = 0.4 DefaultCoalesceGap = 3.0 diff --git a/internal/model/model.go b/internal/model/model.go index 9c3c8f1..b129ec9 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -56,6 +56,7 @@ type Segment struct { Start float64 `json:"start"` End float64 `json:"end"` Text string `json:"text"` + Categories []string `json:"categories,omitempty"` Words []Word `json:"words,omitempty"` OverlapGroupID int `json:"overlap_group_id,omitempty"` }