Implemented a module to detect backchannel segments, and updated the coalesce module to ignore them when coalescing same-speaker turns

This commit is contained in:
2026-04-27 19:49:25 -05:00
parent aab6d12730
commit bbfb8aba44
10 changed files with 360 additions and 6 deletions

View File

@@ -44,7 +44,7 @@ Optional flags:
- `--input-reader`: input reader module. Default: `json-files`. - `--input-reader`: input reader module. Default: `json-files`.
- `--output-modules`: comma-separated output modules. Default: `json`. - `--output-modules`: comma-separated output modules. Default: `json`.
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`. - `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`. - `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`.
- `--coalesce-gap`: maximum same-speaker gap in seconds for `coalesce`. Default: `3.0`. - `--coalesce-gap`: maximum same-speaker gap in seconds for `coalesce`. Default: `3.0`.
## Input JSON Format ## Input JSON Format
@@ -151,7 +151,7 @@ The merged output uses the current seriatim envelope:
"input_reader": "json-files", "input_reader": "json-files",
"input_files": ["eric.json", "mike.json"], "input_files": ["eric.json", "mike.json"],
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
"output_modules": ["json"] "output_modules": ["json"]
}, },
"segments": [ "segments": [
@@ -173,7 +173,8 @@ The merged output uses the current seriatim envelope:
"speaker": "Eric Rakestraw", "speaker": "Eric Rakestraw",
"start": 2.0, "start": 2.0,
"end": 2.5, "end": 2.5,
"text": "Resolved word run" "text": "Resolved word run",
"categories": ["backchannel"]
} }
], ],
"overlap_groups": [ "overlap_groups": [
@@ -215,7 +216,7 @@ Overlap behavior:
## Overlap Resolution ## Overlap Resolution
The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `coalesce`, then a second `detect-overlaps` pass. The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `coalesce`, then a second `detect-overlaps` pass.
For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments: For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments:
@@ -236,12 +237,24 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word
- If a speaker has no usable word timing in a group, that speaker's original segment is kept. - If a speaker has no usable word timing in a group, that speaker's original segment is kept.
- If no speakers in a group have usable word timing, the original group and annotations remain unchanged. - If no speakers in a group have usable word timing, the original group and annotations remain unchanged.
## Backchannels
The default pipeline runs `backchannel` before `coalesce`. It tags short acknowledgement segments with:
```json
"categories": ["backchannel"]
```
Backchannel matching is case-insensitive and trims surrounding whitespace. A segment is tagged only when its text matches an acknowledgement phrase, contains no more than three whitespace-delimited words, and spans no more than `1.0` second.
## Coalescing ## Coalescing
The default pipeline runs `coalesce` before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`. The default pipeline runs `coalesce` before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`.
Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`. Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`.
Different-speaker backchannel segments do not block coalescing of surrounding same-speaker segments. When same-speaker segments are coalesced, any `backchannel` category from the merged inputs is dropped from the coalesced segment.
## Autocorrect ## Autocorrect
Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report. Autocorrect is included in the default postprocessing pipeline. If `--autocorrect` is omitted, the module leaves transcript text unchanged and records a skip event in the optional report.

View File

@@ -0,0 +1,60 @@
package backchannel
import (
"regexp"
"strings"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// Category is the category value attached to detected backchannel segments.
const Category = "backchannel"
// patterns lists the acknowledgement phrases treated as backchannels. Each
// regexp is case-insensitive ((?i)), anchored to the whole trimmed segment
// text, and allows a single optional trailing period.
var patterns = []*regexp.Regexp{
// Single acknowledgement words and hum-style fillers (mmhm, uh-huh, mm-hmm).
regexp.MustCompile(`(?i)^(yeah|yep|yes|right|okay|ok|sure|mm+h?m+|uh[- ]huh|mhm|mm-hmm)\.?$`),
// Short runs of repeated acknowledgements separated by commas, periods,
// or whitespace, e.g. "Yeah, okay yep.".
regexp.MustCompile(`(?i)^(yeah|yep|right|okay|ok)([,.\s]+(yeah|yep|right|okay|ok))*\.?$`),
// Multi-word acknowledgement phrases.
regexp.MustCompile(`(?i)^(i see|got it|makes sense|that makes sense|fair enough|sounds good)\.?$`),
}
// Apply scans the transcript and appends the backchannel Category to every
// segment that matches a short acknowledgement and is not already tagged.
// It returns the transcript and the number of segments newly tagged.
func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
	count := 0
	for i := range in.Segments {
		seg := &in.Segments[i]
		// Check the cheap membership test before running the regexes.
		if hasCategory(*seg, Category) || !matches(*seg) {
			continue
		}
		seg.Categories = append(seg.Categories, Category)
		count++
	}
	return in, count
}
// matches reports whether segment qualifies as a backchannel: its trimmed
// text must be non-empty, contain at most three whitespace-delimited words,
// span at most one second, and match one of the acknowledgement patterns.
func matches(segment model.Segment) bool {
	trimmed := strings.TrimSpace(segment.Text)
	switch {
	case trimmed == "":
		return false
	case len(strings.Fields(trimmed)) > 3:
		return false
	case segment.End-segment.Start > 1.0:
		return false
	}
	for _, re := range patterns {
		if re.MatchString(trimmed) {
			return true
		}
	}
	return false
}
// hasCategory reports whether the segment already carries category.
func hasCategory(segment model.Segment, category string) bool {
	for i := range segment.Categories {
		if segment.Categories[i] == category {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,104 @@
package backchannel
import (
"reflect"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// TestApplyTagsVerySafeBackchannels verifies that unambiguous one-word
// acknowledgements are tagged regardless of case or a trailing period.
func TestApplyTagsVerySafeBackchannels(t *testing.T) {
for _, text := range []string{"yeah", "Yep.", "mmhm", "uh-huh", "mm-hmm"} {
t.Run(text, func(t *testing.T) {
got, tagged := Apply(transcript(segment(text, 1, 1.5)))
if tagged != 1 {
t.Fatalf("tagged = %d, want 1", tagged)
}
assertCategories(t, got.Segments[0], []string{Category})
})
}
}
// TestApplyTagsRepeatedBackchannels verifies that a short run of repeated
// acknowledgements ("Yeah, okay yep.") is tagged as a single backchannel.
func TestApplyTagsRepeatedBackchannels(t *testing.T) {
got, tagged := Apply(transcript(segment("Yeah, okay yep.", 1, 1.8)))
if tagged != 1 {
t.Fatalf("tagged = %d, want 1", tagged)
}
assertCategories(t, got.Segments[0], []string{Category})
}
// TestApplyTagsShortAcknowledgements verifies that multi-word
// acknowledgement phrases within the three-word limit are tagged.
func TestApplyTagsShortAcknowledgements(t *testing.T) {
for _, text := range []string{"i see", "Got it.", "sounds good"} {
t.Run(text, func(t *testing.T) {
got, tagged := Apply(transcript(segment(text, 1, 1.8)))
if tagged != 1 {
t.Fatalf("tagged = %d, want 1", tagged)
}
assertCategories(t, got.Segments[0], []string{Category})
})
}
}
// TestApplyMatchesTrimAwareCaseInsensitive verifies matching after trimming
// surrounding whitespace and ignoring letter case (" YES. ").
func TestApplyMatchesTrimAwareCaseInsensitive(t *testing.T) {
got, tagged := Apply(transcript(segment(" YES. ", 1, 1.2)))
if tagged != 1 {
t.Fatalf("tagged = %d, want 1", tagged)
}
assertCategories(t, got.Segments[0], []string{Category})
}
// TestApplyDoesNotTagNonMatches verifies that text starting with an
// acknowledgement but continuing into a sentence is left untagged.
func TestApplyDoesNotTagNonMatches(t *testing.T) {
got, tagged := Apply(transcript(segment("yeah I think so", 1, 1.5)))
if tagged != 0 {
t.Fatalf("tagged = %d, want 0", tagged)
}
assertCategories(t, got.Segments[0], nil)
}
// TestApplyRejectsWordCountOverThree verifies the three-word limit: a
// four-word phrase is rejected even though it contains a known phrase.
func TestApplyRejectsWordCountOverThree(t *testing.T) {
got, tagged := Apply(transcript(segment("that makes sense okay", 1, 1.5)))
if tagged != 0 {
t.Fatalf("tagged = %d, want 0", tagged)
}
assertCategories(t, got.Segments[0], nil)
}
// TestApplyRejectsDurationOverOneSecond verifies the duration limit: a
// matching word spanning 1.1s (> 1.0s) is rejected.
func TestApplyRejectsDurationOverOneSecond(t *testing.T) {
got, tagged := Apply(transcript(segment("yeah", 1, 2.1)))
if tagged != 0 {
t.Fatalf("tagged = %d, want 0", tagged)
}
assertCategories(t, got.Segments[0], nil)
}
// TestApplyPreservesExistingCategoriesAndAvoidsDuplicate verifies that a
// segment already tagged as a backchannel is not tagged again and that
// unrelated categories are preserved in order.
func TestApplyPreservesExistingCategoriesAndAvoidsDuplicate(t *testing.T) {
existing := segment("yeah", 1, 1.2)
existing.Categories = []string{"manual", Category}
got, tagged := Apply(transcript(existing))
if tagged != 0 {
t.Fatalf("tagged = %d, want 0", tagged)
}
assertCategories(t, got.Segments[0], []string{"manual", Category})
}
// transcript wraps the given segments in a MergedTranscript for test input.
func transcript(segments ...model.Segment) model.MergedTranscript {
return model.MergedTranscript{Segments: segments}
}
// segment builds a minimal Segment with a fixed source and speaker; only
// text and timing vary across tests.
func segment(text string, start float64, end float64) model.Segment {
return model.Segment{
Source: "input.json",
Speaker: "Alice",
Start: start,
End: end,
Text: text,
}
}
// assertCategories fails the test when the segment's categories differ from
// want (deep equality, so nil and empty slice are distinct).
func assertCategories(t *testing.T, segment model.Segment, want []string) {
t.Helper()
if !reflect.DeepEqual(segment.Categories, want) {
t.Fatalf("categories = %v, want %v", segment.Categories, want)
}
}

View File

@@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"gitea.maximumdirect.net/eric/seriatim/internal/autocorrect" "gitea.maximumdirect.net/eric/seriatim/internal/autocorrect"
"gitea.maximumdirect.net/eric/seriatim/internal/backchannel"
"gitea.maximumdirect.net/eric/seriatim/internal/coalesce" "gitea.maximumdirect.net/eric/seriatim/internal/coalesce"
"gitea.maximumdirect.net/eric/seriatim/internal/config" "gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/model" "gitea.maximumdirect.net/eric/seriatim/internal/model"
@@ -98,6 +99,23 @@ func (resolveOverlaps) Process(ctx context.Context, in model.MergedTranscript, c
}, nil }, nil
} }
// backchannelPostprocessor exposes the backchannel module as the
// "backchannel" stage of the postprocessing pipeline.
type backchannelPostprocessor struct{}

// Name returns the module identifier used in --postprocessing-modules.
func (backchannelPostprocessor) Name() string {
	return "backchannel"
}

// Process tags short acknowledgement segments via backchannel.Apply and
// emits one info event recording how many segments were tagged. It returns
// early when the context has already been cancelled.
func (backchannelPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
	if err := ctx.Err(); err != nil {
		return model.MergedTranscript{}, nil, err
	}
	transcript, tagged := backchannel.Apply(in)
	events := []report.Event{
		report.Info("postprocessing", "backchannel", fmt.Sprintf("tagged %d backchannel segment(s)", tagged)),
	}
	return transcript, events, nil
}
type coalescePostprocessor struct{} type coalescePostprocessor struct{}
func (coalescePostprocessor) Name() string { func (coalescePostprocessor) Name() string {

View File

@@ -13,6 +13,7 @@ func NewRegistry() *pipeline.Registry {
registry.RegisterMerger(placeholderMerger{}) registry.RegisterMerger(placeholderMerger{})
registry.RegisterPostprocessor(detectOverlaps{}) registry.RegisterPostprocessor(detectOverlaps{})
registry.RegisterPostprocessor(resolveOverlaps{}) registry.RegisterPostprocessor(resolveOverlaps{})
registry.RegisterPostprocessor(backchannelPostprocessor{})
registry.RegisterPostprocessor(coalescePostprocessor{}) registry.RegisterPostprocessor(coalescePostprocessor{})
registry.RegisterPostprocessor(assignIDs{}) registry.RegisterPostprocessor(assignIDs{})
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"}) registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})

View File

@@ -90,6 +90,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
"placeholder-merger", "placeholder-merger",
"detect-overlaps", "detect-overlaps",
"resolve-overlaps", "resolve-overlaps",
"backchannel",
"coalesce", "coalesce",
"detect-overlaps", "detect-overlaps",
"autocorrect", "autocorrect",
@@ -585,6 +586,92 @@ func TestMergeCoalesceGapOverridePreventsMerge(t *testing.T) {
} }
} }
// TestMergeTagsBackchannelSegments runs the full merge pipeline on one input
// file and verifies that a short acknowledgement (" Yeah. ", 0.5s) is tagged
// with the backchannel category, that a longer segment is not, and that the
// report records exactly one tagging event.
func TestMergeTagsBackchannelSegments(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{
"segments": [
{"start": 1, "end": 1.5, "text": " Yeah. "},
{"start": 6, "end": 7, "text": "not a backchannel"}
]
}`)
output := filepath.Join(dir, "merged.json")
reportPath := filepath.Join(dir, "report.json")
err := executeMerge(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
)
if err != nil {
t.Fatalf("merge failed: %v", err)
}
var transcript model.FinalTranscript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if !equalStrings(transcript.Segments[0].Categories, []string{"backchannel"}) {
t.Fatalf("segment categories = %v, want [backchannel]", transcript.Segments[0].Categories)
}
if len(transcript.Segments[1].Categories) != 0 {
t.Fatalf("unexpected categories = %v", transcript.Segments[1].Categories)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
if !hasReportEvent(rpt, "postprocessing", "backchannel", "tagged 1 backchannel segment(s)") {
t.Fatal("expected backchannel report event")
}
}
// TestMergeCoalescesAroundDifferentSpeakerBackchannel verifies end-to-end
// that a different-speaker backchannel ("yeah" from Bob) between two Alice
// segments does not block coalescing: Alice's segments merge into one, the
// merged segment carries no categories, and Bob's segment keeps its
// backchannel tag.
func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) {
dir := t.TempDir()
inputA := writeJSONFile(t, dir, "a.json", `{
"segments": [
{"start": 1, "end": 2, "text": "first"},
{"start": 3, "end": 4, "text": "second"}
]
}`)
inputB := writeJSONFile(t, dir, "b.json", `{
"segments": [
{"start": 2.2, "end": 2.5, "text": "yeah"}
]
}`)
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
- speaker: Alice
match: ["a.json"]
- speaker: Bob
match: ["b.json"]
`)
output := filepath.Join(dir, "merged.json")
err := executeMerge(
"--input-file", inputA,
"--input-file", inputB,
"--speakers", speakers,
"--output-file", output,
)
if err != nil {
t.Fatalf("merge failed: %v", err)
}
var transcript model.FinalTranscript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "first second" {
t.Fatalf("first segment = %#v, want coalesced Alice", transcript.Segments[0])
}
if len(transcript.Segments[0].Categories) != 0 {
t.Fatalf("coalesced segment categories = %v, want none", transcript.Segments[0].Categories)
}
if transcript.Segments[1].Speaker != "Bob" || !equalStrings(transcript.Segments[1].Categories, []string{"backchannel"}) {
t.Fatalf("second segment = %#v, want Bob backchannel", transcript.Segments[1])
}
}
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{

View File

@@ -27,16 +27,24 @@ func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summ
coalescedID := 0 coalescedID := 0
current := newRun(in.Segments[0]) current := newRun(in.Segments[0])
pendingBackchannels := make([]model.Segment, 0)
for _, segment := range in.Segments[1:] { for _, segment := range in.Segments[1:] {
if current.canMerge(segment, gap) { if current.canMerge(segment, gap) {
current.add(segment) current.add(segment)
continue continue
} }
if segment.Speaker != current.speaker() && hasCategory(segment, "backchannel") {
pendingBackchannels = append(pendingBackchannels, segment)
continue
}
coalescedID = appendRun(&out, current, coalescedID, &summary) coalescedID = appendRun(&out, current, coalescedID, &summary)
out.Segments = append(out.Segments, pendingBackchannels...)
pendingBackchannels = pendingBackchannels[:0]
current = newRun(segment) current = newRun(segment)
} }
appendRun(&out, current, coalescedID, &summary) coalescedID = appendRun(&out, current, coalescedID, &summary)
out.Segments = append(out.Segments, pendingBackchannels...)
return out, summary return out, summary
} }
@@ -56,6 +64,10 @@ func (r run) canMerge(next model.Segment, gap float64) bool {
return current.Speaker == next.Speaker && next.Start-current.End <= gap return current.Speaker == next.Speaker && next.Start-current.End <= gap
} }
// speaker returns the speaker of the run's first segment. canMerge only
// admits same-speaker segments, so this identifies the whole run.
func (r run) speaker() string {
return r.segments[0].Speaker
}
func (r *run) add(segment model.Segment) { func (r *run) add(segment model.Segment) {
r.segments = append(r.segments, segment) r.segments = append(r.segments, segment)
} }
@@ -116,3 +128,12 @@ func segmentRef(segment model.Segment) string {
} }
return segment.Source return segment.Source
} }
// hasCategory reports whether the segment carries the given category.
// NOTE(review): duplicates the identical helper in the backchannel package —
// consider hoisting a single implementation into a shared location.
func hasCategory(segment model.Segment, category string) bool {
for _, existing := range segment.Categories {
if existing == category {
return true
}
}
return false
}

View File

@@ -137,6 +137,55 @@ func TestApplyDerivedProvenanceForMixedSourcesAndDerivedInputs(t *testing.T) {
} }
} }
// TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun verifies that
// when a backchannel-tagged segment is coalesced with a same-speaker
// neighbor, the merged segment carries no categories at all (nil, not empty).
func TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun(t *testing.T) {
first := segment("a.json", 0, "Alice", 1, 2, "yeah")
first.Categories = []string{"backchannel"}
second := segment("a.json", 1, "Alice", 2.5, 3, "more")
got, _ := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3)
if len(got.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(got.Segments))
}
if got.Segments[0].Categories != nil {
t.Fatalf("categories = %v, want nil", got.Segments[0].Categories)
}
}
// TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker verifies that a
// different-speaker backchannel between two same-speaker segments does not
// block their coalescing: Alice's two segments merge, and Bob's backchannel
// is re-emitted after the merged run with its category intact.
func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) {
first := segment("a.json", 0, "Alice", 1, 2, "first")
backchannel := segment("b.json", 0, "Bob", 2.2, 2.5, "yeah")
backchannel.Categories = []string{"backchannel"}
second := segment("a.json", 1, "Alice", 3, 4, "second")
got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, backchannel, second}}, 3)
if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 {
t.Fatalf("summary = %#v", summary)
}
if len(got.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(got.Segments))
}
if got.Segments[0].Text != "first second" {
t.Fatalf("first output text = %q, want first second", got.Segments[0].Text)
}
if got.Segments[1].Text != "yeah" || !reflect.DeepEqual(got.Segments[1].Categories, []string{"backchannel"}) {
t.Fatalf("second output segment = %#v", got.Segments[1])
}
}
// TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge verifies that an
// untagged different-speaker interruption still blocks coalescing: nothing
// merges and all three segments pass through unchanged.
func TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge(t *testing.T) {
first := segment("a.json", 0, "Alice", 1, 2, "first")
bob := segment("b.json", 0, "Bob", 2.2, 2.5, "interruption")
second := segment("a.json", 1, "Alice", 3, 4, "second")
got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, bob, second}}, 3)
if summary.OriginalSegmentsMerged != 0 || summary.CoalescedSegments != 0 {
t.Fatalf("summary = %#v", summary)
}
if len(got.Segments) != 3 {
t.Fatalf("segment count = %d, want 3", len(got.Segments))
}
}
func segment(source string, sourceIndex int, speaker string, start float64, end float64, text string) model.Segment { func segment(source string, sourceIndex int, speaker string, start float64, end float64, text string) model.Segment {
return model.Segment{ return model.Segment{
Source: source, Source: source,

View File

@@ -14,7 +14,7 @@ const (
DefaultInputReader = "json-files" DefaultInputReader = "json-files"
DefaultOutputModules = "json" DefaultOutputModules = "json"
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text" DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output" DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
DefaultOverlapWordRunGap = 0.75 DefaultOverlapWordRunGap = 0.75
DefaultWordRunReorderWindow = 0.4 DefaultWordRunReorderWindow = 0.4
DefaultCoalesceGap = 3.0 DefaultCoalesceGap = 3.0

View File

@@ -56,6 +56,7 @@ type Segment struct {
Start float64 `json:"start"` Start float64 `json:"start"`
End float64 `json:"end"` End float64 `json:"end"`
Text string `json:"text"` Text string `json:"text"`
Categories []string `json:"categories,omitempty"`
Words []Word `json:"words,omitempty"` Words []Word `json:"words,omitempty"`
OverlapGroupID int `json:"overlap_group_id,omitempty"` OverlapGroupID int `json:"overlap_group_id,omitempty"`
} }