Implemented a module to detect filler segments, and skip them for purposes of same-speaker segment coalescing
This commit is contained in:
18
README.md
18
README.md
@@ -44,7 +44,7 @@ Optional flags:
|
||||
- `--input-reader`: input reader module. Default: `json-files`.
|
||||
- `--output-modules`: comma-separated output modules. Default: `json`.
|
||||
- `--preprocessing-modules`: comma-separated preprocessing modules. Default: `validate-raw,normalize-speakers,trim-text`.
|
||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`.
|
||||
- `--postprocessing-modules`: comma-separated postprocessing modules. Default: `detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output`.
|
||||
- `--coalesce-gap`: maximum same-speaker gap in seconds for `coalesce`. Default: `3.0`.
|
||||
|
||||
## Input JSON Format
|
||||
@@ -151,7 +151,7 @@ The merged output uses the current seriatim envelope:
|
||||
"input_reader": "json-files",
|
||||
"input_files": ["eric.json", "mike.json"],
|
||||
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
|
||||
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
|
||||
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "filler", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
|
||||
"output_modules": ["json"]
|
||||
},
|
||||
"segments": [
|
||||
@@ -216,7 +216,7 @@ Overlap behavior:
|
||||
|
||||
## Overlap Resolution
|
||||
|
||||
The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `coalesce`, then a second `detect-overlaps` pass.
|
||||
The default postprocessing pipeline runs `detect-overlaps`, then `resolve-overlaps`, then `backchannel`, then `filler`, then `coalesce`, then a second `detect-overlaps` pass.
|
||||
|
||||
For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word timing to build smaller word-run replacement segments:
|
||||
|
||||
@@ -247,13 +247,23 @@ The default pipeline runs `backchannel` before `coalesce`. It tags short acknowl
|
||||
|
||||
Backchannel matching is case-insensitive, trims surrounding whitespace, and requires a matching acknowledgement phrase, no more than three whitespace-delimited words, and duration no greater than `1.0` second.
|
||||
|
||||
## Fillers
|
||||
|
||||
The default pipeline runs `filler` after `backchannel` and before `coalesce`. It tags short filler utterances with:
|
||||
|
||||
```json
|
||||
"categories": ["filler"]
|
||||
```
|
||||
|
||||
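Filler matching is case-insensitive and ignores surrounding whitespace. A segment matches only when its entire text consists of filler tokens — `um`, `uh`, `er`, `erm`, `ah`, `eh`, `hmm`, or `mm`, including elongated spellings such as `ummm` or `uhhh` — separated by whitespace; text containing any other word or punctuation (for example `um okay` or `uh-huh`) does not match. Matching segments must contain no more than three whitespace-delimited words and have duration no greater than `1.0` second.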
|
||||
|
||||
## Coalescing
|
||||
|
||||
The default pipeline runs `coalesce` before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`.
|
||||
|
||||
Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`.
|
||||
|
||||
Different-speaker backchannel segments do not block coalescing of surrounding same-speaker segments. When same-speaker segments are coalesced, any `backchannel` category from the merged inputs is dropped from the coalesced segment.
|
||||
Different-speaker backchannel and filler segments do not block coalescing of surrounding same-speaker segments. When same-speaker segments are coalesced, any `backchannel` or `filler` category from the merged inputs is dropped from the coalesced segment.
|
||||
|
||||
## Autocorrect
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/backchannel"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/coalesce"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/filler"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/overlap"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
@@ -116,6 +117,23 @@ func (backchannelPostprocessor) Process(ctx context.Context, in model.MergedTran
|
||||
}, nil
|
||||
}
|
||||
|
||||
// fillerPostprocessor is the stateless postprocessor for the "filler" module.
type fillerPostprocessor struct{}

// Name reports the module name, "filler".
func (fillerPostprocessor) Name() string {
	const moduleName = "filler"
	return moduleName
}
|
||||
|
||||
func (fillerPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
out, tagged := filler.Apply(in)
|
||||
return out, []report.Event{
|
||||
report.Info("postprocessing", "filler", fmt.Sprintf("tagged %d filler segment(s)", tagged)),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type coalescePostprocessor struct{}
|
||||
|
||||
func (coalescePostprocessor) Name() string {
|
||||
|
||||
@@ -14,6 +14,7 @@ func NewRegistry() *pipeline.Registry {
|
||||
registry.RegisterPostprocessor(detectOverlaps{})
|
||||
registry.RegisterPostprocessor(resolveOverlaps{})
|
||||
registry.RegisterPostprocessor(backchannelPostprocessor{})
|
||||
registry.RegisterPostprocessor(fillerPostprocessor{})
|
||||
registry.RegisterPostprocessor(coalescePostprocessor{})
|
||||
registry.RegisterPostprocessor(assignIDs{})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
|
||||
|
||||
@@ -91,6 +91,7 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
|
||||
"detect-overlaps",
|
||||
"resolve-overlaps",
|
||||
"backchannel",
|
||||
"filler",
|
||||
"coalesce",
|
||||
"detect-overlaps",
|
||||
"autocorrect",
|
||||
@@ -625,6 +626,45 @@ func TestMergeTagsBackchannelSegments(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeTagsFillerSegments(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "input.json", `{
|
||||
"segments": [
|
||||
{"start": 1, "end": 1.5, "text": " Um uh "},
|
||||
{"start": 6, "end": 7, "text": "not filler"}
|
||||
]
|
||||
}`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
reportPath := filepath.Join(dir, "report.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", input,
|
||||
"--output-file", output,
|
||||
"--report-file", reportPath,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
if len(transcript.Segments) != 2 {
|
||||
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
|
||||
}
|
||||
if !equalStrings(transcript.Segments[0].Categories, []string{"filler"}) {
|
||||
t.Fatalf("segment categories = %v, want [filler]", transcript.Segments[0].Categories)
|
||||
}
|
||||
if len(transcript.Segments[1].Categories) != 0 {
|
||||
t.Fatalf("unexpected categories = %v", transcript.Segments[1].Categories)
|
||||
}
|
||||
|
||||
var rpt report.Report
|
||||
readJSON(t, reportPath, &rpt)
|
||||
if !hasReportEvent(rpt, "postprocessing", "filler", "tagged 1 filler segment(s)") {
|
||||
t.Fatal("expected filler report event")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
@@ -672,6 +712,53 @@ func TestMergeCoalescesAroundDifferentSpeakerBackchannel(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeCoalescesAroundDifferentSpeakerFiller(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
inputA := writeJSONFile(t, dir, "a.json", `{
|
||||
"segments": [
|
||||
{"start": 1, "end": 2, "text": "first"},
|
||||
{"start": 3, "end": 4, "text": "second"}
|
||||
]
|
||||
}`)
|
||||
inputB := writeJSONFile(t, dir, "b.json", `{
|
||||
"segments": [
|
||||
{"start": 2.2, "end": 2.5, "text": "um"}
|
||||
]
|
||||
}`)
|
||||
speakers := writeYAMLFile(t, dir, "speakers.yml", `match:
|
||||
- speaker: Alice
|
||||
match: ["a.json"]
|
||||
- speaker: Bob
|
||||
match: ["b.json"]
|
||||
`)
|
||||
output := filepath.Join(dir, "merged.json")
|
||||
|
||||
err := executeMerge(
|
||||
"--input-file", inputA,
|
||||
"--input-file", inputB,
|
||||
"--speakers", speakers,
|
||||
"--output-file", output,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("merge failed: %v", err)
|
||||
}
|
||||
|
||||
var transcript model.FinalTranscript
|
||||
readJSON(t, output, &transcript)
|
||||
if len(transcript.Segments) != 2 {
|
||||
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
|
||||
}
|
||||
if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[0].Text != "first second" {
|
||||
t.Fatalf("first segment = %#v, want coalesced Alice", transcript.Segments[0])
|
||||
}
|
||||
if len(transcript.Segments[0].Categories) != 0 {
|
||||
t.Fatalf("coalesced segment categories = %v, want none", transcript.Segments[0].Categories)
|
||||
}
|
||||
if transcript.Segments[1].Speaker != "Bob" || !equalStrings(transcript.Segments[1].Categories, []string{"filler"}) {
|
||||
t.Fatalf("second segment = %#v, want Bob filler", transcript.Segments[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{
|
||||
|
||||
@@ -27,24 +27,24 @@ func Apply(in model.MergedTranscript, gap float64) (model.MergedTranscript, Summ
|
||||
coalescedID := 0
|
||||
|
||||
current := newRun(in.Segments[0])
|
||||
pendingBackchannels := make([]model.Segment, 0)
|
||||
pendingSkipped := make([]model.Segment, 0)
|
||||
for _, segment := range in.Segments[1:] {
|
||||
if current.canMerge(segment, gap) {
|
||||
current.add(segment)
|
||||
continue
|
||||
}
|
||||
if segment.Speaker != current.speaker() && hasCategory(segment, "backchannel") {
|
||||
pendingBackchannels = append(pendingBackchannels, segment)
|
||||
if segment.Speaker != current.speaker() && hasAnyCategory(segment, "backchannel", "filler") {
|
||||
pendingSkipped = append(pendingSkipped, segment)
|
||||
continue
|
||||
}
|
||||
|
||||
coalescedID = appendRun(&out, current, coalescedID, &summary)
|
||||
out.Segments = append(out.Segments, pendingBackchannels...)
|
||||
pendingBackchannels = pendingBackchannels[:0]
|
||||
out.Segments = append(out.Segments, pendingSkipped...)
|
||||
pendingSkipped = pendingSkipped[:0]
|
||||
current = newRun(segment)
|
||||
}
|
||||
coalescedID = appendRun(&out, current, coalescedID, &summary)
|
||||
out.Segments = append(out.Segments, pendingBackchannels...)
|
||||
out.Segments = append(out.Segments, pendingSkipped...)
|
||||
|
||||
return out, summary
|
||||
}
|
||||
@@ -129,10 +129,12 @@ func segmentRef(segment model.Segment) string {
|
||||
return segment.Source
|
||||
}
|
||||
|
||||
func hasCategory(segment model.Segment, category string) bool {
|
||||
func hasAnyCategory(segment model.Segment, categories ...string) bool {
|
||||
for _, existing := range segment.Categories {
|
||||
if existing == category {
|
||||
return true
|
||||
for _, category := range categories {
|
||||
if existing == category {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
||||
@@ -151,6 +151,20 @@ func TestApplyDropsBackchannelCategoryFromMergedSameSpeakerRun(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyDropsFillerCategoryFromMergedSameSpeakerRun(t *testing.T) {
|
||||
first := segment("a.json", 0, "Alice", 1, 2, "um")
|
||||
first.Categories = []string{"filler"}
|
||||
second := segment("a.json", 1, "Alice", 2.5, 3, "more")
|
||||
|
||||
got, _ := Apply(model.MergedTranscript{Segments: []model.Segment{first, second}}, 3)
|
||||
if len(got.Segments) != 1 {
|
||||
t.Fatalf("segment count = %d, want 1", len(got.Segments))
|
||||
}
|
||||
if got.Segments[0].Categories != nil {
|
||||
t.Fatalf("categories = %v, want nil", got.Segments[0].Categories)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) {
|
||||
first := segment("a.json", 0, "Alice", 1, 2, "first")
|
||||
backchannel := segment("b.json", 0, "Bob", 2.2, 2.5, "yeah")
|
||||
@@ -172,6 +186,27 @@ func TestApplySkipsDifferentSpeakerBackchannelAsMergeBlocker(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySkipsDifferentSpeakerFillerAsMergeBlocker(t *testing.T) {
|
||||
first := segment("a.json", 0, "Alice", 1, 2, "first")
|
||||
filler := segment("b.json", 0, "Bob", 2.2, 2.5, "um")
|
||||
filler.Categories = []string{"filler"}
|
||||
second := segment("a.json", 1, "Alice", 3, 4, "second")
|
||||
|
||||
got, summary := Apply(model.MergedTranscript{Segments: []model.Segment{first, filler, second}}, 3)
|
||||
if summary.OriginalSegmentsMerged != 2 || summary.CoalescedSegments != 1 {
|
||||
t.Fatalf("summary = %#v", summary)
|
||||
}
|
||||
if len(got.Segments) != 2 {
|
||||
t.Fatalf("segment count = %d, want 2", len(got.Segments))
|
||||
}
|
||||
if got.Segments[0].Text != "first second" {
|
||||
t.Fatalf("first output text = %q, want first second", got.Segments[0].Text)
|
||||
}
|
||||
if got.Segments[1].Text != "um" || !reflect.DeepEqual(got.Segments[1].Categories, []string{"filler"}) {
|
||||
t.Fatalf("second output segment = %#v", got.Segments[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyDifferentSpeakerNonBackchannelStillBlocksMerge(t *testing.T) {
|
||||
first := segment("a.json", 0, "Alice", 1, 2, "first")
|
||||
bob := segment("b.json", 0, "Bob", 2.2, 2.5, "interruption")
|
||||
|
||||
@@ -14,7 +14,7 @@ const (
|
||||
DefaultInputReader = "json-files"
|
||||
DefaultOutputModules = "json"
|
||||
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
|
||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
|
||||
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
|
||||
DefaultOverlapWordRunGap = 0.75
|
||||
DefaultWordRunReorderWindow = 0.4
|
||||
DefaultCoalesceGap = 3.0
|
||||
|
||||
56
internal/filler/filler.go
Normal file
56
internal/filler/filler.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package filler
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
// Category is the category label that Apply appends to segments it identifies
// as fillers.
const Category = "filler"
|
||||
|
||||
// fillerToken is a regular-expression fragment matching one filler word. The
// final letter of each word may be drawn out, so "umm", "uhhh" and "hmmm" all
// match. It is not anchored and is meant to be embedded in a larger pattern.
const fillerToken = `(?:um+|uh+|er+|erm+|ah+|eh+|hmm+|mm+)`

// patterns holds the expressions that recognise text made up solely of filler
// words. A text matches when it is one filler token, or several tokens
// separated by whitespace, with nothing before or after; matching is
// case-insensitive.
//
// A single expression is enough: the one-token case is the zero-repetition
// case of the multi-token form, and "mmm+" is already covered by "mm+".
var patterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^` + fillerToken + `(?:\s+` + fillerToken + `)*$`),
}
|
||||
|
||||
// Apply tags short filler segments.
|
||||
func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
|
||||
tagged := 0
|
||||
for index := range in.Segments {
|
||||
if !matches(in.Segments[index]) || hasCategory(in.Segments[index], Category) {
|
||||
continue
|
||||
}
|
||||
in.Segments[index].Categories = append(in.Segments[index].Categories, Category)
|
||||
tagged++
|
||||
}
|
||||
return in, tagged
|
||||
}
|
||||
|
||||
func matches(segment model.Segment) bool {
|
||||
text := strings.TrimSpace(segment.Text)
|
||||
if text == "" {
|
||||
return false
|
||||
}
|
||||
if len(strings.Fields(text)) > 3 {
|
||||
return false
|
||||
}
|
||||
if segment.End-segment.Start > 1.0 {
|
||||
return false
|
||||
}
|
||||
for _, pattern := range patterns {
|
||||
if pattern.MatchString(text) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func hasCategory(segment model.Segment, category string) bool {
|
||||
for _, existing := range segment.Categories {
|
||||
if existing == category {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
96
internal/filler/filler_test.go
Normal file
96
internal/filler/filler_test.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package filler
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
func TestApplyTagsVerySafeFillers(t *testing.T) {
|
||||
for _, text := range []string{"um", "uhhh", "ER", "ermm", "ah", "eh", "hmmm", "mm", "mmm"} {
|
||||
t.Run(text, func(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment(text, 1, 1.5)))
|
||||
if tagged != 1 {
|
||||
t.Fatalf("tagged = %d, want 1", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], []string{Category})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyTagsRepeatedFillers(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment("um uh hmm", 1, 1.8)))
|
||||
if tagged != 1 {
|
||||
t.Fatalf("tagged = %d, want 1", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], []string{Category})
|
||||
}
|
||||
|
||||
func TestApplyMatchesTrimAwareCaseInsensitive(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment(" UM uh ", 1, 1.5)))
|
||||
if tagged != 1 {
|
||||
t.Fatalf("tagged = %d, want 1", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], []string{Category})
|
||||
}
|
||||
|
||||
func TestApplyDoesNotTagNonMatches(t *testing.T) {
|
||||
for _, text := range []string{"um okay", "uh-huh", "hmm, okay"} {
|
||||
t.Run(text, func(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment(text, 1, 1.5)))
|
||||
if tagged != 0 {
|
||||
t.Fatalf("tagged = %d, want 0", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], nil)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyRejectsWordCountOverThree(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment("um uh er ah", 1, 1.5)))
|
||||
if tagged != 0 {
|
||||
t.Fatalf("tagged = %d, want 0", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], nil)
|
||||
}
|
||||
|
||||
func TestApplyRejectsDurationOverOneSecond(t *testing.T) {
|
||||
got, tagged := Apply(transcript(segment("um", 1, 2.1)))
|
||||
if tagged != 0 {
|
||||
t.Fatalf("tagged = %d, want 0", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], nil)
|
||||
}
|
||||
|
||||
func TestApplyPreservesExistingCategoriesAndAvoidsDuplicate(t *testing.T) {
|
||||
existing := segment("um", 1, 1.2)
|
||||
existing.Categories = []string{"manual", Category}
|
||||
|
||||
got, tagged := Apply(transcript(existing))
|
||||
if tagged != 0 {
|
||||
t.Fatalf("tagged = %d, want 0", tagged)
|
||||
}
|
||||
assertCategories(t, got.Segments[0], []string{"manual", Category})
|
||||
}
|
||||
|
||||
func transcript(segments ...model.Segment) model.MergedTranscript {
|
||||
return model.MergedTranscript{Segments: segments}
|
||||
}
|
||||
|
||||
func segment(text string, start float64, end float64) model.Segment {
|
||||
return model.Segment{
|
||||
Source: "input.json",
|
||||
Speaker: "Alice",
|
||||
Start: start,
|
||||
End: end,
|
||||
Text: text,
|
||||
}
|
||||
}
|
||||
|
||||
func assertCategories(t *testing.T, segment model.Segment, want []string) {
|
||||
t.Helper()
|
||||
if !reflect.DeepEqual(segment.Categories, want) {
|
||||
t.Fatalf("categories = %v, want %v", segment.Categories, want)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user