Implemented a module to detect filler segments and skip them when coalescing same-speaker segments

2026-04-27 19:58:55 -05:00
parent bbfb8aba44
commit fb0519c561
9 changed files with 319 additions and 14 deletions
--- a/internal/filler/filler.go
+++ b/internal/filler/filler.go
@@ -0,0 +1,56 @@
+package filler
+
+import (
+	"regexp"
+	"strings"
+
+	"gitea.maximumdirect.net/eric/seriatim/internal/model"
+)
+
+// Category is the category name appended to segments detected as filler.
+const Category = "filler"
+
+// patterns match text made up solely of one or more whitespace-separated filler words (case-insensitive).
+var patterns = []*regexp.Regexp{regexp.MustCompile(
+	`(?i)^(um+|uh+|er+|erm+|ah+|eh+|hmm+|mm+)(\s+(um+|uh+|er+|erm+|ah+|eh+|hmm+|mm+))*$`)}
+
+// Apply appends Category to each segment that looks like filler and is not already tagged.
+// Segments are modified in place; the second result is the number of segments newly tagged.
+func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
+	count := 0
+	for i := range in.Segments {
+		if seg := &in.Segments[i]; matches(*seg) && !hasCategory(*seg, Category) {
+			seg.Categories = append(seg.Categories, Category)
+			count++
+		}
+	}
+	return in, count
+}
+
+// matches reports whether segment is short filler: its trimmed text is non-empty, has at most
+// three whitespace-separated words, has End-Start of at most 1.0, and fits a filler pattern.
+func matches(segment model.Segment) bool {
+	trimmed := strings.TrimSpace(segment.Text)
+	duration := segment.End - segment.Start
+	switch {
+	case trimmed == "":
+		return false
+	case len(strings.Fields(trimmed)) > 3, duration > 1.0:
+		return false
+	}
+	for i := range patterns {
+		if patterns[i].MatchString(trimmed) {
+			return true
+		}
+	}
+	return false
+}
+
+// hasCategory reports whether category is already present in segment.Categories.
+func hasCategory(segment model.Segment, category string) bool {
+	found := false
+	for i := 0; i < len(segment.Categories) && !found; i++ {
+		found = segment.Categories[i] == category
+	}
+	return found
+}
--- a/internal/filler/filler_test.go
+++ b/internal/filler/filler_test.go
@@ -0,0 +1,96 @@
+package filler
+
+import (
+	"reflect"
+	"testing"
+
+	"gitea.maximumdirect.net/eric/seriatim/internal/model"
+)
+
+func TestApplyTagsVerySafeFillers(t *testing.T) {
+	for _, word := range []string{"um", "uhhh", "ER", "ermm", "ah", "eh", "hmmm", "mm", "mmm"} {
+		t.Run(word, func(t *testing.T) { // each lone filler word in a 0.5-long segment must be tagged
+			result, count := Apply(transcript(segment(word, 1, 1.5)))
+			if want := 1; count != want {
+				t.Fatalf("tagged = %d, want 1", count)
+			}
+			assertCategories(t, result.Segments[0], []string{Category})
+		})
+	}
+}
+
+func TestApplyTagsRepeatedFillers(t *testing.T) {
+	result, count := Apply(transcript(segment("um uh hmm", 1, 1.8))) // three fillers, 0.8 long
+	if want := 1; count != want {
+		t.Fatalf("tagged = %d, want 1", count)
+	}
+	assertCategories(t, result.Segments[0], []string{Category})
+}
+
+func TestApplyMatchesTrimAwareCaseInsensitive(t *testing.T) {
+	result, count := Apply(transcript(segment("  UM uh  ", 1, 1.5))) // padded, mixed-case input
+	if want := 1; count != want {
+		t.Fatalf("tagged = %d, want 1", count)
+	}
+	assertCategories(t, result.Segments[0], []string{Category})
+}
+
+func TestApplyDoesNotTagNonMatches(t *testing.T) {
+	for _, phrase := range []string{"um okay", "uh-huh", "hmm, okay"} {
+		t.Run(phrase, func(t *testing.T) { // a non-filler word or punctuation must block tagging
+			result, count := Apply(transcript(segment(phrase, 1, 1.5)))
+			if want := 0; count != want {
+				t.Fatalf("tagged = %d, want 0", count)
+			}
+			assertCategories(t, result.Segments[0], nil)
+		})
+	}
+}
+
+func TestApplyRejectsWordCountOverThree(t *testing.T) {
+	result, count := Apply(transcript(segment("um uh er ah", 1, 1.5))) // four filler words
+	if want := 0; count != want {
+		t.Fatalf("tagged = %d, want 0", count)
+	}
+	assertCategories(t, result.Segments[0], nil)
+}
+
+func TestApplyRejectsDurationOverOneSecond(t *testing.T) {
+	result, count := Apply(transcript(segment("um", 1, 2.1))) // End-Start is 1.1, above the 1.0 cap
+	if want := 0; count != want {
+		t.Fatalf("tagged = %d, want 0", count)
+	}
+	assertCategories(t, result.Segments[0], nil)
+}
+
+// TestApplyPreservesExistingCategoriesAndAvoidsDuplicate checks an already-tagged filler is left unchanged.
+func TestApplyPreservesExistingCategoriesAndAvoidsDuplicate(t *testing.T) {
+	preTagged := segment("um", 1, 1.2)
+	preTagged.Categories = []string{"manual", Category}
+	result, count := Apply(transcript(preTagged))
+	if want := 0; count != want {
+		t.Fatalf("tagged = %d, want 0", count)
+	}
+	assertCategories(t, result.Segments[0], []string{"manual", Category})
+}
+
+func transcript(segments ...model.Segment) model.MergedTranscript {
+	return model.MergedTranscript{Segments: segments} // test fixture: wraps the given segments as-is
+}
+
+// segment builds a fixture segment with the given text and time span; Source and Speaker
+// are fixed to "input.json" and "Alice".
+func segment(text string, start, end float64) model.Segment {
+	fixture := model.Segment{Source: "input.json", Speaker: "Alice"}
+	fixture.Text = text
+	fixture.Start = start
+	fixture.End = end
+	return fixture
+}
+
+func assertCategories(t *testing.T, segment model.Segment, want []string) {
+	t.Helper() // attribute failures to the calling test's line
+	if got := segment.Categories; !reflect.DeepEqual(got, want) { // DeepEqual: nil != empty slice
+		t.Fatalf("categories = %v, want %v", got, want)
+	}
+}