Minor updates to overlap detection and segment coalescing logic

2026-04-28 14:11:38 -05:00
parent 28c2eea340
commit a3ca6665a9
14 changed files with 662 additions and 95 deletions
--- a/internal/filler/filler.go
+++ b/internal/filler/filler.go
@@ -3,6 +3,7 @@ package filler
 import (
 	"regexp"
 	"strings"
+	"unicode"

 	"gitea.maximumdirect.net/eric/seriatim/internal/model"
 )
@@ -15,10 +16,10 @@ var patterns = []*regexp.Regexp{
 }

 // Apply tags short filler segments.
-func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
+func Apply(in model.MergedTranscript, maxDuration float64) (model.MergedTranscript, int) {
 	tagged := 0
 	for index := range in.Segments {
-		if !matches(in.Segments[index]) || hasCategory(in.Segments[index], Category) {
+		if !matches(in.Segments[index], maxDuration) || hasCategory(in.Segments[index], Category) {
 			continue
 		}
 		in.Segments[index].Categories = append(in.Segments[index].Categories, Category)
@@ -27,15 +28,15 @@ func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
 	return in, tagged
 }

-func matches(segment model.Segment) bool {
-	text := strings.TrimSpace(segment.Text)
+func matches(segment model.Segment, maxDuration float64) bool {
+	text := normalizeForMatching(segment.Text)
 	if text == "" {
 		return false
 	}
 	if len(strings.Fields(text)) > 3 {
 		return false
 	}
-	if segment.End-segment.Start > 1.0 {
+	if segment.End-segment.Start > maxDuration {
 		return false
 	}
 	for _, pattern := range patterns {
@@ -46,6 +47,16 @@ func matches(segment model.Segment) bool {
 	return false
 }

+func normalizeForMatching(text string) string { // returns text with punctuation turned into spaces and whitespace collapsed
+	text = strings.Map(func(r rune) rune {
+		if unicode.IsPunct(r) { // Unicode punctuation (category P) only; every other rune is kept as-is
+			return ' ' // a space, not a deletion, so "hmm--mm" becomes two words rather than one
+		}
+		return r
+	}, text)
+	return strings.Join(strings.Fields(text), " ") // drops leading/trailing whitespace and joins the words with single spaces
+}
+
 func hasCategory(segment model.Segment, category string) bool {
 	for _, existing := range segment.Categories {
 		if existing == category {
--- a/internal/filler/filler_test.go
+++ b/internal/filler/filler_test.go
@@ -10,7 +10,7 @@ import (
 func TestApplyTagsVerySafeFillers(t *testing.T) {
 	for _, text := range []string{"um", "uhhh", "ER", "ermm", "ah", "eh", "hmmm", "mm", "mmm"} {
 		t.Run(text, func(t *testing.T) {
-			got, tagged := Apply(transcript(segment(text, 1, 1.5)))
+			got, tagged := Apply(transcript(segment(text, 1, 1.5)), 1.0)
 			if tagged != 1 {
 				t.Fatalf("tagged = %d, want 1", tagged)
 			}
@@ -20,7 +20,7 @@ func TestApplyTagsVerySafeFillers(t *testing.T) {
 }

 func TestApplyTagsRepeatedFillers(t *testing.T) {
-	got, tagged := Apply(transcript(segment("um uh hmm", 1, 1.8)))
+	got, tagged := Apply(transcript(segment("um uh hmm", 1, 1.8)), 1.0)
 	if tagged != 1 {
 		t.Fatalf("tagged = %d, want 1", tagged)
 	}
@@ -28,17 +28,29 @@ func TestApplyTagsRepeatedFillers(t *testing.T) {
 }

 func TestApplyMatchesTrimAwareCaseInsensitive(t *testing.T) {
-	got, tagged := Apply(transcript(segment("  UM uh  ", 1, 1.5)))
+	got, tagged := Apply(transcript(segment("  UM uh  ", 1, 1.5)), 1.0) // 1.0 is passed as Apply's maxDuration argument
 	if tagged != 1 {
 		t.Fatalf("tagged = %d, want 1", tagged)
 	}
 	assertCategories(t, got.Segments[0], []string{Category})
 }

+func TestApplyIgnoresPunctuationWhenMatching(t *testing.T) { // punctuated filler text is expected to be tagged exactly once
+	for _, text := range []string{"um.", "uh?!", "um, uh... hmm!", "hmm--mm"} { // each input is at most three words once punctuation becomes spaces
+		t.Run(text, func(t *testing.T) {
+			got, tagged := Apply(transcript(segment(text, 1, 1.8)), 1.0) // 1.0 is passed as Apply's maxDuration argument
+			if tagged != 1 {
+				t.Fatalf("tagged = %d, want 1", tagged)
+			}
+			assertCategories(t, got.Segments[0], []string{Category})
+		})
+	}
+}
+
 func TestApplyDoesNotTagNonMatches(t *testing.T) {
 	for _, text := range []string{"um okay", "uh-huh", "hmm, okay"} {
 		t.Run(text, func(t *testing.T) {
-			got, tagged := Apply(transcript(segment(text, 1, 1.5)))
+			got, tagged := Apply(transcript(segment(text, 1, 1.5)), 1.0)
 			if tagged != 0 {
 				t.Fatalf("tagged = %d, want 0", tagged)
 			}
@@ -48,15 +60,29 @@ func TestApplyDoesNotTagNonMatches(t *testing.T) {
 }

 func TestApplyRejectsWordCountOverThree(t *testing.T) {
-	got, tagged := Apply(transcript(segment("um uh er ah", 1, 1.5)))
+	got, tagged := Apply(transcript(segment("um uh er ah", 1, 1.5)), 1.0) // four words, so no tag is expected; 1.0 is Apply's maxDuration argument
 	if tagged != 0 {
 		t.Fatalf("tagged = %d, want 0", tagged)
 	}
 	assertCategories(t, got.Segments[0], nil)
 }

-func TestApplyRejectsDurationOverOneSecond(t *testing.T) {
-	got, tagged := Apply(transcript(segment("um", 1, 2.1)))
+func TestApplyUsesConfiguredMaxDuration(t *testing.T) { // exercises both sides of a 1.25 maxDuration limit
+	got, tagged := Apply(transcript(segment("um", 1, 2.2)), 1.25) // presumably segment(text, start, end): 2.2-1 is within 1.25, so a tag is expected
+	if tagged != 1 {
+		t.Fatalf("tagged = %d, want 1", tagged)
+	}
+	assertCategories(t, got.Segments[0], []string{Category})
+
+	got, tagged = Apply(transcript(segment("um", 1, 2.3)), 1.25) // presumably 2.3-1 exceeds 1.25, so no tag is expected
+	if tagged != 0 {
+		t.Fatalf("tagged = %d, want 0", tagged)
+	}
+	assertCategories(t, got.Segments[0], nil)
+}
+
+func TestApplyRejectsDurationOverConfiguredMax(t *testing.T) {
+	got, tagged := Apply(transcript(segment("um", 1, 2.1)), 1.0)
 	if tagged != 0 {
 		t.Fatalf("tagged = %d, want 0", tagged)
 	}
@@ -67,7 +93,7 @@ func TestApplyPreservesExistingCategoriesAndAvoidsDuplicate(t *testing.T) {
 	existing := segment("um", 1, 1.2)
 	existing.Categories = []string{"manual", Category}

-	got, tagged := Apply(transcript(existing))
+	got, tagged := Apply(transcript(existing), 1.0)
 	if tagged != 0 {
 		t.Fatalf("tagged = %d, want 0", tagged)
 	}