Files
seriatim/internal/filler/filler.go

68 lines
1.5 KiB
Go

package filler
import (
"regexp"
"strings"
"unicode"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
)
// Category is the category label that Apply appends to a segment's
// Categories when the segment is recognised as filler.
const Category = "filler"
// fillerWord is the alternation of hesitation sounds treated as filler.
// Each alternative lets its final letter repeat, so "um", "umm" and "ummm"
// are all accepted. It is defined once so the single-word and multi-word
// forms of the pattern cannot drift apart.
const fillerWord = `(?:um+|uh+|er+|erm+|ah+|eh+|hmm+|mm+)`

// patterns holds the case-insensitive expressions tested against a segment's
// normalized text. The one expression accepts one or more filler words
// separated by whitespace and anchored to the whole string, so it covers both
// the lone-word case and the repeated-word case.
var patterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^` + fillerWord + `(?:\s+` + fillerWord + `)*$`),
}
// Apply walks every segment of in and appends Category to the Categories of
// each segment that matches the filler heuristics for maxDuration and does
// not already carry that category.
//
// It returns the transcript together with the number of segments tagged by
// this call. Segments are updated in place through in.Segments.
// NOTE(review): if Segments is a slice, those writes are also visible to the
// caller's copy of the transcript — confirm that this is intended.
func Apply(in model.MergedTranscript, maxDuration float64) (model.MergedTranscript, int) {
	var count int
	for i := range in.Segments {
		seg := &in.Segments[i]
		if matches(*seg, maxDuration) && !hasCategory(*seg, Category) {
			seg.Categories = append(seg.Categories, Category)
			count++
		}
	}
	return in, count
}
// matches reports whether segment looks like filler: its normalized text has
// between one and three words, the segment lasts no longer than maxDuration
// (End minus Start), and the normalized text satisfies at least one of the
// package's filler patterns.
func matches(segment model.Segment, maxDuration float64) bool {
	normalized := normalizeForMatching(segment.Text)

	// The normalized text is its words joined by single spaces, so a word
	// count of zero is the same condition as an empty string.
	wordCount := len(strings.Fields(normalized))
	switch {
	case wordCount == 0, wordCount > 3:
		return false
	case segment.End-segment.Start > maxDuration:
		return false
	}

	for i := range patterns {
		if patterns[i].MatchString(normalized) {
			return true
		}
	}
	return false
}
// normalizeForMatching prepares text for pattern matching: every Unicode
// punctuation rune is replaced by a space, then runs of whitespace are
// collapsed to a single space and leading and trailing whitespace is dropped.
// Letter case is left unchanged.
func normalizeForMatching(text string) string {
	var spaced strings.Builder
	spaced.Grow(len(text))
	for _, r := range text {
		if unicode.IsPunct(r) {
			spaced.WriteByte(' ')
			continue
		}
		spaced.WriteRune(r)
	}
	words := strings.Fields(spaced.String())
	return strings.Join(words, " ")
}
// hasCategory reports whether category is already present, by exact string
// comparison, in segment.Categories.
func hasCategory(segment model.Segment, category string) bool {
	labels := segment.Categories
	for i := 0; i < len(labels); i++ {
		if labels[i] == category {
			return true
		}
	}
	return false
}