Implemented a module to detect filler segments, and skip them for purposes of same-speaker segment coalescing
This commit is contained in:
56
internal/filler/filler.go
Normal file
56
internal/filler/filler.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package filler
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
)
|
||||
|
||||
// Category is the category label that Apply appends to the Categories of
// every segment it recognizes as filler.
const Category = "filler"
|
||||
|
||||
// fillerWord is the alternation of recognized filler tokens. Each token may
// stretch its final letter (e.g. "umm", "uhhh", "hmmm"). It is written once
// and composed below so the single-word and multi-word forms cannot drift
// apart.
const fillerWord = `(?:um+|uh+|er+|erm+|ah+|eh+|hmm+|mm+)`

// patterns holds the compiled expressions that recognize filler-only text.
// The single expression accepts one or more filler words separated by
// whitespace, case-insensitively, anchored to the whole string. It is kept as
// a slice because matches ranges over it, so further expressions can be added
// without touching that loop.
var patterns = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^` + fillerWord + `(?:\s+` + fillerWord + `)*$`),
}
|
||||
|
||||
// Apply tags short filler segments.
|
||||
func Apply(in model.MergedTranscript) (model.MergedTranscript, int) {
|
||||
tagged := 0
|
||||
for index := range in.Segments {
|
||||
if !matches(in.Segments[index]) || hasCategory(in.Segments[index], Category) {
|
||||
continue
|
||||
}
|
||||
in.Segments[index].Categories = append(in.Segments[index].Categories, Category)
|
||||
tagged++
|
||||
}
|
||||
return in, tagged
|
||||
}
|
||||
|
||||
func matches(segment model.Segment) bool {
|
||||
text := strings.TrimSpace(segment.Text)
|
||||
if text == "" {
|
||||
return false
|
||||
}
|
||||
if len(strings.Fields(text)) > 3 {
|
||||
return false
|
||||
}
|
||||
if segment.End-segment.Start > 1.0 {
|
||||
return false
|
||||
}
|
||||
for _, pattern := range patterns {
|
||||
if pattern.MatchString(text) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func hasCategory(segment model.Segment, category string) bool {
|
||||
for _, existing := range segment.Categories {
|
||||
if existing == category {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
Reference in New Issue
Block a user