Implemented substring matching for speakers.yml

This commit is contained in:
2026-04-26 19:20:00 -05:00
parent fe00600762
commit 99d0c425d6
4 changed files with 308 additions and 74 deletions

View File

@@ -9,18 +9,19 @@ import (
 	"gopkg.in/yaml.v3"
 )
-// Map resolves input file basenames to canonical speaker names.
+// Map resolves input file basenames to canonical speaker names using ordered substring rules.
 type Map struct {
-	inputs map[string]Input
+	rules []Rule
 }
-// Input describes one input entry in speakers.yml.
-type Input struct {
-	Speaker string `yaml:"speaker"`
+// Rule describes one ordered speaker matching rule.
+type Rule struct {
+	Speaker string `yaml:"speaker"`
+	Match []string `yaml:"match"`
 }
 type fileSchema struct {
-	Inputs map[string]Input `yaml:"inputs"`
+	Match []Rule `yaml:"match"`
 }
 // LoadMap parses a speakers.yml file.
@@ -34,36 +35,49 @@ func LoadMap(path string) (Map, error) {
 	if err := yaml.Unmarshal(data, &parsed); err != nil {
 		return Map{}, fmt.Errorf("parse speaker map %q: %w", path, err)
 	}
-	if len(parsed.Inputs) == 0 {
-		return Map{}, fmt.Errorf("speaker map %q must contain at least one inputs entry", path)
+	if len(parsed.Match) == 0 {
+		return Map{}, fmt.Errorf("speaker map %q must contain at least one match rule", path)
 	}
-	inputs := make(map[string]Input, len(parsed.Inputs))
-	for key, input := range parsed.Inputs {
-		basename := filepath.Base(strings.TrimSpace(key))
-		if basename == "." || basename == "" {
-			return Map{}, fmt.Errorf("speaker map %q contains an empty input key", path)
+	seenSpeakers := make(map[string]struct{}, len(parsed.Match))
+	rules := make([]Rule, 0, len(parsed.Match))
+	for index, rule := range parsed.Match {
+		rule.Speaker = strings.TrimSpace(rule.Speaker)
+		if rule.Speaker == "" {
+			return Map{}, fmt.Errorf("speaker map rule %d must include speaker", index)
 		}
-		if _, exists := inputs[basename]; exists {
-			return Map{}, fmt.Errorf("speaker map %q contains duplicate basename mapping for %q", path, basename)
+		if _, exists := seenSpeakers[rule.Speaker]; exists {
+			return Map{}, fmt.Errorf("speaker map contains duplicate speaker %q", rule.Speaker)
 		}
+		seenSpeakers[rule.Speaker] = struct{}{}
+		if len(rule.Match) == 0 {
+			return Map{}, fmt.Errorf("speaker map rule %d for speaker %q must include at least one match string", index, rule.Speaker)
+		}
+		for matchIndex, match := range rule.Match {
+			match = strings.TrimSpace(match)
+			if match == "" {
+				return Map{}, fmt.Errorf("speaker map rule %d for speaker %q contains empty match string at index %d", index, rule.Speaker, matchIndex)
+			}
+			rule.Match[matchIndex] = match
+		}
-		input.Speaker = strings.TrimSpace(input.Speaker)
-		if input.Speaker == "" {
-			return Map{}, fmt.Errorf("speaker map entry %q must include speaker", basename)
-		}
-		inputs[basename] = input
+		rules = append(rules, rule)
 	}
-	return Map{inputs: inputs}, nil
+	return Map{rules: rules}, nil
 }
 // SpeakerForSource returns the canonical speaker for a transcript source path.
 func (m Map) SpeakerForSource(source string) (string, error) {
 	basename := filepath.Base(source)
-	input, ok := m.inputs[basename]
-	if !ok {
-		return "", fmt.Errorf("speaker map has no entry for input basename %q", basename)
+	normalized := strings.ToLower(basename)
+	for _, rule := range m.rules {
+		for _, match := range rule.Match {
+			if strings.Contains(normalized, strings.ToLower(match)) {
+				return rule.Speaker, nil
+			}
+		}
 	}
-	return input.Speaker, nil
+	return "", fmt.Errorf("speaker map has no match for input basename %q", basename)
 }