Implemented substring matching for speakers.yml
This commit is contained in:
@@ -9,18 +9,19 @@ import (
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Map resolves input file basenames to canonical speaker names using ordered substring rules.
type Map struct {
	// inputs holds entries keyed by input basename.
	// NOTE(review): this looks like the exact-basename lookup that predates the
	// ordered rules; confirm whether anything still populates or reads it
	// before removing it.
	inputs map[string]Input
	// rules are the speaker matching rules, consulted in slice order.
	rules []Rule
}

// Input describes one input entry in speakers.yml.
type Input struct {
	// Speaker is the canonical speaker name for the entry.
	Speaker string `yaml:"speaker"`
}

// Rule describes one ordered speaker matching rule.
type Rule struct {
	// Speaker is the canonical speaker name returned when the rule matches.
	Speaker string `yaml:"speaker"`
	// Match lists the substrings that select this rule.
	Match []string `yaml:"match"`
}

// fileSchema mirrors the top-level keys of speakers.yml for YAML decoding.
type fileSchema struct {
	// Inputs is decoded from the top-level "inputs" mapping.
	Inputs map[string]Input `yaml:"inputs"`
	// Match is decoded from the top-level "match" sequence of rules.
	Match []Rule `yaml:"match"`
}
|
||||
|
||||
// LoadMap parses a speakers.yml file.
|
||||
@@ -34,36 +35,49 @@ func LoadMap(path string) (Map, error) {
|
||||
if err := yaml.Unmarshal(data, &parsed); err != nil {
|
||||
return Map{}, fmt.Errorf("parse speaker map %q: %w", path, err)
|
||||
}
|
||||
if len(parsed.Inputs) == 0 {
|
||||
return Map{}, fmt.Errorf("speaker map %q must contain at least one inputs entry", path)
|
||||
if len(parsed.Match) == 0 {
|
||||
return Map{}, fmt.Errorf("speaker map %q must contain at least one match rule", path)
|
||||
}
|
||||
|
||||
inputs := make(map[string]Input, len(parsed.Inputs))
|
||||
for key, input := range parsed.Inputs {
|
||||
basename := filepath.Base(strings.TrimSpace(key))
|
||||
if basename == "." || basename == "" {
|
||||
return Map{}, fmt.Errorf("speaker map %q contains an empty input key", path)
|
||||
seenSpeakers := make(map[string]struct{}, len(parsed.Match))
|
||||
rules := make([]Rule, 0, len(parsed.Match))
|
||||
for index, rule := range parsed.Match {
|
||||
rule.Speaker = strings.TrimSpace(rule.Speaker)
|
||||
if rule.Speaker == "" {
|
||||
return Map{}, fmt.Errorf("speaker map rule %d must include speaker", index)
|
||||
}
|
||||
if _, exists := inputs[basename]; exists {
|
||||
return Map{}, fmt.Errorf("speaker map %q contains duplicate basename mapping for %q", path, basename)
|
||||
if _, exists := seenSpeakers[rule.Speaker]; exists {
|
||||
return Map{}, fmt.Errorf("speaker map contains duplicate speaker %q", rule.Speaker)
|
||||
}
|
||||
seenSpeakers[rule.Speaker] = struct{}{}
|
||||
|
||||
if len(rule.Match) == 0 {
|
||||
return Map{}, fmt.Errorf("speaker map rule %d for speaker %q must include at least one match string", index, rule.Speaker)
|
||||
}
|
||||
for matchIndex, match := range rule.Match {
|
||||
match = strings.TrimSpace(match)
|
||||
if match == "" {
|
||||
return Map{}, fmt.Errorf("speaker map rule %d for speaker %q contains empty match string at index %d", index, rule.Speaker, matchIndex)
|
||||
}
|
||||
rule.Match[matchIndex] = match
|
||||
}
|
||||
|
||||
input.Speaker = strings.TrimSpace(input.Speaker)
|
||||
if input.Speaker == "" {
|
||||
return Map{}, fmt.Errorf("speaker map entry %q must include speaker", basename)
|
||||
}
|
||||
inputs[basename] = input
|
||||
rules = append(rules, rule)
|
||||
}
|
||||
|
||||
return Map{inputs: inputs}, nil
|
||||
return Map{rules: rules}, nil
|
||||
}
|
||||
|
||||
// SpeakerForSource returns the canonical speaker for a transcript source path.
|
||||
func (m Map) SpeakerForSource(source string) (string, error) {
|
||||
basename := filepath.Base(source)
|
||||
input, ok := m.inputs[basename]
|
||||
if !ok {
|
||||
return "", fmt.Errorf("speaker map has no entry for input basename %q", basename)
|
||||
normalized := strings.ToLower(basename)
|
||||
for _, rule := range m.rules {
|
||||
for _, match := range rule.Match {
|
||||
if strings.Contains(normalized, strings.ToLower(match)) {
|
||||
return rule.Speaker, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return input.Speaker, nil
|
||||
return "", fmt.Errorf("speaker map has no match for input basename %q", basename)
|
||||
}
|
||||
|
||||
155
internal/speaker/map_test.go
Normal file
155
internal/speaker/map_test.go
Normal file
@@ -0,0 +1,155 @@
|
||||
package speaker
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSpeakerForSourceMatchesBasenameCaseInsensitive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := writeSpeakerMap(t, dir, `match:
|
||||
- speaker: "Eric Rakestraw"
|
||||
match:
|
||||
- "eric_rakestraw"
|
||||
`)
|
||||
|
||||
speakers, err := LoadMap(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load speaker map: %v", err)
|
||||
}
|
||||
|
||||
got, err := speakers.SpeakerForSource(filepath.Join(dir, "2026-04-19-Eric_Rakestraw.json"))
|
||||
if err != nil {
|
||||
t.Fatalf("resolve speaker: %v", err)
|
||||
}
|
||||
if got != "Eric Rakestraw" {
|
||||
t.Fatalf("speaker = %q, want %q", got, "Eric Rakestraw")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpeakerForSourceUsesBasenameOnly(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := writeSpeakerMap(t, dir, `match:
|
||||
- speaker: "Directory Match"
|
||||
match:
|
||||
- "speaker-dir"
|
||||
`)
|
||||
|
||||
speakers, err := LoadMap(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load speaker map: %v", err)
|
||||
}
|
||||
|
||||
_, err = speakers.SpeakerForSource(filepath.Join(dir, "speaker-dir", "input.json"))
|
||||
if err == nil {
|
||||
t.Fatal("expected no match")
|
||||
}
|
||||
if !strings.Contains(err.Error(), `input.json`) {
|
||||
t.Fatalf("expected basename in error, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpeakerForSourceUsesFirstMatchingRule(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := writeSpeakerMap(t, dir, `match:
|
||||
- speaker: "First"
|
||||
match:
|
||||
- "adam"
|
||||
- speaker: "Second"
|
||||
match:
|
||||
- "adam_rakestraw"
|
||||
`)
|
||||
|
||||
speakers, err := LoadMap(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load speaker map: %v", err)
|
||||
}
|
||||
|
||||
got, err := speakers.SpeakerForSource("2026-04-19-Adam_Rakestraw.json")
|
||||
if err != nil {
|
||||
t.Fatalf("resolve speaker: %v", err)
|
||||
}
|
||||
if got != "First" {
|
||||
t.Fatalf("speaker = %q, want %q", got, "First")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadMapValidation(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "missing top-level match",
|
||||
content: `inputs: {}`,
|
||||
want: "must contain at least one match rule",
|
||||
},
|
||||
{
|
||||
name: "empty match list",
|
||||
content: `match: []`,
|
||||
want: "must contain at least one match rule",
|
||||
},
|
||||
{
|
||||
name: "empty speaker",
|
||||
content: `match:
|
||||
- speaker: ""
|
||||
match: ["eric"]
|
||||
`,
|
||||
want: "must include speaker",
|
||||
},
|
||||
{
|
||||
name: "empty rule match list",
|
||||
content: `match:
|
||||
- speaker: "Eric"
|
||||
match: []
|
||||
`,
|
||||
want: "must include at least one match string",
|
||||
},
|
||||
{
|
||||
name: "empty match string",
|
||||
content: `match:
|
||||
- speaker: "Eric"
|
||||
match: [" "]
|
||||
`,
|
||||
want: "contains empty match string",
|
||||
},
|
||||
{
|
||||
name: "duplicate speaker",
|
||||
content: `match:
|
||||
- speaker: "Eric"
|
||||
match: ["eric"]
|
||||
- speaker: "Eric"
|
||||
match: ["rakestraw"]
|
||||
`,
|
||||
want: `duplicate speaker "Eric"`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := writeSpeakerMap(t, dir, test.content)
|
||||
|
||||
_, err := LoadMap(path)
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), test.want) {
|
||||
t.Fatalf("expected error to contain %q, got %v", test.want, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// writeSpeakerMap writes content to a file named speakers.yml inside dir and
// returns that file's path. The calling test fails immediately if the file
// cannot be written.
func writeSpeakerMap(t *testing.T, dir string, content string) string {
	t.Helper()

	path := filepath.Join(dir, "speakers.yml")
	if err := os.WriteFile(path, []byte(content), 0o600); err != nil {
		t.Fatalf("write speaker map: %v", err)
	}
	return path
}
|
||||
Reference in New Issue
Block a user