From 99d0c425d6f43a100fd74956eff6648d7f797199 Mon Sep 17 00:00:00 2001 From: Eric Rakestraw Date: Sun, 26 Apr 2026 19:20:00 -0500 Subject: [PATCH] Implemented substring matching for speakers.yml --- README.md | 52 ++++++++++-- internal/cli/merge_test.go | 111 ++++++++++++++++--------- internal/speaker/map.go | 64 +++++++++------ internal/speaker/map_test.go | 155 +++++++++++++++++++++++++++++++++++ 4 files changed, 308 insertions(+), 74 deletions(-) create mode 100644 internal/speaker/map_test.go diff --git a/README.md b/README.md index 118a6f9..e3fbf75 100644 --- a/README.md +++ b/README.md @@ -74,22 +74,56 @@ Other WhisperX fields, including `words` and raw diarization speaker labels, are ## Speaker Map Format -`speakers.yml` maps each input file basename to one canonical speaker name: +`speakers.yml` maps input files to canonical speaker names using ordered substring rules: ```yaml -inputs: - 2026-04-19-Eric_Rakestraw.json: - speaker: "Eric Rakestraw" +match: + - speaker: "Eric Rakestraw" + match: + - "Eric_Rakestraw" + - "Eric" - 2026-04-19-Mike_Brown.json: - speaker: "Mike Brown" + - speaker: "Mike Brown" + match: + - "Mike_Brown" + - "mb" +``` + +For each `--input-file`, `seriatim` takes the file basename and evaluates the rules in order. The first rule with a matching substring wins, and no later rules are evaluated. + +For example, this input: + +```text +samples/raw/2026-04-19-Eric_Rakestraw.json +``` + +matches this rule because the basename contains `Eric_Rakestraw`: + +```yaml +- speaker: "Eric Rakestraw" + match: + - "Eric_Rakestraw" ``` Important details: -- Keys are matched against the basename of each `--input-file`, not the full path. -- Every input file must have exactly one matching entry. -- `speaker` is required and must be non-empty. +- Matching is against the input file basename, not the full path. +- Matching is case-insensitive. +- Rules are evaluated from first to last. +- Each rule must have a non-empty `speaker`. 
+- Each rule must have at least one non-empty `match` string. +- Duplicate speaker names are invalid. +- Every input file must match at least one rule or the command fails. + +Removed old format: + +```yaml +inputs: + eric.json: + speaker: "Eric Rakestraw" +``` + +The old `inputs:` direct mapping format is no longer supported. ## Output JSON Format diff --git a/internal/cli/merge_test.go b/internal/cli/merge_test.go index 1bb9f1f..5d1d1c3 100644 --- a/internal/cli/merge_test.go +++ b/internal/cli/merge_test.go @@ -24,11 +24,11 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) { {"start": 5, "end": 6, "text": "first b"} ] }`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - a.json: - speaker: Alice - b.json: - speaker: Bob + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] `) output := filepath.Join(dir, "merged.json") reportPath := filepath.Join(dir, "report.json") @@ -112,11 +112,11 @@ func TestMergeTieBreakOrder(t *testing.T) { {"start": 1, "end": 2, "text": "b-same-time"} ] }`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - a.json: - speaker: Alice - b.json: - speaker: Bob + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] `) output := filepath.Join(dir, "merged.json") @@ -148,12 +148,43 @@ func TestMergeTieBreakOrder(t *testing.T) { } } +func TestSpeakerMatchingUsesFirstMatchingRuleCaseInsensitive(t *testing.T) { + dir := t.TempDir() + input := writeJSONFile(t, dir, "2026-04-19-Adam_Rakestraw.json", `{ + "segments": [ + {"start": 1, "end": 2, "text": "hello"} + ] + }`) + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: First Match + match: ["adam"] + - speaker: Later Match + match: ["Adam_Rakestraw"] +`) + output := filepath.Join(dir, "merged.json") + + err := executeMerge( + "--input-file", input, + "--speakers", speakers, + 
"--output-file", output, + ) + if err != nil { + t.Fatalf("merge failed: %v", err) + } + + var transcript model.FinalTranscript + readJSON(t, output, &transcript) + if got, want := transcript.Segments[0].Speaker, "First Match"; got != want { + t.Fatalf("speaker = %q, want %q", got, want) + } +} + func TestUnknownModulesFailDuringValidation(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") @@ -224,9 +255,9 @@ func TestInvalidPreprocessingOrderFails(t *testing.T) { func TestMissingInputFileFailsBeforePipelineExecution(t *testing.T) { dir := t.TempDir() - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - missing.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["missing.json"] `) output := filepath.Join(dir, "merged.json") @@ -263,9 +294,9 @@ func TestNormalizeSpeakersRequiresSpeakersFile(t *testing.T) { func TestAutocorrectRequiresAutocorrectFile(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") @@ -287,11 +318,11 @@ func TestOutputJSONIsByteStable(t *testing.T) { dir := t.TempDir() inputA := writeJSONFile(t, dir, "a.json", `{"segments":[{"start":2,"end":3,"text":"a"}]}`) inputB := writeJSONFile(t, dir, "b.json", `{"segments":[{"start":1,"end":2,"text":"b"}]}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - a.json: - speaker: Alice - b.json: - speaker: Bob + speakers := writeYAMLFile(t, dir, "speakers.yml", 
`match: + - speaker: Alice + match: ["a.json"] + - speaker: Bob + match: ["b.json"] `) outputA := filepath.Join(dir, "merged-a.json") outputB := filepath.Join(dir, "merged-b.json") @@ -327,9 +358,9 @@ func TestOutputJSONIsByteStable(t *testing.T) { func TestMissingSpeakerMappingFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - other.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["other.json"] `) output := filepath.Join(dir, "merged.json") @@ -341,7 +372,7 @@ func TestMissingSpeakerMappingFails(t *testing.T) { if err == nil { t.Fatal("expected error") } - if !strings.Contains(err.Error(), `speaker map has no entry for input basename "input.json"`) { + if !strings.Contains(err.Error(), `speaker map has no match for input basename "input.json"`) { t.Fatalf("unexpected error: %v", err) } } @@ -349,9 +380,9 @@ func TestMissingSpeakerMappingFails(t *testing.T) { func TestMalformedJSONFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{"segments":[`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") @@ -371,9 +402,9 @@ func TestMalformedJSONFails(t *testing.T) { func TestMissingTopLevelSegmentsFails(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", `{}`) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") @@ -422,9 +453,9 @@ func TestInvalidSegmentFieldsFailWithSourceAndIndex(t *testing.T) { t.Run(test.name, func(t *testing.T) { dir := t.TempDir() input 
:= writeJSONFile(t, dir, "input.json", test.json) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") @@ -468,9 +499,9 @@ func TestInvalidTimingFails(t *testing.T) { t.Run(test.name, func(t *testing.T) { dir := t.TempDir() input := writeJSONFile(t, dir, "input.json", test.json) - speakers := writeYAMLFile(t, dir, "speakers.yml", `inputs: - input.json: - speaker: Alice + speakers := writeYAMLFile(t, dir, "speakers.yml", `match: + - speaker: Alice + match: ["input.json"] `) output := filepath.Join(dir, "merged.json") diff --git a/internal/speaker/map.go b/internal/speaker/map.go index 0ab3ffa..696346d 100644 --- a/internal/speaker/map.go +++ b/internal/speaker/map.go @@ -9,18 +9,19 @@ import ( "gopkg.in/yaml.v3" ) -// Map resolves input file basenames to canonical speaker names. +// Map resolves input file basenames to canonical speaker names using ordered substring rules. type Map struct { - inputs map[string]Input + rules []Rule } -// Input describes one input entry in speakers.yml. -type Input struct { - Speaker string `yaml:"speaker"` +// Rule describes one ordered speaker matching rule. +type Rule struct { + Speaker string `yaml:"speaker"` + Match []string `yaml:"match"` } type fileSchema struct { - Inputs map[string]Input `yaml:"inputs"` + Match []Rule `yaml:"match"` } // LoadMap parses a speakers.yml file. 
@@ -34,36 +35,49 @@ func LoadMap(path string) (Map, error) { if err := yaml.Unmarshal(data, &parsed); err != nil { return Map{}, fmt.Errorf("parse speaker map %q: %w", path, err) } - if len(parsed.Inputs) == 0 { - return Map{}, fmt.Errorf("speaker map %q must contain at least one inputs entry", path) + if len(parsed.Match) == 0 { + return Map{}, fmt.Errorf("speaker map %q must contain at least one match rule", path) } - inputs := make(map[string]Input, len(parsed.Inputs)) - for key, input := range parsed.Inputs { - basename := filepath.Base(strings.TrimSpace(key)) - if basename == "." || basename == "" { - return Map{}, fmt.Errorf("speaker map %q contains an empty input key", path) + seenSpeakers := make(map[string]struct{}, len(parsed.Match)) + rules := make([]Rule, 0, len(parsed.Match)) + for index, rule := range parsed.Match { + rule.Speaker = strings.TrimSpace(rule.Speaker) + if rule.Speaker == "" { + return Map{}, fmt.Errorf("speaker map rule %d must include speaker", index) } - if _, exists := inputs[basename]; exists { - return Map{}, fmt.Errorf("speaker map %q contains duplicate basename mapping for %q", path, basename) + if _, exists := seenSpeakers[rule.Speaker]; exists { + return Map{}, fmt.Errorf("speaker map contains duplicate speaker %q", rule.Speaker) + } + seenSpeakers[rule.Speaker] = struct{}{} + + if len(rule.Match) == 0 { + return Map{}, fmt.Errorf("speaker map rule %d for speaker %q must include at least one match string", index, rule.Speaker) + } + for matchIndex, match := range rule.Match { + match = strings.TrimSpace(match) + if match == "" { + return Map{}, fmt.Errorf("speaker map rule %d for speaker %q contains empty match string at index %d", index, rule.Speaker, matchIndex) + } + rule.Match[matchIndex] = match } - input.Speaker = strings.TrimSpace(input.Speaker) - if input.Speaker == "" { - return Map{}, fmt.Errorf("speaker map entry %q must include speaker", basename) - } - inputs[basename] = input + rules = append(rules, rule) } - 
return Map{inputs: inputs}, nil + return Map{rules: rules}, nil } // SpeakerForSource returns the canonical speaker for a transcript source path. func (m Map) SpeakerForSource(source string) (string, error) { basename := filepath.Base(source) - input, ok := m.inputs[basename] - if !ok { - return "", fmt.Errorf("speaker map has no entry for input basename %q", basename) + normalized := strings.ToLower(basename) + for _, rule := range m.rules { + for _, match := range rule.Match { + if strings.Contains(normalized, strings.ToLower(match)) { + return rule.Speaker, nil + } + } } - return input.Speaker, nil + return "", fmt.Errorf("speaker map has no match for input basename %q", basename) } diff --git a/internal/speaker/map_test.go b/internal/speaker/map_test.go new file mode 100644 index 0000000..c943444 --- /dev/null +++ b/internal/speaker/map_test.go @@ -0,0 +1,155 @@ +package speaker + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestSpeakerForSourceMatchesBasenameCaseInsensitive(t *testing.T) { + dir := t.TempDir() + path := writeSpeakerMap(t, dir, `match: + - speaker: "Eric Rakestraw" + match: + - "eric_rakestraw" +`) + + speakers, err := LoadMap(path) + if err != nil { + t.Fatalf("load speaker map: %v", err) + } + + got, err := speakers.SpeakerForSource(filepath.Join(dir, "2026-04-19-Eric_Rakestraw.json")) + if err != nil { + t.Fatalf("resolve speaker: %v", err) + } + if got != "Eric Rakestraw" { + t.Fatalf("speaker = %q, want %q", got, "Eric Rakestraw") + } +} + +func TestSpeakerForSourceUsesBasenameOnly(t *testing.T) { + dir := t.TempDir() + path := writeSpeakerMap(t, dir, `match: + - speaker: "Directory Match" + match: + - "speaker-dir" +`) + + speakers, err := LoadMap(path) + if err != nil { + t.Fatalf("load speaker map: %v", err) + } + + _, err = speakers.SpeakerForSource(filepath.Join(dir, "speaker-dir", "input.json")) + if err == nil { + t.Fatal("expected no match") + } + if !strings.Contains(err.Error(), `input.json`) { + 
t.Fatalf("expected basename in error, got %v", err) + } +} + +func TestSpeakerForSourceUsesFirstMatchingRule(t *testing.T) { + dir := t.TempDir() + path := writeSpeakerMap(t, dir, `match: + - speaker: "First" + match: + - "adam" + - speaker: "Second" + match: + - "adam_rakestraw" +`) + + speakers, err := LoadMap(path) + if err != nil { + t.Fatalf("load speaker map: %v", err) + } + + got, err := speakers.SpeakerForSource("2026-04-19-Adam_Rakestraw.json") + if err != nil { + t.Fatalf("resolve speaker: %v", err) + } + if got != "First" { + t.Fatalf("speaker = %q, want %q", got, "First") + } +} + +func TestLoadMapValidation(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + { + name: "missing top-level match", + content: `inputs: {}`, + want: "must contain at least one match rule", + }, + { + name: "empty match list", + content: `match: []`, + want: "must contain at least one match rule", + }, + { + name: "empty speaker", + content: `match: + - speaker: "" + match: ["eric"] +`, + want: "must include speaker", + }, + { + name: "empty rule match list", + content: `match: + - speaker: "Eric" + match: [] +`, + want: "must include at least one match string", + }, + { + name: "empty match string", + content: `match: + - speaker: "Eric" + match: [" "] +`, + want: "contains empty match string", + }, + { + name: "duplicate speaker", + content: `match: + - speaker: "Eric" + match: ["eric"] + - speaker: "Eric" + match: ["rakestraw"] +`, + want: `duplicate speaker "Eric"`, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + dir := t.TempDir() + path := writeSpeakerMap(t, dir, test.content) + + _, err := LoadMap(path) + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), test.want) { + t.Fatalf("expected error to contain %q, got %v", test.want, err) + } + }) + } +} + +func writeSpeakerMap(t *testing.T, dir string, content string) string { + t.Helper() + + path := 
filepath.Join(dir, "speakers.yml") + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("write speaker map: %v", err) + } + return path +}