16 Commits

SHA1 Message Date
6dbb7ab17e Review normalize command architecture (ci/woodpecker/tag/release: successful) 2026-05-09 12:38:06 +00:00
3591041fa8 Document normalize command 2026-05-09 12:35:48 +00:00
5b008e272c Add normalize report diagnostics 2026-05-09 12:34:37 +00:00
6c780f6293 Implement normalize output conversion 2026-05-09 12:32:18 +00:00
c132f3fd5d Add normalize input parsing 2026-05-09 12:29:12 +00:00
3679435063 Add normalize command scaffold 2026-05-09 12:26:47 +00:00
e6d3b4a46e Harden trim integration (ci/woodpecker/tag/release: successful) 2026-05-08 15:00:46 +00:00
54f7717de8 Document trim command 2026-05-08 14:57:52 +00:00
c48b02d2ec Add trim report output 2026-05-08 14:56:24 +00:00
ac3dcf2557 Add trim CLI command 2026-05-08 14:53:59 +00:00
1c0e4438ae Recompute overlap groups during trim 2026-05-08 14:47:52 +00:00
52f7729100 Add artifact trim transformation 2026-05-08 14:44:31 +00:00
2c82f8bf5c Add trim selector parsing 2026-05-08 14:41:47 +00:00
d865bda4a9 Updated .gitignore to ignore .codex and related files 2026-05-08 14:36:00 +00:00
f20f06db12 Bugfixes and documentation cleanup for v1.0 release. (ci/woodpecker/tag/release: successful) 2026-05-01 11:30:29 -05:00
c9e98e14b5 Fixed a bug with respect to checksum generation for the Woodpecker release pipeline (ci/woodpecker/tag/release: successful) 2026-05-01 10:51:14 -05:00
34 changed files with 4891 additions and 182 deletions

.gitignore (vendored, 4 changed lines)

@@ -1,3 +1,7 @@
# ---> Codex
.codex
AGENTS.md
# ---> Go
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore

@@ -42,8 +42,9 @@ steps:
 from_secret: GITEA_RELEASE_TOKEN
 files:
 - dist/seriatim-*
-checksum: dist/seriatim-*
+checksum: sha256
 checksum-file: SHA256SUMS
+checksum-flatten: true
 file-exists: skip
 overwrite: false
 prerelease: false

README.md (145 changed lines)

@@ -1,8 +1,8 @@
 # seriatim
-`seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order.
+`seriatim` merges per-speaker WhisperX-style JSON transcripts into a single JSON transcript that preserves speaker identity and chronological order. It also trims existing seriatim output artifacts by segment ID and normalizes external transcript-like JSON into standard seriatim output schemas.
-The current implementation supports the `merge` command. It reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, detects and resolves overlaps when word-level timing is available, assigns consecutive numeric `id` values, and writes a merged JSON artifact.
+The current implementation supports the `merge`, `trim`, and `normalize` commands. `merge` reads one or more input JSON files, optionally maps each input file to a canonical speaker using `speakers.yml`, sorts all segments by timestamp, detects and resolves overlaps when word-level timing is available, assigns consecutive numeric `id` values, and writes a merged JSON artifact. `trim` reads an existing seriatim output artifact and projects it to a retained segment subset. `normalize` reads transcript-like JSON input, validates required segment fields, sorts deterministically, assigns fresh IDs, and emits a selected seriatim output schema.
 ## Usage
@@ -25,10 +25,39 @@ go run ./cmd/seriatim merge \
   --report-file report.json
 ```
Trim an existing seriatim artifact:
```sh
go run ./cmd/seriatim trim \
--input-file merged.json \
--output-file trimmed.json \
--keep "1-10, 15, 20-25"
```
Normalize external transcript-style JSON:
```sh
go run ./cmd/seriatim normalize \
--input-file transcript.json \
--output-file normalized.json
```
Normalize an Audita-style bare segment array to full schema with report output:
```sh
go run ./cmd/seriatim normalize \
--input-file audita-segments.json \
--output-file normalized-full.json \
--output-schema seriatim-full \
--report-file normalize-report.json
```
 ## CLI
 ```text
 seriatim merge [flags]
+seriatim trim [flags]
+seriatim normalize [flags]
 ```
 Global flags:
@@ -49,16 +78,98 @@ Global flags:
 | `--autocorrect` | No | none | Autocorrect rules YAML file. When omitted, the default `autocorrect` module leaves text unchanged. |
 | `--input-reader` | No | `json-files` | Input reader module. |
 | `--output-modules` | No | `json` | Comma-separated output modules. |
-| `--output-schema` | No | `default` | JSON output contract. Allowed values are `default`, `minimal`, and `seriatim`. |
+| `--output-schema` | No | `seriatim-intermediate` | JSON output contract. Allowed values are `seriatim-minimal`, `seriatim-intermediate`, and `seriatim-full`. If omitted, the runtime default is used; consumers that depend on a specific shape should set this explicitly. |
 | `--preprocessing-modules` | No | `validate-raw,normalize-speakers,trim-text` | Comma-separated preprocessing modules, evaluated in order. |
 | `--postprocessing-modules` | No | `detect-overlaps,resolve-overlaps,backchannel,filler,resolve-danglers,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output` | Comma-separated postprocessing modules, evaluated in order. |
 | `--coalesce-gap` | No | `3.0` | Maximum same-speaker gap in seconds for `coalesce`; also used as the `resolve-overlaps` context window. Must be a non-negative float. |
`trim` flags:
| Flag | Required | Default | Description |
| --- | --- | --- | --- |
| `--input-file` | Yes | none | Input seriatim output artifact JSON file. |
| `--output-file` | Yes | none | Trimmed transcript JSON output path. |
| `--keep` | Exactly one of `--keep` or `--remove` is required | none | Segment ID selector to retain. |
| `--remove` | Exactly one of `--keep` or `--remove` is required | none | Segment ID selector to drop. |
| `--output-schema` | No | preserve input artifact schema | Optional output schema override: `seriatim-minimal`, `seriatim-intermediate`, or `seriatim-full`. |
| `--report-file` | No | none | Optional report JSON output path. |
| `--allow-empty` | No | `false` | Allow trimming to zero retained segments. |
`trim` selection rules:
- `--keep` and `--remove` are mutually exclusive.
- Exactly one of `--keep` or `--remove` is required.
- Selection is by segment ID only.
- Invalid selected segment IDs fail the command by default.
`trim` selector syntax:
- Segment IDs are positive 1-based integers.
- Inclusive ranges are supported: `1-10`.
- Comma-separated selectors are supported: `1-10,15,20-25`.
- Whitespace around numbers, commas, and hyphens is allowed: `1 - 10, 15, 20 - 25`.
- Duplicate and overlapping ranges are accepted and normalized as a union.
- Descending ranges (for example `10-1`) are rejected.
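The selector rules above map to a small amount of code. A minimal Go sketch, assuming a hypothetical `parseSelector` helper rather than the actual seriatim implementation:
```go
package main

import (
	"fmt"
	"sort"
	"strconv"
	"strings"
)

// parseSelector normalizes a selector such as "1-10, 15, 20-25" into a
// sorted union of segment IDs, per the rules listed above.
func parseSelector(selector string) ([]int, error) {
	seen := map[int]bool{}
	for _, part := range strings.Split(selector, ",") {
		bounds := strings.SplitN(part, "-", 2)
		start, err := strconv.Atoi(strings.TrimSpace(bounds[0]))
		if err != nil {
			return nil, fmt.Errorf("invalid segment ID %q", bounds[0])
		}
		end := start
		if len(bounds) == 2 {
			if end, err = strconv.Atoi(strings.TrimSpace(bounds[1])); err != nil {
				return nil, fmt.Errorf("invalid segment ID %q", bounds[1])
			}
		}
		if start < 1 {
			return nil, fmt.Errorf("segment IDs are positive and 1-based, got %d", start)
		}
		if end < start {
			return nil, fmt.Errorf("descending range %d-%d is rejected", start, end)
		}
		for id := start; id <= end; id++ {
			seen[id] = true // duplicates and overlapping ranges collapse into a union
		}
	}
	ids := make([]int, 0, len(seen))
	for id := range seen {
		ids = append(ids, id)
	}
	sort.Ints(ids)
	return ids, nil
}

func main() {
	ids, err := parseSelector("1 - 3, 2, 5")
	fmt.Println(ids, err) // [1 2 3 5] <nil>
}
```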
`trim` behavior:
- `trim` consumes existing seriatim JSON output artifacts only.
- `trim` does not accept raw WhisperX transcript JSON as input.
- Retained output segment IDs are renumbered sequentially from `1` to `N`.
- Transcript order is preserved from input transcript order; selector order does not reorder output.
- When output schema is `seriatim-full`, overlap groups are recomputed from retained segments.
- `--output-schema seriatim-full` is supported when trim has full-schema artifact data to emit; trim does not synthesize missing full-schema provenance from minimal/intermediate input artifacts.
- `trim` does not run merge postprocessors such as `resolve-overlaps`, `coalesce`, or `autocorrect`.
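For the full-schema overlap recomputation mentioned above, one plausible interval-grouping sketch in Go; the `fullSegment` type and `recomputeOverlapGroups` helper are illustrative, and the actual detection rules live in the `detect-overlaps` module:
```go
package trimsketch

// fullSegment is an illustrative stand-in for a full-schema segment.
type fullSegment struct {
	ID         int
	Start, End float64
}

// recomputeOverlapGroups groups runs of time-intersecting segments, assuming
// the retained segments are already in transcript (time) order. Groups with
// a single member are not emitted.
func recomputeOverlapGroups(segments []fullSegment) [][]int {
	var groups [][]int
	for i := 0; i < len(segments); {
		group := []int{segments[i].ID}
		end := segments[i].End
		j := i + 1
		for j < len(segments) && segments[j].Start < end {
			if segments[j].End > end {
				end = segments[j].End
			}
			group = append(group, segments[j].ID)
			j++
		}
		if len(group) > 1 {
			groups = append(groups, group)
		}
		i = j
	}
	return groups
}
```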
`trim` report output:
- When `--report-file` is provided, the report includes standard trim/validation/output events.
- The report includes a `trim-audit` event containing trim operation metadata, including selected IDs, retained/removed counts, removed IDs, and old-to-new segment ID mapping.
- Old-to-new ID mapping is emitted as a deterministic ordered array of `{old_id, new_id}` pairs.
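A sketch of the documented `{old_id, new_id}` pairs; the `idMapping` type and the surrounding plumbing are assumptions, and only the JSON field names come from the report description above:
```go
package trimsketch

// idMapping mirrors the documented {old_id, new_id} report pairs.
type idMapping struct {
	OldID int `json:"old_id"`
	NewID int `json:"new_id"`
}

// mappingForRetained walks retained old IDs in transcript order, so the
// emitted slice (and therefore its JSON serialization) is deterministic.
func mappingForRetained(retainedOldIDs []int) []idMapping {
	mapping := make([]idMapping, len(retainedOldIDs))
	for i, oldID := range retainedOldIDs {
		mapping[i] = idMapping{OldID: oldID, NewID: i + 1}
	}
	return mapping
}
```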
`normalize` flags:
| Flag | Required | Default | Description |
| --- | --- | --- | --- |
| `--input-file` | Yes | none | Input transcript JSON file. |
| `--output-file` | Yes | none | Normalized transcript JSON output path. |
| `--output-schema` | No | `seriatim-intermediate` (resolved via `SERIATIM_OUTPUT_SCHEMA` when set) | Output JSON schema: `seriatim-minimal`, `seriatim-intermediate`, or `seriatim-full`. |
| `--output-modules` | No | `json` | Comma-separated output modules. Current normalize support is `json` only. |
| `--report-file` | No | none | Optional report JSON output path. |
`normalize` input shapes:
- Top-level object with a `segments` array.
- Bare top-level array of segment objects (for example, Audita-style output).
`normalize` required segment fields:
- `start`
- `end`
- `speaker`
- `text`
`normalize` behavior:
- Validates `start >= 0`, `end >= start`, and non-empty `speaker`.
- Accepts existing input `id` values as provenance only.
- Reassigns output segment IDs sequentially from `1` to `N`.
- Sorts deterministically by `(start, end, original_input_index, speaker)`.
- Uses original input order only as a tie-breaker.
- Does not run merge postprocessors such as overlap detection, overlap resolution, coalescing, or autocorrect.
- Useful for converting external transcript outputs into standard seriatim artifacts.
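A minimal Go sketch of the validation and ordering rules above; the `inputSegment` type and helper names are illustrative, not the seriatim internals:
```go
package normalizesketch

import (
	"fmt"
	"sort"
	"strings"
)

// inputSegment models only the documented required fields plus the original
// input position used as a tie-breaker.
type inputSegment struct {
	Start, End float64
	Speaker    string
	Text       string
	inputIndex int
	ID         int
}

// validateSegment applies the documented constraints:
// start >= 0, end >= start, and a non-empty speaker.
func validateSegment(i int, s inputSegment) error {
	switch {
	case s.Start < 0:
		return fmt.Errorf("segment %d: start must be >= 0", i)
	case s.End < s.Start:
		return fmt.Errorf("segment %d: end must be >= start", i)
	case strings.TrimSpace(s.Speaker) == "":
		return fmt.Errorf("segment %d: speaker must be non-empty", i)
	}
	return nil
}

// normalizeOrder sorts by (start, end, original input index, speaker) and
// then reassigns IDs 1..N; incoming id values never influence numbering.
func normalizeOrder(segments []inputSegment) {
	sort.Slice(segments, func(a, b int) bool {
		x, y := segments[a], segments[b]
		switch {
		case x.Start != y.Start:
			return x.Start < y.Start
		case x.End != y.End:
			return x.End < y.End
		case x.inputIndex != y.inputIndex:
			return x.inputIndex < y.inputIndex
		default:
			return x.Speaker < y.Speaker
		}
	})
	for i := range segments {
		segments[i].ID = i + 1
	}
}
```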
`normalize` report output:
- When `--report-file` is provided, normalize emits deterministic report events with input shape detection, segment counts, schema/module selections, sorting/ID diagnostics, and output write/validation summaries.
- A machine-readable `normalize-audit` event is included for downstream tooling.
 Environment variables:
 | Environment Variable | Default | Description |
 | --- | --- | --- |
-| `SERIATIM_OVERLAP_WORD_RUN_GAP` | `0.75` | Maximum gap in seconds between adjacent timed words when `resolve-overlaps` builds word-run replacement segments. Must be a positive float. |
+| `SERIATIM_OUTPUT_SCHEMA` | `seriatim-intermediate` | Output schema used when `--output-schema` is not explicitly provided. Allowed values are `seriatim-minimal`, `seriatim-intermediate`, and `seriatim-full`. The CLI flag takes precedence. |
+| `SERIATIM_OVERLAP_WORD_RUN_GAP` | `1.0` | Maximum gap in seconds between adjacent timed words when `resolve-overlaps` builds word-run replacement segments. Must be a positive float. |
 | `SERIATIM_OVERLAP_WORD_RUN_REORDER_WINDOW` | `1.0` | Near-start window in seconds for ordering replacement word runs shortest-first. Must be a positive float. |
 | `SERIATIM_BACKCHANNEL_MAX_DURATION` | `2.0` | Maximum duration in seconds for `backchannel` classification. Must be a positive float. |
 | `SERIATIM_FILLER_MAX_DURATION` | `1.25` | Maximum duration in seconds for `filler` classification. Must be a positive float. |
@@ -159,14 +270,16 @@ The old `inputs:` direct mapping format is no longer supported.
 `--output-modules json` controls the writer. `--output-schema` controls the JSON contract that writer serializes.
-The `default` schema is the default output contract. It stays close to `minimal`, but adds optional `categories` on each segment:
+The named schemas are stable public contracts. If a consumer depends on a specific shape, it should request that schema explicitly at runtime. The runtime default selection may change in a future release.
+The `seriatim-intermediate` schema is the current default selection when neither `--output-schema` nor `SERIATIM_OUTPUT_SCHEMA` is set. It stays close to the minimal schema, but adds optional `categories` on each segment:
 ```json
 {
   "metadata": {
     "application": "seriatim",
     "version": "dev",
-    "output_schema": "default"
+    "output_schema": "seriatim-intermediate"
   },
   "segments": [
     {
@@ -181,7 +294,7 @@ The `default` schema is the default output contract. It stays close to `minimal`
 }
 ```
-The explicit `seriatim` schema uses the full seriatim envelope:
+The `seriatim-full` schema uses the full seriatim envelope:
 ```json
 {
@@ -191,7 +304,7 @@ The explicit `seriatim` schema uses the full seriatim envelope:
"input_reader": "json-files", "input_reader": "json-files",
"input_files": ["eric.json", "mike.json"], "input_files": ["eric.json", "mike.json"],
"preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"], "preprocessing_modules": ["validate-raw", "normalize-speakers", "trim-text"],
"postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "filler", "coalesce", "resolve-danglers", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"], "postprocessing_modules": ["detect-overlaps", "resolve-overlaps", "backchannel", "filler", "resolve-danglers", "coalesce", "detect-overlaps", "autocorrect", "assign-ids", "validate-output"],
"output_modules": ["json"] "output_modules": ["json"]
}, },
"segments": [ "segments": [
@@ -231,14 +344,14 @@ The explicit `seriatim` schema uses the full seriatim envelope:
 }
 ```
-The `minimal` schema emits minimal metadata and compact ordered segments:
+The `seriatim-minimal` schema emits minimal metadata and compact ordered segments:
 ```json
 {
   "metadata": {
     "application": "seriatim",
     "version": "dev",
-    "output_schema": "minimal"
+    "output_schema": "seriatim-minimal"
   },
   "segments": [
     {
@@ -254,7 +367,7 @@ The `minimal` schema emits minimal metadata and compact ordered segments:
 Minimal output intentionally omits categories, overlap groups, source/provenance fields, and pipeline configuration metadata.
-Default output intentionally omits overlap groups and source/provenance fields, but keeps optional `categories` and minimal metadata.
+Intermediate output intentionally omits overlap groups and source/provenance fields, but keeps optional `categories` and minimal metadata.
 Segments are sorted deterministically by:
@@ -270,7 +383,7 @@ The public Go output contract is available from:
import "gitea.maximumdirect.net/eric/seriatim/schema" import "gitea.maximumdirect.net/eric/seriatim/schema"
``` ```
The same package embeds machine-readable JSON Schemas in `schema/output.schema.json`, `schema/default-output.schema.json`, and `schema/minimal-output.schema.json`. The default `validate-output` postprocessor validates the selected output shape and verifies final segment IDs are present, sequential, and start at `1`. The same package embeds machine-readable JSON Schemas in `schema/full-output.schema.json`, `schema/intermediate-output.schema.json`, and `schema/minimal-output.schema.json`. The default `validate-output` postprocessor validates the selected output shape and verifies final segment IDs are present, sequential, and start at `1`.
## Overlap Detection ## Overlap Detection
@@ -295,12 +408,12 @@ For each detected overlap group, `resolve-overlaps` uses preserved WhisperX word
 - The resolution window expands the detected overlap group by `--coalesce-gap` seconds on both sides.
 - Nearby same-speaker context segments are included when they intersect the expanded window and their start or end is within `--coalesce-gap` of the original overlap boundary.
-- Words are included when their interval intersects the expanded resolution window.
+- Once a segment is selected for replacement, all timed words from that segment participate in word-run construction; the window controls segment selection, not per-word clipping.
 - Context segments that are part of another detected overlap group are not pulled into the current group.
 - Untimed words are included in replacement text in original word order when nearby timed words create a replacement run.
 - Untimed words do not affect replacement segment start/end times or word-run gap splitting.
 - Words for the same speaker are merged into one run when the gap between adjacent words is no greater than `SERIATIM_OVERLAP_WORD_RUN_GAP`.
-- The default word-run gap is `0.75` seconds.
+- The default word-run gap is `1.0` seconds.
 - Set `SERIATIM_OVERLAP_WORD_RUN_GAP` to a positive number of seconds to override the default.
 - Near-start replacement word runs are reordered so shorter segments come first when adjacent starts are within `SERIATIM_OVERLAP_WORD_RUN_REORDER_WINDOW`.
 - The default word-run reorder window is `1.0` seconds.
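A sketch of the word-run gap rule above, assuming a hypothetical `word` type and timed words already sorted by start:
```go
package overlapsketch

// word is an illustrative timed word.
type word struct {
	Start, End float64
	Text       string
}

// splitRuns groups one speaker's timed words into runs, starting a new run
// whenever the gap to the previous word exceeds maxGap, i.e.
// SERIATIM_OVERLAP_WORD_RUN_GAP (default 1.0 seconds).
func splitRuns(words []word, maxGap float64) [][]word {
	var runs [][]word
	for _, w := range words {
		if n := len(runs); n > 0 {
			last := runs[n-1][len(runs[n-1])-1]
			if w.Start-last.End <= maxGap {
				runs[n-1] = append(runs[n-1], w)
				continue
			}
		}
		runs = append(runs, []word{w})
	}
	return runs
}
```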
@@ -339,12 +452,12 @@ The default pipeline runs `resolve-danglers` before `coalesce` and before the se
 - Dangling-end fragments have no more than two words and end in punctuation.
 - Dangling-start fragments have no more than two words.
-- Matching uses any shared `derived_from` value.
+- Matching uses same-speaker segments with any shared `derived_from` value.
 - Merged segments use `source_ref` values such as `resolve-danglers:1`, keep the target segment's transcript position, and union `derived_from`.
 ## Coalescing
-The default pipeline runs `coalesce` after `resolve-danglers` and the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`.
+The default pipeline runs `coalesce` after `resolve-danglers` and before the second overlap detection pass. It merges adjacent same-speaker segments in the transcript's current order when `next.start - current.end <= --coalesce-gap`.
 Coalesced segments use `source_ref` values such as `coalesce:1`, include `derived_from`, and omit `source_segment_index`.
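The coalesce condition itself is a single comparison. A hedged Go sketch with illustrative types; joining text with a single space is an assumption of this sketch:
```go
package coalescesketch

// segment is an illustrative merged-transcript segment.
type segment struct {
	Speaker    string
	Start, End float64
	Text       string
}

// coalesce merges adjacent same-speaker segments in current transcript order
// when next.Start - current.End <= gap.
func coalesce(segments []segment, gap float64) []segment {
	var out []segment
	for _, s := range segments {
		if n := len(out); n > 0 && out[n-1].Speaker == s.Speaker && s.Start-out[n-1].End <= gap {
			out[n-1].Text += " " + s.Text // assumed join; the real module may differ
			if s.End > out[n-1].End {
				out[n-1].End = s.End
			}
			continue
		}
		out = append(out, s)
	}
	return out
}
```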

@@ -1,6 +1,10 @@
 # seriatim Architecture
-`seriatim` is a deterministic transcript merge utility for combining multiple per-speaker transcript inputs into a single chronologically ordered diarized transcript.
+`seriatim` is a deterministic transcript utility for:
+- merging multiple per-speaker transcript inputs into a single chronologically ordered diarized transcript, and
+- projecting existing seriatim transcript artifacts through deterministic segment-ID trimming, and
+- canonicalizing external transcript-style JSON inputs into standard seriatim output schemas.
 The initial use case is merging independently transcribed speaker audio tracks from the same recorded session, such as a weekly tabletop RPG session. The architecture should also support meetings, podcasts, interviews, and other multi-speaker events.
@@ -20,6 +24,7 @@ The initial use case is merging independently transcribed speaker audio tracks f
 8. Detect and annotate overlapping speech regions.
 9. Emit one or more output artifacts through output writers.
 10. Produce report data for validation findings, corrections, and transformations.
+11. Support artifact-level transcript projection commands that operate on existing seriatim output.
 ## Non-goals
@@ -56,6 +61,8 @@ configuration check
 Each stage has an explicit data contract. Input and output stages perform I/O. Processing stages should be deterministic transformations over in-memory models and should record report events for validation findings, corrections, and transformations.
+`merge` runs this pipeline. `trim` and `normalize` are intentionally separate from this pipeline and operate at the artifact layer.
 ## Stage Contracts
 ### 1. Configuration Check
@@ -173,6 +180,14 @@ The current output format is JSON, specified with:
 --output-file merged.json
 ```
+The current named JSON schemas are:
+- `seriatim-minimal`
+- `seriatim-intermediate`
+- `seriatim-full`
+The current runtime default selection is `seriatim-intermediate`, but default selection may change over time. Consumers that depend on a specific schema should request it explicitly.
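A sketch of that selection precedence, using only the flag and environment variable names documented here; the helper itself is hypothetical:
```go
package schemasketch

import "os"

// resolveOutputSchema sketches the documented precedence: an explicit
// --output-schema flag wins, then SERIATIM_OUTPUT_SCHEMA, then the built-in
// default. Validation of the value is omitted.
func resolveOutputSchema(flagValue string, flagWasSet bool) string {
	if flagWasSet {
		return flagValue
	}
	if env := os.Getenv("SERIATIM_OUTPUT_SCHEMA"); env != "" {
		return env
	}
	return "seriatim-intermediate"
}
```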
 Future output formats may include:
 - Markdown.
@@ -183,6 +198,41 @@ Future output formats may include:
Output writers should be selected from an explicit registry and should consume the final transcript model read-only. Multiple output writers may run for a single invocation.
### 7. Artifact Projection Stage (`trim` command)
`trim` is an artifact-level command that reads an existing seriatim output artifact and emits a projected artifact containing a segment-ID subset.
Design constraints:
- `trim` runs after `merge`, not as a merge postprocessor.
- `trim` validates the input artifact against supported seriatim output schemas.
- `trim` performs deterministic keep/remove selection by segment ID.
- `trim` renumbers retained IDs to `1..N` in transcript order.
- `trim` validates the final output against the selected output schema before writing.
- `trim` records audit metadata in report output.
`trim` is intentionally separate from merge postprocessing because it consumes already-emitted public artifacts. This separation keeps merge semantics stable and avoids rerunning merge-only transforms on projected artifacts.
`trim` must not rerun merge postprocessors such as `resolve-overlaps`, `coalesce`, or `autocorrect`.
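A minimal sketch of the projection and renumbering constraints above; the `artifactSegment` type and `project` helper are illustrative:
```go
package trimsketch

// artifactSegment is an illustrative artifact segment; real types carry
// schema-specific fields.
type artifactSegment struct {
	ID      int
	Speaker string
	Text    string
}

// project keeps (or drops) segments by ID in transcript order, then renumbers
// retained IDs to 1..N. selected holds the normalized selector IDs; keep is
// true for --keep and false for --remove.
func project(segments []artifactSegment, selected map[int]bool, keep bool) []artifactSegment {
	out := make([]artifactSegment, 0, len(segments))
	for _, s := range segments {
		if selected[s.ID] == keep {
			out = append(out, s)
		}
	}
	for i := range out {
		out[i].ID = i + 1
	}
	return out
}
```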
### 8. Artifact Canonicalization Stage (`normalize` command)
`normalize` is an artifact-level command that reads transcript-like JSON and emits a standard seriatim output artifact in a selected schema.
Design constraints:
- `normalize` runs outside the merge pipeline and does not invoke merge preprocessing or postprocessing modules.
- `normalize` accepts two input shapes: object-with-`segments` and bare segment arrays.
- `normalize` validates required segment fields (`start`, `end`, `speaker`, `text`) and timing/speaker constraints.
- `normalize` sorts segments deterministically by chronological keys and stable input-index tie-breakers.
- `normalize` assigns fresh sequential output IDs (`1..N`) after sorting.
- `normalize` validates final output against the selected schema before writing.
- `normalize` writes optional deterministic report diagnostics when `--report-file` is requested.
`normalize` is intended for canonicalizing external transcript outputs (including Audita-style bare arrays) into seriatim contracts, not for running merge-time language or overlap transformations.
`normalize` must not run merge postprocessors such as overlap detection, overlap resolution, coalescing, or autocorrect.
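A sketch of the two-shape detection using only `encoding/json`; `detectShape` is a hypothetical helper, not the seriatim parser:
```go
package normalizesketch

import (
	"bytes"
	"encoding/json"
	"errors"
)

// detectShape distinguishes the two accepted input shapes and returns the
// raw segments payload for further parsing.
func detectShape(data []byte) (string, json.RawMessage, error) {
	trimmed := bytes.TrimLeft(data, " \t\r\n")
	if len(trimmed) == 0 {
		return "", nil, errors.New("empty input")
	}
	switch trimmed[0] {
	case '[':
		// Bare top-level array of segment objects (for example, Audita-style output).
		return "bare-array", json.RawMessage(trimmed), nil
	case '{':
		// Top-level object that must carry a "segments" array.
		var envelope struct {
			Segments json.RawMessage `json:"segments"`
		}
		if err := json.Unmarshal(trimmed, &envelope); err != nil {
			return "", nil, err
		}
		if envelope.Segments == nil {
			return "", nil, errors.New(`object input is missing a "segments" array`)
		}
		return "object-with-segments", envelope.Segments, nil
	default:
		return "", nil, errors.New("input must be a JSON object or a JSON array")
	}
}
```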
## Module Classification
Modules should be classified by their contract and allowed effects.
@@ -216,7 +266,7 @@ seriatim merge \
 --preprocessing-modules validate-raw,normalize-speakers,trim-text \
 --postprocessing-modules detect-overlaps,resolve-overlaps,backchannel,filler,resolve-danglers,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output \
 --output-modules json \
---output-schema default \
+--output-schema seriatim-intermediate \
 --output-file merged.json \
 --report-file report.json
 ```
@@ -358,7 +408,7 @@ Initial classifications may include:
 - `backchannel`
 - `crosstalk`
-The `resolve-overlaps` module uses preserved word-level timing to replace detected overlap-group segments with smaller word-run segments when usable timing is available. Resolution expands each overlap window by the configured coalesce gap so nearby same-speaker context can be absorbed into the replacement runs. Groups without usable word timing remain unresolved for later passes or human review.
+The `resolve-overlaps` module uses preserved word-level timing to replace detected overlap-group segments with smaller word-run segments when usable timing is available. Resolution expands each overlap window by the configured coalesce gap so nearby same-speaker context can be absorbed into the replacement runs. Once a segment is selected for replacement, all timed words from that segment participate in word-run construction so text is not clipped at the window boundary. Groups without usable word timing remain unresolved for later passes or human review.
 Overlap resolution should be non-destructive. Original segment text, timing, and source metadata must remain recoverable.
@@ -389,6 +439,8 @@ A valid merged transcript should satisfy:
 - Every referenced segment exists.
 - Output validates against the selected output schema.
+For full-schema trim output, overlap groups are recomputed from retained segments so overlap annotations and group references remain internally consistent after projection.
 ## Determinism Requirements
 Given the same inputs, config, and application version, `seriatim` should produce byte-stable JSON output where practical.
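One way to keep JSON emission byte-stable in Go, offered as a sketch rather than the seriatim writer: `encoding/json` serializes struct fields in declaration order and sorts map keys, so a fixed encoder configuration yields identical bytes for identical artifacts.
```go
package outputsketch

import (
	"bytes"
	"encoding/json"
	"os"
)

// writeArtifact encodes with fixed settings so repeated runs over the same
// artifact produce identical bytes.
func writeArtifact(path string, artifact any) error {
	var buf bytes.Buffer
	enc := json.NewEncoder(&buf)
	enc.SetEscapeHTML(false)
	enc.SetIndent("", "  ")
	if err := enc.Encode(artifact); err != nil {
		return err
	}
	return os.WriteFile(path, buf.Bytes(), 0o644)
}
```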
@@ -403,6 +455,19 @@ To support this:
- Record application version in output metadata.
- Record enabled module names and module order in output metadata or report data.
Trim-specific determinism requirements:
- Selector normalization and retained IDs are deterministic.
- Old-to-new ID mapping in trim reports is emitted in deterministic order.
- Full-schema overlap recomputation is deterministic for the same input artifact and selector.
Normalize-specific determinism requirements:
- Input-shape detection is deterministic.
- Segment ordering is deterministic for identical input data.
- Output IDs are always reassigned sequentially after deterministic sorting.
- Normalize diagnostic reports are deterministic for identical inputs and configuration.
## Go Package Layout
```text ```text
@@ -411,6 +476,8 @@ internal/config/ CLI/env/config loading and validation
 internal/pipeline/   Pipeline orchestration and module registry
 internal/builtin/    Built-in pipeline modules
 internal/artifact/   Conversion from internal model to public output schema
+internal/normalize/  Normalize input parsing, validation, deterministic sorting, schema conversion, and diagnostics
+internal/trim/       Artifact parsing, trim selection, schema conversion, overlap recomputation for full schema
 internal/buildinfo/  Build-time version metadata
 internal/speaker/    Speaker map parsing and lookup
 internal/model/      Canonical and merged transcript models
@@ -422,6 +489,18 @@ schema/ Public output contract and JSON Schema validation
Package boundaries should follow data ownership. Shared models belong in `internal/model`; stage-specific behavior belongs in the relevant stage package.
For trim:
- `internal/trim` contains pure transformation logic over artifact structs.
- CLI command code handles only flag parsing, file I/O, and report emission.
- Transform logic is deterministic and pure except for command-layer I/O.
For normalize:
- `internal/normalize` contains parsing/validation and deterministic schema conversion logic.
- CLI command code handles flag parsing and delegates execution.
- Normalize remains artifact-level and does not compose merge pipeline modules.
## Default Modules
The default pipeline is equivalent to explicit module lists.

@@ -57,12 +57,12 @@ func FromMerged(cfg config.Config, merged model.MergedTranscript) schema.Transcr
 	}
 }
-// DefaultFromMerged converts the internal merged transcript model into the
-// compact default public serialized output contract.
-func DefaultFromMerged(cfg config.Config, merged model.MergedTranscript) schema.DefaultTranscript {
-	segments := make([]schema.DefaultSegment, len(merged.Segments))
+// IntermediateFromMerged converts the internal merged transcript model into the
+// compact intermediate public serialized output contract.
+func IntermediateFromMerged(cfg config.Config, merged model.MergedTranscript) schema.IntermediateTranscript {
+	segments := make([]schema.IntermediateSegment, len(merged.Segments))
 	for index, segment := range merged.Segments {
-		segments[index] = schema.DefaultSegment{
+		segments[index] = schema.IntermediateSegment{
 			ID:    segment.ID,
 			Start: segment.Start,
 			End:   segment.End,
@@ -72,11 +72,11 @@ func DefaultFromMerged(cfg config.Config, merged model.MergedTranscript) schema.
 		}
 	}
-	return schema.DefaultTranscript{
-		Metadata: schema.DefaultMetadata{
+	return schema.IntermediateTranscript{
+		Metadata: schema.IntermediateMetadata{
 			Application: ApplicationName,
 			Version:     buildinfo.Version,
-			OutputSchema: config.OutputSchemaDefault,
+			OutputSchema: config.OutputSchemaIntermediate,
 		},
 		Segments: segments,
 	}
@@ -110,14 +110,14 @@ func MinimalFromMerged(cfg config.Config, merged model.MergedTranscript) schema.
 // runtime-selected public output contract.
 func SelectedFromMerged(cfg config.Config, merged model.MergedTranscript) any {
 	switch cfg.OutputSchema {
-	case config.OutputSchemaDefault:
-		return DefaultFromMerged(cfg, merged)
 	case config.OutputSchemaMinimal:
 		return MinimalFromMerged(cfg, merged)
-	case config.OutputSchemaSeriatim:
+	case config.OutputSchemaFull:
 		return FromMerged(cfg, merged)
+	case config.OutputSchemaIntermediate:
+		return IntermediateFromMerged(cfg, merged)
 	default:
-		return FromMerged(cfg, merged)
+		return IntermediateFromMerged(cfg, merged)
 	}
 }

@@ -23,28 +23,28 @@ func TestFromMergedUsesBuildVersion(t *testing.T) {
 	}
 }
-func TestSelectedFromMergedDefaultsToSeriatimTranscript(t *testing.T) {
+func TestSelectedFromMergedDefaultsToIntermediateTranscript(t *testing.T) {
 	got := SelectedFromMerged(config.Config{}, model.MergedTranscript{})
+	if _, ok := got.(schema.IntermediateTranscript); !ok {
+		t.Fatalf("selected artifact type = %T, want schema.IntermediateTranscript", got)
+	}
+}
+func TestSelectedFromMergedUsesIntermediateWhenConfigured(t *testing.T) {
+	got := SelectedFromMerged(config.Config{OutputSchema: config.OutputSchemaIntermediate}, model.MergedTranscript{})
+	if _, ok := got.(schema.IntermediateTranscript); !ok {
+		t.Fatalf("selected artifact type = %T, want schema.IntermediateTranscript", got)
+	}
+}
+func TestSelectedFromMergedUsesFullWhenConfigured(t *testing.T) {
+	got := SelectedFromMerged(config.Config{OutputSchema: config.OutputSchemaFull}, model.MergedTranscript{})
 	if _, ok := got.(schema.Transcript); !ok {
 		t.Fatalf("selected artifact type = %T, want schema.Transcript", got)
 	}
 }
-func TestSelectedFromMergedUsesDefaultWhenConfigured(t *testing.T) {
-	got := SelectedFromMerged(config.Config{OutputSchema: config.OutputSchemaDefault}, model.MergedTranscript{})
-	if _, ok := got.(schema.DefaultTranscript); !ok {
-		t.Fatalf("selected artifact type = %T, want schema.DefaultTranscript", got)
-	}
-}
-func TestSelectedFromMergedUsesSeriatimWhenConfigured(t *testing.T) {
-	got := SelectedFromMerged(config.Config{OutputSchema: config.OutputSchemaSeriatim}, model.MergedTranscript{})
-	if _, ok := got.(schema.Transcript); !ok {
-		t.Fatalf("selected artifact type = %T, want schema.Transcript", got)
-	}
-}
-func TestDefaultFromMergedEmitsOnlyDefaultShape(t *testing.T) {
+func TestIntermediateFromMergedEmitsOnlyIntermediateShape(t *testing.T) {
 	merged := model.MergedTranscript{
 		Segments: []model.Segment{
 			{
@@ -65,19 +65,19 @@ func TestDefaultFromMergedEmitsOnlyDefaultShape(t *testing.T) {
 		},
 	}
-	got := DefaultFromMerged(config.Config{OutputSchema: config.OutputSchemaDefault}, merged)
-	want := schema.DefaultTranscript{
-		Metadata: schema.DefaultMetadata{
+	got := IntermediateFromMerged(config.Config{OutputSchema: config.OutputSchemaIntermediate}, merged)
+	want := schema.IntermediateTranscript{
+		Metadata: schema.IntermediateMetadata{
 			Application: ApplicationName,
 			Version:     buildinfo.Version,
-			OutputSchema: config.OutputSchemaDefault,
+			OutputSchema: config.OutputSchemaIntermediate,
 		},
-		Segments: []schema.DefaultSegment{
+		Segments: []schema.IntermediateSegment{
 			{ID: 1, Start: 1, End: 2, Speaker: "Alice", Text: "hello", Categories: []string{"backchannel"}},
 		},
 	}
 	if !reflect.DeepEqual(got, want) {
-		t.Fatalf("default transcript = %#v, want %#v", got, want)
+		t.Fatalf("intermediate transcript = %#v, want %#v", got, want)
 	}
 }

@@ -51,8 +51,8 @@ func (validateOutput) Process(ctx context.Context, in model.MergedTranscript, cf
 	selected := artifact.SelectedFromMerged(cfg, in)
 	var err error
 	switch transcript := selected.(type) {
-	case schema.DefaultTranscript:
-		err = schema.ValidateDefaultTranscript(transcript)
+	case schema.IntermediateTranscript:
+		err = schema.ValidateIntermediateTranscript(transcript)
 	case schema.MinimalTranscript:
 		err = schema.ValidateMinimalTranscript(transcript)
 	case schema.Transcript:

@@ -81,7 +81,7 @@ func TestValidateOutputUsesMinimalSchemaWhenConfigured(t *testing.T) {
 	}
 }
-func TestValidateOutputUsesSeriatimSchemaWhenConfigured(t *testing.T) {
+func TestValidateOutputUsesFullSchemaWhenConfigured(t *testing.T) {
 	merged := model.MergedTranscript{
 		Segments: []model.Segment{
 			{
@@ -100,7 +100,7 @@ func TestValidateOutputUsesSeriatimSchemaWhenConfigured(t *testing.T) {
 	}
 	cfg := testConfig()
-	cfg.OutputSchema = config.OutputSchemaSeriatim
+	cfg.OutputSchema = config.OutputSchemaFull
 	got, events, err := validateOutput{}.Process(context.Background(), merged, cfg)
 	if err != nil {
 		t.Fatalf("validate output: %v", err)

@@ -15,7 +15,12 @@ func newMergeCommand() *cobra.Command {
Use: "merge", Use: "merge",
Short: "Run the transcript merge pipeline", Short: "Run the transcript merge pipeline",
RunE: func(cmd *cobra.Command, args []string) error { RunE: func(cmd *cobra.Command, args []string) error {
cfg, err := config.NewMergeConfig(opts) mergeOpts := opts
if !cmd.Flags().Changed("output-schema") {
mergeOpts.OutputSchema = ""
}
cfg, err := config.NewMergeConfig(mergeOpts)
if err != nil { if err != nil {
return err return err
} }
@@ -32,7 +37,7 @@ func newMergeCommand() *cobra.Command {
 	flags.StringVar(&opts.AutocorrectFile, "autocorrect", "", "autocorrect rules file")
 	flags.StringVar(&opts.InputReader, "input-reader", config.DefaultInputReader, "input reader module")
 	flags.StringVar(&opts.OutputModules, "output-modules", config.DefaultOutputModules, "comma-separated output modules")
-	flags.StringVar(&opts.OutputSchema, "output-schema", config.DefaultOutputSchema, "output JSON schema: default, minimal, or seriatim")
+	flags.StringVar(&opts.OutputSchema, "output-schema", config.DefaultOutputSchema, "output JSON schema: seriatim-minimal, seriatim-intermediate (default), or seriatim-full")
 	flags.StringVar(&opts.PreprocessingModules, "preprocessing-modules", config.DefaultPreprocessingModules, "comma-separated preprocessing modules")
 	flags.StringVar(&opts.PostprocessingModules, "postprocessing-modules", config.DefaultPostprocessingModules, "comma-separated postprocessing modules")
 	flags.StringVar(&opts.CoalesceGap, "coalesce-gap", config.DefaultCoalesceGapValue, "maximum same-speaker gap in seconds for coalesce")

@@ -114,7 +114,8 @@ func TestMergeWritesMergedOutputAndReport(t *testing.T) {
 	}
 }
-func TestMergeWritesDefaultOutputSchema(t *testing.T) {
+func TestMergeWritesIntermediateOutputSchema(t *testing.T) {
+	t.Setenv(config.OutputSchemaEnv, "")
 	dir := t.TempDir()
 	input := writeJSONFile(t, dir, "input.json", `{
 		"segments": [
@@ -131,10 +132,10 @@ func TestMergeWritesDefaultOutputSchema(t *testing.T) {
t.Fatalf("merge failed: %v", err) t.Fatalf("merge failed: %v", err)
} }
var transcript schema.DefaultTranscript var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript) readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaDefault { if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate {
t.Fatalf("output_schema = %q, want default", transcript.Metadata.OutputSchema) t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate)
} }
if len(transcript.Segments) != 1 { if len(transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(transcript.Segments)) t.Fatalf("segment count = %d, want 1", len(transcript.Segments))
@@ -149,12 +150,12 @@ func TestMergeWritesDefaultOutputSchema(t *testing.T) {
 	}
 	for _, forbidden := range []string{"overlap_groups", "source", "derived_from", "words"} {
 		if strings.Contains(string(outputBytes), forbidden) {
-			t.Fatalf("default output contains %q:\n%s", forbidden, outputBytes)
+			t.Fatalf("intermediate output contains %q:\n%s", forbidden, outputBytes)
 		}
 	}
 }
-func TestMergeWritesSeriatimOutputSchema(t *testing.T) {
+func TestMergeWritesFullOutputSchema(t *testing.T) {
 	dir := t.TempDir()
 	input := writeJSONFile(t, dir, "input.json", `{
 		"segments": [
@@ -166,7 +167,7 @@ func TestMergeWritesSeriatimOutputSchema(t *testing.T) {
 	err := executeMergeRaw(
 		"--input-file", input,
 		"--output-file", output,
-		"--output-schema", "seriatim",
+		"--output-schema", config.OutputSchemaFull,
 	)
 	if err != nil {
 		t.Fatalf("merge failed: %v", err)
@@ -203,7 +204,7 @@ func TestMergeWritesMinimalOutputSchema(t *testing.T) {
 	err := executeMerge(
 		"--input-file", input,
 		"--output-file", output,
-		"--output-schema", "minimal",
+		"--output-schema", config.OutputSchemaMinimal,
 		"--report-file", reportPath,
 	)
 	if err != nil {
@@ -215,8 +216,8 @@ func TestMergeWritesMinimalOutputSchema(t *testing.T) {
 	if transcript.Metadata.Application != "seriatim" {
 		t.Fatalf("application = %q, want seriatim", transcript.Metadata.Application)
 	}
-	if transcript.Metadata.OutputSchema != "minimal" {
-		t.Fatalf("output_schema = %q, want minimal", transcript.Metadata.OutputSchema)
+	if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
+		t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
 	}
 	if got, want := len(transcript.Segments), 2; got != want {
 		t.Fatalf("segment count = %d, want %d", got, want)
@@ -330,7 +331,7 @@ func TestMergeValidateMinimalOutputBeforeAssignIDsFails(t *testing.T) {
 	err := executeMerge(
 		"--input-file", input,
 		"--output-file", output,
-		"--output-schema", "minimal",
+		"--output-schema", config.OutputSchemaMinimal,
 		"--postprocessing-modules", "validate-output,assign-ids",
 	)
 	if err == nil {
@@ -472,16 +473,16 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
 	var transcript model.FinalTranscript
 	readJSON(t, output, &transcript)
-	if len(transcript.OverlapGroups) != 0 {
-		t.Fatalf("overlap groups = %#v, want none", transcript.OverlapGroups)
+	if got, want := len(transcript.OverlapGroups), 1; got != want {
+		t.Fatalf("overlap group count = %d, want %d (%#v)", got, want, transcript.OverlapGroups)
 	}
-	if got, want := len(transcript.Segments), 3; got != want {
+	if got, want := len(transcript.Segments), 2; got != want {
 		t.Fatalf("segment count = %d, want %d", got, want)
 	}
-	wantTexts := []string{"outside hello there", "bob reply", "later"}
-	wantSpeakers := []string{"Alice", "Bob", "Alice"}
-	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1", "word-run:1:1:2"}
+	wantTexts := []string{"outside hello there later", "bob reply"}
+	wantSpeakers := []string{"Alice", "Bob"}
+	wantRefs := []string{"word-run:1:1:1", "word-run:1:2:1"}
 	for index, segment := range transcript.Segments {
 		if segment.ID != index+1 {
 			t.Fatalf("segment %d id = %d, want %d", index, segment.ID, index+1)
@@ -498,8 +499,8 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
 		if segment.SourceSegmentIndex != nil {
 			t.Fatalf("segment %d source_segment_index = %d, want nil", index, *segment.SourceSegmentIndex)
 		}
-		if segment.OverlapGroupID != 0 {
-			t.Fatalf("segment %d overlap_group_id = %d, want 0", index, segment.OverlapGroupID)
+		if segment.OverlapGroupID != 1 {
+			t.Fatalf("segment %d overlap_group_id = %d, want 1", index, segment.OverlapGroupID)
 		}
 	}
 	if !equalStrings(transcript.Segments[0].DerivedFrom, []string{inputA + "#0"}) {
@@ -516,7 +517,7 @@ func TestMergeResolvesOverlapGroupsWithWordRuns(t *testing.T) {
 	var rpt report.Report
 	readJSON(t, reportPath, &rpt)
-	if !hasReportEvent(rpt, "postprocessing", "resolve-overlaps", "processed 1 overlap group(s); changed 1; removed 2 original segment(s); created 3 replacement segment(s)") {
+	if !hasReportEvent(rpt, "postprocessing", "resolve-overlaps", "processed 1 overlap group(s); changed 1; removed 2 original segment(s); created 2 replacement segment(s)") {
 		t.Fatal("expected resolve-overlaps summary report event")
 	}
 }
@@ -1927,7 +1928,7 @@ func TestMergeResolveOverlapsPreservesAbsorbedContextPrefix(t *testing.T) {
"--input-file", inputA, "--input-file", inputA,
"--input-file", inputB, "--input-file", inputB,
"--speakers", speakers, "--speakers", speakers,
"--output-schema", "minimal", "--output-schema", config.OutputSchemaMinimal,
"--output-file", output, "--output-file", output,
) )
if err != nil { if err != nil {
@@ -1994,7 +1995,7 @@ func executeMerge(args ...string) error {
 	if !hasOutputSchemaFlag(args) {
 		// Most integration tests were written against the full envelope; keep
 		// that behavior unless the caller explicitly asks for another schema.
-		args = append(args, "--output-schema", config.OutputSchemaSeriatim)
+		args = append(args, "--output-schema", config.OutputSchemaFull)
 	}
 	return executeMergeRaw(args...)
 }

internal/cli/normalize.go (new file, 39 lines)

@@ -0,0 +1,39 @@
package cli

import (
	"github.com/spf13/cobra"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/normalize"
)

func newNormalizeCommand() *cobra.Command {
	var opts config.NormalizeOptions
	cmd := &cobra.Command{
		Use:   "normalize",
		Short: "Normalize a transcript artifact into a standard seriatim output shape",
		RunE: func(cmd *cobra.Command, args []string) error {
			normalizeOpts := opts
			if !cmd.Flags().Changed("output-schema") {
				normalizeOpts.OutputSchema = ""
			}
			cfg, err := config.NewNormalizeConfig(normalizeOpts)
			if err != nil {
				return err
			}
			return normalize.Run(cmd.Context(), cfg)
		},
	}
	flags := cmd.Flags()
	flags.StringVar(&opts.InputFile, "input-file", "", "input transcript JSON file")
	flags.StringVar(&opts.OutputFile, "output-file", "", "output transcript JSON file")
	flags.StringVar(&opts.ReportFile, "report-file", "", "optional report JSON file")
	flags.StringVar(&opts.OutputSchema, "output-schema", config.DefaultOutputSchema, "output JSON schema: seriatim-minimal, seriatim-intermediate, or seriatim-full")
	flags.StringVar(&opts.OutputModules, "output-modules", config.DefaultOutputModules, "comma-separated output modules")
	return cmd
}

@@ -0,0 +1,457 @@
package cli
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
func TestNormalizeCommandIsRecognized(t *testing.T) {
cmd := NewRootCommand()
cmd.SetArgs([]string{"normalize", "--help"})
if err := cmd.Execute(); err != nil {
t.Fatalf("normalize command should be recognized: %v", err)
}
}
func TestNormalizeMissingInputFileFails(t *testing.T) {
dir := t.TempDir()
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--output-file", output,
)
if err == nil {
t.Fatal("expected missing input-file error")
}
if !strings.Contains(err.Error(), "--input-file is required") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNormalizeMissingOutputFileFails(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
err := executeNormalize(
"--input-file", input,
)
if err == nil {
t.Fatal("expected missing output-file error")
}
if !strings.Contains(err.Error(), "--output-file is required") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNormalizeInvalidOutputSchemaFails(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-schema", "compact",
)
if err == nil {
t.Fatal("expected invalid output schema error")
}
if !strings.Contains(err.Error(), "--output-schema must be one of") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNormalizeInvalidOutputModuleFails(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-modules", "yaml",
)
if err == nil {
t.Fatal("expected invalid output module error")
}
if !strings.Contains(err.Error(), "unknown output module") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNormalizeDefaultOutputSchemaIsIntermediate(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{
"segments": [
{"id": 99, "start": 5, "end": 6, "speaker": "Bob", "text": "second", "categories": ["filler"]},
{"id": 10, "start": 1, "end": 2, "speaker": "Alice", "text": "first", "categories": ["backchannel"]}
]
}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate {
t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate)
}
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].ID != 1 || transcript.Segments[1].ID != 2 {
t.Fatalf("segment IDs = %d,%d, want 1,2", transcript.Segments[0].ID, transcript.Segments[1].ID)
}
if transcript.Segments[0].Text != "first" || transcript.Segments[1].Text != "second" {
t.Fatalf("unexpected sort order: %#v", transcript.Segments)
}
if len(transcript.Segments[0].Categories) != 1 || transcript.Segments[0].Categories[0] != "backchannel" {
t.Fatalf("expected categories preserved on first segment, got %#v", transcript.Segments[0].Categories)
}
}
func TestNormalizeBareArrayInputToIntermediateOutput(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `[
{"start": 2, "end": 3, "speaker": "Bob", "text": "second"},
{"start": 1, "end": 2, "speaker": "Alice", "text": "first"}
]`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-schema", config.OutputSchemaIntermediate,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].Speaker != "Alice" || transcript.Segments[1].Speaker != "Bob" {
t.Fatalf("unexpected sorted speakers: %#v", transcript.Segments)
}
}
func TestNormalizeInputIndexTieBreakerIsDeterministic(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `[
{"start": 1, "end": 2, "speaker": "Zulu", "text": "first in"},
{"start": 1, "end": 2, "speaker": "Alpha", "text": "second in"}
]`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if transcript.Segments[0].Speaker != "Zulu" || transcript.Segments[1].Speaker != "Alpha" {
t.Fatalf("tie-break order mismatch: %#v", transcript.Segments)
}
}
func TestNormalizeMinimalSchemaOmitsCategories(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{
"segments": [
{"start": 1, "end": 2, "speaker": "Alice", "text": "first", "categories": ["filler"]}
]
}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-schema", config.OutputSchemaMinimal,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.MinimalTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
}
if len(transcript.Segments) != 1 || transcript.Segments[0].ID != 1 {
t.Fatalf("unexpected minimal output: %#v", transcript.Segments)
}
bytes, readErr := os.ReadFile(output)
if readErr != nil {
t.Fatalf("read output: %v", readErr)
}
if strings.Contains(string(bytes), "categories") {
t.Fatalf("minimal output unexpectedly contains categories:\n%s", string(bytes))
}
}
func TestNormalizeFullSchemaOutputValidatesAndHasProvenanceFallback(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `[
{"start": 1, "end": 2, "speaker": "Alice", "text": "first"},
{"start": 3, "end": 4, "speaker": "Bob", "text": "second", "source":"custom.json", "source_segment_index": 7}
]`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-schema", config.OutputSchemaFull,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if err := schema.ValidateTranscript(transcript); err != nil {
t.Fatalf("full output should validate: %v", err)
}
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].Source != filepath.Base(input) {
t.Fatalf("source fallback = %q, want %q", transcript.Segments[0].Source, filepath.Base(input))
}
if transcript.Segments[0].SourceSegmentIndex == nil || *transcript.Segments[0].SourceSegmentIndex != 0 {
t.Fatalf("source_segment_index fallback = %v, want 0", transcript.Segments[0].SourceSegmentIndex)
}
if transcript.Segments[1].Source != "custom.json" {
t.Fatalf("explicit source preserved = %q, want custom.json", transcript.Segments[1].Source)
}
if transcript.Segments[1].SourceSegmentIndex == nil || *transcript.Segments[1].SourceSegmentIndex != 7 {
t.Fatalf("explicit source_segment_index preserved = %v, want 7", transcript.Segments[1].SourceSegmentIndex)
}
if transcript.OverlapGroups == nil || len(transcript.OverlapGroups) != 0 {
t.Fatalf("overlap_groups = %#v, want empty array", transcript.OverlapGroups)
}
}
func TestNormalizeEmptySegmentsArrayProducesValidOutput(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 0 {
t.Fatalf("segment count = %d, want 0", len(transcript.Segments))
}
if err := schema.ValidateIntermediateTranscript(transcript); err != nil {
t.Fatalf("intermediate output should validate: %v", err)
}
}
func TestNormalizeSelectedOutputSchemaIsHonored(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"speaker":"A","text":"one"}]}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--output-schema", config.OutputSchemaMinimal,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var transcript schema.MinimalTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
t.Fatalf("output schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
}
}
func TestNormalizeReportFileWrittenAndContainsObjectInputShape(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"speaker":"A","text":"one"}]}`)
output := filepath.Join(dir, "normalized.json")
reportPath := filepath.Join(dir, "report.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
audit := extractNormalizeAudit(t, rpt)
if audit.InputShape != "object_with_segments" {
t.Fatalf("input shape = %q, want object_with_segments", audit.InputShape)
}
if audit.InputSegmentCount != 1 {
t.Fatalf("input segment count = %d, want 1", audit.InputSegmentCount)
}
if audit.OutputSchema != config.OutputSchemaIntermediate {
t.Fatalf("output schema = %q, want %q", audit.OutputSchema, config.OutputSchemaIntermediate)
}
if len(audit.OutputModules) != 1 || audit.OutputModules[0] != "json" {
t.Fatalf("output modules = %v, want [json]", audit.OutputModules)
}
}
func TestNormalizeReportIncludesBareArrayShape(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `[{"start":1,"end":2,"speaker":"A","text":"one"}]`)
output := filepath.Join(dir, "normalized.json")
reportPath := filepath.Join(dir, "report.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
audit := extractNormalizeAudit(t, rpt)
if audit.InputShape != "bare_segments_array" {
t.Fatalf("input shape = %q, want bare_segments_array", audit.InputShape)
}
}
func TestNormalizeReportDoesNotIncludeTranscriptText(t *testing.T) {
dir := t.TempDir()
const segmentText = "normalize-report-secret-text"
input := writeJSONFile(t, dir, "input.json", `[{"start":1,"end":2,"speaker":"A","text":"`+segmentText+`"}]`)
output := filepath.Join(dir, "normalized.json")
reportPath := filepath.Join(dir, "report.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
for _, event := range rpt.Events {
if strings.Contains(event.Message, segmentText) {
t.Fatalf("report unexpectedly contained transcript text in event %#v", event)
}
}
}
func TestNormalizeReportEmptyInputEmitsWarning(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[]}`)
output := filepath.Join(dir, "normalized.json")
reportPath := filepath.Join(dir, "report.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
)
if err != nil {
t.Fatalf("normalize failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
found := false
for _, event := range rpt.Events {
if event.Stage == "normalize" && event.Module == "normalize" && event.Severity == report.SeverityWarning &&
strings.Contains(event.Message, "zero segments") {
found = true
break
}
}
if !found {
t.Fatalf("expected empty transcript warning event, got %#v", rpt.Events)
}
}
func TestNormalizeReportWriteFailureReturnsClearError(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "input.json", `{"segments":[{"start":1,"end":2,"speaker":"A","text":"one"}]}`)
output := filepath.Join(dir, "normalized.json")
err := executeNormalize(
"--input-file", input,
"--output-file", output,
"--report-file", dir,
)
if err == nil {
t.Fatal("expected report write failure")
}
if !strings.Contains(err.Error(), "write --report-file") {
t.Fatalf("unexpected error: %v", err)
}
}
func executeNormalize(args ...string) error {
cmd := NewRootCommand()
cmd.SetArgs(append([]string{"normalize"}, args...))
return cmd.Execute()
}
type normalizeAudit struct {
Command string `json:"command"`
InputFile string `json:"input_file"`
OutputFile string `json:"output_file"`
InputShape string `json:"input_shape"`
InputSegmentCount int `json:"input_segment_count"`
OutputSchema string `json:"output_schema"`
OutputModules []string `json:"output_modules"`
IDsReassigned bool `json:"ids_reassigned"`
SortingChangedInput bool `json:"sorting_changed_input_order"`
SegmentsWithCategories int `json:"segments_with_categories"`
}
func extractNormalizeAudit(t *testing.T, rpt report.Report) normalizeAudit {
t.Helper()
for _, event := range rpt.Events {
if event.Stage == "normalize" && event.Module == "normalize-audit" {
var audit normalizeAudit
if err := json.Unmarshal([]byte(event.Message), &audit); err != nil {
t.Fatalf("decode normalize audit: %v", err)
}
return audit
}
}
t.Fatalf("missing normalize-audit event: %#v", rpt.Events)
return normalizeAudit{}
}
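For reference, the two input shapes asserted in the report tests above correspond to these JSON forms; a minimal sketch using the same hypothetical field values as the fixtures:

object_with_segments:
{"segments": [{"start": 1, "end": 2, "speaker": "A", "text": "one"}]}

bare_segments_array:
[{"start": 1, "end": 2, "speaker": "A", "text": "one"}]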

View File

@@ -10,12 +10,14 @@ import (
func NewRootCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "seriatim",
-Short: "Merge per-speaker transcripts into a chronological transcript",
+Short: "Merge, trim, and normalize transcript artifacts",
Version: buildinfo.Version,
SilenceErrors: true,
SilenceUsage: true,
}
cmd.AddCommand(newMergeCommand())
+cmd.AddCommand(newNormalizeCommand())
+cmd.AddCommand(newTrimCommand())
return cmd
}
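With all three subcommands registered, the root command can be driven programmatically exactly as the test helpers do. A minimal sketch, assuming an in-module caller (the cli package is internal) and hypothetical file paths:

cmd := cli.NewRootCommand()
cmd.SetArgs([]string{"trim", "--input-file", "merged.json", "--output-file", "trimmed.json", "--keep", "1-10,15"})
if err := cmd.Execute(); err != nil {
log.Fatal(err) // errors are returned rather than self-printed because SilenceErrors is set
}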

internal/cli/trim.go (new file, 191 lines)
View File

@@ -0,0 +1,191 @@
package cli
import (
"encoding/json"
"fmt"
"os"
"sort"
"github.com/spf13/cobra"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
triminternal "gitea.maximumdirect.net/eric/seriatim/internal/trim"
)
type trimAuditReport struct {
Operation string `json:"operation"`
InputFile string `json:"input_file"`
OutputFile string `json:"output_file"`
InputSchema string `json:"input_schema"`
OutputSchema string `json:"output_schema"`
Mode string `json:"mode"`
Selector string `json:"selector"`
SelectedIDs []int `json:"selected_ids"`
AllowEmpty bool `json:"allow_empty"`
InputSegmentCount int `json:"input_segment_count"`
RetainedSegmentCount int `json:"retained_segment_count"`
RemovedSegmentCount int `json:"removed_segment_count"`
RemovedInputIDs []int `json:"removed_input_ids"`
OldToNewIDMapping []trimIDMapping `json:"old_to_new_id_mapping"`
OverlapGroupsRecomputed bool `json:"overlap_groups_recomputed"`
}
type trimIDMapping struct {
OldID int `json:"old_id"`
NewID int `json:"new_id"`
}
func newTrimCommand() *cobra.Command {
var opts config.TrimOptions
cmd := &cobra.Command{
Use: "trim",
Short: "Trim an existing seriatim transcript artifact by segment ID",
RunE: func(cmd *cobra.Command, args []string) error {
trimOpts := opts
if !cmd.Flags().Changed("output-schema") {
trimOpts.OutputSchema = ""
}
cfg, err := config.NewTrimConfig(trimOpts)
if err != nil {
return err
}
selector, err := triminternal.ParseSelector(cfg.Selector)
if err != nil {
return fmt.Errorf("invalid selector %q: %w", cfg.Selector, err)
}
data, err := os.ReadFile(cfg.InputFile)
if err != nil {
return fmt.Errorf("read --input-file %q: %w", cfg.InputFile, err)
}
artifact, err := triminternal.ParseArtifactJSON(data)
if err != nil {
return fmt.Errorf("--input-file %q: %w", cfg.InputFile, err)
}
inputSegmentCount := artifact.SegmentCount()
inputSchema := artifact.Schema
mode := triminternal.ModeKeep
if cfg.Mode == "remove" {
mode = triminternal.ModeRemove
}
trimmed, err := triminternal.ApplyArtifact(artifact, triminternal.Options{
Mode: mode,
Selector: selector,
AllowEmpty: cfg.AllowEmpty,
})
if err != nil {
return err
}
outputSchema := artifact.Schema
if cfg.OutputSchema != "" {
outputSchema = cfg.OutputSchema
}
outputArtifact, err := triminternal.ConvertArtifact(trimmed.Artifact, outputSchema)
if err != nil {
return err
}
if err := triminternal.ValidateArtifact(outputArtifact); err != nil {
return fmt.Errorf("validate trimmed output: %w", err)
}
if err := writeOutputJSON(cfg.OutputFile, outputArtifact.Value()); err != nil {
return err
}
if cfg.ReportFile != "" {
audit := trimAuditReport{
Operation: "trim",
InputFile: cfg.InputFile,
OutputFile: cfg.OutputFile,
InputSchema: inputSchema,
OutputSchema: outputArtifact.Schema,
Mode: cfg.Mode,
Selector: cfg.Selector,
SelectedIDs: selector.IDs(),
AllowEmpty: cfg.AllowEmpty,
InputSegmentCount: inputSegmentCount,
RetainedSegmentCount: len(trimmed.OldToNewID),
RemovedSegmentCount: len(trimmed.RemovedIDs),
RemovedInputIDs: append([]int(nil), trimmed.RemovedIDs...),
OldToNewIDMapping: orderedIDMapping(trimmed.OldToNewID),
OverlapGroupsRecomputed: trimmed.OverlapGroupsRecomputed,
}
auditJSON, err := json.Marshal(audit)
if err != nil {
return fmt.Errorf("marshal trim audit report: %w", err)
}
rpt := report.Report{
Metadata: report.Metadata{
Application: outputArtifact.Application(),
Version: outputArtifact.Version(),
InputReader: "trim-artifact",
InputFiles: []string{cfg.InputFile},
OutputModules: []string{"json"},
},
Events: []report.Event{
report.Info("trim", "trim", fmt.Sprintf("trimmed %d input segment(s) into %d output segment(s) with mode=%s", inputSegmentCount, outputArtifact.SegmentCount(), cfg.Mode)),
report.Info("trim", "trim-audit", string(auditJSON)),
report.Info("trim", "validate-output", fmt.Sprintf("validated %d output segment(s)", outputArtifact.SegmentCount())),
report.Info("output", "json", "wrote transcript JSON"),
},
}
if err := report.WriteJSON(cfg.ReportFile, rpt); err != nil {
return err
}
}
return nil
},
}
flags := cmd.Flags()
flags.StringVar(&opts.InputFile, "input-file", "", "input seriatim transcript artifact JSON file")
flags.StringVar(&opts.OutputFile, "output-file", "", "output transcript JSON file")
flags.StringVar(&opts.ReportFile, "report-file", "", "optional report JSON file")
flags.StringVar(&opts.Keep, "keep", "", "segment ID selector to keep (for example: 1-10,15)")
flags.StringVar(&opts.Remove, "remove", "", "segment ID selector to remove (for example: 1-10,15)")
flags.StringVar(&opts.OutputSchema, "output-schema", "", "optional output JSON schema override: seriatim-minimal, seriatim-intermediate, or seriatim-full")
flags.BoolVar(&opts.AllowEmpty, "allow-empty", false, "allow trimming to an empty transcript")
return cmd
}
func writeOutputJSON(path string, value any) error {
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
enc := json.NewEncoder(file)
enc.SetIndent("", " ")
return enc.Encode(value)
}
func orderedIDMapping(mapping map[int]int) []trimIDMapping {
keys := make([]int, 0, len(mapping))
for oldID := range mapping {
keys = append(keys, oldID)
}
sort.Ints(keys)
pairs := make([]trimIDMapping, 0, len(keys))
for _, oldID := range keys {
pairs = append(pairs, trimIDMapping{
OldID: oldID,
NewID: mapping[oldID],
})
}
return pairs
}
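For reference, the marshaled audit lands in the report as the message of a single trim-audit event. A sketch of its shape for the remove-mode run exercised in the tests below (file paths hypothetical, all other values matching that test):

{
  "operation": "trim",
  "input_file": "input.json",
  "output_file": "trimmed.json",
  "input_schema": "seriatim-full",
  "output_schema": "seriatim-full",
  "mode": "remove",
  "selector": "4,2",
  "selected_ids": [2, 4],
  "allow_empty": false,
  "input_segment_count": 4,
  "retained_segment_count": 2,
  "removed_segment_count": 2,
  "removed_input_ids": [2, 4],
  "old_to_new_id_mapping": [{"old_id": 1, "new_id": 1}, {"old_id": 3, "new_id": 2}],
  "overlap_groups_recomputed": true
}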

internal/cli/trim_test.go (new file, 758 lines)
View File

@@ -0,0 +1,758 @@
package cli
import (
"encoding/json"
"os"
"path/filepath"
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
func TestTrimKeepModeEndToEnd(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "2,4",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].Text != "two" || transcript.Segments[1].Text != "four" {
t.Fatalf("unexpected kept text order: %#v", transcript.Segments)
}
assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID})
}
func TestTrimRemoveModeEndToEnd(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--remove", "2,4",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
if transcript.Segments[0].Text != "one" || transcript.Segments[1].Text != "three" {
t.Fatalf("unexpected remaining text order: %#v", transcript.Segments)
}
assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID})
}
func TestTrimMutualExclusionFailure(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
"--remove", "2",
)
if err == nil {
t.Fatal("expected mutual exclusion error")
}
if !strings.Contains(err.Error(), "mutually exclusive") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimMissingSelectionFailure(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
)
if err == nil {
t.Fatal("expected selection flag error")
}
if !strings.Contains(err.Error(), "exactly one of --keep or --remove is required") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimInvalidSelectedIDFailure(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "99",
)
if err == nil {
t.Fatal("expected missing selected ID error")
}
if !strings.Contains(err.Error(), "does not exist") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimOmittedOutputSchemaPreservesInputSchema(t *testing.T) {
dir := t.TempDir()
input := writeTrimMinimalFixture(t, dir, "input-minimal.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.MinimalTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
}
if len(transcript.Segments) != 1 || transcript.Segments[0].ID != 1 {
t.Fatalf("unexpected minimal trim output: %#v", transcript.Segments)
}
}
func TestTrimExplicitOutputSchemaChangesOutputSchema(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1,3",
"--output-schema", config.OutputSchemaMinimal,
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.MinimalTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaMinimal {
t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaMinimal)
}
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID})
}
func TestTrimExplicitOutputSchemaConvertsMinimalToIntermediate(t *testing.T) {
dir := t.TempDir()
input := writeTrimMinimalFixture(t, dir, "input-minimal.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1-2",
"--output-schema", config.OutputSchemaIntermediate,
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate {
t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate)
}
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID})
}
func TestTrimIntermediateInputPreservesIntermediateOutputAndCategories(t *testing.T) {
dir := t.TempDir()
input := writeTrimIntermediateFixture(t, dir, "input-intermediate.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "2",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.IntermediateTranscript
readJSON(t, output, &transcript)
if transcript.Metadata.OutputSchema != config.OutputSchemaIntermediate {
t.Fatalf("output_schema = %q, want %q", transcript.Metadata.OutputSchema, config.OutputSchemaIntermediate)
}
if len(transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(transcript.Segments))
}
if transcript.Segments[0].ID != 1 {
t.Fatalf("segment ID = %d, want 1", transcript.Segments[0].ID)
}
if len(transcript.Segments[0].Categories) != 2 {
t.Fatalf("category count = %d, want 2", len(transcript.Segments[0].Categories))
}
if transcript.Segments[0].Categories[0] != "filler" || transcript.Segments[0].Categories[1] != "backchannel" {
t.Fatalf("categories = %v, want [filler backchannel]", transcript.Segments[0].Categories)
}
}
func TestTrimFullInputPreservesFullShapeAndRecomputesOverlapGroups(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullOverlapFixture(t, dir, "input-full-overlap.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1,2",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(transcript.Segments))
}
assertSequentialIDs(t, []int{transcript.Segments[0].ID, transcript.Segments[1].ID})
if len(transcript.OverlapGroups) != 1 {
t.Fatalf("overlap group count = %d, want 1", len(transcript.OverlapGroups))
}
if transcript.OverlapGroups[0].ID != 1 {
t.Fatalf("overlap group id = %d, want 1", transcript.OverlapGroups[0].ID)
}
if transcript.Segments[0].OverlapGroupID != 1 || transcript.Segments[1].OverlapGroupID != 1 {
t.Fatalf("segment overlap IDs = %d,%d, want 1,1", transcript.Segments[0].OverlapGroupID, transcript.Segments[1].OverlapGroupID)
}
}
func TestTrimMalformedSelectorFailsWithClearError(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1-",
)
if err == nil {
t.Fatal("expected malformed selector error")
}
if !strings.Contains(err.Error(), "invalid selector") || !strings.Contains(err.Error(), "malformed element") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimMalformedInputArtifactFailsClearly(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "broken.json", `{"metadata":`)
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err == nil {
t.Fatal("expected malformed artifact error")
}
if !strings.Contains(err.Error(), "input JSON is malformed") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimDuplicateInputSegmentIDsFail(t *testing.T) {
dir := t.TempDir()
input := writeTrimMinimalWithIDsFixture(t, dir, "input-dup.json", []int{1, 1})
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err == nil {
t.Fatal("expected duplicate segment ID failure")
}
if !strings.Contains(err.Error(), "not a valid seriatim output artifact") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimNonSequentialInputSegmentIDsFail(t *testing.T) {
dir := t.TempDir()
input := writeTrimMinimalWithIDsFixture(t, dir, "input-nonseq.json", []int{1, 3})
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err == nil {
t.Fatal("expected non-sequential segment ID failure")
}
if !strings.Contains(err.Error(), "not a valid seriatim output artifact") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimKeepSelectorWithOverlappingRanges(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1-3,2-4",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 4 {
t.Fatalf("segment count = %d, want 4", len(transcript.Segments))
}
assertSequentialIDs(t, []int{
transcript.Segments[0].ID,
transcript.Segments[1].ID,
transcript.Segments[2].ID,
transcript.Segments[3].ID,
})
}
func TestTrimRemoveSelectorWithOverlappingRanges(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--remove", "2-3,3-4",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(transcript.Segments))
}
if transcript.Segments[0].Text != "one" {
t.Fatalf("remaining segment = %#v, want one", transcript.Segments[0])
}
}
func TestTrimSelectorOrderDoesNotAffectTranscriptOrder(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "4,1,3",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 3 {
t.Fatalf("segment count = %d, want 3", len(transcript.Segments))
}
got := []string{
transcript.Segments[0].Text,
transcript.Segments[1].Text,
transcript.Segments[2].Text,
}
want := []string{"one", "three", "four"}
if got[0] != want[0] || got[1] != want[1] || got[2] != want[2] {
t.Fatalf("segment text order = %v, want %v", got, want)
}
}
func TestTrimAllowEmptyBehavior(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--remove", "1-4",
)
if err == nil {
t.Fatal("expected empty-output error")
}
if !strings.Contains(err.Error(), "empty transcript") {
t.Fatalf("unexpected error: %v", err)
}
err = executeTrim(
"--input-file", input,
"--output-file", output,
"--remove", "1-4",
"--allow-empty",
)
if err != nil {
t.Fatalf("trim with --allow-empty failed: %v", err)
}
var transcript schema.Transcript
readJSON(t, output, &transcript)
if len(transcript.Segments) != 0 {
t.Fatalf("segment count = %d, want 0", len(transcript.Segments))
}
}
func TestTrimRejectsNonSeriatimInputArtifacts(t *testing.T) {
dir := t.TempDir()
input := writeJSONFile(t, dir, "raw-whisperx.json", `{
"segments": [
{"start": 1, "end": 2, "text": "hello"}
]
}`)
output := filepath.Join(dir, "trimmed.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err == nil {
t.Fatal("expected invalid artifact error")
}
if !strings.Contains(err.Error(), "not a valid seriatim output artifact") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestTrimReportFileContainsAuditFields(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
reportPath := filepath.Join(dir, "trim-report.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
"--remove", "4,2",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
if len(rpt.Events) == 0 {
t.Fatal("expected report events")
}
if !hasReportEvent(rpt, "trim", "trim", "trimmed 4 input segment(s) into 2 output segment(s) with mode=remove") {
t.Fatal("expected trim summary event")
}
if !hasReportEvent(rpt, "trim", "validate-output", "validated 2 output segment(s)") {
t.Fatal("expected validation event")
}
audit := extractTrimAuditEvent(t, rpt)
if audit.Operation != "trim" {
t.Fatalf("operation = %q, want trim", audit.Operation)
}
if audit.InputFile != input {
t.Fatalf("input_file = %q, want %q", audit.InputFile, input)
}
if audit.OutputFile != output {
t.Fatalf("output_file = %q, want %q", audit.OutputFile, output)
}
if audit.InputSchema != config.OutputSchemaFull || audit.OutputSchema != config.OutputSchemaFull {
t.Fatalf("schemas = %q -> %q, want full -> full", audit.InputSchema, audit.OutputSchema)
}
if audit.Mode != "remove" {
t.Fatalf("mode = %q, want remove", audit.Mode)
}
if audit.Selector != "4,2" {
t.Fatalf("selector = %q, want %q", audit.Selector, "4,2")
}
assertIntSliceEqual(t, audit.SelectedIDs, []int{2, 4})
if audit.AllowEmpty {
t.Fatal("allow_empty should be false")
}
if audit.InputSegmentCount != 4 || audit.RetainedSegmentCount != 2 || audit.RemovedSegmentCount != 2 {
t.Fatalf("counts = input:%d retained:%d removed:%d, want 4/2/2", audit.InputSegmentCount, audit.RetainedSegmentCount, audit.RemovedSegmentCount)
}
assertIntSliceEqual(t, audit.RemovedInputIDs, []int{2, 4})
if len(audit.OldToNewIDMapping) != 2 {
t.Fatalf("mapping length = %d, want 2", len(audit.OldToNewIDMapping))
}
if audit.OldToNewIDMapping[0].OldID != 1 || audit.OldToNewIDMapping[0].NewID != 1 {
t.Fatalf("mapping[0] = %#v, want old_id=1 new_id=1", audit.OldToNewIDMapping[0])
}
if audit.OldToNewIDMapping[1].OldID != 3 || audit.OldToNewIDMapping[1].NewID != 2 {
t.Fatalf("mapping[1] = %#v, want old_id=3 new_id=2", audit.OldToNewIDMapping[1])
}
if !audit.OverlapGroupsRecomputed {
t.Fatal("expected overlap_groups_recomputed=true for full schema trim")
}
}
func TestTrimReportOldToNewMappingIsDeterministicSorted(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
reportPath := filepath.Join(dir, "trim-report.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--report-file", reportPath,
"--keep", "4,1,3",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
var rpt report.Report
readJSON(t, reportPath, &rpt)
audit := extractTrimAuditEvent(t, rpt)
if len(audit.OldToNewIDMapping) != 3 {
t.Fatalf("mapping length = %d, want 3", len(audit.OldToNewIDMapping))
}
for index, expectedOld := range []int{1, 3, 4} {
if audit.OldToNewIDMapping[index].OldID != expectedOld {
t.Fatalf("mapping[%d].old_id = %d, want %d", index, audit.OldToNewIDMapping[index].OldID, expectedOld)
}
}
}
func TestTrimNoReportFileWhenOmitted(t *testing.T) {
dir := t.TempDir()
input := writeTrimFullFixture(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
reportPath := filepath.Join(dir, "trim-report.json")
err := executeTrim(
"--input-file", input,
"--output-file", output,
"--keep", "1",
)
if err != nil {
t.Fatalf("trim failed: %v", err)
}
_, statErr := os.Stat(reportPath)
if !os.IsNotExist(statErr) {
t.Fatalf("expected no report file at %q, got err=%v", reportPath, statErr)
}
}
func executeTrim(args ...string) error {
cmd := NewRootCommand()
cmd.SetArgs(append([]string{"trim"}, args...))
return cmd.Execute()
}
func writeTrimFullFixture(t *testing.T, dir string, name string) string {
t.Helper()
first := 10
second := 20
third := 30
fourth := 40
value := schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
InputReader: "json-files",
InputFiles: []string{"a.json"},
PreprocessingModules: []string{"validate-raw"},
PostprocessingModules: []string{"assign-ids"},
OutputModules: []string{"json"},
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &first, SourceRef: "a.json#10", Speaker: "A", Start: 1, End: 2, Text: "one", OverlapGroupID: 9},
{ID: 2, Source: "a.json", SourceSegmentIndex: &second, SourceRef: "a.json#20", Speaker: "B", Start: 2, End: 3, Text: "two", OverlapGroupID: 9},
{ID: 3, Source: "a.json", SourceSegmentIndex: &third, SourceRef: "a.json#30", Speaker: "C", Start: 4, End: 5, Text: "three", OverlapGroupID: 10},
{ID: 4, Source: "a.json", SourceSegmentIndex: &fourth, SourceRef: "a.json#40", Speaker: "D", Start: 5, End: 6, Text: "four", OverlapGroupID: 10},
},
OverlapGroups: []schema.OverlapGroup{
{ID: 9, Start: 1, End: 3, Segments: []string{"a.json#10", "a.json#20"}, Speakers: []string{"A", "B"}, Class: "unknown", Resolution: "unresolved"},
},
}
return writeTrimArtifactFile(t, dir, name, value)
}
func writeTrimMinimalFixture(t *testing.T, dir string, name string) string {
t.Helper()
value := schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: config.OutputSchemaMinimal,
},
Segments: []schema.MinimalSegment{
{ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one"},
{ID: 2, Start: 2, End: 3, Speaker: "B", Text: "two"},
},
}
return writeTrimArtifactFile(t, dir, name, value)
}
func writeTrimIntermediateFixture(t *testing.T, dir string, name string) string {
t.Helper()
value := schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: config.OutputSchemaIntermediate,
},
Segments: []schema.IntermediateSegment{
{ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one", Categories: []string{"word-run"}},
{ID: 2, Start: 2, End: 3, Speaker: "B", Text: "two", Categories: []string{"filler", "backchannel"}},
},
}
return writeTrimArtifactFile(t, dir, name, value)
}
func writeTrimMinimalWithIDsFixture(t *testing.T, dir string, name string, ids []int) string {
t.Helper()
if len(ids) < 2 {
t.Fatalf("need at least two IDs, got %d", len(ids))
}
value := schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: config.OutputSchemaMinimal,
},
Segments: []schema.MinimalSegment{
{ID: ids[0], Start: 1, End: 2, Speaker: "A", Text: "one"},
{ID: ids[1], Start: 2, End: 3, Speaker: "B", Text: "two"},
},
}
return writeTrimArtifactFile(t, dir, name, value)
}
func writeTrimFullOverlapFixture(t *testing.T, dir string, name string) string {
t.Helper()
first := 10
second := 20
third := 30
value := schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
InputReader: "json-files",
InputFiles: []string{"a.json"},
PreprocessingModules: []string{"validate-raw"},
PostprocessingModules: []string{"detect-overlaps", "assign-ids"},
OutputModules: []string{"json"},
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &first, SourceRef: "a.json#10", Speaker: "A", Start: 1, End: 3, Text: "one", OverlapGroupID: 5},
{ID: 2, Source: "a.json", SourceSegmentIndex: &second, SourceRef: "a.json#20", Speaker: "B", Start: 2, End: 4, Text: "two", OverlapGroupID: 5},
{ID: 3, Source: "a.json", SourceSegmentIndex: &third, SourceRef: "a.json#30", Speaker: "C", Start: 6, End: 7, Text: "three", OverlapGroupID: 6},
},
OverlapGroups: []schema.OverlapGroup{
{ID: 99, Start: 0, End: 100, Segments: []string{"stale"}, Speakers: []string{"stale"}, Class: "unknown", Resolution: "unresolved"},
},
}
return writeTrimArtifactFile(t, dir, name, value)
}
func writeTrimArtifactFile(t *testing.T, dir string, name string, value any) string {
t.Helper()
data, err := json.MarshalIndent(value, "", " ")
if err != nil {
t.Fatalf("marshal fixture: %v", err)
}
path := filepath.Join(dir, name)
if err := os.WriteFile(path, append(data, '\n'), 0o600); err != nil {
t.Fatalf("write fixture: %v", err)
}
return path
}
func assertSequentialIDs(t *testing.T, ids []int) {
t.Helper()
for index, id := range ids {
want := index + 1
if id != want {
t.Fatalf("id at index %d = %d, want %d", index, id, want)
}
}
}
func extractTrimAuditEvent(t *testing.T, rpt report.Report) trimAuditReport {
t.Helper()
for _, event := range rpt.Events {
if event.Stage == "trim" && event.Module == "trim-audit" {
var audit trimAuditReport
if err := json.Unmarshal([]byte(event.Message), &audit); err != nil {
t.Fatalf("decode trim audit event: %v", err)
}
return audit
}
}
t.Fatal("missing trim-audit event")
return trimAuditReport{}
}
func assertIntSliceEqual(t *testing.T, got []int, want []int) {
t.Helper()
if len(got) != len(want) {
t.Fatalf("slice length = %d, want %d", len(got), len(want))
}
for index := range got {
if got[index] != want[index] {
t.Fatalf("slice[%d] = %d, want %d (full got=%v, want=%v)", index, got[index], want[index], got, want)
}
}
}
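Taken together, the selector tests above pin down the grammar: comma-separated elements, each a single ID or an inclusive range; overlapping ranges union; selector order never affects transcript order; and a dangling range such as "1-" is rejected. A sketch of the observable behavior, using only the ParseSelector and IDs calls already exercised by the trim command:

selector, err := triminternal.ParseSelector("1-3,2-4")
// err == nil; selector.IDs() yields the sorted, deduplicated set [1 2 3 4]
_, err = triminternal.ParseSelector("1-")
// err != nil; surfaced to the user as: invalid selector "1-": ... malformed element ...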

View File

@@ -13,22 +13,23 @@ import (
const (
DefaultInputReader = "json-files"
DefaultOutputModules = "json"
-DefaultOutputSchema = OutputSchemaDefault
+DefaultOutputSchema = OutputSchemaIntermediate
DefaultPreprocessingModules = "validate-raw,normalize-speakers,trim-text"
DefaultPostprocessingModules = "detect-overlaps,resolve-overlaps,backchannel,filler,resolve-danglers,coalesce,detect-overlaps,autocorrect,assign-ids,validate-output"
-DefaultOverlapWordRunGap = 0.75
+DefaultOverlapWordRunGap = 1.0
DefaultWordRunReorderWindow = 1.0
DefaultCoalesceGap = 3.0
DefaultCoalesceGapValue = "3.0"
DefaultBackchannelMaxDuration = 2.0
DefaultFillerMaxDuration = 1.25
+OutputSchemaEnv = "SERIATIM_OUTPUT_SCHEMA"
OverlapWordRunGapEnv = "SERIATIM_OVERLAP_WORD_RUN_GAP"
WordRunReorderWindowEnv = "SERIATIM_OVERLAP_WORD_RUN_REORDER_WINDOW"
BackchannelMaxDurationEnv = "SERIATIM_BACKCHANNEL_MAX_DURATION"
FillerMaxDurationEnv = "SERIATIM_FILLER_MAX_DURATION"
-OutputSchemaDefault = "default"
-OutputSchemaSeriatim = "seriatim"
-OutputSchemaMinimal = "minimal"
+OutputSchemaMinimal = "seriatim-minimal"
+OutputSchemaIntermediate = "seriatim-intermediate"
+OutputSchemaFull = "seriatim-full"
)
// MergeOptions captures raw CLI option values before validation.
@@ -46,6 +47,26 @@ type MergeOptions struct {
CoalesceGap string
}
// TrimOptions captures raw CLI option values before validation.
type TrimOptions struct {
InputFile string
OutputFile string
ReportFile string
Keep string
Remove string
OutputSchema string
AllowEmpty bool
}
// NormalizeOptions captures raw CLI option values before validation.
type NormalizeOptions struct {
InputFile string
OutputFile string
ReportFile string
OutputSchema string
OutputModules string
}
// Config is the validated runtime configuration for a merge invocation.
type Config struct {
InputFiles []string
@@ -65,12 +86,32 @@ type Config struct {
FillerMaxDuration float64
}
// TrimConfig is the validated runtime configuration for a trim invocation.
type TrimConfig struct {
InputFile string
OutputFile string
ReportFile string
Mode string
Selector string
OutputSchema string
AllowEmpty bool
}
// NormalizeConfig is the validated runtime configuration for a normalize invocation.
type NormalizeConfig struct {
InputFile string
OutputFile string
ReportFile string
OutputSchema string
OutputModules []string
}
// NewMergeConfig validates raw merge options and returns normalized config.
func NewMergeConfig(opts MergeOptions) (Config, error) {
cfg := Config{
InputReader: strings.TrimSpace(opts.InputReader),
OutputModules: nil,
-OutputSchema: strings.TrimSpace(opts.OutputSchema),
+OutputSchema: "",
PreprocessingModules: nil,
PostprocessingModules: nil,
OverlapWordRunGap: DefaultOverlapWordRunGap,
@@ -83,14 +124,12 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
if cfg.InputReader == "" {
return Config{}, errors.New("--input-reader is required")
}
-if cfg.OutputSchema == "" {
-cfg.OutputSchema = DefaultOutputSchema
-}
-if err := validateOutputSchema(cfg.OutputSchema); err != nil {
+var err error
+cfg.OutputSchema, err = resolveOutputSchema(opts.OutputSchema)
+if err != nil {
return Config{}, err
}
-var err error
cfg.OutputModules, err = parseModuleList(opts.OutputModules)
if err != nil {
return Config{}, fmt.Errorf("--output-modules: %w", err)
@@ -169,6 +208,111 @@ func NewMergeConfig(opts MergeOptions) (Config, error) {
return cfg, nil
}
// NewTrimConfig validates raw trim options and returns normalized config.
func NewTrimConfig(opts TrimOptions) (TrimConfig, error) {
inputFile := filepath.Clean(strings.TrimSpace(opts.InputFile))
if strings.TrimSpace(opts.InputFile) == "" {
return TrimConfig{}, errors.New("--input-file is required")
}
if err := requireFile(inputFile, "--input-file"); err != nil {
return TrimConfig{}, err
}
outputFile, err := normalizeOutputPath(opts.OutputFile, "--output-file")
if err != nil {
return TrimConfig{}, err
}
reportFile := ""
if strings.TrimSpace(opts.ReportFile) != "" {
reportFile, err = normalizeOutputPath(opts.ReportFile, "--report-file")
if err != nil {
return TrimConfig{}, err
}
}
keep := strings.TrimSpace(opts.Keep)
remove := strings.TrimSpace(opts.Remove)
if keep == "" && remove == "" {
return TrimConfig{}, errors.New("exactly one of --keep or --remove is required")
}
if keep != "" && remove != "" {
return TrimConfig{}, errors.New("--keep and --remove are mutually exclusive")
}
mode := "keep"
selector := keep
if remove != "" {
mode = "remove"
selector = remove
}
outputSchema := strings.TrimSpace(opts.OutputSchema)
if outputSchema != "" {
if err := validateOutputSchema(outputSchema); err != nil {
return TrimConfig{}, err
}
}
return TrimConfig{
InputFile: inputFile,
OutputFile: outputFile,
ReportFile: reportFile,
Mode: mode,
Selector: selector,
OutputSchema: outputSchema,
AllowEmpty: opts.AllowEmpty,
}, nil
}
// NewNormalizeConfig validates raw normalize options and returns normalized config.
func NewNormalizeConfig(opts NormalizeOptions) (NormalizeConfig, error) {
inputFile := filepath.Clean(strings.TrimSpace(opts.InputFile))
if strings.TrimSpace(opts.InputFile) == "" {
return NormalizeConfig{}, errors.New("--input-file is required")
}
if err := requireFile(inputFile, "--input-file"); err != nil {
return NormalizeConfig{}, err
}
outputFile, err := normalizeOutputPath(opts.OutputFile, "--output-file")
if err != nil {
return NormalizeConfig{}, err
}
reportFile := ""
if strings.TrimSpace(opts.ReportFile) != "" {
reportFile, err = normalizeOutputPath(opts.ReportFile, "--report-file")
if err != nil {
return NormalizeConfig{}, err
}
}
outputSchema, err := resolveOutputSchema(opts.OutputSchema)
if err != nil {
return NormalizeConfig{}, err
}
outputModules, err := parseModuleList(opts.OutputModules)
if err != nil {
return NormalizeConfig{}, fmt.Errorf("--output-modules: %w", err)
}
if len(outputModules) == 0 {
return NormalizeConfig{}, errors.New("--output-modules must include at least one module")
}
if err := validateNormalizeOutputModules(outputModules); err != nil {
return NormalizeConfig{}, err
}
return NormalizeConfig{
InputFile: inputFile,
OutputFile: outputFile,
ReportFile: reportFile,
OutputSchema: outputSchema,
OutputModules: outputModules,
}, nil
}
func parseModuleList(value string) ([]string, error) {
value = strings.TrimSpace(value)
if value == "" {
@@ -189,13 +333,27 @@ func parseModuleList(value string) ([]string, error) {
func validateOutputSchema(value string) error {
switch value {
-case OutputSchemaDefault, OutputSchemaSeriatim, OutputSchemaMinimal:
+case OutputSchemaMinimal, OutputSchemaIntermediate, OutputSchemaFull:
return nil
default:
-return fmt.Errorf("--output-schema must be one of %q, %q, or %q", OutputSchemaDefault, OutputSchemaMinimal, OutputSchemaSeriatim)
+return fmt.Errorf("--output-schema must be one of %q, %q, or %q", OutputSchemaMinimal, OutputSchemaIntermediate, OutputSchemaFull)
}
}
func resolveOutputSchema(value string) (string, error) {
value = strings.TrimSpace(value)
if value == "" {
value = strings.TrimSpace(os.Getenv(OutputSchemaEnv))
}
if value == "" {
value = DefaultOutputSchema
}
if err := validateOutputSchema(value); err != nil {
return "", err
}
return value, nil
}
func normalizeInputFiles(paths []string) ([]string, error) {
if len(paths) == 0 {
return nil, errors.New("at least one --input-file is required")
@@ -308,3 +466,12 @@ func contains(values []string, target string) bool {
}
return false
}
func validateNormalizeOutputModules(modules []string) error {
for _, module := range modules {
if module != "json" {
return fmt.Errorf("unknown output module %q", module)
}
}
return nil
}
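resolveOutputSchema resolves precedence as flag value first, then SERIATIM_OUTPUT_SCHEMA, then the intermediate default, mirroring the merge config tests below. A sketch of the behavior from inside the config package (test context assumed for t.Setenv):

t.Setenv(OutputSchemaEnv, OutputSchemaFull)
schema, _ := resolveOutputSchema("")                 // "seriatim-full": env fills an omitted flag
schema, _ = resolveOutputSchema(OutputSchemaMinimal) // "seriatim-minimal": an explicit flag wins
t.Setenv(OutputSchemaEnv, "")
schema, _ = resolveOutputSchema("")                  // "seriatim-intermediate": the default
_ = schema // keep the sketch compiling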

View File

@@ -46,7 +46,8 @@ func TestDuplicateInputFilesFailValidation(t *testing.T) {
}
}
-func TestOutputSchemaDefaultsToDefault(t *testing.T) {
+func TestOutputSchemaDefaultsToIntermediate(t *testing.T) {
+t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
@@ -67,7 +68,8 @@ func TestOutputSchemaDefaultsToDefault(t *testing.T) {
}
}
-func TestOutputSchemaAcceptsDefault(t *testing.T) {
+func TestOutputSchemaAcceptsIntermediate(t *testing.T) {
+t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
@@ -77,19 +79,20 @@ func TestOutputSchemaAcceptsDefault(t *testing.T) {
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
-OutputSchema: OutputSchemaDefault,
+OutputSchema: OutputSchemaIntermediate,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
-if cfg.OutputSchema != OutputSchemaDefault {
-t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaDefault)
+if cfg.OutputSchema != OutputSchemaIntermediate {
+t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaIntermediate)
}
}
func TestOutputSchemaAcceptsMinimal(t *testing.T) {
+t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
@@ -111,7 +114,8 @@ func TestOutputSchemaAcceptsMinimal(t *testing.T) {
}
}
-func TestOutputSchemaAcceptsSeriatim(t *testing.T) {
+func TestOutputSchemaAcceptsFull(t *testing.T) {
+t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
@@ -121,19 +125,87 @@ func TestOutputSchemaAcceptsSeriatim(t *testing.T) {
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
-OutputSchema: OutputSchemaSeriatim,
+OutputSchema: OutputSchemaFull,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
-if cfg.OutputSchema != OutputSchemaSeriatim {
-t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaSeriatim)
+if cfg.OutputSchema != OutputSchemaFull {
+t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaFull)
}
}
func TestOutputSchemaUsesEnvWhenFlagOmitted(t *testing.T) {
t.Setenv(OutputSchemaEnv, OutputSchemaFull)
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
cfg, err := NewMergeConfig(MergeOptions{
InputFiles: []string{input},
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
if cfg.OutputSchema != OutputSchemaFull {
t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaFull)
}
}
func TestOutputSchemaFlagOverridesEnv(t *testing.T) {
t.Setenv(OutputSchemaEnv, OutputSchemaFull)
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
cfg, err := NewMergeConfig(MergeOptions{
InputFiles: []string{input},
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
OutputSchema: OutputSchemaMinimal,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
if cfg.OutputSchema != OutputSchemaMinimal {
t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaMinimal)
}
}
func TestOutputSchemaRejectsInvalidEnvValue(t *testing.T) {
t.Setenv(OutputSchemaEnv, "compact")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
_, err := NewMergeConfig(MergeOptions{
InputFiles: []string{input},
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err == nil {
t.Fatal("expected output schema error")
}
if !strings.Contains(err.Error(), "--output-schema must be one of") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestOutputSchemaRejectsUnknownValue(t *testing.T) {
+t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
@@ -155,7 +227,36 @@ func TestOutputSchemaRejectsUnknownValue(t *testing.T) {
}
}
-func TestOverlapWordRunGapDefaultsTo075(t *testing.T) {
+func TestOutputSchemaRejectsLegacyValues(t *testing.T) {
tests := []string{"default", "minimal", "seriatim"}
for _, legacy := range tests {
t.Run(legacy, func(t *testing.T) {
t.Setenv(OutputSchemaEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "merged.json")
_, err := NewMergeConfig(MergeOptions{
InputFiles: []string{input},
OutputFile: output,
InputReader: DefaultInputReader,
OutputModules: DefaultOutputModules,
OutputSchema: legacy,
PreprocessingModules: DefaultPreprocessingModules,
PostprocessingModules: DefaultPostprocessingModules,
})
if err == nil {
t.Fatal("expected output schema error")
}
if !strings.Contains(err.Error(), "--output-schema must be one of") {
t.Fatalf("unexpected error: %v", err)
}
})
}
}
func TestOverlapWordRunGapDefaultsTo1(t *testing.T) {
t.Setenv(OverlapWordRunGapEnv, "")
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
@@ -511,6 +612,206 @@ func TestCoalesceGapRejectsInvalidOverride(t *testing.T) {
}
}
func TestNewTrimConfigRequiresInputAndOutput(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
_, err := NewTrimConfig(TrimOptions{
OutputFile: output,
Keep: "1",
})
if err == nil || !strings.Contains(err.Error(), "--input-file is required") {
t.Fatalf("expected input-file required error, got %v", err)
}
_, err = NewTrimConfig(TrimOptions{
InputFile: input,
Keep: "1",
})
if err == nil || !strings.Contains(err.Error(), "--output-file is required") {
t.Fatalf("expected output-file required error, got %v", err)
}
}
func TestNewTrimConfigRequiresExactlyOneSelectorFlag(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
_, err := NewTrimConfig(TrimOptions{
InputFile: input,
OutputFile: output,
})
if err == nil || !strings.Contains(err.Error(), "exactly one of --keep or --remove is required") {
t.Fatalf("expected missing selector error, got %v", err)
}
_, err = NewTrimConfig(TrimOptions{
InputFile: input,
OutputFile: output,
Keep: "1",
Remove: "2",
})
if err == nil || !strings.Contains(err.Error(), "mutually exclusive") {
t.Fatalf("expected mutually exclusive selector error, got %v", err)
}
}
func TestNewTrimConfigAcceptsOutputSchemaOverride(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
reportPath := filepath.Join(dir, "report.json")
cfg, err := NewTrimConfig(TrimOptions{
InputFile: input,
OutputFile: output,
ReportFile: reportPath,
Remove: "3-5",
OutputSchema: OutputSchemaMinimal,
AllowEmpty: true,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
if cfg.Mode != "remove" {
t.Fatalf("mode = %q, want remove", cfg.Mode)
}
if cfg.Selector != "3-5" {
t.Fatalf("selector = %q, want 3-5", cfg.Selector)
}
if cfg.OutputSchema != OutputSchemaMinimal {
t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaMinimal)
}
if !cfg.AllowEmpty {
t.Fatal("allow empty should be true")
}
if cfg.ReportFile != reportPath {
t.Fatalf("report file = %q, want %q", cfg.ReportFile, reportPath)
}
}
func TestNewTrimConfigRejectsInvalidOutputSchemaOverride(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "trimmed.json")
_, err := NewTrimConfig(TrimOptions{
InputFile: input,
OutputFile: output,
Keep: "1",
OutputSchema: "compact",
})
if err == nil {
t.Fatal("expected output schema validation error")
}
if !strings.Contains(err.Error(), "--output-schema must be one of") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNewNormalizeConfigRequiresInputFile(t *testing.T) {
dir := t.TempDir()
output := filepath.Join(dir, "normalized.json")
_, err := NewNormalizeConfig(NormalizeOptions{
OutputFile: output,
OutputModules: DefaultOutputModules,
})
if err == nil {
t.Fatal("expected input-file required error")
}
if !strings.Contains(err.Error(), "--input-file is required") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNewNormalizeConfigRequiresOutputFile(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
_, err := NewNormalizeConfig(NormalizeOptions{
InputFile: input,
OutputModules: DefaultOutputModules,
})
if err == nil {
t.Fatal("expected output-file required error")
}
if !strings.Contains(err.Error(), "--output-file is required") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNewNormalizeConfigResolvesOutputSchemaDefaultAndEnv(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "normalized.json")
t.Setenv(OutputSchemaEnv, "")
cfg, err := NewNormalizeConfig(NormalizeOptions{
InputFile: input,
OutputFile: output,
OutputModules: DefaultOutputModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
if cfg.OutputSchema != DefaultOutputSchema {
t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, DefaultOutputSchema)
}
t.Setenv(OutputSchemaEnv, OutputSchemaMinimal)
cfg, err = NewNormalizeConfig(NormalizeOptions{
InputFile: input,
OutputFile: output,
OutputModules: DefaultOutputModules,
})
if err != nil {
t.Fatalf("config failed: %v", err)
}
if cfg.OutputSchema != OutputSchemaMinimal {
t.Fatalf("output schema = %q, want %q", cfg.OutputSchema, OutputSchemaMinimal)
}
}
func TestNewNormalizeConfigRejectsInvalidOutputSchema(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "normalized.json")
_, err := NewNormalizeConfig(NormalizeOptions{
InputFile: input,
OutputFile: output,
OutputSchema: "compact",
OutputModules: DefaultOutputModules,
})
if err == nil {
t.Fatal("expected output schema error")
}
if !strings.Contains(err.Error(), "--output-schema must be one of") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestNewNormalizeConfigRejectsUnknownOutputModule(t *testing.T) {
dir := t.TempDir()
input := writeTempFile(t, dir, "input.json")
output := filepath.Join(dir, "normalized.json")
_, err := NewNormalizeConfig(NormalizeOptions{
InputFile: input,
OutputFile: output,
OutputModules: "json,yaml",
})
if err == nil {
t.Fatal("expected output module error")
}
if !strings.Contains(err.Error(), "unknown output module") {
t.Fatalf("unexpected error: %v", err)
}
}
func assertPositiveFloatEnvValidation(t *testing.T, envName string) {
t.Helper()


@@ -184,7 +184,7 @@ func nearestPriorMatch(segments []model.Segment, consumed []bool, index int) int
if consumed[candidate] {
continue
}
- if sharesDerivedFrom(segments[index], segments[candidate]) {
+ if segments[index].Speaker == segments[candidate].Speaker && sharesDerivedFrom(segments[index], segments[candidate]) {
return candidate
}
}
@@ -196,7 +196,7 @@ func nearestSubsequentMatch(segments []model.Segment, consumed []bool, index int) int
if consumed[candidate] {
continue
}
- if sharesDerivedFrom(segments[index], segments[candidate]) {
+ if segments[index].Speaker == segments[candidate].Speaker && sharesDerivedFrom(segments[index], segments[candidate]) {
return candidate
}
}


@@ -51,6 +51,30 @@ func TestApplyUsesAnyDerivedFromIntersection(t *testing.T) {
assertSegment(t, got.Segments[0], "resolve-danglers:1", "target end.", 1, 4, []string{"source#1", "source#2", "source#3"})
}
func TestApplyDoesNotMergeDanglersAcrossSpeakersBackward(t *testing.T) {
in := transcript(
segment("a", "Alice", 1, 2, "target", []string{"source#1"}),
segment("b", "Bob", 3, 4, "end.", []string{"source#1"}),
)
got, summary := Apply(in)
if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) {
t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in)
}
}
func TestApplyDoesNotMergeDanglersAcrossSpeakersForward(t *testing.T) {
in := transcript(
segment("a", "Alice", 1, 2, "start", []string{"source#1"}),
segment("b", "Bob", 3, 4, "target", []string{"source#1"}),
)
got, summary := Apply(in)
if summary.DanglersMerged != 0 || !reflect.DeepEqual(got, in) {
t.Fatalf("unexpected merge:\ngot %#v\nwant %#v", got, in)
}
}
func TestApplyDoesNotMergeWithoutSharedProvenance(t *testing.T) {
in := transcript(
segment("a", "Alice", 1, 2, "target", []string{"source#1"}),

internal/normalize/build.go Normal file

@@ -0,0 +1,216 @@
package normalize
import (
"fmt"
"path/filepath"
"sort"
"strings"
"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
// BuildResult contains normalize output plus deterministic transformation diagnostics.
type BuildResult struct {
Output any
SortingChanged bool
IDsReassigned bool
SegmentsWithCategories int
}
// Build converts parsed normalize input into a selected seriatim output schema.
func Build(parsed ParsedTranscript, cfg config.NormalizeConfig) (BuildResult, error) {
ordered := sortedSegments(parsed.Segments)
sortingChanged := didSortingChangeOrder(ordered)
idsReassigned := didReassignIDs(ordered)
segmentsWithCategories := countSegmentsWithCategories(ordered)
switch cfg.OutputSchema {
case config.OutputSchemaMinimal:
output := buildMinimal(ordered)
if err := schema.ValidateMinimalTranscript(output); err != nil {
return BuildResult{}, fmt.Errorf("validate normalize output: %w", err)
}
return BuildResult{
Output: output,
SortingChanged: sortingChanged,
IDsReassigned: idsReassigned,
SegmentsWithCategories: segmentsWithCategories,
}, nil
case config.OutputSchemaIntermediate:
output := buildIntermediate(ordered)
if err := schema.ValidateIntermediateTranscript(output); err != nil {
return BuildResult{}, fmt.Errorf("validate normalize output: %w", err)
}
return BuildResult{
Output: output,
SortingChanged: sortingChanged,
IDsReassigned: idsReassigned,
SegmentsWithCategories: segmentsWithCategories,
}, nil
case config.OutputSchemaFull:
output := buildFull(ordered, cfg)
if err := schema.ValidateTranscript(output); err != nil {
return BuildResult{}, fmt.Errorf("validate normalize output: %w", err)
}
return BuildResult{
Output: output,
SortingChanged: sortingChanged,
IDsReassigned: idsReassigned,
SegmentsWithCategories: segmentsWithCategories,
}, nil
default:
return BuildResult{}, fmt.Errorf("unsupported output schema %q", cfg.OutputSchema)
}
}
func sortedSegments(input []InputSegment) []InputSegment {
ordered := make([]InputSegment, len(input))
copy(ordered, input)
sort.SliceStable(ordered, func(i, j int) bool {
left := ordered[i]
right := ordered[j]
if left.Start != right.Start {
return left.Start < right.Start
}
if left.End != right.End {
return left.End < right.End
}
if left.InputIndex != right.InputIndex {
return left.InputIndex < right.InputIndex
}
return left.Speaker < right.Speaker
})
return ordered
}
func buildMinimal(segments []InputSegment) schema.MinimalTranscript {
outputSegments := make([]schema.MinimalSegment, len(segments))
for index, segment := range segments {
outputSegments[index] = schema.MinimalSegment{
ID: index + 1,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
}
}
return schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: artifact.ApplicationName,
Version: buildinfo.Version,
OutputSchema: config.OutputSchemaMinimal,
},
Segments: outputSegments,
}
}
func buildIntermediate(segments []InputSegment) schema.IntermediateTranscript {
outputSegments := make([]schema.IntermediateSegment, len(segments))
for index, segment := range segments {
outputSegments[index] = schema.IntermediateSegment{
ID: index + 1,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
Categories: append([]string(nil), segment.Categories...),
}
}
return schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: artifact.ApplicationName,
Version: buildinfo.Version,
OutputSchema: config.OutputSchemaIntermediate,
},
Segments: outputSegments,
}
}
func buildFull(segments []InputSegment, cfg config.NormalizeConfig) schema.Transcript {
defaultSource := filepath.Base(cfg.InputFile)
outputSegments := make([]schema.Segment, len(segments))
for index, segment := range segments {
source := strings.TrimSpace(segment.Source)
if source == "" {
source = defaultSource
}
sourceSegmentIndex := copyIntPtr(segment.SourceSegmentIndex)
if sourceSegmentIndex == nil {
fallback := segment.InputIndex
sourceSegmentIndex = &fallback
}
outputSegments[index] = schema.Segment{
ID: index + 1,
Source: source,
SourceSegmentIndex: sourceSegmentIndex,
SourceRef: segment.SourceRef,
DerivedFrom: append([]string(nil), segment.DerivedFrom...),
Speaker: segment.Speaker,
Start: segment.Start,
End: segment.End,
Text: segment.Text,
Categories: append([]string(nil), segment.Categories...),
}
}
return schema.Transcript{
Metadata: schema.Metadata{
Application: artifact.ApplicationName,
Version: buildinfo.Version,
InputReader: "normalize-input",
InputFiles: []string{cfg.InputFile},
PreprocessingModules: []string{},
PostprocessingModules: []string{},
OutputModules: append([]string(nil), cfg.OutputModules...),
},
Segments: outputSegments,
OverlapGroups: []schema.OverlapGroup{},
}
}
func copyIntPtr(value *int) *int {
if value == nil {
return nil
}
copied := *value
return &copied
}
func didSortingChangeOrder(segments []InputSegment) bool {
for index, segment := range segments {
if segment.InputIndex != index {
return true
}
}
return false
}
func didReassignIDs(segments []InputSegment) bool {
if len(segments) == 0 {
return false
}
for index, segment := range segments {
newID := index + 1
if segment.OriginalID == nil || *segment.OriginalID != newID {
return true
}
}
return false
}
func countSegmentsWithCategories(segments []InputSegment) int {
count := 0
for _, segment := range segments {
if len(segment.Categories) > 0 {
count++
}
}
return count
}
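
Review note: a minimal sketch (not part of this change set) of driving Build directly from inside the module. The ParsedTranscript literal and the direct NormalizeConfig construction are illustrative assumptions; only the minimal-schema path is exercised.

package main

import (
	"fmt"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/normalize"
)

func main() {
	// Segments arrive out of order; Build sorts by start, then end,
	// then original input index before assigning fresh sequential IDs.
	parsed := normalize.ParsedTranscript{
		Shape: normalize.ShapeBareSegmentsArray,
		Segments: []normalize.InputSegment{
			{InputIndex: 0, Start: 3, End: 4, Speaker: "Bob", Text: "world"},
			{InputIndex: 1, Start: 1, End: 2, Speaker: "Alice", Text: "hello"},
		},
	}
	built, err := normalize.Build(parsed, config.NormalizeConfig{OutputSchema: config.OutputSchemaMinimal})
	if err != nil {
		panic(err)
	}
	fmt.Println(built.SortingChanged) // true: the sort moved Alice ahead of Bob
	fmt.Println(built.IDsReassigned)  // true: neither input segment carried an original id
}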


@@ -0,0 +1,121 @@
package normalize
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"gitea.maximumdirect.net/eric/seriatim/internal/artifact"
"gitea.maximumdirect.net/eric/seriatim/internal/buildinfo"
"gitea.maximumdirect.net/eric/seriatim/internal/config"
"gitea.maximumdirect.net/eric/seriatim/internal/report"
)
type normalizeAudit struct {
Command string `json:"command"`
InputFile string `json:"input_file"`
OutputFile string `json:"output_file"`
InputShape string `json:"input_shape"`
InputSegmentCount int `json:"input_segment_count"`
OutputSchema string `json:"output_schema"`
OutputModules []string `json:"output_modules"`
IDsReassigned bool `json:"ids_reassigned"`
SortingChangedInput bool `json:"sorting_changed_input_order"`
SegmentsWithCategories int `json:"segments_with_categories"`
}
// Run executes artifact-level normalization.
func Run(ctx context.Context, cfg config.NormalizeConfig) error {
if err := ctx.Err(); err != nil {
return err
}
parsed, err := ParseFile(cfg.InputFile)
if err != nil {
return err
}
built, err := Build(parsed, cfg)
if err != nil {
return err
}
if err := writeOutputJSON(cfg.OutputFile, built.Output); err != nil {
return err
}
if cfg.ReportFile != "" {
audit := normalizeAudit{
Command: "normalize",
InputFile: cfg.InputFile,
OutputFile: cfg.OutputFile,
InputShape: string(parsed.Shape),
InputSegmentCount: len(parsed.Segments),
OutputSchema: cfg.OutputSchema,
OutputModules: append([]string(nil), cfg.OutputModules...),
IDsReassigned: built.IDsReassigned,
SortingChangedInput: built.SortingChanged,
SegmentsWithCategories: built.SegmentsWithCategories,
}
auditJSON, err := json.Marshal(audit)
if err != nil {
return fmt.Errorf("marshal normalize audit: %w", err)
}
events := []report.Event{
report.Info("normalize", "normalize", "started normalize command"),
report.Info("normalize", "normalize", fmt.Sprintf("input file: %s", cfg.InputFile)),
report.Info("normalize", "normalize", fmt.Sprintf("detected input shape: %s", parsed.Shape)),
report.Info("normalize", "normalize", fmt.Sprintf("input segment count: %d", len(parsed.Segments))),
report.Info("normalize", "normalize", fmt.Sprintf("selected output schema: %s", cfg.OutputSchema)),
report.Info("normalize", "normalize", fmt.Sprintf("selected output modules: %s", strings.Join(cfg.OutputModules, ","))),
report.Info("normalize", "normalize", fmt.Sprintf("output file: %s", cfg.OutputFile)),
report.Info("normalize", "normalize", fmt.Sprintf("ids reassigned: %t", built.IDsReassigned)),
report.Info("normalize", "normalize", fmt.Sprintf("sorting changed input order: %t", built.SortingChanged)),
report.Info("normalize", "normalize", fmt.Sprintf("segments with categories: %d", built.SegmentsWithCategories)),
report.Info("normalize", "normalize-audit", string(auditJSON)),
}
if len(parsed.Segments) == 0 {
events = append(events, report.Warning("normalize", "normalize", "input transcript contains zero segments"))
}
events = append(events,
report.Info("normalize", "validate-output", fmt.Sprintf("validated %d output segment(s)", len(parsed.Segments))),
report.Info("output", "json", "wrote transcript JSON"),
)
rpt := report.Report{
Metadata: report.Metadata{
Application: artifact.ApplicationName,
Version: buildinfo.Version,
InputReader: "normalize-input",
InputFiles: []string{cfg.InputFile},
PreprocessingModules: []string{},
PostprocessingModules: []string{},
OutputModules: append([]string(nil), cfg.OutputModules...),
},
Events: events,
}
if err := report.WriteJSON(cfg.ReportFile, rpt); err != nil {
return fmt.Errorf("write --report-file %q: %w", cfg.ReportFile, err)
}
}
return nil
}
func writeOutputJSON(path string, value any) error {
file, err := os.Create(path)
if err != nil {
return err
}
defer file.Close()
encoder := json.NewEncoder(file)
encoder.SetIndent("", " ")
if err := encoder.Encode(value); err != nil {
return fmt.Errorf("encode normalize output JSON: %w", err)
}
return nil
}
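
Review note: the Run entry point above composes ParseFile, Build, writeOutputJSON, and optional report emission. A minimal end-to-end sketch, assuming hypothetical file paths; NewNormalizeConfig and DefaultOutputModules are the config helpers exercised by the tests earlier in this change.

package main

import (
	"context"

	"gitea.maximumdirect.net/eric/seriatim/internal/config"
	"gitea.maximumdirect.net/eric/seriatim/internal/normalize"
)

func main() {
	cfg, err := config.NewNormalizeConfig(config.NormalizeOptions{
		InputFile:     "raw.json",        // hypothetical input path
		OutputFile:    "normalized.json", // hypothetical output path
		OutputSchema:  config.OutputSchemaMinimal,
		OutputModules: config.DefaultOutputModules,
	})
	if err != nil {
		panic(err)
	}
	// With no ReportFile configured, Run skips report emission.
	if err := normalize.Run(context.Background(), cfg); err != nil {
		panic(err)
	}
}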

internal/normalize/parse.go Normal file

@@ -0,0 +1,197 @@
package normalize
import (
"bytes"
"encoding/json"
"fmt"
"io"
"os"
"strings"
)
// InputShape identifies which top-level input shape was parsed.
type InputShape string
const (
ShapeObjectWithSegments InputShape = "object_with_segments"
ShapeBareSegmentsArray InputShape = "bare_segments_array"
)
// ParsedTranscript is the validated normalize input model.
type ParsedTranscript struct {
Shape InputShape
Segments []InputSegment
}
// InputSegment is a validated segment from normalize input.
type InputSegment struct {
InputIndex int
OriginalID *int
Start float64
End float64
Speaker string
Text string
Categories []string
Source string
SourceSegmentIndex *int
SourceRef string
DerivedFrom []string
OverlapGroupID *int
}
type inputSegmentPayload struct {
ID *int `json:"id"`
Start *float64 `json:"start"`
End *float64 `json:"end"`
Speaker *string `json:"speaker"`
Text *string `json:"text"`
Categories []string `json:"categories"`
Source string `json:"source"`
SourceSegmentIndex *int `json:"source_segment_index"`
SourceRef string `json:"source_ref"`
DerivedFrom []string `json:"derived_from"`
OverlapGroupID *int `json:"overlap_group_id"`
}
// ParseFile parses normalize input JSON from file path.
func ParseFile(path string) (ParsedTranscript, error) {
file, err := os.Open(path)
if err != nil {
return ParsedTranscript{}, err
}
defer file.Close()
return ParseReader(file)
}
// ParseReader parses normalize input JSON from a reader.
func ParseReader(reader io.Reader) (ParsedTranscript, error) {
var raw json.RawMessage
decoder := json.NewDecoder(reader)
decoder.UseNumber()
if err := decoder.Decode(&raw); err != nil {
return ParsedTranscript{}, fmt.Errorf("decode normalize input JSON: %w", err)
}
if err := ensureSingleValue(decoder); err != nil {
return ParsedTranscript{}, err
}
trimmed := bytes.TrimSpace(raw)
if len(trimmed) == 0 {
return ParsedTranscript{}, fmt.Errorf("normalize input is empty")
}
switch trimmed[0] {
case '{':
return parseObjectShape(trimmed)
case '[':
segments, err := parseSegmentsArray(trimmed)
if err != nil {
return ParsedTranscript{}, err
}
return ParsedTranscript{
Shape: ShapeBareSegmentsArray,
Segments: segments,
}, nil
default:
return ParsedTranscript{}, fmt.Errorf("normalize input must be a top-level object with \"segments\" or a top-level segment array")
}
}
func ensureSingleValue(decoder *json.Decoder) error {
var extra json.RawMessage
err := decoder.Decode(&extra)
if err == io.EOF {
return nil
}
if err == nil {
return fmt.Errorf("normalize input must contain exactly one top-level JSON value")
}
return fmt.Errorf("decode normalize input JSON: %w", err)
}
func parseObjectShape(raw []byte) (ParsedTranscript, error) {
var object map[string]json.RawMessage
if err := json.Unmarshal(raw, &object); err != nil {
return ParsedTranscript{}, fmt.Errorf("decode normalize object input: %w", err)
}
segmentsRaw, exists := object["segments"]
if !exists {
return ParsedTranscript{}, fmt.Errorf("normalize object input must contain a \"segments\" field")
}
segments, err := parseSegmentsArray(segmentsRaw)
if err != nil {
return ParsedTranscript{}, err
}
return ParsedTranscript{
Shape: ShapeObjectWithSegments,
Segments: segments,
}, nil
}
func parseSegmentsArray(raw []byte) ([]InputSegment, error) {
var segmentValues []json.RawMessage
if err := json.Unmarshal(raw, &segmentValues); err != nil {
return nil, fmt.Errorf("normalize input \"segments\" must be an array")
}
segments := make([]InputSegment, len(segmentValues))
for index, segmentRaw := range segmentValues {
segment, err := parseSegment(index, segmentRaw)
if err != nil {
return nil, err
}
segments[index] = segment
}
return segments, nil
}
func parseSegment(index int, raw []byte) (InputSegment, error) {
var payload inputSegmentPayload
if err := json.Unmarshal(raw, &payload); err != nil {
return InputSegment{}, fmt.Errorf("segment %d: invalid segment object: %w", index, err)
}
if payload.Start == nil {
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"start\"", index)
}
if payload.End == nil {
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"end\"", index)
}
if payload.Speaker == nil {
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"speaker\"", index)
}
if payload.Text == nil {
return InputSegment{}, fmt.Errorf("segment %d is missing required field \"text\"", index)
}
if *payload.Start < 0 {
return InputSegment{}, fmt.Errorf("segment %d has start %v; start must be >= 0", index, *payload.Start)
}
if *payload.End < *payload.Start {
return InputSegment{}, fmt.Errorf("segment %d has end %v before start %v", index, *payload.End, *payload.Start)
}
speaker := strings.TrimSpace(*payload.Speaker)
if speaker == "" {
return InputSegment{}, fmt.Errorf("segment %d has empty \"speaker\"; speaker must be non-empty", index)
}
return InputSegment{
InputIndex: index,
OriginalID: payload.ID,
Start: *payload.Start,
End: *payload.End,
Speaker: speaker,
Text: *payload.Text,
Categories: append([]string(nil), payload.Categories...),
Source: payload.Source,
SourceSegmentIndex: payload.SourceSegmentIndex,
SourceRef: payload.SourceRef,
DerivedFrom: append([]string(nil), payload.DerivedFrom...),
OverlapGroupID: payload.OverlapGroupID,
}, nil
}
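
Review note: a minimal sketch of feeding one of the two accepted input shapes through ParseReader; the JSON literal is illustrative.

package main

import (
	"fmt"
	"strings"

	"gitea.maximumdirect.net/eric/seriatim/internal/normalize"
)

func main() {
	// A bare segment array is accepted alongside {"segments": [...]}.
	parsed, err := normalize.ParseReader(strings.NewReader(
		`[{"start": 1.0, "end": 2.5, "speaker": " Alice ", "text": "hello"}]`))
	if err != nil {
		panic(err)
	}
	fmt.Println(parsed.Shape)               // bare_segments_array
	fmt.Println(parsed.Segments[0].Speaker) // "Alice": speaker whitespace is trimmed
}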


@@ -0,0 +1,181 @@
package normalize
import (
"strings"
"testing"
)
func TestParseReaderObjectWithSegmentsParses(t *testing.T) {
input := `{
"segments": [
{"start": 1.0, "end": 2.0, "speaker": " Alice ", "text": "hello", "id": 100}
]
}`
parsed, err := ParseReader(strings.NewReader(input))
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if parsed.Shape != ShapeObjectWithSegments {
t.Fatalf("shape = %q, want %q", parsed.Shape, ShapeObjectWithSegments)
}
if len(parsed.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(parsed.Segments))
}
segment := parsed.Segments[0]
if segment.Speaker != "Alice" {
t.Fatalf("speaker = %q, want %q", segment.Speaker, "Alice")
}
if segment.OriginalID == nil || *segment.OriginalID != 100 {
t.Fatalf("original id = %v, want 100", segment.OriginalID)
}
}
func TestParseReaderBareSegmentArrayParses(t *testing.T) {
input := `[
{"start": 1.0, "end": 2.0, "speaker": "Alice", "text": "hello"},
{"start": 3.0, "end": 4.0, "speaker": "Bob", "text": "world"}
]`
parsed, err := ParseReader(strings.NewReader(input))
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if parsed.Shape != ShapeBareSegmentsArray {
t.Fatalf("shape = %q, want %q", parsed.Shape, ShapeBareSegmentsArray)
}
if len(parsed.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(parsed.Segments))
}
}
func TestParseReaderInvalidJSONFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`{"segments":`))
if err == nil {
t.Fatal("expected parse error")
}
if !strings.Contains(err.Error(), "decode normalize input JSON") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestParseReaderObjectMissingSegmentsFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`{"items":[]}`))
if err == nil {
t.Fatal("expected missing segments error")
}
if !strings.Contains(err.Error(), "must contain a \"segments\" field") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestParseReaderSegmentsNotArrayFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`{"segments": {}}`))
if err == nil {
t.Fatal("expected segments not array error")
}
if !strings.Contains(err.Error(), "\"segments\" must be an array") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestParseReaderTopLevelScalarShapesFail(t *testing.T) {
tests := []string{`"text"`, `42`, `null`, `true`}
for _, input := range tests {
_, err := ParseReader(strings.NewReader(input))
if err == nil {
t.Fatalf("expected top-level shape error for %s", input)
}
if !strings.Contains(err.Error(), "top-level object") {
t.Fatalf("unexpected error for %s: %v", input, err)
}
}
}
func TestParseReaderMissingStartFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"end":2,"speaker":"A","text":"t"}]`))
assertContains(t, err, `missing required field "start"`)
}
func TestParseReaderMissingEndFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":1,"speaker":"A","text":"t"}]`))
assertContains(t, err, `missing required field "end"`)
}
func TestParseReaderMissingSpeakerFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"text":"t"}]`))
assertContains(t, err, `missing required field "speaker"`)
}
func TestParseReaderEmptySpeakerFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":" ","text":"t"}]`))
assertContains(t, err, `speaker must be non-empty`)
}
func TestParseReaderMissingTextFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":"A"}]`))
assertContains(t, err, `missing required field "text"`)
}
func TestParseReaderEndBeforeStartFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":3,"end":2,"speaker":"A","text":"t"}]`))
assertContains(t, err, "before start")
}
func TestParseReaderNegativeStartFails(t *testing.T) {
_, err := ParseReader(strings.NewReader(`[{"start":-1,"end":2,"speaker":"A","text":"t"}]`))
assertContains(t, err, "start must be >= 0")
}
func TestParseReaderEmptySegmentsArrayAccepted(t *testing.T) {
parsed, err := ParseReader(strings.NewReader(`{"segments":[]}`))
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if len(parsed.Segments) != 0 {
t.Fatalf("segment count = %d, want 0", len(parsed.Segments))
}
}
func TestParseReaderCategoriesPreservedWhenValid(t *testing.T) {
parsed, err := ParseReader(strings.NewReader(`[{"start":1,"end":2,"speaker":"A","text":"t","categories":["filler","backchannel"]}]`))
if err != nil {
t.Fatalf("parse failed: %v", err)
}
if len(parsed.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(parsed.Segments))
}
if len(parsed.Segments[0].Categories) != 2 {
t.Fatalf("categories length = %d, want 2", len(parsed.Segments[0].Categories))
}
if parsed.Segments[0].Categories[0] != "filler" || parsed.Segments[0].Categories[1] != "backchannel" {
t.Fatalf("categories = %v", parsed.Segments[0].Categories)
}
}
func TestParseReaderOriginalInputIndexPreserved(t *testing.T) {
input := `[
{"start":1,"end":2,"speaker":"A","text":"one"},
{"start":2,"end":3,"speaker":"B","text":"two"},
{"start":3,"end":4,"speaker":"C","text":"three"}
]`
parsed, err := ParseReader(strings.NewReader(input))
if err != nil {
t.Fatalf("parse failed: %v", err)
}
for index, segment := range parsed.Segments {
if segment.InputIndex != index {
t.Fatalf("segment %d input index = %d, want %d", index, segment.InputIndex, index)
}
}
}
func assertContains(t *testing.T, err error, fragment string) {
t.Helper()
if err == nil {
t.Fatalf("expected error containing %q", fragment)
}
if !strings.Contains(err.Error(), fragment) {
t.Fatalf("error = %q, want substring %q", err.Error(), fragment)
}
}

internal/trim/apply.go Normal file

@@ -0,0 +1,367 @@
package trim
import (
"fmt"
"gitea.maximumdirect.net/eric/seriatim/internal/model"
"gitea.maximumdirect.net/eric/seriatim/internal/overlap"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
// Mode controls how selector IDs are applied.
type Mode string
const (
ModeKeep Mode = "keep"
ModeRemove Mode = "remove"
)
// Options configures transcript trimming.
type Options struct {
Mode Mode
Selector Selector
AllowEmpty bool
}
// Result contains trimming output and ID mapping metadata.
type Result struct {
Transcript schema.Transcript
OldToNewID map[int]int
RemovedIDs []int
}
// IntermediateResult contains trimming output for intermediate schema artifacts.
type IntermediateResult struct {
Transcript schema.IntermediateTranscript
OldToNewID map[int]int
RemovedIDs []int
}
// MinimalResult contains trimming output for minimal schema artifacts.
type MinimalResult struct {
Transcript schema.MinimalTranscript
OldToNewID map[int]int
RemovedIDs []int
}
// Apply trims a full seriatim output transcript by segment ID.
func Apply(input schema.Transcript, opts Options) (Result, error) {
if err := validateMode(opts.Mode); err != nil {
return Result{}, err
}
selected := opts.Selector.IDs()
if len(selected) == 0 {
return Result{}, fmt.Errorf("selector cannot be empty")
}
inputIDs := make([]int, len(input.Segments))
for index, segment := range input.Segments {
inputIDs[index] = segment.ID
}
idIndex, err := validateInputIDs(inputIDs)
if err != nil {
return Result{}, err
}
if err := validateSelectedIDsExist(selected, idIndex); err != nil {
return Result{}, err
}
kept := make([]schema.Segment, 0, len(input.Segments))
removed := make([]int, 0, len(input.Segments))
oldToNew := make(map[int]int, len(input.Segments))
for _, segment := range input.Segments {
keep := opts.Mode == ModeKeep && opts.Selector.Contains(segment.ID)
if opts.Mode == ModeRemove {
keep = !opts.Selector.Contains(segment.ID)
}
if !keep {
removed = append(removed, segment.ID)
continue
}
rewritten := copySegment(segment)
rewritten.ID = len(kept) + 1
rewritten.OverlapGroupID = 0
kept = append(kept, rewritten)
oldToNew[segment.ID] = rewritten.ID
}
if len(kept) == 0 && !opts.AllowEmpty {
return Result{}, fmt.Errorf("trim operation produced an empty transcript; set AllowEmpty to true to permit this")
}
kept, groups := recomputeOverlapGroups(kept)
if groups == nil {
groups = make([]schema.OverlapGroup, 0)
}
out := copyTranscript(input)
out.Segments = kept
out.OverlapGroups = groups
return Result{
Transcript: out,
OldToNewID: oldToNew,
RemovedIDs: removed,
}, nil
}
// ApplyIntermediate trims an intermediate seriatim output transcript by
// segment ID.
func ApplyIntermediate(input schema.IntermediateTranscript, opts Options) (IntermediateResult, error) {
if err := validateMode(opts.Mode); err != nil {
return IntermediateResult{}, err
}
selected := opts.Selector.IDs()
if len(selected) == 0 {
return IntermediateResult{}, fmt.Errorf("selector cannot be empty")
}
inputIDs := make([]int, len(input.Segments))
for index, segment := range input.Segments {
inputIDs[index] = segment.ID
}
idIndex, err := validateInputIDs(inputIDs)
if err != nil {
return IntermediateResult{}, err
}
if err := validateSelectedIDsExist(selected, idIndex); err != nil {
return IntermediateResult{}, err
}
kept := make([]schema.IntermediateSegment, 0, len(input.Segments))
removed := make([]int, 0, len(input.Segments))
oldToNew := make(map[int]int, len(input.Segments))
for _, segment := range input.Segments {
keep := opts.Mode == ModeKeep && opts.Selector.Contains(segment.ID)
if opts.Mode == ModeRemove {
keep = !opts.Selector.Contains(segment.ID)
}
if !keep {
removed = append(removed, segment.ID)
continue
}
rewritten := schema.IntermediateSegment{
ID: len(kept) + 1,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
Categories: append([]string(nil), segment.Categories...),
}
kept = append(kept, rewritten)
oldToNew[segment.ID] = rewritten.ID
}
if len(kept) == 0 && !opts.AllowEmpty {
return IntermediateResult{}, fmt.Errorf("trim operation produced an empty transcript; set AllowEmpty to true to permit this")
}
return IntermediateResult{
Transcript: schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: input.Metadata.OutputSchema,
},
Segments: kept,
},
OldToNewID: oldToNew,
RemovedIDs: removed,
}, nil
}
// ApplyMinimal trims a minimal seriatim output transcript by segment ID.
func ApplyMinimal(input schema.MinimalTranscript, opts Options) (MinimalResult, error) {
if err := validateMode(opts.Mode); err != nil {
return MinimalResult{}, err
}
selected := opts.Selector.IDs()
if len(selected) == 0 {
return MinimalResult{}, fmt.Errorf("selector cannot be empty")
}
inputIDs := make([]int, len(input.Segments))
for index, segment := range input.Segments {
inputIDs[index] = segment.ID
}
idIndex, err := validateInputIDs(inputIDs)
if err != nil {
return MinimalResult{}, err
}
if err := validateSelectedIDsExist(selected, idIndex); err != nil {
return MinimalResult{}, err
}
kept := make([]schema.MinimalSegment, 0, len(input.Segments))
removed := make([]int, 0, len(input.Segments))
oldToNew := make(map[int]int, len(input.Segments))
for _, segment := range input.Segments {
keep := opts.Mode == ModeKeep && opts.Selector.Contains(segment.ID)
if opts.Mode == ModeRemove {
keep = !opts.Selector.Contains(segment.ID)
}
if !keep {
removed = append(removed, segment.ID)
continue
}
rewritten := schema.MinimalSegment{
ID: len(kept) + 1,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
}
kept = append(kept, rewritten)
oldToNew[segment.ID] = rewritten.ID
}
if len(kept) == 0 && !opts.AllowEmpty {
return MinimalResult{}, fmt.Errorf("trim operation produced an empty transcript; set AllowEmpty to true to permit this")
}
return MinimalResult{
Transcript: schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: input.Metadata.OutputSchema,
},
Segments: kept,
},
OldToNewID: oldToNew,
RemovedIDs: removed,
}, nil
}
func validateMode(mode Mode) error {
switch mode {
case ModeKeep, ModeRemove:
return nil
default:
return fmt.Errorf("invalid trim mode %q", mode)
}
}
func validateInputIDs(ids []int) (map[int]int, error) {
seen := make(map[int]int, len(ids))
for index, id := range ids {
if id <= 0 {
return nil, fmt.Errorf("input transcript has non-positive segment ID %d at index %d", id, index)
}
if firstIndex, exists := seen[id]; exists {
return nil, fmt.Errorf("input transcript has duplicate segment ID %d at indexes %d and %d", id, firstIndex, index)
}
seen[id] = index
}
for id := 1; id <= len(ids); id++ {
if _, exists := seen[id]; !exists {
return nil, fmt.Errorf("input transcript segment IDs must be sequential 1..%d; missing ID %d", len(ids), id)
}
}
return seen, nil
}
func validateSelectedIDsExist(selected []int, idIndex map[int]int) error {
for _, id := range selected {
if _, exists := idIndex[id]; !exists {
return fmt.Errorf("selected segment ID %d does not exist in input transcript", id)
}
}
return nil
}
func recomputeOverlapGroups(segments []schema.Segment) ([]schema.Segment, []schema.OverlapGroup) {
if len(segments) == 0 {
return segments, make([]schema.OverlapGroup, 0)
}
modelSegments := make([]model.Segment, len(segments))
for index, segment := range segments {
modelSegments[index] = model.Segment{
ID: segment.ID,
Source: segment.Source,
SourceSegmentIndex: copyIntPtr(segment.SourceSegmentIndex),
SourceRef: segment.SourceRef,
DerivedFrom: append([]string(nil), segment.DerivedFrom...),
Speaker: segment.Speaker,
Start: segment.Start,
End: segment.End,
Text: segment.Text,
Categories: append([]string(nil), segment.Categories...),
OverlapGroupID: segment.OverlapGroupID,
}
}
detected := overlap.Detect(model.MergedTranscript{
Segments: modelSegments,
})
rewrittenSegments := make([]schema.Segment, len(segments))
for index, segment := range segments {
rewritten := copySegment(segment)
rewritten.OverlapGroupID = detected.Segments[index].OverlapGroupID
rewrittenSegments[index] = rewritten
}
groups := make([]schema.OverlapGroup, len(detected.OverlapGroups))
for index, group := range detected.OverlapGroups {
groups[index] = schema.OverlapGroup{
ID: group.ID,
Start: group.Start,
End: group.End,
Segments: append([]string(nil), group.Segments...),
Speakers: append([]string(nil), group.Speakers...),
Class: group.Class,
Resolution: group.Resolution,
}
}
return rewrittenSegments, groups
}
func copyTranscript(input schema.Transcript) schema.Transcript {
return schema.Transcript{
Metadata: schema.Metadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
InputReader: input.Metadata.InputReader,
InputFiles: append([]string(nil), input.Metadata.InputFiles...),
PreprocessingModules: append([]string(nil), input.Metadata.PreprocessingModules...),
PostprocessingModules: append([]string(nil), input.Metadata.PostprocessingModules...),
OutputModules: append([]string(nil), input.Metadata.OutputModules...),
},
Segments: append([]schema.Segment(nil), input.Segments...),
OverlapGroups: append([]schema.OverlapGroup(nil), input.OverlapGroups...),
}
}
func copySegment(input schema.Segment) schema.Segment {
return schema.Segment{
ID: input.ID,
Source: input.Source,
SourceSegmentIndex: copyIntPtr(input.SourceSegmentIndex),
SourceRef: input.SourceRef,
DerivedFrom: append([]string(nil), input.DerivedFrom...),
Speaker: input.Speaker,
Start: input.Start,
End: input.End,
Text: input.Text,
Categories: append([]string(nil), input.Categories...),
OverlapGroupID: input.OverlapGroupID,
}
}
func copyIntPtr(value *int) *int {
if value == nil {
return nil
}
copied := *value
return &copied
}
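
Review note: a minimal sketch of trimming a full transcript with the ParseSelector/Apply pair defined above; the three-segment transcript literal is illustrative.

package main

import (
	"fmt"

	"gitea.maximumdirect.net/eric/seriatim/internal/trim"
	"gitea.maximumdirect.net/eric/seriatim/schema"
)

func main() {
	selector, err := trim.ParseSelector("2-3") // same range syntax as --keep/--remove
	if err != nil {
		panic(err)
	}
	input := schema.Transcript{
		Metadata: schema.Metadata{Application: "seriatim", Version: "v-test"},
		Segments: []schema.Segment{
			{ID: 1, Speaker: "Alice", Start: 0, End: 1, Text: "keep"},
			{ID: 2, Speaker: "Bob", Start: 1, End: 2, Text: "drop"},
			{ID: 3, Speaker: "Carol", Start: 2, End: 3, Text: "drop"},
		},
	}
	result, err := trim.Apply(input, trim.Options{Mode: trim.ModeRemove, Selector: selector})
	if err != nil {
		panic(err)
	}
	fmt.Println(result.RemovedIDs)                // [2 3]
	fmt.Println(result.Transcript.Segments[0].ID) // 1: retained segments renumber from 1
}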

internal/trim/apply_test.go Normal file

@@ -0,0 +1,668 @@
package trim
import (
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
func TestApplyKeepModeRenumbersFromOne(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "2,4")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(result.Transcript.Segments))
}
assertSegmentIDs(t, result.Transcript.Segments, []int{1, 2})
assertSegmentTexts(t, result.Transcript.Segments, []string{"beta", "delta"})
assertIntMap(t, result.OldToNewID, map[int]int{2: 1, 4: 2})
assertIntSlice(t, result.RemovedIDs, []int{1, 3})
}
func TestApplyRemoveModeRenumbersFromOne(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "2,4")
result, err := Apply(input, Options{
Mode: ModeRemove,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
assertSegmentIDs(t, result.Transcript.Segments, []int{1, 2})
assertSegmentTexts(t, result.Transcript.Segments, []string{"alpha", "gamma"})
assertIntMap(t, result.OldToNewID, map[int]int{1: 1, 3: 2})
assertIntSlice(t, result.RemovedIDs, []int{2, 4})
}
func TestApplySelectorOrderDoesNotChangeTranscriptOrder(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "4,1,3")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
assertSegmentIDs(t, result.Transcript.Segments, []int{1, 2, 3})
assertSegmentTexts(t, result.Transcript.Segments, []string{"alpha", "gamma", "delta"})
}
func TestApplyFailsWhenSelectedIDDoesNotExist(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "2,99")
_, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err == nil {
t.Fatal("expected missing selected ID error")
}
if !strings.Contains(err.Error(), "does not exist") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestApplyFailsOnDuplicateInputIDs(t *testing.T) {
input := fullTranscriptFixture()
input.Segments[2].ID = 2
selector := mustParseSelector(t, "2")
_, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err == nil {
t.Fatal("expected duplicate input ID error")
}
if !strings.Contains(err.Error(), "duplicate segment ID") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestApplyFailsOnMissingOrNonSequentialInputIDs(t *testing.T) {
input := fullTranscriptFixture()
input.Segments[1].ID = 5
selector := mustParseSelector(t, "1")
_, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err == nil {
t.Fatal("expected non-sequential input ID error")
}
if !strings.Contains(err.Error(), "must be sequential") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestApplyFailsOnNonPositiveInputIDs(t *testing.T) {
input := fullTranscriptFixture()
input.Segments[0].ID = 0
selector := mustParseSelector(t, "1")
_, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err == nil {
t.Fatal("expected non-positive input ID error")
}
if !strings.Contains(err.Error(), "non-positive") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestApplyEmptyOutputFailsUnlessAllowEmpty(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "1-4")
_, err := Apply(input, Options{
Mode: ModeRemove,
Selector: selector,
})
if err == nil {
t.Fatal("expected empty-output error")
}
if !strings.Contains(err.Error(), "empty transcript") {
t.Fatalf("unexpected error: %v", err)
}
allowed, err := Apply(input, Options{
Mode: ModeRemove,
Selector: selector,
AllowEmpty: true,
})
if err != nil {
t.Fatalf("apply with AllowEmpty failed: %v", err)
}
if len(allowed.Transcript.Segments) != 0 {
t.Fatalf("segment count = %d, want 0", len(allowed.Transcript.Segments))
}
assertIntMap(t, allowed.OldToNewID, map[int]int{})
assertIntSlice(t, allowed.RemovedIDs, []int{1, 2, 3, 4})
}
func TestApplyPreservesRetainedSegmentFieldsAndClearsOverlapIDs(t *testing.T) {
input := fullTranscriptFixture()
selector := mustParseSelector(t, "2")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(result.Transcript.Segments))
}
segment := result.Transcript.Segments[0]
if segment.ID != 1 {
t.Fatalf("segment ID = %d, want 1", segment.ID)
}
if segment.Source != "b.json" {
t.Fatalf("source = %q, want %q", segment.Source, "b.json")
}
if segment.SourceSegmentIndex == nil || *segment.SourceSegmentIndex != 20 {
t.Fatalf("source_segment_index = %v, want 20", segment.SourceSegmentIndex)
}
if segment.SourceRef != "b.json#20" {
t.Fatalf("source_ref = %q, want %q", segment.SourceRef, "b.json#20")
}
if !equalStringSlices(segment.DerivedFrom, []string{"b.json#19", "b.json#20"}) {
t.Fatalf("derived_from = %v, want %v", segment.DerivedFrom, []string{"b.json#19", "b.json#20"})
}
if !equalStringSlices(segment.Categories, []string{"filler", "backchannel"}) {
t.Fatalf("categories = %v, want %v", segment.Categories, []string{"filler", "backchannel"})
}
if segment.Speaker != "Bob" {
t.Fatalf("speaker = %q, want Bob", segment.Speaker)
}
if segment.Start != 2 || segment.End != 3 {
t.Fatalf("times = %.3f-%.3f, want 2.000-3.000", segment.Start, segment.End)
}
if segment.Text != "beta" {
t.Fatalf("text = %q, want beta", segment.Text)
}
if segment.OverlapGroupID != 0 {
t.Fatalf("overlap_group_id = %d, want 0", segment.OverlapGroupID)
}
if len(result.Transcript.OverlapGroups) != 0 {
t.Fatalf("overlap_groups count = %d, want 0", len(result.Transcript.OverlapGroups))
}
}
func TestApplyFullSchemaRemovesStaleOverlapGroups(t *testing.T) {
input := overlapTranscriptFixture()
selector := mustParseSelector(t, "1,3")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.OverlapGroups) != 0 {
t.Fatalf("overlap_groups count = %d, want 0", len(result.Transcript.OverlapGroups))
}
for index, segment := range result.Transcript.Segments {
if segment.OverlapGroupID != 0 {
t.Fatalf("segment %d overlap_group_id = %d, want 0", index, segment.OverlapGroupID)
}
}
}
func TestApplyFullSchemaRecomputesOverlapGroup(t *testing.T) {
input := overlapTranscriptFixture()
selector := mustParseSelector(t, "1,2")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
assertSegmentIDs(t, result.Transcript.Segments, []int{1, 2})
assertIntSlice(t, []int{
result.Transcript.Segments[0].OverlapGroupID,
result.Transcript.Segments[1].OverlapGroupID,
}, []int{1, 1})
if len(result.Transcript.OverlapGroups) != 1 {
t.Fatalf("overlap_groups count = %d, want 1", len(result.Transcript.OverlapGroups))
}
group := result.Transcript.OverlapGroups[0]
if group.ID != 1 {
t.Fatalf("group ID = %d, want 1", group.ID)
}
if group.Start != 1 || group.End != 4 {
t.Fatalf("group times = %.3f-%.3f, want 1.000-4.000", group.Start, group.End)
}
if !equalStringSlices(group.Segments, []string{"a.json#10", "b.json#20"}) {
t.Fatalf("group segments = %v, want %v", group.Segments, []string{"a.json#10", "b.json#20"})
}
if !equalStringSlices(group.Speakers, []string{"Alice", "Bob"}) {
t.Fatalf("group speakers = %v, want %v", group.Speakers, []string{"Alice", "Bob"})
}
}
func TestApplyFullSchemaDropsGroupWhenFewerThanTwoSpeakersRemain(t *testing.T) {
input := overlapTranscriptFixture()
selector := mustParseSelector(t, "1")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.OverlapGroups) != 0 {
t.Fatalf("overlap_groups count = %d, want 0", len(result.Transcript.OverlapGroups))
}
if len(result.Transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(result.Transcript.Segments))
}
if result.Transcript.Segments[0].OverlapGroupID != 0 {
t.Fatalf("segment overlap_group_id = %d, want 0", result.Transcript.Segments[0].OverlapGroupID)
}
}
func TestApplyFullSchemaHandlesTransitiveOverlaps(t *testing.T) {
input := transitiveOverlapFixture()
selector := mustParseSelector(t, "1-3")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.OverlapGroups) != 1 {
t.Fatalf("overlap_groups count = %d, want 1", len(result.Transcript.OverlapGroups))
}
assertIntSlice(t, []int{
result.Transcript.Segments[0].OverlapGroupID,
result.Transcript.Segments[1].OverlapGroupID,
result.Transcript.Segments[2].OverlapGroupID,
}, []int{1, 1, 1})
group := result.Transcript.OverlapGroups[0]
if group.Start != 10 || group.End != 15 {
t.Fatalf("group times = %.3f-%.3f, want 10.000-15.000", group.Start, group.End)
}
}
func TestApplyFullSchemaBoundaryTouchingNotGrouped(t *testing.T) {
input := boundaryFixture()
selector := mustParseSelector(t, "1-2")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if len(result.Transcript.OverlapGroups) != 0 {
t.Fatalf("overlap_groups count = %d, want 0", len(result.Transcript.OverlapGroups))
}
assertIntSlice(t, []int{
result.Transcript.Segments[0].OverlapGroupID,
result.Transcript.Segments[1].OverlapGroupID,
}, []int{0, 0})
}
func TestApplyIntermediateDoesNotIncludeOverlapGroups(t *testing.T) {
input := schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: "seriatim-intermediate",
},
Segments: []schema.IntermediateSegment{
{ID: 1, Start: 1, End: 3, Speaker: "Alice", Text: "alpha", Categories: []string{"word-run"}},
{ID: 2, Start: 2, End: 4, Speaker: "Bob", Text: "beta", Categories: []string{"filler"}},
},
}
selector := mustParseSelector(t, "1")
result, err := ApplyIntermediate(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply intermediate failed: %v", err)
}
if len(result.Transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(result.Transcript.Segments))
}
if result.Transcript.Segments[0].ID != 1 {
t.Fatalf("segment id = %d, want 1", result.Transcript.Segments[0].ID)
}
if err := schema.ValidateIntermediateTranscript(result.Transcript); err != nil {
t.Fatalf("intermediate output should remain valid: %v", err)
}
}
func TestApplyMinimalDoesNotIncludeOverlapGroups(t *testing.T) {
input := schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: "seriatim-minimal",
},
Segments: []schema.MinimalSegment{
{ID: 1, Start: 1, End: 3, Speaker: "Alice", Text: "alpha"},
{ID: 2, Start: 2, End: 4, Speaker: "Bob", Text: "beta"},
},
}
selector := mustParseSelector(t, "2")
result, err := ApplyMinimal(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply minimal failed: %v", err)
}
if len(result.Transcript.Segments) != 1 {
t.Fatalf("segment count = %d, want 1", len(result.Transcript.Segments))
}
if result.Transcript.Segments[0].ID != 1 {
t.Fatalf("segment id = %d, want 1", result.Transcript.Segments[0].ID)
}
if err := schema.ValidateMinimalTranscript(result.Transcript); err != nil {
t.Fatalf("minimal output should remain valid: %v", err)
}
}
func TestApplyOutputInvariantsValidAfterRenumberAndOverlapRecompute(t *testing.T) {
input := overlapTranscriptFixture()
selector := mustParseSelector(t, "2,1")
result, err := Apply(input, Options{
Mode: ModeKeep,
Selector: selector,
})
if err != nil {
t.Fatalf("apply failed: %v", err)
}
if err := schema.ValidateTranscript(result.Transcript); err != nil {
t.Fatalf("trim output should remain valid: %v", err)
}
}
func mustParseSelector(t *testing.T, value string) Selector {
t.Helper()
selector, err := ParseSelector(value)
if err != nil {
t.Fatalf("selector parse failed for %q: %v", value, err)
}
return selector
}
func fullTranscriptFixture() schema.Transcript {
firstIndex := 10
secondIndex := 20
thirdIndex := 30
fourthIndex := 40
return schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
InputReader: "json-files",
InputFiles: []string{"a.json", "b.json"},
PreprocessingModules: []string{"validate-raw"},
PostprocessingModules: []string{"detect-overlaps"},
OutputModules: []string{"json"},
},
Segments: []schema.Segment{
{
ID: 1,
Source: "a.json",
SourceSegmentIndex: &firstIndex,
SourceRef: "a.json#10",
DerivedFrom: []string{"a.json#10"},
Speaker: "Alice",
Start: 1,
End: 2,
Text: "alpha",
Categories: []string{"word-run"},
OverlapGroupID: 7,
},
{
ID: 2,
Source: "b.json",
SourceSegmentIndex: &secondIndex,
SourceRef: "b.json#20",
DerivedFrom: []string{"b.json#19", "b.json#20"},
Speaker: "Bob",
Start: 2,
End: 3,
Text: "beta",
Categories: []string{"filler", "backchannel"},
OverlapGroupID: 7,
},
{
ID: 3,
Source: "c.json",
SourceSegmentIndex: &thirdIndex,
SourceRef: "c.json#30",
DerivedFrom: []string{"c.json#30"},
Speaker: "Carol",
Start: 3,
End: 4,
Text: "gamma",
Categories: []string{"normal"},
OverlapGroupID: 8,
},
{
ID: 4,
Source: "d.json",
SourceSegmentIndex: &fourthIndex,
SourceRef: "d.json#40",
DerivedFrom: []string{"d.json#40"},
Speaker: "Dan",
Start: 4,
End: 5,
Text: "delta",
Categories: []string{"normal"},
OverlapGroupID: 9,
},
},
OverlapGroups: []schema.OverlapGroup{
{
ID: 7,
Start: 1.5,
End: 3.1,
Segments: []string{"a.json#10", "b.json#20"},
Speakers: []string{"Alice", "Bob"},
Class: "unknown",
Resolution: "unresolved",
},
},
}
}
func overlapTranscriptFixture() schema.Transcript {
first := 10
second := 20
third := 30
return schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
InputReader: "json-files",
InputFiles: []string{"a.json", "b.json", "c.json"},
PreprocessingModules: []string{"validate-raw"},
PostprocessingModules: []string{"detect-overlaps"},
OutputModules: []string{"json"},
},
Segments: []schema.Segment{
{
ID: 1,
Source: "a.json",
SourceSegmentIndex: &first,
SourceRef: "a.json#10",
Speaker: "Alice",
Start: 1,
End: 4,
Text: "a",
OverlapGroupID: 99,
},
{
ID: 2,
Source: "b.json",
SourceSegmentIndex: &second,
SourceRef: "b.json#20",
Speaker: "Bob",
Start: 2,
End: 3,
Text: "b",
OverlapGroupID: 99,
},
{
ID: 3,
Source: "c.json",
SourceSegmentIndex: &third,
SourceRef: "c.json#30",
Speaker: "Carol",
Start: 10,
End: 11,
Text: "c",
OverlapGroupID: 100,
},
},
OverlapGroups: []schema.OverlapGroup{
{
ID: 99,
Start: 0,
End: 100,
Segments: []string{"stale#1", "stale#2"},
Speakers: []string{"stale"},
Class: "unknown",
Resolution: "unresolved",
},
},
}
}
func transitiveOverlapFixture() schema.Transcript {
one := 1
two := 2
three := 3
return schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &one, Speaker: "Alice", Start: 10, End: 14, Text: "a"},
{ID: 2, Source: "b.json", SourceSegmentIndex: &two, Speaker: "Bob", Start: 12, End: 13, Text: "b"},
{ID: 3, Source: "c.json", SourceSegmentIndex: &three, Speaker: "Carol", Start: 13.5, End: 15, Text: "c"},
},
OverlapGroups: []schema.OverlapGroup{{ID: 77}},
}
}
func boundaryFixture() schema.Transcript {
one := 1
two := 2
return schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &one, Speaker: "Alice", Start: 1, End: 2, Text: "a", OverlapGroupID: 7},
{ID: 2, Source: "b.json", SourceSegmentIndex: &two, Speaker: "Bob", Start: 2, End: 3, Text: "b", OverlapGroupID: 7},
},
OverlapGroups: []schema.OverlapGroup{{ID: 7, Start: 1, End: 3}},
}
}
func assertSegmentIDs(t *testing.T, segments []schema.Segment, want []int) {
t.Helper()
got := make([]int, len(segments))
for index, segment := range segments {
got[index] = segment.ID
}
assertIntSlice(t, got, want)
}
func assertSegmentTexts(t *testing.T, segments []schema.Segment, want []string) {
t.Helper()
got := make([]string, len(segments))
for index, segment := range segments {
got[index] = segment.Text
}
if !equalStringSlices(got, want) {
t.Fatalf("segment texts = %v, want %v", got, want)
}
}
func assertIntSlice(t *testing.T, got []int, want []int) {
t.Helper()
if len(got) != len(want) {
t.Fatalf("slice length = %d, want %d", len(got), len(want))
}
for index := range got {
if got[index] != want[index] {
t.Fatalf("slice[%d] = %d, want %d (full got=%v, want=%v)", index, got[index], want[index], got, want)
}
}
}
func assertIntMap(t *testing.T, got map[int]int, want map[int]int) {
t.Helper()
if len(got) != len(want) {
t.Fatalf("map length = %d, want %d", len(got), len(want))
}
for key, wantValue := range want {
gotValue, exists := got[key]
if !exists {
t.Fatalf("missing map key %d", key)
}
if gotValue != wantValue {
t.Fatalf("map[%d] = %d, want %d", key, gotValue, wantValue)
}
}
}
func equalStringSlices(got []string, want []string) bool {
if len(got) != len(want) {
return false
}
for index := range got {
if got[index] != want[index] {
return false
}
}
return true
}

internal/trim/artifact.go Normal file

@@ -0,0 +1,396 @@
package trim
import (
"encoding/json"
"fmt"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
const (
SchemaMinimal = "seriatim-minimal"
SchemaIntermediate = "seriatim-intermediate"
SchemaFull = "seriatim-full"
)
// Artifact stores a parsed seriatim output artifact of one supported schema.
type Artifact struct {
Schema string
Full *schema.Transcript
Intermediate *schema.IntermediateTranscript
Minimal *schema.MinimalTranscript
}
// ApplyArtifactResult contains trimmed artifact output and ID mapping metadata.
type ApplyArtifactResult struct {
Artifact Artifact
OldToNewID map[int]int
RemovedIDs []int
OverlapGroupsRecomputed bool
}
// ParseArtifactJSON parses and validates a serialized seriatim output artifact.
func ParseArtifactJSON(data []byte) (Artifact, error) {
var decoded any
if err := json.Unmarshal(data, &decoded); err != nil {
return Artifact{}, fmt.Errorf("input JSON is malformed: %w", err)
}
var full schema.Transcript
if err := json.Unmarshal(data, &full); err == nil {
if err := schema.ValidateTranscript(full); err == nil {
return Artifact{
Schema: SchemaFull,
Full: &full,
}, nil
}
}
var intermediate schema.IntermediateTranscript
if err := json.Unmarshal(data, &intermediate); err == nil {
if err := schema.ValidateIntermediateTranscript(intermediate); err == nil {
return Artifact{
Schema: SchemaIntermediate,
Intermediate: &intermediate,
}, nil
}
}
var minimal schema.MinimalTranscript
if err := json.Unmarshal(data, &minimal); err == nil {
if err := schema.ValidateMinimalTranscript(minimal); err == nil {
return Artifact{
Schema: SchemaMinimal,
Minimal: &minimal,
}, nil
}
}
return Artifact{}, fmt.Errorf("input JSON is not a valid seriatim output artifact")
}
// ValidateArtifact validates an artifact against its declared schema.
func ValidateArtifact(artifact Artifact) error {
switch artifact.Schema {
case SchemaFull:
if artifact.Full == nil {
return fmt.Errorf("full artifact payload is missing")
}
return schema.ValidateTranscript(*artifact.Full)
case SchemaIntermediate:
if artifact.Intermediate == nil {
return fmt.Errorf("intermediate artifact payload is missing")
}
return schema.ValidateIntermediateTranscript(*artifact.Intermediate)
case SchemaMinimal:
if artifact.Minimal == nil {
return fmt.Errorf("minimal artifact payload is missing")
}
return schema.ValidateMinimalTranscript(*artifact.Minimal)
default:
return fmt.Errorf("unsupported artifact schema %q", artifact.Schema)
}
}
// Value returns the artifact value for JSON serialization.
func (artifact Artifact) Value() any {
switch artifact.Schema {
case SchemaFull:
if artifact.Full == nil {
return schema.Transcript{}
}
return *artifact.Full
case SchemaIntermediate:
if artifact.Intermediate == nil {
return schema.IntermediateTranscript{}
}
return *artifact.Intermediate
case SchemaMinimal:
if artifact.Minimal == nil {
return schema.MinimalTranscript{}
}
return *artifact.Minimal
default:
return nil
}
}
// SegmentCount returns the number of segments in the artifact.
func (artifact Artifact) SegmentCount() int {
switch artifact.Schema {
case SchemaFull:
if artifact.Full == nil {
return 0
}
return len(artifact.Full.Segments)
case SchemaIntermediate:
if artifact.Intermediate == nil {
return 0
}
return len(artifact.Intermediate.Segments)
case SchemaMinimal:
if artifact.Minimal == nil {
return 0
}
return len(artifact.Minimal.Segments)
default:
return 0
}
}
// Application returns artifact metadata application name.
func (artifact Artifact) Application() string {
switch artifact.Schema {
case SchemaFull:
if artifact.Full == nil {
return ""
}
return artifact.Full.Metadata.Application
case SchemaIntermediate:
if artifact.Intermediate == nil {
return ""
}
return artifact.Intermediate.Metadata.Application
case SchemaMinimal:
if artifact.Minimal == nil {
return ""
}
return artifact.Minimal.Metadata.Application
default:
return ""
}
}
// Version returns artifact metadata version.
func (artifact Artifact) Version() string {
switch artifact.Schema {
case SchemaFull:
if artifact.Full == nil {
return ""
}
return artifact.Full.Metadata.Version
case SchemaIntermediate:
if artifact.Intermediate == nil {
return ""
}
return artifact.Intermediate.Metadata.Version
case SchemaMinimal:
if artifact.Minimal == nil {
return ""
}
return artifact.Minimal.Metadata.Version
default:
return ""
}
}
// ApplyArtifact trims a parsed artifact while preserving its input schema.
func ApplyArtifact(input Artifact, opts Options) (ApplyArtifactResult, error) {
switch input.Schema {
case SchemaFull:
if input.Full == nil {
return ApplyArtifactResult{}, fmt.Errorf("full artifact payload is missing")
}
result, err := Apply(*input.Full, opts)
if err != nil {
return ApplyArtifactResult{}, err
}
out := result.Transcript
return ApplyArtifactResult{
Artifact: Artifact{
Schema: SchemaFull,
Full: &out,
},
OldToNewID: result.OldToNewID,
RemovedIDs: result.RemovedIDs,
OverlapGroupsRecomputed: true,
}, nil
case SchemaIntermediate:
if input.Intermediate == nil {
return ApplyArtifactResult{}, fmt.Errorf("intermediate artifact payload is missing")
}
result, err := ApplyIntermediate(*input.Intermediate, opts)
if err != nil {
return ApplyArtifactResult{}, err
}
out := result.Transcript
return ApplyArtifactResult{
Artifact: Artifact{
Schema: SchemaIntermediate,
Intermediate: &out,
},
OldToNewID: result.OldToNewID,
RemovedIDs: result.RemovedIDs,
OverlapGroupsRecomputed: false,
}, nil
case SchemaMinimal:
if input.Minimal == nil {
return ApplyArtifactResult{}, fmt.Errorf("minimal artifact payload is missing")
}
result, err := ApplyMinimal(*input.Minimal, opts)
if err != nil {
return ApplyArtifactResult{}, err
}
out := result.Transcript
return ApplyArtifactResult{
Artifact: Artifact{
Schema: SchemaMinimal,
Minimal: &out,
},
OldToNewID: result.OldToNewID,
RemovedIDs: result.RemovedIDs,
OverlapGroupsRecomputed: false,
}, nil
default:
return ApplyArtifactResult{}, fmt.Errorf("unsupported artifact schema %q", input.Schema)
}
}
// ConvertArtifact converts a parsed artifact to another supported output schema.
func ConvertArtifact(input Artifact, outputSchema string) (Artifact, error) {
if outputSchema == "" || outputSchema == input.Schema {
return input, nil
}
switch input.Schema {
case SchemaFull:
if input.Full == nil {
return Artifact{}, fmt.Errorf("full artifact payload is missing")
}
switch outputSchema {
case SchemaIntermediate:
out := intermediateFromFull(*input.Full)
return Artifact{
Schema: SchemaIntermediate,
Intermediate: &out,
}, nil
case SchemaMinimal:
out := minimalFromFull(*input.Full)
return Artifact{
Schema: SchemaMinimal,
Minimal: &out,
}, nil
default:
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
}
case SchemaIntermediate:
if input.Intermediate == nil {
return Artifact{}, fmt.Errorf("intermediate artifact payload is missing")
}
switch outputSchema {
case SchemaMinimal:
out := minimalFromIntermediate(*input.Intermediate)
return Artifact{
Schema: SchemaMinimal,
Minimal: &out,
}, nil
case SchemaFull:
return Artifact{}, fmt.Errorf("cannot emit %q from %q input artifact", SchemaFull, SchemaIntermediate)
default:
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
}
case SchemaMinimal:
if input.Minimal == nil {
return Artifact{}, fmt.Errorf("minimal artifact payload is missing")
}
switch outputSchema {
case SchemaIntermediate:
out := intermediateFromMinimal(*input.Minimal)
return Artifact{
Schema: SchemaIntermediate,
Intermediate: &out,
}, nil
case SchemaFull:
return Artifact{}, fmt.Errorf("cannot emit %q from %q input artifact", SchemaFull, SchemaMinimal)
default:
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
}
default:
return Artifact{}, fmt.Errorf("unsupported input schema %q", input.Schema)
}
}
func intermediateFromFull(input schema.Transcript) schema.IntermediateTranscript {
segments := make([]schema.IntermediateSegment, len(input.Segments))
for index, segment := range input.Segments {
segments[index] = schema.IntermediateSegment{
ID: segment.ID,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
Categories: append([]string(nil), segment.Categories...),
}
}
return schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: SchemaIntermediate,
},
Segments: segments,
}
}
func minimalFromFull(input schema.Transcript) schema.MinimalTranscript {
segments := make([]schema.MinimalSegment, len(input.Segments))
for index, segment := range input.Segments {
segments[index] = schema.MinimalSegment{
ID: segment.ID,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
}
}
return schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: SchemaMinimal,
},
Segments: segments,
}
}
func minimalFromIntermediate(input schema.IntermediateTranscript) schema.MinimalTranscript {
segments := make([]schema.MinimalSegment, len(input.Segments))
for index, segment := range input.Segments {
segments[index] = schema.MinimalSegment{
ID: segment.ID,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
}
}
return schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: SchemaMinimal,
},
Segments: segments,
}
}
func intermediateFromMinimal(input schema.MinimalTranscript) schema.IntermediateTranscript {
segments := make([]schema.IntermediateSegment, len(input.Segments))
for index, segment := range input.Segments {
segments[index] = schema.IntermediateSegment{
ID: segment.ID,
Start: segment.Start,
End: segment.End,
Speaker: segment.Speaker,
Text: segment.Text,
}
}
return schema.IntermediateTranscript{
Metadata: schema.IntermediateMetadata{
Application: input.Metadata.Application,
Version: input.Metadata.Version,
OutputSchema: SchemaIntermediate,
},
Segments: segments,
}
}
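
These converters only move down the schema ladder (full to intermediate to minimal): sources, words, and overlap groups cannot be reconstructed, which is why ConvertArtifact refuses to emit the full schema from either reduced input. A minimal usage sketch, assuming it runs inside the seriatim module (internal packages are not importable from elsewhere) and that a merged artifact exists at merged.json:

package main

import (
	"fmt"
	"log"
	"os"

	"gitea.maximumdirect.net/eric/seriatim/internal/trim"
)

func main() {
	data, err := os.ReadFile("merged.json")
	if err != nil {
		log.Fatalf("read: %v", err)
	}
	// Detect and parse the input artifact into its typed payload.
	artifact, err := trim.ParseArtifactJSON(data)
	if err != nil {
		log.Fatalf("parse: %v", err)
	}
	// Downgrade to the minimal schema; upgrading would fail with "cannot emit".
	minimal, err := trim.ConvertArtifact(artifact, trim.SchemaMinimal)
	if err != nil {
		log.Fatalf("convert: %v", err)
	}
	fmt.Printf("%d segments from %s %s\n",
		minimal.SegmentCount(), minimal.Application(), minimal.Version())
}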


@@ -0,0 +1,138 @@
package trim
import (
"encoding/json"
"strings"
"testing"
"gitea.maximumdirect.net/eric/seriatim/schema"
)
func TestParseArtifactJSONRejectsMalformedJSON(t *testing.T) {
_, err := ParseArtifactJSON([]byte(`{"metadata":`))
if err == nil {
t.Fatal("expected malformed JSON error")
}
if !strings.Contains(err.Error(), "input JSON is malformed") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestParseArtifactJSONRejectsDuplicateSegmentIDs(t *testing.T) {
first := 10
second := 20
value := schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &first, Speaker: "A", Start: 1, End: 2, Text: "one"},
{ID: 1, Source: "a.json", SourceSegmentIndex: &second, Speaker: "B", Start: 2, End: 3, Text: "two"},
},
OverlapGroups: []schema.OverlapGroup{},
}
data := mustMarshalJSON(t, value)
_, err := ParseArtifactJSON(data)
if err == nil {
t.Fatal("expected invalid artifact error")
}
if !strings.Contains(err.Error(), "not a valid seriatim output artifact") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestParseArtifactJSONRejectsNonSequentialSegmentIDs(t *testing.T) {
first := 10
second := 20
value := schema.Transcript{
Metadata: schema.Metadata{
Application: "seriatim",
Version: "v-test",
},
Segments: []schema.Segment{
{ID: 1, Source: "a.json", SourceSegmentIndex: &first, Speaker: "A", Start: 1, End: 2, Text: "one"},
{ID: 3, Source: "a.json", SourceSegmentIndex: &second, Speaker: "B", Start: 2, End: 3, Text: "two"},
},
OverlapGroups: []schema.OverlapGroup{},
}
data := mustMarshalJSON(t, value)
_, err := ParseArtifactJSON(data)
if err == nil {
t.Fatal("expected invalid artifact error")
}
if !strings.Contains(err.Error(), "not a valid seriatim output artifact") {
t.Fatalf("unexpected error: %v", err)
}
}
func TestConvertArtifactMinimalToIntermediate(t *testing.T) {
value := schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: SchemaMinimal,
},
Segments: []schema.MinimalSegment{
{ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one"},
{ID: 2, Start: 2, End: 3, Speaker: "B", Text: "two"},
},
}
artifact := Artifact{
Schema: SchemaMinimal,
Minimal: &value,
}
converted, err := ConvertArtifact(artifact, SchemaIntermediate)
if err != nil {
t.Fatalf("convert failed: %v", err)
}
if converted.Schema != SchemaIntermediate {
t.Fatalf("schema = %q, want %q", converted.Schema, SchemaIntermediate)
}
if converted.Intermediate == nil {
t.Fatal("expected intermediate artifact")
}
if len(converted.Intermediate.Segments) != 2 {
t.Fatalf("segment count = %d, want 2", len(converted.Intermediate.Segments))
}
if converted.Intermediate.Segments[0].ID != 1 || converted.Intermediate.Segments[1].ID != 2 {
t.Fatalf("unexpected IDs: %#v", converted.Intermediate.Segments)
}
}
func TestConvertArtifactMinimalToFullFails(t *testing.T) {
value := schema.MinimalTranscript{
Metadata: schema.MinimalMetadata{
Application: "seriatim",
Version: "v-test",
OutputSchema: SchemaMinimal,
},
Segments: []schema.MinimalSegment{
{ID: 1, Start: 1, End: 2, Speaker: "A", Text: "one"},
},
}
artifact := Artifact{
Schema: SchemaMinimal,
Minimal: &value,
}
_, err := ConvertArtifact(artifact, SchemaFull)
if err == nil {
t.Fatal("expected conversion error")
}
if !strings.Contains(err.Error(), "cannot emit") {
t.Fatalf("unexpected error: %v", err)
}
}
func mustMarshalJSON(t *testing.T, value any) []byte {
t.Helper()
data, err := json.Marshal(value)
if err != nil {
t.Fatalf("marshal: %v", err)
}
return data
}
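
For contrast with the rejection cases above, a sketch of the accept path using the same helpers. That this metadata alone satisfies the parser's full-schema validation is an assumption of the sketch; the consecutive ID sequence 1, 2 is the part the rejection tests pin down:

func TestParseArtifactJSONAcceptsSequentialIDs(t *testing.T) {
	first := 10
	second := 20
	value := schema.Transcript{
		Metadata: schema.Metadata{
			Application: "seriatim",
			Version:     "v-test",
		},
		Segments: []schema.Segment{
			{ID: 1, Source: "a.json", SourceSegmentIndex: &first, Speaker: "A", Start: 1, End: 2, Text: "one"},
			{ID: 2, Source: "a.json", SourceSegmentIndex: &second, Speaker: "B", Start: 2, End: 3, Text: "two"},
		},
		OverlapGroups: []schema.OverlapGroup{},
	}
	data := mustMarshalJSON(t, value)
	artifact, err := ParseArtifactJSON(data)
	if err != nil {
		t.Fatalf("expected valid artifact, got: %v", err)
	}
	if artifact.SegmentCount() != 2 {
		t.Fatalf("segment count = %d, want 2", artifact.SegmentCount())
	}
}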

internal/trim/selector.go

@@ -0,0 +1,156 @@
package trim
import (
"fmt"
"regexp"
"sort"
"strconv"
"strings"
)
var selectorElementPattern = regexp.MustCompile(`^([+-]?\d+)(?:\s*-\s*([+-]?\d+))?$`)
// Selector represents a normalized union of segment IDs.
type Selector struct {
ranges []idRange
}
type idRange struct {
start int
end int
}
// ParseSelector parses an inline segment selector expression.
func ParseSelector(input string) (Selector, error) {
if strings.TrimSpace(input) == "" {
return Selector{}, fmt.Errorf("selector cannot be empty")
}
parts := strings.Split(input, ",")
ranges := make([]idRange, 0, len(parts))
for index, raw := range parts {
element := strings.TrimSpace(raw)
if element == "" {
return Selector{}, fmt.Errorf("selector element %d cannot be empty", index+1)
}
rangeValue, err := parseElement(element)
if err != nil {
return Selector{}, fmt.Errorf("selector element %d %q: %w", index+1, element, err)
}
ranges = append(ranges, rangeValue)
}
normalized := normalizeRanges(ranges)
if len(normalized) == 0 {
return Selector{}, fmt.Errorf("selector cannot be empty")
}
return Selector{ranges: normalized}, nil
}
// Contains reports whether id is included in this selector.
func (s Selector) Contains(id int) bool {
if id <= 0 {
return false
}
index := sort.Search(len(s.ranges), func(i int) bool {
return s.ranges[i].end >= id
})
if index == len(s.ranges) {
return false
}
rangeValue := s.ranges[index]
return id >= rangeValue.start && id <= rangeValue.end
}
// IDs returns a deterministic ascending list of unique segment IDs.
func (s Selector) IDs() []int {
total := 0
for _, rangeValue := range s.ranges {
total += rangeValue.end - rangeValue.start + 1
}
ids := make([]int, 0, total)
for _, rangeValue := range s.ranges {
for id := rangeValue.start; id <= rangeValue.end; id++ {
ids = append(ids, id)
}
}
return ids
}
func parseElement(element string) (idRange, error) {
matches := selectorElementPattern.FindStringSubmatch(element)
if matches == nil {
return idRange{}, fmt.Errorf("malformed element")
}
start, err := parseID(matches[1])
if err != nil {
return idRange{}, err
}
if matches[2] == "" {
return idRange{start: start, end: start}, nil
}
end, err := parseID(matches[2])
if err != nil {
return idRange{}, fmt.Errorf("invalid range end: %w", err)
}
if start > end {
return idRange{}, fmt.Errorf("descending range %d-%d is invalid", start, end)
}
return idRange{start: start, end: end}, nil
}
func parseID(value string) (int, error) {
value = strings.TrimSpace(value)
if value == "" {
return 0, fmt.Errorf("missing segment ID")
}
id, err := strconv.Atoi(value)
if err != nil {
return 0, fmt.Errorf("segment ID must be an integer")
}
if id <= 0 {
return 0, fmt.Errorf("segment ID must be positive")
}
return id, nil
}
func normalizeRanges(in []idRange) []idRange {
if len(in) == 0 {
return nil
}
sorted := make([]idRange, len(in))
copy(sorted, in)
sort.Slice(sorted, func(i, j int) bool {
if sorted[i].start == sorted[j].start {
return sorted[i].end < sorted[j].end
}
return sorted[i].start < sorted[j].start
})
merged := make([]idRange, 0, len(sorted))
for _, next := range sorted {
if len(merged) == 0 {
merged = append(merged, next)
continue
}
last := &merged[len(merged)-1]
if next.start <= last.end+1 {
if next.end > last.end {
last.end = next.end
}
continue
}
merged = append(merged, next)
}
return merged
}
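
A short usage sketch of the selector API defined above. The printed values follow directly from normalizeRanges, which coalesces overlapping and adjacent elements; the import path assumes execution inside the seriatim module:

package main

import (
	"fmt"
	"log"

	"gitea.maximumdirect.net/eric/seriatim/internal/trim"
)

func main() {
	// "1-3,8,10-12" normalizes to three disjoint ranges; "1-3,4" would
	// coalesce into the single range 1-4 because adjacent ranges merge.
	selector, err := trim.ParseSelector("1-3,8,10-12")
	if err != nil {
		log.Fatalf("parse selector: %v", err)
	}
	fmt.Println(selector.IDs())        // [1 2 3 8 10 11 12]
	fmt.Println(selector.Contains(9))  // false
	fmt.Println(selector.Contains(11)) // true
}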


@@ -0,0 +1,127 @@
package trim
import (
"strings"
"testing"
)
func TestParseSelectorSingleID(t *testing.T) {
selector, err := ParseSelector("1")
if err != nil {
t.Fatalf("parse failed: %v", err)
}
assertIDs(t, selector, []int{1})
assertContains(t, selector, map[int]bool{1: true, 2: false, 0: false, -1: false})
}
func TestParseSelectorInclusiveRange(t *testing.T) {
selector, err := ParseSelector("1-3")
if err != nil {
t.Fatalf("parse failed: %v", err)
}
assertIDs(t, selector, []int{1, 2, 3})
}
func TestParseSelectorCommaSeparatedCombination(t *testing.T) {
selector, err := ParseSelector("1-3,8,10-12")
if err != nil {
t.Fatalf("parse failed: %v", err)
}
assertIDs(t, selector, []int{1, 2, 3, 8, 10, 11, 12})
}
func TestParseSelectorWhitespaceTolerance(t *testing.T) {
selector, err := ParseSelector(" 1 - 3 , 8 , 10 - 12 ")
if err != nil {
t.Fatalf("parse failed: %v", err)
}
assertIDs(t, selector, []int{1, 2, 3, 8, 10, 11, 12})
}
func TestParseSelectorDuplicatesAndOverlapsNormalizeUnion(t *testing.T) {
selector, err := ParseSelector("1-4,2,4,3-6,6")
if err != nil {
t.Fatalf("parse failed: %v", err)
}
assertIDs(t, selector, []int{1, 2, 3, 4, 5, 6})
assertContains(t, selector, map[int]bool{1: true, 5: true, 6: true, 7: false})
}
func TestParseSelectorDeterministicNormalizedOutput(t *testing.T) {
left, err := ParseSelector("8,1-3,2,10-12")
if err != nil {
t.Fatalf("parse left failed: %v", err)
}
right, err := ParseSelector("10-12,3,2,1,8")
if err != nil {
t.Fatalf("parse right failed: %v", err)
}
leftIDs := left.IDs()
rightIDs := right.IDs()
if !equalInts(leftIDs, rightIDs) {
t.Fatalf("normalized IDs mismatch: %v vs %v", leftIDs, rightIDs)
}
}
func TestParseSelectorFailures(t *testing.T) {
tests := []struct {
name string
selector string
wantError string
}{
{name: "empty", selector: "", wantError: "cannot be empty"},
{name: "whitespace only", selector: " ", wantError: "cannot be empty"},
{name: "zero", selector: "0", wantError: "must be positive"},
{name: "negative", selector: "-1", wantError: "must be positive"},
{name: "range includes zero", selector: "0-2", wantError: "must be positive"},
{name: "descending range", selector: "10-1", wantError: "descending range"},
{name: "empty element", selector: "1,,2", wantError: "cannot be empty"},
{name: "trailing comma", selector: "1,", wantError: "cannot be empty"},
{name: "malformed alpha", selector: "abc", wantError: "malformed element"},
{name: "malformed range", selector: "1-2-3", wantError: "malformed element"},
{name: "missing end", selector: "1-", wantError: "malformed element"},
{name: "missing start", selector: "-2", wantError: "must be positive"},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
_, err := ParseSelector(test.selector)
if err == nil {
t.Fatalf("expected error for %q", test.selector)
}
if !strings.Contains(err.Error(), test.wantError) {
t.Fatalf("error = %q, want substring %q", err.Error(), test.wantError)
}
})
}
}
func assertIDs(t *testing.T, selector Selector, want []int) {
t.Helper()
got := selector.IDs()
if !equalInts(got, want) {
t.Fatalf("IDs = %v, want %v", got, want)
}
}
func assertContains(t *testing.T, selector Selector, checks map[int]bool) {
t.Helper()
for id, want := range checks {
if got := selector.Contains(id); got != want {
t.Fatalf("Contains(%d) = %t, want %t", id, got, want)
}
}
}
func equalInts(left []int, right []int) bool {
if len(left) != len(right) {
return false
}
for index := range left {
if left[index] != right[index] {
return false
}
}
return true
}

schema/full-output.schema.json

@@ -1,6 +1,6 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
-  "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/output.schema.json",
+  "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/full-output.schema.json",
   "title": "seriatim full output transcript",
   "type": "object",
   "additionalProperties": false,

schema/intermediate-output.schema.json

@@ -1,7 +1,7 @@
 {
   "$schema": "https://json-schema.org/draft/2020-12/schema",
-  "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/default-output.schema.json",
-  "title": "seriatim default output transcript",
+  "$id": "https://gitea.maximumdirect.net/eric/seriatim/schema/intermediate-output.schema.json",
+  "title": "seriatim intermediate output transcript",
   "type": "object",
   "additionalProperties": false,
   "required": ["metadata", "segments"],
@@ -13,7 +13,7 @@
     "properties": {
       "application": { "type": "string" },
       "version": { "type": "string" },
-      "output_schema": { "type": "string", "const": "default" }
+      "output_schema": { "type": "string", "const": "seriatim-intermediate" }
     }
   },
   "segments": {

schema/minimal-output.schema.json

@@ -13,7 +13,7 @@
     "properties": {
       "application": { "type": "string" },
       "version": { "type": "string" },
-      "output_schema": { "type": "string", "const": "minimal" }
+      "output_schema": { "type": "string", "const": "seriatim-minimal" }
     }
   },
   "segments": {


@@ -14,8 +14,8 @@ import (
 var schemaFS embed.FS

 const (
-	outputSchemaPath        = "output.schema.json"
-	defaultOutputSchemaPath = "default-output.schema.json"
+	fullOutputSchemaPath         = "full-output.schema.json"
+	intermediateOutputSchemaPath = "intermediate-output.schema.json"
 	minimalOutputSchemaPath = "minimal-output.schema.json"
 )
@@ -32,10 +32,10 @@ type Transcript struct {
 	OverlapGroups []OverlapGroup `json:"overlap_groups"`
 }

-// DefaultTranscript is seriatim's default public JSON output contract.
-type DefaultTranscript struct {
-	Metadata DefaultMetadata  `json:"metadata"`
-	Segments []DefaultSegment `json:"segments"`
+// IntermediateTranscript is seriatim's intermediate public JSON output contract.
+type IntermediateTranscript struct {
+	Metadata IntermediateMetadata  `json:"metadata"`
+	Segments []IntermediateSegment `json:"segments"`
 }

 // MinimalTranscript is seriatim's compact public JSON output contract.
@@ -55,8 +55,8 @@ type Metadata struct {
 	OutputModules []string `json:"output_modules"`
 }

-// DefaultMetadata records default artifact identity.
-type DefaultMetadata struct {
+// IntermediateMetadata records intermediate artifact identity.
+type IntermediateMetadata struct {
 	Application  string `json:"application"`
 	Version      string `json:"version"`
 	OutputSchema string `json:"output_schema"`
@@ -84,9 +84,9 @@ type Segment struct {
 	OverlapGroupID int `json:"overlap_group_id,omitempty"`
 }

-// DefaultSegment is the compact public transcript segment shape with
+// IntermediateSegment is the compact public transcript segment shape with
 // categories.
-type DefaultSegment struct {
+type IntermediateSegment struct {
 	ID    int     `json:"id"`
 	Start float64 `json:"start"`
 	End   float64 `json:"end"`
@@ -115,7 +115,7 @@ type OverlapGroup struct {
 	Resolution string `json:"resolution"`
 }

-// ValidateTranscript validates a typed transcript against the public JSON
+// ValidateTranscript validates a full transcript against the public JSON
 // schema and seriatim-specific semantic rules.
 func ValidateTranscript(transcript Transcript) error {
 	if err := validateSemantics(transcript); err != nil {
@@ -129,18 +129,18 @@ func ValidateTranscript(transcript Transcript) error {
 	return ValidateJSON(data)
 }

-// ValidateDefaultTranscript validates the default transcript against the
-// default JSON schema and seriatim-specific semantic rules.
-func ValidateDefaultTranscript(transcript DefaultTranscript) error {
-	if err := validateDefaultSemantics(transcript); err != nil {
+// ValidateIntermediateTranscript validates the intermediate transcript against
+// the intermediate JSON schema and seriatim-specific semantic rules.
+func ValidateIntermediateTranscript(transcript IntermediateTranscript) error {
+	if err := validateIntermediateSemantics(transcript); err != nil {
 		return err
 	}

 	data, err := json.Marshal(transcript)
 	if err != nil {
-		return fmt.Errorf("marshal default transcript for schema validation: %w", err)
+		return fmt.Errorf("marshal intermediate transcript for schema validation: %w", err)
 	}

-	return ValidateDefaultJSON(data)
+	return ValidateIntermediateJSON(data)
 }

 // ValidateMinimalTranscript validates a minimal transcript against the minimal
@@ -159,13 +159,13 @@ func ValidateMinimalTranscript(transcript MinimalTranscript) error {
 // ValidateJSON validates serialized output JSON against the public schema.
 func ValidateJSON(data []byte) error {
-	return validateJSONWithSchema(data, outputSchemaPath)
+	return validateJSONWithSchema(data, fullOutputSchemaPath)
 }

-// ValidateDefaultJSON validates serialized default output JSON against the
-// default public schema.
-func ValidateDefaultJSON(data []byte) error {
-	return validateJSONWithSchema(data, defaultOutputSchemaPath)
+// ValidateIntermediateJSON validates serialized intermediate output JSON
+// against the intermediate public schema.
+func ValidateIntermediateJSON(data []byte) error {
+	return validateJSONWithSchema(data, intermediateOutputSchemaPath)
 }

 // ValidateMinimalJSON validates serialized minimal output JSON against the
@@ -245,7 +245,7 @@ func validateSemantics(transcript Transcript) error {
 	return nil
 }

-func validateDefaultSemantics(transcript DefaultTranscript) error {
+func validateIntermediateSemantics(transcript IntermediateTranscript) error {
 	for index, segment := range transcript.Segments {
 		wantID := index + 1
 		if segment.ID != wantID {
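
A sketch of the renamed validators in use, assuming the post-rename package API shown in this diff. The embedded JSON literal is a hypothetical conforming document; whether the intermediate schema requires the categories array is not visible in this hunk, so it is included defensively:

package main

import (
	"fmt"

	"gitea.maximumdirect.net/eric/seriatim/schema"
)

func main() {
	doc := []byte(`{
		"metadata": {
			"application": "seriatim",
			"version": "dev",
			"output_schema": "seriatim-intermediate"
		},
		"segments": [
			{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "categories": []}
		]
	}`)
	if err := schema.ValidateIntermediateJSON(doc); err != nil {
		fmt.Println("invalid:", err)
		return
	}
	fmt.Println("valid seriatim-intermediate document")
}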


@@ -21,11 +21,11 @@ func TestValidateMinimalTranscriptAcceptsValidTranscript(t *testing.T) {
 	}
 }

-func TestValidateDefaultTranscriptAcceptsValidTranscript(t *testing.T) {
-	transcript := validDefaultTranscript()
-	if err := ValidateDefaultTranscript(transcript); err != nil {
-		t.Fatalf("validate default transcript: %v", err)
+func TestValidateIntermediateTranscriptAcceptsValidTranscript(t *testing.T) {
+	transcript := validIntermediateTranscript()
+	if err := ValidateIntermediateTranscript(transcript); err != nil {
+		t.Fatalf("validate intermediate transcript: %v", err)
 	}
 }
@@ -34,7 +34,7 @@ func TestValidateMinimalJSONRejectsMissingRequiredField(t *testing.T) {
 		"metadata": {
 			"application": "seriatim",
 			"version": "dev",
-			"output_schema": "minimal"
+			"output_schema": "seriatim-minimal"
 		}
 	}`))
 	assertErrorContains(t, err, "segments")
@@ -45,7 +45,7 @@ func TestValidateMinimalJSONRejectsWrongFieldType(t *testing.T) {
 		"metadata": {
 			"application": "seriatim",
 			"version": "dev",
-			"output_schema": "minimal"
+			"output_schema": "seriatim-minimal"
 		},
 		"segments": [
 			{
@@ -68,7 +68,7 @@ func TestValidateMinimalJSONRejectsUnexpectedFields(t *testing.T) {
 		{
 			name: "top-level overlap groups",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-minimal"},
 				"segments": [],
 				"overlap_groups": []
 			}`,
@@ -76,28 +76,28 @@
 		{
 			name: "segment source",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-minimal"},
 				"segments": [{"id": 1, "source": "input.json", "start": 1, "end": 2, "speaker": "Alice", "text": "hello"}]
 			}`,
 		},
 		{
 			name: "segment categories",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-minimal"},
 				"segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "categories": ["backchannel"]}]
 			}`,
 		},
 		{
 			name: "segment derived from",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-minimal"},
 				"segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "derived_from": ["input.json#0"]}]
 			}`,
 		},
 		{
 			name: "segment words",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "minimal"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-minimal"},
 				"segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "words": []}]
 			}`,
 		},
@@ -111,23 +111,23 @@
 	}
 }

-func TestValidateDefaultJSONRejectsMissingRequiredField(t *testing.T) {
-	err := ValidateDefaultJSON([]byte(`{
+func TestValidateIntermediateJSONRejectsMissingRequiredField(t *testing.T) {
+	err := ValidateIntermediateJSON([]byte(`{
 		"metadata": {
 			"application": "seriatim",
 			"version": "dev",
-			"output_schema": "default"
+			"output_schema": "seriatim-intermediate"
 		}
 	}`))
 	assertErrorContains(t, err, "segments")
 }

-func TestValidateDefaultJSONRejectsWrongFieldType(t *testing.T) {
-	err := ValidateDefaultJSON([]byte(`{
+func TestValidateIntermediateJSONRejectsWrongFieldType(t *testing.T) {
+	err := ValidateIntermediateJSON([]byte(`{
 		"metadata": {
 			"application": "seriatim",
 			"version": "dev",
-			"output_schema": "default"
+			"output_schema": "seriatim-intermediate"
 		},
 		"segments": [
 			{
@@ -142,7 +142,7 @@ func TestValidateDefaultJSONRejectsWrongFieldType(t *testing.T) {
 	assertErrorContains(t, err, "id")
 }

-func TestValidateDefaultJSONRejectsUnexpectedFields(t *testing.T) {
+func TestValidateIntermediateJSONRejectsUnexpectedFields(t *testing.T) {
 	tests := []struct {
 		name string
 		json string
@@ -150,7 +150,7 @@ func TestValidateDefaultJSONRejectsUnexpectedFields(t *testing.T) {
 		{
 			name: "top-level overlap groups",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "default"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-intermediate"},
 				"segments": [],
 				"overlap_groups": []
 			}`,
@@ -158,21 +158,21 @@
 		{
 			name: "segment source",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "default"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-intermediate"},
 				"segments": [{"id": 1, "source": "input.json", "start": 1, "end": 2, "speaker": "Alice", "text": "hello"}]
 			}`,
 		},
 		{
 			name: "segment derived from",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "default"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-intermediate"},
 				"segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "derived_from": ["input.json#0"]}]
 			}`,
 		},
 		{
 			name: "segment words",
 			json: `{
-				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "default"},
+				"metadata": {"application": "seriatim", "version": "dev", "output_schema": "seriatim-intermediate"},
 				"segments": [{"id": 1, "start": 1, "end": 2, "speaker": "Alice", "text": "hello", "words": []}]
 			}`,
 		},
@@ -180,7 +180,7 @@ func TestValidateDefaultJSONRejectsUnexpectedFields(t *testing.T) {
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			err := ValidateDefaultJSON([]byte(test.json))
+			err := ValidateIntermediateJSON([]byte(test.json))
 			assertErrorContains(t, err, "additional properties")
 		})
 	}
@@ -267,7 +267,7 @@ func TestValidateJSONRejectsUnexpectedInternalFields(t *testing.T) {
 	}
 }

-func TestValidateDefaultTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T) {
+func TestValidateIntermediateTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T) {
 	tests := []struct {
 		name string
 		ids  []int
@@ -280,10 +280,10 @@ func TestValidateDefaultTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T)
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			transcript := validDefaultTranscript()
+			transcript := validIntermediateTranscript()
 			transcript.Segments = transcript.Segments[:0]
 			for index, id := range test.ids {
-				transcript.Segments = append(transcript.Segments, DefaultSegment{
+				transcript.Segments = append(transcript.Segments, IntermediateSegment{
 					ID:    id,
 					Start: float64(index),
 					End:   float64(index) + 1,
@@ -292,18 +292,18 @@ func TestValidateDefaultTranscriptRejectsMissingOrNonSequentialIDs(t *testing.T)
 				})
 			}

-			err := ValidateDefaultTranscript(transcript)
+			err := ValidateIntermediateTranscript(transcript)
 			assertErrorContains(t, err, test.want)
 		})
 	}
 }

-func TestValidateDefaultTranscriptRejectsInvalidTiming(t *testing.T) {
-	transcript := validDefaultTranscript()
+func TestValidateIntermediateTranscriptRejectsInvalidTiming(t *testing.T) {
+	transcript := validIntermediateTranscript()
 	transcript.Segments[0].Start = 2
 	transcript.Segments[0].End = 1

-	err := ValidateDefaultTranscript(transcript)
+	err := ValidateIntermediateTranscript(transcript)
 	assertErrorContains(t, err, "segment 0 has end")
 }
@@ -411,7 +411,7 @@ func validMinimalTranscript() MinimalTranscript {
 		Metadata: MinimalMetadata{
 			Application:  "seriatim",
 			Version:      "dev",
-			OutputSchema: "minimal",
+			OutputSchema: "seriatim-minimal",
 		},
 		Segments: []MinimalSegment{
 			{
@@ -425,14 +425,14 @@
 	}
 }

-func validDefaultTranscript() DefaultTranscript {
-	return DefaultTranscript{
-		Metadata: DefaultMetadata{
+func validIntermediateTranscript() IntermediateTranscript {
+	return IntermediateTranscript{
+		Metadata: IntermediateMetadata{
 			Application:  "seriatim",
 			Version:      "dev",
-			OutputSchema: "default",
+			OutputSchema: "seriatim-intermediate",
 		},
-		Segments: []DefaultSegment{
+		Segments: []IntermediateSegment{
 			{
 				ID:    1,
 				Start: 1,