392 lines
11 KiB
Go
392 lines
11 KiB
Go
package trim
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
|
|
"gitea.maximumdirect.net/eric/seriatim/schema"
|
|
)
|
|
|
|
// Output schema identifiers. Artifact.Schema holds one of these values, and
// the functions in this file switch on it to select the matching payload.
const (
	// SchemaMinimal identifies the minimal output schema.
	SchemaMinimal = "seriatim-minimal"

	// SchemaIntermediate identifies the intermediate output schema.
	SchemaIntermediate = "seriatim-intermediate"

	// SchemaFull identifies the full output schema.
	SchemaFull = "seriatim-full"
)
|
|
|
|
// Artifact stores a parsed seriatim output artifact of one supported schema.
|
|
type Artifact struct {
|
|
Schema string
|
|
Full *schema.Transcript
|
|
Intermediate *schema.IntermediateTranscript
|
|
Minimal *schema.MinimalTranscript
|
|
}
|
|
|
|
// ApplyArtifactResult contains trimmed artifact output and ID mapping metadata.
|
|
type ApplyArtifactResult struct {
|
|
Artifact Artifact
|
|
OldToNewID map[int]int
|
|
RemovedIDs []int
|
|
OverlapGroupsRecomputed bool
|
|
}
|
|
|
|
// ParseArtifactJSON parses and validates a serialized seriatim output artifact.
|
|
func ParseArtifactJSON(data []byte) (Artifact, error) {
|
|
var full schema.Transcript
|
|
if err := json.Unmarshal(data, &full); err == nil {
|
|
if err := schema.ValidateTranscript(full); err == nil {
|
|
return Artifact{
|
|
Schema: SchemaFull,
|
|
Full: &full,
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
var intermediate schema.IntermediateTranscript
|
|
if err := json.Unmarshal(data, &intermediate); err == nil {
|
|
if err := schema.ValidateIntermediateTranscript(intermediate); err == nil {
|
|
return Artifact{
|
|
Schema: SchemaIntermediate,
|
|
Intermediate: &intermediate,
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
var minimal schema.MinimalTranscript
|
|
if err := json.Unmarshal(data, &minimal); err == nil {
|
|
if err := schema.ValidateMinimalTranscript(minimal); err == nil {
|
|
return Artifact{
|
|
Schema: SchemaMinimal,
|
|
Minimal: &minimal,
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
return Artifact{}, fmt.Errorf("input JSON is not a valid seriatim output artifact")
|
|
}
|
|
|
|
// ValidateArtifact validates an artifact against its declared schema.
|
|
func ValidateArtifact(artifact Artifact) error {
|
|
switch artifact.Schema {
|
|
case SchemaFull:
|
|
if artifact.Full == nil {
|
|
return fmt.Errorf("full artifact payload is missing")
|
|
}
|
|
return schema.ValidateTranscript(*artifact.Full)
|
|
case SchemaIntermediate:
|
|
if artifact.Intermediate == nil {
|
|
return fmt.Errorf("intermediate artifact payload is missing")
|
|
}
|
|
return schema.ValidateIntermediateTranscript(*artifact.Intermediate)
|
|
case SchemaMinimal:
|
|
if artifact.Minimal == nil {
|
|
return fmt.Errorf("minimal artifact payload is missing")
|
|
}
|
|
return schema.ValidateMinimalTranscript(*artifact.Minimal)
|
|
default:
|
|
return fmt.Errorf("unsupported artifact schema %q", artifact.Schema)
|
|
}
|
|
}
|
|
|
|
// Value returns the artifact value for JSON serialization.
|
|
func (artifact Artifact) Value() any {
|
|
switch artifact.Schema {
|
|
case SchemaFull:
|
|
if artifact.Full == nil {
|
|
return schema.Transcript{}
|
|
}
|
|
return *artifact.Full
|
|
case SchemaIntermediate:
|
|
if artifact.Intermediate == nil {
|
|
return schema.IntermediateTranscript{}
|
|
}
|
|
return *artifact.Intermediate
|
|
case SchemaMinimal:
|
|
if artifact.Minimal == nil {
|
|
return schema.MinimalTranscript{}
|
|
}
|
|
return *artifact.Minimal
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// SegmentCount returns the number of segments in the artifact.
|
|
func (artifact Artifact) SegmentCount() int {
|
|
switch artifact.Schema {
|
|
case SchemaFull:
|
|
if artifact.Full == nil {
|
|
return 0
|
|
}
|
|
return len(artifact.Full.Segments)
|
|
case SchemaIntermediate:
|
|
if artifact.Intermediate == nil {
|
|
return 0
|
|
}
|
|
return len(artifact.Intermediate.Segments)
|
|
case SchemaMinimal:
|
|
if artifact.Minimal == nil {
|
|
return 0
|
|
}
|
|
return len(artifact.Minimal.Segments)
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
// Application returns artifact metadata application name.
|
|
func (artifact Artifact) Application() string {
|
|
switch artifact.Schema {
|
|
case SchemaFull:
|
|
if artifact.Full == nil {
|
|
return ""
|
|
}
|
|
return artifact.Full.Metadata.Application
|
|
case SchemaIntermediate:
|
|
if artifact.Intermediate == nil {
|
|
return ""
|
|
}
|
|
return artifact.Intermediate.Metadata.Application
|
|
case SchemaMinimal:
|
|
if artifact.Minimal == nil {
|
|
return ""
|
|
}
|
|
return artifact.Minimal.Metadata.Application
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
// Version returns artifact metadata version.
|
|
func (artifact Artifact) Version() string {
|
|
switch artifact.Schema {
|
|
case SchemaFull:
|
|
if artifact.Full == nil {
|
|
return ""
|
|
}
|
|
return artifact.Full.Metadata.Version
|
|
case SchemaIntermediate:
|
|
if artifact.Intermediate == nil {
|
|
return ""
|
|
}
|
|
return artifact.Intermediate.Metadata.Version
|
|
case SchemaMinimal:
|
|
if artifact.Minimal == nil {
|
|
return ""
|
|
}
|
|
return artifact.Minimal.Metadata.Version
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
// ApplyArtifact trims a parsed artifact while preserving its input schema.
|
|
func ApplyArtifact(input Artifact, opts Options) (ApplyArtifactResult, error) {
|
|
switch input.Schema {
|
|
case SchemaFull:
|
|
if input.Full == nil {
|
|
return ApplyArtifactResult{}, fmt.Errorf("full artifact payload is missing")
|
|
}
|
|
result, err := Apply(*input.Full, opts)
|
|
if err != nil {
|
|
return ApplyArtifactResult{}, err
|
|
}
|
|
out := result.Transcript
|
|
return ApplyArtifactResult{
|
|
Artifact: Artifact{
|
|
Schema: SchemaFull,
|
|
Full: &out,
|
|
},
|
|
OldToNewID: result.OldToNewID,
|
|
RemovedIDs: result.RemovedIDs,
|
|
OverlapGroupsRecomputed: true,
|
|
}, nil
|
|
case SchemaIntermediate:
|
|
if input.Intermediate == nil {
|
|
return ApplyArtifactResult{}, fmt.Errorf("intermediate artifact payload is missing")
|
|
}
|
|
result, err := ApplyIntermediate(*input.Intermediate, opts)
|
|
if err != nil {
|
|
return ApplyArtifactResult{}, err
|
|
}
|
|
out := result.Transcript
|
|
return ApplyArtifactResult{
|
|
Artifact: Artifact{
|
|
Schema: SchemaIntermediate,
|
|
Intermediate: &out,
|
|
},
|
|
OldToNewID: result.OldToNewID,
|
|
RemovedIDs: result.RemovedIDs,
|
|
OverlapGroupsRecomputed: false,
|
|
}, nil
|
|
case SchemaMinimal:
|
|
if input.Minimal == nil {
|
|
return ApplyArtifactResult{}, fmt.Errorf("minimal artifact payload is missing")
|
|
}
|
|
result, err := ApplyMinimal(*input.Minimal, opts)
|
|
if err != nil {
|
|
return ApplyArtifactResult{}, err
|
|
}
|
|
out := result.Transcript
|
|
return ApplyArtifactResult{
|
|
Artifact: Artifact{
|
|
Schema: SchemaMinimal,
|
|
Minimal: &out,
|
|
},
|
|
OldToNewID: result.OldToNewID,
|
|
RemovedIDs: result.RemovedIDs,
|
|
OverlapGroupsRecomputed: false,
|
|
}, nil
|
|
default:
|
|
return ApplyArtifactResult{}, fmt.Errorf("unsupported artifact schema %q", input.Schema)
|
|
}
|
|
}
|
|
|
|
// ConvertArtifact converts a parsed artifact to another supported output schema.
|
|
func ConvertArtifact(input Artifact, outputSchema string) (Artifact, error) {
|
|
if outputSchema == "" || outputSchema == input.Schema {
|
|
return input, nil
|
|
}
|
|
|
|
switch input.Schema {
|
|
case SchemaFull:
|
|
if input.Full == nil {
|
|
return Artifact{}, fmt.Errorf("full artifact payload is missing")
|
|
}
|
|
switch outputSchema {
|
|
case SchemaIntermediate:
|
|
out := intermediateFromFull(*input.Full)
|
|
return Artifact{
|
|
Schema: SchemaIntermediate,
|
|
Intermediate: &out,
|
|
}, nil
|
|
case SchemaMinimal:
|
|
out := minimalFromFull(*input.Full)
|
|
return Artifact{
|
|
Schema: SchemaMinimal,
|
|
Minimal: &out,
|
|
}, nil
|
|
default:
|
|
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
|
|
}
|
|
case SchemaIntermediate:
|
|
if input.Intermediate == nil {
|
|
return Artifact{}, fmt.Errorf("intermediate artifact payload is missing")
|
|
}
|
|
switch outputSchema {
|
|
case SchemaMinimal:
|
|
out := minimalFromIntermediate(*input.Intermediate)
|
|
return Artifact{
|
|
Schema: SchemaMinimal,
|
|
Minimal: &out,
|
|
}, nil
|
|
case SchemaFull:
|
|
return Artifact{}, fmt.Errorf("cannot emit %q from %q input artifact", SchemaFull, SchemaIntermediate)
|
|
default:
|
|
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
|
|
}
|
|
case SchemaMinimal:
|
|
if input.Minimal == nil {
|
|
return Artifact{}, fmt.Errorf("minimal artifact payload is missing")
|
|
}
|
|
switch outputSchema {
|
|
case SchemaIntermediate:
|
|
out := intermediateFromMinimal(*input.Minimal)
|
|
return Artifact{
|
|
Schema: SchemaIntermediate,
|
|
Intermediate: &out,
|
|
}, nil
|
|
case SchemaFull:
|
|
return Artifact{}, fmt.Errorf("cannot emit %q from %q input artifact", SchemaFull, SchemaMinimal)
|
|
default:
|
|
return Artifact{}, fmt.Errorf("unsupported output schema %q", outputSchema)
|
|
}
|
|
default:
|
|
return Artifact{}, fmt.Errorf("unsupported input schema %q", input.Schema)
|
|
}
|
|
}
|
|
|
|
func intermediateFromFull(input schema.Transcript) schema.IntermediateTranscript {
|
|
segments := make([]schema.IntermediateSegment, len(input.Segments))
|
|
for index, segment := range input.Segments {
|
|
segments[index] = schema.IntermediateSegment{
|
|
ID: segment.ID,
|
|
Start: segment.Start,
|
|
End: segment.End,
|
|
Speaker: segment.Speaker,
|
|
Text: segment.Text,
|
|
Categories: append([]string(nil), segment.Categories...),
|
|
}
|
|
}
|
|
return schema.IntermediateTranscript{
|
|
Metadata: schema.IntermediateMetadata{
|
|
Application: input.Metadata.Application,
|
|
Version: input.Metadata.Version,
|
|
OutputSchema: SchemaIntermediate,
|
|
},
|
|
Segments: segments,
|
|
}
|
|
}
|
|
|
|
func minimalFromFull(input schema.Transcript) schema.MinimalTranscript {
|
|
segments := make([]schema.MinimalSegment, len(input.Segments))
|
|
for index, segment := range input.Segments {
|
|
segments[index] = schema.MinimalSegment{
|
|
ID: segment.ID,
|
|
Start: segment.Start,
|
|
End: segment.End,
|
|
Speaker: segment.Speaker,
|
|
Text: segment.Text,
|
|
}
|
|
}
|
|
return schema.MinimalTranscript{
|
|
Metadata: schema.MinimalMetadata{
|
|
Application: input.Metadata.Application,
|
|
Version: input.Metadata.Version,
|
|
OutputSchema: SchemaMinimal,
|
|
},
|
|
Segments: segments,
|
|
}
|
|
}
|
|
|
|
func minimalFromIntermediate(input schema.IntermediateTranscript) schema.MinimalTranscript {
|
|
segments := make([]schema.MinimalSegment, len(input.Segments))
|
|
for index, segment := range input.Segments {
|
|
segments[index] = schema.MinimalSegment{
|
|
ID: segment.ID,
|
|
Start: segment.Start,
|
|
End: segment.End,
|
|
Speaker: segment.Speaker,
|
|
Text: segment.Text,
|
|
}
|
|
}
|
|
return schema.MinimalTranscript{
|
|
Metadata: schema.MinimalMetadata{
|
|
Application: input.Metadata.Application,
|
|
Version: input.Metadata.Version,
|
|
OutputSchema: SchemaMinimal,
|
|
},
|
|
Segments: segments,
|
|
}
|
|
}
|
|
|
|
func intermediateFromMinimal(input schema.MinimalTranscript) schema.IntermediateTranscript {
|
|
segments := make([]schema.IntermediateSegment, len(input.Segments))
|
|
for index, segment := range input.Segments {
|
|
segments[index] = schema.IntermediateSegment{
|
|
ID: segment.ID,
|
|
Start: segment.Start,
|
|
End: segment.End,
|
|
Speaker: segment.Speaker,
|
|
Text: segment.Text,
|
|
}
|
|
}
|
|
return schema.IntermediateTranscript{
|
|
Metadata: schema.IntermediateMetadata{
|
|
Application: input.Metadata.Application,
|
|
Version: input.Metadata.Version,
|
|
OutputSchema: SchemaIntermediate,
|
|
},
|
|
Segments: segments,
|
|
}
|
|
}
|