Implemented the initial Go framework
This commit is contained in:
31
internal/builtin/input.go
Normal file
31
internal/builtin/input.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// jsonFilesReader is the built-in input reader that identifies itself as
// "json-files". It carries no state.
type jsonFilesReader struct{}

// Name returns the module name of this reader, "json-files".
func (jsonFilesReader) Name() string {
	const moduleName = "json-files"
	return moduleName
}
|
||||
|
||||
func (jsonFilesReader) Read(ctx context.Context, cfg config.Config) ([]model.RawTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
raw := make([]model.RawTranscript, 0, len(cfg.InputFiles))
|
||||
for _, inputFile := range cfg.InputFiles {
|
||||
raw = append(raw, model.RawTranscript{Source: inputFile})
|
||||
}
|
||||
|
||||
return raw, []report.Event{
|
||||
report.Info("input", "json-files", fmt.Sprintf("accepted %d input file(s)", len(raw))),
|
||||
}, nil
|
||||
}
|
||||
52
internal/builtin/merge.go
Normal file
52
internal/builtin/merge.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// placeholderMerger is the built-in merge module that identifies itself as
// "placeholder-merger". It carries no state.
type placeholderMerger struct{}

// Name returns the module name of this merger, "placeholder-merger".
func (placeholderMerger) Name() string {
	const moduleName = "placeholder-merger"
	return moduleName
}
|
||||
|
||||
func (placeholderMerger) Merge(ctx context.Context, in []model.CanonicalTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
segments := make([]model.Segment, 0)
|
||||
for _, transcript := range in {
|
||||
segments = append(segments, transcript.Segments...)
|
||||
}
|
||||
|
||||
sort.SliceStable(segments, func(i, j int) bool {
|
||||
left := segments[i]
|
||||
right := segments[j]
|
||||
if left.Start != right.Start {
|
||||
return left.Start < right.Start
|
||||
}
|
||||
if left.End != right.End {
|
||||
return left.End < right.End
|
||||
}
|
||||
if left.Source != right.Source {
|
||||
return left.Source < right.Source
|
||||
}
|
||||
if left.SourceSegmentIndex != right.SourceSegmentIndex {
|
||||
return left.SourceSegmentIndex < right.SourceSegmentIndex
|
||||
}
|
||||
return left.Speaker < right.Speaker
|
||||
})
|
||||
|
||||
return model.MergedTranscript{
|
||||
Segments: segments,
|
||||
OverlapGroups: nil,
|
||||
}, []report.Event{
|
||||
report.Info("merge", "placeholder-merger", "merged placeholder canonical transcript(s)"),
|
||||
}, nil
|
||||
}
|
||||
39
internal/builtin/output.go
Normal file
39
internal/builtin/output.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"os"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// jsonOutputWriter is the built-in output module that identifies itself as
// "json". It carries no state.
type jsonOutputWriter struct{}

// Name returns the module name of this writer, "json".
func (jsonOutputWriter) Name() string {
	const moduleName = "json"
	return moduleName
}
|
||||
|
||||
func (jsonOutputWriter) Write(ctx context.Context, out model.FinalTranscript, rpt report.Report, cfg config.Config) ([]report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
file, err := os.Create(cfg.OutputFile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
enc := json.NewEncoder(file)
|
||||
enc.SetIndent("", " ")
|
||||
if err := enc.Encode(out); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return []report.Event{
|
||||
report.Info("output", "json", "wrote placeholder transcript JSON"),
|
||||
}, nil
|
||||
}
|
||||
47
internal/builtin/postprocess.go
Normal file
47
internal/builtin/postprocess.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
// noopPostprocessor is a placeholder postprocessing module. The same type is
// reused for several modules that differ only by name.
type noopPostprocessor struct {
	// name is the module name returned by Name.
	name string
}

// Name returns the name this instance was constructed with.
func (p noopPostprocessor) Name() string {
	moduleName := p.name
	return moduleName
}
|
||||
|
||||
func (p noopPostprocessor) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
return in, []report.Event{
|
||||
report.Info("postprocessing", p.name, "completed no-op postprocessing module"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// assignIDs is the built-in postprocessing module that identifies itself as
// "assign-ids". It carries no state.
type assignIDs struct{}

// Name returns the module name of this postprocessor, "assign-ids".
func (assignIDs) Name() string {
	const moduleName = "assign-ids"
	return moduleName
}
|
||||
|
||||
func (assignIDs) Process(ctx context.Context, in model.MergedTranscript, cfg config.Config) (model.MergedTranscript, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return model.MergedTranscript{}, nil, err
|
||||
}
|
||||
|
||||
for index := range in.Segments {
|
||||
in.Segments[index].ID = index + 1
|
||||
}
|
||||
|
||||
return in, []report.Event{
|
||||
report.Info("postprocessing", "assign-ids", "assigned final segment IDs"),
|
||||
}, nil
|
||||
}
|
||||
82
internal/builtin/preprocess.go
Normal file
82
internal/builtin/preprocess.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package builtin
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/config"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/model"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
"gitea.maximumdirect.net/eric/seriatim/internal/report"
|
||||
)
|
||||
|
||||
type noopPreprocessor struct {
|
||||
name string
|
||||
requires pipeline.ModelState
|
||||
produces pipeline.ModelState
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Name() string {
|
||||
return p.name
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Requires() pipeline.ModelState {
|
||||
return p.requires
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Produces() pipeline.ModelState {
|
||||
return p.produces
|
||||
}
|
||||
|
||||
func (p noopPreprocessor) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pipeline.PreprocessState{}, nil, err
|
||||
}
|
||||
if in.State != p.requires {
|
||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", p.name, p.requires, in.State)
|
||||
}
|
||||
|
||||
in.State = p.produces
|
||||
return in, []report.Event{
|
||||
report.Info("preprocessing", p.name, "completed no-op preprocessing module"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
type normalizeSpeakers struct{}
|
||||
|
||||
func (normalizeSpeakers) Name() string {
|
||||
return "normalize-speakers"
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Requires() pipeline.ModelState {
|
||||
return pipeline.StateRaw
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Produces() pipeline.ModelState {
|
||||
return pipeline.StateCanonical
|
||||
}
|
||||
|
||||
func (normalizeSpeakers) Process(ctx context.Context, in pipeline.PreprocessState, cfg config.Config) (pipeline.PreprocessState, []report.Event, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return pipeline.PreprocessState{}, nil, err
|
||||
}
|
||||
if in.State != pipeline.StateRaw {
|
||||
return pipeline.PreprocessState{}, nil, fmt.Errorf("preprocessing module %q requires state %q but received %q", "normalize-speakers", pipeline.StateRaw, in.State)
|
||||
}
|
||||
|
||||
canonical := make([]model.CanonicalTranscript, 0, len(in.Raw))
|
||||
for _, raw := range in.Raw {
|
||||
canonical = append(canonical, model.CanonicalTranscript{
|
||||
Source: raw.Source,
|
||||
Segments: nil,
|
||||
})
|
||||
}
|
||||
|
||||
return pipeline.PreprocessState{
|
||||
State: pipeline.StateCanonical,
|
||||
Raw: append([]model.RawTranscript(nil), in.Raw...),
|
||||
Canonical: canonical,
|
||||
}, []report.Event{
|
||||
report.Info("preprocessing", "normalize-speakers", "created placeholder canonical transcript(s)"),
|
||||
}, nil
|
||||
}
|
||||
23
internal/builtin/registry.go
Normal file
23
internal/builtin/registry.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package builtin
|
||||
|
||||
import "gitea.maximumdirect.net/eric/seriatim/internal/pipeline"
|
||||
|
||||
// NewRegistry registers the MVP built-in modules.
|
||||
func NewRegistry() *pipeline.Registry {
|
||||
registry := pipeline.NewRegistry()
|
||||
|
||||
registry.RegisterInputReader(jsonFilesReader{})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "validate-raw", requires: pipeline.StateRaw, produces: pipeline.StateRaw})
|
||||
registry.RegisterPreprocessor(normalizeSpeakers{})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "trim-text", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
||||
registry.RegisterPreprocessor(noopPreprocessor{name: "autocorrect", requires: pipeline.StateCanonical, produces: pipeline.StateCanonical})
|
||||
registry.RegisterMerger(placeholderMerger{})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "detect-overlaps"})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "resolve-overlaps"})
|
||||
registry.RegisterPostprocessor(assignIDs{})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "validate-output"})
|
||||
registry.RegisterPostprocessor(noopPostprocessor{name: "autocorrect"})
|
||||
registry.RegisterOutputWriter(jsonOutputWriter{})
|
||||
|
||||
return registry
|
||||
}
|
||||
Reference in New Issue
Block a user