package scheduler

import (
	"context"
	"fmt"
	"hash/fnv"
	"math/rand"
	"sync"
	"time"

	"gitea.maximumdirect.net/ejr/feedkit/event"
	"gitea.maximumdirect.net/ejr/feedkit/logging"
	"gitea.maximumdirect.net/ejr/feedkit/sources"
)

// Logger is a printf-style logger used throughout the scheduler package.
// It is an alias to the shared feedkit logging type so callers can pass
// one function everywhere without type mismatch friction.
type Logger = logging.Logf
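
// For instance, the standard library logger can be passed straight through
// (this assumes logging.Logf has the usual printf shape, func(string, ...any)):
//
//	s := &Scheduler{Logf: log.Printf}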

// Job describes one scheduler task.
//
// A Job may be backed by either:
// - a polling source (sources.PollSource): uses Every + jitter and calls Poll()
// - a stream source (sources.StreamSource): ignores Every and calls Run()
//
// Jitter behavior:
// - For polling sources: Jitter is applied at startup and before each poll tick.
// - For stream sources: Jitter is applied once at startup only (optional; useful to avoid
//   reconnect storms when many instances start together).
type Job struct {
	Source           sources.Input
	Every            time.Duration
	StreamExitPolicy StreamExitPolicy
	StreamBackoff    StreamBackoff

	// Jitter is the maximum additional delay added before each poll.
	// Example: if Every=15m and Jitter=30s, each poll will occur at:
	//   tick time + random(0..30s)
	//
	// If Jitter == 0 for polling sources, we compute a default jitter based on Every.
	//
	// For stream sources, Jitter is treated as *startup jitter only*.
	Jitter time.Duration
}
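
// Illustrative job configurations (rssSource and wsSource are hypothetical
// sources.PollSource and sources.StreamSource values):
//
//	poll := Job{Source: rssSource, Every: 15 * time.Minute, Jitter: 30 * time.Second}
//	stream := Job{
//		Source:           wsSource,
//		Jitter:           5 * time.Second, // startup-only for streams
//		StreamExitPolicy: StreamExitPolicyRestart,
//	}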

// StreamExitPolicy controls how the scheduler handles non-fatal stream exits.
type StreamExitPolicy string

const (
	// StreamExitPolicyRestart restarts the stream after a backoff delay (the default).
	StreamExitPolicyRestart StreamExitPolicy = "restart"
	// StreamExitPolicyStop logs the exit and stops supervising the job.
	StreamExitPolicyStop StreamExitPolicy = "stop"
	// StreamExitPolicyFatal escalates the exit, shutting down the whole scheduler.
	StreamExitPolicyFatal StreamExitPolicy = "fatal"
)

// StreamBackoff controls restart pacing for stream supervision.
type StreamBackoff struct {
	Initial time.Duration
	Max     time.Duration
	Jitter  time.Duration
}
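
// For example, restart pacing could start at one second and cap at two
// minutes (illustrative values, not the package defaults below):
//
//	job.StreamBackoff = StreamBackoff{
//		Initial: 1 * time.Second,
//		Max:     2 * time.Minute,
//		Jitter:  500 * time.Millisecond,
//	}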

// Scheduler runs a set of Jobs, one goroutine per job, and emits their
// events on Out.
type Scheduler struct {
	Jobs []Job
	Out  chan<- event.Event
	Logf Logger
}
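
// A minimal wiring sketch, assuming a hypothetical src (any sources.Input)
// and a consumer draining out on the other side:
//
//	out := make(chan event.Event, 64)
//	s := &Scheduler{
//		Jobs: []Job{{Source: src, Every: time.Minute}},
//		Out:  out,
//		Logf: log.Printf,
//	}
//	if err := s.Run(ctx); err != nil {
//		log.Fatal(err)
//	}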

const (
	defaultStreamBackoffInitial = 1 * time.Second
	defaultStreamBackoffMax     = 1 * time.Minute
	defaultStreamBackoffJitter  = 250 * time.Millisecond
	streamBackoffResetAfter     = 5 * time.Minute
)

// timeNow is split out so tests can substitute a fake clock.
var timeNow = time.Now

// Run starts one goroutine per job.
// Poll jobs run on their own interval and emit 0..N events per poll.
// Stream jobs run continuously and emit events as they arrive.
// Run blocks until the context is cancelled, a stream job fails fatally,
// or every job has exited on its own; it returns only after all job
// goroutines have finished.
func (s *Scheduler) Run(ctx context.Context) error {
	if s.Out == nil {
		return fmt.Errorf("scheduler.Run: Out channel is nil")
	}
	if len(s.Jobs) == 0 {
		return fmt.Errorf("scheduler.Run: no jobs configured")
	}

	runCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	fatalErrCh := make(chan error, 1)
	var wg sync.WaitGroup
	for _, job := range s.Jobs {
		job := job // capture loop variable
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.runJob(runCtx, job, fatalErrCh)
		}()
	}

	done := make(chan struct{})
	go func() {
		wg.Wait()
		close(done)
	}()

	select {
	case err := <-fatalErrCh:
		cancel()
		<-done
		return err
	case <-done:
		// Every job exited on its own (e.g. all stream jobs hit a stop
		// policy). Prefer a fatal error if one raced in just before exit.
		select {
		case err := <-fatalErrCh:
			return err
		default:
		}
		return runCtx.Err()
	case <-runCtx.Done():
		<-done
		return runCtx.Err()
	}
}

func (s *Scheduler) runJob(ctx context.Context, job Job, fatalErrCh chan<- error) {
	if job.Source == nil {
		s.logf("scheduler: job has nil source")
		return
	}

	// Stream sources: event-driven.
	if ss, ok := job.Source.(sources.StreamSource); ok {
		s.runStream(ctx, job, ss, fatalErrCh)
		return
	}

	// Poll sources: time-based.
	ps, ok := job.Source.(sources.PollSource)
	if !ok {
		s.logf("scheduler: source %T (%s) implements neither Poll() nor Run()", job.Source, job.Source.Name())
		return
	}
	if job.Every <= 0 {
		s.logf("scheduler: polling job %q missing/invalid interval (sources[].every)", ps.Name())
		return
	}

	s.runPoller(ctx, job, ps)
}

func (s *Scheduler) runStream(ctx context.Context, job Job, src sources.StreamSource, fatalErrCh chan<- error) {
	policy := effectiveStreamExitPolicy(job.StreamExitPolicy)
	backoff := effectiveStreamBackoff(job.StreamBackoff)
	rng := seededRNG(src.Name())

	// Optional startup jitter: helps avoid reconnect storms if many daemons start at once.
	if job.Jitter > 0 {
		if !sleepJitter(ctx, rng, job.Jitter) {
			return
		}
	}

	nextDelay := backoff.Initial
	for {
		startedAt := timeNow()
		err := src.Run(ctx, s.Out)
		if ctx.Err() != nil {
			return
		}

		normalizedErr := normalizeStreamExitError(src.Name(), err)
		if sources.IsStreamFatal(normalizedErr) {
			s.reportFatal(fatalErrCh, fmt.Errorf("scheduler: stream source %q exited fatally: %w", src.Name(), normalizedErr))
			return
		}

		switch policy {
		case StreamExitPolicyStop:
			s.logf("scheduler: stream source %q stopped after exit: %v", src.Name(), normalizedErr)
			return
		case StreamExitPolicyFatal:
			s.reportFatal(fatalErrCh, fmt.Errorf("scheduler: stream source %q exited under fatal policy: %w", src.Name(), normalizedErr))
			return
		}

		// Reset backoff after a sufficiently long, healthy run.
		if streamRunWasStable(startedAt, timeNow()) {
			nextDelay = backoff.Initial
		}

		delay := nextDelay + randomDuration(rng, backoff.Jitter)
		s.logf("scheduler: stream source %q exited; restarting in %s: %v", src.Name(), delay, normalizedErr)
		if !sleepDuration(ctx, delay) {
			return
		}
		nextDelay = nextStreamBackoff(nextDelay, backoff.Max)
	}
}

func (s *Scheduler) runPoller(ctx context.Context, job Job, src sources.PollSource) {
	// Compute jitter: either configured per job, or a sensible default.
	jitter := effectiveJitter(job.Every, job.Jitter)

	// Each worker gets its own RNG (safe + no lock contention).
	rng := seededRNG(src.Name())

	// Optional startup jitter: avoids all jobs firing at the exact moment the daemon starts.
	if !sleepJitter(ctx, rng, jitter) {
		return
	}

	// Immediate poll at startup (after startup jitter).
	s.pollOnce(ctx, src)

	t := time.NewTicker(job.Every)
	defer t.Stop()

	for {
		select {
		case <-t.C:
			// Per-tick jitter: spreads calls out within the interval.
			if !sleepJitter(ctx, rng, jitter) {
				return
			}
			s.pollOnce(ctx, src)

		case <-ctx.Done():
			return
		}
	}
}

func (s *Scheduler) pollOnce(ctx context.Context, src sources.PollSource) {
	events, err := src.Poll(ctx)
	if err != nil {
		s.logf("scheduler: poll failed (%s): %v", src.Name(), err)
		return
	}

	for _, e := range events {
		select {
		case s.Out <- e:
		case <-ctx.Done():
			return
		}
	}
}

func (s *Scheduler) logf(format string, args ...any) {
	if s.Logf == nil {
		return
	}
	s.Logf(format, args...)
}

// reportFatal delivers err without blocking. The fatal channel is created
// with capacity 1 in Run, so the first error wins and later ones are dropped.
func (s *Scheduler) reportFatal(ch chan<- error, err error) {
	if err == nil {
		return
	}
	select {
	case ch <- err:
	default:
	}
}

// ---- helpers ----

func effectiveStreamExitPolicy(policy StreamExitPolicy) StreamExitPolicy {
	switch policy {
	case StreamExitPolicyStop, StreamExitPolicyFatal:
		return policy
	default:
		return StreamExitPolicyRestart
	}
}

func effectiveStreamBackoff(cfg StreamBackoff) StreamBackoff {
	out := cfg
	if out.Initial <= 0 {
		out.Initial = defaultStreamBackoffInitial
	}
	if out.Max <= 0 {
		out.Max = defaultStreamBackoffMax
	}
	if out.Max < out.Initial {
		out.Max = out.Initial
	}
	if out.Jitter < 0 {
		out.Jitter = 0
	}
	return out
}

func normalizeStreamExitError(sourceName string, err error) error {
	if err != nil {
		return err
	}
	return sources.StreamRetryable(fmt.Errorf("stream source %q exited unexpectedly without error", sourceName))
}

func nextStreamBackoff(current, max time.Duration) time.Duration {
	if current <= 0 {
		current = defaultStreamBackoffInitial
	}
	if max <= 0 {
		max = defaultStreamBackoffMax
	}
	if current >= max {
		return max
	}
	next := current * 2
	if next < current || next > max {
		return max
	}
	return next
}
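
// With the package defaults (Initial=1s, Max=1m, Jitter=250ms), successive
// restarts wait roughly 1s, 2s, 4s, 8s, 16s, 32s, then 60s thereafter, plus
// up to 250ms of jitter each time; a run lasting streamBackoffResetAfter (5m)
// or more resets the sequence to Initial.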

func streamRunWasStable(startedAt, endedAt time.Time) bool {
	if startedAt.IsZero() || endedAt.IsZero() {
		return false
	}
	return endedAt.Sub(startedAt) >= streamBackoffResetAfter
}

// seededRNG returns a dedicated RNG for one worker, seeded from the current
// time XOR an FNV-1a hash of the source name so that workers started in the
// same instant still jitter differently.
func seededRNG(name string) *rand.Rand {
	seed := timeNow().UnixNano() ^ int64(hashStringFNV32a(name))
	return rand.New(rand.NewSource(seed))
}

// effectiveJitter chooses a jitter value.
// - If configuredMax > 0, use it (but clamp).
// - Else default to min(every/10, 30s).
// - Clamp to at most every/2 (so jitter can't delay more than half the interval).
func effectiveJitter(every time.Duration, configuredMax time.Duration) time.Duration {
	if every <= 0 {
		return 0
	}

	j := configuredMax
	if j <= 0 {
		j = every / 10
		if j > 30*time.Second {
			j = 30 * time.Second
		}
	}

	// Clamp jitter so it doesn't dominate the schedule.
	maxAllowed := every / 2
	if j > maxAllowed {
		j = maxAllowed
	}
	if j < 0 {
		j = 0
	}
	return j
}
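
// Worked examples:
// - Every=15m, Jitter=0: default is min(15m/10, 30s) = 30s
// - Every=1m, Jitter=0: default is 1m/10 = 6s
// - Every=20s, Jitter=30s: clamped to every/2 = 10s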

// sleepJitter sleeps for a random duration in [0, max].
// Returns false if the context is cancelled while waiting.
func sleepJitter(ctx context.Context, rng *rand.Rand, max time.Duration) bool {
	if max <= 0 {
		return true
	}

	return sleepDuration(ctx, randomDuration(rng, max))
}

func randomDuration(rng *rand.Rand, max time.Duration) time.Duration {
	if max <= 0 {
		return 0
	}
	// Int63n requires a positive argument.
	// We add 1 so max itself is attainable.
	n := rng.Int63n(int64(max) + 1)
	return time.Duration(n)
}

func sleepDuration(ctx context.Context, d time.Duration) bool {
	if d <= 0 {
		return true
	}
	timer := time.NewTimer(d)
	defer timer.Stop()

	select {
	case <-timer.C:
		return true
	case <-ctx.Done():
		return false
	}
}

func hashStringFNV32a(s string) uint32 {
	h := fnv.New32a()
	_, _ = h.Write([]byte(s))
	return h.Sum32()
}