feat(outbox): implement concurrent claim mechanism with UPDATE RETURNING + SKIP LOCKED
- Add migration 0004 to introduce the 'claiming' status and a timeout index
- Add StatusClaiming to the platformevent domain and allow it in Validate()
- Rewrite ListDue as a transactional UPDATE ... RETURNING with FOR UPDATE SKIP LOCKED
- Add ReleaseStaleClaims to reset expired claiming events back to retrying
- Worker Start() now runs a 30s ticker for stale claim recovery (5m timeout)
- Update stubEventStore in tests to satisfy the new EventStore interface

Refs: D-02
This commit is contained in:
@@ -20,6 +20,7 @@ type EventStore interface {
|
||||
RecordDeliveryAttempt(ctx context.Context, eventID string, attemptNo int, responseStatus int, responseBody string, errorMessage string) error
|
||||
MarkRetry(ctx context.Context, eventID string, attemptCount int, nextAttemptAt time.Time, lastError string) error
|
||||
MarkDeadLetter(ctx context.Context, eventID string, attemptCount int, finalError string) error
|
||||
ReleaseStaleClaims(ctx context.Context, timeout time.Duration) (int, error)
|
||||
}
|
||||
|
||||
type Worker struct {
|
||||
@@ -31,6 +32,7 @@ type Worker struct {
|
||||
MaxRetries int
|
||||
BatchSize int
|
||||
PollInterval time.Duration
|
||||
ClaimTimeout time.Duration
|
||||
RetrySchedule []time.Duration
|
||||
Now func() time.Time
|
||||
Logger *slog.Logger
|
||||
@@ -52,6 +54,7 @@ func NewWorker(platform, callbackURL string, store EventStore, client *http.Clie
|
||||
MaxRetries: maxRetries,
|
||||
BatchSize: 20,
|
||||
PollInterval: 5 * time.Second,
|
||||
ClaimTimeout: 5 * time.Minute,
|
||||
RetrySchedule: []time.Duration{10 * time.Second, 30 * time.Second, 60 * time.Second, 5 * time.Minute, 15 * time.Minute},
|
||||
Now: time.Now,
|
||||
}
|
||||
@@ -63,6 +66,8 @@ func (w *Worker) Start(ctx context.Context) {
|
||||
}
|
||||
ticker := time.NewTicker(w.pollInterval())
|
||||
defer ticker.Stop()
|
||||
claimTicker := time.NewTicker(30 * time.Second)
|
||||
defer claimTicker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -77,6 +82,16 @@ func (w *Worker) Start(ctx context.Context) {
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-claimTicker.C:
|
||||
if w.Store != nil {
|
||||
if _, err := w.Store.ReleaseStaleClaims(ctx, w.claimTimeout()); err != nil && w.Logger != nil {
|
||||
w.Logger.Error("release stale claims failed", "platform", w.Platform, "error", err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,6 +184,13 @@ func (w *Worker) pollInterval() time.Duration {
|
||||
return w.PollInterval
|
||||
}
|
||||
|
||||
func (w *Worker) claimTimeout() time.Duration {
|
||||
if w.ClaimTimeout <= 0 {
|
||||
return 5 * time.Minute
|
||||
}
|
||||
return w.ClaimTimeout
|
||||
}
|
||||
|
||||
func (w *Worker) now() time.Time {
|
||||
if w.Now == nil {
|
||||
return time.Now()
|
||||
|
||||
Reference in New Issue
Block a user