Commit 85f7ce01 authored by Jacob Vosmaer (GitLab)'s avatar Jacob Vosmaer (GitLab)

Merge branch 'feature/add-metrics-for-queueing-mechanism' into 'master'

Add metrics for queueing mechanism

See merge request !122
parents 7378c05b d34f519b
...@@ -3,6 +3,8 @@ package queueing ...@@ -3,6 +3,8 @@ package queueing
import ( import (
"errors" "errors"
"time" "time"
"github.com/prometheus/client_golang/prometheus"
) )
type errTooManyRequests struct{ error } type errTooManyRequests struct{ error }
...@@ -11,58 +13,184 @@ type errQueueingTimedout struct{ error } ...@@ -11,58 +13,184 @@ type errQueueingTimedout struct{ error }
var ErrTooManyRequests = &errTooManyRequests{errors.New("too many requests queued")} var ErrTooManyRequests = &errTooManyRequests{errors.New("too many requests queued")}
var ErrQueueingTimedout = &errQueueingTimedout{errors.New("queueing timedout")} var ErrQueueingTimedout = &errQueueingTimedout{errors.New("queueing timedout")}
type queueMetrics struct {
queueingLimit prometheus.Gauge
queueingQueueLimit prometheus.Gauge
queueingQueueTimeout prometheus.Gauge
queueingBusy prometheus.Gauge
queueingWaiting prometheus.Gauge
queueingWaitingTime prometheus.Histogram
queueingErrors *prometheus.CounterVec
}
// newQueueMetrics prepares Prometheus metrics for queueing mechanism
// name specifies name of the queue, used to label metrics with ConstLabel `queue_name`
// Don't call newQueueMetrics twice with the same name argument!
// timeout specifies the timeout of storing a request in queue - queueMetrics
// uses it to calculate histogram buckets for gitlab_workhorse_queueing_waiting_time
// metric
func newQueueMetrics(name string, timeout time.Duration) *queueMetrics {
waitingTimeBuckets := []float64{
timeout.Seconds() * 0.01,
timeout.Seconds() * 0.05,
timeout.Seconds() * 0.10,
timeout.Seconds() * 0.25,
timeout.Seconds() * 0.50,
timeout.Seconds() * 0.75,
timeout.Seconds() * 0.90,
timeout.Seconds() * 0.95,
timeout.Seconds() * 0.99,
timeout.Seconds(),
}
metrics := &queueMetrics{
queueingLimit: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "gitlab_workhorse_queueing_limit",
Help: "Current limit set for the queueing mechanism",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
}),
queueingQueueLimit: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "gitlab_workhorse_queueing_queue_limit",
Help: "Current queueLimit set for the queueing mechanism",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
}),
queueingQueueTimeout: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "gitlab_workhorse_queueing_queue_timeout",
Help: "Current queueTimeout set for the queueing mechanism",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
}),
queueingBusy: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "gitlab_workhorse_queueing_busy",
Help: "How many queued requests are now processed",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
}),
queueingWaiting: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "gitlab_workhorse_queueing_waiting",
Help: "How many requests are now queued",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
}),
queueingWaitingTime: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "gitlab_workhorse_queueing_waiting_time",
Help: "How many time a request spent in queue",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
Buckets: waitingTimeBuckets,
}),
queueingErrors: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "gitlab_workhorse_queueing_errors",
Help: "How many times the TooManyRequests or QueueintTimedout errors were returned while queueing, partitioned by error type",
ConstLabels: prometheus.Labels{
"queue_name": name,
},
},
[]string{"type"},
),
}
prometheus.MustRegister(metrics.queueingLimit)
prometheus.MustRegister(metrics.queueingQueueLimit)
prometheus.MustRegister(metrics.queueingQueueTimeout)
prometheus.MustRegister(metrics.queueingBusy)
prometheus.MustRegister(metrics.queueingWaiting)
prometheus.MustRegister(metrics.queueingWaitingTime)
prometheus.MustRegister(metrics.queueingErrors)
return metrics
}
type Queue struct { type Queue struct {
*queueMetrics
name string
busyCh chan struct{} busyCh chan struct{}
waitingCh chan struct{} waitingCh chan time.Time
timeout time.Duration
} }
// NewQueue creates a new queue // newQueue creates a new queue
// name specifies name used to label queue metrics.
// Don't call newQueue twice with the same name argument!
// limit specifies number of requests run concurrently // limit specifies number of requests run concurrently
// queueLimit specifies maximum number of requests that can be queued // queueLimit specifies maximum number of requests that can be queued
// timeout specifies the time limit of storing the request in the queue
// if the number of requests is above the limit // if the number of requests is above the limit
func NewQueue(limit, queueLimit uint) *Queue { func newQueue(name string, limit, queueLimit uint, timeout time.Duration) *Queue {
return &Queue{ queue := &Queue{
name: name,
busyCh: make(chan struct{}, limit), busyCh: make(chan struct{}, limit),
waitingCh: make(chan struct{}, limit+queueLimit), waitingCh: make(chan time.Time, limit+queueLimit),
timeout: timeout,
} }
queue.queueMetrics = newQueueMetrics(name, timeout)
queue.queueingLimit.Set(float64(limit))
queue.queueingQueueLimit.Set(float64(queueLimit))
queue.queueingQueueTimeout.Set(timeout.Seconds())
return queue
} }
// Acquire takes one slot from the Queue // Acquire takes one slot from the Queue
// and returns when a request should be processed // and returns when a request should be processed
// it allows up to (limit) of requests running at a time // it allows up to (limit) of requests running at a time
// it allows to queue up to (queue-limit) requests // it allows to queue up to (queue-limit) requests
func (s *Queue) Acquire(timeout time.Duration) (err error) { func (s *Queue) Acquire() (err error) {
// push item to a queue to claim your own slot (non-blocking) // push item to a queue to claim your own slot (non-blocking)
select { select {
case s.waitingCh <- struct{}{}: case s.waitingCh <- time.Now():
s.queueingWaiting.Inc()
break break
default: default:
s.queueingErrors.WithLabelValues("too_many_requests").Inc()
return ErrTooManyRequests return ErrTooManyRequests
} }
defer func() { defer func() {
if err != nil { if err != nil {
<-s.waitingCh waitStarted := <-s.waitingCh
s.queueingWaiting.Dec()
s.queueingWaitingTime.Observe(float64(time.Since(waitStarted).Seconds()))
} }
}() }()
// fast path: push item to current processed items (non-blocking) // fast path: push item to current processed items (non-blocking)
select { select {
case s.busyCh <- struct{}{}: case s.busyCh <- struct{}{}:
s.queueingBusy.Inc()
return nil return nil
default: default:
break break
} }
timer := time.NewTimer(timeout) timer := time.NewTimer(s.timeout)
defer timer.Stop() defer timer.Stop()
// push item to current processed items (blocking) // push item to current processed items (blocking)
select { select {
case s.busyCh <- struct{}{}: case s.busyCh <- struct{}{}:
s.queueingBusy.Inc()
return nil return nil
case <-timer.C: case <-timer.C:
s.queueingErrors.WithLabelValues("queueing_timedout").Inc()
return ErrQueueingTimedout return ErrQueueingTimedout
} }
} }
...@@ -71,6 +199,10 @@ func (s *Queue) Acquire(timeout time.Duration) (err error) { ...@@ -71,6 +199,10 @@ func (s *Queue) Acquire(timeout time.Duration) (err error) {
// It triggers next request to be processed if it's in queue // It triggers next request to be processed if it's in queue
func (s *Queue) Release() { func (s *Queue) Release() {
// dequeue from queue to allow next request to be processed // dequeue from queue to allow next request to be processed
<-s.waitingCh waitStarted := <-s.waitingCh
s.queueingWaiting.Dec()
s.queueingWaitingTime.Observe(float64(time.Since(waitStarted).Seconds()))
<-s.busyCh <-s.busyCh
s.queueingBusy.Dec()
} }
...@@ -6,46 +6,46 @@ import ( ...@@ -6,46 +6,46 @@ import (
) )
func TestNormalQueueing(t *testing.T) { func TestNormalQueueing(t *testing.T) {
q := NewQueue(2, 1) q := newQueue("queue 1", 2, 1, time.Microsecond)
err1 := q.Acquire(time.Microsecond) err1 := q.Acquire()
if err1 != nil { if err1 != nil {
t.Fatal("we should acquire a new slot") t.Fatal("we should acquire a new slot")
} }
err2 := q.Acquire(time.Microsecond) err2 := q.Acquire()
if err2 != nil { if err2 != nil {
t.Fatal("we should acquire a new slot") t.Fatal("we should acquire a new slot")
} }
err3 := q.Acquire(time.Microsecond) err3 := q.Acquire()
if err3 != ErrQueueingTimedout { if err3 != ErrQueueingTimedout {
t.Fatal("we should timeout") t.Fatal("we should timeout")
} }
q.Release() q.Release()
err4 := q.Acquire(time.Microsecond) err4 := q.Acquire()
if err4 != nil { if err4 != nil {
t.Fatal("we should acquire a new slot") t.Fatal("we should acquire a new slot")
} }
} }
func TestQueueLimit(t *testing.T) { func TestQueueLimit(t *testing.T) {
q := NewQueue(1, 0) q := newQueue("queue 2", 1, 0, time.Microsecond)
err1 := q.Acquire(time.Microsecond) err1 := q.Acquire()
if err1 != nil { if err1 != nil {
t.Fatal("we should acquire a new slot") t.Fatal("we should acquire a new slot")
} }
err2 := q.Acquire(time.Microsecond) err2 := q.Acquire()
if err2 != ErrTooManyRequests { if err2 != ErrTooManyRequests {
t.Fatal("we should fail because of not enough slots in queue") t.Fatal("we should fail because of not enough slots in queue")
} }
} }
func TestQueueProcessing(t *testing.T) { func TestQueueProcessing(t *testing.T) {
q := NewQueue(1, 1) q := newQueue("queue 3", 1, 1, time.Second)
err1 := q.Acquire(time.Microsecond) err1 := q.Acquire()
if err1 != nil { if err1 != nil {
t.Fatal("we should acquire a new slot") t.Fatal("we should acquire a new slot")
} }
...@@ -55,7 +55,7 @@ func TestQueueProcessing(t *testing.T) { ...@@ -55,7 +55,7 @@ func TestQueueProcessing(t *testing.T) {
q.Release() q.Release()
}() }()
err2 := q.Acquire(time.Second) err2 := q.Acquire()
if err2 != nil { if err2 != nil {
t.Fatal("we should acquire slot after the previous one finished") t.Fatal("we should acquire slot after the previous one finished")
} }
......
...@@ -9,7 +9,14 @@ import ( ...@@ -9,7 +9,14 @@ import (
const DefaultTimeout = 30 * time.Second const DefaultTimeout = 30 * time.Second
func QueueRequests(h http.Handler, limit, queueLimit uint, queueTimeout time.Duration) http.Handler { // QueueRequests creates a new request queue
// name specifies the name of queue, used to label Prometheus metrics
// Don't call QueueRequests twice with the same name argument!
// h specifies a http.Handler which will handle the queue requests
// limit specifies number of requests run concurrently
// queueLimit specifies maximum number of requests that can be queued
// queueTimeout specifies the time limit of storing the request in the queue
func QueueRequests(name string, h http.Handler, limit, queueLimit uint, queueTimeout time.Duration) http.Handler {
if limit == 0 { if limit == 0 {
return h return h
} }
...@@ -17,10 +24,10 @@ func QueueRequests(h http.Handler, limit, queueLimit uint, queueTimeout time.Dur ...@@ -17,10 +24,10 @@ func QueueRequests(h http.Handler, limit, queueLimit uint, queueTimeout time.Dur
queueTimeout = DefaultTimeout queueTimeout = DefaultTimeout
} }
queue := NewQueue(limit, queueLimit) queue := newQueue(name, limit, queueLimit, queueTimeout)
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
err := queue.Acquire(queueTimeout) err := queue.Acquire()
switch err { switch err {
case nil: case nil:
......
...@@ -21,7 +21,7 @@ func pausedHttpHandler(pauseCh chan struct{}) http.Handler { ...@@ -21,7 +21,7 @@ func pausedHttpHandler(pauseCh chan struct{}) http.Handler {
func TestNormalRequestProcessing(t *testing.T) { func TestNormalRequestProcessing(t *testing.T) {
w := httptest.NewRecorder() w := httptest.NewRecorder()
h := QueueRequests(httpHandler, 1, 1, time.Second) h := QueueRequests("Normal request processing", httpHandler, 1, 1, time.Second)
h.ServeHTTP(w, nil) h.ServeHTTP(w, nil)
if w.Code != 200 { if w.Code != 200 {
t.Fatal("QueueRequests should process request") t.Fatal("QueueRequests should process request")
...@@ -32,11 +32,11 @@ func TestNormalRequestProcessing(t *testing.T) { ...@@ -32,11 +32,11 @@ func TestNormalRequestProcessing(t *testing.T) {
// then it runs a number of requests that are going through queue, // then it runs a number of requests that are going through queue,
// we return the response of first finished request, // we return the response of first finished request,
// where status of request can be 200, 429 or 503 // where status of request can be 200, 429 or 503
func testSlowRequestProcessing(count int, limit, queueLimit uint, queueTimeout time.Duration) *httptest.ResponseRecorder { func testSlowRequestProcessing(name string, count int, limit, queueLimit uint, queueTimeout time.Duration) *httptest.ResponseRecorder {
pauseCh := make(chan struct{}) pauseCh := make(chan struct{})
defer close(pauseCh) defer close(pauseCh)
handler := QueueRequests(pausedHttpHandler(pauseCh), limit, queueLimit, queueTimeout) handler := QueueRequests("Slow request processing: "+name, pausedHttpHandler(pauseCh), limit, queueLimit, queueTimeout)
respCh := make(chan *httptest.ResponseRecorder, count) respCh := make(chan *httptest.ResponseRecorder, count)
...@@ -57,7 +57,7 @@ func testSlowRequestProcessing(count int, limit, queueLimit uint, queueTimeout t ...@@ -57,7 +57,7 @@ func testSlowRequestProcessing(count int, limit, queueLimit uint, queueTimeout t
// the queue limit and length is 1, // the queue limit and length is 1,
// the second request gets timed-out // the second request gets timed-out
func TestQueueingTimeout(t *testing.T) { func TestQueueingTimeout(t *testing.T) {
w := testSlowRequestProcessing(2, 1, 1, time.Microsecond) w := testSlowRequestProcessing("timeout", 2, 1, 1, time.Microsecond)
if w.Code != 503 { if w.Code != 503 {
t.Fatal("QueueRequests should timeout queued request") t.Fatal("QueueRequests should timeout queued request")
...@@ -68,7 +68,7 @@ func TestQueueingTimeout(t *testing.T) { ...@@ -68,7 +68,7 @@ func TestQueueingTimeout(t *testing.T) {
// the queue limit and length is 1, // the queue limit and length is 1,
// so the third request has to be rejected with 429 // so the third request has to be rejected with 429
func TestQueueingTooManyRequests(t *testing.T) { func TestQueueingTooManyRequests(t *testing.T) {
w := testSlowRequestProcessing(3, 1, 1, time.Minute) w := testSlowRequestProcessing("too many requests", 3, 1, 1, time.Minute)
if w.Code != 429 { if w.Code != 429 {
t.Fatal("QueueRequests should return immediately and return too many requests") t.Fatal("QueueRequests should return immediately and return too many requests")
......
...@@ -119,7 +119,7 @@ func (u *Upstream) configureRoutes() { ...@@ -119,7 +119,7 @@ func (u *Upstream) configureRoutes() {
) )
uploadAccelerateProxy := upload.Accelerate(path.Join(u.DocumentRoot, "uploads/tmp"), proxy) uploadAccelerateProxy := upload.Accelerate(path.Join(u.DocumentRoot, "uploads/tmp"), proxy)
ciAPIProxyQueue := queueing.QueueRequests(uploadAccelerateProxy, u.APILimit, u.APIQueueLimit, u.APIQueueTimeout) ciAPIProxyQueue := queueing.QueueRequests("ci_api_job_requests", uploadAccelerateProxy, u.APILimit, u.APIQueueLimit, u.APIQueueTimeout)
ciAPILongPolling := builds.RegisterHandler(ciAPIProxyQueue, redis.WatchKey, u.APICILongPollingDuration) ciAPILongPolling := builds.RegisterHandler(ciAPIProxyQueue, redis.WatchKey, u.APICILongPollingDuration)
u.Routes = []routeEntry{ u.Routes = []routeEntry{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment