fix(oohelperd): metrics improvements after design review (#903)

This diff updates the metrics according to https://github.com/ooni/probe/issues/2183#issuecomment-1230327725
This commit is contained in:
Simone Basso 2022-08-29 16:36:46 +02:00 committed by GitHub
parent ffc2527fc5
commit 8c855ca597
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 24 deletions

View File

@ -48,38 +48,37 @@ var _ http.Handler = &handler{}
func (h *handler) ServeHTTP(w http.ResponseWriter, req *http.Request) { func (h *handler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
metricRequestsInflight.Inc() metricRequestsInflight.Inc()
defer metricRequestsInflight.Dec() defer metricRequestsInflight.Dec()
metricRequestsTotal.Inc()
w.Header().Add("Server", fmt.Sprintf( w.Header().Add("Server", fmt.Sprintf(
"oohelperd/%s ooniprobe-engine/%s", version.Version, version.Version, "oohelperd/%s ooniprobe-engine/%s", version.Version, version.Version,
)) ))
if req.Method != "POST" { if req.Method != "POST" {
metricRequestsByStatusCode.WithLabelValues("400").Inc() metricRequestsCount.WithLabelValues("400", "bad_request_method").Inc()
w.WriteHeader(400) w.WriteHeader(400)
return return
} }
reader := &io.LimitedReader{R: req.Body, N: h.MaxAcceptableBody} reader := &io.LimitedReader{R: req.Body, N: h.MaxAcceptableBody}
data, err := netxlite.ReadAllContext(req.Context(), reader) data, err := netxlite.ReadAllContext(req.Context(), reader)
if err != nil { if err != nil {
metricRequestsByStatusCode.WithLabelValues("400").Inc() metricRequestsCount.WithLabelValues("400", "request_body_too_large").Inc()
w.WriteHeader(400) w.WriteHeader(400)
return return
} }
var creq ctrlRequest var creq ctrlRequest
if err := json.Unmarshal(data, &creq); err != nil { if err := json.Unmarshal(data, &creq); err != nil {
metricRequestsByStatusCode.WithLabelValues("400").Inc() metricRequestsCount.WithLabelValues("400", "cannot_unmarshal_request_body").Inc()
w.WriteHeader(400) w.WriteHeader(400)
return return
} }
started := time.Now() started := time.Now()
cresp, err := measure(req.Context(), h, &creq) cresp, err := measure(req.Context(), h, &creq)
elapsed := time.Since(started) elapsed := time.Since(started)
metricMeasurementTime.Observe(float64(elapsed.Seconds())) metricWCTaskDurationSeconds.Observe(float64(elapsed.Seconds()))
if err != nil { if err != nil {
metricRequestsByStatusCode.WithLabelValues("400").Inc() metricRequestsCount.WithLabelValues("400", "measurement_failed").Inc()
w.WriteHeader(400) w.WriteHeader(400)
return return
} }
metricRequestsByStatusCode.WithLabelValues("200").Inc() metricRequestsCount.WithLabelValues("200", "ok").Inc()
// We assume that the following call cannot fail because it's a // We assume that the following call cannot fail because it's a
// clearly-serializable data structure. // clearly-serializable data structure.
data, err = json.Marshal(cresp) data, err = json.Marshal(cresp)

View File

@ -3,6 +3,8 @@ package main
// //
// Metrics definitions // Metrics definitions
// //
// See https://github.com/ooni/probe/issues/2183#issuecomment-1230327725
//
import ( import (
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
@ -10,30 +12,26 @@ import (
) )
var ( var (
// metricRequestsTotal counts the total number of requests // metricRequestsCount counts the number of requests we served.
metricRequestsTotal = promauto.NewCounter(prometheus.CounterOpts{ metricRequestsCount = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "oohelperd_requests_total", Name: "oohelperd_requests_count",
Help: "The total number of processed requests", Help: "Total number of processed requests",
}) }, []string{"code", "reason"})
// metricRequestsByStatusCode counts the number of requests that // metricRequestsInflight gauges the number of requests currently inflight.
// have returned a given status code to the caller.
metricRequestsByStatusCode = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "oohelperd_requests_by_status_code",
Help: "Total number of processed requests by status code",
}, []string{"code"})
// metricRequestsInflight counts the number of requests currently inflight.
metricRequestsInflight = promauto.NewGauge(prometheus.GaugeOpts{ metricRequestsInflight = promauto.NewGauge(prometheus.GaugeOpts{
Name: "oohelperd_requests_inflight", Name: "oohelperd_requests_inflight_gauge",
Help: "The number or requests currently inflight", Help: "The number or requests currently inflight",
}) })
// metricMeasurementTime summarizes the time to perform a measurement. // metricWCTaskDurationSeconds summarizes the duration of the web connectivity measurement task.
metricMeasurementTime = promauto.NewSummary(prometheus.SummaryOpts{ metricWCTaskDurationSeconds = promauto.NewSummary(prometheus.SummaryOpts{
Name: "oohelperd_measurement_time", Name: "oohelperd_wctask_duration_seconds",
Help: "Summarizes the time to perform a test-helper measurement (in seconds)", Help: "Summarizes the time to perform a test-helper measurement (in seconds)",
// See https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/ // See https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/
//
// TODO(bassosimone,FedericoCeratto): investigate whether using
// a shorter-than-10m observation interval is better for us
Objectives: map[float64]float64{ Objectives: map[float64]float64{
0.25: 0.010, // 0.240 <= φ <= 0.260 0.25: 0.010, // 0.240 <= φ <= 0.260
0.5: 0.010, // 0.490 <= φ <= 0.510 0.5: 0.010, // 0.490 <= φ <= 0.510