ooni-probe-cli/internal/experiment/webconnectivity/analysishttpdiff.go
Simone Basso 5e76c6ec92
refactor(webconnectivity@v0.5): improve logging clarity (#964)
We're bumping the experiment's version number because we changed the name of the field used to contain late/duplicate DNS responses. We have also changed the algorithm to determine `#dnsDiff`. However, the change should only impact how we log this information. Overall, here the idea is to provide users with a reasonably clear explanation of how the probe maps observations to blocking and accessible using expected/unexpected as the conceptual framework.

Part of https://github.com/ooni/probe/issues/2237
2022-09-15 07:03:53 +02:00

270 lines
8.5 KiB
Go

package webconnectivity
//
// HTTP diff analysis
//
import (
"net/url"
"reflect"
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity"
"github.com/ooni/probe-cli/v3/internal/measurexlite"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/runtimex"
)
// analysisHTTPDiff compares the final request-response pair observed by the
// probe with the corresponding result obtained by the TH and records the
// outcome into tk.BlockingFlags.
//
// Contract: both probe.Failure and th.Failure must be nil. This precondition
// is checked with runtimex.PanicIfTrue.
//
// Outcome:
//
//   - if probe.Request.URL cannot be parsed, nothing is recorded;
//
//   - if the URL scheme is "https", analysisFlagSuccess is set without
//     performing any comparison;
//
//   - otherwise the body length, status code, headers and title checks are
//     run. When the status code matches and at least one among body length,
//     headers or title also matches, analysisFlagSuccess is set; in any
//     other case analysisFlagHTTPDiff is set.
func (tk *TestKeys) analysisHTTPDiff(logger model.Logger,
	probe *model.ArchivalHTTPRequestResult, th *webconnectivity.ControlHTTPRequestResult) {
	// enforce the precondition documented above
	bothSucceeded := probe.Failure == nil && th.Failure == nil
	runtimex.PanicIfTrue(
		!bothSucceeded,
		"the caller should have passed us successful HTTP observations",
	)

	txid := probe.TransactionID

	// With HTTPS we skip the comparison altogether: the working assumption
	// is that a request that succeeded over TLS is good.
	parsed, err := url.Parse(probe.Request.URL)
	if err != nil {
		return // looks like a bug
	}
	if parsed.Scheme == "https" {
		logger.Infof("HTTP: HTTPS && no error => #%d is successful", txid)
		tk.BlockingFlags |= analysisFlagSuccess
		return
	}

	// Run the original HTTP diff algorithm, as adapted for this
	// implementation. Each call may set the corresponding tk field.
	tk.httpDiffBodyLengthChecks(probe, th)
	tk.httpDiffStatusCodeMatch(probe, th)
	tk.httpDiffHeadersMatch(probe, th)
	tk.httpDiffTitleMatch(probe, th)

	if tk.StatusCodeMatch == nil || !*tk.StatusCodeMatch {
		logger.Infof("HTTP: status code: MISMATCH (see #%d)", txid)
	} else {
		// The status code matches: the first of the following checks
		// that also matches is enough to declare success. We log each
		// mismatch we encounter along the way.
		fallbacks := []struct {
			matched    *bool
			onMatch    string
			onMismatch string
		}{
			{
				matched:    tk.BodyLengthMatch,
				onMatch:    "HTTP: statusCodeMatch && bodyLengthMatch => #%d is successful",
				onMismatch: "HTTP: body length: MISMATCH (see #%d)",
			},
			{
				matched:    tk.HeadersMatch,
				onMatch:    "HTTP: statusCodeMatch && headersMatch => #%d is successful",
				onMismatch: "HTTP: uncommon headers: MISMATCH (see #%d)",
			},
			{
				matched:    tk.TitleMatch,
				onMatch:    "HTTP: statusCodeMatch && titleMatch => #%d is successful",
				onMismatch: "HTTP: title: MISMATCH (see #%d)",
			},
		}
		for _, fb := range fallbacks {
			if fb.matched != nil && *fb.matched {
				logger.Infof(fb.onMatch, txid)
				tk.BlockingFlags |= analysisFlagSuccess
				return
			}
			logger.Infof(fb.onMismatch, txid)
		}
	}

	tk.BlockingFlags |= analysisFlagHTTPDiff
	logger.Warnf("HTTP: it seems #%d is a case of httpDiff", txid)
}
// httpDiffBodyLengthChecks compares the length of the body fetched by the
// probe with the body length reported by the TH.
//
// tk.BodyLengthMatch is left untouched when the TH length is not positive,
// when the probe body is flagged as truncated, or when the probe body is
// empty. Otherwise it is set to true if the ratio between the smaller and
// the larger length is strictly greater than 0.7, and to false otherwise.
func (tk *TestKeys) httpDiffBodyLengthChecks(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	// minimum smaller/larger ratio for two lengths to be considered matching
	const bodyProportionFactor = 0.7

	thLength := ctrl.BodyLength
	if thLength <= 0 {
		return // the TH did not report a usable length
	}
	if probe.Response.BodyIsTruncated {
		return // the length of a truncated body is not meaningful
	}
	probeLength := int64(len(probe.Response.Body.Value))
	if probeLength <= 0 {
		return // the probe has no body to compare
	}

	// Always divide the smaller length by the larger one so that the
	// resulting ratio falls in the (0, 1] range.
	smaller, larger := probeLength, thLength
	if smaller > larger {
		smaller, larger = larger, smaller
	}
	matches := float64(smaller)/float64(larger) > bodyProportionFactor
	tk.BodyLengthMatch = &matches
}
// httpDiffStatusCodeMatch compares the status code seen by the probe with
// the one seen by the TH.
//
// tk.StatusCodeMatch is left untouched when either status code is not
// positive, or when the codes differ and the TH code is not 2xx. Otherwise
// it is set to whether the two codes are equal.
func (tk *TestKeys) httpDiffStatusCodeMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	thCode, probeCode := ctrl.StatusCode, probe.Response.Code
	if thCode <= 0 || probeCode <= 0 {
		return // at least one side has no real status code
	}

	// We record a result in two cases only: the codes are equal, or the
	// TH got a 2xx. When the TH seems to have failed _and_ the codes are
	// different, we refrain from comparing.
	//
	// Originally, this algorithm was
	// https://github.com/measurement-kit/measurement-kit/blob/b55fbecb205be62c736249b689df0c45ae342804/src/libmeasurement_kit/ooni/web_connectivity.cpp#L60
	// and excluded the case where the TH failed with 5xx.
	//
	// Then, while implementing websteps, we discovered a bunch of
	// control failure modes that suggested being more cautious. See
	// https://github.com/bassosimone/websteps-illustrated/blob/632f27443ab9d94fb05efcf5e0b0c1ce190221e2/internal/engine/experiment/websteps/analysisweb.go#L137.
	//
	// However, it makes little sense to avoid the comparison when
	// the TH and the probe both failed in the same way. See
	// https://github.com/ooni/probe/issues/2287, which refers to a
	// measurement where both the probe and the TH fail with 404, but
	// we fail to say "status_code_match = true".
	//
	// See https://explorer.ooni.org/measurement/20220911T203447Z_webconnectivity_IT_30722_n1_YDZQZOHAziEJk6o9?input=http%3A%2F%2Fwww.webbox.com%2Findex.php
	// for a measurement where this was fixed.
	match := thCode == probeCode
	if match || thCode/100 == 2 {
		tk.StatusCodeMatch = &match
	}
}
// httpDiffHeadersMatch compares the names of the response headers seen by
// the probe with the names of those seen by the TH. Names are compared after
// lowercasing; header values are never inspected.
//
// tk.HeadersMatch is left untouched when either side has no headers.
// Otherwise it is set to true when the two sets of names are identical or
// when at least one uncommon header name appears on both sides, and to
// false in any other case.
func (tk *TestKeys) httpDiffHeadersMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	thHeaders := ctrl.Headers
	probeHeaders := probe.Response.Headers
	if len(thHeaders) == 0 {
		return
	}
	if len(probeHeaders) == 0 {
		return
	}

	// bit flags tracking on which side a given header name was seen
	const (
		inMeasurement = 1 << 0
		inControl     = 1 << 1
		inBoth        = inMeasurement | inControl
	)

	// isCommon tells whether a lowercase header name belongs to the list
	// of headers excluded from the uncommon headers intersection.
	isCommon := func(name string) bool {
		switch name {
		case "date",
			"content-type",
			"server",
			"cache-control",
			"vary",
			"set-cookie",
			"location",
			"expires",
			"x-powered-by",
			"content-encoding",
			"last-modified",
			"accept-ranges",
			"pragma",
			"x-frame-options",
			"etag",
			"x-content-type-options",
			"age",
			"via",
			"p3p",
			"x-xss-protection",
			"content-language",
			"cf-ray",
			"strict-transport-security",
			"link",
			"x-varnish":
			return true
		}
		return false
	}

	// Implementation note: we use maps because we only care about which
	// header names are present and we ignore their values.
	uncommon := make(map[string]int)
	record := func(name string, origin int, names map[string]bool) {
		name = strings.ToLower(name)
		names[name] = true
		if !isCommon(name) {
			uncommon[name] |= origin
		}
	}
	probeNames := make(map[string]bool)
	for name := range probeHeaders {
		record(name, inMeasurement, probeNames)
	}
	thNames := make(map[string]bool)
	for name := range thHeaders {
		record(name, inControl, thNames)
	}

	// Identical sets of names are a match. Failing that, a single
	// uncommon header name seen on both sides is enough.
	matched := reflect.DeepEqual(probeNames, thNames)
	if !matched {
		for _, origin := range uncommon {
			if origin&inBoth == inBoth {
				matched = true
				break
			}
		}
	}
	tk.HeadersMatch = &matched
}
// httpDiffTitleMatch compares the title extracted from the body fetched by
// the probe (using measurexlite.WebGetTitle) with the title reported by
// the TH.
//
// tk.TitleMatch is left untouched when the probe status code is not
// positive, the probe body is flagged as truncated, the TH status code is
// not positive, or either title is empty. Otherwise it is set to true if
// every word of at least five bytes appearing in one title also appears,
// ignoring case, in the other one, and to false otherwise. Note that, when
// neither title contains such a word, the result is true.
func (tk *TestKeys) httpDiffTitleMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	if probe.Response.Code <= 0 || probe.Response.BodyIsTruncated || ctrl.StatusCode <= 0 {
		return
	}
	thTitle := ctrl.Title
	probeTitle := measurexlite.WebGetTitle(probe.Response.Body.Value)
	if thTitle == "" || probeTitle == "" {
		return
	}

	// bit flags tracking in which title a given word was seen
	const (
		inMeasurement = 1 << 0
		inControl     = 1 << 1
		inBoth        = inMeasurement | inControl
	)

	// We do not try to match words shorter than 5 characters (5 is
	// the average word length for english). The length we check here
	// is the one returned by len, that is, the number of bytes.
	//
	// The original implementation considered the word order but,
	// considering different languages, it seems we could have fewer
	// false positives by ignoring the word order.
	const minWordLength = 5

	seen := make(map[string]int)
	tally := func(title string, origin int) {
		for _, word := range strings.Split(title, " ") {
			if len(word) < minWordLength {
				continue
			}
			seen[strings.ToLower(word)] |= origin
		}
	}
	tally(probeTitle, inMeasurement)
	tally(thTitle, inControl)

	// a single word seen in just one of the two titles is a mismatch
	allShared := true
	for _, origin := range seen {
		if origin&inBoth != inBoth {
			allShared = false
			break
		}
	}
	tk.TitleMatch = &allShared
}