ooni-probe-cli/internal/experiment/webconnectivity/analysishttpdiff.go

package webconnectivity

//
// HTTP diff analysis
//

import (
	"net/url"
	"reflect"
	"strings"

	"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity"
	"github.com/ooni/probe-cli/v3/internal/measurexlite"
	"github.com/ooni/probe-cli/v3/internal/model"
	"github.com/ooni/probe-cli/v3/internal/runtimex"
)

// analysisHTTPDiff compares the final request-response pair observed by the
// probe with the result returned by the TH and updates tk.BlockingFlags with
// either analysisFlagSuccess or analysisFlagHTTPDiff. The caller is responsible
// for passing a probe observation and a TH observation that both have a nil
// failure; this contract is checked through runtimex.PanicIfTrue.
func (tk *TestKeys) analysisHTTPDiff(logger model.Logger,
	probe *model.ArchivalHTTPRequestResult, th *webconnectivity.ControlHTTPRequestResult) {
	// verify that the caller honoured the contract documented above
	bothSucceeded := probe.Failure == nil && th.Failure == nil
	runtimex.PanicIfTrue(
		!bothSucceeded,
		"the caller should have passed us successful HTTP observations",
	)

	// A successful HTTPS exchange is accepted without any comparison, under
	// the assumption that using TLS is enough to consider the result good.
	parsed, err := url.Parse(probe.Request.URL)
	if err != nil {
		return // looks like a bug
	}
	if parsed.Scheme == "https" {
		logger.Infof("HTTP: HTTPS && no error => #%d is successful", probe.TransactionID)
		tk.BlockingFlags |= analysisFlagSuccess
		return
	}

	// Run the original HTTP diff algorithm, adapted for this implementation.
	// Each helper fills in the corresponding *Match field of tk (or leaves it
	// nil when the comparison cannot be performed).
	tk.httpDiffBodyLengthChecks(probe, th)
	tk.httpDiffStatusCodeMatch(probe, th)
	tk.httpDiffHeadersMatch(probe, th)
	tk.httpDiffTitleMatch(probe, th)

	if tk.StatusCodeMatch == nil || !*tk.StatusCodeMatch {
		logger.Infof("HTTP: status code: MISMATCH (see #%d)", probe.TransactionID)
	} else {
		// With matching status codes, a single additional match (tried in
		// this order: body length, uncommon headers, title) is enough to
		// declare success. Every failed attempt is logged before moving on.
		secondary := []struct {
			matched    *bool
			onMatch    string
			onMismatch string
		}{{
			matched:    tk.BodyLengthMatch,
			onMatch:    "HTTP: statusCodeMatch && bodyLengthMatch => #%d is successful",
			onMismatch: "HTTP: body length: MISMATCH (see #%d)",
		}, {
			matched:    tk.HeadersMatch,
			onMatch:    "HTTP: statusCodeMatch && headersMatch => #%d is successful",
			onMismatch: "HTTP: uncommon headers: MISMATCH (see #%d)",
		}, {
			matched:    tk.TitleMatch,
			onMatch:    "HTTP: statusCodeMatch && titleMatch => #%d is successful",
			onMismatch: "HTTP: title: MISMATCH (see #%d)",
		}}
		for _, check := range secondary {
			if check.matched != nil && *check.matched {
				logger.Infof(check.onMatch, probe.TransactionID)
				tk.BlockingFlags |= analysisFlagSuccess
				return
			}
			logger.Infof(check.onMismatch, probe.TransactionID)
		}
	}

	// nothing allowed us to declare success: flag this as an HTTP diff
	tk.BlockingFlags |= analysisFlagHTTPDiff
	logger.Warnf("HTTP: it seems #%d is a case of httpDiff", probe.TransactionID)
}

// httpDiffBodyLengthChecks compares the length of the body seen by the probe
// with the body length reported by the TH and stores the outcome in
// tk.BodyLengthMatch. The field is left untouched when either length is not
// positive or when the probe body was truncated.
func (tk *TestKeys) httpDiffBodyLengthChecks(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	thLength := ctrl.BodyLength
	if thLength <= 0 {
		return // the TH did not report a usable length
	}
	if probe.Response.BodyIsTruncated {
		return // a truncated body does not tell us the real length
	}
	probeLength := int64(len(probe.Response.Body.Value))
	if probeLength <= 0 {
		return // the probe has no usable length either
	}

	// Divide the smaller length by the larger one so that the ratio always
	// falls in (0, 1] regardless of which side has the bigger body.
	smaller, larger := probeLength, thLength
	if smaller > larger {
		smaller, larger = larger, smaller
	}
	ratio := float64(smaller) / float64(larger)

	// the lengths match when the ratio is strictly above this threshold
	const minRatio = 0.7
	matches := ratio > minRatio
	tk.BodyLengthMatch = &matches
}

// httpDiffStatusCodeMatch compares the status code seen by the probe with the
// one reported by the TH and stores the outcome in tk.StatusCodeMatch. The
// field is left untouched when either status code is not positive, or when
// the codes differ and the TH status code is not 2xx.
func (tk *TestKeys) httpDiffStatusCodeMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	thCode, probeCode := ctrl.StatusCode, probe.Response.Code
	if thCode <= 0 || probeCode <= 0 {
		return // at least one side has no real status code
	}
	same := thCode == probeCode

	// We only record a result when the two codes are equal or when the TH
	// got a 2xx. In other words, we skip the comparison if it seems the TH
	// failed _and_ the two status codes are not equal.
	//
	// Originally, this algorithm was
	// https://github.com/measurement-kit/measurement-kit/blob/b55fbecb205be62c736249b689df0c45ae342804/src/libmeasurement_kit/ooni/web_connectivity.cpp#L60
	// and excluded the case where the TH failed with 5xx.
	//
	// Then, while implementing websteps, we discovered a bunch of control
	// failure modes that suggested being more cautious. See
	// https://github.com/bassosimone/websteps-illustrated/blob/632f27443ab9d94fb05efcf5e0b0c1ce190221e2/internal/engine/experiment/websteps/analysisweb.go#L137.
	//
	// However, it does not make sense to skip the comparison when the TH
	// and the probe failed in the same way. See
	// https://github.com/ooni/probe/issues/2287, which refers to a
	// measurement where both the probe and the TH fail with 404, but we
	// fail to say "status_code_match = true".
	//
	// See https://explorer.ooni.org/measurement/20220911T203447Z_webconnectivity_IT_30722_n1_YDZQZOHAziEJk6o9?input=http%3A%2F%2Fwww.webbox.com%2Findex.php
	// for a measurement where this was fixed.
	thSucceeded := thCode/100 == 2
	if same || thSucceeded {
		tk.StatusCodeMatch = &same
	}
}

// httpDiffHeadersMatch compares the header names of the probe response with
// those of the TH response and stores the outcome in tk.HeadersMatch. Only
// the names matter: they are lowercased before being compared and the header
// values are ignored. The field is left untouched when either side has no
// headers at all.
func (tk *TestKeys) httpDiffHeadersMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	thHeaders, probeHeaders := ctrl.Headers, probe.Response.Headers
	if len(thHeaders) == 0 || len(probeHeaders) == 0 {
		return
	}

	// bit flags remembering which side(s) sent a given uncommon header
	const (
		fromProbe = 1 << iota
		fromTH
		fromBoth = fromProbe | fromTH
	)

	// isCommon tells whether a lowercase header name is one of the headers
	// that are too widespread to say anything about the page being served.
	isCommon := func(name string) bool {
		switch name {
		case "date", "content-type", "server", "cache-control", "vary",
			"set-cookie", "location", "expires", "x-powered-by",
			"content-encoding", "last-modified", "accept-ranges", "pragma",
			"x-frame-options", "etag", "x-content-type-options", "age",
			"via", "p3p", "x-xss-protection", "content-language", "cf-ray",
			"strict-transport-security", "link", "x-varnish":
			return true
		}
		return false
	}

	// Implementation note: we use maps keyed by header name because we
	// only care about which names are present, never about their values.
	uncommon := make(map[string]int)
	record := func(name string, origin int, seen map[string]bool) {
		name = strings.ToLower(name)
		seen[name] = true
		if !isCommon(name) {
			uncommon[name] |= origin
		}
	}
	probeNames := make(map[string]bool)
	for name := range probeHeaders {
		record(name, fromProbe, probeNames)
	}
	thNames := make(map[string]bool)
	for name := range thHeaders {
		record(name, fromTH, thNames)
	}

	// identical sets of header names are a match, no need to look further
	if reflect.DeepEqual(probeNames, thNames) {
		identical := true
		tk.HeadersMatch = &identical
		return
	}

	// otherwise we have a match when at least one uncommon header has been
	// sent by both the probe's server and the TH's server
	shared := false
	for _, origins := range uncommon {
		if origins&fromBoth == fromBoth {
			shared = true
			break
		}
	}
	tk.HeadersMatch = &shared
}

// httpDiffTitleMatch compares the title extracted from the probe response
// body with the title reported by the TH and stores the outcome in
// tk.TitleMatch. The field is left untouched when either status code is not
// positive, when the probe body was truncated, or when either title is empty.
func (tk *TestKeys) httpDiffTitleMatch(
	probe *model.ArchivalHTTPRequestResult, ctrl *webconnectivity.ControlHTTPRequestResult) {
	resp := probe.Response
	if resp.Code <= 0 || resp.BodyIsTruncated || ctrl.StatusCode <= 0 {
		return
	}
	thTitle := ctrl.Title
	probeTitle := measurexlite.WebGetTitle(resp.Body.Value)
	if thTitle == "" || probeTitle == "" {
		return
	}

	// bit flags remembering which title(s) contain a given word
	const (
		fromProbe = 1 << iota
		fromTH
		fromBoth = fromProbe | fromTH
	)

	// Words shorter than 5 are not considered for matching (5 is the
	// average word length for english). Note that the length is measured
	// in bytes, since we use len on a string.
	//
	// The original implementation took the word order into account, but
	// given that we deal with different languages, ignoring the word
	// order seems to lead to fewer false positives.
	const minWordLength = 5
	seen := make(map[string]int)
	mark := func(title string, origin int) {
		for _, word := range strings.Split(title, " ") {
			if len(word) < minWordLength {
				continue
			}
			seen[strings.ToLower(word)] |= origin
		}
	}
	mark(probeTitle, fromProbe)
	mark(thTitle, fromTH)

	// The titles match when every long-enough word appears in both of
	// them. This also holds when no word is long enough.
	matches := true
	for _, origins := range seen {
		if origins&fromBoth != fromBoth {
			matches = false
			break
		}
	}
	tk.TitleMatch = &matches
}