ooni-probe-cli/internal/engine/experiment/webconnectivity/httpanalysis.go
Simone Basso 3b27780836
fix(webconnectivity): ignore any status code <= 0 (#579)
This diff changes the algorithm used by webconnectivity's
httpanalysis.go to ignore any status code <= 0 rather
than just ignoring the == 0 case.

Make sure we add test cases for when the control's status
code is negative rather than being zero.

While there, simplify code where boolean checks could be
more compact according to staticcheck.

Closes https://github.com/ooni/probe/issues/1825
2021-11-05 13:51:22 +01:00

253 lines
7.6 KiB
Go

package webconnectivity
import (
"reflect"
"regexp"
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/urlgetter"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity/internal"
"github.com/ooni/probe-cli/v3/internal/engine/model"
)
// HTTPAnalysisResult contains the results of the analysis performed on the
// client. We obtain it by comparing the measurement and the control.
type HTTPAnalysisResult struct {
BodyLengthMatch *bool `json:"body_length_match"`
BodyProportion float64 `json:"body_proportion"`
StatusCodeMatch *bool `json:"status_code_match"`
HeadersMatch *bool `json:"headers_match"`
TitleMatch *bool `json:"title_match"`
}
// Log logs the results of the analysis
func (har HTTPAnalysisResult) Log(logger model.Logger) {
logger.Infof("BodyLengthMatch: %+v", internal.BoolPointerToString(har.BodyLengthMatch))
logger.Infof("BodyProportion: %+v", har.BodyProportion)
logger.Infof("StatusCodeMatch: %+v", internal.BoolPointerToString(har.StatusCodeMatch))
logger.Infof("HeadersMatch: %+v", internal.BoolPointerToString(har.HeadersMatch))
logger.Infof("TitleMatch: %+v", internal.BoolPointerToString(har.TitleMatch))
}
// HTTPAnalysis performs follow-up analysis on the webconnectivity measurement by
// comparing the measurement test keys and the control.
func HTTPAnalysis(tk urlgetter.TestKeys, ctrl ControlResponse) (out HTTPAnalysisResult) {
out.BodyLengthMatch, out.BodyProportion = HTTPBodyLengthChecks(tk, ctrl)
out.StatusCodeMatch = HTTPStatusCodeMatch(tk, ctrl)
out.HeadersMatch = HTTPHeadersMatch(tk, ctrl)
out.TitleMatch = HTTPTitleMatch(tk, ctrl)
return
}
// HTTPBodyLengthChecks returns whether the measured body is reasonably
// long as much as the control body as well as the proportion between
// the two bodies. This check may return nil, nil when such a
// comparison would actually not be applicable.
func HTTPBodyLengthChecks(
tk urlgetter.TestKeys, ctrl ControlResponse) (match *bool, proportion float64) {
control := ctrl.HTTPRequest.BodyLength
if control <= 0 {
return
}
if len(tk.Requests) <= 0 {
return
}
response := tk.Requests[0].Response
if response.BodyIsTruncated {
return
}
measurement := int64(len(response.Body.Value))
if measurement <= 0 {
return
}
const bodyProportionFactor = 0.7
if measurement >= control {
proportion = float64(control) / float64(measurement)
} else {
proportion = float64(measurement) / float64(control)
}
v := proportion > bodyProportionFactor
match = &v
return
}
// HTTPStatusCodeMatch returns whether the status code of the measurement
// matches the status code of the control, or nil if such comparison
// is actually not applicable.
func HTTPStatusCodeMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
control := ctrl.HTTPRequest.StatusCode
if len(tk.Requests) < 1 {
return // no real status code
}
measurement := tk.Requests[0].Response.Code
if control <= 0 {
return // no real status code
}
if measurement <= 0 {
return // no real status code
}
value := control == measurement
if value {
// if the status codes are equal, they clearly match
out = &value
return
}
// This fix is part of Web Connectivity in MK and in Python since
// basically forever; my recollection is that we want to work around
// cases where the test helper is failing(?!). Unlike previous
// implementations, this implementation avoids a false positive
// when both measurement and control statuses are 500.
if control/100 == 5 {
return
}
out = &value
return
}
// HTTPHeadersMatch returns whether uncommon headers match between control and
// measurement, or nil if check is not applicable.
func HTTPHeadersMatch(tk urlgetter.TestKeys, ctrl ControlResponse) *bool {
if len(tk.Requests) <= 0 {
return nil
}
if tk.Requests[0].Response.Code <= 0 {
return nil
}
if ctrl.HTTPRequest.StatusCode <= 0 {
return nil
}
control := ctrl.HTTPRequest.Headers
// Implementation note: using map because we only care about the
// keys being different and we ignore the values.
measurement := tk.Requests[0].Response.Headers
const (
inMeasurement = 1 << 0
inControl = 1 << 1
inBoth = inMeasurement | inControl
)
commonHeaders := map[string]bool{
"date": true,
"content-type": true,
"server": true,
"cache-control": true,
"vary": true,
"set-cookie": true,
"location": true,
"expires": true,
"x-powered-by": true,
"content-encoding": true,
"last-modified": true,
"accept-ranges": true,
"pragma": true,
"x-frame-options": true,
"etag": true,
"x-content-type-options": true,
"age": true,
"via": true,
"p3p": true,
"x-xss-protection": true,
"content-language": true,
"cf-ray": true,
"strict-transport-security": true,
"link": true,
"x-varnish": true,
}
matching := make(map[string]int)
ours := make(map[string]bool)
for key := range measurement {
key = strings.ToLower(key)
if _, ok := commonHeaders[key]; !ok {
matching[key] |= inMeasurement
}
ours[key] = true
}
theirs := make(map[string]bool)
for key := range control {
key = strings.ToLower(key)
if _, ok := commonHeaders[key]; !ok {
matching[key] |= inControl
}
theirs[key] = true
}
// if they are equal we're done
if good := reflect.DeepEqual(ours, theirs); good {
return &good
}
// compute the intersection of uncommon headers
var intersection int
for _, value := range matching {
if (value & inBoth) == inBoth {
intersection++
}
}
good := intersection > 0
return &good
}
// GetTitle returns the title or an empty string.
func GetTitle(measurementBody string) string {
// MK used {1,128} but we're making it larger here to get longer titles
// e.g. <http://www.isa.gov.il/Pages/default.aspx>'s one
re := regexp.MustCompile(`(?i)<title>([^<]{1,512})</title>`)
v := re.FindStringSubmatch(measurementBody)
if len(v) < 2 {
return ""
}
return v[1]
}
// HTTPTitleMatch returns whether the measurement and the control titles
// reasonably match, or nil if not applicable.
func HTTPTitleMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
if len(tk.Requests) <= 0 {
return
}
response := tk.Requests[0].Response
if response.Code <= 0 {
return
}
if response.BodyIsTruncated {
return
}
if ctrl.HTTPRequest.StatusCode <= 0 {
return
}
control := ctrl.HTTPRequest.Title
measurementBody := response.Body.Value
measurement := GetTitle(measurementBody)
if measurement == "" {
return
}
const (
inMeasurement = 1 << 0
inControl = 1 << 1
inBoth = inMeasurement | inControl
)
words := make(map[string]int)
// We don't consider to match words that are shorter than 5
// characters (5 is the average word length for english)
//
// The original implementation considered the word order but
// considering different languages it seems we could have less
// false positives by ignoring the word order.
const minWordLength = 5
for _, word := range strings.Split(measurement, " ") {
if len(word) >= minWordLength {
words[strings.ToLower(word)] |= inMeasurement
}
}
for _, word := range strings.Split(control, " ") {
if len(word) >= minWordLength {
words[strings.ToLower(word)] |= inControl
}
}
good := true
for _, score := range words {
if (score & inBoth) != inBoth {
good = false
break
}
}
return &good
}