ooni-probe-cli/internal/engine/experiment/webconnectivity/httpanalysis.go
Simone Basso 273b70bacc
refactor: interfaces and data types into the model package (#642)
## Checklist

- [x] I have read the [contribution guidelines](https://github.com/ooni/probe-cli/blob/master/CONTRIBUTING.md)
- [x] reference issue for this pull request: https://github.com/ooni/probe/issues/1885
- [x] related ooni/spec pull request: N/A

Location of the issue tracker: https://github.com/ooni/probe

## Description

This PR contains a set of changes to move important interfaces and data types into the `./internal/model` package.

The criterion for including an interface or data type here is roughly that the type should be important and used by several packages. We are especially interested in moving more interfaces here to increase modularity.

An additional side effect is that, by reading this package, one should be able to understand more quickly how different parts of the codebase interact with each other.

This is what I want to move into `internal/model`:

- [x] most important interfaces from `internal/netxlite`
- [x] everything that was previously part of `internal/engine/model`
- [x] mocks from `internal/netxlite/mocks` should also be moved in here as a subpackage
2022-01-03 13:53:23 +01:00

253 lines
7.5 KiB
Go

package webconnectivity
import (
"reflect"
"regexp"
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/urlgetter"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity/internal"
"github.com/ooni/probe-cli/v3/internal/model"
)
// HTTPAnalysisResult contains the results of the analysis performed on the
// client. We obtain it by comparing the measurement and the control.
//
// Every pointer field is nil when the corresponding comparison was not
// applicable (for example, because there is no measured request or the
// control did not report a usable value).
type HTTPAnalysisResult struct {
// BodyLengthMatch tells whether the measured and control body lengths
// are close enough, as computed by HTTPBodyLengthChecks.
BodyLengthMatch *bool `json:"body_length_match"`
// BodyProportion is the ratio between the smaller and the larger of the
// two body lengths; it stays zero when the check was not applicable.
BodyProportion float64 `json:"body_proportion"`
// StatusCodeMatch is the outcome of HTTPStatusCodeMatch.
StatusCodeMatch *bool `json:"status_code_match"`
// HeadersMatch is the outcome of HTTPHeadersMatch.
HeadersMatch *bool `json:"headers_match"`
// TitleMatch is the outcome of HTTPTitleMatch.
TitleMatch *bool `json:"title_match"`
}
// Log writes every field of the analysis result to the given logger using
// one Infof call per field. Pointer fields are converted to text with
// internal.BoolPointerToString before being logged.
func (har HTTPAnalysisResult) Log(logger model.Logger) {
	render := internal.BoolPointerToString
	logger.Infof("BodyLengthMatch: %+v", render(har.BodyLengthMatch))
	logger.Infof("BodyProportion: %+v", har.BodyProportion)
	logger.Infof("StatusCodeMatch: %+v", render(har.StatusCodeMatch))
	logger.Infof("HeadersMatch: %+v", render(har.HeadersMatch))
	logger.Infof("TitleMatch: %+v", render(har.TitleMatch))
}
// HTTPAnalysis runs all the HTTP comparisons between the measurement test
// keys and the control response, and collects their outcomes into a single
// HTTPAnalysisResult. The checks run in this order: body length, status
// code, headers, title.
func HTTPAnalysis(tk urlgetter.TestKeys, ctrl ControlResponse) (out HTTPAnalysisResult) {
	lengthMatch, lengthProportion := HTTPBodyLengthChecks(tk, ctrl)
	return HTTPAnalysisResult{
		BodyLengthMatch: lengthMatch,
		BodyProportion:  lengthProportion,
		StatusCodeMatch: HTTPStatusCodeMatch(tk, ctrl),
		HeadersMatch:    HTTPHeadersMatch(tk, ctrl),
		TitleMatch:      HTTPTitleMatch(tk, ctrl),
	}
}
// HTTPBodyLengthChecks compares the length of the first measured response
// body with the body length reported by the control.
//
// The returned proportion is the smaller length divided by the larger one,
// and match points to true when that proportion is strictly greater than
// 0.7. The function returns a nil match and a zero proportion when the
// comparison is not applicable: the control body length is not positive,
// there are no measured requests, the measured body is truncated, or the
// measured body is empty.
func HTTPBodyLengthChecks(
	tk urlgetter.TestKeys, ctrl ControlResponse) (match *bool, proportion float64) {
	// Ratios at or below this threshold are reported as a mismatch.
	const bodyProportionFactor = 0.7
	expected := ctrl.HTTPRequest.BodyLength
	if expected <= 0 || len(tk.Requests) == 0 {
		return nil, 0
	}
	observed := tk.Requests[0].Response
	if observed.BodyIsTruncated {
		// A truncated body has an unreliable length, so we do not compare.
		return nil, 0
	}
	got := int64(len(observed.Body.Value))
	if got <= 0 {
		return nil, 0
	}
	// Always divide the smaller length by the larger one so that the
	// proportion never exceeds one.
	smaller, larger := got, expected
	if smaller > larger {
		smaller, larger = larger, smaller
	}
	proportion = float64(smaller) / float64(larger)
	similar := proportion > bodyProportionFactor
	return &similar, proportion
}
// HTTPStatusCodeMatch compares the status code of the first measured
// response with the status code reported by the control.
//
// It returns nil when the comparison is not applicable: there are no
// measured requests, either status code is not positive, or the codes
// differ while the control status is in the 5xx class. Otherwise it
// returns a pointer to whether the two codes are equal.
func HTTPStatusCodeMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
	if len(tk.Requests) == 0 {
		return nil // nothing was measured
	}
	expected := ctrl.HTTPRequest.StatusCode
	observed := tk.Requests[0].Response.Code
	if expected <= 0 || observed <= 0 {
		return nil // at least one side has no real status code
	}
	same := expected == observed
	// Historical workaround inherited from the MK and Python versions of
	// Web Connectivity: a 5xx control status is treated as a possibly
	// failing test helper, so a mismatch against it is inconclusive.
	// Equal codes (e.g., 500 on both sides) are still reported as a match,
	// which avoids the false positive of earlier implementations.
	if !same && expected/100 == 5 {
		return nil
	}
	return &same
}
// HTTPHeadersMatch compares the header names of the first measured response
// with the header names reported by the control. Only the (lowercased)
// names are compared; header values are ignored.
//
// It returns nil when the check is not applicable: there are no measured
// requests, or either the measured or the control status code is not
// positive. It returns true when both sides have exactly the same set of
// header names. Otherwise it returns whether at least one header outside
// the list of very common headers is present on both sides.
func HTTPHeadersMatch(tk urlgetter.TestKeys, ctrl ControlResponse) *bool {
	if len(tk.Requests) == 0 ||
		tk.Requests[0].Response.Code <= 0 ||
		ctrl.HTTPRequest.StatusCode <= 0 {
		return nil
	}

	// Bit flags recording on which side an uncommon header was seen.
	const (
		seenByProbe = 1 << iota
		seenByControl
		seenByBoth = seenByProbe | seenByControl
	)

	// Headers listed here do not count as evidence that the two responses
	// come from the same server.
	wellKnown := map[string]bool{
		"date":                      true,
		"content-type":              true,
		"server":                    true,
		"cache-control":             true,
		"vary":                      true,
		"set-cookie":                true,
		"location":                  true,
		"expires":                   true,
		"x-powered-by":              true,
		"content-encoding":          true,
		"last-modified":             true,
		"accept-ranges":             true,
		"pragma":                    true,
		"x-frame-options":           true,
		"etag":                      true,
		"x-content-type-options":    true,
		"age":                       true,
		"via":                       true,
		"p3p":                       true,
		"x-xss-protection":          true,
		"content-language":          true,
		"cf-ray":                    true,
		"strict-transport-security": true,
		"link":                      true,
		"x-varnish":                 true,
	}

	var (
		probeNames   = make(map[string]bool) // every header name we measured
		controlNames = make(map[string]bool) // every header name in the control
		uncommon     = make(map[string]int)  // uncommon name -> side flags
	)
	for name := range tk.Requests[0].Response.Headers {
		lowered := strings.ToLower(name)
		probeNames[lowered] = true
		if !wellKnown[lowered] {
			uncommon[lowered] |= seenByProbe
		}
	}
	for name := range ctrl.HTTPRequest.Headers {
		lowered := strings.ToLower(name)
		controlNames[lowered] = true
		if !wellKnown[lowered] {
			uncommon[lowered] |= seenByControl
		}
	}

	// Identical sets of header names are a match regardless of whether
	// any uncommon header exists.
	if reflect.DeepEqual(probeNames, controlNames) {
		identical := true
		return &identical
	}

	// Otherwise a single uncommon header seen on both sides is enough.
	for _, flags := range uncommon {
		if flags == seenByBoth {
			shared := true
			return &shared
		}
	}
	shared := false
	return &shared
}
// getTitleRegexp matches an HTML <title> element (case insensitively) whose
// content is between 1 and 512 bytes long and contains no '<'.
//
// MK used {1,128} but we're making it larger here to get longer titles
// e.g. <http://www.isa.gov.il/Pages/default.aspx>'s one.
//
// The pattern is compiled once at package initialization rather than on
// every GetTitle call; a compiled regexp is safe for concurrent use.
var getTitleRegexp = regexp.MustCompile(`(?i)<title>([^<]{1,512})</title>`)

// GetTitle returns the content of the first <title> element found in
// measurementBody, or an empty string when no title matches (no element,
// empty title, title longer than 512 bytes, or an opening tag that is not
// exactly <title>).
func GetTitle(measurementBody string) string {
	match := getTitleRegexp.FindStringSubmatch(measurementBody)
	if len(match) < 2 {
		return ""
	}
	return match[1]
}
// HTTPTitleMatch compares the title extracted from the first measured
// response body with the title reported by the control.
//
// It returns nil when the check is not applicable: there are no measured
// requests, the measured or control status code is not positive, the
// measured body is truncated, or no title could be extracted from the
// measured body. Otherwise it returns true when every sufficiently long
// word appears in both titles (this includes the case where neither title
// has such a word), and false as soon as one such word appears on one
// side only.
func HTTPTitleMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
	if len(tk.Requests) == 0 {
		return nil
	}
	probe := tk.Requests[0].Response
	if probe.Code <= 0 || probe.BodyIsTruncated || ctrl.HTTPRequest.StatusCode <= 0 {
		return nil
	}
	probeTitle := GetTitle(probe.Body.Value)
	if probeTitle == "" {
		return nil
	}

	const (
		// Words shorter than this are ignored (5 is the average word
		// length for English). The length is measured in bytes.
		minWordLength = 5

		// Bit flags recording which title contained a given word.
		fromProbe   = 1 << 0
		fromControl = 1 << 1
		fromBoth    = fromProbe | fromControl
	)

	// The original implementation considered the word order; ignoring it
	// seems to produce fewer false positives across different languages,
	// so we only track which side each lowercased word belongs to.
	origins := make(map[string]int)
	tally := func(title string, side int) {
		for _, word := range strings.Split(title, " ") {
			if len(word) < minWordLength {
				continue
			}
			origins[strings.ToLower(word)] |= side
		}
	}
	tally(probeTitle, fromProbe)
	tally(ctrl.HTTPRequest.Title, fromControl)

	for _, sides := range origins {
		if sides != fromBoth {
			differ := false
			return &differ
		}
	}
	agree := true
	return &agree
}