ooni-probe-cli/internal/engine/experiment/webconnectivity/httpanalysis.go
Simone Basso d57c78bc71
chore: merge probe-engine into probe-cli (#201)
This is how I did it:

1. `git clone https://github.com/ooni/probe-engine internal/engine`

2. ```
(cd internal/engine && git describe --tags)
v0.23.0
```

3. `nvim go.mod` (merging `go.mod` with `internal/engine/go.mod`)

4. `rm -rf internal/.git internal/engine/go.{mod,sum}`

5. `git add internal/engine`

6. `find . -type f -name \*.go -exec sed -i 's@/ooni/probe-engine@/ooni/probe-cli/v3/internal/engine@g' {} \;`

7. `go build ./...` (passes)

8. `go test -race ./...` (temporary failure on RiseupVPN)

9. `go mod tidy`

10. this commit message

Once this piece of work is done, we can build a new version of `ooniprobe` that
uses `internal/engine` directly. We need to do more work to ensure that all the
other functionality in `probe-engine` (e.g. making mobile packages) still works as intended (WAI).

Part of https://github.com/ooni/probe/issues/1335
2021-02-02 12:05:47 +01:00

251 lines
7.4 KiB
Go

package webconnectivity
import (
"reflect"
"regexp"
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/urlgetter"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity/internal"
"github.com/ooni/probe-cli/v3/internal/engine/model"
)
// HTTPAnalysisResult contains the results of the analysis performed on the
// client. We obtain it by comparing the measurement and the control.
//
// The pointer fields are tri-state: the check functions that fill them
// return nil when the corresponding comparison is not applicable, and
// nil serializes as JSON null.
type HTTPAnalysisResult struct {
	// BodyLengthMatch is the first result of HTTPBodyLengthChecks.
	BodyLengthMatch *bool `json:"body_length_match"`
	// BodyProportion is the second result of HTTPBodyLengthChecks, i.e.
	// the ratio between the two body lengths (zero when not computed).
	BodyProportion float64 `json:"body_proportion"`
	// StatusCodeMatch is the result of HTTPStatusCodeMatch.
	StatusCodeMatch *bool `json:"status_code_match"`
	// HeadersMatch is the result of HTTPHeadersMatch.
	HeadersMatch *bool `json:"headers_match"`
	// TitleMatch is the result of HTTPTitleMatch.
	TitleMatch *bool `json:"title_match"`
}
// Log emits one info-level line per field of the analysis result using
// the given logger. Pointer fields are rendered through
// internal.BoolPointerToString before being logged.
func (har HTTPAnalysisResult) Log(logger model.Logger) {
	render := internal.BoolPointerToString
	logger.Infof("BodyLengthMatch: %+v", render(har.BodyLengthMatch))
	logger.Infof("BodyProportion: %+v", har.BodyProportion)
	logger.Infof("StatusCodeMatch: %+v", render(har.StatusCodeMatch))
	logger.Infof("HeadersMatch: %+v", render(har.HeadersMatch))
	logger.Infof("TitleMatch: %+v", render(har.TitleMatch))
}
// HTTPAnalysis runs every follow-up check (body length, status code,
// headers and title) on the webconnectivity measurement test keys and
// the control response, and gathers the outcomes into a single
// HTTPAnalysisResult.
func HTTPAnalysis(tk urlgetter.TestKeys, ctrl ControlResponse) (out HTTPAnalysisResult) {
	lengthMatch, lengthProportion := HTTPBodyLengthChecks(tk, ctrl)
	return HTTPAnalysisResult{
		BodyLengthMatch: lengthMatch,
		BodyProportion:  lengthProportion,
		StatusCodeMatch: HTTPStatusCodeMatch(tk, ctrl),
		HeadersMatch:    HTTPHeadersMatch(tk, ctrl),
		TitleMatch:      HTTPTitleMatch(tk, ctrl),
	}
}
// HTTPBodyLengthChecks compares the length of the body of the first
// measured request with the body length reported by the control.
//
// The returned proportion is the shorter length divided by the longer
// one, hence a value in (0, 1]. The returned match points to true when
// that proportion is strictly greater than 0.7.
//
// The comparison is not applicable, and the function returns (nil, 0),
// when the control body length is not positive, when there are no
// measured requests, when the measured body is truncated, or when the
// measured body is empty.
func HTTPBodyLengthChecks(
	tk urlgetter.TestKeys, ctrl ControlResponse) (match *bool, proportion float64) {
	const bodyProportionFactor = 0.7
	if len(tk.Requests) < 1 {
		return nil, 0
	}
	controlLen := ctrl.HTTPRequest.BodyLength
	firstResponse := tk.Requests[0].Response
	if controlLen <= 0 || firstResponse.BodyIsTruncated {
		return nil, 0
	}
	measuredLen := int64(len(firstResponse.Body.Value))
	if measuredLen <= 0 {
		return nil, 0
	}
	// Always divide the shorter length by the longer one so that the
	// proportion does not depend on which side is larger.
	shorter, longer := measuredLen, controlLen
	if measuredLen >= controlLen {
		shorter, longer = controlLen, measuredLen
	}
	proportion = float64(shorter) / float64(longer)
	similar := proportion > bodyProportionFactor
	return &similar, proportion
}
// HTTPStatusCodeMatch returns whether the status code of the first
// measured request matches the status code of the control.
//
// The result is nil, meaning the comparison is not applicable, when:
//
// - there are no measured requests;
//
// - either the control or the measurement status code is zero, i.e.,
// there is no real status code to compare;
//
// - the codes differ and the control status code is 5xx.
//
// Otherwise the result points to true if the codes are equal and
// to false if they differ.
func HTTPStatusCodeMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
	if len(tk.Requests) < 1 {
		return nil // no real status code
	}
	control := ctrl.HTTPRequest.StatusCode
	measurement := tk.Requests[0].Response.Code
	if control == 0 || measurement == 0 {
		return nil // no real status code
	}
	match := control == measurement
	// This fix is part of Web Connectivity in MK and in Python since
	// basically forever; my recollection is that we want to work around
	// cases where the test helper is failing(?!). Unlike previous
	// implementations, this implementation avoids a false positive
	// when both measurement and control statuses are 500: equal codes
	// always match, and only a mismatch against a 5xx control is
	// treated as not applicable.
	if !match && control/100 == 5 {
		return nil
	}
	return &match
}
// HTTPHeadersMatch tells whether the header names seen in the first
// measured response are consistent with the header names seen by the
// control. Only header names matter; values are ignored, and names are
// compared case-insensitively.
//
// The result is nil (check not applicable) when there are no measured
// requests or when either the measurement or the control has a zero
// status code. Otherwise the result points to true when the two sets of
// header names are identical or when at least one header that is not in
// the list of common headers appears on both sides, and to false in
// every other case.
func HTTPHeadersMatch(tk urlgetter.TestKeys, ctrl ControlResponse) *bool {
	if len(tk.Requests) < 1 {
		return nil
	}
	if tk.Requests[0].Response.Code == 0 || ctrl.HTTPRequest.StatusCode == 0 {
		return nil
	}
	const (
		seenByProbe = 1 << iota
		seenByHelper
		seenByBoth = seenByProbe | seenByHelper
	)
	// Headers that most servers emit and that therefore say nothing
	// about whether two responses come from the same origin.
	commonHeaders := map[string]bool{
		"date":                      true,
		"content-type":              true,
		"server":                    true,
		"cache-control":             true,
		"vary":                      true,
		"set-cookie":                true,
		"location":                  true,
		"expires":                   true,
		"x-powered-by":              true,
		"content-encoding":          true,
		"last-modified":             true,
		"accept-ranges":             true,
		"pragma":                    true,
		"x-frame-options":           true,
		"etag":                      true,
		"x-content-type-options":    true,
		"age":                       true,
		"via":                       true,
		"p3p":                       true,
		"x-xss-protection":          true,
		"content-language":          true,
		"cf-ray":                    true,
		"strict-transport-security": true,
		"link":                      true,
		"x-varnish":                 true,
	}
	// uncommon maps each uncommon header name to a bitmask telling
	// which side(s) have seen it.
	uncommon := make(map[string]int)
	// record registers a header name into the per-side set of names and,
	// when the header is not a common one, flags it inside uncommon.
	record := func(name string, names map[string]bool, side int) {
		name = strings.ToLower(name)
		if !commonHeaders[name] {
			uncommon[name] |= side
		}
		names[name] = true
	}
	probeNames := make(map[string]bool)
	for name := range tk.Requests[0].Response.Headers {
		record(name, probeNames, seenByProbe)
	}
	helperNames := make(map[string]bool)
	for name := range ctrl.HTTPRequest.Headers {
		record(name, helperNames, seenByHelper)
	}
	// identical sets of header names: nothing else to check
	if reflect.DeepEqual(probeNames, helperNames) {
		identical := true
		return &identical
	}
	// otherwise it suffices that one uncommon header is on both sides
	shared := false
	for _, sides := range uncommon {
		if sides&seenByBoth == seenByBoth {
			shared = true
			break
		}
	}
	return &shared
}
// getTitleRegexp matches an attribute-less <title> element, in any letter
// case, whose content is between 1 and 128 characters long and contains
// no '<'. It is compiled once at package initialization rather than on
// every GetTitle call.
var getTitleRegexp = regexp.MustCompile(`(?i)<title>([^<]{1,128})</title>`) // like MK

// GetTitle returns the content of the first <title> element found in
// measurementBody, or an empty string when the body contains no title
// matching getTitleRegexp (e.g. missing, empty, or overlong title).
func GetTitle(measurementBody string) string {
	v := getTitleRegexp.FindStringSubmatch(measurementBody)
	if len(v) < 2 {
		return ""
	}
	return v[1]
}
// HTTPTitleMatch tells whether the title of the first measured response
// reasonably matches the title reported by the control.
//
// The result is nil (check not applicable) when there are no measured
// requests, when the measurement or the control status code is zero,
// when the measured body is truncated, or when no title can be
// extracted from the measured body. Otherwise the result points to true
// only if every word of at least five bytes found in either title is
// also found, ignoring letter case, in the other title; this is
// trivially the case when neither title contains such a word.
func HTTPTitleMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
	if len(tk.Requests) < 1 {
		return nil
	}
	first := tk.Requests[0].Response
	if first.Code == 0 || first.BodyIsTruncated || ctrl.HTTPRequest.StatusCode == 0 {
		return nil
	}
	measuredTitle := GetTitle(first.Body.Value)
	if measuredTitle == "" {
		return nil
	}
	// We don't consider to match words that are shorter than 5
	// characters (5 is the average word length for english)
	//
	// The original implementation considered the word order but
	// considering different languages it seems we could have less
	// false positives by ignoring the word order.
	const minWordLength = 5
	const (
		fromMeasurement = 1 << 0
		fromControl     = 1 << 1
		fromBoth        = fromMeasurement | fromControl
	)
	// seen maps each long-enough lowercase word to a bitmask telling
	// which title(s) contain it.
	seen := make(map[string]int)
	tally := func(title string, origin int) {
		for _, word := range strings.Split(title, " ") {
			if len(word) < minWordLength {
				continue
			}
			seen[strings.ToLower(word)] |= origin
		}
	}
	tally(measuredTitle, fromMeasurement)
	tally(ctrl.HTTPRequest.Title, fromControl)
	matched := true
	for _, origins := range seen {
		if origins&fromBoth != fromBoth {
			matched = false
			break
		}
	}
	return &matched
}