refactor: move WebGetTitle inside measurexlite (#895)

Part of https://github.com/ooni/probe/issues/2240
This commit is contained in:
Simone Basso 2022-08-28 20:26:40 +02:00 committed by GitHub
parent bb6563f363
commit 7c1b2bbcb0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 88 additions and 17 deletions

View File

@ -12,7 +12,7 @@ import (
"sync"
"time"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity"
"github.com/ooni/probe-cli/v3/internal/measurexlite"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/netxlite"
"github.com/ooni/probe-cli/v3/internal/tracex"
@ -100,7 +100,7 @@ func httpDo(ctx context.Context, config *httpConfig) {
Failure: httpMapFailure(err),
StatusCode: int64(resp.StatusCode),
Headers: headers,
Title: webconnectivity.GetTitle(string(data)),
Title: measurexlite.WebGetTitle(string(data)),
}
}

View File

@ -2,11 +2,11 @@ package webconnectivity
import (
"reflect"
"regexp"
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/urlgetter"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity/internal"
"github.com/ooni/probe-cli/v3/internal/measurexlite"
"github.com/ooni/probe-cli/v3/internal/model"
)
@ -184,18 +184,6 @@ func HTTPHeadersMatch(tk urlgetter.TestKeys, ctrl ControlResponse) *bool {
return &good
}
// getTitleRegexp matches the first <title>...</title> element regardless of
// tag case. It is compiled once at package initialization rather than on
// every GetTitle call; a compiled *regexp.Regexp is safe for concurrent use.
//
// MK used {1,128} but we're making it larger here to get longer titles
// e.g. <http://www.isa.gov.il/Pages/default.aspx>'s one
var getTitleRegexp = regexp.MustCompile(`(?i)<title>([^<]{1,512})</title>`)

// GetTitle returns the title or an empty string.
//
// The title is the text of the first <title> element found in
// measurementBody, matched case-insensitively. The empty string is returned
// when there is no such element, when the element is empty, when its text
// contains a '<', or when its text is longer than 512 characters.
func GetTitle(measurementBody string) string {
	v := getTitleRegexp.FindStringSubmatch(measurementBody)
	if len(v) < 2 {
		// No match: FindStringSubmatch returned nil.
		return ""
	}
	// v[0] is the whole match; v[1] is the captured title text.
	return v[1]
}
// HTTPTitleMatch returns whether the measurement and the control titles
// reasonably match, or nil if not applicable.
func HTTPTitleMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
@ -214,7 +202,7 @@ func HTTPTitleMatch(tk urlgetter.TestKeys, ctrl ControlResponse) (out *bool) {
}
control := ctrl.HTTPRequest.Title
measurementBody := response.Body.Value
measurement := GetTitle(measurementBody)
measurement := measurexlite.WebGetTitle(measurementBody)
if measurement == "" {
return
}

View File

@ -10,6 +10,7 @@ import (
"strings"
"github.com/ooni/probe-cli/v3/internal/engine/experiment/webconnectivity"
"github.com/ooni/probe-cli/v3/internal/measurexlite"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/runtimex"
)
@ -208,7 +209,7 @@ func (tk *TestKeys) httpDiffTitleMatch(
}
control := ctrl.Title
measurementBody := response.Body.Value
measurement := webconnectivity.GetTitle(measurementBody)
measurement := measurexlite.WebGetTitle(measurementBody)
if control == "" || measurement == "" {
return
}

View File

@ -0,0 +1,19 @@
package measurexlite
//
// Code to process web results (e.g., from web connectivity)
//
import "regexp"
// webTitleRegexp matches the first <title>...</title> element regardless of
// tag case. It is compiled once at package initialization rather than on
// every WebGetTitle call; a compiled *regexp.Regexp is safe for concurrent use.
//
// MK used {1,128} but we're making it larger here to get longer titles
// e.g. <http://www.isa.gov.il/Pages/default.aspx>'s one
var webTitleRegexp = regexp.MustCompile(`(?i)<title>([^<]{1,512})</title>`)

// WebGetTitle returns the title or an empty string.
//
// The title is the text of the first <title> element found in
// measurementBody, matched case-insensitively. The empty string is returned
// when there is no such element, when the element is empty, when its text
// contains a '<', or when its text is longer than 512 characters.
func WebGetTitle(measurementBody string) string {
	v := webTitleRegexp.FindStringSubmatch(measurementBody)
	if len(v) < 2 {
		// No match: FindStringSubmatch returned nil.
		return ""
	}
	// v[0] is the whole match; v[1] is the captured title text.
	return v[1]
}

View File

@ -0,0 +1,63 @@
package measurexlite
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ooni/probe-cli/v3/internal/randx"
)
// TestWebGetTitle runs WebGetTitle over a table of HTML bodies and checks
// that the extracted title equals the expected one.
func TestWebGetTitle(t *testing.T) {
	// testCase pairs an input body with the title we expect back.
	type testCase struct {
		name string // subtest name
		body string // input passed to WebGetTitle
		want string // expected return value
	}

	cases := []testCase{
		{
			name: "with empty input",
			body: "",
			want: "",
		},
		{
			name: "with body containing no titles",
			body: "<HTML/>",
			want: "",
		},
		{
			// NOTE(review): this body is plain ASCII; the "UTF-7" label
			// is kept verbatim from the original test name.
			name: "success with UTF-7 body",
			body: "<HTML><TITLE>La community di MSN</TITLE></HTML>",
			want: "La community di MSN",
		},
		{
			name: "success with UTF-8 body",
			body: "<HTML><TITLE>La comunità di MSN</TITLE></HTML>",
			want: "La comunità di MSN",
		},
		{
			// presumably randx.Letters(1024) yields 1024 letters, which
			// would exceed the 512 limit in WebGetTitle's pattern — TODO
			// confirm against randx.
			name: "when the title is too long",
			body: "<HTML><TITLE>" + randx.Letters(1024) + "</TITLE></HTML>",
			want: "",
		},
		{
			name: "success with case variations",
			body: "<HTML><TiTLe>La commUNity di MSN</tITLE></HTML>",
			want: "La commUNity di MSN",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := WebGetTitle(tc.body)
			diff := cmp.Diff(tc.want, got)
			if diff != "" {
				t.Fatal(diff)
			}
		})
	}
}