refactor: move WebGetTitle inside measurexlite (#895)

Part of https://github.com/ooni/probe/issues/2240
This commit is contained in:
Simone Basso
2022-08-28 20:26:40 +02:00
committed by GitHub
parent bb6563f363
commit 7c1b2bbcb0
5 changed files with 88 additions and 17 deletions
+19
View File
@@ -0,0 +1,19 @@
package measurexlite
//
// Code to process web results (e.g., from web connectivity)
//
import "regexp"
// webTitleRegexp matches a <title>...</title> element, ignoring case, and
// captures its content when that content is 1 to 512 characters long and
// contains no '<'.
//
// MK used {1,128} but we're making it larger here to get longer titles
// e.g. <http://www.isa.gov.il/Pages/default.aspx>'s one.
//
// The pattern is constant, so it is compiled once at package initialization
// instead of on every WebGetTitle call.
var webTitleRegexp = regexp.MustCompile(`(?i)<title>([^<]{1,512})</title>`)

// WebGetTitle returns the title or an empty string.
//
// The title is the captured content of the leftmost match of webTitleRegexp
// in measurementBody. When nothing matches (no title element, an empty
// title, or a title longer than 512 characters) the result is "".
func WebGetTitle(measurementBody string) string {
	v := webTitleRegexp.FindStringSubmatch(measurementBody)
	// A successful match yields the full match plus one capture group.
	if len(v) < 2 {
		return ""
	}
	return v[1]
}
+63
View File
@@ -0,0 +1,63 @@
package measurexlite
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ooni/probe-cli/v3/internal/randx"
)
// TestWebGetTitle runs WebGetTitle over a table of HTML bodies and checks
// that the extracted title equals the expected one for each of them.
func TestWebGetTitle(t *testing.T) {
	// testCase describes a single table entry.
	type testCase struct {
		// name identifies the subtest
		name string

		// body is the input passed to WebGetTitle
		body string

		// expect is the title we expect to get back
		expect string
	}

	cases := []testCase{
		{
			name:   "with empty input",
			body:   "",
			expect: "",
		},
		{
			name:   "with body containing no titles",
			body:   "<HTML/>",
			expect: "",
		},
		{
			name:   "success with UTF-7 body",
			body:   "<HTML><TITLE>La community di MSN</TITLE></HTML>",
			expect: "La community di MSN",
		},
		{
			name:   "success with UTF-8 body",
			body:   "<HTML><TITLE>La comunità di MSN</TITLE></HTML>",
			expect: "La comunità di MSN",
		},
		{
			// NOTE(review): randx.Letters(1024) is assumed to yield a
			// 1024-letter string, which is longer than the title length
			// WebGetTitle accepts; confirm against the randx package.
			name:   "when the title is too long",
			body:   "<HTML><TITLE>" + randx.Letters(1024) + "</TITLE></HTML>",
			expect: "",
		},
		{
			name:   "success with case variations",
			body:   "<HTML><TiTLe>La commUNity di MSN</tITLE></HTML>",
			expect: "La commUNity di MSN",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			title := WebGetTitle(tc.body)
			diff := cmp.Diff(tc.expect, title)
			if diff == "" {
				return
			}
			t.Fatal(diff)
		})
	}
}