open-link: make URL parsing more lenient

URLs are extremely loosely defined and can take many shapes which may
not be parsed at all if unusual characters like the exclamation mark are
present. To ensure that lists and odd uses of spaces are not parsed as
links, some sanity checks are in place:
	- the URL's scheme must be at least two characters long
	- the URL's authority, path, and fragment must have a combined
	  length of 8 characters or longer
	- the URL must not contain a whitespace character, >, ), or "
	- the URL may only contain a ] when followed by a different allowed
	  character or at the end of the line (necessary for IPv6
	  authorities)

The tests for this function now include links with an exclamation point
and IPv6 addresses. The tests are given names to make them more easily
identifiable.

Link: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
Reported-by: "Bence Ferdinandy" <bence@ferdinandy.com>
Cc: "Koni Marti" <koni.marti@gmail.com>
Fixes: e1d8bc4d17 ("msgviewer: open http links from messages")
Signed-off-by: Moritz Poldrack <git@moritz.sh>
Acked-by: Tim Culverhouse <tim@timculverhouse.com>
This commit is contained in:
Moritz Poldrack 2022-09-11 02:14:53 +02:00 committed by Robin Jarry
parent ba9d79fd2d
commit fad90c2956
3 changed files with 61 additions and 34 deletions
lib/parse

View file

@ -4,14 +4,12 @@ import (
"bufio"
"bytes"
"io"
"net/url"
"regexp"
"strings"
)
// Pre-change matcher removed by this commit: a strict http(s)-only
// pattern (scheme, host with 1-10 char TLD, then allowed path chars),
// additionally tried inside "...", (...), and <...> delimiters.
// NOTE(review): it could not match schemes other than http/https or
// characters such as "!", which is what this commit fixes.
var (
submatch = `(https?:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,10}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*))`
httpRe = regexp.MustCompile("\"" + submatch + "\"" + "|" + "\\(" + submatch + "\\)" + "|" + "<" + submatch + ">" + "|" + submatch)
)
var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)
// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
@ -23,16 +21,12 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
linkMap := make(map[string]struct{})
for scanner.Scan() {
line := scanner.Text()
if !strings.Contains(line, "http") {
continue
}
for _, word := range strings.Fields(line) {
if links := httpRe.FindStringSubmatch(word); len(links) > 0 {
for _, l := range links[1:] {
if l != "" {
linkMap[strings.TrimSpace(l)] = struct{}{}
}
if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
if _, err := url.Parse(links[0]); err != nil {
continue
}
linkMap[strings.TrimSpace(links[0])] = struct{}{}
}
}
}