package parse

import (
	"bufio"
	"bytes"
	"io"
	"net/url"
	"regexp"
	"strings"
)

// urlRe matches link-like words: roughly, a scheme of at least two word
// characters, a colon, and at least eight further characters, stopping at
// whitespace and common trailing delimiters such as ">", ")" and quotes.
var urlRe = regexp.MustCompile(`([\w\d]{2,}:([^\s>\]\)"]|\][^\s>\)"]|\]$){8,})`)

// HttpLinks searches a reader for HTTP links and returns a reader that
// replays the consumed input together with a slice of the unique links found.
func HttpLinks(r io.Reader) (io.Reader, []string) {
	// Tee everything the scanner reads into buf so the caller gets the full
	// input back.
	var buf bytes.Buffer
	tr := io.TeeReader(r, &buf)

	scanner := bufio.NewScanner(tr)
	linkMap := make(map[string]struct{})
	for scanner.Scan() {
		line := scanner.Text()
		for _, word := range strings.Fields(line) {
			if links := urlRe.FindStringSubmatch(word); len(links) > 0 {
				// Skip matches that do not even parse as a URL.
				if _, err := url.Parse(links[0]); err != nil {
					continue
				}
				linkMap[strings.TrimSpace(links[0])] = struct{}{}
			}
		}
	}
	// A read error from r simply ends the scan early; it is not surfaced.

	// Flatten the set into a slice; the map deduplicates repeated links.
	results := make([]string, 0, len(linkMap))
	for link := range linkMap {
		results = append(results, link)
	}

	return &buf, results
}
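
// collectLinksExample is a minimal usage sketch, not part of the original
// file: the function name and the in-memory input are illustrative only.
// It runs HttpLinks over a string and returns just the links, draining the
// replay reader it does not need.
func collectLinksExample(text string) []string {
	rest, links := HttpLinks(strings.NewReader(text))
	// rest replays everything HttpLinks consumed; discard it here since only
	// the extracted links matter.
	_, _ = io.Copy(io.Discard, rest)
	return links
}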