aerc/worker/lib/parse.go

328 lines
8.8 KiB
Go

package lib
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"regexp"
"strings"
"time"
"git.sr.ht/~rjarry/aerc/logging"
"git.sr.ht/~rjarry/aerc/models"
"github.com/emersion/go-message"
_ "github.com/emersion/go-message/charset"
"github.com/emersion/go-message/mail"
)
// RFC 1123Z regexp
var dateRe = regexp.MustCompile(`(((Mon|Tue|Wed|Thu|Fri|Sat|Sun))[,]?\s[0-9]{1,2})\s` +
`(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s` +
`([0-9]{4})\s([0-9]{2}):([0-9]{2})(:([0-9]{2}))?\s([\+|\-][0-9]{4})`)
func FetchEntityPartReader(e *message.Entity, index []int) (io.Reader, error) {
if len(index) == 0 {
// non multipart, simply return everything
return bufReader(e)
}
if mpr := e.MultipartReader(); mpr != nil {
idx := 0
for {
idx++
part, err := mpr.NextPart()
if err != nil {
return nil, err
}
if idx == index[0] {
rest := index[1:]
if len(rest) < 1 {
return bufReader(part)
}
return FetchEntityPartReader(part, index[1:])
}
}
}
return nil, fmt.Errorf("FetchEntityPartReader: unexpected code reached")
}
// TODO: the UI doesn't seem to like readers which aren't buffers
func bufReader(e *message.Entity) (io.Reader, error) {
var buf bytes.Buffer
if _, err := io.Copy(&buf, e.Body); err != nil {
return nil, err
}
return &buf, nil
}
// split a MIME type into its major and minor parts
func splitMIME(m string) (string, string) {
parts := strings.Split(m, "/")
if len(parts) != 2 {
return parts[0], ""
}
return parts[0], parts[1]
}
func fixContentType(h message.Header) (string, map[string]string) {
ct, rest := h.Get("Content-Type"), ""
if i := strings.Index(ct, ";"); i > 0 {
ct, rest = ct[:i], ct[i:]
}
// check if there are quotes around the content type
if strings.Contains(ct, "\"") {
header := strings.ReplaceAll(ct, "\"", "")
if rest != "" {
header += rest
}
h.Set("Content-Type", header)
if contenttype, params, err := h.ContentType(); err == nil {
return contenttype, params
}
}
// if all else fails, return text/plain
return "text/plain", nil
}
func ParseEntityStructure(e *message.Entity) (*models.BodyStructure, error) {
var body models.BodyStructure
contentType, ctParams, err := e.Header.ContentType()
if err != nil {
// try to fix the error; if all measures fail, then return a
// text/plain content type to display at least plaintext
contentType, ctParams = fixContentType(e.Header)
}
mimeType, mimeSubType := splitMIME(contentType)
body.MIMEType = mimeType
body.MIMESubType = mimeSubType
body.Params = ctParams
body.Description = e.Header.Get("content-description")
body.Encoding = e.Header.Get("content-transfer-encoding")
if cd := e.Header.Get("content-disposition"); cd != "" {
contentDisposition, cdParams, err := e.Header.ContentDisposition()
if err != nil {
return nil, fmt.Errorf("could not parse content disposition: %w", err)
}
body.Disposition = contentDisposition
body.DispositionParams = cdParams
}
body.Parts = []*models.BodyStructure{}
if mpr := e.MultipartReader(); mpr != nil {
for {
part, err := mpr.NextPart()
if errors.Is(err, io.EOF) {
return &body, nil
} else if err != nil {
return nil, err
}
ps, err := ParseEntityStructure(part)
if err != nil {
return nil, fmt.Errorf("could not parse child entity structure: %w", err)
}
body.Parts = append(body.Parts, ps)
}
}
return &body, nil
}
var DateParseError = errors.New("date parsing failed")
func parseEnvelope(h *mail.Header) (*models.Envelope, error) {
from, err := parseAddressList(h, "from")
if err != nil {
return nil, fmt.Errorf("could not read from address: %w", err)
}
to, err := parseAddressList(h, "to")
if err != nil {
return nil, fmt.Errorf("could not read to address: %w", err)
}
cc, err := parseAddressList(h, "cc")
if err != nil {
return nil, fmt.Errorf("could not read cc address: %w", err)
}
bcc, err := parseAddressList(h, "bcc")
if err != nil {
return nil, fmt.Errorf("could not read bcc address: %w", err)
}
replyTo, err := parseAddressList(h, "reply-to")
if err != nil {
return nil, fmt.Errorf("could not read reply-to address: %w", err)
}
subj, err := h.Subject()
if err != nil {
return nil, fmt.Errorf("could not read subject: %w", err)
}
msgID, err := h.MessageID()
if err != nil {
// proper parsing failed, so fall back to whatever is there
msgID, err = h.Text("message-id")
if err != nil {
return nil, err
}
}
date, err := parseDate(h)
if err != nil {
// still return a valid struct plus a sentinel date parsing error
// if only the date parsing failed
err = fmt.Errorf("%w: %v", DateParseError, err)
}
return &models.Envelope{
Date: date,
Subject: subj,
MessageId: msgID,
From: from,
ReplyTo: replyTo,
To: to,
Cc: cc,
Bcc: bcc,
}, err
}
// parseDate tries to parse the date from the Date header with non std formats
// if this fails it tries to parse the received header as well
func parseDate(h *mail.Header) (time.Time, error) {
t, err := h.Date()
if err == nil {
return t, nil
}
text, err := h.Text("date")
// sometimes, no error occurs but the date is empty.
// In this case, guess time from received header field
if err != nil || text == "" {
t, err := parseReceivedHeader(h)
if err == nil {
return t, nil
}
}
layouts := []string{
// X-Mailer: EarthLink Zoo Mail 1.0
"Mon, _2 Jan 2006 15:04:05 -0700 (GMT-07:00)",
}
for _, layout := range layouts {
if t, err := time.Parse(layout, text); err == nil {
return t, nil
}
}
// still no success, try the received header as a last resort
t, err = parseReceivedHeader(h)
if err != nil {
return time.Time{}, fmt.Errorf("unrecognized date format: %s", text)
}
return t, nil
}
func parseReceivedHeader(h *mail.Header) (time.Time, error) {
guess, err := h.Text("received")
if err != nil {
return time.Time{}, fmt.Errorf("received header not parseable: %w",
err)
}
return time.Parse(time.RFC1123Z, dateRe.FindString(guess))
}
func parseAddressList(h *mail.Header, key string) ([]*mail.Address, error) {
hdr, err := h.Text(key)
if err != nil && !message.IsUnknownCharset(err) {
return nil, err
}
if hdr == "" {
return nil, nil
}
add, err := mail.ParseAddressList(hdr)
if err != nil {
return []*mail.Address{{Name: hdr}}, nil
}
return add, err
}
// RawMessage is an interface that describes a raw message
type RawMessage interface {
NewReader() (io.ReadCloser, error)
ModelFlags() ([]models.Flag, error)
Labels() ([]string, error)
UID() uint32
}
// MessageInfo populates a models.MessageInfo struct for the message.
// based on the reader returned by NewReader
func MessageInfo(raw RawMessage) (*models.MessageInfo, error) {
var parseErr error
r, err := raw.NewReader()
if err != nil {
return nil, err
}
defer r.Close()
msg, err := ReadMessage(r)
if err != nil {
return nil, fmt.Errorf("could not read message: %w", err)
}
bs, err := ParseEntityStructure(msg)
if errors.As(err, new(message.UnknownEncodingError)) {
parseErr = err
} else if err != nil {
return nil, fmt.Errorf("could not get structure: %w", err)
}
h := &mail.Header{Header: msg.Header}
env, err := parseEnvelope(h)
if err != nil && !errors.Is(err, DateParseError) {
return nil, fmt.Errorf("could not parse envelope: %w", err)
// if only the date parsing failed we still get the rest of the
// envelop structure in a valid state.
// Date parsing errors are fairly common and it's better to be
// slightly off than to not be able to read the mails at all
// hence we continue here
}
recDate, _ := parseReceivedHeader(h)
if recDate.IsZero() {
// better than nothing, if incorrect
recDate = env.Date
}
flags, err := raw.ModelFlags()
if err != nil {
return nil, err
}
labels, err := raw.Labels()
if err != nil {
return nil, err
}
return &models.MessageInfo{
BodyStructure: bs,
Envelope: env,
Flags: flags,
Labels: labels,
InternalDate: recDate,
RFC822Headers: &mail.Header{Header: msg.Header},
Size: 0,
Uid: raw.UID(),
Error: parseErr,
}, nil
}
// NewCRLFReader returns a reader with CRLF line endings
func NewCRLFReader(r io.Reader) io.Reader {
var buf bytes.Buffer
scanner := bufio.NewScanner(r)
for scanner.Scan() {
buf.WriteString(scanner.Text() + "\r\n")
}
return &buf
}
// ReadMessage is a wrapper for the message.Read function to read a message
// from r. The message's encoding and charset are automatically decoded to
// UTF-8. If an unknown charset is encountered, the error is logged but a nil
// error is returned since the entity object can still be read.
func ReadMessage(r io.Reader) (*message.Entity, error) {
entity, err := message.Read(r)
if message.IsUnknownCharset(err) {
logging.Warnf("unknown charset encountered")
} else if err != nil {
return nil, fmt.Errorf("could not read message: %w", err)
}
return entity, nil
}