hmn/src/embed/embed.go

200 lines
4.9 KiB
Go

package embed
import (
"bytes"
"context"
"errors"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"git.handmade.network/hmn/hmn/src/logging"
"git.handmade.network/hmn/hmn/src/utils"
)
var DownloadTooBigError = errors.New("download too big")
var NoEmbedFound = errors.New("no embed found")
type Embeddable struct {
Url string
File *Embed
}
type Embed struct {
Data []byte
ContentType string
Filename string
}
var EmbeddableUrlRegex = regexp.MustCompile(`^https?://(youtu\.be|(www\.)?youtube\.com/watch)`)
func IsUrlEmbeddable(u string) bool {
return EmbeddableUrlRegex.MatchString(u)
}
func GetEmbeddableFromUrls(ctx context.Context, urls []string, maxSize int, httpTimeout time.Duration, httpMaxAttempts int) (*Embeddable, error) {
embedError := NoEmbedFound
for _, urlStr := range urls {
u, err := url.Parse(urlStr)
if err != nil {
continue
}
if u.Scheme == "" {
u.Scheme = "https"
urlStr = u.String()
}
if (u.Scheme != "http" && u.Scheme != "https") || u.Host == "" {
continue
}
if IsUrlEmbeddable(urlStr) {
result := Embeddable{
Url: urlStr,
}
return &result, nil
}
if httpMaxAttempts > 0 {
httpMaxAttempts -= 1
embed, err := FetchEmbed(ctx, urlStr, httpTimeout, maxSize)
if err != nil {
embedError = err
continue
}
result := Embeddable{
File: embed,
}
return &result, nil
}
}
return nil, embedError
}
// If the url points to a file, downloads and returns the file.
// If the url points to an html page, parses opengraph and tries to fetch an image/video/audio file according to that.
// maxSize only limits the actual filesize. In the case of html we always fetch up to 100kb even if maxSize is smaller.
func FetchEmbed(ctx context.Context, urlStr string, timeout time.Duration, maxSize int) (*Embed, error) {
logging.ExtractLogger(ctx).Debug().Msg("Fetching embed")
client := &http.Client{
Timeout: timeout,
}
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
if err != nil {
return nil, err
}
res, err := client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode < 200 || res.StatusCode > 299 {
return nil, NoEmbedFound
}
contentType := res.Header.Get("Content-Type")
logging.ExtractLogger(ctx).Debug().Str("type", contentType).Msg("Got first result")
if strings.HasPrefix(contentType, "text/html") || strings.HasPrefix(contentType, "application/html") {
var buffer bytes.Buffer
_, err := io.CopyN(&buffer, res.Body, 100*1024) // NOTE(asaf): If the opengraph stuff isn't in the first 100kb, we don't care.
res.Body.Close()
if err != nil && !errors.Is(err, io.EOF) {
return nil, err
}
partialHtml := buffer.Bytes()
urlStr = ExtractEmbedFromOpenGraph(partialHtml)
logging.ExtractLogger(ctx).Debug().Str("url", urlStr).Msg("Got ograph")
if urlStr == "" {
return nil, NoEmbedFound
}
req, err = http.NewRequestWithContext(ctx, "GET", urlStr, nil)
if err != nil {
return nil, err
}
res, err = client.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode < 200 || res.StatusCode > 299 {
return nil, NoEmbedFound
}
contentType = res.Header.Get("Content-Type")
}
var buffer bytes.Buffer
n, err := io.CopyN(&buffer, res.Body, int64(maxSize+1))
res.Body.Close()
if err != nil && !errors.Is(err, io.EOF) {
return nil, err
}
filename := ""
u, err := url.Parse(urlStr)
if err == nil {
lastSlash := utils.IntMax(strings.LastIndex(u.Path, "/"), 0)
filename = u.Path[lastSlash:]
}
result := Embed{
Data: buffer.Bytes(),
ContentType: contentType,
Filename: filename,
}
if n == int64(maxSize+1) {
err = DownloadTooBigError
} else {
err = nil
}
return &result, err
}
var metaRegex = regexp.MustCompile(`<meta\s+([^>]+)/?>`)
var metaAttrRegex = regexp.MustCompile(`(?P<key>\w+)="(?P<value>[^"]+)"`)
var OGKeys = []string{
"og:audio",
"og:video",
"og:image",
"og:audio:url",
"og:image:url",
"og:video:url",
"og:audio:secure_url",
"og:image:secure_url",
"og:video:secure_url",
"twitter:image",
}
// Tries to find an opengraph image/video/audio url in the provided html
// Since we only need to look at meta tags in the head, we don't need the full html document.
func ExtractEmbedFromOpenGraph(partialHtml []byte) string {
keyIdx := metaAttrRegex.SubexpIndex("key")
valueIdx := metaAttrRegex.SubexpIndex("value")
html := string(partialHtml)
matches := metaRegex.FindAllStringSubmatch(html, -1)
for _, m := range matches {
if len(m) > 1 {
content := ""
prop := ""
attrs := metaAttrRegex.FindAllStringSubmatch(m[1], -1)
for _, attr := range attrs {
key := attr[keyIdx]
value := attr[valueIdx]
if key == "name" || key == "property" {
for _, ogKey := range OGKeys {
if value == ogKey {
prop = value
}
}
} else if key == "content" {
content = value
}
}
if content != "" && prop != "" {
return content
}
}
}
return ""
}