From 84d31a7049b94e1a84f418883db7447fbf3d4a2f Mon Sep 17 00:00:00 2001 From: Travis Ralston <travpc@gmail.com> Date: Tue, 4 Jul 2023 23:00:37 -0600 Subject: [PATCH] Reshuffle url previewers to match thumbnailer structure --- .../preview_controller/preview_controller.go | 4 +-- .../preview_resource_handler.go | 11 +++---- pipelines/_steps/url_preview/upload_image.go | 4 +-- pipelines/pipeline_preview/pipeline.go | 21 +++++++------- url_previewing/m/errors.go | 5 ++++ url_previewing/m/preview_result.go | 18 ++++++++++++ url_previewing/m/url_payload.go | 10 +++++++ .../calculated.go} | 22 +++++++------- .../{oembed_previewer.go => p/oembed.go} | 16 +++++----- .../opengraph.go} | 26 +++++++++-------- url_previewing/types.go | 29 ------------------- url_previewing/{ => u}/acl.go | 2 +- url_previewing/{ => u}/http.go | 17 ++++++----- url_previewing/{util.go => u/summarize.go} | 4 +-- 14 files changed, 101 insertions(+), 88 deletions(-) create mode 100644 url_previewing/m/errors.go create mode 100644 url_previewing/m/preview_result.go create mode 100644 url_previewing/m/url_payload.go rename url_previewing/{calculated_previewer.go => p/calculated.go} (59%) rename url_previewing/{oembed_previewer.go => p/oembed.go} (82%) rename url_previewing/{opengraph_previewer.go => p/opengraph.go} (81%) delete mode 100644 url_previewing/types.go rename url_previewing/{ => u}/acl.go (99%) rename url_previewing/{ => u}/http.go (87%) rename url_previewing/{util.go => u/summarize.go} (92%) diff --git a/controllers/preview_controller/preview_controller.go b/controllers/preview_controller/preview_controller.go index a4e6b6a4..a6936453 100644 --- a/controllers/preview_controller/preview_controller.go +++ b/controllers/preview_controller/preview_controller.go @@ -13,7 +13,7 @@ import ( "github.com/turt2live/matrix-media-repo/storage" "github.com/turt2live/matrix-media-repo/storage/stores" "github.com/turt2live/matrix-media-repo/types" - "github.com/turt2live/matrix-media-repo/url_previewing" + "github.com/turt2live/matrix-media-repo/url_previewing/m" "github.com/turt2live/matrix-media-repo/util" ) @@ -55,7 +55,7 @@ func GetPreview(urlStr string, onHost string, forUserId string, atTs int64, lang return nil, common.ErrInvalidHost } parsedUrl.Fragment = "" // Remove fragment because it's not important for servers - urlToPreview := &url_previewing.UrlPayload{ + urlToPreview := &m.UrlPayload{ UrlString: urlStr, ParsedUrl: parsedUrl, } diff --git a/controllers/preview_controller/preview_resource_handler.go b/controllers/preview_controller/preview_resource_handler.go index 0c7c5ac0..789d7c36 100644 --- a/controllers/preview_controller/preview_resource_handler.go +++ b/controllers/preview_controller/preview_resource_handler.go @@ -5,7 +5,8 @@ import ( "sync" "github.com/getsentry/sentry-go" - url_previewers2 "github.com/turt2live/matrix-media-repo/url_previewing" + url_previewers2 "github.com/turt2live/matrix-media-repo/url_previewing/m" + "github.com/turt2live/matrix-media-repo/url_previewing/p" "github.com/turt2live/matrix-media-repo/util/stream_util" "github.com/disintegration/imaging" @@ -80,28 +81,28 @@ func urlPreviewWorkFn(request *resource_handler.WorkRequest) (resp *urlPreviewRe db := storage.GetDatabase().GetUrlStore(ctx) - var preview url_previewers2.Result + var preview url_previewers2.PreviewResult err := url_previewers2.ErrPreviewUnsupported // Try oEmbed first if info.allowOEmbed { ctx = ctx.LogWithFields(logrus.Fields{"worker_previewer": "oEmbed"}) ctx.Log.Info("Trying oEmbed previewer") - preview, err = url_previewers2.GenerateOEmbedPreview(info.urlPayload, info.languageHeader, ctx) + preview, err = p.GenerateOEmbedPreview(info.urlPayload, info.languageHeader, ctx) } // Then try OpenGraph if err == url_previewers2.ErrPreviewUnsupported { ctx = ctx.LogWithFields(logrus.Fields{"worker_previewer": "OpenGraph"}) ctx.Log.Info("oEmbed preview for this URL is unsupported or disabled - treating it as a OpenGraph") - preview, err = url_previewers2.GenerateOpenGraphPreview(info.urlPayload, info.languageHeader, ctx) + preview, err = p.GenerateOpenGraphPreview(info.urlPayload, info.languageHeader, ctx) } // Finally try scraping if err == url_previewers2.ErrPreviewUnsupported { ctx = ctx.LogWithFields(logrus.Fields{"worker_previewer": "File"}) ctx.Log.Info("OpenGraph preview for this URL is unsupported - treating it as a file") - preview, err = url_previewers2.GenerateCalculatedPreview(info.urlPayload, info.languageHeader, ctx) + preview, err = p.GenerateCalculatedPreview(info.urlPayload, info.languageHeader, ctx) } if err != nil { diff --git a/pipelines/_steps/url_preview/upload_image.go b/pipelines/_steps/url_preview/upload_image.go index 791d3812..2b423dcd 100644 --- a/pipelines/_steps/url_preview/upload_image.go +++ b/pipelines/_steps/url_preview/upload_image.go @@ -9,11 +9,11 @@ import ( "github.com/turt2live/matrix-media-repo/datastores" "github.com/turt2live/matrix-media-repo/pipelines/pipeline_upload" "github.com/turt2live/matrix-media-repo/thumbnailing" - "github.com/turt2live/matrix-media-repo/url_previewing" + "github.com/turt2live/matrix-media-repo/url_previewing/m" "github.com/turt2live/matrix-media-repo/util" ) -func UploadImage(ctx rcontext.RequestContext, image *url_previewing.Image, onHost string, userId string, forRecord *database.DbUrlPreview) { +func UploadImage(ctx rcontext.RequestContext, image *m.PreviewImage, onHost string, userId string, forRecord *database.DbUrlPreview) { if image == nil || image.Data == nil { return } diff --git a/pipelines/pipeline_preview/pipeline.go b/pipelines/pipeline_preview/pipeline.go index bd5312ee..9f8978d1 100644 --- a/pipelines/pipeline_preview/pipeline.go +++ b/pipelines/pipeline_preview/pipeline.go @@ -10,7 +10,8 @@ import ( "github.com/turt2live/matrix-media-repo/common/rcontext" "github.com/turt2live/matrix-media-repo/database" "github.com/turt2live/matrix-media-repo/pipelines/_steps/url_preview" - url_previewers2 "github.com/turt2live/matrix-media-repo/url_previewing" + "github.com/turt2live/matrix-media-repo/url_previewing/m" + "github.com/turt2live/matrix-media-repo/url_previewing/p" "github.com/turt2live/matrix-media-repo/util" "golang.org/x/sync/singleflight" ) @@ -52,34 +53,34 @@ func Execute(ctx rcontext.RequestContext, onHost string, previewUrl string, user // Step 4: Join the singleflight queue r, err, _ := sf.Do(fmt.Sprintf("%s:%s_%d/%s", onHost, previewUrl, opts.Timestamp, opts.LanguageHeader), func() (interface{}, error) { - payload := &url_previewers2.UrlPayload{ + payload := &m.UrlPayload{ UrlString: previewUrl, ParsedUrl: parsedUrl, } - var preview url_previewers2.Result - err = url_previewers2.ErrPreviewUnsupported + var preview m.PreviewResult + err = m.ErrPreviewUnsupported // Step 5: Try oEmbed if ctx.Config.UrlPreviews.OEmbed { ctx.Log.Debug("Trying oEmbed previewer") - preview, err = url_previewers2.GenerateOEmbedPreview(payload, opts.LanguageHeader, ctx) + preview, err = p.GenerateOEmbedPreview(payload, opts.LanguageHeader, ctx) } // Step 6: Try OpenGraph - if err == url_previewers2.ErrPreviewUnsupported { + if err == m.ErrPreviewUnsupported { ctx.Log.Debug("Trying OpenGraph previewer") - preview, err = url_previewers2.GenerateOpenGraphPreview(payload, opts.LanguageHeader, ctx) + preview, err = p.GenerateOpenGraphPreview(payload, opts.LanguageHeader, ctx) } // Step 7: Try scraping - if err == url_previewers2.ErrPreviewUnsupported { + if err == m.ErrPreviewUnsupported { ctx.Log.Debug("Trying built-in previewer") - preview, err = url_previewers2.GenerateCalculatedPreview(payload, opts.LanguageHeader, ctx) + preview, err = p.GenerateCalculatedPreview(payload, opts.LanguageHeader, ctx) } // Step 8: Finish processing if err != nil { - if err == url_previewers2.ErrPreviewUnsupported { + if err == m.ErrPreviewUnsupported { err = common.ErrMediaNotFound } diff --git a/url_previewing/m/errors.go b/url_previewing/m/errors.go new file mode 100644 index 00000000..8d2b3ee1 --- /dev/null +++ b/url_previewing/m/errors.go @@ -0,0 +1,5 @@ +package m + +import "errors" + +var ErrPreviewUnsupported = errors.New("preview not supported by this previewer") diff --git a/url_previewing/m/preview_result.go b/url_previewing/m/preview_result.go new file mode 100644 index 00000000..2cb55087 --- /dev/null +++ b/url_previewing/m/preview_result.go @@ -0,0 +1,18 @@ +package m + +import "io" + +type PreviewResult struct { + Url string + SiteName string + Type string + Description string + Title string + Image *PreviewImage +} + +type PreviewImage struct { + ContentType string + Data io.ReadCloser + Filename string +} diff --git a/url_previewing/m/url_payload.go b/url_previewing/m/url_payload.go new file mode 100644 index 00000000..381c096b --- /dev/null +++ b/url_previewing/m/url_payload.go @@ -0,0 +1,10 @@ +package m + +import ( + "net/url" +) + +type UrlPayload struct { + UrlString string + ParsedUrl *url.URL +} diff --git a/url_previewing/calculated_previewer.go b/url_previewing/p/calculated.go similarity index 59% rename from url_previewing/calculated_previewer.go rename to url_previewing/p/calculated.go index 73365dd0..ac2b4184 100644 --- a/url_previewing/calculated_previewer.go +++ b/url_previewing/p/calculated.go @@ -1,4 +1,4 @@ -package url_previewing +package p import ( "github.com/prometheus/client_golang/prometheus" @@ -6,23 +6,25 @@ import ( "github.com/turt2live/matrix-media-repo/common/rcontext" "github.com/turt2live/matrix-media-repo/metrics" "github.com/turt2live/matrix-media-repo/thumbnailing" + "github.com/turt2live/matrix-media-repo/url_previewing/m" + "github.com/turt2live/matrix-media-repo/url_previewing/u" ) -func GenerateCalculatedPreview(urlPayload *UrlPayload, languageHeader string, ctx rcontext.RequestContext) (Result, error) { - r, filename, contentType, err := downloadRawContent(urlPayload, ctx.Config.UrlPreviews.FilePreviewTypes, languageHeader, ctx) +func GenerateCalculatedPreview(urlPayload *m.UrlPayload, languageHeader string, ctx rcontext.RequestContext) (m.PreviewResult, error) { + r, filename, contentType, err := u.DownloadRawContent(urlPayload, ctx.Config.UrlPreviews.FilePreviewTypes, languageHeader, ctx) if err != nil { ctx.Log.Warn("Error downloading content: ", err) // Make sure the unsupported error gets passed through - if err == ErrPreviewUnsupported { - return Result{}, ErrPreviewUnsupported + if err == m.ErrPreviewUnsupported { + return m.PreviewResult{}, m.ErrPreviewUnsupported } // We'll consider it not found for the sake of processing - return Result{}, common.ErrMediaNotFound + return m.PreviewResult{}, common.ErrMediaNotFound } - img := &Image{ + img := &m.PreviewImage{ Data: r, ContentType: contentType, Filename: filename, @@ -41,11 +43,11 @@ func GenerateCalculatedPreview(urlPayload *UrlPayload, languageHeader string, ct description = "" } - result := &Result{ + result := &m.PreviewResult{ Type: "", // intentionally empty Url: urlPayload.ParsedUrl.String(), - Title: summarize(filename, ctx.Config.UrlPreviews.NumTitleWords, ctx.Config.UrlPreviews.MaxTitleLength), - Description: summarize(description, ctx.Config.UrlPreviews.NumWords, ctx.Config.UrlPreviews.MaxLength), + Title: u.Summarize(filename, ctx.Config.UrlPreviews.NumTitleWords, ctx.Config.UrlPreviews.MaxTitleLength), + Description: u.Summarize(description, ctx.Config.UrlPreviews.NumWords, ctx.Config.UrlPreviews.MaxLength), SiteName: "", // intentionally empty } diff --git a/url_previewing/oembed_previewer.go b/url_previewing/p/oembed.go similarity index 82% rename from url_previewing/oembed_previewer.go rename to url_previewing/p/oembed.go index 6e424993..73db21a3 100644 --- a/url_previewing/oembed_previewer.go +++ b/url_previewing/p/oembed.go @@ -1,4 +1,4 @@ -package url_previewing +package p import ( "bytes" @@ -8,6 +8,8 @@ import ( "sync" "github.com/getsentry/sentry-go" + "github.com/turt2live/matrix-media-repo/url_previewing/m" + "github.com/turt2live/matrix-media-repo/url_previewing/u" "github.com/dyatlov/go-oembed/oembed" "github.com/k3a/html2text" @@ -45,10 +47,10 @@ func getOembed() *oembed.Oembed { return oembedInstance } -func GenerateOEmbedPreview(urlPayload *UrlPayload, languageHeader string, ctx rcontext.RequestContext) (Result, error) { +func GenerateOEmbedPreview(urlPayload *m.UrlPayload, languageHeader string, ctx rcontext.RequestContext) (m.PreviewResult, error) { item := getOembed().FindItem(urlPayload.ParsedUrl.String()) if item == nil { - return Result{}, ErrPreviewUnsupported + return m.PreviewResult{}, m.ErrPreviewUnsupported } info, err := item.FetchOembed(oembed.Options{ @@ -57,7 +59,7 @@ func GenerateOEmbedPreview(urlPayload *UrlPayload, languageHeader string, ctx rc }) if err != nil { ctx.Log.Error("Error getting oEmbed: ", err) - return Result{}, err + return m.PreviewResult{}, err } if info.Type == "rich" { @@ -66,7 +68,7 @@ func GenerateOEmbedPreview(urlPayload *UrlPayload, languageHeader string, ctx rc info.ThumbnailURL = info.URL } - graph := &Result{ + graph := &m.PreviewResult{ Type: info.Type, Url: info.URL, Title: info.Title, @@ -83,12 +85,12 @@ func GenerateOEmbedPreview(urlPayload *UrlPayload, languageHeader string, ctx rc } imgAbsUrl := urlPayload.ParsedUrl.ResolveReference(imgUrl) - imgUrlPayload := &UrlPayload{ + imgUrlPayload := &m.UrlPayload{ UrlString: imgAbsUrl.String(), ParsedUrl: imgAbsUrl, } - img, err := downloadImage(imgUrlPayload, languageHeader, ctx) + img, err := u.DownloadImage(imgUrlPayload, languageHeader, ctx) if err != nil { ctx.Log.Error("Non-fatal error getting thumbnail (downloading image): ", err) sentry.CaptureException(err) diff --git a/url_previewing/opengraph_previewer.go b/url_previewing/p/opengraph.go similarity index 81% rename from url_previewing/opengraph_previewer.go rename to url_previewing/p/opengraph.go index 288beb41..cb510a76 100644 --- a/url_previewing/opengraph_previewer.go +++ b/url_previewing/p/opengraph.go @@ -1,4 +1,4 @@ -package url_previewing +package p import ( "net/url" @@ -6,6 +6,8 @@ import ( "strings" "github.com/getsentry/sentry-go" + "github.com/turt2live/matrix-media-repo/url_previewing/m" + "github.com/turt2live/matrix-media-repo/url_previewing/u" "github.com/PuerkitoBio/goquery" "github.com/dyatlov/go-opengraph/opengraph" @@ -18,25 +20,25 @@ import ( var ogSupportedTypes = []string{"text/*"} -func GenerateOpenGraphPreview(urlPayload *UrlPayload, languageHeader string, ctx rcontext.RequestContext) (Result, error) { - html, err := downloadHtmlContent(urlPayload, ogSupportedTypes, languageHeader, ctx) +func GenerateOpenGraphPreview(urlPayload *m.UrlPayload, languageHeader string, ctx rcontext.RequestContext) (m.PreviewResult, error) { + html, err := u.DownloadHtmlContent(urlPayload, ogSupportedTypes, languageHeader, ctx) if err != nil { ctx.Log.Error("Error downloading content: ", err) // Make sure the unsupported error gets passed through - if err == ErrPreviewUnsupported { - return Result{}, ErrPreviewUnsupported + if err == m.ErrPreviewUnsupported { + return m.PreviewResult{}, m.ErrPreviewUnsupported } // We'll consider it not found for the sake of processing - return Result{}, common.ErrMediaNotFound + return m.PreviewResult{}, common.ErrMediaNotFound } og := opengraph.NewOpenGraph() err = og.ProcessHTML(strings.NewReader(html)) if err != nil { ctx.Log.Error("Error getting OpenGraph: ", err) - return Result{}, err + return m.PreviewResult{}, err } if og.Title == "" { @@ -50,10 +52,10 @@ func GenerateOpenGraphPreview(urlPayload *UrlPayload, languageHeader string, ctx } // Be sure to trim the title and description - og.Title = summarize(og.Title, ctx.Config.UrlPreviews.NumTitleWords, ctx.Config.UrlPreviews.MaxTitleLength) - og.Description = summarize(og.Description, ctx.Config.UrlPreviews.NumWords, ctx.Config.UrlPreviews.MaxLength) + og.Title = u.Summarize(og.Title, ctx.Config.UrlPreviews.NumTitleWords, ctx.Config.UrlPreviews.MaxTitleLength) + og.Description = u.Summarize(og.Description, ctx.Config.UrlPreviews.NumWords, ctx.Config.UrlPreviews.MaxLength) - graph := &Result{ + graph := &m.PreviewResult{ Type: og.Type, Url: og.URL, Title: og.Title, @@ -70,12 +72,12 @@ func GenerateOpenGraphPreview(urlPayload *UrlPayload, languageHeader string, ctx } imgAbsUrl := urlPayload.ParsedUrl.ResolveReference(imgUrl) - imgUrlPayload := &UrlPayload{ + imgUrlPayload := &m.UrlPayload{ UrlString: imgAbsUrl.String(), ParsedUrl: imgAbsUrl, } - img, err := downloadImage(imgUrlPayload, languageHeader, ctx) + img, err := u.DownloadImage(imgUrlPayload, languageHeader, ctx) if err != nil { ctx.Log.Error("Non-fatal error getting thumbnail (downloading image): ", err) sentry.CaptureException(err) diff --git a/url_previewing/types.go b/url_previewing/types.go deleted file mode 100644 index dd330fac..00000000 --- a/url_previewing/types.go +++ /dev/null @@ -1,29 +0,0 @@ -package url_previewing - -import ( - "errors" - "io" - "net/url" -) - -type Result struct { - Url string - SiteName string - Type string - Description string - Title string - Image *Image -} - -type Image struct { - ContentType string - Data io.ReadCloser - Filename string -} - -type UrlPayload struct { - UrlString string - ParsedUrl *url.URL -} - -var ErrPreviewUnsupported = errors.New("preview not supported by this previewer") diff --git a/url_previewing/acl.go b/url_previewing/u/acl.go similarity index 99% rename from url_previewing/acl.go rename to url_previewing/u/acl.go index 6bb3372b..e734f879 100644 --- a/url_previewing/acl.go +++ b/url_previewing/u/acl.go @@ -1,4 +1,4 @@ -package url_previewing +package u import ( "net" diff --git a/url_previewing/http.go b/url_previewing/u/http.go similarity index 87% rename from url_previewing/http.go rename to url_previewing/u/http.go index c8ba4b26..69349c16 100644 --- a/url_previewing/http.go +++ b/url_previewing/u/http.go @@ -1,4 +1,4 @@ -package url_previewing +package u import ( "context" @@ -14,11 +14,12 @@ import ( "github.com/ryanuber/go-glob" "github.com/turt2live/matrix-media-repo/common" "github.com/turt2live/matrix-media-repo/common/rcontext" + "github.com/turt2live/matrix-media-repo/url_previewing/m" "github.com/turt2live/matrix-media-repo/util" "github.com/turt2live/matrix-media-repo/util/readers" ) -func doHttpGet(urlPayload *UrlPayload, languageHeader string, ctx rcontext.RequestContext) (*http.Response, error) { +func doHttpGet(urlPayload *m.UrlPayload, languageHeader string, ctx rcontext.RequestContext) (*http.Response, error) { var client *http.Client dialer := &net.Dialer{ @@ -120,7 +121,7 @@ func doHttpGet(urlPayload *UrlPayload, languageHeader string, ctx rcontext.Reque return client.Do(req) } -func downloadRawContent(urlPayload *UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (io.ReadCloser, string, string, error) { +func DownloadRawContent(urlPayload *m.UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (io.ReadCloser, string, string, error) { ctx.Log.Info("Fetching remote content...") resp, err := doHttpGet(urlPayload, languageHeader, ctx) if err != nil { @@ -146,7 +147,7 @@ func downloadRawContent(urlPayload *UrlPayload, supportedTypes []string, languag contentType := resp.Header.Get("Content-Type") for _, supportedType := range supportedTypes { if !glob.Glob(supportedType, contentType) { - return nil, "", "", ErrPreviewUnsupported + return nil, "", "", m.ErrPreviewUnsupported } } @@ -160,8 +161,8 @@ func downloadRawContent(urlPayload *UrlPayload, supportedTypes []string, languag return reader, filename, contentType, nil } -func downloadHtmlContent(urlPayload *UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (string, error) { - r, _, contentType, err := downloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) +func DownloadHtmlContent(urlPayload *m.UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (string, error) { + r, _, contentType, err := DownloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) if err != nil { return "", err } @@ -174,7 +175,7 @@ func downloadHtmlContent(urlPayload *UrlPayload, supportedTypes []string, langua return html, nil } -func downloadImage(urlPayload *UrlPayload, languageHeader string, ctx rcontext.RequestContext) (*Image, error) { +func DownloadImage(urlPayload *m.UrlPayload, languageHeader string, ctx rcontext.RequestContext) (*m.PreviewImage, error) { ctx.Log.Info("Getting image from " + urlPayload.ParsedUrl.String()) resp, err := doHttpGet(urlPayload, languageHeader, ctx) if err != nil { @@ -185,7 +186,7 @@ func downloadImage(urlPayload *UrlPayload, languageHeader string, ctx rcontext.R return nil, errors.New("error during transfer") } - image := &Image{ + image := &m.PreviewImage{ ContentType: resp.Header.Get("Content-Type"), Data: resp.Body, } diff --git a/url_previewing/util.go b/url_previewing/u/summarize.go similarity index 92% rename from url_previewing/util.go rename to url_previewing/u/summarize.go index c364bcb5..c7ef2ff9 100644 --- a/url_previewing/util.go +++ b/url_previewing/u/summarize.go @@ -1,4 +1,4 @@ -package url_previewing +package u import ( "regexp" @@ -9,7 +9,7 @@ var surroundingWhitespace = regexp.MustCompile(`^[\s\p{Zs}]+|[\s\p{Zs}]+$`) var interiorWhitespace = regexp.MustCompile(`[\s\p{Zs}]{2,}`) var newlines = regexp.MustCompile(`[\r\n]`) -func summarize(text string, maxWords int, maxLength int) string { +func Summarize(text string, maxWords int, maxLength int) string { // Normalize the whitespace to be something useful (crush it to one giant line) text = surroundingWhitespace.ReplaceAllString(text, "") text = interiorWhitespace.ReplaceAllString(text, " ") -- GitLab