From 21b4c031983f2b102c79b33b2d64f575e10912db Mon Sep 17 00:00:00 2001
From: Travis Ralston <travpc@gmail.com>
Date: Mon, 29 Jan 2018 20:40:10 -0700
Subject: [PATCH] Limit the URL preview title length by words

Fixes #62
---
 config.sample.yaml                                       | 1 +
 .../turt2live/matrix-media-repo/config/config.go         | 2 ++
 .../services/url_service/opengraph_previewer.go          | 9 ++++++---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/config.sample.yaml b/config.sample.yaml
index 6cb6258b..70a8b7c4 100644
--- a/config.sample.yaml
+++ b/config.sample.yaml
@@ -91,6 +91,7 @@ urlPreviews:
   enabled: true # If enabled, the preview_url routes will be accessible
   maxPageSizeBytes: 10485760 # 10MB default, 0 to disable
   numWords: 30 # The number of words to include in a preview (maximum)
+  numTitleWords: 30 # The maximum number of words to include in a preview's title
 
   # The number of workers to use when generating url previews. Raise this number if url
   # previews are slow or timing out.
diff --git a/src/github.com/turt2live/matrix-media-repo/config/config.go b/src/github.com/turt2live/matrix-media-repo/config/config.go
index 69207837..edeea26f 100644
--- a/src/github.com/turt2live/matrix-media-repo/config/config.go
+++ b/src/github.com/turt2live/matrix-media-repo/config/config.go
@@ -62,6 +62,7 @@ type ThumbnailSize struct {
 type UrlPreviewsConfig struct {
 	Enabled            bool     `yaml:"enabled"`
 	NumWords           int      `yaml:"numWords"`
+	NumTitleWords      int      `yaml:"numTitleWords"`
 	MaxPageSizeBytes   int64    `yaml:"maxPageSizeBytes"`
 	NumWorkers         int      `yaml:"numWorkers"`
 	DisallowedNetworks []string `yaml:"disallowedNetworks,flow"`
@@ -204,6 +205,7 @@ func NewDefaultConfig() *MediaRepoConfig {
 		UrlPreviews: &UrlPreviewsConfig{
 			Enabled:          true,
 			NumWords:         30,
+			NumTitleWords:    30,
 			MaxPageSizeBytes: 10485760, // 10mb
 			NumWorkers:       10,
 			DisallowedNetworks: []string{
diff --git a/src/github.com/turt2live/matrix-media-repo/services/url_service/opengraph_previewer.go b/src/github.com/turt2live/matrix-media-repo/services/url_service/opengraph_previewer.go
index 1cab61ca..a397395d 100644
--- a/src/github.com/turt2live/matrix-media-repo/services/url_service/opengraph_previewer.go
+++ b/src/github.com/turt2live/matrix-media-repo/services/url_service/opengraph_previewer.go
@@ -71,11 +71,15 @@ func (p *openGraphUrlPreviewer) GeneratePreview(urlStr string) (openGraphResult,
 		og.Images = calcImages(html)
 	}
 
+	// Normalize whitespace and cap the title and description at their configured word limits
+	og.Title = summarize(og.Title, config.Get().UrlPreviews.NumTitleWords)
+	og.Description = summarize(og.Description, config.Get().UrlPreviews.NumWords)
+
 	graph := &openGraphResult{
 		Type:        og.Type,
 		Url:         og.URL,
 		Title:       og.Title,
-		Description: summarize(og.Description),
+		Description: og.Description,
 		SiteName:    og.SiteName,
 	}
 
@@ -260,7 +264,7 @@ func calcImages(html string) []*opengraph.Image {
 	return []*opengraph.Image{&img}
 }
 
-func summarize(text string) (string) {
+func summarize(text string, maxWords int) (string) {
 	// Normalize the whitespace to be something useful (crush it to one giant line)
 	surroundingWhitespace := regexp.MustCompile(`^[\s\p{Zs}]+|[\s\p{Zs}]+$`)
 	interiorWhitespace := regexp.MustCompile(`[\s\p{Zs}]{2,}`)
@@ -269,7 +273,6 @@ func summarize(text string) (string) {
 	text = interiorWhitespace.ReplaceAllString(text, " ")
 	text = newlines.ReplaceAllString(text, " ")
 
-	maxWords := config.Get().UrlPreviews.NumWords
 	words := strings.Split(text, " ")
 	if len(words) < maxWords {
 		return text
-- 
GitLab