From b854035d63e99a9d2aa7cf79e431ad604c3306c8 Mon Sep 17 00:00:00 2001 From: Travis Ralston <travpc@gmail.com> Date: Fri, 31 Jul 2020 18:34:04 -0600 Subject: [PATCH] Convert URL previews to UTF8 for handling Fixes https://github.com/turt2live/matrix-media-repo/issues/233 --- CHANGELOG.md | 1 + .../preview_controller/previewers/http.go | 5 +- go.mod | 3 +- go.sum | 2 + util/encoding.go | 46 +++++++++++++++++++ 5 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 util/encoding.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 870cd129..98e5768b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Fixed incorrect HTTP status codes for bad thumbnail requests. * Fixed dimension checking on thumbnails. * Fixed handling of EXIF metadata. Thanks @sorunome! +* Fixed handling of URL previews for some encodings. ## [1.1.3] - July 15th, 2020 diff --git a/controllers/preview_controller/previewers/http.go b/controllers/preview_controller/previewers/http.go index f006ecab..7d566241 100644 --- a/controllers/preview_controller/previewers/http.go +++ b/controllers/preview_controller/previewers/http.go @@ -17,6 +17,7 @@ import ( "github.com/turt2live/matrix-media-repo/common/rcontext" "github.com/turt2live/matrix-media-repo/controllers/preview_controller/acl" "github.com/turt2live/matrix-media-repo/controllers/preview_controller/preview_types" + "github.com/turt2live/matrix-media-repo/util" "github.com/turt2live/matrix-media-repo/util/cleanup" ) @@ -168,10 +169,10 @@ func downloadRawContent(urlPayload *preview_types.UrlPayload, supportedTypes []s } func downloadHtmlContent(urlPayload *preview_types.UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (string, error) { - raw, _, _, _, err := downloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) + raw, _, contentType, _, err := downloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) html := "" if raw != nil { - html = string(raw) + html = util.ToUtf8(string(raw), contentType) } return html, err } diff --git a/go.mod b/go.mod index e2cbb549..c29cce40 100644 --- a/go.mod +++ b/go.mod @@ -51,13 +51,14 @@ require ( github.com/rubyist/circuitbreaker v2.2.1+incompatible github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd github.com/ryanuber/go-glob v1.0.0 + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35 github.com/sirupsen/logrus v1.4.2 github.com/smartystreets/goconvey v1.6.4 // indirect github.com/tebeka/strftime v0.1.3 // indirect golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 golang.org/x/image v0.0.0-20200119044424-58c23975cae1 - golang.org/x/net v0.0.0-20200513185701-a91f0712d120 // indirect + golang.org/x/net v0.0.0-20200513185701-a91f0712d120 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 // indirect gopkg.in/ini.v1 v1.52.0 // indirect gopkg.in/yaml.v2 v2.2.8 diff --git a/go.sum b/go.sum index 0c8b8134..825362ce 100644 --- a/go.sum +++ b/go.sum @@ -781,6 +781,8 @@ github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd h1:CmH9+J6ZSsIjUK github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk= github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35 h1:eajwn6K3weW5cd1ZXLu2sJ4pvwlBiCWY4uDejOr73gM= github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35/go.mod h1:wozgYq9WEBQBaIJe4YZ0qTSFAMxmcwBhQH0fO0R34Z0= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= diff --git a/util/encoding.go b/util/encoding.go new file mode 100644 index 00000000..c841f295 --- /dev/null +++ b/util/encoding.go @@ -0,0 +1,46 @@ +package util + +import ( + "io/ioutil" + "strings" + "unicode/utf8" + + "github.com/saintfish/chardet" + "golang.org/x/net/html/charset" +) + +func ToUtf8(text string, possibleContentType string) string { + if utf8.ValidString(text) { + return text + } + + textCharset := "" + + if possibleContentType != "" { + _, name, ok := charset.DetermineEncoding([]byte(text), possibleContentType) + if ok { + textCharset = name + } + } + + if textCharset == "" { + detector := chardet.NewTextDetector() + cs, err := detector.DetectBest([]byte(text)) + if err != nil { + return text // best we can do + } + textCharset = cs.Charset + } + + r, err := charset.NewReader(strings.NewReader(text), textCharset) + if err != nil { + return text // best we can do + } + + converted, err := ioutil.ReadAll(r) + if err != nil { + return text // best we can do + } + + return string(converted) +} -- GitLab