diff --git a/CHANGELOG.md b/CHANGELOG.md index 870cd1297f3edd7efa44417817ed1df2db04ea4c..98e5768b32dff287ee0cd63315f8a610d607a199 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Fixed incorrect HTTP status codes for bad thumbnail requests. * Fixed dimension checking on thumbnails. * Fixed handling of EXIF metadata. Thanks @sorunome! +* Fixed handling of URL previews for some encodings. ## [1.1.3] - July 15th, 2020 diff --git a/controllers/preview_controller/previewers/http.go b/controllers/preview_controller/previewers/http.go index f006ecab16988675caf583f0442c7ba5d1ade2d8..7d566241be31611b16ef54739beeee96b41137ee 100644 --- a/controllers/preview_controller/previewers/http.go +++ b/controllers/preview_controller/previewers/http.go @@ -17,6 +17,7 @@ import ( "github.com/turt2live/matrix-media-repo/common/rcontext" "github.com/turt2live/matrix-media-repo/controllers/preview_controller/acl" "github.com/turt2live/matrix-media-repo/controllers/preview_controller/preview_types" + "github.com/turt2live/matrix-media-repo/util" "github.com/turt2live/matrix-media-repo/util/cleanup" ) @@ -168,10 +169,10 @@ func downloadRawContent(urlPayload *preview_types.UrlPayload, supportedTypes []s } func downloadHtmlContent(urlPayload *preview_types.UrlPayload, supportedTypes []string, languageHeader string, ctx rcontext.RequestContext) (string, error) { - raw, _, _, _, err := downloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) + raw, _, contentType, _, err := downloadRawContent(urlPayload, supportedTypes, languageHeader, ctx) html := "" if raw != nil { - html = string(raw) + html = util.ToUtf8(string(raw), contentType) } return html, err } diff --git a/go.mod b/go.mod index e2cbb5495a6aaff55143f196b6bc7fbc1a352999..c29cce400959d0d8f4c7b122eeb16ad321c748a8 100644 --- a/go.mod +++ b/go.mod @@ -51,13 +51,14 @@ require ( github.com/rubyist/circuitbreaker v2.2.1+incompatible github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd github.com/ryanuber/go-glob v1.0.0 + github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35 github.com/sirupsen/logrus v1.4.2 github.com/smartystreets/goconvey v1.6.4 // indirect github.com/tebeka/strftime v0.1.3 // indirect golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59 golang.org/x/image v0.0.0-20200119044424-58c23975cae1 - golang.org/x/net v0.0.0-20200513185701-a91f0712d120 // indirect + golang.org/x/net v0.0.0-20200513185701-a91f0712d120 golang.org/x/time v0.0.0-20191024005414-555d28b269f0 // indirect gopkg.in/ini.v1 v1.52.0 // indirect gopkg.in/yaml.v2 v2.2.8 diff --git a/go.sum b/go.sum index 0c8b8134f6668f4021a1babdf8a8168a56be1b8d..825362ce9db5c3f9cf8a38e3a18af09029292d40 100644 --- a/go.sum +++ b/go.sum @@ -781,6 +781,8 @@ github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd h1:CmH9+J6ZSsIjUK github.com/rwcarlsen/goexif v0.0.0-20190401172101-9e8deecbddbd/go.mod h1:hPqNNc0+uJM6H+SuU8sEs5K5IQeKccPqeSjfgcKGgPk= github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk= github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35 h1:eajwn6K3weW5cd1ZXLu2sJ4pvwlBiCWY4uDejOr73gM= github.com/sebest/xff v0.0.0-20160910043805-6c115e0ffa35/go.mod h1:wozgYq9WEBQBaIJe4YZ0qTSFAMxmcwBhQH0fO0R34Z0= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= diff --git a/util/encoding.go b/util/encoding.go new file mode 100644 index 0000000000000000000000000000000000000000..c841f29580e7d39ea6e35c7e919288da0ef2f908 --- /dev/null +++ b/util/encoding.go @@ -0,0 +1,46 @@ +package util + +import ( + "io/ioutil" + "strings" + "unicode/utf8" + + "github.com/saintfish/chardet" + "golang.org/x/net/html/charset" +) + +func ToUtf8(text string, possibleContentType string) string { + if utf8.ValidString(text) { + return text + } + + textCharset := "" + + if possibleContentType != "" { + _, name, ok := charset.DetermineEncoding([]byte(text), possibleContentType) + if ok { + textCharset = name + } + } + + if textCharset == "" { + detector := chardet.NewTextDetector() + cs, err := detector.DetectBest([]byte(text)) + if err != nil { + return text // best we can do + } + textCharset = cs.Charset + } + + r, err := charset.NewReader(strings.NewReader(text), textCharset) + if err != nil { + return text // best we can do + } + + converted, err := ioutil.ReadAll(r) + if err != nil { + return text // best we can do + } + + return string(converted) +}