From bd663b43a0b2d26936ba8a6172090b845a17550c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= <fred@miniflux.net>
Date: Sat, 25 Nov 2017 18:08:59 -0800
Subject: [PATCH] Improve HTML sanitizer

---
 reader/json/parser_test.go         |  2 +-
 reader/rss/parser_test.go          |  2 +-
 reader/sanitizer/sanitizer.go      | 11 ++++++-----
 reader/sanitizer/sanitizer_test.go | 20 ++++++++++++++++++++
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go
index ecb11a18..dd680f85 100644
--- a/reader/json/parser_test.go
+++ b/reader/json/parser_test.go
@@ -148,7 +148,7 @@ func TestParsePodcast(t *testing.T) {
 		t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[0].Title)
 	}
 
-	if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged & Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
+	if feed.Entries[0].Content != `Chris has worked at <a href="http://adobe.com/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Adobe</a> and as a founder of Rogue Sheep, which won an Apple Design Award for Postage. Chris’s new company is Aged &amp; Distilled with Guy English — which shipped <a href="http://aged-and-distilled.com/napkin/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Napkin</a>, a Mac app for visual collaboration. Chris is also the co-host of The Record. He lives on <a href="http://www.ci.bainbridge-isl.wa.us/" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Bainbridge Island</a>, a quick ferry ride from Seattle.` {
 		t.Errorf(`Incorrect entry content, got: "%s"`, feed.Entries[0].Content)
 	}
 
diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go
index 9f1a557e..f92be508 100644
--- a/reader/rss/parser_test.go
+++ b/reader/rss/parser_test.go
@@ -94,7 +94,7 @@ func TestParseRss2Sample(t *testing.T) {
 		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
 	}
 
-	if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Star City</a>.` {
+	if feed.Entries[0].Content != `How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia&#39;s <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm" rel="noopener noreferrer" target="_blank" referrerpolicy="no-referrer">Star City</a>.` {
 		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
 	}
 }
diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go
index 6af034c8..ad286c6e 100644
--- a/reader/sanitizer/sanitizer.go
+++ b/reader/sanitizer/sanitizer.go
@@ -7,10 +7,11 @@ package sanitizer
 import (
 	"bytes"
 	"fmt"
-	"github.com/miniflux/miniflux2/reader/url"
 	"io"
 	"strings"
 
+	"github.com/miniflux/miniflux2/reader/url"
+
 	"golang.org/x/net/html"
 )
 
@@ -33,7 +34,7 @@ func Sanitize(baseURL, input string) string {
 		token := tokenizer.Token()
 		switch token.Type {
 		case html.TextToken:
-			buffer.WriteString(token.Data)
+			buffer.WriteString(html.EscapeString(token.Data))
 		case html.StartTagToken:
 			tagName := token.DataAtom.String()
 
@@ -72,8 +73,8 @@ func Sanitize(baseURL, input string) string {
 	}
 }
 
-func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) (attrNames []string, html string) {
-	var htmlAttrs []string
+func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
+	var htmlAttrs, attrNames []string
 	var err error
 
 	for _, attribute := range attributes {
@@ -99,7 +100,7 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) (a
 		}
 
 		attrNames = append(attrNames, attribute.Key)
-		htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, value))
+		htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
 	}
 
 	extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go
index 73862d37..6456378f 100644
--- a/reader/sanitizer/sanitizer_test.go
+++ b/reader/sanitizer/sanitizer_test.go
@@ -142,3 +142,23 @@ func TestPixelTracker(t *testing.T) {
 		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
 	}
 }
+
+func TestXmlEntities(t *testing.T) {
+	input := `<pre>echo "test" &gt; /etc/hosts</pre>`
+	expected := `<pre>echo &#34;test&#34; &gt; /etc/hosts</pre>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
+func TestEspaceAttributes(t *testing.T) {
+	input := `<td rowspan="<b>test</b>">test</td>`
+	expected := `<td rowspan="&lt;b&gt;test&lt;/b&gt;">test</td>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
-- 
GitLab