From 36d773223481dd42d31499b3ea73e6999ff9f58e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fre=CC=81de=CC=81ric=20Guillot?= <fred@miniflux.net>
Date: Wed, 18 Sep 2019 22:27:25 -0700
Subject: [PATCH] Disable strict XML parsing

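Set Decoder.Strict to false in the Atom, OPML, RDF and RSS parsers so
that feeds containing common XML mistakes, such as an unescaped "&" in
a URL, are parsed instead of being rejected.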

See https://golang.org/pkg/encoding/xml/#Decoder
---
 reader/atom/parser.go      |  1 +
 reader/atom/parser_test.go | 19 +++++++++++++++++++
 reader/opml/parser.go      |  1 +
 reader/opml/parser_test.go | 34 ++++++++++++++++++++++++++++++++++
 reader/rdf/parser.go       |  1 +
 reader/rdf/parser_test.go  | 19 +++++++++++++++++++
 reader/rss/parser.go       |  1 +
 reader/rss/parser_test.go  | 19 +++++++++++++++++++
 8 files changed, 95 insertions(+)

diff --git a/reader/atom/parser.go b/reader/atom/parser.go
index 85be4b55..4749c1af 100644
--- a/reader/atom/parser.go
+++ b/reader/atom/parser.go
@@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	atomFeed := new(atomFeed)
 	decoder := xml.NewDecoder(data)
 	decoder.Entity = xml.HTMLEntity
+	decoder.Strict = false
 	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(atomFeed)
diff --git a/reader/atom/parser_test.go b/reader/atom/parser_test.go
index dc425751..746c767a 100644
--- a/reader/atom/parser_test.go
+++ b/reader/atom/parser_test.go
@@ -577,3 +577,22 @@ func TestParseWithHTMLEntity(t *testing.T) {
 		t.Errorf(`Incorrect title, got: %q`, feed.Title)
 	}
 }
+
+// TestParseWithInvalidCharacterEntity checks that an unescaped "&" in a link href is accepted and kept in SiteURL.
+func TestParseWithInvalidCharacterEntity(t *testing.T) {
+	payload := `
+		<?xml version="1.0" encoding="utf-8"?>
+		<feed xmlns="http://www.w3.org/2005/Atom">
+			<title>Example Feed</title>
+			<link href="http://example.org/a&b"/>
+		</feed>
+	`
+
+	parsed, err := Parse(bytes.NewBufferString(payload))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := parsed.SiteURL; got != "http://example.org/a&b" {
+		t.Errorf(`Incorrect URL, got: %q`, got)
+	}
+}
diff --git a/reader/opml/parser.go b/reader/opml/parser.go
index 1bdec20f..97974bb5 100644
--- a/reader/opml/parser.go
+++ b/reader/opml/parser.go
@@ -17,6 +17,7 @@ func Parse(data io.Reader) (SubcriptionList, *errors.LocalizedError) {
 	feeds := new(opml)
 	decoder := xml.NewDecoder(data)
 	decoder.Entity = xml.HTMLEntity
+	decoder.Strict = false
 	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feeds)
diff --git a/reader/opml/parser_test.go b/reader/opml/parser_test.go
index f6ce6346..6c09db89 100644
--- a/reader/opml/parser_test.go
+++ b/reader/opml/parser_test.go
@@ -193,6 +193,40 @@ func TestParseOpmlVersion1WithoutOuterOutline(t *testing.T) {
 		}
 	}
 }
+
+// TestParseOpmlWithInvalidCharacterEntity checks that unescaped "&" in xmlUrl/htmlUrl attributes is accepted and preserved.
+func TestParseOpmlWithInvalidCharacterEntity(t *testing.T) {
+	data := `<?xml version="1.0"?>
+	<opml version="1.0">
+		<head>
+			<title>mySubscriptions.opml</title>
+		</head>
+		<body>
+			<outline title="Feed 1">
+				<outline type="rss" title="Feed 1" xmlUrl="http://example.org/feed1/a&b" htmlUrl="http://example.org/c&d"></outline>
+			</outline>
+		</body>
+	</opml>
+	`
+
+	var expected SubcriptionList
+	expected = append(expected, &Subcription{Title: "Feed 1", FeedURL: "http://example.org/feed1/a&b", SiteURL: "http://example.org/c&d", CategoryName: ""})
+
+	subscriptions, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // nothing below is meaningful once parsing has failed
+	}
+	if len(subscriptions) != len(expected) {
+		t.Fatalf("Wrong number of subscriptions: %d instead of %d", len(subscriptions), len(expected)) // stop here: the loop indexes expected[i]
+	}
+
+	for i, subscription := range subscriptions {
+		if !subscription.Equals(expected[i]) {
+			t.Errorf(`Subscription are different: "%v" vs "%v"`, subscription, expected[i])
+		}
+	}
+}
+
 func TestParseInvalidXML(t *testing.T) {
 	data := `garbage`
 	_, err := Parse(bytes.NewBufferString(data))
diff --git a/reader/rdf/parser.go b/reader/rdf/parser.go
index f9423176..861ce8c5 100644
--- a/reader/rdf/parser.go
+++ b/reader/rdf/parser.go
@@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	feed := new(rdfFeed)
 	decoder := xml.NewDecoder(data)
 	decoder.Entity = xml.HTMLEntity
+	decoder.Strict = false
 	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feed)
diff --git a/reader/rdf/parser_test.go b/reader/rdf/parser_test.go
index 734b763a..4f3d033c 100644
--- a/reader/rdf/parser_test.go
+++ b/reader/rdf/parser_test.go
@@ -403,3 +403,22 @@ func TestParseFeedWithHTMLEntity(t *testing.T) {
 		t.Errorf(`Incorrect title, got: %q`, feed.Title)
 	}
 }
+
+// TestParseFeedWithInvalidCharacterEntity checks that an unescaped "&" in the channel link is accepted and kept in SiteURL.
+func TestParseFeedWithInvalidCharacterEntity(t *testing.T) {
+	payload := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
+	  <channel>
+			<title>Example Feed</title>
+			<link>http://example.org/a&b</link>
+	  </channel>
+	</rdf:RDF>`
+
+	parsed, err := Parse(bytes.NewBufferString(payload))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := parsed.SiteURL; got != "http://example.org/a&b" {
+		t.Errorf(`Incorrect URL, got: %q`, got)
+	}
+}
diff --git a/reader/rss/parser.go b/reader/rss/parser.go
index 2b464059..79bd1448 100644
--- a/reader/rss/parser.go
+++ b/reader/rss/parser.go
@@ -18,6 +18,7 @@ func Parse(data io.Reader) (*model.Feed, *errors.LocalizedError) {
 	feed := new(rssFeed)
 	decoder := xml.NewDecoder(data)
 	decoder.Entity = xml.HTMLEntity
+	decoder.Strict = false
 	decoder.CharsetReader = encoding.CharsetReader
 
 	err := decoder.Decode(feed)
diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go
index 72cec4e1..e6049274 100644
--- a/reader/rss/parser_test.go
+++ b/reader/rss/parser_test.go
@@ -633,3 +633,22 @@ func TestParseWithHTMLEntity(t *testing.T) {
 		t.Errorf(`Incorrect title, got: %q`, feed.Title)
 	}
 }
+
+// TestParseWithInvalidCharacterEntity checks that an unescaped "&" in the channel link is accepted and kept in SiteURL.
+func TestParseWithInvalidCharacterEntity(t *testing.T) {
+	payload := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/a&b</link>
+			<title>Example Feed</title>
+		</channel>
+		</rss>`
+
+	parsed, err := Parse(bytes.NewBufferString(payload))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := parsed.SiteURL; got != "https://example.org/a&b" {
+		t.Errorf(`Incorrect url, got: %q`, got)
+	}
+}
-- 
GitLab