From 1eb01b39e718439072fd8e81d98387a5db260ad6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= <f@miniflux.net>
Date: Fri, 4 Mar 2022 16:49:44 -0800
Subject: [PATCH] Use truncated entry description as title if unavailable

---
 reader/atom/atom_03.go            |  4 ++
 reader/atom/atom_03_test.go       | 58 +++++++++++++++++++++++++-
 reader/atom/atom_10.go            |  5 +++
 reader/atom/atom_10_test.go       | 67 ++++++++++++++++++++++++++++++-
 reader/json/json.go               | 22 ++++------
 reader/json/parser_test.go        | 64 ++++++++++++++++++++++++++---
 reader/rss/parser_test.go         | 26 +++++++++++-
 reader/rss/rss.go                 |  4 ++
 reader/sanitizer/truncate.go      | 23 +++++++++++
 reader/sanitizer/truncate_test.go | 65 ++++++++++++++++++++++++++++++
 10 files changed, 314 insertions(+), 24 deletions(-)
 create mode 100644 reader/sanitizer/truncate.go
 create mode 100644 reader/sanitizer/truncate_test.go

diff --git a/reader/atom/atom_03.go b/reader/atom/atom_03.go
index d10d5cc8..3e8dc6d0 100644
--- a/reader/atom/atom_03.go
+++ b/reader/atom/atom_03.go
@@ -60,6 +60,10 @@ func (a *atom03Feed) Transform(baseURL string) *model.Feed {
 			item.Author = a.Author.String()
 		}
 
+		if item.Title == "" {
+			item.Title = sanitizer.TruncateHTML(item.Content, 100)
+		}
+
 		if item.Title == "" {
 			item.Title = item.URL
 		}
diff --git a/reader/atom/atom_03_test.go b/reader/atom/atom_03_test.go
index 75083d93..f88424c3 100644
--- a/reader/atom/atom_03_test.go
+++ b/reader/atom/atom_03_test.go
@@ -98,7 +98,7 @@ func TestParseAtom03WithoutFeedTitle(t *testing.T) {
 	}
 }
 
-func TestParseAtom03WithoutEntryTitle(t *testing.T) {
+func TestParseAtom03WithoutEntryTitleButWithLink(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
 		<title>dive into mark</title>
@@ -125,6 +125,62 @@ func TestParseAtom03WithoutEntryTitle(t *testing.T) {
 	}
 }
 
+// TestParseAtom03WithoutEntryTitleButWithSummary checks that an entry without a title is titled from its summary text.
+func TestParseAtom03WithoutEntryTitleButWithSummary(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
+		<title>dive into mark</title>
+		<link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
+		<modified>2003-12-13T18:30:02Z</modified>
+		<author><name>Mark Pilgrim</name></author>
+		<entry>
+			<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
+			<id>tag:diveintomark.org,2003:3.2397</id>
+			<summary type="text/plain">It&apos;s a test</summary>
+		</entry>
+	</feed>`
+
+	feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Abort on a count mismatch: indexing Entries[0] below would panic on an empty slice.
+	if len(feed.Entries) != 1 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+	if feed.Entries[0].Title != "It's a test" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
+// TestParseAtom03WithoutEntryTitleButWithXMLContent checks that an entry without a title is titled from the text of its XML content.
+func TestParseAtom03WithoutEntryTitleButWithXMLContent(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
+		<title>dive into mark</title>
+		<link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
+		<modified>2003-12-13T18:30:02Z</modified>
+		<author><name>Mark Pilgrim</name></author>
+		<entry>
+			<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
+			<id>tag:diveintomark.org,2003:3.2397</id>
+			<content mode="xml" type="text/html"><p>Some text.</p></content>
+		</entry>
+	</feed>`
+
+	feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Abort on a count mismatch: indexing Entries[0] below would panic on an empty slice.
+	if len(feed.Entries) != 1 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+	if feed.Entries[0].Title != "Some text." {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
 func TestParseAtom03WithSummaryOnly(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
diff --git a/reader/atom/atom_10.go b/reader/atom/atom_10.go
index 4b45603b..441f8a51 100644
--- a/reader/atom/atom_10.go
+++ b/reader/atom/atom_10.go
@@ -16,6 +16,7 @@ import (
 	"miniflux.app/model"
 	"miniflux.app/reader/date"
 	"miniflux.app/reader/media"
+	"miniflux.app/reader/sanitizer"
 	"miniflux.app/url"
 )
 
@@ -64,6 +65,10 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed {
 			item.Author = a.Authors.String()
 		}
 
+		if item.Title == "" {
+			item.Title = sanitizer.TruncateHTML(item.Content, 100)
+		}
+
 		if item.Title == "" {
 			item.Title = item.URL
 		}
diff --git a/reader/atom/atom_10_test.go b/reader/atom/atom_10_test.go
index 51381765..a0ee192b 100644
--- a/reader/atom/atom_10_test.go
+++ b/reader/atom/atom_10_test.go
@@ -100,7 +100,7 @@ func TestParseFeedWithoutTitle(t *testing.T) {
 	}
 }
 
-func TestParseEntryWithoutTitle(t *testing.T) {
+func TestParseEntryWithoutTitleButWithURL(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 
@@ -116,7 +116,6 @@ func TestParseEntryWithoutTitle(t *testing.T) {
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<summary>Some text.</summary>
 	  </entry>
 
 	</feed>`
@@ -131,6 +130,70 @@ func TestParseEntryWithoutTitle(t *testing.T) {
 	}
 }
 
+// TestParseEntryWithoutTitleButWithSummary checks that an Atom 1.0 entry without a title is titled from its summary.
+func TestParseEntryWithoutTitleButWithSummary(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+	  <updated>2003-12-13T18:30:02Z</updated>
+	  <author>
+		<name>John Doe</name>
+	  </author>
+	  <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
+
+	  <entry>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary>Some text.</summary>
+	  </entry>
+
+	</feed>`
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// NOTE(review): Entries[0] is read without a length check; an empty entry list would panic here.
+	if feed.Entries[0].Title != "Some text." {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
+// TestParseEntryWithoutTitleButWithXHTMLContent checks that an untitled entry is titled from its XHTML content, with markup removed and entities decoded.
+func TestParseEntryWithoutTitleButWithXHTMLContent(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+	  <updated>2003-12-13T18:30:02Z</updated>
+	  <author>
+		<name>John Doe</name>
+	  </author>
+	  <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
+
+	  <entry>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content type="xhtml">
+			<div xmlns="http://www.w3.org/1999/xhtml">AT&amp;T bought <b>by SBC</b>!</div>
+		</content>
+	  </entry>
+
+	</feed>`
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// NOTE(review): Entries[0] is read without a length check; an empty entry list would panic here.
+	if feed.Entries[0].Title != "AT&T bought by SBC!" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
 func TestParseFeedURL(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
diff --git a/reader/json/json.go b/reader/json/json.go
index 31ba961c..589d7cb0 100644
--- a/reader/json/json.go
+++ b/reader/json/json.go
@@ -12,6 +12,7 @@ import (
 	"miniflux.app/logger"
 	"miniflux.app/model"
 	"miniflux.app/reader/date"
+	"miniflux.app/reader/sanitizer"
 	"miniflux.app/url"
 )
 
@@ -130,9 +131,13 @@ func (j *jsonItem) GetHash() string {
 }
 
 func (j *jsonItem) GetTitle() string {
-	for _, value := range []string{j.Title, j.Summary, j.Text, j.URL} {
+	if j.Title != "" {
+		return j.Title
+	}
+
+	for _, value := range []string{j.Summary, j.Text, j.HTML} {
 		if value != "" {
-			return truncate(value)
+			return sanitizer.TruncateHTML(value, 100)
 		}
 	}
 
@@ -186,16 +191,3 @@ func getAuthor(author jsonAuthor) string {
 
 	return ""
 }
-
-func truncate(str string) string {
-	max := 100
-	str = strings.TrimSpace(str)
-
-	// Convert to runes to be safe with unicode
-	runes := []rune(str)
-	if len(runes) > max {
-		return string(runes[:max]) + "…"
-	}
-
-	return str
-}
diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go
index 0bd6e6c7..5ba82d45 100644
--- a/reader/json/parser_test.go
+++ b/reader/json/parser_test.go
@@ -76,7 +76,7 @@ func TestParseJsonFeed(t *testing.T) {
 		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL)
 	}
 
-	if feed.Entries[1].Title != "https://example.org/initial-post" {
+	if feed.Entries[1].Title != "Hello, world!" {
 		t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title)
 	}
 
@@ -398,7 +398,7 @@ func TestParseFeedItemWithoutID(t *testing.T) {
 	}
 }
 
-func TestParseFeedItemWithoutTitle(t *testing.T) {
+func TestParseFeedItemWithoutTitleButWithURL(t *testing.T) {
 	data := `{
 		"version": "https://jsonfeed.org/version/1",
 		"title": "My Example Feed",
@@ -425,7 +425,7 @@ func TestParseFeedItemWithoutTitle(t *testing.T) {
 	}
 }
 
-func TestParseTruncateItemTitle(t *testing.T) {
+func TestParseFeedItemWithoutTitleButWithSummary(t *testing.T) {
 	data := `{
 		"version": "https://jsonfeed.org/version/1",
 		"title": "My Example Feed",
@@ -433,7 +433,7 @@ func TestParseTruncateItemTitle(t *testing.T) {
 		"feed_url": "https://example.org/feed.json",
 		"items": [
 			{
-				"title": "` + strings.Repeat("a", 200) + `"
+				"summary": "This is some text content."
 			}
 		]
 	}`
@@ -447,9 +447,63 @@ func TestParseTruncateItemTitle(t *testing.T) {
 		t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries))
 	}
 
-	if len(feed.Entries[0].Title) != 103 {
+	if feed.Entries[0].Title != "This is some text content." {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
+// TestParseFeedItemWithoutTitleButWithHTMLContent checks that an untitled item is titled from its content_html with the tags removed.
+func TestParseFeedItemWithoutTitleButWithHTMLContent(t *testing.T) {
+	data := `{
+		"version": "https://jsonfeed.org/version/1",
+		"title": "My Example Feed",
+		"home_page_url": "https://example.org/",
+		"feed_url": "https://example.org/feed.json",
+		"items": [
+			{
+				"content_html": "This is <strong>HTML</strong>."
+			}
+		]
+	}`
+
+	feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Abort on a count mismatch: indexing Entries[0] below would panic on an empty slice.
+	if len(feed.Entries) != 1 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+	if feed.Entries[0].Title != "This is HTML." {
 		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
 	}
+}
+
+func TestParseFeedItemWithoutTitleButWithTextContent(t *testing.T) {
+	data := `{
+		"version": "https://jsonfeed.org/version/1",
+		"title": "My Example Feed",
+		"home_page_url": "https://example.org/",
+		"feed_url": "https://example.org/feed.json",
+		"items": [
+			{
+				"content_text": "` + strings.Repeat("a", 200) + `"
+			}
+		]
+	}`
+
+	feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	if len(feed.Entries[0].Title) != 103 {
+		t.Errorf("Incorrect entry title, got: %d", len(feed.Entries[0].Title))
+	}
 
 	if len([]rune(feed.Entries[0].Title)) != 101 {
 		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go
index 197994c7..9be293b4 100644
--- a/reader/rss/parser_test.go
+++ b/reader/rss/parser_test.go
@@ -115,7 +115,7 @@ func TestParseFeedWithoutTitle(t *testing.T) {
 	}
 }
 
-func TestParseEntryWithoutTitle(t *testing.T) {
+func TestParseEntryWithoutTitleAndDescription(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0">
 		<channel>
@@ -136,6 +136,30 @@ func TestParseEntryWithoutTitle(t *testing.T) {
 	}
 }
 
+// TestParseEntryWithoutTitleButWithDescription checks that an untitled RSS item is titled from its description, without the surrounding whitespace.
+func TestParseEntryWithoutTitleButWithDescription(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<link>https://example.org/item</link>
+				<description>
+					This is the description
+				</description>
+			</item>
+		</channel>
+		</rss>`
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+	// NOTE(review): Entries[0] is read without a length check; an empty entry list would panic here.
+	if feed.Entries[0].Title != "This is the description" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
 func TestParseEntryWithMediaTitle(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
diff --git a/reader/rss/rss.go b/reader/rss/rss.go
index db082393..fb042632 100644
--- a/reader/rss/rss.go
+++ b/reader/rss/rss.go
@@ -73,6 +73,10 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed {
 			}
 		}
 
+		if entry.Title == "" {
+			entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
+		}
+
 		if entry.Title == "" {
 			entry.Title = entry.URL
 		}
diff --git a/reader/sanitizer/truncate.go b/reader/sanitizer/truncate.go
new file mode 100644
index 00000000..04acc1d6
--- /dev/null
+++ b/reader/sanitizer/truncate.go
@@ -0,0 +1,23 @@
+// Copyright 2022 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "strings"
+
+// TruncateHTML strips the tags from input, collapses every whitespace run to
+// one space, and cuts the text to max runes, appending "…" when shortened.
+func TruncateHTML(input string, max int) string {
+	// strings.Fields splits on whitespace runs of any length, so three or more
+	// consecutive blanks collapse fully instead of only being halved.
+	text := strings.Join(strings.Fields(StripTags(input)), " ")
+
+	// Convert to runes to be safe with unicode
+	runes := []rune(text)
+	if len(runes) > max {
+		return strings.TrimSpace(string(runes[:max])) + "…"
+	}
+
+	return text
+}
diff --git a/reader/sanitizer/truncate_test.go b/reader/sanitizer/truncate_test.go
new file mode 100644
index 00000000..2c7e87b6
--- /dev/null
+++ b/reader/sanitizer/truncate_test.go
@@ -0,0 +1,65 @@
+// Copyright 2022 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "testing"
+
+// TestTruncateHTMLWithTextLowerThanLimit checks that text under the limit only loses its tags.
+func TestTruncateHTMLWithTextLowerThanLimit(t *testing.T) {
+	input := `This is a <strong>bug 🐛</strong>.`
+	expected := `This is a bug 🐛.`
+	output := TruncateHTML(input, 50)
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+// TestTruncateHTMLWithTextAboveLimit checks that text over the limit is cut and suffixed with an ellipsis.
+func TestTruncateHTMLWithTextAboveLimit(t *testing.T) {
+	input := `This is <strong>HTML</strong>.`
+	expected := `This…`
+	output := TruncateHTML(input, 4)
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+// TestTruncateHTMLWithUnicodeTextAboveLimit checks truncation of text that contains a multi-byte rune.
+func TestTruncateHTMLWithUnicodeTextAboveLimit(t *testing.T) {
+	input := `This is a <strong>bike 🚲</strong>.`
+	expected := `This…`
+	output := TruncateHTML(input, 4)
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+// TestTruncateHTMLWithMultilineTextAboveLimit checks that newlines and tabs are collapsed before the cut.
+func TestTruncateHTMLWithMultilineTextAboveLimit(t *testing.T) {
+	input := `
+		This is a <strong>bike
+		🚲</strong>.
+
+	`
+	expected := `This is a bike…`
+	output := TruncateHTML(input, 15)
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+// TestTruncateHTMLWithMultilineTextLowerThanLimit checks that multiline text under the limit becomes one line.
+func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) {
+	input := `
+		This is a <strong>bike
+ 🚲</strong>.
+
+	`
+	expected := `This is a bike 🚲.`
+	output := TruncateHTML(input, 20)
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
-- 
GitLab