From 1eb01b39e718439072fd8e81d98387a5db260ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Guillot?= <f@miniflux.net> Date: Fri, 4 Mar 2022 16:49:44 -0800 Subject: [PATCH] Use truncated entry description as title if unavailable --- reader/atom/atom_03.go | 4 ++ reader/atom/atom_03_test.go | 58 +++++++++++++++++++++++++- reader/atom/atom_10.go | 5 +++ reader/atom/atom_10_test.go | 67 ++++++++++++++++++++++++++++++- reader/json/json.go | 22 ++++------ reader/json/parser_test.go | 64 ++++++++++++++++++++++++++--- reader/rss/parser_test.go | 26 +++++++++++- reader/rss/rss.go | 4 ++ reader/sanitizer/truncate.go | 23 +++++++++++ reader/sanitizer/truncate_test.go | 65 ++++++++++++++++++++++++++++++ 10 files changed, 314 insertions(+), 24 deletions(-) create mode 100644 reader/sanitizer/truncate.go create mode 100644 reader/sanitizer/truncate_test.go diff --git a/reader/atom/atom_03.go b/reader/atom/atom_03.go index d10d5cc8..3e8dc6d0 100644 --- a/reader/atom/atom_03.go +++ b/reader/atom/atom_03.go @@ -60,6 +60,10 @@ func (a *atom03Feed) Transform(baseURL string) *model.Feed { item.Author = a.Author.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_03_test.go b/reader/atom/atom_03_test.go index 75083d93..f88424c3 100644 --- a/reader/atom/atom_03_test.go +++ b/reader/atom/atom_03_test.go @@ -98,7 +98,7 @@ func TestParseAtom03WithoutFeedTitle(t *testing.T) { } } -func TestParseAtom03WithoutEntryTitle(t *testing.T) { +func TestParseAtom03WithoutEntryTitleButWithLink(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed version="0.3" xmlns="http://purl.org/atom/ns#"> <title>dive into mark</title> @@ -125,6 +125,62 @@ func TestParseAtom03WithoutEntryTitle(t *testing.T) { } } +func TestParseAtom03WithoutEntryTitleButWithSummary(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed version="0.3" 
xmlns="http://purl.org/atom/ns#"> + <title>dive into mark</title> + <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> + <modified>2003-12-13T18:30:02Z</modified> + <author><name>Mark Pilgrim</name></author> + <entry> + <link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/> + <id>tag:diveintomark.org,2003:3.2397</id> + <summary type="text/plain">It's a test</summary> + </entry> + </feed>` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "It's a test" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseAtom03WithoutEntryTitleButWithXMLContent(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed version="0.3" xmlns="http://purl.org/atom/ns#"> + <title>dive into mark</title> + <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> + <modified>2003-12-13T18:30:02Z</modified> + <author><name>Mark Pilgrim</name></author> + <entry> + <link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/> + <id>tag:diveintomark.org,2003:3.2397</id> + <content mode="xml" type="text/html"><p>Some text.</p></content> + </entry> + </feed>` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "Some text." 
{ + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseAtom03WithSummaryOnly(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed version="0.3" xmlns="http://purl.org/atom/ns#"> diff --git a/reader/atom/atom_10.go b/reader/atom/atom_10.go index 4b45603b..441f8a51 100644 --- a/reader/atom/atom_10.go +++ b/reader/atom/atom_10.go @@ -16,6 +16,7 @@ import ( "miniflux.app/model" "miniflux.app/reader/date" "miniflux.app/reader/media" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -64,6 +65,10 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed { item.Author = a.Authors.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_10_test.go b/reader/atom/atom_10_test.go index 51381765..a0ee192b 100644 --- a/reader/atom/atom_10_test.go +++ b/reader/atom/atom_10_test.go @@ -100,7 +100,7 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleButWithURL(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> @@ -116,7 +116,6 @@ func TestParseEntryWithoutTitle(t *testing.T) { <link href="http://example.org/2003/12/13/atom03"/> <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> - <summary>Some text.</summary> </entry> </feed>` @@ -131,6 +130,70 @@ func TestParseEntryWithoutTitle(t *testing.T) { } } +func TestParseEntryWithoutTitleButWithSummary(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <link 
href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "Some text." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithoutTitleButWithXHTMLContent(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <content type="xhtml"> + <div xmlns="http://www.w3.org/1999/xhtml">AT&T bought <b>by SBC</b>!</div> + </content> + </entry> + + </feed>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "AT&T bought by SBC!" 
{ + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseFeedURL(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> diff --git a/reader/json/json.go b/reader/json/json.go index 31ba961c..589d7cb0 100644 --- a/reader/json/json.go +++ b/reader/json/json.go @@ -12,6 +12,7 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/date" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -130,9 +131,13 @@ func (j *jsonItem) GetHash() string { } func (j *jsonItem) GetTitle() string { - for _, value := range []string{j.Title, j.Summary, j.Text, j.URL} { + if j.Title != "" { + return j.Title + } + + for _, value := range []string{j.Summary, j.Text, j.HTML} { if value != "" { - return truncate(value) + return sanitizer.TruncateHTML(value, 100) } } @@ -186,16 +191,3 @@ func getAuthor(author jsonAuthor) string { return "" } - -func truncate(str string) string { - max := 100 - str = strings.TrimSpace(str) - - // Convert to runes to be safe with unicode - runes := []rune(str) - if len(runes) > max { - return string(runes[:max]) + "…" - } - - return str -} diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go index 0bd6e6c7..5ba82d45 100644 --- a/reader/json/parser_test.go +++ b/reader/json/parser_test.go @@ -76,7 +76,7 @@ func TestParseJsonFeed(t *testing.T) { t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL) } - if feed.Entries[1].Title != "https://example.org/initial-post" { + if feed.Entries[1].Title != "Hello, world!" 
{ t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title) } @@ -398,7 +398,7 @@ func TestParseFeedItemWithoutID(t *testing.T) { } } -func TestParseFeedItemWithoutTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithURL(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -425,7 +425,7 @@ func TestParseFeedItemWithoutTitle(t *testing.T) { } } -func TestParseTruncateItemTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithSummary(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -433,7 +433,7 @@ func TestParseTruncateItemTitle(t *testing.T) { "feed_url": "https://example.org/feed.json", "items": [ { - "title": "` + strings.Repeat("a", 200) + `" + "summary": "This is some text content." } ] }` @@ -447,9 +447,63 @@ func TestParseTruncateItemTitle(t *testing.T) { t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) } - if len(feed.Entries[0].Title) != 103 { + if feed.Entries[0].Title != "This is some text content." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedItemWithoutTitleButWithHTMLContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_html": "This is <strong>HTML</strong>." + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "This is HTML." 
{ t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) } +} + +func TestParseFeedItemWithoutTitleButWithTextContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_text": "` + strings.Repeat("a", 200) + `" + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if len(feed.Entries[0].Title) != 103 { + t.Errorf("Incorrect entry title, got: %d", len(feed.Entries[0].Title)) + } if len([]rune(feed.Entries[0].Title)) != 101 { t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index 197994c7..9be293b4 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -115,7 +115,7 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleAndDescription(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <rss version="2.0"> <channel> @@ -136,6 +136,30 @@ func TestParseEntryWithoutTitle(t *testing.T) { } } +func TestParseEntryWithoutTitleButWithDescription(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <link>https://example.org/</link> + <item> + <link>https://example.org/item</link> + <description> + This is the description + </description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "This is the description" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseEntryWithMediaTitle(t *testing.T) { data 
:= `<?xml version="1.0" encoding="utf-8"?> <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"> diff --git a/reader/rss/rss.go b/reader/rss/rss.go index db082393..fb042632 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -73,6 +73,10 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { } } + if entry.Title == "" { + entry.Title = sanitizer.TruncateHTML(entry.Content, 100) + } + if entry.Title == "" { entry.Title = entry.URL } diff --git a/reader/sanitizer/truncate.go b/reader/sanitizer/truncate.go new file mode 100644 index 00000000..04acc1d6 --- /dev/null +++ b/reader/sanitizer/truncate.go @@ -0,0 +1,23 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. + +package sanitizer + +import "strings" + +func TruncateHTML(input string, max int) string { + text := StripTags(input) + text = strings.ReplaceAll(text, "\n", " ") + text = strings.ReplaceAll(text, "\t", " ") + text = strings.ReplaceAll(text, " ", " ") + text = strings.TrimSpace(text) + + // Convert to runes to be safe with unicode + runes := []rune(text) + if len(runes) > max { + return strings.TrimSpace(string(runes[:max])) + "…" + } + + return text +} diff --git a/reader/sanitizer/truncate_test.go b/reader/sanitizer/truncate_test.go new file mode 100644 index 00000000..2c7e87b6 --- /dev/null +++ b/reader/sanitizer/truncate_test.go @@ -0,0 +1,65 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. 
+ +package sanitizer + +import "testing" + +func TestTruncateHTMWithTextLowerThanLimitL(t *testing.T) { + input := `This is a <strong>bug 🐛</strong>.` + expected := `This is a bug 🐛.` + output := TruncateHTML(input, 50) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithTextAboveLimit(t *testing.T) { + input := `This is <strong>HTML</strong>.` + expected := `This…` + output := TruncateHTML(input, 4) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithUnicodeTextAboveLimit(t *testing.T) { + input := `This is a <strong>bike 🚲</strong>.` + expected := `This…` + output := TruncateHTML(input, 4) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithMultilineTextAboveLimit(t *testing.T) { + input := ` + This is a <strong>bike + 🚲</strong>. + + ` + expected := `This is a bike…` + output := TruncateHTML(input, 15) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} + +func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) { + input := ` + This is a <strong>bike + 🚲</strong>. + + ` + expected := `This is a bike 🚲.` + output := TruncateHTML(input, 20) + + if expected != output { + t.Errorf(`Wrong output: %q != %q`, expected, output) + } +} -- GitLab