diff --git a/reader/atom/atom_03.go b/reader/atom/atom_03.go index d10d5cc83ac9562ab527730bf7ba21e8ddf10502..3e8dc6d074f23cf71c688801305430c3805ccac1 100644 --- a/reader/atom/atom_03.go +++ b/reader/atom/atom_03.go @@ -60,6 +60,10 @@ func (a *atom03Feed) Transform(baseURL string) *model.Feed { item.Author = a.Author.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_03_test.go b/reader/atom/atom_03_test.go index 75083d93d9cbe2d61d933b0591b38f7c5dc6f747..f88424c380961eda55c5b065193926eb1c5c1448 100644 --- a/reader/atom/atom_03_test.go +++ b/reader/atom/atom_03_test.go @@ -98,7 +98,7 @@ func TestParseAtom03WithoutFeedTitle(t *testing.T) { } } -func TestParseAtom03WithoutEntryTitle(t *testing.T) { +func TestParseAtom03WithoutEntryTitleButWithLink(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed version="0.3" xmlns="http://purl.org/atom/ns#"> <title>dive into mark</title> @@ -125,6 +125,62 @@ func TestParseAtom03WithoutEntryTitle(t *testing.T) { } } +func TestParseAtom03WithoutEntryTitleButWithSummary(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed version="0.3" xmlns="http://purl.org/atom/ns#"> + <title>dive into mark</title> + <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> + <modified>2003-12-13T18:30:02Z</modified> + <author><name>Mark Pilgrim</name></author> + <entry> + <link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/> + <id>tag:diveintomark.org,2003:3.2397</id> + <summary type="text/plain">It's a test</summary> + </entry> + </feed>` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "It's a test" { + t.Errorf("Incorrect entry title, 
got: %s", feed.Entries[0].Title) + } +} + +func TestParseAtom03WithoutEntryTitleButWithXMLContent(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed version="0.3" xmlns="http://purl.org/atom/ns#"> + <title>dive into mark</title> + <link rel="alternate" type="text/html" href="http://diveintomark.org/"/> + <modified>2003-12-13T18:30:02Z</modified> + <author><name>Mark Pilgrim</name></author> + <entry> + <link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/> + <id>tag:diveintomark.org,2003:3.2397</id> + <content mode="xml" type="text/html"><p>Some text.</p></content> + </entry> + </feed>` + + feed, err := Parse("http://diveintomark.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "Some text." { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseAtom03WithSummaryOnly(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed version="0.3" xmlns="http://purl.org/atom/ns#"> diff --git a/reader/atom/atom_10.go b/reader/atom/atom_10.go index 4b45603ba5f357c6b3eeb9393de726b22d1d2ba5..441f8a51196d9caefc7c36bc4716c7e6d20cbc60 100644 --- a/reader/atom/atom_10.go +++ b/reader/atom/atom_10.go @@ -16,6 +16,7 @@ import ( "miniflux.app/model" "miniflux.app/reader/date" "miniflux.app/reader/media" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -64,6 +65,10 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed { item.Author = a.Authors.String() } + if item.Title == "" { + item.Title = sanitizer.TruncateHTML(item.Content, 100) + } + if item.Title == "" { item.Title = item.URL } diff --git a/reader/atom/atom_10_test.go b/reader/atom/atom_10_test.go index 5138176575bc42de1952445c187c0b0c4ca66d89..a0ee192b1f48bc0aed6bd4f8e5e356c68fe4488c 100644 --- a/reader/atom/atom_10_test.go +++ 
b/reader/atom/atom_10_test.go @@ -100,7 +100,7 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleButWithURL(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> @@ -116,7 +116,6 @@ func TestParseEntryWithoutTitle(t *testing.T) { <link href="http://example.org/2003/12/13/atom03"/> <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> - <summary>Some text.</summary> </entry> </feed>` @@ -131,6 +130,70 @@ func TestParseEntryWithoutTitle(t *testing.T) { } } +func TestParseEntryWithoutTitleButWithSummary(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <summary>Some text.</summary> + </entry> + + </feed>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "Some text." 
{ + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseEntryWithoutTitleButWithXHTMLContent(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <feed xmlns="http://www.w3.org/2005/Atom"> + + <title>Example Feed</title> + <link href="http://example.org/"/> + <updated>2003-12-13T18:30:02Z</updated> + <author> + <name>John Doe</name> + </author> + <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> + + <entry> + <link href="http://example.org/2003/12/13/atom03"/> + <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> + <updated>2003-12-13T18:30:02Z</updated> + <content type="xhtml"> + <div xmlns="http://www.w3.org/1999/xhtml">AT&T bought <b>by SBC</b>!</div> + </content> + </entry> + + </feed>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "AT&T bought by SBC!" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseFeedURL(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> diff --git a/reader/json/json.go b/reader/json/json.go index 31ba961c812d037d5c4f7d6fbba946e168ad1e47..589d7cb0edccca46e19ffa9b32597f49307d19da 100644 --- a/reader/json/json.go +++ b/reader/json/json.go @@ -12,6 +12,7 @@ import ( "miniflux.app/logger" "miniflux.app/model" "miniflux.app/reader/date" + "miniflux.app/reader/sanitizer" "miniflux.app/url" ) @@ -130,9 +131,13 @@ func (j *jsonItem) GetHash() string { } func (j *jsonItem) GetTitle() string { - for _, value := range []string{j.Title, j.Summary, j.Text, j.URL} { + if j.Title != "" { + return j.Title + } + + for _, value := range []string{j.Summary, j.Text, j.HTML} { if value != "" { - return truncate(value) + return sanitizer.TruncateHTML(value, 100) } } @@ -186,16 +191,3 @@ func getAuthor(author jsonAuthor) string { return "" } - -func truncate(str string) string { - max := 100 - str = 
strings.TrimSpace(str) - - // Convert to runes to be safe with unicode - runes := []rune(str) - if len(runes) > max { - return string(runes[:max]) + "…" - } - - return str -} diff --git a/reader/json/parser_test.go b/reader/json/parser_test.go index 0bd6e6c72b43dad82cec4324533dd5fac146abf0..5ba82d45587f03f7697ec754bc021de80f76e524 100644 --- a/reader/json/parser_test.go +++ b/reader/json/parser_test.go @@ -76,7 +76,7 @@ func TestParseJsonFeed(t *testing.T) { t.Errorf("Incorrect entry URL, got: %s", feed.Entries[1].URL) } - if feed.Entries[1].Title != "https://example.org/initial-post" { + if feed.Entries[1].Title != "Hello, world!" { t.Errorf(`Incorrect entry title, got: "%s"`, feed.Entries[1].Title) } @@ -398,7 +398,7 @@ func TestParseFeedItemWithoutID(t *testing.T) { } } -func TestParseFeedItemWithoutTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithURL(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -425,7 +425,7 @@ func TestParseFeedItemWithoutTitle(t *testing.T) { } } -func TestParseTruncateItemTitle(t *testing.T) { +func TestParseFeedItemWithoutTitleButWithSummary(t *testing.T) { data := `{ "version": "https://jsonfeed.org/version/1", "title": "My Example Feed", @@ -433,7 +433,7 @@ func TestParseTruncateItemTitle(t *testing.T) { "feed_url": "https://example.org/feed.json", "items": [ { - "title": "` + strings.Repeat("a", 200) + `" + "summary": "This is some text content." } ] }` @@ -447,9 +447,63 @@ func TestParseTruncateItemTitle(t *testing.T) { t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) } - if len(feed.Entries[0].Title) != 103 { + if feed.Entries[0].Title != "This is some text content." 
{ + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + +func TestParseFeedItemWithoutTitleButWithHTMLContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_html": "This is <strong>HTML</strong>." + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if feed.Entries[0].Title != "This is HTML." { t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) } +} + +func TestParseFeedItemWithoutTitleButWithTextContent(t *testing.T) { + data := `{ + "version": "https://jsonfeed.org/version/1", + "title": "My Example Feed", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "items": [ + { + "content_text": "` + strings.Repeat("a", 200) + `" + } + ] + }` + + feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if len(feed.Entries) != 1 { + t.Errorf("Incorrect number of entries, got: %d", len(feed.Entries)) + } + + if len(feed.Entries[0].Title) != 103 { + t.Errorf("Incorrect entry title, got: %d", len(feed.Entries[0].Title)) + } if len([]rune(feed.Entries[0].Title)) != 101 { t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) diff --git a/reader/rss/parser_test.go b/reader/rss/parser_test.go index 197994c71f7bd9ca341dc256cc19a705a21f43f5..9be293b42d3cd374797d70d73767ea829e59cb6e 100644 --- a/reader/rss/parser_test.go +++ b/reader/rss/parser_test.go @@ -115,7 +115,7 @@ func TestParseFeedWithoutTitle(t *testing.T) { } } -func TestParseEntryWithoutTitle(t *testing.T) { +func TestParseEntryWithoutTitleAndDescription(t *testing.T) { data := `<?xml version="1.0" 
encoding="utf-8"?> <rss version="2.0"> <channel> @@ -136,6 +136,30 @@ func TestParseEntryWithoutTitle(t *testing.T) { } } +func TestParseEntryWithoutTitleButWithDescription(t *testing.T) { + data := `<?xml version="1.0" encoding="utf-8"?> + <rss version="2.0"> + <channel> + <link>https://example.org/</link> + <item> + <link>https://example.org/item</link> + <description> + This is the description + </description> + </item> + </channel> + </rss>` + + feed, err := Parse("https://example.org/", bytes.NewBufferString(data)) + if err != nil { + t.Fatal(err) + } + + if feed.Entries[0].Title != "This is the description" { + t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title) + } +} + func TestParseEntryWithMediaTitle(t *testing.T) { data := `<?xml version="1.0" encoding="utf-8"?> <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"> diff --git a/reader/rss/rss.go b/reader/rss/rss.go index db08239321f21cb64b2cd0639d9625037023b8dc..fb042632af26e26496d0d23b41ff65e9cb92117b 100644 --- a/reader/rss/rss.go +++ b/reader/rss/rss.go @@ -73,6 +73,10 @@ func (r *rssFeed) Transform(baseURL string) *model.Feed { } } + if entry.Title == "" { + entry.Title = sanitizer.TruncateHTML(entry.Content, 100) + } + if entry.Title == "" { entry.Title = entry.URL } diff --git a/reader/sanitizer/truncate.go b/reader/sanitizer/truncate.go new file mode 100644 index 0000000000000000000000000000000000000000..04acc1d6299e529616237287f3f67e6ca6e4188b --- /dev/null +++ b/reader/sanitizer/truncate.go @@ -0,0 +1,23 @@ +// Copyright 2022 Frédéric Guillot. All rights reserved. +// Use of this source code is governed by the Apache 2.0 +// license that can be found in the LICENSE file. 
+
+package sanitizer
+
+import "strings"
+
+// TruncateHTML strips tags from input, collapses each whitespace run into one
+// space and keeps at most max runes (negative counts as 0), adding "…" if cut.
+func TruncateHTML(input string, max int) string {
+	// strings.Fields splits on any whitespace run, so the join also trims the ends.
+	text := strings.Join(strings.Fields(StripTags(input)), " ")
+	// Slice runes rather than bytes so multi-byte characters are never split.
+	runes := []rune(text)
+	if max < 0 {
+		max = 0
+	}
+	if len(runes) > max {
+		return strings.TrimSpace(string(runes[:max])) + "…"
+	}
+	return text
+}
diff --git a/reader/sanitizer/truncate_test.go b/reader/sanitizer/truncate_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..2c7e87b64961e4e2a76975959e0822662a1a344d
--- /dev/null
+++ b/reader/sanitizer/truncate_test.go
@@ -0,0 +1,85 @@
+// Copyright 2022 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "testing"
+
+func TestTruncateHTMLWithTextLowerThanLimit(t *testing.T) {
+	input := `This is a <strong>bug 🐛</strong>.`
+	expected := `This is a bug 🐛.`
+	output := TruncateHTML(input, 50)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithTextAboveLimit(t *testing.T) {
+	input := `This is <strong>HTML</strong>.`
+	expected := `This…`
+	output := TruncateHTML(input, 4)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithUnicodeTextAboveLimit(t *testing.T) {
+	input := `This is a <strong>bike 🚲</strong>.`
+	expected := `This…`
+	output := TruncateHTML(input, 4)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithMultilineTextAboveLimit(t *testing.T) {
+	input := `
+		This is a <strong>bike
+		🚲</strong>.
+
+	`
+	expected := `This is a bike…`
+	output := TruncateHTML(input, 15)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) {
+	input := `
+		This is a <strong>bike
+		🚲</strong>.
+
+	`
+	expected := `This is a bike 🚲.`
+	output := TruncateHTML(input, 20)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithRepeatedWhitespace(t *testing.T) {
+	input := "This   is \r\n\t a <strong>test</strong>."
+	expected := `This is a test.`
+	output := TruncateHTML(input, 50)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}
+
+func TestTruncateHTMLWithNegativeLimit(t *testing.T) {
+	input := `This is <strong>HTML</strong>.`
+	expected := `…`
+	output := TruncateHTML(input, -1)
+
+	if expected != output {
+		t.Errorf(`Wrong output: %q != %q`, expected, output)
+	}
+}