Skip to content
Snippets Groups Projects
Commit 31435ef8, authored and committed by Frédéric Guillot
Browse files

Add rewrite rule to fix Medium.com images

parent d75ff0c5
No related branches found
No related tags found
No related merge requests found
......@@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
return "", err
}
document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
document.Find("script,style").Each(func(i int, s *goquery.Selection) {
removeNodes(s)
})
......
......@@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
return entryContent
}
// fixMediumImages rewrites the image figures found in entryContent.
//
// Every <figure class="paragraph-image"> element is replaced by the text
// content of its nested <noscript> element, re-parsed as HTML. Figures that
// carry no usable <noscript> fallback are left untouched instead of being
// replaced by an empty string (which would silently drop the image).
//
// entryURL is accepted for signature parity with the other rewrite functions
// and is not used here.
//
// The returned string is the inner HTML of the parsed document's <body>. If
// the content cannot be parsed or serialised, entryContent is returned
// unchanged.
func fixMediumImages(entryURL, entryContent string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
	if err != nil {
		return entryContent
	}

	doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
		// Text() yields "" both when there is no <noscript> descendant and
		// when it is empty; in either case there is nothing to substitute.
		fallbackMarkup := paragraphImage.Find("noscript").Text()
		if strings.TrimSpace(fallbackMarkup) == "" {
			return
		}
		paragraphImage.ReplaceWithHtml(fallbackMarkup)
	})

	output, err := doc.Find("body").First().Html()
	if err != nil {
		// Best effort: prefer the untouched content over an empty entry.
		return entryContent
	}
	return output
}
func addYoutubeVideo(entryURL, entryContent string) string {
matches := youtubeRegex.FindStringSubmatch(entryURL)
......
......@@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
entryContent = replaceLineFeeds(entryContent)
case "convert_text_link", "convert_text_links":
entryContent = replaceTextLinks(entryContent)
case "fix_medium_images":
entryContent = fixMediumImages(entryURL, entryContent)
}
}
......
......@@ -4,7 +4,10 @@
package rewrite // import "miniflux.app/reader/rewrite"
import "testing"
import (
"strings"
"testing"
)
func TestReplaceTextLinks(t *testing.T) {
scenarios := map[string]string{
......@@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
}
}
// TestMediumImage checks that the "fix_medium_images" rewrite rule replaces a
// Medium "paragraph-image" figure with the <img> markup stored in its
// <noscript> fallback.
func TestMediumImage(t *testing.T) {
	content := `
<figure class="ht hu hv hw hx hy cy cz paragraph-image">
<div class="hz ia ib ic aj">
<div class="cy cz hs">
<div class="ii s ib ij">
<div class="ik il s">
<div class="id ie t u v if aj bk ig ih">
<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
</div>
<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
<noscript>
<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
</noscript>
</div>
</div>
</div>
</div>
</figure>
`
	expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`

	// The rewritten body keeps the whitespace that surrounded the figure, so
	// trim it before comparing against the bare <img> element.
	output := strings.TrimSpace(Rewriter("https://example.org/article", content, "fix_medium_images"))

	if expected != output {
		// Report both values, like the other rewrite tests in this file.
		t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
	}
}
......@@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
"invidio.us": "add_invidious_video",
"xkcd.com": "add_image_title",
"framatube.org": "nl2br,convert_text_link",
"medium.com": "fix_medium_images",
}
......@@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
// predefinedRules lists the built-in scraper rules, keyed by website domain.
// Each value is the CSS selector (or comma-separated list of selectors)
// registered for that domain: domain => CSS selectors.
// NOTE(review): the list is described as alphabetically sorted, but the order
// is only approximate (e.g. "zeit.de" precedes "zdnet.com" and
// "openingsource.org" comes last) — confirm before relying on the ordering.
var predefinedRules = map[string]string{
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
"cbc.ca": ".story-content",
"darkreading.com": "#article-main:not(header)",
"developpez.com": "div[itemprop=articleBody]",
"dilbert.com": "span.comic-title-name, img.img-comic",
"bbc.co.uk": "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
"cbc.ca": ".story-content",
"darkreading.com": "#article-main:not(header)",
"developpez.com": "div[itemprop=articleBody]",
"dilbert.com": "span.comic-title-name, img.img-comic",
"financialsamurai.com": "article",
"francetvinfo.fr": ".text",
"github.com": "article.entry-content",
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
"igen.fr": "section.corps",
"ing.dk": "section.body",
"lapresse.ca": ".amorce, .entry",
"lemonde.fr": "article",
"lepoint.fr": ".art-text",
"lesjoiesducode.fr": ".blog-post-content img",
"lesnumeriques.com": ".text",
"linux.com": "div.content, div[property]",
"medium.com": ".section-content",
"mac4ever.com": "div[itemprop=articleBody]",
"monwindows.com": ".blog-post-body",
"npr.org": "#storytext",
"oneindia.com": ".io-article-body",
"opensource.com": "div[property]",
"osnews.com": "div.newscontent1",
"phoronix.com": "div.content",
"pseudo-sciences.org": "#art_main",
"raywenderlich.com": "article",
"slate.fr": ".field-items",
"techcrunch.com": "div.article-entry",
"theoatmeal.com": "div#comic",
"theregister.co.uk": "#body",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "main figure, article",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"openingsource.org": "article.suxing-popup-gallery",
"francetvinfo.fr": ".text",
"github.com": "article.entry-content",
"heise.de": "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
"igen.fr": "section.corps",
"ing.dk": "section.body",
"lapresse.ca": ".amorce, .entry",
"lemonde.fr": "article",
"lepoint.fr": ".art-text",
"lesjoiesducode.fr": ".blog-post-content img",
"lesnumeriques.com": ".text",
"linux.com": "div.content, div[property]",
"mac4ever.com": "div[itemprop=articleBody]",
"monwindows.com": ".blog-post-body",
"npr.org": "#storytext",
"oneindia.com": ".io-article-body",
"opensource.com": "div[property]",
"osnews.com": "div.newscontent1",
"phoronix.com": "div.content",
"pseudo-sciences.org": "#art_main",
"raywenderlich.com": "article",
"slate.fr": ".field-items",
"techcrunch.com": "div.article-entry",
"theoatmeal.com": "div#comic",
"theregister.co.uk": "#body",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "main figure, article",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"openingsource.org": "article.suxing-popup-gallery",
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment