From bb0d2bf6759e8e5673b48c85d558b22b2415b17e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= <jks@iki.fi>
Date: Mon, 3 Jan 2022 17:47:10 +0200
Subject: [PATCH] Add Youtube videos in Quanta articles

Some articles (especially the recent year-in-review ones) include a YouTube
video. The server-side rendered articles do not include the YouTube iframe,
but they do contain a script that looks like this:

    <script type="text/javascript" data-reactid="6">
      window.__APOLLO_STATE__ = {
        ...
          youtube_id: "9uASADiYe_8",

Add a rewrite function that looks for JavaScript code assigning an
11-character double-quoted value to a field or variable called youtube_id,
and inserts the referenced YouTube videos at the beginning of the article.
This is slightly more general than needed for Quanta, in the hope that it
could be useful for similar sites.
---
 reader/rewrite/rewrite_functions.go | 26 ++++++++++++++++++++++----
 reader/rewrite/rewriter.go          |  2 ++
 reader/rewrite/rules.go             |  2 +-
 reader/scraper/rules.go             |  2 +-
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/reader/rewrite/rewrite_functions.go b/reader/rewrite/rewrite_functions.go
index 8bcc1091..4ea9ce4e 100644
--- a/reader/rewrite/rewrite_functions.go
+++ b/reader/rewrite/rewrite_functions.go
@@ -15,10 +15,11 @@ import (
 )
 
 var (
-	youtubeRegex  = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
-	invidioRegex  = regexp.MustCompile(`https?:\/\/(.*)\/watch\?v=(.*)`)
-	imgRegex      = regexp.MustCompile(`<img [^>]+>`)
-	textLinkRegex = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
+	youtubeRegex   = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	youtubeIdRegex = regexp.MustCompile(`youtube_id"?\s*[:=]\s*"([a-zA-Z0-9_-]{11})"`)
+	invidioRegex   = regexp.MustCompile(`https?:\/\/(.*)\/watch\?v=(.*)`)
+	imgRegex       = regexp.MustCompile(`<img [^>]+>`)
+	textLinkRegex  = regexp.MustCompile(`(?mi)(\bhttps?:\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])`)
 )
 
 func addImageTitle(entryURL, entryContent string) string {
@@ -219,6 +220,23 @@ func addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent string) string {
 	return entryContent
 }
 
+// addYoutubeVideoFromId returns entryContent preceded by one youtube-nocookie iframe per distinct ID matched by youtubeIdRegex.
+func addYoutubeVideoFromId(entryContent string) string {
+	var sb strings.Builder
+	seen := make(map[string]bool)
+	for _, match := range youtubeIdRegex.FindAllStringSubmatch(entryContent, -1) {
+		if seen[match[1]] {
+			continue // already embedded: a repeated youtube_id yields a single iframe
+		}
+		seen[match[1]] = true
+		sb.WriteString(`<iframe width="650" height="350" frameborder="0" src="https://www.youtube-nocookie.com/embed/`)
+		sb.WriteString(match[1])
+		sb.WriteString(`" allowfullscreen></iframe><br>`)
+	}
+	sb.WriteString(entryContent)
+	return sb.String()
+}
+
 func addInvidiousVideo(entryURL, entryContent string) string {
 	matches := invidioRegex.FindStringSubmatch(entryURL)
 	if len(matches) == 3 {
diff --git a/reader/rewrite/rewriter.go b/reader/rewrite/rewriter.go
index 27058b55..e806bf16 100644
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -74,6 +74,8 @@ func applyRule(entryURL, entryContent string, rule rule) string {
 		entryContent = addInvidiousVideo(entryURL, entryContent)
 	case "add_youtube_video_using_invidious_player":
 		entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent)
+	case "add_youtube_video_from_id":
+		entryContent = addYoutubeVideoFromId(entryContent)
 	case "add_pdf_download_link":
 		entryContent = addPDFLink(entryURL, entryContent)
 	case "nl2br":
diff --git a/reader/rewrite/rules.go b/reader/rewrite/rules.go
index fb615546..29fce0db 100644
--- a/reader/rewrite/rules.go
+++ b/reader/rewrite/rules.go
@@ -26,7 +26,7 @@ var predefinedRules = map[string]string{
 	"oglaf.com":              "add_image_title",
 	"optipess.com":           "add_image_title",
 	"peebleslab.com":         "add_image_title",
-	"quantamagazine.org":     `remove("h6:not(.byline,.post__title__kicker), #comments, .next-post__content, .footer__section, figure .outer--content")`,
+	"quantamagazine.org":     `add_youtube_video_from_id, remove("h6:not(.byline,.post__title__kicker), #comments, .next-post__content, .footer__section, figure .outer--content, script")`,
 	"sentfromthemoon.com":    "add_image_title",
 	"thedoghousediaries.com": "add_image_title",
 	"treelobsters.com":       "add_image_title",
diff --git a/reader/scraper/rules.go b/reader/scraper/rules.go
index 0352b6bd..7666ac78 100644
--- a/reader/scraper/rules.go
+++ b/reader/scraper/rules.go
@@ -33,7 +33,7 @@ var predefinedRules = map[string]string{
 	"osnews.com":           "div.newscontent1",
 	"phoronix.com":         "div.content",
 	"pseudo-sciences.org":  "#art_main",
-	"quantamagazine.org":   ".outer--content, figure",
+	"quantamagazine.org":   ".outer--content, figure, script",
 	"raywenderlich.com":    "article",
 	"slate.fr":             ".field-items",
 	"techcrunch.com":       "div.article-entry",
-- 
GitLab