From e38d906ff975e7517d727ed2cfef544f443dff31 Mon Sep 17 00:00:00 2001 From: jimbo1qaz Date: Sat, 17 Aug 2019 20:48:15 -0700 Subject: [PATCH] Fix timestamp links in Youtube video descriptions For some reason, in NewPipeExtractor, comments were loaded from JSON by YoutubeCommentsInfoItemExtractor as text, sent via CommentsInfoItem#getCommentText to NewPipe, where timestamps are converted to hyperlinks using Linkify: https://github.com/TeamNewPipe/NewPipe/pull/2168 On the other hand, video descriptions are handled in NewPipeExtractor by scraping the watch-page HTML. There, timestamp links were previously mangled (and now properly parsed), before being sent as HTML via YoutubeStreamExtractor#getDescription to NewPipe (where HTML gets converted to Spanned). The logic introduced in this commit is different from the above PR, since it operates in the extractor, and mutates the HTML DOM rather than identifying via regex. --- .../extractors/YoutubeStreamExtractor.java | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index d1da5376f..44e77b01e 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -30,6 +30,8 @@ import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /* * Created by Christian Schabesberger on 06.08.15. 
@@ -162,14 +164,54 @@ public class YoutubeStreamExtractor extends StreamExtractor { } } + // onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;" + // :00 is NOT recognized as a timestamp in description or comments. + // 0:00 is recognized in both description and comments. + // https://www.youtube.com/watch?v=4cccfDXu1vA + private final static Pattern DESCRIPTION_TIMESTAMP_ONCLICK_REGEX = Pattern.compile( + "seekTo\\(" + + "(?:(\\d+)\\*3600\\+)?" // hours? + + "(\\d+)\\*60\\+" // minutes + + "(\\d+)" // seconds + + "\\)"); + + @SafeVarargs + private static <T> T coalesce(T... args) { + for (T arg : args) { + if (arg != null) return arg; + } + throw new IllegalArgumentException("all arguments to coalesce() were null"); + } + private String parseHtmlAndGetFullLinks(String descriptionHtml) throws MalformedURLException, UnsupportedEncodingException, ParsingException { final Document description = Jsoup.parse(descriptionHtml, getUrl()); for(Element a : description.select("a")) { final String rawUrl = a.attr("abs:href"); final URL redirectLink = new URL(rawUrl); - final String queryString = redirectLink.getQuery(); - if(queryString != null) { + + final Matcher onClickTimestamp; + final String queryString; + if ((onClickTimestamp = DESCRIPTION_TIMESTAMP_ONCLICK_REGEX.matcher(a.attr("onclick"))) + .find()) { + a.removeAttr("onclick"); + + String hours = coalesce(onClickTimestamp.group(1), "0"); + String minutes = onClickTimestamp.group(2); + String seconds = onClickTimestamp.group(3); + + int timestamp = 0; + timestamp += Integer.parseInt(hours) * 3600; + timestamp += Integer.parseInt(minutes) * 60; + timestamp += Integer.parseInt(seconds); + + String setTimestamp = "&t=" + timestamp; + + // Even after clicking https://youtu.be/...?t=6, + // getUrl() is https://www.youtube.com/watch?v=..., never youtu.be, never &t=.
+ a.attr("href", getUrl() + setTimestamp); + + } else if((queryString = redirectLink.getQuery()) != null) { // if the query string is null we are not dealing with a redirect link, // so we don't need to override it. final String link =