Fix timestamp links in YouTube video descriptions
For some reason, in NewPipeExtractor, comments are loaded from JSON as plain text by YoutubeCommentsInfoItemExtractor and passed via CommentsInfoItem#getCommentText to NewPipe, where timestamps are converted to hyperlinks using Linkify (see https://github.com/TeamNewPipe/NewPipe/pull/2168). Video descriptions, on the other hand, are handled in NewPipeExtractor by scraping the watch-page HTML. There, timestamp links were previously mangled (and are now properly parsed) before being sent as HTML via YoutubeStreamExtractor#getDescription to NewPipe (where the HTML gets converted to a Spanned). The logic introduced in this commit differs from the above PR: it operates in the extractor and mutates the HTML DOM rather than identifying timestamps via regex.
This commit is contained in:
parent
430da57350
commit
e38d906ff9
|
@ -30,6 +30,8 @@ import java.io.UnsupportedEncodingException;
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Created by Christian Schabesberger on 06.08.15.
|
* Created by Christian Schabesberger on 06.08.15.
|
||||||
|
@ -162,14 +164,54 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;"
|
||||||
|
// :00 is NOT recognized as a timestamp in description or comments.
|
||||||
|
// 0:00 is recognized in both description and comments.
|
||||||
|
// https://www.youtube.com/watch?v=4cccfDXu1vA
|
||||||
|
private final static Pattern DESCRIPTION_TIMESTAMP_ONCLICK_REGEX = Pattern.compile(
|
||||||
|
"seekTo\\("
|
||||||
|
+ "(?:(\\d+)\\*3600\\+)?" // hours?
|
||||||
|
+ "(\\d+)\\*60\\+" // minutes
|
||||||
|
+ "(\\d+)" // seconds
|
||||||
|
+ "\\)");
|
||||||
|
|
||||||
|
@SafeVarargs
|
||||||
|
private static <T> T coalesce(T... args) {
|
||||||
|
for (T arg : args) {
|
||||||
|
if (arg != null) return arg;
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("all arguments to coalesce() were null");
|
||||||
|
}
|
||||||
|
|
||||||
private String parseHtmlAndGetFullLinks(String descriptionHtml)
|
private String parseHtmlAndGetFullLinks(String descriptionHtml)
|
||||||
throws MalformedURLException, UnsupportedEncodingException, ParsingException {
|
throws MalformedURLException, UnsupportedEncodingException, ParsingException {
|
||||||
final Document description = Jsoup.parse(descriptionHtml, getUrl());
|
final Document description = Jsoup.parse(descriptionHtml, getUrl());
|
||||||
for(Element a : description.select("a")) {
|
for(Element a : description.select("a")) {
|
||||||
final String rawUrl = a.attr("abs:href");
|
final String rawUrl = a.attr("abs:href");
|
||||||
final URL redirectLink = new URL(rawUrl);
|
final URL redirectLink = new URL(rawUrl);
|
||||||
final String queryString = redirectLink.getQuery();
|
|
||||||
if(queryString != null) {
|
final Matcher onClickTimestamp;
|
||||||
|
final String queryString;
|
||||||
|
if ((onClickTimestamp = DESCRIPTION_TIMESTAMP_ONCLICK_REGEX.matcher(a.attr("onclick")))
|
||||||
|
.find()) {
|
||||||
|
a.removeAttr("onclick");
|
||||||
|
|
||||||
|
String hours = coalesce(onClickTimestamp.group(1), "0");
|
||||||
|
String minutes = onClickTimestamp.group(2);
|
||||||
|
String seconds = onClickTimestamp.group(3);
|
||||||
|
|
||||||
|
int timestamp = 0;
|
||||||
|
timestamp += Integer.parseInt(hours) * 3600;
|
||||||
|
timestamp += Integer.parseInt(minutes) * 60;
|
||||||
|
timestamp += Integer.parseInt(seconds);
|
||||||
|
|
||||||
|
String setTimestamp = "&t=" + timestamp;
|
||||||
|
|
||||||
|
// Even after clicking https://youtu.be/...?t=6,
|
||||||
|
// getUrl() is https://www.youtube.com/watch?v=..., never youtu.be, never &t=.
|
||||||
|
a.attr("href", getUrl() + setTimestamp);
|
||||||
|
|
||||||
|
} else if((queryString = redirectLink.getQuery()) != null) {
|
||||||
// if the query string is null we are not dealing with a redirect link,
|
// if the query string is null we are not dealing with a redirect link,
|
||||||
// so we don't need to override it.
|
// so we don't need to override it.
|
||||||
final String link =
|
final String link =
|
||||||
|
|
Loading…
Reference in New Issue