[YouTube] Fix hashtag link extraction and escape text in attributed descriptions + HTML links
The webCommandMetadata object is contained inside a commandMetadata object, so it is not accessible from the root of the navigationEndpoint object. The corresponding statement has been moved to the bottom of the endpoint-specific parsing, because the webCommandMetadata object is present almost everywhere; otherwise, the URLs of some endpoints would have been changed, such as uploader URLs (from channel IDs to handles). As getUrlFromNavigationEndpoint no longer throws a ParsingException, and consequently neither do getTextFromObject, getUrlFromObject and getTextAtKey, the methods which were catching ParsingExceptions thrown by these methods had to be updated. URLs obtained in the HTML version of getTextFromObject are now escaped properly to provide valid HTML to clients. This has also been done for attributed descriptions, along with the description text for this type of description. As YouTube descriptions are in HTML format (except for the fallback on the JSON player response, which is plain text and only happens when there is no visual metadata or after a breaking change), all URLs returned are escaped, so tests which check for the presence of URLs containing escaped characters had to be updated (this was only the case for YoutubeStreamExtractorDefaultTest.DescriptionTestUnboxing).
This commit is contained in:
parent
99ab9777ad
commit
1556adbb2d
|
@ -822,7 +822,7 @@ public final class YoutubeParsingHelper {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
final String url = "https://music.youtube.com/sw.js";
|
final String url = "https://music.youtube.com/sw.js";
|
||||||
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
|
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
|
||||||
final String response = getDownloader().get(url, headers).responseBody();
|
final String response = getDownloader().get(url, headers).responseBody();
|
||||||
musicClientVersion = getStringResultFromRegexArray(response,
|
musicClientVersion = getStringResultFromRegexArray(response,
|
||||||
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
|
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
|
||||||
|
@ -843,18 +843,11 @@ public final class YoutubeParsingHelper {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
|
public static String getUrlFromNavigationEndpoint(
|
||||||
throws ParsingException {
|
@Nonnull final JsonObject navigationEndpoint) {
|
||||||
if (navigationEndpoint.has("webCommandMetadata")) {
|
|
||||||
// this case needs to be handled before the browseEndpoint,
|
|
||||||
// e.g. for hashtags in comments
|
|
||||||
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
|
|
||||||
if (metadata.has("url")) {
|
|
||||||
return "https://www.youtube.com" + metadata.getString("url");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (navigationEndpoint.has("urlEndpoint")) {
|
if (navigationEndpoint.has("urlEndpoint")) {
|
||||||
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
|
String internUrl = navigationEndpoint.getObject("urlEndpoint")
|
||||||
|
.getString("url");
|
||||||
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
|
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
|
||||||
// remove https://www.youtube.com part to fall in the next if block
|
// remove https://www.youtube.com part to fall in the next if block
|
||||||
internUrl = internUrl.substring(23);
|
internUrl = internUrl.substring(23);
|
||||||
|
@ -879,7 +872,9 @@ public final class YoutubeParsingHelper {
|
||||||
|| internUrl.startsWith("/watch")) {
|
|| internUrl.startsWith("/watch")) {
|
||||||
return "https://www.youtube.com" + internUrl;
|
return "https://www.youtube.com" + internUrl;
|
||||||
}
|
}
|
||||||
} else if (navigationEndpoint.has("browseEndpoint")) {
|
}
|
||||||
|
|
||||||
|
if (navigationEndpoint.has("browseEndpoint")) {
|
||||||
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
|
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
|
||||||
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
|
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
|
||||||
final String browseId = browseEndpoint.getString("browseId");
|
final String browseId = browseEndpoint.getString("browseId");
|
||||||
|
@ -892,26 +887,39 @@ public final class YoutubeParsingHelper {
|
||||||
if (!isNullOrEmpty(canonicalBaseUrl)) {
|
if (!isNullOrEmpty(canonicalBaseUrl)) {
|
||||||
return "https://www.youtube.com" + canonicalBaseUrl;
|
return "https://www.youtube.com" + canonicalBaseUrl;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
|
if (navigationEndpoint.has("watchEndpoint")) {
|
||||||
+ browseEndpoint + "\")");
|
|
||||||
} else if (navigationEndpoint.has("watchEndpoint")) {
|
|
||||||
final StringBuilder url = new StringBuilder();
|
final StringBuilder url = new StringBuilder();
|
||||||
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
|
url.append("https://www.youtube.com/watch?v=")
|
||||||
.getObject("watchEndpoint").getString(VIDEO_ID));
|
.append(navigationEndpoint.getObject("watchEndpoint")
|
||||||
|
.getString(VIDEO_ID));
|
||||||
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
|
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
|
||||||
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
|
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
|
||||||
.getString("playlistId"));
|
.getString("playlistId"));
|
||||||
}
|
}
|
||||||
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
|
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
|
||||||
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
|
url.append("&t=")
|
||||||
|
.append(navigationEndpoint.getObject("watchEndpoint")
|
||||||
.getInt("startTimeSeconds"));
|
.getInt("startTimeSeconds"));
|
||||||
}
|
}
|
||||||
return url.toString();
|
return url.toString();
|
||||||
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
|
|
||||||
return "https://www.youtube.com/playlist?list="
|
|
||||||
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
|
||||||
|
return "https://www.youtube.com/playlist?list="
|
||||||
|
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
|
||||||
|
.getString("playlistId");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (navigationEndpoint.has("commandMetadata")) {
|
||||||
|
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
|
||||||
|
.getObject("webCommandMetadata");
|
||||||
|
if (metadata.has("url")) {
|
||||||
|
return "https://www.youtube.com" + metadata.getString("url");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -924,8 +932,7 @@ public final class YoutubeParsingHelper {
|
||||||
* @return text in the JSON object or {@code null}
|
* @return text in the JSON object or {@code null}
|
||||||
*/
|
*/
|
||||||
@Nullable
|
@Nullable
|
||||||
public static String getTextFromObject(final JsonObject textObject, final boolean html)
|
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
|
||||||
throws ParsingException {
|
|
||||||
if (isNullOrEmpty(textObject)) {
|
if (isNullOrEmpty(textObject)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -944,12 +951,12 @@ public final class YoutubeParsingHelper {
|
||||||
String text = run.getString("text");
|
String text = run.getString("text");
|
||||||
|
|
||||||
if (html) {
|
if (html) {
|
||||||
text = Entities.escape(text);
|
|
||||||
if (run.has("navigationEndpoint")) {
|
if (run.has("navigationEndpoint")) {
|
||||||
final String url = getUrlFromNavigationEndpoint(run
|
final String url = getUrlFromNavigationEndpoint(
|
||||||
.getObject("navigationEndpoint"));
|
run.getObject("navigationEndpoint"));
|
||||||
if (!isNullOrEmpty(url)) {
|
if (!isNullOrEmpty(url)) {
|
||||||
text = "<a href=\"" + url + "\">" + text + "</a>";
|
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
|
||||||
|
+ "</a>";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1015,11 +1022,12 @@ public final class YoutubeParsingHelper {
|
||||||
}
|
}
|
||||||
|
|
||||||
final String content = attributedDescription.getString("content");
|
final String content = attributedDescription.getString("content");
|
||||||
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
|
|
||||||
if (content == null) {
|
if (content == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
|
||||||
|
|
||||||
final StringBuilder textBuilder = new StringBuilder();
|
final StringBuilder textBuilder = new StringBuilder();
|
||||||
int textStart = 0;
|
int textStart = 0;
|
||||||
|
|
||||||
|
@ -1038,12 +1046,7 @@ public final class YoutubeParsingHelper {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
final String url;
|
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
|
||||||
try {
|
|
||||||
url = getUrlFromNavigationEndpoint(navigationEndpoint);
|
|
||||||
} catch (final ParsingException e) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (url == null) {
|
if (url == null) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -1062,9 +1065,9 @@ public final class YoutubeParsingHelper {
|
||||||
.replaceFirst("^[/•] *", "");
|
.replaceFirst("^[/•] *", "");
|
||||||
|
|
||||||
textBuilder.append("<a href=\"")
|
textBuilder.append("<a href=\"")
|
||||||
.append(url)
|
.append(Entities.escape(url))
|
||||||
.append("\">")
|
.append("\">")
|
||||||
.append(linkText)
|
.append(Entities.escape(linkText))
|
||||||
.append("</a>");
|
.append("</a>");
|
||||||
|
|
||||||
textStart = startIndex + length;
|
textStart = startIndex + length;
|
||||||
|
@ -1081,13 +1084,12 @@ public final class YoutubeParsingHelper {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
|
public static String getTextFromObject(final JsonObject textObject) {
|
||||||
return getTextFromObject(textObject, false);
|
return getTextFromObject(textObject, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {
|
public static String getUrlFromObject(final JsonObject textObject) {
|
||||||
|
|
||||||
if (isNullOrEmpty(textObject)) {
|
if (isNullOrEmpty(textObject)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
@ -1108,8 +1110,7 @@ public final class YoutubeParsingHelper {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
|
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
|
||||||
throws ParsingException {
|
|
||||||
if (jsonObject.isString(theKey)) {
|
if (jsonObject.isString(theKey)) {
|
||||||
return jsonObject.getString(theKey);
|
return jsonObject.getString(theKey);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -45,13 +45,10 @@ public class YoutubeChannelInfoItemExtractor implements ChannelInfoItemExtractor
|
||||||
this.channelInfoItem = channelInfoItem;
|
this.channelInfoItem = channelInfoItem;
|
||||||
|
|
||||||
boolean wHandle = false;
|
boolean wHandle = false;
|
||||||
try {
|
final String subscriberCountText = getTextFromObject(
|
||||||
final String subscriberCountText = getTextFromObject(
|
channelInfoItem.getObject("subscriberCountText"));
|
||||||
channelInfoItem.getObject("subscriberCountText"));
|
if (subscriberCountText != null) {
|
||||||
if (subscriberCountText != null) {
|
wHandle = subscriberCountText.startsWith("@");
|
||||||
wHandle = subscriberCountText.startsWith("@");
|
|
||||||
}
|
|
||||||
} catch (final ParsingException ignored) {
|
|
||||||
}
|
}
|
||||||
this.withHandle = wHandle;
|
this.withHandle = wHandle;
|
||||||
}
|
}
|
||||||
|
|
|
@ -168,11 +168,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
title = playerResponse.getObject("videoDetails").getString("title");
|
title = playerResponse.getObject("videoDetails").getString("title");
|
||||||
|
|
||||||
if (isNullOrEmpty(title)) {
|
if (isNullOrEmpty(title)) {
|
||||||
try {
|
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
|
||||||
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
|
|
||||||
} catch (final ParsingException ignored) {
|
|
||||||
// Age-restricted videos cause a ParsingException here
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isNullOrEmpty(title)) {
|
if (isNullOrEmpty(title)) {
|
||||||
throw new ParsingException("Could not get name");
|
throw new ParsingException("Could not get name");
|
||||||
|
@ -285,21 +281,17 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
public Description getDescription() throws ParsingException {
|
public Description getDescription() throws ParsingException {
|
||||||
assertPageFetched();
|
assertPageFetched();
|
||||||
// Description with more info on links
|
// Description with more info on links
|
||||||
try {
|
final String videoSecondaryInfoRendererDescription = getTextFromObject(
|
||||||
final String description = getTextFromObject(
|
getVideoSecondaryInfoRenderer().getObject("description"),
|
||||||
getVideoSecondaryInfoRenderer().getObject("description"),
|
true);
|
||||||
true);
|
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
|
||||||
if (!isNullOrEmpty(description)) {
|
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
|
||||||
return new Description(description, Description.HTML);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
final String attributedDescription = getAttributedDescription(
|
final String attributedDescription = getAttributedDescription(
|
||||||
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
|
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
|
||||||
if (!isNullOrEmpty(attributedDescription)) {
|
if (!isNullOrEmpty(attributedDescription)) {
|
||||||
return new Description(attributedDescription, Description.HTML);
|
return new Description(attributedDescription, Description.HTML);
|
||||||
}
|
|
||||||
} catch (final ParsingException ignored) {
|
|
||||||
// Age-restricted videos cause a ParsingException here
|
|
||||||
}
|
}
|
||||||
|
|
||||||
String description = playerResponse.getObject("videoDetails")
|
String description = playerResponse.getObject("videoDetails")
|
||||||
|
@ -400,14 +392,8 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getViewCount() throws ParsingException {
|
public long getViewCount() throws ParsingException {
|
||||||
String views = null;
|
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
|
||||||
|
.getObject("videoViewCountRenderer").getObject("viewCount"));
|
||||||
try {
|
|
||||||
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
|
|
||||||
.getObject("videoViewCountRenderer").getObject("viewCount"));
|
|
||||||
} catch (final ParsingException ignored) {
|
|
||||||
// Age-restricted videos cause a ParsingException here
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isNullOrEmpty(views)) {
|
if (isNullOrEmpty(views)) {
|
||||||
views = playerResponse.getObject("videoDetails").getString("viewCount");
|
views = playerResponse.getObject("videoDetails").getString("viewCount");
|
||||||
|
@ -795,7 +781,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
return getTextFromObject(playerResponse.getObject("playabilityStatus")
|
return getTextFromObject(playerResponse.getObject("playabilityStatus")
|
||||||
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
|
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
|
||||||
.getObject("reason"));
|
.getObject("reason"));
|
||||||
} catch (final ParsingException | NullPointerException e) {
|
} catch (final NullPointerException e) {
|
||||||
return null; // No error message
|
return null; // No error message
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -183,10 +183,10 @@ public class YoutubeStreamExtractorDefaultTest {
|
||||||
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
|
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
|
||||||
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
|
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
|
||||||
@Override public List<String> expectedDescriptionContains() {
|
@Override public List<String> expectedDescriptionContains() {
|
||||||
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||||
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||||
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||||
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
|
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
|
||||||
}
|
}
|
||||||
@Override public long expectedLength() { return 434; }
|
@Override public long expectedLength() { return 434; }
|
||||||
@Override public long expectedViewCountAtLeast() { return 21229200; }
|
@Override public long expectedViewCountAtLeast() { return 21229200; }
|
||||||
|
|
Loading…
Reference in New Issue