Merge pull request #1032 from AudricV/yt_fix-comments-hashtags-links-extraction
[YouTube] Fix hashtags links extraction and escape HTML links
This commit is contained in:
commit
19e4b216c9
|
@ -822,7 +822,7 @@ public final class YoutubeParsingHelper {
|
|||
|
||||
try {
|
||||
final String url = "https://music.youtube.com/sw.js";
|
||||
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
|
||||
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
|
||||
final String response = getDownloader().get(url, headers).responseBody();
|
||||
musicClientVersion = getStringResultFromRegexArray(response,
|
||||
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
|
||||
|
@ -843,18 +843,11 @@ public final class YoutubeParsingHelper {
|
|||
}
|
||||
|
||||
@Nullable
|
||||
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
|
||||
throws ParsingException {
|
||||
if (navigationEndpoint.has("webCommandMetadata")) {
|
||||
// this case needs to be handled before the browseEndpoint,
|
||||
// e.g. for hashtags in comments
|
||||
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
|
||||
if (metadata.has("url")) {
|
||||
return "https://www.youtube.com" + metadata.getString("url");
|
||||
}
|
||||
}
|
||||
public static String getUrlFromNavigationEndpoint(
|
||||
@Nonnull final JsonObject navigationEndpoint) {
|
||||
if (navigationEndpoint.has("urlEndpoint")) {
|
||||
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
|
||||
String internUrl = navigationEndpoint.getObject("urlEndpoint")
|
||||
.getString("url");
|
||||
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
|
||||
// remove https://www.youtube.com part to fall in the next if block
|
||||
internUrl = internUrl.substring(23);
|
||||
|
@ -879,7 +872,9 @@ public final class YoutubeParsingHelper {
|
|||
|| internUrl.startsWith("/watch")) {
|
||||
return "https://www.youtube.com" + internUrl;
|
||||
}
|
||||
} else if (navigationEndpoint.has("browseEndpoint")) {
|
||||
}
|
||||
|
||||
if (navigationEndpoint.has("browseEndpoint")) {
|
||||
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
|
||||
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
|
||||
final String browseId = browseEndpoint.getString("browseId");
|
||||
|
@ -892,26 +887,39 @@ public final class YoutubeParsingHelper {
|
|||
if (!isNullOrEmpty(canonicalBaseUrl)) {
|
||||
return "https://www.youtube.com" + canonicalBaseUrl;
|
||||
}
|
||||
}
|
||||
|
||||
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
|
||||
+ browseEndpoint + "\")");
|
||||
} else if (navigationEndpoint.has("watchEndpoint")) {
|
||||
if (navigationEndpoint.has("watchEndpoint")) {
|
||||
final StringBuilder url = new StringBuilder();
|
||||
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
|
||||
.getObject("watchEndpoint").getString(VIDEO_ID));
|
||||
url.append("https://www.youtube.com/watch?v=")
|
||||
.append(navigationEndpoint.getObject("watchEndpoint")
|
||||
.getString(VIDEO_ID));
|
||||
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
|
||||
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
|
||||
.getString("playlistId"));
|
||||
}
|
||||
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
|
||||
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
|
||||
url.append("&t=")
|
||||
.append(navigationEndpoint.getObject("watchEndpoint")
|
||||
.getInt("startTimeSeconds"));
|
||||
}
|
||||
return url.toString();
|
||||
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
|
||||
return "https://www.youtube.com/playlist?list="
|
||||
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
|
||||
}
|
||||
|
||||
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
|
||||
return "https://www.youtube.com/playlist?list="
|
||||
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
|
||||
.getString("playlistId");
|
||||
}
|
||||
|
||||
if (navigationEndpoint.has("commandMetadata")) {
|
||||
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
|
||||
.getObject("webCommandMetadata");
|
||||
if (metadata.has("url")) {
|
||||
return "https://www.youtube.com" + metadata.getString("url");
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -924,8 +932,7 @@ public final class YoutubeParsingHelper {
|
|||
* @return text in the JSON object or {@code null}
|
||||
*/
|
||||
@Nullable
|
||||
public static String getTextFromObject(final JsonObject textObject, final boolean html)
|
||||
throws ParsingException {
|
||||
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
|
||||
if (isNullOrEmpty(textObject)) {
|
||||
return null;
|
||||
}
|
||||
|
@ -944,12 +951,12 @@ public final class YoutubeParsingHelper {
|
|||
String text = run.getString("text");
|
||||
|
||||
if (html) {
|
||||
text = Entities.escape(text);
|
||||
if (run.has("navigationEndpoint")) {
|
||||
final String url = getUrlFromNavigationEndpoint(run
|
||||
.getObject("navigationEndpoint"));
|
||||
final String url = getUrlFromNavigationEndpoint(
|
||||
run.getObject("navigationEndpoint"));
|
||||
if (!isNullOrEmpty(url)) {
|
||||
text = "<a href=\"" + url + "\">" + text + "</a>";
|
||||
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
|
||||
+ "</a>";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1015,11 +1022,12 @@ public final class YoutubeParsingHelper {
|
|||
}
|
||||
|
||||
final String content = attributedDescription.getString("content");
|
||||
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
|
||||
if (content == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
|
||||
|
||||
final StringBuilder textBuilder = new StringBuilder();
|
||||
int textStart = 0;
|
||||
|
||||
|
@ -1038,12 +1046,7 @@ public final class YoutubeParsingHelper {
|
|||
continue;
|
||||
}
|
||||
|
||||
final String url;
|
||||
try {
|
||||
url = getUrlFromNavigationEndpoint(navigationEndpoint);
|
||||
} catch (final ParsingException e) {
|
||||
continue;
|
||||
}
|
||||
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
|
||||
|
||||
if (url == null) {
|
||||
continue;
|
||||
|
@ -1062,9 +1065,9 @@ public final class YoutubeParsingHelper {
|
|||
.replaceFirst("^[/•] *", "");
|
||||
|
||||
textBuilder.append("<a href=\"")
|
||||
.append(url)
|
||||
.append(Entities.escape(url))
|
||||
.append("\">")
|
||||
.append(linkText)
|
||||
.append(Entities.escape(linkText))
|
||||
.append("</a>");
|
||||
|
||||
textStart = startIndex + length;
|
||||
|
@ -1081,13 +1084,12 @@ public final class YoutubeParsingHelper {
|
|||
}
|
||||
|
||||
@Nullable
|
||||
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
|
||||
public static String getTextFromObject(final JsonObject textObject) {
|
||||
return getTextFromObject(textObject, false);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {
|
||||
|
||||
public static String getUrlFromObject(final JsonObject textObject) {
|
||||
if (isNullOrEmpty(textObject)) {
|
||||
return null;
|
||||
}
|
||||
|
@ -1108,8 +1110,7 @@ public final class YoutubeParsingHelper {
|
|||
}
|
||||
|
||||
@Nullable
|
||||
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
|
||||
throws ParsingException {
|
||||
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
|
||||
if (jsonObject.isString(theKey)) {
|
||||
return jsonObject.getString(theKey);
|
||||
} else {
|
||||
|
|
|
@ -45,13 +45,10 @@ public class YoutubeChannelInfoItemExtractor implements ChannelInfoItemExtractor
|
|||
this.channelInfoItem = channelInfoItem;
|
||||
|
||||
boolean wHandle = false;
|
||||
try {
|
||||
final String subscriberCountText = getTextFromObject(
|
||||
channelInfoItem.getObject("subscriberCountText"));
|
||||
if (subscriberCountText != null) {
|
||||
wHandle = subscriberCountText.startsWith("@");
|
||||
}
|
||||
} catch (final ParsingException ignored) {
|
||||
final String subscriberCountText = getTextFromObject(
|
||||
channelInfoItem.getObject("subscriberCountText"));
|
||||
if (subscriberCountText != null) {
|
||||
wHandle = subscriberCountText.startsWith("@");
|
||||
}
|
||||
this.withHandle = wHandle;
|
||||
}
|
||||
|
|
|
@ -168,11 +168,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
|||
title = playerResponse.getObject("videoDetails").getString("title");
|
||||
|
||||
if (isNullOrEmpty(title)) {
|
||||
try {
|
||||
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
|
||||
} catch (final ParsingException ignored) {
|
||||
// Age-restricted videos cause a ParsingException here
|
||||
}
|
||||
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
|
||||
|
||||
if (isNullOrEmpty(title)) {
|
||||
throw new ParsingException("Could not get name");
|
||||
|
@ -285,21 +281,17 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
|||
public Description getDescription() throws ParsingException {
|
||||
assertPageFetched();
|
||||
// Description with more info on links
|
||||
try {
|
||||
final String description = getTextFromObject(
|
||||
getVideoSecondaryInfoRenderer().getObject("description"),
|
||||
true);
|
||||
if (!isNullOrEmpty(description)) {
|
||||
return new Description(description, Description.HTML);
|
||||
}
|
||||
final String videoSecondaryInfoRendererDescription = getTextFromObject(
|
||||
getVideoSecondaryInfoRenderer().getObject("description"),
|
||||
true);
|
||||
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
|
||||
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
|
||||
}
|
||||
|
||||
final String attributedDescription = getAttributedDescription(
|
||||
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
|
||||
if (!isNullOrEmpty(attributedDescription)) {
|
||||
return new Description(attributedDescription, Description.HTML);
|
||||
}
|
||||
} catch (final ParsingException ignored) {
|
||||
// Age-restricted videos cause a ParsingException here
|
||||
final String attributedDescription = getAttributedDescription(
|
||||
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
|
||||
if (!isNullOrEmpty(attributedDescription)) {
|
||||
return new Description(attributedDescription, Description.HTML);
|
||||
}
|
||||
|
||||
String description = playerResponse.getObject("videoDetails")
|
||||
|
@ -400,14 +392,8 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
|||
|
||||
@Override
|
||||
public long getViewCount() throws ParsingException {
|
||||
String views = null;
|
||||
|
||||
try {
|
||||
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
|
||||
.getObject("videoViewCountRenderer").getObject("viewCount"));
|
||||
} catch (final ParsingException ignored) {
|
||||
// Age-restricted videos cause a ParsingException here
|
||||
}
|
||||
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
|
||||
.getObject("videoViewCountRenderer").getObject("viewCount"));
|
||||
|
||||
if (isNullOrEmpty(views)) {
|
||||
views = playerResponse.getObject("videoDetails").getString("viewCount");
|
||||
|
@ -795,7 +781,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
|||
return getTextFromObject(playerResponse.getObject("playabilityStatus")
|
||||
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
|
||||
.getObject("reason"));
|
||||
} catch (final ParsingException | NullPointerException e) {
|
||||
} catch (final NullPointerException e) {
|
||||
return null; // No error message
|
||||
}
|
||||
}
|
||||
|
|
|
@ -183,10 +183,10 @@ public class YoutubeStreamExtractorDefaultTest {
|
|||
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
|
||||
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
|
||||
@Override public List<String> expectedDescriptionContains() {
|
||||
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
|
||||
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
|
||||
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
|
||||
}
|
||||
@Override public long expectedLength() { return 434; }
|
||||
@Override public long expectedViewCountAtLeast() { return 21229200; }
|
||||
|
|
|
@ -3,10 +3,10 @@
|
|||
"httpMethod": "GET",
|
||||
"url": "https://www.youtube.com/sw.js",
|
||||
"headers": {
|
||||
"Origin": [
|
||||
"Referer": [
|
||||
"https://www.youtube.com"
|
||||
],
|
||||
"Referer": [
|
||||
"Origin": [
|
||||
"https://www.youtube.com"
|
||||
],
|
||||
"Accept-Language": [
|
||||
|
@ -29,7 +29,7 @@
|
|||
"https://www.youtube.com"
|
||||
],
|
||||
"alt-svc": [
|
||||
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
|
||||
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
|
||||
],
|
||||
"cache-control": [
|
||||
"private, max-age\u003d0"
|
||||
|
@ -41,10 +41,10 @@
|
|||
"same-origin; report-to\u003d\"youtube_main\""
|
||||
],
|
||||
"date": [
|
||||
"Mon, 28 Nov 2022 20:27:36 GMT"
|
||||
"Sun, 26 Feb 2023 17:48:54 GMT"
|
||||
],
|
||||
"expires": [
|
||||
"Mon, 28 Nov 2022 20:27:36 GMT"
|
||||
"Sun, 26 Feb 2023 17:48:54 GMT"
|
||||
],
|
||||
"p3p": [
|
||||
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
|
||||
|
@ -59,9 +59,9 @@
|
|||
"ESF"
|
||||
],
|
||||
"set-cookie": [
|
||||
"YSC\u003ddaTQ98V-voQ; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dTue, 03-Mar-2020 20:27:36 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"CONSENT\u003dPENDING+452; expires\u003dWed, 27-Nov-2024 20:27:36 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
|
||||
"YSC\u003dYJXWRWCYVkE; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 17:48:54 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"CONSENT\u003dPENDING+668; expires\u003dTue, 25-Feb-2025 17:48:54 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
|
||||
],
|
||||
"strict-transport-security": [
|
||||
"max-age\u003d31536000"
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -29,7 +29,7 @@
|
|||
"https://www.youtube.com"
|
||||
],
|
||||
"alt-svc": [
|
||||
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
|
||||
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
|
||||
],
|
||||
"cache-control": [
|
||||
"private, max-age\u003d0"
|
||||
|
@ -41,10 +41,10 @@
|
|||
"same-origin; report-to\u003d\"youtube_main\""
|
||||
],
|
||||
"date": [
|
||||
"Tue, 22 Nov 2022 10:40:20 GMT"
|
||||
"Sun, 26 Feb 2023 10:57:08 GMT"
|
||||
],
|
||||
"expires": [
|
||||
"Tue, 22 Nov 2022 10:40:20 GMT"
|
||||
"Sun, 26 Feb 2023 10:57:08 GMT"
|
||||
],
|
||||
"p3p": [
|
||||
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
|
||||
|
@ -59,9 +59,9 @@
|
|||
"ESF"
|
||||
],
|
||||
"set-cookie": [
|
||||
"YSC\u003ddIhq5C9znKU; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dWed, 26-Feb-2020 10:40:20 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"CONSENT\u003dPENDING+600; expires\u003dThu, 21-Nov-2024 10:40:19 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
|
||||
"YSC\u003dL2wyk8wP8TA; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 10:57:08 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
|
||||
"CONSENT\u003dPENDING+005; expires\u003dTue, 25-Feb-2025 10:57:08 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
|
||||
],
|
||||
"strict-transport-security": [
|
||||
"max-age\u003d31536000"
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue