diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/comments/CommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/comments/CommentsExtractor.java index 4795e9897..ac4792fc0 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/comments/CommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/comments/CommentsExtractor.java @@ -2,6 +2,7 @@ package org.schabi.newpipe.extractor.comments; import org.schabi.newpipe.extractor.ListExtractor; import org.schabi.newpipe.extractor.StreamingService; +import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; @@ -17,7 +18,7 @@ public abstract class CommentsExtractor extends ListExtractor * @apiNote Warning: This method is experimental and may get removed in a future release. * @return true if the comments are disabled otherwise false (default) */ - public boolean isCommentsDisabled() { + public boolean isCommentsDisabled() throws ExtractionException { return false; } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java index 959397c4f..018e3efb6 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java @@ -1,8 +1,18 @@ package org.schabi.newpipe.extractor.services.youtube.extractors; -import com.grack.nanojson.JsonArray; -import com.grack.nanojson.JsonObject; -import com.grack.nanojson.JsonParser; +import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse; +import static 
org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.prepareDesktopJsonBuilder; +import static org.schabi.newpipe.extractor.utils.Utils.UTF_8; +import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Optional; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + import org.schabi.newpipe.extractor.Page; import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.comments.CommentsExtractor; @@ -10,38 +20,19 @@ import org.schabi.newpipe.extractor.comments.CommentsInfoItem; import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor; import org.schabi.newpipe.extractor.comments.CommentsInfoItemsCollector; import org.schabi.newpipe.extractor.downloader.Downloader; -import org.schabi.newpipe.extractor.downloader.Response; import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ParsingException; -import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; -import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; +import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.utils.JsonUtils; -import org.schabi.newpipe.extractor.utils.Parser; -import javax.annotation.Nonnull; -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.regex.Pattern; - -import static java.util.Collections.singletonList; -import static org.schabi.newpipe.extractor.utils.Utils.UTF_8; -import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty; +import com.grack.nanojson.JsonArray; +import com.grack.nanojson.JsonObject; 
+import com.grack.nanojson.JsonWriter; public class YoutubeCommentsExtractor extends CommentsExtractor { - // using the mobile site for comments because it loads faster and uses get requests instead of post - private static final String USER_AGENT = "Mozilla/5.0 (Android 9; Mobile; rv:78.0) Gecko/20100101 Firefox/78.0"; - private static final Pattern YT_CLIENT_NAME_PATTERN = Pattern.compile("INNERTUBE_CONTEXT_CLIENT_NAME\\\":(.*?)[,}]"); - private String ytClientVersion; - private String ytClientName; - private String responseBody; + private JsonObject nextResponse; /** * Caching mechanism and holder of the commentsDisabled value. @@ -52,6 +43,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { * If the method or another one that is depending on disabled comments * is now called again, the method execution can avoid unnecessary calls */ + @SuppressWarnings("OptionalUsedAsFieldOrParameterType") private Optional optCommentsDisabled = Optional.empty(); public YoutubeCommentsExtractor( @@ -60,6 +52,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { super(service, uiHandler); } + @Nonnull @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { @@ -81,163 +74,177 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { /** * Finds the initial comments token and initializes commentsDisabled. 
+ * * @return the continuation token or null if none was found */ - private String findInitialCommentsToken() { - final String continuationStartPattern = "continuation\":\""; + @Nullable + private String findInitialCommentsToken() throws ExtractionException { - String commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}"); - if (commentsTokenInside == null || !commentsTokenInside.contains(continuationStartPattern)) { - commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}"); + final JsonArray jArray = JsonUtils.getArray(nextResponse, + "contents.twoColumnWatchNextResults.results.results.contents"); + + final Optional itemSectionRenderer = jArray.stream().filter(o -> { + JsonObject jObj = (JsonObject) o; + + if (jObj.has("itemSectionRenderer")) { + try { + return JsonUtils.getString(jObj, "itemSectionRenderer.targetId") + .equals("comments-section"); + } catch (final ParsingException ignored) { + } + } + + return false; + }).findFirst(); + + final String token; + + if (itemSectionRenderer.isPresent()) { + token = JsonUtils.getString(((JsonObject) itemSectionRenderer.get()) + .getObject("itemSectionRenderer").getArray("contents").getObject(0), + "continuationItemRenderer.continuationEndpoint.continuationCommand.token"); + } else { + token = null; } - // If no continuation token is found the comments are disabled - if (commentsTokenInside == null || !commentsTokenInside.contains(continuationStartPattern)) { + if (token == null) { optCommentsDisabled = Optional.of(true); return null; } - // If a continuation token is found there are >= 0 comments - final String commentsToken = findValue(commentsTokenInside, continuationStartPattern, "\""); - optCommentsDisabled = Optional.of(false); - return commentsToken; + return token; } + @Nonnull private InfoItemsPage getInfoItemsPageForDisabledComments() { return new InfoItemsPage<>(Collections.emptyList(), null, Collections.emptyList()); } - private Page getNextPage(final JsonObject 
ajaxJson) throws ParsingException { - final JsonArray arr; + @Nullable + private Page getNextPage(@Nonnull final JsonObject ajaxJson) throws ExtractionException { + final JsonArray jsonArray; + final JsonArray onResponseReceivedEndpoints = ajaxJson.getArray( + "onResponseReceivedEndpoints"); + final JsonObject endpoint = onResponseReceivedEndpoints.getObject( + onResponseReceivedEndpoints.size() - 1); + try { - arr = JsonUtils.getArray(ajaxJson, "response.continuationContents.commentSectionContinuation.continuations"); + jsonArray = endpoint.getObject("reloadContinuationItemsCommand", endpoint.getObject( + "appendContinuationItemsAction")).getArray("continuationItems"); } catch (final Exception e) { return null; } - if (arr.isEmpty()) { + if (jsonArray.isEmpty()) { return null; } + final String continuation; try { - continuation = JsonUtils.getString(arr.getObject(0), "nextContinuationData.continuation"); + continuation = JsonUtils.getString(jsonArray.getObject(jsonArray.size() - 1), + "continuationItemRenderer.continuationEndpoint.continuationCommand.token"); } catch (final Exception e) { return null; } return getNextPage(continuation); } + @Nonnull private Page getNextPage(final String continuation) throws ParsingException { - final Map params = new HashMap<>(); - params.put("action_get_comments", "1"); - params.put("pbj", "1"); - params.put("ctoken", continuation); - try { - return new Page("https://m.youtube.com/watch_comment?" 
+ getDataString(params)); - } catch (final UnsupportedEncodingException e) { - throw new ParsingException("Could not get next page url", e); - } + return new Page(getUrl(), continuation); // URL is ignored tho } @Override - public InfoItemsPage getPage(final Page page) throws IOException, ExtractionException { + public InfoItemsPage getPage(final Page page) + throws IOException, ExtractionException { if (optCommentsDisabled.orElse(false)) { return getInfoItemsPageForDisabledComments(); } - if (page == null || isNullOrEmpty(page.getUrl())) { - throw new IllegalArgumentException("Page doesn't contain an URL"); + if (page == null || isNullOrEmpty(page.getId())) { + throw new IllegalArgumentException("Page doesn't have the continuation."); } - final String ajaxResponse = makeAjaxRequest(page.getUrl()); - final JsonObject ajaxJson; - try { - ajaxJson = JsonParser.array().from(ajaxResponse).getObject(1); - } catch (final Exception e) { - throw new ParsingException("Could not parse json data for comments", e); - } - final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(getServiceId()); + final Localization localization = getExtractorLocalization(); + final byte[] body = JsonWriter.string(prepareDesktopJsonBuilder(localization, + getExtractorContentCountry()) + .value("continuation", page.getId()) + .done()) + .getBytes(UTF_8); + + final JsonObject ajaxJson = getJsonPostResponse("next", body, localization); + + final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector( + getServiceId()); collectCommentsFrom(collector, ajaxJson); return new InfoItemsPage<>(collector, getNextPage(ajaxJson)); } - private void collectCommentsFrom(final CommentsInfoItemsCollector collector, final JsonObject ajaxJson) throws ParsingException { - final JsonArray contents; - try { - contents = JsonUtils.getArray(ajaxJson, "response.continuationContents.commentSectionContinuation.items"); - } catch (final Exception e) { - //no comments + private void 
collectCommentsFrom(final CommentsInfoItemsCollector collector, + @Nonnull final JsonObject ajaxJson) throws ParsingException { + + final JsonArray onResponseReceivedEndpoints = ajaxJson.getArray( + "onResponseReceivedEndpoints"); + final JsonObject commentsEndpoint = onResponseReceivedEndpoints.getObject( + onResponseReceivedEndpoints.size() - 1); + + final String path; + + if (commentsEndpoint.has("reloadContinuationItemsCommand")) { + path = "reloadContinuationItemsCommand.continuationItems"; + } else if (commentsEndpoint.has("appendContinuationItemsAction")) { + path = "appendContinuationItemsAction.continuationItems"; + } else { + // No comments return; } + + final JsonArray contents; + try { + contents = (JsonArray) JsonUtils.getArray(commentsEndpoint, path).clone(); + } catch (final Exception e) { + // No comments + return; + } + + final int index = contents.size() - 1; + if (contents.getObject(index).has("continuationItemRenderer")) { + contents.remove(index); + } + final List comments; try { - comments = JsonUtils.getValues(contents, "commentThreadRenderer.comment.commentRenderer"); + comments = JsonUtils.getValues(contents, + "commentThreadRenderer.comment.commentRenderer"); } catch (final Exception e) { - throw new ParsingException("unable to get parse youtube comments", e); + throw new ParsingException("Unable to get parse youtube comments", e); } for (final Object c : comments) { if (c instanceof JsonObject) { - final CommentsInfoItemExtractor extractor = - new YoutubeCommentsInfoItemExtractor((JsonObject) c, getUrl(), getTimeAgoParser()); + final CommentsInfoItemExtractor extractor = new YoutubeCommentsInfoItemExtractor( + (JsonObject) c, getUrl(), getTimeAgoParser()); collector.commit(extractor); } } } @Override - public void onFetchPage(@Nonnull final Downloader downloader) throws IOException, ExtractionException { - final Map> requestHeaders = new HashMap<>(); - requestHeaders.put("User-Agent", singletonList(USER_AGENT)); - final Response response 
= downloader.get(getUrl(), requestHeaders, getExtractorLocalization()); - responseBody = YoutubeParsingHelper.unescapeDocument(response.responseBody()); - ytClientVersion = findValue(responseBody, "INNERTUBE_CONTEXT_CLIENT_VERSION\":\"", "\""); - ytClientName = Parser.matchGroup1(YT_CLIENT_NAME_PATTERN, responseBody); + public void onFetchPage(@Nonnull final Downloader downloader) + throws IOException, ExtractionException { + final Localization localization = getExtractorLocalization(); + final byte[] body = JsonWriter.string(prepareDesktopJsonBuilder(localization, + getExtractorContentCountry()) + .value("videoId", getId()) + .done()) + .getBytes(UTF_8); + + nextResponse = getJsonPostResponse("next", body, localization); } - private String makeAjaxRequest(final String siteUrl) throws IOException, ReCaptchaException { - final Map> requestHeaders = new HashMap<>(); - requestHeaders.put("Accept", singletonList("*/*")); - requestHeaders.put("User-Agent", singletonList(USER_AGENT)); - requestHeaders.put("X-YouTube-Client-Version", singletonList(ytClientVersion)); - requestHeaders.put("X-YouTube-Client-Name", singletonList(ytClientName)); - return getDownloader().get(siteUrl, requestHeaders, getExtractorLocalization()).responseBody(); - } - - private String getDataString(final Map params) throws UnsupportedEncodingException { - final StringBuilder result = new StringBuilder(); - boolean first = true; - for (final Map.Entry entry : params.entrySet()) { - if (first) { - first = false; - } else { - result.append("&"); - } - result.append(URLEncoder.encode(entry.getKey(), UTF_8)); - result.append("="); - result.append(URLEncoder.encode(entry.getValue(), UTF_8)); - } - return result.toString(); - } - - private String findValue(final String doc, final String start, final String end) { - int beginIndex = doc.indexOf(start); - // Start string was not found - if (beginIndex == -1) { - return null; - } - beginIndex = beginIndex + start.length(); - final int endIndex = 
doc.indexOf(end, beginIndex); - // End string was not found - if (endIndex == -1) { - return null; - } - return doc.substring(beginIndex, endIndex); - } - @Override - public boolean isCommentsDisabled() { + public boolean isCommentsDisabled() throws ExtractionException { // Check if commentsDisabled has to be initialized if (!optCommentsDisabled.isPresent()) { // Initialize commentsDisabled diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java index 7a68ad458..de4ad3578 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java @@ -21,7 +21,9 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract private final String url; private final TimeAgoParser timeAgoParser; - public YoutubeCommentsInfoItemExtractor(JsonObject json, String url, TimeAgoParser timeAgoParser) { + public YoutubeCommentsInfoItemExtractor(final JsonObject json, + final String url, + final TimeAgoParser timeAgoParser) { this.json = json; this.url = url; this.timeAgoParser = timeAgoParser; @@ -37,7 +39,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract try { final JsonArray arr = JsonUtils.getArray(json, "authorThumbnail.thumbnails"); return JsonUtils.getString(arr.getObject(2), "url"); - } catch (Exception e) { + } catch (final Exception e) { throw new ParsingException("Could not get thumbnail url", e); } } @@ -46,7 +48,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract public String getName() throws ParsingException { try { return getTextFromObject(JsonUtils.getObject(json, "authorText")); - } catch (Exception 
e) { + } catch (final Exception e) { return EMPTY_STRING; } } @@ -55,7 +57,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract public String getTextualUploadDate() throws ParsingException { try { return getTextFromObject(JsonUtils.getObject(json, "publishedTimeText")); - } catch (Exception e) { + } catch (final Exception e) { throw new ParsingException("Could not get publishedTimeText", e); } } @@ -64,7 +66,8 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract @Override public DateWrapper getUploadDate() throws ParsingException { String textualPublishedTime = getTextualUploadDate(); - if (timeAgoParser != null && textualPublishedTime != null && !textualPublishedTime.isEmpty()) { + if (timeAgoParser != null && textualPublishedTime != null + && !textualPublishedTime.isEmpty()) { return timeAgoParser.parse(textualPublishedTime); } else { return null; @@ -72,33 +75,51 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract } /** - * @implNote The method is parsing internally a localized string.
+ * @implNote The method tries first to get the exact like count by using the accessibility data + * returned. But if the parsing of this accessibility data fails, the method parses internally + * a localized string. + *
      * <ul>
-     *     <li>
-     *          More than 1k likes will result in an inaccurate number
-     *     </li>
-     *     <li>
-     *          This will fail for other languages than English.
-     *          However as long as the Extractor only uses "en-GB"
-     *          (as seen in {@link org.schabi.newpipe.extractor.services.youtube.YoutubeService#SUPPORTED_LANGUAGES})
-     *          everything will work fine.
-     *     </li>
+     *     <li>More than 1k likes will result in an inaccurate number</li>
+     *     <li>This will fail for other languages than English. However as long as the Extractor
+     *     only uses "en-GB" (as seen in {@link
+     *     org.schabi.newpipe.extractor.services.youtube.YoutubeService#getSupportedLocalizations})
+     *     , everything will work fine.</li>
      * </ul>
      * <br>
* Consider using {@link #getTextualLikeCount()} */ @Override public int getLikeCount() throws ParsingException { - // This may return a language dependent version, e.g. in German: 3,3 Mio - final String textualLikeCount = getTextualLikeCount(); + // Try first to get the exact like count by using the accessibility data + final String likeCount; try { - if (Utils.isBlank(textualLikeCount)) { + likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(json, + "actionButtons.commentActionButtonsRenderer.likeButton.toggleButtonRenderer.accessibilityData.accessibilityData.label")); + } catch (final Exception e) { + // Use the approximate like count returned into the voteCount object + // This may return a language dependent version, e.g. in German: 3,3 Mio + final String textualLikeCount = getTextualLikeCount(); + try { + if (Utils.isBlank(textualLikeCount)) { + return 0; + } + + return (int) Utils.mixedNumberWordToLong(textualLikeCount); + } catch (final Exception i) { + throw new ParsingException( + "Unexpected error while converting textual like count to like count", i); + } + } + + try { + if (Utils.isBlank(likeCount)) { return 0; } - return (int) Utils.mixedNumberWordToLong(textualLikeCount); - } catch (Exception e) { - throw new ParsingException("Unexpected error while converting textual like count to like count", e); + return Integer.parseInt(likeCount); + } catch (final Exception e) { + throw new ParsingException("Unexpected error while parsing like count as Integer", e); } } @@ -133,8 +154,8 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract return EMPTY_STRING; } return getTextFromObject(voteCountObj); - } catch (Exception e) { - throw new ParsingException("Could not get vote count", e); + } catch (final Exception e) { + throw new ParsingException("Could not get the vote count", e); } } @@ -148,9 +169,10 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract return EMPTY_STRING; } final String 
commentText = getTextFromObject(contentText); - // youtube adds U+FEFF in some comments. eg. https://www.youtube.com/watch?v=Nj4F63E59io + // YouTube adds U+FEFF in some comments. + // eg. https://www.youtube.com/watch?v=Nj4F63E59io return Utils.removeUTF8BOM(commentText); - } catch (Exception e) { + } catch (final Exception e) { throw new ParsingException("Could not get comment text", e); } } @@ -159,7 +181,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract public String getCommentId() throws ParsingException { try { return JsonUtils.getString(json, "commentId"); - } catch (Exception e) { + } catch (final Exception e) { throw new ParsingException("Could not get comment id", e); } } @@ -169,14 +191,16 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract try { JsonArray arr = JsonUtils.getArray(json, "authorThumbnail.thumbnails"); return JsonUtils.getString(arr.getObject(2), "url"); - } catch (Exception e) { + } catch (final Exception e) { throw new ParsingException("Could not get author thumbnail", e); } } @Override public boolean isHeartedByUploader() throws ParsingException { - return json.has("creatorHeart"); + final JsonObject commentActionButtonsRenderer = json.getObject("actionButtons") + .getObject("commentActionButtonsRenderer"); + return commentActionButtonsRenderer.has("creatorHeart"); } @Override @@ -185,15 +209,14 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract } public boolean isUploaderVerified() { - // impossible to get this information from the mobile layout - return false; + return json.has("authorCommentBadge"); } @Override public String getUploaderName() throws ParsingException { try { return getTextFromObject(JsonUtils.getObject(json, "authorText")); - } catch (Exception e) { + } catch (final Exception e) { return EMPTY_STRING; } } @@ -201,10 +224,10 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract @Override 
public String getUploaderUrl() throws ParsingException { try { - return "https://youtube.com/channel/" + JsonUtils.getString(json, "authorEndpoint.browseEndpoint.browseId"); - } catch (Exception e) { + return "https://www.youtube.com/channel/" + JsonUtils.getString(json, + "authorEndpoint.browseEndpoint.browseId"); + } catch (final Exception e) { return EMPTY_STRING; } } - } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java index 421fc13f3..0e83c07b8 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeCommentsLinkHandlerFactory.java @@ -16,7 +16,7 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory { @Override public String getUrl(String id) { - return "https://m.youtube.com/watch?v=" + id; + return "https://www.youtube.com/watch?v=" + id; } @Override