Use the youtubei API for YouTube comments

Migrate YouTube comments to the desktop version by using the `next` endpoint of the InnerTube internal API.
With the desktop version, we can get the exact like count of YouTube comments by parsing the accessibility data (the current extraction is kept as a fallback). We can now also determine whether the uploader of a comment is verified.

Co-authored-by: TiA4f8R <74829229+TiA4f8R@users.noreply.github.com>
This commit is contained in:
FireMasterK 2021-07-29 23:23:43 +05:30 committed by TiA4f8R
parent 286d839a3b
commit f3e4c9d689
No known key found for this signature in database
GPG Key ID: E6D3E7F5949450DD
4 changed files with 188 additions and 157 deletions

View File

@ -2,6 +2,7 @@ package org.schabi.newpipe.extractor.comments;
import org.schabi.newpipe.extractor.ListExtractor;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
@ -17,7 +18,7 @@ public abstract class CommentsExtractor extends ListExtractor<CommentsInfoItem>
* @apiNote Warning: This method is experimental and may get removed in a future release.
* @return <code>true</code> if the comments are disabled otherwise <code>false</code> (default)
*/
public boolean isCommentsDisabled() {
public boolean isCommentsDisabled() throws ExtractionException {
// Base behaviour: comments are reported as enabled. The second signature line above
// differs from the first only by the added `throws ExtractionException` clause.
return false;
}

View File

@ -1,8 +1,18 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse;
import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.prepareDesktopJsonBuilder;
import static org.schabi.newpipe.extractor.utils.Utils.UTF_8;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.comments.CommentsExtractor;
@ -10,38 +20,19 @@ import org.schabi.newpipe.extractor.comments.CommentsInfoItem;
import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor;
import org.schabi.newpipe.extractor.comments.CommentsInfoItemsCollector;
import org.schabi.newpipe.extractor.downloader.Downloader;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Parser;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Pattern;
import static java.util.Collections.singletonList;
import static org.schabi.newpipe.extractor.utils.Utils.UTF_8;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonWriter;
public class YoutubeCommentsExtractor extends CommentsExtractor {
// using the mobile site for comments because it loads faster and uses get requests instead of post
private static final String USER_AGENT = "Mozilla/5.0 (Android 9; Mobile; rv:78.0) Gecko/20100101 Firefox/78.0";
private static final Pattern YT_CLIENT_NAME_PATTERN = Pattern.compile("INNERTUBE_CONTEXT_CLIENT_NAME\\\":(.*?)[,}]");
private String ytClientVersion;
private String ytClientName;
private String responseBody;
private JsonObject nextResponse;
/**
* Caching mechanism and holder of the commentsDisabled value.
@ -52,6 +43,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
* If the method or another one that is depending on disabled comments
* is now called again, the method execution can avoid unnecessary calls
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private Optional<Boolean> optCommentsDisabled = Optional.empty();
public YoutubeCommentsExtractor(
@ -60,6 +52,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
super(service, uiHandler);
}
@Nonnull
@Override
public InfoItemsPage<CommentsInfoItem> getInitialPage()
throws IOException, ExtractionException {
@ -81,163 +74,177 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
/**
* Finds the initial comments continuation token and records in
* {@code optCommentsDisabled} whether comments are disabled.
*
* <p>{@code optCommentsDisabled} is set to {@code true} when no token is found and to
* {@code false} otherwise.</p>
*
* @return the continuation token or null if none was found
*/
private String findInitialCommentsToken() {
final String continuationStartPattern = "continuation\":\"";
@Nullable
private String findInitialCommentsToken() throws ExtractionException {
// String scan of the response body: text after "sectionListRenderer" up to the next "}"
String commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}");
if (commentsTokenInside == null || !commentsTokenInside.contains(continuationStartPattern)) {
commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}");
// JSON lookup: the array of result sections inside the stored "next" response
final JsonArray jArray = JsonUtils.getArray(nextResponse,
"contents.twoColumnWatchNextResults.results.results.contents");
// Keep the first entry that has an itemSectionRenderer whose targetId is "comments-section"
final Optional<Object> itemSectionRenderer = jArray.stream().filter(o -> {
JsonObject jObj = (JsonObject) o;
if (jObj.has("itemSectionRenderer")) {
try {
return JsonUtils.getString(jObj, "itemSectionRenderer.targetId")
.equals("comments-section");
} catch (final ParsingException ignored) {
// The targetId could not be read: fall through and treat the entry as a non-match
}
}
return false;
}).findFirst();
final String token;
if (itemSectionRenderer.isPresent()) {
// The token is read from the first element of the matched section's "contents" array
token = JsonUtils.getString(((JsonObject) itemSectionRenderer.get())
.getObject("itemSectionRenderer").getArray("contents").getObject(0),
"continuationItemRenderer.continuationEndpoint.continuationCommand.token");
} else {
token = null;
}
// If no continuation token is found the comments are disabled
if (commentsTokenInside == null || !commentsTokenInside.contains(continuationStartPattern)) {
if (token == null) {
optCommentsDisabled = Optional.of(true);
return null;
}
// If a continuation token is found there are >= 0 comments
final String commentsToken = findValue(commentsTokenInside, continuationStartPattern, "\"");
optCommentsDisabled = Optional.of(false);
return commentsToken;
return token;
}
/**
 * Builds the page that is handed back when comments are disabled.
 *
 * @return a page constructed from two empty lists and a {@code null} page reference
 */
@Nonnull
private InfoItemsPage<CommentsInfoItem> getInfoItemsPageForDisabledComments() {
    // There is nothing to continue with, hence no follow-up page.
    final Page noFollowUpPage = null;
    return new InfoItemsPage<>(
            Collections.emptyList(),
            noFollowUpPage,
            Collections.emptyList());
}
/**
* Derives the page that follows the given comments response.
*
* @param ajaxJson the parsed comments response
* @return the next page, or null when no continuation could be read from the response
*/
private Page getNextPage(final JsonObject ajaxJson) throws ParsingException {
final JsonArray arr;
@Nullable
private Page getNextPage(@Nonnull final JsonObject ajaxJson) throws ExtractionException {
final JsonArray jsonArray;
final JsonArray onResponseReceivedEndpoints = ajaxJson.getArray(
"onResponseReceivedEndpoints");
// Only the last endpoint of the response is inspected.
// NOTE(review): this lookup sits outside the try below; with an empty array the index is -1.
// Confirm how JsonArray.getObject treats a negative index.
final JsonObject endpoint = onResponseReceivedEndpoints.getObject(
onResponseReceivedEndpoints.size() - 1);
try {
arr = JsonUtils.getArray(ajaxJson, "response.continuationContents.commentSectionContinuation.continuations");
// Reads "reloadContinuationItemsCommand", passing the "appendContinuationItemsAction"
// object as the second argument (presumably the default when the key is absent - confirm)
jsonArray = endpoint.getObject("reloadContinuationItemsCommand", endpoint.getObject(
"appendContinuationItemsAction")).getArray("continuationItems");
} catch (final Exception e) {
return null;
}
if (arr.isEmpty()) {
if (jsonArray.isEmpty()) {
return null;
}
final String continuation;
try {
continuation = JsonUtils.getString(arr.getObject(0), "nextContinuationData.continuation");
// The continuation token is taken from the last element of the continuation items
continuation = JsonUtils.getString(jsonArray.getObject(jsonArray.size() - 1),
"continuationItemRenderer.continuationEndpoint.continuationCommand.token");
} catch (final Exception e) {
// No readable token: there is no further page
return null;
}
return getNextPage(continuation);
}
/**
* Wraps a continuation token into a {@link Page}.
*
* @param continuation the continuation token of the next batch of comments
* @return the page describing that batch
*/
@Nonnull
private Page getNextPage(final String continuation) throws ParsingException {
final Map<String, String> params = new HashMap<>();
params.put("action_get_comments", "1");
params.put("pbj", "1");
params.put("ctoken", continuation);
try {
// The query parameters are URL-encoded by getDataString and appended to the endpoint URL
return new Page("https://m.youtube.com/watch_comment?" + getDataString(params));
} catch (final UnsupportedEncodingException e) {
throw new ParsingException("Could not get next page url", e);
}
return new Page(getUrl(), continuation); // The URL is ignored: getPage() only reads page.getId()
}
/**
* Loads one page of comments and the reference to the page after it.
*
* @param page the page to load
* @return the collected comments together with the next page
* @throws IllegalArgumentException if {@code page} is null or lacks the value it is checked for
*/
@Override
public InfoItemsPage<CommentsInfoItem> getPage(final Page page) throws IOException, ExtractionException {
public InfoItemsPage<CommentsInfoItem> getPage(final Page page)
throws IOException, ExtractionException {
// Short-circuit when comments are already known to be disabled
if (optCommentsDisabled.orElse(false)) {
return getInfoItemsPageForDisabledComments();
}
if (page == null || isNullOrEmpty(page.getUrl())) {
throw new IllegalArgumentException("Page doesn't contain an URL");
if (page == null || isNullOrEmpty(page.getId())) {
throw new IllegalArgumentException("Page doesn't have the continuation.");
}
final String ajaxResponse = makeAjaxRequest(page.getUrl());
final JsonObject ajaxJson;
try {
// The response is parsed as a JSON array and its element at index 1 is used
ajaxJson = JsonParser.array().from(ajaxResponse).getObject(1);
} catch (final Exception e) {
throw new ParsingException("Could not parse json data for comments", e);
}
final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(getServiceId());
final Localization localization = getExtractorLocalization();
// Request body: the desktop JSON builder output plus the page id as "continuation"
final byte[] body = JsonWriter.string(prepareDesktopJsonBuilder(localization,
getExtractorContentCountry())
.value("continuation", page.getId())
.done())
.getBytes(UTF_8);
// POST to the "next" endpoint
final JsonObject ajaxJson = getJsonPostResponse("next", body, localization);
final CommentsInfoItemsCollector collector = new CommentsInfoItemsCollector(
getServiceId());
collectCommentsFrom(collector, ajaxJson);
return new InfoItemsPage<>(collector, getNextPage(ajaxJson));
}
/**
* Commits one extractor per comment renderer found in the given response to the collector.
*
* @param collector the collector receiving the comment extractors
* @param ajaxJson the parsed comments response
* @throws ParsingException if the comment renderers cannot be read from the items
*/
private void collectCommentsFrom(final CommentsInfoItemsCollector collector, final JsonObject ajaxJson) throws ParsingException {
final JsonArray contents;
try {
contents = JsonUtils.getArray(ajaxJson, "response.continuationContents.commentSectionContinuation.items");
} catch (final Exception e) {
//no comments
private void collectCommentsFrom(final CommentsInfoItemsCollector collector,
@Nonnull final JsonObject ajaxJson) throws ParsingException {
final JsonArray onResponseReceivedEndpoints = ajaxJson.getArray(
"onResponseReceivedEndpoints");
// Only the last endpoint of the response is inspected.
// NOTE(review): with an empty array the index is -1. Confirm how JsonArray.getObject
// treats a negative index.
final JsonObject commentsEndpoint = onResponseReceivedEndpoints.getObject(
onResponseReceivedEndpoints.size() - 1);
// The items live under one of two keys, depending on which one the endpoint carries
final String path;
if (commentsEndpoint.has("reloadContinuationItemsCommand")) {
path = "reloadContinuationItemsCommand.continuationItems";
} else if (commentsEndpoint.has("appendContinuationItemsAction")) {
path = "appendContinuationItemsAction.continuationItems";
} else {
// No comments
return;
}
final JsonArray contents;
try {
// Work on a clone so the removal below does not modify the array held by the response
contents = (JsonArray) JsonUtils.getArray(commentsEndpoint, path).clone();
} catch (final Exception e) {
// No comments
return;
}
// Drop a trailing continuationItemRenderer entry; it is not a comment.
// NOTE(review): if contents is empty, index is -1 here as well. Confirm this cannot occur
// or guard it.
final int index = contents.size() - 1;
if (contents.getObject(index).has("continuationItemRenderer")) {
contents.remove(index);
}
final List<Object> comments;
try {
comments = JsonUtils.getValues(contents, "commentThreadRenderer.comment.commentRenderer");
comments = JsonUtils.getValues(contents,
"commentThreadRenderer.comment.commentRenderer");
} catch (final Exception e) {
throw new ParsingException("unable to get parse youtube comments", e);
throw new ParsingException("Unable to get parse youtube comments", e);
}
for (final Object c : comments) {
// Values that are not JSON objects are skipped
if (c instanceof JsonObject) {
final CommentsInfoItemExtractor extractor =
new YoutubeCommentsInfoItemExtractor((JsonObject) c, getUrl(), getTimeAgoParser());
final CommentsInfoItemExtractor extractor = new YoutubeCommentsInfoItemExtractor(
(JsonObject) c, getUrl(), getTimeAgoParser());
collector.commit(extractor);
}
}
}
/**
* Fetches the data that the rest of the extractor reads from.
*
* @param downloader the downloader handed in by the caller
*/
@Override
public void onFetchPage(@Nonnull final Downloader downloader) throws IOException, ExtractionException {
final Map<String, List<String>> requestHeaders = new HashMap<>();
requestHeaders.put("User-Agent", singletonList(USER_AGENT));
// GET of the extractor URL; the unescaped body is kept in responseBody
final Response response = downloader.get(getUrl(), requestHeaders, getExtractorLocalization());
responseBody = YoutubeParsingHelper.unescapeDocument(response.responseBody());
// Client version and client name are scraped out of the page text
ytClientVersion = findValue(responseBody, "INNERTUBE_CONTEXT_CLIENT_VERSION\":\"", "\"");
ytClientName = Parser.matchGroup1(YT_CLIENT_NAME_PATTERN, responseBody);
public void onFetchPage(@Nonnull final Downloader downloader)
throws IOException, ExtractionException {
final Localization localization = getExtractorLocalization();
// Request body: the desktop JSON builder output plus this extractor's id as "videoId"
final byte[] body = JsonWriter.string(prepareDesktopJsonBuilder(localization,
getExtractorContentCountry())
.value("videoId", getId())
.done())
.getBytes(UTF_8);
// POST to the "next" endpoint; the parsed response is kept in nextResponse
nextResponse = getJsonPostResponse("next", body, localization);
}
/**
 * Issues a GET request for the given URL and returns the body of the response.
 *
 * <p>The request carries an {@code Accept} header, the {@code USER_AGENT} constant and the
 * stored client version and client name as {@code X-YouTube-Client-*} headers.</p>
 *
 * @param siteUrl the URL to request
 * @return the response body
 */
private String makeAjaxRequest(final String siteUrl) throws IOException, ReCaptchaException {
    final Map<String, List<String>> headers = new HashMap<>();
    headers.put("Accept", singletonList("*/*"));
    headers.put("User-Agent", singletonList(USER_AGENT));
    headers.put("X-YouTube-Client-Version", singletonList(ytClientVersion));
    headers.put("X-YouTube-Client-Name", singletonList(ytClientName));

    final Response ajaxResponse =
            getDownloader().get(siteUrl, headers, getExtractorLocalization());
    return ajaxResponse.responseBody();
}
/**
 * Joins the given parameters into a query string of the form {@code k1=v1&k2=v2}.
 *
 * <p>Keys and values are URL-encoded with the {@code UTF_8} charset name; pairs appear in
 * the iteration order of the map's entry set.</p>
 *
 * @param params the parameters to serialise
 * @return the encoded query string, empty when {@code params} has no entries
 * @throws UnsupportedEncodingException if the charset name is rejected by the encoder
 */
private String getDataString(final Map<String, String> params) throws UnsupportedEncodingException {
    final StringBuilder query = new StringBuilder();
    for (final Map.Entry<String, String> param : params.entrySet()) {
        // Every pair contributes at least "=", so a non-empty builder means a previous
        // pair was already written and a separator is needed.
        if (query.length() > 0) {
            query.append("&");
        }
        query.append(URLEncoder.encode(param.getKey(), UTF_8))
                .append("=")
                .append(URLEncoder.encode(param.getValue(), UTF_8));
    }
    return query.toString();
}
/**
 * Extracts the text that lies between the first occurrence of {@code start} and the next
 * occurrence of {@code end} after it.
 *
 * @param doc   the text to search
 * @param start the marker preceding the wanted value
 * @param end   the marker following the wanted value
 * @return the text between both markers, or null if either marker is not found
 */
private String findValue(final String doc, final String start, final String end) {
    final int markerIndex = doc.indexOf(start);
    if (markerIndex < 0) {
        // The start marker does not occur at all
        return null;
    }

    final int valueStart = markerIndex + start.length();
    final int valueEnd = doc.indexOf(end, valueStart);
    // A missing end marker means the value cannot be delimited
    return valueEnd < 0 ? null : doc.substring(valueStart, valueEnd);
}
@Override
public boolean isCommentsDisabled() {
public boolean isCommentsDisabled() throws ExtractionException {
// Check if commentsDisabled has to be initialized
if (!optCommentsDisabled.isPresent()) {
// Initialize commentsDisabled

View File

@ -21,7 +21,9 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
private final String url;
private final TimeAgoParser timeAgoParser;
public YoutubeCommentsInfoItemExtractor(JsonObject json, String url, TimeAgoParser timeAgoParser) {
public YoutubeCommentsInfoItemExtractor(final JsonObject json,
final String url,
final TimeAgoParser timeAgoParser) {
this.json = json;
this.url = url;
this.timeAgoParser = timeAgoParser;
@ -37,7 +39,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
try {
final JsonArray arr = JsonUtils.getArray(json, "authorThumbnail.thumbnails");
return JsonUtils.getString(arr.getObject(2), "url");
} catch (Exception e) {
} catch (final Exception e) {
throw new ParsingException("Could not get thumbnail url", e);
}
}
@ -46,7 +48,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
/**
* Reads the text of the comment's "authorText" object.
*
* @return the text, or {@code EMPTY_STRING} if it cannot be read
*/
public String getName() throws ParsingException {
try {
return getTextFromObject(JsonUtils.getObject(json, "authorText"));
} catch (Exception e) {
} catch (final Exception e) {
// Best effort: any failure is swallowed and an empty string is returned instead
return EMPTY_STRING;
}
}
@ -55,7 +57,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
/**
* Reads the text of the comment's "publishedTimeText" object.
*
* @return the textual upload date as found in the JSON
* @throws ParsingException if the text cannot be read
*/
public String getTextualUploadDate() throws ParsingException {
try {
return getTextFromObject(JsonUtils.getObject(json, "publishedTimeText"));
} catch (Exception e) {
} catch (final Exception e) {
// Unlike the name getters, a failure here is reported to the caller
throw new ParsingException("Could not get publishedTimeText", e);
}
}
@ -64,7 +66,8 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
@Override
public DateWrapper getUploadDate() throws ParsingException {
String textualPublishedTime = getTextualUploadDate();
if (timeAgoParser != null && textualPublishedTime != null && !textualPublishedTime.isEmpty()) {
if (timeAgoParser != null && textualPublishedTime != null
&& !textualPublishedTime.isEmpty()) {
return timeAgoParser.parse(textualPublishedTime);
} else {
return null;
@ -72,33 +75,51 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
}
/**
* @implNote The method is parsing internally a localized string.<br>
* @implNote The method tries first to get the exact like count by using the accessibility data
* returned. But if the parsing of this accessibility data fails, the method parses internally
* a localized string.
* <br>
* <ul>
* <li>
* More than 1k likes will result in an inaccurate number
* </li>
* <li>
* This will fail for other languages than English.
* However as long as the Extractor only uses "en-GB"
* (as seen in {@link org.schabi.newpipe.extractor.services.youtube.YoutubeService#SUPPORTED_LANGUAGES})
* everything will work fine.
* </li>
* <li>More than 1k likes will result in an inaccurate number</li>
* <li>This will fail for other languages than English. However as long as the Extractor
* only uses "en-GB" (as seen in {@link
* org.schabi.newpipe.extractor.services.youtube.YoutubeService#getSupportedLocalizations})
* , everything will work fine.</li>
* </ul>
* <br>
* Consider using {@link #getTextualLikeCount()}
*/
@Override
public int getLikeCount() throws ParsingException {
// This may return a language dependent version, e.g. in German: 3,3 Mio
final String textualLikeCount = getTextualLikeCount();
// Try first to get the exact like count by using the accessibility data
final String likeCount;
try {
if (Utils.isBlank(textualLikeCount)) {
// Only the digits of the like button's accessibility label are kept
likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(json,
"actionButtons.commentActionButtonsRenderer.likeButton.toggleButtonRenderer.accessibilityData.accessibilityData.label"));
} catch (final Exception e) {
// Use the approximate like count returned into the voteCount object
// This may return a language dependent version, e.g. in German: 3,3 Mio
final String textualLikeCount = getTextualLikeCount();
try {
// A blank textual count is treated as zero likes
if (Utils.isBlank(textualLikeCount)) {
return 0;
}
// NOTE(review): the (int) cast narrows the value returned by mixedNumberWordToLong
// (presumably a long - confirm); very large counts would be truncated
return (int) Utils.mixedNumberWordToLong(textualLikeCount);
} catch (final Exception i) {
throw new ParsingException(
"Unexpected error while converting textual like count to like count", i);
}
}
try {
// A label without any digit is treated as zero likes
if (Utils.isBlank(likeCount)) {
return 0;
}
return (int) Utils.mixedNumberWordToLong(textualLikeCount);
} catch (Exception e) {
throw new ParsingException("Unexpected error while converting textual like count to like count", e);
return Integer.parseInt(likeCount);
} catch (final Exception e) {
// Covers, among others, a digit string that does not fit into an int
throw new ParsingException("Unexpected error while parsing like count as Integer", e);
}
}
@ -133,8 +154,8 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
return EMPTY_STRING;
}
return getTextFromObject(voteCountObj);
} catch (Exception e) {
throw new ParsingException("Could not get vote count", e);
} catch (final Exception e) {
throw new ParsingException("Could not get the vote count", e);
}
}
@ -148,9 +169,10 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
return EMPTY_STRING;
}
final String commentText = getTextFromObject(contentText);
// youtube adds U+FEFF in some comments. eg. https://www.youtube.com/watch?v=Nj4F63E59io<feff>
// YouTube adds U+FEFF in some comments.
// eg. https://www.youtube.com/watch?v=Nj4F63E59io<feff>
return Utils.removeUTF8BOM(commentText);
} catch (Exception e) {
} catch (final Exception e) {
throw new ParsingException("Could not get comment text", e);
}
}
@ -159,7 +181,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
/**
* Reads the "commentId" string of the comment JSON.
*
* @return the comment id
* @throws ParsingException if the id cannot be read
*/
public String getCommentId() throws ParsingException {
try {
return JsonUtils.getString(json, "commentId");
} catch (Exception e) {
} catch (final Exception e) {
throw new ParsingException("Could not get comment id", e);
}
}
@ -169,14 +191,16 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
try {
JsonArray arr = JsonUtils.getArray(json, "authorThumbnail.thumbnails");
return JsonUtils.getString(arr.getObject(2), "url");
} catch (Exception e) {
} catch (final Exception e) {
throw new ParsingException("Could not get author thumbnail", e);
}
}
@Override
public boolean isHeartedByUploader() throws ParsingException {
// Lookup directly on the comment JSON
return json.has("creatorHeart");
// Lookup one level deeper: "creatorHeart" is searched inside
// actionButtons.commentActionButtonsRenderer
final JsonObject commentActionButtonsRenderer = json.getObject("actionButtons")
.getObject("commentActionButtonsRenderer");
return commentActionButtonsRenderer.has("creatorHeart");
}
@Override
@ -185,15 +209,14 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
}
public boolean isUploaderVerified() {
// impossible to get this information from the mobile layout
return false;
// The author is reported as verified when the comment JSON has an "authorCommentBadge" entry
return json.has("authorCommentBadge");
}
/**
* Reads the text of the comment's "authorText" object.
*
* @return the text, or {@code EMPTY_STRING} if it cannot be read
*/
@Override
public String getUploaderName() throws ParsingException {
try {
return getTextFromObject(JsonUtils.getObject(json, "authorText"));
} catch (Exception e) {
} catch (final Exception e) {
// Best effort: any failure is swallowed and an empty string is returned instead
return EMPTY_STRING;
}
}
@ -201,10 +224,10 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
/**
* Builds the channel URL of the comment author from the author's browse id.
*
* @return the channel URL, or {@code EMPTY_STRING} if the browse id cannot be read
*/
@Override
public String getUploaderUrl() throws ParsingException {
try {
return "https://youtube.com/channel/" + JsonUtils.getString(json, "authorEndpoint.browseEndpoint.browseId");
} catch (Exception e) {
// Same lookup, with the "www." host prefix in the URL
return "https://www.youtube.com/channel/" + JsonUtils.getString(json,
"authorEndpoint.browseEndpoint.browseId");
} catch (final Exception e) {
// Best effort: any failure is swallowed and an empty string is returned instead
return EMPTY_STRING;
}
}
}

View File

@ -16,7 +16,7 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory {
/**
* Builds the watch URL for the given id.
*
* @param id the id appended as the {@code v} query parameter
* @return the watch URL
*/
@Override
public String getUrl(String id) {
// Mobile host
return "https://m.youtube.com/watch?v=" + id;
// Desktop host
return "https://www.youtube.com/watch?v=" + id;
}
@Override