[YouTube] Improve detection of reCAPTCHA pages

This commit is contained in:
Mauricio Colli 2019-10-29 02:00:29 -03:00
parent 9a325b280d
commit c4fe2183ce
No known key found for this signature in database
GPG Key ID: F200BFD6F29DDD85
8 changed files with 59 additions and 26 deletions

View File

@ -62,6 +62,9 @@ public interface Downloader {
DownloadResponse head(String siteUrl) throws IOException, ReCaptchaException; DownloadResponse head(String siteUrl) throws IOException, ReCaptchaException;
/**
 * Fetches {@code siteUrl} and returns the complete response, taking the given
 * localization into account.
 * <p>
 * NOTE(review): how the localization is applied is left to the implementation;
 * presumably it selects the language of the returned content - confirm against
 * the implementing classes.
 *
 * @param siteUrl      the URL to request
 * @param localization the localization to apply to the request
 * @return the response produced for the request
 * @throws IOException        declared for I/O failures while performing the request
 * @throws ReCaptchaException declared for responses that require solving a reCAPTCHA
 */
DownloadResponse get(String siteUrl, Localization localization)
throws IOException, ReCaptchaException;
DownloadResponse get(String siteUrl, DownloadRequest request) DownloadResponse get(String siteUrl, DownloadRequest request)
throws IOException, ReCaptchaException; throws IOException, ReCaptchaException;

View File

@ -7,6 +7,7 @@ import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.NewPipe; import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
@ -14,6 +15,7 @@ import org.schabi.newpipe.extractor.channel.ChannelExtractor;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.utils.DonationLinkHelper; import org.schabi.newpipe.extractor.utils.DonationLinkHelper;
@ -60,8 +62,8 @@ public class YoutubeChannelExtractor extends ChannelExtractor {
@Override @Override
public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
String channelUrl = super.getUrl() + CHANNEL_URL_PARAMETERS; String channelUrl = super.getUrl() + CHANNEL_URL_PARAMETERS;
String pageContent = downloader.download(channelUrl); final DownloadResponse response = downloader.get(channelUrl);
doc = Jsoup.parse(pageContent, channelUrl); doc = YoutubeParsingHelper.parseAndCheckPage(channelUrl, response);
} }
@Override @Override

View File

@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
@ -35,8 +36,9 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor {
@Override @Override
public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
String pageContent = downloader.download(getUrl()); final String url = getUrl();
doc = Jsoup.parse(pageContent, getUrl()); final DownloadResponse response = downloader.get(url);
doc = YoutubeParsingHelper.parseAndCheckPage(url, response);
} }
@Override @Override

View File

@ -3,6 +3,7 @@ package org.schabi.newpipe.extractor.services.youtube.extractors;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.InfoItem; import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
@ -12,6 +13,7 @@ import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
import org.schabi.newpipe.extractor.search.SearchExtractor; import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler; import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
import org.schabi.newpipe.extractor.utils.Localization; import org.schabi.newpipe.extractor.utils.Localization;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.utils.Parser;
import javax.annotation.Nonnull; import javax.annotation.Nonnull;
@ -52,13 +54,9 @@ public class YoutubeSearchExtractor extends SearchExtractor {
@Override @Override
public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
final String site;
final String url = getUrl(); final String url = getUrl();
//String url = builder.build().toString(); final DownloadResponse response = downloader.get(url, getLocalization());
//if we've been passed a valid language code, append it to the URL doc = YoutubeParsingHelper.parseAndCheckPage(url, response);
site = downloader.download(url, getLocalization());
doc = Jsoup.parse(site, url);
} }
@Override @Override

View File

@ -18,6 +18,7 @@ import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.linkhandler.LinkHandler; import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
import org.schabi.newpipe.extractor.services.youtube.ItagItem; import org.schabi.newpipe.extractor.services.youtube.ItagItem;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.*; import org.schabi.newpipe.extractor.stream.*;
import org.schabi.newpipe.extractor.utils.Localization; import org.schabi.newpipe.extractor.utils.Localization;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.utils.Parser;
@ -536,7 +537,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
if (watch.size() < 1) { if (watch.size() < 1) {
return null;// prevent the snackbar notification "report error" on age-restricted videos return null;// prevent the snackbar notification "report error" on age-restricted videos
} }
collector.commit(extractVideoPreviewInfo(watch.first().select("li").first())); collector.commit(extractVideoPreviewInfo(watch.first().select("li").first()));
return collector.getItems().get(0); return collector.getItems().get(0);
} catch (Exception e) { } catch (Exception e) {
@ -611,18 +612,12 @@ public class YoutubeStreamExtractor extends StreamExtractor {
private String pageHtml = null; private String pageHtml = null;
private String getPageHtml(Downloader downloader) throws IOException, ExtractionException {
final String verifiedUrl = getUrl() + VERIFIED_URL_PARAMS;
if (pageHtml == null) {
pageHtml = downloader.download(verifiedUrl);
}
return pageHtml;
}
@Override @Override
public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException { public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
final String pageContent = getPageHtml(downloader); final String verifiedUrl = getUrl() + VERIFIED_URL_PARAMS;
doc = Jsoup.parse(pageContent, getUrl()); final DownloadResponse response = downloader.get(verifiedUrl);
pageHtml = response.getResponseBody();
doc = YoutubeParsingHelper.parseAndCheckPage(verifiedUrl, response);
final String playerUrl; final String playerUrl;
// Check if the video is age restricted // Check if the video is age restricted
@ -634,7 +629,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
playerUrl = info.url; playerUrl = info.url;
isAgeRestricted = true; isAgeRestricted = true;
} else { } else {
final JsonObject ytPlayerConfig = getPlayerConfig(pageContent); final JsonObject ytPlayerConfig = getPlayerConfig();
playerArgs = getPlayerArgs(ytPlayerConfig); playerArgs = getPlayerArgs(ytPlayerConfig);
playerUrl = getPlayerUrl(ytPlayerConfig); playerUrl = getPlayerUrl(ytPlayerConfig);
isAgeRestricted = false; isAgeRestricted = false;
@ -650,9 +645,9 @@ public class YoutubeStreamExtractor extends StreamExtractor {
} }
} }
private JsonObject getPlayerConfig(String pageContent) throws ParsingException { private JsonObject getPlayerConfig() throws ParsingException {
try { try {
String ytPlayerConfigRaw = Parser.matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageContent); String ytPlayerConfigRaw = Parser.matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageHtml);
return JsonParser.object().from(ytPlayerConfigRaw); return JsonParser.object().from(ytPlayerConfigRaw);
} catch (Parser.RegexException e) { } catch (Parser.RegexException e) {
String errorReason = getErrorMessage(); String errorReason = getErrorMessage();

View File

@ -24,12 +24,14 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.Downloader; import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException; import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.kiosk.KioskExtractor; import org.schabi.newpipe.extractor.kiosk.KioskExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.StreamInfoItem; import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector; import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.utils.Localization; import org.schabi.newpipe.extractor.utils.Localization;
@ -56,8 +58,8 @@ public class YoutubeTrendingExtractor extends KioskExtractor<StreamInfoItem> {
url += "?gl=" + contentCountry; url += "?gl=" + contentCountry;
} }
String pageContent = downloader.download(url); final DownloadResponse response = downloader.get(url);
doc = Jsoup.parse(pageContent, url); doc = YoutubeParsingHelper.parseAndCheckPage(url, response);
} }
@Override @Override

View File

@ -1,7 +1,11 @@
package org.schabi.newpipe.extractor.services.youtube.linkHandler; package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import java.net.URL; import java.net.URL;
@ -30,6 +34,23 @@ public class YoutubeParsingHelper {
private YoutubeParsingHelper() { private YoutubeParsingHelper() {
} }
/**
 * Selectors evaluated by {@code parseAndCheckPage} against the parsed page.
 * A page is treated as a reCAPTCHA challenge as soon as any one of them matches
 * at least one element:
 * a {@code form} whose {@code action} attribute contains {@code /das_captcha}, or
 * an {@code input} whose {@code name} attribute contains {@code action_recaptcha_verify}.
 */
private static final String[] RECAPTCHA_DETECTION_SELECTORS = {
"form[action*=\"/das_captcha\"]",
"input[name*=\"action_recaptcha_verify\"]"
};
/**
 * Parses the body of a downloaded page and rejects it if it looks like a
 * reCAPTCHA challenge.
 * <p>
 * The response body is parsed with {@code url} as the base URI. Each entry of
 * {@code RECAPTCHA_DETECTION_SELECTORS} is then evaluated, in order, against the
 * parsed document; the first selector that matches at least one element aborts
 * the call with a {@link ReCaptchaException}.
 *
 * @param url      the URL the response was downloaded from; used as the base URI
 *                 for parsing and reported in the exception
 * @param response the download response whose body is parsed
 * @return the parsed document when none of the detection selectors match
 * @throws ReCaptchaException if any detection selector matches the parsed page
 */
public static Document parseAndCheckPage(final String url, final DownloadResponse response) throws ReCaptchaException {
    final String body = response.getResponseBody();
    final Document parsedPage = Jsoup.parse(body, url);

    for (int i = 0; i < RECAPTCHA_DETECTION_SELECTORS.length; i++) {
        final String selector = RECAPTCHA_DETECTION_SELECTORS[i];
        final boolean challengeFound = !parsedPage.select(selector).isEmpty();
        if (challengeFound) {
            // Name the matching selector so reports show which marker triggered the detection.
            final String reason = "reCAPTCHA challenge requested (detected with selector: \"" + selector + "\")";
            throw new ReCaptchaException(reason, url);
        }
    }

    return parsedPage;
}
public static boolean isYoutubeURL(URL url) { public static boolean isYoutubeURL(URL url) {
String host = url.getHost(); String host = url.getHost();
return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com") return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com")

View File

@ -16,6 +16,8 @@ import org.schabi.newpipe.extractor.DownloadResponse;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.utils.Localization; import org.schabi.newpipe.extractor.utils.Localization;
import static java.util.Collections.singletonList;
/* /*
* Created by Christian Schabesberger on 28.01.16. * Created by Christian Schabesberger on 28.01.16.
* *
@ -194,6 +196,14 @@ public class Downloader implements org.schabi.newpipe.extractor.Downloader {
return new DownloadResponse(con.getResponseCode(), null, con.getHeaderFields()); return new DownloadResponse(con.getResponseCode(), null, con.getHeaderFields());
} }
/**
 * Performs a request for {@code siteUrl} that carries the language of the given
 * localization in an {@code Accept-Language} request header.
 * <p>
 * The header is wrapped in a {@link DownloadRequest} (constructed with
 * {@code null} as its first argument) and the call is delegated to
 * {@link #get(String, DownloadRequest)}.
 *
 * @param siteUrl      the URL to request
 * @param localization the localization whose language is sent as {@code Accept-Language}
 * @return the response returned by {@link #get(String, DownloadRequest)}
 * @throws IOException        propagated from the delegated call
 * @throws ReCaptchaException propagated from the delegated call
 */
@Override
public DownloadResponse get(String siteUrl, Localization localization) throws IOException, ReCaptchaException {
    final Map<String, List<String>> headers = new HashMap<>();
    final List<String> acceptLanguage = singletonList(localization.getLanguage());
    headers.put("Accept-Language", acceptLanguage);

    final DownloadRequest localizedRequest = new DownloadRequest(null, headers);
    return get(siteUrl, localizedRequest);
}
@Override @Override
public DownloadResponse get(String siteUrl, DownloadRequest request) public DownloadResponse get(String siteUrl, DownloadRequest request)
throws IOException, ReCaptchaException { throws IOException, ReCaptchaException {