From 5d883d100c96233a4f068f1853e11297bb5533fe Mon Sep 17 00:00:00 2001
From: wb9688
Date: Mon, 24 Feb 2020 18:24:36 +0100
Subject: [PATCH] Implement pagination in YoutubeSearchExtractor

---
 .../extractors/YoutubeSearchExtractor.java | 96 +++++++++++--------
 .../YoutubeSearchQueryHandlerFactory.java  |  8 +-
 2 files changed, 62 insertions(+), 42 deletions(-)

diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java
index 5d04a3891..07954334f 100644
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java
@@ -5,9 +5,7 @@ import com.grack.nanojson.JsonObject;
 import com.grack.nanojson.JsonParser;
 import com.grack.nanojson.JsonParserException;
 
-import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
 import org.schabi.newpipe.extractor.InfoItem;
 import org.schabi.newpipe.extractor.StreamingService;
 import org.schabi.newpipe.extractor.downloader.Downloader;
@@ -19,12 +17,12 @@ import org.schabi.newpipe.extractor.localization.TimeAgoParser;
 import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
 import org.schabi.newpipe.extractor.search.SearchExtractor;
 import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
-import org.schabi.newpipe.extractor.utils.Parser;
 
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 import javax.annotation.Nonnull;
 
@@ -73,58 +71,70 @@ public class YoutubeSearchExtractor extends SearchExtractor {
 
     @Override
     public String getSearchSuggestion() {
-        final Element el = doc.select("div[class*=\"spell-correction\"]").first();
-        if (el != null) {
-            return el.select("a").first().text();
-        } else {
+        JsonObject showingResultsForRenderer = initialData.getObject("contents")
+                .getObject("twoColumnSearchResultsRenderer").getObject("primaryContents")
+                .getObject("sectionListRenderer").getArray("contents").getObject(0)
+                .getObject("itemSectionRenderer").getArray("contents").getObject(0)
+                .getObject("showingResultsForRenderer");
+        if (showingResultsForRenderer == null) {
             return "";
+        } else {
+            return showingResultsForRenderer.getObject("correctedQuery").getArray("runs")
+                    .getObject(0).getString("text");
         }
     }
 
     @Nonnull
     @Override
     public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
-        return new InfoItemsPage<>(collectItems(doc), getNextPageUrl());
+        InfoItemsSearchCollector collector = getInfoItemSearchCollector();
+        JsonArray videos = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
+                .getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
+                .getObject(0).getObject("itemSectionRenderer").getArray("contents");
+
+        collectStreamsFrom(collector, videos);
+        return new InfoItemsPage<>(collector, getNextPageUrl());
     }
 
     @Override
     public String getNextPageUrl() throws ExtractionException {
-        return getUrl() + "&page=" + 2;
+        return getNextPageUrlFrom(initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
+                .getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
+                .getObject(0).getObject("itemSectionRenderer").getArray("continuations"));
     }
 
     @Override
     public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
-        // TODO: Get extracting next pages working
-        final String response = getDownloader().get(pageUrl, getExtractorLocalization()).responseBody();
-        doc = Jsoup.parse(response, pageUrl);
+        if (pageUrl == null || pageUrl.isEmpty()) {
+            throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));
+        }
 
-        return new InfoItemsPage<>(collectItems(doc), getNextPageUrlFromCurrentUrl(pageUrl));
-    }
-
-    private String getNextPageUrlFromCurrentUrl(String currentUrl)
-            throws MalformedURLException, UnsupportedEncodingException {
-        final int pageNr = Integer.parseInt(
-                Parser.compatParseMap(
-                        new URL(currentUrl)
-                                .getQuery())
-                        .get("page"));
-
-        return currentUrl.replace("&page=" + pageNr,
-                "&page=" + Integer.toString(pageNr + 1));
-    }
-
-    private InfoItemsSearchCollector collectItems(Document doc) throws NothingFoundException, ParsingException {
         InfoItemsSearchCollector collector = getInfoItemSearchCollector();
+        JsonArray ajaxJson;
+        try {
+            Map<String, List<String>> headers = new HashMap<>();
+            headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
+            headers.put("X-YouTube-Client-Version", Collections.singletonList("2.20200221.03.00")); // TODO: Automatically get YouTube client version somehow
+            final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();
+            ajaxJson = JsonParser.array().from(response);
+        } catch (JsonParserException pe) {
+            throw new ParsingException("Could not parse json data for next streams", pe);
+        }
+
+        JsonObject itemSectionRenderer = ajaxJson.getObject(1).getObject("response")
+                .getObject("continuationContents").getObject("itemSectionContinuation");
+
+        collectStreamsFrom(collector, itemSectionRenderer.getArray("contents"));
+
+        return new InfoItemsPage<>(collector, getNextPageUrlFrom(itemSectionRenderer.getArray("continuations")));
+    }
+
+    private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException, ParsingException {
         collector.reset();
 
         final TimeAgoParser timeAgoParser = getTimeAgoParser();
 
-        if (initialData == null) initialData = YoutubeParsingHelper.getInitialData(doc.toString());
-        JsonArray list = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
-                .getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
-                .getObject(0).getObject("itemSectionRenderer").getArray("contents");
-
-        for (Object item : list) {
+        for (Object item : videos) {
             if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
                 throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
                         .getObject("bodyText").getArray("runs").getObject(0).getString("text"));
@@ -136,7 +146,17 @@ public class YoutubeSearchExtractor extends SearchExtractor {
                 collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
             }
         }
-        return collector;
     }
 
+    private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {
+        if (continuations == null) {
+            return "";
+        }
+
+        JsonObject nextContinuationData = continuations.getObject(0).getObject("nextContinuationData");
+        String continuation = nextContinuationData.getString("continuation");
+        String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");
+        return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation
+                + "&itct=" + clickTrackingParams;
+    }
 }
diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeSearchQueryHandlerFactory.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeSearchQueryHandlerFactory.java
index c17600742..13481b345 100644
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeSearchQueryHandlerFactory.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/linkHandler/YoutubeSearchQueryHandlerFactory.java
@@ -24,13 +24,13 @@ public class YoutubeSearchQueryHandlerFactory extends SearchQueryHandlerFactory
     public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException {
         try {
             final String url = "https://www.youtube.com/results"
-                    + "?q=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
+                    + "?search_query=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
 
             if (contentFilters.size() > 0) {
                 switch (contentFilters.get(0)) {
-                    case VIDEOS: return url + "&sp=EgIQAVAU";
-                    case CHANNELS: return url + "&sp=EgIQAlAU";
-                    case PLAYLISTS: return url + "&sp=EgIQA1AU";
+                    case VIDEOS: return url + "&sp=EgIQAQ%253D%253D";
+                    case CHANNELS: return url + "&sp=EgIQAg%253D%253D";
+                    case PLAYLISTS: return url + "&sp=EgIQAw%253D%253D";
                     case ALL:
                     default:
                 }