Implement pagination in YoutubeSearchExtractor

This commit is contained in:
wb9688 2020-02-24 18:24:36 +01:00 committed by TobiGr
parent c0a8e01889
commit 5d883d100c
2 changed files with 62 additions and 42 deletions

View File

@ -5,9 +5,7 @@ import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser; import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException; import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.InfoItem; import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.StreamingService; import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader; import org.schabi.newpipe.extractor.downloader.Downloader;
@ -19,12 +17,12 @@ import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector; import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
import org.schabi.newpipe.extractor.search.SearchExtractor; import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper; import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.utils.Parser;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException; import java.util.Collections;
import java.net.MalformedURLException; import java.util.HashMap;
import java.net.URL; import java.util.List;
import java.util.Map;
import javax.annotation.Nonnull; import javax.annotation.Nonnull;
@ -73,58 +71,70 @@ public class YoutubeSearchExtractor extends SearchExtractor {
@Override @Override
public String getSearchSuggestion() { public String getSearchSuggestion() {
final Element el = doc.select("div[class*=\"spell-correction\"]").first(); JsonObject showingResultsForRenderer = initialData.getObject("contents")
if (el != null) { .getObject("twoColumnSearchResultsRenderer").getObject("primaryContents")
return el.select("a").first().text(); .getObject("sectionListRenderer").getArray("contents").getObject(0)
} else { .getObject("itemSectionRenderer").getArray("contents").getObject(0)
.getObject("showingResultsForRenderer");
if (showingResultsForRenderer == null) {
return ""; return "";
} else {
return showingResultsForRenderer.getObject("correctedQuery").getArray("runs")
.getObject(0).getString("text");
} }
} }
@Nonnull @Nonnull
@Override @Override
public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException { public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
return new InfoItemsPage<>(collectItems(doc), getNextPageUrl()); InfoItemsSearchCollector collector = getInfoItemSearchCollector();
JsonArray videos = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
collectStreamsFrom(collector, videos);
return new InfoItemsPage<>(collector, getNextPageUrl());
} }
@Override @Override
public String getNextPageUrl() throws ExtractionException { public String getNextPageUrl() throws ExtractionException {
return getUrl() + "&page=" + 2; return getNextPageUrlFrom(initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("continuations"));
} }
@Override @Override
public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException { public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
// TODO: Get extracting next pages working if (pageUrl == null || pageUrl.isEmpty()) {
final String response = getDownloader().get(pageUrl, getExtractorLocalization()).responseBody(); throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));
doc = Jsoup.parse(response, pageUrl);
return new InfoItemsPage<>(collectItems(doc), getNextPageUrlFromCurrentUrl(pageUrl));
} }
private String getNextPageUrlFromCurrentUrl(String currentUrl)
throws MalformedURLException, UnsupportedEncodingException {
final int pageNr = Integer.parseInt(
Parser.compatParseMap(
new URL(currentUrl)
.getQuery())
.get("page"));
return currentUrl.replace("&page=" + pageNr,
"&page=" + Integer.toString(pageNr + 1));
}
private InfoItemsSearchCollector collectItems(Document doc) throws NothingFoundException, ParsingException {
InfoItemsSearchCollector collector = getInfoItemSearchCollector(); InfoItemsSearchCollector collector = getInfoItemSearchCollector();
JsonArray ajaxJson;
try {
Map<String, List<String>> headers = new HashMap<>();
headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
headers.put("X-YouTube-Client-Version", Collections.singletonList("2.20200221.03.00")); // TODO: Automatically get YouTube client version somehow
final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();
ajaxJson = JsonParser.array().from(response);
} catch (JsonParserException pe) {
throw new ParsingException("Could not parse json data for next streams", pe);
}
JsonObject itemSectionRenderer = ajaxJson.getObject(1).getObject("response")
.getObject("continuationContents").getObject("itemSectionContinuation");
collectStreamsFrom(collector, itemSectionRenderer.getArray("contents"));
return new InfoItemsPage<>(collector, getNextPageUrlFrom(itemSectionRenderer.getArray("continuations")));
}
private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException, ParsingException {
collector.reset(); collector.reset();
final TimeAgoParser timeAgoParser = getTimeAgoParser(); final TimeAgoParser timeAgoParser = getTimeAgoParser();
if (initialData == null) initialData = YoutubeParsingHelper.getInitialData(doc.toString()); for (Object item : videos) {
JsonArray list = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
for (Object item : list) {
if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) { if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer") throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
.getObject("bodyText").getArray("runs").getObject(0).getString("text")); .getObject("bodyText").getArray("runs").getObject(0).getString("text"));
@ -136,7 +146,17 @@ public class YoutubeSearchExtractor extends SearchExtractor {
collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer"))); collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
} }
} }
return collector;
} }
private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {
if (continuations == null) {
return "";
}
JsonObject nextContinuationData = continuations.getObject(0).getObject("nextContinuationData");
String continuation = nextContinuationData.getString("continuation");
String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");
return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation
+ "&itct=" + clickTrackingParams;
}
} }

View File

@ -24,13 +24,13 @@ public class YoutubeSearchQueryHandlerFactory extends SearchQueryHandlerFactory
public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException { public String getUrl(String searchString, List<String> contentFilters, String sortFilter) throws ParsingException {
try { try {
final String url = "https://www.youtube.com/results" final String url = "https://www.youtube.com/results"
+ "?q=" + URLEncoder.encode(searchString, CHARSET_UTF_8); + "?search_query=" + URLEncoder.encode(searchString, CHARSET_UTF_8);
if (contentFilters.size() > 0) { if (contentFilters.size() > 0) {
switch (contentFilters.get(0)) { switch (contentFilters.get(0)) {
case VIDEOS: return url + "&sp=EgIQAVAU"; case VIDEOS: return url + "&sp=EgIQAQ%253D%253D";
case CHANNELS: return url + "&sp=EgIQAlAU"; case CHANNELS: return url + "&sp=EgIQAg%253D%253D";
case PLAYLISTS: return url + "&sp=EgIQA1AU"; case PLAYLISTS: return url + "&sp=EgIQAw%253D%253D";
case ALL: case ALL:
default: default:
} }