From ed850d0688d069a7f54f2c9c9bfd2be0f58ce055 Mon Sep 17 00:00:00 2001 From: bopol Date: Fri, 5 Mar 2021 13:33:25 +0100 Subject: [PATCH 1/2] [youtube] improve comments extraction performance - do not parse responseBody twice for continuation; instead, try to get commentsTokenInside with the new pattern ("sectionListRenderer") and try again with the old pattern ("commentSectionRenderer") on failure - do not unescape responseBody multiple times -> parse responseBody fewer times --- .../youtube/YoutubeParsingHelper.java | 10 ++++++++ .../extractors/YoutubeCommentsExtractor.java | 24 +++++++------------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java index bde3520ed..4368e1ce8 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java @@ -824,4 +824,14 @@ public class YoutubeParsingHelper { return false; } + + public static String unescapeDocument(final String doc) { + return doc + .replaceAll("\\\\x22", "\"") + .replaceAll("\\\\x7b", "{") + .replaceAll("\\\\x7d", "}") + .replaceAll("\\\\x5b", "[") + .replaceAll("\\\\x5d", "]"); + } + } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java index ee76f3598..7ab811580 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java @@ -15,6 +15,7 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException; 
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler; +import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper; import org.schabi.newpipe.extractor.utils.JsonUtils; import org.schabi.newpipe.extractor.utils.Parser; @@ -46,11 +47,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { - final String commentsTokenInside; - if (responseBody.contains("commentSectionRenderer")) { + String commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}"); + if (!commentsTokenInside.contains("continuation")) { commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}"); - } else { - commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}"); } final String commentsToken = findValue(commentsTokenInside, "continuation\":\"", "\""); return getPage(getNextPage(commentsToken)); @@ -133,7 +132,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { final Map> requestHeaders = new HashMap<>(); requestHeaders.put("User-Agent", singletonList(USER_AGENT)); final Response response = downloader.get(getUrl(), requestHeaders, getExtractorLocalization()); - responseBody = response.responseBody(); + responseBody = YoutubeParsingHelper.unescapeDocument(response.responseBody()); ytClientVersion = findValue(responseBody, "INNERTUBE_CONTEXT_CLIENT_VERSION\":\"", "\""); ytClientName = Parser.matchGroup1(YT_CLIENT_NAME_PATTERN, responseBody); } @@ -163,16 +162,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { return result.toString(); } - private String findValue(String doc, String start, String end) { - final String unescaped = doc - .replaceAll("\\\\x22", "\"") - .replaceAll("\\\\x7b", "{") - .replaceAll("\\\\x7d", "}") - .replaceAll("\\\\x5b", "[") - 
.replaceAll("\\\\x5d", "]"); - - final int beginIndex = unescaped.indexOf(start) + start.length(); - final int endIndex = unescaped.indexOf(end, beginIndex); - return unescaped.substring(beginIndex, endIndex); + private String findValue(final String doc, final String start, final String end) { + final int beginIndex = doc.indexOf(start) + start.length(); + final int endIndex = doc.indexOf(end, beginIndex); + return doc.substring(beginIndex, endIndex); } } From ff5273b882d8945e3e02a511aebb79ab195d102b Mon Sep 17 00:00:00 2001 From: bopol Date: Fri, 5 Mar 2021 14:39:01 +0100 Subject: [PATCH 2/2] Update extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java Co-authored-by: Tobi --- .../services/youtube/extractors/YoutubeCommentsExtractor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java index 7ab811580..4e6da2c53 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java @@ -48,7 +48,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor { @Override public InfoItemsPage getInitialPage() throws IOException, ExtractionException { String commentsTokenInside = findValue(responseBody, "sectionListRenderer", "}"); - if (!commentsTokenInside.contains("continuation")) { + if (!commentsTokenInside.contains("continuation\":\"")) { commentsTokenInside = findValue(responseBody, "commentSectionRenderer", "}"); } final String commentsToken = findValue(commentsTokenInside, "continuation\":\"", "\"");