diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractor.java index 5813cf09d..3f0192e9b 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractor.java @@ -10,40 +10,62 @@ import org.schabi.newpipe.extractor.localization.Localization; import org.schabi.newpipe.extractor.utils.Parser; import javax.annotation.Nonnull; +import java.util.regex.Pattern; /** - * YouTube restricts streaming their media in multiple ways by requiring clients to apply a cipher - * function on parameters of requests. - * The cipher function is sent alongside as a JavaScript function. + * The extractor of YouTube's base JavaScript player file. + * *

- * This class handling fetching the JavaScript file in order to allow other classes to extract the - * needed functions. + * YouTube restrict streaming their media in multiple ways by requiring their HTML5 clients to use + * a signature timestamp, and on streaming URLs a signature deobfuscation function for some + * contents and a throttling parameter deobfuscation one for all contents. + *

+ * + *

+ * This class handles fetching of this base JavaScript player file in order to allow other classes + * to extract the needed data. + *

+ * + *

+ * It will try to get the player URL from YouTube's IFrame resource first, and from a YouTube embed + * watch page as a fallback. + *

*/ public final class YoutubeJavaScriptExtractor { private static final String HTTPS = "https:"; + private static final String BASE_JS_PLAYER_URL_FORMAT = + "https://www.youtube.com/s/player/%s/player_ias.vflset/en_GB/base.js"; + private static final Pattern IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN = Pattern.compile( + "player\\\\/([a-z0-9]{8})\\\\/"); + private static final Pattern EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN = Pattern.compile( + "\"jsUrl\":\"(/s/player/[A-Za-z0-9]+/player_ias\\.vflset/[A-Za-z_-]+/base\\.js)\""); private static String cachedJavaScriptCode; private YoutubeJavaScriptExtractor() { } /** - * Extracts the JavaScript file. The result is cached, so subsequent calls use the result of - * previous calls. + * Extracts the JavaScript file. * - * @param videoId Does not influence the result, but a valid video id may help in the chance - * that YouTube tracks it. - * @return The whole JavaScript file as a string. - * @throws ParsingException If the extraction failed. + *

+ * The result is cached, so subsequent calls use the result of previous calls. + *

+ * + * @param videoId a YouTube video ID, which doesn't influence the result, but it may help in + * the chance that YouTube track it + * @return the whole JavaScript file as a string + * @throws ParsingException if the extraction failed */ @Nonnull - public static String extractJavaScriptCode(final String videoId) throws ParsingException { + public static String extractJavaScriptCode(@Nonnull final String videoId) + throws ParsingException { if (cachedJavaScriptCode == null) { String url; try { - url = YoutubeJavaScriptExtractor.extractJavaScriptUrl(); - } catch (final Exception i) { - url = YoutubeJavaScriptExtractor.extractJavaScriptUrl(videoId); + url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource(); + } catch (final Exception e) { + url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage(videoId); } final String playerJsUrl = YoutubeJavaScriptExtractor.cleanJavaScriptUrl(url); cachedJavaScriptCode = YoutubeJavaScriptExtractor.downloadJavaScriptCode(playerJsUrl); @@ -53,75 +75,83 @@ public final class YoutubeJavaScriptExtractor { } /** - * Same as {@link YoutubeJavaScriptExtractor#extractJavaScriptCode(String)} but with a constant - * value for videoId. - * Possible because the videoId has no influence on the result. + * Reset the cached JavaScript code. + * *

- * In the off chance that YouTube tracks with which video id the request is made, it may make - * sense to pass in video ids. - */ - @Nonnull - public static String extractJavaScriptCode() throws ParsingException { - return extractJavaScriptCode("d4IGg5dqeO8"); - } - - /** - * Reset the JavaScript code. It will be fetched again the next time - * {@link #extractJavaScriptCode()} or {@link #extractJavaScriptCode(String)} is called. + * It will be fetched again the next time {@link #extractJavaScriptCode(String)} is called. + *

*/ public static void resetJavaScriptCode() { cachedJavaScriptCode = null; } - public static String extractJavaScriptUrl() throws ParsingException { + @Nonnull + static String extractJavaScriptUrlWithIframeResource() throws ParsingException { + final String iframeUrl; + final String iframeContent; try { - final String iframeUrl = "https://www.youtube.com/iframe_api"; - final String iframeContent = NewPipe.getDownloader() - .get(iframeUrl, Localization.DEFAULT).responseBody(); - final String hashPattern = "player\\\\\\/([a-z0-9]{8})\\\\\\/"; - final String hash = Parser.matchGroup1(hashPattern, iframeContent); - - return String.format( - "https://www.youtube.com/s/player/%s/player_ias.vflset/en_US/base.js", hash); - } catch (final Exception ignored) { + iframeUrl = "https://www.youtube.com/iframe_api"; + iframeContent = NewPipe.getDownloader() + .get(iframeUrl, Localization.DEFAULT) + .responseBody(); + } catch (final Exception e) { + throw new ParsingException("Could not fetch IFrame resource", e); } - throw new ParsingException("Iframe API did not provide YouTube player js url"); + try { + final String hash = Parser.matchGroup1( + IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN, iframeContent); + return String.format(BASE_JS_PLAYER_URL_FORMAT, hash); + } catch (final Parser.RegexException e) { + throw new ParsingException( + "IFrame resource didn't provide JavaScript base player's hash", e); + } } - public static String extractJavaScriptUrl(final String videoId) throws ParsingException { + @Nonnull + static String extractJavaScriptUrlWithEmbedWatchPage(@Nonnull final String videoId) + throws ParsingException { + final String embedUrl; + final String embedPageContent; try { - final String embedUrl = "https://www.youtube.com/embed/" + videoId; - final String embedPageContent = NewPipe.getDownloader() - .get(embedUrl, Localization.DEFAULT).responseBody(); - - try { - final String assetsPattern = "\"assets\":.+?\"js\":\\s*(\"[^\"]+\")"; - return Parser.matchGroup1(assetsPattern, embedPageContent) - .replace("\\", "").replace("\"", ""); - } catch (final Parser.RegexException ex) { - // playerJsUrl is still available in the file, just somewhere else TODO - // it is ok not to find it, see how that's handled in getDeobfuscationCode() - final Document doc = Jsoup.parse(embedPageContent); - final Elements elems = doc.select("script").attr("name", "player_ias/base"); - for (final Element elem : elems) { - if (elem.attr("src").contains("base.js")) { - return elem.attr("src"); - } - } - } - } catch (final Exception ignored) { + embedUrl = "https://www.youtube.com/embed/" + videoId; + embedPageContent = NewPipe.getDownloader() + .get(embedUrl, Localization.DEFAULT) + .responseBody(); + } catch (final Exception e) { + throw new ParsingException("Could not fetch embedded watch page", e); } - throw new ParsingException("Embedded info did not provide YouTube player js url"); + // Parse HTML response with jsoup and look at script elements first + final Document doc = Jsoup.parse(embedPageContent); + final Elements elems = doc.select("script") + .attr("name", "player/base"); + for (final Element elem : elems) { + // Script URLs should be relative and not absolute + final String playerUrl = elem.attr("src"); + if (playerUrl.contains("base.js")) { + return playerUrl; + } + } + + // Use regexes to match the URL in a JavaScript embedded script of the HTML page + try { + return Parser.matchGroup1( + EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN, embedPageContent); + } catch (final Parser.RegexException e) { + throw new ParsingException( + "Embedded watch page didn't provide JavaScript base player's URL", e); + } } @Nonnull private static String cleanJavaScriptUrl(@Nonnull final String playerJsUrl) { if (playerJsUrl.startsWith("//")) { + // https part has to be added manually if the URL is protocol-relative return HTTPS + playerJsUrl; } else if (playerJsUrl.startsWith("/")) { - // sometimes https://www.youtube.com part has to be added manually + // https://www.youtube.com part has to be added manually if the URL is relative to + // YouTube's domain return HTTPS + "//www.youtube.com" + playerJsUrl; } else { return playerJsUrl; @@ -129,12 +159,15 @@ public final class YoutubeJavaScriptExtractor { } @Nonnull - private static String downloadJavaScriptCode(final String playerJsUrl) + private static String downloadJavaScriptCode(@Nonnull final String playerJsUrl) throws ParsingException { try { - return NewPipe.getDownloader().get(playerJsUrl, Localization.DEFAULT).responseBody(); + return NewPipe.getDownloader() + .get(playerJsUrl, Localization.DEFAULT) + .responseBody(); } catch (final Exception e) { - throw new ParsingException("Could not get player js code from url: " + playerJsUrl); + throw new ParsingException( + "Could not get JavaScript base player's code from URL: " + playerJsUrl, e); } } } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index 28f2b7b91..70f6255a8 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -814,9 +814,9 @@ public class YoutubeStreamExtractor extends StreamExtractor { @Override public void onFetchPage(@Nonnull final Downloader downloader) throws IOException, ExtractionException { - initStsFromPlayerJsIfNeeded(); - final String videoId = getId(); + initStsFromPlayerJsIfNeeded(videoId); + final Localization localization = getExtractorLocalization(); final ContentCountry contentCountry = getExtractorContentCountry(); html5Cpn = generateContentPlaybackNonce(); @@ -1052,8 +1052,6 @@ public class YoutubeStreamExtractor extends StreamExtractor { @Nonnull final Localization localization, @Nonnull final String videoId) throws IOException, ExtractionException { - initStsFromPlayerJsIfNeeded(); - // Because a cpn is unique to each request, we need to generate it again html5Cpn = generateContentPlaybackNonce(); @@ -1110,9 +1108,9 @@ public class YoutubeStreamExtractor extends StreamExtractor { .getString("videoId")); } - private static void storePlayerJs() throws ParsingException { + private static void storePlayerJs(@Nonnull final String videoId) throws ParsingException { try { - playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(); + playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(videoId); } catch (final Exception e) { throw new ParsingException("Could not store JavaScript player", e); } @@ -1177,12 +1175,13 @@ public class YoutubeStreamExtractor extends StreamExtractor { return cachedDeobfuscationCode; } - private static void initStsFromPlayerJsIfNeeded() throws ParsingException { + private static void initStsFromPlayerJsIfNeeded(@Nonnull final String videoId) + throws ParsingException { if (!isNullOrEmpty(sts)) { return; } if (playerCode == null) { - storePlayerJs(); + storePlayerJs(videoId); if (playerCode == null) { throw new ParsingException("playerCode is null"); } diff --git a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractorTest.java b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractorTest.java index 1433623ee..f7ceef140 100644 --- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractorTest.java +++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeJavaScriptExtractorTest.java @@ -20,21 +20,20 @@ public class YoutubeJavaScriptExtractorTest { @Test public void testExtractJavaScriptUrlIframe() throws ParsingException { - assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl().endsWith("base.js")); + assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource() + .endsWith("base.js")); } @Test public void testExtractJavaScriptUrlEmbed() throws ParsingException { - assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl("d4IGg5dqeO8").endsWith("base.js")); + assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage("d4IGg5dqeO8") + .endsWith("base.js")); } @Test public void testExtractJavaScript__success() throws ParsingException { String playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode("d4IGg5dqeO8"); assertPlayerJsCode(playerJsCode); - - playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(); - assertPlayerJsCode(playerJsCode); } @Test