Merge pull request #1087 from AudricV/yt_js-extractor-improvements-and-fixes
[YouTube] Improve and fix YoutubeJavaScriptExtractor
This commit is contained in:
commit
3faaf4301c
|
@ -10,40 +10,62 @@ import org.schabi.newpipe.extractor.localization.Localization;
|
||||||
import org.schabi.newpipe.extractor.utils.Parser;
|
import org.schabi.newpipe.extractor.utils.Parser;
|
||||||
|
|
||||||
import javax.annotation.Nonnull;
|
import javax.annotation.Nonnull;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* YouTube restricts streaming their media in multiple ways by requiring clients to apply a cipher
|
* The extractor of YouTube's base JavaScript player file.
|
||||||
* function on parameters of requests.
|
*
|
||||||
* The cipher function is sent alongside as a JavaScript function.
|
|
||||||
* <p>
|
* <p>
|
||||||
* This class handling fetching the JavaScript file in order to allow other classes to extract the
|
* YouTube restricts streaming their media in multiple ways by requiring their HTML5 clients to use
|
||||||
* needed functions.
|
* a signature timestamp, and on streaming URLs a signature deobfuscation function for some
|
||||||
|
* contents and a throttling parameter deobfuscation one for all contents.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This class handles fetching of this base JavaScript player file in order to allow other classes
|
||||||
|
* to extract the needed data.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* It will try to get the player URL from YouTube's IFrame resource first, and from a YouTube embed
|
||||||
|
* watch page as a fallback.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
public final class YoutubeJavaScriptExtractor {
|
public final class YoutubeJavaScriptExtractor {
|
||||||
|
|
||||||
private static final String HTTPS = "https:";
|
private static final String HTTPS = "https:";
|
||||||
|
private static final String BASE_JS_PLAYER_URL_FORMAT =
|
||||||
|
"https://www.youtube.com/s/player/%s/player_ias.vflset/en_GB/base.js";
|
||||||
|
private static final Pattern IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN = Pattern.compile(
|
||||||
|
"player\\\\/([a-z0-9]{8})\\\\/");
|
||||||
|
private static final Pattern EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN = Pattern.compile(
|
||||||
|
"\"jsUrl\":\"(/s/player/[A-Za-z0-9]+/player_ias\\.vflset/[A-Za-z_-]+/base\\.js)\"");
|
||||||
private static String cachedJavaScriptCode;
|
private static String cachedJavaScriptCode;
|
||||||
|
|
||||||
private YoutubeJavaScriptExtractor() {
|
private YoutubeJavaScriptExtractor() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts the JavaScript file. The result is cached, so subsequent calls use the result of
|
* Extracts the JavaScript file.
|
||||||
* previous calls.
|
|
||||||
*
|
*
|
||||||
* @param videoId Does not influence the result, but a valid video id may help in the chance
|
* <p>
|
||||||
* that YouTube tracks it.
|
* The result is cached, so subsequent calls use the result of previous calls.
|
||||||
* @return The whole JavaScript file as a string.
|
* </p>
|
||||||
* @throws ParsingException If the extraction failed.
|
*
|
||||||
|
* @param videoId a YouTube video ID, which doesn't influence the result, but it may help in
|
||||||
|
* the chance that YouTube tracks it
|
||||||
|
* @return the whole JavaScript file as a string
|
||||||
|
* @throws ParsingException if the extraction failed
|
||||||
*/
|
*/
|
||||||
@Nonnull
|
@Nonnull
|
||||||
public static String extractJavaScriptCode(final String videoId) throws ParsingException {
|
public static String extractJavaScriptCode(@Nonnull final String videoId)
|
||||||
|
throws ParsingException {
|
||||||
if (cachedJavaScriptCode == null) {
|
if (cachedJavaScriptCode == null) {
|
||||||
String url;
|
String url;
|
||||||
try {
|
try {
|
||||||
url = YoutubeJavaScriptExtractor.extractJavaScriptUrl();
|
url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource();
|
||||||
} catch (final Exception i) {
|
} catch (final Exception e) {
|
||||||
url = YoutubeJavaScriptExtractor.extractJavaScriptUrl(videoId);
|
url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage(videoId);
|
||||||
}
|
}
|
||||||
final String playerJsUrl = YoutubeJavaScriptExtractor.cleanJavaScriptUrl(url);
|
final String playerJsUrl = YoutubeJavaScriptExtractor.cleanJavaScriptUrl(url);
|
||||||
cachedJavaScriptCode = YoutubeJavaScriptExtractor.downloadJavaScriptCode(playerJsUrl);
|
cachedJavaScriptCode = YoutubeJavaScriptExtractor.downloadJavaScriptCode(playerJsUrl);
|
||||||
|
@ -53,75 +75,83 @@ public final class YoutubeJavaScriptExtractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Same as {@link YoutubeJavaScriptExtractor#extractJavaScriptCode(String)} but with a constant
|
* Reset the cached JavaScript code.
|
||||||
* value for videoId.
|
*
|
||||||
* Possible because the videoId has no influence on the result.
|
|
||||||
* <p>
|
* <p>
|
||||||
* In the off chance that YouTube tracks with which video id the request is made, it may make
|
* It will be fetched again the next time {@link #extractJavaScriptCode(String)} is called.
|
||||||
* sense to pass in video ids.
|
* </p>
|
||||||
*/
|
|
||||||
@Nonnull
|
|
||||||
public static String extractJavaScriptCode() throws ParsingException {
|
|
||||||
return extractJavaScriptCode("d4IGg5dqeO8");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reset the JavaScript code. It will be fetched again the next time
|
|
||||||
* {@link #extractJavaScriptCode()} or {@link #extractJavaScriptCode(String)} is called.
|
|
||||||
*/
|
*/
|
||||||
public static void resetJavaScriptCode() {
|
public static void resetJavaScriptCode() {
|
||||||
cachedJavaScriptCode = null;
|
cachedJavaScriptCode = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String extractJavaScriptUrl() throws ParsingException {
|
@Nonnull
|
||||||
|
static String extractJavaScriptUrlWithIframeResource() throws ParsingException {
|
||||||
|
final String iframeUrl;
|
||||||
|
final String iframeContent;
|
||||||
try {
|
try {
|
||||||
final String iframeUrl = "https://www.youtube.com/iframe_api";
|
iframeUrl = "https://www.youtube.com/iframe_api";
|
||||||
final String iframeContent = NewPipe.getDownloader()
|
iframeContent = NewPipe.getDownloader()
|
||||||
.get(iframeUrl, Localization.DEFAULT).responseBody();
|
.get(iframeUrl, Localization.DEFAULT)
|
||||||
final String hashPattern = "player\\\\\\/([a-z0-9]{8})\\\\\\/";
|
.responseBody();
|
||||||
final String hash = Parser.matchGroup1(hashPattern, iframeContent);
|
} catch (final Exception e) {
|
||||||
|
throw new ParsingException("Could not fetch IFrame resource", e);
|
||||||
return String.format(
|
|
||||||
"https://www.youtube.com/s/player/%s/player_ias.vflset/en_US/base.js", hash);
|
|
||||||
} catch (final Exception ignored) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new ParsingException("Iframe API did not provide YouTube player js url");
|
try {
|
||||||
|
final String hash = Parser.matchGroup1(
|
||||||
|
IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN, iframeContent);
|
||||||
|
return String.format(BASE_JS_PLAYER_URL_FORMAT, hash);
|
||||||
|
} catch (final Parser.RegexException e) {
|
||||||
|
throw new ParsingException(
|
||||||
|
"IFrame resource didn't provide JavaScript base player's hash", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String extractJavaScriptUrl(final String videoId) throws ParsingException {
|
@Nonnull
|
||||||
|
static String extractJavaScriptUrlWithEmbedWatchPage(@Nonnull final String videoId)
|
||||||
|
throws ParsingException {
|
||||||
|
final String embedUrl;
|
||||||
|
final String embedPageContent;
|
||||||
try {
|
try {
|
||||||
final String embedUrl = "https://www.youtube.com/embed/" + videoId;
|
embedUrl = "https://www.youtube.com/embed/" + videoId;
|
||||||
final String embedPageContent = NewPipe.getDownloader()
|
embedPageContent = NewPipe.getDownloader()
|
||||||
.get(embedUrl, Localization.DEFAULT).responseBody();
|
.get(embedUrl, Localization.DEFAULT)
|
||||||
|
.responseBody();
|
||||||
try {
|
} catch (final Exception e) {
|
||||||
final String assetsPattern = "\"assets\":.+?\"js\":\\s*(\"[^\"]+\")";
|
throw new ParsingException("Could not fetch embedded watch page", e);
|
||||||
return Parser.matchGroup1(assetsPattern, embedPageContent)
|
|
||||||
.replace("\\", "").replace("\"", "");
|
|
||||||
} catch (final Parser.RegexException ex) {
|
|
||||||
// playerJsUrl is still available in the file, just somewhere else TODO
|
|
||||||
// it is ok not to find it, see how that's handled in getDeobfuscationCode()
|
|
||||||
final Document doc = Jsoup.parse(embedPageContent);
|
|
||||||
final Elements elems = doc.select("script").attr("name", "player_ias/base");
|
|
||||||
for (final Element elem : elems) {
|
|
||||||
if (elem.attr("src").contains("base.js")) {
|
|
||||||
return elem.attr("src");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (final Exception ignored) {
|
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new ParsingException("Embedded info did not provide YouTube player js url");
|
// Parse HTML response with jsoup and look at script elements first
|
||||||
|
final Document doc = Jsoup.parse(embedPageContent);
|
||||||
|
final Elements elems = doc.select("script")
|
||||||
|
.attr("name", "player/base");
|
||||||
|
for (final Element elem : elems) {
|
||||||
|
// Script URLs should be relative and not absolute
|
||||||
|
final String playerUrl = elem.attr("src");
|
||||||
|
if (playerUrl.contains("base.js")) {
|
||||||
|
return playerUrl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use regexes to match the URL in a JavaScript embedded script of the HTML page
|
||||||
|
try {
|
||||||
|
return Parser.matchGroup1(
|
||||||
|
EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN, embedPageContent);
|
||||||
|
} catch (final Parser.RegexException e) {
|
||||||
|
throw new ParsingException(
|
||||||
|
"Embedded watch page didn't provide JavaScript base player's URL", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nonnull
|
@Nonnull
|
||||||
private static String cleanJavaScriptUrl(@Nonnull final String playerJsUrl) {
|
private static String cleanJavaScriptUrl(@Nonnull final String playerJsUrl) {
|
||||||
if (playerJsUrl.startsWith("//")) {
|
if (playerJsUrl.startsWith("//")) {
|
||||||
|
// https part has to be added manually if the URL is protocol-relative
|
||||||
return HTTPS + playerJsUrl;
|
return HTTPS + playerJsUrl;
|
||||||
} else if (playerJsUrl.startsWith("/")) {
|
} else if (playerJsUrl.startsWith("/")) {
|
||||||
// sometimes https://www.youtube.com part has to be added manually
|
// https://www.youtube.com part has to be added manually if the URL is relative to
|
||||||
|
// YouTube's domain
|
||||||
return HTTPS + "//www.youtube.com" + playerJsUrl;
|
return HTTPS + "//www.youtube.com" + playerJsUrl;
|
||||||
} else {
|
} else {
|
||||||
return playerJsUrl;
|
return playerJsUrl;
|
||||||
|
@ -129,12 +159,15 @@ public final class YoutubeJavaScriptExtractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nonnull
|
@Nonnull
|
||||||
private static String downloadJavaScriptCode(final String playerJsUrl)
|
private static String downloadJavaScriptCode(@Nonnull final String playerJsUrl)
|
||||||
throws ParsingException {
|
throws ParsingException {
|
||||||
try {
|
try {
|
||||||
return NewPipe.getDownloader().get(playerJsUrl, Localization.DEFAULT).responseBody();
|
return NewPipe.getDownloader()
|
||||||
|
.get(playerJsUrl, Localization.DEFAULT)
|
||||||
|
.responseBody();
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new ParsingException("Could not get player js code from url: " + playerJsUrl);
|
throw new ParsingException(
|
||||||
|
"Could not get JavaScript base player's code from URL: " + playerJsUrl, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -814,9 +814,9 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
@Override
|
@Override
|
||||||
public void onFetchPage(@Nonnull final Downloader downloader)
|
public void onFetchPage(@Nonnull final Downloader downloader)
|
||||||
throws IOException, ExtractionException {
|
throws IOException, ExtractionException {
|
||||||
initStsFromPlayerJsIfNeeded();
|
|
||||||
|
|
||||||
final String videoId = getId();
|
final String videoId = getId();
|
||||||
|
initStsFromPlayerJsIfNeeded(videoId);
|
||||||
|
|
||||||
final Localization localization = getExtractorLocalization();
|
final Localization localization = getExtractorLocalization();
|
||||||
final ContentCountry contentCountry = getExtractorContentCountry();
|
final ContentCountry contentCountry = getExtractorContentCountry();
|
||||||
html5Cpn = generateContentPlaybackNonce();
|
html5Cpn = generateContentPlaybackNonce();
|
||||||
|
@ -1052,8 +1052,6 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
@Nonnull final Localization localization,
|
@Nonnull final Localization localization,
|
||||||
@Nonnull final String videoId)
|
@Nonnull final String videoId)
|
||||||
throws IOException, ExtractionException {
|
throws IOException, ExtractionException {
|
||||||
initStsFromPlayerJsIfNeeded();
|
|
||||||
|
|
||||||
// Because a cpn is unique to each request, we need to generate it again
|
// Because a cpn is unique to each request, we need to generate it again
|
||||||
html5Cpn = generateContentPlaybackNonce();
|
html5Cpn = generateContentPlaybackNonce();
|
||||||
|
|
||||||
|
@ -1110,9 +1108,9 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
.getString("videoId"));
|
.getString("videoId"));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void storePlayerJs() throws ParsingException {
|
private static void storePlayerJs(@Nonnull final String videoId) throws ParsingException {
|
||||||
try {
|
try {
|
||||||
playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode();
|
playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(videoId);
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new ParsingException("Could not store JavaScript player", e);
|
throw new ParsingException("Could not store JavaScript player", e);
|
||||||
}
|
}
|
||||||
|
@ -1177,12 +1175,13 @@ public class YoutubeStreamExtractor extends StreamExtractor {
|
||||||
return cachedDeobfuscationCode;
|
return cachedDeobfuscationCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void initStsFromPlayerJsIfNeeded() throws ParsingException {
|
private static void initStsFromPlayerJsIfNeeded(@Nonnull final String videoId)
|
||||||
|
throws ParsingException {
|
||||||
if (!isNullOrEmpty(sts)) {
|
if (!isNullOrEmpty(sts)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (playerCode == null) {
|
if (playerCode == null) {
|
||||||
storePlayerJs();
|
storePlayerJs(videoId);
|
||||||
if (playerCode == null) {
|
if (playerCode == null) {
|
||||||
throw new ParsingException("playerCode is null");
|
throw new ParsingException("playerCode is null");
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,21 +20,20 @@ public class YoutubeJavaScriptExtractorTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractJavaScriptUrlIframe() throws ParsingException {
|
public void testExtractJavaScriptUrlIframe() throws ParsingException {
|
||||||
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl().endsWith("base.js"));
|
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource()
|
||||||
|
.endsWith("base.js"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractJavaScriptUrlEmbed() throws ParsingException {
|
public void testExtractJavaScriptUrlEmbed() throws ParsingException {
|
||||||
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl("d4IGg5dqeO8").endsWith("base.js"));
|
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage("d4IGg5dqeO8")
|
||||||
|
.endsWith("base.js"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExtractJavaScript__success() throws ParsingException {
|
public void testExtractJavaScript__success() throws ParsingException {
|
||||||
String playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode("d4IGg5dqeO8");
|
String playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode("d4IGg5dqeO8");
|
||||||
assertPlayerJsCode(playerJsCode);
|
assertPlayerJsCode(playerJsCode);
|
||||||
|
|
||||||
playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode();
|
|
||||||
assertPlayerJsCode(playerJsCode);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue