refactored YouTube-linkHandler to use less regex and more URL-methods

This commit is contained in:
Connectety-W 2019-01-13 12:52:07 +01:00
parent a78ad16235
commit 98f49852d7
No known key found for this signature in database
GPG Key ID: 8F39B4F36D48B3F8
7 changed files with 277 additions and 92 deletions

View File

@ -1,9 +1,9 @@
package org.schabi.newpipe.extractor.services.youtube.linkHandler; package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import java.net.URL;
import java.util.List; import java.util.List;
/* /*
@ -29,25 +29,53 @@ import java.util.List;
public class YoutubeChannelLinkHandlerFactory extends ListLinkHandlerFactory { public class YoutubeChannelLinkHandlerFactory extends ListLinkHandlerFactory {
private static final YoutubeChannelLinkHandlerFactory instance = new YoutubeChannelLinkHandlerFactory(); private static final YoutubeChannelLinkHandlerFactory instance = new YoutubeChannelLinkHandlerFactory();
private static final String ID_PATTERN = "/(user/[A-Za-z0-9_-]*|channel/[A-Za-z0-9_-]*)";
public static YoutubeChannelLinkHandlerFactory getInstance() { public static YoutubeChannelLinkHandlerFactory getInstance() {
return instance; return instance;
} }
@Override
public String getId(String url) throws ParsingException {
return Parser.matchGroup1(ID_PATTERN, url);
}
@Override @Override
public String getUrl(String id, List<String> contentFilters, String searchFilter) { public String getUrl(String id, List<String> contentFilters, String searchFilter) {
return "https://www.youtube.com/" + id; return "https://www.youtube.com/" + id;
} }
/**
 * Extracts the channel identifier from a channel or user URL.
 *
 * <p>The URL must either be accepted by {@code YoutubeParsingHelper.isYoutubeURL}
 * or have the host {@code hooktube.com}, and its path must start with
 * {@code /user/} or {@code /channel/}. Path segments after the id are ignored.</p>
 *
 * @param url the URL to extract the id from
 * @return the id in the form {@code "user/<name>"} or {@code "channel/<id>"}
 * @throws ParsingException if the URL is malformed, is not a recognised host,
 *                          is not a channel/user URL, or carries no valid id
 */
@Override
public String getId(String url) throws ParsingException {
    try {
        URL urlObj = new URL(url);
        String path = urlObj.getPath();

        // NOTE(review): the hooktube.com alternative skips the http/https check
        // that isYoutubeURL performs - confirm whether that is intended.
        if (!(YoutubeParsingHelper.isYoutubeURL(urlObj) || urlObj.getHost().equalsIgnoreCase("hooktube.com"))) {
            throw new ParsingException("the URL given is not a Youtube-URL");
        }

        if (!path.startsWith("/user/") && !path.startsWith("/channel/")) {
            throw new ParsingException("the URL given is neither a channel nor an user");
        }

        // remove leading "/"
        path = path.substring(1);

        String[] splitPath = path.split("/");
        // split() drops trailing empty strings, so "user/" yields a single element;
        // reject it explicitly instead of relying on an ArrayIndexOutOfBoundsException
        if (splitPath.length < 2) {
            throw new ParsingException("the URL given does not contain a channel or user id");
        }

        String id = splitPath[1];
        if (!id.matches("[A-Za-z0-9_-]+")) {
            throw new ParsingException("The given id is not a Youtube channel or user id");
        }

        return splitPath[0] + "/" + id;
    } catch (final Exception exception) {
        // every failure (including the ParsingExceptions above) is reported uniformly
        throw new ParsingException("Error could not parse url :" + exception.getMessage(), exception);
    }
}
@Override @Override
public boolean onAcceptUrl(String url) { public boolean onAcceptUrl(String url) {
return (url.contains("youtube") || url.contains("youtu.be") || url.contains("hooktube.com")) try {
&& (url.contains("/user/") || url.contains("/channel/")); getId(url);
} catch (ParsingException e) {
return false;
}
return true;
} }
} }

View File

@ -3,6 +3,8 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import java.net.URL;
/* /*
* Created by Christian Schabesberger on 02.03.16. * Created by Christian Schabesberger on 02.03.16.
* *
@ -28,6 +30,42 @@ public class YoutubeParsingHelper {
private YoutubeParsingHelper() { private YoutubeParsingHelper() {
} }
/**
 * Checks that a URL uses plain web transport on its standard port.
 *
 * @param url the URL to inspect
 * @return {@code true} if the protocol is {@code http} or {@code https} and the
 *         URL either sets no port or sets the protocol's default port
 */
private static boolean isHTTP(URL url) {
    final String scheme = url.getProtocol();
    final boolean isWebScheme = scheme.equals("http") || scheme.equals("https");
    if (!isWebScheme) {
        return false;
    }

    // getPort() yields -1 when the URL does not specify a port at all
    final int port = url.getPort();
    return port == -1 || port == url.getDefaultPort();
}
/**
 * Checks whether a URL points at one of the known YouTube hosts.
 *
 * @param url the URL to inspect
 * @return {@code true} if the URL passes {@code isHTTP} and its host is
 *         {@code youtube.com}, {@code www.youtube.com} or {@code m.youtube.com}
 *         (compared case-insensitively)
 */
public static boolean isYoutubeURL(URL url) {
    if (isHTTP(url)) {
        final String domain = url.getHost();
        final String[] youtubeHosts = {"youtube.com", "www.youtube.com", "m.youtube.com"};
        for (String candidate : youtubeHosts) {
            if (domain.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Checks whether a URL points at YouTube or at one of the related hosts
 * listed below.
 *
 * @param url the URL to inspect
 * @return {@code true} if the URL passes {@code isHTTP} and its host is one of
 *         {@code youtube.com}, {@code www.youtube.com}, {@code m.youtube.com},
 *         {@code www.youtube-nocookie.com}, {@code youtu.be} or
 *         {@code hooktube.com} (compared case-insensitively)
 */
public static boolean isYoutubeALikeURL(URL url) {
    if (isHTTP(url)) {
        final String domain = url.getHost();
        final String[] acceptedHosts = {
                "youtube.com", "www.youtube.com", "m.youtube.com",
                "www.youtube-nocookie.com", "youtu.be", "hooktube.com"
        };
        for (String candidate : acceptedHosts) {
            if (domain.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
    }
    return false;
}
public static long parseDurationString(String input) public static long parseDurationString(String input)
throws ParsingException, NumberFormatException { throws ParsingException, NumberFormatException {

View File

@ -1,16 +1,15 @@
package org.schabi.newpipe.extractor.services.youtube.linkHandler; package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.utils.Utils;
import java.net.URL;
import java.util.List; import java.util.List;
public class YoutubePlaylistLinkHandlerFactory extends ListLinkHandlerFactory { public class YoutubePlaylistLinkHandlerFactory extends ListLinkHandlerFactory {
private static final YoutubePlaylistLinkHandlerFactory instance = new YoutubePlaylistLinkHandlerFactory(); private static final YoutubePlaylistLinkHandlerFactory instance = new YoutubePlaylistLinkHandlerFactory();
private static final String ID_PATTERN = "([\\-a-zA-Z0-9_]{10,})";
public static YoutubePlaylistLinkHandlerFactory getInstance() { public static YoutubePlaylistLinkHandlerFactory getInstance() {
return instance; return instance;
@ -24,17 +23,35 @@ public class YoutubePlaylistLinkHandlerFactory extends ListLinkHandlerFactory {
@Override @Override
public String getId(String url) throws ParsingException { public String getId(String url) throws ParsingException {
try { try {
return Parser.matchGroup1("list=" + ID_PATTERN, url); URL urlObj = new URL(url);
if (!YoutubeParsingHelper.isYoutubeURL(urlObj)) {
throw new ParsingException("the url given is not a Youtube-URL");
}
String listID = Utils.getQueryValue(urlObj, "list");
if (listID == null) {
throw new ParsingException("the url given does not include a playlist");
}
if (!listID.matches("[a-zA-Z0-9_-]{10,}")) {
throw new ParsingException("the list-ID given in the URL does not match the list pattern");
}
return listID;
} catch (final Exception exception) { } catch (final Exception exception) {
throw new ParsingException("Error could not parse url :" + exception.getMessage(), exception); throw new ParsingException("Error could not parse url :" + exception.getMessage(), exception);
} }
} }
@Override @Override
public boolean onAcceptUrl(final String url) { public boolean onAcceptUrl(final String url) {
final boolean hasNotEmptyUrl = url != null && !url.isEmpty(); try {
final boolean isYoutubeDomain = hasNotEmptyUrl && (url.contains("youtube") || url.contains("youtu.be")); getId(url);
return isYoutubeDomain && url.contains("list="); } catch (ParsingException e) {
return false;
}
return true;
} }
} }

View File

@ -1,21 +1,14 @@
package org.schabi.newpipe.extractor.services.youtube.linkHandler; package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory;
import org.schabi.newpipe.extractor.exceptions.FoundAdException; import org.schabi.newpipe.extractor.exceptions.FoundAdException;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException; import org.schabi.newpipe.extractor.linkhandler.LinkHandlerFactory;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.utils.Utils;
import java.io.IOException; import java.net.MalformedURLException;
import java.io.UnsupportedEncodingException;
import java.net.URI; import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.net.URLDecoder; import java.net.URL;
/* /*
* Created by Christian Schabesberger on 02.02.16. * Created by Christian Schabesberger on 02.02.16.
@ -40,7 +33,6 @@ import java.net.URLDecoder;
public class YoutubeStreamLinkHandlerFactory extends LinkHandlerFactory { public class YoutubeStreamLinkHandlerFactory extends LinkHandlerFactory {
private static final YoutubeStreamLinkHandlerFactory instance = new YoutubeStreamLinkHandlerFactory(); private static final YoutubeStreamLinkHandlerFactory instance = new YoutubeStreamLinkHandlerFactory();
private static final String ID_PATTERN = "([\\-a-zA-Z0-9_]{11})";
private YoutubeStreamLinkHandlerFactory() { private YoutubeStreamLinkHandlerFactory() {
} }
@ -49,78 +41,138 @@ public class YoutubeStreamLinkHandlerFactory extends LinkHandlerFactory {
return instance; return instance;
} }
/**
 * Validates that a string has the shape of a video id: exactly 11 characters
 * drawn from letters, digits, {@code _} and {@code -}.
 *
 * @param id the candidate id, may be {@code null}
 * @return the unchanged {@code id} when it is valid
 * @throws ParsingException if {@code id} is {@code null} or does not match
 */
private static String assertIsID(String id) throws ParsingException {
    final boolean looksLikeVideoId = id != null && id.matches("[a-zA-Z0-9_-]{11}");
    if (looksLikeVideoId) {
        return id;
    }
    throw new ParsingException("The given string is not a Youtube-Video-ID");
}
@Override @Override
public String getUrl(String id) { public String getUrl(String id) {
return "https://www.youtube.com/watch?v=" + id; return "https://www.youtube.com/watch?v=" + id;
} }
@Override @Override
public String getId(String url) throws ParsingException, IllegalArgumentException { public String getId(String urlString) throws ParsingException, IllegalArgumentException {
if (url.isEmpty()) { try {
throw new IllegalArgumentException("The url parameter should not be empty"); URI uri = new URI(urlString);
if (uri.getScheme().equals("vnd.youtube")) {
String scheme = uri.getSchemeSpecificPart();
if (scheme.startsWith("//")) {
urlString = "https:" + scheme;
} else {
return assertIsID(scheme);
}
}
} catch (URISyntaxException ignored) {
} }
String lowercaseUrl = url.toLowerCase(); URL url;
if (lowercaseUrl.contains("youtube")) {
if (lowercaseUrl.contains("list=")) {
throw new ParsingException("Error no suitable url: " + url);
}
if (url.contains("attribution_link")) {
try { try {
String escapedQuery = Parser.matchGroup1("u=(.[^&|$]*)", url); url = new URL(urlString);
String query = URLDecoder.decode(escapedQuery, "UTF-8"); } catch (MalformedURLException e) {
return Parser.matchGroup1("v=" + ID_PATTERN, query); throw new IllegalArgumentException("The given URL is not valid");
} catch (UnsupportedEncodingException uee) { }
throw new ParsingException("Could not parse attribution_link", uee);
String host = url.getHost();
String path = url.getPath();
// remove leading "/" of URL-path if URL-path is given
if (!path.isEmpty()) {
path = path.substring(1);
}
if (!YoutubeParsingHelper.isYoutubeALikeURL(url)) {
if (host.equalsIgnoreCase("googleads.g.doubleclick.net")) {
throw new FoundAdException("Error found ad: " + urlString);
}
throw new ParsingException("The url is not a Youtube-URL");
}
if (YoutubePlaylistLinkHandlerFactory.getInstance().acceptUrl(urlString)) {
throw new ParsingException("Error no suitable url: " + urlString);
}
// using uppercase instead of lowercase, because toLowercase replaces some unicode characters
// with their lowercase ASCII equivalent. Using toLowercase could result in faultily matching unicode urls.
switch (host.toUpperCase()) {
case "WWW.YOUTUBE-NOCOOKIE.COM": {
if (path.startsWith("embed/")) {
String id = path.split("/")[1];
return assertIsID(id);
} }
} }
if (url.contains("vnd.youtube")) {
return Parser.matchGroup1(ID_PATTERN, url); case "YOUTUBE.COM":
case "WWW.YOUTUBE.COM":
case "M.YOUTUBE.COM": {
if (path.equals("attribution_link")) {
String uQueryValue = Utils.getQueryValue(url, "u");
URL decodedURL;
try {
decodedURL = new URL("http://www.youtube.com" + uQueryValue);
} catch (MalformedURLException e) {
throw new ParsingException("Error no suitable url: " + urlString);
} }
if (url.contains("embed")) {
return Parser.matchGroup1("embed/" + ID_PATTERN, url); String viewQueryValue = Utils.getQueryValue(decodedURL, "v");
return assertIsID(viewQueryValue);
} }
if (url.contains("googleads")) {
throw new FoundAdException("Error found add: " + url); if (path.startsWith("embed/")) {
String id = path.split("/")[1];
return assertIsID(id);
} }
return Parser.matchGroup1("[?&]v=" + ID_PATTERN, url);
String viewQueryValue = Utils.getQueryValue(url, "v");
return assertIsID(viewQueryValue);
} }
if (lowercaseUrl.contains("youtu.be")) {
if (lowercaseUrl.contains("list=")) { case "YOUTU.BE": {
throw new ParsingException("Error no suitable url: " + url); String viewQueryValue = Utils.getQueryValue(url, "v");
if (viewQueryValue != null) {
return assertIsID(viewQueryValue);
} }
if (url.contains("v=")) {
return Parser.matchGroup1("v=" + ID_PATTERN, url); return assertIsID(path);
} }
return Parser.matchGroup1("[Yy][Oo][Uu][Tt][Uu]\\.[Bb][Ee]/" + ID_PATTERN, url);
} case "HOOKTUBE.COM": {
if (lowercaseUrl.contains("hooktube")) { if (path.equals("watch")) {
if (lowercaseUrl.contains("&v=") String viewQueryValue = Utils.getQueryValue(url, "v");
|| lowercaseUrl.contains("?v=")) { if (viewQueryValue != null) {
return Parser.matchGroup1("[?&]v=" + ID_PATTERN, url); return assertIsID(viewQueryValue);
}
if (url.contains("/embed/")) {
return Parser.matchGroup1("embed/" + ID_PATTERN, url);
}
if (url.contains("/v/")) {
return Parser.matchGroup1("v/" + ID_PATTERN, url);
}
if (url.contains("/watch/")) {
return Parser.matchGroup1("watch/" + ID_PATTERN, url);
} }
} }
throw new ParsingException("Error no suitable url: " + url); if (path.startsWith("embed/")) {
String id = path.substring("embed/".length());
return assertIsID(id);
}
if (path.startsWith("v/")) {
String id = path.substring("v/".length());
return assertIsID(id);
}
if (path.startsWith("watch/")) {
String id = path.substring("watch/".length());
return assertIsID(id);
}
}
}
throw new ParsingException("Error no suitable url: " + urlString);
} }
@Override @Override
public boolean onAcceptUrl(final String url) throws FoundAdException { public boolean onAcceptUrl(final String url) throws FoundAdException {
final String lowercaseUrl = url.toLowerCase();
if (!lowercaseUrl.contains("youtube") &&
!lowercaseUrl.contains("youtu.be") &&
!lowercaseUrl.contains("hooktube")) {
return false;
// bad programming I know <-- nice meme
}
try { try {
getId(url); getId(url);
return true; return true;

View File

@ -21,8 +21,9 @@ package org.schabi.newpipe.extractor.services.youtube.linkHandler;
*/ */
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory; import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.utils.Parser;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List; import java.util.List;
public class YoutubeTrendingLinkHandlerFactory extends ListLinkHandlerFactory { public class YoutubeTrendingLinkHandlerFactory extends ListLinkHandlerFactory {
@ -38,6 +39,14 @@ public class YoutubeTrendingLinkHandlerFactory extends ListLinkHandlerFactory {
@Override @Override
public boolean onAcceptUrl(final String url) { public boolean onAcceptUrl(final String url) {
return Parser.isMatch("^(https://|http://|)(www.|m.|)youtube.com/feed/trending(|\\?.*)$", url); URL urlObj;
try {
urlObj = new URL(url);
} catch (MalformedURLException e) {
return false;
}
String urlPath = urlObj.getPath();
return YoutubeParsingHelper.isYoutubeURL(urlObj) && urlPath.equals("/feed/trending");
} }
} }

View File

@ -2,6 +2,9 @@ package org.schabi.newpipe.extractor.utils;
import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.exceptions.ParsingException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.List; import java.util.List;
public class Utils { public class Utils {
@ -57,5 +60,43 @@ public class Utils {
} }
return url; return url;
} }
/**
* get the value of a URL-query by name.
* if a url-query is give multiple times, only the value of the first query is returned
*
* @param url the url to be used
* @param parameterName the pattern that will be used to check the url
* @return a string that contains the value of the query parameter or null if nothing was found
*/
public static String getQueryValue(URL url, String parameterName) {
String urlQuery = url.getQuery();
if (urlQuery != null) {
for (String param : urlQuery.split("&")) {
String[] params = param.split("=", 2);
String query;
try {
query = URLDecoder.decode(params[0], "UTF-8");
} catch (UnsupportedEncodingException e) {
System.err.println("Cannot decode string with UTF-8. using the string without decoding");
e.printStackTrace();
query = params[0];
} }
if (query.equals(parameterName)) {
try {
return URLDecoder.decode(params[1], "UTF-8");
} catch (UnsupportedEncodingException e) {
System.err.println("Cannot decode string with UTF-8. using the string without decoding");
e.printStackTrace();
return params[1];
}
}
}
}
return null;
}
}

View File

@ -60,9 +60,9 @@ public class YoutubeStreamLinkHandlerFactoryTest {
public void getIdfromYt() throws Exception { public void getIdfromYt() throws Exception {
assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI").getId());
assertEquals("W-fFHeTX70Q", linkHandler.fromUrl("https://www.youtube.com/watch?v=W-fFHeTX70Q").getId()); assertEquals("W-fFHeTX70Q", linkHandler.fromUrl("https://www.youtube.com/watch?v=W-fFHeTX70Q").getId());
assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI?t=100").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://www.youtube.com/watch?v=jZViOEv90dI&t=100").getId());
assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI?t=100").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI&t=100").getId());
assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI?t=100").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI&t=100").getId());
assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://youtu.be/jZViOEv90dI?t=9s").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("https://youtu.be/jZViOEv90dI?t=9s").getId());
assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://Youtu.be/jZViOEv90dI?t=9s").getId()); assertEquals("jZViOEv90dI", linkHandler.fromUrl("HTTPS://Youtu.be/jZViOEv90dI?t=9s").getId());
assertEquals("uEJuoEs1UxY", linkHandler.fromUrl("http://www.youtube.com/watch_popup?v=uEJuoEs1UxY").getId()); assertEquals("uEJuoEs1UxY", linkHandler.fromUrl("http://www.youtube.com/watch_popup?v=uEJuoEs1UxY").getId());
@ -85,9 +85,9 @@ public class YoutubeStreamLinkHandlerFactoryTest {
@Test @Test
public void testAcceptYtUrl() throws ParsingException { public void testAcceptYtUrl() throws ParsingException {
assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI")); assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI"));
assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI?t=100")); assertTrue(linkHandler.acceptUrl("https://www.youtube.com/watch?v=jZViOEv90dI&t=100"));
assertTrue(linkHandler.acceptUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI?t=100")); assertTrue(linkHandler.acceptUrl("https://WWW.YouTube.com/watch?v=jZViOEv90dI&t=100"));
assertTrue(linkHandler.acceptUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI?t=100")); assertTrue(linkHandler.acceptUrl("HTTPS://www.youtube.com/watch?v=jZViOEv90dI&t=100"));
assertTrue(linkHandler.acceptUrl("https://youtu.be/jZViOEv90dI?t=9s")); assertTrue(linkHandler.acceptUrl("https://youtu.be/jZViOEv90dI?t=9s"));
assertTrue(linkHandler.acceptUrl("https://www.youtube.com/embed/jZViOEv90dI")); assertTrue(linkHandler.acceptUrl("https://www.youtube.com/embed/jZViOEv90dI"));
assertTrue(linkHandler.acceptUrl("https://www.youtube-nocookie.com/embed/jZViOEv90dI")); assertTrue(linkHandler.acceptUrl("https://www.youtube-nocookie.com/embed/jZViOEv90dI"));