Base Implementation: Parse the upload date of StreamInfoItems

Parses upload dates given in the format '2 days ago' (in English), as found on a YouTube channel page.
(The parser is extensible to other pages.)
This commit is contained in:
wojcik-online 2019-10-02 02:02:01 -03:00 committed by Mauricio Colli
parent 514ed7bdc1
commit 180836c180
No known key found for this signature in database
GPG Key ID: F200BFD6F29DDD85
16 changed files with 316 additions and 44 deletions

View File

@ -17,6 +17,7 @@ import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandlerFactory;
import org.schabi.newpipe.extractor.playlist.PlaylistExtractor;
import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.stream.StreamExtractor;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor;
import org.schabi.newpipe.extractor.utils.Localization;
@ -296,6 +297,9 @@ public abstract class StreamingService {
return getCommentsExtractor(llhf.fromUrl(url), NewPipe.getPreferredLocalization());
}
/**
 * Creates a parser for relative upload dates such as '2 days ago'.
 * <p>
 * Every call constructs a new {@link TimeAgoParser} that is configured with
 * {@link TimeAgoParser#DEFAULT_AGO_PHRASES}.
 * </p>
 *
 * @return a newly constructed parser using the default phrases
 */
public TimeAgoParser getTimeAgoParser() {
    final TimeAgoParser defaultPhraseParser =
            new TimeAgoParser(TimeAgoParser.DEFAULT_AGO_PHRASES);
    return defaultPhraseParser;
}
/**
* Figures out where the link is pointing to (a channel, a video, a playlist, etc.)

View File

@ -79,21 +79,20 @@ public class SoundcloudParsingHelper {
return dl.head(apiUrl).getResponseCode() == 200;
}
public static String toDateString(String time) throws ParsingException {
static Date parseDate(String time) throws ParsingException {
try {
Date date;
// Have two date formats, one for the 'api.soundc...' and the other 'api-v2.soundc...'.
return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(time);
} catch (ParseException e1) {
try {
date = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(time);
} catch (Exception e) {
date = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss +0000").parse(time);
return new SimpleDateFormat("yyyy/MM/dd HH:mm:ss +0000").parse(time);
} catch (ParseException e2) {
throw new ParsingException(e1.getMessage(), e2);
}
}
}
SimpleDateFormat newDateFormat = new SimpleDateFormat("yyyy-MM-dd");
return newDateFormat.format(date);
} catch (ParseException e) {
throw new ParsingException(e.getMessage(), e);
}
static String toTextualDate(String time) throws ParsingException {
return new SimpleDateFormat("yyyy-MM-dd").format(parseDate(time));
}
/**

View File

@ -51,7 +51,7 @@ public class SoundcloudStreamExtractor extends StreamExtractor {
@Nonnull
@Override
public String getUploadDate() throws ParsingException {
return SoundcloudParsingHelper.toDateString(track.getString("created_at"));
return SoundcloudParsingHelper.toTextualDate(track.getString("created_at"));
}
@Nonnull

View File

@ -5,6 +5,8 @@ import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
import org.schabi.newpipe.extractor.stream.StreamType;
import java.util.Calendar;
import static org.schabi.newpipe.extractor.utils.Utils.replaceHttpWithHttps;
public class SoundcloudStreamInfoItemExtractor implements StreamInfoItemExtractor {
@ -41,8 +43,19 @@ public class SoundcloudStreamInfoItemExtractor implements StreamInfoItemExtracto
}
@Override
public String getUploadDate() throws ParsingException {
return SoundcloudParsingHelper.toDateString(itemObject.getString("created_at"));
public String getTextualUploadDate() throws ParsingException {
return SoundcloudParsingHelper.toTextualDate(getCreatedAt());
}
/**
 * Parses the item's {@code created_at} value into a {@link Calendar}.
 *
 * @return a calendar set to the date parsed by {@code SoundcloudParsingHelper.parseDate}
 * @throws ParsingException if the creation date cannot be parsed
 */
@Override
public Calendar getUploadDate() throws ParsingException {
    final Calendar createdAt = Calendar.getInstance();
    // The calendar starts at "now" and is then moved to the parsed creation date.
    createdAt.setTime(
            SoundcloudParsingHelper.parseDate(getCreatedAt()));
    return createdAt;
}
/**
 * Reads the raw {@code created_at} value of the wrapped item.
 *
 * @return the value stored under the {@code created_at} key of the item object
 */
private String getCreatedAt() {
    final String rawCreatedAt = itemObject.getString("created_at");
    return rawCreatedAt;
}
@Override

View File

@ -18,6 +18,7 @@ import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.utils.DonationLinkHelper;
import org.schabi.newpipe.extractor.utils.Localization;
import org.schabi.newpipe.extractor.utils.Parser;
@ -53,6 +54,8 @@ public class YoutubeChannelExtractor extends ChannelExtractor {
private static final String CHANNEL_FEED_BASE = "https://www.youtube.com/feeds/videos.xml?channel_id=";
private static final String CHANNEL_URL_PARAMETERS = "/videos?view=0&flow=list&sort=dd&live_view=10000&gl=US&hl=en";
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
public YoutubeChannelExtractor(StreamingService service, ListLinkHandler linkHandler, Localization localization) {
@ -230,7 +233,7 @@ public class YoutubeChannelExtractor extends ChannelExtractor {
final String uploaderUrl = getUrl();
for (final Element li : element.children()) {
if (li.select("div[class=\"feed-item-dismissable\"]").first() != null) {
collector.commit(new YoutubeStreamInfoItemExtractor(li) {
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
@Override
public String getUrl() throws ParsingException {
try {

View File

@ -18,6 +18,7 @@ import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingH
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.utils.Localization;
import org.schabi.newpipe.extractor.utils.Utils;
@ -28,6 +29,8 @@ import java.io.IOException;
@SuppressWarnings("WeakerAccess")
public class YoutubePlaylistExtractor extends PlaylistExtractor {
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
public YoutubePlaylistExtractor(StreamingService service, ListLinkHandler linkHandler, Localization localization) {
@ -192,7 +195,7 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor {
continue;
}
collector.commit(new YoutubeStreamInfoItemExtractor(li) {
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
public Element uploaderLink;
@Override
@ -258,7 +261,7 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor {
}
@Override
public String getUploadDate() throws ParsingException {
public String getTextualUploadDate() throws ParsingException {
return "";
}

View File

@ -9,6 +9,7 @@ import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
@ -129,7 +130,7 @@ public class YoutubeSearchExtractor extends SearchExtractor {
// video item type
} else if ((el = item.select("div[class*=\"yt-lockup-video\"]").first()) != null) {
collector.commit(new YoutubeStreamInfoItemExtractor(el));
collector.commit(new YoutubeStreamInfoItemExtractor(el, getService().getTimeAgoParser()));
} else if ((el = item.select("div[class*=\"yt-lockup-channel\"]").first()) != null) {
collector.commit(new YoutubeChannelInfoItemExtractor(el));
} else if ((el = item.select("div[class*=\"yt-lockup-playlist\"]").first()) != null &&

View File

@ -75,6 +75,8 @@ public class YoutubeStreamExtractor extends StreamExtractor {
/*//////////////////////////////////////////////////////////////////////////*/
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
@Nullable
private JsonObject playerArgs;
@ -932,7 +934,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
* This is encapsulated in a StreamInfoItem object, which is a subset of the fields in a full StreamInfo.
*/
private StreamInfoItemExtractor extractVideoPreviewInfo(final Element li) {
return new YoutubeStreamInfoItemExtractor(li) {
return new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
@Override
public String getUrl() throws ParsingException {
@ -959,7 +961,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
}
@Override
public String getUploadDate() throws ParsingException {
public String getTextualUploadDate() throws ParsingException {
return "";
}

View File

@ -1,12 +1,17 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import org.schabi.newpipe.extractor.utils.Utils;
import javax.annotation.Nullable;
import java.util.Calendar;
/*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* YoutubeStreamInfoItemExtractor.java is part of NewPipe.
@ -28,9 +33,18 @@ import org.schabi.newpipe.extractor.utils.Utils;
public class YoutubeStreamInfoItemExtractor implements StreamInfoItemExtractor {
private final Element item;
private final TimeAgoParser timeAgoParser;
public YoutubeStreamInfoItemExtractor(Element item) {
private String cachedUploadDate;
/**
* Creates an extractor of StreamInfoItems from a YouTube page.
* @param item The page element
* @param timeAgoParser A parser of the textual dates or {@code null}.
*/
public YoutubeStreamInfoItemExtractor(Element item, @Nullable TimeAgoParser timeAgoParser) {
this.item = item;
this.timeAgoParser = timeAgoParser;
}
@Override
@ -126,20 +140,35 @@ public class YoutubeStreamInfoItemExtractor implements StreamInfoItemExtractor {
}
@Override
public String getUploadDate() throws ParsingException {
public String getTextualUploadDate() throws ParsingException {
if (cachedUploadDate != null) {
return cachedUploadDate;
}
try {
Element meta = item.select("div[class=\"yt-lockup-meta\"]").first();
if (meta == null) return "";
Element li = meta.select("li").first();
if(li == null) return "";
final Elements li = meta.select("li");
if (li.isEmpty()) return "";
return meta.select("li").first().text();
return cachedUploadDate = li.first().text();
} catch (Exception e) {
throw new ParsingException("Could not get upload date", e);
}
}
/**
 * Converts the textual upload date (e.g. '2 days ago') into a {@link Calendar}.
 *
 * @return the parsed upload date, or {@code null} if this extractor has no
 *         {@link TimeAgoParser} or the textual upload date is {@code null} or empty
 * @throws ParsingException if the textual date cannot be extracted or parsed
 */
@Override
public Calendar getUploadDate() throws ParsingException {
    final String rawDate = getTextualUploadDate();

    // Nothing to parse without a parser or without any date text.
    if (timeAgoParser == null || rawDate == null || rawDate.isEmpty()) {
        return null;
    }

    return timeAgoParser.parse(rawDate);
}
@Override
public long getViewCount() throws ParsingException {
String input;

View File

@ -35,12 +35,15 @@ import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingH
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.utils.Localization;
import org.schabi.newpipe.extractor.stream.TimeAgoParser;
import javax.annotation.Nonnull;
import java.io.IOException;
public class YoutubeTrendingExtractor extends KioskExtractor<StreamInfoItem> {
private final TimeAgoParser timeAgoParser = getService().getTimeAgoParser();
private Document doc;
public YoutubeTrendingExtractor(StreamingService service,
@ -93,7 +96,7 @@ public class YoutubeTrendingExtractor extends KioskExtractor<StreamInfoItem> {
for(Element ul : uls) {
for(final Element li : ul.children()) {
final Element el = li.select("div[class*=\"yt-lockup-dismissable\"]").first();
collector.commit(new YoutubeStreamInfoItemExtractor(li) {
collector.commit(new YoutubeStreamInfoItemExtractor(li, timeAgoParser) {
@Override
public String getUrl() throws ParsingException {
try {

View File

@ -22,6 +22,8 @@ package org.schabi.newpipe.extractor.stream;
import org.schabi.newpipe.extractor.InfoItem;
import java.util.Calendar;
/**
* Info object for previews of unopened videos, eg search results, related videos
*/
@ -29,7 +31,8 @@ public class StreamInfoItem extends InfoItem {
private final StreamType streamType;
private String uploaderName;
private String uploadDate;
private String textualUploadDate;
private Calendar uploadDate;
private long viewCount = -1;
private long duration = -1;
@ -52,14 +55,6 @@ public class StreamInfoItem extends InfoItem {
this.uploaderName = uploader_name;
}
public String getUploadDate() {
return uploadDate;
}
public void setUploadDate(String upload_date) {
this.uploadDate = upload_date;
}
public long getViewCount() {
return viewCount;
}
@ -84,12 +79,36 @@ public class StreamInfoItem extends InfoItem {
this.uploaderUrl = uploaderUrl;
}
/**
* @return The original textual upload date as returned by the streaming service.
* @see #getUploadDate()
*/
public String getTextualUploadDate() {
return textualUploadDate;
}
/**
 * Stores the textual upload date of this item.
 *
 * @param upload_date the text that {@link #getTextualUploadDate()} will return
 */
public void setTextualUploadDate(String upload_date) {
this.textualUploadDate = upload_date;
}
/**
* @return The (approximated) date and time this item was uploaded or {@code null}.
* @see #getTextualUploadDate()
*/
public Calendar getUploadDate() {
return uploadDate;
}
/**
 * Stores the parsed upload date of this item.
 *
 * @param uploadDate the value that {@link #getUploadDate()} will return
 */
public void setUploadDate(Calendar uploadDate) {
this.uploadDate = uploadDate;
}
@Override
public String toString() {
return "StreamInfoItem{" +
"streamType=" + streamType +
", uploaderName='" + uploaderName + '\'' +
", uploadDate='" + uploadDate + '\'' +
", textualUploadDate='" + textualUploadDate + '\'' +
", viewCount=" + viewCount +
", duration=" + duration +
", uploaderUrl='" + uploaderUrl + '\'' +

View File

@ -3,6 +3,8 @@ package org.schabi.newpipe.extractor.stream;
import org.schabi.newpipe.extractor.InfoItemExtractor;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import java.util.Calendar;
/*
* Created by Christian Schabesberger on 28.02.16.
*
@ -64,10 +66,30 @@ public interface StreamInfoItemExtractor extends InfoItemExtractor {
String getUploaderUrl() throws ParsingException;
/**
* Extract the uploader name
* @return the uploader name
* @throws ParsingException thrown if there is an error in the extraction
* Extract the textual upload date of this item.
* The original textual date provided by the service may be used if it is short;
* otherwise the format "yyyy-MM-dd" or a locale-specific version is preferred.
*
* @return The original textual upload date.
* @throws ParsingException if there is an error in the extraction
* @see #getUploadDate()
*/
String getUploadDate() throws ParsingException;
String getTextualUploadDate() throws ParsingException;
/**
* Extracts the upload date and time of this item and parses it.
* <p>
* If the service doesn't provide an exact time, an approximation can be returned.
* The approximation should be marked by setting seconds and milliseconds to zero.
* <br>
* If the service doesn't provide any date at all, then {@code null} should be returned.
* </p>
*
* @return The (approximated) date and time this item was uploaded or {@code null}.
* @throws ParsingException if there is an error in the extraction
* or the extracted date couldn't be parsed.
* @see #getTextualUploadDate()
*/
Calendar getUploadDate() throws ParsingException;
}

View File

@ -61,10 +61,15 @@ public class StreamInfoItemsCollector extends InfoItemsCollector<StreamInfoItem,
addError(e);
}
try {
resultItem.setUploadDate(extractor.getUploadDate());
resultItem.setTextualUploadDate(extractor.getTextualUploadDate());
} catch (Exception e) {
addError(e);
}
try {
resultItem.setUploadDate(extractor.getUploadDate());
} catch (ParsingException e) {
addError(e);
}
try {
resultItem.setViewCount(extractor.getViewCount());
} catch (Exception e) {

View File

@ -0,0 +1,158 @@
package org.schabi.newpipe.extractor.stream;
/*
* Created by wojcik.online on 2018-01-25.
*/
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Locale;
import java.util.Map;
/**
 * A helper class that is meant to be used by services that need to parse upload dates in the
 * format '2 days ago' or similar.
 * <p>
 * Each instance captures the current time once, in its constructor, and resolves every parsed
 * date relative to that single snapshot. All dates parsed by one instance are therefore
 * consistent with each other.
 * </p>
 */
public class TimeAgoParser {

    /**
     * English phrases that are contained in the time units
     * (e.g. '7 minutes ago' contains 'min').
     * <p>
     * The map is unmodifiable so that the shared defaults cannot be altered by a caller.
     * </p>
     */
    public static final Map<TimeAgoUnit, Collection<String>> DEFAULT_AGO_PHRASES;

    static {
        final Map<TimeAgoUnit, Collection<String>> phrases = new EnumMap<>(TimeAgoUnit.class);
        phrases.put(TimeAgoUnit.SECONDS, Collections.singleton("sec"));
        phrases.put(TimeAgoUnit.MINUTES, Collections.singleton("min"));
        phrases.put(TimeAgoUnit.HOURS, Collections.singleton("hour"));
        phrases.put(TimeAgoUnit.DAYS, Collections.singleton("day"));
        phrases.put(TimeAgoUnit.WEEKS, Collections.singleton("week"));
        phrases.put(TimeAgoUnit.MONTHS, Collections.singleton("month"));
        phrases.put(TimeAgoUnit.YEARS, Collections.singleton("year"));
        DEFAULT_AGO_PHRASES = Collections.unmodifiableMap(phrases);
    }

    /** Phrases used to recognize each time unit; units are tried in the map's iteration order. */
    private final Map<TimeAgoUnit, Collection<String>> agoPhrases;

    /** The point in time, captured at construction, that all parsed dates are relative to. */
    private final Calendar consistentNow;

    /**
     * Creates a helper to parse upload dates in the format '2 days ago'.
     * <p>
     * Instantiate a new {@link TimeAgoParser} every time you extract a new batch of items.
     * </p>
     *
     * @param agoPhrases A set of phrases how to recognize the time units in a given language.
     */
    public TimeAgoParser(Map<TimeAgoUnit, Collection<String>> agoPhrases) {
        this.agoPhrases = agoPhrases;
        consistentNow = Calendar.getInstance();
    }

    /**
     * Parses a textual date in the format '2 days ago' into a Calendar representation.
     * Beginning with days ago, marks the date as approximated by setting minutes, seconds
     * and milliseconds to 0.
     * <p>
     * If the text contains no digits, or its digits do not form a valid {@code int},
     * the amount is assumed to be 1 (as in 'a second ago').
     * </p>
     *
     * @param textualDate The original date as provided by the streaming service
     * @return The parsed (approximated) time
     * @throws ParsingException if the time unit could not be recognized
     */
    public Calendar parse(String textualDate) throws ParsingException {
        int timeAgoAmount;
        try {
            timeAgoAmount = parseTimeAgoAmount(textualDate);
        } catch (NumberFormatException e) {
            // If there is no valid number in the textual date,
            // assume it is 1 (as in 'a second ago').
            timeAgoAmount = 1;
        }

        final TimeAgoUnit timeAgoUnit = parseTimeAgoUnit(textualDate);
        return getCalendar(timeAgoAmount, timeAgoUnit);
    }

    /**
     * Extracts the amount from the textual date by concatenating all of its digits.
     *
     * @param textualDate the textual date
     * @return the digits of the text interpreted as a decimal number
     * @throws NumberFormatException if the text contains no digits or they do not fit an int
     */
    private int parseTimeAgoAmount(String textualDate) throws NumberFormatException {
        final String timeValueStr = textualDate.replaceAll("\\D+", "");
        return Integer.parseInt(timeValueStr);
    }

    /**
     * Finds the first time unit that has a phrase contained in the textual date.
     * The comparison ignores case and is independent of the default locale.
     *
     * @param textualDate the textual date
     * @return the recognized time unit
     * @throws ParsingException if no phrase of any unit is contained in the text
     */
    private TimeAgoUnit parseTimeAgoUnit(String textualDate) throws ParsingException {
        // Locale.ROOT avoids locale-specific case mappings (e.g. Turkish 'I' -> dotless 'i')
        // that would prevent phrases such as 'min' from matching.
        final String lowerCaseDate = textualDate.toLowerCase(Locale.ROOT);

        for (Map.Entry<TimeAgoUnit, Collection<String>> unitPhrases : agoPhrases.entrySet()) {
            for (String agoPhrase : unitPhrases.getValue()) {
                if (lowerCaseDate.contains(agoPhrase.toLowerCase(Locale.ROOT))) {
                    return unitPhrases.getKey();
                }
            }
        }

        throw new ParsingException("Unable to parse the date: " + textualDate);
    }

    /**
     * Subtracts the given amount of the given unit from the captured "now".
     *
     * @param timeAgoAmount how many units to go back
     * @param timeAgoUnit   the unit of the amount
     * @return a new calendar; for days and larger units it is marked as approximated
     */
    private Calendar getCalendar(int timeAgoAmount, TimeAgoUnit timeAgoUnit) {
        final Calendar calendarTime = getNow();

        switch (timeAgoUnit) {
            case SECONDS:
                calendarTime.add(Calendar.SECOND, -timeAgoAmount);
                break;

            case MINUTES:
                calendarTime.add(Calendar.MINUTE, -timeAgoAmount);
                break;

            case HOURS:
                calendarTime.add(Calendar.HOUR_OF_DAY, -timeAgoAmount);
                break;

            case DAYS:
                calendarTime.add(Calendar.DAY_OF_MONTH, -timeAgoAmount);
                markApproximatedTime(calendarTime);
                break;

            case WEEKS:
                calendarTime.add(Calendar.WEEK_OF_YEAR, -timeAgoAmount);
                markApproximatedTime(calendarTime);
                break;

            case MONTHS:
                calendarTime.add(Calendar.MONTH, -timeAgoAmount);
                markApproximatedTime(calendarTime);
                break;

            case YEARS:
                calendarTime.add(Calendar.YEAR, -timeAgoAmount);
                // Prevent `PrettyTime` from showing '12 months ago'.
                calendarTime.add(Calendar.DAY_OF_MONTH, -1);
                markApproximatedTime(calendarTime);
                break;
        }

        return calendarTime;
    }

    /**
     * @return a copy of the time captured at construction, safe for the caller to modify
     */
    private Calendar getNow() {
        return (Calendar) consistentNow.clone();
    }

    /**
     * Marks the time as approximated by setting minutes, seconds and milliseconds to 0.
     *
     * @param calendarTime Time to be marked as approximated
     */
    private void markApproximatedTime(Calendar calendarTime) {
        calendarTime.set(Calendar.MINUTE, 0);
        calendarTime.set(Calendar.SECOND, 0);
        calendarTime.set(Calendar.MILLISECOND, 0);
    }

    /**
     * The time units that can be recognized in a textual date.
     */
    public enum TimeAgoUnit {
        SECONDS,
        MINUTES,
        HOURS,
        DAYS,
        WEEKS,
        MONTHS,
        YEARS,
    }
}

View File

@ -41,6 +41,7 @@ import static java.util.Collections.singletonList;
public class Downloader implements org.schabi.newpipe.extractor.Downloader {
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0";
private static final String DEFAULT_HTTP_ACCEPT_LANGUAGE = "en";
private static String mCookies = "";
private static Downloader instance = null;
@ -171,6 +172,7 @@ public class Downloader implements org.schabi.newpipe.extractor.Downloader {
URL url = new URL(siteUrl);
HttpsURLConnection con = (HttpsURLConnection) url.openConnection();
// HttpsURLConnection con = NetCipher.getHttpsURLConnection(url);
con.setRequestProperty("Accept-Language", DEFAULT_HTTP_ACCEPT_LANGUAGE);
return dl(con);
}

View File

@ -4,6 +4,7 @@ import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.ListExtractor;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import java.util.Calendar;
import java.util.List;
import static org.junit.Assert.*;
@ -27,6 +28,14 @@ public final class DefaultTests {
StreamInfoItem streamInfoItem = (StreamInfoItem) item;
assertNotEmpty("Uploader name not set: " + item, streamInfoItem.getUploaderName());
assertNotEmpty("Uploader url not set: " + item, streamInfoItem.getUploaderUrl());
final String textualUploadDate = streamInfoItem.getTextualUploadDate();
if (textualUploadDate != null && !textualUploadDate.isEmpty()) {
final Calendar uploadDate = streamInfoItem.getUploadDate();
assertNotNull("No parsed upload date", uploadDate);
assertTrue("Upload date not in the past", uploadDate.before(Calendar.getInstance()));
}
}
}
}