[YouTube] Optimize extracting auto-translated captions

Faster and ordered: captions provided by the user are at the beginning of the list, auto-translated captions are at the end
This commit is contained in:
TobiGr 2024-05-10 19:19:42 +02:00
parent ff030ad297
commit 30a4b3617d
3 changed files with 137 additions and 46 deletions

View File

@ -670,52 +670,72 @@ public class YoutubeStreamExtractor extends StreamExtractor {
assertPageFetched();
// We cannot store the subtitles list because the media format may change
final List<SubtitlesStream> subtitlesToReturn = new ArrayList<>();
final List<SubtitlesStream> subtitles = new ArrayList<>();
final List<SubtitlesStream> autoTranslatedSubtitles = new ArrayList<>();
final JsonObject renderer = playerResponse.getObject("captions")
.getObject("playerCaptionsTracklistRenderer");
final JsonArray captionsArray = renderer.getArray("captionTracks");
// Generate list of languages available for auto-translations
final List<String> translationLanguages;
if (renderer.has("translationLanguages")) {
translationLanguages = renderer.getArray("translationLanguages")
.stream()
.map(JsonObject.class::cast)
.map(lang -> lang.getString("languageCode"))
.collect(Collectors.toList());
} else {
translationLanguages = Collections.emptyList();
}
// Add subtitles
for (int i = 0; i < captionsArray.size(); i++) {
final JsonObject caption = captionsArray.getObject(i);
final String languageCode = caption.getString("languageCode");
final String baseUrl = caption.getString("baseUrl");
final String vssId = caption.getString("vssId");
if (languageCode != null && baseUrl != null && vssId != null) {
final boolean isAutoGenerated = vssId.startsWith("a.");
final String cleanUrl = baseUrl
// Remove preexisting format if exists
.replaceAll("&fmt=[^&]*", "")
// Remove translation language
.replaceAll("&tlang=[^&]*", "");
if (languageCode == null || baseUrl == null || vssId == null) {
continue;
}
subtitlesToReturn.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix(), true)
.setMediaFormat(format)
.setLanguageCode(languageCode)
.setAutoGenerated(isAutoGenerated)
.setAutoTranslated(false)
.build());
if (i == 0 && caption.getBoolean("isTranslatable")
&& renderer.has("translationLanguages")) {
final JsonArray languages = renderer.getArray("translationLanguages");
for (int j = 0; j < languages.size(); j++) {
final JsonObject lang = languages.getObject(j);
final String tLanguageCode = lang.getString("languageCode");
subtitlesToReturn.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix()
+ "&tlang=" + tLanguageCode, true)
.setMediaFormat(format)
.setLanguageCode(tLanguageCode)
.setAutoGenerated(isAutoGenerated)
.setAutoTranslated(true)
.build());
}
final boolean isAutoGenerated = vssId.startsWith("a.");
final String cleanUrl = baseUrl
// Remove preexisting format if exists
.replaceAll("&fmt=[^&]*", "")
// Remove translation language
.replaceAll("&tlang=[^&]*", "");
// add base subtitles
subtitles.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix(), true)
.setMediaFormat(format)
.setLanguageCode(languageCode)
.setAutoGenerated(isAutoGenerated)
.setAutoTranslated(false)
.build());
// add auto-translations of this subtitle if available
if (caption.getBoolean("isTranslatable")) {
for (final String tLanguageCode : translationLanguages) {
autoTranslatedSubtitles.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix()
+ "&tlang=" + tLanguageCode, true)
.setMediaFormat(format)
.setLanguageCode(tLanguageCode)
.setAutoGenerated(true)
.setAutoTranslated(true)
.setBaseLanguageCode(languageCode)
.build());
}
}
}
return subtitlesToReturn;
// add auto-translations at the end for better sorting
subtitles.addAll(autoTranslatedSubtitles);
return subtitles;
}
@Override

View File

@ -12,6 +12,8 @@ import javax.annotation.Nullable;
public final class SubtitlesStream extends Stream {
private final MediaFormat format;
@Nullable
private final Locale baseLocale;
private final Locale locale;
private final boolean autoGenerated;
private final boolean autoTranslated;
@ -31,6 +33,8 @@ public final class SubtitlesStream extends Stream {
@Nullable
private String manifestUrl;
private String languageCode;
@Nullable
private String baseLanguageCode;
// Use of the Boolean class instead of the primitive type needed for setter call check
private Boolean autoGenerated;
private Boolean autoTranslated;
@ -142,6 +146,18 @@ public final class SubtitlesStream extends Stream {
return this;
}
/**
* Set the language code of the base language, i.e. the language of the subtitles from which
* the {@link SubtitlesStream} was auto-translated into its current language code.
*
* <p>
* This setter only stores the value in the builder; it is passed on to the
* {@link SubtitlesStream} constructor when the stream is built. The constructor converts a
* non-null code to a {@link Locale} and throws a {@link ParsingException} if the code is not
* a valid locale language code.
* </p>
*
* @param baseLanguageCode the language code of the base language, or {@code null} if the
*                         subtitles are not auto-translated
* @return this {@link Builder} instance
*/
public Builder setBaseLanguageCode(@Nullable final String baseLanguageCode) {
this.baseLanguageCode = baseLanguageCode;
return this;
}
/**
* Set whether the subtitles have been auto-generated by the streaming service.
*
@ -222,26 +238,29 @@ public final class SubtitlesStream extends Stream {
}
return new SubtitlesStream(id, content, isUrl, mediaFormat, deliveryMethod,
languageCode, autoGenerated, autoTranslated, manifestUrl);
languageCode, autoGenerated, autoTranslated, baseLanguageCode, manifestUrl);
}
}
/**
* Create a new subtitles stream.
*
* @param id the identifier which uniquely identifies the stream, e.g. for YouTube
* this would be the itag
* @param content the content or the URL of the stream, depending on whether isUrl is
* true
* @param isUrl whether content is the URL or the actual content of e.g. a DASH
* manifest
* @param mediaFormat the {@link MediaFormat} used by the stream
* @param deliveryMethod the {@link DeliveryMethod} of the stream
* @param languageCode the language code of the stream
* @param autoGenerated whether the subtitles are auto-generated by the streaming service
* @param autoTranslated whether the subtitles are auto-translated by the streaming service
* @param manifestUrl the URL of the manifest this stream comes from (if applicable,
* otherwise null)
* @param id the identifier which uniquely identifies the stream, e.g. for YouTube
* this would be the itag
* @param content the content or the URL of the stream, depending on whether isUrl is
* true
* @param isUrl whether content is the URL or the actual content of e.g. a DASH
* manifest
* @param mediaFormat the {@link MediaFormat} used by the stream
* @param deliveryMethod the {@link DeliveryMethod} of the stream
* @param languageCode the language code of the stream
* @param autoGenerated whether the subtitles are auto-generated by the streaming service
* @param autoTranslated whether the subtitles are auto-translated by the streaming service
* @param baseLanguageCode the language code of the base language used to translate
* the subtitles to the current language
* or null if the subtitles are not auto-translated
* @param manifestUrl the URL of the manifest this stream comes from (if applicable,
* otherwise null)
*/
@SuppressWarnings("checkstyle:ParameterNumber")
private SubtitlesStream(@Nonnull final String id,
@ -252,6 +271,7 @@ public final class SubtitlesStream extends Stream {
@Nonnull final String languageCode,
final boolean autoGenerated,
final boolean autoTranslated,
@Nullable final String baseLanguageCode,
@Nullable final String manifestUrl) throws ParsingException {
super(id, content, isUrl, mediaFormat, deliveryMethod, manifestUrl);
this.locale = LocaleCompat.forLanguageTag(languageCode).orElseThrow(
@ -261,6 +281,13 @@ public final class SubtitlesStream extends Stream {
this.format = mediaFormat;
this.autoGenerated = autoGenerated;
this.autoTranslated = autoTranslated;
if (baseLanguageCode == null) {
this.baseLocale = null;
} else {
this.baseLocale = LocaleCompat.forLanguageTag(baseLanguageCode).orElseThrow(
() -> new ParsingException(
"not a valid locale language code: " + baseLanguageCode));
}
}
/**
@ -337,6 +364,37 @@ public final class SubtitlesStream extends Stream {
return locale;
}
/**
* Get the {@link Locale baseLocale} which was used to automatically translate the subtitles
* into the current {@link #locale}.
*
* <p>
* The base locale is derived from the base language code given to the constructor and is
* {@code null} when no base language code was provided.
* </p>
*
* @return the {@link Locale baseLocale} for the subtitle translation
* or {@code null} if the subtitle is not auto-translated
*/
@Nullable
public Locale getBaseLocale() {
return baseLocale;
}
/**
 * Get the display name of the base language of the subtitles, i.e. the language the
 * subtitles were auto-translated from.
 *
 * <p>
 * The name is localized in the language it describes (the locale is used both as the subject
 * and as the display locale). If the subtitles have no base locale, which is the case when
 * they are not auto-translated, the display name of the subtitles' own {@link #locale} is
 * returned instead, so this method never returns {@code null}.
 * </p>
 *
 * @return the display name of the base language, or of the subtitles' own language if there
 *         is no base language
 */
public String getDisplayBaseLanguageName() {
    // Previously this always used `locale`, which made it report the translated language
    // instead of the base one for auto-translated subtitles.
    final Locale displayLocale = baseLocale == null ? locale : baseLocale;
    return displayLocale.getDisplayName(displayLocale);
}
/**
* Get the language tag stored in this stream's {@code code} field.
*
* <p>
* NOTE(review): despite its name, this method does not read {@code baseLocale} or any base
* language code; it returns {@code code}, the same field that {@code toString()} prints as
* the stream's own code. For auto-translated subtitles this is presumably the tag of the
* translated language, not of the base language - confirm the intent. Returning the real base
* language tag most likely requires storing the base language code in the stream.
* </p>
*
* @return the value of the {@code code} field
*/
public String getBaseLanguageTag() {
// NOTE(review): returns the stream's own code, not a base-language value - see Javadoc.
return code;
}
/**
* No subtitles which are currently extracted use an {@link ItagItem}, so {@code null} is
* returned by this method.
@ -348,4 +406,16 @@ public final class SubtitlesStream extends Stream {
public ItagItem getItagItem() {
return null;
}
/**
 * Build a human-readable description of this stream listing its format, base locale, locale,
 * auto-generated and auto-translated flags and its code.
 *
 * @return the textual representation of this {@link SubtitlesStream}
 */
@Override
public String toString() {
    final StringBuilder description = new StringBuilder("SubtitlesStream{");
    description.append("format=").append(format);
    description.append(", baseLocale=").append(baseLocale);
    description.append(", locale=").append(locale);
    description.append(", autoGenerated=").append(autoGenerated);
    description.append(", autoTranslated=").append(autoTranslated);
    // The code is the only value wrapped in single quotes.
    description.append(", code='").append(code).append('\'');
    description.append('}');
    return description.toString();
}
}

View File

@ -1,5 +1,6 @@
package org.schabi.newpipe.extractor.utils;
import javax.annotation.Nonnull;
import java.util.Locale;
import java.util.Optional;
@ -16,7 +17,7 @@ public final class LocaleCompat {
// Source: The AndroidX LocaleListCompat class's private forLanguageTagCompat() method.
// Use Locale.forLanguageTag() on Android API level >= 21 / Java instead.
public static Optional<Locale> forLanguageTag(final String str) {
public static Optional<Locale> forLanguageTag(@Nonnull final String str) {
if (str.contains("-")) {
final String[] args = str.split("-", -1);
if (args.length > 2) {