[YouTube] Optimize extracting auto-translated captions

Faster and ordered: captions provided by the user are at the beginning of the list, auto-translated captions are at the end
This commit is contained in:
TobiGr 2024-05-10 19:19:42 +02:00
parent ff030ad297
commit 30a4b3617d
3 changed files with 137 additions and 46 deletions

View File

@ -670,52 +670,72 @@ public class YoutubeStreamExtractor extends StreamExtractor {
assertPageFetched(); assertPageFetched();
// We cannot store the subtitles list because the media format may change // We cannot store the subtitles list because the media format may change
final List<SubtitlesStream> subtitlesToReturn = new ArrayList<>(); final List<SubtitlesStream> subtitles = new ArrayList<>();
final List<SubtitlesStream> autoTranslatedSubtitles = new ArrayList<>();
final JsonObject renderer = playerResponse.getObject("captions") final JsonObject renderer = playerResponse.getObject("captions")
.getObject("playerCaptionsTracklistRenderer"); .getObject("playerCaptionsTracklistRenderer");
final JsonArray captionsArray = renderer.getArray("captionTracks"); final JsonArray captionsArray = renderer.getArray("captionTracks");
// Generate list of languages available for auto-translations
final List<String> translationLanguages;
if (renderer.has("translationLanguages")) {
translationLanguages = renderer.getArray("translationLanguages")
.stream()
.map(JsonObject.class::cast)
.map(lang -> lang.getString("languageCode"))
.collect(Collectors.toList());
} else {
translationLanguages = Collections.emptyList();
}
// Add subtitles
for (int i = 0; i < captionsArray.size(); i++) { for (int i = 0; i < captionsArray.size(); i++) {
final JsonObject caption = captionsArray.getObject(i); final JsonObject caption = captionsArray.getObject(i);
final String languageCode = caption.getString("languageCode"); final String languageCode = caption.getString("languageCode");
final String baseUrl = caption.getString("baseUrl"); final String baseUrl = caption.getString("baseUrl");
final String vssId = caption.getString("vssId"); final String vssId = caption.getString("vssId");
if (languageCode != null && baseUrl != null && vssId != null) { if (languageCode == null || baseUrl == null || vssId == null) {
final boolean isAutoGenerated = vssId.startsWith("a."); continue;
final String cleanUrl = baseUrl }
// Remove preexisting format if exists
.replaceAll("&fmt=[^&]*", "")
// Remove translation language
.replaceAll("&tlang=[^&]*", "");
subtitlesToReturn.add(new SubtitlesStream.Builder() final boolean isAutoGenerated = vssId.startsWith("a.");
.setContent(cleanUrl + "&fmt=" + format.getSuffix(), true) final String cleanUrl = baseUrl
.setMediaFormat(format) // Remove preexisting format if exists
.setLanguageCode(languageCode) .replaceAll("&fmt=[^&]*", "")
.setAutoGenerated(isAutoGenerated) // Remove translation language
.setAutoTranslated(false) .replaceAll("&tlang=[^&]*", "");
.build());
if (i == 0 && caption.getBoolean("isTranslatable") // add base subtitles
&& renderer.has("translationLanguages")) { subtitles.add(new SubtitlesStream.Builder()
final JsonArray languages = renderer.getArray("translationLanguages"); .setContent(cleanUrl + "&fmt=" + format.getSuffix(), true)
for (int j = 0; j < languages.size(); j++) { .setMediaFormat(format)
final JsonObject lang = languages.getObject(j); .setLanguageCode(languageCode)
final String tLanguageCode = lang.getString("languageCode"); .setAutoGenerated(isAutoGenerated)
subtitlesToReturn.add(new SubtitlesStream.Builder() .setAutoTranslated(false)
.setContent(cleanUrl + "&fmt=" + format.getSuffix() .build());
+ "&tlang=" + tLanguageCode, true)
.setMediaFormat(format) // add auto-translations of this subtitle if available
.setLanguageCode(tLanguageCode) if (caption.getBoolean("isTranslatable")) {
.setAutoGenerated(isAutoGenerated) for (final String tLanguageCode : translationLanguages) {
.setAutoTranslated(true) autoTranslatedSubtitles.add(new SubtitlesStream.Builder()
.build()); .setContent(cleanUrl + "&fmt=" + format.getSuffix()
} + "&tlang=" + tLanguageCode, true)
.setMediaFormat(format)
.setLanguageCode(tLanguageCode)
.setAutoGenerated(true)
.setAutoTranslated(true)
.setBaseLanguageCode(languageCode)
.build());
} }
} }
} }
return subtitlesToReturn; // add auto-translations at the end for better sorting
subtitles.addAll(autoTranslatedSubtitles);
return subtitles;
} }
@Override @Override

View File

@ -12,6 +12,8 @@ import javax.annotation.Nullable;
public final class SubtitlesStream extends Stream { public final class SubtitlesStream extends Stream {
private final MediaFormat format; private final MediaFormat format;
@Nullable
private final Locale baseLocale;
private final Locale locale; private final Locale locale;
private final boolean autoGenerated; private final boolean autoGenerated;
private final boolean autoTranslated; private final boolean autoTranslated;
@ -31,6 +33,8 @@ public final class SubtitlesStream extends Stream {
@Nullable @Nullable
private String manifestUrl; private String manifestUrl;
private String languageCode; private String languageCode;
@Nullable
private String baseLanguageCode;
// Use of the Boolean class instead of the primitive type needed for setter call check // Use of the Boolean class instead of the primitive type needed for setter call check
private Boolean autoGenerated; private Boolean autoGenerated;
private Boolean autoTranslated; private Boolean autoTranslated;
@ -142,6 +146,18 @@ public final class SubtitlesStream extends Stream {
return this; return this;
} }
/**
* Set the language code of the base language, i.e. the language from which
* the {@link SubtitlesStream} has been auto-translated into its current language.
*
* <p>
* The value is optional: when it is {@code null}, the built {@link SubtitlesStream} has no
* base locale. A non-null value which cannot be converted to a locale makes the
* {@link SubtitlesStream} constructor throw a {@code ParsingException}.
* </p>
*
* @param baseLanguageCode the language code of the base language the subtitles were
*                         translated from, or {@code null} if there is none
* @return this {@link Builder} instance
*/
public Builder setBaseLanguageCode(@Nullable final String baseLanguageCode) {
this.baseLanguageCode = baseLanguageCode;
return this;
}
/** /**
* Set whether the subtitles have been auto-generated by the streaming service. * Set whether the subtitles have been auto-generated by the streaming service.
* *
@ -222,26 +238,29 @@ public final class SubtitlesStream extends Stream {
} }
return new SubtitlesStream(id, content, isUrl, mediaFormat, deliveryMethod, return new SubtitlesStream(id, content, isUrl, mediaFormat, deliveryMethod,
languageCode, autoGenerated, autoTranslated, manifestUrl); languageCode, autoGenerated, autoTranslated, baseLanguageCode, manifestUrl);
} }
} }
/** /**
* Create a new subtitles stream. * Create a new subtitles stream.
* *
* @param id the identifier which uniquely identifies the stream, e.g. for YouTube * @param id the identifier which uniquely identifies the stream, e.g. for YouTube
* this would be the itag * this would be the itag
* @param content the content or the URL of the stream, depending on whether isUrl is * @param content the content or the URL of the stream, depending on whether isUrl is
* true * true
* @param isUrl whether content is the URL or the actual content of e.g. a DASH * @param isUrl whether content is the URL or the actual content of e.g. a DASH
* manifest * manifest
* @param mediaFormat the {@link MediaFormat} used by the stream * @param mediaFormat the {@link MediaFormat} used by the stream
* @param deliveryMethod the {@link DeliveryMethod} of the stream * @param deliveryMethod the {@link DeliveryMethod} of the stream
* @param languageCode the language code of the stream * @param languageCode the language code of the stream
* @param autoGenerated whether the subtitles are auto-generated by the streaming service * @param autoGenerated whether the subtitles are auto-generated by the streaming service
* @param autoTranslated whether the subtitles are auto-translated by the streaming service * @param autoTranslated whether the subtitles are auto-translated by the streaming service
* @param manifestUrl the URL of the manifest this stream comes from (if applicable, * @param baseLanguageCode the language code of the base language used to translate
* otherwise null) * the subtitles to the current language
* or null if the subtitles are not auto-translated
* @param manifestUrl the URL of the manifest this stream comes from (if applicable,
* otherwise null)
*/ */
@SuppressWarnings("checkstyle:ParameterNumber") @SuppressWarnings("checkstyle:ParameterNumber")
private SubtitlesStream(@Nonnull final String id, private SubtitlesStream(@Nonnull final String id,
@ -252,6 +271,7 @@ public final class SubtitlesStream extends Stream {
@Nonnull final String languageCode, @Nonnull final String languageCode,
final boolean autoGenerated, final boolean autoGenerated,
final boolean autoTranslated, final boolean autoTranslated,
@Nullable final String baseLanguageCode,
@Nullable final String manifestUrl) throws ParsingException { @Nullable final String manifestUrl) throws ParsingException {
super(id, content, isUrl, mediaFormat, deliveryMethod, manifestUrl); super(id, content, isUrl, mediaFormat, deliveryMethod, manifestUrl);
this.locale = LocaleCompat.forLanguageTag(languageCode).orElseThrow( this.locale = LocaleCompat.forLanguageTag(languageCode).orElseThrow(
@ -261,6 +281,13 @@ public final class SubtitlesStream extends Stream {
this.format = mediaFormat; this.format = mediaFormat;
this.autoGenerated = autoGenerated; this.autoGenerated = autoGenerated;
this.autoTranslated = autoTranslated; this.autoTranslated = autoTranslated;
if (baseLanguageCode == null) {
this.baseLocale = null;
} else {
this.baseLocale = LocaleCompat.forLanguageTag(baseLanguageCode).orElseThrow(
() -> new ParsingException(
"not a valid locale language code: " + baseLanguageCode));
}
} }
/** /**
@ -337,6 +364,37 @@ public final class SubtitlesStream extends Stream {
return locale; return locale;
} }
/**
* Get the {@link Locale baseLocale} which was used to automatically translate the subtitles
* into the current {@link #locale}.
*
* @return the {@link Locale baseLocale} for the subtitle translation
* or {@code null} if the subtitle is not auto-translated
*/
@Nullable
public Locale getBaseLocale() {
return baseLocale;
}
/**
 * Get the display name of the base language of the subtitles, i.e. the language the
 * subtitles were auto-translated from.
 *
 * <p>
 * The name is localized in the language it describes. If the subtitles have no base locale
 * (they are not auto-translated), the display name of the subtitles' own locale is returned
 * instead, so the result is never {@code null}.
 * </p>
 *
 * @return the display name of the base language, or the display name of the subtitles' own
 *         language if there is no base language
 */
public String getDisplayBaseLanguageName() {
    // baseLocale is null for subtitles which are not auto-translated; their own language
    // is then the only sensible "base" to report.
    final Locale displayLocale = baseLocale != null ? baseLocale : locale;
    return displayLocale.getDisplayName(displayLocale);
}
/**
* Get the language tag of the subtitles.
*
* <p>
* NOTE(review): despite the "base" in its name, this method returns the {@code code} field,
* which is presumably the language tag of the subtitles themselves and not the one of the
* base language they were translated from. The visible code keeps the base language only as
* a {@link Locale} (see {@link #getBaseLocale()}), so confirm whether returning {@code code}
* is intended or a base language tag should be stored and returned here.
* </p>
*
* @return the language tag of the subtitles, as stored in the {@code code} field
*/
public String getBaseLanguageTag() {
return code;
}
/** /**
* No subtitles which are currently extracted use an {@link ItagItem}, so {@code null} is * No subtitles which are currently extracted use an {@link ItagItem}, so {@code null} is
* returned by this method. * returned by this method.
@ -348,4 +406,16 @@ public final class SubtitlesStream extends Stream {
public ItagItem getItagItem() { public ItagItem getItagItem() {
return null; return null;
} }
/**
 * Build a debug representation of this stream listing its format, base locale, locale,
 * auto-generated and auto-translated flags and its language code.
 *
 * @return a string of the form {@code SubtitlesStream{format=..., ..., code='...'}}
 */
@Override
public String toString() {
    final StringBuilder description = new StringBuilder("SubtitlesStream{");
    description.append("format=").append(format);
    description.append(", baseLocale=").append(baseLocale);
    description.append(", locale=").append(locale);
    description.append(", autoGenerated=").append(autoGenerated);
    description.append(", autoTranslated=").append(autoTranslated);
    // The language code is the only value wrapped in single quotes.
    description.append(", code='").append(code).append('\'');
    description.append('}');
    return description.toString();
}
} }

View File

@ -1,5 +1,6 @@
package org.schabi.newpipe.extractor.utils; package org.schabi.newpipe.extractor.utils;
import javax.annotation.Nonnull;
import java.util.Locale; import java.util.Locale;
import java.util.Optional; import java.util.Optional;
@ -16,7 +17,7 @@ public final class LocaleCompat {
// Source: The AndroidX LocaleListCompat class's private forLanguageTagCompat() method. // Source: The AndroidX LocaleListCompat class's private forLanguageTagCompat() method.
// Use Locale.forLanguageTag() on Android API level >= 21 / Java instead. // Use Locale.forLanguageTag() on Android API level >= 21 / Java instead.
public static Optional<Locale> forLanguageTag(final String str) { public static Optional<Locale> forLanguageTag(@Nonnull final String str) {
if (str.contains("-")) { if (str.contains("-")) {
final String[] args = str.split("-", -1); final String[] args = str.split("-", -1);
if (args.length > 2) { if (args.length > 2) {