From 30a4b3617ded61df668a04def287cff56c1cc60f Mon Sep 17 00:00:00 2001 From: TobiGr Date: Fri, 10 May 2024 19:19:42 +0200 Subject: [PATCH] [YouTube] Optimize extracting auto-translated captions Faster and ordered: captions provided by the user are at the beginning of the list, auto-translated captions are at the end --- .../extractors/YoutubeStreamExtractor.java | 82 ++++++++++------ .../extractor/stream/SubtitlesStream.java | 98 ++++++++++++++++--- .../newpipe/extractor/utils/LocaleCompat.java | 3 +- 3 files changed, 137 insertions(+), 46 deletions(-) diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java index 923b30157..7e0177743 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java @@ -670,52 +670,72 @@ public class YoutubeStreamExtractor extends StreamExtractor { assertPageFetched(); // We cannot store the subtitles list because the media format may change - final List subtitlesToReturn = new ArrayList<>(); + final List subtitles = new ArrayList<>(); + final List autoTranslatedSubtitles = new ArrayList<>(); final JsonObject renderer = playerResponse.getObject("captions") .getObject("playerCaptionsTracklistRenderer"); final JsonArray captionsArray = renderer.getArray("captionTracks"); + // Generate list of languages available for auto-translations + final List translationLanguages; + if (renderer.has("translationLanguages")) { + translationLanguages = renderer.getArray("translationLanguages") + .stream() + .map(JsonObject.class::cast) + .map(lang -> lang.getString("languageCode")) + .collect(Collectors.toList()); + } else { + translationLanguages = Collections.emptyList(); + } + + // Add subtitles for (int 
i = 0; i < captionsArray.size(); i++) { final JsonObject caption = captionsArray.getObject(i); final String languageCode = caption.getString("languageCode"); final String baseUrl = caption.getString("baseUrl"); final String vssId = caption.getString("vssId"); - if (languageCode != null && baseUrl != null && vssId != null) { - final boolean isAutoGenerated = vssId.startsWith("a."); - final String cleanUrl = baseUrl - // Remove preexisting format if exists - .replaceAll("&fmt=[^&]*", "") - // Remove translation language - .replaceAll("&tlang=[^&]*", ""); + if (languageCode == null || baseUrl == null || vssId == null) { + continue; + } - subtitlesToReturn.add(new SubtitlesStream.Builder() - .setContent(cleanUrl + "&fmt=" + format.getSuffix(), true) - .setMediaFormat(format) - .setLanguageCode(languageCode) - .setAutoGenerated(isAutoGenerated) - .setAutoTranslated(false) - .build()); - if (i == 0 && caption.getBoolean("isTranslatable") - && renderer.has("translationLanguages")) { - final JsonArray languages = renderer.getArray("translationLanguages"); - for (int j = 0; j < languages.size(); j++) { - final JsonObject lang = languages.getObject(j); - final String tLanguageCode = lang.getString("languageCode"); - subtitlesToReturn.add(new SubtitlesStream.Builder() - .setContent(cleanUrl + "&fmt=" + format.getSuffix() - + "&tlang=" + tLanguageCode, true) - .setMediaFormat(format) - .setLanguageCode(tLanguageCode) - .setAutoGenerated(isAutoGenerated) - .setAutoTranslated(true) - .build()); - } + final boolean isAutoGenerated = vssId.startsWith("a."); + final String cleanUrl = baseUrl + // Remove preexisting format if exists + .replaceAll("&fmt=[^&]*", "") + // Remove translation language + .replaceAll("&tlang=[^&]*", ""); + + // add base subtitles + subtitles.add(new SubtitlesStream.Builder() + .setContent(cleanUrl + "&fmt=" + format.getSuffix(), true) + .setMediaFormat(format) + .setLanguageCode(languageCode) + .setAutoGenerated(isAutoGenerated) + .setAutoTranslated(false) 
+ .build()); + + // add auto-translations of this subtitle if available + if (caption.getBoolean("isTranslatable")) { + for (final String tLanguageCode : translationLanguages) { + autoTranslatedSubtitles.add(new SubtitlesStream.Builder() + .setContent(cleanUrl + "&fmt=" + format.getSuffix() + + "&tlang=" + tLanguageCode, true) + .setMediaFormat(format) + .setLanguageCode(tLanguageCode) + .setAutoGenerated(true) + .setAutoTranslated(true) + .setBaseLanguageCode(languageCode) + .build()); } } + } - return subtitlesToReturn; + // add auto-translations at the end for better sorting + subtitles.addAll(autoTranslatedSubtitles); + + return subtitles; } @Override diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/SubtitlesStream.java b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/SubtitlesStream.java index 22a4379be..c3ef29a76 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/SubtitlesStream.java +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/SubtitlesStream.java @@ -12,6 +12,8 @@ import javax.annotation.Nullable; public final class SubtitlesStream extends Stream { private final MediaFormat format; + @Nullable + private final Locale baseLocale; private final Locale locale; private final boolean autoGenerated; private final boolean autoTranslated; @@ -31,6 +33,8 @@ public final class SubtitlesStream extends Stream { @Nullable private String manifestUrl; private String languageCode; + @Nullable + private String baseLanguageCode; // Use of the Boolean class instead of the primitive type needed for setter call check private Boolean autoGenerated; private Boolean autoTranslated; @@ -142,6 +146,18 @@ public final class SubtitlesStream extends Stream { return this; } + /** + * Set the language code of the base language used to auto-translate + * the {@link SubtitlesStream} to the current language code. 
+ * + * @param baseLanguageCode the language code of the {@link SubtitlesStream} + * @return this {@link Builder} instance + */ + public Builder setBaseLanguageCode(@Nullable final String baseLanguageCode) { + this.baseLanguageCode = baseLanguageCode; + return this; + } + /** * Set whether the subtitles have been auto-generated by the streaming service. * @@ -222,26 +238,29 @@ public final class SubtitlesStream extends Stream { } return new SubtitlesStream(id, content, isUrl, mediaFormat, deliveryMethod, - languageCode, autoGenerated, autoTranslated, manifestUrl); + languageCode, autoGenerated, autoTranslated, baseLanguageCode, manifestUrl); } } /** * Create a new subtitles stream. * - * @param id the identifier which uniquely identifies the stream, e.g. for YouTube - * this would be the itag - * @param content the content or the URL of the stream, depending on whether isUrl is - * true - * @param isUrl whether content is the URL or the actual content of e.g. a DASH - * manifest - * @param mediaFormat the {@link MediaFormat} used by the stream - * @param deliveryMethod the {@link DeliveryMethod} of the stream - * @param languageCode the language code of the stream - * @param autoGenerated whether the subtitles are auto-generated by the streaming service - * @param autoTranslated whether the subtitles are auto-translated by the streaming service - * @param manifestUrl the URL of the manifest this stream comes from (if applicable, - * otherwise null) + * @param id the identifier which uniquely identifies the stream, e.g. for YouTube + * this would be the itag + * @param content the content or the URL of the stream, depending on whether isUrl is + * true + * @param isUrl whether content is the URL or the actual content of e.g. 
a DASH + * manifest + * @param mediaFormat the {@link MediaFormat} used by the stream + * @param deliveryMethod the {@link DeliveryMethod} of the stream + * @param languageCode the language code of the stream + * @param autoGenerated whether the subtitles are auto-generated by the streaming service + * @param autoTranslated whether the subtitles are auto-translated by the streaming service + * @param baseLanguageCode the language code of the base language used to translate + * the subtitles to the current language + * or null if the subtitles are not auto-translated + * @param manifestUrl the URL of the manifest this stream comes from (if applicable, + * otherwise null) */ @SuppressWarnings("checkstyle:ParameterNumber") private SubtitlesStream(@Nonnull final String id, @@ -252,6 +271,7 @@ public final class SubtitlesStream extends Stream { @Nonnull final String languageCode, final boolean autoGenerated, final boolean autoTranslated, + @Nullable final String baseLanguageCode, @Nullable final String manifestUrl) throws ParsingException { super(id, content, isUrl, mediaFormat, deliveryMethod, manifestUrl); this.locale = LocaleCompat.forLanguageTag(languageCode).orElseThrow( @@ -261,6 +281,13 @@ public final class SubtitlesStream extends Stream { this.format = mediaFormat; this.autoGenerated = autoGenerated; this.autoTranslated = autoTranslated; + if (baseLanguageCode == null) { + this.baseLocale = null; + } else { + this.baseLocale = LocaleCompat.forLanguageTag(baseLanguageCode).orElseThrow( + () -> new ParsingException( + "not a valid locale language code: " + baseLanguageCode)); + } } /** @@ -337,6 +364,37 @@ public final class SubtitlesStream extends Stream { return locale; } + /** + * Get the {@link Locale baseLocale} which was used to automatically translated the subtitles + * into the current {@link #locale}. 
+     *
+     * @return the {@link Locale baseLocale} for the subtitle translation
+     * or {@code null} if the subtitle is not auto-translated
+     */
+    @Nullable
+    public Locale getBaseLocale() {
+        return baseLocale;
+    }
+
+    /**
+     * Get the display name of the base language the subtitles were auto-translated from.
+     *
+     * @return the base language's display name, or the own one if not auto-translated
+     */
+    public String getDisplayBaseLanguageName() {
+        final Locale base = baseLocale == null ? locale : baseLocale;
+        return base.getDisplayName(base);
+    }
+
+    /**
+     * Get the language tag of the base language the subtitles were auto-translated from.
+     *
+     * @return the tag rebuilt from the base locale, or the own tag if not auto-translated
+     */
+    public String getBaseLanguageTag() {
+        return baseLocale == null ? code : baseLocale.toString().replace('_', '-');
+    }
+
     /**
      * No subtitles which are currently extracted use an {@link ItagItem}, so {@code null} is
      * returned by this method.
@@ -348,4 +406,16 @@ public final class SubtitlesStream extends Stream {
     public ItagItem getItagItem() {
         return null;
     }
+
+    @Override
+    public String toString() {
+        return "SubtitlesStream{"
+                + "format=" + format
+                + ", baseLocale=" + baseLocale
+                + ", locale=" + locale
+                + ", autoGenerated=" + autoGenerated
+                + ", autoTranslated=" + autoTranslated
+                + ", code='" + code + '\''
+                + '}';
+    }
 }
diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/LocaleCompat.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/LocaleCompat.java
index 082a56824..fb21faf22 100644
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/LocaleCompat.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/LocaleCompat.java
@@ -1,5 +1,6 @@
 package org.schabi.newpipe.extractor.utils;
 
+import javax.annotation.Nonnull;
 import java.util.Locale;
 import java.util.Optional;
 
@@ -16,7 +17,7 @@ public final class LocaleCompat {
 
     // Source: The AndroidX LocaleListCompat class's private forLanguageTagCompat() method.
     // Use Locale.forLanguageTag() on Android API level >= 21 / Java instead.
- public static Optional forLanguageTag(final String str) { + public static Optional forLanguageTag(@Nonnull final String str) { if (str.contains("-")) { final String[] args = str.split("-", -1); if (args.length > 2) {