From 7244be762706bc6b2aa8eddee14538b07e5b65b8 Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Sat, 24 Sep 2022 19:55:46 +0200 Subject: [PATCH] [YouTube] Add JavaScript lexer to parse completely throttling decryption function (#905) --- .../youtube/YoutubeThrottlingDecrypter.java | 21 +- .../newpipe/extractor/utils/StringUtils.java | 91 -- .../jsextractor/JavaScriptExtractor.java | 50 + .../extractor/utils/jsextractor/Lexer.java | 311 +++++ .../extractor/utils/jsextractor/Token.java | 121 ++ .../utils/jsextractor/TokenStream.java | 1161 +++++++++++++++++ .../utils/JavaScriptExtractorTest.java | 54 + .../extractor/utils/StringUtilsTest.java | 71 - extractor/src/test/resources/es5.js | 182 +++ 9 files changed, 1889 insertions(+), 173 deletions(-) delete mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/utils/StringUtils.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/JavaScriptExtractor.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Token.java create mode 100644 extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java create mode 100644 extractor/src/test/java/org/schabi/newpipe/extractor/utils/JavaScriptExtractorTest.java delete mode 100644 extractor/src/test/java/org/schabi/newpipe/extractor/utils/StringUtilsTest.java create mode 100644 extractor/src/test/resources/es5.js diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeThrottlingDecrypter.java b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeThrottlingDecrypter.java index 8160e6315..f812e7007 100644 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeThrottlingDecrypter.java +++ 
b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeThrottlingDecrypter.java @@ -3,7 +3,7 @@ package org.schabi.newpipe.extractor.services.youtube; import org.schabi.newpipe.extractor.exceptions.ParsingException; import org.schabi.newpipe.extractor.utils.JavaScript; import org.schabi.newpipe.extractor.utils.Parser; -import org.schabi.newpipe.extractor.utils.StringUtils; +import org.schabi.newpipe.extractor.utils.jsextractor.JavaScriptExtractor; import java.util.HashMap; import java.util.Map; @@ -119,21 +119,12 @@ public final class YoutubeThrottlingDecrypter { private static String parseDecodeFunction(final String playerJsCode, final String functionName) throws Parser.RegexException { try { - return parseWithParenthesisMatching(playerJsCode, functionName); + return parseWithLexer(playerJsCode, functionName); } catch (final Exception e) { return parseWithRegex(playerJsCode, functionName); } } - @Nonnull - private static String parseWithParenthesisMatching(final String playerJsCode, - final String functionName) { - final String functionBase = functionName + "=function"; - return validateFunction(functionBase - + StringUtils.matchToClosingParenthesis(playerJsCode, functionBase) - + ";"); - } - @Nonnull private static String parseWithRegex(final String playerJsCode, final String functionName) throws Parser.RegexException { @@ -153,6 +144,14 @@ public final class YoutubeThrottlingDecrypter { return function; } + @Nonnull + private static String parseWithLexer(final String playerJsCode, final String functionName) + throws ParsingException { + final String functionBase = functionName + "=function"; + return functionBase + JavaScriptExtractor.matchToClosingBrace(playerJsCode, functionBase) + + ";"; + } + private static boolean containsNParam(final String url) { return Parser.isMatch(N_PARAM_PATTERN, url); } diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/StringUtils.java 
b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/StringUtils.java deleted file mode 100644 index 9a6091a4d..000000000 --- a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/StringUtils.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.schabi.newpipe.extractor.utils; - -import javax.annotation.Nonnull; - -public final class StringUtils { - - private StringUtils() { - } - - /** - * @param string The string to search in. - * @param start A string from which to start searching. - * @return A substring where each '{' matches a '}'. - * @throws IndexOutOfBoundsException If {@code string} does not contain {@code start} - * or parenthesis could not be matched . - */ - @Nonnull - public static String matchToClosingParenthesis(@Nonnull final String string, - @Nonnull final String start) { - int startIndex = string.indexOf(start); - if (startIndex < 0) { - throw new IndexOutOfBoundsException(); - } - - startIndex += start.length(); - int endIndex = findNextParenthesis(string, startIndex, true); - ++endIndex; - - int openParenthesis = 1; - while (openParenthesis > 0) { - endIndex = findNextParenthesis(string, endIndex, false); - - switch (string.charAt(endIndex)) { - case '{': - ++openParenthesis; - break; - case '}': - --openParenthesis; - break; - default: - break; - } - ++endIndex; - } - - return string.substring(startIndex, endIndex); - } - - private static int findNextParenthesis(@Nonnull final String string, - final int offset, - final boolean onlyOpen) { - boolean lastEscaped = false; - char quote = ' '; - - for (int i = offset; i < string.length(); i++) { - boolean thisEscaped = false; - final char c = string.charAt(i); - - switch (c) { - case '{': - if (quote == ' ') { - return i; - } - break; - case '}': - if (!onlyOpen && quote == ' ') { - return i; - } - break; - case '\\': - if (!lastEscaped) { - thisEscaped = true; - } - break; - case '\'': - case '"': - if (!lastEscaped) { - if (quote == ' ') { - quote = c; - } else if (quote == c) { - quote 
= ' '; - } - } - } - - lastEscaped = thisEscaped; - } - - return -1; - } -} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/JavaScriptExtractor.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/JavaScriptExtractor.java new file mode 100644 index 000000000..da2aadac3 --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/JavaScriptExtractor.java @@ -0,0 +1,50 @@ +package org.schabi.newpipe.extractor.utils.jsextractor; + +import org.schabi.newpipe.extractor.exceptions.ParsingException; + +import javax.annotation.Nonnull; + +/** + * Utility class for extracting functions from JavaScript code. + */ +public final class JavaScriptExtractor { + private JavaScriptExtractor() { + + } + + /** + * Searches the given JavaScript code for the identifier of a function + * and returns its body. + * + * @param jsCode JavaScript code + * @param start start of the function (without the opening brace) + * @return the code after {@code start}, through the token where all braces and parens balance + * @throws ParsingException if {@code start} is not found or no balanced end is reached + */ + @Nonnull + public static String matchToClosingBrace(final String jsCode, final String start) + throws ParsingException { + int startIndex = jsCode.indexOf(start); + if (startIndex < 0) { + throw new ParsingException("Start not found"); + } + startIndex += start.length(); + final String js = jsCode.substring(startIndex); + + final Lexer lexer = new Lexer(js); + boolean visitedOpenBrace = false; + + while (true) { + final Lexer.ParsedToken parsedToken = lexer.getNextToken(); + final Token t = parsedToken.token; + + if (t == Token.LC) { + visitedOpenBrace = true; + } else if (visitedOpenBrace && lexer.isBalanced()) { + return js.substring(0, parsedToken.end); + } else if (t == Token.EOF) { + throw new ParsingException("Could not find matching braces"); + } + } + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java 
b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java new file mode 100644 index 000000000..b92a850cf --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Lexer.java @@ -0,0 +1,311 @@ +package org.schabi.newpipe.extractor.utils.jsextractor; + +import org.mozilla.javascript.Context; +import org.schabi.newpipe.extractor.exceptions.ParsingException; + +import java.util.Stack; + +/** + * JavaScript lexer that is able to parse JavaScript code and return its + * tokens. + * + *

<p> + * The algorithm for distinguishing between division operators and regex literals + * was taken from the RESS lexer. + * </p>

+ */ +public class Lexer { + private static class Paren { + public final boolean funcExpr; + public final boolean conditional; + + Paren(final boolean funcExpr, final boolean conditional) { + this.funcExpr = funcExpr; + this.conditional = conditional; + } + } + + private static class Brace { + public final boolean isBlock; + public final Paren paren; + + Brace(final boolean isBlock, final Paren paren) { + this.isBlock = isBlock; + this.paren = paren; + } + } + + private static class MetaToken { + public final Token token; + public final int lineno; + + MetaToken(final Token token, final int lineno) { + this.token = token; + this.lineno = lineno; + } + } + + private static class BraceMetaToken extends MetaToken { + public final Brace brace; + + BraceMetaToken(final Token token, final int lineno, final Brace brace) { + super(token, lineno); + this.brace = brace; + } + } + + private static class ParenMetaToken extends MetaToken { + public final Paren paren; + + ParenMetaToken(final Token token, final int lineno, final Paren paren) { + super(token, lineno); + this.paren = paren; + } + } + + private static class LookBehind { + private final MetaToken[] list; + + LookBehind() { + list = new MetaToken[3]; + } + + void push(final MetaToken t) { + MetaToken toShift = t; + for (int i = 0; i < 3; i++) { + final MetaToken tmp = list[i]; + list[i] = toShift; + toShift = tmp; + } + } + + MetaToken one() { + return list[0]; + } + + MetaToken two() { + return list[1]; + } + + MetaToken three() { + return list[2]; + } + + boolean oneIs(final Token token) { + return list[0] != null && list[0].token == token; + } + + boolean twoIs(final Token token) { + return list[1] != null && list[1].token == token; + } + + boolean threeIs(final Token token) { + return list[2] != null && list[2].token == token; + } + } + + /** + * Parsed token, containing the token and its position in the input string + */ + public static class ParsedToken { + public final Token token; + public final int start; + 
public final int end; + + ParsedToken(final Token token, final int start, final int end) { + this.token = token; + this.start = start; + this.end = end; + } + } + + private final TokenStream stream; + private final LookBehind lastThree; + private final Stack<Brace> braceStack; + private final Stack<Paren> parenStack; + + /** + * Create a new JavaScript lexer with the given source code + * + * @param js JavaScript code + * @param languageVersion JavaScript version (from Rhino) + */ + public Lexer(final String js, final int languageVersion) { + stream = new TokenStream(js, 0, languageVersion); + lastThree = new LookBehind(); + braceStack = new Stack<>(); + parenStack = new Stack<>(); + } + + /** + * Create a new JavaScript lexer with the given source code + * + * @param js JavaScript code + */ + public Lexer(final String js) { + this(js, Context.VERSION_DEFAULT); + } + + /** + * Continue parsing and return the next token + * @return next token + * @throws ParsingException if tokenizing fails or a closing paren/brace is unmatched + */ + public ParsedToken getNextToken() throws ParsingException { + Token token = stream.nextToken(); + + if ((token == Token.DIV || token == Token.ASSIGN_DIV) && isRegexStart()) { + stream.readRegExp(token); + token = Token.REGEXP; + } + + final ParsedToken parsedToken = new ParsedToken(token, stream.tokenBeg, stream.tokenEnd); + keepBooks(parsedToken); + return parsedToken; + } + + /** + * Check if the parser is balanced (equal amount of open and closed parentheses and braces) + * @return true if balanced + */ + public boolean isBalanced() { + return braceStack.isEmpty() && parenStack.isEmpty(); + } + + /** + * Evaluate the token for possible regex start and handle updating the + * {@code lastThree}, {@code parenStack} and {@code braceStack} + */ + void keepBooks(final ParsedToken parsedToken) throws ParsingException { + if (parsedToken.token.isPunct) { + switch (parsedToken.token) { + case LP: + handleOpenParenBooks(); + return; + case LC: + handleOpenBraceBooks(); + return; + case RP: + 
handleCloseParenBooks(parsedToken.start); + return; + case RC: + handleCloseBraceBooks(parsedToken.start); + return; + } + } + if (parsedToken.token != Token.COMMENT) { + lastThree.push(new MetaToken(parsedToken.token, stream.lineno)); + } + } + + /** + * Handle the book keeping when we find an `(` + */ + void handleOpenParenBooks() { + boolean funcExpr = false; + if (lastThree.oneIs(Token.FUNCTION)) { + funcExpr = lastThree.two() != null && checkForExpression(lastThree.two().token); + } else if (lastThree.twoIs(Token.FUNCTION)) { + funcExpr = lastThree.three() != null && checkForExpression(lastThree.three().token); + } + + final boolean conditional = lastThree.one() != null + && lastThree.one().token.isConditional(); + + final Paren paren = new Paren(funcExpr, conditional); + parenStack.push(paren); + lastThree.push(new ParenMetaToken(Token.LP, stream.lineno, paren)); + } + + /** + * Handle the book keeping when we find an `{` + */ + void handleOpenBraceBooks() { + boolean isBlock = true; + if (lastThree.one() != null) { + switch (lastThree.one().token) { + case LP: + case LC: + case CASE: + isBlock = false; + break; + case COLON: + isBlock = !braceStack.isEmpty() && braceStack.lastElement().isBlock; + break; + case RETURN: + case YIELD: + case YIELD_STAR: + isBlock = lastThree.two() != null && lastThree.two().lineno != stream.lineno; + break; + default: + isBlock = !lastThree.one().token.isOp; + } + } + + Paren paren = null; + if (lastThree.one() instanceof ParenMetaToken && lastThree.one().token == Token.RP) { + paren = ((ParenMetaToken) lastThree.one()).paren; + } + final Brace brace = new Brace(isBlock, paren); + braceStack.push(brace); + lastThree.push(new BraceMetaToken(Token.LC, stream.lineno, brace)); + } + + /** + * Handle the book keeping when we find an `)` + */ + void handleCloseParenBooks(final int start) throws ParsingException { + if (parenStack.isEmpty()) { + throw new ParsingException("unmatched closing paren at " + start); + } + lastThree.push(new 
ParenMetaToken(Token.RP, stream.lineno, parenStack.pop())); + } + + /** + * Handle the book keeping when we find an `}` + */ + void handleCloseBraceBooks(final int start) throws ParsingException { + if (braceStack.isEmpty()) { + throw new ParsingException("unmatched closing brace at " + start); + } + lastThree.push(new BraceMetaToken(Token.RC, stream.lineno, braceStack.pop())); + } + + boolean checkForExpression(final Token token) { + return token.isOp || token == Token.RETURN || token == Token.CASE; + } + + /** + * Detect if the `/` is the beginning of a regex or is division + * see this for more details + * + * @return isRegexStart + */ + boolean isRegexStart() { + if (lastThree.one() != null) { + final Token t = lastThree.one().token; + if (t.isKeyw) { + return t != Token.THIS; + } else if (t == Token.RP && lastThree.one() instanceof ParenMetaToken) { + return ((ParenMetaToken) lastThree.one()).paren.conditional; + } else if (t == Token.RC && lastThree.one() instanceof BraceMetaToken) { + final BraceMetaToken mt = (BraceMetaToken) lastThree.one(); + if (mt.brace.isBlock) { + if (mt.brace.paren != null) { + return !mt.brace.paren.funcExpr; + } else { + return true; + } + } else { + return false; + } + } else if (t.isPunct) { + return t != Token.RB; + } else { + return false; + } + } + + return true; + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Token.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Token.java new file mode 100644 index 000000000..2c4fb414a --- /dev/null +++ b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/Token.java @@ -0,0 +1,121 @@ +package org.schabi.newpipe.extractor.utils.jsextractor; + +public enum Token { + ERROR, + EOF, + EOL, + RETURN(false, false, true), + BITOR(true, true, false), + BITXOR(true, true, false), + BITAND(true, true, false), + EQ(true, true, false), + NE(true, true, false), + LT(true, true, false), + LE(true, true, false), + 
GT(true, true, false), + GE(true, true, false), + LSH(true, true, false), + RSH(true, true, false), + URSH(true, true, false), + ADD(true, true, false), + SUB(true, true, false), + MUL(true, true, false), + DIV(true, true, false), + MOD(true, true, false), + NOT(true, true, false), + BITNOT(true, true, false), + NEW(true, false, true), + DELPROP(true, false, true), + TYPEOF(true, false, true), + NAME, + NUMBER, + STRING, + NULL(false, false, true), + THIS(false, false, true), + FALSE(false, false, true), + TRUE(false, false, true), + SHEQ(true, true, false), // shallow equality (===) + SHNE(true, true, false), // shallow inequality (!==) + REGEXP, + THROW(true, false, true), + IN(true, false, true), + INSTANCEOF(true, false, true), + YIELD(false, false, true), // JS 1.7 yield pseudo keyword + EXP(true, true, false), // Exponentiation Operator + BIGINT, // ES2020 BigInt + TRY(false, false, true), + SEMI(false, true, false), // semicolon + LB(false, true, false), // left and right brackets + RB(false, true, false), + LC(false, true, false), // left and right curlies (braces) + RC(false, true, false), + LP(false, true, false), // left and right parentheses + RP(false, true, false), + COMMA(false, true, false), // comma operator + ASSIGN(true, true, false), // simple assignment (=) + ASSIGN_BITOR(true, true, false), // |= + ASSIGN_BITXOR(true, true, false), // ^= + ASSIGN_BITAND(true, true, false), // &= + ASSIGN_LSH(true, true, false), // <<= + ASSIGN_RSH(true, true, false), // >>= + ASSIGN_URSH(true, true, false), // >>>= + ASSIGN_ADD(true, true, false), // += + ASSIGN_SUB(true, true, false), // -= + ASSIGN_MUL(true, true, false), // *= + ASSIGN_DIV(true, true, false), // /= + ASSIGN_MOD(true, true, false), // %= + ASSIGN_EXP(true, true, false), // **= + HOOK(true, true, false), // conditional (?:) + COLON(true, true, false), + OR(true, true, false), // logical or (||) + AND(true, true, false), // logical and (&&) + INC(true, true, false), // increment/decrement (++ 
--) + DEC(true, true, false), + DOT(false, true, false), // member operator (.) + FUNCTION(false, false, true), // function keyword + EXPORT(false, false, true), // export keyword + IMPORT(false, false, true), // import keyword + IF(false, false, true), // if keyword + ELSE(false, false, true), // else keyword + SWITCH(false, false, true), // switch keyword + CASE(false, false, true), // case keyword + DEFAULT(false, false, true), // default keyword + WHILE(false, false, true), // while keyword + DO(false, false, true), // do keyword + FOR(false, false, true), // for keyword + BREAK(false, false, true), // break keyword + CONTINUE(false, false, true), // continue keyword + VAR(false, false, true), // var keyword + WITH(false, false, true), // with keyword + CATCH(false, false, true), // catch keyword + FINALLY(false, false, true), // finally keyword + VOID(true, false, true), // void keyword + RESERVED(false, false, true), // reserved keywords + LET(false, false, true), // JS 1.7 let pseudo keyword + CONST(false, false, true), + DEBUGGER(false, false, true), + COMMENT, + ARROW(false, true, false), // ES6 ArrowFunction + YIELD_STAR(false, false, true), // ES6 "yield *", a specialization of yield + TEMPLATE_LITERAL; // template literal + + public final boolean isOp; + public final boolean isPunct; + public final boolean isKeyw; + + Token(final boolean isOp, final boolean isPunct, final boolean isKeyw) { + this.isOp = isOp; + this.isPunct = isPunct; + this.isKeyw = isKeyw; + } + + Token() { + this.isOp = false; + this.isPunct = false; + this.isKeyw = false; + } + + public boolean isConditional() { + return this == IF || this == FOR || this == WHILE || this == WITH; + } +} diff --git a/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java new file mode 100644 index 000000000..81651d227 --- /dev/null +++ 
b/extractor/src/main/java/org/schabi/newpipe/extractor/utils/jsextractor/TokenStream.java @@ -0,0 +1,1161 @@ +package org.schabi.newpipe.extractor.utils.jsextractor; + +import org.mozilla.javascript.Context; +import org.mozilla.javascript.Kit; +import org.mozilla.javascript.ObjToIntMap; +import org.mozilla.javascript.ScriptRuntime; +import org.schabi.newpipe.extractor.exceptions.ParsingException; + +/* Source: Mozilla Rhino, org.mozilla.javascript.Token + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * */ +class TokenStream { + /* + * For chars - because we need something out-of-range + * to check. (And checking EOF by exception is annoying.) + * Note distinction from EOF token type! + */ + private static final int EOF_CHAR = -1; + + /* + * Return value for readDigits() to signal the caller has + * to report a number format error. 
+ */ + private static final int REPORT_NUMBER_FORMAT_ERROR = -2; + + private static final char BYTE_ORDER_MARK = '\uFEFF'; + private static final char NUMERIC_SEPARATOR = '_'; + + TokenStream(final String sourceString, final int lineno, final int languageVersion) { + this.sourceString = sourceString; + this.sourceCursor = 0; + this.cursor = 0; + + this.lineno = lineno; + this.languageVersion = languageVersion; + } + + static boolean isKeyword(final String s, final int version, final boolean isStrict) { + return Token.EOF != stringToKeyword(s, version, isStrict); + } + + private static Token stringToKeyword(final String name, final int version, + final boolean isStrict) { + if (version < Context.VERSION_ES6) { + return stringToKeywordForJS(name); + } + return stringToKeywordForES(name, isStrict); + } + + /** JavaScript 1.8 and earlier */ + private static Token stringToKeywordForJS(final String name) { + switch (name) { + case "break": + return Token.BREAK; + case "case": + return Token.CASE; + case "continue": + return Token.CONTINUE; + case "default": + return Token.DEFAULT; + case "delete": + return Token.DELPROP; + case "do": + return Token.DO; + case "else": + return Token.ELSE; + case "export": + return Token.EXPORT; + case "false": + return Token.FALSE; + case "for": + return Token.FOR; + case "function": + return Token.FUNCTION; + case "if": + return Token.IF; + case "in": + return Token.IN; + case "let": + return Token.LET; + case "new": + return Token.NEW; + case "null": + return Token.NULL; + case "return": + return Token.RETURN; + case "switch": + return Token.SWITCH; + case "this": + return Token.THIS; + case "true": + return Token.TRUE; + case "typeof": + return Token.TYPEOF; + case "var": + return Token.VAR; + case "void": + return Token.VOID; + case "while": + return Token.WHILE; + case "with": + return Token.WITH; + case "yield": + return Token.YIELD; + case "throw": + return Token.THROW; + case "catch": + return Token.CATCH; + case "const": + return 
Token.CONST; + case "debugger": + return Token.DEBUGGER; + case "finally": + return Token.FINALLY; + case "instanceof": + return Token.INSTANCEOF; + case "try": + return Token.TRY; + case "abstract": + case "boolean": + case "byte": + case "char": + case "class": + case "double": + case "enum": + case "extends": + case "final": + case "float": + case "goto": + case "implements": + case "import": + case "int": + case "interface": + case "long": + case "native": + case "package": + case "private": + case "protected": + case "public": + case "short": + case "static": + case "super": + case "synchronized": + case "throws": + case "transient": + case "volatile": + return Token.RESERVED; + } + return Token.EOF; + } + + /** ECMAScript 6. */ + private static Token stringToKeywordForES(final String name, final boolean isStrict) { + switch (name) { + case "break": + return Token.BREAK; + case "case": + return Token.CASE; + case "catch": + return Token.CATCH; + case "const": + return Token.CONST; + case "continue": + return Token.CONTINUE; + case "debugger": + return Token.DEBUGGER; + case "default": + return Token.DEFAULT; + case "delete": + return Token.DELPROP; + case "do": + return Token.DO; + case "else": + return Token.ELSE; + case "export": + return Token.EXPORT; + case "finally": + return Token.FINALLY; + case "for": + return Token.FOR; + case "function": + return Token.FUNCTION; + case "if": + return Token.IF; + case "import": + return Token.IMPORT; + case "in": + return Token.IN; + case "instanceof": + return Token.INSTANCEOF; + case "new": + return Token.NEW; + case "return": + return Token.RETURN; + case "switch": + return Token.SWITCH; + case "this": + return Token.THIS; + case "throw": + return Token.THROW; + case "try": + return Token.TRY; + case "typeof": + return Token.TYPEOF; + case "var": + return Token.VAR; + case "void": + return Token.VOID; + case "while": + return Token.WHILE; + case "with": + return Token.WITH; + case "yield": + return Token.YIELD; + 
case "false": + return Token.FALSE; + case "null": + return Token.NULL; + case "true": + return Token.TRUE; + case "let": + return Token.LET; + case "class": + case "extends": + case "super": + case "await": + case "enum": + return Token.RESERVED; + case "implements": + case "interface": + case "package": + case "private": + case "protected": + case "public": + case "static": + if (isStrict) { + return Token.RESERVED; + } + break; + } + return Token.EOF; + } + + @SuppressWarnings("checkstyle:MethodLength") + final Token getToken() throws ParsingException { + int c; + + for (;;) { + // Eat whitespace, possibly sensitive to newlines. + for (;;) { + c = getChar(); + if (c == EOF_CHAR) { + tokenBeg = cursor - 1; + tokenEnd = cursor; + return Token.EOF; + } else if (c == '\n') { + dirtyLine = false; + tokenBeg = cursor - 1; + tokenEnd = cursor; + return Token.EOL; + } else if (!isJSSpace(c)) { + if (c != '-') { + dirtyLine = true; + } + break; + } + } + + // Assume the token will be 1 char - fixed up below. + tokenBeg = cursor - 1; + tokenEnd = cursor; + + // identifier/keyword/instanceof? + // watch out for starting with a + final boolean identifierStart; + boolean isUnicodeEscapeStart = false; + if (c == '\\') { + c = getChar(); + if (c == 'u') { + identifierStart = true; + isUnicodeEscapeStart = true; + stringBufferTop = 0; + } else { + identifierStart = false; + ungetChar(c); + c = '\\'; + } + } else { + identifierStart = Character.isJavaIdentifierStart((char) c); + if (identifierStart) { + stringBufferTop = 0; + addToString(c); + } + } + + if (identifierStart) { + boolean containsEscape = isUnicodeEscapeStart; + for (;;) { + if (isUnicodeEscapeStart) { + // strictly speaking we should probably push-back + // all the bad characters if the uXXXX + // sequence is malformed. But since there isn't a + // correct context(is there?) for a bad Unicode + // escape sequence in an identifier, we can report + // an error here. 
+ int escapeVal = 0; + for (int i = 0; i != 4; ++i) { + c = getChar(); + escapeVal = Kit.xDigitToInt(c, escapeVal); + // Next check takes care about c < 0 and bad escape + if (escapeVal < 0) { + break; + } + } + if (escapeVal < 0) { + throw new ParsingException("invalid unicode escape"); + } + addToString(escapeVal); + isUnicodeEscapeStart = false; + } else { + c = getChar(); + if (c == '\\') { + c = getChar(); + if (c == 'u') { + isUnicodeEscapeStart = true; + containsEscape = true; + } else { + throw new ParsingException( + String.format("illegal character: '%c'", c)); + } + } else { + if (c == EOF_CHAR + || c == BYTE_ORDER_MARK + || !Character.isJavaIdentifierPart((char) c)) { + break; + } + addToString(c); + } + } + } + ungetChar(c); + + String str = getStringFromBuffer(); + if (!containsEscape) { + // OPT we shouldn't have to make a string (object!) to + // check if it's a keyword. + + // Return the corresponding token if it's a keyword + Token result = stringToKeyword(str, languageVersion, STRICT_MODE); + if (result != Token.EOF) { + if ((result == Token.LET || result == Token.YIELD) + && languageVersion < Context.VERSION_1_7) { + // LET and YIELD are tokens only in 1.7 and later + string = result == Token.LET ? "let" : "yield"; + result = Token.NAME; + } + // Save the string in case we need to use in + // object literal definitions. + this.string = (String) allStrings.intern(str); + if (result != Token.RESERVED) { + return result; + } else if (languageVersion >= Context.VERSION_ES6) { + return result; + } else if (!IS_RESERVED_KEYWORD_AS_IDENTIFIER) { + return result; + } + } + } else if (isKeyword( + str, + languageVersion, + STRICT_MODE)) { + // If a string contains unicodes, and converted to a keyword, + // we convert the last character back to unicode + str = convertLastCharToHex(str); + } + this.string = (String) allStrings.intern(str); + return Token.NAME; + } + + // is it a number? + if (isDigit(c) || (c == '.' 
&& isDigit(peekChar()))) { + stringBufferTop = 0; + int base = 10; + final boolean es6 = languageVersion >= Context.VERSION_ES6; + boolean isOldOctal = false; + + if (c == '0') { + c = getChar(); + if (c == 'x' || c == 'X') { + base = 16; + c = getChar(); + } else if (es6 && (c == 'o' || c == 'O')) { + base = 8; + c = getChar(); + } else if (es6 && (c == 'b' || c == 'B')) { + base = 2; + c = getChar(); + } else if (isDigit(c)) { + base = 8; + isOldOctal = true; + } else { + addToString('0'); + } + } + + final int emptyDetector = stringBufferTop; + if (base == 10 || base == 16 || (base == 8 && !isOldOctal) || base == 2) { + c = readDigits(base, c); + if (c == REPORT_NUMBER_FORMAT_ERROR) { + throw new ParsingException("number format error"); + } + } else { + while (isDigit(c)) { + // finally the oldOctal case + if (c >= '8') { + /* + * We permit 08 and 09 as decimal numbers, which + * makes our behavior a superset of the ECMA + * numeric grammar. We might not always be so + * permissive, so we warn about it. + */ + base = 10; + + c = readDigits(base, c); + if (c == REPORT_NUMBER_FORMAT_ERROR) { + throw new ParsingException("number format error"); + } + break; + } + addToString(c); + c = getChar(); + } + } + if (stringBufferTop == emptyDetector && base != 10) { + throw new ParsingException("number format error"); + } + + if (es6 && c == 'n') { + c = getChar(); + } else if (base == 10 && (c == '.' 
|| c == 'e' || c == 'E')) { + if (c == '.') { + addToString(c); + c = getChar(); + c = readDigits(base, c); + if (c == REPORT_NUMBER_FORMAT_ERROR) { + throw new ParsingException("number format error"); + } + } + if (c == 'e' || c == 'E') { + addToString(c); + c = getChar(); + if (c == '+' || c == '-') { + addToString(c); + c = getChar(); + } + if (!isDigit(c)) { + throw new ParsingException("missing exponent"); + } + c = readDigits(base, c); + if (c == REPORT_NUMBER_FORMAT_ERROR) { + throw new ParsingException("number format error"); + } + } + } + ungetChar(c); + this.string = getStringFromBuffer(); + return Token.NUMBER; + } + + // is it a string or template literal? + if (c == '"' || c == '\'' || c == '`') { + // We attempt to accumulate a string the fast way, by + // building it directly out of the reader. But if there + // are any escaped characters in the string, we revert to + // building it out of a StringBuffer. + + // delimiter for last string literal scanned + final int quoteChar = c; + stringBufferTop = 0; + + c = getCharIgnoreLineEnd(false); + strLoop: + while (c != quoteChar) { + boolean unterminated = false; + if (c == EOF_CHAR) { + unterminated = true; + } else if (c == '\n') { + switch (lineEndChar) { + case '\n': + case '\r': + unterminated = true; + break; + case 0x2028: // + case 0x2029: // + // Line/Paragraph separators need to be included as is + c = lineEndChar; + break; + default: + break; + } + } + + if (unterminated) { + throw new ParsingException("unterminated string literal"); + } + + if (c == '\\') { + // We've hit an escaped character + int escapeVal; + + c = getChar(); + switch (c) { + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + + // \v a late addition to the ECMA spec, + // it is not in Java, so use 0xb + case 'v': + c = 0xb; + break; + + case 'u': + // Get 4 hex digits; if the u escape is not + // followed by 4 hex 
digits, use 'u' + the + // literal character sequence that follows. + final int escapeStart = stringBufferTop; + addToString('u'); + escapeVal = 0; + for (int i = 0; i != 4; ++i) { + c = getChar(); + escapeVal = Kit.xDigitToInt(c, escapeVal); + if (escapeVal < 0) { + continue strLoop; + } + addToString(c); + } + // prepare for replace of stored 'u' sequence + // by escape value + stringBufferTop = escapeStart; + c = escapeVal; + break; + case 'x': + // Get 2 hex digits, defaulting to 'x'+literal + // sequence, as above. + c = getChar(); + escapeVal = Kit.xDigitToInt(c, 0); + if (escapeVal < 0) { + addToString('x'); + continue strLoop; + } + final int c1 = c; + c = getChar(); + escapeVal = Kit.xDigitToInt(c, escapeVal); + if (escapeVal < 0) { + addToString('x'); + addToString(c1); + continue strLoop; + } + // got 2 hex digits + c = escapeVal; + break; + + case '\n': + // Remove line terminator after escape to follow + // SpiderMonkey and C/C++ + c = getChar(); + continue strLoop; + + default: + if ('0' <= c && c < '8') { + int val = c - '0'; + c = getChar(); + if ('0' <= c && c < '8') { + val = 8 * val + c - '0'; + c = getChar(); + if ('0' <= c && c < '8' && val <= 037) { + // c is 3rd char of octal sequence only + // if the resulting val <= 0377 + val = 8 * val + c - '0'; + c = getChar(); + } + } + ungetChar(c); + c = val; + } + } + } + addToString(c); + c = getChar(false); + } + + final String str = getStringFromBuffer(); + this.string = (String) allStrings.intern(str); + return quoteChar == '`' ? 
Token.TEMPLATE_LITERAL : Token.STRING; + } + + switch (c) { + case ';': + return Token.SEMI; + case '[': + return Token.LB; + case ']': + return Token.RB; + case '{': + return Token.LC; + case '}': + return Token.RC; + case '(': + return Token.LP; + case ')': + return Token.RP; + case ',': + return Token.COMMA; + case '?': + return Token.HOOK; + case ':': + return Token.COLON; + case '.': + return Token.DOT; + + case '|': + if (matchChar('|')) { + return Token.OR; + } else if (matchChar('=')) { + return Token.ASSIGN_BITOR; + } else { + return Token.BITOR; + } + + case '^': + if (matchChar('=')) { + return Token.ASSIGN_BITXOR; + } + return Token.BITXOR; + + case '&': + if (matchChar('&')) { + return Token.AND; + } else if (matchChar('=')) { + return Token.ASSIGN_BITAND; + } else { + return Token.BITAND; + } + + case '=': + if (matchChar('=')) { + if (matchChar('=')) { + return Token.SHEQ; + } + return Token.EQ; + } else if (matchChar('>')) { + return Token.ARROW; + } else { + return Token.ASSIGN; + } + + case '!': + if (matchChar('=')) { + if (matchChar('=')) { + return Token.SHNE; + } + return Token.NE; + } + return Token.NOT; + + case '<': + /* NB:treat HTML begin-comment as comment-till-eol */ + if (matchChar('!')) { + if (matchChar('-')) { + if (matchChar('-')) { + tokenBeg = cursor - 4; + skipLine(); + return Token.COMMENT; + } + ungetCharIgnoreLineEnd('-'); + } + ungetCharIgnoreLineEnd('!'); + } + if (matchChar('<')) { + if (matchChar('=')) { + return Token.ASSIGN_LSH; + } + return Token.LSH; + } + if (matchChar('=')) { + return Token.LE; + } + return Token.LT; + + case '>': + if (matchChar('>')) { + if (matchChar('>')) { + if (matchChar('=')) { + return Token.ASSIGN_URSH; + } + return Token.URSH; + } + if (matchChar('=')) { + return Token.ASSIGN_RSH; + } + return Token.RSH; + } + if (matchChar('=')) { + return Token.GE; + } + return Token.GT; + + case '*': + if (languageVersion >= Context.VERSION_ES6) { + if (matchChar('*')) { + if (matchChar('=')) { + return 
Token.ASSIGN_EXP; + } + return Token.EXP; + } + } + if (matchChar('=')) { + return Token.ASSIGN_MUL; + } + return Token.MUL; + + case '/': + // is it a // comment? + if (matchChar('/')) { + tokenBeg = cursor - 2; + skipLine(); + return Token.COMMENT; + } + // is it a /* or /** comment? + if (matchChar('*')) { + boolean lookForSlash = false; + tokenBeg = cursor - 2; + if (matchChar('*')) { + lookForSlash = true; + } + for (;;) { + c = getChar(); + if (c == EOF_CHAR) { + tokenEnd = cursor - 1; + throw new ParsingException("unterminated comment"); + } else if (c == '*') { + lookForSlash = true; + } else if (c == '/') { + if (lookForSlash) { + tokenEnd = cursor; + return Token.COMMENT; + } + } else { + lookForSlash = false; + tokenEnd = cursor; + } + } + } + + if (matchChar('=')) { + return Token.ASSIGN_DIV; + } + return Token.DIV; + + case '%': + if (matchChar('=')) { + return Token.ASSIGN_MOD; + } + return Token.MOD; + + case '~': + return Token.BITNOT; + + case '+': + if (matchChar('=')) { + return Token.ASSIGN_ADD; + } else if (matchChar('+')) { + return Token.INC; + } else { + return Token.ADD; + } + + case '-': + Token t = Token.SUB; + if (matchChar('=')) { + t = Token.ASSIGN_SUB; + } else if (matchChar('-')) { + if (!dirtyLine) { + // treat HTML end-comment after possible whitespace + // after line start as comment-until-eol + if (matchChar('>')) { + skipLine(); + return Token.COMMENT; + } + } + t = Token.DEC; + } + dirtyLine = true; + return t; + + default: + throw new ParsingException(String.format("illegal character: '%c'", c)); + } + } + } + + /* + * Helper to read the next digits according to the base + * and ignore the number separator if there is one. 
+     */
+    private int readDigits(final int base, final int firstC) {
+        // Nothing to consume: hand the character straight back to the caller.
+        if (!isDigit(base, firstC)) {
+            return firstC;
+        }
+        addToString(firstC);
+
+        int next = getChar();
+        while (next != EOF_CHAR) {
+            if (next == NUMERIC_SEPARATOR) {
+                // No peeking: read optimistically and roll back below if the
+                // separator turns out not to be followed by a digit.
+                final int afterSeparator = getChar();
+                if (afterSeparator == '\n' || afterSeparator == EOF_CHAR) {
+                    // a separator right before the end of the line/input is an error
+                    return REPORT_NUMBER_FORMAT_ERROR;
+                }
+                if (!isDigit(base, afterSeparator)) {
+                    // the separator is not part of this number, undo the read
+                    ungetChar(afterSeparator);
+                    return NUMERIC_SEPARATOR;
+                }
+                addToString(NUMERIC_SEPARATOR);
+                next = afterSeparator;
+            } else if (!isDigit(base, next)) {
+                // first character after the digit run, returned unconsumed in the buffer
+                return next;
+            } else {
+                addToString(next);
+                next = getChar();
+            }
+        }
+        return EOF_CHAR;
+    }
+
+    /** Tells whether {@code c} is an ASCII letter ({@code A-Z} or {@code a-z}). */
+    private static boolean isAlpha(final int c) {
+        return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
+    }
+
+    /**
+     * Tells whether {@code c} is a digit in the given numeric base.
+     * Only the bases 2, 8, 10 and 16 are known; any other base never matches.
+     */
+    private static boolean isDigit(final int base, final int c) {
+        switch (base) {
+            case 10:
+                return isDigit(c);
+            case 16:
+                return isHexDigit(c);
+            case 8:
+                return isOctalDigit(c);
+            case 2:
+                return isDualDigit(c);
+            default:
+                return false;
+        }
+    }
+
+    /** Binary digit: {@code 0} or {@code 1}. */
+    private static boolean isDualDigit(final int c) {
+        return c == '0' || c == '1';
+    }
+
+    /** Octal digit: {@code 0} to {@code 7}. */
+    private static boolean isOctalDigit(final int c) {
+        return c >= '0' && c <= '7';
+    }
+
+    /** Decimal digit: {@code 0} to {@code 9}. */
+    private static boolean isDigit(final int c) {
+        return c >= '0' && c <= '9';
+    }
+
+    /** Hexadecimal digit: {@code 0-9}, {@code a-f} or {@code A-F}. */
+    private static boolean isHexDigit(final int c) {
+        if (isDigit(c)) {
+            return true;
+        }
+        final boolean lowerCaseHex = c >= 'a' && c <= 'f';
+        final boolean upperCaseHex = c >= 'A' && c <= 'F';
+        return lowerCaseHex || upperCaseHex;
+    }
+
+    /* As defined in ECMA. jsscan.c uses C isspace() (which allows
+     * \v, I think.) note that code in getChar() implicitly accepts
+     * '\r' == \u000D as well.
+     */
+    private static boolean isJSSpace(final int c) {
+        if (c <= 127) {
+            // ASCII range: space, tab, form feed and vertical tab
+            return c == 0x20 || c == 0x9 || c == 0xC || c == 0xB;
+        }
+        // no-break space, byte order mark or any Unicode space separator
+        return c == 0xA0
+                || c == BYTE_ORDER_MARK
+                || Character.getType((char) c) == Character.SPACE_SEPARATOR;
+    }
+
+    /** Tells whether {@code c} is a non-ASCII character of the Unicode FORMAT category. */
+    private static boolean isJSFormatChar(final int c) {
+        return c > 127 && Character.getType((char) c) == Character.FORMAT;
+    }
+
+    /**
+     * Parser calls the method when it gets / or /= in literal context.
+     *
+     * <p>Reads the rest of a regular expression literal. Afterwards {@code string} holds the
+     * buffered pattern (without the flags) and {@code tokenEnd} is set to the start of the
+     * token plus the buffered length plus two for the slashes.
+     *
+     * @param startToken the token that was scanned for the leading slash, either
+     *                   {@link Token#DIV} or {@link Token#ASSIGN_DIV}
+     * @throws ParsingException if the literal is not terminated on the current line or is
+     *                          followed by a letter that is not a known flag
+     */
+    void readRegExp(final Token startToken) throws ParsingException {
+        final int start = tokenBeg;
+        stringBufferTop = 0;
+        if (startToken == Token.ASSIGN_DIV) {
+            // Mis-scanned /=, the '=' is the first character of the pattern
+            addToString('=');
+        } else {
+            if (startToken != Token.DIV) {
+                Kit.codeBug();
+            }
+            if (peekChar() == '*') {
+                tokenEnd = cursor - 1;
+                this.string = new String(stringBuffer, 0, stringBufferTop);
+                throw new ParsingException("unterminated regular expression literal");
+            }
+        }
+
+        boolean inCharSet = false; // true if inside a '['..']' pair
+        int c;
+        // a '/' only terminates the literal outside of a character set
+        while ((c = getChar()) != '/' || inCharSet) {
+            if (c == '\n' || c == EOF_CHAR) {
+                throw new ParsingException("unterminated regular expression literal");
+            }
+            if (c == '\\') {
+                // keep the backslash and take the escaped character as is
+                addToString(c);
+                c = getChar();
+                if (c == '\n' || c == EOF_CHAR) {
+                    throw new ParsingException("unterminated regular expression literal");
+                }
+            } else if (c == '[') {
+                inCharSet = true;
+            } else if (c == ']') {
+                inCharSet = false;
+            }
+            addToString(c);
+        }
+        final int reEnd = stringBufferTop;
+
+        while (true) {
+            c = getCharIgnoreLineEnd();
+            // flags defined by ECMAScript, including 'd' (ES2022) and 'v' (ES2024)
+            if ("dgimsuvy".indexOf(c) != -1) {
+                addToString(c);
+            } else if (isAlpha(c)) {
+                throw new ParsingException(
+                        String.format("invalid regular expression flag: '%c'", c));
+            } else {
+                ungetCharIgnoreLineEnd(c);
+                break;
+            }
+        }
+
+        tokenEnd = start + stringBufferTop + 2; // include slashes
+        this.string = new String(stringBuffer, 0, reEnd);
+    }
+
+    /** Sets {@code tokenEnd} to the cursor and returns the buffered characters as a string. */
+    private String getStringFromBuffer() {
+        tokenEnd = cursor;
+        return new String(stringBuffer, 0, stringBufferTop);
+    }
+
+    /** Appends {@code c} (narrowed to a char) to the buffer, doubling its size when full. */
+    private void addToString(final int c) {
+        final int n = stringBufferTop;
+        if (n == stringBuffer.length) {
+            final char[] tmp = new char[stringBuffer.length * 2];
+            System.arraycopy(stringBuffer, 0, tmp, 0, n);
+            stringBuffer = tmp;
+        }
+        stringBuffer[n] = (char) c;
+        stringBufferTop = n + 1;
+    }
+
+    /** Pushes {@code c} back so that the next read returns it again. */
+    private void ungetChar(final int c) {
+        // cannot unread across a line boundary
+        if (ungetCursor != 0 && ungetBuffer[ungetCursor - 1] == '\n') {
+            Kit.codeBug();
+        }
+        ungetBuffer[ungetCursor++] = c;
+        cursor--;
+    }
+
+    /**
+     * Consumes the next character if it equals {@code test} and moves {@code tokenEnd} to
+     * the cursor; otherwise the character is pushed back.
+     *
+     * @return whether the character matched and was consumed
+     */
+    private boolean matchChar(final int test) {
+        final int c = getCharIgnoreLineEnd();
+        if (c == test) {
+            tokenEnd = cursor;
+            return true;
+        }
+        ungetCharIgnoreLineEnd(c);
+        return false;
+    }
+
+    /** Returns the next character without consuming it. */
+    private int peekChar() {
+        final int c = getChar();
+        ungetChar(c);
+        return c;
+    }
+
+    /** Reads the next character, skipping format characters and tracking line ends. */
+    private int getChar() {
+        return getChar(true, false);
+    }
+
+    /** Reads the next character while tracking line ends. */
+    private int getChar(final boolean skipFormattingChars) {
+        return getChar(skipFormattingChars, false);
+    }
+
+    /**
+     * Reads the next character, serving pushed back characters first.
+     *
+     * <p>Line terminators are returned as {@code '\n'}; the character actually read is kept
+     * in {@code lineEndChar}.
+     *
+     * @param skipFormattingChars whether characters matched by {@link #isJSFormatChar(int)}
+     *                            are skipped
+     * @param ignoreLineEnd       if {@code false}, a pending line end is processed first: a
+     *                            {@code '\n'} directly after a {@code '\r'} is skipped,
+     *                            otherwise {@code lineStart} and {@code lineno} are updated
+     * @return the character read or {@code EOF_CHAR} at the end of the source
+     */
+    private int getChar(final boolean skipFormattingChars, final boolean ignoreLineEnd) {
+        if (ungetCursor != 0) {
+            cursor++;
+            return ungetBuffer[--ungetCursor];
+        }
+
+        for (;;) {
+            if (sourceCursor == sourceString.length()) {
+                hitEOF = true;
+                return EOF_CHAR;
+            }
+            cursor++;
+            int c = sourceString.charAt(sourceCursor++);
+
+            if (!ignoreLineEnd && lineEndChar >= 0) {
+                if (lineEndChar == '\r' && c == '\n') {
+                    // second half of a \r\n pair, already reported as a line end
+                    lineEndChar = '\n';
+                    continue;
+                }
+                lineEndChar = -1;
+                lineStart = sourceCursor - 1;
+                lineno++;
+            }
+
+            if (c <= 127) {
+                if (c == '\n' || c == '\r') {
+                    lineEndChar = c;
+                    c = '\n';
+                }
+            } else {
+                if (c == BYTE_ORDER_MARK) {
+                    return c; // BOM is considered whitespace
+                }
+                if (skipFormattingChars && isJSFormatChar(c)) {
+                    continue;
+                }
+                if (ScriptRuntime.isJSLineTerminator(c)) {
+                    lineEndChar = c;
+                    c = '\n';
+                }
+            }
+            return c;
+        }
+    }
+
+    /** Reads the next character, skipping format characters, without line end processing. */
+    private int getCharIgnoreLineEnd() {
+        return getChar(true, true);
+    }
+
+    /** Reads the next character without line end processing. */
+    private int getCharIgnoreLineEnd(final boolean skipFormattingChars) {
+        return getChar(skipFormattingChars, true);
+    }
+
+    /** Pushes {@code c} back without the line boundary check done by {@code ungetChar}. */
+    private void ungetCharIgnoreLineEnd(final int c) {
+        ungetBuffer[ungetCursor++] = c;
+        cursor--;
+    }
+
+    /** Consumes the rest of the line; the terminating character is pushed back. */
+    @SuppressWarnings("checkstyle:emptyblock")
+    private void skipLine() {
+        // skip to end of line
+        int c;
+        while ((c = getChar()) != EOF_CHAR && c != '\n') { }
+        ungetChar(c);
+        tokenEnd = cursor;
+    }
+
+    /** Return the current position of the scanner cursor. */
+    public int getCursor() {
+        return cursor;
+    }
+
+    /** Return the absolute source offset of the last scanned token. */
+    public int getTokenBeg() {
+        return tokenBeg;
+    }
+
+    /** Return the absolute source end-offset of the last scanned token. */
+    public int getTokenEnd() {
+        return tokenEnd;
+    }
+
+    /** Return tokenEnd - tokenBeg */
+    public int getTokenLength() {
+        return tokenEnd - tokenBeg;
+    }
+
+    /** Return the source text between tokenBeg and tokenEnd. */
+    public String getTokenRaw() {
+        return sourceString.substring(tokenBeg, tokenEnd);
+    }
+
+    /**
+     * Returns {@code str} with its last character replaced by a backslash-u escape made of
+     * the character's hex code, left-padded with zeros to four digits.
+     * Expects a non-empty string.
+     */
+    private static String convertLastCharToHex(final String str) {
+        final int lastIndex = str.length() - 1;
+        final StringBuilder buf = new StringBuilder(str.substring(0, lastIndex));
+        buf.append("\\u");
+        final String hexCode = Integer.toHexString(str.charAt(lastIndex));
+        for (int i = 0; i < 4 - hexCode.length(); ++i) {
+            buf.append('0');
+        }
+        buf.append(hexCode);
+        return buf.toString();
+    }
+
+    /**
+     * Return the next token, skipping {@link Token#EOL} and {@link Token#COMMENT} tokens.
+     *
+     * @throws ParsingException if the underlying scanner cannot read a token
+     */
+    public Token nextToken() throws ParsingException {
+        Token tt = getToken();
+        while (tt == Token.EOL || tt == Token.COMMENT) {
+            tt = getToken();
+        }
+        return tt;
+    }
+
+    // stuff other than whitespace since start of line
+    private boolean dirtyLine;
+    private String string = "";
+
+    private char[] stringBuffer = new char[128];
+    private int stringBufferTop;
+    private final ObjToIntMap allStrings = new ObjToIntMap(50);
+
+    // Room to backtrace from to < on failed match of the last - in