|
22 | 22 | import com.google.javascript.jscomp.parsing.parser.util.ErrorReporter; |
23 | 23 | import com.google.javascript.jscomp.parsing.parser.util.SourcePosition; |
24 | 24 | import com.google.javascript.jscomp.parsing.parser.util.SourceRange; |
| 25 | +import com.google.javascript.jscomp.parsing.parser.util.UnicodeMatch; |
25 | 26 | import java.util.ArrayList; |
26 | 27 | import javax.annotation.Nullable; |
27 | 28 |
|
@@ -869,111 +870,18 @@ private static String processUnicodeEscapes(String value) { |
869 | 870 | return value; |
870 | 871 | } |
871 | 872 |
|
872 | | - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
| 873 | + /** |
| 874 | + * Delegates to UnicodeMatch.isJavascriptIdentifierStart (interface from UnicodeRegex). Includes old optimizations. |
| 875 | + */ |
873 | 876 | private static boolean isIdentifierStart(char ch) { |
874 | | - // Most code is written in pure ASCII, so create a fast path here. |
875 | | - if (ch <= 127) { |
876 | | - // Intentionally avoiding short circuiting behavior of "||" and "&&". |
877 | | - // This minimizes branches in this code which minimizes branch prediction misses. |
878 | | - return ((ch >= 'A' & ch <= 'Z') | (ch >= 'a' & ch <= 'z') | (ch == '_' | ch == '$')); |
879 | | - } |
880 | | - |
881 | | - // Handle non-ASCII characters. |
882 | | - // TODO(tjgq): This should include all characters with the ID_Start property. |
883 | | - if (Character.isLetter(ch)) { |
884 | | - return true; |
885 | | - } |
886 | | - |
887 | | - // Workaround for b/36459436. |
888 | | - // When running under GWT/J2CL, Character.isLetter only handles ASCII. |
889 | | - // Angular relies heavily on Latin Small Letter Barred O and Greek Capital Letter Delta. |
890 | | - // Greek letters are occasionally found in math code. |
891 | | - // Latin letters are found in our own tests. |
892 | | - return (ch >= 0x00C0 & ch <= 0x00D6) // Latin letters |
893 | | - // 0x00D7 = multiplication sign, not a letter |
894 | | - | (ch >= 0x00D8 & ch <= 0x00F6) // Latin letters |
895 | | - // 0x00F7 = division sign, not a letter |
896 | | - | (ch >= 0x00F8 & ch <= 0x00FF) // Latin letters |
897 | | - | ch == 0x0275 // Latin Barred O |
898 | | - | (ch >= 0x0391 & ch <= 0x03A1) // Greek uppercase letters |
899 | | - // 0x03A2 = unassigned |
900 | | - | (ch >= 0x03A3 & ch <= 0x03A9) // Remaining Greek uppercase letters |
901 | | - | (ch >= 0x03B1 & ch <= 0x03C9); // Greek lowercase letters |
902 | | - } |
903 | | - |
904 | | - // Check if char is Unicode Category "Combining spacing mark (Mc)" |
905 | | - // This list is not exhaustive! |
906 | | - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
907 | | - private static boolean isCombiningMark(char ch) { |
908 | | - return ( |
909 | | - // 0300-036F |
910 | | - (0x0300 <= ch & ch <= 0x036F) | |
911 | | - // 1AB0–1AFF |
912 | | - (0x1AB0 <= ch & ch <= 0x1AFF) | |
913 | | - // 1DC0–1DFF |
914 | | - (0x1DC0 <= ch & ch <= 0x1DFF) | |
915 | | - // 20D0–20FF |
916 | | - (0x20D0 <= ch & ch <= 0x20FF) | |
917 | | - // FE20–FE2F |
918 | | - (0xFE20 <= ch & ch <= 0xFE2F) |
919 | | - ); |
920 | | - // TODO (ctjl): Implement in a more reliable and future-proofed way, i.e.: |
921 | | - // return Character.getType(ch) == Character.NON_SPACING_MARK; |
922 | | - } |
923 | | - |
924 | | - // TODO (ctjl): Implement |
925 | | - private static boolean isConnectorPunctuation() { |
926 | | - return true; |
| 877 | + return UnicodeMatch.isJavascriptIdentifierStart(ch); |
927 | 878 | } |
928 | | - |
929 | | - // TODO (ctjl): Implement |
930 | | - private static boolean isZeroWidthJoiner() { |
931 | | - return true; |
932 | | - } |
933 | | - |
934 | | - // TODO (ctjl): Implement |
935 | | - private static boolean isZeroWidthNonJoiner() { |
936 | | - return true; |
937 | | - } |
938 | | - |
939 | | - @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code |
| 879 | + |
| 880 | + /** |
| 881 | + * Delegates to UnicodeMatch.isJavascriptIdentifierPart (interface from UnicodeRegex). Includes old optimizations. |
| 882 | + */ |
940 | 883 | private static boolean isIdentifierPart(char ch) { |
941 | | - /** |
942 | | - https://www.ecma-international.org/ecma-262/5.1/#sec-7.6 |
943 | | - IdentifierPart :: |
944 | | - IdentifierStart |
945 | | - ✓ isIdentifierPart() |
946 | | -
|
947 | | - UnicodeCombiningMark |
948 | | - ✓ isCombiningMark() |
949 | | -
|
950 | | - UnicodeDigit |
951 | | - ✓ Character.isDigit() |
952 | | -
|
953 | | - UnicodeConnectorPunctuation |
954 | | - ✓ isConnectorPunctuation() |
955 | | -
|
956 | | - <ZWNJ> |
957 | | - ✓ isZeroWidthNonJoiner() |
958 | | - |
959 | | - <ZWJ> |
960 | | - ✓ isZeroWidthJoiner() |
961 | | - */ |
962 | | - |
963 | | - // Most code is written in pure ASCII, so create a fast path here. |
964 | | - if (ch <= 127) { |
965 | | - return ((ch >= 'A' & ch <= 'Z') |
966 | | - | (ch >= 'a' & ch <= 'z') |
967 | | - | (ch >= '0' & ch <= '9') |
968 | | - | (ch == '_' | ch == '$')); |
969 | | - } |
970 | | - |
971 | | - // Handle non-ASCII characters. |
972 | | - // TODO(tjgq): This should include all characters with the ID_Continue property, plus |
973 | | - // TODO(ctjl): Implement remaining grammar (zero-width joiners, etc.) |
974 | | - return isIdentifierStart(ch) |
975 | | - || isCombiningMark(ch) |
976 | | - || Character.isDigit(ch); |
| 884 | + return UnicodeMatch.isJavascriptIdentifierPart(ch); |
977 | 885 | } |
978 | 886 |
|
979 | 887 | private Token scanStringLiteral(int beginIndex, char terminator) { |
|
0 commit comments