From 7edf5b798385006eaf58efbcaa42f42081f7e849 Mon Sep 17 00:00:00 2001 From: Colin Fleming Date: Thu, 28 Jul 2016 15:43:35 +1200 Subject: [PATCH] Add support for Java Character.isJavaIdentifierStart() and Character.isJavaIdentifierPart() character classes via \p{javaJavaIdentifierStart} and \p{javaJavaIdentifierPart}. --- java/com/google/re2j/JavaCategoryTables.java | 815 ++++++++++++++++++ java/com/google/re2j/MakeJavaCategories.java | 90 ++ java/com/google/re2j/Parser.java | 4 + javatests/com/google/re2j/FindTest.java | 43 + javatests/com/google/re2j/RE2CompileTest.java | 2 + 5 files changed, 954 insertions(+) create mode 100644 java/com/google/re2j/JavaCategoryTables.java create mode 100644 java/com/google/re2j/MakeJavaCategories.java diff --git a/java/com/google/re2j/JavaCategoryTables.java b/java/com/google/re2j/JavaCategoryTables.java new file mode 100644 index 00000000..90b9eb5b --- /dev/null +++ b/java/com/google/re2j/JavaCategoryTables.java @@ -0,0 +1,815 @@ + +package com.google.re2j; + +// AUTOGENERATED by MakeJavaCategories.java - do not modify + +import java.util.HashMap; +import java.util.Map; + +class JavaCategoryTables { + private static Map Categories() { + Map map = new HashMap(); + map.put("javaJavaIdentifierStart", javaJavaIdentifierStart); + map.put("javaJavaIdentifierPart", javaJavaIdentifierPart); + return map; + } + + private static final int[][] javaJavaIdentifierStart = makejavaJavaIdentifierStart(); + private static int[][] makejavaJavaIdentifierStart() { + return new int[][] { + {0x00024, 0x00024, 1}, + {0x00041, 0x0005a, 1}, + {0x0005f, 0x0005f, 1}, + {0x00061, 0x0007a, 1}, + {0x000a2, 0x000a5, 1}, + {0x000aa, 0x000aa, 1}, + {0x000b5, 0x000b5, 1}, + {0x000ba, 0x000ba, 1}, + {0x000c0, 0x000d6, 1}, + {0x000d8, 0x000f6, 1}, + {0x000f8, 0x00236, 1}, + {0x00250, 0x002c1, 1}, + {0x002c6, 0x002d1, 1}, + {0x002e0, 0x002e4, 1}, + {0x002ee, 0x002ee, 1}, + {0x0037a, 0x0037a, 1}, + {0x00386, 0x00386, 1}, + {0x00388, 0x0038a, 1}, + {0x0038c, 0x0038c, 1}, + {0x0038e, 0x003a1, 1}, + {0x003a3, 0x003ce, 1}, + {0x003d0, 0x003f5, 1}, + {0x003f7, 0x003fb, 1}, + {0x00400, 0x00481, 1}, + {0x0048a, 0x004ce, 1}, + {0x004d0, 0x004f5, 1}, + {0x004f8, 0x004f9, 1}, + {0x00500, 0x0050f, 1}, + {0x00531, 0x00556, 1}, + {0x00559, 0x00559, 1}, + {0x00561, 0x00587, 1}, + {0x005d0, 0x005ea, 1}, + {0x005f0, 0x005f2, 1}, + {0x00621, 0x0063a, 1}, + {0x00640, 0x0064a, 1}, + {0x0066e, 0x0066f, 1}, + {0x00671, 0x006d3, 1}, + {0x006d5, 0x006d5, 1}, + {0x006e5, 0x006e6, 1}, + {0x006ee, 0x006ef, 1}, + {0x006fa, 0x006fc, 1}, + {0x006ff, 0x006ff, 1}, + {0x00710, 0x00710, 1}, + {0x00712, 0x0072f, 1}, + {0x0074d, 0x0074f, 1}, + {0x00780, 0x007a5, 1}, + {0x007b1, 0x007b1, 1}, + {0x00904, 0x00939, 1}, + {0x0093d, 0x0093d, 1}, + {0x00950, 0x00950, 1}, + {0x00958, 0x00961, 1}, + {0x00985, 0x0098c, 1}, + {0x0098f, 0x00990, 1}, + {0x00993, 0x009a8, 1}, + {0x009aa, 0x009b0, 1}, + {0x009b2, 0x009b2, 1}, + {0x009b6, 0x009b9, 1}, + {0x009bd, 0x009bd, 1}, + {0x009dc, 0x009dd, 1}, + {0x009df, 0x009e1, 1}, + {0x009f0, 0x009f3, 1}, + {0x00a05, 0x00a0a, 1}, + {0x00a0f, 0x00a10, 1}, + {0x00a13, 0x00a28, 1}, + {0x00a2a, 0x00a30, 1}, + {0x00a32, 0x00a33, 1}, + {0x00a35, 0x00a36, 1}, + {0x00a38, 0x00a39, 1}, + {0x00a59, 0x00a5c, 1}, + {0x00a5e, 0x00a5e, 1}, + {0x00a72, 0x00a74, 1}, + {0x00a85, 0x00a8d, 1}, + {0x00a8f, 0x00a91, 1}, + {0x00a93, 0x00aa8, 1}, + {0x00aaa, 0x00ab0, 1}, + {0x00ab2, 0x00ab3, 1}, + {0x00ab5, 0x00ab9, 1}, + {0x00abd, 0x00abd, 1}, + {0x00ad0, 0x00ad0, 1}, + {0x00ae0, 0x00ae1, 1}, + {0x00af1, 0x00af1, 1}, + {0x00b05, 0x00b0c, 1}, + {0x00b0f, 0x00b10, 1}, + {0x00b13, 0x00b28, 1}, + {0x00b2a, 0x00b30, 1}, + {0x00b32, 0x00b33, 1}, + {0x00b35, 0x00b39, 1}, + {0x00b3d, 0x00b3d, 1}, + {0x00b5c, 0x00b5d, 1}, + {0x00b5f, 0x00b61, 1}, + {0x00b71, 0x00b71, 1}, + {0x00b83, 0x00b83, 1}, + {0x00b85, 0x00b8a, 1}, + {0x00b8e, 0x00b90, 1}, + {0x00b92, 0x00b95, 1}, + {0x00b99, 0x00b9a, 1}, + {0x00b9c, 0x00b9c, 1}, + {0x00b9e, 0x00b9f, 1}, + {0x00ba3, 0x00ba4, 1}, + {0x00ba8, 0x00baa, 1}, + {0x00bae, 0x00bb5, 1}, + {0x00bb7, 0x00bb9, 1}, + {0x00bf9, 0x00bf9, 1}, + {0x00c05, 0x00c0c, 1}, + {0x00c0e, 0x00c10, 1}, + {0x00c12, 0x00c28, 1}, + {0x00c2a, 0x00c33, 1}, + {0x00c35, 0x00c39, 1}, + {0x00c60, 0x00c61, 1}, + {0x00c85, 0x00c8c, 1}, + {0x00c8e, 0x00c90, 1}, + {0x00c92, 0x00ca8, 1}, + {0x00caa, 0x00cb3, 1}, + {0x00cb5, 0x00cb9, 1}, + {0x00cbd, 0x00cbd, 1}, + {0x00cde, 0x00cde, 1}, + {0x00ce0, 0x00ce1, 1}, + {0x00d05, 0x00d0c, 1}, + {0x00d0e, 0x00d10, 1}, + {0x00d12, 0x00d28, 1}, + {0x00d2a, 0x00d39, 1}, + {0x00d60, 0x00d61, 1}, + {0x00d85, 0x00d96, 1}, + {0x00d9a, 0x00db1, 1}, + {0x00db3, 0x00dbb, 1}, + {0x00dbd, 0x00dbd, 1}, + {0x00dc0, 0x00dc6, 1}, + {0x00e01, 0x00e30, 1}, + {0x00e32, 0x00e33, 1}, + {0x00e3f, 0x00e46, 1}, + {0x00e81, 0x00e82, 1}, + {0x00e84, 0x00e84, 1}, + {0x00e87, 0x00e88, 1}, + {0x00e8a, 0x00e8a, 1}, + {0x00e8d, 0x00e8d, 1}, + {0x00e94, 0x00e97, 1}, + {0x00e99, 0x00e9f, 1}, + {0x00ea1, 0x00ea3, 1}, + {0x00ea5, 0x00ea5, 1}, + {0x00ea7, 0x00ea7, 1}, + {0x00eaa, 0x00eab, 1}, + {0x00ead, 0x00eb0, 1}, + {0x00eb2, 0x00eb3, 1}, + {0x00ebd, 0x00ebd, 1}, + {0x00ec0, 0x00ec4, 1}, + {0x00ec6, 0x00ec6, 1}, + {0x00edc, 0x00edd, 1}, + {0x00f00, 0x00f00, 1}, + {0x00f40, 0x00f47, 1}, + {0x00f49, 0x00f6a, 1}, + {0x00f88, 0x00f8b, 1}, + {0x01000, 0x01021, 1}, + {0x01023, 0x01027, 1}, + {0x01029, 0x0102a, 1}, + {0x01050, 0x01055, 1}, + {0x010a0, 0x010c5, 1}, + {0x010d0, 0x010f8, 1}, + {0x01100, 0x01159, 1}, + {0x0115f, 0x011a2, 1}, + {0x011a8, 0x011f9, 1}, + {0x01200, 0x01206, 1}, + {0x01208, 0x01246, 1}, + {0x01248, 0x01248, 1}, + {0x0124a, 0x0124d, 1}, + {0x01250, 0x01256, 1}, + {0x01258, 0x01258, 1}, + {0x0125a, 0x0125d, 1}, + {0x01260, 0x01286, 1}, + {0x01288, 0x01288, 1}, + {0x0128a, 0x0128d, 1}, + {0x01290, 0x012ae, 1}, + {0x012b0, 0x012b0, 1}, + {0x012b2, 0x012b5, 1}, + {0x012b8, 0x012be, 1}, + {0x012c0, 0x012c0, 1}, + {0x012c2, 0x012c5, 1}, + {0x012c8, 0x012ce, 1}, + {0x012d0, 0x012d6, 1}, + {0x012d8, 0x012ee, 1}, + {0x012f0, 0x0130e, 1}, + {0x01310, 0x01310, 1}, + {0x01312, 0x01315, 1}, + {0x01318, 0x0131e, 1}, + {0x01320, 0x01346, 1}, + {0x01348, 0x0135a, 1}, + {0x013a0, 0x013f4, 1}, + {0x01401, 0x0166c, 1}, + {0x0166f, 0x01676, 1}, + {0x01681, 0x0169a, 1}, + {0x016a0, 0x016ea, 1}, + {0x016ee, 0x016f0, 1}, + {0x01700, 0x0170c, 1}, + {0x0170e, 0x01711, 1}, + {0x01720, 0x01731, 1}, + {0x01740, 0x01751, 1}, + {0x01760, 0x0176c, 1}, + {0x0176e, 0x01770, 1}, + {0x01780, 0x017b3, 1}, + {0x017d7, 0x017d7, 1}, + {0x017db, 0x017dc, 1}, + {0x01820, 0x01877, 1}, + {0x01880, 0x018a8, 1}, + {0x01900, 0x0191c, 1}, + {0x01950, 0x0196d, 1}, + {0x01970, 0x01974, 1}, + {0x01d00, 0x01d6b, 1}, + {0x01e00, 0x01e9b, 1}, + {0x01ea0, 0x01ef9, 1}, + {0x01f00, 0x01f15, 1}, + {0x01f18, 0x01f1d, 1}, + {0x01f20, 0x01f45, 1}, + {0x01f48, 0x01f4d, 1}, + {0x01f50, 0x01f57, 1}, + {0x01f59, 0x01f59, 1}, + {0x01f5b, 0x01f5b, 1}, + {0x01f5d, 0x01f5d, 1}, + {0x01f5f, 0x01f7d, 1}, + {0x01f80, 0x01fb4, 1}, + {0x01fb6, 0x01fbc, 1}, + {0x01fbe, 0x01fbe, 1}, + {0x01fc2, 0x01fc4, 1}, + {0x01fc6, 0x01fcc, 1}, + {0x01fd0, 0x01fd3, 1}, + {0x01fd6, 0x01fdb, 1}, + {0x01fe0, 0x01fec, 1}, + {0x01ff2, 0x01ff4, 1}, + {0x01ff6, 0x01ffc, 1}, + {0x0203f, 0x02040, 1}, + {0x02054, 0x02054, 1}, + {0x02071, 0x02071, 1}, + {0x0207f, 0x0207f, 1}, + {0x020a0, 0x020b1, 1}, + {0x02102, 0x02102, 1}, + {0x02107, 0x02107, 1}, + {0x0210a, 0x02113, 1}, + {0x02115, 0x02115, 1}, + {0x02119, 0x0211d, 1}, + {0x02124, 0x02124, 1}, + {0x02126, 0x02126, 1}, + {0x02128, 0x02128, 1}, + {0x0212a, 0x0212d, 1}, + {0x0212f, 0x02131, 1}, + {0x02133, 0x02139, 1}, + {0x0213d, 0x0213f, 1}, + {0x02145, 0x02149, 1}, + {0x02160, 0x02183, 1}, + {0x03005, 0x03007, 1}, + {0x03021, 0x03029, 1}, + {0x03031, 0x03035, 1}, + {0x03038, 0x0303c, 1}, + {0x03041, 0x03096, 1}, + {0x0309d, 0x0309f, 1}, + {0x030a1, 0x030ff, 1}, + {0x03105, 0x0312c, 1}, + {0x03131, 0x0318e, 1}, + {0x031a0, 0x031b7, 1}, + {0x031f0, 0x031ff, 1}, + {0x03400, 0x04db5, 1}, + {0x04e00, 0x09fa5, 1}, + {0x0a000, 0x0a48c, 1}, + {0x0ac00, 0x0d7a3, 1}, + {0x0f900, 0x0fa2d, 1}, + {0x0fa30, 0x0fa6a, 1}, + {0x0fb00, 0x0fb06, 1}, + {0x0fb13, 0x0fb17, 1}, + {0x0fb1d, 0x0fb1d, 1}, + {0x0fb1f, 0x0fb28, 1}, + {0x0fb2a, 0x0fb36, 1}, + {0x0fb38, 0x0fb3c, 1}, + {0x0fb3e, 0x0fb3e, 1}, + {0x0fb40, 0x0fb41, 1}, + {0x0fb43, 0x0fb44, 1}, + {0x0fb46, 0x0fbb1, 1}, + {0x0fbd3, 0x0fd3d, 1}, + {0x0fd50, 0x0fd8f, 1}, + {0x0fd92, 0x0fdc7, 1}, + {0x0fdf0, 0x0fdfc, 1}, + {0x0fe33, 0x0fe34, 1}, + {0x0fe4d, 0x0fe4f, 1}, + {0x0fe69, 0x0fe69, 1}, + {0x0fe70, 0x0fe74, 1}, + {0x0fe76, 0x0fefc, 1}, + {0x0ff04, 0x0ff04, 1}, + {0x0ff21, 0x0ff3a, 1}, + {0x0ff3f, 0x0ff3f, 1}, + {0x0ff41, 0x0ff5a, 1}, + {0x0ff65, 0x0ffbe, 1}, + {0x0ffc2, 0x0ffc7, 1}, + {0x0ffca, 0x0ffcf, 1}, + {0x0ffd2, 0x0ffd7, 1}, + {0x0ffda, 0x0ffdc, 1}, + {0x0ffe0, 0x0ffe1, 1}, + {0x0ffe5, 0x0ffe6, 1}, + {0x10000, 0x1000b, 1}, + {0x1000d, 0x10026, 1}, + {0x10028, 0x1003a, 1}, + {0x1003c, 0x1003d, 1}, + {0x1003f, 0x1004d, 1}, + {0x10050, 0x1005d, 1}, + {0x10080, 0x100fa, 1}, + {0x10300, 0x1031e, 1}, + {0x10330, 0x1034a, 1}, + {0x10380, 0x1039d, 1}, + {0x10400, 0x1049d, 1}, + {0x10800, 0x10805, 1}, + {0x10808, 0x10808, 1}, + {0x1080a, 0x10835, 1}, + {0x10837, 0x10838, 1}, + {0x1083c, 0x1083c, 1}, + {0x1083f, 0x1083f, 1}, + {0x1d400, 0x1d454, 1}, + {0x1d456, 0x1d49c, 1}, + {0x1d49e, 0x1d49f, 1}, + {0x1d4a2, 0x1d4a2, 1}, + {0x1d4a5, 0x1d4a6, 1}, + {0x1d4a9, 0x1d4ac, 1}, + {0x1d4ae, 0x1d4b9, 1}, + {0x1d4bb, 0x1d4bb, 1}, + {0x1d4bd, 0x1d4c3, 1}, + {0x1d4c5, 0x1d505, 1}, + {0x1d507, 0x1d50a, 1}, + {0x1d50d, 0x1d514, 1}, + {0x1d516, 0x1d51c, 1}, + {0x1d51e, 0x1d539, 1}, + {0x1d53b, 0x1d53e, 1}, + {0x1d540, 0x1d544, 1}, + {0x1d546, 0x1d546, 1}, + {0x1d54a, 0x1d550, 1}, + {0x1d552, 0x1d6a3, 1}, + {0x1d6a8, 0x1d6c0, 1}, + {0x1d6c2, 0x1d6da, 1}, + {0x1d6dc, 0x1d6fa, 1}, + {0x1d6fc, 0x1d714, 1}, + {0x1d716, 0x1d734, 1}, + {0x1d736, 0x1d74e, 1}, + {0x1d750, 0x1d76e, 1}, + {0x1d770, 0x1d788, 1}, + {0x1d78a, 0x1d7a8, 1}, + {0x1d7aa, 0x1d7c2, 1}, + {0x1d7c4, 0x1d7c9, 1}, + {0x20000, 0x2a6d6, 1}, + {0x2f800, 0x2fa1d, 1}, + }; + } + + private static final int[][] javaJavaIdentifierPart = makejavaJavaIdentifierPart(); + private static int[][] makejavaJavaIdentifierPart() { + return new int[][] { + {0x00000, 0x00008, 1}, + {0x0000e, 0x0001b, 1}, + {0x00024, 0x00024, 1}, + {0x00030, 0x00039, 1}, + {0x00041, 0x0005a, 1}, + {0x0005f, 0x0005f, 1}, + {0x00061, 0x0007a, 1}, + {0x0007f, 0x0009f, 1}, + {0x000a2, 0x000a5, 1}, + {0x000aa, 0x000aa, 1}, + {0x000ad, 0x000ad, 1}, + {0x000b5, 0x000b5, 1}, + {0x000ba, 0x000ba, 1}, + {0x000c0, 0x000d6, 1}, + {0x000d8, 0x000f6, 1}, + {0x000f8, 0x00236, 1}, + {0x00250, 0x002c1, 1}, + {0x002c6, 0x002d1, 1}, + {0x002e0, 0x002e4, 1}, + {0x002ee, 0x002ee, 1}, + {0x00300, 0x00357, 1}, + {0x0035d, 0x0036f, 1}, + {0x0037a, 0x0037a, 1}, + {0x00386, 0x00386, 1}, + {0x00388, 0x0038a, 1}, + {0x0038c, 0x0038c, 1}, + {0x0038e, 0x003a1, 1}, + {0x003a3, 0x003ce, 1}, + {0x003d0, 0x003f5, 1}, + {0x003f7, 0x003fb, 1}, + {0x00400, 0x00481, 1}, + {0x00483, 0x00486, 1}, + {0x0048a, 0x004ce, 1}, + {0x004d0, 0x004f5, 1}, + {0x004f8, 0x004f9, 1}, + {0x00500, 0x0050f, 1}, + {0x00531, 0x00556, 1}, + {0x00559, 0x00559, 1}, + {0x00561, 0x00587, 1}, + {0x00591, 0x005a1, 1}, + {0x005a3, 0x005b9, 1}, + {0x005bb, 0x005bd, 1}, + {0x005bf, 0x005bf, 1}, + {0x005c1, 0x005c2, 1}, + {0x005c4, 0x005c4, 1}, + {0x005d0, 0x005ea, 1}, + {0x005f0, 0x005f2, 1}, + {0x00600, 0x00603, 1}, + {0x00610, 0x00615, 1}, + {0x00621, 0x0063a, 1}, + {0x00640, 0x00658, 1}, + {0x00660, 0x00669, 1}, + {0x0066e, 0x006d3, 1}, + {0x006d5, 0x006dd, 1}, + {0x006df, 0x006e8, 1}, + {0x006ea, 0x006fc, 1}, + {0x006ff, 0x006ff, 1}, + {0x0070f, 0x0074a, 1}, + {0x0074d, 0x0074f, 1}, + {0x00780, 0x007b1, 1}, + {0x00901, 0x00939, 1}, + {0x0093c, 0x0094d, 1}, + {0x00950, 0x00954, 1}, + {0x00958, 0x00963, 1}, + {0x00966, 0x0096f, 1}, + {0x00981, 0x00983, 1}, + {0x00985, 0x0098c, 1}, + {0x0098f, 0x00990, 1}, + {0x00993, 0x009a8, 1}, + {0x009aa, 0x009b0, 1}, + {0x009b2, 0x009b2, 1}, + {0x009b6, 0x009b9, 1}, + {0x009bc, 0x009c4, 1}, + {0x009c7, 0x009c8, 1}, + {0x009cb, 0x009cd, 1}, + {0x009d7, 0x009d7, 1}, + {0x009dc, 0x009dd, 1}, + {0x009df, 0x009e3, 1}, + {0x009e6, 0x009f3, 1}, + {0x00a01, 0x00a03, 1}, + {0x00a05, 0x00a0a, 1}, + {0x00a0f, 0x00a10, 1}, + {0x00a13, 0x00a28, 1}, + {0x00a2a, 0x00a30, 1}, + {0x00a32, 0x00a33, 1}, + {0x00a35, 0x00a36, 1}, + {0x00a38, 0x00a39, 1}, + {0x00a3c, 0x00a3c, 1}, + {0x00a3e, 0x00a42, 1}, + {0x00a47, 0x00a48, 1}, + {0x00a4b, 0x00a4d, 1}, + {0x00a59, 0x00a5c, 1}, + {0x00a5e, 0x00a5e, 1}, + {0x00a66, 0x00a74, 1}, + {0x00a81, 0x00a83, 1}, + {0x00a85, 0x00a8d, 1}, + {0x00a8f, 0x00a91, 1}, + {0x00a93, 0x00aa8, 1}, + {0x00aaa, 0x00ab0, 1}, + {0x00ab2, 0x00ab3, 1}, + {0x00ab5, 0x00ab9, 1}, + {0x00abc, 0x00ac5, 1}, + {0x00ac7, 0x00ac9, 1}, + {0x00acb, 0x00acd, 1}, + {0x00ad0, 0x00ad0, 1}, + {0x00ae0, 0x00ae3, 1}, + {0x00ae6, 0x00aef, 1}, + {0x00af1, 0x00af1, 1}, + {0x00b01, 0x00b03, 1}, + {0x00b05, 0x00b0c, 1}, + {0x00b0f, 0x00b10, 1}, + {0x00b13, 0x00b28, 1}, + {0x00b2a, 0x00b30, 1}, + {0x00b32, 0x00b33, 1}, + {0x00b35, 0x00b39, 1}, + {0x00b3c, 0x00b43, 1}, + {0x00b47, 0x00b48, 1}, + {0x00b4b, 0x00b4d, 1}, + {0x00b56, 0x00b57, 1}, + {0x00b5c, 0x00b5d, 1}, + {0x00b5f, 0x00b61, 1}, + {0x00b66, 0x00b6f, 1}, + {0x00b71, 0x00b71, 1}, + {0x00b82, 0x00b83, 1}, + {0x00b85, 0x00b8a, 1}, + {0x00b8e, 0x00b90, 1}, + {0x00b92, 0x00b95, 1}, + {0x00b99, 0x00b9a, 1}, + {0x00b9c, 0x00b9c, 1}, + {0x00b9e, 0x00b9f, 1}, + {0x00ba3, 0x00ba4, 1}, + {0x00ba8, 0x00baa, 1}, + {0x00bae, 0x00bb5, 1}, + {0x00bb7, 0x00bb9, 1}, + {0x00bbe, 0x00bc2, 1}, + {0x00bc6, 0x00bc8, 1}, + {0x00bca, 0x00bcd, 1}, + {0x00bd7, 0x00bd7, 1}, + {0x00be7, 0x00bef, 1}, + {0x00bf9, 0x00bf9, 1}, + {0x00c01, 0x00c03, 1}, + {0x00c05, 0x00c0c, 1}, + {0x00c0e, 0x00c10, 1}, + {0x00c12, 0x00c28, 1}, + {0x00c2a, 0x00c33, 1}, + {0x00c35, 0x00c39, 1}, + {0x00c3e, 0x00c44, 1}, + {0x00c46, 0x00c48, 1}, + {0x00c4a, 0x00c4d, 1}, + {0x00c55, 0x00c56, 1}, + {0x00c60, 0x00c61, 1}, + {0x00c66, 0x00c6f, 1}, + {0x00c82, 0x00c83, 1}, + {0x00c85, 0x00c8c, 1}, + {0x00c8e, 0x00c90, 1}, + {0x00c92, 0x00ca8, 1}, + {0x00caa, 0x00cb3, 1}, + {0x00cb5, 0x00cb9, 1}, + {0x00cbc, 0x00cc4, 1}, + {0x00cc6, 0x00cc8, 1}, + {0x00cca, 0x00ccd, 1}, + {0x00cd5, 0x00cd6, 1}, + {0x00cde, 0x00cde, 1}, + {0x00ce0, 0x00ce1, 1}, + {0x00ce6, 0x00cef, 1}, + {0x00d02, 0x00d03, 1}, + {0x00d05, 0x00d0c, 1}, + {0x00d0e, 0x00d10, 1}, + {0x00d12, 0x00d28, 1}, + {0x00d2a, 0x00d39, 1}, + {0x00d3e, 0x00d43, 1}, + {0x00d46, 0x00d48, 1}, + {0x00d4a, 0x00d4d, 1}, + {0x00d57, 0x00d57, 1}, + {0x00d60, 0x00d61, 1}, + {0x00d66, 0x00d6f, 1}, + {0x00d82, 0x00d83, 1}, + {0x00d85, 0x00d96, 1}, + {0x00d9a, 0x00db1, 1}, + {0x00db3, 0x00dbb, 1}, + {0x00dbd, 0x00dbd, 1}, + {0x00dc0, 0x00dc6, 1}, + {0x00dca, 0x00dca, 1}, + {0x00dcf, 0x00dd4, 1}, + {0x00dd6, 0x00dd6, 1}, + {0x00dd8, 0x00ddf, 1}, + {0x00df2, 0x00df3, 1}, + {0x00e01, 0x00e3a, 1}, + {0x00e3f, 0x00e4e, 1}, + {0x00e50, 0x00e59, 1}, + {0x00e81, 0x00e82, 1}, + {0x00e84, 0x00e84, 1}, + {0x00e87, 0x00e88, 1}, + {0x00e8a, 0x00e8a, 1}, + {0x00e8d, 0x00e8d, 1}, + {0x00e94, 0x00e97, 1}, + {0x00e99, 0x00e9f, 1}, + {0x00ea1, 0x00ea3, 1}, + {0x00ea5, 0x00ea5, 1}, + {0x00ea7, 0x00ea7, 1}, + {0x00eaa, 0x00eab, 1}, + {0x00ead, 0x00eb9, 1}, + {0x00ebb, 0x00ebd, 1}, + {0x00ec0, 0x00ec4, 1}, + {0x00ec6, 0x00ec6, 1}, + {0x00ec8, 0x00ecd, 1}, + {0x00ed0, 0x00ed9, 1}, + {0x00edc, 0x00edd, 1}, + {0x00f00, 0x00f00, 1}, + {0x00f18, 0x00f19, 1}, + {0x00f20, 0x00f29, 1}, + {0x00f35, 0x00f35, 1}, + {0x00f37, 0x00f37, 1}, + {0x00f39, 0x00f39, 1}, + {0x00f3e, 0x00f47, 1}, + {0x00f49, 0x00f6a, 1}, + {0x00f71, 0x00f84, 1}, + {0x00f86, 0x00f8b, 1}, + {0x00f90, 0x00f97, 1}, + {0x00f99, 0x00fbc, 1}, + {0x00fc6, 0x00fc6, 1}, + {0x01000, 0x01021, 1}, + {0x01023, 0x01027, 1}, + {0x01029, 0x0102a, 1}, + {0x0102c, 0x01032, 1}, + {0x01036, 0x01039, 1}, + {0x01040, 0x01049, 1}, + {0x01050, 0x01059, 1}, + {0x010a0, 0x010c5, 1}, + {0x010d0, 0x010f8, 1}, + {0x01100, 0x01159, 1}, + {0x0115f, 0x011a2, 1}, + {0x011a8, 0x011f9, 1}, + {0x01200, 0x01206, 1}, + {0x01208, 0x01246, 1}, + {0x01248, 0x01248, 1}, + {0x0124a, 0x0124d, 1}, + {0x01250, 0x01256, 1}, + {0x01258, 0x01258, 1}, + {0x0125a, 0x0125d, 1}, + {0x01260, 0x01286, 1}, + {0x01288, 0x01288, 1}, + {0x0128a, 0x0128d, 1}, + {0x01290, 0x012ae, 1}, + {0x012b0, 0x012b0, 1}, + {0x012b2, 0x012b5, 1}, + {0x012b8, 0x012be, 1}, + {0x012c0, 0x012c0, 1}, + {0x012c2, 0x012c5, 1}, + {0x012c8, 0x012ce, 1}, + {0x012d0, 0x012d6, 1}, + {0x012d8, 0x012ee, 1}, + {0x012f0, 0x0130e, 1}, + {0x01310, 0x01310, 1}, + {0x01312, 0x01315, 1}, + {0x01318, 0x0131e, 1}, + {0x01320, 0x01346, 1}, + {0x01348, 0x0135a, 1}, + {0x01369, 0x01371, 1}, + {0x013a0, 0x013f4, 1}, + {0x01401, 0x0166c, 1}, + {0x0166f, 0x01676, 1}, + {0x01681, 0x0169a, 1}, + {0x016a0, 0x016ea, 1}, + {0x016ee, 0x016f0, 1}, + {0x01700, 0x0170c, 1}, + {0x0170e, 0x01714, 1}, + {0x01720, 0x01734, 1}, + {0x01740, 0x01753, 1}, + {0x01760, 0x0176c, 1}, + {0x0176e, 0x01770, 1}, + {0x01772, 0x01773, 1}, + {0x01780, 0x017d3, 1}, + {0x017d7, 0x017d7, 1}, + {0x017db, 0x017dd, 1}, + {0x017e0, 0x017e9, 1}, + {0x0180b, 0x0180d, 1}, + {0x01810, 0x01819, 1}, + {0x01820, 0x01877, 1}, + {0x01880, 0x018a9, 1}, + {0x01900, 0x0191c, 1}, + {0x01920, 0x0192b, 1}, + {0x01930, 0x0193b, 1}, + {0x01946, 0x0196d, 1}, + {0x01970, 0x01974, 1}, + {0x01d00, 0x01d6b, 1}, + {0x01e00, 0x01e9b, 1}, + {0x01ea0, 0x01ef9, 1}, + {0x01f00, 0x01f15, 1}, + {0x01f18, 0x01f1d, 1}, + {0x01f20, 0x01f45, 1}, + {0x01f48, 0x01f4d, 1}, + {0x01f50, 0x01f57, 1}, + {0x01f59, 0x01f59, 1}, + {0x01f5b, 0x01f5b, 1}, + {0x01f5d, 0x01f5d, 1}, + {0x01f5f, 0x01f7d, 1}, + {0x01f80, 0x01fb4, 1}, + {0x01fb6, 0x01fbc, 1}, + {0x01fbe, 0x01fbe, 1}, + {0x01fc2, 0x01fc4, 1}, + {0x01fc6, 0x01fcc, 1}, + {0x01fd0, 0x01fd3, 1}, + {0x01fd6, 0x01fdb, 1}, + {0x01fe0, 0x01fec, 1}, + {0x01ff2, 0x01ff4, 1}, + {0x01ff6, 0x01ffc, 1}, + {0x0200c, 0x0200f, 1}, + {0x0202a, 0x0202e, 1}, + {0x0203f, 0x02040, 1}, + {0x02054, 0x02054, 1}, + {0x02060, 0x02063, 1}, + {0x0206a, 0x0206f, 1}, + {0x02071, 0x02071, 1}, + {0x0207f, 0x0207f, 1}, + {0x020a0, 0x020b1, 1}, + {0x020d0, 0x020dc, 1}, + {0x020e1, 0x020e1, 1}, + {0x020e5, 0x020ea, 1}, + {0x02102, 0x02102, 1}, + {0x02107, 0x02107, 1}, + {0x0210a, 0x02113, 1}, + {0x02115, 0x02115, 1}, + {0x02119, 0x0211d, 1}, + {0x02124, 0x02124, 1}, + {0x02126, 0x02126, 1}, + {0x02128, 0x02128, 1}, + {0x0212a, 0x0212d, 1}, + {0x0212f, 0x02131, 1}, + {0x02133, 0x02139, 1}, + {0x0213d, 0x0213f, 1}, + {0x02145, 0x02149, 1}, + {0x02160, 0x02183, 1}, + {0x03005, 0x03007, 1}, + {0x03021, 0x0302f, 1}, + {0x03031, 0x03035, 1}, + {0x03038, 0x0303c, 1}, + {0x03041, 0x03096, 1}, + {0x03099, 0x0309a, 1}, + {0x0309d, 0x0309f, 1}, + {0x030a1, 0x030ff, 1}, + {0x03105, 0x0312c, 1}, + {0x03131, 0x0318e, 1}, + {0x031a0, 0x031b7, 1}, + {0x031f0, 0x031ff, 1}, + {0x03400, 0x04db5, 1}, + {0x04e00, 0x09fa5, 1}, + {0x0a000, 0x0a48c, 1}, + {0x0ac00, 0x0d7a3, 1}, + {0x0f900, 0x0fa2d, 1}, + {0x0fa30, 0x0fa6a, 1}, + {0x0fb00, 0x0fb06, 1}, + {0x0fb13, 0x0fb17, 1}, + {0x0fb1d, 0x0fb28, 1}, + {0x0fb2a, 0x0fb36, 1}, + {0x0fb38, 0x0fb3c, 1}, + {0x0fb3e, 0x0fb3e, 1}, + {0x0fb40, 0x0fb41, 1}, + {0x0fb43, 0x0fb44, 1}, + {0x0fb46, 0x0fbb1, 1}, + {0x0fbd3, 0x0fd3d, 1}, + {0x0fd50, 0x0fd8f, 1}, + {0x0fd92, 0x0fdc7, 1}, + {0x0fdf0, 0x0fdfc, 1}, + {0x0fe00, 0x0fe0f, 1}, + {0x0fe20, 0x0fe23, 1}, + {0x0fe33, 0x0fe34, 1}, + {0x0fe4d, 0x0fe4f, 1}, + {0x0fe69, 0x0fe69, 1}, + {0x0fe70, 0x0fe74, 1}, + {0x0fe76, 0x0fefc, 1}, + {0x0feff, 0x0feff, 1}, + {0x0ff04, 0x0ff04, 1}, + {0x0ff10, 0x0ff19, 1}, + {0x0ff21, 0x0ff3a, 1}, + {0x0ff3f, 0x0ff3f, 1}, + {0x0ff41, 0x0ff5a, 1}, + {0x0ff65, 0x0ffbe, 1}, + {0x0ffc2, 0x0ffc7, 1}, + {0x0ffca, 0x0ffcf, 1}, + {0x0ffd2, 0x0ffd7, 1}, + {0x0ffda, 0x0ffdc, 1}, + {0x0ffe0, 0x0ffe1, 1}, + {0x0ffe5, 0x0ffe6, 1}, + {0x0fff9, 0x0fffb, 1}, + {0x10000, 0x1000b, 1}, + {0x1000d, 0x10026, 1}, + {0x10028, 0x1003a, 1}, + {0x1003c, 0x1003d, 1}, + {0x1003f, 0x1004d, 1}, + {0x10050, 0x1005d, 1}, + {0x10080, 0x100fa, 1}, + {0x10300, 0x1031e, 1}, + {0x10330, 0x1034a, 1}, + {0x10380, 0x1039d, 1}, + {0x10400, 0x1049d, 1}, + {0x104a0, 0x104a9, 1}, + {0x10800, 0x10805, 1}, + {0x10808, 0x10808, 1}, + {0x1080a, 0x10835, 1}, + {0x10837, 0x10838, 1}, + {0x1083c, 0x1083c, 1}, + {0x1083f, 0x1083f, 1}, + {0x1d165, 0x1d169, 1}, + {0x1d16d, 0x1d182, 1}, + {0x1d185, 0x1d18b, 1}, + {0x1d1aa, 0x1d1ad, 1}, + {0x1d400, 0x1d454, 1}, + {0x1d456, 0x1d49c, 1}, + {0x1d49e, 0x1d49f, 1}, + {0x1d4a2, 0x1d4a2, 1}, + {0x1d4a5, 0x1d4a6, 1}, + {0x1d4a9, 0x1d4ac, 1}, + {0x1d4ae, 0x1d4b9, 1}, + {0x1d4bb, 0x1d4bb, 1}, + {0x1d4bd, 0x1d4c3, 1}, + {0x1d4c5, 0x1d505, 1}, + {0x1d507, 0x1d50a, 1}, + {0x1d50d, 0x1d514, 1}, + {0x1d516, 0x1d51c, 1}, + {0x1d51e, 0x1d539, 1}, + {0x1d53b, 0x1d53e, 1}, + {0x1d540, 0x1d544, 1}, + {0x1d546, 0x1d546, 1}, + {0x1d54a, 0x1d550, 1}, + {0x1d552, 0x1d6a3, 1}, + {0x1d6a8, 0x1d6c0, 1}, + {0x1d6c2, 0x1d6da, 1}, + {0x1d6dc, 0x1d6fa, 1}, + {0x1d6fc, 0x1d714, 1}, + {0x1d716, 0x1d734, 1}, + {0x1d736, 0x1d74e, 1}, + {0x1d750, 0x1d76e, 1}, + {0x1d770, 0x1d788, 1}, + {0x1d78a, 0x1d7a8, 1}, + {0x1d7aa, 0x1d7c2, 1}, + {0x1d7c4, 0x1d7c9, 1}, + {0x1d7ce, 0x1d7ff, 1}, + {0x20000, 0x2a6d6, 1}, + {0x2f800, 0x2fa1d, 1}, + {0xe0001, 0xe0001, 1}, + {0xe0020, 0xe007f, 1}, + {0xe0100, 0xe01ef, 1}, + }; + } + + static final Map CATEGORIES = Categories(); +} diff --git a/java/com/google/re2j/MakeJavaCategories.java b/java/com/google/re2j/MakeJavaCategories.java new file mode 100644 index 00000000..a467ce2e --- /dev/null +++ b/java/com/google/re2j/MakeJavaCategories.java @@ -0,0 +1,90 @@ +package com.google.re2j; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * @author Colin Fleming + */ +public class MakeJavaCategories { + + static abstract class Category { + final String categoryName; + final String publishName; + + protected Category(String categoryName, String publishName) { + this.categoryName = categoryName; + this.publishName = publishName; + } + + protected abstract boolean matches(int codePoint); + + public void writeTo(Writer writer) throws IOException { + List ranges = new ArrayList(); + int first = -1; + + for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) { + if (matches(cp)) { + if (first == -1) first = cp; + } else { + if (first >= 0) { + writer.write(String.format(" {0x%05x, 0x%05x, 1},\n", first, cp - 1)); + first = -1; + } + } + } + + if (first >= 0) { + writer.write(String.format(" {0x%05x, 0x%05x, 1},\n", first, Character.MAX_CODE_POINT)); + } + } + } + + public static void main(String[] args) throws IOException { + List categories = Arrays.asList( + new Category("javaJavaIdentifierStart", "JAVA_IDENTIFIER_START") { + @Override + protected boolean matches(int codePoint) { + return Character.isJavaIdentifierStart(codePoint); + } + }, + new Category("javaJavaIdentifierPart", "JAVA_IDENTIFIER_PART") { + @Override + protected boolean matches(int codePoint) { + return Character.isJavaIdentifierPart(codePoint); + } + }); + + Writer writer = new BufferedWriter(new FileWriter("java/com/google/re2j/JavaCategoryTables.java")); + try { + writer.write("\npackage com.google.re2j;\n\n"); + writer.write("// AUTOGENERATED by MakeJavaCategories.java - do not modify\n\n"); + writer.write("import java.util.HashMap;\n"); + writer.write("import java.util.Map;\n\n"); + writer.write("class JavaCategoryTables {\n"); + writer.write(" private static Map Categories() {\n"); + writer.write(" Map map = new HashMap();\n"); + for (Category category : categories) { + writer.write(" map.put(\"" + category.categoryName + "\", " + category.categoryName + ");\n"); + } + writer.write(" return map;\n"); + writer.write(" }\n\n"); + for (Category category : categories) { + writer.write(" private static final int[][] " + category.categoryName + " = make" + category.categoryName + "();\n"); + writer.write(" private static int[][] make" + category.categoryName + "() {\n"); + writer.write(" return new int[][] {\n"); + category.writeTo(writer); + writer.write(" };\n"); + writer.write(" }\n\n"); + } + + writer.write(" static final Map CATEGORIES = Categories();\n"); + writer.write("}\n"); + } finally { + writer.flush(); + writer.close(); + } + } +} diff --git a/java/com/google/re2j/Parser.java b/java/com/google/re2j/Parser.java index 447bcac8..81e1d3b3 100644 --- a/java/com/google/re2j/Parser.java +++ b/java/com/google/re2j/Parser.java @@ -1524,6 +1524,10 @@ private static Pair unicodeTable(String name) { if (table != null) { return Pair.of(table, UnicodeTables.FOLD_SCRIPT.get(name)); } + table = JavaCategoryTables.CATEGORIES.get(name); + if (table != null) { + return Pair.of(table, null); + } return null; } diff --git a/javatests/com/google/re2j/FindTest.java b/javatests/com/google/re2j/FindTest.java index 6210c0a5..4e78e9b5 100644 --- a/javatests/com/google/re2j/FindTest.java +++ b/javatests/com/google/re2j/FindTest.java @@ -20,6 +20,8 @@ import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; +import com.google.common.base.Charsets; + @RunWith(Parameterized.class) public class FindTest { @@ -172,8 +174,49 @@ String submatchString(int i, int j) { 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + + // Java identifier tests + javaTest("javaJavaIdentifierStart", + JavaCategoryTables.CATEGORIES.get("javaJavaIdentifierStart"), + false), + javaTest("javaJavaIdentifierStart", + JavaCategoryTables.CATEGORIES.get("javaJavaIdentifierStart"), + true), + javaTest("javaJavaIdentifierPart", + JavaCategoryTables.CATEGORIES.get("javaJavaIdentifierPart"), + false), + javaTest("javaJavaIdentifierPart", + JavaCategoryTables.CATEGORIES.get("javaJavaIdentifierPart"), + true), }; + static Test javaTest(String categoryName, int[][] category, boolean isNegative) { + String pattern = (isNegative ? "\\P{" + categoryName + "}+" + : "\\p{" + categoryName + "}+"); + + // For positive cases, create a string from the first and last codepoint in each range. + // For negative cases, use the codepoint previous to the first and the codepoint subsequent + // to the last in each range. + StringBuilder buffer = new StringBuilder(); + for (int i = 0; i < category.length; i++) { + int lower = category[i][0]; + int upper = category[i][1]; + + if (isNegative) lower--; + if (isNegative) upper++; + if (lower >= 0) { + buffer.append(Character.toChars(lower)); + } + if (upper <= Character.MAX_CODE_POINT) { + buffer.append(Character.toChars(upper)); + } + } + + String testText = buffer.toString(); + return new Test(pattern, testText, 1, 0, testText.getBytes(Charsets.UTF_8).length); + } + + @Parameters public static Test[] testCases() { return FIND_TESTS; diff --git a/javatests/com/google/re2j/RE2CompileTest.java b/javatests/com/google/re2j/RE2CompileTest.java index 7a265c5b..99a69870 100644 --- a/javatests/com/google/re2j/RE2CompileTest.java +++ b/javatests/com/google/re2j/RE2CompileTest.java @@ -33,6 +33,8 @@ public static String[][] testData() { {"\\!\\\\", null}, {"abc]", null}, // Matches the closing bracket literally. {"a??", null}, + {"\\p{javaJavaIdentifierStart}", null}, + {"\\p{javaJavaIdentifierPart}", null}, {"*", "missing argument to repetition operator: `*`"}, {"+", "missing argument to repetition operator: `+`"}, {"?", "missing argument to repetition operator: `?`"},