lcompilers · akshanshbhatt · Aug 9, 2022 · Aug 9, 2022 · Aug 9, 2022 · Aug 9, 2022
diff --git a/src/lpython/parser/tokenizer.re b/src/lpython/parser/tokenizer.re
@@ -253,12 +253,14 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
         // initialization is not needed. One can think of them as local
         // variables of the re2c block.
         unsigned char *mar;//, *ctxmar;
+        /*!include:re2c "unicode_categories.re" */
         /*!re2c
             re2c:define:YYCURSOR = cur;
             re2c:define:YYMARKER = mar;
             // re2c:define:YYCTXMARKER = ctxmar;
             re2c:yyfill:enable = 0;
             re2c:define:YYCTYPE = "unsigned char";
+            re2c:flags:utf-8 = 1; // input is UTF-8: rules match whole code points over unsigned char code units
 
             end = "\x00";
             whitespace = [ \t\v\r]+;
             end = "\x00";
             whitespace = [ \t\v\r]+;
@@ -268,7 +270,9 @@ int Tokenizer::lex(Allocator &al, YYSTYPE &yylval, Location &loc, diag::Diagnost
             int_bin = "0"[bB][01]+;
             int_hex = "0"[xX][0-9a-fA-F]+;
             int_dec = digit+ (digit | "_" digit)*;
-            char =  [a-zA-Z_];
-            name = char (char | digit)*;
+            // Identifiers follow Python's rules (PEP 3131), minus the Other_ID_* extras:
+            // start = L | Nl | "_"; continue = start | Mn | Mc | Nd | Pc (so NFD accents work).
+            char = L | Nl | "_";
+            name = char (char | digit | Mn | Mc | Nd | Pc)*;
             significand = (digit+ "." digit*) | ("." digit+);
             exp = [eE][-+]? digit+;

diff --git a/tests/parser/unicode.py b/tests/parser/unicode.py
@@ -0,0 +1,12 @@
+def test_frompyfunc_name(self):
+    # name conversion was failing for python 3 strings
+    # resulting in the default '?' name. Also test utf-8
+    # encoding using non-ascii name.
+    def cassé(x):
+        return x
+
+ℙƴtℌøἤ = 1
+print(ℙƴtℌøἤ)
+
+if 诶 != 2:
+    pass
diff --git a/tests/reference/ast_new-unicode-d3199dc.json b/tests/reference/ast_new-unicode-d3199dc.json
@@ -0,0 +1,13 @@
+{
+    "basename": "ast_new-unicode-d3199dc",
+    "cmd": "lpython --show-ast --new-parser --no-color {infile} -o {outfile}",
+    "infile": "tests/parser/unicode.py",
+    "infile_hash": "3e528c289a84fea33fc77e001303be40462884f95f5bfbab960125b6",
+    "outfile": null,
+    "outfile_hash": null,
+    "stdout": "ast_new-unicode-d3199dc.stdout",
+    "stdout_hash": "f823fd385e5a7c49558532b879dce3005229321e995631fabb19f6b1",
+    "stderr": null,
+    "stderr_hash": null,
+    "returncode": 0
+}
diff --git a/tests/reference/ast_new-unicode-d3199dc.stdout b/tests/reference/ast_new-unicode-d3199dc.stdout
@@ -0,0 +1 @@
+(Module [(FunctionDef test_frompyfunc_name ([] [(self () ())] [] [] [] [] []) [(FunctionDef cassé ([] [(x () ())] [] [] [] [] []) [(Return (Name x Load))] [] () ())] [] () ()) (Assign [(Name ℙƴtℌøἤ Store)] (ConstantInt 1 ()) ()) (Expr (Call (Name print Load) [(Name ℙƴtℌøἤ Load)] [])) (If (Compare (Name 诶 Load) NotEq [(ConstantInt 2 ())]) [(Pass)] [])] [])
diff --git a/tests/tests.toml b/tests/tests.toml
@@ -528,6 +528,10 @@ ast_new = true
 filename = "parser/tuple1.py"
 ast_new = true
 
+[[test]]
+filename = "parser/unicode.py"
+ast_new = true
+
 [[test]]
 filename = "parser/type_comment1.py"
 ast_new = true
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		(Module [(FunctionDef test_frompyfunc_name ([] [(self () ())] [] [] [] [] []) [(FunctionDef cassé ([] [(x () ())] [] [] [] [] []) [(Return (Name x Load))] [] () ())] [] () ()) (Assign [(Name ℙƴtℌøἤ Store)] (ConstantInt 1 ()) ()) (Expr (Call (Name print Load) [(Name ℙƴtℌøἤ Load)] [])) (If (Compare (Name 诶 Load) NotEq [(ConstantInt 2 ())]) [(Pass)] [])] [])