Skip to content

Commit 8f2b527

Browse files
authored
fix(tree-sitter): simplify arco-kdl multiline strings (#161)
* fix(tree-sitter): simplify arco-kdl multiline strings
* style(tree-sitter): use member nodes in highlight fixture
* docs(tree-sitter): pin parser runtime header policy
1 parent 3f54473 commit 8f2b527

9 files changed

Lines changed: 10539 additions & 9028 deletions

File tree

.github/workflows/kdl-overlay.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,14 @@ jobs:
5151
- name: Install tree-sitter CLI
5252
run: npm install --global tree-sitter-cli
5353

54-
- name: Regenerate parser from latest grammar
54+
- name: Regenerate generated parser artifacts from grammar source
5555
working-directory: tools/tree-sitter-arco-kdl
5656
run: |
5757
npm install
58-
tree-sitter generate
58+
# parser.c / grammar.json / node-types.json are normal generated outputs.
59+
# src/tree_sitter/parser.h is treated as a vendored, pinned runtime
60+
# header and should only change when the Tree-sitter toolchain moves.
61+
npx tree-sitter generate
5962
6063
- name: Run filtered prek hook
6164
run: uvx --from prek==0.3.6 prek run --all-files --hook-stage manual

tools/tree-sitter-arco-kdl/README.md

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,35 @@ Use `examples/highlight_demo.kdl` as the visual fixture when tuning colors.
5151
If your editor supports capture inspection (e.g. Neovim `:Inspect`), open the
5252
fixture and verify these captures line-by-line before taking screenshots.
5353

54+
## Source of truth
55+
56+
Authored files:
57+
58+
- `grammar.js`: overlay grammar source of truth
59+
- `src/scanner.c`: thin Arco wrapper for external tokens
60+
- `src/vendor/tree_sitter_kdl_external_scanner.inc`: vendored upstream KDL scanner
61+
- `queries/*.scm`: editor queries
62+
- `test/corpus/arco_math.txt`: parser regression corpus
63+
64+
Generated files:
65+
66+
- `src/parser.c`
67+
- `src/grammar.json`
68+
- `src/node-types.json`
69+
- `src/tree_sitter/parser.h` (emitted by `tree-sitter generate`, but treated as a vendored tree-sitter runtime header pinned to the current toolchain)
70+
71+
When `grammar.js` changes, regenerate the parser artifacts with:
72+
73+
```sh
74+
npm install
75+
npx tree-sitter generate
76+
```
77+
78+
Do not hand-edit `src/parser.c`. It is generated code.
79+
Do not hand-edit `src/tree_sitter/parser.h` either. Treat it as a vendored,
80+
mostly frozen header that only changes when we intentionally bump the
81+
Tree-sitter CLI/runtime version.
82+
5483
## Installation
5584

5685
### Neovim
@@ -146,7 +175,10 @@ instead of generic KDL highlighting.
146175

147176
## Files
148177

149-
- `grammar.js`: KDL overlay grammar.
178+
- `grammar.js`: KDL overlay grammar, source of truth.
179+
- `src/scanner.c`: thin Arco-specific external scanner shim.
180+
- `src/vendor/tree_sitter_kdl_external_scanner.inc`: vendored upstream KDL scanner implementation.
181+
- `src/parser.c`: generated parser output.
150182
- `queries/injections.scm`: marks `arco_math_text` for language injection.
151183
- `examples/highlight_demo.kdl`: semantic highlight fixture for theme tuning.
152184
- `test/corpus/arco_math.txt`: corpus examples for algebra-body parsing.

tools/tree-sitter-arco-kdl/examples/highlight_demo.kdl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
// Predicates/keywords should share one color
22
set gen alias=g
33
set time alias=t {
4-
1
5-
2
6-
3
4+
member 1
5+
member 2
6+
member 3
77
}
88

99
data generators from="data/generators.csv" {
Lines changed: 67 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,22 @@
11
const kdl = require("tree-sitter-kdl/grammar");
22

3-
const nodeShape = ($, nameRule, childrenRule) =>
3+
const PURE_MATH_NODE_NAMES = [
4+
"expression",
5+
"minimize",
6+
"maximize",
7+
"expr",
8+
"filter",
9+
"if",
10+
"lower",
11+
"upper",
12+
];
13+
14+
const nodeShape = ($, { nameRule, childrenRule, fieldRule = $.node_field }) =>
415
seq(
516
alias(optional(seq("/-", repeat($._node_space))), $.node_comment),
617
optional($.type),
718
nameRule,
8-
repeat(seq(repeat1($._node_space), $.node_field)),
19+
repeat(seq(repeat1($._node_space), fieldRule)),
920
optional(
1021
seq(
1122
repeat($._node_space),
@@ -17,6 +28,19 @@ const nodeShape = ($, nameRule, childrenRule) =>
1728
$._node_terminator,
1829
);
1930

31+
const mathChildren = ($, mathRule) =>
32+
prec(
33+
2,
34+
seq(
35+
optional(
36+
seq(alias("/-", $.node_children_comment), repeat($._node_space)),
37+
),
38+
"{",
39+
choice(field("math", mathRule), seq(repeat($._linespace))),
40+
"}",
41+
),
42+
);
43+
2044
module.exports = grammar(kdl, {
2145
name: "arco_kdl",
2246

@@ -27,6 +51,22 @@ module.exports = grammar(kdl, {
2751
_node_terminator: ($, previous) =>
2852
choice(previous, $._implicit_terminator),
2953

54+
string: ($, previous) =>
55+
choice(previous, $._multiline_string),
56+
57+
_multiline_string: ($) =>
58+
seq(
59+
'"""',
60+
optional($._newline),
61+
repeat(
62+
choice(
63+
alias(token.immediate(prec(1, /[^"]+/)), $.string_fragment),
64+
alias(token.immediate('"'), $.string_fragment),
65+
),
66+
),
67+
'"""',
68+
),
69+
3070
value: ($) =>
3171
seq(
3272
optional($.type),
@@ -36,83 +76,57 @@ module.exports = grammar(kdl, {
3676
bare_identifier: ($) => $._bare_identifier,
3777

3878
node: ($) =>
39-
choice($.arco_pure_math_node, $.arco_constraint_node, $.kdl_node),
79+
choice(
80+
$.arco_pure_math_node,
81+
$.arco_constraint_node,
82+
$.kdl_node,
83+
),
4084

41-
kdl_node: ($) => prec(1, nodeShape($, $.identifier, $.node_children)),
85+
kdl_node: ($) =>
86+
prec(
87+
1,
88+
nodeShape($, {
89+
nameRule: $.identifier,
90+
childrenRule: $.node_children,
91+
}),
92+
),
4293

4394
// Nodes whose { } body is always algebra text.
4495
arco_pure_math_node: ($) =>
4596
prec(
4697
2,
47-
nodeShape(
48-
$,
49-
field(
50-
"name",
51-
choice(
52-
"expression",
53-
"minimize",
54-
"maximize",
55-
"expr",
56-
"filter",
57-
"if",
58-
"lower",
59-
"upper",
60-
),
61-
),
62-
$.arco_pure_math_children,
63-
),
98+
nodeShape($, {
99+
nameRule: field("name", choice(...PURE_MATH_NODE_NAMES)),
100+
childrenRule: $.arco_pure_math_children,
101+
}),
64102
),
65103

66104
// Constraint nodes can have either KDL children or a math body.
67105
arco_constraint_node: ($) =>
68106
prec(
69107
2,
70-
nodeShape(
71-
$,
72-
field("name", "constraint"),
73-
choice($.arco_constraint_math_children, $.node_children),
74-
),
108+
nodeShape($, {
109+
nameRule: field("name", "constraint"),
110+
childrenRule: choice($.arco_constraint_math_children, $.node_children),
111+
}),
75112
),
76113

77114
// Math body for nodes whose braces are always algebra text.
78115
arco_pure_math_children: ($) =>
79-
prec(
80-
2,
81-
seq(
82-
optional(
83-
seq(alias("/-", $.node_children_comment), repeat($._node_space)),
84-
),
85-
"{",
86-
choice(field("math", $.arco_math_text), seq(repeat($._linespace))),
87-
"}",
88-
),
89-
),
116+
mathChildren($, $.arco_math_text),
90117

91118
// Constraint math body remains stricter so child-node bodies keep parsing
92119
// as KDL instead of being swallowed as free-form math text.
93120
arco_constraint_math_children: ($) =>
94-
prec(
95-
2,
96-
seq(
97-
optional(
98-
seq(alias("/-", $.node_children_comment), repeat($._node_space)),
99-
),
100-
"{",
101-
choice(
102-
field("math", $.arco_constraint_math_text),
103-
seq(repeat($._linespace)),
104-
),
105-
"}",
106-
),
107-
),
121+
mathChildren($, $.arco_constraint_math_text),
108122

109123
// Single opaque token for free-form algebra text in expression/minimize/
110124
// maximize/expr/filter/if/lower/upper nodes.
111-
arco_math_text: (_) => token(prec(10, /[^{}"']+/)),
125+
arco_math_text: (_) => token(prec(10, /[^{}]+/)),
112126

113127
// Constraint math must include an operator or bracket so bare KDL child
114128
// nodes like `if { ... }` still parse through node_children.
115129
arco_constraint_math_text: (_) =>
116-
token(prec(10, /[^{}"']*[<>=!+\-*\/\[\]][^{}"']*/)),
130+
token(prec(10, /[^{}]*[<>=!+\-*\/\[\]][^{}]*/)),
117131
},
118132
});

tools/tree-sitter-arco-kdl/src/grammar.json

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -630,12 +630,21 @@
630630
"type": "CHOICE",
631631
"members": [
632632
{
633-
"type": "SYMBOL",
634-
"name": "_raw_string"
633+
"type": "CHOICE",
634+
"members": [
635+
{
636+
"type": "SYMBOL",
637+
"name": "_raw_string"
638+
},
639+
{
640+
"type": "SYMBOL",
641+
"name": "_escaped_string"
642+
}
643+
]
635644
},
636645
{
637646
"type": "SYMBOL",
638-
"name": "_escaped_string"
647+
"name": "_multiline_string"
639648
}
640649
]
641650
},
@@ -1214,6 +1223,67 @@
12141223
}
12151224
]
12161225
},
1226+
"_multiline_string": {
1227+
"type": "SEQ",
1228+
"members": [
1229+
{
1230+
"type": "STRING",
1231+
"value": "\"\"\""
1232+
},
1233+
{
1234+
"type": "CHOICE",
1235+
"members": [
1236+
{
1237+
"type": "SYMBOL",
1238+
"name": "_newline"
1239+
},
1240+
{
1241+
"type": "BLANK"
1242+
}
1243+
]
1244+
},
1245+
{
1246+
"type": "REPEAT",
1247+
"content": {
1248+
"type": "CHOICE",
1249+
"members": [
1250+
{
1251+
"type": "ALIAS",
1252+
"content": {
1253+
"type": "IMMEDIATE_TOKEN",
1254+
"content": {
1255+
"type": "PREC",
1256+
"value": 1,
1257+
"content": {
1258+
"type": "PATTERN",
1259+
"value": "[^\"]+"
1260+
}
1261+
}
1262+
},
1263+
"named": true,
1264+
"value": "string_fragment"
1265+
},
1266+
{
1267+
"type": "ALIAS",
1268+
"content": {
1269+
"type": "IMMEDIATE_TOKEN",
1270+
"content": {
1271+
"type": "STRING",
1272+
"value": "\""
1273+
}
1274+
},
1275+
"named": true,
1276+
"value": "string_fragment"
1277+
}
1278+
]
1279+
}
1280+
},
1281+
{
1282+
"type": "STRING",
1283+
"value": "\"\"\""
1284+
}
1285+
]
1286+
},
12171287
"bare_identifier": {
12181288
"type": "SYMBOL",
12191289
"name": "_bare_identifier"
@@ -1774,7 +1844,7 @@
17741844
"value": 10,
17751845
"content": {
17761846
"type": "PATTERN",
1777-
"value": "[^{}\"']+"
1847+
"value": "[^{}]+"
17781848
}
17791849
}
17801850
},
@@ -1785,7 +1855,7 @@
17851855
"value": 10,
17861856
"content": {
17871857
"type": "PATTERN",
1788-
"value": "[^{}\"']*[<>=!+\\-*\\/\\[\\]][^{}\"']*"
1858+
"value": "[^{}]*[<>=!+\\-*\\/\\[\\]][^{}]*"
17891859
}
17901860
}
17911861
}

tools/tree-sitter-arco-kdl/src/node-types.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@
494494
"named": true,
495495
"fields": {},
496496
"children": {
497-
"multiple": false,
497+
"multiple": true,
498498
"required": false,
499499
"types": [
500500
{
@@ -573,6 +573,10 @@
573573
"type": "\"",
574574
"named": false
575575
},
576+
{
577+
"type": "\"\"\"",
578+
"named": false
579+
},
576580
{
577581
"type": "#",
578582
"named": false

0 commit comments

Comments
 (0)