Skip to content

Commit

Permalink
[micro-syntax] Minor HTM8 cleanup
Browse files Browse the repository at this point in the history
And I realized that these problems are similar:

- shell here doc - search until prefix-selected delimiter (line-wise)
- C++ multi-line string - search until prefix-selected delimiter
- HTML <script> <STYLE> - search until prefix-selected delimiter

And then these are similar:

- C++ /* - search until */
- HTML
  - <!-- search until -->
  - <? search until ?>
  - <![CDATA[ search until ]]>

I'm also not sure if <!DOCTYPE foo> needs to be multi-line.  Probably.

Maybe we need a mode that's not multi-line then?  It will work better
for start tags too.

    <a href=foo
       class=d></a>

Well, actually we have already taken care of that, because <a and > are
different tokens.  The mode persists across lines.
  • Loading branch information
Andy C committed Jan 20, 2025
1 parent e314d99 commit f011d93
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 24 deletions.
2 changes: 2 additions & 0 deletions doctools/micro-syntax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ readonly -a HTML_TESTS=(
<not-a-tag> --> more <p>'
'foo <![CDATA[ hello
<not-a-tag> ]]> more <p>'
'not-cdata <![cdata[ hello
<not-a-tag> ]]> more <p>'
'<script>if (x<y) {
console.log("hi"); } </script> hi <p>'
'<style>p { background-color: red;
Expand Down
9 changes: 9 additions & 0 deletions doctools/micro_syntax.cc
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,15 @@ int ScanOne(Reader* reader, OutputStream* out, Hook* hook) {
}
} break;

// TODO: I think we need a mode to escape into strstr(), for
// C++ - ending */
// HTML - ending --> ?> ]]> </SCRipt>
//
// So instead of returning 'eol', we can return a string to search for?
// Then we keep looking for more lines.
//
// This is similar to the problems of here doc and C++ multi-line
// strings. The main difference is that we're not using a submatch.
default:
break;
}
Expand Down
54 changes: 30 additions & 24 deletions doctools/micro_syntax.re2c.h
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ enum class html_mode_e {
Comm, // <!-- -->
Preprocessing, // <? ?>
CData, // <![CDATA[ x ]]>
HtmlCData, // <script> <style>
};

// LeftStartTag -> RightStartTag <a href=/ >
Expand All @@ -709,9 +710,9 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
name = [a-zA-Z][a-zA-Z0-9:_-]* ;
// TODO: check this pattern
char_name = '&' [a-zA-Z][a-zA-Z0-9]* ';' ;
char_dec = '&#' [0-9]+ ';' ;
char_hex = '&#x' [0-9a-fA-F]+ ';' ;
char_name = "&" [a-zA-Z][a-zA-Z0-9]* ";" ;
char_dec = "&#" [0-9]+ ";" ;
char_hex = "&#x" [0-9a-fA-F]+ ";" ;
*/

switch (lexer->line_mode) {
Expand All @@ -725,22 +726,24 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
char_dec { TOK(Id::CharEscape); }
char_hex { TOK(Id::CharEscape); }
'&' { TOK(Id::BadAmpersand); }
'>' { TOK(Id::BadGreaterThan); }
'<' { TOK(Id::BadLessThan); }
"&" { TOK(Id::BadAmpersand); }
">" { TOK(Id::BadGreaterThan); }
"<" { TOK(Id::BadLessThan); }
'</' name '>' { TOK(Id::EndTag); }
"</" name ">" { TOK(Id::EndTag); }
'<' name {
"<" name {
TOK_MODE(Id::TagNameLeft, html_mode_e::AttrName);
// TODO: <script> <style> - special logic for strstr()
}
'<!' [^\x00>]* '>' { TOK(Id::Str); }
// Problem: these can span more than one line ... it needs to be
// another mode? The end tag might be technically the same.
"<!" [^\x00>]* ">" { TOK(Id::Comm); }
'<!--' { TOK_MODE(Id::Comm, html_mode_e::Comm); }
'<?' { TOK_MODE(Id::Comm, html_mode_e::Preprocessing); }
'<![CDATA[' { TOK_MODE(Id::Str, html_mode_e::CData); }
"<!--" { TOK_MODE(Id::Comm, html_mode_e::Comm); }
"<?" { TOK_MODE(Id::Comm, html_mode_e::Preprocessing); }
"<![CDATA[" { TOK_MODE(Id::Str, html_mode_e::CData); }
// Like RawData
Expand All @@ -754,8 +757,11 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
/*!re2c
nul { return true; } // TODO: error
'>' { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
'/>' { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }
// TODO: If the tag was <script> or <STYLE>, then we want to enter
// HtmlCData mode, until we hit </script> or </STYLE>.
// This is live throughout AttrName, AttrValue, SQ, DQ states?
">" { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
"/>" { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }
space_required name {
// <a missing> - stay in the AttrName mode
Expand All @@ -776,8 +782,8 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
/*!re2c
nul { return true; } // TODO: error
'"' { TOK_MODE(Id::Str, html_mode_e::DQ); }
"'" { TOK_MODE(Id::Str, html_mode_e::SQ); }
["] { TOK_MODE(Id::Str, html_mode_e::DQ); }
['] { TOK_MODE(Id::Str, html_mode_e::SQ); }
// Unquoted value - a single token
unquoted_value = [^\x00 \r\n\t<>&"']+ ;
Expand All @@ -799,11 +805,11 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
// we would only need these for translation to XML, not
// highlighting?
'&' { TOK(Id::BadAmpersand); }
'>' { TOK(Id::BadGreaterThan); }
'<' { TOK(Id::BadLessThan); }
"&" { TOK(Id::BadAmpersand); }
">" { TOK(Id::BadGreaterThan); }
"<" { TOK(Id::BadLessThan); }
'"' { TOK_MODE(Id::Str, html_mode_e::AttrName); }
["] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
* { TOK(Id::Str); }
*/
}
Expand All @@ -818,10 +824,10 @@ bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
// we would only need these for translation to XML, not
// highlighting?
'&' { TOK(Id::BadAmpersand); }
'>' { TOK(Id::BadGreaterThan); }
'<' { TOK(Id::BadLessThan); }
"'" { TOK_MODE(Id::Str, html_mode_e::AttrName); }
"&" { TOK(Id::BadAmpersand); }
">" { TOK(Id::BadGreaterThan); }
"<" { TOK(Id::BadLessThan); }
['] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
* { TOK(Id::Str); }
*/
Expand Down

0 comments on commit f011d93

Please sign in to comment.