Skip to content

Commit b7eb4f0

Browse files
committed
feat: initial markdown lexer/serializer implementation
potentially a few bugs in here!
0 parents  commit b7eb4f0

File tree

7 files changed

+1224
-0
lines changed

7 files changed

+1224
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# go-markdown

go.mod

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module github.com/jonlinkens/go-markdown
2+
3+
go 1.22.1

lexer/enrich.go

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
package lexer
2+
3+
import (
4+
"strconv"
5+
"strings"
6+
"unicode"
7+
)
8+
9+
// HeadingMeta describes a heading token: Level is the number of
// leading '#' characters.
type HeadingMeta struct {
	Level int `json:"level"`
}

// FencedCodeBlockMeta describes a fenced code block token: Language is
// the identifier parsed from the opening fence line ("```go" -> "go"),
// empty when the fence has no info string.
type FencedCodeBlockMeta struct {
	Language string `json:"language"`
}

// LinkMeta describes a link token: Src is the destination URL parsed
// from the "(url)" part of "[text](url)".
type LinkMeta struct {
	Src string `json:"src"`
}

// ImageMeta describes an image token: Src is the image source parsed
// from the "(src)" part of "![alt](src)".
type ImageMeta struct {
	Src string `json:"src"`
}

// BlockquoteMeta describes a blockquote token: Depth is the number of
// leading '>' characters (nesting level).
type BlockquoteMeta struct {
	Depth int `json:"depth"`
}

// OrderedListMeta describes an ordered list item token: Number is the
// ordinal parsed from the marker before the first '.'.
type OrderedListMeta struct {
	Number int `json:"number"`
}
31+
32+
func (l *Lexer) enrichToken(token Token) Token {
33+
34+
switch token.Type {
35+
case TokenEOF:
36+
return Token{Type: token.Type, Value: ""}
37+
case TokenNewLine:
38+
return Token{Type: token.Type, Value: ""}
39+
40+
case TokenHeading:
41+
level := countLeadingChars(token.Value, '#')
42+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimLeadingChars(token.Value, '#', level), Meta: HeadingMeta{Level: level}}
43+
44+
case TokenBold:
45+
boldChar := rune(token.Value[0])
46+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimSurroundingChars(token.Value, boldChar, 2)}
47+
48+
case TokenItalic:
49+
italicChar := rune(token.Value[0])
50+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimSurroundingChars(token.Value, italicChar, 1)}
51+
52+
case TokenInlineCode:
53+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimSurroundingChars(token.Value, '`', 1)}
54+
55+
case TokenFencedCodeBlock:
56+
language := parseLanguageFromFencedCodeBlock(token.Value)
57+
if len(language) > 0 {
58+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimCodeBlock(token.Value, language), Meta: FencedCodeBlockMeta{Language: language}}
59+
}
60+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimCodeBlock(token.Value, language)}
61+
62+
case TokenUnorderedList:
63+
listChar := rune(token.Value[0])
64+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimLeadingChars(token.Value, listChar, 1)}
65+
66+
case TokenOrderedList:
67+
number, cleanValue := parseOrderedListParts(token.Value)
68+
69+
return Token{Type: token.Type, Value: token.Value, CleanValue: cleanValue, Meta: OrderedListMeta{Number: number}}
70+
71+
case TokenLink:
72+
title, url := parseLink(token.Value)
73+
return Token{Type: token.Type, Value: token.Value, CleanValue: title, Meta: LinkMeta{Src: url}}
74+
75+
case TokenImage:
76+
alt, src := parseImage(token.Value)
77+
return Token{Type: token.Type, Value: token.Value, CleanValue: alt, Meta: ImageMeta{Src: src}}
78+
79+
case TokenBlockquote:
80+
depth := countLeadingChars(token.Value, '>')
81+
token.CleanValue = trimLeadingChars(token.Value, '>', depth)
82+
return Token{Type: token.Type, Value: token.Value, CleanValue: trimLeadingChars(token.Value, '>', depth), Meta: BlockquoteMeta{Depth: depth}}
83+
}
84+
85+
return Token{Type: token.Type, Value: token.Value, CleanValue: token.Value}
86+
}
87+
88+
// countLeadingChars reports how many consecutive copies of char appear
// at the start of s.
func countLeadingChars(s string, char rune) int {
	n := 0
	for _, r := range s {
		if r != char {
			return n
		}
		n++
	}
	return n
}
99+
100+
// trimLeadingChars removes up to count copies of char from the start of
// s, then strips surrounding whitespace (but never newlines, so code
// block bodies keep their line structure).
func trimLeadingChars(s string, char rune, count int) string {
	trimmed := s
	for n := 0; n < count && len(trimmed) > 0 && rune(trimmed[0]) == char; n++ {
		trimmed = trimmed[1:]
	}

	isNonNewlineSpace := func(r rune) bool {
		return r != '\n' && unicode.IsSpace(r)
	}
	return strings.TrimFunc(trimmed, isNonNewlineSpace)
}
110+
111+
// trimEndingChars removes up to count copies of char from the end of s.
func trimEndingChars(s string, char rune, count int) string {
	end := len(s)
	for end > 0 && len(s)-end < count && rune(s[end-1]) == char {
		end--
	}
	return s[:end]
}
120+
121+
func trimSurroundingChars(s string, char rune, count int) string {
122+
s = trimLeadingChars(s, char, count)
123+
s = trimEndingChars(s, char, count)
124+
return s
125+
}
126+
127+
func trimCodeBlock(s string, language string) string {
128+
s = trimSurroundingChars(s, '`', 3)
129+
130+
if len(language) <= 0 {
131+
return s
132+
}
133+
134+
return s[len(language):]
135+
}
136+
137+
// parseLanguageFromFencedCodeBlock extracts the language identifier
// from the opening fence line of a fenced code block
// ("```go\n..." -> "go"). It returns "" when the fence carries no info
// string. Only the first word of the info string is treated as the
// language.
func parseLanguageFromFencedCodeBlock(s string) string {
	// The info string is everything between the opening backticks and
	// the first newline. The previous implementation indexed s[0:4]
	// (panicking on inputs shorter than four bytes) and scanned words
	// across newlines, so a fence line of "``` " mistook the first code
	// word for a language.
	info, _, _ := strings.Cut(strings.TrimLeft(s, "`"), "\n")

	fields := strings.Fields(info)
	if len(fields) == 0 {
		return ""
	}
	return fields[0]
}
150+
151+
// parseOrderedListParts splits an ordered-list item such as "3. text"
// into its ordinal number and the item text (spaces trimmed from the
// text). It panics when the marker before the first '.' is not an
// integer; the lexer is expected to only emit well-formed markers —
// TODO confirm.
func parseOrderedListParts(s string) (int, string) {
	// Cut at the FIRST '.' only. strings.Split + parts[1] previously
	// discarded everything after a second '.' in the item body
	// ("1. see fig. 2" lost " 2") and panicked when no '.' was present.
	marker, rest, _ := strings.Cut(s, ".")

	number, err := strconv.Atoi(marker)
	if err != nil {
		panic(err)
	}

	return number, strings.Trim(rest, " ")
}
160+
161+
// parseLink extracts the display text and destination URL from a
// markdown link token of the form "[text](url)", returned as
// (text, url). Assumes the lexer produced a well-formed token with all
// four delimiters present — TODO confirm; malformed input panics on the
// slice bounds.
func parseLink(s string) (string, string) {
	startText := strings.Index(s, "[") + 1
	endText := strings.Index(s, "]")
	text := s[startText:endText]

	// Search for the URL delimiters AFTER the closing bracket. Scanning
	// the whole string previously mis-sliced (or panicked on) links
	// whose text contained parentheses, e.g. "[pi (approx)](u)".
	rest := s[endText+1:]
	startURL := strings.Index(rest, "(") + 1
	endURL := strings.Index(rest, ")")
	url := rest[startURL:endURL]

	return text, url
}
171+
172+
// parseImage extracts the alt text and source URL from a markdown image
// token of the form "![alt](src)", returned as (alt, src). Assumes the
// lexer produced a well-formed token with all delimiters present —
// TODO confirm; malformed input panics on the slice bounds.
func parseImage(s string) (string, string) {
	startAlt := strings.Index(s, "![") + 2
	endAlt := strings.Index(s, "]")
	alt := s[startAlt:endAlt]

	// Search for the source delimiters AFTER the closing bracket, so
	// parentheses inside the alt text cannot corrupt the parsed src
	// (same defect pattern as parseLink had).
	rest := s[endAlt+1:]
	startURL := strings.Index(rest, "(") + 1
	endURL := strings.Index(rest, ")")
	src := rest[startURL:endURL]

	return alt, src
}

0 commit comments

Comments
 (0)