Skip to content

Commit

Permalink
Merge pull request #1 from ZacHooper/dev/cursor
Browse files Browse the repository at this point in the history
perf: scan through rather than copying token list
  • Loading branch information
ZacHooper authored Jun 28, 2024
2 parents e9bdec3 + 67d5df9 commit 7c6fa7d
Show file tree
Hide file tree
Showing 12 changed files with 164 additions and 131 deletions.
Binary file added .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Python
/__pycache__/
*.py[cod]
venv/
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ Copy the `json.📦` file into your project and it should be usable.
## TODO

1. ~~Fix parsing of floats.~~
2. Refactor the parser to use slices of the tokens. The way we are currently doing it probably results in a lot of copies of the list.
3. Checking if the first token is a right brace twice. Should be able to do this once.
2. Refactor the parser to use slices of the tokens
3. ~~Checking if the first token is a right brace twice. Should be able to do this once.~~
4. Add streaming capabilities.
5. Better handle escaped characters in Strings.
5. ~~Better handle escaped characters in Strings.~~
6. Moooore performance
7. Add GitHub action to build package

### Edge Cases to Consider

Expand Down Expand Up @@ -171,3 +173,14 @@ Circular references are not valid in JSON but if your parser encounters them, it
```

JSON doesn’t technically disallow duplicate keys, but parsers should decide how to handle them (`value2` will overwrite `value1` in most parsers).

## Progress

V0.0.1

- Working but slooooow. It took so long that I didn't wait for it to finish parsing the canada.json file.

V0.0.2

- Addressed copy issues.
- Now parses the canada.json file in 700ms on my hardware. Still slow but actually usable now.
6 changes: 2 additions & 4 deletions benchmark.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ import time
fn main() raises:
    """Time a single `json.loads` call on a benchmark file and print it in ms."""
    # Alternative inputs, kept for quick switching:
    # with open("data/citm_catalog.json", "r") as f:
    # with open("data/twitter.json", "r") as f:
    with open("data/canada_data.json", "r") as f:
        var text = f.read()

        # Only the parse itself is timed; reading the file is excluded.
        var started = time.now()
        var raw_data = json.loads(text)
        var elapsed = time.now() - started

        # Dividing by 1_000_000 yields the "ms" figure printed below
        # (assumes `time.now()` reports nanoseconds -- TODO confirm).
        print("Time taken to parse JSON: ", elapsed / 1000000, "ms")
        # print(json.dumps(raw_data))
18 changes: 18 additions & 0 deletions benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
from datetime import datetime as time


def main():
    """Time one ``json.loads`` call on the benchmark file and print the result.

    The duration is printed twice: once in milliseconds and once in
    nanoseconds. Only the parse is timed; reading the file is excluded.
    """
    # Imported locally because the module-level name ``time`` is bound to
    # ``datetime.datetime`` in this file. Subtracting two ``datetime`` values
    # gives a ``timedelta``, which cannot be reported as a plain ms/ns number,
    # so an integer nanosecond clock is used instead.
    from time import perf_counter_ns

    # Alternative inputs, kept for quick switching:
    #   "data/citm_catalog.json"
    #   "data/twitter.json"
    with open("data/canada_data.json", "r", encoding="utf-8") as f:
        text = f.read()

    start = perf_counter_ns()
    json.loads(text)
    elapsed_ns = perf_counter_ns() - start

    print("Time taken to parse JSON: ", elapsed_ns / 1_000_000, "ms")
    print("Time taken to parse JSON: ", elapsed_ns, "ns")


if __name__ == "__main__":
main()
Binary file added build/json.📦
Binary file not shown.
Binary file removed json.📦
Binary file not shown.
145 changes: 64 additions & 81 deletions json/lexer.mojo
Original file line number Diff line number Diff line change
@@ -1,119 +1,102 @@
from json.types import JSON_QUOTE, JSON_WHITESPACE, JSON_SYNTAX, Value
from json.types import (
JSON_QUOTE,
JSON_WHITESPACE,
JSON_SYNTAX,
Value,
JSON_NUMBER,
JSON_ESCAPE,
)


@value
struct LexResult:
var value: Value
var is_null: Bool

fn __init__(inout self, value: Value, is_null: Bool):
self.value = value
self.is_null = is_null


fn lex_string(string: String, inout position: Int) raises -> String:
    """Read the body of a JSON string literal.

    `position` must index the first character after the opening quote; on
    success it is moved to just past the closing quote.

    Escape sequences are not decoded: the backslash and the character that
    follows it are copied through unchanged, which also means an escaped quote
    does not terminate the string.

    Args:
        string: The full JSON text being lexed.
        position: Cursor into `string`, updated in place.

    Returns:
        The characters between the quotes, with escapes left as written.

    Raises:
        Error: If the input ends before a closing quote is found.
    """
    var json_string: String = ""
    var end = len(string)
    var cursor = position

    while cursor < end:
        var c = string[cursor]

        # An unescaped quote closes the string (this also covers `""`).
        if c == JSON_QUOTE:
            position = cursor + 1
            return json_string

        if c == JSON_ESCAPE:
            # A backslash with nothing after it cannot be a valid escape;
            # stop here rather than reading past the end of the input.
            if cursor + 1 >= end:
                break
            # Keep the escape character and the character it escapes.
            json_string += c
            json_string += string[cursor + 1]
            cursor += 2
            continue

        json_string += c
        cursor += 1

    raise Error("Expected end-of-string quote")


fn lex_number(string: String, inout position: Int) raises -> Value:
    """Read a JSON number starting at `position`.

    Consumes the longest run of number characters, advances `position` past
    it, and converts the text. Text containing a fraction or an exponent is
    converted with `atof`; everything else with `atol`.

    Args:
        string: The full JSON text being lexed.
        position: Cursor into `string`, updated in place.

    Returns:
        A `Value` wrapping the parsed float or integer.

    Raises:
        Error: If the collected text is not a valid number (raised by
            `atof` / `atol`).
    """
    var json_number: String = ""
    # Includes "E" and "+" so exponents such as 1E5 and 1e+5 are consumed whole.
    var number_characters = "1234567890-+eE."
    var is_float = False
    var original_position = position

    for i in range(len(string) - original_position):
        var c = string[i + original_position]
        if c in number_characters:
            # A fraction or exponent marker means this cannot go through atol.
            if c == "." or c == "e" or c == "E":
                is_float = True
            json_number += c
        else:
            break

    position += len(json_number)

    if is_float:
        var num = atof(json_number)
        return Value(num)

    return Value(atol(json_number))


fn _starts_with_at(string: String, position: Int, literal: String) -> Bool:
    """Report whether `literal` occurs in `string` starting exactly at `position`."""
    var stop = position + len(literal)
    if stop > len(string):
        return False
    return string[position:stop] == literal


fn lex(raw_string: String) raises -> List[Value]:
    """Split JSON text into a flat list of tokens.

    Whitespace is skipped. Strings, numbers, `true`, `false` and `null` become
    `Value`s of the matching type; each structural character in `JSON_SYNTAX`
    becomes a one-character string `Value`.

    Args:
        raw_string: The JSON text to tokenize.

    Returns:
        The tokens in input order.

    Raises:
        Error: On a character that cannot start any token, on a malformed
            `true` / `false` / `null` literal, or when `lex_string` /
            `lex_number` reject their input.
    """
    var tokens = List[Value]()
    var length = len(raw_string)
    var position: Int = 0

    while position < length:
        var c = raw_string[position]

        if c in JSON_WHITESPACE:
            position += 1
        elif c == JSON_QUOTE:
            # Step over the opening quote; lex_string consumes the rest.
            position += 1
            var json_string = lex_string(raw_string, position)
            tokens.append(Value(json_string))
        elif c in JSON_NUMBER:
            var json_number = lex_number(raw_string, position)
            tokens.append(json_number)
        # Literals are matched in full so that input such as "tru" or "nul"
        # is rejected instead of being accepted on its first letter.
        elif _starts_with_at(raw_string, position, "true"):
            tokens.append(Value(True))
            position += 4
        elif _starts_with_at(raw_string, position, "false"):
            tokens.append(Value(False))
            position += 5
        elif _starts_with_at(raw_string, position, "null"):
            tokens.append(Value(None))
            position += 4
        elif c in JSON_SYNTAX:
            tokens.append(Value(c))
            position += 1
        else:
            # Clamp the context window so an error near the start or end of
            # the input does not produce a negative or out-of-range slice.
            var context_start = position - 10
            if context_start < 0:
                context_start = 0
            var context_end = position + 10
            if context_end > length:
                context_end = length
            raise Error(
                "Unexpected character: "
                + c
                + " Near: "
                + raw_string[context_start:context_end]
            )

    return tokens
52 changes: 30 additions & 22 deletions json/parser.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -9,70 +9,78 @@ fn is_special_token(token: Value, special_token: String) -> Bool:
return False


fn parse_array(tokens: List[Value], inout position: Int) raises -> Value:
    """Parse the elements of a JSON array.

    `position` must index the first token after the opening `[`. When the
    closing `]` is found, `position` is left pointing at it; the caller is
    responsible for stepping past it.

    Args:
        tokens: The full token list produced by the lexer.
        position: Cursor into `tokens`, updated in place.

    Returns:
        A `Value` wrapping a `JsonList` of the parsed elements.

    Raises:
        Error: If parsing a nested value fails.
    """
    var json_array = List[Value]()

    # Bound the scan by the cursor: the token list is no longer consumed, so
    # its length alone can never end the loop.
    while position < len(tokens):
        var next_token = tokens[position]
        if is_special_token(next_token, JSON_RIGHTBRACKET):
            return Value(JsonList(json_array))
        elif is_special_token(next_token, JSON_COMMA):
            position += 1
        else:
            var parsed_token = parse(tokens, position)
            json_array.append(parsed_token)
            # parse leaves the cursor on the last token of the value.
            position += 1

    # Tokens ran out before a closing bracket: return what was collected.
    return Value(JsonList(json_array))


fn parse_object(tokens: List[Value], inout position: Int) raises -> Value:
    """Parse the key/value pairs of a JSON object.

    `position` must index the first token after the opening `{`. When the
    closing `}` is found, `position` is left pointing at it; the caller is
    responsible for stepping past it. A repeated key overwrites the earlier
    entry.

    Args:
        tokens: The full token list produced by the lexer.
        position: Cursor into `tokens`, updated in place.

    Returns:
        A `Value` wrapping a `JsonDict` of the parsed pairs.

    Raises:
        Error: If a key is not followed by a colon, if the tokens end before
            the value, or if parsing a nested value fails.
    """
    var json_object = Dict[String, Value]()

    # Bound the scan by the cursor: the token list is no longer consumed, so
    # its length alone can never end the loop.
    while position < len(tokens):
        var key = tokens[position]

        if is_special_token(key, JSON_RIGHTBRACE):
            return Value(JsonDict(json_object))
        if is_special_token(key, JSON_COMMA):
            position += 1
            continue

        # The key must be followed by a colon...
        position += 1
        if position >= len(tokens):
            raise Error("Expected colon after key in object")
        if is_special_token(tokens[position], JSON_COLON) == False:
            raise Error("Expected colon after key in object")

        # ...and the colon by a value.
        position += 1
        if position >= len(tokens):
            raise Error("Expected value after colon in object")
        var value = parse(tokens, position)
        json_object[key._variant[String]] = value
        # parse leaves the cursor on the last token of the value.
        position += 1

    # Tokens ran out before a closing brace: return what was collected.
    return Value(JsonDict(json_object))


fn parse(tokens: List[Value], inout position: Int) raises -> Value:
    """Parse the JSON value that starts at `tokens[position]`.

    An opening `{` or `[` is stepped over and the rest is handed to
    `parse_object` / `parse_array`. Any other token is returned unchanged and
    `position` is not moved.

    Args:
        tokens: The full token list produced by the lexer.
        position: Cursor into `tokens`, updated in place.

    Returns:
        The parsed value.

    Raises:
        Error: If `position` is past the end of `tokens` (for example, empty
            input), or if parsing a nested container fails.
    """
    # Guard the read: with a cursor instead of a shrinking list, running out
    # of tokens must be detected explicitly.
    if position >= len(tokens):
        raise Error("Unexpected end of JSON input")

    var first_token = tokens[position]

    if is_special_token(first_token, JSON_LEFTBRACE):
        position += 1
        return parse_object(tokens, position)
    if is_special_token(first_token, JSON_LEFTBRACKET):
        position += 1
        return parse_array(tokens, position)

    return first_token
3 changes: 2 additions & 1 deletion json/python_compatability.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ fn loads(raw_json: String) raises -> Value:
var tokens = lex(raw_json)

print("Parsing JSON")
return parse(tokens)
var initial_position = 0
return parse(tokens, initial_position)


fn dumps(value: Value) raises -> String:
Expand Down
2 changes: 2 additions & 0 deletions json/types.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ from collections import List, Dict
alias JSON_QUOTE = '"'
# Insignificant whitespace in JSON: space, tab, line feed and carriage return.
# The carriage return is needed so files with CRLF line endings lex cleanly.
alias JSON_WHITESPACE = " \t\n\r"
alias JSON_SYNTAX = "{}[],:"
# Characters that can start a number: a leading minus sign or a digit.
alias JSON_NUMBER = "-0123456789"
alias JSON_ESCAPE = "\\"

alias JSON_LEFTBRACKET = "["
alias JSON_RIGHTBRACKET = "]"
Expand Down
Loading

0 comments on commit 7c6fa7d

Please sign in to comment.