from dataclasses import dataclass, field
from typing import Optional


@dataclass
class KeyFinderState:
    """
    Mutable scanner state shared between calls to utils.find_all_top_level_keys.

    Every attribute is a real dataclass field, so each instance starts with its
    own values. In particular `indices` is a fresh list per instance instead of
    one list shared by every scan.
    """

    # The previous byte was a backslash, so the current byte is escaped.
    skip_next: bool = False
    # The cursor is currently inside a JSON string.
    in_str: bool = False
    # Nesting depth of lists at the cursor.
    list_depth: int = 0
    # Nesting depth of dicts at the cursor. Starts at 1 because scanning begins
    # right after the opening curly brace of the top-level object.
    dict_depth: int = 1
    # Index of the first byte of the current top-level key.
    key_start: Optional[int] = None
    # Index of the closing quote of the current top-level key.
    key_end: Optional[int] = None
    # Index of the first byte of the current value (assigned by the scanner).
    value_start: Optional[int] = None
    # Reserved for the end index of the current value.
    value_end: Optional[int] = None
    # Collected (key, value_start, value_end) tuples.
    indices: list = field(default_factory=list)
    # Cursor into the bytes being scanned. Starts at 1 to skip the opening brace.
    i: int = 1


def batched_find_all_top_level_keys(db_name, batch_size=10_000_000):
    """
    Scan a database file in batches and collect its top-level keys.

    Args:
    - `db_name`: Name of the database, passed through to io_bytes.read_bytes.
    - `batch_size`: Number of bytes requested per read. Defaults to 10 MB.

    Returns:
    - The list of index tuples collected in a KeyFinderState by
      utils.find_all_top_level_keys.

    Raises:
    - `ValueError`: If the file is empty or does not start with an opening
      curly brace.

    NOTE(review): positions are recorded relative to the batch they were found
    in, and a key or value that straddles a batch boundary is not handled by
    the scanner. Confirm the intended semantics for files larger than one batch.
    """
    state = KeyFinderState()
    batch_start = 0
    while True:
        # presumably returns the bytes in [batch_start, batch_start + batch_size)
        # and an empty bytes object past the end of the file - TODO confirm
        batch_bytes = io_bytes.read_bytes(db_name, batch_start, batch_start + batch_size)

        # Validate before indexing into the batch, so that an empty file is
        # reported as a ValueError instead of failing with an IndexError.
        if batch_start == 0 and (not batch_bytes or batch_bytes[0] != byte_codes.OPEN_CURLY):
            raise ValueError("The first byte of the database file must be an opening curly brace")
        if not batch_bytes:
            break

        utils.find_all_top_level_keys(batch_bytes, state, len(batch_bytes))

        # Advance to the next batch. Without this the same bytes are read forever.
        batch_start += batch_size
        # The scanner leaves the cursor at the end of the batch. Rewind it so
        # that the next batch is scanned from its first byte.
        state.i = 0
    return state.indices
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed for annotations. A runtime import would make utils and
    # indexing import each other, because indexing imports utils.
    from .indexing import KeyFinderState


def _close_pair(json_bytes: bytes, state: "KeyFinderState") -> None:
    """Record the finished key-value pair and reset the per-pair markers."""
    key = json_bytes[state.key_start:state.key_end].decode()
    state.indices.append((key, state.value_start, state.value_end))
    state.key_start = state.key_end = None
    state.value_start = state.value_end = None


def _consume_string_byte(current: int, state: "KeyFinderState", in_value: bool) -> None:
    """Handle one byte that lies inside a JSON string (key or value)."""
    if state.skip_next:
        # Escaped byte: it can neither close the string nor start an escape.
        state.skip_next = False
    elif current == byte_codes.BACKSLASH:
        state.skip_next = True
    elif current == byte_codes.QUOTE:
        state.in_str = False
        if not in_value:
            # A string outside of a value can only be a top-level key.
            state.key_end = state.i
    if in_value:
        state.value_end = state.i + 1


def _consume_value_byte(json_bytes: bytes, current: int, state: "KeyFinderState") -> None:
    """Handle one non-whitespace byte of a value that lies outside of strings."""
    if state.dict_depth == 1 and state.list_depth == 0:
        # Back at the top level: a comma or the closing brace ends the value.
        if current == byte_codes.COMMA:
            _close_pair(json_bytes, state)
            return
        if current == byte_codes.CLOSE_CURLY:
            _close_pair(json_bytes, state)
            state.dict_depth -= 1
            return
    if current == byte_codes.QUOTE:
        state.in_str = True
    elif current == byte_codes.OPEN_SQUARE:
        state.list_depth += 1
    elif current == byte_codes.CLOSE_SQUARE:
        state.list_depth -= 1
    elif current == byte_codes.OPEN_CURLY:
        state.dict_depth += 1
    elif current == byte_codes.CLOSE_CURLY:
        state.dict_depth -= 1
    # Trailing whitespace never reaches this point, so value_end always points
    # right after the last significant byte of the value.
    state.value_end = state.i + 1


def find_all_top_level_keys(json_bytes: bytes, state: "KeyFinderState", batch_size: int) -> "KeyFinderState":
    """
    In the bytes of the json object find all top level keys and the start and end
    indices of their values.

    Scanning starts at `state.i` and stops at `batch_size` (or the end of
    `json_bytes`, whichever comes first). For every completed top-level pair a
    tuple `(key, value_start, value_end)` is appended to `state.indices`, such
    that `json_bytes[value_start:value_end]` are exactly the bytes of the value.

    Args:
    - `json_bytes`: A bytes object containing valid JSON when decoded
    - `state`: The scanner state, mutated in place
    - `batch_size`: The index in json_bytes at which scanning stops

    Returns:
    - The same `state` object that was passed in.

    NOTE(review): all indices are relative to `json_bytes`. A key that started
    in a previously scanned buffer cannot be decoded from this one.
    """
    whitespace = (byte_codes.SPACE, byte_codes.TAB, byte_codes.NEWLINE)
    # Never read past the buffer, even if the caller passes a larger batch_size.
    end = min(batch_size, len(json_bytes))

    while state.i < end:
        current = json_bytes[state.i]
        # value_start may not exist on the state until the first value is seen.
        in_value = getattr(state, "value_start", None) is not None

        if state.skip_next or state.in_str:
            _consume_string_byte(current, state, in_value)
        elif current in whitespace:
            pass
        elif in_value:
            _consume_value_byte(json_bytes, current, state)
        elif state.key_end is not None:
            # The key is complete. The first byte that is not the colon starts
            # the value, and it must be processed as part of the value so that
            # an opening quote, bracket or brace is tracked.
            if current != byte_codes.COLON:
                state.value_start = state.i
                _consume_value_byte(json_bytes, current, state)
        elif state.dict_depth == 1:
            if current == byte_codes.QUOTE:
                state.key_start = state.i + 1
                state.in_str = True
            elif current == byte_codes.CLOSE_CURLY:
                # End of the top-level object without a pending value.
                state.dict_depth -= 1
        state.i += 1
    return state
pair in a bytes object containing valid JSON when decoded. + Valid start indices are the index after the colon or the index after that. + + Example: + + 01234567 + "2": {}, + + Valid start indices are 4 and 5. Returns 7. + Args: - `json_bytes`: A bytes object containing valid JSON when decoded - `index`: The start index in json_bytes Returns: - - The end index of the value. + - The index of the first byte right after the value's bytes. """ # See https://www.json.org/json-en.html for the JSON syntax