Skip to content

Commit acc9fed

Browse files
committed
initial commit
0 parents  commit acc9fed

File tree

8 files changed

+1297
-0
lines changed

8 files changed

+1297
-0
lines changed

.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
env
2+
.venv
3+
.ruff_cache
4+
dist
5+
*.egg-info
6+
**/__pycache__

LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2023 Matt Rickard
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# parserLLM
2+
3+
Use a context-free grammar and a parser generator to determine valid next tokens for an LLM generation. See [examples/example.py](examples/example.py) for an example of how to use this library.

examples/example.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
# Example: constrain dolly-v2-3b generation to valid JSON using parserLLM.

import sys
from pathlib import Path

from lark import Lark
from transformers import AutoModelForCausalLM, AutoTokenizer

# Make the repository root importable so the in-tree `parserllm` package is
# found without installing it first.
sys.path.append(str(Path(__file__).resolve().parent.parent))

from parserllm import complete_cf  # noqa: E402

model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b")
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b")

# model = AutoModelForCausalLM.from_pretrained("distilgpt2", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("distilgpt2", trust_remote_code=True)

# Lark grammar for a JSON subset (objects, arrays, strings, true/false/null).
# NOTE(review): SIGNED_NUMBER is imported but never used as a `value`
# alternative, so numbers are not accepted — presumably intentional for the
# demo; confirm.
json_grammar = r"""
?start: value

?value: object
      | array
      | string
      | "true" -> true
      | "false" -> false
      | "null" -> null

array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value

string : ESCAPED_STRING

%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

%ignore WS
"""


### Create the JSON parser with Lark, using the LALR algorithm
json_parser = Lark(json_grammar, parser='lalr',
                   # Using the basic lexer isn't required, and isn't usually recommended.
                   # But, it's good enough for JSON, and it's slightly faster.
                   lexer='basic',
                   # Disabling propagate_positions and placeholders slightly improves speed
                   propagate_positions=False,
                   maybe_placeholders=False,
                   regex=True)

prompt = "Write the first three letters of the alphabet in valid JSON format\n"

# Grammar-constrained generation: at each step only tokens that keep the
# completion parseable as JSON are allowed.
print(complete_cf(prompt, json_parser, "",
                  tokenizer,
                  model,
                  max_new_tokens=15,
                  debug=True))

# Unconstrained generation with the same prompt, printed for comparison.
print("regular\n", ' '.join(tokenizer.batch_decode(model.generate(tokenizer.encode(prompt, return_tensors="pt"),
                                                                  max_new_tokens=30,
                                                                  pad_token_id=tokenizer.eos_token_id,
                                                                  ))))

parserllm/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from parserllm.parserllm import complete_cf

parserllm/parserllm.py

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
import regex
3+
from lark import UnexpectedCharacters, UnexpectedInput
4+
from rellm import complete_re
5+
from transformers import PreTrainedModel, PreTrainedTokenizer
6+
7+
8+
def extract_terminal_regex(parser, stop_token):
    """Build a map from each terminal name in a Lark grammar to a compiled regex.

    Parameters
    ----------
    parser :
        A ``lark.Lark`` parser; its ``terminals`` attribute is inspected.
    stop_token : str
        Literal text of the end-of-sequence token (e.g. the tokenizer's EOS
        string). Registered under the pseudo-terminal name ``'$END'``.

    Returns
    -------
    dict
        Mapping of terminal name -> compiled ``regex`` pattern.
    """
    regex_map = {}
    for term in parser.terminals:
        # Terminals declared without a pattern (e.g. via %declare) are skipped.
        if term.pattern:
            regex_map[term.name] = regex.compile(term.pattern.to_regexp())

    # Fix: escape the stop token so regex metacharacters in tokens such as
    # "<|endoftext|>" match literally instead of acting as alternation
    # operators (unescaped, "<|endoftext|>" also matches "<", ">", "endoftext",
    # and even the empty string).
    regex_map['$END'] = regex.compile(regex.escape(stop_token))
    return regex_map
16+
17+
class ParserState():
    """Incremental lexing oracle over a Lark parser.

    Repeatedly probes the parser with the completion-so-far and reports which
    terminal names may legally come next, remembering the expectation set from
    the previous failure so that a partially generated token (which lexes as
    ``UnexpectedCharacters``) reuses it.
    """

    def __init__(self, parser):
        self.parser = parser        # Lark parser used to probe completions
        self.last_expected = []     # terminals expected at the last parse failure
        self.partial_token = ""     # text that most recently failed mid-token

    def next_lex(self, input_str):
        """Return the terminal names that may follow *input_str* ([] if complete)."""
        print("input_str: ", input_str)
        try:
            self.parser.parse(input_str)
        except UnexpectedCharacters:
            # Mid-token failure: fall back to the expectation set recorded at
            # the previous (token-boundary) failure point.
            print("partial_token: ", self.partial_token, "last_expected: ", self.last_expected)
            self.partial_token = input_str
            return self.last_expected
        except UnexpectedInput as exc:
            # Token-boundary failure: the parser tells us exactly which
            # terminals it expected; remember them for mid-token probes.
            self.last_expected = exc.expected
            print("expected_tokens: ", self.last_expected)
            return self.last_expected
        # The input parsed cleanly — nothing further is required.
        return []
39+
40+
def complete_cf(prompt: str, parser, partial_completion, tokenizer: PreTrainedTokenizer,
                model: PreTrainedModel, max_new_tokens: int = 3,
                debug: bool = False,
                **model_kwargs):
    """
    Complete a prompt so the completion conforms to a context-free grammar.

    At each step the parser determines which terminals may legally follow the
    completion so far; generation (via ``rellm.complete_re``) is then
    constrained to the union of those terminals' regexes.

    Parameters
    ----------
    prompt : str
        Prompt text fed to the model (not itself grammar-constrained).
    parser :
        A ``lark.Lark`` parser defining the grammar the completion must follow.
    partial_completion : str
        Grammar-conforming completion text generated so far (may be ``""``).
    tokenizer, model :
        HuggingFace tokenizer/model pair used for generation.
    max_new_tokens : int
        Maximum number of constrained generation steps.
    debug : bool
        Forwarded to ``complete_re`` for verbose output.

    Returns
    -------
    str
        The (possibly extended) completion text.
    """
    gen_tokens = 0
    prompt_plus_completion = prompt + partial_completion

    # '$END' is mapped to the tokenizer's literal EOS string.
    terminal_regexes = extract_terminal_regex(parser, tokenizer.decode(tokenizer.eos_token_id))
    parser_state = ParserState(parser)

    while gen_tokens < max_new_tokens:
        valid_next_lex = parser_state.next_lex(partial_completion)
        # Stop when the grammar accepts the completion as-is (nothing expected)
        # or only the end-of-input pseudo-terminal remains.
        if len(valid_next_lex) == 0 or (len(valid_next_lex) == 1 and '$END' in valid_next_lex):
            break
        r = [terminal_regexes[t] for t in valid_next_lex]

        next_token_completion = complete_re(prompt_plus_completion, r, tokenizer, model,
                                            stop_after_match=True, debug=debug, **model_kwargs)

        partial_completion += next_token_completion
        prompt_plus_completion = prompt_plus_completion + next_token_completion
        gen_tokens += 1

    return partial_completion

0 commit comments

Comments
 (0)