llmware-ai · doberst · Mar 1, 2024 · Jan 29, 2024 · Jan 29, 2024 · Feb 28, 2024
diff --git a/llmware/util.py b/llmware/util.py
@@ -795,9 +795,15 @@ def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=T
         self.one_letter_removal = one_letter_removal
 
     def tokenize(self, text):
+
+        # strip the whitespace from the beginning and end of the text so we can tokenize the data
+        text = text.strip()
+        # start with basic whitespace tokenizing
+        # NOTE: the previous version split on a single space only (reason unclear):
+        #   text2 = text.split(" ")
+        # split() with no argument splits on any run of whitespace (spaces, tabs, newlines)
+        text2 = text.split()
 
-        # start with basic whitespace tokenizing
-        text2 = text.split(" ")
 
         if self.remove_punctuation:
             text2 = Utilities().clean_list(text2)