Skip to content

Commit

Permalink
save token map by default
Browse files (browse the repository at this point in the history)
  • Loading branch information
zhoupingjay committed Sep 3, 2023
1 parent ee43fdf commit 0d81995
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion sanguo_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, source = 'sanguo-utf8.txt', block_size = 192, training_set_ra
self.decoder = None
self.data = None

- def ingest(self, gen_dataset=True):
+ def ingest(self, gen_dataset=True, gen_token_map=True):
with open(self.source, 'r', encoding='utf-8') as f:
self.text = f.read()
print(f"Length of text: {len(self.text)}") # 606051 Chinese characters
Expand All @@ -41,6 +41,9 @@ def ingest(self, gen_dataset=True):
self.data = torch.tensor(self.encoder(self.text), dtype=torch.long)
# print(self.data.shape, self.data.dtype)

+ if gen_token_map:
+ self.save_token_map()
+
if gen_dataset:
self.gen_dataset()

Expand Down

0 comments on commit 0d81995

Please sign in to comment.