feat(chunker): add tokenizer argument for chunking
- Add `tokenizer_name_or_path` argument to chunk_doc_corpus.py and preprocess_wiki.py
- Update chunker initialization to use the specified tokenizer
- Document the new argument in the usage instructions
ignorejjj committed Jan 10, 2025
1 parent 0464e6d commit 32d64e8
Showing 3 changed files with 12 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/chunk-doc-corpus.md
@@ -38,3 +38,4 @@ You will get a JSONL file with the following format:
- `output_path`: Path to the output JSONL file.
- `chunk_by`: Chunking method to use. Can be `token`, `word`, `sentence`, or `recursive`.
- `chunk_size`: Size of chunks.
+- `tokenizer_name_or_path`: Name or path of the tokenizer used for chunking.
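For illustration only (this example is not part of the commit), the new flag slots in alongside the documented options; the file paths here are hypothetical placeholders:

python scripts/chunk_doc_corpus.py \
    --input_path corpus.jsonl \
    --output_path chunked_corpus.jsonl \
    --chunk_by token \
    --chunk_size 512 \
    --tokenizer_name_or_path o200k_base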
10 changes: 6 additions & 4 deletions scripts/chunk_doc_corpus.py
@@ -29,6 +29,8 @@ def save_jsonl(documents, file_path):
parser.add_argument("--chunk_by", default="token", choices=["token", "word", "sentence", "recursive"],
help="Chunking method to use")
parser.add_argument("--chunk_size", default=512, type=int, help="Size of chunks")
parser.add_argument("--tokenizer_name_or_path", default='o200k_base', type=str)

args = parser.parse_args()

# Load documents
@@ -37,13 +39,13 @@ def save_jsonl(documents, file_path):

# Initialize chunker
if args.chunk_by == "token":
-    chunker = chonkie.TokenChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.TokenChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
elif args.chunk_by == "sentence":
-    chunker = chonkie.SentenceChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.SentenceChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
elif args.chunk_by == "recursive":
-    chunker = chonkie.RecursiveChunker(chunk_size=args.chunk_size, min_characters_per_chunk=1)
+    chunker = chonkie.RecursiveChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size, min_characters_per_chunk=1)
elif args.chunk_by == "word":
-    chunker = chonkie.WordChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.WordChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
else:
    raise ValueError(f"Invalid chunking method: {args.chunk_by}")
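
For a concrete picture of what the new argument changes, here is a minimal sketch assuming the chonkie API as used in this diff (the string tokenizer name is resolved by the library, as the 'o200k_base' default suggests; the sample text is invented):

import chonkie

# Build a token chunker with an explicitly named tokenizer, which is what
# the script now does via --tokenizer_name_or_path instead of relying on
# chonkie's default tokenizer.
chunker = chonkie.TokenChunker(tokenizer="o200k_base", chunk_size=512)

# Chunkers are callable on raw text and return chunk objects with a .text field.
chunks = chunker("A long document that should be split into token-sized pieces ...")
for chunk in chunks:
    print(chunk.text)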

9 changes: 5 additions & 4 deletions scripts/preprocess_wiki.py
@@ -152,6 +152,7 @@ def single_worker(docs):
parser.add_argument("--dump_path", type=str)
parser.add_argument("--chunk_by", default="token", choices=["token", "word", "sentence", "recursive"], type=str)
parser.add_argument("--chunk_size", default=512, type=int)
parser.add_argument("--tokenizer_name_or_path", default='o200k_base', type=str)
parser.add_argument("--num_workers", default=4, type=int)
parser.add_argument("--save_path", type=str, default="clean_corpus.jsonl")
args = parser.parse_args()
@@ -203,13 +204,13 @@ def single_worker(docs):

# Initialize a Chonkie chunker, based on the chunk_by argument
if args.chunk_by == "token":
-    chunker = chonkie.TokenChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.TokenChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
elif args.chunk_by == "sentence":
-    chunker = chonkie.SentenceChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.SentenceChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
elif args.chunk_by == "recursive":
-    chunker = chonkie.RecursiveChunker(chunk_size=args.chunk_size, min_characters_per_chunk=1)
+    chunker = chonkie.RecursiveChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size, min_characters_per_chunk=1)
elif args.chunk_by == "word":
-    chunker = chonkie.WordChunker(chunk_size=args.chunk_size)
+    chunker = chonkie.WordChunker(tokenizer=args.tokenizer_name_or_path, chunk_size=args.chunk_size)
else:
    raise ValueError(f"Invalid chunking method: {args.chunk_by}")

Expand Down
