-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathsingleton.rs
70 lines (61 loc) · 2.48 KB
/
singleton.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
use std::sync::Arc;
use lazy_static::lazy_static;
use parking_lot::Mutex;
use crate::vendor_tiktoken::CoreBPE;
use crate::{cl100k_base, llama3_base, o200k_base, p50k_base, p50k_edit, r50k_base};
/// Returns a shared singleton instance of the r50k_base tokenizer (also known as `gpt2`).
/// Use for GPT-3 models like `davinci`.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `r50k_base()` fails while the tokenizer is being initialized.
pub fn r50k_base_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref R50K_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            r50k_base().expect("failed to initialize the r50k_base tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&R50K_BASE)
}
/// Returns a shared singleton instance of the p50k_base tokenizer.
/// Use for Code models, `text-davinci-002`, `text-davinci-003`.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `p50k_base()` fails while the tokenizer is being initialized.
pub fn p50k_base_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref P50K_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            p50k_base().expect("failed to initialize the p50k_base tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&P50K_BASE)
}
/// Returns a shared singleton instance of the p50k_edit tokenizer.
/// Use for edit models like `text-davinci-edit-001`, `code-davinci-edit-001`.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `p50k_edit()` fails while the tokenizer is being initialized.
pub fn p50k_edit_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref P50K_EDIT: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            p50k_edit().expect("failed to initialize the p50k_edit tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&P50K_EDIT)
}
/// Returns a shared singleton instance of the cl100k_base tokenizer.
/// Use for ChatGPT models, `text-embedding-ada-002`.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `cl100k_base()` fails while the tokenizer is being initialized.
pub fn cl100k_base_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref CL100K_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            cl100k_base().expect("failed to initialize the cl100k_base tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&CL100K_BASE)
}
/// Returns a shared singleton instance of the o200k_base tokenizer.
/// Use for GPT-4o models.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `o200k_base()` fails while the tokenizer is being initialized.
pub fn o200k_base_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref O200K_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            o200k_base().expect("failed to initialize the o200k_base tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&O200K_BASE)
}
/// Returns a shared singleton instance of the llama3_base tokenizer.
///
/// The tokenizer is initialized only once, on first use; every call returns a cloned
/// [`Arc`] handle to that same instance, guarded by a [`Mutex`].
///
/// # Panics
///
/// Panics if `llama3_base()` fails while the tokenizer is being initialized.
pub fn llama3_base_singleton() -> Arc<Mutex<CoreBPE>> {
    lazy_static! {
        static ref LLAMA3_BASE: Arc<Mutex<CoreBPE>> = Arc::new(Mutex::new(
            llama3_base().expect("failed to initialize the llama3_base tokenizer"),
        ));
    }
    // Only the reference count is bumped; the tokenizer itself is never copied.
    Arc::clone(&LLAMA3_BASE)
}