diff --git a/.travis.yml b/.travis.yml
index a125e95..ddf927e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,8 +14,8 @@ dist: bionic
 script:
   - go get -u ./...
-  - go test -v github.com/sugarme/tokenizer/normalizer
-  - go test -v github.com/sugarme/tokenizer/model/bpe
-  - go test -v github.com/sugarme/tokenizer/model/wordpiece
-  - go test -v github.com/sugarme/tokenizer/pretokenizer
-  - go test -v github.com/sugarme/tokenizer
+  - go test -v github.com/season-studio/tokenizer/normalizer
+  - go test -v github.com/season-studio/tokenizer/model/bpe
+  - go test -v github.com/season-studio/tokenizer/model/wordpiece
+  - go test -v github.com/season-studio/tokenizer/pretokenizer
+  - go test -v github.com/season-studio/tokenizer
diff --git a/added-vocabulary.go b/added-vocabulary.go
index cafeb11..a714d8a 100644
--- a/added-vocabulary.go
+++ b/added-vocabulary.go
@@ -8,7 +8,7 @@ import (
 	"unicode"
 
 	"github.com/sugarme/regexpset"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // AddedToken represents a token added by the user on top of the
diff --git a/added-vocabulary_test.go b/added-vocabulary_test.go
index 442c239..635744c 100644
--- a/added-vocabulary_test.go
+++ b/added-vocabulary_test.go
@@ -5,8 +5,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type ModelMock struct {
diff --git a/bpe_test.go b/bpe_test.go
index 77a60a2..732ff23 100644
--- a/bpe_test.go
+++ b/bpe_test.go
@@ -6,13 +6,13 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
 
-	// "github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	// "github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func getByteLevelBPE() (retVal *tokenizer.Tokenizer) {
diff --git a/config_test.go b/config_test.go
index a582ed8..88b67c9 100644
--- a/config_test.go
+++ b/config_test.go
@@ -5,7 +5,7 @@ import (
 	"fmt"
 	"os"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func ExampleConfig() {
diff --git a/decoder/bpe.go b/decoder/bpe.go
index 44bfec7..e34080d 100644
--- a/decoder/bpe.go
+++ b/decoder/bpe.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Allows decoding Original BPE by joining all the tokens and then replacing
diff --git a/decoder/byte-fallback.go b/decoder/byte-fallback.go
index 763f85a..cff8508 100644
--- a/decoder/byte-fallback.go
+++ b/decoder/byte-fallback.go
@@ -5,7 +5,7 @@ import (
 	"strings"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type ByteFallback struct {
diff --git a/decoder/ctc.go b/decoder/ctc.go
index f1f0655..26d8f79 100644
--- a/decoder/ctc.go
+++ b/decoder/ctc.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type CTC struct {
diff --git a/decoder/decoder.go b/decoder/decoder.go
index 60a9e8a..de7c9fa 100644
--- a/decoder/decoder.go
+++ b/decoder/decoder.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type DecoderBase struct {
diff --git a/decoder/fuse.go b/decoder/fuse.go
index 8f6cc3e..3784ab6 100644
--- a/decoder/fuse.go
+++ b/decoder/fuse.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Fuse constructs Fuse decoder
diff --git a/decoder/sequence.go b/decoder/sequence.go
index 64c8929..7a1cab0 100644
--- a/decoder/sequence.go
+++ b/decoder/sequence.go
@@ -1,7 +1,7 @@
 package decoder
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Sequence struct {
diff --git a/decoder/sequence_test.go b/decoder/sequence_test.go
index aa60154..2fde580 100644
--- a/decoder/sequence_test.go
+++ b/decoder/sequence_test.go
@@ -5,8 +5,8 @@ import (
 	// "strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestSequence(t *testing.T) {
diff --git a/decoder/strip.go b/decoder/strip.go
index dd428e2..8113c4f 100644
--- a/decoder/strip.go
+++ b/decoder/strip.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Strip struct {
diff --git a/decoder/wordpiece.go b/decoder/wordpiece.go
index 8c66b62..a6940b5 100644
--- a/decoder/wordpiece.go
+++ b/decoder/wordpiece.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // WordPieceDecoder takes care of decoding a list of wordpiece tokens
diff --git a/encoding.go b/encoding.go
index 65756ce..e4366ed 100644
--- a/encoding.go
+++ b/encoding.go
@@ -5,7 +5,7 @@ import (
 	"log"
 	"reflect"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type PaddingDirection int
diff --git a/encoding_test.go b/encoding_test.go
index bde15da..27b7bf3 100644
--- a/encoding_test.go
+++ b/encoding_test.go
@@ -5,7 +5,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestTokenizer_MergeWith(t *testing.T) {
diff --git a/example/basic/bert.go b/example/basic/bert.go
index 1b11369..575011f 100644
--- a/example/basic/bert.go
+++ b/example/basic/bert.go
@@ -4,13 +4,13 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func runBERT() {
diff --git a/example/basic/bpe.go b/example/basic/bpe.go
index 3beff62..d514550 100644
--- a/example/basic/bpe.go
+++ b/example/basic/bpe.go
@@ -4,11 +4,11 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func runBPE() {
diff --git a/example/basic/wordlevel.go b/example/basic/wordlevel.go
index 47d933f..a9d69c4 100644
--- a/example/basic/wordlevel.go
+++ b/example/basic/wordlevel.go
@@ -5,11 +5,11 @@ import (
 	"log"
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordlevel"
-	"github.com/sugarme/tokenizer/normalizer"
-	// "github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordlevel"
+	"github.com/season-studio/tokenizer/normalizer"
+	// "github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type customNormalizer struct{}
diff --git a/example/bpe/test.go b/example/bpe/test.go
index b9e1651..9514917 100644
--- a/example/bpe/test.go
+++ b/example/bpe/test.go
@@ -4,9 +4,9 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func runTest() {
diff --git a/example/bpe/train.go b/example/bpe/train.go
index eaf16ea..6826042 100644
--- a/example/bpe/train.go
+++ b/example/bpe/train.go
@@ -5,9 +5,9 @@ import (
 	"log"
 	"time"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func runTrain() {
diff --git a/example/decode/main.go b/example/decode/main.go
index af0ece0..22211e7 100644
--- a/example/decode/main.go
+++ b/example/decode/main.go
@@ -3,7 +3,7 @@ package main
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func main() {
diff --git a/example/pretrained/main.go b/example/pretrained/main.go
index 632542d..79725b5 100644
--- a/example/pretrained/main.go
+++ b/example/pretrained/main.go
@@ -5,8 +5,8 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 var (
diff --git a/example/truncation/main.go b/example/truncation/main.go
index af8bd08..cef3d13 100644
--- a/example/truncation/main.go
+++ b/example/truncation/main.go
@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func main() {
diff --git a/example/unigram/main.go b/example/unigram/main.go
index f5feb3c..4920d07 100644
--- a/example/unigram/main.go
+++ b/example/unigram/main.go
@@ -4,9 +4,9 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/unigram"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/unigram"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func main() {
diff --git a/example_test.go b/example_test.go
index 19caace..5ea81a5 100644
--- a/example_test.go
+++ b/example_test.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func ExampleTokenizer_Encode() {
diff --git a/go.mod b/go.mod
index 772381f..d9c5ac9 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,8 @@
-module github.com/sugarme/tokenizer
+module github.com/season-studio/tokenizer
 
-go 1.23
+go 1.23.0
+
+toolchain go1.24.6
 
 require (
 	github.com/emirpasic/gods v1.18.1
diff --git a/model/bpe/bpe.go b/model/bpe/bpe.go
index d408818..6b55435 100644
--- a/model/bpe/bpe.go
+++ b/model/bpe/bpe.go
@@ -15,9 +15,9 @@ import (
 	"log"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type Merges map[Pair]PairVal
@@ -385,14 +385,14 @@ func (b *BPE) MergeWord(w string) *Word {
 		byteLen = len(string(r))
 
 		// if first rune, add prefix
-		if byteIdx == 0 {
-			s = fmt.Sprintf("%v%v", prefix, string(r))
-		} else if currRuneIdx == len(chars) { // last rune, add suffix
+		currRuneIdx++
+		if currRuneIdx == len(chars) { // last rune, add suffix
 			s = fmt.Sprintf("%v%v", string(r), suffix)
+		} else if byteIdx == 0 {
+			s = fmt.Sprintf("%v%v", prefix, string(r))
 		} else { // the rest
 			s = string(r)
 		}
-		currRuneIdx++
 
 		// If `s` exists in vocab, add its id, otherwise add id of `unk`
 		vocab := *b.Vocab
diff --git a/model/bpe/bpe_test.go b/model/bpe/bpe_test.go
index 337988a..9d9b063 100644
--- a/model/bpe/bpe_test.go
+++ b/model/bpe/bpe_test.go
@@ -11,9 +11,9 @@ import (
 	// "strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	bpe "github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func TestBPE_FromFiles(t *testing.T) {
diff --git a/model/bpe/trainer.go b/model/bpe/trainer.go
index 106171f..6e17a24 100644
--- a/model/bpe/trainer.go
+++ b/model/bpe/trainer.go
@@ -15,7 +15,7 @@ import (
 	// 2.2 stars
 	// progressbar "github.com/cheggaaa/pb/v3"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Map with no value
diff --git a/model/bpe/trainer_test.go b/model/bpe/trainer_test.go
index 045cee1..49e021e 100644
--- a/model/bpe/trainer_test.go
+++ b/model/bpe/trainer_test.go
@@ -5,7 +5,7 @@ import (
 	"sort"
 	"testing"
 
-	bpe "github.com/sugarme/tokenizer/model/bpe"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
 )
 
 func TestBpeTrainer_Train(t *testing.T) {
diff --git a/model/bpe/word_test.go b/model/bpe/word_test.go
index 21dad14..c8f6188 100644
--- a/model/bpe/word_test.go
+++ b/model/bpe/word_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	bpe "github.com/sugarme/tokenizer/model/bpe"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
 )
 
 func TestMerge_Merge(t *testing.T) {
diff --git a/model/unigram/unigram.go b/model/unigram/unigram.go
index 0823a08..00ce85a 100644
--- a/model/unigram/unigram.go
+++ b/model/unigram/unigram.go
@@ -9,8 +9,8 @@ import (
 	"strings"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // TokenScore represents a token and its score in the Unigram model
diff --git a/model/unigram/unigram_test.go b/model/unigram/unigram_test.go
index e4dac9c..0981daa 100644
--- a/model/unigram/unigram_test.go
+++ b/model/unigram/unigram_test.go
@@ -4,7 +4,7 @@ import (
 	"testing"
 	"reflect"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // Test cases ported from Rust implementation:
diff --git a/model/wordlevel/wordlevel.go b/model/wordlevel/wordlevel.go
index 8fc7806..c820ae7 100644
--- a/model/wordlevel/wordlevel.go
+++ b/model/wordlevel/wordlevel.go
@@ -7,7 +7,7 @@ import (
 	"path/filepath"
 	"sort"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type config struct {
diff --git a/model/wordpiece/trainer.go b/model/wordpiece/trainer.go
index 8e0f4a2..7a04a76 100644
--- a/model/wordpiece/trainer.go
+++ b/model/wordpiece/trainer.go
@@ -1,8 +1,8 @@
 package wordpiece
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
 )
 
 // WordPieceTrainerBuilder can be used to create a `WordPieceTrainer` with a custom
diff --git a/model/wordpiece/wordpiece.go b/model/wordpiece/wordpiece.go
index 409fc65..a5505ca 100644
--- a/model/wordpiece/wordpiece.go
+++ b/model/wordpiece/wordpiece.go
@@ -8,10 +8,10 @@ import (
 	"path/filepath"
 	"sort"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type config struct {
diff --git a/model/wordpiece/wordpiece_test.go b/model/wordpiece/wordpiece_test.go
index d891776..85a94f8 100644
--- a/model/wordpiece/wordpiece_test.go
+++ b/model/wordpiece/wordpiece_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/wordpiece"
 )
 
 func TestWordpieceBuilder(t *testing.T) {
diff --git a/normalizer/normalized.go b/normalizer/normalized.go
index 00a31c7..4fa370c 100644
--- a/normalizer/normalized.go
+++ b/normalizer/normalized.go
@@ -8,8 +8,8 @@ import (
 	"unicode"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer/util"
-	slice "github.com/sugarme/tokenizer/util/slice"
+	"github.com/season-studio/tokenizer/util"
+	slice "github.com/season-studio/tokenizer/util/slice"
 
 	// "golang.org/x/text/transform"
 	"golang.org/x/text/unicode/norm"
diff --git a/normalizer/normalized_test.go b/normalizer/normalized_test.go
index f010fb2..013b24a 100644
--- a/normalizer/normalized_test.go
+++ b/normalizer/normalized_test.go
@@ -11,8 +11,8 @@ import (
 	// "golang.org/x/text/transform"
 	// "golang.org/x/text/unicode/norm"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	// "github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	// "github.com/season-studio/tokenizer/util"
 )
 
 func TestNormalized_NFDAddsNewChars(t *testing.T) {
diff --git a/normalizer/pattern.go b/normalizer/pattern.go
index 1a5ebe7..95f573f 100644
--- a/normalizer/pattern.go
+++ b/normalizer/pattern.go
@@ -5,7 +5,7 @@ import (
 	// "reflect"
 	"regexp"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // Pattern is used to split a NormalizedString
diff --git a/normalizer/pattern_test.go b/normalizer/pattern_test.go
index e98af59..2f3a9a4 100644
--- a/normalizer/pattern_test.go
+++ b/normalizer/pattern_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func doTest(t *testing.T, p normalizer.Pattern, inside string, want []normalizer.OffsetsMatch) {
diff --git a/normalizer/precompiled.go b/normalizer/precompiled.go
index 9afd0a6..042a9db 100644
--- a/normalizer/precompiled.go
+++ b/normalizer/precompiled.go
@@ -3,7 +3,7 @@ package normalizer
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer/spm"
+	"github.com/season-studio/tokenizer/spm"
 
 	"github.com/rivo/uniseg"
 )
diff --git a/pretokenizer.go b/pretokenizer.go
index 6a55545..365f6d7 100644
--- a/pretokenizer.go
+++ b/pretokenizer.go
@@ -7,7 +7,7 @@ import (
 	"log"
 
 	// "reflect"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type PreToken struct {
diff --git a/pretokenizer/bert.go b/pretokenizer/bert.go
index 9c40aad..0f31356 100644
--- a/pretokenizer/bert.go
+++ b/pretokenizer/bert.go
@@ -4,8 +4,8 @@ import (
 	// "fmt"
 	// "unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func isBertPunc(x rune) (retVal bool) {
diff --git a/pretokenizer/bert_test.go b/pretokenizer/bert_test.go
index 0dbe146..69691db 100644
--- a/pretokenizer/bert_test.go
+++ b/pretokenizer/bert_test.go
@@ -4,9 +4,9 @@ import (
 	"reflect"
 	"testing"
 
-	tokenizer "github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	tokenizer "github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestBertPreTokenize(t *testing.T) {
diff --git a/pretokenizer/bytelevel.go b/pretokenizer/bytelevel.go
index f3e3ff6..d9e7662 100644
--- a/pretokenizer/bytelevel.go
+++ b/pretokenizer/bytelevel.go
@@ -4,8 +4,8 @@ import (
 	"regexp"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // Regular epxression to split string to `word` token
diff --git a/pretokenizer/bytelevel_test.go b/pretokenizer/bytelevel_test.go
index fd25d2d..b3e10d0 100644
--- a/pretokenizer/bytelevel_test.go
+++ b/pretokenizer/bytelevel_test.go
@@ -5,9 +5,9 @@ import (
 	"strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type charidx struct {
diff --git a/pretokenizer/delimiter.go b/pretokenizer/delimiter.go
index 7e51fc0..8dbbe24 100644
--- a/pretokenizer/delimiter.go
+++ b/pretokenizer/delimiter.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type CharDelimiterSplit struct {
diff --git a/pretokenizer/digits.go b/pretokenizer/digits.go
index 95131a4..8c24db3 100644
--- a/pretokenizer/digits.go
+++ b/pretokenizer/digits.go
@@ -3,8 +3,8 @@ package pretokenizer
 import (
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Digits struct {
diff --git a/pretokenizer/digits_test.go b/pretokenizer/digits_test.go
index 173dac2..d590134 100644
--- a/pretokenizer/digits_test.go
+++ b/pretokenizer/digits_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestNumbers(t *testing.T) {
diff --git a/pretokenizer/metaspace.go b/pretokenizer/metaspace.go
index 580529d..50a769a 100644
--- a/pretokenizer/metaspace.go
+++ b/pretokenizer/metaspace.go
@@ -4,8 +4,8 @@ import (
 	// "log"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // PrependScheme defines how the meta character should be prepended
diff --git a/pretokenizer/metaspace_test.go b/pretokenizer/metaspace_test.go
index 94e5a0c..f2dbcca 100644
--- a/pretokenizer/metaspace_test.go
+++ b/pretokenizer/metaspace_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestMetaspace_Decode(t *testing.T) {
diff --git a/pretokenizer/punctuation.go b/pretokenizer/punctuation.go
index 66de8ad..6af60a0 100644
--- a/pretokenizer/punctuation.go
+++ b/pretokenizer/punctuation.go
@@ -3,8 +3,8 @@ package pretokenizer
 import (
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // bpunc is the BERT extension of the Punctuation character range
diff --git a/pretokenizer/punctuation_test.go b/pretokenizer/punctuation_test.go
index e18400c..eaf876c 100644
--- a/pretokenizer/punctuation_test.go
+++ b/pretokenizer/punctuation_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestPunctuation(t *testing.T) {
diff --git a/pretokenizer/sequence.go b/pretokenizer/sequence.go
index f860d9c..c604bd5 100644
--- a/pretokenizer/sequence.go
+++ b/pretokenizer/sequence.go
@@ -1,7 +1,7 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Sequence struct {
diff --git a/pretokenizer/split.go b/pretokenizer/split.go
index 8a2fc93..95b7f36 100644
--- a/pretokenizer/split.go
+++ b/pretokenizer/split.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Split struct {
diff --git a/pretokenizer/split_test.go b/pretokenizer/split_test.go
index dbb689e..5c77c7e 100644
--- a/pretokenizer/split_test.go
+++ b/pretokenizer/split_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestSplit(t *testing.T) {
diff --git a/pretokenizer/unicode-script.go b/pretokenizer/unicode-script.go
index f34436b..d4f31d9 100644
--- a/pretokenizer/unicode-script.go
+++ b/pretokenizer/unicode-script.go
@@ -4,8 +4,8 @@ import (
 	"log"
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // GetScript returns key to script in `unicode.Scripts`.
diff --git a/pretokenizer/unicode-script_test.go b/pretokenizer/unicode-script_test.go
index 81263ef..ed4feec 100644
--- a/pretokenizer/unicode-script_test.go
+++ b/pretokenizer/unicode-script_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestGetScript(t *testing.T) {
diff --git a/pretokenizer/whitespace.go b/pretokenizer/whitespace.go
index 00d071a..25392f8 100644
--- a/pretokenizer/whitespace.go
+++ b/pretokenizer/whitespace.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Whitespace struct{}
diff --git a/pretokenizer/whitespace_test.go b/pretokenizer/whitespace_test.go
index 470fc62..642d5c7 100644
--- a/pretokenizer/whitespace_test.go
+++ b/pretokenizer/whitespace_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestWhitespace(t *testing.T) {
diff --git a/pretrained/added-tokens.go b/pretrained/added-tokens.go
index 253c012..d96e025 100644
--- a/pretrained/added-tokens.go
+++ b/pretrained/added-tokens.go
@@ -1,7 +1,7 @@
 package pretrained
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func CreateAddedTokens(data []tokenizer.TokenConfig) (specialToks, toks []tokenizer.AddedToken) {
diff --git a/pretrained/bert.go b/pretrained/bert.go
index 9f3384e..83ca812 100644
--- a/pretrained/bert.go
+++ b/pretrained/bert.go
@@ -4,13 +4,13 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // BertBaseUncase loads pretrained BERT tokenizer.
diff --git a/pretrained/common_test.go b/pretrained/common_test.go
index 4ca1453..a315f32 100644
--- a/pretrained/common_test.go
+++ b/pretrained/common_test.go
@@ -4,7 +4,7 @@ package pretrained
 import (
 	"encoding/json"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 	"os"
 )
diff --git a/pretrained/decoder.go b/pretrained/decoder.go
index afff515..46e8afb 100644
--- a/pretrained/decoder.go
+++ b/pretrained/decoder.go
@@ -15,11 +15,11 @@
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreateDecoder(config map[string]interface{}) (tokenizer.Decoder, error) {
diff --git a/pretrained/decoder_test.go b/pretrained/decoder_test.go
index 74b9006..6c264f5 100644
--- a/pretrained/decoder_test.go
+++ b/pretrained/decoder_test.go
@@ -4,7 +4,7 @@ import (
 	"log"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 /*
diff --git a/pretrained/gpt2.go b/pretrained/gpt2.go
index 8a47e27..e57f6da 100644
--- a/pretrained/gpt2.go
+++ b/pretrained/gpt2.go
@@ -4,12 +4,12 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // GPT2 loads GPT2 (small) tokenizer from vocab and merges files.
diff --git a/pretrained/model.go b/pretrained/model.go
index 507790d..b4708ed 100644
--- a/pretrained/model.go
+++ b/pretrained/model.go
@@ -4,13 +4,13 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/model/unigram"
-	"github.com/sugarme/tokenizer/model/wordlevel"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/model/unigram"
+	"github.com/season-studio/tokenizer/model/wordlevel"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // This file provides functions to create tokenizer.Model from input data.
diff --git a/pretrained/model_test.go b/pretrained/model_test.go
index 7656aac..13c9fb6 100644
--- a/pretrained/model_test.go
+++ b/pretrained/model_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func TestCreateBPE(t *testing.T) {
diff --git a/pretrained/normalizer.go b/pretrained/normalizer.go
index fa6bee3..93e06b4 100644
--- a/pretrained/normalizer.go
+++ b/pretrained/normalizer.go
@@ -18,9 +18,9 @@
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/spm"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/spm"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // CreateNormalizer creates Normalizer from config data.
diff --git a/pretrained/padding.go b/pretrained/padding.go
index 389fce9..bc6ca71 100644
--- a/pretrained/padding.go
+++ b/pretrained/padding.go
@@ -3,8 +3,8 @@ package pretrained
 import (
 	"reflect"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePaddingParams(config map[string]interface{}) (*tokenizer.PaddingParams, error) {
diff --git a/pretrained/pretokenizer.go b/pretrained/pretokenizer.go
index a53e842..09e3791 100644
--- a/pretrained/pretokenizer.go
+++ b/pretrained/pretokenizer.go
@@ -17,10 +17,10 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePreTokenizer(config map[string]interface{}) (tokenizer.PreTokenizer, error) {
diff --git a/pretrained/processor.go b/pretrained/processor.go
index 06feeac..72e37ea 100644
--- a/pretrained/processor.go
+++ b/pretrained/processor.go
@@ -10,10 +10,10 @@
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePostProcessor(config map[string]interface{}) (tokenizer.PostProcessor, error) {
diff --git a/pretrained/processor_test.go b/pretrained/processor_test.go
index 393f807..0acf1fc 100644
--- a/pretrained/processor_test.go
+++ b/pretrained/processor_test.go
@@ -4,7 +4,7 @@ import (
 	"log"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // e.g. `hf-internal-testing/llama-tokenizer`
diff --git a/pretrained/roberta.go b/pretrained/roberta.go
index ea742cd..e9c4abb 100644
--- a/pretrained/roberta.go
+++ b/pretrained/roberta.go
@@ -4,12 +4,12 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // RobertaBase loads pretrained RoBERTa tokenizer.
diff --git a/pretrained/tokenizer.go b/pretrained/tokenizer.go
index e5769eb..3086b05 100644
--- a/pretrained/tokenizer.go
+++ b/pretrained/tokenizer.go
@@ -6,7 +6,7 @@ import (
 	"io"
 	"os"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // FromFile constructs a new Tokenizer from json data file (normally 'tokenizer.json')
diff --git a/pretrained/tokenizer_test.go b/pretrained/tokenizer_test.go
index 1373fdf..d33271f 100644
--- a/pretrained/tokenizer_test.go
+++ b/pretrained/tokenizer_test.go
@@ -5,7 +5,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestFromFile(t *testing.T) {
diff --git a/pretrained/truncation.go b/pretrained/truncation.go
index 7a8c63d..86870d8 100644
--- a/pretrained/truncation.go
+++ b/pretrained/truncation.go
@@ -1,8 +1,8 @@
 package pretrained
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreateTruncationParams(config map[string]interface{}) (*tokenizer.TruncationParams, error) {
diff --git a/processor/bert.go b/processor/bert.go
index 2055035..86efc02 100644
--- a/processor/bert.go
+++ b/processor/bert.go
@@ -1,7 +1,7 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type PostToken struct {
diff --git a/processor/bytelevel.go b/processor/bytelevel.go
index 168e040..43077e3 100644
--- a/processor/bytelevel.go
+++ b/processor/bytelevel.go
@@ -1,8 +1,8 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type ByteLevelProcessing struct {
diff --git a/processor/roberta.go b/processor/roberta.go
index 1c921b0..7ac4397 100644
--- a/processor/roberta.go
+++ b/processor/roberta.go
@@ -1,8 +1,8 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 // RobertaProcessing is a post post processor for Roberta model
@@ -86,6 +86,9 @@ func (rp *RobertaProcessing) Process(encoding, pairEncoding *tokenizer.Encoding,
 			newPairEncoding.Overflowing = newOverflowPairEncoding
 		}
+	} else {
+		newEncoding = encoding
+		newPairEncoding = pairEncoding
 	}
 
 	if !addSpecialTokens {
diff --git a/processor/sequence.go b/processor/sequence.go
index cad57cc..cd33527 100644
--- a/processor/sequence.go
+++ b/processor/sequence.go
@@ -1,6 +1,6 @@
 package processor
 
-import "github.com/sugarme/tokenizer"
+import "github.com/season-studio/tokenizer"
 
 type Sequence struct {
 	processors []tokenizer.PostProcessor
diff --git a/processor/sequence_test.go b/processor/sequence_test.go
index 4316908..5841838 100644
--- a/processor/sequence_test.go
+++ b/processor/sequence_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestSequence(t *testing.T) {
diff --git a/processor/template.go b/processor/template.go
index a94e1f7..f24245e 100644
--- a/processor/template.go
+++ b/processor/template.go
@@ -6,8 +6,8 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type SequenceEnum int
diff --git a/processor/template_test.go b/processor/template_test.go
index 70f1da4..14cebdd 100644
--- a/processor/template_test.go
+++ b/processor/template_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestPiece(t *testing.T) {
diff --git a/tokenizer.go b/tokenizer.go
index d4d4990..bf949b5 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -17,8 +17,8 @@ import (
 	progressbar "github.com/schollz/progressbar/v2"
 	// "golang.org/x/sync/errgroup"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 const (
diff --git a/util/slice/int_test.go b/util/slice/int_test.go
index e9e33f1..a3e1402 100644
--- a/util/slice/int_test.go
+++ b/util/slice/int_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	slice "github.com/sugarme/tokenizer/util/slice"
+	slice "github.com/season-studio/tokenizer/util/slice"
 )
 
 func TestInsertInt(t *testing.T) {
diff --git a/util/slice/string_test.go b/util/slice/string_test.go
index 2b658df..2bfa126 100644
--- a/util/slice/string_test.go
+++ b/util/slice/string_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	slice "github.com/sugarme/tokenizer/util/slice"
+	slice "github.com/season-studio/tokenizer/util/slice"
 )
 
 func TestInsertStr(t *testing.T) {
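
Note: besides the github.com/sugarme → github.com/season-studio module rename, this diff changes behavior in model/bpe/bpe.go. In MergeWord, currRuneIdx is now advanced before the branch, and the last-rune check runs ahead of the first-rune check, so the final rune (including the only rune of a single-rune word) receives the suffix marker rather than the prefix. A minimal standalone sketch of the corrected control flow follows; the helper name and the prefix/suffix values are illustrative, not the library's API:

package main

import "fmt"

// splitWord mirrors the reordered checks in BPE.MergeWord: advance the
// rune counter first, then test for the last rune before the first rune.
func splitWord(w, prefix, suffix string) []string {
	chars := []rune(w)
	var out []string
	currRuneIdx := 0
	for byteIdx, r := range w {
		var s string
		currRuneIdx++
		if currRuneIdx == len(chars) { // last rune: append suffix
			s = string(r) + suffix
		} else if byteIdx == 0 { // first rune: prepend prefix
			s = prefix + string(r)
		} else { // the rest
			s = string(r)
		}
		out = append(out, s)
	}
	return out
}

func main() {
	fmt.Println(splitWord("abc", "Ġ", "</w>")) // [Ġa b c</w>]
	// With the old first-rune-first ordering, a one-rune word matched the
	// prefix branch and never received the suffix: [Ġa] instead of [a</w>].
	fmt.Println(splitWord("a", "Ġ", "</w>")) // [a</w>]
}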
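Note: the other behavioral change is in processor/roberta.go. RobertaProcessing.Process builds trimmed copies of the encodings inside the trim-offsets branch; the added else branch falls back to the input encodings, where previously newEncoding and newPairEncoding could stay nil whenever trimming was disabled. A simplified sketch of the guard, using a stand-in type rather than the real tokenizer.Encoding:

package main

import "fmt"

// encoding is a stand-in for tokenizer.Encoding.
type encoding struct{ tokens []string }

// trimmed stands in for the offset-trimmed copy built by the real processor.
func trimmed(e *encoding) *encoding {
	if e == nil {
		return nil
	}
	cp := *e
	return &cp
}

// process sketches the fixed control flow: when trimming is off, pass the
// inputs through instead of leaving the result variables nil.
func process(enc, pairEnc *encoding, trimOffsets bool) (*encoding, *encoding) {
	var newEnc, newPairEnc *encoding
	if trimOffsets {
		newEnc, newPairEnc = trimmed(enc), trimmed(pairEnc)
	} else {
		newEnc, newPairEnc = enc, pairEnc // the added else branch
	}
	return newEnc, newPairEnc
}

func main() {
	e := &encoding{tokens: []string{"<s>", "hello", "</s>"}}
	got, _ := process(e, nil, false)
	fmt.Println(got.tokens) // with the fix: [<s> hello </s>]; before it, got was nil
}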