From f1e9495abbe769c3ba4466f827699ab8878b9dbe Mon Sep 17 00:00:00 2001
From: season-studio
Date: Tue, 2 Sep 2025 14:31:13 +0800
Subject: [PATCH 1/3] (1) Fix roberta post-processing discarding the original
 encoding result when trimOffsets is false; (2) Fix two bugs in bpe's
 MergeWord: a length-check error that kept the suffix from ever taking
 effect, and a branch-ordering error that kept single-letter words from
 merging the suffix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 model/bpe/bpe.go     | 8 ++++----
 processor/roberta.go | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/model/bpe/bpe.go b/model/bpe/bpe.go
index d408818..6d989d4 100644
--- a/model/bpe/bpe.go
+++ b/model/bpe/bpe.go
@@ -385,14 +385,14 @@ func (b *BPE) MergeWord(w string) *Word {
 		byteLen = len(string(r))
 
 		// if first rune, add prefix
-		if byteIdx == 0 {
-			s = fmt.Sprintf("%v%v", prefix, string(r))
-		} else if currRuneIdx == len(chars) { // last rune, add suffix
+		currRuneIdx++
+		if currRuneIdx == len(chars) { // last rune, add suffix
 			s = fmt.Sprintf("%v%v", string(r), suffix)
+		} else if byteIdx == 0 {
+			s = fmt.Sprintf("%v%v", prefix, string(r))
 		} else { // the rest
 			s = string(r)
 		}
-		currRuneIdx++
 
 		// If `s` exists in vocab, add its id, otherwise add id of `unk`
 		vocab := *b.Vocab
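
For readers outside the diff context, the two MergeWord bugs above are: (a) currRuneIdx was incremented after the last-rune comparison, so currRuneIdx == len(chars) could never hold and the suffix was never appended; and (b) the first-rune check ran before the last-rune check, so a single-rune word (which is both first and last) always took the prefix branch and again missed the suffix. A minimal standalone sketch of the corrected selection order follows; pieceFor is a hypothetical helper name, and "##"/"</w>" are stand-in prefix/suffix markers, not values from this repository:

package main

import "fmt"

// pieceFor mirrors the corrected branch order in MergeWord: the last-rune
// case is tested first, so a word that is simultaneously first and last
// (a single-rune word) still receives the suffix.
func pieceFor(chars []rune, idx int, prefix, suffix string) string {
	r := chars[idx]
	switch {
	case idx == len(chars)-1: // last rune wins, even when it is also the first
		return string(r) + suffix
	case idx == 0: // first rune of a multi-rune word
		return prefix + string(r)
	default: // interior rune
		return string(r)
	}
}

func main() {
	fmt.Println(pieceFor([]rune("a"), 0, "##", "</w>"))   // "a</w>" — suffix now applies
	fmt.Println(pieceFor([]rune("cat"), 0, "##", "</w>")) // "##c"
	fmt.Println(pieceFor([]rune("cat"), 2, "##", "</w>")) // "t</w>"
}

The patched loop expresses the same thing with a counter: it advances currRuneIdx before comparing against len(chars), which is what makes the last-rune test reachable at all.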
diff --git a/processor/roberta.go b/processor/roberta.go
index 1c921b0..18aa9a2 100644
--- a/processor/roberta.go
+++ b/processor/roberta.go
@@ -86,6 +86,9 @@ func (rp *RobertaProcessing) Process(encoding, pairEncoding *tokenizer.Encoding,
 			newPairEncoding.Overflowing = newOverflowPairEncoding
 		}
 
+	} else {
+		newEncoding = encoding
+		newPairEncoding = pairEncoding
 	}
 
 	if !addSpecialTokens {
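
The roberta.go hunk addresses the trimOffsets=false path: newEncoding and newPairEncoding were assigned only inside the trim branch, so with trimming disabled they stayed nil and the original encodings were silently dropped downstream. A simplified sketch of the corrected control flow, using a stand-in encoding type rather than the real *tokenizer.Encoding:

package main

import "fmt"

type encoding struct{ tokens []string }

// trimmed stands in for the real offset-trimming work.
func trimmed(e *encoding) *encoding { return &encoding{tokens: e.tokens} }

// process mirrors the corrected shape of RobertaProcessing.Process: when the
// trim branch is skipped, the outputs must fall back to the inputs.
func process(enc, pair *encoding, trimOffsets bool) (*encoding, *encoding) {
	var newEnc, newPair *encoding
	if trimOffsets {
		newEnc = trimmed(enc)
		if pair != nil {
			newPair = trimmed(pair)
		}
	} else {
		// the fix: pass the original encodings through unchanged
		newEnc = enc
		newPair = pair
	}
	return newEnc, newPair
}

func main() {
	enc := &encoding{tokens: []string{"Ġhello"}}
	got, _ := process(enc, nil, false)
	fmt.Println(got.tokens) // [Ġhello] — no longer nil
}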
From 3f14538b3edf474f9ff6655f9c26cff75e1bb809 Mon Sep 17 00:00:00 2001
From: season-studio <66667151+season-studio@users.noreply.github.com>
Date: Tue, 2 Sep 2025 15:11:42 +0800
Subject: [PATCH 2/3] Update go.mod

---
 go.mod | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/go.mod b/go.mod
index 772381f..e0e5093 100644
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/sugarme/tokenizer
+module github.com/season-studio/tokenizer
 
 go 1.23
 
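
After this module rename, downstream code imports the fork's path instead of github.com/sugarme/tokenizer. A minimal consumer sketch; pretrained.FromFile is the loader named later in this series ("constructs a new Tokenizer from json data file"), and its exact (path) (*tokenizer.Tokenizer, error) shape is assumed here:

package main

import (
	"fmt"
	"log"

	// the forked module path introduced by this patch
	"github.com/season-studio/tokenizer/pretrained"
)

func main() {
	// Load a tokenizer from a HuggingFace-style tokenizer.json file.
	tk, err := pretrained.FromFile("tokenizer.json")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("loaded tokenizer: %T\n", tk)
}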
From e9353cd8519f2bbf7788b6604b416fb766db20de Mon Sep 17 00:00:00 2001
From: season-studio
Date: Tue, 2 Sep 2025 15:44:53 +0800
Subject: [PATCH 3/3] Rename the module to make it easier to distinguish this
 fork from the original
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .travis.yml                         | 10 +++++-----
 added-vocabulary.go                 |  2 +-
 added-vocabulary_test.go            |  4 ++--
 bpe_test.go                         | 12 ++++++------
 config_test.go                      |  2 +-
 decoder/bpe.go                      |  2 +-
 decoder/byte-fallback.go            |  2 +-
 decoder/ctc.go                      |  2 +-
 decoder/decoder.go                  |  2 +-
 decoder/fuse.go                     |  2 +-
 decoder/sequence.go                 |  2 +-
 decoder/sequence_test.go            |  4 ++--
 decoder/strip.go                    |  2 +-
 decoder/wordpiece.go                |  2 +-
 encoding.go                         |  2 +-
 encoding_test.go                    |  2 +-
 example/basic/bert.go               | 14 +++++++-------
 example/basic/bpe.go                | 10 +++++-----
 example/basic/wordlevel.go          | 10 +++++-----
 example/bpe/test.go                 |  6 +++---
 example/bpe/train.go                |  6 +++---
 example/decode/main.go              |  2 +-
 example/pretrained/main.go          |  4 ++--
 example/truncation/main.go          |  4 ++--
 example/unigram/main.go             |  6 +++---
 example_test.go                     |  2 +-
 go.mod                              |  4 +++-
 model/bpe/bpe.go                    |  6 +++---
 model/bpe/bpe_test.go               |  6 +++---
 model/bpe/trainer.go                |  2 +-
 model/bpe/trainer_test.go           |  2 +-
 model/bpe/word_test.go              |  2 +-
 model/unigram/unigram.go            |  4 ++--
 model/unigram/unigram_test.go       |  2 +-
 model/wordlevel/wordlevel.go        |  2 +-
 model/wordpiece/trainer.go          |  4 ++--
 model/wordpiece/wordpiece.go        |  8 ++++----
 model/wordpiece/wordpiece_test.go   |  4 ++--
 normalizer/normalized.go            |  4 ++--
 normalizer/normalized_test.go       |  4 ++--
 normalizer/pattern.go               |  2 +-
 normalizer/pattern_test.go          |  2 +-
 normalizer/precompiled.go           |  2 +-
 pretokenizer.go                     |  2 +-
 pretokenizer/bert.go                |  4 ++--
 pretokenizer/bert_test.go           |  6 +++---
 pretokenizer/bytelevel.go           |  4 ++--
 pretokenizer/bytelevel_test.go      |  6 +++---
 pretokenizer/delimiter.go           |  4 ++--
 pretokenizer/digits.go              |  4 ++--
 pretokenizer/digits_test.go         |  4 ++--
 pretokenizer/metaspace.go           |  4 ++--
 pretokenizer/metaspace_test.go      |  4 ++--
 pretokenizer/punctuation.go         |  4 ++--
 pretokenizer/punctuation_test.go    |  4 ++--
 pretokenizer/sequence.go            |  2 +-
 pretokenizer/split.go               |  4 ++--
 pretokenizer/split_test.go          |  4 ++--
 pretokenizer/unicode-script.go      |  4 ++--
 pretokenizer/unicode-script_test.go |  4 ++--
 pretokenizer/whitespace.go          |  4 ++--
 pretokenizer/whitespace_test.go     |  4 ++--
 pretrained/added-tokens.go          |  2 +-
 pretrained/bert.go                  | 14 +++++++-------
 pretrained/common_test.go           |  2 +-
 pretrained/decoder.go               | 10 +++++-----
 pretrained/decoder_test.go          |  2 +-
 pretrained/gpt2.go                  | 12 ++++++------
 pretrained/model.go                 | 14 +++++++-------
 pretrained/model_test.go            |  2 +-
 pretrained/normalizer.go            |  6 +++---
 pretrained/padding.go               |  4 ++--
 pretrained/pretokenizer.go          |  8 ++++----
 pretrained/processor.go             |  8 ++++----
 pretrained/processor_test.go        |  2 +-
 pretrained/roberta.go               | 12 ++++++------
 pretrained/tokenizer.go             |  2 +-
 pretrained/tokenizer_test.go        |  2 +-
 pretrained/truncation.go            |  4 ++--
 processor/bert.go                   |  2 +-
 processor/bytelevel.go              |  4 ++--
 processor/roberta.go                |  4 ++--
 processor/sequence.go               |  2 +-
 processor/sequence_test.go          |  4 ++--
 processor/template.go               |  4 ++--
 processor/template_test.go          |  2 +-
 tokenizer.go                        |  4 ++--
 util/slice/int_test.go              |  2 +-
 util/slice/string_test.go           |  2 +-
 89 files changed, 196 insertions(+), 194 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a125e95..ddf927e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,8 +14,8 @@ dist: bionic
 
 script:
   - go get -u ./...
-  - go test -v github.com/sugarme/tokenizer/normalizer
-  - go test -v github.com/sugarme/tokenizer/model/bpe
-  - go test -v github.com/sugarme/tokenizer/model/wordpiece
-  - go test -v github.com/sugarme/tokenizer/pretokenizer
-  - go test -v github.com/sugarme/tokenizer
+  - go test -v github.com/season-studio/tokenizer/normalizer
+  - go test -v github.com/season-studio/tokenizer/model/bpe
+  - go test -v github.com/season-studio/tokenizer/model/wordpiece
+  - go test -v github.com/season-studio/tokenizer/pretokenizer
+  - go test -v github.com/season-studio/tokenizer
diff --git a/added-vocabulary.go b/added-vocabulary.go
index cafeb11..a714d8a 100644
--- a/added-vocabulary.go
+++ b/added-vocabulary.go
@@ -8,7 +8,7 @@ import (
 	"unicode"
 
 	"github.com/sugarme/regexpset"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // AddedToken represents a token added by the user on top of the
diff --git a/added-vocabulary_test.go b/added-vocabulary_test.go
index 442c239..635744c 100644
--- a/added-vocabulary_test.go
+++ b/added-vocabulary_test.go
@@ -5,8 +5,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type ModelMock struct {
diff --git a/bpe_test.go b/bpe_test.go
index 77a60a2..732ff23 100644
--- a/bpe_test.go
+++ b/bpe_test.go
@@ -6,13 +6,13 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
 
-	// "github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	// "github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func getByteLevelBPE() (retVal *tokenizer.Tokenizer) {
diff --git a/config_test.go b/config_test.go
index a582ed8..88b67c9 100644
--- a/config_test.go
+++ b/config_test.go
@@ -5,7 +5,7 @@ import (
 	"fmt"
 	"os"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func ExampleConfig() {
diff --git a/decoder/bpe.go b/decoder/bpe.go
index 44bfec7..e34080d 100644
--- a/decoder/bpe.go
+++ b/decoder/bpe.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Allows decoding Original BPE by joining all the tokens and then replacing
diff --git a/decoder/byte-fallback.go b/decoder/byte-fallback.go
index 763f85a..cff8508 100644
--- a/decoder/byte-fallback.go
+++ b/decoder/byte-fallback.go
@@ -5,7 +5,7 @@ import (
 	"strings"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type ByteFallback struct {
diff --git a/decoder/ctc.go b/decoder/ctc.go
index f1f0655..26d8f79 100644
--- a/decoder/ctc.go
+++ b/decoder/ctc.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type CTC struct {
diff --git a/decoder/decoder.go b/decoder/decoder.go
index 60a9e8a..de7c9fa 100644
--- a/decoder/decoder.go
+++ b/decoder/decoder.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type DecoderBase struct {
diff --git a/decoder/fuse.go b/decoder/fuse.go
index 8f6cc3e..3784ab6 100644
--- a/decoder/fuse.go
+++ b/decoder/fuse.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Fuse constructs Fuse decoder
diff --git a/decoder/sequence.go b/decoder/sequence.go
index 64c8929..7a1cab0 100644
--- a/decoder/sequence.go
+++ b/decoder/sequence.go
@@ -1,7 +1,7 @@
 package decoder
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Sequence struct {
diff --git a/decoder/sequence_test.go b/decoder/sequence_test.go
index aa60154..2fde580 100644
--- a/decoder/sequence_test.go
+++ b/decoder/sequence_test.go
@@ -5,8 +5,8 @@ import (
 	// "strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestSequence(t *testing.T) {
diff --git a/decoder/strip.go b/decoder/strip.go
index dd428e2..8113c4f 100644
--- a/decoder/strip.go
+++ b/decoder/strip.go
@@ -3,7 +3,7 @@ package decoder
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Strip struct {
diff --git a/decoder/wordpiece.go b/decoder/wordpiece.go
index 8c66b62..a6940b5 100644
--- a/decoder/wordpiece.go
+++ b/decoder/wordpiece.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // WordPieceDecoder takes care of decoding a list of wordpiece tokens
diff --git a/encoding.go b/encoding.go
index 65756ce..e4366ed 100644
--- a/encoding.go
+++ b/encoding.go
@@ -5,7 +5,7 @@ import (
 	"log"
 	"reflect"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type PaddingDirection int
diff --git a/encoding_test.go b/encoding_test.go
index bde15da..27b7bf3 100644
--- a/encoding_test.go
+++ b/encoding_test.go
@@ -5,7 +5,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestTokenizer_MergeWith(t *testing.T) {
diff --git a/example/basic/bert.go b/example/basic/bert.go
index 1b11369..575011f 100644
--- a/example/basic/bert.go
+++ b/example/basic/bert.go
@@ -4,13 +4,13 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func runBERT() {
diff --git a/example/basic/bpe.go b/example/basic/bpe.go
index 3beff62..d514550 100644
--- a/example/basic/bpe.go
+++ b/example/basic/bpe.go
@@ -4,11 +4,11 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func runBPE() {
diff --git a/example/basic/wordlevel.go b/example/basic/wordlevel.go
index 47d933f..a9d69c4 100644
--- a/example/basic/wordlevel.go
+++ b/example/basic/wordlevel.go
@@ -5,11 +5,11 @@ import (
 	"log"
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordlevel"
-	"github.com/sugarme/tokenizer/normalizer"
-	// "github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordlevel"
+	"github.com/season-studio/tokenizer/normalizer"
+	// "github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type customNormalizer struct{}
diff --git a/example/bpe/test.go b/example/bpe/test.go
index b9e1651..9514917 100644
--- a/example/bpe/test.go
+++ b/example/bpe/test.go
@@ -4,9 +4,9 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func runTest() {
diff --git a/example/bpe/train.go b/example/bpe/train.go
index eaf16ea..6826042 100644
--- a/example/bpe/train.go
+++ b/example/bpe/train.go
@@ -5,9 +5,9 @@ import (
 	"log"
 	"time"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func runTrain() {
diff --git a/example/decode/main.go b/example/decode/main.go
index af0ece0..22211e7 100644
--- a/example/decode/main.go
+++ b/example/decode/main.go
@@ -3,7 +3,7 @@ package main
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func main() {
diff --git a/example/pretrained/main.go b/example/pretrained/main.go
index 632542d..79725b5 100644
--- a/example/pretrained/main.go
+++ b/example/pretrained/main.go
@@ -5,8 +5,8 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 var (
diff --git a/example/truncation/main.go b/example/truncation/main.go
index af8bd08..cef3d13 100644
--- a/example/truncation/main.go
+++ b/example/truncation/main.go
@@ -4,8 +4,8 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func main() {
diff --git a/example/unigram/main.go b/example/unigram/main.go
index f5feb3c..4920d07 100644
--- a/example/unigram/main.go
+++ b/example/unigram/main.go
@@ -4,9 +4,9 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/unigram"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/unigram"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func main() {
diff --git a/example_test.go b/example_test.go
index 19caace..5ea81a5 100644
--- a/example_test.go
+++ b/example_test.go
@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer/pretrained"
+	"github.com/season-studio/tokenizer/pretrained"
 )
 
 func ExampleTokenizer_Encode() {
diff --git a/go.mod b/go.mod
index e0e5093..d9c5ac9 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,8 @@
 module github.com/season-studio/tokenizer
 
-go 1.23
+go 1.23.0
+
+toolchain go1.24.6
 
 require (
 	github.com/emirpasic/gods v1.18.1
diff --git a/model/bpe/bpe.go b/model/bpe/bpe.go
index 6d989d4..6b55435 100644
--- a/model/bpe/bpe.go
+++ b/model/bpe/bpe.go
@@ -15,9 +15,9 @@ import (
 	"log"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type Merges map[Pair]PairVal
diff --git a/model/bpe/bpe_test.go b/model/bpe/bpe_test.go
index 337988a..9d9b063 100644
--- a/model/bpe/bpe_test.go
+++ b/model/bpe/bpe_test.go
@@ -11,9 +11,9 @@ import (
 	// "strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	bpe "github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func TestBPE_FromFiles(t *testing.T) {
diff --git a/model/bpe/trainer.go b/model/bpe/trainer.go
index 106171f..6e17a24 100644
--- a/model/bpe/trainer.go
+++ b/model/bpe/trainer.go
@@ -15,7 +15,7 @@ import (
 	// 2.2 stars
 	// progressbar "github.com/cheggaaa/pb/v3"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // Map with no value
diff --git a/model/bpe/trainer_test.go b/model/bpe/trainer_test.go
index 045cee1..49e021e 100644
--- a/model/bpe/trainer_test.go
+++ b/model/bpe/trainer_test.go
@@ -5,7 +5,7 @@ import (
 	"sort"
 	"testing"
 
-	bpe "github.com/sugarme/tokenizer/model/bpe"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
 )
 
 func TestBpeTrainer_Train(t *testing.T) {
diff --git a/model/bpe/word_test.go b/model/bpe/word_test.go
index 21dad14..c8f6188 100644
--- a/model/bpe/word_test.go
+++ b/model/bpe/word_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	bpe "github.com/sugarme/tokenizer/model/bpe"
+	bpe "github.com/season-studio/tokenizer/model/bpe"
 )
 
 func TestMerge_Merge(t *testing.T) {
diff --git a/model/unigram/unigram.go b/model/unigram/unigram.go
index 0823a08..00ce85a 100644
--- a/model/unigram/unigram.go
+++ b/model/unigram/unigram.go
@@ -9,8 +9,8 @@ import (
 	"strings"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // TokenScore represents a token and its score in the Unigram model
diff --git a/model/unigram/unigram_test.go b/model/unigram/unigram_test.go
index e4dac9c..0981daa 100644
--- a/model/unigram/unigram_test.go
+++ b/model/unigram/unigram_test.go
@@ -4,7 +4,7 @@ import (
 	"testing"
 
 	"reflect"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // Test cases ported from Rust implementation:
diff --git a/model/wordlevel/wordlevel.go b/model/wordlevel/wordlevel.go
index 8fc7806..c820ae7 100644
--- a/model/wordlevel/wordlevel.go
+++ b/model/wordlevel/wordlevel.go
@@ -7,7 +7,7 @@ import (
 	"path/filepath"
 	"sort"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type config struct {
diff --git a/model/wordpiece/trainer.go b/model/wordpiece/trainer.go
index 8e0f4a2..7a04a76 100644
--- a/model/wordpiece/trainer.go
+++ b/model/wordpiece/trainer.go
@@ -1,8 +1,8 @@
 package wordpiece
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/bpe"
 )
 
 // WordPieceTrainerBuilder can be used to create a `WordPieceTrainer` with a custom
diff --git a/model/wordpiece/wordpiece.go b/model/wordpiece/wordpiece.go
index 409fc65..a5505ca 100644
--- a/model/wordpiece/wordpiece.go
+++ b/model/wordpiece/wordpiece.go
@@ -8,10 +8,10 @@ import (
 	"path/filepath"
 	"sort"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type config struct {
diff --git a/model/wordpiece/wordpiece_test.go b/model/wordpiece/wordpiece_test.go
index d891776..85a94f8 100644
--- a/model/wordpiece/wordpiece_test.go
+++ b/model/wordpiece/wordpiece_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model/wordpiece"
 )
 
 func TestWordpieceBuilder(t *testing.T) {
diff --git a/normalizer/normalized.go b/normalizer/normalized.go
index 00a31c7..4fa370c 100644
--- a/normalizer/normalized.go
+++ b/normalizer/normalized.go
@@ -8,8 +8,8 @@ import (
 	"unicode"
 	"unicode/utf8"
 
-	"github.com/sugarme/tokenizer/util"
-	slice "github.com/sugarme/tokenizer/util/slice"
+	"github.com/season-studio/tokenizer/util"
+	slice "github.com/season-studio/tokenizer/util/slice"
 
 	// "golang.org/x/text/transform"
 	"golang.org/x/text/unicode/norm"
diff --git a/normalizer/normalized_test.go b/normalizer/normalized_test.go
index f010fb2..013b24a 100644
--- a/normalizer/normalized_test.go
+++ b/normalizer/normalized_test.go
@@ -11,8 +11,8 @@ import (
 	// "golang.org/x/text/transform"
 	// "golang.org/x/text/unicode/norm"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	// "github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	// "github.com/season-studio/tokenizer/util"
 )
 
 func TestNormalized_NFDAddsNewChars(t *testing.T) {
diff --git a/normalizer/pattern.go b/normalizer/pattern.go
index 1a5ebe7..95f573f 100644
--- a/normalizer/pattern.go
+++ b/normalizer/pattern.go
@@ -5,7 +5,7 @@ import (
 	// "reflect"
 	"regexp"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // Pattern is used to split a NormalizedString
diff --git a/normalizer/pattern_test.go b/normalizer/pattern_test.go
index e98af59..2f3a9a4 100644
--- a/normalizer/pattern_test.go
+++ b/normalizer/pattern_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func doTest(t *testing.T, p normalizer.Pattern, inside string, want []normalizer.OffsetsMatch) {
diff --git a/normalizer/precompiled.go b/normalizer/precompiled.go
index 9afd0a6..042a9db 100644
--- a/normalizer/precompiled.go
+++ b/normalizer/precompiled.go
@@ -3,7 +3,7 @@ package normalizer
 import (
 	"strings"
 
-	"github.com/sugarme/tokenizer/spm"
+	"github.com/season-studio/tokenizer/spm"
 
 	"github.com/rivo/uniseg"
 )
diff --git a/pretokenizer.go b/pretokenizer.go
index 6a55545..365f6d7 100644
--- a/pretokenizer.go
+++ b/pretokenizer.go
@@ -7,7 +7,7 @@ import (
 	"log"
 	// "reflect"
 
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type PreToken struct {
diff --git a/pretokenizer/bert.go b/pretokenizer/bert.go
index 9c40aad..0f31356 100644
--- a/pretokenizer/bert.go
+++ b/pretokenizer/bert.go
@@ -4,8 +4,8 @@ import (
 	// "fmt"
 	// "unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func isBertPunc(x rune) (retVal bool) {
diff --git a/pretokenizer/bert_test.go b/pretokenizer/bert_test.go
index 0dbe146..69691db 100644
--- a/pretokenizer/bert_test.go
+++ b/pretokenizer/bert_test.go
@@ -4,9 +4,9 @@ import (
 	"reflect"
 	"testing"
 
-	tokenizer "github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	tokenizer "github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestBertPreTokenize(t *testing.T) {
diff --git a/pretokenizer/bytelevel.go b/pretokenizer/bytelevel.go
index f3e3ff6..d9e7662 100644
--- a/pretokenizer/bytelevel.go
+++ b/pretokenizer/bytelevel.go
@@ -4,8 +4,8 @@ import (
 	"regexp"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // Regular epxression to split string to `word` token
diff --git a/pretokenizer/bytelevel_test.go b/pretokenizer/bytelevel_test.go
index fd25d2d..b3e10d0 100644
--- a/pretokenizer/bytelevel_test.go
+++ b/pretokenizer/bytelevel_test.go
@@ -5,9 +5,9 @@ import (
 	"strings"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type charidx struct {
diff --git a/pretokenizer/delimiter.go b/pretokenizer/delimiter.go
index 7e51fc0..8dbbe24 100644
--- a/pretokenizer/delimiter.go
+++ b/pretokenizer/delimiter.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type CharDelimiterSplit struct {
diff --git a/pretokenizer/digits.go b/pretokenizer/digits.go
index 95131a4..8c24db3 100644
--- a/pretokenizer/digits.go
+++ b/pretokenizer/digits.go
@@ -3,8 +3,8 @@ package pretokenizer
 import (
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Digits struct {
diff --git a/pretokenizer/digits_test.go b/pretokenizer/digits_test.go
index 173dac2..d590134 100644
--- a/pretokenizer/digits_test.go
+++ b/pretokenizer/digits_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestNumbers(t *testing.T) {
diff --git a/pretokenizer/metaspace.go b/pretokenizer/metaspace.go
index 580529d..50a769a 100644
--- a/pretokenizer/metaspace.go
+++ b/pretokenizer/metaspace.go
@@ -4,8 +4,8 @@ import (
 	// "log"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // PrependScheme defines how the meta character should be prepended
diff --git a/pretokenizer/metaspace_test.go b/pretokenizer/metaspace_test.go
index 94e5a0c..f2dbcca 100644
--- a/pretokenizer/metaspace_test.go
+++ b/pretokenizer/metaspace_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestMetaspace_Decode(t *testing.T) {
diff --git a/pretokenizer/punctuation.go b/pretokenizer/punctuation.go
index 66de8ad..6af60a0 100644
--- a/pretokenizer/punctuation.go
+++ b/pretokenizer/punctuation.go
@@ -3,8 +3,8 @@ package pretokenizer
 import (
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // bpunc is the BERT extension of the Punctuation character range
diff --git a/pretokenizer/punctuation_test.go b/pretokenizer/punctuation_test.go
index e18400c..eaf876c 100644
--- a/pretokenizer/punctuation_test.go
+++ b/pretokenizer/punctuation_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestPunctuation(t *testing.T) {
diff --git a/pretokenizer/sequence.go b/pretokenizer/sequence.go
index f860d9c..c604bd5 100644
--- a/pretokenizer/sequence.go
+++ b/pretokenizer/sequence.go
@@ -1,7 +1,7 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type Sequence struct {
diff --git a/pretokenizer/split.go b/pretokenizer/split.go
index 8a2fc93..95b7f36 100644
--- a/pretokenizer/split.go
+++ b/pretokenizer/split.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Split struct {
diff --git a/pretokenizer/split_test.go b/pretokenizer/split_test.go
index dbb689e..5c77c7e 100644
--- a/pretokenizer/split_test.go
+++ b/pretokenizer/split_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestSplit(t *testing.T) {
diff --git a/pretokenizer/unicode-script.go b/pretokenizer/unicode-script.go
index f34436b..d4f31d9 100644
--- a/pretokenizer/unicode-script.go
+++ b/pretokenizer/unicode-script.go
@@ -4,8 +4,8 @@ import (
 	"log"
 	"unicode"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 // GetScript returns key to script in `unicode.Scripts`.
diff --git a/pretokenizer/unicode-script_test.go b/pretokenizer/unicode-script_test.go
index 81263ef..ed4feec 100644
--- a/pretokenizer/unicode-script_test.go
+++ b/pretokenizer/unicode-script_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestGetScript(t *testing.T) {
diff --git a/pretokenizer/whitespace.go b/pretokenizer/whitespace.go
index 00d071a..25392f8 100644
--- a/pretokenizer/whitespace.go
+++ b/pretokenizer/whitespace.go
@@ -1,8 +1,8 @@
 package pretokenizer
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 type Whitespace struct{}
diff --git a/pretokenizer/whitespace_test.go b/pretokenizer/whitespace_test.go
index 470fc62..642d5c7 100644
--- a/pretokenizer/whitespace_test.go
+++ b/pretokenizer/whitespace_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
 )
 
 func TestWhitespace(t *testing.T) {
diff --git a/pretrained/added-tokens.go b/pretrained/added-tokens.go
index 253c012..d96e025 100644
--- a/pretrained/added-tokens.go
+++ b/pretrained/added-tokens.go
@@ -1,7 +1,7 @@
 package pretrained
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func CreateAddedTokens(data []tokenizer.TokenConfig) (specialToks, toks []tokenizer.AddedToken) {
diff --git a/pretrained/bert.go b/pretrained/bert.go
index 9f3384e..83ca812 100644
--- a/pretrained/bert.go
+++ b/pretrained/bert.go
@@ -4,13 +4,13 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // BertBaseUncase loads pretrained BERT tokenizer.
diff --git a/pretrained/common_test.go b/pretrained/common_test.go
index 4ca1453..a315f32 100644
--- a/pretrained/common_test.go
+++ b/pretrained/common_test.go
@@ -4,7 +4,7 @@ package pretrained
 import (
 	"encoding/json"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 	"os"
 )
 
diff --git a/pretrained/decoder.go b/pretrained/decoder.go
index afff515..46e8afb 100644
--- a/pretrained/decoder.go
+++ b/pretrained/decoder.go
@@ -15,11 +15,11 @@ package pretrained
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreateDecoder(config map[string]interface{}) (tokenizer.Decoder, error) {
diff --git a/pretrained/decoder_test.go b/pretrained/decoder_test.go
index 74b9006..6c264f5 100644
--- a/pretrained/decoder_test.go
+++ b/pretrained/decoder_test.go
@@ -4,7 +4,7 @@ import (
 	"log"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 /*
diff --git a/pretrained/gpt2.go b/pretrained/gpt2.go
index 8a47e27..e57f6da 100644
--- a/pretrained/gpt2.go
+++ b/pretrained/gpt2.go
@@ -4,12 +4,12 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // GPT2 loads GPT2 (small) tokenizer from vocab and merges files.
diff --git a/pretrained/model.go b/pretrained/model.go
index 507790d..b4708ed 100644
--- a/pretrained/model.go
+++ b/pretrained/model.go
@@ -4,13 +4,13 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/model"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/model/unigram"
-	"github.com/sugarme/tokenizer/model/wordlevel"
-	"github.com/sugarme/tokenizer/model/wordpiece"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/model"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/model/unigram"
+	"github.com/season-studio/tokenizer/model/wordlevel"
+	"github.com/season-studio/tokenizer/model/wordpiece"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // This file provides functions to create tokenizer.Model from input data.
diff --git a/pretrained/model_test.go b/pretrained/model_test.go
index 7656aac..13c9fb6 100644
--- a/pretrained/model_test.go
+++ b/pretrained/model_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func TestCreateBPE(t *testing.T) {
diff --git a/pretrained/normalizer.go b/pretrained/normalizer.go
index fa6bee3..93e06b4 100644
--- a/pretrained/normalizer.go
+++ b/pretrained/normalizer.go
@@ -18,9 +18,9 @@ package pretrained
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/spm"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/spm"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // CreateNormalizer creates Normalizer from config data.
diff --git a/pretrained/padding.go b/pretrained/padding.go
index 389fce9..bc6ca71 100644
--- a/pretrained/padding.go
+++ b/pretrained/padding.go
@@ -3,8 +3,8 @@ package pretrained
 import (
 	"reflect"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePaddingParams(config map[string]interface{}) (*tokenizer.PaddingParams, error) {
diff --git a/pretrained/pretokenizer.go b/pretrained/pretokenizer.go
index a53e842..09e3791 100644
--- a/pretrained/pretokenizer.go
+++ b/pretrained/pretokenizer.go
@@ -17,10 +17,10 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePreTokenizer(config map[string]interface{}) (tokenizer.PreTokenizer, error) {
diff --git a/pretrained/processor.go b/pretrained/processor.go
index 06feeac..72e37ea 100644
--- a/pretrained/processor.go
+++ b/pretrained/processor.go
@@ -10,10 +10,10 @@ package pretrained
 import (
 	"fmt"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreatePostProcessor(config map[string]interface{}) (tokenizer.PostProcessor, error) {
diff --git a/pretrained/processor_test.go b/pretrained/processor_test.go
index 393f807..0acf1fc 100644
--- a/pretrained/processor_test.go
+++ b/pretrained/processor_test.go
@@ -4,7 +4,7 @@ import (
 	"log"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // e.g. `hf-internal-testing/llama-tokenizer`
diff --git a/pretrained/roberta.go b/pretrained/roberta.go
index ea742cd..e9c4abb 100644
--- a/pretrained/roberta.go
+++ b/pretrained/roberta.go
@@ -4,12 +4,12 @@ import (
 	"log"
 	"os"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/decoder"
-	"github.com/sugarme/tokenizer/model/bpe"
-	"github.com/sugarme/tokenizer/pretokenizer"
-	"github.com/sugarme/tokenizer/processor"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/decoder"
+	"github.com/season-studio/tokenizer/model/bpe"
+	"github.com/season-studio/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer/processor"
+	"github.com/season-studio/tokenizer/util"
 )
 
 // RobertaBase loads pretrained RoBERTa tokenizer.
diff --git a/pretrained/tokenizer.go b/pretrained/tokenizer.go
index e5769eb..3086b05 100644
--- a/pretrained/tokenizer.go
+++ b/pretrained/tokenizer.go
@@ -6,7 +6,7 @@ import (
 	"io"
 	"os"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 // FromFile constructs a new Tokenizer from json data file (normally 'tokenizer.json')
diff --git a/pretrained/tokenizer_test.go b/pretrained/tokenizer_test.go
index 1373fdf..d33271f 100644
--- a/pretrained/tokenizer_test.go
+++ b/pretrained/tokenizer_test.go
@@ -5,7 +5,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestFromFile(t *testing.T) {
diff --git a/pretrained/truncation.go b/pretrained/truncation.go
index 7a8c63d..86870d8 100644
--- a/pretrained/truncation.go
+++ b/pretrained/truncation.go
@@ -1,8 +1,8 @@
 package pretrained
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 func CreateTruncationParams(config map[string]interface{}) (*tokenizer.TruncationParams, error) {
diff --git a/processor/bert.go b/processor/bert.go
index 2055035..86efc02 100644
--- a/processor/bert.go
+++ b/processor/bert.go
@@ -1,7 +1,7 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 type PostToken struct {
diff --git a/processor/bytelevel.go b/processor/bytelevel.go
index 168e040..43077e3 100644
--- a/processor/bytelevel.go
+++ b/processor/bytelevel.go
@@ -1,8 +1,8 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 type ByteLevelProcessing struct {
diff --git a/processor/roberta.go b/processor/roberta.go
index 18aa9a2..7ac4397 100644
--- a/processor/roberta.go
+++ b/processor/roberta.go
@@ -1,8 +1,8 @@
 package processor
 
 import (
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 // RobertaProcessing is a post post processor for Roberta model
diff --git a/processor/sequence.go b/processor/sequence.go
index cad57cc..cd33527 100644
--- a/processor/sequence.go
+++ b/processor/sequence.go
@@ -1,6 +1,6 @@
 package processor
 
-import "github.com/sugarme/tokenizer"
+import "github.com/season-studio/tokenizer"
 
 type Sequence struct {
 	processors []tokenizer.PostProcessor
diff --git a/processor/sequence_test.go b/processor/sequence_test.go
index 4316908..5841838 100644
--- a/processor/sequence_test.go
+++ b/processor/sequence_test.go
@@ -4,8 +4,8 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/pretokenizer"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/pretokenizer"
 )
 
 func TestSequence(t *testing.T) {
diff --git a/processor/template.go b/processor/template.go
index a94e1f7..f24245e 100644
--- a/processor/template.go
+++ b/processor/template.go
@@ -6,8 +6,8 @@ import (
 	"strconv"
 	"strings"
 
-	"github.com/sugarme/tokenizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 type SequenceEnum int
diff --git a/processor/template_test.go b/processor/template_test.go
index 70f1da4..14cebdd 100644
--- a/processor/template_test.go
+++ b/processor/template_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	"github.com/sugarme/tokenizer"
+	"github.com/season-studio/tokenizer"
 )
 
 func TestPiece(t *testing.T) {
diff --git a/tokenizer.go b/tokenizer.go
index d4d4990..bf949b5 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -17,8 +17,8 @@ import (
 	progressbar "github.com/schollz/progressbar/v2"
 	// "golang.org/x/sync/errgroup"
 
-	"github.com/sugarme/tokenizer/normalizer"
-	"github.com/sugarme/tokenizer/util"
+	"github.com/season-studio/tokenizer/normalizer"
+	"github.com/season-studio/tokenizer/util"
 )
 
 const (
diff --git a/util/slice/int_test.go b/util/slice/int_test.go
index e9e33f1..a3e1402 100644
--- a/util/slice/int_test.go
+++ b/util/slice/int_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	slice "github.com/sugarme/tokenizer/util/slice"
+	slice "github.com/season-studio/tokenizer/util/slice"
 )
 
 func TestInsertInt(t *testing.T) {
diff --git a/util/slice/string_test.go b/util/slice/string_test.go
index 2b658df..2bfa126 100644
--- a/util/slice/string_test.go
+++ b/util/slice/string_test.go
@@ -4,7 +4,7 @@ import (
 	"reflect"
 	"testing"
 
-	slice "github.com/sugarme/tokenizer/util/slice"
+	slice "github.com/season-studio/tokenizer/util/slice"
 )
 
 func TestInsertStr(t *testing.T) {