Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ dist: bionic

script:
- go get -u ./...
- go test -v github.com/sugarme/tokenizer/normalizer
- go test -v github.com/sugarme/tokenizer/model/bpe
- go test -v github.com/sugarme/tokenizer/model/wordpiece
- go test -v github.com/sugarme/tokenizer/pretokenizer
- go test -v github.com/sugarme/tokenizer
- go test -v github.com/season-studio/tokenizer/normalizer
- go test -v github.com/season-studio/tokenizer/model/bpe
- go test -v github.com/season-studio/tokenizer/model/wordpiece
- go test -v github.com/season-studio/tokenizer/pretokenizer
- go test -v github.com/season-studio/tokenizer
2 changes: 1 addition & 1 deletion added-vocabulary.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"unicode"

"github.com/sugarme/regexpset"
"github.com/sugarme/tokenizer/normalizer"
"github.com/season-studio/tokenizer/normalizer"
)

// AddedToken represents a token added by the user on top of the
Expand Down
4 changes: 2 additions & 2 deletions added-vocabulary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import (
"reflect"
"testing"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/normalizer"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/normalizer"
)

type ModelMock struct {
Expand Down
12 changes: 6 additions & 6 deletions bpe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import (
"reflect"
"testing"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/bpe"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model/bpe"

// "github.com/sugarme/tokenizer/normalizer"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/sugarme/tokenizer/processor"
"github.com/sugarme/tokenizer/util"
// "github.com/season-studio/tokenizer/normalizer"
"github.com/season-studio/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer/processor"
"github.com/season-studio/tokenizer/util"
)

func getByteLevelBPE() (retVal *tokenizer.Tokenizer) {
Expand Down
2 changes: 1 addition & 1 deletion config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"fmt"
"os"

"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer/util"
)

func ExampleConfig() {
Expand Down
2 changes: 1 addition & 1 deletion decoder/bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package decoder
import (
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

// Allows decoding Original BPE by joining all the tokens and then replacing
Expand Down
2 changes: 1 addition & 1 deletion decoder/byte-fallback.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"strings"
"unicode/utf8"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

type ByteFallback struct {
Expand Down
2 changes: 1 addition & 1 deletion decoder/ctc.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package decoder
import (
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

type CTC struct {
Expand Down
2 changes: 1 addition & 1 deletion decoder/decoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package decoder
import (
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

type DecoderBase struct {
Expand Down
2 changes: 1 addition & 1 deletion decoder/fuse.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package decoder
import (
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

// Fuse constructs Fuse decoder
Expand Down
2 changes: 1 addition & 1 deletion decoder/sequence.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package decoder

import (
"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

type Sequence struct {
Expand Down
4 changes: 2 additions & 2 deletions decoder/sequence_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import (
// "strings"
"testing"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/pretokenizer"
)

func TestSequence(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion decoder/strip.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package decoder
import (
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

type Strip struct {
Expand Down
2 changes: 1 addition & 1 deletion decoder/wordpiece.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"fmt"
"strings"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

// WordPieceDecoder takes care of decoding a list of wordpiece tokens
Expand Down
2 changes: 1 addition & 1 deletion encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"log"
"reflect"

"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer/util"
)

type PaddingDirection int
Expand Down
2 changes: 1 addition & 1 deletion encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"reflect"
"testing"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

func TestTokenizer_MergeWith(t *testing.T) {
Expand Down
14 changes: 7 additions & 7 deletions example/basic/bert.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/decoder"
"github.com/sugarme/tokenizer/model/wordpiece"
"github.com/sugarme/tokenizer/normalizer"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/sugarme/tokenizer/processor"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/decoder"
"github.com/season-studio/tokenizer/model/wordpiece"
"github.com/season-studio/tokenizer/normalizer"
"github.com/season-studio/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer/processor"
"github.com/season-studio/tokenizer/util"
)

func runBERT() {
Expand Down
10 changes: 5 additions & 5 deletions example/basic/bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/bpe"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/sugarme/tokenizer/processor"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model/bpe"
"github.com/season-studio/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer/processor"
"github.com/season-studio/tokenizer/util"
)

func runBPE() {
Expand Down
10 changes: 5 additions & 5 deletions example/basic/wordlevel.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ import (
"log"
"unicode"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/decoder"
"github.com/sugarme/tokenizer/model/wordlevel"
"github.com/sugarme/tokenizer/normalizer"
// "github.com/sugarme/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/decoder"
"github.com/season-studio/tokenizer/model/wordlevel"
"github.com/season-studio/tokenizer/normalizer"
// "github.com/season-studio/tokenizer/pretokenizer"
)

type customNormalizer struct{}
Expand Down
6 changes: 3 additions & 3 deletions example/bpe/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/bpe"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model/bpe"
"github.com/season-studio/tokenizer/pretokenizer"
)

func runTest() {
Expand Down
6 changes: 3 additions & 3 deletions example/bpe/train.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"log"
"time"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/bpe"
"github.com/sugarme/tokenizer/pretokenizer"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model/bpe"
"github.com/season-studio/tokenizer/pretokenizer"
)

func runTrain() {
Expand Down
2 changes: 1 addition & 1 deletion example/decode/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package main
import (
"fmt"

"github.com/sugarme/tokenizer/pretrained"
"github.com/season-studio/tokenizer/pretrained"
)

func main() {
Expand Down
4 changes: 2 additions & 2 deletions example/pretrained/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/pretrained"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/pretrained"
)

var (
Expand Down
4 changes: 2 additions & 2 deletions example/truncation/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/pretrained"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/pretrained"
)

func main() {
Expand Down
6 changes: 3 additions & 3 deletions example/unigram/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model/unigram"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model/unigram"
"github.com/season-studio/tokenizer/util"
)

func main() {
Expand Down
2 changes: 1 addition & 1 deletion example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"fmt"
"log"

"github.com/sugarme/tokenizer/pretrained"
"github.com/season-studio/tokenizer/pretrained"
)

func ExampleTokenizer_Encode() {
Expand Down
6 changes: 4 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
module github.com/sugarme/tokenizer
module github.com/season-studio/tokenizer

go 1.23
go 1.23.0

toolchain go1.24.6

require (
github.com/emirpasic/gods v1.18.1
Expand Down
14 changes: 7 additions & 7 deletions model/bpe/bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ import (
"log"
"strings"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/model"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/model"
"github.com/season-studio/tokenizer/util"
)

type Merges map[Pair]PairVal
Expand Down Expand Up @@ -385,14 +385,14 @@ func (b *BPE) MergeWord(w string) *Word {
byteLen = len(string(r))

// if first rune, add prefix
if byteIdx == 0 {
s = fmt.Sprintf("%v%v", prefix, string(r))
} else if currRuneIdx == len(chars) { // last rune, add suffix
currRuneIdx++
if currRuneIdx == len(chars) { // last rune, add suffix
s = fmt.Sprintf("%v%v", string(r), suffix)
} else if byteIdx == 0 {
s = fmt.Sprintf("%v%v", prefix, string(r))
} else { // the rest
s = string(r)
}
currRuneIdx++

// If `s` exists in vocab, add its id, otherwise add id of `unk`
vocab := *b.Vocab
Expand Down
6 changes: 3 additions & 3 deletions model/bpe/bpe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ import (
// "strings"
"testing"

"github.com/sugarme/tokenizer"
bpe "github.com/sugarme/tokenizer/model/bpe"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
bpe "github.com/season-studio/tokenizer/model/bpe"
"github.com/season-studio/tokenizer/util"
)

func TestBPE_FromFiles(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion model/bpe/trainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
// 2.2 stars
// progressbar "github.com/cheggaaa/pb/v3"

"github.com/sugarme/tokenizer"
"github.com/season-studio/tokenizer"
)

// Map with no value
Expand Down
2 changes: 1 addition & 1 deletion model/bpe/trainer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"sort"
"testing"

bpe "github.com/sugarme/tokenizer/model/bpe"
bpe "github.com/season-studio/tokenizer/model/bpe"
)

func TestBpeTrainer_Train(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion model/bpe/word_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"reflect"
"testing"

bpe "github.com/sugarme/tokenizer/model/bpe"
bpe "github.com/season-studio/tokenizer/model/bpe"
)

func TestMerge_Merge(t *testing.T) {
Expand Down
4 changes: 2 additions & 2 deletions model/unigram/unigram.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (
"strings"
"unicode/utf8"

"github.com/sugarme/tokenizer"
"github.com/sugarme/tokenizer/util"
"github.com/season-studio/tokenizer"
"github.com/season-studio/tokenizer/util"
)

// TokenScore represents a token and its score in the Unigram model
Expand Down
Loading