
Commit 32cad3b

Merge pull request #11 from matlab-deep-learning/tokenizer_optimizations
Tokenizer optimizations
2 parents: 1978a49 + 3e0d206

5 files changed: 22 additions & 19 deletions

bert/+tokenizer/+internal/BasicTokenizer.m

Lines changed: 3 additions & 4 deletions

@@ -34,12 +34,11 @@
      u = this.cleanText(u);
      u = this.tokenizeCJK(u);
      text = u.string();
-     origTokens = this.whiteSpaceTokenize(text);
      if this.IgnoreCase
-         origTokens = lower(origTokens);
-         origTokens = textanalytics.unicode.nfd(origTokens);
+         text = lower(text);
+         text = textanalytics.unicode.nfd(text);
      end
-     u = textanalytics.unicode.UTF32(origTokens);
+     u = textanalytics.unicode.UTF32(text);
      cats = u.characterCategories('Granularity','detailed');
      if this.IgnoreCase
          [u,cats] = this.stripAccents(u,cats);
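This diff normalizes the whole text once (lowercase, then NFD) before any whitespace splitting, instead of splitting first and normalizing token by token. A minimal sketch of why the two are equivalent; the example string is made up, and it assumes the internal textanalytics.unicode.nfd helper accepts scalar strings and string arrays alike, as the two call sites in the diff suggest:

    % Whole-text pass (new behavior): one lower + one nfd call.
    text = "Café NAÏVE façade";
    whole = textanalytics.unicode.nfd(lower(text));

    % Per-token pass (old behavior), rejoined for comparison.
    toks = split(strip(text)).';
    perToken = join(textanalytics.unicode.nfd(lower(toks))," ");

    % Lowercasing and NFD neither create nor remove whitespace, so
    % both paths agree while the new one avoids per-token overhead.
    assert(isequal(whole,perToken))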

bert/+tokenizer/+internal/FullTokenizer.m

Lines changed: 2 additions & 1 deletion

@@ -85,9 +85,10 @@
      % tokens = tokenize(tokenizer,text) tokenizes the input
      % string text using the FullTokenizer specified by tokenizer.
      basicToks = this.Basic.tokenize(txt);
+     basicToksUnicode = textanalytics.unicode.UTF32(basicToks);
      subToks = cell(numel(basicToks),1);
      for i = 1:numel(basicToks)
-         subToks{i} = this.WordPiece.tokenize(basicToks{i});
+         subToks{i} = this.WordPiece.tokenize(basicToksUnicode(i));
      end
      toks = cat(2,subToks{:});
  end
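Here the basic tokens are converted to UTF32 in one vectorized call, and the loop hands pre-converted elements to WordPiece rather than letting each call re-convert its own token. A rough, hypothetical micro-benchmark of the pattern; the token array is made up, and it assumes only what the diff itself relies on, namely that the UTF32 constructor accepts string arrays and that the result supports indexing:

    toks = repmat("unaffable",1,10000);

    % New pattern: one vectorized conversion, indexed in the loop.
    tic
    u = textanalytics.unicode.UTF32(toks);
    for i = 1:numel(toks)
        t = u(i);
    end
    toc

    % Old pattern: one scalar conversion per token.
    tic
    for i = 1:numel(toks)
        t = textanalytics.unicode.UTF32(toks(i));
    end
    toc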

bert/+tokenizer/+internal/WhitespaceTokenizer.m

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
      % by splitting str on whitespace.
      arguments
          ~
-         text (1,1) string
+         text
      end
      text = strip(text);
      text = split(text).';
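Relaxing the arguments block from (1,1) string to an unvalidated input skips size and type checking on every call, which matters since this sits in the tokenizer's hot path, and it accepts anything that strip and split can handle. Usage is unchanged; a small sketch:

    wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
    toks = wsTok.tokenize(" foo bar baz ");
    % toks is the row vector ["foo" "bar" "baz"]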

bert/+tokenizer/+internal/WordPieceTokenizer.m

Lines changed: 9 additions & 10 deletions

@@ -37,16 +37,15 @@
          this.Vocab = this.parseVocab(vocab);
      end

-     function tokens = tokenize(this,text)
+     function tokens = tokenize(this,utext)
          arguments
              this
-             text (1,1) string
+             utext
          end
          tokens = string.empty();
-         wsTokens = this.WhitespaceTokenizer.tokenize(text);
-         wsTokensU = textanalytics.unicode.UTF32(wsTokens);
-         for i = 1:numel(wsTokensU)
-             token = wsTokensU(i);
+         sub = textanalytics.unicode.UTF32();
+         for i = 1:numel(utext)
+             token = utext(i);
              if numel(token.Data)>this.MaxChar
                  tokens = [tokens,this.Unk]; %#ok
                  continue
@@ -57,14 +56,14 @@
          while start<(numel(token.Data)+1)
              finish = numel(token.Data);
              currentSub = [];
-             while start<finish+1
-                 sub = textanalytics.unicode.UTF32();
+             while start<finish+1
                  sub.Data = token.Data(start:finish);
                  if start>1
                      sub.Data = [uint32('##'),sub.Data];
                  end
-                 if this.Vocab.isVocabularyWord(sub.string())
-                     currentSub = sub.string();
+                 strForm = sub.string();
+                 if this.Vocab.isVocabularyWord(strForm)
+                     currentSub = strForm;
                      break
                  end
                  finish = finish-1;
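Three optimizations land here: tokenize now takes a pre-split textanalytics.unicode.UTF32 array (utext) instead of a raw string, so whitespace splitting and UTF32 conversion move to the caller and happen once; a single sub buffer is allocated up front and reused across the greedy longest-match loop instead of being reallocated per iteration; and sub.string() is computed once per candidate instead of twice on a match. The new calling convention, mirroring the updated tests below:

    enc = wordEncoding(["foo","bar","##foo"]);
    tok = bert.tokenizer.internal.WordPieceTokenizer(enc);
    wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
    ustr = textanalytics.unicode.UTF32(wsTok.tokenize("foo bar barfoo"));
    toks = tok.tokenize(ustr);
    % toks == ["foo" "bar" "bar" "##foo"]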

test/bert/tokenizer/internal/tWordPieceTokenizer.m

Lines changed: 7 additions & 3 deletions

@@ -39,7 +39,8 @@ function canSetUnknownToken(test)
      tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'UnknownToken',unk);
      test.verifyEqual(tok.Unk,unk)
      str = "blah";
-     act_out = tok.tokenize(str);
+     ustr = textanalytics.unicode.UTF32(str);
+     act_out = tok.tokenize(ustr);
      exp_out = unk;
      test.verifyEqual(act_out,exp_out);
  end
@@ -50,7 +51,8 @@ function canSetMaxTokenLength(test)
      tok = bert.tokenizer.internal.WordPieceTokenizer(enc,'MaxTokenLength',maxLen);
      test.verifyEqual(tok.MaxChar,maxLen);
      str = "foo";
-     act_out = tok.tokenize(str);
+     ustr = textanalytics.unicode.UTF32(str);
+     act_out = tok.tokenize(ustr);
      exp_out = tok.Unk;
      test.verifyEqual(act_out,exp_out);
  end
@@ -59,7 +61,9 @@ function canTokenize(test)
      enc = wordEncoding(["foo","bar","##foo"]);
      tok = bert.tokenizer.internal.WordPieceTokenizer(enc);
      str = "foo bar foobar barba bafoobar barfoo";
-     act_out = tok.tokenize(str);
+     wsTok = bert.tokenizer.internal.WhitespaceTokenizer;
+     ustr = textanalytics.unicode.UTF32(wsTok.tokenize(str));
+     act_out = tok.tokenize(ustr);
      exp_out = ["foo","bar",tok.Unk,tok.Unk,tok.Unk,"bar","##foo"];
      test.verifyEqual(act_out,exp_out);
  end
