Skip to content

Commit 6715db0

Browse files
authored
Merge pull request #32 from matlab-deep-learning/japanese-bert
Adding two variations of Japanese-BERT
2 parents 87f02af + 3a2c48f commit 6715db0

14 files changed

+738
-79
lines changed

Diff for: +bert/+internal/convertModelNameToDirectories.m

+9-3
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
% convertModelNameToDirectories Converts the user facing model name to
33
% the directory name used by support files.
44

5-
% Copyright 2021 The MathWorks, Inc.
5+
% Copyright 2021-2023 The MathWorks, Inc.
66
arguments
77
name (1,1) string
88
end
99
modelName = userInputToSupportFileName(name);
10-
dirpath = {"data","networks","bert",modelName};
10+
bertBaseLocation = "bert";
11+
if contains(name,"japanese")
12+
bertBaseLocation = "ja_" + bertBaseLocation;
13+
end
14+
dirpath = {"data","networks",bertBaseLocation,modelName};
1115
end
1216

1317
function supportfileName = userInputToSupportFileName(name)
@@ -26,5 +30,7 @@
2630
"medium", "uncased_L8_H512_A8";
2731
"small", "uncased_L4_H512_A8";
2832
"mini", "uncased_L4_H256_A4";
29-
"tiny", "uncased_L2_H128_A2"];
33+
"tiny", "uncased_L2_H128_A2";
34+
"japanese-base-wwm", "";
35+
"japanese-base", ""];
3036
end

Diff for: +bert/+tokenizer/+internal/BasicTokenizer.m

+23-18
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
classdef BasicTokenizer < bert.tokenizer.internal.Tokenizer
22
% BasicTokenizer Perform basic tokenization.
33

4-
% Copyright 2020 The MathWorks, Inc.
4+
% Copyright 2020-2023 The MathWorks, Inc.
55

66
properties(SetAccess=private)
77
IgnoreCase
@@ -28,24 +28,29 @@
2828
function tokens = tokenize(this,text)
% tokenize   Apply basic tokenization to each element of a string array.
%
%   tokens = tokenize(this,text) tokenizes each element of the
%   (1,:) string array text. The output is a cell array with one
%   entry per input string; tokens{i} holds the tokens of text(i)
%   as returned by whiteSpaceTokenize.
arguments
    this (1,1) bert.tokenizer.internal.BasicTokenizer
    text (1,:) string
end
% Preallocate one cell per input element. Bug fix: previously this was
% cell(1,numel(string)) - numel of an empty string constructed by the
% string() constructor - which ignores the input size entirely.
tokens = cell(1,numel(text));
for i = 1:numel(text)
    thisText = text(i);
    u = textanalytics.unicode.UTF32(thisText);
    u = this.cleanText(u);
    u = this.tokenizeCJK(u);
    thisText = u.string();
    if this.IgnoreCase
        % Lower-case then NFD-normalize so accent stripping below sees
        % decomposed combining marks.
        thisText = lower(thisText);
        thisText = textanalytics.unicode.nfd(thisText);
    end
    u = textanalytics.unicode.UTF32(thisText);
    cats = u.characterCategories('Granularity','detailed');
    if this.IgnoreCase
        [u,cats] = this.stripAccents(u,cats);
    end
    theseTokens = this.splitOnPunc(u,cats);
    % Re-join punctuation-split pieces with spaces, then split on
    % whitespace to produce the final token list for this element.
    theseTokens = join(cat(2,theseTokens{:})," ");
    theseTokens = this.whiteSpaceTokenize(theseTokens);
    tokens{i} = theseTokens;
end
end
5055
end
5156

@@ -160,4 +165,4 @@
160165
inRange(udata,123,126);
161166
cats = string(cats);
162167
tf = (tf)|(cats.startsWith("P"));
163-
end
168+
end

Diff for: +bert/+tokenizer/+internal/FullTokenizer.m

+40-16
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,24 @@
55
% using the vocabulary specified in the newline delimited txt file
66
% vocabFile.
77
%
8-
% tokenizer = FullTokenizer(vocabFile,'IgnoreCase',tf) controls if
9-
% the FullTokenizer is case sensitive or not. The default value for
10-
% tf is true.
8+
% tokenizer = FullTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
9+
% specifies the optional parameter name/value pairs:
10+
%
11+
% 'BasicTokenizer' - Tokenizer used to split text into words.
12+
% If not specified, a default
13+
% BasicTokenizer is constructed.
14+
%
15+
% 'IgnoreCase' - A logical value to control if the
16+
% FullTokenizer is case sensitive or not.
17+
% The default value is true.
1118
%
1219
% FullTokenizer methods:
1320
% tokenize - tokenize text
1421
% encode - encode tokens
1522
% decode - decode encoded tokens
1623
%
1724
% Example:
18-
% % Save a file named vocab.txt with the text on the next 3 lines:
25+
% % Save a file named fakeVocab.txt with the text on the next 3 lines:
1926
% fake
2027
% vo
2128
% ##cab
@@ -30,7 +37,7 @@
3037
% % This returns the encoded form of the tokens - each token is
3138
% % replaced by its corresponding line number in the fakeVocab.txt
3239

33-
% Copyright 2021 The MathWorks, Inc.
40+
% Copyright 2021-2023 The MathWorks, Inc.
3441

3542
properties(Access=private)
3643
Basic
@@ -46,17 +53,24 @@
4653
% using the vocabulary specified in the newline delimited txt file
4754
% vocabFile.
4855
%
49-
% tokenizer = FullTokenizer(vocabFile,'IgnoreCase',tf) controls if
50-
% the FullTokenizer is case sensitive or not. The default value for
51-
% tf is true.
56+
% tokenizer = FullTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...) specifies
57+
% the optional parameter name/value pairs:
58+
%
59+
% 'BasicTokenizer' - Tokenizer used to split text into words.
60+
% If not specified, a default
61+
% BasicTokenizer is constructed.
62+
%
63+
% 'IgnoreCase' - A logical value to control if the
64+
% FullTokenizer is case sensitive or not.
65+
% The default value is true.
5266
%
5367
% FullTokenizer methods:
5468
% tokenize - tokenize text
5569
% encode - encode tokens
5670
% decode - decode encoded tokens
5771
%
5872
% Example:
59-
% % Save a file named vocab.txt with the text on the next 3 lines:
73+
% % Save a file named fakeVocab.txt with the text on the next 3 lines:
6074
% fake
6175
% vo
6276
% ##cab
@@ -72,9 +86,16 @@
7286
% % replaced by its corresponding line number in the fakeVocab.txt
7387
arguments
7488
vocab
89+
nvp.BasicTokenizer = []
7590
nvp.IgnoreCase = true
7691
end
77-
this.Basic = bert.tokenizer.internal.BasicTokenizer('IgnoreCase',nvp.IgnoreCase);
92+
if isempty(nvp.BasicTokenizer)
93+
% Default case
94+
this.Basic = bert.tokenizer.internal.BasicTokenizer('IgnoreCase',nvp.IgnoreCase);
95+
else
96+
mustBeA(nvp.BasicTokenizer,'bert.tokenizer.internal.Tokenizer');
97+
this.Basic = nvp.BasicTokenizer;
98+
end
7899
this.WordPiece = bert.tokenizer.internal.WordPieceTokenizer(vocab);
79100
this.Encoding = this.WordPiece.Vocab;
80101
end
@@ -85,12 +106,15 @@
85106
% tokens = tokenize(tokenizer,text) tokenizes the input
86107
% string text using the FullTokenizer specified by tokenizer.
87108
basicToks = this.Basic.tokenize(txt);
88-
basicToksUnicode = textanalytics.unicode.UTF32(basicToks);
89-
subToks = cell(numel(basicToks),1);
90-
for i = 1:numel(basicToks)
91-
subToks{i} = this.WordPiece.tokenize(basicToksUnicode(i));
109+
toks = cell(numel(txt),1);
110+
for i = 1:numel(txt)
111+
theseBasicToks = textanalytics.unicode.UTF32(basicToks{i});
112+
theseSubToks = cell(numel(theseBasicToks),1);
113+
for j = 1:numel(theseBasicToks)
114+
theseSubToks{j} = this.WordPiece.tokenize(theseBasicToks(j));
115+
end
116+
toks{i} = cat(2,theseSubToks{:});
92117
end
93-
toks = cat(2,subToks{:});
94118
end
95119

96120
function idx = encode(this,tokens)
@@ -109,4 +133,4 @@
109133
tokens = this.Encoding.ind2word(x);
110134
end
111135
end
112-
end
136+
end
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
classdef TokenizedDocumentTokenizer < bert.tokenizer.internal.Tokenizer
    % TokenizedDocumentTokenizer Implements a word-level tokenizer using
    % tokenizedDocument.

    % Copyright 2023 The MathWorks, Inc.

    properties
        % Cell array of extra positional arguments forwarded verbatim to
        % tokenizedDocument on every tokenize call.
        TokenizedDocumentOptions
        % Logical scalar; when true, input text is lower-cased before
        % tokenization.
        IgnoreCase
    end

    methods
        function this = TokenizedDocumentTokenizer(varargin,args)
            % TokenizedDocumentTokenizer(...,'IgnoreCase',tf) constructs
            % the tokenizer. Any positional arguments are captured and
            % passed through to tokenizedDocument when tokenize is
            % called. 'IgnoreCase' defaults to true.
            arguments(Repeating)
                varargin
            end
            arguments
                args.IgnoreCase (1,1) logical = true
            end
            this.IgnoreCase = args.IgnoreCase;
            this.TokenizedDocumentOptions = varargin;
        end

        function toks = tokenize(this,txt)
            % tokenize   Tokenize each element of the string vector txt.
            %
            % Returns the cell array produced by doc2cell, one cell of
            % tokens per input string.
            arguments
                this
                txt (1,:) string
            end
            if this.IgnoreCase
                txt = lower(txt);
            end
            t = tokenizedDocument(txt,this.TokenizedDocumentOptions{:});
            toks = doc2cell(t);
        end
    end
end

Diff for: +bert/+tokenizer/BERTTokenizer.m

+32-13
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,16 @@
99
% case-insensitive BERTTokenizer using the file vocabFile as
1010
% the vocabulary.
1111
%
12-
% tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
13-
% Constructs a BERTTokenizer which is case-sensitive or not
14-
% according to the scalar logical tf. The default is true.
12+
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
13+
% specifies the optional parameter name/value pairs:
14+
%
15+
% 'IgnoreCase' - A logical value to control if the
16+
% BERTTokenizer is case sensitive or not.
17+
% The default value is true.
18+
%
19+
% 'FullTokenizer' - The underlying word-piece tokenizer.
20+
% If not specified, a default
21+
% FullTokenizer is constructed.
1522
%
1623
% BERTTokenizer properties:
1724
% FullTokenizer - The underlying word-piece tokenizer.
@@ -34,7 +41,7 @@
3441
% tokenizer = bert.tokenizer.BERTTokenizer();
3542
% sequences = tokenizer.encode("Hello World!")
3643

37-
% Copyright 2021 The MathWorks, Inc.
44+
% Copyright 2021-2023 The MathWorks, Inc.
3845

3946
properties(Constant)
4047
PaddingToken = "[PAD]"
@@ -63,9 +70,16 @@
6370
% case-insensitive BERTTokenizer using the file vocabFile as
6471
% the vocabulary.
6572
%
66-
% tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
67-
% Constructs a BERTTokenizer which is case-sensitive or not
68-
% according to the scalar logical tf. The default is true.
73+
% tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
74+
% specifies the optional parameter name/value pairs:
75+
%
76+
% 'IgnoreCase' - A logical value to control if the
77+
% BERTTokenizer is case sensitive or not.
78+
% The default value is true.
79+
%
80+
% 'FullTokenizer' - The underlying word-piece tokenizer.
81+
% If not specified, a default
82+
% FullTokenizer is constructed.
6983
%
7084
% BERTTokenizer properties:
7185
% FullTokenizer - The underlying word-piece tokenizer.
@@ -90,9 +104,15 @@
90104
arguments
91105
vocabFile (1,1) string {mustBeFile} = bert.internal.getSupportFilePath("base","vocab.txt")
92106
nvp.IgnoreCase (1,1) logical = true
107+
nvp.FullTokenizer = []
108+
end
109+
if isempty(nvp.FullTokenizer)
110+
ignoreCase = nvp.IgnoreCase;
111+
this.FullTokenizer = bert.tokenizer.internal.FullTokenizer(vocabFile,'IgnoreCase',ignoreCase);
112+
else
113+
mustBeA(nvp.FullTokenizer,'bert.tokenizer.internal.FullTokenizer');
114+
this.FullTokenizer = nvp.FullTokenizer;
93115
end
94-
ignoreCase = nvp.IgnoreCase;
95-
this.FullTokenizer = bert.tokenizer.internal.FullTokenizer(vocabFile,'IgnoreCase',ignoreCase);
96116
this.PaddingCode = this.FullTokenizer.encode(this.PaddingToken);
97117
this.SeparatorCode = this.FullTokenizer.encode(this.SeparatorToken);
98118
this.StartCode = this.FullTokenizer.encode(this.StartToken);
@@ -131,10 +151,9 @@
131151
inputShape = size(text_a);
132152
text_a = reshape(text_a,[],1);
133153
text_b = reshape(text_b,[],1);
134-
tokenize = @(text) this.FullTokenizer.tokenize(text);
135-
tokens = arrayfun(tokenize,text_a,'UniformOutput',false);
154+
tokens = this.FullTokenizer.tokenize(text_a);
136155
if ~isempty(text_b)
137-
tokens_b = arrayfun(tokenize,text_b,'UniformOutput',false);
156+
tokens_b = this.FullTokenizer.tokenize(text_b);
138157
tokens = cellfun(@(tokens_a,tokens_b) [tokens_a,this.SeparatorToken,tokens_b], tokens, tokens_b, 'UniformOutput', false);
139158
end
140159
tokens = cellfun(@(tokens) [this.StartToken, tokens, this.SeparatorToken], tokens, 'UniformOutput', false);
@@ -218,4 +237,4 @@
218237
text = cellfun(@(x) join(x," "), tokens);
219238
end
220239
end
221-
end
240+
end

0 commit comments

Comments
 (0)