Skip to content

Commit

Permalink
[aisingapore#61] In progress - preprocessor.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ktyap committed Oct 5, 2022
1 parent 05e888d commit fdddbb6
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 4 deletions.
2 changes: 0 additions & 2 deletions sgnlp/models/bieru/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,6 @@ def eval_model(cfg):

best_loss, best_label, best_pred, best_mask = None, None, None, None

epochs = cfg.train_args["epochs"]

start_time = time.time()
#scheduler.step()
test_loss, test_acc, test_label, test_pred, test_mask, test_fscore = evaluate(model, loss_function, test_loader, cuda)
Expand Down
51 changes: 51 additions & 0 deletions sgnlp/models/bieru/preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import json
import re
import unicodedata

class baseTokenizer():
    """Base Tokenizer Class (Inherited by all subclasses).

    Holds the special-token strings and the text-normalization helpers
    shared by every concrete tokenizer.
    """

    def __init__(self):
        # Special token strings shared by all tokenizer subclasses.
        self.PAD_WORD = '[PAD]'
        self.UNK_WORD = '[UNK]'

    def unicodeToAscii(self, utterance):
        """Strip diacritics: NFD-decompose, then drop combining marks.

        Args:
            utterance (str): input text.

        Returns:
            str: text with combining characters (category 'Mn') removed.
        """
        return ''.join(
            c for c in unicodedata.normalize('NFD', utterance)
            if unicodedata.category(c) != 'Mn'
        )

    def normalizeString(self, raw_utterance):
        """Lowercase, strip accents, and remove non-alphabetic runs.

        Args:
            raw_utterance (str): raw utterance text.

        Returns:
            str: normalized text where sentence punctuation is separated
            from the preceding word and every other non-alphabetic run is
            collapsed into a single space.
        """
        # Fix: the original bound the local to `str`, shadowing the builtin.
        text = self.unicodeToAscii(raw_utterance.lower().strip())
        # Detach , . ' ! ? from the word that precedes them.
        text = re.sub(r"([,.'!?])", r" \1", text)
        # Collapse any run of characters outside [a-zA-Z,.'!?] to one space.
        text = re.sub(r"[^a-zA-Z,.'!?]+", r" ", text)
        return text

    def process(self, utterance):
        """Convert an utterance to model input; implemented by subclasses."""
        pass

class gloveTokenizer(baseTokenizer):
    """Glove Tokenizer for Glove Embedding (End2End Model).

    Maps normalized utterance words to integer indexes via a JSON
    {word: index} vocabulary loaded at construction time.
    """

    def __init__(self, vocab_path):
        """Initialize and load the vocabulary.

        Args:
            vocab_path (str): path to a JSON file mapping word -> index.
        """
        super(gloveTokenizer, self).__init__()
        self.PAD = 0  # index reserved for padding
        self.UNK = 1  # index returned for out-of-vocabulary words
        self.word2id = None
        self.loadVocabFromJson(vocab_path)

    def loadVocabFromJson(self, path):
        """Load the {word: index} vocabulary from a JSON file.

        Fix: the original `json.load(open(path))` never closed the file
        handle; use a context manager and an explicit UTF-8 encoding.
        """
        with open(path, encoding="utf-8") as f:
            self.word2id = json.load(f)

    def process(self, utterance):
        """Convert an utterance into a list of vocabulary indexes.

        Args:
            utterance (str): raw utterance text.

        Returns:
            list[int]: one index per token; unknown words map to self.UNK.
        """
        # baseTokenizer.normalizeString removes non-alphabetics and
        # already lowercases; the extra .lower() below is redundant but
        # harmless and kept for safety.
        utterance = self.normalizeString(utterance)
        wordList = [word.lower() for word in utterance.split()]
        indexes = [self.word2id.get(word, self.UNK) for word in wordList]
        return indexes

class BieruPreprocessor:
    """Preprocessor entry point for the BiERU model.

    NOTE(review): stub from an in-progress commit — no behavior yet;
    presumably it will wrap a tokenizer such as gloveTokenizer (verify
    against the final implementation).
    """
    def __init__(self):
        # Intentionally empty: implementation pending.
        pass

5 changes: 3 additions & 2 deletions sgnlp/models/bieru/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ def train_model(cfg):
else:
loss_function = MaskedNLLLoss()
optimizer = optim.Adam(model.parameters(),
lr=cfg.train_args["lr"],
weight_decay=cfg.train_args["l2"])
lr=cfg.train_args["lr"])
#weight_decay=cfg.train_args["l2"])
#scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10], gamma=0.5, last_epoch=-1)
#scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5,eta_min=4e-08)
#scheduler = optim.lr_scheduler.StepLR(optimizer, 1, 0.99)
Expand Down Expand Up @@ -185,6 +185,7 @@ def train_model(cfg):
train_loader, cuda, optimizer, True)
valid_loss, valid_acc, _,_,_, val_fscore = train(model, loss_function, valid_loader, cuda)
#scheduler.step()

test_loss, test_acc, test_label, test_pred, test_mask, test_fscore = train(model, loss_function, test_loader, cuda)

if best_loss == None or best_loss > test_loss:
Expand Down

0 comments on commit fdddbb6

Please sign in to comment.