From b3d67a5ed1b0dca9aa284ee6a911fc16fcd3b726 Mon Sep 17 00:00:00 2001 From: Kazuya Shimura Date: Fri, 21 Sep 2018 21:55:10 +0900 Subject: [PATCH] add files --- MyEvaluator.py | 82 + MyUpdater.py | 62 + README.md | 114 +- Tree/Amazon_all.tree | 21846 +++++++++++++++++++++++++++++++++++++++++ cnn_model.py | 85 + cnn_train.py | 217 + data_helper.py | 238 + example.sh | 22 + hft_cnn_env.yml | 74 + requirements.txt | 52 + train.py | 198 + tree.py | 49 + xml_cnn_model.py | 76 + 13 files changed, 23113 insertions(+), 2 deletions(-) create mode 100644 MyEvaluator.py create mode 100644 MyUpdater.py create mode 100644 Tree/Amazon_all.tree create mode 100644 cnn_model.py create mode 100644 cnn_train.py create mode 100644 data_helper.py create mode 100644 example.sh create mode 100644 hft_cnn_env.yml create mode 100644 requirements.txt create mode 100644 train.py create mode 100644 tree.py create mode 100644 xml_cnn_model.py diff --git a/MyEvaluator.py b/MyEvaluator.py new file mode 100644 index 0000000..4d2d012 --- /dev/null +++ b/MyEvaluator.py @@ -0,0 +1,82 @@ +import copy + +import six + +from chainer import configuration +from chainer.dataset import convert +from chainer.dataset import iterator as iterator_module +from chainer import functions as F +from chainer import function +from chainer import link +from chainer import reporter as reporter_module +from chainer.training import extensions +from chainer.training import extension +from chainer import cuda +import numpy as np +import scipy.sparse as sp +import pdb + +class MyEvaluator(extensions.Evaluator): + + trigger = 1, 'epoch' + default_name = 'validation' + priority = extension.PRIORITY_WRITER + + name = None + + def __init__(self, iterator, target, class_dim, converter=convert.concat_examples, + device=None, eval_hook=None, eval_func=None): + if isinstance(iterator, iterator_module.Iterator): + iterator = {'main': iterator} + self._iterators = iterator + + if isinstance(target, link.Link): + target = {'main': target} + self._targets = target + + self.converter = converter + self.device = device + self.eval_hook = eval_hook + self.eval_func = eval_func + self.class_dim = class_dim + + def evaluate(self): + + iterator = self._iterators['main'] + eval_func = self.eval_func or self._targets['main'] + + if self.eval_hook: + self.eval_hook(self) + + if hasattr(iterator, 'reset'): + iterator.reset() + it = iterator + else: + it = copy.copy(iterator) + + summary = reporter_module.DictSummary() + + for batch in it: + observation = {} + with reporter_module.report_scope(observation): + row_idx, col_idx, val_idx = [], [], [] + x = cuda.to_gpu(np.array([i[0] for i in batch])) + labels = [l[1] for l in batch] + for i in range(len(labels)): + l_list = list(set(labels[i])) # remove duplicate cateories to avoid double count + for y in l_list: + row_idx.append(i) + col_idx.append(y) + val_idx.append(1) + m = len(labels) + n = self.class_dim + t = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n), dtype=np.int8).todense() + t = cuda.to_gpu(t) + + with function.no_backprop_mode(): + #pdb.set_trace() + loss = F.sigmoid_cross_entropy(eval_func(x), t) + summary.add({MyEvaluator.default_name + '/main/loss':loss}) + summary.add(observation) + + return summary.compute_mean() \ No newline at end of file diff --git a/MyUpdater.py b/MyUpdater.py new file mode 100644 index 0000000..1e86e95 --- /dev/null +++ b/MyUpdater.py @@ -0,0 +1,62 @@ +import six +import numpy as np +import chainer +import chainer.functions as F +import chainer.links as L +from chainer 
import cuda, training, reporter
+from chainer.datasets import get_mnist
+from chainer.training import trainer, extensions
+from chainer.dataset import convert
+from chainer.dataset import iterator as iterator_module
+from chainer import optimizer as optimizer_module
+import scipy.sparse as sp
+import pdb
+
+class MyUpdater(training.StandardUpdater):
+    def __init__(self, iterator, optimizer, class_dim, converter=convert.concat_examples,
+                 device=None, loss_func=None):
+        if isinstance(iterator, iterator_module.Iterator):
+            iterator = {'main': iterator}
+        self._iterators = iterator
+
+        if not isinstance(optimizer, dict):
+            optimizer = {'main': optimizer}
+        self._optimizers = optimizer
+
+        if device is not None and device >= 0:
+            for optimizer in six.itervalues(self._optimizers):
+                optimizer.target.to_gpu(device)
+
+        self.converter = converter
+        self.loss_func = loss_func
+        self.device = device
+        self.iteration = 0
+        self.class_dim = class_dim
+
+    def update_core(self):
+        batch = self._iterators['main'].next()
+
+        x = chainer.cuda.to_gpu(np.array([i[0] for i in batch]))
+        labels = [l[1] for l in batch]
+        row_idx, col_idx, val_idx = [], [], []
+        for i in range(len(labels)):
+            l_list = list(set(labels[i]))  # remove duplicate categories to avoid double counting
+            for y in l_list:
+                row_idx.append(i)
+                col_idx.append(y)
+                val_idx.append(1)
+        m = len(labels)
+        n = self.class_dim
+        t = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n), dtype=np.int8).todense()
+
+        t = chainer.cuda.to_gpu(t)
+
+        optimizer = self._optimizers['main']
+        optimizer.target.cleargrads()
+        loss = F.sigmoid_cross_entropy(optimizer.target(x), t)
+        chainer.reporter.report({'main/loss': loss})
+        loss.backward()
+        optimizer.update()
+
diff --git a/README.md b/README.md
index e950c0f..b82b110 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,112 @@
-# HFT-CNN
-to be prepared.
\ No newline at end of file
+HFT-CNN
+==
+This code performs document classification with the following four CNN models:
+* Flat model : trained without using the hierarchical category structure
+* Without Fine-tuning (WoFt) model : uses the hierarchical structure, but without fine-tuning
+* Hierarchical Fine-Tuning (HFT) model : uses both the hierarchical structure and fine-tuning
+* XML-CNN model ([Liu+ '17](http://nyc.lti.cs.cmu.edu/yiming/Publications/jliu-sigir17.pdf)) : the model proposed by Liu et al. '17
+
+Please refer to the following paper when using this code:
+
+**HFT-CNN: Learning Hierarchical Category Structure for Multi-label Short Text Categorization** Kazuya Shimura, Jiyi Li and Fumiyo Fukumoto. EMNLP, 2018.
+
+
+### Characteristics of each model
+
+| Feature \ Method        | Flat model    | WoFt model    | HFT model     | XML-CNN model       |
+|------------------------:|:-------------:|:-------------:|:-------------:|:-------------------:|
+| Hierarchical Structure  |               | ✔             | ✔             |                     |
+| Fine-tuning             |               |               | ✔             |                     |
+| Pooling Type            | 1-max pooling | 1-max pooling | 1-max pooling | dynamic max pooling |
+| Compact Representation  |               |               |               | ✔                   |
+
+## Requirements
+The main libraries needed to run this code are:
+* Python 3.5.4 or later
+* Chainer 4.0.0 or later ([chainer](http://chainer.org/))
+* CuPy 4.0.0 or later ([cupy](https://cupy.chainer.org/))
+
+Note:
+* The current version of the code assumes that a **GPU** is used.
+* See requirements.txt for the full list of libraries required to run the code.
+
+## Installation
+* Download the code from **clone or download** on this page
+* Install the libraries listed in requirements.txt to set up the runtime environment
+* If needed, build a virtual environment with Anaconda ([anaconda](https://www.anaconda.com/enterprise/)) as follows:
+  1. Install the version that matches your environment from the [Anaconda download page](https://www.anaconda.com/download/)
+     * Example: installing on Linux (x86 architecture, 64 bit):
+       1. wget https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh
+       1. bash Anaconda3-5.1.0-Linux-x86_64.sh
+
+     The two commands above install Anaconda.
+  3. After installing Anaconda, build the virtual environment:
+  ```conda env create -f=hft_cnn_env.yml```
+  4. Switch to the virtual environment with ```source activate hft_cnn_env```
+  5. The HFT-CNN code can now be run inside this environment
+
+
+## Quick-start
+Running example.sh classifies the sample documents (Amazon product reviews) with the Flat model:
+```
+bash example.sh
+--------------------------------------------------
+Loading data...
+Loading train data: 100%|██████████████████████████████████████████████████| 465927/465927 [00:18<00:00, 24959.42it/s]
+Loading valid data: 100%|████████████████████████████████████████████████████| 24522/24522 [00:00<00:00, 27551.44it/s]
+Loading test data: 100%|███████████████████████████████████████████████████| 153025/153025 [00:05<00:00, 27051.62it/s]
+--------------------------------------------------
+Loading Word embedings...
+```
+After training, the results are saved in the CNN directory:
+* RESULT : classification results for the test data
+* PARAMS : CNN parameters after training
+* LOG : training log files
+
+### Changing the training model
+The model to be trained can be changed by editing ```ModelType``` in ```example.sh```:
+```
+## Network Type (XML-CNN, CNN-Flat, CNN-Hierarchy, CNN-fine-tuning or Pre-process)
+ModelType=XML-CNN
+```
+Note:
+* When choosing CNN-Hierarchy or CNN-fine-tuning, first run training with **Pre-process** and then train with the chosen model
+* Pre-process trains only the first level of the hierarchy and saves the CNN parameters
+* The parameters saved at that point are shared by both CNN-Hierarchy and CNN-fine-tuning
+
+### Word embeddings
+This code uses vectors trained with [fastText](https://github.com/facebookresearch/fastText) as the word embeddings.
+
+You can point ```EmbeddingWeightsPath``` in ```example.sh``` to the fastText ```bin``` file to use as the initial values of the word-embedding layer.
+
+If no fastText ```bin``` file is provided, word embeddings trained on the English Wikipedia corpus are downloaded automatically with [chakin](https://github.com/chakki-works/chakin).
+
+If you run ```example.sh``` without modifying the code, ```wiki.en.vec``` is downloaded into the Word_embedding directory and used.
+```
+## Embedding Weights Type (fastText .bin and .vec)
+EmbeddingWeightsPath=./Word_embedding/
+```
+
+
+
+
+## Training a model on new data
+### About the data
+#### Types
+Three kinds of document data are required:
+* Training data : data used to train the CNN
+* Validation data : data used to check the generalization performance of the CNN
+* Test data : data to be classified with the CNN
+
+The validation data is used to evaluate the generalization error of the CNN at every epoch; early stopping is applied if continued training starts to overfit. The CNN parameters that are saved are those from the epoch with the smallest validation error.
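+
+For reference, this early-stopping behaviour is implemented in cnn_train.py with Chainer's ```EarlyStoppingTrigger``` plus a ```MinValueTrigger```-gated snapshot. A minimal sketch of the same pattern is shown below; the names ```build_trainer```, ```max_epoch``` and ```best_model.npz``` are illustrative placeholders, not identifiers used in this repository:
+```
+from chainer import training
+from chainer.training import extensions
+
+def build_trainer(updater, model, max_epoch):
+    # Stop when the validation loss stops improving, but never run past max_epoch.
+    stop_trigger = training.triggers.EarlyStoppingTrigger(
+        monitor='validation/main/loss', max_trigger=(max_epoch, 'epoch'))
+    trainer = training.Trainer(updater, stop_trigger, out='./CNN/')
+    # Save the parameters only when the validation loss hits a new minimum,
+    # so the stored snapshot always belongs to the best epoch.
+    trainer.extend(
+        extensions.snapshot_object(model, 'best_model.npz'),
+        trigger=training.triggers.MinValueTrigger('validation/main/loss', trigger=(1, 'epoch')))
+    return trainer
+```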
+ +#### 形式 + +### 文書データが階層構造を有する場合 + +## License + + + + + diff --git a/Tree/Amazon_all.tree b/Tree/Amazon_all.tree new file mode 100644 index 0000000..57fdafd --- /dev/null +++ b/Tree/Amazon_all.tree @@ -0,0 +1,21846 @@ + +All_Electronics +Alternative_Rock +Alternative_Rock= 0.5) + return np_predicts + +def set_seed_random(seed): + random.seed(seed) + np.random.seed(seed) + if chainer.cuda.available: + chainer.cuda.cupy.random.seed(seed) + +def main(params): + print("") + print('# gpu: {}'.format(params["gpu"])) + print('# unit: {}'.format(params["unit"])) + print('# batch-size: {}'.format(params["batchsize"])) + print('# epoch: {}'.format(params["epoch"])) + print('# number of category: {}'.format(params["output-dimensions"])) + print('# embedding dimension: {}'.format(params["embedding-dimensions"])) + print('# current layer: {}'.format(params["currentDepth"])) + print('# model-type: {}'.format(params["model-type"])) + print('') + + + f = open('./CNN/LOG/configuration_' + params["currentDepth"] + '.txt', 'w') + f.write('# gpu: {}'.format(params["gpu"])+"\n") + f.write('# unit: {}'.format(params["unit"])+"\n") + f.write('# batch-size: {}'.format(params["batchsize"])+"\n") + f.write('# epoch: {}'.format(params["epoch"])+"\n") + f.write('# number of category: {}'.format(params["output-dimensions"])+"\n") + f.write('# embedding dimension: {}'.format(params["embedding-dimensions"])+"\n") + f.write('# current layer: {}'.format(params["currentDepth"])+"\n") + f.write('# model-type: {}'.format(params["model-type"])+"\n") + f.write("\n") + f.close() + + embeddingWeights = params["embeddingWeights"] + embeddingDimensions = params["embedding-dimensions"] + inputData = params["inputData"] + x_train = inputData['X_trn'] + x_test = inputData['X_val'] + y_train = inputData['Y_trn'] + y_test = inputData['Y_val'] + + cnn_params = {"cudnn":USE_CUDNN, + "out_channels":params["outchannels"], + "row_dim":embeddingDimensions, + "batch_size":params["batchsize"], + "hidden_dim":params["unit"], + "n_classes":params["output-dimensions"], + "embeddingWeights":embeddingWeights, + } + if params["fineTuning"] == 0: + cnn_params['mode'] = 'scratch' + elif params["fineTuning"] == 1: + cnn_params['mode'] = 'fine-tuning' + cnn_params['load_param_node_name'] = params['upperDepth'] + + if params["model-type"] == "XML-CNN": + model = xml_cnn_model.CNN(**cnn_params) + else: + model = cnn_model.CNN(**cnn_params) + + if params["gpu"] >= 0: + chainer.cuda.get_device_from_id(params["gpu"]).use() + model.to_gpu() + + optimizer = chainer.optimizers.Adam() + optimizer.setup(model) + + train = tuple_dataset.TupleDataset(x_train, y_train) + test = tuple_dataset.TupleDataset(x_test, y_test) + + train_iter = chainer.iterators.SerialIterator(train, params["batchsize"], repeat=True, shuffle=False) + test_iter = chainer.iterators.SerialIterator(test, params["batchsize"], repeat = False, shuffle=False) + + stop_trigger = training.triggers.EarlyStoppingTrigger( + monitor='validation/main/loss', + max_trigger=(params["epoch"], 'epoch')) + + + from MyUpdater import MyUpdater + updater = MyUpdater(train_iter, optimizer, params["output-dimensions"], device=params["gpu"]) + trainer = training.Trainer(updater, stop_trigger, out='./CNN/') + + from MyEvaluator import MyEvaluator + trainer.extend(MyEvaluator(test_iter, model, class_dim=params["output-dimensions"], device=params["gpu"])) + trainer.extend(extensions.dump_graph('main/loss')) + + trainer.extend(extensions.snapshot_object(model, 'parameters_for_multi_label_model_' + 
params["currentDepth"] + '.npz'),trigger=training.triggers.MinValueTrigger('validation/main/loss',trigger=(1,'epoch'))) + + trainer.extend(extensions.LogReport(log_name='LOG/log_' + params["currentDepth"] + ".txt", trigger=(1, 'epoch'))) + + trainer.extend(extensions.PrintReport( + ['epoch', 'main/loss', 'validation/main/loss', + 'elapsed_time'])) + trainer.extend(extensions.ProgressBar()) + + trainer.extend( + extensions.PlotReport(['main/loss', 'validation/main/loss'], + 'epoch', file_name='LOG/loss_' + params["currentDepth"] + '.png')) + + trainer.run() + + + filename = 'parameters_for_multi_label_model_' + params["currentDepth"] + '.npz' + src = './CNN/' + dst = './CNN/PARAMS' + shutil.move(os.path.join(src, filename), os.path.join(dst, filename)) + + print ("-"*50) + print ("Testing...") + + X_tst = inputData['X_tst'] + Y_tst = inputData['Y_tst'] + N_eval = len(X_tst) + + cnn_params['mode'] = 'test-predict' + cnn_params['load_param_node_name'] = params["currentDepth"] + + if params["model-type"] == "XML-CNN": + model = xml_cnn_model.CNN(**cnn_params) + else: + model = cnn_model.CNN(**cnn_params) + + model.to_gpu() + output = np.zeros([N_eval,params["output-dimensions"]],dtype=np.int8) + output_probability_file_name = "CNN/RESULT/probability_" + params["currentDepth"] + ".csv" + with open(output_probability_file_name, 'w') as f: + f.write(','.join(params["learning_categories"])+"\n") + + test_batch_size = params["batchsize"] + with chainer.using_config('train', False), chainer.no_backprop_mode(): + for i in tqdm(six.moves.range(0, N_eval, test_batch_size),desc="Predict Test loop"): + x = chainer.Variable(chainer.cuda.to_gpu(X_tst[i:i + test_batch_size])) + t = Y_tst[i:i + test_batch_size] + net_output = F.sigmoid(model(x)) + output[i: i + test_batch_size] = select_function(net_output.data) + with open(output_probability_file_name , 'a') as f: + tmp = chainer.cuda.to_cpu(net_output.data) + low_values_flags = tmp < 0.001 + tmp[low_values_flags] = 0 + np.savetxt(f,tmp,fmt='%.4g',delimiter=",") + return output + +def load_top_level_weights(params): + print ("-"*50) + print ("Testing...") + + embeddingWeights = params["embeddingWeights"] + embeddingDimensions = params["embedding-dimensions"] + inputData = params["inputData"] + + cnn_params = {"cudnn":USE_CUDNN, + "out_channels":params["outchannels"], + "row_dim":embeddingDimensions, + "batch_size":params["batchsize"], + "hidden_dim":params["unit"], + "n_classes":params["output-dimensions"], + "embeddingWeights":embeddingWeights, + } + + X_tst = inputData['X_tst'] + Y_tst = inputData['Y_tst'] + N_eval = len(X_tst) + + cnn_params['mode'] = 'test-predict' + cnn_params['load_param_node_name'] = params["currentDepth"] + if params["model-type"] == "XML-CNN": + model = xml_cnn_model.CNN(**cnn_params) + else: + model = cnn_model.CNN(**cnn_params) + + model.to_gpu() + output = np.zeros([N_eval,params["output-dimensions"]],dtype=np.int8) + output_probability_file_name = "CNN/RESULT/probability_" + params["currentDepth"] + ".csv" + with open(output_probability_file_name, 'w') as f: + f.write(','.join(params["learning_categories"])+"\n") + + test_batch_size = params["batchsize"] + with chainer.using_config('train', False), chainer.no_backprop_mode(): + for i in tqdm(six.moves.range(0, N_eval, test_batch_size),desc="Predict Test loop"): + x = chainer.Variable(chainer.cuda.to_gpu(X_tst[i:i + params["batchsize"]])) + t = Y_tst[i:i + test_batch_size] + net_output = F.sigmoid(model(x)) + output[i: i + test_batch_size] = select_function(net_output.data) + 
with open(output_probability_file_name , 'a') as f: + tmp = chainer.cuda.to_cpu(net_output.data) + low_values_flags = tmp < 0.001 + tmp[low_values_flags] = 0 + np.savetxt(f,tmp,fmt='%.4g',delimiter=",") + return output + + + diff --git a/data_helper.py b/data_helper.py new file mode 100644 index 0000000..fbbdcc4 --- /dev/null +++ b/data_helper.py @@ -0,0 +1,238 @@ +from gensim.models.wrappers.fasttext import FastText +from gensim.models import KeyedVectors +from tqdm import tqdm +from collections import defaultdict +import scipy.sparse as sp +import numpy as np +from sklearn.metrics import f1_score +from sklearn.metrics import classification_report +from sklearn.preprocessing import MultiLabelBinarizer +from itertools import chain +import re +import chakin +import os +import pdb + +def clean_str(string): + string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) + string = re.sub(r"\'s", " \'s", string) + string = re.sub(r"\'ve", " \'ve", string) + string = re.sub(r"n\'t", " n\'t", string) + string = re.sub(r"\'re", " \'re", string) + string = re.sub(r"\'d", " \'d", string) + string = re.sub(r"\'ll", " \'ll", string) + string = re.sub(r",", " , ", string) + string = re.sub(r"!", " ! ", string) + string = re.sub(r"\(", " \( ", string) + string = re.sub(r"\)", " \) ", string) + string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\s{2,}", " ", string) + return string.strip().lower() + +def make_data_list(data, kind_of_data, tree_info, max_sen_len, vocab, catgy, articleID, useWords): + data_list = [] + for line in tqdm(data,desc="Loading " + kind_of_data + " data"): + tmp_dict = dict() + line = line[:-1] + tmp_dict['text'] = ' '.join(clean_str(' '.join(line.split("\t")[1].split(" "))).split(" ")[:useWords]) + [vocab[word] for word in tmp_dict['text'].split(" ")] + tmp_dict['num_words'] = len(tmp_dict['text'].split(" ")) + max_sen_len = max(max_sen_len, tmp_dict['num_words']) + tmp_dict['split'] = kind_of_data + tmp_dict['hie_info'] = list(set([tree_info[cat] for cat in line.split("\t")[0].split(",")])) + tmp_dict['catgy'] = [cat for cat in line.split("\t")[0].split(",")] + [catgy[cat] for cat in line.split("\t")[0].split(",")] + tmp_dict['id'] = str(articleID) + articleID += 1 + data_list.append(tmp_dict) + del tmp_dict + return data_list, max_sen_len, vocab, catgy, articleID + + +def data_load(train, valid, test, tree_info, useWords): + vocab = defaultdict( lambda: len(vocab) ) + catgy = defaultdict( lambda: len(catgy) ) + articleID = 0 + max_sen_len = 0 + + train_list, max_sen_len, vocab, catgy, articleID = make_data_list(train, 'train', tree_info, max_sen_len, vocab, catgy, articleID, useWords) + valid_list, max_sen_len, vocab, catgy, articleID = make_data_list(valid, 'valid', tree_info, max_sen_len, vocab, catgy, articleID, useWords) + test_list, max_sen_len, vocab, catgy, articleID = make_data_list(test, 'test', tree_info, max_sen_len, vocab, catgy, articleID, useWords) + + class_dim = len(catgy) + + data = {} + data['train'] = train_list + data['test'] = test_list + data['valid'] = valid_list + data['vocab'] = vocab + data['catgy'] = catgy + data['max_sen_len'] = max_sen_len + data['class_dim'] = class_dim + return data + +def embedding_weights_load(words_map,embeddingWeights_path): + pre_trained_embedding = None + try: + model = FastText.load_fasttext_format(embeddingWeights_path) + pre_trained_embedding = "bin" + except: + print ("fastText binary file (.bin) is not found!") + if os.path.exists("./Word_embedding/wiki.en.vec"): + print ("Using wikipedia(en) pre-trained word 
vectors.") + else: + print ("Downloading wikipedia(en) pre-trained word vectors.") + chakin.download(number=2, save_dir="./Word_embedding") + print ("Loading vectors...") + model = KeyedVectors.load_word2vec_format('./Word_embedding/wiki.en.vec') + pre_trained_embedding = "txt" + + vocab_size = len(words_map) + word_dimension = model['a'].shape[0] + W = np.zeros((vocab_size,word_dimension),dtype=np.float32) + + for k,v in words_map.items(): + word = k + word_number = v + + try: + W[word_number][:] = model[word] + except KeyError as e: + if pre_trained_embedding == "bin": + W[word_number][:] = model.seeded_vector(word) + else: + np.random.seed(word_number) + W[word_number][:] = np.random.uniform(-0.25, 0.25, word_dimension) + return W + +def get_catgy_mapping(network_output_order_list, test_labels, prediction,currentDepth): + + predictResult = [] + grandLabels = [] + + for i in range(len(test_labels)): + predictResult.append([]) + grandLabels.append([]) + + class_dim = prediction.shape[1] + + row_idx, col_idx, val_idx = [], [], [] + for i in range(len(test_labels)): + l_list = list(set(test_labels[i])) + for y in l_list: + row_idx.append(i) + col_idx.append(y) + val_idx.append(1) + m = max(row_idx) + 1 + n = max(col_idx) + 1 + n = max(class_dim, n) + test_labels = sp.csr_matrix((val_idx, (row_idx, col_idx)), shape=(m, n), dtype=np.int8).todense() + + np_orderList = np.array(network_output_order_list) + + for i,j in tqdm(enumerate(prediction), desc="Generating predict labels..."): + one_hots = np.where(j == 1)[0] + if len(one_hots) >= 1: + predictResult[i] = np_orderList[one_hots].tolist() + + output_grand_truth_file_name = "CNN/RESULT/grand_truth_" + currentDepth + ".csv" + with open(output_grand_truth_file_name, 'w') as f: + f.write(','.join(network_output_order_list)+"\n") + + with open(output_grand_truth_file_name, 'a') as f: + for i,j in tqdm(enumerate(test_labels), desc="Generating grand truth labels..."): + one_hots = np.where(j == 1)[1] + if len(one_hots) >= 1: + grandLabels[i] = np_orderList[one_hots].tolist() + f.write(",".join(grandLabels[i])+"\n") + else: + f.write("\n") + + return grandLabels,predictResult + + +def write_out_prediction(GrandLabels, PredResult, input_data_dic): + + # Writing out prediction + # =================================================== + print ("-"*50) + print ("Writing out prediction...") + test_data = input_data_dic['test'] + result_file = open("./CNN/RESULT/Prediction.txt", mode="w") + result_file.write("Grand-truth-label\tPrediction-labels\tInput-text\n") + for g,p,t in zip(GrandLabels, PredResult, test_data): + result_file.write("{}\t{}\t{}\n".format(','.join(sorted(g)), ','.join(sorted(p)), t['text'])) + result_file.close() + +# Making Problems +#======================================================== + +def build_input_sentence_data(sentences): + x = np.array(sentences) + return x + +def build_input_label_data(labels, class_order): + from sklearn.preprocessing import MultiLabelBinarizer + from itertools import chain + + bml = MultiLabelBinarizer(classes=class_order, sparse_output=True) + indexes = sp.find(bml.fit_transform(labels)) + Y = [] + + for i in range(len(labels)): + Y.append([]) + for i,j in zip(indexes[0], indexes[1]): + Y[i].append(j) + return Y + +def pad_sentences(sentences, padding_word=-1, max_length=50): + sequence_length = max(max(len(x) for x in sentences), max_length) + padded_sentences = [] + for i in range(len(sentences)): + sentence = sentences[i] + if len(sentence) < max_length: + num_padding = sequence_length - 
len(sentence) + new_sentence = sentence + [padding_word] * num_padding + else: + new_sentence = sentence[:max_length] + padded_sentences.append(new_sentence) + return padded_sentences + +def build_problem(learning_categories, depth, input_data_dic): + + train_data = input_data_dic['train'] + validation_data = input_data_dic['valid'] + test_data = input_data_dic['test'] + vocab = input_data_dic['vocab'] + max_sen_len = input_data_dic['max_sen_len'] + + if depth == "flat": + trn_text = [[vocab[word] for word in doc['text'].split()] for doc in train_data] + trn_labels = [doc['catgy'] for doc in train_data] + val_text = [[vocab[word] for word in doc['text'].split()] for doc in validation_data] + val_labels = [doc['catgy'] for doc in validation_data] + tst_text = [[vocab[word] for word in doc['text'].split()] for doc in test_data] + tst_labels = [doc['catgy'] for doc in test_data] + + else: + layer = int(depth[:-2]) + trn_text = [[vocab[word] for word in doc['text'].split()] for doc in train_data if (layer in doc['hie_info']) or ((layer-1) in doc['hie_info'])] + trn_labels = [list( set(doc['catgy']) & set(learning_categories)) for doc in train_data if (layer in doc['hie_info']) or ((layer-1) in doc['hie_info'])] + val_text = [[vocab[word] for word in doc['text'].split()] for doc in validation_data if (layer in doc['hie_info']) or ((layer-1) in doc['hie_info'])] + val_labels = [list( set(doc['catgy']) & set(learning_categories)) for doc in validation_data if (layer in doc['hie_info']) or ((layer-1) in doc['hie_info'])] + tst_text = [[vocab[word] for word in doc['text'].split()] for doc in test_data] + tst_labels = [list( set(doc['catgy']) & set(learning_categories)) if layer in doc['hie_info'] else [] for doc in test_data] + + trn_padded = pad_sentences(trn_text, max_length=max_sen_len) + val_padded = pad_sentences(val_text, max_length=max_sen_len) + tst_padded = pad_sentences(tst_text, max_length=max_sen_len) + X_trn = build_input_sentence_data(trn_padded) + X_val = build_input_sentence_data(val_padded) + X_tst = build_input_sentence_data(tst_padded) + Y_trn = build_input_label_data(trn_labels,learning_categories) + Y_val = build_input_label_data(val_labels, learning_categories) + Y_tst = build_input_label_data(tst_labels, learning_categories) + + return X_trn, Y_trn, X_val, Y_val, X_tst, Y_tst + +def order_n(i): return {1:"1st", 2:"2nd", 3:"3rd"}.get(i) or "%dth"%i diff --git a/example.sh b/example.sh new file mode 100644 index 0000000..9fc9009 --- /dev/null +++ b/example.sh @@ -0,0 +1,22 @@ +#! 
bin/bash +DataDIR=./Sample_data +Train=${DataDIR}/train.txt +Test=${DataDIR}/test.txt +Valid=${DataDIR}/valid.txt + +## Embedding Weights Type (fastText .bin and .vec) +EmbeddingWeightsPath=./Word_embedding/ +## Network Type (XML-CNN, CNN-Flat, CNN-Hierarchy, CNN-fine-tuning or Pre-process) +ModelType=CNN-Flat +### the limit of the sequence +USE_WORDS=13 +### Tree file path +TreefilePath=./Tree/Amazon_all.tree + +mkdir -p CNN +mkdir -p CNN/PARAMS +mkdir -p CNN/LOG +mkdir -p CNN/RESULT +mkdir -p Word_embedding + +python train.py ${Train} ${Test} ${Valid} ${EmbeddingWeightsPath} ${ModelType} ${TreefilePath} ${USE_WORDS} \ No newline at end of file diff --git a/hft_cnn_env.yml b/hft_cnn_env.yml new file mode 100644 index 0000000..dd26d67 --- /dev/null +++ b/hft_cnn_env.yml @@ -0,0 +1,74 @@ +name: hft_cnn_env +channels: +- conda-forge +- defaults +dependencies: +- ca-certificates=2017.11.5=0 +- certifi=2017.11.5=py35_0 +- ncurses=5.9=10 +- openssl=1.0.2n=0 +- pip=9.0.1=py35_1 +- python=3.5.4=3 +- readline=7.0=0 +- setuptools=38.4.0=py35_0 +- sqlite=3.20.1=2 +- tk=8.6.7=0 +- wheel=0.30.0=py35_2 +- xz=5.2.3=0 +- zlib=1.2.11=0 +- anaconda=custom=py35h104c396_0 +- pip: + - alembic==0.9.7 + - apscheduler==3.5.1 + - boto==2.48.0 + - boto3==1.5.22 + - botocore==1.8.36 + - bz2file==0.98 + - chainer==4.0.0 + - chainerui==0.2.0 + - chakin==0.0.6 + - chardet==3.0.4 + - click==6.7 + - comet-ml==1.0.6 + - cupy==4.0.0 + - cycler==0.10.0 + - cysignals==1.6.7 + - cython==0.27.3 + - docutils==0.14 + - fastrlock==0.3 + - filelock==3.0.0 + - flask==0.12.2 + - future==0.16.0 + - gensim==3.2.0 + - idna==2.6 + - itsdangerous==0.24 + - jinja2==2.10 + - jmespath==0.9.3 + - mako==1.0.7 + - markupsafe==1.0 + - matplotlib==2.1.2 + - numpy==1.14.0 + - pandas==0.22.0 + - progressbar2==3.36.0 + - protobuf==3.5.1 + - pyparsing==2.2.0 + - python-dateutil==2.6.1 + - python-editor==1.0.3 + - python-utils==2.3.0 + - pytz==2017.3 + - requests==2.18.4 + - s3transfer==0.1.12 + - scikit-learn==0.19.1 + - scipy==1.0.0 + - six==1.11.0 + - sklearn==0.0 + - smart-open==1.5.6 + - sqlalchemy==1.2.2 + - tqdm==4.19.5 + - treetaggerwrapper==2.2.4 + - tzlocal==1.5.1 + - urllib3==1.22 + - websocket-client==0.47.0 + - werkzeug==0.12.2 + - wurlitzer==1.0.1 + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..21874cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,52 @@ +alembic==0.9.7 +APScheduler==3.5.1 +boto==2.48.0 +boto3==1.5.22 +botocore==1.8.36 +bz2file==0.98 +certifi==2017.11.5 +chainer==4.0.0 +chainerui==0.2.0 +chakin==0.0.6 +chardet==3.0.4 +click==6.7 +cupy==4.0.0 +cycler==0.10.0 +cysignals==1.6.7 +Cython==0.27.3 +docutils==0.14 +fastrlock==0.3 +filelock==3.0.0 +Flask==0.12.2 +future==0.16.0 +gensim==3.2.0 +idna==2.6 +itsdangerous==0.24 +Jinja2==2.10 +jmespath==0.9.3 +Mako==1.0.7 +MarkupSafe==1.0 +matplotlib==2.1.2 +numpy==1.14.0 +pandas==0.22.0 +progressbar2==3.36.0 +protobuf==3.5.1 +pyfasttext==0.4.4 +pyparsing==2.2.0 +python-dateutil==2.6.1 +python-editor==1.0.3 +python-utils==2.3.0 +pytz==2017.3 +requests==2.18.4 +s3transfer==0.1.12 +scikit-learn==0.19.1 +scipy==1.0.0 +six==1.11.0 +sklearn==0.0 +smart-open==1.5.6 +SQLAlchemy==1.2.2 +tqdm==4.19.5 +treetaggerwrapper==2.2.4 +tzlocal==1.5.1 +urllib3==1.22 +Werkzeug==0.12.2 diff --git a/train.py b/train.py new file mode 100644 index 0000000..2038539 --- /dev/null +++ b/train.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +import sys +from collections import defaultdict +import numpy as np +import data_helper +import cnn_train +import random +import 
scipy.sparse as sp +import tree +import os + +def train_problem(currentDepth, upperDepth, classNum, fineTuning, embeddingWeights, inputData, modelType, learning_categories): + params = {"gpu":0, + "outchannels":128, + "embedding-dimensions":300, + "epoch":40, + "batchsize":100, + "unit":1024, + "output-dimensions":int(classNum), + "fineTuning":int(fineTuning), + "currentDepth":currentDepth, + "upperDepth":upperDepth, + "embeddingWeights": embeddingWeights, + "inputData": inputData, + "model-type": modelType, + "learning_categories": learning_categories + } + if params["model-type"] == "XML-CNN": + params["unit"] = 512 # compact representation + if (params["model-type"] == "CNN-fine-tuning") and (currentDepth == "1st"): + params["fineTuning"] = 0 + + if (currentDepth == "1st") and ((params["model-type"] == "CNN-fine-tuning") or (params["model-type"] == "CNN-Hierarchy")): + network_output = cnn_train.load_top_level_weights(params) + else: + network_output = cnn_train.main(params) + + return network_output + +def make_labels_hie_info_dic(treePath): + label_hierarchical_info_dic = {} + with open(treePath, "r") as f: + for line in f: + line = line[:-1] + category = line.split("<")[-1] + level = len(line.split("<")) + if category not in label_hierarchical_info_dic: + label_hierarchical_info_dic[category] = level + return label_hierarchical_info_dic + +def make_labels_hie_list_dic(labels, label_hierarchical_info_dic): + layer_category_list_dic = {} + for i in range(1,max(label_hierarchical_info_dic.values())+1): + a_set = set([]) + layer_category_list_dic[i] = a_set + for label in labels: + layer_category_list_dic[int(label_hierarchical_info_dic[label])].add(label) + + return layer_category_list_dic + +def make_tree(treeFile_path): + Tree = tree.make() + with open(treeFile_path, mode="r") as f: + for line in f: + line = line[:-1] + line = line.split("\t")[0] + line = line.split("<") + tree.add(Tree, line) + return Tree + + + + +# Main processing +# ================================================================== +def main(): + random.seed(0) + np.random.seed(0) + + # Loading data + # ========================================================== + print ('-'*50) + print ('Loading data...') + train = sys.argv[1] + test = sys.argv[2] + validation = sys.argv[3] + embeddingWeights_path = sys.argv[4] + modelType = sys.argv[5] + treeFile_path = sys.argv[6] + useWords = int(sys.argv[7]) + + f_train = open(train, 'r') + train_lines = f_train.readlines() + f_test = open(test, 'r') + test_lines = f_test.readlines() + f_valid = open(validation, 'r') + valid_lines = f_valid.readlines() + f_train.close() + f_test.close() + f_valid.close() + + # Building Hierarchical information + # ========================================================= + category_hie_info_dic = make_labels_hie_info_dic(treeFile_path) + input_data_dic = data_helper.data_load(train_lines, valid_lines, test_lines, category_hie_info_dic, useWords) + category_hie_list_dic = make_labels_hie_list_dic(list(input_data_dic['catgy'].keys()), category_hie_info_dic) + # Loading Word embeddings + # ========================================================= + print ('-'*50) + print ("Loading Word embedings...") + embeddingWeights=data_helper.embedding_weights_load(input_data_dic['vocab'], embeddingWeights_path) + + # Conditions of each model + # ========================================================= + fineTuning = 0 + if modelType == "XML-CNN" or modelType == "CNN-Flat": + categorizationType="flat" + fineTuning = 0 + elif modelType == "CNN-Hierarchy": + 
categorizationType="hierarchy" + fineTuning = 0 + elif modelType == "CNN-fine-tuning": + categorizationType="hierarchy" + fineTuning = 1 + elif modelType == "Pre-process": + categorizationType = "pre-process" + fineTuning = 0 + else: + raise TypeError('Unknown model type: %s!' % (modelType)) + + + # Processing in case of pro-processing + # ======================================================== + if categorizationType == "pre-process": + print ('-'*50) + print ("Pre-process for hierarchical categorization...") + Tree = make_tree(treeFile_path) + layer = 1 + depth = data_helper.order_n(1) + upperDepth = None + learning_categories = sorted(category_hie_list_dic[layer]) + X_trn, Y_trn, X_val, Y_val, X_tst, Y_tst = data_helper.build_problem(learning_categories=learning_categories,depth=depth, input_data_dic=input_data_dic) + input_network_data = {"X_trn":X_trn, "Y_trn":Y_trn, "X_val":X_val, "Y_val":Y_val, "X_tst":X_tst, "Y_tst":Y_tst} + Y_pred = train_problem(currentDepth=depth, upperDepth=upperDepth, classNum=len(learning_categories), fineTuning=fineTuning, embeddingWeights=embeddingWeights, inputData=input_network_data, modelType=modelType, learning_categories=learning_categories) + print ("Please change model-type to CNN-Hierarchy of CNN-fine-tuning.") + + + # Processing in case of flat categorization + # ======================================================== + elif categorizationType == "flat": + print ('-'*50) + print ("Processing in case of flat categorization...") + from itertools import chain + learning_categories = sorted(input_data_dic['catgy'].keys()) ## this order is network's output order. + X_trn, Y_trn, X_val, Y_val, X_tst, Y_tst = data_helper.build_problem(learning_categories=learning_categories,depth="flat", input_data_dic=input_data_dic) + input_network_data = {"X_trn":X_trn, "Y_trn":Y_trn, "X_val":X_val, "Y_val":Y_val, "X_tst":X_tst, "Y_tst":Y_tst} + Y_pred = train_problem(currentDepth="flat", upperDepth=None, classNum=len(learning_categories), fineTuning=fineTuning, embeddingWeights=embeddingWeights, inputData=input_network_data, modelType=modelType, learning_categories=learning_categories) + GrandLabels, PredResult = data_helper.get_catgy_mapping(learning_categories, Y_tst, Y_pred, "flat") + data_helper.write_out_prediction(GrandLabels, PredResult, input_data_dic) + + # Processing in case of hierarchical categorization + # ======================================================== + elif categorizationType == "hierarchy": + if not os.path.exists("./CNN/PARAMS/parameters_for_multi_label_model_1st.npz"): + raise FileNotFoundError('Please change "ModelType=CNN-Hierarchy" or "ModelType=CNN-fine-tuning" to "ModelType=Pre-process" in example.sh.') + print ('-'*50) + print ("Processing in case of hierarchical categorization...") + upperDepth = None + Y_tst_concat = [[] for i in range(len(input_data_dic['test']))] + Y_pred_concat = [[] for i in range(len(input_data_dic['test']))] + all_categories = [] + Tree = make_tree(treeFile_path) + layers = list(category_hie_list_dic.keys()) + for layer in layers: + depth = data_helper.order_n(layer) + print ('-'*50) + print ('Learning and categorization processing of ' + depth + ' layer') + learning_categories = sorted(category_hie_list_dic[layer]) + X_trn, Y_trn, X_val, Y_val, X_tst, Y_tst = data_helper.build_problem(learning_categories=learning_categories,depth=depth, input_data_dic=input_data_dic) + input_network_data = {"X_trn":X_trn, "Y_trn":Y_trn, "X_val":X_val, "Y_val":Y_val, "X_tst":X_tst, "Y_tst":Y_tst} + Y_pred = 
train_problem(currentDepth=depth, upperDepth=upperDepth, classNum=len(learning_categories), fineTuning=fineTuning, embeddingWeights=embeddingWeights, inputData=input_network_data, modelType=modelType, learning_categories=learning_categories) + GrandLabels, PredResult = data_helper.get_catgy_mapping(learning_categories, Y_tst, Y_pred, depth) + upperDepth = depth + for i in range(len(input_data_dic['test'])): + Y_tst_concat[i].extend(GrandLabels[i]) + for i in range(len(input_data_dic['test'])): + for y in PredResult[i]: + if (tree.search_parent(Tree, y) in Y_pred_concat[i]) or (tree.search_parent(Tree, y) == 'root'): + Y_pred_concat[i].append(y) + + all_categories += learning_categories + + print ('-'*50) + print ('Final Result') + data_helper.write_out_prediction(Y_tst_concat, Y_pred_concat, input_data_dic) + +if __name__ == "__main__": + main() diff --git a/tree.py b/tree.py new file mode 100644 index 0000000..2aa8e2e --- /dev/null +++ b/tree.py @@ -0,0 +1,49 @@ +from collections import defaultdict +import copy + +def make(): return defaultdict(make) + +def dicts(t): return {k: dicts(t[k]) for k in t} + +def add(t, path): + for node in path: + t = t[node] + +def search_parent(tree,child,layer=1,prevParent='root'): + for k,v in list(tree.items()): + if(k == child): + return prevParent + else: + if len(v) >= 1: + layer += 1 + found = search_parent(v, child, layer,k) + if found: + return found + layer -=1 + else: + continue + +def search_child(tree,node,layer=1): + if (node == "root" or node =="ROOT" or node == "Root"): + return list(tree.keys()) + for k,v in list(tree.items()): + if(k == node): + return list(v.keys()) + else: + if len(v) >= 1: + layer += 1 + found = search_child(v, node, layer) + if found: + return found + layer -=1 + else: + continue + +def search_path(tree, node): + startNode = copy.deepcopy(node) + path = [startNode] + while (node != "root"): + node = search_parent(tree, node) + path.append(node) + + return path \ No newline at end of file diff --git a/xml_cnn_model.py b/xml_cnn_model.py new file mode 100644 index 0000000..a661304 --- /dev/null +++ b/xml_cnn_model.py @@ -0,0 +1,76 @@ +#! 
/usr/bin/env python + + +import chainer +import chainer.functions as F +import chainer.links as L +import numpy as np +import random + +class CNN(chainer.Chain): + + def __init__(self, **params): + self.in_channels = 1 + self.out_channels = params["out_channels"] + self.row_dim = params["row_dim"] + self.batch_size = params["batch_size"] if "batch_size" in params else 100 + self.hidden_dim = params["hidden_dim"] + self.n_classes = params["n_classes"] + self.mode = params["mode"] if "mode" in params else None + self.load_param_node_name = params["load_param_node_name"] if "load_param_node_name" in params else None + self.cudnn = params["cudnn"] if "cudnn" in params else 'never' + self.embeddingWeights = params["embeddingWeights"] + self.initializer = chainer.initializers.HeNormal() + + if self.mode == "scratch": + super(CNN, self).__init__() + set_seed_random(0) + with self.init_scope(): + self.lookup = L.EmbedID(in_size = self.embeddingWeights.shape[0], out_size = self.embeddingWeights.shape[1], initialW = self.embeddingWeights, ignore_label = -1) + self.conv1 = L.Convolution2D(self.in_channels,self.out_channels,(2, self.row_dim), stride=2,initialW=self.initializer) + self.conv2 = L.Convolution2D(self.in_channels,self.out_channels,(3, self.row_dim), stride=2,initialW=self.initializer) + self.conv3 = L.Convolution2D(self.in_channels,self.out_channels,(4, self.row_dim), stride=2,initialW=self.initializer) + self.l1=L.Linear(in_size = None, out_size = self.hidden_dim, initialW=self.initializer) + self.l2=L.Linear(in_size = self.hidden_dim, out_size = self.n_classes, initialW=self.initializer) + + elif self.mode == "test-predict": + parameters = np.load('./CNN/PARAMS/parameters_for_multi_label_model_' + self.load_param_node_name +'.npz') + super(CNN, self).__init__() + set_seed_random(0) + with self.init_scope(): + self.lookup = L.EmbedID(in_size = self.embeddingWeights.shape[0], out_size = self.embeddingWeights.shape[1], initialW = parameters['lookup/W'], ignore_label = -1) + self.conv1 = L.Convolution2D(self.in_channels,self.out_channels,(2, self.row_dim),stride=2,initialW=parameters['conv1/W'],initial_bias=parameters['conv1/b']) + self.conv2 = L.Convolution2D(self.in_channels,self.out_channels,(3, self.row_dim),stride=2,initialW=parameters['conv2/W'],initial_bias=parameters['conv2/b']) + self.conv3 = L.Convolution2D(self.in_channels,self.out_channels,(4, self.row_dim),stride=2,initialW=parameters['conv3/W'],initial_bias=parameters['conv3/b']) + self.l1=L.Linear(in_size = None, out_size = self.hidden_dim, initialW=parameters['l1/W'], initial_bias=parameters['l1/b']) + self.l2=L.Linear(self.hidden_dim, self.n_classes, initialW=parameters['l2/W'], initial_bias = parameters['l2/b']) + + + + def __call__(self, x): + with chainer.using_config('use_cudnn', self.cudnn): + with chainer.using_config('cudnn_deterministic', True): + h_non_static = F.dropout(self.lookup(x),0.25) + h_non_static = F.reshape(h_non_static, (h_non_static.shape[0], 1, h_non_static.shape[1], h_non_static.shape[2])) + + h1 = self.conv1(h_non_static) + h2 = self.conv2(h_non_static) + h3 = self.conv3(h_non_static) + + h1 = F.max_pooling_2d(F.relu(h1), (2,1), stride=1) + h2 = F.max_pooling_2d(F.relu(h2), (2,1), stride=1) + h3 = F.max_pooling_2d(F.relu(h3), (2,1), stride=1) + + h = F.concat((h1,h2,h3),axis=2) + h = F.dropout(F.relu(self.l1(h)), ratio=0.5) + + y = self.l2(h) + + return y + + +def set_seed_random(seed): + random.seed(seed) + np.random.seed(seed) + if chainer.cuda.available: + chainer.cuda.cupy.random.seed(seed)
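
The CNN model classes added in this patch (cnn_model.py and xml_cnn_model.py) are constructed from a keyword dictionary in cnn_train.py. The following sketch shows roughly how a scratch-mode XML-CNN instance is assembled there; the embedding matrix and the value of n_classes are illustrative placeholders, while the other numbers mirror the defaults set in train.py (out_channels=128, batch_size=100, and the 512-unit compact representation used for XML-CNN):

```python
import numpy as np
import xml_cnn_model

# Placeholder embedding matrix (vocab_size x 300); real runs pass fastText weights
# loaded by data_helper.embedding_weights_load().
embedding_matrix = np.random.uniform(-0.25, 0.25, (1000, 300)).astype(np.float32)

cnn_params = {
    "cudnn": "never",                      # cnn_train.py passes its USE_CUDNN setting here
    "out_channels": 128,
    "row_dim": embedding_matrix.shape[1],  # embedding dimension
    "batch_size": 100,
    "hidden_dim": 512,                     # XML-CNN's compact-representation size
    "n_classes": 10,                       # placeholder category count
    "embeddingWeights": embedding_matrix,
    "mode": "scratch",                     # "test-predict" would instead reload saved parameters
}
model = xml_cnn_model.CNN(**cnn_params)
```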