from nltk.tokenize import sent_tokenize
from nltk.corpus import reuters, gutenberg, brown, treebank
from gensim.models import KeyedVectors
from scipy import spatial
import numpy as np
import itertools
# Pre-trained fastText vectors (300-dim); loaded once at module import time.
w2v = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec')
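# Note (an optional tweak, not part of the original pipeline): reading all
# 1M vectors is slow and memory-hungry. gensim's `limit` keyword caps how
# many vectors are loaded if a smaller vocabulary suffices, e.g.:
#
#     w2v = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec',
#                                             limit=200000)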
def get_data(lstm_format=True):
    """
    Load the Brown, Reuters, and Gutenberg corpora, vectorize their
    sentences, and return the shuffled data alongside the raw sentences.
    """
    data = []
    all_text = []
    maxlen = 20 if lstm_format else 10
    minlen = 15 if lstm_format else 10
    for t in [brown.words(), reuters.words(), gutenberg.words()]:
        text = get_sentences(t)
        vect, sentences_text = vectorize_sentences(text, maxlen, minlen)
        all_text += sentences_text
        if lstm_format:
            data.append(vect)  # (N_i, maxlen, 300) blocks; stacked below
        else:
            for x in vect:
                # Flatten each (maxlen, 300) sentence into one long vector
                data.append(list(itertools.chain.from_iterable(x)))
    data = np.vstack(data) if lstm_format else np.array(data)
    # Shuffle with a single shared permutation so the vectorized data and
    # the raw sentence strings stay aligned
    p = np.random.permutation(len(data))
    return data[p], list(np.array(all_text)[p])
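# Example usage (a sketch; assumes the NLTK corpora above have been
# downloaded, e.g. via nltk.download('brown') and so on):
#
#     data, texts = get_data(lstm_format=True)
#     # data.shape -> (N, 20, 300); texts[i] is the sentence behind data[i]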
def get_sentences(text):
    """
    Clean the text and then use NLTK to split it into sentences.
    """
    s = ' '.join(text).lower()
    # Strip noise characters ('\xd5d' appears to be a mis-encoded curly
    # apostrophe plus 'd', e.g. from "he'd"). Periods are kept so that
    # sent_tokenize can still find sentence boundaries.
    for x in ['\xd5d', '\n', '\t', '"', '!', '#', '$', '%', '&', '(', ')',
              '*', '+', ',', '-', '/', ':', ';', '<', '=', '>', '?', '@',
              '[', ']', '^', '_', '`', '{', '|', '}', '~']:
        s = s.replace(x, '')
    return sent_tokenize(s)
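# Example (a rough sketch of the behavior):
#
#     get_sentences(['Hello', 'world.', 'How', 'are', 'you?'])
#     # -> ['hello world.', 'how are you']   ('?' and '!' are stripped, so
#     #    only '.' marks a sentence boundary for the tokenizer)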
def vectorize_sentences(sentences, maxlen=20, minlen=10):
    """
    Convert sentence array into word embedding representation.
    PARAMS
    ------
    sentences: array of strings; the sentences to be converted to word embedding representation.
    maxlen: int; word length of each sentence in the returned np.array. Longer sentences
        will be skipped, and shorter sentences will be padded with zero vectors to this length.
    minlen: int; minimum word length for sentences to be used in the embedding representation.
        Shorter sentences will be skipped.
    RETURNS
    -------
    3d numpy array of shape (N, maxlen, V), where N is the number of sentences kept and V is
    the length of the word embedding, plus the list of the N sentence strings that were kept.
    """
    vectorized = []
    text = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > maxlen or len(words) < minlen:
            continue
        concat_vector = []
        # Zero-pad the beginning so every sentence spans exactly maxlen vectors
        for _ in range(maxlen - len(words)):
            concat_vector.append(np.zeros((300,), dtype=np.float32))
        for word in words:
            try:
                concat_vector.append(w2v[word])
            except KeyError:
                # Out-of-vocabulary words become zero vectors
                concat_vector.append(np.zeros((300,), dtype=np.float32))
        text.append(sentence)
        vectorized.append(np.array(concat_vector))
    return np.array(vectorized), text
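# Example (a sketch): with maxlen=5 and minlen=2, a three-word sentence
# becomes a (5, 300) array: two leading rows of zero padding, followed by
# the three word vectors:
#
#     vecs, kept = vectorize_sentences(['the cat sat'], maxlen=5, minlen=2)
#     # vecs.shape -> (1, 5, 300); kept -> ['the cat sat']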
def sentence_embedding_to_text(sentence_vec):
    """
    Convert a sentence embedding back to a string by mapping each word
    vector to its nearest word in the Gensim word2vec vocabulary.
    """
    words = []
    for wordvec in sentence_vec:
        if np.all(wordvec == 0):
            continue  # Skip zero vectors (padding and out-of-vocabulary words)
        words.append(w2v.most_similar(positive=[wordvec], topn=1)[0][0])
    return ' '.join(words)
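# Example (a sketch): round-tripping an unperturbed sentence should recover
# its in-vocabulary words, since each word vector's nearest neighbor is the
# word itself:
#
#     vecs, kept = vectorize_sentences(['the cat sat'], maxlen=5, minlen=2)
#     sentence_embedding_to_text(vecs[0])  # -> 'the cat sat'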
def get_most_similar_encodings(sentence_vec, encoded_sentences, topn=1):
    """
    Get the topn most similar encoded sentences by cosine distance.
    """
    cos_dists = []
    for sent in encoded_sentences:
        cos_dists.append(spatial.distance.cosine(sentence_vec, sent))
    data_array = np.array(cos_dists)
    # Sort ascending by distance and drop index 0, which is the query
    # sentence itself when sentence_vec is drawn from encoded_sentences
    nearest_indices = data_array.argsort()[:topn + 1][1:]
    new_vecs = encoded_sentences[nearest_indices]
    return new_vecs, nearest_indices
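# Example (a sketch; assumes `encoded` is an (N, D) array of sentence
# encodings and the query is one of its rows, so the zero-distance
# self-match is the entry that gets dropped):
#
#     vecs, idxs = get_most_similar_encodings(encoded[7], encoded, topn=3)
#     # idxs -> indices of the 3 nearest neighbors of sentence 7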
def get_nearest_sentences(sent_idx, encoded_sentences, sentence_embeddings, topn=5):
"""
Get the text of the most similar sentences to the given sentence index based on the distance between VAE encodings.
"""
sentences = []
sentences.append(sentence_embedding_to_text(sentence_embeddings[sent_idx]))
_, neighbor_indices = get_most_similar_encodings(encoded_sentences[sent_idx], encoded_sentences, topn=topn)
for i in neighbor_indices:
sentences.append(sentence_embedding_to_text(sentence_embeddings[i]))
return sentences
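# Minimal end-to-end sketch. `encoder` is hypothetical here and stands in
# for whatever trained model (e.g. the VAE encoder this module supports)
# produces the sentence encodings; kept as comments so importing this
# module stays side-effect free:
#
#     data, texts = get_data(lstm_format=True)
#     encoded = encoder.predict(data)  # hypothetical trained encoder
#     for s in get_nearest_sentences(0, encoded, data, topn=5):
#         print(s)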