任务 6: LSTMs
# Hold out the first `valid_size` characters for validation and train on
# everything after them.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)

# Show the size and the first 64 characters of each split as a sanity check.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])
99999000 ons anarchists advocate social relations based upon voluntary as 1000 anarchism originated as a term of abuse first used against earl
batch_size = 64
num_unrollings = 10


class BatchGenerator(object):
    """Serve consecutive one-hot character batches from a text.

    The text is divided into `batch_size` equally long segments and one
    cursor walks through each of them, so row `b` of successive batches
    spells out a contiguous piece of the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Start every row at the beginning of its own segment.
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        # `np.float` was only an alias for the builtin `float` and has been
        # removed from NumPy (1.24+); the builtin yields the same dtype.
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            # Wrap around so the generator can be called indefinitely.
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Generate the next array of batches from the data.

        The array consists of the last batch of the previous array, followed
        by num_unrollings new ones, i.e. num_unrollings + 1 batches in total.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        # Remember the final batch: it becomes the first one of the next call.
        self._last_batch = batches[-1]
        return batches


def characters(probabilities):
    """Turn a 1-hot encoding or a probability distribution over the possible
    characters back into its (most likely) character representation."""
    return [id2char(c) for c in np.argmax(probabilities, 1)]


def batches2string(batches):
    """Convert a sequence of batches back into their (most likely) string
    representation, one string per batch row."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        # Append this step's character to the string of each row.
        s = [''.join(x) for x in zip(s, characters(b))]
    return s


train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))
['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad'] ['ists advoca', 'ary governm', 'hes nationa', 'd monasteri', 'raca prince', 'chard baer ', 'rgical lang', 'for passeng', 'the nationa', 'took place ', 'ther well k', 'seven six s', 'ith a gloss', 'robably bee', 'to recogniz', 'ceived the ', 'icant than ', 'ritic of th', 'ight in sig', 's uncaused ', ' lost as in', 'cellular ic', 'e size of t', ' him a stic', 'drugs confu', ' take to co', ' the priest', 'im to name ', 'd barred at', 'standard fo', ' such as es', 'ze on the g', 'e of the or', 'd hiver one', 'y eight mar', 'the lead ch', 'es classica', 'ce the non ', 'al analysis', 'mormons bel', 't or at lea', ' disagreed ', 'ing system ', 'btypes base', 'anguages th', 'r commissio', 'ess one nin', 'nux suse li', ' the first ', 'zi concentr', ' society ne', 'elatively s', 'etworks sha', 'or hirohito', 'litical ini', 'n most of t', 'iskerdoo ri', 'ic overview', 'air compone', 'om acnm acc', ' centerline', 'e than any ', 'devotional ', 'de such dev'] [' a'] ['an']
'ons anarchi', 'when milita', 'lleria arch',
'ists advoca', 'ary governm', 'hes nationa',
ists advoca ary governm hes nationa d monasteri raca prince chard baer rgical lang for passeng the nationa took place ...
一共64行,也就是64个训练实例。拿“the nationa”举个例子吧:
num_unrollings = 10
train_data = 'the nationa'

# Inputs are the first `num_unrollings` characters; labels are the same
# sequence shifted by one time step, so each label is the character that
# follows its input.
train_inputs, train_labels = train_data[:num_unrollings], train_data[1:]

print('train_inputs=', train_inputs)
print('train_labels=', train_labels)
train_inputs= the nation train_labels= he nationa
这下子应该明白输入输出分别是什么了吧:输出就是输入往后移动一个字符(对应序列的one-hot向量),也就是(x,y)=(t,h)、(h,e)、(e, )……。当然它们必须构成一个连续序列,不然LSTM试图记忆的根本就不是有意义的句子。如果LSTM模型能够根据已有的文本s预测下一个字符c,我们不断地执行s+=c,就可以生成一整段有意义的文本了。
# Build the training graph (LSTM unrolled over num_unrollings steps) and a
# batch-size-1 sampling/validation graph that shares the same parameters.
# NOTE(review): tf.concat(0, ...) and the positional (logits, labels) call
# below follow the pre-1.0 TensorFlow argument order - confirm the installed
# TensorFlow version before upgrading.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():

    # Parameters:
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    # Non-trainable: they carry the last output/state into the next run
    # instead of being optimised.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))

    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell.

        See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between
        the previous state and the gates.

        Args:
            i: input for the current step.
            o: output of the previous step.
            state: cell state of the previous step.

        Returns:
            A tuple (output, state) for the current step.
        """
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Input data.
    # One placeholder per time step, plus one extra so that the labels can be
    # the inputs shifted by one step.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]  # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    # The control dependency forces the final output/state to be written back
    # to the saved_* variables whenever the logits/loss are evaluated.
    with tf.control_dependencies([saved_output.assign(output),
                                  saved_state.assign(state)]):
        # Classifier.
        # All unrolled outputs are stacked along axis 0 and classified at once.
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits, tf.concat(0, train_labels)))

    # Optimizer.
    global_step = tf.Variable(0)
    # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps.
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Clip the gradients by their global norm (threshold 1.25) before applying.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # Note: `optimizer` is rebound here from the optimizer object to the
    # training op returned by apply_gradients.
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)

    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    # Running this op zeroes the sampling output/state.
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    # As in training, evaluating the prediction also stores the new
    # output/state so the next call continues the same sequence.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))
def lstm_cell(i, o, state):
    """Advance the LSTM by one time step.

    See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between
    the previous state and the gates.

    Args:
        i: input for the current step.
        o: output of the previous step.
        state: cell state of the previous step.

    Returns:
        A tuple (output, state) for the current step.
    """
    def pre_activation(input_weights, output_weights, bias):
        # Shared affine form used by every gate and by the cell update.
        return tf.matmul(i, input_weights) + tf.matmul(o, output_weights) + bias

    input_gate = tf.sigmoid(pre_activation(ix, im, ib))
    forget_gate = tf.sigmoid(pre_activation(fx, fm, fb))
    update = pre_activation(cx, cm, cb)
    # Keep part of the old state and add the gated candidate.
    new_state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(pre_activation(ox, om, ob))
    return output_gate * tf.tanh(new_state), new_state
# Unrolled LSTM loop. outputs = list() output = saved_output state = saved_state for i in train_inputs: output, state = lstm_cell(i, output, state) ... input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib) forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb) update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb state = forget_gate * state + input_gate * tf.tanh(update) output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob) return output_gate * tf.tanh(state), state
================================================================================ jam also with poles instendent duss where the came incomplemented lead appeam of abra been knows hoine from one nine nine nine four one one blany common majon an les manubely strugule lices for pandels clha prinal temporaty the portand on b d ing doog indersevents procuctions to energos never big or copparter now to bill odners and s atmency of the english towiall were the prograge is annotles manona ================================================================================
题目 1
# Fuse the per-gate parameters along axis 1, always in the order
# input gate, forget gate, cell update, output gate.
sx, sm, sb = (
    tf.concat(1, per_gate)
    for per_gate in (
        [ix, fx, cx, ox],  # weights applied to the current input
        [im, fm, cm, om],  # weights applied to the previous output
        [ib, fb, cb, ob],  # biases
    )
)
# Definition of the cell computation.
def lstm_cell(i, o, state):
    """Advance the LSTM by one time step using the fused parameters.

    See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between
    the previous state and the gates.

    Args:
        i: input for the current step.
        o: output of the previous step.
        state: cell state of the previous step.

    Returns:
        A tuple (output, state) for the current step.
    """
    # One affine transform produces all four pre-activations at once.
    fused = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
    # Split into four equal parts along axis 1, in the order the parameters
    # were concatenated: input gate, forget gate, cell update, output gate.
    input_pre, forget_pre, update_pre, output_pre = tf.split(1, 4, fused)
    input_gate, forget_gate, output_gate = (
        tf.sigmoid(pre) for pre in (input_pre, forget_pre, output_pre)
    )
    new_state = forget_gate * state + input_gate * tf.tanh(update_pre)
    new_output = output_gate * tf.tanh(new_state)
    return new_output, new_state
================================================================================ hiam result of and colle orgation this socroishn shore the world clasic to godwy on boons uss involund wectre scure o canforles was the on artic world far dows i ation notyarih the conjund jathers penime of the temperstory relege to not the t chardre in the dural forwershadiath didyation d isa astrace by one eight two eig phility fich and dr and humentingmentiindly versmos event wave basether that the ================================================================================ Validation set perplexity: 4.28
bigram_vocabulary_size = vocabulary_size * vocabulary_size


class BigramBatchGenerator(object):
    """Serve consecutive batches of bigram ids from a text.

    The text is read as non-overlapping character pairs. Each pair is
    encoded as `first_id * vocabulary_size + second_id`, and positions and
    sizes are counted in bigrams rather than characters.
    """

    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size_in_chars = len(text)
        # Number of whole bigrams; a trailing odd character is not counted.
        self._text_size = self._text_size_in_chars // 2
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Start every row at the beginning of its own segment.
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Return one batch: a 1-D integer array with one bigram id per row."""
        # `np.int` was only an alias for the builtin `int` and has been
        # removed from NumPy (1.24+); the builtin yields the same dtype.
        batch = np.zeros(shape=self._batch_size, dtype=int)
        for b in range(self._batch_size):
            char_idx = self._cursor[b] * 2
            ch1 = char2id(self._text[char_idx])
            if self._text_size_in_chars - 1 == char_idx:
                # No second character available: pad with id 0.
                ch2 = 0
            else:
                ch2 = char2id(self._text[char_idx + 1])
            batch[b] = ch1 * vocabulary_size + ch2
            # Wrap around so the generator can be called indefinitely.
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Return the last batch of the previous call followed by
        num_unrollings new batches."""
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches


def bi2str(encoding):
    """Decode a single bigram id back into its two-character string."""
    return id2char(encoding // vocabulary_size) + id2char(encoding % vocabulary_size)


def bigrams(encodings):
    """Decode a sequence of bigram ids into a list of two-character strings."""
    return [bi2str(e) for e in encodings]


def bibatches2string(batches):
    """Convert a sequence of bigram batches into one string per batch row."""
    s = [''] * batches[0].shape[0]
    for b in batches:
        # Append this step's bigram to the string of each row.
        s = [''.join(x) for x in zip(s, bigrams(b))]
    return s


# Identity matrix: row k is the one-hot vector of bigram id k.
bi_onehot = np.zeros((bigram_vocabulary_size, bigram_vocabulary_size))
np.fill_diagonal(bi_onehot, 1)


def bigramonehot(encodings):
    """Return the one-hot vector for every bigram id in `encodings`."""
    return [bi_onehot[e] for e in encodings]


train_batches = BigramBatchGenerator(train_text, 8, 8)
valid_batches = BigramBatchGenerator(valid_text, 1, 1)

print(bibatches2string(train_batches.next()))
print(bibatches2string(train_batches.next()))
print(bibatches2string(valid_batches.next()))
print(bibatches2string(valid_batches.next()))
['ons anarchists adv', 'on from the nation', 'significant than i', 'ain drugs confusio', 'ate of the origina', 't or at least not ', 'he first daily col', 'rdoo ricky ricardo'] ['dvocate social rel', 'onal media and fro', ' in jersey and gue', 'ion inability to o', 'nal document fax m', 't parliament s opp', 'ollege newspaper i', 'do this classic in'] [' ana'] ['narc']
目的是用embedding_size = 128维的稠密向量来替代27*27=729维的one-hot向量。理论上只要将这27*27个bigram映射为互不相同的向量,就可以完成训练,所以这份代码直接用了随机初始化的embedding。
# Embedding table for all possible bigrams: one row of length embedding_size
# per bigram id, randomly initialised between -1.0 and 1.0.
embedding_shape = [bigram_vocabulary_size, embedding_size]
embeddings = tf.Variable(tf.random_uniform(embedding_shape, -1.0, 1.0))
# Look up the embedding rows of the input bigram ids
# -> [batch_size, embedding_size], then advance the LSTM by one step.
embedded_input = tf.nn.embedding_lookup(embeddings, i)
output, state = lstm_cell(embedded_input, output, state)
# Labels are the inputs shifted by one step: for every step after the first,
# gather the rows of bigram_one_hot selected by that step's bigram ids.
# NOTE(review): bigram_one_hot is presumably an identity matrix over the
# bigram vocabulary - confirm where it is defined.
train_labels.extend(
    tf.gather(bigram_one_hot, step_ids) for step_ids in train_data[1:]
)
def lstm_cell(i, o, state):
    """Advance the LSTM by one time step, with dropout on input and output.

    All four pre-activations come from a single fused affine transform and
    are then sliced out column-wise. Dropout is applied to the input and to
    the returned output only, never to the recurrent cell state.

    Args:
        i: input for the current step.
        o: output of the previous step.
        state: cell state of the previous step.

    Returns:
        A tuple (output, state) for the current step.
    """
    # assumes x, m and biases each hold 4 * num_nodes columns laid out as
    # [input gate | forget gate | cell update | output gate] - TODO confirm
    i = tf.nn.dropout(i, keep_prob)
    mult = tf.matmul(i, x) + tf.matmul(o, m) + biases
    input_gate = tf.sigmoid(mult[:, :num_nodes])
    forget_gate = tf.sigmoid(mult[:, num_nodes:num_nodes * 2])
    # Fix: the cell update must read the third block. It previously used
    # num_nodes * 3:num_nodes * 4, the same columns as the output gate, so
    # the third block of parameters was never used.
    update = mult[:, num_nodes * 2:num_nodes * 3]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(mult[:, num_nodes * 3:])
    output = tf.nn.dropout(output_gate * tf.tanh(state), keep_prob)
    return output, state
================================================================================ geller depaqand as characturn essaot emonist the signatually only eight eight two zero zero one zero ebra colling with of specified for the however their of boun collecument win by two zero zero zero p kxeone file following from especists tv elements in the souther song perndament to the external musi ykm u kespecame the used from english city of the sullent aiklintial pddediberial mothers dorators a by of is a spate areal depindition religions with has are sproducules well two two industrems in exa ================================================================================ Validation set perplexity: 17.15
题目 3
实现sequence-to-sequence LSTM,让其学会如下序列转换规律:
For example, if your input is: "the quick brown fox", the model should attempt to output: "eht kciuq nworb xof".
# Settings for the sequence-to-sequence model. Source and target share the
# same character vocabulary, and a single (20, 20) bucket is used.
model_config = dict(
    source_vocab_size=vocabulary_size,
    target_vocab_size=vocabulary_size,
    buckets=[(20, 20)],
    size=256,
    num_layers=4,
    max_gradient_norm=5.0,
    batch_size=batch_size,
    learning_rate=1.0,
    learning_rate_decay_factor=0.9,
    use_lstm=True,
    forward_only=forward_only,
)
model = seq2seq_model.Seq2SeqModel(**model_config)
def rev_id(forward):
    """Return the character ids of `forward` with every word reversed.

    Words are the pieces obtained by splitting on a single space; their order
    and the spaces between them are preserved, only the letters inside each
    word are mirrored (e.g. 'the quick' -> 'eht kciuq').

    Args:
        forward: the source string.

    Returns:
        A list with one character id per character of the mirrored string.
    """
    mirrored = ' '.join(word[::-1] for word in forward.split(' '))
    return [char2id(ch) for ch in mirrored]


batches = train_batches.next()
train_sets = []
# Encoder inputs: the ids of each string as-is.
batch_encs = [[char2id(ch) for ch in sentence] for sentence in batches]
# Decoder targets: the ids of the same string with every word reversed.
batch_decs = [rev_id(sentence) for sentence in batches]
print('x=', ''.join([id2char(x) for x in batch_encs[0]]))
print('y=', ''.join([id2char(x) for x in batch_decs[0]]))
x= he diggers of the e y= eh sreggid fo eht e
训练耗时较长,完整代码在https://github.com/hankcs/udacity-deep-learning 。