TensorFlow练习7: 基于RNN生成古诗词

import collections

import numpy as np
import tensorflow as tf

# ------------------------------ Data preprocessing ------------------------------ #

poetry_file = 'poetry.txt'

# A poem whose text contains any of these characters is skipped.
# NOTE(review): the original condition tested '(' twice; the second test was
# possibly a full-width parenthesis lost in transcription. Confirm against the
# corpus before adding it, because changing the filter changes the vocabulary
# and therefore invalidates existing checkpoints.
_SKIP_MARKERS = ('_', '(', '《', '[')

# Poem bodies, each wrapped in '[' ... ']'.
poetrys = []
with open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            # A usable line splits into exactly two parts on ':'.
            title, content = line.strip().split(':')
        except ValueError:
            # Zero or several ':' separators: skip the line. Only this
            # unpacking error is expected, so nothing broader is swallowed.
            continue
        content = content.replace(' ', '')
        if any(marker in content for marker in _SKIP_MARKERS):
            continue
        if len(content) < 5 or len(content) > 79:
            continue
        poetrys.append('[' + content + ']')

# Sort the poems by length, shortest first.
poetrys = sorted(poetrys, key=len)
print('唐诗总数: ', len(poetrys))

if not poetrys:
    # Without this guard the zip(*...) unpacking below fails with an
    # unhelpful "not enough values to unpack" message.
    raise ValueError('no usable poems found in %r' % poetry_file)

# Count how often every character occurs.
all_words = [word for poetry in poetrys for word in poetry]
counter = collections.Counter(all_words)
# Most frequent first; ties keep first-seen order (the sort is stable).
count_pairs = sorted(counter.items(), key=lambda pair: -pair[1])
words, _ = zip(*count_pairs)

# Keep every character (no frequency cut-off) and append ' ' as an extra symbol.
words = words + (' ',)
# Map each character to an integer id, i.e. its position in `words`.
word_num_map = {word: index for index, word in enumerate(words)}


def to_num(word):
    """Return the id of *word*; characters not in the map get ``len(words)``."""
    return word_num_map.get(word, len(words))


# Every poem as a list of character ids, for example:
# [[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#  [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, ...],
#  ...]
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
# Train on 64 poems per batch.
batch_size = 64
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    # Every row is filled up to the longest poem of the batch with the id of ' '.
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    # Targets are the inputs shifted left by one step; the last column is
    # left as a copy of the input's last column.
    #   xdata             ydata
    #   [6,2,4,6,9]       [2,4,6,9,9]
    #   [1,4,2,8,5]       [4,2,8,5,5]
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    x_batches.append(xdata)
    y_batches.append(ydata)


# --------------------------------------- RNN --------------------------------------- #

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])


def _build_cell(model, rnn_size):
    """Create one recurrent cell of the requested kind."""
    if model == 'lstm':
        return tf.nn.rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
    # Only the LSTM cell takes `state_is_tuple`; passing it to the other two
    # constructors raises TypeError.
    if model == 'gru':
        return tf.nn.rnn_cell.GRUCell(rnn_size)
    if model == 'rnn':
        return tf.nn.rnn_cell.BasicRNNCell(rnn_size)
    raise ValueError("model must be 'rnn', 'gru' or 'lstm', got %r" % (model,))


def neural_network(model='lstm', rnn_size=128, num_layers=2):
    """Build the character-level RNN on top of the module-level placeholders.

    Args:
        model: cell type, one of 'rnn', 'gru' or 'lstm'.
        rnn_size: number of units per cell; also the embedding width.
        num_layers: number of stacked cells.

    Returns:
        A tuple ``(logits, last_state, probs, cell, initial_state)``. ``logits``
        and ``probs`` have ``len(words) + 1`` columns and one row per input
        position (the RNN outputs are reshaped to ``[-1, rnn_size]``).

    Raises:
        ValueError: if ``model`` is not a supported cell type.
    """
    # One cell object per layer: repeating a single instance ([cell] * n) is
    # rejected by newer TensorFlow releases.
    layers = [_build_cell(model, rnn_size) for _ in range(num_layers)]
    cell = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words) + 1])
        softmax_b = tf.get_variable("softmax_b", [len(words) + 1])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [len(words) + 1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(
        cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state


def train_neural_network():
    """Train the network for 50 epochs and save checkpoints as 'poetry.module-<epoch>'.

    The learning rate is ``0.002 * 0.97 ** epoch``. A checkpoint is written
    whenever ``epoch % 7 == 0``. Progress is printed after every batch.
    """
    logits, last_state, _, _, _ = neural_network()
    targets = tf.reshape(output_targets, [-1])

    # NOTE(review): newer 1.x releases expose this function as
    # tf.contrib.legacy_seq2seq; confirm for the installed version.
    seq2seq = getattr(tf.nn, 'seq2seq', None)
    if seq2seq is None:
        seq2seq = tf.contrib.legacy_seq2seq
    # Every position, padding included, has weight 1.
    # NOTE(review): the original passed len(words) as a 4th positional
    # argument. That slot is believed to be `average_across_timesteps`, where
    # a non-zero int just means True (the default), so it is omitted here.
    loss = seq2seq.sequence_loss_by_example(
        [logits], [targets], [tf.ones_like(targets, dtype=tf.float32)])
    cost = tf.reduce_mean(loss)

    learning_rate = tf.Variable(0.0, trainable=False)
    # Build the assignment once; creating tf.assign inside the epoch loop
    # would add a new op to the graph every epoch.
    new_learning_rate = tf.placeholder(tf.float32, [])
    update_learning_rate = tf.assign(learning_rate, new_learning_rate)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(list(zip(grads, tvars)))

    # tf.initialize_all_variables() is deprecated; prefer the replacement
    # when the installed TensorFlow provides it.
    make_init_op = getattr(tf, 'global_variables_initializer', None)
    if make_init_op is None:
        make_init_op = tf.initialize_all_variables

    with tf.Session() as sess:
        sess.run(make_init_op())

        # Created after the optimizer; with no argument the saver uses its
        # default variable list.
        saver = tf.train.Saver()

        for epoch in range(50):
            sess.run(update_learning_rate,
                     feed_dict={new_learning_rate: 0.002 * (0.97 ** epoch)})
            for batche, (xdata, ydata) in enumerate(zip(x_batches, y_batches)):
                train_loss, _, _ = sess.run(
                    [cost, last_state, train_op],
                    feed_dict={input_data: xdata, output_targets: ydata})
                print(epoch, batche, train_loss)
            if epoch % 7 == 0:
                saver.save(sess, 'poetry.module', global_step=epoch)


train_neural_network()


import collections

import numpy as np
import tensorflow as tf

# ------------------------------ Data preprocessing ------------------------------ #
# The generation script repeats the training preprocessing so that the
# character ids it computes are the same ones the checkpoint was trained with.

poetry_file = 'poetry.txt'

# A poem whose text contains any of these characters is skipped.
# NOTE(review): the original condition tested '(' twice; the second test was
# possibly a full-width parenthesis lost in transcription. Confirm against the
# corpus before adding it, because changing the filter changes the vocabulary
# and therefore invalidates existing checkpoints.
_SKIP_MARKERS = ('_', '(', '《', '[')

# Poem bodies, each wrapped in '[' ... ']'.
poetrys = []
with open(poetry_file, "r", encoding='utf-8') as f:
    for line in f:
        try:
            # A usable line splits into exactly two parts on ':'.
            title, content = line.strip().split(':')
        except ValueError:
            # Zero or several ':' separators: skip the line. Only this
            # unpacking error is expected, so nothing broader is swallowed.
            continue
        content = content.replace(' ', '')
        if any(marker in content for marker in _SKIP_MARKERS):
            continue
        if len(content) < 5 or len(content) > 79:
            continue
        poetrys.append('[' + content + ']')

# Sort the poems by length, shortest first.
poetrys = sorted(poetrys, key=len)
print('唐诗总数: ', len(poetrys))

if not poetrys:
    # Without this guard the zip(*...) unpacking below fails with an
    # unhelpful "not enough values to unpack" message.
    raise ValueError('no usable poems found in %r' % poetry_file)

# Count how often every character occurs.
all_words = [word for poetry in poetrys for word in poetry]
counter = collections.Counter(all_words)
# Most frequent first; ties keep first-seen order (the sort is stable).
count_pairs = sorted(counter.items(), key=lambda pair: -pair[1])
words, _ = zip(*count_pairs)

# Keep every character (no frequency cut-off) and append ' ' as an extra symbol.
words = words + (' ',)
# Map each character to an integer id, i.e. its position in `words`.
word_num_map = {word: index for index, word in enumerate(words)}


def to_num(word):
    """Return the id of *word*; characters not in the map get ``len(words)``."""
    return word_num_map.get(word, len(words))


# Every poem as a list of character ids, for example:
# [[314, 3199, 367, 1556, 26, 179, 680, 0, 3199, 41, 506, 40, 151, 4, 98, 1],
#  [339, 3, 133, 31, 302, 653, 512, 0, 37, 148, 294, 25, 54, 833, 3, 1, ...],
#  ...]
poetrys_vector = [list(map(to_num, poetry)) for poetry in poetrys]
# Generation feeds one sequence at a time.
batch_size = 1
n_chunk = len(poetrys_vector) // batch_size
x_batches = []
y_batches = []
for i in range(n_chunk):
    start_index = i * batch_size
    end_index = start_index + batch_size

    batches = poetrys_vector[start_index:end_index]
    length = max(map(len, batches))
    # Every row is filled up to the longest poem of the batch with the id of ' '.
    xdata = np.full((batch_size, length), word_num_map[' '], np.int32)
    for row in range(batch_size):
        xdata[row, :len(batches[row])] = batches[row]
    # Targets are the inputs shifted left by one step; the last column is
    # left as a copy of the input's last column.
    #   xdata             ydata
    #   [6,2,4,6,9]       [2,4,6,9,9]
    #   [1,4,2,8,5]       [4,2,8,5,5]
    ydata = np.copy(xdata)
    ydata[:, :-1] = xdata[:, 1:]
    x_batches.append(xdata)
    y_batches.append(ydata)


# --------------------------------------- RNN --------------------------------------- #

input_data = tf.placeholder(tf.int32, [batch_size, None])
output_targets = tf.placeholder(tf.int32, [batch_size, None])


def _build_cell(model, rnn_size):
    """Create one recurrent cell of the requested kind."""
    if model == 'lstm':
        return tf.nn.rnn_cell.BasicLSTMCell(rnn_size, state_is_tuple=True)
    # Only the LSTM cell takes `state_is_tuple`; passing it to the other two
    # constructors raises TypeError.
    if model == 'gru':
        return tf.nn.rnn_cell.GRUCell(rnn_size)
    if model == 'rnn':
        return tf.nn.rnn_cell.BasicRNNCell(rnn_size)
    raise ValueError("model must be 'rnn', 'gru' or 'lstm', got %r" % (model,))


def neural_network(model='lstm', rnn_size=128, num_layers=2):
    """Build the character-level RNN on top of the module-level placeholders.

    Args:
        model: cell type, one of 'rnn', 'gru' or 'lstm'.
        rnn_size: number of units per cell; also the embedding width.
        num_layers: number of stacked cells.

    Returns:
        A tuple ``(logits, last_state, probs, cell, initial_state)``. ``logits``
        and ``probs`` have ``len(words) + 1`` columns and one row per input
        position (the RNN outputs are reshaped to ``[-1, rnn_size]``).

    Raises:
        ValueError: if ``model`` is not a supported cell type.
    """
    # One cell object per layer: repeating a single instance ([cell] * n) is
    # rejected by newer TensorFlow releases.
    layers = [_build_cell(model, rnn_size) for _ in range(num_layers)]
    cell = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, len(words) + 1])
        softmax_b = tf.get_variable("softmax_b", [len(words) + 1])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [len(words) + 1, rnn_size])
            inputs = tf.nn.embedding_lookup(embedding, input_data)

    outputs, last_state = tf.nn.dynamic_rnn(
        cell, inputs, initial_state=initial_state, scope='rnnlm')
    output = tf.reshape(outputs, [-1, rnn_size])

    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)
    return logits, last_state, probs, cell, initial_state


# ---------------------------------- Poem generation ---------------------------------- #
# Uses a model that has already been trained.

def gen_poetry(checkpoint='poetry.module-49'):
    """Sample one poem from a trained model.

    Generation starts from the '[' marker and stops as soon as ']' is sampled.

    Args:
        checkpoint: checkpoint path handed to ``Saver.restore``.

    Returns:
        The generated text, without the '[' and ']' markers.
    """
    def to_word(weights):
        """Draw one character at random, proportionally to *weights*."""
        # The softmax has len(words) + 1 outputs, but only the first
        # len(words) correspond to an entry of `words`; sampling over all of
        # them could index past the end of the tuple.
        cumulative = np.cumsum(np.ravel(weights)[:len(words)])
        threshold = np.random.rand() * cumulative[-1]
        sample = int(np.searchsorted(cumulative, threshold))
        # Rounding can make searchsorted return len(cumulative).
        return words[min(sample, len(words) - 1)]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        # No separate initialization: restore assigns every variable the
        # saver covers.
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        state_ = sess.run(cell.zero_state(1, tf.float32))

        x = np.array([[word_num_map['[']]], dtype=np.int32)
        probs_, state_ = sess.run(
            [probs, last_state],
            feed_dict={input_data: x, initial_state: state_})
        word = to_word(probs_)
        # Greedy alternative: word = words[np.argmax(probs_)]
        pieces = []
        while word != ']':
            pieces.append(word)
            x = np.array([[word_num_map[word]]], dtype=np.int32)
            probs_, state_ = sess.run(
                [probs, last_state],
                feed_dict={input_data: x, initial_state: state_})
            word = to_word(probs_)
        return ''.join(pieces)


print(gen_poetry())


新犬随风起,一璃迹阵悲。
浅昏罢庄哉,清插去园空。
双叶坐成鉴,王妓水正苑。
鸟声不成影,胙滩朱瓮声。
无斑红芜踏,那期日正闲。
吾燕登无士,无处得赵名。

并灭图微蒿,淮头水十荔。
晴花尚乘望,官宽留可求。
最忆青州守,英仍临阳峰。
生人隔天道,在国思山田。
登临闭石土,阵下一欢娱。
林暝今又少,孙频唯在愁。


def gen_poetry_with_head(head, checkpoint='poetry.module-49'):
    """Generate an acrostic poem whose lines start with the characters of *head*.

    For every character of ``head`` the model keeps sampling characters until
    it produces a line-ending punctuation mark; the line is then closed with
    ',' (even lines, counting from 0) or '。' (odd lines). The recurrent state
    is carried over from one line to the next.

    Args:
        head: string whose characters become the first character of each line.
        checkpoint: checkpoint path handed to ``Saver.restore``.

    Returns:
        The generated poem as a single string.
    """
    # Sampling one of these ends the current line.
    # NOTE(review): the original compared against ',' and '。' only; the
    # full-width comma is accepted too in case the corpus uses it. Confirm.
    line_enders = (',', '，', '。')

    def to_word(weights):
        """Draw one character at random, proportionally to *weights*."""
        # The softmax has len(words) + 1 outputs, but only the first
        # len(words) correspond to an entry of `words`; sampling over all of
        # them could index past the end of the tuple.
        cumulative = np.cumsum(np.ravel(weights)[:len(words)])
        threshold = np.random.rand() * cumulative[-1]
        sample = int(np.searchsorted(cumulative, threshold))
        # Rounding can make searchsorted return len(cumulative).
        return words[min(sample, len(words) - 1)]

    _, last_state, probs, cell, initial_state = neural_network()

    with tf.Session() as sess:
        # No separate initialization: restore assigns every variable the
        # saver covers.
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint)

        state_ = sess.run(cell.zero_state(1, tf.float32))
        pieces = []
        for i, word in enumerate(head):
            while word not in line_enders:
                pieces.append(word)
                # A head character missing from the vocabulary gets the
                # spare id len(words) instead of None, which cannot be fed.
                word_id = word_num_map.get(word, len(words))
                x = np.array([[word_id]], dtype=np.int32)
                probs_, state_ = sess.run(
                    [probs, last_state],
                    feed_dict={input_data: x, initial_state: state_})
                word = to_word(probs_)
                # The original called time.sleep(1) here, but `time` is never
                # imported and the pause has no effect on the result.
            pieces.append(',' if i % 2 == 0 else '。')
        return ''.join(pieces)


print(gen_poetry_with_head('一二三四'))



# Restore from the most recent checkpoint in the current directory instead of
# a hard-coded checkpoint name.
module_file = tf.train.latest_checkpoint('.')
if module_file is None:
    # latest_checkpoint returns None when it finds no checkpoint; fail with a
    # clear message instead of handing None to restore().
    raise FileNotFoundError("no checkpoint found in '.'")
# print(module_file)
saver.restore(sess, module_file)

注意:tf.initialize_all_variables() 已被弃用(deprecated),请使用 tf.global_variables_initializer() 替代。

