Here we add an attention mechanism on top of a GRU/LSTM network. Let's walk through the concrete implementation:
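Before diving into the code, this is the computation the attention layer below performs, written as a minimal NumPy sketch (illustration only, not part of the project; the shapes follow the config that comes later):

# NumPy sketch of the attention step: given Bi-RNN outputs H of shape
# [T, 2*rnn_size], a learned projection (W, b) and a context vector u score
# every time step, and the sequence is collapsed into one weighted-sum vector.
import numpy as np

T, hidden_size, attention_dim = 300, 2 * 128, 100     # seq_len, 2*rnn_size, attention_dim
H = np.random.randn(T, hidden_size)                   # stand-in for the Bi-RNN outputs
W = np.random.randn(hidden_size, attention_dim) * 0.1
b = np.zeros(attention_dim)
u = np.random.randn(attention_dim) * 0.1

v = np.tanh(H @ W + b)                                # [T, attention_dim]
scores = v @ u                                        # one score per time step
alphas = np.exp(scores) / np.exp(scores).sum()        # softmax over time
output = (H * alphas[:, None]).sum(axis=0)            # attention-pooled vector
print(output.shape)                                   # (256,)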
The utility functions (data_utils.py):
#!/usr/bin/python
# -*- coding: utf-8 -*-
from collections import Counter
import tensorflow.contrib.keras as kr
import numpy as np
import os
import codecs
import tensorflow as tf


def _read_file(filename):
    """Read the data file: one `label<TAB>content` pair per line."""
    contents = []
    labels = []
    # Use the codecs module: the Python 2.x open() does not support utf-8
    # encoding, so this makes the code more robust.
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            try:
                label, content = line.strip().split('\t')
                contents.append(content.strip().split(" "))
                labels.append(label)
            except Exception:
                pass
    return contents, labels


def _read_vocab(filename):
    """Read the vocabulary list."""
    words = list(map(lambda line: line.strip(),
                     codecs.open(filename, 'r', encoding='utf-8').readlines()))
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_vocab_predict(filename):
    """Read the vocabulary (prediction-time variant that only returns the mapping)."""
    words = list(map(lambda line: line.strip(),
                     codecs.open(filename, 'r', encoding='utf-8').readlines()))
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id


def _read_category():
    """Return the target categories and their id mapping."""
    categories = ["0", "1"]
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def to_words(content, words):
    """Convert id-encoded content back to text."""
    return ''.join(words[x] for x in content)


def _file_to_ids(filename, word_to_id, max_len=300):
    """Convert a file to its id representation."""
    _, cat_to_id = _read_category()
    contents, labels = _read_file(filename)

    data_id = []
    label_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # Use keras' pad_sequences to pad the texts to a fixed length.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_len)
    y_pad = kr.utils.to_categorical(label_id)
    return x_pad, y_pad


def preocess_file(data_path, vocapath, seq_length=300):
    """Return all the data at once."""
    words, word_to_id = _read_vocab(vocapath)
    x_train, y_train = _file_to_ids(data_path, word_to_id, seq_length)
    # x_test, y_test = _file_to_ids(os.path.join(data_path, 'cnews.test.txt'), word_to_id, seq_length)
    # x_val, y_val = _file_to_ids(os.path.join(data_path, 'cnews.val.txt'), word_to_id, seq_length)
    return x_train, y_train, words


# def preocess_file_test(data_path="/Users/shuubiasahi/Desktop/rnn.txt", seq_length=300):
#     """Return all the data at once (debug variant)."""
#     words, word_to_id = _read_vocab("vocab_cnews.txt")
#     print("words length is:", len(words))
#     print("word_to_id length is:", len(word_to_id))
#     x_train, y_train = _file_to_ids_test(data_path, word_to_id, seq_length)
#     # x_test, y_test = _file_to_ids(os.path.join(data_path, 'cnews.test.txt'), word_to_id, seq_length)
#     # x_val, y_val = _file_to_ids(os.path.join(data_path, 'cnews.val.txt'), word_to_id, seq_length)
#     return x_train, y_train, words


# def _file_to_ids_test(filename, word_to_id, max_len=300):
#     """Convert a file to its id representation (debug variant with prints)."""
#     _, cat_to_id = _read_category()
#     contents, labels = _read_file(filename)
#
#     data_id = []
#     label_id = []
#     for i in range(len(contents)):
#         data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
#         label_id.append(cat_to_id[labels[i]])
#     # Use keras' pad_sequences to pad the texts to a fixed length.
#
#     print("contents is:", contents)
#     print("data id is:", data_id)
#     print("labels is:", labels)
#     print("label id is:", label_id)
#     x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_len)
#     y_pad = kr.utils.to_categorical(label_id)
#     print("xpad is:", x_pad)
#     print("ypad is:", y_pad)
#     return x_pad, y_pad


def file_to_ids_single(content, word_to_id, maxlen=300):
    """Convert a single piece of text to its padded id representation (for prediction)."""
    contents = []
    contents.append(list(content.lower()))
    data_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
    # print("data_id is:", data_id)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen)
    return x_pad
def batch_iter(data, batch_size=64, num_epochs=5):
    """Generate batches of data."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]


if __name__ == '__main__':
    """data_id is: [[266, 1548, 255]]"""
    words, word_to_id = _read_vocab("vocab_cnews.txt")
    print("len word_to_id:", len(word_to_id))
    result = file_to_ids_single("日你个香蕉芭乐", word_to_id=word_to_id)
    print(result[0][299])
    print(result)
    # build_vocab(Path.baseabusepath)
    # x_train, y_train, words = preocess_file()
    # print(x_train.shape, y_train.shape)
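A quick usage note: `_read_file` expects each line of the training file to be `label<TAB>content`, where the label is "0" or "1" (see `_read_category`) and the content is already split into space-separated tokens; the vocabulary file holds one token per line, and a token's line number is its id. The sketch below shows how the two helpers used by the training script fit together (illustration only; the local file names echo the ones the training script points at, and the shapes assume its defaults):

# Minimal usage sketch: load the data files and iterate over batches.
from data_utils import preocess_file, batch_iter

x_train, y_train, words = preocess_file("cnn.txt", "vocab.txt", seq_length=300)
print(x_train.shape)   # (num_samples, 300): token ids, padded by pad_sequences
print(y_train.shape)   # (num_samples, 2):   one-hot labels from to_categorical

for batch in batch_iter(list(zip(x_train, y_train)), batch_size=64, num_epochs=1):
    x_batch, y_batch = zip(*batch)
    # feed x_batch / y_batch to the model here
    break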
The config (configuration.py):
class AttentionConfig(object):
    embedding_dim = 64      # word-embedding dimension
    seq_len = 300           # sequence length
    num_classes = 2         # number of classes
    vocab_size = 9000       # vocabulary size (overwritten with the real size at runtime)
    num_rnn_layers = 2      # number of hidden RNN layers
    rnn_size = 128          # number of hidden units per RNN layer
    rnn = 'gru'             # 'lstm' or 'gru'
    keep_prob = 0.6         # dropout keep probability
    learning_rate = 1e-3    # learning rate
    batch_size = 128        # batch size
    num_epochs = 10         # total number of epochs
    print_per_batch = 100   # report results every this many batches
    l2_reg_lambda = 0.006   # L2 regularization strength
    attention_dim = 100     # attention layer size
    max_grad_norm = 5       # gradient clipping threshold
    isgru = False           # the flag the model actually checks: True = GRU, False = LSTM
The model (attentionmodelrnn.py):
import tensorflow as tf


class RnnAttention:
    def __init__(self, config):
        # Define the input placeholders.
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_len], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name="input_y")
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.birnn()
        # self.mubirnn()

    def input_embedding(self):
        """Word embedding.
        The GPU device pinning is removed here: the model is served on CPU,
        and pinning to a GPU would make it fail there.
        """
        # with tf.device('/gpu:0'):
        embeddings = tf.get_variable("embedding", [self.config.vocab_size, self.config.embedding_dim])
        inputs = tf.nn.embedding_lookup(embeddings, self.input_x)
        return inputs

    def birnn(self):
        inputs = self.input_embedding()

        with tf.name_scope("rnn"):
            def gru():
                rnn_cell_fw = tf.contrib.rnn.GRUCell(num_units=self.config.rnn_size)
                rnn_cell_cw = tf.contrib.rnn.GRUCell(num_units=self.config.rnn_size)
                return rnn_cell_cw, rnn_cell_fw

            def lstm():
                rnn_cell_fw = tf.contrib.rnn.LSTMCell(num_units=self.config.rnn_size)
                rnn_cell_cw = tf.contrib.rnn.LSTMCell(num_units=self.config.rnn_size)
                return rnn_cell_cw, rnn_cell_fw

            if self.config.isgru:
                rnn_cell_cw, rnn_cell_fw = gru()
            else:
                rnn_cell_cw, rnn_cell_fw = lstm()

            rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=rnn_cell_fw,
                                                             cell_bw=rnn_cell_cw,
                                                             inputs=inputs,
                                                             dtype=tf.float32)
            rnn_outputs = tf.concat(rnn_outputs, 2)

        # The attention layer
        with tf.name_scope("attention"):
            # Attention mechanism
            sequence_length = rnn_outputs.shape[1].value  # length of the sequences processed by the RNN layer
            hidden_size = rnn_outputs.shape[2].value      # hidden size of the RNN layer (2 * rnn_size)
            W = tf.Variable(tf.truncated_normal([hidden_size, self.config.attention_dim], stddev=0.1), name="W")
            b = tf.Variable(tf.random_normal([self.config.attention_dim], stddev=0.1), name="b")
            u = tf.Variable(tf.random_normal([self.config.attention_dim], stddev=0.1), name="u")
            v = tf.tanh(tf.matmul(tf.reshape(rnn_outputs, [-1, hidden_size]), W) + tf.reshape(b, [1, -1]))
            vu = tf.matmul(v, tf.reshape(u, [-1, 1]))
            exps = tf.reshape(tf.exp(vu), [-1, sequence_length])
            alphas = exps / tf.reshape(tf.reduce_sum(exps, 1), [-1, 1])

            # The Bi-RNN output is reduced with the attention vector.
            output = tf.reduce_sum(rnn_outputs * tf.reshape(alphas, [-1, sequence_length, 1]), 1)

            # Add L2 regularization losses for the attention weights.
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(b), tf.GraphKeys.REGULARIZATION_LOSSES)
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(u), tf.GraphKeys.REGULARIZATION_LOSSES)

        dropout_outputs = tf.nn.dropout(output, self.keep_prob, name="dropout")

        with tf.name_scope("score"):
            W = tf.Variable(tf.truncated_normal([dropout_outputs.shape[1].value, self.config.num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name="b")
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(W), tf.GraphKeys.REGULARIZATION_LOSSES)
            tf.losses.add_loss(self.config.l2_reg_lambda * tf.nn.l2_loss(b), tf.GraphKeys.REGULARIZATION_LOSSES)
            self.scores = tf.nn.xw_plus_b(dropout_outputs, W, b, name="scores")
            self.pred_y = tf.nn.softmax(self.scores, name="pred_y")
            tf.add_to_collection('pred_network', self.pred_y)
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Loss
        with tf.name_scope("loss"):
            tf.losses.softmax_cross_entropy(logits=self.scores, onehot_labels=self.input_y)
            self.cost = tf.losses.get_total_loss()

        # Optimizer
        with tf.name_scope("optimize"):
            optimizer = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate)
            self.train_op = optimizer.minimize(self.cost)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
The training script (run):
from attentionmodelrnn import RnnAttention
from configuration import AttentionConfig
from data_utils import preocess_file, batch_iter
import time
import tensorflow as tf
import os
from datetime import timedelta

# basepath = "/Users/shuubiasahi/Documents/python"
# noexperience
# business
# together
basepath = "/home/zhoumeixu"
data_path = basepath + "/credit-tftextclassify/tensorflow/noexperience/cnn.txt"
vocapath = basepath + "/credit-tftextclassify/tensorflow/noexperience/vocab.txt"
modelpath = basepath + "/credit-tftextclassify/tensorflow/noexperience/"
print(modelpath, "starting to train the attention model")


def run_epoch(rnn=False):
    # Load the data
    print('Loading data...')
    start_time = time.time()
    x_train, y_train, words = preocess_file(data_path, vocapath)

    config = AttentionConfig()
    if config.isgru:
        print('Using attention gru model...')
    else:
        print('Using attention lstm model...')
    config.vocab_size = len(words)
    print("vocab_size is:", config.vocab_size)
    model = RnnAttention(config)
    tensorboard_dir = basepath + '/boardlog'

    end_time = time.time()
    time_dif = end_time - start_time
    time_dif = timedelta(seconds=int(round(time_dif)))
    print('Time usage:', time_dif)

    print('Constructing TensorFlow Graph...')
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    # Configure tensorboard
    tf.summary.scalar("loss", model.cost)
    tf.summary.scalar("accuracy", model.acc)

    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    writer.add_graph(session.graph)

    # Generate batches
    print('Generating batch...')
    batch_train = batch_iter(list(zip(x_train, y_train)), config.batch_size, config.num_epochs)

    def feed_data(batch):
        """Prepare the data to feed into the model."""
        x_batch, y_batch = zip(*batch)
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch
        }
        return feed_dict, len(x_batch)

    def evaluate(x_, y_):
        """Model evaluation.
        Running all the data at once causes OOM, so evaluate in batches and aggregate.
        """
        batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
        total_loss = 0.0
        total_acc = 0.0
        cnt = 0
        for batch in batch_eval:
            feed_dict, cur_batch_len = feed_data(batch)
            feed_dict[model.keep_prob] = 1.0
            loss, acc = session.run([model.cost, model.acc], feed_dict=feed_dict)
            total_loss += loss * cur_batch_len
            total_acc += acc * cur_batch_len
            cnt += cur_batch_len
        return total_loss / cnt, total_acc / cnt

    # Training and evaluation
    print('Training and evaluating...')
    start_time = time.time()
    print_per_batch = config.print_per_batch

    for i, batch in enumerate(batch_train):
        feed_dict, _ = feed_data(batch)
        feed_dict[model.keep_prob] = config.keep_prob

        if i % 5 == 0:  # write the training summaries to tensorboard every 5 steps
            s = session.run(merged_summary, feed_dict=feed_dict)
            writer.add_summary(s, i)

        if i % print_per_batch == print_per_batch - 1:  # report training-set performance every print_per_batch steps
            loss_train, acc_train = session.run([model.cost, model.acc], feed_dict=feed_dict)
            # loss, acc = evaluate(x_val, y_val)  # validation set not needed for now

            # Elapsed time
            end_time = time.time()
            time_dif = end_time - start_time
            time_dif = timedelta(seconds=int(round(time_dif)))

            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                  + ' Time: {3}'
            print(msg.format(i + 1, loss_train, acc_train, time_dif))

        if i % 500 == 0 and i > 0:
            graph = tf.graph_util.convert_variables_to_constants(
                session, session.graph_def, ["keep_prob", "input_x", "score/pred_y"])
            tf.train.write_graph(graph, ".", modelpath + "graphattention.model", as_text=False)
            print("attention model saved at step {0}".format(i))

        session.run(model.train_op, feed_dict=feed_dict)  # run the optimization step

    session.close()


if __name__ == '__main__':
    # run_epoch(rnn=True)
    run_epoch(rnn=False)
Results:
2017-11-25 13:48:37.183112: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 3920 get requests, put_count=6089 evicted_count=2000 eviction_rate=0.328461 and unsatisfied allocation rate=0
2017-11-25 13:48:37.995920: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 5649 get requests, put_count=8853 evicted_count=3000 eviction_rate=0.338868 and unsatisfied allocation rate=0
2017-11-25 13:48:39.200365: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 2297 get requests, put_count=3569 evicted_count=1000 eviction_rate=0.280191 and unsatisfied allocation rate=0
2017-11-25 13:48:40.431985: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 2448 get requests, put_count=3810 evicted_count=1000 eviction_rate=0.262467 and unsatisfied allocation rate=0
Iter: 100, Train Loss: 0.24, Train Acc: 95.31%, Time: 0:00:42
Iter: 200, Train Loss: 0.14, Train Acc: 96.09%, Time: 0:01:23
Iter: 300, Train Loss: 0.14, Train Acc: 95.31%, Time: 0:02:03
Iter: 400, Train Loss: 0.1, Train Acc: 97.66%, Time: 0:02:44
Iter: 500, Train Loss: 0.27, Train Acc: 89.84%, Time: 0:03:25
Converted 10 variables to const ops.
attention model saved at step 500
Iter: 600, Train Loss: 0.16, Train Acc: 93.75%, Time: 0:04:06
Iter: 700, Train Loss: 0.15, Train Acc: 96.09%, Time: 0:04:46
Iter: 800, Train Loss: 0.14, Train Acc: 94.53%, Time: 0:05:26
Iter: 900, Train Loss: 0.1, Train Acc: 95.31%, Time: 0:06:06
Iter: 1000, Train Loss: 0.11, Train Acc: 93.75%, Time: 0:06:47
Converted 10 variables to const ops.
attention model saved at step 1000
Iter: 1100, Train Loss: 0.044, Train Acc: 99.22%, Time: 0:07:28
Iter: 1200, Train Loss: 0.23, Train Acc: 90.62%, Time: 0:08:09
Iter: 1300, Train Loss: 0.11, Train Acc: 96.88%, Time: 0:08:51
Iter: 1400, Train Loss: 0.077, Train Acc: 96.88%, Time: 0:09:31
Iter: 1500, Train Loss: 0.087, Train Acc: 96.09%, Time: 0:10:11
Converted 10 variables to const ops.
attention model saved at step 1500
Iter: 1600, Train Loss: 0.11, Train Acc: 96.88%, Time: 0:10:52
Iter: 1700, Train Loss: 0.099, Train Acc: 95.31%, Time: 0:11:32
Iter: 1800, Train Loss: 0.08, Train Acc: 96.09%, Time: 0:12:13
Iter: 1900, Train Loss: 0.1, Train Acc: 96.88%, Time: 0:12:53
Iter: 2000, Train Loss: 0.13, Train Acc: 94.53%, Time: 0:13:34
Converted 10 variables to const ops.
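The run above freezes the graph with the output nodes input_x, keep_prob and score/pred_y into graphattention.model every 500 steps. A minimal inference sketch using that frozen graph could look like this (the local file paths and the input sentence are placeholders, not the actual project paths):

# Load the frozen graph and score one piece of text.
import tensorflow as tf

from data_utils import read_vocab_predict, file_to_ids_single

word_to_id = read_vocab_predict("vocab.txt")                 # placeholder path
x = file_to_ids_single("some input text", word_to_id, maxlen=300)

with tf.gfile.GFile("graphattention.model", "rb") as f:      # placeholder path
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

graph = tf.Graph()
with graph.as_default():
    tf.import_graph_def(graph_def, name="")
    input_x = graph.get_tensor_by_name("input_x:0")
    keep_prob = graph.get_tensor_by_name("keep_prob:0")
    pred_y = graph.get_tensor_by_name("score/pred_y:0")

with tf.Session(graph=graph) as sess:
    probs = sess.run(pred_y, feed_dict={input_x: x, keep_prob: 1.0})
    print(probs)   # [[p_class0, p_class1]]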