Update rnn_crf and seq2seq with model training.
This commit is contained in:
parent e482c736a3
commit 1d1d5f4aa6
@@ -50,7 +50,6 @@ def save_preds(preds, test_ids, X_test, ids_word_dict,
         for j in range(len(sent_ids)):
             if sent_ids[j] != 0:
                 label.append(preds[i][j])
-        print(label)
         continue_error = False
         has_error = False
         current_error = 0
@@ -80,7 +79,9 @@ def save_preds(preds, test_ids, X_test, ids_word_dict,


 def is_error_label_id(label_id, label_ids_dict):
-    return label_id != label_ids_dict['O']
+    # return label_id != label_ids_dict['O']
+    return label_id == label_ids_dict['M'] or label_id == label_ids_dict['R'] or label_id == label_ids_dict[
+        'S'] or label_id == label_ids_dict['W']


 if __name__ == '__main__':
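Note: the chained comparison above works, but an equivalent set-membership test reads more clearly and is easier to extend. A minimal sketch (only the label keys 'M', 'R', 'S', 'W' from this hunk are assumed):

    def is_error_label_id(label_id, label_ids_dict):
        # M/R/S/W are the CGED error types; 'O' (and padding) mean "no error"
        return label_id in {label_ids_dict[k] for k in ('M', 'R', 'S', 'W')}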
@@ -30,7 +30,8 @@ def parse_xml_file(path):
                 error_type_change = 'B-' + error_type
             else:
                 error_type_change = 'I-' + error_type
-            locate_dict[i] = error_type_change
+            # locate_dict[i] = error_type_change
+            locate_dict[i] = error_type
         # Segment with pos
         word_seq, pos_seq = segment(text, cut_type='char', pos=True)
         word_arr, label_arr = [], []
@@ -69,7 +70,8 @@ def parse_txt_file(input_path, truth_path):
                 error_type_change = 'B-' + error_type
             else:
                 error_type_change = 'I-' + error_type
-            locate_dict[i] = error_type_change
+            # locate_dict[i] = error_type_change
+            locate_dict[i] = error_type
         # for i in range(int(start_off) - 1, int(end_off)):
         #     locate_dict[i] = error_type
         if text_id in truth_dict:
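Note: both parse_xml_file and parse_txt_file now tag every character inside an error span with the bare error type instead of BIO-prefixed tags, shrinking the label vocabulary. Illustrative only (a hypothetical span of type 'R' covering positions 2-3):

    # before: labels = ['O', 'O', 'B-R', 'I-R', 'O']
    # after:  labels = ['O', 'O', 'R',   'R',   'O']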
@@ -6,14 +6,16 @@ import os
 output_dir = './output'

 # CGED chinese corpus
-train_paths = ['../data/cn/CGED/CGED18_HSK_TrainingSet.xml',
+train_paths = [
+    '../data/cn/CGED/CGED18_HSK_TrainingSet.xml',
     '../data/cn/CGED/CGED17_HSK_TrainingSet.xml',
     '../data/cn/CGED/CGED16_HSK_TrainingSet.xml',
     # '../data/cn/CGED/sample_HSK_TrainingSet.xml',
 ]
 train_word_path = output_dir + '/train_words.txt'
 train_label_path = output_dir + '/train_labels.txt'
-test_paths = {'../data/cn/CGED/CGED16_HSK_Test_Input.txt': '../data/cn/CGED/CGED16_HSK_Test_Truth.txt',
+test_paths = {
+    '../data/cn/CGED/CGED16_HSK_Test_Input.txt': '../data/cn/CGED/CGED16_HSK_Test_Truth.txt',
     '../data/cn/CGED/CGED17_HSK_Test_Input.txt': '../data/cn/CGED/CGED17_HSK_Test_Truth.txt',
     # '../data/cn/CGED/sample_HSK_Test_Input.txt': '../data/cn/CGED/sample_HSK_Test_Truth.txt',
 }
@@ -31,7 +33,7 @@ embedding_dim = 100
 rnn_hidden_dim = 200
 maxlen = 300
 cutoff_frequency = 5
-dropout = 0.5
+dropout = 0.25
 save_model_path = output_dir + '/rnn_crf_model.h5'  # Path of the model saved, default is output_path/model

 # infer
@@ -16,7 +16,7 @@ def create_model(word_dict, label_dict, embedding_dim=100, rnn_hidden_dim=200, d
     # build model
     model = Sequential()
     # embedding
-    model.add(Embedding(len(word_dict), embedding_dim))
+    model.add(Embedding(len(word_dict), embedding_dim, mask_zero=True))
     # bilstm
     model.add(Bidirectional(LSTM(rnn_hidden_dim // 2, return_sequences=True,
                                  recurrent_dropout=dropout)))
@@ -25,6 +25,7 @@ def create_model(word_dict, label_dict, embedding_dim=100, rnn_hidden_dim=200, d
     model.add(crf)
     # loss
     model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
+    model.summary()
     return model


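Note: for orientation, a minimal sketch of the full model these two hunks build, assuming Keras 2.x with keras-contrib's CRF layer (sparse_target=True is an assumption, consistent with the (n, maxlen, 1) label reshape in the training hunk below):

    from keras.models import Sequential
    from keras.layers import Embedding, Bidirectional, LSTM
    from keras_contrib.layers import CRF

    def create_model(word_dict, label_dict, embedding_dim=100, rnn_hidden_dim=200, dropout=0.25):
        model = Sequential()
        # id 0 must be reserved for padding, or mask_zero=True will mask real tokens
        model.add(Embedding(len(word_dict), embedding_dim, mask_zero=True))
        model.add(Bidirectional(LSTM(rnn_hidden_dim // 2, return_sequences=True,
                                     recurrent_dropout=dropout)))
        crf = CRF(len(label_dict), sparse_target=True)
        model.add(crf)
        model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()
        return model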
@@ -45,11 +45,15 @@ def train(train_word_path=None,
     # read data to index
     word_ids = vectorize_data(train_word_path, word_ids_dict)
     label_ids = vectorize_data(train_label_path, label_ids_dict)
+    max_len = np.max([len(i) for i in word_ids])
+    print('max_len:', max_len)
     # pad sequence
     word_seq = pad_sequence(word_ids, maxlen=maxlen)
     label_seq = pad_sequence(label_ids, maxlen=maxlen)
     # reshape label for crf model use
     label_seq = np.reshape(label_seq, (label_seq.shape[0], label_seq.shape[1], 1))
+    print(word_seq.shape)
+    print(label_seq.shape)
     logger.info("Data loaded.")
     # model
     logger.info("Training BILSTM_CRF model...")
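Note: the reshape exists because keras-contrib's CRF with sparse integer targets expects labels of shape (batch, timesteps, 1), not (batch, timesteps). An equivalent one-liner using numpy indexing:

    import numpy as np
    label_seq = np.array([[2, 3, 0], [1, 0, 0]])  # (batch=2, maxlen=3) integer label ids
    label_seq = label_seq[..., np.newaxis]        # (2, 3, 1), same result as the np.reshape above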
@@ -3,36 +3,22 @@
 # Brief: Use CGED corpus
 import os

-output_dir = './output'
-model_path = './output/cged_model'  # Path of the model saved, default is output_path/model

 # CGED chinese corpus
-raw_train_paths = ['../data/cn/CGED/CGED18_HSK_TrainingSet.xml',
+raw_train_paths = [
+    # '../data/cn/CGED/CGED18_HSK_TrainingSet.xml',
     '../data/cn/CGED/CGED17_HSK_TrainingSet.xml',
-    '../data/cn/CGED/CGED16_HSK_TrainingSet.xml',
+    # '../data/cn/CGED/CGED16_HSK_TrainingSet.xml',
     # '../data/cn/CGED/sample_HSK_TrainingSet.xml',
 ]
+output_dir = './output'
 train_path = output_dir + '/train.txt'  # Training data path.
 test_path = output_dir + '/test.txt'  # Validation data path.
-num_steps = 3000  # Number of steps to train.
-decode_sentence = False  # Whether we should decode sentences of the user.

-# Config
-buckets = [(10, 10), (15, 15), (20, 20), (40, 40)]  # use a number of buckets and pad to the closest one for efficiency.
-steps_per_checkpoint = 100
-max_steps = 10000
-max_vocab_size = 10000
-size = 512
-num_layers = 4
-max_gradient_norm = 5.0
+# config
 batch_size = 128
-learning_rate = 0.5
-learning_rate_decay_factor = 0.99
-use_lstm = False
-use_rms_prop = False
+epochs = 10
+rnn_hidden_dim = 200
+save_model_path = output_dir + '/cged_seq2seq_model.h5'  # Path of the model saved

-enable_decode_sentence = False  # Test with input error sentence
-enable_test_decode = True  # Test with test set
-
-if not os.path.exists(model_path):
-    os.makedirs(model_path)
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
@@ -14,12 +14,11 @@ class FCEReader(Reader):
     DROPOUT_TOKENS = {"a", "an", "the", "'ll", "'s", "'m", "'ve"}
    REPLACEMENTS = {"there": "their", "their": "there", "then": "than", "than": "then"}

-    def __init__(self, config, train_path=None, token_2_id=None,
-                 dropout_prob=0.25, replacement_prob=0.25, dataset_copies=2):
+    def __init__(self, train_path=None, token_2_id=None,
+                 dropout_prob=0.25, replacement_prob=0.25):
         super(FCEReader, self).__init__(
-            config, train_path=train_path, token_2_id=token_2_id,
-            special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN, FCEReader.UNKNOWN_TOKEN],
-            dataset_copies=dataset_copies)
+            train_path=train_path, token_2_id=token_2_id,
+            special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN, FCEReader.UNKNOWN_TOKEN])
         self.dropout_prob = dropout_prob
         self.replacement_prob = replacement_prob
         self.UNKNOWN_ID = self.token_2_id[FCEReader.UNKNOWN_TOKEN]
@@ -33,19 +32,6 @@ class FCEReader(Reader):
                 break
             source = line_src.lower()[5:].strip().split()
             target = line_dst.lower()[5:].strip().split()
-            if self.config.enable_special_error:
-                new_source = []
-                for token in source:
-                    # Random dropout words from the input
-                    dropout_token = (token in FCEReader.DROPOUT_TOKENS and
-                                     random.random() < self.dropout_prob)
-                    replace_token = (token in FCEReader.REPLACEMENTS and
-                                     random.random() < self.replacement_prob)
-                    if replace_token:
-                        new_source.append(FCEReader.REPLACEMENTS[source])
-                    elif not dropout_token:
-                        new_source.append(token)
-                source = new_source
             yield source, target

     def unknown_token(self):
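Note: the deleted augmentation block also contained a latent bug: FCEReader.REPLACEMENTS[source] indexes the dict with the whole token list (an unhashable-type TypeError the first time a replacement fires) instead of the current token. If the augmentation is ever restored, the branch should read:

    if replace_token:
        new_source.append(FCEReader.REPLACEMENTS[token])  # index by token, not by the source list
    elif not dropout_token:
        new_source.append(token)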
@@ -68,11 +54,10 @@ class CGEDReader(Reader):
     """
     UNKNOWN_TOKEN = 'UNK'

-    def __init__(self, config, train_path=None, token_2_id=None, dataset_copies=2):
+    def __init__(self, train_path=None, token_2_id=None):
         super(CGEDReader, self).__init__(
-            config, train_path=train_path, token_2_id=token_2_id,
-            special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN, CGEDReader.UNKNOWN_TOKEN],
-            dataset_copies=dataset_copies)
+            train_path=train_path, token_2_id=token_2_id,
+            special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN, CGEDReader.UNKNOWN_TOKEN])
         self.UNKNOWN_ID = self.token_2_id[CGEDReader.UNKNOWN_TOKEN]

     def read_samples_by_string(self, path):
@@ -98,3 +83,12 @@ class CGEDReader(Reader):
             if line and len(line) > 5:
                 yield line.lower()[5:].strip().split()
             i += 1
+
+    @staticmethod
+    def read_vocab(input_texts):
+        vocab = set()
+        for line in input_texts:
+            for char in line:
+                if char not in vocab:
+                    vocab.add(char)
+        return sorted(list(vocab))
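Note: since set() already de-duplicates, the membership check in read_vocab is redundant; an equivalent compact form, for comparison:

    from itertools import chain

    @staticmethod
    def read_vocab(input_texts):
        # flatten all lines into one character stream, de-duplicate, sort
        return sorted(set(chain.from_iterable(input_texts)))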
@@ -1,415 +0,0 @@
-# -*- coding: utf-8 -*-
-# Author: XuMing <xuming624@qq.com>
-# Brief:
-
-import random
-
-import numpy as np
-import tensorflow as tf
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import embedding_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-
-from pycorrector.seq2seq import seq2seq
-from pycorrector.seq2seq.reader import PAD_ID, GO_ID
-
-
-class CorrectorModel(object):
-    """Sequence-to-sequence model used to correct grammatical errors in text.
-
-    NOTE: mostly copied from TensorFlow's seq2seq_model.py; only modifications
-    are:
-    - the introduction of RMSProp as an optional optimization algorithm
-    - the introduction of a "projection bias" that biases decoding towards
-      selecting tokens that appeared in the input
-    """
-
-    def __init__(self, source_vocab_size, target_vocab_size, buckets, size,
-                 num_layers, max_gradient_norm, batch_size, learning_rate,
-                 learning_rate_decay_factor, use_lstm=False,
-                 num_samples=512, forward_only=False, config=None,
-                 corrective_tokens_mask=None):
-        """Create the model.
-
-        Args:
-            source_vocab_size: size of the source vocabulary.
-            target_vocab_size: size of the target vocabulary.
-            buckets: a list of pairs (I, O), where I specifies maximum input
-                length that will be processed in that bucket, and O specifies
-                maximum output length. Training instances that have longer than I
-                or outputs longer than O will be pushed to the next bucket and
-                padded accordingly. We assume that the list is sorted, e.g., [(2,
-                4), (8, 16)].
-            size: number of units in each layer of the model.
-            num_layers: number of layers in the model.
-            max_gradient_norm: gradients will be clipped to maximally this norm.
-            batch_size: the size of the batches used during training;
-                the model construction is independent of batch_size, so it can be
-                changed after initialization if this is convenient, e.g.,
-                for decoding.
-            learning_rate: learning rate to start with.
-            learning_rate_decay_factor: decay learning rate by this much when
-                needed.
-            use_lstm: if true, we use LSTM cells instead of GRU cells.
-            num_samples: number of samples for sampled softmax.
-            forward_only: if set, we do not construct the backward pass in the
-                model.
-        """
-        self.source_vocab_size = source_vocab_size
-        self.target_vocab_size = target_vocab_size
-        self.buckets = buckets
-        self.batch_size = batch_size
-        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
-        self.learning_rate_decay_op = self.learning_rate.assign(
-            self.learning_rate * learning_rate_decay_factor)
-        self.global_step = tf.Variable(0, trainable=False)
-        self.config = config
-
-        # Feeds for inputs.
-        self.encoder_inputs = []
-        self.decoder_inputs = []
-        self.target_weights = []
-        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
-            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
-                                                      name="encoder{0}".format(
-                                                          i)))
-        for i in range(buckets[-1][1] + 1):
-            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
-                                                      name="decoder{0}".format(
-                                                          i)))
-            self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
-                                                      name="weight{0}".format(
-                                                          i)))
-
-        # One hot encoding of corrective tokens.
-        corrective_tokens_tensor = tf.constant(corrective_tokens_mask if
-                                               corrective_tokens_mask else
-                                               np.zeros(self.target_vocab_size),
-                                               shape=[self.target_vocab_size],
-                                               dtype=tf.float32)
-        batched_corrective_tokens = tf.stack(
-            [corrective_tokens_tensor] * self.batch_size)
-        self.batch_corrective_tokens_mask = batch_corrective_tokens_mask = \
-            tf.placeholder(
-                tf.float32,
-                shape=[None, None],
-                name="corrective_tokens")
-
-        # Our targets are decoder inputs shifted by one.
-        targets = [self.decoder_inputs[i + 1]
-                   for i in range(len(self.decoder_inputs) - 1)]
-        # If we use sampled softmax, we need an output projection.
-        output_projection = None
-        softmax_loss_function = None
-        # Sampled softmax only makes sense if we sample less than vocabulary
-        # size.
-        if num_samples > 0 and num_samples < self.target_vocab_size:
-            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
-            w_t = tf.transpose(w)
-            b = tf.get_variable("proj_b", [self.target_vocab_size])
-
-            output_projection = (w, b)
-
-            def sampled_loss(labels, logits):
-                labels = tf.reshape(labels, [-1, 1])
-                return tf.nn.sampled_softmax_loss(w_t, b, labels, logits,
-                                                  num_samples,
-                                                  self.target_vocab_size)
-
-            softmax_loss_function = sampled_loss
-
-        # Create the internal multi-layer cell for our RNN.
-        single_cell = tf.nn.rnn_cell.GRUCell(size)
-        if use_lstm:
-            single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
-        cell = single_cell
-        if num_layers > 1:
-            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
-
-        # The seq2seq function: we use embedding for the input and attention.
-        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
-            """
-
-            :param encoder_inputs: list of length equal to the input bucket
-            length of 1-D tensors (of length equal to the batch size) whose
-            elements consist of the token index of each sample in the batch
-            at a given index in the input.
-            :param decoder_inputs:
-            :param do_decode:
-            :return:
-            """
-
-            if do_decode:
-                # Modify bias here to bias the model towards selecting words
-                # present in the input sentence.
-                input_bias = self.build_input_bias(encoder_inputs,
-                                                   batch_corrective_tokens_mask)
-
-                # Redefined seq2seq to allow for the injection of a special
-                # decoding function that
-                return seq2seq.embedding_attention_seq2seq(
-                    encoder_inputs, decoder_inputs, cell,
-                    num_encoder_symbols=source_vocab_size,
-                    num_decoder_symbols=target_vocab_size,
-                    embedding_size=size,
-                    output_projection=output_projection,
-                    feed_previous=do_decode,
-                    loop_fn_factory=
-                    apply_input_bias_and_extract_argmax_fn_factory(input_bias))
-            else:
-                return seq2seq.embedding_attention_seq2seq(
-                    encoder_inputs, decoder_inputs, cell,
-                    num_encoder_symbols=source_vocab_size,
-                    num_decoder_symbols=target_vocab_size,
-                    embedding_size=size,
-                    output_projection=output_projection,
-                    feed_previous=do_decode)
-
-        # Training outputs and losses.
-        if forward_only:
-            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
-                self.encoder_inputs, self.decoder_inputs, targets,
-                self.target_weights, buckets,
-                lambda x, y: seq2seq_f(x, y, True),
-                softmax_loss_function=softmax_loss_function)
-
-            if output_projection is not None:
-                for b in range(len(buckets)):
-                    # We need to apply the same input bias used during model
-                    # evaluation when decoding.
-                    input_bias = self.build_input_bias(
-                        self.encoder_inputs[:buckets[b][0]],
-                        batch_corrective_tokens_mask)
-                    self.outputs[b] = [
-                        project_and_apply_input_bias(output, output_projection,
-                                                     input_bias)
-                        for output in self.outputs[b]]
-        else:
-            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
-                self.encoder_inputs, self.decoder_inputs, targets,
-                self.target_weights, buckets,
-                lambda x, y: seq2seq_f(x, y, False),
-                softmax_loss_function=softmax_loss_function)
-
-        # Gradients and SGD update operation for training the model.
-        params = tf.trainable_variables()
-        if not forward_only:
-            self.gradient_norms = []
-            self.updates = []
-            opt = tf.train.RMSPropOptimizer(0.001) if self.config.use_rms_prop \
-                else tf.train.GradientDescentOptimizer(self.learning_rate)
-            # opt = tf.train.AdamOptimizer()
-
-            for b in range(len(buckets)):
-                gradients = tf.gradients(self.losses[b], params)
-                clipped_gradients, norm = tf.clip_by_global_norm(
-                    gradients, max_gradient_norm)
-                self.gradient_norms.append(norm)
-                self.updates.append(opt.apply_gradients(
-                    zip(clipped_gradients, params),
-                    global_step=self.global_step))
-
-        self.saver = tf.train.Saver(tf.global_variables())
-
-    def build_input_bias(self, encoder_inputs, batch_corrective_tokens_mask):
-        packed_one_hot_inputs = tf.one_hot(indices=tf.stack(
-            encoder_inputs, axis=1), depth=self.target_vocab_size)
-        return tf.maximum(batch_corrective_tokens_mask,
-                          tf.reduce_max(packed_one_hot_inputs,
-                                        reduction_indices=1))
-
-    def step(self, session, encoder_inputs, decoder_inputs, target_weights,
-             bucket_id, forward_only, corrective_tokens=None):
-        """Run a step of the model feeding the given inputs.
-
-        Args:
-            session: tensorflow session to use.
-            encoder_inputs: list of numpy int vectors to feed as encoder inputs.
-            decoder_inputs: list of numpy int vectors to feed as decoder inputs.
-            target_weights: list of numpy float vectors to feed as target weights.
-            bucket_id: which bucket of the model to use.
-            forward_only: whether to do the backward step or only forward.
-
-        Returns:
-            A triple consisting of gradient norm (or None if we did not do
-            backward), average perplexity, and the outputs.
-
-        Raises:
-            ValueError: if length of encoder_inputs, decoder_inputs, or
-                target_weights disagrees with bucket size for the specified
-                bucket_id.
-        """
-        # Check if the sizes match.
-        encoder_size, decoder_size = self.buckets[bucket_id]
-        if len(encoder_inputs) != encoder_size:
-            raise ValueError("Encoder length must be equal to the one in bucket,"
-                             " %d != %d." % (len(encoder_inputs), encoder_size))
-        if len(decoder_inputs) != decoder_size:
-            raise ValueError("Decoder length must be equal to the one in bucket,"
-                             " %d != %d." % (len(decoder_inputs), decoder_size))
-        if len(target_weights) != decoder_size:
-            raise ValueError("Weights length must be equal to the one in bucket,"
-                             " %d != %d." % (len(target_weights), decoder_size))
-
-        # Input feed: encoder inputs, decoder inputs, target_weights,
-        # as provided.
-        input_feed = {}
-        for l in range(encoder_size):
-            input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
-        for l in range(decoder_size):
-            input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
-            input_feed[self.target_weights[l].name] = target_weights[l]
-
-        corrective_tokens_vector = (corrective_tokens if
-                                    corrective_tokens is not None else
-                                    np.zeros(self.target_vocab_size))
-        batch_corrective_tokens = np.repeat([corrective_tokens_vector],
-                                            self.batch_size, axis=0)
-        input_feed[self.batch_corrective_tokens_mask.name] = (
-            batch_corrective_tokens)
-
-        # Since our targets are decoder inputs shifted by one, we need one more.
-        last_target = self.decoder_inputs[decoder_size].name
-        input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)
-
-        # Output feed: depends on whether we do a backward step or not.
-        if not forward_only:
-            output_feed = [self.updates[bucket_id],  # Update Op that does SGD.
-                           self.gradient_norms[bucket_id],  # Gradient norm.
-                           self.losses[bucket_id]]  # Loss for this batch.
-        else:
-            output_feed = [self.losses[bucket_id]]  # Loss for this batch.
-            for l in range(decoder_size):  # Output logits.
-                output_feed.append(self.outputs[bucket_id][l])
-
-        outputs = session.run(output_feed, input_feed)
-        if not forward_only:
-            # Gradient norm, loss, no outputs.
-            return outputs[1], outputs[2], None
-        else:
-            # No gradient norm, loss, outputs.
-            return None, outputs[0], outputs[1:]
-
-    def get_batch(self, data, bucket_id):
-        """Get a random batch of data from the specified bucket, prepare for
-        step.
-
-        To feed data in step(..) it must be a list of batch-major vectors, while
-        data here contains single length-major cases. So the main logic of this
-        function is to re-index data cases to be in the proper format for
-        feeding.
-
-        Args:
-            data: a tuple of size len(self.buckets) in which each element contains
-                lists of pairs of input and output data that we use to create a
-                batch.
-            bucket_id: integer, which bucket to get the batch for.
-
-        Returns:
-            The triple (encoder_inputs, decoder_inputs, target_weights) for
-            the constructed batch that has the proper format to call step(...)
-            later.
-        """
-        encoder_size, decoder_size = self.buckets[bucket_id]
-        encoder_inputs, decoder_inputs = [], []
-
-        # Get a random batch of encoder and decoder inputs from data,
-        # pad them if needed, reverse encoder inputs and add GO to decoder.
-        for _ in range(self.batch_size):
-            encoder_input, decoder_input = random.choice(data[bucket_id])
-
-            # Encoder inputs are padded and then reversed.
-            encoder_pad = [PAD_ID] * (
-                encoder_size - len(encoder_input))
-            encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
-
-            # Decoder inputs get an extra "GO" symbol, and are padded then.
-            decoder_pad_size = decoder_size - len(decoder_input) - 1
-            decoder_inputs.append([GO_ID] + decoder_input +
-                                  [PAD_ID] * decoder_pad_size)
-
-        # Now we create batch-major vectors from the data selected above.
-        batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
-
-        # Batch encoder inputs are just re-indexed encoder_inputs.
-        for length_idx in range(encoder_size):
-            batch_encoder_inputs.append(
-                np.array([encoder_inputs[batch_idx][length_idx]
-                          for batch_idx in range(self.batch_size)],
-                         dtype=np.int32))

-        # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
-        for length_idx in range(decoder_size):
-            batch_decoder_inputs.append(
-                np.array([decoder_inputs[batch_idx][length_idx]
-                          for batch_idx in range(self.batch_size)],
-                         dtype=np.int32))
-
-            # Create target_weights to be 0 for targets that are padding.
-            batch_weight = np.ones(self.batch_size, dtype=np.float32)
-            for batch_idx in range(self.batch_size):
-                # We set weight to 0 if the corresponding target is a PAD
-                # symbol. The corresponding target is decoder_input shifted by 1
-                # forward.
-                if length_idx < decoder_size - 1:
-                    target = decoder_inputs[batch_idx][length_idx + 1]
-                if length_idx == decoder_size - 1 or target == PAD_ID:
-                    batch_weight[batch_idx] = 0.0
-            batch_weights.append(batch_weight)
-        return batch_encoder_inputs, batch_decoder_inputs, batch_weights
-
-
-def project_and_apply_input_bias(logits, output_projection, input_bias):
-    if output_projection is not None:
-        logits = nn_ops.xw_plus_b(
-            logits, output_projection[0], output_projection[1])
-
-    # Apply softmax to ensure all tokens have a positive value.
-    probs = tf.nn.softmax(logits)
-
-    # Apply input bias, which is a mask of shape [batch, vocab len]
-    # where each token from the input in addition to all "corrective"
-    # tokens are set to 1.0.
-    return tf.multiply(probs, input_bias)
-
-
-def apply_input_bias_and_extract_argmax_fn_factory(input_bias):
-    """
-
-    :param encoder_inputs: list of length equal to the input bucket
-    length of 1-D tensors (of length equal to the batch size) whose
-    elements consist of the token index of each sample in the batch
-    at a given index in the input.
-    :return:
-    """
-
-    def fn_factory(embedding, output_projection=None, update_embedding=True):
-        """Get a loop_function that extracts the previous symbol and embeds it.
-
-        Args:
-            embedding: embedding tensor for symbols.
-            output_projection: None or a pair (W, B). If provided, each fed previous
-                output will first be multiplied by W and added B.
-            update_embedding: Boolean; if False, the gradients will not propagate
-                through the embeddings.
-
-        Returns:
-            A loop function.
-        """
-
-        def loop_function(prev, _):
-            prev = project_and_apply_input_bias(prev, output_projection,
-                                                input_bias)
-
-            prev_symbol = math_ops.argmax(prev, 1)
-            # Note that gradients will not propagate through the second
-            # parameter of embedding_lookup.
-            emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
-            if not update_embedding:
-                emb_prev = array_ops.stop_gradient(emb_prev)
-            return emb_prev, prev_symbol
-
-        return loop_function
-
-    return fn_factory
@@ -1,35 +0,0 @@
-# -*- coding: utf-8 -*-
-# Author: XuMing <xuming624@qq.com>
-# Brief: Use FCE english corpus
-import os
-
-# FCE english corpus
-train_path = '../data/en/fce/fce_train.txt'  # Training data path.
-val_path = '../data/en/fce/fce_val.txt'  # Validation data path.
-test_path = '../data/en/fce/fce_test.txt'
-
-model_path = './output/fce_model'  # Path of the model saved, default is output_path/model
-enable_special_error = False
-num_steps = 3000  # Number of steps to train.
-decode_sentence = False  # Whether we should decode sentences of the user.
-
-# Config
-buckets = [(10, 10), (15, 15), (20, 20), (40, 40)]  # use a number of buckets and pad to the closest one for efficiency.
-steps_per_checkpoint = 100
-max_steps = 2000
-max_vocab_size = 10000
-size = 512
-num_layers = 1
-max_gradient_norm = 5.0
-batch_size = 64
-learning_rate = 0.5
-learning_rate_decay_factor = 0.99
-use_lstm = False
-use_rms_prop = False
-
-enable_decode_sentence = False  # Test with input error sentence
-enable_test_decode = True  # Test with test set
-
-
-if not os.path.exists(model_path):
-    os.makedirs(model_path)
@@ -2,205 +2,135 @@
 # Author: XuMing <xuming624@qq.com>
 # Brief:
-import sys
-from collections import defaultdict
-
-import numpy as np
-import tensorflow as tf
-
-import cged_config
-from corpus_reader import CGEDReader
-from reader import EOS_ID
-from utils.text_utils import segment
-from train import create_model
-
-
-def decode(sess, model, data_reader, data_to_decode,
-           corrective_tokens=None, verbose=True):
-    """
-    Infer the correction sentence
-    :param sess:
-    :param model:
-    :param data_reader:
-    :param data_to_decode: an iterable of token lists representing the input
-    data we want to decode
-    :param corrective_tokens
-    :param verbose:
-    :return:
-    """
-    model.batch_size = 1
-    corrective_tokens_mask = np.zeros(model.target_vocab_size)
-    corrective_tokens_mask[EOS_ID] = 1.0
-
-    if corrective_tokens is None:
-        corrective_tokens = set()
-    for tokens in corrective_tokens:
-        for token in tokens:
-            corrective_tokens_mask[data_reader.convert_token_2_id(token)] = 1.0
-
-    for tokens in data_to_decode:
-        token_ids = [data_reader.convert_token_2_id(token) for token in tokens]
-
-        # Which bucket does it belong to?
-        matching_buckets = [b for b in range(len(model.buckets))
-                            if model.buckets[b][0] > len(token_ids)]
-        if not matching_buckets:
-            # The input string has more tokens than the largest bucket, so we
-            # have to skip it.
-            continue
-
-        bucket_id = min(matching_buckets)
-
-        # Get a 1-element batch to feed the sentence to the model.
-        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
-            {bucket_id: [(token_ids, [])]}, bucket_id)
-
-        # Get output logits for the sentence.
-        _, _, output_logits = model.step(
-            sess, encoder_inputs, decoder_inputs, target_weights, bucket_id,
-            True, corrective_tokens=corrective_tokens_mask)
-
-        oov_input_tokens = [token for token in tokens if
-                            data_reader.is_unknown_token(token)]
-        outputs = []
-        next_oov_token_idx = 0
-
-        for logit in output_logits:
-            max_likelihood_token_id = int(np.argmax(logit, axis=1))
-            # Check if this logit most likely points to the EOS identifier.
-            if max_likelihood_token_id == EOS_ID:
-                break
-
-            token = data_reader.convert_id_2_token(max_likelihood_token_id)
-            if data_reader.is_unknown_token(token):
-                # Replace the "unknown" token with the most probable OOV
-                # token from the input.
-                if next_oov_token_idx < len(oov_input_tokens):
-                    # If we still have OOV input tokens available,
-                    # pick the next available one.
-                    token = oov_input_tokens[next_oov_token_idx]
-                    # Advance to the next OOV input token.
-                    next_oov_token_idx += 1
-                else:
-                    # If we've already used all OOV input tokens,
-                    # then we just leave the token as "UNK"
-                    pass
-            outputs.append(token)
-        if verbose:
-            decoded_sentence = " ".join(outputs)
-            print("Input: {}".format(" ".join(tokens)))
-            print("Output: {}\n".format(decoded_sentence))
-        yield outputs
-
-
-def decode_sentence(sess, model, data_reader, sentence, corrective_tokens=set(),
-                    verbose=True):
-    """Used with InteractiveSession in IPython """
-    return next(decode(sess, model, data_reader, [segment(sentence, 'char')],
-                       corrective_tokens=corrective_tokens, verbose=verbose))
-
-
-def evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path,
-                      max_samples=None):
-    """Evaluates the accuracy and BLEU score of the given model."""
-
-    import nltk  # Loading here to avoid having to bundle it in lambda.
-
-    # Build a collection of "baseline" and model-based hypotheses, where the
-    # baseline is just the (potentially errant) source sequence.
-    baseline_hypotheses = defaultdict(list)  # The model's input
-    model_hypotheses = defaultdict(list)  # The actual model's predictions
-    targets = defaultdict(list)  # Groundtruth
-
-    errors = []
-
-    n_samples_by_bucket = defaultdict(int)
-    n_correct_model_by_bucket = defaultdict(int)
-    n_correct_baseline_by_bucket = defaultdict(int)
-    n_samples = 0
-
-    # Evaluate the model against all samples in the test data set.
-    for source, target in data_reader.read_samples_by_string(test_path):
-        matching_buckets = [i for i, bucket in enumerate(model.buckets) if
-                            len(source) < bucket[0]]
-        if not matching_buckets:
-            continue
-
-        bucket_id = matching_buckets[0]
-
-        decoding = next(
-            decode(sess, model, data_reader, [source],
-                   corrective_tokens=corrective_tokens, verbose=False))
-        model_hypotheses[bucket_id].append(decoding)
-        if decoding == target:
-            n_correct_model_by_bucket[bucket_id] += 1
-        else:
-            errors.append((decoding, target))
-
-        baseline_hypotheses[bucket_id].append(source)
-        if source == target:
-            n_correct_baseline_by_bucket[bucket_id] += 1
-
-        # nltk.corpus_bleu expects a list of one or more reference
-        # translations per sample, so we wrap the target list in another list
-        targets[bucket_id].append([target])
-
-        n_samples_by_bucket[bucket_id] += 1
-        n_samples += 1
-
-        if max_samples is not None and n_samples > max_samples:
-            break
-
-    # Measure the corpus BLEU score and accuracy for the model and baseline
-    # across all buckets.
-    for bucket_id in targets.keys():
-        baseline_bleu_score = nltk.translate.bleu_score.corpus_bleu(
-            targets[bucket_id], baseline_hypotheses[bucket_id])
-        model_bleu_score = nltk.translate.bleu_score.corpus_bleu(
-            targets[bucket_id], model_hypotheses[bucket_id])
-        print("Bucket {}: {}".format(bucket_id, model.buckets[bucket_id]))
-        print("\tBaseline BLEU = {:.4f}\n\tModel BLEU = {:.4f}".format(
-            baseline_bleu_score, model_bleu_score))
-        print("\tBaseline Accuracy: {:.4f}".format(
-            1.0 * n_correct_baseline_by_bucket[bucket_id] /
-            n_samples_by_bucket[bucket_id]))
-        print("\tModel Accuracy: {:.4f}".format(
-            1.0 * n_correct_model_by_bucket[bucket_id] /
-            n_samples_by_bucket[bucket_id]))
-
-    return errors
-
-
-def main(_):
-    print('Correcting error...')
-    # Set the model path.
-    model_path = cged_config.model_path
-    data_reader = CGEDReader(cged_config, cged_config.train_path)
-
-    if cged_config.enable_decode_sentence:
-        # Correct user's sentences.
-        with tf.Session() as session:
-            model = create_model(session, True, model_path, config=cged_config)
-            print("Enter a sentence you'd like to correct")
-            correct_new_sentence = input()
-            while correct_new_sentence.lower() != 'no':
-                decode_sentence(session, model=model, data_reader=data_reader,
-                                sentence=correct_new_sentence,
-                                corrective_tokens=data_reader.read_tokens(cged_config.train_path))
-                print("Enter a sentence you'd like to correct or press NO")
-                correct_new_sentence = input()
-    elif cged_config.enable_test_decode:
-        # Decode test sentences.
-        with tf.Session() as session:
-            model = create_model(session, True, model_path, config=cged_config)
-            print("Loaded model. Beginning decoding.")
-            decodings = decode(session, model=model, data_reader=data_reader,
-                               data_to_decode=data_reader.read_tokens(cged_config.test_path, is_infer=True),
-                               corrective_tokens=data_reader.read_tokens(cged_config.train_path))
-            # Write the decoded tokens to stdout.
-            for tokens in decodings:
-                sys.stdout.flush()
+import numpy as np
+from keras.layers import Input
+from keras.models import Model, load_model
+
+from pycorrector.seq2seq import cged_config as config
+from pycorrector.seq2seq.corpus_reader import GO_TOKEN
+from pycorrector.seq2seq.corpus_reader import CGEDReader
+from pycorrector.utils.io_utils import get_logger
+from pycorrector.seq2seq.reader import EOS_TOKEN
+
+logger = get_logger(__name__)
+
+
+def decode_sequence(model, rnn_hidden_dim, input_token_index,
+                    num_decoder_tokens, target_token_index, encoder_input_data,
+                    reverse_target_char_index, max_decoder_seq_length):
+    # construct the encoder and decoder
+    encoder_inputs = model.input[0]  # input_1
+    encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
+    encoder_states = [state_h_enc, state_c_enc]
+    encoder_model = Model(encoder_inputs, encoder_states)
+
+    decoder_inputs = model.input[1]  # input_2
+    decoder_state_input_h = Input(shape=(rnn_hidden_dim,), name='input_3')
+    decoder_state_input_c = Input(shape=(rnn_hidden_dim,), name='input_4')
+    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+    decoder_lstm = model.layers[3]
+    decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
+        decoder_inputs, initial_state=decoder_states_inputs)
+    decoder_states = [state_h_dec, state_c_dec]
+    decoder_dense = model.layers[4]
+    decoder_outputs = decoder_dense(decoder_outputs)
+    decoder_model = Model(
+        [decoder_inputs] + decoder_states_inputs,
+        [decoder_outputs] + decoder_states)
+
+    # Reverse-lookup token index to decode sequences back to
+    # something readable.
+    reverse_input_char_index = dict(
+        (i, char) for char, i in input_token_index.items())
+    reverse_target_char_index = dict(
+        (i, char) for char, i in target_token_index.items())
+
+    # Encode the input as state vectors.
+    states_value = encoder_model.predict(encoder_input_data)
+
+    # Generate empty target sequence of length 1.
+    target_seq = np.zeros((1, 1, num_decoder_tokens))
+    # Populate the first character of target sequence with the start character.
+    # target_seq[0, 0, target_token_index[first_char]] = 1.
+
+    # Sampling loop for a batch of sequences
+    # (to simplify, here we assume a batch of size 1).
+    stop_condition = False
+    decoded_sentence = ''
+
+    while not stop_condition:
+        output_tokens, h, c = decoder_model.predict(
+            [target_seq] + states_value)
+
+        # Sample a token
+        sampled_token_index = np.argmax(output_tokens[0, -1, :])
+        sampled_char = reverse_target_char_index[sampled_token_index]
+        decoded_sentence += sampled_char
+
+        # Exit condition: either hit max length
+        # or find stop character.
+        if (sampled_char == EOS_TOKEN or
+                len(decoded_sentence) > max_decoder_seq_length):
+            stop_condition = True
+
+        # Update the target sequence (of length 1).
+        target_seq = np.zeros((1, 1, num_decoder_tokens))
+        target_seq[0, 0, sampled_token_index] = 1.
+
+        # Update states
+        states_value = [h, c]
+
+    return decoded_sentence
+
+
+def infer(train_path=None,
+          test_path=None,
+          save_model_path=None,
+          rnn_hidden_dim=200):
+    data_reader = CGEDReader(train_path)
+    input_texts, target_texts = data_reader.build_dataset(test_path)
+
+    input_characters = data_reader.read_vocab(input_texts)
+    target_characters = data_reader.read_vocab(target_texts)
+    num_encoder_tokens = len(input_characters)
+    num_decoder_tokens = len(target_characters)
+    max_encoder_seq_len = max([len(text) for text in input_texts])
+    max_decoder_seq_len = max([len(text) for text in target_texts])
+
+    print('num of samples:', len(input_texts))
+    print('num of unique input tokens:', num_encoder_tokens)
+    print('num of unique output tokens:', num_decoder_tokens)
+    print('max sequence length for inputs:', max_encoder_seq_len)
+    print('max sequence length for outputs:', max_decoder_seq_len)
+
+    input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
+    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
+
+    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_len, num_encoder_tokens), dtype='float32')
+
+    # one hot representation
+    for i, input_text in enumerate(input_texts):
+        for t, char in enumerate(input_text):
+            encoder_input_data[i, t, input_token_index[char]] = 1.0
+    logger.info("Data loaded.")
+
+    # model
+    logger.info("Infer seq2seq model...")
+    model = load_model(save_model_path)
+
+    decoded_sentences = decode_sequence(model, encoder_input_data, )
+    for seq_index in input_text:
+        print('-')
+        print('Input sentence:', input_texts[seq_index])
+        print('Decoded sentence:', decoded_sentences[seq_index])
+
+    logger.info("Infer has finished.")


 if __name__ == "__main__":
-    tf.app.run()
+    infer(train_path=config.train_path,
+          test_path=config.test_path,
+          save_model_path=config.save_model_path,
+          rnn_hidden_dim=config.rnn_hidden_dim)
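Note: as committed, the call decode_sequence(model, encoder_input_data, ) does not match the eight-parameter signature defined above, and the result loop iterates over input_text (a string left over from the one-hot loop) rather than over sample indices. A corrected call site, under the signature in this hunk, would look like:

    for seq_index in range(len(input_texts)):
        decoded = decode_sequence(model, rnn_hidden_dim, input_token_index,
                                  num_decoder_tokens, target_token_index,
                                  encoder_input_data[seq_index: seq_index + 1],
                                  None, max_decoder_seq_len)  # reverse index is rebuilt inside
        print('-')
        print('Input sentence:', input_texts[seq_index])
        print('Decoded sentence:', decoded)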
@@ -5,7 +5,7 @@ from xml.dom import minidom

 from sklearn.model_selection import train_test_split

-import cged_config as config
+import pycorrector.seq2seq.cged_config as config
 from utils.text_utils import segment

@@ -14,12 +14,8 @@ GO_TOKEN = 'GO'


 class Reader:
-    def __init__(self, config, train_path=None, token_2_id=None,
-                 special_tokens=(), dataset_copies=1):
-        self.config = config
-        self.dataset_copies = dataset_copies
-        # Vocabulary
-        max_vocab_size = config.max_vocab_size
+    def __init__(self, train_path=None, token_2_id=None,
+                 special_tokens=()):
         if train_path is None:
             self.token_2_id = token_2_id
         else:
@@ -36,7 +32,7 @@ class Reader:
         vocab[0:0] = special_tokens
         full_token_id = list(zip(vocab, range(len(vocab))))
         self.full_token_2_id = dict(full_token_id)
-        self.token_2_id = dict(full_token_id[:max_vocab_size])
+        self.token_2_id = dict(full_token_id)
         self.id_2_token = {v: k for k, v in self.token_2_id.items()}

     def read_tokens(self, path):
@@ -115,13 +111,21 @@
             target.append(EOS_ID)
             yield source, target

+    def read_samples_tokens(self, path):
+        """
+        Read sample of path's data
+        :param path:
+        :return: generate list
+        """
+        for source_words, target_words in self.read_samples_by_string(path):
+            target = target_words
+            target.append(EOS_TOKEN)
+            yield source_words, target
+
     def build_dataset(self, path):
-        dataset = [[] for _ in self.config.buckets]
-        # Copy the data set for different dropouts
-        for _ in range(self.dataset_copies):
-            for source, target in self.read_samples(path):
-                for bucket_id, (source_size, target_size) in enumerate(self.config.buckets):
-                    if len(source) < source_size and len(target) < target_size:
-                        dataset[bucket_id].append([source, target])
-                        break
-        return dataset
+        print('Read data, path:{0}'.format(path))
+        sources, targets = [], []
+        for source, target in self.read_samples_tokens(path):
+            sources.append(source)
+            targets.append(target)
+        return sources, targets
(File diff suppressed because it is too large.)
@ -0,0 +1,144 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Author: XuMing <xuming624@qq.com>
|
||||||
|
# Brief:
|
||||||
|
from keras.layers import Input, LSTM, Dense
|
||||||
|
from keras.models import Model
|
||||||
|
from keras.callbacks import LambdaCallback
|
||||||
|
from keras.callbacks import ModelCheckpoint
|
||||||
|
from pycorrector.seq2seq.reader import EOS_TOKEN
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim=200):
|
||||||
|
# Define an input sequence and process it.
|
||||||
|
encoder_inputs = Input(shape=(None, num_encoder_tokens))
|
||||||
|
encoder = LSTM(rnn_hidden_dim, return_state=True)
|
||||||
|
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
|
||||||
|
# We discard `encoder_outputs` and only keep the states.
|
||||||
|
encoder_states = [state_h, state_c]
|
||||||
|
|
||||||
|
# Set up the decoder, using `encoder_states` as initial state.
|
||||||
|
decoder_inputs = Input(shape=(None, num_decoder_tokens))
|
||||||
|
# We set up our decoder to return full output sequences,
|
||||||
|
# and to return internal states as well. We don't use the
|
||||||
|
# return states in the training model, but we will use them in inference.
|
||||||
|
decoder_lstm = LSTM(rnn_hidden_dim, return_sequences=True, return_state=True)
|
||||||
|
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
|
||||||
|
initial_state=encoder_states)
|
||||||
|
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
|
||||||
|
decoder_outputs = decoder_dense(decoder_outputs)
|
||||||
|
|
||||||
|
# Define the model that will turn
|
||||||
|
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
|
||||||
|
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
|
||||||
|
|
||||||
|
# Run training
|
||||||
|
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def eval(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim, input_token_index, target_token_index,
|
||||||
|
max_decoder_seq_length, encoder_input_data, input_texts):
|
||||||
|
# Define an input sequence and process it.
|
||||||
|
encoder_inputs = Input(shape=(None, num_encoder_tokens))
|
||||||
|
encoder = LSTM(rnn_hidden_dim, return_state=True)
|
||||||
|
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
|
||||||
|
# We discard `encoder_outputs` and only keep the states.
|
||||||
|
encoder_states = [state_h, state_c]
|
||||||
|
|
||||||
|
# Set up the decoder, using `encoder_states` as initial state.
|
||||||
|
decoder_inputs = Input(shape=(None, num_decoder_tokens))
|
||||||
|
# We set up our decoder to return full output sequences,
|
||||||
|
# and to return internal states as well. We don't use the
|
||||||
|
# return states in the training model, but we will use them in inference.
|
||||||
|
decoder_lstm = LSTM(rnn_hidden_dim, return_sequences=True, return_state=True)
|
||||||
|
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
|
||||||
|
initial_state=encoder_states)
|
||||||
|
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
|
||||||
|
decoder_outputs = decoder_dense(decoder_outputs)
|
||||||
|
|
||||||
|
# Define sampling models
|
||||||
|
encoder_model = Model(encoder_inputs, encoder_states)
|
||||||
|
|
||||||
|
decoder_state_input_h = Input(shape=(rnn_hidden_dim,))
|
||||||
|
decoder_state_input_c = Input(shape=(rnn_hidden_dim,))
|
||||||
|
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
|
||||||
|
decoder_outputs, state_h, state_c = decoder_lstm(
|
||||||
|
decoder_inputs, initial_state=decoder_states_inputs)
|
||||||
|
decoder_states = [state_h, state_c]
|
||||||
|
decoder_outputs = decoder_dense(decoder_outputs)
|
||||||
|
decoder_model = Model(
|
||||||
|
[decoder_inputs] + decoder_states_inputs,
|
||||||
|
[decoder_outputs] + decoder_states)
|
||||||
|
|
||||||
|
# Reverse-lookup token index to decode sequences back to
|
||||||
|
# something readable.
|
||||||
|
reverse_input_char_index = dict(
|
||||||
|
(i, char) for char, i in input_token_index.items())
|
||||||
|
reverse_target_char_index = dict(
|
||||||
|
(i, char) for char, i in target_token_index.items())
|
||||||
|
|
||||||
|
def decode_sequence(input_seq, seq_index):
|
||||||
|
# Encode the input as state vectors.
|
||||||
|
states_value = encoder_model.predict(input_seq)
|
||||||
|
|
||||||
|
# Generate empty target sequence of length 1.
|
||||||
|
target_seq = np.zeros((1, 1, num_decoder_tokens))
|
||||||
|
# Populate the first character of target sequence with the start character.
|
||||||
|
first_char = input_texts[seq_index][0]
|
||||||
|
print('first char:', first_char)
|
||||||
|
target_seq[0, 0, target_token_index[first_char]] = 1.
|
||||||
|
|
||||||
|
# Sampling loop for a batch of sequences
|
||||||
|
# (to simplify, here we assume a batch of size 1).
|
||||||
|
stop_condition = False
|
||||||
|
decoded_sentence = first_char
|
||||||
|
while not stop_condition:
|
||||||
|
output_tokens, h, c = decoder_model.predict(
|
||||||
|
[target_seq] + states_value)
|
||||||
|
|
||||||
|
# Sample a token
|
||||||
|
sampled_token_index = np.argmax(output_tokens[0, -1, :])
|
||||||
|
sampled_char = reverse_target_char_index[sampled_token_index]
|
||||||
|
if sampled_char != EOS_TOKEN:
|
||||||
|
decoded_sentence += sampled_char
|
||||||
|
|
||||||
|
# Exit condition: either hit max length
|
||||||
|
# or find stop character.
|
||||||
|
if (sampled_char == EOS_TOKEN or
|
||||||
|
len(decoded_sentence) > max_decoder_seq_length):
|
||||||
|
stop_condition = True
|
||||||
|
|
||||||
|
# Update the target sequence (of length 1).
|
||||||
|
target_seq = np.zeros((1, 1, num_decoder_tokens))
|
||||||
|
target_seq[0, 0, sampled_token_index] = 1.
|
||||||
|
|
||||||
|
# Update states
|
||||||
|
states_value = [h, c]
|
||||||
|
|
||||||
|
return decoded_sentence
|
||||||
|
|
||||||
|
for seq_index in range(10):
|
||||||
|
# Take one sequence (part of the training set)
|
||||||
|
# for trying out decoding.
|
||||||
|
input_seq = encoder_input_data[seq_index: seq_index + 1]
|
||||||
|
decoded_sentence = decode_sequence(input_seq, seq_index)
|
||||||
|
|
||||||
|
print('Input sentence:', input_texts[seq_index])
|
||||||
|
print('Decoded sentence:', decoded_sentence)
|
||||||
|
print('-')
|
||||||
|
|
||||||
|
|
||||||
|
def callback(save_model_path, logger=None):
    # Print the batch number at the beginning of every batch.
    if logger:
        batch_print_callback = LambdaCallback(
            on_batch_begin=lambda batch, logs: logger.info('batch: %d' % batch))
    else:
        batch_print_callback = LambdaCallback(
            on_batch_begin=lambda batch, logs: print(batch))
    # define the checkpoint, save model
    checkpoint = ModelCheckpoint(save_model_path,
                                 save_best_only=True,
                                 verbose=1)
    return [batch_print_callback, checkpoint]
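
With save_best_only=True, Keras's ModelCheckpoint compares epochs on its
monitored metric, which defaults to val_loss; a fit() call without any
validation data (as in train() below) leaves that metric undefined, so the
checkpoint is skipped with a warning every epoch. A minimal sketch of a fit
call that gives the checkpoint something to monitor, assuming the same arrays
as in train() and an arbitrary 10% hold-out:

    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,  # assumed split, not in the code above
              callbacks=callback(save_model_path, logger))
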
@@ -2,143 +2,76 @@
 # Author: XuMing <xuming624@qq.com>
 # Brief: Train seq2seq model for text grammar error correction
 
-import math
-import os
-import sys
-import time
 
 import numpy as np
-import tensorflow as tf
 
-from pycorrector.seq2seq import cged_config
+from pycorrector.seq2seq import cged_config as config
 from pycorrector.seq2seq.corpus_reader import CGEDReader
-from pycorrector.seq2seq.corrector_model import CorrectorModel
-from pycorrector.utils.tf_utils import get_ckpt_path
+from pycorrector.seq2seq.seq2seq_model import create_model, callback, eval
 from pycorrector.utils.io_utils import get_logger
 
+logger = get_logger(__name__)
 
 
-def create_model(session, forward_only, model_path, config=cged_config):
-    """
-    Create model and load parameters
-    :param session:
-    :param forward_only:
-    :param model_path:
-    :param config:
-    :return:
-    """
-    model = CorrectorModel(
-        config.max_vocab_size,
-        config.max_vocab_size,
-        config.buckets,
-        config.size,
-        config.num_layers,
-        config.max_gradient_norm,
-        config.batch_size,
-        config.learning_rate,
-        config.learning_rate_decay_factor,
-        config.use_lstm,
-        forward_only=forward_only,
-        config=config)
-    ckpt_path = get_ckpt_path(model_path)
-    if ckpt_path:
-        print("Read model parameters from %s" % ckpt_path)
-        model.saver.restore(session, ckpt_path)
-    else:
-        print('Create model...')
-        session.run(tf.global_variables_initializer())
-    return model
-
-
-def train(data_reader, train_path, test_path, model_path):
-    print('Read data, train:{0}, test:{1}'.format(train_path, test_path))
-    config = data_reader.config
-    train_data = data_reader.build_dataset(train_path)
-    test_data = data_reader.build_dataset(test_path)
-
-    with tf.Session() as sess:
-        # Create model
-        print('Create %d layers of %d units.' % (config.num_layers, config.size))
-        model = create_model(sess, False, model_path, config=config)
-        # Read data into buckets
-        train_bucket_sizes = [len(train_data[b]) for b in range(len(config.buckets))]
-        print("Training bucket sizes:{}".format(train_bucket_sizes))
-        train_total_size = float(sum(train_bucket_sizes))
-        print("Total train size:{}".format(train_total_size))
-
-        # Bucket scale
-        train_buckets_scale = [
-            sum(train_bucket_sizes[:i + 1]) / train_total_size
-            for i in range(len(train_bucket_sizes))]
-
-        # This is the training loop.
-        step_time, loss = 0.0, 0.0
-        current_step = 0
-        previous_losses = []
-        while current_step < config.max_steps:
-            # Choose a bucket according to data distribution. We pick a random
-            # number in [0, 1] and use the corresponding interval in
-            # train_buckets_scale.
-            random_number_01 = np.random.random_sample()
-            bucket_id = min([i for i in range(len(train_buckets_scale))
-                             if train_buckets_scale[i] > random_number_01])
-
-            # Get a batch and make a step.
-            start_time = time.time()
-            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
-                train_data, bucket_id)
-            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
-                                         target_weights, bucket_id, False)
-            step_time += (time.time() - start_time) / config.steps_per_checkpoint
-            loss += step_loss / config.steps_per_checkpoint
-            current_step += 1
-
-            # Once in a while, we save checkpoint, print statistics, and run
-            # evals.
-            if current_step % config.steps_per_checkpoint == 0:
-                # Print statistics for the previous epoch.
-                perplexity = math.exp(float(loss)) if loss < 300 else float(
-                    "inf")
-                print("global step %d learning rate %.4f step-time %.2f "
-                      "perplexity %.2f" % (
-                          model.global_step.eval(), model.learning_rate.eval(),
-                          step_time, perplexity))
-                # Decrease learning rate if no improvement was seen over last
-                # 3 times.
-                if len(previous_losses) > 2 and loss > max(
-                        previous_losses[-3:]):
-                    sess.run(model.learning_rate_decay_op)
-                previous_losses.append(loss)
-                # Save checkpoint and zero timer and loss.
-                checkpoint_path = os.path.join(model_path, "translate.ckpt")
-                model.saver.save(sess, checkpoint_path,
-                                 global_step=model.global_step)
-                step_time, loss = 0.0, 0.0
-                # Run evals on development set and print their perplexity.
-                for bucket_id in range(len(config.buckets)):
-                    if len(test_data[bucket_id]) == 0:
-                        print(" eval: empty bucket %d" % bucket_id)
-                        continue
-                    encoder_inputs, decoder_inputs, target_weights = \
-                        model.get_batch(test_data, bucket_id)
-                    _, eval_loss, _ = model.step(sess, encoder_inputs,
-                                                 decoder_inputs,
-                                                 target_weights, bucket_id,
-                                                 True)
-                    eval_ppx = math.exp(
-                        float(eval_loss)) if eval_loss < 300 else float(
-                        "inf")
-                    print(" eval: bucket %d perplexity %.2f" % (
-                        bucket_id, eval_ppx))
-                    sys.stdout.flush()
-
-
-def main(_):
+def train(train_path=None,
+          save_model_path=None,
+          batch_size=64,
+          epochs=10,
+          rnn_hidden_dim=200):
     print('Training model...')
-    data_reader = CGEDReader(cged_config, cged_config.train_path)
-    train(data_reader,
-          cged_config.train_path,
-          cged_config.test_path,
-          cged_config.model_path)
+    data_reader = CGEDReader(train_path)
+    input_texts, target_texts = data_reader.build_dataset(train_path)
+    print('input_texts:', input_texts[0])
+    print('target_texts:', target_texts[0])
+
+    input_characters = data_reader.read_vocab(input_texts)
+    target_characters = data_reader.read_vocab(target_texts)
+    num_encoder_tokens = len(input_characters)
+    num_decoder_tokens = len(target_characters)
+    max_encoder_seq_len = max([len(text) for text in input_texts])
+    max_decoder_seq_len = max([len(text) for text in target_texts])
+
+    print('num of samples:', len(input_texts))
+    print('num of unique input tokens:', num_encoder_tokens)
+    print('num of unique output tokens:', num_decoder_tokens)
+    print('max sequence length for inputs:', max_encoder_seq_len)
+    print('max sequence length for outputs:', max_decoder_seq_len)
+
+    input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
+    target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
+
+    encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_len, num_encoder_tokens), dtype='float32')
+    decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')
+    decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype='float32')
+
+    # one hot representation
+    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
+        for t, char in enumerate(input_text):
+            encoder_input_data[i, t, input_token_index[char]] = 1.0
+        for t, char in enumerate(target_text):
+            # decoder_target_data is ahead of decoder_input_data by one timestep
+            decoder_input_data[i, t, target_token_index[char]] = 1.0
+            if t > 0:
+                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
+    logger.info("Data loaded.")
+
+    # model
+    logger.info("Training seq2seq model...")
+    model = create_model(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim)
+    # save
+    callbacks_list = callback(save_model_path, logger)
+    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
+              batch_size=batch_size,
+              epochs=epochs,
+              callbacks=callbacks_list)
+    logger.info("Training has finished.")
+
+    eval(num_encoder_tokens, num_decoder_tokens, rnn_hidden_dim, input_token_index, target_token_index,
+         max_decoder_seq_len, encoder_input_data, input_texts)
 
 
 if __name__ == "__main__":
-    tf.app.run()  # CPU, i5, about ten hours
+    train(train_path=config.train_path,
+          save_model_path=config.save_model_path,
+          batch_size=config.batch_size,
+          epochs=config.epochs,
+          rnn_hidden_dim=config.rnn_hidden_dim)
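
The one-hot loop in train() is standard teacher forcing: at timestep t the
decoder is fed the gold character t while decoder_target_data asks it to
predict character t + 1. A self-contained toy illustration of the shift,
using an assumed two-character vocabulary rather than the real corpus:

    import numpy as np

    toy_index = {'a': 0, 'b': 1}  # assumed toy vocabulary
    dec_in = np.zeros((1, 2, 2))
    dec_tgt = np.zeros((1, 2, 2))
    for t, ch in enumerate('ab'):
        dec_in[0, t, toy_index[ch]] = 1.0           # decoder reads 'a', then 'b'
        if t > 0:
            dec_tgt[0, t - 1, toy_index[ch]] = 1.0  # while reading 'a', predict 'b'
    print(dec_in[0])   # [[1. 0.] [0. 1.]]
    print(dec_tgt[0])  # [[0. 1.] [0. 0.]]

These dense tensors grow as samples * max_len * vocab * 4 bytes, so for a
large character vocabulary an Embedding layer over integer indices is the
usual way to keep memory in check.
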
@@ -135,7 +135,10 @@ def segment(sentence, cut_type='word', pos=False, None_flag='O'):
         return word_seq, pos_seq
     elif cut_type == 'char':
         word_seq = list(sentence)
-        pos_seq = [None_flag for _ in word_seq]
+        pos_seq = []
+        for w in word_seq:
+            w_p = posseg.lcut(w)
+            pos_seq.append(w_p[0].flag)
         return word_seq, pos_seq
     else:
         if cut_type == 'word':
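
With this change, character-level segmentation asks jieba's posseg for a real
part-of-speech flag per character instead of padding with None_flag. A quick
illustrative check (the exact flags depend on jieba's dictionary):

    from jieba import posseg

    for w in list('我喜欢北京'):
        print(w, posseg.lcut(w)[0].flag)  # one POS flag per character

Tagging one character at a time is simple but slow on a large corpus; running
posseg.lcut over the whole sentence once and expanding each word's flag to its
characters would give similar labels at a fraction of the cost.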