- #!/usr/bin/env python3
- # -*- coding:utf-8 -*-
- """
- Copyright 2018 The Google AI Language Team Authors.
- Based on Google_BERT.
- Reference: zhoukaiyin/
-
- @Author:Macan
- """
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import collections
- import os
- import sys
- import json
- import logging
- import copy
- import re
-
- ## Set random seeds for reproducibility.
- import numpy as np
- np.random.seed(26)
- import tensorflow as tf
- tf.set_random_seed(26)
-
- import codecs
- from tensorflow.contrib.layers.python.layers import initializers
- from tensorflow.contrib import estimator
- import modeling
- import optimization
- # import optimization_layerwise as optimization
- import tokenization
- from tensorflow.contrib import rnn
- from tensorflow.contrib import crf
- import pickle
-
-
- flags = tf.flags
- FLAGS = flags.FLAGS
-
- flags.DEFINE_string(
- "data_dir", '../data',
- "The input datadir.",
- )
-
- flags.DEFINE_string(
- "bert_config_file", '../../bert/multi_cased_L-12_H-768_A-12/bert_config.json',
- "The config json file corresponding to the pre-trained BERT model."
- )
-
- flags.DEFINE_string(
- "task_name", 'NER', "The name of the task to train."
- )
-
- flags.DEFINE_string(
- "output_dir", None,
- "The output directory where the model checkpoints will be written."
- )
-
- # Other parameters
- flags.DEFINE_string(
- "init_checkpoint", '../../bert/meddocan_corpus/pretraining_output/model.ckpt-30000',
- "Initial checkpoint (usually from a pre-trained BERT model)."
- )
-
-
- flags.DEFINE_bool(
- "do_lower_case", True,
- "Whether to lower case the input text."
- )
-
- flags.DEFINE_integer(
- "max_seq_length", 256,
- "The maximum total input sequence length after WordPiece tokenization."
- )
-
- flags.DEFINE_integer(
- "lstm_size", 128,
- "The size of bilstm cell."
- )
-
- flags.DEFINE_bool(
- "do_train", True,
- "Whether to run training."
- )
-
- flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
-
- flags.DEFINE_bool("clean", True, "Whether to clean last training files.")
-
- flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
- flags.DEFINE_bool("do_predict", True,"Whether to run the model in inference mode on the test set.")
-
- flags.DEFINE_integer("train_batch_size", 12, "Total batch size for training.")
-
- flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
-
- flags.DEFINE_integer("predict_batch_size", 12, "Total batch size for predict.")
-
- flags.DEFINE_float("learning_rate", 1e-5, "The initial learning rate for Adam.")
-
- flags.DEFINE_float("num_train_epochs", 10, "Total number of training epochs to perform.")
-
- flags.DEFINE_bool("use_bilstm", False, "Whether to use BiLSTM.")
-
- flags.DEFINE_bool("use_crf", True, "Whether to use CRF.")
-
- flags.DEFINE_float(
- "warmup_proportion", 0.1,
- "Proportion of training to perform linear learning rate warmup for. "
- "E.g., 0.1 = 10% of training.")
-
- flags.DEFINE_integer("save_checkpoints_steps", 1000,
- "How often to save the model checkpoint.")
-
- flags.DEFINE_integer("iterations_per_loop", 1000,
- "How many steps to make in each estimator call.")
-
- flags.DEFINE_string("vocab_file", '../../bert/multi_cased_L-12_H-768_A-12/vocab.txt',
- "The vocabulary file that the BERT model was trained on.")
- tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
- flags.DEFINE_integer(
- "num_tpu_cores", 8,
- "Only used if `use_tpu` is True. Total number of TPU cores to use.")
-
-
-
-
- class InputExample(object):
- """A single training/test example for simple sequence classification."""
-
- def __init__(self, guid, text, label=None):
- """Constructs a InputExample.
-
- Args:
- guid: Unique id for the example.
- text: string. The untokenized text of the sequence (whitespace-separated tokens).
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.guid = guid
- self.text = text
- self.label = label
-
- class InputFeatures(object):
- """A single set of features of data."""
-
- def __init__(self, input_ids, input_mask, segment_ids, label_ids):
- self.input_ids = input_ids
- self.input_mask = input_mask
- self.segment_ids = segment_ids
- self.label_ids = label_ids
-
- class DataProcessor(object):
- """Base class for data converters for sequence classification data sets."""
-
- def get_train_examples(self, data_dir):
- """Gets a collection of `InputExample`s for the train set."""
- raise NotImplementedError()
-
- def get_dev_examples(self, data_dir):
- """Gets a collection of `InputExample`s for the dev set."""
- raise NotImplementedError()
-
- def get_labels(self):
- """Gets the list of labels for this data set."""
- raise NotImplementedError()
-
- @classmethod
- def _read_data(cls, input_file):
- """Reads a BIO data."""
- with codecs.open(input_file, 'r', encoding='utf-8') as f:
- lines = []
- words = []
- labels = []
- for line in f:
- contends = line.strip()
- tokens = contends.split()
- if len(tokens) >= 2 and (not contends.startswith("-DOCSTART-")):
- word = tokens[0]
- label = tokens[-1]
- else:
- if len(contends) == 0:
- l = ' '.join([label for label in labels if len(label) > 0])
- w = ' '.join([word for word in words if len(word) > 0])
- lines.append([w, l])
- words = []
- labels = []
- continue
- if contends.startswith("-DOCSTART-"):
- continue
-
- words.append(word)
- labels.append(label)
- return lines
-
-
- class NerProcessor(DataProcessor):
- def get_train_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "final_train_19_20.txt")), "train"
- )
-
- def get_dev_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "revise_dev.txt")), "dev"
- )
-
- def get_test_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "revise_dev.txt")), "test")
-
- def get_labels(self):
- labels = []
- for datapath in ['final_train_19_20.txt','revise_dev.txt']:
- filepath = os.path.join(FLAGS.data_dir,datapath)
- with open(filepath, 'r', encoding='utf-8') as f:
- lines = f.readlines()
- for line in lines:
- content = line.strip().split()
- if len(content) >=2 and '-DOCSTART-' not in line:
- l = content[-1]
- if l not in labels:
- labels.append(l)
-
- labels = sorted(list(set(labels))+["X", "[CLS]", "[SEP]"])
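- # "X" labels WordPiece continuation sub-tokens, while "[CLS]" and "[SEP]" label the special
- # start/end tokens that convert_single_example adds to every sequence.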
-
- return labels
-
-
- def _create_example(self, lines, set_type):
- examples = []
- for (i, line) in enumerate(lines):
- guid = "%s-%s" % (set_type, i)
- text = tokenization.convert_to_unicode(line[0])
- label = tokenization.convert_to_unicode(line[1])
- examples.append(InputExample(guid=guid, text=text, label=label))
- return examples
-
-
- def write_tokens(tokens, mode):
- """
- 将序列解析结果写入到文件中
- 只在mode=test的时候启用
- :param tokens:
- :param mode:
- :return:
- """
- if mode == "test":
- path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
- wf = codecs.open(path, 'a', encoding='utf-8')
- for token in tokens:
- if token != "**NULL**":
- wf.write(token + '\n')
- wf.close()
-
-
- def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode):
- """
- 将一个样本进行分析,然后将字转化为id, 标签转化为id,然后结构化到InputFeatures对象中
- :param ex_index: index
- :param example: 一个样本
- :param label_list: 标签列表
- :param max_seq_length:
- :param tokenizer:
- :param mode:
- :return:
- """
- textlist = example.text.split(' ')
- labellist = example.label.split(' ')
- tokens = []
- labels = []
-
- ## Record how each word maps to its WordPiece sub-tokens.
- restore_map = []
-
- for i, word in enumerate(textlist):
- # Tokenize each word. For Chinese this amounts to splitting into characters, but characters missing from BERT's vocab.txt (e.g. Chinese quotation marks) get WordPiece-processed; the splitting could also simply be replaced by list(input).
- token = tokenizer.tokenize(word)
- tokens.extend(token)
- label_1 = labellist[i]
- for m in range(len(token)):
- if m == 0:
- labels.append(label_1)
- restore_map.append('[ALL]')
- else: # normally not reached; only when a word is split into multiple WordPiece sub-tokens
- labels.append("X")
- restore_map.append('[PART]')
- # tokens = tokenizer.tokenize(example.text)
- # Truncate overly long sequences.
- if len(tokens) >= max_seq_length - 1:
- tokens = tokens[0:(max_seq_length - 2)] # -2 because the sequence also needs a start ([CLS]) and an end ([SEP]) token
- labels = labels[0:(max_seq_length - 2)]
- restore_map = restore_map[0:(max_seq_length - 2)]
-
- ##write restore_map to file
- restore_map = ['[CLS]']+restore_map+['[SEP]']
- if (mode=='test'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_test.txt"),'a') as writer:
- writer.write('\n'.join(restore_map))
- writer.write('\n')
- elif(mode=='dev'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"),'a') as writer:
- writer.write('\n'.join(restore_map))
- writer.write('\n')
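- # restore_map records, for every position of the (possibly truncated) sequence, whether it is the
- # sentence start ([CLS]), the first sub-token of a word ([ALL]), a continuation sub-token ([PART])
- # or the sentence end ([SEP]); result_to_pair later reads this file to drop the [PART] predictions
- # and realign the remaining predictions with the original whitespace-separated words.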
-
- ntokens = []
- segment_ids = []
- label_ids = []
- ntokens.append("[CLS]") # 句子开始设置CLS 标志
- segment_ids.append(0)
- # append("O") or append("[CLS]") not sure!
- label_ids.append(label_map["[CLS]"]) # O OR CLS 没有任何影响,不过我觉得O 会减少标签个数,不过拒收和句尾使用不同的标志来标注,使用LCS 也没毛病
- for i, token in enumerate(tokens):
- ntokens.append(token)
- segment_ids.append(0)
- label_ids.append(label_map[labels[i]])
- ntokens.append("[SEP]") # 句尾添加[SEP] 标志
- segment_ids.append(0)
- # append("O") or append("[SEP]") not sure!
- label_ids.append(label_map["[SEP]"])
- input_ids = tokenizer.convert_tokens_to_ids(ntokens) # convert the tokens (ntokens) to their vocabulary ids
- input_mask = [1] * len(input_ids)
- # label_mask = [1] * len(input_ids)
- # Pad up to max_seq_length.
- while len(input_ids) < max_seq_length:
- input_ids.append(0)
- input_mask.append(0)
- segment_ids.append(0)
- # we don't care about the label of padding positions
- label_ids.append(0)
- ntokens.append("**NULL**")
- # label_mask.append(0)
- # print(len(input_ids))
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
- assert len(label_ids) == max_seq_length
- # Log a few example records for inspection.
- if ex_index < 5:
- tf.logging.info("*** Example ***")
- tf.logging.info("guid: %s" % (example.guid))
- tf.logging.info("tokens: %s" % " ".join(
- [tokenization.printable_text(x) for x in ntokens if(x!="**NULL**")]))
- tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
- tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
- tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
- tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
- # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
-
- # Pack everything into an InputFeatures object.
- feature = InputFeatures(
- input_ids=input_ids,
- input_mask=input_mask,
- segment_ids=segment_ids,
- label_ids=label_ids,
- )
- # only takes effect when mode == 'test'
- write_tokens(ntokens, mode)
- return feature
-
- def filed_based_convert_examples_to_features(
- examples, label_map, max_seq_length, tokenizer, output_file, mode=None):
- """
- 将数据转化为TF_Record 结构,作为模型数据输入
- :param examples: 样本
- :param label_list:标签list
- :param max_seq_length: 预先设定的最大序列长度
- :param tokenizer: tokenizer 对象
- :param output_file: tf.record 输出路径
- :param mode:
- :return:
- """
- writer = tf.python_io.TFRecordWriter(output_file)
- # Iterate over all examples.
- for (ex_index, example) in enumerate(examples):
- if ex_index % 5000 == 0:
- tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
- # Convert each example into an InputFeatures object.
- feature = convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode)
- def create_int_feature(values):
- f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
- return f
- features = collections.OrderedDict()
- features["input_ids"] = create_int_feature(feature.input_ids)
- features["input_mask"] = create_int_feature(feature.input_mask)
- features["segment_ids"] = create_int_feature(feature.segment_ids)
- features["label_ids"] = create_int_feature(feature.label_ids)
- tf_example = tf.train.Example(features=tf.train.Features(feature=features))
- writer.write(tf_example.SerializeToString())
- writer.close()
-
-
- def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
- name_to_features = {
- "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
- "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
- "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
- "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
- }
-
- def _decode_record(record, name_to_features):
- example = tf.parse_single_example(record, name_to_features)
- for name in list(example.keys()):
- t = example[name]
- if t.dtype == tf.int64:
- t = tf.to_int32(t)
- example[name] = t
- return example
-
- def input_fn(params):
- batch_size = params["batch_size"]
- d = tf.data.TFRecordDataset(input_file)
- if is_training:
- # d = d.repeat()
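- # Note: repeat() is intentionally left disabled, so one call to estimator.train() consumes the
- # dataset exactly once (one epoch); main() loops over epochs explicitly.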
- d = d.shuffle(buffer_size=300)
- d = d.apply(tf.data.experimental.map_and_batch(lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- num_parallel_calls=8, # number of CPU cores used for parallel preprocessing; do not set it higher than your machine's core count
- drop_remainder=drop_remainder))
- d = d.prefetch(buffer_size=4)
- return d
-
- return input_fn
-
-
- def create_model(bert_config, is_training, input_ids, input_mask,
- segment_ids, labels, num_labels, use_one_hot_embeddings):
- """
- 创建X模型
- :param bert_config: bert 配置
- :param is_training:
- :param input_ids: 数据的idx 表示
- :param input_mask:
- :param segment_ids:
- :param labels: 标签的idx 表示
- :param num_labels: 类别数量
- :param use_one_hot_embeddings:
- :return:
- """
- model = modeling.BertModel(
- config=bert_config,
- is_training=is_training,
- input_ids=input_ids,
- input_mask=input_mask,
- token_type_ids=segment_ids,
- use_one_hot_embeddings=use_one_hot_embeddings)
-
- # Get BERT's token-level output, shape [batch_size, seq_length, embedding_size].
- output_layer = model.get_sequence_output()
- hidden_size = output_layer.shape[-1].value
- max_seq_length = output_layer.shape[1].value
-
- used = tf.sign(tf.abs(input_ids))
- lengths = tf.reduce_sum(used, reduction_indices=1) # [batch_size] vector holding the true sequence length of each example in the batch
-
- ##add CRF layer and biLSTM layer
- if is_training:
- output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
-
- if(FLAGS.use_bilstm):
- with tf.variable_scope('biLSTM'):
- lstm_output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(FLAGS.lstm_size, return_sequences=True))(output_layer)
- with tf.variable_scope("hidden"):
- W = tf.get_variable("W", shape=[FLAGS.lstm_size * 2, FLAGS.lstm_size],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[FLAGS.lstm_size], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- output = tf.reshape(lstm_output, shape=[-1, FLAGS.lstm_size * 2])
- hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
-
- with tf.variable_scope("logits"):
- W = tf.get_variable("W", shape=[FLAGS.lstm_size, num_labels],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[num_labels], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- hidden = tf.reshape(hidden, shape=[-1, FLAGS.lstm_size])
- pred = tf.nn.xw_plus_b(hidden, W, b)
-
- else:
- with tf.variable_scope("logits"):
- W = tf.get_variable("W", shape=[hidden_size, num_labels],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[num_labels], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- hidden = tf.reshape(output_layer, shape=[-1, hidden_size])
- pred = tf.nn.xw_plus_b(hidden, W, b)
-
- with tf.variable_scope("loss"):
- logits = tf.reshape(pred, [-1, max_seq_length, num_labels])
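- # logits now has shape [batch_size, max_seq_length, num_labels].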
- if(FLAGS.use_crf):
- trans = tf.get_variable(
- "transitions",
- shape=[num_labels, num_labels])
- log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
- inputs=logits,
- tag_indices=labels,
- transition_params=trans,
- sequence_lengths=lengths)
-
- pred_ids, _ = crf.crf_decode(potentials=logits, transition_params=trans, sequence_length=lengths)
- loss = tf.reduce_mean(-log_likelihood)
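- # loss is the mean negative log-likelihood of the gold tag sequences under the linear-chain CRF,
- # and pred_ids holds the Viterbi-decoded tag id for every token position.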
- else:
- log_probs = tf.nn.log_softmax(logits, axis=-1)
- one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
- per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
- loss = tf.reduce_mean(per_example_loss)
- probabilities = tf.nn.softmax(logits, axis=-1)
- trans = None
- pred_ids = tf.argmax(probabilities,axis=-1)
- return (loss,logits,trans,pred_ids)
-
-
- def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
- num_train_steps, num_warmup_steps, use_tpu,
- use_one_hot_embeddings):
- """
- Build the model_fn used by the Estimator.
- :param bert_config:
- :param num_labels:
- :param init_checkpoint:
- :param learning_rate:
- :param num_train_steps:
- :param num_warmup_steps:
- :param use_tpu:
- :param use_one_hot_embeddings:
- :return:
- """
-
- def model_fn(features, labels, mode, params):
- tf.logging.info("*** Features ***")
- for name in sorted(features.keys()):
- tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
- input_ids = features["input_ids"]
- input_mask = features["input_mask"]
- segment_ids = features["segment_ids"]
- label_ids = features["label_ids"]
-
- # print('shape of input_ids', input_ids.shape)
- # label_mask = features["label_mask"]
- is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-
- # Build the model from the parsed features: input_ids is the id representation of the input tokens, label_ids the id representation of the labels.
- (total_loss, logits, trans, pred_ids) = create_model(
- bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
- num_labels, use_one_hot_embeddings)
-
- tvars = tf.trainable_variables()
- scaffold_fn = None
- # Load the pre-trained BERT weights.
- if init_checkpoint:
- (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
- init_checkpoint)
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- if use_tpu:
- def tpu_scaffold():
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- return tf.train.Scaffold()
-
- scaffold_fn = tpu_scaffold
- else:
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- '''
- tf.logging.info("**** Trainable Variables ****")
-
- # Print the variables loaded from the checkpoint.
- for var in tvars:
- init_string = ""
- if var.name in initialized_variable_names:
- init_string = ", *INIT_FROM_CKPT*"
- tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
- init_string)
- '''
-
- output_spec = None
- if mode == tf.estimator.ModeKeys.TRAIN:
- train_op = optimization.create_optimizer(
- total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- train_op=train_op,
- scaffold_fn=scaffold_fn)
- elif mode == tf.estimator.ModeKeys.EVAL:
-
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- scaffold_fn=scaffold_fn)
- else:
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- predictions=pred_ids,
- scaffold_fn=scaffold_fn
- )
- return output_spec
-
- return model_fn
-
- def labeltoid(label_list):
- label_map = {}
- # The 1 means label indexing starts at 1 (index 0 is effectively reserved for padding).
- for (i, label) in enumerate(label_list, 1):
- label_map[label] = i
- # 保存label->index 的map
- with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w:
- pickle.dump(label_map, w)
-
- return label_map
-
- def save_best_model(cur_ckpt_path,best_model_path):
- cmd1 = 'cp '+cur_ckpt_path+'.index '+best_model_path+'.index'
- cmd2 = 'cp '+cur_ckpt_path+'.meta '+best_model_path+'.meta'
- cmd3 = 'cp '+cur_ckpt_path+'.data-00000-of-00001 '+best_model_path+'.data-00000-of-00001'
- os.system(cmd1)
- os.system(cmd2)
- os.system(cmd3)
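- # A more portable sketch (assumption: the checkpoint always consists of exactly the three shards
- # copied above); shutil would avoid shelling out to `cp`:
- #   import shutil
- #   for ext in ('.index', '.meta', '.data-00000-of-00001'):
- #       shutil.copyfile(cur_ckpt_path + ext, best_model_path + ext)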
-
- def result_to_pair(writer,predict_examples,result, id2label, mode):
- # with open(src_path,'r') as src_f:
- # src_lines = src_f.readlines()
- result = list(result)
- print(np.array(result).shape)
- if(mode=='dev'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"),'r') as map_f:
- map_lines = map_f.readlines()
- elif(mode=='test'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_test.txt"),'r') as map_f:
- map_lines = map_f.readlines()
-
- result_ind = -1
- real_token_num = 0
- revise_result = []
- tmp_result = []
- for i in range(len(map_lines)):
- if(map_lines[i].strip()=='[CLS]'):
- result_ind += 1
- real_token_num = 0
- tmp_result = []
- elif(map_lines[i].strip()=='[SEP]'):
- revise_result.append(tmp_result)
-
- else:
- real_token_num += 1
- if(map_lines[i].strip()=='[PART]'):
- continue
- else:
- tmp_result.append(result[result_ind][real_token_num])
-
- result = revise_result
-
- for predict_line, prediction in zip(predict_examples, result):
- idx = 0
- line = ''
- line_token = str(predict_line.text).split(' ')
- label_token = str(predict_line.label).split(' ')
- len_seq = len(label_token)
- if len(line_token) != len(label_token):
- tf.logging.info(predict_line.text)
- tf.logging.info(predict_line.label)
- break
- for id in prediction:
- if idx >= len_seq:
- break
- if id == 0:
- continue
- curr_labels = id2label[id]
- if curr_labels in ['[CLS]', '[SEP]']:
- continue
- try:
- line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
- except Exception as e:
- tf.logging.info(e)
- tf.logging.info(predict_line.text)
- tf.logging.info(predict_line.label)
- line = ''
- break
- idx += 1
- writer.write(line + '\n')
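- # Each line written above has the form "token gold_label predicted_label", with a blank line
- # between sentences, which is the format conlleval.return_report expects.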
-
-
- def main(_):
- tf.logging.set_verbosity(tf.logging.INFO)
- processors = {
- "ner": NerProcessor
- }
-
- bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
-
- if FLAGS.max_seq_length > bert_config.max_position_embeddings:
- raise ValueError(
- "Cannot use sequence length %d because the BERT model "
- "was only trained up to sequence length %d" %
- (FLAGS.max_seq_length, bert_config.max_position_embeddings))
- ## del last training file
- if(FLAGS.do_train and FLAGS.clean):
- if os.path.exists(FLAGS.output_dir):
- def del_file(path):
- ls = os.listdir(path)
- for i in ls:
- c_path = os.path.join(path, i)
- if os.path.isdir(c_path):
- del_file(c_path)
- else:
- os.remove(c_path)
-
- try:
- del_file(FLAGS.output_dir)
- except Exception as e:
- print(e)
- print('please remove the files of the output dir and data.conf')
- exit(-1)
- tf.gfile.MakeDirs(FLAGS.output_dir)
-
- task_name = FLAGS.task_name.lower()
- if task_name not in processors:
- raise ValueError("Task not found: %s" % (task_name))
- processor = processors[task_name]()
- label_list = processor.get_labels()
- label_map = labeltoid(label_list)
- print(label_map)
-
- tokenizer = tokenization.FullTokenizer(
- vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
- tpu_cluster_resolver = None
- if FLAGS.use_tpu and FLAGS.tpu_name:
- tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
- FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-
- is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-
- run_config = tf.contrib.tpu.RunConfig(
- cluster=tpu_cluster_resolver,
- master=FLAGS.master,
- model_dir=None,
- save_checkpoints_steps=FLAGS.save_checkpoints_steps,
- tpu_config=tf.contrib.tpu.TPUConfig(
- iterations_per_loop=FLAGS.iterations_per_loop,
- num_shards=FLAGS.num_tpu_cores,
- per_host_input_for_training=is_per_host))
-
- train_examples = None
- num_train_steps = None
- num_warmup_steps = None
-
- if FLAGS.do_train:
- train_examples = processor.get_train_examples(FLAGS.data_dir)
- num_train_steps = int(
- len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
- num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
-
- model_fn = model_fn_builder(
- bert_config=bert_config,
- num_labels=len(label_list) + 1,
- init_checkpoint=FLAGS.init_checkpoint,
- learning_rate=FLAGS.learning_rate,
- num_train_steps=num_train_steps,
- num_warmup_steps=num_warmup_steps,
- use_tpu=FLAGS.use_tpu,
- use_one_hot_embeddings=FLAGS.use_tpu)
-
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=FLAGS.output_dir,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size)
-
- if FLAGS.do_train:
- train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
- filed_based_convert_examples_to_features(
- train_examples, label_map, FLAGS.max_seq_length, tokenizer, train_file)
- tf.logging.info("***** Running training *****")
- tf.logging.info(" Num examples = %d", len(train_examples))
- tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
- tf.logging.info(" Num steps = %d", num_train_steps)
- train_input_fn = file_based_input_fn_builder(
- input_file=train_file,
- seq_length=FLAGS.max_seq_length,
- is_training=True,
- drop_remainder=True)
-
- if FLAGS.do_eval:
- eval_examples = processor.get_dev_examples(FLAGS.data_dir)
- eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
- filed_based_convert_examples_to_features(
- eval_examples, label_map, FLAGS.max_seq_length, tokenizer, eval_file, mode="dev")
- tf.logging.info("***** Running dev *****")
- tf.logging.info(" Num examples = %d", len(eval_examples))
- tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
- eval_input_fn = file_based_input_fn_builder(
- input_file=eval_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
-
- ## Get id2label
- with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
- label2id = pickle.load(rf)
- id2label = {value: key for key, value in label2id.items()}
-
- best_result = 0
- all_results = []
- if FLAGS.do_train:
- for i in range(int(FLAGS.num_train_epochs)):
- print('**'*40)
- print('Train {} epoch'.format(i+1))
- estimator.train(input_fn=train_input_fn)
- result = estimator.predict(input_fn=eval_input_fn)
- output_dev_file = os.path.join(FLAGS.output_dir, "label_dev.txt")
- with codecs.open(output_dev_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, eval_examples, result, id2label, 'dev')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_dev_file)
- print('cur epoch result: ',overall_f)
- print(''.join(eval_result))
- all_results.append(overall_f)
- ###Do dev operation
- if(overall_f>best_result):
- print('**'*40)
- print('Found better model, saved!')
- best_result = overall_f
- cur_ckpt_path = estimator.latest_checkpoint()
- best_model_path = '/'.join(cur_ckpt_path.split('/')[:-1]+['model.ckpt-best'])
- save_best_model(cur_ckpt_path,best_model_path)
- print('**'*40)
- print('Training completed!')
- print('all_results: ',all_results)
- print('Best result: ',np.max(all_results))
- print('Avg result: ',np.mean(all_results))
-
- all_logs = 'all_results:'+' '.join([str(i) for i in all_results])+'\n' \
- +'Best result:'+str(np.max(all_results))+'\n' \
- +'Avg result:'+str(np.mean(all_results))+'\n'
- if FLAGS.do_predict:
- print('***********************Running Prediction************************')
- print('Use the model which performs best on dev data')
- cur_ckpt_path = estimator.latest_checkpoint()
- best_model_path = '/'.join(cur_ckpt_path.split('/')[:-1]+['model.ckpt-best'])
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=None,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size,
- warm_start_from=best_model_path)
- test_examples = processor.get_test_examples(FLAGS.data_dir)
- test_file = os.path.join(FLAGS.output_dir, "test.tf_record")
- filed_based_convert_examples_to_features(
- test_examples, label_map, FLAGS.max_seq_length, tokenizer, test_file, mode="test")
- tf.logging.info("***** Running test *****")
- tf.logging.info(" Num examples = %d", len(test_examples))
- tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
- test_input_fn = file_based_input_fn_builder(
- input_file=test_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
- result = estimator.predict(input_fn=test_input_fn)
- output_test_file = os.path.join(FLAGS.output_dir, "label_test_best.txt")
- with codecs.open(output_test_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, test_examples, result, id2label, 'test')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_test_file)
- print(''.join(eval_result))
- sys.exit(0) # NOTE: the script exits here, so the "restore from last ckpt" prediction below never runs
- all_logs = all_logs+'Use the model which performs best on dev data'+'\n'+''.join(eval_result)+'\n'
- print('Use the model restored from the last checkpoint')
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=None,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size,
- warm_start_from=cur_ckpt_path)
- test_examples = processor.get_test_examples(FLAGS.data_dir)
- test_file = os.path.join(FLAGS.output_dir, "test.tf_record")
- filed_based_convert_examples_to_features(
- test_examples, label_map, FLAGS.max_seq_length, tokenizer, test_file, mode="test")
- tf.logging.info("***** Running test *****")
- tf.logging.info(" Num examples = %d", len(test_examples))
- tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
- test_input_fn = file_based_input_fn_builder(
- input_file=test_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
- result = estimator.predict(input_fn=test_input_fn)
- output_test_file = os.path.join(FLAGS.output_dir, "label_test_last.txt")
- with codecs.open(output_test_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, test_examples, result, id2label, 'test')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_test_file)
- print(''.join(eval_result))
- all_logs = all_logs+'Use the model restored from the last checkpoint'+'\n'+''.join(eval_result)+'\n'
- print(all_logs)
-
-
- if __name__ == "__main__":
- if os.path.exists(os.path.join(FLAGS.output_dir, "restore_map_test.txt")):
- os.remove(os.path.join(FLAGS.output_dir, "restore_map_test.txt"))
- if os.path.exists(os.path.join(FLAGS.output_dir, "restore_map_dev.txt")):
- os.remove(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"))
- tf.app.run()