- #!/usr/bin/env python3
- # -*- coding:utf-8 -*-
- """
- Copyright 2018 The Google AI Language Team Authors.
- Based on Google_BERT.
- Reference: zhoukaiyin/
-
- @Author:Macan
- """
-
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import collections
- import os
- import sys
- import json
- import logging
- import copy
- import re
-
- ## Set random seeds for reproducibility.
- import numpy as np
- np.random.seed(26)
- import tensorflow as tf
- tf.set_random_seed(26)
-
- import codecs
- from tensorflow.contrib.layers.python.layers import initializers
- from tensorflow.contrib import estimator
- import modeling
- import optimization
- # import optimization_layerwise as optimization
- import tokenization
- from tensorflow.contrib import rnn
- from tensorflow.contrib import crf
- import pickle
-
-
- flags = tf.flags
- FLAGS = flags.FLAGS
-
- flags.DEFINE_string(
- "data_dir", '../data',
- "The input datadir.",
- )
-
- flags.DEFINE_string(
- "bert_config_file", '../../bert/multi_cased_L-12_H-768_A-12/bert_config.json',
- "The config json file corresponding to the pre-trained BERT model."
- )
-
- flags.DEFINE_string(
- "task_name", 'NER', "The name of the task to train."
- )
-
- flags.DEFINE_string(
- "output_dir", None,
- "The output directory where the model checkpoints will be written."
- )
-
- # Other parameters
- flags.DEFINE_string(
- "init_checkpoint", '../../bert/meddocan_corpus/pretraining_output/model.ckpt-30000',
- "Initial checkpoint (usually from a pre-trained BERT model)."
- )
-
-
- flags.DEFINE_bool(
- "do_lower_case", True,
- "Whether to lower case the input text."
- )
-
- flags.DEFINE_integer(
- "max_seq_length", 256,
- "The maximum total input sequence length after WordPiece tokenization."
- )
-
- flags.DEFINE_integer(
- "lstm_size", 128,
- "The size of bilstm cell."
- )
-
- flags.DEFINE_bool(
- "do_train", True,
- "Whether to run training."
- )
-
- flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
-
- flags.DEFINE_bool("clean", True, "Whether to clean last training files.")
-
- flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
-
- flags.DEFINE_bool("do_predict", True,"Whether to run the model in inference mode on the test set.")
-
- flags.DEFINE_integer("train_batch_size", 12, "Total batch size for training.")
-
- flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
-
- flags.DEFINE_integer("predict_batch_size", 12, "Total batch size for predict.")
-
- flags.DEFINE_float("learning_rate", 1e-5, "The initial learning rate for Adam.")
-
- flags.DEFINE_float("num_train_epochs", 10, "Total number of training epochs to perform.")
-
- flags.DEFINE_bool("use_bilstm", False, "Whether to use BiLSTM.")
-
- flags.DEFINE_bool("use_crf", True, "Whether to use CRF.")
-
- flags.DEFINE_float(
- "warmup_proportion", 0.1,
- "Proportion of training to perform linear learning rate warmup for. "
- "E.g., 0.1 = 10% of training.")
-
- flags.DEFINE_integer("save_checkpoints_steps", 1000,
- "How often to save the model checkpoint.")
-
- flags.DEFINE_integer("iterations_per_loop", 1000,
- "How many steps to make in each estimator call.")
-
- flags.DEFINE_string("vocab_file", '../../bert/multi_cased_L-12_H-768_A-12/vocab.txt',
- "The vocabulary file that the BERT model was trained on.")
- tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
- flags.DEFINE_integer(
- "num_tpu_cores", 8,
- "Only used if `use_tpu` is True. Total number of TPU cores to use.")
-
-
-
-
- class InputExample(object):
- """A single training/test example for simple sequence classification."""
-
- def __init__(self, guid, text, label=None):
- """Constructs a InputExample.
-
- Args:
- guid: Unique id for the example.
- text: string. The untokenized text of the sequence (whitespace-separated tokens).
- label: (Optional) string. The label of the example. This should be
- specified for train and dev examples, but not for test examples.
- """
- self.guid = guid
- self.text = text
- self.label = label
-
- class InputFeatures(object):
- """A single set of features of data."""
-
- def __init__(self, input_ids, input_mask, segment_ids, label_ids):
- self.input_ids = input_ids
- self.input_mask = input_mask
- self.segment_ids = segment_ids
- self.label_ids = label_ids
-
- class DataProcessor(object):
- """Base class for data converters for sequence classification data sets."""
-
- def get_train_examples(self, data_dir):
- """Gets a collection of `InputExample`s for the train set."""
- raise NotImplementedError()
-
- def get_dev_examples(self, data_dir):
- """Gets a collection of `InputExample`s for the dev set."""
- raise NotImplementedError()
-
- def get_labels(self):
- """Gets the list of labels for this data set."""
- raise NotImplementedError()
-
- @classmethod
- def _read_data(cls, input_file):
- """Reads a BIO data."""
- with codecs.open(input_file, 'r', encoding='utf-8') as f:
- lines = []
- words = []
- labels = []
- for line in f:
- contends = line.strip()
- tokens = contends.split()
- if len(tokens) >= 2 and (not contends.startswith("-DOCSTART-")):
- word = tokens[0]
- label = tokens[-1]
- else:
- if len(contends) == 0:
- l = ' '.join([label for label in labels if len(label) > 0])
- w = ' '.join([word for word in words if len(word) > 0])
- lines.append([w, l])
- words = []
- labels = []
- continue
- if contends.startswith("-DOCSTART-"):
- continue
-
- words.append(word)
- labels.append(label)
- return lines
-
-
- class NerProcessor(DataProcessor):
- def get_train_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "final_train_19_20.txt")), "train"
- )
-
- def get_dev_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "revise_dev.txt")), "dev"
- )
-
- def get_test_examples(self, data_dir):
- return self._create_example(
- self._read_data(os.path.join(data_dir, "revise_dev.txt")), "test")
-
- def get_labels(self):
- labels = []
- for datapath in ['final_train_19_20.txt','revise_dev.txt']:
- filepath = os.path.join(FLAGS.data_dir,datapath)
- with open(filepath, 'r', encoding='utf-8') as f:
- lines = f.readlines()
- for line in lines:
- content = line.strip().split()
- if len(content) >=2 and '-DOCSTART-' not in line:
- l = content[-1]
- if l not in labels:
- labels.append(l)
-
- labels = sorted(list(set(labels))+["X", "[CLS]", "[SEP]"])
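- # "X" labels WordPiece continuation sub-tokens, while "[CLS]" and "[SEP]" label the special
- # start/end tokens that convert_single_example adds to every sequence.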
-
- return labels
-
-
- def _create_example(self, lines, set_type):
- examples = []
- for (i, line) in enumerate(lines):
- guid = "%s-%s" % (set_type, i)
- text = tokenization.convert_to_unicode(line[0])
- label = tokenization.convert_to_unicode(line[1])
- examples.append(InputExample(guid=guid, text=text, label=label))
- return examples
-
-
- def write_tokens(tokens, mode):
- """
- 将序列解析结果写入到文件中
- 只在mode=test的时候启用
- :param tokens:
- :param mode:
- :return:
- """
- if mode == "test":
- path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
- wf = codecs.open(path, 'a', encoding='utf-8')
- for token in tokens:
- if token != "**NULL**":
- wf.write(token + '\n')
- wf.close()
-
-
- def convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode):
- """
- 将一个样本进行分析,然后将字转化为id, 标签转化为id,然后结构化到InputFeatures对象中
- :param ex_index: index
- :param example: 一个样本
- :param label_list: 标签列表
- :param max_seq_length:
- :param tokenizer:
- :param mode:
- :return:
- """
- textlist = example.text.split(' ')
- labellist = example.label.split(' ')
- tokens = []
- labels = []
-
- ## Record how each word maps to its WordPiece sub-tokens.
- restore_map = []
-
- for i, word in enumerate(textlist):
- # Tokenize each word. For Chinese this amounts to splitting into characters, but characters missing from BERT's vocab.txt (e.g. Chinese quotation marks) get WordPiece-processed; the splitting could also simply be replaced by list(input).
- token = tokenizer.tokenize(word)
- tokens.extend(token)
- label_1 = labellist[i]
- for m in range(len(token)):
- if m == 0:
- labels.append(label_1)
- restore_map.append('[ALL]')
- else: # normally not reached; only when a word is split into multiple WordPiece sub-tokens
- labels.append("X")
- restore_map.append('[PART]')
- # tokens = tokenizer.tokenize(example.text)
- # Truncate overly long sequences.
- if len(tokens) >= max_seq_length - 1:
- tokens = tokens[0:(max_seq_length - 2)] # -2 because the sequence also needs a start ([CLS]) and an end ([SEP]) token
- labels = labels[0:(max_seq_length - 2)]
- restore_map = restore_map[0:(max_seq_length - 2)]
-
- ##write restore_map to file
- restore_map = ['[CLS]']+restore_map+['[SEP]']
- if (mode=='test'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_test.txt"),'a') as writer:
- writer.write('\n'.join(restore_map))
- writer.write('\n')
- elif(mode=='dev'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"),'a') as writer:
- writer.write('\n'.join(restore_map))
- writer.write('\n')
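- # restore_map records, for every position of the (possibly truncated) sequence, whether it is the
- # sentence start ([CLS]), the first sub-token of a word ([ALL]), a continuation sub-token ([PART])
- # or the sentence end ([SEP]); result_to_pair later reads this file to drop the [PART] predictions
- # and realign the remaining predictions with the original whitespace-separated words.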
-
- ntokens = []
- segment_ids = []
- label_ids = []
- ntokens.append("[CLS]") # 句子开始设置CLS 标志
- segment_ids.append(0)
- # append("O") or append("[CLS]") not sure!
- label_ids.append(label_map["[CLS]"]) # O OR CLS 没有任何影响,不过我觉得O 会减少标签个数,不过拒收和句尾使用不同的标志来标注,使用LCS 也没毛病
- for i, token in enumerate(tokens):
- ntokens.append(token)
- segment_ids.append(0)
- label_ids.append(label_map[labels[i]])
- ntokens.append("[SEP]") # 句尾添加[SEP] 标志
- segment_ids.append(0)
- # append("O") or append("[SEP]") not sure!
- label_ids.append(label_map["[SEP]"])
- input_ids = tokenizer.convert_tokens_to_ids(ntokens) # convert the tokens (ntokens) to their vocabulary ids
- input_mask = [1] * len(input_ids)
- # label_mask = [1] * len(input_ids)
- # Pad up to max_seq_length.
- while len(input_ids) < max_seq_length:
- input_ids.append(0)
- input_mask.append(0)
- segment_ids.append(0)
- # we don't care about the label of padding positions
- label_ids.append(0)
- ntokens.append("**NULL**")
- # label_mask.append(0)
- # print(len(input_ids))
- assert len(input_ids) == max_seq_length
- assert len(input_mask) == max_seq_length
- assert len(segment_ids) == max_seq_length
- assert len(label_ids) == max_seq_length
- # Log a few example records for inspection.
- if ex_index < 5:
- tf.logging.info("*** Example ***")
- tf.logging.info("guid: %s" % (example.guid))
- tf.logging.info("tokens: %s" % " ".join(
- [tokenization.printable_text(x) for x in ntokens if(x!="**NULL**")]))
- tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
- tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
- tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
- tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
- # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
-
- # Pack everything into an InputFeatures object.
- feature = InputFeatures(
- input_ids=input_ids,
- input_mask=input_mask,
- segment_ids=segment_ids,
- label_ids=label_ids,
- )
- # only takes effect when mode == 'test'
- write_tokens(ntokens, mode)
- return feature
-
- def filed_based_convert_examples_to_features(
- examples, label_map, max_seq_length, tokenizer, output_file, mode=None):
- """
- 将数据转化为TF_Record 结构,作为模型数据输入
- :param examples: 样本
- :param label_list:标签list
- :param max_seq_length: 预先设定的最大序列长度
- :param tokenizer: tokenizer 对象
- :param output_file: tf.record 输出路径
- :param mode:
- :return:
- """
- writer = tf.python_io.TFRecordWriter(output_file)
- # Iterate over all examples.
- for (ex_index, example) in enumerate(examples):
- if ex_index % 5000 == 0:
- tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
- # Convert each example into an InputFeatures object.
- feature = convert_single_example(ex_index, example, label_map, max_seq_length, tokenizer, mode)
- def create_int_feature(values):
- f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
- return f
- features = collections.OrderedDict()
- features["input_ids"] = create_int_feature(feature.input_ids)
- features["input_mask"] = create_int_feature(feature.input_mask)
- features["segment_ids"] = create_int_feature(feature.segment_ids)
- features["label_ids"] = create_int_feature(feature.label_ids)
- tf_example = tf.train.Example(features=tf.train.Features(feature=features))
- writer.write(tf_example.SerializeToString())
- writer.close()
-
-
- def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
- name_to_features = {
- "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
- "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
- "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
- "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
- }
-
- def _decode_record(record, name_to_features):
- example = tf.parse_single_example(record, name_to_features)
- for name in list(example.keys()):
- t = example[name]
- if t.dtype == tf.int64:
- t = tf.to_int32(t)
- example[name] = t
- return example
-
- def input_fn(params):
- batch_size = params["batch_size"]
- d = tf.data.TFRecordDataset(input_file)
- if is_training:
- # d = d.repeat()
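- # Note: repeat() is intentionally left disabled, so one call to estimator.train() consumes the
- # dataset exactly once (one epoch); main() loops over epochs explicitly.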
- d = d.shuffle(buffer_size=300)
- d = d.apply(tf.data.experimental.map_and_batch(lambda record: _decode_record(record, name_to_features),
- batch_size=batch_size,
- num_parallel_calls=8, # number of CPU cores used for parallel preprocessing; do not set it higher than your machine's core count
- drop_remainder=drop_remainder))
- d = d.prefetch(buffer_size=4)
- return d
-
- return input_fn
-
-
- def create_model(bert_config, is_training, input_ids, input_mask,
- segment_ids, labels, num_labels, use_one_hot_embeddings):
- """
- 创建X模型
- :param bert_config: bert 配置
- :param is_training:
- :param input_ids: 数据的idx 表示
- :param input_mask:
- :param segment_ids:
- :param labels: 标签的idx 表示
- :param num_labels: 类别数量
- :param use_one_hot_embeddings:
- :return:
- """
- model = modeling.BertModel(
- config=bert_config,
- is_training=is_training,
- input_ids=input_ids,
- input_mask=input_mask,
- token_type_ids=segment_ids,
- use_one_hot_embeddings=use_one_hot_embeddings)
-
- # Get BERT's token-level output, shape [batch_size, seq_length, embedding_size].
- output_layer = model.get_sequence_output()
- hidden_size = output_layer.shape[-1].value
- max_seq_length = output_layer.shape[1].value
-
- used = tf.sign(tf.abs(input_ids))
- lengths = tf.reduce_sum(used, reduction_indices=1) # [batch_size] vector holding the true sequence length of each example in the batch
-
- ##add CRF layer and biLSTM layer
- if is_training:
- output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
-
- if(FLAGS.use_bilstm):
- with tf.variable_scope('biLSTM'):
- lstm_output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(FLAGS.lstm_size, return_sequences=True))(output_layer)
- with tf.variable_scope("hidden"):
- W = tf.get_variable("W", shape=[FLAGS.lstm_size * 2, FLAGS.lstm_size],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[FLAGS.lstm_size], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- output = tf.reshape(lstm_output, shape=[-1, FLAGS.lstm_size * 2])
- hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
-
- with tf.variable_scope("logits"):
- W = tf.get_variable("W", shape=[FLAGS.lstm_size, num_labels],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[num_labels], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- hidden = tf.reshape(hidden, shape=[-1, FLAGS.lstm_size])
- pred = tf.nn.xw_plus_b(hidden, W, b)
-
- else:
- with tf.variable_scope("logits"):
- W = tf.get_variable("W", shape=[hidden_size, num_labels],
- dtype=tf.float32, initializer=initializers.xavier_initializer())
-
- b = tf.get_variable("b", shape=[num_labels], dtype=tf.float32,
- initializer=tf.zeros_initializer())
- hidden = tf.reshape(output_layer, shape=[-1, hidden_size])
- pred = tf.nn.xw_plus_b(hidden, W, b)
-
- with tf.variable_scope("loss"):
- logits = tf.reshape(pred, [-1, max_seq_length, num_labels])
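- # logits now has shape [batch_size, max_seq_length, num_labels].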
- if(FLAGS.use_crf):
- trans = tf.get_variable(
- "transitions",
- shape=[num_labels, num_labels])
- log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
- inputs=logits,
- tag_indices=labels,
- transition_params=trans,
- sequence_lengths=lengths)
-
- pred_ids, _ = crf.crf_decode(potentials=logits, transition_params=trans, sequence_length=lengths)
- loss = tf.reduce_mean(-log_likelihood)
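- # loss is the mean negative log-likelihood of the gold tag sequences under the linear-chain CRF,
- # and pred_ids holds the Viterbi-decoded tag id for every token position.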
- else:
- log_probs = tf.nn.log_softmax(logits, axis=-1)
- one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
- per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
- loss = tf.reduce_mean(per_example_loss)
- probabilities = tf.nn.softmax(logits, axis=-1)
- trans = None
- pred_ids = tf.argmax(probabilities,axis=-1)
- return (loss,logits,trans,pred_ids)
-
-
- def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
- num_train_steps, num_warmup_steps, use_tpu,
- use_one_hot_embeddings):
- """
- Build the model_fn used by the Estimator.
- :param bert_config:
- :param num_labels:
- :param init_checkpoint:
- :param learning_rate:
- :param num_train_steps:
- :param num_warmup_steps:
- :param use_tpu:
- :param use_one_hot_embeddings:
- :return:
- """
-
- def model_fn(features, labels, mode, params):
- tf.logging.info("*** Features ***")
- for name in sorted(features.keys()):
- tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))
- input_ids = features["input_ids"]
- input_mask = features["input_mask"]
- segment_ids = features["segment_ids"]
- label_ids = features["label_ids"]
-
- # print('shape of input_ids', input_ids.shape)
- # label_mask = features["label_mask"]
- is_training = (mode == tf.estimator.ModeKeys.TRAIN)
-
- # Build the model from the parsed features: input_ids is the id representation of the input tokens, label_ids the id representation of the labels.
- (total_loss, logits, trans, pred_ids) = create_model(
- bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
- num_labels, use_one_hot_embeddings)
-
- tvars = tf.trainable_variables()
- scaffold_fn = None
- # Load the pre-trained BERT weights.
- if init_checkpoint:
- (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
- init_checkpoint)
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- if use_tpu:
- def tpu_scaffold():
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- return tf.train.Scaffold()
-
- scaffold_fn = tpu_scaffold
- else:
- tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
- '''
- tf.logging.info("**** Trainable Variables ****")
-
- # Print the variables loaded from the checkpoint.
- for var in tvars:
- init_string = ""
- if var.name in initialized_variable_names:
- init_string = ", *INIT_FROM_CKPT*"
- tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
- init_string)
- '''
-
- output_spec = None
- if mode == tf.estimator.ModeKeys.TRAIN:
- train_op = optimization.create_optimizer(
- total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- train_op=train_op,
- scaffold_fn=scaffold_fn)
- elif mode == tf.estimator.ModeKeys.EVAL:
-
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- scaffold_fn=scaffold_fn)
- else:
- output_spec = tf.contrib.tpu.TPUEstimatorSpec(
- mode=mode,
- predictions=pred_ids,
- scaffold_fn=scaffold_fn
- )
- return output_spec
-
- return model_fn
-
- def labeltoid(label_list):
- label_map = {}
- # The 1 means label indexing starts at 1 (index 0 is effectively reserved for padding).
- for (i, label) in enumerate(label_list, 1):
- label_map[label] = i
- # 保存label->index 的map
- with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'wb') as w:
- pickle.dump(label_map, w)
-
- return label_map
-
- def save_best_model(cur_ckpt_path,best_model_path):
- cmd1 = 'cp '+cur_ckpt_path+'.index '+best_model_path+'.index'
- cmd2 = 'cp '+cur_ckpt_path+'.meta '+best_model_path+'.meta'
- cmd3 = 'cp '+cur_ckpt_path+'.data-00000-of-00001 '+best_model_path+'.data-00000-of-00001'
- os.system(cmd1)
- os.system(cmd2)
- os.system(cmd3)
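- # A more portable sketch (assumption: the checkpoint always consists of exactly the three shards
- # copied above); shutil would avoid shelling out to `cp`:
- #   import shutil
- #   for ext in ('.index', '.meta', '.data-00000-of-00001'):
- #       shutil.copyfile(cur_ckpt_path + ext, best_model_path + ext)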
-
- def result_to_pair(writer,predict_examples,result, id2label, mode):
- # with open(src_path,'r') as src_f:
- # src_lines = src_f.readlines()
- result = list(result)
- print(np.array(result).shape)
- if(mode=='dev'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"),'r') as map_f:
- map_lines = map_f.readlines()
- elif(mode=='test'):
- with open(os.path.join(FLAGS.output_dir, "restore_map_test.txt"),'r') as map_f:
- map_lines = map_f.readlines()
-
- result_ind = -1
- real_token_num = 0
- revise_result = []
- tmp_result = []
- for i in range(len(map_lines)):
- if(map_lines[i].strip()=='[CLS]'):
- result_ind += 1
- real_token_num = 0
- tmp_result = []
- elif(map_lines[i].strip()=='[SEP]'):
- revise_result.append(tmp_result)
-
- else:
- real_token_num += 1
- if(map_lines[i].strip()=='[PART]'):
- continue
- else:
- tmp_result.append(result[result_ind][real_token_num])
-
- result = revise_result
-
- for predict_line, prediction in zip(predict_examples, result):
- idx = 0
- line = ''
- line_token = str(predict_line.text).split(' ')
- label_token = str(predict_line.label).split(' ')
- len_seq = len(label_token)
- if len(line_token) != len(label_token):
- tf.logging.info(predict_line.text)
- tf.logging.info(predict_line.label)
- break
- for id in prediction:
- if idx >= len_seq:
- break
- if id == 0:
- continue
- curr_labels = id2label[id]
- if curr_labels in ['[CLS]', '[SEP]']:
- continue
- try:
- line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
- except Exception as e:
- tf.logging.info(e)
- tf.logging.info(predict_line.text)
- tf.logging.info(predict_line.label)
- line = ''
- break
- idx += 1
- writer.write(line + '\n')
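- # Each line written above has the form "token gold_label predicted_label", with a blank line
- # between sentences, which is the format conlleval.return_report expects.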
-
-
- def main(_):
- tf.logging.set_verbosity(tf.logging.INFO)
- processors = {
- "ner": NerProcessor
- }
-
- bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
-
- if FLAGS.max_seq_length > bert_config.max_position_embeddings:
- raise ValueError(
- "Cannot use sequence length %d because the BERT model "
- "was only trained up to sequence length %d" %
- (FLAGS.max_seq_length, bert_config.max_position_embeddings))
- ## del last training file
- if(FLAGS.do_train and FLAGS.clean):
- if os.path.exists(FLAGS.output_dir):
- def del_file(path):
- ls = os.listdir(path)
- for i in ls:
- c_path = os.path.join(path, i)
- if os.path.isdir(c_path):
- del_file(c_path)
- else:
- os.remove(c_path)
-
- try:
- del_file(FLAGS.output_dir)
- except Exception as e:
- print(e)
- print('please remove the files of the output dir and data.conf')
- exit(-1)
- tf.gfile.MakeDirs(FLAGS.output_dir)
-
- task_name = FLAGS.task_name.lower()
- if task_name not in processors:
- raise ValueError("Task not found: %s" % (task_name))
- processor = processors[task_name]()
- label_list = processor.get_labels()
- label_map = labeltoid(label_list)
- print(label_map)
-
- tokenizer = tokenization.FullTokenizer(
- vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
- tpu_cluster_resolver = None
- if FLAGS.use_tpu and FLAGS.tpu_name:
- tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
- FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-
- is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-
- run_config = tf.contrib.tpu.RunConfig(
- cluster=tpu_cluster_resolver,
- master=FLAGS.master,
- model_dir=None,
- save_checkpoints_steps=FLAGS.save_checkpoints_steps,
- tpu_config=tf.contrib.tpu.TPUConfig(
- iterations_per_loop=FLAGS.iterations_per_loop,
- num_shards=FLAGS.num_tpu_cores,
- per_host_input_for_training=is_per_host))
-
- train_examples = None
- num_train_steps = None
- num_warmup_steps = None
-
- if FLAGS.do_train:
- train_examples = processor.get_train_examples(FLAGS.data_dir)
- num_train_steps = int(
- len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
- num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
-
- model_fn = model_fn_builder(
- bert_config=bert_config,
- num_labels=len(label_list) + 1,
- init_checkpoint=FLAGS.init_checkpoint,
- learning_rate=FLAGS.learning_rate,
- num_train_steps=num_train_steps,
- num_warmup_steps=num_warmup_steps,
- use_tpu=FLAGS.use_tpu,
- use_one_hot_embeddings=FLAGS.use_tpu)
-
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=FLAGS.output_dir,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size)
-
- if FLAGS.do_train:
- train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
- filed_based_convert_examples_to_features(
- train_examples, label_map, FLAGS.max_seq_length, tokenizer, train_file)
- tf.logging.info("***** Running training *****")
- tf.logging.info(" Num examples = %d", len(train_examples))
- tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
- tf.logging.info(" Num steps = %d", num_train_steps)
- train_input_fn = file_based_input_fn_builder(
- input_file=train_file,
- seq_length=FLAGS.max_seq_length,
- is_training=True,
- drop_remainder=True)
-
- if FLAGS.do_eval:
- eval_examples = processor.get_dev_examples(FLAGS.data_dir)
- eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
- filed_based_convert_examples_to_features(
- eval_examples, label_map, FLAGS.max_seq_length, tokenizer, eval_file, mode="dev")
- tf.logging.info("***** Running dev *****")
- tf.logging.info(" Num examples = %d", len(eval_examples))
- tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)
- eval_input_fn = file_based_input_fn_builder(
- input_file=eval_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
-
- ## Get id2label
- with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:
- label2id = pickle.load(rf)
- id2label = {value: key for key, value in label2id.items()}
-
- best_result = 0
- all_results = []
- if FLAGS.do_train:
- for i in range(int(FLAGS.num_train_epochs)):
- print('**'*40)
- print('Train {} epoch'.format(i+1))
- estimator.train(input_fn=train_input_fn)
- result = estimator.predict(input_fn=eval_input_fn)
- output_dev_file = os.path.join(FLAGS.output_dir, "label_dev.txt")
- with codecs.open(output_dev_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, eval_examples, result, id2label, 'dev')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_dev_file)
- print('cur epoch result: ',overall_f)
- print(''.join(eval_result))
- all_results.append(overall_f)
- ###Do dev operation
- if(overall_f>best_result):
- print('**'*40)
- print('Found better model, saved!')
- best_result = overall_f
- cur_ckpt_path = estimator.latest_checkpoint()
- best_model_path = '/'.join(cur_ckpt_path.split('/')[:-1]+['model.ckpt-best'])
- save_best_model(cur_ckpt_path,best_model_path)
- print('**'*40)
- print('Training completed!')
- print('all_results: ',all_results)
- print('Best result: ',np.max(all_results))
- print('Avg result: ',np.mean(all_results))
-
- all_logs = 'all_results:'+' '.join([str(i) for i in all_results])+'\n' \
- +'Best result:'+str(np.max(all_results))+'\n' \
- +'Avg result:'+str(np.mean(all_results))+'\n'
- if FLAGS.do_predict:
- print('***********************Running Prediction************************')
- print('Use the model which performs best on dev data')
- cur_ckpt_path = estimator.latest_checkpoint()
- best_model_path = '/'.join(cur_ckpt_path.split('/')[:-1]+['model.ckpt-best'])
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=None,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size,
- warm_start_from=best_model_path)
- test_examples = processor.get_test_examples(FLAGS.data_dir)
- test_file = os.path.join(FLAGS.output_dir, "test.tf_record")
- filed_based_convert_examples_to_features(
- test_examples, label_map, FLAGS.max_seq_length, tokenizer, test_file, mode="test")
- tf.logging.info("***** Running test *****")
- tf.logging.info(" Num examples = %d", len(test_examples))
- tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
- test_input_fn = file_based_input_fn_builder(
- input_file=test_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
- result = estimator.predict(input_fn=test_input_fn)
- output_test_file = os.path.join(FLAGS.output_dir, "label_test_best.txt")
- with codecs.open(output_test_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, test_examples, result, id2label, 'test')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_test_file)
- print(''.join(eval_result))
- sys.exit(0) # NOTE: the script exits here, so the "restore from last ckpt" prediction below never runs
- all_logs = all_logs+'Use the model which performs best on dev data'+'\n'+''.join(eval_result)+'\n'
- print('Use the model restored from the last checkpoint')
- estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=FLAGS.use_tpu,
- model_fn=model_fn,
- config=run_config,
- model_dir=None,
- train_batch_size=FLAGS.train_batch_size,
- eval_batch_size=FLAGS.eval_batch_size,
- predict_batch_size=FLAGS.predict_batch_size,
- warm_start_from=cur_ckpt_path)
- test_examples = processor.get_test_examples(FLAGS.data_dir)
- test_file = os.path.join(FLAGS.output_dir, "test.tf_record")
- filed_based_convert_examples_to_features(
- test_examples, label_map, FLAGS.max_seq_length, tokenizer, test_file, mode="test")
- tf.logging.info("***** Running test *****")
- tf.logging.info(" Num examples = %d", len(test_examples))
- tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)
- test_input_fn = file_based_input_fn_builder(
- input_file=test_file,
- seq_length=FLAGS.max_seq_length,
- is_training=False,
- drop_remainder=False)
- result = estimator.predict(input_fn=test_input_fn)
- output_test_file = os.path.join(FLAGS.output_dir, "label_test_last.txt")
- with codecs.open(output_test_file, 'w', encoding='utf-8') as writer:
- result_to_pair(writer, test_examples, result, id2label, 'test')
- from conlleval import return_report
- eval_result,overall_f = return_report(output_test_file)
- print(''.join(eval_result))
- all_logs = all_logs+'Use the model restored from the last checkpoint'+'\n'+''.join(eval_result)+'\n'
- print(all_logs)
-
-
- if __name__ == "__main__":
- if os.path.exists(os.path.join(FLAGS.output_dir, "restore_map_test.txt")):
- os.remove(os.path.join(FLAGS.output_dir, "restore_map_test.txt"))
- if os.path.exists(os.path.join(FLAGS.output_dir, "restore_map_dev.txt")):
- os.remove(os.path.join(FLAGS.output_dir, "restore_map_dev.txt"))
- tf.app.run()