"""Preprocess the raw KG + text data under ./origin_data/ into the id
mappings, word embedding matrix, and padded sentence features that the
training code loads from ./data/."""

import json
import os

import numpy as np

# folder of raw training data
data_path = "./origin_data/"
# folder for the exported files
export_path = "./data/"
# fixed (padded / truncated) sentence length in tokens
fixlen = 120
# position embeddings cover relative offsets in [-maxlen, +maxlen]
maxlen = 100

# the export folder must exist before any output file is opened there
os.makedirs(export_path, exist_ok=True)

word2id = {}
relation2id = {}
word_size = 0
word_vec = None
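
# pos_embed maps a relative token offset (token index minus entity position)
# to a non-negative index for the position embedding table; offsets are
# clipped, so with maxlen = 100 every result lies in [0, 201], e.g.
#   pos_embed(-150) == 0, pos_embed(0) == 100, pos_embed(150) == 201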
def pos_embed(x):
    return max(0, min(x + maxlen, maxlen + maxlen + 1))

def find_index(x, y):
    for index, item in enumerate(y):
        if x == item:
            return index
    return -1
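
# init_word builds word2id in three blocks: entities first (taken from the
# first two tab-separated columns of kg/train.txt, text/train.txt and
# text/test.txt), then the words of the pre-trained embedding file
# text/vec.txt, then the special tokens UNK and BLANK. It returns
# [entity_total, word_total] and fills the global word_vec matrix; entity
# rows keep an all-ones placeholder embedding.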
def init_word():
    global word2id, word_size, word_vec
    res = []
    ff = open(export_path + "/entity2id.txt", "w")

    def add_entity(e):
        # assign the next free id to an unseen entity and record it
        if e not in word2id:
            word2id[e] = len(word2id)
            ff.write("%s\t%d\n" % (e, word2id[e]))

    for file_name in ["kg/train.txt", "text/train.txt", "text/test.txt"]:
        f = open(data_path + file_name, "r")
        for line in f:
            items = line.strip().split("\t")
            add_entity(items[0])
            add_entity(items[1])
        f.close()
    res.append(len(word2id))
    ff.close()

    print('reading word embedding data...')
    f = open(data_path + 'text/vec.txt', "r")
    total, size = f.readline().strip().split()[:2]
    total = int(total)
    word_size = int(size)
    # rows 0 .. res[0]-1 (entities) and the last two rows (UNK, BLANK) keep
    # the all-ones initialization; the two extra rows are needed so that the
    # UNK/BLANK ids assigned below are valid indices into this matrix
    vec = np.ones((total + res[0] + 2, word_size), dtype=np.float32)
    for i in range(total):
        content = f.readline().strip().split()
        word2id[content[0]] = len(word2id)
        for j in range(word_size):
            vec[i + res[0]][j] = float(content[j + 1])
    f.close()
    word2id['UNK'] = len(word2id)
    word2id['BLANK'] = len(word2id)
    word_vec = vec
    res.append(len(word2id))
    return res
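
# init_relation maps relation names to ids: the textual relations listed in
# text/relation2id.txt come first, then any KG-only relations found in
# kg/train.txt are appended. It returns [textual_rel_total, rel_total].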
def init_relation():
    global relation2id
    print('reading relation ids...')
    res = []
    ff = open(export_path + "/relation2id.txt", "w")
    f = open(data_path + "text/relation2id.txt", "r")
    total = int(f.readline().strip())
    for i in range(total):
        content = f.readline().strip().split()
        if content[0] not in relation2id:
            relation2id[content[0]] = len(relation2id)
            ff.write("%s\t%d\n" % (content[0], relation2id[content[0]]))
    f.close()
    res.append(len(relation2id))
    f = open(data_path + "kg/train.txt", "r")
    for line in f.readlines():
        h, t, r = line.strip().split("\t")
        if r not in relation2id:
            relation2id[r] = len(relation2id)
            ff.write("%s\t%d\n" % (r, relation2id[r]))
    f.close()
    ff.close()
    res.append(len(relation2id))
    return res
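
# sort_files rewrites a corpus file so that all sentences sharing the same
# (head entity, tail entity, relation) key are contiguous; the scopes built
# later in init_train_files rely on this grouping for bag-level
# (multi-instance) training. limit[0] is the number of textual relations:
# any relation outside that range is mapped to NA.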
def sort_files(name, limit):
    bags = {}
    f = open(data_path + "text/" + name + ".txt", "r")
    s = 0
    while True:
        content = f.readline()
        if content == "":
            break
        s = s + 1
        origin_data = content
        content = content.strip().split()
        en1_id = content[0]
        en2_id = content[1]
        rel_name = content[4]
        # relations beyond the textual relation set fall back to NA
        if rel_name in relation2id and relation2id[rel_name] < limit[0]:
            relation = relation2id[rel_name]
        else:
            relation = relation2id['NA']
        id1 = str(en1_id) + "#" + str(en2_id)
        id2 = str(relation)
        if id1 not in bags:
            bags[id1] = {}
        if id2 not in bags[id1]:
            bags[id1][id2] = []
        bags[id1][id2].append(origin_data)
    f.close()
    f = open(data_path + name + "_sort.txt", "w")
    f.write("%d\n" % s)
    for i in bags:
        for j in bags[i]:
            for k in bags[i][j]:
                f.write(k)
    f.close()
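
# init_train_files turns a sorted corpus file into fixed-length feature
# arrays: token ids (sen_word), relative-position indices for both entities
# (sen_pos1 / sen_pos2), a piecewise mask (sen_mask), sentence lengths,
# labels, and head / tail entity ids. instance_scope holds the [first, last]
# sentence index of each (head, tail, relation) bag. The expected line
# format is: head_id tail_id head_name tail_name relation word1 ... wordN,
# with a trailing token that is dropped by content[5:-1].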
def init_train_files(name, limit):
    print('reading ' + name + ' data...')
    f = open(data_path + name + ".txt", "r")
    total = int(f.readline().strip())
    sen_word = np.zeros((total, fixlen), dtype=np.int32)
    sen_pos1 = np.zeros((total, fixlen), dtype=np.int32)
    sen_pos2 = np.zeros((total, fixlen), dtype=np.int32)
    sen_mask = np.zeros((total, fixlen), dtype=np.int32)
    sen_len = np.zeros((total), dtype=np.int32)
    sen_label = np.zeros((total), dtype=np.int32)
    sen_head = np.zeros((total), dtype=np.int32)
    sen_tail = np.zeros((total), dtype=np.int32)
    instance_scope = []
    instance_triple = []
    for s in range(total):
        content = f.readline().strip().split()
        sentence = content[5:-1]
        en1_id = content[0]
        en2_id = content[1]
        en1_name = content[2]
        en2_name = content[3]
        rel_name = content[4]
        if rel_name in relation2id and relation2id[rel_name] < limit[0]:
            relation = relation2id[rel_name]
        else:
            relation = relation2id['NA']
        # locate both entity mentions and replace them with their entity ids
        en1pos = 0
        en2pos = 0
        for i in range(len(sentence)):
            if sentence[i] == en1_name:
                sentence[i] = en1_id
                en1pos = i
                sen_head[s] = word2id[en1_id]
            if sentence[i] == en2_name:
                sentence[i] = en2_id
                en2pos = i
                sen_tail[s] = word2id[en2_id]
        en_first = min(en1pos, en2pos)
        en_second = en1pos + en2pos - en_first
        for i in range(fixlen):
            sen_word[s][i] = word2id['BLANK']
            sen_pos1[s][i] = pos_embed(i - en1pos)
            sen_pos2[s][i] = pos_embed(i - en2pos)
            # piecewise mask: 0 = padding, 1/2/3 = up to the first entity,
            # between the entities, after the second entity
            if i >= len(sentence):
                sen_mask[s][i] = 0
            elif i - en_first <= 0:
                sen_mask[s][i] = 1
            elif i - en_second <= 0:
                sen_mask[s][i] = 2
            else:
                sen_mask[s][i] = 3
        for i, word in enumerate(sentence):
            if i >= fixlen:
                break
            elif word not in word2id:
                sen_word[s][i] = word2id['UNK']
            else:
                sen_word[s][i] = word2id[word]
        sen_len[s] = min(fixlen, len(sentence))
        sen_label[s] = relation
        # sentences with the same entity pair and relation form one bag;
        # the input is sorted, so each bag is a contiguous scope
        tup = (en1_id, en2_id, relation)
        if instance_triple == [] or instance_triple[-1] != tup:
            instance_triple.append(tup)
            instance_scope.append([s, s])
        instance_scope[len(instance_triple) - 1][1] = s
        if (s + 1) % 100 == 0:
            print(s)
    f.close()
    return np.array(instance_triple), np.array(instance_scope), sen_len, sen_label, sen_word, sen_pos1, sen_pos2, sen_mask, sen_head, sen_tail
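
# init_kg exports the KG triples with their numeric ids and then prepends
# the entry count to entity2id.txt and relation2id.txt, matching the
# "count on the first line" convention used by the other exported files.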
def init_kg():
    ff = open(export_path + "/triple2id.txt", "w")
    f = open(data_path + "kg/train.txt", "r")
    content = f.readlines()
    ff.write("%d\n" % len(content))
    for i in content:
        h, t, r = i.strip().split("\t")
        ff.write("%d\t%d\t%d\n" % (word2id[h], word2id[t], relation2id[r]))
    f.close()
    ff.close()

    for file_name in ["/entity2id.txt", "/relation2id.txt"]:
        f = open(export_path + file_name, "r")
        content = f.readlines()
        f.close()
        f = open(export_path + file_name, "w")
        f.write("%d\n" % len(content))
        for i in content:
            f.write(i.strip() + "\n")
        f.close()
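
# Driver: build the id mappings first (relations, then words/entities),
# export the KG and the config, group the corpus into bags, and finally
# serialize the padded feature arrays for train and test.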
textual_rel_total, rel_total = init_relation()
entity_total, word_total = init_word()

print(textual_rel_total)
print(rel_total)
print(entity_total)
print(word_total)
print(word_vec.shape)
f = open(data_path + "word2id.txt", "w")
for i in word2id:
    f.write("%s\t%d\n" % (i, word2id[i]))
f.close()
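
# Export the processed KG and a config json that bundles everything the
# training code needs to interpret the arrays: id mappings, embedding
# size, padding constants, and vocabulary / relation counts.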
init_kg()
np.save(export_path + 'vec', word_vec)
f = open(export_path + 'config', "w")
f.write(json.dumps({
    "word2id": word2id,
    "relation2id": relation2id,
    "word_size": word_size,
    "fixlen": fixlen,
    "maxlen": maxlen,
    "entity_total": entity_total,
    "word_total": word_total,
    "rel_total": rel_total,
    "textual_rel_total": textual_rel_total}))
f.close()
sort_files("train", [textual_rel_total, rel_total])
sort_files("test", [textual_rel_total, rel_total])
# word_vec = np.load(export_path + 'vec.npy')
# f = open(export_path + "config", 'r')
# config = json.loads(f.read())
# f.close()
# relation2id = config["relation2id"]
# word2id = config["word2id"]

instance_triple, instance_scope, train_len, train_label, train_word, train_pos1, train_pos2, train_mask, train_head, train_tail = init_train_files("train_sort", [textual_rel_total, rel_total])
np.save(export_path + 'train_instance_triple', instance_triple)
np.save(export_path + 'train_instance_scope', instance_scope)
np.save(export_path + 'train_len', train_len)
np.save(export_path + 'train_label', train_label)
np.save(export_path + 'train_word', train_word)
np.save(export_path + 'train_pos1', train_pos1)
np.save(export_path + 'train_pos2', train_pos2)
np.save(export_path + 'train_mask', train_mask)
np.save(export_path + 'train_head', train_head)
np.save(export_path + 'train_tail', train_tail)

instance_triple, instance_scope, test_len, test_label, test_word, test_pos1, test_pos2, test_mask, test_head, test_tail = init_train_files("test_sort", [textual_rel_total, rel_total])
np.save(export_path + 'test_instance_triple', instance_triple)
np.save(export_path + 'test_instance_scope', instance_scope)
np.save(export_path + 'test_len', test_len)
np.save(export_path + 'test_label', test_label)
np.save(export_path + 'test_word', test_word)
np.save(export_path + 'test_pos1', test_pos1)
np.save(export_path + 'test_pos2', test_pos2)
np.save(export_path + 'test_mask', test_mask)
np.save(export_path + 'test_head', test_head)
np.save(export_path + 'test_tail', test_tail)
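
# After a successful run, ./data/ contains vec.npy, config, entity2id.txt,
# relation2id.txt, triple2id.txt, and the train_* / test_* feature arrays
# saved above.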