|
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on 22/02/19
-
- author: fenia
- """
-
- import sys
- import os
- from tabulate import tabulate
- import itertools
- import yaml
- import numpy as np
- import pickle as pkl
- import torch
- import matplotlib
- matplotlib.use('Agg')
- import matplotlib.pyplot as plt
-
-
def solve(A, B):
    """
    Smallest absolute difference between any element of A and any element of B.

    Args:
        A (iterable): values convertible to int
        B (iterable): values convertible to int

    Returns:
        int: min |a - b| over all pairs, or sys.maxsize when either input is empty
    """
    left = sorted(int(v) for v in A)
    right = sorted(int(v) for v in B)
    i, j = 0, 0
    best = sys.maxsize

    # Two-pointer sweep over both sorted lists
    while i < len(left) and j < len(right):
        best = min(best, abs(left[i] - right[j]))

        # Advance the pointer that sits on the smaller value
        if left[i] < right[j]:
            i += 1
        else:
            j += 1
    return best
-
-
def _mention_surface(doc, entity):
    """Join the text of every mention span of `entity` in the flat token list `doc` with ' | '."""
    starts = entity.mstart.split(':')
    ends = entity.mend.split(':')
    return ' | '.join(' '.join(doc[int(s):int(e)]) for s, e in zip(starts, ends))


def write_errors(preds, info, ofile, map_=None):
    """
    Write model errors (pairs whose prediction differs from the truth) to file.

    Args:
        preds (iterable): sequences of predicted relation ids, aligned with `info`
        info (iterable): sequences of pair-info dicts (falsy entries are dropped); each dict
            is read for 'rel', 'cross', 'doc', 'pmid', 'entA', 'entB', 'sentA' and 'sentB'
        ofile (str): output path prefix; '.errors' is appended
        map_: lookup from relation id to relation name, indexed as map_[id]
    """
    # This function writes the '.errors' file, so announce errors (not predictions)
    print('Saving errors ... ', end="")
    with open(ofile+'.errors', 'w') as outfile:
        for group_preds, group_info in zip(preds, info):
            group_info = [pair for pair in group_info if pair]
            assert len(group_preds) == len(group_info)

            for pred, pair in zip(group_preds, group_info):
                if pred == pair['rel']:
                    continue  # correct prediction, nothing to report

                outfile.write('Prediction: {} \t Truth: {} \t Type: {} \n'.format(map_[pred], map_[pair['rel']],
                                                                                 pair['cross']))
                # 'doc' is a list of token lists; flatten it so mention offsets index tokens directly
                doc = [tok for sent in pair['doc'] for tok in sent]
                outfile.write('{}\n{}\n'.format(pair['pmid'], ' '.join(doc)))

                outfile.write('Arg1: {} | {}\n'.format(pair['entA'].id, _mention_surface(doc, pair['entA'])))
                outfile.write('Arg2: {} | {}\n'.format(pair['entB'].id, _mention_surface(doc, pair['entB'])))
                outfile.write('Distance: {}\n'.format(solve(pair['sentA'].split(':'), pair['sentB'].split(':'))))
                outfile.write('\n')
    print('DONE')
-
-
def write_preds(preds, info, ofile, map_=None):
    """
    Write predictions to file, one '|'-separated line per pair.

    Pairs predicted as '1:NR:2' are skipped. Each line holds: pmid (text before the
    first '__'), entA id, entB id, the pair's 'cross' value, the minimum sentence
    distance between the two arguments, and the predicted relation name.

    Args:
        preds (iterable): sequences of predicted relation ids, aligned with `info`
        info (iterable): sequences of pair-info dicts (falsy entries are dropped)
        ofile (str): output path prefix; '.preds' is appended
        map_: lookup from relation id to relation name, indexed as map_[id]
    """
    # This function writes the '.preds' file, so announce predictions (not errors)
    print('Saving predictions ... ', end="")
    with open(ofile+'.preds', 'w') as outfile:
        for group_preds, group_info in zip(preds, info):
            group_info = [pair for pair in group_info if pair]
            assert len(group_preds) == len(group_info)

            for pred, pair in zip(group_preds, group_info):
                if map_[pred] != '1:NR:2':
                    fields = [pair['pmid'].split('__')[0],
                              pair['entA'].id, pair['entB'].id, pair['cross'],
                              str(solve(pair['sentA'].split(':'), pair['sentB'].split(':'))),
                              map_[pred]]
                    outfile.write('{}\n'.format('|'.join(fields)))
    print('DONE')
-
-
def plot_learning_curve(trainer, model_folder):
    """
    Plot the learning curves for training and test set (loss and primary score measure)
    and save them as 'learn_curves.png' inside `model_folder`.

    Args:
        trainer (Class): trainer object exposing train_res / test_res with 'loss' and 'score' series
        model_folder (str): folder to save figures
    """
    epochs = list(range(len(trainer.train_res['loss'])))
    fig = plt.figure()

    # Top panel: loss
    plt.subplot(2, 1, 1)
    plt.plot(epochs, trainer.train_res['loss'], 'b', label='train')
    plt.plot(epochs, trainer.test_res['loss'], 'g', label='test')
    plt.legend()
    plt.ylabel('Loss')
    plt.yticks(np.arange(0, 1, 0.1))

    # Bottom panel: score
    plt.subplot(2, 1, 2)
    plt.plot(epochs, trainer.train_res['score'], 'b', label='train')
    plt.plot(epochs, trainer.test_res['score'], 'g', label='test')
    plt.legend()
    plt.ylabel('F1-score')
    plt.xlabel('Epochs')
    plt.yticks(np.arange(0, 1, 0.1))

    fig.savefig(model_folder + '/learn_curves.png', bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed; release it so repeated calls do not pile up
    plt.close(fig)
-
-
def print_results(scores, scores_class, show_class, time):
    """
    Print class-wise results.

    Args:
        scores (dict): micro and macro scores
        scores_class (list): score rows per class, laid out as [class, P, R, F1]
        show_class (bool): print the per-class table (True) or a one-line summary (False)
        time: elapsed time in seconds, rendered through humanized_time
    """

    def indent(txt, spaces=18):
        return "\n".join(" " * spaces + ln for ln in txt.splitlines())

    if show_class:
        # print results for every class; summary rows are added to a copy so the
        # caller's list is not modified
        rows = list(scores_class)
        rows.append(['-----', None, None, None])
        rows.append(['macro score', scores['macro_p'], scores['macro_r'], scores['macro_f']])
        rows.append(['micro score', scores['micro_p'], scores['micro_r'], scores['micro_f']])
        print(' | {}\n'.format(humanized_time(time)))
        # the keyword is `floatfmt`; any other spelling makes tabulate raise TypeError
        print(indent(tabulate(rows,
                              headers=['Class', 'P', 'R', 'F1'],
                              tablefmt='orgtbl',
                              floatfmt=".4f",
                              missingval="")))
        print()
    else:
        print('ACC = {:.04f} , '
              'MICRO P/R/F1 = {:.04f}\t{:.04f}\t{:.04f} | '.format(scores['acc'], scores['micro_p'],
                                                                    scores['micro_r'], scores['micro_f']), end="")

        # every count is left-aligned in a 7-character column
        counts = 'TP/ACTUAL/PRED = {:<7}/{:<7}/{:<7}, TOTAL {:<7}'
        print(counts.format(scores['tp'], scores['true'], scores['pred'], scores['total']), end="")
        print(' | {}'.format(humanized_time(time)))
-
-
class Tee(object):
    """
    File-like fan-out: whatever is written is forwarded to every wrapped stream
    (used to mirror stdout into a file).
    """
    def __init__(self, *files):
        self.files = files

    def write(self, obj):
        """Write `obj` to each stream, flushing right away so output is visible immediately."""
        for stream in self.files:
            stream.write(obj)
            stream.flush()

    def flush(self):
        """Flush every wrapped stream."""
        for stream in self.files:
            stream.flush()
-
-
def humanized_time(second):
    """
    Render a duration as hours, minutes and seconds.

    :param second: time in seconds
    :return: human readable time, e.g. '1h 01m 01s'
    """
    hours, minutes = divmod(second // 60, 60)
    seconds = second % 60
    return "%dh %02dm %02ds" % (hours, minutes, seconds)
-
-
def setup_log(params, mode):
    """
    Setup .log file to record training process and results.

    Creates the model folder if needed, dumps `params` to 'params_<mode>.yaml' and
    replaces sys.stdout with a Tee that also writes to 'info_<mode>.log'.

    Args:
        params (dict): model parameters; 'folder' and 'full_train' are read here
        mode (str): run mode used in the file names; overridden by 'full_train'
            when params['full_train'] is truthy

    Returns:
        model_folder (str): model directory
    """

    model_folder = params['folder']
    # exist_ok removes the check-then-create race when several runs share the same folder
    os.makedirs(model_folder, exist_ok=True)

    if params['full_train']:
        mode = 'full_train'
    with open(os.path.join(model_folder, f'params_{mode}.yaml'), 'w') as f:
        f.write(yaml.dump(params))

    log_file = model_folder + '/info_'+mode+'.log'
    # The log handle is intentionally left open: the Tee installed as sys.stdout keeps writing to it
    f = open(log_file, 'w')
    sys.stdout = Tee(sys.stdout, f)
    return model_folder
-
-
def observe(model):
    """
    Observe model parameters: name, range of matrices & gradients

    Parameters without a gradient (param.grad is None, e.g. frozen weights or before
    the first backward pass) are reported with a 'nan' gradient range.

    Args
        model: specified model object (must provide named_parameters())
    """
    line = 'Name: {:<30}\tRange of data: [{:.4f}, {:.4f}]\tRange of gradient: [{:.4f}, {:.4f}]'
    for name, param in model.named_parameters():
        # Copy each tensor to the CPU once and reuse it for both min and max
        data = param.data.to('cpu').numpy()
        if param.grad is None:
            grad_min = grad_max = float('nan')
        else:
            grad = param.grad.data.to('cpu').numpy()
            grad_min, grad_max = np.min(grad), np.max(grad)
        print(line.format(name, np.min(data), np.max(data), grad_min, grad_max))
    print('--------------------------------------')
-
-
def save_model(model_folder, trainer, loader):
    """
    Store the run artefacts in `model_folder`: `loader` is pickled to 'mappings.pkl'
    and the state dict of `trainer.model` is saved to 're.model'.
    """
    print('\nSaving the model & the parameters ...')
    mappings_path = os.path.join(model_folder, 'mappings.pkl')
    weights_path = os.path.join(model_folder, 're.model')

    # mappings first, weights second
    with open(mappings_path, 'wb') as handle:
        pkl.dump(loader, handle, pkl.HIGHEST_PROTOCOL)
    torch.save(trainer.model.state_dict(), weights_path)
-
-
def load_model(model_folder, trainer):
    """
    Restore the weights saved as 're.model' in `model_folder` into `trainer.model`,
    mapping tensors onto `trainer.device`, and return the same trainer object.
    """
    print('\nLoading model & parameters ...')
    weights_path = os.path.join(model_folder, 're.model')
    state = torch.load(weights_path, map_location=trainer.device)
    trainer.model.load_state_dict(state)
    return trainer
-
-
def load_mappings(model_folder):
    """
    Unpickle and return the object stored in 'mappings.pkl' inside `model_folder`.

    NOTE: unpickling runs code embedded in the file, so only load folders you trust.
    """
    mappings_path = os.path.join(model_folder, 'mappings.pkl')
    with open(mappings_path, 'rb') as handle:
        return pkl.load(handle)
-
-
def print_options(params):
    """
    Print the run configuration read from `params`.

    Args:
        params (dict): run options. Every key indexed below must be present;
            'fixbug' is optional and is reported as False when missing.
    """
    # 'Length' is reported as 2 ** walks_iter, or 0 when walks_iter is falsy.
    # NOTE(review): the whitespace inside the literal below is printed verbatim --
    # confirm the intended column alignment before re-indenting it.
    print('''\nParameters:
- Train Data {}
- Test Data {}
- Embeddings {}, Freeze: {}
- Save folder {}

- batchsize {}
- Walks iteration {} -> Length = {}
- beta {}

- Context {}
- Node Type {}
- Distances {}
- Edge Types {}

- Epoch {}
- UNK word prob {}
- Parameter Average {}
- Early stop {} -> Patience = {}
- Regularization {}
- Gradient Clip {}
- Dropout I/O {} / {}
- Learning rate {}
- Seed {}
- Fixbug {}
'''.format(params['train_data'], params['test_data'], params['embeds'], params['freeze_words'],
           params['folder'],
           params['batch'],
           params['walks_iter'], 2 ** params['walks_iter'] if params['walks_iter'] else 0, params['beta'],
           params['context'], params['types'], params['dist'], params['edges'],
           params['epoch'], params['unk_w_prob'], params['param_avg'],
           params['early_stop'], params['patience'],
           params['reg'], params['gc'], params['drop_i'], params['drop_o'],
           params['lr'], params['seed'],
           params.get('fixbug', False)))
|