|
- import pickle as pkl
- import numpy as np
- import torch
- import sys
- from tqdm import tqdm
- from embed import PositionalEncoder
- from torch.utils.data import Dataset
-
-
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix: D^{-1/2} A^T D^{-1/2}.

    Rows with zero degree would produce infinite scaling factors; those
    factors are zeroed so isolated nodes stay all-zero in the result.
    """
    degrees = np.array(adj.sum(1)).flatten()
    with np.errstate(divide='ignore'):
        # 0 ** -0.5 -> inf; the divide warning is deliberately silenced.
        inv_sqrt_deg = np.power(degrees, -0.5)
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    d_inv_sqrt = np.diag(inv_sqrt_deg)
    return adj.dot(d_inv_sqrt).transpose().dot(d_inv_sqrt)
-
-
def normalize_mean_std(adj):
    """Min-max scale `adj` into [0, 1].

    NOTE(review): despite the name, this performs min-max scaling, not
    mean/std standardization; the name is kept for caller compatibility.
    If all entries are equal the denominator is zero and NumPy will emit
    nan/inf with a runtime warning (original behavior, preserved).
    """
    # Fix: the original bound the builtins `min` and `max` as locals.
    lo = np.amin(adj)
    hi = np.amax(adj)
    return (adj - lo) / (hi - lo)
-
-
def preprocess_adj(adj):
    """Normalize and zero-pad every adjacency matrix to a common size.

    `adj` is an object array of per-graph square matrices; it is mutated
    in place (each entry is replaced by its padded, normalized version).
    Returns the stacked dense adjacency tensor and a (N, max_nodes, 1)
    mask marking real (non-padding) nodes.
    """
    n_graphs = adj.shape[0]
    max_nodes = max(a.shape[0] for a in adj)
    mask = np.zeros((n_graphs, max_nodes, 1))  # 1.0 where a node exists

    for idx in tqdm(range(n_graphs)):
        # Capture the true node count before adj[idx] is overwritten.
        n_nodes = adj[idx].shape[0]
        normalized = normalize_adj(adj[idx])
        tail = max_nodes - n_nodes
        mask[idx, :n_nodes, :] = 1.
        adj[idx] = np.pad(normalized, ((0, tail), (0, tail)), mode='constant')

    return np.array(list(adj)), mask  # coo_to_tuple(sparse.COO(np.array(list(adj)))), mask
-
-
def preprocess_features(features, device):
    """Zero-pad per-sample embeddings to a common length and add positional encodings.

    Each entry of `features` is assumed to be a (seq_len, 300) embedding
    matrix — TODO confirm the 300-dim assumption against the data files.
    Returns a single (N, max_len, 300) tensor on `device`.
    """
    max_len = max(len(f) for f in features)
    n_samples = features.shape[0]
    padded = np.zeros([n_samples, max_len, 300])
    encoder = PositionalEncoder(300, max_seq_len=max_len)

    for idx in tqdm(range(n_samples)):
        emb = np.array(features[idx])
        tail = max_len - emb.shape[0]  # rows of zero padding for this sample
        padded[idx, :, :] = np.pad(emb, ((0, tail), (0, 0)), mode='constant')

    batch = torch.from_numpy(padded)
    return encoder(batch.to(device), device)
-
-
def accuracy(output, labels):
    """Fraction of rows in `output` whose argmax equals the label.

    `output` is (N, C) scores; `labels` is a length-N class-index tensor.
    Returns a 0-dim double tensor in [0, 1].
    """
    predicted = output.max(dim=1).indices.type_as(labels)
    n_correct = predicted.eq(labels).double().sum()
    return n_correct / len(labels)
-
-
class DatasetLoader(Dataset):
    """Dataset over pickled graph samples stored under data/<name>/.

    The "train" split is the first half of the samples; any other
    `set_name` selects the second half.  Each sample yields node
    embeddings, a dependency adjacency matrix, a polarity label vector,
    and token ids for the sentence.
    """

    def __init__(self, dataset_name, set_name):
        # NOTE(review): pickle.load is unsafe on untrusted files; these are
        # assumed to be locally generated artifacts.
        self.vocab = self._load(dataset_name, "vocab")
        feats = self._load(dataset_name, "features")
        half = int(0.5 * len(feats))
        if set_name == "train":
            self.features = feats[:half]
            # Companion files: take the first len(self.features) entries.
            section = slice(None, len(self.features))
        else:
            self.features = feats[half:]
            # Companion files: take the last len(self.features) entries
            # (matches the original code's negative slicing, including the
            # empty-split edge where -0 selects the whole list).
            section = slice(-len(self.features), None)
        self.adjs = self._load(dataset_name, "adj")[section]
        self.labels = self._load(dataset_name, "label")[section]
        self.sentences = self._load(dataset_name, "sentence")[section]
        # Node count of the first sample — presumably uniform across the
        # split; TODO confirm against the data generation code.
        self.nnodes = self.features[0].shape[0]

    @staticmethod
    def _load(dataset_name, suffix):
        """Unpickle and return data/<name>/<name>.all.<suffix>."""
        path = "data/{}/{}.all.{}".format(dataset_name, dataset_name, suffix)
        with open(path, 'rb') as f:
            return pkl.load(f)

    def __getitem__(self, index):
        """Return sample `index` as a dict of tensors."""
        # adj = normalize_adj(adj)
        return {
            'node_embeddings': torch.FloatTensor(self.features[index]),
            'dependency_graph': torch.FloatTensor(self.adjs[index]),
            'polarity': torch.FloatTensor(self.labels[index]),
            'sentence': torch.IntTensor(self.sentences[index])
        }

    def __len__(self):
        """Number of samples in this split."""
        return len(self.features)

    def nclass(self):
        """Number of label classes (length of one label vector)."""
        return self.labels[0].shape[0]
-
-
- """
- def dependency_adj_matrix(text, max_len):
- # https://spacy.io/docs/usage/processing-text
- document = nlp(text)
- seq_len = len(text.split())
- matrix = np.zeros((max_len, max_len)).astype('float32')
-
- for token in document:
- if token.i < seq_len:
- matrix[token.i][token.i] = 1
- # https://spacy.io/docs/api/token
- for child in token.children:
- if child.i < seq_len:
- matrix[token.i][child.i] = 1
- matrix[child.i][token.i] = 1
-
- return matrix
- """
|