|
- import pickle as pkl
- import numpy as np
- import torch
- import sys
- from tqdm import tqdm
- from embed import PositionalEncoder
- from torch.utils.data import Dataset
-
-
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix: D^{-1/2} A^T D^{-1/2}.

    Rows with zero degree would produce infinite scaling factors; those
    factors are zeroed so isolated nodes stay all-zero in the result.
    """
    degrees = np.array(adj.sum(1)).flatten()
    with np.errstate(divide='ignore'):
        # 0 ** -0.5 -> inf; the divide warning is deliberately silenced.
        inv_sqrt_deg = np.power(degrees, -0.5)
    inv_sqrt_deg[np.isinf(inv_sqrt_deg)] = 0.
    d_inv_sqrt = np.diag(inv_sqrt_deg)
    return adj.dot(d_inv_sqrt).transpose().dot(d_inv_sqrt)
-
-
def normalize_mean_std(adj):
    """Min-max scale `adj` into [0, 1].

    NOTE(review): despite the name, this performs min-max scaling, not
    mean/std standardization; the name is kept for caller compatibility.
    If all entries are equal the denominator is zero and NumPy will emit
    nan/inf with a runtime warning (original behavior, preserved).
    """
    # Fix: the original bound the builtins `min` and `max` as locals.
    lo = np.amin(adj)
    hi = np.amax(adj)
    return (adj - lo) / (hi - lo)
-
-
def preprocess_adj(adj):
    """Normalize and zero-pad every adjacency matrix to a common size.

    `adj` is an object array of per-graph square matrices; it is mutated
    in place (each entry is replaced by its padded, normalized version).
    Returns the stacked dense adjacency tensor and a (N, max_nodes, 1)
    mask marking real (non-padding) nodes.
    """
    n_graphs = adj.shape[0]
    max_nodes = max(a.shape[0] for a in adj)
    mask = np.zeros((n_graphs, max_nodes, 1))  # 1.0 where a node exists

    for idx in tqdm(range(n_graphs)):
        # Capture the true node count before adj[idx] is overwritten.
        n_nodes = adj[idx].shape[0]
        normalized = normalize_adj(adj[idx])
        tail = max_nodes - n_nodes
        mask[idx, :n_nodes, :] = 1.
        adj[idx] = np.pad(normalized, ((0, tail), (0, tail)), mode='constant')

    return np.array(list(adj)), mask  # coo_to_tuple(sparse.COO(np.array(list(adj)))), mask
-
-
def preprocess_features(features, device):
    """Zero-pad per-sample embeddings to a common length and add positional encodings.

    Each entry of `features` is assumed to be a (seq_len, 300) embedding
    matrix — TODO confirm the 300-dim assumption against the data files.
    Returns a single (N, max_len, 300) tensor on `device`.
    """
    max_len = max(len(f) for f in features)
    n_samples = features.shape[0]
    padded = np.zeros([n_samples, max_len, 300])
    encoder = PositionalEncoder(300, max_seq_len=max_len)

    for idx in tqdm(range(n_samples)):
        emb = np.array(features[idx])
        tail = max_len - emb.shape[0]  # rows of zero padding for this sample
        padded[idx, :, :] = np.pad(emb, ((0, tail), (0, 0)), mode='constant')

    batch = torch.from_numpy(padded)
    return encoder(batch.to(device), device)
-
-
def accuracy(output, labels):
    """Fraction of rows in `output` whose argmax equals the label.

    `output` is (N, C) scores; `labels` is a length-N class-index tensor.
    Returns a 0-dim double tensor in [0, 1].
    """
    predicted = output.max(dim=1).indices.type_as(labels)
    n_correct = predicted.eq(labels).double().sum()
    return n_correct / len(labels)
-
-
class DatasetLoader(Dataset):
    """Dataset over pickled graph samples stored under data/<name>/.

    The "train" split is the first half of the samples; any other
    `set_name` selects the second half.  Each sample yields node
    embeddings, a dependency adjacency matrix, a polarity label vector,
    and token ids for the sentence.
    """

    def __init__(self, dataset_name, set_name):
        # NOTE(review): pickle.load is unsafe on untrusted files; these are
        # assumed to be locally generated artifacts.
        self.vocab = self._load(dataset_name, "vocab")
        feats = self._load(dataset_name, "features")
        half = int(0.5 * len(feats))
        if set_name == "train":
            self.features = feats[:half]
            # Companion files: take the first len(self.features) entries.
            section = slice(None, len(self.features))
        else:
            self.features = feats[half:]
            # Companion files: take the last len(self.features) entries
            # (matches the original code's negative slicing, including the
            # empty-split edge where -0 selects the whole list).
            section = slice(-len(self.features), None)
        self.adjs = self._load(dataset_name, "adj")[section]
        self.labels = self._load(dataset_name, "label")[section]
        self.sentences = self._load(dataset_name, "sentence")[section]
        # Node count of the first sample — presumably uniform across the
        # split; TODO confirm against the data generation code.
        self.nnodes = self.features[0].shape[0]

    @staticmethod
    def _load(dataset_name, suffix):
        """Unpickle and return data/<name>/<name>.all.<suffix>."""
        path = "data/{}/{}.all.{}".format(dataset_name, dataset_name, suffix)
        with open(path, 'rb') as f:
            return pkl.load(f)

    def __getitem__(self, index):
        """Return sample `index` as a dict of tensors."""
        # adj = normalize_adj(adj)
        return {
            'node_embeddings': torch.FloatTensor(self.features[index]),
            'dependency_graph': torch.FloatTensor(self.adjs[index]),
            'polarity': torch.FloatTensor(self.labels[index]),
            'sentence': torch.IntTensor(self.sentences[index])
        }

    def __len__(self):
        """Number of samples in this split."""
        return len(self.features)

    def nclass(self):
        """Number of label classes (length of one label vector)."""
        return self.labels[0].shape[0]
-
-
- """
- def dependency_adj_matrix(text, max_len):
- # https://spacy.io/docs/usage/processing-text
- document = nlp(text)
- seq_len = len(text.split())
- matrix = np.zeros((max_len, max_len)).astype('float32')
-
- for token in document:
- if token.i < seq_len:
- matrix[token.i][token.i] = 1
- # https://spacy.io/docs/api/token
- for child in token.children:
- if child.i < seq_len:
- matrix[token.i][child.i] = 1
- matrix[child.i][token.i] = 1
-
- return matrix
- """
|