|
- # dataset name: XYGraphP1_no_valid
-
- import pickle
-
- from sklearn.preprocessing import MinMaxScaler
- from utils.dgraphfin import DGraphFin
- from utils.dgraphfin2 import GEARDGraphFin
- from utils.utils import prepare_folder
- from utils.evaluator import Evaluator
- from torch_geometric.data import NeighborSampler
- from models import SAGE_NeighSampler, GAT_NeighSampler, GATv2_NeighSampler
- from tqdm import tqdm
-
- import argparse
-
- import torch
- import torch.nn.functional as F
- import torch.nn as nn
-
- import torch_geometric.transforms as T
- from utils.data_process import data_process
- from torch.nn import Linear
- from torch_geometric.nn import GCNConv,SAGEConv
-
import os

# Best-effort: copy the mounted competition dataset into the folder the
# DGraphFin loader expects. os.system does not raise when the command fails --
# it returns the shell exit status -- so the original bare `except:` could
# never fire; check the return code explicitly instead.
if os.system("cp /dataset/* -r /code/dataset/DGraphFin/raw") != 0:
    print("拷贝错误")  # "copy error" (kept verbatim; best-effort, not fatal)

# Metric the Evaluator reports for every split.
eval_metric = 'auc'

# Hyper-parameters of the pretrained GraphSAGE (neighbour-sampling) encoder;
# 'lr' and 'l2' are optimiser settings and are popped before model creation.
sage_neighsampler_parameters = {
    'lr': 0.003,
    'num_layers': 2,
    'hidden_channels': 128,
    'dropout': 0.0,
    'batchnorm': False,
    'l2': 5e-7,
}
# MLP classifier head (inference/finetune over concatenated node features).
class MLP(torch.nn.Module):
    """Two-layer perceptron: Linear -> BatchNorm -> ReLU -> Dropout -> Linear.

    The forward pass returns per-class log-probabilities.
    """

    def __init__(self, in_channels, nlabels, hidden_channels=256, dropout=0.3):
        super(MLP, self).__init__()
        # Fixed seed so weight initialisation is reproducible across runs.
        torch.manual_seed(1024)
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, nlabels)
        self.bns = torch.nn.BatchNorm1d(hidden_channels)
        self.dropout = dropout

    def forward(self, x):
        """Return log-probabilities of shape (N, nlabels)."""
        hidden = F.dropout(
            F.relu(self.bns(self.lin1(x))),
            p=self.dropout,
            training=self.training,
        )
        return self.lin2(hidden).log_softmax(dim=-1)
-
-
def MLP_train(model, data, train_idx, test_idx, split_idx):
    """Train an MLP on ``data.x`` / ``data.y`` and log AUC every 10 epochs.

    Args:
        model: MLP whose forward returns log-probabilities over data.x.
        data: data object providing .x (features) and .y (labels, shape (N,)).
        train_idx: node indices optimised on.
        test_idx: unused; kept for signature compatibility (the test split is
            taken from ``split_idx`` instead).
        split_idx: dict with 'train' / 'valid' / 'test' node index tensors.
    """
    # The original also built a CrossEntropyLoss that was never used; the
    # actual objective is nll_loss on the model's log-softmax output.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=5e-4)

    def train_epoch():
        """One optimisation step over the training nodes; returns the loss."""
        model.train()
        optimizer.zero_grad()
        out = model(data.x)[train_idx]
        loss = F.nll_loss(out, data.y[train_idx])  # model outputs log-probs
        loss.backward()
        optimizer.step()
        return loss.item()

    print("MLP测试")

    @torch.no_grad()
    def evaluate(evaluator):
        """Return (eval_results, losses, y_pred) over all three splits."""
        model.eval()
        out = model(data.x)
        y_pred = out.exp()  # (N, num_classes) probabilities
        losses, eval_results = dict(), dict()
        for key in ['train', 'valid', 'test']:
            node_id = split_idx[key]
            losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item()
            eval_results[key] = evaluator.eval(data.y[node_id], y_pred[node_id])[eval_metric]
        return eval_results, losses, y_pred

    evaluator = Evaluator(eval_metric)
    for epoch in range(1, 201):
        loss = train_epoch()
        # Evaluate only on logging epochs: the original ran a full evaluation
        # every epoch (and passed the model object as a `model_name` argument)
        # but discarded the result 9 times out of 10.
        if epoch % 10 == 0:
            eval_results, _, _ = evaluate(evaluator)
            print(
                f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {eval_results["train"]:.3f}, '
                f'Valid: {eval_results["valid"]:.3f} '
                f'Test: {eval_results["test"]:.3f}')
-
# GCN / GraphSAGE stack used to finetune on the enriched node features.
class GNN(torch.nn.Module):
    """Stack of ``num_layers`` graph convolutions (GCNConv or SAGEConv).

    Every non-final layer is followed by BatchNorm -> ReLU -> Dropout; the
    final convolution maps to ``nlabels`` and forward returns log-probs.
    """

    def __init__(self, in_channels, hidden_channels, nlabels, num_layers,
                 dropout, conv_type):
        super(GNN, self).__init__()

        self.convs = torch.nn.ModuleList()
        self.bns = torch.nn.ModuleList()

        for layer in range(num_layers):
            # Same branch priority as before: the first layer wins when
            # num_layers == 1 (it then maps in_channels -> hidden_channels).
            if layer == 0:
                dims = (in_channels, hidden_channels)
            elif layer == num_layers - 1:
                dims = (hidden_channels, nlabels)
            else:
                dims = (hidden_channels, hidden_channels)

            if conv_type == 'gcn':
                self.convs.append(GCNConv(*dims, cached=True))
            elif conv_type == 'sage':
                self.convs.append(SAGEConv(*dims))

            # One BatchNorm per non-final layer.
            if layer != num_layers - 1:
                self.bns.append(torch.nn.BatchNorm1d(hidden_channels))

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution and batch-norm layer."""
        for module in list(self.convs) + list(self.bns):
            module.reset_parameters()

    def forward(self, x, adj_t):
        """Return log-probabilities of shape (N, nlabels)."""
        for bn, conv in zip(self.bns, self.convs[:-1]):
            x = conv(x, adj_t)
            x = F.dropout(F.relu(bn(x)), p=self.dropout, training=self.training)
        return self.convs[-1](x, adj_t).log_softmax(dim=-1)
-
def GCN_train(model, data, train_idx, test_idx, split_idx):
    """Train a graph model on ``(data.x, data.adj_t)``; log AUC every 10 epochs.

    Args mirror ``MLP_train``: ``test_idx`` is unused (the test split comes
    from ``split_idx``) and is kept only for signature compatibility.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    def train_epoch():
        """One optimisation step over the training nodes; returns the loss."""
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.adj_t)[train_idx]
        loss = F.nll_loss(out, data.y[train_idx])  # model outputs log-probs
        loss.backward()
        optimizer.step()
        return loss.item()

    print("GCN测试")

    @torch.no_grad()
    def evaluate(evaluator):
        """Return (eval_results, losses, y_pred) over all three splits."""
        model.eval()
        out = model(data.x, data.adj_t)
        y_pred = out.exp()  # (N, num_classes) probabilities
        losses, eval_results = dict(), dict()
        for key in ['train', 'valid', 'test']:
            node_id = split_idx[key]
            losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item()
            eval_results[key] = evaluator.eval(data.y[node_id], y_pred[node_id])[eval_metric]
        return eval_results, losses, y_pred

    evaluator = Evaluator(eval_metric)
    for epoch in range(1, 301):
        loss = train_epoch()
        # Evaluate only on logging epochs: the original ran a full-graph
        # evaluation every epoch (and passed the model object as a
        # `model_name` argument) but discarded the result 9 times out of 10.
        if epoch % 10 == 0:
            eval_results, _, _ = evaluate(evaluator)
            print(
                f'Epoch: {epoch:02d}, '
                f'Loss: {loss:.4f}, '
                f'Train: {eval_results["train"]:.3f}, '
                f'Valid: {eval_results["valid"]:.3f} '
                f'Test: {eval_results["test"]:.3f}')
-
@torch.no_grad()
def to_embedding(layer_loader, model, data, device, no_conv=False):
    """Embed every node of the full graph with the trained encoder.

    Delegates to ``model.to_embedding`` with the full feature matrix and the
    (whole-graph) neighbour loader; runs in eval mode without gradients.
    Returns the embedding tensor produced by the model.
    """
    model.eval()
    embeddings = model.to_embedding(data.x, layer_loader, device)
    print("Model embedding data : ", embeddings.shape)
    return embeddings
-
-
def read_data():
    """Load the DGraphFin dataset and run the project's pre-processing.

    Returns the first (and only) graph of the dataset with labels flattened
    to shape (N,), a symmetrised sparse adjacency, and ``data_process``
    applied.
    """
    dataset = GEARDGraphFin(root='/code/dataset', name="DGraphFin",
                            transform=T.ToSparseTensor())
    graph = dataset[0]
    print(graph)

    # Labels may arrive as (N, 1); flatten to (N,).
    if graph.y.dim() == 2:
        graph.y = graph.y.squeeze(1)

    # Downstream models assume an undirected graph.
    graph.adj_t = graph.adj_t.to_symmetric()

    graph = data_process(graph)
    print(graph)
    return graph
-
def data_norm(data):
    """Min-max scale every feature column of ``data.x`` into [0, 1].

    Equivalent to sklearn's ``MinMaxScaler(feature_range=(0, 1))`` but
    computed directly in torch, which avoids the tensor -> numpy -> tensor
    round trip (and the hard sklearn dependency). Constant columns are
    mapped to 0, matching MinMaxScaler's zero-range handling.

    Args:
        data: object with a ``.x`` tensor of shape (N, F).
    Returns:
        The same ``data`` object with ``data.x`` replaced by the scaled
        float tensor.
    """
    x = data.x.float()
    col_min = x.min(dim=0).values
    col_range = x.max(dim=0).values - col_min
    # Avoid division by zero for constant features (sklearn does the same:
    # a zero range is treated as 1, so the column scales to 0).
    col_range = torch.where(col_range == 0, torch.ones_like(col_range), col_range)
    data.x = (x - col_min) / col_range
    return data
-
def main():
    """End-to-end pipeline.

    1. Parse CLI args and pick the device.
    2. Load + normalise the DGraphFin graph.
    3. Restore the pretrained GraphSAGE encoder and embed every node.
    4. Concatenate embeddings with the raw features.
    5. Finetune a SAGE-based GNN classifier on the enriched features.
    """
    parser = argparse.ArgumentParser(description='minibatch_gnn_models')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='DGraphFin')
    parser.add_argument('--log_steps', type=int, default=10)
    parser.add_argument('--model', type=str, default='sage_neighsampler')
    parser.add_argument('--epochs', type=int, default=100)

    args = parser.parse_args()
    print(args)

    no_conv = False
    if args.model in ['mlp']:
        no_conv = True

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    data = read_data()
    data = data_norm(data).to(device)
    nlabels = 2  # binary fraud label

    split_idx = {'train': data.train_mask,
                 'valid': data.valid_mask, 'test': data.test_mask}
    train_idx = split_idx['train'].to(device)
    test_idx = split_idx['test'].to(device)

    # node_idx=None -> sample the whole graph; sizes=[-1] -> all neighbours.
    layer_loader = NeighborSampler(
        data.adj_t, node_idx=None, sizes=[-1], batch_size=4096, shuffle=False, num_workers=0)

    if args.model == 'sage_neighsampler':
        model_para = sage_neighsampler_parameters.copy()
        model_para.pop('lr')   # optimiser settings, not model kwargs
        model_para.pop('l2')
        model = SAGE_NeighSampler(
            in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)
    else:
        # BUG FIX: an unrecognised --model previously left `model` undefined
        # and crashed later with a confusing NameError.
        raise ValueError(f'unsupported --model: {args.model!r}')

    print(f'Model {args.model} initialized')

    model_file = "/code/model_files/model.pt"
    print('model_file:', model_file)
    # map_location so a GPU-trained checkpoint also loads on CPU-only hosts.
    model.load_state_dict(torch.load(model_file, map_location=device))

    out = to_embedding(layer_loader, model, data, device, no_conv)

    # Concatenate the learned embeddings with the raw node features.
    print(out.shape)
    print(data.x.shape)
    data.x = torch.cat((out, data.x), dim=1)
    print(data.x.shape)
    if data.y.dim() == 2:
        data.y = data.y.squeeze(1)
    print(data)

    # Finetune a 3-layer SAGE classifier on the enriched features.
    in_channels = data.x.shape[1]
    model = GNN(in_channels, 128, nlabels, 3, 0, "sage").to(device)
    GCN_train(model, data, train_idx, test_idx, split_idx)


if __name__ == "__main__":
    main()
|