# - Add the per-node index attribute n_id
# - Convert edge_index to torch.long
# - Record the dataset split information

import argparse
import json
import os
import os.path as osp
import pickle as pkl

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

from sklearn.metrics import average_precision_score
from torch_geometric.data import HeteroData
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import RGCNConv

# Totals for the ICDM 2022 session-1 dataset; used only to size progress bars.
edge_size = 157814864
node_size = 13806619


def read_node_atts(node_file, pyg_file, label_file=None):
    node_maps = {}
    node_embeds = {}
    count = 0
    lack_num = {}
    node_counts = node_size
    if not osp.exists(pyg_file + ".nodes.pyg"):
        process = tqdm(total=node_counts, desc="Generating " + pyg_file + ".nodes.pyg")
        with open(node_file, 'r') as rf:
            while True:
                line = rf.readline()
                if not line:
                    break
                info = line.strip().split(",")

                node_id = int(info[0])
                node_type = info[1].strip()

                # Re-index the nodes of each type consecutively from 0.
                node_maps.setdefault(node_type, {})
                node_id_v2 = len(node_maps[node_type])
                node_maps[node_type][node_id] = node_id_v2

                node_embeds.setdefault(node_type, {})
                lack_num.setdefault(node_type, 0)
                # TODO: missing features could be initialized differently per
                # node_type; for now every type falls back to an all-zero vector.
                if len(info[2]) < 50:
                    node_embeds[node_type][node_id_v2] = np.zeros(256, dtype=np.float32)
                    lack_num[node_type] += 1
                else:
                    node_embeds[node_type][node_id_v2] = np.array(info[2].split(":"), dtype=np.float32)

                count += 1
                if count % 100000 == 0:
                    process.update(100000)

        process.close()

        print("Num of total nodes:", count)
        print('Node_types:', node_maps.keys())
        print('Node_type Total_Num Lack_Num:')
        for node_type in node_maps:
            print(node_type, len(node_maps[node_type]), lack_num[node_type])

        labels = []
        if label_file is not None:
            labels_info = [x.strip().split(",") for x in open(label_file).readlines()]
            for x in labels_info:
                item_id = node_maps['item'][int(x[0])]
                label = int(x[1])
                labels.append([item_id, label])

        nodes_dict = {'maps': node_maps, 'embeds': node_embeds}
        nodes_dict['labels'] = {}
        nodes_dict['labels']['item'] = labels
        print('Start saving pkl-style node information\n')
        pkl.dump(nodes_dict, open(pyg_file + ".nodes.pyg", 'wb'), pkl.HIGHEST_PROTOCOL)
        print('Complete saving pkl-style node information\n')

    else:
        nodes = pkl.load(open(pyg_file + ".nodes.pyg", 'rb'))
        node_embeds = nodes['embeds']
        node_maps = nodes['maps']
        labels = nodes['labels']['item']

    # Store the node features in a PyG heterogeneous graph.
    graph = HeteroData()

    print("Start converting into pyg data")
    # 1. Convert node features.
    for node_type in tqdm(node_embeds, desc="Node features, numbers and mapping", ascii=True):
        graph[node_type].x = torch.empty(len(node_maps[node_type]), 256)
        for nid, embedding in tqdm(node_embeds[node_type].items()):
            graph[node_type].x[nid] = torch.from_numpy(embedding)
        graph[node_type].num_nodes = len(node_maps[node_type])
        graph[node_type].maps = node_maps[node_type]

    if label_file is not None:
        # 2. Convert labels; unlabeled items keep -1.
        graph['item'].y = torch.zeros(len(node_maps['item']), dtype=torch.long) - 1
        for index, label in tqdm(labels, desc="Node labels", ascii=True):
            graph['item'].y[index] = label

        # 3. Split the dataset.
        # If the split is not needed, just comment this block out; don't delete it.
        # Indices of the labeled nodes:
        indices = (graph['item'].y != -1).nonzero().squeeze()
        print("Num of true labeled nodes:{}".format(indices.shape[0]))
        # 80/20 random train/validation split:
        train_val_random = torch.randperm(indices.shape[0])
        train_idx = indices[train_val_random][:int(indices.shape[0] * 0.8)]
        val_idx = indices[train_val_random][int(indices.shape[0] * 0.8):]
        print("train_idx:{}".format(train_idx.numpy()))
        print("val_idx:{}".format(val_idx.numpy()))
        # Attach the split to the 'item' node type.
        graph['item'].train_idx = train_idx
        graph['item'].val_idx = val_idx

    # Add the global index attribute n_id to every node type.
    for ntype in graph.node_types:
        graph[ntype].n_id = torch.arange(graph[ntype].num_nodes)
    print("Complete converting into pyg data")

    print("Start saving into pyg data")
    torch.save(graph, pyg_file + ".pt")
    print("Complete saving into pyg data")
    return graph
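

# A hedged illustration (not called anywhere) of the node CSV line format that
# read_node_atts() assumes: "node_id,node_type,feat_0:feat_1:...:feat_255".
# The concrete values below are made up for demonstration only.
def _demo_parse_node_line():
    line = "42,item," + ":".join(["0.1"] * 256)
    info = line.strip().split(",")
    node_id, node_type = int(info[0]), info[1].strip()
    embedding = np.array(info[2].split(":"), dtype=np.float32)  # shape: (256,)
    return node_id, node_type, embedding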


def format_pyg_graph(edge_file, node_file, pyg_file, label_file=None):
    if osp.exists(pyg_file + ".pt") and not args.reload:
        graph = torch.load(pyg_file + ".pt")
    else:
        graph = read_node_atts(node_file, pyg_file, label_file)

    process = tqdm(total=edge_size, desc="Reading edge information")

    edges = {}
    count = 0
    with open(edge_file, 'r') as rf:
        while True:
            line = rf.readline()
            if not line:
                break
            line_info = line.strip().split(",")
            source_id, dest_id, source_type, dest_type, edge_type = line_info
            # Map raw ids to the per-type consecutive ids built in read_node_atts().
            source_id = graph[source_type].maps[int(source_id)]
            dest_id = graph[dest_type].maps[int(dest_id)]
            edges.setdefault(edge_type, {})
            edges[edge_type].setdefault('source', []).append(source_id)
            edges[edge_type].setdefault('dest', []).append(dest_id)
            edges[edge_type].setdefault('source_type', source_type)
            edges[edge_type].setdefault('dest_type', dest_type)
            count += 1
            if count % 100000 == 0:
                process.update(100000)
    process.close()
    print('Complete reading edge information\n')

    print('Start converting edge information\n')
    for edge_type in edges:
        source_type = edges[edge_type]['source_type']
        dest_type = edges[edge_type]['dest_type']
        # Store edge_index as torch.long, the dtype PyG expects.
        source = torch.tensor(edges[edge_type]['source'], dtype=torch.long)
        dest = torch.tensor(edges[edge_type]['dest'], dtype=torch.long)
        graph[(source_type, edge_type, dest_type)].edge_index = torch.vstack([source, dest])

    # Reorder the edge types. PyG usually handles a heterogeneous graph by
    # converting it to a homogeneous one and telling edges apart via the
    # edge_type attribute, so every graph should store its edge types in one
    # fixed, consistent order.
    for edge_type in [('b', 'A_1', 'item'),
                      ('f', 'B', 'item'),
                      ('a', 'G_1', 'f'),
                      ('f', 'G', 'a'),
                      ('a', 'H_1', 'e'),
                      ('f', 'C', 'd'),
                      ('f', 'D', 'c'),
                      ('c', 'D_1', 'f'),
                      ('f', 'F', 'e'),
                      ('item', 'B_1', 'f'),
                      ('item', 'A', 'b'),
                      ('e', 'F_1', 'f'),
                      ('e', 'H', 'a'),
                      ('d', 'C_1', 'f')]:
        temp = graph[edge_type].edge_index
        del graph[edge_type]
        graph[edge_type].edge_index = temp

    print('Complete converting edge information\n')
    print('Start saving into pyg data\n')
    torch.save(graph, pyg_file + ".pt", _use_new_zipfile_serialization=False)
    print('Complete saving into pyg data\n')
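

# A small sketch (not called anywhere) of loading and inspecting the saved
# HeteroData file; the path below is just this script's default output and
# should be adjusted to your own setup.
def _demo_inspect_graph(path="/tmp/output/.pt"):
    g = torch.load(path)
    print(g.node_types)  # e.g. ['item', 'b', 'f', ...]
    print(g.edge_types)  # (source_type, edge_type, dest_type) triples
    for et in g.edge_types:
        print(et, g[et].edge_index.shape)  # each edge_index is [2, num_edges]
    return g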


class RGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_relations, num_bases, n_layers=3):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(RGCNConv(in_channels, hidden_channels, num_relations, num_bases=num_bases))
        for i in range(n_layers - 2):
            self.convs.append(RGCNConv(hidden_channels, hidden_channels, num_relations, num_bases=num_bases))
        self.convs.append(RGCNConv(hidden_channels, out_channels, num_relations, num_bases=num_bases))

    def forward(self, x, edge_index, edge_type):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index, edge_type)
            # ReLU + dropout on every layer except the last.
            if i < len(self.convs) - 1:
                x = x.relu_()
                x = F.dropout(x, p=0.4, training=self.training)
        return x
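

# A minimal, self-contained sketch (not called anywhere) of the inputs the
# RGCN above expects once a batch has been made homogeneous: node features x,
# an edge_index of shape [2, E], and one relation id per edge. All sizes here
# are toy values, not the real dataset's.
def _demo_rgcn_forward():
    toy = RGCN(in_channels=256, hidden_channels=64, out_channels=2,
               num_relations=14, num_bases=8, n_layers=2)
    x = torch.randn(10, 256)                    # 10 toy nodes
    edge_index = torch.randint(0, 10, (2, 40))  # 40 random toy edges
    edge_type = torch.randint(0, 14, (40,))     # a relation id per edge
    return toy(x, edge_index, edge_type)        # -> [10, 2] class logits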


def train(epoch):
    model.train()

    pbar = tqdm(total=int(len(train_loader.dataset)), ascii=True)
    pbar.set_description(f'Epoch {epoch:02d}')

    total_loss = total_correct = total_examples = 0
    y_pred = []
    y_true = []
    for batch in train_loader:
        optimizer.zero_grad()
        batch_size = batch[labeled_class].batch_size
        y = batch[labeled_class].y[:batch_size].to(device)

        # Find the row offset of the seed nodes: the 'item' type is not stored
        # first, so after to_homogeneous() the seed nodes do not start at row 0
        # but after all node types listed before 'item'.
        start = 0
        for ntype in batch.node_types:
            if ntype == labeled_class:
                break
            start += batch[ntype].num_nodes

        batch = batch.to_homogeneous()

        y_hat = model(batch.x.to(device), batch.edge_index.to(device), batch.edge_type.to(device))[
            start:start + batch_size]
        # Fixed class weights for the imbalanced labels.
        weights = torch.FloatTensor([5, 1]).to(device)
        loss = F.cross_entropy(y_hat, y, weight=weights)
        loss.backward()
        optimizer.step()
        y_pred.append(F.softmax(y_hat, dim=1)[:, 1].detach().cpu())
        y_true.append(y.cpu())
        total_loss += float(loss) * batch_size
        total_correct += int((y_hat.argmax(dim=-1) == y).sum())
        total_examples += batch_size
        pbar.update(batch_size)
    pbar.close()
    ap_score = average_precision_score(torch.hstack(y_true).numpy(), torch.hstack(y_pred).numpy())

    return total_loss / total_examples, total_correct / total_examples, ap_score


@torch.no_grad()
def val():
    model.eval()
    pbar = tqdm(total=int(len(val_loader.dataset)), ascii=True)
    pbar.set_description(f'Epoch {epoch:02d}')
    total_loss = total_correct = total_examples = 0
    y_pred = []
    y_true = []
    for batch in val_loader:
        batch_size = batch[labeled_class].batch_size
        y = batch[labeled_class].y[:batch_size].to(device)
        # Same seed-node offset as in train(): after to_homogeneous() the
        # 'item' rows start after all node types stored before them.
        start = 0
        for ntype in batch.node_types:
            if ntype == labeled_class:
                break
            start += batch[ntype].num_nodes

        batch = batch.to_homogeneous()

        y_hat = model(batch.x.to(device), batch.edge_index.to(device), batch.edge_type.to(device))[
            start:start + batch_size]
        weights = torch.FloatTensor([5, 1]).to(device)
        loss = F.cross_entropy(y_hat, y, weight=weights)
        # loss = F.cross_entropy(y_hat, y)  # unweighted alternative
        y_pred.append(F.softmax(y_hat, dim=1)[:, 1].detach().cpu())
        y_true.append(y.cpu())
        total_loss += float(loss) * batch_size
        total_correct += int((y_hat.argmax(dim=-1) == y).sum())
        total_examples += batch_size
        pbar.update(batch_size)
    pbar.close()
    ap_score = average_precision_score(torch.hstack(y_true).numpy(), torch.hstack(y_pred).numpy())

    return total_loss / total_examples, total_correct / total_examples, ap_score


@torch.no_grad()
def test():
    model.eval()
    pbar = tqdm(total=int(len(test_loader.dataset)), ascii=True)
    pbar.set_description('Generate Final Result')
    y_pred = []
    for batch in test_loader:
        batch_size = batch[labeled_class].batch_size
        # Same seed-node offset as in train() and val().
        start = 0
        for ntype in batch.node_types:
            if ntype == labeled_class:
                break
            start += batch[ntype].num_nodes

        # Convert to a homogeneous graph.
        batch = batch.to_homogeneous()
        y_hat = model(batch.x.to(device), batch.edge_index.to(device), batch.edge_type.to(device))[
            start:start + batch_size]
        pbar.update(batch_size)
        y_pred.append(F.softmax(y_hat, dim=1)[:, 1].detach().cpu())
    pbar.close()

    return torch.hstack(y_pred)
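

# Sketch (not called anywhere) of why train()/val()/test() compute `start`:
# HeteroData.to_homogeneous() concatenates node types in the order of
# data.node_types, so the seed type's rows begin at the sum of the node counts
# of every type stored before it. The toy types and sizes below are made up.
def _demo_seed_offset():
    toy = HeteroData()
    toy['b'].x = torch.randn(3, 4)     # 3 nodes of type 'b' come first
    toy['item'].x = torch.randn(2, 4)  # the 2 'item' nodes follow
    start = 0
    for ntype in toy.node_types:
        if ntype == 'item':
            break
        start += toy[ntype].num_nodes
    homo = toy.to_homogeneous()
    return homo.x[start:start + 2]     # exactly the two 'item' rows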


if __name__ == "__main__":
    # Preprocessing arguments. parse_args(args=[]) deliberately ignores the
    # command line here, so these paths always come from the defaults below.
    parser = argparse.ArgumentParser()
    parser.add_argument('--graph', type=str, default='/tmp/dataset/icdm2022_session1_edges.csv')
    parser.add_argument('--node', type=str, default='/tmp/dataset/icdm2022_session1_nodes.csv')
    parser.add_argument('--label', type=str, default='/tmp/dataset/icdm2022_session1_train_labels.csv')
    parser.add_argument('--storefile', type=str, default='/tmp/output/')
    parser.add_argument('--reload', action='store_true', help="Whether node features should be reloaded")
    args = parser.parse_args(args=[])
    if args.graph is not None and args.storefile is not None and args.node is not None:
        format_pyg_graph(args.graph, args.node, args.storefile, args.label)
        # read_node_atts(args.node, args.storefile, args.label)

    # Training / inference arguments; these are read from the real command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='/tmp/output/.pt')
    parser.add_argument('--labeled-class', type=str, default='item')
    parser.add_argument("--batch-size", type=int, default=128,
                        help="Mini-batch size. If -1, use full graph training.")
    parser.add_argument("--fanout", type=int, default=-1,
                        help="Fan-out of neighbor sampling (-1 samples all neighbors).")
    parser.add_argument("--n-layers", type=int, default=2,
                        help="number of propagation rounds")
    parser.add_argument("--h-dim", type=int, default=64,
                        help="number of hidden units")
    parser.add_argument("--in-dim", type=int, default=256,
                        help="input feature dimension")
    parser.add_argument("--n-bases", type=int, default=8,
                        help="number of basis weight matrices for RGCNConv")
    parser.add_argument("--validation", action='store_true')
    parser.add_argument("--early_stopping", type=int, default=10)
    parser.add_argument("--n-epoch", type=int, default=15)
    parser.add_argument("--test-file", type=str, default="/tmp/dataset/icdm2022_session1_test_ids.txt")
    parser.add_argument("--json-file", type=str, default="/tmp/output/pyg_pred.json")
    parser.add_argument("--inference", action='store_true')
    parser.add_argument("--record-file", type=str, default="record.txt")
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--model-id", type=int, default=0)
    parser.add_argument("--device-id", type=str, default="0")
    parser.add_argument("--load-model", type=int, default=0)

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device_id
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    hgraph = torch.load(args.dataset)

    labeled_class = args.labeled_class

    if not args.inference:
        train_idx = hgraph[labeled_class].pop('train_idx')
        if args.validation:
            val_idx = hgraph[labeled_class].pop('val_idx')

    test_id = [int(x) for x in open(args.test_file).readlines()]
    converted_test_id = [hgraph['item'].maps[i] for i in test_id]
    test_idx = torch.LongTensor(converted_test_id)

    # Mini-batch neighbor-sampling loaders.
    if not args.inference:
        train_loader = NeighborLoader(hgraph, input_nodes=(labeled_class, train_idx),
                                      num_neighbors=[args.fanout] * args.n_layers,
                                      shuffle=True, batch_size=args.batch_size)

        if args.validation:
            val_loader = NeighborLoader(hgraph, input_nodes=(labeled_class, val_idx),
                                        num_neighbors=[args.fanout] * args.n_layers,
                                        shuffle=False, batch_size=args.batch_size)

    test_loader = NeighborLoader(hgraph, input_nodes=(labeled_class, test_idx),
                                 num_neighbors=[args.fanout] * args.n_layers,
                                 shuffle=False, batch_size=args.batch_size)

    num_relations = len(hgraph.edge_types)

    if args.inference:
        model = torch.load(osp.join('icdm_graph_competition/pyg_examples/best_model', str(args.load_model) + ".pth"))
    else:
        model = RGCN(in_channels=args.in_dim, hidden_channels=args.h_dim, out_channels=2,
                     num_relations=num_relations, num_bases=args.n_bases,
                     n_layers=args.n_layers).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if not args.inference:
        print("Start training")
        val_ap_list = []
        ave_val_ap = 0
        end = 0
        for epoch in range(1, args.n_epoch + 1):
            train_loss, train_acc, train_ap = train(epoch)
            print(f'Train: Epoch {epoch:02d}, Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, AP_Score: {train_ap:.4f}')
            if args.validation and epoch >= args.early_stopping:
                val_loss, val_acc, val_ap = val()
                print(f'Val: Epoch: {epoch:02d}, Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, AP_Score: {val_ap:.4f}')
                # Stop once the validation AP drops below its running average.
                if val_ap <= ave_val_ap:
                    print("Early Stopping")
                    break
                torch.save(model, osp.join("/tmp/model/bestmodel", str(args.model_id) + ".pth"))
                val_ap_list.append(float(val_ap))
                ave_val_ap = np.average(val_ap_list)
                end = epoch
        # with open(args.record_file, 'a+') as f:
        #     f.write(
        #         f"{args.model_id:2d} {args.h_dim:3d} {args.n_layers:2d} {args.lr:.4f} {end:02d} {float(val_ap_list[-1]):.4f} {np.argmax(val_ap_list) + 5:02d} {float(np.max(val_ap_list)):.4f}\n")

    # Generate predictions, whether the model was just trained or was loaded
    # via --inference.
    y_pred = test()
    # Write one JSON record per test item.
    with open(args.json_file, 'w+') as f:
        for i in range(len(test_id)):
            y_dict = {}
            y_dict["item_id"] = int(test_id[i])
            y_dict["score"] = float(y_pred[i])
            json.dump(y_dict, f)
            f.write('\n')
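
    # Example invocation (the script filename is a placeholder; flags match
    # the parser above):
    #   python rgcn_icdm_baseline.py --dataset /tmp/output/.pt --n-epoch 15 --validation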