Browse Source

上传文件至 '1_all_code'

master
Katherine1216 2 weeks ago
parent
commit
7a2a15895a
4 changed files with 865 additions and 0 deletions
  1. +492
    -0
      1_all_code/E_results.py
  2. +254
    -0
      1_all_code/NR_xgb_PathCS_Tuning.py
  3. +81
    -0
      1_all_code/SNF.py
  4. +38
    -0
      1_all_code/functions.py

+ 492
- 0
1_all_code/E_results.py View File

@@ -0,0 +1,492 @@
import sys
sys.path.append('/home/chujunyi/MFDF/3_my_code')
import os
os.chdir('/home/chujunyi/MFDF/1_original_data/E')
from Graph_utils import *
from functions import *
from SNF import *
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve
from sklearn.preprocessing import MaxAbsScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import gcforest
from gcforest.gcforest import GCForest
import xgboost
from functools import reduce
import glob
from scipy import interp
import csv
import time
import random
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

# Show full arrays/frames when printing (no truncation).
np.set_printoptions(threshold = np.inf)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the reference interaction file (tab-separated pairs, one per line)
# and normalise each pair to a single 'drug,target' string.
# NOTE(review): presumably these are the newly reported DDR interactions
# used for the overlap check in calc_metrics — confirm against the data file.
with open('/home/chujunyi/MFDF/1_original_data/E/E_combine_origin_new_interactions.txt','r') as f:
    E_ddr_new_pair = f.readlines()
E_ddr_new_pair = [item.replace('\n','').split('\t') for item in E_ddr_new_pair]
E_ddr_new_pair = [item[0] + ',' + item[1] for item in E_ddr_new_pair]
#-------------------------

# Build the drug/target universe from the adjacency file.
# get_All_D_T_thier_Labels_Signatures comes from Graph_utils (imported with *).
R_all_train_test = "e_admat_dgc_mat_2_line.txt"
(D,T,DT_signature,aAllPossiblePairs,dDs,dTs,diDs,diTs) = get_All_D_T_thier_Labels_Signatures(R_all_train_test)

# Cartesian product of all drugs x all targets; position in this list is the
# global sample index used by calc_metrics to map predictions back to pairs.
DT_feature_pair_list = []
for index in diDs.values():
    for column in diTs.values():
        DT_feature_pair_list.append([index,column])
#-------------------------

def calc_metrics(y_results,DT_feature_pair_list,E_ddr_new_pair):
    """Aggregate confusion statistics over all modes/seeds/folds and collect
    the known-positive pairs the models predicted as negative.

    y_results -- nested results [mode][seed][0 = test split][fold]; each fold
        entry is [sample indices, y_pred, y_true, P(class 0), P(class 1)]
    DT_feature_pair_list -- [drug, target] pair for every global sample index
    E_ddr_new_pair -- reference interactions as 'drug,target' strings

    Returns (DTI_CDF_new_pair, y_choose_all, ddr_intersection_dti): the
    candidate pairs as 'drug,target' strings, their prediction records, and
    the overlap with E_ddr_new_pair.
    """
    all_fp, all_fn = [],[]
    all_tp, all_tn = [],[]
    y_choose_all = pd.DataFrame()
    for mode in range(len(y_results)):
        for seed in range(len(y_results[mode])):
            for fold in range(len(y_results[mode][seed][0])): #[0] = test, [1] = train
                a = pd.DataFrame()
                a['test_idx_fold'] = y_results[mode][seed][0][fold][0]
                a['y_predprob_test_0'] = y_results[mode][seed][0][fold][3]
                a['y_predprob_test_1'] = y_results[mode][seed][0][fold][4]
                a['y_pred_test'] = y_results[mode][seed][0][fold][1]
                a['y_true_test'] = y_results[mode][seed][0][fold][2]
                # Confusion counts for labels [0, 1] — same layout as
                # sklearn.metrics.confusion_matrix(..., labels=[0, 1]).ravel().
                y_true = a['y_true_test'].astype(int)
                y_pred = a['y_pred_test'].astype(int)
                tn = int(((y_true == 0) & (y_pred == 0)).sum())
                fp = int(((y_true == 0) & (y_pred == 1)).sum())
                fn = int(((y_true == 1) & (y_pred == 0)).sum())
                tp = int(((y_true == 1) & (y_pred == 1)).sum())
                all_fp.append(fp)
                all_fn.append(fn)
                all_tp.append(tp)
                all_tn.append(tn)
                # BUG FIX: combine the masks with & instead of chained boolean
                # indexing a[m1][m2], which reindexes the second mask against a
                # shorter frame and triggers a pandas UserWarning.
                y_choose = a[(a['y_pred_test'] == 0) & (a['y_true_test'] == 1)]
                y_choose_all = pd.concat([y_choose_all, y_choose])
    # NOTE(review): * 10 presumably rescales the per-fold mean back to a full
    # 10-fold total — confirm the intended normalisation.
    mean_fp = np.mean(all_fp) * 10
    mean_fn = np.mean(all_fn) * 10
    mean_tp = np.mean(all_tp) * 10
    mean_tn = np.mean(all_tn) * 10

    micro_precision = mean_tp / (mean_tp + mean_fp)
    micro_recall = mean_tp/ (mean_tp + mean_fn)
    micro_fscore = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    print('mean_fp = {}, mean_fn = {}, mean_tp = {}, mean_tn = {}'.format(mean_fp, mean_fn, mean_tp, mean_tn))
    print('micro_precision = {}, micro_recall = {}, micro_fscore = {}'.format(micro_precision, micro_recall, micro_fscore))
    # Each sample may be mispredicted in several seeds/modes; keep one record.
    y_choose_all.drop_duplicates(subset='test_idx_fold', keep='first', inplace=True)
    y_choose_all['test_idx_fold'] = y_choose_all['test_idx_fold'].astype('int')

    y_choose_idx = [item for item in y_choose_all['test_idx_fold']]
    DTI_CDF_new_pair = [DT_feature_pair_list[i] for i in y_choose_idx]
    y_choose_all['new_pair'] = DTI_CDF_new_pair

    DTI_CDF_new_pair = [item[0] + ',' + item[1] for item in DTI_CDF_new_pair]
    ddr_intersection_dti = list(set(DTI_CDF_new_pair).intersection(set(E_ddr_new_pair)))
    print('DTI_CDF_new_pair: ', DTI_CDF_new_pair)
    print('ddr_intersection_dti = ', ddr_intersection_dti)
    return DTI_CDF_new_pair, y_choose_all, ddr_intersection_dti
def get_toy_config(rf_tree = 1, rf_max_depth = 1, rf_tree_2 = 1, rf_max_depth_2 = 1,
                   xgb_tree = 1, xgb_max_depth = 1, min_child_weight = 1, lr = 1,
                   xgb_tree_2 = 1, xgb_max_depth_2 = 1, min_child_weight_2 = 1, lr_2 = 1,
                   layer = 'rf1'):
    """Build a gcForest cascade configuration.

    Two RandomForest templates (rf/rf_2) and two XGBoost templates (xgb/xgb_2)
    are parameterised by the arguments; `layer` selects which of them are
    stacked in the cascade.  Single-estimator layers ('rf1', 'xgb1') use one
    layer and one fold; every other architecture uses 5 layers / 10 folds.

    Returns the config dict expected by gcforest.gcforest.GCForest.
    """
    if layer == 'rf1' or layer == 'xgb1':
        max_layers = 1
        n_folds = 1
    else:
        max_layers, n_folds = 5, 10

    config = {}
    ca_config = {}
    ca_config["random_state"] = 1231
    ca_config["look_indexs_cycle"] = None
    ca_config["max_layers"] = max_layers
    ca_config["early_stopping_rounds"] = 1
    ca_config["n_classes"] = 2
    ca_config["estimators"] = []
    # BUG FIX: the original dict literals specified 'criterion' twice.
    rf = {"n_folds": n_folds, "type": "RandomForestClassifier",
          "n_estimators": rf_tree, "max_depth": rf_max_depth, 'criterion' : 'entropy', 'bootstrap' : True,
          "n_jobs": -1, "class_weight":"balanced"}
    rf_2 = {"n_folds": n_folds, "type": "RandomForestClassifier",
            "n_estimators": rf_tree_2, "max_depth": rf_max_depth_2, 'criterion' : 'entropy', 'bootstrap' : True,
            "n_jobs": -1, "class_weight":"balanced"}
    xgb = {"n_folds": n_folds, "type": "XGBClassifier",'base_score':0.5, 'booster':'gbtree', 'colsample_bylevel':1,
           'colsample_bytree':1, 'eval_metric':'auc', 'gamma':0, 'learning_rate':lr,
           'max_delta_step':0, 'max_depth':xgb_max_depth, 'min_child_weight':min_child_weight, 'missing':None,
           'n_estimators':xgb_tree, 'n_jobs':-1, 'nthread':-1,
           'objective':'binary:logistic', 'random_state':1231, 'reg_alpha':0,
           'reg_lambda':1, 'scale_pos_weight':1, 'seed':1231, 'silent':True,
           'subsample':1}

    xgb_2 = {"n_folds": n_folds, "type": "XGBClassifier",'base_score':0.5, 'booster':'gbtree', 'colsample_bylevel':1,
             'colsample_bytree':1, 'eval_metric':'auc', 'gamma':0, 'learning_rate':lr_2,
             'max_delta_step':0, 'max_depth':xgb_max_depth_2, 'min_child_weight':min_child_weight_2, 'missing':None,
             'n_estimators':xgb_tree_2, 'n_jobs':-1, 'nthread':-1,
             'objective':'binary:logistic', 'random_state':1231, 'reg_alpha':0,
             'reg_lambda':1, 'scale_pos_weight':1, 'seed':1231, 'silent':True,
             'subsample':1}
    # Estimator composition per architecture (kept exactly as originally
    # defined, including the asymmetric 'rf2xgb1'/'rf1xgb2' mixes).
    if layer == 'rf1':
        ca_config["estimators"].append(rf)
    elif layer == 'xgb1':
        ca_config["estimators"].append(xgb)
    elif layer == 'rf1xgb1':
        ca_config["estimators"].append(rf)
        ca_config["estimators"].append(xgb)
    elif layer == 'rf1xgb1_2':
        ca_config["estimators"].append(rf_2)
        ca_config["estimators"].append(xgb_2)
    elif layer == 'xgb2':
        ca_config["estimators"].append(xgb)
        ca_config["estimators"].append(xgb_2)
    elif layer == 'rf2':
        ca_config["estimators"].append(rf)
        ca_config["estimators"].append(rf_2)
    elif layer == 'rf2xgb1':
        ca_config["estimators"].append(rf)
        ca_config["estimators"].append(rf_2)
        ca_config["estimators"].append(xgb_2)
    elif layer == 'rf1xgb2':
        ca_config["estimators"].append(xgb)
        ca_config["estimators"].append(xgb_2)
        ca_config["estimators"].append(rf_2)
    elif layer == 'rf2xgb2':
        ca_config["estimators"].append(rf)
        ca_config["estimators"].append(rf_2)
        ca_config["estimators"].append(xgb)
        ca_config["estimators"].append(xgb_2)
    config["cascade"] = ca_config
    return config
def run_classification(mode, seed, X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold, layer):
    """Run one (mode, seed, layer) experiment with pre-tuned hyper-parameters.

    mode  -- CV scenario: 'p' (pair), 'D' (new drug) or 'T' (new target)
    seed  -- one of the tuned seeds: 1231, 8367, 22, 1812, 4659
    layer -- cascade architecture name understood by get_toy_config
    The *_10_fold arguments are forwarded unchanged to
    run_classification_configuration.

    Returns (result, test_true_predict_compare) as produced by
    run_classification_configuration.
    Raises ValueError for an unknown mode (previously this fell through and
    crashed later with a NameError on xgb_param) and KeyError for an
    untuned seed.
    """
    if mode == 'p':
        # Per-seed tuned XGBoost parameters (first / second estimator).
        xgb_param = {}
        xgb_param[1231] = {'xgb_max_depth':3, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':800,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.1, 'lr_2':0.1, 'xgb_tree_2':400}

        xgb_param[8367] = {'xgb_max_depth':7, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':100,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.1, 'lr_2':0.1, 'xgb_tree_2':800}

        xgb_param[22] = {'xgb_max_depth':20, 'min_child_weight':0.1, 'lr':0.1, 'xgb_tree':200,
                         'xgb_max_depth_2':7, 'min_child_weight_2':0.01, 'lr_2':0.1, 'xgb_tree_2':800}

        xgb_param[1812] = {'xgb_max_depth':20, 'min_child_weight':0.1, 'lr':0.1, 'xgb_tree':200,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.001, 'lr_2':0.1, 'xgb_tree_2':800}

        xgb_param[4659] = {'xgb_max_depth':9, 'min_child_weight':0.1, 'lr':0.01, 'xgb_tree':800,
                           'xgb_max_depth_2':20, 'min_child_weight_2':0.001, 'lr_2':0.1, 'xgb_tree_2':400}
        # Per-seed tuned random-forest parameters (first / second estimator).
        rf_param = {}
        rf_param[1231] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 800, 'rf_max_depth_2': 20}
        rf_param[8367] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 100, 'rf_max_depth_2': 30}
        rf_param[22] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 400, 'rf_max_depth_2': None}
        rf_param[1812] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': 20}
        rf_param[4659] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': None}
    elif mode == 'D':
        xgb_param = {}
        xgb_param[1231] = {'xgb_max_depth':9, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':200,
                           'xgb_max_depth_2':7, 'min_child_weight_2':0.1, 'lr_2':0.1, 'xgb_tree_2':400}

        xgb_param[8367] = {'xgb_max_depth':5, 'min_child_weight':0.01, 'lr':0.1, 'xgb_tree':400,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.1, 'lr_2':0.1, 'xgb_tree_2':400}

        xgb_param[22] = {'xgb_max_depth':9, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':400,
                         'xgb_max_depth_2':20, 'min_child_weight_2':0.01, 'lr_2':0.1, 'xgb_tree_2':600}

        xgb_param[1812] = {'xgb_max_depth':5, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':400,
                           'xgb_max_depth_2':20, 'min_child_weight_2':1, 'lr_2':0.1, 'xgb_tree_2':600}

        xgb_param[4659] = {'xgb_max_depth':5, 'min_child_weight':0.01, 'lr':0.1, 'xgb_tree':800,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.001, 'lr_2':0.1, 'xgb_tree_2':600}
        rf_param = {}
        rf_param[1231] = {'rf_tree':600, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': 20}
        rf_param[8367] = {'rf_tree':300, 'rf_max_depth':3, 'rf_tree_2': 300, 'rf_max_depth_2': 20}
        rf_param[22] = {'rf_tree':200, 'rf_max_depth':3, 'rf_tree_2': 800, 'rf_max_depth_2': 20}
        rf_param[1812] = {'rf_tree':300, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': 20}
        rf_param[4659] = {'rf_tree':200, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': 20}

    elif mode == 'T':
        xgb_param = {}
        xgb_param[1231] = {'xgb_max_depth':7, 'min_child_weight':0.01, 'lr':0.1, 'xgb_tree':800,
                           'xgb_max_depth_2':20, 'min_child_weight_2':0.001, 'lr_2':0.1, 'xgb_tree_2':400}

        xgb_param[8367] = {'xgb_max_depth':7, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':600,
                           'xgb_max_depth_2':20, 'min_child_weight_2':0.01, 'lr_2':0.1, 'xgb_tree_2':800}

        xgb_param[22] = {'xgb_max_depth':9, 'min_child_weight':0.1, 'lr':0.1, 'xgb_tree':800,
                         'xgb_max_depth_2':20, 'min_child_weight_2':0.01, 'lr_2':0.1, 'xgb_tree_2':400}

        xgb_param[1812] = {'xgb_max_depth':9, 'min_child_weight':0.001, 'lr':0.1, 'xgb_tree':200,
                           'xgb_max_depth_2':20, 'min_child_weight_2':0.1, 'lr_2':0.1, 'xgb_tree_2':200}

        xgb_param[4659] = {'xgb_max_depth':7, 'min_child_weight':0.1, 'lr':0.1, 'xgb_tree':600,
                           'xgb_max_depth_2':9, 'min_child_weight_2':0.01, 'lr_2':0.1, 'xgb_tree_2':800}
        rf_param = {}
        rf_param[1231] = {'rf_tree':600, 'rf_max_depth':3, 'rf_tree_2': 300, 'rf_max_depth_2': 20}
        rf_param[8367] = {'rf_tree':600, 'rf_max_depth':3, 'rf_tree_2': 300, 'rf_max_depth_2': 30}
        rf_param[22] = {'rf_tree':100, 'rf_max_depth':3, 'rf_tree_2': 600, 'rf_max_depth_2': 30}
        rf_param[1812] = {'rf_tree':600, 'rf_max_depth':3, 'rf_tree_2': 200, 'rf_max_depth_2': None}
        rf_param[4659] = {'rf_tree':400, 'rf_max_depth':3, 'rf_tree_2': 800, 'rf_max_depth_2': 30}
    else:
        # BUG FIX: the original if/elif chain had no else branch, so an
        # unknown mode crashed below with a confusing NameError.
        raise ValueError("mode must be one of 'p', 'D' or 'T', got {!r}".format(mode))

    xgb_tree, xgb_max_depth = xgb_param[seed]['xgb_tree'], xgb_param[seed]['xgb_max_depth']
    min_child_weight, lr = xgb_param[seed]['min_child_weight'], xgb_param[seed]['lr']
    xgb_tree_2, xgb_max_depth_2 = xgb_param[seed]['xgb_tree_2'], xgb_param[seed]['xgb_max_depth_2']
    min_child_weight_2, lr_2 = xgb_param[seed]['min_child_weight_2'], xgb_param[seed]['lr_2']
    rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2 = rf_param[seed]['rf_tree'], rf_param[seed]['rf_max_depth'], rf_param[seed]['rf_tree_2'], rf_param[seed]['rf_max_depth_2']
    result,test_true_predict_compare = run_classification_configuration(X_train_10_fold, X_test_10_fold,
                                                                        y_train_10_fold, y_test_10_fold,
                                                                        test_idx_10_fold, train_idx_10_fold,
                                                                        rf_tree, rf_max_depth,
                                                                        rf_tree_2, rf_max_depth_2,
                                                                        xgb_tree, xgb_max_depth, min_child_weight, lr,
                                                                        xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2,
                                                                        layer, mode, seed)
    print('************results : one_mode + one_seed + one_layer + one_parameter************')
    print('Avg_AUPR_training = {:.4},Avg_AUPR_testing = {:.4},Avg_AUC_training = {:.4},Avg_AUC_testing = {:.4}'.format(
        result[0],result[1],result[3],result[4]))
    print('folds_AUPR_testing:', result[2])
    print('folds_AUPR_training:', result[6])
    print('folds_AUC_testing:', result[5])
    print('folds_AUC_training:',result[7])
    print('precision_testing = {}, recall_testing = {}, fscore_testing = {}'.format(result[8][0], result[8][1], result[8][2]))
    print('precision_training = {}, recall_training = {}, fscore_training = {}'.format(result[9][0], result[9][1], result[9][2]))
    #print('G_mean = ', result[0][12])
    print('************************************')
    return result, test_true_predict_compare

def gain_results(seeds, mode_list, layer, only_PathCS_feature = False):
    """Run the full evaluation over every CV mode and seed and print summaries.

    seeds     -- seeds whose pre-generated fold data files exist on disk
    mode_list -- CV modes to evaluate ('p', 'D', 'T')
    layer     -- cascade architecture name forwarded to run_classification
    only_PathCS_feature -- if True, keep only the leading PathCS feature
        columns (12 for mode 'p', 10 for the other modes)

    Returns test_true_predict_compare_10cv_seeds_modes: a nested list
    [mode][seed] of the per-fold prediction records.
    """
    aupr_list, auc_list = [], []
    precision_list, recall_list, fscore_list = [], [], []
    test_true_predict_compare_10cv_seeds_modes = []
    recall_25_list, recall_50_list, recall_100_list, recall_200_list, recall_400_list = [], [], [], [], []
    G_mean_list = []
    for mode in mode_list:
        trails_AUPRs, trails_AUCs = [], []
        trails_precisions, trails_recalls, trails_fscores = [], [], []
        trails_recall_25, trails_recall_50, trails_recall_100, trails_recall_200, trails_recall_400 = [], [], [], [], []
        trials_G_means = []
        seeds_results = []
        test_true_predict_compare_10cv_seeds = []
        for seed in seeds:
            print ("---------GENERATE_FOLD_{}_{}-----------------------------------------------".format(mode, seed))

            filename = '/home/chujunyi/MFDF/3_my_code/E/data/E_folddata_X-Y_S' + str(mode) + '_seed' + str(seed) + '.npz'
            folddata_XY = np.load(filename)

            X_train_10_fold, X_test_10_fold = folddata_XY['X_train_10_fold'],folddata_XY['X_test_10_fold']
            y_train_10_fold, y_test_10_fold = folddata_XY['y_train_10_fold'],folddata_XY['y_test_10_fold']
            train_idx_10_fold, test_idx_10_fold = folddata_XY['train_idx_10_fold'],folddata_XY['test_idx_10_fold']
            if only_PathCS_feature == True:
                # Mode 'p' has 12 PathCS columns, the other modes 10.  Use
                # lists instead of one-shot map objects so the folds can be
                # iterated more than once.
                if mode == 'p':
                    X_train_10_fold = [x[:, :12] for x in X_train_10_fold]
                    X_test_10_fold = [x[:, :12] for x in X_test_10_fold]
                else:
                    X_train_10_fold = [x[:, :10] for x in X_train_10_fold]
                    X_test_10_fold = [x[:, :10] for x in X_test_10_fold]
            print ('-------------------------------------------------THIS SEED FINISHED----------------------------------')

            results,test_true_predict_compare_10cv = run_classification(mode, seed, X_train_10_fold, X_test_10_fold, y_train_10_fold,
                                                                        y_test_10_fold,test_idx_10_fold, train_idx_10_fold, layer)
            seeds_results.append(results)
            trails_AUPRs.extend(results[2])   # per-fold test AUPRs
            trails_AUCs.extend(results[5])    # per-fold test AUCs
            trails_precisions.append(results[8][0])
            trails_recalls.append(results[8][1])
            trails_fscores.append(results[8][2])
            test_true_predict_compare_10cv_seeds.append(test_true_predict_compare_10cv)
            trails_recall_25.append(results[10])
            trails_recall_50.append(results[11])
            trails_recall_100.append(results[12])
            trails_recall_200.append(results[13])
            trails_recall_400.append(results[14])
            # BUG FIX: folds_G_mean is results[15]; the old code read
            # results[12], which is folds_recall_100 (the '###15' comment
            # already flagged the intended index).
            trials_G_means.extend(results[15])
        aupr, roc_auc = np.mean(trails_AUPRs), np.mean(trails_AUCs)
        precision, recall, fscore = np.mean(trails_precisions), np.mean(trails_recalls), np.mean(trails_fscores)
        recall_25, recall_50, recall_100, recall_200, recall_400 = np.mean(trails_recall_25), np.mean(trails_recall_50), np.mean(trails_recall_100), np.mean(trails_recall_200), np.mean(trails_recall_400)
        G_mean = np.mean(trials_G_means)
        print( "################Results###################" )
        print('model_architecture:',layer)
        print( "Mode: %s" % mode )
        print( "Average: AUPR: %s" % aupr )
        print( "Average: AUC: %s" % roc_auc )
        print( "Average: precision = {}, recall = {}, fscore = {} ".format(precision, recall, fscore))
        print( "Average: G_mean = ", G_mean)
        for result_ in seeds_results:
            print('seed_results: ')
            print('Avg_AUPR_training:',result_[0])
            print('Avg_AUPR:',result_[1])
            print('folds_AUPR:',result_[2])
            print('Avg_AUC_training:',result_[3])
            print('Avg_AUC:',result_[4])
            print('folds_AUC:',result_[5])
            print('precision_testing = {}, recall_testing = {}, fscore_testing = {}'.format(result_[8][0], result_[8][1], result_[8][2]))
            print('precision_training = {}, recall_training = {}, fscore_training = {}'.format(result_[9][0],result_[9][1], result_[9][2]))
            # BUG FIX: same index correction as above (12 -> 15).
            print('G_mean = ', result_[15])
            print('')
        print( "###########################################")
        aupr_list.append(aupr)
        auc_list.append(roc_auc)
        precision_list.append(precision)
        recall_list.append(recall)
        fscore_list.append(fscore)
        recall_25_list.append(recall_25)
        recall_50_list.append(recall_50)
        recall_100_list.append(recall_100)
        recall_200_list.append(recall_200)
        recall_400_list.append(recall_400)
        G_mean_list.append(G_mean)
        test_true_predict_compare_10cv_seeds_modes.append(test_true_predict_compare_10cv_seeds)
    print( "################Results###################" )
    print('model_architecture:',layer)
    print( "Mode: %s" % mode_list )
    print( "Average AUPR: %s" % aupr_list )
    print( "Average AUC: %s" % auc_list )
    print( "Average precision = {} ".format(precision_list))
    print( "Average recall = {} ".format(recall_list))
    print( "Average f1score = {} ".format(fscore_list))
    print( "Average recall_25 = {} ".format(recall_25_list))
    print( "Average recall_50 = {} ".format(recall_50_list))
    print( "Average recall_100 = {} ".format(recall_100_list))
    print( "Average recall_200 = {} ".format(recall_200_list))
    print( "Average recall_400 = {} ".format(recall_400_list))
    print( "Average G_mean = ,", G_mean_list)
    print( "###########################################")
    return test_true_predict_compare_10cv_seeds_modes
def run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold,
                                     rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                     xgb_tree, xgb_max_depth, min_child_weight, lr, xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2,
                                     layer, mode, seed):
    """Train one gcForest cascade per CV fold and gather train/test metrics.

    The rf_* / xgb_* arguments fill the estimator templates built by
    get_toy_config; `layer` selects which estimators form the cascade.
    `mode` and `seed` are bookkeeping only (used by the commented-out
    model-dump path below).

    Returns (metrics, [test_true_predict_compare, train_true_predict_compare])
    where metrics is indexed as:
      0-2: Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing
      3-5: Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing
      6-7: folds_AUPR_training, folds_AUC_training
      8-9: Avg_metrics3_testing, Avg_metrics3_training (precision/recall/F1)
      10-14: folds_recall_25/50/100/200/400
      15: folds_G_mean
    """
    folds_AUC_testing, folds_AUPR_testing = [], []
    folds_AUC_training, folds_AUPR_training= [], []
    folds_metrics3_training, folds_metrics3_testing = [], []
    test_true_predict_compare, train_true_predict_compare = [], []
    folds_recall_25, folds_recall_50, folds_recall_100, folds_recall_200, folds_recall_400 = [], [], [], [], []
    folds_G_mean = []
    i = 0  # fold counter (only needed by the commented-out model dump)
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold, test_idx_10_fold, train_idx_10_fold):
        config = get_toy_config(rf_tree, rf_max_depth, rf_tree_2, rf_max_depth_2,
                                xgb_tree, xgb_max_depth, min_child_weight, lr, xgb_tree_2, xgb_max_depth_2, min_child_weight_2, lr_2,
                                layer)
        gc = GCForest(config)
        print(config)
        X_train_enc = gc.fit_transform(X_train, y_train)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        # Rank test samples by P(class 1) (column 1 of temp) descending, then
        # measure recall among the top-N ranked samples (N = 25..400).
        temp = pd.DataFrame([y_test, y_predprob_test[:,1],y_pred_test]).T.sort_values(by = 1, ascending = False)
        recall_25 = precision_recall_fscore_support(temp.iloc[:25,:][0], temp.iloc[:25,:][2], pos_label = 1, average = 'binary')[1]
        recall_50 = precision_recall_fscore_support(temp.iloc[:50,:][0], temp.iloc[:50,:][2], pos_label = 1, average = 'binary')[1]
        recall_100 = precision_recall_fscore_support(temp.iloc[:100,:][0], temp.iloc[:100,:][2], pos_label = 1, average = 'binary')[1]
        recall_200 = precision_recall_fscore_support(temp.iloc[:200,:][0], temp.iloc[:200,:][2], pos_label = 1, average = 'binary')[1]
        recall_400 = precision_recall_fscore_support(temp.iloc[:400,:][0], temp.iloc[:400,:][2], pos_label = 1, average = 'binary')[1]
        # Per-fold prediction records: [sample indices, y_pred, y_true, P(0), P(1)].
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test, y_predprob_test[:,0], y_predprob_test[:,1]]) #10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train, y_predprob_train[:,0], y_predprob_train[:,1]]) #10-cv

        # with open(r'E:\11_MFDF_code\3_my_code\9_major_revised\NR\code\12\model\NR_PathCS_' + layer + '_S' + str(mode) + '_seed' + str(seed) + '_fold' + str(i) + '.pkl', "wb") as f1:
        #     pickle.dump(gc, f1, pickle.HIGHEST_PROTOCOL)
        precision_training, recall_training, _ = precision_recall_curve(y_train, y_predprob_train[:,1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(y_test, y_predprob_test[:,1], pos_label=1)
        AUPR_training, AUPR_testing = auc(recall_training,precision_training), auc(recall_testing, precision_testing)
        AUC_training, AUC_testing = roc_auc_score(y_train, y_predprob_train[:,1]), roc_auc_score(y_test, y_predprob_test[:,1])
        metrics3_testing = precision_recall_fscore_support(y_test, y_pred_test, pos_label = 1, average = 'binary')[:3]
        metrics3_training = precision_recall_fscore_support(y_train, y_pred_train, pos_label = 1, average = 'binary')[:3]
        # G-mean = sqrt(recall * specificity), balancing both classes.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test, labels = [0,1]).ravel()
        specificity = float(tn) / float(tn + fp)
        recall = metrics3_testing[1]
        G_mean = np.sqrt(recall * specificity)

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_metrics3_testing.append(metrics3_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)
        folds_metrics3_training.append(metrics3_training)
        folds_G_mean.append(G_mean)
        folds_recall_25.append(recall_25)
        folds_recall_50.append(recall_50)
        folds_recall_100.append(recall_100)
        folds_recall_200.append(recall_200)
        folds_recall_400.append(recall_400)
        i += 1
    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    Avg_metrics3_training = np.mean(folds_metrics3_training, axis = 0)
    Avg_metrics3_testing = np.mean(folds_metrics3_testing, axis = 0)
    Avg_G_mean = np.mean(folds_G_mean)
    return [Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing, #012
            Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing, #345
            folds_AUPR_training, folds_AUC_training, #67
            Avg_metrics3_testing, Avg_metrics3_training, #89
            folds_recall_25, folds_recall_50, folds_recall_100, folds_recall_200, folds_recall_400, folds_G_mean], [test_true_predict_compare, train_true_predict_compare] #


# Driver: evaluate every cascade architecture over all modes and seeds.
mode_list = ["p", "D", "T"]
seeds = [1231, 8367, 22, 1812, 4659]
model_architecture = ['rf1', 'xgb1', 'rf2', 'xgb2', 'rf2xgb2', 'rf1xgb2', 'rf2xgb1', 'rf1xgb1_2','rf1xgb1'] # DNN time


for layer in model_architecture:
    print('model_architecture:',layer)
    # BUG FIX: time.clock() was removed in Python 3.8 — process_time() gives
    # the equivalent CPU-time measurement; time() stays for wall-clock time.
    time_begin_cpu = time.process_time()
    time_begin_wall = time.time()
    rf1_PathCS_test_true_predict_compare_10cv_seeds_modes = gain_results(seeds,mode_list,layer, only_PathCS_feature = True)
    DTI_CDF_new_pair, y_choose_all, ddr_intersection_dti = calc_metrics(rf1_PathCS_test_true_predict_compare_10cv_seeds_modes,DT_feature_pair_list,E_ddr_new_pair)
    time_end_cpu = time.process_time()
    time_end_wall = time.time()
    print('CPU Time = {}, Wall Time = {}'.format(time_end_cpu - time_begin_cpu, time_end_wall - time_begin_wall))

+ 254
- 0
1_all_code/NR_xgb_PathCS_Tuning.py View File

@@ -0,0 +1,254 @@

# coding: utf-8

# # Functions required for training

# In[2]:

import sys
sys.path.append(r'E:\11_MFDF_code\3_my_code')
import os
os.chdir(r'E:\11_MFDF_code\1_original_data\NR')
from Graph_utils import *
from functions import *
from SNF import *
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve
from sklearn.preprocessing import MaxAbsScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import gcforest
from gcforest.gcforest import GCForest
import xgboost
from functools import reduce
import glob
from scipy import interp
import csv
import time
import random
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

# Show full arrays/frames when printing (no truncation).
np.set_printoptions(threshold = np.inf)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# In[4]:

def gain_results(seeds, mode_list, layer, only_PathCS_feature = False):
    """Run the XGBoost hyper-parameter search for every CV mode and seed.

    Loads the pre-computed RFE fold data per (mode, seed), optionally keeps
    only the leading PathCS feature columns, runs the grid search and prints
    summary statistics.  (The unused metrics3 accumulators of the original
    version were removed.)

    Returns (aupr_list, auc_list, test_true_predict_compare_10cv_seeds_modes).
    """
    aupr_list = []
    auc_list = []
    test_true_predict_compare_10cv_seeds_modes = []
    for mode in mode_list:
        trails_AUPRs = []
        trails_AUCs = []
        seeds_results = []
        test_true_predict_compare_10cv_seeds = []
        for seed in seeds:
            print ("---------GENERATE_FOLD_{}-----------------------------------------------".format(seed))

            filename = r'E:\11_MFDF_code\3_my_code\9_major_revised\NR\data\RFE100_NR_folddata_X-Y_S' + str(mode) + '_seed' + str(seed) + '.npz'
            folddata_XY = np.load(filename)

            X_train_10_fold, X_test_10_fold = folddata_XY['X_train_rfe_10_fold'],folddata_XY['X_test_rfe_10_fold']
            y_train_10_fold, y_test_10_fold = folddata_XY['y_train_10_fold'],folddata_XY['y_test_10_fold']
            # BUG FIX: the original assigned these crosswise
            # (test_idx_10_fold received 'train_idx_10_fold' and vice versa),
            # so prediction records carried the wrong sample indices.
            train_idx_10_fold, test_idx_10_fold = folddata_XY['train_idx_10_fold'],folddata_XY['test_idx_10_fold']
            if only_PathCS_feature == True:
                # Mode 'p' has 12 PathCS columns, the other modes 10.  Use
                # lists instead of one-shot map objects so the folds can be
                # iterated more than once.
                if mode == 'p':
                    X_train_10_fold = [x[:, :12] for x in X_train_10_fold]
                    X_test_10_fold = [x[:, :12] for x in X_test_10_fold]
                else:
                    X_train_10_fold = [x[:, :10] for x in X_train_10_fold]
                    X_test_10_fold = [x[:, :10] for x in X_test_10_fold]
            print ('-------------------------------------------------THIS SEED FINISHED----------------------------------')

            results,parameter_result_df,test_true_predict_compare_10cv = run_classification(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold,layer)
            seeds_results.append(results)
            trails_AUPRs.extend(results[2])
            trails_AUCs.extend(results[5])
            test_true_predict_compare_10cv_seeds.append(test_true_predict_compare_10cv)
        aupr,c1 = mean_confidence_interval(trails_AUPRs)
        roc_auc,c1 = mean_confidence_interval(trails_AUCs)
        print( "################Results###################" )
        print('model_architecture:',layer)
        print( "Mode: %s" % mode )
        print( "Average: AUPR: %s" % aupr )
        print( "Average: AUC: %s" % roc_auc )
        for result_ in seeds_results:
            print('seed_results: ')
            print('Avg_AUPR_training:',result_[0])
            print('Avg_AUPR:',result_[1])
            print('folds_AUPR:',result_[2])
            print('Avg_AUC_training:',result_[3])
            print('Avg_AUC:',result_[4])
            print('folds_AUC:',result_[5])
            print('')
        print( "###########################################")
        aupr_list.append(aupr)
        auc_list.append(roc_auc)
        test_true_predict_compare_10cv_seeds_modes.append(test_true_predict_compare_10cv_seeds)
    print( "################Results###################" )
    print('model_architecture:',layer)
    print( "Mode: %s" % mode_list )
    print( "Average AUPR: %s" % aupr_list )
    print( "Average AUC: %s" % auc_list )
    print( "###########################################")
    return aupr_list,auc_list,test_true_predict_compare_10cv_seeds_modes


# In[9]:

# params_1 = max_depth, min_child_weight

def get_toy_config(trees, max_depth, min_child_weight, cw = 0.01,layer = 'xgb1', max_layers = 1):
    """Build a gcForest cascade configuration holding one XGBoost estimator.

    trees / max_depth / min_child_weight parameterise the XGBClassifier and
    cw is its learning rate.  Only layer == 'xgb1' adds the estimator; any
    other value yields an empty estimator list.

    Returns the config dict expected by gcforest.gcforest.GCForest.
    """
    xgb_estimator = {
        "n_folds": 1, "type": "XGBClassifier",
        'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1,
        'colsample_bytree': 1, 'eval_metric': 'auc', 'gamma': 0,
        'learning_rate': cw, 'max_delta_step': 0, 'max_depth': max_depth,
        'min_child_weight': min_child_weight, 'missing': None,
        'n_estimators': trees, 'n_jobs': -1, 'nthread': -1,
        'objective': 'binary:logistic', 'random_state': 1231, 'reg_alpha': 0,
        'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': 1231, 'silent': True,
        'subsample': 1,
    }
    cascade = {
        "random_state": 1231,
        "look_indexs_cycle": None,
        "data_save_rounds": 2,
        "data_save_dir": r'E:\11_MFDF_code\4_v2_generate_dataset\NR\save_dir',
        "max_layers": max_layers,
        "early_stopping_rounds": 1,
        "n_classes": 2,
        "estimators": [xgb_estimator] if layer == 'xgb1' else [],
    }
    return {"cascade": cascade}


# In[18]:

def run_classification(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold,layer):
    """Exhaustive grid search over the XGBoost hyper-parameters.

    Evaluates every combination of max_depth x min_child_weight x
    learning_rate x n_estimators (8*4*3*8 = 768 configurations) via
    run_classification_configuration and selects the one with the highest
    average test AUPR.

    Returns (best_result, parameter_result_df, best_predictions), all three
    describing the same best configuration.
    """
    max_depths = [i for i in range(3,11,1)] # 8 values
    min_child_weights = [0.001, 0.01, 0.1, 1] # 4 values
    learning_rate = [0.01, 0.1, 0.5] # 3 values
    no_trees = [100, 150, 200, 250, 300, 500, 600, 800] # 8 values
    result = []
    parameter_list = []
    test_true_predict_compare_10cv = []
    for max_depth in max_depths:
        for min_child_weight in min_child_weights:
            for cw in learning_rate:
                for t in no_trees:
                    parameter_result,test_true_predict_compare = run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold,t,max_depth, min_child_weight,layer,cw)
                    result.append(parameter_result)
                    test_true_predict_compare_10cv.append(test_true_predict_compare)

                    parameter_list.append([[layer,max_depth,min_child_weight,cw,t],parameter_result[0],parameter_result[1]])
    parameter_result_df = pd.DataFrame(parameter_list)
    parameter_result_df.columns = ['parameter','train_AUPR','Avg_AUPR']
    print(parameter_result_df.sort_values(by='Avg_AUPR', axis=0))
    best_parameter_idx = parameter_result_df['Avg_AUPR'].idxmax()
    best_parameter_result = parameter_result_df.iloc[best_parameter_idx,:]
    print('best_parameter_idx : ',best_parameter_idx)
    print('best_parameter_result : ',best_parameter_result)
    # BUG FIX: the original sorted `result` in place by (train AUPR, test
    # AUPR) and returned result[0], while the predictions were selected by
    # best_parameter_idx (highest test AUPR) — the returned metrics and
    # predictions could belong to different configurations.  Select both by
    # the same index so they are guaranteed to describe the same model.
    best_result = result[best_parameter_idx]
    print(best_result)
    print('Avg_AUPR_training = {:.4},Avg_AUPR_testing = {:.4},Avg_AUC_training = {:.4},Avg_AUC_testing = {:.4}'.format(
        best_result[0],best_result[1],best_result[3],best_result[4]))
    print('folds_AUPR_testing:', best_result[2])
    print('folds_AUPR_training:', best_result[6])
    print('folds_AUC_testing:', best_result[5])
    print('folds_AUC_training:',best_result[7])
    print('************results************')
    return best_result, parameter_result_df, test_true_predict_compare_10cv[best_parameter_idx]


# In[19]:

def run_classification_configuration(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold,test_idx_10_fold, train_idx_10_fold,trees,max_depth, min_child_weight,layer,cw=0.001):
    """Evaluate a single XGBoost-in-gcForest configuration over the CV folds.

    trees / max_depth / min_child_weight / cw parameterise the XGBClassifier
    built by get_toy_config (cw is the learning rate).

    Returns ([Avg_AUPR_training, Avg_AUPR_testing, folds_AUPR_testing,
              Avg_AUC_training, Avg_AUC_testing, folds_AUC_testing,
              folds_AUPR_training, folds_AUC_training],
             [test_true_predict_compare, train_true_predict_compare]).
    """
    i = 0  # fold counter (declared but never advanced in this version)
    folds_AUC_testing = []
    folds_AUPR_testing = []
    folds_AUC_training = []
    folds_AUPR_training = []
    test_true_predict_compare = []
    train_true_predict_compare = []
    for X_train, X_test, y_train, y_test, test_idx_fold, train_idx_fold in zip(X_train_10_fold, X_test_10_fold, y_train_10_fold, y_test_10_fold, test_idx_10_fold, train_idx_10_fold):
        # X_train, X_test = X_train[:,12:], X_test[:,12:]
        # X_train, X_test = X_train[:,:12], X_test[:,:12]
        config = get_toy_config(trees,max_depth, min_child_weight,cw,layer)
        gc = GCForest(config)
        #print(config)
        # NOTE(review): X_test/y_test are handed to fit_transform here —
        # presumably only for layer-wise progress reporting; confirm gcforest
        # does not use them for early stopping (tuning-leakage risk).
        X_train_enc = gc.fit_transform(X_train, y_train, X_test, y_test)

        y_pred_train = gc.predict(X_train)
        y_predprob_train = gc.predict_proba(X_train)
        y_pred_test = gc.predict(X_test)
        y_predprob_test = gc.predict_proba(X_test)
        y_predprob_test_df = pd.DataFrame(y_predprob_test)    # (unused)
        y_predprob_train_df = pd.DataFrame(y_predprob_train)  # (unused)
        # Per-fold prediction records: [sample indices, y_pred, y_true, P(0), P(1)].
        test_true_predict_compare.append([test_idx_fold, y_pred_test, y_test, y_predprob_test[:,0], y_predprob_test[:,1]]) #10-cv
        train_true_predict_compare.append([train_idx_fold, y_pred_train, y_train, y_predprob_train[:,0], y_predprob_train[:,1]]) #10-cv
        precision_training, recall_training, _ = precision_recall_curve(y_train, y_predprob_train[:,1], pos_label=1)
        precision_testing, recall_testing, _ = precision_recall_curve(y_test, y_predprob_test[:,1], pos_label=1)
        AUPR_training = auc(recall_training,precision_training)
        AUPR_testing = auc(recall_testing, precision_testing)
        AUC_training = roc_auc_score(y_train, y_predprob_train[:,1])
        AUC_testing = roc_auc_score(y_test, y_predprob_test[:,1])

        folds_AUC_testing.append(AUC_testing)
        folds_AUPR_testing.append(AUPR_testing)
        folds_AUC_training.append(AUC_training)
        folds_AUPR_training.append(AUPR_training)

    Avg_AUPR_training = np.mean(folds_AUPR_training)
    Avg_AUPR_testing = np.mean(folds_AUPR_testing)
    Avg_AUC_training = np.mean(folds_AUC_training)
    Avg_AUC_testing = np.mean(folds_AUC_testing)
    return [Avg_AUPR_training,Avg_AUPR_testing,folds_AUPR_testing, Avg_AUC_training,Avg_AUC_testing,folds_AUC_testing,folds_AUPR_training,folds_AUC_training], [test_true_predict_compare,train_true_predict_compare]

# # Training

# In[20]:

# Three feature classes

# In[ ]:
# Driver: run the tuning for the selected modes / seeds / architectures.
mode_list = ["p"]#,"D","T"]
seeds = [1231, 8367, 22, 1812, 4659]
model_architecture = ['xgb1']

for layer in model_architecture:
    print('model_architecture:',layer)
    aupr_list,auc_list,rf1_PathCS_test_true_predict_compare_10cv_seeds_modes = gain_results(seeds,mode_list,layer, only_PathCS_feature = True)


+ 81
- 0
1_all_code/SNF.py View File

@@ -0,0 +1,81 @@
import numpy as np
import copy


def FindDominantSet(W,K):
    """Keep only the K largest entries of each row of W (the K nearest
    neighbours) and renormalise every row to sum to 1.

    W -- square affinity matrix
    K -- number of neighbours to retain per row
    Returns the sparsified, row-stochastic matrix.
    """
    n_rows, n_cols = W.shape
    dominant = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        # Columns holding the K largest affinities of this row.
        nearest = np.argsort(W[row, :])[-K:]
        dominant[row, nearest] = W[row, nearest]

    # Row-normalise so each row sums to one.
    row_sums = dominant.sum(axis=1).reshape(-1, 1)
    return dominant / row_sums



def normalized(W,ALPHA):
    """Add ALPHA to the diagonal of W and return the symmetrised result
    (W' + W'^T) / 2.
    """
    dim = W.shape[0]
    boosted = W + ALPHA * np.identity(dim)
    return (boosted + boosted.T) / 2







def SNF(Wall,K,t,ALPHA=1):
    """Similarity Network Fusion: fuse C affinity matrices into one.

    Wall  -- list of C square affinity matrices (all the same shape)
    K     -- number of nearest neighbours kept in the sparse kernels
    t     -- number of diffusion iterations
    ALPHA -- diagonal boost used when renormalising after each iteration

    Returns the fused, symmetrised affinity matrix W.
    """
    # BUG FIX: work on copies — the original normalised Wall[i] in place,
    # silently mutating the caller's matrices (the unused `import copy` at
    # the top of this module suggests copying was always intended).
    Wall = [np.array(W_i, dtype=float) for W_i in Wall]

    C = len(Wall)
    m,n = Wall[0].shape

    # Row-normalise and symmetrise each view.
    for i in range(C):
        B = np.sum(Wall[i],axis=1)
        B = B.reshape(len(B),1)
        Wall[i] = Wall[i]/B
        Wall[i] = (Wall[i]+np.transpose(Wall[i]))/2

    # Sparse K-nearest-neighbour kernel for each view.
    newW = []
    for i in range(C):
        newW.append(FindDominantSet(Wall[i],K))

    Wsum = np.zeros((m,n))
    for i in range(C):
        Wsum += Wall[i]

    # Iteratively diffuse each view through the average of the other views.
    for iteration in range(t):
        Wall0 = []
        for i in range(C):
            temp = np.dot(np.dot(newW[i], (Wsum - Wall[i])),np.transpose(newW[i]))/(C-1)
            Wall0.append(temp)

        for i in range(C):
            Wall[i] = normalized(Wall0[i],ALPHA)

        Wsum = np.zeros((m,n))
        for i in range(C):
            Wsum += Wall[i]

    # Average the fused views, row-normalise and symmetrise the result.
    W = Wsum/C
    B = np.sum(W,axis=1)
    B = B.reshape(len(B),1)
    W/=B
    W = (W+np.transpose(W)+np.identity(m))/2
    return W







+ 38
- 0
1_all_code/functions.py View File

@@ -0,0 +1,38 @@
import numpy as np
from collections import defaultdict

def cross_validation(intMat, seeds, cv=0, num=10):
cv_data = defaultdict(list)
for seed in seeds:
num_drugs, num_targets = intMat.shape
prng = np.random.RandomState(seed)
if cv == 0:
index = prng.permutation(num_drugs)
if cv == 1:
index = prng.permutation(intMat.size)
step = index.size/num
for i in range(num):
if i < num-1:
ii = index[int(i*step):int((i+1)*step)]
else:
ii = index[int(i*step):]
if cv == 0:
test_data = np.array([[k, j] for k in ii for j in range(num_targets)], dtype=np.int32)
elif cv == 1:
test_data = np.array([[k/num_targets, k % num_targets] for k in ii], dtype=np.int32)
x, y = test_data[:, 0], test_data[:, 1]
test_label = intMat[x, y]
W = np.ones(intMat.shape)
W[x, y] = 0
cv_data[seed].append((W, test_data, test_label))
return cv_data


def mean_confidence_interval(data, confidence=0.95):
    """Return (mean, half-width) of the two-sided confidence interval of the
    sample mean, using the Student-t distribution with n-1 degrees of
    freedom.
    """
    import scipy.stats
    a = 1.0*np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    # BUG FIX: use the public quantile function t.ppf instead of the private
    # t._ppf, which skips argument validation and is not a stable API.
    h = se * scipy.stats.t.ppf((1+confidence)/2., n-1)
    return m, h

Loading…
Cancel
Save