Browse Source

上传文件至 ''

master
Katherine1216 1 week ago
parent
commit
a053209420
3 changed files with 2353 additions and 0 deletions
  1. +14
    -0
      Supplementary Material.csv
  2. +1847
    -0
      Ys.csv
  3. +492
    -0
      single_label.ipynb

+ 14
- 0
Supplementary Material.csv View File

@@ -0,0 +1,14 @@
,ABCG2,MDR1,MRP1,MRP2,MRP3,MRP4,NTCP2,S15A1,S22A1,SO1A2,SO1B1,SO1B3,SO2B1
ABCG2,1,0.330621867,0,0.023091166,0,0.110127563,0.056887184,0.159947355,0.097582982,0,0.015824916,0,0.065997145
MDR1,0.330621867,1,0.157059277,0.166582161,0.144729872,0.112303794,0.164686267,0.363136039,0.231152405,0.046743914,0.132932091,0.107578556,0.085246549
MRP1,0,0.157059277,1,0.293515137,0.211957883,0.12852835,0.035808188,0.09509708,0.067260704,0.049350112,0.053444379,0.07290668,0
MRP2,0.023091166,0.166582161,0.293515137,1,0.271575348,0.089693921,0.018703015,0.074609999,0.074636543,0.114863545,0.145856601,0.167352086,0.092700801
MRP3,0,0.144729872,0.211957883,0.271575348,1,0.109256763,0.002195217,0.052440759,0.030025504,0,0.059409607,0.048355354,0.038523044
MRP4,0.110127563,0.112303794,0.12852835,0.089693921,0.109256763,1,0.004412667,0.038938816,0,0.080924748,0.115939088,0.066917877,0
NTCP2,0.056887184,0.164686267,0.035808188,0.018703015,0.002195217,0.004412667,1,0.055260824,0.037300512,0,0,0,0
S15A1,0.159947355,0.363136039,0.09509708,0.074609999,0.052440759,0.038938816,0.055260824,1,0.104133576,0.055989148,0.043207821,0.039893473,0.030438712
S22A1,0.097582982,0.231152405,0.067260704,0.074636543,0.030025504,0,0.037300512,0.104133576,1,0.014493356,0.020967143,0.03397567,0
SO1A2,0,0.046743914,0.049350112,0.114863545,0,0.080924748,0,0.055989148,0.014493356,1,0.317111182,0.32438942,0.230457313
SO1B1,0.015824916,0.132932091,0.053444379,0.145856601,0.059409607,0.115939088,0,0.043207821,0.020967143,0.317111182,1,0.501365038,0.259655934
SO1B3,0,0.107578556,0.07290668,0.167352086,0.048355354,0.066917877,0,0.039893473,0.03397567,0.32438942,0.501365038,1,0.199565018
SO2B1,0.065997145,0.085246549,0,0.092700801,0.038523044,0,0,0.030438712,0,0.230457313,0.259655934,0.199565018,1

+ 1847
- 0
Ys.csv
File diff suppressed because it is too large
View File


+ 492
- 0
single_label.ipynb View File

@@ -0,0 +1,492 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"D:\\Anaconda3\\envs\\py36\\lib\\site-packages\\sklearn\\externals\\six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n",
" \"(https://pypi.org/project/six/).\", DeprecationWarning)\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(\"../..\")\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from chemocommons import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer\n",
"from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes\n",
"from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?\n",
"from skmultilearn.adapt import MLkNN, MLTSVM\n",
"from skmultilearn.problem_transform import LabelPowerset # sorry, we only used LP\n",
"from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?\n",
"from sklearn.model_selection import LeaveOneOut, RepeatedKFold #, KFold # jackknife, \"socalled\"\n",
"from sklearn.metrics import jaccard_similarity_score, f1_score # for some calculation\n",
"from sklearn.utils.multiclass import unique_labels\n",
"from lightgbm import LGBMClassifier\n",
"from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
"from joblib import load"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#loocv = LeaveOneOut() # jackknife\n",
"rmskf = MultilabelStratifiedKFold(n_splits=10, random_state=19941115)\n",
"label_names = [\"ABCG2\", \"MDR1\", \"MRP1\", \"MRP2\", \"MRP3\", \"MRP4\", \"NTCP2\", \"S15A1\", \n",
" \"S22A1\", \"SO1A2\", \"SO1B1\", \"SO1B3\", \"SO2B1\"]\n",
"\n",
"Y = pd.read_csv(\"label_matrix.txt\", sep=\"\\t\", names=label_names)\n",
"Y[Y==-1]=0\n",
"\n",
"ft_FP = pd.read_csv(\"query_smiles_feature_similarity_four_average.csv\", names=label_names)\n",
"ft_FP.rename(mapper= lambda x: x + \"_FP\", axis=1, inplace=True)\n",
"ft_OT = pd.read_csv(\"feature_similarity_chebi_ontology_DiShIn_2.csv\", names=label_names)\n",
"ft_OT.rename(mapper= lambda x: x + \"_OT\", axis=1, inplace=True)\n",
"\n",
"X = np.concatenate((ft_FP, ft_OT), axis=1)\n",
"Y = Y.values"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"1 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"2 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"3 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"4 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"5 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"6 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"7 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"8 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"9 th repeat:\n",
"0 th fold.\n",
"1 th fold.\n",
"2 th fold.\n",
"3 th fold.\n",
"4 th fold.\n",
"5 th fold.\n",
"6 th fold.\n",
"7 th fold.\n",
"8 th fold.\n",
"9 th fold.\n",
"[0.86886624 0.82629165 0.95213385 0.93531963 0.97052574 0.97484388\n",
" 0.99404831 0.97431425 0.96509124 0.97319891 0.95619702 0.97633315\n",
" 0.98074716] [0.72206496 0.77961642 0.83943579 0.72212698 0.59753571 0.36666667\n",
" 0.925 0.91736341 0.9194019 0.49666667 0.519 0.5125\n",
" 0.3 ] [0.48469748 0.90494505 0.44450549 0.25406593 0.31071429 0.167\n",
" 0.86666667 0.87695652 0.60961905 0.13333333 0.14097222 0.151\n",
" 0.11 ] [0.57687171 0.83710125 0.57530299 0.36020054 0.38846676 0.21738095\n",
" 0.89090909 0.89447793 0.73041748 0.20373016 0.21516994 0.22747619\n",
" 0.15428571] [0.89082465 0.92430166 0.9057471 0.91329001 0.89746348 0.93414092\n",
" 0.99760552 0.98078294 0.94215135 0.86763828 0.89635493 0.85186265\n",
" 0.85161042]\n"
]
}
],
"source": [
"def measure_per_label(measure, y_true, y_predicted):\n",
" \"\"\"\n",
" This code is inspired by skmultilearn, but our y_true and y_predicted are all dense numpy.ndarray\n",
" \"\"\"\n",
" return [\n",
" measure(\n",
" y_true[:, i],\n",
" y_predicted[:, i]\n",
" )\n",
" for i in range(y_true.shape[1])\n",
" ]\n",
"\n",
"\n",
"NLSP_RF = load(\"rf.joblib\")[0]\n",
"\n",
"final_model = NLSP_RF.best_estimator_\n",
"label_acc = []\n",
"label_sp = []\n",
"label_rc = []\n",
"label_f1 = []\n",
"label_auc = []\n",
"\n",
"y_pred = np.zeros_like(Y)\n",
"y_proba = np.zeros_like(Y)\n",
"\n",
"\n",
"\n",
"for i in range(10): #10*10-cv\n",
" print(i, \"th repeat:\")\n",
" kfold = MultilabelStratifiedKFold(n_splits=10, random_state=19941115)\n",
" for k, (train, test) in enumerate(kfold.split(X, Y)):\n",
" print(k, \"th fold.\")\n",
" final_model.fit(X[train], Y[train])\n",
" y_pred = np.array(final_model.predict(X[test]).todense())\n",
" y_proba = np.array(final_model.predict_proba(X[test]).todense())\n",
" label_acc.append(measure_per_label(metrics.accuracy_score, Y[test], y_pred))\n",
" label_sp.append(measure_per_label(metrics.precision_score, Y[test], y_pred))\n",
" label_rc.append(measure_per_label(metrics.recall_score, Y[test], y_pred))\n",
" label_f1.append(measure_per_label(metrics.f1_score, Y[test], y_pred))\n",
" label_auc.append(measure_per_label(metrics.roc_auc_score, Y[test], y_proba))\n",
"\n",
"label_acc = np.array(label_acc)\n",
"label_sp = np.array(label_sp)\n",
"label_rc = np.array(label_rc)\n",
"label_f1 = np.array(label_f1)\n",
"label_auc = np.array(label_auc)\n",
"\n",
"to_sav = dump((label_acc, label_sp, label_rc, label_f1, label_auc), filename=\"report_array.joblib\")\n",
"\n",
"print(label_acc.mean(axis=0), label_sp.mean(axis=0), label_rc.mean(axis=0),label_f1.mean(axis=0), label_auc.mean(axis=0))\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.86728061 0.82665222 0.95449621 0.92741062 0.96749729 0.97074756\n",
" 0.99458288 0.9723727 0.96533044 0.97453954 0.95612134 0.97508126\n",
" 0.97995666] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856\n",
" 0.02491874 0.1099675 0.04767064 0.00433369 0.00595883 0.00270856\n",
" 0.00216685] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856\n",
" 0.02491874 0.1099675 0.04767064 0.00433369 0.00595883 0.00270856\n",
" 0.00216685] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856\n",
" 0.02491874 0.1099675 0.04767064 0.00433369 0.00595883 0.00270856\n",
" 0.00216685] 0.0\n"
]
}
],
"source": [
"print(label_acc.mean(axis=0), label_sp.mean(axis=0), label_rc.mean(axis=0),label_f1.mean(axis=0), label_auc.mean(axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.86886624, 0.82629165, 0.95213385, 0.93531963, 0.97052574,\n",
" 0.97484388, 0.99404831, 0.97431425, 0.96509124, 0.97319891,\n",
" 0.95619702, 0.97633315, 0.98074716])"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label_acc.mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.72206496, 0.77961642, 0.83943579, 0.72212698, 0.59753571,\n",
" 0.36666667, 0.925 , 0.91736341, 0.9194019 , 0.49666667,\n",
" 0.519 , 0.5125 , 0.3 ])"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label_sp.mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.48469748, 0.90494505, 0.44450549, 0.25406593, 0.31071429,\n",
" 0.167 , 0.86666667, 0.87695652, 0.60961905, 0.13333333,\n",
" 0.14097222, 0.151 , 0.11 ])"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label_rc.mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.09046587, 0.45016251, 0.03575298, 0.01787649, 0.00975081,\n",
" 0.00270856, 0.02491874, 0.1099675 , 0.04767064, 0.00433369,\n",
" 0.00595883, 0.00270856, 0.00216685])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label_f1.mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"reports = load(\"report_array.joblib\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"final_reports = []\n",
"for i in reports:\n",
" final_reports.append(i.mean(axis=0))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"final_reports = pd.DataFrame(final_reports)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"final_reports = final_reports.T"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"final_reports.index = label_names"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"final_reports.columns = [\"ACC\", \"SP\", \"RC\", \"F1\", \"AUC\"]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"final_reports.to_csv(\"final_reports.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(13,)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_list = reports[0].mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.72206496, 0.77961642, 0.83943579, 0.72212698, 0.59753571,\n",
" 0.36666667, 0.925 , 0.91736341, 0.9194019 , 0.49666667,\n",
" 0.519 , 0.5125 , 0.3 ])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reports[1].mean(axis=0)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Loading…
Cancel
Save