You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

93 lines
3.6 KiB

  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": null,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import sys\n",
  10. "sys.path.append(\"../..\")\n",
  11. "\n",
  12. "import warnings\n",
  13. "warnings.filterwarnings(\"ignore\")\n",
  14. "\n",
  15. "from chemocommons import *\n",
  16. "import pandas as pd\n",
  17. "import numpy as np\n",
  18. "\n",
  19. "from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
  20. "from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer\n",
  21. "from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes\n",
  22. "from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?\n",
  23. "from skmultilearn.adapt import MLkNN, MLTSVM\n",
  24. "from skmultilearn.problem_transform import ClassifierChain, LabelPowerset # sorry, we only used LP\n",
  25. "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?\n",
  26. "from sklearn.preprocessing import OneHotEncoder\n",
  27. "from sklearn.model_selection import LeaveOneOut, RepeatedKFold#, KFold # jackknife, \"socalled\"\n",
  28. "from sklearn.metrics import jaccard_similarity_score, f1_score # for some calculation\n",
  29. "from sklearn.utils.multiclass import unique_labels\n",
  30. "mskf = MultilabelStratifiedKFold(n_splits=10, random_state=1994115)\n",
  31. "loocv = LeaveOneOut() # jackknife\n",
  32. "\n",
  33. "label_names = [\"ABCG2\", \"MDR1\", \"MRP1\", \"MRP2\", \"MRP3\", \"MRP4\", \"NTCP2\", \"S15A1\", \n",
  34. " \"S22A1\", \"SO1A2\", \"SO1B1\", \"SO1B3\", \"SO2B1\"]\n",
  35. "\n",
  36. "Y = pd.read_csv(\"label_matrix.txt\", sep=\"\\t\", names=label_names)\n",
  37. "Y[Y==-1]=0\n",
  38. "\n",
  39. "ft_FP = pd.read_csv(\"query_smiles_feature_similarity_four_average.csv\", names=label_names)\n",
  40. "ft_FP.rename(mapper= lambda x: x + \"_FP\", axis=1, inplace=True)\n",
  41. "ft_OT = pd.read_csv(\"feature_similarity_chebi_ontology_DiShIn_2.csv\", names=label_names)\n",
  42. "ft_OT.rename(mapper= lambda x: x + \"_OT\", axis=1, inplace=True)\n",
  43. "\n",
  44. "X = np.concatenate((ft_FP, ft_OT), axis=1)\n",
  45. "\n",
  46. "scoring_funcs = {\"hamming loss\": hamming_func, \n",
  47. " \"aiming\": aiming_func, \n",
  48. " \"coverage\": coverage_func, \n",
  49. " \"accuracy\": accuracy_func, \n",
  50. " \"absolute true\": absolute_true_func, \n",
  51. " } # Keep recorded\n",
  52. "\n",
  53. "parameters = {\n",
  54. " 'classifier': [LabelPowerset()],\n",
  55. " 'classifier__classifier': [ExtraTreesClassifier()],\n",
  56. " 'classifier__classifier__n_estimators': [50, 100, 500, 1000],\n",
  57. " 'clusterer' : [\n",
  58. " NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),\n",
  59. " NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')\n",
  60. " ]\n",
  61. "}\n",
  62. "\n",
  63. "\n",
  64. "ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv, \n",
  65. " scoring=scoring_funcs, verbose=3, refit=\"absolute true\")\n",
  66. "ext.fit(X, Y.values)\n",
  67. "print(ext.best_score_)"
  68. ]
  69. }
  70. ],
  71. "metadata": {
  72. "kernelspec": {
  73. "display_name": "Python 3",
  74. "language": "python",
  75. "name": "python3"
  76. },
  77. "language_info": {
  78. "codemirror_mode": {
  79. "name": "ipython",
  80. "version": 3
  81. },
  82. "file_extension": ".py",
  83. "mimetype": "text/x-python",
  84. "name": "python",
  85. "nbconvert_exporter": "python",
  86. "pygments_lexer": "ipython3",
  87. "version": "3.6.8"
  88. }
  89. },
  90. "nbformat": 4,
  91. "nbformat_minor": 2
  92. }