import warnings
warnings.filterwarnings("ignore")
from chemocommons import *  # presumably provides the custom multi-label scorers used below
import pandas as pd
import numpy as np
from skmultilearn.cluster import NetworkXLabelGraphClusterer  # community detection on the label graph
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder  # builds the label co-occurrence graph
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.adapt import MLkNN, MLTSVM
from skmultilearn.problem_transform import LabelPowerset  # only the Label Powerset transformation is used
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut, RepeatedKFold  # leave-one-out, i.e. the jackknife
from sklearn.metrics import jaccard_similarity_score, f1_score
from sklearn.utils.multiclass import unique_labels
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from joblib import dump
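# Data setup: a label matrix over 13 transporter labels, recoded from {-1, 1} to {0, 1},
# plus two per-label similarity blocks -- "_FP" (presumably fingerprint-based similarity)
# and "_OT" (ChEBI ontology / DiShIn semantic similarity) -- concatenated column-wise into X.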
loocv = LeaveOneOut()  # jackknife
label_names = ["ABCG2", "MDR1", "MRP1", "MRP2", "MRP3", "MRP4", "NTCP2", "S15A1",
               "S22A1", "SO1A2", "SO1B1", "SO1B3", "SO2B1"]
Y = pd.read_csv("label_matrix.txt", sep="\t", names=label_names)
Y[Y == -1] = 0
ft_FP = pd.read_csv("query_smiles_feature_similarity_four_average.csv", names=label_names)
ft_FP.rename(mapper=lambda x: x + "_FP", axis=1, inplace=True)
ft_OT = pd.read_csv("feature_similarity_chebi_ontology_DiShIn_2.csv", names=label_names)
ft_OT.rename(mapper=lambda x: x + "_OT", axis=1, inplace=True)
X = np.concatenate((ft_FP, ft_OT), axis=1)
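# Quick sanity check (a minimal sketch, assuming the feature files and the label matrix
# describe the same compounds in the same order): each sample should carry
# 13 "_FP" + 13 "_OT" similarity features.
assert X.shape == (Y.shape[0], 2 * len(label_names))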
scoring_funcs = {"hamming loss": hamming_func,
                 "aiming": aiming_func,
                 "coverage": coverage_func,
                 "accuracy": accuracy_func,
                 "absolute true": absolute_true_func,
                 }  # all five metrics are recorded; "absolute true" drives refitting
parameters = {'k': range(1, 11), 's': [0.5, 0.7, 1.0]}
mlknn = GridSearchCV(MLkNN(), param_grid=parameters, n_jobs=-1, cv=loocv,
                     scoring=scoring_funcs, verbose=3, refit="absolute true")
mlknn.fit(X, Y.values)
print(mlknn.best_score_)
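# MLTSVM: the only hyperparameter searched is the regularization weight c_k,
# swept over powers of two from 2**-5 to 2**4.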
parameters = {'c_k': [2**i for i in range(-5, 5)]}
mtsvm = GridSearchCV(MLTSVM(), param_grid=parameters, n_jobs=-1, cv=loocv,
                     scoring=scoring_funcs, verbose=3, refit="absolute true")
mtsvm.fit(X, Y.values)
print(mtsvm.best_score_)
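# The remaining searches share one recipe: build a weighted label co-occurrence graph,
# partition the label space with Louvain or label-propagation ('lpa') community detection,
# and fit a Label Powerset transformation with a tree-ensemble base estimator on each
# label cluster. Only the base estimator and its n_estimators grid change between blocks.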
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}
ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
ext.fit(X, Y.values)
print(ext.best_score_)
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}
rf = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                  scoring=scoring_funcs, verbose=3, refit="absolute true")
rf.fit(X, Y.values)
print(rf.best_score_)
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [XGBClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 500],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}
xgb = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
xgb.fit(X, Y.values)
print(xgb.best_score_)
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [LGBMClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 500],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}
lgb = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
lgb.fit(X, Y.values)
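# Optionally report the best LightGBM score as well, matching the pattern used for the
# other searches (this print is an addition for symmetry, not required for the dump below).
print(lgb.best_score_)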
mytuple = (
    ext,
    rf,
    xgb,
    lgb,
    mlknn
)
to_save = dump(mytuple, filename="ensemble.joblib")
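# A minimal sketch of reusing the persisted searches (assumes joblib's dump/load API and
# that "ensemble.joblib" sits in the working directory); the *_loaded names are illustrative.
from joblib import load
ext_loaded, rf_loaded, xgb_loaded, lgb_loaded, mlknn_loaded = load("ensemble.joblib")
print(mlknn_loaded.predict(X).shape)  # refit best estimator, sparse (n_samples, 13) prediction matrix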