You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

63 lines
2.2 KiB

  1. import sys
  2. sys.path.append("../..")
  3. import warnings
  4. warnings.filterwarnings("ignore")
  5. from chemocommons import *
  6. import pandas as pd
  7. import numpy as np
  8. from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer
  9. from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes
  10. from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?
  11. from skmultilearn.ensemble import RakelD
  12. from skmultilearn.adapt import MLkNN, MLTSVM
  13. from skmultilearn.problem_transform import LabelPowerset # sorry, we only used LP
  14. from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?
  15. from sklearn.model_selection import LeaveOneOut, RepeatedKFold #, KFold # jackknife, "socalled"
  16. from sklearn.metrics import jaccard_similarity_score, f1_score # for some calculation
  17. from sklearn.utils.multiclass import unique_labels
  18. from lightgbm import LGBMClassifier
  19. loocv = LeaveOneOut() # jackknife
  20. label_names = ["ABCG2", "MDR1", "MRP1", "MRP2", "MRP3", "MRP4", "NTCP2", "S15A1",
  21. "S22A1", "SO1A2", "SO1B1", "SO1B3", "SO2B1"]
  22. Y = pd.read_csv("label_matrix.txt", sep="\t", names=label_names)
  23. Y[Y==-1]=0
  24. ft_FP = pd.read_csv("query_smiles_feature_similarity_four_average.csv", names=label_names)
  25. ft_FP.rename(mapper= lambda x: x + "_FP", axis=1, inplace=True)
  26. ft_OT = pd.read_csv("feature_similarity_chebi_ontology_DiShIn_2.csv", names=label_names)
  27. ft_OT.rename(mapper= lambda x: x + "_OT", axis=1, inplace=True)
  28. X = np.concatenate((ft_FP, ft_OT), axis=1)
  29. scoring_funcs = {"hamming loss": hamming_func,
  30. "aiming": aiming_func,
  31. "coverage": coverage_func,
  32. "accuracy": accuracy_func,
  33. "absolute true": absolute_true_func,
  34. } # Keep recorded
  35. parameters = {'labelset_size': [2, 3, 4, 5, 6, 7, 8, 9, 10]}
  36. rakeld = GridSearchCV(RakelD(base_classifier=GaussianNB(),
  37. baseclassifier_require_dense=[True, True],), param_grid=parameters, n_jobs=-1, cv=loocv,
  38. scoring=scoring_funcs, verbose=3, refit="absolute true")
  39. rakeld.fit(X, Y.values)
  40. print(rakeld.best_score_)
  41. mytuple = (
  42. rakeld,
  43. )
  44. to_save = dump(mytuple, filename="rakeld.joblib")