
Upload files to ''

master
Katherine1216 committed 1 week ago
commit 06ec6a1c1d
5 changed files with 2526 additions and 0 deletions
  1. iterstrat.py (+386, -0)
  2. label_matrix.txt (+1846, -0)
  3. model_training.ipynb (+92, -0)
  4. model_training.py (+132, -0)
  5. model_training_ext.py (+70, -0)

iterstrat.py (+386, -0)

@@ -0,0 +1,386 @@
"""This file includes multilabel cross validators based on an implementation of
the Iterative Stratification algorithm described in the following paper:
Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-
Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M. (eds)
Machine Learning and Knowledge Discovery in Databases. ECML PKDD 2011. Lecture
Notes in Computer Science, vol 6913. Springer, Berlin, Heidelberg.

From scikit-learn 0.19.0, StratifiedKFold, RepeatedStratifiedKFold, and
StratifiedShuffleSplit were copied and modified, retaining compatibility
with scikit-learn.

Attribution to authors of scikit-learn/model_selection/_split.py under BSD 3 clause:
Alexandre Gramfort <alexandre.gramfort@inria.fr>,
Gael Varoquaux <gael.varoquaux@normalesup.org>,
Olivier Grisel <olivier.grisel@ensta.org>,
Raghav RV <rvraghav93@gmail.com>
"""

# Author: Trent J. Bradberry <trentjason@hotmail.com>
# License: BSD 3 clause

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils.validation import _num_samples, check_array
from sklearn.utils.multiclass import type_of_target

from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits, \
    BaseShuffleSplit, _validate_shuffle_split


def IterativeStratification(labels, r, random_state):
"""This function implements the Iterative Stratification algorithm described
in the following paper:
Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of
Multi-Label Data. In: Gunopulos D., Hofmann T., Malerba D., Vazirgiannis M.
(eds) Machine Learning and Knowledge Discovery in Databases. ECML PKDD
2011. Lecture Notes in Computer Science, vol 6913. Springer, Berlin,
Heidelberg.
"""

n_samples = labels.shape[0]
test_folds = np.zeros(n_samples, dtype=int)

# Calculate the desired number of examples at each subset
c_folds = r * n_samples

# Calculate the desired number of examples of each label at each subset
c_folds_labels = np.outer(r, labels.sum(axis=0))

labels_not_processed_mask = np.ones(n_samples, dtype=bool)

while np.any(labels_not_processed_mask):
# Find the label with the fewest (but at least one) remaining examples,
# breaking ties randomly
num_labels = labels[labels_not_processed_mask].sum(axis=0)

# Handle case where only all-zero labels are left by distributing
# across all folds as evenly as possible (not in original algorithm but
# mentioned in the text). (By handling this case separately, some
# code redundancy is introduced; however, this approach allows for
# decreased execution time when there are a relatively large number
# of all-zero labels.)
if num_labels.sum() == 0:
sample_idxs = np.where(labels_not_processed_mask)[0]

for sample_idx in sample_idxs:
fold_idx = np.where(c_folds == c_folds.max())[0]

if fold_idx.shape[0] > 1:
fold_idx = fold_idx[random_state.choice(fold_idx.shape[0])]

test_folds[sample_idx] = fold_idx
c_folds[fold_idx] -= 1

break

label_idx = np.where(num_labels == num_labels[np.nonzero(num_labels)].min())[0]
if label_idx.shape[0] > 1:
label_idx = label_idx[random_state.choice(label_idx.shape[0])]

sample_idxs = np.where(np.logical_and(labels[:, label_idx].flatten(), labels_not_processed_mask))[0]

for sample_idx in sample_idxs:
# Find the subset(s) with the largest number of desired examples
# for this label, breaking ties by considering the largest number
# of desired examples, breaking further ties randomly
label_folds = c_folds_labels[:, label_idx]
fold_idx = np.where(label_folds == label_folds.max())[0]

if fold_idx.shape[0] > 1:
temp_fold_idx = np.where(c_folds[fold_idx] ==
c_folds[fold_idx].max())[0]
fold_idx = fold_idx[temp_fold_idx]

if temp_fold_idx.shape[0] > 1:
fold_idx = fold_idx[random_state.choice(temp_fold_idx.shape[0])]

test_folds[sample_idx] = fold_idx
labels_not_processed_mask[sample_idx] = False

# Update desired number of examples
c_folds_labels[fold_idx, labels[sample_idx]] -= 1
c_folds[fold_idx] -= 1

return test_folds
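
# Example (illustrative only): for three equal folds, pass the desired fold
# proportions in ``r`` together with a NumPy RandomState for tie-breaking:
#
#     rng = np.random.RandomState(0)
#     fold_of_sample = IterativeStratification(labels=y, r=np.full(3, 1 / 3),
#                                              random_state=rng)
#
# ``labels`` (here ``y``) must be a binary indicator matrix;
# ``fold_of_sample[i]`` is the index of the fold whose test set holds sample i.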


class MultilabelStratifiedKFold(_BaseKFold):
"""Multilabel stratified K-Folds cross-validator
Provides train/test indices to split multilabel data into train/test sets.
This cross-validation object is a variation of KFold that returns
stratified folds for multilabel data. The folds are made by preserving
the percentage of samples for each label.
Parameters
----------
n_splits : int, default=3
Number of folds. Must be at least 2.
shuffle : boolean, optional
Whether to shuffle each stratification of the data before splitting
into batches.
random_state : int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Unlike StratifiedKFold that only uses random_state
when ``shuffle`` == True, this multilabel implementation
always uses the random_state since the iterative stratification
algorithm breaks ties randomly.
Examples
--------
>>> from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
>>> import numpy as np
>>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
>>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
>>> mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
>>> mskf.get_n_splits(X, y)
2
>>> print(mskf) # doctest: +NORMALIZE_WHITESPACE
MultilabelStratifiedKFold(n_splits=2, random_state=0, shuffle=False)
>>> for train_index, test_index in mskf.split(X, y):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [0 3 4 6] TEST: [1 2 5 7]
TRAIN: [1 2 5 7] TEST: [0 3 4 6]
Notes
-----
Train and test sizes may be slightly different in each fold.
See also
--------
RepeatedMultilabelStratifiedKFold: Repeats Multilabel Stratified K-Fold
n times.
"""

def __init__(self, n_splits=3, shuffle=False, random_state=None):
super(MultilabelStratifiedKFold, self).__init__(n_splits, shuffle, random_state)

def _make_test_folds(self, X, y):
y = np.asarray(y, dtype=bool)
type_of_target_y = type_of_target(y)

if type_of_target_y != 'multilabel-indicator':
raise ValueError(
'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(type_of_target_y))

num_samples = y.shape[0]

rng = check_random_state(self.random_state)
indices = np.arange(num_samples)

if self.shuffle:
rng.shuffle(indices)
y = y[indices]

r = np.asarray([1 / self.n_splits] * self.n_splits)

test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

return test_folds[np.argsort(indices)]

def _iter_test_masks(self, X=None, y=None, groups=None):
test_folds = self._make_test_folds(X, y)
for i in range(self.n_splits):
yield test_folds == i

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples, n_labels)
The target variable for supervised learning problems.
Multilabel stratification is done based on the y labels.
groups : object
Always ignored, exists for compatibility.
Returns
-------
train : ndarray
The training set indices for that split.
test : ndarray
The testing set indices for that split.
Notes
-----
Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(MultilabelStratifiedKFold, self).split(X, y, groups)


class RepeatedMultilabelStratifiedKFold(_RepeatedSplits):
"""Repeated Multilabel Stratified K-Fold cross validator.
Repeats Mulilabel Stratified K-Fold n times with different randomization
in each repetition.
Parameters
----------
n_splits : int, default=5
Number of folds. Must be at least 2.
n_repeats : int, default=10
Number of times cross-validator needs to be repeated.
random_state : None, int or RandomState, default=None
Random state to be used to generate random state for each
repetition as well as randomly breaking ties within the iterative
stratification algorithm.
Examples
--------
>>> from iterstrat.ml_stratifiers import RepeatedMultilabelStratifiedKFold
>>> import numpy as np
>>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
>>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
>>> rmskf = RepeatedMultilabelStratifiedKFold(n_splits=2, n_repeats=2,
... random_state=0)
>>> for train_index, test_index in rmskf.split(X, y):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
...
TRAIN: [0 3 4 6] TEST: [1 2 5 7]
TRAIN: [1 2 5 7] TEST: [0 3 4 6]
TRAIN: [0 1 4 5] TEST: [2 3 6 7]
TRAIN: [2 3 6 7] TEST: [0 1 4 5]
See also
--------
RepeatedStratifiedKFold: Repeats (Non-multilabel) Stratified K-Fold
n times.
"""
def __init__(self, n_splits=5, n_repeats=10, random_state=None):
super(RepeatedMultilabelStratifiedKFold, self).__init__(
MultilabelStratifiedKFold, n_repeats, random_state,
n_splits=n_splits)


class MultilabelStratifiedShuffleSplit(BaseShuffleSplit):
"""Multilabel Stratified ShuffleSplit cross-validator
Provides train/test indices to split data into train/test sets.
This cross-validation object is a merge of MultilabelStratifiedKFold and
ShuffleSplit, which returns stratified randomized folds for multilabel
data. The folds are made by preserving the percentage of each label.
Note: like the ShuffleSplit strategy, multilabel stratified random splits
do not guarantee that all folds will be different, although this is
still very likely for sizeable datasets.
Parameters
----------
n_splits : int, default 10
Number of re-shuffling & splitting iterations.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. By default, the value is set to 0.1.
The default will change in version 0.21. It will remain 0.1 only
if ``train_size`` is unspecified, otherwise it will complement
the specified ``train_size``.
train_size : float, int, or None, default is None
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
the value is automatically set to the complement of the test size.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Unlike StratifiedShuffleSplit that only uses
random_state when ``shuffle`` == True, this multilabel implementation
always uses the random_state since the iterative stratification
algorithm breaks ties randomly.
Examples
--------
>>> from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
>>> import numpy as np
>>> X = np.array([[1,2], [3,4], [1,2], [3,4], [1,2], [3,4], [1,2], [3,4]])
>>> y = np.array([[0,0], [0,0], [0,1], [0,1], [1,1], [1,1], [1,0], [1,0]])
>>> msss = MultilabelStratifiedShuffleSplit(n_splits=3, test_size=0.5,
... random_state=0)
>>> msss.get_n_splits(X, y)
3
>>> print(mss) # doctest: +ELLIPSIS
MultilabelStratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
train_size=None)
>>> for train_index, test_index in msss.split(X, y):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
TRAIN: [1 2 5 7] TEST: [0 3 4 6]
TRAIN: [2 3 6 7] TEST: [0 1 4 5]
TRAIN: [1 2 5 6] TEST: [0 3 4 7]
Notes
-----
Train and test sizes may be slightly different from desired due to the
preference of stratification over perfectly sized folds.
"""

def __init__(self, n_splits=10, test_size="default", train_size=None,
random_state=None):
super(MultilabelStratifiedShuffleSplit, self).__init__(
n_splits, test_size, train_size, random_state)

def _iter_indices(self, X, y, groups=None):
n_samples = _num_samples(X)
y = check_array(y, ensure_2d=False, dtype=None)
y = np.asarray(y, dtype=bool)
type_of_target_y = type_of_target(y)

if type_of_target_y != 'multilabel-indicator':
raise ValueError(
'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
type_of_target_y))

n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
self.train_size)

n_samples = y.shape[0]
rng = check_random_state(self.random_state)
y_orig = y.copy()

r = np.array([n_train, n_test]) / (n_train + n_test)

for _ in range(self.n_splits):
indices = np.arange(n_samples)
rng.shuffle(indices)
y = y_orig[indices]

test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

test_idx = test_folds[np.argsort(indices)] == 1
test = np.where(test_idx)[0]
train = np.where(~test_idx)[0]

yield train, test

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples, n_labels)
The target variable for supervised learning problems.
Multilabel stratification is done based on the y labels.
groups : object
Always ignored, exists for compatibility.
Returns
-------
train : ndarray
The training set indices for that split.
test : ndarray
The testing set indices for that split.
Notes
-----
Randomized CV splitters may return different results for each call of
split. You can make the results identical by setting ``random_state``
to an integer.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(MultilabelStratifiedShuffleSplit, self).split(X, y, groups)
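
For context, a minimal usage sketch of the splitter added above. It mirrors the class docstring; since this commit adds a top-level iterstrat.py rather than the iterstrat package, the plain import path below is an assumption (the released package exposes it as iterstrat.ml_stratifiers):

import numpy as np
from iterstrat import MultilabelStratifiedKFold  # assumes iterstrat.py is on sys.path

X = np.zeros((8, 1))  # placeholder features; only y drives the stratification
y = np.array([[0, 0], [0, 0], [0, 1], [0, 1],
              [1, 1], [1, 1], [1, 0], [1, 0]])

mskf = MultilabelStratifiedKFold(n_splits=2, random_state=0)
for train_index, test_index in mskf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)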

label_matrix.txt (+1846, -0)
File diff suppressed because it is too large.


model_training.ipynb (+92, -0)

@@ -0,0 +1,92 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../..\")\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"from chemocommons import *\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from iterstrat.ml_stratifiers import MultilabelStratifiedKFold\n",
"from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer\n",
"from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes\n",
"from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?\n",
"from skmultilearn.adapt import MLkNN, MLTSVM\n",
"from skmultilearn.problem_transform import ClassifierChain, LabelPowerset # sorry, we only used LP\n",
"from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.model_selection import LeaveOneOut, RepeatedKFold#, KFold # jackknife, \"socalled\"\n",
"from sklearn.metrics import jaccard_similarity_score, f1_score # for some calculation\n",
"from sklearn.utils.multiclass import unique_labels\n",
"mskf = MultilabelStratifiedKFold(n_splits=10, random_state=1994115)\n",
"loocv = LeaveOneOut() # jackknife\n",
"\n",
"label_names = [\"ABCG2\", \"MDR1\", \"MRP1\", \"MRP2\", \"MRP3\", \"MRP4\", \"NTCP2\", \"S15A1\", \n",
" \"S22A1\", \"SO1A2\", \"SO1B1\", \"SO1B3\", \"SO2B1\"]\n",
"\n",
"Y = pd.read_csv(\"label_matrix.txt\", sep=\"\\t\", names=label_names)\n",
"Y[Y==-1]=0\n",
"\n",
"ft_FP = pd.read_csv(\"query_smiles_feature_similarity_four_average.csv\", names=label_names)\n",
"ft_FP.rename(mapper= lambda x: x + \"_FP\", axis=1, inplace=True)\n",
"ft_OT = pd.read_csv(\"feature_similarity_chebi_ontology_DiShIn_2.csv\", names=label_names)\n",
"ft_OT.rename(mapper= lambda x: x + \"_OT\", axis=1, inplace=True)\n",
"\n",
"X = np.concatenate((ft_FP, ft_OT), axis=1)\n",
"\n",
"scoring_funcs = {\"hamming loss\": hamming_func, \n",
" \"aiming\": aiming_func, \n",
" \"coverage\": coverage_func, \n",
" \"accuracy\": accuracy_func, \n",
" \"absolute true\": absolute_true_func, \n",
" } # Keep recorded\n",
"\n",
"parameters = {\n",
" 'classifier': [LabelPowerset()],\n",
" 'classifier__classifier': [ExtraTreesClassifier()],\n",
" 'classifier__classifier__n_estimators': [50, 100, 500, 1000],\n",
" 'clusterer' : [\n",
" NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),\n",
" NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')\n",
" ]\n",
"}\n",
"\n",
"\n",
"ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv, \n",
" scoring=scoring_funcs, verbose=3, refit=\"absolute true\")\n",
"ext.fit(X, Y.values)\n",
"print(ext.best_score_)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
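
One gap worth flagging: chemocommons is not part of this commit, so the five scorers passed to GridSearchCV (hamming_func, aiming_func, coverage_func, accuracy_func, absolute_true_func) are undefined here. A minimal sketch of what they plausibly look like, assuming the standard multilabel definitions their names suggest (aiming = per-sample precision, coverage = per-sample recall, absolute true = exact match); this is illustrative, not the project's actual code:

import numpy as np
from sklearn.metrics import make_scorer, hamming_loss


def _dense(a):
    # accept both dense arrays and the sparse matrices skmultilearn predicts
    return a.toarray() if hasattr(a, "toarray") else np.asarray(a)


def absolute_true(y_true, y_pred):
    # fraction of samples whose full label set is predicted exactly
    y_true, y_pred = _dense(y_true), _dense(y_pred)
    return float(np.mean(np.all(y_true == y_pred, axis=1)))


def aiming(y_true, y_pred):
    # precision averaged over samples (denominator: number of predicted labels)
    y_true, y_pred = _dense(y_true).astype(bool), _dense(y_pred).astype(bool)
    return float(np.mean((y_true & y_pred).sum(axis=1) /
                         np.maximum(y_pred.sum(axis=1), 1)))


absolute_true_func = make_scorer(absolute_true)
aiming_func = make_scorer(aiming)
hamming_func = make_scorer(hamming_loss, greater_is_better=False)
# coverage_func (per-sample recall) and accuracy_func (per-sample Jaccard)
# would follow the same pattern with different denominators.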

model_training.py (+132, -0)

@@ -0,0 +1,132 @@
import warnings
warnings.filterwarnings("ignore")

from chemocommons import *  # project-local helpers (scoring functions used below)
import pandas as pd
import numpy as np

from skmultilearn.cluster import NetworkXLabelGraphClusterer  # label-space clusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder  # builds the label co-occurrence graph
from skmultilearn.ensemble import LabelSpacePartitioningClassifier  # one classifier per label cluster

from skmultilearn.adapt import MLkNN, MLTSVM
from skmultilearn.problem_transform import LabelPowerset  # only LabelPowerset is used below
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut, RepeatedKFold  # LeaveOneOut = jackknife
from sklearn.metrics import jaccard_similarity_score, f1_score
from sklearn.utils.multiclass import unique_labels
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier  # used below; import added for completeness
from joblib import dump  # used to persist the fitted search objects

loocv = LeaveOneOut() # jackknife

label_names = ["ABCG2", "MDR1", "MRP1", "MRP2", "MRP3", "MRP4", "NTCP2", "S15A1",
"S22A1", "SO1A2", "SO1B1", "SO1B3", "SO2B1"]

Y = pd.read_csv("label_matrix.txt", sep="\t", names=label_names)
Y[Y==-1]=0

ft_FP = pd.read_csv("query_smiles_feature_similarity_four_average.csv", names=label_names)
ft_FP.rename(mapper= lambda x: x + "_FP", axis=1, inplace=True)
ft_OT = pd.read_csv("feature_similarity_chebi_ontology_DiShIn_2.csv", names=label_names)
ft_OT.rename(mapper= lambda x: x + "_OT", axis=1, inplace=True)

X = np.concatenate((ft_FP, ft_OT), axis=1)

scoring_funcs = {"hamming loss": hamming_func,
"aiming": aiming_func,
"coverage": coverage_func,
"accuracy": accuracy_func,
"absolute true": absolute_true_func,
} # Keep recorded


parameters = {'k': range(1, 11), 's': [0.5, 0.7, 1.0]}

mlknn = GridSearchCV(MLkNN(), param_grid=parameters, n_jobs=-1, cv=loocv,
                     scoring=scoring_funcs, verbose=3, refit="absolute true")
mlknn.fit(X, Y.values)
print(mlknn.best_score_)

parameters = {'c_k': [2**i for i in range(-5, 5)]}

mtsvm = GridSearchCV(MLTSVM(), param_grid=parameters, n_jobs=-1, cv=loocv,
                     scoring=scoring_funcs, verbose=3, refit="absolute true")
mtsvm.fit(X, Y.values)
print(mtsvm.best_score_)

parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
ext.fit(X, Y.values)
print(ext.best_score_)


parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

rf = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                  scoring=scoring_funcs, verbose=3, refit="absolute true")
rf.fit(X, Y.values)
print(rf.best_score_)


parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [XGBClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 500],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

xgb = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
xgb.fit(X, Y.values)
print(xgb.best_score_)



parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [LGBMClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 500],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

lgb = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
lgb.fit(X, Y.values)


mytuple = (
    ext,
    rf,
    xgb,
    lgb,
    mlknn,
)

to_save = dump(mytuple, filename="ensemble.joblib")
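
For completeness, the tuple persisted above can be restored with joblib's load; a quick sketch (assuming the same working directory):

from joblib import load

ext, rf, xgb, lgb, mlknn = load("ensemble.joblib")
for name, search in [("ExtraTrees", ext), ("RandomForest", rf),
                     ("XGBoost", xgb), ("LightGBM", lgb), ("MLkNN", mlknn)]:
    print(name, search.best_params_, search.best_score_)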


model_training_ext.py (+70, -0)

@@ -0,0 +1,70 @@
import warnings
warnings.filterwarnings("ignore")

from chemocommons import *  # project-local helpers (scoring functions used below)
import pandas as pd
import numpy as np

from skmultilearn.cluster import NetworkXLabelGraphClusterer  # label-space clusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder  # builds the label co-occurrence graph
from skmultilearn.ensemble import LabelSpacePartitioningClassifier  # one classifier per label cluster

from skmultilearn.adapt import MLkNN, MLTSVM
from skmultilearn.problem_transform import LabelPowerset  # only LabelPowerset is used below
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut, RepeatedKFold  # LeaveOneOut = jackknife
from sklearn.metrics import jaccard_similarity_score, f1_score
from sklearn.utils.multiclass import unique_labels
from lightgbm import LGBMClassifier
from joblib import dump  # used to persist the fitted search object

loocv = LeaveOneOut() # jackknife

label_names = ["ABCG2", "MDR1", "MRP1", "MRP2", "MRP3", "MRP4", "NTCP2", "S15A1",
"S22A1", "SO1A2", "SO1B1", "SO1B3", "SO2B1"]

Y = pd.read_csv("label_matrix.txt", sep="\t", names=label_names)
Y[Y==-1]=0

ft_FP = pd.read_csv("query_smiles_feature_similarity_four_average.csv", names=label_names)
ft_FP.rename(mapper= lambda x: x + "_FP", axis=1, inplace=True)
ft_OT = pd.read_csv("feature_similarity_chebi_ontology_DiShIn_2.csv", names=label_names)
ft_OT.rename(mapper= lambda x: x + "_OT", axis=1, inplace=True)

X = np.concatenate((ft_FP, ft_OT), axis=1)

scoring_funcs = {"hamming loss": hamming_func,
"aiming": aiming_func,
"coverage": coverage_func,
"accuracy": accuracy_func,
"absolute true": absolute_true_func,
} # Keep recorded





parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer': [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
                   scoring=scoring_funcs, verbose=3, refit="absolute true")
ext.fit(X, Y.values)
print(ext.best_score_)


mytuple = (
    ext,
)

to_save = dump(mytuple, filename="ext.joblib")

