|
- from autox.autox_competition.feature_engineer import FeatureCount
- from autox.autox_competition.feature_engineer.fe_stat import FeatureStat
- from autox.autox_competition.feature_engineer import FeatureRank
- from autox.autox_competition.feature_engineer import FeatureNlp
- from autox.autox_competition.feature_engineer.fe_time import FeatureTime
- from autox.autox_competition.feature_engineer import FeatureCumsum
- from autox.autox_competition.feature_engineer import FeatureShift
- from autox.autox_competition.feature_engineer import FeatureDiff
- from autox.autox_competition.feature_engineer.fe_one2M import FeatureOne2M
- from autox.autox_competition.feature_engineer import fe_ima2vec
- from autox.autox_competition.file_io import read_data_from_path
- from autox.autox_competition.models import CrossLgbRegression, CrossXgbRegression
- from autox.autox_competition.models.classifier import CrossLgbBiClassifier, CrossXgbBiClassifier
- from autox.autox_competition.process_data import feature_combination, train_test_divide, clip_label
- from autox.autox_competition.process_data import feature_filter, auto_encoder
- from autox.autox_competition.process_data.feature_type_recognition import Feature_type_recognition
- from autox.autox_competition.util import log
- from autox.autox_competition.feature_engineer import FeatureShiftTS, FeatureRollingStatTS, FeatureExpWeightedMean
- from autox.autox_competition.models.regressor_ts import LgbRegressionTs, XgbRegressionTs
-
class AutoX():
    """Main AutoX pipeline: load tables, engineer features, train models, predict.

    Construction reads every table found under ``path``, optionally infers
    per-column feature types, merges 1-1 related tables into the main tables
    and stacks train and test into ``self.dfs_['train_test']``.

    Typical use is to build the object and then call :meth:`get_submit`
    (tabular data) or :meth:`get_submit_ts` (time-series data).

    Attributes:
        Debug: Debug flag given at construction; forwarded to model ``fit``.
        info_: Dict holding the configuration and derived metadata
            (``'target'``, ``'id'``, ``'feature_type'``, ``'shape_of_train'``,
            ``'min_target'``, ``'max_target'``, ...).
        dfs_: Dict mapping table / feature-set names to DataFrames.
        sub: Initialised to ``None``; never assigned elsewhere in this class.
    """

    def __init__(self, target, train_name, test_name, path, time_series=False, ts_unit=None, time_col=None,
                 metric='rmse', feature_type=None, relations=None, id=None, task_type='regression',
                 Debug=False, image_info=None, target_map=None):
        """Load the data and prepare the combined train/test table.

        Args:
            target: Name of the label column in the training table.
            train_name: Key (file name) of the training table.
            test_name: Key (file name) of the test table.
            path: Location handed to ``read_data_from_path``.
            time_series: When true, ``ts_unit`` and ``time_col`` are required.
            ts_unit: Time unit forwarded to the time-series feature builders
                and models.
            time_col: Name of the timestamp column for time-series tasks.
            metric: Metric name passed to the regression models.
            feature_type: ``{table_name: {column: type}}``. When empty or
                omitted, types are inferred with ``Feature_type_recognition``.
            relations: List of relation dicts describing how tables join.
            id: List of identifier columns kept in the submission.
            task_type: Stored, but overwritten by auto-detection below
                (binary if the target has exactly two distinct values,
                otherwise regression).
            Debug: When true, the training table is down-sampled to at most
                5000 rows.
            image_info: Optional dict with keys ``'image_path'``,
                ``'image_col'`` and ``'filename_extension'``.
            target_map: Forwarded to ``read_data_from_path``.

        Raises:
            AssertionError: If ``image_info`` lacks a required key, or if
                ``time_series`` is set without ``ts_unit`` / ``time_col``.
        """
        # Fresh containers per instance: the previous mutable defaults were
        # shared between instances, and the default ``feature_type`` dict was
        # filled in place, so a second instance skipped type recognition.
        feature_type = {} if feature_type is None else feature_type
        relations = [] if relations is None else relations
        id_cols = [] if id is None else id
        image_info = {} if image_info is None else image_info
        target_map = {} if target_map is None else target_map

        # Validate the cheap things before the (potentially slow) data load.
        if image_info:
            assert 'image_path' in image_info.keys(), "image_info needs 'image_path'"
            assert 'image_col' in image_info.keys(), "image_info needs 'image_col'"
            assert 'filename_extension' in image_info.keys(), "image_info needs 'filename_extension'"
        if time_series:
            assert ts_unit is not None, "ts_unit is required when time_series=True"
            assert time_col is not None, "time_col is required when time_series=True"

        self.Debug = Debug
        self.info_ = {
            'id': id_cols,
            'task_type': task_type,
            'target': target,
            'feature_type': feature_type,
            'relations': relations,
            'train_name': train_name,
            'test_name': test_name,
            'metric': metric,
            'time_series': time_series,
            'ts_unit': ts_unit,
            'time_col': time_col,
            'image_info': image_info,
            'target_map': target_map,
        }
        self.dfs_ = read_data_from_path(path, train_name=train_name, target=target, target_map=target_map)

        if Debug:
            log("Debug mode, sample data")
            train_df = self.dfs_[train_name]
            # Cap at the table size: sample() raises when asked for more rows
            # than exist (sampling is without replacement).
            self.dfs_[train_name] = train_df.sample(min(5000, len(train_df)))

        # Target range of the (possibly sampled) training data; used later to
        # clip predictions.
        self.info_['max_target'] = self.dfs_[train_name][target].max()
        self.info_['min_target'] = self.dfs_[train_name][target].min()

        if not feature_type:
            # No types supplied: infer them for every loaded table.
            for table_name, table in self.dfs_.items():
                self.info_['feature_type'][table_name] = Feature_type_recognition().fit(table)

        self.join_simple_tables()
        self.concat_train_test()

        self.dfs_['FE_all'] = None
        self.sub = None

        # Detect the task type from the target column.
        # NOTE(review): this unconditionally replaces the ``task_type``
        # argument - confirm that is intended.
        if self.dfs_[train_name][target].nunique() == 2:
            self.info_['task_type'] = 'binary'
        else:
            self.info_['task_type'] = 'regression'

    # ------------------------------------------------------------------ #
    # Table preparation
    # ------------------------------------------------------------------ #

    def join_simple_tables(self):
        """Left-merge every 1-1 relation flagged as main-table related.

        Non-key columns of the merged table are renamed to
        ``<table>__<column>`` and their feature types are copied to the
        receiving table under the new names. If the relation lists a main
        (train/test) table on the right, the two sides are swapped so that
        the main table is always the one receiving the columns.
        """
        main_tables = (self.info_['train_name'], self.info_['test_name'])
        feature_types = self.info_['feature_type']

        for relation in self.info_['relations']:
            if relation['type'] != '1-1' or relation['related_to_main_table'] != 'true':
                continue

            left_name, right_name = relation['left_entity'], relation['right_entity']
            left_on, right_on = relation['left_on'], relation['right_on']
            if right_name in main_tables:
                left_name, right_name = right_name, left_name
                left_on, right_on = right_on, left_on

            # Normalise keys to lists so membership tests are exact matches;
            # ``col in "some_string"`` would be a substring test.
            left_keys = [left_on] if isinstance(left_on, str) else list(left_on)
            right_keys = [right_on] if isinstance(right_on, str) else list(right_on)

            merge_table = self.dfs_[right_name].copy()
            # Prefix everything except the join keys with the table name.
            merge_table.columns = [col if col in right_keys else right_name + '__' + col
                                   for col in merge_table.columns]

            self.dfs_[left_name] = self.dfs_[left_name].merge(
                merge_table, left_on=left_on, right_on=right_on, how='left')

            # Drop the right-hand key columns that merely duplicate the left
            # keys; keys shared by both sides must stay.
            redundant_keys = [col for col in right_keys if col not in left_keys]
            if redundant_keys:
                self.dfs_[left_name].drop(redundant_keys, axis=1, inplace=True)
            del merge_table

            for col, col_type in feature_types[right_name].items():
                if col not in right_keys:
                    feature_types[left_name][right_name + '__' + col] = col_type

    def concat_train_test(self):
        """Stack train on top of test into ``self.dfs_['train_test']``.

        Also records both row counts (``shape_of_train`` / ``shape_of_test``)
        and builds the feature-type map of the combined table, preferring the
        train table's type for columns present in both.
        """
        # Local import keeps this block self-contained; pandas is already a
        # hard dependency since every table here is a DataFrame.
        import pandas as pd

        train_name = self.info_['train_name']
        test_name = self.info_['test_name']
        train_df, test_df = self.dfs_[train_name], self.dfs_[test_name]

        self.info_['shape_of_train'] = len(train_df)
        self.info_['shape_of_test'] = len(test_df)

        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # equivalent. ignore_index gives the fresh 0..n-1 row index.
        combined = pd.concat([train_df, test_df], ignore_index=True)
        self.dfs_['train_test'] = combined

        train_types = self.info_['feature_type'][train_name]
        test_types = self.info_['feature_type'][test_name]
        self.info_['feature_type']['train_test'] = {
            col: train_types[col] if col in train_types else test_types[col]
            for col in combined.columns
        }

    def split_train_test(self):
        """Slice ``FE_all`` back into ``FE_train`` and ``FE_test`` by row count."""
        n_train = self.info_['shape_of_train']
        self.dfs_['FE_train'] = self.dfs_['FE_all'][:n_train]
        self.dfs_['FE_test'] = self.dfs_['FE_all'][n_train:]

    # ------------------------------------------------------------------ #
    # Private helpers shared by the tabular and time-series pipelines
    # ------------------------------------------------------------------ #

    @staticmethod
    def _count_ops(ops):
        """Return the total number of entries across all values of ``ops``."""
        return sum(len(ops[key_]) for key_ in ops)

    def _transform_if_small(self, extractor, df, name, n_ops, limit):
        """Run ``extractor.transform(df)`` only if ``n_ops`` is below ``limit``.

        Returns the transformed frame, or ``None`` (after logging) when the
        extractor planned too many operations.
        """
        if n_ops < limit:
            result = extractor.transform(df)
            log(f"feature{name} ops: {extractor.get_ops()}")
            return result
        log(f"ignore feature{name}")
        return None

    def _build_one2m_features(self, df):
        """Build one-to-many join features, or return ``None`` if none apply."""
        log("feature engineer: one2M")
        featureOne2M = FeatureOne2M()
        featureOne2M.fit(self.info_['relations'], self.info_['train_name'], self.info_['feature_type'])
        log(f"featureOne2M ops: {featureOne2M.get_ops()}")
        if len(featureOne2M.get_ops()) != 0:
            return featureOne2M.transform(df, self.dfs_)
        log("ignore featureOne2M")
        return None

    def _build_time_features(self, df, feature_type, silence_cols):
        """Build features with ``FeatureTime`` and return the transformed frame."""
        log("feature engineer: time")
        featureTime = FeatureTime()
        featureTime.fit(df, df_feature_type=feature_type, silence_cols=silence_cols)
        log(f"featureTime ops: {featureTime.get_ops()}")
        return featureTime.transform(df)

    def _split_and_filter(self, id_, target, **filter_kwargs):
        """Split ``FE_all`` into train/test and select the usable features.

        Sets ``self.train``, ``self.test`` and ``self.used_features``.
        ``filter_kwargs`` are forwarded to ``feature_filter``.
        """
        train_length = self.info_['shape_of_train']
        self.train, self.test = train_test_divide(self.dfs_['FE_all'], train_length)
        log(f"shape of FE_all: {self.dfs_['FE_all'].shape}, shape of train: {self.train.shape}, shape of test: {self.test.shape}")

        log("feature filter")
        self.used_features = feature_filter(self.train, self.test, id_, target, **filter_kwargs)
        log(f"used_features: {self.used_features}")

    def _select_top_features(self, base_df, id_, topk, return_df):
        """Pick the first ``topk`` engineered features from the LightGBM importance table.

        Columns already present in ``base_df`` (the raw/encoded table) are
        skipped so only newly engineered features are reported.
        """
        fimp = self.model_lgb.feature_importances_
        log("feature importance")
        log(fimp)

        topk_feas = [x for x in list(fimp['feature']) if x not in base_df.columns][:topk]
        if return_df:
            return topk_feas, self.train[id_ + topk_feas], self.test[id_ + topk_feas]
        return topk_feas

    def _build_submission(self, predict_lgb, predict_xgb, out_cols):
        """Average both predictions, clip to the training range, return the result frame."""
        predict = (predict_xgb + predict_lgb) / 2

        # Post-process: keep predictions inside the range seen in training.
        predict = clip_label(predict, self.info_['min_target'], self.info_['max_target'])

        # Copy first so the assignment below cannot write into (or warn
        # about) a slice of self.test.
        sub = self.test[out_cols].copy()
        sub[self.info_['target']] = predict
        sub.index = range(len(sub))
        return sub

    # ------------------------------------------------------------------ #
    # Tabular pipeline
    # ------------------------------------------------------------------ #

    def get_submit(self):
        """Run the full tabular pipeline and return the submission DataFrame.

        Trains LightGBM (via :meth:`get_top_features`) and XGBoost, averages
        their predictions, clips them to the training target range and returns
        the id columns plus the predicted target.
        """
        self.topk_feas = self.get_top_features(return_df=False)
        target = self.info_['target']

        # Train the XGBoost counterpart of the already-trained LightGBM model.
        log("start training xgboost model")
        if self.info_['task_type'] == 'regression':
            self.model_xgb = CrossXgbRegression(metric=self.info_['metric'])
            self.model_xgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug)
        elif self.info_['task_type'] == 'binary':
            self.model_xgb = CrossXgbBiClassifier()
            self.model_xgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug)

        # Predict with both models.
        predict_lgb = self.model_lgb.predict(self.test[self.used_features])
        predict_xgb = self.model_xgb.predict(self.test[self.used_features].astype('float64'))

        return self._build_submission(predict_lgb, predict_xgb, self.info_['id'])

    def get_top_features(self, topk=50, return_df=True):
        """Engineer tabular features, train LightGBM and rank the new features.

        Side effects: fills ``self.dfs_['FE_*']``, ``self.train``,
        ``self.test``, ``self.used_features`` and ``self.model_lgb``.

        Args:
            topk: Maximum number of engineered features to return.
            return_df: When true, also return train/test frames restricted to
                the id columns plus the selected features.

        Returns:
            ``topk_feas`` or ``(topk_feas, train_df, test_df)``.
        """
        id_ = self.info_['id']
        target = self.info_['target']

        # Feature engineering
        log("start feature engineer")
        df = self.dfs_['train_test']
        feature_type = self.info_['feature_type']['train_test']

        # One-to-many join features
        self.dfs_['FE_One2M'] = self._build_one2m_features(df)

        # Time features
        # NOTE(review): FE_time is computed here but is not part of the
        # combination list below (the time-series pipeline does include it)
        # - confirm whether that omission is intentional.
        self.dfs_['FE_time'] = self._build_time_features(df, feature_type, id_ + [target])

        # Cumsum features
        log("feature engineer: Cumsum")
        featureCumsum = FeatureCumsum()
        featureCumsum.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target],
                          silence_agg_cols=id_ + [target], select_all=False)
        self.dfs_['FE_cumsum'] = self._transform_if_small(
            featureCumsum, df, 'Cumsum', self._count_ops(featureCumsum.get_ops()), 30)

        # Shift features
        log("feature engineer: Shift")
        featureShift = FeatureShift()
        featureShift.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target],
                         silence_agg_cols=id_ + [target], select_all=False)
        self.dfs_['FE_shift'] = self._transform_if_small(
            featureShift, df, 'Shift', self._count_ops(featureShift.get_ops()), 30)

        # Diff features
        log("feature engineer: Diff")
        featureDiff = FeatureDiff()
        featureDiff.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target],
                        silence_agg_cols=id_ + [target], select_all=False)
        self.dfs_['FE_diff'] = self._transform_if_small(
            featureDiff, df, 'Diff', self._count_ops(featureDiff.get_ops()), 30)

        # Statistical features: ops are nested one level deeper
        # (group -> aggregation -> entries), so count the innermost entries.
        log("feature engineer: Stat")
        featureStat = FeatureStat()
        featureStat.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target],
                        silence_agg_cols=id_ + [target], select_all=False)
        stat_ops = featureStat.get_ops()
        fe_stat_cnt = sum(len(stat_ops[key_][cur_agg]) for key_ in stat_ops for cur_agg in stat_ops[key_])
        self.dfs_['FE_stat'] = self._transform_if_small(featureStat, df, 'Stat', fe_stat_cnt, 1500)

        # NLP features
        log("feature engineer: NLP")
        featureNlp = FeatureNlp()
        featureNlp.fit(df, target, df_feature_type=feature_type, silence_cols=id_, select_all=False)
        self.dfs_['FE_nlp'] = featureNlp.transform(df)
        log(f"featureNlp ops: {featureNlp.get_ops()}")

        # Count features: start with degree 2 and fall back to degree 1 when
        # that plans more than 500 ops.
        log("feature engineer: Count")
        featureCount = FeatureCount()
        featureCount.fit(df, degree=2, df_feature_type=feature_type, silence_cols=id_ + [target], select_all=False)
        if len(featureCount.get_ops()) > 500:
            featureCount = FeatureCount()
            featureCount.fit(df, degree=1, df_feature_type=feature_type, silence_cols=id_ + [target], select_all=False)
        self.dfs_['FE_count'] = featureCount.transform(df)
        log(f"featureCount ops: {featureCount.get_ops()}")

        # Rank features
        log("feature engineer: Rank")
        featureRank = FeatureRank()
        featureRank.fit(df, df_feature_type=feature_type, select_all=False)
        self.dfs_['FE_rank'] = self._transform_if_small(
            featureRank, df, 'Rank', self._count_ops(featureRank.get_ops()), 500)

        # Image features
        log("feature engineer: Image")
        image_info = self.info_['image_info']
        if image_info:
            self.dfs_['FE_image'] = fe_ima2vec(df, image_info['image_path'],
                                               image_info['image_col'],
                                               image_info['filename_extension'])
        else:
            self.dfs_['FE_image'] = None
            log("ignore image feature")

        # auto_encoder
        df = auto_encoder(df, feature_type, id_)

        # Combine all feature sets
        log("feature combination")
        df_list = [df, self.dfs_['FE_nlp'], self.dfs_['FE_count'], self.dfs_['FE_stat'], self.dfs_['FE_rank'],
                   self.dfs_['FE_shift'], self.dfs_['FE_diff'], self.dfs_['FE_cumsum'], self.dfs_['FE_One2M'],
                   self.dfs_['FE_image']]
        self.dfs_['FE_all'] = feature_combination(df_list)

        # Split back into train/test and filter features
        self._split_and_filter(id_, target)

        # Train LightGBM
        log("start training lightgbm model")
        if self.info_['task_type'] == 'regression':
            self.model_lgb = CrossLgbRegression(metric=self.info_['metric'])
            self.model_lgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug)
        elif self.info_['task_type'] == 'binary':
            self.model_lgb = CrossLgbBiClassifier()
            self.model_lgb.fit(self.train[self.used_features], self.train[target], tuning=False, Debug=self.Debug)

        # Feature importance -> top-k engineered features
        return self._select_top_features(df, id_, topk, return_df)

    # ------------------------------------------------------------------ #
    # Time-series pipeline
    # ------------------------------------------------------------------ #

    def get_submit_ts(self):
        """Run the time-series pipeline and return the submission DataFrame.

        Same flow as :meth:`get_submit` but with the time-series feature
        builders and models; the result also carries the time column.

        NOTE(review): models are only trained when the detected task type is
        ``'regression'``; for any other task type the prediction step below
        relies on models that were never created here.
        """
        self.topk_feas = self.get_top_features_ts(return_df=False)

        # Train XGBoost
        log("start training xgboost model")
        if self.info_['task_type'] == 'regression':
            self.model_xgb = XgbRegressionTs()
            self.model_xgb.fit(self.train, self.test, self.used_features, self.info_['target'],
                               self.info_['time_col'], self.info_['ts_unit'])

        # Predict with both models
        predict_lgb = self.model_lgb.predict(self.test, self.used_features)
        predict_xgb = self.model_xgb.predict(self.test, self.used_features)

        return self._build_submission(predict_lgb, predict_xgb,
                                      self.info_['id'] + [self.info_['time_col']])

    def get_top_features_ts(self, topk=50, return_df=True):
        """Engineer time-series features, train LightGBM and rank the new features.

        Side effects: fills ``self.dfs_['FE_*']``, ``self.train``,
        ``self.test``, ``self.used_features`` and (for regression tasks)
        ``self.model_lgb``.

        Args:
            topk: Maximum number of engineered features to return.
            return_df: When true, also return train/test frames restricted to
                the id columns plus the selected features.

        Returns:
            ``topk_feas`` or ``(topk_feas, train_df, test_df)``.
        """
        id_ = self.info_['id']
        target = self.info_['target']
        time_col = self.info_['time_col']
        ts_unit = self.info_['ts_unit']

        # Feature engineering
        log("start feature engineer")
        df = self.dfs_['train_test']
        feature_type = self.info_['feature_type']['train_test']

        # One-to-many join features
        self.dfs_['FE_One2M'] = self._build_one2m_features(df)

        # Time features
        self.dfs_['FE_time'] = self._build_time_features(df, feature_type, id_ + [target])

        # Lag features
        log("feature engineer: ShiftTS")
        featureShiftTS = FeatureShiftTS()
        featureShiftTS.fit(df, id_, target, feature_type, time_col, ts_unit)
        log(f"featureShiftTS ops: {featureShiftTS.get_ops()}")
        log(f"featureShiftTS lags: {featureShiftTS.get_lags()}")
        self.dfs_['FE_shift_ts'] = featureShiftTS.transform(df)

        # Rolling-window statistics
        log("feature engineer: RollingStatTS")
        featureRollingStatTS = FeatureRollingStatTS()
        featureRollingStatTS.fit(df, id_, target, feature_type, time_col, ts_unit)
        log(f"featureRollingStatTS ops: {featureRollingStatTS.get_ops()}")
        log(f"featureRollingStatTS windows: {featureRollingStatTS.get_windows()}")
        self.dfs_['FE_rollingStat_ts'] = featureRollingStatTS.transform(df)

        # Exponentially weighted mean features
        log("feature engineer: ExpWeightedMean")
        featureExpWeightedMean = FeatureExpWeightedMean()
        featureExpWeightedMean.fit(df, id_, target, feature_type, time_col, ts_unit)
        log(f"featureExpWeightedMean ops: {featureExpWeightedMean.get_ops()}")
        log(f"featureExpWeightedMean lags: {featureExpWeightedMean.get_lags()}")
        self.dfs_['FE_ewm'] = featureExpWeightedMean.transform(df)

        # label_encoder
        df = auto_encoder(df, feature_type, id_)

        # Combine all feature sets
        log("feature combination")
        df_list = [df, self.dfs_['FE_One2M'], self.dfs_['FE_time'], self.dfs_['FE_shift_ts'],
                   self.dfs_['FE_rollingStat_ts'], self.dfs_['FE_ewm']]
        self.dfs_['FE_all'] = feature_combination(df_list)

        # Split back into train/test and filter features
        self._split_and_filter(id_, target, time_series=True)

        # Train LightGBM
        log("start training lightgbm model")
        if self.info_['task_type'] == 'regression':
            self.model_lgb = LgbRegressionTs()
            self.model_lgb.fit(self.train, self.test, self.used_features, target, time_col, ts_unit)

        # Feature importance -> top-k engineered features
        return self._select_top_features(df, id_, topk, return_df)
|