|
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler
-
-
- def feature_engineering(add_text):
- df = pd.read_csv(
- 'data/yashau_merge_without_name.csv', # sku_attribute, sale
- delimiter=',',
- header=0,
- encoding='gbk'
- )
- df_uv = pd.read_csv(
- 'data/741_sku_uv_rate.csv', # uv
- delimiter=',',
- header=0,
- encoding='gbk'
- )
- df_name_embed = pd.read_csv(
- 'data/741_sku_name_embed.csv', # sku_name_embed
- delimiter=',',
- header=0,
- encoding='utf-8'
- )
-
- # dt改成mt,加入UV
- df_uv['mt'] = df_uv['dt'].apply(lambda x: '-'.join(x.split('-')[:-1]))
- df_uv = df_uv.drop('dt', axis=1)
- df = pd.merge(df, df_uv, on=['item_sku_id', 'mt'], how='left')
-
- # 加入价格特征
- # df['price_max'] = df[['valid_price_avg', 'rule_based_price_avg', 'sku_jd_price_avg']].max(axis=1)
-
- # 定义各列的类型
- binary_columns = ['lovers', 'travel', 'rotation', 'silence', 'appointment_func', 'toothpaste_splashing_proof',
- '3d_cleaning', 'free_brush_head', 'gum_care', 'sensitive', 'remind_replacement', 'bluetooth',
- 'orthodontics', 'inductive_charge', 'multi_speed', 'magnetic_levitation', 'dupont']
- drop_columns = ['shelves_tm', 'valid_price_avg', 'rule_based_price_avg', 'sku_jd_price_avg',
- 'total_order_cnt', 'total_user_cnt', 'cur_mon_valid_ord_qtty_avg', 'curm_visit_user_cnt_avg',
- 'entire_uv_rate', 'rule_based_uv_rate_avg', 'contain_keyword', 'sku_name'] # 'item_sku_id',
- minmax_columns = []
- onehot_columns = ['color', 'suitable_crowd', 'brand_std', 'sale_y','sale_m', 'charge_hours', 'voltage', 'sale_mode']
- label_columns = ['cur_mon_valid_uv_rate', 'cur_mon_valid_ord_qtty']
-
- # 去掉不需要的列,销量和UV异常的行
- df = df.drop(drop_columns, axis=1)
- index = df.loc[(df.cur_mon_valid_ord_qtty <= 0) | (df.cur_mon_valid_ord_qtty.isnull()) |
- (df.cur_mon_valid_uv_rate <= 0) | (df.cur_mon_valid_uv_rate.isnull()) |
- (df.cur_mon_valid_uv_rate > 1)].index.to_list()
- df = df.drop(index, axis=0).reset_index(drop=True)
-
- # Min-max
- min_max = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
- df[minmax_columns] = df[minmax_columns].apply(min_max)
-
- # One-hot
- transformed_df = pd.get_dummies(df, columns=onehot_columns)
-
- # 是否要加入sku的标题文本特征
- if add_text:
- transformed_df = pd.merge(transformed_df, df_name_embed, on=['item_sku_id'], how='left')
- transformed_df = transformed_df.drop(['item_sku_id'], axis=1)
-
- # 根据时间划分成train和test,作为main_mmoe的输入数据
- mt = transformed_df.mt.unique()
- test_mt = ['2021-08'] # [mt.max()]
- train_mt = list(set(mt) - set(test_mt))
-
- # 获取对应索引
- train_indices = transformed_df.loc[(transformed_df.mt.isin(train_mt))].index.to_list()
- test_indices = transformed_df.loc[transformed_df.mt.isin(test_mt)].index.to_list()
-
- # 根据索引划分数据
- transformed_train = transformed_df.iloc[train_indices]
- test_data = transformed_df.iloc[test_indices]
-
- return transformed_train, test_data
-
-
- def main(add_text):
- pd.set_option('display.max_columns', None)
- pd.set_option('float_format', lambda x: '%.3f' % x)
-
- # The data after feature engineering
- transformed_train, test_data = feature_engineering(add_text)
-
- # Save datasets
- if add_text:
- transformed_train.to_csv('data/input/train_with_title.csv', index=False, encoding='gbk')
- test_data.to_csv('data/input/test_with_title.csv', index=False, encoding='gbk')
- else:
- transformed_train.to_csv('data/input/train.csv', index=False, encoding='gbk')
- test_data.to_csv('data/input/test.csv', index=False, encoding='gbk')
- print('Data processing completed.')
-
|