|
- # -*- coding: utf-8 -*-
- """
- Created on Fri Oct 16 09:55:56 2020
-
- @author: xie-0
- """
- import numpy as np
- #import statsmodels.api as sm
- import pandas as pd
-
- import math
-
# Load the pre-computed brand data; the first Excel column becomes the index.
df=pd.read_excel('calculated_brand_data.xlsx',index_col=0)
# Column labels of the DataFrame (iterating a DataFrame yields its column names).
label=list(df)
- # #增加性能指标交叉项
- # #添加尺寸多次项
- # cross_size=[0 for k in range(1900)]
- # for i in range(3):
- # power_label = '尺寸^%s'% str(i+2)
- # label.append(power_label)
- # for k in range(1900):
- # cross_size[k]=df['尺寸'][k]**(i+2)
- # df.insert(len(label)-1, power_label,cross_size)
- #添加清晰度2次项
- # label.append('清晰度*清晰度')
- # for k in range(1900):
- # cross_size[k]=df['清晰度'][k]**2
- # df.insert(len(label)-1,'清晰度*清晰度',cross_size)
-
- # #添加func交叉项
- # cross_func = [[0 for k in range(1900)] for i in range(21)]
- # l=0
-
- # for i in range(7):
- # for j in range(i+1,7):
- # cross_func_label = label[24+i]+'*'+label[24+j]
- # label.append(cross_func_label)
- # for k in range(1900):
- # cross_func[l][k]=df[label[24+i]][k]*df[label[24+j]][k]
- # df.insert(len(label)-1,cross_func_label,cross_func[l])
- # l=l+1
-
- # #添加func与brand交叉项
- # cross_brand_func = [[0 for k in range(1900)] for i in range(18*7)]
- # l=0
- # for i in range(18):
- # for j in range(0,7):
- # cross_brand_func_label = label[6+i]+'*'+label[24+j]
- # label.append(cross_brand_func_label)
- # for k in range(1900):
- # cross_brand_func[i*7+j][k]=df[label[6+i]][k]*df[label[24+j]][k]
- # df.insert(len(label)-1,cross_brand_func_label,cross_brand_func[i*7+j])
-
- # import matplotlib.pyplot as plt
- # plt.scatter(df_ppr_gamma['性价比价格敏感度'],df['尺寸'])
-
- # #按照adj.rsquared筛选
- # import statsmodels.formula.api as smf
- # def forward_selected(data, response):
- # 使用Adjusted R-squared来评判新加的参数是否提高回归中的统计显著性
- # Linear model designed by forward selection.
- # Parameters:
- # -----------
- # data : pandas DataFrame with all possible predictors and response
- # response: string, name of response column in data
- # Returns:
- # --------
- # model: an "optimal" fitted statsmodels linear model
- # with an intercept
- # selected by forward selection
- # evaluated by adjusted R-squared
- # """
- # remaining = set(data.columns)
- # remaining.remove(response)
- # selected = []
- # current_score, best_new_score = 0.0, 0.0
- # while remaining and current_score == best_new_score:
- # scores_with_candidates = []
- # for candidate in remaining:
- # formula = "{} ~ {} + 1".format(response,
- # ' + '.join(selected + [candidate]))
- # score = smf.ols(formula, data).fit().rsquared_adj
- # scores_with_candidates.append((score, candidate))
- # scores_with_candidates.sort()
- # best_new_score, best_candidate = scores_with_candidates.pop()
- # if current_score < best_new_score:
- # remaining.remove(best_candidate)
- # selected.append(best_candidate)
- # current_score = best_new_score
- # print("R2 is {},continuing!".format(current_score)) #输出最小的R2值
- # else:
- # print("for selection over!")
- # break
- # formula = "{} ~ {} + 1".format(response,
- # ' + '.join(selected))
- # print("final formula is {}".format(formula))
- # model = smf.ols(formula, data).fit()
-
- # return model
-
- #按照bic/aic筛选
- from statsmodels.formula.api import ols
- #定义向前逐步回归函数
def forward_selected(data, target, criterion='bic'):
    """Build an OLS model by forward stepwise selection.

    Starting from an empty model, repeatedly add the candidate regressor
    whose inclusion gives the lowest information criterion, and stop as
    soon as no remaining candidate improves on the current score.

    Parameters
    ----------
    data : pandas.DataFrame
        All candidate predictors plus the response column.
    target : str
        Name of the response column in ``data``.
    criterion : str, optional
        Attribute of the fitted statsmodels results used for scoring,
        e.g. ``'bic'`` (default, matching the original behaviour) or
        ``'aic'``.  Lower is better.

    Returns
    -------
    statsmodels regression results object for the selected formula.
    """
    remaining = set(data.columns)
    remaining.remove(target)          # the response is never a predictor
    selected = []
    current_score = float('inf')      # BIC/AIC: smaller is better, so start at +inf
    while remaining:
        # Score the model obtained by adding each remaining candidate in turn.
        scores = []
        for candidate in remaining:
            formula = "{}~{}".format(target, "+".join(selected + [candidate]))
            fitted = ols(formula=formula, data=data).fit()
            scores.append((getattr(fitted, criterion), candidate))
        # min over (score, name) tuples: best score, ties broken by name,
        # identical to the original sort(reverse=True) + pop().
        best_new_score, best_candidate = min(scores)
        if best_new_score < current_score:
            remaining.remove(best_candidate)   # never reconsider this variable
            selected.append(best_candidate)
            current_score = best_new_score
            # NOTE: the original printed "aic" even though it scored with BIC;
            # report the criterion actually used.
            print("{} is {},continuing!".format(criterion, current_score))
        else:
            print("for selection over!")
            break
    # Refit the final model on the selected variables only.
    formula = "{}~{}".format(target, "+".join(selected))
    print("final formula is {}".format(formula))
    model = ols(formula=formula, data=data).fit()
    return model
-
- # #定义向后逐步回归函数
- # def backward_selected(data,target):
- # variate=set(data.columns) #将字段名转换成字典类型
- # variate.remove(target) #去掉因变量的字段名
- # selected=set(data.columns)
- # selected.remove(target)
- # current_score,best_new_score=float('inf'),float('inf') #目前的分数和最好分数初始值都为无穷大(因为AIC越小越好)
- # #循环筛选变量
- # while variate:
- # aic_with_variate=[]
- # for candidate in variate: #逐个遍历自变量
- # reduced_selected=set(selected)
- # reduced_selected.remove(candidate)
- # formula="{}~{}".format(target,"+".join(reduced_selected)) #将自变量名连接起来
- # aic=ols(formula=formula,data=data).fit().bic #利用ols训练模型得出aic值,此处可切换bic
- # aic_with_variate.append((aic,candidate)) #将第每一次的aic值放进空列表
- # aic_with_variate.sort(reverse=True) #降序排序aic值
- # best_new_score,best_candidate=aic_with_variate.pop() #最好的aic值等于删除列表的最后一个值,以及最好的自变量等于列表最后一个自变量
- # if current_score>best_new_score: #如果目前的aic值大于最好的aic值
- # selected.remove(best_candidate) #移除待删减的变量名,即第二次循环时,不考虑此自变量了
- # variate.remove(best_candidate) #将此自变量作为加进模型中的自变量
- # current_score=best_new_score #最新的分数等于最好的分数
- # print("aic is {},continuing!".format(current_score)) #输出最小的aic值
- # else:
- # print("for selection over!")
- # break
- # formula="{}~{}".format(target,"+".join(selected)) #最终的模型式子
- # print("final formula is {}".format(formula))
- # model=ols(formula=formula,data=data).fit()
- # return(model)
-
-
# Build one design DataFrame per response variable: each drops the other
# response columns so only the predictors plus a single target remain.
# NOTE(review): label[1:6] presumably spans the same response columns that the
# explicit drops below name individually -- confirm against the spreadsheet.
df_ppr_perf=df.drop(label[1:6],axis=1)
df_ppr_gamma=df.drop(['性价比性能','盈余式性能','盈余式价格敏感度','价格','市场份额'],axis=1)
df_sur_perf=df.drop(['性价比性能','性价比价格敏感度','盈余式价格敏感度','价格','市场份额'],axis=1)
df_sur_beta=df.drop(['性价比性能','性价比价格敏感度','盈余式性能','盈余式价格敏感度','市场份额'],axis=1)
df_price=df.drop(['性价比性能','性价比价格敏感度','盈余式性能','盈余式价格敏感度','市场份额'],axis=1)

# Forward stepwise selection, one run per response column.
ppr_step = forward_selected(df_ppr_perf, '性价比性能')

# (disabled) experimental transforms of the sensitivity responses before fitting
#for i in range(1900):
#    df_ppr_gamma['性价比价格敏感度'][i] = df_ppr_gamma['性价比价格敏感度'][i]/df['性价比性能'][i]
#    df_sur_beta['盈余式价格敏感度'][i] = df_sur_beta['盈余式价格敏感度'][i]/df['盈余式性能'][i]
#    df_ppr_gamma['性价比价格敏感度'][i] = math.exp(df_ppr_gamma['性价比价格敏感度'][i]-1)
#    df_sur_beta['盈余式价格敏感度'][i] = math.exp(df_sur_beta['盈余式价格敏感度'][i]-1)
#    # df_ppr_gamma['价格'][i] = math.log(df_ppr_gamma['价格'][i])
#    df_ppr_gamma['尺寸'][i] = df_ppr_gamma['尺寸'][i]**2
ppr_gamma_step = forward_selected(df_ppr_gamma, '性价比价格敏感度')
#ppr_gamma_step.summary()

sur_step = forward_selected(df_sur_perf, '盈余式性能')
sur_beta_step = forward_selected(df_sur_beta, '盈余式价格敏感度')

#price_step=forward_selected(df_price, '价格')  # explore whether price is linearly related to the performance metrics
-
- # #输出回归分析的结果
- # f = open("regression_results.txt", "w")
- # print(ppr_reg.summary(),file=f)
- # print(ppr_gamma_reg.summary(),file=f)
- # print(ppr_log_gamma_reg.summary(),file=f)
- # print(ppr_exp_gamma_reg.summary(),file=f)
- # print(sur_reg.summary(),file=f)
- # print(sur_beta_reg.summary(),file=f)
- # print(sur_log_beta_reg.summary(),file=f)
- # print(sur_exp_beta_reg.summary(),file=f)
- # f.close()
-
# Write the stepwise-regression summaries to a text file.
# Using a context manager so the handle is closed even if a summary() raises
# (the original open()/close() pair leaked the handle on error).
with open("stepwise_results.txt", "w") as f:
    print(ppr_step.summary(), file=f)
    print(ppr_gamma_step.summary(), file=f)
    # print(ppr_log_gamma_step.summary(),file=f)
    # print(ppr_exp_gamma_step.summary(),file=f)
    print(sur_step.summary(), file=f)
    print(sur_beta_step.summary(), file=f)
    # print(sur_log_beta_step.summary(),file=f)
    # print(sur_exp_beta_step.summary(),file=f)
    #print(price_step.summary(),file=f)
-
# Export the selected regression coefficients (a pandas Series: index =
# selected regressor names, values = fitted coefficients) to Excel.
output=ppr_step.params
#pd.save("ppr_independent_variables.xlsx",output.index)  # export the significant regressors
#pd.save("ppr_coeffecients.xlsx",output)  # export the coefficient of each regressor
output.to_excel(r'ppr_coeffecients.xlsx')
|