ColugoMum
/
demand_prediction
forked from JDOpenISCT/demand_prediction

 
			
							# -*- coding: utf-8 -*-
"""
Created on Fri Oct 16 09:55:56 2020

@author: xie-0
"""
import numpy as np       
#import statsmodels.api as sm
import pandas as pd

import math

df=pd.read_excel('calculated_brand_data.xlsx',index_col=0)
label=list(df)
# #增加性能指标交叉项
# #添加尺寸多次项
# cross_size=[0 for k in range(1900)]
# for i in range(3):
#     power_label = '尺寸^%s'% str(i+2)
#     label.append(power_label)
#     for k in range(1900):
#         cross_size[k]=df['尺寸'][k]**(i+2)
#     df.insert(len(label)-1, power_label,cross_size)
#添加清晰度2次项
# label.append('清晰度*清晰度')
# for k in range(1900):
#     cross_size[k]=df['清晰度'][k]**2
# df.insert(len(label)-1,'清晰度*清晰度',cross_size)

# #添加func交叉项
# cross_func = [[0 for k in range(1900)] for i in range(21)]
# l=0

# for i in range(7):
#     for j in range(i+1,7):
#         cross_func_label = label[24+i]+'*'+label[24+j]
#         label.append(cross_func_label)
#         for k in range(1900):
#             cross_func[l][k]=df[label[24+i]][k]*df[label[24+j]][k]
#         df.insert(len(label)-1,cross_func_label,cross_func[l])
#         l=l+1

# #添加func与brand交叉项
# cross_brand_func = [[0 for k in range(1900)] for i in range(18*7)]
# l=0
# for i in range(18):
#     for j in range(0,7):
#         cross_brand_func_label = label[6+i]+'*'+label[24+j]
#         label.append(cross_brand_func_label)
#         for k in range(1900):
#             cross_brand_func[i*7+j][k]=df[label[6+i]][k]*df[label[24+j]][k]
#         df.insert(len(label)-1,cross_brand_func_label,cross_brand_func[i*7+j])

# import matplotlib.pyplot as plt
# plt.scatter(df_ppr_gamma['性价比价格敏感度'],df['尺寸'])

# #按照adj.rsquared筛选
# import statsmodels.formula.api as smf
# def forward_selected(data, response):
#     使用Adjusted R-squared来评判新加的参数是否提高回归中的统计显著性
#     Linear model designed by forward selection.
#     Parameters:
#     -----------
#     data : pandas DataFrame with all possible predictors and response
#     response: string, name of response column in data
#     Returns:
#     --------
#     model: an "optimal" fitted statsmodels linear model
#             with an intercept
#             selected by forward selection
#             evaluated by adjusted R-squared
#     """
#     remaining = set(data.columns)
#     remaining.remove(response)
#     selected = []
#     current_score, best_new_score = 0.0, 0.0
#     while remaining and current_score == best_new_score:
#         scores_with_candidates = []
#         for candidate in remaining:
#             formula = "{} ~ {} + 1".format(response,
#                                             ' + '.join(selected + [candidate]))
#             score = smf.ols(formula, data).fit().rsquared_adj
#             scores_with_candidates.append((score, candidate))
#         scores_with_candidates.sort()
#         best_new_score, best_candidate = scores_with_candidates.pop()
#         if current_score < best_new_score:
#             remaining.remove(best_candidate)
#             selected.append(best_candidate)
#             current_score = best_new_score
#             print("R2 is {},continuing!".format(current_score))  #输出最小的R2值
#         else:
#             print("for selection over!")
#             break
#     formula = "{} ~ {} + 1".format(response,
#                                     ' + '.join(selected))
#     print("final formula is {}".format(formula))
#     model = smf.ols(formula, data).fit()
 
#     return model

#按照bic/aic筛选
from statsmodels.formula.api import ols
#定义向前逐步回归函数
def forward_selected(data,target):
    variate=set(data.columns)  #将字段名转换成字典类型
    variate.remove(target)  #去掉因变量的字段名
    selected=[]
    current_score,best_new_score=float('inf'),float('inf')  #目前的分数和最好分数初始值都为无穷大（因为AIC越小越好）
    #循环筛选变量
    while variate:
        aic_with_variate=[]
        for candidate in variate:  #逐个遍历自变量
            formula="{}~{}".format(target,"+".join(selected+[candidate]))  #将自变量名连接起来
            aic=ols(formula=formula,data=data).fit().bic  #利用ols训练模型得出aic值，此处可切换bic
            aic_with_variate.append((aic,candidate))  #将第每一次的aic值放进空列表
        aic_with_variate.sort(reverse=True)  #降序排序aic值
        best_new_score,best_candidate=aic_with_variate.pop()  #最好的aic值等于删除列表的最后一个值，以及最好的自变量等于列表最后一个自变量
        if current_score>best_new_score:  #如果目前的aic值大于最好的aic值
            variate.remove(best_candidate)  #移除加进来的变量名，即第二次循环时，不考虑此自变量了
            selected.append(best_candidate)  #将此自变量作为加进模型中的自变量
            current_score=best_new_score  #最新的分数等于最好的分数
            print("aic is {},continuing!".format(current_score))  #输出最小的aic值
        else:
            print("for selection over!")
            break
    formula="{}~{}".format(target,"+".join(selected))  #最终的模型式子
    print("final formula is {}".format(formula))
    model=ols(formula=formula,data=data).fit()
    return(model)

# #定义向后逐步回归函数
# def backward_selected(data,target):
#     variate=set(data.columns)  #将字段名转换成字典类型
#     variate.remove(target)  #去掉因变量的字段名
#     selected=set(data.columns)
#     selected.remove(target)
#     current_score,best_new_score=float('inf'),float('inf')  #目前的分数和最好分数初始值都为无穷大（因为AIC越小越好）
#     #循环筛选变量
#     while variate:
#         aic_with_variate=[]
#         for candidate in variate:  #逐个遍历自变量
#             reduced_selected=set(selected)
#             reduced_selected.remove(candidate)
#             formula="{}~{}".format(target,"+".join(reduced_selected))  #将自变量名连接起来
#             aic=ols(formula=formula,data=data).fit().bic  #利用ols训练模型得出aic值，此处可切换bic
#             aic_with_variate.append((aic,candidate))  #将第每一次的aic值放进空列表
#         aic_with_variate.sort(reverse=True)  #降序排序aic值
#         best_new_score,best_candidate=aic_with_variate.pop()  #最好的aic值等于删除列表的最后一个值，以及最好的自变量等于列表最后一个自变量
#         if current_score>best_new_score:  #如果目前的aic值大于最好的aic值
#             selected.remove(best_candidate)  #移除待删减的变量名，即第二次循环时，不考虑此自变量了
#             variate.remove(best_candidate)  #将此自变量作为加进模型中的自变量
#             current_score=best_new_score  #最新的分数等于最好的分数
#             print("aic is {},continuing!".format(current_score))  #输出最小的aic值
#         else:
#             print("for selection over!")
#             break
#     formula="{}~{}".format(target,"+".join(selected))  #最终的模型式子
#     print("final formula is {}".format(formula))
#     model=ols(formula=formula,data=data).fit()
#     return(model)


df_ppr_perf=df.drop(label[1:6],axis=1)
df_ppr_gamma=df.drop(['性价比性能','盈余式性能','盈余式价格敏感度','价格','市场份额'],axis=1)
df_sur_perf=df.drop(['性价比性能','性价比价格敏感度','盈余式价格敏感度','价格','市场份额'],axis=1)
df_sur_beta=df.drop(['性价比性能','性价比价格敏感度','盈余式性能','价格','市场份额'],axis=1)
df_price=df.drop(['性价比性能','性价比价格敏感度','盈余式性能','盈余式价格敏感度','市场份额'],axis=1)

ppr_step = forward_selected(df_ppr_perf, '性价比性能')

#for i in range(1900):
     # df_ppr_gamma['性价比价格敏感度'][i] = df_ppr_gamma['性价比价格敏感度'][i]/df['性价比性能'][i]
     # df_sur_beta['盈余式价格敏感度'][i] = df_sur_beta['盈余式价格敏感度'][i]/df['盈余式性能'][i]
#     df_ppr_gamma['性价比价格敏感度'][i] = math.exp(df_ppr_gamma['性价比价格敏感度'][i]-1)
#     df_sur_beta['盈余式价格敏感度'][i] = math.exp(df_sur_beta['盈余式价格敏感度'][i]-1)
# #    df_ppr_gamma['价格'][i] = math.log(df_ppr_gamma['价格'][i])
#     df_ppr_gamma['尺寸'][i] = df_ppr_gamma['尺寸'][i]**2
ppr_gamma_step = forward_selected(df_ppr_gamma, '性价比价格敏感度')
#ppr_gamma_step.summary()

sur_step = forward_selected(df_sur_perf, '盈余式性能')
sur_beta_step = forward_selected(df_sur_beta, '盈余式价格敏感度')

#price_step=forward_selected(df_price, '价格') #探索价格与各性能指标是否存在线性关系

# #输出回归分析的结果
# f = open("regression_results.txt", "w")
# print(ppr_reg.summary(),file=f)
# print(ppr_gamma_reg.summary(),file=f)
# print(ppr_log_gamma_reg.summary(),file=f)
# print(ppr_exp_gamma_reg.summary(),file=f)
# print(sur_reg.summary(),file=f)
# print(sur_beta_reg.summary(),file=f)
# print(sur_log_beta_reg.summary(),file=f)
# print(sur_exp_beta_reg.summary(),file=f)
# f.close()

f = open("stepwise_results.txt", "w")
print(ppr_step.summary(),file=f)
print(ppr_gamma_step.summary(),file=f)
# print(ppr_log_gamma_step.summary(),file=f)
# print(ppr_exp_gamma_step.summary(),file=f)
print(sur_step.summary(),file=f)
print(sur_beta_step.summary(),file=f)
# print(sur_log_beta_step.summary(),file=f)
# print(sur_exp_beta_step.summary(),file=f)

#print(price_step.summary(),file=f)
f.close()

#将回归系数输出为numpy数组
output=ppr_step.params
#pd.save("ppr_independent_variables.xlsx",output.index) #输出影响显著的自变量
#pd.save("ppr_coeffecients.xlsx",output) #输出各自变量对应的系数
output.to_excel(r'ppr_coeffecients.xlsx')