|
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
-
- import os
- from datetime import datetime, timedelta
- import pandas as pd
- import math
- import numpy as np
- import random
- from tqdm import trange # 显示任务进度条
-
- from io import BytesIO
- from urllib.request import urlopen # 获取 URL 资源,用于下载数据集
- from zipfile import ZipFile # 压缩和解压文件
-
- from math import sqrt
- from pandas import read_csv, DataFrame
- from scipy import stats # 统计函数包
-
- import matplotlib
- matplotlib.use('Agg') # 执行plt.show(),在PyCharm中不显示绘图,需要在导入包之前执行
- import matplotlib.pyplot as plt
-
def prep_data(data, covariates, data_start, train = True):
    """Cut each series into sliding windows and save model-ready arrays as .npy files.

    Produces three files (prefixed 'train_' or 'test_') under save_path:
      * data:  [total_windows, window_size, 1 + num_covariates + 1] float32 —
               per time step: lagged reading, covariates, series id.
      * v:     [total_windows, 2] float32 scale factors (column 1 is never
               written here and stays 0).
      * label: [total_windows, window_size] float32 targets.

    Args:
        data: numpy array [time, num_series] of raw readings.
        covariates: numpy array [time, num_covariates]; column 0 is
            overwritten here with each series' z-scored "age".
        data_start: per-series index of the first nonzero reading.
        train: if True, skip each series' leading all-zero span and scale
            the labels too; also selects the 'train_' vs 'test_' file prefix.

    NOTE(review): depends on module-level globals set in __main__
    (window_size, stride_size, num_series, num_covariates, total_time,
    save_path, save_name).
    """
    #print("train: ", train)
    time_len = data.shape[0]
    #print("time_len: ", time_len)
    input_size = window_size-stride_size # conditioning range: window minus stride (168 here)

    # Number of sliding windows per series: a window of length window_size,
    # advanced stride_size steps at a time, fits
    # (time_len - input_size) // stride_size times into a series of length time_len.
    windows_per_series = np.full((num_series), (time_len-input_size) // stride_size)
    #print("windows pre: ", windows_per_series.shape)

    if train: # training set: drop windows that would start inside a series' leading all-zero prefix
        windows_per_series -= (data_start+stride_size-1) // stride_size
    #print("data_start: ", data_start.shape)
    #print(data_start)
    #print("windows: ", windows_per_series.shape)
    #print(windows_per_series)

    total_windows = np.sum(windows_per_series) # total number of windows over all series

    x_input = np.zeros((total_windows, window_size, 1 + num_covariates + 1), dtype='float32')

    # Labels have one row per window but only a single feature (the reading itself).
    label = np.zeros((total_windows, window_size), dtype='float32')

    v_input = np.zeros((total_windows, 2), dtype='float32')
    #cov = 3: ground truth + age + day_of_week + hour_of_day + num_series
    #cov = 4: ground truth + age + day_of_week + hour_of_day + month_of_year + num_series

    count = 0
    if not train:
        covariates = covariates[-time_len:] # test set: keep only the trailing time_len rows of covariates
    for series in trange(num_series): # trange shows a progress bar; iterate per series
        cov_age = stats.zscore(np.arange(total_time-data_start[series])) # z-scored "age" of the series, counted from its first nonzero reading
        if train:
            # covariates axis 0 is time, axis 1 the covariate features;
            # column 0 holds the series age computed above.
            covariates[data_start[series]:time_len, 0] = cov_age[:time_len-data_start[series]]
        else:
            covariates[:, 0] = cov_age[-time_len:] # same as above, for the test split

        for i in range(windows_per_series[series]):
            if train:
                window_start = stride_size*i+data_start[series]
            else:
                window_start = stride_size*i
            window_end = window_start+window_size
            '''
            print("x: ", x_input[count, 1:, 0].shape)
            print("window start: ", window_start)
            print("window end: ", window_end)
            print("data: ", data.shape)
            print("d: ", data[window_start:window_end-1, series].shape)
            '''
            x_input[count, 1:, 0] = data[window_start:window_end-1, series] # feature 0: the reading lagged by one step
            x_input[count, :, 1:1+num_covariates] = covariates[window_start:window_end, :] # middle columns: the precomputed covariates
            x_input[count, :, -1] = series # last column: the series (customer) id
            label[count, :] = data[window_start:window_end, series] # target: the unlagged readings

            # Scale factor: mean of the nonzero lagged readings in the
            # conditioning range, plus 1 (left at 0 if everything is zero).
            nonzero_sum = (x_input[count, 1:input_size, 0]!=0).sum()
            if nonzero_sum == 0:
                v_input[count, 0] = 0
            else:
                v_input[count, 0] = np.true_divide(x_input[count, 1:input_size, 0].sum(),nonzero_sum)+1
                x_input[count, :, 0] = x_input[count, :, 0]/v_input[count, 0]
                if train:
                    label[count, :] = label[count, :]/v_input[count, 0]
            count += 1
    prefix = os.path.join(save_path, 'train_' if train else 'test_')
    np.save(prefix+'data_'+save_name, x_input)
    np.save(prefix+'v_'+save_name, v_input)
    np.save(prefix+'label_'+save_name, label)
-
def gen_covariates(times, num_covariates):
    """Build the [len(times), num_covariates] matrix of calendar covariates.

    Column 0 is left as zeros (it is filled later, per series, with the
    z-scored series "age" in prep_data). Columns 1..3 — as many of them as
    num_covariates allows — hold the z-scored weekday, hour, and month of
    each timestamp.

    Args:
        times: a datetime-like index (each element exposes .weekday(),
            .hour, .month), e.g. a pandas DatetimeIndex.
        num_covariates: number of covariate columns to produce.

    Returns:
        numpy array of shape (len(times), num_covariates).
    """
    covariates = np.zeros((times.shape[0], num_covariates))
    # Calendar features in the fixed column order the training code expects.
    extractors = [
        lambda t: t.weekday(),  # day of week
        lambda t: t.hour,       # hour of day
        lambda t: t.month,      # month of year
    ]
    # Fill only the columns that exist: the original hard-coded columns 1..3
    # and raised IndexError whenever num_covariates < 4.
    for col in range(1, min(num_covariates, len(extractors) + 1)):
        extract = extractors[col - 1]
        for i, input_time in enumerate(times):
            covariates[i, col] = extract(input_time)
        # Standardize each feature column (z-score over the whole span).
        covariates[:, col] = stats.zscore(covariates[:, col])
    return covariates
-
def visualize(data, week_start):
    """Save a line plot of one window of *data*, starting at index
    *week_start*, to visual.png (uses the module-level window_size)."""
    segment = data[week_start:week_start + window_size]
    fig = plt.figure()
    plt.plot(np.arange(window_size), segment, color='b')
    fig.savefig("visual.png")
    plt.close()
-
if __name__ == '__main__':

    global save_path  # NOTE(review): 'global' at module level is a no-op; safe to remove
    name = 'LD2011_2014.txt'
    save_name = 'elect'
    window_size = 192
    stride_size = 24
    num_covariates = 4 # number of covariate features
    train_start = '2011-01-01 00:00:00'
    train_end = '2014-08-31 23:00:00'
    test_start = '2014-08-25 00:00:00' #need additional 7 days as given info
    test_end = '2014-09-07 23:00:00'
    pred_days = 7
    given_days = 7

    save_path = os.path.join('data', save_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    csv_path = os.path.join(save_path, name)
    if not os.path.exists(csv_path):
        zipurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip'
        with urlopen(zipurl) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(save_path) # download the dataset archive and unzip it into save_path

    # index_col=0: use the first column as the index;
    # parse_dates=True: parse that index as dates;
    # decimal=',': treat ',' as the decimal separator when reading numbers.
    data_frame = pd.read_csv(csv_path, sep=";", index_col=0, parse_dates=True, decimal=',') # load the raw data

    # resample re-buckets the regular time series at a new frequency:
    # '1H': aggregate into 1-hour bins (the sum() below does the aggregation);
    # label='left': label each bin with its left-edge timestamp;
    # closed='right': each bin is closed on its right edge.
    # NOTE(review): newer pandas deprecates '1H' in favour of '1h' — confirm
    # against the pinned pandas version before changing.
    data_frame = data_frame.resample('1H', label='left', closed='right').sum()[train_start:test_end]

    data_frame.fillna(0, inplace=True) # fill missing values with 0, in place (no new frame returned)

    covariates = gen_covariates(data_frame[train_start:test_end].index, num_covariates) # build the covariate matrix

    train_data = data_frame[train_start:train_end].values # extract as a numpy array
    test_data = data_frame[test_start:test_end].values # 14 days rather than 7: the extra 7 days serve as the conditioning (given) range

    data_start = (train_data!=0).argmax(axis=0) # [num_series] index of the first nonzero value in each series
    total_time = data_frame.shape[0] # number of hourly timestamps after resampling (train + test, 32304)
    num_series = data_frame.shape[1] # number of customers (370)

    prep_data(train_data, covariates, data_start)
    prep_data(test_data, covariates, data_start, train=False)
|