|
- import os
- import csv
- import numpy as np
-
- from tqdm import tqdm
-
- from sklearn.utils import shuffle
- from sklearn.model_selection import train_test_split
-
- seed = 0 # 3535999445
-
- def _rocstories(path):
- with open(path) as f:
- f = csv.reader(f)
- st = []
- ct1 = []
- ct2 = []
- y = []
- for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):
- if i > 0:
- s = ' '.join(line[1:5])
- c1 = line[5]
- c2 = line[6]
- st.append(s)
- ct1.append(c1)
- ct2.append(c2)
- y.append(int(line[-1])-1)
- return st, ct1, ct2, y
-
- def rocstories(data_dir, n_train=1497, n_valid=374):
- storys, comps1, comps2, ys = _rocstories(os.path.join(data_dir, 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv'))
- teX1, teX2, teX3, _ = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
- tr_storys, va_storys, tr_comps1, va_comps1, tr_comps2, va_comps2, tr_ys, va_ys = train_test_split(storys, comps1, comps2, ys, test_size=n_valid, random_state=seed)
- trX1, trX2, trX3 = [], [], []
- trY = []
- for s, c1, c2, y in zip(tr_storys, tr_comps1, tr_comps2, tr_ys):
- trX1.append(s)
- trX2.append(c1)
- trX3.append(c2)
- trY.append(y)
-
- vaX1, vaX2, vaX3 = [], [], []
- vaY = []
- for s, c1, c2, y in zip(va_storys, va_comps1, va_comps2, va_ys):
- vaX1.append(s)
- vaX2.append(c1)
- vaX3.append(c2)
- vaY.append(y)
- trY = np.asarray(trY, dtype=np.int32)
- vaY = np.asarray(vaY, dtype=np.int32)
- return (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)
|