Source code for Tars.load_data

import gzip
import cPickle
import numpy as np
import scipy as sp
from scipy.io import loadmat
import glob
import sys
import pickle
from PIL import Image
from copy import copy

from sklearn.cross_validation import train_test_split

# Raise the interpreter's recursion limit to 5000 for the whole process.
# NOTE(review): presumably needed for pickling deeply nested objects
# elsewhere in the package -- nothing in this module shows why; confirm.
sys.setrecursionlimit(5000)


def one_of_k(a, k=10):
    """Return the one-hot (1-of-k) encoding of integer class labels.

    Parameters
    ----------
    a : array-like
        Class labels. The input is flattened and cast to integers, so an
        empty sequence yields an empty ``(0, k)`` matrix.
    k : int, optional
        Number of classes, i.e. the number of output columns. Defaults to
        10, the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(number of labels, k)`` holding a
        single 1 per row at the column given by the label.

    Raises
    ------
    IndexError
        If a label is ``>= k`` (negative labels index from the end, as
        usual for numpy).
    """
    # Cast explicitly: np.array([]) is float64 and cannot be used as an index.
    labels = np.asarray(a).astype('int64').reshape(-1)
    encoded = np.zeros((labels.size, k), dtype='float32')
    encoded[np.arange(labels.size), labels] = 1
    return encoded
def mnist(datapath, toFloat=False):
    """Build the ``load`` and ``plot`` helpers for the pickled MNIST dataset.

    Parameters
    ----------
    datapath : str
        Path prefix that is concatenated directly with ``'mnist.pkl.gz'``
        (so it must end with a path separator).
    toFloat : bool, optional
        If True, ``load`` standardizes the images with the per-pixel mean
        and standard deviation of the training split, and ``plot`` undoes
        that standardization.

    Returns
    -------
    (load, plot) : tuple of callables
    """
    p = paramaters()

    def load(test=False, one_hot=True):
        """Read the dataset from disk.

        Parameters
        ----------
        test : bool
            If True return all three splits, otherwise only the training
            split.
        one_hot : bool
            If True the labels are converted with ``one_of_k``.

        Returns
        -------
        ``(train_x, train_y)`` or
        ``(train_x, train_y, valid_x, valid_y, test_x, test_y)``.
        """
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # The context manager closes the file even if unpickling fails.
        with gzip.open(datapath + 'mnist.pkl.gz', 'rb') as f:
            (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = \
                cPickle.load(f)

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(
                np.mean((train_x - p.mean[np.newaxis]) ** 2, axis=0))
            # Constant pixels have zero deviation; avoid dividing by zero.
            p.std[p.std == 0] = 1

            def standardize(x):
                scaled = (x - p.mean[np.newaxis]) / p.std[np.newaxis]
                return scaled.astype(np.float32)

            train_x = standardize(train_x)
            # The validation split was previously returned unstandardized
            # while train and test were standardized.
            valid_x = standardize(valid_x)
            test_x = standardize(test_x)

        if one_hot:
            train_y = one_of_k(train_y)
            valid_y = one_of_k(valid_y)
            test_y = one_of_k(test_y)

        if test:
            return train_x, train_y, valid_x, valid_y, test_x, test_y
        return train_x, train_y

    def plot(X):
        """Turn a batch of flat images into inverted 28x28 images.

        Returns ``(images, "gray")`` where ``images`` has shape
        ``(len(X), 28, 28)`` and values in ``[0, 1]``. ``X`` itself is not
        modified. When ``toFloat`` is set, ``load`` must have been called
        first so that the statistics are available.
        """
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            # NOTE(review): this assumes the stored pixels are on a 0-255
            # scale -- confirm against the contents of the pickle.
            X = X / 255.
        # np.clip returns a new array, so the caller's data is left intact.
        X = np.clip(X, 0, 1)
        X = X.reshape((X.shape[0], 28, 28))
        return 1 - X, "gray"

    return load, plot
class paramaters():
    """Mutable holder for the normalization statistics of a dataset.

    Attributes
    ----------
    mean, std
        Both start at 0 unless given; the dataset loaders in this module
        overwrite them with per-feature arrays.
    """

    def __init__(self, mean=0, std=0):
        self.mean, self.std = mean, std
def svhn(datapath, toFloat=True, binarize_y=True, gray=False, extra=True):
    """Build the ``load`` and ``plot`` helpers for the SVHN ``.mat`` files.

    Parameters
    ----------
    datapath : str
        Path prefix concatenated directly with ``'train_32x32.mat'``,
        ``'test_32x32.mat'`` and ``'extra_32x32.mat'``.
    toFloat : bool, optional
        If True, standardize the images with the per-feature mean and
        standard deviation of the training data.
    binarize_y : bool, optional
        If True the labels are converted with ``one_of_k``.
    gray : bool, optional
        If True (the literal ``True``), images are reduced to one channel,
        scaled by 1/255 and flattened to 1024 values.
    extra : bool, optional
        If True the "extra" set is appended to the training set.

    Returns
    -------
    (load, plot) : tuple of callables
    """
    p = paramaters()

    def to_gray(x):
        # Weighted channel sum (the weights are the Rec. 709 luma
        # coefficients), scaled to [0, 1] and flattened per image.
        luma = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
        return (luma / 255.).reshape((len(x), 32 * 32)).astype(np.float32)

    def load(test=False, flatten=False):
        """Read the dataset from disk.

        Parameters
        ----------
        test : bool
            If True also return the test split. It is returned twice so
            the result has the same six-element layout as ``mnist``; the
            "validation" pair is the test pair.
        flatten : bool
            If True, colour images are flattened to one row per image.
            Gray images are always flat, so the flag has no further effect
            when ``gray`` is set.

        Returns
        -------
        ``(train_x, train_y)`` or
        ``(train_x, train_y, test_x, test_y, test_x, test_y)``.
        """
        trains = loadmat(datapath + 'train_32x32.mat')
        tests = loadmat(datapath + 'test_32x32.mat')

        # Move the sample axis first and the channel axis second, the
        # layout that plot() reshapes to: (N, 3, 32, 32).
        train_x = trains['X'].swapaxes(0, 1).T
        train_y = trains['y'].reshape((-1))
        test_x = tests['X'].swapaxes(0, 1).T
        test_y = tests['y'].reshape((-1))

        if extra:
            extras = loadmat(datapath + 'extra_32x32.mat')
            extra_x = extras['X'].swapaxes(0, 1).T
            extra_y = extras['y'].reshape((-1))
            train_x = np.concatenate((train_x, extra_x), axis=0)
            train_y = np.concatenate((train_y, extra_y), axis=0)

        # The gray conversion needs the channel axis, so it must run on the
        # unflattened data; flattening first made gray + flatten crash.
        if gray is True:
            train_x = to_gray(train_x)
            test_x = to_gray(test_x)
        elif flatten is True:
            train_x = train_x.reshape((train_x.shape[0], -1))
            test_x = test_x.reshape((test_x.shape[0], -1))

        # Label 10 is remapped to class 0.
        train_y[train_y == 10] = 0
        test_y[test_y == 10] = 0

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(
                np.mean((train_x - p.mean[np.newaxis]) ** 2, axis=0))
            # Same guard as mnist(): constant features would divide by zero.
            p.std[p.std == 0] = 1
            train_x = ((train_x - p.mean[np.newaxis]) /
                       p.std[np.newaxis]).astype(np.float32)
            test_x = ((test_x - p.mean[np.newaxis]) /
                      p.std[np.newaxis]).astype(np.float32)

        if binarize_y:
            train_y = one_of_k(train_y)
            test_y = one_of_k(test_y)

        if test is True:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def plot(X):
        """Reshape a batch into displayable images.

        Returns ``(images, "gray")`` with shape ``(N, 32, 32)`` in gray
        mode (values are returned as given), otherwise ``(images, None)``
        with shape ``(N, 32, 32, 3)`` and values clipped to ``[0, 1]``.
        When ``toFloat`` is set, ``load`` must have been called first.
        """
        if gray is True:
            X = X.reshape((X.shape[0], 32, 32))
            return X, "gray"

        X = X.reshape((X.shape[0], 3, 32, 32))
        if toFloat:
            # The statistics are flat when load(flatten=True) was used;
            # reshape them so they broadcast against the image batch.
            std = np.reshape(p.std, X.shape[1:])
            mean = np.reshape(p.mean, X.shape[1:])
            X = (X * std[np.newaxis]) + mean[np.newaxis]
        X = X / 256.
        X[X < 0] = 0
        X[X > 1] = 1
        X = np.rollaxis(X, 1, 4)
        return X, None

    return load, plot
def lfw(datapath, toFloat=True, gray=False, rate=0.1, rseed=0):
    """Build ``load``, ``plot`` and ``preprocess`` helpers for LFW.

    Parameters
    ----------
    datapath : str
        Path prefix concatenated directly with ``'lfw_images.npy'`` and
        ``'lfw_attributes.npy'``.
    toFloat : bool, optional
        If True, standardize images with the per-feature mean and standard
        deviation of the training data; otherwise divide them by 255.
    gray : bool, optional
        If True, reduce images to a single flattened 64x64 channel.
    rate : float, optional
        ``test_size`` passed to ``train_test_split``.
    rseed : int, optional
        ``random_state`` passed to ``train_test_split``.

    Returns
    -------
    (load, plot, preprocess) : tuple of callables
    """
    p = paramaters()

    def to_gray(x):
        # Weighted channel sum (Rec. 709 luma coefficients), flattened.
        luma = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
        return luma.reshape((len(luma), 64 * 64)).astype(np.float32)

    def standardize(x):
        return ((x - p.mean[np.newaxis]) / p.std[np.newaxis]).astype(
            np.float32)

    def load(test=False):
        """Read the dataset from disk.

        If ``test`` is True the data is split with ``train_test_split``
        and the test pair is returned twice (six values, the second pair
        standing in for a validation split); otherwise the whole dataset
        is returned as ``(train_x, train_y)``. The shape of ``train_x`` is
        printed as a side effect.
        """
        x = np.load(datapath + 'lfw_images.npy')
        y = np.load(datapath + 'lfw_attributes.npy').astype(np.float32)

        # Move the last (channel) axis in front of the spatial axes.
        x = np.rollaxis(x, 3, 1)
        if gray:
            x = to_gray(x)

        if test:
            train_x, test_x, train_y, test_y = train_test_split(
                x, y, test_size=rate, random_state=rseed)
        else:
            train_x = x
            train_y = y

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(
                np.mean((train_x - p.mean[np.newaxis]) ** 2, axis=0))
            # Same guard as mnist(): constant features would divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            if test:
                test_x = standardize(test_x)
        else:
            train_x = train_x / 255.
            if test:
                test_x = test_x / 255.

        print(train_x.shape)
        if test:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def preprocess(X):
        """Apply to new channels-last images the same steps as ``load``.

        Previously this ignored ``toFloat`` and ``gray`` and therefore
        failed unless the loader was built with ``toFloat=True`` and
        ``gray=False``. When ``toFloat`` is set, ``load`` must have been
        called first so that the statistics exist.
        """
        X = np.rollaxis(X, 3, 1)
        if gray:
            X = to_gray(X)
        if toFloat:
            return standardize(X)
        return (X / 255.).astype(np.float32)

    def plot(X):
        """Undo the normalization and reshape a batch for display.

        Returns ``(images, "gray")`` with shape ``(N, 64, 64)`` in gray
        mode, otherwise ``(images, None)`` with shape ``(N, 64, 64, 3)``.
        Values are clipped to ``[0, 1]``; when ``toFloat`` is False the
        clipping is done in place on the data passed in.
        """
        if gray is True:
            if toFloat:
                X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
                X = X / 255.
            X = X.reshape((X.shape[0], 64, 64))
            X[X < 0] = 0
            X[X > 1] = 1
            return X, "gray"

        X = X.reshape((X.shape[0], 3, 64, 64))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            X = X / 255.
        X = np.rollaxis(X, 1, 4)
        X[X < 0] = 0
        X[X > 1] = 1
        return X, None

    return load, plot, preprocess
def celeba(datapath, toFloat=True, gray=False, rate=0.001, rseed=0):
    """Build ``load``, ``plot`` and ``preprocess`` helpers for CelebA.

    Parameters
    ----------
    datapath : str
        Path prefix concatenated directly with ``'celeba_images.npy'`` and
        ``'celeba_attributes.npy'``.
    toFloat : bool, optional
        If True, standardize images with the per-feature mean and standard
        deviation of the training data; otherwise divide them by 255.
    gray : bool, optional
        If True, reduce images to a single flattened 64x64 channel.
    rate : float, optional
        ``test_size`` passed to ``train_test_split``.
    rseed : int, optional
        ``random_state`` passed to ``train_test_split``.

    Returns
    -------
    (load, plot, preprocess) : tuple of callables
    """
    p = paramaters()

    def to_gray(x):
        # Weighted channel sum (Rec. 709 luma coefficients), flattened.
        luma = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
        return luma.reshape((len(luma), 64 * 64)).astype(np.float32)

    def standardize(x):
        return ((x - p.mean[np.newaxis]) / p.std[np.newaxis]).astype(
            np.float32)

    def load(test=False):
        """Read the dataset from disk.

        If ``test`` is True the data is split with ``train_test_split``
        and the test pair is returned twice (six values, the second pair
        standing in for a validation split); otherwise the whole dataset
        is returned as ``(train_x, train_y)``. The shape of ``train_x`` is
        printed as a side effect.
        """
        x = np.load(datapath + 'celeba_images.npy')
        y = np.load(datapath + 'celeba_attributes.npy').astype(np.float32)

        # Move the last (channel) axis in front of the spatial axes.
        x = np.rollaxis(x, 3, 1)
        if gray:
            x = to_gray(x)

        if test:
            train_x, test_x, train_y, test_y = train_test_split(
                x, y, test_size=rate, random_state=rseed)
        else:
            train_x = x
            train_y = y

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(
                np.mean((train_x - p.mean[np.newaxis]) ** 2, axis=0))
            # Same guard as mnist(): constant features would divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            if test:
                test_x = standardize(test_x)
        else:
            train_x = train_x / 255.
            if test:
                test_x = test_x / 255.

        print(train_x.shape)
        if test:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def preprocess(X):
        """Apply to new channels-last images the same steps as ``load``.

        Previously this ignored ``toFloat`` and ``gray`` and therefore
        failed unless the loader was built with ``toFloat=True`` and
        ``gray=False``. When ``toFloat`` is set, ``load`` must have been
        called first so that the statistics exist.
        """
        X = np.rollaxis(X, 3, 1)
        if gray:
            X = to_gray(X)
        if toFloat:
            return standardize(X)
        return (X / 255.).astype(np.float32)

    def plot(X):
        """Undo the normalization and reshape a batch for display.

        Returns ``(images, "gray")`` with shape ``(N, 64, 64)`` in gray
        mode, otherwise ``(images, None)`` with shape ``(N, 64, 64, 3)``.
        Values are clipped to ``[0, 1]``; when ``toFloat`` is False the
        clipping is done in place on the data passed in.
        """
        if gray is True:
            if toFloat:
                X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
                X = X / 255.
            X = X.reshape((X.shape[0], 64, 64))
            X[X < 0] = 0
            X[X > 1] = 1
            return X, "gray"

        X = X.reshape((X.shape[0], 3, 64, 64))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            X = X / 255.
        X = np.rollaxis(X, 1, 4)
        X[X < 0] = 0
        X[X > 1] = 1
        return X, None

    return load, plot, preprocess
def flickr(datapath, toFloat=True):
    """Build ``load``, ``shuffle``, ``plot`` and ``preprocess`` for Flickr.

    Parameters
    ----------
    datapath : str
        Path prefix concatenated directly with the ``'flickr/...'`` file
        names used below.
    toFloat : bool, optional
        If True, the image modality is standardized with the per-feature
        mean and standard deviation of the training split.

    Returns
    -------
    (load, shuffle, plot, preprocess) : tuple of callables
    """
    p = paramaters()

    def load_pickle(path):
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # Binary mode plus a context manager: the file used to be opened in
        # text mode and was never closed.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def load(version=1, label=True, raw_image=False):
        """Read the labelled splits or the unlabelled data.

        Parameters
        ----------
        version : int
            Number substituted into the split index file names.
        label : bool
            If False, return ``(unlabel_trn, unlabel_tst)`` unpickled from
            the unlabelled files and ignore the other arguments.
        raw_image : bool
            If True use ``images.npy`` (channel axis moved to position 1),
            otherwise the concatenated ``combined-*`` feature files.

        Returns
        -------
        ``(trn, val, tst)`` -- three lists laid out as
        ``[labels, images_or_features, text]``, with a fourth entry (the
        unnormalized feature/text concatenation) when ``raw_image`` is
        False.
        """
        if not label:
            unlabel_trn = load_pickle(
                datapath + "flickr/image/unlabelled/unlabelled_trn.pkl")
            unlabel_tst = load_pickle(
                datapath + "flickr/image/unlabelled/unlabelled_tst.pkl")
            return unlabel_trn, unlabel_tst

        train_indices = np.load(
            datapath + "flickr/splits/train_indices_%d.npy" % version)
        valid_indices = np.load(
            datapath + "flickr/splits/valid_indices_%d.npy" % version)
        test_indices = np.load(
            datapath + "flickr/splits/test_indices_%d.npy" % version)
        splits = (train_indices, valid_indices, test_indices)

        if raw_image:
            x_labelled = np.load(
                datapath + "flickr/image/labelled/images.npy")
            x_labelled = np.rollaxis(x_labelled, 3, 1)
        else:
            pattern = datapath + "flickr/image/labelled/combined-*"
            x_labelled_path = sorted(glob.glob(pattern))
            if not x_labelled_path:
                raise IOError("no feature files match %s" % pattern)
            # One concatenation instead of growing the array shard by
            # shard, which copied the accumulated data on every step.
            x_labelled = np.concatenate(
                [np.load(path) for path in x_labelled_path], axis=0)

        y_ = np.load(datapath + "flickr/labels.npy").astype(np.float32)
        w_labelled = LoadSparse(
            datapath + 'flickr/text/text_all_2000_labelled.npz')
        w_labelled = np.asarray(w_labelled.todense()).astype(np.float32)

        modalities = [y_, x_labelled, w_labelled]
        if raw_image is False:
            modalities.append(np.c_[x_labelled, w_labelled])

        trn, val, tst = [
            [data[indices] for data in modalities] for indices in splits]

        if toFloat:
            mean = np.mean(trn[1], axis=0)
            std = np.sqrt(
                np.mean((trn[1] - mean[np.newaxis]) ** 2, axis=0))
            # Same guard as mnist(): constant features would divide by zero.
            std[std == 0] = 1
            if raw_image:
                # Only raw-image statistics are kept for plot/preprocess.
                p.mean = mean
                p.std = std
            for part in (trn, val, tst):
                part[1] = ((part[1] - mean[np.newaxis]) /
                           std[np.newaxis]).astype(np.float32)

        return trn, val, tst

    def preprocess(X):
        """Move the channel axis to position 1 and normalize like ``load``.

        When ``toFloat`` is set this needs the statistics stored by
        ``load(raw_image=True)``; when it is not set the images are only
        reordered and cast (previously that case raised a TypeError).
        """
        X = np.rollaxis(X, 3, 1)
        if toFloat:
            X = (X - p.mean[np.newaxis]) / p.std[np.newaxis]
        return X.astype(np.float32)

    def plot(X):
        """Return ``(images, None)`` with shape ``(N, 128, 128, 3)``.

        The standardization is undone when ``toFloat`` is set (requires a
        prior ``load(raw_image=True)``), then values are divided by 255
        and clipped to ``[0, 1]``.
        """
        X = X.reshape((-1, 3, 128, 128))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
        X = X / 255.
        X = np.rollaxis(X, 1, 4)
        return np.clip(X, 0, 1), None

    def LoadSparse(inputfile, verbose=False):
        """Loads a sparse matrix stored as npz file."""
        # Import the subpackage explicitly rather than relying on
        # ``scipy.sparse`` having been loaded as a side effect elsewhere.
        from scipy import sparse

        npzfile = np.load(inputfile)
        try:
            mat = sparse.csr_matrix(
                (npzfile['data'], npzfile['indices'], npzfile['indptr']),
                shape=tuple(npzfile['shape']))
        finally:
            npzfile.close()
        if verbose:
            print('Loaded sparse matrix from %s of shape %s' % (
                inputfile, str(mat.shape)))
        return mat

    def shuffle(trn, each_permutation=False):
        """Return a new list with every entry reordered identically.

        One random permutation of ``trn[0].shape[0]`` indices is applied
        to each non-None entry; ``trn`` itself is left untouched.
        ``each_permutation`` is accepted but not used.
        """
        order = np.random.permutation(trn[0].shape[0])
        return [None if part is None else part[order] for part in trn]

    return load, shuffle, plot, preprocess
def facade(datapath):
    """Build the ``load`` and ``plot`` helpers for the facade dataset.

    Parameters
    ----------
    datapath : str
        Path prefix concatenated directly with
        ``'facade/base/cmp_bNNNN.jpg'`` / ``'.png'``.

    Returns
    -------
    (load, plot) : tuple of callables
    """

    def load(label=True, test=True, crop=True):
        """Read the 378 image/label pairs and split them 300 / 78.

        Parameters
        ----------
        label, crop : bool
            Accepted but not used; images are always cropped.
        test : bool
            If True also return the held-out pairs.

        Returns
        -------
        ``(train_x, train_y)`` or ``(train_x, train_y, test_x, test_y)``.
        Images are ``float32`` arrays of shape ``(N, 3, 256, 256)`` scaled
        to roughly ``[-1, 1]``; labels are ``float32`` one-hot maps of
        shape ``(N, 12, 256, 256)``.
        """
        # Ref to https://github.com/pfnet-research/chainer-pix2pix
        # /blob/master/facade_dataset.py
        MAX_ITER = 378
        IMAGE_SHAPE = 256
        x = []
        y = []
        for i in range(1, MAX_ITER + 1):
            img = Image.open(datapath + "facade/base/cmp_b%04d.jpg" % i)
            # Named label_img so the ``label`` argument is not shadowed.
            label_img = Image.open(
                datapath + "facade/base/cmp_b%04d.png" % i)

            # Scale so that the shorter side becomes 286 pixels.
            w, h = img.size
            r = 286. / min(w, h)
            size = (int(r * w), int(r * h))
            img = img.resize(size, Image.BILINEAR)
            # Nearest neighbour keeps the class ids intact.
            label_img = label_img.resize(size, Image.NEAREST)

            img = np.asarray(img).astype(
                "float32").transpose(2, 0, 1) / 128.0 - 1.0
            label_ = np.asarray(label_img) - 1
            onehot = np.zeros(
                (12, img.shape[1], img.shape[2])).astype("int32")
            for j in range(12):
                onehot[j, :] = label_ == j

            # crop images (top-left corner)
            x.append(img[:, :IMAGE_SHAPE, :IMAGE_SHAPE])
            y.append(onehot[:, :IMAGE_SHAPE, :IMAGE_SHAPE])

        x = np.asarray(x).astype("float32")
        y = np.asarray(y).astype("float32")

        train_x, train_y = x[:300], y[:300]
        test_x, test_y = x[300:], y[300:]

        if test:
            return train_x, train_y, test_x, test_y
        return train_x, train_y

    def plot(img):
        """Convert a channels-first batch to ``uint8`` channels-last images.

        A 3-channel batch is mapped back from ``[-1, 1]`` to ``[0, 255]``.
        A 12-channel label batch is rendered into the first colour channel
        as ``1 + 15 * class``; the other two channels stay at 1.

        Raises
        ------
        NotImplementedError
            If the batch has neither 3 nor 12 channels. The original
            expression lacked ``raise`` and silently returned None.
        """
        if img.shape[1] == 3:
            x = np.clip(img * 128 + 128, 0.0, 255.0).astype(np.uint8)
            return x.transpose(0, 2, 3, 1)

        if img.shape[1] == 12:
            # Use the batch's own spatial size instead of a fixed 256x256.
            canvas = np.ones(
                (len(img), 3) + tuple(img.shape[2:])).astype(np.uint8)
            for i in range(12):
                canvas[:, 0, :, :] += np.uint8(15 * i * img[:, i, :, :])
            return canvas.transpose(0, 2, 3, 1)

        raise NotImplementedError(
            "plot expects 3 or 12 channels, got %d" % img.shape[1])

    return load, plot