Source code for Tars.load_data

import gzip
import cPickle
import numpy as np
import scipy as sp
from scipy.io import loadmat
import glob
import sys
import pickle
from PIL import Image
from copy import copy

from sklearn.cross_validation import train_test_split

sys.setrecursionlimit(5000)


def one_of_k(a, k=10):
    """Encode integer class labels as one-hot ("1-of-K") rows.

    Parameters
    ----------
    a : array_like of int
        Class indices.  Input of any shape is flattened; an empty input
        yields an array with zero rows.
    k : int, optional
        Number of classes, i.e. the width of every row.  Defaults to 10,
        the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(a.size, k)`` holding a single 1 per
        row, in the column given by the matching label.

    Raises
    ------
    IndexError
        If a label does not address a valid column.
    """
    labels = np.asarray(a).ravel()
    encoded = np.zeros((labels.size, k), dtype='float32')
    # An empty list becomes a float array, which is not a valid index;
    # there is nothing to mark in that case anyway.
    if labels.size:
        encoded[np.arange(labels.size), labels] = 1
    return encoded


def mnist(datapath, toFloat=False):
    """Build the loader and plotting helpers for the pickled MNIST file.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to ``mnist.pkl.gz`` (so it needs its
        trailing path separator).
    toFloat : bool, optional
        When true, ``load`` standardizes the images with the per-pixel
        mean and standard deviation of the training images and ``plot``
        reverses that transformation.

    Returns
    -------
    tuple
        ``(load, plot)`` closures sharing the normalization statistics.
    """
    p = paramaters()

    def standardize(x):
        # Apply the statistics estimated on the training images.
        return ((x - p.mean[np.newaxis]) /
                p.std[np.newaxis]).astype(np.float32)

    def load(test=False, one_hot=True):
        """Read the dataset.

        Returns ``(train_x, train_y)`` or, when ``test`` is true,
        ``(train_x, train_y, valid_x, valid_y, test_x, test_y)``.  Labels
        are one-hot encoded with ``one_of_k`` when ``one_hot`` is true.
        """
        # NOTE: unpickling can execute arbitrary code; only point
        # ``datapath`` at a trusted copy of the dataset.
        with gzip.open(datapath + 'mnist.pkl.gz', 'rb') as f:
            (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = \
                cPickle.load(f)

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(np.mean((train_x - p.mean[np.newaxis])**2, axis=0))
            # Pixels that never change would otherwise divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            # The validation split has to live in the same space as the
            # training and test splits.
            valid_x = standardize(valid_x)
            test_x = standardize(test_x)

        if one_hot:
            train_y = one_of_k(train_y)
            valid_y = one_of_k(valid_y)
            test_y = one_of_k(test_y)

        if test:
            return train_x, train_y, valid_x, valid_y, test_x, test_y
        return train_x, train_y

    def plot(X):
        """Return ``(images, cmap)`` with ``X`` reshaped to ``(n, 28, 28)``.

        The images are inverted (``1 - X``) and the colormap name is
        ``"gray"``.
        """
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            # NOTE(review): dividing by 255 assumes the stored pixels are
            # in 0..255 - confirm against the contents of mnist.pkl.gz.
            X = np.clip(X / 255., 0, 1)
        X = X.reshape((X.shape[0], 28, 28))
        return 1 - X, "gray"
    return load, plot


class paramaters(object):
    """Mutable holder for normalization statistics.

    One instance is created per dataset factory so that the closures the
    factory returns can share the values written by ``load``.

    Parameters
    ----------
    mean : optional
        Initial value of the ``mean`` attribute (default 0).
    std : optional
        Initial value of the ``std`` attribute (default 0).
    """

    def __init__(self, mean=0, std=0):
        self.mean = mean
        self.std = std


def svhn(datapath, toFloat=True, binarize_y=True, gray=False, extra=True):
    """Build the loader and plotting helpers for the SVHN ``.mat`` files.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to ``train_32x32.mat``,
        ``test_32x32.mat`` and ``extra_32x32.mat`` (so it needs its
        trailing path separator).
    toFloat : bool, optional
        Standardize the images with the per-feature mean and standard
        deviation of the training images; ``plot`` reverses it for
        colour images.
    binarize_y : bool, optional
        Return the labels one-hot encoded through ``one_of_k``.
    gray : bool, optional
        Collapse the three channels into one weighted sum divided by
        255; such images are always returned flattened to 1024 values.
    extra : bool, optional
        Append the samples of ``extra_32x32.mat`` to the training set.

    Returns
    -------
    tuple
        ``(load, plot)`` closures sharing the normalization statistics.
    """
    p = paramaters()

    def read_split(name):
        # swapaxes(0, 1) followed by .T turns an (a, b, c, n) array into
        # (n, c, a, b); plot() relies on that being (n, 3, 32, 32).
        contents = loadmat(datapath + name + '_32x32.mat')
        return contents['X'].swapaxes(0, 1).T, contents['y'].reshape((-1))

    def to_gray(x):
        # Weighted channel sum, divided by 255 and flattened per image.
        mixed = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
        return (mixed / 255.).reshape((len(mixed),
                                       32 * 32)).astype(np.float32)

    def standardize(x):
        return ((x - p.mean[np.newaxis]) /
                p.std[np.newaxis]).astype(np.float32)

    def load(test=False, flatten=False):
        """Read the dataset.

        Returns ``(train_x, train_y)`` or, when ``test`` is true,
        ``(train_x, train_y, test_x, test_y, test_x, test_y)``: the test
        split is returned twice, once in the slot other loaders use for
        a validation split.  ``flatten`` reshapes every colour image to
        one row.
        """
        train_x, train_y = read_split('train')
        test_x, test_y = read_split('test')

        if extra:
            extra_x, extra_y = read_split('extra')
            train_x = np.concatenate((train_x, extra_x), axis=0)
            train_y = np.concatenate((train_y, extra_y), axis=0)

        if gray:
            # Must run on the 4-D layout: indexing channels after the
            # images were flattened would pick single pixels instead.
            train_x = to_gray(train_x)
            test_x = to_gray(test_x)
        elif flatten:
            train_x = train_x.reshape((train_x.shape[0], -1))
            test_x = test_x.reshape((test_x.shape[0], -1))

        # Label 10 is remapped to class 0 so labels index 10 columns.
        train_y[train_y == 10] = 0
        test_y[test_y == 10] = 0

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(np.mean((train_x - p.mean[np.newaxis])**2, axis=0))
            # Constant features would otherwise divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            test_x = standardize(test_x)

        if binarize_y:
            train_y = one_of_k(train_y)
            test_y = one_of_k(test_y)

        if test:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def plot(X):
        """Return ``(images, cmap)`` ready for display.

        Gray images are only reshaped to ``(n, 32, 32)`` and paired with
        the ``"gray"`` colormap.  Colour images are de-standardized when
        ``toFloat`` is set, scaled by 1/255, clipped to ``[0, 1]`` and
        returned channels-last with ``None`` as colormap.
        """
        if gray:
            return X.reshape((X.shape[0], 32, 32)), "gray"

        X = X.reshape((X.shape[0], 3, 32, 32))
        if toFloat:
            # The statistics are flat when load(flatten=True) produced
            # them; reshape so they broadcast against the images.
            mean = np.reshape(p.mean, (1, 3, 32, 32))
            std = np.reshape(p.std, (1, 3, 32, 32))
            X = np.clip((X * std + mean) / 255., 0, 1)
        return np.rollaxis(X, 1, 4), None

    return load, plot


def lfw(datapath, toFloat=True, gray=False, rate=0.1, rseed=0):
    """Build loader, plotting and preprocessing helpers for LFW.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to ``lfw_images.npy`` and
        ``lfw_attributes.npy`` (so it needs its trailing separator).
    toFloat : bool, optional
        Standardize images with the per-feature mean and standard
        deviation of the training images; otherwise divide by 255.
    gray : bool, optional
        Collapse the three channels into one weighted sum and flatten
        every image to ``64 * 64`` values.
    rate : float, optional
        ``test_size`` handed to ``train_test_split``.
    rseed : int, optional
        ``random_state`` handed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(load, plot, preprocess)`` closures sharing the statistics.
    """
    p = paramaters()

    def to_model_layout(x):
        # Channels-last -> channels-first, then the optional gray mix.
        x = np.rollaxis(x, 3, 1)
        if gray:
            x = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
            x = x.reshape((len(x), 64 * 64)).astype(np.float32)
        return x

    def standardize(x):
        return ((x - p.mean[np.newaxis]) /
                p.std[np.newaxis]).astype(np.float32)

    def load(test=False):
        """Read the dataset.

        Returns ``(train_x, train_y)`` or, when ``test`` is true,
        ``(train_x, train_y, test_x, test_y, test_x, test_y)``: the held
        out split is returned twice, once in the slot other loaders use
        for a validation split.
        """
        x = to_model_layout(np.load(datapath + 'lfw_images.npy'))
        y = np.load(datapath + 'lfw_attributes.npy').astype(np.float32)

        test_x = test_y = None
        if test:
            train_x, test_x, train_y, test_y = train_test_split(
                x, y, test_size=rate, random_state=rseed)
        else:
            train_x, train_y = x, y

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(np.mean((train_x - p.mean[np.newaxis])**2, axis=0))
            # Constant features would otherwise divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            if test:
                test_x = standardize(test_x)
        else:
            train_x = train_x / 255.
            if test:
                test_x = test_x / 255.

        # Single-argument call form prints the same text on Python 2 and 3.
        print(train_x.shape)

        if test:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def preprocess(X):
        """Apply to channels-last images the same steps ``load`` applies.

        With ``toFloat`` the statistics stored by a previous ``load``
        call are used, so ``load`` has to run first.
        """
        X = to_model_layout(X)
        if toFloat:
            return standardize(X)
        return X / 255.

    def plot(X):
        """Return ``(images, cmap)`` with values clipped to ``[0, 1]``.

        The input is never modified; clipping produces a new array.
        """
        if gray:
            if toFloat:
                # Statistics are per flattened pixel, so flatten first.
                X = X.reshape((X.shape[0], -1))
                X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
                X = X / 255.
            X = X.reshape((X.shape[0], 64, 64))
            return np.clip(X, 0, 1), "gray"

        X = X.reshape((X.shape[0], 3, 64, 64))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            X = X / 255.
        X = np.rollaxis(X, 1, 4)
        return np.clip(X, 0, 1), None

    return load, plot, preprocess


def celeba(datapath, toFloat=True, gray=False, rate=0.001, rseed=0):
    """Build loader, plotting and preprocessing helpers for CelebA.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to ``celeba_images.npy`` and
        ``celeba_attributes.npy`` (so it needs its trailing separator).
    toFloat : bool, optional
        Standardize images with the per-feature mean and standard
        deviation of the training images; otherwise divide by 255.
    gray : bool, optional
        Collapse the three channels into one weighted sum and flatten
        every image to ``64 * 64`` values.
    rate : float, optional
        ``test_size`` handed to ``train_test_split``.
    rseed : int, optional
        ``random_state`` handed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(load, plot, preprocess)`` closures sharing the statistics.
    """
    p = paramaters()

    def to_model_layout(x):
        # Channels-last -> channels-first, then the optional gray mix.
        x = np.rollaxis(x, 3, 1)
        if gray:
            x = x[:, 0] * 0.2126 + x[:, 1] * 0.7152 + x[:, 2] * 0.0722
            x = x.reshape((len(x), 64 * 64)).astype(np.float32)
        return x

    def standardize(x):
        return ((x - p.mean[np.newaxis]) /
                p.std[np.newaxis]).astype(np.float32)

    def load(test=False):
        """Read the dataset.

        Returns ``(train_x, train_y)`` or, when ``test`` is true,
        ``(train_x, train_y, test_x, test_y, test_x, test_y)``: the held
        out split is returned twice, once in the slot other loaders use
        for a validation split.
        """
        x = to_model_layout(np.load(datapath + 'celeba_images.npy'))
        y = np.load(datapath + 'celeba_attributes.npy').astype(np.float32)

        test_x = test_y = None
        if test:
            train_x, test_x, train_y, test_y = train_test_split(
                x, y, test_size=rate, random_state=rseed)
        else:
            train_x, train_y = x, y

        if toFloat:
            p.mean = np.mean(train_x, axis=0)
            p.std = np.sqrt(np.mean((train_x - p.mean[np.newaxis])**2, axis=0))
            # Constant features would otherwise divide by zero.
            p.std[p.std == 0] = 1
            train_x = standardize(train_x)
            if test:
                test_x = standardize(test_x)
        else:
            train_x = train_x / 255.
            if test:
                test_x = test_x / 255.

        # Single-argument call form prints the same text on Python 2 and 3.
        print(train_x.shape)

        if test:
            return train_x, train_y, test_x, test_y, test_x, test_y
        return train_x, train_y

    def preprocess(X):
        """Apply to channels-last images the same steps ``load`` applies.

        With ``toFloat`` the statistics stored by a previous ``load``
        call are used, so ``load`` has to run first.
        """
        X = to_model_layout(X)
        if toFloat:
            return standardize(X)
        return X / 255.

    def plot(X):
        """Return ``(images, cmap)`` with values clipped to ``[0, 1]``.

        The input is never modified; clipping produces a new array.
        """
        if gray:
            if toFloat:
                # Statistics are per flattened pixel, so flatten first.
                X = X.reshape((X.shape[0], -1))
                X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
                X = X / 255.
            X = X.reshape((X.shape[0], 64, 64))
            return np.clip(X, 0, 1), "gray"

        X = X.reshape((X.shape[0], 3, 64, 64))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            X = X / 255.
        X = np.rollaxis(X, 1, 4)
        return np.clip(X, 0, 1), None

    return load, plot, preprocess


def flickr(datapath, toFloat=True):
    """Build the helpers for the Flickr image/text dataset.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to the ``flickr/...`` file names (so
        it needs its trailing path separator).
    toFloat : bool, optional
        Standardize the image entry of every split with the mean and
        standard deviation of the training split.

    Returns
    -------
    tuple
        ``(load, shuffle, plot, preprocess)``.  ``plot`` and
        ``preprocess`` use statistics that ``load`` only stores when it
        is called with ``label=True, raw_image=True`` and ``toFloat`` is
        set.
    """
    p = paramaters()

    def load_pickle(path):
        # NOTE: unpickling can execute arbitrary code; only load trusted
        # files.  Binary mode plus ``with`` closes the handle reliably.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def load(version=1, label=True, raw_image=False):
        """Read the labelled splits or the unlabelled image sets.

        With ``label`` true, returns ``(trn, val, tst)``; each is a list
        ``[labels, images, text]`` followed, when ``raw_image`` is
        false, by the column-wise concatenation of images and text.
        ``version`` selects the index files of the split.  With
        ``label`` false, returns ``(unlabel_trn, unlabel_tst)`` as read
        from the two pickle files.
        """
        if not label:
            unlabelled_dir = datapath + "flickr/image/unlabelled/"
            unlabel_trn = load_pickle(unlabelled_dir + "unlabelled_trn.pkl")
            unlabel_tst = load_pickle(unlabelled_dir + "unlabelled_tst.pkl")
            return unlabel_trn, unlabel_tst

        split_dir = datapath + "flickr/splits/"
        train_indices = np.load(split_dir + "train_indices_%d.npy" % version)
        valid_indices = np.load(split_dir + "valid_indices_%d.npy" % version)
        test_indices = np.load(split_dir + "test_indices_%d.npy" % version)

        if raw_image:
            x_labelled = np.load(
                datapath + "flickr/image/labelled/images.npy")
            x_labelled = np.rollaxis(x_labelled, 3, 1)
        else:
            pattern = datapath + "flickr/image/labelled/combined-*"
            shard_paths = sorted(glob.glob(pattern))
            if not shard_paths:
                # Same exception type as indexing the empty list, but
                # with a message that names the missing files.
                raise IndexError("no feature files match %r" % pattern)
            # One concatenation instead of re-copying the growing array
            # for every shard.
            x_labelled = np.concatenate(
                [np.load(path) for path in shard_paths], axis=0)

        y_ = np.load(datapath + "flickr/labels.npy").astype(np.float32)

        w_labelled = LoadSparse(
            datapath + 'flickr/text/text_all_2000_labelled.npz')
        w_labelled = np.asarray(w_labelled.todense()).astype(np.float32)

        modalities = [y_, x_labelled, w_labelled]
        if not raw_image:
            # NOTE(review): this entry is built from the images before
            # they are standardized below - confirm that is intended.
            modalities.append(np.c_[x_labelled, w_labelled])

        trn = [m[train_indices] for m in modalities]
        val = [m[valid_indices] for m in modalities]
        tst = [m[test_indices] for m in modalities]

        if toFloat:
            mean = np.mean(trn[1], axis=0)
            std = np.sqrt(np.mean((trn[1] - mean[np.newaxis])**2, axis=0))
            # Constant features would otherwise divide by zero.
            std[std == 0] = 1
            if raw_image:
                # Only raw-image statistics are shared with plot() and
                # preprocess().
                p.mean = mean
                p.std = std
            for split in (trn, val, tst):
                split[1] = ((split[1] - mean[np.newaxis]) /
                            std[np.newaxis]).astype(np.float32)

        return trn, val, tst

    def preprocess(X):
        """Standardize channels-last images with the stored statistics.

        Requires the statistics stored by a previous ``load`` call made
        with ``label=True, raw_image=True`` while ``toFloat`` is set.
        """
        X = np.rollaxis(X, 3, 1)
        X = ((X - p.mean[np.newaxis]) / p.std[np.newaxis]).astype(np.float32)
        return X

    def plot(X):
        """Return ``(images, None)`` as channels-last ``128 x 128`` images.

        Values are clipped to ``[0, 1]``; the input is never modified
        because clipping produces a new array.
        """
        X = X.reshape((-1, 3, 128, 128))
        if toFloat:
            X = (X * p.std[np.newaxis]) + p.mean[np.newaxis]
            X = X / 255.
        X = np.rollaxis(X, 1, 4)
        return np.clip(X, 0, 1), None

    def LoadSparse(inputfile, verbose=False):
        """Loads a sparse matrix stored as npz file."""
        # Imported here because a bare ``import scipy`` does not
        # guarantee that the ``sparse`` subpackage has been loaded.
        from scipy import sparse

        npzfile = np.load(inputfile)
        try:
            mat = sparse.csr_matrix(
                (npzfile['data'], npzfile['indices'], npzfile['indptr']),
                shape=tuple(list(npzfile['shape'])))
        finally:
            npzfile.close()
        if verbose:
            print('Loaded sparse matrix from %s of shape %s' % (
                inputfile,
                mat.shape.__str__()))
        return mat

    def shuffle(trn, each_permutation=False):
        """Return a new list whose entries share one random row order.

        The permutation length comes from the first entry; ``None``
        entries are passed through.  ``each_permutation`` is accepted
        for compatibility but not used.
        """
        order = np.random.permutation(trn[0].shape[0])
        return [part if part is None else part[order] for part in trn]

    return load, shuffle, plot, preprocess


def facade(datapath):
    """Build the loader and plotting helpers for the facade dataset.

    Parameters
    ----------
    datapath : str
        Prefix prepended verbatim to ``facade/base/cmp_bNNNN.jpg`` and
        ``.png`` (so it needs its trailing path separator).

    Returns
    -------
    tuple
        ``(load, plot)``.
    """
    def load(label=True, test=True, crop=True):
        """Read, rescale and crop the 378 image / label-map pairs.

        Every pair is resized so that its shorter side is 286 pixels and
        cropped to the top-left ``256 x 256`` corner.  Images are scaled
        with ``value / 128 - 1``; label maps become 12 binary planes.
        The first 300 pairs form the training set.  ``label`` and
        ``crop`` are accepted for compatibility but not used.

        Returns ``(train_x, train_y, test_x, test_y)`` when ``test`` is
        true, otherwise ``(train_x, train_y)``; all arrays are float32.
        """
        # Ref to https://github.com/pfnet-research/chainer-pix2pix
        # /blob/master/facade_dataset.py
        MAX_ITER = 378
        IMAGE_SHAPE = 256
        N_CLASSES = 12
        class_ids = np.arange(N_CLASSES).reshape((N_CLASSES, 1, 1))

        x = []
        y = []
        for i in range(1, MAX_ITER + 1):
            img = Image.open(datapath + "facade/base/cmp_b%04d.jpg" % i)
            # Named apart from the ``label`` parameter so the argument
            # is no longer overwritten.
            label_img = Image.open(datapath + "facade/base/cmp_b%04d.png" % i)
            w, h = img.size
            r = 286. / min(w, h)
            size = (int(r * w), int(r * h))
            img = img.resize(size, Image.BILINEAR)
            # Nearest neighbour keeps the class ids intact.
            label_img = label_img.resize(size, Image.NEAREST)

            img = np.asarray(img).astype(
                "float32").transpose(2, 0, 1) / 128.0 - 1.0
            label_ids = np.asarray(label_img) - 1
            # One binary plane per class id 0..11.
            planes = (label_ids[np.newaxis] == class_ids).astype("int32")

            # crop images
            x.append(img[:, :IMAGE_SHAPE, :IMAGE_SHAPE])
            y.append(planes[:, :IMAGE_SHAPE, :IMAGE_SHAPE])

        x = np.asarray(x).astype("float32")
        y = np.asarray(y).astype("float32")

        train_x, train_y = x[:300], y[:300]
        test_x, test_y = x[300:], y[300:]

        if test:
            return train_x, train_y, test_x, test_y
        return train_x, train_y

    def plot(img):
        """Convert a batch to channels-last ``uint8`` images.

        Batches with 3 channels are mapped through ``img * 128 + 128``
        and clipped to 0..255.  Batches with 12 channels are rendered
        into the first of three output channels as ``1 + 15 * i`` for
        every active plane ``i``; the other two channels stay 1.

        Raises
        ------
        NotImplementedError
            For any other number of channels.
        """
        if img.shape[1] == 3:
            x = np.asarray(np.clip(img * 128 + 128, 0.0, 255.0),
                           dtype=np.uint8)
            return x.transpose(0, 2, 3, 1)

        if img.shape[1] == 12:
            # Take the spatial size from the input instead of assuming
            # 256 x 256.
            x = np.ones((len(img), 3) + tuple(img.shape[2:])).astype(np.uint8)
            for i in range(12):
                x[:, 0, :, :] += np.uint8(15 * i * img[:, i, :, :])
            return x.transpose(0, 2, 3, 1)

        raise NotImplementedError(
            "plot expects 3 or 12 channels, got %d" % img.shape[1])

    return load, plot