Note
Go to the end to download the full example code.
Processing visual data (Fashion-MNIST) for Cartolabe¶
In this example we’ll create a 2D representation of an image dataset: Fashion-MNIST. Fashion-MNIST is a dataset of Zalando’s article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples.
Loading the data¶
We will download the training set images and labels (train-images-idx3-ubyte.gz, train-labels-idx1-ubyte.gz) from the GitHub page and save them in the datas directory of the cartolabe-data project. Then we use the load_mnist function below to load the training set.
from download import download # noqa
# Fetch the training images and labels archives into ../datas, one after
# the other (images first, then labels).
for archive_url, archive_target in (
    ("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz",
     "../datas/train-images-idx3-ubyte.gz"),
    ("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz",
     "../datas/train-labels-idx1-ubyte.gz"),
):
    download(archive_url, archive_target)
""
import os
import gzip
import numpy as np
def load_mnist(path, kind='train'):
    """Load MNIST-format (IDX, gzip-compressed) images and labels from `path`.

    Parameters
    ----------
    path : str
        Directory containing ``{kind}-labels-idx1-ubyte.gz`` and
        ``{kind}-images-idx3-ubyte.gz``.
    kind : str
        File name prefix selecting the split to load (default ``'train'``).

    Returns
    -------
    images : numpy.ndarray
        ``uint8`` array of shape ``(n_samples, rows * cols)``, one flattened
        image per row.
    labels : numpy.ndarray
        ``uint8`` array of shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If the image payload cannot be reshaped to one row per label.
    """
    labels_path = os.path.join(path, f"{kind}-labels-idx1-ubyte.gz")
    images_path = os.path.join(path, f"{kind}-images-idx3-ubyte.gz")

    with gzip.open(labels_path, 'rb') as lbpath:
        # Skip the 8-byte IDX1 header (magic number + item count).
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        raw_images = imgpath.read()

    # The 16-byte IDX3 header holds four big-endian 32-bit integers:
    # magic number, image count, rows, cols. Taking the image size from it
    # (instead of hard-coding 784) keeps 28x28 files working unchanged and
    # supports other image sizes as well.
    header = np.frombuffer(raw_images, dtype='>u4', count=4)
    pixels_per_image = int(header[2]) * int(header[3])

    images = np.frombuffer(raw_images, dtype=np.uint8,
                           offset=16).reshape(len(labels), pixels_per_image)
    return images, labels
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz (25.2 MB)
file_sizes: 0%| | 0.00/26.4M [00:00<?, ?B/s]
file_sizes: 32%|████████▏ | 8.38M/26.4M [00:00<00:00, 65.9MB/s]
file_sizes: 63%|████████████████▌ | 16.8M/26.4M [00:00<00:00, 62.0MB/s]
file_sizes: 95%|████████████████████████▊ | 25.2M/26.4M [00:00<00:00, 63.3MB/s]
file_sizes: 100%|██████████████████████████| 26.4M/26.4M [00:00<00:00, 62.3MB/s]
Successfully downloaded file to ../datas/train-images-idx3-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz (29 kB)
file_sizes: 0%| | 0.00/29.5k [00:00<?, ?B/s]
file_sizes: 100%|██████████████████████████| 29.5k/29.5k [00:00<00:00, 30.0MB/s]
Successfully downloaded file to ../datas/train-labels-idx1-ubyte.gz
Load the Fashion-MNIST dataset
# Read the training images and labels from the archives saved in ../datas.
X_train, y_train = load_mnist('../datas', kind='train')
The training set has 60,000 samples, each with 784 features.
# Bare expression: displays (n_samples, n_features) when run as a notebook cell.
X_train.shape
(60000, 784)
The variable y_train contains the label of every sample, encoded as an integer from 0 to 9.
| Label | Description |
|---|---|
| 0 | T-shirt/top |
| 1 | Trouser |
| 2 | Pullover |
| 3 | Dress |
| 4 | Coat |
| 5 | Sandal |
| 6 | Shirt |
| 7 | Sneaker |
| 8 | Bag |
| 9 | Ankle boot |
# Bare expression: displays the label array when run as a notebook cell.
y_train
array([9, 0, 0, ..., 3, 0, 5], dtype=uint8)
Create a 2D projection¶
To create a 2D projection of the dataset, we’ll first run PCA on the X_train array to reduce the dimensions from 784 to 50. Then, we’ll use UMAP to project the dataset to 2D.
This step can take a while to complete as there are 60000 samples in the dataset.
from sklearn.decomposition import PCA # noqa
from umap import UMAP # noqa
# Step 1: reduce the 784 pixel features to 50 principal components.
pca_50 = PCA(n_components=50)
pca_result_50 = pca_50.fit_transform(X_train)
# Step 2: embed the 50-dimensional PCA output with UMAP; only `init` is
# overridden, every other UMAP setting is left at its default.
# NOTE(review): no random_state is passed to PCA or UMAP, so the coordinates
# presumably differ from run to run - confirm if reproducibility matters.
fashion_pca_umap = UMAP(init='random').fit_transform(pca_result_50)
Visualize the results¶
The following functions will create a visualization of the results with matplotlib.
import numpy as np # noqa
import matplotlib.pyplot as plt # noqa
import matplotlib.patheffects as PathEffects # noqa
import seaborn as sns # noqa
# Global seaborn styling, applied before any figure is created below.
sns.set_style('darkgrid')
sns.set_palette('muted')
# "notebook" plotting context with fonts scaled by 1.5 and a line width of 2.5.
sns.set_context("notebook", font_scale=1.5,
rc={"lines.linewidth": 2.5})
""
# Utility function to visualize the outputs of PCA and UMAP.
# https://www.datacamp.com/community/tutorials/introduction-t-sne
def fashion_scatter(x, colors):
    """Scatter-plot 2D points coloured by class and annotate each class.

    Parameters
    ----------
    x : numpy.ndarray
        Point coordinates; column 0 is plotted on the x axis and column 1
        on the y axis.
    colors : numpy.ndarray
        Class id of every point. The ids are used directly as palette
        indices and are compared against ``range(num_classes)``, so they
        are expected to be the integers ``0 .. num_classes - 1``.

    Returns
    -------
    tuple
        ``(figure, axes, scatter, texts)`` where ``texts`` is the list of
        text artists, one per class, in class-id order.
    """
    # One "hls" palette colour per distinct class.
    class_count = len(np.unique(colors))
    hls_palette = np.array(sns.color_palette("hls", class_count))

    # Draw all points on a square figure with an equal-aspect subplot.
    figure = plt.figure(figsize=(8, 8))
    axes = plt.subplot(aspect='equal')
    scatter = axes.scatter(
        x[:, 0],
        x[:, 1],
        lw=0,
        s=40,
        c=hls_palette[colors.astype(int)],
    )
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    axes.axis('off')
    axes.axis('tight')

    # Write each class id at the median position of that class's points,
    # outlined in white so it stays readable on top of the markers.
    class_texts = []
    for class_id in range(class_count):
        class_points = x[colors == class_id, :]
        label_x, label_y = np.median(class_points, axis=0)
        class_text = axes.text(label_x, label_y, str(class_id), fontsize=24)
        outline = [
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal(),
        ]
        class_text.set_path_effects(outline)
        class_texts.append(class_text)

    return figure, axes, scatter, class_texts
""
# Plot the UMAP embedding; `txts` (the per-class text artists) is reused
# further down to export one highlighted label per cluster.
(f, ax, sc, txts) = fashion_scatter(fashion_pca_umap, y_train)

Export the results for Cartolabe¶
We’ll take the results of the UMAP projection and build an array of points with their positions. For each point we’ll also include an img_data field containing a base64-encoded data URI, so that the point’s image can be displayed in an SVG image tag.
import base64 # noqa
from PIL import Image # noqa
from io import BytesIO # noqa
labels = ['T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
          'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']


def _png_data_uri(pixels):
    """Encode a flat array of 28x28 pixel values as a base64 PNG data URI."""
    picture = Image.fromarray(pixels.reshape(28, 28))
    with BytesIO() as buffer:
        picture.save(buffer, format='png')
        payload = base64.b64encode(buffer.getvalue()).decode()
    return 'data:image/png;base64,{}'.format(payload)


items = []

# One 'fashion' item per training image, placed at its UMAP coordinates.
for rank, (pixels, class_id, coords) in enumerate(
        zip(X_train, y_train, fashion_pca_umap)):
    entry = {
        'nature': 'fashion',
        'rank': rank,
        'score': 1.0,
        'label': labels[class_id],
        'position': coords.tolist(),
    }
    entry['img_data'] = _png_data_uri(pixels)
    items.append(entry)

# One 'hl_clusters' item per class text drawn on the scatter plot, placed at
# the text position and illustrated by a randomly chosen image of that class.
for class_text in txts:
    class_id = int(class_text.get_text())
    text_position = class_text.get_position()
    entry = {
        'nature': 'hl_clusters',
        'rank': len(items),
        'score': 1.0,
        'label': labels[class_id],
        'position': [float(text_position[0]), float(text_position[1])],
    }
    class_members = np.argwhere(y_train == class_id).reshape(-1)
    sample_index = np.random.choice(class_members, 1)[0]
    entry['img_data'] = _png_data_uri(X_train[sample_index])
    items.append(entry)
""
import pprint # noqa
# Pretty-print one exported item (index 10) to check its structure.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(items[10])
{ 'img_data': 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAACA0lEQVR4nFWQu24TQRiFz1x31+u1TWQ7IQZZEVESRUghUgouAolIFJQgwQNQIKBAKA1vAS2vwgsgCqChSxGBAnGcZLHsOBvvzM4Mxa5v08yv8+nMmf8A+Qk/fH1TjM8+vysmkl+fHrDu5tnh/mDhrqwclQ9fHkzh7vu4QoJG6fj7jt/vNv/Vzp8AAAUAPPqltDvrHKSbo7jX0vQv7k3hsoK2nop7g9iWknPD3H0A4ABAK/2+D87LXAnKqbiEXZs4V2ige6pEJfN9DWJdctGaOJdSS36zIQkzq3liVacUxY3TwlnveI/ZUepGyWWaWN29fTPBxvjZRvlhe6ffsIpaGfiLKtnalluTPdsf375oRQMJSrJw0Tzfa71Kpw0BT1//UZxwimSF7RYaBQDCBH4OHRWwlrikCjA2gc5kGEKJjIE6/0ID1kwbAoPINDecEBvY/XEaHWe2tI8wgxOa4No8dLijJU09GGEQNAu5gBY3MinBGeBrrMPNQOp403MEHnXUUL0OOwMJKnGgpKbghHG+Cktm4fXIedJJj7gAmRK5Pv7tRjUpK84DqNBzfhVkFi54uppxR0VS5UwsYS5zRZGwL1ExKgooieadJg2UdmHDSlDD3HymMuLEpsL03JBajPJdeAHXarp2RdYrq83tLxFRuciLgr7VT9JTs3z1h9d2ya3j3Pkfp1jNMwZc6WUAAAAASUVORK5CYII=',
'label': 'T-shirt',
'nature': 'fashion',
'position': [7.637689113616943, -3.356983184814453],
'rank': 10,
'score': 1.0}
All that’s left to do is to save the items in a JSON file:
import json # noqa
# Destination of the exported items, next to the downloaded archives.
export_file_name = '../datas/mnist.json'
# Use a dedicated handle name: `f` already refers to the matplotlib figure
# returned by fashion_scatter, and rebinding it here would leave `f` pointing
# at a closed file. The encoding is explicit so the file does not depend on
# the platform default.
with open(export_file_name, 'w', encoding='utf-8') as export_file:
    json.dump(items, export_file)
Total running time of the script: (0 minutes 52.182 seconds)