MNIST with PyTorch

7/3/2020

PyTorch is another Python machine learning library, similar to scikit-learn. PyTorch is designed for deep learning, which gives it more flexibility when building neural networks.

Logistic Regression

Begin with imports:

import torch
from torch import nn, optim
from torch.utils.data import random_split
import torch.nn.functional as F
import torchvision
from torchvision import datasets, transforms
from ray import tune

import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png')
import seaborn as sn
import pandas as pd
import numpy as np
import time
from livelossplot import PlotLosses

The 60,000-image MNIST training set is split into training (42,000) and validation (18,000) subsets, and the 10,000-image test set is kept separate. Pixel values are normalized to [-1, 1], and a batch size of 64 is defined for stochastic gradient descent:

torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,),(0.5,))]) #[-1,1]
trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainset, valset = random_split(trainset, [42000,18000])
testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False, num_workers=2)
dataloaders = {'train':trainloader, 'validation':valloader}
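
As a quick sanity check (a minimal sketch using the datasets defined above), we can confirm the split sizes:

print(len(trainset), len(valset), len(testset))  # 42000 18000 10000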

Let’s display all 64 digits in a single batch:

def imshow(img):
    img = img / 2 + 0.5 #unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

dataiter = iter(trainloader)
images, labels = next(dataiter)  # the .next() method was removed from DataLoader iterators in newer PyTorch
plt.suptitle('MNIST Examples')
imshow(torchvision.utils.make_grid(images))
[Figure: MNIST Examples, a grid of sample digits from one training batch]

We define logistic regression as a neural net with a single linear layer, mapping 784 input nodes (one per pixel) to 10 output nodes (one per digit class):

class LogReg(nn.Module):
    def __init__(self):
        super(LogReg, self).__init__()
        self.linear = nn.Linear(784, 10)

    def forward(self, x):
        out = self.linear(x)
        return out
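
Since nn.CrossEntropyLoss applies the softmax internally, the model outputs raw logits. A quick shape check (a minimal sketch with a random dummy batch) confirms that the layer maps flattened 28x28 images to 10 class scores:

logits = LogReg()(torch.randn(64, 784))  # dummy batch of 64 flattened images
print(logits.shape)                      # torch.Size([64, 10])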

Use a Ray Tune grid search to pick the learning rate:

def trainable_logreg(config):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model = LogReg().to(device)  # move the model to the same device as the inputs
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for epoch in range(10):
        model.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs = inputs.view(inputs.shape[0], -1)
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.detach() * inputs.size(0)
            
        tune.report(mean_loss=(running_loss / len(trainloader.dataset)).item())

tune.run(trainable_logreg, config={"lr": tune.grid_search([0.01, 0.03, 0.1])}, verbose=1)
== Status ==
Memory usage on this node: 8.6/23.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/9.77 GiB heap, 0.0/3.37 GiB objects
Result logdir: C:\Users\jl208\ray_results\trainable_logreg
Number of trials: 3 (3 TERMINATED)
Trial name                    status      lr    loss      iter  total time (s)
trainable_logreg_4bda0_00000  TERMINATED  0.01  0.305648  10    172.208
trainable_logreg_4bda0_00001  TERMINATED  0.03  0.282862  10    170.787
trainable_logreg_4bda0_00002  TERMINATED  0.1   0.318506  10    172.697


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x19951a446c8>
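
tune.run returns an ExperimentAnalysis object, so capturing the return value lets us read off the best configuration directly. A minimal sketch (metric and mode refer to the mean_loss reported in the trainable):

analysis = tune.run(trainable_logreg, config={"lr": tune.grid_search([0.01, 0.03, 0.1])}, verbose=1)
print(analysis.get_best_config(metric="mean_loss", mode="min"))  # {'lr': 0.03} for the run above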

Train the model using stochastic gradient descent with the best learning rate from the grid search (0.03). To evaluate the classifier, we plot the accuracy and loss on the training and validation sets as functions of the number of training epochs. The model with the lowest validation loss is saved, and after training we evaluate it on the test set:

def train_model(model, criterion, optimizer, num_epochs, reshape, path):
    time_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    best_val_loss = float('inf')
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    liveloss = PlotLosses()
    model = model.to(device)
    
    for epoch in range(num_epochs):
        logs = {}
        for phase in ['train', 'validation']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                if reshape:
                    inputs = inputs.view(inputs.shape[0],-1)
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, preds = torch.max(outputs, 1)
                running_loss += loss.detach() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.float() / len(dataloaders[phase].dataset)

            prefix = ''
            if phase == 'validation':
                prefix = 'val_'
                if epoch_loss < best_val_loss:
                    best_val_loss = epoch_loss
                    torch.save(model, path)

            logs[prefix + 'log loss'] = epoch_loss.item()
            logs[prefix + 'accuracy'] = epoch_acc.item()
        
        liveloss.update(logs)
        liveloss.send()
    
    # evaluating best model on test set
    model = torch.load(path)
    test_loss = 0.0
    test_corrects = 0
    for inputs, labels in testloader:
        if reshape:
            inputs = inputs.view(inputs.shape[0],-1)
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        _, preds = torch.max(outputs, 1)
        test_loss += loss.detach() * inputs.size(0)
        test_corrects += torch.sum(preds == labels.data)

    test_loss = test_loss / len(testloader.dataset)
    test_acc = test_corrects.float() / len(testloader.dataset)

    print('Training time: %.3fs' % (time.perf_counter() - time_start))
    print('Test loss: %.3f' % test_loss)
    print('Test accuracy: %.3f%%' % (test_acc*100))

model = LogReg()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)
num_epochs = 20
train_model(model, criterion, optimizer, num_epochs, reshape=True, path='mnist_pytorch_logreg.pt')
[Figure: training and validation accuracy and log loss curves over 20 epochs]
accuracy
	training         	 (min:    0.855, max:    0.925, cur:    0.925)
	validation       	 (min:    0.873, max:    0.915, cur:    0.911)
log loss
	training         	 (min:    0.266, max:    0.511, cur:    0.266)
	validation       	 (min:    0.301, max:    0.406, cur:    0.308)
Training time: 467.424s
Test loss: 0.284
Test accuracy: 92.160%

We can also plot a confusion matrix to see the classifier’s accuracy for different digits:

def confusion_matrix(model,name,reshape):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    cm = torch.zeros(10, 10)
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(testloader):
            if reshape:
                inputs = inputs.view(inputs.shape[0],-1)
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            for t, p in zip(labels.cpu().view(-1), preds.cpu().view(-1)):  # move to CPU before indexing cm
                cm[t.long(), p.long()] += 1

    cm = cm.numpy()
    for i in range(10):
        cm[i] = cm[i]/np.sum(cm[i])
    cm = np.around(cm,3)

    plt.figure(figsize=(10,7))
    df_cm = pd.DataFrame(cm, range(10), range(10))
    sn.set(font_scale=1.3)
    sn.heatmap(df_cm, annot=True, annot_kws={'size': 12}, cmap='Blues')
    plt.suptitle(name + ' Confusion Matrix', fontsize=16)
    plt.show()
confusion_matrix(model,'Logistic Regression',reshape=True)
[Figure: Logistic Regression confusion matrix]

Multilayer Perceptron

We define an MLP with two hidden layers, one with 128 nodes and the other with 64. As with logistic regression, there are 784 input nodes and 10 output nodes:

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden_1 = nn.Linear(784,128)
        self.hidden_2 = nn.Linear(128,64)
        self.output = nn.Linear(64,10)

    def forward(self,x):
        x = F.relu(self.hidden_1(x))
        x = F.relu(self.hidden_2(x))
        out = self.output(x)
        return out
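
The hidden layers add roughly 109,000 trainable parameters, compared with 7,850 for logistic regression. A quick count (a minimal sketch):

print(sum(p.numel() for p in MLP().parameters()))     # 109386
print(sum(p.numel() for p in LogReg().parameters()))  # 7850
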
def trainable_mlp(config):
    time_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = MLP().to(device)  # move the model to the same device as the inputs
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'])
    for epoch in range(10):
        model.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs = inputs.view(inputs.shape[0], -1)
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.detach() * inputs.size(0)

        tune.report(mean_loss=(running_loss / len(trainloader.dataset)).item())

    print('Tuning time: %.3fs' % (time.perf_counter() - time_start))

tune.run(trainable_mlp, config={'lr': tune.grid_search([0.01, 0.03, 0.1])}, verbose=1)
(pid=8108) Tuning time: 167.976s
== Status ==
Memory usage on this node: 8.6/23.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/9.77 GiB heap, 0.0/3.37 GiB objects
Result logdir: C:\Users\jl208\ray_results\trainable_mlp
Number of trials: 3 (3 TERMINATED)
Trial name                 status      lr    loss       iter  total time (s)
trainable_mlp_d8b07_00000  TERMINATED  0.01  0.226144   10    167.678
trainable_mlp_d8b07_00001  TERMINATED  0.03  0.101972   10    166.59
trainable_mlp_d8b07_00002  TERMINATED  0.1   0.0530744  10    167.97


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x19a90a50948>

Train the MLP with the best learning rate from the grid search (0.1):

model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
num_epochs = 20
train_model(model, criterion, optimizer, num_epochs, reshape=True, path='mnist_pytorch_mlp.pt')
[Figure: training and validation accuracy and log loss curves over 20 epochs]
accuracy
	training         	 (min:    0.822, max:    0.995, cur:    0.995)
	validation       	 (min:    0.802, max:    0.976, cur:    0.976)
log loss
	training         	 (min:    0.017, max:    0.555, cur:    0.017)
	validation       	 (min:    0.093, max:    0.714, cur:    0.098)
Training time: 460.631s
Test loss: 0.081
Test accuracy: 97.660%
confusion_matrix(model,'MLP',reshape=True)
[Figure: MLP confusion matrix]

Convolutional Neural Network

We define a CNN with two convolutional layers, each with 5x5 kernels, and a max-pooling layer with stride 2 after each convolution. The flattened feature maps are then passed through two fully connected layers and a final output layer:

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self,x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2, stride=2)

        x = x.reshape(-1, 12*4*4)
        x = self.fc1(x)
        x = F.relu(x)

        x = self.fc2(x)
        x = F.relu(x)

        x = self.out(x)
        return x
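
The 12*4*4 flatten size comes from tracing the shapes: a 28x28 input becomes 24x24 after the first 5x5 convolution, 12x12 after pooling, 8x8 after the second convolution, and 4x4 (with 12 channels) after the final pooling. A minimal sketch verifying this with a dummy batch:

cnn = CNN()
x = torch.randn(64, 1, 28, 28)             # dummy batch of 64 MNIST-sized images
x = F.max_pool2d(F.relu(cnn.conv1(x)), 2)  # 28x28 -> conv -> 24x24 -> pool -> 12x12
x = F.max_pool2d(F.relu(cnn.conv2(x)), 2)  # 12x12 -> conv -> 8x8 -> pool -> 4x4
print(x.shape)                             # torch.Size([64, 12, 4, 4])
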
def trainable_cnn(config):
    time_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = CNN().to(device)  # move the model to the same device as the inputs
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'])
    for epoch in range(5):
        model.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.detach() * inputs.size(0)

        tune.report(mean_loss=(running_loss / len(trainloader.dataset)).item())

    print('Tuning time: %.3fs' % (time.perf_counter() - time_start))

tune.run(trainable_cnn, config={'lr': tune.grid_search([0.01, 0.03, 0.1])}, verbose=1)
== Status ==
Memory usage on this node: 8.8/23.9 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/8 CPUs, 0/0 GPUs, 0.0/9.77 GiB heap, 0.0/3.37 GiB objects
Result logdir: C:\Users\jl208\ray_results\trainable_cnn
Number of trials: 3 (3 TERMINATED)
Trial name                 status      lr    loss       iter  total time (s)
trainable_cnn_5da4c_00000  TERMINATED  0.01  0.147092   5     194.109
trainable_cnn_5da4c_00001  TERMINATED  0.03  0.0657198  5     198.405
trainable_cnn_5da4c_00002  TERMINATED  0.1   0.0405428  5     196.356


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x199521ca808>

Train the CNN with the best learning rate from the grid search (0.1):

model = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
num_epochs = 10
train_model(model, criterion, optimizer, num_epochs, reshape=False, path='mnist_pytorch_cnn.pt')
[Figure: training and validation accuracy and log loss curves over 10 epochs]
accuracy
	training         	 (min:    0.837, max:    0.994, cur:    0.994)
	validation       	 (min:    0.863, max:    0.988, cur:    0.987)
log loss
	training         	 (min:    0.018, max:    0.502, cur:    0.018)
	validation       	 (min:    0.045, max:    0.403, cur:    0.049)
Training time: 356.850s
Test loss: 0.031
Test accuracy: 99.020%
confusion_matrix(model,'CNN',reshape=False)
[Figure: CNN confusion matrix]