PyTorch from Scratch

5 minute read

Use PyTorch to build your neural network.

Imports

import torch
import torch.nn as nn   # Neural network modules
import torch.optim as optim # Optimization algorithms
import torch.nn.functional as F # Functions without learnable parameters (activations, etc.)
from torch.nn.utils.rnn import pad_sequence     # To pad batch
from torch.utils.data import DataLoader, Dataset    # Manage dataset and create mini-batch
from torch.utils.tensorboard import SummaryWriter   # TensorBoard
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
import torchvision.datasets as datasets

Define network layers

class Net(nn.Module):
    def __init__(self, input_size, class_num):
        super(Net, self).__init__()
        self.fc = nn.Linear(input_size, class_num)
        # ...
    
    def forward(self, x):
        x = F.relu(self.fc(x))
        # ...
        return x

Set device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Define hyperparameters

# Example values; tune them for your task
epochs = 10
batch_size = 64
input_size = 784    # 28 x 28 MNIST images, flattened
class_num = 10
learning_rate = 1e-3
...

Load data

train_data = datasets.MNIST(
    root='./data',
    train=True,
    transform=ToTensor(),
    download=True,
)
test_data = datasets.MNIST(
    root='./data',
    train=False,
    transform=ToTensor()
)

train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)  # No need to shuffle the test set

Custom Dataset

import os

import pandas as pd
from skimage import io

class CustomDataset(Dataset):
    def __init__(self, csv_file: str, root_dir: str, transform=None):
        self.annotations = pd.read_csv(csv_file)    # CSV rows: image filename, label
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, item):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[item, 0])
        image = io.imread(img_path)
        y_label = torch.tensor(int(self.annotations.iloc[item, 1]))

        if self.transform is not None:
            image = self.transform(image)

        return image, y_label
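
A minimal usage sketch (the CSV path and image folder below are hypothetical placeholders):

dataset = CustomDataset(
    csv_file='./data/annotations.csv',   # hypothetical CSV with "filename,label" rows
    root_dir='./data/images',            # hypothetical image folder
    transform=ToTensor(),
)
loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)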

Initialize network

model = Net(input_size, class_num).to(device)   # Remember to move the model to the device

Loss and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)    # lr is alpha; betas (beta_1, beta_2) default to (0.9, 0.999)

Train network

for epoch in range(epochs):
    for i, (data, label) in enumerate(train_loader):
        # Set device
        data = data.to(device)
        label = label.to(device)

        # Reshape data
        x = data.reshape(data.shape[0], -1) # Flatten

        # Forward
        y_pred = model(x)
        loss = criterion(y_pred, label)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Gradient Descent
        optimizer.step()

Check accuracy on training and test set

def check_accuracy(loader, model, device):
    if loader.dataset.train:
        print('Checking training set accuracy...')
    else:
        print('Checking test set accuracy...')

    correct = 0.
    total = 0.
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device).reshape(x.shape[0], -1)    # Flatten, as in training
            y = y.to(device)
            result = model(x)
            _, y_pred = result.max(1)   # max(1) returns (values, indices); take the indices as predictions
            correct += (y_pred == y).sum().item()
            total += y_pred.size(0)

    model.train()   # Toggle model back to train
    return correct / total
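
A quick usage sketch after training:

print(f'Train accuracy: {check_accuracy(train_loader, model, device):.4f}')
print(f'Test accuracy: {check_accuracy(test_loader, model, device):.4f}')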

Save and Load model

  • Define
def save_checkpoint(state: dict, filename=save_file) -> None:
    print('Saving checkpoint...')
    torch.save(state, filename)

def load_checkpoint(checkpoint) -> None:
    print('Loading checkpoint...')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
  • Usage
if load_model:
    load_checkpoint(torch.load(save_file))

if epoch == epochs - 1:   # Save at the last epoch
    state_dict = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(state_dict)
...

Deal with imbalanced datasets

  1. Oversampling - sample minority-class examples more often (optionally combined with data augmentation)
  2. Class weighting - weight each class's contribution to the loss (see the sketch below)
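
A minimal sketch of both ideas, reusing the MNIST data from above (MNIST is roughly balanced, so this only illustrates the mechanics; .targets is the torchvision MNIST label tensor):

from torch.utils.data import WeightedRandomSampler

# 1. Oversampling: sample rare classes more often
targets = train_data.targets                                   # tensor of labels
class_counts = torch.bincount(targets)
sample_weights = 1.0 / class_counts[targets].float()           # rarer class -> larger sampling weight
sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
balanced_loader = DataLoader(train_data, batch_size=batch_size, sampler=sampler)   # do not pass shuffle with a sampler

# 2. Class weighting: scale each class's loss term
class_weights = (class_counts.sum() / class_counts.float()).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)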

How to save training time

  1. Overfit a single batch first

     # Increase batch_size from 1
     data, target = next(iter(train_loader))
       
     # for batch, (data, targets) in enumerate(train_loader):
     #   training process...
    

    This checks that the network has no bugs and can overfit a single batch before spending time on the full dataset.

  2. Set training and evaluation mode. In check_accuracy(), switch the model to eval() first, then switch back to train() afterwards:

     model.eval()    # Get rid of dropout or BatchNorm...
     check_accuracy()
     model.train()
    
  3. Remember .zero_grad()

     optimizer.zero_grad()
     loss.backward()
     optimizer.step()
    
  4. Do NOT apply softmax() at the output layer when using nn.CrossEntropyLoss. CrossEntropyLoss already applies (log-)softmax internally before computing the loss, so adding your own softmax means it gets applied twice, which squashes the logits and can make the gradients vanish. A short sketch follows:
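
     A minimal sketch (model, criterion, x, and label as in the training loop above):

     logits = model(x)                    # raw scores; no softmax in forward()
     loss = criterion(logits, label)      # nn.CrossEntropyLoss applies log-softmax internally
     # probs = F.softmax(logits, dim=1)   # only if probabilities are needed for reporting; never fed to the loss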

  5. Disable bias when using BatchNorm. When a Conv layer is followed by BatchNorm, the BatchNorm shift makes the Conv bias redundant:

    self.conv = nn.Conv2d(..., bias=False)  # bias=False: BatchNorm's learnable shift replaces it
    self.bn = nn.BatchNorm2d(8)             # num_features = the Conv layer's out_channels
    self.bn(self.conv(x))    # Use BatchNorm after Conv layer
    
  6. Distinguish between view and permute

    x = torch.tensor([[1, 2, 3], [4, 5, 6]]) # (2, 3)
    x.view(-1)                               # (6,)    [1, 2, 3, 4, 5, 6]
    x.view(3, 2)                             # (3, 2) [[1, 2], [3, 4], [5, 6]]
    x.permute(1, 0)                          # (3, 2) [[1, 4], [2, 5], [3, 6]]   Transpose
    
  7. Correct data augmentation
  8. Shuffle at the DataLoader (training set only)
  9. Normalize data (a combined sketch for 7-9 follows)
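
     A minimal sketch for items 7-9 combined (the rotation angle and the MNIST mean/std are the commonly used values, included for illustration):

     train_transform = transforms.Compose([
         transforms.RandomRotation(10),               # mild augmentation that keeps digits readable
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,)),  # normalize with dataset statistics
     ])
     train_data = datasets.MNIST(root='./data', train=True, transform=train_transform, download=True)
     train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)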
  10. Clip gradients (when using RNNs, GRUs, and LSTMs)

    • To prevent the gradients from becoming too large.
    • If the gradient norm exceeds the max_norm, the gradient will be scaled down.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

TensorBoard

writer = SummaryWriter(f'runs/...')
step = 0
# Inside the training loop: log per-step metrics
writer.add_scalar('Training loss', loss, global_step=step)
writer.add_scalar('Training accuracy', acc, global_step=step)
# writer.add_histogram('fc', model.fc.weight)
step += 1
# After the loop: log hyperparameters and final metrics for this run
writer.add_hparams({'Training model': rnn_models[i]}, {'Accuracy': train_acc, 'Loss': sum(losses)/len(losses)})

Process before training

  1. Train/Val/Test Split - separate data into fixed train/val/test set
  2. File loading - load the corpus from various file formats
  3. Tokenization - break sentences into list of words
  4. Vocab - generate a vocabulary list
  5. Numericalize/Indexify - map each word to an integer index for the entire corpus
  6. Word vectors - either initialize them randomly or load pre-trained embeddings; pre-trained embeddings should be “trimmed” (only keep vectors for words in the vocabulary in memory)
  7. Batching - generate batches of training samples; padding normally happens here
  8. Embedding lookup - map each sentence (a list of word indices) to fixed-dimension word vectors (see the toy sketch below)
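
A toy end-to-end sketch of steps 3-8, using the pad_sequence helper imported above (the two-sentence corpus, the whitespace tokenizer, and the 50-dim embedding are purely illustrative):

corpus = ['hello world', 'pytorch from scratch']                        # toy corpus
tokenized = [sentence.split() for sentence in corpus]                   # 3. Tokenization
words = sorted({w for sentence in tokenized for w in sentence})
vocab = {'<pad>': 0, **{w: i + 1 for i, w in enumerate(words)}}         # 4. Vocab
indexed = [torch.tensor([vocab[w] for w in s]) for s in tokenized]      # 5. Numericalize
batch = pad_sequence(indexed, batch_first=True, padding_value=0)        # 7. Batching + padding
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=50)   # 6. Word vectors (randomly initialized here)
vectors = embedding(batch)                                              # 8. Embedding lookup -> (batch, max_len, 50)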

Supplement - Einsum & Matrix Operations

  1. Permutation

    torch.einsum('ij->ji', x)
    x.permute(1, 0)
    
  2. Summation

    torch.einsum('ij->', x)
    x.sum()
    
  3. Column sum

    torch.einsum('ij->j', x)
    x.sum(axis=0)
    
  4. Row sum

     torch.einsum('ij->i', x)
     x.sum(axis=1)
    
  5. Matrix-Vector Multiplication

    torch.einsum('ij,kj->ik', x, v)
    x.mm(v.t())
    x@v.t()
    
  6. Matrix-Matrix Multiplication

    torch.einsum('ij,kj->ik', x, q)
    x.mm(q.t())
    x@q.t()
    
  7. Dot Product

    torch.einsum('ij,ij->', x, x)
    torch.sum(x*x)
    
  8. Element-wise Multiplication

    torch.einsum('ij,ij->ij', x, x)
    x*x
    
  9. Outer Product

    # a: (3,)
    # b: (5,)
    # outer product: (3, 5)
    torch.einsum('i,j->ij', a, b)
    torch.ger(a, b)   # torch.outer() in newer PyTorch versions
    
  10. Matrix Diagonal

    torch.einsum('ii->i', x)
    x.diag()
    
  11. Matrix Trace

    torch.einsum('ii->', x)
    x.trace()