PyTorch from Scratch
Use PyTorch to build your neural network.
Imports
import torch
import torch.nn as nn  # Neural network modules
import torch.optim as optim  # Optimization algorithms
import torch.nn.functional as F  # Parameter-free functions (relu, softmax, ...)
from torch.nn.utils.rnn import pad_sequence  # To pad variable-length batches
from torch.utils.data import DataLoader, Dataset  # Manage datasets and create mini-batches
from torch.utils.tensorboard import SummaryWriter  # TensorBoard logging
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
import torchvision.datasets as datasets
Define network layers
class Net(nn.Module):
    def __init__(self, input_size, class_num):
        super(Net, self).__init__()
        self.fc = nn.Linear(input_size, class_num)
        # ...

    def forward(self, x):
        x = F.relu(self.fc(x))
        # ...
        return x
Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Define hyperparameters
# Example values for MNIST
epochs = 5
batch_size = 64
input_size = 784  # 28 x 28 images, flattened
class_num = 10
learning_rate = 1e-3
...
Load data
train_data = datasets.MNIST(
    root='./data',
    train=True,
    transform=ToTensor(),
    download=True,
)
test_data = datasets.MNIST(
    root='./data',
    train=False,
    transform=ToTensor(),
)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)  # No need to shuffle the test set
Custom Dataset
import os
import pandas as pd
from skimage import io  # Provides io.imread used below

class CustomDataset(Dataset):
    def __init__(self, csv_file: str, root_dir: str, transform=None):
        self.annotations = pd.read_csv(csv_file)  # Each row: image filename, label
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, item):
        img_path = os.path.join(self.root_dir, self.annotations.iloc[item, 0])
        image = io.imread(img_path)
        y_label = self.annotations.iloc[item, 1]
        if self.transform is not None:
            image = self.transform(image)
        return image, y_label
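Usage sketch (the CSV file and image folder names below are hypothetical placeholders):
# Hypothetical files: annotations.csv maps image filenames to labels, images/ holds the images
custom_data = CustomDataset(csv_file='annotations.csv', root_dir='images/', transform=ToTensor())
custom_loader = DataLoader(custom_data, batch_size=batch_size, shuffle=True)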
Initialize network
model = Net(input_size, class_num).to(device)  # Remember to call .to(device)
Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam hyperparameters: lr (alpha), betas (beta_1, beta_2)
Train network
for epoch in range(epochs):
    for i, (data, label) in enumerate(train_loader):
        # Move to device
        data = data.to(device)
        label = label.to(device)
        # Reshape data
        x = data.reshape(data.shape[0], -1)  # Flatten (N, 1, 28, 28) -> (N, 784)
        # Forward
        y_pred = model(x)
        loss = criterion(y_pred, label)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        # Gradient descent (Adam update)
        optimizer.step()
Check accuracy on training and test set
def check_accuracy(loader, model, device):
    if loader.dataset.train:
        print('Checking training set accuracy...')
    else:
        print('Checking test set accuracy...')
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            x = x.reshape(x.shape[0], -1)  # Flatten, matching the training loop
            result = model(x)
            _, y_pred = result.max(1)  # max(1) returns (values, indices) per row; keep the indices
            correct += (y_pred == y).sum().item()
            total += y_pred.size(0)
    model.train()  # Toggle the model back to train mode
    return correct / total
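Usage sketch after training, with the loaders and model defined above:
train_acc = check_accuracy(train_loader, model, device)
test_acc = check_accuracy(test_loader, model, device)
print(f'Train accuracy: {train_acc:.4f} | Test accuracy: {test_acc:.4f}')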
Save and Load model
- Define
save_file = 'checkpoint.pth.tar'  # Example filename

def save_checkpoint(state: dict, filename=save_file) -> None:
    print('Saving checkpoint...')
    torch.save(state, filename)

def load_checkpoint(checkpoint) -> None:
    print('Loading checkpoint...')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
- Usage
if load_model:
    load_checkpoint(torch.load(save_file))

if epoch == epochs - 1:  # e.g. save after the last epoch
    state_dict = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(state_dict)
...
Deal with imbalanced datasets
- Oversampling (repeat examples from rare classes more often, optionally with data augmentation)
- Class weighting (weight each class's term in the loss); see the sketch below
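A minimal sketch of both options for a hypothetical binary dataset imbalanced_data in which class 0 is rare; the weight values are made-up examples:
from torch.utils.data import WeightedRandomSampler

# Class weighting: rarer classes get a larger weight in the loss (example numbers)
class_weights = torch.tensor([50.0, 1.0]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Oversampling: draw samples with probability inversely proportional to class frequency
sample_weights = [class_weights[label].item() for _, label in imbalanced_data]
sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
loader = DataLoader(imbalanced_data, batch_size=batch_size, sampler=sampler)  # sampler replaces shuffle=True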
How to save training time
Overfit a single batch first
# Take a single batch (start with batch_size = 1, then increase it)
data, target = next(iter(train_loader))
# for batch, (data, targets) in enumerate(train_loader):
#     ... training process ...
Check that the neural network has no bugs and can fit a single batch well before training on the full dataset.
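A minimal sketch of this check, reusing the model, criterion, and optimizer from above (the 1000 iterations is an arbitrary choice):
data, targets = next(iter(train_loader))              # One fixed batch
data, targets = data.to(device), targets.to(device)
x = data.reshape(data.shape[0], -1)                   # Flatten, as in the training loop

for _ in range(1000):
    y_pred = model(x)
    loss = criterion(y_pred, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(loss.item())  # Should be close to zero if the network can overfit one batch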
Set training and evaluation mode
In check_accuracy(), toggle the model to eval() first, then toggle back to train() afterwards. Like:
model.eval()  # Disable dropout and make BatchNorm use its running statistics
check_accuracy(test_loader, model, device)
model.train()
Remember .zero_grad()
optimizer.zero_grad()
loss.backward()
optimizer.step()
Do NOT use softmax() at the output layer when using CrossEntropyLoss()
nn.CrossEntropyLoss applies (log-)softmax internally before computing the loss, so the model should output raw logits. Applying softmax twice squashes the outputs and can make the gradients vanish.
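A quick illustration with the model and criterion defined above:
logits = model(x)                # Raw scores (logits), no softmax at the output
loss = criterion(logits, label)  # nn.CrossEntropyLoss applies log-softmax internally

# Don't do this: softmax-ing before CrossEntropyLoss applies softmax twice
# loss = criterion(F.softmax(logits, dim=1), label)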
Bias when using BatchNorm
When using BatchNorm after Conv, the conv bias is redundant because BatchNorm's own shift cancels it out:
self.conv = nn.Conv2d(..., bias=False)  # Add bias=False, important!
self.bn = nn.BatchNorm2d(8)
x = self.bn(self.conv(x))  # Use BatchNorm right after the Conv layer
Distinguish between view and permute
x = torch.tensor([[1, 2, 3], [4, 5, 6]])  # (2, 3)
x.view(-1)       # (6,)    [1, 2, 3, 4, 5, 6]
x.view(3, 2)     # (3, 2)  [[1, 2], [3, 4], [5, 6]]   (reads elements in order)
x.permute(1, 0)  # (3, 2)  [[1, 4], [2, 5], [3, 6]]   (transpose)
- Use correct data augmentation (transforms that fit the task)
- Shuffle the training data in the DataLoader (shuffle=True)
- Normalize the data (e.g. transforms.Normalize); see the sketch below
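A minimal transform pipeline sketch for MNIST (the augmentation choice and the 0.1307/0.3081 mean/std are common example values, not prescribed by this post):
my_transforms = transforms.Compose([
    transforms.RandomRotation(degrees=10),                 # Mild augmentation that preserves digit identity
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),   # MNIST statistics
])
train_data = datasets.MNIST(root='./data', train=True, transform=my_transforms, download=True)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)  # Shuffle the training set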
Clip gradients (when using RNNs, GRUs, and LSTMs)
- To prevent the gradients from becoming too large.
- If the gradient norm exceeds the max_norm, the gradient will be scaled down.
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)  # Note the trailing underscore (in-place)
optimizer.step()
TensorBoard
writer = SummaryWriter(f'runs/...')
step = 0

# Inside the training loop
writer.add_scalar('Training loss', loss, global_step=step)
writer.add_scalar('Training accuracy', acc, global_step=step)
# writer.add_histogram('fc', model.fc.weight)
step += 1

# After the training loop
writer.add_hparams({'Training model': rnn_models[i]},
                   {'Accuracy': train_acc, 'Loss': sum(losses) / len(losses)})
Preprocessing before training (text data)
- Train/Val/Test split - separate the data into fixed train/val/test sets
- File loading - load the corpus from various file formats
- Tokenization - break sentences into lists of words
- Vocab - generate a vocabulary list
- Numericalize/Indexify - map words to integer indices across the entire corpus
- Word vectors - either initialize embeddings randomly or load pre-trained embeddings; pre-trained embeddings must be "trimmed" (only keep vectors for words in the vocabulary in memory)
- Batching - generate batches of training samples; padding normally happens here (see the sketch below)
- Embedding lookup - map each sentence (a list of word indices) to fixed-dimension word vectors
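A small sketch of the last two steps, using the pad_sequence import from the top of the post (the toy indices, vocabulary size, and embedding dimension are made-up examples):
# Three "sentences" already numericalized to word indices (toy example)
seqs = [torch.tensor([4, 9, 2]), torch.tensor([7, 1]), torch.tensor([3, 5, 8, 6])]

# Batching: pad to the length of the longest sentence, with index 0 reserved for <pad>
batch = pad_sequence(seqs, batch_first=True, padding_value=0)  # Shape: (3, 4)

# Embedding lookup: map each index to a fixed-dimension vector
embedding = nn.Embedding(num_embeddings=10, embedding_dim=5, padding_idx=0)
vectors = embedding(batch)  # Shape: (3, 4, 5)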
Supplement - Einsum & Matrix Operations
Permutation
torch.einsum('ij->ji', x)
x.permute(1, 0)

Summation
torch.einsum('ij->', x)
x.sum()

Column sum
torch.einsum('ij->j', x)
x.sum(dim=0)

Row sum
torch.einsum('ij->i', x)
x.sum(dim=1)

Matrix-Vector Multiplication
torch.einsum('ij,kj->ik', x, v)  # v: (1, n) row vector
x.mm(v.t())
x @ v.t()

Matrix-Matrix Multiplication
torch.einsum('ij,kj->ik', x, q)
x.mm(q.t())
x @ q.t()

Dot Product
torch.einsum('ij,ij->', x, x)
torch.sum(x * x)

Element-wise Multiplication
torch.einsum('ij,ij->ij', x, x)
x * x

Outer Product
# a: (3,), b: (5,)  ->  outer product: (3, 5)
torch.einsum('i,j->ij', a, b)
torch.ger(a, b)  # torch.outer in newer PyTorch versions

Matrix Diagonal
torch.einsum('ii->i', x)
x.diag()

Matrix Trace
torch.einsum('ii->', x)
x.trace()