LSTM in PyTorch

Basics of LSTM using PyTorch

Basic import

import torch
import torch.nn as nn
import numpy as np

Seeding

#Manual seed
torch.manual_seed(786)

Input to feed LSTM network

#random data of 1x6
input = torch.rand(1,6)
print('The size of input: ',input.size())
print('The input itself: ',input)

The size of input: torch.Size([1, 6])

The input itself: tensor([[0.7105, 0.8111, 0.7059, 0.6854, 0.8432, 0.6766]])
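
Note on input shapes: with the default batch_first=False, nn.LSTM reads a 2-D tensor as an unbatched (seq_len, input_size) input and a 3-D tensor as (seq_len, batch, input_size). A minimal sketch (assuming a PyTorch version recent enough to accept unbatched inputs):

#Shape conventions of nn.LSTM (sketch, assumes PyTorch >= 1.11 for unbatched input)
lstm = nn.LSTM(6,6)                  #input_size=6, hidden_size=6, batch_first=False
unbatched = torch.rand(1,6)          #(seq_len=1, input_size=6)
batched = torch.rand(1,4,6)          #(seq_len=1, batch=4, input_size=6)
print(lstm(unbatched)[0].size())     #torch.Size([1, 6])
print(lstm(batched)[0].size())       #torch.Size([1, 4, 6])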

Minimal LSTM in PyTorch

#input_size – The number of expected features in the input x
#hidden_size – The number of features in the hidden state h
#nn.LSTM(input_size,hidden_size)
conv,a = nn.LSTM(6,6)(input)
print("The output of above LSTM model:\n ",conv)
print("The value of hidden states:\n ",a)
The output of above LSTM model:
  tensor([[ 0.1748, -0.0426, -0.0335,  0.1421, -0.1876,  0.1138]],
       grad_fn=<SqueezeBackward1>)

The value of hidden states:
  (tensor([[ 0.1748, -0.0426, -0.0335,  0.1421, -0.1876,  0.1138]],
       grad_fn=<SqueezeBackward1>), tensor([[ 0.3817, -0.1098, -0.0467,  0.3663, -0.3186,  0.1684]],
       grad_fn=<SqueezeBackward1>))
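
The second value returned by nn.LSTM is itself a tuple (h_n, c_n): the final hidden state and the final cell state. An optional initial state (h_0, c_0) can also be passed in. A small sketch with the same 1x6 input:

#Unpacking the two return values explicitly (sketch)
lstm = nn.LSTM(6,6)
output, (h_n, c_n) = lstm(input)
print(output.size())   #torch.Size([1, 6]) - one output per time step
print(h_n.size())      #torch.Size([1, 6]) - final hidden state of the layer
print(c_n.size())      #torch.Size([1, 6]) - final cell state of the layer

#Optionally provide an initial hidden and cell state
h_0 = torch.zeros(1,6)   #(num_layers, hidden_size) for an unbatched input
c_0 = torch.zeros(1,6)
output, (h_n, c_n) = lstm(input, (h_0, c_0))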

By default, num_layers (the number of stacked LSTM layers) is one.

Let us change it to two

conv,a = nn.LSTM(6,6,2)(input)
print("The output of above LSTM model:\n ",conv)
print('\n')
print("The value of hidden states:\n ",a)
The output of above LSTM model:
  tensor([[-0.0099,  0.1247, -0.0815, -0.0407, -0.0446, -0.0919]],
       grad_fn=<SqueezeBackward1>)


The value of hidden states:
  (tensor([[-0.1088, -0.0990, -0.1030, -0.0013,  0.0947, -0.2360],
        [-0.0099,  0.1247, -0.0815, -0.0407, -0.0446, -0.0919]],
       grad_fn=<SqueezeBackward1>), tensor([[-0.2402, -0.1555, -0.1816, -0.0034,  0.2623, -0.4411],
        [-0.0226,  0.2425, -0.2008, -0.0656, -0.1035, -0.2683]],
       grad_fn=<SqueezeBackward1>))

Adding Dropout

If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0

conv,a = nn.LSTM(6,6,2,dropout=0.2)(input)
print("The output of above LSTM model:\n ",conv)
print('\n')
print("The value of hidden states:\n ",a)

The output of above LSTM model: tensor([[ 0.2041, 0.0515, -0.0755, 0.1436, -0.0792, 0.0479]], grad_fn=<SqueezeBackward1>)

The value of hidden states: (tensor([[ 0.0369, 0.0682, -0.0806, -0.0235, 0.2393, -0.0630], [ 0.2041, 0.0515, -0.0755, 0.1436, -0.0792, 0.0479]], grad_fn=<SqueezeBackward1>), tensor([[ 0.0664, 0.1654, -0.1296, -0.0542, 0.4706, -0.1871], [ 0.3184, 0.1032, -0.1605, 0.2295, -0.1394, 0.0994]], grad_fn=<SqueezeBackward1>))
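
Note that this dropout is applied only between stacked layers, so it has no effect on a single-layer LSTM and, like any dropout, is disabled in eval() mode. A quick sketch:

#dropout only acts between stacked layers; with num_layers=1 PyTorch
#emits a UserWarning that the setting has no effect (sketch)
single_layer = nn.LSTM(6,6,1,dropout=0.2)   #triggers a warning
stacked = nn.LSTM(6,6,2,dropout=0.2)        #dropout applied after the first layer
stacked.eval()                              #dropout is disabled in evaluation mode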

Removing bias in LSTM network

conv,a = nn.LSTM(6,6,2,bias=False)(input)
print("The output of above LSTM model:\n ",conv)
print('\n')
print("The value of hidden states:\n ",a)

The output of above LSTM model: tensor([[ 0.0011, -0.0173, 0.0305, -0.0188, -0.0076, -0.0128]], grad_fn=<SqueezeBackward1>)

The value of hidden states: (tensor([[-0.0957, 0.1377, 0.1175, -0.0795, -0.1117, -0.1325], [ 0.0011, -0.0173, 0.0305, -0.0188, -0.0076, -0.0128]], grad_fn=<SqueezeBackward1>), tensor([[-0.1543, 0.2412, 0.2237, -0.1845, -0.2104, -0.3758], [ 0.0023, -0.0348, 0.0591, -0.0390, -0.0159, -0.0243]], grad_fn=<SqueezeBackward1>))
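
With bias=False, the per-layer bias tensors (bias_ih_l*, bias_hh_l*) are simply not created. This can be confirmed by listing the parameter names; a small sketch:

#bias=False removes the bias_ih_l*/bias_hh_l* parameters (sketch)
with_bias = nn.LSTM(6,6,2)
without_bias = nn.LSTM(6,6,2,bias=False)
print([name for name, _ in with_bias.named_parameters()])
#['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0',
# 'weight_ih_l1', 'weight_hh_l1', 'bias_ih_l1', 'bias_hh_l1']
print([name for name, _ in without_bias.named_parameters()])
#['weight_ih_l0', 'weight_hh_l0', 'weight_ih_l1', 'weight_hh_l1']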

LSTM with projections

proj_size: if > 0, uses an LSTM with projections of the corresponding size. Default: 0

conv,a = nn.LSTM(6,6,2,proj_size=1)(input)
print("The output of above LSTM model:\n ",conv)

The output of above LSTM model: tensor([[-0.0351]], grad_fn=<SqueezeBackward1>)

conv,a = nn.LSTM(6,6,2,proj_size=2)(input)
print("The output of above LSTM model:\n ",conv)

The output of above LSTM model: tensor([[0.0595, 0.0752]], grad_fn=<SqueezeBackward1>)

#This code raises an error
conv,a = nn.LSTM(6,6,2,proj_size=9)(input)
print("The output of above LSTM model:\n ",conv)

ValueError: proj_size has to be smaller than hidden_size
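
With proj_size > 0, the hidden state h_t is projected down to proj_size, so both the per-step output and h_n use proj_size while the cell state keeps hidden_size. A quick shape check (sketch):

#proj_size changes the output and hidden-state size, not the cell-state size (sketch)
output, (h_n, c_n) = nn.LSTM(6,6,2,proj_size=2)(input)
print(output.size())   #torch.Size([1, 2]) - last dim is proj_size
print(h_n.size())      #torch.Size([2, 2]) - (num_layers, proj_size)
print(c_n.size())      #torch.Size([2, 6]) - (num_layers, hidden_size)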

Bidirectional LSTM

#For bidirectional LSTM
#set bidirectional = True
conv,a = nn.LSTM(6,6,bidirectional = True)(input)
print("The output of above LSTM model:\n ",conv)

The output of above LSTM model: tensor([[ 0.1565, -0.2395, 0.1214, 0.1388, 0.1355, 0.0832, 0.0119, -0.0008, -0.2532, 0.1530, -0.0492, 0.0215]], grad_fn=<SqueezeBackward1>)

conv,a = nn.LSTM(6,6,bidirectional = True,proj_size=2)(input)
print("The output of above LSTM model:\n ",conv)

The output of above LSTM model: tensor([[ 0.2349, -0.0812, -0.0632, 0.0543]], grad_fn=<SqueezeBackward1>)
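
The bidirectional output concatenates the forward and backward directions, so its last dimension is 2*hidden_size (or 2*proj_size when projections are used), and the final states hold one entry per direction. A quick shape check (sketch):

#Bidirectional output concatenates the two directions (sketch)
output, (h_n, c_n) = nn.LSTM(6,6,bidirectional=True)(input)
print(output.size())   #torch.Size([1, 12]) - last dim is 2 * hidden_size
print(h_n.size())      #torch.Size([2, 6])  - (num_directions, hidden_size)
print(c_n.size())      #torch.Size([2, 6])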

Feeding output of LSTM to linear layer

#preparing the input
input = torch.rand(1,6)
print('The size of input: ',input.size())
print('The input itself: ',input)

The size of input: torch.Size([1, 6])

The input itself: tensor([[0.1132, 0.0979, 0.5497, 0.1461, 0.5804, 0.8175]])

#An LSTM layer
#followed by a ReLU activation function
#The output of the LSTM is fed to a linear layer,
#which is then fed to ReLU
conv,a = nn.LSTM(6,4)(input)
conv = nn.ReLU()(conv)
conv = nn.Linear(4, 3)(conv)
conv = nn.ReLU()(conv)
print('The final output: \n',conv)
print('The size of final output: \n',conv.size())

The final output: tensor([[0.4911, 0.4870, 0.2221]], grad_fn=<ReluBackward0>)

The size of final output: torch.Size([1, 3])

A complete model

#This is just a helper module
# LSTM() returns a tuple of (output, (h_n, c_n))
class extract_tensor_3d_input(torch.nn.Module):
    def forward(self,x):
        # LSTM output shape: (seq, batch, hidden), or (batch, seq, hidden) with batch_first=True
        tensor, _ = x
        # Keep only the last element along dim 1, giving a 2-D tensor of shape (dim 0, hidden)
        return tensor[:, -1]
model_lstm_linear =  torch.nn.Sequential(
                        torch.nn.LSTM(6,6),
                        extract_tensor_3d_input(),
                        torch.nn.ReLU(),
                        torch.nn.Linear(6, 20),
                        torch.nn.ReLU(),
                        torch.nn.Linear(20,20),
                        torch.nn.ReLU(),
                        torch.nn.Linear(20, 1),
                        torch.nn.Sigmoid()
                       
                )
#Loss function 
loss_func = nn.MSELoss()
#setting Stochastic gradient descent as an optimizer
from torch.optim import SGD
opt = SGD(model_lstm_linear.parameters(),lr=0.001)
#Preparing the input and output
input_data = torch.rand(50,1,6)
print('The size of input: ',input_data.size())

#random probabilities in [0, 1), used to Bernoulli-sample binary (0/1) targets
t = torch.randn(50).uniform_(0,1)
output_data = torch.bernoulli(t)
print("The output data: \n",output_data.size())

x = input_data 
y = output_data

The size of input: torch.Size([50, 1, 6])

The output data: torch.Size([50])
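
Before training, it is worth checking what the model returns for this input. With the default batch_first=False the 50 here is treated as the sequence length, and the final Linear/Sigmoid layers produce shape (50, 1), while the target y has shape (50). A sketch (the squeeze is a small adjustment, not used in the training loops below, that makes the shapes match exactly and avoids MSELoss broadcasting warnings):

#Quick shape check before training (sketch)
with torch.no_grad():
    preds = model_lstm_linear(x)
print(preds.size())   #torch.Size([50, 1])
print(y.size())       #torch.Size([50])
loss_check = nn.MSELoss()(preds.squeeze(-1), y)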

Training the model for 5000 epochs and recording time and loss history

import time
loss_history = []
ticks = time.time()
for _ in range(5000):
  opt.zero_grad()
  
  loss_value = loss_func(model_lstm_linear(x),y)
  
  loss_value.backward()
  opt.step()
 
  loss_value = loss_value.detach().cpu().numpy()
  loss_history.append(loss_value)
toc = time.time()
print("\n time taken: {}".format(toc-ticks))

time taken: 38.14278554916382

Plotting the loss history

import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(loss_history)
plt.title('Loss variation over increasing epochs')
plt.xlabel('epochs')
plt.ylabel('loss values')

The same model, but with batch_first=True, so the first dimension of the input is treated as the batch: the 50 samples are processed in parallel as one-step sequences instead of as a single 50-step sequence, which is why training is much faster.

model_lstm_linear_batch =  torch.nn.Sequential(
                        torch.nn.LSTM(6,6,batch_first=True),
                        extract_tensor_3d_input(),
                        torch.nn.ReLU(),
                        torch.nn.Linear(6, 20),
                        torch.nn.ReLU(),
                        torch.nn.Linear(20,20),
                        torch.nn.ReLU(),
                        torch.nn.Linear(20, 1),
                        torch.nn.Sigmoid()
                       
                )
#Loss function 
loss_func = nn.MSELoss()
#setting Stochastic gradient descent as an optimizer
from torch.optim import SGD
opt = SGD(model_lstm_linear_batch.parameters(),lr=0.001)
loss_history = []
ticks = time.time()
for _ in range(5000):
  opt.zero_grad()
  
  loss_value = loss_func(model_lstm_linear_batch(x),y)
  
  loss_value.backward()
  opt.step()
 
  loss_value = loss_value.detach().cpu().numpy()
  loss_history.append(loss_value)
toc = time.time()
print("\n time taken: {}".format(toc-ticks))

time taken: 4.788837432861328

%matplotlib inline

plt.plot(loss_history)
plt.title('Loss variation over increasing epochs')
plt.xlabel('epochs')
plt.ylabel('loss values')

Building the model as a class

#Class of neural network with LSTM followed by linear layers
class My_LSTM_NeuralNet(nn.Module):
  def __init__(self):
    super().__init__()
    #define LSTM layer
    self.lstm_layer = nn.LSTM(6,6)
    
    #define hidden layer
    self.input_to_hidden_layer = nn.Linear(6,4)
    #define activation function of hidden layer
    self.hidden_layer_activation = nn.ReLU()
    #define output layer
    self.hidden_to_output_layer = nn.Linear(4,1)

  #Define feed forward network based on above definitions
  def forward(self,x):
        x,_ = self.lstm_layer(x)
        x = self.hidden_layer_activation(x)
        x = self.input_to_hidden_layer(x)
        x = self.hidden_layer_activation(x)
        x = self.hidden_to_output_layer(x)
        return x
my_lstm_net = My_LSTM_NeuralNet()
#Loss function 
loss_func = nn.MSELoss()

opt = SGD(my_lstm_net.parameters(),lr=0.001)
loss_history = []
ticks = time.time()
for _ in range(5000):
  opt.zero_grad()
  
  loss_value = loss_func(my_lstm_net(x),y)
  
  loss_value.backward()
  opt.step()
 
  loss_value = loss_value.detach().cpu().numpy()
  loss_history.append(loss_value)

toc = time.time()
print("Time taken: {} seconds".format(toc-ticks))

Time taken: 41.547165393829346 seconds

Plotting the loss

%matplotlib inline

plt.plot(loss_history)
plt.title('Loss variation over increasing epochs')
plt.xlabel('epochs')
plt.ylabel('loss values')

The same class, but with batch_first=True so it processes a batch of data at a time

#Class of neural network with LSTM followed by linear layers
class My_LSTM_NeuralNet_batch(nn.Module):
  def __init__(self):
    super().__init__()
    #define LSTM layer
    self.lstm_layer = nn.LSTM(6,6,batch_first=True)
    
    #define hidden layer
    self.input_to_hidden_layer = nn.Linear(6,4)
    #define activation function of hidden layer
    self.hidden_layer_activation = nn.ReLU()
    #define output layer
    self.hidden_to_output_layer = nn.Linear(4,1)

  #Define feed forward network based on above definitions
  def forward(self,x):
        x,_ = self.lstm_layer(x)
        x = self.hidden_layer_activation(x)
        x = self.input_to_hidden_layer(x)
        x = self.hidden_layer_activation(x)
        x = self.hidden_to_output_layer(x)
        return x
my_lstm_net_batch = My_LSTM_NeuralNet_batch()
#Loss function 
loss_func = nn.MSELoss()

opt = SGD(my_lstm_net_batch.parameters(),lr=0.001)
loss_history = []
ticks = time.time()
for _ in range(5000):
  opt.zero_grad()
  
  loss_value = loss_func(my_lstm_net_batch(x),y)
  
  loss_value.backward()
  opt.step()
 
  loss_value = loss_value.detach().cpu().numpy()
  loss_history.append(loss_value)

toc = time.time()
print("Time taken: ",toc-ticks)

Time taken: 4.252270698547363

Plotting the Loss

%matplotlib inline

plt.plot(loss_history)
plt.title('Loss variation over increasing epochs')
plt.xlabel('epochs')
plt.ylabel('loss values')