import datasets
import transformers
import evaluate
import tokenizers
tokenizer.encode()
tokenizer.decode()
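As a quick illustration of the two tokenizer calls above, here is a minimal sketch; the gpt2 checkpoint is an assumed example, any pretrained tokenizer behaves the same way:

from transformers import AutoTokenizer

# load a pretrained tokenizer (gpt2 is an assumed example checkpoint)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

ids = tokenizer.encode("Wow, India has now reached the moon")
print(ids)                      # a list of token ids
print(tokenizer.decode(ids))    # recovers the original string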
[Figure: downstream NLP tasks: translation (text in source language to translated text in target language), classification (input text to predicted class/sentiment), summarization (input text to summary), and question answering (question to answer)]
" Wow, India has now reached the moon"
An excerpt from business today "What sets this mission apart is the pivotal role of artificial intelligence (AI) in guiding the spacecraft during its critical descent to the moon's surface."
He likes to stay
He likes to stray
He likes to sway
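Which of these continuations is most plausible? A language model can score each sentence. A minimal sketch, assuming a pretrained gpt2 checkpoint (not part of the slides):

import torch
from transformers import AutoTokenizer, GPT2LMHeadModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

for sentence in ["He likes to stay", "He likes to stray", "He likes to sway"]:
    ids = tokenizer(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = model(ids, labels=ids).loss      # average negative log-likelihood
    print(f"{sentence!r}: avg NLL = {loss.item():.2f}")   # lower means more likely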
[Figure: the two-stage pipeline: Language Modeling (Pre-training) on raw text, followed by Downstream tasks (Fine-tuning) on samples and labels]
a. An apple ate I
b. I ate an apple
c. I ate apple
d. an apple
e. ....
Definition
a. I enjoyed reading a book
b. I enjoyed reading a thermometer
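Concretely, a language model assigns a probability to a whole sequence by factoring it into next-token predictions via the chain rule:

\[
P(x_1, x_2, \cdots, x_T) = \prod_{i=1}^{T} P(x_i \mid x_1, x_2, \cdots, x_{i-1})
\]

Sentence (a) should therefore score higher than sentence (b), because \(P(\text{book} \mid \text{I enjoyed reading a})\) is far larger than \(P(\text{thermometer} \mid \text{I enjoyed reading a})\).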
[Figure: a transformer block (Multi-Head Attention, Feed forward NN, Add&Norm) predicting \(P(x_i)\) from the prefix \(x_1,x_2,\cdots,x_{i-1}\)]
[Figure: a transformer block with Multi-Head masked Attention predicting \(P(<mask>)\) from the corrupted input \(x_1,<mask>,\cdots,x_{T}\)]
[Figure: an encoder-decoder architecture: the decoder stacks Masked Multi-Head (Self) Attention, Multi-Head (Cross) Attention and a Feed Forward Network, each followed by Add&Norm; decoding starts from \(<go>\) and predicts \(P(<mask>)\)]
[Figure: a stack of transformer blocks, Transformer Block 1 through Transformer Block 5]
Here \(h_n[i]\) denotes the \(i\)-th output vector of the \(n\)-th transformer block \(h_n\).
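A minimal sketch (the gpt2 checkpoint and the example sentence are assumptions) of inspecting these per-block output vectors with the transformers API:

import torch
from transformers import AutoTokenizer, GPT2Model

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2").eval()

ids = tokenizer("at bell labs hamming devising a new bound", return_tensors="pt").input_ids
with torch.no_grad():
    out = model(ids, output_hidden_states=True)

# out.hidden_states[0] holds the token embeddings,
# out.hidden_states[n] holds the outputs of the n-th block
h_3 = out.hidden_states[3][0]      # shape (seq_len, 768)
print(h_3[0].shape)                # h_3[0]: the first output vector of block 3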
BookCorpus
[Figure: GPT stacks 12 transformer blocks (Transformer Block 1 through Transformer Block 12) on top of an Embedding Matrix; an input token sequence such as (<go>, at, the, bell, labs, hamming, bound, ..., new, a, devising, ..., <stop>) is fed through the stack]
[Figure: inside a block, multi-head masked attention (the per-head outputs are concatenated and passed through a linear layer) is followed by a residual connection and layer norm, then a feed forward neural network with its own residual connection and layer norm]
| Layer | Parameters (in Millions) |
|---|---|
| Embedding Layer | |
| Attention layers | |
| FFN Layers | |
| Total | |
*Without rounding the number of parameters in each layer
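One way to fill the table is to instantiate a GPT-style model and count parameters per group. A minimal sketch, assuming GPT-1-like hyperparameters (a vocabulary of 40,478 BPE tokens, 512 positions, 768-dimensional embeddings, 12 blocks, 12 heads); these sizes are assumptions, not taken from the slides:

from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size=40478, n_positions=512,
                    n_embd=768, n_layer=12, n_head=12)
model = GPT2LMHeadModel(config)          # randomly initialised, GPT-1-sized

counts = {"Embedding Layer": 0, "Attention layers": 0, "FFN Layers": 0, "Other": 0}
for name, p in model.named_parameters():
    if "wte" in name or "wpe" in name:   # token and position embeddings
        counts["Embedding Layer"] += p.numel()
    elif "attn" in name:
        counts["Attention layers"] += p.numel()
    elif "mlp" in name:
        counts["FFN Layers"] += p.numel()
    else:
        counts["Other"] += p.numel()     # layer norms, etc.

for layer, n in counts.items():
    print(f"{layer}: {n / 1e6:.2f} M")
print(f"Total: {sum(counts.values()) / 1e6:.2f} M")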
import torch
import torch.nn as nn

class TRANSFORMER(nn.Module):
    def __init__(self, vocab_size, d_model, nhead,
                 num_encoder_layers, num_decoder_layers, dim_feedforward):
        super().__init__()
        # token embeddings; index 0 is reserved for padding
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.transformer = nn.Transformer(d_model,
                                          nhead,
                                          num_encoder_layers,
                                          num_decoder_layers,
                                          dim_feedforward,
                                          batch_first=True)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        # nn.Transformer needs both a source and a target sequence
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        out = self.transformer(src, tgt)   # (batch, tgt_len, d_model)
        return self.fc(out)                # logits over the vocabulary
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super().__init__()
        # token embeddings; index 0 is reserved for padding
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.RNN(embed_dim,
                          hidden_dim,
                          batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, x, length):
        x = self.embedding(x)
        # pack the padded batch so the RNN skips padding positions
        x = pack_padded_sequence(x,
                                 lengths=length,
                                 enforce_sorted=False,
                                 batch_first=True)
        _, hidden = self.rnn(x)        # hidden: (1, batch, hidden_dim)
        return self.fc(hidden[-1])     # class logits per sequence
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)       # 3 input channels (RGB)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 10)
        self.fc2 = nn.Linear(10, 4)
        self.fc3 = nn.Linear(4, 2)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)            # flatten the feature maps
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
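As a quick sanity check that the three sketches above run, here is a dummy forward pass through each; all sizes below are made up for illustration:

import torch

lm = TRANSFORMER(vocab_size=1000, d_model=64, nhead=4,
                 num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=128)
src = torch.randint(1, 1000, (2, 10))              # batch of 2 source sequences
tgt = torch.randint(1, 1000, (2, 7))               # batch of 2 target sequences
print(lm(src, tgt).shape)                          # torch.Size([2, 7, 1000])

clf = RNN(vocab_size=1000, embed_dim=64, hidden_dim=32, num_class=2)
x = torch.randint(1, 1000, (2, 10))                # padded batch of token ids
print(clf(x, torch.tensor([10, 6])).shape)         # torch.Size([2, 2])

net = CNN()
print(net(torch.randn(2, 3, 32, 32)).shape)        # torch.Size([2, 2])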
[Figure: a dataset of N samples (indices 1 through 18 shown) in a storage]
[Figure: the core PyTorch building blocks: torch.tensor, torch.nn.Module (e.g., Linear, Transformer), torch.optim, torch.nn.Parameter, torch.autograd]
[Figure: the same N samples in a storage; the sample indices are shuffled (10, 5, 4, 3, 6, 17, ...) and fetched for the model under training, which waits until the samples are fetched]
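This shuffle-and-fetch loop is what torch.utils.data provides. A minimal sketch with a toy in-memory dataset; all names and sizes below are assumptions:

import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    """A stand-in for N samples sitting in a storage."""
    def __init__(self, n_samples=18, seq_len=8, vocab_size=100):
        self.data = torch.randint(0, vocab_size, (n_samples, seq_len))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]            # fetch one sample by index

# shuffle=True reshuffles the indices every epoch;
# num_workers > 0 would prefetch batches in background processes so the model waits less
loader = DataLoader(ToyDataset(), batch_size=4, shuffle=True)

for batch in loader:
    print(batch.shape)                   # torch.Size([4, 8]) (last batch may be smaller)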
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
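Putting these imports together, a minimal end-to-end sketch of pre-training a GPT-2-style model from scratch; the corpus, block size, and training arguments below are assumptions, not taken from the slides:

raw = load_dataset("wikitext", "wikitext-2-raw-v1")           # assumed toy corpus

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token                     # GPT-2 has no pad token

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

train = (raw["train"]
         .filter(lambda ex: len(ex["text"]) > 0)              # drop empty lines
         .map(tokenize, batched=True, remove_columns=["text"]))

# mlm=False gives causal language modeling: labels are the inputs shifted by one
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

config = GPT2Config(vocab_size=len(tokenizer), n_positions=512,
                    n_embd=768, n_layer=12, n_head=12)
model = GPT2LMHeadModel(config)                               # randomly initialised

args = TrainingArguments(output_dir="gpt-pretraining",
                         per_device_train_batch_size=8,
                         num_train_epochs=1)

trainer = Trainer(model=model, args=args,
                  train_dataset=train, data_collator=collator)
trainer.train()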
[Figure: a dataset of samples (indices 1 through 18) alongside the core PyTorch components: torch.tensor, torch.nn.Module (Linear, Transformer), torch.optim, torch.nn.Parameter, torch.autograd]