Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,976 changes: 1,976 additions & 0 deletions Notebooks/CNN/cnn_zeynep.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Notebooks/CNN/requierements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
torch
torchvision
torchaudio
88 changes: 88 additions & 0 deletions Notebooks/Transformer/train_transformer.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"import transformer_model as tf\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.utils.data as data\n",
"import math\n",
"import copy\n",
"\n",
"\n",
"# this is a test data set just to see if the model works\n",
"src_vocab_size = 5000\n",
"tgt_vocab_size = 5000\n",
"d_model = 512\n",
"num_heads = 8\n",
"num_layers = 6\n",
"d_ff = 2048\n",
"max_seq_length = 100\n",
"dropout = 0.1\n",
"\n",
"transformer = tf.Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length,\n",
" dropout)\n",
"\n",
"# Generate random sample data\n",
"src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"\n",
"# training the model\n",
"criterion = nn.CrossEntropyLoss(ignore_index=0)\n",
"optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)\n",
"\n",
"transformer.train()\n",
"\n",
"for epoch in range(100):\n",
" optimizer.zero_grad()\n",
" output = transformer(src_data, tgt_data[:, :-1])\n",
" loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch: {epoch + 1}, Loss: {loss.item()}\")\n",
"\n",
"# training model performance evaluation\n",
"transformer.eval()\n",
"\n",
"# Generate random sample validation data\n",
"val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"\n",
"with torch.no_grad():\n",
" val_output = transformer(val_src_data, val_tgt_data[:, :-1])\n",
" val_loss = criterion(val_output.contiguous().view(-1, tgt_vocab_size), val_tgt_data[:, 1:].contiguous().view(-1))\n",
" print(f\"Validation Loss: {val_loss.item()}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
213 changes: 213 additions & 0 deletions Notebooks/Transformer/transformer_model.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
{
"cells": [
{
"metadata": {},
"cell_type": "raw",
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.utils.data as data\n",
"import math\n",
"import copy\n",
"\n",
"\n",
"# The multi-head attention mechanism computes the attention between each pair of positions in a sequence.\n",
"# It consists of multiple “attention heads” that capture different aspects of the input sequence.\n",
"class MultiHeadAttention(nn.Module):\n",
" def __init__(self, d_model, num_heads):\n",
" super(MultiHeadAttention, self).__init__()\n",
" # Ensure that the model dimension (d_model) is divisible by the number of heads\n",
" assert d_model % num_heads == 0, \"d_model must be divisible by num_heads\"\n",
"\n",
" # Initialize dimensions\n",
" self.d_model = d_model # Model's dimension\n",
" self.num_heads = num_heads # Number of attention heads\n",
" self.d_k = d_model // num_heads # Dimension of each head's key, query, and value\n",
"\n",
" # Linear layers for transforming inputs\n",
" self.W_q = nn.Linear(d_model, d_model) # Query transformation\n",
" self.W_k = nn.Linear(d_model, d_model) # Key transformation\n",
" self.W_v = nn.Linear(d_model, d_model) # Value transformation\n",
" self.W_o = nn.Linear(d_model, d_model) # Output transformation\n",
"\n",
" def scaled_dot_product_attention(self, Q, K, V, mask=None):\n",
" # Calculate attention scores\n",
" attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)\n",
"\n",
" # Apply mask if provided (useful for preventing attention to certain parts like padding)\n",
" if mask is not None:\n",
" attn_scores = attn_scores.masked_fill(mask == 0, -1e9)\n",
"\n",
" # Softmax is applied to obtain attention probabilities\n",
" attn_probs = torch.softmax(attn_scores, dim=-1)\n",
"\n",
" # Multiply by values to obtain the final output\n",
" output = torch.matmul(attn_probs, V)\n",
" return output\n",
"\n",
" def split_heads(self, x):\n",
" # Reshape the input to have num_heads for multi-head attention\n",
" batch_size, seq_length, d_model = x.size()\n",
" return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)\n",
"\n",
" def combine_heads(self, x):\n",
" # Combine the multiple heads back to original shape\n",
" batch_size, _, seq_length, d_k = x.size()\n",
" return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)\n",
"\n",
" def forward(self, Q, K, V, mask=None):\n",
" # Apply linear transformations and split heads\n",
" Q = self.split_heads(self.W_q(Q))\n",
" K = self.split_heads(self.W_k(K))\n",
" V = self.split_heads(self.W_v(V))\n",
"\n",
" # Perform scaled dot-product attention\n",
" attn_output = self.scaled_dot_product_attention(Q, K, V, mask)\n",
"\n",
" # Combine heads and apply output transformation\n",
" output = self.W_o(self.combine_heads(attn_output))\n",
" return output\n",
"\n",
"\n",
"class PositionWiseFeedForward(nn.Module):\n",
" def __init__(self, d_model, d_ff):\n",
" super(PositionWiseFeedForward, self).__init__()\n",
" self.fc1 = nn.Linear(d_model, d_ff)\n",
" self.fc2 = nn.Linear(d_ff, d_model)\n",
" self.relu = nn.ReLU()\n",
"\n",
" def forward(self, x):\n",
" return self.fc2(self.relu(self.fc1(x)))\n",
"\n",
"# Positional Encoding is used to inject the position information of each token in the input sequence.\n",
"# It uses sine and cosine functions of different frequencies to generate the positional encoding.\n",
"class PositionalEncoding(nn.Module):\n",
" def __init__(self, d_model, max_seq_length):\n",
" super(PositionalEncoding, self).__init__()\n",
"\n",
" pe = torch.zeros(max_seq_length, d_model)\n",
" position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)\n",
" div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))\n",
"\n",
" pe[:, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 1::2] = torch.cos(position * div_term)\n",
"\n",
" self.register_buffer('pe', pe.unsqueeze(0))\n",
"\n",
" def forward(self, x):\n",
" return x + self.pe[:, :x.size(1)]\n",
"\n",
"\n",
"# The EncoderLayer class defines a single layer of the transformer's encoder\n",
"# multi-head self-attention mechanism -> position-wise feed-forward neural network, with residual connections, layer normalization, and dropout applied as appropriate\n",
"# Together, these components allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks\n",
"# Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.\n",
"class EncoderLayer(nn.Module):\n",
" def __init__(self, d_model, num_heads, d_ff, dropout):\n",
" super(EncoderLayer, self).__init__()\n",
" self.self_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.feed_forward = PositionWiseFeedForward(d_model, d_ff)\n",
" self.norm1 = nn.LayerNorm(d_model)\n",
" self.norm2 = nn.LayerNorm(d_model)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x, mask):\n",
" attn_output = self.self_attn(x, x, x, mask)\n",
" x = self.norm1(x + self.dropout(attn_output))\n",
" ff_output = self.feed_forward(x)\n",
" x = self.norm2(x + self.dropout(ff_output))\n",
" return x\n",
"\n",
"\n",
"# The DecoderLayer class defines a single layer of the transformer's decoder\n",
"# It consists of a multi-head self-attention mechanism, a multi-head cross-attention mechanism (that attends to the encoder's output), a position-wise feed-forward neural network, and the corresponding residual connections, layer normalization, and dropout layers.\n",
"# This combination enables the decoder to generate meaningful outputs based on the encoder's representations, taking into account both the target sequence and the source sequence.\n",
"# As with the encoder, multiple decoder layers are typically stacked to form the complete decoder part of a transformer model.\n",
"class DecoderLayer(nn.Module):\n",
" def __init__(self, d_model, num_heads, d_ff, dropout):\n",
" super(DecoderLayer, self).__init__()\n",
" self.self_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.cross_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.feed_forward = PositionWiseFeedForward(d_model, d_ff)\n",
" self.norm1 = nn.LayerNorm(d_model)\n",
" self.norm2 = nn.LayerNorm(d_model)\n",
" self.norm3 = nn.LayerNorm(d_model)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x, enc_output, src_mask, tgt_mask):\n",
" attn_output = self.self_attn(x, x, x, tgt_mask)\n",
" x = self.norm1(x + self.dropout(attn_output))\n",
" attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)\n",
" x = self.norm2(x + self.dropout(attn_output))\n",
" ff_output = self.feed_forward(x)\n",
" x = self.norm3(x + self.dropout(ff_output))\n",
" return x\n",
"\n",
"\n",
"# The Transformer class brings together the various components of a Transformer model, including the embeddings, positional encoding, encoder layers, and decoder layers.\n",
"# It provides a convenient interface for training and inference, encapsulating the complexities of multi-head attention, feed-forward networks, and layer normalization.\n",
"class Transformer(nn.Module):\n",
" def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):\n",
" super(Transformer, self).__init__()\n",
" self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)\n",
" self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)\n",
" self.positional_encoding = PositionalEncoding(d_model, max_seq_length)\n",
"\n",
" self.encoder_layers = nn.ModuleList(\n",
" [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])\n",
" self.decoder_layers = nn.ModuleList(\n",
" [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])\n",
"\n",
" self.fc = nn.Linear(d_model, tgt_vocab_size)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def generate_mask(self, src, tgt):\n",
" src_mask = (src != 0).unsqueeze(1).unsqueeze(2)\n",
" tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)\n",
" seq_length = tgt.size(1)\n",
" nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()\n",
" tgt_mask = tgt_mask & nopeak_mask\n",
" return src_mask, tgt_mask\n",
"\n",
" def forward(self, src, tgt):\n",
" src_mask, tgt_mask = self.generate_mask(src, tgt)\n",
" src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))\n",
" tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))\n",
"\n",
" enc_output = src_embedded\n",
" for enc_layer in self.encoder_layers:\n",
" enc_output = enc_layer(enc_output, src_mask)\n",
"\n",
" dec_output = tgt_embedded\n",
" for dec_layer in self.decoder_layers:\n",
" dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)\n",
"\n",
" output = self.fc(dec_output)\n",
" return output"
],
"id": "b39d5cbe2904f20d"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
65 changes: 52 additions & 13 deletions daphnes_stuff/Transformer/train_transformer.ipynb
Original file line number Diff line number Diff line change
@@ -1,13 +1,52 @@
{
"cells": [
{
"metadata": {},
"cell_type": "raw",
"source": "",
"id": "e806128d44ab1fe5"
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

import transformer_model as tm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy


# Smoke test: fit the Transformer on random token ids to confirm the model
# wiring (embedding, masking, forward pass, loss, backprop) works end to end.
# Hyperparameters follow the standard base-Transformer configuration.
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = tm.Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads,
                             num_layers, d_ff, max_seq_length, dropout)

# Random token ids drawn from [1, vocab_size); id 0 is reserved for padding.
src_batch = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_batch = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

# Padding id 0 is excluded from the loss; Adam betas/eps match the original
# "Attention Is All You Need" training setup.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(100):
    optimizer.zero_grad()
    # Teacher forcing: decoder sees targets shifted right by one position...
    logits = transformer(src_batch, tgt_batch[:, :-1])
    # ...and the loss compares each prediction against the next target token.
    loss = criterion(logits.contiguous().view(-1, tgt_vocab_size),
                     tgt_batch[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Evaluate once on fresh random data (dropout off, no gradients).
transformer.eval()

val_src_batch = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
val_tgt_batch = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

with torch.no_grad():
    val_logits = transformer(val_src_batch, val_tgt_batch[:, :-1])
    val_loss = criterion(val_logits.contiguous().view(-1, tgt_vocab_size),
                         val_tgt_batch[:, 1:].contiguous().view(-1))
    print(f"Validation Loss: {val_loss.item()}")
Loading