Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,976 changes: 1,976 additions & 0 deletions Notebooks/CNN/cnn_zeynep.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Notebooks/CNN/requierements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
torch
torchvision
torchaudio
88 changes: 88 additions & 0 deletions Notebooks/Transformer/train_transformer.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\n",
"import transformer_model as tf\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.utils.data as data\n",
"import math\n",
"import copy\n",
"\n",
"\n",
"# this is a test data set just to see if the model works\n",
"src_vocab_size = 5000\n",
"tgt_vocab_size = 5000\n",
"d_model = 512\n",
"num_heads = 8\n",
"num_layers = 6\n",
"d_ff = 2048\n",
"max_seq_length = 100\n",
"dropout = 0.1\n",
"\n",
"transformer = tf.Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length,\n",
" dropout)\n",
"\n",
"# Generate random sample data\n",
"src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"\n",
"# training the model\n",
"criterion = nn.CrossEntropyLoss(ignore_index=0)\n",
"optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)\n",
"\n",
"transformer.train()\n",
"\n",
"for epoch in range(100):\n",
" optimizer.zero_grad()\n",
" output = transformer(src_data, tgt_data[:, :-1])\n",
" loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch: {epoch + 1}, Loss: {loss.item()}\")\n",
"\n",
"# training model performance evaluation\n",
"transformer.eval()\n",
"\n",
"# Generate random sample validation data\n",
"val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)) # (batch_size, seq_length)\n",
"\n",
"with torch.no_grad():\n",
" val_output = transformer(val_src_data, val_tgt_data[:, :-1])\n",
" val_loss = criterion(val_output.contiguous().view(-1, tgt_vocab_size), val_tgt_data[:, 1:].contiguous().view(-1))\n",
" print(f\"Validation Loss: {val_loss.item()}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
213 changes: 213 additions & 0 deletions Notebooks/Transformer/transformer_model.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
{
"cells": [
{
"metadata": {},
"cell_type": "raw",
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.utils.data as data\n",
"import math\n",
"import copy\n",
"\n",
"\n",
"# The multi-head attention mechanism computes the attention between each pair of positions in a sequence.\n",
"# It consists of multiple “attention heads” that capture different aspects of the input sequence.\n",
"class MultiHeadAttention(nn.Module):\n",
" def __init__(self, d_model, num_heads):\n",
" super(MultiHeadAttention, self).__init__()\n",
" # Ensure that the model dimension (d_model) is divisible by the number of heads\n",
" assert d_model % num_heads == 0, \"d_model must be divisible by num_heads\"\n",
"\n",
" # Initialize dimensions\n",
" self.d_model = d_model # Model's dimension\n",
" self.num_heads = num_heads # Number of attention heads\n",
" self.d_k = d_model // num_heads # Dimension of each head's key, query, and value\n",
"\n",
" # Linear layers for transforming inputs\n",
" self.W_q = nn.Linear(d_model, d_model) # Query transformation\n",
" self.W_k = nn.Linear(d_model, d_model) # Key transformation\n",
" self.W_v = nn.Linear(d_model, d_model) # Value transformation\n",
" self.W_o = nn.Linear(d_model, d_model) # Output transformation\n",
"\n",
" def scaled_dot_product_attention(self, Q, K, V, mask=None):\n",
" # Calculate attention scores\n",
" attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)\n",
"\n",
" # Apply mask if provided (useful for preventing attention to certain parts like padding)\n",
" if mask is not None:\n",
" attn_scores = attn_scores.masked_fill(mask == 0, -1e9)\n",
"\n",
" # Softmax is applied to obtain attention probabilities\n",
" attn_probs = torch.softmax(attn_scores, dim=-1)\n",
"\n",
" # Multiply by values to obtain the final output\n",
" output = torch.matmul(attn_probs, V)\n",
" return output\n",
"\n",
" def split_heads(self, x):\n",
" # Reshape the input to have num_heads for multi-head attention\n",
" batch_size, seq_length, d_model = x.size()\n",
" return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)\n",
"\n",
" def combine_heads(self, x):\n",
" # Combine the multiple heads back to original shape\n",
" batch_size, _, seq_length, d_k = x.size()\n",
" return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)\n",
"\n",
" def forward(self, Q, K, V, mask=None):\n",
" # Apply linear transformations and split heads\n",
" Q = self.split_heads(self.W_q(Q))\n",
" K = self.split_heads(self.W_k(K))\n",
" V = self.split_heads(self.W_v(V))\n",
"\n",
" # Perform scaled dot-product attention\n",
" attn_output = self.scaled_dot_product_attention(Q, K, V, mask)\n",
"\n",
" # Combine heads and apply output transformation\n",
" output = self.W_o(self.combine_heads(attn_output))\n",
" return output\n",
"\n",
"\n",
"class PositionWiseFeedForward(nn.Module):\n",
" def __init__(self, d_model, d_ff):\n",
" super(PositionWiseFeedForward, self).__init__()\n",
" self.fc1 = nn.Linear(d_model, d_ff)\n",
" self.fc2 = nn.Linear(d_ff, d_model)\n",
" self.relu = nn.ReLU()\n",
"\n",
" def forward(self, x):\n",
" return self.fc2(self.relu(self.fc1(x)))\n",
"\n",
"# Positional Encoding is used to inject the position information of each token in the input sequence.\n",
"# It uses sine and cosine functions of different frequencies to generate the positional encoding.\n",
"class PositionalEncoding(nn.Module):\n",
" def __init__(self, d_model, max_seq_length):\n",
" super(PositionalEncoding, self).__init__()\n",
"\n",
" pe = torch.zeros(max_seq_length, d_model)\n",
" position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)\n",
" div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))\n",
"\n",
" pe[:, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 1::2] = torch.cos(position * div_term)\n",
"\n",
" self.register_buffer('pe', pe.unsqueeze(0))\n",
"\n",
" def forward(self, x):\n",
" return x + self.pe[:, :x.size(1)]\n",
"\n",
"\n",
"# The EncoderLayer class defines a single layer of the transformer's encoder\n",
"# multi-head self-attention mechanism -> position-wise feed-forward neural network, with residual connections, layer normalization, and dropout applied as appropriate\n",
"# Together, these components allow the encoder to capture complex relationships in the input data and transform them into a useful representation for downstream tasks\n",
"# Typically, multiple such encoder layers are stacked to form the complete encoder part of a transformer model.\n",
"class EncoderLayer(nn.Module):\n",
" def __init__(self, d_model, num_heads, d_ff, dropout):\n",
" super(EncoderLayer, self).__init__()\n",
" self.self_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.feed_forward = PositionWiseFeedForward(d_model, d_ff)\n",
" self.norm1 = nn.LayerNorm(d_model)\n",
" self.norm2 = nn.LayerNorm(d_model)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x, mask):\n",
" attn_output = self.self_attn(x, x, x, mask)\n",
" x = self.norm1(x + self.dropout(attn_output))\n",
" ff_output = self.feed_forward(x)\n",
" x = self.norm2(x + self.dropout(ff_output))\n",
" return x\n",
"\n",
"\n",
"# The DecoderLayer class defines a single layer of the transformer's decoder\n",
"# It consists of a multi-head self-attention mechanism, a multi-head cross-attention mechanism (that attends to the encoder's output), a position-wise feed-forward neural network, and the corresponding residual connections, layer normalization, and dropout layers.\n",
"# This combination enables the decoder to generate meaningful outputs based on the encoder's representations, taking into account both the target sequence and the source sequence.\n",
"# As with the encoder, multiple decoder layers are typically stacked to form the complete decoder part of a transformer model.\n",
"class DecoderLayer(nn.Module):\n",
" def __init__(self, d_model, num_heads, d_ff, dropout):\n",
" super(DecoderLayer, self).__init__()\n",
" self.self_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.cross_attn = MultiHeadAttention(d_model, num_heads)\n",
" self.feed_forward = PositionWiseFeedForward(d_model, d_ff)\n",
" self.norm1 = nn.LayerNorm(d_model)\n",
" self.norm2 = nn.LayerNorm(d_model)\n",
" self.norm3 = nn.LayerNorm(d_model)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x, enc_output, src_mask, tgt_mask):\n",
" attn_output = self.self_attn(x, x, x, tgt_mask)\n",
" x = self.norm1(x + self.dropout(attn_output))\n",
" attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)\n",
" x = self.norm2(x + self.dropout(attn_output))\n",
" ff_output = self.feed_forward(x)\n",
" x = self.norm3(x + self.dropout(ff_output))\n",
" return x\n",
"\n",
"\n",
"# The Transformer class brings together the various components of a Transformer model, including the embeddings, positional encoding, encoder layers, and decoder layers.\n",
"# It provides a convenient interface for training and inference, encapsulating the complexities of multi-head attention, feed-forward networks, and layer normalization.\n",
"class Transformer(nn.Module):\n",
" def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):\n",
" super(Transformer, self).__init__()\n",
" self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)\n",
" self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)\n",
" self.positional_encoding = PositionalEncoding(d_model, max_seq_length)\n",
"\n",
" self.encoder_layers = nn.ModuleList(\n",
" [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])\n",
" self.decoder_layers = nn.ModuleList(\n",
" [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])\n",
"\n",
" self.fc = nn.Linear(d_model, tgt_vocab_size)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def generate_mask(self, src, tgt):\n",
" src_mask = (src != 0).unsqueeze(1).unsqueeze(2)\n",
" tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)\n",
" seq_length = tgt.size(1)\n",
" nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()\n",
" tgt_mask = tgt_mask & nopeak_mask\n",
" return src_mask, tgt_mask\n",
"\n",
" def forward(self, src, tgt):\n",
" src_mask, tgt_mask = self.generate_mask(src, tgt)\n",
" src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))\n",
" tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))\n",
"\n",
" enc_output = src_embedded\n",
" for enc_layer in self.encoder_layers:\n",
" enc_output = enc_layer(enc_output, src_mask)\n",
"\n",
" dec_output = tgt_embedded\n",
" for dec_layer in self.decoder_layers:\n",
" dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)\n",
"\n",
" output = self.fc(dec_output)\n",
" return output"
],
"id": "b39d5cbe2904f20d"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
65 changes: 52 additions & 13 deletions daphnes_stuff/Transformer/train_transformer.ipynb
Original file line number Diff line number Diff line change
@@ -1,13 +1,52 @@
{
"cells": [
{
"metadata": {},
"cell_type": "raw",
"source": "",
"id": "e806128d44ab1fe5"
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

import transformer_model as tm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy


# Smoke test: fit the Transformer on random token ids to confirm the model
# wiring (embedding, masking, forward pass, loss, backprop) works end to end.
# Hyperparameters follow the standard base-Transformer configuration.
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = tm.Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads,
                             num_layers, d_ff, max_seq_length, dropout)

# Random token ids drawn from [1, vocab_size); id 0 is reserved for padding.
src_batch = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_batch = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

# Padding id 0 is excluded from the loss; Adam betas/eps match the original
# "Attention Is All You Need" training setup.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(100):
    optimizer.zero_grad()
    # Teacher forcing: decoder sees targets shifted right by one position...
    logits = transformer(src_batch, tgt_batch[:, :-1])
    # ...and the loss compares each prediction against the next target token.
    loss = criterion(logits.contiguous().view(-1, tgt_vocab_size),
                     tgt_batch[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Evaluate once on fresh random data (dropout off, no gradients).
transformer.eval()

val_src_batch = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
val_tgt_batch = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

with torch.no_grad():
    val_logits = transformer(val_src_batch, val_tgt_batch[:, :-1])
    val_loss = criterion(val_logits.contiguous().view(-1, tgt_vocab_size),
                         val_tgt_batch[:, 1:].contiguous().view(-1))
    print(f"Validation Loss: {val_loss.item()}")
Loading