diff --git a/labs/advanced1/README.md b/labs/advanced1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..026346a6949a93b022c0222bdb5acc95cafe1a8e
--- /dev/null
+++ b/labs/advanced1/README.md
@@ -0,0 +1,43 @@
+# Advanced lab 1: Turning GPT into BERT
+
+## Objective
+
+In this assignment, you will take the existing from-scratch implementation of the GPT architecture from lab 2 and modify it to implement the BERT architecture with minimal necessary changes. You will validate your implementation by loading pre-trained BERT weights from [Hugging Face](https://huggingface.co) and verifying that it produces the same input-output behaviour as the official BERT model.
+
+## Instructions
+
+1. **Understand the Codebase**
+
+   - Review the provided GPT implementation (`gpt2.py`), understanding its model components, training loop, and inference behaviour.
+   - Identify key differences between GPT and BERT architectures.
+
+2. **Modify GPT to implement BERT**
+
+   - Make only the minimal necessary modifications to the existing GPT model to turn it into a BERT model.
+   - Do not restructure the code unnecessarily — your goal is to adapt, not rewrite.
+
+3. **Validate your implementation**
+
+   - Load pre-trained BERT weights from Hugging Face for the [`bert-base-uncased`](https://huggingface.co/google-bert/bert-base-uncased) model.
+   - Run random test inputs through both your BERT implementation and the Hugging Face model.
+   - Compare the outputs and check that they agree to within a small numerical tolerance (e.g. using `torch.allclose`).
+
+4. **Add your work to your portfolio**
+
+   - Provide a `diff` file showing the differences between the original GPT implementation and your modified BERT implementation.
+   - Include a short report summarising the changes you made and how you verified correctness.
+   - Add the `diff` file and the report to your lab portfolio and present it at the oral exam.
+
+## Hints & considerations
+
+- Think about key architectural differences between GPT and BERT, such as causal vs. bidirectional attention and output layers.
+- Use Hugging Face’s `AutoModel` and `AutoTokenizer` to compare outputs with a real BERT model, as in the sketch below.
+- When validating, ensure that the tokenisation and preprocessing match between both models.
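+
+As a starting point, here is a minimal sketch of what `validate.py` could look like. The module and helper names (`bert`, `load_hf_weights`) are placeholders for your own code, and the sketch assumes that your port returns the final hidden states and that you have already changed the `Config` defaults to BERT's sizes:
+
+```python
+# Sketch only: `bert`, `Config`, `Model` and `load_hf_weights` are placeholder
+# names for your own port and weight-loading helper.
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from bert import Config, Model, load_hf_weights
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+reference = AutoModel.from_pretrained("bert-base-uncased").eval()
+
+model = Model(Config()).eval()
+load_hf_weights(model, reference.state_dict())
+
+batch = tokenizer(["The quick brown fox jumps over the lazy dog."], return_tensors="pt")
+
+with torch.no_grad():
+    ours = model(batch["input_ids"])  # you may also need token_type_ids, depending on your port
+    theirs = reference(**batch).last_hidden_state
+
+print("max abs difference:", (ours - theirs).abs().max().item())
+print("allclose:", torch.allclose(ours, theirs, atol=1e-4))
+```
+
+If the outputs disagree, compare intermediate activations (embeddings, then each block's output) against the reference model to locate where the two implementations diverge.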
+
+## Deliverables
+
+- `gpt2bert.diff` – showing modifications from GPT to BERT
+- `validate.py` – demonstrating how you verified the correctness of your implementation
+- `report.md` – short report summarising your work
+
+Good luck! 🚀
diff --git a/labs/advanced1/gpt2.py b/labs/advanced1/gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf4f4e5a1f40cca52f035b18ca0b564d981d3c02
--- /dev/null
+++ b/labs/advanced1/gpt2.py
@@ -0,0 +1,118 @@
+from dataclasses import dataclass
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
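+# Hyperparameters for GPT-2 small: vocabulary size, context length, embedding width, attention heads, layers.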
+@dataclass
+class Config:
+    n_vocab: int = 50257
+    n_ctx: int = 1024
+    n_embd: int = 768
+    n_head: int = 12
+    n_layer: int = 12
+
+
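+# Tanh approximation of the GELU activation, as used in GPT-2.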
+def gelu(x):
+    return 0.5 * x * (1 + torch.tanh((2 / torch.pi) ** 0.5 * (x + 0.044715 * x**3)))
+
+
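+# Position-wise feed-forward network: expand to 4x the embedding width, apply GELU, project back.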
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, config.n_embd * 4)
+        self.c_proj = nn.Linear(config.n_embd * 4, config.n_embd)
+
+    def forward(self, x):
+        (batch_size, seq_len, n_embd) = x.shape
+        x = self.c_fc(x)
+        x = gelu(x)
+        x = self.c_proj(x)
+        return x
+
+
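+# Upper-triangular -inf mask (strictly above the diagonal) so each position can only attend to itself and earlier positions.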
+def make_causal_mask(n):
+    return torch.triu(torch.full((n, n), float("-inf")), diagonal=1)
+
+
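+# Multi-head causal self-attention with a fused QKV projection, as in GPT-2.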
+class Attention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        self.n_head = config.n_head
+        self.c_attn = nn.Linear(config.n_embd, config.n_embd * 3)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        self.register_buffer("mask", make_causal_mask(config.n_ctx), persistent=False)
+
+    def forward(self, x):
+        (batch_size, seq_len, n_embd) = x.shape
+        head_embd = n_embd // self.n_head
+        (q, k, v) = self.c_attn(x).chunk(3, dim=-1)
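+        # Split the embedding dimension into heads and move the head axis before the sequence axis.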
+        q = q.view(batch_size, seq_len, self.n_head, head_embd)
+        k = k.view(batch_size, seq_len, self.n_head, head_embd)
+        v = v.view(batch_size, seq_len, self.n_head, head_embd)
+        q = q.transpose(-2, -3)
+        k = k.transpose(-2, -3)
+        v = v.transpose(-2, -3)
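+        # Scaled dot-product attention: scores, causal mask, softmax, weighted sum of the values.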
+        x = q @ k.transpose(-1, -2)
+        x = x / head_embd**0.5
+        x = x + self.mask[:seq_len, :seq_len]
+        x = torch.softmax(x, dim=-1)
+        x = x @ v
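+        # Merge the heads back into a single embedding dimension and apply the output projection.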
+        x = x.transpose(-2, -3).contiguous()
+        x = x.view(batch_size, seq_len, n_embd)
+        x = self.c_proj(x)
+        return x
+
+
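+# Layer normalisation over the embedding dimension with learnable gain (g) and bias (b).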
+class LayerNorm(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.g = nn.Parameter(torch.ones(config.n_embd))
+        self.b = nn.Parameter(torch.zeros(config.n_embd))
+
+    def forward(self, x):
+        mean = x.mean(dim=-1, keepdim=True)
+        variance = x.var(unbiased=False, dim=-1, keepdim=True)
+        return self.g * (x - mean) / torch.sqrt(variance + 1e-05) + self.b
+
+
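+# Pre-LayerNorm transformer block: each sub-layer (attention, MLP) sees a normalised input and is added back as a residual.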
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = LayerNorm(config)
+        self.attn = Attention(config)
+        self.ln_2 = LayerNorm(config)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+def make_positions(n):
+    return torch.arange(n, dtype=torch.long)
+
+
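+# GPT-2 model: token and learned position embeddings, a stack of transformer blocks, a final layer norm, and a linear language-modelling head.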
+class Model(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.wte = nn.Embedding(config.n_vocab, config.n_embd)
+        self.wpe = nn.Embedding(config.n_ctx, config.n_embd)
+        self.h = nn.Sequential(*(Block(config) for _ in range(config.n_layer)))
+        self.ln_f = LayerNorm(config)
+        self.lm_head = nn.Linear(config.n_embd, config.n_vocab, bias=False)
+        self.register_buffer("pos", make_positions(config.n_ctx), persistent=False)
+
+    def forward(self, x):
+        (batch_size, seq_len) = x.shape
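+        # Token embeddings plus learned position embeddings for the first seq_len positions.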
+        wte = self.wte(x)
+        wpe = self.wpe(self.pos[:seq_len])
+        x = wte + wpe
+        x = self.h(x)
+        x = self.ln_f(x)
+        x = self.lm_head(x)
+        return x