# Advanced lab 1: Turning GPT into BERT
## Objective
In this assignment, you will take the existing from-scratch implementation of the GPT architecture from lab 2 and modify it to implement the BERT architecture with minimal necessary changes. You will validate your implementation by loading pre-trained BERT weights from [Hugging Face](https://huggingface.co) and verifying that it produces the same input-output behaviour as the official BERT model.
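Before writing any code, it can help to see which pre-trained parameters you will eventually have to map onto your modified model. The snippet below is only a convenience for exploration, not part of the required deliverables; it prints the parameter names and shapes that Hugging Face exposes for `bert-base-uncased`.

```python
# Inspect the pre-trained parameters you will have to map onto your modified
# model. This prints names and shapes only; the mapping itself is your task.
from transformers import AutoModel

reference = AutoModel.from_pretrained("bert-base-uncased")
for name, tensor in reference.state_dict().items():
    print(f"{name:60s} {tuple(tensor.shape)}")
```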
## Instructions
1. **Understand the codebase**
- Review the provided GPT implementation (`gpt2.py`), understanding its model components, training loop, and inference behaviour.
- Identify key differences between GPT and BERT architectures (two of these are sketched just after this list).
2. **Modify GPT to implement BERT**
- Make only the minimal necessary modifications to the existing GPT model to turn it into a BERT model.
- Do not restructure the code unnecessarily — your goal is to adapt, not rewrite.
3. **Validate your implementation**
- Load pre-trained BERT weights from Hugging Face for the [`bert-base-uncased`](https://huggingface.co/google-bert/bert-base-uncased) model.
- Run random test inputs through both your BERT implementation and the Hugging Face model.
- Compare the outputs (for example with `torch.allclose`) to check that your implementation behaves identically, up to small numerical differences.
4. **Add your work to your portfolio**
- Provide a `diff` file showing the differences between the original GPT implementation and your modified BERT implementation.
- Include a short report summarising the changes you made and how you verified correctness.
- Add the `diff` file and the report to your lab portfolio and present it at the oral exam.
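To make the "key differences" in step 1 concrete, the sketch below illustrates two of them against the module names used in the provided `gpt2.py`: bidirectional (non-causal) attention and post-layer-norm residual blocks. It is only a sketch under those assumptions, not a complete or authoritative solution; the class name `PostLNBlock` is invented for illustration, and further differences (for example token-type embeddings, vocabulary size, and the output layers) are left to you.

```python
import torch.nn as nn

from gpt2 import Attention, LayerNorm, MLP  # assumes gpt2.py is importable as a module

# (a) Bidirectional attention: BERT attends in both directions, so the causal
#     mask added to the attention scores in Attention.forward is not applied.
#     GPT:   x = x + self.mask[:seq_len, :seq_len]
#     BERT:  drop this line (a padding mask derived from the input may replace it)

# (b) Residual structure: GPT-2 normalises *before* each sub-layer (pre-LN),
#     while BERT normalises *after* the residual addition (post-LN).
class PostLNBlock(nn.Module):  # hypothetical name, shown for illustration only
    def __init__(self, config):
        super().__init__()
        self.attn = Attention(config)  # assumed to be the bidirectional variant from (a)
        self.ln_1 = LayerNorm(config)
        self.mlp = MLP(config)
        self.ln_2 = LayerNorm(config)

    def forward(self, x):
        x = self.ln_1(x + self.attn(x))  # GPT: x = x + self.attn(self.ln_1(x))
        x = self.ln_2(x + self.mlp(x))   # GPT: x = x + self.mlp(self.ln_2(x))
        return x
```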
## Hints & considerations
- Think about key architectural differences between GPT and BERT, such as causal vs. bidirectional attention and output layers.
- Use Hugging Face’s `AutoModel` and `AutoTokenizer` to compare outputs with a real BERT model; one possible shape for such a comparison is sketched after these hints.
- When validating, ensure that the tokenisation and preprocessing match between both models.
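The sketch below shows one possible shape for that comparison. It assumes your modified model has already been loaded with the pre-trained weights, returns the final hidden states (batch × sequence length × embedding dimension), and accepts a tensor of token ids; the function name `compare_with_reference` and the tolerance are choices made here for illustration, not part of the assignment.

```python
# Sketch of a comparison between your model and the Hugging Face reference.
# Assumes my_model already carries the pre-trained weights, returns final
# hidden states, and accepts a (batch_size, seq_len) tensor of token ids;
# adapt the call below to your own forward() signature.
import torch
from transformers import AutoModel, AutoTokenizer


def compare_with_reference(my_model, text, atol=1e-4):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    reference = AutoModel.from_pretrained("bert-base-uncased").eval()
    my_model.eval()
    batch = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        ref_out = reference(**batch).last_hidden_state
        my_out = my_model(batch["input_ids"])  # your model may also need token type ids
    print("max abs difference:", (ref_out - my_out).abs().max().item())
    return torch.allclose(ref_out, my_out, atol=atol)
```

A comparison like this is only meaningful once tokenisation and preprocessing match on both sides, which is the point of the last hint above.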
## Deliverables
- `gpt2bert.diff` – showing modifications from GPT to BERT (one way to generate this file is sketched below)
- `validate.py` – demonstrating how you verified the correctness of your implementation
- `report.md` – short report summarising your work
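If you prefer to generate the diff from Python rather than with a command-line tool, one possibility is sketched below. The file name `bert.py` for your modified copy is an assumption; use whatever name you chose.

```python
# One way to produce gpt2bert.diff, assuming the modified copy of gpt2.py is
# saved as bert.py (the name bert.py is an assumption).
import difflib
import pathlib

old = pathlib.Path("gpt2.py").read_text().splitlines(keepends=True)
new = pathlib.Path("bert.py").read_text().splitlines(keepends=True)
diff = difflib.unified_diff(old, new, fromfile="gpt2.py", tofile="bert.py")
pathlib.Path("gpt2bert.diff").write_text("".join(diff))
```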
Good luck! 🚀
# gpt2.py: the provided from-scratch GPT implementation referenced in the
# lab instructions above.

from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn


@dataclass
class Config:
    n_vocab: int = 50257  # vocabulary size
    n_ctx: int = 1024     # maximum context length (number of positions)
    n_embd: int = 768     # embedding dimension
    n_head: int = 12      # number of attention heads
    n_layer: int = 12     # number of transformer blocks


def gelu(x):
    # Tanh approximation of the GELU activation, as used in GPT-2.
    return 0.5 * x * (1 + torch.tanh((2 / torch.pi) ** 0.5 * (x + 0.044715 * x**3)))


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, config.n_embd * 4)
        self.c_proj = nn.Linear(config.n_embd * 4, config.n_embd)

    def forward(self, x):
        (batch_size, seq_len, n_embd) = x.shape
        x = self.c_fc(x)
        x = gelu(x)
        x = self.c_proj(x)
        return x


def make_causal_mask(n):
    # Upper-triangular matrix of -inf above the diagonal; added to the attention
    # scores so that each position only attends to itself and earlier positions.
    return torch.triu(torch.full((n, n), float("-inf")), diagonal=1)


class Attention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.c_attn = nn.Linear(config.n_embd, config.n_embd * 3)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.register_buffer("mask", make_causal_mask(config.n_ctx), persistent=False)

    def forward(self, x):
        (batch_size, seq_len, n_embd) = x.shape
        head_embd = n_embd // self.n_head
        # Project to queries, keys, and values and split into heads.
        (q, k, v) = self.c_attn(x).chunk(3, dim=-1)
        q = q.view(batch_size, seq_len, self.n_head, head_embd)
        k = k.view(batch_size, seq_len, self.n_head, head_embd)
        v = v.view(batch_size, seq_len, self.n_head, head_embd)
        q = q.transpose(-2, -3)
        k = k.transpose(-2, -3)
        v = v.transpose(-2, -3)
        # Scaled dot-product attention with the causal mask.
        x = q @ k.transpose(-1, -2)
        x = x / head_embd**0.5
        x = x + self.mask[:seq_len, :seq_len]
        x = torch.softmax(x, dim=-1)
        x = x @ v
        # Merge the heads and project back to the embedding dimension.
        x = x.transpose(-2, -3).contiguous()
        x = x.view(batch_size, seq_len, n_embd)
        x = self.c_proj(x)
        return x


class LayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.g = nn.Parameter(torch.ones(config.n_embd))
        self.b = nn.Parameter(torch.zeros(config.n_embd))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(unbiased=False, dim=-1, keepdim=True)
        return self.g * (x - mean) / torch.sqrt(variance + 1e-05) + self.b


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config)
        self.attn = Attention(config)
        self.ln_2 = LayerNorm(config)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-layer-norm residual blocks, as in GPT-2.
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


def make_positions(n):
    return torch.arange(n, dtype=torch.long)


class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.n_vocab, config.n_embd)  # token embeddings
        self.wpe = nn.Embedding(config.n_ctx, config.n_embd)    # position embeddings
        self.h = nn.Sequential(*(Block(config) for _ in range(config.n_layer)))
        self.ln_f = LayerNorm(config)
        self.lm_head = nn.Linear(config.n_embd, config.n_vocab, bias=False)
        self.register_buffer("pos", make_positions(config.n_ctx), persistent=False)

    def forward(self, x):
        (batch_size, seq_len) = x.shape
        wte = self.wte(x)
        wpe = self.wpe(self.pos[:seq_len])
        x = wte + wpe
        x = self.h(x)
        x = self.ln_f(x)
        x = self.lm_head(x)
        return x