Commit c6a82125 authored by Max Björkander

Kept working on NGM, but now something is broken

parent 1c8ba7a8
%% Cell type:code id: tags:
``` python
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers.models.bart.modeling_bart import shift_tokens_right  # shift_tokens_right is defined in the BART modeling file, not BERT's
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
import json
```
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
```
%% Output
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 636kB/s]
c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\huggingface_hub\file_download.py:123: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\maxbj\.cache\huggingface\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
warnings.warn(message)
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.9kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]
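%% Cell type:markdown id: tags:

A quick sanity check of the tokenizer (added for illustration, not part of the original notebook): `make_batch()` below pairs a question with a `[SUB]`/`[OBJ]` marker and an entity. Note that `[SUB]` is not a special token in `bert-base-uncased`, so it gets split into wordpieces unless registered via `tokenizer.add_special_tokens`; the question used here is purely illustrative.

%% Cell type:code id: tags:

``` python
# Illustrative example: encode a question paired with a marked subject entity.
enc = tokenizer("Who developed Skype?", "[SUB] Skype")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# "[SUB]" is wordpiece-split (e.g. '[', 'sub', ']') since it is not in the vocab
```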
%% Cell type:code id: tags:
``` python
class NgmOne(nn.Module):
    def __init__(self):
        super(NgmOne, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Project the 768-dim pooled BERT embedding onto the 247 relation classes
        self.linear = nn.Linear(768, 247)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokenized_seq):
        # Pooled [CLS] representation of the tokenized question/entity sequence
        x = self.bert(tokenized_seq).pooler_output
        x = self.linear(x)
        x = self.softmax(x)
        return x
```
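%% Cell type:markdown id: tags:

A minimal smoke test for `NgmOne` (a sketch added here, not in the original notebook): one tokenized question should map to a single probability distribution over the 247 relation classes.

%% Cell type:code id: tags:

``` python
# Hypothetical smoke test: expect a (1, 247) probability tensor.
m = NgmOne()
ids = tokenizer("Who developed Skype?", return_tensors="pt")["input_ids"]
with torch.no_grad():
    probs = m(ids)
print(probs.shape)  # torch.Size([1, 247])
```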
%% Cell type:code id: tags:
``` python
# def encode(batch):
#     return tokenizer(batch, padding="max_length", max_length=256, return_tensors="pt")

# def convert_to_features(example_batch):
#     input_encodings = encode(example_batch['text'])
#     target_encodings = encode(example_batch['summary'])
#     labels = target_encodings['input_ids']
#     decoder_input_ids = shift_tokens_right(
#         labels, model.config.pad_token_id, model.config.decoder_start_token_id)
#     labels[labels[:, :] == model.config.pad_token_id] = -100
#     encodings = {
#         'input_ids': input_encodings['input_ids'],
#         'attention_mask': input_encodings['attention_mask'],
#         'decoder_input_ids': decoder_input_ids,
#         'labels': labels,
#     }
#     return encodings

# def get_dataset(path):
#     df = pd.read_csv(path, sep=",", on_bad_lines='skip')
#     dataset = datasets.Dataset.from_pandas(df)
#     dataset = dataset.map(convert_to_features, batched=True)
#     columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]
#     dataset.set_format(type='torch', columns=columns)
#     return dataset
```
%% Cell type:code id: tags:
``` python
def make_batch():
    """Build one training batch from QALD-9.
    A triplet is a list of [subject entity, relation, object entity];
    positions holding a SPARQL variable are replaced with None."""
    # Load predicted data
    pred = "../data/qald-9-train-linked.json"
    # Load gold data
    gold = "../data/qald-9-train-linked.json"
    print("Beginning making batch")

    with open(pred, "r") as p, open(gold, "r") as g:
        pred = json.load(p)
        gold = json.load(g)

    inputs = []
    inputs_max_len = 0
    for d in tqdm(pred["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        # Replace SPARQL variables (tokens starting with "?") with None
        for i, t in enumerate(triplet):
            if t.startswith("?"):
                triplet[i] = None

        # Pair the question with the grounded entity, prefixed by its role marker
        if triplet[0] is not None:
            tokenized_seq = tokenizer(question, "[SUB] " + triplet[0], padding=True, truncation=True)
        elif triplet[2] is not None:
            tokenized_seq = tokenizer(question, "[OBJ] " + triplet[2], padding=True, truncation=True)
        else:
            continue  # neither subject nor object is grounded

        if inputs_max_len < len(tokenized_seq["input_ids"]):
            inputs_max_len = len(tokenized_seq["input_ids"])
        inputs.append(tokenized_seq["input_ids"])

    correct_rels_max_len = 0
    correct_rels = []
    for d in tqdm(gold["questions"]):
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        tokenized = tokenizer(triplet[1], padding=True, truncation=True)
        if correct_rels_max_len < len(tokenized["input_ids"]):
            correct_rels_max_len = len(tokenized["input_ids"])
        correct_rels.append(tokenized["input_ids"])

    # Pad every sequence with 0 ([PAD]) up to the longest in the batch
    inputs_padded = np.array([i + [0] * (inputs_max_len - len(i)) for i in inputs])
    correct_rels_padded = np.array([i + [0] * (correct_rels_max_len - len(i)) for i in correct_rels])

    print("Finished with batches")
    # BERT indexes embeddings with int64, so return LongTensors rather than IntTensors
    return torch.LongTensor(inputs_padded), torch.LongTensor(correct_rels_padded)
```
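%% Cell type:markdown id: tags:

Note that `make_batch()` returns the gold relations as padded token-id sequences, while the `nn.CrossEntropyLoss` used in the training cell below expects one class index per example, matching the 247-way output of `NgmOne`. A hedged sketch of the missing mapping, using a hypothetical `rel_strings` list of raw gold relation strings:

%% Cell type:code id: tags:

``` python
# Sketch only: map each distinct gold relation string to a class index
# so the targets line up with the model's 247 output classes.
def relations_to_indices(rel_strings):
    rel2idx = {r: i for i, r in enumerate(sorted(set(rel_strings)))}
    return torch.LongTensor([rel2idx[r] for r in rel_strings]), rel2idx
```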
%% Cell type:code id: tags:
``` python
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./models/blackbox',
#     num_train_epochs=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=10,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )
```
%% Cell type:code id: tags:
``` python
model = NgmOne()

EPOCHS = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train, corr_rels = make_batch()

for epoch in tqdm(range(EPOCHS)):
    optimizer.zero_grad()

    # Forward pass
    output = model(train)
    loss = criterion(output, corr_rels)
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Backward pass
    loss.backward()
    optimizer.step()
```
%% Output
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:399, in load_state_dict(checkpoint_file)
398 try:
--> 399 return torch.load(checkpoint_file, map_location="cpu")
400 except Exception as e:
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:713, in load(f, map_location, pickle_module, **pickle_load_args)
712 return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
--> 713 return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:930, in _legacy_load(f, map_location, pickle_module, **pickle_load_args)
929 unpickler.persistent_load = persistent_load
--> 930 result = unpickler.load()
932 deserialized_storage_keys = pickle_module.load(f, **pickle_load_args)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\serialization.py:871, in _legacy_load.<locals>.persistent_load(saved_id)
870 if root_key not in deserialized_objects:
--> 871 obj = cast(Storage, torch._UntypedStorage(nbytes))
872 obj._torch_load_uninitialized = True
RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 93763584 bytes.
During handling of the above exception, another exception occurred:
MemoryError Traceback (most recent call last)
Cell In [69], line 1
----> 1 model = NgmOne()
3 EPOCHS = 3
4 criterion = nn.CrossEntropyLoss()
Cell In [64], line 5, in NgmOne.__init__(self)
3 super(NgmOne, self).__init__()
4 self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
----> 5 self.bert = BertModel.from_pretrained("bert-base-uncased")
6 self.linear = nn.Linear(768, 247)
7 self.softmax = nn.Softmax(dim=1)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:2184, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
2181 if from_pt:
2182 if not is_sharded and state_dict is None:
2183 # Time to load the checkpoint
-> 2184 state_dict = load_state_dict(resolved_archive_file)
2186 # set dtype to instantiate the model under:
2187 # 1. If torch_dtype is not None, we use that dtype
2188 # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first
2189 # weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype
2190 # we also may have config.torch_dtype available, but we won't rely on it till v5
2191 dtype_orig = None
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\transformers\modeling_utils.py:403, in load_state_dict(checkpoint_file)
401 try:
402 with open(checkpoint_file) as f:
--> 403 if f.read().startswith("version"):
404 raise OSError(
405 "You seem to have cloned a repository without having git-lfs installed. Please install "
406 "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
407 "you cloned."
408 )
409 else:
MemoryError:
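%% Cell type:markdown id: tags:

The `MemoryError` above comes from `torch.load` materialising the full float32 BERT checkpoint in RAM while the model skeleton is also allocated. A hedged workaround sketch (assuming the installed `transformers` version supports it) is to let `from_pretrained` stream the weights instead of building a second full state dict:

%% Cell type:code id: tags:

``` python
# Assumption: the installed transformers version supports low_cpu_mem_usage.
bert = BertModel.from_pretrained("bert-base-uncased", low_cpu_mem_usage=True)
```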