Commit e51c465a authored by Max Björkander

still on ngm

parent 028fdee9
%% Cell type:code id: tags:
``` python
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
import json
```
%% Cell type:code id: tags:
``` python
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
```
%% Output
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 636kB/s]
c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\huggingface_hub\file_download.py:123: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\maxbj\.cache\huggingface\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
warnings.warn(message)
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.9kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 572kB/s]
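%% Cell type:markdown id: tags:
For reference, a quick look at what the tokenizer produces for the sentence-pair encoding used further down (illustrative only; the question and entity strings here are made up):
``` python
# Pair encoding: BERT inserts [CLS] at the start and [SEP] between and after the two texts.
enc = tokenizer("Who wrote Le Petit Prince?", "[SUB] dbr:Le_Petit_Prince")
print(enc["input_ids"])       # token ids, starting with [CLS] (101) and ending with [SEP] (102)
print(enc["token_type_ids"])  # 0 for the first segment, 1 for the second
print(enc["attention_mask"])  # all 1s here, since nothing is padded yet
```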
%% Cell type:code id: tags:
``` python
class NgmOne(nn.Module):
    def __init__(self):
        super(NgmOne, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 247)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, tokenized_seq, tokenized_mask):
        x = self.bert.forward(tokenized_seq, attention_mask=tokenized_mask)
        # Take the final hidden state of the [CLS] token as the sequence representation
        x = x[0][:, 0, :]
        x = self.linear(x)
        x = self.softmax(x)
        return x
```
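%% Cell type:markdown id: tags:
One thing worth flagging in `NgmOne`: `forward` applies `nn.Softmax` to its output, but the training loop below uses `nn.CrossEntropyLoss`, which applies log-softmax internally and expects raw logits; feeding it probabilities squashes the gradients. A minimal sketch of the usual pattern (assuming the 247 outputs are relation classes):
``` python
class NgmOneLogits(nn.Module):
    """Same architecture, but returns raw logits for use with CrossEntropyLoss."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.linear = nn.Linear(768, 247)

    def forward(self, tokenized_seq, tokenized_mask):
        x = self.bert(tokenized_seq, attention_mask=tokenized_mask)
        x = x[0][:, 0, :]      # [CLS] representation
        return self.linear(x)  # logits; apply softmax only at inference time
```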
%% Cell type:code id: tags:
``` python
# def encode(batch):
#     return tokenizer(batch, padding="max_length", max_length=256, return_tensors="pt")

# def convert_to_features(example_batch):
#     input_encodings = encode(example_batch['text'])
#     target_encodings = encode(example_batch['summary'])

#     labels = target_encodings['input_ids']
#     decoder_input_ids = shift_tokens_right(
#         labels, model.config.pad_token_id, model.config.decoder_start_token_id)
#     labels[labels[:, :] == model.config.pad_token_id] = -100

#     encodings = {
#         'input_ids': input_encodings['input_ids'],
#         'attention_mask': input_encodings['attention_mask'],
#         'decoder_input_ids': decoder_input_ids,
#         'labels': labels,
#     }

#     return encodings

# def get_dataset(path):
#     df = pd.read_csv(path, sep=",", on_bad_lines='skip')
#     dataset = datasets.Dataset.from_pandas(df)
#     dataset = dataset.map(convert_to_features, batched=True)
#     columns = ['input_ids', 'labels', 'decoder_input_ids', 'attention_mask', ]
#     dataset.set_format(type='torch', columns=columns)
#     return dataset
```
%% Cell type:code id: tags:
``` python
def make_batch():
    """Triplet is a list of [subject entity, relation, object entity], None if not present"""

    # Load predicted data
    pred = "../data/qald-9-train-linked.json"

    # Load gold data
    gold = "../data/qald-9-train-linked.json"
    print("Beginning making batch")

    with open(pred, "r") as p, open(gold, "r") as g:
        pred = json.load(p)
        gold = json.load(g)

    inputs = []
    inputs_max_len = 0
    for d in tqdm(pred["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        # Replace SPARQL variables (terms starting with "?") with None
        for t in triplet:
            if t.startswith("?"):
                triplet[triplet.index(t)] = None

        #seq = "[CLS] " + question + " [SEP] "
        if triplet[0] is not None:
            #seq += "[SUB] [SEP] " + triplet[0]
            # , padding=True, truncation=True)
            tokenized_seq = tokenizer(question, "[SUB]", triplet[0], padding=True, truncation=True)
        elif triplet[2] is not None:
            #seq += "[OBJ] [SEP] " + triplet[2]
            tokenized_seq = tokenizer(question, "[OBJ]", triplet[2], padding=True, truncation=True)

        if inputs_max_len < len(tokenized_seq["input_ids"]):
            inputs_max_len = len(tokenized_seq["input_ids"])
        inputs.append(list(tokenized_seq.values())[0])  # input_ids

    correct_rels_max_len = 0
    correct_rels = []
    for d in tqdm(gold["questions"]):
        question = d["question"][0]["string"]
        query = d["query"]["sparql"]

        # Take the first triplet in the query
        trip = query.split("WHERE")[1]
        trip = trip.replace("{", "").replace("}", "")
        triplet = trip.split(" ")

        # Remove empty strings
        triplet = [x for x in triplet if x != ""]

        tokenized = tokenizer(triplet[1], padding=True, truncation=True)
        if correct_rels_max_len < len(tokenized["input_ids"]):
            correct_rels_max_len = len(tokenized["input_ids"])
        correct_rels.append(list(tokenized.values())[0])  # input_ids

    # Pad every sequence with zeros to the batch maximum and build attention masks
    inputs_padded = np.array([i + [0]*(inputs_max_len-len(i)) for i in inputs])
    correct_rels_padded = np.array([i + [0]*(correct_rels_max_len-len(i)) for i in correct_rels])
    inputs_attention_mask = np.where(inputs_padded != 0, 1, 0)
    correct_rels_attention_mask = np.where(correct_rels_padded != 0, 1, 0)

    print("Finished with batches")
    return torch.IntTensor(inputs_padded), torch.IntTensor(inputs_attention_mask), \
        torch.IntTensor(correct_rels_padded), torch.IntTensor(correct_rels_attention_mask)
```
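%% Cell type:markdown id: tags:
A side note on the padding above: `padding=True` has no effect when the tokenizer is called on a single example (it pads to the longest sequence in the batch, and a batch of one is already its own maximum), which is why the zero-padding and attention masks are built by hand afterwards. The tokenizer can do both in one call when given the whole batch as lists; a sketch (assuming hypothetical parallel lists `questions` and `entities` collected from the dataset first):
``` python
batch = tokenizer(questions, entities, padding=True, truncation=True, return_tensors="pt")
input_ids = batch["input_ids"]            # already zero-padded to the batch maximum
attention_mask = batch["attention_mask"]  # 1 for real tokens, 0 for padding
```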
%% Cell type:code id: tags:
``` python
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./models/blackbox',
#     num_train_epochs=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     warmup_steps=10,
#     weight_decay=0.01,
#     logging_dir='./logs',
# )
```
%% Cell type:code id: tags:
``` python
model = NgmOne()

EPOCHS = 3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train, train_mask, corr_rels, correct_rels_mask = make_batch()

for epoch in tqdm(range(EPOCHS)):
    optimizer.zero_grad()

    # Forward pass
    output = model(train, train_mask)
    loss = criterion(output, corr_rels)

    # Note: with EPOCHS = 3 this print never triggers
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    # Backward pass
    loss.backward()
    optimizer.step()
```
%% Output
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Beginning making batch
100%|██████████| 408/408 [00:00<00:00, 688.03it/s]
100%|██████████| 408/408 [00:00<00:00, 2241.79it/s]
Finished with batches
0%| | 0/3 [03:03<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In [26], line 13
     11 # Forward pass
     12 output = model(train, train_mask)
---> 13 loss = criterion(output, corr_rels)
     15 if (epoch + 1) % 10 == 0:
     16     print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\modules\module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\modules\loss.py:1164, in CrossEntropyLoss.forward(self, input, target)
   1163 def forward(self, input: Tensor, target: Tensor) -> Tensor:
-> 1164     return F.cross_entropy(input, target, weight=self.weight,
   1165                            ignore_index=self.ignore_index, reduction=self.reduction,
   1166                            label_smoothing=self.label_smoothing)
File c:\Users\maxbj\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\functional.py:3014, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3012 if size_average is not None or reduce is not None:
   3013     reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
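%% Cell type:markdown id: tags:
The `RuntimeError` is about the shape of the targets: `nn.CrossEntropyLoss` expects a 1D tensor of class indices (one integer per example), but `corr_rels` here is a 2D tensor of padded token ids, one row per relation. Since the classifier head has 247 outputs, the natural fix is to map each gold relation string to an index in a 247-entry relation vocabulary and use those indices as targets. A minimal sketch, assuming a hypothetical `gold_relations` list of relation strings extracted as in `make_batch`:
``` python
# Hypothetical: build a relation-to-index vocabulary and integer targets.
rel2idx = {rel: i for i, rel in enumerate(sorted(set(gold_relations)))}
targets = torch.LongTensor([rel2idx[rel] for rel in gold_relations])  # shape: (N,)

# CrossEntropyLoss then works on (N, 247) logits and (N,) integer targets.
loss = criterion(output, targets)
```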