diff --git a/project/load_data.py b/project/load_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba39cfa893454ea70defaa103f0891661a630120
--- /dev/null
+++ b/project/load_data.py
@@ -0,0 +1,55 @@
+import pandas as pd
+import numpy as np
+
+def load_data():
+    file_path = 'C:\\repos\\text-mining\\project\\all_games.csv'
+    df = pd.read_csv(file_path)
+    return df
+
+def split_data(df):
+    """
+    Split the data into two parts: a training dataset and a test dataset.
+    Returns the two parts as a tuple.
+    """
+    test_data = pd.DataFrame()  # Initialize an empty DataFrame for test data
+    drop_indexes = []
+    min_review_score = 1
+    max_review_score = 100
+    for i in range(min_review_score, max_review_score + 1):
+        row = df.loc[df['meta_score'] == i]
+        # Skip meta_scores that do not occur in the data
+        if row.empty:
+            continue
+        if row.isnull().values.any():
+            print(f"Warning: null values found for meta_score {i}")
+        # Select only the first row for the current meta_score
+        first_row = row.iloc[0]
+        drop_indexes.append(first_row.name)
+
+        # Add the first row to test_data
+        test_data = pd.concat([test_data, pd.DataFrame([first_row])])
+    # Remove the rows that were moved into the test data from the original data
+    df = df.drop(drop_indexes).copy()
+    # Shuffle the remaining data for training
+    df = df.sample(frac=1.0, random_state=200)
+    training_data = df
+    return training_data, test_data
+
+
+def prep_data(df):
+    print(f"Size before prep: {len(df)}")
+
+    df = df.dropna(subset=["summary"])
+    print(f"Size after first drop: {len(df)}")
+    training, test = split_data(df)
+    print(f"Size of training: {len(training)}\nSize of test: {len(test)}")
+    train_X = np.array(training["summary"])
+    train_Y = np.array(training["meta_score"])
+    test_X = np.array(test["summary"])
+    test_Y = np.array(test["meta_score"])
+    return train_X, train_Y, test_X, test_Y
+
+def load_and_prep_data():
+    df = load_data()
+    return prep_data(df)
+
diff --git a/project/nn_pytorch_gpu.py b/project/nn_pytorch_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..b367e89c0ffc39f39b7498961aa88e9862c93df8
--- /dev/null
+++ b/project/nn_pytorch_gpu.py
@@ -0,0 +1,175 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, TensorDataset
+from transformers import AutoTokenizer
+import matplotlib.pyplot as plt
+from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
+
+from load_data import load_and_prep_data
+
+# for GPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# load data
+train_X, train_Y, test_X, test_Y = load_and_prep_data()
+
+# params
+max_sequence_length = 180
+embedding_dim = 6000  # unused here; the LSTM model below hard-codes its embedding size
+RNN = True  # should, for the report, always be True
+
+# fetching tokenizer
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+X_train_sequences = tokenizer.batch_encode_plus(
+    list(train_X), padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors="pt"
+)['input_ids']
+
+X_test_sequences = tokenizer.batch_encode_plus(
+    list(test_X), padding='max_length', truncation=True, max_length=max_sequence_length, return_tensors="pt"
+)['input_ids']
+max_vocab_size = tokenizer.vocab_size
+
+def pad_sequences(sequences, maxlen, padding='post', truncating='pre'):
+    # The tokenizer already pads/truncates to max_length, so this is mostly a no-op,
+    # but it mirrors the interface of the Keras pad_sequences used in nn_tensorflow.py.
+    padded = torch.zeros((len(sequences), maxlen), dtype=torch.long)
+    for i, seq in enumerate(sequences):
+        if truncating == 'pre':
+            seq = seq[-maxlen:]
+        else:
+            seq = seq[:maxlen]
+        if padding == 'post':
+            padded[i, :len(seq)] = seq.to(torch.long)
+        else:
+            padded[i, -len(seq):] = seq.to(torch.long)
+    return padded
+
+X_train = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='pre').to(device)
+X_test = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='pre').to(device)
+
+y_train = torch.tensor(train_Y, dtype=torch.float32).unsqueeze(1).to(device)
+y_test = torch.tensor(test_Y, dtype=torch.float32).unsqueeze(1).to(device)
+
+X_train_tensor = torch.clamp(X_train, max=max_vocab_size - 1)
+X_test_tensor = torch.clamp(X_test, max=max_vocab_size - 1)
+
+dataset_train = TensorDataset(X_train_tensor, y_train)
+dataset_test = TensorDataset(X_test_tensor, y_test)
+
+dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)
+dataloader_test = DataLoader(dataset_test, batch_size=32, shuffle=False)
+
+#class CNNModel(nn.Module):
+#    def __init__(self, vocab_size, embedding_dim, output_dim):
+#        super(CNNModel, self).__init__()
+#        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+#        self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=5)
+#        self.pool = nn.AdaptiveMaxPool1d(1)
+#        self.fc1 = nn.Linear(64, 32)
+#        self.dropout = nn.Dropout(0.5)
+#        self.fc2 = nn.Linear(32, output_dim)
+#
+#    def forward(self, x):
+#        x = self.embedding(x).permute(0, 2, 1)
+#        x = F.relu(self.conv1(x))
+#        x = self.pool(x).squeeze(2)
+#        x = F.relu(self.fc1(x))
+#        x = self.dropout(x)
+#        return self.fc2(x)
+#
+#class LSTMModelAlt(nn.Module):
+#    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
+#        super(LSTMModelAlt, self).__init__()
+#        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+#        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
+#        self.fc1 = nn.Linear(hidden_dim, 16)
+#        #self.fc2 = nn.Linear(16, hidden_dim)
+#
+#        self.dropout = nn.Dropout(0.5)
+#        self.fc3 = nn.Linear(16, output_dim)
+#
+#    def forward(self, x):
+#        x = self.embedding(x)
+#        x, _ = self.lstm(x)
+#        x = x[:, -1, :]
+#        x = F.relu(self.fc1(x))
+#        x = self.dropout(x)
+#
+#        return self.fc3(x)
+
+class LSTMModel(nn.Module):
+    def __init__(self, vocab_size):
+        super(LSTMModel, self).__init__()
+        self.embedding = nn.Embedding(vocab_size, 128)
+        self.lstm = nn.LSTM(128, 32, batch_first=True)
+        self.fc1 = nn.Linear(32, 16)
+        self.dropout = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(16, 1)
+
+    def forward(self, x):
+        x = self.embedding(x)
+        x, _ = self.lstm(x)
+        # use the hidden state at the last time step
+        x = x[:, -1, :]
+        x = F.relu(self.fc1(x))
+        x = self.dropout(x)
+        return self.fc2(x)
+
+model = LSTMModel(max_vocab_size)
+model.to(device)  # to GPU
+
+# training time
+optimizer = optim.Adam(model.parameters(), lr=0.001)
+criterion = nn.MSELoss()
+
+
+def plot_losses(train_losses, epochs):
+    # Plot the training loss
+    plt.figure(figsize=(10, 6))
+    plt.plot(range(1, epochs + 1), train_losses, marker='o', label='Training Loss')
+    plt.xlabel('Epochs')
+    plt.ylabel('Loss')
+    plt.title('Training Loss Over Epochs')
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+def training():
+    # training
+    losses = []
+    epochs = 24
+    for epoch in range(epochs):
+        batch_losses = []
+        model.train()
+        for batch_X, batch_Y in dataloader_train:
+            batch_X, batch_Y = batch_X.to(device), batch_Y.to(device)  # Move data to GPU
+            optimizer.zero_grad()
+            output = model(batch_X)
+            loss = criterion(output, batch_Y)
+            loss.backward()
+
+            optimizer.step()
+            batch_losses.append(loss.item())
+        avg_loss = sum(batch_losses) / len(batch_losses)
+        print(f"Epoch {epoch}: Loss = {avg_loss}")
+        losses.append(avg_loss)
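+        # A per-epoch validation pass could be added here to monitor overfitting;
+        # a minimal sketch only, reusing the criterion and dataloader_test defined above:
+        #     model.eval()
+        #     with torch.no_grad():
+        #         val_loss = sum(criterion(model(bX), bY).item()
+        #                        for bX, bY in dataloader_test) / len(dataloader_test)
+        #     model.train()
+        #     print(f"Epoch {epoch}: Val loss = {val_loss}")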
+    # eval
+    model.eval()
+    y_true = []
+    y_pred = []
+    with torch.no_grad():
+        for batch_X, batch_Y in dataloader_test:
+            batch_X, batch_Y = batch_X.to(device), batch_Y.to(device)
+            predictions = model(batch_X)
+            y_true.extend(batch_Y.cpu().numpy())
+            y_pred.extend(predictions.cpu().numpy())
+    mae_value = mean_absolute_error(y_true, y_pred)
+    mse = mean_squared_error(y_true, y_pred)
+    r2 = r2_score(y_true, y_pred)
+
+    print(f"MAE: {mae_value}, MSE: {mse}, R2: {r2}")
+
+    plot_losses(losses, epochs)
+
+training()
+
diff --git a/project/nn_tensorflow.py b/project/nn_tensorflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d8405e73a2d82265418cc0d70bf159418ff3faa
--- /dev/null
+++ b/project/nn_tensorflow.py
@@ -0,0 +1,89 @@
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, LSTM
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.optimizers import Adam
+from load_data import load_and_prep_data
+import matplotlib.pyplot as plt
+
+RUN_NETWORK = True
+RNN = True
+
+# prep data and parameters
+train_X, train_Y, test_X, test_Y = load_and_prep_data()
+max_vocab_size = 6000
+max_sequence_length = 180
+embedding_dim = 6000
+
+# custom tokenizer
+# (note: fitting on test_X as well lets test vocabulary leak into the tokenizer)
+tokenizer = Tokenizer(num_words=max_vocab_size)
+tokenizer.fit_on_texts(train_X)
+tokenizer.fit_on_texts(test_X)
+
+
+X_train_sequences = tokenizer.texts_to_sequences(train_X)
+X_test_sequences = tokenizer.texts_to_sequences(test_X)
+
+
+X_train = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post', truncating='pre')
+X_test = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='pre')
+
+y_train = train_Y
+y_test = test_Y
+
+
+# build models --- only the RNN is used in the report
+model = Sequential()
+# 'r2_score' requires a Keras version that ships an R2 metric under that name
+all_metrics = ['mean_absolute_error', 'mean_squared_error', 'r2_score']
+if RUN_NETWORK:
+    if RNN:
+        model.add(Embedding(input_dim=max_vocab_size, output_dim=128))
+        model.add(LSTM(32, return_sequences=False))
+        model.add(Dense(16, activation='relu'))
+        model.add(Dropout(0.5))
+        model.add(Dense(1, activation='linear'))
+        model.compile(
+            loss='mean_squared_error',
+            optimizer=Adam(learning_rate=0.001),
+            metrics=all_metrics
+        )
+
+        history = model.fit(
+            X_train,
+            y_train,
+            epochs=24,
+            batch_size=32,
+            validation_data=(X_test, y_test)
+        )
+
+        plt.plot(history.history['loss'], label='Training Loss')
+        plt.plot(history.history['val_loss'], label='Validation Loss')
+        plt.xlabel('Epochs')
+        plt.ylabel('Loss')
+        plt.legend()
+        plt.title('Loss per Epoch')
+        plt.show()
+    else:
+        # CNN network --- works, but left out of the report
+        model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim))
+        model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
+        model.add(GlobalMaxPooling1D())
+        model.add(Dense(32, activation='relu'))
+        model.add(Dropout(0.5))
+        model.add(Dense(1, activation='linear'))
+        model.compile(
+            loss='mean_squared_error',
+            optimizer=Adam(learning_rate=0.001),
+            metrics=all_metrics
+        )
+
+        model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test))
diff --git a/project/nn_transformers_regression.py b/project/nn_transformers_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f4daa7298a43536867ff144afa6f69809038fa
--- /dev/null
+++ b/project/nn_transformers_regression.py
@@ -0,0 +1,165 @@
+import torch
+import torch.nn as nn
+import math
+from load_data import load_and_prep_data
+from transformers import AutoTokenizer
+from torch.utils.data import Dataset, DataLoader
+from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
+import matplotlib.pyplot as plt
+
+
+class ReviewDataset(Dataset):
+    def __init__(self, text_summaries, review_scores, tokenizer):
+        self.text_summaries = text_summaries
+        self.review_scores = review_scores
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.text_summaries)
+
+    def __getitem__(self, idx):
+        tokens = self.tokenizer(self.text_summaries[idx], padding='max_length', truncation=True, return_tensors="pt")
+        label = self.review_scores[idx] / 100.0  # normalize the label (review score) to [0, 1]
+        return tokens, label
+
+class PositionalEncoding(nn.Module):
+
+    def __init__(self, d_model, dropout=0.1, max_len=5000):
+        super(PositionalEncoding, self).__init__()
+        self.dropout = nn.Dropout(p=dropout)
+
+        # standard sinusoidal encodings, stored as (max_len, 1, d_model) so they
+        # broadcast over a (seq_len, batch, d_model) input
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        x = x + self.pe[:x.size(0), :]
+        return self.dropout(x)
+
+class TransformerModel(nn.Transformer):
+    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
+        super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_encoder_layers=nlayers)
+        self.model_type = 'Transformer'
+        self.src_mask = None
+        self.pos_encoder = PositionalEncoding(ninp, dropout)
+        self.input_emb = nn.Embedding(ntoken, ninp)
+        self.ninp = ninp
+        # replace the parent's TransformerDecoder with a linear head for regression
+        self.decoder = nn.Linear(ninp, 1)
+        self.init_weights()
+
+    def _generate_square_subsequent_mask(self, sz):
+        return torch.log(torch.tril(torch.ones(sz, sz)))
+
+    def init_weights(self):
+        initrange = 0.1
+        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)
+        nn.init.zeros_(self.decoder.bias)
+        nn.init.uniform_(self.decoder.weight, -initrange, initrange)
+
+    def forward(self, input_ids, attention_mask):
+        # embed and scale, then switch to (seq_len, batch, ninp), which the
+        # positional encoding and the (batch_first=False) encoder expect
+        embeddings = self.input_emb(input_ids) * math.sqrt(self.ninp)
+        embeddings = self.pos_encoder(embeddings.transpose(0, 1))
+        # key padding mask is (batch, seq_len); True marks padding positions
+        output = self.encoder(embeddings, src_key_padding_mask=~attention_mask.bool())
+        # mean-pool over the sequence dimension, then map to a single score (regression)
+        output = self.decoder(output.mean(dim=0))
+        return output
+
+def plot_losses(train_losses, epochs):
+    # Plot the training loss
+    plt.figure(figsize=(10, 6))
+    plt.plot(range(1, epochs + 1), train_losses, marker='o', label='Training Loss')
+    plt.xlabel('Epochs')
+    plt.ylabel('Loss')
+    plt.title('Training Loss Over Epochs')
+    plt.legend()
+    plt.grid()
+    plt.show()
+
+text_summaries, review_scores, test_text_summaries, test_review_scores = load_and_prep_data()
+
+tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # BERT tokenizer again
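+# For orientation, roughly what a single encoded summary looks like with this tokenizer
+# (illustrative sketch; "some game summary" is a made-up input, not from the dataset):
+#     enc = tokenizer("some game summary", padding='max_length', truncation=True, return_tensors="pt")
+#     enc['input_ids'].shape       # torch.Size([1, 512]) -- padded to BERT's model_max_length
+#     enc['attention_mask'].shape  # torch.Size([1, 512]) -- 1 for real tokens, 0 for padding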
+tokenizer_vocab = tokenizer.get_vocab()
+
+# tried filtering the vocab to see if it would increase speed
+# (note: from_pretrained does not rebuild the tokenizer from this dict, so the full
+# BERT vocab is still used and the extra keyword has no real effect)
+top_n = 5000
+filtered_vocab = dict(sorted(tokenizer_vocab.items(), key=lambda item: item[1])[:top_n])
+
+tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', vocab=filtered_vocab)
+
+dataset = ReviewDataset(text_summaries, review_scores, tokenizer)
+dataloader = DataLoader(dataset, batch_size=32, shuffle=True, pin_memory=True)
+
+
+num_epochs = 24
+ntoken = tokenizer.vocab_size
+ninp = 512
+nhead = 8
+nhid = 4
+nlayers = 2
+dropout = 0.5
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = TransformerModel(ntoken, ninp, nhead, nhid, nlayers, dropout)
+loss_fn = nn.MSELoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
+model = model.to(device)
+
+test_dataset = ReviewDataset(test_text_summaries, test_review_scores, tokenizer)
+test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)
+
+all_losses = []
+for epoch in range(num_epochs):
+    model.train()
+    batch_losses = []
+    for batch in dataloader:
+        tokens, labels = batch
+        labels = labels.float().to(device)
+        input_ids = tokens['input_ids'].squeeze(1)
+        attention_mask = tokens['attention_mask'].squeeze(1)
+        predictions = model(input_ids.to(device), attention_mask.to(device))
+        loss = loss_fn(predictions, labels.unsqueeze(-1))
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        batch_losses.append(loss.item())
+    avg_loss = sum(batch_losses) / len(batch_losses)
+    all_losses.append(avg_loss)
+    print("------------------------")
+    print(f"Epoch {epoch}: Loss = {avg_loss}")
+
+
+# eval
+model.eval()
+all_predictions = []
+all_labels = []
+
+with torch.no_grad():
+    for batch in test_dataloader:
+        tokens, labels = batch
+        labels = labels.float().to(device)
+        input_ids = tokens['input_ids'].squeeze(1).to(device)
+        attention_mask = tokens['attention_mask'].squeeze(1).to(device)
+        predictions = model(input_ids, attention_mask)
+        all_predictions.extend(predictions.cpu().numpy())
+        all_labels.extend(labels.cpu().numpy())
+
+all_predictions = [pred[0] for pred in all_predictions]
+mae = mean_absolute_error(all_labels, all_predictions)
+r2 = r2_score(all_labels, all_predictions)
+mse = mean_squared_error(all_labels, all_predictions)
+
+# labels were normalized by 100, so MAE scales back by 100 and MSE by 100**2
+print(f"Mean Absolute Error: {mae * 100}")
+print(f"R2 Score: {r2}")
+print(f"Mean Squared Error: {mse * 100 ** 2}")
+
+plot_losses(all_losses, num_epochs)
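+
+# A minimal sketch of spot-checking a few predictions on the original 0-100 scale
+# (uses the all_predictions / all_labels built above; showing 5 examples is arbitrary):
+for pred, true in list(zip(all_predictions, all_labels))[:5]:
+    print(f"predicted score: {pred * 100:.1f}  actual score: {true * 100:.0f}")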