Commit a13d6f9e authored by Hampus Elinder

Adds code

Code for Master's Thesis
This code is an extract from the code used in the thesis project by Hampus Elinder at Linköping University during the autumn term (HT) 2021.
The code is intended only as an example of the implemented methods used to produce the results. Consequently, it cannot be run as-is:
the dataset is missing (it is the property of the company Our Studio and its partners), and comments may be missing or vague.
Some variables, functions, etc. have also been renamed.
import numpy as np
import pandas as pd
from IPython.display import clear_output
class SeqAR:
def __init__(self, userHistory=None, windowSize=2) -> None:
# self.userHistory = userHistory
self.windowSize = windowSize
# Mine sequential rules from the dataset
def MineRules(self, history, nrArElements):
# Init transaction and rules matrices and counters
# self.rules = np.zeros([history.shape[0]*(userConfigs.shape[1] - self.windowSize + 1), self.windowSize])
self.nrTransactions = nrArElements - len(history) * (self.windowSize - 1)
preRules = np.zeros((self.nrTransactions, self.windowSize))
rulesCount = 0
for i in range(0, len(history)):
# Fill in rules
            for j in range(0, len(history[i]) - (self.windowSize - 1)):
                newRule = history[i][j:j + self.windowSize]
                # Skip windows that contain the value -2
                if not np.any(newRule == -2):
                    preRules[rulesCount, :] = newRule
                    rulesCount += 1
self.rules = preRules[0:rulesCount, :]
self.nrTransactions = rulesCount
        # Get the unique patterns and the number of times they are present in a count vector. Reshape into a column vector.
allPatterns = self.rules[:, 0:-1]
self.patternsAndCount = self.UniqueSequenceAndCount(allPatterns)
# Same for rules as for patterns above
self.rulesAndCount = self.UniqueSequenceAndCount(self.rules)
consequentsEl, consequentsOcc = np.unique(self.rules[:, -1], return_counts=True)
# Initiate DataFrame and set rules and metrics
self.df = pd.DataFrame( index=range(0, self.rulesAndCount.shape[0]),
columns=['antecedents', 'consequents', 'antecedents support', 'consequents support', 'support', 'confidence', 'lift', 'leverage', 'conviction'])
print('Nr rows:', self.df.shape[0])
self.df['antecedents'] = self.rulesAndCount[:, 0:-2].tolist()
self.df['consequents'] = self.rulesAndCount[:, -2:-1].tolist()
        # Use a for loop to set a few metrics
        for index, row in self.df.iterrows():
            if index % 100 == 0:
                print('Currently on row:', index)
            antecedentIndex = np.all(row['antecedents'] == self.patternsAndCount[:, 0:-1], axis=1)
            # Assign through the DataFrame itself; writing to the row returned by
            # iterrows() only changes a copy and would not persist.
            self.df.at[index, 'antecedents support'] = self.patternsAndCount[antecedentIndex, -1].item() / self.nrTransactions
            self.df.at[index, 'consequents support'] = consequentsOcc[consequentsEl == row['consequents']].item() / self.nrTransactions
# Continue setting metrics with vector notation
self.df['support'] = (self.rulesAndCount[:, -1] / self.nrTransactions).tolist()
self.df['confidence'] = self.df['support'] / self.df['antecedents support']
self.df['lift'] = self.df['confidence'] / self.df['consequents support']
self.df['leverage'] = self.df['support'] - self.df['antecedents support'] * self.df['consequents support']
self.consequentsEl = consequentsEl
self.consequentsOcc = consequentsOcc
clear_output(wait=True)
# display(self.df)
def UniqueSequenceAndCount(self, sequence) -> np.ndarray:
uniqueSequence, countUniqueSequence = np.unique(sequence, axis=0, return_counts=True)
countUniqueSequence = np.reshape(countUniqueSequence, (countUniqueSequence.shape[0], 1))
return np.concatenate((uniqueSequence, countUniqueSequence), axis=1)
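A minimal usage sketch of the miner above (assuming the SeqAR class is importable). The toy history is made up purely for illustration: three item-id sequences padded with -2, the value that MineRules skips.

import numpy as np

# Hypothetical toy data: each entry is one session as a sequence of item ids,
# padded at the start with -2 (windows containing -2 are skipped).
history = [
    np.array([-2, 0, 1, 2]),
    np.array([-2, 0, 1, 3]),
    np.array([-2, 2, 0, 1]),
]
nrArElements = sum(len(h) for h in history)

miner = SeqAR(windowSize=2)
miner.MineRules(history, nrArElements)

# miner.df now holds one row per unique rule (antecedent -> consequent),
# together with its support, confidence, lift and leverage.
print(miner.df[['antecedents', 'consequents', 'support', 'confidence']])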
import numpy as np
def CalcPredScore(testPredict, Ytest, nrPredsToUse=1, printText=True):
nrActions = testPredict.shape[0]
guessMat = (-testPredict).argsort()[:, :nrPredsToUse]
gT = Ytest.argmax(axis=1)
stackedGT = np.tile(gT.reshape(-1, 1), (1, nrPredsToUse))
output = (guessMat == stackedGT)
scores = np.sum(output, axis=0)
if printText:
print('\nNr actions in total: ', nrActions, '\n')
for i in range(nrPredsToUse):
nrHitsWithIPredictions = np.sum(scores[:i + 1])
print('Guesses per action: ', i + 1)
print('Nr hits: ', nrHitsWithIPredictions)
print('Hit rate: ', nrHitsWithIPredictions / nrActions, '\n')
return output
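A small, made-up example of how the score function above is read: three actions over four possible choices, evaluated with up to two guesses per action.

import numpy as np

# Hypothetical predictions and one-hot ground truth, for illustration only.
testPredict = np.array([[0.1, 0.7, 0.1, 0.1],   # top guess: choice 1
                        [0.4, 0.3, 0.2, 0.1],   # top guess: choice 0
                        [0.2, 0.2, 0.5, 0.1]])  # top guess: choice 2
Ytest = np.array([[0, 1, 0, 0],   # true choice 1 -> hit with the 1st guess
                  [0, 1, 0, 0],   # true choice 1 -> hit with the 2nd guess
                  [1, 0, 0, 0]])  # true choice 0 -> hit with the 2nd guess

hits = CalcPredScore(testPredict, Ytest, nrPredsToUse=2)
# Prints a hit rate of 1/3 with one guess per action and 3/3 with two.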
class DatabaseMetaContainer():
"""
Holds and maintains the meta data of a database in
terms of its unique parts and features.
"""
metaData: dict
nameToVec: dict
vecToName: dict
def __init__(self) -> None:
self.metaData = {}
self.nameToVec = {}
self.vecToName = {}
    def UpdateMetaData(self, part, feature) -> None:
        # Adds the part and feature if they are not already in metaData
        if part in self.metaData:
            if feature not in self.metaData[part]:
                self.metaData[part].append(feature)
        else:
            self.metaData[part] = [feature]
def ConstructConverters(self) -> None:
self.ConstructNameToVec()
self.ConstructVecToName()
def ConstructNameToVec(self) -> None:
# Constructs a dict from metaData which maps parts and
# features to a unique number. Used to create input
# vectors from parts and features.
count = 0
partKeys = self.metaData.keys()
for part in partKeys:
for feature in self.metaData[part]:
self.nameToVec[part, feature] = count
count += 1
def ConstructVecToName(self) -> None:
self.vecToName = {y:x for x,y in self.nameToVec.items()}
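A short sketch of the intended use of the container above: register every (part, feature) pair found in the data, then build the two lookup tables. The part and feature names here are hypothetical.

meta = DatabaseMetaContainer()
meta.UpdateMetaData('seat', 'leather')   # made-up part/feature names
meta.UpdateMetaData('seat', 'fabric')
meta.UpdateMetaData('frame', 'oak')
meta.ConstructConverters()

print(meta.nameToVec)     # {('seat', 'leather'): 0, ('seat', 'fabric'): 1, ('frame', 'oak'): 2}
print(meta.vecToName[2])  # ('frame', 'oak')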
import pandas as pd
from core.DatabaseMetaContainer import DatabaseMetaContainer
import numpy as np
import pickle
class DatabaseProcessor():
database: dict
metaContainer: DatabaseMetaContainer
def __init__(self, windowSize) -> None:
self.windowSize = windowSize
self.database = {}
self.nrAnObjectMsgs = 0
self.nrMessages = 0
self.metaContainer = DatabaseMetaContainer()
def ProcessCsv(self, csvFilePath, maxMsgs=None) -> None:
# Try to read the csv file
try:
df = pd.read_csv(csvFilePath)
# Check that the file is valid
if self.ValidateCsvFile(df):
df = self.RemoveSequentialDublettes(df)
self.nrMessages = Removed
# For msg in the file, add it to a session
for iMsg in range(0, Removed):
if iMsg % 1000 == 0:
print(iMsg)
                    if maxMsgs is not None and iMsg > maxMsgs:
                        break
self.AddMsgToDatabase(df.iloc[iMsg])
        except Exception:
if isinstance(csvFilePath, str):
print('Could not read and process file: ' + csvFilePath)
else:
print('Could not read and process file')
    def RemoveSequentialDublettes(self, df: pd.DataFrame) -> pd.DataFrame:
        # Drop rows that are identical to the immediately preceding row
        dfProcessed = df[~df.eq(df.shift()).all(axis='columns')]
        return dfProcessed
def ValidateCsvFile(self, df) -> bool:
Removed
def AddMsgToDatabase(self, msg) -> None:
idIndex = 1
tokenIndex = 2
id = msg[idIndex]
newAnObject = Removed
if newAnObject != None:
if id in self.database:
sessionToUpdate = self.database[id]
                # Do not add if the new anObject is a duplicate of the previous one
if sessionToUpdate[-1] != newAnObject:
sessionToUpdate.append(newAnObject)
self.database[id] = sessionToUpdate
else:
self.database[id] = [newAnObject]
for iAnObject in newAnObject:
if len(iAnObject) != 0:
self.metaContainer.UpdateMetaData(Removed)
def SaveAsCsv(self) -> None:
tempDataFrame = pd.DataFrame.from_dict(self.database, orient='index')
pd.DataFrame.to_csv(tempDataFrame, 'mycsvfile.csv')
def ProcessMetaData(self) -> None:
self.metaContainer.ConstructConverters()
def ConvertToVecInput(self) -> list:
# Converts the database into input vectors and returns them
nrElements = len(self.metaContainer.nameToVec)
sessions = list(self.database.keys())
nrSessions = len(sessions)
inputs = np.empty((nrSessions), dtype=object)
for iSession, session in enumerate(sessions):
anObjects = self.database[session]
nrAnObjects = len(anObjects)
sessionData = np.zeros((nrAnObjects, nrElements), dtype=int)
for iAnObject in range(nrAnObjects):
featuresUsed = anObjects[iAnObject]
nrFeaturesUsed = len(featuresUsed)
if nrFeaturesUsed == 0:
continue
else:
featureVec = np.zeros((nrElements))
for iFeature in range(nrFeaturesUsed):
if len(featuresUsed[iFeature]) > 0:
featureVec[self.metaContainer.nameToVec[featuresUsed[iFeature][part], featuresUsed[iFeature]['name']]] = int(1)
sessionData[iAnObject] = featureVec
            # Only keep sessions that actually contain data
            if sessionData.size > 0:
inputs[iSession] = sessionData
return inputs
    def GetNrChoicesFromInput(self, inputs) -> int:
return len(inputs[0][0])
def PadInputs(self, inputs, nrChoices) -> None:
emptyVec = np.zeros((self.windowSize - 1, nrChoices), dtype=int)
for iSession in range(len(inputs)):
inputs[iSession] = np.vstack((emptyVec, inputs[iSession]))
def RemoveEmptySessions(self) -> None:
for key in list(self.database.keys()):
if len(self.database[key]) == 1:
if len(self.database[key][0]) == 0:
self.database.pop(key)
def SaveDatabase(self, filepath=None) -> None:
        if filepath is None:
filepath = 'database/' + str(len(list(self.database.keys()))) + '_users.pkl'
with open(filepath, 'wb') as f:
pickle.dump(self.database, f)
    def LoadDatabase(self, filepath) -> None:
        # Use a context manager so the file handle is closed after loading
        with open(filepath, 'rb') as f:
            self.database = pickle.load(f)
def ReadMetaData(self) -> None:
userKeys = list(self.database.keys())
for key in userKeys:
currentUser = self.database[key]
for choice in currentUser:
for anObject in choice:
self.metaContainer.UpdateMetaData(anObject[part], anObject['name'])
self.metaContainer.ConstructConverters()
def ReadAndProcessDatabase(self, databasePath) -> None:
import time
st = time.time()
self.LoadDatabase(databasePath)
t = time.time() - st
print('Loading database took: ', t, ' s')
st = time.time()
self.ReadMetaData()
t = time.time() - st
print('Reading metadata took: ', t, ' s')
st = time.time()
self.RemoveEmptySessions()
t = time.time() - st
print('Removing empty sessions took: ', t, ' s')
def GetInputs(self) -> list:
inputs = self.ConvertToVecInput()
nrChoices = self.GetNrChoicesFromInput(inputs)
self.PadInputs(inputs, nrChoices)
return inputs
def GetNnIO(self, inputs, rngSeed=0):
# Create padded input vectors ready for the NN
nrInputs = 0
for userSession in inputs:
nrInputs += len(userSession) - self.windowSize
nrChoices = self.GetNrChoicesFromInput(inputs)
X = np.ndarray((nrInputs, self.windowSize, nrChoices))
Y = np.ndarray((nrInputs, nrChoices))
# Create inputs and outputs for the network
# The input is of size windowSize x nrChoices. For each new session, the corresponding
# input will have windowSize - 1 padding arrays and 1 real array.
count = 0
for userSession in inputs:
for choice in range(len(userSession) - self.windowSize):
X[count, :] = userSession[choice:choice + self.windowSize]
Y[count, :] = userSession[choice + self.windowSize] - userSession[choice + self.windowSize - 1]
count += 1
# Instead of trying to find the true next configuration vector, we want to find the next choice, or user actions.
Y[Y != 1] = 0
# Create a pseudo random index vector
trainIndexVec = np.arange(0, X.shape[0])
rng = np.random.default_rng(rngSeed)
rng.shuffle(trainIndexVec)
# Separate in- and output into training and testing part
trainFrac = 2./3.
Xtrain = X[trainIndexVec[0:int(nrInputs * trainFrac)]]
        Xtest = X[trainIndexVec[int(nrInputs * trainFrac):]]
Ytrain = Y[trainIndexVec[0:int(nrInputs * trainFrac)]]
        Ytest = Y[trainIndexVec[int(nrInputs * trainFrac):]]
return Xtrain, Xtest, Ytrain, Ytest
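The windowing in GetNnIO is easiest to see on a single tiny session. The sketch below uses made-up vectors (windowSize = 2, three possible choices) and mirrors the logic above: every input is windowSize consecutive configuration vectors, and the target keeps only the features that were switched on in the following step.

import numpy as np

windowSize = 2
# One padded toy session: windowSize - 1 rows of zero padding followed by the
# configuration vectors, one row per user action over three possible choices.
session = np.array([[0, 0, 0],   # padding
                    [1, 0, 0],   # user enables choice 0
                    [1, 1, 0],   # user adds choice 1
                    [1, 1, 1]])  # user adds choice 2

for start in range(len(session) - windowSize):
    x = session[start:start + windowSize]                             # network input
    y = session[start + windowSize] - session[start + windowSize - 1]
    y[y != 1] = 0                                                     # keep only newly enabled features
    print('input:\n', x, '\ntarget:', y)
# Targets: [0, 1, 0] (choice 1 was added), then [0, 0, 1] (choice 2 was added).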
import numpy as np
class NnInputHandler():
input: list
def __init__(self, input) -> None:
self.input = input
self.nrInputs = len(self.input)
self.inputLength = len(self.input[0])
    def GetTrainAndTestInput(self):
        # Note: this method is left unfinished in this code extract.
        emptyVec = np.zeros((self.inputLength, 1))
        for iInput in range(self.nrInputs):
            a = 2
from tensorflow import keras
import numpy as np
class batchDataGenerator(keras.utils.Sequence):
def __init__(self, Xin, Yin, batchSize):
self.Xin = Xin
self.Yin = Yin
self.batchSize = batchSize
def __len__(self):
return int(np.floor(self.Xin.shape[0] / self.batchSize))
def __getitem__(self, index):
Xout = self.Xin[index*self.batchSize:(index+1)*self.batchSize, :, :]
Yout = self.Yin[index*self.batchSize:(index+1)*self.batchSize, :]
return np.array(Xout), np.array(Yout)
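The generator above hands keras.Model.fit one batch at a time instead of the full arrays. A minimal sketch with made-up shapes:

import numpy as np

X = np.random.rand(100, 3, 5)               # 100 samples, window of 3, 5 choices
Y = np.random.randint(0, 2, size=(100, 5))

gen = batchDataGenerator(X, Y, batchSize=32)
print(len(gen))                    # 3 -- the incomplete final batch is dropped by __len__
xBatch, yBatch = gen[0]
print(xBatch.shape, yBatch.shape)  # (32, 3, 5) (32, 5)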
from core.DatabaseProcessor import DatabaseProcessor
import numpy as np
from tensorflow import keras
from core.DataEval import CalcPredScore
# Init constants
databasePath = 'database/626765_users.pkl'
windowSize = 3
modelName = 'GRU_winSize_' + str(windowSize)
savePath = 'models/' + modelName
# Generate database processor
myProcessor = DatabaseProcessor(windowSize=windowSize)
# Generate Input
myProcessor.ReadAndProcessDatabase(databasePath)
inputs = myProcessor.GetInputs()
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
Xtrain, Xtest, Ytrain, Ytest = myProcessor.GetNnIO(inputs)
print('Loading')
nnModel = keras.models.load_model(savePath)
print('Predicting')
predictions = nnModel.predict(Xtest)
print('Calculating score')
CalcPredScore(predictions, Ytest, nrPredsToUse=20)
from core.DatabaseProcessor import DatabaseProcessor
from tensorflow import keras
from core.NN.batchDataGenerator import batchDataGenerator
from core.DataEval import CalcPredScore
import pickle
# Init constants
databasePath = 'database/626765_users.pkl'
windowSize = 3
modelName = 'LSTM_winSize_' + str(windowSize)
# Generate database processor
myProcessor = DatabaseProcessor(windowSize=windowSize)
# Generate Input
myProcessor.ReadAndProcessDatabase(databasePath)
inputs = myProcessor.GetInputs()
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
savePath = 'models/' + modelName + '_nrChoices_' + str(nrChoices)
Xtrain, Xtest, Ytrain, Ytest = myProcessor.GetNnIO(inputs)
frac = 0.8
nrSamples = Xtrain.shape[0]
Xval = Xtrain[int(nrSamples*frac):, :, :]
Xtrain = Xtrain[:int(nrSamples*frac), :, :]
Yval = Ytrain[int(nrSamples*frac):, :]
Ytrain = Ytrain[:int(nrSamples*frac), :]
# Generate generators
batchSize = 10000
trainingGen = batchDataGenerator(Xtrain, Ytrain, batchSize)
validationGen = batchDataGenerator(Xval, Yval, batchSize)
# Generate network
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
model_lstm = keras.Sequential()
model_lstm.add(keras.layers.SpatialDropout1D(0.3))
model_lstm.add(keras.layers.BatchNormalization())
model_lstm.add(keras.layers.LSTM(312, return_sequences=False, dropout=0.0, input_shape=(windowSize, nrChoices)))
model_lstm.add(keras.layers.BatchNormalization())
model_lstm.add(keras.layers.Dropout(0.3))
model_lstm.add(keras.layers.Dense(nrChoices, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
history_lstm = model_lstm.fit(trainingGen, validation_data = validationGen, epochs=20)
model_lstm.summary()
# Save
model_lstm.save(savePath)
with open(savePath + '_hist', 'wb') as file_pi:
pickle.dump(history_lstm.history, file_pi)
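The saved model and pickled history can later be reloaded for evaluation or for plotting the training curves; a sketch, assuming the paths produced above:

model = keras.models.load_model(savePath)
with open(savePath + '_hist', 'rb') as f:
    history = pickle.load(f)
print(history.keys())  # e.g. dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])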