Commit a13d6f9e authored by Hampus Elinder

Adds code

Code for Master's Thesis
This code is an extract from the code used in the thesis project by Hampus Elinder at Linköping University during the autumn term (HT) 2021.
The code is intended only as an example of the implemented methods used to produce the results. Consequently, it cannot be run as-is:
the dataset is missing (it is the property of the company Our Studio and its partners), and comments may be missing or vague.
Some variables, functions, etc. have also been renamed.
import numpy as np
import pandas as pd
from IPython.display import clear_output
class SeqAR:
def __init__(self, userHistory=None, windowSize=2) -> None:
# self.userHistory = userHistory
self.windowSize = windowSize
# Mine sequential rules from the dataset
def MineRules(self, history, nrArElements):
# Init transaction and rules matrices and counters
# self.rules = np.zeros([history.shape[0]*(userConfigs.shape[1] - self.windowSize + 1), self.windowSize])
self.nrTransactions = nrArElements - len(history) * (self.windowSize - 1)
preRules = np.zeros((self.nrTransactions, self.windowSize))
rulesCount = 0
for i in range(0, len(history)):
# Fill in rules
            for j in range(0, len(history[i]) - (self.windowSize - 1)):
                newRule = history[i][j:j + self.windowSize]
                # Skip windows that contain the value -2
                if not np.any(newRule == -2):
                    preRules[rulesCount, :] = newRule
                    rulesCount += 1
self.rules = preRules[0:rulesCount, :]
self.nrTransactions = rulesCount
        # Get the unique patterns and the number of times they are present in a count vector. Reshape into a column vector.
allPatterns = self.rules[:, 0:-1]
self.patternsAndCount = self.UniqueSequenceAndCount(allPatterns)
# Same for rules as for patterns above
self.rulesAndCount = self.UniqueSequenceAndCount(self.rules)
consequentsEl, consequentsOcc = np.unique(self.rules[:, -1], return_counts=True)
# Initiate DataFrame and set rules and metrics
self.df = pd.DataFrame( index=range(0, self.rulesAndCount.shape[0]),
columns=['antecedents', 'consequents', 'antecedents support', 'consequents support', 'support', 'confidence', 'lift', 'leverage', 'conviction'])
print('Nr rows:', self.df.shape[0])
self.df['antecedents'] = self.rulesAndCount[:, 0:-2].tolist()
self.df['consequents'] = self.rulesAndCount[:, -2:-1].tolist()
        # Use a for loop to set a few metrics
        for index, row in self.df.iterrows():
            if index % 100 == 0:
                print('Currently on row:', index)
            antecedentIndex = np.all(row['antecedents'] == self.patternsAndCount[:, 0:-1], axis=1)
            # Assign through the DataFrame itself; writing to the row returned by
            # iterrows() only changes a copy and would not persist.
            self.df.at[index, 'antecedents support'] = self.patternsAndCount[antecedentIndex, -1].item() / self.nrTransactions
            self.df.at[index, 'consequents support'] = consequentsOcc[consequentsEl == row['consequents']].item() / self.nrTransactions
# Continue setting metrics with vector notation
self.df['support'] = (self.rulesAndCount[:, -1] / self.nrTransactions).tolist()
self.df['confidence'] = self.df['support'] / self.df['antecedents support']
self.df['lift'] = self.df['confidence'] / self.df['consequents support']
self.df['leverage'] = self.df['support'] - self.df['antecedents support'] * self.df['consequents support']
self.consequentsEl = consequentsEl
self.consequentsOcc = consequentsOcc
clear_output(wait=True)
# display(self.df)
def UniqueSequenceAndCount(self, sequence) -> np.ndarray:
uniqueSequence, countUniqueSequence = np.unique(sequence, axis=0, return_counts=True)
countUniqueSequence = np.reshape(countUniqueSequence, (countUniqueSequence.shape[0], 1))
return np.concatenate((uniqueSequence, countUniqueSequence), axis=1)
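A minimal usage sketch of the miner above (assuming the SeqAR class is importable). The toy history is made up purely for illustration: three item-id sequences padded with -2, the value that MineRules skips.

import numpy as np

# Hypothetical toy data: each entry is one session as a sequence of item ids,
# padded at the start with -2 (windows containing -2 are skipped).
history = [
    np.array([-2, 0, 1, 2]),
    np.array([-2, 0, 1, 3]),
    np.array([-2, 2, 0, 1]),
]
nrArElements = sum(len(h) for h in history)

miner = SeqAR(windowSize=2)
miner.MineRules(history, nrArElements)

# miner.df now holds one row per unique rule (antecedent -> consequent),
# together with its support, confidence, lift and leverage.
print(miner.df[['antecedents', 'consequents', 'support', 'confidence']])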
import numpy as np
def CalcPredScore(testPredict, Ytest, nrPredsToUse=1, printText=True):
nrActions = testPredict.shape[0]
guessMat = (-testPredict).argsort()[:, :nrPredsToUse]
gT = Ytest.argmax(axis=1)
stackedGT = np.tile(gT.reshape(-1, 1), (1, nrPredsToUse))
output = (guessMat == stackedGT)
scores = np.sum(output, axis=0)
if printText:
print('\nNr actions in total: ', nrActions, '\n')
for i in range(nrPredsToUse):
nrHitsWithIPredictions = np.sum(scores[:i + 1])
print('Guesses per action: ', i + 1)
print('Nr hits: ', nrHitsWithIPredictions)
print('Hit rate: ', nrHitsWithIPredictions / nrActions, '\n')
return output
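A small, made-up example of how the score function above is read: three actions over four possible choices, evaluated with up to two guesses per action.

import numpy as np

# Hypothetical predictions and one-hot ground truth, for illustration only.
testPredict = np.array([[0.1, 0.7, 0.1, 0.1],   # top guess: choice 1
                        [0.4, 0.3, 0.2, 0.1],   # top guess: choice 0
                        [0.2, 0.2, 0.5, 0.1]])  # top guess: choice 2
Ytest = np.array([[0, 1, 0, 0],   # true choice 1 -> hit with the 1st guess
                  [0, 1, 0, 0],   # true choice 1 -> hit with the 2nd guess
                  [1, 0, 0, 0]])  # true choice 0 -> hit with the 2nd guess

hits = CalcPredScore(testPredict, Ytest, nrPredsToUse=2)
# Prints a hit rate of 1/3 with one guess per action and 3/3 with two.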
class DatabaseMetaContainer():
"""
Holds and maintains the meta data of a database in
terms of its unique parts and features.
"""
metaData: dict
nameToVec: dict
vecToName: dict
def __init__(self) -> None:
self.metaData = {}
self.nameToVec = {}
self.vecToName = {}
    def UpdateMetaData(self, part, feature) -> None:
        # Adds the part and feature if they are not already in metaData
        if part in self.metaData:
            if feature not in self.metaData[part]:
                self.metaData[part].append(feature)
        else:
            self.metaData[part] = [feature]
def ConstructConverters(self) -> None:
self.ConstructNameToVec()
self.ConstructVecToName()
def ConstructNameToVec(self) -> None:
# Constructs a dict from metaData which maps parts and
# features to a unique number. Used to create input
# vectors from parts and features.
count = 0
partKeys = self.metaData.keys()
for part in partKeys:
for feature in self.metaData[part]:
self.nameToVec[part, feature] = count
count += 1
def ConstructVecToName(self) -> None:
self.vecToName = {y:x for x,y in self.nameToVec.items()}
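A short sketch of the intended use of the container above: register every (part, feature) pair found in the data, then build the two lookup tables. The part and feature names here are hypothetical.

meta = DatabaseMetaContainer()
meta.UpdateMetaData('seat', 'leather')   # made-up part/feature names
meta.UpdateMetaData('seat', 'fabric')
meta.UpdateMetaData('frame', 'oak')
meta.ConstructConverters()

print(meta.nameToVec)     # {('seat', 'leather'): 0, ('seat', 'fabric'): 1, ('frame', 'oak'): 2}
print(meta.vecToName[2])  # ('frame', 'oak')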
import pandas as pd
from core.DatabaseMetaContainer import DatabaseMetaContainer
import numpy as np
import pickle
class DatabaseProcessor():
database: dict
metaContainer: DatabaseMetaContainer
def __init__(self, windowSize) -> None:
self.windowSize = windowSize
self.database = {}
self.nrAnObjectMsgs = 0
self.nrMessages = 0
self.metaContainer = DatabaseMetaContainer()
def ProcessCsv(self, csvFilePath, maxMsgs=None) -> None:
# Try to read the csv file
try:
df = pd.read_csv(csvFilePath)
# Check that the file is valid
if self.ValidateCsvFile(df):
df = self.RemoveSequentialDublettes(df)
self.nrMessages = Removed
# For msg in the file, add it to a session
for iMsg in range(0, Removed):
if iMsg % 1000 == 0:
print(iMsg)
                    if maxMsgs is not None and iMsg > maxMsgs:
                        break
self.AddMsgToDatabase(df.iloc[iMsg])
        except Exception:
if isinstance(csvFilePath, str):
print('Could not read and process file: ' + csvFilePath)
else:
print('Could not read and process file')
    def RemoveSequentialDublettes(self, df: pd.DataFrame) -> pd.DataFrame:
        # Drop rows that are identical to the immediately preceding row
        dfProcessed = df[~df.eq(df.shift()).all(axis='columns')]
        return dfProcessed
def ValidateCsvFile(self, df) -> bool:
Removed
def AddMsgToDatabase(self, msg) -> None:
idIndex = 1
tokenIndex = 2
id = msg[idIndex]
newAnObject = Removed
if newAnObject != None:
if id in self.database:
sessionToUpdate = self.database[id]
                # Do not add if the new anObject is a duplicate of the previous one
if sessionToUpdate[-1] != newAnObject:
sessionToUpdate.append(newAnObject)
self.database[id] = sessionToUpdate
else:
self.database[id] = [newAnObject]
for iAnObject in newAnObject:
if len(iAnObject) != 0:
self.metaContainer.UpdateMetaData(Removed)
def SaveAsCsv(self) -> None:
tempDataFrame = pd.DataFrame.from_dict(self.database, orient='index')
pd.DataFrame.to_csv(tempDataFrame, 'mycsvfile.csv')
def ProcessMetaData(self) -> None:
self.metaContainer.ConstructConverters()
def ConvertToVecInput(self) -> list:
# Converts the database into input vectors and returns them
nrElements = len(self.metaContainer.nameToVec)
sessions = list(self.database.keys())
nrSessions = len(sessions)
inputs = np.empty((nrSessions), dtype=object)
for iSession, session in enumerate(sessions):
anObjects = self.database[session]
nrAnObjects = len(anObjects)
sessionData = np.zeros((nrAnObjects, nrElements), dtype=int)
for iAnObject in range(nrAnObjects):
featuresUsed = anObjects[iAnObject]
nrFeaturesUsed = len(featuresUsed)
if nrFeaturesUsed == 0:
continue
else:
featureVec = np.zeros((nrElements))
for iFeature in range(nrFeaturesUsed):
if len(featuresUsed[iFeature]) > 0:
featureVec[self.metaContainer.nameToVec[featuresUsed[iFeature][part], featuresUsed[iFeature]['name']]] = int(1)
sessionData[iAnObject] = featureVec
            # Only keep sessions that actually contain data
            if sessionData.size > 0:
inputs[iSession] = sessionData
return inputs
    def GetNrChoicesFromInput(self, inputs) -> int:
return len(inputs[0][0])
def PadInputs(self, inputs, nrChoices) -> None:
emptyVec = np.zeros((self.windowSize - 1, nrChoices), dtype=int)
for iSession in range(len(inputs)):
inputs[iSession] = np.vstack((emptyVec, inputs[iSession]))
def RemoveEmptySessions(self) -> None:
for key in list(self.database.keys()):
if len(self.database[key]) == 1:
if len(self.database[key][0]) == 0:
self.database.pop(key)
def SaveDatabase(self, filepath=None) -> None:
        if filepath is None:
filepath = 'database/' + str(len(list(self.database.keys()))) + '_users.pkl'
with open(filepath, 'wb') as f:
pickle.dump(self.database, f)
    def LoadDatabase(self, filepath) -> None:
        # Use a context manager so the file handle is closed after loading
        with open(filepath, 'rb') as f:
            self.database = pickle.load(f)
def ReadMetaData(self) -> None:
userKeys = list(self.database.keys())
for key in userKeys:
currentUser = self.database[key]
for choice in currentUser:
for anObject in choice:
self.metaContainer.UpdateMetaData(anObject[part], anObject['name'])
self.metaContainer.ConstructConverters()
def ReadAndProcessDatabase(self, databasePath) -> None:
import time
st = time.time()
self.LoadDatabase(databasePath)
t = time.time() - st
print('Loading database took: ', t, ' s')
st = time.time()
self.ReadMetaData()
t = time.time() - st
print('Reading metadata took: ', t, ' s')
st = time.time()
self.RemoveEmptySessions()
t = time.time() - st
print('Removing empty sessions took: ', t, ' s')
def GetInputs(self) -> list:
inputs = self.ConvertToVecInput()
nrChoices = self.GetNrChoicesFromInput(inputs)
self.PadInputs(inputs, nrChoices)
return inputs
def GetNnIO(self, inputs, rngSeed=0):
# Create padded input vectors ready for the NN
nrInputs = 0
for userSession in inputs:
nrInputs += len(userSession) - self.windowSize
nrChoices = self.GetNrChoicesFromInput(inputs)
X = np.ndarray((nrInputs, self.windowSize, nrChoices))
Y = np.ndarray((nrInputs, nrChoices))
# Create inputs and outputs for the network
# The input is of size windowSize x nrChoices. For each new session, the corresponding
# input will have windowSize - 1 padding arrays and 1 real array.
count = 0
for userSession in inputs:
for choice in range(len(userSession) - self.windowSize):
X[count, :] = userSession[choice:choice + self.windowSize]
Y[count, :] = userSession[choice + self.windowSize] - userSession[choice + self.windowSize - 1]
count += 1
# Instead of trying to find the true next configuration vector, we want to find the next choice, or user actions.
Y[Y != 1] = 0
# Create a pseudo random index vector
trainIndexVec = np.arange(0, X.shape[0])
rng = np.random.default_rng(rngSeed)
rng.shuffle(trainIndexVec)
# Separate in- and output into training and testing part
trainFrac = 2./3.
Xtrain = X[trainIndexVec[0:int(nrInputs * trainFrac)]]
        Xtest = X[trainIndexVec[int(nrInputs * trainFrac):]]
Ytrain = Y[trainIndexVec[0:int(nrInputs * trainFrac)]]
        Ytest = Y[trainIndexVec[int(nrInputs * trainFrac):]]
return Xtrain, Xtest, Ytrain, Ytest
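The windowing in GetNnIO is easiest to see on a single tiny session. The sketch below uses made-up vectors (windowSize = 2, three possible choices) and mirrors the logic above: every input is windowSize consecutive configuration vectors, and the target keeps only the features that were switched on in the following step.

import numpy as np

windowSize = 2
# One padded toy session: windowSize - 1 rows of zero padding followed by the
# configuration vectors, one row per user action over three possible choices.
session = np.array([[0, 0, 0],   # padding
                    [1, 0, 0],   # user enables choice 0
                    [1, 1, 0],   # user adds choice 1
                    [1, 1, 1]])  # user adds choice 2

for start in range(len(session) - windowSize):
    x = session[start:start + windowSize]                             # network input
    y = session[start + windowSize] - session[start + windowSize - 1]
    y[y != 1] = 0                                                     # keep only newly enabled features
    print('input:\n', x, '\ntarget:', y)
# Targets: [0, 1, 0] (choice 1 was added), then [0, 0, 1] (choice 2 was added).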
import numpy as np
class NnInputHandler():
input: list
def __init__(self, input) -> None:
self.input = input
self.nrInputs = len(self.input)
self.inputLength = len(self.input[0])
    def GetTrainAndTestInput(self):
        # Note: this method is left unfinished in this code extract.
        emptyVec = np.zeros((self.inputLength, 1))
        for iInput in range(self.nrInputs):
            a = 2
from tensorflow import keras
import numpy as np
class batchDataGenerator(keras.utils.Sequence):
def __init__(self, Xin, Yin, batchSize):
self.Xin = Xin
self.Yin = Yin
self.batchSize = batchSize
def __len__(self):
return int(np.floor(self.Xin.shape[0] / self.batchSize))
def __getitem__(self, index):
Xout = self.Xin[index*self.batchSize:(index+1)*self.batchSize, :, :]
Yout = self.Yin[index*self.batchSize:(index+1)*self.batchSize, :]
return np.array(Xout), np.array(Yout)
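The generator above hands keras.Model.fit one batch at a time instead of the full arrays. A minimal sketch with made-up shapes:

import numpy as np

X = np.random.rand(100, 3, 5)               # 100 samples, window of 3, 5 choices
Y = np.random.randint(0, 2, size=(100, 5))

gen = batchDataGenerator(X, Y, batchSize=32)
print(len(gen))                    # 3 -- the incomplete final batch is dropped by __len__
xBatch, yBatch = gen[0]
print(xBatch.shape, yBatch.shape)  # (32, 3, 5) (32, 5)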
from core.DatabaseProcessor import DatabaseProcessor
import numpy as np
from tensorflow import keras
from core.DataEval import CalcPredScore
# Init constants
databasePath = 'database/626765_users.pkl'
windowSize = 3
modelName = 'GRU_winSize_' + str(windowSize)
savePath = 'models/' + modelName
# Generate database processor
myProcessor = DatabaseProcessor(windowSize=windowSize)
# Generate Input
myProcessor.ReadAndProcessDatabase(databasePath)
inputs = myProcessor.GetInputs()
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
Xtrain, Xtest, Ytrain, Ytest = myProcessor.GetNnIO(inputs)
print('Loading')
nnModel = keras.models.load_model(savePath)
print('Predicting')
predictions = nnModel.predict(Xtest)
print('Calculating score')
CalcPredScore(predictions, Ytest, nrPredsToUse=20)
from core.DatabaseProcessor import DatabaseProcessor
from tensorflow import keras
from core.NN.batchDataGenerator import batchDataGenerator
from core.DataEval import CalcPredScore
import pickle
# Init constants
databasePath = 'database/626765_users.pkl'
windowSize = 3
modelName = 'LSTM_winSize_' + str(windowSize)
# Generate database processor
myProcessor = DatabaseProcessor(windowSize=windowSize)
# Generate Input
myProcessor.ReadAndProcessDatabase(databasePath)
inputs = myProcessor.GetInputs()
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
savePath = 'models/' + modelName + '_nrChoices_' + str(nrChoices)
Xtrain, Xtest, Ytrain, Ytest = myProcessor.GetNnIO(inputs)
frac = 0.8
nrSamples = Xtrain.shape[0]
Xval = Xtrain[int(nrSamples*frac):, :, :]
Xtrain = Xtrain[:int(nrSamples*frac), :, :]
Yval = Ytrain[int(nrSamples*frac):, :]
Ytrain = Ytrain[:int(nrSamples*frac), :]
# Generate generators
batchSize = 10000
trainingGen = batchDataGenerator(Xtrain, Ytrain, batchSize)
validationGen = batchDataGenerator(Xval, Yval, batchSize)
# Generate network
nrChoices = myProcessor.GetNrChoicesFromInput(inputs)
model_lstm = keras.Sequential()
model_lstm.add(keras.layers.SpatialDropout1D(0.3))
model_lstm.add(keras.layers.BatchNormalization())
model_lstm.add(keras.layers.LSTM(312, return_sequences=False, dropout=0.0, input_shape=(windowSize, nrChoices)))
model_lstm.add(keras.layers.BatchNormalization())
model_lstm.add(keras.layers.Dropout(0.3))
model_lstm.add(keras.layers.Dense(nrChoices, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
history_lstm = model_lstm.fit(trainingGen, validation_data = validationGen, epochs=20)
model_lstm.summary()
# Save
model_lstm.save(savePath)
with open(savePath + '_hist', 'wb') as file_pi:
pickle.dump(history_lstm.history, file_pi)
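The saved model and pickled history can later be reloaded for evaluation or for plotting the training curves; a sketch, assuming the paths produced above:

model = keras.models.load_model(savePath)
with open(savePath + '_hist', 'rb') as f:
    history = pickle.load(f)
print(history.keys())  # e.g. dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])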