diff --git a/common/GL_utilities.h b/common/GL_utilities.h
index cdb6b7d9554ac055058880b9bb0f433c6249b577..6d65357ccfa86e06eb91420a3c3f0f6727923805 100644
--- a/common/GL_utilities.h
+++ b/common/GL_utilities.h
@@ -13,7 +13,7 @@ extern "C" {
 		#include "glew.h"
 		#include <GL/gl.h>
 	#else
-		#include <GL/glew.h>
+		//#include <GL/glew.h>
 		#include <GL/gl.h>
 	#endif
 #endif
diff --git a/src/MakeFile_Alt.txt b/src/MakeFile_Alt.txt
index 75d9e148140da94e89569734cc9ea1433689d08f..03a5302784939116f9f5841956c674f6a8512654 100644
--- a/src/MakeFile_Alt.txt
+++ b/src/MakeFile_Alt.txt
@@ -1,3 +1,12 @@
 nvcc -c kernels_cuda.cu -o kernels_cuda.o -I/usr/local/cuda/include
 nvcc main3.cpp ../common/GL_utilities.c ../common/Linux/MicroGlut.c kernels_cuda.o -I/usr/local/cuda/include -I../common -I../common/Linux -DGL_GLEXT_PROTOTYPES -I/usr/include/GL -L/usr/local/cuda/lib64 -lGL -lGLU -lcudart -lX11 -lXt -lm -o simple_particle_system
+./simple_particle_system
+
+
+
+
+New one:
+
+nvcc -c kernels_cuda.cu -o kernels_cuda.o -I/usr/lib/cuda/include
+nvcc main3.cpp ../common/GL_utilities.c ../common/Linux/MicroGlut.c kernels_cuda.o -I/usr/lib/cuda/include -I../common -I../common/Linux -DGL_GLEXT_PROTOTYPES -I/usr/include/GL -L/usr/lib/cuda/lib64 -lGL -lGLU -lcudart -lX11 -lXt -lm -o simple_particle_system
 ./simple_particle_system
\ No newline at end of file
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 9aae2a3296f0c1194f3eae9ecd782711332af645..36cc04561b5806719ee2e703dbe768117acf7b9b 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -1,16 +1,21 @@
 // kernels_cuda.cu
 #include "kernels_cuda.h"
 #include "constants.h"
+#include <cstdio>    // C++ style header
 #include "include/float3_helper_math.h"
 #include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
 #include <device_launch_parameters.h>
 #include <cmath>
 
+#include <iostream>
+
+
+
 // Include Thrust for sorting
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-#include <thrust/sequence.h>
+//#include <thrust/device_ptr.h>
+//#include <thrust/sort.h>
+//#include <thrust/sequence.h>
 
 // CUDA Graphics Resource (for OpenGL-CUDA interoperability)
 cudaGraphicsResource_t cuda_vbo_resource;
@@ -110,8 +115,10 @@ __global__ void computeCellIndices(
 
         unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY;
 
+
         cellIndices[idx] = cellIndex;
     }
+    
 }
 
 // Reordering kernel
@@ -129,11 +136,94 @@ __global__ void reorderData(
     if (idx < numParticles) {
         unsigned int sortedIdx = particleIndices[idx];
 
-        positions_sorted[idx] = positions[sortedIdx];
+        printf("Index and sorted %d %d\n", idx, sortedIdx);  // Sorted idx result in 0, thus only one particle is here-. 
+
+        positions_sorted[idx] = positions[sortedIdx];   // Todo was sortedIdx
         velocities_sorted[idx] = velocities[sortedIdx];
     }
 }
 
+
+// TODO testing adding bitonic merge sort. 
+// Launch this code only on the GPU
+__device__ void swapIfGreater(unsigned int &a, unsigned int &b, unsigned int &a_key, unsigned int &b_key) {
+    // Swap the a aqnd b keys
+    unsigned int temp_key = a_key;
+    a_key = b_key;
+    b_key = temp_key;
+
+    // Swap the values a and b
+    unsigned int temp = a;
+    a = b;
+    b = temp;
+
+    // Want values = index order. 
+    
+}
+
+/*
+    Keys, values, N
+*/
+
+
+__global__ void helpMe(unsigned int* keys, unsigned int* values, int numElements){
+
+    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    if (tid >= numElements) return;
+
+    // Fix values index
+    //printf("%d\n", values[tid]);    // Todo problem, this should only run once. 
+    values[tid] = tid;
+
+    // Todo can we set this to keys's value instead?
+
+    // Eg. 64, 61 is index 0, 1
+    // Then in the secoind run it should be 61, 64 with currently 0, 1 and that should be correct. 
+    
+    
+}
+
+
+// Todo values is empty, we want to fill this array by sorting keys
+__global__ void bitonicSortKernel(unsigned int* keys, unsigned int* values, int numElements) {
+
+    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (tid >= numElements) return;
+
+    for (int k = 2; k <= numElements; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            unsigned int idxj = tid ^ j;
+            if (idxj > tid) {
+                // Ascending order
+                if ((tid & k) == 0) {
+                    
+                    if (keys[tid] > keys[idxj]){
+                        swapIfGreater(values[tid], values[idxj], keys[tid], keys[idxj]);
+                    }
+                        
+                }
+                // Descending order
+                else {
+                    
+                    if (keys[tid] < keys[idxj]){
+                        swapIfGreater(values[idxj], values[tid], keys[idxj], keys[tid]);
+                    }
+                        
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+
+    // Here try printing keys and values
+    //printf("Here: %d\n", keys[tid]);
+    //printf("Value: %d\n", values[tid]);
+
+    
+}
+
 // Find cell start and end indices
 // can bli reeplaceed med atomicAdd för varje ceell index??
 __global__ void findCellStartEnd(
@@ -390,8 +480,13 @@ __global__ void enforceBoundary(
     }
 }
 
+
+
 void updateParticles(float deltaTime)
 {
+  cudaError_t err;
+
+    
     // Map the VBO resource to get a device pointer
     size_t num_bytes;
     cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
@@ -413,12 +508,50 @@ void updateParticles(float deltaTime)
     // Allocate or resize cellStart and cellEnd arrays if necessary
     // (You may need to adjust your initialization code to handle dynamic sizes)
 
+
     // Compute cell indices
     computeCellIndices<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions, d_cellIndices, NUM_PARTICLES, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ);
     cudaDeviceSynchronize();
 
+
+    /*
+    cudaError_t err;
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        printf("Kernel error1: %s\n", cudaGetErrorString(err));
+    }
+    else{
+        printf("GOOD!");
+    }
+
+    
+    err = cudaPeekAtLastError();
+    if (err != cudaSuccess) {
+        printf("CUDA error: %s\n", cudaGetErrorString(err));
+    }
+    else{
+        printf("CUDA GOOD:");
+    }
+
+     err = cudaDeviceSynchronize();
+if (err != cudaSuccess) {
+    printf("Kernel execution error: %s\n", cudaGetErrorString(err));
+} else {
+    printf("Kernel executed successfully: No errors detected\n");
+}
+
+
+
+    */
+
+    
+
     // Wrap device pointers with thrust iterators
+    // Todo cannot include thrust?
+    /*
+
+    // Creating pointers used by thrust
     thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndices);
     thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_particleIndices);
 
@@ -427,8 +560,24 @@ void updateParticles(float deltaTime)
 
     // Sort particle indices based on cell indices
     thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + NUM_PARTICLES, dev_particleIndices_ptr);
+    */
+
+
+   //////////////
+
+
+   // TODO adding bitonic merge sort instead of thrust
+   
+
+    //helpMe<<<blocksPerGrid, threadsPerBlock>>>(d_cellIndices, d_particleIndices, NUM_PARTICLES);
+    //bitonicSortKernel<<<blocksPerGrid, threadsPerBlock>>>(d_cellIndices, d_particleIndices, NUM_PARTICLES);
+    //cudaDeviceSynchronize();
+    
+
+    ///////////
 
     // Reorder positions and velocities
+    /*
     reorderData<<<blocksPerGrid, threadsPerBlock>>>(
         d_particleIndices,
         d_positions,
@@ -438,6 +587,8 @@ void updateParticles(float deltaTime)
         NUM_PARTICLES);
     cudaDeviceSynchronize();
 
+   
+
     // Swap sorted data with original data
     float3* temp_positions = d_positions;
     d_positions = d_positions_sorted;
@@ -447,6 +598,8 @@ void updateParticles(float deltaTime)
     d_velocities = d_velocities_sorted;
     d_velocities_sorted = temp_velocities;
 
+    
+
     // Build cell start and end indices
     // Initialize cellStart and cellEnd
     cudaMemset(d_cellStart, 0xFF, maxCells * sizeof(unsigned int));
@@ -459,6 +612,8 @@ void updateParticles(float deltaTime)
         NUM_PARTICLES);
     cudaDeviceSynchronize();
 
+    */
+
     // Compute density and pressure
     computeDensityPressure<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions,
@@ -486,7 +641,7 @@ void updateParticles(float deltaTime)
     cudaDeviceSynchronize();
 
     // Integrate
-    float timeStep = 0.008f;
+    float timeStep = 0.008f;    // todo was  0.008f;
     integrate<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions,
         d_velocities,
@@ -535,14 +690,17 @@ void initializeConstants()
     cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float));
     cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float));
     cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float));
+
 }
 
+
 // Register VBO with CUDA
 void registerVBOWithCUDA(unsigned int vbo)
 {
     cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
 }
 
+
 // Unregister VBO from CUDA
 void unregisterVBOFromCUDA()
 {
@@ -556,6 +714,7 @@ void initializeDeviceArrays(
     int numParticles
 )
 {
+
     // Allocate device memory for velocities
     cudaMalloc(&d_velocities, numParticles * sizeof(float3));
     cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
diff --git a/src/main3.cpp b/src/main3.cpp
index b4ed3f8025dd745ec460e9e4a30ce2dd27f5835f..4c5c7d06b813c02e75008c7ad1f6a4cbbb7e601d 100644
--- a/src/main3.cpp
+++ b/src/main3.cpp
@@ -10,10 +10,10 @@
 
 #ifdef _WIN32
     #include <GL/glew.h>  // Include GLEW before other OpenGL headers
+    #include <GL/freeglut.h>
 #endif
 
 
-//#include <GL/freeglut.h>
 #include "MicroGlut.h"
 
 #include "GL_utilities.h"
@@ -26,8 +26,10 @@
 #define MAIN
 #include "VectorUtils4.h"
 
+#include <iostream>
+
 // Number of particles
-const int NUM_PARTICLES = 64000; // Should match a cubic grid for simplicity
+const int NUM_PARTICLES = 3200; // Must be a power of 2!
 const float GRID_SPACING = 0.25f;  // Spacing between particles in the grid
 
 const float pointRadius = 6.0f;  // Point size in world space
@@ -125,6 +127,7 @@ void display()
         deltaTime = 0.0001f;
 
     // Update particle positions using CUDA
+
     updateParticles(deltaTime);
 
     // Use the shader program
@@ -294,6 +297,7 @@ void initialize()
     registerVBOWithCUDA(vbo);
 
     // Initialize device arrays
+
     initializeDeviceArrays(
         h_positions.data(),
         h_velocities.data(),
@@ -306,6 +310,7 @@ void initialize()
 
     // Enable depth testing
     glEnable(GL_DEPTH_TEST);
+
 }
 
 // Cleanup resources