diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6daabbc9c696a3883ba4e6da7475924629d8ece..b0dc26cdd55d0a643d6cced1f1778ad295db91e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,11 +41,35 @@ target_link_libraries(imgui PUBLIC
 )
 
 # ------------------------------
-# 3. Common Library with CUDA and Shared Code
+# 3. CUDA Library
+# ------------------------------
+set(CUDA_SOURCES
+    src/cuda/init_cuda.cu
+    src/cuda/pcisph_kernels.cu
+    src/cuda/smoothing_kernels.cu
+    src/cuda/spatialHash_NBSearch_kernels.cu
+    src/cuda/pcisph_update.cu
+)
+
+
+add_library(cuda_static STATIC ${CUDA_SOURCES})
+
+set_target_properties(cuda_static PROPERTIES
+    POSITION_INDEPENDENT_CODE ON
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_STANDARD 17
+)
+
+target_include_directories(cuda_static PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/settings
+    /usr/local/cuda/include
+)
+
+# ------------------------------
+# 4. Common Application Library
 # ------------------------------
-# Common sources including CUDA code
 set(COMMON_SOURCES
-    src/itterative_PCISPH.cu
     src/spawn_particles.cpp
     src/Camera.cpp
     src/ImGuiManager.cpp
@@ -53,7 +77,6 @@ set(COMMON_SOURCES
     src/FluidRenderer.cpp
     src/FluidSimulation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/common/GL_utilities.c
-    # Add other common source files if needed
 )
 
 add_library(common_lib STATIC ${COMMON_SOURCES})
@@ -61,9 +84,6 @@ add_library(common_lib STATIC ${COMMON_SOURCES})
 target_include_directories(common_lib PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/src
     ${CMAKE_CURRENT_SOURCE_DIR}/src/settings
-    /usr/local/cuda/include
-    ${OPENGL_INCLUDE_DIRS}
-    ${GLEW_INCLUDE_DIRS}
     ${CMAKE_CURRENT_SOURCE_DIR}/common
     ${CMAKE_CURRENT_SOURCE_DIR}/common/Linux/
     ${IMGUI_DIR}
@@ -75,39 +95,20 @@ target_link_libraries(common_lib PUBLIC
     ${GLEW_LIBRARIES}
     glfw
     GLU
+    cuda_static
 )
 
 set_target_properties(common_lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
 # ------------------------------
-# 4. Define Executables
+# 5. Define Executables
 # ------------------------------
-
-# First executable (GUI_fluid_sim)
-add_executable(cuda_sph_main src/GUI_fluid_sim.cpp)
-target_link_libraries(cuda_sph_main PRIVATE common_lib imgui)
-
-# Second executable (screen_spaced_main)
-add_executable(screen_spaced src/screen_spaced_main.cpp)
-target_link_libraries(screen_spaced PRIVATE common_lib imgui)
-
-# New executable for MainGUI
 add_executable(fluidSimulator src/MainGUI.cpp)
 target_link_libraries(fluidSimulator PRIVATE common_lib imgui)
 
 # ------------------------------
-# 5. Set Target Properties
+# 6. Set Target Properties
 # ------------------------------
-set_target_properties(cuda_sph_main PROPERTIES
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED YES
-)
-
-set_target_properties(screen_spaced PROPERTIES
-    CXX_STANDARD 17
-    CXX_STANDARD_REQUIRED YES
-)
-
 set_target_properties(fluidSimulator PROPERTIES
     CXX_STANDARD 17
     CXX_STANDARD_REQUIRED YES
diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index 8f58b8fb2e2c6062ab530b7ef2d662ecf4e67a80..cf9c0d59a20195f423296b8cfc0156504b173e8c 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -1,6 +1,7 @@
 #include "include/FluidSimulation.h"
 #include "include/spawn_particles.h" // For createBoundaryParticles, initParticles
-#include "include/PCISPH_sim.h"      // For CUDA simulation functions
+//#include "include/PCISPH_sim.h"      // For CUDA simulation functions
+#include "cuda/include/PCISPH_CUDA_interface.h"
 #include <iostream>
 
 namespace FluidSimulation {
@@ -11,15 +12,15 @@ namespace FluidSimulation {
     }
 
     FluidSimulation::~FluidSimulation() {
-        unregisterVBOFromCUDA();
-        unregisterBoundaryVBOFromCUDA();
+        FluidSimulationGPU::unregisterVBOFromCUDA();
+        FluidSimulationGPU::unregisterBoundaryVBOFromCUDA();
         glDeleteBuffers(1, &m_vbo_particles);
         glDeleteBuffers(1, &m_vbo_boundary);
     }
 
     void FluidSimulation::initSimulation() {
         // Upload constants to the GPU
-        initializeConstants();
+        FluidSimulationGPU::initializeConstants();
 
         // Initialize particles
         std::vector<float3> h_positions(m_numParticles);
@@ -29,14 +30,14 @@ namespace FluidSimulation {
         initParticles(h_positions, h_velocities, m_numParticles, gridSpacing);
 
         // Initialize particle device arrays
-        initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles);
+        FluidSimulationGPU::initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles);
 
         // Create particle VBO
         glGenBuffers(1, &m_vbo_particles);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_particles);
         glBufferData(GL_ARRAY_BUFFER, h_positions.size() * sizeof(float3), h_positions.data(), GL_DYNAMIC_DRAW);
         glBindBuffer(GL_ARRAY_BUFFER, 0);
-        registerVBOWithCUDA(m_vbo_particles);
+        FluidSimulationGPU::registerVBOWithCUDA(m_vbo_particles);
 
         // Initialize boundary
         std::vector<float3> boundary_positions;
@@ -47,19 +48,19 @@ namespace FluidSimulation {
         createBoundaryParticles(boundary_positions, boundary_velocities, boundaryLayers, spacing);
         m_numBoundaryParticles = boundary_positions.size();
 
-        initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles);
+        FluidSimulationGPU::initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles);
 
         // Create boundary VBO
         glGenBuffers(1, &m_vbo_boundary);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_boundary);
         glBufferData(GL_ARRAY_BUFFER, boundary_positions.size() * sizeof(float3), boundary_positions.data(), GL_DYNAMIC_DRAW);
         glBindBuffer(GL_ARRAY_BUFFER, 0);
-        registerBoundaryVBOWithCUDA(m_vbo_boundary);
+        FluidSimulationGPU::registerBoundaryVBOWithCUDA(m_vbo_boundary);
     }
 
     void FluidSimulation::updateSimulation() {
         for (int i = 0; i < 4; ++i) {
-            updateParticles(0.001);
+            FluidSimulationGPU::updateParticles(0.001);
         }
     }
 
diff --git a/src/cuda/include/PCISPH_CUDA_interface.h b/src/cuda/include/PCISPH_CUDA_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2b7110767f9422b45abdc5bbf2a466dcabf79f3
--- /dev/null
+++ b/src/cuda/include/PCISPH_CUDA_interface.h
@@ -0,0 +1,11 @@
+#ifndef CUDA_INTERFACE_H
+#define CUDA_INTERFACE_H
+
+#include "cuda_device_vars.h"
+#include "spatialHash_NBSearch_kernels.h"
+#include "smoothing_kernels.h"
+#include "pcisph_kernels.h"
+#include "pcisph_update.h" // Header for pcisph_update.cu
+#include "init_cuda.h"     // Header for init_cuda.cu
+
+#endif // CUDA_INTERFACE_H
diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0eada192f94a395ec8b53503a082e6bca262ce7
--- /dev/null
+++ b/src/cuda/include/cuda_device_vars.h
@@ -0,0 +1,69 @@
+#ifndef CUDA_DEVICE_VARS_H
+#define CUDA_DEVICE_VARS_H
+
+#include <cuda_runtime.h>
+
+namespace FluidSimulationGPU {
+
+
+// Global Variables
+extern cudaGraphicsResource_t cuda_vbo_resource;
+extern cudaGraphicsResource_t cuda_vbo_boundary_resource;
+
+// Constants
+extern __constant__ float PI;
+extern __constant__ float H;
+extern __constant__ float H2;
+extern __constant__ float POLY6_CONST;
+extern __constant__ float SPIKY_GRAD_CONST;
+extern __constant__ float VISC_LAP_CONST;
+
+extern __constant__ float PARTICLE_MASS;
+extern __constant__ float REST_DENSITY;
+extern __constant__ float VISCOSITY;
+
+// Boundary constants
+extern __constant__ float BOUND_X_MIN;
+extern __constant__ float BOUND_X_MAX;
+extern __constant__ float BOUND_Y_MIN;
+extern __constant__ float BOUND_Y_MAX;
+extern __constant__ float BOUND_Z_MIN;
+extern __constant__ float BOUND_Z_MAX;
+
+// Device Arrays
+extern float3* d_positions;
+extern float3* d_velocities;
+
+extern unsigned int* d_cellIndices;
+extern unsigned int* d_particleIndices;
+
+extern float3* d_positions_sorted;
+extern float3* d_velocities_sorted;
+
+extern unsigned int* d_cellStart;
+extern unsigned int* d_cellEnd;
+
+// Arrays for densities, pressures, accelerations
+extern float* d_densities;
+extern float* d_density_errors;
+extern float* d_pressure_increments;
+extern float3* d_pressure_forces;
+extern float3* d_accelerations;
+
+// Additional arrays for PCISPH
+extern float3* d_velocities_pred;
+extern float3* d_positions_pred;
+extern float* d_sum_gradW2; // For computing A
+
+// Boundary particles
+extern float3* d_boundary_positions;
+extern float3* d_boundary_velocities;
+
+extern unsigned int* d_boundaryCellStart;
+extern unsigned int* d_boundaryCellEnd;
+
+extern unsigned int* d_boundaryCellIndices;
+extern unsigned int* d_boundaryParticleIndices;
+
+}
+#endif // CUDA_DEVICE_VARS_H
\ No newline at end of file
diff --git a/src/include/float3_helper_math.h b/src/cuda/include/float3_helper_math.h
similarity index 100%
rename from src/include/float3_helper_math.h
rename to src/cuda/include/float3_helper_math.h
diff --git a/src/cuda/include/init_cuda.h b/src/cuda/include/init_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..16b185f904eecc617cb4a4e80a63fd6acd7f4ef2
--- /dev/null
+++ b/src/cuda/include/init_cuda.h
@@ -0,0 +1,29 @@
+#ifndef INIT_CUDA_H
+#define INIT_CUDA_H
+
+#include <cuda_runtime.h>
+
+namespace FluidSimulationGPU {
+
+// Initialize constant values for the simulation
+void initializeConstants();
+
+// Register and unregister VBOs for CUDA-OpenGL interoperability
+void registerVBOWithCUDA(unsigned int vbo);
+void registerBoundaryVBOWithCUDA(unsigned int vbo);
+void unregisterVBOFromCUDA();
+void unregisterBoundaryVBOFromCUDA();
+
+// Initialize device memory for fluid and boundary particles
+void initializeFluidDeviceArrays(const float3* h_positions, const float3* h_velocities, int numParticles);
+void initializeBoundryDeviceArrays(const float3* h_boundaryPositions, const float3* h_boundaryVelocities, int numBoundaryParticles);
+
+// Cleanup all device arrays
+void cleanupDeviceArrays();
+
+// Perform a one-time neighborhood search for static boundary particles
+void boundaryNBSearch(int numBoundaryParticles);
+
+} // namespace FluidSimulationGPU
+
+#endif // INIT_CUDA_H
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..18c65f885237b033780e2869ec41d48806c9a210
--- /dev/null
+++ b/src/cuda/include/pcisph_kernels.h
@@ -0,0 +1,103 @@
+// pcisph_kernels.h
+
+#ifndef PCISPH_KERNELS_H
+#define PCISPH_KERNELS_H
+
+#include <cuda_runtime.h> // Required for float3 and CUDA types
+
+namespace FluidSimulationGPU {
+
+// Function to compute density errors
+__global__ void computeDensityError(
+    float3* positions,
+    float* densities,
+    float* density_errors,
+    float* sum_gradW2,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
+// Function to update pressures
+__global__ void updatePressures(
+    float* pressure_increments,
+    float* density_errors,
+    float A,
+    int numParticles
+);
+
+// Function to compute pressure forces
+__global__ void computePressureForces(
+    float3* positions,
+    float* pressure_increments,
+    float3* pressure_forces,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
+// Function to compute viscosity and external forces
+__global__ void computeViscosityAndExternalForces(
+    float3* positions,
+    float3* velocities,
+    float3* accelerations,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
+// Function to enforce boundary conditions
+__global__ void enforceBoundary(
+    float3* positions,
+    float3* velocities,
+    int numParticles,
+    float damping
+);
+
+// Function to predict velocities and positions
+__global__ void predictVelocitiesPositions(
+    float3* velocities,
+    float3* velocities_pred,
+    float3* positions,
+    float3* positions_pred,
+    float3* accelerations,
+    float timeStep,
+    int numParticles
+);
+
+// Function to update velocities and positions based on pressure forces
+__global__ void updateVelocitiesPositions(
+    float3* velocities_pred,
+    float3* positions_pred,
+    float3* pressure_forces,
+    float timeStep,
+    int numParticles
+);
+
+// Host function for neighbor search (uses device pointers)
+void neigbourHoodSearch(
+    int nParticles,
+    float3* d_pos,
+    float3* d_vel,
+    float3* d_posSorted,
+    float3* d_velSorted,
+    unsigned int* d_cellIndc,
+    unsigned int* d_parIndc,
+    unsigned int* d_cEnd,
+    unsigned int* d_cStart,
+    const int bpg,
+    const int tpb,
+    int& gridSizeX,
+    int& gridSizeY,
+    int& gridSizeZ
+);
+
+} // namespace FluidSimulationGPU
+
+#endif // PCISPH_KERNELS_H
diff --git a/src/cuda/include/pcisph_update.h b/src/cuda/include/pcisph_update.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e0684f6c6eeee0572897a45949e745d084d5d93
--- /dev/null
+++ b/src/cuda/include/pcisph_update.h
@@ -0,0 +1,13 @@
+#ifndef UPDATE_PARTICLES_H
+#define UPDATE_PARTICLES_H
+
+#include <cuda_runtime.h>
+
+namespace FluidSimulationGPU {
+
+// Updates the particle states for one simulation step
+void updateParticles(float deltaTime);
+
+} // namespace FluidSimulationGPU
+
+#endif // UPDATE_PARTICLES_H
\ No newline at end of file
diff --git a/src/cuda/include/smoothing_kernels.h b/src/cuda/include/smoothing_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..af8443c27fe559d6c5e8df41504e61adbd21f17c
--- /dev/null
+++ b/src/cuda/include/smoothing_kernels.h
@@ -0,0 +1,14 @@
+#include <cuda_runtime.h> // Required for float3 and CUDA types
+
+#ifndef SMOOTHING_KERNELS_H
+#define SMOOTHING_KERNELS_H
+
+namespace FluidSimulationGPU {
+
+__device__ float poly6Kernel(float r2);
+__device__ float spikyKernelGrad(float r);
+__device__ float viscosityKernelLap(float r);
+
+} // namespace FluidSimulationGPU
+
+#endif // SMOOTHING_KERNELS_H
diff --git a/src/cuda/include/spatialHash_NBSearch_kernels.h b/src/cuda/include/spatialHash_NBSearch_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..160e50068cb3c10424c931f66d635ffad723f312
--- /dev/null
+++ b/src/cuda/include/spatialHash_NBSearch_kernels.h
@@ -0,0 +1,13 @@
+#ifndef SPATIAL_HASH_NBSEARCH_KERNELS_H
+#define SPATIAL_HASH_NBSEARCH_KERNELS_H
+
+namespace FluidSimulationGPU {
+
+__global__ void computeCellIndices(float3* positions, unsigned int* cellIndices, int numParticles, float3 gridMin, float3 gridCellSize, int gridSizeX, int gridSizeY, int gridSizeZ);
+__global__ void reorderData(unsigned int* particleIndices, float3* positions, float3* velocities, float3* positions_sorted, float3* velocities_sorted, int numParticles);
+__global__ void findCellStartEnd(unsigned int* cellIndices, unsigned int* cellStart, unsigned int* cellEnd, int numParticles);
+void neighbourHoodSearch(int nParticles, float3* d_pos, float3* d_vel, float3* d_posSorted, float3* d_velSorted, unsigned int* d_cellIndc, unsigned int* d_parIndc, unsigned int* d_cEnd, unsigned int* d_cStart, const int bpg, const int tpb, int &gridSizeX, int &gridSizeY, int &gridSizeZ);
+
+} // namespace FluidSimulationGPU
+
+#endif // SPATIAL_HASH_NBSEARCH_KERNELS_H
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7cb10f4a4d13d2960bede1f7179c7701a12fbb3e
--- /dev/null
+++ b/src/cuda/init_cuda.cu
@@ -0,0 +1,275 @@
+// init_cu.cu
+
+#include "include/cuda_device_vars.h"
+#include "../settings/constants.h"
+
+#include "include/pcisph_kernels.h"
+
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+#include <device_launch_parameters.h>
+
+namespace FluidSimulationGPU {
+
+// Global Variables
+cudaGraphicsResource_t cuda_vbo_resource;
+cudaGraphicsResource_t cuda_vbo_boundary_resource;
+
+// Constants
+__constant__ float PI;
+__constant__ float H;
+__constant__ float H2;
+__constant__ float POLY6_CONST;
+__constant__ float SPIKY_GRAD_CONST;
+__constant__ float VISC_LAP_CONST;
+
+__constant__ float PARTICLE_MASS;
+__constant__ float REST_DENSITY;
+__constant__ float VISCOSITY;
+
+// Boundary constants
+__constant__ float BOUND_X_MIN;
+__constant__ float BOUND_X_MAX;
+__constant__ float BOUND_Y_MIN;
+__constant__ float BOUND_Y_MAX;
+__constant__ float BOUND_Z_MIN;
+__constant__ float BOUND_Z_MAX;
+
+// Device Arrays
+float3* d_positions = nullptr;
+float3* d_velocities = nullptr;
+
+unsigned int* d_cellIndices = nullptr;
+unsigned int* d_particleIndices = nullptr;
+
+float3* d_positions_sorted = nullptr;
+float3* d_velocities_sorted = nullptr;
+
+unsigned int* d_cellStart = nullptr;
+unsigned int* d_cellEnd = nullptr;
+
+// Arrays for densities, pressures, accelerations
+float* d_densities = nullptr;
+float* d_density_errors = nullptr;
+float* d_pressure_increments = nullptr;
+float3* d_pressure_forces = nullptr;
+float3* d_accelerations = nullptr;
+
+// Additional arrays for PCISPH
+float3* d_velocities_pred = nullptr;
+float3* d_positions_pred = nullptr;
+float* d_sum_gradW2 = nullptr;
+
+// Boundary particles
+float3* d_boundary_positions = nullptr;
+float3* d_boundary_velocities = nullptr;
+
+unsigned int* d_boundaryCellStart = nullptr;
+unsigned int* d_boundaryCellEnd = nullptr;
+
+unsigned int* d_boundaryCellIndices = nullptr;
+unsigned int* d_boundaryParticleIndices = nullptr;    
+
+void initializeConstants()
+{
+    // Host constants
+    const float h_H2 = h_H * h_H;
+    const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9));
+    const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6));
+    const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6));
+
+    // Copy constants to device
+    cudaMemcpyToSymbol(PI, &h_PI, sizeof(float));
+    cudaMemcpyToSymbol(H, &h_H, sizeof(float));
+    cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
+    cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
+    cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
+    cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
+
+    cudaMemcpyToSymbol(PARTICLE_MASS, &h_PARTICLE_MASS, sizeof(float));
+    cudaMemcpyToSymbol(REST_DENSITY, &h_REST_DENSITY, sizeof(float));
+    cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float));
+
+    // Copy boundary constants to device
+    cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float));
+}
+
+// Register VBO with CUDA
+void registerVBOWithCUDA(unsigned int vbo)
+{
+    cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
+}
+
+void registerBoundaryVBOWithCUDA(unsigned int vbo)
+{
+    cudaGraphicsGLRegisterBuffer(&cuda_vbo_boundary_resource, vbo, cudaGraphicsMapFlagsNone);
+}
+
+// Unregister VBO from CUDA
+void unregisterVBOFromCUDA()
+{
+    cudaGraphicsUnregisterResource(cuda_vbo_resource);
+}
+
+void unregisterBoundaryVBOFromCUDA()
+{
+    cudaGraphicsUnregisterResource(cuda_vbo_boundary_resource);
+}
+
+// Initialize device arrays
+void initializeFluidDeviceArrays(
+    const float3* h_positions,
+    const float3* h_velocities,
+    int numParticles
+)
+{
+    // Allocate device memory for velocities
+    cudaMalloc(&d_velocities, numParticles * sizeof(float3));
+    cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+    // Allocate device memory for predicted velocities and positions
+    cudaMalloc(&d_velocities_pred, numParticles * sizeof(float3));
+    cudaMalloc(&d_positions_pred, numParticles * sizeof(float3));
+
+    // Allocate device memory for cell indices and particle indices
+    cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int));
+    cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int));
+
+    // Allocate device memory for sorted positions and velocities
+    cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3));
+    cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3));
+
+    // Allocate cell start and end arrays
+    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+    cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int));
+    cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int));
+
+    // Allocate device memory for densities, pressure increments, and accelerations
+    cudaMalloc(&d_densities, numParticles * sizeof(float));
+    cudaMalloc(&d_density_errors, numParticles * sizeof(float));
+    cudaMalloc(&d_pressure_increments, numParticles * sizeof(float));
+    cudaMalloc(&d_pressure_forces, numParticles * sizeof(float3));
+    cudaMalloc(&d_accelerations, numParticles * sizeof(float3));
+
+    // Allocate device memory for sum_gradW2
+    cudaMalloc(&d_sum_gradW2, numParticles * sizeof(float));
+
+    // Initialize positions_pred to initial positions
+    cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+}
+
+void initializeBoundryDeviceArrays(
+    const float3* h_boundaryPositions,
+    const float3* h_boundaryVelocities,
+    int numBoundaryParticles
+)
+{
+    // Allocate cell start and end arrays
+    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+    // cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3));
+    cudaMalloc(&d_boundary_velocities, numBoundaryParticles * sizeof(float3));
+
+    //cudaMemcpy(d_boundary_positions, h_boundaryPositions, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_boundary_velocities, h_boundaryVelocities, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+    cudaMalloc(&d_boundaryCellIndices, numBoundaryParticles * sizeof(unsigned int));
+    cudaMalloc(&d_boundaryParticleIndices, numBoundaryParticles * sizeof(unsigned int));
+    
+    cudaMalloc(&d_boundaryCellStart, maxCells * sizeof(unsigned int));
+    cudaMalloc(&d_boundaryCellEnd, maxCells * sizeof(unsigned int));
+}
+
+
+// NeighbourHoodSearch for boundary particles 1 time at start since they dont move
+void boundaryNBSearch(int numBoundaryParticles) {
+    
+    // Map the VBO resource to get a device pointer
+    size_t num_bytes;
+    cudaGraphicsMapResources(1, &cuda_vbo_boundary_resource, 0);
+    cudaGraphicsResourceGetMappedPointer((void**)&d_boundary_positions, &num_bytes, cuda_vbo_boundary_resource);
+    
+    int gsX, gsY, gsZ;
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numBoundaryParticles + threadsPerBlock - 1) / threadsPerBlock;
+    
+    float3* d_boundary_positions_sorted;
+    float3* d_boundary_velocities_sorted;
+    
+
+    cudaMalloc(&d_boundary_positions_sorted, numBoundaryParticles * sizeof(float3));
+    cudaMalloc(&d_boundary_velocities_sorted, numBoundaryParticles * sizeof(float3));
+    
+    neigbourHoodSearch(
+            numBoundaryParticles, 
+            d_boundary_positions,
+            d_boundary_velocities,
+            d_boundary_positions_sorted,
+            d_boundary_velocities_sorted,
+            d_boundaryCellIndices,
+            d_boundaryParticleIndices,
+            d_boundaryCellEnd,
+            d_boundaryCellStart,
+            blocksPerGrid,
+            threadsPerBlock,
+            gsX,
+            gsY,
+            gsZ); 
+    
+    // Unmap the VBO resource
+    cudaGraphicsUnmapResources(1, &cuda_vbo_boundary_resource, 0);
+    
+    cudaFree(d_boundary_positions_sorted);
+    cudaFree(d_boundary_velocities_sorted);
+    
+    // Move some of above init code for boundary to seeparate function
+}
+
+// Cleanup device arrays
+void cleanupDeviceArrays()
+{
+    cudaFree(d_velocities);
+    cudaFree(d_velocities_pred);
+    cudaFree(d_positions_pred);
+
+    cudaFree(d_cellIndices);
+    cudaFree(d_particleIndices);
+
+    cudaFree(d_positions_sorted);
+    cudaFree(d_velocities_sorted);
+
+    cudaFree(d_cellStart);
+    cudaFree(d_cellEnd);
+
+    cudaFree(d_densities);
+    cudaFree(d_density_errors);
+    cudaFree(d_pressure_increments);
+    cudaFree(d_pressure_forces);
+    cudaFree(d_accelerations);
+
+    cudaFree(d_boundary_positions);
+    cudaFree(d_boundary_velocities);
+
+    cudaFree(d_boundaryCellIndices);
+    cudaFree(d_boundaryParticleIndices);
+    cudaFree(d_boundaryCellEnd);
+    cudaFree(d_boundaryCellStart);
+
+    cudaFree(d_sum_gradW2);
+}
+
+
+} // namespace FluidSimulationGPU
diff --git a/src/cuda/kernels_cuda.cu b/src/cuda/kernels_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d7dca020e5aea56ddc0520a8bd3e97219939264a
--- /dev/null
+++ b/src/cuda/kernels_cuda.cu
@@ -0,0 +1,606 @@
+// kernels_cuda.cu
+#include "kernels_cuda.h"
+#include "constants.h"
+#include "include/float3_helper_math.h"
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+#include <device_launch_parameters.h>
+#include <cmath>
+
+// Include Thrust for sorting
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+
+// CUDA Graphics Resource (for OpenGL-CUDA interoperability)
+cudaGraphicsResource_t cuda_vbo_resource;
+
+// Constants
+__constant__ float PI;
+__constant__ float H;
+__constant__ float H2;
+__constant__ float POLY6_CONST;
+__constant__ float SPIKY_GRAD_CONST;
+__constant__ float VISC_LAP_CONST;
+
+__constant__ float PARTICLE_MASS = 1.0f;
+__constant__ float REST_DENSITY = 60.0f;
+__constant__ float GAS_CONSTANT;
+__constant__ float VISCOSITY;
+
+// Boundary constants
+__constant__ float BOUND_X_MIN;
+__constant__ float BOUND_X_MAX;
+__constant__ float BOUND_Y_MIN;
+__constant__ float BOUND_Y_MAX;
+__constant__ float BOUND_Z_MIN;
+__constant__ float BOUND_Z_MAX;
+
+// Device arrays
+float3* d_positions; // Positions mapped from OpenGL VBO
+float3* d_velocities;
+
+unsigned int* d_cellIndices;
+unsigned int* d_particleIndices;
+
+float3* d_positions_sorted;
+float3* d_velocities_sorted;
+
+unsigned int* d_cellStart;
+unsigned int* d_cellEnd;
+
+// Arrays for densities, pressures, accelerations
+float* d_densities;
+float* d_pressures;
+float3* d_accelerations;
+
+// Smoothing kernels
+__device__ float poly6Kernel(float r2)
+{
+    if (r2 >= 0.0f && r2 <= H2) {
+        float term = H2 - r2;
+        return POLY6_CONST * term * term * term;
+    } else {
+        return 0.0f;
+    }
+}
+
+__device__ float spikyKernelGrad(float r)
+{
+    if (r > 0.0f && r <= H) {
+        float term = H - r;
+        return SPIKY_GRAD_CONST * term * term;
+    } else {
+        return 0.0f;
+    }
+}
+
+__device__ float viscosityKernelLap(float r)
+{
+    if (r >= 0.0f && r <= H) {
+        return VISC_LAP_CONST * (H - r);
+    } else {
+        return 0.0f;
+    }
+}
+
+// Compute cell indices
+__global__ void computeCellIndices(
+    float3* positions,
+    unsigned int* cellIndices,
+    int numParticles,
+    float3 gridMin,
+    float3 gridCellSize,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos = positions[idx];
+
+        int ix = (int)((pos.x - gridMin.x) / gridCellSize.x);
+        int iy = (int)((pos.y - gridMin.y) / gridCellSize.y);
+        int iz = (int)((pos.z - gridMin.z) / gridCellSize.z);
+
+        // Clamp indices to grid bounds
+        ix = max(0, min(ix, gridSizeX - 1));
+        iy = max(0, min(iy, gridSizeY - 1));
+        iz = max(0, min(iz, gridSizeZ - 1));
+
+        unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY;
+
+        cellIndices[idx] = cellIndex;
+    }
+}
+
+// Reordering kernel
+__global__ void reorderData(
+    unsigned int* particleIndices,
+    float3* positions,
+    float3* velocities,
+    float3* positions_sorted,
+    float3* velocities_sorted,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        unsigned int sortedIdx = particleIndices[idx];
+
+        positions_sorted[idx] = positions[sortedIdx];
+        velocities_sorted[idx] = velocities[sortedIdx];
+    }
+}
+
+// Find cell start and end indices
+// can bli reeplaceed med atomicAdd för varje ceell index??
+__global__ void findCellStartEnd(
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        unsigned int cellIndex = cellIndices[idx];
+
+        if (idx == 0) {
+            cellStart[cellIndex] = 0;
+        } else {
+            unsigned int prevCellIndex = cellIndices[idx - 1];
+            if (prevCellIndex != cellIndex) {
+                cellStart[cellIndex] = idx;
+                cellEnd[prevCellIndex] = idx;
+            }
+        }
+
+        if (idx == numParticles - 1) {
+            cellEnd[cellIndex] = numParticles;
+        }
+    }
+}
+
+// Compute density and pressure
+__global__ void computeDensityPressure(
+    float3* positions,
+    float* densities,
+    float* pressures,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos_i = positions[idx];
+        float density = 0.0f;
+
+        unsigned int cellIndex = cellIndices[idx];
+
+        // Compute cell coordinates
+        int cellX = cellIndex % gridSizeX;
+        int cellY = (cellIndex / gridSizeX) % gridSizeY;
+        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+        // Iterate over neighboring cells
+        for (int dz = -1; dz <= 1; ++dz) {
+            int nz = cellZ + dz;
+            if (nz < 0 || nz >= gridSizeZ) continue;
+            for (int dy = -1; dy <= 1; ++dy) {
+                int ny = cellY + dy;
+                if (ny < 0 || ny >= gridSizeY) continue;
+                for (int dx = -1; dx <= 1; ++dx) {
+                    int nx = cellX + dx;
+                    if (nx < 0 || nx >= gridSizeX) continue;
+
+                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                    unsigned int startIdx = cellStart[neighborCellIndex];
+                    unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                    if (startIdx != 0xFFFFFFFF) {
+                        for (unsigned int j = startIdx; j < endIdx; ++j) {
+                            float3 pos_j = positions[j];
+
+                            float3 r = pos_j - pos_i;
+                            float r2 = r.x * r.x + r.y * r.y + r.z * r.z;
+
+                            density += PARTICLE_MASS * poly6Kernel(r2);
+                        }
+                    }
+                }
+            }
+        }
+
+        densities[idx] = density;
+        //pressures[idx] = GAS_CONSTANT * fmaxf((density - REST_DENSITY), 0.0);
+        float dens = density/REST_DENSITY;
+        pressures[idx] = GAS_CONSTANT * ((dens*dens*dens*dens*dens*dens*dens)-1.0);
+        //pressures[idx] = GAS_CONSTANT * min((density - REST_DENSITY), 0.0);
+        //pressures[idx] = GAS_CONSTANT * (density - REST_DENSITY);
+    }
+}
+
+// Compute forces
+__global__ void computeForces(
+    float3* positions,
+    float3* velocities,
+    float* densities,
+    float* pressures,
+    float3* accelerations,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos_i = positions[idx];
+        float3 vel_i = velocities[idx];
+        float density_i = densities[idx];
+        float pressure_i = pressures[idx];
+
+        float3 fPressure = make_float3(0.0f, 0.0f, 0.0f);
+        float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f);
+        float3 fGravity = make_float3(0.0f, -4.0f , 0.0f);
+
+        unsigned int cellIndex = cellIndices[idx];
+
+        // Compute cell coordinates
+        int cellX = cellIndex % gridSizeX;
+        int cellY = (cellIndex / gridSizeX) % gridSizeY;
+        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+        // Iterate over neighboring cells
+        for (int dz = -1; dz <= 1; ++dz) {
+            int nz = cellZ + dz;
+            if (nz < 0 || nz >= gridSizeZ) continue;
+            for (int dy = -1; dy <= 1; ++dy) {
+                int ny = cellY + dy;
+                if (ny < 0 || ny >= gridSizeY) continue;
+                for (int dx = -1; dx <= 1; ++dx) {
+                    int nx = cellX + dx;
+                    if (nx < 0 || nx >= gridSizeX) continue;
+
+                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                    unsigned int startIdx = cellStart[neighborCellIndex];
+                    unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                    if (startIdx != 0xFFFFFFFF) {
+                        for (unsigned int j = startIdx; j < endIdx; ++j) {
+                            if (j == idx) continue;
+
+                            float3 pos_j = positions[j];
+                            float3 r = (pos_i - pos_j);
+                            
+                            float distSq = r.x * r.x + r.y * r.y + r.z * r.z;
+
+                            if (distSq < H2 && distSq > 0.0001f) {
+                                float dist = sqrtf(distSq);
+
+                                // Pressure force
+                                float spikyGrad = spikyKernelGrad(dist);
+
+                                // Cache pressures[j]
+                                float density_j = densities[j];
+                                float pressure_j = pressures[j];
+
+                                // avreged pressure scaleed by inverse j density
+                                /*
+                                float pressureTerm = - PARTICLE_MASS * 0.5 * (pressure_i + pressure_j) / density_j * spikyGrad;
+                                fPressure += pressureTerm*(r/dist); // r/dist gives direction
+                                */
+
+                                // Symmetric pressure -- can cahnge to inverse dennsity in density/pressure calc to avoid devide by density here
+                                // Add particle_mass to pressure term according to equations !!! Added down below in aceleration calcualtion
+                                float pressureTerm = - (pressure_i / (density_i * density_i) + pressure_j / (density_j * density_j)) * spikyGrad;
+                                float3 pressureForce = pressureTerm * (r / dist); // Directional force
+                                fPressure += pressureForce;
+
+                                // Viscosity force
+                                float3 velDiff = velocities[j] - vel_i;
+                                float viscLap = viscosityKernelLap(dist);
+                                fViscosity  += VISCOSITY * velDiff * viscLap / density_j * PARTICLE_MASS;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Total acceleration
+        // According to paper v_i(t+dt) = v_(i+1) + (dt/m_i)*Fp_i
+        // While external and viscocisty force is calculated before But mass and nnot deeensity is used to deevidee force
+        // Deviding by mass, looks a lot better
+        accelerations[idx] = (fPressure * PARTICLE_MASS + fViscosity + fGravity) / PARTICLE_MASS;
+        //accelerations[idx] = (fPressure * PARTICLE_MASS + fViscosity + fGravity) / density_i;
+    }
+}
+
+// Integration
+__global__ void integrate(
+    float3* positions,
+    float3* velocities,
+    float3* accelerations,
+    float timeStep,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        // Update velocity
+        float3 vel_i = velocities[idx] + (accelerations[idx] * timeStep);
+        velocities[idx] = vel_i;
+
+        // Update position
+        positions[idx] += vel_i * timeStep;
+    }
+}
+
+// Enforce boundary conditions
+__global__ void enforceBoundary(
+    float3* positions,
+    float3* velocities,
+    int numParticles,
+    float damping
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        // X-axis
+        if (positions[idx].x < BOUND_X_MIN) {
+            positions[idx].x = BOUND_X_MIN;
+            velocities[idx].x *= -damping;
+        }
+        if (positions[idx].x > BOUND_X_MAX) {
+            positions[idx].x = BOUND_X_MAX; // Should account for particle radius!!
+            velocities[idx].x *= -damping;
+        }
+
+        // Y-axis
+        if (positions[idx].y < BOUND_Y_MIN) {
+            positions[idx].y = BOUND_Y_MIN;
+            velocities[idx].y *= -damping;
+        }
+        if (positions[idx].y > BOUND_Y_MAX) {
+            positions[idx].y = BOUND_Y_MAX;
+            velocities[idx].y *= -damping;
+        }
+
+        // Z-axis
+        if (positions[idx].z < BOUND_Z_MIN) {
+            positions[idx].z = BOUND_Z_MIN;
+            velocities[idx].z *= -damping;
+        }
+        if (positions[idx].z > BOUND_Z_MAX) {
+            positions[idx].z = BOUND_Z_MAX;
+            velocities[idx].z *= -damping;
+        }
+    }
+}
+
+void updateParticles(float deltaTime)
+{
+    // Map the VBO resource to get a device pointer
+    size_t num_bytes;
+    cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
+    cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource);
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Compute grid size based on smoothing radius
+    float3 gridMin = make_float3(h_BOUND_X_MIN, h_BOUND_Y_MIN, h_BOUND_Z_MIN);
+    float3 gridMax = make_float3(h_BOUND_X_MAX, h_BOUND_Y_MAX, h_BOUND_Z_MAX);
+    float3 gridCellSize = make_float3(h_H, h_H, h_H);
+
+    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+    // Allocate or resize cellStart and cellEnd arrays if necessary
+    // (You may need to adjust your initialization code to handle dynamic sizes)
+
+    // Compute cell indices
+    computeCellIndices<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions, d_cellIndices, NUM_PARTICLES, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    // Wrap device pointers with thrust iterators
+    thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndices);
+    thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_particleIndices);
+
+    // Initialize particle indices
+    thrust::sequence(dev_particleIndices_ptr, dev_particleIndices_ptr + NUM_PARTICLES);
+
+    // Sort particle indices based on cell indices
+    thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + NUM_PARTICLES, dev_particleIndices_ptr);
+
+    // Reorder positions and velocities
+    reorderData<<<blocksPerGrid, threadsPerBlock>>>(
+        d_particleIndices,
+        d_positions,
+        d_velocities,
+        d_positions_sorted,
+        d_velocities_sorted,
+        NUM_PARTICLES);
+    cudaDeviceSynchronize();
+
+    // Swap sorted data with original data
+    float3* temp_positions = d_positions;
+    d_positions = d_positions_sorted;
+    d_positions_sorted = temp_positions;
+
+    float3* temp_velocities = d_velocities;
+    d_velocities = d_velocities_sorted;
+    d_velocities_sorted = temp_velocities;
+
+    // Build cell start and end indices
+    // Initialize cellStart and cellEnd
+    cudaMemset(d_cellStart, 0xFF, maxCells * sizeof(unsigned int));
+    cudaMemset(d_cellEnd, 0xFF, maxCells * sizeof(unsigned int));
+
+    findCellStartEnd<<<blocksPerGrid, threadsPerBlock>>>(
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES);
+    cudaDeviceSynchronize();
+
+    // Compute density and pressure
+    computeDensityPressure<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_densities,
+        d_pressures,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    // Compute forces
+    computeForces<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        d_densities,
+        d_pressures,
+        d_accelerations,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    // Integrate
+    float timeStep = 0.001f;
+    integrate<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        d_accelerations,
+        timeStep,
+        //deltaTime,
+        NUM_PARTICLES);
+    cudaDeviceSynchronize();
+
+    // Enforce boundary conditions
+    float damping = 0.7f;
+    enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        NUM_PARTICLES,
+        damping);
+    cudaDeviceSynchronize();
+
+    // Unmap the VBO resource
+    cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
+}
+
+void initializeConstants()
+{
+    // Host constants
+    const float h_H2 = h_H * h_H;
+    const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9));
+    const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6));
+    const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6));
+
+    // Copy constants to device
+    cudaMemcpyToSymbol(PI, &h_PI, sizeof(float));
+    cudaMemcpyToSymbol(H, &h_H, sizeof(float));
+    cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
+    cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
+    cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
+    cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
+
+    cudaMemcpyToSymbol(GAS_CONSTANT, &h_GAS_CONSTANT, sizeof(float));
+    cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float));
+
+    // Copy boundary constants to device
+    cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float));
+    cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float));
+}
+
+// Register VBO with CUDA
+void registerVBOWithCUDA(unsigned int vbo)
+{
+    cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
+}
+
+// Unregister VBO from CUDA
+void unregisterVBOFromCUDA()
+{
+    cudaGraphicsUnregisterResource(cuda_vbo_resource);
+}
+
+// Initialize device arrays
+void initializeDeviceArrays(
+    const float3* h_positions,
+    const float3* h_velocities,
+    int numParticles
+)
+{
+    // Allocate device memory for velocities
+    cudaMalloc(&d_velocities, numParticles * sizeof(float3));
+    cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+    // Allocate device memory for cell indices and particle indices
+    cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int));
+    cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int));
+
+    // Allocate device memory for sorted positions and velocities
+    cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3));
+    cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3));
+
+    // Allocate cell start and end arrays
+    // Note: You need to ensure that 'maxCells' is available here or adjust the code to allocate after computing 'maxCells'
+    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+    cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int));
+    cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int));
+
+    // Allocate device memory for densities, pressures, and accelerations
+    cudaMalloc(&d_densities, numParticles * sizeof(float));
+    cudaMalloc(&d_pressures, numParticles * sizeof(float));
+    cudaMalloc(&d_accelerations, numParticles * sizeof(float3));
+}
+
+// Cleanup device arrays
+void cleanupDeviceArrays()
+{
+    cudaFree(d_velocities);
+
+    cudaFree(d_cellIndices);
+    cudaFree(d_particleIndices);
+
+    cudaFree(d_positions_sorted);
+    cudaFree(d_velocities_sorted);
+
+    cudaFree(d_cellStart);
+    cudaFree(d_cellEnd);
+
+    cudaFree(d_densities);
+    cudaFree(d_pressures);
+    cudaFree(d_accelerations);
+}
\ No newline at end of file
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2efd40677d2f06175404a1d6a7f80950a4733db
--- /dev/null
+++ b/src/cuda/pcisph_kernels.cu
@@ -0,0 +1,357 @@
+// pcisph_kernels.cu
+#include "include/pcisph_kernels.h"
+#include "include/smoothing_kernels.h"
+
+#include "include/float3_helper_math.h"
+
+#include "include/cuda_device_vars.h"
+#include "../settings/constants.h"
+
+
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+#include <device_launch_parameters.h>
+
+// Include Thrust for sorting and reductions
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+#include <thrust/reduce.h>
+
+
+namespace FluidSimulationGPU {
+
+
+
+// Compute density errors (used in PCISPH)
+__global__ void computeDensityError(
+    float3* positions,
+    float* densities,
+    float* density_errors,
+    float* sum_gradW2,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos_i = positions[idx];
+        float density = 0.0f;
+        float sumGradW2_local = 0.0f;
+
+        unsigned int cellIndex = cellIndices[idx];
+
+        // Compute cell coordinates
+        int cellX = cellIndex % gridSizeX;
+        int cellY = (cellIndex / gridSizeX) % gridSizeY;
+        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+        // Iterate over neighboring cells
+        for (int dz = -1; dz <= 1; ++dz) {
+            int nz = cellZ + dz;
+            if (nz < 0 || nz >= gridSizeZ) continue;
+            for (int dy = -1; dy <= 1; ++dy) {
+                int ny = cellY + dy;
+                if (ny < 0 || ny >= gridSizeY) continue;
+                for (int dx = -1; dx <= 1; ++dx) {
+                    int nx = cellX + dx;
+                    if (nx < 0 || nx >= gridSizeX) continue;
+
+                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                    unsigned int startIdx = cellStart[neighborCellIndex];
+                    unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                    if (startIdx != 0xFFFFFFFF) {
+                        for (unsigned int j = startIdx; j < endIdx; ++j) {
+                            float3 pos_j = positions[j];
+
+                            float3 r = pos_i - pos_j;
+                            float r2 = dot(r, r);
+
+                            if (r2 < H2 && r2 > 0.0001f) {
+                                float dist = sqrtf(r2);
+
+                                float W = poly6Kernel(r2);
+                                density += PARTICLE_MASS * W;
+
+                                if (sum_gradW2 != nullptr) {
+                                    // Compute gradW
+                                    float gradW_coeff = spikyKernelGrad(dist);
+                                    float3 gradW = gradW_coeff * (r / dist);
+
+                                    sumGradW2_local += dot(gradW, gradW);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        densities[idx] = density;
+        density_errors[idx] = density - REST_DENSITY;
+
+        if (sum_gradW2 != nullptr) {
+            sum_gradW2[idx] = sumGradW2_local;
+        }
+    }
+}
+
+// Update pressures based on density errors
+__global__ void updatePressures(
+    float* pressure_increments,
+    float* density_errors,
+    float A,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float deltaP = -density_errors[idx] / A;
+        pressure_increments[idx] += deltaP;
+    }
+}
+
+// Compute pressure forces
+__global__ void computePressureForces(
+    float3* positions,
+    float* pressure_increments,
+    float3* pressure_forces,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos_i = positions[idx];
+        float deltaP_i = pressure_increments[idx];
+
+        float3 fPressure = make_float3(0.0f, 0.0f, 0.0f);
+
+        unsigned int cellIndex = cellIndices[idx];
+
+        // Compute cell coordinates
+        int cellX = cellIndex % gridSizeX;
+        int cellY = (cellIndex / gridSizeX) % gridSizeY;
+        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+        // Iterate over neighboring cells
+        for (int dz = -1; dz <= 1; ++dz) {
+            int nz = cellZ + dz;
+            if (nz < 0 || nz >= gridSizeZ) continue;
+            for (int dy = -1; dy <= 1; ++dy) {
+                int ny = cellY + dy;
+                if (ny < 0 || ny >= gridSizeY) continue;
+                for (int dx = -1; dx <= 1; ++dx) {
+                    int nx = cellX + dx;
+                    if (nx < 0 || nx >= gridSizeX) continue;
+
+                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                    unsigned int startIdx = cellStart[neighborCellIndex];
+                    unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                    if (startIdx != 0xFFFFFFFF) {
+                        for (unsigned int j = startIdx; j < endIdx; ++j) {
+                            if (j == idx) continue;
+
+                            float3 pos_j = positions[j];
+                            float deltaP_j = pressure_increments[j];
+
+                            float3 r = pos_i - pos_j;
+
+                            float distSq = dot(r, r);
+
+                            if (distSq < H2 && distSq > 0.0001f) {
+                                float dist = sqrtf(distSq);
+
+                                // Compute gradient of kernel
+                                float gradW = spikyKernelGrad(dist);
+                                float3 gradWij = gradW * (r / dist);
+
+                                // Symmetric pressure force
+                                float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
+                                fPressure += factor * gradWij;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Multiply by constants
+        fPressure *= PARTICLE_MASS * PARTICLE_MASS;
+
+        pressure_forces[idx] = fPressure;
+    }
+}
+
+// Compute viscosity and external forces (excluding pressure forces)
+__global__ void computeViscosityAndExternalForces(
+    float3* positions,
+    float3* velocities,
+    float3* accelerations,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos_i = positions[idx];
+        float3 vel_i = velocities[idx];
+
+        float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f);
+        float3 fGravity = make_float3(0.0f, -20.0f, 0.0f) * PARTICLE_MASS;
+
+        unsigned int cellIndex = cellIndices[idx];
+
+        // Compute cell coordinates
+        int cellX = cellIndex % gridSizeX;
+        int cellY = (cellIndex / gridSizeX) % gridSizeY;
+        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+        // Iterate over neighboring cells
+        for (int dz = -1; dz <= 1; ++dz) {
+            int nz = cellZ + dz;
+            if (nz < 0 || nz >= gridSizeZ) continue;
+            for (int dy = -1; dy <= 1; ++dy) {
+                int ny = cellY + dy;
+                if (ny < 0 || ny >= gridSizeY) continue;
+                for (int dx = -1; dx <= 1; ++dx) {
+                    int nx = cellX + dx;
+                    if (nx < 0 || nx >= gridSizeX) continue;
+
+                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                    unsigned int startIdx = cellStart[neighborCellIndex];
+                    unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                    if (startIdx != 0xFFFFFFFF) {
+                        for (unsigned int j = startIdx; j < endIdx; ++j) {
+                            if (j == idx) continue;
+
+                            float3 pos_j = positions[j];
+                            float3 vel_j = velocities[j];
+                            float3 r = pos_i - pos_j;
+
+                            float distSq = dot(r, r);
+
+                            if (distSq < H2 && distSq > 0.0001f) {
+                                float dist = sqrtf(distSq);
+
+                                // Viscosity force
+                                float3 velDiff = vel_j - vel_i;
+                                float viscLap = viscosityKernelLap(dist);
+                                fViscosity += VISCOSITY * velDiff * viscLap * PARTICLE_MASS / REST_DENSITY;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Total acceleration (excluding pressure force)
+        accelerations[idx] = (fViscosity + fGravity) / PARTICLE_MASS;
+    }
+}
+
+// Enforce boundary conditions, very simple should bee improved.
+__global__ void enforceBoundary(
+    float3* positions,
+    float3* velocities,
+    int numParticles,
+    float damping
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        // X-axis
+        if (positions[idx].x < BOUND_X_MIN) {
+            positions[idx].x = BOUND_X_MIN;
+            velocities[idx].x *= -damping;
+        }
+        if (positions[idx].x > BOUND_X_MAX) {
+            positions[idx].x = BOUND_X_MAX;
+            velocities[idx].x *= -damping;
+        }
+
+        // Y-axis
+        if (positions[idx].y < BOUND_Y_MIN) {
+            positions[idx].y = BOUND_Y_MIN;
+            velocities[idx].y *= -damping;
+        }
+        if (positions[idx].y > BOUND_Y_MAX) {
+            positions[idx].y = BOUND_Y_MAX;
+            velocities[idx].y *= -damping;
+        }
+
+        // Z-axis
+        if (positions[idx].z < BOUND_Z_MIN) {
+            positions[idx].z = BOUND_Z_MIN;
+            velocities[idx].z *= -damping;
+        }
+        if (positions[idx].z > BOUND_Z_MAX) {
+            positions[idx].z = BOUND_Z_MAX;
+            velocities[idx].z *= -damping;
+        }
+    }
+}
+
+// Kernel to predict velocities and positions
+__global__ void predictVelocitiesPositions(
+    float3* velocities,
+    float3* velocities_pred,
+    float3* positions,
+    float3* positions_pred,
+    float3* accelerations,
+    float timeStep,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numParticles) {
+        // Compute predicted velocity
+        velocities_pred[idx] = velocities[idx] + accelerations[idx] * timeStep;
+        // Compute predicted position
+        positions_pred[idx] = positions[idx] + velocities_pred[idx] * timeStep;
+    }
+}
+
+// Update velocities and positions based on pressure forces in 
+// pressure correcting loop. 
+__global__ void updateVelocitiesPositions(
+    float3* velocities_pred,
+    float3* positions_pred,
+    float3* pressure_forces,
+    float timeStep,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        // Update predicted velocity
+        velocities_pred[idx] += (pressure_forces[idx] / PARTICLE_MASS) * timeStep;
+
+        // Update predicted position
+        positions_pred[idx] += (pressure_forces[idx] / PARTICLE_MASS) * (timeStep * timeStep);
+    }
+}
+
+
+} // namespace FluidSimulationGPU
\ No newline at end of file
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
new file mode 100644
index 0000000000000000000000000000000000000000..05c163b43b3544496b5138ca7aebb6c9308c2a93
--- /dev/null
+++ b/src/cuda/pcisph_update.cu
@@ -0,0 +1,155 @@
+#include "include/cuda_device_vars.h"
+#include "include/pcisph_kernels.h"
+#include "include/spatialHash_NBSearch_kernels.h"
+#include "../settings/constants.h"
+
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+#include <device_launch_parameters.h>
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+
+namespace FluidSimulationGPU {
+
+void updateParticles(float deltaTime)
+{
+    // Map the VBO resource to get a device pointer
+    size_t num_bytes;
+    cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
+    cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource);
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Time step should be small but we can run updateParticles multiple times during display
+    float timeStep = 0.004f;
+
+    // -------------------- Neighbor Search --------------------
+    int gridSizeX, gridSizeY, gridSizeZ;
+    neigbourHoodSearch(
+        NUM_PARTICLES,
+        d_positions,
+        d_velocities,
+        d_positions_sorted,
+        d_velocities_sorted,
+        d_cellIndices,
+        d_particleIndices,
+        d_cellEnd,
+        d_cellStart,
+        blocksPerGrid,
+        threadsPerBlock,
+        gridSizeX,
+        gridSizeY,
+        gridSizeZ);
+
+    // -------------------- Prediction Step --------------------
+
+    computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        d_accelerations,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+        d_velocities,
+        d_velocities_pred,
+        d_positions,
+        d_positions_pred,
+        d_accelerations,
+        timeStep,
+        NUM_PARTICLES);
+    cudaDeviceSynchronize();
+
+    // Initialize pressure increments to zero
+    cudaMemset(d_pressure_increments, 0, NUM_PARTICLES * sizeof(float));
+
+    // Compute densities and density errors, and per-particle sumGradW2
+    computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions_pred,
+        d_densities,
+        d_density_errors,
+        d_sum_gradW2,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    // Reduce sum_gradW2 over all particles
+    thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
+    float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
+
+    // Compute A
+    float m = h_PARTICLE_MASS;
+    float rho0 = h_REST_DENSITY;
+    float A = 2.0f * timeStep * timeStep * m * m / (rho0 * rho0) * totalSumGradW2;
+
+    // ------------ Iterative Pressure Correction Loop --------------------
+
+    const int maxIterations = 5;
+
+    for (int iter = 0; iter < maxIterations; ++iter) {
+        computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions_pred,
+            d_densities,
+            d_density_errors,
+            nullptr,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
+
+        updatePressures<<<blocksPerGrid, threadsPerBlock>>>(
+            d_pressure_increments,
+            d_density_errors,
+            A,
+            NUM_PARTICLES);
+        cudaDeviceSynchronize();
+
+        computePressureForces<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions_pred,
+            d_pressure_increments,
+            d_pressure_forces,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
+
+        updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+            d_velocities_pred,
+            d_positions_pred,
+            d_pressure_forces,
+            timeStep,
+            NUM_PARTICLES);
+        cudaDeviceSynchronize();
+    }
+
+    // -------------------- Finalization --------------------
+
+    cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+    cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+
+    // Enforce boundary conditions
+    float damping = 0.75f;
+    enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        NUM_PARTICLES,
+        damping);
+    cudaDeviceSynchronize();
+
+    // Unmap the VBO resource
+    cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
+}
+
+} // namespace FluidSimulationGPU
diff --git a/src/cuda/smoothing_kernels.cu b/src/cuda/smoothing_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..52d7248726cff11f11794cf4946de0a5782b5e76
--- /dev/null
+++ b/src/cuda/smoothing_kernels.cu
@@ -0,0 +1,36 @@
+#include "include/smoothing_kernels.h"
+#include "include/cuda_device_vars.h"
+#include "../settings/constants.h" // Dont ned this one?
+
+namespace FluidSimulationGPU {
+
+__device__ float poly6Kernel(float r2)
+{
+    if (r2 >= 0.0f && r2 <= H2) {
+        float term = H2 - r2;
+        return POLY6_CONST * term * term * term;
+    } else {
+        return 0.0f;
+    }
+}
+
+__device__ float spikyKernelGrad(float r)
+{
+    if (r > 0.0f && r <= H) {
+        float term = H - r;
+        return SPIKY_GRAD_CONST * term * term;
+    } else {
+        return 0.0f;
+    }
+}
+
+__device__ float viscosityKernelLap(float r)
+{
+    if (r >= 0.0f && r <= H) {
+        return VISC_LAP_CONST * (H - r);
+    } else {
+        return 0.0f;
+    }
+}
+
+} // namespace FluidSimulationGPU
diff --git a/src/cuda/spatialHash_NBSearch_kernels.cu b/src/cuda/spatialHash_NBSearch_kernels.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d48d3005fc409f690d3df74f022c1fe38d35aa9c
--- /dev/null
+++ b/src/cuda/spatialHash_NBSearch_kernels.cu
@@ -0,0 +1,159 @@
+#include "include/spatialHash_NBSearch_kernels.h"
+#include "include/cuda_device_vars.h"
+#include "../settings/constants.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+#include <thrust/reduce.h>
+
+namespace FluidSimulationGPU {
+
+// Compute cell indices
+__global__ void computeCellIndices(
+    float3* positions,
+    unsigned int* cellIndices,
+    int numParticles,
+    float3 gridMin,
+    float3 gridCellSize,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        float3 pos = positions[idx];
+
+        int ix = (int)((pos.x - gridMin.x) / gridCellSize.x);
+        int iy = (int)((pos.y - gridMin.y) / gridCellSize.y);
+        int iz = (int)((pos.z - gridMin.z) / gridCellSize.z);
+
+        // Clamp indices to grid bounds
+        ix = max(0, min(ix, gridSizeX - 1));
+        iy = max(0, min(iy, gridSizeY - 1));
+        iz = max(0, min(iz, gridSizeZ - 1));
+
+        unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY;
+
+        cellIndices[idx] = cellIndex;
+    }
+}
+
+// Reordering kernel
+__global__ void reorderData(
+    unsigned int* particleIndices,
+    float3* positions,
+    float3* velocities,
+    float3* positions_sorted,
+    float3* velocities_sorted,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        unsigned int sortedIdx = particleIndices[idx];
+
+        positions_sorted[idx] = positions[sortedIdx];
+        velocities_sorted[idx] = velocities[sortedIdx];
+    }
+}
+
+// Find cell start and end indices
+__global__ void findCellStartEnd(
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < numParticles) {
+        unsigned int cellIndex = cellIndices[idx];
+
+        if (idx == 0) {
+            cellStart[cellIndex] = 0;
+        } else {
+            unsigned int prevCellIndex = cellIndices[idx - 1];
+            if (prevCellIndex != cellIndex) {
+                cellStart[cellIndex] = idx;
+                cellEnd[prevCellIndex] = idx;
+            }
+        }
+
+        if (idx == numParticles - 1) {
+            cellEnd[cellIndex] = numParticles;
+        }
+    }
+}
+
+void neigbourHoodSearch(
+        int nParticles, 
+        float3* d_pos,
+        float3* d_vel,
+        float3* d_posSorted,
+        float3* d_velSorted,
+        unsigned int* d_cellIndc,
+        unsigned int* d_parIndc,
+        unsigned int* d_cEnd,
+        unsigned int* d_cStart,
+        const int bpg,
+        const int tpb,
+        int &gridSizeX,
+        int &gridSizeY,
+        int &gridSizeZ) 
+{
+
+    // Compute grid size based on smoothing radius
+    float3 gridMin = make_float3(h_BOUND_X_MIN, h_BOUND_Y_MIN, h_BOUND_Z_MIN);
+    float3 gridMax = make_float3(h_BOUND_X_MAX, h_BOUND_Y_MAX, h_BOUND_Z_MAX);
+    float3 gridCellSize = make_float3(h_H, h_H, h_H);
+
+    gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+    gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+    gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+    
+    // Compute cell indices for current positions
+    computeCellIndices<<<bpg, tpb>>>(
+        d_pos, d_cellIndc, nParticles, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    // Wrap device pointers with thrust iterators
+    thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndc);
+    thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_parIndc);
+
+    // Initialize particle indices
+    thrust::sequence(dev_particleIndices_ptr, dev_particleIndices_ptr + nParticles);
+
+    // Sort particle indices based on cell indices
+    thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + nParticles, dev_particleIndices_ptr);
+
+    // Reorder positions and velocities
+    reorderData<<<bpg, tpb>>>(
+        d_parIndc,
+        d_pos,
+        d_vel,
+        d_posSorted,
+        d_velSorted,
+        nParticles);
+    cudaDeviceSynchronize();
+
+    // Copy sorted data back to original arrays
+    cudaMemcpy(d_pos, d_posSorted, nParticles * sizeof(float3), cudaMemcpyDeviceToDevice);
+    cudaMemcpy(d_vel, d_velSorted, nParticles * sizeof(float3), cudaMemcpyDeviceToDevice);
+
+    // Build cell start and end indices
+    cudaMemset(d_cStart, 0xFF, maxCells * sizeof(unsigned int));
+    cudaMemset(d_cEnd, 0xFF, maxCells * sizeof(unsigned int));
+
+    findCellStartEnd<<<bpg, tpb>>>(
+        d_cellIndc,
+        d_cStart,
+        d_cEnd,
+        nParticles);
+    cudaDeviceSynchronize();
+}
+
+} // namespace FluidSimulationGPU