diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index 09752c8cd1a5ef031a92f3ae7c35285450115cc4..df4cc16814210cfd875a510b20fca2f77c6b096b 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -67,7 +67,7 @@ namespace FluidSimulation {
         // Initialize fluid particles
         std::vector<float3> h_positions(m_numParticles);
         std::vector<float3> h_velocities(m_numParticles);
-        float gridSpacing = 0.5 * m_smoothingRadius;
+        float gridSpacing = 2.f * h_particleRadius;
         initParticles(h_positions, h_velocities, m_numParticles, gridSpacing, m_mode);
 
         // Initialize particle device arrays
@@ -84,7 +84,7 @@ namespace FluidSimulation {
         std::vector<float3> boundary_positions;
         std::vector<float3> boundary_normals;
         int boundaryLayers = 1;
-        float spacing = 0.5 * m_smoothingRadius; // Using 0.01 as per your code
+        float spacing = 2.f * h_particleRadius; // Using 0.01 as per your code
 
         createBoundaryParticles(boundary_positions, boundary_normals, boundaryLayers, spacing);
         m_numBoundaryParticles = boundary_positions.size();
@@ -101,6 +101,8 @@ namespace FluidSimulation {
 
         // Now that we have boundary particles set up and registered, run the boundary neighborhood search
         FluidSimulationGPU::boundaryNBSearch(m_numBoundaryParticles);
+    
+        FluidSimulationGPU::calculatePCISPHSSclaingFactor();
     }
 
     void FluidSimulation::updateSimulation() {
diff --git a/src/FluidSimulationApp.cpp b/src/FluidSimulationApp.cpp
index db4d400f8ba78b67cb6c53b5f389faeba8a5171d..45773eb3fbe30350107bf3bcc45003cf81805f44 100644
--- a/src/FluidSimulationApp.cpp
+++ b/src/FluidSimulationApp.cpp
@@ -245,28 +245,23 @@ namespace FluidSimulation {
                     break;
 
                 case RenderStage::ThicknessBuffer:
-                    printf("Render Thickness \n");
-                    printf("Thickness stage active \n");
                     m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
                     m_fluidRenderer->visualizeBuffer(RenderStage::ThicknessBuffer);
                     break;
                 
                 case RenderStage::FilteredThickness:
-                    printf("Render Filtered Thickness \n");
                     m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
                     m_fluidRenderer->renderThicknessFilterFrame();
                     m_fluidRenderer->visualizeBuffer(RenderStage::FilteredThickness);
                     break;
 
                 case RenderStage::LightAttenuation:
-                    printf("Render Filtered Thickness \n");
                     m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
                     m_fluidRenderer->renderThicknessFilterFrame();
                     m_fluidRenderer->renderThicknessAttenFrame();
                     break;
                 
                 case RenderStage::ScreenSpacedRender:
-                    printf("Render Screen Spaced \n");
                     m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
                     m_fluidRenderer->renderDepthFilterFrame();
                     m_fluidRenderer->renderDepthNormalsFrame();
diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h
index 0d73f817fa9ed37534e495605330b9142675c07c..1742015a04bf5d13eedb40ae62bb8d9ca264b191 100644
--- a/src/cuda/include/cuda_device_vars.h
+++ b/src/cuda/include/cuda_device_vars.h
@@ -15,6 +15,8 @@ extern __constant__ float PI;
 extern __constant__ float H;
 extern __constant__ float H2;
 extern __constant__ float POLY6_CONST;
+extern __constant__ float POLY6_GRAD_CONST;
+
 extern __constant__ float SPIKY_GRAD_CONST;
 extern __constant__ float VISC_LAP_CONST;
 
@@ -54,6 +56,7 @@ extern float3* d_force_other;
 extern float3* d_velocities_pred;
 extern float3* d_positions_pred;
 extern float* d_sum_gradW2; // For computing A
+extern __constant__ float d_PCISPH_density_scaling_factor;
 
 // Boundary particles
 extern float3* d_boundary_positions;
diff --git a/src/cuda/include/float3_helper_math.h b/src/cuda/include/float3_helper_math.h
index 521a6950730af7a8ea683713e485e572aead864a..c64b489b510f798c0df02de26001b07c14467bde 100644
--- a/src/cuda/include/float3_helper_math.h
+++ b/src/cuda/include/float3_helper_math.h
@@ -229,5 +229,23 @@ inline __host__ __device__ float3 cross(float3 a, float3 b)
     return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
 }
 
+inline __host__ __device__ float3 clamp(const float3 v, const float minVal, const float maxVal)
+{
+    return make_float3(
+        fminf(fmaxf(v.x, minVal), maxVal),
+        fminf(fmaxf(v.y, minVal), maxVal),
+        fminf(fmaxf(v.z, minVal), maxVal)
+    );
+}
+
+inline __host__ __device__ float3 clamp_length(const float3 v, const float maxLength)
+{
+    float len = length(v);
+    if (len > maxLength)
+    {
+        return v * (maxLength / len);
+    }
+    return v;
+}
 
 #endif
\ No newline at end of file
diff --git a/src/cuda/include/helper_macros.h b/src/cuda/include/helper_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c18a79914af30754f46615d5e517a80f8a41740
--- /dev/null
+++ b/src/cuda/include/helper_macros.h
@@ -0,0 +1,34 @@
+// helper_macros.h
+
+#ifndef HELPER_MACROS_H
+#define HELPER_MACROS_H
+
+#define FOREACH_NEIGHBOR_CELL_BEGIN(cellIndex, gridSizeX, gridSizeY, gridSizeZ, neighborCellIdx) \
+    do { \
+        /* Compute cell coordinates */ \
+        int cellX = cellIndex % gridSizeX; \
+        int cellY = (cellIndex / gridSizeX) % gridSizeY; \
+        int cellZ = cellIndex / (gridSizeX * gridSizeY); \
+        \
+        /* Iterate over neighboring cells */ \
+        for (int dz = -1; dz <= 1; ++dz) { \
+            int nz = cellZ + dz; \
+            if (nz < 0 || nz >= gridSizeZ) continue; \
+            for (int dy = -1; dy <= 1; ++dy) { \
+                int ny = cellY + dy; \
+                if (ny < 0 || ny >= gridSizeY) continue; \
+                for (int dx = -1; dx <= 1; ++dx) { \
+                    int nx = cellX + dx; \
+                    if (nx < 0 || nx >= gridSizeX) continue; \
+                    \
+                    /* Compute neighbor cell index */ \
+                    neighborCellIdx = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; \
+                    /* User-defined code starts here */
+
+#define FOREACH_NEIGHBOR_CELL_END() \
+                } \
+            } \
+        } \
+    } while(0)
+
+#endif // HELPER_MACROS_H
diff --git a/src/cuda/include/init_cuda.h b/src/cuda/include/init_cuda.h
index a55477664ee353afc6a5198e76a61149978297f0..954f2e82a05ddaad1083f83f3d67a5ffd7b91b2e 100644
--- a/src/cuda/include/init_cuda.h
+++ b/src/cuda/include/init_cuda.h
@@ -37,6 +37,8 @@ void updateConstantsOnDevice(
     float smoothingRadius
 );
 
+void calculatePCISPHSSclaingFactor();
+
 } // namespace FluidSimulationGPU
 
 #endif // INIT_CUDA_H
diff --git a/src/cuda/include/smoothing_kernels.h b/src/cuda/include/smoothing_kernels.h
index ed48092705daed120ae6da26414c521deea73fce..76e22fc9729a1c9cbc3dd1343e573ce5dd224f89 100644
--- a/src/cuda/include/smoothing_kernels.h
+++ b/src/cuda/include/smoothing_kernels.h
@@ -6,7 +6,11 @@
 namespace FluidSimulationGPU {
 
 __device__ float poly6Kernel(float r2);
+__device__ float poly6GradKernel(float r2);
+
+__device__ float spikyKernel(float r);
 __device__ float spikyKernelGrad(float r);
+
 __device__ float viscosityKernelLap(float r);
 
 __device__ float cubicSplineKernel(float r);
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
index bc682835476d3ab1044bc9f78cc2fe77b8cc3b17..c471ff46dd75619e950406d502aea18fa22b61f4 100644
--- a/src/cuda/init_cuda.cu
+++ b/src/cuda/init_cuda.cu
@@ -4,220 +4,225 @@
 #include "../settings/constants.h"
 
 #include "include/pcisph_kernels.h"
+#include "include/smoothing_kernels.h"
 
 #include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
 #include <device_launch_parameters.h>
 #include <cstdio>
 
-namespace FluidSimulationGPU {
-
-// Global Variables
-cudaGraphicsResource_t cuda_vbo_resource;
-cudaGraphicsResource_t cuda_vbo_boundary_resource;
-
-// Constants
-__constant__ float PI;
-__constant__ float H;
-__constant__ float H2;
-__constant__ float POLY6_CONST;
-__constant__ float SPIKY_GRAD_CONST;
-__constant__ float VISC_LAP_CONST;
-
-__constant__ float PARTICLE_MASS;
-__constant__ float REST_DENSITY;
-__constant__ float VISCOSITY;
+namespace FluidSimulationGPU
+{
 
-// Boundary constants
-__constant__ float BOUND_X_MIN;
-__constant__ float BOUND_X_MAX;
-__constant__ float BOUND_Y_MIN;
-__constant__ float BOUND_Y_MAX;
-__constant__ float BOUND_Z_MIN;
-__constant__ float BOUND_Z_MAX;
+    // Global Variables
+    cudaGraphicsResource_t cuda_vbo_resource;
+    cudaGraphicsResource_t cuda_vbo_boundary_resource;
+
+    // Constants
+    __constant__ float PI;
+    __constant__ float H;
+    __constant__ float H2;
+    __constant__ float POLY6_CONST;
+    __constant__ float POLY6_GRAD_CONST;
+    __constant__ float SPIKY_GRAD_CONST;
+    __constant__ float VISC_LAP_CONST;
+
+    __constant__ float PARTICLE_MASS;
+    __constant__ float REST_DENSITY;
+    __constant__ float VISCOSITY;
+
+    // Boundary constants
+    __constant__ float BOUND_X_MIN;
+    __constant__ float BOUND_X_MAX;
+    __constant__ float BOUND_Y_MIN;
+    __constant__ float BOUND_Y_MAX;
+    __constant__ float BOUND_Z_MIN;
+    __constant__ float BOUND_Z_MAX;
+
+    // Device Arrays
+    float3 *d_positions = nullptr;
+    float3 *d_velocities = nullptr;
+
+    unsigned int *d_cellIndices = nullptr;
+    unsigned int *d_particleIndices = nullptr;
+
+    float3 *d_positions_sorted = nullptr;
+    float3 *d_velocities_sorted = nullptr;
+
+    unsigned int *d_cellStart = nullptr;
+    unsigned int *d_cellEnd = nullptr;
+
+    // Arrays for densities, pressures, accelerations
+    float *d_densities = nullptr;
+    float *d_density_errors = nullptr;
+    float *d_particle_pressure = nullptr;
+    float3 *d_pressure_force = nullptr;
+    float3 *d_force_other = nullptr;
+
+    // Additional arrays for PCISPH
+    float3 *d_velocities_pred = nullptr;
+    float3 *d_positions_pred = nullptr;
+    float *d_sum_gradW2 = nullptr;
+
+    float h_PCISPH_density_scaling_factor = 0.0f;
+    __constant__ float d_PCISPH_density_scaling_factor;
+
+    // Boundary particles
+    float3 *d_boundary_positions = nullptr;
+    float3 *d_boundary_normals = nullptr;
+
+    unsigned int *d_boundaryCellStart = nullptr;
+    unsigned int *d_boundaryCellEnd = nullptr;
+
+    unsigned int *d_boundaryCellIndices = nullptr;
+    unsigned int *d_boundaryParticleIndices = nullptr;
+
+    void initializeConstants()
+    {
+        // Host constants
+        const float h_H2 = h_H * h_H;
+        const float h_POLY6_CONST = 315.0f / (64.0f * M_PI * powf(h_H, 9));
+        const float h_POLY6_GRAD_CONST = -945.0f / (32.0f * M_PI * pow(h_H, 9.0f)); // Check that this andn othere constants are correct!!!
+        const float h_SPIKY_GRAD_CONST = -45.0f / (M_PI * powf(h_H, 6));
+        const float h_VISC_LAP_CONST = 45.0f / (M_PI * powf(h_H, 6));
+
+        // Copy constants to device
+        cudaMemcpyToSymbol(PI, &h_PI, sizeof(float));
+        cudaMemcpyToSymbol(H, &h_H, sizeof(float));
+        cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
+        cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
+        cudaMemcpyToSymbol(POLY6_GRAD_CONST, &h_POLY6_GRAD_CONST, sizeof(float));
+        cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
+        cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
+
+        cudaMemcpyToSymbol(PARTICLE_MASS, &h_PARTICLE_MASS, sizeof(float));
+        cudaMemcpyToSymbol(REST_DENSITY, &h_REST_DENSITY, sizeof(float));
+        cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float));
+
+        // Copy boundary constants to device
+        cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float));
+        cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float));
+        cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float));
+        cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float));
+        cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float));
+        cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float));
+    }
 
-// Device Arrays
-float3* d_positions = nullptr;
-float3* d_velocities = nullptr;
+    // Register VBO with CUDA
+    void registerVBOWithCUDA(unsigned int vbo)
+    {
+        cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
+    }
 
-unsigned int* d_cellIndices = nullptr;
-unsigned int* d_particleIndices = nullptr;
+    void registerBoundaryVBOWithCUDA(unsigned int vbo)
+    {
+        cudaGraphicsGLRegisterBuffer(&cuda_vbo_boundary_resource, vbo, cudaGraphicsMapFlagsNone);
+    }
 
-float3* d_positions_sorted = nullptr;
-float3* d_velocities_sorted = nullptr;
+    // Unregister VBO from CUDA
+    void unregisterVBOFromCUDA()
+    {
+        cudaGraphicsUnregisterResource(cuda_vbo_resource);
+    }
 
-unsigned int* d_cellStart = nullptr;
-unsigned int* d_cellEnd = nullptr;
+    void unregisterBoundaryVBOFromCUDA()
+    {
+        cudaGraphicsUnregisterResource(cuda_vbo_boundary_resource);
+    }
 
-// Arrays for densities, pressures, accelerations
-float* d_densities = nullptr;
-float* d_density_errors = nullptr;
-float* d_particle_pressure = nullptr;
-float3* d_pressure_force = nullptr;
-float3* d_force_other = nullptr;
+    // Initialize device arrays
+    void initializeFluidDeviceArrays(
+        const float3 *h_positions,
+        const float3 *h_velocities,
+        int numParticles)
+    {
+        // Allocate device memory for velocities
+        cudaMalloc(&d_velocities, numParticles * sizeof(float3));
+        cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+        // Allocate device memory for predicted velocities and positions
+        cudaMalloc(&d_velocities_pred, numParticles * sizeof(float3));
+        cudaMalloc(&d_positions_pred, numParticles * sizeof(float3));
+
+        // Allocate device memory for cell indices and particle indices
+        cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int));
+        cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int));
+
+        // Allocate device memory for sorted positions and velocities
+        cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3));
+        cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3));
+
+        // Allocate cell start and end arrays
+        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+        int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+        cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int));
+        cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int));
+
+        // Allocate device memory for densities, pressure increments, and accelerations
+        cudaMalloc(&d_densities, numParticles * sizeof(float));
+        cudaMalloc(&d_density_errors, numParticles * sizeof(float));
+        cudaMalloc(&d_particle_pressure, numParticles * sizeof(float));
+        cudaMalloc(&d_pressure_force, numParticles * sizeof(float3));
+        cudaMalloc(&d_force_other, numParticles * sizeof(float3));
+
+        // Allocate device memory for sum_gradW2
+        cudaMalloc(&d_sum_gradW2, numParticles * sizeof(float));
+
+        // Initialize positions_pred to initial positions
+        cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+    }
 
-// Additional arrays for PCISPH
-float3* d_velocities_pred = nullptr;
-float3* d_positions_pred = nullptr;
-float* d_sum_gradW2 = nullptr;
+    void initializeBoundryDeviceArrays(
+        const float3 *h_boundaryPositions,
+        const float3 *h_boundaryNormals, // Renamed parameter
+        int numBoundaryParticles)
+    {
+        // Allocate cell start and end arrays
+        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
+        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
+        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+        int maxCells = gridSizeX * gridSizeY * gridSizeZ;
+
+        // Allocate memory for boundary positions
+        // cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3));
+        // cudaMemcpy(d_boundary_positions, h_boundaryPositions, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+        // Allocate memory for boundary normals
+        cudaMalloc(&d_boundary_normals, numBoundaryParticles * sizeof(float3));
+        cudaMemcpy(d_boundary_normals, h_boundaryNormals, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+        cudaMalloc(&d_boundaryCellIndices, numBoundaryParticles * sizeof(unsigned int));
+        cudaMalloc(&d_boundaryParticleIndices, numBoundaryParticles * sizeof(unsigned int));
+
+        cudaMalloc(&d_boundaryCellStart, maxCells * sizeof(unsigned int));
+        cudaMalloc(&d_boundaryCellEnd, maxCells * sizeof(unsigned int));
+    }
 
-// Boundary particles
-float3* d_boundary_positions = nullptr;
-float3* d_boundary_normals = nullptr;
+    // NeighbourHoodSearch for boundary particles 1 time at start since they dont move
+    void boundaryNBSearch(int numBoundaryParticles)
+    {
 
-unsigned int* d_boundaryCellStart = nullptr;
-unsigned int* d_boundaryCellEnd = nullptr;
+        // Map the VBO resource to get a device pointer
+        size_t num_bytes;
+        cudaGraphicsMapResources(1, &cuda_vbo_boundary_resource, 0);
+        cudaGraphicsResourceGetMappedPointer((void **)&d_boundary_positions, &num_bytes, cuda_vbo_boundary_resource);
 
-unsigned int* d_boundaryCellIndices = nullptr;
-unsigned int* d_boundaryParticleIndices = nullptr;    
+        int gsX, gsY, gsZ;
 
-void initializeConstants()
-{
-    // Host constants
-    const float h_H2 = h_H * h_H;
-    const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9));
-    const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6));
-    const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6));
-
-    // Copy constants to device
-    cudaMemcpyToSymbol(PI, &h_PI, sizeof(float));
-    cudaMemcpyToSymbol(H, &h_H, sizeof(float));
-    cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
-    cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
-    cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
-    cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
-
-    cudaMemcpyToSymbol(PARTICLE_MASS, &h_PARTICLE_MASS, sizeof(float));
-    cudaMemcpyToSymbol(REST_DENSITY, &h_REST_DENSITY, sizeof(float));
-    cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float));
-
-    // Copy boundary constants to device
-    cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float));
-    cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float));
-    cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float));
-    cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float));
-    cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float));
-    cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float));
-}
-
-// Register VBO with CUDA
-void registerVBOWithCUDA(unsigned int vbo)
-{
-    cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone);
-}
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (numBoundaryParticles + threadsPerBlock - 1) / threadsPerBlock;
 
-void registerBoundaryVBOWithCUDA(unsigned int vbo)
-{
-    cudaGraphicsGLRegisterBuffer(&cuda_vbo_boundary_resource, vbo, cudaGraphicsMapFlagsNone);
-}
+        float3 *d_boundary_positions_sorted;
+        float3 *d_boundary_normals_sorted;
 
-// Unregister VBO from CUDA
-void unregisterVBOFromCUDA()
-{
-    cudaGraphicsUnregisterResource(cuda_vbo_resource);
-}
+        cudaMalloc(&d_boundary_positions_sorted, numBoundaryParticles * sizeof(float3));
+        cudaMalloc(&d_boundary_normals_sorted, numBoundaryParticles * sizeof(float3));
 
-void unregisterBoundaryVBOFromCUDA()
-{
-    cudaGraphicsUnregisterResource(cuda_vbo_boundary_resource);
-}
-
-// Initialize device arrays
-void initializeFluidDeviceArrays(
-    const float3* h_positions,
-    const float3* h_velocities,
-    int numParticles
-)
-{
-    // Allocate device memory for velocities
-    cudaMalloc(&d_velocities, numParticles * sizeof(float3));
-    cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-
-    // Allocate device memory for predicted velocities and positions
-    cudaMalloc(&d_velocities_pred, numParticles * sizeof(float3));
-    cudaMalloc(&d_positions_pred, numParticles * sizeof(float3));
-
-    // Allocate device memory for cell indices and particle indices
-    cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int));
-    cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int));
-
-    // Allocate device memory for sorted positions and velocities
-    cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3));
-    cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3));
-
-    // Allocate cell start and end arrays
-    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
-    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
-    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
-    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
-
-    cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int));
-    cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int));
-
-    // Allocate device memory for densities, pressure increments, and accelerations
-    cudaMalloc(&d_densities, numParticles * sizeof(float));
-    cudaMalloc(&d_density_errors, numParticles * sizeof(float));
-    cudaMalloc(&d_particle_pressure, numParticles * sizeof(float));
-    cudaMalloc(&d_pressure_force, numParticles * sizeof(float3));
-    cudaMalloc(&d_force_other, numParticles * sizeof(float3));
-
-    // Allocate device memory for sum_gradW2
-    cudaMalloc(&d_sum_gradW2, numParticles * sizeof(float));
-
-    // Initialize positions_pred to initial positions
-    cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-}
-
-void initializeBoundryDeviceArrays(
-    const float3* h_boundaryPositions,
-    const float3* h_boundaryNormals, // Renamed parameter
-    int numBoundaryParticles
-)
-{
-    // Allocate cell start and end arrays
-    int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
-    int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
-    int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
-    int maxCells = gridSizeX * gridSizeY * gridSizeZ;
-
-    // Allocate memory for boundary positions
-    //cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3));
-    //cudaMemcpy(d_boundary_positions, h_boundaryPositions, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
-
-    // Allocate memory for boundary normals
-    cudaMalloc(&d_boundary_normals, numBoundaryParticles * sizeof(float3));
-    cudaMemcpy(d_boundary_normals, h_boundaryNormals, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
-
-    cudaMalloc(&d_boundaryCellIndices, numBoundaryParticles * sizeof(unsigned int));
-    cudaMalloc(&d_boundaryParticleIndices, numBoundaryParticles * sizeof(unsigned int));
-    
-    cudaMalloc(&d_boundaryCellStart, maxCells * sizeof(unsigned int));
-    cudaMalloc(&d_boundaryCellEnd, maxCells * sizeof(unsigned int));
-}
-
-
-// NeighbourHoodSearch for boundary particles 1 time at start since they dont move
-void boundaryNBSearch(int numBoundaryParticles) {
-    
-    // Map the VBO resource to get a device pointer
-    size_t num_bytes;
-    cudaGraphicsMapResources(1, &cuda_vbo_boundary_resource, 0);
-    cudaGraphicsResourceGetMappedPointer((void**)&d_boundary_positions, &num_bytes, cuda_vbo_boundary_resource);
-    
-    int gsX, gsY, gsZ;
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (numBoundaryParticles + threadsPerBlock - 1) / threadsPerBlock;
-    
-    float3* d_boundary_positions_sorted;
-    float3* d_boundary_normals_sorted;
-    
-
-    cudaMalloc(&d_boundary_positions_sorted, numBoundaryParticles * sizeof(float3));
-    cudaMalloc(&d_boundary_normals_sorted, numBoundaryParticles * sizeof(float3));
-    
-    neigbourHoodSearch(
-            numBoundaryParticles, 
+        neigbourHoodSearch(
+            numBoundaryParticles,
             d_boundary_positions,
             d_boundary_normals,
             d_boundary_positions_sorted,
@@ -230,112 +235,235 @@ void boundaryNBSearch(int numBoundaryParticles) {
             threadsPerBlock,
             gsX,
             gsY,
-            gsZ); 
-    
-    // Unmap the VBO resource
-    cudaGraphicsUnmapResources(1, &cuda_vbo_boundary_resource, 0);
-    
-    cudaFree(d_boundary_positions_sorted);
-    cudaFree(d_boundary_normals_sorted);
-    
-    // Move some of above init code for boundary to seeparate function
-}
-
-void resetSimulation(
-    const float3* h_positions,
-    const float3* h_velocities,
-    int numParticles
-)
-{
-    cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-    
-    // Initialize positions_pred to initial positions
-    cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-
-    // Map the VBO resource
-    cudaError_t err = cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to map VBO resource: %s\n", cudaGetErrorString(err));
-        return;
+            gsZ);
+
+        // Unmap the VBO resource
+        cudaGraphicsUnmapResources(1, &cuda_vbo_boundary_resource, 0);
+
+        cudaFree(d_boundary_positions_sorted);
+        cudaFree(d_boundary_normals_sorted);
+
+        // Move some of above init code for boundary to seeparate function
     }
 
-    // Check if d_positions is already assigned
-    if (d_positions == nullptr) {
-        size_t num_bytes = 0;
+    void calculatePCISPHSSclaingFactor()
+    {
+
+        // We should only have:
+        // delta_t, rest_density, particle_radius, viscosity
+        // As fluid params we set. The rest we should deduce from them like:
+
+        // params.kernel_radius = 4.f * params.particle_radius;
+        // params.kernel_radius2 = std::pow(params.kernel_radius, 2.f);
+        // params.particle_mass = params.rest_density / std::pow(1.f / (2.f * params.particle_radius), 3);
+
+        // // -> grid (bucket count needs to be a multiple of 64 to make the hash keys locally unique
+        // params.bucket_count = params.fluid_count / 2;
+        // params.bucket_count -= params.bucket_count % 64;
+        // params.bucket_count = std::max(64U, params.bucket_count);
+
+        // params.cell_size = params.kernel_radius;
+
+        // // -> smoothing kernels
+        // params.poly6_normalization = 315.f / (64.f * utils::PI * std::pow(params.kernel_radius, 9.f));
+        // params.poly6_d1_normalization = -945.f / (32.f * utils::PI * std::pow(params.kernel_radius, 9.f));
+        // params.viscosity_d2_normalization = 45.f / (utils::PI * std::pow(params.kernel_radius, 6.0f));
+        // params.spiky_d1_normalization = -45.f / (utils::PI * std::pow(params.kernel_radius, 6.0f));
+        // params.surface_tension_normalization = 32.f / (utils::PI * std::pow(params.kernel_radius, 9.f));
+
+        float kernel_radius = 4.0f * h_particleRadius;
+        float kernel_radius2 = pow(kernel_radius, 2.0f); // check taht pow() func is correct!!
+        float particle_mass = h_REST_DENSITY / pow(1.0f / (2.0f * h_particleRadius), 3.0f) / 1.15f;
+
+        // MAke sure constannts are correct!!!
+        float poly6__const = 315.f / (64.f * M_PI * pow(kernel_radius, 9.f));
+        float poly6_grad_const = -945.f / (32.f * M_PI * pow(kernel_radius, 9.f));
+        float viscosity_d2_normalization = 45.f / (M_PI * pow(kernel_radius, 6.0f));
+        float spiky_d1_normalization = -45.f / (M_PI * pow(kernel_radius, 6.0f));
+        float surface_tension_normalization = 32.f / (M_PI * pow(kernel_radius, 9.f));
+
+        cudaMemcpyToSymbol(H, &kernel_radius, sizeof(float));
+        cudaMemcpyToSymbol(H2, &kernel_radius2, sizeof(float));
+        cudaMemcpyToSymbol(POLY6_CONST, &poly6__const, sizeof(float));
+        cudaMemcpyToSymbol(POLY6_GRAD_CONST, &poly6_grad_const, sizeof(float));
+        cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &spiky_d1_normalization, sizeof(float));
+        cudaMemcpyToSymbol(VISC_LAP_CONST, &viscosity_d2_normalization, sizeof(float));
+
+        cudaMemcpyToSymbol(PARTICLE_MASS, &particle_mass, sizeof(float));
+
+        // 1) Beta
+        float timeStep = 0.01f;
+        float dtm_rho = (timeStep * particle_mass) / h_REST_DENSITY;
+        float beta = 2.0f * dtm_rho * dtm_rho;
+
+        double sumDot = 0.0;
+        double sum_val[3] = {0.0, 0.0, 0.0};
+        double sum_spiky[3] = {0.0, 0.0, 0.0};
+
+        float step = 2.0f * h_particleRadius;
+        float R = kernel_radius + h_particleRadius;
+
+        // 2) Loop over sample points
+        for (float z = -R; z <= R; z += step)
+        {
+            for (float y = -R; y <= R; y += step)
+            {
+                for (float x = -R; x <= R; x += step)
+                {
+                    float r2 = x * x + y * y + z * z;
+                    float r = sqrtf(r2);
+
+                    if (r < h_H && r > 1e-9f)
+                    {
+                        float spikyMag = spiky_d1_normalization * pow(kernel_radius - r, 2.f) * (1.0f / r); // should it not match spky grad kernel?? r/r_norm??
+                        float3 spiky_grad = {
+                            spikyMag * x,
+                            spikyMag * y,
+                            spikyMag * z};
+
+                        float poly6Mag = poly6_grad_const * pow(kernel_radius2 - r2, 2.f); // Check ttaht kernel is correect!!!
+                        float3 poly6_grad = {
+                            poly6Mag * x,
+                            poly6Mag * y,
+                            poly6Mag * z};
+
+                        sum_spiky[0] += spiky_grad.x;
+                        sum_spiky[1] += spiky_grad.y;
+                        sum_spiky[2] += spiky_grad.z;
+
+                        sum_val[0] += poly6_grad.x;
+                        sum_val[1] += poly6_grad.y;
+                        sum_val[2] += poly6_grad.z;
+
+                        double dotSV = (double)spiky_grad.x * poly6_grad.x + (double)spiky_grad.y * poly6_grad.y + (double)spiky_grad.z * poly6_grad.z;
+                        sumDot += dotSV;
+                    }
+                }
+            }
+        }
 
-        // Get the mapped pointer for positions
-        err = cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource);
-        if (err != cudaSuccess) {
-            fprintf(stderr, "Failed to get mapped pointer: %s\n", cudaGetErrorString(err));
-            cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
-            return;
+        double sum_spiky_dot_val = sum_spiky[0] * sum_val[0] + sum_spiky[1] * sum_val[1] + sum_spiky[2] * sum_val[2];
+
+        double bigTerm = -sum_spiky_dot_val - sumDot;
+        if (fabs(bigTerm) < 1e-12)
+        {
+            bigTerm = 1e-12; // Avoid zero
         }
-    }
 
-    // Copy new positions to the mapped pointer
-    err = cudaMemcpy(d_positions, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to copy new positions: %s\n", cudaGetErrorString(err));
+        double alpha = -1.0 / ((double)beta * bigTerm);
+
+        // Store final
+        h_PCISPH_density_scaling_factor = (float)alpha;
+
+        printf("PCISPH density scaling factor: alpha = %g\n",
+               h_PCISPH_density_scaling_factor);
+
+        // Optionally copy to device:
+        cudaMemcpyToSymbol(d_PCISPH_density_scaling_factor,
+                           &h_PCISPH_density_scaling_factor,
+                           sizeof(float),
+                           0,
+                           cudaMemcpyHostToDevice);
     }
 
-    // Unmap the VBO resource
-    err = cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to unmap VBO resource: %s\n", cudaGetErrorString(err));
+    void resetSimulation(
+        const float3 *h_positions,
+        const float3 *h_velocities,
+        int numParticles)
+    {
+        cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+        // Initialize positions_pred to initial positions
+        cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+        // Map the VBO resource
+        cudaError_t err = cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
+        if (err != cudaSuccess)
+        {
+            fprintf(stderr, "Failed to map VBO resource: %s\n", cudaGetErrorString(err));
+            return;
+        }
+
+        // Check if d_positions is already assigned
+        if (d_positions == nullptr)
+        {
+            size_t num_bytes = 0;
+
+            // Get the mapped pointer for positions
+            err = cudaGraphicsResourceGetMappedPointer((void **)&d_positions, &num_bytes, cuda_vbo_resource);
+            if (err != cudaSuccess)
+            {
+                fprintf(stderr, "Failed to get mapped pointer: %s\n", cudaGetErrorString(err));
+                cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
+                return;
+            }
+        }
+
+        // Copy new positions to the mapped pointer
+        err = cudaMemcpy(d_positions, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
+        if (err != cudaSuccess)
+        {
+            fprintf(stderr, "Failed to copy new positions: %s\n", cudaGetErrorString(err));
+        }
+
+        // Unmap the VBO resource
+        err = cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
+        if (err != cudaSuccess)
+        {
+            fprintf(stderr, "Failed to unmap VBO resource: %s\n", cudaGetErrorString(err));
+        }
     }
-}
-
-void updateConstantsOnDevice(float particleMass, float restDensity, float viscosity, float smoothingRadius) {
-    // Recalculate derived constants
-    float h_H2 = smoothingRadius * smoothingRadius;
-    float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(smoothingRadius, 9));
-    float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(smoothingRadius, 6));
-    float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(smoothingRadius, 6));
-
-    // Send constants to CUDA device
-    cudaMemcpyToSymbol(PARTICLE_MASS, &particleMass, sizeof(float));
-    cudaMemcpyToSymbol(REST_DENSITY, &restDensity, sizeof(float));
-    cudaMemcpyToSymbol(VISCOSITY, &viscosity, sizeof(float));
-    cudaMemcpyToSymbol(H, &smoothingRadius, sizeof(float));
-    cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
-    cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
-    cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
-    cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
-}
-
-// Cleanup device arrays
-void cleanupDeviceArrays()
-{
-    cudaFree(d_velocities);
-    cudaFree(d_velocities_pred);
-    cudaFree(d_positions_pred);
 
-    cudaFree(d_cellIndices);
-    cudaFree(d_particleIndices);
+    void updateConstantsOnDevice(float particleMass, float restDensity, float viscosity, float smoothingRadius)
+    {
+        // Recalculate derived constants
+        float h_H2 = smoothingRadius * smoothingRadius;
+        float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(smoothingRadius, 9));
+        float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(smoothingRadius, 6));
+        float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(smoothingRadius, 6));
+
+        // Send constants to CUDA device
+        cudaMemcpyToSymbol(PARTICLE_MASS, &particleMass, sizeof(float));
+        cudaMemcpyToSymbol(REST_DENSITY, &restDensity, sizeof(float));
+        cudaMemcpyToSymbol(VISCOSITY, &viscosity, sizeof(float));
+        cudaMemcpyToSymbol(H, &smoothingRadius, sizeof(float));
+        cudaMemcpyToSymbol(H2, &h_H2, sizeof(float));
+        cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float));
+        cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
+        cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
+    }
 
-    cudaFree(d_positions_sorted);
-    cudaFree(d_velocities_sorted);
+    // Cleanup device arrays
+    void cleanupDeviceArrays()
+    {
+        cudaFree(d_velocities);
+        cudaFree(d_velocities_pred);
+        cudaFree(d_positions_pred);
 
-    cudaFree(d_cellStart);
-    cudaFree(d_cellEnd);
+        cudaFree(d_cellIndices);
+        cudaFree(d_particleIndices);
 
-    cudaFree(d_densities);
-    cudaFree(d_density_errors);
-    cudaFree(d_particle_pressure);
-    cudaFree(d_pressure_force);
-    cudaFree(d_force_other);
+        cudaFree(d_positions_sorted);
+        cudaFree(d_velocities_sorted);
 
-    cudaFree(d_boundary_positions);
-    cudaFree(d_boundary_normals);
+        cudaFree(d_cellStart);
+        cudaFree(d_cellEnd);
 
-    cudaFree(d_boundaryCellIndices);
-    cudaFree(d_boundaryParticleIndices);
-    cudaFree(d_boundaryCellEnd);
-    cudaFree(d_boundaryCellStart);
+        cudaFree(d_densities);
+        cudaFree(d_density_errors);
+        cudaFree(d_particle_pressure);
+        cudaFree(d_pressure_force);
+        cudaFree(d_force_other);
 
-    cudaFree(d_sum_gradW2);
+        cudaFree(d_boundary_positions);
+        cudaFree(d_boundary_normals);
 
-}
+        cudaFree(d_boundaryCellIndices);
+        cudaFree(d_boundaryParticleIndices);
+        cudaFree(d_boundaryCellEnd);
+        cudaFree(d_boundaryCellStart);
 
+        cudaFree(d_sum_gradW2);
+    }
 
 } // namespace FluidSimulationGPU
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index 8ca0dec615a01118dba1d9c2976f281fa3900d24..4e1a5e278d077263f2fc9a19ea5a56aac3f5ce45 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -3,6 +3,7 @@
 #include "include/smoothing_kernels.h"
 
 #include "include/float3_helper_math.h"
+#include "include/helper_macros.h"
 
 #include "include/cuda_device_vars.h"
 #include "../settings/constants.h"
@@ -119,7 +120,7 @@ __global__ void computeInitialDensity(
         }
     }
 
-    densities[idx] = density_i + b_density_i;
+    densities[idx] = REST_DENSITY; //density_i + b_density_i;
 
     if (sum_gradW2 != nullptr) {
         sum_gradW2[idx] = sumGradW2_local;
@@ -208,10 +209,10 @@ __global__ void computeDensityError(
         }
     }
 
-    float density_variation = density_i + b_density_i - REST_DENSITY;
+    float density_variation = max(density_i + b_density_i - REST_DENSITY, 0.f);
     //densities[idx] = density;
     density_errors[idx] = density_variation;
-    particle_pressure[idx] += -density_variation * K_PCI;
+    particle_pressure[idx] += density_variation * d_PCISPH_density_scaling_factor;
 
 }
 
@@ -299,9 +300,9 @@ __global__ void computePressureForces(
                         if (distSq < H2 && distSq > 0.0001f) {
                             float dist = sqrtf(distSq);
                             float gradW = spikyKernelGrad(dist);
-                            float3 gradWij = gradW * (r / dist);
+                            float3 gradWij = gradW * (r / dist); // fix ^ to get r with sign
                             //float factor = - (P_i + P_j) / (REST_DENSITY * REST_DENSITY);
-                            float pfc =  pfc_i + pfc_j; 
+                            float pfc =  pfc_i + pfc_j;
                             fPressure += -pfc * gradWij * PARTICLE_MASS * PARTICLE_MASS;
                         }
                     }
@@ -327,7 +328,7 @@ __global__ void computePressureForces(
                         if (distSq < H2 && distSq > 0.0001f) {
                             float dist = sqrtf(distSq);
                             float gradW = spikyKernelGrad(dist);
-                            float3 gradWij = gradW * (r / dist);
+                            float3 gradWij = gradW * (r/dist);
                             // Using Equation (85): p_i and p_b = p_i; densities: ρ_i and ρ_0
                             //float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
                             float pfc =  pfc_i + pfc_j; 
@@ -341,7 +342,8 @@ __global__ void computePressureForces(
         }
     }
 
-    pressure_force[idx] = fPressure + b_fPressure;}
+    pressure_force[idx] = fPressure + b_fPressure;
+}
 
 __global__ void correctBoundaryPenetrations(
     float3* positions_pred,
@@ -607,11 +609,15 @@ __global__ void updateVelocitiesPositions(
 
     if (idx < numParticles) {
         // Update predicted velocity
+        float3 vel = velocities[idx];
+        vel = clamp(vel, -50, 50);
         float3 acc = (force_other[idx] + pressure_force[idx]) / PARTICLE_MASS;
         
-        velocities[idx] += acc * timeStep;
+        vel += acc * timeStep;
+        
+        velocities[idx] = vel;
         // Update predicted position
-        positions[idx] +=  acc * (timeStep * timeStep);
+        positions[idx] +=  vel * timeStep;
     }
 }
 
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index 9051e606c3504726edf215f6f77d6bff4621500c..5e9203f14ea26955ac709d9bce13cf1eb7dbd73e 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -9,120 +9,193 @@
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
 
-namespace FluidSimulationGPU {
-
-void updateParticles(float deltaTime)
+namespace FluidSimulationGPU
 {
-    // Map the VBO resource to get a device pointer
-    size_t num_bytes;
-    cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
-    cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource);
-
-    int threadsPerBlock = 256;
-    int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
-
-    // Time step should be small but we can run updateParticles multiple times during display
-    float timeStep = 0.0025f;
-
-    // -------------------- Neighbor Search --------------------
-    int gridSizeX, gridSizeY, gridSizeZ;
-    neigbourHoodSearch(
-        NUM_PARTICLES,
-        d_positions,
-        d_velocities,
-        d_positions_sorted,
-        d_velocities_sorted,
-        d_cellIndices,
-        d_particleIndices,
-        d_cellEnd,
-        d_cellStart,
-        blocksPerGrid,
-        threadsPerBlock,
-        gridSizeX,
-        gridSizeY,
-        gridSizeZ);
-
-    // -------------------- Prediction Step --------------------
-
-    computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>(
-        d_positions,
-        d_velocities,
-        d_force_other,
-        d_cellIndices,
-        d_cellStart,
-        d_cellEnd,
-        NUM_PARTICLES,
-        gridSizeX, gridSizeY, gridSizeZ);
-    cudaDeviceSynchronize();
-
-    // predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-    //     d_velocities,
-    //     d_velocities_pred,
-    //     d_positions,
-    //     d_positions_pred,
-    //     d_force_other,
-    //     timeStep,
-    //     NUM_PARTICLES);
-    // cudaDeviceSynchronize();
-
-    // Initialize pressure increments to zero
-    cudaMemset(d_pressure_force, 0, NUM_PARTICLES * sizeof(float));
-    cudaMemset(d_particle_pressure, 0, NUM_PARTICLES * sizeof(float));
-
-    // Compute densities and density errors, and per-particle sumGradW2
-    computeInitialDensity<<<blocksPerGrid, threadsPerBlock>>>(
-        d_positions,
-        d_densities,
-        d_density_errors,
-        d_sum_gradW2,
-        d_cellIndices,
-        d_cellStart,
-        d_cellEnd,
-        d_boundary_positions,
-        d_boundaryCellStart,
-        d_boundaryCellEnd,
-        NUM_PARTICLES,
-        gridSizeX, gridSizeY, gridSizeZ);
-    cudaDeviceSynchronize();
-
-    // Reduce sum_gradW2 over all particles
-    thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
-    float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
-
-    float r0 = 0.5f * h_H;          // Penetration threshold
-    float epsilon = 1.0f; // Moderate to high friction
-    float delta = 0.5f;   // Low to moderate elasticity
-    float normalDamping = 0.5f; // Adjust as needed
-
-
-    // Compute A
-    float m = h_PARTICLE_MASS;
-    float rho0 = h_REST_DENSITY;
-    float B = 2.0f * timeStep * timeStep * m * m / (rho0 * rho0);
-    float K_PCI =  -1 / (B * totalSumGradW2);
-
-    // ------------ Iterative Pressure Correction Loop --------------------
-
-    const int maxIterations = 5;
-
-    for (int iter = 0; iter < maxIterations; ++iter) {
-    // Launch the boundary correction kernel
-
-        predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+
+    void updateParticles(float deltaTime)
+    {
+        // Map the VBO resource to get a device pointer
+        size_t num_bytes;
+        cudaGraphicsMapResources(1, &cuda_vbo_resource, 0);
+        cudaGraphicsResourceGetMappedPointer((void **)&d_positions, &num_bytes, cuda_vbo_resource);
+
+        int threadsPerBlock = 256;
+        int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
+
+        // Time step should be small but we can run updateParticles multiple times during display
+        float timeStep = 0.01f;
+
+        // -------------------- Neighbor Search --------------------
+        int gridSizeX, gridSizeY, gridSizeZ;
+        neigbourHoodSearch(
+            NUM_PARTICLES,
+            d_positions,
+            d_velocities,
+            d_positions_sorted,
+            d_velocities_sorted,
+            d_cellIndices,
+            d_particleIndices,
+            d_cellEnd,
+            d_cellStart,
+            blocksPerGrid,
+            threadsPerBlock,
+            gridSizeX,
+            gridSizeY,
+            gridSizeZ);
+
+        // -------------------- Prediction Step --------------------
+
+        computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions,
             d_velocities,
-            d_velocities_pred,
+            d_force_other,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
+
+        // predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+        //     d_velocities,
+        //     d_velocities_pred,
+        //     d_positions,
+        //     d_positions_pred,
+        //     d_force_other,
+        //     timeStep,
+        //     NUM_PARTICLES);
+        // cudaDeviceSynchronize();
+
+        // Initialize pressure increments to zero
+        cudaMemset(d_pressure_force, 0, NUM_PARTICLES * sizeof(float));
+        cudaMemset(d_particle_pressure, 0, NUM_PARTICLES * sizeof(float));
+
+        // Compute densities and density errors, and per-particle sumGradW2
+        computeInitialDensity<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions,
+            d_densities,
+            d_density_errors,
+            d_sum_gradW2,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            d_boundary_positions,
+            d_boundaryCellStart,
+            d_boundaryCellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
+
+        // Reduce sum_gradW2 over all particles
+        thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
+        float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
+
+        float r0 = 0.5f * h_H;      // Penetration threshold
+        float epsilon = 1.0f;       // Moderate to high friction
+        float delta = 0.5f;         // Low to moderate elasticity
+        float normalDamping = 0.5f; // Adjust as needed
+
+        // Compute A
+        float m = h_PARTICLE_MASS;
+        float rho0 = h_REST_DENSITY;
+        float B = 2.0f * timeStep * timeStep * m * m / (rho0 * rho0);
+        float K_PCI = -1 / (B * totalSumGradW2);
+
+        // ------------ Iterative Pressure Correction Loop --------------------
+
+        const int maxIterations = 5;
+
+        for (int iter = 0; iter < maxIterations; ++iter)
+        {
+            // Launch the boundary correction kernel
+
+            predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+                d_velocities,
+                d_velocities_pred,
+                d_positions,
+                d_positions_pred,
+                d_pressure_force,
+                d_force_other,
+                timeStep,
+                NUM_PARTICLES);
+            cudaDeviceSynchronize();
+
+            correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
+                d_positions_pred,
+                d_velocities_pred,
+                d_boundary_positions,
+                d_boundary_normals,
+                d_cellIndices,
+                d_boundaryCellStart,
+                d_boundaryCellEnd,
+                NUM_PARTICLES,
+                gridSizeX, gridSizeY, gridSizeZ,
+                r0,
+                normalDamping,
+                epsilon,
+                delta);
+            cudaDeviceSynchronize();
+
+            // Compute desnity and update pressure
+            computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
+                d_positions_pred,
+                d_density_errors,
+                d_particle_pressure,
+                d_cellIndices,
+                d_cellStart,
+                d_cellEnd,
+                d_boundary_positions,
+                d_boundaryCellStart,
+                d_boundaryCellEnd,
+                NUM_PARTICLES,
+                K_PCI,
+                gridSizeX, gridSizeY, gridSizeZ);
+            cudaDeviceSynchronize();
+
+            // updatePressures<<<blocksPerGrid, threadsPerBlock>>>(
+            //     d_particle_pressure,
+            //     d_density_errors,
+            //     K_PCI,
+            //     NUM_PARTICLES);
+            // cudaDeviceSynchronize();
+
+            computePressureForces<<<blocksPerGrid, threadsPerBlock>>>(
+                d_positions_pred,
+                d_densities,
+                d_particle_pressure,
+                d_pressure_force,
+                d_cellIndices,
+                d_cellStart,
+                d_cellEnd,
+                d_boundary_positions,
+                d_boundaryCellStart,
+                d_boundaryCellEnd,
+                NUM_PARTICLES,
+                gridSizeX, gridSizeY, gridSizeZ);
+            cudaDeviceSynchronize();
+
+            // updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
+            //     d_velocities_pred,
+            //     d_positions_pred,
+            //     d_pressure_force,
+            //     timeStep,
+            //     NUM_PARTICLES);
+            // cudaDeviceSynchronize();
+        }
+
+        updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
             d_positions,
-            d_positions_pred,
+            d_velocities,
             d_pressure_force,
             d_force_other,
             timeStep,
             NUM_PARTICLES);
-        cudaDeviceSynchronize();
 
+        cudaDeviceSynchronize();
 
         correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions_pred,
-            d_velocities_pred,
+            d_positions,
+            d_velocities,
             d_boundary_positions,
             d_boundary_normals,
             d_cellIndices,
@@ -135,102 +208,26 @@ void updateParticles(float deltaTime)
             epsilon,
             delta
         );
-        cudaDeviceSynchronize();
 
-        // Compute desnity and update pressure
-        computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions_pred,
-            d_density_errors,
-            d_particle_pressure,
-            d_cellIndices,
-            d_cellStart,
-            d_cellEnd,
-            d_boundary_positions,
-            d_boundaryCellStart,
-            d_boundaryCellEnd,
-            NUM_PARTICLES,
-            K_PCI,
-            gridSizeX, gridSizeY, gridSizeZ);
         cudaDeviceSynchronize();
 
-        // updatePressures<<<blocksPerGrid, threadsPerBlock>>>(
-        //     d_particle_pressure,
-        //     d_density_errors,
-        //     K_PCI,
-        //     NUM_PARTICLES);
-        // cudaDeviceSynchronize();
+        // -------------------- Finalization --------------------
 
-        computePressureForces<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions_pred,
-            d_densities,
-            d_particle_pressure,
-            d_pressure_force,
-            d_cellIndices,
-            d_cellStart,
-            d_cellEnd,
-            d_boundary_positions,
-            d_boundaryCellStart,
-            d_boundaryCellEnd,
+        // cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+        // cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+
+        // Enforce boundary conditions
+
+        float damping = 0.75f;
+        enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions,
+            d_velocities,
             NUM_PARTICLES,
-            gridSizeX, gridSizeY, gridSizeZ);
+            damping);
         cudaDeviceSynchronize();
 
-        // updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-        //     d_velocities_pred,
-        //     d_positions_pred,
-        //     d_pressure_force,
-        //     timeStep,
-        //     NUM_PARTICLES);
-        // cudaDeviceSynchronize();
-        
+        // Unmap the VBO resource
+        cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
     }
 
-    updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-        d_positions,
-        d_velocities,
-        d_pressure_force,
-        d_force_other,
-        timeStep,
-        NUM_PARTICLES);
-    cudaDeviceSynchronize();
-    
-    correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
-        d_positions,
-        d_velocities,
-        d_boundary_positions,
-        d_boundary_normals,
-        d_cellIndices,
-        d_boundaryCellStart,
-        d_boundaryCellEnd,
-        NUM_PARTICLES,
-        gridSizeX, gridSizeY, gridSizeZ,
-        r0,
-        normalDamping,
-        epsilon,
-        delta
-    );
-    
-    cudaDeviceSynchronize();
-
-    // -------------------- Finalization --------------------
-
-    //cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
-    //cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
-
-    // Enforce boundary conditions
-    /*
-    float damping = 0.75f;
-    enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
-        d_positions,
-        d_velocities,
-        NUM_PARTICLES,
-        damping);
-    cudaDeviceSynchronize();
-    */
-    
-
-    // Unmap the VBO resource
-    cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
-}
-
 } // namespace FluidSimulationGPU
diff --git a/src/cuda/smoothing_kernels.cu b/src/cuda/smoothing_kernels.cu
index 4bbb8346cef1a3fb59accc7a296b0cb5ff9f8f97..45580770ea11763885e21ea5d10526d0bc733f14 100644
--- a/src/cuda/smoothing_kernels.cu
+++ b/src/cuda/smoothing_kernels.cu
@@ -14,6 +14,26 @@ __device__ float poly6Kernel(float r2)
     }
 }
 
+__device__ float poly6GradKernel(float r2)
+{
+    if (r2 < H2){
+        float term = H2 - r2;
+        return POLY6_GRAD_CONST * term * term; 
+    } else {
+        return 0.0f;
+    }
+}
+
+__device__ float spikyKernel(float r)
+{
+    if (r >= 0.0f && r <= H) {
+        float term = H - r;
+        return term * term * term;
+    } else {
+        return 0.0f;
+    }
+}
+
 __device__ float spikyKernelGrad(float r)
 {
     if (r > 0.0f && r <= H) {
diff --git a/src/include/FluidSimulationApp.h b/src/include/FluidSimulationApp.h
index 0211cbf52d1ab6a9ac385e83103359a4679bc041..e9614d36f3c6dc793dfda4f43a82c5728f1d446f 100644
--- a/src/include/FluidSimulationApp.h
+++ b/src/include/FluidSimulationApp.h
@@ -23,7 +23,7 @@ namespace FluidSimulation
         FluidSimulation* m_fluidSimulation;
          
         bool m_updatePhysics;
-        RenderStage selectedStage = RenderStage::ScreenSpacedRender;
+        RenderStage selectedStage = RenderStage::ParticleView;
     
         void RenderControlPannel();
         void RenderFluidView();
diff --git a/src/settings/constants.h b/src/settings/constants.h
index 5098cfc5853206bdd83f19f3fa027073d9052e00..fbd5bc95fc2fe7822c1f1f0f0b54f6d67298fbb5 100644
--- a/src/settings/constants.h
+++ b/src/settings/constants.h
@@ -10,10 +10,10 @@ const float h_BOUND_Z_MIN = -4.0f;
 const float h_BOUND_Z_MAX = 4.0f;
 
 const float h_GAS_CONSTANT = 0.02f;
-const float h_VISCOSITY = 0.04f;
+const float h_VISCOSITY = 0.0035105f;
 
 const float h_PARTICLE_MASS = 0.04f;
-const float h_REST_DENSITY = 400.0f;
+const float h_REST_DENSITY = 998.29f;
 
 const float h_PI = 3.14159265358979323846f;
 const float h_H = 0.25f; // Smoothing radius
@@ -23,4 +23,6 @@ const int NUM_PARTICLES = 50000;
 const float h_POINT_SCALE = 14.0f;
 const float h_POINT_RADIUS = 25.0f;
 
+const float h_particleRadius = 0.008;
+
 #endif // CONSTANTS_H