diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index 52882abb0b5d581c4f225f662278e757320ac2d2..63d337b2200590fe7d13d853da02c7d3d70d9f9a 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -83,7 +83,7 @@ namespace FluidSimulation {
         // Initialize boundary
         std::vector<float3> boundary_positions;
         std::vector<float3> boundary_normals;
-        int boundaryLayers = 1;
+        int boundaryLayers = 4;
         float spacing = 2.f * h_particleRadius; // Using 0.01 as per your code
 
         createBoundaryParticles(boundary_positions, boundary_normals, boundaryLayers, spacing);
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
index 4615c3afdf92653ebbda46537e6debe8e5a1948f..cbe80c7a97fd3321212757795ea3ed5047ac3d14 100644
--- a/src/cuda/include/pcisph_kernels.h
+++ b/src/cuda/include/pcisph_kernels.h
@@ -116,6 +116,26 @@ __global__ void enforceBoundary(
     float damping
 );
 
+// Function to predict velocities and positions
+__global__ void predictVelNPos(
+    float3* velocities,
+    float3* velocities_pred,
+    float3* positions,
+    float3* positions_pred,
+    float3* force_other,
+    float timeStep,
+    int numParticles
+);
+
+// Function to update velocities and positions based on pressure forces
+__global__ void updatePredVelNPos(
+    float3* velocities_pred,
+    float3* positions_pred,
+    float3* pressure_forces,
+    float timeStep,
+    int numParticles
+);
+
 // Function to predict velocities and positions
 __global__ void predictVelocitiesPositions(
     float3* velocities,
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
index f9df13dace0912c267bee6c345efc83eec9d2f9b..50293da50d81c5f2e0022280e95e9b58b46813b4 100644
--- a/src/cuda/init_cuda.cu
+++ b/src/cuda/init_cuda.cu
@@ -2,6 +2,7 @@
 
 #include "include/cuda_device_vars.h"
 #include "../settings/constants.h"
+#include "include/float3_helper_math.h"
 
 #include "include/pcisph_kernels.h"
 #include "include/smoothing_kernels.h"
@@ -282,8 +283,9 @@ namespace FluidSimulationGPU
         // params.surface_tension_normalization = 32.f / (utils::PI * std::pow(params.kernel_radius, 9.f));
 
         float kernel_radius = 4.0f * h_particleRadius;
-        float kernel_radius2 = pow(kernel_radius, 2.0f); // check taht pow() func is correct!!
-        float particle_mass = h_REST_DENSITY / pow(1.0f / (2.0f * h_particleRadius), 3.0f);
+        float kernel_radius2 = powf(kernel_radius, 2.0f); // check taht pow() func is correct!!
+        //float particle_mass = h_REST_DENSITY / pow(1.0f / (2.0f * h_particleRadius), 3.0f) * 100.f;
+        float particle_mass = 0.04f;
 
         printf("particle_radius: %f\n", h_particleRadius);
         printf("kernel_radius: %f\n", kernel_radius);
@@ -293,11 +295,11 @@ namespace FluidSimulationGPU
 
 
         // MAke sure constannts are correct!!!
-        float poly6__const = 315.f / (64.f * M_PI * pow(kernel_radius, 9.f));
-        float poly6_grad_const = -945.f / (32.f * M_PI * pow(kernel_radius, 9.f));
-        float viscosity_d2_normalization = 45.f / (M_PI * pow(kernel_radius, 6.0f));
-        float spiky_d1_normalization = -45.f / (M_PI * pow(kernel_radius, 6.0f));
-        float surface_tension_normalization = 32.f / (M_PI * pow(kernel_radius, 9.f));
+        float poly6__const = 315.f / (64.f * M_PI * powf(kernel_radius, 9.f));
+        float poly6_grad_const = -945.f / (32.f * M_PI * powf(kernel_radius, 9.f));
+        float viscosity_d2_normalization = 45.f / (M_PI * powf(kernel_radius, 6.0f));
+        float spiky_d1_normalization = -45.f / (M_PI * powf(kernel_radius, 6.0f));
+        float surface_tension_normalization = 32.f / (M_PI * powf(kernel_radius, 9.f));
 
         cudaMemcpyToSymbol(H, &kernel_radius, sizeof(float));
         cudaMemcpyToSymbol(H2, &kernel_radius2, sizeof(float));
@@ -314,7 +316,7 @@ namespace FluidSimulationGPU
         cudaMemcpyToSymbol(SURFACE_TENS_TERM, &surface_tension_term, sizeof(float));
 
         // 1) Beta
-        float timeStep = 0.001f;
+        float timeStep = 0.004f;
         float dtm_rho = (timeStep * particle_mass) / h_REST_DENSITY;
         float beta = 2.0f * dtm_rho * dtm_rho;
 
@@ -322,6 +324,9 @@ namespace FluidSimulationGPU
         double sum_val[3] = {0.0, 0.0, 0.0};
         double sum_spiky[3] = {0.0, 0.0, 0.0};
 
+        float3 grad_sum = make_float3(0.f, 0.f, 0.f);
+        float sum_dot_grad = 0;
+
         float step = 2.0f * h_particleRadius;
         float R = kernel_radius + h_particleRadius;
 
@@ -332,11 +337,20 @@ namespace FluidSimulationGPU
             {
                 for (float x = -R; x <= R; x += step)
                 {
-                    float r2 = x * x + y * y + z * z;
-                    float r = sqrtf(r2);
+                    float3 r = make_float3(x, y, z);
+                    float r2 = dot(r, r);
+                    float r_norm = sqrtf(r2);
+                    // float r2 = x * x + y * y + z * z;
+                    // float r_norm = sqrtf(r2);
 
                     if (r2 < kernel_radius2)
                     {
+                        float3 grad = (r/r_norm) * spiky_d1_normalization * powf(kernel_radius - r_norm, 2.f);
+                        
+                        grad_sum += grad;
+
+                        sum_dot_grad += dot(grad, grad);
+                       /*
                         float spikyMag = spiky_d1_normalization * pow(kernel_radius - r, 2.f) * (1.0f / r); // should it not match spky grad kernel?? r/r_norm??
                         float3 spiky_grad = {
                             spikyMag * x,
@@ -359,11 +373,18 @@ namespace FluidSimulationGPU
 
                         double dotSV = (double)spiky_grad.x * poly6_grad.x + (double)spiky_grad.y * poly6_grad.y + (double)spiky_grad.z * poly6_grad.z;
                         sumDot += dotSV;
+                       */ 
                     }
                 }
             }
         }
 
+        float dot_sum_grad = dot(grad_sum, grad_sum);
+
+        float grad_term = -dot_sum_grad - sum_dot_grad;
+
+        h_PCISPH_density_scaling_factor = (float)-1.f / (beta * grad_term);
+        /*
         double sum_spiky_dot_val = sum_spiky[0] * sum_val[0] + sum_spiky[1] * sum_val[1] + sum_spiky[2] * sum_val[2];
 
         double bigTerm = -sum_spiky_dot_val - sumDot;
@@ -373,9 +394,10 @@ namespace FluidSimulationGPU
         }
 
         double alpha = -1.0 / ((double)beta * bigTerm);
-
         // Store final
         h_PCISPH_density_scaling_factor = (float)alpha;
+        */
+
 
         printf("PCISPH density scaling factor: alpha = %g\n",
                h_PCISPH_density_scaling_factor);
@@ -440,9 +462,9 @@ namespace FluidSimulationGPU
     {
         // Recalculate derived constants
         float h_H2 = smoothingRadius * smoothingRadius;
-        float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(smoothingRadius, 9));
-        float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(smoothingRadius, 6));
-        float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(smoothingRadius, 6));
+        float h_POLY6_CONST = 315.0f / (64.0f * M_PI * powf(smoothingRadius, 9));
+        float h_SPIKY_GRAD_CONST = -45.0f / (M_PI * powf(smoothingRadius, 6));
+        float h_VISC_LAP_CONST = 45.0f / (M_PI * powf(smoothingRadius, 6));
 
         // Send constants to CUDA device
         cudaMemcpyToSymbol(PARTICLE_MASS, &particleMass, sizeof(float));
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index a552c0eda645c876bd84995df8b1382b604428f2..c7fff7ada52d51d650cce2c002256e7235d4bb1e 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -209,11 +209,11 @@ __global__ void computeDensityError(
         }
     }
 
-    float density_variation = density_i + b_density_i - REST_DENSITY;
+    float density_variation = (density_i + b_density_i) - REST_DENSITY;
     //float density_variation = max(0.0f, density_i + b_density_i - REST_DENSITY);
     //densities[idx] = density; 
     density_errors[idx] = density_variation;
-    particle_pressure[idx] += density_variation * d_PCISPH_density_scaling_factor;
+    particle_pressure[idx] += -density_variation * K_PCI;
 
 }
 
@@ -230,7 +230,7 @@ __global__ void updatePressures(
 
     if (idx < numParticles) {
         float deltaP = -density_errors[idx] * A;
-        pressures[idx] += deltaP;
+        pressures[idx] = deltaP;
     }
 }
 
@@ -258,7 +258,8 @@ __global__ void computePressureForces(
     float P_i = pressure[idx];
     float dens_i = densities[idx];
 
-    float pfc_i = P_i / (dens_i* dens_i);
+    //float pfc_i = P_i / (dens_i * dens_i);
+    float pfc_i = P_i / (REST_DENSITY * REST_DENSITY);
 
     float3 fPressure = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -303,7 +304,8 @@ __global__ void computePressureForces(
                             float gradW = spikyKernelGrad(dist);
                             float3 gradWij = gradW * (r / dist); // fix ^ to get r with sign
                             //float factor = - (P_i + P_j) / (REST_DENSITY * REST_DENSITY);
-                            float pfc =  pfc_i + pfc_j;
+                            //float pfc =  pfc_i + pfc_j;
+                            float pfc =  2.f * pfc_i;
                             fPressure += -pfc * gradWij * PARTICLE_MASS * PARTICLE_MASS;
                         }
                     }
@@ -332,7 +334,8 @@ __global__ void computePressureForces(
                             float3 gradWij = gradW * (r/dist);
                             // Using Equation (85): p_i and p_b = p_i; densities: ρ_i and ρ_0
                             //float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
-                            float pfc =  pfc_i + pfc_j; 
+                            //float pfc =  pfc_i + pfc_j; 
+                            float pfc =  2.f * pfc_i; 
                             b_fPressure += -pfc * gradWij * PARTICLE_MASS * PARTICLE_MASS;
                         }
                     }
@@ -549,7 +552,7 @@ __global__ void computeViscosityAndExternalForcesSurfaceTension(
     float  rho_i  = densities[idx];
 
     // Initialize the final “other” forces for particle i
-    float3 fGravity     = make_float3(0.f, -0.f, 0.f) * PARTICLE_MASS;  // e.g. (0, -9.8f, 0)*mass
+    float3 fGravity     = make_float3(0.f, -5.f, 0.f) * PARTICLE_MASS;  // e.g. (0, -9.8f, 0)*mass
     float3 fViscosity   = make_float3(0.f, 0.f, 0.f);
     float3 fCohesion    = make_float3(0.f, 0.f, 0.f);
     float3 fCurvature   = make_float3(0.f, 0.f, 0.f);
@@ -635,7 +638,7 @@ __global__ void computeViscosityAndExternalForcesSurfaceTension(
 
     float3 fSurface = (fCohesion + fCurvature);
 
-    float3 fOther = fGravity + fViscosity + fSurface;
+    float3 fOther = fGravity; //+ fViscosity; // + fSurface;
     force_other[idx] = fOther;
 }
 
@@ -683,6 +686,62 @@ __global__ void enforceBoundary(
     }
 }
 
+
+// Function to predict velocities and positions
+__global__ void predictVelNPos(
+    float3* velocities,
+    float3* velocities_pred,
+    float3* positions,
+    float3* positions_pred,
+    float3* force_other,
+    float timeStep,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+    float max_vel = 50.f;
+    // Compute acceleration
+    float3 acc = force_other[idx] / PARTICLE_MASS;
+
+    // Predict velocity with clamping
+    //float3 vel_pred = clamp_length(velocities[idx] + acc * timeStep, max_vel);
+    float3 vel_pred = velocities[idx] + acc * timeStep;
+
+    // Predict position
+    float3 pos_pred = positions[idx] + vel_pred * timeStep;
+
+
+    velocities_pred[idx] = vel_pred;
+    positions_pred[idx] = pos_pred;
+}
+
+// Function to update velocities and positions based on pressure forces
+__global__ void updatePredVelNPos(
+    float3* velocities_pred,
+    float3* positions_pred,
+    float3* pressure_forces,
+    float timeStep,
+    int numParticles
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+    float max_vel = 50.f;
+    // Compute acceleration
+    float3 acc = pressure_forces[idx] / PARTICLE_MASS;
+
+    // Predict velocity with clamping
+    //float3 vel_pred = clamp_length(velocities[idx] + acc * timeStep, max_vel);
+    float3 vel_pred = velocities_pred[idx] + acc * timeStep;
+
+    // Predict position
+    float3 pos_pred = positions_pred[idx] + vel_pred * timeStep;
+
+    velocities_pred[idx] = vel_pred;
+    positions_pred[idx] = pos_pred;
+}
+
 // Kernel to predict velocities and positions with boundary handling
 __global__ void predictVelocitiesPositions(
     float3* velocities,
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index 3ec48ca0777898ac8746a042785c79d74de8379b..0cd8244eda1118215c263523d781c3eba8de590d 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -23,7 +23,7 @@ namespace FluidSimulationGPU
         int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
 
         // Time step should be small but we can run updateParticles multiple times during display
-        float timeStep = 0.001f;
+        float timeStep = 0.004f;
 
         // -------------------- Neighbor Search --------------------
         int gridSizeX, gridSizeY, gridSizeZ;
@@ -57,7 +57,7 @@ namespace FluidSimulationGPU
         // cudaDeviceSynchronize();
 
         // Initialize pressure increments to zero
-        cudaMemset(d_pressure_force, 0, NUM_PARTICLES * sizeof(float));
+        cudaMemset(d_pressure_force, 0, NUM_PARTICLES * sizeof(float3));
         cudaMemset(d_particle_pressure, 0, NUM_PARTICLES * sizeof(float));
 
         // Compute densities and density errors, and per-particle sumGradW2
@@ -100,6 +100,17 @@ namespace FluidSimulationGPU
             gridSizeX, gridSizeY, gridSizeZ);
         cudaDeviceSynchronize();
 
+        predictVelNPos<<<blocksPerGrid, threadsPerBlock>>>(
+            d_velocities,
+            d_velocities_pred,
+            d_positions,
+            d_positions_pred,
+            d_force_other,
+            timeStep,
+            NUM_PARTICLES
+        );
+        cudaDeviceSynchronize();
+        
         // Reduce sum_gradW2 over all particles
         thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
         float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
@@ -113,43 +124,36 @@ namespace FluidSimulationGPU
         float m = h_PARTICLE_MASS;
         float rho0 = h_REST_DENSITY;
         float B = 2.0f * timeStep * timeStep * m * m / (rho0 * rho0);
-        float K_PCI = -1 / (B * totalSumGradW2);
+        // float K_PCI = -1 / (B * totalSumGradW2);
+        float K_PCI = -0.5f;
+
+        printf("K_PCI: %f\n", K_PCI);
+        
+        correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions_pred,
+            d_velocities_pred,
+            d_boundary_positions,
+            d_boundary_normals,
+            d_cellIndices,
+            d_boundaryCellStart,
+            d_boundaryCellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ,
+            r0,
+            normalDamping,
+            epsilon,
+            delta);
+        cudaDeviceSynchronize();
+
 
         // ------------ Iterative Pressure Correction Loop --------------------
 
-        const int maxIterations = 7;
+        const int maxIterations = 5;
 
         for (int iter = 0; iter < maxIterations; ++iter)
         {
             // Launch the boundary correction kernel
 
-            predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-                d_velocities,
-                d_velocities_pred,
-                d_positions,
-                d_positions_pred,
-                d_pressure_force,
-                d_force_other,
-                timeStep,
-                NUM_PARTICLES);
-            cudaDeviceSynchronize();
-
-            correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
-                d_positions_pred,
-                d_velocities_pred,
-                d_boundary_positions,
-                d_boundary_normals,
-                d_cellIndices,
-                d_boundaryCellStart,
-                d_boundaryCellEnd,
-                NUM_PARTICLES,
-                gridSizeX, gridSizeY, gridSizeZ,
-                r0,
-                normalDamping,
-                epsilon,
-                delta);
-            cudaDeviceSynchronize();
-
             // Compute desnity and update pressure
             computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
                 d_positions_pred,
@@ -188,46 +192,37 @@ namespace FluidSimulationGPU
                 gridSizeX, gridSizeY, gridSizeZ);
             cudaDeviceSynchronize();
 
-            // updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-            //     d_velocities_pred,
-            //     d_positions_pred,
-            //     d_pressure_force,
-            //     timeStep,
-            //     NUM_PARTICLES);
-            // cudaDeviceSynchronize();
-        }
-
-        updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions,
-            d_velocities,
-            d_pressure_force,
-            d_force_other,
-            timeStep,
-            NUM_PARTICLES);
-
-        cudaDeviceSynchronize();
+            updatePredVelNPos<<<blocksPerGrid, threadsPerBlock>>>(
+                d_velocities_pred,
+                d_positions_pred,
+                d_pressure_force,
+                timeStep,
+                NUM_PARTICLES);
+            cudaDeviceSynchronize();
+            
+            correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
+                d_positions_pred,
+                d_velocities_pred,
+                d_boundary_positions,
+                d_boundary_normals,
+                d_cellIndices,
+                d_boundaryCellStart,
+                d_boundaryCellEnd,
+                NUM_PARTICLES,
+                gridSizeX, gridSizeY, gridSizeZ,
+                r0,
+                normalDamping,
+                epsilon,
+                delta);
 
-        correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions,
-            d_velocities,
-            d_boundary_positions,
-            d_boundary_normals,
-            d_cellIndices,
-            d_boundaryCellStart,
-            d_boundaryCellEnd,
-            NUM_PARTICLES,
-            gridSizeX, gridSizeY, gridSizeZ,
-            r0,
-            normalDamping,
-            epsilon,
-            delta);
+            cudaDeviceSynchronize();
+        }
 
-        cudaDeviceSynchronize();
 
         // -------------------- Finalization --------------------
 
-        // cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
-        // cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+        cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
+        cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
 
         // Enforce boundary conditions
 
diff --git a/src/settings/constants.h b/src/settings/constants.h
index 7b04799c4e0cf157aef724e4e63270c73a273709..daaa916f3df108e30ce3c10bf28ddbe63b53cf1b 100644
--- a/src/settings/constants.h
+++ b/src/settings/constants.h
@@ -2,28 +2,35 @@
 #ifndef CONSTANTS_H
 #define CONSTANTS_H
 
-const float h_BOUND_X_MIN = -1.0f;
-const float h_BOUND_X_MAX = 1.0f;
+// const float h_BOUND_X_MIN = -1.0f;
+// const float h_BOUND_X_MAX = 1.0f;
+// const float h_BOUND_Y_MIN = 0.0f;
+// const float h_BOUND_Y_MAX = 2.5f;
+// const float h_BOUND_Z_MIN = -1.0f;
+// const float h_BOUND_Z_MAX = 1.0f;
+const float h_BOUND_X_MIN = -6.0f;
+const float h_BOUND_X_MAX = 12.0f;
 const float h_BOUND_Y_MIN = 0.0f;
-const float h_BOUND_Y_MAX = 2.5f;
-const float h_BOUND_Z_MIN = -1.0f;
-const float h_BOUND_Z_MAX = 1.0f;
+const float h_BOUND_Y_MAX = 10.0f;
+const float h_BOUND_Z_MIN = -4.0f;
+const float h_BOUND_Z_MAX = 4.0f;
+
 
 const float h_GAS_CONSTANT = 0.02f;
-const float h_VISCOSITY = 0.0035105f;
+const float h_VISCOSITY = 0.35105f;
 
 const float h_PARTICLE_MASS = 0.04f;
-const float h_REST_DENSITY = 1000.0f;
+const float h_REST_DENSITY = 800.0f;
 
 const float h_PI = 3.14159265358979323846f;
 const float h_H = 0.25f; // Smoothing radius
 
 const int NUM_PARTICLES = 50000;
 
-const float h_POINT_SCALE = 4.0f;
-const float h_POINT_RADIUS = 10.0f;
+const float h_POINT_SCALE = 18.0f;
+const float h_POINT_RADIUS = 12.0f;
 
-const float h_particleRadius = 0.008;
+const float h_particleRadius = 0.0625;
 
 const float h_surcface_tension_coeff = 0.0032;
 const float h_surface_tension_term = 0.035;
diff --git a/src/spawn_particles.cpp b/src/spawn_particles.cpp
index b8decd4b17d3f1c2ca7a92fe41f6424b112fbb93..f63e9d052202a9499d59e010e0ad918ac9fa32f3 100644
--- a/src/spawn_particles.cpp
+++ b/src/spawn_particles.cpp
@@ -50,7 +50,7 @@ void initParticles(
     {
         case SimulationMode::OneLargeCube:
             particlesPerAxis = static_cast<int>(ceil(pow(NUM_PARTICLES, 1.0f / 3.0f)));
-            index = initializeCube(0, particlesPerAxis, 0.1f, 0.1f, 0.1f);
+            index = initializeCube(0, particlesPerAxis, 1.0f, 4.0f, 1.0f);
             break;
 
         case SimulationMode::TwoSmallCubes: