diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index df4cc16814210cfd875a510b20fca2f77c6b096b..52882abb0b5d581c4f225f662278e757320ac2d2 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -32,7 +32,7 @@ namespace FluidSimulation {
         // Creaet new fluiid particles
         std::vector<float3> h_positions(m_numParticles);
         std::vector<float3> h_velocities(m_numParticles);
-        float gridSpacing = 0.5 * m_smoothingRadius;
+        float gridSpacing = 2.0f * h_particleRadius;
         initParticles(h_positions, h_velocities, m_numParticles, gridSpacing, m_mode);
     
         FluidSimulationGPU::resetSimulation(
diff --git a/src/FluidSimulationApp.cpp b/src/FluidSimulationApp.cpp
index 45773eb3fbe30350107bf3bcc45003cf81805f44..0f18901d87218217251c10b5219f4713b6053fbb 100644
--- a/src/FluidSimulationApp.cpp
+++ b/src/FluidSimulationApp.cpp
@@ -28,7 +28,7 @@ namespace FluidSimulation {
 
         m_fluidRenderer->setVBOs(m_fluidSimulation->getParticleVBO(), m_fluidSimulation->getBoundaryVBO());
 
-        m_updatePhysics = true;
+        m_updatePhysics = false;
 
         //m_fluidRenderer->init();
         std::cout << "FluidSimulatorApp created with settings: "
@@ -216,67 +216,66 @@ namespace FluidSimulation {
         if (m_updatePhysics)
         {
             m_fluidSimulation->updateSimulation();
+        }
+        // Precompute particle counts to avoid multiple function calls
+        size_t particleCount = m_fluidSimulation->getParticleCount();
+        size_t boundaryCount = m_fluidSimulation->getBoundaryParticleCount();
 
-            // Precompute particle counts to avoid multiple function calls
-            size_t particleCount = m_fluidSimulation->getParticleCount();
-            size_t boundaryCount = m_fluidSimulation->getBoundaryParticleCount();
-
-            switch (selectedStage)
-            {
-                case RenderStage::ParticleView:
-                    m_fluidRenderer->renderNormalFrame(particleCount, boundaryCount);
-                    break;
-
-                case RenderStage::DepthBuffer:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->visualizeBuffer(RenderStage::DepthBuffer);
-                    break;
-
-                case RenderStage::FilteredDepth:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->visualizeBuffer(RenderStage::FilteredDepth);
-                    break;
-
-                case RenderStage::DepthNormals:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->renderDepthNormalsFrame();
-                    break;
-
-                case RenderStage::ThicknessBuffer:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->visualizeBuffer(RenderStage::ThicknessBuffer);
-                    break;
-                
-                case RenderStage::FilteredThickness:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->visualizeBuffer(RenderStage::FilteredThickness);
-                    break;
-
-                case RenderStage::LightAttenuation:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->renderThicknessAttenFrame();
-                    break;
+        switch (selectedStage)
+        {
+            case RenderStage::ParticleView:
+                m_fluidRenderer->renderNormalFrame(particleCount, boundaryCount);
+                break;
+
+            case RenderStage::DepthBuffer:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->visualizeBuffer(RenderStage::DepthBuffer);
+                break;
+
+            case RenderStage::FilteredDepth:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->visualizeBuffer(RenderStage::FilteredDepth);
+                break;
+
+            case RenderStage::DepthNormals:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->renderDepthNormalsFrame();
+                break;
+
+            case RenderStage::ThicknessBuffer:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->visualizeBuffer(RenderStage::ThicknessBuffer);
+                break;
+            
+            case RenderStage::FilteredThickness:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->visualizeBuffer(RenderStage::FilteredThickness);
+                break;
+
+            case RenderStage::LightAttenuation:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->renderThicknessAttenFrame();
+                break;
+            
+            case RenderStage::ScreenSpacedRender:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->renderDepthNormalsFrame();
                 
-                case RenderStage::ScreenSpacedRender:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->renderDepthNormalsFrame();
-                    
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->renderThicknessAttenFrame();
-                    
-                    m_fluidRenderer->renderFinalCombineFrame();
-                    break;
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->renderThicknessAttenFrame();
                 
-                default:
-                    std::cerr << "Unknown RenderStage selected." << std::endl;
-                    break;
-            }
+                m_fluidRenderer->renderFinalCombineFrame();
+                break;
+            
+            default:
+                std::cerr << "Unknown RenderStage selected." << std::endl;
+                break;
         }
 
         // Display FBO texture in ImGui window
diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h
index 1742015a04bf5d13eedb40ae62bb8d9ca264b191..4d2abeafab17df0d83064ff0ee4ec3d289f174b4 100644
--- a/src/cuda/include/cuda_device_vars.h
+++ b/src/cuda/include/cuda_device_vars.h
@@ -20,6 +20,10 @@ extern __constant__ float POLY6_GRAD_CONST;
 extern __constant__ float SPIKY_GRAD_CONST;
 extern __constant__ float VISC_LAP_CONST;
 
+extern __constant__ float SURFACE_TENS_COEFF;
+extern __constant__ float SURFACE_TENS_TERM;
+extern __constant__ float SURFACE_TENS_NORMALIZATION;
+
 extern __constant__ float PARTICLE_MASS;
 extern __constant__ float REST_DENSITY;
 extern __constant__ float VISCOSITY;
@@ -51,6 +55,7 @@ extern float* d_density_errors;
 extern float* d_particle_pressure;
 extern float3* d_pressure_force;
 extern float3* d_force_other;
+extern float3* d_fluid_normals;
 
 // Additional arrays for PCISPH
 extern float3* d_velocities_pred;
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
index 44bcf6c9ee54252991311943853782ecffea00ca..4615c3afdf92653ebbda46537e6debe8e5a1948f 100644
--- a/src/cuda/include/pcisph_kernels.h
+++ b/src/cuda/include/pcisph_kernels.h
@@ -82,11 +82,25 @@ __global__ void correctBoundaryPenetrations(
     float delta                         // Elasticity coefficient (0 <= delta <= 1)
 );
 
+
+__global__ void computeFluidNormals(
+    float3* positions,
+    float3* normals,
+    float* densities,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
 // Function to compute viscosity and external forces
-__global__ void computeViscosityAndExternalForces(
+__global__ void computeViscosityAndExternalForcesSurfaceTension(
     float3* positions,
     float3* velocities,
-    float3* force_other,
+    float3* normals,        // Computed from computeNormals(...)
+    float* densities,
+    float3* force_other,    // Output array for “non-pressure” forces
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
diff --git a/src/cuda/include/smoothing_kernels.h b/src/cuda/include/smoothing_kernels.h
index 76e22fc9729a1c9cbc3dd1343e573ce5dd224f89..5f40079409578f7fd0c2fd07c51c6b7a95bcfe4d 100644
--- a/src/cuda/include/smoothing_kernels.h
+++ b/src/cuda/include/smoothing_kernels.h
@@ -16,6 +16,8 @@ __device__ float viscosityKernelLap(float r);
 __device__ float cubicSplineKernel(float r);
 __device__ float cubicSplineGrad(float r);
 
+__device__ float cohesionKernel(float r);
+
 } // namespace FluidSimulationGPU
 
 #endif // SMOOTHING_KERNELS_H
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
index c471ff46dd75619e950406d502aea18fa22b61f4..f9df13dace0912c267bee6c345efc83eec9d2f9b 100644
--- a/src/cuda/init_cuda.cu
+++ b/src/cuda/init_cuda.cu
@@ -27,6 +27,10 @@ namespace FluidSimulationGPU
     __constant__ float SPIKY_GRAD_CONST;
     __constant__ float VISC_LAP_CONST;
 
+    __constant__ float SURFACE_TENS_COEFF;
+    __constant__ float SURFACE_TENS_TERM;
+    __constant__ float SURFACE_TENS_NORMALIZATION;
+
     __constant__ float PARTICLE_MASS;
     __constant__ float REST_DENSITY;
     __constant__ float VISCOSITY;
@@ -42,6 +46,7 @@ namespace FluidSimulationGPU
     // Device Arrays
     float3 *d_positions = nullptr;
     float3 *d_velocities = nullptr;
+    float3 *d_fluid_normals = nullptr; 
 
     unsigned int *d_cellIndices = nullptr;
     unsigned int *d_particleIndices = nullptr;
@@ -143,6 +148,8 @@ namespace FluidSimulationGPU
         // Allocate device memory for predicted velocities and positions
         cudaMalloc(&d_velocities_pred, numParticles * sizeof(float3));
         cudaMalloc(&d_positions_pred, numParticles * sizeof(float3));
+        
+        cudaMalloc(&d_fluid_normals, numParticles * sizeof(float3));
 
         // Allocate device memory for cell indices and particle indices
         cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int));
@@ -152,10 +159,11 @@ namespace FluidSimulationGPU
         cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3));
         cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3));
 
+        float kernel_radius = 4.f * h_particleRadius;
         // Allocate cell start and end arrays
-        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
-        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
-        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / kernel_radius);
+        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / kernel_radius);
+        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / kernel_radius);
         int maxCells = gridSizeX * gridSizeY * gridSizeZ;
 
         cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int));
@@ -180,10 +188,12 @@ namespace FluidSimulationGPU
         const float3 *h_boundaryNormals, // Renamed parameter
         int numBoundaryParticles)
     {
+
+        float kernel_radius = 4.f * h_particleRadius;
         // Allocate cell start and end arrays
-        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
-        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
-        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+        int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / kernel_radius);
+        int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / kernel_radius);
+        int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / kernel_radius);
         int maxCells = gridSizeX * gridSizeY * gridSizeZ;
 
         // Allocate memory for boundary positions
@@ -273,7 +283,14 @@ namespace FluidSimulationGPU
 
         float kernel_radius = 4.0f * h_particleRadius;
         float kernel_radius2 = pow(kernel_radius, 2.0f); // check taht pow() func is correct!!
-        float particle_mass = h_REST_DENSITY / pow(1.0f / (2.0f * h_particleRadius), 3.0f) / 1.15f;
+        float particle_mass = h_REST_DENSITY / pow(1.0f / (2.0f * h_particleRadius), 3.0f);
+
+        printf("particle_radius: %f\n", h_particleRadius);
+        printf("kernel_radius: %f\n", kernel_radius);
+        printf("kernel_radius2: %f\n", kernel_radius2);
+        printf("particle_mass: %f\n", particle_mass);
+        printf("rest_density: %f\n", h_REST_DENSITY);
+
 
         // MAke sure constannts are correct!!!
         float poly6__const = 315.f / (64.f * M_PI * pow(kernel_radius, 9.f));
@@ -288,11 +305,16 @@ namespace FluidSimulationGPU
         cudaMemcpyToSymbol(POLY6_GRAD_CONST, &poly6_grad_const, sizeof(float));
         cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &spiky_d1_normalization, sizeof(float));
         cudaMemcpyToSymbol(VISC_LAP_CONST, &viscosity_d2_normalization, sizeof(float));
+        cudaMemcpyToSymbol(SURFACE_TENS_NORMALIZATION, &surface_tension_normalization, sizeof(float));
+
+        float surface_tension_term = pow(kernel_radius, 6.f) / 64.f;
 
         cudaMemcpyToSymbol(PARTICLE_MASS, &particle_mass, sizeof(float));
+        cudaMemcpyToSymbol(SURFACE_TENS_COEFF, &h_surcface_tension_coeff, sizeof(float));
+        cudaMemcpyToSymbol(SURFACE_TENS_TERM, &surface_tension_term, sizeof(float));
 
         // 1) Beta
-        float timeStep = 0.01f;
+        float timeStep = 0.001f;
         float dtm_rho = (timeStep * particle_mass) / h_REST_DENSITY;
         float beta = 2.0f * dtm_rho * dtm_rho;
 
@@ -313,7 +335,7 @@ namespace FluidSimulationGPU
                     float r2 = x * x + y * y + z * z;
                     float r = sqrtf(r2);
 
-                    if (r < h_H && r > 1e-9f)
+                    if (r2 < kernel_radius2)
                     {
                         float spikyMag = spiky_d1_normalization * pow(kernel_radius - r, 2.f) * (1.0f / r); // should it not match spky grad kernel?? r/r_norm??
                         float3 spiky_grad = {
@@ -321,7 +343,7 @@ namespace FluidSimulationGPU
                             spikyMag * y,
                             spikyMag * z};
 
-                        float poly6Mag = poly6_grad_const * pow(kernel_radius2 - r2, 2.f); // Check ttaht kernel is correect!!!
+                        float poly6Mag = spikyMag = spiky_d1_normalization * pow(kernel_radius - r, 2.f) * (1.0f / r); // Check ttaht kernel is correect!!!
                         float3 poly6_grad = {
                             poly6Mag * x,
                             poly6Mag * y,
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index 4e1a5e278d077263f2fc9a19ea5a56aac3f5ce45..a552c0eda645c876bd84995df8b1382b604428f2 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -120,7 +120,7 @@ __global__ void computeInitialDensity(
         }
     }
 
-    densities[idx] = REST_DENSITY; //density_i + b_density_i;
+    densities[idx] = density_i + b_density_i;
 
     if (sum_gradW2 != nullptr) {
         sum_gradW2[idx] = sumGradW2_local;
@@ -209,8 +209,9 @@ __global__ void computeDensityError(
         }
     }
 
-    float density_variation = max(density_i + b_density_i - REST_DENSITY, 0.f);
-    //densities[idx] = density;
+    float density_variation = density_i + b_density_i - REST_DENSITY;
+    //float density_variation = max(0.0f, density_i + b_density_i - REST_DENSITY);
+    //densities[idx] = density; 
     density_errors[idx] = density_variation;
     particle_pressure[idx] += density_variation * d_PCISPH_density_scaling_factor;
 
@@ -342,6 +343,7 @@ __global__ void computePressureForces(
         }
     }
 
+    // Should we use predicted denseties isntead of original deenseties??
     pressure_force[idx] = fPressure + b_fPressure;
 }
 
@@ -454,79 +456,190 @@ __global__ void correctBoundaryPenetrations(
     velocities_pred[idx] = vel_i;
 }
 
-// Compute viscosity and external forces (excluding pressure forces)
-__global__ void computeViscosityAndExternalForces(
+__global__ void computeFluidNormals(
     float3* positions,
-    float3* velocities,
-    float3* force_other,
+    float3* normals,
+    float* densities,
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
     int numParticles,
-    int gridSizeX, int gridSizeY, int gridSizeZ
-)
+    int gridSizeX, int gridSizeY, int gridSizeZ)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
 
-    if (idx < numParticles) {
-        float3 pos_i = positions[idx];
-        float3 vel_i = velocities[idx];
+    float3 pos_i = positions[idx];
+    //float rho_i = densities[idx];
 
-        float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f);
-        float3 fGravity = make_float3(0.0f, -20.0f, 0.0f) * PARTICLE_MASS;
+    float3 normal_i = make_float3(0.f, 0.f, 0.f);
 
-        unsigned int cellIndex = cellIndices[idx];
+    unsigned int cellIndex = cellIndices[idx];
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
 
-        // Compute cell coordinates
-        int cellX = cellIndex % gridSizeX;
-        int cellY = (cellIndex / gridSizeX) % gridSizeY;
-        int cellZ = cellIndex / (gridSizeX * gridSizeY);
+    // Search neighbors in 3x3x3 region
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
 
-        // Iterate over neighboring cells
-        for (int dz = -1; dz <= 1; ++dz) {
-            int nz = cellZ + dz;
-            if (nz < 0 || nz >= gridSizeZ) continue;
-            for (int dy = -1; dy <= 1; ++dy) {
-                int ny = cellY + dy;
-                if (ny < 0 || ny >= gridSizeY) continue;
-                for (int dx = -1; dx <= 1; ++dx) {
-                    int nx = cellX + dx;
-                    if (nx < 0 || nx >= gridSizeX) continue;
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                unsigned int startIdx = cellStart[neighborCellIndex];
+                unsigned int endIdx   = cellEnd[neighborCellIndex];
 
-                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                if (startIdx == 0xFFFFFFFF) continue; // no particles
 
-                    unsigned int startIdx = cellStart[neighborCellIndex];
-                    unsigned int endIdx = cellEnd[neighborCellIndex];
+                for (unsigned int j = startIdx; j < endIdx; ++j) {
+                    if (j == idx) continue;
 
-                    if (startIdx != 0xFFFFFFFF) {
-                        for (unsigned int j = startIdx; j < endIdx; ++j) {
-                            if (j == idx) continue;
+                    float3 pos_j = positions[j];
+                    float rho_j  = densities[j];
 
-                            float3 pos_j = positions[j];
-                            float3 vel_j = velocities[j];
-                            float3 r = pos_i - pos_j;
+                    float3 r = pos_i - pos_j;
+                    float r2 = dot(r, r);
+                    if (r2 < H2 && r2 > 1e-12f) {
+                        float r_len = sqrtf(r2);
 
-                            float distSq = dot(r, r);
+                        // color field gradient from poly6 or spiky, your choice
+                        // e.g. using spiky gradient coefficient:
+                        float gradW = poly6GradKernel(r2);
+                        float3 gradW_ij = gradW * r;
 
-                            if (distSq < H2 && distSq > 0.0001f) {
-                                float dist = sqrtf(distSq);
+                        // Weighted by mass/density:
+                        float3 w = gradW_ij /  rho_j;
+                        normal_i += w;
+                    }
+                }
+            }
+        }
+    }
 
-                                // Viscosity force
-                                float3 velDiff = vel_j - vel_i;
-                                float viscLap = viscosityKernelLap(dist);
-                                fViscosity += VISCOSITY * velDiff * viscLap * PARTICLE_MASS / REST_DENSITY;
-                            }
+    // Write out the computed normal
+    normal_i *= PARTICLE_MASS * H;
+    normals[idx] = normal_i;
+}
+
+
+__global__ void computeViscosityAndExternalForcesSurfaceTension(
+    float3* positions,
+    float3* velocities,
+    float3* normals,        // Computed from computeNormals(...)
+    float* densities,
+    float3* force_other,    // Output array for “non-pressure” forces
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+
+    float3 pos_i  = positions[idx];
+    float3 vel_i  = velocities[idx];
+    float3 norm_i = normals[idx];
+    float  rho_i  = densities[idx];
+
+    // Initialize the final “other” forces for particle i
+    float3 fGravity     = make_float3(0.f, -0.f, 0.f) * PARTICLE_MASS;  // e.g. (0, -9.8f, 0)*mass
+    float3 fViscosity   = make_float3(0.f, 0.f, 0.f);
+    float3 fCohesion    = make_float3(0.f, 0.f, 0.f);
+    float3 fCurvature   = make_float3(0.f, 0.f, 0.f);
+
+    unsigned int cellIndex = cellIndices[idx];
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+    // Loop over neighbors
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
+
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                unsigned int startIdx = cellStart[neighborCellIndex];
+                unsigned int endIdx   = cellEnd[neighborCellIndex];
+
+                if (startIdx == 0xFFFFFFFF) continue;
+
+                for (unsigned int j = startIdx; j < endIdx; ++j) {
+                    if (j == idx) continue;
+
+                    float3 pos_j  = positions[j];
+                    float3 vel_j  = velocities[j];
+                    float3 norm_j = normals[j];
+                    float  rho_j  = densities[j];
+
+                    float3 r   = pos_i - pos_j;
+                    float dist2= dot(r, r);
+                    if (dist2 < H2 && dist2 > 1e-8f) {
+                        float dist = sqrtf(dist2);
+
+                        // 1. Viscosity
+                        //    Usually: fViscosity += μ * (vel_j - vel_i) * laplacian(W)
+                        //    Weighted by mass/density
+                        float lapW = viscosityKernelLap(dist); 
+                        float3 velDiff = vel_j - vel_i;
+                        fViscosity += (velDiff / rho_j) * lapW;
+
+                        // 2. Surface Tension: “cohesion + curvature”
+                        //    Typically you see: “cohesion” uses a specialized kernel ST_cohesion(dist),
+                        //    “curvature” uses (norm_i - norm_j).
+                        //    Some do a factor = 2 * restDensity / (rho_i + rho_j) 
+                        float st_correction_factor = 2.f * REST_DENSITY / (rho_i + rho_j);
+
+                        // Cohesion
+                        if (dist > 1e-6f) {
+                            float3 dir = r / dist; 
+                            float st_cohesion_kernel = cohesionKernel(dist);
+                            // Accumulate
+                            fCohesion += st_correction_factor * st_cohesion_kernel * dir;
                         }
+
+                        // Curvature
+                        fCurvature += st_correction_factor * (norm_i - norm_j);
                     }
                 }
             }
         }
-
-        // Total acceleration (excluding pressure force)
-        force_other[idx] = fViscosity + fGravity;
     }
+
+    // Scale forces by appropriate constants:
+    // Viscosity final
+    // (You might do fViscosity *= PARTICLE_MASS as well, depends on your conventions.)
+    // Sum into “other forces” 
+    // Notice how you might store partial constants in e.g. “viscosity_d2_normalization,” etc.
+
+    // Surface tension final
+    // Typically: fSurface = - σ * m * [fCohesion + (some smaller factor)*fCurvature]
+    // The code snippet examples differ in how they scale curvature vs. cohesion.
+    //float3 fSurface = - surfaceTensionCoef * PARTICLE_MASS * (fCohesion + fCurvature);
+    fViscosity *= VISCOSITY * PARTICLE_MASS;
+
+    fCohesion *= -SURFACE_TENS_COEFF * PARTICLE_MASS * PARTICLE_MASS * SURFACE_TENS_NORMALIZATION;
+    fCurvature *= -SURFACE_TENS_COEFF * PARTICLE_MASS;
+
+    float3 fSurface = (fCohesion + fCurvature);
+
+    float3 fOther = fGravity + fViscosity + fSurface;
+    force_other[idx] = fOther;
 }
 
+
 // Enforce boundary conditions, very simple should bee improved.
 __global__ void enforceBoundary(
     float3* positions,
@@ -570,7 +683,7 @@ __global__ void enforceBoundary(
     }
 }
 
-// Kernel to predict velocities and positions
+// Kernel to predict velocities and positions with boundary handling
 __global__ void predictVelocitiesPositions(
     float3* velocities,
     float3* velocities_pred,
@@ -583,19 +696,51 @@ __global__ void predictVelocitiesPositions(
 )
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < numParticles) {
-        // Compute predicted velocity
-        float3 acc = (forces_other[idx] + pressure_forces[idx])/PARTICLE_MASS;
-        float3 vel_pred =  velocities[idx] + acc * timeStep;
+    if (idx >= numParticles) return;
+    float max_vel = 50.f;
+    // Compute acceleration
+    float3 acc = (forces_other[idx] + pressure_forces[idx]) / PARTICLE_MASS;
 
-        velocities_pred[idx] = vel_pred;
-        // Compute predicted position
-        positions_pred[idx] = positions[idx] + vel_pred * timeStep;
+    // Predict velocity with clamping
+    float3 vel_pred = clamp_length(velocities[idx] + acc * timeStep, max_vel);
+
+    // Predict position
+    float3 pos_pred = positions[idx] + vel_pred * timeStep;
+
+    /*
+    // Boundary collision detection and response
+    // Clamp positions to container limits
+    float3 clamped_pos = pos_pred;
+    clamped_pos.x = fmaxf(BOUND_X_MIN, fminf(pos_pred.x, BOUND_X_MAX));
+    clamped_pos.y = fmaxf(BOUND_X_MIN, fminf(pos_pred.y, BOUND_Y_MAX));
+    clamped_pos.z = fmaxf(BOUND_Y_MIN, fminf(pos_pred.z, BOUND_Z_MAX));
+
+    // Calculate displacement due to clamping
+    float3 displacement = clamped_pos - pos_pred;
+
+    // If displacement occurred, adjust velocity
+    if (length(displacement) > 0.0f) {
+        // Normal vector
+        float3 normal = normalize(displacement);
+
+        // Calculate restitution (e.g., 0.5 for some energy loss)
+        float restitution = 0.5f;
+
+        // Reflect velocity
+        vel_pred -= (1.0f + restitution) * dot(vel_pred, normal) * normal;
+
+        // Update predicted position after collision
+        pos_pred = clamped_pos;
     }
+    */
+
+    // Store predicted velocity and position
+    velocities_pred[idx] = vel_pred;
+    positions_pred[idx] = pos_pred;
 }
 
-// Update velocities and positions based on pressure forces in 
-// pressure correcting loop. 
+
+// Kernel to update velocities and positions with boundary handling
 __global__ void updateVelocitiesPositions(
     float3* velocities,
     float3* positions,
@@ -606,20 +751,55 @@ __global__ void updateVelocitiesPositions(
 )
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
 
-    if (idx < numParticles) {
-        // Update predicted velocity
-        float3 vel = velocities[idx];
-        vel = clamp(vel, -50, 50);
-        float3 acc = (force_other[idx] + pressure_force[idx]) / PARTICLE_MASS;
-        
-        vel += acc * timeStep;
-        
-        velocities[idx] = vel;
-        // Update predicted position
-        positions[idx] +=  vel * timeStep;
+    float max_vel = 50.f;
+
+    // Compute acceleration
+    float3 acc = (force_other[idx] + pressure_force[idx]) / PARTICLE_MASS;
+
+    // Update velocity with clamping
+    float3 vel = clamp_length(velocities[idx] + acc * timeStep, max_vel);
+
+    // Update position
+    float3 pos = positions[idx] + vel * timeStep;
+
+    /*
+    // Boundary collision detection and response
+    // Clamp positions to container limits
+    float3 clamped_pos = pos;
+    clamped_pos.x = fmaxf(BOUND_X_MIN, fminf(pos.x, BOUND_X_MAX));
+    clamped_pos.y = fmaxf(BOUND_X_MIN, fminf(pos.y, BOUND_Y_MAX));
+    clamped_pos.z = fmaxf(BOUND_Y_MIN, fminf(pos.z, BOUND_Z_MAX));
+
+    // Calculate displacement due to clamping
+    float3 displacement = clamped_pos - pos;
+
+    // If displacement occurred, adjust velocity
+    if (length(displacement) > 0.0f) {
+        // Normal vector
+        float3 normal = normalize(displacement);
+
+        // Calculate restitution (e.g., 0.5 for some energy loss)
+        float restitution = 0.5f;
+
+        // Reflect velocity
+        vel -= (1.0f + restitution) * dot(vel, normal) * normal;
+
+        // Update position after collision
+        pos = clamped_pos;
     }
-}
 
+    // Optionally, clamp the velocity's magnitude
+    float vel_length = length(vel);
+    if (vel_length > max_vel) {
+        vel = (vel / vel_length) * max_vel;
+    }
+    */
+
+    // Store updated velocity and position
+    velocities[idx] = vel;
+    positions[idx] = pos;
+}
 
 } // namespace FluidSimulationGPU
\ No newline at end of file
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index 5e9203f14ea26955ac709d9bce13cf1eb7dbd73e..3ec48ca0777898ac8746a042785c79d74de8379b 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -23,7 +23,7 @@ namespace FluidSimulationGPU
         int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock;
 
         // Time step should be small but we can run updateParticles multiple times during display
-        float timeStep = 0.01f;
+        float timeStep = 0.001f;
 
         // -------------------- Neighbor Search --------------------
         int gridSizeX, gridSizeY, gridSizeZ;
@@ -45,16 +45,6 @@ namespace FluidSimulationGPU
 
         // -------------------- Prediction Step --------------------
 
-        computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions,
-            d_velocities,
-            d_force_other,
-            d_cellIndices,
-            d_cellStart,
-            d_cellEnd,
-            NUM_PARTICLES,
-            gridSizeX, gridSizeY, gridSizeZ);
-        cudaDeviceSynchronize();
 
         // predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>(
         //     d_velocities,
@@ -85,12 +75,36 @@ namespace FluidSimulationGPU
             NUM_PARTICLES,
             gridSizeX, gridSizeY, gridSizeZ);
         cudaDeviceSynchronize();
+        
+        computeFluidNormals<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions,
+            d_fluid_normals,
+            d_densities,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
+        
+        computeViscosityAndExternalForcesSurfaceTension<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions,
+            d_velocities,
+            d_fluid_normals,
+            d_densities,
+            d_force_other,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
 
         // Reduce sum_gradW2 over all particles
         thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
         float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
 
-        float r0 = 0.5f * h_H;      // Penetration threshold
+        float r0 = 0.5f * 2.f * h_particleRadius;      // Penetration threshold
         float epsilon = 1.0f;       // Moderate to high friction
         float delta = 0.5f;         // Low to moderate elasticity
         float normalDamping = 0.5f; // Adjust as needed
@@ -103,7 +117,7 @@ namespace FluidSimulationGPU
 
         // ------------ Iterative Pressure Correction Loop --------------------
 
-        const int maxIterations = 5;
+        const int maxIterations = 7;
 
         for (int iter = 0; iter < maxIterations; ++iter)
         {
@@ -206,8 +220,7 @@ namespace FluidSimulationGPU
             r0,
             normalDamping,
             epsilon,
-            delta
-        );
+            delta);
 
         cudaDeviceSynchronize();
 
@@ -218,13 +231,13 @@ namespace FluidSimulationGPU
 
         // Enforce boundary conditions
 
-        float damping = 0.75f;
-        enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
-            d_positions,
-            d_velocities,
-            NUM_PARTICLES,
-            damping);
-        cudaDeviceSynchronize();
+        // float damping = 0.75f;
+        // enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
+        //     d_positions,
+        //     d_velocities,
+        //     NUM_PARTICLES,
+        //     damping);
+        // cudaDeviceSynchronize();
 
         // Unmap the VBO resource
         cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
diff --git a/src/cuda/smoothing_kernels.cu b/src/cuda/smoothing_kernels.cu
index 45580770ea11763885e21ea5d10526d0bc733f14..c325bb9fe79c743b78b8b8694e1bd1dc4f5da550 100644
--- a/src/cuda/smoothing_kernels.cu
+++ b/src/cuda/smoothing_kernels.cu
@@ -77,5 +77,29 @@ __device__ float cubicSplineGrad(float r) {
     return grad;
 }
 
+__device__ float cohesionKernel(float r) 
+{
+    if (r <= 0.5f * H) {
+        float t1 = (H - r);
+        float t2 = (t1 * t1 * t1) * (r * r * r);
+        return 2.f * t2 - SURFACE_TENS_TERM;
+    }
+    else if (r <= H) {
+        float t1 = (H - r);
+        return (t1 * t1 * t1) * (r * r * r);
+    }
+    else {
+        return 0.f;
+    }
+    /*
+    if (r >= H) return 0.f;   // outside kernel radius = 0
+    float hr = H - r;
+    // This is the raw polynomial shape. 
+    // You might multiply by a constant COHESION_CONST so that it integrates properly, 
+    // or just tune it by hand.
+    return hr*hr*hr * r*r*r; 
+    */
+}
+
 
 } // namespace FluidSimulationGPU
diff --git a/src/cuda/spatialHash_NBSearch_kernels.cu b/src/cuda/spatialHash_NBSearch_kernels.cu
index d48d3005fc409f690d3df74f022c1fe38d35aa9c..d432f8226fa5c8f53fc12722789a5c94efc858a8 100644
--- a/src/cuda/spatialHash_NBSearch_kernels.cu
+++ b/src/cuda/spatialHash_NBSearch_kernels.cu
@@ -105,14 +105,15 @@ void neigbourHoodSearch(
         int &gridSizeZ) 
 {
 
+    float kernel_radius = 4.0 * h_particleRadius;
     // Compute grid size based on smoothing radius
     float3 gridMin = make_float3(h_BOUND_X_MIN, h_BOUND_Y_MIN, h_BOUND_Z_MIN);
     float3 gridMax = make_float3(h_BOUND_X_MAX, h_BOUND_Y_MAX, h_BOUND_Z_MAX);
-    float3 gridCellSize = make_float3(h_H, h_H, h_H);
+    float3 gridCellSize = make_float3(kernel_radius, kernel_radius, kernel_radius);
 
-    gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H);
-    gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H);
-    gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
+    gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / kernel_radius);
+    gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / kernel_radius);
+    gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / kernel_radius);
     int maxCells = gridSizeX * gridSizeY * gridSizeZ;
     
     // Compute cell indices for current positions
diff --git a/src/settings/constants.h b/src/settings/constants.h
index fbd5bc95fc2fe7822c1f1f0f0b54f6d67298fbb5..7b04799c4e0cf157aef724e4e63270c73a273709 100644
--- a/src/settings/constants.h
+++ b/src/settings/constants.h
@@ -2,27 +2,30 @@
 #ifndef CONSTANTS_H
 #define CONSTANTS_H
 
-const float h_BOUND_X_MIN = -6.0f;
-const float h_BOUND_X_MAX = 12.0f;
+const float h_BOUND_X_MIN = -1.0f;
+const float h_BOUND_X_MAX = 1.0f;
 const float h_BOUND_Y_MIN = 0.0f;
-const float h_BOUND_Y_MAX = 10.0f;
-const float h_BOUND_Z_MIN = -4.0f;
-const float h_BOUND_Z_MAX = 4.0f;
+const float h_BOUND_Y_MAX = 2.5f;
+const float h_BOUND_Z_MIN = -1.0f;
+const float h_BOUND_Z_MAX = 1.0f;
 
 const float h_GAS_CONSTANT = 0.02f;
 const float h_VISCOSITY = 0.0035105f;
 
 const float h_PARTICLE_MASS = 0.04f;
-const float h_REST_DENSITY = 998.29f;
+const float h_REST_DENSITY = 1000.0f;
 
 const float h_PI = 3.14159265358979323846f;
 const float h_H = 0.25f; // Smoothing radius
 
 const int NUM_PARTICLES = 50000;
 
-const float h_POINT_SCALE = 14.0f;
-const float h_POINT_RADIUS = 25.0f;
+const float h_POINT_SCALE = 4.0f;
+const float h_POINT_RADIUS = 10.0f;
 
 const float h_particleRadius = 0.008;
 
+const float h_surcface_tension_coeff = 0.0032;
+const float h_surface_tension_term = 0.035;
+
 #endif // CONSTANTS_H
diff --git a/src/spawn_particles.cpp b/src/spawn_particles.cpp
index 9f6672cc496225281d8b1a52023d21df3560e138..b8decd4b17d3f1c2ca7a92fe41f6424b112fbb93 100644
--- a/src/spawn_particles.cpp
+++ b/src/spawn_particles.cpp
@@ -50,7 +50,7 @@ void initParticles(
     {
         case SimulationMode::OneLargeCube:
             particlesPerAxis = static_cast<int>(ceil(pow(NUM_PARTICLES, 1.0f / 3.0f)));
-            index = initializeCube(0, particlesPerAxis, 0.0f, 4.0f, 0.5f);
+            index = initializeCube(0, particlesPerAxis, 0.1f, 0.1f, 0.1f);
             break;
 
         case SimulationMode::TwoSmallCubes: