diff --git a/shaders/thickness.frag b/shaders/thickness.frag
index 8bca2dd5bd3d6512b891611af3bd03bf79fa483e..226d6b193e9ccd36a19d6258f0be27a1ab64483f 100644
--- a/shaders/thickness.frag
+++ b/shaders/thickness.frag
@@ -19,7 +19,7 @@ void main()
         discard;
 
     // For thickness: each particle in this pixel adds some amount.
-    float thicknessIncrement = 0.08;
+    float thicknessIncrement = 0.015;
 
     // Output that in the red channel. Using GL32F only reed chanele sotred  
     FragColor = vec4(thicknessIncrement, 0.0, 0.0, 1.0);
diff --git a/shaders/thickness_filter.frag b/shaders/thickness_filter.frag
index d2571ddf981d14e9308ebfdfd9ab10b3aca359a1..4d9a81e57c2890309fd323f5642302980798ff84 100644
--- a/shaders/thickness_filter.frag
+++ b/shaders/thickness_filter.frag
@@ -16,7 +16,7 @@ void main()
     // Sample center
     float centerVal = texture(uThicknessTex, TexCoords).r;
 
-    if (centerVal < 0.01) discard;
+    if (centerVal < 0.005) discard;
 
     // Accumulate for a 1D Gaussian blur
     float sum = 0.0;
diff --git a/src/FluidRenderer.cpp b/src/FluidRenderer.cpp
index bd783f1a2b88e52644cb5862d48d100ecde0d571..9ffaedae277485d6647733bd68dfe7180a1363f7 100644
--- a/src/FluidRenderer.cpp
+++ b/src/FluidRenderer.cpp
@@ -637,7 +637,7 @@ void FluidRenderer::renderDepthFrame(size_t particleCount, size_t boundaryCount)
     glUniformMatrix4fv(m_depthRenderProgram.viewMatrixLoc, 1, GL_TRUE, viewMatrix.m);
 
     if (m_depthRenderProgram.sphereRadiusLoc != -1) {
-        glUniform1f(m_depthRenderProgram.sphereRadiusLoc, 0.25f);
+        glUniform1f(m_depthRenderProgram.sphereRadiusLoc, 0.05f);
     }
     if (m_depthRenderProgram.pointRadiusLoc != -1) {
         glUniform1f(m_depthRenderProgram.pointRadiusLoc, m_pointRadius);
diff --git a/src/FluidSimulationApp.cpp b/src/FluidSimulationApp.cpp
index 45773eb3fbe30350107bf3bcc45003cf81805f44..79ee9c9fafe4172831e82d3c8ba77b1deb375aa1 100644
--- a/src/FluidSimulationApp.cpp
+++ b/src/FluidSimulationApp.cpp
@@ -216,67 +216,66 @@ namespace FluidSimulation {
         if (m_updatePhysics)
         {
             m_fluidSimulation->updateSimulation();
+        }
+        
+        size_t particleCount = m_fluidSimulation->getParticleCount();
+        size_t boundaryCount = m_fluidSimulation->getBoundaryParticleCount();
 
-            // Precompute particle counts to avoid multiple function calls
-            size_t particleCount = m_fluidSimulation->getParticleCount();
-            size_t boundaryCount = m_fluidSimulation->getBoundaryParticleCount();
-
-            switch (selectedStage)
-            {
-                case RenderStage::ParticleView:
-                    m_fluidRenderer->renderNormalFrame(particleCount, boundaryCount);
-                    break;
-
-                case RenderStage::DepthBuffer:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->visualizeBuffer(RenderStage::DepthBuffer);
-                    break;
-
-                case RenderStage::FilteredDepth:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->visualizeBuffer(RenderStage::FilteredDepth);
-                    break;
-
-                case RenderStage::DepthNormals:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->renderDepthNormalsFrame();
-                    break;
-
-                case RenderStage::ThicknessBuffer:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->visualizeBuffer(RenderStage::ThicknessBuffer);
-                    break;
-                
-                case RenderStage::FilteredThickness:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->visualizeBuffer(RenderStage::FilteredThickness);
-                    break;
-
-                case RenderStage::LightAttenuation:
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->renderThicknessAttenFrame();
-                    break;
+        switch (selectedStage)
+        {
+            case RenderStage::ParticleView:
+                m_fluidRenderer->renderNormalFrame(particleCount, boundaryCount);
+                break;
+
+            case RenderStage::DepthBuffer:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->visualizeBuffer(RenderStage::DepthBuffer);
+                break;
+
+            case RenderStage::FilteredDepth:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->visualizeBuffer(RenderStage::FilteredDepth);
+                break;
+
+            case RenderStage::DepthNormals:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->renderDepthNormalsFrame();
+                break;
+
+            case RenderStage::ThicknessBuffer:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->visualizeBuffer(RenderStage::ThicknessBuffer);
+                break;
+            
+            case RenderStage::FilteredThickness:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->visualizeBuffer(RenderStage::FilteredThickness);
+                break;
+
+            case RenderStage::LightAttenuation:
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->renderThicknessAttenFrame();
+                break;
+            
+            case RenderStage::ScreenSpacedRender:
+                m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderDepthFilterFrame();
+                m_fluidRenderer->renderDepthNormalsFrame();
                 
-                case RenderStage::ScreenSpacedRender:
-                    m_fluidRenderer->renderDepthFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderDepthFilterFrame();
-                    m_fluidRenderer->renderDepthNormalsFrame();
-                    
-                    m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
-                    m_fluidRenderer->renderThicknessFilterFrame();
-                    m_fluidRenderer->renderThicknessAttenFrame();
-                    
-                    m_fluidRenderer->renderFinalCombineFrame();
-                    break;
+                m_fluidRenderer->renderThicknessFrame(particleCount, boundaryCount);
+                m_fluidRenderer->renderThicknessFilterFrame();
+                m_fluidRenderer->renderThicknessAttenFrame();
                 
-                default:
-                    std::cerr << "Unknown RenderStage selected." << std::endl;
-                    break;
-            }
+                m_fluidRenderer->renderFinalCombineFrame();
+                break;
+            
+            default:
+                std::cerr << "Unknown RenderStage selected." << std::endl;
+                break;
         }
 
         // Display FBO texture in ImGui window
diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h
index 69a8f2075502a1d5ebe123c24bfcb0bde92f5cc4..cb261588af904980b6df2f01c9799acd70ce499d 100644
--- a/src/cuda/include/cuda_device_vars.h
+++ b/src/cuda/include/cuda_device_vars.h
@@ -19,6 +19,11 @@ extern __constant__ float POLY6_CONST;
 extern __constant__ float SPIKY_GRAD_CONST;
 extern __constant__ float VISC_LAP_CONST;
 
+extern __constant__ float POLY6_D1_CONST;
+extern __constant__ float SURFACE_TENS_CONST;
+extern __constant__ float SURFACE_TENS_TERM;
+extern __constant__ float SURFACE_TENS_COEFFICIENT;
+
 extern __constant__ float PARTICLE_MASS;
 extern __constant__ float REST_DENSITY;
 extern __constant__ float VISCOSITY;
@@ -50,6 +55,7 @@ extern float* d_density_errors;
 extern float* d_pressure_increments;
 extern float3* d_pressure_forces;
 extern float3* d_accelerations;
+extern float3* d_normals;
 
 // Additional arrays for PCISPH
 extern float3* d_velocities_pred;
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
index 98b22b2669ce12135e10d35a248d020f4a55d3c5..3d890c51ee4f5ed42001c4e5a796be79600687ae 100644
--- a/src/cuda/include/pcisph_kernels.h
+++ b/src/cuda/include/pcisph_kernels.h
@@ -7,6 +7,19 @@
 
 namespace FluidSimulationGPU {
 
+__global__ void initialDensity(
+    float3* positions,
+    float* densities,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
 // Function to compute density errors
 __global__ void computeDensityError(
     float3* positions,
@@ -63,10 +76,24 @@ __global__ void correctBoundaryPenetrations(
 );
 
 // Function to compute viscosity and external forces
-__global__ void computeViscosityAndExternalForces(
+__global__ void computeInitialForces(
     float3* positions,
     float3* velocities,
     float3* accelerations,
+    float3* normals,
+    float* densities,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+);
+
+__global__ void calculateNormals(
+    float3* positions,
+    float3* velocities,
+    float3* normals,
+    float* densities,
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
diff --git a/src/cuda/include/smoothing_kernels.h b/src/cuda/include/smoothing_kernels.h
index af8443c27fe559d6c5e8df41504e61adbd21f17c..b9b9acd57e285a3d0eb52c5a5cc1fd0ec0a9d2a8 100644
--- a/src/cuda/include/smoothing_kernels.h
+++ b/src/cuda/include/smoothing_kernels.h
@@ -6,8 +6,10 @@
 namespace FluidSimulationGPU {
 
 __device__ float poly6Kernel(float r2);
+__device__ float kernel_poly6_d1(float r2);
 __device__ float spikyKernelGrad(float r);
 __device__ float viscosityKernelLap(float r);
+__device__ float kernel_surface_tension(float r);
 
 } // namespace FluidSimulationGPU
 
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
index 307f00e4db4fa2e4d9f5af8a8ab18fa7717877cc..54234d582eb551b011bc93e4aac9d2d06ab16cbc 100644
--- a/src/cuda/init_cuda.cu
+++ b/src/cuda/init_cuda.cu
@@ -26,6 +26,11 @@ __constant__ float POLY6_CONST;
 __constant__ float SPIKY_GRAD_CONST;
 __constant__ float VISC_LAP_CONST;
 
+__constant__ float POLY6_D1_CONST;
+__constant__ float SURFACE_TENS_CONST;
+__constant__ float SURFACE_TENS_TERM;
+__constant__ float SURFACE_TENS_COEFFICIENT;
+
 __constant__ float PARTICLE_MASS;
 __constant__ float REST_DENSITY;
 __constant__ float VISCOSITY;
@@ -57,6 +62,7 @@ float* d_density_errors = nullptr;
 float* d_pressure_increments = nullptr;
 float3* d_pressure_forces = nullptr;
 float3* d_accelerations = nullptr;
+float3* d_normals = nullptr;
 
 // Additional arrays for PCISPH
 float3* d_velocities_pred = nullptr;
@@ -81,8 +87,6 @@ void ComputePCISPHStiffness(
     float restSpacing,
     float spiky_d1_normalization)
 {
-    // 1) Compute the factor often called 'beta' or something similar
-    //    in your code.  (Matches your "2 * (dt*m / rho0)^2" idea)
     float mass = restDensity * powf(kernelRadius/2.0f, 3.f);
 
     printf("Particle mass: %f \n", mass);
@@ -95,13 +99,12 @@ void ComputePCISPHStiffness(
     float beta = 2.0f * dtm_rho * dtm_rho;
 
 
-    // 2) Variables to accumulate gradient sums
     float3 grad_sum     = make_float3(0.f, 0.f, 0.f);
     float  sum_dot_grad = 0.f;
 
     float kernelRadius2 = kernelRadius * kernelRadius;
 
-    // 3) Sample points in [-kernelRadius, +kernelRadius] in increments of restSpacing
+    // Sample points in [-kernelRadius, +kernelRadius] in increments of restSpacing
     for (float z = -kernelRadius; z <= kernelRadius + 1e-5f; z += restSpacing)
     {
         for (float y = -kernelRadius; y <= kernelRadius + 1e-5f; y += restSpacing)
@@ -111,15 +114,10 @@ void ComputePCISPHStiffness(
                 float3 r = make_float3(x, y, z);
                 float r2 = dot(r, r);
 
-                // Only compute if within kernel support
                 if (r2 < kernelRadius2 && r2 > 0.0f)  // avoid r=0 to prevent division by 0
                 {
                     float r_norm = sqrtf(r2);
-                    // Spiky kernel gradient in 3D usually:
-                    //    grad W_spiky(r) = -C * ( (h - r)^2 / r ) * r̂
-                    //
-                    // If you're using a negative sign in spiky_d1_normalization,
-                    // you might want to omit the minus here, etc. Just be consistent.
+                    
                     float3 grad = (r / r_norm) * spiky_d1_normalization * powf(kernelRadius - r_norm, 2.f);
 
                     // Accumulate sums
@@ -130,15 +128,14 @@ void ComputePCISPHStiffness(
         }
     }
 
-    // 4) Compute dot of the sum of gradients
+    // Compute dot of the sum of gradients
     float dot_sum_grad = dot(grad_sum, grad_sum);
 
-    // 5) Combine to get grad_term:
-    //    grad_term ~ - [ (Σ gradW)^2 + Σ (gradW·gradW) ]
-    //    so we define:
+    // Combine to get grad_term:
+    //  grad_term ~ - [ (Σ gradW)^2 + Σ (gradW·gradW) ]
     float grad_term = - (dot_sum_grad + sum_dot_grad);
 
-    // 6) Finally, PCISPH stiffness factor k_PCI ~ -1 / (beta * grad_term)
+    // Finally, PCISPH stiffness factor k_PCI ~ -1 / (beta * grad_term)
     float k_PCI = -1.f / (beta * grad_term);
 
     printf("K_PCi precalc: %f \n", k_PCI);
@@ -149,9 +146,15 @@ void initializeConstants()
 {
     // Host constants
     const float h_H2 = h_H * h_H;
-    const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9));
-    const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6));
-    const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6));
+    const float h_POLY6_CONST = 315.0f / (64.0f * M_PI * powf(h_H, 9));
+    const float h_SPIKY_GRAD_CONST = -45.0f / (M_PI * powf(h_H, 6));
+    const float h_VISC_LAP_CONST = 45.0f / (M_PI * powf(h_H, 6));
+
+    const float poly6_d1 = -945.f / (32.f * M_PI * powf(h_H, 9.f));
+
+    const float surface_tens_normalization = 32.f / (M_PI * powf(h_H, 9.f));
+    const float surface_tens_term = powf(h_H, 6.f) / 64.f;
+    
 
     // Copy constants to device
     cudaMemcpyToSymbol(PI, &h_PI, sizeof(float));
@@ -161,6 +164,13 @@ void initializeConstants()
     cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float));
     cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float));
 
+
+    cudaMemcpyToSymbol(POLY6_D1_CONST, &poly6_d1, sizeof(float));
+    cudaMemcpyToSymbol(SURFACE_TENS_CONST, &surface_tens_normalization, sizeof(float));
+    cudaMemcpyToSymbol(SURFACE_TENS_TERM, &surface_tens_term, sizeof(float));
+    cudaMemcpyToSymbol(SURFACE_TENS_COEFFICIENT, &h_surface_tenns_coef, sizeof(float));
+
+
     cudaMemcpyToSymbol(PARTICLE_MASS, &h_PARTICLE_MASS, sizeof(float));
     cudaMemcpyToSymbol(REST_DENSITY, &h_REST_DENSITY, sizeof(float));
     cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float));
@@ -243,14 +253,13 @@ void initializeFluidDeviceArrays(
     cudaMalloc(&d_pressure_increments, numParticles * sizeof(float));
     cudaMalloc(&d_pressure_forces, numParticles * sizeof(float3));
     cudaMalloc(&d_accelerations, numParticles * sizeof(float3));
+    cudaMalloc(&d_normals, numParticles * sizeof(float3));
 
     // Allocate device memory for sum_gradW2
     cudaMalloc(&d_sum_gradW2, numParticles * sizeof(float));
 
     // Initialize positions_pred to initial positions
     cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice);
-
-
 }
 
 void initializeBoundryDeviceArrays(
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index ab5a90a645054cee6e3d5b8a66dd49bf7a2339dd..172343b839cf688b766d8e0cbb4d170ac62f925d 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -26,6 +26,73 @@ namespace FluidSimulationGPU {
 
 // Compute density errors (used in PCISPH)
 
+__global__ void initialDensity(
+    float3* positions,
+    float* densities,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+
+    float3 pos_i = positions[idx];
+    float density = 0.0f;
+
+    unsigned int cellIndex = cellIndices[idx];
+    unsigned int neighborCellIdx;
+
+    FOREACH_NEIGHBOR_CELL_BEGIN(cellIndex, gridSizeX, gridSizeY, gridSizeZ, neighborCellIdx)
+    {
+        unsigned int startIdx = cellStart[neighborCellIdx];
+        unsigned int endIdx   = cellEnd[neighborCellIdx];
+
+        if (startIdx != 0xFFFFFFFF) 
+        {
+            for (unsigned int j = startIdx; j < endIdx; ++j) 
+            {
+                float3 pos_j = positions[j];
+                float3 r = pos_i - pos_j;
+                float r2 = dot(r, r);
+
+                // Check if in range
+                if (r2 < H2 && r2 > 0.0001f) {
+                    float W = poly6Kernel(r2);
+                    density += PARTICLE_MASS * W;
+                }
+            }
+        }
+
+        // Boundary neighbors
+        unsigned int b_startIdx = boundaryCellStart[neighborCellIdx];
+        unsigned int b_endIdx   = boundaryCellEnd[neighborCellIdx];
+
+        if (b_startIdx != 0xFFFFFFFF)
+        {
+            for (unsigned int j = b_startIdx; j < b_endIdx; ++j) 
+            {
+                float3 pos_j = boundaryPositions[j];
+                float3 r = pos_i - pos_j;
+                float r2 = dot(r, r);
+                if (r2 < H2 && r2 > 0.0001f) {
+                    float W = poly6Kernel(r2);
+                    density += PARTICLE_MASS * W; 
+                }
+            }
+        }
+
+    }
+    FOREACH_NEIGHBOR_CELL_END();
+
+    densities[idx] = density;
+}
+
 __global__ void computeDensityError(
     float3* positions,
     float* densities,
@@ -96,6 +163,57 @@ __global__ void computeDensityError(
     density_errors[idx] = density - REST_DENSITY;
 }
 
+__global__ void calculateNormals(
+    float3* positions,
+    float3* velocities,
+    float3* normals,
+    float* densities,
+    unsigned int* cellIndices,
+    unsigned int* cellStart,
+    unsigned int* cellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+    
+    float3 pos_i = positions[idx];
+    float density = 0.0f;
+
+    float3 normal = make_float3(0.f, 0.f, 0.f);
+
+    unsigned int cellIndex = cellIndices[idx];
+    unsigned int neighborCellIdx;
+
+    FOREACH_NEIGHBOR_CELL_BEGIN(cellIndex, gridSizeX, gridSizeY, gridSizeZ, neighborCellIdx)
+    {
+        unsigned int startIdx = cellStart[neighborCellIdx];
+        unsigned int endIdx   = cellEnd[neighborCellIdx];
+
+        if (startIdx != 0xFFFFFFFF) 
+        {
+            for (unsigned int j = startIdx; j < endIdx; ++j) 
+            {
+                float3 pos_j = positions[j];
+                float dens_j = densities[j];
+                float3 r = pos_i - pos_j;
+                float r2 = dot(r, r);
+                
+                normal += kernel_poly6_d1(r2) * r / dens_j;
+                // poly6GradKernel
+                // normal += poly6grad(r) / other_density; 
+            }
+        }
+
+    }
+    FOREACH_NEIGHBOR_CELL_END();
+
+    normal *= H * PARTICLE_MASS * POLY6_D1_CONST;
+
+    normals[idx] = normal;
+}
+
 
 // Update pressures based on density errors
 __global__ void updatePressures(
@@ -285,10 +403,12 @@ __global__ void correctBoundaryPenetrations(
 }
 
 // Compute viscosity and external forces (excluding pressure forces)
-__global__ void computeViscosityAndExternalForces(
+__global__ void computeInitialForces(
     float3* positions,
     float3* velocities,
     float3* accelerations,
+    float3* normals,
+    float* densities,
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
@@ -301,10 +421,14 @@ __global__ void computeViscosityAndExternalForces(
 
     float3 pos_i = positions[idx];
     float3 vel_i = velocities[idx];
+    float3 normal_i = normals[idx];
+    float dens_i = densities[idx];
+
 
     float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f);
     float3 fGravity   = make_float3(0.0f, -10.0f, 0.0f) * PARTICLE_MASS;
-
+    float3 st_cohesion = make_float3(0.0f, 0.0f, 0.0f);
+    float3 st_curvature = make_float3(0.0f, 0.0f, 0.0f);
     // Current particle's cell
     unsigned int cellIndex = cellIndices[idx];
     // We'll store neighbor cell index here
@@ -326,23 +450,46 @@ __global__ void computeViscosityAndExternalForces(
 
                 float3 pos_j = positions[j];
                 float3 vel_j = velocities[j];
+                float3 normal_j = normals[j];
+                float dens_j = densities[j];
+
                 float3 r = pos_i - pos_j;
 
                 float distSq = dot(r, r);
-                if (distSq < H2 && distSq > 0.0001f) 
+                float dist = sqrtf(distSq);
+                
+                if (dist < H && dist > 0.0001f) 
                 {
-                    float dist = sqrtf(distSq);
 
                     // Viscosity force
                     float3 velDiff = vel_j - vel_i;
                     float viscLap  = viscosityKernelLap(dist);
                     fViscosity += VISCOSITY * velDiff * viscLap * PARTICLE_MASS / REST_DENSITY;
+                
+                    // Implement surface tension here
+
+                    float st_correction_factor = 2.f * REST_DENSITY / (dens_i + dens_j);
+
+                    // -> surface tension (cohesion)
+                    float st_kernel = kernel_surface_tension(dist);
+                    float3 direction = r/dist;
+                    st_cohesion += st_correction_factor * st_kernel * direction;
+
+                    // -> surface tension (curvature)
+                    st_curvature += st_correction_factor * (normal_i - normal_j);
                 }
             }
         }
     }
     FOREACH_NEIGHBOR_CELL_END();
 
+    // finish surface tensiont calculations
+    //st_cohesion *= -SURFACE_TENS_COEFFICIENT * PARTICLE_MASS * PARTICLE_MASS * SURFACE_TENS_CONST;
+    //st_curvature *= -SURFACE_TENS_COEFFICIENT * PARTICLE_MASS;
+    st_cohesion *= 0.f;
+    st_curvature *= 0.f;
+    float3 fSurfaceTension = (st_cohesion + st_curvature);
+    
     // Total acceleration (excluding pressure)
     accelerations[idx] = (fViscosity + fGravity) / PARTICLE_MASS;
 }
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index eea44b15370931defeb6a87a411f63afae1eba82..5725b671a09be759488e3e2aa43368080eb9cf33 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -43,11 +43,37 @@ void updateParticles(float deltaTime)
         gridSizeZ);
 
     // -------------------- Prediction Step --------------------
+    initialDensity<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions_pred,
+        d_densities,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        d_boundary_positions,
+        d_boundaryCellStart,
+        d_boundaryCellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
+
+    calculateNormals<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions,
+        d_velocities,
+        d_normals,
+        d_densities,
+        d_cellIndices,
+        d_cellStart,
+        d_cellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ);
+    cudaDeviceSynchronize();
 
-    computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>(
+    computeInitialForces<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions,
         d_velocities,
         d_accelerations,
+        d_normals,
+        d_densities,
         d_cellIndices,
         d_cellStart,
         d_cellEnd,
diff --git a/src/cuda/smoothing_kernels.cu b/src/cuda/smoothing_kernels.cu
index 52d7248726cff11f11794cf4946de0a5782b5e76..6f84c44d4bfd9ec24cf457485ae1f643880eb0bf 100644
--- a/src/cuda/smoothing_kernels.cu
+++ b/src/cuda/smoothing_kernels.cu
@@ -14,6 +14,17 @@ __device__ float poly6Kernel(float r2)
     }
 }
 
+__device__ float kernel_poly6_d1(float r2) {
+	
+    if(r2 > H2) {
+		return 0.f;
+	} else {
+        float term = r2 - H2;
+		return term * term; // * r need to multiply my diff in calulation
+	}
+}
+
+
 __device__ float spikyKernelGrad(float r)
 {
     if (r > 0.0f && r <= H) {
@@ -33,4 +44,19 @@ __device__ float viscosityKernelLap(float r)
     }
 }
 
+__device__ float kernel_surface_tension(float r) {
+	if( r <= 0.5 * H ) {
+		float t1 = (H - r);
+		float t2 = (t1 * t1 * t1) * (r * r * r);
+		return 2.f * t2 - SURFACE_TENS_TERM;
+	}
+	else if( r <= H ) {
+		float t1 = (H - r);
+		return (t1 * t1 * t1) * (r * r * r);
+	}
+	else {
+		return 0.f;
+	}
+}
+
 } // namespace FluidSimulationGPU
diff --git a/src/settings/constants.h b/src/settings/constants.h
index 1d5b1bf2871268c6915d176b751825cd35a774b9..29d7adc9eb88d74f2d08e2fcb30a053ff38fb456 100644
--- a/src/settings/constants.h
+++ b/src/settings/constants.h
@@ -11,12 +11,13 @@ const float h_BOUND_Z_MAX = 4.0f;
 
 const float h_GAS_CONSTANT = 0.02f;
 const float h_VISCOSITY = 0.04f;
+const float h_surface_tenns_coef = 0.0f;
 
-const float h_PARTICLE_MASS = 0.005f;
+const float h_PARTICLE_MASS = 0.01f;
 const float h_REST_DENSITY = 600.0f;
 
 const float h_PI = 3.14159265358979323846f;
-const float h_H = 0.18f; // Smoothing radius
+const float h_H = 0.20f; // Smoothing radius
 
 const int NUM_PARTICLES = 200000;