diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index cf9c0d59a20195f423296b8cfc0156504b173e8c..3e1fc47ea53abffe9dca96155a7e66189c0e7b5f 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -1,6 +1,7 @@
 #include "include/FluidSimulation.h"
 #include "include/spawn_particles.h" // For createBoundaryParticles, initParticles
 //#include "include/PCISPH_sim.h"      // For CUDA simulation functions
+#include "settings/constants.h"
 #include "cuda/include/PCISPH_CUDA_interface.h"
 #include <iostream>
 
@@ -22,17 +23,16 @@ namespace FluidSimulation {
         // Upload constants to the GPU
         FluidSimulationGPU::initializeConstants();
 
-        // Initialize particles
+        // Initialize fluid particles
         std::vector<float3> h_positions(m_numParticles);
         std::vector<float3> h_velocities(m_numParticles);
-        float gridSpacing = 0.25f;
-
+        float gridSpacing = 0.5 * h_H;
         initParticles(h_positions, h_velocities, m_numParticles, gridSpacing);
 
         // Initialize particle device arrays
         FluidSimulationGPU::initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles);
 
-        // Create particle VBO
+        // Create particle VBO and register with CUDA
         glGenBuffers(1, &m_vbo_particles);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_particles);
         glBufferData(GL_ARRAY_BUFFER, h_positions.size() * sizeof(float3), h_positions.data(), GL_DYNAMIC_DRAW);
@@ -41,25 +41,29 @@ namespace FluidSimulation {
 
         // Initialize boundary
         std::vector<float3> boundary_positions;
-        std::vector<float3> boundary_velocities;
-        int boundaryLayers = 3;
-        float spacing = 0.25f;
+        std::vector<float3> boundary_normals;
+        int boundaryLayers = 1;
+        float spacing = 0.5 * h_H; // Using 0.01 as per your code
 
-        createBoundaryParticles(boundary_positions, boundary_velocities, boundaryLayers, spacing);
+        createBoundaryParticles(boundary_positions, boundary_normals, boundaryLayers, spacing);
         m_numBoundaryParticles = boundary_positions.size();
 
-        FluidSimulationGPU::initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles);
+        // Initialize boundary device arrays
+        FluidSimulationGPU::initializeBoundryDeviceArrays(boundary_positions.data(), boundary_normals.data(), m_numBoundaryParticles);
 
-        // Create boundary VBO
+        // Create boundary VBO and register with CUDA
         glGenBuffers(1, &m_vbo_boundary);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_boundary);
         glBufferData(GL_ARRAY_BUFFER, boundary_positions.size() * sizeof(float3), boundary_positions.data(), GL_DYNAMIC_DRAW);
         glBindBuffer(GL_ARRAY_BUFFER, 0);
         FluidSimulationGPU::registerBoundaryVBOWithCUDA(m_vbo_boundary);
+
+        // Now that we have boundary particles set up and registered, run the boundary neighborhood search
+        FluidSimulationGPU::boundaryNBSearch(m_numBoundaryParticles);
     }
 
     void FluidSimulation::updateSimulation() {
-        for (int i = 0; i < 4; ++i) {
+        for (int i = 0; i < 2; ++i) {
             FluidSimulationGPU::updateParticles(0.001);
         }
     }
diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h
index e0eada192f94a395ec8b53503a082e6bca262ce7..b4eeb12a45cc11c36756b79c20a1052105aa0972 100644
--- a/src/cuda/include/cuda_device_vars.h
+++ b/src/cuda/include/cuda_device_vars.h
@@ -57,7 +57,7 @@ extern float* d_sum_gradW2; // For computing A
 
 // Boundary particles
 extern float3* d_boundary_positions;
-extern float3* d_boundary_velocities;
+extern float3* d_boundary_normals;
 
 extern unsigned int* d_boundaryCellStart;
 extern unsigned int* d_boundaryCellEnd;
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
index 18c65f885237b033780e2869ec41d48806c9a210..f629f97d9f25f631220a5a9c3241874b7f52a8aa 100644
--- a/src/cuda/include/pcisph_kernels.h
+++ b/src/cuda/include/pcisph_kernels.h
@@ -16,6 +16,10 @@ __global__ void computeDensityError(
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
+    // Boundary data
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
     int numParticles,
     int gridSizeX, int gridSizeY, int gridSizeZ
 );
@@ -36,10 +40,30 @@ __global__ void computePressureForces(
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
+    // Boundary data
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
     int numParticles,
     int gridSizeX, int gridSizeY, int gridSizeZ
 );
 
+__global__ void correctBoundaryPenetrations(
+    float3* positions_pred,            // Predicted fluid positions
+    float3* velocities_pred,           // Predicted fluid velocities
+    const float3* boundary_positions,   // Boundary particle positions
+    const float3* boundary_normals,     // Boundary particle normals
+    unsigned int* cellIndices,          // Cell indices for fluid particles
+    unsigned int* boundaryCellStart,    // Start indices for boundary particles per cell
+    unsigned int* boundaryCellEnd,      // End indices for boundary particles per cell
+    int numParticles,                   // Number of fluid particles
+    int gridSizeX, int gridSizeY, int gridSizeZ, // Grid dimensions
+    float r0,                           // Penetration threshold (e.g., 0.5 * h)
+    float normalDamping,                // Damping factor for velocity
+    float epsilon,                      // Friction coefficient (0 <= epsilon <= 1)
+    float delta                         // Elasticity coefficient (0 <= delta <= 1)
+);
+
 // Function to compute viscosity and external forces
 __global__ void computeViscosityAndExternalForces(
     float3* positions,
diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu
index 7cb10f4a4d13d2960bede1f7179c7701a12fbb3e..9c8630bbac0f26525ddedae68faf186dbaf5faca 100644
--- a/src/cuda/init_cuda.cu
+++ b/src/cuda/init_cuda.cu
@@ -62,7 +62,7 @@ float* d_sum_gradW2 = nullptr;
 
 // Boundary particles
 float3* d_boundary_positions = nullptr;
-float3* d_boundary_velocities = nullptr;
+float3* d_boundary_normals = nullptr;
 
 unsigned int* d_boundaryCellStart = nullptr;
 unsigned int* d_boundaryCellEnd = nullptr;
@@ -169,7 +169,7 @@ void initializeFluidDeviceArrays(
 
 void initializeBoundryDeviceArrays(
     const float3* h_boundaryPositions,
-    const float3* h_boundaryVelocities,
+    const float3* h_boundaryNormals, // Renamed parameter
     int numBoundaryParticles
 )
 {
@@ -179,11 +179,13 @@ void initializeBoundryDeviceArrays(
     int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H);
     int maxCells = gridSizeX * gridSizeY * gridSizeZ;
 
-    // cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3));
-    cudaMalloc(&d_boundary_velocities, numBoundaryParticles * sizeof(float3));
-
+    // Allocate memory for boundary positions
+    //cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3));
     //cudaMemcpy(d_boundary_positions, h_boundaryPositions, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_boundary_velocities, h_boundaryVelocities, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
+
+    // Allocate memory for boundary normals
+    cudaMalloc(&d_boundary_normals, numBoundaryParticles * sizeof(float3));
+    cudaMemcpy(d_boundary_normals, h_boundaryNormals, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice);
 
     cudaMalloc(&d_boundaryCellIndices, numBoundaryParticles * sizeof(unsigned int));
     cudaMalloc(&d_boundaryParticleIndices, numBoundaryParticles * sizeof(unsigned int));
@@ -207,18 +209,18 @@ void boundaryNBSearch(int numBoundaryParticles) {
     int blocksPerGrid = (numBoundaryParticles + threadsPerBlock - 1) / threadsPerBlock;
     
     float3* d_boundary_positions_sorted;
-    float3* d_boundary_velocities_sorted;
+    float3* d_boundary_normals_sorted;
     
 
     cudaMalloc(&d_boundary_positions_sorted, numBoundaryParticles * sizeof(float3));
-    cudaMalloc(&d_boundary_velocities_sorted, numBoundaryParticles * sizeof(float3));
+    cudaMalloc(&d_boundary_normals_sorted, numBoundaryParticles * sizeof(float3));
     
     neigbourHoodSearch(
             numBoundaryParticles, 
             d_boundary_positions,
-            d_boundary_velocities,
+            d_boundary_normals,
             d_boundary_positions_sorted,
-            d_boundary_velocities_sorted,
+            d_boundary_normals_sorted,
             d_boundaryCellIndices,
             d_boundaryParticleIndices,
             d_boundaryCellEnd,
@@ -233,7 +235,7 @@ void boundaryNBSearch(int numBoundaryParticles) {
     cudaGraphicsUnmapResources(1, &cuda_vbo_boundary_resource, 0);
     
     cudaFree(d_boundary_positions_sorted);
-    cudaFree(d_boundary_velocities_sorted);
+    cudaFree(d_boundary_normals_sorted);
     
     // Move some of above init code for boundary to seeparate function
 }
@@ -261,7 +263,7 @@ void cleanupDeviceArrays()
     cudaFree(d_accelerations);
 
     cudaFree(d_boundary_positions);
-    cudaFree(d_boundary_velocities);
+    cudaFree(d_boundary_normals);
 
     cudaFree(d_boundaryCellIndices);
     cudaFree(d_boundaryParticleIndices);
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index f2efd40677d2f06175404a1d6a7f80950a4733db..a0dac1aaf44261bb06286921004d64a77b8a7eff 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -32,76 +32,102 @@ __global__ void computeDensityError(
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
+    // Boundary data
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
     int numParticles,
     int gridSizeX, int gridSizeY, int gridSizeZ
 )
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (idx < numParticles) {
-        float3 pos_i = positions[idx];
-        float density = 0.0f;
-        float sumGradW2_local = 0.0f;
-
-        unsigned int cellIndex = cellIndices[idx];
-
-        // Compute cell coordinates
-        int cellX = cellIndex % gridSizeX;
-        int cellY = (cellIndex / gridSizeX) % gridSizeY;
-        int cellZ = cellIndex / (gridSizeX * gridSizeY);
-
-        // Iterate over neighboring cells
-        for (int dz = -1; dz <= 1; ++dz) {
-            int nz = cellZ + dz;
-            if (nz < 0 || nz >= gridSizeZ) continue;
-            for (int dy = -1; dy <= 1; ++dy) {
-                int ny = cellY + dy;
-                if (ny < 0 || ny >= gridSizeY) continue;
-                for (int dx = -1; dx <= 1; ++dx) {
-                    int nx = cellX + dx;
-                    if (nx < 0 || nx >= gridSizeX) continue;
-
-                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
-
-                    unsigned int startIdx = cellStart[neighborCellIndex];
-                    unsigned int endIdx = cellEnd[neighborCellIndex];
-
-                    if (startIdx != 0xFFFFFFFF) {
-                        for (unsigned int j = startIdx; j < endIdx; ++j) {
-                            float3 pos_j = positions[j];
-
-                            float3 r = pos_i - pos_j;
-                            float r2 = dot(r, r);
-
-                            if (r2 < H2 && r2 > 0.0001f) {
+    if (idx >= numParticles) return;
+
+    float3 pos_i = positions[idx];
+    float density = 0.0f;
+    float sumGradW2_local = 0.0f;
+    
+    unsigned int cellIndex = cellIndices[idx];
+
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+    // ----------------------
+    // Fluid neighbors
+    // ----------------------
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
+
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                unsigned int startIdx = cellStart[neighborCellIndex];
+                unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                if (startIdx != 0xFFFFFFFF) {
+                    for (unsigned int j = startIdx; j < endIdx; ++j) {
+                        float3 pos_j = positions[j];
+                        float3 r = pos_i - pos_j;
+                        float r2 = dot(r, r);
+
+                        if (r2 < H2 && r2 > 0.0001f) {
+                            float W = poly6Kernel(r2);
+                            density += PARTICLE_MASS * W;
+
+                            if (sum_gradW2 != nullptr) {
                                 float dist = sqrtf(r2);
-
-                                float W = poly6Kernel(r2);
-                                density += PARTICLE_MASS * W;
-
-                                if (sum_gradW2 != nullptr) {
-                                    // Compute gradW
-                                    float gradW_coeff = spikyKernelGrad(dist);
-                                    float3 gradW = gradW_coeff * (r / dist);
-
-                                    sumGradW2_local += dot(gradW, gradW);
-                                }
+                                float gradW_coeff = spikyKernelGrad(dist);
+                                float3 gradW = gradW_coeff * (r / dist);
+                                sumGradW2_local += dot(gradW, gradW);
+                            }
+                        }
+                    }
+                }
+                
+                // ----------------------
+                // Boundary neighbors
+                // ----------------------
+                
+                unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
+                unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
+
+                if (b_startIdx != 0xFFFFFFFF) {
+                    for (unsigned int j = b_startIdx; j < b_endIdx; ++j) {
+                        float3 pos_j = boundaryPositions[j];
+                        float3 r = pos_i - pos_j;
+                        float r2 = dot(r, r);
+                        if (r2 < H2 && r2 > 0.0001f) {
+                            float W = poly6Kernel(r2);
+                            density += PARTICLE_MASS * W; // Boundary treated similarly here
+
+                            if (sum_gradW2 != nullptr) {
+                                float dist = sqrtf(r2);
+                                float gradW_coeff = spikyKernelGrad(dist);
+                                float3 gradW = gradW_coeff * (r / dist);
+                                sumGradW2_local += dot(gradW, gradW);
                             }
                         }
                     }
                 }
             }
         }
+    }
 
-        densities[idx] = density;
-        density_errors[idx] = density - REST_DENSITY;
+    densities[idx] = density;
+    density_errors[idx] = density - REST_DENSITY;
 
-        if (sum_gradW2 != nullptr) {
-            sum_gradW2[idx] = sumGradW2_local;
-        }
+    if (sum_gradW2 != nullptr) {
+        sum_gradW2[idx] = sumGradW2_local;
     }
 }
 
+
 // Update pressures based on density errors
 __global__ void updatePressures(
     float* pressure_increments,
@@ -126,74 +152,202 @@ __global__ void computePressureForces(
     unsigned int* cellIndices,
     unsigned int* cellStart,
     unsigned int* cellEnd,
+    // Boundary data
+    float3* boundaryPositions,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
     int numParticles,
     int gridSizeX, int gridSizeY, int gridSizeZ
 )
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+
+    float3 pos_i = positions[idx];
+    float deltaP_i = pressure_increments[idx];
+    float3 fPressure = make_float3(0.0f, 0.0f, 0.0f);
+
+    unsigned int cellIndex = cellIndices[idx];
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+    // ------------
+    // Fluid neighbors
+    // ------------
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
+
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                unsigned int startIdx = cellStart[neighborCellIndex];
+                unsigned int endIdx = cellEnd[neighborCellIndex];
+
+                if (startIdx != 0xFFFFFFFF) {
+                    for (unsigned int j = startIdx; j < endIdx; ++j) {
+                        if (j == idx) continue;
+                        float3 pos_j = positions[j];
+                        float deltaP_j = pressure_increments[j];
+
+                        float3 r = pos_i - pos_j;
+                        float distSq = dot(r, r);
+                        if (distSq < H2 && distSq > 0.0001f) {
+                            float dist = sqrtf(distSq);
+                            float gradW = spikyKernelGrad(dist);
+                            float3 gradWij = gradW * (r / dist);
+                            float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
+                            fPressure += factor * gradWij * PARTICLE_MASS * PARTICLE_MASS;
+                        }
+                    }
+                }
+                
+                // ------------
+                // Boundary neighbors
+                // ------------
+            
+                /*
+                unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
+                unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
+
+                if (b_startIdx != 0xFFFFFFFF) {
+                    for (unsigned int j = b_startIdx; j < b_endIdx; ++j) {
+                        float3 pos_j = boundaryPositions[j];
+                        // Mirror pressure: p_b = p_i, and use REST_DENSITY for boundary
+                        float deltaP_j = deltaP_i; // Pressure mirroring
+
+                        float3 r = pos_i - pos_j;
+                        float distSq = dot(r, r);
+                        if (distSq < H2 && distSq > 0.0001f) {
+                            float dist = sqrtf(distSq);
+                            float gradW = spikyKernelGrad(dist);
+                            float3 gradWij = gradW * (r / dist);
+                            // Using Equation (85): p_i and p_b = p_i; densities: ρ_i and ρ_0
+                            float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
+                            fPressure += factor * gradWij * PARTICLE_MASS * PARTICLE_MASS;
+                        }
+                    }
+                } 
+                */
+            }
+        }
+    }
 
-    if (idx < numParticles) {
-        float3 pos_i = positions[idx];
-        float deltaP_i = pressure_increments[idx];
-
-        float3 fPressure = make_float3(0.0f, 0.0f, 0.0f);
-
-        unsigned int cellIndex = cellIndices[idx];
-
-        // Compute cell coordinates
-        int cellX = cellIndex % gridSizeX;
-        int cellY = (cellIndex / gridSizeX) % gridSizeY;
-        int cellZ = cellIndex / (gridSizeX * gridSizeY);
-
-        // Iterate over neighboring cells
-        for (int dz = -1; dz <= 1; ++dz) {
-            int nz = cellZ + dz;
-            if (nz < 0 || nz >= gridSizeZ) continue;
-            for (int dy = -1; dy <= 1; ++dy) {
-                int ny = cellY + dy;
-                if (ny < 0 || ny >= gridSizeY) continue;
-                for (int dx = -1; dx <= 1; ++dx) {
-                    int nx = cellX + dx;
-                    if (nx < 0 || nx >= gridSizeX) continue;
-
-                    unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
-
-                    unsigned int startIdx = cellStart[neighborCellIndex];
-                    unsigned int endIdx = cellEnd[neighborCellIndex];
+    pressure_forces[idx] = fPressure;
+}
 
-                    if (startIdx != 0xFFFFFFFF) {
-                        for (unsigned int j = startIdx; j < endIdx; ++j) {
-                            if (j == idx) continue;
+__global__ void correctBoundaryPenetrations(
+    float3* positions_pred,
+    float3* velocities_pred,
+    const float3* boundary_positions,
+    const float3* boundary_normals,
+    unsigned int* cellIndices,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ,
+    float r0,
+    float normalDamping,
+    float epsilon,
+    float delta)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+
+    float3 pos_i = positions_pred[idx];
+    float3 vel_i = velocities_pred[idx];
+
+    // Retrieve the cell index for this fluid particle
+    unsigned int cellIndex = cellIndices[idx];
+
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+    // Accumulators for penetration correction
+    float w_sum = 0.0f;
+    float depth_sum = 0.0f;
+    float3 normal_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Iterate over neighboring cells in a 3x3x3 grid
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
+
+                // Compute the neighbor cell index
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                // Get the boundary particles in this cell
+                unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
+                unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
+
+                if (b_startIdx == 0xFFFFFFFF) continue; // No boundary particles in this cell
+
+                // Loop over boundary particles in the neighboring cell
+                for (unsigned int b = b_startIdx; b < b_endIdx; ++b) {
+                    float3 pos_b = boundary_positions[b];
+                    float3 n_b = boundary_normals[b];
+                    float3 r = pos_i - pos_b;
+                    float distSq = dot(r, r);
+                    float dist = sqrtf(distSq);
+
+                    if (dist < r0 && dist > 1e-6f) {
+                        // Compute penetration weight
+                        float w = (r0 - dist) / r0;
+                        w_sum += w;
+                        depth_sum += w * (r0 - dist);
+
+                        // Accumulate normals weighted by penetration
+                        normal_sum += w * n_b;
+                    }
+                }
+            }
+        }
+    }
 
-                            float3 pos_j = positions[j];
-                            float deltaP_j = pressure_increments[j];
+    // If penetration is detected, apply corrections
+    if (w_sum > 0.0f) {
+        // Compute the average normal direction
+        float3 n_c = normal_sum / w_sum;
+        float n_len = sqrtf(dot(n_c, n_c));
+        if (n_len > 1e-6f) {
+            n_c /= n_len; // Normalize the accumulated normal
 
-                            float3 r = pos_i - pos_j;
+            // Compute the total penetration depth
+            float correction = depth_sum / w_sum;
 
-                            float distSq = dot(r, r);
+            // Correct the fluid particle's position
+            pos_i += n_c * correction;
 
-                            if (distSq < H2 && distSq > 0.0001f) {
-                                float dist = sqrtf(distSq);
+            // Decompose velocity into normal and tangential components
+            float v_n = dot(vel_i, n_c);
+            float3 v_normal = v_n * n_c;
+            float3 v_tangential = vel_i - v_normal;
 
-                                // Compute gradient of kernel
-                                float gradW = spikyKernelGrad(dist);
-                                float3 gradWij = gradW * (r / dist);
+            // Apply friction and elasticity
+            // [v_tangential_new] = epsilon * v_tangential
+            // [v_normal_new] = -delta * v_normal
+            float3 v_new = epsilon * v_tangential - delta * v_normal;
 
-                                // Symmetric pressure force
-                                float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY);
-                                fPressure += factor * gradWij;
-                            }
-                        }
-                    }
-                }
-            }
+            // Update velocity
+            vel_i = v_new;
         }
-
-        // Multiply by constants
-        fPressure *= PARTICLE_MASS * PARTICLE_MASS;
-
-        pressure_forces[idx] = fPressure;
     }
+
+    // Write back updated positions and velocities
+    positions_pred[idx] = pos_i;
+    velocities_pred[idx] = vel_i;
 }
 
 // Compute viscosity and external forces (excluding pressure forces)
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index 05c163b43b3544496b5138ca7aebb6c9308c2a93..34a73f90a56157c030c5f45fc9d1be9a454903f7 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -77,6 +77,9 @@ void updateParticles(float deltaTime)
         d_cellIndices,
         d_cellStart,
         d_cellEnd,
+        d_boundary_positions,
+        d_boundaryCellStart,
+        d_boundaryCellEnd,
         NUM_PARTICLES,
         gridSizeX, gridSizeY, gridSizeZ);
     cudaDeviceSynchronize();
@@ -85,6 +88,12 @@ void updateParticles(float deltaTime)
     thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
     float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
 
+    float r0 = 0.5f * h_H;          // Penetration threshold
+    float epsilon = 1.0f; // Moderate to high friction
+    float delta = 0.5f;   // Low to moderate elasticity
+    float normalDamping = 0.5f; // Adjust as needed
+
+
     // Compute A
     float m = h_PARTICLE_MASS;
     float rho0 = h_REST_DENSITY;
@@ -95,18 +104,25 @@ void updateParticles(float deltaTime)
     const int maxIterations = 5;
 
     for (int iter = 0; iter < maxIterations; ++iter) {
-        computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
+    // Launch the boundary correction kernel
+        correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
             d_positions_pred,
-            d_densities,
-            d_density_errors,
-            nullptr,
+            d_velocities_pred,
+            d_boundary_positions,
+            d_boundary_normals,
             d_cellIndices,
-            d_cellStart,
-            d_cellEnd,
+            d_boundaryCellStart,
+            d_boundaryCellEnd,
             NUM_PARTICLES,
-            gridSizeX, gridSizeY, gridSizeZ);
+            gridSizeX, gridSizeY, gridSizeZ,
+            r0,
+            normalDamping,
+            epsilon,
+            delta
+        );
         cudaDeviceSynchronize();
 
+
         updatePressures<<<blocksPerGrid, threadsPerBlock>>>(
             d_pressure_increments,
             d_density_errors,
@@ -121,6 +137,9 @@ void updateParticles(float deltaTime)
             d_cellIndices,
             d_cellStart,
             d_cellEnd,
+            d_boundary_positions,
+            d_boundaryCellStart,
+            d_boundaryCellEnd,
             NUM_PARTICLES,
             gridSizeX, gridSizeY, gridSizeZ);
         cudaDeviceSynchronize();
@@ -132,14 +151,49 @@ void updateParticles(float deltaTime)
             timeStep,
             NUM_PARTICLES);
         cudaDeviceSynchronize();
+        
+        computeDensityError<<<blocksPerGrid, threadsPerBlock>>>(
+            d_positions_pred,
+            d_densities,
+            d_density_errors,
+            nullptr,
+            d_cellIndices,
+            d_cellStart,
+            d_cellEnd,
+            d_boundary_positions,
+            d_boundaryCellStart,
+            d_boundaryCellEnd,
+            NUM_PARTICLES,
+            gridSizeX, gridSizeY, gridSizeZ);
+        cudaDeviceSynchronize();
     }
 
+    
+    correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions_pred,
+        d_velocities_pred,
+        d_boundary_positions,
+        d_boundary_normals,
+        d_cellIndices,
+        d_boundaryCellStart,
+        d_boundaryCellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ,
+        r0,
+        normalDamping,
+        epsilon,
+        delta
+    );
+    
+    cudaDeviceSynchronize();
+
     // -------------------- Finalization --------------------
 
     cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
     cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
 
     // Enforce boundary conditions
+    /*
     float damping = 0.75f;
     enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions,
@@ -147,6 +201,8 @@ void updateParticles(float deltaTime)
         NUM_PARTICLES,
         damping);
     cudaDeviceSynchronize();
+    */
+    
 
     // Unmap the VBO resource
     cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0);
diff --git a/src/settings/constants.h b/src/settings/constants.h
index 3faa9ca00980c22eb87f93a11ffcdc0e5f42c006..5ea4afd18d43e36e5d9a761a1237c6fcbd6f9722 100644
--- a/src/settings/constants.h
+++ b/src/settings/constants.h
@@ -10,13 +10,13 @@ const float h_BOUND_Z_MIN = -4.0f;
 const float h_BOUND_Z_MAX = 4.0f;
 
 const float h_GAS_CONSTANT = 0.02f;
-const float h_VISCOSITY = 0.2f;
+const float h_VISCOSITY = 0.04f;
 
-const float h_PARTICLE_MASS = 0.4f;
-const float h_REST_DENSITY = 3000.0f;
+const float h_PARTICLE_MASS = 0.04f;
+const float h_REST_DENSITY = 800.0f;
 
 const float h_PI = 3.14159265358979323846f;
-const float h_H = 0.35f; // Smoothing radius
+const float h_H = 0.25f; // Smoothing radius
 
 const int NUM_PARTICLES = 50000;
 
diff --git a/src/spawn_particles.cpp b/src/spawn_particles.cpp
index 255f0ed7a6a597862580bfbb61a3dee2e64684da..59986b0b7994b2420699443313781a82db6dbc58 100644
--- a/src/spawn_particles.cpp
+++ b/src/spawn_particles.cpp
@@ -27,8 +27,8 @@ void initParticles(
             for (int k = 0; k < particlesPerAxis && index < NUM_PARTICLES; ++k) {
                 // Center particles around the origin
                 float x = (i - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING;
-                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 2.0f;
-                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING;
+                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 4.0f;
+                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 0.5;
 
                 // Add random perturbation within [-0.1*dx, 0.1*dx]
                 float perturbation = 0.1f * GRID_SPACING;
@@ -49,7 +49,35 @@ void initParticles(
             }
         }
     }
+    /*
+    for (int i = 0; i < particlesPerAxis && index < NUM_PARTICLES; ++i) {
+        for (int j = 0; j < particlesPerAxis && index < NUM_PARTICLES; ++j) {
+            for (int k = 0; k < particlesPerAxis && index < NUM_PARTICLES; ++k) {
+                // Center particles around the origin
+                float x = (i - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 9;
+                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 2.0f;
+                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING - 0.5;
 
+                // Add random perturbation within [-0.1*dx, 0.1*dx]
+                float perturbation = 0.1f * GRID_SPACING;
+                float rand_x = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+                float rand_y = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+                float rand_z = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+
+                h_positions[index] = make_float3(
+                    x + rand_x,
+                    y + rand_y,
+                    z + rand_z
+                );
+
+                // Initialize velocities
+                h_velocities[index] = make_float3(0.0f, 0.0f, 0.0f);
+
+                ++index;
+            }
+        }
+    }
+    */
     // If NUM_PARTICLES is not a perfect cube, initialize remaining particles
     for (; index < NUM_PARTICLES; ++index) {
         h_positions[index] = make_float3(0.0f, 10.0f, 0.0f); // Default position
@@ -57,101 +85,112 @@ void initParticles(
     }
 }
 
-// Create boundary particles
+
 void createBoundaryParticles(
     std::vector<float3> &boundary_positions, 
-    std::vector<float3> &boundary_velocities,
+    std::vector<float3> &boundary_normals, // Renamed from boundary_velocities
     int boundaryLayers, 
     float spacing) 
 {
-    // Compute the number of particles along each dimension for a given spacing
-    int nx = static_cast<int>((h_BOUND_X_MAX - h_BOUND_X_MIN)/spacing) + 1;
-    int ny = static_cast<int>((h_BOUND_Y_MAX - h_BOUND_Y_MIN)/spacing) + 1;
-    int nz = static_cast<int>((h_BOUND_Z_MAX - h_BOUND_Z_MIN)/spacing) + 1;
-
-    // Initialize velocities to zero for boundary
-    float3 zeroVel = make_float3(0.0f, 0.0f, 0.0f);
-
-    // Y-min boundary (e.g., "floor")
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float y = h_BOUND_Y_MIN - (layer+1)*spacing; // placing layers below Ymin
-        for (int ix = 0; ix < nx; ix++) {
-            for (int iz = 0; iz < nz; iz++) {
-                float x = h_BOUND_X_MIN + ix*spacing;
-                float z = h_BOUND_Z_MIN + iz*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
+    // Define boundary normals based on boundary geometry
+    // For axis-aligned boxes, normals are straightforward
+
+    // Compute inner box dimensions based on the specified fractions
+    float box_size_x = (h_BOUND_X_MAX - h_BOUND_X_MIN);
+    float box_size_y = (h_BOUND_Y_MAX - h_BOUND_Y_MIN);
+    float box_size_z = (h_BOUND_Z_MAX - h_BOUND_Z_MIN);
+
+    // Compute inner box center positions for X and Z axes
+    float x_center = (h_BOUND_X_MAX + h_BOUND_X_MIN) / 2.0f;
+    float z_center = (h_BOUND_Z_MAX + h_BOUND_Z_MIN) / 2.0f;
+
+    // Define inner box minimum and maximum coordinates
+    float inner_min_x = x_center - box_size_x / 2.0f;
+    float inner_max_x = x_center + box_size_x / 2.0f;
+
+    float inner_min_y = h_BOUND_Y_MIN; // Aligned with Y-min
+    float inner_max_y = h_BOUND_Y_MIN + box_size_y;
+
+    float inner_min_z = z_center - box_size_z / 2.0f;
+    float inner_max_z = z_center + box_size_z / 2.0f;
+
+    // Compute the number of particles along each dimension for the inner box
+    int nx_inner = static_cast<int>((inner_max_x - inner_min_x) / spacing) + 1;
+    int ny_inner = static_cast<int>((inner_max_y - inner_min_y) / spacing) + 1;
+    int nz_inner = static_cast<int>((inner_max_z - inner_min_z) / spacing) + 1;
+
+    // Lambda function to add boundary particles for a specific face
+    auto addBoundaryFace = [&](int axis, bool positive) {
+        for (int layer = 0; layer < boundaryLayers; layer++) {
+            float positionOffset = layer * spacing;
+
+            int loop_i_count, loop_j_count;
+            // Determine which loops correspond to each axis
+            if (axis == 0) {
+                // X-axis walls: vary Y and Z
+                loop_i_count = ny_inner;
+                loop_j_count = nz_inner;
+            } else if (axis == 1) {
+                // Y-axis walls: vary X and Z
+                loop_i_count = nx_inner;
+                loop_j_count = nz_inner;
+            } else {
+                // Z-axis walls: vary X and Y
+                loop_i_count = nx_inner;
+                loop_j_count = ny_inner;
             }
-        }
-    }
 
-    /*
-    // Y-max boundary (ceiling)
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float y = h_BOUND_Y_MAX + (layer+1)*spacing;
-        for (int ix = 0; ix < nx; ix++) {
-            for (int iz = 0; iz < nz; iz++) {
-                float x = h_BOUND_X_MIN + ix*spacing;
-                float z = h_BOUND_Z_MIN + iz*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
+            for (int i = 0; i < loop_i_count; i++) {
+                for (int j = 0; j < loop_j_count; j++) {
+                    float x, y, z, nx, ny, nz;
+                    switch (axis) {
+                        case 0: // X-axis (Left/Right walls)
+                            x = positive ? (inner_max_x + positionOffset) : (inner_min_x - positionOffset);
+                            y = inner_min_y + i * spacing;
+                            z = inner_min_z + j * spacing;
+                            nx = positive ? -1.0f : 1.0f;
+                            ny = 0.0f;
+                            nz = 0.0f;
+                            break;
+
+                        case 1: // Y-axis (Floor/Ceiling)
+                            y = positive ? (inner_max_y + positionOffset) : (inner_min_y - positionOffset);
+                            x = inner_min_x + i * spacing;
+                            z = inner_min_z + j * spacing;
+                            nx = 0.0f;
+                            ny = positive ? -1.0f : 1.0f;
+                            nz = 0.0f;
+                            break;
+
+                        case 2: // Z-axis (Front/Back walls)
+                            z = positive ? (inner_max_z + positionOffset) : (inner_min_z - positionOffset);
+                            x = inner_min_x + i * spacing;
+                            y = inner_min_y + j * spacing;
+                            nx = 0.0f;
+                            ny = 0.0f;
+                            nz = positive ? -1.0f : 1.0f;
+                            break;
+                    }
+
+                    boundary_positions.emplace_back(make_float3(x, y, z));
+                    boundary_normals.emplace_back(make_float3(nx, ny, nz)); // Assign normals
+                }
             }
         }
-    }
-    */
+    };
 
-    // X-min boundary (vertical wall on the left)
-    int ny_cells = ny; // from earlier computation
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float x = h_BOUND_X_MIN - (layer+1)*spacing;
-        for (int iy = 0; iy < ny_cells; iy++) {
-            for (int iz = 0; iz < nz; iz++) {
-                float y = h_BOUND_Y_MIN + iy*spacing;
-                float z = h_BOUND_Z_MIN + iz*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
-            }
-        }
-    }
+    // Add boundary particles for all three axes
+    // Axis 0: X-axis (Left and Right Walls)
+    addBoundaryFace(0, false); // X-min (Left Wall)
+    addBoundaryFace(0, true);  // X-max (Right Wall)
 
-    // X-max boundary (vertical wall on the right)
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float x = h_BOUND_X_MAX + (layer+1)*spacing;
-        for (int iy = 0; iy < ny_cells; iy++) {
-            for (int iz = 0; iz < nz; iz++) {
-                float y = h_BOUND_Y_MIN + iy*spacing;
-                float z = h_BOUND_Z_MIN + iz*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
-            }
-        }
-    }
+    // Axis 1: Y-axis (Floor and Ceiling)
+    addBoundaryFace(1, false); // Y-min (Floor)
+    addBoundaryFace(1, true);  // Y-max (Ceiling)
 
-    // Z-min boundary (front wall)
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float z = h_BOUND_Z_MIN - (layer+1)*spacing;
-        for (int ix = 0; ix < nx; ix++) {
-            for (int iy = 0; iy < ny_cells; iy++) {
-                float x = h_BOUND_X_MIN + ix*spacing;
-                float y = h_BOUND_Y_MIN + iy*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
-            }
-        }
-    }
-
-    // Z-max boundary (back wall)
-    for (int layer = 0; layer < boundaryLayers; layer++) {
-        float z = h_BOUND_Z_MAX + (layer+1)*spacing;
-        for (int ix = 0; ix < nx; ix++) {
-            for (int iy = 0; iy < ny_cells; iy++) {
-                float x = h_BOUND_X_MIN + ix*spacing;
-                float y = h_BOUND_Y_MIN + iy*spacing;
-                boundary_positions.push_back(make_float3(x, y, z));
-                boundary_velocities.push_back(zeroVel);
-            }
-        }
-    }
+    // Axis 2: Z-axis (Front and Back Walls)
+    addBoundaryFace(2, false); // Z-min (Front Wall)
+    addBoundaryFace(2, true);  // Z-max (Back Wall)
 }
 
 }
\ No newline at end of file