diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp
index cf9c0d59a20195f423296b8cfc0156504b173e8c..fd2c974973de35a4b0576db73d057a46a553c63f 100644
--- a/src/FluidSimulation.cpp
+++ b/src/FluidSimulation.cpp
@@ -1,6 +1,7 @@
 #include "include/FluidSimulation.h"
 #include "include/spawn_particles.h" // For createBoundaryParticles, initParticles
 //#include "include/PCISPH_sim.h"      // For CUDA simulation functions
+#include "settings/constants.h"
 #include "cuda/include/PCISPH_CUDA_interface.h"
 #include <iostream>
 
@@ -22,17 +23,16 @@ namespace FluidSimulation {
         // Upload constants to the GPU
         FluidSimulationGPU::initializeConstants();
 
-        // Initialize particles
+        // Initialize fluid particles
         std::vector<float3> h_positions(m_numParticles);
         std::vector<float3> h_velocities(m_numParticles);
-        float gridSpacing = 0.25f;
-
+        float gridSpacing = 0.5 * h_H;
         initParticles(h_positions, h_velocities, m_numParticles, gridSpacing);
 
         // Initialize particle device arrays
         FluidSimulationGPU::initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles);
 
-        // Create particle VBO
+        // Create particle VBO and register with CUDA
         glGenBuffers(1, &m_vbo_particles);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_particles);
         glBufferData(GL_ARRAY_BUFFER, h_positions.size() * sizeof(float3), h_positions.data(), GL_DYNAMIC_DRAW);
@@ -43,19 +43,23 @@ namespace FluidSimulation {
         std::vector<float3> boundary_positions;
         std::vector<float3> boundary_velocities;
         int boundaryLayers = 3;
-        float spacing = 0.25f;
+        float spacing = 0.5 * h_H; // Using 0.01 as per your code
 
         createBoundaryParticles(boundary_positions, boundary_velocities, boundaryLayers, spacing);
         m_numBoundaryParticles = boundary_positions.size();
 
+        // Initialize boundary device arrays
         FluidSimulationGPU::initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles);
 
-        // Create boundary VBO
+        // Create boundary VBO and register with CUDA
         glGenBuffers(1, &m_vbo_boundary);
         glBindBuffer(GL_ARRAY_BUFFER, m_vbo_boundary);
         glBufferData(GL_ARRAY_BUFFER, boundary_positions.size() * sizeof(float3), boundary_positions.data(), GL_DYNAMIC_DRAW);
         glBindBuffer(GL_ARRAY_BUFFER, 0);
         FluidSimulationGPU::registerBoundaryVBOWithCUDA(m_vbo_boundary);
+
+        // Now that we have boundary particles set up and registered, run the boundary neighborhood search
+        FluidSimulationGPU::boundaryNBSearch(m_numBoundaryParticles);
     }
 
     void FluidSimulation::updateSimulation() {
diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h
index 07be87cb0ed361f5743354e6da751f6f44015ba3..6551eb89d4a76cb3705230fcb7532779ecde7e03 100644
--- a/src/cuda/include/pcisph_kernels.h
+++ b/src/cuda/include/pcisph_kernels.h
@@ -48,6 +48,19 @@ __global__ void computePressureForces(
     int gridSizeX, int gridSizeY, int gridSizeZ
 );
 
+__global__ void correctBoundaryPenetrations(
+    float3* positions_pred,
+    float3* velocities_pred,
+    const float3* boundary_positions,
+    unsigned int* cellIndices,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ,
+    float r0,
+    float normalDamping
+);
+
 // Function to compute viscosity and external forces
 __global__ void computeViscosityAndExternalForces(
     float3* positions,
diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu
index e42cda376bc9a29282087928e88d283418cc2ab2..704f2b7d3eb3e03ce94d9a5ce9fe28da0755ad0d 100644
--- a/src/cuda/pcisph_kernels.cu
+++ b/src/cuda/pcisph_kernels.cu
@@ -89,24 +89,11 @@ __global__ void computeDensityError(
                         }
                     }
                 }
-            }
-        }
-    }
-
-    // ----------------------
-    // Boundary neighbors
-    // ----------------------
-    for (int dz = -1; dz <= 1; ++dz) {
-        int nz = cellZ + dz;
-        if (nz < 0 || nz >= gridSizeZ) continue;
-        for (int dy = -1; dy <= 1; ++dy) {
-            int ny = cellY + dy;
-            if (ny < 0 || ny >= gridSizeY) continue;
-            for (int dx = -1; dx <= 1; ++dx) {
-                int nx = cellX + dx;
-                if (nx < 0 || nx >= gridSizeX) continue;
-
-                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                
+                // ----------------------
+                // Boundary neighbors
+                // ----------------------
+                
                 unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
                 unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
 
@@ -219,24 +206,12 @@ __global__ void computePressureForces(
                         }
                     }
                 }
-            }
-        }
-    }
-
-    // ------------
-    // Boundary neighbors
-    // ------------
-    for (int dz = -1; dz <= 1; ++dz) {
-        int nz = cellZ + dz;
-        if (nz < 0 || nz >= gridSizeZ) continue;
-        for (int dy = -1; dy <= 1; ++dy) {
-            int ny = cellY + dy;
-            if (ny < 0 || ny >= gridSizeY) continue;
-            for (int dx = -1; dx <= 1; ++dx) {
-                int nx = cellX + dx;
-                if (nx < 0 || nx >= gridSizeX) continue;
-
-                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+                
+                // ------------
+                // Boundary neighbors
+                // ------------
+            
+                /*
                 unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
                 unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
 
@@ -258,6 +233,7 @@ __global__ void computePressureForces(
                         }
                     }
                 }
+                */
             }
         }
     }
@@ -265,6 +241,97 @@ __global__ void computePressureForces(
     pressure_forces[idx] = fPressure;
 }
 
+__global__ void correctBoundaryPenetrations(
+    float3* positions_pred,
+    float3* velocities_pred,
+    const float3* boundary_positions,
+    unsigned int* cellIndices,
+    unsigned int* boundaryCellStart,
+    unsigned int* boundaryCellEnd,
+    int numParticles,
+    int gridSizeX, int gridSizeY, int gridSizeZ,
+    float r0,
+    float normalDamping
+)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numParticles) return;
+
+    float3 pos_i = positions_pred[idx];
+    float3 vel_i = velocities_pred[idx];
+
+    // Retrieve the cell index for this fluid particle
+    unsigned int cellIndex = cellIndices[idx];
+
+    int cellX = cellIndex % gridSizeX;
+    int cellY = (cellIndex / gridSizeX) % gridSizeY;
+    int cellZ = cellIndex / (gridSizeX * gridSizeY);
+
+    float w_sum = 0.0f;
+    float depth_sum = 0.0f;
+    float3 normal_sum = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Iterate over neighboring cells in a 3x3x3 area
+    for (int dz = -1; dz <= 1; ++dz) {
+        int nz = cellZ + dz;
+        if (nz < 0 || nz >= gridSizeZ) continue;
+        for (int dy = -1; dy <= 1; ++dy) {
+            int ny = cellY + dy;
+            if (ny < 0 || ny >= gridSizeY) continue;
+            for (int dx = -1; dx <= 1; ++dx) {
+                int nx = cellX + dx;
+                if (nx < 0 || nx >= gridSizeX) continue;
+
+                // Compute the neighbor cell index
+                unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY;
+
+                // Get the boundary particles in this cell
+                unsigned int b_startIdx = boundaryCellStart[neighborCellIndex];
+                unsigned int b_endIdx = boundaryCellEnd[neighborCellIndex];
+
+                if (b_startIdx == 0xFFFFFFFF) continue; // No boundary particles in this cell
+
+                // Loop over boundary particles in the neighboring cell
+                for (unsigned int b = b_startIdx; b < b_endIdx; ++b) {
+                    float3 pos_b = boundary_positions[b];
+                    float3 r = pos_i - pos_b;
+                    float dist = sqrtf(dot(r, r));
+
+                    if (dist < r0 && dist > 1e-6f) {
+                        float w = (r0 - dist) / r0;
+                        w_sum += w;
+                        depth_sum += w * (r0 - dist);
+
+                        // Approximate normal direction away from boundary particle
+                        float3 n_b = r / dist;
+                        normal_sum += w * n_b;
+                    }
+                }
+            }
+        }
+    }
+
+    // If we found penetration, correct it
+    if (w_sum > 0.0f) {
+        float3 n_c = normal_sum / w_sum;
+        float n_len = sqrtf(dot(n_c, n_c));
+        if (n_len > 1e-6f) {
+            n_c /= n_len;
+            float correction = depth_sum / w_sum;
+            pos_i += n_c * correction;
+
+            // Dampen the normal velocity component
+            float v_n = dot(vel_i, n_c);
+            float3 v_normal = v_n * n_c;
+            vel_i -= normalDamping * v_normal;
+        }
+    }
+
+    // Write back updated positions and velocities
+    positions_pred[idx] = pos_i;
+    velocities_pred[idx] = vel_i;
+}
+
 // Compute viscosity and external forces (excluding pressure forces)
 __global__ void computeViscosityAndExternalForces(
     float3* positions,
diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu
index 51171356a411e67f1e022de263f0b9a58119a0be..4aad44d170f1055f50b2986c5caf79b72d0068c2 100644
--- a/src/cuda/pcisph_update.cu
+++ b/src/cuda/pcisph_update.cu
@@ -88,6 +88,24 @@ void updateParticles(float deltaTime)
     thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2);
     float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>());
 
+    float r0 = 0.5f * h_H;          // Penetration threshold
+    float normalDamping = 0.5f;   // Damping factor (adjust as needed, e.g., 0.5f)
+
+    // Launch the boundary correction kernel
+    correctBoundaryPenetrations<<<blocksPerGrid, threadsPerBlock>>>(
+        d_positions_pred,
+        d_velocities_pred,
+        d_boundary_positions,
+        d_cellIndices,
+        d_boundaryCellStart,
+        d_boundaryCellEnd,
+        NUM_PARTICLES,
+        gridSizeX, gridSizeY, gridSizeZ,
+        r0,
+        normalDamping
+    );
+    cudaDeviceSynchronize();
+
     // Compute A
     float m = h_PARTICLE_MASS;
     float rho0 = h_REST_DENSITY;
@@ -149,14 +167,15 @@ void updateParticles(float deltaTime)
     cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice);
 
     // Enforce boundary conditions
+    /*
     float damping = 0.75f;
-    
     enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>(
         d_positions,
         d_velocities,
         NUM_PARTICLES,
         damping);
     cudaDeviceSynchronize();
+    */
     
 
     // Unmap the VBO resource
diff --git a/src/spawn_particles.cpp b/src/spawn_particles.cpp
index 255f0ed7a6a597862580bfbb61a3dee2e64684da..f5800c01ec76a257fb6a4356e277fa40735d020c 100644
--- a/src/spawn_particles.cpp
+++ b/src/spawn_particles.cpp
@@ -19,7 +19,7 @@ void initParticles(
     srand(static_cast<unsigned int>(time(0)));
 
     // Calculate particles per axis
-    int particlesPerAxis = static_cast<int>(ceil(pow(NUM_PARTICLES, 1.0f / 3.0f)));
+    int particlesPerAxis = static_cast<int>(ceil(pow(NUM_PARTICLES/2, 1.0f / 3.0f)));
 
     int index = 0;
     for (int i = 0; i < particlesPerAxis && index < NUM_PARTICLES; ++i) {
@@ -27,8 +27,8 @@ void initParticles(
             for (int k = 0; k < particlesPerAxis && index < NUM_PARTICLES; ++k) {
                 // Center particles around the origin
                 float x = (i - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING;
-                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 2.0f;
-                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING;
+                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 4.0f;
+                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 2;
 
                 // Add random perturbation within [-0.1*dx, 0.1*dx]
                 float perturbation = 0.1f * GRID_SPACING;
@@ -50,6 +50,33 @@ void initParticles(
         }
     }
 
+    for (int i = 0; i < particlesPerAxis && index < NUM_PARTICLES; ++i) {
+        for (int j = 0; j < particlesPerAxis && index < NUM_PARTICLES; ++j) {
+            for (int k = 0; k < particlesPerAxis && index < NUM_PARTICLES; ++k) {
+                // Center particles around the origin
+                float x = (i - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 9;
+                float y = (j - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING + 4.0f;
+                float z = (k - (particlesPerAxis - 1) / 2.0f) * GRID_SPACING - 2;
+
+                // Add random perturbation within [-0.1*dx, 0.1*dx]
+                float perturbation = 0.1f * GRID_SPACING;
+                float rand_x = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+                float rand_y = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+                float rand_z = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * perturbation;
+
+                h_positions[index] = make_float3(
+                    x + rand_x,
+                    y + rand_y,
+                    z + rand_z
+                );
+
+                // Initialize velocities
+                h_velocities[index] = make_float3(0.0f, 0.0f, 0.0f);
+
+                ++index;
+            }
+        }
+    }
     // If NUM_PARTICLES is not a perfect cube, initialize remaining particles
     for (; index < NUM_PARTICLES; ++index) {
         h_positions[index] = make_float3(0.0f, 10.0f, 0.0f); // Default position
@@ -57,6 +84,106 @@ void initParticles(
     }
 }
 
+void createBoundaryParticles(
+    std::vector<float3> &boundary_positions, 
+    std::vector<float3> &boundary_velocities,
+    int boundaryLayers, 
+    float spacing) 
+{
+    // Initialize velocities to zero for boundary particles
+    float3 zeroVel = make_float3(0.0f, 0.0f, 0.0f);
+
+    // Compute inner box dimensions based on the specified fractions
+    float box_size_x = (h_BOUND_X_MAX - h_BOUND_X_MIN); // / 4.0f; // 1/4 of X dimension
+    float box_size_y = (h_BOUND_Y_MAX - h_BOUND_Y_MIN); // / 2.0f; // 1/2 of Y dimension
+    float box_size_z = (h_BOUND_Z_MAX - h_BOUND_Z_MIN); // / 4.0f; // 1/4 of Z dimension
+
+    // Compute inner box center positions for X and Z axes
+    float x_center = (h_BOUND_X_MAX + h_BOUND_X_MIN) / 2.0f;
+    float z_center = (h_BOUND_Z_MAX + h_BOUND_Z_MIN) / 2.0f;
+
+    // Define inner box minimum and maximum coordinates
+    float inner_min_x = x_center - box_size_x / 2.0f;
+    float inner_max_x = x_center + box_size_x / 2.0f;
+
+    float inner_min_y = h_BOUND_Y_MIN; // Aligned with Y-min
+    float inner_max_y = h_BOUND_Y_MIN + box_size_y;
+
+    float inner_min_z = z_center - box_size_z / 2.0f;
+    float inner_max_z = z_center + box_size_z / 2.0f;
+
+    // Compute the number of particles along each dimension for the inner box
+    int nx_inner = static_cast<int>((inner_max_x - inner_min_x) / spacing) + 1;
+    int ny_inner = static_cast<int>((inner_max_y - inner_min_y) / spacing) + 1;
+    int nz_inner = static_cast<int>((inner_max_z - inner_min_z) / spacing) + 1;
+
+    // Lambda function to add boundary particles for a specific face
+    auto addBoundaryFace = [&](int axis, bool positive) {
+        for (int layer = 0; layer < boundaryLayers; layer++) {
+            float positionOffset = layer * spacing;
+
+            int loop_i_count, loop_j_count;
+            // Determine which loops correspond to each axis
+            if (axis == 0) {
+                // X-axis walls: vary Y and Z
+                loop_i_count = ny_inner;
+                loop_j_count = nz_inner;
+            } else if (axis == 1) {
+                // Y-axis walls: vary X and Z
+                loop_i_count = nx_inner;
+                loop_j_count = nz_inner;
+            } else {
+                // Z-axis walls: vary X and Y
+                loop_i_count = nx_inner;
+                loop_j_count = ny_inner;
+            }
+
+            for (int i = 0; i < loop_i_count; i++) {
+                for (int j = 0; j < loop_j_count; j++) {
+                    float x, y, z;
+                    switch (axis) {
+                        case 0: // X-axis (Left/Right walls): fix x, vary y and z
+                            x = positive ? (inner_max_x + positionOffset) : (inner_min_x - positionOffset);
+                            y = inner_min_y + i * spacing;
+                            z = inner_min_z + j * spacing;
+                            break;
+
+                        case 1: // Y-axis (Floor/Ceiling): fix y, vary x and z
+                            y = positive ? (inner_max_y + positionOffset) : (inner_min_y - positionOffset);
+                            x = inner_min_x + i * spacing;
+                            z = inner_min_z + j * spacing;
+                            break;
+
+                        case 2: // Z-axis (Front/Back walls): fix z, vary x and y
+                            z = positive ? (inner_max_z + positionOffset) : (inner_min_z - positionOffset);
+                            x = inner_min_x + i * spacing;
+                            y = inner_min_y + j * spacing;
+                            break;
+                    }
+
+                    boundary_positions.emplace_back(make_float3(x, y, z));
+                    boundary_velocities.emplace_back(zeroVel);
+                }
+            }
+        }
+    };
+
+    // Add boundary particles for all three axes
+    // Axis 0: X-axis (Left and Right Walls)
+    addBoundaryFace(0, false); // X-min (Left Wall)
+    addBoundaryFace(0, true);  // X-max (Right Wall)
+
+    // Axis 1: Y-axis (Floor)
+    addBoundaryFace(1, false); // Y-min (Floor)
+    // Uncomment the following line to add Y-max (Ceiling) when needed
+    addBoundaryFace(1, true);  // Y-max (Ceiling)
+
+    // Axis 2: Z-axis (Front and Back Walls)
+    addBoundaryFace(2, false); // Z-min (Front Wall)
+    addBoundaryFace(2, true);  // Z-max (Back Wall)
+}
+
+/*
 // Create boundary particles
 void createBoundaryParticles(
     std::vector<float3> &boundary_positions, 
@@ -98,7 +225,6 @@ void createBoundaryParticles(
             }
         }
     }
-    */
 
     // X-min boundary (vertical wall on the left)
     int ny_cells = ny; // from earlier computation
@@ -153,5 +279,6 @@ void createBoundaryParticles(
         }
     }
 }
+*/
 
 }
\ No newline at end of file