diff --git a/CMakeLists.txt b/CMakeLists.txt index f6daabbc9c696a3883ba4e6da7475924629d8ece..b0dc26cdd55d0a643d6cced1f1778ad295db91e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,11 +41,35 @@ target_link_libraries(imgui PUBLIC ) # ------------------------------ -# 3. Common Library with CUDA and Shared Code +# 3. CUDA Library +# ------------------------------ +set(CUDA_SOURCES + src/cuda/init_cuda.cu + src/cuda/pcisph_kernels.cu + src/cuda/smoothing_kernels.cu + src/cuda/spatialHash_NBSearch_kernels.cu + src/cuda/pcisph_update.cu +) + + +add_library(cuda_static STATIC ${CUDA_SOURCES}) + +set_target_properties(cuda_static PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 +) + +target_include_directories(cuda_static PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/settings + /usr/local/cuda/include +) + +# ------------------------------ +# 4. Common Application Library # ------------------------------ -# Common sources including CUDA code set(COMMON_SOURCES - src/itterative_PCISPH.cu src/spawn_particles.cpp src/Camera.cpp src/ImGuiManager.cpp @@ -53,7 +77,6 @@ set(COMMON_SOURCES src/FluidRenderer.cpp src/FluidSimulation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/common/GL_utilities.c - # Add other common source files if needed ) add_library(common_lib STATIC ${COMMON_SOURCES}) @@ -61,9 +84,6 @@ add_library(common_lib STATIC ${COMMON_SOURCES}) target_include_directories(common_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/src/settings - /usr/local/cuda/include - ${OPENGL_INCLUDE_DIRS} - ${GLEW_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/common ${CMAKE_CURRENT_SOURCE_DIR}/common/Linux/ ${IMGUI_DIR} @@ -75,39 +95,20 @@ target_link_libraries(common_lib PUBLIC ${GLEW_LIBRARIES} glfw GLU + cuda_static ) set_target_properties(common_lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # ------------------------------ -# 4. Define Executables +# 5. Define Executables # ------------------------------ - -# First executable (GUI_fluid_sim) -add_executable(cuda_sph_main src/GUI_fluid_sim.cpp) -target_link_libraries(cuda_sph_main PRIVATE common_lib imgui) - -# Second executable (screen_spaced_main) -add_executable(screen_spaced src/screen_spaced_main.cpp) -target_link_libraries(screen_spaced PRIVATE common_lib imgui) - -# New executable for MainGUI add_executable(fluidSimulator src/MainGUI.cpp) target_link_libraries(fluidSimulator PRIVATE common_lib imgui) # ------------------------------ -# 5. Set Target Properties +# 6. Set Target Properties # ------------------------------ -set_target_properties(cuda_sph_main PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES -) - -set_target_properties(screen_spaced PROPERTIES - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED YES -) - set_target_properties(fluidSimulator PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES diff --git a/src/FluidSimulation.cpp b/src/FluidSimulation.cpp index 8f58b8fb2e2c6062ab530b7ef2d662ecf4e67a80..cf9c0d59a20195f423296b8cfc0156504b173e8c 100644 --- a/src/FluidSimulation.cpp +++ b/src/FluidSimulation.cpp @@ -1,6 +1,7 @@ #include "include/FluidSimulation.h" #include "include/spawn_particles.h" // For createBoundaryParticles, initParticles -#include "include/PCISPH_sim.h" // For CUDA simulation functions +//#include "include/PCISPH_sim.h" // For CUDA simulation functions +#include "cuda/include/PCISPH_CUDA_interface.h" #include <iostream> namespace FluidSimulation { @@ -11,15 +12,15 @@ namespace FluidSimulation { } FluidSimulation::~FluidSimulation() { - unregisterVBOFromCUDA(); - unregisterBoundaryVBOFromCUDA(); + FluidSimulationGPU::unregisterVBOFromCUDA(); + FluidSimulationGPU::unregisterBoundaryVBOFromCUDA(); glDeleteBuffers(1, &m_vbo_particles); glDeleteBuffers(1, &m_vbo_boundary); } void FluidSimulation::initSimulation() { // Upload constants to the GPU - initializeConstants(); + FluidSimulationGPU::initializeConstants(); // Initialize particles std::vector<float3> h_positions(m_numParticles); @@ -29,14 +30,14 @@ namespace FluidSimulation { initParticles(h_positions, h_velocities, m_numParticles, gridSpacing); // Initialize particle device arrays - initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles); + FluidSimulationGPU::initializeFluidDeviceArrays(h_positions.data(), h_velocities.data(), m_numParticles); // Create particle VBO glGenBuffers(1, &m_vbo_particles); glBindBuffer(GL_ARRAY_BUFFER, m_vbo_particles); glBufferData(GL_ARRAY_BUFFER, h_positions.size() * sizeof(float3), h_positions.data(), GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); - registerVBOWithCUDA(m_vbo_particles); + FluidSimulationGPU::registerVBOWithCUDA(m_vbo_particles); // Initialize boundary std::vector<float3> boundary_positions; @@ -47,19 +48,19 @@ namespace FluidSimulation { createBoundaryParticles(boundary_positions, boundary_velocities, boundaryLayers, spacing); m_numBoundaryParticles = boundary_positions.size(); - initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles); + FluidSimulationGPU::initializeBoundryDeviceArrays(boundary_positions.data(), boundary_velocities.data(), m_numBoundaryParticles); // Create boundary VBO glGenBuffers(1, &m_vbo_boundary); glBindBuffer(GL_ARRAY_BUFFER, m_vbo_boundary); glBufferData(GL_ARRAY_BUFFER, boundary_positions.size() * sizeof(float3), boundary_positions.data(), GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); - registerBoundaryVBOWithCUDA(m_vbo_boundary); + FluidSimulationGPU::registerBoundaryVBOWithCUDA(m_vbo_boundary); } void FluidSimulation::updateSimulation() { for (int i = 0; i < 4; ++i) { - updateParticles(0.001); + FluidSimulationGPU::updateParticles(0.001); } } diff --git a/src/cuda/include/PCISPH_CUDA_interface.h b/src/cuda/include/PCISPH_CUDA_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..a2b7110767f9422b45abdc5bbf2a466dcabf79f3 --- /dev/null +++ b/src/cuda/include/PCISPH_CUDA_interface.h @@ -0,0 +1,11 @@ +#ifndef CUDA_INTERFACE_H +#define CUDA_INTERFACE_H + +#include "cuda_device_vars.h" +#include "spatialHash_NBSearch_kernels.h" +#include "smoothing_kernels.h" +#include "pcisph_kernels.h" +#include "pcisph_update.h" // Header for pcisph_update.cu +#include "init_cuda.h" // Header for init_cuda.cu + +#endif // CUDA_INTERFACE_H diff --git a/src/cuda/include/cuda_device_vars.h b/src/cuda/include/cuda_device_vars.h new file mode 100644 index 0000000000000000000000000000000000000000..e0eada192f94a395ec8b53503a082e6bca262ce7 --- /dev/null +++ b/src/cuda/include/cuda_device_vars.h @@ -0,0 +1,69 @@ +#ifndef CUDA_DEVICE_VARS_H +#define CUDA_DEVICE_VARS_H + +#include <cuda_runtime.h> + +namespace FluidSimulationGPU { + + +// Global Variables +extern cudaGraphicsResource_t cuda_vbo_resource; +extern cudaGraphicsResource_t cuda_vbo_boundary_resource; + +// Constants +extern __constant__ float PI; +extern __constant__ float H; +extern __constant__ float H2; +extern __constant__ float POLY6_CONST; +extern __constant__ float SPIKY_GRAD_CONST; +extern __constant__ float VISC_LAP_CONST; + +extern __constant__ float PARTICLE_MASS; +extern __constant__ float REST_DENSITY; +extern __constant__ float VISCOSITY; + +// Boundary constants +extern __constant__ float BOUND_X_MIN; +extern __constant__ float BOUND_X_MAX; +extern __constant__ float BOUND_Y_MIN; +extern __constant__ float BOUND_Y_MAX; +extern __constant__ float BOUND_Z_MIN; +extern __constant__ float BOUND_Z_MAX; + +// Device Arrays +extern float3* d_positions; +extern float3* d_velocities; + +extern unsigned int* d_cellIndices; +extern unsigned int* d_particleIndices; + +extern float3* d_positions_sorted; +extern float3* d_velocities_sorted; + +extern unsigned int* d_cellStart; +extern unsigned int* d_cellEnd; + +// Arrays for densities, pressures, accelerations +extern float* d_densities; +extern float* d_density_errors; +extern float* d_pressure_increments; +extern float3* d_pressure_forces; +extern float3* d_accelerations; + +// Additional arrays for PCISPH +extern float3* d_velocities_pred; +extern float3* d_positions_pred; +extern float* d_sum_gradW2; // For computing A + +// Boundary particles +extern float3* d_boundary_positions; +extern float3* d_boundary_velocities; + +extern unsigned int* d_boundaryCellStart; +extern unsigned int* d_boundaryCellEnd; + +extern unsigned int* d_boundaryCellIndices; +extern unsigned int* d_boundaryParticleIndices; + +} +#endif // CUDA_DEVICE_VARS_H \ No newline at end of file diff --git a/src/include/float3_helper_math.h b/src/cuda/include/float3_helper_math.h similarity index 100% rename from src/include/float3_helper_math.h rename to src/cuda/include/float3_helper_math.h diff --git a/src/cuda/include/init_cuda.h b/src/cuda/include/init_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..16b185f904eecc617cb4a4e80a63fd6acd7f4ef2 --- /dev/null +++ b/src/cuda/include/init_cuda.h @@ -0,0 +1,29 @@ +#ifndef INIT_CUDA_H +#define INIT_CUDA_H + +#include <cuda_runtime.h> + +namespace FluidSimulationGPU { + +// Initialize constant values for the simulation +void initializeConstants(); + +// Register and unregister VBOs for CUDA-OpenGL interoperability +void registerVBOWithCUDA(unsigned int vbo); +void registerBoundaryVBOWithCUDA(unsigned int vbo); +void unregisterVBOFromCUDA(); +void unregisterBoundaryVBOFromCUDA(); + +// Initialize device memory for fluid and boundary particles +void initializeFluidDeviceArrays(const float3* h_positions, const float3* h_velocities, int numParticles); +void initializeBoundryDeviceArrays(const float3* h_boundaryPositions, const float3* h_boundaryVelocities, int numBoundaryParticles); + +// Cleanup all device arrays +void cleanupDeviceArrays(); + +// Perform a one-time neighborhood search for static boundary particles +void boundaryNBSearch(int numBoundaryParticles); + +} // namespace FluidSimulationGPU + +#endif // INIT_CUDA_H diff --git a/src/cuda/include/pcisph_kernels.h b/src/cuda/include/pcisph_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..18c65f885237b033780e2869ec41d48806c9a210 --- /dev/null +++ b/src/cuda/include/pcisph_kernels.h @@ -0,0 +1,103 @@ +// pcisph_kernels.h + +#ifndef PCISPH_KERNELS_H +#define PCISPH_KERNELS_H + +#include <cuda_runtime.h> // Required for float3 and CUDA types + +namespace FluidSimulationGPU { + +// Function to compute density errors +__global__ void computeDensityError( + float3* positions, + float* densities, + float* density_errors, + float* sum_gradW2, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +); + +// Function to update pressures +__global__ void updatePressures( + float* pressure_increments, + float* density_errors, + float A, + int numParticles +); + +// Function to compute pressure forces +__global__ void computePressureForces( + float3* positions, + float* pressure_increments, + float3* pressure_forces, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +); + +// Function to compute viscosity and external forces +__global__ void computeViscosityAndExternalForces( + float3* positions, + float3* velocities, + float3* accelerations, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +); + +// Function to enforce boundary conditions +__global__ void enforceBoundary( + float3* positions, + float3* velocities, + int numParticles, + float damping +); + +// Function to predict velocities and positions +__global__ void predictVelocitiesPositions( + float3* velocities, + float3* velocities_pred, + float3* positions, + float3* positions_pred, + float3* accelerations, + float timeStep, + int numParticles +); + +// Function to update velocities and positions based on pressure forces +__global__ void updateVelocitiesPositions( + float3* velocities_pred, + float3* positions_pred, + float3* pressure_forces, + float timeStep, + int numParticles +); + +// Host function for neighbor search (uses device pointers) +void neigbourHoodSearch( + int nParticles, + float3* d_pos, + float3* d_vel, + float3* d_posSorted, + float3* d_velSorted, + unsigned int* d_cellIndc, + unsigned int* d_parIndc, + unsigned int* d_cEnd, + unsigned int* d_cStart, + const int bpg, + const int tpb, + int& gridSizeX, + int& gridSizeY, + int& gridSizeZ +); + +} // namespace FluidSimulationGPU + +#endif // PCISPH_KERNELS_H diff --git a/src/cuda/include/pcisph_update.h b/src/cuda/include/pcisph_update.h new file mode 100644 index 0000000000000000000000000000000000000000..0e0684f6c6eeee0572897a45949e745d084d5d93 --- /dev/null +++ b/src/cuda/include/pcisph_update.h @@ -0,0 +1,13 @@ +#ifndef UPDATE_PARTICLES_H +#define UPDATE_PARTICLES_H + +#include <cuda_runtime.h> + +namespace FluidSimulationGPU { + +// Updates the particle states for one simulation step +void updateParticles(float deltaTime); + +} // namespace FluidSimulationGPU + +#endif // UPDATE_PARTICLES_H \ No newline at end of file diff --git a/src/cuda/include/smoothing_kernels.h b/src/cuda/include/smoothing_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..af8443c27fe559d6c5e8df41504e61adbd21f17c --- /dev/null +++ b/src/cuda/include/smoothing_kernels.h @@ -0,0 +1,14 @@ +#include <cuda_runtime.h> // Required for float3 and CUDA types + +#ifndef SMOOTHING_KERNELS_H +#define SMOOTHING_KERNELS_H + +namespace FluidSimulationGPU { + +__device__ float poly6Kernel(float r2); +__device__ float spikyKernelGrad(float r); +__device__ float viscosityKernelLap(float r); + +} // namespace FluidSimulationGPU + +#endif // SMOOTHING_KERNELS_H diff --git a/src/cuda/include/spatialHash_NBSearch_kernels.h b/src/cuda/include/spatialHash_NBSearch_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..160e50068cb3c10424c931f66d635ffad723f312 --- /dev/null +++ b/src/cuda/include/spatialHash_NBSearch_kernels.h @@ -0,0 +1,13 @@ +#ifndef SPATIAL_HASH_NBSEARCH_KERNELS_H +#define SPATIAL_HASH_NBSEARCH_KERNELS_H + +namespace FluidSimulationGPU { + +__global__ void computeCellIndices(float3* positions, unsigned int* cellIndices, int numParticles, float3 gridMin, float3 gridCellSize, int gridSizeX, int gridSizeY, int gridSizeZ); +__global__ void reorderData(unsigned int* particleIndices, float3* positions, float3* velocities, float3* positions_sorted, float3* velocities_sorted, int numParticles); +__global__ void findCellStartEnd(unsigned int* cellIndices, unsigned int* cellStart, unsigned int* cellEnd, int numParticles); +void neighbourHoodSearch(int nParticles, float3* d_pos, float3* d_vel, float3* d_posSorted, float3* d_velSorted, unsigned int* d_cellIndc, unsigned int* d_parIndc, unsigned int* d_cEnd, unsigned int* d_cStart, const int bpg, const int tpb, int &gridSizeX, int &gridSizeY, int &gridSizeZ); + +} // namespace FluidSimulationGPU + +#endif // SPATIAL_HASH_NBSEARCH_KERNELS_H diff --git a/src/cuda/init_cuda.cu b/src/cuda/init_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7cb10f4a4d13d2960bede1f7179c7701a12fbb3e --- /dev/null +++ b/src/cuda/init_cuda.cu @@ -0,0 +1,275 @@ +// init_cu.cu + +#include "include/cuda_device_vars.h" +#include "../settings/constants.h" + +#include "include/pcisph_kernels.h" + +#include <cuda_runtime.h> +#include <cuda_gl_interop.h> +#include <device_launch_parameters.h> + +namespace FluidSimulationGPU { + +// Global Variables +cudaGraphicsResource_t cuda_vbo_resource; +cudaGraphicsResource_t cuda_vbo_boundary_resource; + +// Constants +__constant__ float PI; +__constant__ float H; +__constant__ float H2; +__constant__ float POLY6_CONST; +__constant__ float SPIKY_GRAD_CONST; +__constant__ float VISC_LAP_CONST; + +__constant__ float PARTICLE_MASS; +__constant__ float REST_DENSITY; +__constant__ float VISCOSITY; + +// Boundary constants +__constant__ float BOUND_X_MIN; +__constant__ float BOUND_X_MAX; +__constant__ float BOUND_Y_MIN; +__constant__ float BOUND_Y_MAX; +__constant__ float BOUND_Z_MIN; +__constant__ float BOUND_Z_MAX; + +// Device Arrays +float3* d_positions = nullptr; +float3* d_velocities = nullptr; + +unsigned int* d_cellIndices = nullptr; +unsigned int* d_particleIndices = nullptr; + +float3* d_positions_sorted = nullptr; +float3* d_velocities_sorted = nullptr; + +unsigned int* d_cellStart = nullptr; +unsigned int* d_cellEnd = nullptr; + +// Arrays for densities, pressures, accelerations +float* d_densities = nullptr; +float* d_density_errors = nullptr; +float* d_pressure_increments = nullptr; +float3* d_pressure_forces = nullptr; +float3* d_accelerations = nullptr; + +// Additional arrays for PCISPH +float3* d_velocities_pred = nullptr; +float3* d_positions_pred = nullptr; +float* d_sum_gradW2 = nullptr; + +// Boundary particles +float3* d_boundary_positions = nullptr; +float3* d_boundary_velocities = nullptr; + +unsigned int* d_boundaryCellStart = nullptr; +unsigned int* d_boundaryCellEnd = nullptr; + +unsigned int* d_boundaryCellIndices = nullptr; +unsigned int* d_boundaryParticleIndices = nullptr; + +void initializeConstants() +{ + // Host constants + const float h_H2 = h_H * h_H; + const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9)); + const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6)); + const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6)); + + // Copy constants to device + cudaMemcpyToSymbol(PI, &h_PI, sizeof(float)); + cudaMemcpyToSymbol(H, &h_H, sizeof(float)); + cudaMemcpyToSymbol(H2, &h_H2, sizeof(float)); + cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float)); + cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float)); + cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float)); + + cudaMemcpyToSymbol(PARTICLE_MASS, &h_PARTICLE_MASS, sizeof(float)); + cudaMemcpyToSymbol(REST_DENSITY, &h_REST_DENSITY, sizeof(float)); + cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float)); + + // Copy boundary constants to device + cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float)); +} + +// Register VBO with CUDA +void registerVBOWithCUDA(unsigned int vbo) +{ + cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); +} + +void registerBoundaryVBOWithCUDA(unsigned int vbo) +{ + cudaGraphicsGLRegisterBuffer(&cuda_vbo_boundary_resource, vbo, cudaGraphicsMapFlagsNone); +} + +// Unregister VBO from CUDA +void unregisterVBOFromCUDA() +{ + cudaGraphicsUnregisterResource(cuda_vbo_resource); +} + +void unregisterBoundaryVBOFromCUDA() +{ + cudaGraphicsUnregisterResource(cuda_vbo_boundary_resource); +} + +// Initialize device arrays +void initializeFluidDeviceArrays( + const float3* h_positions, + const float3* h_velocities, + int numParticles +) +{ + // Allocate device memory for velocities + cudaMalloc(&d_velocities, numParticles * sizeof(float3)); + cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice); + + // Allocate device memory for predicted velocities and positions + cudaMalloc(&d_velocities_pred, numParticles * sizeof(float3)); + cudaMalloc(&d_positions_pred, numParticles * sizeof(float3)); + + // Allocate device memory for cell indices and particle indices + cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int)); + cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int)); + + // Allocate device memory for sorted positions and velocities + cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3)); + cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3)); + + // Allocate cell start and end arrays + int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H); + int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H); + int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H); + int maxCells = gridSizeX * gridSizeY * gridSizeZ; + + cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int)); + cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int)); + + // Allocate device memory for densities, pressure increments, and accelerations + cudaMalloc(&d_densities, numParticles * sizeof(float)); + cudaMalloc(&d_density_errors, numParticles * sizeof(float)); + cudaMalloc(&d_pressure_increments, numParticles * sizeof(float)); + cudaMalloc(&d_pressure_forces, numParticles * sizeof(float3)); + cudaMalloc(&d_accelerations, numParticles * sizeof(float3)); + + // Allocate device memory for sum_gradW2 + cudaMalloc(&d_sum_gradW2, numParticles * sizeof(float)); + + // Initialize positions_pred to initial positions + cudaMemcpy(d_positions_pred, h_positions, numParticles * sizeof(float3), cudaMemcpyHostToDevice); +} + +void initializeBoundryDeviceArrays( + const float3* h_boundaryPositions, + const float3* h_boundaryVelocities, + int numBoundaryParticles +) +{ + // Allocate cell start and end arrays + int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H); + int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H); + int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H); + int maxCells = gridSizeX * gridSizeY * gridSizeZ; + + // cudaMalloc(&d_boundary_positions, numBoundaryParticles * sizeof(float3)); + cudaMalloc(&d_boundary_velocities, numBoundaryParticles * sizeof(float3)); + + //cudaMemcpy(d_boundary_positions, h_boundaryPositions, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice); + cudaMemcpy(d_boundary_velocities, h_boundaryVelocities, numBoundaryParticles * sizeof(float3), cudaMemcpyHostToDevice); + + cudaMalloc(&d_boundaryCellIndices, numBoundaryParticles * sizeof(unsigned int)); + cudaMalloc(&d_boundaryParticleIndices, numBoundaryParticles * sizeof(unsigned int)); + + cudaMalloc(&d_boundaryCellStart, maxCells * sizeof(unsigned int)); + cudaMalloc(&d_boundaryCellEnd, maxCells * sizeof(unsigned int)); +} + + +// NeighbourHoodSearch for boundary particles 1 time at start since they dont move +void boundaryNBSearch(int numBoundaryParticles) { + + // Map the VBO resource to get a device pointer + size_t num_bytes; + cudaGraphicsMapResources(1, &cuda_vbo_boundary_resource, 0); + cudaGraphicsResourceGetMappedPointer((void**)&d_boundary_positions, &num_bytes, cuda_vbo_boundary_resource); + + int gsX, gsY, gsZ; + + int threadsPerBlock = 256; + int blocksPerGrid = (numBoundaryParticles + threadsPerBlock - 1) / threadsPerBlock; + + float3* d_boundary_positions_sorted; + float3* d_boundary_velocities_sorted; + + + cudaMalloc(&d_boundary_positions_sorted, numBoundaryParticles * sizeof(float3)); + cudaMalloc(&d_boundary_velocities_sorted, numBoundaryParticles * sizeof(float3)); + + neigbourHoodSearch( + numBoundaryParticles, + d_boundary_positions, + d_boundary_velocities, + d_boundary_positions_sorted, + d_boundary_velocities_sorted, + d_boundaryCellIndices, + d_boundaryParticleIndices, + d_boundaryCellEnd, + d_boundaryCellStart, + blocksPerGrid, + threadsPerBlock, + gsX, + gsY, + gsZ); + + // Unmap the VBO resource + cudaGraphicsUnmapResources(1, &cuda_vbo_boundary_resource, 0); + + cudaFree(d_boundary_positions_sorted); + cudaFree(d_boundary_velocities_sorted); + + // Move some of above init code for boundary to seeparate function +} + +// Cleanup device arrays +void cleanupDeviceArrays() +{ + cudaFree(d_velocities); + cudaFree(d_velocities_pred); + cudaFree(d_positions_pred); + + cudaFree(d_cellIndices); + cudaFree(d_particleIndices); + + cudaFree(d_positions_sorted); + cudaFree(d_velocities_sorted); + + cudaFree(d_cellStart); + cudaFree(d_cellEnd); + + cudaFree(d_densities); + cudaFree(d_density_errors); + cudaFree(d_pressure_increments); + cudaFree(d_pressure_forces); + cudaFree(d_accelerations); + + cudaFree(d_boundary_positions); + cudaFree(d_boundary_velocities); + + cudaFree(d_boundaryCellIndices); + cudaFree(d_boundaryParticleIndices); + cudaFree(d_boundaryCellEnd); + cudaFree(d_boundaryCellStart); + + cudaFree(d_sum_gradW2); +} + + +} // namespace FluidSimulationGPU diff --git a/src/cuda/kernels_cuda.cu b/src/cuda/kernels_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..d7dca020e5aea56ddc0520a8bd3e97219939264a --- /dev/null +++ b/src/cuda/kernels_cuda.cu @@ -0,0 +1,606 @@ +// kernels_cuda.cu +#include "kernels_cuda.h" +#include "constants.h" +#include "include/float3_helper_math.h" +#include <cuda_runtime.h> +#include <cuda_gl_interop.h> +#include <device_launch_parameters.h> +#include <cmath> + +// Include Thrust for sorting +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#include <thrust/sequence.h> + +// CUDA Graphics Resource (for OpenGL-CUDA interoperability) +cudaGraphicsResource_t cuda_vbo_resource; + +// Constants +__constant__ float PI; +__constant__ float H; +__constant__ float H2; +__constant__ float POLY6_CONST; +__constant__ float SPIKY_GRAD_CONST; +__constant__ float VISC_LAP_CONST; + +__constant__ float PARTICLE_MASS = 1.0f; +__constant__ float REST_DENSITY = 60.0f; +__constant__ float GAS_CONSTANT; +__constant__ float VISCOSITY; + +// Boundary constants +__constant__ float BOUND_X_MIN; +__constant__ float BOUND_X_MAX; +__constant__ float BOUND_Y_MIN; +__constant__ float BOUND_Y_MAX; +__constant__ float BOUND_Z_MIN; +__constant__ float BOUND_Z_MAX; + +// Device arrays +float3* d_positions; // Positions mapped from OpenGL VBO +float3* d_velocities; + +unsigned int* d_cellIndices; +unsigned int* d_particleIndices; + +float3* d_positions_sorted; +float3* d_velocities_sorted; + +unsigned int* d_cellStart; +unsigned int* d_cellEnd; + +// Arrays for densities, pressures, accelerations +float* d_densities; +float* d_pressures; +float3* d_accelerations; + +// Smoothing kernels +__device__ float poly6Kernel(float r2) +{ + if (r2 >= 0.0f && r2 <= H2) { + float term = H2 - r2; + return POLY6_CONST * term * term * term; + } else { + return 0.0f; + } +} + +__device__ float spikyKernelGrad(float r) +{ + if (r > 0.0f && r <= H) { + float term = H - r; + return SPIKY_GRAD_CONST * term * term; + } else { + return 0.0f; + } +} + +__device__ float viscosityKernelLap(float r) +{ + if (r >= 0.0f && r <= H) { + return VISC_LAP_CONST * (H - r); + } else { + return 0.0f; + } +} + +// Compute cell indices +__global__ void computeCellIndices( + float3* positions, + unsigned int* cellIndices, + int numParticles, + float3 gridMin, + float3 gridCellSize, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos = positions[idx]; + + int ix = (int)((pos.x - gridMin.x) / gridCellSize.x); + int iy = (int)((pos.y - gridMin.y) / gridCellSize.y); + int iz = (int)((pos.z - gridMin.z) / gridCellSize.z); + + // Clamp indices to grid bounds + ix = max(0, min(ix, gridSizeX - 1)); + iy = max(0, min(iy, gridSizeY - 1)); + iz = max(0, min(iz, gridSizeZ - 1)); + + unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY; + + cellIndices[idx] = cellIndex; + } +} + +// Reordering kernel +__global__ void reorderData( + unsigned int* particleIndices, + float3* positions, + float3* velocities, + float3* positions_sorted, + float3* velocities_sorted, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + unsigned int sortedIdx = particleIndices[idx]; + + positions_sorted[idx] = positions[sortedIdx]; + velocities_sorted[idx] = velocities[sortedIdx]; + } +} + +// Find cell start and end indices +// can bli reeplaceed med atomicAdd för varje ceell index?? +__global__ void findCellStartEnd( + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + unsigned int cellIndex = cellIndices[idx]; + + if (idx == 0) { + cellStart[cellIndex] = 0; + } else { + unsigned int prevCellIndex = cellIndices[idx - 1]; + if (prevCellIndex != cellIndex) { + cellStart[cellIndex] = idx; + cellEnd[prevCellIndex] = idx; + } + } + + if (idx == numParticles - 1) { + cellEnd[cellIndex] = numParticles; + } + } +} + +// Compute density and pressure +__global__ void computeDensityPressure( + float3* positions, + float* densities, + float* pressures, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos_i = positions[idx]; + float density = 0.0f; + + unsigned int cellIndex = cellIndices[idx]; + + // Compute cell coordinates + int cellX = cellIndex % gridSizeX; + int cellY = (cellIndex / gridSizeX) % gridSizeY; + int cellZ = cellIndex / (gridSizeX * gridSizeY); + + // Iterate over neighboring cells + for (int dz = -1; dz <= 1; ++dz) { + int nz = cellZ + dz; + if (nz < 0 || nz >= gridSizeZ) continue; + for (int dy = -1; dy <= 1; ++dy) { + int ny = cellY + dy; + if (ny < 0 || ny >= gridSizeY) continue; + for (int dx = -1; dx <= 1; ++dx) { + int nx = cellX + dx; + if (nx < 0 || nx >= gridSizeX) continue; + + unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; + + unsigned int startIdx = cellStart[neighborCellIndex]; + unsigned int endIdx = cellEnd[neighborCellIndex]; + + if (startIdx != 0xFFFFFFFF) { + for (unsigned int j = startIdx; j < endIdx; ++j) { + float3 pos_j = positions[j]; + + float3 r = pos_j - pos_i; + float r2 = r.x * r.x + r.y * r.y + r.z * r.z; + + density += PARTICLE_MASS * poly6Kernel(r2); + } + } + } + } + } + + densities[idx] = density; + //pressures[idx] = GAS_CONSTANT * fmaxf((density - REST_DENSITY), 0.0); + float dens = density/REST_DENSITY; + pressures[idx] = GAS_CONSTANT * ((dens*dens*dens*dens*dens*dens*dens)-1.0); + //pressures[idx] = GAS_CONSTANT * min((density - REST_DENSITY), 0.0); + //pressures[idx] = GAS_CONSTANT * (density - REST_DENSITY); + } +} + +// Compute forces +__global__ void computeForces( + float3* positions, + float3* velocities, + float* densities, + float* pressures, + float3* accelerations, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos_i = positions[idx]; + float3 vel_i = velocities[idx]; + float density_i = densities[idx]; + float pressure_i = pressures[idx]; + + float3 fPressure = make_float3(0.0f, 0.0f, 0.0f); + float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f); + float3 fGravity = make_float3(0.0f, -4.0f , 0.0f); + + unsigned int cellIndex = cellIndices[idx]; + + // Compute cell coordinates + int cellX = cellIndex % gridSizeX; + int cellY = (cellIndex / gridSizeX) % gridSizeY; + int cellZ = cellIndex / (gridSizeX * gridSizeY); + + // Iterate over neighboring cells + for (int dz = -1; dz <= 1; ++dz) { + int nz = cellZ + dz; + if (nz < 0 || nz >= gridSizeZ) continue; + for (int dy = -1; dy <= 1; ++dy) { + int ny = cellY + dy; + if (ny < 0 || ny >= gridSizeY) continue; + for (int dx = -1; dx <= 1; ++dx) { + int nx = cellX + dx; + if (nx < 0 || nx >= gridSizeX) continue; + + unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; + + unsigned int startIdx = cellStart[neighborCellIndex]; + unsigned int endIdx = cellEnd[neighborCellIndex]; + + if (startIdx != 0xFFFFFFFF) { + for (unsigned int j = startIdx; j < endIdx; ++j) { + if (j == idx) continue; + + float3 pos_j = positions[j]; + float3 r = (pos_i - pos_j); + + float distSq = r.x * r.x + r.y * r.y + r.z * r.z; + + if (distSq < H2 && distSq > 0.0001f) { + float dist = sqrtf(distSq); + + // Pressure force + float spikyGrad = spikyKernelGrad(dist); + + // Cache pressures[j] + float density_j = densities[j]; + float pressure_j = pressures[j]; + + // avreged pressure scaleed by inverse j density + /* + float pressureTerm = - PARTICLE_MASS * 0.5 * (pressure_i + pressure_j) / density_j * spikyGrad; + fPressure += pressureTerm*(r/dist); // r/dist gives direction + */ + + // Symmetric pressure -- can cahnge to inverse dennsity in density/pressure calc to avoid devide by density here + // Add particle_mass to pressure term according to equations !!! Added down below in aceleration calcualtion + float pressureTerm = - (pressure_i / (density_i * density_i) + pressure_j / (density_j * density_j)) * spikyGrad; + float3 pressureForce = pressureTerm * (r / dist); // Directional force + fPressure += pressureForce; + + // Viscosity force + float3 velDiff = velocities[j] - vel_i; + float viscLap = viscosityKernelLap(dist); + fViscosity += VISCOSITY * velDiff * viscLap / density_j * PARTICLE_MASS; + } + } + } + } + } + } + + // Total acceleration + // According to paper v_i(t+dt) = v_(i+1) + (dt/m_i)*Fp_i + // While external and viscocisty force is calculated before But mass and nnot deeensity is used to deevidee force + // Deviding by mass, looks a lot better + accelerations[idx] = (fPressure * PARTICLE_MASS + fViscosity + fGravity) / PARTICLE_MASS; + //accelerations[idx] = (fPressure * PARTICLE_MASS + fViscosity + fGravity) / density_i; + } +} + +// Integration +__global__ void integrate( + float3* positions, + float3* velocities, + float3* accelerations, + float timeStep, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + // Update velocity + float3 vel_i = velocities[idx] + (accelerations[idx] * timeStep); + velocities[idx] = vel_i; + + // Update position + positions[idx] += vel_i * timeStep; + } +} + +// Enforce boundary conditions +__global__ void enforceBoundary( + float3* positions, + float3* velocities, + int numParticles, + float damping +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + // X-axis + if (positions[idx].x < BOUND_X_MIN) { + positions[idx].x = BOUND_X_MIN; + velocities[idx].x *= -damping; + } + if (positions[idx].x > BOUND_X_MAX) { + positions[idx].x = BOUND_X_MAX; // Should account for particle radius!! + velocities[idx].x *= -damping; + } + + // Y-axis + if (positions[idx].y < BOUND_Y_MIN) { + positions[idx].y = BOUND_Y_MIN; + velocities[idx].y *= -damping; + } + if (positions[idx].y > BOUND_Y_MAX) { + positions[idx].y = BOUND_Y_MAX; + velocities[idx].y *= -damping; + } + + // Z-axis + if (positions[idx].z < BOUND_Z_MIN) { + positions[idx].z = BOUND_Z_MIN; + velocities[idx].z *= -damping; + } + if (positions[idx].z > BOUND_Z_MAX) { + positions[idx].z = BOUND_Z_MAX; + velocities[idx].z *= -damping; + } + } +} + +void updateParticles(float deltaTime) +{ + // Map the VBO resource to get a device pointer + size_t num_bytes; + cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); + cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource); + + int threadsPerBlock = 256; + int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock; + + // Compute grid size based on smoothing radius + float3 gridMin = make_float3(h_BOUND_X_MIN, h_BOUND_Y_MIN, h_BOUND_Z_MIN); + float3 gridMax = make_float3(h_BOUND_X_MAX, h_BOUND_Y_MAX, h_BOUND_Z_MAX); + float3 gridCellSize = make_float3(h_H, h_H, h_H); + + int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H); + int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H); + int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H); + int maxCells = gridSizeX * gridSizeY * gridSizeZ; + + // Allocate or resize cellStart and cellEnd arrays if necessary + // (You may need to adjust your initialization code to handle dynamic sizes) + + // Compute cell indices + computeCellIndices<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, d_cellIndices, NUM_PARTICLES, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + // Wrap device pointers with thrust iterators + thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndices); + thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_particleIndices); + + // Initialize particle indices + thrust::sequence(dev_particleIndices_ptr, dev_particleIndices_ptr + NUM_PARTICLES); + + // Sort particle indices based on cell indices + thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + NUM_PARTICLES, dev_particleIndices_ptr); + + // Reorder positions and velocities + reorderData<<<blocksPerGrid, threadsPerBlock>>>( + d_particleIndices, + d_positions, + d_velocities, + d_positions_sorted, + d_velocities_sorted, + NUM_PARTICLES); + cudaDeviceSynchronize(); + + // Swap sorted data with original data + float3* temp_positions = d_positions; + d_positions = d_positions_sorted; + d_positions_sorted = temp_positions; + + float3* temp_velocities = d_velocities; + d_velocities = d_velocities_sorted; + d_velocities_sorted = temp_velocities; + + // Build cell start and end indices + // Initialize cellStart and cellEnd + cudaMemset(d_cellStart, 0xFF, maxCells * sizeof(unsigned int)); + cudaMemset(d_cellEnd, 0xFF, maxCells * sizeof(unsigned int)); + + findCellStartEnd<<<blocksPerGrid, threadsPerBlock>>>( + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES); + cudaDeviceSynchronize(); + + // Compute density and pressure + computeDensityPressure<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_densities, + d_pressures, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + // Compute forces + computeForces<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_velocities, + d_densities, + d_pressures, + d_accelerations, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + // Integrate + float timeStep = 0.001f; + integrate<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_velocities, + d_accelerations, + timeStep, + //deltaTime, + NUM_PARTICLES); + cudaDeviceSynchronize(); + + // Enforce boundary conditions + float damping = 0.7f; + enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_velocities, + NUM_PARTICLES, + damping); + cudaDeviceSynchronize(); + + // Unmap the VBO resource + cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0); +} + +void initializeConstants() +{ + // Host constants + const float h_H2 = h_H * h_H; + const float h_POLY6_CONST = 315.0f / (64.0f * h_PI * powf(h_H, 9)); + const float h_SPIKY_GRAD_CONST = -45.0f / (h_PI * powf(h_H, 6)); + const float h_VISC_LAP_CONST = 45.0f / (h_PI * powf(h_H, 6)); + + // Copy constants to device + cudaMemcpyToSymbol(PI, &h_PI, sizeof(float)); + cudaMemcpyToSymbol(H, &h_H, sizeof(float)); + cudaMemcpyToSymbol(H2, &h_H2, sizeof(float)); + cudaMemcpyToSymbol(POLY6_CONST, &h_POLY6_CONST, sizeof(float)); + cudaMemcpyToSymbol(SPIKY_GRAD_CONST, &h_SPIKY_GRAD_CONST, sizeof(float)); + cudaMemcpyToSymbol(VISC_LAP_CONST, &h_VISC_LAP_CONST, sizeof(float)); + + cudaMemcpyToSymbol(GAS_CONSTANT, &h_GAS_CONSTANT, sizeof(float)); + cudaMemcpyToSymbol(VISCOSITY, &h_VISCOSITY, sizeof(float)); + + // Copy boundary constants to device + cudaMemcpyToSymbol(BOUND_X_MIN, &h_BOUND_X_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_X_MAX, &h_BOUND_X_MAX, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Y_MIN, &h_BOUND_Y_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float)); + cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float)); +} + +// Register VBO with CUDA +void registerVBOWithCUDA(unsigned int vbo) +{ + cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); +} + +// Unregister VBO from CUDA +void unregisterVBOFromCUDA() +{ + cudaGraphicsUnregisterResource(cuda_vbo_resource); +} + +// Initialize device arrays +void initializeDeviceArrays( + const float3* h_positions, + const float3* h_velocities, + int numParticles +) +{ + // Allocate device memory for velocities + cudaMalloc(&d_velocities, numParticles * sizeof(float3)); + cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice); + + // Allocate device memory for cell indices and particle indices + cudaMalloc(&d_cellIndices, numParticles * sizeof(unsigned int)); + cudaMalloc(&d_particleIndices, numParticles * sizeof(unsigned int)); + + // Allocate device memory for sorted positions and velocities + cudaMalloc(&d_positions_sorted, numParticles * sizeof(float3)); + cudaMalloc(&d_velocities_sorted, numParticles * sizeof(float3)); + + // Allocate cell start and end arrays + // Note: You need to ensure that 'maxCells' is available here or adjust the code to allocate after computing 'maxCells' + int gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H); + int gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H); + int gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H); + int maxCells = gridSizeX * gridSizeY * gridSizeZ; + + cudaMalloc(&d_cellStart, maxCells * sizeof(unsigned int)); + cudaMalloc(&d_cellEnd, maxCells * sizeof(unsigned int)); + + // Allocate device memory for densities, pressures, and accelerations + cudaMalloc(&d_densities, numParticles * sizeof(float)); + cudaMalloc(&d_pressures, numParticles * sizeof(float)); + cudaMalloc(&d_accelerations, numParticles * sizeof(float3)); +} + +// Cleanup device arrays +void cleanupDeviceArrays() +{ + cudaFree(d_velocities); + + cudaFree(d_cellIndices); + cudaFree(d_particleIndices); + + cudaFree(d_positions_sorted); + cudaFree(d_velocities_sorted); + + cudaFree(d_cellStart); + cudaFree(d_cellEnd); + + cudaFree(d_densities); + cudaFree(d_pressures); + cudaFree(d_accelerations); +} \ No newline at end of file diff --git a/src/cuda/pcisph_kernels.cu b/src/cuda/pcisph_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2efd40677d2f06175404a1d6a7f80950a4733db --- /dev/null +++ b/src/cuda/pcisph_kernels.cu @@ -0,0 +1,357 @@ +// pcisph_kernels.cu +#include "include/pcisph_kernels.h" +#include "include/smoothing_kernels.h" + +#include "include/float3_helper_math.h" + +#include "include/cuda_device_vars.h" +#include "../settings/constants.h" + + +#include <cuda_runtime.h> +#include <cuda_gl_interop.h> +#include <device_launch_parameters.h> + +// Include Thrust for sorting and reductions +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#include <thrust/sequence.h> +#include <thrust/reduce.h> + + +namespace FluidSimulationGPU { + + + +// Compute density errors (used in PCISPH) +__global__ void computeDensityError( + float3* positions, + float* densities, + float* density_errors, + float* sum_gradW2, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos_i = positions[idx]; + float density = 0.0f; + float sumGradW2_local = 0.0f; + + unsigned int cellIndex = cellIndices[idx]; + + // Compute cell coordinates + int cellX = cellIndex % gridSizeX; + int cellY = (cellIndex / gridSizeX) % gridSizeY; + int cellZ = cellIndex / (gridSizeX * gridSizeY); + + // Iterate over neighboring cells + for (int dz = -1; dz <= 1; ++dz) { + int nz = cellZ + dz; + if (nz < 0 || nz >= gridSizeZ) continue; + for (int dy = -1; dy <= 1; ++dy) { + int ny = cellY + dy; + if (ny < 0 || ny >= gridSizeY) continue; + for (int dx = -1; dx <= 1; ++dx) { + int nx = cellX + dx; + if (nx < 0 || nx >= gridSizeX) continue; + + unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; + + unsigned int startIdx = cellStart[neighborCellIndex]; + unsigned int endIdx = cellEnd[neighborCellIndex]; + + if (startIdx != 0xFFFFFFFF) { + for (unsigned int j = startIdx; j < endIdx; ++j) { + float3 pos_j = positions[j]; + + float3 r = pos_i - pos_j; + float r2 = dot(r, r); + + if (r2 < H2 && r2 > 0.0001f) { + float dist = sqrtf(r2); + + float W = poly6Kernel(r2); + density += PARTICLE_MASS * W; + + if (sum_gradW2 != nullptr) { + // Compute gradW + float gradW_coeff = spikyKernelGrad(dist); + float3 gradW = gradW_coeff * (r / dist); + + sumGradW2_local += dot(gradW, gradW); + } + } + } + } + } + } + } + + densities[idx] = density; + density_errors[idx] = density - REST_DENSITY; + + if (sum_gradW2 != nullptr) { + sum_gradW2[idx] = sumGradW2_local; + } + } +} + +// Update pressures based on density errors +__global__ void updatePressures( + float* pressure_increments, + float* density_errors, + float A, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float deltaP = -density_errors[idx] / A; + pressure_increments[idx] += deltaP; + } +} + +// Compute pressure forces +__global__ void computePressureForces( + float3* positions, + float* pressure_increments, + float3* pressure_forces, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos_i = positions[idx]; + float deltaP_i = pressure_increments[idx]; + + float3 fPressure = make_float3(0.0f, 0.0f, 0.0f); + + unsigned int cellIndex = cellIndices[idx]; + + // Compute cell coordinates + int cellX = cellIndex % gridSizeX; + int cellY = (cellIndex / gridSizeX) % gridSizeY; + int cellZ = cellIndex / (gridSizeX * gridSizeY); + + // Iterate over neighboring cells + for (int dz = -1; dz <= 1; ++dz) { + int nz = cellZ + dz; + if (nz < 0 || nz >= gridSizeZ) continue; + for (int dy = -1; dy <= 1; ++dy) { + int ny = cellY + dy; + if (ny < 0 || ny >= gridSizeY) continue; + for (int dx = -1; dx <= 1; ++dx) { + int nx = cellX + dx; + if (nx < 0 || nx >= gridSizeX) continue; + + unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; + + unsigned int startIdx = cellStart[neighborCellIndex]; + unsigned int endIdx = cellEnd[neighborCellIndex]; + + if (startIdx != 0xFFFFFFFF) { + for (unsigned int j = startIdx; j < endIdx; ++j) { + if (j == idx) continue; + + float3 pos_j = positions[j]; + float deltaP_j = pressure_increments[j]; + + float3 r = pos_i - pos_j; + + float distSq = dot(r, r); + + if (distSq < H2 && distSq > 0.0001f) { + float dist = sqrtf(distSq); + + // Compute gradient of kernel + float gradW = spikyKernelGrad(dist); + float3 gradWij = gradW * (r / dist); + + // Symmetric pressure force + float factor = - (deltaP_i + deltaP_j) / (REST_DENSITY * REST_DENSITY); + fPressure += factor * gradWij; + } + } + } + } + } + } + + // Multiply by constants + fPressure *= PARTICLE_MASS * PARTICLE_MASS; + + pressure_forces[idx] = fPressure; + } +} + +// Compute viscosity and external forces (excluding pressure forces) +__global__ void computeViscosityAndExternalForces( + float3* positions, + float3* velocities, + float3* accelerations, + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos_i = positions[idx]; + float3 vel_i = velocities[idx]; + + float3 fViscosity = make_float3(0.0f, 0.0f, 0.0f); + float3 fGravity = make_float3(0.0f, -20.0f, 0.0f) * PARTICLE_MASS; + + unsigned int cellIndex = cellIndices[idx]; + + // Compute cell coordinates + int cellX = cellIndex % gridSizeX; + int cellY = (cellIndex / gridSizeX) % gridSizeY; + int cellZ = cellIndex / (gridSizeX * gridSizeY); + + // Iterate over neighboring cells + for (int dz = -1; dz <= 1; ++dz) { + int nz = cellZ + dz; + if (nz < 0 || nz >= gridSizeZ) continue; + for (int dy = -1; dy <= 1; ++dy) { + int ny = cellY + dy; + if (ny < 0 || ny >= gridSizeY) continue; + for (int dx = -1; dx <= 1; ++dx) { + int nx = cellX + dx; + if (nx < 0 || nx >= gridSizeX) continue; + + unsigned int neighborCellIndex = nx + ny * gridSizeX + nz * gridSizeX * gridSizeY; + + unsigned int startIdx = cellStart[neighborCellIndex]; + unsigned int endIdx = cellEnd[neighborCellIndex]; + + if (startIdx != 0xFFFFFFFF) { + for (unsigned int j = startIdx; j < endIdx; ++j) { + if (j == idx) continue; + + float3 pos_j = positions[j]; + float3 vel_j = velocities[j]; + float3 r = pos_i - pos_j; + + float distSq = dot(r, r); + + if (distSq < H2 && distSq > 0.0001f) { + float dist = sqrtf(distSq); + + // Viscosity force + float3 velDiff = vel_j - vel_i; + float viscLap = viscosityKernelLap(dist); + fViscosity += VISCOSITY * velDiff * viscLap * PARTICLE_MASS / REST_DENSITY; + } + } + } + } + } + } + + // Total acceleration (excluding pressure force) + accelerations[idx] = (fViscosity + fGravity) / PARTICLE_MASS; + } +} + +// Enforce boundary conditions, very simple should bee improved. +__global__ void enforceBoundary( + float3* positions, + float3* velocities, + int numParticles, + float damping +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + // X-axis + if (positions[idx].x < BOUND_X_MIN) { + positions[idx].x = BOUND_X_MIN; + velocities[idx].x *= -damping; + } + if (positions[idx].x > BOUND_X_MAX) { + positions[idx].x = BOUND_X_MAX; + velocities[idx].x *= -damping; + } + + // Y-axis + if (positions[idx].y < BOUND_Y_MIN) { + positions[idx].y = BOUND_Y_MIN; + velocities[idx].y *= -damping; + } + if (positions[idx].y > BOUND_Y_MAX) { + positions[idx].y = BOUND_Y_MAX; + velocities[idx].y *= -damping; + } + + // Z-axis + if (positions[idx].z < BOUND_Z_MIN) { + positions[idx].z = BOUND_Z_MIN; + velocities[idx].z *= -damping; + } + if (positions[idx].z > BOUND_Z_MAX) { + positions[idx].z = BOUND_Z_MAX; + velocities[idx].z *= -damping; + } + } +} + +// Kernel to predict velocities and positions +__global__ void predictVelocitiesPositions( + float3* velocities, + float3* velocities_pred, + float3* positions, + float3* positions_pred, + float3* accelerations, + float timeStep, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numParticles) { + // Compute predicted velocity + velocities_pred[idx] = velocities[idx] + accelerations[idx] * timeStep; + // Compute predicted position + positions_pred[idx] = positions[idx] + velocities_pred[idx] * timeStep; + } +} + +// Update velocities and positions based on pressure forces in +// pressure correcting loop. +__global__ void updateVelocitiesPositions( + float3* velocities_pred, + float3* positions_pred, + float3* pressure_forces, + float timeStep, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + // Update predicted velocity + velocities_pred[idx] += (pressure_forces[idx] / PARTICLE_MASS) * timeStep; + + // Update predicted position + positions_pred[idx] += (pressure_forces[idx] / PARTICLE_MASS) * (timeStep * timeStep); + } +} + + +} // namespace FluidSimulationGPU \ No newline at end of file diff --git a/src/cuda/pcisph_update.cu b/src/cuda/pcisph_update.cu new file mode 100644 index 0000000000000000000000000000000000000000..05c163b43b3544496b5138ca7aebb6c9308c2a93 --- /dev/null +++ b/src/cuda/pcisph_update.cu @@ -0,0 +1,155 @@ +#include "include/cuda_device_vars.h" +#include "include/pcisph_kernels.h" +#include "include/spatialHash_NBSearch_kernels.h" +#include "../settings/constants.h" + +#include <cuda_runtime.h> +#include <cuda_gl_interop.h> +#include <device_launch_parameters.h> +#include <thrust/device_ptr.h> +#include <thrust/reduce.h> + +namespace FluidSimulationGPU { + +void updateParticles(float deltaTime) +{ + // Map the VBO resource to get a device pointer + size_t num_bytes; + cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); + cudaGraphicsResourceGetMappedPointer((void**)&d_positions, &num_bytes, cuda_vbo_resource); + + int threadsPerBlock = 256; + int blocksPerGrid = (NUM_PARTICLES + threadsPerBlock - 1) / threadsPerBlock; + + // Time step should be small but we can run updateParticles multiple times during display + float timeStep = 0.004f; + + // -------------------- Neighbor Search -------------------- + int gridSizeX, gridSizeY, gridSizeZ; + neigbourHoodSearch( + NUM_PARTICLES, + d_positions, + d_velocities, + d_positions_sorted, + d_velocities_sorted, + d_cellIndices, + d_particleIndices, + d_cellEnd, + d_cellStart, + blocksPerGrid, + threadsPerBlock, + gridSizeX, + gridSizeY, + gridSizeZ); + + // -------------------- Prediction Step -------------------- + + computeViscosityAndExternalForces<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_velocities, + d_accelerations, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + predictVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>( + d_velocities, + d_velocities_pred, + d_positions, + d_positions_pred, + d_accelerations, + timeStep, + NUM_PARTICLES); + cudaDeviceSynchronize(); + + // Initialize pressure increments to zero + cudaMemset(d_pressure_increments, 0, NUM_PARTICLES * sizeof(float)); + + // Compute densities and density errors, and per-particle sumGradW2 + computeDensityError<<<blocksPerGrid, threadsPerBlock>>>( + d_positions_pred, + d_densities, + d_density_errors, + d_sum_gradW2, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + // Reduce sum_gradW2 over all particles + thrust::device_ptr<float> dev_sumGradW2_ptr(d_sum_gradW2); + float totalSumGradW2 = thrust::reduce(dev_sumGradW2_ptr, dev_sumGradW2_ptr + NUM_PARTICLES, 0.0f, thrust::plus<float>()); + + // Compute A + float m = h_PARTICLE_MASS; + float rho0 = h_REST_DENSITY; + float A = 2.0f * timeStep * timeStep * m * m / (rho0 * rho0) * totalSumGradW2; + + // ------------ Iterative Pressure Correction Loop -------------------- + + const int maxIterations = 5; + + for (int iter = 0; iter < maxIterations; ++iter) { + computeDensityError<<<blocksPerGrid, threadsPerBlock>>>( + d_positions_pred, + d_densities, + d_density_errors, + nullptr, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + updatePressures<<<blocksPerGrid, threadsPerBlock>>>( + d_pressure_increments, + d_density_errors, + A, + NUM_PARTICLES); + cudaDeviceSynchronize(); + + computePressureForces<<<blocksPerGrid, threadsPerBlock>>>( + d_positions_pred, + d_pressure_increments, + d_pressure_forces, + d_cellIndices, + d_cellStart, + d_cellEnd, + NUM_PARTICLES, + gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + updateVelocitiesPositions<<<blocksPerGrid, threadsPerBlock>>>( + d_velocities_pred, + d_positions_pred, + d_pressure_forces, + timeStep, + NUM_PARTICLES); + cudaDeviceSynchronize(); + } + + // -------------------- Finalization -------------------- + + cudaMemcpy(d_velocities, d_velocities_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice); + cudaMemcpy(d_positions, d_positions_pred, NUM_PARTICLES * sizeof(float3), cudaMemcpyDeviceToDevice); + + // Enforce boundary conditions + float damping = 0.75f; + enforceBoundary<<<blocksPerGrid, threadsPerBlock>>>( + d_positions, + d_velocities, + NUM_PARTICLES, + damping); + cudaDeviceSynchronize(); + + // Unmap the VBO resource + cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0); +} + +} // namespace FluidSimulationGPU diff --git a/src/cuda/smoothing_kernels.cu b/src/cuda/smoothing_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..52d7248726cff11f11794cf4946de0a5782b5e76 --- /dev/null +++ b/src/cuda/smoothing_kernels.cu @@ -0,0 +1,36 @@ +#include "include/smoothing_kernels.h" +#include "include/cuda_device_vars.h" +#include "../settings/constants.h" // Dont ned this one? + +namespace FluidSimulationGPU { + +__device__ float poly6Kernel(float r2) +{ + if (r2 >= 0.0f && r2 <= H2) { + float term = H2 - r2; + return POLY6_CONST * term * term * term; + } else { + return 0.0f; + } +} + +__device__ float spikyKernelGrad(float r) +{ + if (r > 0.0f && r <= H) { + float term = H - r; + return SPIKY_GRAD_CONST * term * term; + } else { + return 0.0f; + } +} + +__device__ float viscosityKernelLap(float r) +{ + if (r >= 0.0f && r <= H) { + return VISC_LAP_CONST * (H - r); + } else { + return 0.0f; + } +} + +} // namespace FluidSimulationGPU diff --git a/src/cuda/spatialHash_NBSearch_kernels.cu b/src/cuda/spatialHash_NBSearch_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..d48d3005fc409f690d3df74f022c1fe38d35aa9c --- /dev/null +++ b/src/cuda/spatialHash_NBSearch_kernels.cu @@ -0,0 +1,159 @@ +#include "include/spatialHash_NBSearch_kernels.h" +#include "include/cuda_device_vars.h" +#include "../settings/constants.h" + +#include <thrust/device_ptr.h> +#include <thrust/sort.h> +#include <thrust/sequence.h> +#include <thrust/reduce.h> + +namespace FluidSimulationGPU { + +// Compute cell indices +__global__ void computeCellIndices( + float3* positions, + unsigned int* cellIndices, + int numParticles, + float3 gridMin, + float3 gridCellSize, + int gridSizeX, int gridSizeY, int gridSizeZ +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + float3 pos = positions[idx]; + + int ix = (int)((pos.x - gridMin.x) / gridCellSize.x); + int iy = (int)((pos.y - gridMin.y) / gridCellSize.y); + int iz = (int)((pos.z - gridMin.z) / gridCellSize.z); + + // Clamp indices to grid bounds + ix = max(0, min(ix, gridSizeX - 1)); + iy = max(0, min(iy, gridSizeY - 1)); + iz = max(0, min(iz, gridSizeZ - 1)); + + unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY; + + cellIndices[idx] = cellIndex; + } +} + +// Reordering kernel +__global__ void reorderData( + unsigned int* particleIndices, + float3* positions, + float3* velocities, + float3* positions_sorted, + float3* velocities_sorted, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + unsigned int sortedIdx = particleIndices[idx]; + + positions_sorted[idx] = positions[sortedIdx]; + velocities_sorted[idx] = velocities[sortedIdx]; + } +} + +// Find cell start and end indices +__global__ void findCellStartEnd( + unsigned int* cellIndices, + unsigned int* cellStart, + unsigned int* cellEnd, + int numParticles +) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < numParticles) { + unsigned int cellIndex = cellIndices[idx]; + + if (idx == 0) { + cellStart[cellIndex] = 0; + } else { + unsigned int prevCellIndex = cellIndices[idx - 1]; + if (prevCellIndex != cellIndex) { + cellStart[cellIndex] = idx; + cellEnd[prevCellIndex] = idx; + } + } + + if (idx == numParticles - 1) { + cellEnd[cellIndex] = numParticles; + } + } +} + +void neigbourHoodSearch( + int nParticles, + float3* d_pos, + float3* d_vel, + float3* d_posSorted, + float3* d_velSorted, + unsigned int* d_cellIndc, + unsigned int* d_parIndc, + unsigned int* d_cEnd, + unsigned int* d_cStart, + const int bpg, + const int tpb, + int &gridSizeX, + int &gridSizeY, + int &gridSizeZ) +{ + + // Compute grid size based on smoothing radius + float3 gridMin = make_float3(h_BOUND_X_MIN, h_BOUND_Y_MIN, h_BOUND_Z_MIN); + float3 gridMax = make_float3(h_BOUND_X_MAX, h_BOUND_Y_MAX, h_BOUND_Z_MAX); + float3 gridCellSize = make_float3(h_H, h_H, h_H); + + gridSizeX = (int)ceil((h_BOUND_X_MAX - h_BOUND_X_MIN) / h_H); + gridSizeY = (int)ceil((h_BOUND_Y_MAX - h_BOUND_Y_MIN) / h_H); + gridSizeZ = (int)ceil((h_BOUND_Z_MAX - h_BOUND_Z_MIN) / h_H); + int maxCells = gridSizeX * gridSizeY * gridSizeZ; + + // Compute cell indices for current positions + computeCellIndices<<<bpg, tpb>>>( + d_pos, d_cellIndc, nParticles, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ); + cudaDeviceSynchronize(); + + // Wrap device pointers with thrust iterators + thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndc); + thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_parIndc); + + // Initialize particle indices + thrust::sequence(dev_particleIndices_ptr, dev_particleIndices_ptr + nParticles); + + // Sort particle indices based on cell indices + thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + nParticles, dev_particleIndices_ptr); + + // Reorder positions and velocities + reorderData<<<bpg, tpb>>>( + d_parIndc, + d_pos, + d_vel, + d_posSorted, + d_velSorted, + nParticles); + cudaDeviceSynchronize(); + + // Copy sorted data back to original arrays + cudaMemcpy(d_pos, d_posSorted, nParticles * sizeof(float3), cudaMemcpyDeviceToDevice); + cudaMemcpy(d_vel, d_velSorted, nParticles * sizeof(float3), cudaMemcpyDeviceToDevice); + + // Build cell start and end indices + cudaMemset(d_cStart, 0xFF, maxCells * sizeof(unsigned int)); + cudaMemset(d_cEnd, 0xFF, maxCells * sizeof(unsigned int)); + + findCellStartEnd<<<bpg, tpb>>>( + d_cellIndc, + d_cStart, + d_cEnd, + nParticles); + cudaDeviceSynchronize(); +} + +} // namespace FluidSimulationGPU