diff --git a/common/GL_utilities.h b/common/GL_utilities.h index cdb6b7d9554ac055058880b9bb0f433c6249b577..6d65357ccfa86e06eb91420a3c3f0f6727923805 100644 --- a/common/GL_utilities.h +++ b/common/GL_utilities.h @@ -13,7 +13,7 @@ extern "C" { #include "glew.h" #include <GL/gl.h> #else - #include <GL/glew.h> + //#include <GL/glew.h> #include <GL/gl.h> #endif #endif diff --git a/src/MakeFile_Alt.txt b/src/MakeFile_Alt.txt index 75d9e148140da94e89569734cc9ea1433689d08f..03a5302784939116f9f5841956c674f6a8512654 100644 --- a/src/MakeFile_Alt.txt +++ b/src/MakeFile_Alt.txt @@ -1,3 +1,12 @@ nvcc -c kernels_cuda.cu -o kernels_cuda.o -I/usr/local/cuda/include nvcc main3.cpp ../common/GL_utilities.c ../common/Linux/MicroGlut.c kernels_cuda.o -I/usr/local/cuda/include -I../common -I../common/Linux -DGL_GLEXT_PROTOTYPES -I/usr/include/GL -L/usr/local/cuda/lib64 -lGL -lGLU -lcudart -lX11 -lXt -lm -o simple_particle_system +./simple_particle_system + + + + +New one: + +nvcc -c kernels_cuda.cu -o kernels_cuda.o -I/usr/lib/cuda/include +nvcc main3.cpp ../common/GL_utilities.c ../common/Linux/MicroGlut.c kernels_cuda.o -I/usr/lib/cuda/include -I../common -I../common/Linux -DGL_GLEXT_PROTOTYPES -I/usr/include/GL -L/usr/lib/cuda/lib64 -lGL -lGLU -lcudart -lX11 -lXt -lm -o simple_particle_system ./simple_particle_system \ No newline at end of file diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index 9aae2a3296f0c1194f3eae9ecd782711332af645..36cc04561b5806719ee2e703dbe768117acf7b9b 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -1,16 +1,21 @@ // kernels_cuda.cu #include "kernels_cuda.h" #include "constants.h" +#include <cstdio> // C++ style header #include "include/float3_helper_math.h" #include <cuda_runtime.h> #include <cuda_gl_interop.h> #include <device_launch_parameters.h> #include <cmath> +#include <iostream> + + + // Include Thrust for sorting -#include <thrust/device_ptr.h> -#include <thrust/sort.h> -#include <thrust/sequence.h> +//#include <thrust/device_ptr.h> +//#include <thrust/sort.h> +//#include <thrust/sequence.h> // CUDA Graphics Resource (for OpenGL-CUDA interoperability) cudaGraphicsResource_t cuda_vbo_resource; @@ -110,8 +115,10 @@ __global__ void computeCellIndices( unsigned int cellIndex = ix + iy * gridSizeX + iz * gridSizeX * gridSizeY; + cellIndices[idx] = cellIndex; } + } // Reordering kernel @@ -129,11 +136,94 @@ __global__ void reorderData( if (idx < numParticles) { unsigned int sortedIdx = particleIndices[idx]; - positions_sorted[idx] = positions[sortedIdx]; + printf("Index and sorted %d %d\n", idx, sortedIdx); // Sorted idx result in 0, thus only one particle is here-. + + positions_sorted[idx] = positions[sortedIdx]; // Todo was sortedIdx velocities_sorted[idx] = velocities[sortedIdx]; } } + +// TODO testing adding bitonic merge sort. +// Launch this code only on the GPU +__device__ void swapIfGreater(unsigned int &a, unsigned int &b, unsigned int &a_key, unsigned int &b_key) { + // Swap the a aqnd b keys + unsigned int temp_key = a_key; + a_key = b_key; + b_key = temp_key; + + // Swap the values a and b + unsigned int temp = a; + a = b; + b = temp; + + // Want values = index order. + +} + +/* + Keys, values, N +*/ + + +__global__ void helpMe(unsigned int* keys, unsigned int* values, int numElements){ + + unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numElements) return; + + // Fix values index + //printf("%d\n", values[tid]); // Todo problem, this should only run once. + values[tid] = tid; + + // Todo can we set this to keys's value instead? + + // Eg. 64, 61 is index 0, 1 + // Then in the secoind run it should be 61, 64 with currently 0, 1 and that should be correct. + + +} + + +// Todo values is empty, we want to fill this array by sorting keys +__global__ void bitonicSortKernel(unsigned int* keys, unsigned int* values, int numElements) { + + unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if (tid >= numElements) return; + + for (int k = 2; k <= numElements; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + unsigned int idxj = tid ^ j; + if (idxj > tid) { + // Ascending order + if ((tid & k) == 0) { + + if (keys[tid] > keys[idxj]){ + swapIfGreater(values[tid], values[idxj], keys[tid], keys[idxj]); + } + + } + // Descending order + else { + + if (keys[tid] < keys[idxj]){ + swapIfGreater(values[idxj], values[tid], keys[idxj], keys[tid]); + } + + } + } + __syncthreads(); + } + } + + + // Here try printing keys and values + //printf("Here: %d\n", keys[tid]); + //printf("Value: %d\n", values[tid]); + + +} + // Find cell start and end indices // can bli reeplaceed med atomicAdd för varje ceell index?? __global__ void findCellStartEnd( @@ -390,8 +480,13 @@ __global__ void enforceBoundary( } } + + void updateParticles(float deltaTime) { + cudaError_t err; + + // Map the VBO resource to get a device pointer size_t num_bytes; cudaGraphicsMapResources(1, &cuda_vbo_resource, 0); @@ -413,12 +508,50 @@ void updateParticles(float deltaTime) // Allocate or resize cellStart and cellEnd arrays if necessary // (You may need to adjust your initialization code to handle dynamic sizes) + // Compute cell indices computeCellIndices<<<blocksPerGrid, threadsPerBlock>>>( d_positions, d_cellIndices, NUM_PARTICLES, gridMin, gridCellSize, gridSizeX, gridSizeY, gridSizeZ); cudaDeviceSynchronize(); + + /* + cudaError_t err; + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("Kernel error1: %s\n", cudaGetErrorString(err)); + } + else{ + printf("GOOD!"); + } + + + err = cudaPeekAtLastError(); + if (err != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(err)); + } + else{ + printf("CUDA GOOD:"); + } + + err = cudaDeviceSynchronize(); +if (err != cudaSuccess) { + printf("Kernel execution error: %s\n", cudaGetErrorString(err)); +} else { + printf("Kernel executed successfully: No errors detected\n"); +} + + + + */ + + + // Wrap device pointers with thrust iterators + // Todo cannot include thrust? + /* + + // Creating pointers used by thrust thrust::device_ptr<unsigned int> dev_cellIndices_ptr(d_cellIndices); thrust::device_ptr<unsigned int> dev_particleIndices_ptr(d_particleIndices); @@ -427,8 +560,24 @@ void updateParticles(float deltaTime) // Sort particle indices based on cell indices thrust::sort_by_key(dev_cellIndices_ptr, dev_cellIndices_ptr + NUM_PARTICLES, dev_particleIndices_ptr); + */ + + + ////////////// + + + // TODO adding bitonic merge sort instead of thrust + + + //helpMe<<<blocksPerGrid, threadsPerBlock>>>(d_cellIndices, d_particleIndices, NUM_PARTICLES); + //bitonicSortKernel<<<blocksPerGrid, threadsPerBlock>>>(d_cellIndices, d_particleIndices, NUM_PARTICLES); + //cudaDeviceSynchronize(); + + + /////////// // Reorder positions and velocities + /* reorderData<<<blocksPerGrid, threadsPerBlock>>>( d_particleIndices, d_positions, @@ -438,6 +587,8 @@ void updateParticles(float deltaTime) NUM_PARTICLES); cudaDeviceSynchronize(); + + // Swap sorted data with original data float3* temp_positions = d_positions; d_positions = d_positions_sorted; @@ -447,6 +598,8 @@ void updateParticles(float deltaTime) d_velocities = d_velocities_sorted; d_velocities_sorted = temp_velocities; + + // Build cell start and end indices // Initialize cellStart and cellEnd cudaMemset(d_cellStart, 0xFF, maxCells * sizeof(unsigned int)); @@ -459,6 +612,8 @@ void updateParticles(float deltaTime) NUM_PARTICLES); cudaDeviceSynchronize(); + */ + // Compute density and pressure computeDensityPressure<<<blocksPerGrid, threadsPerBlock>>>( d_positions, @@ -486,7 +641,7 @@ void updateParticles(float deltaTime) cudaDeviceSynchronize(); // Integrate - float timeStep = 0.008f; + float timeStep = 0.008f; // todo was 0.008f; integrate<<<blocksPerGrid, threadsPerBlock>>>( d_positions, d_velocities, @@ -535,14 +690,17 @@ void initializeConstants() cudaMemcpyToSymbol(BOUND_Y_MAX, &h_BOUND_Y_MAX, sizeof(float)); cudaMemcpyToSymbol(BOUND_Z_MIN, &h_BOUND_Z_MIN, sizeof(float)); cudaMemcpyToSymbol(BOUND_Z_MAX, &h_BOUND_Z_MAX, sizeof(float)); + } + // Register VBO with CUDA void registerVBOWithCUDA(unsigned int vbo) { cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone); } + // Unregister VBO from CUDA void unregisterVBOFromCUDA() { @@ -556,6 +714,7 @@ void initializeDeviceArrays( int numParticles ) { + // Allocate device memory for velocities cudaMalloc(&d_velocities, numParticles * sizeof(float3)); cudaMemcpy(d_velocities, h_velocities, numParticles * sizeof(float3), cudaMemcpyHostToDevice); diff --git a/src/main3.cpp b/src/main3.cpp index b4ed3f8025dd745ec460e9e4a30ce2dd27f5835f..4c5c7d06b813c02e75008c7ad1f6a4cbbb7e601d 100644 --- a/src/main3.cpp +++ b/src/main3.cpp @@ -10,10 +10,10 @@ #ifdef _WIN32 #include <GL/glew.h> // Include GLEW before other OpenGL headers + #include <GL/freeglut.h> #endif -//#include <GL/freeglut.h> #include "MicroGlut.h" #include "GL_utilities.h" @@ -26,8 +26,10 @@ #define MAIN #include "VectorUtils4.h" +#include <iostream> + // Number of particles -const int NUM_PARTICLES = 64000; // Should match a cubic grid for simplicity +const int NUM_PARTICLES = 3200; // Must be a power of 2! const float GRID_SPACING = 0.25f; // Spacing between particles in the grid const float pointRadius = 6.0f; // Point size in world space @@ -125,6 +127,7 @@ void display() deltaTime = 0.0001f; // Update particle positions using CUDA + updateParticles(deltaTime); // Use the shader program @@ -294,6 +297,7 @@ void initialize() registerVBOWithCUDA(vbo); // Initialize device arrays + initializeDeviceArrays( h_positions.data(), h_velocities.data(), @@ -306,6 +310,7 @@ void initialize() // Enable depth testing glEnable(GL_DEPTH_TEST); + } // Cleanup resources