| #include <math.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
|
|
|
|
| #include "cluster_gpu.h" |
| #include "cuda_utils.h" |
|
|
|
|
| __device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) { |
| float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2); |
| return sqrt(dis); |
| } |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| __global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx, |
| int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) { |
| |
| |
| |
| int bs_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| if (bs_idx >= b) return; |
|
|
| xyz += bs_idx * n * 3; |
| idx += bs_idx * n; |
| pts_cnt += bs_idx * n; |
| pts_stack += bs_idx * n; |
| pts_adj += bs_idx * n * n; |
|
|
| for(int i=0;i<n;i++) { |
| pts_cnt[i] = 0; |
| for(int j=0;j<n;j++) { |
| pts_adj[i * n + j] = -1; |
| if(i==j) continue; |
| float x1 = xyz[i * 3 + 0]; |
| float y1 = xyz[i * 3 + 1]; |
| float z1 = xyz[i * 3 + 2]; |
| float x2 = xyz[j * 3 + 0]; |
| float y2 = xyz[j * 3 + 1]; |
| float z2 = xyz[j * 3 + 2]; |
|
|
| if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue; |
| if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) { |
| pts_adj[i * n + pts_cnt[i]] = j; |
| pts_cnt[i] += 1; |
| } |
|
|
| } |
| } |
|
|
| int cluster_idx = 0; |
|
|
| for(int i=0;i<n;i++) { |
| if(idx[i] != -1) continue; |
|
|
| if(pts_cnt[i] >= min_pts) { |
| for(int j=0;j<n;j++) |
| pts_stack[j] = -1; |
| pts_stack[0] = i; |
| int stack_idx = 0; |
| int stack_len = 1; |
| while (stack_idx < n && pts_stack[stack_idx] != -1) |
| { |
| int pts_idx = pts_stack[stack_idx]; |
| idx[pts_idx] = cluster_idx; |
| if(pts_cnt[pts_idx] < min_pts){ |
| stack_idx += 1; |
| continue; |
| } |
| for(int j=0;j<n;j++) { |
| int adj = pts_adj[pts_idx * n + j]; |
| if (adj == -1) break; |
| if (idx[adj] == -1) |
| { |
| idx[adj] = -2; |
| pts_stack[stack_len++] = adj; |
| } |
| } |
| stack_idx += 1; |
| } |
| cluster_idx += 1; |
| } |
| } |
| } |
|
|
|
|
| void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) { |
| |
| |
| |
|
|
| cudaError_t err; |
|
|
| dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); |
| dim3 threads(THREADS_PER_BLOCK); |
|
|
| int* pts_cnt; |
| int* pts_stack; |
| int* pts_adj; |
|
|
| err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int)); |
| if (cudaSuccess != err) { |
| fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| exit(-1); |
| } |
|
|
| err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int)); |
| if (cudaSuccess != err) { |
| fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| exit(-1); |
| } |
|
|
| err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int)); |
| if (cudaSuccess != err) { |
| fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| exit(-1); |
| } |
|
|
| dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack); |
| |
| cudaFree(pts_cnt); |
| cudaFree(pts_stack); |
| cudaFree(pts_adj); |
| err = cudaGetLastError(); |
| if (cudaSuccess != err) { |
| fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| exit(-1); |
| } |
| } |
|
|
|
|
|
|
| __global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx, |
| float *__restrict__ new_xyz, int *__restrict__ num) { |
| int bs_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| if (bs_idx >= b ) return; |
|
|
| xyz += bs_idx * n * 3; |
| idx += bs_idx * n; |
| new_xyz += bs_idx * m * 3; |
| num += bs_idx * m; |
|
|
| for(int i=0;i<n;i++) { |
| if (idx[i] == -1) continue; |
| int c_idx = idx[i]; |
| new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0]; |
| new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1]; |
| new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2]; |
| num[c_idx] += 1; |
| } |
| for(int i=0;i<m;i++) { |
| if (num[i] == 0) break; |
| new_xyz[i * 3 + 0] /= num[i]; |
| new_xyz[i * 3 + 1] /= num[i]; |
| new_xyz[i * 3 + 2] /= num[i]; |
| } |
|
|
| } |
|
|
|
|
|
|
|
|
| void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) { |
| cudaError_t err; |
|
|
| dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); |
| dim3 threads(THREADS_PER_BLOCK); |
|
|
| cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num); |
| |
| err = cudaGetLastError(); |
| if (cudaSuccess != err) { |
| fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| exit(-1); |
| } |
| } |
|
|
|
|
|
|