I want to parallelize function in CUDA C which will count all vectors with sum equal of vector elements and elements not bigger of k. For example if number of vector elements n is 5, sum=10 and k=3 than, the number of vectors who satisfy this condition is 101. I've already make this function in cuda c but the problem is when the number of blocks and threads are bigger than 1, I know that the problem is in for cycles and I should to change it but I don't know from where to start. When I am calling the function with blocks and threads equal to one than the function is working on classic way and everything is good but in this case the function is not paralyzed.
The source code of all program is:
#include <stdio.h>
#include<stdlib.h>
#include<assert.h>
#include<cuda.h>
__device__ void count(int *vector, int *total, int n, int s)
{
int i,sum=0;
for(i=blockIdx.x*blockDim.x+threadIdx.x;i<n;i+=blockDim.x*gridDim.x)
{
sum+=vector[i];
__syncthreads();
}
if(sum==s)
{
total[0]=total[0]+1;
}
}
__global__ void computeVectors(int *vector, int n, int kk, int s, int *total)
{
int k=0;
int j,i,next;
while(1)
{
for(j=blockIdx.x*blockDim.x+threadIdx.x; j<=kk; j+=blockDim.x*gridDim.x)
{
vector[k]=j;
count(vector, total, n, s);
__syncthreads();
}
for(i=blockIdx.x*blockDim.x+threadIdx.x; i<n; i+=blockDim.x*gridDim.x)
{
if(vector[i]<kk)
break;
}
next=i;
vector[next]++;
for(i=blockIdx.x*blockDim.x+threadIdx.x; i<sledno; i+=blockDim.x*gridDim.x)
{
vector[i]=0;
__syncthreads();
}
k=0;
if(next>=n)
break;
}
}
int main(){
cudaError_t err = cudaSuccess;
int n,k,sum;
int counter=0;
printf("Enter the length of vector n=");
scanf("%d",&n);
printf("Enter the max value of vector elements k=");
scanf("%d",&k);
printf("Enter the sum of vector elements sum=");
scanf("%d",&sum);
int *vec_h, *vec_d;
size_t sizevec=n*sizeof(int);
vec_h=(int *)malloc(sizevec);
cudaMalloc((void **) &vec_d, sizevec);
for(counter=0; counter<n; counter++)
{
vec_h[counter]=0;
}
cudaMemcpy(vec_d, vec_h, sizevec, cudaMemcpyHostToDevice);
int *total_h, *total_d;
size_t size=1*sizeof(int);
total_h=(int *)malloc(size);
cudaMalloc((void **) &total_d, size);
total_h[0]=0;
cudaMemcpy(total_d, total_h, size, cudaMemcpyHostToDevice);
computeVectors<<<1, 1>>>(vec_d, n, k, sum, total_d);
cudaThreadSynchronize();
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
cudaMemcpy(total_h, total_d, size, cudaMemcpyDeviceToHost);
printf("Number of vectors that satisfy condition is %d\n", total_h[0]);
free(vec_h);
cudaFree(vec_d);
free(total_h);
cudaFree(total_d);
return 0;
}