Hi,
I have following MPI code
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ARRAYSIZE 2000
#define MASTER 0
int data[ARRAYSIZE];
int main(int argc, char* argv[])
{
int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;
int mysum;
long sum;
int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;
double totaltime;
FILE *fp;
char line[128];
char element;
int n;
int k=0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname, &namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname);
chunksize = (ARRAYSIZE / numtasks);
tag2 = 1;
tag1 = 2;
if (taskid == MASTER){
fp=fopen("integers.txt", "r");
if(fp != NULL){
sum = 0;
while(fgets(line, sizeof line, fp)!= NULL){
fscanf(fp,"%d",&data[k]);
sum = sum + data[k];
k++;
}
}
printf("Initialized array sum %d", sum);
offset = chunksize;
for (dest=1; dest<numtasks; dest++) {
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD);
printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset);
offset = offset + chunksize;
}
offset = 0;
mysum = run_kernel(&data[offset], chunksize);
printf("Kernel returns sum %d", mysum);
for (i=1; i<numtasks; i++) {
source = i;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);
}
MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
printf("\n*** Final sum= %d ***\n",sum);
}
if (taskid > MASTER) {
start= MPI_Wtime();
source = MASTER;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status);
mysum = run_kernel(&data[offset], chunksize);
printf("\nKernel returns sum %d ", mysum);
stop = MPI_Wtime();
time = stop -start;
printf("time taken by process %d to recieve elements and caluclate own sum is = %lf seconds \n", taskid, time);
dest = MASTER;
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD);
MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
}
MPI_Finalize();
}
int update(int myoffset, int chunk, int myid) {
int i,j;
int mysum = 0;
for(i=myoffset; i < myoffset + chunk; i++) {
mysum = mysum + data[i];
}
printf("Task %d has sum = %d\n",myid,mysum);
return(mysum);
}
and I have following cuda code
#include <stdio.h>
__global__ void add(int *devarray, int *devsum)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
devsum = devsum + devarray[index];
}
extern "C"
int * run_kernel(int array[],int nelements)
{
int *devarray, *sum, *devsum;
printf("\nrun_kernel called..............");
cudaMalloc((void**) &devarray, sizeof(int)*nelements);
cudaMalloc((void**) &devsum, sizeof(int));
cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);
add<<<2, 3>>>(devarray, devsum);
cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
printf(" \nthe sum is %d", sum);
cudaFree(devarray);
return sum;
}
I am getting following output
Here is my output when I run above code -
MPI task 0 has started on host
MPI task 1 has started on host
MPI task 2 has started on host
MPI task 3 has started on host
Initialized array sum 9061Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500
[node4] *** Process received signal ***
run_kernel called..............
[node4:04786] Signal: Segmentation fault (11)
[node4:04786] Signal code: Invalid permissions (2)
[node4:04786] Failing at address: 0x8049828
[node4:04786] [ 0] [0xaf440c]
[node4:04786] [ 1] /usr/lib/libcuda.so(+0x13a0f6) [0xfa10f6]
[node4:04786] [ 2] /usr/lib/libcuda.so(+0x146912) [0xfad912]
[node4:04786] [ 3] /usr/lib/libcuda.so(+0x148094) [0xfaf094]
[node4:04786] [ 4] /usr/lib/libcuda.so(+0x13ca50) [0xfa3a50]
[node4:04786] [ 5] /usr/lib/libcuda.so(+0x11863c) [0xf7f63c]
[node4:04786] [ 6] /usr/lib/libcuda.so(+0x11d167) [0xf84167]
[node4:04786] [ 7] /usr/lib/libcuda.so(cuMemcpyDtoH_v2+0x64) [0xf74014]
[node4:04786] [ 8] /usr/local/cuda/lib/libcudart.so.4(+0x2037b) [0xcbe37b]
[node4:04786] [ 9] /usr/local/cuda/lib/libcudart.so.4(cudaMemcpy+0x230) [0xcf1360]
[node4:04786] [10] mpi_array(run_kernel+0x135) [0x8049559]
[node4:04786] [11] mpi_array(main+0x2f2) [0x8049046]
[node4:04786] [12] /lib/libc.so.6(__libc_start_main+0xe6) [0x2fece6]
[node4:04786] [13] mpi_array() [0x8048cc1]
[node4:04786] *** End of error message ***
Kernel returns sum 134530992 time taken by process 1 to recieve elements and caluclate own sum is = 0.276339 seconds
run_kernel called..............
devsum is 3211264
the sum is 134532992
Kernel returns sum 134532992 time taken by process 2 to recieve elements and caluclate own sum is = 0.280452 seconds
run_kernel called..............
devsum is 3211264
the sum is 134534992
Kernel returns sum 134534992 time taken by process 3 to recieve elements and caluclate own sum is = 0.285010 seconds
------------------------------------------------------------- -------------
mpirun noticed that process rank 0 with PID 4786 on node ecm-c-l-207-004.uniwa.uwa.edu.au exited on signal 11 (Segmentation fault).
Perhaps the sum does not look correct. Not sure what is causing segmentation fault. Can anyone help?
Thanks
|