Q1: Intel, GPU Q2: Enclose lines 47 and 48 in a 'critical' pragma. Q3: Adding the pragma would result in each thread doing only some iterations. This is already a problem, as we had aimed to do, say 100 iterations but if the number of threads is, say, 4, then we will do only 25 iterations. We might lose some accuracy here, but the code will otherwise run correctly. Q4: // takes a graph adjacency matrix for a directed graph, and converts it // to a 2-column matrix of pairs (i,j), meaning an edge from vertex i to // vertex j; the rows of the output matrix can be in any order #include #include #include // arguments: // adjm: the adjacency matrix (NOT assumed symmetric), 1 for edge, 0 // otherwise; note: matrix is overwritten by the function // n: number of rows and columns of adjm // nout: output, number of rows in returned matrix actually used // return value: // pointer to the converted matrix int *transgraph(int *adjm, int n, int *nout) { int *outm = malloc(2 * n*n * sizeof(int)); // worst-case size // outm to become the output matrix, 2 x whatever int nextoutmrow = 0; // number of next row to be written to outm #pragma omp parallel { int i,j; int me = omp_get_thread_num(), nth = omp_get_num_threads(); int myrows[2]; // range of input rows for my thread to process int myoutmrow; // current row in outm that I will write findmyrange(n,nth,me,myrows); // I will write a row to outm for each input 1 in my range of rows; // but first, how many rows will I write? int mynum1s = 0; for (i = myrows[0]; i <= myrows[1]; i++) { for (j = 0; j < n; j++) if (adjm[n*i+j] == 1) mynum1s++; } #pragma omp critical { myoutmrow = nextoutmrow; nextoutmrow += mynum1s; } // now write my rows of outm for (i = myrows[0]; i <= myrows[1]; i++) { for (j = 0; j < n; j++) if (adjm[n*i+j] == 1) { outm[2*myoutmrow] = i; outm[2*myoutmrow+1] = j; myoutmrow++; } } #pragma omp barrier #pragma omp single *nout = nextoutmrow; } *nout = nextoutmrow; // give back the unused part of outm realloc(outm,2*nextoutmrow*sizeof(int)); return outm; } int main() { int i,j; int *adjm; int n = 4; int nout; int *outm; omp_set_num_threads(2); // random test input adjm = malloc(n*n*sizeof(int)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { adjm[n*i+j] = (i*j*j+1) % 2; printf("%d ",adjm[n*i+j]); } printf("\n"); } outm = transgraph(adjm,n,&nout); // check results for (i = 0; i < nout; i++) printf("%d %d\n",outm[2*i],outm[2*i+1]); } // finds chunk among 0,...,n-1 to assign to thread number me among nth // threads void findmyrange(int n,int nth,int me,int *myrange) { int chunksize = n / nth; myrange[0] = me * chunksize; if (me < nth-1) myrange[1] = (me+1) * chunksize - 1; else myrange[1] = n - 1; }