Q1:  Intel, GPU

Q2:  Enclose lines 47 and 48 in a 'critical' pragma.

Q3:  Adding the pragma would result in each thread doing only some
iterations.  This is already a problem, as we had aimed to do, say 100
iterations but if the number of threads is, say, 4, then we will do only
25 iterations.  We might lose some accuracy here, but the code will
otherwise run correctly.

Q4:

// takes a graph adjacency matrix for a directed graph, and converts it
// to a 2-column matrix of pairs (i,j), meaning an edge from vertex i to
// vertex j; the rows of the output matrix can be in any order

#include <omp.h>
#include <stdlib.h>
#include <stdio.h>

// arguments:
//    adjm:  the adjacency matrix (NOT assumed symmetric), 1 for edge, 0
//           otherwise; note: matrix is overwritten by the function
//    n:  number of rows and columns of adjm
//    nout:  output, number of rows in returned matrix actually used
// return value: 
//    pointer to the converted matrix

int *transgraph(int *adjm, int n, int *nout)
{  
   int *outm = malloc(2 * n*n * sizeof(int));  // worst-case size
   // outm to become the output matrix, 2 x whatever
   int nextoutmrow = 0;  // number of next row to be written to outm
   #pragma omp parallel
   {  int i,j;
      int me = omp_get_thread_num(),
          nth = omp_get_num_threads();
      int myrows[2];  // range of input rows for my thread to process
      int myoutmrow;  // current row in outm that I will write
      findmyrange(n,nth,me,myrows); 
      // I will write a row to outm for each input 1 in my range of rows;
      // but first, how many rows will I write?
      int mynum1s = 0;
      for (i = myrows[0]; i <= myrows[1]; i++) {
         for (j = 0; j < n; j++) 
            if (adjm[n*i+j] == 1) mynum1s++;
      }
      #pragma omp critical
      {
         myoutmrow = nextoutmrow;
         nextoutmrow += mynum1s;
      }
      // now write my rows of outm
      for (i = myrows[0]; i <= myrows[1]; i++) {
         for (j = 0; j < n; j++) 
            if (adjm[n*i+j] == 1) {
               outm[2*myoutmrow] = i;
               outm[2*myoutmrow+1] = j;
               myoutmrow++;
            }
      }
      #pragma omp barrier
      #pragma omp single
      *nout = nextoutmrow;
   }
   *nout = nextoutmrow;
   // give back the unused part of outm
   realloc(outm,2*nextoutmrow*sizeof(int));
   return outm;
}

int main()
{  int i,j;
   int *adjm;
   int n = 4;
   int nout;
   int *outm;
   omp_set_num_threads(2);
   // random test input
   adjm = malloc(n*n*sizeof(int));
   for (i = 0; i < n; i++)  {
      for (j = 0; j < n; j++) {
         adjm[n*i+j] = (i*j*j+1) % 2;
         printf("%d ",adjm[n*i+j]);
      }
      printf("\n");
   }

   outm = transgraph(adjm,n,&nout);

   // check results
   for (i = 0; i < nout; i++) 
      printf("%d %d\n",outm[2*i],outm[2*i+1]);
}

// finds chunk among 0,...,n-1 to assign to thread number me among nth
// threads
void findmyrange(int n,int nth,int me,int *myrange)
{  int chunksize = n / nth;
   myrange[0] = me * chunksize;
   if (me < nth-1) myrange[1] = (me+1) * chunksize - 1;
   else myrange[1] = n - 1;
}