/*

build instructions, e.g. for my files in /home/nm:

export R_LIBS_USER=/home/nm/R  # my R libraries
export PKG_LIBS="-lgomp"
export PKG_CXXFLAGS="-fopenmp -I/home/nm/R/Rcpp/include"
R CMD SHLIB AdjRcpp.cpp

before start R, set number of threads, e.g. 

export OMP_NUM_THREADS=2

in R:

> library(Rcpp)
# m is the matrix to convert
> .Call("transgraph",m)

*/


#include <Rcpp.h>
#include <omp.h>

// finds the chunk of rows this thread will process
void findmyrange(int n,int nth,int me,int *myrange)
{  int chunksize = n / nth;
   myrange[0] = me * chunksize;
   if (me < nth-1) myrange[1] = (me+1) * chunksize - 1;
   else myrange[1] = n - 1;
}

// SEXP is the internal data type for R objects, as seen from C/C++;
// here are input is an R matrix adjm, and the return value is another
// R matrix
RcppExport SEXP transgraph(SEXP adjm)
{  
   // i-th element of num1s will be the number of 1s 
   // in row i of adjm
   int *num1s,  
       *cumul1s,  // cumulative sums in num1s
       n;
   // make a C/C++ compatible view of the R matrix;
   // note: referencing is done with ( , ) not [ , ],
   // and indices start at 0
   Rcpp::NumericMatrix xadjm(adjm);
   n = xadjm.nrow();
   int n2 = n*n;
   // create output matrix
   Rcpp::NumericMatrix outm(n2,2);

   // start the threads; they will run the 
   // code below simultaneously, though not
   // necessarily executing the same line 
   // at the same time
   #pragma omp parallel
   {  int i,j,m;
      // each thread has an ID (starting at 0), so
      // determine the ID for this thread
      int me = omp_get_thread_num(),
          // find total number of threads
          nth = omp_get_num_threads();
      int myrows[2];
      int tot1s;
      int outrow,num1si;
      // have just one thread execute the
      // following block, while the others wait
      #pragma omp single
      { 
         num1s = (int *) malloc(n*sizeof(int)); 
         cumul1s = (int *) malloc((n+1)*sizeof(int)); 
      }
      findmyrange(n,nth,me,myrows);
      for (i = myrows[0]; i <= myrows[1]; i++) {
         // number of 1s found in this row
         tot1s = 0;  
         for (j = 0; j < n; j++) 
            if (xadjm(i,j) == 1) {
               xadjm(i,(tot1s++)) = j;
            }
         num1s[i] = tot1s;
      }
      // wait for all threads to be done
      #pragma omp barrier  
      // again, one thread does the
      // following, others wait
      #pragma omp single
      {  
         // cumul1s[i] will be tot 1s before row i of xadjm
         cumul1s[0] = 0;  
         // now calculate where the output of each row in 
         // xadjm should start in outm
         for (m = 1; m <= n; m++) {
            cumul1s[m] = cumul1s[m-1] + num1s[m-1];
         }
      }
      // now this thread will put the rows it 
      // found earlier into outm
      for (i = myrows[0]; i <= myrows[1]; i++) {
         outrow = cumul1s[i];  // current row within outm
         num1si = num1s[i];
         for (j = 0; j < num1si; j++) {
            outm(outrow+j,0) = i + 1;
            outm(outrow+j,1) = xadjm(i,j) + 1;
         }
      }
   }
   // have some all-0 rows at end of outm; delete them
   Rcpp::NumericMatrix outmshort = 
      outm(Rcpp::Range(0,cumul1s[n]-1),Rcpp::Range(0,1));
   return outmshort;  // R object returned, all done!
}