\BOOKMARK [0][-]{chapter.1}{Introduction to Parallel Processing}{}% 1 \BOOKMARK [1][-]{section.1.1}{Platforms}{chapter.1}% 2 \BOOKMARK [2][-]{subsection.1.1.1}{Why R?}{section.1.1}% 3 \BOOKMARK [1][-]{section.1.2}{Why Use Parallel Systems?}{chapter.1}% 4 \BOOKMARK [2][-]{subsection.1.2.1}{Execution Speed}{section.1.2}% 5 \BOOKMARK [2][-]{subsection.1.2.2}{Memory}{section.1.2}% 6 \BOOKMARK [2][-]{subsection.1.2.3}{Distributed Processing}{section.1.2}% 7 \BOOKMARK [2][-]{subsection.1.2.4}{Our Focus Here}{section.1.2}% 8 \BOOKMARK [1][-]{section.1.3}{Parallel Processing Hardware}{chapter.1}% 9 \BOOKMARK [2][-]{subsection.1.3.1}{Shared-Memory Systems}{section.1.3}% 10 \BOOKMARK [3][-]{subsubsection.1.3.1.1}{Basic Architecture}{subsection.1.3.1}% 11 \BOOKMARK [3][-]{subsubsection.1.3.1.2}{Multiprocessor Topologies}{subsection.1.3.1}% 12 \BOOKMARK [3][-]{subsubsection.1.3.1.3}{Memory Issues Etc.}{subsection.1.3.1}% 13 \BOOKMARK [2][-]{subsection.1.3.2}{Message-Passing Systems}{section.1.3}% 14 \BOOKMARK [3][-]{subsubsection.1.3.2.1}{Basic Architecture}{subsection.1.3.2}% 15 \BOOKMARK [3][-]{subsubsection.1.3.2.2}{Example: Clusters}{subsection.1.3.2}% 16 \BOOKMARK [2][-]{subsection.1.3.3}{SIMD}{section.1.3}% 17 \BOOKMARK [1][-]{section.1.4}{Programmer World Views}{chapter.1}% 18 \BOOKMARK [2][-]{subsection.1.4.1}{Example: Matrix-Vector Multiply}{section.1.4}% 19 \BOOKMARK [2][-]{subsection.1.4.2}{Shared-Memory}{section.1.4}% 20 \BOOKMARK [3][-]{subsubsection.1.4.2.1}{Programmer View}{subsection.1.4.2}% 21 \BOOKMARK [1][-]{section.1.5}{Shared Memory Programming}{chapter.1}% 22 \BOOKMARK [2][-]{subsection.1.5.1}{Low-Level Threads Systems: Pthreads}{section.1.5}% 23 \BOOKMARK [3][-]{subsubsection.1.5.1.1}{Pthreads Example: Finding Primes}{subsection.1.5.1}% 24 \BOOKMARK [2][-]{subsection.1.5.2}{Compiling and Linking}{section.1.5}% 25 \BOOKMARK [2][-]{subsection.1.5.3}{The Notion of a ``Critical Section''}{section.1.5}% 26 \BOOKMARK [2][-]{subsection.1.5.4}{Role of the OS}{section.1.5}% 27 \BOOKMARK [2][-]{subsection.1.5.5}{Debugging Threads Programs}{section.1.5}% 28 \BOOKMARK [1][-]{section.1.6}{A Note on Global Variables}{chapter.1}% 29 \BOOKMARK [2][-]{subsection.1.6.1}{Higher-Level Threads Programming: OpenMP}{section.1.6}% 30 \BOOKMARK [3][-]{subsubsection.1.6.1.1}{Example: Sampling Bucket Sort}{subsection.1.6.1}% 31 \BOOKMARK [3][-]{subsubsection.1.6.1.2}{Debugging OpenMP}{subsection.1.6.1}% 32 \BOOKMARK [2][-]{subsection.1.6.2}{Message Passing}{section.1.6}% 33 \BOOKMARK [3][-]{subsubsection.1.6.2.1}{Programmer View}{subsection.1.6.2}% 34 \BOOKMARK [3][-]{subsubsection.1.6.2.2}{Example: MPI Prime Numbers Finder}{subsection.1.6.2}% 35 \BOOKMARK [2][-]{subsection.1.6.3}{Scatter/Gather}{section.1.6}% 36 \BOOKMARK [3][-]{subsubsection.1.6.3.1}{R snow Package}{subsection.1.6.3}% 37 \BOOKMARK [1][-]{section.1.7}{Threads Programming in R: Rdsm}{chapter.1}% 38 \BOOKMARK [2][-]{subsection.1.7.1}{Example: Matrix Multiplication}{section.1.7}% 39 \BOOKMARK [2][-]{subsection.1.7.2}{Example: Maximal Burst in a Time Series}{section.1.7}% 40 \BOOKMARK [0][-]{chapter.2}{Recurring Performance Issues}{}% 41 \BOOKMARK [1][-]{section.2.1}{Communication Bottlenecks}{chapter.2}% 42 \BOOKMARK [1][-]{section.2.2}{Load Balancing}{chapter.2}% 43 \BOOKMARK [1][-]{section.2.3}{``Embarrassingly Parallel'' Applications}{chapter.2}% 44 \BOOKMARK [2][-]{subsection.2.3.1}{What People Mean by ``Embarrassingly Parallel''}{section.2.3}% 45 \BOOKMARK [2][-]{subsection.2.3.2}{Iterative Algorithms}{section.2.3}% 46 \BOOKMARK [1][-]{section.2.4}{Static \(But Possibly Random\) Task Assignment Typically Better Than Dynamic}{chapter.2}% 47 \BOOKMARK [2][-]{subsection.2.4.1}{Example: Matrix-Vector Multiply}{section.2.4}% 48 \BOOKMARK [2][-]{subsection.2.4.2}{Load Balance, Revisited}{section.2.4}% 49 \BOOKMARK [2][-]{subsection.2.4.3}{Example: Mutual Web Outlinks}{section.2.4}% 50 \BOOKMARK [2][-]{subsection.2.4.4}{Work Stealing}{section.2.4}% 51 \BOOKMARK [2][-]{subsection.2.4.5}{Timing Example}{section.2.4}% 52 \BOOKMARK [2][-]{subsection.2.4.6}{Example: Extracting a Submatrix}{section.2.4}% 53 \BOOKMARK [1][-]{section.2.5}{Latency and Bandwidth}{chapter.2}% 54 \BOOKMARK [1][-]{section.2.6}{Relative Merits: Performance of Shared-Memory Vs. Message-Passing}{chapter.2}% 55 \BOOKMARK [1][-]{section.2.7}{Memory Allocation Issues}{chapter.2}% 56 \BOOKMARK [1][-]{section.2.8}{Issues Particular to Shared-Memory Systems}{chapter.2}% 57 \BOOKMARK [0][-]{chapter.3}{Shared Memory Parallelism}{}% 58 \BOOKMARK [1][-]{section.3.1}{What Is Shared?}{chapter.3}% 59 \BOOKMARK [1][-]{section.3.2}{Memory Modules}{chapter.3}% 60 \BOOKMARK [2][-]{subsection.3.2.1}{Interleaving}{section.3.2}% 61 \BOOKMARK [2][-]{subsection.3.2.2}{Bank Conflicts and Solutions}{section.3.2}% 62 \BOOKMARK [2][-]{subsection.3.2.3}{Example: Code to Implement Padding}{section.3.2}% 63 \BOOKMARK [1][-]{section.3.3}{Interconnection Topologies}{chapter.3}% 64 \BOOKMARK [2][-]{subsection.3.3.1}{SMP Systems}{section.3.3}% 65 \BOOKMARK [2][-]{subsection.3.3.2}{NUMA Systems}{section.3.3}% 66 \BOOKMARK [2][-]{subsection.3.3.3}{NUMA Interconnect Topologies}{section.3.3}% 67 \BOOKMARK [3][-]{subsubsection.3.3.3.1}{Crossbar Interconnects}{subsection.3.3.3}% 68 \BOOKMARK [3][-]{subsubsection.3.3.3.2}{Omega \(or Delta\) Interconnects}{subsection.3.3.3}% 69 \BOOKMARK [2][-]{subsection.3.3.4}{Comparative Analysis}{section.3.3}% 70 \BOOKMARK [2][-]{subsection.3.3.5}{Why Have Memory in Modules?}{section.3.3}% 71 \BOOKMARK [1][-]{section.3.4}{Synchronization Hardware}{chapter.3}% 72 \BOOKMARK [2][-]{subsection.3.4.1}{Test-and-Set Instructions}{section.3.4}% 73 \BOOKMARK [3][-]{subsubsection.3.4.1.1}{LOCK Prefix on Intel Processors}{subsection.3.4.1}% 74 \BOOKMARK [3][-]{subsubsection.3.4.1.2}{Locks with More Complex Interconnects}{subsection.3.4.1}% 75 \BOOKMARK [2][-]{subsection.3.4.2}{May Not Need the Latest}{section.3.4}% 76 \BOOKMARK [2][-]{subsection.3.4.3}{Fetch-and-Add Instructions}{section.3.4}% 77 \BOOKMARK [1][-]{section.3.5}{Cache Issues}{chapter.3}% 78 \BOOKMARK [2][-]{subsection.3.5.1}{Cache Coherency}{section.3.5}% 79 \BOOKMARK [2][-]{subsection.3.5.2}{Example: the MESI Cache Coherency Protocol}{section.3.5}% 80 \BOOKMARK [2][-]{subsection.3.5.3}{The Problem of ``False Sharing''}{section.3.5}% 81 \BOOKMARK [1][-]{section.3.6}{Memory-Access Consistency Policies}{chapter.3}% 82 \BOOKMARK [1][-]{section.3.7}{Fetch-and-Add Combining within Interconnects}{chapter.3}% 83 \BOOKMARK [1][-]{section.3.8}{Multicore Chips}{chapter.3}% 84 \BOOKMARK [1][-]{section.3.9}{Optimal Number of Threads}{chapter.3}% 85 \BOOKMARK [1][-]{section.3.10}{Processor Affinity}{chapter.3}% 86 \BOOKMARK [1][-]{section.3.11}{Illusion of Shared-Memory through Software}{chapter.3}% 87 \BOOKMARK [2][-]{subsection.3.11.1}{Software Distributed Shared Memory}{section.3.11}% 88 \BOOKMARK [2][-]{subsection.3.11.2}{Case Study: JIAJIA}{section.3.11}% 89 \BOOKMARK [1][-]{section.3.12}{Barrier Implementation}{chapter.3}% 90 \BOOKMARK [2][-]{subsection.3.12.1}{A Use-Once Version}{section.3.12}% 91 \BOOKMARK [2][-]{subsection.3.12.2}{An Attempt to Write a Reusable Version}{section.3.12}% 92 \BOOKMARK [2][-]{subsection.3.12.3}{A Correct Version}{section.3.12}% 93 \BOOKMARK [2][-]{subsection.3.12.4}{Refinements}{section.3.12}% 94 \BOOKMARK [3][-]{subsubsection.3.12.4.1}{Use of Wait Operations}{subsection.3.12.4}% 95 \BOOKMARK [3][-]{subsubsection.3.12.4.2}{Parallelizing the Barrier Operation}{subsection.3.12.4}% 96 \BOOKMARK [4][-]{paragraph.3.12.4.2.1}{Tree Barriers}{subsubsection.3.12.4.2}% 97 \BOOKMARK [4][-]{paragraph.3.12.4.2.2}{Butterfly Barriers}{subsubsection.3.12.4.2}% 98 \BOOKMARK [0][-]{chapter.4}{Introduction to OpenMP}{}% 99 \BOOKMARK [1][-]{section.4.1}{Overview}{chapter.4}% 100 \BOOKMARK [1][-]{section.4.2}{Example: Dijkstra Shortest-Path Algorithm}{chapter.4}% 101 \BOOKMARK [2][-]{subsection.4.2.1}{The Algorithm}{section.4.2}% 102 \BOOKMARK [2][-]{subsection.4.2.2}{The OpenMP parallel Pragma}{section.4.2}% 103 \BOOKMARK [2][-]{subsection.4.2.3}{Scope Issues}{section.4.2}% 104 \BOOKMARK [2][-]{subsection.4.2.4}{The OpenMP single Pragma}{section.4.2}% 105 \BOOKMARK [2][-]{subsection.4.2.5}{The OpenMP barrier Pragma}{section.4.2}% 106 \BOOKMARK [2][-]{subsection.4.2.6}{Implicit Barriers}{section.4.2}% 107 \BOOKMARK [2][-]{subsection.4.2.7}{The OpenMP critical Pragma}{section.4.2}% 108 \BOOKMARK [1][-]{section.4.3}{The OpenMP for Pragma}{chapter.4}% 109 \BOOKMARK [2][-]{subsection.4.3.1}{Example: Dijkstra with Parallel for Loops}{section.4.3}% 110 \BOOKMARK [2][-]{subsection.4.3.2}{Nested Loops}{section.4.3}% 111 \BOOKMARK [2][-]{subsection.4.3.3}{Controlling the Partitioning of Work to Threads: the schedule Clause}{section.4.3}% 112 \BOOKMARK [2][-]{subsection.4.3.4}{Example: In-Place Matrix Transpose}{section.4.3}% 113 \BOOKMARK [2][-]{subsection.4.3.5}{The OpenMP reduction Clause}{section.4.3}% 114 \BOOKMARK [1][-]{section.4.4}{Example: Mandelbrot Set}{chapter.4}% 115 \BOOKMARK [1][-]{section.4.5}{The Task Directive}{chapter.4}% 116 \BOOKMARK [2][-]{subsection.4.5.1}{Example: Quicksort}{section.4.5}% 117 \BOOKMARK [1][-]{section.4.6}{Other OpenMP Synchronization Issues}{chapter.4}% 118 \BOOKMARK [2][-]{subsection.4.6.1}{The OpenMP atomic Clause}{section.4.6}% 119 \BOOKMARK [2][-]{subsection.4.6.2}{Memory Consistency and the flush Pragma}{section.4.6}% 120 \BOOKMARK [1][-]{section.4.7}{Combining Work-Sharing Constructs}{chapter.4}% 121 \BOOKMARK [1][-]{section.4.8}{The Rest of OpenMP}{chapter.4}% 122 \BOOKMARK [1][-]{section.4.9}{Compiling, Running and Debugging OpenMP Code}{chapter.4}% 123 \BOOKMARK [2][-]{subsection.4.9.1}{Compiling}{section.4.9}% 124 \BOOKMARK [2][-]{subsection.4.9.2}{Running}{section.4.9}% 125 \BOOKMARK [2][-]{subsection.4.9.3}{Debugging}{section.4.9}% 126 \BOOKMARK [1][-]{section.4.10}{Performance}{chapter.4}% 127 \BOOKMARK [2][-]{subsection.4.10.1}{The Effect of Problem Size}{section.4.10}% 128 \BOOKMARK [2][-]{subsection.4.10.2}{Some Fine Tuning}{section.4.10}% 129 \BOOKMARK [2][-]{subsection.4.10.3}{OpenMP Internals}{section.4.10}% 130 \BOOKMARK [1][-]{section.4.11}{Example: Root Finding}{chapter.4}% 131 \BOOKMARK [1][-]{section.4.12}{Example: Mutual Outlinks}{chapter.4}% 132 \BOOKMARK [1][-]{section.4.13}{Example: Transforming an Adjacency Matrix}{chapter.4}% 133 \BOOKMARK [1][-]{section.4.14}{Example: Finding the Maximal Burst in a Time Series}{chapter.4}% 134 \BOOKMARK [1][-]{section.4.15}{Locks with OpenMP}{chapter.4}% 135 \BOOKMARK [1][-]{section.4.16}{Other Examples of OpenMP Code in This Book}{chapter.4}% 136 \BOOKMARK [0][-]{chapter.5}{Introduction to GPU Programming with CUDA}{}% 137 \BOOKMARK [1][-]{section.5.1}{Overview}{chapter.5}% 138 \BOOKMARK [1][-]{section.5.2}{Some Terminology}{chapter.5}% 139 \BOOKMARK [1][-]{section.5.3}{Example: Calculate Row Sums}{chapter.5}% 140 \BOOKMARK [1][-]{section.5.4}{Understanding the Hardware Structure}{chapter.5}% 141 \BOOKMARK [2][-]{subsection.5.4.1}{Processing Units}{section.5.4}% 142 \BOOKMARK [2][-]{subsection.5.4.2}{Thread Operation}{section.5.4}% 143 \BOOKMARK [3][-]{subsubsection.5.4.2.1}{SIMT Architecture}{subsection.5.4.2}% 144 \BOOKMARK [3][-]{subsubsection.5.4.2.2}{The Problem of Thread Divergence}{subsection.5.4.2}% 145 \BOOKMARK [3][-]{subsubsection.5.4.2.3}{``OS in Hardware''}{subsection.5.4.2}% 146 \BOOKMARK [2][-]{subsection.5.4.3}{Memory Structure}{section.5.4}% 147 \BOOKMARK [3][-]{subsubsection.5.4.3.1}{Shared and Global Memory}{subsection.5.4.3}% 148 \BOOKMARK [3][-]{subsubsection.5.4.3.2}{Global-Memory Performance Issues}{subsection.5.4.3}% 149 \BOOKMARK [3][-]{subsubsection.5.4.3.3}{Shared-Memory Performance Issues}{subsection.5.4.3}% 150 \BOOKMARK [3][-]{subsubsection.5.4.3.4}{Host/Device Memory Transfer Performance Issues}{subsection.5.4.3}% 151 \BOOKMARK [3][-]{subsubsection.5.4.3.5}{Other Types of Memory}{subsection.5.4.3}% 152 \BOOKMARK [2][-]{subsection.5.4.4}{Threads Hierarchy}{section.5.4}% 153 \BOOKMARK [2][-]{subsection.5.4.5}{What's NOT There}{section.5.4}% 154 \BOOKMARK [1][-]{section.5.5}{Synchronization, Within and Between Blocks}{chapter.5}% 155 \BOOKMARK [1][-]{section.5.6}{More on the Blocks/Threads Tradeoff}{chapter.5}% 156 \BOOKMARK [1][-]{section.5.7}{Hardware Requirements, Installation, Compilation, Debugging}{chapter.5}% 157 \BOOKMARK [1][-]{section.5.8}{Example: Improving the Row Sums Program}{chapter.5}% 158 \BOOKMARK [1][-]{section.5.9}{Example: Finding the Mean Number of Mutual Outlinks}{chapter.5}% 159 \BOOKMARK [1][-]{section.5.10}{Example: Finding Prime Numbers}{chapter.5}% 160 \BOOKMARK [1][-]{section.5.11}{Example: Finding Cumulative Sums}{chapter.5}% 161 \BOOKMARK [1][-]{section.5.12}{When Is It Advantageous to Use Shared Memory}{chapter.5}% 162 \BOOKMARK [1][-]{section.5.13}{Example: Transforming an Adjacency Matrix}{chapter.5}% 163 \BOOKMARK [1][-]{section.5.14}{Error Checking}{chapter.5}% 164 \BOOKMARK [1][-]{section.5.15}{Loop Unrolling}{chapter.5}% 165 \BOOKMARK [1][-]{section.5.16}{Short Vectors}{chapter.5}% 166 \BOOKMARK [1][-]{section.5.17}{Newer Generations}{chapter.5}% 167 \BOOKMARK [2][-]{subsection.5.17.1}{True Caching}{section.5.17}% 168 \BOOKMARK [2][-]{subsection.5.17.2}{Unified Memory}{section.5.17}% 169 \BOOKMARK [1][-]{section.5.18}{CUDA from a Higher Level}{chapter.5}% 170 \BOOKMARK [2][-]{subsection.5.18.1}{CUBLAS}{section.5.18}% 171 \BOOKMARK [3][-]{subsubsection.5.18.1.1}{Example: Row Sums Once Again}{subsection.5.18.1}% 172 \BOOKMARK [2][-]{subsection.5.18.2}{Thrust}{section.5.18}% 173 \BOOKMARK [2][-]{subsection.5.18.3}{CUFFT}{section.5.18}% 174 \BOOKMARK [1][-]{section.5.19}{Other CUDA Examples in This Book}{chapter.5}% 175 \BOOKMARK [0][-]{chapter.6}{Introduction to Thrust Programming}{}% 176 \BOOKMARK [1][-]{section.6.1}{Compiling Thrust Code}{chapter.6}% 177 \BOOKMARK [2][-]{subsection.6.1.1}{Compiling to CUDA}{section.6.1}% 178 \BOOKMARK [2][-]{subsection.6.1.2}{Compiling to OpenMP}{section.6.1}% 179 \BOOKMARK [1][-]{section.6.2}{Example: Counting the Number of Unique Values in an Array}{chapter.6}% 180 \BOOKMARK [1][-]{section.6.3}{Example: A Plain-C Wrapper for Thrust sort\(\)}{chapter.6}% 181 \BOOKMARK [1][-]{section.6.4}{Example: Calculating Percentiles in an Array}{chapter.6}% 182 \BOOKMARK [1][-]{section.6.5}{Mixing Thrust and CUDA Code}{chapter.6}% 183 \BOOKMARK [1][-]{section.6.6}{Example: Doubling Every kth Element of an Array}{chapter.6}% 184 \BOOKMARK [1][-]{section.6.7}{Scatter and Gather Operations}{chapter.6}% 185 \BOOKMARK [2][-]{subsection.6.7.1}{Example: Matrix Transpose}{section.6.7}% 186 \BOOKMARK [1][-]{section.6.8}{Advanced \(``Fancy''\) Iterators}{chapter.6}% 187 \BOOKMARK [2][-]{subsection.6.8.1}{Example: Matrix Transpose Again}{section.6.8}% 188 \BOOKMARK [1][-]{section.6.9}{A Timing Comparison}{chapter.6}% 189 \BOOKMARK [1][-]{section.6.10}{Example: Transforming an Adjacency Matrix}{chapter.6}% 190 \BOOKMARK [1][-]{section.6.11}{Prefix Scan}{chapter.6}% 191 \BOOKMARK [1][-]{section.6.12}{More on Use of Thrust for a CUDA Backend}{chapter.6}% 192 \BOOKMARK [2][-]{subsection.6.12.1}{Synchronicity}{section.6.12}% 193 \BOOKMARK [1][-]{section.6.13}{Error Messages}{chapter.6}% 194 \BOOKMARK [1][-]{section.6.14}{Other Examples of Thrust Code in This Book}{chapter.6}% 195 \BOOKMARK [0][-]{chapter.7}{Message Passing Systems}{}% 196 \BOOKMARK [1][-]{section.7.1}{Overview}{chapter.7}% 197 \BOOKMARK [1][-]{section.7.2}{A Historical Example: Hypercubes}{chapter.7}% 198 \BOOKMARK [2][-]{subsection.7.2.1}{Definitions}{section.7.2}% 199 \BOOKMARK [1][-]{section.7.3}{Networks of Workstations \(NOWs\)}{chapter.7}% 200 \BOOKMARK [2][-]{subsection.7.3.1}{The Network Is Literally the Weakest Link}{section.7.3}% 201 \BOOKMARK [2][-]{subsection.7.3.2}{Other Issues}{section.7.3}% 202 \BOOKMARK [1][-]{section.7.4}{Scatter/Gather Operations}{chapter.7}% 203 \BOOKMARK [0][-]{chapter.8}{Introduction to MPI}{}% 204 \BOOKMARK [1][-]{section.8.1}{Overview}{chapter.8}% 205 \BOOKMARK [2][-]{subsection.8.1.1}{History}{section.8.1}% 206 \BOOKMARK [2][-]{subsection.8.1.2}{Structure and Execution}{section.8.1}% 207 \BOOKMARK [2][-]{subsection.8.1.3}{Implementations}{section.8.1}% 208 \BOOKMARK [2][-]{subsection.8.1.4}{Performance Issues}{section.8.1}% 209 \BOOKMARK [1][-]{section.8.2}{Review of Earlier Example}{chapter.8}% 210 \BOOKMARK [1][-]{section.8.3}{Example: Dijkstra Algorithm}{chapter.8}% 211 \BOOKMARK [2][-]{subsection.8.3.1}{The Algorithm}{section.8.3}% 212 \BOOKMARK [2][-]{subsection.8.3.2}{The MPI Code}{section.8.3}% 213 \BOOKMARK [2][-]{subsection.8.3.3}{Introduction to MPI APIs}{section.8.3}% 214 \BOOKMARK [3][-]{subsubsection.8.3.3.1}{MPI\137Init\(\) and MPI\137Finalize\(\)}{subsection.8.3.3}% 215 \BOOKMARK [3][-]{subsubsection.8.3.3.2}{MPI\137Comm\137size\(\) and MPI\137Comm\137rank\(\)}{subsection.8.3.3}% 216 \BOOKMARK [3][-]{subsubsection.8.3.3.3}{MPI\137Send\(\)}{subsection.8.3.3}% 217 \BOOKMARK [3][-]{subsubsection.8.3.3.4}{MPI\137Recv\(\)}{subsection.8.3.3}% 218 \BOOKMARK [1][-]{section.8.4}{Example: Removing 0s from an Array}{chapter.8}% 219 \BOOKMARK [1][-]{section.8.5}{Debugging MPI Code}{chapter.8}% 220 \BOOKMARK [1][-]{section.8.6}{Collective Communications}{chapter.8}% 221 \BOOKMARK [2][-]{subsection.8.6.1}{Example: Refined Dijkstra Code}{section.8.6}% 222 \BOOKMARK [2][-]{subsection.8.6.2}{MPI\137Bcast\(\)}{section.8.6}% 223 \BOOKMARK [2][-]{subsection.8.6.3}{MPI\137Reduce\(\)/MPI\137Allreduce\(\)}{section.8.6}% 224 \BOOKMARK [2][-]{subsection.8.6.4}{MPI\137Gather\(\)/MPI\137Allgather\(\)}{section.8.6}% 225 \BOOKMARK [2][-]{subsection.8.6.5}{The MPI\137Scatter\(\)}{section.8.6}% 226 \BOOKMARK [2][-]{subsection.8.6.6}{Example: Count the Number of Edges in a Directed Graph}{section.8.6}% 227 \BOOKMARK [2][-]{subsection.8.6.7}{Example: Cumulative Sums}{section.8.6}% 228 \BOOKMARK [2][-]{subsection.8.6.8}{Example: an MPI Solution to the Mutual Outlinks Problem}{section.8.6}% 229 \BOOKMARK [2][-]{subsection.8.6.9}{The MPI\137Barrier\(\)}{section.8.6}% 230 \BOOKMARK [2][-]{subsection.8.6.10}{Creating Communicators}{section.8.6}% 231 \BOOKMARK [1][-]{section.8.7}{Buffering, Synchrony and Related Issues}{chapter.8}% 232 \BOOKMARK [2][-]{subsection.8.7.1}{Buffering, Etc.}{section.8.7}% 233 \BOOKMARK [2][-]{subsection.8.7.2}{Safety}{section.8.7}% 234 \BOOKMARK [2][-]{subsection.8.7.3}{Living Dangerously}{section.8.7}% 235 \BOOKMARK [2][-]{subsection.8.7.4}{Safe Exchange Operations}{section.8.7}% 236 \BOOKMARK [1][-]{section.8.8}{Use of MPI from Other Languages}{chapter.8}% 237 \BOOKMARK [1][-]{section.8.9}{Other MPI Examples in This Book}{chapter.8}% 238 \BOOKMARK [0][-]{chapter.9}{MapReduce Computation}{}% 239 \BOOKMARK [1][-]{section.9.1}{Apache Hadoop}{chapter.9}% 240 \BOOKMARK [2][-]{subsection.9.1.1}{Hadoop Streaming}{section.9.1}% 241 \BOOKMARK [2][-]{subsection.9.1.2}{Example: Word Count}{section.9.1}% 242 \BOOKMARK [2][-]{subsection.9.1.3}{Running the Code}{section.9.1}% 243 \BOOKMARK [2][-]{subsection.9.1.4}{Analysis of the Code}{section.9.1}% 244 \BOOKMARK [2][-]{subsection.9.1.5}{Role of Disk Files}{section.9.1}% 245 \BOOKMARK [1][-]{section.9.2}{Other MapReduce Systems}{chapter.9}% 246 \BOOKMARK [1][-]{section.9.3}{R Interfaces to MapReduce Systems}{chapter.9}% 247 \BOOKMARK [1][-]{section.9.4}{An Alternative: ``Snowdoop''}{chapter.9}% 248 \BOOKMARK [2][-]{subsection.9.4.1}{Example: Snowdoop Word Count}{section.9.4}% 249 \BOOKMARK [2][-]{subsection.9.4.2}{Example: Snowdoop k-Means Clustering}{section.9.4}% 250 \BOOKMARK [0][-]{chapter.10}{The Parallel Prefix Problem}{}% 251 \BOOKMARK [1][-]{section.10.1}{Example: Permutations}{chapter.10}% 252 \BOOKMARK [1][-]{section.10.2}{General Strategies for Parallel Scan Computation}{chapter.10}% 253 \BOOKMARK [1][-]{section.10.3}{Implementations}{chapter.10}% 254 \BOOKMARK [1][-]{section.10.4}{Example: Parallel Prefix Summing in OpenMP}{chapter.10}% 255 \BOOKMARK [1][-]{section.10.5}{Example: Run-Length Coding Decompression in OpenMP}{chapter.10}% 256 \BOOKMARK [1][-]{section.10.6}{Example: Run-Length Coding Decompression in Thrust}{chapter.10}% 257 \BOOKMARK [1][-]{section.10.7}{Example: Moving Average}{chapter.10}% 258 \BOOKMARK [2][-]{subsection.10.7.1}{Rth Code}{section.10.7}% 259 \BOOKMARK [2][-]{subsection.10.7.2}{Algorithm}{section.10.7}% 260 \BOOKMARK [2][-]{subsection.10.7.3}{Use of Lambda Functions}{section.10.7}% 261 \BOOKMARK [0][-]{chapter.11}{Introduction to Parallel Matrix Operations}{}% 262 \BOOKMARK [1][-]{section.11.1}{``We're Not in Physicsland Anymore, Toto''}{chapter.11}% 263 \BOOKMARK [1][-]{section.11.2}{Partitioned Matrices}{chapter.11}% 264 \BOOKMARK [1][-]{section.11.3}{Parallel Matrix Multiplication}{chapter.11}% 265 \BOOKMARK [2][-]{subsection.11.3.1}{Message-Passing Case}{section.11.3}% 266 \BOOKMARK [3][-]{subsubsection.11.3.1.1}{Fox's Algorithm}{subsection.11.3.1}% 267 \BOOKMARK [3][-]{subsubsection.11.3.1.2}{Performance Issues}{subsection.11.3.1}% 268 \BOOKMARK [2][-]{subsection.11.3.2}{Shared-Memory Case}{section.11.3}% 269 \BOOKMARK [3][-]{subsubsection.11.3.2.1}{Example: Matrix Multiply in OpenMP}{subsection.11.3.2}% 270 \BOOKMARK [3][-]{subsubsection.11.3.2.2}{Example: Matrix Multiply in CUDA}{subsection.11.3.2}% 271 \BOOKMARK [2][-]{subsection.11.3.3}{R Snow}{section.11.3}% 272 \BOOKMARK [2][-]{subsection.11.3.4}{R Interfaces to GPUs}{section.11.3}% 273 \BOOKMARK [1][-]{section.11.4}{Finding Powers of Matrices}{chapter.11}% 274 \BOOKMARK [2][-]{subsection.11.4.1}{Example: Graph Connectedness}{section.11.4}% 275 \BOOKMARK [2][-]{subsection.11.4.2}{Example: Fibonacci Numbers}{section.11.4}% 276 \BOOKMARK [2][-]{subsection.11.4.3}{Example: Matrix Inversion}{section.11.4}% 277 \BOOKMARK [2][-]{subsection.11.4.4}{Parallel Computation}{section.11.4}% 278 \BOOKMARK [1][-]{section.11.5}{Solving Systems of Linear Equations}{chapter.11}% 279 \BOOKMARK [2][-]{subsection.11.5.1}{Gaussian Elimination}{section.11.5}% 280 \BOOKMARK [2][-]{subsection.11.5.2}{Example: Gaussian Elimination in CUDA}{section.11.5}% 281 \BOOKMARK [2][-]{subsection.11.5.3}{The Jacobi Algorithm}{section.11.5}% 282 \BOOKMARK [2][-]{subsection.11.5.4}{Example: OpenMP Implementation of the Jacobi Algorithm}{section.11.5}% 283 \BOOKMARK [2][-]{subsection.11.5.5}{Example: R/gputools Implementation of Jacobi}{section.11.5}% 284 \BOOKMARK [1][-]{section.11.6}{Eigenvalues and Eigenvectors}{chapter.11}% 285 \BOOKMARK [2][-]{subsection.11.6.1}{The Power Method}{section.11.6}% 286 \BOOKMARK [2][-]{subsection.11.6.2}{Parallel Computation}{section.11.6}% 287 \BOOKMARK [1][-]{section.11.7}{Sparse Matrices}{chapter.11}% 288 \BOOKMARK [1][-]{section.11.8}{Libraries}{chapter.11}% 289 \BOOKMARK [0][-]{chapter.12}{Introduction to Parallel Sorting}{}% 290 \BOOKMARK [1][-]{section.12.1}{Quicksort}{chapter.12}% 291 \BOOKMARK [2][-]{subsection.12.1.1}{The Separation Process}{section.12.1}% 292 \BOOKMARK [2][-]{subsection.12.1.2}{Example: OpenMP Quicksort}{section.12.1}% 293 \BOOKMARK [2][-]{subsection.12.1.3}{Hyperquicksort}{section.12.1}% 294 \BOOKMARK [1][-]{section.12.2}{Mergesorts}{chapter.12}% 295 \BOOKMARK [2][-]{subsection.12.2.1}{Sequential Form}{section.12.2}% 296 \BOOKMARK [2][-]{subsection.12.2.2}{Shared-Memory Mergesort}{section.12.2}% 297 \BOOKMARK [2][-]{subsection.12.2.3}{Message Passing Mergesort on a Tree Topology}{section.12.2}% 298 \BOOKMARK [2][-]{subsection.12.2.4}{Compare-Exchange Operations}{section.12.2}% 299 \BOOKMARK [2][-]{subsection.12.2.5}{Bitonic Mergesort}{section.12.2}% 300 \BOOKMARK [1][-]{section.12.3}{The Bubble Sort and Its Cousins}{chapter.12}% 301 \BOOKMARK [2][-]{subsection.12.3.1}{The Much-Maligned Bubble Sort}{section.12.3}% 302 \BOOKMARK [2][-]{subsection.12.3.2}{A Popular Variant: Odd-Even Transposition}{section.12.3}% 303 \BOOKMARK [2][-]{subsection.12.3.3}{Example: CUDA Implementation of Odd/Even Transposition Sort}{section.12.3}% 304 \BOOKMARK [1][-]{section.12.4}{Shearsort}{chapter.12}% 305 \BOOKMARK [1][-]{section.12.5}{Bucket Sort with Sampling}{chapter.12}% 306 \BOOKMARK [1][-]{section.12.6}{Radix Sort}{chapter.12}% 307 \BOOKMARK [1][-]{section.12.7}{Enumeration Sort}{chapter.12}% 308 \BOOKMARK [0][-]{chapter.13}{Parallel Computation for Audio and Image Processing}{}% 309 \BOOKMARK [1][-]{section.13.1}{General Principles}{chapter.13}% 310 \BOOKMARK [2][-]{subsection.13.1.1}{One-Dimensional Fourier Series}{section.13.1}% 311 \BOOKMARK [2][-]{subsection.13.1.2}{Two-Dimensional Fourier Series}{section.13.1}% 312 \BOOKMARK [1][-]{section.13.2}{Discrete Fourier Transforms}{chapter.13}% 313 \BOOKMARK [2][-]{subsection.13.2.1}{One-Dimensional Data}{section.13.2}% 314 \BOOKMARK [2][-]{subsection.13.2.2}{Inversion}{section.13.2}% 315 \BOOKMARK [3][-]{subsubsection.13.2.2.1}{Alternate Formulation}{subsection.13.2.2}% 316 \BOOKMARK [2][-]{subsection.13.2.3}{Two-Dimensional Data}{section.13.2}% 317 \BOOKMARK [1][-]{section.13.3}{Parallel Computation of Discrete Fourier Transforms}{chapter.13}% 318 \BOOKMARK [2][-]{subsection.13.3.1}{The Fast Fourier Transform}{section.13.3}% 319 \BOOKMARK [2][-]{subsection.13.3.2}{A Matrix Approach}{section.13.3}% 320 \BOOKMARK [2][-]{subsection.13.3.3}{Parallelizing Computation of the Inverse Transform}{section.13.3}% 321 \BOOKMARK [2][-]{subsection.13.3.4}{Parallelizing Computation of the Two-Dimensional Transform}{section.13.3}% 322 \BOOKMARK [1][-]{section.13.4}{Available FFT Software}{chapter.13}% 323 \BOOKMARK [2][-]{subsection.13.4.1}{R}{section.13.4}% 324 \BOOKMARK [2][-]{subsection.13.4.2}{CUFFT}{section.13.4}% 325 \BOOKMARK [2][-]{subsection.13.4.3}{FFTW}{section.13.4}% 326 \BOOKMARK [1][-]{section.13.5}{Applications to Image Processing}{chapter.13}% 327 \BOOKMARK [2][-]{subsection.13.5.1}{Smoothing}{section.13.5}% 328 \BOOKMARK [2][-]{subsection.13.5.2}{Example: Audio Smoothing in R}{section.13.5}% 329 \BOOKMARK [2][-]{subsection.13.5.3}{Edge Detection}{section.13.5}% 330 \BOOKMARK [1][-]{section.13.6}{R Access to Sound and Image Files}{chapter.13}% 331 \BOOKMARK [1][-]{section.13.7}{Keeping the Pixel Intensities in the Proper Range}{chapter.13}% 332 \BOOKMARK [1][-]{section.13.8}{Does the Function g\(\) Really Have to Be Repeating?}{chapter.13}% 333 \BOOKMARK [1][-]{section.13.9}{Vector Space Issues \(optional section\)}{chapter.13}% 334 \BOOKMARK [1][-]{section.13.10}{Bandwidth: How to Read the San Francisco Chronicle Business Page \(optional section\)}{chapter.13}% 335 \BOOKMARK [0][-]{chapter.14}{Parallel Computation in Statistics/Data Mining}{}% 336 \BOOKMARK [1][-]{section.14.1}{Itemset Analysis}{chapter.14}% 337 \BOOKMARK [2][-]{subsection.14.1.1}{What Is It?}{section.14.1}% 338 \BOOKMARK [2][-]{subsection.14.1.2}{The Market Basket Problem}{section.14.1}% 339 \BOOKMARK [2][-]{subsection.14.1.3}{Serial Algorithms}{section.14.1}% 340 \BOOKMARK [2][-]{subsection.14.1.4}{Parallelizing the Apriori Algorithm}{section.14.1}% 341 \BOOKMARK [1][-]{section.14.2}{Probability Density Estimation}{chapter.14}% 342 \BOOKMARK [2][-]{subsection.14.2.1}{Kernel-Based Density Estimation}{section.14.2}% 343 \BOOKMARK [2][-]{subsection.14.2.2}{Histogram Computation for Images}{section.14.2}% 344 \BOOKMARK [1][-]{section.14.3}{Clustering}{chapter.14}% 345 \BOOKMARK [2][-]{subsection.14.3.1}{Example: k-Means Clustering in R}{section.14.3}% 346 \BOOKMARK [1][-]{section.14.4}{Principal Component Analysis \(PCA\)}{chapter.14}% 347 \BOOKMARK [1][-]{section.14.5}{Monte Carlo Simulation}{chapter.14}% 348 \BOOKMARK [0][-]{appendix.A}{Miscellaneous Systems Issues}{}% 349 \BOOKMARK [1][-]{section.A.1}{Timesharing}{appendix.A}% 350 \BOOKMARK [2][-]{subsection.A.1.1}{Many Processes, Taking Turns}{section.A.1}% 351 \BOOKMARK [1][-]{section.A.2}{Memory Hierarchies}{appendix.A}% 352 \BOOKMARK [2][-]{subsection.A.2.1}{Cache Memory}{section.A.2}% 353 \BOOKMARK [2][-]{subsection.A.2.2}{Virtual Memory}{section.A.2}% 354 \BOOKMARK [3][-]{subsubsection.A.2.2.1}{Make Sure You Understand the Goals}{subsection.A.2.2}% 355 \BOOKMARK [3][-]{subsubsection.A.2.2.2}{How It Works}{subsection.A.2.2}% 356 \BOOKMARK [2][-]{subsection.A.2.3}{Performance Issues}{section.A.2}% 357 \BOOKMARK [1][-]{section.A.3}{Array Issues}{appendix.A}% 358 \BOOKMARK [2][-]{subsection.A.3.1}{Storage}{section.A.3}% 359 \BOOKMARK [2][-]{subsection.A.3.2}{Subarrays}{section.A.3}% 360 \BOOKMARK [2][-]{subsection.A.3.3}{Memory Allocation}{section.A.3}% 361 \BOOKMARK [0][-]{appendix.B}{Review of Matrix Algebra}{}% 362 \BOOKMARK [1][-]{section.B.1}{Terminology and Notation}{appendix.B}% 363 \BOOKMARK [2][-]{subsection.B.1.1}{Matrix Addition and Multiplication}{section.B.1}% 364 \BOOKMARK [1][-]{section.B.2}{Matrix Transpose}{appendix.B}% 365 \BOOKMARK [1][-]{section.B.3}{Linear Independence}{appendix.B}% 366 \BOOKMARK [1][-]{section.B.4}{Determinants}{appendix.B}% 367 \BOOKMARK [1][-]{section.B.5}{Matrix Inverse}{appendix.B}% 368 \BOOKMARK [1][-]{section.B.6}{Eigenvalues and Eigenvectors}{appendix.B}% 369 \BOOKMARK [1][-]{section.B.7}{Rank of a Matrix}{appendix.B}% 370 \BOOKMARK [1][-]{section.B.8}{Matrix Algebra in R}{appendix.B}% 371 \BOOKMARK [0][-]{appendix.C}{R Quick Start}{}% 372 \BOOKMARK [1][-]{section.C.1}{Correspondences}{appendix.C}% 373 \BOOKMARK [1][-]{section.C.2}{Starting R}{appendix.C}% 374 \BOOKMARK [1][-]{section.C.3}{First Sample Programming Session}{appendix.C}% 375 \BOOKMARK [1][-]{section.C.4}{Vectorization}{appendix.C}% 376 \BOOKMARK [1][-]{section.C.5}{Second Sample Programming Session}{appendix.C}% 377 \BOOKMARK [1][-]{section.C.6}{Recycling}{appendix.C}% 378 \BOOKMARK [1][-]{section.C.7}{More on Vectorization}{appendix.C}% 379 \BOOKMARK [1][-]{section.C.8}{Third Sample Programming Session}{appendix.C}% 380 \BOOKMARK [1][-]{section.C.9}{Default Argument Values}{appendix.C}% 381 \BOOKMARK [1][-]{section.C.10}{The R List Type}{appendix.C}% 382 \BOOKMARK [2][-]{subsection.C.10.1}{The Basics}{section.C.10}% 383 \BOOKMARK [2][-]{subsection.C.10.2}{The Reduce\(\) Function}{section.C.10}% 384 \BOOKMARK [2][-]{subsection.C.10.3}{S3 Classes}{section.C.10}% 385 \BOOKMARK [1][-]{section.C.11}{Some Workhorse Functions}{appendix.C}% 386 \BOOKMARK [1][-]{section.C.12}{Handy Utilities}{appendix.C}% 387 \BOOKMARK [1][-]{section.C.13}{Data Frames}{appendix.C}% 388 \BOOKMARK [1][-]{section.C.14}{Graphics}{appendix.C}% 389 \BOOKMARK [1][-]{section.C.15}{Packages}{appendix.C}% 390 \BOOKMARK [1][-]{section.C.16}{Other Sources for Learning R}{appendix.C}% 391 \BOOKMARK [1][-]{section.C.17}{Online Help}{appendix.C}% 392 \BOOKMARK [1][-]{section.C.18}{Debugging in R}{appendix.C}% 393 \BOOKMARK [1][-]{section.C.19}{Complex Numbers}{appendix.C}% 394 \BOOKMARK [1][-]{section.C.20}{Further Reading}{appendix.C}% 395