\BOOKMARK [0][-]{chapter.1}{Introduction to Parallel Processing}{}% 1
\BOOKMARK [1][-]{section.1.1}{Platforms}{chapter.1}% 2
\BOOKMARK [2][-]{subsection.1.1.1}{Why R?}{section.1.1}% 3
\BOOKMARK [1][-]{section.1.2}{Why Use Parallel Systems?}{chapter.1}% 4
\BOOKMARK [2][-]{subsection.1.2.1}{Execution Speed}{section.1.2}% 5
\BOOKMARK [2][-]{subsection.1.2.2}{Memory}{section.1.2}% 6
\BOOKMARK [2][-]{subsection.1.2.3}{Distributed Processing}{section.1.2}% 7
\BOOKMARK [2][-]{subsection.1.2.4}{Our Focus Here}{section.1.2}% 8
\BOOKMARK [1][-]{section.1.3}{Parallel Processing Hardware}{chapter.1}% 9
\BOOKMARK [2][-]{subsection.1.3.1}{Shared-Memory Systems}{section.1.3}% 10
\BOOKMARK [3][-]{subsubsection.1.3.1.1}{Basic Architecture}{subsection.1.3.1}% 11
\BOOKMARK [3][-]{subsubsection.1.3.1.2}{Multiprocessor Topologies}{subsection.1.3.1}% 12
\BOOKMARK [3][-]{subsubsection.1.3.1.3}{Memory Issues Etc.}{subsection.1.3.1}% 13
\BOOKMARK [2][-]{subsection.1.3.2}{Message-Passing Systems}{section.1.3}% 14
\BOOKMARK [3][-]{subsubsection.1.3.2.1}{Basic Architecture}{subsection.1.3.2}% 15
\BOOKMARK [3][-]{subsubsection.1.3.2.2}{Example: Clusters}{subsection.1.3.2}% 16
\BOOKMARK [2][-]{subsection.1.3.3}{SIMD}{section.1.3}% 17
\BOOKMARK [1][-]{section.1.4}{Programmer World Views}{chapter.1}% 18
\BOOKMARK [2][-]{subsection.1.4.1}{Example: Matrix-Vector Multiply}{section.1.4}% 19
\BOOKMARK [2][-]{subsection.1.4.2}{Shared-Memory}{section.1.4}% 20
\BOOKMARK [3][-]{subsubsection.1.4.2.1}{Programmer View}{subsection.1.4.2}% 21
\BOOKMARK [1][-]{section.1.5}{Shared Memory Programming}{chapter.1}% 22
\BOOKMARK [2][-]{subsection.1.5.1}{Low-Level Threads Systems: Pthreads}{section.1.5}% 23
\BOOKMARK [3][-]{subsubsection.1.5.1.1}{Pthreads Example: Finding Primes}{subsection.1.5.1}% 24
\BOOKMARK [2][-]{subsection.1.5.2}{Compiling and Linking}{section.1.5}% 25
\BOOKMARK [2][-]{subsection.1.5.3}{The Notion of a ``Critical Section''}{section.1.5}% 26
\BOOKMARK [2][-]{subsection.1.5.4}{Role of the OS}{section.1.5}% 27
\BOOKMARK [2][-]{subsection.1.5.5}{Debugging Threads Programs}{section.1.5}% 28
\BOOKMARK [1][-]{section.1.6}{A Note on Global Variables}{chapter.1}% 29
\BOOKMARK [2][-]{subsection.1.6.1}{Higher-Level Threads Programming: OpenMP}{section.1.6}% 30
\BOOKMARK [3][-]{subsubsection.1.6.1.1}{Example: Sampling Bucket Sort}{subsection.1.6.1}% 31
\BOOKMARK [3][-]{subsubsection.1.6.1.2}{Debugging OpenMP}{subsection.1.6.1}% 32
\BOOKMARK [2][-]{subsection.1.6.2}{Message Passing}{section.1.6}% 33
\BOOKMARK [3][-]{subsubsection.1.6.2.1}{Programmer View}{subsection.1.6.2}% 34
\BOOKMARK [3][-]{subsubsection.1.6.2.2}{Example: MPI Prime Numbers Finder}{subsection.1.6.2}% 35
\BOOKMARK [2][-]{subsection.1.6.3}{Scatter/Gather}{section.1.6}% 36
\BOOKMARK [3][-]{subsubsection.1.6.3.1}{R snow Package}{subsection.1.6.3}% 37
\BOOKMARK [1][-]{section.1.7}{Threads Programming in R: Rdsm}{chapter.1}% 38
\BOOKMARK [2][-]{subsection.1.7.1}{Example: Matrix Multiplication}{section.1.7}% 39
\BOOKMARK [2][-]{subsection.1.7.2}{Example: Maximal Burst in a Time Series}{section.1.7}% 40
\BOOKMARK [0][-]{chapter.2}{Recurring Performance Issues}{}% 41
\BOOKMARK [1][-]{section.2.1}{Communication Bottlenecks}{chapter.2}% 42
\BOOKMARK [1][-]{section.2.2}{Load Balancing}{chapter.2}% 43
\BOOKMARK [1][-]{section.2.3}{``Embarrassingly Parallel'' Applications}{chapter.2}% 44
\BOOKMARK [2][-]{subsection.2.3.1}{What People Mean by ``Embarrassingly Parallel''}{section.2.3}% 45
\BOOKMARK [2][-]{subsection.2.3.2}{Iterative Algorithms}{section.2.3}% 46
\BOOKMARK [1][-]{section.2.4}{Static \(But Possibly Random\) Task Assignment Typically Better Than Dynamic}{chapter.2}% 47
\BOOKMARK [2][-]{subsection.2.4.1}{Example: Matrix-Vector Multiply}{section.2.4}% 48
\BOOKMARK [2][-]{subsection.2.4.2}{Load Balance, Revisited}{section.2.4}% 49
\BOOKMARK [2][-]{subsection.2.4.3}{Example: Mutual Web Outlinks}{section.2.4}% 50
\BOOKMARK [2][-]{subsection.2.4.4}{Work Stealing}{section.2.4}% 51
\BOOKMARK [2][-]{subsection.2.4.5}{Timing Example}{section.2.4}% 52
\BOOKMARK [2][-]{subsection.2.4.6}{Example: Extracting a Submatrix}{section.2.4}% 53
\BOOKMARK [1][-]{section.2.5}{Latency and Bandwidth}{chapter.2}% 54
\BOOKMARK [1][-]{section.2.6}{Relative Merits: Performance of Shared-Memory Vs. Message-Passing}{chapter.2}% 55
\BOOKMARK [1][-]{section.2.7}{Memory Allocation Issues}{chapter.2}% 56
\BOOKMARK [1][-]{section.2.8}{Issues Particular to Shared-Memory Systems}{chapter.2}% 57
\BOOKMARK [0][-]{chapter.3}{Shared Memory Parallelism}{}% 58
\BOOKMARK [1][-]{section.3.1}{What Is Shared?}{chapter.3}% 59
\BOOKMARK [1][-]{section.3.2}{Memory Modules}{chapter.3}% 60
\BOOKMARK [2][-]{subsection.3.2.1}{Interleaving}{section.3.2}% 61
\BOOKMARK [2][-]{subsection.3.2.2}{Bank Conflicts and Solutions}{section.3.2}% 62
\BOOKMARK [2][-]{subsection.3.2.3}{Example: Code to Implement Padding}{section.3.2}% 63
\BOOKMARK [1][-]{section.3.3}{Interconnection Topologies}{chapter.3}% 64
\BOOKMARK [2][-]{subsection.3.3.1}{SMP Systems}{section.3.3}% 65
\BOOKMARK [2][-]{subsection.3.3.2}{NUMA Systems}{section.3.3}% 66
\BOOKMARK [2][-]{subsection.3.3.3}{NUMA Interconnect Topologies}{section.3.3}% 67
\BOOKMARK [3][-]{subsubsection.3.3.3.1}{Crossbar Interconnects}{subsection.3.3.3}% 68
\BOOKMARK [3][-]{subsubsection.3.3.3.2}{Omega \(or Delta\) Interconnects}{subsection.3.3.3}% 69
\BOOKMARK [2][-]{subsection.3.3.4}{Comparative Analysis}{section.3.3}% 70
\BOOKMARK [2][-]{subsection.3.3.5}{Why Have Memory in Modules?}{section.3.3}% 71
\BOOKMARK [1][-]{section.3.4}{Synchronization Hardware}{chapter.3}% 72
\BOOKMARK [2][-]{subsection.3.4.1}{Test-and-Set Instructions}{section.3.4}% 73
\BOOKMARK [3][-]{subsubsection.3.4.1.1}{LOCK Prefix on Intel Processors}{subsection.3.4.1}% 74
\BOOKMARK [3][-]{subsubsection.3.4.1.2}{Locks with More Complex Interconnects}{subsection.3.4.1}% 75
\BOOKMARK [2][-]{subsection.3.4.2}{May Not Need the Latest}{section.3.4}% 76
\BOOKMARK [2][-]{subsection.3.4.3}{Fetch-and-Add Instructions}{section.3.4}% 77
\BOOKMARK [1][-]{section.3.5}{Cache Issues}{chapter.3}% 78
\BOOKMARK [2][-]{subsection.3.5.1}{Cache Coherency}{section.3.5}% 79
\BOOKMARK [2][-]{subsection.3.5.2}{Example: the MESI Cache Coherency Protocol}{section.3.5}% 80
\BOOKMARK [2][-]{subsection.3.5.3}{The Problem of ``False Sharing''}{section.3.5}% 81
\BOOKMARK [1][-]{section.3.6}{Memory-Access Consistency Policies}{chapter.3}% 82
\BOOKMARK [1][-]{section.3.7}{Fetch-and-Add Combining within Interconnects}{chapter.3}% 83
\BOOKMARK [1][-]{section.3.8}{Multicore Chips}{chapter.3}% 84
\BOOKMARK [1][-]{section.3.9}{Optimal Number of Threads}{chapter.3}% 85
\BOOKMARK [1][-]{section.3.10}{Processor Affinity}{chapter.3}% 86
\BOOKMARK [1][-]{section.3.11}{Illusion of Shared-Memory through Software}{chapter.3}% 87
\BOOKMARK [2][-]{subsection.3.11.1}{Software Distributed Shared Memory}{section.3.11}% 88
\BOOKMARK [2][-]{subsection.3.11.2}{Case Study: JIAJIA}{section.3.11}% 89
\BOOKMARK [1][-]{section.3.12}{Barrier Implementation}{chapter.3}% 90
\BOOKMARK [2][-]{subsection.3.12.1}{A Use-Once Version}{section.3.12}% 91
\BOOKMARK [2][-]{subsection.3.12.2}{An Attempt to Write a Reusable Version}{section.3.12}% 92
\BOOKMARK [2][-]{subsection.3.12.3}{A Correct Version}{section.3.12}% 93
\BOOKMARK [2][-]{subsection.3.12.4}{Refinements}{section.3.12}% 94
\BOOKMARK [3][-]{subsubsection.3.12.4.1}{Use of Wait Operations}{subsection.3.12.4}% 95
\BOOKMARK [3][-]{subsubsection.3.12.4.2}{Parallelizing the Barrier Operation}{subsection.3.12.4}% 96
\BOOKMARK [4][-]{paragraph.3.12.4.2.1}{Tree Barriers}{subsubsection.3.12.4.2}% 97
\BOOKMARK [4][-]{paragraph.3.12.4.2.2}{Butterfly Barriers}{subsubsection.3.12.4.2}% 98
\BOOKMARK [0][-]{chapter.4}{Introduction to OpenMP}{}% 99
\BOOKMARK [1][-]{section.4.1}{Overview}{chapter.4}% 100
\BOOKMARK [1][-]{section.4.2}{Example: Dijkstra Shortest-Path Algorithm}{chapter.4}% 101
\BOOKMARK [2][-]{subsection.4.2.1}{The Algorithm}{section.4.2}% 102
\BOOKMARK [2][-]{subsection.4.2.2}{The OpenMP parallel Pragma}{section.4.2}% 103
\BOOKMARK [2][-]{subsection.4.2.3}{Scope Issues}{section.4.2}% 104
\BOOKMARK [2][-]{subsection.4.2.4}{The OpenMP single Pragma}{section.4.2}% 105
\BOOKMARK [2][-]{subsection.4.2.5}{The OpenMP barrier Pragma}{section.4.2}% 106
\BOOKMARK [2][-]{subsection.4.2.6}{Implicit Barriers}{section.4.2}% 107
\BOOKMARK [2][-]{subsection.4.2.7}{The OpenMP critical Pragma}{section.4.2}% 108
\BOOKMARK [1][-]{section.4.3}{The OpenMP for Pragma}{chapter.4}% 109
\BOOKMARK [2][-]{subsection.4.3.1}{Example: Dijkstra with Parallel for Loops}{section.4.3}% 110
\BOOKMARK [2][-]{subsection.4.3.2}{Nested Loops}{section.4.3}% 111
\BOOKMARK [2][-]{subsection.4.3.3}{Controlling the Partitioning of Work to Threads: the schedule Clause}{section.4.3}% 112
\BOOKMARK [2][-]{subsection.4.3.4}{Example: In-Place Matrix Transpose}{section.4.3}% 113
\BOOKMARK [2][-]{subsection.4.3.5}{The OpenMP reduction Clause}{section.4.3}% 114
\BOOKMARK [1][-]{section.4.4}{Example: Mandelbrot Set}{chapter.4}% 115
\BOOKMARK [1][-]{section.4.5}{The Task Directive}{chapter.4}% 116
\BOOKMARK [2][-]{subsection.4.5.1}{Example: Quicksort}{section.4.5}% 117
\BOOKMARK [1][-]{section.4.6}{Other OpenMP Synchronization Issues}{chapter.4}% 118
\BOOKMARK [2][-]{subsection.4.6.1}{The OpenMP atomic Clause}{section.4.6}% 119
\BOOKMARK [2][-]{subsection.4.6.2}{Memory Consistency and the flush Pragma}{section.4.6}% 120
\BOOKMARK [1][-]{section.4.7}{Combining Work-Sharing Constructs}{chapter.4}% 121
\BOOKMARK [1][-]{section.4.8}{The Rest of OpenMP}{chapter.4}% 122
\BOOKMARK [1][-]{section.4.9}{Compiling, Running and Debugging OpenMP Code}{chapter.4}% 123
\BOOKMARK [2][-]{subsection.4.9.1}{Compiling}{section.4.9}% 124
\BOOKMARK [2][-]{subsection.4.9.2}{Running}{section.4.9}% 125
\BOOKMARK [2][-]{subsection.4.9.3}{Debugging}{section.4.9}% 126
\BOOKMARK [1][-]{section.4.10}{Performance}{chapter.4}% 127
\BOOKMARK [2][-]{subsection.4.10.1}{The Effect of Problem Size}{section.4.10}% 128
\BOOKMARK [2][-]{subsection.4.10.2}{Some Fine Tuning}{section.4.10}% 129
\BOOKMARK [2][-]{subsection.4.10.3}{OpenMP Internals}{section.4.10}% 130
\BOOKMARK [1][-]{section.4.11}{Example: Root Finding}{chapter.4}% 131
\BOOKMARK [1][-]{section.4.12}{Example: Mutual Outlinks}{chapter.4}% 132
\BOOKMARK [1][-]{section.4.13}{Example: Transforming an Adjacency Matrix}{chapter.4}% 133
\BOOKMARK [1][-]{section.4.14}{Example: Finding the Maximal Burst in a Time Series}{chapter.4}% 134
\BOOKMARK [1][-]{section.4.15}{Locks with OpenMP}{chapter.4}% 135
\BOOKMARK [1][-]{section.4.16}{Other Examples of OpenMP Code in This Book}{chapter.4}% 136
\BOOKMARK [0][-]{chapter.5}{Introduction to GPU Programming with CUDA}{}% 137
\BOOKMARK [1][-]{section.5.1}{Overview}{chapter.5}% 138
\BOOKMARK [1][-]{section.5.2}{Some Terminology}{chapter.5}% 139
\BOOKMARK [1][-]{section.5.3}{Example: Calculate Row Sums}{chapter.5}% 140
\BOOKMARK [1][-]{section.5.4}{Understanding the Hardware Structure}{chapter.5}% 141
\BOOKMARK [2][-]{subsection.5.4.1}{Processing Units}{section.5.4}% 142
\BOOKMARK [2][-]{subsection.5.4.2}{Thread Operation}{section.5.4}% 143
\BOOKMARK [3][-]{subsubsection.5.4.2.1}{SIMT Architecture}{subsection.5.4.2}% 144
\BOOKMARK [3][-]{subsubsection.5.4.2.2}{The Problem of Thread Divergence}{subsection.5.4.2}% 145
\BOOKMARK [3][-]{subsubsection.5.4.2.3}{``OS in Hardware''}{subsection.5.4.2}% 146
\BOOKMARK [2][-]{subsection.5.4.3}{Memory Structure}{section.5.4}% 147
\BOOKMARK [3][-]{subsubsection.5.4.3.1}{Shared and Global Memory}{subsection.5.4.3}% 148
\BOOKMARK [3][-]{subsubsection.5.4.3.2}{Global-Memory Performance Issues}{subsection.5.4.3}% 149
\BOOKMARK [3][-]{subsubsection.5.4.3.3}{Shared-Memory Performance Issues}{subsection.5.4.3}% 150
\BOOKMARK [3][-]{subsubsection.5.4.3.4}{Host/Device Memory Transfer Performance Issues}{subsection.5.4.3}% 151
\BOOKMARK [3][-]{subsubsection.5.4.3.5}{Other Types of Memory}{subsection.5.4.3}% 152
\BOOKMARK [2][-]{subsection.5.4.4}{Threads Hierarchy}{section.5.4}% 153
\BOOKMARK [2][-]{subsection.5.4.5}{What's NOT There}{section.5.4}% 154
\BOOKMARK [1][-]{section.5.5}{Synchronization, Within and Between Blocks}{chapter.5}% 155
\BOOKMARK [1][-]{section.5.6}{More on the Blocks/Threads Tradeoff}{chapter.5}% 156
\BOOKMARK [1][-]{section.5.7}{Hardware Requirements, Installation, Compilation, Debugging}{chapter.5}% 157
\BOOKMARK [1][-]{section.5.8}{Example: Improving the Row Sums Program}{chapter.5}% 158
\BOOKMARK [1][-]{section.5.9}{Example: Finding the Mean Number of Mutual Outlinks}{chapter.5}% 159
\BOOKMARK [1][-]{section.5.10}{Example: Finding Prime Numbers}{chapter.5}% 160
\BOOKMARK [1][-]{section.5.11}{Example: Finding Cumulative Sums}{chapter.5}% 161
\BOOKMARK [1][-]{section.5.12}{When Is It Advantageous to Use Shared Memory}{chapter.5}% 162
\BOOKMARK [1][-]{section.5.13}{Example: Transforming an Adjacency Matrix}{chapter.5}% 163
\BOOKMARK [1][-]{section.5.14}{Error Checking}{chapter.5}% 164
\BOOKMARK [1][-]{section.5.15}{Loop Unrolling}{chapter.5}% 165
\BOOKMARK [1][-]{section.5.16}{Short Vectors}{chapter.5}% 166
\BOOKMARK [1][-]{section.5.17}{Newer Generations}{chapter.5}% 167
\BOOKMARK [2][-]{subsection.5.17.1}{True Caching}{section.5.17}% 168
\BOOKMARK [2][-]{subsection.5.17.2}{Unified Memory}{section.5.17}% 169
\BOOKMARK [1][-]{section.5.18}{CUDA from a Higher Level}{chapter.5}% 170
\BOOKMARK [2][-]{subsection.5.18.1}{CUBLAS}{section.5.18}% 171
\BOOKMARK [3][-]{subsubsection.5.18.1.1}{Example: Row Sums Once Again}{subsection.5.18.1}% 172
\BOOKMARK [2][-]{subsection.5.18.2}{Thrust}{section.5.18}% 173
\BOOKMARK [2][-]{subsection.5.18.3}{CUFFT}{section.5.18}% 174
\BOOKMARK [1][-]{section.5.19}{Other CUDA Examples in This Book}{chapter.5}% 175
\BOOKMARK [0][-]{chapter.6}{Introduction to Thrust Programming}{}% 176
\BOOKMARK [1][-]{section.6.1}{Compiling Thrust Code}{chapter.6}% 177
\BOOKMARK [2][-]{subsection.6.1.1}{Compiling to CUDA}{section.6.1}% 178
\BOOKMARK [2][-]{subsection.6.1.2}{Compiling to OpenMP}{section.6.1}% 179
\BOOKMARK [1][-]{section.6.2}{Example: Counting the Number of Unique Values in an Array}{chapter.6}% 180
\BOOKMARK [1][-]{section.6.3}{Example: A Plain-C Wrapper for Thrust sort\(\)}{chapter.6}% 181
\BOOKMARK [1][-]{section.6.4}{Example: Calculating Percentiles in an Array}{chapter.6}% 182
\BOOKMARK [1][-]{section.6.5}{Mixing Thrust and CUDA Code}{chapter.6}% 183
\BOOKMARK [1][-]{section.6.6}{Example: Doubling Every kth Element of an Array}{chapter.6}% 184
\BOOKMARK [1][-]{section.6.7}{Scatter and Gather Operations}{chapter.6}% 185
\BOOKMARK [2][-]{subsection.6.7.1}{Example: Matrix Transpose}{section.6.7}% 186
\BOOKMARK [1][-]{section.6.8}{Advanced \(``Fancy''\) Iterators}{chapter.6}% 187
\BOOKMARK [2][-]{subsection.6.8.1}{Example: Matrix Transpose Again}{section.6.8}% 188
\BOOKMARK [1][-]{section.6.9}{A Timing Comparison}{chapter.6}% 189
\BOOKMARK [1][-]{section.6.10}{Example: Transforming an Adjacency Matrix}{chapter.6}% 190
\BOOKMARK [1][-]{section.6.11}{Prefix Scan}{chapter.6}% 191
\BOOKMARK [1][-]{section.6.12}{More on Use of Thrust for a CUDA Backend}{chapter.6}% 192
\BOOKMARK [2][-]{subsection.6.12.1}{Synchronicity}{section.6.12}% 193
\BOOKMARK [1][-]{section.6.13}{Error Messages}{chapter.6}% 194
\BOOKMARK [1][-]{section.6.14}{Other Examples of Thrust Code in This Book}{chapter.6}% 195
\BOOKMARK [0][-]{chapter.7}{Message Passing Systems}{}% 196
\BOOKMARK [1][-]{section.7.1}{Overview}{chapter.7}% 197
\BOOKMARK [1][-]{section.7.2}{A Historical Example: Hypercubes}{chapter.7}% 198
\BOOKMARK [2][-]{subsection.7.2.1}{Definitions}{section.7.2}% 199
\BOOKMARK [1][-]{section.7.3}{Networks of Workstations \(NOWs\)}{chapter.7}% 200
\BOOKMARK [2][-]{subsection.7.3.1}{The Network Is Literally the Weakest Link}{section.7.3}% 201
\BOOKMARK [2][-]{subsection.7.3.2}{Other Issues}{section.7.3}% 202
\BOOKMARK [1][-]{section.7.4}{Scatter/Gather Operations}{chapter.7}% 203
\BOOKMARK [0][-]{chapter.8}{Introduction to MPI}{}% 204
\BOOKMARK [1][-]{section.8.1}{Overview}{chapter.8}% 205
\BOOKMARK [2][-]{subsection.8.1.1}{History}{section.8.1}% 206
\BOOKMARK [2][-]{subsection.8.1.2}{Structure and Execution}{section.8.1}% 207
\BOOKMARK [2][-]{subsection.8.1.3}{Implementations}{section.8.1}% 208
\BOOKMARK [2][-]{subsection.8.1.4}{Performance Issues}{section.8.1}% 209
\BOOKMARK [1][-]{section.8.2}{Review of Earlier Example}{chapter.8}% 210
\BOOKMARK [1][-]{section.8.3}{Example: Dijkstra Algorithm}{chapter.8}% 211
\BOOKMARK [2][-]{subsection.8.3.1}{The Algorithm}{section.8.3}% 212
\BOOKMARK [2][-]{subsection.8.3.2}{The MPI Code}{section.8.3}% 213
\BOOKMARK [2][-]{subsection.8.3.3}{Introduction to MPI APIs}{section.8.3}% 214
\BOOKMARK [3][-]{subsubsection.8.3.3.1}{MPI\137Init\(\) and MPI\137Finalize\(\)}{subsection.8.3.3}% 215
\BOOKMARK [3][-]{subsubsection.8.3.3.2}{MPI\137Comm\137size\(\) and MPI\137Comm\137rank\(\)}{subsection.8.3.3}% 216
\BOOKMARK [3][-]{subsubsection.8.3.3.3}{MPI\137Send\(\)}{subsection.8.3.3}% 217
\BOOKMARK [3][-]{subsubsection.8.3.3.4}{MPI\137Recv\(\)}{subsection.8.3.3}% 218
\BOOKMARK [1][-]{section.8.4}{Example: Removing 0s from an Array}{chapter.8}% 219
\BOOKMARK [1][-]{section.8.5}{Debugging MPI Code}{chapter.8}% 220
\BOOKMARK [1][-]{section.8.6}{Collective Communications}{chapter.8}% 221
\BOOKMARK [2][-]{subsection.8.6.1}{Example: Refined Dijkstra Code}{section.8.6}% 222
\BOOKMARK [2][-]{subsection.8.6.2}{MPI\137Bcast\(\)}{section.8.6}% 223
\BOOKMARK [2][-]{subsection.8.6.3}{MPI\137Reduce\(\)/MPI\137Allreduce\(\)}{section.8.6}% 224
\BOOKMARK [2][-]{subsection.8.6.4}{MPI\137Gather\(\)/MPI\137Allgather\(\)}{section.8.6}% 225
\BOOKMARK [2][-]{subsection.8.6.5}{The MPI\137Scatter\(\)}{section.8.6}% 226
\BOOKMARK [2][-]{subsection.8.6.6}{Example: Count the Number of Edges in a Directed Graph}{section.8.6}% 227
\BOOKMARK [2][-]{subsection.8.6.7}{Example: Cumulative Sums}{section.8.6}% 228
\BOOKMARK [2][-]{subsection.8.6.8}{Example: an MPI Solution to the Mutual Outlinks Problem}{section.8.6}% 229
\BOOKMARK [2][-]{subsection.8.6.9}{The MPI\137Barrier\(\)}{section.8.6}% 230
\BOOKMARK [2][-]{subsection.8.6.10}{Creating Communicators}{section.8.6}% 231
\BOOKMARK [1][-]{section.8.7}{Buffering, Synchrony and Related Issues}{chapter.8}% 232
\BOOKMARK [2][-]{subsection.8.7.1}{Buffering, Etc.}{section.8.7}% 233
\BOOKMARK [2][-]{subsection.8.7.2}{Safety}{section.8.7}% 234
\BOOKMARK [2][-]{subsection.8.7.3}{Living Dangerously}{section.8.7}% 235
\BOOKMARK [2][-]{subsection.8.7.4}{Safe Exchange Operations}{section.8.7}% 236
\BOOKMARK [1][-]{section.8.8}{Use of MPI from Other Languages}{chapter.8}% 237
\BOOKMARK [1][-]{section.8.9}{Other MPI Examples in This Book}{chapter.8}% 238
\BOOKMARK [0][-]{chapter.9}{MapReduce Computation}{}% 239
\BOOKMARK [1][-]{section.9.1}{Apache Hadoop}{chapter.9}% 240
\BOOKMARK [2][-]{subsection.9.1.1}{Hadoop Streaming}{section.9.1}% 241
\BOOKMARK [2][-]{subsection.9.1.2}{Example: Word Count}{section.9.1}% 242
\BOOKMARK [2][-]{subsection.9.1.3}{Running the Code}{section.9.1}% 243
\BOOKMARK [2][-]{subsection.9.1.4}{Analysis of the Code}{section.9.1}% 244
\BOOKMARK [2][-]{subsection.9.1.5}{Role of Disk Files}{section.9.1}% 245
\BOOKMARK [1][-]{section.9.2}{Other MapReduce Systems}{chapter.9}% 246
\BOOKMARK [1][-]{section.9.3}{R Interfaces to MapReduce Systems}{chapter.9}% 247
\BOOKMARK [1][-]{section.9.4}{An Alternative: ``Snowdoop''}{chapter.9}% 248
\BOOKMARK [2][-]{subsection.9.4.1}{Example: Snowdoop Word Count}{section.9.4}% 249
\BOOKMARK [2][-]{subsection.9.4.2}{Example: Snowdoop k-Means Clustering}{section.9.4}% 250
\BOOKMARK [0][-]{chapter.10}{The Parallel Prefix Problem}{}% 251
\BOOKMARK [1][-]{section.10.1}{Example: Permutations}{chapter.10}% 252
\BOOKMARK [1][-]{section.10.2}{General Strategies for Parallel Scan Computation}{chapter.10}% 253
\BOOKMARK [1][-]{section.10.3}{Implementations}{chapter.10}% 254
\BOOKMARK [1][-]{section.10.4}{Example: Parallel Prefix Summing in OpenMP}{chapter.10}% 255
\BOOKMARK [1][-]{section.10.5}{Example: Run-Length Coding Decompression in OpenMP}{chapter.10}% 256
\BOOKMARK [1][-]{section.10.6}{Example: Run-Length Coding Decompression in Thrust}{chapter.10}% 257
\BOOKMARK [1][-]{section.10.7}{Example: Moving Average}{chapter.10}% 258
\BOOKMARK [2][-]{subsection.10.7.1}{Rth Code}{section.10.7}% 259
\BOOKMARK [2][-]{subsection.10.7.2}{Algorithm}{section.10.7}% 260
\BOOKMARK [2][-]{subsection.10.7.3}{Use of Lambda Functions}{section.10.7}% 261
\BOOKMARK [0][-]{chapter.11}{Introduction to Parallel Matrix Operations}{}% 262
\BOOKMARK [1][-]{section.11.1}{``We're Not in Physicsland Anymore, Toto''}{chapter.11}% 263
\BOOKMARK [1][-]{section.11.2}{Partitioned Matrices}{chapter.11}% 264
\BOOKMARK [1][-]{section.11.3}{Parallel Matrix Multiplication}{chapter.11}% 265
\BOOKMARK [2][-]{subsection.11.3.1}{Message-Passing Case}{section.11.3}% 266
\BOOKMARK [3][-]{subsubsection.11.3.1.1}{Fox's Algorithm}{subsection.11.3.1}% 267
\BOOKMARK [3][-]{subsubsection.11.3.1.2}{Performance Issues}{subsection.11.3.1}% 268
\BOOKMARK [2][-]{subsection.11.3.2}{Shared-Memory Case}{section.11.3}% 269
\BOOKMARK [3][-]{subsubsection.11.3.2.1}{Example: Matrix Multiply in OpenMP}{subsection.11.3.2}% 270
\BOOKMARK [3][-]{subsubsection.11.3.2.2}{Example: Matrix Multiply in CUDA}{subsection.11.3.2}% 271
\BOOKMARK [2][-]{subsection.11.3.3}{R Snow}{section.11.3}% 272
\BOOKMARK [2][-]{subsection.11.3.4}{R Interfaces to GPUs}{section.11.3}% 273
\BOOKMARK [1][-]{section.11.4}{Finding Powers of Matrices}{chapter.11}% 274
\BOOKMARK [2][-]{subsection.11.4.1}{Example: Graph Connectedness}{section.11.4}% 275
\BOOKMARK [2][-]{subsection.11.4.2}{Example: Fibonacci Numbers}{section.11.4}% 276
\BOOKMARK [2][-]{subsection.11.4.3}{Example: Matrix Inversion}{section.11.4}% 277
\BOOKMARK [2][-]{subsection.11.4.4}{Parallel Computation}{section.11.4}% 278
\BOOKMARK [1][-]{section.11.5}{Solving Systems of Linear Equations}{chapter.11}% 279
\BOOKMARK [2][-]{subsection.11.5.1}{Gaussian Elimination}{section.11.5}% 280
\BOOKMARK [2][-]{subsection.11.5.2}{Example: Gaussian Elimination in CUDA}{section.11.5}% 281
\BOOKMARK [2][-]{subsection.11.5.3}{The Jacobi Algorithm}{section.11.5}% 282
\BOOKMARK [2][-]{subsection.11.5.4}{Example: OpenMP Implementation of the Jacobi Algorithm}{section.11.5}% 283
\BOOKMARK [2][-]{subsection.11.5.5}{Example: R/gputools Implementation of Jacobi}{section.11.5}% 284
\BOOKMARK [1][-]{section.11.6}{Eigenvalues and Eigenvectors}{chapter.11}% 285
\BOOKMARK [2][-]{subsection.11.6.1}{The Power Method}{section.11.6}% 286
\BOOKMARK [2][-]{subsection.11.6.2}{Parallel Computation}{section.11.6}% 287
\BOOKMARK [1][-]{section.11.7}{Sparse Matrices}{chapter.11}% 288
\BOOKMARK [1][-]{section.11.8}{Libraries}{chapter.11}% 289
\BOOKMARK [0][-]{chapter.12}{Introduction to Parallel Sorting}{}% 290
\BOOKMARK [1][-]{section.12.1}{Quicksort}{chapter.12}% 291
\BOOKMARK [2][-]{subsection.12.1.1}{The Separation Process}{section.12.1}% 292
\BOOKMARK [2][-]{subsection.12.1.2}{Example: OpenMP Quicksort}{section.12.1}% 293
\BOOKMARK [2][-]{subsection.12.1.3}{Hyperquicksort}{section.12.1}% 294
\BOOKMARK [1][-]{section.12.2}{Mergesorts}{chapter.12}% 295
\BOOKMARK [2][-]{subsection.12.2.1}{Sequential Form}{section.12.2}% 296
\BOOKMARK [2][-]{subsection.12.2.2}{Shared-Memory Mergesort}{section.12.2}% 297
\BOOKMARK [2][-]{subsection.12.2.3}{Message Passing Mergesort on a Tree Topology}{section.12.2}% 298
\BOOKMARK [2][-]{subsection.12.2.4}{Compare-Exchange Operations}{section.12.2}% 299
\BOOKMARK [2][-]{subsection.12.2.5}{Bitonic Mergesort}{section.12.2}% 300
\BOOKMARK [1][-]{section.12.3}{The Bubble Sort and Its Cousins}{chapter.12}% 301
\BOOKMARK [2][-]{subsection.12.3.1}{The Much-Maligned Bubble Sort}{section.12.3}% 302
\BOOKMARK [2][-]{subsection.12.3.2}{A Popular Variant: Odd-Even Transposition}{section.12.3}% 303
\BOOKMARK [2][-]{subsection.12.3.3}{Example: CUDA Implementation of Odd/Even Transposition Sort}{section.12.3}% 304
\BOOKMARK [1][-]{section.12.4}{Shearsort}{chapter.12}% 305
\BOOKMARK [1][-]{section.12.5}{Bucket Sort with Sampling}{chapter.12}% 306
\BOOKMARK [1][-]{section.12.6}{Radix Sort}{chapter.12}% 307
\BOOKMARK [1][-]{section.12.7}{Enumeration Sort}{chapter.12}% 308
\BOOKMARK [0][-]{chapter.13}{Parallel Computation for Audio and Image Processing}{}% 309
\BOOKMARK [1][-]{section.13.1}{General Principles}{chapter.13}% 310
\BOOKMARK [2][-]{subsection.13.1.1}{One-Dimensional Fourier Series}{section.13.1}% 311
\BOOKMARK [2][-]{subsection.13.1.2}{Two-Dimensional Fourier Series}{section.13.1}% 312
\BOOKMARK [1][-]{section.13.2}{Discrete Fourier Transforms}{chapter.13}% 313
\BOOKMARK [2][-]{subsection.13.2.1}{One-Dimensional Data}{section.13.2}% 314
\BOOKMARK [2][-]{subsection.13.2.2}{Inversion}{section.13.2}% 315
\BOOKMARK [3][-]{subsubsection.13.2.2.1}{Alternate Formulation}{subsection.13.2.2}% 316
\BOOKMARK [2][-]{subsection.13.2.3}{Two-Dimensional Data}{section.13.2}% 317
\BOOKMARK [1][-]{section.13.3}{Parallel Computation of Discrete Fourier Transforms}{chapter.13}% 318
\BOOKMARK [2][-]{subsection.13.3.1}{The Fast Fourier Transform}{section.13.3}% 319
\BOOKMARK [2][-]{subsection.13.3.2}{A Matrix Approach}{section.13.3}% 320
\BOOKMARK [2][-]{subsection.13.3.3}{Parallelizing Computation of the Inverse Transform}{section.13.3}% 321
\BOOKMARK [2][-]{subsection.13.3.4}{Parallelizing Computation of the Two-Dimensional Transform}{section.13.3}% 322
\BOOKMARK [1][-]{section.13.4}{Available FFT Software}{chapter.13}% 323
\BOOKMARK [2][-]{subsection.13.4.1}{R}{section.13.4}% 324
\BOOKMARK [2][-]{subsection.13.4.2}{CUFFT}{section.13.4}% 325
\BOOKMARK [2][-]{subsection.13.4.3}{FFTW}{section.13.4}% 326
\BOOKMARK [1][-]{section.13.5}{Applications to Image Processing}{chapter.13}% 327
\BOOKMARK [2][-]{subsection.13.5.1}{Smoothing}{section.13.5}% 328
\BOOKMARK [2][-]{subsection.13.5.2}{Example: Audio Smoothing in R}{section.13.5}% 329
\BOOKMARK [2][-]{subsection.13.5.3}{Edge Detection}{section.13.5}% 330
\BOOKMARK [1][-]{section.13.6}{R Access to Sound and Image Files}{chapter.13}% 331
\BOOKMARK [1][-]{section.13.7}{Keeping the Pixel Intensities in the Proper Range}{chapter.13}% 332
\BOOKMARK [1][-]{section.13.8}{Does the Function g\(\) Really Have to Be Repeating?}{chapter.13}% 333
\BOOKMARK [1][-]{section.13.9}{Vector Space Issues \(optional section\)}{chapter.13}% 334
\BOOKMARK [1][-]{section.13.10}{Bandwidth: How to Read the San Francisco Chronicle Business Page \(optional section\)}{chapter.13}% 335
\BOOKMARK [0][-]{chapter.14}{Parallel Computation in Statistics/Data Mining}{}% 336
\BOOKMARK [1][-]{section.14.1}{Itemset Analysis}{chapter.14}% 337
\BOOKMARK [2][-]{subsection.14.1.1}{What Is It?}{section.14.1}% 338
\BOOKMARK [2][-]{subsection.14.1.2}{The Market Basket Problem}{section.14.1}% 339
\BOOKMARK [2][-]{subsection.14.1.3}{Serial Algorithms}{section.14.1}% 340
\BOOKMARK [2][-]{subsection.14.1.4}{Parallelizing the Apriori Algorithm}{section.14.1}% 341
\BOOKMARK [1][-]{section.14.2}{Probability Density Estimation}{chapter.14}% 342
\BOOKMARK [2][-]{subsection.14.2.1}{Kernel-Based Density Estimation}{section.14.2}% 343
\BOOKMARK [2][-]{subsection.14.2.2}{Histogram Computation for Images}{section.14.2}% 344
\BOOKMARK [1][-]{section.14.3}{Clustering}{chapter.14}% 345
\BOOKMARK [2][-]{subsection.14.3.1}{Example: k-Means Clustering in R}{section.14.3}% 346
\BOOKMARK [1][-]{section.14.4}{Principal Component Analysis \(PCA\)}{chapter.14}% 347
\BOOKMARK [1][-]{section.14.5}{Monte Carlo Simulation}{chapter.14}% 348
\BOOKMARK [0][-]{appendix.A}{Miscellaneous Systems Issues}{}% 349
\BOOKMARK [1][-]{section.A.1}{Timesharing}{appendix.A}% 350
\BOOKMARK [2][-]{subsection.A.1.1}{Many Processes, Taking Turns}{section.A.1}% 351
\BOOKMARK [1][-]{section.A.2}{Memory Hierarchies}{appendix.A}% 352
\BOOKMARK [2][-]{subsection.A.2.1}{Cache Memory}{section.A.2}% 353
\BOOKMARK [2][-]{subsection.A.2.2}{Virtual Memory}{section.A.2}% 354
\BOOKMARK [3][-]{subsubsection.A.2.2.1}{Make Sure You Understand the Goals}{subsection.A.2.2}% 355
\BOOKMARK [3][-]{subsubsection.A.2.2.2}{How It Works}{subsection.A.2.2}% 356
\BOOKMARK [2][-]{subsection.A.2.3}{Performance Issues}{section.A.2}% 357
\BOOKMARK [1][-]{section.A.3}{Array Issues}{appendix.A}% 358
\BOOKMARK [2][-]{subsection.A.3.1}{Storage}{section.A.3}% 359
\BOOKMARK [2][-]{subsection.A.3.2}{Subarrays}{section.A.3}% 360
\BOOKMARK [2][-]{subsection.A.3.3}{Memory Allocation}{section.A.3}% 361
\BOOKMARK [0][-]{appendix.B}{Review of Matrix Algebra}{}% 362
\BOOKMARK [1][-]{section.B.1}{Terminology and Notation}{appendix.B}% 363
\BOOKMARK [2][-]{subsection.B.1.1}{Matrix Addition and Multiplication}{section.B.1}% 364
\BOOKMARK [1][-]{section.B.2}{Matrix Transpose}{appendix.B}% 365
\BOOKMARK [1][-]{section.B.3}{Linear Independence}{appendix.B}% 366
\BOOKMARK [1][-]{section.B.4}{Determinants}{appendix.B}% 367
\BOOKMARK [1][-]{section.B.5}{Matrix Inverse}{appendix.B}% 368
\BOOKMARK [1][-]{section.B.6}{Eigenvalues and Eigenvectors}{appendix.B}% 369
\BOOKMARK [1][-]{section.B.7}{Rank of a Matrix}{appendix.B}% 370
\BOOKMARK [1][-]{section.B.8}{Matrix Algebra in R}{appendix.B}% 371
\BOOKMARK [0][-]{appendix.C}{R Quick Start}{}% 372
\BOOKMARK [1][-]{section.C.1}{Correspondences}{appendix.C}% 373
\BOOKMARK [1][-]{section.C.2}{Starting R}{appendix.C}% 374
\BOOKMARK [1][-]{section.C.3}{First Sample Programming Session}{appendix.C}% 375
\BOOKMARK [1][-]{section.C.4}{Vectorization}{appendix.C}% 376
\BOOKMARK [1][-]{section.C.5}{Second Sample Programming Session}{appendix.C}% 377
\BOOKMARK [1][-]{section.C.6}{Recycling}{appendix.C}% 378
\BOOKMARK [1][-]{section.C.7}{More on Vectorization}{appendix.C}% 379
\BOOKMARK [1][-]{section.C.8}{Third Sample Programming Session}{appendix.C}% 380
\BOOKMARK [1][-]{section.C.9}{Default Argument Values}{appendix.C}% 381
\BOOKMARK [1][-]{section.C.10}{The R List Type}{appendix.C}% 382
\BOOKMARK [2][-]{subsection.C.10.1}{The Basics}{section.C.10}% 383
\BOOKMARK [2][-]{subsection.C.10.2}{The Reduce\(\) Function}{section.C.10}% 384
\BOOKMARK [2][-]{subsection.C.10.3}{S3 Classes}{section.C.10}% 385
\BOOKMARK [1][-]{section.C.11}{Some Workhorse Functions}{appendix.C}% 386
\BOOKMARK [1][-]{section.C.12}{Handy Utilities}{appendix.C}% 387
\BOOKMARK [1][-]{section.C.13}{Data Frames}{appendix.C}% 388
\BOOKMARK [1][-]{section.C.14}{Graphics}{appendix.C}% 389
\BOOKMARK [1][-]{section.C.15}{Packages}{appendix.C}% 390
\BOOKMARK [1][-]{section.C.16}{Other Sources for Learning R}{appendix.C}% 391
\BOOKMARK [1][-]{section.C.17}{Online Help}{appendix.C}% 392
\BOOKMARK [1][-]{section.C.18}{Debugging in R}{appendix.C}% 393
\BOOKMARK [1][-]{section.C.19}{Complex Numbers}{appendix.C}% 394
\BOOKMARK [1][-]{section.C.20}{Further Reading}{appendix.C}% 395