/* Recursive quicksort program.  This program uses the workpile (see
 * workpile.c) to implement quicksort.  The array is split into chunks, and the
 * resulting arrays are put into the workpile.  An array chunk is insertion
 * sorted if it is below a certain length (SORT_DIRECT).
 *
 * All code here comes from the book "Programming With Threads" by Steve
 * Kleiman (srk@netapp.com), Devang Shah (devang.shah@eng.sun.com), and
 * Bart Smaalders (bart.smaalders@eng.sun.com).  Check out the official book
 * description at http://www.sun.com/books/catalog/kleiman  -MW */

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include "workpile.c"

#define TRUE 1
#define FALSE 0

/* Fewer than SORT_DIRECT items are sorted with an insertion sort. */
#define SORT_DIRECT 20

/* Work at this depth or less generates a separate work item. */
#define DEFER_DEPTH 6

typedef struct {
	float *data;		/* Array to sort */
	int n;			/* Number of elements in the array */
	int depth;		/* Depth of recursion */
	workpile_t wp;		/* Workpile to use */
} quick_sort_args;

static workpile_t quick_sort_workpile = NULL;

void
quick_sort_aux(float *data, int n, int depth, workpile_t wp, int deferrable)
{
	int i,j;

	/* If array small, use insertion sort */
	if (n <= SORT_DIRECT) {
		for (j = 1; j < n; j++) {
			/* data[0..j-1] in sort; find a spot for data[j] */
			float key = data[j];
			for (i = j - 1; i >= 0 && key < data[i]; i--)
				data[i+1] = data[i];
			data[i+1] = key;
		}

		return;
	}

	/* Defer this work to work queue if policy says so */
	if (deferrable && depth <= DEFER_DEPTH) {
		quick_sort_args *q =
			(quick_sort_args *) malloc(sizeof(quick_sort_args));
		
		/* Make sure malloc didn't fail -MW */
		assert(q != NULL);
		q->data = data; q->n = n; q->depth = depth; q->wp = wp;

		/* Put this section into the work queue -MW */
		work_put(wp, (void *)q);
		return;
	}

	/* Otherwise, partition data based on a median estimate */
#define swap(i,j) {float t = data[i]; data[i] = data[j]; data[j] = t;}

	i = 0;
	j = n - 1;
	for (;;) {
		while (data[i] < data[j]) j--;
		if (i >= j) break;
		swap(i, j); i++;
		while (data[i] < data[j]) i++;
		if (i >= j) { i = j; break; }
		swap(i, j); j--;
	}

	/* Median value is now at data[i] */
	/* Partitioned so that data[0..i-1] <= median <= data[i+1..n-1] */
	quick_sort_aux(data,       i,     depth+1, wp, TRUE);
	quick_sort_aux(&data[i+1], n-i-1, depth+1, wp, TRUE);
}

/* Called from workpile controller with argument pointing to work. */
void
quick_sort_worker(void *a)
{
	quick_sort_args *q = (quick_sort_args *)a;
	quick_sort_aux(q->data, q->n, q->depth, q->wp, FALSE);
	free(q);
}

/* Main routine, called by client to do a sort. */
void
quick_sort(float *data, int n)
{
	/* Initialize quick_sort_workpile the first time -MW */
	if (quick_sort_workpile == NULL) {
		/* sysconf returns the number of processors --> 1 in the case of a
		 * uniprocessor machine.  We limit the number of threads so as to not
		 * overload the processors.  -MW */
		int n_threads = sysconf(_SC_NPROCESSORS_ONLN) + 2;

		/* work_init takes the following arguments:
		 * 1) maximum pile size,
		 * 2) name of worker function
		 * 3) number of threads to create
		 * 
		 * #2 makes workpile more general because we can supply a user-defined
		 * function, and the OS will take care of the rest. -MW */
		quick_sort_workpile = work_init(2 << DEFER_DEPTH,
			quick_sort_worker, n_threads);

		/* If work_init returns NULL, then the malloc inside the function
		 * failed. -MW */
		assert(quick_sort_workpile != NULL);
	}

	/* Call the initial quick_sort_aux to get the process started -MW */
	quick_sort_aux(data, n, 0, quick_sort_workpile, FALSE);

	/* Wait for all work to finish */
	work_wait(quick_sort_workpile);
}

main()
{
	float data[100];
	int i;

	/* Print out initial array -MW */
	for(i = 0; i < 100; i++)
		data[99-i] = drand48();
	for(i = 0; i < 100; i++)
		printf("data[%d] = %f\n", i, data[i]);

	quick_sort(data, 100);

	/* Print out sorted array -MW */
	for(i = 0; i < 100; i++)
		printf("data[%d] = %f\n", i, data[i]);
	return(0);
}

