/* RCS identification string for this source file; nothing in this file reads it. */
static char rcsid[] = "$Id: ompc_runtime.c,v 1.14 2000/07/31 16:57:37 y-tanaka Exp $";
/* 
 * $RWC_Release: Omni-1.6 $
 * $RWC_Copyright:
 *  Omni Compiler Software Version 1.5-1.6
 *  Copyright (C) 2002 PC Cluster Consortium
 *  
 *  This software is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License version
 *  2.1 published by the Free Software Foundation.
 *  
 *  Omni Compiler Software Version 1.0-1.4
 *  Copyright (C) 1999, 2000, 2001.
 *   Tsukuba Research Center, Real World Computing Partnership, Japan.
 *  
 *  Please check the Copyright and License information in the files named
 *  COPYRIGHT and LICENSE under the top  directory of the Omni Compiler
 *  Software release kit.
 *  
 *  
 *  $
 */
#include <stdlib.h>
#include "exc_platform.h"
#include "ompclib.h"

/* Lock defined outside this file; taken below by the ordered, threadprivate
 * and reduction-buffer code. */
extern _ompc_lock_t _ompc_thread_lock;
/* Defined outside this file; used below to obtain the descriptor of the
 * calling thread. */
extern struct ompc_thread *_ompc_current_thread(void);

/* Forward declarations of functions defined later in this file.  The "_tid"
 * variants take the thread descriptor as an explicit first argument. */
void _ompc_static_bsched_tid(struct ompc_thread *tp, 
			     int *lb, int *ub, int *step);
void _ompc_static_csched_tid(struct ompc_thread *tp, 
			     int *lb, int *up, int *step);
void _ompc_static_sched_init_tid(struct ompc_thread *tp,
		       int lb, int up, int step, int chunk_size);
void _ompc_dynamic_sched_init_tid(struct ompc_thread *tp,
		       int lb, int up, int step, int chunk_size);
/* The *_next_tid functions return TRUE and fill *lb / *ub when a range was
 * handed out, FALSE otherwise. */
int _ompc_static_sched_next_tid(struct ompc_thread *tp, int *lb,int *ub);
int _ompc_dynamic_sched_next_tid(struct ompc_thread *tp, int *lb,int *ub,int guided);
int _ompc_do_single_tid(struct ompc_thread *tp);

/* Combine the value at in_p into out_p; "type" selects the operand type and
 * "op" the operator. */
void _ompc_reduction(void *in_p,void *out_p,int type, int op);

/*
 * compiler runtime
 */

/*
 * Return the thread number recorded in the descriptor `tp`.
 * The runtime is polled before the field is read.
 */
int _ompc_thread_num_tid(struct ompc_thread *tp)
{
    int thread_no;

    ST_POLLING();
    thread_no = tp->num;
    return thread_no;
}

/*
 * Barrier for the calling thread: looks up the caller's descriptor and
 * enters the thread barrier with its number and parent, then polls.
 */
void _ompc_barrier()
{
    struct ompc_thread *self = _ompc_current_thread();

    _ompc_thread_barrier(self->num, self->parent);
    ST_POLLING();
}

/*
 * Same as _ompc_barrier(), but the thread descriptor is supplied by the
 * caller instead of being looked up.
 */
void _ompc_barrier_tid(struct ompc_thread *tp)
{
    int my_num = tp->num;

    _ompc_thread_barrier(my_num, tp->parent);
    ST_POLLING();
}

/*
 * Default loop schedule: delegates to the static block schedule for the
 * calling thread.  *lb, *ub and *step are passed through to
 * _ompc_static_bsched_tid(), which may rewrite *lb and *ub.
 * With DEBUG defined and _ompc_debug_flag set, the bounds are printed
 * before and after the adjustment.
 */
void _ompc_default_sched(int *lb, int *ub, int *step)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
#ifdef DEBUG
    if (_ompc_debug_flag) {
        printf("default_sched: +id=%d, lb=%d, ub=%d, step=%d\n",
               self->num, *lb, *ub, *step);
    }
#endif
    _ompc_static_bsched_tid(self, lb, ub, step);
#ifdef DEBUG
    if (_ompc_debug_flag) {
        printf("default_sched: -id=%d, lb=%d, ub=%d, step=%d\n",
               self->num, *lb, *ub, *step);
    }
#endif
    ST_POLLING();
}

/* static scheduling: cyclic */
/*
 * Cyclic static schedule for the calling thread; see
 * _ompc_static_csched_tid() for the effect on *lb and *step.
 */
void _ompc_static_csched(int *lb, int *up, int *step)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
    _ompc_static_csched_tid(self, lb, up, step);
    ST_POLLING();
}

/*
 * Cyclic (round-robin) static schedule for thread `tp`.
 *
 * On entry *lb, *up and *step describe the whole loop.  On return *lb is
 * advanced by (thread number * step) and *step is multiplied by the number
 * of threads in the team, so that consecutive iterations go to consecutive
 * threads.  *up is left untouched.  tp->is_last is set to 1 for the thread
 * that owns the final iteration and to 0 for all others.
 *
 * Nothing is changed when the thread is not inside a parallel region.
 * A zero step leaves *lb and *step untouched (tp->is_last is still cleared),
 * matching _ompc_static_bsched_tid(); previously it divided by zero.
 */
void _ompc_static_csched_tid(struct ompc_thread *tp, int *lb, int *up, int *step)
{
    int n,s;
    struct ompc_parent *tpp;
    int n_thd, id;

    if((tpp = tp->parent) == NULL) return;  /* not in parallel, do nothing */

    id = tp->num;
    n_thd = tpp->num_thds;
    tp->is_last = 0;
    s = *step;
    if(s == 0) return;  /* degenerate loop: avoid dividing by zero below */

    /* number of iterations of the whole loop, rounded away from zero */
    if(s > 0) n = (*up-*lb+s-1)/s;
    else n = (*up-*lb+s+1)/s;

    *lb += id*s;   /* first iteration owned by this thread */
    *step = s*n_thd;
    /* iteration n-1 is the last one; it belongs to thread (n-1) mod n_thd */
    if(n > 0 && ((n-1)%n_thd) == id) tp->is_last = 1;
    ST_POLLING();
}

/* static scheduling: block */
/*
 * Block static schedule for the calling thread; see
 * _ompc_static_bsched_tid() for the effect on *lb and *ub.
 */
void _ompc_static_bsched(int *lb, int *ub, int *step)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
    _ompc_static_bsched_tid(self, lb, ub, step);
    ST_POLLING();
}

/*
 * Block static schedule for thread `tp`.
 *
 * The range [*lb, *ub) is cut into one contiguous block per thread; the
 * block length is rounded to a multiple of *step.  On return *lb and *ub
 * delimit the block of this thread, clipped to the original upper bound.
 * tp->is_last is set to 1 for the thread whose non-empty block reaches the
 * original upper bound, and to 0 otherwise.
 *
 * Nothing is changed when the thread is not inside a parallel region.
 * With a zero step only tp->is_last is cleared.
 */
void _ompc_static_bsched_tid(struct ompc_thread *tp, int *lb, int *ub, int *step)
{
    struct ompc_parent *team = tp->parent;
    int stride, first, last, limit, block;
    int n_thd, id;

    if (team == NULL) {
        return;                 /* not in parallel, do nothing */
    }

    stride = *step;
    first = *lb;
    limit = *ub;
    id = tp->num;
    n_thd = team->num_thds;
    tp->is_last = 0;

    if (stride == 0) {
        return;
    }

    if (stride > 0) {
        /* ceil(range / n_thd), then rounded up to a multiple of the step */
        block = (limit - first + n_thd - 1) / n_thd;
        block = ((block + stride - 1) / stride) * stride;
        first += block * id;
        last = first + block;
        if (last >= limit) {
            last = limit;
            if (limit > first) {
                tp->is_last = 1;
            }
        }
    } else {
        /* mirror image of the above for a descending loop */
        block = (limit - first - n_thd + 1) / n_thd;
        block = ((block + stride + 1) / stride) * stride;
        first += block * id;
        last = first + block;
        if (last <= limit) {
            last = limit;
            if (limit < first) {
                tp->is_last = 1;
            }
        }
    }

    *lb = first;
    *ub = last;
    ST_POLLING();
}

/*
 * static schedule
 */
/*
 * Set up a chunked static schedule for the calling thread; see
 * _ompc_static_sched_init_tid().
 */
void _ompc_static_sched_init(int lb, int up, int step, int chunk_size)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
    _ompc_static_sched_init_tid(self, lb, up, step, chunk_size);
    ST_POLLING();
}

/*
 * Set up a chunked static schedule for thread `tp` over the loop
 * lb .. up with increment `step`.
 *
 * Outside a parallel region only the start index and end are recorded
 * (chunk_size is not examined).  Inside a parallel region chunk_size must
 * be positive, otherwise the runtime aborts through _ompc_fatal().  The
 * thread's first chunk starts at lb + chunk*num, each chunk spans
 * chunk_size*step index units, and successive chunks of the same thread
 * are chunk_size*step*num_thds apart.
 */
void _ompc_static_sched_init_tid(struct ompc_thread *tp,
		       int lb, int up, int step, int chunk_size)
{
    struct ompc_parent *tpp;

    if((tpp = tp->parent) == NULL){  /* not in parallel */
        tp->loop_sched_index = lb;
        tp->loop_end = up;
        return;
    }

    if(chunk_size <= 0){
        /* diagnostic goes to stderr, like the OMP_SCHEDULE error below */
        fprintf(stderr,"chunk size is non-positive\n");
        _ompc_fatal("_ompc_static_sched_init");
    }

    chunk_size *= step;     /* from here on: chunk length in index units */
    tp->loop_sched_index = lb+chunk_size*tp->num;
    tp->loop_chunk_size = chunk_size;
    tp->loop_stride = chunk_size*tpp->num_thds;
    tp->loop_end = up;
    tp->is_last = 0;
    ST_POLLING();
}

/*
 * Fetch the next static chunk for the calling thread.
 * Returns the result of _ompc_static_sched_next_tid().
 */
int _ompc_static_sched_next(int *lb, int *ub)
{
    struct ompc_thread *self;
    int got_chunk;

    self = _ompc_current_thread();
    ST_POLLING();
    got_chunk = _ompc_static_sched_next_tid(self, lb, ub);
    return got_chunk;
}

/*
 * Hand out the next chunk of a static schedule to thread `tp`.
 *
 * Returns TRUE with *lb / *ub set to the chunk bounds, or FALSE when the
 * thread has no iterations left (*lb / *ub are then untouched).
 *
 * Outside a parallel region the whole remaining range is returned once.
 * Inside one, the thread's index is advanced by its stride on every call,
 * the chunk is clipped to the loop end, and tp->is_last is set when the
 * clipping happened.
 */
int _ompc_static_sched_next_tid(struct ompc_thread *tp,
			    int *lb,int *ub)
{
    int start = tp->loop_sched_index;
    int limit = tp->loop_end;
    int chunk, stop;
    int exhausted, clipped;

    if (tp->parent == NULL) {  /* not in parallel */
        if (start == limit) {
            return FALSE;
        }
        *lb = start;
        *ub = limit;
        tp->loop_sched_index = limit;
        return TRUE;
    }

    chunk = tp->loop_chunk_size;
    tp->loop_sched_index += tp->loop_stride;
    stop = start + chunk;

    /* the direction of the comparisons follows the sign of the chunk */
    if (chunk > 0) {
        exhausted = (start >= limit);
        clipped = (stop >= limit);
    } else {
        exhausted = (start <= limit);
        clipped = (stop <= limit);
    }

    if (exhausted) {
        return FALSE;
    }
    if (clipped) {
        stop = limit;
        tp->is_last = 1;
    }

    *lb = start;
    *ub = stop;
    ST_POLLING();
    return TRUE;
}

/* 
 * dynamic schedule
 */
/*
 * Set up a dynamic schedule for the calling thread; see
 * _ompc_dynamic_sched_init_tid().
 */
void _ompc_dynamic_sched_init(int lb, int up, int step, int chunk_size)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
    _ompc_dynamic_sched_init_tid(self, lb, up, step, chunk_size);
    ST_POLLING();
}


/*
 * Set up a dynamic (or guided) schedule for thread `tp` over the loop
 * lb .. up with increment `step`.
 *
 * Outside a parallel region only the start index and end are recorded.
 * Inside one, chunk_size must be positive (otherwise the runtime aborts
 * through _ompc_fatal()); the per-thread chunk length is chunk_size*step.
 * The first thread of the team to arrive -- the one whose private counter
 * still equals the team's parent_count -- resets the shared dynamic_index
 * to lb and bumps parent_count.  Every thread then increments its private
 * counter.
 */
void _ompc_dynamic_sched_init_tid(struct ompc_thread *tp,
		       int lb, int up, int step, int chunk_size)
{
    struct ompc_parent *tpp;
    int pcount;

    if((tpp = tp->parent) == NULL){  /* not in parallel */
        tp->loop_sched_index = lb;
        tp->loop_end = up;
        return;	/* stride is not used */
    }
    if(chunk_size <= 0){
        /* diagnostic goes to stderr, like the OMP_SCHEDULE error below */
        fprintf(stderr,"chunk size is non-positive\n");
        _ompc_fatal("_ompc_dynamic_sched_init");
    }
    tp->loop_chunk_size = chunk_size*step;
    tp->loop_end = up;
    tp->loop_sched_index = lb;
    tp->is_last = 0;

    /* parent_count is read and written back under its own lock */
    pcount = st_read_and_lock_int(&tpp->parent_count);
    if(pcount == tp->count) {
        tpp->dynamic_index = lb;
        pcount++;
    }
    st_write_and_unlock_int(&tpp->parent_count, pcount);
    tp->count++;
    ST_POLLING();
}

/*
 * Fetch the next dynamic chunk for the calling thread (non-guided).
 * Returns the result of _ompc_dynamic_sched_next_tid().
 */
int _ompc_dynamic_sched_next(int *lb, int *ub)
{
    struct ompc_thread *self;
    int got_chunk;

    self = _ompc_current_thread();
    ST_POLLING();
    got_chunk = _ompc_dynamic_sched_next_tid(self, lb, ub, FALSE);
    return got_chunk;
}

/*
 * Hand out the next chunk of a dynamic or guided schedule to thread `tp`.
 *
 * Returns TRUE with *lb / *ub set to the chunk bounds, or FALSE when no
 * iterations are left for this thread.
 *
 * Outside a parallel region the whole remaining range is returned once.
 * Inside one, the chunk is carved off the team's shared dynamic_index
 * while the team's parent_count lock is held.  For a guided schedule the
 * length is the remaining range divided by the team size, rounded to a
 * multiple of the chunk length and never shorter than one chunk.
 * The chunk is clipped to the loop end; tp->is_last is set when the
 * clipping happened.
 */
int _ompc_dynamic_sched_next_tid(struct ompc_thread *tp,
			    int *lb,int *ub,int guided)
{
    struct ompc_parent *team = tp->parent;
    int first, last, len, chunk;
    int held;

    if (team == NULL) {  /* not in parallel */
        first = tp->loop_sched_index;
        last = tp->loop_end;
        if (first == last) {
            return FALSE;
        }
        *lb = first;
        *ub = last;
        tp->loop_sched_index = last;
        return TRUE;
    }

    chunk = tp->loop_chunk_size;

    /* take [first, last) from the shared index under the team lock */
    held = st_read_and_lock_int(&team->parent_count);
    first = team->dynamic_index;
    if (guided) {
        len = (tp->loop_end - first) / team->num_thds;
        len = ((len + chunk) / chunk) * chunk;
        if (chunk > 0) {
            if (chunk > len) len = chunk;
        } else {
            if (chunk < len) len = chunk;
        }
    } else {
        len = chunk;
    }
    last = first + len;
    team->dynamic_index = last;
    st_write_and_unlock_int(&team->parent_count, held);

    /* the end of the chunk this thread took last time already hit the end */
    if (chunk > 0) {
        if (tp->loop_sched_index >= tp->loop_end) return FALSE;
    } else {
        if (tp->loop_sched_index <= tp->loop_end) return FALSE;
    }

    tp->loop_sched_index = last;

    /* clip the final chunk to the loop end */
    if (chunk > 0) {
        if (first >= tp->loop_end) return FALSE;
        if (last >= tp->loop_end) {
            last = tp->loop_end;
            tp->is_last = 1;
        }
    } else {
        if (first <= tp->loop_end) return FALSE;
        if (last <= tp->loop_end) {
            last = tp->loop_end;
            tp->is_last = 1;
        }
    }

    *lb = first;
    *ub = last;
    ST_POLLING();
    return TRUE;
}

/* 
 * guided schedule
 */
/*
 * Set up a guided schedule for the calling thread.  The set-up is the same
 * as for a dynamic schedule; the difference is made when chunks are fetched.
 */
void _ompc_guided_sched_init(int lb, int up, int step, int chunk_size)
{
    struct ompc_thread *self;

    self = _ompc_current_thread();
    _ompc_dynamic_sched_init_tid(self, lb, up, step, chunk_size);
    ST_POLLING();
}

/*
 * Fetch the next guided chunk for the calling thread.
 * Returns the result of _ompc_dynamic_sched_next_tid() in guided mode.
 */
int _ompc_guided_sched_next(int *lb, int *ub)
{
    struct ompc_thread *self;
    int got_chunk;

    self = _ompc_current_thread();
    ST_POLLING();
    got_chunk = _ompc_dynamic_sched_next_tid(self, lb, ub, TRUE);
    return got_chunk;
}


/* 
 * runtime schedule 
 */
/* Schedule kind selected by _ompc_set_runtime_schedule(); it is SCHED_NONE
 * (zero) until a kind has been parsed.  SCHED_NONE is handled like
 * SCHED_STATIC by the runtime_sched functions below. */
static enum { SCHED_NONE = 0, SCHED_STATIC, SCHED_DYNAMIC, SCHED_GUIDED }
_ompc_runtime_sched_kind;
/* Chunk size parsed by _ompc_set_runtime_schedule(); a value <= 0 makes
 * _ompc_runtime_sched_init() pick one itself. */
static int _ompc_runtime_chunk_size = 0; /* default */

/*
 * Parse a schedule specification of the form "kind" or "kind,chunk" and
 * store the result in _ompc_runtime_sched_kind / _ompc_runtime_chunk_size.
 *
 * Leading white space, and white space around the comma, is skipped.
 * The kind is recognised by the prefixes "static", "dynamic" and "guided".
 * An empty or blank string leaves the current settings untouched.
 * The chunk must start with a digit and be positive; characters following
 * the number are not examined.
 *
 * Any other input prints the offending string on stderr and aborts through
 * _ompc_fatal().
 */
void _ompc_set_runtime_schedule(char *s)
{
    char *cp;
    cp = s;
    /* <ctype.h> classifiers need a value representable as unsigned char;
     * a plain char may be negative, so convert before the call. */
    while(isspace((unsigned char)*cp)) cp++;
    if(*cp == 0) return;
    if(strncmp(cp,"static",6) == 0){
        cp += 6;
        _ompc_runtime_sched_kind = SCHED_STATIC;
    } else if(strncmp(cp,"dynamic",7) == 0){
        cp += 7;
        _ompc_runtime_sched_kind = SCHED_DYNAMIC;
    } else if(strncmp(cp,"guided",6) == 0){
        cp += 6;
        _ompc_runtime_sched_kind = SCHED_GUIDED;
    }
    while(isspace((unsigned char)*cp)) cp++;
    if(*cp == 0) return;            /* kind only, no chunk given */
    if(*cp != ',') goto err;
    cp++;
    while(isspace((unsigned char)*cp)) cp++;
    if(!isdigit((unsigned char)*cp)) goto err;
    sscanf(cp,"%d",&_ompc_runtime_chunk_size);
    if(_ompc_runtime_chunk_size <= 0){
        _ompc_runtime_sched_kind = SCHED_NONE;
        goto err;
    }
    ST_POLLING();
    return;
err:
    fprintf(stderr,"OMP_SCHEDULE ='%s'",s);
    _ompc_fatal("bad OMP_SCHEDULE");
}

/*
 * Set up the schedule chosen at run time for the calling thread.
 *
 * Dynamic and guided kinds use the dynamic set-up with the configured
 * chunk size (1 when none is configured).  Every other kind uses the static
 * set-up; when no chunk size is configured it is derived as the iteration
 * range divided by (step * team size), rounded up, with a minimum of 1.
 */
void _ompc_runtime_sched_init(int lb, int up, int step)
{
    struct ompc_thread *self = _ompc_current_thread();
    int chunk = _ompc_runtime_chunk_size;

    if (_ompc_runtime_sched_kind == SCHED_DYNAMIC ||
        _ompc_runtime_sched_kind == SCHED_GUIDED) {
        if (chunk <= 0) {
            chunk = 1;
        }
        _ompc_dynamic_sched_init_tid(self, lb, up, step, chunk);
    } else {
        /* SCHED_STATIC, SCHED_NONE and anything unexpected */
        struct ompc_parent *team = self->parent;
        int n_thd = (team == NULL) ? 1 : team->num_thds;

        if (chunk <= 0) {
            int span = up - lb;
            int width = step * n_thd;

            chunk = span / width + ((span % width) ? (1) : (0));
            if (chunk <= 0) {
                chunk = 1;
            }
        }
        _ompc_static_sched_init_tid(self, lb, up, step, chunk);
    }
    ST_POLLING();
}

/*
 * Fetch the next chunk under the schedule chosen at run time.
 * Dynamic and guided kinds go through the dynamic chunk fetcher, every
 * other kind through the static one.  Returns that function's result.
 */
int _ompc_runtime_sched_next(int *lb, int *ub)
{
    struct ompc_thread *self = _ompc_current_thread();

    ST_POLLING();
    if (_ompc_runtime_sched_kind == SCHED_DYNAMIC) {
        return _ompc_dynamic_sched_next_tid(self, lb, ub, FALSE);
    }
    if (_ompc_runtime_sched_kind == SCHED_GUIDED) {
        return _ompc_dynamic_sched_next_tid(self, lb, ub, TRUE);
    }
    /* SCHED_STATIC, SCHED_NONE and anything unexpected */
    return _ompc_static_sched_next_tid(self, lb, ub);
}

/* 
 * ordered
 */
/*
 * Record which iteration of an ordered loop the calling thread is running:
 * tp->loop_id becomes the zero-based iteration number derived from loop
 * index `i` and the lower bound / step stored by _ompc_init_ordered().
 *
 * Outside a parallel region there is no team descriptor to read from
 * (_ompc_init_ordered() stores nothing in that case), so the call is a
 * no-op, like _ompc_ordered_begin() and _ompc_ordered_end().
 */
void _ompc_set_loop_id(int i)
{
    struct ompc_thread *tp = _ompc_current_thread();
    struct ompc_parent *tpp;

    if((tpp = tp->parent) == NULL) return; /* not in parallel */

    tp->loop_id = (i - tpp->ordered_lb)/tpp->ordered_step;
    ST_POLLING();
}

/*
 * Prepare the team state for an ordered loop with lower bound `lb` and
 * increment `step`.  Does nothing outside a parallel region.
 *
 * Only the first thread of the team to arrive (its private counter still
 * equals the team's parent_count) resets the shared ordered state; every
 * thread increments its private counter.
 */
void _ompc_init_ordered(int lb,int step)
{
    struct ompc_thread *self = _ompc_current_thread();
    struct ompc_parent *team = self->parent;
    int seen;

    if (team == NULL) {
        return;                 /* not in parallel */
    }

    seen = st_read_and_lock_int(&team->parent_count);
    if (seen == self->count) {  /* first visitor executes it */
        team->ordered_id = 0;
        team->ordered_lb = lb;
        team->ordered_step = step;
        team->ordered_flag = 0;
        team->ordered_context_list = NULL;
        seen += 1;
    }
    st_write_and_unlock_int(&team->parent_count, seen);
    self->count += 1;
    ST_POLLING();
}

/*
 * Enter an ordered region: return once the team's ordered_id equals the
 * iteration number (tp->loop_id) of the calling thread.  Returns at once
 * outside a parallel region.
 *
 * The thread first spins, polling between checks.  After more than
 * max_count failed checks it tries to take _ompc_thread_lock on every
 * further pass; when it gets the lock and it is still not its turn, it
 * queues itself on tpp->ordered_context_list (kept in ascending id order)
 * and suspends.  _ompc_ordered_end() resumes the queued context whose id
 * has come up.
 */
void _ompc_ordered_begin()
{
    int i = 0, max_count;
    struct ompc_parent *tpp;
    struct ompc_thread *tp = _ompc_current_thread();
    
    if ((tpp = tp->parent) == NULL) return; /* sequential */

    /* spin budget: larger when the team itself has no parent */
    if (tpp->parent == NULL) {
	max_count = 1000000;
    } else {
	max_count = 100000;
    }
    while (1) {
	/* NOTE(review): the volatile casts apply to the values already read,
	 * not to the accesses; confirm the fields are declared so that they
	 * are re-read on every pass. */
	if ((volatile int)tp->loop_id == (volatile int)tpp->ordered_id) {
	    return;
	} else {
	    if (i++ > max_count) {
		/* presumably non-zero means the lock was obtained; both
		 * branches below release it */
		if (_ompc_test_lock(&_ompc_thread_lock)) {
		    /* re-check under the lock before queueing */
		    if ((volatile int)tp->loop_id != (volatile int)tpp->ordered_id) {
			/* context and list node are automatic variables of
			 * this call */
			struct st_context c[1];
			struct ompc_context_list list[1], **p;
			c->valid = 0;
			list->c = c;
			list->id = tp->loop_id;
			/* insert before the first entry with a larger id */
			for (p = &tpp->ordered_context_list; *p != NULL; p = &(*p)->next)
			    if ((*p)->id > tp->loop_id) {
				list->next = (*p);
				(*p) = list;
				break;
			    }
			/* no larger id found: append at the tail */
			if (*p == NULL) {
			    list->next = NULL;
			    (*p) = list;
			}
			_ompc_unlock(&_ompc_thread_lock);
			ST_POLLING();
			st_suspend_thread(c);
			/* execution continues here after the context has
			 * been resumed */
			_ompc_set_thread(tp);
			/* _ompc_ordered_end() waits for this flag */
			tpp->ordered_flag = 1;
			MBAR();
			return;
		    } else {
			_ompc_unlock(&_ompc_thread_lock);
		    }
		}
	    }
	    ST_POLLING();
	}
    }
}

/*
 * Leave an ordered region: advance the team's ordered_id and, if the
 * context queued for the new id is at the head of the waiting list, resume
 * it.  Does nothing outside a parallel region.
 */
void _ompc_ordered_end()
{
    struct ompc_parent *tpp;
    struct ompc_thread *tp = _ompc_current_thread();
    struct ompc_context_list *p;
    int i;

    if ((tpp = tp->parent) == NULL) return; /* sequential */

    _ompc_lock(&_ompc_thread_lock);
    tpp->ordered_id++;
    MBAR();
    if (tpp->ordered_context_list != NULL) {
        /* only the head is examined; the list is kept sorted by id */
        if (tpp->ordered_id == tpp->ordered_context_list->id) {
            /* the link is read before the waiter is resumed */
            p = tpp->ordered_context_list->next;
            st_resume_context(tpp->ordered_context_list->c);
            tpp->ordered_context_list = p;
	    _ompc_unlock(&_ompc_thread_lock);
	    /* wait until the resumed thread has set ordered_flag in
	     * _ompc_ordered_begin(), then clear it for the next round */
	    while (tpp->ordered_flag == 0) ST_POLLING();
	    tpp->ordered_flag = 0;
        } else {
	    _ompc_unlock(&_ompc_thread_lock);
	}
    } else {
	_ompc_unlock(&_ompc_thread_lock);
    }
    /* poll once per thread of the team before returning */
    for (i = 0; i < tpp->num_thds; i++) ST_POLLING();
}

/*
 * sections directives. section_id is allocated in round-robin manner.
 */
/*
 * Start a sections construct with `n_sections` sections for the calling
 * thread: its first section id is its own thread number, and the id of the
 * last section is remembered so that _ompc_section_id() can flag it.
 */
void _ompc_section_init(int n_sections)
{
    struct ompc_thread *self = _ompc_current_thread();

    self->section_id = self->num;
    self->last_section_id = n_sections - 1;
    self->is_last = 0;
    ST_POLLING();
}

/*
 * Return the section id the calling thread should run next and advance its
 * private section counter: by 1 outside a parallel region, otherwise by the
 * number of threads in the team.  Sets the thread's is_last flag when the
 * returned id is the last section.
 */
int _ompc_section_id()
{
    struct ompc_thread *self = _ompc_current_thread();
    struct ompc_parent *team = self->parent;
    int current = self->section_id;
    int advance;

    if (team == NULL) {
        advance = 1;
    } else {
        advance = team->num_thds;
    }
    self->section_id += advance;

    if (current == self->last_section_id) {
        self->is_last = 1;
    }
    ST_POLLING();
    return current;
}

/*
 * Return non-zero when the calling thread ran the last iteration/section
 * (its is_last flag is set) or is not inside a parallel region.
 */
int _ompc_is_last()
{
    struct ompc_thread *self = _ompc_current_thread();

    ST_POLLING();
    if (self->is_last) {
        return 1;
    }
    return self->parent == NULL;
}

/*
 * single construct
 */
/*
 * Decide whether the calling thread executes a single construct;
 * see _ompc_do_single_tid().
 */
int _ompc_do_single()
{
    struct ompc_thread *self;

    ST_POLLING();
    self = _ompc_current_thread();
    return _ompc_do_single_tid(self);
}

/*
 * Decide whether thread `tp` executes a single construct.
 *
 * Returns 1 outside a parallel region.  Inside one, returns 1 for the
 * first thread of the team to arrive (its private counter still equals the
 * team's parent_count, which it then bumps) and 0 for the others.  Every
 * thread increments its private counter.
 */
int _ompc_do_single_tid(struct ompc_thread *tp)
{
    struct ompc_parent *team = tp->parent;
    int seen;
    int chosen;

    if (team == NULL) {
        return 1;               /* not in parallel */
    }

    seen = st_read_and_lock_int(&team->parent_count);
    chosen = (seen == tp->count);   /* first visitor executes it */
    if (chosen) {
        seen += 1;
    }
    st_write_and_unlock_int(&team->parent_count, seen);
    tp->count += 1;
    ST_POLLING();
    return chosen;
}

/*
 * Return non-zero when the calling thread has thread number 0.
 */
int _ompc_is_master()
{
    struct ompc_thread *self = _ompc_current_thread();
    int thread_no;

    ST_POLLING();
    thread_no = self->num;
    return thread_no == 0;
}

/*
 * Return non-zero when the thread described by `tp` has thread number 0.
 */
int _ompc_is_master_tid(struct ompc_thread *tp)
{
    int thread_no;

    ST_POLLING();
    thread_no = tp->num;
    return thread_no == 0;
}



/* Lock held by _ompc_enter_critical() while it creates the lock of a
 * critical section on first use. */
static _ompc_lock_t	_critical_lock;

/* Initialise _critical_lock. */
void _ompc_critical_init ()
{
    ST_POLLING();
    _ompc_init_lock (&_critical_lock);
}

/* Destroy _critical_lock. */
void _ompc_critical_destroy ()
{
    ST_POLLING();
    _ompc_destroy_lock (&_critical_lock);
}

/*
 * Enter the critical section whose lock pointer is stored at *p.
 *
 * When *p is still NULL the lock is created on the spot: _critical_lock is
 * taken, *p is checked again, and if it is still NULL a lock is allocated,
 * stored in *p and initialised.  Allocation failure aborts through
 * _ompc_fatal().  The lock at *p is then acquired.
 */
void _ompc_enter_critical(_ompc_lock_t **p)
{
    if (*p == NULL) {
	_ompc_lock (&_critical_lock);
	/* NOTE(review): the volatile cast applies to the pointer value
	 * already read, not to the access to *p -- confirm this re-check
	 * really reloads *p. */
	if ((_ompc_lock_t volatile *)*p == NULL) {
	    if((*p = (_ompc_lock_t *)malloc(sizeof(_ompc_lock_t))) == NULL) {
		_ompc_fatal("cannot allocate lock memory");
	    }
	    _ompc_init_lock (*p);
	}
	_ompc_unlock (&_critical_lock);
    }
    _ompc_lock((_ompc_lock_t volatile *)*p);
    ST_POLLING();
}

/* Leave a critical section: release the lock that *p points to. */
void _ompc_exit_critical(_ompc_lock_t **p)
{
    _ompc_unlock(*p);
    ST_POLLING();
}

/* Single lock shared by the atomic lock/unlock functions below. */
static _ompc_lock_t _atomic_lock;

/* Initialise _atomic_lock. */
void _ompc_atomic_init_lock ()
{
    ST_POLLING();
    _ompc_init_lock (&_atomic_lock);
}

/* Acquire _atomic_lock; the runtime is polled before the lock is taken. */
void _ompc_atomic_lock()
{
    ST_POLLING();
    _ompc_lock(&_atomic_lock);
}

/* Release _atomic_lock; the runtime is polled before the lock is released. */
void _ompc_atomic_unlock()
{
    ST_POLLING();
    _ompc_unlock(&_atomic_lock);
}

/* Destroy _atomic_lock. */
void _ompc_atomic_destroy_lock ()
{
    ST_POLLING();
    _ompc_destroy_lock (&_atomic_lock);
}

/*
 * Copy nbyte bytes from src to dst.  Note the argument order: destination
 * first here, source first in the underlying bcopy() call.
 */
void _ompc_bcopy(char *dst,char *src,int nbyte)
{
    ST_POLLING();
    bcopy(src,dst,nbyte);
}

/*
 * Flush: polls the runtime and executes MBAR().  Neither dst nor nbyte is
 * used.
 */
void _ompc_flush(char *dst,int nbyte)
{
    ST_POLLING();
    MBAR();
}

/* Number of threadprivate indices handed out so far. */
volatile int _ompc_thdprv_count = 0;

/*
 * Return the calling thread's private copy of a threadprivate object.
 *
 * thdprv_p  address of a pointer that caches the index assigned to this
 *           object; the index is allocated on first use.
 * size      size of the object in bytes.
 * datap     address of the original (global) object.
 *
 * Outside a parallel region `datap` itself is returned.  Inside one, a
 * per-team table of per-thread copies is created on demand and the copy
 * for this thread is allocated on first access.  Only the copy of thread 0
 * is initialised here: from `datap` when the team has no enclosing team,
 * otherwise from the copy that the enclosing team holds for the thread
 * that owns this team.  Copies of other threads are left uninitialised.
 *
 * Every allocation failure aborts through _ompc_fatal().
 */
void * _ompc_get_thdprv(int **thdprv_p,int size,void *datap)
{
    void **pp,*p;
    int *n;
    int pcount;
    struct ompc_thread *tp;
    struct ompc_parent *tpp;
    tp = _ompc_current_thread();

    /* assign an index to this object on first use */
    if(*thdprv_p == NULL){
        n = (int *)malloc(sizeof(int));
        if(n == NULL) _ompc_fatal("cannot allocate memory");
        _ompc_lock(&_ompc_thread_lock);
        if(*thdprv_p == NULL){
            *n = _ompc_thdprv_count++;
            *thdprv_p = n;
            _ompc_unlock(&_ompc_thread_lock);
        } else {
            /* another thread assigned the index in the meantime */
            _ompc_unlock(&_ompc_thread_lock);
            free(n);
        }
    }
    if((tpp = tp->parent) == NULL) return datap;

    /* create the per-team table of copies on first use */
    if((pp = tpp->thread->thdprv[**thdprv_p]) == NULL){
        pp = (void **)malloc(sizeof(void *)*_ompc_max_threads);
        if(pp == NULL) _ompc_fatal("cannot allocate memory");
        bzero(pp,sizeof(void *)*_ompc_max_threads);
        pcount = st_read_and_lock_int(&tpp->parent_count);
        if(tpp->thread->thdprv[**thdprv_p] == NULL){
            tpp->thread->thdprv[**thdprv_p] = pp;
            st_write_and_unlock_int(&tpp->parent_count, pcount);
        } else {
            /* another thread installed a table in the meantime */
            st_write_and_unlock_int(&tpp->parent_count, pcount);
            free(pp);
            pp = tpp->thread->thdprv[**thdprv_p];
        }
    }

    /* allocate this thread's copy on first access */
    if((p = pp[tp->num]) == NULL){
        p = (void *)malloc(size);
        /* previously unchecked: a failed allocation was copied into and
         * handed back to the caller */
        if(p == NULL) _ompc_fatal("cannot allocate memory");
        if(tp->num == 0) {
            if(tpp->parent == NULL) {
                bcopy(datap, p, size);
            } else {
                bcopy(tpp->parent->thread->thdprv[**thdprv_p][tpp->thread->num], p, size);
            }
        }
        pp[tp->num] = p;
    }
    ST_POLLING();
    return p;
}

/*
 * copyin support: copy `size` bytes from global_datap into datap, then wait
 * at a barrier.
 */
void _ompc_copyin_thdprv(void *datap,void *global_datap,int size)
{
    bcopy(global_datap, datap, size);
    _ompc_barrier();
    ST_POLLING();
}

/* 
 * reduction operation
 */
/*
 * Reduction body for integral operand types.  type_t is the C type, t the
 * matching member name of any_type.
 *
 * The macro relies on these names from the expanding function: in_p,
 * out_p, op, vals, id, n_thd and exe.
 *
 * Every thread stores its input value in slot vals[id] and marks the slot
 * valid (_v = 1).  The thread with exe == 1 then waits until all n_thd
 * slots are valid, folds them into the value at out_p according to op,
 * writes the result back to out_p and returns vals to the free list.
 * PLUS and MINUS both add the partial values.  LOGAND / LOGOR leave the
 * value at out_p untouched or set it to 0 / 1 respectively.  An unknown op
 * aborts through _ompc_fatal().
 */
#define DO_REDUCTION_INTEGRAL(type_t,t) {\
  vals[id].r_v.t = *((type_t *)in_p); \
  vals[id]._v = 1; \
  if(exe == 1){ \
      any_type v; int i; \
      v.t = *((type_t *)out_p); \
      for (i = 0; i < n_thd; i++) while (vals[i]._v != 1) ST_POLLING();\
      switch(op){ \
      case OMPC_REDUCTION_PLUS: \
      case OMPC_REDUCTION_MINUS: \
	for(i = 0; i < n_thd; i++) v.t += vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_MUL: \
	for(i = 0; i < n_thd; i++) v.t *= vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_BITAND: \
	for(i = 0; i < n_thd; i++) v.t &= vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_BITOR: \
	for(i = 0; i < n_thd; i++) v.t |= vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_BITXOR: \
	for(i = 0; i < n_thd; i++) v.t ^= vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_LOGAND: \
	if(!v.t) break; \
	for(i = 0; i < n_thd; i++) \
          if(!vals[i].r_v.t) { v.t = 0; break; } \
	break; \
      case OMPC_REDUCTION_LOGOR: \
	if(v.t) break; \
	for(i = 0; i < n_thd; i++) \
          if(vals[i].r_v.t) { v.t = 1; break; } \
	break; \
      case OMPC_REDUCTION_MIN: \
	for(i = 0; i < n_thd; i++) \
         if(v.t>vals[i].r_v.t) v.t = vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_MAX: \
	for(i = 0; i < n_thd; i++) \
         if(v.t<vals[i].r_v.t) v.t = vals[i].r_v.t;\
	break; \
      default: \
	  _ompc_fatal("_ompc_reduction: bad op\n"); \
      } \
      *((type_t *)out_p) = v.t; \
      _ompc_reduction_free(vals); \
   } \
  }

/*
 * Reduction body for floating-point operand types.  type_t is the C type,
 * t the matching member name of any_type.
 *
 * Same protocol as DO_REDUCTION_INTEGRAL and the same names are expected in
 * the expanding function (in_p, out_p, op, vals, id, n_thd, exe).  There
 * are no cases for the bitwise operators here, so those ops take the
 * default branch and abort through _ompc_fatal().
 */
#define DO_REDUCTION_FLOAT(type_t,t) { \
  vals[id].r_v.t = *((type_t *)in_p); \
  vals[id]._v = 1; \
  if(exe == 1){ \
      any_type v; int i; \
      v.t = *((type_t *)out_p); \
      for (i = 0; i < n_thd; i++) while (vals[i]._v != 1) ST_POLLING();\
      switch(op){ \
      case OMPC_REDUCTION_PLUS: \
      case OMPC_REDUCTION_MINUS: \
	for(i = 0; i < n_thd; i++) v.t += vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_MUL: \
	for(i = 0; i < n_thd; i++) v.t *= vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_LOGAND: \
	if(!v.t) break; \
	for(i = 0; i < n_thd; i++) \
          if(!vals[i].r_v.t) { v.t = 0; break; } \
	break; \
      case OMPC_REDUCTION_LOGOR: \
	if(v.t) break; \
	for(i = 0; i < n_thd; i++) \
          if(vals[i].r_v.t) { v.t = 1; break; } \
	break; \
      case OMPC_REDUCTION_MIN: \
	for(i = 0; i < n_thd; i++) \
         if(v.t > vals[i].r_v.t) v.t=vals[i].r_v.t;\
	break; \
      case OMPC_REDUCTION_MAX: \
	for(i = 0; i < n_thd; i++) \
         if(v.t < vals[i].r_v.t) v.t=vals[i].r_v.t;\
	break; \
      default: \
	  _ompc_fatal("_ompc_reduction: bad op\n"); \
      } \
      *((type_t *)out_p) = v.t; \
      _ompc_reduction_free(vals); \
   } \
 }

/* Free list of reduction buffers, linked through the `next` member of the
 * first element of each buffer; guarded by _ompc_thread_lock. */
static struct barrier_flag *_ompc_reduction_list = NULL;

/*
 * Obtain a reduction buffer of _ompc_max_threads slots with every slot
 * marked invalid (_v = 0).  A buffer is taken from the free list when one
 * is available, otherwise a new one is allocated.
 *
 * Returns the buffer; aborts through _ompc_fatal() when memory cannot be
 * allocated.
 */
static struct barrier_flag *_ompc_reduction_alloc()
{
    int i;
    struct barrier_flag *vals;

    _ompc_lock(&_ompc_thread_lock);
    if (_ompc_reduction_list == NULL) {
        vals = (struct barrier_flag *)malloc(sizeof(struct barrier_flag) * _ompc_max_threads);
    } else {
        vals = _ompc_reduction_list;
        _ompc_reduction_list = vals->next;
    }
    _ompc_unlock(&_ompc_thread_lock);

    /* previously unchecked: the loop below would write through NULL */
    if (vals == NULL) _ompc_fatal("cannot allocate memory");

    for (i = 0; i < _ompc_max_threads; i++) vals[i]._v = 0;

    ST_POLLING();
    return vals;
}

/*
 * Return a reduction buffer to the free list: it is pushed at the head of
 * _ompc_reduction_list while _ompc_thread_lock is held.  The memory is not
 * released.
 */
static void _ompc_reduction_free(struct barrier_flag *vals)
{
    _ompc_lock(&_ompc_thread_lock);
    vals->next = _ompc_reduction_list;
    _ompc_reduction_list = vals;
    _ompc_unlock(&_ompc_thread_lock);
    ST_POLLING();
}

/*
 * Combine the calling thread's partial value into a shared result.
 *
 * in_p   address of this thread's partial value.
 * out_p  address of the result; only the combining thread (exe == 1)
 *        reads and writes it.
 * type   OMPC_REDUCTION_* code selecting the operand type.
 * op     OMPC_REDUCTION_* code selecting the operator (see the
 *        DO_REDUCTION_* macros).
 *
 * Outside a parallel region the caller is the only contributor.  Inside
 * one, the first thread of the team to arrive allocates the buffer for
 * this reduction and becomes the combining thread; the others only deposit
 * their value.  OMPC_REDUCTION_LONG_DOUBLE and unknown type codes abort
 * through _ompc_fatal().
 */
void _ompc_reduction(void *in_p,void *out_p,int type, int op)
{
    struct ompc_thread *tp = _ompc_current_thread();
    struct ompc_parent *tpp;
    struct barrier_flag *vals;
    int id,n_thd,pcount,exe = 0;

    if((tpp = tp->parent) == NULL) {
	/* not in parallel: single contributor, slot 0 */
	id = 0;
	n_thd = 1;
	vals = _ompc_reduction_alloc();
	exe = 1;
    } else {
	id = tp->num;
	n_thd = tpp->num_thds;
	/* index of the barrier_flags slot for this reduction; the counter
	 * is masked with RED_MASK */
	tp->red_count = (tp->red_count + 1) & RED_MASK;
	pcount = st_read_and_lock_int(&tpp->parent_count);
	if(pcount == tp->count){ /* first visitor execute it */
	    tpp->barrier_flags[tp->red_count] = _ompc_reduction_alloc();
	    exe = 1;
	    pcount++;
	}
	st_write_and_unlock_int(&tpp->parent_count, pcount);
	tp->count++;
	/* read after the locked section above, in which the first visitor
	 * installs the buffer */
	vals = tpp->barrier_flags[tp->red_count];
    }

    /* the macros below use in_p, out_p, op, vals, id, n_thd and exe */
    switch(type){
    case OMPC_REDUCTION_CHAR:
	DO_REDUCTION_INTEGRAL(char,c);
	break;
    case OMPC_REDUCTION_UNSIGNED_CHAR:
	DO_REDUCTION_INTEGRAL(unsigned char,uc);
	break;

    case OMPC_REDUCTION_SHORT:
	DO_REDUCTION_INTEGRAL(short,s);
	break;
    case OMPC_REDUCTION_UNSIGNED_SHORT:
	DO_REDUCTION_INTEGRAL(unsigned short,us);
	break;

    case OMPC_REDUCTION_SIGNED:
    case OMPC_REDUCTION_INT:
        DO_REDUCTION_INTEGRAL(int, i);
        break;
    case OMPC_REDUCTION_UNSIGNED_INT:
        DO_REDUCTION_INTEGRAL(unsigned int, ui);
        break;

    case OMPC_REDUCTION_LONG:
	DO_REDUCTION_INTEGRAL(long,l);
	break;
    case OMPC_REDUCTION_UNSIGNED_LONG:
        DO_REDUCTION_INTEGRAL(unsigned long,ul);
        break;

    case OMPC_REDUCTION_LONGLONG:
	DO_REDUCTION_INTEGRAL(long long,ll);
        break;
    case OMPC_REDUCTION_UNSIGNED_LONGLONG:
	DO_REDUCTION_INTEGRAL(unsigned long long,ull);
        break;

    case OMPC_REDUCTION_FLOAT:
	DO_REDUCTION_FLOAT(float,f);
	break;
    case OMPC_REDUCTION_DOUBLE:
	DO_REDUCTION_FLOAT(double,d);
	break;

    /* long double is not supported and is reported as a bad type */
    case OMPC_REDUCTION_LONG_DOUBLE:
    default:
	_ompc_fatal("_ompc_reduction: bad type");
    }
    ST_POLLING();
}
