// File created: 2007-07-12 14:23:49

import tango.core.Array : sort;

import tango.io.Stdout;

import tango.math.Random;

import regex = tango.text.Regex;
import tango.text.Util            : locate;
import tango.text.convert.Integer : toString, toInt, atoi;

import tango.time.StopWatch;

alias typeof(StopWatch.init.stop()) Interval;

const char[] USAGE =
"	{} RANGE RUNS [LENGTHS...]

Benchmark various sorting algorithms by sorting 32-bit integers.

Results are printed as three sets of three integers, representing milliseconds.
Each set contains the mean, minimum, and maximum times taken sorting. The first
set is for a random array, the second for a reversed array, and the third for
an already sorted array.

RANGE is the range of the numbers which will be sorted, for instance -10-20.

RUNS is the number of times each sort will be performed, to average the time
taken.

LENGTHS... stands for any number of numerical arguments describing the lengths
of arrays which will be sorted.

Each length will be sorted both in already sorted order, in reversed order, and
in random order. This random order will be generated only once for each length
to make the comparisons as fair as possible.

If RANGE isn't valid, the default, {} to {}, will be used.

If RUNS is 0 or not numeric, the default, {} will be used.

If LENGTHS are not specified, the defaults used will be {}.";

alias int type;

size_t[] lengths = [15u, 1000, 100_000, 5_000_000];
uint runs = 5;
type minRange = -1000,
     maxRange =  1000;

void main(char[][] args) {
	void printUsage() {
		Stderr.formatln(USAGE, args[0], minRange, maxRange, runs, arrToString(lengths));
	}

	size_t l = 0;
	bool gotLengths = false;
	foreach (i, arg; args[1..$]) {
		static bool help(char[] s) { return (regex.find(s, "^(--?|/)([?]|h(e?lp)?)$", "i") != -1); }

		if (arg.help())
			return printUsage();

		if (i == 0) {
			auto idx = locate(arg, '-');

			if (idx == arg.length) {
				Stderr("Ignoring invalid range '")(arg)("'").newline;
				continue;
			}

			int minr, maxr;
			try {
				minr = toInt(arg[0..idx]);
				maxr = toInt(arg[idx+1..$]);
			} catch {
				Stderr("Ignoring invalid range '")(arg)("'").newline;
				continue;
			}

			if (minr <= maxr) {
				if (minr)
					minRange = minr;
				else
					Stderr("Ignoring invalid minimum range ")(minr).newline;

				if (maxr)
					maxRange = maxr;
				else
					Stderr("Ignoring invalid maximum range ")(maxr).newline;
			} else
				Stderr("Ignoring inverted range '")(arg)("'").newline;
		} else {
			uint a = atoi(arg);
			if (a) {
				if (i == 1)
					runs = a;
				else {
					if (l == lengths.length)
						lengths.length = l * 2;

					gotLengths = true;
					lengths[l++] = a;
				}
			} else {
				if (i == 1)
					Stderr("Ignoring invalid run count ")(a).newline;
				else
					Stderr("Ignoring invalid length ")(a).newline;
			}
		}
	}
	if (gotLengths)
		lengths.length = l;

	Stdout.formatln(
		"Using arrays of lengths {} containing numbers in the range {}-{}.",

		arrToString(lengths),
		minRange, maxRange

	).newline.print(
		"Note that odd lengths will be increased by one for the 'median-of-3 killer' sequences used to test quicksort. "
		"This means that if a length is odd, one of the arrays will actually have a length one greater than the rest."

	).newline.newline.format(
		"Generating {} arrays", 4 * lengths.length
	);

	type[][4][] sortdata = new type[][4][lengths.length];

	foreach (i, len; lengths) {
		sortdata[i][0] = new type[len];

		foreach (ref val; sortdata[i][0])
			val = randomize(minRange, maxRange);

		Stdout('.').flush;
		sortdata[i][2] = sortdata[i][0].dup.sort;
		Stdout('.').flush;
		sortdata[i][1] = sortdata[i][2].dup.reverse;
		Stdout('.').flush;

		if (len & 1)
			++len;

		sortdata[i][3] = new type[len];

		for (size_t j = 1, k = len/2; j <= k; ++j) {
			if (j & 1) {
				sortdata[i][3][j-1] = j;
				sortdata[i][3][j]   = j+k;
			}
			sortdata[i][3][k+j-1] = 2*j;
		}

		Stdout('.').flush;
	}

	Stdout(" done.").newline.newline.formatln(
		"Each of the {} algorithms will run {} times over 4 arrays of each length, "
		"for a total of {} passes for each algorithm, "
		"for a total of {} passes globally.",

		algorithms.length,
		runs,
		4 * runs * lengths.length,
		4 * runs * lengths.length * algorithms.length
	);

	StopWatch watch;

	void benchmark(char[] name, void function(type[]) algo) {
		Stdout.newline.print(name).flush;

		// test trivial cases
		algo([]);
		algo([1]);

		Interval[4][] sum, min, max;

		sum.length = min.length = max.length = sortdata.length;

		for (size_t i = 0; i < sum.length; ++i) {
			sum[i][] = 0;
			min[i][] = Interval.max;
			max[i][] = Interval.min;
		}

		for (auto r = runs; r-- > 0;) {
			foreach (j, set; sortdata) {
				void timedsort(size_t i) {
					auto a = set[i].dup;

					watch.start;
					algo(a);
					Interval t = watch.stop;

					assert (sorted(a), "Not sorted");
					assert (sameContents(a, set[i]), "Array corrupted");

					sum[j][i] += t;

					if (t < min[j][i]) min[j][i] = t;
					if (t > max[j][i]) max[j][i] = t;
				}

				timedsort(0);
				timedsort(1);
				timedsort(2);
				if (algo !is &tangoSort)
					timedsort(3);

				Stdout('.').flush;
			}
			Stdout(' ').flush;
		}

		Stdout.newline;

		foreach (j, set; sortdata) {
			auto s = sum[j];
			auto i = min[j];
			auto x = max[j];

			const FACTOR = 10_000UL;

			Stdout.formatln(
				"{,7}: "
				"{,5} {,5} {,5} | "
				"{,5} {,5} {,5} | "
				"{,5} {,5} {,5} | "
				"{,5} {,5} {,5}",

				set[0].length,
				cast(ulong)(s[0] / runs * FACTOR), cast(ulong)(i[0] * FACTOR), cast(ulong)(x[0] * FACTOR),
				cast(ulong)(s[1] / runs * FACTOR), cast(ulong)(i[1] * FACTOR), cast(ulong)(x[1] * FACTOR),
				cast(ulong)(s[2] / runs * FACTOR), cast(ulong)(i[2] * FACTOR), cast(ulong)(x[2] * FACTOR),
				cast(ulong)(s[3] / runs * FACTOR), cast(ulong)(i[3] * FACTOR), cast(ulong)(x[3] * FACTOR)
			);
		}
	}

	foreach (algo; algorithms)
		benchmark(algo.name, algo.sort);
}

char[] arrToString(T)(T[] a) {
	char[] s = "[";
	foreach (e; a)
		s ~= toString(e) ~ ",";
	s[$-1] = ']';
	return s;
}

int randomize(int min, int max) {
	auto range = max - min;

	assert (range > 0);

	auto mod = uint.max - uint.max % range;

	uint n;
	do n = Random.shared.next();
	while (n >= mod);

	return cast(int)(n % range) + min;
}

bool sorted(type[] a) {
	if (a.length > 1)
	foreach (j, n; a[1..$])
	if (n != a[j] && cmp(n, a[j])) {
		Stderr.formatln("cmp {} and {} shouldn't be true in {}", a[j], n, arrToString(a));
		return false;
	}

	return true;
}

bool sameContents(type[] a, type[] b) {
	bool[type] aa;

	foreach (i; a)
		aa[i] = true;

	foreach (i; b)
		if (!(i in aa))
			return false;

	return true;
}

///////////////////////////////////

void swap(T)(inout T a, inout T b) {
	auto
	t = a;
	a = b;
	b = t;
}

bool compare(type a, type b) {
	return a < b;
}

bool function(type, type) cmp;

struct Algo {
	void function(type[]) sort;
	char[] name;
}
Algo[] algorithms;

static this() {
	cmp = &compare;

	algorithms = [
		//Algo( &insertionSort, "Insertion sort" )
		//,Algo( &selectionSort, "Selection sort" )

		//Algo( &combSort11, "Comb sort 11" )
		//,Algo( &shellSort, "Shell sort" )

		//Algo( &heapSort, "Heap sort" )
		//,Algo( &smoothSort, "Smoothsort" )

		Algo( &builtinSort, "Built-in sort (UNFAIR: DOESN'T CALL FUNCTION FOR COMPARISON)" )
		,Algo( &tangoSort, "Tango sort" ) // dies due to stack overflow for median of 3 killers
		,Algo( &introSort, "Introspective sort" )

		//,Algo( &mergeSort, "Merge sort" )
		,Algo( &fastSort_orig!(type), "Fast sort by leonardo maffi (UNFAIR like built-in)" )
		,Algo( &fastSort!(type), "Fast sort by leonardo maffi" )
	];
}

void insertionSort(type[] a) {
	for (size_t i = 1; i < a.length; ++i) {
		type val = a[i];

		size_t pos = i-1;
		for (; pos < a.length && cmp(val, a[pos]); --pos)
			a[pos+1] = a[pos];

		a[pos+1] = val;
	}
}

void selectionSort(type[] a) {
	if (a.length) for (size_t i = 0; i < a.length - 1; ++i) {
		type min = a[i];
		size_t m = a.length;

		foreach (j, n; a[i+1..$])
		if (cmp(n, min)) {
			min = n;
			m = j;
		}

		if (m < a.length) {
			a[i+m+1] = a[i];
			a[i] = min;
		}
	}
}

void combSort11(type[] a) {
	// empirically found to be good here: http://world.std.com/~jdveale/combsort.htm
	// 1 / (1 - 1 / (e ^ phi)), phi being golden ratio = (sqrt(5) + 1)/2
	const real SHRINK_FACTOR = 1.2473309501039786540366528676643474234433714124826;

	bool swapped = void;
	size_t gap = a.length;

	if (gap) do {
		gap = cast(size_t)(cast(real) gap / SHRINK_FACTOR);

		switch (gap) {
			case     0:  gap = 1; break;
			case 9, 10: gap = 11; break;
			default: break;
		}

		swapped = false;

		for (size_t i = 0; i < a.length - gap; ++i) {
			size_t j = i + gap;

			if (cmp(a[j], a[i])) {
				swap(a[i], a[j]);
				swapped = true;
			}
		}

	} while (swapped || gap > 1);
}

void shellSort(type[] a) {

	// magic numbers from http://www.research.att.com/~njas/sequences/A102549
	foreach (h; [701,301,132,57,23,10,4,1])
	for (size_t i = h; i < a.length; ++i) {
		type v = a[i];
		size_t j = i;

		//for (; j >= h && cmp(v, a[j-h]); j -= h)
		//	a[j] = a[j-h];
		// measurably faster code below

		do {
			auto jh = j-h;

			if (!cmp(v, a[jh]))
				break;

			a[j] = a[jh];
			j = jh;
		} while (j >= h);

		a[j] = v;
	}
}

void heapSort(type[] a) {
	if (a.length <= 1)
		return;

	auto n = a.length;
	auto i = n / 2;

	for (;;) {
		type t = void;

		if (i > 0)
			t = a[--i];
		else {
			if (--n == 0)
				return;

			t = a[n];
			a[n] = a[0];
		}

		auto parent = i;
		auto child  = i*2 + 1;

		while (child < n) {
			if (child + 1 < n && cmp(a[child], a[child+1]))
				++child;

			if (cmp(t, a[child])) {
				a[parent] = a[child];
				parent = child;
				child = parent*2 + 1;
			} else
				break;
		}

		a[parent] = t;
	}
}

void smoothSort(type[] a) {

	if (a.length <= 1)
		return;

	static void up(inout size_t b, inout size_t c) {
		auto
		t = b + c + 1;
		c = b;
		b = t;
	}

	static void down(inout size_t b, inout size_t c) {
		auto
		t = b - c - 1;
		b = c;
		c = t;
	}

	size_t
		r = 0,
		p = 1,
		b = 1,
		c = 1,
		r1 = 0,
		b1 = void,
		c1 = void;

	void sift() {
		while (b1 >= 3) {
			auto r2 = r1 - b1 + c1;
			auto r3 = r1 - 1;

			if (cmp(a[r2], a[r3])) {
				r2 = r3;
				down(b1, c1);
			}

			if (cmp(a[r1], a[r2])) {
				swap(a[r1], a[r2]);
				r1 = r2;
				down(b1, c1);
			} else
				break;
		}
	}

	void trinkle() {
		auto p1 = p;

		while (p1 > 0) {
			while (p1 % 2 == 0) {
				p1 /= 2;
				up(b1, c1);
			}
			auto r3 = r1 - b1;

			if (p1 == 1 || cmp(a[r3], a[r1]))
				break;
			else {
				--p1;
				if (b1 == 1) {
					swap(a[r1], a[r3]);
					r1 = r3;
				} else if (b1 >= 3) {
					auto r2 = r1 - b1 + c1;
					auto r4 = r1 - 1;

					if (cmp(a[r2], a[r4])) {
						r2 = r4;
						down(b1, c1);
						p1 *= 2;
					}

					if (cmp(a[r3], a[r2])) {
						swap(a[r1], a[r2]);
						r1 = r2;
						down(b1, c1);
						break;
					} else {
						swap(a[r1], a[r3]);
						r1 = r3;
					}
				}
			}
		}
		sift();
	}

	void semitrinkle() {
		r1 = r - c;
		if (cmp(a[r], a[r1])) {
			swap(a[r], a[r1]);
			b1 = b;
			c1 = c;
			trinkle();
		}
	}

	size_t q = 1;
	for (; q < a.length; ++q) {
		if (p % 8 == 3) {
			r1 = r;
			b1 = b;
			c1 = c;

			sift();
			p = (p+1) >>> 2;
			up(b, c);
			up(b, c);

		} else if (p % 4 == 1) {
			r1 = r;
			b1 = b;
			c1 = c;

			if (q + c < a.length)
				sift();
			else
				trinkle();

			do {
				down(b, c);
				p *= 2;
			} while (b > 1);

			++p;
		}

		++r;
	}

	r1 = r;
	b1 = b;
	c1 = c;
	trinkle();

	while (q-- > 1) {
		if (b == 1) {
			--r;
			--p;
			while (p % 2 == 0) {
				p /= 2;
				up(b, c);
			}
		} else if (b >= 3) {

			r += c - b;
			if (--p != 0)
				semitrinkle();

			down(b, c);
			p = 2*p + 1;

			r += c;
			semitrinkle();

			down(b, c);
			p = 2*p + 1;
		}
	}
}

void builtinSort(type[] a) {
	a.sort;
	static if (compare(2, 1))
		a.reverse;
}

void tangoSort(type[] a) {
	sort(a, cmp);
}

// did testing in increments of 16, starting from the SGI STL's value of 16
// for my not-quite-new-yet-not-old machine this is approximately optimal
enum : size_t { INSERTIONSORT_THRESHOLD = 80 }

void introSort(type[] a) {
	static type median(type a, type b, type c) {
		if (cmp(a, b))
			if (cmp(b, c))
				return b;
			else if (cmp(a, c))
				return c;
			else
				return a;
		else if (cmp(a, c))
			return a;
		else if (cmp(b, c))
			return c;
		else
			return b;
	}

	static void doIntroSort(type[] a, size_t maxDepth) {
		while (a.length > INSERTIONSORT_THRESHOLD) {
			if (maxDepth-- == 0)
				return heapSort(a);

			type pivot = median(a[0], a[$/2], a[$-1]);
			size_t cut = 0, i = a.length;

			do {
				while (cmp(a[cut], pivot))
					++cut;
				--i;

				while (cmp(pivot, a[i]))
					--i;

				swap(a[cut++], a[i]);

			} while (cut < i);

			doIntroSort(a[cut..$], maxDepth);
			a = a[0..cut];
		}
	}

	if (a.length > 1) {
		size_t lg = 0;
		size_t i = a.length;
		do {
			++lg;
			i /= 2;
		} while (i > 1);

		doIntroSort(a, lg*2);
		insertionSort(a);
	}
}

void mergeSort(type[] a) {
	// use the same array throughout one external call

	static size_t orig = 0;
	static type[] b;

	if (a.length <= INSERTIONSORT_THRESHOLD)
		return insertionSort(a);

	if (orig == 0) {
		orig = a.length;
		b = new type[orig];
	}

	size_t mid = a.length / 2;

	auto l = a[0..mid];
	auto r = a[mid..$];

	mergeSort(l);
	mergeSort(r);

	size_t i, j, k;

	while (i < l.length && j < r.length) {

		// cmp must return true for equal values
		// if this is to be a stable sort

		if (cmp(l[i], r[j]))
			b[k] = l[i++];
		else
			b[k] = r[j++];

		++k;

	}

	b[k..k+l.length-i]           = l[i..$];
	b[   k+l.length-i..a.length] = r[j..$];

	a[] = b[0..a.length];

	if (a.length == orig) {
		orig = 0;
		delete b;
	}
}

// d.func.d, V.2.45, Jan 17 2008, by leonardo maffi
// Copyright: Python licence.

// edited by Deewiant to make fastSort return void

void fastSort_orig(TyItem)(TyItem[] v) {
    int inf = 0, sup = v.length - 1;
    _aqs_orig(v, inf, sup); // Call to the approximate Quick Sort

    // Just 1 application of Simple Insertion Sort (the faster version possible?)
    TyItem t, tt;
    for (int i = inf+1; i <= sup; i++) {
        int j = i;
        t = v[j];
        tt = (j > inf) ? v[j - 1] : 0;
        while ((j > inf) && (tt > t)) {
            v[j] = tt;
            j--;
            if (j > inf)
                tt = v[j - 1];
        }
        if (i != j)
            v[j] = t;
    }

    return v;
} // End of fastSort()

// Approximate Quick Sort, helper of fastSort
private void _aqs_orig(TyItem)(TyItem[] v, int l, int r) {
    const tresh = 8;
    TyItem a, aux;
    int i, j, auxl;

    auxl = l; // To remove the tail recursion.

    // If the sub vector is too small, then the QuickSort doesn't sort it.
    // So, this isn't a sort algorithm, but a approximated-sort algorithm.
    // The SIS at the end performs the exact sorting.
    while ((r - auxl) > tresh) {
        i = (r + auxl) / 2;

        // Tri-median method, very useful to balance the partitions
        if (v[auxl] > v[i]) { aux = v[auxl]; v[auxl] = v[i]; v[i] = aux; }
        if (v[auxl] > v[r]) { aux = v[auxl]; v[auxl] = v[r]; v[r] = aux; }
        if (v[i] > v[r]) { aux = v[i]; v[i] = v[r]; v[r] = aux; }

        j = r - 1; // pivot.
        aux = v[i]; v[i] = v[j]; v[j] = aux; // swap
        i = auxl;
        a = v[j];
        for (;;) {
            do i++; while (v[i] < a);
            do j--; while (v[j] > a);
            if (j < i) break;
            aux = v[i]; v[i] = v[j]; v[j] = aux; // swap
        }
        aux = v[i]; v[i] = v[r - 1]; v[r - 1] = aux; // swap

        _aqs_orig(v, auxl, j); // Inner recursive call, removing it the program slows down

        auxl = i + 1; // Tail recursion removed for speed
    }
}

// edited further to use the cmp() function
// added "j &&" in second do in _aqs to prevent array bounds errors with flipped cmp()

void fastSort(TyItem)(TyItem[] v) {
    int inf = 0, sup = v.length - 1;
    _aqs(v, inf, sup); // Call to the approximate Quick Sort

    // Just 1 application of Simple Insertion Sort (the faster version possible?)
    TyItem t, tt;
    for (int i = inf+1; i <= sup; i++) {
        int j = i;
        t = v[j];
        tt = (j > inf) ? v[j - 1] : 0;
        while ((j > inf) && cmp(t, tt)) {
            v[j] = tt;
            j--;
            if (j > inf)
                tt = v[j - 1];
        }
        if (i != j)
            v[j] = t;
    }

    return v;
} // End of fastSort()


// Approximate Quick Sort, helper of fastSort
private void _aqs(TyItem)(TyItem[] v, int l, int r) {
    const tresh = 8;
    TyItem a, aux;
    int i, j, auxl;

    auxl = l; // To remove the tail recursion.

    // If the sub vector is too small, then the QuickSort doesn't sort it.
    // So, this isn't a sort algorithm, but a approximated-sort algorithm.
    // The SIS at the end performs the exact sorting.
    while ((r - auxl) > tresh) {
        i = (r + auxl) / 2;

        // Tri-median method, very useful to balance the partitions
        if (cmp(v[auxl], v[i])) { aux = v[auxl]; v[auxl] = v[i]; v[i] = aux; }
        if (cmp(v[auxl], v[r])) { aux = v[auxl]; v[auxl] = v[r]; v[r] = aux; }
        if (cmp(v[i], v[r])) { aux = v[i]; v[i] = v[r]; v[r] = aux; }

        j = r - 1; // pivot.
        aux = v[i]; v[i] = v[j]; v[j] = aux; // swap
        i = auxl;
        a = v[j];
        for (;;) {
            do i++; while (cmp(v[i], a));
            do j--; while (j && cmp(a, v[j]));
            if (j < i) break;
            aux = v[i]; v[i] = v[j]; v[j] = aux; // swap
        }
        aux = v[i]; v[i] = v[r - 1]; v[r - 1] = aux; // swap

        _aqs(v, auxl, j); // Inner recursive call, removing it the program slows down

        auxl = i + 1; // Tail recursion removed for speed
    }
}