June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to Andrea Fontana | On Wednesday, 15 June 2016 at 09:32:21 UTC, Andrea Fontana wrote:
> Then I think the slice.byElement.array is the right solution.
The problem with that is that it slows down the code. I compared matrix multiplication between R and D's cblas adaptor and ndslice.
n = 4000
Matrices: A, B
Sizes: both n by n
Engine: both call openblas
R Elapsed Time: 2.709 s
D's cblas and ndslice: 3.593 s
The R code:
n = 4000; A = matrix(runif(n*n), nr = n); B = matrix(runif(n*n), nr = n)
system.time(C <- A%*%B)
The D code:
import std.stdio : writeln;
import std.experimental.ndslice;
import std.random : Random, uniform;
import std.conv : to;
import std.array : array;
import cblas;
import std.datetime : StopWatch;
T[] runif(T)(ulong len, T min, T max){
T[] arr = new T[len];
Random gen;
for(ulong i = 0; i < len; ++i)
arr[i] = uniform(min, max, gen);
return arr;
}
// Random matrix
auto rmat(T)(ulong nrow, ulong ncol, T min, T max){
return runif(nrow*ncol, min, max).sliced(nrow, ncol);
}
auto matrix_mult(T)(Slice!(2, T*) a, Slice!(2, T*) b){
int M = to!int(a.shape[0]);
int K = to!int(a.shape[1]);
int N = to!int(b.shape[1]);
int n_el = to!int(a.elementsCount);
T[] A = a.byElement.array;
T[] B = b.byElement.array;
T[] C = new T[M*N];
gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N);
return C.sliced(M, N);
}
void main()
{
int n = 4000;
auto A = rmat(n, n, 0., 1.);
auto B = rmat(n, n, 0., 1. );
StopWatch sw;
sw.start();
auto C = matrix_mult(A, B);
sw.stop();
writeln("Time taken: \n\t", sw.peek().msecs, " [ms]");
}
In my system monitor I can see the copy phase in the D process as as single core process. There should be a way to do go from ndslice to T[] without copying. Using a foreach loop is even slower
|
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to data pulverizer | On Wednesday, 15 June 2016 at 11:19:20 UTC, data pulverizer wrote: > On Wednesday, 15 June 2016 at 09:32:21 UTC, Andrea Fontana wrote: >> Then I think the slice.byElement.array is the right solution. > > The problem with that is that it slows down the code. I compared matrix multiplication between R and D's cblas adaptor and ndslice. > > n = 4000 > Matrices: A, B > Sizes: both n by n > Engine: both call openblas > > R Elapsed Time: 2.709 s > D's cblas and ndslice: 3.593 s > > The R code: > > n = 4000; A = matrix(runif(n*n), nr = n); B = matrix(runif(n*n), nr = n) > system.time(C <- A%*%B) > > The D code: > > import std.stdio : writeln; > import std.experimental.ndslice; > import std.random : Random, uniform; > import std.conv : to; > import std.array : array; > import cblas; > import std.datetime : StopWatch; > > > T[] runif(T)(ulong len, T min, T max){ > T[] arr = new T[len]; > Random gen; > for(ulong i = 0; i < len; ++i) > arr[i] = uniform(min, max, gen); > return arr; > } > > // Random matrix > auto rmat(T)(ulong nrow, ulong ncol, T min, T max){ > return runif(nrow*ncol, min, max).sliced(nrow, ncol); > } > > auto matrix_mult(T)(Slice!(2, T*) a, Slice!(2, T*) b){ > int M = to!int(a.shape[0]); > int K = to!int(a.shape[1]); > int N = to!int(b.shape[1]); > int n_el = to!int(a.elementsCount); > T[] A = a.byElement.array; > T[] B = b.byElement.array; > T[] C = new T[M*N]; > gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N); > return C.sliced(M, N); > } > > > void main() > { > int n = 4000; > auto A = rmat(n, n, 0., 1.); > auto B = rmat(n, n, 0., 1. ); > StopWatch sw; > sw.start(); > auto C = matrix_mult(A, B); > sw.stop(); > writeln("Time taken: \n\t", sw.peek().msecs, " [ms]"); > } > > In my system monitor I can see the copy phase in the D process as as single core process. There should be a way to do go from ndslice to T[] without copying. Using a foreach loop is even slower As said you can avoid the copy (see below). I also profiled it a bit and it was interesting to see that 50% of the runtime are spent on generating the random matrix. On my machine now both scripts take 1.5s when compiled with DFLAGS="-release -O3 -boundscheck=off" dub foo2.d --compiler=ldc (`-b release` would also work) #!/usr/bin/env dub /+ dub.sdl: name "matrix_mult" dependency "cblas" version="~master" dependency "mir" version="~>0.15" +/ import std.stdio : writeln; import mir.ndslice; import std.random : Random, uniform; import std.conv : to; import std.array : array; import cblas; import std.datetime : StopWatch; T[] runif(T)(ulong len, T min, T max){ T[] arr = new T[len]; Random gen; for(ulong i = 0; i < len; ++i) arr[i] = uniform(min, max, gen); return arr; } // Random matrix auto rmat(T)(ulong nrow, ulong ncol, T min, T max){ import std.typecons : tuple; auto arr = runif(nrow*ncol, min, max); return tuple(arr, arr.sliced(nrow, ncol)); } auto matrix_mult(T)(T[] A, T[] B, Slice!(2, T*) a, Slice!(2, T*) b){ int M = to!int(a.shape[0]); int K = to!int(a.shape[1]); int N = to!int(b.shape[1]); int n_el = to!int(a.elementsCount); T[] C = new T[M*N]; gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N); return C.sliced(M, N); } void main() { int n = 4000; auto ta = rmat(n, n, 0., 1.); auto tb = rmat(n, n, 0., 1. ); StopWatch sw; sw.start(); auto C = matrix_mult(ta[0], tb[0], ta[1], tb[1]); sw.stop(); writeln("Time taken: \n\t", sw.peek().msecs, " [ms]"); } For performance issues, you should definitely open an issue at mir (the development library of ndslice): https://github.com/libmir/mir |
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to Seb | On Wednesday, 15 June 2016 at 12:10:32 UTC, Seb wrote: > As said you can avoid the copy (see below). I also profiled it a bit and it was interesting to see that 50% of the runtime are spent on generating the random matrix. On my machine now both scripts take 1.5s when compiled with I didn't benchmark the RNG but I did notice it took a lot of time to generate the matrix but for now I am focused on the BLAS side of things. I am puzzled about how your code works: Firstly: I didn't know that you could substitute an array for its first element in D though I am aware that a pointer to an array's first element is equivalent to passing the array in C. > > auto matrix_mult(T)(T[] A, T[] B, Slice!(2, T*) a, Slice!(2, T*) b){ > ... > gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N); > return C.sliced(M, N); > } > Secondly: I am especially puzzled about using the second element to stand in for the slice itself. How does that work? And where can I find more cool tricks like that? > void main() > { > ... > auto C = matrix_mult(ta[0], tb[0], ta[1], tb[1]); > sw.stop(); > writeln("Time taken: \n\t", sw.peek().msecs, " [ms]"); > } > Many thanks! |
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to data pulverizer | Oh, I didn't see that runif now returns a tuple. |
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to data pulverizer | On Wednesday, 15 June 2016 at 13:13:05 UTC, data pulverizer wrote: > On Wednesday, 15 June 2016 at 12:10:32 UTC, Seb wrote: >> As said you can avoid the copy (see below). I also profiled it a bit and it was interesting to see that 50% of the runtime are spent on generating the random matrix. On my machine now both scripts take 1.5s when compiled with > > I didn't benchmark the RNG but I did notice it took a lot of time to generate the matrix but for now I am focused on the BLAS side of things. > > I am puzzled about how your code works: > > Firstly: > I didn't know that you could substitute an array for its first element in D though I am aware that a pointer to an array's first element is equivalent to passing the array in C. >> >> auto matrix_mult(T)(T[] A, T[] B, Slice!(2, T*) a, Slice!(2, T*) b){ >> ... >> gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., A.ptr, K, B.ptr, N, 0, C.ptr, N); >> return C.sliced(M, N); >> } >> You wrote that too :-) For more infos see: https://dlang.org/spec/arrays.html However that's very dangerous, so use just slices wherever you can. > Secondly: > I am especially puzzled about using the second element to stand in for the slice itself. How does that work? And where can I find more cool tricks like that? > >> void main() >> { >> ... >> auto C = matrix_mult(ta[0], tb[0], ta[1], tb[1]); >> sw.stop(); >> writeln("Time taken: \n\t", sw.peek().msecs, " [ms]"); >> } >> > > Many thanks! Btw you don't even need to save tuples, the pointer is already saved in the slice ;-) N.b: afaik you need the latest version of mir, because std.experimental.ndslice in 2.071 doesn't expose the `.ptr` (yet). // Random matrix auto rmat(T)(ulong nrow, ulong ncol, T min, T max){ return runif(nrow*ncol, min, max).sliced(nrow, ncol); } auto matrix_mult(T)(Slice!(2, T*) a, Slice!(2, T*) b){ int M = to!int(a.shape[0]); int K = to!int(a.shape[1]); int N = to!int(b.shape[1]); int n_el = to!int(a.elementsCount); T[] C = new T[M*N]; gemm(Order.ColMajor, Transpose.NoTrans, Transpose.NoTrans, M, N, K, 1., a.ptr, K, b.ptr, N, 0, C.ptr, N); return C.sliced(M, N); } void main() { int n = 4000; auto A = rmat(n, n, 0., 1.); auto B = rmat(n, n, 0., 1. ); StopWatch sw; sw.start(); auto C = matrix_mult(A, B); sw.stop(); writeln("Time taken: \n\t", sw.peek().msecs, " [ms]"); } If you really want to get the original T[] back, you could use something like ``` T[] a = slice.ptr[0.. slice.elementsCount]; ``` but for most cases `byElement` would be a lot better, because all transformations etc are of course only applied to your view. > And where can I find more cool tricks like that? Browse the source code and the unittests. Phobos is an amazing resource :) |
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to Seb | On Wednesday, 15 June 2016 at 14:14:23 UTC, Seb wrote:
> On Wednesday, 15 June 2016 at 13:13:05 UTC, data pulverizer
>> And where can I find more cool tricks like that?
>
> Browse the source code and the unittests. Phobos is an amazing resource :)
Very true!
That's great many thanks!
|
June 15, 2016 Re: ndslice: convert a sliced object to T[] | ||||
---|---|---|---|---|
| ||||
Posted in reply to data pulverizer | On Wednesday, 15 June 2016 at 14:14:23 UTC, Seb wrote:
> ```
> T[] a = slice.ptr[0.. slice.elementsCount];
> ```
This would work only for slices with continuous memory representation and positive strides. -- Ilya
|
Copyright © 1999-2021 by the D Language Foundation