Inferring ‘auto’
// Compile with /Zauto
namespace std
{
    template <typename II, typename Func>
    Func for_each(II first, II last, Func f)   // implies restrict(cpu,auto)
    {
        for( ; first != last; ++first )
            f(*first);
        return f;
    }
}
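A minimal usage sketch (hypothetical call site, not from the deck), assuming the /Zauto inference shown above: the restriction of each for_each instantiation follows the functor it is given.

// Hypothetical call site: the lambda is ordinary cpu code, so 'auto' resolves to cpu.
// Inside a restrict(direct3d) kernel, instantiating for_each with a
// direct3d-restricted lambda would resolve 'auto' to direct3d instead.
void scale_on_host(std::vector<float>& v)
{
    std::for_each(v.begin(), v.end(), [](float& x) { x *= 2.0f; });
}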
[Diagram: Host and Accelerator connected over PCIe]
void MatrixMultiplyTiled( vector<float>& C, const vector<float>& A,
                          const vector<float>& B, int M, int N, int W )
{
    array_view<const float,2> a(M,W,A), b(W,N,B);
    array_view<writeonly<float>,2> c(M,N,C);

    parallel_for_each(c.grid.tiled<16,16>(), [=](tiled_index<16,16> ti)
        restrict(direct3d)
    {
        // Use tile static memory for working with a tile
        tile_static fixed_array<float,16,16> localA;
        tile_static fixed_array<float,16,16> localB;

        float sum = 0;
        for (int t = 0; t < a.x; t += 16)
        {
            localA.load(a.section(ti.tile_origin.y, t, 16, 16));
            localB.load(b.section(t, ti.tile_origin.x, 16, 16));

            for (int i = 0; i < 16; i++)
                sum += localA(ti.local.y, i) * localB(i, ti.local.x);
        }
        c[ti] = sum;
    });
}
using namespace Concurrency;

void MatrixMultiply( vector<float>& C,
                     const vector<float>& A,
                     const vector<float>& B,
                     int M, int N, int W )
{
    array_view<const float,2> a(M,W,A);
    array_view<const float,2> b(W,N,B);
    array_view<writeonly<float>,2> c(M,N,C);

    parallel_for_each(c.grid, [=](index<2> idx) restrict(direct3d)
    {
        float sum = 0;
        for(int i = 0; i < a.x; i++)
            sum += a(idx.y, i) * b(i, idx.x);
        c[idx] = sum;
    });
}
using namespace Concurrency;

void MatrixMultiply( vector<float>& C,
                     const vector<float>& A,
                     const vector<float>& B,
                     int M, int N, int W )
{
    array_view<const float,2> a(M,W,A);
    array_view<const float,2> b(W,N,B);
    array_view<writeonly<float>,2> c(M,N,C);

    parallel_for_each(c.grid, [=](index<2> idx) restrict(direct3d)
    {
        float sum = 0;
        for(int i = 0; i < a.x; ++i)
            sum += a(idx.y, i) * b(i, idx.x);
        c[idx] = sum;
    });
}
// Target-polymorphic call site
float foo(float v) { return cos(v); }
‘auto’ restriction specifier
template <typename Func> inline void my_generic_algorithm(Func f) restrict(auto) { f(); }
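Hypothetical call sites (not from the deck) under the restrict(auto) model above: the inferred restriction of each instantiation follows the restriction of the functor passed in.

// Assumed usage of the restrict(auto) template above.
my_generic_algorithm([]() restrict(cpu)           { /* host-only work */ });        // auto -> cpu
my_generic_algorithm([]() restrict(cpu, direct3d) { /* valid on both targets */ }); // auto -> cpu,direct3d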
C++ source file
// Overload on target
float cos(float v) restrict(direct3d,fpga)
{
    Baz *pBaz = new Baz(v);   // error: operator new not allowed in restricted code
    return _TaylorSeries_cos(v);
}

float cos(float v) restrict(cpu)
{
    return _x64_FastCos(v);
}
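A sketch of how overload selection plays out (assumed call sites, not from the deck): each restricted context picks the matching cos overload.

// Assumed call sites: the context's restriction selects the overload.
float on_accelerator(float v) restrict(direct3d) { return cos(v); }  // Taylor-series version
float on_host(float v)        restrict(cpu)      { return cos(v); }  // _x64_FastCos version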
Ray tracing
•
•
Medical tomography
C = A * B
[Diagram: CPU with ALUs operating on arrays A, B, and C to compute C = A * B]
grid<3> e3(6,3,3); index<3> i3(2,0,1);
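For intuition (plain arithmetic, not a C++ AMP call): index<3>(2,0,1) names one of the 6*3*3 = 54 points in grid<3>(6,3,3); assuming a row-major layout, its flat offset into a backing store would be:

// Row-major offset of index (2,0,1) within a 6 x 3 x 3 grid (assumption: row-major layout).
int offset = (2 * 3 + 0) * 3 + 1;   // = 19 of 54 points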
void Compute(double &x, array<double>& z, int i) restrict(direct3d) { x = z[i] * z[i+1]; }
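A usage sketch (hypothetical names g, n, results, z; not from the deck): because Compute is direct3d-restricted it can be called from inside a parallel_for_each kernel; note that z[i+1] requires the index to stop one short of the last element.

// Hypothetical kernel calling Compute; results and z are array<double> captured by reference.
grid<1> g(n - 1);                       // one thread per output, staying in bounds for z[i+1]
parallel_for_each(g, [&](index<1> idx) restrict(direct3d)
{
    Compute(results[idx], z, idx[0]);   // results[i] = z[i] * z[i+1]
});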
[Diagram: GPU mapping inputs 0, 1, 2, 3, …, n to outputs 0, 1, 4, 9, …, n²]
[Diagram: processor block layout with Control, ALU, Cache, and DRAM]

void MatrixMult( float * C, const float * A, const float * B,
                 int M, int N, int W )
{
    // Outer loop handed to PPL: parallel_for replaces
    // "for (int x = 0; x < M; ++x)".
    parallel_for(0, M, [&](int x)
    {
        for (int y = 0; y < N; ++y)
        {
            float sum = 0;
            for (int i = 0; i < W; i++)
                sum += A[x*W + i] * B[i*N + y];
            C[x*N + y] = sum;
        }
    });
}
Bring CPU debugging experience to GPU
void MatrixMult( float * C, const float * A, const float * B,
                 int M, int N, int W )
{
    array_view<const float,2> a(M,W,A), b(W,N,B);
    array_view<float,2> c(M,N,C);

    parallel_for_each(c.grid, [=](index<2> idx) restrict(direct3d)
    {
        float sum = 0;
        for(int i = 0; i < a.x; i++)
            sum += a(idx.y, i) * b(i, idx.x);
        c[idx] = sum;
    });
}
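A small driver (hypothetical, not part of the deck) showing how the raw-pointer interface above might be invoked from ordinary CPU code:

// Hypothetical driver: multiply a 64x32 matrix A by a 32x48 matrix B.
int M = 64, N = 48, W = 32;
std::vector<float> A(M * W, 1.0f), B(W * N, 1.0f), C(M * N);
MatrixMult(C.data(), A.data(), B.data(), M, N, W);
// Each element of C should end up as 32.0f: a length-W row of ones dotted with a column of ones.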