Cuda Optimize : Vectorized Memory Access
baseline¶
// Baseline scalar copy: each thread copies one int per grid-stride
// iteration, which compiles to 32-bit LD.E / ST.E transactions in SASS.
__global__ void device_copy_scalar_kernel(int* d_in, int* d_out, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    // Grid-stride loop: correct for any grid size relative to N.
    for (int i = tid; i < N; i += stride) {
        d_out[i] = d_in[i];
    }
}
// Host launcher for the scalar copy kernel.
// The grid is capped at MAX_BLOCKS (defined elsewhere in the project);
// the kernel's grid-stride loop covers any elements beyond the cap.
void device_copy_scalar(int* d_in, int* d_out, int N)
{
    const int threads = 128;
    const int wanted = (N + threads - 1) / threads;  // ceil-div over N
    const int blocks = min(wanted, MAX_BLOCKS);
    device_copy_scalar_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
简单的分块拷贝。
通过 `cuobjdump -sass executable` 得到标量 copy 对应的 SASS 代码:
/*0058*/ IMAD R6.CC, R0, R9, c[0x0][0x140]
/*0060*/ IMAD.HI.X R7, R0, R9, c[0x0][0x144]
/*0068*/ IMAD R4.CC, R0, R9, c[0x0][0x148]
/*0070*/ LD.E R2, [R6]
/*0078*/ IMAD.HI.X R5, R0, R9, c[0x0][0x14c]
/*0090*/ ST.E [R4], R2
(SASS不熟悉,请看SASS一文)
其中4条IMAD指令计算出读取和存储的指令地址R6:R7
和R4:R5
。第4和6条指令执行32位的访存命令。
Vector way1: CUDA C/C++ standard headers¶
通过使用int2
, int4
, or float2
比如将int
的指针d_in
类型转换然后赋值。
但是需要注意对齐问题:转换后的指针必须满足向量类型的对齐要求(int2 要求 8 字节、int4 要求 16 字节;cudaMalloc 返回的基址至少 256 字节对齐)。比如对偏移后的指针做转换 `reinterpret_cast<int2*>(d_in + 1)`,由于 d_in+1 不再满足 int2 的对齐要求,这样是非法的(未定义行为)。
Vector way2: structures¶
通过使用对齐的结构体来实现同样的目的。
实际修改LD.E.64¶
执行for循环次数减半,注意边界处理。
// Vectorized copy via int2: each iteration moves 8 bytes, halving the trip
// count and emitting 64-bit LD.E.64 / ST.E.64 in SASS.
// Precondition: d_in and d_out are 8-byte aligned (cudaMalloc base pointers
// satisfy this; offset pointers may not — see alignment note above).
__global__ void device_copy_vector2_kernel(int* d_in, int* d_out, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = idx; i < N/2; i += blockDim.x * gridDim.x) {
        reinterpret_cast<int2*>(d_out)[i] = reinterpret_cast<int2*>(d_in)[i];
    }

    // Tail: if N is odd, one scalar element remains.
    // BUG FIX: the original guard was `idx == N/2`, but no such thread exists
    // when N/2 is a multiple of the total thread count (e.g. N=257 with one
    // 128-thread block) or when the grid is capped at MAX_BLOCKS — the last
    // element was silently dropped. Thread 0 always exists, so it owns the tail.
    if (idx == 0 && N % 2 == 1)
        d_out[N-1] = d_in[N-1];
}
// Host launcher for the int2 copy kernel.
// BUG FIX: the original did not compile — `threads` and `blocks` were
// assigned without declarations, and the parameter was named `n` while the
// body used `N`. Also guard against a zero-block launch when N < 2.
void device_copy_vector2(int* d_in, int* d_out, int N) {
    int threads = 128;
    // Each thread moves 2 ints per iteration, so size the grid on N/2
    // (ceil-div), capped at MAX_BLOCKS; the grid-stride loop covers the rest.
    int blocks = min((N/2 + threads - 1) / threads, MAX_BLOCKS);
    if (blocks == 0) blocks = 1;  // N in {0,1}: still launch so the tail runs
    device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
对应汇编可以看出
/*0088*/ IMAD R10.CC, R3, R5, c[0x0][0x140]
/*0090*/ IMAD.HI.X R11, R3, R5, c[0x0][0x144]
/*0098*/ IMAD R8.CC, R3, R5, c[0x0][0x148]
/*00a0*/ LD.E.64 R6, [R10]
/*00a8*/ IMAD.HI.X R9, R3, R5, c[0x0][0x14c]
/*00c8*/ ST.E.64 [R8], R6
变成了LD.E.64
实际修改LD.E.128¶
执行for循环次数减半,注意边界处理。
// Vectorized copy via int4: each iteration moves 16 bytes, quartering the
// trip count and emitting 128-bit LD.E.128 / ST.E.128 in SASS.
// Precondition: d_in and d_out are 16-byte aligned (cudaMalloc base pointers
// satisfy this; offset pointers may not).
__global__ void device_copy_vector4_kernel(int* d_in, int* d_out, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
        reinterpret_cast<int4*>(d_out)[i] = reinterpret_cast<int4*>(d_in)[i];
    }

    // Tail: up to 3 scalar elements remain when N % 4 != 0.
    // BUG FIX: the original guard was `idx == N/4`, but no such thread exists
    // when N/4 is a multiple of the total thread count (e.g. N=513 with one
    // 128-thread block) or when the grid is capped at MAX_BLOCKS — the tail
    // was silently dropped. Thread 0 always exists, so it owns the tail.
    // (Also renamed the inner index, which shadowed the outer `idx`.)
    int remainder = N % 4;
    if (idx == 0) {
        while (remainder) {
            int j = N - remainder--;
            d_out[j] = d_in[j];
        }
    }
}
// Host launcher for the int4 copy kernel.
// Sizes the grid on the number of int4 elements (N/4, ceil-div), capped at
// MAX_BLOCKS; the kernel's grid-stride loop handles anything beyond the cap.
void device_copy_vector4(int* d_in, int* d_out, int N) {
    const int threads = 128;
    const int vec_elems = N / 4;  // number of int4 elements to move
    const int blocks = min((vec_elems + threads - 1) / threads, MAX_BLOCKS);
    device_copy_vector4_kernel<<<blocks, threads>>>(d_in, d_out, N);
}
对应汇编可以看出
/*0090*/ IMAD R10.CC, R3, R13, c[0x0][0x140]
/*0098*/ IMAD.HI.X R11, R3, R13, c[0x0][0x144]
/*00a0*/ IMAD R8.CC, R3, R13, c[0x0][0x148]
/*00a8*/ LD.E.128 R4, [R10]
/*00b0*/ IMAD.HI.X R9, R3, R13, c[0x0][0x14c]
/*00d0*/ ST.E.128 [R8], R4
变成了LD.E.128
summary¶
(为什么提升没有两倍/四倍?因为拷贝本身受显存带宽限制:向量化减少的是指令条数和地址计算开销,并不能增加可用带宽,所以带宽已经打满时收益有限;在指令吞吐受限或延迟受限的 kernel 中收益才会明显。)
绝大部分情况,向量比标量好, increase bandwidth, reduce instruction count, and reduce latency. 。
但是会增加寄存器压力:向量访存需要成组的连续寄存器来存放数据(如上面的 `LD.E.128 R4` 一次占用 R4–R7 四个寄存器,SASS 里只显示首寄存器,所以不易察觉);同时会降低并行性:每个线程一次处理的元素变多,所需的迭代/线程总数减少到 1/2 或 1/4,当 N 较小时可用于隐藏访存延迟的并行度也随之下降。