EpicUnrealEngine
我跪了,看来垃圾电脑玩不来,官方光明山脉demo要64GB内存和200GB储存。而且打开渲染超级慢
需要进一步的研究学习
暂无
遇到的问题
暂无
开题缘由、总结、反思、吐槽~~
- 想实现美少女跳舞
- 其实好像Unity 3D更简单
参考文献
无
我跪了,看来垃圾电脑玩不来,官方光明山脉demo要64GB内存和200GB储存。而且打开渲染超级慢
暂无
暂无
无
肯定有人问,这不是计算机博客吗?哦!诶!我就不,我想怎么写就怎么写😋
// Scalar copy kernel: grid-stride loop, one 32-bit int moved per iteration.
// Works for any grid size; threads beyond N simply do no iterations.
__global__ void device_copy_scalar_kernel(int* d_in, int* d_out, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int pos = tid; pos < N; pos += stride) {
        d_out[pos] = d_in[pos];
    }
}
// Host launcher for the scalar copy: 128 threads per block, grid capped at
// MAX_BLOCKS (the kernel's grid-stride loop covers any remaining elements).
void device_copy_scalar(int* d_in, int* d_out, int N)
{
    const int blockSize = 128;
    const int gridSize = min((N + blockSize - 1) / blockSize, MAX_BLOCKS);
    device_copy_scalar_kernel<<<gridSize, blockSize>>>(d_in, d_out, N);
}
简单的分块拷贝。
通过cuobjdump -sass executable
得到标量copy对应的SASS代码:
/*0058*/ IMAD R6.CC, R0, R9, c[0x0][0x140]
/*0060*/ IMAD.HI.X R7, R0, R9, c[0x0][0x144]
/*0068*/ IMAD R4.CC, R0, R9, c[0x0][0x148]
/*0070*/ LD.E R2, [R6]
/*0078*/ IMAD.HI.X R5, R0, R9, c[0x0][0x14c]
/*0090*/ ST.E [R4], R2
(SASS不熟悉,请看SASS一文)
其中4条IMAD指令计算出读取和存储的指令地址R6:R7
和R4:R5
。第4和6条指令执行32位的访存命令。
通过使用 int2、int4 或 float2 等向量类型——比如将 int 指针 d_in 类型转换为向量指针后再赋值——即可实现向量化访存。
但是需要注意对齐问题:比如把偏移后的指针(如 d_in+1)强转为 int2* 再解引用就是非法的,因为该地址没有按向量宽度(8 字节)对齐。
通过使用对齐的结构体来实现同样的目的。
执行for循环次数减半,注意边界处理。
// Vectorized copy: each iteration moves one int2 (two ints, single 64-bit
// load/store -> LD.E.64 / ST.E.64 in SASS). d_in and d_out must be 8-byte
// aligned, which holds for cudaMalloc'd int buffers.
__global__ void device_copy_vector2_kernel(int* d_in, int* d_out, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/2; i += blockDim.x * gridDim.x) {
    reinterpret_cast<int2*>(d_out)[i] = reinterpret_cast<int2*>(d_in)[i];
  }
  // Tail element when N is odd, handled by thread 0 (which always exists).
  // The original guard `idx == N/2` is wrong: that thread need not exist when
  // the grid is capped (MAX_BLOCKS) or when N/2 is an exact multiple of the
  // total thread count, silently dropping the last element.
  if (idx == 0 && N % 2 == 1)
    d_out[N-1] = d_in[N-1];
}
// Host launcher: sizes the grid for n/2 int2 elements, capped at MAX_BLOCKS.
// Fixes the original listing, which dropped the `int` declarations on the
// locals and mixed the parameter name `n` with an undeclared `N`, so it
// did not compile.
void device_copy_vector2(int* d_in, int* d_out, int n) {
  int threads = 128;
  int blocks = min((n/2 + threads - 1) / threads, MAX_BLOCKS);
  device_copy_vector2_kernel<<<blocks, threads>>>(d_in, d_out, n);
}
对应汇编可以看出
/*0088*/ IMAD R10.CC, R3, R5, c[0x0][0x140]
/*0090*/ IMAD.HI.X R11, R3, R5, c[0x0][0x144]
/*0098*/ IMAD R8.CC, R3, R5, c[0x0][0x148]
/*00a0*/ LD.E.64 R6, [R10]
/*00a8*/ IMAD.HI.X R9, R3, R5, c[0x0][0x14c]
/*00c8*/ ST.E.64 [R8], R6
变成了LD.E.64
执行for循环次数减半,注意边界处理。
// Vectorized copy via int4: four ints per iteration, single 128-bit
// load/store (LD.E.128 / ST.E.128). d_in and d_out must be 16-byte aligned,
// which holds for cudaMalloc'd buffers.
__global__ void device_copy_vector4_kernel(int* d_in, int* d_out, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = idx; i < N/4; i += blockDim.x * gridDim.x) {
    reinterpret_cast<int4*>(d_out)[i] = reinterpret_cast<int4*>(d_in)[i];
  }
  // Tail elements (N % 4 of them) handled by thread 0, which always exists.
  // The original guard `idx == N/4` is wrong: that thread may not exist when
  // the grid is capped (MAX_BLOCKS), silently dropping the tail.
  int remainder = N % 4;
  if (idx == 0 && remainder != 0) {
    while (remainder) {
      int tail = N - remainder--;
      d_out[tail] = d_in[tail];
    }
  }
}
// Host launcher for the int4 copy: one thread per int4 element, grid capped
// at MAX_BLOCKS (the kernel's grid-stride loop covers the remainder).
void device_copy_vector4(int* d_in, int* d_out, int N) {
    const int blockSize = 128;
    const int gridSize = min((N/4 + blockSize - 1) / blockSize, MAX_BLOCKS);
    device_copy_vector4_kernel<<<gridSize, blockSize>>>(d_in, d_out, N);
}
对应汇编可以看出
/*0090*/ IMAD R10.CC, R3, R13, c[0x0][0x140]
/*0098*/ IMAD.HI.X R11, R3, R13, c[0x0][0x144]
/*00a0*/ IMAD R8.CC, R3, R13, c[0x0][0x148]
/*00a8*/ LD.E.128 R4, [R10]
/*00b0*/ IMAD.HI.X R9, R3, R13, c[0x0][0x14c]
/*00d0*/ ST.E.128 [R8], R4
变成了LD.E.128
(个人感觉,提升也不大吗?也没有两倍和四倍的效果)
绝大部分情况,向量比标量好, increase bandwidth, reduce instruction count, and reduce latency. 。
但是会增加额外的寄存器(SASS里也没有看到??)和降低并行性(什么意思???)
对于Ampere架构
指令方向
各种寄存器说明
* RX
for registers
* URX
for uniform registers
* SRX
for special system-controlled registers
* PX
for predicate registers
* c[X][Y]
for constant memory
SASS的难点在于指令的后缀。由于手册缺失,需要结合PTX的后缀查看
/*0028*/ IMAD R6.CC, R3, R5, c[0x0][0x20];
/*0030*/ IMAD.HI.X R7, R3, R5, c[0x0][0x24];
/*0040*/ LD.E R2, [R6]; //load
c[BANK][ADDR] is a constant memory。
.CC
means “set the flags”
R6 = R3*R5 + c[0x0][0x20], saving carry to CC
R7 = (R3*R5 + c[0x0][0x24])>>32 + CC
R2 = *(R7<<32 + R6)
first two commands multiply two 32-bit values (R3 and R5) and add 64-bit value c[0x0][0x24]<<32+c[0x0][0x20],
leaving 64-bit address result in the R6,R7 pair
对应的代码是
观察 偏移 * 4 * 2060(delta=2056) * 4116(delta=2056) * 8228(delta=2 * 2056) * 6172(delta=-1 * 2056) * 10284(delta=2 * 2056) * 12340(delta=2056)
可见汇编就是中间写反了,导致不连续,不然能隐藏更多延迟
那么这些寄存器是怎么来的呢?感觉就是写反了
IMAD.WIDE.U32 R16, R16, R19, c[0x0][0x168]
LDG.E R27, [R16.64]
IMAD.WIDE R30, R19, c[0x0][0x164], R16
LDG.E R31, [R30.64]
IMAD.WIDE R32, R19, c[0x0][0x164], R30
LDG.E R39, [R32.64]
# important R41 R37
IMAD.WIDE R34, R19, c[0x0][0x164], R32
IMAD.WIDE R40, R19, c[0x0][0x164], R34
LDG.E R41, [R40.64]
LDG.E R37, [R34.64]
原因是前面是手动展开的,假如等待编译器自动展开for循环就不会有这个问题
暂无
暂无
https://forums.developer.nvidia.com/t/solved-sass-code-analysis/41167/2
https://stackoverflow.com/questions/35055014/how-to-understand-the-result-of-sass-analysis-in-cuda-gpu
有对应的PPT,代码。
最终将1000ms程序优化到1~2ms
乔良师兄有根据知乎介绍如何利用寄存器文件缓存
导致Embarrassingly Parallel Problems
计算某点的梯度,需要前后的function值。
问题: 对于边界上的cells,需要访问相邻区域的元素。
解决办法: 将他们也加入进当前block的SMEM
s_idx = threadIdx.x + RAD;
// Driver: samples u(x) = sin(x) on N points over one period [0, 2*pi) and
// runs the GPU second-derivative stencil on it (expected result ~ -sin(x)).
int main() {
    const float PI = 3.1415927f;
    const int N = 150;
    const float h = 2 * PI / N;    // grid spacing
    float x[N] = { 0.0f };
    float u[N] = { 0.0f };
    float result_parallel[N] = { 0.0f };
    for (int i = 0; i < N; ++i) {
        x[i] = 2 * PI * i / N;
        u[i] = sinf(x[i]);
    }
    ddParallel(result_parallel, u, N, h);
    return 0;
}
Kernel Launching
#define TPB 64
#define RAD 1 // radius of the stencil
…
// Allocates device buffers, copies `in` to the GPU, launches the second-
// derivative stencil kernel, and copies the result back into `out`.
// n: number of samples; h: grid spacing (kernel divides by h*h).
// NOTE(review): no CUDA API call or kernel launch is error-checked here;
// consider checking each cudaError_t and calling cudaGetLastError() after
// the launch. The blocking cudaMemcpy below doubles as the synchronization
// point before reading results.
void ddParallel(float *out, const float *in, int n, float h) {
float *d_in = 0, *d_out = 0;
cudaMalloc(&d_in, n * sizeof(float));
cudaMalloc(&d_out, n * sizeof(float));
cudaMemcpy(d_in, in, n * sizeof(float), cudaMemcpyHostToDevice);
// Set shared memory size in bytes: blockDim.x interior cells + 2*RAD halo
const size_t smemSize = (TPB + 2 * RAD) * sizeof(float);
ddKernel<<<(n + TPB - 1)/TPB, TPB, smemSize>>>(d_out, d_in, n, h);
cudaMemcpy(out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_in);
cudaFree(d_out);
}
Kernel Definition
// Second-derivative stencil: d_out[i] = (u[i-1] - 2u[i] + u[i+1]) / h^2,
// staged through dynamic shared memory of (blockDim.x + 2*RAD) floats.
// Halo cells wrap around periodically -- appropriate here because main()
// samples sin(x) over exactly one period; confirm before reuse on
// non-periodic data (clamping would be needed instead).
// Fixes vs. the original listing:
//  (1) the early `return` for i >= size skipped __syncthreads() in a
//      partially-filled last block (barrier not reached by all threads: UB);
//  (2) halo loads read d_in[i - RAD] (negative for block 0) and
//      d_in[i + blockDim.x] (past the end for the last block) out of bounds.
__global__ void ddKernel(float *d_out, const float *d_in, int size, float h) {
  const int i = threadIdx.x + blockDim.x * blockIdx.x;
  const int s_idx = threadIdx.x + RAD;
  extern __shared__ float s_in[];
  // Regular cells: wrap i so threads past the end load valid (periodic)
  // data, which doubles as the right neighbor of the last valid element.
  s_in[s_idx] = d_in[i % size];
  // Halo cells, wrapped periodically at both ends.
  if (threadIdx.x < RAD) {
    s_in[s_idx - RAD] = d_in[(i - RAD + size) % size];
    s_in[s_idx + blockDim.x] = d_in[(i + blockDim.x) % size];
  }
  __syncthreads();   // reached by every thread in the block
  if (i < size)
    d_out[i] = (s_in[s_idx-1] - 2.f*s_in[s_idx] + s_in[s_idx+1])/(h*h);
}
暂无
暂无
研一下USTC并行计算自己的选题
https://dumas.ccsd.cnrs.fr/dumas-00636254/document
https://indico.fysik.su.se/event/6743/contributions/10338/attachments/4175/4801/4.CUDA-StencilsSharedMemory-Markidis.pdf
洗漱台
淋雨间
卫生间(马桶(我们的变杂物间了
四人宿舍
某人的宿舍位~(一定不是我的)~
如果我们数据只会在 GPU 产生和使用,我们不需要来回进行拷贝。
https://migocpp.wordpress.com/2018/06/08/cuda-memory-access-global-zero-copy-unified/
简而言之,在 host 使用命令:cudaHostRegisterMapped 之后用 cudaHostGetDevicePointer 进行映射 最后解除绑定 cudaHostUnregister
即,
// First, pin the memory (or cudaHostAlloc instead)
cudaHostRegister(h_a, …, cudaHostRegisterMapped);
cudaHostRegister(h_b, …, cudaHostRegisterMapped);
cudaHostRegister(h_c, …, cudaHostRegisterMapped);
cudaHostGetDevicePointer(&a, h_a, 0);
cudaHostGetDevicePointer(&b, h_b, 0);
cudaHostGetDevicePointer(&c, h_c, 0);
kernel<<<...>>>(a, b, c);
cudaDeviceSynchronize();
// unpin/release host memory
cudaHostUnregister(h_a);
cudaHostUnregister(h_b);
cudaHostUnregister(h_c);
只要两个thread在 同一个warp中,允许thread直接读其他thread的寄存器值,这种比通过shared Memory进行thread间的通讯效果更好,latency更低,同时也不消耗额外的内存资源来执行数据交换。ref
对齐(Starting address for a region must be a multiple of region size)集体访问,有数量级的差异Coalesced
利用好每个block里的thread,全部每个线程各自读取自己对齐(Starting address for a region must be a multiple of region size 不一定是自己用的)数据到shared memory开辟的总空间。由于需要的数据全部合力读取进来了,计算时正常使用需要的读入的数据。
特别是对于结构体使用SoA(structure of arrays)而不是AoS(array of structures),
如果结构体实在不能对齐, 可以使用 __align(X)
, where X = 4, 8, or 16.强制对齐。
对于small Kernel和访存瓶颈的Kernel影响很大
由于需要对齐读取,3float是12字节,所以只能拆成三份。
有无采用对齐shared读取,有10倍的加速。
__syncthreads
self-tuning
占用率(Occupancy)是指每个多处理器(Streaming Multiprocessor,SM)上实际活动的warps数量与理论最大warps数量的比率。高占用率不一定能提升性能,因为这一般意味着每个线程分到的寄存器和shared memory变少;但低占用率会导致内存延迟无法隐藏。
实际需要计算每个线程大概需要的shared memory和register数量
https://www.cnblogs.com/1024incn/p/4541313.html
https://www.cnblogs.com/1024incn/p/4545265.html
通过SMEM实现coalescing access
原本代码
// Naive transpose: reads idata row-major (coalesced) but writes odata with a
// stride of `height` (uncoalesced) -- the baseline the tiled version fixes.
// Fixes the original's `_global__` typo (missing leading underscore), which
// is not a valid CUDA qualifier and fails to compile.
__global__ void transpose_naive(float *odata, float *idata, int width, int height)
{
    unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
    // guard the ragged edge when dimensions are not multiples of the block
    if (xIndex < width && yIndex < height)
    {
        unsigned int index_in = xIndex + width * yIndex;
        unsigned int index_out = yIndex + height * xIndex;
        odata[index_out] = idata[index_in];
    }
}
思想:将大矩阵划分成方块,并且存储在SMEM里。不仅SMEM速度更快,而且每行元素个数变少,跨行访问的间距变小,局部性增强。而且对于大矩阵加速效果会更明显。
// Tiled transpose: stage a BLOCK_DIM x BLOCK_DIM tile in shared memory so
// both the global read and the transposed global write are coalesced.
// Fixes the original's missing semicolon on the final store statement.
// NOTE(review): the tile has no +1 padding, so the column-order shared read
// is bank-conflicted (the text below this listing discusses exactly that
// fix); the bounds guards also assume the in/out tiles cover the same
// region -- verify for non-square matrices.
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    __shared__ float block[BLOCK_DIM*BLOCK_DIM];
    unsigned int xBlock = blockDim.x * blockIdx.x;
    unsigned int yBlock = blockDim.y * blockIdx.y;
    unsigned int xIndex = xBlock + threadIdx.x;
    unsigned int yIndex = yBlock + threadIdx.y;
    unsigned int index_out, index_transpose;
    if (xIndex < width && yIndex < height)
    {
        unsigned int index_in = width * yIndex + xIndex;
        // row-major position inside the tile
        unsigned int index_block = threadIdx.y * BLOCK_DIM + threadIdx.x;
        block[index_block] = idata[index_in];
        // column-major read position used after the barrier
        index_transpose = threadIdx.x * BLOCK_DIM + threadIdx.y;
        index_out = height * (xBlock + threadIdx.y) + yBlock + threadIdx.x;
    }
    __syncthreads();
    if (xIndex < width && yIndex < height)
        odata[index_out] = block[index_transpose];
}
when Block/tile dimensions are multiples of 16 ???
https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/
对于一个32 × 32个元素的共享内存块,一列数据中的所有元素都映射到相同的SMEM bank ,导致bank conflict 的最坏情况:读取一列数据会导致32路的存储库冲突。
幸运的是,只需要将tile的元素宽度改为33,而不是32就行。
为了避免全局同步的巨大开销,采取分级归约
由于归约的计算密度低 1 flop per element loaded (bandwidth-optimal)
所以优化目标是将访存带宽用满。
// Baseline interleaved-addressing reduction (Harris "reduce0"): each block
// sums blockDim.x ints into g_odata[blockIdx.x] via shared memory.
// Assumes gridDim.x * blockDim.x equals the input length.
// Fixes the original's `tid % (s) == 0`, which must be `tid % (2*s) == 0`:
// at stride s only every 2s-th thread owns a live partial sum, and the
// original condition made every thread add at s == 1, corrupting the result.
__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem (interleaved addressing: highly divergent)
    for (unsigned int s=1; s < blockDim.x; s *= 2) {
        if (tid % (2*s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
工作的线程越来越少。一开始是全部,最后一次只有thread0.
Just replace divergent branch With strided index and non-divergent branch,但是会带来bank conflict。
原理和Warp发射有关,假如在这里每个Warp并行的线程是2。一个Warp运行耗时为T.
Step0: 4+4+2+1=11T
Step1: 4+2+1+1=8T
// Strided-index variant: thread `tid` works on element 2*s*tid, so active
// threads stay contiguous (no divergent branch within a warp), at the cost
// of shared-memory bank conflicts.
for (unsigned int s=1; s < blockDim.x; s *= 2) {
// map contiguous thread ids onto the interleaved element positions
int index = 2 * s * tid;
if (index < blockDim.x) {
sdata[index] += sdata[index + s];
}
// barrier sits outside the if so every thread reaches it
__syncthreads();
}
// Sequential-addressing variant: halve the active range each step; thread
// `tid` adds the element s positions ahead. Bank-conflict-free, and the
// active `tid < s` threads are contiguous.
for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
原本寻址
现在寻址有一边连续了
方法: 在load SMEM的时候提前做一次规约加法,通过减少一半的block数,将原本两个block里的值load+add存储在sum里。
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
// each block now covers 2*blockDim.x input elements (half as many blocks)
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
// first add happens during the load, pairing the two halves of the span
// NOTE(review): assumes the input length is a multiple of 2*blockDim.x,
// otherwise i+blockDim.x reads past the end -- confirm against the caller
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
当s< 32的时候,就只有一个Warp工作了。
使用warp的SIMD还省去了__syncthreads()
的麻烦
// Tree reduction down to the last warp; the loop stops at s == 64->32
// because the final 32 partial sums are handled by the unrolled code below.
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
// Last warp, fully unrolled with no __syncthreads() between steps.
// NOTE(review): this relies on implicit warp-synchronous execution and an
// uncached view of sdata; on Volta+ (independent thread scheduling) it
// needs a volatile pointer or __syncwarp() between steps to be correct.
if (tid < 32)
{
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
为了保持整洁,最后一个if还做了无效的计算。eg, Warp里的最后一个线程只有第一句命令有用。
由于for循环里是二分的,而且小于32的单独处理了,导致for循环里实际运行代码最多就3句。
利用代码模板和编译器的自动优化实现:
红色代码会在编译时自动优化。
加速级联??
Cost= processors × time complexity
我们知道N个元素直接二叉树归约是O(log N) 时间 Cost=N*O(log N).
但是假如只有P个线程先做N/P的串行加法, 然后是log(P)的归约。 总cost=P(N/P+log(P))
当P=N/log(N), cost=O(N)
each thread should sum O(log n) elements来设置
比如,1024 or 2048 elements per block vs. 256 线程。每个sum n=4个元素。 具体参数要perf
// Algorithm-cascading load: each thread serially accumulates multiple
// elements (stride gridSize) before the log-step tree reduction.
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
// number of input elements covered per grid-stride iteration
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
while (i < n) {
// NOTE(review): i+blockSize may exceed n on the last iteration unless n is
// a multiple of 2*blockSize -- confirm the caller pads the input
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// Fully optimized reduction (Harris "reduce6"): algorithm cascading plus a
// completely unrolled tree driven by the blockSize template parameter
// (dead branches are eliminated at compile time). Each block writes its
// partial sum to g_odata[blockIdx.x].
// Fixes vs. the original listing:
//  - the do/while read g_idata[i] and g_idata[i+blockSize] without bounds
//    checks (and executed once even for i >= n), reading out of bounds
//    unless n was an exact multiple of 2*blockSize*gridDim.x;
//  - the final-warp unrolling read sdata through a non-volatile pointer
//    with no synchronization, which breaks under compiler register caching
//    and under Volta+ independent thread scheduling.
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    // Grid-stride serial accumulation; guard the second read at the tail.
    while (i < n) {
        sdata[tid] += g_idata[i];
        if (i + blockSize < n) sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
    // Final warp: volatile prevents register caching of sdata, and
    // __syncwarp() orders the lanes under independent thread scheduling.
    if (tid < 32) {
        volatile int *vsmem = sdata;
        if (blockSize >= 64) { vsmem[tid] += vsmem[tid + 32]; } __syncwarp();
        if (blockSize >= 32) { vsmem[tid] += vsmem[tid + 16]; } __syncwarp();
        if (blockSize >= 16) { vsmem[tid] += vsmem[tid + 8]; } __syncwarp();
        if (blockSize >= 8) { vsmem[tid] += vsmem[tid + 4]; } __syncwarp();
        if (blockSize >= 4) { vsmem[tid] += vsmem[tid + 2]; } __syncwarp();
        if (blockSize >= 2) { vsmem[tid] += vsmem[tid + 1]; }
    }
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
有if语句是没问题的,只要运行的时候全部执行if或者else就行。不要有些执行if,有些执行else,这才会等待。
说不定也不是全部执行if或者else就行,只需要连续32个Thread Index,是相同的执行就行。(猜想,需要测试。
通过增加block里的线程数,并且同时读取来隐藏延迟。 不仅可以隐藏Global Memory的延迟,还可以隐藏写后读的延迟
线程太多会导致分配到每一个的寄存器和SMEM变少
通过编译时加-cubin
选项,.cubin
文件前几行会显示
architecture {sm_10}
abiversion {0}
modname {cubin}
code {
name = BlackScholesGPU
lmem = 0 # per thread local memory
smem = 68 # per thread block shared memory
reg = 20 # per thread registers
2009 清华 邓仰东 cuda lecture pdf 注意也是参考的SC07 Nvidia。 ↩
命令行直接运行
sudo /usr/local/cuda/bin/nvprof --log-file a.log --metrics achieved_occupancy /staff/shaojiemike/github/cutests/22-commonstencil/common
nvprof --export-profile timeline.prof <app> <app args>
nvprof --analysis-metrics -o nbody-analysis.nvprof ./myApp
sudo /usr/local/cuda/bin/ncu -k stencil_kernel -s 0 -c 1 /staff/shaojiemike/github/cutests/22-commonstencil/best
ncu-ui是可视化界面,但是没弄懂
# shaojiemike @ snode0 in ~/github/cuda-samples-11.0 [16:02:08] $ ./bin/x86_64/linux/release/bandwidthTest [CUDA Bandwidth Test] - Starting... Running on... Device 0: Tesla P40 Quick Mode Host to Device Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(GB/s) 32000000 11.8 Device to Host Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(GB/s) 32000000 13.0 Device to Device Bandwidth, 1 Device(s) PINNED Memory Transfers Transfer Size (Bytes) Bandwidth(GB/s) 32000000 244.3 Result = PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled. # shaojiemike @ snode0 in ~/github/cuda-samples-11.0 [16:03:24] $ ./bin/x86_64/linux/release/p2pBandwidthLatencyTest
nvprof通过指定与dram,L1或者L2 的metrics来实现。具体解释可以参考官网
在 Maxwell 和之后的架构中 L1 和 SMEM 合并
Metric Name | 解释 |
---|---|
achieved_occupancy | 活跃cycle是 Warps 活跃的比例 |
dram_read_throughput | |
dram_utilization | 在0到10的范围内,相对于峰值利用率,设备内存的利用率水平 |
shared_load_throughput | |
shared_utilization | |
l2_utilization |
暂无
暂无
无