CUDA Shared Memory Data Retain 共享内存驻留

共享内存(shared memory)的生命周期与block的生命周期一致:一份shared memory随着block开始执行而分配,随着block(里所有thread)的结束而释放。

但从物理上说,block结束时shared memory里的数据并没有被清空。如果一个新的block被分配到了同一段shared memory,那么它在写入之前读到的就是上一个block驻留下来的数据。需要注意的是,读取未初始化的shared memory属于未定义行为,CUDA编程模型并不保证这种驻留,实际代码不应依赖它。以下的实验验证了数据驻留这一现象:

两次kernel使用同样大小的gridDim和blockDim

完整代码如下:
#include <iostream>
using namespace std;

// Probe kernel for the shared-memory retention experiment.
//
// Only thread 0 of each block does any work. It inspects the block's
// __shared__ variable *before* anything in this launch has written to it
// (the read is intentionally uninitialized), updates it, and then publishes
// the resulting value:
//   - block 0 stores it to *out_0
//   - block 1 stores it to *out_1
//   - any other block publishes nothing
//
// Value written to sharedVar:
//   - it was not 999 on entry             -> 999
//   - it was 999 on entry and block is 0  -> 111
//   - it was 999 on entry, other blocks   -> left at 999
__global__ void kernel(int *out_0 , int *out_1)
{
    __shared__ int sharedVar;

    if (threadIdx.x == 0)
    {
        // Deliberately read before any write in this launch.
        const bool markerPresent = (sharedVar == 999);

        if (!markerPresent)
        {
            sharedVar = 999;
        }
        else if (blockIdx.x == 0)
        {
            sharedVar = 111;
        }

        // Publish the value through the slot that belongs to this block.
        switch (blockIdx.x)
        {
        case 0:
            *out_0 = sharedVar;
            break;
        case 1:
            *out_1 = sharedVar;
            break;
        default:
            break;
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Runs one round of the experiment: prints `title`, launches `kernel` with the
// given configuration, waits for it to finish and prints the values that
// block 0 and block 1 stored through out_0 / out_1.
// Returns false (after reporting the error) if the launch or the execution failed,
// in which case no result lines are printed.
static bool runRound(const char *title, dim3 grid, dim3 block, int *out_0, int *out_1)
{
    cout << title << endl;
    kernel<<<grid,block>>>(out_0,out_1);
    // A kernel launch returns nothing; configuration errors show up here.
    if (!cudaOk(cudaGetLastError(), "kernel launch"))
        return false;
    // Execution errors show up at the next synchronizing call. The sync is also
    // required before the host may read the values written by the kernel.
    if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return false;
    cout << "[block#0]sharedVar = " << *out_0 << endl;
    cout << "[block#1]sharedVar = " << *out_1 << endl;
    return true;
}

// Launches the probe kernel twice and prints, for each round, the shared-memory
// value observed by block 0 and block 1.
// Returns 0 on success and 1 if any CUDA call failed.
int main(void)
{
    // Page-locked host buffers; the kernel writes to them directly.
    int *out_0 = NULL;
    int *out_1 = NULL;
    if (!cudaOk(cudaMallocHost(&out_0,sizeof(int)), "cudaMallocHost(out_0)"))
        return 1;
    if (!cudaOk(cudaMallocHost(&out_1,sizeof(int)), "cudaMallocHost(out_1)"))
    {
        cudaFreeHost(out_0);
        return 1;
    }

    dim3 grid(2,1,1); dim3 block(2,1,1);
    bool ok = runRound("======Round 0====== ", grid, block, out_0, out_1);

    dim3 grid_1(2,1,1); dim3 block_1(2,1,1);
    ok = ok && runRound("======Round 1====== ", grid_1, block_1, out_0, out_1);

    // Best-effort cleanup; the return codes are not checked because the
    // program is exiting anyway.
    cudaFreeHost(out_0);
    cudaFreeHost(out_1);
    return ok ? 0 : 1;
}

输出如下:

======Round 0====== 
[block#0]sharedVar = 999
[block#1]sharedVar = 999
======Round 1======
[block#0]sharedVar = 111
[block#1]sharedVar = 999

Round 1中block#0输出了111,而只有当sharedVar在进入kernel时已经等于999,才会被改写为111,说明Round 0写入shared memory的数据驻留了下来

改变第二次的blockDim,gridDim大小

将     dim3 grid_1(2,1,1); dim3 block_1(2,1,1); 
变为 dim3 grid_1(128,1,1); dim3 block_1(1024,1,1);

输出如下:

======Round 0====== 
[block#0]sharedVar = 999
[block#1]sharedVar = 999
======Round 1======
[block#0]sharedVar = 111
[block#1]sharedVar = 999

说明在本实验中,第二次kernel的grid和block尺寸并不影响shared memory的数据驻留

重置CUDA

在 dim3 grid_1(2,1,1); dim3 block_1(2,1,1); 之前加上以下几行(cudaDeviceReset()会释放当前设备上的所有资源,之前申请的out_0和out_1随之失效,所以需要重新申请):
cudaDeviceReset();
cudaMallocHost(&out_0,sizeof(int));
cudaMallocHost(&out_1,sizeof(int));

输出如下:

======Round 0====== 
[block#0]sharedVar = 999
[block#1]sharedVar = 999
======Round 1======
[block#0]sharedVar = 999
[block#1]sharedVar = 999

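Round 1中block#0输出的是999而不是111,即sharedVar在进入kernel时已不等于999,说明cudaDeviceReset()之后shared memory里不再保留Round 0写入的数据,也就是cudaDeviceReset()清空了shared memory的数据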