Performance of CUDA __threadfence() synchronization versus synchronization by two separate kernel calls
I'm trying to understand how to use __threadfence(), since it seems like a powerful synchronization primitive that lets different blocks work together without going through the huge hassle of ending a kernel and starting a new one. The CUDA C Programming Guide has an example of it (Appendix B.5), which is fleshed out in the "threadFenceReduction" sample in the SDK, so it seems like something we "should" be using.

However, when I have tried using __threadfence(), it is shockingly slow. See the code below for an example. As I understand it, __threadfence() should just make sure that all pending memory transfers from the current thread block are finished before proceeding. Memory latency is somewhat better than a microsecond, I believe, so the total time to deal with the 64 KB of memory transfers in the included code (16 blocks × 1000 threads × 4 bytes), on a GTX 680, should be somewhere around a microsecond. Instead, the __threadfence() instruction seems to take around 20 microseconds! Instead of using __threadfence() to synchronize, I can instead end the kernel and launch an entirely new kernel (in the same, default, stream so that it is synchronized), in less than a third of the time!

What is going on here? Does my code have a bug in it that I'm not noticing? Or is __threadfence() really 20x slower than it should be, and 6x slower than an entire kernel launch + cleanup?
Time for 1000 runs of the threadfence kernel: 27.716831 ms
Answer: 120
Time for 1000 runs of the first 3 lines, including threadfence: 25.962912 ms
Synchronizing without threadfence, by splitting into 2 kernels: 7.653344 ms
Answer: 120
#include "cuda.h" #include <cstdio> __device__ unsigned int count = 0; __shared__ bool islastblockdone; __device__ int scratch[16]; __device__ int junk[16000]; __device__ int answer; __global__ void usethreadfence() //just code example in b.5 of cuda c programming guide { if (threadidx.x==0) scratch[blockidx.x]=blockidx.x; junk[threadidx.x+blockidx.x*1000]=17+threadidx.x; //do more memory writes make kernel nontrivial __threadfence(); if (threadidx.x==0) { unsigned int value = atomicinc(&count, griddim.x); islastblockdone = (value == (griddim.x - 1)); } __syncthreads(); if (islastblockdone && threadidx.x==0) { // last block sums results stored in scratch[0 .. griddim.x-1] int sum=0; (int i=0;i<griddim.x;i++) sum+=scratch[i]; answer=sum; } } __global__ void justthreadfence() //first 3 lines of previous kernel, can compare speeds { if (threadidx.x==0) scratch[blockidx.x]=blockidx.x; junk[threadidx.x+blockidx.x*1000]=17+threadidx.x; __threadfence(); } __global__ void usetwokernels_1() //this , next kernel reproduce functionality of first kernel, faster! { if (threadidx.x==0) scratch[blockidx.x]=blockidx.x; junk[threadidx.x+blockidx.x*1000]=17+threadidx.x; } __global__ void usetwokernels_2() { if (threadidx.x==0) { int sum=0; (int i=0;i<griddim.x;i++) sum+=scratch[i]; answer=sum; } } int main() { int sum; cudaevent_t start, stop; float time; cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventrecord(start, 0); (int i=0;i<1000;i++) usethreadfence<<<16,1000>>>(); cudaeventrecord(stop, 0); cudaeventsynchronize(stop); cudaeventelapsedtime(&time, start, stop); printf ("time 1000 runs of threadfence kernel: %f ms\n", time); cudaeventdestroy(start); cudaeventdestroy(stop); cudamemcpyfromsymbol(&sum,answer,sizeof(int)); printf("answer: %d\n",sum); cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventrecord(start, 0); (int i=0;i<1000;i++) justthreadfence<<<16,1000>>>(); cudaeventrecord(stop, 0); cudaeventsynchronize(stop); cudaeventelapsedtime(&time, start, stop); printf ("time 1000 runs of first 3 lines, including threadfence: %f ms\n", time); cudaeventdestroy(start); cudaeventdestroy(stop); cudaeventcreate(&start); cudaeventcreate(&stop); cudaeventrecord(start, 0); (int i=0;i<1000;i++) {usetwokernels_1<<<16,1000>>>(); usetwokernels_2<<<16,1000>>>();} cudaeventrecord(stop, 0); cudaeventsynchronize(stop); cudaeventelapsedtime(&time, start, stop); printf ("synchronizing without threadfence, splitting 2 kernels: %f ms\n", time); cudaeventdestroy(start); cudaeventdestroy(stop); cudamemcpyfromsymbol(&sum,answer,sizeof(int)); printf("answer: %d\n",sum); }
I have tested your code, compiled with CUDA 6.0, on two different cards: a GT540M (Fermi) and a Kepler K20c. These are the results:
GT540M
Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting into 2 kernels: 597.729919 ms
Answer: 120
Kepler K20c
Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting into 2 kernels: 17.330784 ms
Answer: 120
So I do not observe any particularly slow behavior of __threadfence() against the other two considered cases. This can be justified by looking at the disassembled codes.
usethreadfence()
c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0x8] = answer
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x

/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_TID.X;                              R0 = threadIdx.x
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT;               P0 = (R0 != 0)
/*0018*/ S2R R5, SR_CTAID.X;                            R5 = blockIdx.x
/*0020*/ IMAD R3, R5, 0x3e8, R0;                        R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
                                                        if (threadIdx.x == 0)
/*0028*/ @!P0 ISCADD R2, R5, c[0xe][0x0], 0x2;          R2 = &scratch[blockIdx.x]
/*0030*/ IADD R4, R0, 0x11;                             R4 = R0 + 17 = threadIdx.x + 17
/*0038*/ ISCADD R3, R3, c[0xe][0x4], 0x2;               R3 = &junk[threadIdx.x + blockIdx.x * 1000]
/*0040*/ @!P0 ST [R2], R5;                              scratch[blockIdx.x] = blockIdx.x
/*0048*/ ST [R3], R4;                                   junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/ MEMBAR.GL;                                     __threadfence()
/*0058*/ @P0 BRA.U 0x98;                                if (threadIdx.x != 0) branch to 0x98
                                                        if (threadIdx.x == 0)
/*0060*/ @!P0 MOV R2, c[0xe][0xc];                      R2 = &count
/*0068*/ @!P0 MOV R3, c[0x0][0x14];                     R3 = gridDim.x
/*0070*/ @!P0 ATOM.INC R2, [R2], R3;                    R2 = value = old count; count++
/*0078*/ @!P0 IADD R3, R3, -0x1;                        R3 = R3 - 1 = gridDim.x - 1
/*0080*/ @!P0 ISETP.EQ.AND P1, PT, R2, R3, PT;          P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/ @!P0 SEL R2, RZ, 0x1, !P1;                     if (!P1) R2 = RZ, otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/ @!P0 STS.U8 [RZ], R2;                          stores R2 (i.e., isLastBlockDone) to shared memory [0]
/*0098*/ ISETP.EQ.AND P0, PT, R0, RZ, PT;               P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/ BAR.RED.POPC RZ, RZ, RZ, PT;                   __syncthreads()
/*00a8*/ LDS.U8 R0, [RZ];                               R0 = isLastBlockDone
/*00b0*/ ISETP.NE.AND P0, PT, R0, RZ, P0;               P0 = (isLastBlockDone != 0) && (threadIdx.x == 0)
/*00b8*/ @!P0 EXIT;                                     exit unless (isLastBlockDone && threadIdx.x == 0)
/*00c0*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;     implementing the sum loop with a loop unroll of 4
/*00c8*/ MOV R0, RZ;
/*00d0*/ @!P0 BRA 0x1b8;
/*00d8*/ MOV R2, c[0x0][0x14];
/*00e0*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/ MOV R2, RZ;
/*00f0*/ @!P0 BRA 0x170;
/*00f8*/ MOV R3, c[0x0][0x14];
/*0100*/ IADD R7, R3, -0x3;
/*0108*/ NOP;
/*0110*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/ IADD R2, R2, 0x4;
/*0120*/ LD R4, [R3];
/*0128*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/ LD R5, [R3+0x4];
/*0138*/ LD R6, [R3+0x8];
/*0140*/ LD R3, [R3+0xc];
/*0148*/ IADD R0, R4, R0;
/*0150*/ IADD R0, R5, R0;
/*0158*/ IADD R0, R6, R0;
/*0160*/ IADD R0, R3, R0;
/*0168*/ @P0 BRA 0x110;
/*0170*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/ @!P0 BRA 0x1b8;
/*0180*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/ IADD R2, R2, 0x1;
/*0190*/ LD R3, [R3];
/*0198*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/ NOP;
/*01a8*/ IADD R0, R3, R0;
/*01b0*/ @P0 BRA 0x180;
/*01b8*/ MOV R2, c[0xe][0x8];
/*01c0*/ ST [R2], R0;
/*01c8*/ EXIT;
justthreadfence()
Function : _Z15justthreadfencev
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100];                         /* 0x2800440400005de4 */
/*0008*/ S2R R3, SR_TID.X;                              /* 0x2c0000008400dc04 */
/*0010*/ ISETP.NE.AND P0, PT, R3, RZ, PT;               /* 0x1a8e0000fc31dc23 */
/*0018*/ S2R R4, SR_CTAID.X;                            /* 0x2c00000094011c04 */
/*0020*/ IMAD R2, R4, 0x3e8, R3;                        /* 0x2006c00fa0409ca3 */
/*0028*/ @!P0 ISCADD R0, R4, c[0xe][0x0], 0x2;          /* 0x4000780000402043 */
/*0030*/ IADD R3, R3, 0x11;                             /* 0x4800c0004430dc03 */
/*0038*/ ISCADD R2, R2, c[0xe][0x4], 0x2;               /* 0x4000780010209c43 */
/*0040*/ @!P0 ST [R0], R4;                              /* 0x9000000000012085 */
/*0048*/ ST [R2], R3;                                   /* 0x900000000020dc85 */
/*0050*/ MEMBAR.GL;                                     /* 0xe000000000001c25 */
/*0058*/ EXIT;                                          /* 0x8000000000001de7 */
usetwokernels_1()
Function : _Z15usetwokernels_1v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100];                         /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X;                              /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT;               /* 0x1a8e0000fc01dc23 */
/*0018*/ S2R R2, SR_CTAID.X;                            /* 0x2c00000094009c04 */
/*0020*/ IMAD R4, R2, 0x3e8, R0;                        /* 0x2000c00fa0211ca3 */
/*0028*/ @!P0 ISCADD R3, R2, c[0xe][0x0], 0x2;          /* 0x400078000020e043 */
/*0030*/ IADD R0, R0, 0x11;                             /* 0x4800c00044001c03 */
/*0038*/ ISCADD R4, R4, c[0xe][0x4], 0x2;               /* 0x4000780010411c43 */
/*0040*/ @!P0 ST [R3], R2;                              /* 0x900000000030a085 */
/*0048*/ ST [R4], R0;                                   /* 0x9000000000401c85 */
/*0050*/ EXIT;                                          /* 0x8000000000001de7 */
.....................................
usetwokernels_2()
Function : _Z15usetwokernels_2v
.headerflags @"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100];                         /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X;                              /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT;               /* 0x1a8e0000fc01dc23 */
/*0018*/ @P0 EXIT;                                      /* 0x80000000000001e7 */
/*0020*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT;     /* 0x1a8e400053f1dc23 */
/*0028*/ MOV R0, RZ;                                    /* 0x28000000fc001de4 */
/*0030*/ @!P0 BRA 0x130;                                /* 0x40000003e00021e7 */
/*0038*/ MOV R2, c[0x0][0x14];                          /* 0x2800400050009de4 */
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;              /* 0x1a0ec0000c21dc23 */
/*0048*/ MOV R2, RZ;                                    /* 0x28000000fc009de4 */
/*0050*/ @!P0 BRA 0xe0;                                 /* 0x40000002200021e7 */
/*0058*/ MOV R3, c[0x0][0x14];                          /* 0x280040005000dde4 */
/*0060*/ IADD R7, R3, -0x3;                             /* 0x4800fffff431dc03 */
/*0068*/ NOP;                                           /* 0x4000000000001de4 */
/*0070*/ NOP;                                           /* 0x4000000000001de4 */
/*0078*/ NOP;                                           /* 0x4000000000001de4 */
/*0080*/ ISCADD R3, R2, c[0xe][0x0], 0x2;               /* 0x400078000020dc43 */
/*0088*/ LD R4, [R3];                                   /* 0x8000000000311c85 */
/*0090*/ IADD R2, R2, 0x4;                              /* 0x4800c00010209c03 */
/*0098*/ LD R5, [R3+0x4];                               /* 0x8000000010315c85 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT;           /* 0x188e00001c21dc03 */
/*00a8*/ LD R6, [R3+0x8];                               /* 0x8000000020319c85 */
/*00b0*/ LD R3, [R3+0xc];                               /* 0x800000003030dc85 */
/*00b8*/ IADD R0, R4, R0;                               /* 0x4800000000401c03 */
/*00c0*/ IADD R0, R5, R0;                               /* 0x4800000000501c03 */
/*00c8*/ IADD R0, R6, R0;                               /* 0x4800000000601c03 */
/*00d0*/ IADD R0, R3, R0;                               /* 0x4800000000301c03 */
/*00d8*/ @P0 BRA 0x80;                                  /* 0x4003fffe800001e7 */
/*00e0*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*00e8*/ @!P0 BRA 0x130;                                /* 0x40000001000021e7 */
/*00f0*/ NOP;                                           /* 0x4000000000001de4 */
/*00f8*/ NOP;                                           /* 0x4000000000001de4 */
/*0100*/ ISCADD R3, R2, c[0xe][0x0], 0x2;               /* 0x400078000020dc43 */
/*0108*/ IADD R2, R2, 0x1;                              /* 0x4800c00004209c03 */
/*0110*/ LD R3, [R3];                                   /* 0x800000000030dc85 */
/*0118*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*0120*/ IADD R0, R3, R0;                               /* 0x4800000000301c03 */
/*0128*/ @P0 BRA 0x100;                                 /* 0x4003ffff400001e7 */
/*0130*/ MOV R2, c[0xe][0x8];                           /* 0x2800780020009de4 */
/*0138*/ ST [R2], R0;                                   /* 0x9000000000201c85 */
/*0140*/ EXIT;                                          /* 0x8000000000001de7 */
.....................................
As can be seen, the instructions of justthreadfence() are strictly contained in those of usethreadfence(), while those of usetwokernels_1() and usetwokernels_2() are practically a partitioning of those of usethreadfence(). So, the difference in timings can be ascribed to the kernel launch overhead of the second kernel.
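As a further sanity check on that conclusion, one could measure the launch overhead in isolation by timing an empty kernel launched back-to-back in the default stream, reusing the same event-based timing and launch configuration as the test code. The sketch below is only an illustration under the assumption that a null kernel isolates launch overhead reasonably well; the emptykernel name is made up and is not part of the original post.

#include <cstdio>

__global__ void emptykernel() { }   // does no work: only launch and scheduling overhead remain

int main()
{
    cudaEvent_t start, stop;
    float time;

    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int i = 0; i < 1000; i++) emptykernel<<<16,1000>>>();   // same <<<16,1000>>> configuration as the test code
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Time for 1000 empty kernel launches: %f ms\n", time);
    cudaEventDestroy(start); cudaEventDestroy(stop);

    return 0;
}

Dividing that figure by 1000 gives a rough per-launch cost that can be compared against the per-iteration gap between the one-kernel and two-kernel timings reported above.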