Structure to array (and back again) with Circle
Breaking up a structure or vector into its constituent parts, doing some work, and stitching them back together later is a common activity in high-performance programming. Hardware vector support is built for "vertical" operations, where all the .x, .y and .z components of a vector are de-interleaved into their own arrays. Transforming the data between interleaved (normal struct/vector) and deinterleaved (all members separated out) is a frequent task, but one that can't be automated using C++.
Circle provides introspection keywords and same-language reflection. We can write S2A code once, make it generic, and use it for everything.
Here's Circle/CUDA code for deinterleaving a float4 array into its constituent components, then interleaving them back again.
#include <cuda.h>

#include <cstdio>
#include <cstdlib>
#include <type_traits>
// Define a structure with a pointer to each member in type_t.
// Define a structure with a pointer to each member in type_t: one
// deinterleaved array per data member, each pointer reusing the member's own
// name.  Built with Circle reflection keywords (@member_count, @member_type,
// @member_name, @member_ref, and the @() dynamic-name operator); this is not
// standard C++ and requires the Circle compiler.
template<typename type_t>
struct s2a_pointers_t {
  // Only class types have data members to reflect over.
  static_assert(std::is_class<type_t>::value,
    "s2a argument must be a class type");

  // Number of non-static data members of type_t, evaluated at compile time.
  enum { count = @member_count(type_t) };

  // Make a non-static data member that's a pointer to the struct's member
  // type and has the same name.  The @meta loop executes in the compiler, so
  // the struct ends up with `count` ordinary pointer members (e.g. for
  // float4: float* x, float* y, float* z, float* w).
  @meta for(int i = 0; i < count; ++i)
    @member_type(type_t, i)* @(@member_name(type_t, i));

  // Convert from array to struct: gather element `index` from each member
  // array and assemble the values into a single type_t.
  type_t to_struct(size_t index) const {
    type_t obj { };

    // Loop over each member of obj, and set that member from data loaded
    // from the corresponding array.  The loop is unrolled at compile time —
    // only the `count` load/assign statements exist in the emitted code.
    @meta for(int i = 0; i < count; ++i)
      @member_ref(obj, i) = this->@(@member_name(type_t, i))[index];

    return obj;
  }

  // Convert from struct to array: scatter each member of obj into slot
  // `index` of the corresponding member array.
  void to_array(const type_t& obj, size_t index) {
    // Loop over each member of type_t, and store its value into the
    // corresponding array.  Unrolled at compile time, same as to_struct.
    @meta for(int i = 0; i < count; ++i)
      this->@(@member_name(type_t, i))[index] = @member_ref(obj, i);
  }
};
// Each thread in s2a_k loads one element from s (the aggregate) and
// stores out each component to the corresponding pointer in a.
// Deinterleave kernel: each thread loads one type_t element from the
// interleaved array s and scatters its members into the per-member arrays
// carried (by value) in a.  Expects a 1D launch covering at least `count`
// threads; the guard handles the partial tail block.
template<typename type_t>
__global__ void s2a_k(const type_t* s, s2a_pointers_t<type_t> a, size_t count) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if(gid >= count)
    return;

  // One aggregate load from global memory, then a per-member scatter.
  a.to_array(s[gid], gid);
}
// Each thread in a2s_k loads a full set of members from the deinterleaved
// pointers and stores them out as an aggregate to s.
// Interleave kernel: each thread gathers one value from every deinterleaved
// member array in a and stores the assembled aggregate to s.  Expects a 1D
// launch covering at least `count` threads; the guard handles the tail.
template<typename type_t>
__global__ void a2s_k(s2a_pointers_t<type_t> a, type_t* s, size_t count) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if(gid >= count)
    return;

  // Per-member gather, then one aggregate store to global memory.
  s[gid] = a.to_struct(gid);
}
// Abort with a diagnostic if a CUDA runtime call failed.  Kernel launches
// report config errors via cudaGetLastError() and execution errors at the
// next synchronizing call, so both are checked below.
static void check_cuda(cudaError_t err, const char* what) {
  if(cudaSuccess != err) {
    std::fprintf(stderr, "CUDA error in %s: %s\n", what,
      cudaGetErrorString(err));
    std::exit(1);
  }
}

// Demo driver: round-trips a float4 array through the deinterleaved
// (structure-of-arrays) representation and back.
int main(int argc, char** argv) {
  size_t count = 1000000;

  // Allocate the interleaved float4 array.
  float4* v;
  check_cuda(cudaMalloc((void**)&v, sizeof(float4) * count), "cudaMalloc v");

  // Allocate an array for each float4 member.
  float* x, *y, *z, *w;
  check_cuda(cudaMalloc((void**)&x, sizeof(float) * count), "cudaMalloc x");
  check_cuda(cudaMalloc((void**)&y, sizeof(float) * count), "cudaMalloc y");
  check_cuda(cudaMalloc((void**)&z, sizeof(float) * count), "cudaMalloc z");
  check_cuda(cudaMalloc((void**)&w, sizeof(float) * count), "cudaMalloc w");

  // Has members .x, .y, .z, .w. It's nice to put S2A pointers inside a
  // struct, so that we can pass them to generic functions like s2a_k.
  s2a_pointers_t<float4> pointers { x, y, z, w };

  size_t nt = 512;
  // Ceil-divide in size_t, then narrow explicitly for the launch config.
  int num_blocks = (int)((count + nt - 1) / nt);

  // Convert from struct to array.
  s2a_k<<<num_blocks, nt>>>(v, pointers, count);
  check_cuda(cudaGetLastError(), "s2a_k launch");

  // Convert back from array to struct.
  a2s_k<<<num_blocks, nt>>>(pointers, v, count);
  check_cuda(cudaGetLastError(), "a2s_k launch");

  // Surface any asynchronous execution errors before tearing down.
  check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  cudaFree(v);
  cudaFree(x);
  cudaFree(y);
  cudaFree(z);
  cudaFree(w);
  return 0;
}
The resulting code is well optimized. Because we're storing a 16-byte aligned float4, we expect to see an STG.E.128 instruction to store out the data in one transaction, which we do, at [0178]. All of the control flow in to_struct and to_array is evaluated at compile time — it isn't merely optimized out; it's never even part of the AST.
$ cuobjdump -sass s2a
code for sm_52
Function : _Z5a2s_kI6float4Ev14s2a_pointers_tIT_EPS2_m
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c7c00e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0xf0c8000002570000 */
/*0018*/ S2R R2, SR_TID.X ; /* 0xf0c8000002170002 */
/* 0x001fd840fec20ff1 */
/*0028*/ XMAD R2, R0.reuse, c[0x0] [0x8], R2 ; /* 0x4e00010000270002 */
/*0030*/ XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ; /* 0x4f107f8000270003 */
/*0038*/ XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ; /* 0x5b30011800370000 */
/* 0x001ff400fda007f6 */
/*0048*/ IADD RZ.CC, R0, -c[0x0][0x168] ; /* 0x4c11800005a700ff */
/*0050*/ ISETP.GE.U32.X.AND P0, PT, RZ, c[0x0][0x16c], PT ; /* 0x4b6c0b8005b7ff07 */
/*0058*/ @P0 EXIT ; /* 0xe30000000000000f */
/* 0x001fd800fea207f1 */
/*0068*/ SHL R2, R0.reuse, 0x2 ; /* 0x3848000000270002 */
/*0070*/ SHR.U32 R3, R0, 0x1e ; /* 0x3828000001e70003 */
/*0078*/ IADD R5.CC, R2, c[0x0][0x140] ; /* 0x4c10800005070205 */
/* 0x001f8440fec007f1 */
/*0088*/ IADD.X R6, R3, c[0x0][0x144] ; /* 0x4c10080005170306 */
/*0090*/ LEA R4.CC, R5.reuse, RZ ; /* 0x5bd780000ff70504 */
/*0098*/ LEA.HI.X P0, R5, R5, RZ, R6 ; /* 0x5bd803400ff70505 */
/* 0x081fd800fe2207f6 */
/*00a8*/ IADD R9.CC, R2.reuse, c[0x0][0x148] ; /* 0x4c10800005270209 */
/*00b0*/ IADD.X R6, R3, c[0x0][0x14c] ; /* 0x4c10080005370306 */
/*00b8*/ LEA R8.CC, R9.reuse, RZ ; /* 0x5bd780000ff70908 */
/* 0x001f8400fec007f1 */
/*00c8*/ LEA.HI.X P1, R9, R9, RZ, R6 ; /* 0x5bd903400ff70909 */
/*00d0*/ IADD R11.CC, R2, c[0x0][0x150] ; /* 0x4c1080000547020b */
/*00d8*/ IADD.X R6, R3, c[0x0][0x154] ; /* 0x4c10080005570306 */
/* 0x001fb000fe2207f6 */
/*00e8*/ LEA R10.CC, R11.reuse, RZ ; /* 0x5bd780000ff70b0a */
/*00f0*/ LEA.HI.X P2, R11, R11, RZ, R6 ; /* 0x5bda03400ff70b0b */
/*00f8*/ IADD R2.CC, R2, c[0x0][0x158] ; /* 0x4c10800005670202 */
/* 0x081fd800f62007f0 */
/*0108*/ { IADD.X R3, R3, c[0x0][0x15c] ; /* 0x4c10080005770303 */
/*0110*/ LD.E R6, [R10], P2 }
/* 0x8890000000070a06 */
/*0118*/ LEA R12.CC, R2.reuse, RZ ; /* 0x5bd780000ff7020c */
/* 0x001fc000fe2007e1 */
/*0128*/ LEA.HI.X P3, R13, R2, RZ, R3 ; /* 0x5bdb01c00ff7020d */
/*0130*/ MOV R2, R4 ; /* 0x5c98078000470002 */
/*0138*/ { MOV R3, R5 ; /* 0x5c98078000570003 */
/* 0x001edc40fe0007b4 */
/*0148*/ LD.E R5, [R8], P1 }
/* 0x8490000000070805 */
/*0150*/ { LEA R14.CC, R0.reuse, c[0x0][0x160], 0x4 ; /* 0x4bd782000587000e */
/*0158*/ LD.E R4, [R2], P0 }
/* 0x8090000000070204 */
/* 0x041fc400f6c007f0 */
/*0168*/ { LEA.HI.X R15, R0, c[0x0][0x164], RZ, 0x4 ; /* 0x1a277f800597000f */
/*0170*/ LD.E R7, [R12], P3 }
/* 0x8c90000000070c07 */
/*0178*/ STG.E.128 [R14], R4 ; /* 0xeede200000070e04 */
/* 0x001f8000ffe007ff */
/*0188*/ EXIT ; /* 0xe30000000007000f */
/*0190*/ BRA 0x190 ; /* 0xe2400fffff87000f */
/*0198*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*01a8*/ NOP; /* 0x50b0000000070f00 */
/*01b0*/ NOP; /* 0x50b0000000070f00 */
/*01b8*/ NOP; /* 0x50b0000000070f00 */
......................................................