Structure to array (and back again) with Circle
Breaking up a structure or vector into its constituent parts, doing some work, and stitching them back together later is a common activity in high-performance programming. Hardware vector support is built for "vertical" operations, where all the .x, .y and .z components of a vector are de-interleaved into their own arrays. Transforming the data between interleaved (normal struct/vector) and deinterleaved (all members separated out) is a frequent task, but one that can't be automated using C++.
Circle provides introspection keywords and same-language reflection. We can write S2A code once, make it generic, and use it for everything.
Here's Circle/CUDA code for deinterleaving a float4 array into its constituent components, then interleaving them back again.
#include <cuda.h>

#include <cstdio>
#include <cstdlib>
#include <type_traits>
// Define a structure with a pointer to each member in type_t.
// Define a structure with a pointer to each member in type_t: one
// deinterleaved array per data member, each pointer reusing the member's own
// name.  Built with Circle reflection keywords (@member_count, @member_type,
// @member_name, @member_ref, and the @() dynamic-name operator); this is not
// standard C++ and requires the Circle compiler.
template<typename type_t>
struct s2a_pointers_t {
  // Only class types have data members to reflect over.
  static_assert(std::is_class<type_t>::value,
    "s2a argument must be a class type");

  // Number of non-static data members of type_t, evaluated at compile time.
  enum { count = @member_count(type_t) };

  // Make a non-static data member that's a pointer to the struct's member
  // type and has the same name.  The @meta loop executes in the compiler, so
  // the struct ends up with `count` ordinary pointer members (e.g. for
  // float4: float* x, float* y, float* z, float* w).
  @meta for(int i = 0; i < count; ++i)
    @member_type(type_t, i)* @(@member_name(type_t, i));

  // Convert from array to struct: gather element `index` from each member
  // array and assemble the values into a single type_t.
  type_t to_struct(size_t index) const {
    type_t obj { };

    // Loop over each member of obj, and set that member from data loaded
    // from the corresponding array.  The loop is unrolled at compile time —
    // only the `count` load/assign statements exist in the emitted code.
    @meta for(int i = 0; i < count; ++i)
      @member_ref(obj, i) = this->@(@member_name(type_t, i))[index];

    return obj;
  }

  // Convert from struct to array: scatter each member of obj into slot
  // `index` of the corresponding member array.
  void to_array(const type_t& obj, size_t index) {
    // Loop over each member of type_t, and store its value into the
    // corresponding array.  Unrolled at compile time, same as to_struct.
    @meta for(int i = 0; i < count; ++i)
      this->@(@member_name(type_t, i))[index] = @member_ref(obj, i);
  }
};
// Each thread in s2a_k loads one element from s (the aggregate) and
// stores out each component to the corresponding pointer in a.
// Deinterleave kernel: each thread loads one type_t element from the
// interleaved array s and scatters its members into the per-member arrays
// carried (by value) in a.  Expects a 1D launch covering at least `count`
// threads; the guard handles the partial tail block.
template<typename type_t>
__global__ void s2a_k(const type_t* s, s2a_pointers_t<type_t> a, size_t count) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if(gid >= count)
    return;

  // One aggregate load from global memory, then a per-member scatter.
  a.to_array(s[gid], gid);
}
// Each thread in a2s_k loads a full set of members from the deinterleaved
// pointers and stores them out as an aggregate to s.
// Interleave kernel: each thread gathers one value from every deinterleaved
// member array in a and stores the assembled aggregate to s.  Expects a 1D
// launch covering at least `count` threads; the guard handles the tail.
template<typename type_t>
__global__ void a2s_k(s2a_pointers_t<type_t> a, type_t* s, size_t count) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if(gid >= count)
    return;

  // Per-member gather, then one aggregate store to global memory.
  s[gid] = a.to_struct(gid);
}
// Abort with a diagnostic if a CUDA runtime call failed.  Kernel launches
// report config errors via cudaGetLastError() and execution errors at the
// next synchronizing call, so both are checked below.
static void check_cuda(cudaError_t err, const char* what) {
  if(cudaSuccess != err) {
    std::fprintf(stderr, "CUDA error in %s: %s\n", what,
      cudaGetErrorString(err));
    std::exit(1);
  }
}

// Demo driver: round-trips a float4 array through the deinterleaved
// (structure-of-arrays) representation and back.
int main(int argc, char** argv) {
  size_t count = 1000000;

  // Allocate the interleaved float4 array.
  float4* v;
  check_cuda(cudaMalloc((void**)&v, sizeof(float4) * count), "cudaMalloc v");

  // Allocate an array for each float4 member.
  float* x, *y, *z, *w;
  check_cuda(cudaMalloc((void**)&x, sizeof(float) * count), "cudaMalloc x");
  check_cuda(cudaMalloc((void**)&y, sizeof(float) * count), "cudaMalloc y");
  check_cuda(cudaMalloc((void**)&z, sizeof(float) * count), "cudaMalloc z");
  check_cuda(cudaMalloc((void**)&w, sizeof(float) * count), "cudaMalloc w");

  // Has members .x, .y, .z, .w. It's nice to put S2A pointers inside a
  // struct, so that we can pass them to generic functions like s2a_k.
  s2a_pointers_t<float4> pointers { x, y, z, w };

  size_t nt = 512;
  // Ceil-divide in size_t, then narrow explicitly for the launch config.
  int num_blocks = (int)((count + nt - 1) / nt);

  // Convert from struct to array.
  s2a_k<<<num_blocks, nt>>>(v, pointers, count);
  check_cuda(cudaGetLastError(), "s2a_k launch");

  // Convert back from array to struct.
  a2s_k<<<num_blocks, nt>>>(pointers, v, count);
  check_cuda(cudaGetLastError(), "a2s_k launch");

  // Surface any asynchronous execution errors before tearing down.
  check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  cudaFree(v);
  cudaFree(x);
  cudaFree(y);
  cudaFree(z);
  cudaFree(w);
  return 0;
}
The resulting code is well optimized. Because we're storing a 16-byte aligned float4, we expect to see an STG.E.128 instruction to store out the data in one transaction, which we do, at [0178]. All of the control flow in to_struct and to_array is evaluated at compile time — it isn't merely optimized out; it's never even part of the AST.
$ cuobjdump -sass s2a
code for sm_52
Function : _Z5a2s_kI6float4Ev14s2a_pointers_tIT_EPS2_m
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c7c00e22007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0xf0c8000002570000 */
/*0018*/ S2R R2, SR_TID.X ; /* 0xf0c8000002170002 */
/* 0x001fd840fec20ff1 */
/*0028*/ XMAD R2, R0.reuse, c[0x0] [0x8], R2 ; /* 0x4e00010000270002 */
/*0030*/ XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ; /* 0x4f107f8000270003 */
/*0038*/ XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ; /* 0x5b30011800370000 */
/* 0x001ff400fda007f6 */
/*0048*/ IADD RZ.CC, R0, -c[0x0][0x168] ; /* 0x4c11800005a700ff */
/*0050*/ ISETP.GE.U32.X.AND P0, PT, RZ, c[0x0][0x16c], PT ; /* 0x4b6c0b8005b7ff07 */
/*0058*/ @P0 EXIT ; /* 0xe30000000000000f */
/* 0x001fd800fea207f1 */
/*0068*/ SHL R2, R0.reuse, 0x2 ; /* 0x3848000000270002 */
/*0070*/ SHR.U32 R3, R0, 0x1e ; /* 0x3828000001e70003 */
/*0078*/ IADD R5.CC, R2, c[0x0][0x140] ; /* 0x4c10800005070205 */
/* 0x001f8440fec007f1 */
/*0088*/ IADD.X R6, R3, c[0x0][0x144] ; /* 0x4c10080005170306 */
/*0090*/ LEA R4.CC, R5.reuse, RZ ; /* 0x5bd780000ff70504 */
/*0098*/ LEA.HI.X P0, R5, R5, RZ, R6 ; /* 0x5bd803400ff70505 */
/* 0x081fd800fe2207f6 */
/*00a8*/ IADD R9.CC, R2.reuse, c[0x0][0x148] ; /* 0x4c10800005270209 */
/*00b0*/ IADD.X R6, R3, c[0x0][0x14c] ; /* 0x4c10080005370306 */
/*00b8*/ LEA R8.CC, R9.reuse, RZ ; /* 0x5bd780000ff70908 */
/* 0x001f8400fec007f1 */
/*00c8*/ LEA.HI.X P1, R9, R9, RZ, R6 ; /* 0x5bd903400ff70909 */
/*00d0*/ IADD R11.CC, R2, c[0x0][0x150] ; /* 0x4c1080000547020b */
/*00d8*/ IADD.X R6, R3, c[0x0][0x154] ; /* 0x4c10080005570306 */
/* 0x001fb000fe2207f6 */
/*00e8*/ LEA R10.CC, R11.reuse, RZ ; /* 0x5bd780000ff70b0a */
/*00f0*/ LEA.HI.X P2, R11, R11, RZ, R6 ; /* 0x5bda03400ff70b0b */
/*00f8*/ IADD R2.CC, R2, c[0x0][0x158] ; /* 0x4c10800005670202 */
/* 0x081fd800f62007f0 */
/*0108*/ { IADD.X R3, R3, c[0x0][0x15c] ; /* 0x4c10080005770303 */
/*0110*/ LD.E R6, [R10], P2 }
/* 0x8890000000070a06 */
/*0118*/ LEA R12.CC, R2.reuse, RZ ; /* 0x5bd780000ff7020c */
/* 0x001fc000fe2007e1 */
/*0128*/ LEA.HI.X P3, R13, R2, RZ, R3 ; /* 0x5bdb01c00ff7020d */
/*0130*/ MOV R2, R4 ; /* 0x5c98078000470002 */
/*0138*/ { MOV R3, R5 ; /* 0x5c98078000570003 */
/* 0x001edc40fe0007b4 */
/*0148*/ LD.E R5, [R8], P1 }
/* 0x8490000000070805 */
/*0150*/ { LEA R14.CC, R0.reuse, c[0x0][0x160], 0x4 ; /* 0x4bd782000587000e */
/*0158*/ LD.E R4, [R2], P0 }
/* 0x8090000000070204 */
/* 0x041fc400f6c007f0 */
/*0168*/ { LEA.HI.X R15, R0, c[0x0][0x164], RZ, 0x4 ; /* 0x1a277f800597000f */
/*0170*/ LD.E R7, [R12], P3 }
/* 0x8c90000000070c07 */
/*0178*/ STG.E.128 [R14], R4 ; /* 0xeede200000070e04 */
/* 0x001f8000ffe007ff */
/*0188*/ EXIT ; /* 0xe30000000007000f */
/*0190*/ BRA 0x190 ; /* 0xe2400fffff87000f */
/*0198*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*01a8*/ NOP; /* 0x50b0000000070f00 */
/*01b0*/ NOP; /* 0x50b0000000070f00 */
/*01b8*/ NOP; /* 0x50b0000000070f00 */
......................................................