Layout

0 1 2 3 4 5 6 7 0 0 0 0 0 0 0 0 
8 9 10 11 12 13 14 15 0 0 0 0 0 0 0 0 
16 17 18 19 20 21 22 23 0 0 0 0 0 0 0 0 
24 25 26 27 28 29 30 31 0 0 0 0 0 0 0 0 
32 33 34 35 36 37 38 39 0 0 0 0 0 0 0 0 
40 41 42 43 44 45 46 47 0 0 0 0 0 0 0 0 
48 49 50 51 52 53 54 55 0 0 0 0 0 0 0 0 
56 57 58 59 60 61 62 63 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 8 16 24 32 40 48 56 
0 0 0 0 0 0 0 0 1 9 17 25 33 41 49 57 
0 0 0 0 0 0 0 0 2 10 18 26 34 42 50 58 
0 0 0 0 0 0 0 0 3 11 19 27 35 43 51 59 
0 0 0 0 0 0 0 0 4 12 20 28 36 44 52 60 
0 0 0 0 0 0 0 0 5 13 21 29 37 45 53 61 
0 0 0 0 0 0 0 0 6 14 22 30 38 46 54 62 
0 0 0 0 0 0 0 0 7 15 23 31 39 47 55 63

Code on V100

// Each lane writes only its own share of a_frag; lanes outside the four
// groups handled below leave the fragment exactly as it was.
int half_elements = a_frag.num_elements / 2;

// First half of the fragment (elements [0, half_elements)): lanes 0-3 and
// 16-19 each store a run of consecutive values starting at lid * 8 - bias.
// With 8 elements per half that is 0..31 across lanes 0-3 and 32..63 across
// lanes 16-19.
// NOTE(review): presumably this is the upper-left quadrant of the layout
// above -- confirm against the fragment-to-matrix mapping.
const int in_lanes_0_3 = (lid < 4);
const int in_lanes_16_19 = (lid >= 16 && lid < 20);
if (in_lanes_0_3 || in_lanes_16_19) {
    const int bias = in_lanes_0_3 ? 0 : 96;
    for (int k = 0; k < half_elements; ++k) {
        a_frag.x[k] = lid * 8 - bias + k;
    }
}

// Second half (elements [half_elements, num_elements)): lanes 12-15 and
// 28-31 store values that advance by 8 per element. With num_elements == 16
// lane 12 gets 0, 8, ..., 56 and lane 28 gets 4, 12, ..., 60.
// NOTE(review): presumably this is the lower-right quadrant of the layout
// above -- confirm against the fragment-to-matrix mapping.
const int in_lanes_12_15 = (lid >= 12 && lid < 16);
const int in_lanes_28_31 = (lid >= 28 && lid < 32);
if (in_lanes_12_15 || in_lanes_28_31) {
    const int shift = in_lanes_12_15 ? 76 : 88;
    for (int k = half_elements; k < a_frag.num_elements; ++k) {
        a_frag.x[k] = lid - shift + k * 8;
    }
}