Estimate GFLOPS (dot products, no loops)

Here we are trying to use dot products between 2 or 3 arrays of vectors. Unfortunately all of these experiments showed that this task is still very memory-bound:

GFLOPS and memory bandwidth for various tests

FMA is the count of multiply-accumulate instructions in the shader; % FMA is the ratio of FMAs to all instructions; GFLOPS is the achieved multiplication rate per second; GOPS is the number of instructions issued per second (including those in the ADD/SF unit) – this is what ARM Mali marketing numbers are based on; and finally, BW is the total memory bandwidth required to fetch the input vectors and save the results.

We are getting around 50% of the expected GOPS. We need a different approach.

Test 1: 16 FMAs per invocation (2 vec4 loads + 1 vec4 store)

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Benchmark a compute shader doing 16 multiply-accumulates per invocation.

    Each invocation loads two neighbouring vec4 texels, computes four dot
    products between them and stores one vec4.  The input texture is filled
    with ones, so every dot product evaluates to 4 and the sum over the whole
    output is used as the number of MACs performed.

    Side effects: rebinds the module globals ``intex``, ``outtex`` and
    ``source`` (the generated GLSL), which the notebook inspects afterwards.

    Args:
        wh: if given, overrides both ``w`` and ``h`` (square output).
        w, h: output texture size; the input texture is ``2*w`` wide.
        localsz: optional ``(localx, localy)`` pair overriding both values.
        localx, localy: compute workgroup size.
        unrollx: accepted for signature parity with other tests; not used here.
        unrolly: scales the y start coordinate and divides the dispatch height.
        membw: when true, also return the memory bandwidth estimate.

    Returns:
        ``MACs / elapsed / 1e9`` (reported as GFLOPS), or when ``membw`` is
        true a tuple ``(gflops, bandwidth)`` with the bandwidth in MiB/s.
    """
    global intex, outtex, source
    if wh is not None:
        w, h = wh, wh
    if localsz is not None:
        localx, localy = localsz
    reps = 50  # number of timed dispatches that are averaged

    # All-ones RGBA float input of shape (h, 2*w, 4).
    src = np.ones((h, w*2, 4), dtype=np.float32)
    intex = createTexture(w*2, h, fmt=gl.GL_RGBA32F, output=False, src=src)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(2,{unrolly});
      highp vec4 x, y;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      highp vec4 r = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r);
    }}
    """
    computeShader(source)

    # Drain any queued setup work (texture upload, program setup) so that it
    # is not attributed to the timed dispatches below.
    gl.glFinish()
    start = time.perf_counter()
    for _ in range(reps):
        gl.glDispatchCompute(w//localx, h//unrolly//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    elapsed = (time.perf_counter() - start) / reps

    output = downloadTextureFloat(outtex, w, h)
    MACs = output.sum()
    gflops = MACs / elapsed / 1e9
    if membw:
        # 2 vec4 loads + 1 vec4 store per invocation, 4 floats of 4 bytes each.
        return gflops, 3*w*h*4*4 / elapsed / 1024 / 1024
    return gflops

# Run the benchmark once with a 1x8 workgroup, then print the GLSL that
# test() stored in the global `source` and the measured rate.
gflops = test(localy=8)
print(source)
print(gflops)
# Helper defined elsewhere; presumably dumps the disassembly of the shader
# that was compiled last (see the clause listing in the cell output).
showLastShaderDisassembly()

    #version 310 es
    precision mediump float;

    layout(local_size_x = 1, local_size_y = 8) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(2,1);
      highp vec4 x, y;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      highp vec4 r = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r);
    }
    
1.2023323586109347
FMAs: 64.00% (16 / 25)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *LSHIFT_OR.i32 r0:t0, r60, #0, 0x00000001 /* 0.000000 */
    +NOP t1
    *MKVEC.v2i16 t0, t0, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r1
}

clause_3:
ds(0) nbb attr ncph dwb(0) 
{
    *NOP t0
    +IADD.s32 t1, r0, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r5
}

clause_6:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r4, r8, #0.neg
    +NOP t1
    *FMA.f32 r0:t0, r3, r7, t0
    +NOP t1
}

clause_8:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 t0, r2, r6, r0
    +NOP t1
    *FMA.f32 r0:t0, r1, r5, t0
    +NOP t1
    *FMA.f32 t0, r4, r7, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r8, t0
    +NOP t1
    *FMA.f32 t0, r2, r5, t0
    +NOP t1
    *FMA.f32 r9:t0, r1, r6, t0
    +NOP t1
    *FMA.f32 t0, r4, r6, #0.neg
    +NOP t1
    *FMA.f32 r10:t0, r3, r5, t0
    +NOP t1
}

clause_14:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *FMA.f32 t0, r2, r8, r10
    +NOP t1
    *FMA.f32 r10:t0, r1, r7, t0
    +NOP t1
    *FMA.f32 t0, r4, r5, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r6, t0
    +NOP t1
    *FMA.f32 t0, r2, r7, t0
    +NOP t1
    *FMA.f32 r1:t0, r1, r8, t0
    +NOP t1
    *MOV.i32 r2:t0, r0
    +MKVEC.v2i16 t1, r60, r61
    *DTSEL_IMM.attribute_1 t0, t1
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r6
}

clause_20:
ds(0) eos store 
{
    *MOV.i32 r3:t0, r9
    +MOV.i32 r4:t1, r10
    *NOP t0
    +MOV.i32 r5:t1, r1
    *NOP t0
    +ST_CVT.v4 t1, r6, r7, r8, @r2
}

shader7799 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Sweep workgroup sizes. grid_run/grid_show are helpers defined elsewhere;
# presumably grid_run calls test() for every (localx, localy) combination.
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# NOTE(review): colorfun negates each value, presumably to invert the color
# scale of the rendered grid — confirm against grid_show.
grid_show(r, colorfun=lambda x: -x)

Test 2: 48 FMAs per invocation (3 vec4 loads + 1 vec4 store)

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Benchmark a compute shader doing 48 multiply-accumulates per invocation.

    Each invocation loads three neighbouring vec4 texels, computes four dot
    products for each of the three pairs and stores the summed vec4.  The
    input texture is filled with ones, so every dot product evaluates to 4
    and the sum over the whole output is used as the number of MACs performed.

    Side effects: rebinds the module globals ``intex``, ``outtex`` and
    ``source`` (the generated GLSL), which the notebook inspects afterwards.

    Args:
        wh: if given, overrides both ``w`` and ``h`` (square output).
        w, h: output texture size; the input texture is ``3*w`` wide.
        localsz: optional ``(localx, localy)`` pair overriding both values.
        localx, localy: compute workgroup size.
        unrollx: accepted for signature parity with other tests; not used here.
        unrolly: scales the y start coordinate and divides the dispatch height.
        membw: when true, also return the memory bandwidth estimate.

    Returns:
        ``MACs / elapsed / 1e9`` (reported as GFLOPS), or when ``membw`` is
        true a tuple ``(gflops, bandwidth)`` with the bandwidth in MiB/s.
    """
    global intex, outtex, source
    if wh is not None:
        w, h = wh, wh
    if localsz is not None:
        localx, localy = localsz
    reps = 50  # number of timed dispatches that are averaged

    # All-ones RGBA float input of shape (h, 3*w, 4).
    src = np.ones((h, w*3, 4), dtype=np.float32)
    intex = createTexture(w*3, h, fmt=gl.GL_RGBA32F, output=False, src=src)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,{unrolly});
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2);
    }}
    """
    computeShader(source)

    # Drain any queued setup work (texture upload, program setup) so that it
    # is not attributed to the timed dispatches below.
    gl.glFinish()
    start = time.perf_counter()
    for _ in range(reps):
        gl.glDispatchCompute(w//localx, h//unrolly//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    elapsed = (time.perf_counter() - start) / reps

    # The output texture is w x h (only the *input* is 3*w wide), so read back
    # exactly that region instead of relying on out-of-range texels being 0.
    output = downloadTextureFloat(outtex, w, h)
    MACs = output.sum()
    gflops = MACs / elapsed / 1e9
    if membw:
        # 3 vec4 loads + 1 vec4 store per invocation, 4 floats of 4 bytes each.
        return gflops, 4*w*h*4*4 / elapsed / 1024 / 1024
    return gflops

# test(localsz=(1,1), unrollx=4, unrolly=4)
# Two throw-away runs with default settings before the reported measurement
# (presumably a warm-up; their results are discarded).
test()
test()
# With membw=True test() returns a (GFLOPS, bandwidth) tuple, so `gflops`
# holds both numbers here.
gflops = test(localx=8, localy=2, membw=True)
print(source)
print(gflops)
# Helper defined elsewhere; presumably dumps the disassembly of the shader
# that was compiled last (see the clause listing in the cell output).
showLastShaderDisassembly()

    #version 310 es
    precision mediump float;

    layout(local_size_x = 8, local_size_y = 2) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,1);
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2);
    }
    
(5.032359501523362, 6398.9760100343865)
FMAs: 73.85% (48 / 65)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *IMUL.i32 r0:t0, r60, 0x00000003 /* 0.000000 */
    +NOP t1
    *MKVEC.v2i16 t0, t0, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r1
}

clause_3:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *NOP t0
    +IADD.s32 t1, r0, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r5
}

clause_6:
ds(0) nbb attr ncph 
{
    *NOP t0
    +IADD.s32 t1, r0, 0x00000002 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r9
    *FMA.f32 r0:t0, r4, r8, #0.neg
    +NOP t1
}

clause_9:
ds(0) nbb ncph dwb(0) 
{
    *FMA.f32 t0, r3, r7, r0
    +NOP t1
    *FMA.f32 t0, r2, r6, t0
    +NOP t1
    *FMA.f32 r0:t0, r1, r5, t0
    +NOP t1
    *FMA.f32 t0, r4, r7, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r8, t0
    +NOP t1
    *FMA.f32 t0, r2, r5, t0
    +NOP t1
    *FMA.f32 r13:t0, r1, r6, t0
    +NOP t1
    *FMA.f32 r14:t0, r4, r6, #0.neg
    +NOP t1
}

clause_15:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r3, r5, r14
    +NOP t1
    *FMA.f32 t0, r2, r8, t0
    +NOP t1
    *FMA.f32 r14:t0, r1, r7, t0
    +NOP t1
    *FMA.f32 t0, r4, r5, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r6, t0
    +NOP t1
    *FMA.f32 t0, r2, r7, t0
    +NOP t1
    *FMA.f32 r15:t0, r1, r8, t0
    +NOP t1
    *FMA.f32 r48:t0, r4, r12, #0.neg
    +NOP t1
}

clause_21:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r3, r11, r48
    +NOP t1
    *FMA.f32 t0, r2, r10, t0
    +NOP t1
    *FMA.f32 r48:t0, r1, r9, t0
    +NOP t1
    *FMA.f32 t0, r4, r11, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r12, t0
    +NOP t1
    *FMA.f32 t0, r2, r9, t0
    +NOP t1
    *FMA.f32 r49:t0, r1, r10, t0
    +NOP t1
    *FMA.f32 r50:t0, r4, r10, #0.neg
    +NOP t1
}

clause_27:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r3, r9, r50
    +NOP t1
    *FMA.f32 t0, r2, r12, t0
    +NOP t1
    *FMA.f32 r50:t0, r1, r11, t0
    +NOP t1
    *FMA.f32 t0, r4, r9, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r10, t0
    +NOP t1
    *FMA.f32 t0, r2, r11, t0
    +NOP t1
    *FMA.f32 r1:t0, r1, r12, t0
    +NOP t1
    *FMA.f32 r2:t0, r8, r12, #0.neg
    +NOP t1
}

clause_33:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r7, r11, r2
    +NOP t1
    *FMA.f32 t0, r6, r10, t0
    +NOP t1
    *FMA.f32 r2:t0, r5, r9, t0
    +NOP t1
    *FMA.f32 t0, r8, r11, #0.neg
    +NOP t1
    *FMA.f32 t0, r7, r12, t0
    +NOP t1
    *FMA.f32 t0, r6, r9, t0
    +NOP t1
    *FMA.f32 r3:t0, r5, r10, t0
    +NOP t1
    *FMA.f32 r4:t0, r8, r10, #0.neg
    +NOP t1
}

clause_39:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 t0, r7, r9, r4
    +NOP t1
    *FMA.f32 t0, r6, r12, t0
    +NOP t1
    *FMA.f32 r4:t0, r5, r11, t0
    +NOP t1
    *FMA.f32 t0, r8, r9, #0.neg
    +NOP t1
    *FMA.f32 t0, r7, r10, t0
    +NOP t1
    *FMA.f32 t0, r6, r11, t0
    +NOP t1
    *FMA.f32 r5:t0, r5, r12, t0
    +NOP t1
    *NOP t0
    +FADD.f32 r6:t1, r13, r49
}

clause_45:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *NOP t0
    +FADD.f32 r7:t1, r14, r50
    *FADD.f32 t0, r0, r48
    +FADD.f32 r0:t1, t, r2
    *NOP t0
    +FADD.f32 r2:t1, r6, r3
    *NOP t0
    +FADD.f32 r3:t1, r7, r4
    *FADD.f32 t0, r15, r1
    +MOV.i32 r4:t1, r0
    *FADD.f32 t0, t0, r5
    +MOV.i32 r5:t1, r2
    *MOV.i32 r7:t0, t0
    +MKVEC.v2i16 t1, r60, r61
    *DTSEL_IMM.attribute_1 t0, t1
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r0
}

clause_51:
ds(0) eos store 
{
    *NOP t0
    +MOV.i32 r6:t1, r3
    *NOP t0
    +ST_CVT.v4 t1, r0, r1, r2, @r4
}

shader7949 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Sweep workgroup sizes. grid_run/grid_show are helpers defined elsewhere;
# presumably grid_run calls test() for every (localx, localy) combination.
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# NOTE(review): colorfun negates each value, presumably to invert the color
# scale of the rendered grid — confirm against grid_show.
grid_show(r, colorfun=lambda x: -x)

Test 3: 107 FMAs per invocation (3 vec4 loads + 1 vec4 store)

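Watch out: the compiled shader performs only 107 FMAs, not the 112 (7 × 16) one might naively count from the source. This means the reported GFLOPS number needs to be corrected manually – here the shader subtracts 5 from the result (`r0.x -= 5.`) so that the summed output matches the real FMA count.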

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Benchmark a compute shader doing 107 multiply-accumulates per invocation.

    Each invocation loads three neighbouring vec4 texels and evaluates seven
    groups of four dot products (pairwise, self, and one group chained on a
    previous result).  The input texture is filled with ones; the shader
    subtracts 5 from one output component so that the sum of a single output
    texel is 107 instead of 112, and the sum over the whole output is used as
    the number of MACs performed.

    Side effects: rebinds the module globals ``intex``, ``outtex`` and
    ``source`` (the generated GLSL), which the notebook inspects afterwards.
    When ``membw`` is true the sum of the first output texel is printed.

    Args:
        wh: if given, overrides both ``w`` and ``h`` (square output).
        w, h: output texture size; the input texture is ``3*w`` wide.
        localsz: optional ``(localx, localy)`` pair overriding both values.
        localx, localy: compute workgroup size.
        unrollx: accepted for signature parity with other tests; not used here.
        unrolly: scales the y start coordinate and divides the dispatch height.
        membw: when true, also return the memory bandwidth estimate.

    Returns:
        ``MACs / elapsed / 1e9`` (reported as GFLOPS), or when ``membw`` is
        true a tuple ``(gflops, bandwidth)`` with the bandwidth in MiB/s.
    """
    global intex, outtex, source
    if wh is not None:
        w, h = wh, wh
    if localsz is not None:
        localx, localy = localsz
    reps = 50  # number of timed dispatches that are averaged

    # All-ones RGBA float input of shape (h, 3*w, 4).
    src = np.ones((h, w*3, 4), dtype=np.float32)
    intex = createTexture(w*3, h, fmt=gl.GL_RGBA32F, output=False, src=src)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,{unrolly});
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      r0.x -= 5.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2);
    }}
    """
    computeShader(source)

    # Drain any queued setup work (texture upload, program setup) so that it
    # is not attributed to the timed dispatches below.
    gl.glFinish()
    start = time.perf_counter()
    for _ in range(reps):
        gl.glDispatchCompute(w//localx, h//unrolly//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    elapsed = (time.perf_counter() - start) / reps

    # The output texture is w x h (only the *input* is 3*w wide), so read back
    # exactly that region instead of relying on out-of-range texels being 0.
    output = downloadTextureFloat(outtex, w, h)
    MACs = output.sum()
    gflops = MACs / elapsed / 1e9

    if membw:
        # Per-invocation MAC count as seen in the first output texel.
        print(output[0,0].sum())
        # 3 vec4 loads + 1 vec4 store per invocation, 4 floats of 4 bytes each.
        return gflops, 4*w*h*4*4 / elapsed / 1024 / 1024
    return gflops

# test(localsz=(1,1), unrollx=4, unrolly=4)
# Two throw-away runs with default settings before the reported measurement
# (presumably a warm-up; their results are discarded).
test()
test()
# With membw=True test() prints the sum of the first output texel and returns
# a (GFLOPS, bandwidth) tuple, so `gflops` holds both numbers here.
gflops = test(localx=8, localy=2, membw=True)
print(source)
print(gflops)
# Helper defined elsewhere; presumably dumps the disassembly of the shader
# that was compiled last (see the clause listing in the cell output).
showLastShaderDisassembly()
107.0

    #version 310 es
    precision mediump float;

    layout(local_size_x = 8, local_size_y = 2) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,1);
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      r0.x -= 5.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2);
    }
    
(9.985058991414952, 5695.697529967095)
FMAs: 82.95% (107 / 129)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *IMUL.i32 r0:t0, r60, 0x00000003 /* 0.000000 */
    +IADD.s32 t1, t, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r1
}

clause_3:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *NOP t0
    +FADD.f32 t1, r3, r3
    *FMA.f32 t0, r4, t1, #0.neg
    +FADD.f32 t1, r1, r1
    *FMA.f32 r5:t0, r2, t1, t0
    +NOP t1
    *FMA.f32 t0, r4, r1, #0.neg
    +FADD.f32 t1, r2, r2
    *FMA.f32 t0, r3, t1, t0
    +NOP t1
    *FMA.f32 r6:t0, r4, r1, t0
    +NOP t1
    *MKVEC.v2i16 t0, r0, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r7
}

clause_8:
ds(0) nbb attr ncph dwb(0) 
{
    *FMA.f32 r6:t0, r10, r7, r6
    +FADD.f32 t1, r7, r7
    *FMA.f32 r5:t0, r8, t1, r5
    +FADD.f32 t1, r8, r8
    *FMA.f32 r6:t0, r9, t1, r6
    +FADD.f32 t1, r9, r9
    *FMA.f32 r5:t0, r10, t1, r5
    +NOP t1
    *FMA.f32 t0, r10, r8, #0.neg
    +NOP t1
    *FMA.f32 r11:t0, r9, r7, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r0, 0x00000002 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r12
}

clause_14:
ds(0) nbb ncph 
{
    *FMA.f32 r0:t0, r10, r8, r11
    +NOP t1
    *FMA.f32 r6:t0, r10, r7, r6
    +NOP t1
    *FMA.f32 r11:t0, r10, r1, #0.neg
    +NOP t1
    *FMA.f32 r48:t0, r10, r3, #0.neg
    +NOP t1
    *FMA.f32 r49:t0, r10, r2, #0.neg
    +NOP t1
    *FMA.f32 r50:t0, r10, r4, #0.neg
    +NOP t1
    *FMA.f32 r51:t0, r10, r12, #0.neg
    +NOP t1
    *FMA.f32 r52:t0, r10, r13, #0.neg
    +NOP t1
}

clause_20:
ds(0) nbb ncph 
{
    *FMA.f32 r53:t0, r10, r14, #0.neg
    +NOP t1
    *FMA.f32 r54:t0, r10, r10, #0.neg
    +NOP t1
    *FMA.f32 r10:t0, r10, r15, #0.neg
    +NOP t1
    *FMA.f32 r11:t0, r9, r2, r11
    +NOP t1
    *FMA.f32 r48:t0, r9, r4, r48
    +NOP t1
    *FMA.f32 r49:t0, r9, r1, r49
    +NOP t1
    *FMA.f32 r50:t0, r9, r3, r50
    +NOP t1
    *FMA.f32 r51:t0, r9, r13, r51
    +NOP t1
}

clause_26:
ds(0) nbb ncph 
{
    *FMA.f32 r52:t0, r9, r12, r52
    +NOP t1
    *FMA.f32 r53:t0, r9, r15, r53
    +NOP t1
    *FMA.f32 r54:t0, r9, r9, r54
    +NOP t1
    *FMA.f32 r10:t0, r9, r14, r10
    +NOP t1
    *FMA.f32 r11:t0, r8, r3, r11
    +NOP t1
    *FMA.f32 r48:t0, r8, r1, r48
    +NOP t1
    *FMA.f32 r49:t0, r8, r4, r49
    +NOP t1
    *FMA.f32 r50:t0, r8, r2, r50
    +NOP t1
}

clause_32:
ds(0) nbb ncph 
{
    *FMA.f32 r51:t0, r8, r14, r51
    +NOP t1
    *FMA.f32 r52:t0, r8, r15, r52
    +NOP t1
    *FMA.f32 r53:t0, r8, r12, r53
    +NOP t1
    *FMA.f32 r54:t0, r8, r8, r54
    +NOP t1
    *FMA.f32 r8:t0, r8, r13, r10
    +NOP t1
    *FMA.f32 t0, r4, r2, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r1, t0
    +NOP t1
    *FMA.f32 r10:t0, r4, r2, t0
    +NOP t1
}

clause_38:
ds(0) nbb ncph 
{
    *FMA.f32 r11:t0, r7, r4, r11
    +NOP t1
    *FMA.f32 r55:t0, r4, r12, #0.neg
    +NOP t1
    *FMA.f32 r56:t0, r4, r13, #0.neg
    +NOP t1
    *FMA.f32 r57:t0, r4, r14, #0.neg
    +NOP t1
    *FMA.f32 r58:t0, r4, r4, #0.neg
    +NOP t1
    *FMA.f32 r4:t0, r4, r15, #0.neg
    +NOP t1
    *FMA.f32 r55:t0, r3, r13, r55
    +NOP t1
    *FMA.f32 r56:t0, r3, r12, r56
    +NOP t1
}

clause_44:
ds(0) nbb ncph 
{
    *FMA.f32 r57:t0, r3, r15, r57
    +NOP t1
    *FMA.f32 r58:t0, r3, r3, r58
    +NOP t1
    *FMA.f32 r4:t0, r3, r14, r4
    +NOP t1
    *FMA.f32 r55:t0, r2, r14, r55
    +NOP t1
    *FMA.f32 r56:t0, r2, r15, r56
    +NOP t1
    *FMA.f32 r57:t0, r2, r12, r57
    +NOP t1
    *FMA.f32 r58:t0, r2, r2, r58
    +NOP t1
    *FMA.f32 r4:t0, r2, r13, r4
    +NOP t1
}

clause_50:
ds(0) nbb ncph 
{
    *FMA.f32 r2:t0, r7, r2, r48
    +NOP t1
    *FMA.f32 r48:t0, r7, r3, r49
    +NOP t1
    *FMA.f32 r3:t0, r3, r1, r10
    +NOP t1
    *FMA.f32 r0:t0, r9, r7, r0
    +NOP t1
    *FMA.f32 r6:t0, r15, r12, r6
    +FADD.f32 t1, r12, r12
    *FMA.f32 r5:t0, r13, t1, r5
    +FADD.f32 t1, r13, r13
    *FMA.f32 r6:t0, r14, t1, r6
    +NOP t1
    *FMA.f32 r9:t0, r7, r1, r50
    +NOP t1
}

clause_56:
ds(0) nbb ncph 
{
    *FMA.f32 r10:t0, r1, r15, r55
    +NOP t1
    *FMA.f32 r49:t0, r1, r14, r56
    +NOP t1
    *FMA.f32 r50:t0, r1, r13, r57
    +NOP t1
    *FMA.f32 r55:t0, r1, r1, r58
    +NOP t1
    *FMA.f32 r1:t0, r1, r12, r4
    +NOP t1
    *FMA.f32 r4:t0, r7, r15, r51
    +NOP t1
    *FMA.f32 r51:t0, r7, r14, r52
    +NOP t1
    *FMA.f32 r52:t0, r7, r13, r53
    +NOP t1
}

clause_62:
ds(0) nbb ncph 
{
    *FMA.f32 r53:t0, r7, r7, r54
    +NOP t1
    *FMA.f32 r7:t0, r7, r12, r8
    +NOP t1
    *FMA.f32 t0, r11, r12, #0.neg
    +NOP t1
    *FMA.f32 t0, r48, r13, t0
    +NOP t1
    *FMA.f32 t0, r2, r14, t0
    +NOP t1
    *FMA.f32 t0, r9, r15, t0
    +NOP t1
    *FMA.f32 r8:t0, t0, 0x3e800000 /* 0.250000 */, r10
    +NOP t1
    *FMA.f32 r10:t0, r11, r14, #0.neg
    +NOP t1
}

clause_68:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r48, r15, r10
    +NOP t1
    *FMA.f32 r10:t0, r2, r12, t0
    +NOP t1
    *FMA.f32 t0, r15, r13, #0.neg
    +NOP t1
    *FMA.f32 r54:t0, r14, r12, t0
    +NOP t1
    *FMA.f32 t0, r11, r15, #0.neg
    +NOP t1
    *FMA.f32 r56:t0, r48, r14, t0
    +NOP t1
    *FMA.f32 t0, r15, r15, #0.neg
    +NOP t1
    *FMA.f32 r57:t0, r14, r14, t0
    +NOP t1
}

clause_74:
ds(0) nbb ncph 
{
    *FMA.f32 r10:t0, r9, r13, r10
    +NOP t1
    *FMA.f32 r54:t0, r15, r13, r54
    +NOP t1
    *FMA.f32 r56:t0, r2, r13, r56
    +NOP t1
    *FMA.f32 r57:t0, r13, r13, r57
    +NOP t1
    *FMA.f32 r13:t0, r11, r13, #0.neg
    +NOP t1
    *NOP t0
    +FADD.f32 r0:t1, r0, r3
    *FMA.f32 t0, r48, r12, r13
    +NOP t1
    *FMA.f32 r3:t0, r2, r15, t0
    +NOP t1
}

clause_80:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 t0, r9, r14, r3
    +NOP t1
    *FMA.f32 r3:t0, t0, 0x3e800000 /* 0.250000 */, r49
    +NOP t1
    *FMA.f32 r10:t0, r10, 0x3e800000 /* 0.250000 */, r50
    +NOP t1
    *FMA.f32 r13:t0, r14, r12, r54
    +FADD.f32 t1, r14, r14
    *FMA.f32 r5:t0, r15, t1, r5
    +NOP t1
    *FMA.f32 r6:t0, r15, r12, r6
    +NOP t1
    *FMA.f32 r14:t0, r9, r12, r56
    +NOP t1
    *NOP t0
    +FADD.f32 r4:t1, r11, r4
}

clause_86:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *NOP t0
    +FADD.f32 r2:t1, r2, r52
    *FADD.f32 t0, r9, 0xc0a00000 /* -5.000000 */
    +FADD.f32 r7:t1, t, r7
    *NOP t0
    +FADD.f32 r4:t1, r4, r8
    *FMA.f32 t0, r14, 0x3e800000 /* 0.250000 */, r1
    +FADD.f32 r1:t1, r7, t
    *NOP t0
    +MKVEC.v2i16 r7:t1, r60, r61
    *NOP t0
    +FADD.f32 r4:t1, r4, r6
    *FADD.f32 t0, r2, r10
    +FADD.f32 r2:t1, t, r5
    *DTSEL_IMM.attribute_1 t0, r7
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r5
}

clause_93:
ds(0) eos store 
{
    *NOP t0
    +FADD.f32 r11:t1, r48, r51
    *NOP t0
    +FADD.f32 r15:t1, r53, r55
    *NOP t0
    +FADD.f32 r0:t1, r0, r13
    *NOP t0
    +FADD.f32 r3:t1, r11, r3
    *FMA.f32 t0, r12, r12, r57
    +FADD.f32 t1, r15, t
    *NOP t0
    +FADD.f32 r1:t1, r1, t1
    *FADD.f32 t0, r3, r0
    +MOV.i32 r3:t1, t
    *NOP t0
    +ST_CVT.v4 t1, r5, r6, r7, @r1
}

shader14645 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Sweep workgroup sizes. grid_run/grid_show are helpers defined elsewhere;
# presumably grid_run calls test() for every (localx, localy) combination.
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# NOTE(review): colorfun negates each value, presumably to invert the color
# scale of the rendered grid — confirm against grid_show.
grid_show(r, colorfun=lambda x: -x)

Test 4: 169 FMAs per invocation (3 vec4 loads + 1 vec4 store)

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Time a compute shader that performs 169 FMAs per invocation.

    Each invocation loads three adjacent RGBA32F texels (x, y, z) from a
    ``w*3`` x ``h`` input image filled with ones, evaluates a fixed set of
    dot products and stores one RGBA32F texel into a ``w`` x ``h`` output
    image.  With the all-ones input the four components of every output
    texel add up to 169 (ten vec4 terms of 4.0 each, plus the 9.0 added to
    ``r0.x``), so the sum over the whole output texture is used as the
    number of multiply-accumulates executed by one dispatch.

    Args:
        wh: if given, overrides both ``w`` and ``h`` (square problem).
        w, h: output image size in texels.
        localsz: if given, an ``(x, y)`` pair overriding ``localx``/``localy``.
        localx, localy: workgroup size written into the shader layout.
        unrollx: accepted for signature compatibility; not used here.
        unrolly: row stride used for the input coordinates; the number of
            dispatched rows is divided by it.
        membw: when true, also print the component sum of output texel
            [0, 0] and return the memory bandwidth.

    Returns:
        GFLOPS-style rate ``MACs / elapsed / 1e9``; with ``membw`` a tuple
        ``(rate, MiB_per_second)`` where the traffic is counted as four
        16-byte texels (3 loads + 1 store) per output texel.

    Side effects:
        Rebinds the module globals ``intex``, ``outtex`` and ``source``.
    """
    global intex, outtex, source
    if wh is not None:
        w = h = wh
    if localsz is not None:
        localx, localy = localsz
    reps = 50  # number of timed dispatches the elapsed time is averaged over
    # Input image: every texel is (1, 1, 1, 1) as float32.
    src = np.ones((h, w*3, 4), dtype=np.float32)
    intex = createTexture(w*3, h, fmt=gl.GL_RGBA32F, output=False, src=src)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,{unrolly});
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      r0.x += 9.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2);
    }}
    """
    computeShader(source)
    start = time.perf_counter()
    for _ in range(reps):
        gl.glDispatchCompute(w//localx, h//unrolly//localy, 1)
        # make sure writing to image has finished before read
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    # Block until the GPU has drained all dispatches so the wall-clock time is meaningful.
    gl.glFinish()
    elapsed = (time.perf_counter() - start) / reps
    # Read back exactly the w x h texels that outtex was created with
    # (the input image is the one that is w*3 wide, not the output).
    output = downloadTextureFloat(outtex, w, h)
    # Sum of all output components == number of MACs performed by one dispatch.
    MACs = output.sum()

    if not membw:
        return MACs / elapsed / 1e9
    print(output[0,0].sum())
    # 4 texels (3 loads + 1 store) * 4 components * 4 bytes per output texel, in MiB/s.
    return MACs / elapsed / 1e9, 4*w*h*4*4 / elapsed / 1024 / 1024

# test(localsz=(1,1), unrollx=4, unrolly=4)
# Two runs with default arguments whose results are discarded
# (presumably warm-up before the measured run — confirm).
test()
test()
# Measured run: 8x2 workgroup; membw=True makes test() return (rate, MiB/s)
# and print the component sum of output texel [0, 0].
gflops = test(localx=8, localy=2, membw=True)
# `source` is the module global that test() fills with the generated GLSL.
print(source)
print(gflops)
showLastShaderDisassembly()
169.0

    #version 310 es
    precision mediump float;

    layout(local_size_x = 8, local_size_y = 2) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(3,1);
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      r0.x += 9.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2);
    }
    
(17.302189172289815, 6248.768163300396)
FMAs: 85.79% (169 / 197)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *IMUL.i32 r0:t0, r60, 0x00000003 /* 0.000000 */
    +NOP t1
    *MKVEC.v2i16 t0, t0, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r1
}

clause_3:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *FMA.f32 t0, r4, r1, #0.neg
    +FADD.f32 t1, r2, r2
    *FMA.f32 r5:t0, r3, t1, t0
    +FADD.f32 t1, r3, r3
    *FMA.f32 t0, r4, t1, #0.neg
    +FADD.f32 t1, r1, r1
    *FMA.f32 r6:t0, r2, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r0, 0x00000002 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r7
}

clause_8:
ds(0) nbb attr ncph dwb(0) 
{
    *FMA.f32 t0, r10, r7, #0.neg
    +FADD.f32 t1, r8, r8
    *FMA.f32 r11:t0, r9, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r0, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r12
}

clause_12:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r15, r12, #0.neg
    +FADD.f32 t1, r13, r13
    *FMA.f32 r0:t0, r14, t1, t0
    +FADD.f32 t1, r14, r14
    *FMA.f32 t0, r15, t1, #0.neg
    +FADD.f32 t1, r12, r12
    *FMA.f32 r16:t0, r13, t1, t0
    +NOP t1
}

clause_15:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r4, r2, #0.neg
    +NOP t1
    *FMA.f32 t0, r3, r1, t0
    +NOP t1
    *FMA.f32 r17:t0, r4, r2, t0
    +NOP t1
    *FMA.f32 r5:t0, r4, r1, r5
    +NOP t1
    *FMA.f32 r17:t0, r3, r1, r17
    +NOP t1
    *FMA.f32 t0, r4, r4, #0.neg
    +NOP t1
    *FMA.f32 r18:t0, r3, r3, t0
    +NOP t1
    *FMA.f32 r19:t0, r5, r1, #0.neg
    +NOP t1
}

clause_21:
ds(0) nbb ncph 
{
    *FMA.f32 r19:t0, r17, r2, r19
    +NOP t1
    *FMA.f32 r18:t0, r2, r2, r18
    +NOP t1
    *FMA.f32 t0, r5, r2, #0.neg
    +NOP t1
    *FMA.f32 r20:t0, r17, r1, t0
    +NOP t1
    *FMA.f32 r19:t0, r6, r3, r19
    +NOP t1
    *FMA.f32 r18:t0, r1, r1, r18
    +NOP t1
    *FMA.f32 r20:t0, r6, r4, r20
    +NOP t1
    *FMA.f32 r19:t0, r18, r4, r19
    +NOP t1
}

clause_27:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r5, r3, #0.neg
    +NOP t1
    *FMA.f32 r21:t0, r17, r4, t0
    +NOP t1
    *FMA.f32 r22:t0, r5, r4, #0.neg
    +NOP t1
    *FMA.f32 r23:t0, r4, r12, #0.neg
    +NOP t1
    *FMA.f32 r24:t0, r4, r13, #0.neg
    +NOP t1
    *FMA.f32 r25:t0, r4, r14, #0.neg
    +NOP t1
    *FMA.f32 r26:t0, r4, r15, #0.neg
    +NOP t1
    *FMA.f32 r27:t0, r4, r7, #0.neg
    +NOP t1
}

clause_33:
ds(0) nbb ncph 
{
    *FMA.f32 r28:t0, r4, r8, #0.neg
    +NOP t1
    *FMA.f32 r29:t0, r4, r9, #0.neg
    +NOP t1
    *FMA.f32 r4:t0, r4, r10, #0.neg
    +NOP t1
    *FMA.f32 r20:t0, r18, r3, r20
    +NOP t1
    *FMA.f32 r22:t0, r17, r3, r22
    +NOP t1
    *FMA.f32 r23:t0, r3, r13, r23
    +NOP t1
    *FMA.f32 r24:t0, r3, r12, r24
    +NOP t1
    *FMA.f32 r25:t0, r3, r15, r25
    +NOP t1
}

clause_39:
ds(0) nbb ncph 
{
    *FMA.f32 r26:t0, r3, r14, r26
    +NOP t1
    *FMA.f32 r27:t0, r3, r8, r27
    +NOP t1
    *FMA.f32 r28:t0, r3, r7, r28
    +NOP t1
    *FMA.f32 r29:t0, r3, r10, r29
    +NOP t1
    *FMA.f32 r3:t0, r3, r9, r4
    +NOP t1
    *FMA.f32 t0, r10, r8, #0.neg
    +NOP t1
    *FMA.f32 t0, r9, r7, t0
    +NOP t1
    *FMA.f32 r4:t0, r10, r8, t0
    +NOP t1
}

clause_45:
ds(0) nbb ncph 
{
    *FMA.f32 r11:t0, r10, r7, r11
    +NOP t1
    *FMA.f32 r4:t0, r9, r7, r4
    +NOP t1
    *FMA.f32 r19:t0, r19, 0x3e800000 /* 0.250000 */, r11
    +NOP t1
    *FMA.f32 r30:t0, r11, r7, #0.neg
    +NOP t1
    *FMA.f32 r31:t0, r11, r8, #0.neg
    +NOP t1
    *FMA.f32 r32:t0, r11, r9, #0.neg
    +NOP t1
    *FMA.f32 r11:t0, r11, r10, #0.neg
    +NOP t1
    *FMA.f32 r30:t0, r4, r8, r30
    +NOP t1
}

clause_51:
ds(0) nbb ncph 
{
    *FMA.f32 r31:t0, r4, r7, r31
    +NOP t1
    *FMA.f32 r32:t0, r4, r10, r32
    +NOP t1
    *FMA.f32 r11:t0, r4, r9, r11
    +NOP t1
    *FMA.f32 r4:t0, r20, 0x3e800000 /* 0.250000 */, r4
    +NOP t1
    *FMA.f32 t0, r15, r13, #0.neg
    +NOP t1
    *FMA.f32 t0, r14, r12, t0
    +NOP t1
    *FMA.f32 r20:t0, r15, r13, t0
    +NOP t1
    *FMA.f32 r0:t0, r15, r12, r0
    +NOP t1
}

clause_57:
ds(0) nbb ncph 
{
    *FMA.f32 r20:t0, r14, r12, r20
    +NOP t1
    *FMA.f32 r33:t0, r0, r12, #0.neg
    +NOP t1
    *FMA.f32 r34:t0, r0, r13, #0.neg
    +NOP t1
    *FMA.f32 r35:t0, r0, r14, #0.neg
    +FADD.f32 r5:t1, r5, r0
    *FMA.f32 r0:t0, r0, r15, #0.neg
    +NOP t1
    *FMA.f32 r33:t0, r20, r13, r33
    +NOP t1
    *FMA.f32 r34:t0, r20, r12, r34
    +NOP t1
    *FMA.f32 r35:t0, r20, r15, r35
    +NOP t1
}

clause_63:
ds(0) nbb ncph 
{
    *FMA.f32 r0:t0, r20, r14, r0
    +NOP t1
    *NOP t0
    +FADD.f32 r17:t1, r17, r20
    *FADD.f32 t0, r6, r16
    +FADD.f32 r36:t1, r9, r9
    *FMA.f32 r20:t0, r10, t1, t0
    +NOP t1
    *FMA.f32 t0, r10, r36, #0.neg
    +FADD.f32 r37:t1, r7, r7
    *FMA.f32 r36:t0, r8, t1, t0
    +NOP t1
    *FMA.f32 r20:t0, r8, r37, r20
    +NOP t1
    *FMA.f32 r21:t0, r6, r1, r21
    +NOP t1
}

clause_69:
ds(0) nbb ncph 
{
    *FMA.f32 r6:t0, r6, r2, r22
    +NOP t1
    *FMA.f32 r22:t0, r16, r14, r33
    +NOP t1
    *FMA.f32 r33:t0, r16, r15, r34
    +NOP t1
    *FMA.f32 r34:t0, r16, r12, r35
    +NOP t1
    *FMA.f32 r0:t0, r16, r13, r0
    +NOP t1
    *FMA.f32 r16:t0, r2, r14, r23
    +NOP t1
    *FMA.f32 r23:t0, r2, r15, r24
    +NOP t1
    *FMA.f32 r21:t0, r18, r2, r21
    +NOP t1
}

clause_75:
ds(0) nbb ncph 
{
    *FMA.f32 r24:t0, r2, r12, r25
    +NOP t1
    *FMA.f32 r25:t0, r2, r13, r26
    +NOP t1
    *FMA.f32 r26:t0, r2, r9, r27
    +NOP t1
    *FMA.f32 r27:t0, r2, r10, r28
    +NOP t1
    *FMA.f32 r28:t0, r2, r7, r29
    +NOP t1
    *FMA.f32 r2:t0, r2, r8, r3
    +NOP t1
    *FMA.f32 r3:t0, r36, r9, r30
    +NOP t1
    *FMA.f32 r29:t0, r36, r10, r31
    +NOP t1
}

clause_81:
ds(0) nbb ncph 
{
    *FMA.f32 r30:t0, r36, r7, r32
    +NOP t1
    *FMA.f32 r11:t0, r36, r8, r11
    +NOP t1
    *NOP t0
    +FADD.f32 r4:t1, r17, r4
    *FMA.f32 t0, r15, r15, #0.neg
    +NOP t1
    *FMA.f32 t0, r14, r14, t0
    +NOP t1
    *FMA.f32 t0, r13, r13, t0
    +NOP t1
    *FMA.f32 r17:t0, r12, r12, t0
    +NOP t1
    *FMA.f32 r16:t0, r1, r15, r16
    +NOP t1
}

clause_87:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r5:t1, r5, r19
    *FMA.f32 r19:t0, r17, r15, r22
    +NOP t1
    *FMA.f32 r22:t0, r15, r7, #0.neg
    +NOP t1
    *FMA.f32 r31:t0, r15, r8, #0.neg
    +NOP t1
    *FMA.f32 r32:t0, r15, r9, #0.neg
    +NOP t1
    *FMA.f32 r15:t0, r15, r10, #0.neg
    +NOP t1
    *FMA.f32 r22:t0, r14, r8, r22
    +NOP t1
    *FMA.f32 r31:t0, r14, r7, r31
    +NOP t1
}

clause_93:
ds(0) nbb ncph 
{
    *FMA.f32 r32:t0, r14, r10, r32
    +NOP t1
    *FMA.f32 r15:t0, r14, r9, r15
    +NOP t1
    *FMA.f32 r23:t0, r1, r14, r23
    +NOP t1
    *FMA.f32 r14:t0, r17, r14, r33
    +NOP t1
    *FMA.f32 r20:t0, r21, 0x3e800000 /* 0.250000 */, r20
    +NOP t1
    *FMA.f32 r21:t0, r13, r9, r22
    +NOP t1
    *FMA.f32 r22:t0, r13, r10, r31
    +NOP t1
    *FMA.f32 r31:t0, r13, r7, r32
    +NOP t1
}

clause_99:
ds(0) nbb ncph 
{
    *FMA.f32 r15:t0, r13, r8, r15
    +NOP t1
    *FMA.f32 r24:t0, r1, r13, r24
    +NOP t1
    *FMA.f32 r13:t0, r17, r13, r34
    +NOP t1
    *FMA.f32 t0, r10, r10, #0.neg
    +NOP t1
    *FMA.f32 t0, r9, r9, t0
    +NOP t1
    *FMA.f32 t0, r8, r8, t0
    +NOP t1
    *FMA.f32 r32:t0, r7, r7, t0
    +NOP t1
    *FMA.f32 r6:t0, r18, r1, r6
    +NOP t1
}

clause_105:
ds(0) nbb ncph 
{
    *FMA.f32 r6:t0, r6, 0x3e800000 /* 0.250000 */, r32
    +NOP t1
    *FMA.f32 r0:t0, r17, r12, r0
    +NOP t1
    *NOP t0
    +FADD.f32 r18:t1, r18, r17
    *FMA.f32 r17:t0, r1, r12, r25
    +NOP t1
    *NOP t0
    +FADD.f32 r6:t1, r18, r6
    *FMA.f32 r18:t0, r12, r10, r21
    +NOP t1
    *FMA.f32 r21:t0, r12, r9, r22
    +NOP t1
    *FMA.f32 r22:t0, r12, r8, r31
    +NOP t1
}

clause_111:
ds(0) nbb ncph 
{
    *FMA.f32 r12:t0, r12, r7, r15
    +NOP t1
    *FMA.f32 r15:t0, r1, r10, r26
    +NOP t1
    *FMA.f32 r25:t0, r1, r9, r27
    +NOP t1
    *FMA.f32 r26:t0, r1, r8, r28
    +NOP t1
    *FMA.f32 r1:t0, r1, r7, r2
    +NOP t1
    *FMA.f32 t0, r16, r7, #0.neg
    +NOP t1
    *FMA.f32 r2:t0, r23, r8, t0
    +NOP t1
    *FMA.f32 r27:t0, r16, r8, #0.neg
    +NOP t1
}

clause_117:
ds(0) nbb ncph 
{
    *FMA.f32 r27:t0, r23, r7, r27
    +NOP t1
    *FMA.f32 r2:t0, r24, r9, r2
    +NOP t1
    *FMA.f32 r27:t0, r24, r10, r27
    +NOP t1
    *FMA.f32 t0, r16, r9, #0.neg
    +NOP t1
    *FMA.f32 r28:t0, r23, r10, t0
    +NOP t1
    *FMA.f32 r2:t0, r17, r10, r2
    +NOP t1
    *FMA.f32 r3:t0, r32, r10, r3
    +NOP t1
    *FMA.f32 r10:t0, r16, r10, #0.neg
    +NOP t1
}

clause_123:
ds(0) nbb ncph 
{
    *FMA.f32 r10:t0, r23, r9, r10
    +NOP t1
    *FMA.f32 r27:t0, r17, r9, r27
    +NOP t1
    *FMA.f32 r9:t0, r32, r9, r29
    +NOP t1
    *FMA.f32 r28:t0, r24, r7, r28
    +NOP t1
    *FMA.f32 r10:t0, r24, r8, r10
    +NOP t1
    *FMA.f32 r28:t0, r17, r8, r28
    +NOP t1
    *FMA.f32 r8:t0, r32, r8, r30
    +NOP t1
    *FMA.f32 r11:t0, r32, r7, r11
    +NOP t1
}

clause_129:
ds(0) nbb ncph 
{
    *FMA.f32 r7:t0, r17, r7, r10
    +NOP t1
    *FMA.f32 r5:t0, r19, 0x3e800000 /* 0.250000 */, r5
    +NOP t1
    *FMA.f32 r2:t0, r2, 0x3e800000 /* 0.250000 */, r18
    +NOP t1
    *FMA.f32 r4:t0, r14, 0x3e800000 /* 0.250000 */, r4
    +NOP t1
    *FMA.f32 r14:t0, r27, 0x3e800000 /* 0.250000 */, r21
    +NOP t1
    *FMA.f32 r13:t0, r13, 0x3e800000 /* 0.250000 */, r20
    +NOP t1
    *NOP t0
    +FADD.f32 r10:t1, r16, r15
    *FMA.f32 r16:t0, r28, 0x3e800000 /* 0.250000 */, r22
    +NOP t1
}

clause_135:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 r0:t0, r0, 0x3e800000 /* 0.250000 */, r6
    +NOP t1
    *FMA.f32 r6:t0, r7, 0x3e800000 /* 0.250000 */, r12
    +NOP t1
    *FMA.f32 r3:t0, r3, 0x3e800000 /* 0.250000 */, r5
    +NOP t1
    *FADD.f32 t0, r17, 0x41100000 /* 9.000000 */
    +FADD.f32 r1:t1, t, r1
    *FMA.f32 r7:t0, r8, 0x3e800000 /* 0.250000 */, r13
    +NOP t1
    *NOP t0
    +FADD.f32 r15:t1, r23, r25
    *NOP t0
    +FADD.f32 r18:t1, r24, r26
    *NOP t0
    +FADD.f32 r2:t1, r10, r2
}

clause_141:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *NOP t0
    +FADD.f32 r5:t1, r15, r14
    *NOP t0
    +FADD.f32 r8:t1, r18, r16
    *NOP t0
    +FADD.f32 r1:t1, r1, r6
    *NOP t0
    +MKVEC.v2i16 r6:t1, r60, r61
    *NOP t0
    +FADD.f32 r2:t1, r2, r3
    *FMA.f32 t0, r9, 0x3e800000 /* 0.250000 */, r4
    +FADD.f32 r3:t1, r5, t
    *NOP t0
    +FADD.f32 r4:t1, r8, r7
    *DTSEL_IMM.attribute_1 t0, r6
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r5
}

clause_148:
ds(0) eos store 
{
    *FMA.f32 t0, r11, 0x3e800000 /* 0.250000 */, r0
    +NOP t1
    *FADD.f32 t0, r1, t0
    +MOV.i32 r8:t1, t
    *MOV.i32 r9:t0, r4
    +MOV.i32 r10:t1, r3
    *NOP t0
    +MOV.i32 r11:t1, r2
    *NOP t0
    +ST_CVT.v4 t1, r5, r6, r7, @r8
}

shader16595 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Sweep the current `test` over local_size_x / local_size_y values 1..16
# (presumably grid_run calls it once per (localx, localy) pair — confirm in its definition).
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# colorfun negates each value before it is used for colouring.
grid_show(r, colorfun=lambda x: -x)
ndecode.dump.0278
pandecode: dump command stream to file pandecode.dump.0279
pandecode: dump command stream to file pandecode.dump.0280
pandecode: dump command stream to file pandecode.dump.0281
pandecode: dump command stream to file pandecode.dump.0282
pandecode: dump command stream to file pandecode.dump.0283
pandecode: dump command stream to file pandecode.dump.0284
pandecode: dump command stream to file pandecode.dump.0285
pandecode: dump command stream to file pandecode.dump.0286
pandecode: dump command stream to file pandecode.dump.0287
pandecode: dump command stream to file pandecode.dump.0288
pandecode: dump command stream to file pandecode.dump.0289
pandecode: dump command stream to file pandecode.dump.0290
pandecode: dump command stream to file pandecode.dump.0291
pandecode: dump command stream to file pandecode.dump.0292
pandecode: dump command stream to file pandecode.dump.0293
pandecode: dump command stream to file pandecode.dump.0294
pandecode: dump command stream to file pandecode.dump.0295
pandecode: dump command stream to file pandecode.dump.0296
pandecode: dump command stream to file pandecode.dump.0297
pandecode: dump command stream to file pandecode.dump.0298
pandecode: dump command stream to file pandecode.dump.0299
pandecode: dump command stream to file pandecode.dump.0300
pandecode: dump command stream to file pandecode.dump.0301
pandecode: dump command stream to file pandecode.dump.0302
pandecode: dump command stream to file pandecode.dump.0303
pandecode: dump command stream to file pandecode.dump.0304
pandecode: dump command stream to file pandecode.dump.0305
pandecode: dump command stream to file pandecode.dump.0306
pandecode: dump command stream to file pandecode.dump.0307
pandecode: dump command stream to file pandecode.dump.0308
pandecode: dump command stream to file pandecode.dump.0309
pandecode: dump command stream to file pandecode.dump.0310
pandecode: dump command stream to file pandecode.dump.0311
pandecode: dump command stream to file pandecode.dump.0312
pandecode: dump command stream to file pandecode.dump.0313
pandecode: dump command stream to file pandecode.dump.0314
pandecode: dump command stream to file pandecode.dump.0315
pandecode: dump command stream to file pandecode.dump.0316
pandecode: dump command stream to file pandecode.dump.0317
pandecode: dump command stream to file pandecode.dump.0318
pandecode: dump command stream to file pandecode.dump.0319
pandecode: dump command stream to file pandecode.dump.0320
pandecode: dump command stream to file pandecode.dump.0321
pandecode: dump command stream to file pandecode.dump.0322
pandecode: dump command stream to file pandecode.dump.0323
pandecode: dump command stream to file pandecode.dump.0324
pandecode: dump command stream to file pandecode.dump.0325
pandecode: dump command stream to file pandecode.dump.0326
pandecode: dump command stream to file pandecode.dump.0327
pandecode: dump command stream to file pandecode.dump.0328
pandecode: dump command stream to file pandecode.dump.0329
pandecode: dump command stream to file pandecode.dump.0330
pandecode: dump command stream to file pandecode.dump.0331
pandecode: dump command stream to file pandecode.dump.0332
pandecode: dump command stream to file pandecode.dump.0333
pandecode: dump command stream to file pandecode.dump.0334
pandecode: dump command stream to file pandecode.dump.0335
pandecode: dump command stream to file pandecode.dump.0336
pandecode: dump command stream to file pandecode.dump.0337
pandecode: dump command stream to file pandecode.dump.0338
pandecode: dump command stream to file pandecode.dump.0339
pandecode: dump command stream to file pandecode.dump.0340
pandecode: dump command stream to file pandecode.dump.0341
pandecode: dump command stream to file pandecode.dump.0342
pandecode: dump command stream to file pandecode.dump.0343
pandecode: dump command stream to file pandecode.dump.0344
pandecode: dump command stream to file pandecode.dump.0345
pandecode: dump command stream to file pandecode.dump.0346
pandecode: dump command stream to file pandecode.dump.0347
pandeco

Test 5: 169 FMAs per invocation (3 vec4 loads with overlap + 1 vec4 store)

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Time the 169-FMA dot-product shader with overlapping input loads.

    Every invocation reads the three consecutive RGBA32F texels starting at
    its own x coordinate from a ``w+2`` x ``h`` all-ones input image (so
    neighbouring invocations share two of their three loads) and writes one
    RGBA32F texel to a ``w`` x ``h`` output image.  The sum of the output
    texture is taken as the number of multiply-accumulates per dispatch.

    Args:
        wh: if given, used for both ``w`` and ``h``.
        w, h: output image size in texels.
        localsz: optional ``(x, y)`` pair replacing ``localx``/``localy``.
        localx, localy: workgroup size inserted into the shader layout.
        unrollx: part of the common signature; not used in this function.
        unrolly: row stride for the input coordinates and divisor of the
            dispatched row count.
        membw: if true, print the component sum of output texel [0, 0] and
            additionally return a bandwidth figure.

    Returns:
        ``MACs / elapsed / 1e9``, or with ``membw`` the tuple
        ``(MACs / elapsed / 1e9, 4*w*h*4*4 / elapsed / 1024 / 1024)``.

    Side effects:
        Rebinds the module globals ``intex``, ``outtex`` and ``source``.
    """
    global intex, outtex, source
    if wh is not None:
        w = h = wh
    if localsz is not None:
        localx, localy = localsz
    repetitions = 50

    # Build the all-ones input: one row of scalars -> row of RGBA texels -> h rows.
    ones_row = np.full(w+2, 1, dtype=np.uint8)
    texel_row = np.stack([ones_row]*4, axis=-1)
    src = np.stack([texel_row]*h).astype(np.float32)

    intex = createTexture(w+2, h, fmt=gl.GL_RGBA32F, output=False, src=src)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)
    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(1,{unrolly});
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      r0.x += 9.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2);
    }}
    """
    computeShader(source)

    groups_x = w//localx
    groups_y = h//unrolly//localy
    t0 = time.perf_counter()
    for _ in range(repetitions):
        gl.glDispatchCompute(groups_x, groups_y, 1)
        # image writes must be complete before the next access
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    # wait for the GPU so the measured interval covers all dispatches
    gl.glFinish()
    elapsed = (time.perf_counter() - t0)/repetitions

    output = downloadTextureFloat(outtex, w, h)
    # the texture sum stands in for the MAC count of one dispatch
    MACs = output.sum()

    if not membw:
        return MACs / elapsed / 1e9
    print(output[0,0].sum())
    return MACs / elapsed / 1e9, 4*w*h*4*4 / elapsed / 1024 / 1024

# test(localsz=(1,1), unrollx=4, unrolly=4)
# Two runs with default arguments whose results are discarded
# (presumably warm-up before the measured run — confirm).
test()
test()
# Measured run: 16x2 workgroup; membw=True makes test() return (rate, MiB/s)
# and print the component sum of output texel [0, 0].
gflops = test(localx=16, localy=2, membw=True)
# `source` is the module global that test() fills with the generated GLSL.
print(source)
print(gflops)
showLastShaderDisassembly()
169.0

    #version 310 es
    precision mediump float;

    layout(local_size_x = 16, local_size_y = 2) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(1,1);
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      r0.x += 9.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy), r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2);
    }
    
(23.881990676008602, 8625.094866693657)
FMAs: 87.56% (169 / 193)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *MKVEC.v2i16 r2:t0, r60, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r3
}

clause_1:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *FMA.f32 t0, r6, r3, #0.neg
    +FADD.f32 t1, r4, r4
    *FMA.f32 r7:t0, r5, t1, t0
    +FADD.f32 t1, r5, r5
    *FMA.f32 t0, r6, t1, #0.neg
    +FADD.f32 t1, r3, r3
    *FMA.f32 r8:t0, r4, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r60, 0x00000002 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r9
}

clause_6:
ds(0) nbb attr ncph dwb(0) 
{
    *FMA.f32 t0, r12, r9, #0.neg
    +FADD.f32 t1, r10, r10
    *FMA.f32 r0:t0, r11, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r60, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r13
}

clause_10:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r16, r13, #0.neg
    +FADD.f32 t1, r14, r14
    *FMA.f32 r1:t0, r15, t1, t0
    +FADD.f32 t1, r15, r15
    *FMA.f32 t0, r16, t1, #0.neg
    +FADD.f32 t1, r13, r13
    *FMA.f32 r17:t0, r14, t1, t0
    +NOP t1
    *FMA.f32 t0, r6, r4, #0.neg
    +NOP t1
    *FMA.f32 r18:t0, r5, r3, t0
    +NOP t1
}

clause_15:
ds(0) nbb ncph 
{
    *FMA.f32 r18:t0, r6, r4, r18
    +NOP t1
    *FMA.f32 r7:t0, r6, r3, r7
    +NOP t1
    *FMA.f32 r18:t0, r5, r3, r18
    +NOP t1
    *FMA.f32 t0, r6, r6, #0.neg
    +NOP t1
    *FMA.f32 r19:t0, r5, r5, t0
    +NOP t1
    *FMA.f32 t0, r7, r3, #0.neg
    +NOP t1
    *FMA.f32 r20:t0, r18, r4, t0
    +NOP t1
    *FMA.f32 r19:t0, r4, r4, r19
    +NOP t1
}

clause_21:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r7, r4, #0.neg
    +NOP t1
    *FMA.f32 r21:t0, r18, r3, t0
    +NOP t1
    *FMA.f32 r20:t0, r8, r5, r20
    +NOP t1
    *FMA.f32 r19:t0, r3, r3, r19
    +NOP t1
    *FMA.f32 r21:t0, r8, r6, r21
    +NOP t1
    *FMA.f32 r20:t0, r19, r6, r20
    +NOP t1
    *FMA.f32 t0, r7, r5, #0.neg
    +NOP t1
    *FMA.f32 r22:t0, r18, r6, t0
    +NOP t1
}

clause_27:
ds(0) nbb ncph 
{
    *FMA.f32 r23:t0, r7, r6, #0.neg
    +NOP t1
    *FMA.f32 r24:t0, r6, r13, #0.neg
    +NOP t1
    *FMA.f32 r25:t0, r6, r14, #0.neg
    +NOP t1
    *FMA.f32 r26:t0, r6, r15, #0.neg
    +NOP t1
    *FMA.f32 r27:t0, r6, r16, #0.neg
    +NOP t1
    *FMA.f32 r28:t0, r6, r9, #0.neg
    +NOP t1
    *FMA.f32 r29:t0, r6, r10, #0.neg
    +NOP t1
    *FMA.f32 r30:t0, r6, r11, #0.neg
    +NOP t1
}

clause_33:
ds(0) nbb ncph 
{
    *FMA.f32 r6:t0, r6, r12, #0.neg
    +NOP t1
    *FMA.f32 r21:t0, r19, r5, r21
    +NOP t1
    *FMA.f32 r23:t0, r18, r5, r23
    +NOP t1
    *FMA.f32 r24:t0, r5, r14, r24
    +NOP t1
    *FMA.f32 r25:t0, r5, r13, r25
    +NOP t1
    *FMA.f32 r26:t0, r5, r16, r26
    +NOP t1
    *FMA.f32 r27:t0, r5, r15, r27
    +NOP t1
    *FMA.f32 r28:t0, r5, r10, r28
    +NOP t1
}

clause_39:
ds(0) nbb ncph 
{
    *FMA.f32 r29:t0, r5, r9, r29
    +NOP t1
    *FMA.f32 r30:t0, r5, r12, r30
    +NOP t1
    *FMA.f32 r5:t0, r5, r11, r6
    +NOP t1
    *FMA.f32 t0, r12, r10, #0.neg
    +NOP t1
    *FMA.f32 t0, r11, r9, t0
    +NOP t1
    *FMA.f32 r6:t0, r12, r10, t0
    +NOP t1
    *FMA.f32 r0:t0, r12, r9, r0
    +NOP t1
    *FMA.f32 r6:t0, r11, r9, r6
    +NOP t1
}

clause_45:
ds(0) nbb ncph 
{
    *FMA.f32 r20:t0, r20, 0x3e800000 /* 0.250000 */, r0
    +NOP t1
    *FMA.f32 r31:t0, r0, r9, #0.neg
    +NOP t1
    *FMA.f32 r32:t0, r0, r10, #0.neg
    +NOP t1
    *FMA.f32 r33:t0, r0, r11, #0.neg
    +NOP t1
    *FMA.f32 r0:t0, r0, r12, #0.neg
    +NOP t1
    *FMA.f32 r31:t0, r6, r10, r31
    +NOP t1
    *FMA.f32 r32:t0, r6, r9, r32
    +NOP t1
    *FMA.f32 r33:t0, r6, r12, r33
    +NOP t1
}

clause_51:
ds(0) nbb ncph 
{
    *FMA.f32 r0:t0, r6, r11, r0
    +NOP t1
    *FMA.f32 r6:t0, r21, 0x3e800000 /* 0.250000 */, r6
    +NOP t1
    *FMA.f32 t0, r16, r14, #0.neg
    +NOP t1
    *FMA.f32 t0, r15, r13, t0
    +NOP t1
    *FMA.f32 r21:t0, r16, r14, t0
    +NOP t1
    *FMA.f32 r1:t0, r16, r13, r1
    +NOP t1
    *FMA.f32 r21:t0, r15, r13, r21
    +NOP t1
    *FMA.f32 r34:t0, r1, r13, #0.neg
    +NOP t1
}

clause_57:
ds(0) nbb ncph 
{
    *FMA.f32 r35:t0, r1, r14, #0.neg
    +NOP t1
    *FMA.f32 r36:t0, r1, r15, #0.neg
    +FADD.f32 r7:t1, r7, r1
    *FMA.f32 r1:t0, r1, r16, #0.neg
    +NOP t1
    *FMA.f32 r34:t0, r21, r14, r34
    +NOP t1
    *FMA.f32 r35:t0, r21, r13, r35
    +NOP t1
    *FMA.f32 r36:t0, r21, r16, r36
    +NOP t1
    *FMA.f32 r1:t0, r21, r15, r1
    +NOP t1
    *NOP t0
    +FADD.f32 r18:t1, r18, r21
}

clause_63:
ds(0) nbb ncph 
{
    *FADD.f32 t0, r8, r17
    +FADD.f32 r37:t1, r11, r11
    *FMA.f32 r21:t0, r12, t1, t0
    +NOP t1
    *FMA.f32 t0, r12, r37, #0.neg
    +FADD.f32 r38:t1, r9, r9
    *FMA.f32 r37:t0, r10, t1, t0
    +NOP t1
    *FMA.f32 r21:t0, r10, r38, r21
    +NOP t1
    *FMA.f32 r22:t0, r8, r3, r22
    +NOP t1
    *FMA.f32 r8:t0, r8, r4, r23
    +NOP t1
    *FMA.f32 r23:t0, r17, r15, r34
    +NOP t1
}

clause_69:
ds(0) nbb ncph 
{
    *FMA.f32 r34:t0, r17, r16, r35
    +NOP t1
    *FMA.f32 r35:t0, r17, r13, r36
    +NOP t1
    *FMA.f32 r1:t0, r17, r14, r1
    +NOP t1
    *FMA.f32 r17:t0, r4, r15, r24
    +NOP t1
    *FMA.f32 r24:t0, r4, r16, r25
    +NOP t1
    *FMA.f32 r22:t0, r19, r4, r22
    +NOP t1
    *FMA.f32 r25:t0, r4, r13, r26
    +NOP t1
    *FMA.f32 r26:t0, r4, r14, r27
    +NOP t1
}

clause_75:
ds(0) nbb ncph 
{
    *FMA.f32 r27:t0, r4, r11, r28
    +NOP t1
    *FMA.f32 r28:t0, r4, r12, r29
    +NOP t1
    *FMA.f32 r29:t0, r4, r9, r30
    +NOP t1
    *FMA.f32 r4:t0, r4, r10, r5
    +NOP t1
    *FMA.f32 r5:t0, r37, r11, r31
    +NOP t1
    *FMA.f32 r30:t0, r37, r12, r32
    +NOP t1
    *FMA.f32 r31:t0, r37, r9, r33
    +NOP t1
    *FMA.f32 r0:t0, r37, r10, r0
    +NOP t1
}

clause_81:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r6:t1, r18, r6
    *FMA.f32 t0, r16, r16, #0.neg
    +NOP t1
    *FMA.f32 t0, r15, r15, t0
    +NOP t1
    *FMA.f32 t0, r14, r14, t0
    +NOP t1
    *FMA.f32 r18:t0, r13, r13, t0
    +NOP t1
    *FMA.f32 r17:t0, r3, r16, r17
    +NOP t1
    *NOP t0
    +FADD.f32 r7:t1, r7, r20
    *FMA.f32 r20:t0, r18, r16, r23
    +NOP t1
}

clause_87:
ds(0) nbb ncph 
{
    *FMA.f32 r23:t0, r16, r9, #0.neg
    +NOP t1
    *FMA.f32 r32:t0, r16, r10, #0.neg
    +NOP t1
    *FMA.f32 r33:t0, r16, r11, #0.neg
    +NOP t1
    *FMA.f32 r16:t0, r16, r12, #0.neg
    +NOP t1
    *FMA.f32 r23:t0, r15, r10, r23
    +NOP t1
    *FMA.f32 r32:t0, r15, r9, r32
    +NOP t1
    *FMA.f32 r33:t0, r15, r12, r33
    +NOP t1
    *FMA.f32 r16:t0, r15, r11, r16
    +NOP t1
}

clause_93:
ds(0) nbb ncph 
{
    *FMA.f32 r24:t0, r3, r15, r24
    +NOP t1
    *FMA.f32 r15:t0, r18, r15, r34
    +NOP t1
    *FMA.f32 r21:t0, r22, 0x3e800000 /* 0.250000 */, r21
    +NOP t1
    *FMA.f32 r22:t0, r14, r11, r23
    +NOP t1
    *FMA.f32 r23:t0, r14, r12, r32
    +NOP t1
    *FMA.f32 r32:t0, r14, r9, r33
    +NOP t1
    *FMA.f32 r16:t0, r14, r10, r16
    +NOP t1
    *FMA.f32 r25:t0, r3, r14, r25
    +NOP t1
}

clause_99:
ds(0) nbb ncph 
{
    *FMA.f32 r14:t0, r18, r14, r35
    +NOP t1
    *FMA.f32 t0, r12, r12, #0.neg
    +NOP t1
    *FMA.f32 t0, r11, r11, t0
    +NOP t1
    *FMA.f32 t0, r10, r10, t0
    +NOP t1
    *FMA.f32 r33:t0, r9, r9, t0
    +NOP t1
    *FMA.f32 t0, r19, r3, r8
    +NOP t1
    *FMA.f32 r8:t0, t0, 0x3e800000 /* 0.250000 */, r33
    +NOP t1
    *FMA.f32 r1:t0, r18, r13, r1
    +NOP t1
}

clause_105:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r19:t1, r19, r18
    *FMA.f32 r18:t0, r3, r13, r26
    +NOP t1
    *NOP t0
    +FADD.f32 r8:t1, r19, r8
    *FMA.f32 r19:t0, r13, r12, r22
    +NOP t1
    *FMA.f32 r22:t0, r13, r11, r23
    +NOP t1
    *FMA.f32 r23:t0, r13, r10, r32
    +NOP t1
    *FMA.f32 r13:t0, r13, r9, r16
    +NOP t1
    *FMA.f32 r16:t0, r3, r12, r27
    +NOP t1
}

clause_111:
ds(0) nbb ncph 
{
    *FMA.f32 r26:t0, r3, r11, r28
    +NOP t1
    *FMA.f32 r27:t0, r3, r10, r29
    +NOP t1
    *FMA.f32 r3:t0, r3, r9, r4
    +NOP t1
    *FMA.f32 t0, r17, r9, #0.neg
    +NOP t1
    *FMA.f32 r4:t0, r24, r10, t0
    +NOP t1
    *FMA.f32 t0, r17, r10, #0.neg
    +NOP t1
    *FMA.f32 r28:t0, r24, r9, t0
    +NOP t1
    *FMA.f32 r4:t0, r25, r11, r4
    +NOP t1
}

clause_117:
ds(0) nbb ncph 
{
    *FMA.f32 r28:t0, r25, r12, r28
    +NOP t1
    *FMA.f32 t0, r17, r11, #0.neg
    +NOP t1
    *FMA.f32 r29:t0, r24, r12, t0
    +NOP t1
    *FMA.f32 r4:t0, r18, r12, r4
    +NOP t1
    *FMA.f32 r5:t0, r33, r12, r5
    +NOP t1
    *FMA.f32 t0, r17, r12, #0.neg
    +NOP t1
    *FMA.f32 r12:t0, r24, r11, t0
    +NOP t1
    *FMA.f32 r28:t0, r18, r11, r28
    +NOP t1
}

clause_123:
ds(0) nbb ncph 
{
    *FMA.f32 r11:t0, r33, r11, r30
    +NOP t1
    *FMA.f32 r29:t0, r25, r9, r29
    +NOP t1
    *FMA.f32 r12:t0, r25, r10, r12
    +NOP t1
    *FMA.f32 r29:t0, r18, r10, r29
    +NOP t1
    *FMA.f32 r10:t0, r33, r10, r31
    +NOP t1
    *FMA.f32 r0:t0, r33, r9, r0
    +NOP t1
    *FMA.f32 r9:t0, r18, r9, r12
    +NOP t1
    *FMA.f32 r7:t0, r20, 0x3e800000 /* 0.250000 */, r7
    +NOP t1
}

clause_129:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 r4:t0, r4, 0x3e800000 /* 0.250000 */, r19
    +NOP t1
    *FMA.f32 r6:t0, r15, 0x3e800000 /* 0.250000 */, r6
    +NOP t1
    *FMA.f32 r15:t0, r28, 0x3e800000 /* 0.250000 */, r22
    +NOP t1
    *FMA.f32 r14:t0, r14, 0x3e800000 /* 0.250000 */, r21
    +NOP t1
    *FMA.f32 r1:t0, r1, 0x3e800000 /* 0.250000 */, r8
    +NOP t1
    *FMA.f32 r8:t0, r9, 0x3e800000 /* 0.250000 */, r13
    +NOP t1
    *FMA.f32 r5:t0, r5, 0x3e800000 /* 0.250000 */, r7
    +NOP t1
    *FMA.f32 r0:t0, r0, 0x3e800000 /* 0.250000 */, r1
    +NOP t1
}

clause_135:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *NOP t0
    +FADD.f32 r12:t1, r17, r16
    *NOP t0
    +FADD.f32 r16:t1, r24, r26
    *NOP t0
    +FADD.f32 r4:t1, r12, r4
    *FADD.f32 t0, r18, 0x41100000 /* 9.000000 */
    +FADD.f32 r7:t1, r16, r15
    *FADD.f32 t0, t0, r3
    +FADD.f32 r1:t1, t, r8
    *NOP t0
    +FADD.f32 r3:t1, r4, r5
    *FMA.f32 t0, r11, 0x3e800000 /* 0.250000 */, r6
    +FADD.f32 r4:t1, r7, t
    *DTSEL_IMM.attribute_1 t0, r2
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r6
}

clause_142:
ds(0) eos store 
{
    *FMA.f32 r17:t0, r29, 0x3e800000 /* 0.250000 */, r23
    +NOP t1
    *FMA.f32 r9:t0, r10, 0x3e800000 /* 0.250000 */, r14
    +NOP t1
    *NOP t0
    +FADD.f32 t1, r25, r27
    *NOP t0
    +FADD.f32 t1, t1, r17
    *NOP t0
    +FADD.f32 t1, t1, r9
    *FADD.f32 r0:t0, r1, r0
    +MOV.i32 r1:t1, t1
    *NOP t0
    +MOV.i32 r2:t1, r4
    *NOP t0
    +ST_CVT.v4 t1, r6, r7, r8, @r0
}

shader19895 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Run the `test` function defined above through grid_run with five candidate
# values for each of localx and localy (presumably one call per combination --
# confirm against grid_run's definition).
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# colorfun negates every value it receives; presumably this inverts the colour
# scale grid_show uses -- confirm against grid_show.
grid_show(r, colorfun=lambda x: -x)

Test 5: 261 FMAs per invocation (3 vec4 loads with overlap + 1 vec4 store)

def test(wh=None, w=1024, h=1024, localsz=None, localx=1, localy=1, unrollx=1, unrolly=1, membw=False):
    """Benchmark the three-load dot-product compute shader.

    Builds a (w+2) x h RGBA32F input texture filled with ones and a w x h
    output texture, compiles the shader below, dispatches it 50 times and
    reads the output back.  The created textures and the shader text are
    published through the globals ``intex``, ``outtex`` and ``source``.

    Args:
        wh: if not None, used for both ``w`` and ``h``.
        w, h: output texture size; also the basis of the dispatch size.
        localsz: if not None, unpacked into ``(localx, localy)``.
        localx, localy: workgroup size substituted into the shader layout.
        unrollx: accepted but not used by this variant.
        unrolly: scales the y start coordinate in the shader and divides the
            number of workgroups dispatched in y.
        membw: if true, print the summed first output texel and additionally
            return a bandwidth figure.

    Returns:
        ``sum(output) / seconds_per_dispatch / 1e9``; the summed output is
        used as the multiply-accumulate count.  With ``membw`` set, a tuple
        of that value and ``4*w*h*4*4`` bytes per dispatch expressed in
        MiB/s.
    """
    global intex, outtex, source
    if wh is not None:
        w = h = wh
    if localsz is not None:
        localx, localy = localsz

    # Input: h rows of (w+2) texels, every channel equal to 1.0.  The two
    # extra columns cover the +1 / +2 x offsets read by the shader.
    ones_row = np.full(w+2, 1, dtype=np.uint8)
    texel_row = np.stack([ones_row]*4, axis=-1)
    ones_image = np.stack([texel_row]*h).astype(np.float32)
    intex = createTexture(w+2, h, fmt=gl.GL_RGBA32F, output=False, src=ones_image)
    outtex = createTexture(w, h, texid=1, fmt=gl.GL_RGBA32F)

    source = f"""
    #version 310 es
    precision mediump float;

    layout(local_size_x = {localx}, local_size_y = {localy}) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {{
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(1,{unrolly});
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 s0y = vec4(
          dot(s0.xyzw, y.xyzw) / 4.,
          dot(s0.xyzw, y.yxwz) / 4.,
          dot(s0.xyzw, y.zwxy) / 4.,
          dot(s0.xyzw, y.wzyx) / 4.
      );
      highp vec4 s0z = vec4(
          dot(s0.xyzw, z.xyzw) / 4.,
          dot(s0.xyzw, z.yxwz) / 4.,
          dot(s0.xyzw, z.zwxy) / 4.,
          dot(s0.xyzw, z.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 s1x = vec4(
          dot(s1.xyzw, x.xyzw) / 4.,
          dot(s1.xyzw, x.yxwz) / 4.,
          dot(s1.xyzw, x.zwxy) / 4.,
          dot(s1.xyzw, x.wzyx) / 4.
      );
      highp vec4 s1z = vec4(
          dot(s1.xyzw, z.xyzw) / 4.,
          dot(s1.xyzw, z.yxwz) / 4.,
          dot(s1.xyzw, z.zwxy) / 4.,
          dot(s1.xyzw, z.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      highp vec4 s2x = vec4(
          dot(s2.xyzw, x.xyzw) / 4.,
          dot(s2.xyzw, x.yxwz) / 4.,
          dot(s2.xyzw, x.zwxy) / 4.,
          dot(s2.xyzw, x.wzyx) / 4.
      );
      highp vec4 s2y = vec4(
          dot(s2.xyzw, y.xyzw) / 4.,
          dot(s2.xyzw, y.yxwz) / 4.,
          dot(s2.xyzw, y.zwxy) / 4.,
          dot(s2.xyzw, y.wzyx) / 4.
      );
      r0.x += 5.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
          r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2 + 
          s0y+ s0z+ s1x+ s1z+ s2x+ s2y);
    }}
    """
    computeShader(source)

    # Timed section: back-to-back dispatches, then wait for the GPU to drain.
    reps = 50
    t_begin = time.perf_counter()
    for _ in range(reps):
        gl.glDispatchCompute(w//localx, h//unrolly//localy, 1)
        # Image writes must be visible before anything reads the texture.
        gl.glMemoryBarrier(gl.GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
    gl.glFinish()
    per_dispatch = (time.perf_counter() - t_begin)/reps

    result = downloadTextureFloat(outtex, w, h)
    # The summed output doubles as the operation count for the rate below.
    total_macs = result.sum()
    rate = total_macs / per_dispatch / 1e9

    if not membw:
        return rate

    print(result[0,0].sum())
    # Presumably 4 vec4 texels (3 loads + 1 store) x 4 channels x 4 bytes per
    # invocation -- confirm the intended breakdown of this constant.
    traffic_bytes = 4*w*h*4*4
    return rate, traffic_bytes / per_dispatch / 1024 / 1024

# test(localsz=(1,1), unrollx=4, unrolly=4)
# Two runs with default arguments whose return values are discarded
# (presumably warm-up so the measured run is not skewed -- confirm).
test()
test()
# Measured run with a 4x8 workgroup; membw=True makes test() print the summed
# first output texel and return a (rate, bandwidth) tuple.
gflops = test(localx=4, localy=8, membw=True)
# `source` is the global set by test(): the shader text of the last run.
print(source)
print(gflops)
# Helper defined elsewhere; going by its name and the output that follows it
# prints the disassembly of the most recently compiled shader -- confirm.
showLastShaderDisassembly()
261.0

    #version 310 es
    precision mediump float;

    layout(local_size_x = 4, local_size_y = 8) in;
    layout(rgba32f, binding = 0) uniform mediump readonly image2D img_input;
    layout(rgba32f, binding = 1) uniform mediump writeonly image2D img_output;

    void main() {
      ivec2 start_coords = ivec2(gl_GlobalInvocationID.xy) * ivec2(1,1);
      highp vec4 x, y, z;
      x = imageLoad(img_input, start_coords + ivec2(0,0));
      y = imageLoad(img_input, start_coords + ivec2(1,0));
      z = imageLoad(img_input, start_coords + ivec2(2,0));
      highp vec4 r0 = vec4(
          dot(x.xyzw, y.xyzw),
          dot(x.xyzw, y.yxwz),
          dot(x.xyzw, y.zwxy),
          dot(x.xyzw, y.wzyx)
      );
      highp vec4 r1 = vec4(
          dot(x.xyzw, z.xyzw),
          dot(x.xyzw, z.yxwz),
          dot(x.xyzw, z.zwxy),
          dot(x.xyzw, z.wzyx)
      );
      highp vec4 r2 = vec4(
          dot(y.xyzw, z.xyzw),
          dot(y.xyzw, z.yxwz),
          dot(y.xyzw, z.zwxy),
          dot(y.xyzw, z.wzyx)
      );
      highp vec4 r3 = vec4(
          dot(r0.xyzw, z.xyzw) / 4.,
          dot(r0.xyzw, z.yxwz) / 4.,
          dot(r0.xyzw, z.zwxy) / 4.,
          dot(r0.xyzw, z.wzyx) / 4.
      );
      highp vec4 s0 = vec4(
          dot(x.xyzw, x.xyzw),
          dot(x.xyzw, x.yxwz),
          dot(x.xyzw, x.zwxy),
          dot(x.xyzw, x.wzyx)
      );
      highp vec4 s1 = vec4(
          dot(y.xyzw, y.xyzw),
          dot(y.xyzw, y.yxwz),
          dot(y.xyzw, y.zwxy),
          dot(y.xyzw, y.wzyx)
      );
      highp vec4 s2 = vec4(
          dot(z.xyzw, z.xyzw),
          dot(z.xyzw, z.yxwz),
          dot(z.xyzw, z.zwxy),
          dot(z.xyzw, z.wzyx)
      );
      highp vec4 c0 = vec4(
          dot(s0.xyzw, x.xyzw) / 4.,
          dot(s0.xyzw, x.yxwz) / 4.,
          dot(s0.xyzw, x.zwxy) / 4.,
          dot(s0.xyzw, x.wzyx) / 4.
      );
      highp vec4 s0y = vec4(
          dot(s0.xyzw, y.xyzw) / 4.,
          dot(s0.xyzw, y.yxwz) / 4.,
          dot(s0.xyzw, y.zwxy) / 4.,
          dot(s0.xyzw, y.wzyx) / 4.
      );
      highp vec4 s0z = vec4(
          dot(s0.xyzw, z.xyzw) / 4.,
          dot(s0.xyzw, z.yxwz) / 4.,
          dot(s0.xyzw, z.zwxy) / 4.,
          dot(s0.xyzw, z.wzyx) / 4.
      );
      highp vec4 c1 = vec4(
          dot(s1.xyzw, y.xyzw) / 4.,
          dot(s1.xyzw, y.yxwz) / 4.,
          dot(s1.xyzw, y.zwxy) / 4.,
          dot(s1.xyzw, y.wzyx) / 4.
      );
      highp vec4 s1x = vec4(
          dot(s1.xyzw, x.xyzw) / 4.,
          dot(s1.xyzw, x.yxwz) / 4.,
          dot(s1.xyzw, x.zwxy) / 4.,
          dot(s1.xyzw, x.wzyx) / 4.
      );
      highp vec4 s1z = vec4(
          dot(s1.xyzw, z.xyzw) / 4.,
          dot(s1.xyzw, z.yxwz) / 4.,
          dot(s1.xyzw, z.zwxy) / 4.,
          dot(s1.xyzw, z.wzyx) / 4.
      );
      highp vec4 c2 = vec4(
          dot(s2.xyzw, z.xyzw) / 4.,
          dot(s2.xyzw, z.yxwz) / 4.,
          dot(s2.xyzw, z.zwxy) / 4.,
          dot(s2.xyzw, z.wzyx) / 4.
      );
      highp vec4 s2x = vec4(
          dot(s2.xyzw, x.xyzw) / 4.,
          dot(s2.xyzw, x.yxwz) / 4.,
          dot(s2.xyzw, x.zwxy) / 4.,
          dot(s2.xyzw, x.wzyx) / 4.
      );
      highp vec4 s2y = vec4(
          dot(s2.xyzw, y.xyzw) / 4.,
          dot(s2.xyzw, y.yxwz) / 4.,
          dot(s2.xyzw, y.zwxy) / 4.,
          dot(s2.xyzw, y.wzyx) / 4.
      );
      r0.x += 5.; // optimizer correction
      imageStore(img_output, ivec2(gl_GlobalInvocationID.xy),
          r0 + r1 + r2 + r3 + s0 + s1 + s2 + c0 + c1 + c2 + 
          s0y+ s0z+ s1x+ s1z+ s2x+ s2y);
    }
    
(24.24173074040384, 5668.957178202975)
FMAs: 84.47% (261 / 309)

clause_0:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *NOP t0
    +IADD.s32 t1, r60, 0x00000002 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r3
}

clause_3:
ds(0) nbb attr ncph next_attr dwb(0) 
{
    *FMA.f32 t0, r6, r3, #0.neg
    +FADD.f32 t1, r4, r4
    *FMA.f32 r0:t0, r5, t1, t0
    +NOP t1
    *MKVEC.v2i16 r2:t0, r60, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r7
}

clause_6:
ds(0) nbb attr ncph dwb(0) 
{
    *FMA.f32 t0, r10, r7, #0.neg
    +FADD.f32 t1, r8, r8
    *FMA.f32 r11:t0, r9, t1, t0
    +FADD.f32 t1, r9, r9
    *FMA.f32 t0, r10, t1, #0.neg
    +FADD.f32 t1, r7, r7
    *FMA.f32 r12:t0, r8, t1, t0
    +NOP t1
    *NOP t0
    +IADD.s32 t1, r60, 0x00000001 /* 0.000000 */
    *MKVEC.v2i16 t0, t1, r61
    +LD_ATTR_TEX.f32.v4 t1, t, #0.x, #0.x, @r13
}

clause_11:
ds(0) nbb ncph 
{
    *FMA.f32 r1:t0, r16, r13, #0.neg
    +NOP t1
    *NOP t0
    +FADD.f32 r17:t1, r14, r14
}

clause_13:
ds(0) nbb ncph 
{
    *FMA.f32 r1:t0, r15, r17, r1
    +FADD.f32 t1, r15, r15
    *FMA.f32 t0, r16, t1, #0.neg
    +FADD.f32 t1, r13, r13
    *FMA.f32 r17:t0, r14, t1, t0
    +NOP t1
    *FMA.f32 t0, r6, r4, #0.neg
    +NOP t1
    *FMA.f32 t0, r5, r3, t0
    +NOP t1
    *FMA.f32 r18:t0, r6, r4, t0
    +NOP t1
    *FMA.f32 r0:t0, r6, r3, r0
    +NOP t1
    *FMA.f32 r18:t0, r5, r3, r18
    +NOP t1
}

clause_19:
ds(0) nbb ncph 
{
    *FMA.f32 r19:t0, r6, r6, #0.neg
    +NOP t1
    *FMA.f32 t0, r0, r13, #0.neg
    +NOP t1
    *FMA.f32 r20:t0, r18, r14, t0
    +NOP t1
    *FMA.f32 r19:t0, r5, r5, r19
    +FADD.f32 r21:t1, r5, r5
    *FMA.f32 t0, r6, t1, #0.neg
    +FADD.f32 r23:t1, r3, r3
    *FMA.f32 r22:t0, r4, t1, t0
    +NOP t1
    *FMA.f32 r19:t0, r4, r4, r19
    +NOP t1
    *FMA.f32 r24:t0, r0, r7, #0.neg
    +NOP t1
}

clause_25:
ds(0) nbb ncph 
{
    *FMA.f32 r24:t0, r18, r8, r24
    +NOP t1
    *FMA.f32 r20:t0, r22, r15, r20
    +NOP t1
    *FMA.f32 r19:t0, r3, r3, r19
    +NOP t1
    *FMA.f32 r24:t0, r22, r9, r24
    +NOP t1
    *FMA.f32 r20:t0, r19, r16, r20
    +NOP t1
    *FMA.f32 t0, r19, r10, r24
    +NOP t1
    *NOP t0
    +FADD.f32 r20:t1, t0, r20
    *FMA.f32 r24:t0, r10, r8, #0.neg
    +NOP t1
}

clause_31:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r9, r7, r24
    +NOP t1
    *FMA.f32 r24:t0, r10, r8, t0
    +NOP t1
    *FMA.f32 r11:t0, r10, r7, r11
    +NOP t1
    *FMA.f32 r24:t0, r9, r7, r24
    +NOP t1
    *FMA.f32 t0, r10, r10, #0.neg
    +NOP t1
    *FMA.f32 r25:t0, r9, r9, t0
    +NOP t1
    *FMA.f32 t0, r11, r3, #0.neg
    +NOP t1
    *FMA.f32 r26:t0, r24, r4, t0
    +NOP t1
}

clause_37:
ds(0) nbb ncph 
{
    *FMA.f32 r25:t0, r8, r8, r25
    +NOP t1
    *FMA.f32 t0, r11, r13, #0.neg
    +NOP t1
    *FMA.f32 r27:t0, r24, r14, t0
    +NOP t1
    *FMA.f32 r26:t0, r12, r5, r26
    +NOP t1
    *FMA.f32 r25:t0, r7, r7, r25
    +NOP t1
    *FMA.f32 r27:t0, r12, r15, r27
    +NOP t1
    *FMA.f32 r26:t0, r25, r6, r26
    +NOP t1
    *FMA.f32 r27:t0, r25, r16, r27
    +NOP t1
}

clause_43:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r26:t1, r27, r26
    *FMA.f32 t0, r0, r14, #0.neg
    +NOP t1
    *FMA.f32 r27:t0, r18, r13, t0
    +NOP t1
    *FMA.f32 t0, r0, r8, #0.neg
    +NOP t1
    *FMA.f32 r28:t0, r18, r7, t0
    +NOP t1
    *FMA.f32 r27:t0, r22, r16, r27
    +NOP t1
    *FMA.f32 r28:t0, r22, r10, r28
    +NOP t1
    *FMA.f32 r27:t0, r19, r15, r27
    +NOP t1
}

clause_49:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r19, r9, r28
    +NOP t1
    *NOP t0
    +FADD.f32 r27:t1, t0, r27
    *FMA.f32 t0, r16, r14, #0.neg
    +NOP t1
    *FMA.f32 t0, r15, r13, t0
    +NOP t1
    *FMA.f32 r28:t0, r16, r14, t0
    +NOP t1
    *FMA.f32 r1:t0, r16, r13, r1
    +NOP t1
    *FMA.f32 r28:t0, r15, r13, r28
    +NOP t1
    *FMA.f32 r29:t0, r16, r16, #0.neg
    +NOP t1
}

clause_55:
ds(0) nbb ncph 
{
    *FMA.f32 r29:t0, r15, r15, r29
    +NOP t1
    *FMA.f32 t0, r11, r7, #0.neg
    +NOP t1
    *FMA.f32 r30:t0, r24, r8, t0
    +NOP t1
    *FMA.f32 t0, r1, r7, #0.neg
    +NOP t1
    *FMA.f32 r31:t0, r28, r8, t0
    +NOP t1
    *FMA.f32 r29:t0, r14, r14, r29
    +NOP t1
    *FMA.f32 t0, r11, r8, #0.neg
    +NOP t1
    *FMA.f32 r32:t0, r24, r7, t0
    +NOP t1
}

clause_61:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r1, r8, #0.neg
    +NOP t1
    *FMA.f32 r33:t0, r28, r7, t0
    +NOP t1
    *FMA.f32 r30:t0, r12, r9, r30
    +NOP t1
    *FMA.f32 r31:t0, r17, r9, r31
    +NOP t1
    *FMA.f32 r29:t0, r13, r13, r29
    +NOP t1
    *FMA.f32 r32:t0, r12, r10, r32
    +NOP t1
    *FMA.f32 r33:t0, r17, r10, r33
    +NOP t1
    *FMA.f32 r34:t0, r11, r9, #0.neg
    +NOP t1
}

clause_67:
ds(0) nbb ncph 
{
    *FMA.f32 r34:t0, r24, r10, r34
    +NOP t1
    *FMA.f32 t0, r0, r9, #0.neg
    +NOP t1
    *FMA.f32 r35:t0, r18, r10, t0
    +NOP t1
    *FMA.f32 t0, r1, r9, #0.neg
    +NOP t1
    *FMA.f32 r36:t0, r28, r10, t0
    +NOP t1
    *FMA.f32 r30:t0, r25, r10, r30
    +NOP t1
    *FMA.f32 r31:t0, r29, r10, r31
    +NOP t1
    *FMA.f32 r37:t0, r10, r13, #0.neg
    +NOP t1
}

clause_73:
ds(0) nbb ncph 
{
    *FMA.f32 r38:t0, r10, r14, #0.neg
    +NOP t1
    *FMA.f32 r39:t0, r11, r10, #0.neg
    +NOP t1
    *FMA.f32 r40:t0, r10, r15, #0.neg
    +NOP t1
    *FMA.f32 r41:t0, r10, r16, #0.neg
    +NOP t1
    *FMA.f32 r42:t0, r0, r10, #0.neg
    +NOP t1
    *FMA.f32 r43:t0, r1, r10, #0.neg
    +NOP t1
    *FMA.f32 r44:t0, r10, r3, #0.neg
    +NOP t1
    *FMA.f32 r45:t0, r10, r4, #0.neg
    +NOP t1
}

clause_79:
ds(0) nbb ncph 
{
    *FMA.f32 r46:t0, r10, r5, #0.neg
    +NOP t1
    *FMA.f32 r10:t0, r10, r6, #0.neg
    +NOP t1
    *FMA.f32 r37:t0, r9, r14, r37
    +NOP t1
    *FMA.f32 r38:t0, r9, r13, r38
    +NOP t1
    *FMA.f32 r39:t0, r24, r9, r39
    +NOP t1
    *FMA.f32 r40:t0, r9, r16, r40
    +NOP t1
    *FMA.f32 r41:t0, r9, r15, r41
    +NOP t1
    *FMA.f32 r42:t0, r18, r9, r42
    +NOP t1
}

clause_85:
ds(0) nbb ncph 
{
    *FMA.f32 r43:t0, r28, r9, r43
    +NOP t1
    *FMA.f32 r44:t0, r9, r4, r44
    +NOP t1
    *FMA.f32 r45:t0, r9, r3, r45
    +NOP t1
    *FMA.f32 r46:t0, r9, r6, r46
    +NOP t1
    *FMA.f32 r10:t0, r9, r5, r10
    +NOP t1
    *FMA.f32 r32:t0, r25, r9, r32
    +NOP t1
    *FMA.f32 r9:t0, r29, r9, r33
    +NOP t1
    *FMA.f32 r33:t0, r11, r4, #0.neg
    +NOP t1
}

clause_91:
ds(0) nbb ncph 
{
    *FMA.f32 r33:t0, r24, r3, r33
    +NOP t1
    *FMA.f32 t0, r11, r14, #0.neg
    +NOP t1
    *FMA.f32 r47:t0, r24, r13, t0
    +NOP t1
    *FMA.f32 r33:t0, r12, r6, r33
    +NOP t1
    *FMA.f32 r47:t0, r12, r16, r47
    +NOP t1
    *FMA.f32 r33:t0, r25, r5, r33
    +NOP t1
    *FMA.f32 t0, r25, r15, r47
    +NOP t1
    *NOP t0
    +FADD.f32 r33:t1, t0, r33
}

clause_97:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r1, r13, #0.neg
    +NOP t1
    *FMA.f32 r47:t0, r28, r14, t0
    +NOP t1
    *FMA.f32 t0, r1, r14, #0.neg
    +NOP t1
    *FMA.f32 r48:t0, r28, r13, t0
    +NOP t1
    *FMA.f32 r37:t0, r8, r15, r37
    +NOP t1
    *FMA.f32 r47:t0, r17, r15, r47
    +NOP t1
    *FMA.f32 r38:t0, r8, r16, r38
    +NOP t1
    *FMA.f32 r48:t0, r17, r16, r48
    +NOP t1
}

clause_103:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r11, r15, #0.neg
    +NOP t1
    *FMA.f32 r49:t0, r24, r16, t0
    +NOP t1
    *FMA.f32 t0, r0, r15, #0.neg
    +NOP t1
    *FMA.f32 r50:t0, r18, r16, t0
    +NOP t1
    *FMA.f32 t0, r1, r15, #0.neg
    +NOP t1
    *FMA.f32 r51:t0, r28, r16, t0
    +NOP t1
    *FMA.f32 r37:t0, r7, r16, r37
    +NOP t1
    *FMA.f32 r47:t0, r29, r16, r47
    +NOP t1
}

clause_109:
ds(0) nbb ncph 
{
    *FMA.f32 r52:t0, r11, r16, #0.neg
    +NOP t1
    *FMA.f32 r53:t0, r0, r16, #0.neg
    +NOP t1
    *FMA.f32 r54:t0, r16, r3, #0.neg
    +NOP t1
    *FMA.f32 r55:t0, r16, r4, #0.neg
    +NOP t1
    *FMA.f32 r56:t0, r16, r5, #0.neg
    +NOP t1
    *FMA.f32 r57:t0, r16, r6, #0.neg
    +NOP t1
    *FMA.f32 r16:t0, r1, r16, #0.neg
    +NOP t1
    *FMA.f32 r52:t0, r24, r15, r52
    +NOP t1
}

clause_115:
ds(0) nbb ncph 
{
    *FMA.f32 r53:t0, r18, r15, r53
    +NOP t1
    *FMA.f32 r54:t0, r15, r4, r54
    +NOP t1
    *FMA.f32 r55:t0, r15, r3, r55
    +NOP t1
    *FMA.f32 r56:t0, r15, r6, r56
    +NOP t1
    *FMA.f32 r57:t0, r15, r5, r57
    +NOP t1
    *FMA.f32 r16:t0, r28, r15, r16
    +NOP t1
    *FMA.f32 r38:t0, r7, r15, r38
    +NOP t1
    *FMA.f32 r15:t0, r29, r15, r48
    +NOP t1
}

clause_121:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 t1, r12, r17
    *FMA.f32 r21:t0, r6, r21, t1
    +NOP t1
    *FMA.f32 t0, r11, r5, #0.neg
    +NOP t1
    *FMA.f32 r48:t0, r24, r6, t0
    +NOP t1
    *FMA.f32 t0, r11, r6, #0.neg
    +NOP t1
    *FMA.f32 r58:t0, r24, r5, t0
    +NOP t1
    *FMA.f32 r48:t0, r12, r3, r48
    +NOP t1
    *FMA.f32 r49:t0, r12, r13, r49
    +NOP t1
}

clause_127:
ds(0) nbb ncph 
{
    *FMA.f32 r39:t0, r12, r8, r39
    +NOP t1
    *FMA.f32 r58:t0, r12, r4, r58
    +NOP t1
    *FMA.f32 r52:t0, r12, r14, r52
    +NOP t1
    *FMA.f32 r12:t0, r12, r7, r34
    +NOP t1
    *FMA.f32 r34:t0, r22, r13, r50
    +NOP t1
    *FMA.f32 r35:t0, r22, r7, r35
    +NOP t1
    *FMA.f32 r34:t0, r19, r14, r34
    +NOP t1
    *FMA.f32 r35:t0, r19, r8, r35
    +NOP t1
}

clause_133:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r34:t1, r35, r34
    *FMA.f32 r35:t0, r25, r4, r48
    +NOP t1
    *FMA.f32 r48:t0, r25, r14, r49
    +NOP t1
    *FMA.f32 r40:t0, r8, r13, r40
    +NOP t1
    *NOP t0
    +FADD.f32 r35:t1, r48, r35
    *FMA.f32 r48:t0, r17, r13, r51
    +NOP t1
    *FMA.f32 r49:t0, r22, r14, r53
    +NOP t1
    *FMA.f32 r41:t0, r8, r14, r41
    +NOP t1
}

clause_139:
ds(0) nbb ncph 
{
    *FMA.f32 r50:t0, r14, r5, r54
    +NOP t1
    *FMA.f32 r51:t0, r14, r6, r55
    +NOP t1
    *FMA.f32 r53:t0, r14, r3, r56
    +NOP t1
    *FMA.f32 r54:t0, r14, r4, r57
    +NOP t1
    *FMA.f32 r16:t0, r17, r14, r16
    +NOP t1
    *FMA.f32 r40:t0, r7, r14, r40
    +NOP t1
    *FMA.f32 r14:t0, r29, r14, r48
    +NOP t1
    *FMA.f32 r48:t0, r0, r3, #0.neg
    +NOP t1
}

clause_145:
ds(0) nbb ncph 
{
    *FMA.f32 r48:t0, r18, r4, r48
    +NOP t1
    *FMA.f32 t0, r0, r4, #0.neg
    +NOP t1
    *FMA.f32 r55:t0, r18, r3, t0
    +NOP t1
    *FMA.f32 t0, r0, r5, #0.neg
    +NOP t1
    *FMA.f32 r56:t0, r18, r6, t0
    +NOP t1
    *FMA.f32 t0, r0, r6, #0.neg
    +NOP t1
    *FMA.f32 r57:t0, r18, r5, t0
    +NOP t1
    *FMA.f32 r42:t0, r22, r8, r42
    +NOP t1
}

clause_151:
ds(0) nbb ncph 
{
    *FMA.f32 r48:t0, r22, r5, r48
    +NOP t1
    *FMA.f32 r55:t0, r22, r6, r55
    +NOP t1
    *FMA.f32 r56:t0, r22, r3, r56
    +NOP t1
    *FMA.f32 r22:t0, r22, r4, r57
    +NOP t1
    *FMA.f32 t0, r1, r3, #0.neg
    +NOP t1
    *FMA.f32 r57:t0, r28, r4, t0
    +NOP t1
    *FMA.f32 t0, r1, r4, #0.neg
    +NOP t1
    *FMA.f32 r59:t0, r28, r3, t0
    +NOP t1
}

clause_157:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r1, r5, #0.neg
    +NOP t1
    *FMA.f32 r60:t0, r28, r6, t0
    +NOP t1
    *FMA.f32 t0, r1, r6, #0.neg
    +NOP t1
    *FMA.f32 r61:t0, r28, r5, t0
    +NOP t1
    *FMA.f32 r43:t0, r17, r8, r43
    +NOP t1
    *FMA.f32 r57:t0, r17, r5, r57
    +NOP t1
    *FMA.f32 r59:t0, r17, r6, r59
    +NOP t1
    *FMA.f32 r36:t0, r17, r7, r36
    +NOP t1
}

clause_163:
ds(0) nbb ncph 
{
    *FMA.f32 r60:t0, r17, r3, r60
    +NOP t1
    *FMA.f32 r17:t0, r17, r4, r61
    +NOP t1
    *FMA.f32 r49:t0, r19, r13, r49
    +NOP t1
    *FMA.f32 r42:t0, r19, r7, r42
    +NOP t1
    *FMA.f32 r44:t0, r8, r5, r44
    +NOP t1
    *FMA.f32 r45:t0, r8, r6, r45
    +NOP t1
    *FMA.f32 r46:t0, r8, r3, r46
    +NOP t1
    *FMA.f32 r10:t0, r8, r4, r10
    +NOP t1
}

clause_169:
ds(0) nbb ncph 
{
    *FMA.f32 r41:t0, r7, r13, r41
    +NOP t1
    *FMA.f32 r44:t0, r7, r6, r44
    +NOP t1
    *FMA.f32 r45:t0, r7, r5, r45
    +NOP t1
    *FMA.f32 r46:t0, r7, r4, r46
    +NOP t1
    *FMA.f32 r39:t0, r25, r7, r39
    +NOP t1
    *FMA.f32 r10:t0, r7, r3, r10
    +NOP t1
    *FMA.f32 r7:t0, r29, r7, r43
    +NOP t1
    *FMA.f32 r43:t0, r25, r3, r58
    +NOP t1
}

clause_175:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r42:t1, r42, r49
    *FMA.f32 t0, r25, r13, r52
    +NOP t1
    *NOP t0
    +FADD.f32 r43:t1, t0, r43
    *FMA.f32 r49:t0, r13, r6, r50
    +NOP t1
    *FMA.f32 r50:t0, r13, r5, r51
    +NOP t1
    *FMA.f32 r51:t0, r13, r4, r53
    +NOP t1
    *FMA.f32 r52:t0, r13, r3, r54
    +NOP t1
    *FMA.f32 r13:t0, r29, r13, r16
    +NOP t1
}

clause_181:
ds(0) nbb ncph 
{
    *FMA.f32 r0:t0, r30, 0x3e800000 /* 0.250000 */, r0
    +NOP t1
    *NOP t0
    +FADD.f32 r1:t1, r11, r1
    *FMA.f32 t0, r37, r3, #0.neg
    +NOP t1
    *FMA.f32 t0, r38, r4, t0
    +NOP t1
    *FMA.f32 t0, r40, r5, t0
    +NOP t1
    *FMA.f32 r11:t0, r41, r6, t0
    +NOP t1
    *NOP t0
    +FADD.f32 r16:t1, r31, r20
    *NOP t0
    +FADD.f32 r20:t1, r47, r26
}

clause_187:
ds(0) nbb ncph 
{
    *FMA.f32 t0, r37, r4, #0.neg
    +NOP t1
    *FMA.f32 t0, r38, r3, t0
    +NOP t1
    *FMA.f32 r26:t0, r40, r6, t0
    +NOP t1
    *FMA.f32 r31:t0, r37, r5, #0.neg
    +FADD.f32 r30:t1, r37, r44
    *FMA.f32 r37:t0, r37, r6, #0.neg
    +NOP t1
    *FMA.f32 r31:t0, r38, r6, r31
    +NOP t1
    *FMA.f32 r44:t0, r29, r6, r57
    +NOP t1
    *FMA.f32 r6:t0, r19, r6, r48
    +NOP t1
}

clause_193:
ds(0) nbb ncph 
{
    *FMA.f32 r18:t0, r32, 0x3e800000 /* 0.250000 */, r18
    +NOP t1
    *FMA.f32 r26:t0, r41, r5, r26
    +NOP t1
    *NOP t0
    +FADD.f32 r24:t1, r24, r28
    *FMA.f32 r28:t0, r38, r5, r37
    +NOP t1
    *NOP t0
    +FADD.f32 r9:t1, r9, r27
    *FMA.f32 r27:t0, r29, r5, r59
    +NOP t1
    *FMA.f32 r5:t0, r19, r5, r55
    +NOP t1
    *FMA.f32 r21:t0, r4, r23, r21
    +NOP t1
}

clause_199:
ds(0) nbb ncph 
{
    *FMA.f32 r23:t0, r29, r8, r36
    +NOP t1
    *FMA.f32 r8:t0, r25, r8, r12
    +NOP t1
    *FMA.f32 t0, r40, r3, r31
    +NOP t1
    *FMA.f32 t0, r41, r4, t0
    +NOP t1
    *FMA.f32 r12:t0, t0, 0x3e800000 /* 0.250000 */, r51
    +NOP t1
    *FMA.f32 r28:t0, r40, r4, r28
    +NOP t1
    *NOP t0
    +FADD.f32 r15:t1, r15, r33
    *FMA.f32 r33:t0, r29, r4, r60
    +NOP t1
}

clause_205:
ds(0) nbb ncph 
{
    *FMA.f32 r4:t0, r19, r4, r56
    +NOP t1
    *NOP t0
    +FADD.f32 r23:t1, r23, r34
    *FMA.f32 r34:t0, r39, 0x3e800000 /* 0.250000 */, r19
    +NOP t1
    *FMA.f32 r19:t0, r19, r3, r22
    +NOP t1
    *FMA.f32 r22:t0, r41, r3, r28
    +NOP t1
    *FMA.f32 r3:t0, r29, r3, r17
    +NOP t1
    *FMA.f32 r22:t0, r22, 0x3e800000 /* 0.250000 */, r52
    +NOP t1
    *FMA.f32 r8:t0, r8, 0x3e800000 /* 0.250000 */, r21
    +NOP t1
}

clause_211:
ds(0) nbb ncph 
{
    *NOP t0
    +FADD.f32 r0:t1, r1, r0
    *FMA.f32 t0, r11, 0x3e800000 /* 0.250000 */, r49
    +FADD.f32 r1:t1, r30, t
    *NOP t0
    +FADD.f32 r11:t1, r44, r16
    *NOP t0
    +FADD.f32 r0:t1, r1, r0
    *FADD.f32 t0, r6, r20
    +FADD.f32 t1, t, r11
    *FMA.f32 r0:t0, 0x3e800000 /* 0.250000 */, t1, r0
    +NOP t1
    *NOP t0
    +FADD.f32 r32:t1, r38, r45
    *NOP t0
    +FADD.f32 r16:t1, r24, r18
}

clause_217:
ds(0) nbb ncph next_attr 
{
    *FMA.f32 t0, r26, 0x3e800000 /* 0.250000 */, r50
    +FADD.f32 r18:t1, r32, t
    *NOP t0
    +FADD.f32 r9:t1, r27, r9
    *NOP t0
    +FADD.f32 r6:t1, r18, r16
    *FADD.f32 t0, r5, r15
    +FADD.f32 t1, t, r9
    *FMA.f32 r1:t0, 0x3e800000 /* 0.250000 */, t1, r6
    +NOP t1
    *NOP t0
    +FADD.f32 r31:t1, r40, r46
    *NOP t0
    +FADD.f32 r14:t1, r14, r35
    *NOP t0
    +FADD.f32 r15:t1, r33, r23
}

clause_223:
ds(0) nbb attr ncph next_store dwb(0) 
{
    *FADD.f32 t0, r31, r12
    +FADD.f32 r8:t1, t, r8
    *FADD.f32 t0, r4, r14
    +FADD.f32 t1, t, r15
    *FMA.f32 r4:t0, 0x3e800000 /* 0.250000 */, t1, r8
    +NOP t1
    *NOP t0
    +FADD.f32 t1, r7, r42
    *NOP t0
    +FADD.f32 r3:t1, r3, t1
    *FADD.f32 t0, r13, r43
    +FADD.f32 t1, r19, t
    *NOP t0
    +FADD.f32 r3:t1, t1, r3
    *DTSEL_IMM.attribute_1 t0, r2
    +LEA_ATTR_TEX.f32 t1, t, 0x00000000 /* 0.000000 */, 0x00000001 /* 0.000000 */, @r5
}

clause_230:
ds(0) eos store 
{
    *NOP t0
    +FADD.f32 r17:t1, r25, r29
    *FADD.f32 t0, r41, 0x40a00000 /* 5.000000 */
    +FADD.f32 r10:t1, t, r10
    *NOP t0
    +FADD.f32 t1, r17, r34
    *FADD.f32 t0, r10, r22
    +FADD.f32 t1, t, t1
    *FMA.f32 t0, 0x3e800000 /* 0.250000 */, r3, t1
    +MOV.i32 r8:t1, t
    *MOV.i32 r9:t0, r4
    +MOV.i32 r10:t1, r1
    *NOP t0
    +MOV.i32 r11:t1, r0
    *NOP t0
    +ST_CVT.v4 t1, r5, r6, r7, @r8
}

shader21995 - MESA_SHADER_COMPUTE shader: 0 inst, 0 bundles, 0 quadwords, 0 registers, 4 threads, 0 loops, 0:0 spills:fills
# Run the `test` function defined above through grid_run with five candidate
# values for each of localx and localy (presumably one call per combination --
# confirm against grid_run's definition).
r = grid_run(test, localx=[1,2,4,8,16], localy=[1,2,4,8,16])
# colorfun negates every value it receives; presumably this inverts the colour
# scale grid_show uses -- confirm against grid_show.
grid_show(r, colorfun=lambda x: -x)
pandecode: dump command stream to file pandecode.dump.0348
pandecode: dump command stream to file pandecode.dump.0349
pandecode: dump command stream to file pandecode.dump.0350
pandecode: dump command stream to file pandecode.dump.0351
pandecode: dump command stream to file pandecode.dump.0352
pandecode: dump command stream to file pandecode.dump.0353
pandecode: dump command stream to file pandecode.dump.0354
pandecode: dump command stream to file pandecode.dump.0355
pandecode: dump command stream to file pandecode.dump.0356
pandecode: dump command stream to file pandecode.dump.0357
pandecode: dump command stream to file pandecode.dump.0358
pandecode: dump command stream to file pandecode.dump.0359
pandecode: dump command stream to file pandecode.dump.0360
pandecode: dump command stream to file pandecode.dump.0361
pandecode: dump command stream to file pandecode.dump.0362
pandecode: dump command stream to file pandecode.dump.0363
pandecode: dump command stream to file pandecode.dump.0364
pandecode: dump command stream to file pandecode.dump.0365
pandecode: dump command stream to file pandecode.dump.0366
pandecode: dump command stream to file pandecode.dump.0367
pandecode: dump command stream to file pandecode.dump.0368
pandecode: dump command stream to file pandecode.dump.0369
pandecode: dump command stream to file pandecode.dump.0370
pandecode: dump command stream to file pandecode.dump.0371
pandecode: dump command stream to file pandecode.dump.0372
pandecode: dump command stream to file pandecode.dump.0373
pandecode: dump command stream to file pandecode.dump.0374
pandecode: dump command stream to file pandecode.dump.0375
pandecode: dump command stream to file pandecode.dump.0376
pandecode: dump command stream to file pandecode.dump.0377
pandecode: dump command stream to file pandecode.dump.0378
pandecode: dump command stream to file pandecode.dump.0379
pandecode: dump command stream to file pandecode.dump.0380
pandecode: dump command stream to file pandecode.dump.0381
pandecode: dump command stream to file pandecode.dump.0382
pandecode: dump command stream to file pandecode.dump.0383
pandecode: dump command stream to file pandecode.dump.0384
pandecode: dump command stream to file pandecode.dump.0385
pandecode: dump command stream to file pandecode.dump.0386
pandecode: dump command stream to file pandecode.dump.0387
pandecode: dump command stream to file pandecode.dump.0388
pandecode: dump command stream to file pandecode.dump.0389
pandecode: dump command stream to file pandecode.dump.0390
pandecode: dump command stream to file pandecode.dump.0391
pandecode: dump command stream to file pandecode.dump.0392
pandecode: dump command stream to file pandecode.dump.0393
pandecode: dump command stream to file pandecode.dump.0394
pandecode: dump command stream to file pandecode.dump.0395
pandecode: dump command stream to file pandecode.dump.0396
pandecode: dump command stream to file pandecode.dump.0397
pandecode: dump command stream to file pandecode.dump.0399
pandecode: dump command stream to file pandecode.dump.0400
pandecode: dump command stream to file pandecode.dump.0401
pandecode: dump command stream to file pandecode.dump.0402
pandecode: dump command stream to file pandecode.dump.0403
pandecode: dump command stream to file pandecode.dump.0404
pandecode: dump command stream to file pandecode.dump.0405
pandecode: dump command stream to file pandecode.dump.0406
pandecode: dump command stream to file pandecode.dump.0407
pandecode: dump command stream to file pandecode.dump.0408
pandecode: dump command stream to file pandecode.dump.0409
pandecode: dump command stream to file pandecode.dump.0410
pandecode: dump command stream to file pandecode.dump.0411
pandecode: dump command stream to file pandecode.dump.0412
pandecode: dump command stream to file pandecode.dump.0413
pandecode: dump command stream to file pandecode.dump.0414
pandecode: dump command stream to file pandecode.dump.0415
pandecode: dump command stream to file pandecode.dump.0416
pandecode: dump command stream to file pandecode.dump.0417
pandecode: dump command stream t