Forming Architectural Performance Expectations

In [1]:
# Create the scratch directory that the %%writefile cell below writes tmp/transpose.c into
# (-p: do not fail if the directory already exists, so the cell can be re-run).
!mkdir -p tmp

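This notebook demonstrates the LLVM Machine Code Analyzer (`llvm-mca`), a tool that statically estimates the throughput and per-port resource pressure of a sequence of machine instructions on a given CPU model. We write a small AVX kernel to a file, compile it to assembly with `clang`, and feed that assembly to `llvm-mca`.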

In [13]:
%%writefile tmp/transpose.c

#include <x86intrin.h>

// 8x8 transpose kernel stolen from
// https://github.com/springer13/hptt/blob/e1017ef8b8ed0b6f3bb3b70df825a87f94c643e8/src/transpose.cpp#L137

/*
 * Transpose one 8x8 tile of single-precision floats using AVX intrinsics.
 *
 * Reads eight rows of eight floats from A (row k starts at A + k*lda) and
 * writes eight rows of eight floats to B (row k starts at B + k*ldb) such
 * that row j of B holds column j of the tile read from A.
 *
 * Parameters:
 *   A     - source tile; rows are read with unaligned 256-bit loads.
 *   lda   - distance between consecutive rows of A, in float elements.
 *   B     - destination tile; rows are written with unaligned 256-bit stores.
 *   ldb   - distance between consecutive rows of B, in float elements.
 *   alpha - broadcast into reg_alpha, which is never read afterwards.
 *   beta  - broadcast into reg_beta, which is never read afterwards.
 *
 * Because reg_alpha and reg_beta are unused, no scaling is applied: the
 * stored values are exactly the loaded values, rearranged.
 */
void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
{
   __m256 reg_alpha = _mm256_set1_ps(alpha); // alpha in all 8 lanes; not used anywhere below
   __m256 reg_beta = _mm256_set1_ps(beta); // beta in all 8 lanes; not used anywhere below
   // Load the eight rows of the A tile (unaligned loads, row stride = lda floats)
   __m256 rowA0 = _mm256_loadu_ps((A +0*lda));
   __m256 rowA1 = _mm256_loadu_ps((A +1*lda));
   __m256 rowA2 = _mm256_loadu_ps((A +2*lda));
   __m256 rowA3 = _mm256_loadu_ps((A +3*lda));
   __m256 rowA4 = _mm256_loadu_ps((A +4*lda));
   __m256 rowA5 = _mm256_loadu_ps((A +5*lda));
   __m256 rowA6 = _mm256_loadu_ps((A +6*lda));
   __m256 rowA7 = _mm256_loadu_ps((A +7*lda));

   // 8x8 transpose micro kernel, in three stages that all operate
   // independently on the low and high 128-bit halves of each register
   // until the final cross-lane permute.
   __m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28;
   // Stage 1: interleave adjacent row pairs element-wise.
   // unpacklo takes elements 0,1 of each 128-bit half, unpackhi elements 2,3.
   r28 = _mm256_unpacklo_ps( rowA4, rowA5 );
   r54 = _mm256_unpacklo_ps( rowA6, rowA7 );
   r0 = _mm256_unpacklo_ps( rowA0, rowA1 );
   r10 = _mm256_unpacklo_ps( rowA2, rowA3 );
   r29 = _mm256_unpackhi_ps( rowA4, rowA5 );
   r55 = _mm256_unpackhi_ps( rowA6, rowA7 );
   r1 = _mm256_unpackhi_ps( rowA0, rowA1 );
   r11 = _mm256_unpackhi_ps( rowA2, rowA3 );
   // Stage 2: combine two interleaved pairs into groups of four rows.
   // 0x44 picks elements 0,1 of each operand (per 128-bit half),
   // 0xee picks elements 2,3 of each operand.
   r88 = _mm256_shuffle_ps( r28, r54, 0x44 );
   r70 = _mm256_shuffle_ps( r0, r10, 0x44 );
   r89 = _mm256_shuffle_ps( r28, r54, 0xee );
   r71 = _mm256_shuffle_ps( r0, r10, 0xee );
   r138 = _mm256_shuffle_ps( r29, r55, 0x44 );
   r120 = _mm256_shuffle_ps( r1, r11, 0x44 );
   r139 = _mm256_shuffle_ps( r29, r55, 0xee );
   r121 = _mm256_shuffle_ps( r1, r11, 0xee );
   // Stage 3: stitch the rows-0..3 half and the rows-4..7 half together.
   // 0x2  -> result = { low half of 2nd operand, low half of 1st operand }
   // 0x13 -> result = { high half of 2nd operand, high half of 1st operand }
   // The rowA* registers are reused here to hold the transposed rows.
   rowA0 = _mm256_permute2f128_ps( r88, r70, 0x2 );
   rowA1 = _mm256_permute2f128_ps( r89, r71, 0x2 );
   rowA2 = _mm256_permute2f128_ps( r138, r120, 0x2 );
   rowA3 = _mm256_permute2f128_ps( r139, r121, 0x2 );
   rowA4 = _mm256_permute2f128_ps( r88, r70, 0x13 );
   rowA5 = _mm256_permute2f128_ps( r89, r71, 0x13 );
   rowA6 = _mm256_permute2f128_ps( r138, r120, 0x13 );
   rowA7 = _mm256_permute2f128_ps( r139, r121, 0x13 );

  // Store the transposed rows to B (unaligned stores, row stride = ldb floats)
  _mm256_storeu_ps((B + 0 * ldb), rowA0);
  _mm256_storeu_ps((B + 1 * ldb), rowA1);
  _mm256_storeu_ps((B + 2 * ldb), rowA2);
  _mm256_storeu_ps((B + 3 * ldb), rowA3);
  _mm256_storeu_ps((B + 4 * ldb), rowA4);
  _mm256_storeu_ps((B + 5 * ldb), rowA5);
  _mm256_storeu_ps((B + 6 * ldb), rowA6);
  _mm256_storeu_ps((B + 7 * ldb), rowA7);
}
Overwriting tmp/transpose.c
In [14]:
# Compile the kernel to assembly on stdout (-S -o -), targeting Haswell, and pipe it into
# llvm-mca with the matching CPU model.
# NOTE: no -O level is passed, so clang uses its default (unoptimized) code generation; the
# report below therefore contains many stack stores/reloads around each intrinsic.
!clang -S -o - -march=haswell tmp/transpose.c | llvm-mca -mcpu=haswell
warning: found a return instruction in the input assembly sequence.
note: program counter updates are ignored.
Iterations:        100
Instructions:      34400
Total Cycles:      11910
Total uOps:        44700

Dispatch Width:    4
uOps Per Cycle:    3.75
IPC:               2.89
Block RThroughput: 111.8


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 3      2     1.00           *            pushq	%rbp
 1      1     0.25                        movq	%rsp, %rbp
 1      1     0.25                        andq	$-32, %rsp
 1      1     0.25                        subq	$2144, %rsp
 1      1     1.00           *            movq	%rdi, 880(%rsp)
 1      1     1.00           *            movq	%rsi, 872(%rsp)
 1      1     1.00           *            movq	%rdx, 864(%rsp)
 1      1     1.00           *            movq	%rcx, 856(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 852(%rsp)
 2      1     1.00           *            vmovss	%xmm1, 848(%rsp)
 1      5     0.50    *                   vmovss	852(%rsp), %xmm0
 2      1     1.00           *            vmovss	%xmm0, 892(%rsp)
 1      5     0.50    *                   vmovss	892(%rsp), %xmm0
 2      1     1.00           *            vmovss	%xmm0, 2044(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2040(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2036(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2032(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2028(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2024(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2020(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2016(%rsp)
 1      5     0.50    *                   vmovss	2020(%rsp), %xmm1
 1      5     0.50    *                   vmovss	2016(%rsp), %xmm0
 1      1     1.00                        vinsertps	$16, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2024(%rsp), %xmm1
 1      1     1.00                        vinsertps	$32, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2028(%rsp), %xmm1
 1      1     1.00                        vinsertps	$48, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2036(%rsp), %xmm2
 1      5     0.50    *                   vmovss	2032(%rsp), %xmm1
 1      1     1.00                        vinsertps	$16, %xmm2, %xmm1, %xmm1
 1      5     0.50    *                   vmovss	2040(%rsp), %xmm2
 1      1     1.00                        vinsertps	$32, %xmm2, %xmm1, %xmm1
 1      5     0.50    *                   vmovss	2044(%rsp), %xmm2
 1      1     1.00                        vinsertps	$48, %xmm2, %xmm1, %xmm1
 2      1     1.00           *            vmovaps	%xmm1, 2000(%rsp)
 2      1     1.00           *            vmovaps	%xmm0, 1984(%rsp)
 1      7     0.50    *                   vmovaps	1984(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 800(%rsp)
 1      5     0.50    *                   vmovss	848(%rsp), %xmm0
 2      1     1.00           *            vmovss	%xmm0, 888(%rsp)
 1      5     0.50    *                   vmovss	888(%rsp), %xmm0
 2      1     1.00           *            vmovss	%xmm0, 2124(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2120(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2116(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2112(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2108(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2104(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2100(%rsp)
 2      1     1.00           *            vmovss	%xmm0, 2096(%rsp)
 1      5     0.50    *                   vmovss	2100(%rsp), %xmm1
 1      5     0.50    *                   vmovss	2096(%rsp), %xmm0
 1      1     1.00                        vinsertps	$16, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2104(%rsp), %xmm1
 1      1     1.00                        vinsertps	$32, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2108(%rsp), %xmm1
 1      1     1.00                        vinsertps	$48, %xmm1, %xmm0, %xmm0
 1      5     0.50    *                   vmovss	2116(%rsp), %xmm2
 1      5     0.50    *                   vmovss	2112(%rsp), %xmm1
 1      1     1.00                        vinsertps	$16, %xmm2, %xmm1, %xmm1
 1      5     0.50    *                   vmovss	2120(%rsp), %xmm2
 1      1     1.00                        vinsertps	$32, %xmm2, %xmm1, %xmm1
 1      5     0.50    *                   vmovss	2124(%rsp), %xmm2
 1      1     1.00                        vinsertps	$48, %xmm2, %xmm1, %xmm1
 2      1     1.00           *            vmovaps	%xmm1, 2064(%rsp)
 2      1     1.00           *            vmovaps	%xmm0, 2048(%rsp)
 1      7     0.50    *                   vmovaps	2048(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 768(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      1     1.00           *            movq	%rax, 952(%rsp)
 1      5     0.50    *                   movq	952(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 736(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        leaq	(%rax,%rcx,4), %rax
 1      1     1.00           *            movq	%rax, 944(%rsp)
 1      5     0.50    *                   movq	944(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 704(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        leaq	(%rax,%rcx,8), %rax
 1      1     1.00           *            movq	%rax, 936(%rsp)
 1      5     0.50    *                   movq	936(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 672(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        leaq	(%rcx,%rcx,2), %rcx
 1      1     0.50                        leaq	(%rax,%rcx,4), %rax
 1      1     1.00           *            movq	%rax, 928(%rsp)
 1      5     0.50    *                   movq	928(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 640(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        shlq	$4, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      1     1.00           *            movq	%rax, 920(%rsp)
 1      5     0.50    *                   movq	920(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 608(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        leaq	(%rcx,%rcx,4), %rcx
 1      1     0.50                        leaq	(%rax,%rcx,4), %rax
 1      1     1.00           *            movq	%rax, 912(%rsp)
 1      5     0.50    *                   movq	912(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 576(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rcx
 1      1     0.50                        leaq	(%rcx,%rcx,2), %rcx
 1      1     0.50                        leaq	(%rax,%rcx,8), %rax
 1      1     1.00           *            movq	%rax, 904(%rsp)
 1      5     0.50    *                   movq	904(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 544(%rsp)
 1      5     0.50    *                   movq	880(%rsp), %rax
 1      5     0.50    *                   movq	872(%rsp), %rdx
 1      1     0.50                        leaq	(%rdx,%rdx,8), %rcx
 1      1     0.50                        leaq	(%rcx,%rcx,2), %rcx
 1      1     0.25                        addq	%rdx, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      1     1.00           *            movq	%rax, 896(%rsp)
 1      5     0.50    *                   movq	896(%rsp), %rax
 1      7     0.50    *                   vmovups	(%rax), %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 512(%rsp)
 1      7     0.50    *                   vmovaps	608(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	576(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1184(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1152(%rsp)
 1      7     0.50    *                   vmovaps	1184(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1152(%rsp), %ymm1
 1      1     1.00                        vunpcklps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, (%rsp)
 1      7     0.50    *                   vmovaps	544(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	512(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1120(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1088(%rsp)
 1      7     0.50    *                   vmovaps	1120(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1088(%rsp), %ymm1
 1      1     1.00                        vunpcklps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 32(%rsp)
 1      7     0.50    *                   vmovaps	736(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	704(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1056(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1024(%rsp)
 1      7     0.50    *                   vmovaps	1056(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1024(%rsp), %ymm1
 1      1     1.00                        vunpcklps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 64(%rsp)
 1      7     0.50    *                   vmovaps	672(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	640(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 992(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 960(%rsp)
 1      7     0.50    *                   vmovaps	992(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	960(%rsp), %ymm1
 1      1     1.00                        vunpcklps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 96(%rsp)
 1      7     0.50    *                   vmovaps	608(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	576(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1440(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1408(%rsp)
 1      7     0.50    *                   vmovaps	1440(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1408(%rsp), %ymm1
 1      1     1.00                        vunpckhps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 128(%rsp)
 1      7     0.50    *                   vmovaps	544(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	512(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1376(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1344(%rsp)
 1      7     0.50    *                   vmovaps	1376(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1344(%rsp), %ymm1
 1      1     1.00                        vunpckhps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 160(%rsp)
 1      7     0.50    *                   vmovaps	736(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	704(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1312(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1280(%rsp)
 1      7     0.50    *                   vmovaps	1312(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1280(%rsp), %ymm1
 1      1     1.00                        vunpckhps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 192(%rsp)
 1      7     0.50    *                   vmovaps	672(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	640(%rsp), %ymm0
 2      1     1.00           *            vmovaps	%ymm1, 1248(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1216(%rsp)
 1      7     0.50    *                   vmovaps	1248(%rsp), %ymm0
 1      7     0.50    *                   vmovaps	1216(%rsp), %ymm1
 1      1     1.00                        vunpckhps	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 224(%rsp)
 1      7     0.50    *                   vmovapd	(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	32(%rsp), %ymm1
 1      1     1.00                        vunpcklpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 256(%rsp)
 1      7     0.50    *                   vmovapd	64(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	96(%rsp), %ymm1
 1      1     1.00                        vunpcklpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 288(%rsp)
 1      7     0.50    *                   vmovapd	(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	32(%rsp), %ymm1
 1      1     1.00                        vunpckhpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 320(%rsp)
 1      7     0.50    *                   vmovapd	64(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	96(%rsp), %ymm1
 1      1     1.00                        vunpckhpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 352(%rsp)
 1      7     0.50    *                   vmovapd	128(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	160(%rsp), %ymm1
 1      1     1.00                        vunpcklpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 384(%rsp)
 1      7     0.50    *                   vmovapd	192(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	224(%rsp), %ymm1
 1      1     1.00                        vunpcklpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 416(%rsp)
 1      7     0.50    *                   vmovapd	128(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	160(%rsp), %ymm1
 1      1     1.00                        vunpckhpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 448(%rsp)
 1      7     0.50    *                   vmovapd	192(%rsp), %ymm0
 1      7     0.50    *                   vmovapd	224(%rsp), %ymm1
 1      1     1.00                        vunpckhpd	%ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 480(%rsp)
 1      7     0.50    *                   vmovaps	256(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	288(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$32, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 736(%rsp)
 1      7     0.50    *                   vmovaps	320(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	352(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$32, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 704(%rsp)
 1      7     0.50    *                   vmovaps	384(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	416(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$32, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 672(%rsp)
 1      7     0.50    *                   vmovaps	448(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	480(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$32, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 640(%rsp)
 1      7     0.50    *                   vmovaps	256(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	288(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$49, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 608(%rsp)
 1      7     0.50    *                   vmovaps	320(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	352(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$49, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 576(%rsp)
 1      7     0.50    *                   vmovaps	384(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	416(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$49, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 544(%rsp)
 1      7     0.50    *                   vmovaps	448(%rsp), %ymm1
 1      7     0.50    *                   vmovaps	480(%rsp), %ymm0
 1      3     1.00                        vperm2f128	$49, %ymm1, %ymm0, %ymm0
 2      1     1.00           *            vmovaps	%ymm0, 512(%rsp)
 1      5     0.50    *                   movq	864(%rsp), %rax
 2      8     1.00    *                   imulq	$0, 856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	736(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1976(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1920(%rsp)
 1      7     0.50    *                   vmovaps	1920(%rsp), %ymm0
 1      5     0.50    *                   movq	1976(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 1      5     0.50    *                   movq	856(%rsp), %rcx
 1      1     0.50                        shlq	$0, %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	704(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1912(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1856(%rsp)
 1      7     0.50    *                   vmovaps	1856(%rsp), %ymm0
 1      5     0.50    *                   movq	1912(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 1      5     0.50    *                   movq	856(%rsp), %rcx
 1      1     0.50                        shlq	%rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	672(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1848(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1792(%rsp)
 1      7     0.50    *                   vmovaps	1792(%rsp), %ymm0
 1      5     0.50    *                   movq	1848(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 2      8     1.00    *                   imulq	$3, 856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	640(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1784(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1728(%rsp)
 1      7     0.50    *                   vmovaps	1728(%rsp), %ymm0
 1      5     0.50    *                   movq	1784(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 1      5     0.50    *                   movq	856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	608(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1720(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1664(%rsp)
 1      7     0.50    *                   vmovaps	1664(%rsp), %ymm0
 1      5     0.50    *                   movq	1720(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 2      8     1.00    *                   imulq	$5, 856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	576(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1656(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1600(%rsp)
 1      7     0.50    *                   vmovaps	1600(%rsp), %ymm0
 1      5     0.50    *                   movq	1656(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 2      8     1.00    *                   imulq	$6, 856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	544(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1592(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1536(%rsp)
 1      7     0.50    *                   vmovaps	1536(%rsp), %ymm0
 1      5     0.50    *                   movq	1592(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      5     0.50    *                   movq	864(%rsp), %rax
 2      8     1.00    *                   imulq	$7, 856(%rsp), %rcx
 1      1     0.50                        shlq	$2, %rcx
 1      1     0.25                        addq	%rcx, %rax
 1      7     0.50    *                   vmovaps	512(%rsp), %ymm0
 1      1     1.00           *            movq	%rax, 1528(%rsp)
 2      1     1.00           *            vmovaps	%ymm0, 1472(%rsp)
 1      7     0.50    *                   vmovaps	1472(%rsp), %ymm0
 1      5     0.50    *                   movq	1528(%rsp), %rax
 2      1     1.00           *            vmovups	%ymm0, (%rax)
 1      1     0.25                        movq	%rbp, %rsp
 2      6     0.50    *                   popq	%rbp
 4      0     1.00                  U     vzeroupper
 3      7     1.00                  U     retq


Resources:
[0]   - HWDivider
[1]   - HWFPDivider
[2]   - HWPort0
[3]   - HWPort1
[4]   - HWPort2
[5]   - HWPort3
[6]   - HWPort4
[7]   - HWPort5
[8]   - HWPort6
[9]   - HWPort7


Resource pressure per iteration:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 -      -     10.01  14.98  90.00  90.01  111.00 47.00  10.01  89.99  

Resource pressure by instruction:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 -      -      -     0.98    -     0.99   1.00   0.01   0.01   0.01   pushq	%rbp
 -      -      -     0.01    -      -      -     0.01   0.98    -     movq	%rsp, %rbp
 -      -     0.01   0.01    -      -      -     0.98    -      -     andq	$-32, %rsp
 -      -     0.01   0.98    -      -      -      -     0.01    -     subq	$2144, %rsp
 -      -      -      -     0.99   0.01   1.00    -      -      -     movq	%rdi, 880(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rsi, 872(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   movq	%rdx, 864(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rcx, 856(%rsp)
 -      -      -      -      -     0.01   1.00    -      -     0.99   vmovss	%xmm0, 852(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovss	%xmm1, 848(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     vmovss	852(%rsp), %xmm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 892(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovss	892(%rsp), %xmm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2044(%rsp)
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovss	%xmm0, 2040(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2036(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2032(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2028(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovss	%xmm0, 2024(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2020(%rsp)
 -      -      -      -     0.99   0.01   1.00    -      -      -     vmovss	%xmm0, 2016(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovss	2020(%rsp), %xmm1
 -      -      -      -     1.00    -      -      -      -      -     vmovss	2016(%rsp), %xmm0
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$16, %xmm1, %xmm0, %xmm0
 -      -      -      -      -     1.00    -      -      -      -     vmovss	2024(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$32, %xmm1, %xmm0, %xmm0
 -      -      -      -     1.00    -      -      -      -      -     vmovss	2028(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$48, %xmm1, %xmm0, %xmm0
 -      -      -      -      -     1.00    -      -      -      -     vmovss	2036(%rsp), %xmm2
 -      -      -      -     1.00    -      -      -      -      -     vmovss	2032(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$16, %xmm2, %xmm1, %xmm1
 -      -      -      -      -     1.00    -      -      -      -     vmovss	2040(%rsp), %xmm2
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$32, %xmm2, %xmm1, %xmm1
 -      -      -      -     1.00    -      -      -      -      -     vmovss	2044(%rsp), %xmm2
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$48, %xmm2, %xmm1, %xmm1
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm1, 2000(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 1984(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1984(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 800(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovss	848(%rsp), %xmm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovss	%xmm0, 888(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovss	888(%rsp), %xmm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2124(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovss	%xmm0, 2120(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2116(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2112(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2108(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2104(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovss	%xmm0, 2100(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovss	%xmm0, 2096(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovss	2100(%rsp), %xmm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovss	2096(%rsp), %xmm0
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$16, %xmm1, %xmm0, %xmm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovss	2104(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$32, %xmm1, %xmm0, %xmm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovss	2108(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$48, %xmm1, %xmm0, %xmm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovss	2116(%rsp), %xmm2
 -      -      -      -     0.99   0.01    -      -      -      -     vmovss	2112(%rsp), %xmm1
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$16, %xmm2, %xmm1, %xmm1
 -      -      -      -     0.01   0.99    -      -      -      -     vmovss	2120(%rsp), %xmm2
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$32, %xmm2, %xmm1, %xmm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovss	2124(%rsp), %xmm2
 -      -      -      -      -      -      -     1.00    -      -     vinsertps	$48, %xmm2, %xmm1, %xmm1
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm1, 2064(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%xmm0, 2048(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	2048(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 768(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 952(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     movq	952(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -     0.99   1.00    -      -     0.01   vmovaps	%ymm0, 736(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rax,%rcx,4), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 944(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	944(%rsp), %rax
 -      -      -      -     1.00    -      -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 704(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     1.00    -      -      -      -      -     movq	872(%rsp), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rax,%rcx,8), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 936(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	936(%rsp), %rax
 -      -      -      -     1.00    -      -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 672(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rcx
 -      -      -      -      -      -      -     1.00    -      -     leaq	(%rcx,%rcx,2), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rax,%rcx,4), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 928(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	928(%rsp), %rax
 -      -      -      -      -     1.00    -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -     0.99   1.00    -      -     0.01   vmovaps	%ymm0, 640(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     shlq	$4, %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     addq	%rcx, %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 920(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	920(%rsp), %rax
 -      -      -      -      -     1.00    -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 608(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rcx
 -      -      -      -      -      -      -     1.00    -      -     leaq	(%rcx,%rcx,4), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rax,%rcx,4), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 912(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     movq	912(%rsp), %rax
 -      -      -      -     1.00    -      -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 576(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rcx
 -      -      -      -      -      -      -     1.00    -      -     leaq	(%rcx,%rcx,2), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rax,%rcx,8), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 904(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     movq	904(%rsp), %rax
 -      -      -      -     0.01   0.99    -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -     0.01   0.99   1.00    -      -      -     vmovaps	%ymm0, 544(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	880(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	872(%rsp), %rdx
 -      -      -      -      -      -      -     1.00    -      -     leaq	(%rdx,%rdx,8), %rcx
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rcx,%rcx,2), %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     addq	%rdx, %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     addq	%rcx, %rax
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 896(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     movq	896(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     vmovups	(%rax), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 512(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	608(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	576(%rsp), %ymm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm1, 1184(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1152(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1184(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1152(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, (%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	544(%rsp), %ymm1
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	512(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm1, 1120(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1088(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1120(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	1088(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -     0.01   1.00    -      -     0.99   vmovaps	%ymm0, 32(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	736(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	704(%rsp), %ymm0
 -      -      -      -      -     0.99   1.00    -      -     0.01   vmovaps	%ymm1, 1056(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1024(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1056(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1024(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklps	%ymm1, %ymm0, %ymm0
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovaps	%ymm0, 64(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	672(%rsp), %ymm1
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	640(%rsp), %ymm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm1, 992(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 960(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	992(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	960(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 96(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	608(%rsp), %ymm1
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	576(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm1, 1440(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1408(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1440(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1408(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhps	%ymm1, %ymm0, %ymm0
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovaps	%ymm0, 128(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	544(%rsp), %ymm1
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	512(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm1, 1376(%rsp)
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm0, 1344(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1376(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	1344(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 160(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	736(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	704(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm1, 1312(%rsp)
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovaps	%ymm0, 1280(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	1312(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1280(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 192(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	672(%rsp), %ymm1
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	640(%rsp), %ymm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm1, 1248(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1216(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1248(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1216(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhps	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 224(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovapd	(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     vmovapd	32(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 256(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovapd	64(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	96(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 288(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	32(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm1, %ymm0, %ymm0
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovaps	%ymm0, 320(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	64(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	96(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 352(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	128(%rsp), %ymm0
 -      -      -      -      -     1.00    -      -      -      -     vmovapd	160(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm1, %ymm0, %ymm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm0, 384(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	192(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	224(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 416(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	128(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	160(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 448(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovapd	192(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     vmovapd	224(%rsp), %ymm1
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm1, %ymm0, %ymm0
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovaps	%ymm0, 480(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	256(%rsp), %ymm1
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	288(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$32, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 736(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	320(%rsp), %ymm1
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	352(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$32, %ymm1, %ymm0, %ymm0
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm0, 704(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	384(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	416(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$32, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 672(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	448(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	480(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$32, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 640(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	256(%rsp), %ymm1
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	288(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$49, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 608(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	320(%rsp), %ymm1
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	352(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$49, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 576(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	384(%rsp), %ymm1
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	416(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$49, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 544(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	448(%rsp), %ymm1
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	480(%rsp), %ymm0
 -      -      -      -      -      -      -     1.00    -      -     vperm2f128	$49, %ymm1, %ymm0, %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 512(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     movq	864(%rsp), %rax
 -      -      -     1.00   0.01   0.99    -      -      -      -     imulq	$0, 856(%rsp), %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     shlq	$2, %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     addq	%rcx, %rax
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	736(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1976(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1920(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1920(%rsp), %ymm0
 -      -      -      -      -     1.00    -      -      -      -     movq	1976(%rsp), %rax
 -      -      -      -      -     0.01   1.00    -      -     0.99   vmovups	%ymm0, (%rax)
 -      -      -      -     1.00    -      -      -      -      -     movq	864(%rsp), %rax
 -      -      -      -      -     1.00    -      -      -      -     movq	856(%rsp), %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$0, %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$2, %rcx
 -      -      -      -      -      -      -     1.00    -      -     addq	%rcx, %rax
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	704(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1912(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1856(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1856(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     movq	1912(%rsp), %rax
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovups	%ymm0, (%rax)
 -      -      -      -      -     1.00    -      -      -      -     movq	864(%rsp), %rax
 -      -      -      -     1.00    -      -      -      -      -     movq	856(%rsp), %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     shlq	%rcx
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$2, %rcx
 -      -      -      -      -      -      -     1.00    -      -     addq	%rcx, %rax
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	672(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1848(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1792(%rsp)
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	1792(%rsp), %ymm0
 -      -      -      -      -     1.00    -      -      -      -     movq	1848(%rsp), %rax
 -      -      -      -      -     0.01   1.00    -      -     0.99   vmovups	%ymm0, (%rax)
 -      -      -      -     1.00    -      -      -      -      -     movq	864(%rsp), %rax
 -      -      -     1.00    -     1.00    -      -      -      -     imulq	$3, 856(%rsp), %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     shlq	$2, %rcx
 -      -      -      -      -      -      -     1.00    -      -     addq	%rcx, %rax
 -      -      -      -     1.00    -      -      -      -      -     vmovaps	640(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1784(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1728(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1728(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     movq	1784(%rsp), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%ymm0, (%rax)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	864(%rsp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movq	856(%rsp), %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$2, %rcx
 -      -     0.99    -      -      -      -      -     0.01    -     shlq	$2, %rcx
 -      -      -     0.99    -      -      -     0.01    -      -     addq	%rcx, %rax
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	608(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1720(%rsp)
 -      -      -      -     0.99    -     1.00    -      -     0.01   vmovaps	%ymm0, 1664(%rsp)
 -      -      -      -      -     1.00    -      -      -      -     vmovaps	1664(%rsp), %ymm0
 -      -      -      -     1.00    -      -      -      -      -     movq	1720(%rsp), %rax
 -      -      -      -     0.01    -     1.00    -      -     0.99   vmovups	%ymm0, (%rax)
 -      -      -      -      -     1.00    -      -      -      -     movq	864(%rsp), %rax
 -      -      -     1.00   1.00    -      -      -      -      -     imulq	$5, 856(%rsp), %rcx
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$2, %rcx
 -      -      -      -      -      -      -     0.99   0.01    -     addq	%rcx, %rax
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	576(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1656(%rsp)
 -      -      -      -      -      -     1.00    -      -     1.00   vmovaps	%ymm0, 1600(%rsp)
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	1600(%rsp), %ymm0
 -      -      -      -     0.01   0.99    -      -      -      -     movq	1656(%rsp), %rax
 -      -      -      -     0.99   0.01   1.00    -      -      -     vmovups	%ymm0, (%rax)
 -      -      -      -     0.99   0.01    -      -      -      -     movq	864(%rsp), %rax
 -      -      -     1.00   0.01   0.99    -      -      -      -     imulq	$6, 856(%rsp), %rcx
 -      -     1.00    -      -      -      -      -      -      -     shlq	$2, %rcx
 -      -      -     0.99    -      -      -      -     0.01    -     addq	%rcx, %rax
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	544(%rsp), %ymm0
 -      -      -      -      -      -     1.00    -      -     1.00   movq	%rax, 1592(%rsp)
 -      -      -      -     0.99   0.01   1.00    -      -      -     vmovaps	%ymm0, 1536(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1536(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     movq	1592(%rsp), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%ymm0, (%rax)
 -      -      -      -     0.01   0.99    -      -      -      -     movq	864(%rsp), %rax
 -      -      -     1.00    -     1.00    -      -      -      -     imulq	$7, 856(%rsp), %rcx
 -      -     1.00    -      -      -      -      -      -      -     shlq	$2, %rcx
 -      -      -     0.01    -      -      -     0.99    -      -     addq	%rcx, %rax
 -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	512(%rsp), %ymm0
 -      -      -      -      -     1.00   1.00    -      -      -     movq	%rax, 1528(%rsp)
 -      -      -      -     1.00    -     1.00    -      -      -     vmovaps	%ymm0, 1472(%rsp)
 -      -      -      -     0.01   0.99    -      -      -      -     vmovaps	1472(%rsp), %ymm0
 -      -      -      -     0.99   0.01    -      -      -      -     movq	1528(%rsp), %rax
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%ymm0, (%rax)
 -      -     0.99    -      -      -      -     0.01    -      -     movq	%rbp, %rsp
 -      -      -     0.01   0.01   0.99    -      -     0.99    -     popq	%rbp
 -      -      -      -      -      -      -      -      -      -     vzeroupper
 -      -      -      -     0.01   0.99    -     1.00   1.00    -     retq
In [ ]: