Forming Architectural Performance Expectations¶
In [1]:
!mkdir -p tmp
This demonstrates the LLVM Machine Code Analyzer.
In [13]:
%%writefile tmp/transpose.c
#include <x86intrin.h>
// 8x8 transpose kernel stolen from
// https://github.com/springer13/hptt/blob/e1017ef8b8ed0b6f3bb3b70df825a87f94c643e8/src/transpose.cpp#L137
void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
{
__m256 reg_alpha = _mm256_set1_ps(alpha); // do not alter the content of B
__m256 reg_beta = _mm256_set1_ps(beta); // do not alter the content of B
//Load A
__m256 rowA0 = _mm256_loadu_ps((A +0*lda));
__m256 rowA1 = _mm256_loadu_ps((A +1*lda));
__m256 rowA2 = _mm256_loadu_ps((A +2*lda));
__m256 rowA3 = _mm256_loadu_ps((A +3*lda));
__m256 rowA4 = _mm256_loadu_ps((A +4*lda));
__m256 rowA5 = _mm256_loadu_ps((A +5*lda));
__m256 rowA6 = _mm256_loadu_ps((A +6*lda));
__m256 rowA7 = _mm256_loadu_ps((A +7*lda));
//8x8 transpose micro kernel
__m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28;
r28 = _mm256_unpacklo_ps( rowA4, rowA5 );
r54 = _mm256_unpacklo_ps( rowA6, rowA7 );
r0 = _mm256_unpacklo_ps( rowA0, rowA1 );
r10 = _mm256_unpacklo_ps( rowA2, rowA3 );
r29 = _mm256_unpackhi_ps( rowA4, rowA5 );
r55 = _mm256_unpackhi_ps( rowA6, rowA7 );
r1 = _mm256_unpackhi_ps( rowA0, rowA1 );
r11 = _mm256_unpackhi_ps( rowA2, rowA3 );
r88 = _mm256_shuffle_ps( r28, r54, 0x44 );
r70 = _mm256_shuffle_ps( r0, r10, 0x44 );
r89 = _mm256_shuffle_ps( r28, r54, 0xee );
r71 = _mm256_shuffle_ps( r0, r10, 0xee );
r138 = _mm256_shuffle_ps( r29, r55, 0x44 );
r120 = _mm256_shuffle_ps( r1, r11, 0x44 );
r139 = _mm256_shuffle_ps( r29, r55, 0xee );
r121 = _mm256_shuffle_ps( r1, r11, 0xee );
rowA0 = _mm256_permute2f128_ps( r88, r70, 0x2 );
rowA1 = _mm256_permute2f128_ps( r89, r71, 0x2 );
rowA2 = _mm256_permute2f128_ps( r138, r120, 0x2 );
rowA3 = _mm256_permute2f128_ps( r139, r121, 0x2 );
rowA4 = _mm256_permute2f128_ps( r88, r70, 0x13 );
rowA5 = _mm256_permute2f128_ps( r89, r71, 0x13 );
rowA6 = _mm256_permute2f128_ps( r138, r120, 0x13 );
rowA7 = _mm256_permute2f128_ps( r139, r121, 0x13 );
_mm256_storeu_ps((B + 0 * ldb), rowA0);
_mm256_storeu_ps((B + 1 * ldb), rowA1);
_mm256_storeu_ps((B + 2 * ldb), rowA2);
_mm256_storeu_ps((B + 3 * ldb), rowA3);
_mm256_storeu_ps((B + 4 * ldb), rowA4);
_mm256_storeu_ps((B + 5 * ldb), rowA5);
_mm256_storeu_ps((B + 6 * ldb), rowA6);
_mm256_storeu_ps((B + 7 * ldb), rowA7);
}
Overwriting tmp/transpose.c
In [14]:
!clang -S -o - -march=haswell tmp/transpose.c | llvm-mca -mcpu=haswell
warning: found a return instruction in the input assembly sequence. note: program counter updates are ignored. Iterations: 100 Instructions: 34400 Total Cycles: 11910 Total uOps: 44700 Dispatch Width: 4 uOps Per Cycle: 3.75 IPC: 2.89 Block RThroughput: 111.8 Instruction Info: [1]: #uOps [2]: Latency [3]: RThroughput [4]: MayLoad [5]: MayStore [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: 3 2 1.00 * pushq %rbp 1 1 0.25 movq %rsp, %rbp 1 1 0.25 andq $-32, %rsp 1 1 0.25 subq $2144, %rsp 1 1 1.00 * movq %rdi, 880(%rsp) 1 1 1.00 * movq %rsi, 872(%rsp) 1 1 1.00 * movq %rdx, 864(%rsp) 1 1 1.00 * movq %rcx, 856(%rsp) 2 1 1.00 * vmovss %xmm0, 852(%rsp) 2 1 1.00 * vmovss %xmm1, 848(%rsp) 1 5 0.50 * vmovss 852(%rsp), %xmm0 2 1 1.00 * vmovss %xmm0, 892(%rsp) 1 5 0.50 * vmovss 892(%rsp), %xmm0 2 1 1.00 * vmovss %xmm0, 2044(%rsp) 2 1 1.00 * vmovss %xmm0, 2040(%rsp) 2 1 1.00 * vmovss %xmm0, 2036(%rsp) 2 1 1.00 * vmovss %xmm0, 2032(%rsp) 2 1 1.00 * vmovss %xmm0, 2028(%rsp) 2 1 1.00 * vmovss %xmm0, 2024(%rsp) 2 1 1.00 * vmovss %xmm0, 2020(%rsp) 2 1 1.00 * vmovss %xmm0, 2016(%rsp) 1 5 0.50 * vmovss 2020(%rsp), %xmm1 1 5 0.50 * vmovss 2016(%rsp), %xmm0 1 1 1.00 vinsertps $16, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2024(%rsp), %xmm1 1 1 1.00 vinsertps $32, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2028(%rsp), %xmm1 1 1 1.00 vinsertps $48, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2036(%rsp), %xmm2 1 5 0.50 * vmovss 2032(%rsp), %xmm1 1 1 1.00 vinsertps $16, %xmm2, %xmm1, %xmm1 1 5 0.50 * vmovss 2040(%rsp), %xmm2 1 1 1.00 vinsertps $32, %xmm2, %xmm1, %xmm1 1 5 0.50 * vmovss 2044(%rsp), %xmm2 1 1 1.00 vinsertps $48, %xmm2, %xmm1, %xmm1 2 1 1.00 * vmovaps %xmm1, 2000(%rsp) 2 1 1.00 * vmovaps %xmm0, 1984(%rsp) 1 7 0.50 * vmovaps 1984(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm0, 800(%rsp) 1 5 0.50 * vmovss 848(%rsp), %xmm0 2 1 1.00 * vmovss %xmm0, 888(%rsp) 1 5 0.50 * vmovss 888(%rsp), %xmm0 2 1 1.00 * vmovss %xmm0, 2124(%rsp) 2 1 1.00 * vmovss %xmm0, 2120(%rsp) 2 1 1.00 * vmovss %xmm0, 2116(%rsp) 2 1 1.00 * vmovss %xmm0, 2112(%rsp) 2 1 1.00 * vmovss %xmm0, 2108(%rsp) 2 1 1.00 * vmovss %xmm0, 2104(%rsp) 2 1 1.00 * vmovss %xmm0, 2100(%rsp) 2 1 1.00 * vmovss %xmm0, 2096(%rsp) 1 5 0.50 * vmovss 2100(%rsp), %xmm1 1 5 0.50 * vmovss 2096(%rsp), %xmm0 1 1 1.00 vinsertps $16, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2104(%rsp), %xmm1 1 1 1.00 vinsertps $32, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2108(%rsp), %xmm1 1 1 1.00 vinsertps $48, %xmm1, %xmm0, %xmm0 1 5 0.50 * vmovss 2116(%rsp), %xmm2 1 5 0.50 * vmovss 2112(%rsp), %xmm1 1 1 1.00 vinsertps $16, %xmm2, %xmm1, %xmm1 1 5 0.50 * vmovss 2120(%rsp), %xmm2 1 1 1.00 vinsertps $32, %xmm2, %xmm1, %xmm1 1 5 0.50 * vmovss 2124(%rsp), %xmm2 1 1 1.00 vinsertps $48, %xmm2, %xmm1, %xmm1 2 1 1.00 * vmovaps %xmm1, 2064(%rsp) 2 1 1.00 * vmovaps %xmm0, 2048(%rsp) 1 7 0.50 * vmovaps 2048(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm0, 768(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 1 1.00 * movq %rax, 952(%rsp) 1 5 0.50 * movq 952(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 736(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 leaq (%rax,%rcx,4), %rax 1 1 1.00 * movq %rax, 944(%rsp) 1 5 0.50 * movq 944(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 704(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 leaq (%rax,%rcx,8), %rax 1 1 1.00 * movq %rax, 936(%rsp) 1 5 0.50 * movq 936(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 672(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 leaq (%rcx,%rcx,2), %rcx 1 1 0.50 leaq (%rax,%rcx,4), %rax 1 1 1.00 * movq %rax, 928(%rsp) 1 5 0.50 * movq 928(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 640(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 shlq $4, %rcx 1 1 0.25 addq %rcx, %rax 1 1 1.00 * movq %rax, 920(%rsp) 1 5 0.50 * movq 920(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 608(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 leaq (%rcx,%rcx,4), %rcx 1 1 0.50 leaq (%rax,%rcx,4), %rax 1 1 1.00 * movq %rax, 912(%rsp) 1 5 0.50 * movq 912(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 576(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rcx 1 1 0.50 leaq (%rcx,%rcx,2), %rcx 1 1 0.50 leaq (%rax,%rcx,8), %rax 1 1 1.00 * movq %rax, 904(%rsp) 1 5 0.50 * movq 904(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 544(%rsp) 1 5 0.50 * movq 880(%rsp), %rax 1 5 0.50 * movq 872(%rsp), %rdx 1 1 0.50 leaq (%rdx,%rdx,8), %rcx 1 1 0.50 leaq (%rcx,%rcx,2), %rcx 1 1 0.25 addq %rdx, %rcx 1 1 0.25 addq %rcx, %rax 1 1 1.00 * movq %rax, 896(%rsp) 1 5 0.50 * movq 896(%rsp), %rax 1 7 0.50 * vmovups (%rax), %ymm0 2 1 1.00 * vmovaps %ymm0, 512(%rsp) 1 7 0.50 * vmovaps 608(%rsp), %ymm1 1 7 0.50 * vmovaps 576(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1184(%rsp) 2 1 1.00 * vmovaps %ymm0, 1152(%rsp) 1 7 0.50 * vmovaps 1184(%rsp), %ymm0 1 7 0.50 * vmovaps 1152(%rsp), %ymm1 1 1 1.00 vunpcklps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, (%rsp) 1 7 0.50 * vmovaps 544(%rsp), %ymm1 1 7 0.50 * vmovaps 512(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1120(%rsp) 2 1 1.00 * vmovaps %ymm0, 1088(%rsp) 1 7 0.50 * vmovaps 1120(%rsp), %ymm0 1 7 0.50 * vmovaps 1088(%rsp), %ymm1 1 1 1.00 vunpcklps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 32(%rsp) 1 7 0.50 * vmovaps 736(%rsp), %ymm1 1 7 0.50 * vmovaps 704(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1056(%rsp) 2 1 1.00 * vmovaps %ymm0, 1024(%rsp) 1 7 0.50 * vmovaps 1056(%rsp), %ymm0 1 7 0.50 * vmovaps 1024(%rsp), %ymm1 1 1 1.00 vunpcklps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 64(%rsp) 1 7 0.50 * vmovaps 672(%rsp), %ymm1 1 7 0.50 * vmovaps 640(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 992(%rsp) 2 1 1.00 * vmovaps %ymm0, 960(%rsp) 1 7 0.50 * vmovaps 992(%rsp), %ymm0 1 7 0.50 * vmovaps 960(%rsp), %ymm1 1 1 1.00 vunpcklps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 96(%rsp) 1 7 0.50 * vmovaps 608(%rsp), %ymm1 1 7 0.50 * vmovaps 576(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1440(%rsp) 2 1 1.00 * vmovaps %ymm0, 1408(%rsp) 1 7 0.50 * vmovaps 1440(%rsp), %ymm0 1 7 0.50 * vmovaps 1408(%rsp), %ymm1 1 1 1.00 vunpckhps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 128(%rsp) 1 7 0.50 * vmovaps 544(%rsp), %ymm1 1 7 0.50 * vmovaps 512(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1376(%rsp) 2 1 1.00 * vmovaps %ymm0, 1344(%rsp) 1 7 0.50 * vmovaps 1376(%rsp), %ymm0 1 7 0.50 * vmovaps 1344(%rsp), %ymm1 1 1 1.00 vunpckhps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 160(%rsp) 1 7 0.50 * vmovaps 736(%rsp), %ymm1 1 7 0.50 * vmovaps 704(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1312(%rsp) 2 1 1.00 * vmovaps %ymm0, 1280(%rsp) 1 7 0.50 * vmovaps 1312(%rsp), %ymm0 1 7 0.50 * vmovaps 1280(%rsp), %ymm1 1 1 1.00 vunpckhps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 192(%rsp) 1 7 0.50 * vmovaps 672(%rsp), %ymm1 1 7 0.50 * vmovaps 640(%rsp), %ymm0 2 1 1.00 * vmovaps %ymm1, 1248(%rsp) 2 1 1.00 * vmovaps %ymm0, 1216(%rsp) 1 7 0.50 * vmovaps 1248(%rsp), %ymm0 1 7 0.50 * vmovaps 1216(%rsp), %ymm1 1 1 1.00 vunpckhps %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 224(%rsp) 1 7 0.50 * vmovapd (%rsp), %ymm0 1 7 0.50 * vmovapd 32(%rsp), %ymm1 1 1 1.00 vunpcklpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 256(%rsp) 1 7 0.50 * vmovapd 64(%rsp), %ymm0 1 7 0.50 * vmovapd 96(%rsp), %ymm1 1 1 1.00 vunpcklpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 288(%rsp) 1 7 0.50 * vmovapd (%rsp), %ymm0 1 7 0.50 * vmovapd 32(%rsp), %ymm1 1 1 1.00 vunpckhpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 320(%rsp) 1 7 0.50 * vmovapd 64(%rsp), %ymm0 1 7 0.50 * vmovapd 96(%rsp), %ymm1 1 1 1.00 vunpckhpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 352(%rsp) 1 7 0.50 * vmovapd 128(%rsp), %ymm0 1 7 0.50 * vmovapd 160(%rsp), %ymm1 1 1 1.00 vunpcklpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 384(%rsp) 1 7 0.50 * vmovapd 192(%rsp), %ymm0 1 7 0.50 * vmovapd 224(%rsp), %ymm1 1 1 1.00 vunpcklpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 416(%rsp) 1 7 0.50 * vmovapd 128(%rsp), %ymm0 1 7 0.50 * vmovapd 160(%rsp), %ymm1 1 1 1.00 vunpckhpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 448(%rsp) 1 7 0.50 * vmovapd 192(%rsp), %ymm0 1 7 0.50 * vmovapd 224(%rsp), %ymm1 1 1 1.00 vunpckhpd %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 480(%rsp) 1 7 0.50 * vmovaps 256(%rsp), %ymm1 1 7 0.50 * vmovaps 288(%rsp), %ymm0 1 3 1.00 vperm2f128 $32, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 736(%rsp) 1 7 0.50 * vmovaps 320(%rsp), %ymm1 1 7 0.50 * vmovaps 352(%rsp), %ymm0 1 3 1.00 vperm2f128 $32, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 704(%rsp) 1 7 0.50 * vmovaps 384(%rsp), %ymm1 1 7 0.50 * vmovaps 416(%rsp), %ymm0 1 3 1.00 vperm2f128 $32, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 672(%rsp) 1 7 0.50 * vmovaps 448(%rsp), %ymm1 1 7 0.50 * vmovaps 480(%rsp), %ymm0 1 3 1.00 vperm2f128 $32, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 640(%rsp) 1 7 0.50 * vmovaps 256(%rsp), %ymm1 1 7 0.50 * vmovaps 288(%rsp), %ymm0 1 3 1.00 vperm2f128 $49, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 608(%rsp) 1 7 0.50 * vmovaps 320(%rsp), %ymm1 1 7 0.50 * vmovaps 352(%rsp), %ymm0 1 3 1.00 vperm2f128 $49, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 576(%rsp) 1 7 0.50 * vmovaps 384(%rsp), %ymm1 1 7 0.50 * vmovaps 416(%rsp), %ymm0 1 3 1.00 vperm2f128 $49, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 544(%rsp) 1 7 0.50 * vmovaps 448(%rsp), %ymm1 1 7 0.50 * vmovaps 480(%rsp), %ymm0 1 3 1.00 vperm2f128 $49, %ymm1, %ymm0, %ymm0 2 1 1.00 * vmovaps %ymm0, 512(%rsp) 1 5 0.50 * movq 864(%rsp), %rax 2 8 1.00 * imulq $0, 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 736(%rsp), %ymm0 1 1 1.00 * movq %rax, 1976(%rsp) 2 1 1.00 * vmovaps %ymm0, 1920(%rsp) 1 7 0.50 * vmovaps 1920(%rsp), %ymm0 1 5 0.50 * movq 1976(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 1 5 0.50 * movq 856(%rsp), %rcx 1 1 0.50 shlq $0, %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 704(%rsp), %ymm0 1 1 1.00 * movq %rax, 1912(%rsp) 2 1 1.00 * vmovaps %ymm0, 1856(%rsp) 1 7 0.50 * vmovaps 1856(%rsp), %ymm0 1 5 0.50 * movq 1912(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 1 5 0.50 * movq 856(%rsp), %rcx 1 1 0.50 shlq %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 672(%rsp), %ymm0 1 1 1.00 * movq %rax, 1848(%rsp) 2 1 1.00 * vmovaps %ymm0, 1792(%rsp) 1 7 0.50 * vmovaps 1792(%rsp), %ymm0 1 5 0.50 * movq 1848(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 2 8 1.00 * imulq $3, 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 640(%rsp), %ymm0 1 1 1.00 * movq %rax, 1784(%rsp) 2 1 1.00 * vmovaps %ymm0, 1728(%rsp) 1 7 0.50 * vmovaps 1728(%rsp), %ymm0 1 5 0.50 * movq 1784(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 1 5 0.50 * movq 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 608(%rsp), %ymm0 1 1 1.00 * movq %rax, 1720(%rsp) 2 1 1.00 * vmovaps %ymm0, 1664(%rsp) 1 7 0.50 * vmovaps 1664(%rsp), %ymm0 1 5 0.50 * movq 1720(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 2 8 1.00 * imulq $5, 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 576(%rsp), %ymm0 1 1 1.00 * movq %rax, 1656(%rsp) 2 1 1.00 * vmovaps %ymm0, 1600(%rsp) 1 7 0.50 * vmovaps 1600(%rsp), %ymm0 1 5 0.50 * movq 1656(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 2 8 1.00 * imulq $6, 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 544(%rsp), %ymm0 1 1 1.00 * movq %rax, 1592(%rsp) 2 1 1.00 * vmovaps %ymm0, 1536(%rsp) 1 7 0.50 * vmovaps 1536(%rsp), %ymm0 1 5 0.50 * movq 1592(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 5 0.50 * movq 864(%rsp), %rax 2 8 1.00 * imulq $7, 856(%rsp), %rcx 1 1 0.50 shlq $2, %rcx 1 1 0.25 addq %rcx, %rax 1 7 0.50 * vmovaps 512(%rsp), %ymm0 1 1 1.00 * movq %rax, 1528(%rsp) 2 1 1.00 * vmovaps %ymm0, 1472(%rsp) 1 7 0.50 * vmovaps 1472(%rsp), %ymm0 1 5 0.50 * movq 1528(%rsp), %rax 2 1 1.00 * vmovups %ymm0, (%rax) 1 1 0.25 movq %rbp, %rsp 2 6 0.50 * popq %rbp 4 0 1.00 U vzeroupper 3 7 1.00 U retq Resources: [0] - HWDivider [1] - HWFPDivider [2] - HWPort0 [3] - HWPort1 [4] - HWPort2 [5] - HWPort3 [6] - HWPort4 [7] - HWPort5 [8] - HWPort6 [9] - HWPort7 Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] - - 10.01 14.98 90.00 90.01 111.00 47.00 10.01 89.99 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: - - - 0.98 - 0.99 1.00 0.01 0.01 0.01 pushq %rbp - - - 0.01 - - - 0.01 0.98 - movq %rsp, %rbp - - 0.01 0.01 - - - 0.98 - - andq $-32, %rsp - - 0.01 0.98 - - - - 0.01 - subq $2144, %rsp - - - - 0.99 0.01 1.00 - - - movq %rdi, 880(%rsp) - - - - - - 1.00 - - 1.00 movq %rsi, 872(%rsp) - - - - 0.01 - 1.00 - - 0.99 movq %rdx, 864(%rsp) - - - - - - 1.00 - - 1.00 movq %rcx, 856(%rsp) - - - - - 0.01 1.00 - - 0.99 vmovss %xmm0, 852(%rsp) - - - - 0.01 - 1.00 - - 0.99 vmovss %xmm1, 848(%rsp) - - - - 1.00 - - - - - vmovss 852(%rsp), %xmm0 - - - - - - 1.00 - - 1.00 vmovss %xmm0, 892(%rsp) - - - - - 1.00 - - - - vmovss 892(%rsp), %xmm0 - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2044(%rsp) - - - - 0.99 - 1.00 - - 0.01 vmovss %xmm0, 2040(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2036(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2032(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2028(%rsp) - - - - 0.01 - 1.00 - - 0.99 vmovss %xmm0, 2024(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2020(%rsp) - - - - 0.99 0.01 1.00 - - - vmovss %xmm0, 2016(%rsp) - - - - - 1.00 - - - - vmovss 2020(%rsp), %xmm1 - - - - 1.00 - - - - - vmovss 2016(%rsp), %xmm0 - - - - - - - 1.00 - - vinsertps $16, %xmm1, %xmm0, %xmm0 - - - - - 1.00 - - - - vmovss 2024(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $32, %xmm1, %xmm0, %xmm0 - - - - 1.00 - - - - - vmovss 2028(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $48, %xmm1, %xmm0, %xmm0 - - - - - 1.00 - - - - vmovss 2036(%rsp), %xmm2 - - - - 1.00 - - - - - vmovss 2032(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $16, %xmm2, %xmm1, %xmm1 - - - - - 1.00 - - - - vmovss 2040(%rsp), %xmm2 - - - - - - - 1.00 - - vinsertps $32, %xmm2, %xmm1, %xmm1 - - - - 1.00 - - - - - vmovss 2044(%rsp), %xmm2 - - - - - - - 1.00 - - vinsertps $48, %xmm2, %xmm1, %xmm1 - - - - - - 1.00 - - 1.00 vmovaps %xmm1, 2000(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %xmm0, 1984(%rsp) - - - - - 1.00 - - - - vmovaps 1984(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 800(%rsp) - - - - 0.01 0.99 - - - - vmovss 848(%rsp), %xmm0 - - - - 0.99 - 1.00 - - 0.01 vmovss %xmm0, 888(%rsp) - - - - 0.99 0.01 - - - - vmovss 888(%rsp), %xmm0 - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2124(%rsp) - - - - 0.01 - 1.00 - - 0.99 vmovss %xmm0, 2120(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2116(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2112(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2108(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2104(%rsp) - - - - 0.01 - 1.00 - - 0.99 vmovss %xmm0, 2100(%rsp) - - - - - - 1.00 - - 1.00 vmovss %xmm0, 2096(%rsp) - - - - 0.01 0.99 - - - - vmovss 2100(%rsp), %xmm1 - - - - 0.99 0.01 - - - - vmovss 2096(%rsp), %xmm0 - - - - - - - 1.00 - - vinsertps $16, %xmm1, %xmm0, %xmm0 - - - - 0.01 0.99 - - - - vmovss 2104(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $32, %xmm1, %xmm0, %xmm0 - - - - 0.99 0.01 - - - - vmovss 2108(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $48, %xmm1, %xmm0, %xmm0 - - - - 0.01 0.99 - - - - vmovss 2116(%rsp), %xmm2 - - - - 0.99 0.01 - - - - vmovss 2112(%rsp), %xmm1 - - - - - - - 1.00 - - vinsertps $16, %xmm2, %xmm1, %xmm1 - - - - 0.01 0.99 - - - - vmovss 2120(%rsp), %xmm2 - - - - - - - 1.00 - - vinsertps $32, %xmm2, %xmm1, %xmm1 - - - - 0.99 0.01 - - - - vmovss 2124(%rsp), %xmm2 - - - - - - - 1.00 - - vinsertps $48, %xmm2, %xmm1, %xmm1 - - - - - - 1.00 - - 1.00 vmovaps %xmm1, 2064(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %xmm0, 2048(%rsp) - - - - 0.01 0.99 - - - - vmovaps 2048(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 768(%rsp) - - - - - 1.00 - - - - movq 880(%rsp), %rax - - - - - - 1.00 - - 1.00 movq %rax, 952(%rsp) - - - - 0.99 0.01 - - - - movq 952(%rsp), %rax - - - - 0.99 0.01 - - - - vmovups (%rax), %ymm0 - - - - - 0.99 1.00 - - 0.01 vmovaps %ymm0, 736(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rcx - - - 1.00 - - - - - - leaq (%rax,%rcx,4), %rax - - - - - - 1.00 - - 1.00 movq %rax, 944(%rsp) - - - - 0.01 0.99 - - - - movq 944(%rsp), %rax - - - - 1.00 - - - - - vmovups (%rax), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 704(%rsp) - - - - - 1.00 - - - - movq 880(%rsp), %rax - - - - 1.00 - - - - - movq 872(%rsp), %rcx - - - 1.00 - - - - - - leaq (%rax,%rcx,8), %rax - - - - - - 1.00 - - 1.00 movq %rax, 936(%rsp) - - - - - 1.00 - - - - movq 936(%rsp), %rax - - - - 1.00 - - - - - vmovups (%rax), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 672(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rcx - - - - - - - 1.00 - - leaq (%rcx,%rcx,2), %rcx - - - 1.00 - - - - - - leaq (%rax,%rcx,4), %rax - - - - - - 1.00 - - 1.00 movq %rax, 928(%rsp) - - - - - 1.00 - - - - movq 928(%rsp), %rax - - - - - 1.00 - - - - vmovups (%rax), %ymm0 - - - - - 0.99 1.00 - - 0.01 vmovaps %ymm0, 640(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rcx - - 0.99 - - - - - 0.01 - shlq $4, %rcx - - 0.01 - - - - - 0.99 - addq %rcx, %rax - - - - - - 1.00 - - 1.00 movq %rax, 920(%rsp) - - - - - 1.00 - - - - movq 920(%rsp), %rax - - - - - 1.00 - - - - vmovups (%rax), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 608(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rcx - - - - - - - 1.00 - - leaq (%rcx,%rcx,4), %rcx - - - 1.00 - - - - - - leaq (%rax,%rcx,4), %rax - - - - - - 1.00 - - 1.00 movq %rax, 912(%rsp) - - - - 1.00 - - - - - movq 912(%rsp), %rax - - - - 1.00 - - - - - vmovups (%rax), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 576(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rcx - - - - - - - 1.00 - - leaq (%rcx,%rcx,2), %rcx - - - 1.00 - - - - - - leaq (%rax,%rcx,8), %rax - - - - - - 1.00 - - 1.00 movq %rax, 904(%rsp) - - - - 1.00 - - - - - movq 904(%rsp), %rax - - - - 0.01 0.99 - - - - vmovups (%rax), %ymm0 - - - - 0.01 0.99 1.00 - - - vmovaps %ymm0, 544(%rsp) - - - - 0.01 0.99 - - - - movq 880(%rsp), %rax - - - - 0.99 0.01 - - - - movq 872(%rsp), %rdx - - - - - - - 1.00 - - leaq (%rdx,%rdx,8), %rcx - - - 1.00 - - - - - - leaq (%rcx,%rcx,2), %rcx - - 0.99 - - - - - 0.01 - addq %rdx, %rcx - - 0.01 - - - - - 0.99 - addq %rcx, %rax - - - - - - 1.00 - - 1.00 movq %rax, 896(%rsp) - - - - - 1.00 - - - - movq 896(%rsp), %rax - - - - 0.99 0.01 - - - - vmovups (%rax), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 512(%rsp) - - - - 1.00 - - - - - vmovaps 608(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 576(%rsp), %ymm0 - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm1, 1184(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1152(%rsp) - - - - - 1.00 - - - - vmovaps 1184(%rsp), %ymm0 - - - - 1.00 - - - - - vmovaps 1152(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklps %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, (%rsp) - - - - - 1.00 - - - - vmovaps 544(%rsp), %ymm1 - - - - 1.00 - - - - - vmovaps 512(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm1, 1120(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1088(%rsp) - - - - 0.01 0.99 - - - - vmovaps 1120(%rsp), %ymm0 - - - - 0.99 0.01 - - - - vmovaps 1088(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklps %ymm1, %ymm0, %ymm0 - - - - - 0.01 1.00 - - 0.99 vmovaps %ymm0, 32(%rsp) - - - - 0.01 0.99 - - - - vmovaps 736(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 704(%rsp), %ymm0 - - - - - 0.99 1.00 - - 0.01 vmovaps %ymm1, 1056(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1024(%rsp) - - - - - 1.00 - - - - vmovaps 1056(%rsp), %ymm0 - - - - 1.00 - - - - - vmovaps 1024(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklps %ymm1, %ymm0, %ymm0 - - - - 0.01 - 1.00 - - 0.99 vmovaps %ymm0, 64(%rsp) - - - - - 1.00 - - - - vmovaps 672(%rsp), %ymm1 - - - - 1.00 - - - - - vmovaps 640(%rsp), %ymm0 - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm1, 992(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 960(%rsp) - - - - 0.99 0.01 - - - - vmovaps 992(%rsp), %ymm0 - - - - 0.01 0.99 - - - - vmovaps 960(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklps %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 96(%rsp) - - - - 0.99 0.01 - - - - vmovaps 608(%rsp), %ymm1 - - - - 0.01 0.99 - - - - vmovaps 576(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm1, 1440(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1408(%rsp) - - - - - 1.00 - - - - vmovaps 1440(%rsp), %ymm0 - - - - 1.00 - - - - - vmovaps 1408(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhps %ymm1, %ymm0, %ymm0 - - - - 0.01 - 1.00 - - 0.99 vmovaps %ymm0, 128(%rsp) - - - - - 1.00 - - - - vmovaps 544(%rsp), %ymm1 - - - - 1.00 - - - - - vmovaps 512(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm1, 1376(%rsp) - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm0, 1344(%rsp) - - - - 0.01 0.99 - - - - vmovaps 1376(%rsp), %ymm0 - - - - 0.99 0.01 - - - - vmovaps 1344(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhps %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 160(%rsp) - - - - 0.01 0.99 - - - - vmovaps 736(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 704(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm1, 1312(%rsp) - - - - 0.01 - 1.00 - - 0.99 vmovaps %ymm0, 1280(%rsp) - - - - 0.99 0.01 - - - - vmovaps 1312(%rsp), %ymm0 - - - - 0.01 0.99 - - - - vmovaps 1280(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhps %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 192(%rsp) - - - - 0.99 0.01 - - - - vmovaps 672(%rsp), %ymm1 - - - - 0.01 0.99 - - - - vmovaps 640(%rsp), %ymm0 - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm1, 1248(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1216(%rsp) - - - - - 1.00 - - - - vmovaps 1248(%rsp), %ymm0 - - - - 1.00 - - - - - vmovaps 1216(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhps %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 224(%rsp) - - - - - 1.00 - - - - vmovapd (%rsp), %ymm0 - - - - 1.00 - - - - - vmovapd 32(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklpd %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 256(%rsp) - - - - - 1.00 - - - - vmovapd 64(%rsp), %ymm0 - - - - 0.99 0.01 - - - - vmovapd 96(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklpd %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 288(%rsp) - - - - 0.01 0.99 - - - - vmovapd (%rsp), %ymm0 - - - - 0.99 0.01 - - - - vmovapd 32(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhpd %ymm1, %ymm0, %ymm0 - - - - 0.01 - 1.00 - - 0.99 vmovaps %ymm0, 320(%rsp) - - - - 0.01 0.99 - - - - vmovapd 64(%rsp), %ymm0 - - - - 0.99 0.01 - - - - vmovapd 96(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhpd %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 352(%rsp) - - - - 0.01 0.99 - - - - vmovapd 128(%rsp), %ymm0 - - - - - 1.00 - - - - vmovapd 160(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklpd %ymm1, %ymm0, %ymm0 - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm0, 384(%rsp) - - - - 0.99 0.01 - - - - vmovapd 192(%rsp), %ymm0 - - - - 0.01 0.99 - - - - vmovapd 224(%rsp), %ymm1 - - - - - - - 1.00 - - vunpcklpd %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 416(%rsp) - - - - 0.99 0.01 - - - - vmovapd 128(%rsp), %ymm0 - - - - 0.01 0.99 - - - - vmovapd 160(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhpd %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 448(%rsp) - - - - 0.99 0.01 - - - - vmovapd 192(%rsp), %ymm0 - - - - 0.01 0.99 - - - - vmovapd 224(%rsp), %ymm1 - - - - - - - 1.00 - - vunpckhpd %ymm1, %ymm0, %ymm0 - - - - 0.01 - 1.00 - - 0.99 vmovaps %ymm0, 480(%rsp) - - - - - 1.00 - - - - vmovaps 256(%rsp), %ymm1 - - - - 1.00 - - - - - vmovaps 288(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $32, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 736(%rsp) - - - - - 1.00 - - - - vmovaps 320(%rsp), %ymm1 - - - - 1.00 - - - - - vmovaps 352(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $32, %ymm1, %ymm0, %ymm0 - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm0, 704(%rsp) - - - - - 1.00 - - - - vmovaps 384(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 416(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $32, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 672(%rsp) - - - - 0.01 0.99 - - - - vmovaps 448(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 480(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $32, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 640(%rsp) - - - - 0.01 0.99 - - - - vmovaps 256(%rsp), %ymm1 - - - - 0.99 0.01 - - - - vmovaps 288(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $49, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 608(%rsp) - - - - 0.01 0.99 - - - - vmovaps 320(%rsp), %ymm1 - - - - - 1.00 - - - - vmovaps 352(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $49, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 576(%rsp) - - - - 0.99 0.01 - - - - vmovaps 384(%rsp), %ymm1 - - - - 0.01 0.99 - - - - vmovaps 416(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $49, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 544(%rsp) - - - - 0.99 0.01 - - - - vmovaps 448(%rsp), %ymm1 - - - - 0.01 0.99 - - - - vmovaps 480(%rsp), %ymm0 - - - - - - - 1.00 - - vperm2f128 $49, %ymm1, %ymm0, %ymm0 - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 512(%rsp) - - - - 0.99 0.01 - - - - movq 864(%rsp), %rax - - - 1.00 0.01 0.99 - - - - imulq $0, 856(%rsp), %rcx - - 0.99 - - - - - 0.01 - shlq $2, %rcx - - 0.99 - - - - - 0.01 - addq %rcx, %rax - - - - - 1.00 - - - - vmovaps 736(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1976(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1920(%rsp) - - - - 1.00 - - - - - vmovaps 1920(%rsp), %ymm0 - - - - - 1.00 - - - - movq 1976(%rsp), %rax - - - - - 0.01 1.00 - - 0.99 vmovups %ymm0, (%rax) - - - - 1.00 - - - - - movq 864(%rsp), %rax - - - - - 1.00 - - - - movq 856(%rsp), %rcx - - 0.01 - - - - - 0.99 - shlq $0, %rcx - - 0.01 - - - - - 0.99 - shlq $2, %rcx - - - - - - - 1.00 - - addq %rcx, %rax - - - - 1.00 - - - - - vmovaps 704(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1912(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1856(%rsp) - - - - - 1.00 - - - - vmovaps 1856(%rsp), %ymm0 - - - - 1.00 - - - - - movq 1912(%rsp), %rax - - - - 0.99 - 1.00 - - 0.01 vmovups %ymm0, (%rax) - - - - - 1.00 - - - - movq 864(%rsp), %rax - - - - 1.00 - - - - - movq 856(%rsp), %rcx - - 0.99 - - - - - 0.01 - shlq %rcx - - 0.01 - - - - - 0.99 - shlq $2, %rcx - - - - - - - 1.00 - - addq %rcx, %rax - - - - - 1.00 - - - - vmovaps 672(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1848(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1792(%rsp) - - - - 1.00 - - - - - vmovaps 1792(%rsp), %ymm0 - - - - - 1.00 - - - - movq 1848(%rsp), %rax - - - - - 0.01 1.00 - - 0.99 vmovups %ymm0, (%rax) - - - - 1.00 - - - - - movq 864(%rsp), %rax - - - 1.00 - 1.00 - - - - imulq $3, 856(%rsp), %rcx - - 0.99 - - - - - 0.01 - shlq $2, %rcx - - - - - - - 1.00 - - addq %rcx, %rax - - - - 1.00 - - - - - vmovaps 640(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1784(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1728(%rsp) - - - - 0.01 0.99 - - - - vmovaps 1728(%rsp), %ymm0 - - - - 0.99 0.01 - - - - movq 1784(%rsp), %rax - - - - - - 1.00 - - 1.00 vmovups %ymm0, (%rax) - - - - 0.01 0.99 - - - - movq 864(%rsp), %rax - - - - 0.99 0.01 - - - - movq 856(%rsp), %rcx - - 0.01 - - - - - 0.99 - shlq $2, %rcx - - 0.99 - - - - - 0.01 - shlq $2, %rcx - - - 0.99 - - - 0.01 - - addq %rcx, %rax - - - - 0.01 0.99 - - - - vmovaps 608(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1720(%rsp) - - - - 0.99 - 1.00 - - 0.01 vmovaps %ymm0, 1664(%rsp) - - - - - 1.00 - - - - vmovaps 1664(%rsp), %ymm0 - - - - 1.00 - - - - - movq 1720(%rsp), %rax - - - - 0.01 - 1.00 - - 0.99 vmovups %ymm0, (%rax) - - - - - 1.00 - - - - movq 864(%rsp), %rax - - - 1.00 1.00 - - - - - imulq $5, 856(%rsp), %rcx - - 0.01 - - - - - 0.99 - shlq $2, %rcx - - - - - - - 0.99 0.01 - addq %rcx, %rax - - - - 0.01 0.99 - - - - vmovaps 576(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1656(%rsp) - - - - - - 1.00 - - 1.00 vmovaps %ymm0, 1600(%rsp) - - - - 0.99 0.01 - - - - vmovaps 1600(%rsp), %ymm0 - - - - 0.01 0.99 - - - - movq 1656(%rsp), %rax - - - - 0.99 0.01 1.00 - - - vmovups %ymm0, (%rax) - - - - 0.99 0.01 - - - - movq 864(%rsp), %rax - - - 1.00 0.01 0.99 - - - - imulq $6, 856(%rsp), %rcx - - 1.00 - - - - - - - shlq $2, %rcx - - - 0.99 - - - - 0.01 - addq %rcx, %rax - - - - 0.99 0.01 - - - - vmovaps 544(%rsp), %ymm0 - - - - - - 1.00 - - 1.00 movq %rax, 1592(%rsp) - - - - 0.99 0.01 1.00 - - - vmovaps %ymm0, 1536(%rsp) - - - - 0.01 0.99 - - - - vmovaps 1536(%rsp), %ymm0 - - - - 0.99 0.01 - - - - movq 1592(%rsp), %rax - - - - - - 1.00 - - 1.00 vmovups %ymm0, (%rax) - - - - 0.01 0.99 - - - - movq 864(%rsp), %rax - - - 1.00 - 1.00 - - - - imulq $7, 856(%rsp), %rcx - - 1.00 - - - - - - - shlq $2, %rcx - - - 0.01 - - - 0.99 - - addq %rcx, %rax - - - - 0.99 0.01 - - - - vmovaps 512(%rsp), %ymm0 - - - - - 1.00 1.00 - - - movq %rax, 1528(%rsp) - - - - 1.00 - 1.00 - - - vmovaps %ymm0, 1472(%rsp) - - - - 0.01 0.99 - - - - vmovaps 1472(%rsp), %ymm0 - - - - 0.99 0.01 - - - - movq 1528(%rsp), %rax - - - - - - 1.00 - - 1.00 vmovups %ymm0, (%rax) - - 0.99 - - - - 0.01 - - movq %rbp, %rsp - - - 0.01 0.01 0.99 - - 0.99 - popq %rbp - - - - - - - - - - vzeroupper - - - - 0.01 0.99 - 1.00 1.00 - retq
In [ ]: