Forming Architectural Performance Expectations¶
In [1]:
# Create the scratch directory that later cells write to and compile in
# (-p: do not fail if the directory already exists).
!mkdir -p tmp
This notebook demonstrates the Intel Architecture Code Analyzer (IACA), a tool released by Intel.
There is also an open-source clone called OSACA, developed at RRZ Erlangen-Nuernberg.
In [2]:
%%writefile tmp/transpose.c
#include <x86intrin.h>
#include <iacaMarks.h>
// 8x8 transpose kernel stolen from
// https://github.com/springer13/hptt/blob/e1017ef8b8ed0b6f3bb3b70df825a87f94c643e8/src/transpose.cpp#L137
void execute(const float* __restrict__ A, const size_t lda, float* __restrict__ B, const size_t ldb, const float alpha ,const float beta)
{
IACA_START
__m256 reg_alpha = _mm256_set1_ps(alpha); // do not alter the content of B
__m256 reg_beta = _mm256_set1_ps(beta); // do not alter the content of B
//Load A
__m256 rowA0 = _mm256_loadu_ps((A +0*lda));
__m256 rowA1 = _mm256_loadu_ps((A +1*lda));
__m256 rowA2 = _mm256_loadu_ps((A +2*lda));
__m256 rowA3 = _mm256_loadu_ps((A +3*lda));
__m256 rowA4 = _mm256_loadu_ps((A +4*lda));
__m256 rowA5 = _mm256_loadu_ps((A +5*lda));
__m256 rowA6 = _mm256_loadu_ps((A +6*lda));
__m256 rowA7 = _mm256_loadu_ps((A +7*lda));
//8x8 transpose micro kernel
__m256 r121, r139, r120, r138, r71, r89, r70, r88, r11, r1, r55, r29, r10, r0, r54, r28;
r28 = _mm256_unpacklo_ps( rowA4, rowA5 );
r54 = _mm256_unpacklo_ps( rowA6, rowA7 );
r0 = _mm256_unpacklo_ps( rowA0, rowA1 );
r10 = _mm256_unpacklo_ps( rowA2, rowA3 );
r29 = _mm256_unpackhi_ps( rowA4, rowA5 );
r55 = _mm256_unpackhi_ps( rowA6, rowA7 );
r1 = _mm256_unpackhi_ps( rowA0, rowA1 );
r11 = _mm256_unpackhi_ps( rowA2, rowA3 );
r88 = _mm256_shuffle_ps( r28, r54, 0x44 );
r70 = _mm256_shuffle_ps( r0, r10, 0x44 );
r89 = _mm256_shuffle_ps( r28, r54, 0xee );
r71 = _mm256_shuffle_ps( r0, r10, 0xee );
r138 = _mm256_shuffle_ps( r29, r55, 0x44 );
r120 = _mm256_shuffle_ps( r1, r11, 0x44 );
r139 = _mm256_shuffle_ps( r29, r55, 0xee );
r121 = _mm256_shuffle_ps( r1, r11, 0xee );
rowA0 = _mm256_permute2f128_ps( r88, r70, 0x2 );
rowA1 = _mm256_permute2f128_ps( r89, r71, 0x2 );
rowA2 = _mm256_permute2f128_ps( r138, r120, 0x2 );
rowA3 = _mm256_permute2f128_ps( r139, r121, 0x2 );
rowA4 = _mm256_permute2f128_ps( r88, r70, 0x13 );
rowA5 = _mm256_permute2f128_ps( r89, r71, 0x13 );
rowA6 = _mm256_permute2f128_ps( r138, r120, 0x13 );
rowA7 = _mm256_permute2f128_ps( r139, r121, 0x13 );
_mm256_storeu_ps((B + 0 * ldb), rowA0);
_mm256_storeu_ps((B + 1 * ldb), rowA1);
_mm256_storeu_ps((B + 2 * ldb), rowA2);
_mm256_storeu_ps((B + 3 * ldb), rowA3);
_mm256_storeu_ps((B + 4 * ldb), rowA4);
_mm256_storeu_ps((B + 5 * ldb), rowA5);
_mm256_storeu_ps((B + 6 * ldb), rowA6);
_mm256_storeu_ps((B + 7 * ldb), rowA7);
IACA_END
}
Overwriting tmp/transpose.c
In [3]:
# Compile the kernel to an object file targeting Haswell; -I adds the IACA
# include directory so that <iacaMarks.h> is found.
# NOTE(review): no -O level is passed, so gcc compiles at its default
# (unoptimized) level and the report reflects that code -- confirm this is intended.
!(cd tmp; gcc -c -march=haswell -I$HOME/pack/iaca-lin64/include transpose.c)
# Run IACA on the resulting object file (-64: treat it as a 64-bit binary).
# NOTE(review): both commands assume IACA is unpacked under ~/pack/iaca-lin64;
# adjust the path for the local installation.
!~/pack/iaca-lin64/bin/iaca.sh -64 tmp/transpose.o
Intel(R) Architecture Code Analyzer Version - 2.1 Analyzed File - tmp/transpose.o Binary Format - 64Bit Architecture - HSW Analysis Type - Throughput Throughput Analysis Report -------------------------- Block Throughput: 107.00 Cycles Throughput Bottleneck: PORT2_AGU, PORT3_AGU Port Binding In Cycles Per Iteration: -------------------------------------------------------------------------------------------------- | Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------------------------- | Cycles | 10.6 0.0 | 13.6 | 107.0 69.0 | 107.0 69.0 | 84.0 | 24.0 | 13.7 | 8.0 | -------------------------------------------------------------------------------------------------- N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0) D - Data fetch pipe (on ports 2 and 3), CP - on a critical path F - Macro Fusion with the previous instruction occurred * - instruction micro-ops not bound to a port ^ - Micro Fusion happened # - ESP Tracking sync uop was issued @ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected ! 
- instruction not supported, was not accounted in Analysis | Num Of | Ports pressure in cycles | | | Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | | --------------------------------------------------------------------------------- | 1 | | | 1.0 1.0 | | | | | | CP | vmovss xmm0, dword ptr [rsp-0x6c] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovss dword ptr [rsp-0x40], xmm0 | 1 | | | | 1.0 1.0 | | | | | CP | vbroadcastss ymm0, dword ptr [rsp-0x40] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x728], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovss xmm0, dword ptr [rsp-0x70] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovss dword ptr [rsp-0x3c], xmm0 | 1 | | | | 1.0 1.0 | | | | | CP | vbroadcastss ymm0, dword ptr [rsp-0x3c] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x708], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x38], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x38] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6e8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x58] | 1 | | 1.0 | | | | | | | | lea rdx, ptr [rax*4] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | 0.1 | 0.1 | | | | | 0.9 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x30], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x30] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6c8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x58] | 1 | | 1.0 | | | | | | | | lea rdx, ptr [rax*8] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | 0.1 | | | | | | 0.9 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x28], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr 
[rsp-0x28] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6a8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rdx, qword ptr [rsp-0x58] | 0* | | | | | | | | | | mov rax, rdx | 1 | | 0.2 | | | | | 0.9 | | | add rax, rax | 1 | 0.1 | 0.8 | | | | | 0.2 | | | add rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x2 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | | 0.2 | | | | | 0.9 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x20], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x20] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x688], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x58] | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x4 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | 0.2 | 0.8 | | | | | 0.1 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x18], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x18] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x668], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rdx, qword ptr [rsp-0x58] | 0* | | | | | | | | | | mov rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x2 | 1 | | 0.2 | | | | | 0.8 | | | add rax, rdx | 1 | 1.0 | | | | | | | | | shl rax, 0x2 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | | 0.9 | | | | | 0.2 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x10], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x10] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x648], 
ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rdx, qword ptr [rsp-0x58] | 0* | | | | | | | | | | mov rax, rdx | 1 | | 0.1 | | | | | 0.9 | | | add rax, rax | 1 | 0.1 | 0.8 | | | | | 0.2 | | | add rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x3 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x50] | 1 | | 0.2 | | | | | 0.8 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp-0x8], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x628], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x58] | ! | | | | | | | | | | imul rax, rax, 0x1c | 1 | | | 1.0 1.0 | | | | | | CP | mov rdx, qword ptr [rsp-0x50] | 1 | 0.2 | 0.8 | | | | | 0.1 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp], rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp] | 1 | | | 1.0 1.0 | | | | | | CP | vmovups ymm0, ymmword ptr [rax] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x608], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x668] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x28], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x648] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x28] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpcklps ymm0, ymm0, ymmword ptr [rsp+0x8] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x5e8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x628] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x68], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x608] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x48], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | 
vmovaps ymm0, ymmword ptr [rsp+0x68] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpcklps ymm0, ymm0, ymmword ptr [rsp+0x48] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x5c8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6e8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0xa8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6c8] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x88], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0xa8] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpcklps ymm0, ymm0, ymmword ptr [rsp+0x88] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x5a8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6a8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0xe8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x688] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0xc8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0xe8] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpcklps ymm0, ymm0, ymmword ptr [rsp+0xc8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x588], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x668] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x128], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x648] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x108], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x128] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpckhps ymm0, ymm0, ymmword ptr [rsp+0x108] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x568], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x628] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x168], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x608] | 2^ | | | 
| 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x148], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x168] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpckhps ymm0, ymm0, ymmword ptr [rsp+0x148] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x548], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6e8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x1a8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6c8] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x188], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x1a8] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpckhps ymm0, ymm0, ymmword ptr [rsp+0x188] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x528], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6a8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x1e8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x688] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x1c8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x1e8] | 2^ | | | 1.0 1.0 | | | 1.0 | | | CP | vunpckhps ymm0, ymm0, ymmword ptr [rsp+0x1c8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x508], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x5e8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x5c8] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0x44 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x4e8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x5a8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x588] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0x44 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x4c8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x5e8] | 1 | | | 1.0 1.0 
| | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x5c8] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0xee | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x4a8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x5a8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x588] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0xee | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x488], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x568] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x548] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0x44 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x468], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x528] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x508] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0x44 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x448], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x568] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x548] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0xee | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x428], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x528] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x508] | 1 | | | | | | 1.0 | | | | vshufps ymm0, ymm0, ymm1, 0xee | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x408], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x4e8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x4c8] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x2 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6e8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x4a8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr 
[rsp+0x488] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x2 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6c8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x468] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x448] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x2 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x6a8], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x428] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x408] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x2 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x688], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x4e8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x4c8] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x13 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x668], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x4a8] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x488] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x13 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x648], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x468] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x448] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x13 | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x628], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x428] | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm1, ymmword ptr [rsp+0x408] | 1 | | | | | | 1.0 | | | | vperm2f128 ymm0, ymm0, ymm1, 0x13 | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x608], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x60] | 2^ | | | 1.0 | | 1.0 | | | | CP | mov qword ptr [rsp+0x240], rax | 1 | | | 1.0 1.0 | | | | | | 
CP | vmovaps ymm0, ymmword ptr [rsp+0x6e8] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x208], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x208] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp+0x240] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x68] | 1 | | 1.0 | | | | | | | | lea rdx, ptr [rax*4] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.1 | | | | | 0.8 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp+0x280], rax | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6c8] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x248], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x248] | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp+0x280] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x68] | 1 | | 1.0 | | | | | | | | lea rdx, ptr [rax*8] | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.1 | | | | | 0.9 | | | add rax, rdx | 2^ | | | 1.0 | | 1.0 | | | | CP | mov qword ptr [rsp+0x2c0], rax | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x6a8] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x288], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x288] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp+0x2c0] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rdx, qword ptr [rsp-0x68] | 0* | | | | | | | | | | mov rax, rdx | 1 | 0.1 | 0.2 | | | | | 0.7 | | | add rax, rax | 1 | 0.2 | 0.6 | | | | | 0.2 | | | add rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x2 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.2 | | | | | 0.8 | | | 
add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp+0x300], rax | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x688] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x2c8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x2c8] | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp+0x300] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x68] | 1 | 1.0 | | | | | | | | | shl rax, 0x4 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.9 | | | | | 0.1 | | | add rax, rdx | 2^ | | | 1.0 | | 1.0 | | | | CP | mov qword ptr [rsp+0x340], rax | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x668] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x308], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x308] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp+0x340] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rdx, qword ptr [rsp-0x68] | 0* | | | | | | | | | | mov rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x2 | 1 | | 0.2 | | | | | 0.9 | | | add rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x2 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.8 | | | | | 0.2 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp+0x380], rax | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x648] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x348], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x348] | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp+0x380] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | mov rdx, qword ptr [rsp-0x68] | 0* | | | | | | | 
| | | mov rax, rdx | 1 | | 0.2 | | | | | 0.9 | | | add rax, rax | 1 | | 0.8 | | | | | 0.2 | | | add rax, rdx | 1 | 0.9 | | | | | | 0.1 | | | shl rax, 0x3 | 0* | | | | | | | | | | mov rdx, rax | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x60] | 1 | 0.1 | 0.2 | | | | | 0.8 | | | add rax, rdx | 2^ | | | 1.0 | | 1.0 | | | | CP | mov qword ptr [rsp+0x3c0], rax | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x628] | 2^ | | | | 1.0 | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x388], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x388] | 1 | | | 1.0 1.0 | | | | | | CP | mov rax, qword ptr [rsp+0x3c0] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp-0x68] | ! | | | | | | | | | | imul rax, rax, 0x1c | 1 | | | 1.0 1.0 | | | | | | CP | mov rdx, qword ptr [rsp-0x60] | 1 | | 0.9 | | | | | 0.2 | | | add rax, rdx | 2^ | | | | 1.0 | 1.0 | | | | CP | mov qword ptr [rsp+0x400], rax | 1 | | | | 1.0 1.0 | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x608] | 2^ | | | 1.0 | | 1.0 | | | | CP | vmovaps ymmword ptr [rsp+0x3c8], ymm0 | 1 | | | 1.0 1.0 | | | | | | CP | vmovaps ymm0, ymmword ptr [rsp+0x3c8] | 1 | | | | 1.0 1.0 | | | | | CP | mov rax, qword ptr [rsp+0x400] | 2^ | | | | | 1.0 | | | 1.0 | | vmovups ymmword ptr [rax], ymm0 Total Num Of Uops: 368
In [ ]: