!rm -Rf tmp
!mkdir -p tmp

%%writefile tmp/inline-assembly.c

#include <x86intrin.h>
#include <stdio.h>

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float b[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float result[8];

    __m256 avec = _mm256_loadu_ps(a);
    __m256 bvec = _mm256_loadu_ps(b);
    __m256 resultvec;
    
    asm ("vaddps %1, %2, %0"
        : "=x" (resultvec)
        : "x" (avec), "x" (bvec) /* "r" for normal registers, "x"
        : /* clobbers */
        );
    
    _mm256_storeu_ps(result, resultvec);
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}

Overwriting tmp/inline-assembly.c

!cd tmp; gcc -march=sandybridge -c -O inline-assembly.c
!objdump --disassemble tmp/inline-assembly.o

tmp/inline-assembly.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <main>:
   0:	4c 8d 54 24 08       	lea    0x8(%rsp),%r10
   5:	48 83 e4 e0          	and    $0xffffffffffffffe0,%rsp
   9:	41 ff 72 f8          	pushq  -0x8(%r10)
   d:	55                   	push   %rbp
   e:	48 89 e5             	mov    %rsp,%rbp
  11:	41 55                	push   %r13
  13:	41 54                	push   %r12
  15:	41 52                	push   %r10
  17:	53                   	push   %rbx
  18:	48 83 ec 70          	sub    $0x70,%rsp
  1c:	c7 45 b0 00 00 00 00 	movl   $0x0,-0x50(%rbp)
  23:	c5 fa 10 05 00 00 00 	vmovss 0x0(%rip),%xmm0        # 2b <main+0x2b>
  2a:	00 
  2b:	c5 fa 11 45 b4       	vmovss %xmm0,-0x4c(%rbp)
  30:	c5 fa 10 0d 00 00 00 	vmovss 0x0(%rip),%xmm1        # 38 <main+0x38>
  37:	00 
  38:	c5 fa 11 4d b8       	vmovss %xmm1,-0x48(%rbp)
  3d:	c5 fa 10 15 00 00 00 	vmovss 0x0(%rip),%xmm2        # 45 <main+0x45>
  44:	00 
  45:	c5 fa 11 55 bc       	vmovss %xmm2,-0x44(%rbp)
  4a:	c5 fa 10 1d 00 00 00 	vmovss 0x0(%rip),%xmm3        # 52 <main+0x52>
  51:	00 
  52:	c5 fa 11 5d c0       	vmovss %xmm3,-0x40(%rbp)
  57:	c5 fa 10 25 00 00 00 	vmovss 0x0(%rip),%xmm4        # 5f <main+0x5f>
  5e:	00 
  5f:	c5 fa 11 65 c4       	vmovss %xmm4,-0x3c(%rbp)
  64:	c5 fa 10 2d 00 00 00 	vmovss 0x0(%rip),%xmm5        # 6c <main+0x6c>
  6b:	00 
  6c:	c5 fa 11 6d c8       	vmovss %xmm5,-0x38(%rbp)
  71:	c5 fa 10 35 00 00 00 	vmovss 0x0(%rip),%xmm6        # 79 <main+0x79>
  78:	00 
  79:	c5 fa 11 75 cc       	vmovss %xmm6,-0x34(%rbp)
  7e:	c5 fa 11 75 90       	vmovss %xmm6,-0x70(%rbp)
  83:	c5 fa 11 6d 94       	vmovss %xmm5,-0x6c(%rbp)
  88:	c5 fa 11 65 98       	vmovss %xmm4,-0x68(%rbp)
  8d:	c5 fa 11 5d 9c       	vmovss %xmm3,-0x64(%rbp)
  92:	c5 fa 11 55 a0       	vmovss %xmm2,-0x60(%rbp)
  97:	c5 fa 11 4d a4       	vmovss %xmm1,-0x5c(%rbp)
  9c:	c5 fa 11 45 a8       	vmovss %xmm0,-0x58(%rbp)
  a1:	c7 45 ac 00 00 00 00 	movl   $0x0,-0x54(%rbp)
  a8:	c5 fc 28 7d b0       	vmovaps -0x50(%rbp),%ymm7
  ad:	c5 fc 28 4d 90       	vmovaps -0x70(%rbp),%ymm1
  b2:	c5 f4 58 c7          	vaddps %ymm7,%ymm1,%ymm0
  b6:	c5 fc 29 85 70 ff ff 	vmovaps %ymm0,-0x90(%rbp)
  bd:	ff 
  be:	48 8d 9d 70 ff ff ff 	lea    -0x90(%rbp),%rbx
  c5:	4c 8d 6b 20          	lea    0x20(%rbx),%r13
  c9:	4c 8d 25 00 00 00 00 	lea    0x0(%rip),%r12        # d0 <main+0xd0>
  d0:	c5 f9 57 c0          	vxorpd %xmm0,%xmm0,%xmm0
  d4:	c5 fa 5a 03          	vcvtss2sd (%rbx),%xmm0,%xmm0
  d8:	4c 89 e7             	mov    %r12,%rdi
  db:	b8 01 00 00 00       	mov    $0x1,%eax
  e0:	e8 00 00 00 00       	callq  e5 <main+0xe5>
  e5:	48 83 c3 04          	add    $0x4,%rbx
  e9:	4c 39 eb             	cmp    %r13,%rbx
  ec:	75 e2                	jne    d0 <main+0xd0>
  ee:	bf 0a 00 00 00       	mov    $0xa,%edi
  f3:	e8 00 00 00 00       	callq  f8 <main+0xf8>
  f8:	b8 00 00 00 00       	mov    $0x0,%eax
  fd:	48 83 c4 70          	add    $0x70,%rsp
 101:	5b                   	pop    %rbx
 102:	41 5a                	pop    %r10
 104:	41 5c                	pop    %r12
 106:	41 5d                	pop    %r13
 108:	5d                   	pop    %rbp
 109:	49 8d 62 f8          	lea    -0x8(%r10),%rsp
 10d:	c3                   	retq

%%writefile tmp/intrinsics.c

#include <x86intrin.h>
#include <stdio.h>

int main()
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float b[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float result[8];

    __m256 avec = _mm256_loadu_ps(a);
    __m256 bvec = _mm256_loadu_ps(b);
    __m256 resultvec = _mm256_add_ps(avec, bvec);    
    _mm256_storeu_ps(result, resultvec);
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}

Writing tmp/intrinsics.c

!cd tmp; gcc -march=sandybridge -O intrinsics.c -ointrinsics
!./tmp/intrinsics

7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000

%%writefile tmp/vector-types.c

#include <stdio.h>

int main()
{
    typedef float v8f __attribute__ ((vector_size (256/8)));
    v8f a = {0, 1, 2, 3, 4, 5, 6, 7};
    v8f b = {7, 6, 5, 4, 3, 2, 1, 0};

    v8f result = a+b+5;
    
    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");
}

Overwriting tmp/vector-types.c

!cd tmp; gcc -march=sandybridge -O vector-types.c -ovector-types
!./tmp/vector-types

12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000

%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));

v8f add_vecs(v8f a, v8f b)
{
    return a+b+5;
}

Overwriting tmp/vector-types.c

!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o

tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_vecs>:
   0:	c5 fc 58 c1          	vaddps %ymm1,%ymm0,%ymm0
   4:	c5 fc 58 05 00 00 00 	vaddps 0x0(%rip),%ymm0,%ymm0        # c <add_vecs+0xc>
   b:	00 
   c:	c3                   	retq

%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

v8f reverse_vec(v8f a)
{
    //const v8i idx = {7, 6, 5, 4, 3, 2, 1, 0};
    const v8i idx = {3, 2, 1, 0, 7, 6, 5, 4};
    return __builtin_shuffle(a, idx);
}

Overwriting tmp/vector-types.c

!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o

tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <reverse_vec>:
   0:	c4 e3 7d 04 c0 1b    	vpermilps $0x1b,%ymm0,%ymm0
   6:	c3                   	ret

%%writefile tmp/vector-types.c

#include <stdio.h>

typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

v8f interleave(v8f a, v8f b)
{
    // one-argument
    //const v8i idx = {0,1,2,3,4,5,6,7};
    //const v8i idx = {1,2,3,4,5,6,7,0};
    
    // two-argument
    const v8i idx = {2,2+8,3,3+8,6,6+8,7,7+8};
    return __builtin_shuffle(a, b, idx);
}

Overwriting tmp/vector-types.c

!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o

tmp/vector-types.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <interleave>:
   0:	c5 fc 15 c1          	vunpckhps %ymm1,%ymm0,%ymm0
   4:	c3                   	ret

%%writefile tmp/highway.cpp

#include <hwy/highway.h>

namespace hn = hwy::HWY_NAMESPACE;

using T = float;

void MulAddLoop(const T* HWY_RESTRICT mul_array,
                const T* HWY_RESTRICT add_array,
                const size_t size, T* HWY_RESTRICT x_array) {
  const hn::ScalableTag<T> d;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    const auto mul = hn::Load(d, mul_array + i);
    const auto add = hn::Load(d, add_array + i);
    auto x = hn::Load(d, x_array + i);
    x = hn::MulAdd(mul, x, add);
    hn::Store(x, d, x_array + i);
  }
}

Overwriting tmp/highway.cpp

!cd tmp; g++ -march=native -O -c highway.cpp
!objdump --disassemble tmp/highway.o

tmp/highway.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <_Z10MulAddLoopPKfS0_mPf>:
   0:	48 85 d2             	test   %rdx,%rdx
   3:	74 39                	je     3e <_Z10MulAddLoopPKfS0_mPf+0x3e>
   5:	b8 00 00 00 00       	mov    $0x0,%eax
   a:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
  11:	00 00 00 00 
  15:	66 66 2e 0f 1f 84 00 	data16 cs nopw 0x0(%rax,%rax,1)
  1c:	00 00 00 00 
  20:	c5 fc 28 04 87       	vmovaps (%rdi,%rax,4),%ymm0
  25:	c5 fc 28 0c 86       	vmovaps (%rsi,%rax,4),%ymm1
  2a:	c4 e2 75 98 04 81    	vfmadd132ps (%rcx,%rax,4),%ymm1,%ymm0
  30:	c5 fc 29 04 81       	vmovaps %ymm0,(%rcx,%rax,4)
  35:	48 83 c0 08          	add    $0x8,%rax
  39:	48 39 d0             	cmp    %rdx,%rax
  3c:	72 e2                	jb     20 <_Z10MulAddLoopPKfS0_mPf+0x20>
  3e:	c3                   	ret

%%writefile tmp/pragma-simd.c

#include <stdio.h>

typedef float vec_t[8];

void add_them(vec_t a, vec_t b, vec_t result)
{
    #pragma GCC ivdep
    // #pragma omp simd
    // #pragma omp simd aligned(a, b, result:32)
    for (int i = 0; i<8; ++i)
        result[i] = a[i] + b[i];
}

Writing tmp/pragma-simd.c

!cd tmp; gcc -march=sandybridge -fopenmp -O -c pragma-simd.c
#!cd tmp; gcc -march=sandybridge -fopenmp -O3 -c pragma-simd.c

!objdump --disassemble tmp/pragma-simd.o

tmp/pragma-simd.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_them>:
   0:	b8 00 00 00 00       	mov    $0x0,%eax
   5:	c5 fa 10 04 07       	vmovss (%rdi,%rax,1),%xmm0
   a:	c5 fa 58 04 06       	vaddss (%rsi,%rax,1),%xmm0,%xmm0
   f:	c5 fa 11 04 02       	vmovss %xmm0,(%rdx,%rax,1)
  14:	48 83 c0 04          	add    $0x4,%rax
  18:	48 83 f8 20          	cmp    $0x20,%rax
  1c:	75 e7                	jne    5 <add_them+0x5>
  1e:	c3                   	retq

%%writefile tmp/scalar-program-instances.ispc

float add_them(float a, float b)
{
    return a + b;
}

Overwriting tmp/scalar-program-instances.ispc

!cd tmp; ~/pack/ispc-v1.9.0-linux/ispc  \
    --target=avx2-i32x8 \
    --opt=force-aligned-memory \
    scalar-program-instances.ispc \
    -o scalar-program-instances.o
! objdump --disassemble tmp/scalar-program-instances.o

tmp/scalar-program-instances.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <add_them___vyfvyf>:
   0:	c5 fc 58 c1          	vaddps %ymm1,%ymm0,%ymm0
   4:	c3                   	retq

Ways to SIMD¶

Inline Assembly¶

Intrinsics¶

Vector types¶

One-argument shuffle¶

Two-argument shuffle¶

Highway¶

#pragma simd¶

Scalar Program Instances¶