Ways to SIMD¶
!rm -Rf tmp
!mkdir -p tmp
Inline Assembly¶
%%writefile tmp/inline-assembly.c
#include <x86intrin.h>
#include <stdio.h>
/*
 * Add two vectors of eight floats with a hand-written AVX `vaddps`
 * instruction issued through GCC extended inline assembly, then print
 * the eight sums on a single line.
 *
 * Returns 0.
 */
int main(void)
{
    float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float b[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float result[8];

    /* Unaligned loads: plain float arrays carry no 32-byte alignment guarantee. */
    __m256 avec = _mm256_loadu_ps(a);
    __m256 bvec = _mm256_loadu_ps(b);
    __m256 resultvec;

    /*
     * Operand sections are separated by ':' in the order
     * outputs : inputs : clobbers.  With GCC's default AT&T syntax the
     * destination operand (%0) is written last.
     * __asm__ is used instead of asm so the file also builds in strict
     * ISO modes (-std=c99 / -std=c11).
     */
    __asm__ ("vaddps %1, %2, %0"
             : "=x" (resultvec)
             : "x" (avec), "x" (bvec) /* "r" for normal registers, "x" for vector registers */
             : /* clobbers */
             );

    _mm256_storeu_ps(result, resultvec);

    for (int i = 0; i < 8; ++i)
        printf("%f ", result[i]);
    printf("\n");

    return 0;
}
Overwriting tmp/inline-assembly.c
!cd tmp; gcc -march=sandybridge -c -O inline-assembly.c
!objdump --disassemble tmp/inline-assembly.o
tmp/inline-assembly.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <main>: 0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10 5: 48 83 e4 e0 and $0xffffffffffffffe0,%rsp 9: 41 ff 72 f8 pushq -0x8(%r10) d: 55 push %rbp e: 48 89 e5 mov %rsp,%rbp 11: 41 55 push %r13 13: 41 54 push %r12 15: 41 52 push %r10 17: 53 push %rbx 18: 48 83 ec 70 sub $0x70,%rsp 1c: c7 45 b0 00 00 00 00 movl $0x0,-0x50(%rbp) 23: c5 fa 10 05 00 00 00 vmovss 0x0(%rip),%xmm0 # 2b <main+0x2b> 2a: 00 2b: c5 fa 11 45 b4 vmovss %xmm0,-0x4c(%rbp) 30: c5 fa 10 0d 00 00 00 vmovss 0x0(%rip),%xmm1 # 38 <main+0x38> 37: 00 38: c5 fa 11 4d b8 vmovss %xmm1,-0x48(%rbp) 3d: c5 fa 10 15 00 00 00 vmovss 0x0(%rip),%xmm2 # 45 <main+0x45> 44: 00 45: c5 fa 11 55 bc vmovss %xmm2,-0x44(%rbp) 4a: c5 fa 10 1d 00 00 00 vmovss 0x0(%rip),%xmm3 # 52 <main+0x52> 51: 00 52: c5 fa 11 5d c0 vmovss %xmm3,-0x40(%rbp) 57: c5 fa 10 25 00 00 00 vmovss 0x0(%rip),%xmm4 # 5f <main+0x5f> 5e: 00 5f: c5 fa 11 65 c4 vmovss %xmm4,-0x3c(%rbp) 64: c5 fa 10 2d 00 00 00 vmovss 0x0(%rip),%xmm5 # 6c <main+0x6c> 6b: 00 6c: c5 fa 11 6d c8 vmovss %xmm5,-0x38(%rbp) 71: c5 fa 10 35 00 00 00 vmovss 0x0(%rip),%xmm6 # 79 <main+0x79> 78: 00 79: c5 fa 11 75 cc vmovss %xmm6,-0x34(%rbp) 7e: c5 fa 11 75 90 vmovss %xmm6,-0x70(%rbp) 83: c5 fa 11 6d 94 vmovss %xmm5,-0x6c(%rbp) 88: c5 fa 11 65 98 vmovss %xmm4,-0x68(%rbp) 8d: c5 fa 11 5d 9c vmovss %xmm3,-0x64(%rbp) 92: c5 fa 11 55 a0 vmovss %xmm2,-0x60(%rbp) 97: c5 fa 11 4d a4 vmovss %xmm1,-0x5c(%rbp) 9c: c5 fa 11 45 a8 vmovss %xmm0,-0x58(%rbp) a1: c7 45 ac 00 00 00 00 movl $0x0,-0x54(%rbp) a8: c5 fc 28 7d b0 vmovaps -0x50(%rbp),%ymm7 ad: c5 fc 28 4d 90 vmovaps -0x70(%rbp),%ymm1 b2: c5 f4 58 c7 vaddps %ymm7,%ymm1,%ymm0 b6: c5 fc 29 85 70 ff ff vmovaps %ymm0,-0x90(%rbp) bd: ff be: 48 8d 9d 70 ff ff ff lea -0x90(%rbp),%rbx c5: 4c 8d 6b 20 lea 0x20(%rbx),%r13 c9: 4c 8d 25 00 00 00 00 lea 0x0(%rip),%r12 # d0 <main+0xd0> d0: c5 f9 57 c0 vxorpd %xmm0,%xmm0,%xmm0 d4: c5 fa 5a 03 vcvtss2sd (%rbx),%xmm0,%xmm0 
d8: 4c 89 e7 mov %r12,%rdi db: b8 01 00 00 00 mov $0x1,%eax e0: e8 00 00 00 00 callq e5 <main+0xe5> e5: 48 83 c3 04 add $0x4,%rbx e9: 4c 39 eb cmp %r13,%rbx ec: 75 e2 jne d0 <main+0xd0> ee: bf 0a 00 00 00 mov $0xa,%edi f3: e8 00 00 00 00 callq f8 <main+0xf8> f8: b8 00 00 00 00 mov $0x0,%eax fd: 48 83 c4 70 add $0x70,%rsp 101: 5b pop %rbx 102: 41 5a pop %r10 104: 41 5c pop %r12 106: 41 5d pop %r13 108: 5d pop %rbp 109: 49 8d 62 f8 lea -0x8(%r10),%rsp 10d: c3 retq
Intrinsics¶
%%writefile tmp/intrinsics.c
#include <x86intrin.h>
#include <stdio.h>
/*
 * Add two vectors of eight floats using the _mm256_add_ps AVX intrinsic
 * and print the eight sums on a single line.
 */
int main()
{
    float lhs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float rhs[8] = {7, 6, 5, 4, 3, 2, 1, 0};
    float sums[8];

    /* Unaligned load of both inputs, one vector add, unaligned store. */
    _mm256_storeu_ps(sums,
                     _mm256_add_ps(_mm256_loadu_ps(lhs),
                                   _mm256_loadu_ps(rhs)));

    for (const float *p = sums; p != sums + 8; ++p) {
        printf("%f ", *p);
    }
    printf("\n");
}
Writing tmp/intrinsics.c
!cd tmp; gcc -march=sandybridge -O intrinsics.c -ointrinsics
!./tmp/intrinsics
7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000 7.000000
Vector types¶
%%writefile tmp/vector-types.c
#include <stdio.h>
/*
 * Add two eight-float vectors plus the scalar 5 using GCC's vector
 * extension (ordinary `+` on vector-typed values) and print the eight
 * results on a single line.
 */
int main()
{
    /* 256 bits / 8 = 32 bytes, i.e. eight floats per vector. */
    typedef float v8f __attribute__ ((vector_size (256/8)));

    const v8f ascending  = {0, 1, 2, 3, 4, 5, 6, 7};
    const v8f descending = {7, 6, 5, 4, 3, 2, 1, 0};

    /* The scalar 5 is applied to every element of the vector sum. */
    const v8f total = ascending + descending + 5;

    int lane = 0;
    while (lane < 8) {
        printf("%f ", total[lane]);
        ++lane;
    }
    printf("\n");
}
Overwriting tmp/vector-types.c
!cd tmp; gcc -march=sandybridge -O vector-types.c -ovector-types
!./tmp/vector-types
12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000 12.000000
%%writefile tmp/vector-types.c
#include <stdio.h>
/* Vector of eight floats (256 bits = 32 bytes), GCC vector extension. */
typedef float v8f __attribute__ ((vector_size (256/8)));

/*
 * Element-wise a + b + 5.
 *
 * a, b: eight-float vectors, passed by value.
 * Returns a vector whose element i is a[i] + b[i] + 5.
 */
v8f add_vecs(v8f a, v8f b)
{
    /* Same evaluation order as a+b+5: add the vectors first, then the scalar. */
    const v8f pair_sum = a + b;
    return pair_sum + 5.0f;
}
Overwriting tmp/vector-types.c
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <add_vecs>: 0: c5 fc 58 c1 vaddps %ymm1,%ymm0,%ymm0 4: c5 fc 58 05 00 00 00 vaddps 0x0(%rip),%ymm0,%ymm0 # c <add_vecs+0xc> b: 00 c: c3 retq
%%writefile tmp/vector-types.c
#include <stdio.h>
/* Eight-element, 32-byte vectors: floats for data, ints for shuffle masks. */
typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

/*
 * Permute the elements of `a` with a constant mask via __builtin_shuffle.
 *
 * The active mask reverses the elements inside each group of four
 * (elements 0-3 and elements 4-7 separately); it does not swap the two
 * groups.  The mask for a full eight-element reversal is kept below as a
 * commented-out alternative to experiment with.
 *
 * Returns {a[3], a[2], a[1], a[0], a[7], a[6], a[5], a[4]}.
 */
v8f reverse_vec(v8f a)
{
    /* Full reversal alternative: {7, 6, 5, 4, 3, 2, 1, 0} */
    const v8i lane_order = {3, 2, 1, 0, 7, 6, 5, 4};

    v8f permuted = __builtin_shuffle(a, lane_order);
    return permuted;
}
Overwriting tmp/vector-types.c
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <reverse_vec>: 0: c4 e3 7d 04 c0 1b vpermilps $0x1b,%ymm0,%ymm0 6: c3 retq
%%writefile tmp/vector-types.c
#include <stdio.h>
/* Eight-element, 32-byte vectors: floats for data, ints for shuffle masks. */
typedef float v8f __attribute__ ((vector_size (256/8)));
typedef int v8i __attribute__ ((vector_size (256/8)));

/*
 * Build a vector from elements of both `a` and `b` with the two-operand
 * form of __builtin_shuffle.
 *
 * In the two-operand form, mask values 0-7 pick from `a` and 8-15 pick
 * from `b`; values are taken modulo 16, so the leading 16 wraps around
 * to a[0].
 *
 * Returns {a[0], b[7], b[6], b[5], a[0], a[1], a[2], a[3]}.
 */
v8f interleave(v8f a, v8f b)
{
    const v8i picks = {16, 15, 14, 13, 0, 1, 2, 3};

    v8f mixed = __builtin_shuffle(a, b, picks);
    return mixed;
}
Overwriting tmp/vector-types.c
!cd tmp; gcc -march=sandybridge -O -c vector-types.c
! objdump --disassemble tmp/vector-types.o
tmp/vector-types.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <interleave>: 0: c5 f8 28 d8 vmovaps %xmm0,%xmm3 4: c5 f8 28 d0 vmovaps %xmm0,%xmm2 8: c4 e3 7d 19 c9 01 vextractf128 $0x1,%ymm1,%xmm1 e: c5 f0 c6 f1 ff vshufps $0xff,%xmm1,%xmm1,%xmm6 13: c5 f0 15 c1 vunpckhps %xmm1,%xmm1,%xmm0 17: c5 f0 c6 c9 55 vshufps $0x55,%xmm1,%xmm1,%xmm1 1c: c5 e0 c6 e3 55 vshufps $0x55,%xmm3,%xmm3,%xmm4 21: c5 e0 15 eb vunpckhps %xmm3,%xmm3,%xmm5 25: c5 e0 c6 db ff vshufps $0xff,%xmm3,%xmm3,%xmm3 2a: c5 d0 14 eb vunpcklps %xmm3,%xmm5,%xmm5 2e: c5 e8 14 dc vunpcklps %xmm4,%xmm2,%xmm3 32: c5 f8 14 c9 vunpcklps %xmm1,%xmm0,%xmm1 36: c5 e8 14 c6 vunpcklps %xmm6,%xmm2,%xmm0 3a: c5 f8 16 c1 vmovlhps %xmm1,%xmm0,%xmm0 3e: c5 e0 16 dd vmovlhps %xmm5,%xmm3,%xmm3 42: c4 e3 7d 18 c3 01 vinsertf128 $0x1,%xmm3,%ymm0,%ymm0 48: c3 retq
#pragma simd¶
%%writefile tmp/pragma-simd.c
#include <stdio.h>
/* Array of eight floats; as a function parameter it is passed as float *. */
typedef float vec_t[8];

/*
 * Element-wise sum of two eight-float arrays: result[i] = a[i] + b[i].
 *
 * Only one loop annotation is active at a time.  Alternatives to try in
 * place of the ivdep pragma:
 *   #pragma omp simd
 *   #pragma omp simd aligned(a, b, result:32)
 */
void add_them(vec_t a, vec_t b, vec_t result)
{
    /* The pragma must sit directly in front of the for loop it annotates. */
#pragma GCC ivdep
    for (int k = 0; k < 8; ++k) {
        result[k] = a[k] + b[k];
    }
}
Writing tmp/pragma-simd.c
!cd tmp; gcc -march=sandybridge -fopenmp -O -c pragma-simd.c
#!cd tmp; gcc -march=sandybridge -fopenmp -O3 -c pragma-simd.c
!objdump --disassemble tmp/pragma-simd.o
tmp/pragma-simd.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <add_them>: 0: b8 00 00 00 00 mov $0x0,%eax 5: c5 fa 10 04 07 vmovss (%rdi,%rax,1),%xmm0 a: c5 fa 58 04 06 vaddss (%rsi,%rax,1),%xmm0,%xmm0 f: c5 fa 11 04 02 vmovss %xmm0,(%rdx,%rax,1) 14: 48 83 c0 04 add $0x4,%rax 18: 48 83 f8 20 cmp $0x20,%rax 1c: 75 e7 jne 5 <add_them+0x5> 1e: c3 retq
Semantics (OpenMP 4.5, 2.8.1):
The `simd` construct enables the execution of multiple iterations of the associated loops concurrently by means of SIMD instructions.
A SIMD loop has logical iterations numbered 0,1,...,N-1 where N is the number of loop iterations, and the logical numbering denotes the sequence in which the iterations would be executed if the associated loop(s) were executed with no SIMD instructions. If the `safelen` clause is used then no two iterations executed concurrently with SIMD instructions can have a greater distance in the logical iteration space than its value. The parameter of the `safelen` clause must be a constant positive integer expression. If used, the `simdlen` clause specifies the preferred number of iterations to be executed concurrently.
- What does `safelen(12)` mean?
The parameter of the `simdlen` clause must be a constant positive integer. The number of iterations that are executed concurrently at any given time is implementation defined. Each concurrent iteration will be executed by a different SIMD lane. Each set of concurrent iterations is a SIMD chunk. Lexical forward dependencies in the iterations of the original loop must be preserved within each SIMD chunk.
- What does `simdlen(16)` mean?
- How do `safelen` and `simdlen` relate to each other?
Also:
- `#pragma omp declare simd` on functions
- with `inbranch`, `notinbranch`
Scalar Program Instances¶
%%writefile tmp/scalar-program-instances.ispc
// Adds two floats. No uniform/varying qualifier is given on the
// parameters or the return type.
float add_them(float a, float b)
{
    float sum = a + b;
    return sum;
}
Overwriting tmp/scalar-program-instances.ispc
!cd tmp; ~/pack/ispc-v1.9.0-linux/ispc \
--target=avx2-i32x8 \
--opt=force-aligned-memory \
scalar-program-instances.ispc \
-o scalar-program-instances.o
! objdump --disassemble tmp/scalar-program-instances.o
tmp/scalar-program-instances.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <add_them___vyfvyf>: 0: c5 fc 58 c1 vaddps %ymm1,%ymm0,%ymm0 4: c3 retq
- `uniform` and `varying`
- `programCount` and `programIndex`
- `x16` targets... why?
- How do you think shuffles are done?
- Compare this with `omp declare simd`