NUMA and Bandwidths¶
In [1]:
!rm -Rf tmp
!mkdir -p tmp
Gathering Information¶
In [7]:
%%writefile tmp/numa-info.c
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"
void print_bitmask(const struct bitmask *bm)
{
for(size_t i=0; i<bm->size; ++i)
printf("%d", numa_bitmask_isbitset(bm, i));
}
int main(int argc, const char **argv)
{
int num_cpus = numa_num_task_cpus();
printf("num cpus: %d\n", num_cpus);
printf("numa available: %d\n", numa_available());
numa_set_localalloc();
struct bitmask *bm = numa_bitmask_alloc(num_cpus);
for (int i=0; i<=numa_max_node(); ++i)
{
numa_node_to_cpus(i, bm);
printf("numa node %d ", i);
print_bitmask(bm);
printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
}
numa_bitmask_free(bm);
return 0;
}
Overwriting tmp/numa-info.c
In [8]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-info.c -onuma-info
! ./tmp/numa-info
num cpus: 16 numa available: 0 numa node 0 1111111100000000 - 31.3159 GiB numa node 1 0000000011111111 - 31.4982 GiB
A Shared Header¶
In [3]:
%%writefile tmp/numatest.h
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>
#include <pthread.h>
#include <omp.h>
#include <assert.h>
#include "timing.h"
void pin_to_core(size_t core)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core, &cpuset);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
}
double measure_access(void *x, size_t array_size, size_t ntrips)
{
timestamp_type t1;
get_timestamp(&t1);
for (size_t i = 0; i<ntrips; ++i)
for(size_t j = 0; j<array_size; ++j)
{
*(((char*)x) + ((j * 1009) % array_size)) += 1;
}
timestamp_type t2;
get_timestamp(&t2);
return timestamp_diff_in_seconds(t1, t2);
}
Writing tmp/numatest.h
On- and Off-Node Bandwidths¶
In [10]:
%%writefile tmp/numa-bw-seq.c
#include "numatest.h"
int main(int argc, const char **argv)
{
int num_cpus = numa_num_task_cpus();
numa_set_localalloc();
char *x;
const size_t cache_line_size = 64;
const size_t array_size = 100*1000*1000;
size_t ntrips = 2;
#pragma omp parallel
{
assert(omp_get_num_threads() == num_cpus);
int tid = omp_get_thread_num();
pin_to_core(tid);
if(tid == 0)
x = (char *) numa_alloc_local(array_size);
// {{{ single access
#pragma omp barrier
for (size_t i = 0; i<num_cpus; ++i)
{
if (tid == i)
{
double t = measure_access(x, array_size, ntrips);
printf("sequential core %d -> core 0 : BW %g MB/s\n",
i, array_size*ntrips*cache_line_size / t / 1e6);
}
#pragma omp barrier
}
// }}}
}
numa_free(x, array_size);
return 0;
}
Overwriting tmp/numa-bw-seq.c
In [11]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-seq.c -onuma-bw-seq
! ./tmp/numa-bw-seq
sequential core 0 -> core 0 : BW 8322.05 MB/s sequential core 1 -> core 0 : BW 8636.35 MB/s sequential core 2 -> core 0 : BW 8713.88 MB/s sequential core 3 -> core 0 : BW 8695.55 MB/s sequential core 4 -> core 0 : BW 8649.28 MB/s sequential core 5 -> core 0 : BW 8738.7 MB/s sequential core 6 -> core 0 : BW 8665.42 MB/s sequential core 7 -> core 0 : BW 8702.21 MB/s sequential core 8 -> core 0 : BW 4821.3 MB/s sequential core 9 -> core 0 : BW 4806.34 MB/s sequential core 10 -> core 0 : BW 4817.99 MB/s sequential core 11 -> core 0 : BW 4788.99 MB/s sequential core 12 -> core 0 : BW 4824.2 MB/s sequential core 13 -> core 0 : BW 4819.79 MB/s sequential core 14 -> core 0 : BW 4828.46 MB/s sequential core 15 -> core 0 : BW 4809.35 MB/s
Contention: Everybody¶
In [1]:
%%writefile tmp/numa-bw-all.c
#include "numatest.h"
int main(int argc, const char **argv)
{
int num_cpus = numa_num_task_cpus();
numa_set_localalloc();
char *x;
const size_t cache_line_size = 64;
const size_t array_size = 100*1000*1000;
size_t ntrips = 2;
#pragma omp parallel
{
assert(omp_get_num_threads() == num_cpus);
int tid = omp_get_thread_num();
pin_to_core(tid);
if(tid == 0)
x = (char *) numa_alloc_local(array_size);
// {{{ everybody contends for one
{
if (tid == 0) puts("");
#pragma omp barrier
double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
for (size_t i = 0; i<num_cpus; ++i)
{
if (tid == i)
printf("all-contention core %d -> core 0 : BW %g MB/s\n",
tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
}
}
// }}}
}
numa_free(x, array_size);
return 0;
}
Overwriting tmp/numa-bw-all.c
In [3]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-all.c -onuma-bw-all
! ./tmp/numa-bw-all
all-contention core 0 -> core 0 : BW 1698.6 MB/s all-contention core 1 -> core 0 : BW 1699.33 MB/s all-contention core 2 -> core 0 : BW 1699.32 MB/s all-contention core 3 -> core 0 : BW 1699.33 MB/s all-contention core 4 -> core 0 : BW 1699.33 MB/s all-contention core 5 -> core 0 : BW 1698.6 MB/s all-contention core 6 -> core 0 : BW 1698.6 MB/s all-contention core 7 -> core 0 : BW 1699.32 MB/s all-contention core 8 -> core 0 : BW 1668.48 MB/s all-contention core 9 -> core 0 : BW 1677.92 MB/s all-contention core 10 -> core 0 : BW 1678.6 MB/s all-contention core 11 -> core 0 : BW 1668.48 MB/s all-contention core 12 -> core 0 : BW 1668.48 MB/s all-contention core 13 -> core 0 : BW 1677.31 MB/s all-contention core 14 -> core 0 : BW 1678.6 MB/s all-contention core 15 -> core 0 : BW 1668.48 MB/s
Contention: Pairs¶
In [13]:
%%writefile tmp/numa-bw-two.c
#include "numatest.h"
int main(int argc, const char **argv)
{
int num_cpus = numa_num_task_cpus();
numa_set_localalloc();
char *x;
const size_t cache_line_size = 64;
const size_t array_size = 100*1000*1000;
size_t ntrips = 2;
#pragma omp parallel
{
assert(omp_get_num_threads() == num_cpus);
int tid = omp_get_thread_num();
pin_to_core(tid);
if(tid == 0)
x = (char *) numa_alloc_local(array_size);
// {{{ zero and someone else contending
if (tid == 0) puts("");
#pragma omp barrier
for (size_t i = 1; i<num_cpus; ++i)
{
double t;
if (tid == i || tid == 0)
t = measure_access(x, array_size, ntrips);
#pragma omp barrier
if (tid == 0)
{
printf("two-contention core %d -> core 0 : BW %g MB/s\n",
tid, array_size*ntrips*cache_line_size / t / 1e6);
}
#pragma omp barrier
if (tid == i)
{
printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
tid, array_size*ntrips*cache_line_size / t / 1e6);
}
#pragma omp barrier
}
}
numa_free(x, array_size);
return 0;
}
Overwriting tmp/numa-bw-two.c
In [14]:
! cd tmp; gcc -O3 -std=gnu99 -fopenmp -lrt -lnuma -I.. numa-bw-two.c -onuma-bw-two
! ./tmp/numa-bw-two
two-contention core 0 -> core 0 : BW 7348.34 MB/s two-contention core 1 -> core 0 : BW 7348.35 MB/s two-contention core 0 -> core 0 : BW 7653.44 MB/s two-contention core 2 -> core 0 : BW 7653.55 MB/s two-contention core 0 -> core 0 : BW 7661.08 MB/s two-contention core 3 -> core 0 : BW 7661.09 MB/s two-contention core 0 -> core 0 : BW 7643.78 MB/s two-contention core 4 -> core 0 : BW 7643.98 MB/s two-contention core 0 -> core 0 : BW 7698.85 MB/s two-contention core 5 -> core 0 : BW 7698.86 MB/s two-contention core 0 -> core 0 : BW 7592.64 MB/s two-contention core 6 -> core 0 : BW 7592.65 MB/s two-contention core 0 -> core 0 : BW 7694.93 MB/s two-contention core 7 -> core 0 : BW 7694.93 MB/s two-contention core 0 -> core 0 : BW 7541.66 MB/s two-contention core 8 -> core 0 : BW 4713.53 MB/s two-contention core 0 -> core 0 : BW 7516.31 MB/s two-contention core 9 -> core 0 : BW 4697.66 MB/s two-contention core 0 -> core 0 : BW 7513.48 MB/s two-contention core 10 -> core 0 : BW 4705.6 MB/s two-contention core 0 -> core 0 : BW 7519.91 MB/s two-contention core 11 -> core 0 : BW 4702.41 MB/s two-contention core 0 -> core 0 : BW 7510.43 MB/s two-contention core 12 -> core 0 : BW 4710.96 MB/s two-contention core 0 -> core 0 : BW 7524.66 MB/s two-contention core 13 -> core 0 : BW 4691.61 MB/s two-contention core 0 -> core 0 : BW 7524.08 MB/s two-contention core 14 -> core 0 : BW 4711.21 MB/s two-contention core 0 -> core 0 : BW 7520.46 MB/s two-contention core 15 -> core 0 : BW 4691.32 MB/s
Tests based on numatest.cpp
by James Brock
http://stackoverflow.com/questions/7259363/measuring-numa-non-uniform-memory-access-no-observable-asymmetry-why
Changes by Andreas Kloeckner, 10/2012:
- Rewritten in C + OpenMP
- Added contention tests