Pipeline Performance Mysteries
Consider setting power profiles; see cpupower-gui.
In [11]:
!LANG= cpupower frequency-info
analyzing CPU 7: driver: intel_pstate CPUs which run at the same hardware frequency: 7 CPUs which need to have their frequency coordinated by software: 7 maximum transition latency: Cannot determine or is not supported. hardware limits: 400 MHz - 3.90 GHz available cpufreq governors: performance powersave current policy: frequency should be within 400 MHz and 3.90 GHz. The governor "performance" may decide which speed to use within this range. current CPU frequency: Unable to call hardware current CPU frequency: 3.64 GHz (asserted by call to kernel) boost state support: Supported: yes Active: yes
In [12]:
!rm -Rf tmp
!mkdir -p tmp
In [13]:
%%writefile tmp/pipeline-perf.c
#include <stdio.h>
#include "timing.h"
/*
 * Baseline timing experiment: runs a loop nest whose body performs two
 * additions into the same accumulator and prints the elapsed wall time.
 *
 * The accumulators and the loop index are unsigned.  With a 32-bit int the
 * running sum exceeds INT_MAX many times over, and signed overflow is
 * undefined behavior; unsigned arithmetic wraps around instead.
 *
 * Returns the low 8 bits of the accumulated sums.  Feeding the sums into
 * the return value keeps them "used" (presumably so that the loops are not
 * discarded as dead code if optimization is enabled).
 */
int main(void)
{
    unsigned result = 0;

    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 400u*1000u; ++i)
            {
                /* Two back-to-back updates of the same variable. */
                a += i;
                a += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("a, a: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /*
     * Mask before converting: turning an out-of-range unsigned value into
     * int is implementation-defined.  The low byte is all a POSIX parent
     * process gets to see of the exit status anyway.
     */
    return (int)(result & 0xffu);
}
Writing tmp/pipeline-perf.c
In [14]:
!cd tmp; gcc -std=gnu99 -lrt -I.. -opipeline-perf pipeline-perf.c
!tmp/pipeline-perf
a, a: elapsed time 1.14691 s
Check that the compiler didn't do anything unexpected:
In [15]:
# !objdump --disassemble tmp/pipeline-perf
Come up with variants of this that exhibit various behaviors of the execution pipeline:
In [ ]:
In [16]:
!cd tmp; gcc -std=gnu99 -lrt -I.. -opipeline-perf pipeline-perf.c
!tmp/pipeline-perf
a, a: elapsed time 1.13467 s
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
- Scroll down for solution
In [17]:
%%writefile tmp/pipeline-perf.c
#include <stdio.h>
#include "timing.h"
/*
 * Times five variants of the same amount of integer-add work and prints the
 * elapsed wall time of each.  Every variant executes 1000 trips of
 * 2,000,000 additions (1000*1000 iterations x 2 adds, or 250*1000
 * iterations x 8 adds); they differ only in which accumulator each
 * addition targets and in how far the loop body is unrolled.
 *
 * The accumulators and loop indices are unsigned.  With a 32-bit int the
 * running sums exceed INT_MAX many times over, and signed overflow is
 * undefined behavior; unsigned arithmetic wraps around instead.
 *
 * Returns the low 8 bits of the accumulated sums.  Feeding the sums into
 * the return value keeps them "used" (presumably so that the loops are not
 * discarded as dead code if optimization is enabled).
 */
int main(void)
{
    unsigned result = 0;

    /* Variant "a, a": both additions update the same accumulator. */
    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 1000u*1000u; ++i)
            {
                a += i;
                a += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("a, a: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /* Variant "a, b": the two additions update different accumulators. */
    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 1000u*1000u; ++i)
            {
                a += i;
                b += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("a, b: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /*
     * Variant "a, a unrolled": eight additions per iteration, all into the
     * same accumulator; a quarter of the iterations to keep the total add
     * count equal to the variants above.
     */
    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 250u*1000u; ++i)
            {
                a += i;
                a += i;
                a += i;
                a += i;
                a += i;
                a += i;
                a += i;
                a += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("a, a unrolled: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /*
     * Variant "aa, bb unrolled": eight additions per iteration, grouped as
     * four into a followed by four into b.
     */
    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 250u*1000u; ++i)
            {
                a += i;
                a += i;
                a += i;
                a += i;
                b += i;
                b += i;
                b += i;
                b += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("aa, bb unrolled: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /*
     * Variant "a, b unrolled": eight additions per iteration, alternating
     * between a and b.
     */
    {
        unsigned a = 0, b = 0;

        timestamp_type t1;
        get_timestamp(&t1);

        for (int ntrips = 0; ntrips < 1000; ++ntrips)
            for (unsigned i = 0; i < 250u*1000u; ++i)
            {
                a += i;
                b += i;
                a += i;
                b += i;
                a += i;
                b += i;
                a += i;
                b += i;
            }

        timestamp_type t2;
        get_timestamp(&t2);

        printf("a, b unrolled: elapsed time %g s\n",
                timestamp_diff_in_seconds(t1, t2));

        result += a+b;
    }

    /*
     * Mask before converting: turning an out-of-range unsigned value into
     * int is implementation-defined.  The low byte is all a POSIX parent
     * process gets to see of the exit status anyway.
     */
    return (int)(result & 0xffu);
}
Overwriting tmp/pipeline-perf.c
In [18]:
!cd tmp; gcc -std=gnu99 -lrt -I.. -opipeline-perf pipeline-perf.c
!tmp/pipeline-perf
a, a: elapsed time 2.85316 s a, b: elapsed time 1.53108 s a, a unrolled: elapsed time 2.862 s aa, bb unrolled: elapsed time 1.46449 s a, b unrolled: elapsed time 1.49071 s
In [ ]: