The Meltdown patch for the Linux kernel makes use of the relatively new PCID (process-context identifier) CPU feature. I still sometimes use my old laptop, which contains a Core 2 Duo Penryn CPU (T7250) that does not support PCID, so I did a quick UnixBench run to see what kind of difference the absence of PCID would make. At the end of this article, I have a bonus “benchmark” for an alternative way to mitigate Meltdown: disabling the CPU’s caches. All my tests were performed on Debian Wheezy (currently oldstable) using kernel version 3.16.0-5-amd64.
First of all, here are another person’s results for a CPU that supports PCID. And since that’s in Japanese, here’s the important bit:
| Test | Before | After | Change (positive is better) |
|---|---|---|---|
| System Call Overhead | 5391.9 | 4009.7 | -25.63% |
Now, my tests on the Penryn CPU:
| Test | Before | After | Change (positive is better) |
|---|---|---|---|
| Dhrystone 2 using register variables | 3360.4 | 3414.1 | +1.60% |
| Double-Precision Whetstone | 724.1 | 724 | -0.01% |
| Execl Throughput | 1351.7 | 1222.9 | -9.53% |
| File Copy 1024 bufsize 2000 maxblocks | 1582 | 1244 | -21.37% |
| File Copy 256 bufsize 500 maxblocks | 1255.9 | 922.1 | -26.58% |
| File Copy 4096 bufsize 8000 maxblocks | 1982.4 | 1810.6 | -8.67% |
| Pipe Throughput | 1672.8 | 765.4 | -54.24% |
| Pipe-based Context Switching | 1108.3 | 671 | -39.46% |
| Process Creation | 1150 | 1025.3 | -10.84% |
| Shell Scripts (1 concurrent) | 1995.7 | 1909 | -4.34% |
| Shell Scripts (8 concurrent) | 1831.8 | 1743.3 | -4.83% |
| System Call Overhead | 1705.6 | 544.9 | -68.05% |
| System Benchmarks Index Score | 1535.8 | 1160.9 | -24.41% |
And the raw data in case you are interested:
Before updating:
| Test | Score | Unit | Time | Iters. | Baseline | Index |
|---|---|---|---|---|---|---|
| Dhrystone 2 using register variables | 39215974.0 | lps | 10.0 s | 7 | 116700.0 | 3360.4 |
| Double-Precision Whetstone | 3982.6 | MWIPS | 9.9 s | 7 | 55.0 | 724.1 |
| Execl Throughput | 5812.4 | lps | 29.2 s | 2 | 43.0 | 1351.7 |
| File Copy 1024 bufsize 2000 maxblocks | 626453.0 | KBps | 30.0 s | 2 | 3960.0 | 1582.0 |
| File Copy 256 bufsize 500 maxblocks | 207854.8 | KBps | 30.0 s | 2 | 1655.0 | 1255.9 |
| File Copy 4096 bufsize 8000 maxblocks | 1149781.6 | KBps | 30.0 s | 2 | 5800.0 | 1982.4 |
| Pipe Throughput | 2080979.1 | lps | 10.0 s | 7 | 12440.0 | 1672.8 |
| Pipe-based Context Switching | 443337.7 | lps | 10.0 s | 7 | 4000.0 | 1108.3 |
| Process Creation | 14490.3 | lps | 30.0 s | 2 | 126.0 | 1150.0 |
| Shell Scripts (1 concurrent) | 8461.7 | lpm | 60.0 s | 2 | 42.4 | 1995.7 |
| Shell Scripts (8 concurrent) | 1099.1 | lpm | 60.1 s | 2 | 6.0 | 1831.8 |
| System Call Overhead | 2558469.9 | lps | 10.0 s | 7 | 15000.0 | 1705.6 |
| System Benchmarks Index Score: | | | | | | 1535.8 |
After updating:
| Test | Score | Unit | Time | Iters. | Baseline | Index |
|---|---|---|---|---|---|---|
| Dhrystone 2 using register variables | 39842314.8 | lps | 10.0 s | 7 | 116700.0 | 3414.1 |
| Double-Precision Whetstone | 3982.0 | MWIPS | 9.8 s | 7 | 55.0 | 724.0 |
| Execl Throughput | 5258.5 | lps | 30.0 s | 2 | 43.0 | 1222.9 |
| File Copy 1024 bufsize 2000 maxblocks | 492638.1 | KBps | 30.0 s | 2 | 3960.0 | 1244.0 |
| File Copy 256 bufsize 500 maxblocks | 152610.9 | KBps | 30.0 s | 2 | 1655.0 | 922.1 |
| File Copy 4096 bufsize 8000 maxblocks | 1050156.7 | KBps | 30.0 s | 2 | 5800.0 | 1810.6 |
| Pipe Throughput | 952188.4 | lps | 10.0 s | 7 | 12440.0 | 765.4 |
| Pipe-based Context Switching | 268401.0 | lps | 10.0 s | 7 | 4000.0 | 671.0 |
| Process Creation | 12918.3 | lps | 30.0 s | 2 | 126.0 | 1025.3 |
| Shell Scripts (1 concurrent) | 8094.2 | lpm | 60.0 s | 2 | 42.4 | 1909.0 |
| Shell Scripts (8 concurrent) | 1046.0 | lpm | 60.1 s | 2 | 6.0 | 1743.3 |
| System Call Overhead | 817288.1 | lps | 10.0 s | 7 | 15000.0 | 544.9 |
| System Benchmarks Index Score: | | | | | | 1160.9 |
Now, mitigating Meltdown by switching off the CPU caches:
You wouldn’t even want to run UnixBench without CPU caches. Here’s a “simpler” benchmark that tells you why:
# time perl -e 'for (1..1000000) {}'
real 0m0.056s
user 0m0.052s
sys 0m0.000s
# insmod disable_cache.ko
# time perl -e 'for (1..1000000) {}'
real 0m44.689s
user 0m40.044s
sys 0m0.520s
# rmmod disable_cache
In other words, don’t do this unless you enjoy working on a system that is some 800 times slower. (Don’t try this in a GUI session.)
Nonetheless, here’s some code to disable the CPU caches. (Modified from https://www.linuxquestions.org/questions/linux-kernel-70/disabling-cpu-caches-936077/)
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * Disable caching on the CPU that executes this function.
 *
 * Sets bit 30 of CR0 (the CD, "cache disable", bit) and then executes
 * WBINVD to write back and invalidate whatever is still held in the caches.
 * Intended to be run on every CPU through on_each_cpu().
 *
 * @p: unused; present only to match the on_each_cpu() callback signature.
 */
void _disable_cache(void *p) {
	printk(KERN_ALERT "Disabling L1 and L2 caches on processor %d.\n", smp_processor_id());
	/*
	 * volatile + "memory" clobber: this sequence changes how every later
	 * memory access behaves, so the compiler must not move loads/stores
	 * across it.  The block switches back with ".att_syntax prefix" (not
	 * "noprefix") so the compiler-generated AT&T code that follows is
	 * assembled in the assembler's normal mode.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov rax,cr0\n\t"
		"or rax,(1 << 30)\n\t"		/* set CR0 bit 30 */
		"mov cr0,rax\n\t"
		"wbinvd\n\t"
		".att_syntax prefix\n\t"
		: : : "rax", "cc", "memory");
}
/*
 * Re-enable caching on the CPU that executes this function.
 *
 * Clears bit 30 of CR0 (the CD, "cache disable", bit) and then executes
 * WBINVD.  Intended to be run on every CPU through on_each_cpu().
 *
 * @p: unused; present only to match the on_each_cpu() callback signature.
 */
void _enable_cache(void *p) {
	printk(KERN_ALERT "Enabling L1 and L2 caches on processor %d.\n", smp_processor_id());
	/*
	 * volatile + "memory" clobber: the compiler must not move loads/stores
	 * across the change of caching mode.  The block switches back with
	 * ".att_syntax prefix" (not "noprefix") so the compiler-generated AT&T
	 * code that follows is assembled in the assembler's normal mode.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov rax,cr0\n\t"
		"and rax,~(1 << 30)\n\t"	/* clear CR0 bit 30 */
		"mov cr0,rax\n\t"
		"wbinvd\n\t"
		".att_syntax prefix\n\t"
		: : : "rax", "cc", "memory");
}
/*
 * Module load hook: run _disable_cache() on every CPU.
 *
 * Always returns 0.
 */
static int disable_cache_init(void)
{
	/* Passed as the last ("wait") argument of on_each_cpu(). */
	const int wait_for_cpus = 1;

	on_each_cpu(_disable_cache, NULL, wait_for_cpus);

	return 0;
}
/*
 * Module unload hook: run _enable_cache() on every CPU, undoing what the
 * load hook did.
 */
static void disable_cache_exit(void)
{
	/* Passed as the last ("wait") argument of on_each_cpu(). */
	const int wait_for_cpus = 1;

	on_each_cpu(_enable_cache, NULL, wait_for_cpus);
}
/* Register the functions to run when the module is loaded and unloaded. */
module_init(disable_cache_init);
module_exit(disable_cache_exit);
Makefile:
# Out-of-tree build of disable_cache.ko against the running kernel's headers.
obj-m += disable_cache.o

.PHONY: all
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(CURDIR) modules
Note that you need to indent the recipe line with a tab in the Makefile. CR0 can only be accessed (read or written) from ring 0, and thus a kernel module is needed.
Here’s some example code to just read the CR0 registers on all CPUs:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * Print the state of CR0 bit 30 (the CD, "cache disable", bit) for the CPU
 * that executes this function: 1 means the bit is set, 0 means it is clear.
 * Intended to be run on every CPU through on_each_cpu().
 *
 * @p: unused; present only to match the on_each_cpu() callback signature.
 */
void cache_status(void *p) {
	long int cr0_cd = 0;

	/*
	 * The asm leaves bit 30 of CR0 in bit 0 of the output, so the result
	 * is already 0 or 1 and needs no further masking in C.
	 * volatile: a control-register read must not be cached, merged or
	 * moved by the compiler.  The block switches back with
	 * ".att_syntax prefix" (not "noprefix") so the compiler-generated AT&T
	 * code that follows is assembled in the assembler's normal mode.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov %0, cr0\n\t"
		"and %0, (1 << 30)\n\t"
		"shr %0, 30\n\t"
		".att_syntax prefix\n\t"
		: "=r" (cr0_cd)
		:
		: "cc");
	printk(KERN_INFO "Processor %d: %ld\n", smp_processor_id(), cr0_cd);
}
/*
 * Module load hook: run cache_status() on every CPU.
 *
 * Always returns 0.
 */
static int cache_status_init(void) {
	/* Passed as the last ("wait") argument of on_each_cpu(). */
	const int wait_for_cpus = 1;

	on_each_cpu(cache_status, NULL, wait_for_cpus);

	return 0;
}
/*
 * Module unload hook: run cache_status() on every CPU once more, so the
 * bit is reported again when the module is removed.
 */
static void cache_status_exit(void) {
	/* Passed as the last ("wait") argument of on_each_cpu(). */
	const int wait_for_cpus = 1;

	on_each_cpu(cache_status, NULL, wait_for_cpus);
}
/* Register the functions to run when the module is loaded and unloaded. */
module_init(cache_status_init);
module_exit(cache_status_exit);
And the corresponding Makefile:
# Out-of-tree build of cache_status.ko against the running kernel's headers.
obj-m += cache_status.o

.PHONY: all
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(CURDIR) modules