The Meltdown patch for the Linux kernel makes use of the relatively new PCID (process-context identifier) CPU feature. I still sometimes use my old laptop, which contains a Core 2 Duo Penryn CPU (T7250) that does not support PCID, so I did a quick UnixBench run to see what kind of difference the absence of PCID would make. At the end of this article, I have a bonus “benchmark” for an alternative way to mitigate Meltdown: disabling the CPU’s caches. All my tests were performed on Debian Wheezy (currently oldstable) using kernel version 3.16.0-5-amd64.
First of all, here are another person’s results for a CPU that supports PCID. And since that’s in Japanese, here’s the important bit:
| Test | Before | After | Change (positive is better) |
|---|---|---|---|
| System Call Overhead | 5391.9 | 4009.7 | -25.63% |
Now, my tests on the Penryn CPU:
| Test | Before | After | Change (positive is better) |
|---|---|---|---|
| Dhrystone 2 using register variables | 3360.4 | 3414.1 | +1.60% |
| Double-Precision Whetstone | 724.1 | 724 | -0.01% |
| Execl Throughput | 1351.7 | 1222.9 | -9.53% |
| File Copy 1024 bufsize 2000 maxblocks | 1582 | 1244 | -21.37% |
| File Copy 256 bufsize 500 maxblocks | 1255.9 | 922.1 | -26.58% |
| File Copy 4096 bufsize 8000 maxblocks | 1982.4 | 1810.6 | -8.67% |
| Pipe Throughput | 1672.8 | 765.4 | -54.24% |
| Pipe-based Context Switching | 1108.3 | 671 | -39.46% |
| Process Creation | 1150 | 1025.3 | -10.84% |
| Shell Scripts (1 concurrent) | 1995.7 | 1909 | -4.34% |
| Shell Scripts (8 concurrent) | 1831.8 | 1743.3 | -4.83% |
| System Call Overhead | 1705.6 | 544.9 | -68.05% |
| System Benchmarks Index Score | 1535.8 | 1160.9 | -24.41% |
And the raw data in case you are interested:
Before updating:
| Test | Score | Unit | Time | Iters. | Baseline | Index |
|---|---|---|---|---|---|---|
| Dhrystone 2 using register variables | 39215974.0 | lps | 10.0 s | 7 | 116700.0 | 3360.4 |
| Double-Precision Whetstone | 3982.6 | MWIPS | 9.9 s | 7 | 55.0 | 724.1 |
| Execl Throughput | 5812.4 | lps | 29.2 s | 2 | 43.0 | 1351.7 |
| File Copy 1024 bufsize 2000 maxblocks | 626453.0 | KBps | 30.0 s | 2 | 3960.0 | 1582.0 |
| File Copy 256 bufsize 500 maxblocks | 207854.8 | KBps | 30.0 s | 2 | 1655.0 | 1255.9 |
| File Copy 4096 bufsize 8000 maxblocks | 1149781.6 | KBps | 30.0 s | 2 | 5800.0 | 1982.4 |
| Pipe Throughput | 2080979.1 | lps | 10.0 s | 7 | 12440.0 | 1672.8 |
| Pipe-based Context Switching | 443337.7 | lps | 10.0 s | 7 | 4000.0 | 1108.3 |
| Process Creation | 14490.3 | lps | 30.0 s | 2 | 126.0 | 1150.0 |
| Shell Scripts (1 concurrent) | 8461.7 | lpm | 60.0 s | 2 | 42.4 | 1995.7 |
| Shell Scripts (8 concurrent) | 1099.1 | lpm | 60.1 s | 2 | 6.0 | 1831.8 |
| System Call Overhead | 2558469.9 | lps | 10.0 s | 7 | 15000.0 | 1705.6 |
| System Benchmarks Index Score: | 1535.8 | | | | | |
After updating:
| Test | Score | Unit | Time | Iters. | Baseline | Index |
|---|---|---|---|---|---|---|
| Dhrystone 2 using register variables | 39842314.8 | lps | 10.0 s | 7 | 116700.0 | 3414.1 |
| Double-Precision Whetstone | 3982.0 | MWIPS | 9.8 s | 7 | 55.0 | 724.0 |
| Execl Throughput | 5258.5 | lps | 30.0 s | 2 | 43.0 | 1222.9 |
| File Copy 1024 bufsize 2000 maxblocks | 492638.1 | KBps | 30.0 s | 2 | 3960.0 | 1244.0 |
| File Copy 256 bufsize 500 maxblocks | 152610.9 | KBps | 30.0 s | 2 | 1655.0 | 922.1 |
| File Copy 4096 bufsize 8000 maxblocks | 1050156.7 | KBps | 30.0 s | 2 | 5800.0 | 1810.6 |
| Pipe Throughput | 952188.4 | lps | 10.0 s | 7 | 12440.0 | 765.4 |
| Pipe-based Context Switching | 268401.0 | lps | 10.0 s | 7 | 4000.0 | 671.0 |
| Process Creation | 12918.3 | lps | 30.0 s | 2 | 126.0 | 1025.3 |
| Shell Scripts (1 concurrent) | 8094.2 | lpm | 60.0 s | 2 | 42.4 | 1909.0 |
| Shell Scripts (8 concurrent) | 1046.0 | lpm | 60.1 s | 2 | 6.0 | 1743.3 |
| System Call Overhead | 817288.1 | lps | 10.0 s | 7 | 15000.0 | 544.9 |
| System Benchmarks Index Score: | 1160.9 | | | | | |
Now, mitigating Meltdown by switching off CPU caches:
You wouldn’t even want to run UnixBench without CPU caches. Here’s a “simpler” benchmark that tells you why:
# time perl -e 'for (1..1000000) {}'
real 0m0.056s
user 0m0.052s
sys 0m0.000s
# insmod disable_cache.ko
# time perl -e 'for (1..1000000) {}'
real 0m44.689s
user 0m40.044s
sys 0m0.520s
# rmmod disable_cache
Unless you enjoy working on a system that is some 800 times slower. (Don’t try to do this in a GUI setting.)
Nonetheless, here’s some code to disable the CPU caches. (Modified from https://www.linuxquestions.org/questions/linux-kernel-70/disabling-cpu-caches-936077/)
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * Per-CPU callback: turn caching off on the CPU this function runs on.
 *
 * Logs the CPU number, sets bit 30 of CR0 (the cache-disable bit) and then
 * executes WBINVD so that whatever is already cached is written back and
 * invalidated. CR0 is only accessible from ring 0, so this must run in
 * kernel context.
 *
 * @p: unused; present only to match the callback signature expected by
 *     on_each_cpu().
 */
void _disable_cache(void *p) {
	printk(KERN_ALERT "Disabling L1 and L2 caches on processor %d.\n", smp_processor_id());
	/*
	 * "volatile" and the "memory" clobber keep the compiler from moving
	 * memory accesses across the point where the caching mode changes.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov rax,cr0\n\t"
		"or rax,(1 << 30)\n\t"
		"mov cr0,rax\n\t"
		"wbinvd\n\t"
		/*
		 * Switch back to the compiler's default dialect. This has to be
		 * "prefix": with "noprefix" the assembler would keep treating
		 * bare names such as "ax" or "di" as registers for the rest of
		 * the translation unit.
		 */
		".att_syntax prefix\n\t"
		: : : "rax", "memory" );
}
/*
 * Per-CPU callback: turn caching back on for the CPU this function runs on.
 *
 * Logs the CPU number, clears bit 30 of CR0 (the cache-disable bit) and
 * then executes WBINVD. CR0 is only accessible from ring 0, so this must
 * run in kernel context.
 *
 * @p: unused; present only to match the callback signature expected by
 *     on_each_cpu().
 */
void _enable_cache(void *p) {
	printk(KERN_ALERT "Enabling L1 and L2 caches on processor %d.\n", smp_processor_id());
	/*
	 * "volatile" and the "memory" clobber keep the compiler from moving
	 * memory accesses across the point where the caching mode changes.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov rax,cr0\n\t"
		"and rax,~(1 << 30)\n\t"
		"mov cr0,rax\n\t"
		"wbinvd\n\t"
		/*
		 * Switch back to the compiler's default dialect. This has to be
		 * "prefix": with "noprefix" the assembler would keep treating
		 * bare names such as "ax" or "di" as registers for the rest of
		 * the translation unit.
		 */
		".att_syntax prefix\n\t"
		: : : "rax", "memory" );
}
/*
 * Module load hook: run _disable_cache() on every CPU.
 *
 * Returns 0 unconditionally, i.e. loading the module always succeeds.
 */
static int disable_cache_init(void)
{
	/* Non-zero "wait" argument; presumably blocks until all CPUs are done. */
	const int block_until_done = 1;

	on_each_cpu(_disable_cache, NULL, block_until_done);

	return 0;
}
/*
 * Module unload hook: run _enable_cache() on every CPU, undoing what the
 * load hook did.
 */
static void disable_cache_exit(void)
{
	/* Non-zero "wait" argument; presumably blocks until all CPUs are done. */
	const int block_until_done = 1;

	on_each_cpu(_enable_cache, NULL, block_until_done);
}
/* Caches are disabled when the module is loaded (insmod) ... */
module_init(disable_cache_init);
/* ... and re-enabled when it is removed (rmmod). */
module_exit(disable_cache_exit);
Makefile:
# Build disable_cache.c as an out-of-tree kernel module (disable_cache.ko).
obj-m += disable_cache.o

# The recipe line below must start with a tab character, not spaces.
all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
Note that the recipe line (the `make -C …` command) must be indented with a tab in the Makefile. CR0 can only be read or written from Ring 0, and thus a kernel module is needed.
Here’s some example code that just reads bit 30 (the cache-disable bit) of the CR0 register on each CPU:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp.h>
MODULE_LICENSE("Dual BSD/GPL");
/*
 * Per-CPU callback: report bit 30 of CR0 (the cache-disable bit) for the
 * CPU this function runs on.
 *
 * Prints "Processor <n>: 1" when the bit is set and "Processor <n>: 0"
 * when it is clear. CR0 is only readable from ring 0, so this must run in
 * kernel context.
 *
 * @p: unused; present only to match the callback signature expected by
 *     on_each_cpu().
 */
void cache_status(void *p) {
	long int cr0_30 = 0;

	/*
	 * "volatile" is required: the asm has an output and no inputs, so
	 * without it the compiler may treat the CR0 read as a pure computation
	 * and reuse or drop it.
	 */
	__asm__ __volatile__(".intel_syntax noprefix\n\t"
		"mov %0, cr0\n\t"
		"and %0, (1 << 30)\n\t"	/* isolate bit 30 */
		"shr %0, 30\n\t"	/* move it down to bit 0 */
		/*
		 * Switch back to the compiler's default dialect ("prefix", not
		 * "noprefix", so later compiler-generated assembly is parsed
		 * with the usual '%' register prefixes).
		 */
		".att_syntax prefix\n\t"
		: "=r" (cr0_30));

	/*
	 * The asm already reduced the value to 0 or 1, so it is printed as is.
	 * (The former "cr0_30&(1<<30)>>30" parsed as "cr0_30 & 1" because ">>"
	 * binds tighter than "&"; it gave the same result only by accident.)
	 */
	printk(KERN_INFO "Processor %d: %ld\n", smp_processor_id(), cr0_30);
}
/*
 * Module load hook: have every CPU print its cache status via
 * cache_status().
 *
 * Returns 0 unconditionally, i.e. loading the module always succeeds.
 */
static int cache_status_init(void)
{
	/* Non-zero "wait" argument; presumably blocks until all CPUs are done. */
	const int block_until_done = 1;

	on_each_cpu(cache_status, NULL, block_until_done);

	return 0;
}
/*
 * Module unload hook: have every CPU print its cache status once more via
 * cache_status().
 */
static void cache_status_exit(void)
{
	/* Non-zero "wait" argument; presumably blocks until all CPUs are done. */
	const int block_until_done = 1;

	on_each_cpu(cache_status, NULL, block_until_done);
}
/* The status is printed when the module is loaded (insmod) ... */
module_init(cache_status_init);
/* ... and again when it is removed (rmmod). */
module_exit(cache_status_exit);
And the corresponding Makefile:
# Build cache_status.c as an out-of-tree kernel module (cache_status.ko).
obj-m += cache_status.o

# The recipe line below must start with a tab character, not spaces.
all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules