False Sharing Experiment
What You'll Learn
- How to measure false sharing impact
- Why shared counters are slow
- How padding eliminates false sharing
- Per-thread counters vs shared counters
Experiment: Shared Counter (False Sharing)
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>
using Clock = std::chrono::high_resolution_clock;
using Duration = std::chrono::nanoseconds;
// False sharing: counters on same cache line
struct SharedCounters {
int counters[8]; // All on same cache line (or close)
};
// Padded: each counter on separate cache line
struct PaddedCounters {
int counter;
char padding[64 - sizeof(int)]; // Pad to cache line size
};
double benchmark_shared(int num_threads) {
SharedCounters shared;
for (int i = 0; i < 8; ++i) shared.counters[i] = 0;
const int iterations = 10000000;
auto start = Clock::now();
std::vector<std::thread> threads;
for (int t = 0; t < num_threads; ++t) {
threads.emplace_back([&, t]() {
for (int i = 0; i < iterations; ++i) {
shared.counters[t % 8]++; // False sharing!
}
});
}
for (auto& th : threads) th.join();
auto end = Clock::now();
auto elapsed = std::chrono::duration_cast<Duration>(end - start);
return static_cast<double>(elapsed.count()) / (num_threads * iterations);
}
double benchmark_padded(int num_threads) {
PaddedCounters padded[8];
for (int i = 0; i < 8; ++i) padded[i].counter = 0;
const int iterations = 10000000;
auto start = Clock::now();
std::vector<std::thread> threads;
for (int t = 0; t < num_threads; ++t) {
threads.emplace_back([&, t]() {
for (int i = 0; i < iterations; ++i) {
padded[t % 8].counter++; // No false sharing!
}
});
}
for (auto& th : threads) th.join();
auto end = Clock::now();
auto elapsed = std::chrono::duration_cast<Duration>(end - start);
return static_cast<double>(elapsed.count()) / (num_threads * iterations);
}
int main() {
std::cout << "Threads\tShared (ns)\tPadded (ns)\tSpeedup\n";
for (int threads = 1; threads <= 8; threads *= 2) {
double shared = benchmark_shared(threads);
double padded = benchmark_padded(threads);
std::cout << threads << "\t" << shared << "\t" << padded
<< "\t" << shared / padded << "x\n";
}
return 0;
} What to Measure
- ns/operation: Time per increment vs thread count
- Throughput: Operations per second
- Speedup: Padded vs shared performance
Expected Shape of Results
You should see:
- Shared counters: Performance degrades with more threads (coherence traffic)
- Padded counters: Performance scales better (no false sharing)
- Speedup: 2-10x improvement with padding
Interpretation
Coherence traffic dominates: When threads modify the same cache line, the cache coherence protocol must synchronize caches, causing expensive memory traffic. Padding eliminates this by ensuring each thread's data is on a separate cache line.
Checklist
- ✓ Measured shared vs padded counter performance
- ✓ Observed false sharing impact
- ✓ Understood coherence traffic