Vectorization Experiments

What You'll Learn

Experiment: Dot Product

#include <chrono>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Clock used for all timings. steady_clock is monotonic, so a measured interval
// cannot be distorted by system time adjustments; high_resolution_clock carries
// no such guarantee (it may simply alias system_clock, as it does in libstdc++).
using Clock = std::chrono::steady_clock;
// Unit in which elapsed times are measured and reported.
using Duration = std::chrono::nanoseconds;

// Scalar version: reference dot product computed with a plain sequential loop.
//
// Returns the sum of a[i] * b[i], accumulated in single precision in index
// order. If the vectors differ in length, only the common prefix (the first
// min(a.size(), b.size()) elements) is used, so neither vector is ever read
// out of bounds. Empty input yields 0.0f.
float dot_product_scalar(const std::vector<float>& a, const std::vector<float>& b) {
    // Bound the loop by the shorter vector; using a.size() alone would index
    // past the end of b whenever b is shorter than a.
    const std::size_t count = a.size() < b.size() ? a.size() : b.size();

    float sum = 0.0f;
    for (std::size_t i = 0; i < count; ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}

// Auto-vectorization candidate: same arithmetic as the scalar version, with a
// GCC loop hint added. Intended to be built with -O3 -march=native.
//
// Returns the sum of a[i] * b[i] over the common prefix of the two vectors
// (the first min(a.size(), b.size()) elements), so neither vector is ever read
// out of bounds. Empty input yields 0.0f.
//
// NOTE(review): the loop is a floating-point sum reduction. GCC normally keeps
// such a reduction in source order unless it is allowed to reassociate the
// additions (e.g. -ffast-math), so the hint alone may not produce SIMD code.
// Confirm with -fopt-info-vec before interpreting the timings.
float dot_product_vectorized(const std::vector<float>& a, const std::vector<float>& b) {
    // Bound the loop by the shorter vector; using a.size() alone would index
    // past the end of b whenever b is shorter than a.
    const std::size_t count = a.size() < b.size() ? a.size() : b.size();

    float sum = 0.0f;
    #pragma GCC ivdep  // Hint: no loop-carried dependencies that block SIMD
    for (std::size_t i = 0; i < count; ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}

// Times `func` and returns the mean elapsed time of one call, in nanoseconds,
// as measured by Clock.
//
// func:       callable to measure; invoked `iterations` times back to back.
// iterations: number of timed calls (default 100). A value <= 0 performs no
//             calls and returns 0.0 instead of dividing by zero.
//
// A single pair of clock reads surrounds the whole loop, so the reported figure
// also includes the per-call std::function dispatch and the loop overhead.
double benchmark(std::function<float()> func, int iterations = 100) {
    if (iterations <= 0) {
        return 0.0;  // Nothing to time; avoids an inf/NaN average.
    }

    // Every result is stored to a volatile so the stores, and the values
    // feeding them, cannot be discarded as dead code.
    volatile float sink = 0;
    const auto start = Clock::now();
    for (int i = 0; i < iterations; ++i) {
        sink = func();
    }
    const auto end = Clock::now();

    const auto elapsed = std::chrono::duration_cast<Duration>(end - start);
    return static_cast<double>(elapsed.count()) / iterations;
}

// Entry point: times both dot-product variants on identical inputs and prints
// the mean time per call for each, followed by the ratio of the two.
int main() {
    // One million elements per vector, each filled with a single constant.
    const size_t element_count = 1000000;
    const std::vector<float> lhs(element_count, 1.0f);
    const std::vector<float> rhs(element_count, 2.0f);

    auto run_scalar = [&lhs, &rhs]() { return dot_product_scalar(lhs, rhs); };
    auto run_vectorized = [&lhs, &rhs]() { return dot_product_vectorized(lhs, rhs); };

    // The scalar variant is measured first, then the vectorized one.
    const double scalar_ns = benchmark(run_scalar);
    const double vectorized_ns = benchmark(run_vectorized);
    const double speedup = scalar_ns / vectorized_ns;

    std::cout << "Scalar: " << scalar_ns << " ns\n"
              << "Vectorized: " << vectorized_ns << " ns\n"
              << "Speedup: " << speedup << "x\n";

    return 0;
}

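Compile: g++ -O3 -march=native -o vectorization vectorization.cpp

Note: the loop is a floating-point sum reduction, which GCC typically does not vectorize unless it is allowed to reorder the additions. If both versions run at the same speed, try adding -ffast-math, and use -fopt-info-vec to see which loops were actually vectorized.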

What to Measure

Expected Shape of Results

Typical results:

Interpretation

Load/store vs compute: If operations are memory-bound, vectorization helps less. If compute-bound, vectorization helps more.

Why speedup is not always 4x/8x: Memory bandwidth can be the bottleneck. Processing 8 elements in parallel doesn't help if you can't load/store them fast enough.

Checklist