// RUSTFLAGS='-C target-cpu=haswell' cargo run --release
// RUSTFLAGS='-C target-cpu=haswell' cargo bench

// cargo rustc --release --bin sum_array_rust -- -C target-cpu=haswell
// ./target/release/sum_array_rust

#![feature(float_algebraic)]
#![feature(iter_array_chunks)]
#![feature(portable_simd)]
use nix::time::{clock_gettime, ClockId};
use rayon::ThreadPoolBuilder;
pub mod generate;
pub mod sums;

/// Number of `f32` elements requested from `generate::random_vec`, and the
/// divisor used by `vec_op` when reporting the per-element cost.
const N: usize = 640_000_000;
/// How many times `vec_op` invokes the function under test inside the timed loop.
const REPS: usize = 1;

/// Times a summation function over `vec` and prints one result line.
///
/// `vec_op` is called `REPS` times on `vec`; the returned values are accumulated
/// into a running total so the computation has an observable result. The printed
/// line contains `name`, the accumulated total and the measured cost in
/// picoseconds per element (elapsed nanoseconds * 1000 / `N` / `REPS`).
///
/// Note that the per-element figure divides by the constant `N`, not by
/// `vec.len()`.
///
/// # Panics
///
/// Panics if the clock cannot be read.
pub fn vec_op(name: &str, vec_op: fn(&Vec<f32>) -> f32, vec: &Vec<f32>) {
    let mut total = 0.0;
    // Interval timing must use the monotonic clock: CLOCK_REALTIME can jump
    // (NTP corrections, manual changes) and yield wrong or negative durations.
    let start = clock_gettime(ClockId::CLOCK_MONOTONIC).expect("CLOCK_MONOTONIC is readable");
    for _ in 0..REPS {
        // black_box keeps the optimizer from treating the input as loop-invariant
        // and hoisting the repeated call out of the timed loop.
        total += vec_op(std::hint::black_box(vec));
    }
    let end = clock_gettime(ClockId::CLOCK_MONOTONIC).expect("CLOCK_MONOTONIC is readable");
    let elapsed_ns =
        (end.tv_sec() - start.tv_sec()) * 1_000_000_000 + (end.tv_nsec() - start.tv_nsec());
    println!(
        "{:25} calculated {:12.4} in {:5} ps/elt",
        name,
        total,
        elapsed_ns * 1000 / N as i64 / REPS as i64
    );
}

/// Benchmark driver: reports the vector size, generates the input, sets up the
/// global rayon pool and then times every summation variant in turn.
pub fn main() {
    let megabytes = N * std::mem::size_of::<f32>() / (1024 * 1024);
    println!("Vector size: {} MB", megabytes);

    let vec = generate::random_vec(N);
    // Pre-initialize the global pool before any timed run; the build result is
    // deliberately discarded.
    let _ = ThreadPoolBuilder::new().num_threads(24).build_global();

    // Label / implementation pairs, timed in this order.
    let benchmarks: [(&str, fn(&Vec<f32>) -> f32); 7] = [
        ("vec_sum_for_loop", sums::vec_sum_for_loop),
        ("vec_sum_two_accumulators", sums::vec_sum_two_accumulators),
        ("vec_sum_fold", sums::vec_sum_fold),
        ("vec_sum_method", sums::vec_sum_method),
        ("vec_sum_algebraic", sums::vec_sum_algebraic),
        ("vec_sum_parallel", sums::vec_sum_parallel),
        ("vec_sum_simd", sums::vec_sum_simd),
    ];
    for (label, summer) in benchmarks {
        vec_op(label, summer, &vec);
    }
}
