autoscale: true slidenumbers: true
Nick Thompson
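Suppose we have equispaced samples $$y_i = f(t_0 + ih), \quad i = 0, \ldots, n-1.$$
Then a killer interpolator for this data is $$s(t) = \sum_{i=0}^{n-1} y_i \, \mathrm{sinc}\!\left(\frac{\pi (t - t_0)}{h} - i\pi\right), \qquad \mathrm{sinc}(x) = \frac{\sin x}{x}.$$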
It's
where my best alternative interpolator with
So I had a big incentive to make it fast . . .
template<class RandomAccessContainer>
class whittaker_shannon {
public:
using Real = typename RandomAccessContainer::value_type;
whittaker_shannon(RandomAccessContainer&& y, Real t0, Real h)
y_{std::move(y)}, t0_{t0}, h_{h} {}
Real operator()(Real t) const {
Real x = boost::math::constants::pi<Real>()*(t-m_t0)/m_h;
Real s = 0;
for (size_t i = 0; i < y_.size(); ++i) {
s += y[i]*boost::math::sinc(x - i*pi<Real>());
}
return s;
}
};
// Google Benchmark driver: builds an interpolator over state.range(0)
// pseudo-random samples and times a single evaluation of it.
template<class Real>
void BM_WhittakerShannon(benchmark::State& state) {
    // Fixed seed, so every run sees the same samples and the same argument.
    std::mt19937 gen(323723);
    std::uniform_real_distribution<Real> dis(-0.95, 0.95);

    std::vector<Real> v(state.range(0));
    for (auto& sample : v) {
        sample = dis(gen);
    }

    // Samples start at t0 = 0 with spacing h = 1/32.
    auto ws = whittaker_shannon(std::move(v), Real(0), 1/Real(32));

    // The evaluation point is drawn once, outside the timed loop.
    Real arg = dis(gen);
    for (auto _ : state) {
        // Keep the compiler from discarding the otherwise unused result.
        benchmark::DoNotOptimize(ws(arg));
    }

    // Report the problem size for the asymptotic-complexity fit.
    state.SetComplexityN(state.range(0));
}
// Register the double-precision benchmark for sample counts 2^8, 2^9, ..., 2^15
// and ask the framework to fit the timings against O(N).
BENCHMARK_TEMPLATE(BM_WhittakerShannon, double)
    ->RangeMultiplier(2)
    ->Range(1 << 8, 1 << 15)
    ->Complexity(benchmark::oN);

// Benchmark-library macro that supplies the program entry point.
BENCHMARK_MAIN();
If you're spending a lot of time evaluating sines and cosines, you're doing something wrong
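Use $$\sin(\pi x - i\pi) = (-1)^i \sin(\pi x)$$ to pull the sine out of the sum: $$s(t) = \frac{\sin(\pi x)}{\pi} \sum_{i=0}^{n-1} \frac{(-1)^i \, y_i}{x - i}, \qquad x = \frac{t - t_0}{h}.$$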
// Sine-free inner loop: because sin(pi*x - i*pi) = (-1)^i * sin(pi*x), the sine
// is evaluated once after the loop and each term is a single division whose
// sign alternates with the index.
// Returns sin(pi*x)/pi * sum_i (-1)^i * y_[i]/(x - i), with x = (t - t0_)/h_.
Real operator()(Real t) const {
// Offset of t from t0_, measured in units of the spacing h_.
Real x = (t-t0_)/h_;
Real s = 0;
for (size_t i = 0; i < y_.size(); ++i) {
// x - i converts the integer index to Real on every iteration.
// NOTE(review): there is no guard for x == i, where this divides by zero.
Real term = y_[i]/(x-i);
// Odd indices carry the minus sign of (-1)^i.
if(i & 1) {
s -= term;
}
else {
s += term;
}
}
// Apply the hoisted sin(pi*x)/pi factor once.
return s*sin(pi<Real>()*x)/pi<Real>();
}
So instead of `sin(pi<Real>()*x)`, compute `boost::math::sin_pi(x)`.
The cvtsi2sd instruction is super slow! Can we get rid of it?
Also, the adds and subtracts are suspiciously slow. Are they misattributed branch mispredicts?
// Iterator-based variant: a running denominator z (starting at x and decreased
// by 1 per sample) replaces the per-iteration x - i, so the loop contains no
// integer-to-floating-point conversion and no sign branch.
// Returns sin(pi*x)/pi * sum_i y_[i]/(x - i), with x = (t - t0_)/h_.
// NOTE(review): no (-1)^i factor is applied in this loop; presumably the
// odd-indexed samples in y_ are negated elsewhere -- confirm.
Real operator()(Real t) const {
// Offset of t from t0_, measured in units of the spacing h_.
Real x = (t-t0_)/h_;
Real s = 0;
// z holds x - i for the sample currently pointed to by `it`.
Real z = x;
auto it = y_.begin();
// end() is read once, before the loop.
auto end = y_.end();
while (it != end) {
// NOTE(review): there is no guard for z == 0, where this divides by zero.
s += *it++/z;
z -= 1;
}
// Apply the hoisted sin(pi*x)/pi factor once.
return s*sin(pi<Real>()*x)/pi<Real>();
}
This gives a speedup on clang, but not gcc . . .
// Four-way unrolled evaluation: four independent accumulators and four running
// denominators process four consecutive samples per iteration, so the loop
// body has no dependency between lanes.
// Returns sin(pi*x)/pi * sum_i y_[i]/(x - i), with x = (t - t0_)/h_.
// NOTE(review): no (-1)^i factor is applied in this loop; presumably the
// odd-indexed samples in y_ are negated elsewhere -- confirm.
Real operator()(Real t) const {
    // Offset of t from t0_, measured in units of the spacing h_.
    Real x = (t-t0_)/h_;
    Real y0 = 0;
    Real y1 = 0;
    Real y2 = 0;
    Real y3 = 0;
    // zk holds x - (i + k) for the group of four starting at index i.
    Real z0 = x;
    Real z1 = x - 1;
    Real z2 = x - 2;
    Real z3 = x - 3;
    auto it = y_.begin();
    auto end = y_.end();
    // Only whole groups of four go through the unrolled loop; advancing by 4
    // until `it == end` would step past the end for other sizes.
    auto bulk_end = it + 4*((end - it)/4);
    while (it != bulk_end) {
        Real k0 = 1/z0;
        Real k1 = 1/z1;
        Real k2 = 1/z2;
        Real k3 = 1/z3;
        // Dereference the advanced iterator: `*it+1` would add 1 to the
        // first sample instead of reading the next one.
        y0 += (*it)*k0;
        y1 += (*(it+1))*k1;
        y2 += (*(it+2))*k2;
        y3 += (*(it+3))*k3;
        z0 -= 4;
        z1 -= 4;
        z2 -= 4;
        z3 -= 4;
        it += 4;
    }
    Real s = y0 + y1 + y2 + y3;
    // Up to three leftover samples; z0 already equals x - (index of *it).
    while (it != end) {
        s += (*it)/z0;
        z0 -= 1;
        ++it;
    }
    // Apply the hoisted sin(pi*x)/pi factor once.
    return s*sin(pi<Real>()*x)/pi<Real>();
}
I feel like I've made it obvious what I want here, but only clang 6 actually vectorizes this.
gcc and Apple clang just don't get it. . .
What about `-ffast-math`?