Log-likelihood benchmark
Log-likelihood benchmark¶
This is a simple benchmark, which I use for basic test of vector-based computing engines.
The problem is to find log-likelihood of normal distribution: $\sum_i \log \left[ \dfrac{1}{\sqrt{2 \pi \sigma}} \exp \left( - \dfrac{(x_i - x)^2}{2 \sigma^2} \right) \right] $
There are elementwise subtraction, division, power, exponent, logarithm and summation of array, so this is broader test.
tested:
- numpy + scipy's pdf
- numpy
- cython
- numexpr
- theano
- parakeet
- C++
- FORTRAN
computing log-likelihood for normal distribution
Notes
- Not optimizing computations here (but in theory theano and parakeet may remove unnecessary computations)
- This test includes exp, log, division and summation of array
- Everything is running on CPU in one thread, this limitation is for 'fairness' of tests
import numpy
import scipy
from scipy.stats import norm
import theano
Scipy implementation¶
def llh_scipy(data, mean, sigma):
lh = norm(mean, sigma).pdf(data)
return numpy.log(lh).sum()
Numpy implementation¶
def llh_numpy(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = numpy.exp(- s)
pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
return numpy.log(pdfs).sum()
Cython implementation¶
%load_ext Cython
%%cython
cdef extern from "math.h":
double sqrt(double m)
double exp(double m)
double log(double m)
import cython
import numpy as np
cimport numpy as np
from numpy cimport ndarray
pi = np.pi
@cython.boundscheck(False)
@cython.wraparound(False)
def llh_cython(ndarray[np.float64_t, ndim=1] data, double mean, double sigma):
cdef int l = len(data)
cdef double llh = 0
cdef double s = 0
for i in xrange(l):
s = (data[i] - mean) ** 2 / (2 * (sigma ** 2))
llh += log(exp(-s) / (sqrt(2 * pi) * sigma))
return llh
Numexpr implementation¶
import numexpr
numexpr.set_num_threads(1)
def llh_numexpr(data, mean, sigma):
expression = 'sum(log(exp(- (data-mean) **2 / (2 * sigma ** 2)) / (sqrt(2 * pi) * sigma)))'
return numexpr.evaluate(expression, local_dict=dict(data=data, mean=mean, sigma=sigma, pi=numpy.pi))
Parakeet implementation¶
import parakeet
parakeet.config.backend = 'c'
@parakeet.jit
def llh_parakeet(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = numpy.exp(- s)
pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
return numpy.log(pdfs).sum()
Theano implementation¶
import theano
import theano.tensor as T
print theano.config.device
theano.config.openmp
def llh_theano(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = T.exp(- s)
pdfs /= T.sqrt(2 * numpy.pi) * sigma
return T.log(pdfs).sum()
mean, sigma = T.scalars('m', 's')
d = T.vector('data')
llh_theano = theano.function([d, mean, sigma], llh_theano(d, mean, sigma))
FORTRAN implementation¶
%load_ext fortranmagic
%%fortran
subroutine llh_fortran(data, mean, sigma, result)
real*8, dimension(:), intent(in) :: data
real*8, intent(in) :: mean, sigma
real*8, intent(out) :: result
real*8, dimension(size(data, 1)) :: s
real*8, parameter :: PI = 3.14159265358979323846
s = (data - mean) ** 2 / (2 * sigma ** 2)
s = exp(- s) / (sqrt(2 * PI) * sigma)
result = sum(log(s))
end subroutine llh_fortran
C++ implementation for comparison of speed¶
we are neither passing, nor returning anything in c++. Just doing same operations in C++ for some array to compare speed.
Mind the overhead for creating new process - it is essential for small sizes.
%%writefile test_speed.cpp
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// using namespace std;
int main(int argc, char** argv) {
if (argc < 4){
std::cout << "run with n_samples, mean, sigma!";
return 1;
}
int size = atoi(argv[1]);
double mean = atof(argv[2]);
double sigma = atof(argv[3]);
double * data = new double[size];
double factor = 1. / size;
for (int i=0; i<size; ++i){
data[i] = i * factor;
}
double result = 0.;
double s = 0.;
double x = 0.;
for (int i=0; i<size; ++i){
x = (data[i] - mean);
s = x * x / (2 * (sigma * sigma));
result += log(exp(-s) / (sqrt(2 * M_PI) * sigma));
}
std::cout << std::endl << result << std::endl;
return 0;
}
!g++ test_speed.cpp -o test_speed -O3
def llh_cpp(data, mean, sigma):
size = len(data)
out = !./test_speed {len(data)} {mean} {sigma}
Data generation, checking that all functions output the same value¶
from collections import OrderedDict
functions = OrderedDict()
functions['scipy'] = llh_scipy
functions['numpy'] = llh_numpy
functions['cython'] = llh_cython
functions['numexpr'] = llh_numexpr
functions['parakeet'] = llh_parakeet
functions['theano'] = llh_theano
functions['fortran'] = llh_fortran
functions['c++'] = llh_cpp
data = numpy.random.normal(size=1000000).astype('float64')
for name, function in functions.items():
print name, function(data, 0.1, 1.1)
import timeit
sizes = [10 ** 5, 10 ** 6, 10 ** 7, 10 ** 8]
import pandas
scores = pandas.DataFrame(data=0, columns=functions.keys(), index=sizes)
for size in sizes:
for name, function in functions.items():
data = numpy.random.normal(size=size).astype('float64')
result = %timeit -o function(data, 0.1, 1.1)
scores.loc[size, name] = result.best
Results (time in seconds, less is better)¶
scores
Comparison to numpy time (less is better)¶
normalized_scores = scores.copy()
for column in normalized_scores.columns:
normalized_scores[column] /= scores['numpy']
normalized_scores
Conclusion¶
Many libraries claim that they can speed up number crunching in python. Results of this test are floating (+- 0.1), but what we can see
- numpy turned out to be fastest at moderate sizes of arrays
- numpy implementation at least not more complex than others
- parakeet was the only to get sensible speed up
Technical info¶
import multiprocessing
print multiprocessing.cpu_count(), 'xeon cores'
numpy.__version__
scipy.__version__
import cython
cython.__version__
numexpr.__version__
parakeet.__version__
theano.__version__
!g++ -v
!gfortran -v
This post was written in IPython. You can download the notebook from repository.