SIMDy

Blazingly fast computing in Python

SIMDy allow you to write high performance kernels directly from Python. A number of Python features can be used inside of a kernel. They include expressions, functions, user-defined classes, conditionals, arrays and loops. To achieve maximum possible performance SIMDy supports vector data types which allows CPU vector unit utilization.

SIMDy main features are:

  • automatic generation of AVX-512, FMA, AVX2, AVX, SSE instructions
  • high performance
  • on-the-fly code generation
  • native code generation
  • easy to write vectorized code

Below you can see simple example how to use SIMDy to calculate pi

from multiprocessing import cpu_count
from simdy import int64, float64, simdy_kernel

@simdy_kernel(nthreads=cpu_count())
def calculate_pi(n_samples: int64) -> float64:
    inside = int64x4(0)
    for i in range(n_samples):
        x = 2.0 * random_float64x4() - float64x4(1.0)
        y = 2.0 * random_float64x4() - float64x4(1.0)
        inside += select(int64x4(1), int64x4(0), x * x + y * y < float64x4(1.0))

    nn = inside[0] + inside[1] + inside[2] + inside[3]
    result = 4.0 * float64(nn) / float64(n_samples * 4)
    return result

result = calculate_pi(int64(25_000_000))
print(sum(result) / cpu_count())

Mandelbrot calculation using SIMDy


import time
from simdy import float64x3, simdy_kernel, int32, float64, array_float64x3,\
    int32x4, float64x4
from multiprocessing import cpu_count


@simdy_kernel(nthreads=cpu_count())
def mandel(width: int32, height: int32, pixels: array_float64x3):

    def num_iterations(max_it: int32, cx: float64x4, cy: float64x4):
        iterations = int32x4(0)
        zx = float64x4(0)
        zy = float64x4(0)
        treshold = float64x4(4.0)
        iterations = int32x4(0)
        for i in range(max_it):
            increment = select(int32x4(0), int32x4(1), zx * zx + zy * zy > treshold)
            if increment[0] + increment[1] + increment[2] + increment[3] == 0:
                return iterations
            iterations = iterations + increment

            new_zx = zx * zx - zy * zy
            new_zy = 2.0 * zx * zy

            zx = cx + new_zx
            zy = cy + new_zy
        return iterations

    def write_pixel(pixels, width, x, y, i, scale):
        r = float64(i % 4 * 64)
        g = float64(i % 8 * 32)
        b = float64(i % 16 * 16)
        pixels[y * width + x] = float64x3(r, g, b) * scale

    yb = 1.5
    ya = -1.5
    xa = -2.0
    xb = 1.0
    max_it = 255
    dx = float64x4(xb - xa)
    width1 = float64x4(float64(width - 1))
    xa_2 = float64x4(xa)
    scale = float64x3(1.0 / 256.0)

    for y in range(thread_idx(), height, nthreads()):
        cy = float64x4(float64(y) * (yb - ya) / float64(height - 1) + ya)
        for x in range(0, width, 4):
            cx = float64x4(int32x4(x, x + 1, x + 2, x + 3)) * dx / width1 + xa_2
            iterations = num_iterations(max_it, cx, cy)
            write_pixel(pixels, width, x, y, iterations[0], scale)
            write_pixel(pixels, width, x + 1, y, iterations[1], scale)
            write_pixel(pixels, width, x + 2, y, iterations[2], scale)
            write_pixel(pixels, width, x + 3, y, iterations[3], scale)


width = int32(1200)
height = int32(1200)
pixels = array_float64x3(size=width * height)


start = time.clock()
mandel(width, height, pixels)
print('Generating mandelbrot took %f seconds.' % (time.clock() - start))


def write_ppm_ascii(width, height, pixels, filename):
    with open(filename, 'w') as fp:
        fp.write('P3\n')
        fp.write('%i %i\n' % (width, height))
        fp.write('255\n')
        # Pixels are written top to bottom
        for y in range(height - 1, -1, -1):
            for x in range(width):
                p = pixels[y * width + x]
                r = int(p[0] * 255.99)
                g = int(p[1] * 255.99)
                b = int(p[2] * 255.99)
                fp.write('%i %i %i\n' % (r, g, b))

start = time.clock()
write_ppm_ascii(width, height, pixels, 'figure.ppm')
print('Writing image to file took %f seconds.' % (time.clock() - start))