SIMDy

Blazingly fast computing in Python

SIMDy is compiler that transforms python code to native machine code. Compilation can be invoked automatically when calling a function that uses the @simdy_kernel decorator or using Kernel class. A number of Python features can be used inside of a kernel. They include expressions, functions, user-defined classes, conditionals, while loop. Supported basic data types are integers, reals(floats) and vector data types which allows CPU vector unit utilization. Currently only x86 based CPU are supported.

SIMDy main featrues are:

  • high performance
  • on-the-fly code generation
  • native code generation
  • automatic generation of SSE, AVX, AVX2, AVX-512, FMA instructions
  • easy to write vectorized code

Mandelbrot calculation using SIMDy


import time
from simdy import float64x3, simdy_kernel, int32, float64, array_float64x3,\
    int32x4, float64x4
from multiprocessing import cpu_count


@simdy_kernel(nthreads=cpu_count())
def mandel(width: int32, height: int32, pixels: array_float64x3):

    def num_iterations(max_it: int32, cx: float64x4, cy: float64x4):
        iterations = int32x4(0)
        zx = float64x4(0)
        zy = float64x4(0)
        treshold = float64x4(4.0)
        iterations = int32x4(0)
        for i in range(max_it):
            increment = select(int32x4(0), int32x4(1), zx * zx + zy * zy > treshold)
            if increment[0] + increment[1] + increment[2] + increment[3] == 0:
                return iterations
            iterations = iterations + increment

            new_zx = zx * zx - zy * zy
            new_zy = 2.0 * zx * zy

            zx = cx + new_zx
            zy = cy + new_zy
        return iterations

    def write_pixel(pixels, width, x, y, i, scale):
        r = float64(i % 4 * 64)
        g = float64(i % 8 * 32)
        b = float64(i % 16 * 16)
        pixels[y * width + x] = float64x3(r, g, b) * scale

    yb = 1.5
    ya = -1.5
    xa = -2.0
    xb = 1.0
    max_it = 255
    dx = float64x4(xb - xa)
    width1 = float64x4(float64(width - 1))
    xa_2 = float64x4(xa)
    scale = float64x3(1.0 / 256.0)

    for y in range(thread_idx(), height, nthreads()):
        cy = float64x4(float64(y) * (yb - ya) / float64(height - 1) + ya)
        for x in range(0, width, 4):
            cx = float64x4(int32x4(x, x + 1, x + 2, x + 3)) * dx / width1 + xa_2
            iterations = num_iterations(max_it, cx, cy)
            write_pixel(pixels, width, x, y, iterations[0], scale)
            write_pixel(pixels, width, x + 1, y, iterations[1], scale)
            write_pixel(pixels, width, x + 2, y, iterations[2], scale)
            write_pixel(pixels, width, x + 3, y, iterations[3], scale)


width = int32(1200)
height = int32(1200)
pixels = array_float64x3(size=width * height)


start = time.clock()
mandel(width, height, pixels)
print('Generating mandelbrot took %f seconds.' % (time.clock() - start))


def write_ppm_ascii(width, height, pixels, filename):
    with open(filename, 'w') as fp:
        fp.write('P3\n')
        fp.write('%i %i\n' % (width, height))
        fp.write('255\n')
        # Pixels are written top to bottom
        for y in range(height - 1, -1, -1):
            for x in range(width):
                p = pixels[y * width + x]
                r = int(p[0] * 255.99)
                g = int(p[1] * 255.99)
                b = int(p[2] * 255.99)
                fp.write('%i %i %i\n' % (r, g, b))

start = time.clock()
write_ppm_ascii(width, height, pixels, 'figure.ppm')
print('Writing image to file took %f seconds.' % (time.clock() - start))