[CUDA] CPU vs GPU with Python
This post compares the speed of the CPU and the GPU on a few simple workloads, using Numba to compile the Python code for each target.
Multiplication
When we use the CPU:

import numpy as np
from timeit import default_timer as timer
from numba import vectorize

# Compile an element-wise multiply as a ufunc for the CPU target.
@vectorize(["float32(float32, float32)"], target='cpu')
def MultiplyMyVectors(a, b):
    return a * b

def main():
    N = 64000000
    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)
    C = np.ones(N, dtype=np.float32)

    start = timer()
    C = MultiplyMyVectors(A, B)
    vectormultiply_time = timer() - start

    print("C[:6] = " + str(C[:6]))
    print("C[-6:]= " + str(C[-6:]))
    print("This multiplication took %f seconds" % vectormultiply_time)

main()
C[:6] = [1. 1. 1. 1. 1. 1.]
C[-6:]= [1. 1. 1. 1. 1. 1.]
This multiplication took 0.094402 seconds
When we use the GPU:

import numpy as np
from timeit import default_timer as timer
from numba import vectorize

# Same ufunc, but compiled for the CUDA target and run on the GPU.
@vectorize(["float32(float32, float32)"], target='cuda')
def MultiplyMyVectors(a, b):
    return a * b

def main():
    N = 64000000
    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)
    C = np.ones(N, dtype=np.float32)

    start = timer()
    C = MultiplyMyVectors(A, B)
    vectormultiply_time = timer() - start

    print("C[:6] = " + str(C[:6]))
    print("C[-6:]= " + str(C[-6:]))
    print("This multiplication took %f seconds" % vectormultiply_time)

main()
C[:6] = [1. 1. 1. 1. 1. 1.]
C[-6:]= [1. 1. 1. 1. 1. 1.]
This multiplication took 0.627809 seconds
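
On this workload the GPU is actually slower. That is expected: the timed region includes copying A and B to the device and the result back to the host, and an element-wise multiply does too little arithmetic per byte to hide that cost. A minimal sketch of how to separate transfer from compute, assuming the same CUDA ufunc as above (device-array names like d_A are mine):

import numpy as np
from timeit import default_timer as timer
from numba import vectorize, cuda

@vectorize(["float32(float32, float32)"], target='cuda')
def MultiplyMyVectors(a, b):
    return a * b

N = 64000000
A = np.ones(N, dtype=np.float32)
B = np.ones(N, dtype=np.float32)

# Move the inputs to the GPU outside the timed region.
d_A = cuda.to_device(A)
d_B = cuda.to_device(B)

start = timer()
d_C = MultiplyMyVectors(d_A, d_B)  # result stays on the device, no host transfer
cuda.synchronize()                 # wait for the GPU to finish before stopping the clock
print("Compute only: %f seconds" % (timer() - start))

C = d_C.copy_to_host()             # copy the result back only when it is needed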
Filling an array
import numpy as np
from timeit import default_timer as timer
from numba import jit

# Plain Python loop, interpreted on the CPU.
def FillArrayWithoutGPU(a):
    for k in range(1000000):
        a[k] += 1

# Same loop, JIT-compiled by Numba; the first call includes compilation time.
@jit(target_backend='cuda')
def FillArrayWithGPU(a):
    for k in range(1000000):
        a[k] += 1

# Note: only the first 1,000,000 of the 10,000,000 elements are updated.
a = np.ones(10000000, dtype=np.float64)

start = timer()
FillArrayWithoutGPU(a)
print("On a CPU: ", timer() - start)

start = timer()
FillArrayWithGPU(a)
print("On a GPU: ", timer() - start)
On a CPU: 0.24581730199997764
On a GPU: 0.07611533699991924
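
A caveat: target_backend is an experimental keyword, and depending on the Numba version this function may simply be compiled for the CPU, so the speedup above may come from JIT compilation rather than the GPU. Writing an explicit CUDA kernel makes the GPU execution unambiguous. A minimal sketch, assuming a CUDA-capable device and one thread per element (the kernel name is mine):

import numpy as np
from numba import cuda

@cuda.jit
def fill_array_kernel(a):
    k = cuda.grid(1)       # this thread's global index
    if k < a.shape[0]:     # guard against threads past the end of the array
        a[k] += 1

a = np.ones(10000000, dtype=np.float64)
threads = 256
blocks = (a.shape[0] + threads - 1) // threads  # enough blocks to cover the array
fill_array_kernel[blocks, threads](a)           # Numba copies a to the device and back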
Mandelbrot
import numpy as np
from matplotlib.pylab import imshow, show
from timeit import default_timer as timer

def mandelbrot(x, y, max_iters):
    # Count iterations until the orbit of z escapes the radius-2 disk.
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z * z + c
        if (z.real * z.real + z.imag * z.imag) >= 4:
            return i
    return 255

def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    width = image.shape[1]
    height = image.shape[0]
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    # One pixel at a time, sequentially on the CPU.
    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandelbrot(real, imag, iters)
            image[y, x] = color

image = np.zeros((500 * 10, 750 * 10), dtype=np.uint8)
s = timer()
create_fractal(-2.0, 1.0, -1.0, 1.0, image, 20)
e = timer()
print("Execution time on CPU: %f seconds" % (e - s))
imshow(image)
show()
[Output image: the Mandelbrot set rendered on the CPU]
Execution time on CPU: 133.978606 seconds
import numpy as np
from matplotlib.pylab import imshow, show
from timeit import default_timer as timer
from numba import cuda

# Device function: callable only from GPU code.
@cuda.jit(device=True)
def mandelbrot(x, y, max_iters):
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z * z + c
        if (z.real * z.real + z.imag * z.imag) >= 4:
            return i
    return 255

# Kernel: each thread computes one pixel.
@cuda.jit
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    width = image.shape[1]
    height = image.shape[0]
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height
    x, y = cuda.grid(2)           # this thread's global (x, y) position
    if x < width and y < height:  # guard against threads outside the image
        real = min_x + x * pixel_size_x
        imag = min_y + y * pixel_size_y
        color = mandelbrot(real, imag, iters)
        image[y, x] = color

# Twice the resolution of the CPU version in each dimension, i.e. 4x the pixels.
image = np.zeros((500 * 10 * 2, 750 * 10 * 2), dtype=np.uint8)
nthreads = 32                                # 32 x 32 = 1024 threads per block
nblocksy = ((500 * 10 * 2) // nthreads) + 1
nblocksx = ((750 * 10 * 2) // nthreads) + 1

s = timer()
create_fractal[(nblocksx, nblocksy), (nthreads, nthreads)](-2.0, 1.0, -1.0, 1.0, image, 20)
e = timer()
print("Execution time on GPU: %f seconds" % (e - s))
imshow(image)
show()
[Output image: the Mandelbrot set rendered on the GPU]
Execution time on GPU: 0.691364 seconds
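
One caveat about this measurement: CUDA kernel launches are asynchronous. The clock above is still honest because Numba copies the host array image back after the kernel, which forces a wait, but the first call also pays a one-off JIT-compilation cost. A minimal sketch of a more explicit timing, reusing the kernel and launch configuration from the listing above:

# Allocate the image on the device so the timed region is compute only.
d_image = cuda.device_array((500 * 10 * 2, 750 * 10 * 2), dtype=np.uint8)

# Warm-up launch so JIT compilation is not counted.
create_fractal[(nblocksx, nblocksy), (nthreads, nthreads)](-2.0, 1.0, -1.0, 1.0, d_image, 20)
cuda.synchronize()

s = timer()
create_fractal[(nblocksx, nblocksy), (nthreads, nthreads)](-2.0, 1.0, -1.0, 1.0, d_image, 20)
cuda.synchronize()  # launches are asynchronous; wait before reading the clock
print("Kernel only: %f seconds" % (timer() - s))

image = d_image.copy_to_host()  # transfer back for display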