After copying the GPU files to my Gentoo install and installing opencl and pyopencl I ran a python benchmark below to see the difference in speed between the CPU and GPU
# example provided by Roger Pau Monn'e
import pyopencl as cl
import numpy
import numpy.linalg as la
import datetime
from time import time
a = numpy.random.rand(1000).astype(numpy.float32)
b = numpy.random.rand(1000).astype(numpy.float32)
c_result = numpy.empty_like(a)
# Speed in normal CPU usage
time1 = time()
for i in range(1000):
for j in range(1000):
c_result[i] = a[i] + b[i]
c_result[i] = c_result[i] * (a[i] + b[i])
c_result[i] = c_result[i] * (a[i] / 2.0)
time2 = time()
print ("Execution time of test without OpenCL: ", time2 - time1, "s")
for platform in cl.get_platforms():
for device in platform.get_devices():
print ("===============================================================")
print ("Platform name:", platform.name)
print ("Platform profile:", platform.profile)
print ("Platform vendor:", platform.vendor)
print ("Platform version:", platform.version)
print ("---------------------------------------------------------------")
print ("Device name:", device.name)
print ("Device type:", cl.device_type.to_string(device.type))
print ("Device memory: ", device.global_mem_size//1024//1024, 'MB')
print ("Device max clock speed:", device.max_clock_frequency, 'MHz')
print ("Device compute units:", device.max_compute_units)
# Simnple speed test
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
prg = cl.Program(ctx, """
__kernel void sum(__global const float *a,
__global const float *b, __global float *c)
{
int loop;
int gid = get_global_id(0);
for(loop=0; loop<1000;loop++)
{
c[gid] = a[gid] + b[gid];
c[gid] = c[gid] * (a[gid] + b[gid]);
c[gid] = c[gid] * (a[gid] / 2.0);
}
}
""").build()
exec_evt = prg.sum(queue, a.shape, None, a_buf, b_buf, dest_buf)
exec_evt.wait()
elapsed = 1e-9*(exec_evt.profile.end - exec_evt.profile.start)
print ("Execution time of test: %g s" % elapsed)
c = numpy.empty_like(a)
cl._enqueue_read_buffer(queue, dest_buf, c).wait()
error = 0
for i in range(1000):
if c[i] != c_result[i]:
error = 1
if error:
print ("Results doesn't match!!")
else:
print ("Results OK")
Here is the result
StarFive ~ # python test.py
Execution time of test without OpenCL: 34.012295722961426 s
===============================================================
Platform name: PowerVR
Platform profile: EMBEDDED_PROFILE
Platform vendor: Imagination Technologies
Platform version: OpenCL 3.0
---------------------------------------------------------------
Device name: PowerVR B-Series BXE-4-32
Device type: ALL | GPU
Device memory: 7927 MB
Device max clock speed: 594 MHz
Device compute units: 1
Execution time of test: 0.014952 s
Results OK