You are on page 1 of 19

My Experiments

It's all about Me

Home

About Me

Programming/Coding

Reviews

My Photos

Home > Programming, Technology > OpenCL GPU Matrix multiplication program

OpenCL GPU Matrix multiplication program


August 24, 2011 Vasanth Raja Chittampally Leave a comment Go to comments

Here is the code to multiply two matrices using OpenCL, a heterogeneous-systems

programming language. OpenCL is called heterogeneous because code
written in OpenCL can be ported to a CPU, a GPU, or a Cell processor.

//Author: Vasanth Raja
//Program to multiply two matrices using OpenCL in GPU

// NOTE(review): "stdafx.h" is a Visual Studio precompiled-header artifact;
// guard it so the program also builds with other toolchains.
#ifdef _MSC_VER
#include "stdafx.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Matrix dimensions.  All resolve to 128, so every matrix is square; the
 * kernel's output indexing relies on this (see hello.cl). */
#define widthA 128
#define heightA 128

#define widthB heightA
#define heightB 128

#define widthC widthA
#define heightC heightB

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MEM_SIZE (128)
#define MAX_SOURCE_SIZE (0x100000)

/*
 * Multiplies two random 128x128 matrices on the GPU via OpenCL, then
 * recomputes the product on the CPU for comparison.  Inputs and both
 * results are written to text files in the working directory.
 */
int main()
{
    /* Host buffers: A and B are the inputs, Res receives the GPU result,
     * D receives the CPU reference result.  C was allocated but never
     * used in the original listing; kept for compatibility. */
    float * A = (float *)malloc(sizeof(float)*widthA*heightA);
    float * B = (float *)malloc(sizeof(float)*widthB*heightB);
    float * C = (float *)malloc(sizeof(float)*widthC*heightC);
    float * Res = (float *)malloc(sizeof(float)*widthC*heightC);
    float * D = (float *)malloc(sizeof(float)*widthC*heightC);
    if (!A || !B || !C || !Res || !D) {
        fprintf(stderr, "Host allocation failed.\n");
        exit(1);
    }

    /* Fill A with random values and log it to a file. */
    FILE * fp1 = fopen("matAdata.txt", "w");
    if (!fp1) {
        fprintf(stderr, "Failed to open matAdata.\n");
        exit(1);
    }
    for (int i = 0; i < widthA; i++)
    {
        /* BUGFIX: the published listing lost this loop's condition
         * ("for(int j=0;j {"); it must iterate across the row. */
        for (int j = 0; j < heightA; j++)
        {
            float p = (rand()%100)/7.0;
            *(A+i*heightA+j) = rand()%100 + p;
            fprintf(fp1, "%f ", *(A+i*heightA+j));
        }
        fprintf(fp1, "\n");
    }
    fclose(fp1);

    /* Fill B likewise. */
    fp1 = fopen("matBdata.txt", "w");
    if (!fp1) {
        /* BUGFIX: message said "matAdata" (copy-paste error). */
        fprintf(stderr, "Failed to open matBdata.\n");
        exit(1);
    }
    for (int i = 0; i < widthB; i++)
    {
        /* BUGFIX: loop condition restored (lost in the listing). */
        for (int j = 0; j < heightB; j++)
        {
            float p = (rand()%100)/7.0;
            *(B+i*heightB+j) = rand()%100 + p;
            /* BUGFIX: original printed B+i*heightA+j after storing at
             * B+i*heightB+j; identical here only because
             * heightA == heightB. */
            fprintf(fp1, "%f ", *(B+i*heightB+j));
        }
        fprintf(fp1, "\n");
    }
    fclose(fp1);

    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem memobjA = NULL;
    cl_mem memobjB = NULL;
    cl_mem memobjC = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;

    FILE *fp;
    char fileName[] = "./hello.cl";
    char *source_str;
    size_t source_size;
    int row = widthA;
    int col = heightC;

    /* Load the source code containing the kernel */
    fp = fopen(fileName, "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    /* Get Platform and Device Info */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
                         &ret_num_devices);

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* Create device buffers for the two inputs and the result.
     * (The original also created two one-int buffers, rowA and colC,
     * that the kernel never read; removed as dead code — the widths are
     * passed to the kernel by value below.) */
    memobjA = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             widthA * heightA * sizeof(float), NULL, &ret);
    memobjB = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             widthB * heightB * sizeof(float), NULL, &ret);
    memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             widthC * heightC * sizeof(float), NULL, &ret);

    /* Copy matrices A and B to their device buffers.
     * BUGFIX: the original transferred widthA*heightA*sizeof(int) bytes;
     * the data are floats, so sizeof(float) is the correct element size
     * (often the same size, but not guaranteed by the C standard). */
    ret = clEnqueueWriteBuffer(command_queue, memobjA, CL_TRUE, 0,
                               widthA * heightA * sizeof(float), A,
                               0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0,
                               widthB * heightB * sizeof(float), B,
                               0, NULL, NULL);

    /* Create Kernel Program from the source */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                        (const size_t *)&source_size, &ret);

    /* Build Kernel Program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* Create OpenCL Kernel */
    kernel = clCreateKernel(program, "matrixMultiplication", &ret);

    /* Set OpenCL Kernel Arguments: A, B, C, widthA, widthB. */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC);
    ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row);
    ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col);

    /* Execute OpenCL Kernel: one work-item per element of C, launched in
     * 16x16 work-groups (128 is evenly divisible by 16, as required). */
    size_t globalThreads[2] = {widthA, heightB};
    size_t localThreads[2] = {16, 16};
    /* BUGFIX: the trailing parameters are num_events_in_wait_list,
     * event_wait_list, event.  The original passed
     * (localThreads, NULL, 0, NULL), shifting all three by one slot. */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                 globalThreads, localThreads,
                                 0, NULL, NULL);

    /* Copy the result from the device buffer back to the host. */
    ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0,
                              widthA * heightC * sizeof(float), Res,
                              0, NULL, NULL);

    fp1 = fopen("matGPURes.txt", "w");
    if (!fp1) {
        /* BUGFIX: message said "matAdata" (copy-paste error). */
        fprintf(stderr, "Failed to open matGPURes.\n");
        exit(1);
    }
    printf("\nResult\n");
    for (int i = 0; i < widthA; i++)
    {
        for (int j = 0; j < heightC; j++)
        {
            fprintf(fp1, "%f ", *(Res+i*heightC+j));
        }
        fprintf(fp1, "\n");
    }
    fclose(fp1);

    /* Release OpenCL resources. */
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(memobjA);
    ret = clReleaseMemObject(memobjB);
    ret = clReleaseMemObject(memobjC);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(source_str);

    /* CPU reference multiplication for verification.  row == col == 128,
     * so the stride mixing below matches the square-matrix layout. */
    float sum = 0.0;
    for (int i = 0; i < widthA; i++)
    {
        for (int j = 0; j < heightC; j++)
        {
            sum = 0;
            for (int k = 0; k < widthB; k++)
            {
                sum += A[i*col+k] * B[k*row+j];
            }
            D[i*heightC+j] = sum;
        }
    }

    fp1 = fopen("matNormalMultiplicationRes.txt", "w");
    if (!fp1) {
        /* BUGFIX: message said "matAdata" (copy-paste error). */
        fprintf(stderr, "Failed to open matNormalMultiplicationRes.\n");
        exit(1);
    }
    printf("\nResult\n");
    for (int i = 0; i < widthA; i++)
    {
        for (int j = 0; j < heightC; j++)
        {
            fprintf(fp1, "%f ", *(D+i*heightC+j));
        }
        fprintf(fp1, "\n");
    }
    /* BUGFIX: the original never closed this file. */
    fclose(fp1);

    /* BUGFIX: free host buffers (all five leaked in the original). */
    free(A);
    free(B);
    free(C);
    free(Res);
    free(D);

#ifdef _WIN32
    /* "pause" exists only on Windows; the original called it
     * unconditionally (and twice), producing "sh: pause: command not
     * found" on POSIX shells. */
    system("pause");
#endif
    return 0;
}
15
4
15
5
15
6
15
7
15
8
15
9
16
0
16
1
16
2
16
3
16
4
16
5
16
6
16
7
16
8
16
9
17
0
17
1
17
2
17
3
17
4
17
5
17
6
17
7
17
8
17
9
18
0
18
1
18
2
18
3
18
4
18
5
18
6
18
7
18
8
18
9
19
0
19
1
19
2
19
3
19
4
19
5
19
6
19
7
19
8
19
9
20
0
20
1
20
2
20
3
20
4
20
5
20
6
20
7
20
8
20
9
21
0
211
21
2
21
3
21
4
21
5
21
6
21
7
21
8
21
9
22
0
22
1
22
2
You can check the configuration and setup in Visual Studio here.

The actual Kernel executed in the GPU is as follows.

1
2
__kernel
3 void matrixMultiplication(__global float* A, __global float* B, __global
4 float* C, int widthA, int widthB )
5 {
6 int i = get_global_id(0);
int j = get_global_id(1);
7 float value=0;
8 for ( int k = 0; k < widthA; k++)
9 {
1 value = value + A[k + j * widthA] * B[k*widthB + i];
0 }
C[i + widthA * j] = value;
11}
1
2

Advertisements

Share this:

Twitter

Reddit

Facebook
Print

Email

Related

Aparapi Java Matrix Multiplication ExampleIn "Programming"

Aparapi + Java + OpenCLIn "Programming"

Install OpenCL in Windows7 and configure in Visual StudioIn "Programming"

Categories: Programming, Technology Tags: GPGPU programming, GPU matrix multiplication,


OpenCL, OpenCL GPU matrix multiplication, OpenCL matrix multiplication
Comments (4) Trackbacks (1) Leave a comment Trackback

1.

Ivan

January 17, 2012 at 10:01 pm

Reply

Thank you very much!!! I learned a lot from it!!!

Could you please upload code for matrix inversion!!!! or give at least a hint!!!

2.

KOnark PAtel

February 23, 2013 at 1:58 pm

Reply

thanks for this code. i have a query. when i used few days back this code on my w8,
visual studio, it was working well. but since today the program runs too slowly for
executing this lines
float p=(rand()%100)/7.0;
*(A+i*heightA+j)=rand()%100 + p;
fprintf(fp1, %f ,*(A+i*heightA+j));
and also program is not finishing. command console just remains open. can u suggest any
solution?

3.

paweln66

January 17, 2014 at 10:49 pm

Reply

Useful solution. In line 45 and 62 you have a bug missing elements of for loop.

For me its works correctly, but I getting following output:

Result
sh: pause: command not found

Result
sh: pause: command not found

What means sh: pause: command not found?

4.

Alex

January 19, 2014 at 5:15 pm

Reply

Cool code thx. Hovewer I tried to implement it in vs 2012 and got two errors.

1.First:
for(int i = 0;i < widthA; i++)
{
for(int j=0;j { //I think you forgot here the hightA
float p=(rand()%100)/7.0;
*(A+i*heightA+j)=rand()%100 + p;
fprintf(fp1, "%f ",*(A+i*heightA+j));
}
fprintf(fp1, "\n");
}
2.Second:
for(int i = 0;i < widthB; i++)
{
for(int j=0; j { //Here the same thing
float p=(rand()%100)/7.0;
*((B+i*heightB+j))=rand()%100 + p;
fprintf(fp1, "%f ",*(B+i*heightA+j));
}
fprintf(fp1, "\n");
}

New code
5KK73
GPU
assign
ment
websit
e
2014/2
015
Home Matrix multiplication in OpenCL
This document describes a matrix multiplication example application
Administrati
using OpenCL for Nvidia GPUs, the focus will be on the code structure
on
for the host application and the OpenCL GPU kernels. For examples of
Request
optimization matrix multiplication please refer to the CUDA example
server
documentation, most CUDA kernels will be very similar in a OpenCL
account
implementation. This example can be found here. The source code
Announceme
for the OpenCL matrix multiplication example can be found here.
nts
Host code
Resources
Slides The host code initializes the OpenCL capable GPUs, allocates and
5KK73 forum transfers memory and executed the OpenCL kernel.

Examples The code shown below declares OpenCL memories which will be
Matrix instantiated on the device, hence the prefix 'd_'. The A and B
multiplication memories are two input matrices of size 1024x1024, C is the result
matrix. Since the memory described above is on the device we also
- OpenCL need to declare and allocate memory on the host, in this case the
Matrix server, and fill the input arrays with values. This is done by the
Multiplication radomInit() function.
- CUDA
// OpenCL device memory for matrices
cl_mem d_A;
Assignment/ cl_mem d_B;
competition cl_mem d_C;
Mining
application // set seed for rand()
Assignment srand(2014);
guidelines
//Allocate host memory for matrices A and B
Competition
unsigned int size_A = WA * HA;
Score board unsigned int mem_size_A = sizeof(float) * size_A;
Submit float* h_A = (float*) malloc(mem_size_A);

Other unsigned int size_B = WB * HB;


OpenCL unsigned int mem_size_B = sizeof(float) * size_B;
reference float* h_B = (float*) malloc(mem_size_B);
(Khronos)
//Initialize host memory
OpenCL
randomInit(h_A, size_A);
tutorials randomInit(h_B, size_B);
(StreamComp
uting) //Allocate host memory for the result C
More turorials unsigned int size_C = WC * HC;
(Codeplex) unsigned int mem_size_C = sizeof(float) * size_C;
Porting CUDA float* h_C = (float*) malloc(mem_size_C);
to OpenCL
(AMD) The output memory on the host will be allocated but only written
OpenCL with the result after execution of the OpenCL kernel.
Matrix The function clCreateCommandQueue creates a OpenCL command
Multiply queue. The OpenCL functions that are submitted to a command-
Tutorial queue are enqueued in the order the calls are made but can be
Getting configured to execute in-order or out-of-order. The properties
Started with argument in clCreateCommandQueue can be used to specify the
CUDA execution order.

cl_uint dev_cnt = 0;
clGetPlatformIDs(0, 0, &dev_cnt);

cl_platform_id platform_ids[100];
clGetPlatformIDs(dev_cnt, platform_ids, NULL);

// Connect to a compute device


int gpu = 1;
err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GP
U : CL_DEVICE_TYPE_CPU, 1,
&device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}

// Create a compute context


context = clCreateContext(0, 1, &device_id, NULL, NULL, &err)
;
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}

// Create a command commands


commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}

Once an OpenCL context and command queue are defined, the OpenCL
kernel can be loaded. In OpenCL, kernels are typically loaded at
runtime and compiled by the function clBuildProgram. In order to do
this the actual kernel is loaded by the function LoadOpenCLKernel
and transformed into an OpenCL program description with the
clCreateProgramWithSource function. The built kernel description will
then be made ready for execution by the clCreateKernel function. Be
aware that the second argument should match the name of the
kernel as described in the .cl file.

// Create the compute program from the source file


char *KernelSource;
long lFileSize;

lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSo


urce, false);
if( lFileSize < 0L ) {
perror("File read failed");
return 1;
}

program = clCreateProgramWithSource(context, 1, (const char *


*) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUIL
D_LOG, sizeof(buffer),
buffer, &len);

printf("%s\n", buffer);
exit(1);
}

// Create the compute kernel in the program we wish to run


//
kernel = clCreateKernel(program, "matrixMul", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
exit(1);
}

Now the kernel is ready for execution the buffers on the compute
device (in our case the GPU) should be allocated, this is done with
the clCreateBuffer function, the arguments of this function can be
used to describe if a memory is read-only, write-only or read-write.
Specifying this correct can help to increase performance. The
function clSetKernelArg links the allocated memory space in the GPU
to the arguments of the kernel, in our case the A,B and C matrices
and two integers specifying the width of the matrices.

// Create the input and output arrays in device memory for ou


r calculation
d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A,
NULL, &err);
d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY
_HOST_PTR, mem_size_A, h_A,
&err);
d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY
_HOST_PTR, mem_size_B, h_B,
&err);

if (!d_A || !d_B || !d_C)


{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
printf("Running matrix multiplication for matrices A (%dx%d)
and B (%dx%d) ...\n",
WA,HA,WB,HB);

//Launch OpenCL kernel


size_t localWorkSize[2], globalWorkSize[2];

int wA = WA;
int wC = WC;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C)
;
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A
);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B
);
err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA);
err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC);

if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err
);
exit(1);
}

The function clEnqueueNDRangeKernel enqueues a command to


execute a kernel on a device. Some important parameters are the
global work size and the local work size. These are explained as
follows by the OpenCL documentation:

Global work size


Points to an array of work_dim unsigned values that describe the
number of global work-items in work_dim dimensions that will
execute the kernel function. The total number of global work-items is
computed as global_work_size[0] *...* global_work_size[work_dim -
1]. The values specified in global_work_size cannot exceed the range
given by the sizeof(size_t) for the device on which the kernel
execution will be enqueued. The sizeof(size_t) for a device can be
determined using CL_DEVICE_ADDRESS_BITS in the table of OpenCL
Device Queries for clGetDeviceInfo. If, for example,
CL_DEVICE_ADDRESS_BITS = 32, i.e. the device uses a 32-bit address
space, size_t is a 32-bit unsigned integer and global_work_size values
must be in the range 1 .. 2^32 - 1. Values outside this range return a
CL_OUT_OF_RESOURCES error.

Local work size


Points to an array of work_dim unsigned values that describe the
number of work-items that make up a work-group (also referred to as
the size of the work-group) that will execute the kernel specified by
kernel. The total number of work-items in a work-group is computed
as local_work_size[0] *... * local_work_size[work_dim - 1]. The total
number of work-items in the work-group must be less than or equal
to the CL_DEVICE_MAX_WORK_GROUP_SIZE value specified in table
of OpenCL Device Queries for clGetDeviceInfo and the number of
work-items specified in local_work_size[0],...
local_work_size[work_dim - 1] must be less than or equal to the
corresponding values specified by
CL_DEVICE_MAX_WORK_ITEM_SIZES[0],....
CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1]. The explicitly
specified local_work_size will be used to determine how to break the
global work-items specified by global_work_size into appropriate
work-group instances. If local_work_size is specified, the values
specified in global_work_size[0],... global_work_size[work_dim - 1]
must be evenly divisible by the corresponding values specified in
local_work_size[0],... local_work_size[work_dim - 1].

In effect, these parameters describe something similar to the CUDA


block sizes.

localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;

err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globa


lWorkSize, localWorkSize,
0, NULL, NULL);

if (err != CL_SUCCESS)
{
printf("Error: Failed to execute kernel! %d\n", err);
exit(1);
}

After execution of the kernel the clEnqueueReadBuffer is used to read


the result memory on the device and copy it to the memory on the
host.

//Retrieve result from device


err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size
_C, h_C, 0, NULL, NULL);

if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}

GPU code
The OpenCL kernel is very similar in structure to a CUDA kernel, with
some small differences. The external memory is described with
__global and shared memory is described with __local, whereas this
would be called shared memory in CUDA. Additionally a similar
structure to CUDA is used for determining the thread id. This can be
done via the get_global_id function which works for multiple
dimensions. The return values of this function can be used to
determine the matrix location to read for calculation. Due to the
similar structure between CUDA and OpenCL many of the
optimizations described in the CUDA matrix multiplication example
can be applied to the OpenCL version without too many
modifications.

/* kernel.cl
 * Matrix multiplication: C = A * B.
 * Device code.
 */

// OpenCL Kernel
__kernel void
matrixMul(__global float* C,
          __global float* A,
          __global float* B,
          int wA, int wB)
{

   // Column and row of the C element this work-item produces.
   int col = get_global_id(0);
   int row = get_global_id(1);

   // Accumulate the dot product of row `row` of A
   // with column `col` of B.
   float acc = 0;
   int k = 0;
   while (k < wA)
   {
       float a = A[row * wA + k];
       float b = B[k * wB + col];
       acc += a * b;
       ++k;
   }

   // Each work-item writes exactly one element of
   // the result matrix back to device memory.
   C[row * wA + col] = acc;
}

You might also like