diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp
index f0c286e0c8..df23b690ae 100644
--- a/opencl/source/api/api.cpp
+++ b/opencl/source/api/api.cpp
@@ -3449,7 +3449,7 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
     }
 
     retVal = pCommandQueue->enqueueKernel(
-        kernel,
+        pKernel,
         workDim,
         globalWorkOffset,
         globalWorkSize,
@@ -5904,7 +5904,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
     }
 
     retVal = pCommandQueue->enqueueKernel(
-        kernel,
+        pKernel,
         workDim,
         globalWorkOffset,
         globalWorkSize,
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index 019721a08b..2182c03340 100644
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -85,7 +85,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     virtual cl_int enqueueFillBuffer(Buffer *buffer, const void *pattern, size_t patternSize, size_t offset,
                                      size_t size, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0;
 
-    virtual cl_int enqueueKernel(cl_kernel kernel, cl_uint workDim, const size_t *globalWorkOffset, const size_t *globalWorkSize,
+    virtual cl_int enqueueKernel(Kernel *kernel, cl_uint workDim, const size_t *globalWorkOffset, const size_t *globalWorkSize,
                                  const size_t *localWorkSize, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0;
 
     virtual cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) = 0;
diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index 15532d8d1e..21b8488660 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -144,7 +144,7 @@ class CommandQueueHw : public CommandQueue {
                             const cl_event *eventWaitList,
                             cl_event *event) override;
 
-    cl_int enqueueKernel(cl_kernel kernel,
+    cl_int enqueueKernel(Kernel *kernel,
                          cl_uint workDim,
                          const size_t *globalWorkOffset,
                          const size_t *globalWorkSize,
diff --git a/opencl/source/command_queue/enqueue_kernel.h b/opencl/source/command_queue/enqueue_kernel.h
index 14a109c816..e14bdecbe3 100644
--- a/opencl/source/command_queue/enqueue_kernel.h
+++ b/opencl/source/command_queue/enqueue_kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,7 @@ namespace NEO {
 
 template <typename GfxFamily>
 cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
-    cl_kernel clKernel,
+    Kernel *pKernel,
     cl_uint workDim,
     const size_t *globalWorkOffsetIn,
     const size_t *globalWorkSizeIn,
@@ -36,7 +36,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
     size_t workGroupSize[3] = {1, 1, 1};
     size_t enqueuedLocalWorkSize[3] = {0, 0, 0};
 
-    auto &kernel = *castToObjectOrAbort<Kernel>(clKernel);
+    auto &kernel = *pKernel;
     auto rootDeviceIndex = device->getRootDeviceIndex();
     const auto &kernelInfo = kernel.getKernelInfo(rootDeviceIndex);
 
diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h
index 92b6446fa1..9d49e3e3ca 100644
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@@ -88,7 +88,7 @@ class MockCommandQueue : public CommandQueue {
                              size_t size, cl_uint numEventsInWaitList,
                              const cl_event *eventWaitList, cl_event *event) override { return CL_SUCCESS; }
 
-    cl_int enqueueKernel(cl_kernel kernel, cl_uint workDim, const size_t *globalWorkOffset,
+    cl_int enqueueKernel(Kernel *kernel, cl_uint workDim, const size_t *globalWorkOffset,
                          const size_t *globalWorkSize, const size_t *localWorkSize,
                          cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override { return CL_SUCCESS; }
 
diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp
index 593aa1faff..68775cb3c0 100644
--- a/opencl/test/unit_test/profiling/profiling_tests.cpp
+++ b/opencl/test/unit_test/profiling/profiling_tests.cpp
@@ -151,10 +151,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfolingWhenWa
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = &kernel;
 
     static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
-        clKernel,
+        &kernel,
         dimensions,
         globalOffsets,
         workItems,
@@ -198,10 +197,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingTests, GivenCommandQueueWithProfilingWhenNo
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = &kernel;
 
     static_cast<CommandQueueHw<FamilyType> *>(pCmdQ)->enqueueKernel(
-        clKernel,
+        &kernel,
         dimensions,
         globalOffsets,
         workItems,
@@ -687,9 +685,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWit
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = kernel->mockKernel;
 
-    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
 
     HardwareParse parse;
     auto &cmdList = parse.cmdList;
@@ -739,9 +736,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWit
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = kernel->mockKernel;
 
-    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
 
     HardwareParse parse;
     auto &cmdList = parse.cmdList;
@@ -792,8 +788,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueBlo
     uint32_t dimensions = 1;
     cl_event event;
     cl_event ue = new UserEvent();
-    cl_kernel clKernel = kernel->mockKernel;
-    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr,
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr,
                                                                           1,   // one user event to block queue
                                                                           &ue, // user event not signaled
                                                                           &event);
@@ -849,9 +844,8 @@ HWTEST_F(ProfilingWithPerfCountersTests, GivenCommandQueueWithProfilingPerfCount
     size_t globalOffsets[3] = {0, 0, 0};
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
-    cl_kernel clKernel = kernel->mockKernel;
 
-    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, nullptr);
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, nullptr);
 
     HardwareParse parse;
     auto &cmdList = parse.cmdList;
@@ -917,9 +911,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersTests, GivenCommandQueueWit
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = kernel->mockKernel;
 
-    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
+    static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get())->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
 
     auto pEvent = static_cast<MockEvent<Event> *>(event);
     EXPECT_EQ(pEvent->getHwTimeStampNode()->getGpuAddress(), timeStampGpuAddress);
@@ -972,10 +965,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQue
     uint32_t dimensions = 1;
     cl_event event;
     cl_event userEvent = clCreateUserEvent(context.get(), nullptr);
-    cl_kernel clKernel = kernel->mockKernel;
     CommandQueueHw<FamilyType> *cmdQHw = static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get());
 
-    cmdQHw->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 1, &userEvent, &event);
+    cmdQHw->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 1, &userEvent, &event);
     ASSERT_NE(nullptr, pCmdQ->virtualEvent);
     ASSERT_NE(nullptr, pCmdQ->virtualEvent->peekCommand());
     NEO::LinearStream *eventCommandStream = pCmdQ->virtualEvent->peekCommand()->getCommandStream();
@@ -1026,10 +1018,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingWithPerfCountersOnCCSTests, givenCommandQue
     size_t workItems[3] = {1, 1, 1};
     uint32_t dimensions = 1;
     cl_event event;
-    cl_kernel clKernel = kernel->mockKernel;
     CommandQueueHw<FamilyType> *cmdQHw = static_cast<CommandQueueHw<FamilyType> *>(pCmdQ.get());
 
-    cmdQHw->enqueueKernel(clKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
+    cmdQHw->enqueueKernel(kernel->mockKernel, dimensions, globalOffsets, workItems, nullptr, 0, nullptr, &event);
 
     HardwareParse parse;
     auto &cmdList = parse.cmdList;