mirror of
https://github.com/intel/llvm.git
synced 2026-01-18 07:57:36 +08:00
[StreamExecutor] Add Doxygen main page
Reviewers: jlebar Subscribers: jprice, parallel_libs-commits Differential Revision: https://reviews.llvm.org/D24066 llvm-svn: 280277
This commit is contained in:
@@ -62,6 +62,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter")
|
||||
|
||||
add_subdirectory(lib)
|
||||
add_subdirectory(examples)
|
||||
|
||||
if (STREAM_EXECUTOR_ENABLE_DOXYGEN)
|
||||
find_package(Doxygen REQUIRED)
|
||||
|
||||
@@ -398,7 +398,7 @@ LOOKUP_CACHE_SIZE = 0
|
||||
# normally produced when WARNINGS is set to YES.
|
||||
# The default value is: NO.
|
||||
|
||||
EXTRACT_ALL = NO
|
||||
EXTRACT_ALL = YES
|
||||
|
||||
# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
|
||||
# be included in the documentation.
|
||||
@@ -811,7 +811,7 @@ EXCLUDE_SYMBOLS =
|
||||
# that contain example code fragments that are included (see the \include
|
||||
# command).
|
||||
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@
|
||||
|
||||
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
|
||||
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
|
||||
|
||||
2
parallel-libs/streamexecutor/examples/CMakeLists.txt
Normal file
2
parallel-libs/streamexecutor/examples/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
# Build Example.cpp as a real executable so the snippets taken from it for the
# documentation are known to compile.
add_executable(example Example.cpp)
|
||||
# The example is written against the StreamExecutor API, so link it with the
# streamexecutor library target.
target_link_libraries(example streamexecutor)
|
||||
163
parallel-libs/streamexecutor/examples/Example.cpp
Normal file
163
parallel-libs/streamexecutor/examples/Example.cpp
Normal file
@@ -0,0 +1,163 @@
|
||||
//===-- Example.cpp - Example code for documentation ----------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
///
|
||||
/// \file
|
||||
/// This file contains example code demonstrating the usage of the
|
||||
/// StreamExecutor API. Snippets of this file will be included as code examples
|
||||
/// in documentation. Taking these examples from a real source file guarantees
|
||||
/// that the examples will always compile.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "streamexecutor/StreamExecutor.h"
|
||||
|
||||
/// [Example saxpy host helper functions]
|
||||
// Example handler for streamexecutor::Expected return values.
|
||||
template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) {
|
||||
if (!E) {
|
||||
std::fprintf(stderr, "Error extracting an expected value: %s.\n",
|
||||
streamexecutor::consumeAndGetMessage(E.takeError()).c_str());
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
return std::move(*E);
|
||||
}
|
||||
|
||||
// Example handler for streamexecutor::Error return values.
|
||||
void check(streamexecutor::Error &&E) {
|
||||
if (E) {
|
||||
std::fprintf(stderr, "Error encountered: %s.\n",
|
||||
streamexecutor::consumeAndGetMessage(std::move(E)).c_str());
|
||||
std::exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
/// [Example saxpy host helper functions]
|
||||
|
||||
/// [Example saxpy compiler-generated]
|
||||
// Code in this namespace is generated by the compiler (e.g. clang).
|
||||
//
|
||||
// The name of this namespace may depend on the compiler that generated it, so
|
||||
// this is just an example name.
|
||||
namespace __compilergen {
|
||||
|
||||
// Specialization of the streamexecutor::Kernel template class for the parameter
|
||||
// types of the saxpy(float A, float *X, float *Y) kernel.
|
||||
using SaxpyKernel =
|
||||
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
|
||||
streamexecutor::GlobalDeviceMemory<float>>;
|
||||
|
||||
// A string containing the PTX code generated by the device compiler for the
|
||||
// saxpy kernel. String contents not shown here.
|
||||
extern const char *SaxpyPTX;
|
||||
|
||||
// A global instance of a loader spec that knows how to load the code in the
|
||||
// SaxpyPTX string.
|
||||
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
|
||||
streamexecutor::MultiKernelLoaderSpec Spec;
|
||||
Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
|
||||
return Spec;
|
||||
}();
|
||||
|
||||
} // namespace __compilergen
|
||||
/// [Example saxpy compiler-generated]
|
||||
|
||||
/// [Example saxpy host PTX]
|
||||
// Definition of the PTX source for the saxpy kernel declared in __compilergen.
//
// The kernel takes a float A and two 64-bit addresses X and Y. Each thread uses
// %tid.x as its element index (scaled by 4 bytes per float), loads the
// corresponding X and Y elements from global memory, computes A * X + Y with a
// fused multiply-add, and stores the result back to the X element. Note that
// the output overwrites X, not Y.
const char *__compilergen::SaxpyPTX = R"(
|
||||
.version 4.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
|
||||
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
|
||||
.reg .f32 %AValue;
|
||||
.reg .f32 %XValue;
|
||||
.reg .f32 %YValue;
|
||||
.reg .f32 %Result;
|
||||
|
||||
.reg .b64 %XBaseAddrGeneric;
|
||||
.reg .b64 %YBaseAddrGeneric;
|
||||
.reg .b64 %XBaseAddrGlobal;
|
||||
.reg .b64 %YBaseAddrGlobal;
|
||||
.reg .b64 %XAddr;
|
||||
.reg .b64 %YAddr;
|
||||
.reg .b64 %ThreadByteOffset;
|
||||
|
||||
.reg .b32 %TID;
|
||||
|
||||
ld.param.f32 %AValue, [A];
|
||||
ld.param.u64 %XBaseAddrGeneric, [X];
|
||||
ld.param.u64 %YBaseAddrGeneric, [Y];
|
||||
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
|
||||
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
|
||||
mov.u32 %TID, %tid.x;
|
||||
mul.wide.u32 %ThreadByteOffset, %TID, 4;
|
||||
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
|
||||
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
|
||||
ld.global.f32 %XValue, [%XAddr];
|
||||
ld.global.f32 %YValue, [%YAddr];
|
||||
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
|
||||
st.global.f32 [%XAddr], %Result;
|
||||
ret;
|
||||
}
|
||||
)";
|
||||
/// [Example saxpy host PTX]
|
||||
|
||||
int main() {
|
||||
/// [Example saxpy host main]
|
||||
namespace se = ::streamexecutor;
|
||||
namespace cg = ::__compilergen;
|
||||
|
||||
// Create some host data.
|
||||
float A = 42.0f;
|
||||
std::vector<float> HostX = {0, 1, 2, 3};
|
||||
std::vector<float> HostY = {4, 5, 6, 7};
|
||||
size_t ArraySize = HostX.size();
|
||||
|
||||
// Get a device object.
|
||||
se::Platform *Platform =
|
||||
getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
|
||||
if (Platform->getDeviceCount() == 0) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
se::Device *Device = getOrDie(Platform->getDevice(0));
|
||||
|
||||
// Load the kernel onto the device.
|
||||
std::unique_ptr<cg::SaxpyKernel> Kernel =
|
||||
getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
|
||||
|
||||
// Allocate memory on the device.
|
||||
se::GlobalDeviceMemory<float> X =
|
||||
getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
|
||||
se::GlobalDeviceMemory<float> Y =
|
||||
getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
|
||||
|
||||
// Run operations on a stream.
|
||||
std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream());
|
||||
Stream->thenCopyH2D<float>(HostX, X)
|
||||
.thenCopyH2D<float>(HostY, Y)
|
||||
.thenLaunch(ArraySize, 1, *Kernel, A, X, Y)
|
||||
.thenCopyD2H<float>(X, HostX);
|
||||
// Wait for the stream to complete.
|
||||
check(Stream->blockHostUntilDone());
|
||||
|
||||
// Process output data in HostX.
|
||||
std::vector<float> ExpectedX = {4, 47, 90, 133};
|
||||
for (size_t I = 0; I < ArraySize; ++I) {
|
||||
assert(HostX[I] == ExpectedX[I]);
|
||||
}
|
||||
|
||||
// Free device memory.
|
||||
check(Device->freeDeviceMemory(X));
|
||||
check(Device->freeDeviceMemory(Y));
|
||||
/// [Example saxpy host main]
|
||||
}
|
||||
@@ -55,7 +55,7 @@
|
||||
/// function as follows:
|
||||
/// \code
|
||||
/// namespace ccn = compiler_cuda_namespace;
|
||||
/// using KernelPtr = std::unique_ptr<cnn::SaxpyKernel>;
|
||||
/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>;
|
||||
/// // Assumes Device is a pointer to the Device on which to launch the
|
||||
/// // kernel.
|
||||
/// //
|
||||
|
||||
@@ -115,8 +115,9 @@ public:
|
||||
/// Adds each item in SpecList to this object.
|
||||
///
|
||||
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
|
||||
CUDAPTXInMemorySpec(llvm::StringRef KernelName,
|
||||
const llvm::ArrayRef<PTXSpec> SpecList);
|
||||
CUDAPTXInMemorySpec(
|
||||
llvm::StringRef KernelName,
|
||||
const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);
|
||||
|
||||
/// Returns a pointer to the PTX code for the requested compute capability.
|
||||
///
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// \mainpage Getting Started
|
||||
///
|
||||
/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming
|
||||
/// models (runtimes). This abstraction cleanly permits host code to target
|
||||
/// either CUDA or OpenCL devices with identically-functioning data parallel
|
||||
/// kernels. It manages the execution of concurrent work targeting the
|
||||
/// accelerator, similar to a host-side Executor.
|
||||
///
|
||||
/// This version of StreamExecutor can be built either as a sub-project of the
|
||||
/// LLVM project or as a standalone project depending on LLVM as an external
|
||||
/// package.
|
||||
///
|
||||
/// Below is an example of the use of the StreamExecutor API:
|
||||
///
|
||||
/// \snippet examples/Example.cpp Example saxpy host main
|
||||
///
|
||||
/// In the example, a couple of handler functions are used to handle error
|
||||
/// return values in the StreamExecutor API:
|
||||
///
|
||||
/// \snippet examples/Example.cpp Example saxpy host helper functions
|
||||
///
|
||||
/// These are just example handler functions. A real application will likely
|
||||
/// want to define similar handlers of its own that log errors in an
|
||||
/// application-specific way, convert errors to the application's own
|
||||
/// error-handling framework, or try to recover from errors as appropriate.
|
||||
///
|
||||
/// The example also references some symbols from a compiler-generated
|
||||
/// namespace:
|
||||
///
|
||||
/// \snippet examples/Example.cpp Example saxpy compiler-generated
|
||||
///
|
||||
/// Instead of depending on the compiler to generate this code, you can
|
||||
/// technically write the code yourself, but this is not recommended because the
|
||||
/// code is very easy to get wrong. For example, the template parameters for the
|
||||
/// Kernel specialization have to match the parameter types for the device
|
||||
/// kernel, and the MultiKernelLoaderSpec has to be initialized with valid
|
||||
/// device code for the kernel. Errors in this code will not show up until
|
||||
/// runtime, and may only show up as garbage output rather than an explicit
|
||||
/// error, which can be very hard to debug, so again, it is strongly advised not
|
||||
/// to write this code yourself.
|
||||
///
|
||||
/// The example compiler-generated code uses a PTX string in the source code to
|
||||
/// store the device code, but the device code can also be stored in other
|
||||
/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be
|
||||
/// stored for other platforms such as OpenCL, and StreamExecutor will pick the
|
||||
/// right device code at runtime based on the user's platform selection. See
|
||||
/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be
|
||||
/// stored for different platforms, but again, the code to set up the
|
||||
/// MultiKernelLoaderSpec instance should be generated by the compiler if
|
||||
/// possible, not by the user.
|
||||
|
||||
#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H
|
||||
#define STREAMEXECUTOR_STREAMEXECUTOR_H
|
||||
|
||||
#include "Device.h"
|
||||
#include "Kernel.h"
|
||||
#include "KernelSpec.h"
|
||||
#include "Platform.h"
|
||||
#include "PlatformManager.h"
|
||||
#include "Stream.h"
|
||||
|
||||
#endif // STREAMEXECUTOR_STREAMEXECUTOR_H
|
||||
Reference in New Issue
Block a user