[StreamExecutor] Add Doxygen main page

Reviewers: jlebar

Subscribers: jprice, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D24066

llvm-svn: 280277
This commit is contained in:
Jason Henline
2016-08-31 19:02:44 +00:00
parent bd850f4185
commit 5b363dd294
7 changed files with 243 additions and 5 deletions

View File

@@ -62,6 +62,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter")
add_subdirectory(lib)
add_subdirectory(examples)
if (STREAM_EXECUTOR_ENABLE_DOXYGEN)
find_package(Doxygen REQUIRED)

View File

@@ -398,7 +398,7 @@ LOOKUP_CACHE_SIZE = 0
# normally produced when WARNINGS is set to YES.
# The default value is: NO.
EXTRACT_ALL = NO
EXTRACT_ALL = YES
# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
# be included in the documentation.
@@ -811,7 +811,7 @@ EXCLUDE_SYMBOLS =
# that contain example code fragments that are included (see the \include
# command).
EXAMPLE_PATH =
EXAMPLE_PATH = @CMAKE_CURRENT_SOURCE_DIR@
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and

View File

@@ -0,0 +1,2 @@
# Build the documentation example program from Example.cpp.
add_executable(example Example.cpp)
# Link the example against the streamexecutor library target.
target_link_libraries(example streamexecutor)

View File

@@ -0,0 +1,163 @@
//===-- Example.cpp - Example code for documentation ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains example code demonstrating the usage of the
/// StreamExecutor API. Snippets of this file will be included as code examples
/// in documentation. Taking these examples from a real source file guarantees
/// that the examples will always compile.
///
//===----------------------------------------------------------------------===//
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <vector>
#include "streamexecutor/StreamExecutor.h"
/// [Example saxpy host helper functions]
// Example handler for streamexecutor::Expected return values.
//
// Returns the value held by E. If E holds an error instead, the error message
// is printed to stderr and the process exits with EXIT_FAILURE.
template <typename T> T getOrDie(streamexecutor::Expected<T> &&E) {
  // Success path first: hand the contained value back to the caller.
  if (E)
    return std::move(*E);

  // Failure path: consume the error, report it, and terminate.
  const auto Message = streamexecutor::consumeAndGetMessage(E.takeError());
  std::fprintf(stderr, "Error extracting an expected value: %s.\n",
               Message.c_str());
  std::exit(EXIT_FAILURE);
}
// Example handler for streamexecutor::Error return values.
//
// Does nothing when E represents success. Otherwise the error message is
// printed to stderr and the process exits with EXIT_FAILURE.
void check(streamexecutor::Error &&E) {
  // Nothing to report when the operation succeeded.
  if (!E)
    return;

  // Consume the error, report it, and terminate.
  const auto Message = streamexecutor::consumeAndGetMessage(std::move(E));
  std::fprintf(stderr, "Error encountered: %s.\n", Message.c_str());
  std::exit(EXIT_FAILURE);
}
/// [Example saxpy host helper functions]
/// [Example saxpy compiler-generated]
// Everything in this namespace stands in for code that the compiler (e.g.
// clang) would generate.
//
// The namespace name used here is only an example; the real name may depend on
// the compiler that generated the code.
namespace __compilergen {

// The PTX code produced by the device compiler for the saxpy kernel, stored as
// a string. The string contents are not shown here.
extern const char *SaxpyPTX;

// The streamexecutor::Kernel specialization matching the parameter types of
// the saxpy(float A, float *X, float *Y) kernel.
using SaxpyKernel =
    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
                           streamexecutor::GlobalDeviceMemory<float>>;

// A global loader spec instance that knows how to load the code held in the
// SaxpyPTX string. It is built once by an immediately-invoked lambda.
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
  LoaderSpec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
  return LoaderSpec;
}();

} // namespace __compilergen
/// [Example saxpy compiler-generated]
/// [Example saxpy host PTX]
// PTX source for the saxpy kernel (PTX version 4.3, target sm_20, 64-bit
// addresses). The entry point takes the scalar A and the addresses X and Y.
// Each thread uses %tid.x as its element index (elements are 4 bytes wide),
// loads X[tid] and Y[tid], computes A * X[tid] + Y[tid] with a fused
// multiply-add, and stores the result back into X[tid]. The kernel performs no
// bounds check on the thread index.
const char *__compilergen::SaxpyPTX = R"(
.version 4.3
.target sm_20
.address_size 64
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
.reg .f32 %AValue;
.reg .f32 %XValue;
.reg .f32 %YValue;
.reg .f32 %Result;
.reg .b64 %XBaseAddrGeneric;
.reg .b64 %YBaseAddrGeneric;
.reg .b64 %XBaseAddrGlobal;
.reg .b64 %YBaseAddrGlobal;
.reg .b64 %XAddr;
.reg .b64 %YAddr;
.reg .b64 %ThreadByteOffset;
.reg .b32 %TID;
ld.param.f32 %AValue, [A];
ld.param.u64 %XBaseAddrGeneric, [X];
ld.param.u64 %YBaseAddrGeneric, [Y];
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
mov.u32 %TID, %tid.x;
mul.wide.u32 %ThreadByteOffset, %TID, 4;
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
ld.global.f32 %XValue, [%XAddr];
ld.global.f32 %YValue, [%YAddr];
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
st.global.f32 [%XAddr], %Result;
ret;
}
)";
/// [Example saxpy host PTX]
int main() {
/// [Example saxpy host main]
namespace se = ::streamexecutor;
namespace cg = ::__compilergen;
// Create some host data.
float A = 42.0f;
std::vector<float> HostX = {0, 1, 2, 3};
std::vector<float> HostY = {4, 5, 6, 7};
size_t ArraySize = HostX.size();
// Get a device object.
se::Platform *Platform =
getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
if (Platform->getDeviceCount() == 0) {
return EXIT_FAILURE;
}
se::Device *Device = getOrDie(Platform->getDevice(0));
// Load the kernel onto the device.
std::unique_ptr<cg::SaxpyKernel> Kernel =
getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
// Allocate memory on the device.
se::GlobalDeviceMemory<float> X =
getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
se::GlobalDeviceMemory<float> Y =
getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
// Run operations on a stream.
std::unique_ptr<se::Stream> Stream = getOrDie(Device->createStream());
Stream->thenCopyH2D<float>(HostX, X)
.thenCopyH2D<float>(HostY, Y)
.thenLaunch(ArraySize, 1, *Kernel, A, X, Y)
.thenCopyD2H<float>(X, HostX);
// Wait for the stream to complete.
check(Stream->blockHostUntilDone());
// Process output data in HostX.
std::vector<float> ExpectedX = {4, 47, 90, 133};
for (size_t I = 0; I < ArraySize; ++I) {
assert(HostX[I] == ExpectedX[I]);
}
// Free device memory.
check(Device->freeDeviceMemory(X));
check(Device->freeDeviceMemory(Y));
/// [Example saxpy host main]
}

View File

@@ -55,7 +55,7 @@
/// function as follows:
/// \code
/// namespace ccn = compiler_cuda_namespace;
/// using KernelPtr = std::unique_ptr<cnn::SaxpyKernel>;
/// using KernelPtr = std::unique_ptr<ccn::SaxpyKernel>;
/// // Assumes Device is a pointer to the Device on which to launch the
/// // kernel.
/// //

View File

@@ -115,8 +115,9 @@ public:
/// Adds each item in SpecList to this object.
///
/// Does not take ownership of the PTXCode pointers in the SpecList elements.
CUDAPTXInMemorySpec(llvm::StringRef KernelName,
const llvm::ArrayRef<PTXSpec> SpecList);
CUDAPTXInMemorySpec(
llvm::StringRef KernelName,
const llvm::ArrayRef<CUDAPTXInMemorySpec::PTXSpec> SpecList);
/// Returns a pointer to the PTX code for the requested compute capability.
///

View File

@@ -0,0 +1,71 @@
//===-- StreamExecutor.h - Main include file for StreamExecutor -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \mainpage Getting Started
///
/// \b StreamExecutor is a wrapper around CUDA and OpenCL host-side programming
/// models (runtimes). This abstraction cleanly permits host code to target
/// either CUDA or OpenCL devices with identically-functioning data parallel
/// kernels. It manages the execution of concurrent work targeting the
/// accelerator, similar to a host-side Executor.
///
/// This version of StreamExecutor can be built either as a sub-project of the
/// LLVM project or as a standalone project depending on LLVM as an external
/// package.
///
/// Below is an example of the use of the StreamExecutor API:
///
/// \snippet examples/Example.cpp Example saxpy host main
///
/// In the example, a couple of handler functions are used to handle error
/// return values in the StreamExecutor API:
///
/// \snippet examples/Example.cpp Example saxpy host helper functions
///
/// These are just example handler functions. A real application will likely
/// want to define similar handlers of its own that log errors in an
/// application-specific way, convert errors to the application's own
/// error-handling framework, or try to recover from errors as appropriate.
///
/// The example also references some symbols from a compiler-generated
/// namespace:
///
/// \snippet examples/Example.cpp Example saxpy compiler-generated
///
/// Instead of depending on the compiler to generate this code, you can
/// technically write the code yourself, but this is not recommended because
/// writing it by hand is very error-prone. For example, the template
/// parameters for the Kernel specialization have to match the parameter types
/// for the device kernel, and the MultiKernelLoaderSpec has to be initialized
/// with valid device code for the kernel. Errors in this code will not show up
/// until runtime, and may only show up as garbage output rather than an
/// explicit error, which can be very hard to debug, so again, it is strongly
/// advised not to write this code yourself.
///
/// The example compiler-generated code uses a PTX string in the source code to
/// store the device code, but the device code can also be stored in other
/// formats such as CUDA cubin and CUDA fatbin. Furthermore, device code can be
/// stored for other platforms such as OpenCL, and StreamExecutor will pick the
/// right device code at runtime based on the user's platform selection. See
/// streamexecutor::MultiKernelLoaderSpec for details of how device code can be
/// stored for different platforms, but again, the code to set up the
/// MultiKernelLoaderSpec instance should be generated by the compiler if
/// possible, not by the user.
#ifndef STREAMEXECUTOR_STREAMEXECUTOR_H
#define STREAMEXECUTOR_STREAMEXECUTOR_H
#include "Device.h"
#include "Kernel.h"
#include "KernelSpec.h"
#include "Platform.h"
#include "PlatformManager.h"
#include "Stream.h"
#endif // STREAMEXECUTOR_STREAMEXECUTOR_H