Files
intel-graphics-compiler/IGC/Compiler/CISACodeGen/LSCCacheOptimizationPass.cpp
Paige, Alexander 420b632df9 Update IGC code format
Update IGC code format
2025-07-20 06:20:11 +02:00

522 lines
27 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2021-2022 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
/**
* @file LSCCacheOptimizationPass.cpp
* @author Konstantin Rebrov
*
* @brief This file implements the LSCCacheOptimizationPass
* This pass performs an optimization upon the Load Store Cache: it makes eligible store instructions go into the L1
* cache instead of the L3 cache, which is the default setting.
* It utilizes the L1 cache for store instructions that target one of these four memory regions:
* RTAsyncStack, SWStack, SWHotZone, RTSyncStack
*
* @details This pass examines all store instructions, and if a store instruction is identified as being eligible,
* and if the optimization is necessary and possible, the pass performs a read-modify-write (RMW) operation on the
* store instruction.
*
* The requirements for a store instruction to go into the L1 cache are:
* The address for the store needs to be 16-byte aligned.
* The size of the stored data needs to be a multiple of 16 bytes.
* .ca on load and store instructions should not be marked as uncached
* If these requirements are not satisfied then the store instruction is expanded to include the padding around the
* stored data. This padding is preliminarily loaded into a virtual register to save those values.
*
* green blocks represent dwords that we don't want to overwrite (they are left as is)
* these are the padding blocks surrounding the (red) original memory location of the store instruction
*
* red blocks represent a dword that we actually do want to overwrite, specifically the old value
* this is the memory location that we want to store the new value into
* it is the region of memory which is addressed to by the original pointer operand of the input store instruction
*
* blue blocks represent the new value that we want to overwrite on top of the red dword
*/
#include "LSCCacheOptimizationPass.h"
#include "AdaptorCommon/RayTracing/MemRegionAnalysis.h" // for getRegionOffset(), RTMemRegion
#include "getCacheOpts.h" // for getCacheOptsStoreInst()
#include "AdaptorCommon/RayTracing/RTStackFormat.h" // for RTStackFormat::LSC_WRITE_GRANULARITY
#include "IGCPassSupport.h"
#include "common/LLVMWarningsPush.hpp" // for suppressing LLVM warnings
#include <llvm/IR/LLVMContext.h> // for llvm::LLVMContext
#include <llvm/IR/IRBuilder.h> // for llvm::IRBuilder
#include <llvm/IR/Function.h> // for llvm::Function
#include <llvm/IR/BasicBlock.h> // for llvm::BasicBlock
#include <llvm/IR/Value.h> // for llvm::Value
#include <llvm/IR/Type.h> // for llvm::Type
#include <llvm/IR/DerivedTypes.h> // for llvm::VectorType
#include "llvmWrapper/IR/DerivedTypes.h"
#include <llvm/IR/Constants.h> // for llvm::ConstantInt, llvm::ConstantFP, llvm::ConstantVector, llvm::ConstantDataVector, llvm::UndefValue
#include <llvm/IR/Instruction.h> // for llvm::Instruction
#include <llvm/IR/Instructions.h> // for llvm::StoreInst, llvm::CallInst
#include <optional>
#include <llvm/ADT/APInt.h> // for llvm::APInt, llvm::ArrayRef
#include "common/LLVMWarningsPop.hpp" // for suppressing LLVM warnings
#include <climits> // for CHAR_BIT
#include <cstdint> // for std::uint64_t
#include <string> // for std::string
#include <vector> // for std::vector
using namespace IGC;
using RTStackFormat::LSC_WRITE_GRANULARITY;
using namespace llvm;
using namespace llvm::GenISAIntrinsic;
using std::string;
using std::uint64_t;
using std::vector;
/**
 * Builds a constant vector whose elements are all zero.
 *
 * @param builder An IRBuilder to create instructions. It is not used by the body; the parameter is kept so the
 * signature stays uniform with the other helpers in this file.
 *
 * @param data_type The element type of the zeroed vector, can be floating point or integer.
 *
 * @param num_elements The number of elements in the zeroed vector.
 *
 * @return An rvalue which is a zeroed vector with num_elements of the specified data_type.
 * It can be used as an argument for a constructed instruction, such as insertelement.
 */
inline Value *getZeroedVector(IRBuilder<> &builder, Type *data_type, uint64_t num_elements) {
  // Choose the zero scalar matching the element kind, then replicate it across every lane.
  auto *zero_element =
      data_type->isFloatingPointTy() ? ConstantFP::get(data_type, 0.0) : ConstantInt::get(data_type, 0ull);
  return ConstantDataVector::getSplat((unsigned)num_elements, zero_element);
}
/**
 * Pulls the elements in the half-open range [begin, end) out of a source rvalue and caches the resulting rvalues in a
 * container, so that later instructions can reuse them. The range makes it possible to extract only part of a vector.
 *
 * @param builder An IRBuilder to create instructions.
 *
 * @param source_rvalue The rvalue from which to extract the elements.
 * It can be an rvalue of scalar or vector type, a % virtual register as well as a literal value.
 * This function can extract elements from both green vectors and blue vectors.
 * A source that is not a vector is not decomposed: it is stored as-is into slot 0 of the container.
 *
 * @param extracted_elements Receives one Value* per extracted element. It must already hold exactly (end - begin)
 * slots; this is checked with IGC_ASSERT only.
 *
 * @param begin This index points to the first element in the range.
 *
 * @param end This index points to one after the last element in the range.
 */
static void extract_elements(IRBuilder<> &builder, Value *source_rvalue, vector<Value *> &extracted_elements,
                             uint64_t begin, uint64_t end) {
  IGC_ASSERT(begin <= end && extracted_elements.size() == (end - begin));
  // An empty range requires no work.
  if (begin == end)
    return;

  // A non-vector source is its own single element.
  if (!isa<VectorType>(source_rvalue->getType())) {
    extracted_elements[0] = source_rvalue;
    return;
  }

  // Vector source: emit one extractelement per requested lane and fill the output slots in order.
  auto destination = extracted_elements.begin();
  uint64_t extract_at = begin;
  while (extract_at < end) {
    *destination = builder.CreateExtractElement(source_rvalue, extract_at);
    ++destination;
    ++extract_at;
  }
}
/**
 * Convenience overload: extracts the first N elements of a source rvalue and caches the resulting rvalues in a
 * container, so that later instructions can reuse them.
 *
 * @param builder An IRBuilder to create instructions.
 *
 * @param source_rvalue The rvalue from which to extract the elements.
 * It can be an rvalue of scalar or vector type, a % virtual register as well as a literal value.
 * This function can extract elements from both green vectors and blue vectors.
 *
 * @param extracted_elements Receives one Value* per extracted element.
 *
 * @param num_elements The number of elements to extract.
 */
static void extract_elements(IRBuilder<> &builder, Value *source_rvalue, vector<Value *> &extracted_elements,
                             uint64_t num_elements) {
  // Forward to the range overload using the range [0, num_elements).
  const uint64_t first_index = 0;
  extract_elements(builder, source_rvalue, extracted_elements, first_index, num_elements);
}
/**
 * Inserts every rvalue held in a container into the given llvm vector, at the lanes designated by the half-open range
 * [begin, end). The range makes it possible to overwrite only part of the vector.
 *
 * @param builder An IRBuilder to create instructions.
 *
 * @param source_rvalue The rvalue vector which is the base for the insertions.
 * In other words this is the vector into which we insert the elements.
 *
 * @param extracted_elements A container of Value* objects to insert into the vector. It must hold exactly
 * (end - begin) entries; this is checked with IGC_ASSERT only.
 *
 * @param begin This index points to the first lane in the range.
 *
 * @param end This index points to one after the last lane in the range.
 *
 * @return Value* A virtual register of the vector with all the inserted elements. When the range is empty this is
 * source_rvalue itself.
 */
static Value *insert_elements(IRBuilder<> &builder, Value *source_rvalue, const vector<Value *> &extracted_elements,
                              uint64_t begin, uint64_t end) {
  IGC_ASSERT(begin <= end && extracted_elements.size() == (end - begin));

  Value *assembled = source_rvalue;
  auto next_element = extracted_elements.cbegin();
  uint64_t insert_at = begin;
  while (insert_at < end) {
    // Each insertelement instruction consumes the vector produced by the previous one, so only the most recent
    // vector needs to be remembered.
    assembled = builder.CreateInsertElement(assembled, *next_element, insert_at);
    ++next_element;
    ++insert_at;
  }
  return assembled;
}
/**
 * Pass entry point for a single function.
 * Caches the per-function state read by visitStoreInst(), walks the function with the instruction visitor, and
 * reports whether any instruction was rewritten.
 *
 * @param function The function to process.
 *
 * @return true if the IR of the function was modified, false otherwise.
 */
bool LSCCacheOptimizationPass::runOnFunction(Function &function) {
  // Remember the function being processed and its context, and start with a clean "modified" flag.
  current_context = &function.getContext();
  current_function = &function;
  changed_IR = false;
  m_CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
#if 0
  // Testing aid only.
  // Inserts a 48 wide store instruction into the Function in case it doesn't already have one.
  // Most shaders this pass was run on don't contain such an instruction, so one has to be inserted manually
  // in order to exercise the fourth case.
  create_48_wide_store(function);
#endif
  // Walking the function indirectly calls visitStoreInst() for the store instructions,
  // which performs the RMW operation on any eligible instruction.
  visit(function);
  return changed_IR;
}
void LSCCacheOptimizationPass::visitStoreInst(StoreInst &storeInst) {
auto cacheOpts = getCacheOptsStorePolicy(storeInst, *m_CGCtx);
// cacheOpts will be not None for the following memory regions:
// RTAsynctack, SWStack, SWHotZone, RTSynctack
if (!cacheOpts)
return;
auto store_cache_policy = cacheOpts.value();
if (store_cache_policy == LSC_L1UC_L3UC || store_cache_policy == LSC_L1UC_L3C_WB) {
// The cache policy is uncached for the L1 cache.
// unsuccessful early exit
return;
}
auto &DL = current_function->getParent()->getDataLayout();
uint64_t offset = 0;
uint64_t region_size = 0;
const Instruction *address = dyn_cast<Instruction>(storeInst.getPointerOperand());
auto Region = getRegionOffset(address, *m_CGCtx->getModuleMetaData(), &DL, &offset, &region_size);
if (!Region)
return;
IGC_ASSERT(offset < region_size); // the offset is within the bounds of the memory region
Value *value = storeInst.getValueOperand();
Type *type = value->getType();
uint64_t data_size = 0; // measured in bytes
uint64_t element_size = 0; // measured in bytes
Type *element_type;
// If the stored data is a vector variable.
if (IGCLLVM::FixedVectorType *vectorType = dyn_cast<IGCLLVM::FixedVectorType>(type)) {
element_type = vectorType->getElementType();
uint64_t num_elements = vectorType->getNumElements();
element_size = DL.getTypeSizeInBits(element_type) / CHAR_BIT;
data_size = num_elements * element_size;
}
// If the stored data is a scalar variable.
else {
data_size = element_size = DL.getTypeSizeInBits(type) / CHAR_BIT;
element_type = type;
}
// If the offset is a multiple of 16 bytes (16 byte aligned store address)
// && the size of the stored data is a multiple of 16 bytes
if ((offset % LSC_WRITE_GRANULARITY == 0) && (data_size % LSC_WRITE_GRANULARITY == 0)) {
// successful early exit
return;
}
// Convert the address from the start of the memory region into
// the offset from the start of the previous nearest 16 byte boundary.
offset %= LSC_WRITE_GRANULARITY;
// Create an IRBuilder with an insertion point set to the given intrinsic_call instruction.
// IRBuilder automatically inserts instructions when it creates them,
// and the inserted instructions (dynamically allocated) are deleted when the function is destroyed.
IRBuilder<> builder(&storeInst);
auto *initial_pointer = storeInst.getPointerOperand();
unsigned addrspace = storeInst.getPointerAddressSpace();
uint64_t right_boundary; // measured in bytes
// If the stored data straddles across three 16 bytes size chunks,
// we break it up into two stores: first 32 bytes and last 16 bytes
/* This is the fourth case. */
if (offset + data_size > 32) {
right_boundary = 48;
// get the left pad
// get the right pad
// get the blue blocks
// construct and store the first 32 bytes vector
// construct and store the last 16 bytes vector
uint64_t num_blue_blocks = data_size / element_size;
uint64_t num_green_blocks_left = offset / element_size;
uint64_t num_blue_blocks_left = (32 - offset) / element_size;
uint64_t num_total_blocks_left = num_green_blocks_left + num_blue_blocks_left;
uint64_t num_green_blocks_right = (right_boundary - (offset + data_size)) / element_size;
uint64_t num_blue_blocks_right = (offset + data_size - 32) / element_size;
uint64_t num_total_blocks_right = num_green_blocks_right + num_blue_blocks_right;
/* First do the GGRR 32 wide store */
// %0 = bitcast <>* %baseAddress to i8*
auto *bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
// %1 = getelementptr i8, i8* %0, i64 -offset
auto *left_green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
// %2 = bitcast i8* %1 to <num_green_blocks_left x iN>*
auto *left_green_vector_pointer = builder.CreateBitCast(
left_green_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_left)->getPointerTo(addrspace));
// %3 = load <num_green_blocks_left x iN>, <num_green_blocks_left x iN>* %2
auto *left_green_vector_rvalue = builder.CreateLoad(
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_left), left_green_vector_pointer);
// The static_cast is needed to remove this warning or error, which appears only in QuickBuild Windows
// https://ubit-gfx.intel.com/build/9814699/step_status
// error C2220: the following warning is treated as an error
// warning C4244: 'argument': conversion from 'uint64_t' to 'const unsigned int', possible loss of data
vector<Value *> green_elements_left(static_cast<const unsigned int>(num_green_blocks_left));
extract_elements(builder, left_green_vector_rvalue, green_elements_left, num_green_blocks_left);
vector<Value *> blue_elements_left(static_cast<const unsigned int>(num_blue_blocks_left));
extract_elements(builder, value, blue_elements_left, num_blue_blocks_left);
auto *initial_vector_left =
UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_left));
auto *intermediate_vector_left =
insert_elements(builder, initial_vector_left, green_elements_left, 0, num_green_blocks_left);
auto *final_vector_left = insert_elements(builder, intermediate_vector_left, blue_elements_left,
num_green_blocks_left, num_total_blocks_left);
auto *final_vector_pointer_left = builder.CreateBitCast(
left_green_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_left)->getPointerTo(addrspace));
builder.CreateAlignedStore(final_vector_left, final_vector_pointer_left, llvm::Align(LSC_WRITE_GRANULARITY));
auto *right_part_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(32 - offset));
vector<Value *> blue_elements_right(static_cast<const unsigned int>(num_blue_blocks_right));
extract_elements(builder, value, blue_elements_right, num_blue_blocks_left, num_blue_blocks);
vector<Value *> green_elements_right(static_cast<const unsigned int>(num_green_blocks_right));
auto *right_green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(data_size));
auto *right_green_vector_pointer = builder.CreateBitCast(
right_green_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_right)->getPointerTo(addrspace));
auto *right_green_vector_rvalue = builder.CreateLoad(
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks_right), right_green_vector_pointer);
extract_elements(builder, right_green_vector_rvalue, green_elements_right, num_green_blocks_right);
auto *initial_vector_right =
UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_right));
auto *intermediate_vector_right =
insert_elements(builder, initial_vector_right, blue_elements_right, 0, num_blue_blocks_right);
auto *final_vector_right = insert_elements(builder, intermediate_vector_right, green_elements_right,
num_blue_blocks_right, num_total_blocks_right);
auto *final_vector_pointer_right = builder.CreateBitCast(
right_part_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks_right)->getPointerTo(addrspace));
builder.CreateAlignedStore(final_vector_right, final_vector_pointer_right, llvm::Align(LSC_WRITE_GRANULARITY));
}
// If the data straddles across one or two 16 byte size chunks,
// we do only one load or store, but we can optimize it to
// load only the green blocks if they are contiguous
else {
right_boundary = (offset + data_size > 16) ? 32 : 16;
// the red blocks are on the left side, contiguous green blocks on the right side
// load only the contiguous green blocks
/* This is the first case. */
if (offset == 0) {
// get the right pad
// get the blue blocks
// construct and store the right_boundary size vector
uint64_t num_green_blocks = (right_boundary - data_size) / element_size;
uint64_t num_blue_blocks = data_size / element_size;
// %0 = bitcast <>* %baseAddress to i8*
auto *bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
// %1 = getelementptr i8, i8* %0, i64 data_size
auto *green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(data_size));
// %2 = bitcast i8* %1 to <num_green_blocks x iN>*
auto *green_vector_pointer = builder.CreateBitCast(
green_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks)->getPointerTo(addrspace));
// %3 = load <num_green_blocks x iN>, <num_green_blocks x iN>* %2
auto *green_vector_rvalue = builder.CreateLoad(
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks), green_vector_pointer);
vector<Value *> green_elements(static_cast<const unsigned int>(num_green_blocks));
extract_elements(builder, green_vector_rvalue, green_elements, num_green_blocks);
vector<Value *> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
extract_elements(builder, value, blue_elements, num_blue_blocks);
uint64_t num_elements = num_green_blocks + num_blue_blocks;
auto *initial_vector = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements));
auto *intermediate_vector = insert_elements(builder, initial_vector, blue_elements, 0, num_blue_blocks);
auto *final_vector = insert_elements(builder, intermediate_vector, green_elements, num_blue_blocks, num_elements);
auto *final_vector_pointer = builder.CreateBitCast(
initial_pointer,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements)->getPointerTo(addrspace));
builder.CreateAlignedStore(final_vector, final_vector_pointer, llvm::Align(LSC_WRITE_GRANULARITY));
}
// the red blocks are on the right side, contiguous green blocks on the left side
// load only the contiguous green blocks
/* This is the second case. */
else if (offset + data_size == right_boundary) {
// get the left pad
// get the blue blocks
// construct and store the right_boundary size vector
uint64_t num_green_blocks = offset / element_size;
uint64_t num_blue_blocks = data_size / element_size;
// %0 = bitcast <>* %baseAddress to i8*
auto *bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
// %1 = getelementptr i8, i8* %0, i64 -offset
auto *green_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
// %2 = bitcast i8* %1 to <num_green_blocks x iN>*
auto *green_vector_pointer = builder.CreateBitCast(
green_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks)->getPointerTo(addrspace));
// %3 = load <num_green_blocks x iN>, <num_green_blocks x iN>* %2
auto *green_vector_rvalue = builder.CreateLoad(
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_green_blocks), green_vector_pointer);
vector<Value *> green_elements(static_cast<const unsigned int>(num_green_blocks));
extract_elements(builder, green_vector_rvalue, green_elements, num_green_blocks);
vector<Value *> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
extract_elements(builder, value, blue_elements, num_blue_blocks);
uint64_t num_elements = num_green_blocks + num_blue_blocks;
auto *initial_vector = UndefValue::get(IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements));
auto *intermediate_vector = insert_elements(builder, initial_vector, green_elements, 0, num_green_blocks);
auto *final_vector = insert_elements(builder, intermediate_vector, blue_elements, num_green_blocks, num_elements);
auto *final_vector_pointer = builder.CreateBitCast(
green_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_elements)->getPointerTo(addrspace));
builder.CreateAlignedStore(final_vector, final_vector_pointer, llvm::Align(LSC_WRITE_GRANULARITY));
}
// the red blocks in the middle, around them discontinuous green blocks
// load the green blocks on the left, red blocks in the middle, and green blocks on the right
/* This is the third case. */
else {
// get the whole thing
// get the blue blocks
// construct and store the right_boundary size vector, overwriting the red blocks in the middle
uint64_t num_total_blocks = right_boundary / element_size;
// %0 = bitcast <>* %baseAddress to i8*
auto *bitcast1 = builder.CreateBitCast(initial_pointer, builder.getInt8PtrTy(addrspace));
// %1 = getelementptr i8, i8* %0, i64 -offset
auto *starting_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(-1 * offset));
// %2 = bitcast i8* %1 to <num_total_blocks x iN>*
auto *full_vector_pointer = builder.CreateBitCast(
starting_address,
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks)->getPointerTo(addrspace));
// %3 = load <num_total_blocks x iN>, <num_total_blocks x iN>* %2
auto *full_vector_rvalue = builder.CreateLoad(
IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_total_blocks), full_vector_pointer);
uint64_t left_padding_size_blocks = offset / element_size;
uint64_t num_blue_blocks = data_size / element_size;
vector<Value *> blue_elements(static_cast<const unsigned int>(num_blue_blocks));
extract_elements(builder, value, blue_elements, num_blue_blocks);
auto *final_vector = insert_elements(builder, full_vector_rvalue, blue_elements, left_padding_size_blocks,
left_padding_size_blocks + num_blue_blocks);
builder.CreateAlignedStore(final_vector, full_vector_pointer, llvm::Align(LSC_WRITE_GRANULARITY));
}
}
storeInst.eraseFromParent();
changed_IR = true;
}
#if 0
/**
 * Testing aid (compiled out): finds the first call to GenISA_AsyncStackPtr or GenISA_AsyncStackPtrPlaceHolder in the
 * function and emits, right after it, a store of eight zeroed dwords at byte offset 8 from the returned pointer.
 * Such a store spans three 16 byte chunks and therefore exercises the fourth case of visitStoreInst().
 *
 * @param function The function to search and modify.
 *
 * @return true if a store was inserted (changed_IR is set as well), false if no matching call was found.
 */
bool LSCCacheOptimizationPass::create_48_wide_store(Function& function)
{
  for (BasicBlock& bb : function) {
    for (Instruction& inst : bb) {
      // Loop through all instructions in a function, search for a call to a GenIntrinsicInst::GenISA_AsyncStackPtr
      // which would look something like this:
      // %perLaneAsyncStackPointer24 = call noalias align 128 dereferenceable(256) %"struct.RTStackFormat::RTStack" addrspace(1)* @"llvm.genx.GenISA.AsyncStackPtr.p1struct.RTStackFormat::RTStack.i64"(i64 %19)
      auto* intrinsic_call = dyn_cast<GenIntrinsicInst>(&inst);
      if (!intrinsic_call)
        continue;
      if ((intrinsic_call->getIntrinsicID() != llvm::GenISAIntrinsic::GenISA_AsyncStackPtr) &&
          (intrinsic_call->getIntrinsicID() != llvm::GenISAIntrinsic::GenISA_AsyncStackPtrPlaceHolder))
        continue;

      // The new instructions use the value returned by the call, so they have to be placed AFTER the call:
      // position the IRBuilder on the instruction following it (a builder constructed on an instruction inserts
      // before that instruction).
      IRBuilder<> builder(intrinsic_call->getNextNode());
      Type* return_type = intrinsic_call->getFunctionType()->getReturnType();
      unsigned addrspace = return_type->getPointerAddressSpace();
      uint64_t offset = 8; // in bytes
      uint64_t num_red_blocks = 8; // in dwords
      Type* element_type = builder.getInt32Ty();
      // %0 = bitcast <>* %baseAddress to i8*
      auto* bitcast1 = builder.CreateBitCast(intrinsic_call, builder.getInt8PtrTy(addrspace));
      // %1 = getelementptr i8, i8* %0, i64 offset
      auto* red_address = builder.CreateGEP(builder.getInt8Ty(), bitcast1, builder.getInt64(offset));
      // %2 = bitcast i8* %1 to <num_red_blocks x iN>*
      auto* red_vector_pointer = builder.CreateBitCast(red_address, IGCLLVM::FixedVectorType::get(element_type, (unsigned)num_red_blocks)->getPointerTo(addrspace));
      auto* red_vector_rvalue = getZeroedVector(builder, element_type, num_red_blocks);
      builder.CreateStore(red_vector_rvalue, red_vector_pointer);
      changed_IR = true;
      // Only one such store is needed; stop right away (this also avoids iterating over the new instructions).
      return true;
    }
  }
  return false;
}
#endif
// Definition of the pass's static ID member.
char LSCCacheOptimizationPass::ID = 0;
// Arguments handed to the IGC_INITIALIZE_PASS_* registration macros below:
// the pass flag string, a human readable description, and the CFG-only / analysis flags (both false).
#define PASS_FLAG "LSC-Cache-Optimization-pass"
#define PASS_DESCRIPTION "Load/Store cache optimization pass"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
// Pass registration. CodeGenContextWrapper is listed as a dependency; runOnFunction() obtains the code generation
// context from it through getAnalysis<CodeGenContextWrapper>().
IGC_INITIALIZE_PASS_BEGIN(LSCCacheOptimizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(LSCCacheOptimizationPass, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)