src/shader_recompiler/backend/spirv/spirv_emit_context.cpp (46 changes: 22 additions & 24 deletions)

@@ -1105,59 +1105,57 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_view
 }
 
 Id EmitContext::DefineGetBdaPointer() {
-    const auto caching_pagebits{
+    const Id caching_pagebits{
         Constant(U64, static_cast<u64>(VideoCore::BufferCache::CACHING_PAGEBITS))};
-    const auto caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
+    const Id caching_pagemask{Constant(U64, VideoCore::BufferCache::CACHING_PAGESIZE - 1)};
 
-    const auto func_type{TypeFunction(U64, U64)};
-    const auto func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
-    const auto address{OpFunctionParameter(U64)};
+    const Id func_type{TypeFunction(U64, U64)};
+    const Id func{OpFunction(U64, spv::FunctionControlMask::MaskNone, func_type)};
+    const Id address{OpFunctionParameter(U64)};
     Name(func, "get_bda_pointer");
     AddLabel();
 
-    const auto fault_label{OpLabel()};
-    const auto available_label{OpLabel()};
-    const auto merge_label{OpLabel()};
+    const Id fault_label{OpLabel()};
+    const Id available_label{OpLabel()};
+    const Id merge_label{OpLabel()};
 
     // Get page BDA
-    const auto page{OpShiftRightLogical(U64, address, caching_pagebits)};
-    const auto page32{OpUConvert(U32[1], page)};
+    const Id page{OpShiftRightLogical(U64, address, caching_pagebits)};
+    const Id page32{OpUConvert(U32[1], page)};
     const auto& bda_buffer{buffers[bda_pagetable_index]};
     const auto [bda_buffer_id, bda_pointer_type] = bda_buffer.Alias(PointerType::U64);
-    const auto bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
-    const auto bda{OpLoad(U64, bda_ptr)};
+    const Id bda_ptr{OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32)};
+    const Id bda{OpLoad(U64, bda_ptr)};
 
     // Check if page is GPU cached
-    const auto is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
+    const Id is_fault{OpIEqual(U1[1], bda, u64_zero_value)};
     OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
     OpBranchConditional(is_fault, fault_label, available_label);
 
     // First time access, mark as fault
     AddLabel(fault_label);
     const auto& fault_buffer{buffers[fault_buffer_index]};
     const auto [fault_buffer_id, fault_pointer_type] = fault_buffer.Alias(PointerType::U32);
-    const auto page_div32{OpShiftRightLogical(U32[1], page32, ConstU32(5U))};
-    const auto page_mod32{OpBitwiseAnd(U32[1], page32, ConstU32(31U))};
-    const auto page_mask{OpShiftLeftLogical(U32[1], u32_one_value, page_mod32)};
-    const auto fault_ptr{
+    const Id page_div32{OpShiftRightLogical(U32[1], page32, ConstU32(5U))};
+    const Id page_mod32{OpBitwiseAnd(U32[1], page32, ConstU32(31U))};
+    const Id page_mask{OpShiftLeftLogical(U32[1], u32_one_value, page_mod32)};
+    const Id fault_ptr{
         OpAccessChain(fault_pointer_type, fault_buffer_id, u32_zero_value, page_div32)};
-    const auto fault_value{OpLoad(U32[1], fault_ptr)};
-    const auto fault_value_masked{OpBitwiseOr(U32[1], fault_value, page_mask)};
-    OpStore(fault_ptr, fault_value_masked);
+    OpAtomicOr(U32[1], fault_ptr, ConstU32(u32(spv::Scope::Device)), u32_zero_value, page_mask);
 
     // Return null pointer
-    const auto fallback_result{u64_zero_value};
+    const Id fallback_result{u64_zero_value};
    OpBranch(merge_label);
 
     // Value is available, compute address
     AddLabel(available_label);
-    const auto offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
-    const auto addr{OpIAdd(U64, bda, offset_in_bda)};
+    const Id offset_in_bda{OpBitwiseAnd(U64, address, caching_pagemask)};
+    const Id addr{OpIAdd(U64, bda, offset_in_bda)};
     OpBranch(merge_label);
 
     // Merge
     AddLabel(merge_label);
-    const auto result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
+    const Id result{OpPhi(U64, addr, available_label, fallback_result, fault_label)};
     OpReturnValue(result);
     OpFunctionEnd();
     return func;
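Beyond the auto-to-Id cleanup, the substantive change in this hunk is replacing the OpLoad/OpBitwiseOr/OpStore sequence with OpAtomicOr. Two shader invocations can fault on pages that share the same 32-page bitmap word, and a plain read-modify-write lets one invocation overwrite the other's bit. A minimal host-side sketch of the same marking logic (the page-bits value and the function name are illustrative, not taken from the codebase):

#include <atomic>
#include <cstdint>

// Illustrative stand-in for VideoCore::BufferCache::CACHING_PAGEBITS; the
// real value lives in the buffer cache, 12 is only an assumption here.
constexpr uint64_t kCachingPageBits = 12;

// Mark the page containing `address` in a 1-bit-per-page fault bitmap.
// std::atomic::fetch_or plays the role of OpAtomicOr: with a plain
// load/or/store, two threads updating the same 32-page word could both read
// the old value, and one of the fault bits would be lost.
void MarkFault(std::atomic<uint32_t>* fault_words, uint64_t address) {
    const uint32_t page = static_cast<uint32_t>(address >> kCachingPageBits);
    const uint32_t word_index = page / 32;  // page_div32 in the emitted SPIR-V
    const uint32_t bit_in_word = page % 32; // page_mod32 in the emitted SPIR-V
    fault_words[word_index].fetch_or(1u << bit_in_word);
}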
src/video_core/buffer_cache/buffer_cache.cpp (81 changes: 17 additions & 64 deletions)

@@ -657,10 +657,6 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     }
     WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
                     bda_addrs.size() * sizeof(vk::DeviceAddress));
-    const size_t size_bytes = new_buffer.SizeBytes();
-    const auto cmdbuf = scheduler.CommandBuffer();
-    scheduler.EndRendering();
-    cmdbuf.fillBuffer(new_buffer.buffer, 0, size_bytes, 0);
     for (const BufferId overlap_id : overlap.ids) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
@@ -670,8 +666,10 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
 
 void BufferCache::ProcessFaultBuffer() {
     // Run fault processing shader
-    const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
-    vk::BufferMemoryBarrier2 fault_buffer_barrier{
+    static constexpr size_t StagingSize = MaxPageFaults * sizeof(u64);
+    const auto [mapped, offset] = download_buffer.Map(StagingSize);
+    std::memset(mapped, 0, StagingSize);
+    const vk::BufferMemoryBarrier2 fault_buffer_pre_barrier{
         .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
         .srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
         .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
@@ -680,27 +678,17 @@ void BufferCache::ProcessFaultBuffer() {
         .offset = 0,
         .size = FAULT_BUFFER_SIZE,
     };
-    vk::BufferMemoryBarrier2 download_barrier{
-        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
-        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
-        .dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite,
-        .buffer = download_buffer.Handle(),
-        .offset = offset,
-        .size = MaxPageFaults * sizeof(u64),
-    };
-    std::array<vk::BufferMemoryBarrier2, 2> barriers{fault_buffer_barrier, download_barrier};
-    vk::DescriptorBufferInfo fault_buffer_info{
+    const vk::DescriptorBufferInfo fault_buffer_info{
         .buffer = fault_buffer.Handle(),
         .offset = 0,
         .range = FAULT_BUFFER_SIZE,
     };
-    vk::DescriptorBufferInfo download_info{
+    const vk::DescriptorBufferInfo download_info{
         .buffer = download_buffer.Handle(),
         .offset = offset,
-        .range = MaxPageFaults * sizeof(u64),
+        .range = StagingSize,
     };
-    boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
+    const std::array<vk::WriteDescriptorSet, 2> writes{{
         {
             .dstSet = VK_NULL_HANDLE,
             .dstBinding = 0,
@@ -717,15 +705,14 @@
             .descriptorType = vk::DescriptorType::eStorageBuffer,
             .pBufferInfo = &download_info,
         },
-    };
+    }};
     download_buffer.Commit();
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
-    cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 2,
-        .pBufferMemoryBarriers = barriers.data(),
+        .bufferMemoryBarrierCount = 1U,
+        .pBufferMemoryBarriers = &fault_buffer_pre_barrier,
     });
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
     cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,

@@ -735,34 +722,19 @@
     cmdbuf.dispatch(num_workgroups, 1, 1);
 
     // Reset fault buffer
-    const vk::BufferMemoryBarrier2 reset_pre_barrier = {
+    const vk::BufferMemoryBarrier2 fault_buffer_post_barrier{
         .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
-        .srcAccessMask = vk::AccessFlagBits2::eShaderRead,
-        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
-        .buffer = fault_buffer.Handle(),
-        .offset = 0,
-        .size = FAULT_BUFFER_SIZE,
-    };
-    const vk::BufferMemoryBarrier2 reset_post_barrier = {
-        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
-        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
-        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
+        .srcAccessMask = vk::AccessFlagBits2::eShaderWrite | vk::AccessFlagBits2::eShaderRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderWrite,
         .buffer = fault_buffer.Handle(),
         .offset = 0,
         .size = FAULT_BUFFER_SIZE,
     };
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 1,
-        .pBufferMemoryBarriers = &reset_pre_barrier,
-    });
-    cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0);
-    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
-        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 1,
-        .pBufferMemoryBarriers = &reset_post_barrier,
+        .bufferMemoryBarrierCount = 1U,
+        .pBufferMemoryBarriers = &fault_buffer_post_barrier,
     });
 
     // Defer creating buffers
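The reset path collapses from a barrier/fillBuffer/barrier sequence into a single compute-to-compute barrier because the shader now clears its own fault word (see the fault_buffer_process.comp change below); the only remaining hazard is between this dispatch's shader accesses and the next recording's shader writes. A sketch of that barrier shape, assuming vulkan.hpp built with VULKAN_HPP_NO_CONSTRUCTORS so the designated initializers used throughout this codebase compile (names are illustrative):

#include <vulkan/vulkan.hpp>

// One execution-plus-memory dependency: shader reads and writes of `buffer`
// in previously recorded compute work must finish and be made visible before
// any later compute shader writes it again. The old sequence additionally
// serialized against the transfer stage on both sides of the fillBuffer.
void FaultBufferResetBarrier(vk::CommandBuffer cmdbuf, vk::Buffer buffer,
                             vk::DeviceSize size) {
    const vk::BufferMemoryBarrier2 barrier{
        .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
        .srcAccessMask =
            vk::AccessFlagBits2::eShaderWrite | vk::AccessFlagBits2::eShaderRead,
        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
        .dstAccessMask = vk::AccessFlagBits2::eShaderWrite,
        .buffer = buffer,
        .offset = 0,
        .size = size,
    };
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &barrier,
    });
}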
@@ -1036,25 +1008,6 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
     });
 }
 
-void BufferCache::MemoryBarrier() {
-    // Vulkan doesn't know which buffer we access in a shader if we use
-    // BufferDeviceAddress. We need a full memory barrier.
-    // For now, we only read memory using BDA. If we want to write to it,
-    // we might need to change this.
-    scheduler.EndRendering();
-    const auto cmdbuf = scheduler.CommandBuffer();
-    vk::MemoryBarrier2 barrier = {
-        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
-        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
-        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
-    };
-    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
-        .memoryBarrierCount = 1,
-        .pMemoryBarriers = &barrier,
-    });
-}
-
 void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value,
                                    u32 num_bytes) {
     scheduler.EndRendering();
src/video_core/buffer_cache/buffer_cache.h (3 changes: 0 additions & 3 deletions)

@@ -159,9 +159,6 @@ class BufferCache {
     /// Synchronizes all buffers needed for DMA.
     void SynchronizeDmaBuffers();
 
-    /// Record memory barrier. Used for buffers when accessed via BDA.
-    void MemoryBarrier();
-
 private:
     template <typename Func>
     void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
src/video_core/host_shaders/fault_buffer_process.comp (4 changes: 1 addition & 3 deletions)

@@ -24,9 +24,6 @@ layout(constant_id = 0) const uint CACHING_PAGEBITS = 0;
 void main() {
     uint id = gl_GlobalInvocationID.x;
     uint word = fault_buffer[id];
-    if (word == 0u) {
-        return;
-    }
     // 1 page per bit
     uint base_bit = id * 32u;
     while (word != 0u) {

@@ -39,4 +36,5 @@ void main() {
             download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS;
         }
     }
+    fault_buffer[id] = 0u;
 }
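With the early-out gone and the unconditional fault_buffer[id] = 0u store added, every invocation now participates in resetting the bitmap, which is what allows the buffer cache to drop its fillBuffer-based clear above. A host-side model of the per-word scan, with std::countr_zero standing in for GLSL's findLSB and the store-index bookkeeping (not shown in the diff) simplified to a push_back:

#include <bit>
#include <cstdint>
#include <vector>

// Expand a fault bitmap into page base addresses, mirroring the shader loop:
// each 32-bit word covers 32 pages, every set bit yields one page address,
// and the word is cleared unconditionally afterwards, like the new in-shader
// reset that replaced the separate fillBuffer pass.
std::vector<uint64_t> ProcessFaultWords(std::vector<uint32_t>& fault_words,
                                        uint32_t caching_pagebits) {
    std::vector<uint64_t> pages;
    for (uint32_t id = 0; id < fault_words.size(); ++id) {
        uint32_t word = fault_words[id];
        const uint32_t base_bit = id * 32u;
        while (word != 0u) {
            const uint32_t bit = std::countr_zero(word); // findLSB equivalent
            word &= word - 1u;                           // clear lowest set bit
            pages.push_back(uint64_t{base_bit + bit} << caching_pagebits);
        }
        fault_words[id] = 0u; // mirrors the new unconditional reset
    }
    return pages;
}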
src/video_core/renderer_vulkan/vk_rasterizer.cpp (1 change: 0 additions & 1 deletion)

@@ -488,7 +488,6 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
                                                        range.upper() - range.lower());
             }
         }
-        buffer_cache.MemoryBarrier();
Review thread on the removed buffer_cache.MemoryBarrier() call:

[Member] Maybe I'm wrong on this one. But since accessing buffer content through DMA doesn't bind those buffers, how does the driver know how to synchronize accesses even if there is a barrier? Does this happen automatically when an access through a buffer device address is detected?

[@raphaelthegreat (Contributor, Author), Jul 17, 2025] Hmm, that is an interesting question. I don't believe the process of binding buffers involves the driver doing any "smart" tracking; rather, the pipeline barrier command itself emits sync packets that perform cache flushes or wait for specific parts of the pipeline to finish. Can check radv to make sure.

[@raphaelthegreat (Contributor, Author), Jul 18, 2025] It looks like radv ignores the buffer memory range: https://github.com/chaotic-cx/mesa-mirror/blob/main/src/amd/vulkan/radv_cmd_buffer.c#L13475. In general, I believe the whole buffer device address feature wouldn't be very useful if drivers relied on bindings to do synchronization.
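For context on this thread: the removed BufferCache::MemoryBarrier() (shown in full in the buffer_cache.cpp diff above) used a global vk::MemoryBarrier2 that names no buffer at all, which matches the observation that drivers synchronize by flushing caches for stage/access scopes rather than by tracking bindings. A minimal sketch of that global form, again assuming vulkan.hpp with VULKAN_HPP_NO_CONSTRUCTORS:

#include <vulkan/vulkan.hpp>

// A global memory barrier covers every resource in the named stage/access
// scopes, including memory reached only through a buffer device address, so
// BDA reads could be fenced without the driver knowing which buffer was
// accessed. The masks mirror the removed MemoryBarrier() exactly.
void GlobalBdaReadBarrier(vk::CommandBuffer cmdbuf) {
    const vk::MemoryBarrier2 barrier{
        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
        .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
    };
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .memoryBarrierCount = 1,
        .pMemoryBarriers = &barrier,
    });
}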

     }
 
     fault_process_pending |= uses_dma;