Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

johndpope · 2024-05-08T07:08:25Z

I forgot about this code - I got claude.ai to generate it some weeks back in about 5 minutes.
I don't know whether it works or even compiles, so it may be garbage. I was captivated by the 10+ hours of YouTube videos, and frankly I'm not sure this is exactly what you wanted.

Regardless, I beseech you to look at Claude Opus (rather than GPT-4) as a route to getting hacking results.
It's well abreast of the AMD firmware / drivers / all the GitHub projects (including tinygrad).

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#include <cassert>
#include <cstdint>
#include <cstdio>

#include "helpers.h"
#include "nouveau.h"
#include "ROCT-Thunk-Interface.h"
#include "PM4Queue.hpp"
#include "PM4Packet.hpp"

// Byte count of 64 * 1024; presumably the intended PM4 queue size (TODO confirm).
// NOTE(review): nothing in the visible code references this macro.
#define ROCHSA_PM4_QUEUE_SIZE (64*1024) // 64 KB

// Pre-assembled machine code for the trivial compute shader, kept word-for-word
// identical to the original it was taken from. Ten 64-bit words, two per row.
// NOTE(review): the encoding was presumably carried over from the
// NVIDIA-oriented original; confirm it is valid ISA for the GPU being targeted.
uint64_t trivial[] = {
    0x00005a00ff057624ULL, 0x000fe200078e00ffULL,
    0x0000580000027a02ULL, 0x000fe20000000f00ULL,
    0x0000590000037a02ULL, 0x000fca0000000f00ULL,
    0x0000000502007986ULL, 0x000fe2000c101904ULL,
    0x000000000000794dULL, 0x000fea0003800000ULL,
};

// Brings the PM4 queue into a usable state by calling its Init() method.
//
// pQueue: queue to initialise. It is dereferenced unconditionally, so the
//         caller must pass a valid, non-null pointer.
void gpu_setup(PM4Queue* pQueue) {
    (*pQueue).Init();
}

// Queues a copy of `len` bytes from `src` to GPU address `dst` by submitting a
// single PM4 DMA-data packet on `pQueue`.
//
// pQueue: queue the packet is placed on and submitted to (must be non-null).
// dst:    destination address handed to the packet.
// src:    source words handed to the packet.
// len:    transfer length; must be a multiple of 4 (checked by assert).
void gpu_memcpy(PM4Queue* pQueue, uint64_t dst, const uint32_t *src, int len) {
    // Reject lengths that are not a whole number of 32-bit words.
    assert(!(len % 4));

    // The copy itself is carried out by the DMA packet, not by the CPU.
    pQueue->PlaceAndSubmitPacket(
        PM4DmaDataPacket(dst, src, len));
}

// Programs the compute registers for one shader, dispatches it as a single
// 1x1x1 workgroup, and blocks until the queue has consumed every packet.
//
// pQueue:      queue all packets are placed on and submitted to (non-null).
// shader_addr: GPU address of the shader code; split into PGM_LO / PGM_HI.
// cb_addr:     GPU address of the constant buffer; also the address handed to
//              the release-memory packet.
// cb_len:      constant-buffer size written to the CB1_SIZE slot; must be
//              non-negative.
void gpu_compute(PM4Queue* pQueue, uint64_t shader_addr, uint64_t cb_addr, int cb_len) {
    // A negative length would silently wrap to a huge unsigned size below.
    assert(cb_len >= 0);

    // Shader program address, low dword first.
    const unsigned int COMPUTE_PGM_VALUES[] = {
        static_cast<uint32_t>(shader_addr),       // PGM_LO
        static_cast<uint32_t>(shader_addr >> 32)  // PGM_HI
    };

    const unsigned int COMPUTE_PGM_RSRC1[] = { 0x000c0084 }; // Same as original

    // NOTE(review): all 8 values are written to consecutive registers starting
    // at mmCOMPUTE_NUM_THREAD_X. Confirm against the register map that the
    // slots after NUM_THREAD_Z really are GROUPS_X/Y/Z and
    // PIPELINESTAT/PERFCOUNT; if they are not, this write clobbers neighbouring
    // registers (possibly the program address set just above).
    const unsigned int COMPUTE_DISPATCH_DIMENSIONS[] = {
        1, 1, 1, // THREADS_X/Y/Z
        1, 1, 1, // GROUPS_X/Y/Z
        0, 0     // PIPELINESTAT/PERFCOUNT
    };

    const unsigned int COMPUTE_USER_DATA[] = {
        static_cast<uint32_t>(cb_addr),       // CB1_BASE_LO
        static_cast<uint32_t>(cb_addr >> 32), // CB1_BASE_HI
        // Explicit cast: int -> unsigned int inside a braced initializer is a
        // narrowing conversion and is ill-formed without it.
        static_cast<unsigned int>(cb_len),    // CB1_SIZE
        1                                     // CB1_VALID
    };

    // Configure shader registers, one packet per register group.
    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_PGM_LO, COMPUTE_PGM_VALUES,
                              sizeof(COMPUTE_PGM_VALUES) / sizeof(COMPUTE_PGM_VALUES[0])));

    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_PGM_RSRC1, COMPUTE_PGM_RSRC1,
                              sizeof(COMPUTE_PGM_RSRC1) / sizeof(COMPUTE_PGM_RSRC1[0])));

    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_NUM_THREAD_X, COMPUTE_DISPATCH_DIMENSIONS,
                              sizeof(COMPUTE_DISPATCH_DIMENSIONS) / sizeof(COMPUTE_DISPATCH_DIMENSIONS[0])));

    pQueue->PlaceAndSubmitPacket(
        PM4SetShaderRegPacket(mmCOMPUTE_USER_DATA_0, COMPUTE_USER_DATA,
                              sizeof(COMPUTE_USER_DATA) / sizeof(COMPUTE_USER_DATA[0])));

    // Dispatch the compute shader as a single 1x1x1 grid.
    pQueue->PlaceAndSubmitPacket(PM4DispatchDirectPacket(1, 1, 1));

    // Wait for shader completion.
    // NOTE(review): presumably the release-memory packet writes the 0xC0FFEE
    // marker to cb_addr once the work retires, which would overwrite the start
    // of the constant buffer; confirm against PM4ReleaseMemoryPacket.
    pQueue->PlaceAndSubmitPacket(PM4ReleaseMemoryPacket(true, cb_addr, 0xC0FFEE));
    pQueue->Wait4PacketConsumption();
}

// Proof-of-concept entry point: maps one page of /dev/mem, initialises a PM4
// queue, copies the `trivial` shader to GPU memory through the queue, runs it,
// and releases what was acquired.
//
// Returns 0 on success, 1 if /dev/mem cannot be opened or mapped.
int main() {
    PM4Queue queue;
    // NOTE(review): isaBuf is never used below; it is kept because its
    // constructor may have side effects that are not visible from here.
    HsaMemoryBuffer isaBuf(trivial, sizeof(trivial), PAGE_SIZE, false);

    // Map and initialize GPU resources.
    // NOTE(review): this maps offset 0 of /dev/mem and the mapping is never
    // accessed; confirm which physical range is actually wanted.
    const int mem_fd = open("/dev/mem", O_RDWR);
    if (mem_fd < 0) {
        std::perror("open(/dev/mem)");
        return 1;
    }

    void* gpu_mmio_ptr = mmap(nullptr, PAGE_SIZE, PROT_READ | PROT_WRITE,
                              MAP_SHARED, mem_fd, 0);
    if (gpu_mmio_ptr == MAP_FAILED) {
        std::perror("mmap(/dev/mem)");
        close(mem_fd);
        return 1;
    }
    // The mapping remains valid after the descriptor is closed, so release the
    // descriptor now instead of leaking it.
    close(mem_fd);

    // Size of the GPU allocation that is still to be made (see TODO below).
    const uint64_t sizeGpuMem = PAGE_SIZE;

    // TODO: allocate sizeGpuMem bytes with hsaKmtAllocMemory(). Until then the
    // shader and the constant buffer both alias GPU address 0.
    uint64_t gpu_local_mem = 0;
    uint64_t cb_gpu_addr = gpu_local_mem;

    // Set up the queue
    gpu_setup(&queue);

    // Copy shader code to GPU memory. `trivial` is stored as 64-bit words while
    // gpu_memcpy takes 32-bit words, so the pointer must be converted explicitly.
    gpu_memcpy(&queue, gpu_local_mem,
               reinterpret_cast<const uint32_t*>(trivial),
               static_cast<int>(sizeof(trivial)));

    // Run the shader
    gpu_compute(&queue, gpu_local_mem, cb_gpu_addr, 16);

    // Clean up
    munmap(gpu_mmio_ptr, PAGE_SIZE);
    // Only free memory that was really allocated; freeing the placeholder
    // address 0 would hand the driver an allocation it never made.
    if (gpu_local_mem != 0) {
        hsaKmtFreeMemory(reinterpret_cast<void*>(gpu_local_mem), sizeGpuMem);
    }

    return 0;
}

geohot/cuda_ioctl_sniffer#5

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

johndpope commented May 8, 2024

Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

Claude OPUS - POC - AMD driver + ROCT + PM4Queue / packets #13

Comments

johndpope commented May 8, 2024