C++ ile GPU beamforming işini tamamladım. Sonuçlar gerçekten çok iyi:
CPU ~1.1 saniye, GPU ~0.018 saniye. GPU'nun L1 cache'ini henüz kullanmadım; yine de sonuçlar şimdiden çok iyi. İyi sonuçlar biraz heyecan yarattı; L1 adımını geçip direkt 3D'ye geçirme işine başlayacağım.
GPU kerneli:
// Plain parameter bundle passed BY VALUE to the beamforming kernel.
// All pointer members are DEVICE pointers allocated with cudaMalloc on the host.
struct MicParamsGpu
{
float *outputData;   // [packetSize] beamformed output samples (device)
float *rawData;      // [arraySize * packetSize] packet samples, one row per microphone (device)
int packetSize;      // number of samples in one microphone packet
float *leapData;     // [arraySize * leapStride] samples past the packet boundary, one row per mic (device)
int leapStride;      // leap samples per microphone; host sets this to 2 * max mic delay
int *delays;         // [arraySize] per-mic delay; host offsets by max delay so values are non-negative -- TODO confirm getDelay() never exceeds the max
int arraySize;       // number of microphones
int stride;          // consecutive output samples produced by each kernel thread
};
// Returns sample `index` of microphone `curMic`. Indices inside the packet
// come from the main packet buffer; indices past the packet spill into the
// leap buffer (which the host sized as leapStride samples per microphone).
__device__ float GetElement(MicParamsGpu params, int curMic, int index ) {
	if (index < params.packetSize)
		return params.rawData[curMic * params.packetSize + index];
	// Past the packet boundary: read from this mic's row of the leap buffer.
	return params.leapData[curMic * params.leapStride + (index - params.packetSize)];
}
// Delay-and-sum beamforming kernel.
// Expected launch: <<<1, N>>> with N * params.stride == params.packetSize;
// thread t produces output samples [t*stride, (t+1)*stride).
// All buffers in `params` must be device memory (see beamformWithCudaHelper).
__global__ void beamformKernel2(MicParamsGpu params)
{
	int xIndex = threadIdx.x;
	int currentStartIndex = xIndex * params.stride;
	for (int k = 0; k < params.stride; k++)
	{
		float curVal = 0.0f;
		for (int i = 0; i < params.arraySize; i++)
		{
			// BUG FIX: original used '=' so each iteration overwrote the
			// previous one and only the LAST microphone contributed.
			// Delay-and-sum beamforming accumulates the delayed sample
			// from every microphone.
			curVal += GetElement(params, i, currentStartIndex + k + params.delays[i]);
		}
		params.outputData[currentStartIndex + k] = curVal;
	}
}
Cuda Memory Allocation:
// Host wrapper for GPU beamforming: allocates device buffers, uploads each
// microphone's packet/leap data and delays, launches beamformKernel2 and
// copies the beamformed packet back into `outputData`.
// Returns the last CUDA error encountered (cudaSuccess on a clean run).
cudaError_t beamformWithCudaHelper(MicrophoneArray& array, SharpVector& outputData)
{
	MicParamsGpu params = {}; // zero-init so cudaFree on untouched pointers is a safe no-op
	params.arraySize = static_cast<int>(array.micropshoneList.size());
	params.packetSize = Config::getInstance().packetSize;
	params.leapStride = Config::getInstance().getMicMaxDelay() * 2;
	// Kernel is launched with 1000 threads, each covering `stride` samples.
	// NOTE(review): assumes packetSize is a multiple of 1000 — any remainder
	// samples would never be produced. TODO confirm against Config.
	params.stride = params.packetSize / 1000;

	cudaError_t cudaStatus = cudaSuccess;
	// Release all device buffers (original leaked every allocation).
	auto cleanup = [&]() {
		cudaFree(params.rawData);
		cudaFree(params.leapData);
		cudaFree(params.delays);
		cudaFree(params.outputData);
	};

	// Device allocations. (Original text had '&params' garbled into '¶ms'.)
	cudaMalloc((void**)&params.rawData, (size_t)params.arraySize * params.packetSize * sizeof(float));
	cudaMalloc((void**)&params.leapData, (size_t)params.arraySize * params.leapStride * sizeof(float));
	cudaMalloc((void**)&params.delays, (size_t)params.arraySize * sizeof(int));
	cudaMalloc((void**)&params.outputData, (size_t)params.packetSize * sizeof(float));
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
		cleanup();
		return cudaStatus;
	}

	// Upload per-microphone packet and leap data, and collect per-mic delays.
	std::vector<int> delayVec;
	delayVec.reserve(params.arraySize);
	for (int i = 0; i < params.arraySize; i++)
	{
		cudaMemcpy(params.rawData + i * params.packetSize,
			array.micropshoneList[i].getData().data(),
			params.packetSize * sizeof(float), cudaMemcpyHostToDevice);
		cudaMemcpy(params.leapData + i * params.leapStride,
			array.micropshoneList[i].getLeapData().data(),
			params.leapStride * sizeof(float), cudaMemcpyHostToDevice);
		// Offset by the max delay so kernel-side indices are never negative.
		delayVec.push_back(array.micropshoneList[i].getDelay(1000, 45)
			+ Config::getInstance().getMicMaxDelay());
	}
	cudaMemcpy(params.delays, delayVec.data(), delayVec.size() * sizeof(int), cudaMemcpyHostToDevice);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy (upload) failed: %s\n", cudaGetErrorString(cudaStatus));
		cleanup();
		return cudaStatus;
	}

	float startTime = get_time();
	// One block of 1000 threads; each thread writes `stride` output samples.
	beamformKernel2<<<1, 1000>>>(params);
	// cudaGetLastError catches launch-configuration errors immediately.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "beamformKernel2 launch failed: %s\n", cudaGetErrorString(cudaStatus));
		cleanup();
		return cudaStatus;
	}
	// Blocking device-to-host copy also synchronizes with the kernel, so the
	// timing below includes kernel execution.
	cudaStatus = cudaMemcpy(outputData.data(), params.outputData,
		(size_t)params.packetSize * sizeof(float), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy (download) failed: %s\n", cudaGetErrorString(cudaStatus));
	}
	float endTime = get_time();
	std::cout << "GPU Time spent: " << endTime - startTime;

	cleanup();
	return cudaStatus;
}
--
[ Bu gönderi, http://ddili.org/forum'dan dönüştürülmüştür. ]