Udacity并行计算课程 CS344 编程作业答案

Problem set 1

// Homework 1
// Color to Greyscale Conversion //A common way to represent color images is known as RGBA - the color
//is specified by how much Red, Green, and Blue is in it.
//The 'A' stands for Alpha and is used for transparency; it will be
//ignored in this homework. //Each channel Red, Blue, Green, and Alpha is represented by one byte.
//Since we are using one byte for each color there are 256 different
//possible values for each color. This means we use 4 bytes per pixel. //Greyscale images are represented by a single intensity value per pixel
//which is one byte in size. //To convert an image from color to grayscale one simple method is to
//set the intensity to the average of the RGB channels. But we will
//use a more sophisticated method that takes into account how the eye
//perceives color and weights the channels unequally. //The eye responds most strongly to green followed by red and then blue.
//The NTSC (National Television System Committee) recommends the following
//formula for color to greyscale conversion: //I = .299f * R + .587f * G + .114f * B //Notice the trailing f's on the numbers which indicate that they are
//single precision floating point constants and not double precision
//constants. //You should fill in the kernel as well as set the block and grid sizes
//so that the entire image is processed. #include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h> __global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
//TODO
//Fill in the kernel to convert from color to greyscale
//the mapping from components of a uchar4 to RGBA is:
// .x -> R ; .y -> G ; .z -> B ; .w -> A
//
//The output (greyImage) at each pixel should be the result of
//applying the formula: output = .299f * R + .587f * G + .114f * B;
//Note: We will be ignoring the alpha channel for this conversion //First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; if(x < numCols && y < numRows)
{
greyImage[y * numRows + x] = 0.299f * rgbaImage[y * numRows + x].x + 0.587f * rgbaImage[y * numRows + x].y + 0.114f * rgbaImage[y * numRows + x].z;
} } void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(32, 8); //TODO
const dim3 gridSize( (numRows+blockSize.x-1)/blockSize.x, (numCols+blockSize.y-1)/blockSize.y, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}

Problem set 2

// Homework 2
// Image Blurring
//
// In this homework we are blurring an image. To do this, imagine that we have
// a square array of weight values. For each pixel in the image, imagine that we
// overlay this square array of weights on top of the image such that the center
// of the weight array is aligned with the current pixel. To compute a blurred
// pixel value, we multiply each pair of numbers that line up. In other words, we
// multiply each weight with the pixel underneath it. Finally, we add up all of the
// multiplied numbers and assign that value to our output for the current pixel.
// We repeat this process for all the pixels in the image. // To help get you started, we have included some useful notes here. //**************************************************************************** // For a color image that has multiple channels, we suggest separating
// the different color channels so that each color is stored contiguously
// instead of being interleaved. This will simplify your code. // That is instead of RGBARGBARGBARGBA... we suggest transforming to three
// arrays (as in the previous homework we ignore the alpha channel again):
// 1) RRRRRRRR...
// 2) GGGGGGGG...
// 3) BBBBBBBB...
//
// The original layout is known an Array of Structures (AoS) whereas the
// format we are converting to is known as a Structure of Arrays (SoA). // As a warm-up, we will ask you to write the kernel that performs this
// separation. You should then write the "meat" of the assignment,
// which is the kernel that performs the actual blur. We provide code that
// re-combines your blurred results for each color channel. //**************************************************************************** // You must fill in the gaussian_blur kernel to perform the blurring of the
// inputChannel, using the array of weights, and put the result in the outputChannel. // Here is an example of computing a blur, using a weighted average, for a single
// pixel in a small image.
//
// Array of weights:
//
// 0.0 0.2 0.0
// 0.2 0.2 0.2
// 0.0 0.2 0.0
//
// Image (note that we align the array of weights to the center of the box):
//
// 1 2 5 2 0 3
// -------
// 3 |2 5 1| 6 0 0.0*2 + 0.2*5 + 0.0*1 +
// | |
// 4 |3 6 2| 1 4 -> 0.2*3 + 0.2*6 + 0.2*2 + -> 3.2
// | |
// 0 |4 0 3| 4 2 0.0*4 + 0.2*0 + 0.0*3
// -------
// 9 6 5 0 3 9
//
// (1) (2) (3)
//
// A good starting place is to map each thread to a pixel as you have before.
// Then every thread can perform steps 2 and 3 in the diagram above
// completely independently of one another. // Note that the array of weights is square, so its height is the same as its width.
// We refer to the array of weights as a filter, and we refer to its width with the
// variable filterWidth. //**************************************************************************** // Your homework submission will be evaluated based on correctness and speed.
// We test each pixel against a reference solution. If any pixel differs by
// more than some small threshold value, the system will tell you that your
// solution is incorrect, and it will let you try again. // Once you have gotten that working correctly, then you can think about using
// shared memory and having the threads cooperate to achieve better performance. //**************************************************************************** // Also note that we've supplied a helpful debugging function called checkCudaErrors.
// You should wrap your allocation and copying statements like we've done in the
// code we're supplying you. Here is an example of the unsafe way to allocate
// memory on the GPU:
//
// cudaMalloc(&d_red, sizeof(unsigned char) * numRows * numCols);
//
// Here is an example of the safe way to do the same thing:
//
// checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char) * numRows * numCols));
//
// Writing code the safe way requires slightly more typing, but is very helpful for
// catching mistakes. If you write code the unsafe way and you make a mistake, then
// any subsequent kernels won't compute anything, and it will be hard to figure out
// why. Writing code the safe way will inform you as soon as you make a mistake. // Finally, remember to free the memory you allocate at the end of the function. //**************************************************************************** #include "utils.h" __global__
void gaussian_blur(const unsigned char* const inputChannel,
unsigned char* const outputChannel,
int numRows, int numCols,
const float* const filter, const int filterWidth)
{
// TODO // NOTE: Be sure to compute any intermediate results in floating point
// before storing the final result as unsigned char. // NOTE: Be careful not to try to access memory that is outside the bounds of
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// } // NOTE: If a thread's absolute position 2D position is within the image, but some of
// its neighbors are outside the image, then you will need to be extra careful. Instead
// of trying to read such a neighbor value from GPU memory (which won't work because
// the value is out of bounds), you should explicitly clamp the neighbor values you read
// to be within the bounds of the image. If this is not clear to you, then please refer
// to sequential reference solution for the exact clamping semantics you should follow. int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; if( x < numRows && y < numCols){ int pos = y * numCols + x;
assert((filterWidth % 2) == 1); for(int filterRow = -filterWidth /2 ; filterRow <= filterWidth / 2; ++filterRow)
for(int filterCol = -filterWidth / 2; filterCol <= filterWidth / 2; ++filterCol){ int neighborRows = min(numRows - 1,max(0,y + filterRow));
int neighborCols = min(numCols - 1, max(0,x + filterCol));
int pos = neighborCols * numCols + neighborRows;
int neighbor = static_cast<float>(inputChannel[pos]); int filter_pos = (filterRow + filterWidth / 2)* filterWidth; outputChannel[pos] += outputChannel[neighbor] * filter[filter_pos]; } } } //This kernel takes in an image represented as a uchar4 and splits
//it into three images consisting of only one color channel each
__global__
void separateChannels(const uchar4* const inputImageRGBA,
int numRows,
int numCols,
unsigned char* const redChannel,
unsigned char* const greenChannel,
unsigned char* const blueChannel)
{
// TODO
//
// NOTE: Be careful not to try to access memory that is outside the bounds of
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// } int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; if(x < numCols && y < numRows){
redChannel[y * numCols + x ] = inputImageRGBA[y * numCols + x].x;
greenChannel[y * numCols + x ] = inputImageRGBA[y * numCols + x].y;
blueChannel[y * numCols + x ] = inputImageRGBA[y * numCols + x].z;
}
}
//This kernel takes in three color channels and recombines them
//into one image. The alpha channel is set to 255 to represent
//that this image has no transparency.
__global__
void recombineChannels(const unsigned char* const redChannel,
const unsigned char* const greenChannel,
const unsigned char* const blueChannel,
uchar4* const outputImageRGBA,
int numRows,
int numCols)
{
const int2 thread_2D_pos = make_int2( blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y);
const int thread_1D_pos = thread_2D_pos.y * numCols + thread_2D_pos.x;
//make sure we don't try and access memory outside the image
//by having any threads mapped there return early
if (thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows)
return; unsigned char red = redChannel[thread_1D_pos];
unsigned char green = greenChannel[thread_1D_pos];
unsigned char blue = blueChannel[thread_1D_pos]; //Alpha should be 255 for no transparency
uchar4 outputPixel = make_uchar4(red, green, blue, 255); outputImageRGBA[thread_1D_pos] = outputPixel;
} unsigned char *d_red, *d_green, *d_blue;
float *d_filter; void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
const float* const h_filter, const size_t filterWidth)
{ //allocate memory for the three different channels
//original
checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char) * numRowsImage * numColsImage));
checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage)); //TODO:
//Allocate memory for the filter on the GPU
//Use the pointer d_filter that we have already declared for you
//You need to allocate memory for the filter with cudaMalloc
//be sure to use checkCudaErrors like the above examples to
//be able to tell if anything goes wrong
//IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
checkCudaErrors(cudaMalloc(&d_filter,sizeof(float) * filterWidth * filterWidth)); //TODO:
//Copy the filter on the host (h_filter) to the memory you just allocated
//on the GPU. cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
//Remember to use checkCudaErrors!
checkCudaErrors(cudaMemcpy(d_filter,h_filter,sizeof(float) * filterWidth * filterWidth,cudaMemcpyHostToDevice)); } void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
uchar4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
unsigned char *d_redBlurred,
unsigned char *d_greenBlurred,
unsigned char *d_blueBlurred,
const int filterWidth)
{
//TODO: Set reasonable block size (i.e., number of threads per block)
const dim3 blockSize(32, 32, 1); //TODO:
//Compute correct grid size (i.e., number of blocks per kernel launch)
//from the image size and and block size.
const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x, (numRows + blockSize.y - 1) / blockSize.y,1); //TODO: Launch a kernel for separating the RGBA image into different color channels
/*
unsigned char* d_red;
unsigned char* d_green;
unsigned char* d_blue;
checkCudaErrors(cudaMalloc((void**)&d_red,numRows * numCols *sizeof(unsigned char)));
checkCudaErrors(cudaMalloc((void**)&d_green,numRows * numCols *sizeof(unsigned char)));
checkCudaErrors(cudaMalloc((void**)&d_blue,numRows * numCols *sizeof(unsigned char)));
*/
separateChannels<<< gridSize,blockSize>>>(d_inputImageRGBA,
numRows,
numCols,
d_red,
d_green,
d_blue); // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); //TODO: Call your convolution kernel here 3 times, once for each color channel.
gaussian_blur<<<gridSize,blockSize>>>(d_red,
d_redBlurred,
numRows, numCols,
d_filter, filterWidth);
gaussian_blur<<<gridSize,blockSize>>>(d_blue,
d_blueBlurred,
numRows, numCols,
d_filter, filterWidth);
gaussian_blur<<<gridSize,blockSize>>>(d_green,
d_greenBlurred,
numRows, numCols,
d_filter, filterWidth); // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // Now we recombine your results. We take care of launching this kernel for you.
//
// NOTE: This kernel launch depends on the gridSize and blockSize variables,
// which you must set yourself.
recombineChannels<<<gridSize, blockSize>>>(d_redBlurred,
d_greenBlurred,
d_blueBlurred,
d_outputImageRGBA,
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); } //Free all the memory that we allocated
//TODO: make sure you free any arrays that you allocated
void cleanup() {
checkCudaErrors(cudaFree(d_red));
checkCudaErrors(cudaFree(d_green));
checkCudaErrors(cudaFree(d_blue));
}

Problem set 3

/* Udacity Homework 3
HDR Tone-mapping
Background HDR
==============
A High Dynamic Range (HDR) image contains a wider variation of intensity
and color than is allowed by the RGB format with 1 byte per channel that we
have used in the previous assignment.
To store this extra information we use single precision floating point for
each channel. This allows for an extremely wide range of intensity values.
In the image for this assignment, the inside of church with light coming in
through stained glass windows, the raw input floating point values for the
channels range from 0 to 275. But the mean is .41 and 98% of the values are
less than 3! This means that certain areas (the windows) are extremely bright
compared to everywhere else. If we linearly map this [0-275] range into the
[0-255] range that we have been using then most values will be mapped to zero!
The only thing we will be able to see are the very brightest areas - the
windows - everything else will appear pitch black.
The problem is that although we have cameras capable of recording the wide
range of intensity that exists in the real world our monitors are not capable
of displaying them. Our eyes are also quite capable of observing a much wider
range of intensities than our image formats / monitors are capable of
displaying.
Tone-mapping is a process that transforms the intensities in the image so that
the brightest values aren't nearly so far away from the mean. That way when
we transform the values into [0-255] we can actually see the entire image.
There are many ways to perform this process and it is as much an art as a
science - there is no single "right" answer. In this homework we will
implement one possible technique.
Background Chrominance-Luminance
================================
The RGB space that we have been using to represent images can be thought of as
one possible set of axes spanning a three dimensional space of color. We
sometimes choose other axes to represent this space because they make certain
operations more convenient.
Another possible way of representing a color image is to separate the color
information (chromaticity) from the brightness information. There are
multiple different methods for doing this - a common one during the analog
television days was known as Chrominance-Luminance or YUV.
We choose to represent the image in this way so that we can remap only the
intensity channel and then recombine the new intensity values with the color
information to form the final image.
Old TV signals used to be transmitted in this way so that black & white
televisions could display the luminance channel while color televisions would
display all three of the channels. Tone-mapping
============
In this assignment we are going to transform the luminance channel (actually
the log of the luminance, but this is unimportant for the parts of the
algorithm that you will be implementing) by compressing its range to [0, 1].
To do this we need the cumulative distribution of the luminance values.
Example
-------
input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
min / max / range: 0 / 9 / 9
histo with 3 bins: [4 7 3]
cdf : [4 11 14]
Your task is to calculate this cumulative distribution by following these
steps.
*/ #include "utils.h" __global__ void reduce_minimum(float * d_out, const float * const d_in, const size_t numItem) {
// sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[]; int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x; // load shared mem from global mem
sdata[tid] = 99999999999.0f;
if (myId < numItem)
sdata[tid] = d_in[myId]; __syncthreads(); // make sure entire block is loaded! // do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] = min(sdata[tid], sdata[tid + s]);
}
__syncthreads(); // make sure all adds at one stage are done!
} // only thread 0 writes result for this block back to global mem
if (tid == 0) {
d_out[blockIdx.x] = sdata[0];
}
} __global__ void reduce_maximum(float * d_out, const float * const d_in, const size_t numItem) {
// sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[]; int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x; // load shared mem from global mem
sdata[tid] = -99999999999.0f;
if (myId < numItem)
sdata[tid] = d_in[myId]; __syncthreads(); // make sure entire block is loaded! // do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] = max(sdata[tid], sdata[tid + s]);
}
__syncthreads(); // make sure all adds at one stage are done!
} // only thread 0 writes result for this block back to global mem
if (tid == 0) {
d_out[blockIdx.x] = sdata[0];
}
} __global__ void histogram(unsigned int *d_bins, const float * const d_in, const size_t numBins, const float min_logLum, const float range, const size_t numRows, const size_t numCols) { int myId = threadIdx.x + blockDim.x * blockIdx.x;
if (myId >= (numRows * numCols))
return; float myItem = d_in[myId];
int myBin = (myItem - min_logLum) / range * numBins;
atomicAdd(&(d_bins[myBin]), 1);
} __global__ void scan(unsigned int *d_out, unsigned int *d_sums, const unsigned int * const d_in, const unsigned int numBins, const unsigned int numElems) { extern __shared__ float sdata[];
int myId = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
int offset = 1; // load two items per thread into shared memory
if ((2 * myId) < numBins) {
sdata[2 * tid] = d_in[2 * myId];
}
else {
sdata[2 * tid] = 0;
} if ((2 * myId + 1) < numBins) {
sdata[2 * tid + 1] = d_in[2 * myId + 1];
}
else {
sdata[2 * tid + 1] = 0;
} // Reduce
for (unsigned int d = numElems >> 1; d > 0; d >>= 1) {
if (tid < d) {
int ai = offset * (2 * tid + 1) - 1;
int bi = offset * (2 * tid + 2) - 1;
sdata[bi] += sdata[ai];
}
offset *= 2;
__syncthreads();
} // clear the last element
if (tid == 0) {
d_sums[blockIdx.x] = sdata[numElems - 1];
sdata[numElems - 1] = 0;
} // Down Sweep
for (unsigned int d = 1; d < numElems; d *= 2) {
offset >>= 1;
if (tid < d) {
int ai = offset * (2 * tid + 1) - 1;
int bi = offset * (2 * tid + 2) - 1;
float t = sdata[ai];
sdata[ai] = sdata[bi];
sdata[bi] += t;
}
__syncthreads();
} // write the output to global memory
if ((2 * myId) < numBins) {
d_out[2 * myId] = sdata[2 * tid];
}
if ((2 * myId + 1) < numBins) {
d_out[2 * myId + 1] = sdata[2 * tid + 1];
}
} // This version only works for one single block! The size of the array of items
__global__ void scan2(unsigned int *d_out, const unsigned int * const d_in, const unsigned int numBins, const unsigned int numElems) { extern __shared__ float sdata[];
int tid = threadIdx.x;
int offset = 1; // load two items per thread into shared memory
if ((2 * tid) < numBins) {
sdata[2 * tid] = d_in[2 * tid];
}
else {
sdata[2 * tid] = 0;
} if ((2 * tid + 1) < numBins) {
sdata[2 * tid + 1] = d_in[2 * tid + 1];
}
else {
sdata[2 * tid + 1] = 0;
} // Reduce
for (unsigned int d = numElems >> 1; d > 0; d >>= 1) {
if (tid < d) {
int ai = offset * (2 * tid + 1) - 1;
int bi = offset * (2 * tid + 2) - 1;
sdata[bi] += sdata[ai];
}
offset *= 2;
__syncthreads();
} // clear the last element
if (tid == 0) {
sdata[numElems - 1] = 0;
} // Down Sweep
for (unsigned int d = 1; d < numElems; d *= 2) {
offset >>= 1;
if (tid < d) {
int ai = offset * (2 * tid + 1) - 1;
int bi = offset * (2 * tid + 2) - 1;
float t = sdata[ai];
sdata[ai] = sdata[bi];
sdata[bi] += t;
}
__syncthreads();
} // write the output to global memory
if ((2 * tid) < numBins) {
d_out[2 * tid] = sdata[2 * tid];
} if ((2 * tid + 1) < numBins) {
d_out[2 * tid + 1] = sdata[2 * tid + 1];
}
} __global__ void add_scan(unsigned int *d_out, const unsigned int * const d_in, const unsigned int numBins) { if (blockIdx.x == 0)
return; int myId = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int myOffset = d_in[blockIdx.x]; if ((2 * myId) < numBins) {
d_out[2 * myId] += myOffset;
}
if ((2 * myId + 1) < numBins) {
d_out[2 * myId + 1] += myOffset;
} } void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */ // Initialization
unsigned int numItem = numRows * numCols;
dim3 blockSize(256, 1, 1);
dim3 gridSize(numItem / blockSize.x + 1, 1, 1); float * d_inter_min;
float * d_inter_max;
unsigned int * d_histogram;
unsigned int * d_sums;
unsigned int * d_incr; checkCudaErrors(cudaMalloc(&d_inter_min, sizeof(float) * gridSize.x));
checkCudaErrors(cudaMalloc(&d_inter_max, sizeof(float) * gridSize.x));
checkCudaErrors(cudaMalloc(&d_histogram, sizeof(unsigned int) * numBins));
checkCudaErrors(cudaMemset(d_histogram, 0, sizeof(unsigned int) * numBins)); // Step 1: Reduce (min and max). It could be done in one step only!
reduce_minimum<<<gridSize, blockSize, sizeof(float) * blockSize.x>>>(d_inter_min, d_logLuminance, numItem);
reduce_maximum<<<gridSize, blockSize, sizeof(float) * blockSize.x>>>(d_inter_max, d_logLuminance, numItem);
numItem = gridSize.x;
gridSize.x = numItem / blockSize.x + 1; while (numItem > 1) {
reduce_minimum<<<gridSize, blockSize, sizeof(float) * blockSize.x>>>(d_inter_min, d_inter_min, numItem);
reduce_maximum<<<gridSize, blockSize, sizeof(float) * blockSize.x>>>(d_inter_max, d_inter_max, numItem);
numItem = gridSize.x;
gridSize.x = numItem / blockSize.x + 1;
} // Step 2: Range
checkCudaErrors(cudaMemcpy(&min_logLum, d_inter_min, sizeof(float), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(&max_logLum, d_inter_max, sizeof(float), cudaMemcpyDeviceToHost)); float range = max_logLum - min_logLum; // Step 3: Histogram
gridSize.x = numRows * numCols / blockSize.x + 1;
histogram<<<gridSize, blockSize>>>(d_histogram, d_logLuminance, numBins, min_logLum, range, numRows, numCols); // Step 4: Exclusive scan - Blelloch
unsigned int numElems = 256;
blockSize.x = numElems / 2;
gridSize.x = numBins / numElems;
if (numBins % numElems != 0)
gridSize.x++;
checkCudaErrors(cudaMalloc(&d_sums, sizeof(unsigned int) * gridSize.x));
checkCudaErrors(cudaMemset(d_sums, 0, sizeof(unsigned int) * gridSize.x)); // First-level scan to obtain the scanned blocks
scan<<<gridSize, blockSize, sizeof(float) * numElems>>>(d_cdf, d_sums, d_histogram, numBins, numElems); // Second-level scan to obtain the scanned blocks sums
numElems = gridSize.x; // Look for the next power of 2 (32 bits)
unsigned int nextPow = numElems;
nextPow--;
nextPow = (nextPow >> 1) | nextPow;
nextPow = (nextPow >> 2) | nextPow;
nextPow = (nextPow >> 4) | nextPow;
nextPow = (nextPow >> 8) | nextPow;
nextPow = (nextPow >> 16) | nextPow;
nextPow++; blockSize.x = nextPow / 2;
gridSize.x = 1;
checkCudaErrors(cudaMalloc(&d_incr, sizeof(unsigned int) * numElems));
checkCudaErrors(cudaMemset(d_incr, 0, sizeof(unsigned int) * numElems));
scan2<<<gridSize, blockSize, sizeof(float) * nextPow>>>(d_incr, d_sums, numElems, nextPow); // Add scanned block sum i to all values of scanned block i
numElems = 256;
blockSize.x = numElems / 2;
gridSize.x = numBins / numElems;
if (numBins % numElems != 0)
gridSize.x++;
add_scan<<<gridSize, blockSize>>>(d_cdf, d_incr, numBins); // Clean memory
checkCudaErrors(cudaFree(d_inter_min));
checkCudaErrors(cudaFree(d_inter_max));
checkCudaErrors(cudaFree(d_histogram));
checkCudaErrors(cudaFree(d_sums));
checkCudaErrors(cudaFree(d_incr)); }

Problem set 4

//Udacity HW 4
//Radix Sorting #include "utils.h"
#include <thrust/host_vector.h> /* Red Eye Removal
=============== For this assignment we are implementing red eye removal. This is
accomplished by first creating a score for every pixel that tells us how
likely it is to be a red eye pixel. We have already done this for you - you
are receiving the scores and need to sort them in ascending order so that we
know which pixels to alter to remove the red eye.
Note: ascending order == smallest to largest
Each score is associated with a position, when you sort the scores, you must
also move the positions accordingly.
Implementing Parallel Radix Sort with CUDA
==========================================
The basic idea is to construct a histogram on each pass of how many of each
"digit" there are. Then we scan this histogram so that we know where to put
the output of each digit. For example, the first 1 must come after all the
0s so we have to know how many 0s there are to be able to start moving 1s
into the correct position.
1) Histogram of the number of occurrences of each digit
2) Exclusive Prefix Sum of Histogram
3) Determine relative offset of each digit
For example [0 0 1 1 0 0 1]
-> [0 1 0 1 2 3 2]
4) Combine the results of steps 2 & 3 to determine the final
output location for each element and move it there
LSB Radix sort is an out-of-place sort and you will need to ping-pong values
between the input and output buffers we have provided. Make sure the final
sorted results end up in the output buffer! Hint: You may need to do a copy
at the end.
*/ /*
void your_sort(unsigned int* const d_inputVals,
unsigned int* const d_inputPos,
unsigned int* const d_outputVals,
unsigned int* const d_outputPos,
const size_t numElems)
{
//TODO
//PUT YOUR SORT HERE
const int numBits = 1;
unsigned int* d_splitVals;
checkCudaErrors(cudaMalloc((void**)&d_splitVals,numElems * sizeof(unsigned)));
unsigned int* d_cdf;
checkCudaErrors(cudaMalloc((void**)&d_cdf,numElems * sizeof(unsigned)));
//d_scatterAddr keeps track of the scattered oringinal addresses at every pass
unsigned int* d_scatterAddr;
checkCudaErrors(cudaMalloc((void**)&d_scatterAddr,numElems * sizeof(unsigned)));
checkCudaErrors(cudaMemcpy(d_scatterAddr,d_inputPos,numElems * sizeof(unsigned),cudaMemcpyDeviceToDevice));
}
*/ //Udacity HW 4
//Radix Sorting #include "reference_calc.cpp"
#include "utils.h"
#include <float.h>
#include <math.h>
#include <stdio.h> #include "utils.h" /* Red Eye Removal
=============== For this assignment we are implementing red eye removal. This is
accomplished by first creating a score for every pixel that tells us how
likely it is to be a red eye pixel. We have already done this for you - you
are receiving the scores and need to sort them in ascending order so that we
know which pixels to alter to remove the red eye.
Note: ascending order == smallest to largest
Each score is associated with a position, when you sort the scores, you must
also move the positions accordingly.
Implementing Parallel Radix Sort with CUDA
==========================================
The basic idea is to construct a histogram on each pass of how many of each
"digit" there are. Then we scan this histogram so that we know where to put
the output of each digit. For example, the first 1 must come after all the
0s so we have to know how many 0s there are to be able to start moving 1s
into the correct position.
1) Histogram of the number of occurrences of each digit
2) Exclusive Prefix Sum of Histogram
3) Determine relative offset of each digit
For example [0 0 1 1 0 0 1]
-> [0 1 0 1 2 3 2]
4) Combine the results of steps 2 & 3 to determine the final
output location for each element and move it there
LSB Radix sort is an out-of-place sort and you will need to ping-pong values
between the input and output buffers we have provided. Make sure the final
sorted results end up in the output buffer! Hint: You may need to do a copy
at the end.
*/ const int MAX_THREADS_PER_BLOCK = 512; /*---------------------------------------------------------------------------------*/ ///////////////////////////////////////////////////////
//--------------------- KERNELS ---------------------//
///////////////////////////////////////////////////////
__global__ void split_array(unsigned int* d_inputVals, unsigned int* d_splitVals,
const size_t numElems, unsigned int mask,
unsigned int ibit)
{ int array_idx = blockIdx.x*blockDim.x + threadIdx.x;
if (array_idx >= numElems) return; // Split based on whether inputVals digit is 1 or 0:
d_splitVals[array_idx] = !(d_inputVals[array_idx] & mask); } __global__ void blelloch_scan_single_block(unsigned int* d_in_array,
const size_t numBins,
unsigned normalization=0)
/*
Computes the blelloch exclusive scan for a cumulative distribution function of a
histogram, one block at a time.
\Params:
* d_in_array - input array of histogram values in each bin. Gets converted
to cdf by the end of the function.
* numBins - number of bins in the histogram (Must be < 2*MAX_THREADS_PER_BLOCK)
* normalization - constant value to add to all bins
(when doing full exclusive sum scan over multiple blocks).
*/
{ int thid = threadIdx.x; extern __shared__ float temp_array[]; // Make sure that we do not read from undefined part of array if it
// is smaller than the number of threads that we gave defined. If
// that is the case, the final values of the input array are
// extended to zero.
if (thid < numBins) temp_array[thid] = d_in_array[thid];
else temp_array[thid] = 0;
if( (thid + numBins/2) < numBins)
temp_array[thid + numBins/2] = d_in_array[thid + numBins/2];
else temp_array[thid + numBins/2] = 0; __syncthreads(); // Part 1: Up Sweep, reduction
// Iterate log_2(numBins) times, and each element adds value 'stride'
// elements away to its own value.
int stride = 1;
for (int d = numBins>>1; d > 0; d>>=1) { if (thid < d) {
int neighbor = stride*(2*thid+1) - 1;
int index = stride*(2*thid+2) - 1; temp_array[index] += temp_array[neighbor];
}
stride *=2;
__syncthreads();
}
// Now set last element to identity:
if (thid == 0) temp_array[numBins-1] = 0; // Part 2: Down sweep
// Iterate log(n) times. Each thread adds value stride elements away to
// its own value, and sets the value stride elements away to its own
// previous value.
for (int d=1; d<numBins; d *= 2) {
stride >>= 1;
__syncthreads(); if(thid < d) {
int neighbor = stride*(2*thid+1) - 1;
int index = stride*(2*thid+2) - 1; float t = temp_array[neighbor];
temp_array[neighbor] = temp_array[index];
temp_array[index] += t;
}
} __syncthreads(); if (thid < numBins) d_in_array[thid] = temp_array[thid] + normalization;
if ((thid + numBins/2) < numBins)
d_in_array[thid + numBins/2] = temp_array[thid + numBins/2] + normalization; } __global__ void compute_outputPos(const unsigned int* d_inputVals,
unsigned int* d_outputVals,
unsigned int* d_outputPos, unsigned int* d_tVals,
const unsigned int* d_splitVals,
const unsigned int* d_cdf, const unsigned int totalFalses,
const unsigned int numElems)
{ int thid = threadIdx.x;
int global_id = blockIdx.x*blockDim.x + thid;
if (global_id >= numElems) return; d_tVals[global_id] = global_id - d_cdf[global_id] + totalFalses; unsigned int scatter = (!(d_splitVals[global_id]) ?
d_tVals[global_id] : d_cdf[global_id] );
d_outputPos[global_id] = scatter; } __global__ void do_scatter(unsigned int* d_outputVals, const unsigned int* d_inputVals,
unsigned int* d_outputPos,
unsigned int* d_inputPos,
unsigned int* d_scatterAddr,
const unsigned int numElems)
{ int global_id = blockIdx.x*blockDim.x + threadIdx.x;
if(global_id >= numElems) return; d_outputVals[d_outputPos[global_id]] = d_inputVals[global_id];
d_scatterAddr[d_outputPos[global_id]] = d_inputPos[global_id]; } ///////////////////////////////////////////////////////////
//--------------------- END KERNELS ---------------------//
/////////////////////////////////////////////////////////// void full_blelloch_exclusive_scan(unsigned int* d_binScan, const size_t totalNumElems)
/*
NOTE: blelloch_scan_single_block() does an exclusive sum scan over
an array (balanced tree) of size 2*MAX_THREADS_PER_BLOCK, by
performing the up and down sweep of the scan in shared memory (which
is limited in size).
In order to scan over an entire array of size >
2*MAX_THREADS_PER_BLOCK, we employ the following procedure:
1) Compute total number of blocks of size 2*MAX_THREADS_PER_BLOCK
2) Loop over each block and compute a partial array of number
of bins: 2*MAX_THREADS_PER_BLOCK
3) Give this partial array to blelloch_scan_single_block() and let
it return the sum scan.
4) Now, one has a full array of partial sum scans, and then we take the
last element of the j-1 block and add it to each element of the jth
block.
\Params:
* d_binScan - starts out as the "histogram" or in this case, the
split_array that we will perform an exclusive scan over.
* totalNumElems - total number of elements in the d_binScan array to
perform an exclusive scan over.
*/
{ int nthreads = MAX_THREADS_PER_BLOCK;
int nblocksTotal = (totalNumElems/2 - 1) / nthreads + 1;
int partialBins = 2*nthreads;
int smSize = partialBins*sizeof(unsigned); // Need a balanced d_binScan array so that on final block, correct
// values are given to d_partialBinScan.
// 1. define balanced bin scan
// 2. set all values to zero
// 3. copy all of binScan into binScanBalanced.
unsigned int* d_binScanBalanced;
unsigned int balanced_size = nblocksTotal*partialBins*sizeof(unsigned);
checkCudaErrors(cudaMalloc((void**)&d_binScanBalanced, balanced_size));
checkCudaErrors(cudaMemset(d_binScanBalanced, 0, balanced_size));
checkCudaErrors(cudaMemcpy(d_binScanBalanced, d_binScan,
totalNumElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); unsigned int* d_partialBinScan;
checkCudaErrors(cudaMalloc((void**)&d_partialBinScan, partialBins*sizeof(unsigned))); unsigned int* normalization = (unsigned*)malloc(sizeof(unsigned));
unsigned int* lastVal = (unsigned*)malloc(sizeof(unsigned));
for (unsigned iblock = 0; iblock < nblocksTotal; iblock++) {
unsigned offset = iblock*partialBins; // Copy binScan Partition into partialBinScan
checkCudaErrors(cudaMemcpy(d_partialBinScan, (d_binScanBalanced + offset),
smSize, cudaMemcpyDeviceToDevice)); if (iblock > 0) {
// get normalization - final value in last cdf bin + last value in original
checkCudaErrors(cudaMemcpy(normalization, (d_binScanBalanced + (offset-1)),
sizeof(unsigned), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(lastVal, (d_binScan + (offset-1)),
sizeof(unsigned), cudaMemcpyDeviceToHost));
*normalization += (*lastVal);
} else *normalization = 0; blelloch_scan_single_block<<<1, nthreads, smSize>>>(d_partialBinScan,
partialBins,
*normalization);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // Copy partialBinScan back into binScanBalanced:
checkCudaErrors(cudaMemcpy((d_binScanBalanced+offset), d_partialBinScan, smSize,
cudaMemcpyDeviceToDevice)); } // ONE BLOCK WORKING HERE!!!
// binScanBalanced now needs to be copied into d_binScan!
checkCudaErrors(cudaMemcpy(d_binScan,d_binScanBalanced,totalNumElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); free(normalization);
free(lastVal);
checkCudaErrors(cudaFree(d_binScanBalanced));
checkCudaErrors(cudaFree(d_partialBinScan)); } void compute_scatter_addresses(const unsigned int* d_inputVals,
unsigned int* d_outputVals,
unsigned int* d_inputPos,
unsigned int* d_outputPos,
unsigned int* d_scatterAddr,
const unsigned int* const d_splitVals,
const unsigned int* const d_cdf,
const unsigned totalFalses,
const size_t numElems)
/*
Modifies d_outputVals and d_outputPos
*/
{ unsigned int* d_tVals;
checkCudaErrors(cudaMalloc((void**)&d_tVals, numElems*sizeof(unsigned))); int nthreads = MAX_THREADS_PER_BLOCK;
int nblocks = (numElems - 1) / nthreads + 1;
compute_outputPos<<<nblocks, nthreads>>>(d_inputVals, d_outputVals, d_outputPos,
d_tVals, d_splitVals, d_cdf, totalFalses,
numElems);
// Testing purposes - REMOVE in production
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); do_scatter<<<nblocks, nthreads>>>(d_outputVals, d_inputVals, d_outputPos,
d_inputPos, d_scatterAddr, numElems);
// Testing purposes - REMOVE in production
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaFree(d_tVals)); } void your_sort(unsigned int* const d_inputVals,
unsigned int* const d_inputPos,
unsigned int* const d_outputVals,
unsigned int* const d_outputPos,
const size_t numElems)
{ //-----Set up-----
const int numBits = 1;
unsigned int* d_splitVals;
checkCudaErrors(cudaMalloc((void**)&d_splitVals, numElems*sizeof(unsigned)));
unsigned int* d_cdf;
checkCudaErrors(cudaMalloc((void**)&d_cdf, numElems*sizeof(unsigned))); // d_scatterAddr keeps track of the scattered original addresses at every pass
unsigned int* d_scatterAddr;
checkCudaErrors(cudaMalloc((void**)&d_scatterAddr, numElems*sizeof(unsigned)));
checkCudaErrors(cudaMemcpy(d_scatterAddr, d_inputPos, numElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); // Need a global device array for blelloch scan:
const int nBlellochBins = 1 << unsigned(log((long double)numElems)/log((long double)2) + 0.5);
unsigned int* d_blelloch;
checkCudaErrors(cudaMalloc((void**)&d_blelloch, nBlellochBins*sizeof(unsigned)));
//printf(" numElems: %lu, numBlellochBins: %d \n",numElems, nBlellochBins); unsigned int* d_inVals = d_inputVals;
unsigned int* d_inPos = d_inputPos;
unsigned int* d_outVals = d_outputVals;
unsigned int* d_outPos = d_outputPos; // Testing purposes - also free'd at end
unsigned int* h_splitVals = (unsigned*)malloc(numElems*sizeof(unsigned));
unsigned int* h_cdf = (unsigned*)malloc(numElems*sizeof(unsigned));
unsigned int* h_inVals = (unsigned*)malloc(numElems*sizeof(unsigned));
unsigned int* h_outVals = (unsigned*)malloc(numElems*sizeof(unsigned));
unsigned int* h_inPos = (unsigned*)malloc(numElems*sizeof(unsigned));
unsigned int* h_outPos = (unsigned*)malloc(numElems*sizeof(unsigned)); // Parallel radix sort - For each pass (each bit):
// 1) Split values based on current bit
// 2) Scan values of split array
// 3) Compute scatter output position
// 4) Scatter output values using inputVals and outputPos
for(unsigned ibit = 0; ibit < 8 * sizeof(unsigned); ibit+=numBits) { checkCudaErrors(cudaMemset(d_splitVals, 0, numElems*sizeof(unsigned)));
checkCudaErrors(cudaMemset(d_cdf,0,numElems*sizeof(unsigned)));
checkCudaErrors(cudaMemset(d_blelloch,0,nBlellochBins*sizeof(unsigned))); // Step 1: Split values on True if bit matches 0 in the given bit
// NOTE: mask = [1 2 4 8 ... 2147483648]
// [2^0, 2^1,...2^31]
unsigned int mask = 1 << ibit;
int nthreads = MAX_THREADS_PER_BLOCK;
int nblocks = (numElems - 1)/nthreads + 1;
split_array<<<nblocks, nthreads>>>(d_inVals, d_splitVals, numElems,
mask, ibit);
// Testing purposes - REMOVE in production
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaMemcpy(d_cdf, d_splitVals, numElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); // Step 2: Scan values of split array:
// Uses Blelloch exclusive scan
full_blelloch_exclusive_scan(d_cdf, numElems);
// STEP 2 --> WORKING!!! VERIFIED FOR ALL STEPS! // Step 3: compute scatter addresses
// Get totalFalses:
unsigned totalFalses = 0;
checkCudaErrors(cudaMemcpy(h_splitVals, d_splitVals + (numElems-1), sizeof(unsigned),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_cdf, d_cdf + (numElems -1), sizeof(unsigned),
cudaMemcpyDeviceToHost));
totalFalses = h_splitVals[0] + h_cdf[0];
compute_scatter_addresses(d_inVals, d_outVals, d_inPos, d_outPos, d_scatterAddr,
d_splitVals, d_cdf, totalFalses, numElems); // swap pointers:
std::swap(d_inVals, d_outVals);
std::swap(d_inPos, d_scatterAddr); } // Do we need this?
checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, numElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, numElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); // Put scatter addresses (->inPos) into d_outputVals;
checkCudaErrors(cudaMemcpy(d_outputPos, d_inPos, numElems*sizeof(unsigned),
cudaMemcpyDeviceToDevice)); checkCudaErrors(cudaFree(d_splitVals));
checkCudaErrors(cudaFree(d_cdf));
checkCudaErrors(cudaFree(d_blelloch)); free(h_splitVals);
free(h_cdf);
free(h_inVals);
free(h_outVals);
free(h_inPos);
free(h_outPos);
}
上一篇:斯坦福机器学习课程 Exercise 习题二


下一篇:斯坦福机器学习课程 Exercise 习题三