Membersihkan semua memori yang tertinggal dalam proses inpainting.
7) Tampilkan Gambar
8) Simpan Gambar
outpuinpainting.convertTo(outpuinpainting, CV_8UC1, 255.0); imwrite("outputGPU.png", outpuinpainting);
Kode Menyimpan gambar
Untuk menyimpan gambar hasil inpainting dalam format yang benar, data terlebih dahulu
dikonversi ke tipe CV_8UC1. Parameter 255.0 pada convertTo adalah faktor skala: setiap
nilai piksel, yang berada pada rentang 0–1, dikalikan dengan 255 sehingga menempati
rentang 0–255 milik tipe 8-bit. Setelah format terpenuhi, data disimpan dengan bantuan
perintah imwrite.
Lampiran 5. Kode Lengkap Inpainting Berbasis CPU #include <iostream> #include <iomanip> #include <chrono> #include <math.h> #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/highgui/highgui.hpp" #include "opencv2/features2d/features2d.hpp" const double PI = 3.1415; using namespace std; using namespace cv; Mat src; Mat u; Mat mask;
// Window title string (not referenced by the code visible in this listing).
char window_name[] = "Cahn Hilliard Inpainting ";
// ComputeCurve: builds a rows x cols matrix from finite differences of src.
// psnr: 10*log10(255^2 / mse) of the two images.
Mat ComputeCurve(Mat src, int rows, int cols, int vartype); double psnr(Mat & img_src, Mat & img_compressed);
// mse: mean of squared per-pixel differences; both images are read as double.
double mse(Mat & img1, Mat & img2);
// ssim: block-wise structural similarity averaged over block_size x block_size tiles.
double ssim(Mat & img_src, Mat & img_compressed, int block_size, bool show_progress = false);
// sigma: standard deviation of the block_size x block_size tile whose top-left corner is (i, j).
double sigma(Mat & m, int i, int j, int block_size);
// cov: covariance E(XY) - E(X)E(Y) of the matching tiles of m1 and m2.
double cov(Mat & m1, Mat & m2, int i, int j, int block_size);
int main(void) {
double maxs = 0, mins = 0, maxm = 0, minm = 0, maxg = 0, ming = 0; Mat lambda, g;
int vartype = CV_64FC1;
src = imread("input.png", CV_LOAD_IMAGE_GRAYSCALE);
mask = imread("input-mask_i.png", CV_LOAD_IMAGE_GRAYSCALE); src.convertTo(src, vartype);
mask.convertTo(mask, vartype); minMaxLoc(src, &mins, &maxs); minMaxLoc(mask, &minm, &maxm); src = src / maxs;
mask = 1-mask / maxm; lambda = mask.clone(); g = src.clone();
minMaxLoc(g, &ming, &maxg); divide(g, maxg, g);
//parameter
int dt = 100;
double epsilon = 0.005;
double ep2 = pow(epsilon , 2); int lpower = 2; double lambda0 = 100; lambda = lambda.mul(lambda0); double c1 = 200; double c2 = 100; int Itermax = 2000; int rows = g.rows; int cols = g.cols;
u = g;
imshow("Lambda", lambda); imshow("Original Image", g); waitKey(1);
Mat Lambda1 = Mat::zeros(rows, rows, vartype); Mat Lambda2 = Mat::zeros(cols, cols, vartype); for (int j = 0; j<rows; j++){
Lambda1.at<double>(j, j) = 2 * (cos(2 * j * PI / rows) - 1); }
for (int j = 0; j<cols; j++){
Lambda2.at<double>(j, j) = 2 * (cos(2 * j * PI / cols) - 1); }
int h1 = 1; int h2 = 1;
Mat Denominator = 1 / (h1*h1)*(Lambda1*Mat::ones(rows, cols, vartype)) + 1 / (h2*h2)*(Mat::ones(rows, cols, vartype)*Lambda2);
Mat ubar, uabar, LUbar, LU0bar, LapCurvbar,curvbar; Mat curv,alpha,LU,tesfft,ivtes;
Mat complexLapCurvbar[] = { Mat::zeros(u.size(), vartype), Mat::zeros(u.size(), vartype) };
Mat complexubar[] = { Mat::zeros(u.size(), vartype), Mat::zeros(u.size(), vartype) };
dft(u, ubar, DFT_COMPLEX_OUTPUT);
dft(u.mul(lambda), LU0bar, DFT_COMPLEX_OUTPUT); LUbar = LU0bar.clone();
//LUbar = LU0bar;
std::chrono::time_point<std::chrono::system_clock> start, end; start = std::chrono::system_clock::now();
for (int i = 0; i<200; i++) {
curv = ComputeCurve(u, src.rows, src.cols, vartype); dft(curv, curvbar,DFT_COMPLEX_OUTPUT);
split(curvbar, complexLapCurvbar); complexLapCurvbar[0] = complexLapCurvbar[0].mul(Denominator); complexLapCurvbar[1] = complexLapCurvbar[1].mul(Denominator); merge(complexLapCurvbar, 2, LapCurvbar); alpha = 1 + c2*dt + dt*c1*Denominator.mul(Denominator); split(ubar, complexubar); complexubar[0] = complexubar[0].mul(alpha); complexubar[1] = complexubar[1].mul(alpha); merge(complexubar, 2, uabar);
ubar = uabar - dt*LapCurvbar + dt*(LU0bar - LUbar); tesfft = dt*LapCurvbar;
split(ubar, complexubar);
divide( complexubar[0],alpha, complexubar[0]); divide(complexubar[1], alpha, complexubar[1]); merge(complexubar, 2, ubar);
idft(ubar, u, DFT_SCALE | DFT_REAL_OUTPUT); // Applying IDFT
LU = u.mul(lambda);
//imshow("Cahn Hilliard high order Inpainting", u); //waitKey(1);
cout << "Iterasi - " << i + 1 << endl; }
end = std::chrono::system_clock::now(); u.convertTo(u, CV_8UC1, 255.0);
imshow("Cahn Hilliard high order Inpainting", u); imwrite("output.png", u);
std::chrono::duration<double> elapsed_seconds = end - start; time_t end_time = std::chrono::system_clock::to_time_t(end); struct tm timeinfo;
localtime_s(&timeinfo, &end_time);
cout << "Finished computation in " << put_time(&timeinfo, " %d-%m-%Y at %H:%M:%S") << endl;
cout << "Elapsed time: " << elapsed_seconds.count() << "s\n"; u.convertTo(u, CV_64FC1);
cout << "\nQuality Measurement" << endl; cout << "MSE : " << mse(g, u) << endl; cout << "PSNR: " << psnr(g, u) << endl;
cout << "SSIM: " << ssim(g, u, 8, false) << endl; waitKey(0);
return 0; }
// Computes, for every pixel of src, the ratio
//   (uxx*(ep2 + uy^2) - 2*ux*uy*uxy + uyy*(ep2 + ux^2)) / (ep2 + ux^2 + uy^2)^1.5
// using central differences with neighbours clamped to the image border.
//
// src     : CV_64FC1 image (read through at<double>).
// rows    : number of rows to process.
// cols    : number of columns to process.
// vartype : OpenCV type of the returned matrix.
// Returns a rows x cols matrix holding the ratio per pixel.
Mat ComputeCurve(Mat src, int rows, int cols, int vartype) {
    Mat result = Mat::zeros(rows, cols, vartype);
    const double epsilon = 0.005;
    const double ep2 = pow(epsilon, 2);

    for (int i = 0; i < rows; i++) {
        // Clamp the vertical neighbours so border rows reuse themselves.
        const int n = (i > 0) ? i - 1 : 0;
        const int s = (i < rows - 1) ? i + 1 : rows - 1;
        for (int j = 0; j < cols; j++) {
            const int w = (j > 0) ? j - 1 : 0;
            const int e = (j < cols - 1) ? j + 1 : cols - 1;

            const double centre = src.at<double>(i, j);
            const double x = (src.at<double>(i, e) - src.at<double>(i, w)) / 2;
            const double y = (src.at<double>(s, j) - src.at<double>(n, j)) / 2;
            const double xx = src.at<double>(i, w) + src.at<double>(i, e) - 2 * centre;
            const double yy = src.at<double>(s, j) + src.at<double>(n, j) - 2 * centre;
            const double dp = src.at<double>(n, w) + src.at<double>(s, e);
            const double dm = src.at<double>(n, e) + src.at<double>(s, w);
            const double xy = (dp - dm) / 4;

            const double num = xx*(ep2 + y*y) - 2 * x*y*xy + yy*(ep2 + x*x);
            const double den = pow((ep2 + x*x + y*y), 1.5);
            // The listing stored an undefined `curve`; the value is num / den,
            // matching Curv = Num / Den in the GPU kernel of the same name.
            const double curve = num / den;
            result.at<double>(i, j) = curve;
        }
    }
    return result;
}
// Peak signal-to-noise ratio in dB for a peak value of 255:
// 10 * log10(255^2 / mse(img_src, img_compressed)).
double psnr(Mat & img_src, Mat & img_compressed) {
    const int peak = 255;
    const double error = mse(img_src, img_compressed);
    const double ratio = (peak * peak) / error;
    return (10 * log10(ratio));
}
// Mean squared error between two images read as double.
// The loop bounds come from img1; img2 is indexed with the same coordinates.
double mse(Mat & img1, Mat & img2) {
    const int height = img1.rows;
    const int width = img1.cols;
    double total = 0;
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            const double a = img1.at<double>(r, c);
            const double b = img2.at<double>(r, c);
            total += (a - b) * (a - b);
        }
    }
    total /= height * width;
    return total;
}
// Structural similarity averaged over non-overlapping block_size x block_size
// tiles (8 is the typical block size).
//
// img_src, img_compressed : images to compare; C1 and C2 are derived from a
//                           peak value of 255.
// block_size              : tile edge length, must be positive.
// show_progress           : when true, prints progress and the final value.
// Returns the mean of the per-tile SSIM values.
double ssim(Mat & img_src, Mat & img_compressed, int block_size, bool show_progress) {
    CV_Assert(block_size > 0);  // block_size is used as a divisor below

    double ssim = 0;
    float C1 = (float)(0.01 * 255 * 0.01 * 255);
    float C2 = (float)(0.03 * 255 * 0.03 * 255);
    int nbBlockPerHeight = img_src.rows / block_size;
    int nbBlockPerWidth = img_src.cols / block_size;

    for (int k = 0; k < nbBlockPerHeight; k++) {
        for (int l = 0; l < nbBlockPerWidth; l++) {
            // Pixel coordinates of the tile's top-left corner.
            int m = k * block_size;
            int n = l * block_size;

            // The means must cover the same tile as sigma() and cov(); the
            // listing used the block indices k/l here instead of m/n.
            double avg_o = mean(img_src(Range(m, m + block_size), Range(n, n + block_size)))[0];
            double avg_r = mean(img_compressed(Range(m, m + block_size), Range(n, n + block_size)))[0];
            double sigma_o = sigma(img_src, m, n, block_size);
            double sigma_r = sigma(img_compressed, m, n, block_size);
            double sigma_ro = cov(img_src, img_compressed, m, n, block_size);

            ssim += ((2 * avg_o * avg_r + C1) * (2 * sigma_ro + C2)) /
                ((avg_o * avg_o + avg_r * avg_r + C1) * (sigma_o * sigma_o + sigma_r * sigma_r + C2));
        }
        // Progress is only reported when the caller asked for it.
        if (show_progress) {
            cout << "\r>>SSIM [" << (int)((((double)k) / nbBlockPerHeight) * 100) << "%]";
        }
    }
    ssim /= nbBlockPerHeight * nbBlockPerWidth;
    if (show_progress)
    {
        cout << "\r>>SSIM [100%]" << endl;
        cout << "SSIM : " << ssim << endl;
    }
    return ssim;
}
// Standard deviation of the block_size x block_size tile of m whose top-left
// corner is at row i, column j, computed as sqrt(E(x^2) - E(x)^2).
double sigma(Mat & m, int i, int j, int block_size) {
    Mat m_tmp = m(Range(i, i + block_size), Range(j, j + block_size));
    Mat m_squared(block_size, block_size, CV_64F);
    multiply(m_tmp, m_tmp, m_squared);

    // E(x)
    double avg = mean(m_tmp)[0];
    // E(x^2)
    double avg_2 = mean(m_squared)[0];

    // On a (nearly) constant tile rounding can make the difference slightly
    // negative, and sqrt of that is NaN; treat it as zero variance.
    double variance = avg_2 - avg * avg;
    if (variance < 0) {
        variance = 0;
    }
    return sqrt(variance);
}
// Covariance E(XY) - E(X)E(Y) between the block_size x block_size tiles of m1
// and m2 whose top-left corner is at row i, column j.
double cov(Mat & m1, Mat & m2, int i, int j, int block_size) {
    const Range rowSpan(i, i + block_size);
    const Range colSpan(j, j + block_size);

    Mat product = Mat::zeros(block_size, block_size, m1.depth());
    Mat tileX = m1(rowSpan, colSpan);
    Mat tileY = m2(rowSpan, colSpan);
    multiply(tileX, tileY, product);

    const double meanXY = mean(product)[0]; // E(XY)
    const double meanX = mean(tileX)[0];    // E(X)
    const double meanY = mean(tileY)[0];    // E(Y)
    return meanXY - meanY * meanX;          // E(XY) - E(X)E(Y)
}
Lampiran 6. Kode Lengkap Inpainting Pemrosesan Paralel untuk CPU #include "opencv2\core\core.hpp" #include "opencv2\opencv.hpp" #include "opencv2\highgui\highgui.hpp" #include "cuda_runtime.h" #include <stdio.h> #include <math.h> const double PI = 3.1415; using namespace std; using namespace cv;
// Converts the 8-bit image and mask to double arrays on the GPU.
// NOTE(review): the last two parameters are named (cols, rows) here, but the
// definition names them (numRows, numCols) and main() passes (rows, cols).
void gpu_image2double(unsigned char *h_src, unsigned char *h_lamda, double* h_outsrc, double *h_outlamda, int cols, int rows);
// gpu_createvariabel: no definition is visible in this listing.
// gpu_inpaintinghighorder: runs `iteration` inpainting steps on the GPU and
// copies the result into h_output and a second image into h_curve.
void gpu_createvariabel(double *h_src, double * h_out, int cols, int rows); void gpu_inpaintinghighorder(double *h_src, double *h_lamda, double *denominator, double* h_output, double * h_curve, int iteration, float *elapsedtime, int numRows, int numCols);
void main() {
printf(" CUDA Inpainting High Order Started \n"); int vartype = CV_64FC1;
//Read Data
Mat src = imread("original.png", CV_LOAD_IMAGE_COLOR); Mat mask = imread("mask.png", CV_LOAD_IMAGE_COLOR); Mat lambda, u, g;
int rows = src.rows; int cols = src.cols;
//Init Parameter
int Itermax = 1000;
Mat Lambda1 = Mat::zeros(rows, rows, vartype); Mat Lambda2 = Mat::zeros(cols, cols, vartype); for (int j = 0; j<rows; j++){
Lambda1.at<double>(j, j) = 2 * (cos(2 *j * PI / rows) - 1); }
for (int j = 0; j<cols; j++){
Lambda2.at<double>(j, j) = 2 * (cos(2 *j * PI / cols) - 1); }
int h1 = 1; int h2 = 1;
Mat Denominator = 1 / (h1*h1)*(Lambda1*Mat::ones(rows,cols, vartype)) + 1 / (h2*h2)*(Mat::ones(rows, cols, vartype)*Lambda2);
cvtColor(src, src, CV_BGR2GRAY); cvtColor(mask, mask, CV_BGR2GRAY);
//GPU JADI
unsigned char * h_src = (unsigned char*)src.data; unsigned char * h_lamda = (unsigned char*)mask.data; double *h_denominator = (double*)Denominator.data; double *h_lambda1 = (double*)Lambda1.data;
double *h_lambda2 = (double*)Lambda2.data;
double *h_doubleSrc = (double*)malloc(sizeof(double)*src.cols*src.rows); double *h_doubleLamda = (double*)malloc(sizeof(double)*src.cols*src.rows);
double *h_out = (double*)malloc(sizeof(double)*src.cols*src.rows); double *h_curve = (double*)malloc(sizeof(double)*src.cols*src.rows); float *elapsedtime = 0;
gpu_image2double(h_src, h_lamda, h_doubleSrc, h_doubleLamda, rows, cols); gpu_inpaintinghighorder(h_doubleSrc, h_doubleLamda, h_denominator,
h_out,h_curve, 2000, elapsedtime, rows,cols);
Mat outputsrc(src.rows, src.cols, vartype, h_doubleSrc); Mat outpuinpainting(src.rows, src.cols, vartype, h_out); Mat outputcurve(src.rows, src.cols, vartype, h_curve); imshow("lambda asli", mask);
printf("Print Inpainting 10 first data\n"); for (int j = 0; j < 10; j++)
{
cout << outpuinpainting.at<double>(0, j)<<" "; }
printf("\n\nPrint Src 10 first data\n"); for (int j = 0; j < 10; j++)
{
cout << outputsrc.at<double>(0, j) << " "; }
printf("\n\nPrint Curve 10 first data\n"); for (int j = 0; j < 10; j++)
{
cout << outputcurve.at<double>(0, j) << " "; }
imshow("soruce", outputsrc);
imshow("inpainting", outpuinpainting);
outpuinpainting.convertTo(outpuinpainting, CV_8UC1, 255.0); imwrite("outputGPU.png", outpuinpainting);
imshow("Curve", outputcurve);
//cout << "Elapsed time: " << elapsedtime << "s\n";
waitKey(); }
Lampiran 7. Kode Lengkap Inpainting Pemrosesan Paralel GPU #include "cuda_runtime.h" #include "cufft.h" #include "utils.h" #include <stdio.h> #include <algorithm> #include "assert.h" __constant__ int dt = 100;
// Model parameters stored in constant memory.
// NOTE(review): CahnHilliardHighOrderComplex re-declares dt, c1 and c2 as
// locals with the same values, so it does not read these symbols.
__constant__ double epsilon = 0.005;
__constant__ double ep2 =0.000025; // ep2 = epsilon^2
// lpower is kept for reference only; lambda0 below is already 10^lpower.
//__constant__ int lpower = 2;
__constant__ double lambda0 = 100; // lambda0 = 10^lpower
__constant__ double c1 = 200; // 1 / epsilon
__constant__ double c2 = 100; // c2 = lambda0
// Block-level maximum reduction over a numRows x numCols array.
//
// Each thread loads one element into dynamic shared memory slot threadIdx.x,
// the block reduces those slots with max(), and the thread with
// threadIdx.x == 0 writes the block result to d_out[blockIdx.x].
//
// Launch requirements: at least blockDim.x * sizeof(double) bytes of dynamic
// shared memory.
// NOTE(review): the shared slot and the output slot ignore threadIdx.y and
// blockIdx.y, so this looks designed for blockDim.y == 1 and gridDim.y == 1 —
// confirm with the caller (no launch is visible in this listing).
__global__
void maxreduce(double * d_in, double * d_out, int numRows, int numCols) {
    extern __shared__ double sdata[];

    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; // current row
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; // current column
    const unsigned int tid = threadIdx.x;
    const bool inside = (row < numRows) && (col < numCols);

    // Out-of-range threads must not return early: every thread of the block
    // has to reach the __syncthreads() calls below. They contribute the lowest
    // finite double so they can never win the max().
    const double lowest = -1.7976931348623157e308;
    sdata[tid] = inside ? d_in[col + numCols * row] : lowest;
    __syncthreads();

    // Reduction in shared memory.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // The partner index is checked so a block width that is not a power
        // of two never reads past the loaded slots.
        if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
            sdata[tid] = max(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();
    }

    // Write the result for this block; as before, only an in-range thread writes.
    if (tid == 0 && inside) d_out[blockIdx.x] = sdata[0];
}
// Converts an 8-bit channel to double by dividing every value by 255.
// One thread per pixel on a 2D grid; threads outside the image do nothing.
__global__
void Convert2double(unsigned char* const inputChannel, double* const outputChannel, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double raw = inputChannel[pixel];
        outputChannel[pixel] = raw / 255;
    }
}
// Writes u .* lambda into the real part of clubar and zeroes the imaginary
// part. One thread per pixel on a 2D grid.
// The listing lacked the __global__ qualifier although the body reads
// blockIdx/threadIdx like the neighbouring kernels.
__global__
void computelubarcomplex(double *u, double* lambda, cufftDoubleComplex *clubar, int numRows, int numCols)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y; // current row
    int col = blockIdx.x * blockDim.x + threadIdx.x; // current column
    if ((row >= numRows) || (col >= numCols)) {
        return;
    }
    int index = col + numCols * row; // current pixel index
    clubar[index].x = u[index] * lambda[index];
    clubar[index].y = 0.0;
}
// Stores the element-wise product u .* lambda as a complex value with zero
// imaginary part. One thread per pixel on a 2D grid.
__global__
void computeLU(double *u, double* lambda, cufftDoubleComplex* lubar, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double weighted = u[pixel] * lambda[pixel];
        lubar[pixel].x = weighted;
        lubar[pixel].y = 0;
    }
}
// Multiplies both components of curve by the real factor Denominator and
// stores the product in result. One thread per pixel on a 2D grid.
__global__
void computelapcurvbar(cufftDoubleComplex * curve, double *Denominator, cufftDoubleComplex * result, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double realPart = curve[pixel].x * Denominator[pixel];
        const double imagPart = curve[pixel].y * Denominator[pixel];
        result[pixel].x = realPart;
        result[pixel].y = imagPart;
    }
}

// Copies lu0bar into lubar element by element: the first argument is the
// source and the second the destination. One thread per pixel on a 2D grid.
__global__
void copyData(cufftDoubleComplex * lu0bar, cufftDoubleComplex * lubar, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const cufftDoubleComplex value = lu0bar[pixel];
        lubar[pixel] = value;
    }
}
// Converts an 8-bit mask to the lambda weights: (1 - value/255) * lambda0.
// One thread per pixel on a 2D grid; threads outside the image do nothing.
__global__
void Inverse2double(unsigned char* const inputChannel, double* const outputChannel, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double raw = inputChannel[pixel];
        outputChannel[pixel] = (1 - raw / 255) * lambda0; // lambda = lambda .* lambda0
    }
}
// Copies a real array into the real part of a complex array and zeroes the
// imaginary part. One thread per pixel on a 2D grid.
__global__ void real2complex(double * in, cufftDoubleComplex *out, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        out[pixel].x = in[pixel];
        out[pixel].y = 0.0f;
    }
}
// Takes the real part of each complex element and divides it by
// numRows * numCols. One thread per pixel on a 2D grid.
__global__ void complex2real(cufftDoubleComplex *in, double * out, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double elementCount = (double)numRows*(double)numCols;
        out[pixel] = in[pixel].x / elementCount;
    }
}
// Multiplies both components of the complex value a by the real factor s.
static __device__ __host__ inline cufftDoubleComplex ComplexScale(cufftDoubleComplex a, double s)
{
    return make_cuDoubleComplex(s * a.x, s * a.y);
}
// Component-wise complex subtraction a - b.
static __device__ __host__ inline cufftDoubleComplex ComplexSub(cufftDoubleComplex a, cufftDoubleComplex b)
{
    return make_cuDoubleComplex(a.x - b.x, a.y - b.y);
}
// Component-wise complex addition a + b.
static __device__ __host__ inline cufftDoubleComplex ComplexAdd(cufftDoubleComplex a, cufftDoubleComplex b)
{
    cufftDoubleComplex c;
    c.x = a.x + b.x;
    c.y = a.y + b.y;
    return c;
}

// Divides both components of the complex value a by the real divisor s.
static __device__ __host__ inline cufftDoubleComplex ComplexScaleDiv(cufftDoubleComplex a, double s)
{
    cufftDoubleComplex c;
    c.x = a.x / s;
    c.y = a.y / s;
    return c;
}

// Computes, for every pixel of u, the ratio
//   (uxx*(ep2 + uy^2) - 2*ux*uy*uxy + uyy*(ep2 + ux^2)) / (ep2 + ux^2 + uy^2)^1.5
// with neighbours clamped to the image border, and stores it both as a
// complex value with zero imaginary part (curv) and as a real value (cur).
//
// One thread per pixel on a 2D grid. `size` (numRows*numCols in the visible
// caller) is kept for interface compatibility; clamping now uses row/col.
__global__
void ComputeCurve(double *u, cufftDoubleComplex *curv, double *cur, int size, int numRows, int numCols)
{
    (void)size;
    const int row = blockIdx.y * blockDim.y + threadIdx.y; // current row
    const int col = blockIdx.x * blockDim.x + threadIdx.x; // current column
    if ((row >= numRows) || (col >= numCols)) {
        return;
    }

    // Clamp neighbour coordinates so border pixels reuse themselves. This
    // covers the first-row case that the listing lost inside a comment
    // (leaving an `else` without an `if` and out-of-range diagonal reads).
    const int rowUp    = (row > 0) ? row - 1 : 0;
    const int rowDown  = (row < numRows - 1) ? row + 1 : numRows - 1;
    const int colLeft  = (col > 0) ? col - 1 : 0;
    const int colRight = (col < numCols - 1) ? col + 1 : numCols - 1;

    const int index = col + numCols * row;
    const double centre    = u[index];
    const double left      = u[colLeft  + numCols * row];
    const double right     = u[colRight + numCols * row];
    const double up        = u[col + numCols * rowUp];
    const double down      = u[col + numCols * rowDown];
    const double upLeft    = u[colLeft  + numCols * rowUp];
    const double upRight   = u[colRight + numCols * rowUp];
    const double downLeft  = u[colLeft  + numCols * rowDown];
    const double downRight = u[colRight + numCols * rowDown];

    // Central differences. x is (right - left)/2, matching the CPU
    // ComputeCurve; the listing had the operands swapped, which flips the
    // sign of the mixed term below.
    const double x = (right - left) / 2;
    const double y = (down - up) / 2;
    const double xx = left + right - 2 * centre;
    const double yy = up + down - 2 * centre;
    const double Dp = upLeft + downRight;
    const double Dm = upRight + downLeft;
    const double xy = (Dp - Dm) / 4;

    const double Num = xx * (ep2 + y * y) - 2 * x * y * xy + yy * (ep2 + x * x);
    // pow keeps double precision; powf would round through float.
    const double Den = pow((ep2 + x * x + y * y), 1.5);
    const double Curv = Num / Den;

    // Complex copy for the FFT, real copy for inspection.
    curv[index].x = Curv;
    curv[index].y = 0;
    cur[index] = Curv;
    // No barrier here: no shared memory is used, and a __syncthreads() after
    // the early return above would not be reached by every thread.
}

// Divides the real part of din by numRows * numCols and stores it in dout.
// One thread per pixel on a 2D grid.
__global__
void normalize(cufftDoubleComplex *din, double * dout, int numRows, int numCols) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y; // current row
    const int col = blockIdx.x * blockDim.x + threadIdx.x; // current column
    if ((row >= numRows) || (col >= numCols)) {
        return;
    }
    const int index = col + numCols * row;
    // The element count is formed in double so it cannot overflow int.
    dout[index] = din[index].x / ((double)numCols * (double)numRows);
}
// Real-valued semi-implicit update, one thread per element on a 2D grid:
//   newubar = (alpha*ubar - dt*lapcurvbar + dt*(lu0bar - lubar)) / alpha
// with alpha = 1 + c2*dt + dt*c1*Denominator^2, using the file-level
// __constant__ dt, c1 and c2.
__global__
void CahnHilliardHighOrder(double * ubar, double * Denominator, double * lapcurvbar, double * lu0bar, double * lubar, double *newubar, int numRows, int numCols)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double scaledState = (1 + c2*dt + dt*c1*Denominator[pixel]*Denominator[pixel])*ubar[pixel];
        const double numerator = scaledState - dt*lapcurvbar[pixel] + dt*(lu0bar[pixel] - lubar[pixel]);
        const double divisor = 1 + dt*c2 + dt*c1*Denominator[pixel] * Denominator[pixel];
        newubar[pixel] = numerator / divisor;
    }
}
// Complex semi-implicit update, one thread per element on a 2D grid:
//   newubar = (alpha*ubar - (dt*lapcurvbar + dt*(lubar - lu0bar))) / alpha
// with alpha = 1 + c2*dt + dt*c1*Denominator^2. temp receives lubar - lu0bar.
// dt, c1 and c2 are local copies that shadow the file-level constants.
__global__
void CahnHilliardHighOrderComplex(cufftDoubleComplex * ubar, double * Denominator, cufftDoubleComplex * lapcurvbar, cufftDoubleComplex * lu0bar, cufftDoubleComplex * lubar, cufftDoubleComplex *newubar, cufftDoubleComplex *temp, int numRows, int numCols)
{
    double dt = 100;
    double c1 = 200;
    double c2 = 100;

    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row
    if (y < numRows && x < numCols) {
        const int pixel = y * numCols + x;
        const double alpha = 1 + c2*dt + dt*c1*(Denominator[pixel] * Denominator[pixel]);

        const cufftDoubleComplex fidelityGap = ComplexSub(lubar[pixel], lu0bar[pixel]);
        const cufftDoubleComplex scaledState = ComplexScale(ubar[pixel], alpha);
        const cufftDoubleComplex curvatureStep = ComplexScale(lapcurvbar[pixel], dt);
        const cufftDoubleComplex fidelityStep = ComplexScale(ComplexSub(lubar[pixel], lu0bar[pixel]), dt);

        cufftDoubleComplex updated = ComplexSub(scaledState, ComplexAdd(curvatureStep, fidelityStep));
        updated = ComplexScaleDiv(updated, alpha);

        newubar[pixel] = updated;
        temp[pixel] = fidelityGap;
    }
}
// Converts an 8-bit image and mask to double arrays on the GPU.
//
// h_src      : host image, numRows*numCols bytes.
// h_lamda    : host mask, numRows*numCols bytes.
// h_outsrc   : host output, receives image / 255.
// h_outlamda : host output, receives (1 - mask/255) * lambda0.
// All device buffers are allocated and released inside this call; CUDA
// failures are reported through checkCudaErrors.
void gpu_image2double(unsigned char *h_src, unsigned char *h_lamda, double* h_outsrc, double *h_outlamda, int numRows, int numCols)
{
    double *d_outlamda, *d_outsrc;
    unsigned char *d_src, *d_lamda;
    // size_t keeps the byte counts from overflowing int on large images.
    const size_t size = (size_t)numRows * (size_t)numCols;
    const size_t bytesIn = sizeof(unsigned char) * size;
    const size_t bytesOut = sizeof(double) * size;
    const dim3 blockSize(32, 32, 1);
    const dim3 gridSize((numCols / 32) + 1, (numRows / 32) + 1, 1);

    checkCudaErrors(cudaMalloc(&d_src, bytesIn));
    checkCudaErrors(cudaMalloc(&d_lamda, bytesIn));
    checkCudaErrors(cudaMalloc(&d_outlamda, bytesOut));
    checkCudaErrors(cudaMalloc(&d_outsrc, bytesOut));

    checkCudaErrors(cudaMemcpy(d_src, h_src, bytesIn, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_lamda, h_lamda, bytesIn, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemset(d_outlamda, 0, bytesOut));
    checkCudaErrors(cudaMemset(d_outsrc, 0, bytesOut));

    // A kernel launch returns no status, so launch errors are picked up
    // explicitly right after each launch.
    Convert2double << <gridSize, blockSize >> >(d_src, d_outsrc, numRows, numCols);
    checkCudaErrors(cudaGetLastError());
    Inverse2double << <gridSize, blockSize >> >(d_lamda, d_outlamda, numRows, numCols);
    checkCudaErrors(cudaGetLastError());

    // The blocking copies also wait for the kernels above to finish.
    checkCudaErrors(cudaMemcpy(h_outsrc, d_outsrc, bytesOut, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_outlamda, d_outlamda, bytesOut, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(d_src));
    checkCudaErrors(cudaFree(d_lamda));
    checkCudaErrors(cudaFree(d_outlamda));
    checkCudaErrors(cudaFree(d_outsrc));
}
// Runs one double-complex cuFFT transform and prints `message` to stderr
// when it fails. Returns true on success.
static bool execZ2Z(cufftHandle plan, cufftDoubleComplex *in, cufftDoubleComplex *out, int direction, const char *message)
{
    if (cufftExecZ2Z(plan, in, out, direction) != CUFFT_SUCCESS) {
        fprintf(stderr, "%s", message);
        return false;
    }
    return true;
}

// Runs the high-order Cahn-Hilliard inpainting on the GPU.
//
// h_src         : host image as double, numRows*numCols values.
// h_lamda       : host lambda weights, same size.
// h_denominator : host Fourier-domain denominator, same size.
// h_output      : host output, receives the inpainted image.
// h_curve       : host output, receives the normalised inverse transform of
//                 (LUbar - LU0bar) from the last iteration.
// iteration     : number of update steps.
// elapsedTime   : optional; when non-null receives the loop time in ms.
//
// On a cuFFT failure the message is printed, the outputs are left untouched
// and every device resource is still released.
void gpu_inpaintinghighorder(double *h_src, double *h_lamda, double *h_denominator, double* h_output, double* h_curve, int iteration, float *elapsedTime, int numRows, int numCols)
{
    const dim3 blockSize(32, 32, 1);
    const dim3 gridSize((numCols / 32) + 1, (numRows / 32) + 1, 1);
    const int sizedata = numRows*numCols;
    // size_t keeps the byte counts from overflowing int on large images.
    const size_t sizedouble = sizeof(cufftDoubleReal) * (size_t)sizedata;
    const size_t sizecomplex = sizeof(cufftDoubleComplex) * (size_t)sizedata;

    cufftDoubleReal *d_u = NULL, *d_lambda = NULL, *d_denominator = NULL, *curve = NULL;
    cufftDoubleComplex *cu = NULL, *cubar = NULL, *cLU0bar = NULL, *cLU = NULL, *cLUbar = NULL;
    cufftDoubleComplex *ccurve = NULL, *ccurvebar = NULL, *lapcurvbar = NULL, *cresult = NULL, *cresult2 = NULL;

    // The listing used start/stop without declaring them.
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Only buffers that are actually used are allocated.
    checkCudaErrors(cudaMalloc(&d_u, sizedouble));
    checkCudaErrors(cudaMalloc(&d_lambda, sizedouble));
    checkCudaErrors(cudaMalloc(&d_denominator, sizedouble));
    checkCudaErrors(cudaMalloc(&curve, sizedouble));
    checkCudaErrors(cudaMalloc(&cu, sizecomplex));
    checkCudaErrors(cudaMalloc(&cubar, sizecomplex));
    checkCudaErrors(cudaMalloc(&cLU0bar, sizecomplex));
    checkCudaErrors(cudaMalloc(&cLU, sizecomplex));
    checkCudaErrors(cudaMalloc(&cLUbar, sizecomplex));
    checkCudaErrors(cudaMalloc(&ccurve, sizecomplex));
    checkCudaErrors(cudaMalloc(&ccurvebar, sizecomplex));
    checkCudaErrors(cudaMalloc(&lapcurvbar, sizecomplex));
    checkCudaErrors(cudaMalloc(&cresult, sizecomplex));
    checkCudaErrors(cudaMalloc(&cresult2, sizecomplex));
    // cresult2 is only written inside the loop; zero it so iteration == 0
    // still yields a defined h_curve.
    checkCudaErrors(cudaMemset(cresult2, 0, sizecomplex));

    checkCudaErrors(cudaMemcpy(d_u, h_src, sizedouble, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_lambda, h_lamda, sizedouble, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_denominator, h_denominator, sizedouble, cudaMemcpyHostToDevice));

    cufftHandle pFFT, pIFFT;
    bool haveFFT = false, haveIFFT = false;
    bool ok = true;

    if (cufftPlan2d(&pFFT, numRows, numCols, CUFFT_Z2Z) != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error:01.FFT Plan creation Real 2 Complex failed");
        ok = false;
    } else {
        haveFFT = true;
    }
    if (ok) {
        if (cufftPlan2d(&pIFFT, numRows, numCols, CUFFT_Z2Z) != CUFFT_SUCCESS) {
            fprintf(stderr, "CUFFT error:02.IFFT Plan creation Complex 2 Real failed");
            ok = false;
        } else {
            haveIFFT = true;
        }
    }

    // LU0bar = fft2(lambda .* u0)
    if (ok) {
        computeLU << <gridSize, blockSize >> >(d_u, d_lambda, cLU, numRows, numCols);
        checkCudaErrors(cudaGetLastError());
        ok = execZ2Z(pFFT, cLU, cLUbar, CUFFT_FORWARD, "CUFFT error: Exec FFT lubar to Complex lubar failed");
    }

    // ubar = fft2(u0)
    if (ok) {
        copyData << <gridSize, blockSize >> >(cLUbar, cLU0bar, numRows, numCols); // keep LU0bar
        // cu must hold u before it is transformed; the listing transformed the
        // freshly allocated, uninitialised buffer.
        real2complex << <gridSize, blockSize >> >(d_u, cu, numRows, numCols);
        checkCudaErrors(cudaGetLastError());
        ok = execZ2Z(pFFT, cu, cubar, CUFFT_FORWARD, "CUFFT error: Exec FFT u to Complex ubar failed");
    }

    if (ok) {
        checkCudaErrors(cudaEventRecord(start));
        for (int i = 0; ok && i < iteration; i++) {
            ComputeCurve << < gridSize, blockSize >> >(d_u, ccurve, curve, sizedata, numRows, numCols);
            checkCudaErrors(cudaGetLastError());
            ok = execZ2Z(pFFT, ccurve, ccurvebar, CUFFT_FORWARD, "CUFFT error: Exec FFT curv to Complex failed");
            if (!ok) break;

            // lapcurvbar = Denominator .* fft2(curv)
            computelapcurvbar << <gridSize, blockSize >> >(ccurvebar, d_denominator, lapcurvbar, numRows, numCols);
            CahnHilliardHighOrderComplex << <gridSize, blockSize >> >(cubar, d_denominator, lapcurvbar, cLU0bar, cLUbar, cresult, cresult2, numRows, numCols);
            checkCudaErrors(cudaGetLastError());
            ok = execZ2Z(pIFFT, cresult, cu, CUFFT_INVERSE, "CUFFT error: Exec IFFT curv to Complex failed");
            if (!ok) break;

            copyData << <gridSize, blockSize >> >(cresult, cubar, numRows, numCols);      // ubar <- new ubar
            complex2real << <gridSize, blockSize >> >(cu, d_u, numRows, numCols);         // u <- normalised ifft
            computeLU << <gridSize, blockSize >> >(d_u, d_lambda, cLU, numRows, numCols); // new LU
            checkCudaErrors(cudaGetLastError());
            ok = execZ2Z(pFFT, cLU, cLUbar, CUFFT_FORWARD, "CUFFT error: Exec IFFT lubar to Complex failed");
        }
    }

    if (ok) {
        ok = execZ2Z(pIFFT, cresult2, cresult2, CUFFT_INVERSE, "CUFFT error: Exec IFFT u to Complex result failed");
    }

    if (ok) {
        checkCudaErrors(cudaEventRecord(stop));
        normalize << <gridSize, blockSize >> >(cresult2, curve, numRows, numCols);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaMemcpy(h_output, d_u, sizedouble, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(h_curve, curve, sizedouble, cudaMemcpyDeviceToHost));

        checkCudaErrors(cudaEventSynchronize(stop));
        float elapsedTime1 = 0;
        checkCudaErrors(cudaEventElapsedTime(&elapsedTime1, start, stop));
        printf("GPU: %f ms", elapsedTime1);
        if (elapsedTime != NULL) {
            *elapsedTime = elapsedTime1;
        }
    }

    // Release everything on both the success and the failure path.
    if (haveFFT) cufftDestroy(pFFT);
    if (haveIFFT) cufftDestroy(pIFFT);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_u);
    cudaFree(d_lambda);
    cudaFree(d_denominator);
    cudaFree(curve);
    cudaFree(cu);
    cudaFree(cubar);
    cudaFree(cLU0bar);
    cudaFree(cLU);
    cudaFree(cLUbar);
    cudaFree(ccurve);
    cudaFree(ccurvebar);
    cudaFree(lapcurvbar);
    cudaFree(cresult);
    cudaFree(cresult2);
}