196 lines
6.3 KiB
Plaintext
196 lines
6.3 KiB
Plaintext
#include <cuda.h>
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <sys/time.h>
|
|
|
|
// includes, kernels
|
|
#include "backprop.h"
|
|
#include "backprop_cuda_kernel.cu"
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
extern "C" void bpnn_layerforward(float *l1, float *l2, float **conn, int n1,
|
|
int n2);
|
|
|
|
extern "C" void bpnn_output_error(float *delta, float *target, float *output,
|
|
int nj, float *err);
|
|
|
|
extern "C" void bpnn_hidden_error(float *delta_h, int nh, float *delta_o,
|
|
int no, float **who, float *hidden,
|
|
float *err);
|
|
|
|
extern "C" void bpnn_adjust_weights(float *delta, int ndelta, float *ly,
|
|
int nly, float **w, float **oldw);
|
|
|
|
extern "C" int setup(int argc, char **argv);
|
|
|
|
extern "C" float **alloc_2d_dbl(int m, int n);
|
|
|
|
extern "C" float squash(float x);
|
|
|
|
double gettime() {
|
|
struct timeval t;
|
|
gettimeofday(&t, NULL);
|
|
return t.tv_sec + t.tv_usec * 1e-6;
|
|
}
|
|
|
|
unsigned int num_threads = 0;
|
|
unsigned int num_blocks = 0;
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Program main
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int main(int argc, char **argv) {
|
|
cudaSetDevice(0);
|
|
setup(argc, argv);
|
|
}
|
|
|
|
extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) {
|
|
int in, hid, out;
|
|
float out_err, hid_err;
|
|
|
|
in = net->input_n;
|
|
hid = net->hidden_n;
|
|
out = net->output_n;
|
|
|
|
#ifdef GPU
|
|
int m = 0;
|
|
float *input_hidden_cuda;
|
|
float *input_cuda;
|
|
float *output_hidden_cuda;
|
|
float *partial_sum;
|
|
float *hidden_partial_sum;
|
|
float *hidden_delta_cuda;
|
|
float *input_prev_weights_cuda;
|
|
float sum;
|
|
float *input_weights_one_dim;
|
|
float *input_weights_prev_one_dim;
|
|
num_blocks = in / 16;
|
|
dim3 grid(1, num_blocks);
|
|
dim3 threads(16, 16);
|
|
|
|
input_weights_one_dim = (float *)malloc((in + 1) * (hid + 1) * sizeof(float));
|
|
input_weights_prev_one_dim =
|
|
(float *)malloc((in + 1) * (hid + 1) * sizeof(float));
|
|
partial_sum = (float *)malloc(num_blocks * WIDTH * sizeof(float));
|
|
|
|
// this preprocessing stage is added to correct the bugs of wrong memcopy
|
|
// using two-dimensional net->inputweights
|
|
for (int k = 0; k <= in; k++) {
|
|
for (int j = 0; j <= hid; j++) {
|
|
input_weights_one_dim[m] = net->input_weights[k][j];
|
|
input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
|
|
m++;
|
|
}
|
|
}
|
|
|
|
cudaMalloc((void **)&input_cuda, (in + 1) * sizeof(float));
|
|
cudaMalloc((void **)&output_hidden_cuda, (hid + 1) * sizeof(float));
|
|
cudaMalloc((void **)&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
|
|
cudaMalloc((void **)&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
|
|
|
|
#endif
|
|
|
|
#ifdef CPU
|
|
|
|
printf("Performing CPU computation\n");
|
|
bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, in,
|
|
hid);
|
|
|
|
#endif
|
|
|
|
#ifdef GPU
|
|
|
|
printf("Performing GPU computation\n");
|
|
|
|
// printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
|
|
|
|
cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float),
|
|
cudaMemcpyHostToDevice);
|
|
cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
|
|
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
|
|
|
|
bpnn_layerforward_CUDA<<<grid, threads>>>(input_cuda, output_hidden_cuda,
|
|
input_hidden_cuda,
|
|
hidden_partial_sum, in, hid);
|
|
|
|
cudaThreadSynchronize();
|
|
|
|
cudaError_t error = cudaGetLastError();
|
|
if (error != cudaSuccess) {
|
|
printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
cudaMemcpy(partial_sum, hidden_partial_sum,
|
|
num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
|
|
|
|
for (int j = 1; j <= hid; j++) {
|
|
sum = 0.0;
|
|
for (int k = 0; k < num_blocks; k++) {
|
|
sum += partial_sum[k * hid + j - 1];
|
|
}
|
|
sum += net->input_weights[0][j];
|
|
net->hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
|
|
}
|
|
#endif
|
|
|
|
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights,
|
|
hid, out);
|
|
bpnn_output_error(net->output_delta, net->target, net->output_units, out,
|
|
&out_err);
|
|
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
|
|
net->hidden_weights, net->hidden_units, &hid_err);
|
|
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
|
|
net->hidden_weights, net->hidden_prev_weights);
|
|
|
|
#ifdef CPU
|
|
|
|
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
|
|
net->input_weights, net->input_prev_weights);
|
|
|
|
#endif
|
|
|
|
#ifdef GPU
|
|
|
|
cudaMalloc((void **)&hidden_delta_cuda, (hid + 1) * sizeof(float));
|
|
cudaMalloc((void **)&input_prev_weights_cuda,
|
|
(in + 1) * (hid + 1) * sizeof(float));
|
|
|
|
cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float),
|
|
cudaMemcpyHostToDevice);
|
|
cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim,
|
|
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(input_hidden_cuda, input_weights_one_dim,
|
|
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
|
|
|
|
bpnn_adjust_weights_cuda<<<grid, threads>>>(hidden_delta_cuda, hid,
|
|
input_cuda, in, input_hidden_cuda,
|
|
input_prev_weights_cuda);
|
|
|
|
cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float),
|
|
cudaMemcpyDeviceToHost);
|
|
cudaMemcpy(input_weights_one_dim, input_hidden_cuda,
|
|
(in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
|
|
|
|
for (int i = 0; i < (in + 1) * (hid + 1); i++) {
|
|
printf("%f ", input_weights_one_dim[i]);
|
|
}
|
|
printf("\n");
|
|
|
|
cudaFree(input_cuda);
|
|
cudaFree(output_hidden_cuda);
|
|
cudaFree(input_hidden_cuda);
|
|
cudaFree(hidden_partial_sum);
|
|
cudaFree(input_prev_weights_cuda);
|
|
cudaFree(hidden_delta_cuda);
|
|
|
|
free(partial_sum);
|
|
free(input_weights_one_dim);
|
|
free(input_weights_prev_one_dim);
|
|
|
|
#endif
|
|
}
|