// Source listing metadata (from file viewer): 97 lines, 2.5 KiB, plaintext.
#ifndef _BACKPROP_CUDA_KERNEL_H_
|
|
#define _BACKPROP_CUDA_KERNEL_H_
|
|
|
|
#include "backprop.h"
|
|
#include "cuda.h"
|
|
#include "math.h"
|
|
#include <stdio.h>
|
|
|
|
/**
 * Per-block partial sums of (input * weight) products.
 *
 * Thread (tx, ty) of block `by` reads one weight from input_hidden_cuda,
 * scales it by the input value selected by (by, ty), and stores the product in
 * a shared HEIGHT x WIDTH tile. The tile is then summed down the ty dimension
 * so that row 0 holds one total per column.
 *
 * Indexing implies (NOTE(review): confirm against the launch site):
 *   - blockDim.x <= WIDTH and blockDim.y <= HEIGHT (shared tile is indexed
 *     [ty][tx]); blocks are enumerated along gridDim.y only.
 *   - HEIGHT is a power of two; otherwise the pairwise reduction reads past the
 *     tile's last row.
 *   - blockDim.y <= WIDTH, because the result is read back as tile[0][ty].
 *
 * @param input_cuda          input values; element HEIGHT*by + ty + 1 is used
 *                            (index 0 is skipped).
 * @param output_hidden_cuda  not referenced by this kernel.
 * @param input_hidden_cuda   weights, row stride (hid + 1); row 0 and column 0
 *                            are skipped. Overwritten with the (partially
 *                            reduced) contents of the shared tile.
 * @param hidden_partial_sum  output; element by*hid + ty receives the column-ty
 *                            total of this block.
 * @param in                  not referenced by this kernel.
 * @param hid                 used only for index arithmetic (row stride hid + 1,
 *                            output stride hid).
 */
__global__ void bpnn_layerforward_CUDA(float *input_cuda,
                                       float *output_hidden_cuda,
                                       float *input_hidden_cuda,
                                       float *hidden_partial_sum, int in,
                                       int hid) {
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Flat offset of this thread's weight: skip row 0 (+ (hid + 1)) and
  // column 0 (+ 1), then address row HEIGHT*by + ty, column tx.
  int index = (hid + 1) * HEIGHT * by + (hid + 1) * ty + tx + 1 + (hid + 1);

  // Input element for this row; the +1 skips element 0.
  int index_in = HEIGHT * by + ty + 1;

  __shared__ float input_node[HEIGHT];
  __shared__ float weight_matrix[HEIGHT][WIDTH];

  // One thread per row stages the input value for the whole row.
  if (tx == 0)
    input_node[ty] = input_cuda[index_in];

  // input_node[ty] is written by thread (0, ty) and read by every tx.
  __syncthreads();

  // Each thread touches only its own tile cell here, so the load and the
  // multiply need no barrier between them.
  weight_matrix[ty][tx] = input_hidden_cuda[index] * input_node[ty];

  __syncthreads();

  // Pairwise reduction over ty using integer strides 2, 4, ..., <= HEIGHT.
  // Integer arithmetic replaces the approximate __log2f/__powf intrinsics so
  // the stride sequence can never be perturbed by float rounding.
  for (int stride = 2; stride <= HEIGHT; stride <<= 1) {
    // stride is a power of two, so the mask test equals ty % stride == 0.
    if ((ty & (stride - 1)) == 0)
      weight_matrix[ty][tx] =
          weight_matrix[ty][tx] + weight_matrix[ty + stride / 2][tx];

    // Barrier is outside the branch so every thread in the block reaches it.
    __syncthreads();
  }

  // Original behaviour preserved: the tile (row 0 = totals, other rows =
  // intermediate values) is written back over the weights in global memory.
  input_hidden_cuda[index] = weight_matrix[ty][tx];

  // Row 0 is already visible to all threads: either the loop ended on a
  // barrier, or (HEIGHT < 2) the barrier before the loop covers it.
  if (tx == 0) {
    // tx is 0 here, so this reads tile[0][ty]: the total for column ty.
    hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
  }
}
|
|
|
|
/**
 * Applies a momentum-style update to the weight array `w`, recording each
 * step in `oldw`.
 *
 * For thread (tx, ty) of block `by`, with r = HEIGHT*by + ty + 1 and
 * c = tx + 1:
 *   step          = ETA * delta[c] * ly[r] + MOMENTUM * oldw[r*(hid+1) + c]
 *   w[r*(hid+1)+c] += step
 *   oldw[r*(hid+1)+c] = step
 * Threads with ty == 0 in block 0 additionally update row 0 (elements
 * 1..blockDim.x of w / oldw) with the same formula minus the ly factor
 * (presumably the bias row; confirm against the host code).
 *
 * ETA and MOMENTUM are defined outside this block (the original author's note
 * says eta = 0.3, momentum = 0.3).
 *
 * @param delta  per-column deltas, read at tx + 1.
 * @param hid    used only for index arithmetic (row stride hid + 1).
 * @param ly     per-row layer values, read at HEIGHT*by + ty + 1.
 * @param in     not referenced by this kernel.
 * @param w      weights, updated in place.
 * @param oldw   previous update steps, overwritten with the new steps.
 */
__global__ void bpnn_adjust_weights_cuda(float *delta, int hid, float *ly,
                                         int in, float *w, float *oldw) {
  const int block_row = blockIdx.y;
  const int col = threadIdx.x;
  const int row = threadIdx.y;

  // Column and row positions, both shifted by one to skip element 0.
  const int delta_idx = col + 1;
  const int ly_idx = HEIGHT * block_row + row + 1;

  // Flat weight offset: row ly_idx of a matrix with (hid + 1) columns.
  const int w_idx = (hid + 1) * ly_idx + delta_idx;

  // The step is evaluated twice (once per statement), exactly as written
  // originally; oldw[w_idx] is only overwritten by the second statement.
  w[w_idx] +=
      ((ETA * delta[delta_idx] * ly[ly_idx]) + (MOMENTUM * oldw[w_idx]));
  oldw[w_idx] =
      ((ETA * delta[delta_idx] * ly[ly_idx]) + (MOMENTUM * oldw[w_idx]));

  __syncthreads();

  // Only the first thread row of the first block handles row 0; no barrier
  // follows, so the remaining threads can leave here.
  if (row != 0 || block_row != 0)
    return;

  w[delta_idx] += ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
  oldw[delta_idx] = ((ETA * delta[delta_idx]) + (MOMENTUM * oldw[delta_idx]));
}
|
|
#endif
|