# -*- coding: utf-8 -*-
from ..core import *
from ..autograd import *
from ..functions import listconcat
class RNNCell(Function):
    @staticmethod
def forward(x, h, weight_x, weight_h, bias_x, bias_h):
'''
Shape:
- x: [N, input_size]
- h: [N, hidden_size]
- weight_x: [input_size, hidden_size]
- weight_h: [hidden_size, hidden_size]
- bias_x: [hidden_size]
- bias_h: [hidden_size]
- Output: [N, hidden_size]
'''
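        # Cell equation as implemented below (a reading aid, not extra computation):
        #   h_next = tanh(x @ weight_x + h @ weight_h + bias_x + bias_h)
        # with the biases skipped when either is None.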
if bias_x is None or bias_h is None:
tmp = np.add(np.dot(h.data, weight_h.data), np.dot(x.data, weight_x.data))
result = Tensor(np.tanh(tmp))
result.set_creator(RNNCell.prepare(result.shape, x, h, weight_x, weight_h, bias=False, tmp=result.data))
x.child.append(id(result.creator))
h.child.append(id(result.creator))
weight_x.child.append(id(result.creator))
weight_h.child.append(id(result.creator))
else:
tmp = np.add(np.add(np.dot(h.data, weight_h.data), bias_x.data), np.add(np.dot(x.data, weight_x.data), bias_h.data))
result = Tensor(np.tanh(tmp))
result.set_creator(RNNCell.prepare(result.shape, x, h, weight_x, weight_h, bias_x, bias_h, bias=True, tmp=result.data))
x.child.append(id(result.creator))
h.child.append(id(result.creator))
weight_x.child.append(id(result.creator))
weight_h.child.append(id(result.creator))
bias_x.child.append(id(result.creator))
bias_h.child.append(id(result.creator))
return result
    def calc_grad(self, dh_next):
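        # dh_next is the gradient flowing in from the next time step / the loss.
        # kwargs['tmp'] stores tanh(a) from forward, so tanh'(a) = 1 - tanh(a)**2.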
dt = np.multiply(dh_next, np.subtract(1, np.square(self.kwargs['tmp'])))
dw_x = np.dot(self.var[0].data.T, dt)
dw_h = np.dot(self.var[1].data.T, dt)
dx = np.dot(dt, self.var[2].data.T)
dh = np.dot(dt, self.var[3].data.T)
if not self.kwargs['bias']:
return dx, dh, dw_x, dw_h
else:
db_x = RNNCell.handle_broadcast(dt, self.var[4])
db_h = RNNCell.handle_broadcast(dt, self.var[5])
return dx, dh, dw_x, dw_h, db_x, db_h
rnncell = RNNCell(None)
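# Usage sketch (illustration only; assumes Tensor wraps a NumPy-compatible array as in
# ..core and that shapes follow the docstring above):
#   x   = Tensor(np.random.randn(8, 16))    # [N, input_size]
#   h0  = Tensor(np.zeros((8, 32)))         # [N, hidden_size]
#   w_x = Tensor(np.random.randn(16, 32))   # [input_size, hidden_size]
#   w_h = Tensor(np.random.randn(32, 32))   # [hidden_size, hidden_size]
#   h1  = rnncell(x, h0, w_x, w_h, None, None)   # -> [8, 32]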
# TODO: optimize backprop
class RNN(Function):
    @staticmethod
def forward(x, h, weight_x, weight_h, bias_x, bias_h, num_layers):
'''
Shape:
- x: [seq_len, N, input_size]
- h: [num_layers, N, hidden_size]
- Output: [seq_len, N, hidden_size]
- Hidden: [num_layers, N, hidden_size]
'''
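        # tmp[l] holds the full output sequence of layer l (tmp[0] is the input x).
        # Each layer is unrolled over seq_len steps with its own weight_x[l] / weight_h[l],
        # and the final hidden state of every layer is collected into h_out.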
seq_len = x.shape[0]
h_out = []
tmp = [x]
for l in range(num_layers):
hx = h[l]
tmp.append([])
for i in range(seq_len):
if bias_x is None or bias_h is None:
hx = rnncell(tmp[l][i], hx, weight_x[l], weight_h[l], None, None)
tmp[l+1].append(hx)
else:
hx = rnncell(tmp[l][i], hx, weight_x[l], weight_h[l], bias_x[l], bias_h[l])
tmp[l+1].append(hx)
h_out.append(hx)
result_x = listconcat(tmp[-1])
result_h = listconcat(h_out)
return result_x, result_h
rnn = RNN(None)
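# Usage sketch for the multi-layer wrapper (illustration only; weights are indexed per
# layer inside forward, so a list of per-layer Tensors is assumed here):
#   x   = Tensor(np.random.randn(5, 8, 16))                                    # [seq_len, N, input_size]
#   h0  = Tensor(np.zeros((2, 8, 32)))                                         # [num_layers, N, hidden_size]
#   w_x = [Tensor(np.random.randn(16, 32)), Tensor(np.random.randn(32, 32))]   # layer 0, layer 1
#   w_h = [Tensor(np.random.randn(32, 32)) for _ in range(2)]
#   out, hn = rnn(x, h0, w_x, w_h, None, None, 2)   # out: [5, 8, 32], hn: [2, 8, 32]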
class LSTMCell(Function):
def __init__(self, output_shape_h, output_shape_c, *args, **kwargs):
super().__init__(output_shape_h, *args, **kwargs)
self.output_shape_c = output_shape_c
    @staticmethod
def forward(x, h, c, weight_x, weight_h, bias_x, bias_h):
'''
Shape:
- x: [N, input_size]
- h: [N, hidden_size]
- c: [N, hidden_size]
- weight_x: [input_size, 4*hidden_size]
- weight_h: [hidden_size, 4*hidden_size]
- bias_x: [4*hidden_size]
- bias_h: [4*hidden_size]
- Output_h: [N, hidden_size]
- Output_c: [N, hidden_size]
'''
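        # Gate layout of the fused projection tmp = x @ weight_x + h @ weight_h (+ biases),
        # with H = hidden_size:
        #   columns [0:H)    -> f (forget gate, sigmoid)
        #   columns [H:2H)   -> g (cell candidate, tanh)
        #   columns [2H:3H)  -> i (input gate, sigmoid)
        #   columns [3H:4H)  -> o (output gate, sigmoid)
        # c_next = f * c + i * g ;  h_next = o * tanh(c_next)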
hidden_size = h.shape[1]
if bias_x is None or bias_h is None:
tmp = np.add(np.dot(h.data, weight_h.data), np.dot(x.data, weight_x.data))
else:
tmp = np.add(np.add(np.dot(h.data, weight_h.data), bias_x.data), np.add(np.dot(x.data, weight_x.data), bias_h.data))
f = np.divide(1, np.add(1, np.exp(np.negative(tmp[:,:hidden_size]))))
g = np.tanh(tmp[:, hidden_size:2*hidden_size])
i = np.divide(1, np.add(1, np.exp(np.negative(tmp[:,2*hidden_size:3*hidden_size]))))
o = np.divide(1, np.add(1, np.exp(np.negative(tmp[:,3*hidden_size:4*hidden_size]))))
c_next = np.add(np.multiply(f, c.data), np.multiply(g, i))
h_next = np.multiply(o, np.tanh(c_next))
c_next = Tensor(c_next)
h_next = Tensor(h_next)
if bias_x is None or bias_h is None:
parent = LSTMCell.prepare(h_next.shape, c_next.shape, x, h, c, weight_x, weight_h, bias=False, f=f, g=g, i=i, o=o, c_next=c_next.data, hidden_size=hidden_size)
x.child.append(id(parent))
h.child.append(id(parent))
c.child.append(id(parent))
weight_x.child.append(id(parent))
weight_h.child.append(id(parent))
else:
parent = LSTMCell.prepare(h_next.shape, c_next.shape, x, h, c, weight_x, weight_h, bias_x, bias_h, bias=True, f=f, g=g, i=i, o=o, c_next=c_next.data, hidden_size=hidden_size)
x.child.append(id(parent))
h.child.append(id(parent))
c.child.append(id(parent))
weight_x.child.append(id(parent))
weight_h.child.append(id(parent))
bias_x.child.append(id(parent))
bias_h.child.append(id(parent))
c_next.set_creator(parent)
h_next.set_creator(parent)
return h_next, c_next
    def calc_grad(self, dh_next, dc_next):
tanh_c_next = np.tanh(self.kwargs['c_next'])
ds = dc_next + (dh_next * self.kwargs['o']) * (1 - tanh_c_next ** 2)
df = ds * self.var[2].data
dg = ds * self.kwargs['i']
di = ds * self.kwargs['g']
do = dh_next * tanh_c_next
# derivative of activation
df *= self.kwargs['f'] * (1 - self.kwargs['f'])
dg *= (1 - self.kwargs['g'] ** 2)
di *= self.kwargs['i'] * (1 - self.kwargs['i'])
do *= self.kwargs['o'] * (1 - self.kwargs['o'])
dtmp = np.hstack((df, dg, di, do))
dx = np.dot(dtmp, self.var[3].data.T)
dh = np.dot(dtmp, self.var[4].data.T)
dc = ds * self.kwargs['f']
dw_h = np.dot(self.var[1].data.T, dtmp)
dw_x = np.dot(self.var[0].data.T, dtmp)
if not self.kwargs['bias']:
return dx, dh, dc, dw_x, dw_h
        else:
            db_x = LSTMCell.handle_broadcast(dtmp, self.var[5])
            db_h = LSTMCell.handle_broadcast(dtmp, self.var[6])
            return dx, dh, dc, dw_x, dw_h, db_x, db_h
# TODO: fix backward pass
    def backward(self, dh_next, dc_next=None):
if dc_next is None:
dc_next = np.zeros_like(self.kwargs['c_next'])
grads = self.calc_grad(dh_next, dc_next)
if type(grads) is list:
grads = tuple(grads)
for dx, var in zip(grads, self.var):
if not var.requires_grad:
continue
if var.grad is None:
var.grad = dx
else:
var.grad += dx
for i, var in enumerate(self.var):
if i == 2:
continue
if var.creator is not None:
if i == 1:
if isinstance(var.creator, LSTMCell):
var.backward(var.grad, self.var[2].grad)
else:
var.backward(var.grad)
else:
var.backward(var.grad)
lstmcell = LSTMCell(None, None)
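# Usage sketch (illustration only; shapes follow the docstring, Tensor wraps ndarrays):
#   x   = Tensor(np.random.randn(8, 16))       # [N, input_size]
#   h0  = Tensor(np.zeros((8, 32)))            # [N, hidden_size]
#   c0  = Tensor(np.zeros((8, 32)))            # [N, hidden_size]
#   w_x = Tensor(np.random.randn(16, 4*32))    # [input_size, 4*hidden_size]
#   w_h = Tensor(np.random.randn(32, 4*32))    # [hidden_size, 4*hidden_size]
#   h1, c1 = lstmcell(x, h0, c0, w_x, w_h, None, None)   # each [8, 32]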
# TODO: optimize backprop
class LSTM(Function):
    @staticmethod
def forward(x, h, c, weight_x, weight_h, bias_x, bias_h, num_layers):
'''
Shape:
- x: [seq_len, N, input_size]
- h: [num_layers, N, hidden_size]
- c: [num_layers, N, hidden_size]
- Output: [seq_len, N, hidden_size]
- Hidden_h: [num_layers, N, hidden_size]
- Hidden_c: [num_layers, N, hidden_size]
'''
seq_len = x.shape[0]
h_out = []
c_out = []
tmp = [x]
for l in range(num_layers):
hx = h[l]
cx = c[l]
tmp.append([])
for i in range(seq_len):
if bias_x is None or bias_h is None:
hx, cx = lstmcell(tmp[l][i], hx, cx, weight_x[l], weight_h[l], None, None)
tmp[l+1].append(hx)
else:
hx, cx = lstmcell(tmp[l][i], hx, cx, weight_x[l], weight_h[l], bias_x[l], bias_h[l])
tmp[l+1].append(hx)
h_out.append(hx)
c_out.append(cx)
result_x = listconcat(tmp[-1])
result_h = listconcat(h_out)
result_c = listconcat(c_out)
return result_x, result_h, result_c
lstm = LSTM(None)
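# Usage mirrors rnn above, with an extra cell state (illustration only):
#   out, hn, cn = lstm(x, h0, c0, w_x_list, w_h_list, None, None, num_layers)
#   # out: [seq_len, N, hidden_size], hn / cn: [num_layers, N, hidden_size]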
class GRUCell(Function):
    @staticmethod
def forward(x, h, weight_x, weight_h, bias_x, bias_h):
'''
Shape:
- x: [N, input_size]
- h: [N, hidden_size]
- weight_x: [input_size, 3*hidden_size]
- weight_h: [hidden_size, 3*hidden_size]
- bias_x: [3*hidden_size]
- bias_h: [3*hidden_size]
- Output: [N, hidden_size]
'''
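        # Gate layout of the fused weights, with H = hidden_size:
        #   columns [0:H)    -> r (reset gate, sigmoid)
        #   columns [H:2H)   -> z (update gate, sigmoid)
        #   columns [2H:3H)  -> n (candidate state, tanh)
        # h_next = (1 - z) * n + z * h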
hidden_size = h.shape[1]
if bias_x is None or bias_h is None:
tmp = np.add(np.dot(h.data, weight_h.data[:, :2*hidden_size]), np.dot(x.data, weight_x.data[:, :2*hidden_size]))
else:
tmp = np.add(np.add(np.dot(h.data, weight_h.data[:, :2*hidden_size]), bias_x.data[:2*hidden_size]), np.add(np.dot(x.data, weight_x.data[:, :2*hidden_size]), bias_h.data[:2*hidden_size]))
tmp = np.divide(1, np.add(1, np.exp(np.negative(tmp))))
r = tmp[:, :hidden_size]
z = tmp[:, hidden_size:]
if bias_x is None or bias_h is None:
n = np.tanh(np.add(np.multiply(r, np.dot(h.data, weight_h.data[:, 2*hidden_size:])), np.dot(x.data, weight_x.data[:, 2*hidden_size:])))
h_next = Tensor(np.add(np.multiply(np.subtract(1, z), n), np.multiply(z, h.data)))
h_next.set_creator(GRUCell.prepare(h_next.shape, x, h, weight_x, weight_h, bias=False, r=r, z=z, n=n, tmp=tmp, hidden_size=hidden_size))
x.child.append(id(h_next.creator))
h.child.append(id(h_next.creator))
weight_x.child.append(id(h_next.creator))
weight_h.child.append(id(h_next.creator))
else:
            # Apply the hidden-side bias inside the reset gate so the forward pass matches
            # calc_grad below (db_x uses tmp, db_h uses r * tmp), as in the standard GRU.
            n = np.tanh(np.add(np.multiply(r, np.add(np.dot(h.data, weight_h.data[:, 2*hidden_size:]), bias_h.data[2*hidden_size:])), np.add(np.dot(x.data, weight_x.data[:, 2*hidden_size:]), bias_x.data[2*hidden_size:])))
h_next = Tensor(np.add(np.multiply(np.subtract(1, z), n), np.multiply(z, h.data)))
h_next.set_creator(GRUCell.prepare(h_next.shape, x, h, weight_x, weight_h, bias_x, bias_h, bias=True, r=r, z=z, n=n, tmp=tmp, hidden_size=hidden_size))
x.child.append(id(h_next.creator))
h.child.append(id(h_next.creator))
weight_x.child.append(id(h_next.creator))
weight_h.child.append(id(h_next.creator))
bias_x.child.append(id(h_next.creator))
bias_h.child.append(id(h_next.creator))
return h_next
    def calc_grad(self, dh_next):
hidden_size = self.kwargs['hidden_size']
dw_x = np.zeros_like(self.var[2].data)
dw_h = np.zeros_like(self.var[3].data)
dn = np.multiply(dh_next, np.subtract(1, self.kwargs['z']))
dh = np.multiply(dh_next, self.kwargs['z'])
# tanh derivative
tmp = np.multiply(dn, np.subtract(1, np.square(self.kwargs['n'])))
dx = np.dot(tmp, self.var[2].data[:, 2*hidden_size:].T)
dw_x[:, 2*hidden_size:] = np.dot(self.var[0].data.T, tmp)
rtmp = np.multiply(self.kwargs['r'], tmp)
dh += np.dot(rtmp, self.var[3].data[:, 2*hidden_size:].T)
dw_h[:, 2*hidden_size:] = np.dot(self.var[1].data.T, rtmp)
dr = np.multiply(tmp, np.dot(self.var[1].data, self.var[3].data[:, 2*hidden_size:]))
dz = np.subtract(np.multiply(self.var[1].data, dh_next), np.multiply(self.kwargs['n'], dh_next))
dtmp = np.concatenate([dr,dz],axis=1)
# sigmoid derivative
tmp2 = np.multiply(dtmp, np.multiply(self.kwargs['tmp'], np.subtract(1, self.kwargs['tmp'])))
dx += np.dot(tmp2, self.var[2].data[:, :2*hidden_size].T)
dw_x[:, :2*hidden_size] = np.dot(self.var[0].data.T, tmp2)
dh += np.dot(tmp2, self.var[3].data[:, :2*hidden_size].T)
dw_h[:, :2*hidden_size] = np.dot(self.var[1].data.T, tmp2)
if not self.kwargs['bias']:
return dx, dh, dw_x, dw_h
else:
db_x = np.zeros_like(self.var[4].data)
db_h = np.zeros_like(self.var[5].data)
db_x[2*hidden_size:] = GRUCell.handle_broadcast(tmp, self.var[4][2*hidden_size:])
db_h[2*hidden_size:] = GRUCell.handle_broadcast(rtmp, self.var[5][2*hidden_size:])
db_x[:2*hidden_size] = GRUCell.handle_broadcast(tmp2, self.var[4][:2*hidden_size])
db_h[:2*hidden_size] = GRUCell.handle_broadcast(tmp2, self.var[5][:2*hidden_size])
return dx, dh, dw_x, dw_h, db_x, db_h
grucell = GRUCell(None)
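# Usage sketch (illustration only; note the 3*hidden_size fused weight layout):
#   x   = Tensor(np.random.randn(8, 16))       # [N, input_size]
#   h0  = Tensor(np.zeros((8, 32)))            # [N, hidden_size]
#   w_x = Tensor(np.random.randn(16, 3*32))    # [input_size, 3*hidden_size]
#   w_h = Tensor(np.random.randn(32, 3*32))    # [hidden_size, 3*hidden_size]
#   h1  = grucell(x, h0, w_x, w_h, None, None)   # -> [8, 32]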
# TODO: optimize backprop
class GRU(Function):
    @staticmethod
def forward(x, h, weight_x, weight_h, bias_x, bias_h, num_layers):
'''
Shape:
- x: [seq_len, N, input_size]
- h: [num_layers, N, hidden_size]
- Output: [seq_len, N, hidden_size]
- Hidden: [num_layers, N, hidden_size]
'''
seq_len = x.shape[0]
h_out = []
tmp = [x]
for l in range(num_layers):
hx = h[l]
tmp.append([])
for i in range(seq_len):
if bias_x is None or bias_h is None:
hx = grucell(tmp[l][i], hx, weight_x[l], weight_h[l], None, None)
tmp[l+1].append(hx)
else:
hx = grucell(tmp[l][i], hx, weight_x[l], weight_h[l], bias_x[l], bias_h[l])
tmp[l+1].append(hx)
h_out.append(hx)
result_x = listconcat(tmp[-1])
result_h = listconcat(h_out)
return result_x, result_h
gru = GRU(None)
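# Usage mirrors rnn above (illustration only):
#   out, hn = gru(x, h0, w_x_list, w_h_list, None, None, num_layers)
#   # out: [seq_len, N, hidden_size], hn: [num_layers, N, hidden_size]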