Source code for qualia2.nn.optim

# -*- coding: utf-8 -*- 
from ..core import *
from collections import OrderedDict 
from itertools import chain


class Optimizer(object):
    '''Optimizer base class

    Args:
        parameters (generator): Parameters to optimize
    '''
    def __init__(self, parameters):
        self.params = [parameters()]
        self._state = OrderedDict()

    def __repr__(self):
        return '{}() at 0x{:0{}X}'.format(self.__class__.__name__, id(self), 16)

    def __str__(self):
        return self.__class__.__name__

    def __setattr__(self, key, value):
        if key == 'params' or isinstance(value, OrderedDict):
            object.__setattr__(self, key, value)
        else:
            self._state[key] = value

    def __getattr__(self, key):
        if key == 'params':
            return object.__getattribute__(self, 'params')
        else:
            return object.__getattribute__(self, '_state')[key]

    def add_params(self, parameters):
        self.params.append(parameters())

    def state_dict(self):
        return dict(self._state)

    def load_state_dict(self, state_dict):
        for key, val in state_dict.items():
            if key in self._state:
                self._state[key] = val
            else:
                raise KeyError('[*] {} not found in parameters.'.format(key))

    def step(self):
        ''' Performs a single optimization step (parameter update). '''
        raise NotImplementedError

    def zero_grad(self):
        ''' Clears all gradients of the parameters. '''
        for i in chain(*self.params):
            i.grad = None
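
# A minimal usage sketch. The constructor calls `parameters()`, so it expects a
# *callable* that yields the tensors to optimize, not the generator itself. The
# training-loop names below (`model`, `model.params`, `criterion`, `loader`) are
# assumed placeholders for illustration, not definitions from this module:
#
#     optim = SGD(model.params, lr=0.01)
#     for data, target in loader:
#         loss = criterion(model(data), target)
#         optim.zero_grad()   # drop gradients from the previous iteration
#         loss.backward()     # accumulate fresh gradients
#         optim.step()        # apply one parameter update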

class SGD(Optimizer):
    '''Implements stochastic gradient descent (optionally with momentum).

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate
        momentum (float): momentum factor. Default: 0
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, momentum=0, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.m = momentum
        self.l2 = weight_decay
        self.v = {}

    def __repr__(self):
        return '{}(lr={}, momentum={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.m, self.l2, id(self), 16)

    def step(self):
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            assert var.data.shape == var.grad.shape
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            self.v[i] = self.m * self.v[i] + (1 - self.m) * var.grad
            var.data -= self.l2 * var.data
            var.data -= self.lr * self.v[i]
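
# For reference, SGD.step() above applies, per parameter p with gradient g:
#
#     v <- momentum * v + (1 - momentum) * g     (moving-average form of momentum)
#     p <- p - weight_decay * p                  (L2 shrinkage)
#     p <- p - lr * v
#
# With momentum = 0 this reduces to plain gradient descent, p <- p - lr * g.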

class Adadelta(Optimizer):
    '''Implements Adadelta algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): coefficient that scales delta before it is applied to the parameters. Default: 1.0
        decay_rate (float): coefficient used for computing a running average of squared gradients. Default: 0.9
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=1.0, decay_rate=0.9, eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.rho = decay_rate
        self.eps = eps
        self.g = {}
        self.u = {}
        self.l2 = weight_decay

    def __repr__(self):
        return '{}(lr={}, decay_rate={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.rho, self.l2, id(self), 16)

    def step(self):
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.g:
                self.g[i] = np.zeros_like(var.grad)
            if i not in self.u:
                self.u[i] = np.zeros_like(var.grad)
            self.g[i] = self.rho * self.g[i] + (1-self.rho) * var.grad**2
            update = -np.sqrt(self.u[i]+self.eps) * var.grad / np.sqrt(self.g[i]+self.eps)
            self.u[i] = self.rho * self.u[i] + (1-self.rho) * update**2
            var.data -= self.l2 * var.data
            var.data += self.lr * update
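
# For reference, Adadelta.step() above keeps running averages of squared
# gradients (self.g) and of squared updates (self.u), and applies per parameter p:
#
#     E[g^2]  <- rho * E[g^2]  + (1 - rho) * grad^2
#     delta    = -sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * grad
#     E[dx^2] <- rho * E[dx^2] + (1 - rho) * delta^2
#     p <- p - weight_decay * p + lr * delta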

class AdaGrad(Optimizer):
    '''Implements Adagrad algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.eps = eps
        self.l2 = weight_decay
        self.h = {}

    def __repr__(self):
        return '{}(lr={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.l2, id(self), 16)

    def step(self):
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.h:
                self.h[i] = np.zeros_like(var.grad)
            self.h[i] += var.grad**2
            var.data -= self.l2 * var.data
            var.data -= self.lr * var.grad / np.sqrt(self.h[i]+self.eps)
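
# For reference, AdaGrad.step() above accumulates squared gradients and scales
# each step by their inverse square root, per parameter p:
#
#     h <- h + grad^2
#     p <- p - weight_decay * p
#     p <- p - lr * grad / sqrt(h + eps)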

class RMSProp(Optimizer):
    '''Implements RMSprop algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        alpha (float): smoothing constant. Default: 0.99
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, alpha=0.99, eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.alpha = alpha
        self.eps = eps
        self.h = {}
        self.l2 = weight_decay

    def __repr__(self):
        return '{}(lr={}, alpha={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.alpha, self.l2, id(self), 16)

    def step(self):
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.h:
                self.h[i] = np.zeros_like(var.grad)
            self.h[i] = self.alpha * self.h[i] + (1-self.alpha) * var.grad**2
            var.data -= self.l2 * var.data
            var.data -= self.lr * var.grad / np.sqrt(self.h[i]+self.eps)
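
# For reference, RMSProp.step() above differs from AdaGrad only in replacing the
# accumulated sum of squared gradients with an exponential moving average:
#
#     h <- alpha * h + (1 - alpha) * grad^2
#     p <- p - weight_decay * p
#     p <- p - lr * grad / sqrt(h + eps)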

class Adam(Optimizer):
    '''Implements Adam algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        betas (tuple of float): coefficients used for computing running averages of gradient and its square. Default: (0.9, 0.999)
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.m = {}
        self.v = {}
        self.l2 = weight_decay
        self.t = 0

    def __repr__(self):
        return '{}(lr={}, betas={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.betas, self.l2, id(self), 16)

    def step(self):
        self.t += 1
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.m:
                self.m[i] = np.zeros_like(var.grad)
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            self.m[i] = self.betas[0] * self.m[i] + (1-self.betas[0]) * var.grad
            self.v[i] = self.betas[1] * self.v[i] + (1-self.betas[1]) * var.grad**2
            m = self.m[i] / (1-self.betas[0]**self.t)
            v = self.v[i] / (1-self.betas[1]**self.t)
            var.data -= self.l2 * var.data
            var.data -= self.lr * m / np.sqrt(v+self.eps)
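
# For reference, Adam.step() above keeps bias-corrected estimates of the first
# and second gradient moments and applies, per parameter p at step t:
#
#     m <- beta1 * m + (1 - beta1) * grad
#     v <- beta2 * v + (1 - beta2) * grad^2
#     m_hat = m / (1 - beta1^t),  v_hat = v / (1 - beta2^t)
#     p <- p - weight_decay * p
#     p <- p - lr * m_hat / sqrt(v_hat + eps)
#
# (As written, eps sits inside the square root, a small departure from the more
#  common m_hat / (sqrt(v_hat) + eps).)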

class AdaMax(Optimizer):
    '''Implements AdaMax algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        betas (tuple of float): coefficients used for computing running averages of gradient and its square. Default: (0.9, 0.999)
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.m = {}
        self.v = {}
        self.l2 = weight_decay
        self.t = 0

    def __repr__(self):
        return '{}(lr={}, betas={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.betas, self.l2, id(self), 16)

    def step(self):
        self.t += 1
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.m:
                self.m[i] = np.zeros_like(var.grad)
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            self.m[i] = self.betas[0] * self.m[i] + (1-self.betas[0]) * var.grad
            self.v[i] = np.maximum(self.betas[1] * self.v[i], np.abs(var.grad))
            var.data -= self.l2 * var.data
            # eps guards against division by zero while the running max is still zero
            var.data -= self.lr / (1-self.betas[0]**self.t) * self.m[i] / (self.v[i] + self.eps)
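
# For reference, AdaMax.step() above replaces Adam's second moment with an
# infinity-norm style running maximum:
#
#     m <- beta1 * m + (1 - beta1) * grad
#     u <- max(beta2 * u, |grad|)
#     p <- p - weight_decay * p
#     p <- p - lr / (1 - beta1^t) * m / (u + eps)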

class Nadam(Optimizer):
    '''Implements Nesterov-accelerated adaptive moment estimation (Nadam) algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        betas (tuple of float): coefficients used for computing running averages of gradient and its square. Default: (0.9, 0.999)
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.m = {}
        self.v = {}
        self.l2 = weight_decay
        self.t = 0
        self.mu = [self.betas[0]*(1-0.5*(0.96**0.004))]

    def __repr__(self):
        return '{}(lr={}, betas={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.betas, self.l2, id(self), 16)

    def step(self):
        self.t += 1
        self.mu.append(self.betas[0]*(1-0.5*(0.96**(0.004*(self.t+1)))))
        # cumulative products of the momentum schedule, used for bias correction
        mu_prod = 1.0
        for mu in self.mu[:-1]:
            mu_prod *= mu
        mu_prod_next = mu_prod * self.mu[-1]
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.m:
                self.m[i] = np.zeros_like(var.grad)
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            grad = var.grad / (1-mu_prod)
            self.m[i] = self.betas[0] * self.m[i] + (1-self.betas[0]) * var.grad
            self.v[i] = self.betas[1] * self.v[i] + (1-self.betas[1]) * var.grad**2
            m = self.m[i] / (1-mu_prod_next)
            v = self.v[i] / (1-self.betas[1]**self.t)
            m_bar = (1-self.mu[-2]) * grad + self.mu[-1] * m
            var.data -= self.l2 * var.data
            var.data -= self.lr * m_bar / np.sqrt(v+self.eps)
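
# For reference, Nadam.step() above follows Dozat's Nadam: Adam-style moment
# estimates combined with the momentum schedule
# mu_t = beta1 * (1 - 0.5 * 0.96^(0.004 t)), with cumulative products of mu
# used for the bias corrections:
#
#     g_hat = grad / (1 - prod(mu_1 .. mu_t))
#     m_hat = m / (1 - prod(mu_1 .. mu_{t+1}))
#     m_bar = (1 - mu_t) * g_hat + mu_{t+1} * m_hat
#     p <- p - weight_decay * p - lr * m_bar / sqrt(v_hat + eps)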

class RAdam(Optimizer):
    '''Implements Rectified Adam algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        betas (tuple of float): coefficients used for computing running averages of gradient and its square. Default: (0.9, 0.999)
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.m = {}
        self.v = {}
        self.l2 = weight_decay
        self.t = 0
        self.rho = 2/(1-betas[1])-1

    def __repr__(self):
        return '{}(lr={}, betas={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.betas, self.l2, id(self), 16)

    def step(self):
        self.t += 1
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.m:
                self.m[i] = np.zeros_like(var.grad)
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            self.m[i] = self.betas[0] * self.m[i] + (1-self.betas[0]) * var.grad
            self.v[i] = self.betas[1] * self.v[i] + (1-self.betas[1]) * var.grad**2
            m = self.m[i] / (1-self.betas[0]**self.t)
            rho = self.rho - 2*self.t*self.betas[1]**self.t/(1-self.betas[1]**self.t)
            if self.t > 4:
                v = np.sqrt(self.v[i]/(1-self.betas[1]**self.t))+self.eps
                r = np.sqrt((rho-4)*(rho-2)*self.rho/((self.rho-4)*(self.rho-2)*rho))
                var.data -= self.l2 * var.data
                var.data -= self.lr * r * m / v
            else:
                var.data -= self.l2 * var.data
                var.data -= self.lr * m
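
# For reference, RAdam.step() above rectifies Adam's adaptive learning rate.
# With rho_inf = 2 / (1 - beta2) - 1 and
# rho_t = rho_inf - 2 t beta2^t / (1 - beta2^t), the adaptive term is applied
# only after the warm-up phase (t > 4 here), scaled by
#
#     r_t = sqrt( (rho_t - 4)(rho_t - 2) rho_inf / ((rho_inf - 4)(rho_inf - 2) rho_t) )
#
# while the earliest steps fall back to bias-corrected momentum SGD.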

class NovoGrad(Optimizer):
    '''Implements NovoGrad algorithm.

    Args:
        parameters (iterable): iterable of parameters to optimize
        lr (float): learning rate. Default: 1e-03
        betas (tuple of float): coefficients used for computing running averages of gradient and its square. Default: (0.95, 0.98)
        eps (float): for numerical stability. Default: 1e-08
        weight_decay (float): weight decay (L2 penalty). Default: 0
    '''
    def __init__(self, parameters, lr=0.001, betas=(0.95, 0.98), eps=1e-08, weight_decay=0):
        super().__init__(parameters)
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.m = {}
        self.v = {}
        self.l2 = weight_decay
        self.t = 0

    def __repr__(self):
        return '{}(lr={}, betas={}, weight_decay={}) at 0x{:0{}X}'.format(self.__class__.__name__, self.lr, self.betas, self.l2, id(self), 16)

    def step(self):
        self.t += 1
        for i, var in enumerate(chain(*self.params)):
            if not var.requires_grad:
                continue
            if i not in self.v:
                self.v[i] = np.zeros_like(var.grad)
            if i not in self.m:
                self.m[i] = np.zeros_like(var.grad)
            self.v[i] = self.betas[1] * self.v[i] + (1-self.betas[1]) * var.grad**2
            self.m[i] = self.betas[0] * self.m[i] + var.grad / np.sqrt(self.v[i] + self.eps)
            var.data -= self.l2 * var.data
            var.data -= self.lr * self.m[i]
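
# For reference, NovoGrad.step() above normalises each gradient by a running
# second-moment estimate before feeding it into the momentum buffer:
#
#     v <- beta2 * v + (1 - beta2) * grad^2
#     m <- beta1 * m + grad / sqrt(v + eps)
#     p <- p - weight_decay * p - lr * m
#
# (Note that the second moment here is element-wise, whereas the NovoGrad paper
#  computes it from the per-layer gradient norm.)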