Yekun's Note

Machine learning notes and write-ups.


Practical Implementation Misc

Some implementation magic.

RL

Cumulative sum

Discounted cumulative sum over a trajectory (e.g. for computing returns or rewards-to-go), done in one line with scipy.signal.lfilter.

scipy.signal.lfilter

import scipy.signal

def discount_cumsum(x, discount):
    """
    https://github.com/openai/spinningup/blob/2e0eff9bd019c317af908b72c056a33f14626602/spinup/algos/trpo/core.py#L88
    Magic from rllab for computing discounted cumulative sums of vectors.

    input:
        vector x,
        [x0,
         x1,
         x2]

    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]
    """
    # lfilter with a = [1, -discount] implements y[n] = x[n] + discount * y[n-1];
    # applying it to the reversed vector and reversing the result gives the
    # discounted sum from each position to the end of the trajectory.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
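
A quick sanity check on a toy vector (the numbers are only illustrative):

import numpy as np

x = np.array([1.0, 2.0, 3.0])
print(discount_cumsum(x, 0.9))
# [1 + 0.9*2 + 0.81*3, 2 + 0.9*3, 3] = [5.23 4.7  3.  ]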

TD(n)

N-step return computation with done masking: discount rewards backward in time, and bootstrap from the last value estimate when the rollout does not end in a terminal state.

def discount_with_dones(rewards, dones, gamma):
    """
    https://github.com/openai/baselines/blob/229a772b81155695a2692066b1ef3e7b77f5993a/baselines/a2c/utils.py#L147
    Discount rewards backward in time, resetting the running return at
    episode boundaries (done == 1).
    """
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]


mb_rewards, mb_dones, _, _ = replay_buffer.sample(...)

for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
    if dones[-1] == 0:
        # rollout did not end in a terminal state: bootstrap from the last value
        rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
    else:  # terminal
        rewards = discount_with_dones(rewards, dones, self.gamma)

    mb_rewards[n] = rewards
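
A minimal check of the done masking (the numbers are only illustrative):

rewards = [1.0, 1.0, 1.0, 1.0]
dones   = [0,   1,   0,   0]
print(discount_with_dones(rewards, dones, 0.99))
# returns do not propagate back across the episode boundary at index 1:
# [1 + 0.99*1, 1, 1 + 0.99*1, 1] -> [1.99, 1.0, 1.99, 1.0]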

Training tricks

Entropy normalization

Numerically stable entropy of a softmax (categorical) policy, computed directly from the logits by subtracting the row-wise max (log-sum-exp trick); in A2C the mean entropy is used as an exploration bonus in the loss.

import torch

# reference: OpenAI baselines A2C
def entropy(logits):
    """ PyTorch """
    # subtract the row-wise max for numerical stability (log-sum-exp trick)
    row_max, _ = torch.max(logits, -1, True)
    a0 = logits - row_max
    ea0 = torch.exp(a0)
    z0 = torch.sum(ea0, -1, True)
    p0 = ea0 / z0  # softmax probabilities
    # H(p) = sum_i p_i * (log z0 - a0_i) = -sum_i p_i * log p_i
    return torch.sum(p0 * (torch.log(z0) - a0), -1)

ent = entropy(logits).mean()

import tensorflow as tf

def entropy(logits):
    """ TensorFlow (1.x; use tf.math.log in 2.x) """
    a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
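
A quick sanity check of the PyTorch entropy above against torch.distributions.Categorical (the batch shape is arbitrary and only for illustration):

import torch
from torch.distributions import Categorical

logits = torch.randn(4, 6)  # batch of 4 states, 6 actions
assert torch.allclose(entropy(logits), Categorical(logits=logits).entropy(), atol=1e-6)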

all-reduce

import torch.distributed as dist

# average gradients of target layers across workers in multiprocessing training
def avg_grad(layers: tuple):
    """ (PyTorch) gradient averaging via all-reduce """
    size = float(dist.get_world_size())
    for layer in layers:
        for param in layer.parameters():
            # sum each gradient over all processes, then divide by the world size
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= size
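
A rough sketch of where the call sits in a distributed training step; model, criterion, optimizer, inputs, and targets are placeholders, and the process group is assumed to be initialized already (e.g. via dist.init_process_group):

loss = criterion(model(inputs), targets)
optimizer.zero_grad()
loss.backward()
avg_grad((model,))   # all-reduce and average gradients across workers
optimizer.step()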

DL

One-hot encoding

import torch

def one_hot(col: int, row: int, one_hot_index: torch.LongTensor):
    """ PyTorch: build a (col, row) one-hot matrix from a (col, 1) index tensor """
    y_one_hot = torch.FloatTensor(col, row)  # uninitialized (col, row) tensor
    y_one_hot.zero_()                        # fill with zeros
    y_one_hot.scatter_(1, one_hot_index, 1)  # put a 1 at each row's index
    return y_one_hot
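
Usage sketch with illustrative sizes; recent PyTorch versions also provide torch.nn.functional.one_hot as a built-in alternative:

idx = torch.LongTensor([[2], [0], [1]])   # shape (3, 1): one class index per row
print(one_hot(3, 4, idx))
# tensor([[0., 0., 1., 0.],
#         [1., 0., 0., 0.],
#         [0., 1., 0., 0.]])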