Yekun's Note

Machine learning notes and write-ups.


Practical Implementation Misc

Some implementation magic.

RL

Cumulative sum

Discounted cumulative sum over a trajectory (e.g. for computing returns or rewards-to-go), done in one line with scipy.signal.lfilter.

scipy.signal.lfilter

import scipy.signal

def discount_cumsum(x, discount):
    """
    https://github.com/openai/spinningup/blob/2e0eff9bd019c317af908b72c056a33f14626602/spinup/algos/trpo/core.py#L88
    Magic from rllab for computing discounted cumulative sums of vectors.

    input:
        vector x,
        [x0,
         x1,
         x2]

    output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]
    """
    # lfilter with a = [1, -discount] implements y[n] = x[n] + discount * y[n-1];
    # applying it to the reversed vector and reversing the result gives the
    # discounted sum from each position to the end of the trajectory.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
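
A quick sanity check on a toy vector (the numbers are only illustrative):

import numpy as np

x = np.array([1.0, 2.0, 3.0])
print(discount_cumsum(x, 0.9))
# [1 + 0.9*2 + 0.81*3, 2 + 0.9*3, 3] = [5.23 4.7  3.  ]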

TD(n)

N-step return computation with done masking: discount rewards backward in time, and bootstrap from the last value estimate when the rollout does not end in a terminal state.

def discount_with_dones(rewards, dones, gamma):
    """
    https://github.com/openai/baselines/blob/229a772b81155695a2692066b1ef3e7b77f5993a/baselines/a2c/utils.py#L147
    Discount rewards backward in time, resetting the running return at
    episode boundaries (done == 1).
    """
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)  # fixed off by one bug
        discounted.append(r)
    return discounted[::-1]


mb_rewards, mb_dones, _, _ = replay_buffer.sample(...)

for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
    if dones[-1] == 0:
        # rollout did not end in a terminal state: bootstrap from the last value
        rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
    else:  # terminal
        rewards = discount_with_dones(rewards, dones, self.gamma)

    mb_rewards[n] = rewards
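
A minimal check of the done masking (the numbers are only illustrative):

rewards = [1.0, 1.0, 1.0, 1.0]
dones   = [0,   1,   0,   0]
print(discount_with_dones(rewards, dones, 0.99))
# returns do not propagate back across the episode boundary at index 1:
# [1 + 0.99*1, 1, 1 + 0.99*1, 1] -> [1.99, 1.0, 1.99, 1.0]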

Training tricks

Entropy normalization

Numerically stable entropy of a softmax (categorical) policy, computed directly from the logits by subtracting the row-wise max (log-sum-exp trick); in A2C the mean entropy is used as an exploration bonus in the loss.

import torch

# reference: OpenAI baselines A2C
def entropy(logits):
    """ PyTorch """
    # subtract the row-wise max for numerical stability (log-sum-exp trick)
    row_max, _ = torch.max(logits, -1, True)
    a0 = logits - row_max
    ea0 = torch.exp(a0)
    z0 = torch.sum(ea0, -1, True)
    p0 = ea0 / z0  # softmax probabilities
    # H(p) = sum_i p_i * (log z0 - a0_i) = -sum_i p_i * log p_i
    return torch.sum(p0 * (torch.log(z0) - a0), -1)

ent = entropy(logits).mean()

import tensorflow as tf

def entropy(logits):
    """ TensorFlow (1.x; use tf.math.log in 2.x) """
    a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
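
A quick sanity check of the PyTorch entropy above against torch.distributions.Categorical (the batch shape is arbitrary and only for illustration):

import torch
from torch.distributions import Categorical

logits = torch.randn(4, 6)  # batch of 4 states, 6 actions
assert torch.allclose(entropy(logits), Categorical(logits=logits).entropy(), atol=1e-6)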

all-reduce

import torch.distributed as dist

# average gradients of target layers across workers in multiprocessing training
def avg_grad(layers: tuple):
    """ (PyTorch) gradient averaging via all-reduce """
    size = float(dist.get_world_size())
    for layer in layers:
        for param in layer.parameters():
            # sum each gradient over all processes, then divide by the world size
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= size
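
A rough sketch of where the call sits in a distributed training step; model, criterion, optimizer, inputs, and targets are placeholders, and the process group is assumed to be initialized already (e.g. via dist.init_process_group):

loss = criterion(model(inputs), targets)
optimizer.zero_grad()
loss.backward()
avg_grad((model,))   # all-reduce and average gradients across workers
optimizer.step()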

DL

One-hot encoding

import torch

def one_hot(col: int, row: int, one_hot_index: torch.LongTensor):
    """ PyTorch: build a (col, row) one-hot matrix from a (col, 1) index tensor """
    y_one_hot = torch.FloatTensor(col, row)  # uninitialized (col, row) tensor
    y_one_hot.zero_()                        # fill with zeros
    y_one_hot.scatter_(1, one_hot_index, 1)  # put a 1 at each row's index
    return y_one_hot
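
Usage sketch with illustrative sizes; recent PyTorch versions also provide torch.nn.functional.one_hot as a built-in alternative:

idx = torch.LongTensor([[2], [0], [1]])   # shape (3, 1): one class index per row
print(one_hot(3, 4, idx))
# tensor([[0., 0., 1., 0.],
#         [1., 0., 0., 0.],
#         [0., 1., 0., 0.]])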