|
628 | 628 |
|
629 | 629 | ;; ### Advantages |
630 | 630 | ;; |
631 | | -;; If we are in state $s_t$ and take an action $a_t$ at timestep $t$, we end up in state $s_{t+1}$ and receive reward $r_t$. |
632 | | -;; The cumulative reward for state $s_t$ is |
| 631 | +;; #### Theory |
| 632 | +;; |
| 633 | +;; If we are in state $s_t$ and take an action $a_t$ at timestep $t$, we receive reward $r_t$ and end up in state $s_{t+1}$. |
| 634 | +;; The cumulative reward for state $s_t$ is a finite or infinite sum of rewards discounted with a factor $\gamma<1$:
633 | 635 | ;; |
634 | 636 | ;; $r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \gamma^3 r_{t+3} + \ldots$ |
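| | +;;
| | +;; For example, a constant reward of $1$ per step with $\gamma = 0.9$ sums to $1 + 0.9 + 0.81 + \ldots = \frac{1}{1-0.9} = 10$: discounting keeps even the infinite sum finite.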
635 | 637 | ;; |
|
639 | 641 | ;; |
640 | 642 | ;; In particular, the difference between the discounted cumulative reward estimates of consecutive states can be used to get an estimate for the individual reward:
641 | 643 | ;; |
642 | | -;; $\hat{A}_{T-1} = -V(S_{T-1}) + r_{T-1} + \gamma V(S_T)$ |
| 644 | +;; $V(s_t) = \mathop{\hat{\mathbb{E}}} [ r_t ] + \gamma V(s_{t+1})$ $\Leftrightarrow$ $\mathop{\hat{\mathbb{E}}} [ r_t ] = V(s_t) - \gamma V(s_{t+1})$ |
| 645 | +;; |
| 646 | +;; The deviation of the individual reward received in state $s_t$ from the expected reward is: |
| 647 | +;; |
| 648 | +;; $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ if not $\operatorname{done}_t$ |
| 649 | +;; |
| | +;; The special case where a run is "done" (and the next one is started) uses 0 as the remaining expected cumulative reward, because no further rewards follow in this run.
| 651 | +;; |
| 652 | +;; $\delta_t = r_t - V(s_t)$ if $\operatorname{done}_{t}$ |
| 653 | +;; |
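| | +;; For example, with $r_t = 1$, $\gamma = 0.5$, $V(s_t) = 2$, and $V(s_{t+1}) = 1$ the deviation is $\delta_t = 1 + 0.5 \cdot 1 - 2 = -0.5$: the critic expected more than was received (this case reappears in the code examples below).
| | +;;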
| | +;; If we have a sample set with a sequence of $T$ states ($t=0,1,\ldots,T-1$), we can compute the cumulative advantage for each time step going backwards:
| 655 | +;; |
| 656 | +;; $\hat{A}_{T-1} = -V(s_{T-1}) + r_{T-1} + \gamma V(s_T) = \delta_{T-1}$ |
643 | 657 | ;; |
644 | | -;; $\hat{A}_{T-2} = -V(S_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(S_T)$ |
| 658 | +;; $\hat{A}_{T-2} = -V(s_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(s_T) = \delta_{T-2} + \gamma \delta_{T-1}$ |
645 | 659 | ;; |
646 | 660 | ;; $\vdots$ |
647 | 661 | ;; |
648 | | -;; $\hat{A}_0 = -V(S_0) + r_0 + \gamma r_1 + \ldots + \gamma^T V(S_T)$ |
| | +;; $\hat{A}_0 = -V(s_0) + r_0 + \gamma r_1 + \gamma^2 r_2 + \ldots + \gamma^{T-1} r_{T-1} + \gamma^{T} V(s_{T}) = \delta_0 + \gamma \delta_1 + \gamma^2 \delta_2 + \ldots + \gamma^{T-1} \delta_{T-1}$
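| | +;;
| | +;; (The two forms agree because the $\gamma^{t} V(s_{t})$ terms of consecutive $\delta$ values telescope, leaving only $-V(s_0)$ and $\gamma^{T} V(s_{T})$.)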
649 | 663 | ;; |
650 | | -;; $\hat{A}_t = -V(s_t) + r_t + \gamma r_{t+1} + \ldots + \gamma^{T-t+1} r_{T-1} + \gamma^{T-t} V(S_T)$ |
| | +;; That is, we can compute the cumulative advantages as follows:
651 | 665 | ;; |
652 | | -;; $\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \delta_{t+l}$ |
| 666 | +;; * Start with $\hat{A}_{T-1} = \delta_{T-1}$ |
| 667 | +;; * Continue with $\hat{A}_t = \delta_t + \gamma \hat{A}_{t+1}$ for $t=T-2,T-3,\ldots,0$ |
653 | 668 | ;; |
654 | | -;; $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ |
| | +;; PPO additionally uses a factor $\lambda \le 1$ from Generalized Advantage Estimation (GAE), turning the recursion into $\hat{A}_t = \delta_t + \gamma \lambda \hat{A}_{t+1}$; lowering $\lambda$ can be used to steer the training towards more immediate rewards if there are stability issues.
| 670 | +;; See [Schulman et al.](https://arxiv.org/abs/1707.06347) for more details. |
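| | +;;
| | +;; As a minimal sketch of this recursion (the name `advantages-sketch` is hypothetical; the notebook's actual implementation may differ), one can fold over the $\delta$ values in reverse:
| | +(defn advantages-sketch
| | +  "Sketch: fold the backward recursion A(t) = delta(t) + gamma * lambda * A(t+1), restarting at episode boundaries"
| | +  [deltas dones gamma lambda]
| | +  (vec (first
| | +        (reduce (fn [[acc nxt] [delta done]]
| | +                  (let [advantage (+ delta (if done 0.0 (* gamma lambda nxt)))]
| | +                    [(cons advantage acc) advantage]))
| | +                [() 0.0]
| | +                (map vector (reverse deltas) (reverse dones))))))
| | +
| | +;; With $\delta = [2.0, 1.0]$, a terminal second step, $\gamma = 0.5$, and $\lambda = 1$ the bullet points above give $\hat{A}_1 = 1.0$ and $\hat{A}_0 = 2.0 + 0.5 \cdot 1.0 = 2.5$:
| | +(advantages-sketch [2.0 1.0] [false true] 0.5 1.0)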
655 | 671 | ;; |
656 | | -;; $\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \left( r_{t+l} + \gamma V(s_{t+l+1}) - V(s_{t+l}) \right)$ |
| 672 | +;; #### Implementation of Delta |
| 673 | +;; |
| | +;; The code for computing the $\delta$ values follows:
| 675 | +(defn deltas |
| 676 | + "Compute difference between actual reward plus discounted estimate of next state and estimated value of current state" |
| 677 | + [{:keys [observations next-observations rewards dones]} critic gamma] |
| 678 | + (mapv (fn [observation next-observation reward done] |
| 679 | + (- (+ reward (if done 0.0 (* gamma (critic next-observation)))) (critic observation))) |
| 680 | + observations next-observations rewards dones)) |
| 681 | + |
| 682 | +;; If the reward is zero and the critic outputs constant zero, there is no difference between the expected and received reward. |
| 683 | +(deltas {:observations [[4]] :next-observations [[3]] :rewards [0] :dones [false]} (constantly 0) 1.0) |
| 684 | + |
| 685 | +;; If the reward is 1.0 and the critic outputs zero for both observations, the difference is 1.0. |
| 686 | +(deltas {:observations [[4]] :next-observations [[3]] :rewards [1] :dones [false]} (constantly 0) 1.0) |
| 687 | + |
| | +;; If the reward is 1.0 and the difference of the critic outputs is also 1.0, then there is no difference between the expected and received reward (when $\gamma=1$).
| 689 | +(defn linear-critic [observation] (first observation)) |
| 690 | +(deltas {:observations [[4]] :next-observations [[3]] :rewards [1] :dones [false]} linear-critic 1.0) |
| 691 | + |
| | +;; If the next critic value of 1.0 is discounted by 0.5 and the current critic value is 2.0, we expect a reward of $2.0 - 0.5 = 1.5$.
| 693 | +;; If we only get a reward of 1.0, the difference is -0.5. |
| 694 | +(deltas {:observations [[2]] :next-observations [[1]] :rewards [1] :dones [false]} linear-critic 0.5) |
| 695 | + |
| | +;; If the run is terminated, the current critic value is compared with the reward, which in this case is the last reward received in this run; here both are 4.0, so the difference is 0.0.
| 697 | +(deltas {:observations [[4]] :next-observations [[3]] :rewards [4] :dones [true]} linear-critic 1.0) |
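| | +
| | +;; Combining `deltas` with the `advantages-sketch` from the theory part: a run of two steps with rewards of 1.0, a terminal second step, a zero critic, and $\gamma = 0.5$ yields $\delta = [1.0, 1.0]$ and advantages $[1.5, 1.0]$.
| | +(advantages-sketch
| | +  (deltas {:observations [[0] [0]] :next-observations [[0] [0]] :rewards [1 1] :dones [false true]}
| | +          (constantly 0) 0.5)
| | +  [false true] 0.5 1.0)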