
Commit 8fe2855

Merge pull request #365 from wedesoft/ppo-update-2
PPO update 2
2 parents 8c6691b + bc4b61f commit 8fe2855

1 file changed

src/ppo/main.clj

Lines changed: 31 additions & 15 deletions
@@ -677,33 +677,45 @@
 ;; If we are in state $s_t$ and take an action $a_t$ at timestep $t$, we receive reward $r_t$ and end up in state $s_{t+1}$.
 ;; The cumulative reward for state $s_t$ is a finite or infinite sum using a discount factor $\gamma<1$:
 ;;
-;; $r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \gamma^3 r_{t+3} + \ldots$
+;; $$
+;; r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \gamma^3 r_{t+3} + \ldots
+;; $$
 ;;
 ;; The critic $V$ estimates the expected cumulative reward for starting from the specified state.
 ;;
-;; $V(s_t) = \mathop{\hat{\mathbb{E}}} [ r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \gamma^3 r_{t+3} + \ldots ]$
+;; $$
+;; V(s_t) = \mathop{\hat{\mathbb{E}}} [ r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \gamma^3 r_{t+3} + \ldots ]
+;; $$
 ;;
 ;; In particular, the difference between discounted rewards can be used to get an estimate for the individual reward:
 ;;
-;; $V(s_t) = \mathop{\hat{\mathbb{E}}} [ r_t ] + \gamma V(s_{t+1})$ $\Leftrightarrow$ $\mathop{\hat{\mathbb{E}}} [ r_t ] = V(s_t) - \gamma V(s_{t+1})$
+;; $$
+;; V(s_t) = \mathop{\hat{\mathbb{E}}} [ r_t ] + \gamma V(s_{t+1})\Leftrightarrow\mathop{\hat{\mathbb{E}}} [ r_t ] = V(s_t) - \gamma V(s_{t+1})
+;; $$
 ;;
 ;; The deviation of the individual reward received in state $s_t$ from the expected reward is:
 ;;
-;; $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$ if not $\operatorname{done}_t$
+;; $$
+;; \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)\mathrm{\ if\ not\ }\operatorname{done}_t
+;; $$
 ;;
 ;; The special case where a time series is "done" (and the next one is started) uses 0 as the remaining expected cumulative reward.
 ;;
-;; $\delta_t = r_t - V(s_t)$ if $\operatorname{done}_{t}$
+;; $$
+;; \delta_t = r_t - V(s_t)\mathrm{\ if\ }\operatorname{done}_{t}
+;; $$
 ;;
 ;; If we have a sample set with a sequence of T states ($t=0,1,\ldots,T-1$), one can compute the cumulative advantage for each time step going backwards:
 ;;
-;; $\hat{A}_{T-1} = -V(s_{T-1}) + r_{T-1} + \gamma V(s_T) = \delta_{T-1}$
-;;
-;; $\hat{A}_{T-2} = -V(s_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(s_T) = \delta_{T-2} + \gamma \delta_{T-1}$
-;;
-;; $\vdots$
-;;
-;; $\hat{A}_0 = -V(s_0) + r_0 + \gamma r_1 + \gamma^2 r_2 + \ldots + \gamma^{T-1} r_{T-1} + \gamma^{T} V(s_{T}) = \delta_0 + \gamma \delta_1 + \gamma^2 \delta_2 + \ldots + \gamma^{T-1} \delta_{T-1}$
+;; $$
+;; \begin{aligned}
+;; \hat{A} _ {T-1} & = -V(s_{T-1}) + r_{T-1} + \gamma V(s_T) = \delta_{T-1} \\
+;; \hat{A} _ {T-2} & = -V(s_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(s_T) = \delta_{T-2} + \gamma \delta_{T-1} \\
+;; & \vdots \\
+;; \hat{A} _ 0 & = -V(s_0) + r_0 + \gamma r_1 + \gamma^2 r_2 + \ldots + \gamma^{T-1} r_{T-1} + \gamma^{T} V(s_{T}) \\
+;; & = \delta_0 + \gamma \delta_1 + \gamma^2 \delta_2 + \ldots + \gamma^{T-1} \delta_{T-1}
+;; \end{aligned}
+;; $$
 ;;
 ;; I.e. we can compute the cumulative advantages as follows:
 ;;
@@ -818,7 +830,7 @@
 ;; The core of the actor loss function relies on the ratio of action probabilities under the updated and the old policy (actor network output).
 ;; The ratio is defined as $r_t(\theta)=\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\operatorname{old}}}(a_t|s_t)}$.
 ;; Note that $r_t(\theta)$ here refers to the probability ratio as opposed to the reward of the previous section.
-;;
+;;
 ;; The sampled observations, log probabilities, and actions are combined with the actor's parameter-dependent log probabilities.
 (defn probability-ratios
   "Probability ratios for actions using updated policy and old policy"
@@ -829,12 +841,16 @@
 ;; The objective is to increase the probability of actions which lead to a positive advantage and reduce the probability of actions which lead to a negative advantage.
 ;; I.e. we maximise the following objective function:
 ;;
-;; $L^{CPI}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\operatorname{old}}}(a_t|s_t)} \hat{A}_t] = \mathop{\hat{\mathbb{E}}}_t [r_t(\theta) \hat{A}_t]$
+;; $$
+;; L^{CPI}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\operatorname{old}}}(a_t|s_t)} \hat{A}_t] = \mathop{\hat{\mathbb{E}}}_t [r_t(\theta) \hat{A}_t]
+;; $$
 ;;
 ;; The core idea of PPO is to use clipped probability ratios in the loss function in order to increase stability.
 ;; The probability ratio is clipped to stay below $1+\epsilon$ for positive advantages and to stay above $1-\epsilon$ for negative advantages.
 ;;
-;; $L^{CLIP}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\min(r_t(\theta) \hat{A}_t, \mathop{\operatorname{clip}}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A}_t)]$
+;; $$
+;; L^{CLIP}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\min(r_t(\theta) \hat{A}_t, \mathop{\operatorname{clip}}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A}_t)]
+;; $$
 ;;
 ;; See [Schulman et al.](https://arxiv.org/abs/1707.06347) for more details.
 ;;
