:description "A Clojure port of XinJingHao's PPO implementation using libpython-clj2, PyTorch, and Quil"
:image "pendulum.png"
:type :post
:date "2026-04-22"
:category :ml
:tags [:physics :machine-learning :optimization :ppo :control]}}}
|
|
            [libpython-clj2.require :refer (require-python)]
            [libpython-clj2.python :refer (py.) :as py]))

|
;; ## Motivation
;;
;; Recently I started to look into the problem of reentry trajectory planning in the context of developing the [sfsim](https://store.steampowered.com/app/3687560/sfsim/) space flight simulator.
|
;; PPO is inspired by Trust Region Policy Optimization (TRPO) but is much easier to implement.
;; Also PPO handles continuous observation and action spaces, which is important for control problems.
;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library has an implementation of PPO, TRPO, and other reinforcement learning algorithms.
;; However I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/), which is easier to follow.
;;
;; In order to use PPO with a simulation environment implemented in Clojure, and to get a better understanding of the algorithm, I decided to implement PPO in Clojure.
;;
;; ## Dependencies
;;
;; For this project we are using the following `deps.edn` file.
;; The Python setup is shown further down in this article.
;;
;; ```Clojure
;; {:deps
;;  {org.clojure/clojure {:mvn/version "1.12.4"}
;;   clj-python/libpython-clj {:mvn/version "2.026"}
;;   quil/quil {:mvn/version "4.3.1563"}
;;   org.clojure/core.async {:mvn/version "1.9.865"}}}
;; ```
;;
;; The dependencies can be pulled in using the following statement.
;; Note that `tablecloth` and `tableplot`, which are used for tables and plots below, need to be on the classpath as well.
;;
;; ```Clojure
;; (require '[clojure.math :refer (PI cos sin exp to-radians)]
;;          '[clojure.core.async :as async]
;;          '[tablecloth.api :as tc]
;;          '[scicloj.tableplot.v1.plotly :as plotly]
;;          '[quil.core :as q]
;;          '[quil.middleware :as m]
;;          '[libpython-clj2.require :refer (require-python)]
;;          '[libpython-clj2.python :refer (py.) :as py])
;; ```
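;;
;; Before the `require-python` calls further down will work, libpython-clj has to locate and initialize an embedded Python interpreter.
;; A minimal sanity check (a hypothetical snippet assuming a system Python with PyTorch installed, not part of the article's code) looks like this.
;;
;; ```Clojure
;; (py/initialize!)
;; (py/run-simple-string "import torch; print(torch.__version__)")
;; ```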
;; ## Pendulum Environment
;;
;; ![pendulum](pendulum.png)
|
  [control motor-acceleration]
  (* control motor-acceleration))

|
;; A simulation step of the pendulum is implemented using Euler integration.
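;; That is, assuming the explicit Euler scheme, the angle and angular velocity are advanced as $\theta \leftarrow \theta + \dot{\theta} \, dt$ and $\dot{\theta} \leftarrow \dot{\theta} + \ddot{\theta} \, dt$ in each step.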
(defn update-state
  "Perform simulation step of pendulum"
  ([{:keys [angle velocity t]}
|
                '[torch.nn :as nn]
                '[torch.nn.functional :as F]
                '[torch.optim :as optim]
                '[torch.distributions :refer (Beta)]
                '[torch.nn.utils :as utils])

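;; With the Python modules loaded, PyTorch can be called directly from Clojure.
;; As a quick sanity check (not part of the original article), one can seed the generator and create a random tensor.
(torch/manual_seed 42)
(torch/rand [2 3])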
|
;; ### Tensor Conversion
;;
|
;; Here (as the default in [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/)) we use the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) with parameters `alpha` and `beta` both greater than 1.0.
;; See [here](https://mathlets.org/mathlets/beta-distribution/) for an interactive visualization of the Beta distribution.
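;; As a small illustration (the parameter values of 2.0 are chosen arbitrarily here and are not part of the actor code), a Beta distribution can be constructed, sampled, and queried for the log-probability of the sample as follows.
(let [dist (Beta (torch/tensor [2.0]) (torch/tensor [2.0]))
      sample (py. dist sample)]
  [sample (py. dist log_prob sample)])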
(defn indeterministic-act
  "Sample action using actor network returning random action and log-probability"
  [actor]
  (fn indeterministic-act-with-actor [observation]
    (without-gradient
|
    (plotly/base {:=title "Actor output for a single observation" :=mode :lines})
    (plotly/layer-point {:=x :x :=y :y}))))

|
;; Finally we can also query the entropy of the distribution.
;; By incorporating the entropy into the loss function later on, we can encourage exploration and prevent the probability density function from collapsing.
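;; For reference, the differential entropy of the Beta distribution is $H(\alpha, \beta) = \ln B(\alpha, \beta) - (\alpha - 1) \psi(\alpha) - (\beta - 1) \psi(\beta) + (\alpha + \beta - 2) \psi(\alpha + \beta)$, where $\psi$ denotes the digamma function.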
(defn entropy-of-distribution
  "Get entropy of distribution"
|
      0.0
      (reverse (map vector deltas dones truncates)))))))

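;; Note that the backward pass above implements the recursion of generalized advantage estimation, $\hat{A}_t = \delta_t + \gamma \lambda \hat{A}_{t+1}$, where the product $\gamma \lambda$ is passed as a single discount argument and the accumulated advantage is reset at episode ends.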
|
;; For example when all rewards are 1.0 and using a discount factor of 0.5, the advantages approach 2.0 asymptotically when going backwards in time (the geometric series $1 + 0.5 + 0.25 + \ldots$ converges to $2$).
(advantages {:dones [false false false] :truncates [false false false]}
            [1.0 1.0 1.0]
            0.5
|

;; ### Actor Loss Function
;;
;; The core of the actor loss function relies on the ratio of the action probabilities under the updated and the old policy (the actor network output).
;; The ratio is defined as $r_t(\theta)=\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\operatorname{old}}}(a_t|s_t)}$.
;; Note that $r_t(\theta)$ here refers to the probability ratio as opposed to the reward of the previous section.

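;; Since the actor returns log-probabilities, the ratio can be computed as the exponential of the difference of the log-probabilities.
;; Below is a small sketch with placeholder tensor names (not the article's actual variables).
(defn probability-ratio-sketch
  "Hypothetical helper computing the probability ratio from log-probabilities"
  [new-log-probs old-log-probs]
  (torch/exp (torch/sub new-log-probs old-log-probs)))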
|
;;
;; $L^{CPI}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{\operatorname{old}}}(a_t|s_t)} \hat{A}_t] = \mathop{\hat{\mathbb{E}}}_t [r_t(\theta) \hat{A}_t]$
;;
;; The core idea of PPO is to use clipped probability ratios in the loss function in order to increase stability.
;; The probability ratio is clipped to stay below $1+\epsilon$ for positive advantages and to stay above $1-\epsilon$ for negative advantages.
;;
;; $L^{CLIP}(\theta) = \mathop{\hat{\mathbb{E}}}_t [\min(r_t(\theta) \hat{A}_t, \mathop{\operatorname{clip}}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A}_t)]$
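;;
;; The following is a minimal sketch of this clipped objective using PyTorch operations (placeholder names; the article's actual actor loss additionally incorporates the entropy bonus mentioned earlier).
(defn clipped-surrogate-sketch
  "Hypothetical clipped PPO objective, negated so that an optimizer can minimize it"
  [ratio advantage epsilon]
  (let [clipped (torch/clamp ratio (- 1.0 epsilon) (+ 1.0 epsilon))]
    (torch/neg (torch/mean (torch/minimum (torch/mul ratio advantage)
                                          (torch/mul clipped advantage))))))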
|
                               (max -1.0
                                    (- 1.0 (/ (q/mouse-x)
                                              (/ (q/width) 2.0)))))})
              state (update-state state action config)]
          (when (done? state config) (async/close! done-chan))
          (reset! last-action action)
          state))
   :draw #(draw-state % @last-action)
|
  (System/exit 0))

|
;; Here is a small demo video of the pendulum being controlled using the actor network.
;; You can find a repository with the code of this article as well as unit tests at [github.com/wedesoft/ppo](https://github.com/wedesoft/ppo).
;;
;; ![pendulum video](pendulum.mp4)
;;
;; Enjoy!