 :external-requirements []
 :quarto {:author [:janwedekind]
          :draft true
          :description "A Clojure port of XinJingHao's PPO implementation using PyTorch and Quil"
          :image "pendulum.png"
          :type :post
          :date "2026-04-18"
          :category :ml
          :tags [:physics :machine-learning :optimization :ppo :control]}}}

(ns ppo.main
  (:require [clojure.math :refer (PI cos sin to-radians)]
            [clojure.core.async :as async]
            [quil.core :as q]
            [quil.middleware :as m]
            [libpython-clj2.require :refer (require-python)]))

(require-python '[torch :as torch])
;; ## Motivation
;;
;; Recently I started to look into the problem of reentry trajectory planning in the context of developing the [sfsim](https://store.steampowered.com/app/3687560/sfsim/) space flight simulator.
;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/) (now maintained by the Farama Foundation).
;; However, I ran into stability issues: the algorithm would learn a strategy and then suddenly diverge again.
;;
;; More recently (2017), the Proximal Policy Optimization (PPO) algorithm was published, and it has since gained popularity.
;; PPO is inspired by Trust Region Policy Optimization (TRPO) but is much easier to implement.
;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library provides implementations of PPO, TRPO, and other reinforcement learning algorithms.
;; However, I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/) easier to follow.
;;
;; To use PPO with a simulation environment in Clojure, and also to get a better understanding of the algorithm, I decided to implement PPO in Clojure.
;;
;; ## Pendulum environment
;;
;; 
;;
;; First we implement a simple pendulum environment to test the PPO algorithm.
;; To be able to switch environments later, we define a protocol modelled on the environment base class used in OpenAI's gym.
(defprotocol Environment
  (environment-update [this action])
  (environment-observation [this])
  (environment-done? [this])
  (environment-truncate? [this])
  (environment-reward [this action]))
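;; The pendulum record further below is the real implementation of this protocol; purely as an illustration of its shape, a hypothetical do-nothing environment could be reified like this.
(comment
  ;; hypothetical stub environment, only for illustration, not used elsewhere
  (reify Environment
    (environment-update [this _action] this)
    (environment-observation [_this] [0.0 0.0 0.0])
    (environment-done? [_this] false)
    (environment-truncate? [_this] false)
    (environment-reward [_this _action] 0.0)))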

;; Here is a configuration for testing the pendulum.
(def frame-rate 20)

(def config
  {:length (/ 2.0 3.0)
   :max-speed 8.0
   :motor 6.0
   :gravitation 10.0
   :dt (/ 1.0 frame-rate)
   :save false
   :timeout 10.0
   :angle-weight 1.0
   :velocity-weight 0.1
   :control-weight 0.0001})

;; ### Setup
;;
;; A method to initialise the pendulum is defined.
(defn setup
  "Initialise pendulum"
  [angle velocity]
  {:angle angle
   :velocity velocity
   :t 0.0})

;; As in OpenAI's gym, the angle is zero when the pendulum is pointing up.
;; Here a pendulum is initialised pointing sideways (an angle of PI/2) with an angular velocity of 0.5.
(setup (/ PI 2) 0.5)

;; ### State updates
;;
;; The angular acceleration due to gravitation is implemented as follows.
(defn pendulum-gravity
  "Determine angular acceleration due to gravity"
  [gravitation length angle]
  (/ (* (sin angle) gravitation) length))

;; The angular acceleration depends on the gravitational acceleration, the length of the pendulum, and its angle.
(pendulum-gravity 9.81 1.0 0.0)
(pendulum-gravity 9.81 1.0 (/ PI 2))
(pendulum-gravity 9.81 2.0 (/ PI 2))

;; The motor is controlled using an input value between -1 and 1.
;; This value is simply multiplied by the maximum angular acceleration provided by the motor.
(defn motor-acceleration
  "Angular acceleration from motor"
  [control motor-acceleration]
  (* control motor-acceleration))
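
;; For example, with the maximum motor acceleration of 6.0 from the configuration above, full, half, and negative full control input give the following angular accelerations.
(motor-acceleration 1.0 (:motor config))
(motor-acceleration 0.5 (:motor config))
(motor-acceleration -1.0 (:motor config))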

;; A simulation step of the pendulum is implemented as follows.
(defn update-state
  "Perform simulation step of pendulum"
  ([{:keys [angle velocity t]} {:keys [control]} {:keys [dt motor gravitation length max-speed]}]
   (let [gravity (pendulum-gravity gravitation length angle)
         motor (motor-acceleration control motor)
         t (+ t dt)
         acceleration (+ motor gravity)
         velocity (max (- max-speed) (min max-speed (+ velocity (* acceleration dt))))
         angle (+ angle (* velocity dt))]
     {:angle angle
      :velocity velocity
      :t t})))

;; Here are a few examples of advancing the state in different situations.
(update-state {:angle PI :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle PI :velocity 0.1 :t 0.0} {:control 0.0} config)
(update-state {:angle (/ PI 2) :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle 0.0 :velocity 0.0 :t 0.0} {:control 1.0} config)

;; ### Observation
;;
;; The observation of the pendulum state uses the cosine and sine of the angle to avoid the wrap-around problem of angles.
;; The angular speed is normalized to be between -1 and 1.
(defn observation
  "Get observation from state"
  [{:keys [angle velocity]} {:keys [max-speed]}]
  [(cos angle) (sin angle) (/ velocity max-speed)])

;; The observation of the pendulum is a vector with 3 elements.
(observation {:angle 0.0 :velocity 0.0} config)
(observation {:angle 0.0 :velocity 0.5} config)
(observation {:angle (/ PI 2) :velocity 0.0} config)

;; ### Action
;;
;; The action of the pendulum is a vector with one element between 0 and 1.
;; The following method converts it to an action hashmap used by the pendulum environment.
(defn action
  "Convert array to action"
  [array]
  {:control (max -1.0 (min 1.0 (- (* 2.0 (first array)) 1.0)))})

;; The following examples show how the action vector is mapped to a control input between -1 and 1.
(action [0.0])
(action [0.5])
(action [1.0])

;; ### Termination
;;
;; The truncate method is used to stop a pendulum run after a specific amount of time.
(defn truncate?
  "Decide whether a run should be aborted"
  ([{:keys [t]} {:keys [timeout]}]
   (>= t timeout)))

(truncate? {:t 50.0} {:timeout 100.0})
(truncate? {:t 100.0} {:timeout 100.0})

;; It is also possible to define a termination condition.
;; For the pendulum environment we specify that it never terminates.
(defn done?
  "Decide whether pendulum achieved target state"
  ([_state _config]
   false))
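
;; Whatever state is passed in, the pendulum never reports a terminal state.
(done? (setup PI 0.0) config)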

;; ### Reward
;;
;; The following method normalizes an angle to be between -PI and +PI.
(defn normalize-angle
  "Angular deviation from up angle"
  [angle]
  (- (mod (+ angle PI) (* 2 PI)) PI))
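
;; Angles inside the range stay unchanged, while angles outside of it are wrapped back into -PI to +PI.
(normalize-angle 0.5)
(normalize-angle (* 1.5 PI))
(normalize-angle (* -1.5 PI))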

;; We also need the square of a number.
(defn sqr
  "Square of number"
  [x]
  (* x x))

;; The reward function penalises deviation from the upright position, non-zero velocities, and non-zero control input.
(defn reward
  "Reward function"
  [{:keys [angle velocity]} {:keys [angle-weight velocity-weight control-weight]} {:keys [control]}]
  (- (+ (* angle-weight (sqr (normalize-angle angle)))
        (* velocity-weight (sqr velocity))
        (* control-weight (sqr control)))))
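
;; The reward is highest (zero) for the pendulum resting upright and becomes more negative when hanging down, moving fast, or using a lot of control input.
(reward {:angle 0.0 :velocity 0.0} config {:control 0.0})
(reward {:angle PI :velocity 0.0} config {:control 0.0})
(reward {:angle PI :velocity 8.0} config {:control 1.0})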

;; ### Environment protocol
;;
;; Finally we are able to implement the pendulum as a generic environment.
(defrecord Pendulum [config state]
  Environment
  (environment-update [_this input]
    (->Pendulum config (update-state state (action input) config)))
  (environment-observation [_this]
    (observation state config))
  (environment-done? [_this]
    (done? state config))
  (environment-truncate? [_this]
    (truncate? state config))
  (environment-reward [_this input]
    (reward state config (action input))))
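
;; As a quick sanity check, we can create a pendulum hanging straight down (angle PI), observe it, query the reward for full positive control, and observe the state after one update.
(let [pendulum (->Pendulum config (setup PI 0.0))]
  [(environment-observation pendulum)
   (environment-reward pendulum [1.0])
   (environment-observation (environment-update pendulum [1.0]))])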

;; The following factory method creates an environment with an initial random state covering all possible pendulum states.
(defn pendulum-factory
  []
  (let [angle (- (rand (* 2.0 PI)) PI)
        max-speed (:max-speed config)
        velocity (- (rand (* 2.0 max-speed)) max-speed)]
    (->Pendulum config (setup angle velocity))))
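
;; Since the factory draws a random initial state, the exact numbers differ from run to run, but the observation is always a vector of three values.
(environment-observation (pendulum-factory))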

;; ### Visualisation
;;
;; The following method is used to draw the pendulum and visualise the motor control input.
(defn draw-state [{:keys [angle]} {:keys [control]}]
  (let [origin-x (/ (q/width) 2)
        origin-y (/ (q/height) 2)
        length (* 0.5 (q/height) (:length config))
        pendulum-x (+ origin-x (* length (sin angle)))
        pendulum-y (- origin-y (* length (cos angle)))
        size (* 0.05 (q/height))
        arc-radius (* (abs control) 0.2 (q/height))
        positive (pos? control)
        tip-angle (if positive 225 -45)]
    (q/frame-rate frame-rate)
    (q/background 255)
    (q/stroke-weight 5)
    (q/stroke 0)
    (q/fill 175)
    (q/line origin-x origin-y pendulum-x pendulum-y)
    (q/stroke-weight 1)
    (q/ellipse pendulum-x pendulum-y size size)
    (q/no-fill)
    (q/arc origin-x origin-y (* 2 arc-radius) (* 2 arc-radius) (to-radians -45) (to-radians 225))
    (q/with-translation [(+ origin-x (* (cos (to-radians tip-angle)) arc-radius)) (+ origin-y (* (sin (to-radians tip-angle)) arc-radius))]
      (q/with-rotation [(to-radians (if positive 225 -45))]
        (q/triangle 0 (if positive 10 -10) -5 0 5 0)))
    (when (:save config)
      (q/save-frame "frame-####.png"))))

;; ### Animation
;;
;; The following method animates the pendulum and allows controlling the motor with the mouse.
(defn run []
  (let [done-chan (async/chan)
        last-action (atom {:control 0.0})]
    (q/sketch
      :title "Inverted Pendulum with Mouse Control"
      :size [854 480]
      :setup #(setup PI 0.0)
      :update (fn [state]
                (let [action {:control (min 1.0 (max -1.0 (- 1.0 (/ (q/mouse-x) (/ (q/width) 2.0)))))}
                      state (update-state state action config)]
                  (when (done? state config) (async/close! done-chan))
                  (reset! last-action action)
                  state))
      :draw #(draw-state % @last-action)
      :middleware [m/fun-mode]
      :on-close (fn [& _] (async/close! done-chan)))
    (async/<!! done-chan))
  (System/exit 0))
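
;; Note that `run` blocks until the sketch window is closed and then terminates the JVM via `System/exit`, so evaluating it would end the notebook process; the call below is therefore wrapped in a `comment` form.
(comment
  (run))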