Skip to content

Commit de9a0bd

Browse files
authored
Merge pull request #356 from wedesoft/ppo-draft-3
PPO visualisation of pendulum
2 parents 1326f65 + 9d2853d commit de9a0bd

2 files changed

Lines changed: 243 additions & 3 deletions

File tree

deps.edn

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
generateme/fastmath {:mvn/version "3.0.0-alpha4"}
6868
clojure2d/clojure2d {:mvn/version "1.5.0-alpha1"}
6969
org.scicloj/wadogo {:mvn/version "1.1.0-alpha1"}
70+
quil/quil {:mvn/version "4.3.1563"}
7071

7172
com.github.psambit9791/jdsp {:mvn/version "3.1.0"}
7273

src/ppo/main.clj

Lines changed: 242 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,257 @@
33
:external-requirements []
44
:quarto {:author [:janwedekind]
55
:draft true
6-
:description "A Clojure port of Jinghao's PPO implementation using Pytorch and Quil"
6+
:description "A Clojure port of XinJingHao's PPO implementation using Pytorch and Quil"
77
:image "pendulum.png"
88
:type :post
99
:date "2026-04-18"
1010
:category :ml
1111
:tags [:physics :machine-learning :optimization :ppo :control]}}}
1212

1313
(ns ppo.main
14-
(:require [libpython-clj2.require :refer (require-python)]))
14+
(:require [clojure.math :refer (PI cos sin to-radians)]
15+
[clojure.core.async :as async]
16+
[quil.core :as q]
17+
[quil.middleware :as m]
18+
[libpython-clj2.require :refer (require-python)]))
1519

1620
;; Load the Python torch module through libpython-clj.
(require-python '[torch :as torch])
1721

22+
;; ## Motivation
23+
;;
1824
;; Recently I started to look into the problem of reentry trajectory planning in the context of developing the [sfsim](https://store.steampowered.com/app/3687560/sfsim/) space flight simulator.
19-
25+
;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/) (now maintained by the Farama Foundation).
26+
;; However I had stability issues.
27+
;; The algorithm would learn a strategy and then suddenly diverge again.
28+
;;
29+
;; More recently (2017) the Proximal Policy Optimization (PPO) algorithm was published and it has gained in popularity.
30+
;; PPO is inspired by Trust Region Policy Optimization (TRPO) but is much easier to implement.
31+
;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library has an implementation of PPO, TRPO, and other reinforcement learning algorithms.
32+
;; However I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/) easier to follow.
33+
;;
34+
;; In order to use PPO with a simulation environment in Clojure and also in order to get a better understanding of PPO, I decided to do an implementation of PPO in Clojure.
35+
;;
36+
;; ## Pendulum environment
37+
;;
2038
;; ![pendulum](pendulum.png)
39+
;;
40+
;; First we implement a simple pendulum environment to test the PPO algorithm.
41+
;; In order to be able to switch environments, we define a protocol according to the environment abstract class used in OpenAI's gym.
42+
(defprotocol Environment
  "Reinforcement-learning environment abstraction modelled after the
  environment class of OpenAI's gym library."
  (environment-update [this action]
    "Advance the environment by one step using the given action vector and return the updated environment.")
  (environment-observation [this]
    "Observation vector for the current state.")
  (environment-done? [this]
    "True when the environment has reached a terminal state.")
  (environment-truncate? [this]
    "True when the episode should be cut short (e.g. a timeout was reached).")
  (environment-reward [this action]
    "Reward for taking the given action vector in the current state."))
48+
49+
;; Here is a configuration for testing the pendulum.
50+
(def frame-rate
  "Frames per second, used both for the simulation step size (:dt in config)
  and for the Quil display rate in draw-state."
  20)
51+
52+
(def config
  "Configuration for testing the pendulum."
  {:length (/ 2.0 3.0)       ; pendulum length (also scales the drawing)
   :max-speed 8.0            ; angular velocity is clamped to +/- this value
   :motor 6.0                ; maximum angular acceleration of the motor
   :gravitation 10.0         ; gravitational acceleration
   :dt (/ 1.0 frame-rate)    ; simulation time step
   :save false               ; when true, draw-state saves each frame as a PNG
   :timeout 10.0             ; truncate? aborts a run after this much simulated time
   :angle-weight 1.0         ; reward penalty weight for deviation from upright
   :velocity-weight 0.1      ; reward penalty weight for angular velocity
   :control-weight 0.0001})  ; reward penalty weight for control effort
63+
64+
;; ### Setup
65+
;;
66+
;; A method to initialise the pendulum is defined.
67+
(defn setup
  "Create the initial pendulum state from an angle (radians, zero pointing up)
  and an angular velocity; the simulation time starts at zero."
  [initial-angle initial-velocity]
  (hash-map :angle initial-angle
            :velocity initial-velocity
            :t 0.0))
73+
74+
;; Same as in OpenAI's gym the angle is zero when the pendulum is pointing up.
75+
;; Here a pendulum is initialised to be pointing down and with an angular velocity of 0.5.
76+
;; => {:angle 1.5707963267948966, :velocity 0.5, :t 0.0}
(setup (/ PI 2) 0.5)
77+
78+
;; ### State updates
79+
;;
80+
;; The angular acceleration due to gravitation is implemented as follows.
81+
(defn pendulum-gravity
  "Angular acceleration of the pendulum due to gravity.
  Zero when the pendulum points straight up (angle 0) and largest when it is
  horizontal."
  [gravitation length angle]
  (let [tangential-acceleration (* (sin angle) gravitation)]
    (/ tangential-acceleration length)))
85+
86+
;; The angular acceleration depends on the gravitation, length of pendulum, and angle of pendulum.
87+
(pendulum-gravity 9.81 1.0 0.0)        ; => 0.0 (pointing straight up)
(pendulum-gravity 9.81 1.0 (/ PI 2))   ; => 9.81 (horizontal: full gravitational acceleration)
(pendulum-gravity 9.81 2.0 (/ PI 2))   ; => 4.905 (doubling the length halves the acceleration)
90+
91+
;; The motor is controlled using an input value between -1 and 1.
92+
;; This value is simply multiplied by the maximum acceleration provided by the motor.
93+
(defn motor-acceleration
  "Angular acceleration produced by the motor.

  `control` is the control input between -1 and 1 and `max-acceleration` is
  the largest angular acceleration the motor can deliver.  The second
  parameter was renamed from `motor-acceleration`, which shadowed the
  function's own name inside the body."
  [control max-acceleration]
  (* control max-acceleration))
97+
98+
;; A simulation step of the pendulum is implemented as follows.
99+
(defn update-state
  "Advance the pendulum state by one time step of length dt.
  Sums the motor and gravity accelerations, integrates the angular velocity
  (clamped to [-max-speed, max-speed]) and then integrates the angle with the
  already-updated velocity."
  [{:keys [angle velocity t]} {:keys [control]} {:keys [dt motor gravitation length max-speed]}]
  (let [drive        (motor-acceleration control motor)
        gravity      (pendulum-gravity gravitation length angle)
        acceleration (+ drive gravity)
        new-velocity (-> (+ velocity (* acceleration dt))
                         (min max-speed)
                         (max (- max-speed)))]
    {:angle    (+ angle (* new-velocity dt))
     :velocity new-velocity
     :t        (+ t dt)}))
111+
112+
;; Here are a few examples for advancing the state in different situations.
113+
(update-state {:angle PI :velocity 0.0 :t 0.0} {:control 0.0} config)        ; hanging straight down at rest: stays put (up to floating point)
(update-state {:angle PI :velocity 0.1 :t 0.0} {:control 0.0} config)        ; a small velocity advances the angle
(update-state {:angle (/ PI 2) :velocity 0.0 :t 0.0} {:control 0.0} config)  ; horizontal: gravity accelerates the pendulum
(update-state {:angle 0.0 :velocity 0.0 :t 0.0} {:control 1.0} config)       ; upright: only the motor acts (sin 0 = 0)
117+
118+
;; ### Observation
119+
;;
120+
;; The observation of the pendulum state uses the cosine and sine of the angle to resolve the wrap-around problem of angles.
121+
;; The angular speed is normalized to be between -1 and 1.
122+
(defn observation
  "Map the pendulum state to an observation vector.
  Using both cos and sin of the angle avoids the wrap-around discontinuity,
  and the angular velocity is divided by max-speed so it lies in [-1, 1]."
  [{:keys [angle velocity]} {:keys [max-speed]}]
  (let [x-component      (cos angle)
        y-component      (sin angle)
        normalised-speed (/ velocity max-speed)]
    [x-component y-component normalised-speed]))
126+
127+
;; The observation of the pendulum is a vector with 3 elements.
128+
(observation {:angle 0.0 :velocity 0.0} config)       ; => [1.0 0.0 0.0]
(observation {:angle 0.0 :velocity 0.5} config)       ; => [1.0 0.0 0.0625]
(observation {:angle (/ PI 2) :velocity 0.0} config)  ; => [~0.0 1.0 0.0] (cos of PI/2 is ~6e-17 in floating point)
131+
132+
;; ### Action
133+
;;
134+
;; The action of a pendulum is a vector with one element between 0 and 1.
135+
;; The following method converts it to an action hashmap used by the pendulum environment.
136+
(defn action
  "Convert a one-element vector with a value in [0, 1] to the action hashmap
  used by the pendulum environment, with :control clamped to [-1, 1]."
  [array]
  (let [scaled  (- (* 2.0 (first array)) 1.0)
        clamped (-> scaled (min 1.0) (max -1.0))]
    {:control clamped}))
140+
141+
;; The following examples show how the action vector is mapped to a control input between -1 and 1.
142+
(action [0.0])  ; => {:control -1.0}
(action [0.5])  ; => {:control 0.0}
(action [1.0])  ; => {:control 1.0}
145+
146+
;; ### Termination
147+
;;
148+
;; The truncate method is used to stop a pendulum run after a specific amount of time.
149+
(defn truncate?
  "True when the elapsed simulation time has reached the configured timeout,
  i.e. the run should be aborted."
  [{:keys [t]} {:keys [timeout]}]
  (>= t timeout))
153+
154+
(truncate? {:t 50.0} {:timeout 100.0})   ; => false
(truncate? {:t 100.0} {:timeout 100.0})  ; => true
156+
157+
;; It is also possible to define a termination condition.
158+
;; For the pendulum environment we specify that it never terminates.
159+
(defn done?
  "Termination test for the pendulum environment.
  The pendulum task never reaches a terminal state, so this is always false."
  [_state _config]
  false)
163+
164+
;; ### Reward
165+
;;
166+
;; The following method normalizes an angle to be between -PI and +PI.
167+
(defn normalize-angle
  "Wrap an angle so the result lies in the interval [-PI, PI), measuring the
  deviation from the upright (zero) angle."
  [angle]
  (let [two-pi  (* 2 PI)
        shifted (+ angle PI)]
    (- (mod shifted two-pi) PI)))
171+
172+
;; We also need the square of a number.
173+
(defn sqr
  "Multiply a number by itself."
  [n]
  (* n n))
177+
178+
;; The reward function penalises deviation from the upright position, non-zero velocities, and non-zero control input.
179+
(defn reward
  "Reward function of the pendulum environment.
  Returns the negated weighted sum of three penalty terms: squared angular
  deviation from upright, squared angular velocity, and squared control
  input."
  [{:keys [angle velocity]} {:keys [angle-weight velocity-weight control-weight]} {:keys [control]}]
  (let [angle-penalty    (* angle-weight (sqr (normalize-angle angle)))
        velocity-penalty (* velocity-weight (sqr velocity))
        control-penalty  (* control-weight (sqr control))]
    (- (+ angle-penalty velocity-penalty control-penalty))))
185+
186+
;; ### Environment protocol
187+
;;
188+
;; Finally we are able to implement the pendulum as a generic environment.
189+
(defrecord Pendulum [config state]
  Environment
  ;; Advance the simulation by one step; returns a fresh Pendulum value.
  (environment-update [_this input]
    (->Pendulum config (update-state state (action input) config)))
  ;; Observation vector [cos-angle sin-angle normalised-velocity].
  (environment-observation [_this]
    (observation state config))
  ;; Always false: done? never terminates the pendulum task.
  (environment-done? [_this]
    (done? state config))
  ;; True once the configured timeout has elapsed.
  (environment-truncate? [_this]
    (truncate? state config))
  ;; Reward for applying the raw action vector in the current state.
  (environment-reward [_this input]
    (reward state config (action input))))
201+
202+
;; The following factory method creates an environment with an initial random state covering all possible pendulum states.
203+
(defn pendulum-factory
  "Create a pendulum environment with a uniformly random initial state:
  the angle is drawn from [-PI, PI) and the angular velocity from
  [-max-speed, max-speed)."
  []
  (let [random-span (fn [limit] (- (rand (* 2.0 limit)) limit))
        angle       (random-span PI)
        velocity    (random-span (:max-speed config))]
    (->Pendulum config (setup angle velocity))))
209+
210+
;; ### Visualisation
211+
;;
212+
;; The following method is used to draw the pendulum and visualise the motor control input.
213+
(defn draw-state
  "Draw the pendulum and visualise the motor control input.
  The pendulum rod is attached to the centre of the sketch.  The control
  input is shown as an arc around the pivot whose radius grows with
  |control| and an arrow head at one end of the arc indicating the sign."
  [{:keys [angle]} {:keys [control]}]
  (let [origin-x (/ (q/width) 2)
        origin-y (/ (q/height) 2)
        ;; pendulum length in pixels, scaled by the window height
        length (* 0.5 (q/height) (:length config))
        ;; angle zero points up, hence sin for x and -cos for y
        pendulum-x (+ origin-x (* length (sin angle)))
        pendulum-y (- origin-y (* length (cos angle)))
        size (* 0.05 (q/height))
        ;; arc radius proportional to the magnitude of the control input
        arc-radius (* (abs control) 0.2 (q/height))
        positive (pos? control)
        ;; the arrow head sits at one of the two ends of the arc
        tip-angle (if positive 225 -45)]
    (q/frame-rate frame-rate)
    (q/background 255)
    ;; draw the pendulum rod and bob
    (q/stroke-weight 5)
    (q/stroke 0)
    (q/fill 175)
    (q/line origin-x origin-y pendulum-x pendulum-y)
    (q/stroke-weight 1)
    (q/ellipse pendulum-x pendulum-y size size)
    ;; draw the control arc and an arrow head at its tip
    (q/no-fill)
    (q/arc origin-x origin-y (* 2 arc-radius) (* 2 arc-radius) (to-radians -45) (to-radians 225))
    (q/with-translation [(+ origin-x (* (cos (to-radians tip-angle)) arc-radius)) (+ origin-y (* (sin (to-radians tip-angle)) arc-radius))]
      (q/with-rotation [(to-radians (if positive 225 -45))]
        (q/triangle 0 (if positive 10 -10) -5 0 5 0)))
    ;; optionally save every frame, e.g. for producing an animation
    (when (:save config)
      (q/save-frame "frame-####.png"))))
238+
239+
;; ### Animation
240+
;;
241+
;; The following method animates the pendulum and facilitates mouse control.
242+
(defn run
  "Open a Quil sketch animating the pendulum with mouse control.
  The horizontal mouse position is mapped to a control input in [-1, 1]
  (window centre -> 0, left edge -> 1, right edge -> -1).  Blocks on a
  core.async channel until the window is closed (or the environment reports
  done) and then terminates the JVM."
  []
  (let [done-chan (async/chan)
        ;; remember the most recent action so the draw callback can show it
        last-action (atom {:control 0.0})]
    (q/sketch
      :title "Inverted Pendulum with Mouse Control"
      :size [854 480]
      ;; start with the pendulum pointing down and at rest
      :setup #(setup PI 0.0)
      :update (fn [state]
                (let [action {:control (min 1.0 (max -1.0 (- 1.0 (/ (q/mouse-x) (/ (q/width) 2.0)))))}
                      state (update-state state action config)]
                  ;; done? is always false for the pendulum, but honour it anyway
                  (when (done? state config) (async/close! done-chan))
                  (reset! last-action action)
                  state))
      :draw #(draw-state % @last-action)
      :middleware [m/fun-mode]
      :on-close (fn [& _] (async/close! done-chan)))
    ;; block until the sketch signals completion
    (async/<!! done-chan))
  (System/exit 0))

0 commit comments

Comments
 (0)