Commit 1e3581d
Explained pendulum environment
1 parent 208ac44 commit 1e3581d

2 files changed

Lines changed: 180 additions & 2 deletions


deps.edn

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@
     generateme/fastmath {:mvn/version "3.0.0-alpha4"}
     clojure2d/clojure2d {:mvn/version "1.5.0-alpha1"}
     org.scicloj/wadogo {:mvn/version "1.1.0-alpha1"}
+    quil/quil {:mvn/version "4.3.1563"}

     com.github.psambit9791/jdsp {:mvn/version "3.1.0"}

src/ppo/main.clj

Lines changed: 179 additions & 2 deletions
@@ -11,12 +11,15 @@
     :tags [:physics :machine-learning :optimization :ppo :control]}}}

 (ns ppo.main
-  (:require [libpython-clj2.require :refer (require-python)]))
+  (:require [clojure.math :refer (PI cos sin)]
+            [libpython-clj2.require :refer (require-python)]))

 (require-python '[torch :as torch])

+;; ## Motivation
+;;
 ;; Recently I started to look into the problem of reentry trajectory planning in the context of developing the [sfsim](https://store.steampowered.com/app/3687560/sfsim/) space flight simulator.
-;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/).
+;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/) (now maintained by the Farama Foundation).
 ;; However, I had stability issues.
 ;; The algorithm would learn a strategy and then suddenly diverge again.
 ;;

@@ -25,4 +28,178 @@
2528
;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library has a implementation of PPO, TRPO, and other reinforcement learning algorithms.
2629
;; However I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/) which I found easier to follow.
2730
;;
31+
;; In order to use PPO with a simulation environment in Clojure and also in order to get a better understanding of PPO, I dediced to do an implementation of PPO in Clojure.
32+
;;
33+
;; ## Pendulum environment
34+
;;
2835
;; ![pendulum](pendulum.png)
36+
;;
37+
;; First we implement a simple pendulum environment to test the PPO algorithm.
38+
;; In order to be able to switch environments, we define a protocol according to the environment abstract class used in OpenAI's gym.
39+
(defprotocol Environment
40+
(environment-update [this action])
41+
(environment-observation [this])
42+
(environment-done? [this])
43+
(environment-truncate? [this])
44+
(environment-reward [this action]))
45+
46+
;; Here is a configuration for testing the pendulum.
47+
(def frame-rate 20)
48+
49+
(def config
50+
{:length (/ 2.0 3.0)
51+
:max-speed 8.0
52+
:motor 6.0
53+
:gravitation 10.0
54+
:dt (/ 1.0 frame-rate)
55+
:save false
56+
:timeout 10.0
57+
:angle-weight 1.0
58+
:velocity-weight 0.1
59+
:control-weight 0.0001})
60+
61+
;; ### Setup
62+
;;
63+
;; A method to initialise the pendulum is defined.
64+
(defn setup
65+
"Initialise pendulum"
66+
[angle velocity]
67+
{:angle angle
68+
:velocity velocity
69+
:t 0.0})
70+
71+
;; Same as in OpenAI's gym the angle is zero when the pendulum is pointing up.
72+
;; Here a pendulum is initialised to be pointing down and with an angular velocity of 0.5.
73+
(setup (/ PI 2) 0.5)
74+
75+
;; ### State updates
76+
;;
77+
;; The angular acceleration due to gravitation is implemented as follows.
78+
(defn pendulum-gravity
79+
"Determine angular acceleration due to gravity"
80+
[gravitation length angle]
81+
(/ (* (sin angle) gravitation) length))
82+
83+
;; The angular acceleration depends on the gravitation, length of pendulum, and angle of pendulum.
84+
(pendulum-gravity 9.81 1.0 0.0)
85+
(pendulum-gravity 9.81 1.0 (/ PI 2))
86+
(pendulum-gravity 9.81 2.0 (/ PI 2))
87+
88+
;; The motor is controlled using an input value between -1 and 1.
89+
;; This value is simply multiplied with the maximum acceleration provided by the motor.
90+
(defn motor-acceleration
91+
"Angular acceleration from motor"
92+
[control motor-acceleration]
93+
(* control motor-acceleration))
94+
95+
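
;; For example, with the configured maximum motor acceleration of 6.0, a control input of 1.0 yields the full
;; acceleration, and a control input of -0.5 yields half the maximum acceleration in the opposite direction.
(motor-acceleration 1.0 6.0)
(motor-acceleration -0.5 6.0)
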
;; A simulation step of the pendulum is implemented as follows.
(defn update-state
  "Perform simulation step of pendulum"
  ([{:keys [angle velocity t]} {:keys [control]} {:keys [dt motor gravitation length max-speed]}]
   (let [gravity (pendulum-gravity gravitation length angle)
         motor (motor-acceleration control motor)
         t (+ t dt)
         acceleration (+ motor gravity)
         velocity (max (- max-speed) (min max-speed (+ velocity (* acceleration dt))))
         angle (+ angle (* velocity dt))]
     {:angle angle
      :velocity velocity
      :t t})))

;; Here are a few examples of advancing the state in different situations.
(update-state {:angle PI :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle PI :velocity 0.1 :t 0.0} {:control 0.0} config)
(update-state {:angle (/ PI 2) :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle 0.0 :velocity 0.0 :t 0.0} {:control 1.0} config)

;; ### Observation
;;
;; The observation of the pendulum state uses the cosine and sine of the angle to resolve the wrap-around problem of angles.
;; The angular speed is normalized to be between -1 and 1.
(defn observation
  "Get observation from state"
  [{:keys [angle velocity]} {:keys [max-speed]}]
  [(cos angle) (sin angle) (/ velocity max-speed)])

;; The observation of the pendulum is a vector with 3 elements.
(observation {:angle 0.0 :velocity 0.0} config)
(observation {:angle 0.0 :velocity 0.5} config)
(observation {:angle (/ PI 2) :velocity 0.0} config)

;; ### Action
;;
;; The action of the pendulum is a vector with one element between 0 and 1.
;; The following method converts it to an action hashmap used by the pendulum environment.
(defn action
  "Convert array to action"
  [array]
  {:control (max -1.0 (min 1.0 (- (* 2.0 (first array)) 1.0)))})

;; The following examples show how the action vector is mapped to a control input between -1 and 1.
(action [0.0])
(action [0.5])
(action [1.0])

;; ### Termination
;;
;; The truncate method is used to stop a pendulum run after a specified amount of time.
(defn truncate?
  "Decide whether a run should be aborted"
  ([{:keys [t]} {:keys [timeout]}]
   (>= t timeout)))

(truncate? {:t 50.0} {:timeout 100.0})
(truncate? {:t 100.0} {:timeout 100.0})

;; It is also possible to define a termination condition.
;; For the pendulum environment we specify that it never terminates.
(defn done?
  "Decide whether pendulum achieved target state"
  ([_state _config]
   false))
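
;; Regardless of the state, the result is always false.
(done? (setup PI 0.0) config)
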
;; ### Reward
;;
;; The following method normalizes an angle to be between -PI and +PI.
(defn normalize-angle
  "Angular deviation from up angle"
  [angle]
  (- (mod (+ angle PI) (* 2 PI)) PI))
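
;; For example, an angle of 1.5 * PI wraps around to -0.5 * PI, and 2.5 * PI to 0.5 * PI.
(normalize-angle (* 1.5 PI))
(normalize-angle (* 2.5 PI))
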
;; We also need the square of a number.
(defn sqr
  "Square of number"
  [x]
  (* x x))
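
(sqr 3.0)
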
;; The reward function penalises deviation from the upright position, non-zero velocities, and non-zero control input.
(defn reward
  "Reward function"
  [{:keys [angle velocity]} {:keys [angle-weight velocity-weight control-weight]} {:keys [control]}]
  (- (+ (* angle-weight (sqr (normalize-angle angle)))
        (* velocity-weight (sqr velocity))
        (* control-weight (sqr control)))))
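
;; The reward is highest (zero) for the pendulum resting in the upright position and becomes more negative
;; the further the state deviates from it.
;; Hanging down at rest, for example, is penalised with a reward of minus PI squared.
(reward {:angle 0.0 :velocity 0.0} config {:control 0.0})
(reward {:angle PI :velocity 0.0} config {:control 0.0})
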
;; ### Environment protocol
;;
;; Finally we are able to implement the pendulum as a generic environment.
(defrecord Pendulum [config state]
  Environment
  (environment-update [_this input]
    (->Pendulum config (update-state state (action input) config)))
  (environment-observation [_this]
    (observation state config))
  (environment-done? [_this]
    (done? state config))
  (environment-truncate? [_this]
    (truncate? state config))
  (environment-reward [_this input]
    (reward state config (action input))))
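
;; As a small usage sketch (the var pendulum is only used here for illustration), we create an environment with
;; the pendulum hanging down, query its observation and reward, and advance it by one step with the neutral
;; control input of 0.5 (which maps to zero motor acceleration).
(def pendulum (->Pendulum config (setup PI 0.0)))

(environment-observation pendulum)
(environment-reward pendulum [0.5])
(environment-observation (environment-update pendulum [0.5]))
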
;; The following factory method creates an environment with an initial random state covering all possible pendulum states.
(defn pendulum-factory
  []
  (let [angle (- (rand (* 2.0 PI)) PI)
        max-speed (:max-speed config)
        velocity (- (rand (* 2.0 max-speed)) max-speed)]
    (->Pendulum config (setup angle velocity))))
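
;; Since the initial state is random, the observation of a freshly created environment differs from run to run.
(environment-observation (pendulum-factory))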
