Commit 1e3581d
Explained pendulum environment
1 parent 208ac44 commit 1e3581d

2 files changed

Lines changed: 180 additions & 2 deletions


deps.edn

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@
     generateme/fastmath {:mvn/version "3.0.0-alpha4"}
     clojure2d/clojure2d {:mvn/version "1.5.0-alpha1"}
     org.scicloj/wadogo {:mvn/version "1.1.0-alpha1"}
+    quil/quil {:mvn/version "4.3.1563"}

     com.github.psambit9791/jdsp {:mvn/version "3.1.0"}

src/ppo/main.clj

Lines changed: 179 additions & 2 deletions
@@ -11,12 +11,15 @@
     :tags [:physics :machine-learning :optimization :ppo :control]}}}

 (ns ppo.main
-  (:require [libpython-clj2.require :refer (require-python)]))
+  (:require [clojure.math :refer (PI cos sin)]
+            [libpython-clj2.require :refer (require-python)]))

 (require-python '[torch :as torch])

+;; ## Motivation
+;;
 ;; Recently I started to look into the problem of reentry trajectory planning in the context of developing the [sfsim](https://store.steampowered.com/app/3687560/sfsim/) space flight simulator.
-;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/).
+;; I had looked into reinforcement learning before and tried out Q-learning using the [lunar lander reference environment of OpenAI's gym library](https://gymnasium.farama.org/environments/box2d/lunar_lander/) (now maintained by the Farama Foundation).
 ;; However, I had stability issues.
 ;; The algorithm would learn a strategy and then suddenly diverge again.
 ;;

@@ -25,4 +28,178 @@
2528
;; The [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) Python library has a implementation of PPO, TRPO, and other reinforcement learning algorithms.
2629
;; However I found [XinJingHao's PPO implementation](https://github.com/XinJingHao/PPO-Continuous-Pytorch/) which I found easier to follow.
2730
;;
31+
;; In order to use PPO with a simulation environment in Clojure and also in order to get a better understanding of PPO, I dediced to do an implementation of PPO in Clojure.
32+
;;
33+
;; ## Pendulum environment
34+
;;
2835
;; ![pendulum](pendulum.png)
36+
;;
37+
;; First we implement a simple pendulum environment to test the PPO algorithm.
38+
;; In order to be able to switch environments, we define a protocol according to the environment abstract class used in OpenAI's gym.
39+
(defprotocol Environment
40+
(environment-update [this action])
41+
(environment-observation [this])
42+
(environment-done? [this])
43+
(environment-truncate? [this])
44+
(environment-reward [this action]))
45+
46+
;; Here is a configuration for testing the pendulum.
47+
(def frame-rate 20)
48+
49+
(def config
50+
{:length (/ 2.0 3.0)
51+
:max-speed 8.0
52+
:motor 6.0
53+
:gravitation 10.0
54+
:dt (/ 1.0 frame-rate)
55+
:save false
56+
:timeout 10.0
57+
:angle-weight 1.0
58+
:velocity-weight 0.1
59+
:control-weight 0.0001})
60+
61+
;; ### Setup
62+
;;
63+
;; A method to initialise the pendulum is defined.
64+
(defn setup
65+
"Initialise pendulum"
66+
[angle velocity]
67+
{:angle angle
68+
:velocity velocity
69+
:t 0.0})
70+
71+
;; Same as in OpenAI's gym the angle is zero when the pendulum is pointing up.
72+
;; Here a pendulum is initialised to be pointing down and with an angular velocity of 0.5.
73+
(setup (/ PI 2) 0.5)
74+
75+
;; ### State updates
76+
;;
77+
;; The angular acceleration due to gravitation is implemented as follows.
78+
(defn pendulum-gravity
79+
"Determine angular acceleration due to gravity"
80+
[gravitation length angle]
81+
(/ (* (sin angle) gravitation) length))
82+
83+
;; The angular acceleration depends on the gravitation, length of pendulum, and angle of pendulum.
84+
(pendulum-gravity 9.81 1.0 0.0)
85+
(pendulum-gravity 9.81 1.0 (/ PI 2))
86+
(pendulum-gravity 9.81 2.0 (/ PI 2))
87+
88+
;; The motor is controlled using an input value between -1 and 1.
89+
;; This value is simply multiplied with the maximum acceleration provided by the motor.
90+
(defn motor-acceleration
91+
"Angular acceleration from motor"
92+
[control motor-acceleration]
93+
(* control motor-acceleration))
94+
95+
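
;; For example, with the configured maximum motor acceleration of 6.0, a control input of 1.0 yields the full
;; acceleration, and a control input of -0.5 yields half the maximum acceleration in the opposite direction.
(motor-acceleration 1.0 6.0)
(motor-acceleration -0.5 6.0)
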
;; A simulation step of the pendulum is implemented as follows.
(defn update-state
  "Perform simulation step of pendulum"
  ([{:keys [angle velocity t]} {:keys [control]} {:keys [dt motor gravitation length max-speed]}]
   (let [gravity (pendulum-gravity gravitation length angle)
         motor (motor-acceleration control motor)
         t (+ t dt)
         acceleration (+ motor gravity)
         velocity (max (- max-speed) (min max-speed (+ velocity (* acceleration dt))))
         angle (+ angle (* velocity dt))]
     {:angle angle
      :velocity velocity
      :t t})))

;; Here are a few examples of advancing the state in different situations.
(update-state {:angle PI :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle PI :velocity 0.1 :t 0.0} {:control 0.0} config)
(update-state {:angle (/ PI 2) :velocity 0.0 :t 0.0} {:control 0.0} config)
(update-state {:angle 0.0 :velocity 0.0 :t 0.0} {:control 1.0} config)

;; ### Observation
;;
;; The observation of the pendulum state uses the cosine and sine of the angle to resolve the wrap-around problem of angles.
;; The angular speed is normalized to be between -1 and 1.
(defn observation
  "Get observation from state"
  [{:keys [angle velocity]} {:keys [max-speed]}]
  [(cos angle) (sin angle) (/ velocity max-speed)])

;; The observation of the pendulum is a vector with 3 elements.
(observation {:angle 0.0 :velocity 0.0} config)
(observation {:angle 0.0 :velocity 0.5} config)
(observation {:angle (/ PI 2) :velocity 0.0} config)

;; ### Action
;;
;; The action of the pendulum is a vector with one element between 0 and 1.
;; The following method converts it to an action hashmap used by the pendulum environment.
(defn action
  "Convert array to action"
  [array]
  {:control (max -1.0 (min 1.0 (- (* 2.0 (first array)) 1.0)))})

;; The following examples show how the action vector is mapped to a control input between -1 and 1.
(action [0.0])
(action [0.5])
(action [1.0])

;; ### Termination
;;
;; The truncate method is used to stop a pendulum run after a specified amount of time.
(defn truncate?
  "Decide whether a run should be aborted"
  ([{:keys [t]} {:keys [timeout]}]
   (>= t timeout)))

(truncate? {:t 50.0} {:timeout 100.0})
(truncate? {:t 100.0} {:timeout 100.0})

;; It is also possible to define a termination condition.
;; For the pendulum environment we specify that it never terminates.
(defn done?
  "Decide whether pendulum achieved target state"
  ([_state _config]
   false))
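
;; Regardless of the state, the result is always false.
(done? (setup PI 0.0) config)
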
;; ### Reward
;;
;; The following method normalizes an angle to be between -PI and +PI.
(defn normalize-angle
  "Angular deviation from up angle"
  [angle]
  (- (mod (+ angle PI) (* 2 PI)) PI))
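
;; For example, an angle of 1.5 * PI wraps around to -0.5 * PI, and 2.5 * PI to 0.5 * PI.
(normalize-angle (* 1.5 PI))
(normalize-angle (* 2.5 PI))
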
;; We also need the square of a number.
(defn sqr
  "Square of number"
  [x]
  (* x x))
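
(sqr 3.0)
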
;; The reward function penalises deviation from the upright position, non-zero velocities, and non-zero control input.
(defn reward
  "Reward function"
  [{:keys [angle velocity]} {:keys [angle-weight velocity-weight control-weight]} {:keys [control]}]
  (- (+ (* angle-weight (sqr (normalize-angle angle)))
        (* velocity-weight (sqr velocity))
        (* control-weight (sqr control)))))
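
;; The reward is highest (zero) for the pendulum resting in the upright position and becomes more negative
;; the further the state deviates from it.
;; Hanging down at rest, for example, is penalised with a reward of minus PI squared.
(reward {:angle 0.0 :velocity 0.0} config {:control 0.0})
(reward {:angle PI :velocity 0.0} config {:control 0.0})
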
;; ### Environment protocol
;;
;; Finally we are able to implement the pendulum as a generic environment.
(defrecord Pendulum [config state]
  Environment
  (environment-update [_this input]
    (->Pendulum config (update-state state (action input) config)))
  (environment-observation [_this]
    (observation state config))
  (environment-done? [_this]
    (done? state config))
  (environment-truncate? [_this]
    (truncate? state config))
  (environment-reward [_this input]
    (reward state config (action input))))
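
;; As a small usage sketch (the var pendulum is only used here for illustration), we create an environment with
;; the pendulum hanging down, query its observation and reward, and advance it by one step with the neutral
;; control input of 0.5 (which maps to zero motor acceleration).
(def pendulum (->Pendulum config (setup PI 0.0)))

(environment-observation pendulum)
(environment-reward pendulum [0.5])
(environment-observation (environment-update pendulum [0.5]))
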
;; The following factory method creates an environment with an initial random state covering all possible pendulum states.
(defn pendulum-factory
  []
  (let [angle (- (rand (* 2.0 PI)) PI)
        max-speed (:max-speed config)
        velocity (- (rand (* 2.0 max-speed)) max-speed)]
    (->Pendulum config (setup angle velocity))))
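
;; Since the initial state is random, the observation of a freshly created environment differs from run to run.
(environment-observation (pendulum-factory))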
