Commit e740093

Merge pull request #358 from wedesoft/ppo-draft-5: PPO draft 5

2 parents a41c91f + e6a3fc5

1 file changed: src/ppo/main.clj (254 additions, 6 deletions)
@@ -11,8 +11,10 @@
  :tags [:physics :machine-learning :optimization :ppo :control]}}}

(ns ppo.main
  (:require [clojure.math :refer (PI cos sin exp to-radians)]
            [clojure.core.async :as async]
            [tablecloth.api :as tc]
            [scicloj.tableplot.v1.plotly :as plotly]
            [quil.core :as q]
            [quil.middleware :as m]
            [libpython-clj2.require :refer (require-python)]
@@ -40,7 +42,7 @@
;;
;; In order to use PPO with a simulation environment in Clojure, and also to get a better understanding of PPO, I decided to implement PPO in Clojure.
;;
;; ## Pendulum Environment
;;
;; ![screenshot of pendulum environment](pendulum.png)
;;
@@ -82,7 +84,7 @@
;; Here a pendulum is initialised pointing down and with an angular velocity of 0.5.
(setup (/ PI 2) 0.5)

;; ### State Updates
;;
;; The angular acceleration due to gravitation is implemented as follows.
(defn pendulum-gravity
@@ -192,7 +194,7 @@
         (* velocity-weight (sqr velocity))
         (* control-weight (sqr control)))))

;; ### Environment Protocol
;;
;; Finally we are able to implement the pendulum as a generic environment.
(defrecord Pendulum [config state]
@@ -269,14 +271,14 @@

;; ![manually controlled pendulum](manual.gif)

;; ## Neural Networks
;;
;; PPO is a machine learning technique using backpropagation to learn the parameters of two neural networks.
;;
;; * The **actor** network takes an observation as an input and outputs the parameters of a probability distribution for sampling the next action to take.
;; * The **critic** network takes an observation as an input and outputs the expected cumulative reward for the current state.
;;
;; ### Import Pytorch
;;
;; For implementing the neural networks and backpropagation, I am using the Python-Clojure bridge [libpython-clj2](https://github.com/clj-python/libpython-clj) and [Pytorch](https://pytorch.org/).
;; The Pytorch library is quite comprehensive, is free software, and you can find a lot of documentation on how to use it.
@@ -342,3 +344,249 @@
                '[torch.nn.functional :as F]
                '[torch.optim :as optim]
                '[torch.distributions :refer (Beta)])

;; ### Tensor Conversion
;;
;; First we implement a few functions for converting nested Clojure vectors to Pytorch tensors and back.
;;
;; #### Clojure to Pytorch
;;
;; The function `tensor` converts a Clojure datatype to a Pytorch tensor.
(defn tensor
  "Convert nested vector to tensor"
  ([data]
   (tensor data torch/float32))
  ([data dtype]
   (torch/tensor data :dtype dtype)))

(tensor PI)
(tensor [2.0 3.0 5.0])
(tensor [[1.0 2.0] [3.0 4.0] [5.0 6.0]])
(tensor [1 2 3] torch/long)

;; #### Pytorch to Clojure
;;
;; The next function converts a Pytorch tensor back to a Clojure datatype.
(defn tolist
  "Convert tensor to nested vector"
  [tensor]
  (py/->jvm (py. tensor tolist)))

(tolist (tensor [2.0 3.0 5.0]))
(tolist (tensor [[1.0 2.0] [3.0 4.0] [5.0 6.0]]))

;; #### Pytorch Scalar to Clojure
;;
;; A tensor with no dimensions can also be converted using `toitem`.
(defn toitem
  "Convert torch scalar value to float"
  [tensor]
  (py. tensor item))

(toitem (tensor PI))

;; ### Critic Network
;;
;; The critic network is a fully connected neural network with an input layer of size `observation-size` and two hidden layers of size `hidden-units` with `tanh` activation functions.
;; The critic output is a single value (an estimate of the expected cumulative return achievable from the given observed state).
(def Critic
  (py/create-class
   "Critic" [nn/Module]
   {"__init__"
    (py/make-instance-fn
     (fn [self observation-size hidden-units]
       (py. nn/Module __init__ self)
       (py/set-attrs!
        self
        {"fc1" (nn/Linear observation-size hidden-units)
         "fc2" (nn/Linear hidden-units hidden-units)
         "fc3" (nn/Linear hidden-units 1)})
       nil))
    "forward"
    (py/make-instance-fn
     (fn [self x]
       (let [x (py. self fc1 x)
             x (torch/tanh x)
             x (py. self fc2 x)
             x (torch/tanh x)
             x (py. self fc3 x)]
         (torch/squeeze x -1))))}))

;; When running inference, you need to run the network with gradient tracking disabled, otherwise gradients get accumulated and can leak into a subsequent training step.
;; In Python this looks as follows.
;;
;; ```Python
;; with torch.no_grad():
;;     ...
;; ```
;;
;; Here we create a Clojure macro to do the same job.
(defmacro without-gradient
  "Execute body without gradient calculation"
  [& body]
  `(let [no-grad# (torch/no_grad)]
     (try
       (py. no-grad# ~'__enter__)
       ~@body
       (finally
         (py. no-grad# ~'__exit__ nil nil nil)))))

;; Now we can create a network and try it out.
;; Note that the network creates non-zero outputs because Pytorch performs random initialisation of the weights for us.
(def critic (Critic 3 64))
(without-gradient
  (toitem (critic (tensor [-1 0 0]))))

;; We can also create a wrapper for using the neural network with Clojure datatypes.
(defn critic-observation
  "Use critic with Clojure datatypes"
  [critic]
  (fn [observation]
    (without-gradient (toitem (critic (tensor observation))))))

;; Here is the output of the network for the observation `[-1 0 0]`.
((critic-observation critic) [-1 0 0])

;; ### Training
;;
;; Training a neural network is done by defining a loss function.
;; The loss of the network is then calculated for a mini-batch of training data.
;; One can then use Pytorch's backpropagation to compute the gradient of the loss value with respect to every single parameter of the network.
;; The gradient is then used to perform gradient descent steps.
;; A popular gradient descent method is the [Adam optimizer](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam).

;; Here is a wrapper for the Adam optimizer.
(defn adam-optimizer
  "Adam optimizer"
  [model learning-rate weight-decay]
  (optim/Adam (py. model parameters) :lr learning-rate :weight_decay weight-decay))

;; Pytorch also provides the mean square error (MSE) loss function.
(defn mse-loss
  "Mean square error cost function"
  []
  (nn/MSELoss))
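
;; As a quick sanity check (this example is illustrative and not part of the original code), the MSE of a prediction of 0.5 against a target of 1.0 is $(0.5 - 1.0)^2 = 0.25$.
(toitem ((mse-loss) (tensor [0.5]) (tensor [1.0])))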

;; A training step can be performed as follows.
;; Here we only use a single mini-batch with a single observation and an expected output of 1.0.
(def optimizer (adam-optimizer critic 0.001 0.0))
(def criterion (mse-loss))
(def mini-batch [(tensor [[-1 0 0]]) (tensor [1.0])])
(let [prediction (critic (first mini-batch))
      loss (criterion prediction (second mini-batch))]
  (py. optimizer zero_grad)
  (py. loss backward)
  (py. optimizer step))

;; As you can see, the output of the network for the observation `[-1 0 0]` is now closer to 1.0.
((critic-observation critic) [-1 0 0])
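
;; A minimal illustration (not part of the original code): repeating the same gradient descent step drives the prediction still closer to the target of 1.0.
(dotimes [_ 100]
  (let [prediction (critic (first mini-batch))
        loss (criterion prediction (second mini-batch))]
    (py. optimizer zero_grad)
    (py. loss backward)
    (py. optimizer step)))
((critic-observation critic) [-1 0 0])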

;; ### Actor Network
;;
;; The actor network for PPO takes an observation as an input and outputs the parameters of a probability distribution over actions.
;; In addition to the forward pass, the actor network has a method `deterministic_act` to choose the expectation value of the distribution as a deterministic action.
(def Actor
  (py/create-class
   "Actor" [nn/Module]
   {"__init__"
    (py/make-instance-fn
     (fn [self observation-size hidden-units action-size]
       (py. nn/Module __init__ self)
       (py/set-attrs!
        self
        {"fc1" (nn/Linear observation-size hidden-units)
         "fc2" (nn/Linear hidden-units hidden-units)
         "fcalpha" (nn/Linear hidden-units action-size)
         "fcbeta" (nn/Linear hidden-units action-size)})
       nil))
    "forward"
    (py/make-instance-fn
     (fn [self x]
       (let [x (py. self fc1 x)
             x (torch/tanh x)
             x (py. self fc2 x)
             x (torch/tanh x)
             alpha (torch/add 1.0 (F/softplus (py. self fcalpha x)))
             beta (torch/add 1.0 (F/softplus (py. self fcbeta x)))]
         [alpha beta])))
    "deterministic_act"
    (py/make-instance-fn
     (fn [self x]
       (let [[alpha beta] (py. self forward x)]
         (torch/div alpha (torch/add alpha beta)))))
    "get_dist"
    (py/make-instance-fn
     (fn [self x]
       (let [[alpha beta] (py. self forward x)]
         (Beta alpha beta))))}))

;; Furthermore, the actor network has a method `get_dist` to return a [Torch distribution](https://docs.pytorch.org/docs/stable/distributions.html) object which can be used to sample a random action or query the current log-probability of an action.
;; Here (as is the default in XinJingHao's PPO implementation) we use the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) with parameters `alpha` and `beta` both greater than 1.0.
;; See [here](https://mathlets.org/mathlets/beta-distribution/) for an interactive visualization.
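
;; As an illustrative check (not part of the original code): the expectation value of a Beta distribution is $\alpha / (\alpha + \beta)$, so `alpha` = 2.0 and `beta` = 3.0 give 0.4.
(let [dist (Beta (tensor 2.0) (tensor 3.0))]
  (toitem (py/get-attr dist "mean")))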

(defn indeterministic-act
  "Sample a random action from the distribution returned by the actor network"
  [actor]
  (fn indeterministic-act-with-actor [observation]
    (without-gradient
      (let [dist (py. actor get_dist (tensor observation))
            sample (py. dist sample)
            action (torch/clamp sample 0.0 1.0)
            logprob (py. dist log_prob action)]
        {:action (tolist action) :logprob (tolist logprob)}))))

(def actor (Actor 3 64 1))

;; One can then use the network to:
;;
;; a) get the parameters of the distribution for a given observation.
(without-gradient (actor (tensor [-1 0 0])))

;; b) choose the expectation value of the distribution as an action.
(without-gradient (py. actor deterministic_act (tensor [-1 0 0])))

;; c) sample a random action from the distribution and get the associated log-probability.
((indeterministic-act actor) [-1 0 0])

;; We can also query the current log-probability of a previously sampled action.
(defn logprob-of-action
  "Get log probability of action"
  [actor]
  (fn [observation action]
    (let [dist (py. actor get_dist observation)]
      (py. dist log_prob action))))

;; Here is a plot of the probability density function (PDF) the actor outputs for a single observation.
(without-gradient
  (let [actions (range 0.0 1.01 0.01)
        scatter (tc/dataset {:x actions
                             :y (map (fn [action]
                                       (exp (first (tolist ((logprob-of-action actor) (tensor [-1 0 0]) (tensor [action]))))))
                                     actions)})]
    (-> scatter
        (plotly/base {:=title "Actor output for a single observation" :=mode :lines})
        (plotly/layer-point {:=x :x :=y :y}))))

;; Finally, we can also query the entropy of the distribution.
;; By incorporating the entropy into the loss function later on, we can encourage exploration and prevent the probability density function from collapsing.
(defn entropy-of-distribution
  "Get entropy of distribution"
  [actor observation]
  (let [dist (py. actor get_dist observation)]
    (py. dist entropy)))

(without-gradient (entropy-of-distribution actor (tensor [-1 0 0])))
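
;; ### Advantage Estimates
;;
;; PPO judges the actions of a recorded episode by estimating their advantage $\hat{A}_t$: how much better the discounted rewards $r_t, r_{t+1}, \ldots$ (with discount factor $\gamma$) turned out compared to the critic's value estimate $V(s_t)$.
;; Starting from the end of an episode truncated after time step $T$:
;;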
;; $\hat{A}_{T-1} = -V(s_{T-1}) + r_{T-1} + \gamma V(s_T)$
;;
;; $\hat{A}_{T-2} = -V(s_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(s_T)$
;;
;; $\vdots$
;;
;; $\hat{A}_0 = -V(s_0) + r_0 + \gamma r_1 + \ldots + \gamma^T V(s_T)$
;;
;; In general, for any time step $t$ of the episode:
;;
;; $\hat{A}_t = -V(s_t) + r_t + \gamma r_{t+1} + \ldots + \gamma^{T-t-1} r_{T-1} + \gamma^{T-t} V(s_T)$
;;
;; Generalised advantage estimation additionally discounts the temporal difference errors $\delta_t$ with a factor $\lambda$ (for $\lambda = 1$ the sum telescopes back to the formula above):
;;
;; $\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \delta_{t+l}$
;;
;; $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$
;;
;; $\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \left( r_{t+l} + \gamma V(s_{t+l+1}) - V(s_{t+l}) \right)$
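
;; As a minimal sketch (the function `advantages` below is illustrative and not part of the original code), the same estimates follow from the recursion $\hat{A}_t = \delta_t + \gamma \lambda \hat{A}_{t+1}$, which computes all advantages of an episode in a single backward pass.
(defn advantages
  "Compute truncated GAE advantage estimates from rewards and critic values (illustrative sketch)"
  [gamma lambda rewards values bootstrap]
  ;; temporal difference errors delta_t = r_t + gamma V(s_t+1) - V(s_t),
  ;; using `bootstrap` as the critic's estimate for the final state V(s_T)
  (let [next-values (concat (rest values) [bootstrap])
        deltas (map (fn [r v v'] (+ r (* gamma v') (- v))) rewards values next-values)]
    ;; accumulate A_t = delta_t + gamma lambda A_t+1 from the back of the episode
    (reverse
     (reductions (fn [adv delta] (+ delta (* gamma lambda adv)))
                 (last deltas)
                 (rest (reverse deltas))))))

;; For example, with a constant reward of 1.0 and a critic predicting zero everywhere:
(advantages 0.99 0.95 [1.0 1.0 1.0] [0.0 0.0 0.0] 0.0)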
