Skip to content

Commit 115b4a7

Browse files
committed
Show random distribution of actor network
1 parent 1756dd9 commit 115b4a7

1 file changed

Lines changed: 84 additions & 6 deletions

File tree

src/ppo/main.clj

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
:tags [:physics :machine-learning :optimization :ppo :control]}}}
1212

1313
(ns ppo.main
14-
(:require [clojure.math :refer (PI cos sin to-radians)]
14+
(:require [clojure.math :refer (PI cos sin exp to-radians)]
1515
[clojure.core.async :as async]
16+
[tablecloth.api :as tc]
17+
[scicloj.tableplot.v1.plotly :as plotly]
1618
[quil.core :as q]
1719
[quil.middleware :as m]
1820
[libpython-clj2.require :refer (require-python)]
@@ -442,6 +444,7 @@
442444
(fn [observation]
443445
(without-gradient (toitem (critic (tensor observation))))))
444446

447+
;; Here is the output of the network for the observation `[-1 0 0]`.
445448
((critic-observation critic) [-1 0 0])
446449

447450
;; ### Training
@@ -465,18 +468,93 @@
465468
(nn/MSELoss))
466469

467470
;; A training step can be performed as follows.
;; Here we only use a single mini-batch with a single observation and an expected output of 1.0.
(def optimizer (adam-optimizer critic 0.001 0.0))
(def criterion (mse-loss))
(def mini-batch [(tensor [[-1 0 0]]) (tensor [1.0])])
;; One optimization step: forward pass, loss, backprop, parameter update.
(let [[inputs targets] mini-batch
      batch-loss (criterion (critic inputs) targets)]
  (py. optimizer zero_grad)
  (py. batch-loss backward)
  (py. optimizer step))
476480

477481
;; As you can see, the output of the network for the observation `[-1 0 0]` is now closer to 1.0.
478482
((critic-observation critic) [-1 0 0])
479483

484+
;; ### Actor Network
485+
;;
486+
;; The actor network for PPO takes an observation as an input and it outputs the parameters of a probability distribution over actions.
487+
;; In addition to the forward pass, the actor network has a method `deterministic_act` to choose the expectation value of the distribution as a deterministic action.
488+
;; Actor network class: maps an observation to the (alpha, beta) parameters of a
;; Beta distribution over actions. Built as a Python class deriving from nn/Module
;; so it integrates with Torch optimizers.
(def Actor
  (py/create-class
   "Actor" [nn/Module]
   {"__init__"
    (py/make-instance-fn
     (fn [self observation-size hidden-units action-size]
       (py. nn/Module __init__ self)
       ;; Two shared hidden layers, then separate linear heads for alpha and beta.
       (py/set-attrs!
        self
        {"fc1" (nn/Linear observation-size hidden-units)
         "fc2" (nn/Linear hidden-units hidden-units)
         "fcalpha" (nn/Linear hidden-units action-size)
         "fcbeta" (nn/Linear hidden-units action-size)})
       nil))
    "forward"
    (py/make-instance-fn
     (fn [self x]
       ;; Forward pass returning [alpha beta] tensors.
       (let [x (py. self fc1 x)
             x (torch/tanh x)
             x (py. self fc2 x)
             x (torch/tanh x)
             ;; softplus keeps the head outputs positive; adding 1.0 makes
             ;; alpha and beta strictly greater than 1.0.
             alpha (torch/add 1.0 (F/softplus (py. self fcalpha x)))
             beta (torch/add 1.0 (F/softplus (py. self fcbeta x)))]
         [alpha beta])))
    "deterministic_act"
    (py/make-instance-fn
     (fn [self x]
       ;; Deterministic action: the expectation of Beta(alpha, beta),
       ;; i.e. alpha / (alpha + beta).
       (let [[alpha beta] (py. self forward x)]
         (torch/div alpha (torch/add alpha beta)))))
    "get_dist"
    (py/make-instance-fn
     (fn [self x]
       ;; Torch Beta distribution object for sampling and log_prob queries.
       (let [[alpha beta] (py. self forward x)]
         (Beta alpha beta))))}))
522+
523+
;; Furthermore the actor network has a method `get_dist` to return a [Torch distribution](https://docs.pytorch.org/docs/stable/distributions.html) object which can be used to sample a random action or query the current log-probability of an action.
524+
;; Here (as the default in XinJingHao's PPO implementation) we use the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) with parameters `alpha` and `beta` both greater than 1.0.
525+
;; See [here](https://mathlets.org/mathlets/beta-distribution/) for an interactive visualization.
526+
(defn indeterministic-act
  "Return a function which, given an observation, samples a random action from
  the actor's distribution and reports it with its log-probability."
  [actor]
  (fn [observation]
    (without-gradient
      (let [dist (py. actor get_dist (tensor observation))
            drawn (py. dist sample)
            ;; Keep the sampled action inside the valid action range.
            clipped (torch/clamp drawn 0.0 1.0)]
        {:action (tolist clipped)
         :logprob (tolist (py. dist log_prob clipped))}))))
536+
537+
;; Instantiate an actor for 3-dimensional observations, 64 hidden units,
;; and a 1-dimensional action.
(def actor (Actor 3 64 1))
;; One can then use the network to:
;;
;; a) get the parameters of the distribution for a given observation.
(without-gradient (actor (tensor [-1 0 0])))

;; b) choose the expectation value of the distribution as an action.
(without-gradient (py. actor deterministic_act (tensor [-1 0 0])))

;; c) sample a random action from the distribution and get the associated log-probability.
((indeterministic-act actor) [-1 0 0])
548+
549+
550+
;; Visualize 256 random actions for one fixed observation: x is the sampled
;; action, y is the probability density (exp of the log-probability).
(let [samples (repeatedly 256 #((indeterministic-act actor) [-1 0 0]))
      points (tc/dataset {:x (map (comp first :action) samples)
                          :y (map (comp exp first :logprob) samples)})]
  (-> points
      (plotly/base {:=title "Actor output for a single observation"})
      (plotly/layer-point {:=x :x :=y :y})))
556+
557+
480558
;; # TODO
481559
;;
482560
;; * neural networks

0 commit comments

Comments
 (0)