|
        :tags [:physics :machine-learning :optimization :ppo :control]}}}

(ns ppo.main
  (:require [clojure.math :refer (PI cos sin exp to-radians)]
            [clojure.core.async :as async]
            [tablecloth.api :as tc]
            [scicloj.tableplot.v1.plotly :as plotly]
            [quil.core :as q]
            [quil.middleware :as m]
            [libpython-clj2.require :refer (require-python)]
|
;;
;; In order to use PPO with a simulation environment in Clojure, and to get a better understanding of the algorithm, I decided to implement PPO in Clojure.
;;
;; ## Pendulum Environment
;;
;; 
;;
|
;; Here a pendulum is initialised pointing down with an angular velocity of 0.5.
(setup (/ PI 2) 0.5)

|
;; ### State Updates
;;
;; The angular acceleration due to gravity is implemented as follows.
(defn pendulum-gravity
|
     (* velocity-weight (sqr velocity))
     (* control-weight (sqr control)))))

|
;; ### Environment Protocol
;;
;; Finally, we can implement the pendulum as a generic environment.
(defrecord Pendulum [config state]
|

;; 

|
;; ## Neural Networks
;;
;; PPO is a reinforcement learning technique using backpropagation to learn the parameters of two neural networks.
;;
;; * The **actor** network takes an observation as an input and outputs the parameters of a probability distribution for sampling the next action to take.
;; * The **critic** network takes an observation as an input and outputs the expected cumulative reward for the current state.
;;
;; ### Import Pytorch
;;
;; For implementing the neural networks and backpropagation, I am using the Python-Clojure bridge [libpython-clj2](https://github.com/clj-python/libpython-clj) and [Pytorch](https://pytorch.org/).
;; The Pytorch library is quite comprehensive, free software, and well documented.
|
                '[torch.nn.functional :as F]
                '[torch.optim :as optim]
                '[torch.distributions :refer (Beta)])

;; ### Tensor Conversion
;;
;; First we implement a few functions for converting nested Clojure vectors to Pytorch tensors and back.
;;
;; #### Clojure to Pytorch
;;
;; The function `tensor` converts a Clojure datatype to a Pytorch tensor.
(defn tensor
  "Convert nested vector to tensor"
  ([data]
   (tensor data torch/float32))
  ([data dtype]
   (torch/tensor data :dtype dtype)))

(tensor PI)
(tensor [2.0 3.0 5.0])
(tensor [[1.0 2.0] [3.0 4.0] [5.0 6.0]])
(tensor [1 2 3] torch/long)

;; #### Pytorch to Clojure
;;
;; The next function converts a Pytorch tensor back to a Clojure datatype.
(defn tolist
  "Convert tensor to nested vector"
  [tensor]
  (py/->jvm (py. tensor tolist)))

(tolist (tensor [2.0 3.0 5.0]))
(tolist (tensor [[1.0 2.0] [3.0 4.0] [5.0 6.0]]))

;; #### Pytorch Scalar to Clojure
;;
;; A tensor with no dimensions can also be converted using `toitem`.
(defn toitem
  "Convert torch scalar value to float"
  [tensor]
  (py. tensor item))

(toitem (tensor PI))

;; ### Critic Network
;;
;; The critic network is a fully connected neural network with an input layer of size `observation-size` and two hidden layers of size `hidden-units` with `tanh` activation functions.
;; The critic output is a single value (an estimate of the expected cumulative return achievable from the given observed state).
(def Critic
  (py/create-class
   "Critic" [nn/Module]
   {"__init__"
    (py/make-instance-fn
     (fn [self observation-size hidden-units]
       (py. nn/Module __init__ self)
       (py/set-attrs!
        self
        {"fc1" (nn/Linear observation-size hidden-units)
         "fc2" (nn/Linear hidden-units hidden-units)
         "fc3" (nn/Linear hidden-units 1)})
       nil))
    "forward"
    (py/make-instance-fn
     (fn [self x]
       (let [x (py. self fc1 x)
             x (torch/tanh x)
             x (py. self fc2 x)
             x (torch/tanh x)
             x (py. self fc3 x)]
         (torch/squeeze x -1))))}))

;; When running inference, you should run the network with gradient tracking disabled; otherwise Pytorch records the operations for backpropagation, which wastes memory and can interfere with a subsequent training step.
;; In Python this looks as follows.
;;
;; ```Python
;; with torch.no_grad():
;;     ...
;; ```
;;
;; Here we create a Clojure macro to do the same job.
(defmacro without-gradient
  "Execute body without gradient calculation"
  [& body]
  `(let [no-grad# (torch/no_grad)]
     (try
       (py. no-grad# ~'__enter__)
       ~@body
       (finally
         (py. no-grad# ~'__exit__ nil nil nil)))))

;; Now we can create a network and try it out.
;; Note that the network produces a non-zero output because Pytorch performs random initialisation of the weights for us.
(def critic (Critic 3 64))
(without-gradient
  (toitem (critic (tensor [-1 0 0]))))

;; We can also create a wrapper for using the neural network with Clojure datatypes.
(defn critic-observation
  "Use critic with Clojure datatypes"
  [critic]
  (fn [observation]
    (without-gradient (toitem (critic (tensor observation))))))

;; Here is the output of the network for the observation `[-1 0 0]`.
((critic-observation critic) [-1 0 0])

;; ### Training
;;
;; Training a neural network starts with defining a loss function.
;; The loss of the network is then calculated for a mini-batch of training data.
;; One can then use Pytorch's backpropagation to compute the gradient of the loss value with respect to every parameter of the network.
;; The gradient is then used to perform gradient descent steps.
;; A popular gradient descent method is the [Adam optimizer](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam).

;; Here is a wrapper for the Adam optimizer.
(defn adam-optimizer
  "Adam optimizer"
  [model learning-rate weight-decay]
  (optim/Adam (py. model parameters) :lr learning-rate :weight_decay weight-decay))

;; Pytorch also provides the mean square error (MSE) loss function.
(defn mse-loss
  "Mean square error cost function"
  []
  (nn/MSELoss))

;; A training step can be performed as follows.
;; Here we only use a single mini-batch with a single observation and an expected output of 1.0.
(def optimizer (adam-optimizer critic 0.001 0.0))
(def criterion (mse-loss))
(def mini-batch [(tensor [[-1 0 0]]) (tensor [1.0])])
(let [prediction (critic (first mini-batch))
      loss (criterion prediction (second mini-batch))]
  (py. optimizer zero_grad)
  (py. loss backward)
  (py. optimizer step))

;; As you can see, the output of the network for the observation `[-1 0 0]` is now closer to 1.0.
((critic-observation critic) [-1 0 0])
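
;; To see the effect more clearly we can, purely as an illustration, repeat the same training step a number of times; the prediction then moves much closer to the target of 1.0.
(dotimes [_ 100]
  (let [prediction (critic (first mini-batch))
        loss (criterion prediction (second mini-batch))]
    (py. optimizer zero_grad)
    (py. loss backward)
    (py. optimizer step)))

((critic-observation critic) [-1 0 0])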

;; ### Actor Network
;;
;; The actor network for PPO takes an observation as input and outputs the parameters of a probability distribution over actions.
;; In addition to the forward pass, the actor network has a method `deterministic_act` that chooses the expectation value of the distribution as a deterministic action.
(def Actor
  (py/create-class
   "Actor" [nn/Module]
   {"__init__"
    (py/make-instance-fn
     (fn [self observation-size hidden-units action-size]
       (py. nn/Module __init__ self)
       (py/set-attrs!
        self
        {"fc1" (nn/Linear observation-size hidden-units)
         "fc2" (nn/Linear hidden-units hidden-units)
         "fcalpha" (nn/Linear hidden-units action-size)
         "fcbeta" (nn/Linear hidden-units action-size)})
       nil))
    "forward"
    (py/make-instance-fn
     (fn [self x]
       (let [x (py. self fc1 x)
             x (torch/tanh x)
             x (py. self fc2 x)
             x (torch/tanh x)
             alpha (torch/add 1.0 (F/softplus (py. self fcalpha x)))
             beta (torch/add 1.0 (F/softplus (py. self fcbeta x)))]
         [alpha beta])))
    "deterministic_act"
    (py/make-instance-fn
     (fn [self x]
       (let [[alpha beta] (py. self forward x)]
         (torch/div alpha (torch/add alpha beta)))))
    "get_dist"
    (py/make-instance-fn
     (fn [self x]
       (let [[alpha beta] (py. self forward x)]
         (Beta alpha beta))))}))

;; Furthermore, the actor network has a method `get_dist` which returns a [Torch distribution](https://docs.pytorch.org/docs/stable/distributions.html) object that can be used to sample a random action or to query the current log-probability of an action.
;; Here (as the default in XinJingHao's PPO implementation) we use the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) with parameters `alpha` and `beta` both greater than 1.0.
;; See [here](https://mathlets.org/mathlets/beta-distribution/) for an interactive visualization.
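
;; To get a feeling for the distribution itself, we can construct one directly; the parameters 2.0 and 5.0 below are arbitrary example values, not outputs of the actor. The mean of a Beta distribution is $\alpha / (\alpha + \beta)$, which is exactly what `deterministic_act` computes.
(let [dist (Beta (tensor 2.0) (tensor 5.0))]
  {:mean (toitem (py/get-attr dist "mean")) ; alpha / (alpha + beta) = 2/7
   :sample (toitem (py. dist sample))}) ; a random draw from the support [0, 1]

;; We can sample a random action from the actor's distribution and get the associated log-probability as follows.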
(defn indeterministic-act
  "Sample a random action from the actor's distribution"
  [actor]
  (fn indeterministic-act-with-actor [observation]
    (without-gradient
      (let [dist (py. actor get_dist (tensor observation))
            sample (py. dist sample)
            action (torch/clamp sample 0.0 1.0)
            logprob (py. dist log_prob action)]
        {:action (tolist action) :logprob (tolist logprob)}))))

(def actor (Actor 3 64 1))

;; One can then use the network to:
;;
;; a) get the parameters of the distribution for a given observation.
(without-gradient (actor (tensor [-1 0 0])))

;; b) choose the expectation value of the distribution as an action.
(without-gradient (py. actor deterministic_act (tensor [-1 0 0])))

;; c) sample a random action from the distribution and get the associated log-probability.
((indeterministic-act actor) [-1 0 0])

;; We can also query the current log-probability of a previously sampled action.
(defn logprob-of-action
  "Get log probability of action"
  [actor]
  (fn [observation action]
    (let [dist (py. actor get_dist observation)]
      (py. dist log_prob action))))

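;; For example, the probability density of an action value (here the illustrative choice 0.5) under the current, randomly initialised policy is obtained by exponentiating the log-probability.
(without-gradient
  (exp (first (tolist ((logprob-of-action actor) (tensor [-1 0 0]) (tensor [0.5]))))))
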
;; Here is a plot of the probability density function (PDF) that the actor outputs for a single observation.
(without-gradient
  (let [actions (range 0.0 1.01 0.01)
        scatter (tc/dataset {:x actions
                             :y (map (fn [action]
                                       (exp (first (tolist ((logprob-of-action actor) (tensor [-1 0 0]) (tensor [action]))))))
                                     actions)})]
    (-> scatter
        (plotly/base {:=title "Actor output for a single observation" :=mode :lines})
        (plotly/layer-point {:=x :x :=y :y}))))

;; Finally, we can also query the entropy of the distribution.
;; By incorporating the entropy into the loss function later on, we can encourage exploration and prevent the probability density function from collapsing.
(defn entropy-of-distribution
  "Get entropy of distribution"
  [actor observation]
  (let [dist (py. actor get_dist observation)]
    (py. dist entropy)))

(without-gradient (entropy-of-distribution actor (tensor [-1 0 0])))

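;; As a quick sanity check (with arbitrary illustrative parameters), a nearly flat Beta distribution has higher entropy than a sharply peaked one.
(mapv (fn [[a b]] (toitem (py. (Beta (tensor a) (tensor b)) entropy)))
      [[1.1 1.1] [50.0 50.0]])
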
;; ### Generalized Advantage Estimation
;;
;; The PPO loss makes use of advantage estimates, which can be computed from the observed rewards $r_t$ and the critic's value estimates $V(s_t)$.
;; For a rollout of length $T$, the Monte Carlo advantage estimates are:
;;
;; $\hat{A}_{T-1} = -V(s_{T-1}) + r_{T-1} + \gamma V(s_T)$
;;
;; $\hat{A}_{T-2} = -V(s_{T-2}) + r_{T-2} + \gamma r_{T-1} + \gamma^2 V(s_T)$
;;
;; $\vdots$
;;
;; $\hat{A}_0 = -V(s_0) + r_0 + \gamma r_1 + \ldots + \gamma^T V(s_T)$
;;
;; or, in general:
;;
;; $\hat{A}_t = -V(s_t) + r_t + \gamma r_{t+1} + \ldots + \gamma^{T-t-1} r_{T-1} + \gamma^{T-t} V(s_T)$
;;
;; Generalized advantage estimation (GAE) instead sums the temporal-difference residuals
;;
;; $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$
;;
;; discounted by an additional factor $\lambda$:
;;
;; $\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \delta_{t+l} = \sum_{l=0}^{T-t-1} (\gamma \lambda)^l \left( r_{t+l} + \gamma V(s_{t+l+1}) - V(s_{t+l}) \right)$
;;
;; For $\lambda = 1$ the sum telescopes and recovers the Monte Carlo estimate above.
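
;; The estimates can be computed recursively from the end of the rollout using $\hat{A}_{T-1} = \delta_{T-1}$ and $\hat{A}_t = \delta_t + \gamma \lambda \hat{A}_{t+1}$.
;; Here is a minimal sketch in plain Clojure; the function name `gae-advantages` and the sample inputs are illustrative and not part of the original implementation. It assumes `rewards` holds $r_0, \ldots, r_{T-1}$ and `values` holds the critic estimates $V(s_0), \ldots, V(s_T)$.
(defn gae-advantages
  "Compute generalized advantage estimates from rewards and critic values"
  [gamma lambda rewards values]
  (let [;; temporal-difference residuals: delta_t = r_t + gamma V(s_t+1) - V(s_t)
        deltas (mapv (fn [r v v'] (+ r (* gamma v') (- v)))
                     rewards values (rest values))]
    ;; accumulate from the end: A_t = delta_t + gamma lambda A_t+1
    (->> (reverse deltas)
         (reductions (fn [acc delta] (+ delta (* gamma lambda acc))))
         reverse
         vec)))

;; With discount factor 0.99, lambda 0.95, and a toy three-step rollout:
(gae-advantages 0.99 0.95 [1.0 0.0 1.0] [0.5 0.4 0.6 0.2])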