|
11 | 11 | :tags [:physics :machine-learning :optimization :ppo :control]}}} |
12 | 12 |
|
13 | 13 | (ns ppo.main |
14 | | - (:require [clojure.math :refer (PI cos sin to-radians)] |
| 14 | + (:require [clojure.math :refer (PI cos sin exp to-radians)] |
15 | 15 | [clojure.core.async :as async] |
| 16 | + [tablecloth.api :as tc] |
| 17 | + [scicloj.tableplot.v1.plotly :as plotly] |
16 | 18 | [quil.core :as q] |
17 | 19 | [quil.middleware :as m] |
18 | 20 | [libpython-clj2.require :refer (require-python)] |
|
442 | 444 | (fn [observation] |
443 | 445 | (without-gradient (toitem (critic (tensor observation)))))) |
444 | 446 |
|
| 447 | +;; Here is the output of the network for the observation `[-1 0 0]`. |
445 | 448 | ((critic-observation critic) [-1 0 0]) |
446 | 449 |
|
447 | 450 | ;; ### Training |
|
465 | 468 | (nn/MSELoss)) |
466 | 469 |
|
467 | 470 | ;; A training step can be performed as follows. |
| 471 | +;; Here we use a single mini-batch containing one observation, with a target output of 1.0.
468 | 472 | (def optimizer (adam-optimizer critic 0.001 0.0)) |
469 | 473 | (def criterion (mse-loss)) |
470 | 474 | (def mini-batch [(tensor [[-1 0 0]]) (tensor [1.0])]) |
471 | | -(def prediction (critic (first mini-batch))) |
472 | | -(def loss (criterion prediction (second mini-batch))) |
473 | | -(py. optimizer zero_grad) |
474 | | -(py. loss backward) |
475 | | -(py. optimizer step) |
| 475 | +(let [prediction (critic (first mini-batch)) |
| 476 | + loss (criterion prediction (second mini-batch))] |
| 477 | + (py. optimizer zero_grad) |
| 478 | + (py. loss backward) |
| 479 | + (py. optimizer step)) |
476 | 480 |
|
477 | 481 | ;; As you can see, the output of the network for the observation `[-1 0 0]` is now closer to 1.0. |
478 | 482 | ((critic-observation critic) [-1 0 0]) |
479 | 483 |
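| | +;; A minimal sketch (not from the original notebook, reusing the `optimizer`, `criterion`, and `mini-batch` defined above): repeating the training step, e.g. 100 times, should drive the prediction still closer to the target of 1.0.
| | +(dotimes [_ 100]
| | +  (let [prediction (critic (first mini-batch))
| | +        loss (criterion prediction (second mini-batch))]
| | +    (py. optimizer zero_grad)
| | +    (py. loss backward)
| | +    (py. optimizer step)))
| | +((critic-observation critic) [-1 0 0])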
|
| 484 | +;; ### Actor Network |
| 485 | +;; |
| 486 | +;; The actor network for PPO takes an observation as input and outputs the parameters of a probability distribution over actions.
| 487 | +;; In addition to the forward pass, the actor network has a method `deterministic_act` which chooses the expectation value of the distribution (for a Beta distribution, `alpha / (alpha + beta)`) as a deterministic action.
| 488 | +(def Actor |
| 489 | + (py/create-class |
| 490 | + "Actor" [nn/Module] |
| 491 | + {"__init__" |
| 492 | + (py/make-instance-fn |
| 493 | + (fn [self observation-size hidden-units action-size] |
| 494 | + (py. nn/Module __init__ self) |
| 495 | + (py/set-attrs! |
| 496 | + self |
| 497 | + {"fc1" (nn/Linear observation-size hidden-units) |
| 498 | + "fc2" (nn/Linear hidden-units hidden-units) |
| 499 | + "fcalpha" (nn/Linear hidden-units action-size) |
| 500 | + "fcbeta" (nn/Linear hidden-units action-size)}) |
| 501 | + nil)) |
| 502 | + "forward" |
| 503 | + (py/make-instance-fn |
| 504 | + (fn [self x] |
| 505 | + (let [x (py. self fc1 x) |
| 506 | + x (torch/tanh x) |
| 507 | + x (py. self fc2 x) |
| 508 | + x (torch/tanh x) |
| 509 | + alpha (torch/add 1.0 (F/softplus (py. self fcalpha x))) |
| 510 | + beta (torch/add 1.0 (F/softplus (py. self fcbeta x)))] |
| 511 | + [alpha beta]))) |
| 512 | + "deterministic_act" |
| 513 | + (py/make-instance-fn |
| 514 | + (fn [self x] |
| 515 | + (let [[alpha beta] (py. self forward x)] |
| 516 | + (torch/div alpha (torch/add alpha beta))))) |
| 517 | + "get_dist" |
| 518 | + (py/make-instance-fn |
| 519 | + (fn [self x] |
| 520 | + (let [[alpha beta] (py. self forward x)] |
| 521 | + (Beta alpha beta))))})) |
| 522 | + |
| 523 | +;; Furthermore, the actor network has a method `get_dist` that returns a [Torch distribution](https://docs.pytorch.org/docs/stable/distributions.html) object, which can be used to sample a random action or to query the log-probability of an action.
| 524 | +;; Here (as is the default in XinJingHao's PPO implementation) we use the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution); the `softplus` offset above keeps both parameters `alpha` and `beta` greater than 1.0, so that the density is unimodal.
| 525 | +;; See [here](https://mathlets.org/mathlets/beta-distribution/) for an interactive visualization.
| 526 | +(defn indeterministic-act |
| 527 | + "Sample action using actor network returning distribution" |
| 528 | + [actor] |
| 529 | + (fn indeterministic-act-with-actor [observation] |
| 530 | + (without-gradient |
| 531 | + (let [dist (py. actor get_dist (tensor observation)) |
| 532 | + sample (py. dist sample) |
| 533 | + action (torch/clamp sample 0.0 1.0) |
| 534 | + logprob (py. dist log_prob action)] |
| 535 | + {:action (tolist action) :logprob (tolist logprob)})))) |
| 536 | + |
| 537 | +(def actor (Actor 3 64 1)) |
| 538 | +;; One can then use the network to: |
| 539 | +;; |
| 540 | +;; a) get the parameters of the distribution for a given observation. |
| 541 | +(without-gradient (actor (tensor [-1 0 0]))) |
| 542 | + |
| 543 | +;; b) choose the expectation value of the distribution as an action. |
| 544 | +(without-gradient (py. actor deterministic_act (tensor [-1 0 0]))) |
| 545 | + |
| 546 | +;; c) sample a random action from the distribution and get the associated log-probability. |
| 547 | +((indeterministic-act actor) [-1 0 0]) |
| 548 | + |
| 549 | + |
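| | +;; To visualize the distribution, we sample 256 actions for the fixed observation `[-1 0 0]` and plot each action against its probability density (the exponential of its log-probability).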
| 550 | +(let [samples (repeatedly 256 #((indeterministic-act actor) [-1 0 0])) |
| 551 | + scatter (tc/dataset {:x (map (fn [sample] (first (:action sample))) samples) |
| 552 | + :y (map (fn [sample] (exp (first (:logprob sample)))) samples)})] |
| 553 | + (-> scatter |
| 554 | + (plotly/base {:=title "Actor output for a single observation"}) |
| 555 | + (plotly/layer-point {:=x :x :=y :y}))) |
| 556 | + |
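| | +;; As a rough sanity check (a sketch, not part of the original notebook), the empirical mean of many sampled actions should approximate the deterministic action `alpha / (alpha + beta)` from above:
| | +(let [samples (repeatedly 1024 #((indeterministic-act actor) [-1 0 0]))
| | +      actions (map (comp first :action) samples)]
| | +  (/ (reduce + actions) (count actions)))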
| 557 | + |
480 | 558 | ;; # TODO |
481 | 559 | ;; |
482 | 560 | ;; * neural networks |
|