|
12 | 12 | (ns scicloj.data-visualization.violin |
13 | 13 | (:require |
14 | 14 | [clojure.data.json :as json] |
| 15 | + [tablecloth.api :as tc] |
15 | 16 | [scicloj.kindly.v4.api :as kindly] |
16 | 17 | [scicloj.kindly.v4.kind :as kind] |
17 | 18 | )) |
18 | 19 |
|
| 20 | +;;; I can't stand writing long text in ;;; comments, so import it from actual .md files |
| 21 | +^{:kind/md true :kindly/hide-code true} |
| 22 | +(slurp "src/data_visualization/violin1.md") |
| 23 | + |
| 24 | +;;; # Get some data |
| 25 | + |
| 26 | +;;; ## Movie Dataset |
19 | 27 |
|
20 | 28 | ;;; Get the movie dataset |
21 | 29 | (def movie-data |
22 | | - (json/read-str (slurp "https://vega.github.io/editor/data/movies.json") )) |
| 30 | + (json/read-str (slurp "https://vega.github.io/editor/data/movies.json") )) |
23 | 31 |
|
| 32 | +;;; Here we'll take a look at a sample of the data (selected columns and just a few rows). |
| 33 | +(kind/table |
| 34 | + {:row-maps (take 10 movie-data) |
| 35 | + :column-names ["Title" "IMDB Rating" "US Gross" "Distributor" "Production Budget" "MPAA Rating" "Major Genre"]}) |
24 | 36 |
|
25 | | -;;; Make a simple boxplot showing US Gross by Genre |
26 | | -(kind/vega-lite |
27 | | - {:mark {:type "boxplot" :tooltip {:content "data"}}, |
28 | | - :data {:values movie-data} |
| 37 | + |
| 38 | +;;; ## Penguin dataset |
| 39 | + |
| 40 | +(def penguin-data-url "https://raw.githubusercontent.com/ttimbers/palmerpenguins/refs/heads/file-variants/inst/extdata/penguins.tsv") |
| 41 | + |
| 42 | +(def penguin-data |
| 43 | + (tc/dataset penguin-data-url {:key-fn keyword})) |
| 44 | + |
| 45 | +(kind/table (tc/random penguin-data 10)) |
| 46 | + |
| 47 | +;;; # Boxplot |
| 48 | + |
| 49 | + |
| 50 | +;;; A basic boxplot shows the distribution of a single variable. Here we look at the distribution of US gross profits: |
| 51 | +^:kind/vega-lite |
| 52 | + {:mark {:type "boxplot"} |
| 53 | + :data {:values movie-data} |
29 | 54 | :encoding |
30 | | - {"color" {:field "Major Genre", :type "nominal" :legend false}, |
31 | | - "y" {:field "Major Genre", |
32 | | - :type "nominal"}, |
33 | | - "x" {:field "US Gross", |
| 55 | + {"x" {:field "US Gross" |
34 | 56 | :type "quantitative"} |
35 | | - :tooltip {:field "Title" |
36 | | - ;; :type "quantitative" |
37 | | - }}, |
| 57 | + :tooltip {:field "Title"}} |
38 | 58 | :width 800 |
39 | | - }) |
| 59 | + } |
| 60 | + |
| 61 | + |
| 62 | + |
| 63 | + |
| 64 | + |
| 65 | + |
| 66 | +;;; But boxplots are more useful when you compare distributions given a second variable. Here we see the different distributions for specific genres of movie. |
| 67 | + |
| 68 | +^:kind/vega-lite |
| 69 | + {:mark {:type "boxplot"} |
| 70 | + :data {:values movie-data} |
| 71 | + :encoding |
| 72 | + {"x" {:field "US Gross" |
| 73 | + :type "quantitative"} |
| 74 | + "y" {:field "Major Genre" |
| 75 | + :type "nominal"} |
| 76 | + "color" {:field "Major Genre" :type "nominal" :legend false} |
| 77 | + :tooltip {:field "Title"}} |
| 78 | + :width 800 |
| 79 | + } |
| 80 | + |
| 81 | +;;; This shows us the median (white line) |
| 82 | + |
| 83 | +;;; # Violins |
40 | 84 |
|
41 | 85 |
|
42 | | -;; That's nice, but how can we add violins to this? |
43 | 86 | (kind/vega-lite |
44 | | - {:mark {:type "area"}, |
| 87 | + {:mark {:type "area"} |
45 | 88 | :data {:values movie-data} |
46 | 89 | :transform [{:density "US Gross" |
47 | 90 | :groupby ["Major Genre"] |
48 | | - :extent [0, 200000000]}] |
| 91 | + :extent [0 200000000]}] |
| 92 | + :height 50 ;this is the height of each row (facet) |
49 | 93 | :encoding |
50 | | - {"color" {:field "Major Genre", :type "nominal" :legend false}, |
51 | | - "y" {:field "density", |
| 94 | + {"color" {:field "Major Genre" :type "nominal" :legend false} |
| 95 | + "y" {:field "density" |
52 | 96 | :type "quantitative" |
53 | 97 | :stack "center" |
54 | 98 | :axis false |
55 | | - }, ;this reflect-doubles the area plot to produce the violin shape |
56 | | - "x" {:field "value", |
| 99 | + } ;this reflect-doubles the area plot to produce the violin shape |
| 100 | + "x" {:field "value" |
57 | 101 | :type "quantitative"} |
58 | | - "facet" {:field "Major Genre" |
59 | | - :type "nominal" |
60 | | - :columns 1 |
61 | | - :spacing 0 |
62 | | - :legend :left |
63 | | - } |
64 | | - }, |
| 102 | + "row" {:field "Major Genre" |
| 103 | + :type "nominal" |
| 104 | + :columns 1 |
| 105 | + :spacing 0 |
| 106 | + :header {:labelAngle 0 :labelAlign "left"} |
| 107 | + } |
| 108 | + } |
65 | 109 | :width 800 |
66 | 110 | }) |
67 | 111 |
|
68 | 112 |
|
| 113 | +(defn violin-plot |
| 114 | + [data value-field group-field min max] |
| 115 | + {:mark {:type "area"} |
| 116 | + :data data |
| 117 | + :transform [{:filter (format "datum['%s'] != 'NA'" value-field)} |
| 118 | + {:density value-field |
| 119 | + :groupby [group-field] |
| 120 | + ;; :bandwidth 1.0 |
| 121 | + :extent [min max]}] |
| 122 | + :height 50 ;this is the height of each row (facet) |
| 123 | + :encoding |
| 124 | + {:color {:field group-field :type "nominal" :legend false} |
| 125 | + :y {:field "density" |
| 126 | + :type "quantitative" |
| 127 | + :stack "center" |
| 128 | + :axis false |
| 129 | + } ;this reflect-doubles the area plot to produce the violin shape |
| 130 | + :x {:field "value" |
| 131 | + :type "quantitative"} |
| 132 | + :row {:field group-field |
| 133 | + :type "nominal" |
| 134 | + :columns 1 |
| 135 | + :spacing 0 |
| 136 | + :header {:labelAngle 0 :labelAlign "left"} |
| 137 | + } |
| 138 | + } |
| 139 | + :width 800 |
| 140 | + }) |
| 141 | + |
| 142 | +;;; TODO show box, whiskers, points |
| 143 | +;;; TODO need a better dataset, this is boring |
| 144 | + |
| 145 | +;;; # Here's a more scientific example |
| 146 | + |
| 147 | +(def iris |
| 148 | + (json/read-str (slurp "https://storage.googleapis.com/kagglesdsdata/datasets/20079/26025/iris.json?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251129%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251129T050349Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=a076da9c0375641bed362393229356cd341ab694a356bbbadb6678f654ef58880b12de60a307cff229fc845e66a05acc14621bc4a6a022fc6419e0431327bc9b8105ca66e8289bd4b030825dfb5e0aaa7b0824bb9ebe9ed087c23329fb8a9259c86d0bccfdfe4da1f4d7ae84a91e14dc0df16aa011afecaa2daa1a96d83efc170e2d50758690b22e9b1fb289a476786d15f756e84724706c5581389462938de2a7d6d7ec38e20a7d7edc9b143ddef286e462f07c7827900a9e2130ca41cf21ce7da1e540d599d6bec333a0eae26af1532bf2ba745fd07e197226fb75795b1655aab3f62d097fa9be56a907e8c98601deb5c6c880e5ccc00617752ea92518f945"))) |
| 149 | + |
| 150 | +^:kind/vega-lite |
| 151 | +(violin-plot {:values iris} "petalWidth" "species" 0 4) |
| 152 | + |
| 153 | +^:kind/vega-lite |
| 154 | +(violin-plot {:url penguin-data-url |
| 155 | + :format {:type "tsv"} |
| 156 | + } |
| 157 | + "flipper_length_mm" "species island" |
| 158 | + 150 250) |
| 159 | + |
| 160 | + |
| 161 | +;;; # Dotplot |
| 162 | + |
| 163 | +(defn dot-plot |
| 164 | + [data value-field group-field min max] |
| 165 | + {:mark {:type "point" :tooltip {:content :data}} |
| 166 | + :data data |
| 167 | + :transform [{:filter (format "datum['%s'] != 'NA'" value-field)} |
| 168 | + {:calculate "random()" :as "jitter"}] |
| 169 | + :height 50 ;this is the height of each row (facet) |
| 170 | + :encoding |
| 171 | + {:color {:field group-field :type "nominal" :legend false} |
| 172 | + :x {:field value-field |
| 173 | + :type "quantitative" |
| 174 | + :scale {:domain [min max]}} |
| 175 | + :y {:field "jitter" |
| 176 | + :type "quantitative" |
| 177 | + :axis false} |
| 178 | + :row {:field group-field |
| 179 | + :type "nominal" |
| 180 | + :columns 1 |
| 181 | + :spacing 0 |
| 182 | + :header {:labelAngle 0 :labelAlign "left"} |
| 183 | + } |
| 184 | + } |
| 185 | + :width 800 |
| 186 | + }) |
| 187 | + |
| 188 | +^:kind/vega-lite |
| 189 | +(dot-plot {:url penguin-data-url |
| 190 | + :format {:type "tsv"} |
| 191 | + } |
| 192 | + "flipper_length_mm" "species island" |
| 193 | + 150 250 |
| 194 | + ) |
| 195 | + |
| 196 | +;;; TODO vertical violins |
| 197 | +;;; TODO controls |
| 198 | +;;; TODO layering |
| 199 | +;;; TODO more options |
| 200 | +;;; TODO merge |
| 201 | + |
| 202 | + |
| 203 | +;;; # Combining layers |
| 204 | + |
| 205 | + |
| 206 | +(defn box-dot-plot |
| 207 | + [data value-field group-field min max] |
| 208 | + { |
| 209 | + ;; Data is in common |
| 210 | + :data data |
| 211 | + |
| 212 | + :facet |
| 213 | + {:row {:field group-field |
| 214 | + :type "nominal" |
| 215 | + :spacing 0 ;??? not working |
| 216 | + :header {:labelAngle 0 :labelAlign "left"} |
| 217 | + }} |
| 218 | + |
| 219 | + :spec |
| 220 | + { |
| 221 | + :height 50 ;this is the height of each row (facet) |
| 222 | + :encoding |
| 223 | + {:x {:field value-field |
| 224 | + :type "quantitative" |
| 225 | + :scale {:domain [min max]}} |
| 226 | + } |
| 227 | + |
| 228 | + :layer |
| 229 | + [{:mark {:type "point" :tooltip {:content :data}} |
| 230 | + :transform [{:filter (format "datum['%s'] != 'NA'" value-field)} |
| 231 | + {:calculate "random()" :as "jitter"}] |
| 232 | + |
| 233 | + :encoding |
| 234 | + {:color {:value "gray"} |
| 235 | + :y {:field "jitter" |
| 236 | + :type "quantitative" |
| 237 | + :axis false} |
| 238 | + }} |
| 239 | + |
| 240 | + ;; box layer |
| 241 | + ;; TODO turn off outliers, widen box |
| 242 | + {:mark {:type "boxplot" :outliers false} |
| 243 | + :encoding |
| 244 | + {:color {:field group-field :type "nominal" :legend false} |
| 245 | + }}] |
| 246 | + :width 800 |
| 247 | + } |
69 | 248 |
|
| 249 | + }) |
70 | 250 |
|
71 | 251 |
|
72 | 252 |
|
| 253 | +^:kind/vega-lite |
| 254 | +(box-dot-plot {:url penguin-data-url |
| 255 | + :format {:type "tsv"} |
| 256 | + } |
| 257 | + "flipper_length_mm" "species island" |
| 258 | + 150 250 |
| 259 | + ) |
0 commit comments