|
16 | 16 | [scicloj.kindly.v4.api :as kindly] |
17 | 17 | [scicloj.kindly.v4.kind :as kind] |
18 | 18 | [clojure.java.io :as io] |
| 19 | + [clojure.set :as set] |
19 | 20 | )) |
20 | 21 |
|
21 | 22 | ;; # The World's Smallest Violin (plot generating code) |
|
66 | 67 | ;;; Let's start off with a simple dot-plot. We'll group the data by species, and turn each value for body_mass into a point |
67 | 68 |
|
68 | 69 |
|
69 | | -;;; Vega really just requires specifying some basic mappings (encodings) between data and visual properties. This is a minimal version of dot plot: |
| 70 | +;;; Vega really just requires specifying some basic mappings (encodings) between data and visual properties. This is a minimal version of a dot plot, with the points grouped by species: |
70 | 71 |
|
71 | 72 | ^:kind/vega-lite |
72 | 73 | {:mark {:type "point"} |
|
127 | 128 | } |
128 | 129 |
|
129 | 130 |
|
| 131 | +;;; # Violin Plot |
| 132 | + |
| 133 | +;;; Violin plots extend the idea of a box plot. Instead of showing a few coarse statistics like the median, a violin plot shows the [probability density](https://en.wikipedia.org/wiki/Probability_density_function) of the variable as a continuous curve. |
| 134 | + |
| 135 | +;;; Vega-lite provides a [`:density`](https://vega.github.io/vega-lite/docs/density.html) transform that does the work of computing this. This transform has a number of options; `:bandwidth` controls the degree of smoothing of the density curve, and you can select a value depending on your data and needs. |
| 136 | + |
| 137 | +(kind/vega-lite |
| 138 | + {:mark {:type :area} |
| 139 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 140 | + :transform [{:density "body_mass_g" |
| 141 | + :groupby ["species island"] |
| 142 | + :bandwidth 80 |
| 143 | + }] |
| 144 | + :encoding |
| 145 | + {:color {:field "species island" |
| 146 | + :type :nominal |
| 147 | + :legend false} |
| 148 | + :x {:field "value" |
| 149 | + :title "body_mass_g" |
| 150 | + :type :quantitative |
| 151 | + :scale {:zero false}} |
| 152 | + :y {:field "density" |
| 153 | + :type :quantitative |
| 154 | + :stack :center ; center-stacking mirrors the area around its midline, which produces the violin shape. |
| 155 | + :axis false ; hide the y axis entirely (ticks, labels and title) |
| 156 | + } |
| 157 | + :row {:field "species island" |
| 158 | + :type :nominal |
| 159 | + :spacing 0 |
| 160 | + :header {:labelAngle 0 :labelAlign :left} |
| 161 | + } |
| 162 | + } |
| 163 | + :height 50 ; height of each row (facet), not of the whole chart |
| 164 | + :width 800 |
| 165 | + }) |
| 166 | + |
| 167 | + |
| 168 | +;;; # Abstraction |
| 169 | + |
| 170 | +;;; Once we know how to make a visualization, it makes sense to abstract it into a procedure, capturing this knowledge in a function. |
| 171 | + |
| 172 | + |
| 173 | +;;; For instance, here is the dot plot from above wrapped in a function: |
130 | 174 |
|
(defn dot-plot
  "Build a Vega-Lite dot-plot spec.

  Each datum becomes a point placed along x by `value-field`; the data is
  faceted into one row per distinct `group-field` value, and points are
  colored by that same field (legend suppressed).

  `data` is used verbatim as the spec's :data entry, e.g. {:values [...]}
  or {:url \"...\"}. Returns the spec as a plain map."
  [data value-field group-field]
  (let [x-channel     {:field value-field
                       :type  "quantitative"
                       ;; do not force the x scale to start at zero
                       :scale {:zero false}}
        row-channel   {:field   group-field
                       :type    "nominal"
                       :header  {:labelAngle 0 :labelAlign "left"}
                       :spacing 0}
        color-channel {:field  group-field
                       :type   "nominal"
                       :legend false}]
    {:mark     {:type "point" :tooltip {:content :data}}
     :data     data
     :encoding {:x     x-channel
                :row   row-channel
                :color color-channel}
     :height   50   ; height of each facet row
     :width    800}))
| 194 | + |
| 195 | +;;; Which can be used like this: |
| 196 | + |
| 197 | + |
| 198 | +^:kind/vega-lite |
| 199 | +(dot-plot {:values (tc/rows penguin-data :as-maps)} |
| 200 | + "flipper_length_mm" "year" |
| 201 | + ) |
| 202 | + |
| 203 | + |
| 204 | +;;; On any data set |
| 205 | +^:kind/vega-lite |
| 206 | +(dot-plot {:url "https://vega.github.io/editor/data/movies.json"} |
| 207 | + "US Gross" "Major Genre") |
| 208 | + |
| 209 | + |
| 210 | + |
| 211 | +;;; Add jitter |
| 212 | + |
(defn dot-plot-2
  "Build a Vega-Lite dot-plot spec, optionally with vertical jitter.

  Like a plain dot plot (x = `value-field`, one facet row and color per
  `group-field` value). When `jitter?` is truthy, a `random()` value is
  calculated per datum as \"jitter\" and mapped to an axis-less y channel so
  overlapping points spread out. When `jitter?` is falsey the spec carries an
  empty :transform vector and a nil :y encoding.

  `data` is used verbatim as the spec's :data entry. Returns a plain map."
  [data value-field group-field jitter?]
  (let [transforms    (if jitter?
                        [{:calculate "random()" :as "jitter"}]
                        [])
        y-channel     (when jitter?
                        {:field "jitter"
                         :type  "quantitative"
                         :axis  false})
        x-channel     {:field value-field
                       :type  "quantitative"
                       :scale {:zero false}}
        row-channel   {:field   group-field
                       :type    "nominal"
                       :header  {:labelAngle 0 :labelAlign "left"}
                       :spacing 0}
        color-channel {:field  group-field
                       :type   "nominal"
                       :legend false}]
    {:mark      {:type "point" :tooltip {:content :data}}
     :data      data
     :transform transforms
     :encoding  {:x     x-channel
                 :y     y-channel
                 :row   row-channel
                 :color color-channel}
     :height    50   ; height of each facet row
     :width     800}))
155 | 237 |
|
156 | 238 |
|
| 239 | +;;; On any data set |
| 240 | +^:kind/vega-lite |
| 241 | +(dot-plot-2 {:url "https://vega.github.io/editor/data/movies.json"} |
| 242 | + "US Gross" "Major Genre" true) |
157 | 243 |
|
158 | 244 |
|
| 245 | +;;; # Generalize |
159 | 246 |
|
160 | | -;;; # Violin Plot |
| 247 | +;;; This section introduces a new, and somewhat funky way of using and generalizing Vega specs. |
161 | 248 |
|
162 | | -;;; Violin plots extend the ida of a box plot. Isntead of showing just the mean and some percentiles, a violin plot shows the probability density as a continuous variables. |
| 249 | +;;; Take our dot-plot abstraction above. We could parameterize it further, say :type which could be :dotplot or :boxplot. But instead, we're going to hack it by introducing a function that can merge arbitrarily nested structures. This means we can alter any aspect of the spec, at the cost of having to have some knowledge of its structure. E.g., we could change the height or spacing or fonts. |
163 | 250 |
|
164 | | -;;; ## Kernel Density Estimate |
165 | 251 |
|
| 252 | +;; TODO maybe more confusing than it is worth here. For a later section? |
| 253 | +;; From multitool TODO link |
166 | 254 |
|
167 | | -(kind/vega-lite |
(defn merge-recursive
  "Merge two arbitrarily nested structures, with `m2` taking precedence.

  - two maps: merged key by key, recursing on keys present in both
  - two sets: unioned
  - two vectors: concatenated into a new vector
  - two other sequentials: concatenated (lazy seq)
  - `m2` nil: `m1` is kept
  - anything else: `m2` replaces `m1`"
  [m1 m2]
  (let [both? (fn [pred] (and (pred m1) (pred m2)))]
    (cond
      (both? map?)        (merge-with merge-recursive m1 m2)
      (both? set?)        (set/union m1 m2)
      (both? vector?)     (vec (concat m1 m2))
      (both? sequential?) (concat m1 m2)
      (nil? m2)           m1
      :else               m2)))
| 268 | + |
(defn box-plot
  "Build a Vega-Lite box-plot spec for `value-field`, one row per `group-field`.

  Starts from the un-jittered `dot-plot-2` spec and overlays a :boxplot mark
  (whiskers extending to min/max) via `merge-recursive`, so every other
  setting of the base spec is retained."
  [data value-field group-field]
  (let [base-spec     (dot-plot-2 data value-field group-field false)
        mark-override {:mark {:type   :boxplot
                              :extent :min-max}}]
    (merge-recursive base-spec mark-override)))
| 275 | + |
| 276 | + |
| 277 | + |
| 278 | +;;; Next, an abstraction for violin plots. This could also be built from dot-plot-2, but since there's a lot to change let's not. |
| 279 | + |
| 280 | + |
(defn violin-plot
  "Build a Vega-Lite violin-plot spec.

  A kernel density estimate of `value-field` is computed per `group-field`
  value (Vega-Lite :density transform, which emits \"value\" and \"density\"
  fields) and drawn as a center-stacked area, one facet row per group.

  Arguments:
    data        - used verbatim as the spec's :data entry
                  (e.g. {:values [...]} or {:url \"...\"})
    value-field - name of the quantitative field to estimate the density of
    group-field - name of the field to group, facet and color by
    bandwidth   - optional number; kernel bandwidth in the units of
                  `value-field`. When omitted, 0 is used, which tells
                  Vega-Lite to estimate the bandwidth from the data.

  Returns the spec as a plain map."
  ;; `& [bandwidth]` destructures the optional argument to a single number.
  ;; A bare `& bandwidth` would bind the whole rest-seq, e.g. (5000000),
  ;; which serializes as an array instead of the number Vega-Lite expects.
  [data value-field group-field & [bandwidth]]
  {:mark {:type :area}
   :data data
   :transform [{:density value-field
                :groupby [group-field]
                :bandwidth (or bandwidth 0)}]
   :encoding
   {:color {:field group-field
            :type :nominal
            :legend false}
    :x {:field "value"
        :type :quantitative
        :title value-field
        :scale {:zero false}}
    :y {:field "density"
        :type :quantitative
        :stack :center                  ; mirror the area to produce the violin shape
        :axis false}                    ; hide the density axis
    :row {:field group-field
          :type :nominal
          :spacing 0
          :header {:labelAngle 0 :labelAlign :left}}}
   :height 50                           ; height of each row (facet)
   :width 800})
197 | 311 |
|
198 | 312 |
|
| 313 | +^:kind/vega-lite |
| 314 | +(-> (violin-plot {:url "https://vega.github.io/editor/data/movies.json"} |
| 315 | + "US Gross" "Major Genre" 5000000) |
| 316 | + (merge-recursive {:encoding {:x {:scale {:domain [0 100000000]}}} ; pin the x scale to 0-100M so outliers fall outside the domain |
| 317 | + :mark {:clip true}})) ; and clip marks that fall outside that domain instead of drawing them |
| 318 | + |
| 319 | + |
| 320 | + |
| 321 | + |
| 322 | + |
| 323 | + |
199 | 324 | ;;; We can try some of the other properties |
200 | 325 |
|
201 | 326 | (kind/vega-lite |
|
258 | 383 | :width 800 |
259 | 384 | }) |
260 | 385 |
|
261 | | -;;; TODO show box, whiskers, points |
262 | | -;;; TODO need a better dataset, this is boring |
| 386 | + |
263 | 387 |
|
264 | 388 | ;;; # Here's a more scientific example |
265 | 389 |
|
|
298 | 422 |
|
299 | 423 |
|
300 | 424 |
|
301 | | -^:kind/vega-lite |
302 | | -(dot-plot {:url penguin-data-url |
303 | | - :format {:type "tsv"} |
304 | | - } |
305 | | - "flipper_length_mm" "species island" |
306 | | - 150 250 |
307 | | - ) |
| 425 | + |
308 | 426 |
|
309 | 427 | ;;; TODO vertical violins |
310 | 428 | ;;; TODO controls |
|
385 | 503 |
|
386 | 504 | ;;; ## Movie Dataset |
387 | 505 |
|
388 | | -;;; Get the movie dataset |
389 | | -(def movie-data |
390 | | - (json/read-str (slurp "https://vega.github.io/editor/data/movies.json") )) |
| 506 | + |
391 | 507 |
|
392 | 508 | ;;; Here we'll take a look at a sample of the data (selected columns and just a few rows). |
393 | 509 | (kind/table |
|
0 commit comments