|
1 | 1 | ^{:kindly/hide-code true |
2 | 2 | :clay |
3 | | - {:title "Introduction to Violin Plots with Vega" |
| 3 | + {:title "Introduction to Violin Plots with Vega Lite" |
4 | 4 | :quarto {:author :mt |
5 | 5 | :description "..." |
6 | | - :image "overlayplot.png" |
| 6 | + :image "violin-example.png" |
7 | 7 | :type :post |
8 | | - :date "2025-11-24" |
| 8 | + :date "2026-01-05" |
9 | 9 | :category :clojure |
10 | 10 | :tags [:dataviz :vega-lite]}}} |
11 | 11 |
|
|
15 | 15 | [tablecloth.api :as tc] |
16 | 16 | [scicloj.kindly.v4.api :as kindly] |
17 | 17 | [scicloj.kindly.v4.kind :as kind] |
| 18 | + [clojure.java.io :as io] |
18 | 19 | )) |
19 | 20 |
|
20 | | -;;; I can't stand writing long text in ;;; comments, so import it from actual .md files |
21 | | -^{:kind/md true :kindly/hide-code true} |
22 | | -(slurp "src/data_visualization/violin1.md") |
| 21 | +;; # The World's Smallest Violin (plot generating code) |
23 | 22 |
|
24 | | -;;; # Get some data |
25 | 23 |
|
26 | | -;;; ## Movie Dataset |
| 24 | +;;; This post will explain how to visualize data with violin plots, using Vega-Lite and Clojure. |
27 | 25 |
|
28 | | -;;; Get the movie dataset |
29 | | -(def movie-data |
30 | | - (json/read-str (slurp "https://vega.github.io/editor/data/movies.json") )) |
31 | 26 |
|
32 | | -;;; Here we'll take a look at a sample of the data (selected columns and just a few rows). |
33 | | -(kind/table |
34 | | - {:row-maps (take 10 movie-data) |
35 | | - :column-names ["Title" "IMDB Rating" "US Gross" "Distributor" "Production Budget" "MPAA Rating" "Major Genre"]}) |
| 27 | + |
| 28 | +;; # What is a violin plot? |
| 29 | + |
| 30 | +;; A [Violin plot](https://en.wikipedia.org/wiki/Violin_plot) is a way to visualize how data is distributed - essentially showing you where your data points fall and how spread out they are. |
| 31 | + |
| 32 | +;; Imagine you're analyzing a dataset of movies and how much money they made. A violin plot would show you not just the median value, but the full "shape" of your data: Are values clustered around certain points, or evenly spread out? How much concentration is there? How many such points? |
| 33 | + |
| 34 | +;; A violin plot is best understood as an extension of the more common box plot. Violin plots add a visualization of the probability **density**, and can reveal more features of the data, such as multiple modes. This tutorial shows you how to make box plots and violin plots in Vega-Lite. |
| 35 | + |
| 36 | +^:kindly/hide-code |
| 37 | +(import javax.imageio.ImageIO |
| 38 | + java.net.URL) |
| 39 | + |
| 40 | +;;; TODO use movie example, and tweak bandwidth |
| 41 | + |
| 42 | +^:kindly/hide-code |
| 43 | +(-> "src/data_visualization/violin-example.png" |
| 44 | + io/file |
| 45 | + (ImageIO/read)) |
| 46 | + |
| 47 | +;;; If you want to see a full-fledged implementation of interactive violin plots for visualizing biological data, [the BRUCE website](https://bruce.parkerici.org) has one. |
36 | 48 |
|
37 | 49 |
|
38 | | -;;; ## Penguin dataset |
39 | 50 |
|
40 | | -(def penguin-data-url "https://raw.githubusercontent.com/ttimbers/palmerpenguins/refs/heads/file-variants/inst/extdata/penguins.tsv") |
| 51 | +;;; # Data |
| 52 | + |
| 53 | +;;; We'll use this classic [dataset about penguin morphology](https://github.com/ttimbers/palmerpenguins/blob/master/README.md). <img src='man/figures/logo.png' align="right" height="138.5" /> Each row describes an individual penguin, with properties like species, sex, body mass, and wing size. |
| 54 | + |
| 55 | +(def penguin-data-url |
| 56 | + "https://raw.githubusercontent.com/ttimbers/palmerpenguins/refs/heads/file-variants/inst/extdata/penguins.tsv") |
41 | 57 |
|
42 | 58 | (def penguin-data |
43 | 59 | (tc/dataset penguin-data-url {:key-fn keyword})) |
44 | 60 |
|
45 | | -(kind/table (tc/random penguin-data 10)) |
| 61 | +(kind/table |
| 62 | + (tc/random penguin-data 10)) |
46 | 63 |
|
47 | | -;;; # Boxplot |
| 64 | +;;; # Just the points, ma'am |
| 65 | + |
| 66 | +;;; Let's start off with a simple dot-plot. We'll group the data by species, and turn each value for `body_mass_g` into a point. |
48 | 67 |
|
49 | 68 |
|
50 | | -;;; A basic boxplot shows the distribution of a single varianle. Here we look at the distribution of US gross profits: |
| 69 | +;;; Vega really just requires specifying some basic mappings (encodings) between data and visual properties. This is a minimal version of a dot plot: |
| 70 | + |
51 | 71 | ^:kind/vega-lite |
52 | | - {:mark {:type "boxplot"} |
53 | | - :data {:values movie-data} |
54 | | - :encoding |
55 | | - {"x" {:field "US Gross" |
56 | | - :type "quantitative"} |
57 | | - :tooltip {:field "Title"}} |
58 | | - :width 800 |
59 | | - } |
| 72 | +{:mark {:type "point"} |
| 73 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 74 | + :encoding |
| 75 | + {:x {:field "body_mass_g" |
| 76 | + :type "quantitative"} |
| 77 | + :y {:field "species island" |
| 78 | + :type "nominal"}} |
| 79 | + } |
60 | 80 |
|
| 81 | +;;; Vega's defaults are not always what we want, so this is the same as above with a bit of tweaking to look more like what we want. One nonobvious change: we use `:row` in place of `:y`. This is not strictly necessary at this point, but will make it easier when we get to actual violin plots. |
61 | 82 |
|
| 83 | +^:kind/vega-lite |
| 84 | +{:mark {:type "point" :tooltip {:content :data}} |
| 85 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 86 | + :encoding |
| 87 | + {:x {:field "body_mass_g" |
| 88 | + :type "quantitative" |
| 89 | + :scale {:zero false}} |
| 90 | + :row {:field "species island" |
| 91 | + :type "nominal" |
| 92 | + :header {:labelAngle 0 :labelAlign "left"} |
| 93 | + :spacing 0 |
| 94 | + } |
| 95 | + :color {:field "species island" |
| 96 | + :type "nominal" |
| 97 | + :legend false} |
| 98 | + } |
| 99 | + :height 50 |
| 100 | + :width 800 |
| 101 | + } |
62 | 102 |
|
| 103 | +;;; In this case the data is not very dense, so plotting everything with a constant y value is OK. In a more polished application, you might want to add random y jitter to make the points more visually distinguishable. |
63 | 104 |
|
64 | 105 |
|
65 | 106 |
|
66 | | -;;; But boxplots are more useful when you compare distributions given a second variable. Here we see the different distributions for specific genres of movie. |
| 107 | +;;; # Boxplot |
67 | 108 |
|
68 | | -^:kind/vega-lite |
69 | | - {:mark {:type "boxplot"} |
70 | | - :data {:values movie-data} |
71 | | - :encoding |
72 | | - {"x" {:field "US Gross" |
73 | | - :type "quantitative"} |
74 | | - "y" {:field "Major Genre" |
75 | | - :type "nominal"} |
76 | | - "color" {:field "Major Genre" :type "nominal" :legend false} |
77 | | - :tooltip {:field "Title"}} |
78 | | - :width 800 |
79 | | - } |
| 109 | +;;; A boxplot is another way of displaying the distribution of a single numeric variable. |
| 110 | +;;; A box plot summarizes a distribution of quantitative values using a set of summary statistics. The median tick in the box represents the median. The left and right parts of the box represent the first and third quartile respectively. The whisker shows the full domain of the data. |
80 | 111 |
|
81 | | -;;; This shows us the median (white line) |
| 112 | +^:kind/vega-lite |
| 113 | +{:mark {:type :boxplot |
| 114 | + :extent :min-max} |
| 115 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 116 | + :encoding |
| 117 | + {:x {:field "body_mass_g" |
| 118 | + :type :quantitative |
| 119 | + :scale {:zero false}} |
| 120 | + :y {:field "species island" |
| 121 | + :type :nominal} |
| 122 | + :color {:field "species island" |
| 123 | + :type "nominal" |
| 124 | + :legend false}} |
| 125 | + :height {:step 50} |
| 126 | + :width 800 |
| 127 | + } |
82 | 128 |
|
83 | | -;;; # Violins |
84 | 129 |
|
85 | 130 |
|
86 | | -(kind/vega-lite |
87 | | - {:mark {:type "area"} |
88 | | - :data {:values movie-data} |
89 | | - :transform [{:density "US Gross" |
90 | | - :groupby ["Major Genre"] |
91 | | - :extent [0 200000000]}] |
92 | | - :height 50 ;this is the height of each row (facet) |
93 | | - :encoding |
94 | | - {"color" {:field "Major Genre" :type "nominal" :legend false} |
95 | | - "y" {:field "density" |
| 131 | +(defn dot-plot |
| 132 | + [data value-field group-field min max] |
| 133 | + {:mark {:type "point" :tooltip {:content :data}} |
| 134 | + :data data |
| 135 | + :transform [{:filter (format "datum['%s'] != 'NA'" value-field)} |
| 136 | + {:calculate "random()" :as "jitter"}] |
| 137 | + :height 50 ;this is the height of each row (facet) |
| 138 | + :encoding |
| 139 | + {:color {:field group-field :type "nominal" :legend false} |
| 140 | + :x {:field value-field |
96 | 141 | :type "quantitative" |
97 | | - :stack "center" |
98 | | - :axis false |
99 | | - } ;this reflect-doubles the area plot to produce the violin shape |
100 | | - "x" {:field "value" |
101 | | - :type "quantitative"} |
102 | | - "row" {:field "Major Genre" |
| 142 | + :scale {:domain [min max]}} |
| 143 | + :y {:field "jitter" |
| 144 | + :type "quantitative" |
| 145 | + :axis false} |
| 146 | + :row {:field group-field |
103 | 147 | :type "nominal" |
104 | 148 | :columns 1 |
105 | 149 | :spacing 0 |
106 | 150 | :header {:labelAngle 0 :labelAlign "left"} |
107 | 151 | } |
| 152 | + } |
| 153 | + :width 800 |
| 154 | + }) |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | + |
| 159 | + |
| 160 | +;;; # Violin Plot |
| 161 | + |
| 162 | +;;; Violin plots extend the idea of a box plot. Instead of showing just the median and some percentiles, a violin plot shows the probability density as a continuous curve. |
| 163 | + |
| 164 | +;;; ## Kernel Density Estimate |
| 165 | + |
| 166 | + |
| 167 | +(kind/vega-lite |
| 168 | + {:mark {:type :area} |
| 169 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 170 | + :transform [{:density "flipper_length_mm" |
| 171 | + :groupby ["species island"] |
| 172 | + :extent [150 250] |
| 173 | + :bandwidth 2 |
| 174 | + }] |
| 175 | + :height 50 ;this is the height of each row (facet) |
| 176 | + :encoding |
| 177 | + {:color {:field "species island" |
| 178 | + :type :nominal |
| 179 | + :legend false} |
| 180 | + :y {:field "density" |
| 181 | + :type :quantitative |
| 182 | + :stack :center ; this reflects the area plot to produce the violin shape. |
| 183 | + :axis false ; hide some labels |
| 184 | + } |
| 185 | + :x {:field "value" |
| 186 | + :type :quantitative |
| 187 | + :scale {:zero false}} ;not strictly necessary |
| 188 | + :row {:field "species island" |
| 189 | + :type :nominal |
| 190 | + :columns 1 |
| 191 | + :spacing 0 |
| 192 | + :header {:labelAngle 0 :labelAlign :left} |
| 193 | + } |
| 194 | + } |
| 195 | + :width 800 |
| 196 | + }) |
| 197 | + |
| 198 | + |
| 199 | +;;; We can try some of the other properties |
| 200 | + |
| 201 | +(kind/vega-lite |
| 202 | + {:mark {:type :area} |
| 203 | + :data {:values (tc/rows penguin-data :as-maps)} |
| 204 | + :transform [{:density "body_mass_g" |
| 205 | + :groupby ["species island"] |
| 206 | + :bandwidth 100 |
| 207 | + :extent [2700 6300]}] |
| 208 | + :height 50 ;this is the height of each row (facet) |
| 209 | + :encoding |
| 210 | + {:color {:field "species island" |
| 211 | + :type :nominal |
| 212 | + :legend false} |
| 213 | + :y {:field "density" |
| 214 | + :type :quantitative |
| 215 | + :stack :center ; this reflects the area plot to produce the violin shape. |
| 216 | + :axis false ; hide some labels |
| 217 | + } |
| 218 | + :x {:field "value" |
| 219 | + :type :quantitative |
| 220 | + :scale {:zero false}} ;not strictly necessary |
| 221 | + :row {:field "species island" |
| 222 | + :type :nominal |
| 223 | + :columns 1 |
| 224 | + :spacing 0 |
| 225 | + :header {:labelAngle 0 :labelAlign :left} |
| 226 | + } |
108 | 227 | } |
109 | 228 | :width 800 |
110 | 229 | }) |
|
177 | 296 |
|
178 | 297 | ;;; # Dotplot |
179 | 298 |
|
180 | | -(defn dot-plot |
181 | | - [data value-field group-field min max] |
182 | | - {:mark {:type "point" :tooltip {:content :data}} |
183 | | - :data data |
184 | | - :transform [{:filter (format "datum['%s'] != 'NA'" value-field)} |
185 | | - {:calculate "random()" :as "jitter"}] |
186 | | - :height 50 ;this is the height of each row (facet) |
187 | | - :encoding |
188 | | - {:color {:field group-field :type "nominal" :legend false} |
189 | | - :x {:field value-field |
190 | | - :type "quantitative" |
191 | | - :scale {:domain [min max]}} |
192 | | - :y {:field "jitter" |
193 | | - :type "quantitative" |
194 | | - :axis false} |
195 | | - :row {:field group-field |
196 | | - :type "nominal" |
197 | | - :columns 1 |
198 | | - :spacing 0 |
199 | | - :header {:labelAngle 0 :labelAlign "left"} |
200 | | - } |
201 | | - } |
202 | | - :width 800 |
203 | | - }) |
| 299 | + |
204 | 300 |
|
205 | 301 | ^:kind/vega-lite |
206 | 302 | (dot-plot {:url penguin-data-url |
|
281 | 377 | (box-dot-plot {:values movie-data} |
282 | 378 | "US Gross" "Major Genre" |
283 | 379 | 0 500000000) |
| 380 | + |
| 381 | + |
| 382 | +;;; # Scrap |
| 383 | + |
| 384 | +;;; # Get some data |
| 385 | + |
| 386 | +;;; ## Movie Dataset |
| 387 | + |
| 388 | +;;; Get the movie dataset |
| 389 | +(def movie-data |
| 390 | + (json/read-str (slurp "https://vega.github.io/editor/data/movies.json") )) |
| 391 | + |
| 392 | +;;; Here we'll take a look at a sample of the data (selected columns and just a few rows). |
| 393 | +(kind/table |
| 394 | + {:row-maps (take 10 movie-data) |
| 395 | + :column-names ["Title" "IMDB Rating" "US Gross" "Distributor" "Production Budget" "MPAA Rating" "Major Genre"]}) |
| 396 | + |
| 397 | + |
| 398 | +;;; # Experiment with dataset choice |
| 399 | + |
| 400 | +(def data (atom nil)) |
| 401 | + |
| 402 | +;;; ## Scittle min |
| 403 | + |
| 404 | +(kind/scittle |
| 405 | + '(def geotiff-sources |
| 406 | + (js/ol.source.GeoTIFF. |
| 407 | + (clj->js |
| 408 | + {:sources [{:min 0 |
| 409 | + :nodata 0 |
| 410 | + :max 10000 |
| 411 | + :bands [1 ;; B02 blue (490nm) -> band 1 |
| 412 | + 2 ;; B03 green (560nm) -> band 2 |
| 413 | + 3 ;; B04 red (665nm) -> band 3 |
| 414 | + 4 ;; B08 NIR (841nm) -> band 4 |
| 415 | + ] |
| 416 | + :url "https://s2downloads.eox.at/demo/Sentinel-2/3857/R10m.tif"}]})))) |
| 417 | + |
| 418 | +;;; # References |
| 419 | + |
| 420 | +;;; - [Violin Plots: A Box Plot-Density Trace Synergism](https://web.archive.org/web/20231106021405/https://quantixed.org/wp-content/uploads/2014/12/hintze_1998.pdf) Jerry L. Hintze, Ray D. Nelson |
| 421 | + |
| 422 | + |
0 commit comments