Skip to content

Commit 6eab7ae

Browse files
committed
piton, violin parameterized, merge-recursive
1 parent 1622498 commit 6eab7ae

1 file changed

Lines changed: 159 additions & 43 deletions

File tree

src/data_visualization/violin.clj

Lines changed: 159 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
[scicloj.kindly.v4.api :as kindly]
1717
[scicloj.kindly.v4.kind :as kind]
1818
[clojure.java.io :as io]
19+
[clojure.set :as set]
1920
))
2021

2122
;; # The World's Smallest Violin (plot generating code)
@@ -66,7 +67,7 @@
6667
;;; Let's start off with a simple dot-plot. We'll group the data by species, and turn each value for body_mass into a point
6768

6869

69-
;;; Vega really just requires specifying some basic mappings (encodings) between data and visual properties. This is a minimal version of dot plot:
70+
;;; Vega really just requires specifying some basic mappings (encodings) between data and visual properties. This is a minimal version of a dot plot, with the points grouped by species:
7071

7172
^:kind/vega-lite
7273
{:mark {:type "point"}
@@ -127,75 +128,199 @@
127128
}
128129

129130

131+
;;; # Violin Plot
132+
133+
;;; Violin plots extend the idea of a box plot. Instead of showing only a few coarse statistics like the median, a violin plot shows the [probability density](https://en.wikipedia.org/wiki/Probability_density_function) of the variable as a continuous curve.
134+
135+
;;; Vega-lite provides a [`:density`](https://vega.github.io/vega-lite/docs/density.html) transform that does the work of computing this. This transform has a number of options; `:bandwidth` controls the degree of smoothing of the density curve, and you can select a value depending on your data and needs.
136+
137+
(kind/vega-lite
138+
{:mark {:type :area}
139+
:data {:values (tc/rows penguin-data :as-maps)}
140+
:transform [{:density "body_mass_g"
141+
:groupby ["species island"]
142+
:bandwidth 80
143+
}]
144+
:encoding
145+
{:color {:field "species island"
146+
:type :nominal
147+
:legend false}
148+
:x {:field "value"
149+
:title "body_mass_g"
150+
:type :quantitative
151+
:scale {:zero false}}
152+
:y {:field "density"
153+
:type :quantitative
154+
:stack :center ; this reflects the area plot to produce the violin shape.
155+
:axis false ; hide some labels
156+
}
157+
:row {:field "species island"
158+
:type :nominal
159+
:spacing 0
160+
:header {:labelAngle 0 :labelAlign :left}
161+
}
162+
}
163+
:height 50 ;this is the height of each row (facet)
164+
:width 800
165+
})
166+
167+
168+
;;; # Abstraction
169+
170+
;;; Once we know how to make a visualization, it makes sense to abstract it into a procedure, so that this knowledge is captured in a reusable function.
171+
172+
173+
;;; For instance, here is the dot plot from earlier wrapped in a function:
130174

131175
(defn dot-plot
132-
[data value-field group-field min max]
176+
[data value-field group-field]
133177
{:mark {:type "point" :tooltip {:content :data}}
134178
:data data
135-
:transform [{:filter (format "datum['%s'] != 'NA'" value-field)}
136-
{:calculate "random()" :as "jitter"}]
137-
:height 50 ;this is the height of each row (facet)
138179
:encoding
139-
{:color {:field group-field :type "nominal" :legend false}
140-
:x {:field value-field
180+
{:x {:field value-field
141181
:type "quantitative"
142-
:scale {:domain [min max]}}
143-
:y {:field "jitter"
182+
:scale {:zero false}}
183+
:row {:field group-field
184+
:type "nominal"
185+
:header {:labelAngle 0 :labelAlign "left"}
186+
:spacing 0}
187+
:color {:field group-field
188+
:type "nominal"
189+
:legend false}
190+
}
191+
:height 50
192+
:width 800
193+
})
194+
195+
;;; Which can be used like this:
196+
197+
198+
^:kind/vega-lite
199+
(dot-plot {:values (tc/rows penguin-data :as-maps)}
200+
"flipper_length_mm" "year"
201+
)
202+
203+
204+
;;; On any data set
205+
^:kind/vega-lite
206+
(dot-plot {:url "https://vega.github.io/editor/data/movies.json"}
207+
"US Gross" "Major Genre")
208+
209+
210+
211+
;;; Add jitter
212+
213+
(defn dot-plot-2
214+
[data value-field group-field jitter?]
215+
{:mark {:type "point" :tooltip {:content :data}}
216+
:data data
217+
:transform (if jitter? [{:calculate "random()" :as "jitter"}] [])
218+
:encoding
219+
{:x {:field value-field
144220
:type "quantitative"
145-
:axis false}
221+
:scale {:zero false}}
222+
:y (when jitter?
223+
{:field "jitter"
224+
:type "quantitative"
225+
:axis false})
146226
:row {:field group-field
147227
:type "nominal"
148-
:columns 1
149-
:spacing 0
150228
:header {:labelAngle 0 :labelAlign "left"}
151-
}
229+
:spacing 0}
230+
:color {:field group-field
231+
:type "nominal"
232+
:legend false}
152233
}
234+
:height 50
153235
:width 800
154236
})
155237

156238

239+
;;; On any data set
240+
^:kind/vega-lite
241+
(dot-plot-2 {:url "https://vega.github.io/editor/data/movies.json"}
242+
"US Gross" "Major Genre" true)
157243

158244

245+
;;; # Generalize
159246

160-
;;; # Violin Plot
247+
;;; This section introduces a new, and somewhat funky way of using and generalizing Vega specs.
161248

162-
;;; Violin plots extend the ida of a box plot. Isntead of showing just the mean and some percentiles, a violin plot shows the probability density as a continuous variables.
249+
;;; Take our dot-plot abstraction above. We could parameterize it further, say with a `:type` argument that could be `:dotplot` or `:boxplot`. But instead, we're going to hack it by introducing a function that can merge arbitrarily nested structures. This means we can alter any aspect of the spec, at the cost of needing some knowledge of its structure. E.g., we could change the height, spacing, or fonts.
163250

164-
;;; ## Kernel Density Estimate
165251

252+
;; TODO maybe more confusing than it is worth here. For a later section?
253+
;; From multitool TODO link
166254

167-
(kind/vega-lite
255+
(defn merge-recursive
256+
"Merge two arbitrarily nested map structures. Terminal seqs are concatenated, terminal sets are merged."
257+
[m1 m2]
258+
(cond (and (map? m1) (map? m2))
259+
(merge-with merge-recursive m1 m2)
260+
(and (set? m1) (set? m2))
261+
(set/union m1 m2)
262+
(and (vector? m1) (vector? m2))
263+
(into [] (concat m1 m2))
264+
(and (sequential? m1) (sequential? m2))
265+
(concat m1 m2)
266+
(nil? m2) m1
267+
:else m2))
268+
269+
(defn box-plot
270+
[data value-field group-field]
271+
(-> (dot-plot-2 data value-field group-field false)
272+
(merge-recursive
273+
{:mark {:type :boxplot
274+
:extent :min-max}})))
275+
276+
277+
278+
;;; Next, an abstraction for violin plots. This could also be built from dot-plot-2, but since there's a lot to change, let's not.
279+
280+
281+
(defn violin-plot
282+
[data value-field group-field & [bandwidth]] ; optional bandwidth: destructured so :bandwidth is a number (or nil when omitted), not a rest-args seq
168283
{:mark {:type :area}
169-
:data {:values (tc/rows penguin-data :as-maps)}
170-
:transform [{:density "flipper_length_mm"
171-
:groupby ["species island"]
172-
:extent [150 250]
173-
:bandwidth 2
284+
:data data
285+
:transform [{:density value-field
286+
:groupby [group-field]
287+
:bandwidth bandwidth
174288
}]
175-
:height 50 ;this is the height of each row (facet)
176289
:encoding
177-
{:color {:field "species island"
290+
{:color {:field group-field
178291
:type :nominal
179292
:legend false}
293+
:x {:field "value"
294+
:type :quantitative
295+
:title value-field
296+
:scale {:zero false}}
180297
:y {:field "density"
181298
:type :quantitative
182-
:stack :center ; this reflects the area plot to produce the violin shape.
183-
:axis false ; hide some labels
299+
:stack :center ; this reflects the area plot to produce the violin shape.
300+
:axis false ; hide some labels
184301
}
185-
:x {:field "value"
186-
:type :quantitative
187-
:scale {:zero false}} ;not strictly necessary
188-
:row {:field "species island"
302+
:row {:field group-field
189303
:type :nominal
190-
:columns 1
191304
:spacing 0
192305
:header {:labelAngle 0 :labelAlign :left}
193306
}
194307
}
308+
:height 50 ;this is the height of each row (facet)
195309
:width 800
196310
})
197311

198312

313+
^:kind/vega-lite
314+
(-> (violin-plot {:url "https://vega.github.io/editor/data/movies.json"}
315+
"US Gross" "Major Genre" 5000000)
316+
(merge-recursive {:encoding {:x {:scale {:domain [0 100000000]}}} ;force scale to exclude outliers
317+
:mark {:clip true}})) ;and don't plot them
318+
319+
320+
321+
322+
323+
199324
;;; We can try some of the other properties
200325

201326
(kind/vega-lite
@@ -258,8 +383,7 @@
258383
:width 800
259384
})
260385

261-
;;; TODO show box, whiskers, points
262-
;;; TODO need a better dataset, this is boring
386+
263387

264388
;;; # Here's a more scientific example
265389

@@ -298,13 +422,7 @@
298422

299423

300424

301-
^:kind/vega-lite
302-
(dot-plot {:url penguin-data-url
303-
:format {:type "tsv"}
304-
}
305-
"flipper_length_mm" "species island"
306-
150 250
307-
)
425+
308426

309427
;;; TODO vertical violins
310428
;;; TODO controls
@@ -385,9 +503,7 @@
385503

386504
;;; ## Movie Dataset
387505

388-
;;; Get the movie dataset
389-
(def movie-data
390-
(json/read-str (slurp "https://vega.github.io/editor/data/movies.json") ))
506+
391507

392508
;;; Here we'll take a look at a sample of the data (selected columns and just a few rows).
393509
(kind/table

0 commit comments

Comments
 (0)