diff --git a/.gitignore b/.gitignore index ea848071..d37b19da 100644 --- a/.gitignore +++ b/.gitignore @@ -95,6 +95,8 @@ docs/_build/ *.sqlite *.sqlite3 !src/data/*.parquet +!doc/assets/data/*.csv +!doc/gallery/examples/*.csv # Configuration files .env diff --git a/doc/_quarto.yml b/doc/_quarto.yml index a44e90b2..1bc6952f 100644 --- a/doc/_quarto.yml +++ b/doc/_quarto.yml @@ -2,6 +2,7 @@ project: type: website resources: - wasm/** + - assets/data/** website: title: "ggsql" diff --git a/doc/assets/data/minard_cities.csv b/doc/assets/data/minard_cities.csv new file mode 100644 index 00000000..31597c6c --- /dev/null +++ b/doc/assets/data/minard_cities.csv @@ -0,0 +1,21 @@ +"long","lat","city" +24,55,"Kowno" +25.3,54.7,"Wilna" +26.4,54.4,"Smorgoni" +26.8,54.3,"Moiodexno" +27.7,55.2,"Gloubokoe" +27.6,53.9,"Minsk" +28.5,54.3,"Studienska" +28.7,55.5,"Polotzk" +29.2,54.4,"Bobr" +30.2,55.3,"Witebsk" +30.4,54.5,"Orscha" +30.4,53.9,"Mohilow" +32,54.8,"Smolensk" +33.2,54.9,"Dorogobouge" +34.3,55.2,"Wixma" +34.4,55.5,"Chjat" +36,55.5,"Mojaisk" +37.6,55.8,"Moscou" +36.6,55.3,"Tarantino" +36.5,55,"Malo-Jarosewii" \ No newline at end of file diff --git a/doc/assets/data/minard_troops.csv b/doc/assets/data/minard_troops.csv new file mode 100644 index 00000000..c3b527e1 --- /dev/null +++ b/doc/assets/data/minard_troops.csv @@ -0,0 +1,52 @@ +"long","lat","survivors","direction","group" +37.7,55.7,100000,"R",1 +37.5,55.7,98000,"R",1 +37,55,97000,"R",1 +36.8,55,96000,"R",1 +35.4,55.3,87000,"R",1 +34.3,55.2,55000,"R",1 +33.3,54.8,37000,"R",1 +32,54.6,24000,"R",1 +30.4,54.4,20000,"R",1 +29.2,54.3,20000,"R",1 +28.5,54.2,20000,"R",1 +28.3,54.3,20000,"R",1 +27.5,54.5,20000,"R",1 +26.8,54.3,12000,"R",1 +26.4,54.4,14000,"R",1 +25,54.4,8000,"R",1 +24.4,54.4,4000,"R",1 +24.2,54.4,4000,"R",1 +24.1,54.4,4000,"R",1 +28.7,55.5,33000,"R",2 +29.2,54.2,30000,"R",2 +28.5,54.1,30000,"R",2 +28.3,54.2,28000,"R",2 +24.6,55.8,6000,"R",3 +24.2,54.4,6000,"R",3 +24.1,54.4,6000,"R",3 +24,54.9,340000,"A",1 
+24.5,55,340000,"A",1 +25.5,54.5,340000,"A",1 +26,54.7,320000,"A",1 +27,54.8,300000,"A",1 +28,54.9,280000,"A",1 +28.5,55,240000,"A",1 +29,55.1,210000,"A",1 +30,55.2,180000,"A",1 +30.3,55.3,175000,"A",1 +32,54.8,145000,"A",1 +33.2,54.9,140000,"A",1 +34.4,55.5,127100,"A",1 +35.5,55.4,100000,"A",1 +36,55.5,100000,"A",1 +37.6,55.8,100000,"A",1 +24,55.1,60000,"A",2 +24.5,55.2,60000,"A",2 +25.5,54.7,60000,"A",2 +26.6,55.7,40000,"A",2 +27.4,55.6,33000,"A",2 +28.7,55.5,33000,"A",2 +24,55.2,22000,"A",3 +24.5,55.3,22000,"A",3 +24.6,55.8,6000,"A",3 diff --git a/doc/assets/minard.png b/doc/assets/minard.png new file mode 100644 index 00000000..82901efb Binary files /dev/null and b/doc/assets/minard.png differ diff --git a/doc/gallery/examples/boxplot.qmd b/doc/gallery/examples/boxplot.qmd new file mode 100644 index 00000000..ec3cbb2c --- /dev/null +++ b/doc/gallery/examples/boxplot.qmd @@ -0,0 +1,84 @@ +--- +title: "Box plots" +description: "Showing groups of distributions of single numeric variables" +image: thumbnails/boxplot.svg +categories: [basic, boxplot, distribution] +order: 3 +--- + +Boxplots are a popular way to display a summary of a distribution of single continuous variables. +It is good to keep in mind boxplots hide the actual distribution of the data behind a summary, for example when the data is bi- or multi-modal. +For every group, a boxplot displays the following 6 things: + +1. The 25^th^ percentile, or Q1, as the start of the box. +2. The 50^th^ percentile, i.e. median or Q2, as a line across the box. +3. The 75^th^ percentile, or Q3, as the end of the box. Together with Q1 we can compute the interquartile range: IQR = Q3 - Q1. +4. The minimum data value or Q1 - 1.5 * IQR, whichever is larger. This is displayed as the lower whisker. +5. The maximum data value or Q3 + 1.5 * IQR, whichever is smaller. This is displayed as the upper whisker. +6. Outliers outside the whiskers, if present. These are drawn as individual points. 
+ +## Code + +```{ggsql} +VISUALISE species AS x, bill_len AS y FROM ggsql:penguins + DRAW boxplot +``` + +## Explanation + +* The `VISUALISE ... FROM ggsql:penguins` loads the built-in penguins dataset. +* `species AS x` sets a categorical variable to separate different groups. +* `bill_len AS y` sets the numeric variable to summarise. +* `DRAW boxplot` gives instructions to draw the boxplot layer. + +## Variations + +### Dodging + +You can refine groups beyond the axis categorical variable, and the boxplots will be displayed in a dodged way. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, island AS fill FROM ggsql:penguins + DRAW boxplot +``` + +However, dodging might be unproductive or counterintuitive in some cases. +For example if we double-encode groups, like `species` as both `x` *and* `fill` in the plot below, dodging looks bad. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, species AS fill FROM ggsql:penguins + DRAW boxplot +``` + +We can disable the dodging by setting `position => 'identity'`. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, species AS fill FROM ggsql:penguins + DRAW boxplot SETTING position => 'identity' +``` + +### Horizontal + +To draw the boxplots horizontally, simply swap the `x` and `y` mapping. +The orientation is detected automatically based on which variable is continuous and which is discrete. + +```{ggsql} +VISUALISE bill_len AS x, species AS y, island AS fill FROM ggsql:penguins + DRAW boxplot +``` + +### With individual datapoints + +Because a boxplot is a summary, it may be a good idea to supplement them with individual datapoints so that you can't be accused of 'hiding' the distribution. +The datapoints can be jittered by setting `position => 'jitter'`. +When you do this, make sure to turn `outliers => false` to not draw the outlier points twice across the two layers. 
+ + + +```{ggsql} +VISUALISE species AS x, bill_len AS y FROM ggsql:penguins + DRAW point SETTING position => 'jitter' + DRAW boxplot SETTING outliers => false +``` + + diff --git a/doc/gallery/examples/density.qmd b/doc/gallery/examples/density.qmd new file mode 100644 index 00000000..d251c2ad --- /dev/null +++ b/doc/gallery/examples/density.qmd @@ -0,0 +1,94 @@ +--- +title: "Density plots" +description: "Showing smooth distributions of single numeric variables" +image: thumbnails/density-plot.svg +categories: [basic, density, distribution] +order: 3 +--- + +Like histograms, density plots show the distribution of a numeric variable. +Instead of binning, density plots use [kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation) to estimate a smooth, continuous probability density. +A kernel (like a Gaussian) is placed on each point and summed. +The level of smoothing is controlled via the bandwidth which affects the width of the kernel. + +## Code + +The x-axis gives the value of the numerical variable, whereas the y-axis gives the estimated probability density. + +```{ggsql} +VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins + DRAW density +``` + +## Explanation + +* The `VISUALISE ... FROM ggsql:penguins` loads the built-in penguins dataset. +* `bill_len AS x` sets the numeric variable to use for density estimation. +* `species AS colour` sets implicit groups indicated by colour. +* `DRAW density` gives instructions to draw the density layer. + +## Variations + +### Group contributions + +Using the density gives all groups equal area that integrates to 1. +This masks differences between the sizes of groups. +Instead of using density, one can use the `intensity` that also encompasses differences in group size. + +```{ggsql} +VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins + DRAW density REMAPPING intensity AS y +``` + +### Stacking + +Instead of having independent groups, the density can also be stacked. 
+Note that stacking alone does not account for relative contributions per group. +For that reason, you may want to show the intensity instead. + +```{ggsql} +VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins + DRAW density + REMAPPING intensity AS y + SETTING position => 'stack' +``` + +### Annotation + +You can use the [rule](../../syntax/layer/type/rule.qmd) layer to display precomputed summaries, like the mean. + + + +```{ggsql} +WITH mean_data AS ( + SELECT + AVG(bill_len) AS bill_len, + species + FROM ggsql:penguins + GROUP BY species +) +VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins + DRAW density SETTING opacity => 0.3 + DRAW rule MAPPING FROM mean_data +``` + +### Faceting + +Another way of comparing groups is by using facets to separate the groups into different panels. + +```{ggsql} +VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins + DRAW density + FACET species SETTING ncol => 1 +``` + +### Relation to violin plots + +Conceptually, violin plots also display densities. +The similarity becomes clearer if you make a ridgeline plot by displaying the violin density on a single side. +The plot below is essentially showing the same thing as the plot above, but gathered in a single panel. + +```{ggsql} +VISUALISE bill_len AS x, species AS y, species AS colour FROM ggsql:penguins + DRAW violin SETTING side => 'top', width => 2 +``` diff --git a/doc/gallery/examples/heatmap.qmd b/doc/gallery/examples/heatmap.qmd new file mode 100644 index 00000000..5c4b60a1 --- /dev/null +++ b/doc/gallery/examples/heatmap.qmd @@ -0,0 +1,56 @@ +--- +title: "Heatmap" +description: "Arranging tiles on a grid" +image: thumbnails/heatmap.svg +categories: [basic, heatmap] +order: 3 +--- + +A heatmap visualises data values as colors in a grid layout. +It makes it easy to see patterns and relationships through color intensity. +It works best with discrete or ordinal arrangements. 
+ +## Code + +```{ggsql} +VISUALISE Day AS x, Month AS y, Temp AS fill FROM ggsql:airquality + DRAW rect +``` + +## Explanation + +* The `VISUALISE ... FROM ggsql:airquality` loads the built-in air quality dataset. +* `Day AS x, Month AS y` defines a 2D grid 'map'. The default width and height of each cell is 1. Because these variables are contiguous whole numbers, this creates a grid. +* `Temp AS fill` declares the 'heat' variable to display as colour intensity. +* `DRAW rect` gives instructions to draw a rectangle layer. + +## Variations + +As a stylistic choice, you can set the cells to be opaque without borders. + +```{ggsql} +VISUALISE Month AS y, Day AS x, Temp AS fill FROM ggsql:airquality + DRAW rect + SETTING stroke => null, opacity => 1 +``` + +You can change the color by adapting the scale. + +```{ggsql} +VISUALISE Month AS y, Day AS x, Temp AS fill FROM ggsql:airquality + DRAW rect + SCALE fill TO magma + SETTING reverse => true +``` + +If you have centered data, you may want to use a divergent colour scale. It is important to set the two extremes in `FROM` symmetrically around the midpoint. + +```{ggsql} +SELECT *, + Temp * 1.0 - AVG(Temp) OVER (PARTITION BY Month) AS centered +FROM ggsql:airquality + +VISUALISE Month AS y, Day AS x, centered AS fill + DRAW rect + SCALE fill FROM [-20, 20] TO vik +``` \ No newline at end of file diff --git a/doc/gallery/examples/minard.qmd b/doc/gallery/examples/minard.qmd new file mode 100644 index 00000000..6f8252b5 --- /dev/null +++ b/doc/gallery/examples/minard.qmd @@ -0,0 +1,187 @@ +--- +title: "Napoleon's march to Moscow" +description: "Re-creating the famous visualisation from Minard." 
+image: thumbnails/minard.svg +categories: [line, advanced] +order: 2 +references: + - id: friendly2002 + type: article-journal + author: + - family: Friendly + given: Michael + title: "Visions and Re-visions of Charles Joseph Minard" + container-title: Journal of Educational and Behavioral Statistics + volume: 27 + issue: 1 + page: 31-51 + issued: + year: 2002 + doi: 10.3102/10769986027001031 +--- + +In 1812 the French emperor Napoleon waged a military campaign invading Russia. +The campaign had early tactical success and Napoleon briefly occupied Moscow. +However, the campaign was a strategic failure because the retreat from Russia back to France was a catastrophe. +Charles Joseph Minard is best known for visualising numerical data about this campaign showing the advance and retreat. + +![The original infographic drawn by Charles Minard](../../assets/minard.png) + +In this example, we'll recreate the top part of the infographic. +The particular incarnation of the data that we're using here is adapted from the [HistData](https://cran.r-project.org/package=HistData) R package [@friendly2002]. + +## Getting started + +Before building a graphic it is always good to be aware of the columns and data structures that are present in your data. + +```{ggsql} +SELECT * FROM 'minard_troops.csv' LIMIT 5 +``` + +Our first goal is to commit *something* to paper. We'll iron out mistakes and polish the graphic later. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW line +``` + +To explain what we have done here: + +* `VISUALISE ... 'minard_troops.csv'` queries a local CSV file for Napoleon's troops. +* `long AS x` sets the `long` (longitude) column as the `x` aesthetic. +* `lat AS y` sets the `lat` (latitude) column as the `y` aesthetic. +* `DRAW line` instructs the plot to use the `line` layer. + +No celebrated military strategist would plan his troop movements towards Moscow in this fashion though. 
+The chart only shows movement in the west-east direction, meaning that we are not capturing the retreat properly. + +## Correcting mistakes + +The first 'mistake' we made is choosing the `line` layer. +Line layers automatically sort along the axis, so we're mixing coordinates from the advance and the retreat. +To rectify this, we should use the `path` layer instead. +Path layers connect datapoints in the order they appear in, so we're no longer sorting along west-east. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path +``` + +The second mistake is that Napoleon's retreat was not a simple linear path. +For example: a detachment of soldiers arrived in Polotsk to guard the northern flank. +This detachment later joined up with the remainder of the army during the retreat. +What that means for us is that we have to account for additional grouping. +This grouping allows us to resolve separate paths. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + PARTITION BY direction, group +``` + +## Enriching + +One of the appealing aspects of Minard's visualisation is that it is rich. +Not only does it display a map and the route of the army; it also separates the advance from the retreat in different colours, and displays the troop numbers as line thickness. +We can also separate the advance from the retreat by mapping the `direction` variable to the stroke colour. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke + PARTITION BY direction, group +``` + +Similarly, we can include the troop numbers by mapping the `survivors` variable to the line width. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke, survivors AS linewidth + PARTITION BY direction, group +``` + +## Detailing + +Now that we have all the data included in the ways we want, we can start detailing the graphic to our tastes. 
+The first thing we might do is to pick some better colours. +Because we have two levels for the `direction` variable —**A**dvance and **R**etreat— we can create a new colour scale for the `stroke` aesthetic. +We'll choose the colours to more closely resemble the original graphic by Minard. +We set the palette using the `TO` keyword, and format the labels using `RENAMING`. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke, survivors AS linewidth + PARTITION BY direction, group + SCALE stroke TO ['burlywood', 'black'] + RENAMING 'A' => 'Advance', 'R' => 'Retreat' +``` + +Now for a slightly more complicated scale, we're going to set one for the `linewidth` variable that represents the number of troops. +If you want to build in some extra intuition for the scale, you can let 0 troops coincide with 0 linewidth. +We define the output range using `TO [0, 20]` because for a continuous variable it expects the output limits. +Slightly more elaborate is the input domain, where we use `FROM [0, null]` to state that the scale should start at 0 and go up to the largest value in the data. +Because both the input and output ranges start at 0, we get a well-proportioned line. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke, survivors AS linewidth + PARTITION BY direction, group + SCALE stroke TO ['burlywood', 'black'] + RENAMING 'A' => 'Advance', 'R' => 'Retreat' + SCALE linewidth FROM [0, null] TO [0, 20] +``` + +## Polishing + +While this map is nice, it is a little bit lacking in context. +For sure the longitude and latitude coordinates are meaningful to cartographers among us. +However, for the rest of us we may like some city names to contextualise the march a bit. +There is a separate dataset wherein we've saved the city coordinates and their names. +We can use this by adding a second `DRAW` layer. 
+Note that `long AS x, lat AS y` is applied globally, so it also applies to our city layer. +In our new layer, we need to set the additional mapping `city AS label` and the new dataset using `FROM`. +We can also make text a little bit smaller by setting the font size. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke, survivors AS linewidth + PARTITION BY direction, group + DRAW text + MAPPING city AS label FROM 'minard_cities.csv' + SETTING fontsize => 6 + SCALE stroke TO ['burlywood', 'black'] + RENAMING 'A' => 'Advance', 'R' => 'Retreat' + SCALE linewidth FROM [0, null] TO [0, 30] +``` + +An additional obvious way to polish your graphic is to add nicer titles for all your variables. +We can use the `LABEL` statement to add custom labels for our plot. +In the title, we escape the single quote mark by using `\'` so that we know it is not the end of the string yet. +Moreover, we can use `null` to note that a title should be removed. +In that way we can hide the `long` and `lat` labels from the position mapping. + +```{ggsql} +VISUALISE long AS x, lat AS y FROM 'minard_troops.csv' + DRAW path + MAPPING direction AS stroke, survivors AS linewidth + PARTITION BY direction, group + DRAW text + MAPPING city AS label FROM 'minard_cities.csv' + SETTING fontsize => 6 + SCALE stroke TO ['burlywood', 'black'] + RENAMING 'A' => 'Advance', 'R' => 'Retreat' + SCALE linewidth FROM [0, null] TO [0, 20] + LABEL + title => 'Napoleon\'s Russian Campaign', + subtitle => 'Inspired by the graphic of C.J. Minard', + linewidth => 'Troops', + stroke => 'Direction', + x => null, + y => null +``` + +And there we have it: a reproduction of Minard's infographic on Napoleon's Russian campaign. 
\ No newline at end of file diff --git a/doc/gallery/examples/minard_cities.csv b/doc/gallery/examples/minard_cities.csv new file mode 100644 index 00000000..31597c6c --- /dev/null +++ b/doc/gallery/examples/minard_cities.csv @@ -0,0 +1,21 @@ +"long","lat","city" +24,55,"Kowno" +25.3,54.7,"Wilna" +26.4,54.4,"Smorgoni" +26.8,54.3,"Moiodexno" +27.7,55.2,"Gloubokoe" +27.6,53.9,"Minsk" +28.5,54.3,"Studienska" +28.7,55.5,"Polotzk" +29.2,54.4,"Bobr" +30.2,55.3,"Witebsk" +30.4,54.5,"Orscha" +30.4,53.9,"Mohilow" +32,54.8,"Smolensk" +33.2,54.9,"Dorogobouge" +34.3,55.2,"Wixma" +34.4,55.5,"Chjat" +36,55.5,"Mojaisk" +37.6,55.8,"Moscou" +36.6,55.3,"Tarantino" +36.5,55,"Malo-Jarosewii" \ No newline at end of file diff --git a/doc/gallery/examples/minard_troops.csv b/doc/gallery/examples/minard_troops.csv new file mode 100644 index 00000000..c3b527e1 --- /dev/null +++ b/doc/gallery/examples/minard_troops.csv @@ -0,0 +1,52 @@ +"long","lat","survivors","direction","group" +37.7,55.7,100000,"R",1 +37.5,55.7,98000,"R",1 +37,55,97000,"R",1 +36.8,55,96000,"R",1 +35.4,55.3,87000,"R",1 +34.3,55.2,55000,"R",1 +33.3,54.8,37000,"R",1 +32,54.6,24000,"R",1 +30.4,54.4,20000,"R",1 +29.2,54.3,20000,"R",1 +28.5,54.2,20000,"R",1 +28.3,54.3,20000,"R",1 +27.5,54.5,20000,"R",1 +26.8,54.3,12000,"R",1 +26.4,54.4,14000,"R",1 +25,54.4,8000,"R",1 +24.4,54.4,4000,"R",1 +24.2,54.4,4000,"R",1 +24.1,54.4,4000,"R",1 +28.7,55.5,33000,"R",2 +29.2,54.2,30000,"R",2 +28.5,54.1,30000,"R",2 +28.3,54.2,28000,"R",2 +24.6,55.8,6000,"R",3 +24.2,54.4,6000,"R",3 +24.1,54.4,6000,"R",3 +24,54.9,340000,"A",1 +24.5,55,340000,"A",1 +25.5,54.5,340000,"A",1 +26,54.7,320000,"A",1 +27,54.8,300000,"A",1 +28,54.9,280000,"A",1 +28.5,55,240000,"A",1 +29,55.1,210000,"A",1 +30,55.2,180000,"A",1 +30.3,55.3,175000,"A",1 +32,54.8,145000,"A",1 +33.2,54.9,140000,"A",1 +34.4,55.5,127100,"A",1 +35.5,55.4,100000,"A",1 +36,55.5,100000,"A",1 +37.6,55.8,100000,"A",1 +24,55.1,60000,"A",2 +24.5,55.2,60000,"A",2 +25.5,54.7,60000,"A",2 
+26.6,55.7,40000,"A",2 +27.4,55.6,33000,"A",2 +28.7,55.5,33000,"A",2 +24,55.2,22000,"A",3 +24.5,55.3,22000,"A",3 +24.6,55.8,6000,"A",3 diff --git a/doc/gallery/examples/pie-chart.qmd b/doc/gallery/examples/pie-chart.qmd new file mode 100644 index 00000000..52c00d71 --- /dev/null +++ b/doc/gallery/examples/pie-chart.qmd @@ -0,0 +1,100 @@ +--- +title: "Pie chart" +description: "Visualisation of proportions" +image: thumbnails/pie-chart.svg +categories: [basic, polar] +order: 2 +--- + +Pie charts are a popular but less effective way to display proportions of groups. +No special syntax exists to create pie charts. +In the Grammar of Graphics, pie charts are stacked bar charts displayed in a polar coordinate system. + +## Code + +It makes sense to construct pie charts first in a Cartesian coordinate system, which makes it easier to diagnose any potential problems. +We start out with a stacked bar chart. +The `bar` layer does not need `x` or `y` because if absent, it will count the number of observations per group. +Stacking is the default position adjustment for the `bar` layer. + +```{ggsql} +VISUALISE island AS fill FROM ggsql:penguins + DRAW bar +``` + +Now we need to add the polar coordinate system to make a pie chart. + +```{ggsql} +VISUALISE island AS fill FROM ggsql:penguins + DRAW bar + PROJECT TO polar +``` + +## Explanation + +* The `VISUALISE ... FROM ggsql:penguins` loads the built-in penguins dataset. +* `island AS fill` gives the instruction to colour by `island`. Groups are implicitly defined by this too. +* `DRAW bar` instructs to use a bar layer. Because we have no `x` or `y`, but we do have fill-based groups, this will be counted. +* `PROJECT TO polar` sets the polar coordinate system that uses the radius and angle instead of x/y-coordinates. + * We need this when there are no position aesthetics, or we want to use additional `SETTING`s. + * If we use `angle` and `radius` aesthetics, polar coordinates are implied automatically. 
+ * If we use `x` and `y` aesthetics, we need to use `PROJECT y, x TO polar` instead. This is easiest when swapping between Cartesian and polar coordinates. + +## Variations + +### Precomputed summary + +If we've pre-computed proportions or counts, we can build a pie chart by mapping to the `angle` and `radius` aesthetics. +Going for a classic pie chart, you may have to define a dummy `radius` aesthetic, because the default behaviour would otherwise be to count the number of every `n`. + +```{ggsql} +WITH count_data AS ( + SELECT + COUNT(*) AS n, + species + FROM ggsql:penguins + GROUP BY species +) + +VISUALISE n AS angle, 'dummy' AS radius, species AS fill FROM count_data + DRAW bar +``` + +Note that because we're using `angle` and `radius`, we no longer need to specify `PROJECT TO polar` because it is inferred from the aesthetic names. + +### Don't use `radius` for pie charts + +You could in theory also use a non-dummy variable for `radius`, but you may need to use `SETTING total` to fill in each ring. +We must warn sternly that this is a bad idea though. +We tend to judge pie charts by area, and the more central rings are allocated less area than the outer rings. +This could mislead one into thinking that the 'Adelie' species makes up the vast majority of the observations by eyeballing the chart below. +In reality, the 'Adelie' species makes up ~44% of the dataset: a narrow plurality. + +```{ggsql} +VISUALISE island AS radius, species AS fill FROM ggsql:penguins + DRAW bar SETTING total => 100 +``` + +### Donut charts + +Donut charts are a variation of the pie chart that shows a ring instead of a full circle. +You can create a donut chart by setting the `inner` of the polar coordinate system. + +```{ggsql} +VISUALISE island AS fill FROM ggsql:penguins + DRAW bar + PROJECT TO polar SETTING inner => 0.5 +``` + +### Semicircular pie charts + +These plots are a piece of cake. +You can set their extent by tweaking the `start` and `end` settings in degrees. 
+If you want the `angle` aesthetic to take up the full extent, you may turn off the `expand` setting in the angle scale. + +```{ggsql} +VISUALISE island AS fill FROM ggsql:penguins + DRAW bar + PROJECT TO polar SETTING start => -90, end => 90 + SCALE angle SETTING expand => 0 +``` diff --git a/doc/gallery/examples/scatterplot.qmd b/doc/gallery/examples/scatterplot.qmd index e8cc8460..674b97ec 100644 --- a/doc/gallery/examples/scatterplot.qmd +++ b/doc/gallery/examples/scatterplot.qmd @@ -6,7 +6,8 @@ categories: [basic, point] order: 1 --- -A scatter plot displays the relationship between two numeric variables by mapping them to x and y positions. This is one of the most fundamental visualization types for exploring correlations and patterns. +A scatter plot displays the relationship between two numeric variables by mapping them to x and y positions. +This is one of the most fundamental visualization types for exploring correlations and patterns. ## Code @@ -30,6 +31,8 @@ LABEL ### With color by species +We're adding `species AS color` to the mapping to colour the points by species. + ```{ggsql} VISUALISE bill_len AS x, bill_dep AS y, species AS color FROM ggsql:penguins DRAW point @@ -38,3 +41,52 @@ LABEL x => 'Bill Length (mm)', y => 'Bill Depth (mm)' ``` + +The color palette can be changed by detailing the `SCALE color` clause. + +```{ggsql} +VISUALISE bill_len AS x, bill_dep AS y, species AS color FROM ggsql:penguins +DRAW point +SCALE color TO ['DeepSkyBlue', 'Fuchsia', 'Lime'] +LABEL + title => 'Penguin Bill Dimensions by Species', + x => 'Bill Length (mm)', + y => 'Bill Depth (mm)' +``` + +### Encoding even more data using shape + +We can also encode a second layer of information by displaying `island AS shape`. 
+ +```{ggsql} +VISUALISE + bill_len AS x, + bill_dep AS y, + species AS color, + island AS shape +FROM ggsql:penguins +DRAW point +LABEL + title => 'Penguin Bill Dimensions by Species', + x => 'Bill Length (mm)', + y => 'Bill Depth (mm)' +``` + +### Highlighting groups + +Using the layer's `FILTER` clause we can split the data across layers. +Using layer level `MAPPING` ensures we apply `species AS color` only to one layer and not the other. +The `SETTING` is used here to directly set a property without mapping data. + +```{ggsql} +VISUALISE + bill_len AS x, + bill_dep AS y +FROM ggsql:penguins +DRAW point + MAPPING species AS color + FILTER island == 'Biscoe' +DRAW point + SETTING color => 'grey' + FILTER island != 'Biscoe' +``` \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/boxplot.svg b/doc/gallery/examples/thumbnails/boxplot.svg new file mode 100644 index 00000000..804b952b --- /dev/null +++ b/doc/gallery/examples/thumbnails/boxplot.svg @@ -0,0 +1 @@ +AdelieChinstrapGentoospecies405060bill_lenAdelieChinstrapGentoospecies \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/density-plot.svg b/doc/gallery/examples/thumbnails/density-plot.svg new file mode 100644 index 00000000..5fbb93cc --- /dev/null +++ b/doc/gallery/examples/thumbnails/density-plot.svg @@ -0,0 +1 @@ +30405060bill_len0.000.050.10densityAdelieChinstrapGentoospecies \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/heatmap.svg b/doc/gallery/examples/thumbnails/heatmap.svg new file mode 100644 index 00000000..60fe0606 --- /dev/null +++ b/doc/gallery/examples/thumbnails/heatmap.svg @@ -0,0 +1 @@ +0102030Day56789Month60708090Temp \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/minard.svg b/doc/gallery/examples/thumbnails/minard.svg new file mode 100644 index 00000000..8f93fdf7 --- /dev/null +++ b/doc/gallery/examples/thumbnails/minard.svg @@ -0,0 +1 @@ 
+25303554.054.555.055.5KownoWilnaSmorgoniMoiodexnoGloubokoeMinskStudienskaPolotzkBobrWitebskOrschaMohilowSmolenskDorogobougeWixmaChjatMojaiskMoscouTarantinoMalo-Jarosewii0100,000200,000300,000TroopsAdvanceRetreatDirectionNapoleon's Russian CampaignInspired by the graphic of C.J. Minard \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/pie-chart.svg b/doc/gallery/examples/thumbnails/pie-chart.svg new file mode 100644 index 00000000..47e2cd7d --- /dev/null +++ b/doc/gallery/examples/thumbnails/pie-chart.svg @@ -0,0 +1 @@ +BiscoeDreamTorgersenisland \ No newline at end of file diff --git a/doc/gallery/examples/thumbnails/violin-plot.svg b/doc/gallery/examples/thumbnails/violin-plot.svg new file mode 100644 index 00000000..3b38b813 --- /dev/null +++ b/doc/gallery/examples/thumbnails/violin-plot.svg @@ -0,0 +1 @@ +AdelieChinstrapGentoospecies30405060bill_lenAdelieChinstrapGentoospecies \ No newline at end of file diff --git a/doc/gallery/examples/violin.qmd b/doc/gallery/examples/violin.qmd new file mode 100644 index 00000000..c2a0c19f --- /dev/null +++ b/doc/gallery/examples/violin.qmd @@ -0,0 +1,100 @@ +--- +title: "Violin plots" +description: "Showing groups of distributions of single numeric variables" +image: thumbnails/violin-plot.svg +categories: [basic, violin, density, distribution] +order: 3 +--- + +Violin plots display the distribution of a single continuous variable, much like [density plots](density.qmd). +They are displayed differently, with mirrored densities and separated groups. +The densities are mirrored, and each group has its own center. + +## Code + +```{ggsql} +VISUALISE species AS x, bill_len AS y FROM ggsql:penguins + DRAW violin +``` + +## Explanation + +* The `VISUALISE ... FROM ggsql:penguins` loads the built-in penguins dataset. +* `species AS x` sets a categorical variable to separate different groups. +* `bill_len AS y` sets the numeric variable to use for density estimation. 
+* `DRAW violin` gives instructions to draw the violin layer. + +## Variations + +### Dodging + +You can refine groups beyond the axis categorical variable, and the violins will be displayed in a dodged way. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, island AS colour FROM ggsql:penguins + DRAW violin +``` + +However, dodging might be unproductive or counterintuitive in some cases. +For example if we double-encode groups, like `species` as both `x` *and* `colour` in the plot below, dodging looks bad. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, species AS colour FROM ggsql:penguins + DRAW violin +``` + +We can disable the dodging by setting `position => 'identity'`. + +```{ggsql} +VISUALISE species AS x, bill_len AS y, species AS colour FROM ggsql:penguins + DRAW violin SETTING position => 'identity' +``` + +### Half-violins + +A ridgeline plot is a plot where violins are placed horizontally without mirroring. +To place violins horizontally, we just need to swap the `x` and `y` variables. + +```{ggsql} +VISUALISE bill_len AS x, species AS y FROM ggsql:penguins + DRAW violin +``` + +To get ridges, we can set `side => 'top'`. + +```{ggsql} +VISUALISE bill_len AS x, species AS y FROM ggsql:penguins + DRAW violin SETTING side => 'top' +``` + +To display variables split across two different groups, you can combine two halves to get an asymmetrical violin. +Here we're using the `FILTER` clause to draw separate layers for the 'male' and 'female' groups. + +```{ggsql} +VISUALISE bill_len AS x, species AS y, sex AS colour FROM ggsql:penguins + DRAW violin + SETTING side => 'top' + FILTER sex == 'female' + DRAW violin + SETTING side => 'bottom' + FILTER sex == 'male' +``` + +### With individual datapoints + +It might be tempting to combine the display of individual datapoints with a violin to accentuate the distribution. +The datapoints can be jittered by setting `position => 'jitter'`. 
+ +```{ggsql} +VISUALISE species AS x, bill_len AS y FROM ggsql:penguins + DRAW point SETTING position => 'jitter' + DRAW violin SETTING opacity => 0.3 +``` + +This can be made even more clear by also using the `distribution => 'density'` setting. + +```{ggsql} +VISUALISE species AS x, bill_len AS y FROM ggsql:penguins + DRAW point SETTING position => 'jitter', distribution => 'density' + DRAW violin SETTING opacity => 0.3 +```