Python ggplot alternatives

Using Python to make data visuals as an R user

The seaborn package ships with sample data sets similar to the ones that come pre-installed with R in RStudio. This link lists more Python packages with sample data sets: https://towardsdatascience.com/datasets-in-python-425475a20eb1

import seaborn as sns

## List the names of the sample data sets available through seaborn

print(sns.get_dataset_names())
['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']

Let’s use the diamonds data set

### First, let's import all the different libraries we will be using

# people say this is comparable to the tidyverse
import pandas as pd

# similar to ggplot
import plotnine as pn

# plots with js elements 
import plotly.express as px

# syntax similar to matlab
import matplotlib.pyplot as matplt

# plots that allow the viewer to export (save) them from the rendered markdown
import altair as altr

# Load the diamonds sample data set from seaborn into a pandas DataFrame
diamonds = sns.load_dataset('diamonds')

# Print a summary of the columns, non-null counts, dtypes and memory usage
diamonds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB

Bar graphs

Plotnine package:

This package is just like the ggplot2 package. One thing to note is that instead of writing data = cars, we pass the data frame directly, as in ggplot(cars). In this case, data = cars will give us an error.

## Plotnine (the ggplot equivalent): number of diamonds per cut,
## with the axes flipped so the bars run horizontally.
## NOTE(review): size=20 presumably only affects the bar outline width and
## may have no visible effect without a color aesthetic -- confirm.
(
    pn.ggplot(diamonds, pn.aes(x='cut'))
    + pn.geom_bar(size=20)
    + pn.coord_flip()
    + pn.ggtitle('Number of Diamonds by Cut')
)

<Figure Size: (640 x 480)>

Plotly is pretty cool because it has interactive elements powered by JavaScript

## Plotly: diamonds per cut, with the cut categories on the y axis

px.histogram(data_frame=diamonds, y="cut", title="Number of Diamonds by Cut")

Altair also has some interactive elements like giving the viewer the option to save

My only issue is that I have not figured out how to make the chart bigger within a markdown document. Additionally, by default Altair can’t deal with data having more than 5000 rows

## Altair: number of diamonds per cut.
## A random sample of 5000 rows is charted instead of the full data set.
(
    altr.Chart(diamonds.sample(n=5000), title="Number of Diamonds by Cut")
    .mark_bar()
    .encode(x='count()', y=altr.Y('cut'))
)

Histogram

Pandas

# Histogram of carat for a random sample of 1000 diamonds, using 20 bins
diamonds.sample(n=1000)['carat'].plot.hist(bins=20)
<Axes: ylabel='Frequency'>

Matplotlib

# 'carat' is given as a column name; the values come from the sampled
# DataFrame passed through the `data` argument.
matplt.hist(x='carat', bins=20, data=diamonds.sample(n=1000))
(array([208., 157., 113., 118.,  68., 110.,  56.,  48.,   7.,  48.,  19.,
          1.,  23.,  16.,   3.,   1.,   1.,   1.,   0.,   2.]),
 array([0.23 , 0.369, 0.508, 0.647, 0.786, 0.925, 1.064, 1.203, 1.342,
        1.481, 1.62 , 1.759, 1.898, 2.037, 2.176, 2.315, 2.454, 2.593,
        2.732, 2.871, 3.01 ]),
 <BarContainer object of 20 artists>)

Plotnine

# Carat histogram with 0.1-wide bins for a random sample of 1000 diamonds.
# The x axis is limited to 0-3; plotnine warns about rows it removes.
(
    pn.ggplot(diamonds.sample(n=1000), pn.aes(x='carat'))
    + pn.geom_histogram(binwidth=0.1)
    + pn.xlim(0, 3)
)
C:\Users\hanna\AppData\Local\Programs\Python\Python311\Lib\site-packages\plotnine\layer.py:284: PlotnineWarning: stat_bin : Removed 1 rows containing non-finite values.
C:\Users\hanna\AppData\Local\Programs\Python\Python311\Lib\site-packages\plotnine\layer.py:364: PlotnineWarning: geom_histogram : Removed 2 rows containing missing values.

<Figure Size: (640 x 480)>

Plotly

# Interactive carat histogram for a random sample of 1000 diamonds
px.histogram(data_frame=diamonds.sample(n=1000), x='carat')

Altair

# Carat histogram for a random sample of 1000 diamonds:
# carat is binned in steps of 0.1 on x, the row count goes on y.
(
    altr.Chart(diamonds.sample(n=1000))
    .mark_bar()
    .encode(
        x=altr.X('carat', bin=altr.Bin(step=0.1)),
        y='count()',
    )
)

Scatter Plot

Pandas

# Scatter plot of price against carat for a random sample of 1000 diamonds;
# .set() applies the title and axis labels to the returned Axes.
diamonds.sample(n=1000).plot.scatter(x='carat', y='price').set(
    title='Carat vs Diamond Price',
    xlabel='Diamond Carat',
    ylabel='Diamond Price',
)
[Text(0.5, 1.0, 'Carat vs Diamond Price'),
 Text(0.5, 0, 'Diamond Carat'),
 Text(0, 0.5, 'Diamond Price')]

Plotnine

# Scatter plot of price against carat for a random sample of 1000 diamonds,
# with a title and custom axis labels.
(
    pn.ggplot(diamonds.sample(n=1000), pn.aes(x='carat', y='price'))
    + pn.geom_point()
    + pn.ggtitle('Carat vs Diamond Price')
    + pn.xlab('Diamond Carat')
    + pn.ylab('Diamond Price')
)

<Figure Size: (640 x 480)>

Plotly

# Interactive scatter plot of price against carat for a random sample of
# 1000 diamonds; `labels` maps column names to the displayed axis labels.
px.scatter(
    data_frame=diamonds.sample(n=1000),
    x='carat',
    y='price',
    title='Diamond Carat vs Price',
    labels={'carat': 'Diamond Carat', 'price': 'Diamond Price'},
)

Altair

# Scatter plot (circle marks) of price against carat for a random sample of
# 1000 diamonds, with titled axes and a chart title.
(
    altr.Chart(diamonds.sample(n=1000))
    .mark_circle()
    .encode(
        x=altr.X('carat', title='Diamond Carat'),
        y=altr.Y('price', title='Diamond Price'),
    )
    .properties(title='Diamond Carat vs Diamond Price')
)