# Compare two bin widths for the same numeric variable side by side.
plt.figure(figsize=[10, 5])  # larger figure size for subplots

# example of somewhat too-large bin size
plt.subplot(1, 2, 1)  # 1 row, 2 cols, subplot 1
bin_edges = np.arange(0, df['num_var'].max() + 4, 4)
plt.hist(data=df, x='num_var', bins=bin_edges)

# example of somewhat too-small bin size
plt.subplot(1, 2, 2)  # 1 row, 2 cols, subplot 2
bin_edges = np.arange(0, df['num_var'].max() + 1/4, 1/4)
plt.hist(data=df, x='num_var', bins=bin_edges)

# Axis limits and tick locations act on the current (second) subplot.
plt.xlim(0, 35)
plt.xticks(np.arange(2, 12 + 1, 1))
Base Color and Order
# Use only 1 color because other colors give no extra meaning
base_color = sb.color_palette()[0]
sb.countplot(data=df, x='cat_var', color=base_color)

# Sort the data by highest / lowest value
# (value_counts() returns counts in descending order, so its index is the
# bar order from most to least frequent)
cat_order = df['cat_var'].value_counts().index
sb.countplot(data=df, x='cat_var', color=base_color, order=cat_order)

# Order with ordinal categories
# this method requires pandas v0.21 or later
level_order = ['Alpha', 'Beta', 'Gamma', 'Delta']
ordered_cat = pd.api.types.CategoricalDtype(ordered=True, categories=level_order)
df['cat_var'] = df['cat_var'].astype(ordered_cat)
Get proportions or relative frequencies
# get proportion taken by most common group for derivation
# of tick marks
n_points = df.shape[0]
max_count = df['cat_var'].value_counts().max()
max_prop = max_count / n_points

# generate tick mark locations and names
tick_props = np.arange(0, max_prop, 0.05)
tick_names = ['{:0.2f}'.format(v) for v in tick_props]

# create the plot
base_color = sb.color_palette()[0]
sb.countplot(data=df, x='cat_var', color=base_color)
# The bars are still raw counts; the ticks are placed at count positions
# (proportion * n_points) but labelled with the proportion itself.
plt.yticks(tick_props * n_points, tick_names)
plt.ylabel('proportion')
Use text annotations to label the frequencies on bars
# create the plot
base_color = sb.color_palette()[0]
sb.countplot(data=df, x='cat_var', color=base_color)

# add annotations
n_points = df.shape[0]
cat_counts = df['cat_var'].value_counts()
locs, labels = plt.xticks()  # get the current tick locations and labels

# loop through each pair of locations and labels
for loc, label in zip(locs, labels):
    # get the text property for the label to get the correct count
    count = cat_counts[label.get_text()]
    pct_string = '{:0.1f}%'.format(100 * count / n_points)

    # print the annotation just below the top of the bar
    # (the offset of 8 is in y-axis units, i.e. counts)
    plt.text(loc, count - 8, pct_string, ha='center', color='w')
# Transparency
# A low alpha lets overlapping points build up darker regions.
plt.scatter(data=df, x='disc_var1', y='disc_var2', alpha=1/5)
As an alternative or companion to transparency, we can also add jitter to move the position of each point slightly from its true value. This is not a direct option in matplotlib's scatter function, but is a built-in option with seaborn's regplot function (setting alpha is different as well):
# Scatter only (no fitted regression line), with random jitter applied on
# both axes; point transparency has to be passed through scatter_kws.
sb.regplot(
    data=df,
    x='disc_var1',
    y='disc_var2',
    fit_reg=False,
    x_jitter=0.2,
    y_jitter=0.2,
    scatter_kws={'alpha': 1/3},
)
Heatmap
The default "viridis" color palette is reversed by setting cmap = 'viridis_r'. Adding a cmin = 0.5 parameter to the hist2d call means that a cell will only get colored if it contains at least one point.
# 2-D histogram (heat map) of two discrete variables with per-cell counts.
bins_x = np.arange(0.5, 10.5 + 1, 1)
bins_y = np.arange(-0.5, 10.5 + 1, 1)

# Keep the return value: its first element is the matrix of cell counts
# that the annotation loop below needs.
h2d = plt.hist2d(data=df, x='disc_var1', y='disc_var2',
                 bins=[bins_x, bins_y], cmap='viridis_r', cmin=0.5)
plt.colorbar()
counts = h2d[0]

# loop through the cell counts and add text annotations for each
for i in range(counts.shape[0]):
    for j in range(counts.shape[1]):
        c = counts[i, j]
        # Cells below cmin come back as NaN; NaN fails this comparison,
        # so empty cells are skipped without a special case.
        if c > 0:
            # increase visibility on darkest cells
            text_color = 'white' if c >= 7 else 'black'
            plt.text(bins_x[i] + 0.5, bins_y[j] + 0.5, int(c),
                     ha='center', va='center', color=text_color)
Legend
# Legend at the bottom centre of the axes, entries laid out in three
# columns, on a fully opaque frame.
ax.legend(title='cat_var2', loc='lower center', ncol=3, framealpha=1)
2 Categorical Variables
# Clustered bar chart: bars are grouped by cat_var1 on the x-axis and
# split/coloured by the levels of cat_var2.
sb.countplot(
    x='cat_var1',
    hue='cat_var2',
    data=df,
)
Heatmap: Instead of providing the original dataframe, we need to summarize the counts into a matrix that will then be plotted.
annot = True makes it so annotations show up in each cell, but the default string formatting only goes to two digits of precision. Adding fmt = 'd' means that annotations will all be formatted as integers instead. You can use fmt = '.0f' if you have any cells with no counts, in order to account for NaNs.
# set bin edges, compute centers
bin_size = 0.25
xbin_edges = np.arange(0.5, df['num_var1'].max() + bin_size, bin_size)
# one center per bin: shift every edge by half a bin, drop the last one
xbin_centers = (xbin_edges + bin_size / 2)[:-1]

# compute statistics in each bin
data_xbins = pd.cut(df['num_var1'], xbin_edges, right=False, include_lowest=True)
y_means = df['num_var2'].groupby(data_xbins).mean()
y_sems = df['num_var2'].groupby(data_xbins).sem()

# plot the summarized data
plt.errorbar(x=xbin_centers, y=y_means, yerr=y_sems)
plt.xlabel('num_var1')
plt.ylabel('num_var2')
Adapted Bar Plot
We could plot a numeric variable against a categorical variable by adapting a bar chart so that its bar heights indicate the mean of the numeric variable. This is the purpose of seaborn's barplot function:
# Bar heights show the mean of num_var within each cat_var level.
# A single fixed color is used so hue carries no spurious meaning.
base_color = sb.color_palette()[0]
sb.barplot(data=df, x='cat_var', y='num_var', color=base_color)
Different hues are automatically assigned to each category level unless a fixed color is set in the "color" parameter, like in countplot and violinplot.
Encoding via shape
# Encode a categorical variable by marker shape: one scatter call per level.
cat_markers = [['A', 'o'], ['B', 's']]

for cat, marker in cat_markers:
    df_cat = df[df['cat_var1'] == cat]
    plt.scatter(data=df_cat, x='num_var1', y='num_var2', marker=marker)

# Legend labels follow the plotting order, so take them from the same list
# instead of repeating the level names by hand.
plt.legend([cat for cat, _ in cat_markers])
Encoding via size
# Encode a third numeric variable through marker size.
plt.scatter(data=df, x='num_var1', y='num_var2', s='num_var3')

# dummy series for adding legend
# (empty scatters draw nothing but give the legend one handle per
# reference size)
sizes = [20, 35, 50]
base_color = sb.color_palette()[0]
legend_obj = [plt.scatter([], [], s=s, color=base_color) for s in sizes]
plt.legend(legend_obj, sizes)
Encoding via color
If you have a qualitative variable, you can set different colors for different levels of a categorical variable through the "hue" parameter on seaborn's FacetGrid class.
For quantitative variables, we should not take the same approach, since FacetGrid expects any variable input for subsetting to be categorical. Instead, we can set color based on numeric value in the scatter function through the "c" parameter, much like how we set up marker sizes through "s". (Unlike with size, we don't have an easy way of setting color by numeric value through regplot due to how its "color" argument is set up.)
# Encode a third numeric variable through point color; the colorbar
# provides the value-to-color key.
plt.scatter(data=df, x='num_var1', y='num_var2', c='num_var3')
plt.colorbar()
Faceting for multivariate data
Where the faceted plots demonstrated were univariate before, you can actually use any plot type, allowing you to facet bivariate plots to create a multivariate visualization.
# One box plot panel per level of cat_var1.
# `height` is the current name of the per-facet size parameter; the older
# `size` keyword was renamed in seaborn 0.9 and is rejected by recent releases.
g = sb.FacetGrid(data=df, col='cat_var1', height=4)
g.map(sb.boxplot, 'cat_var2', 'num_var2')
FacetGrid also allows for faceting a variable not just by columns, but also by rows. We can set one categorical variable on each of the two facet axes for one additional method of depicting multivariate trends.
# Facet on two categorical variables at once: cat_var2 across columns,
# cat_var1 down rows, with a scatter plot in every cell.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9.
g = sb.FacetGrid(data=df, col='cat_var2', row='cat_var1', height=2.5,
                 margin_titles=True)
g.map(plt.scatter, 'num_var1', 'num_var2')
Plot Matrices
# Plot matrix: histograms on the diagonal, pairwise scatter plots elsewhere.
g = sb.PairGrid(data=df, vars=['num_var1', 'num_var2', 'num_var3'])
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
Correlation Matrices
# Correlation matrix as an annotated heat map on a diverging palette
# centered at zero.
# Restrict to numeric columns first: recent pandas versions no longer drop
# non-numeric columns silently in DataFrame.corr() and raise instead.
numeric_df = df.select_dtypes(include='number')
sb.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='vlag_r', center=0)