I provide Python code and examples for a wide variety of data visualizations, using the standard machine learning workflow (explore, describe, infer, model, evaluate, tune, etc.) as a roadmap and letting the code and visualizations do most of the talking. The custom stylesheet and plotting routines I reference throughout can be found in my data-tools repository.
### Establish project paths
root = '/content/drive/MyDrive/Colab_Notebooks/'
project_dir = 'experiments/'
output_dir = 'stylesheet_tests/'

project_path = root + project_dir
output_path = root + project_dir + output_dir
env_path = project_path + 'requirements.txt'
Visual elements and parameters I consider when establishing defaults
[Figure: attribute_histogram_plots.png]
Generic:
- text (x/y/title/tick labels/legend title/legend items): font face/size
- axes: which to display/linewidth/min-max tick sizes/min-max tick widths
- grid: which min-max vertical/horizontal lines to display/colour/linewidth
- object: edgecolour/edgewidth
- colours: discrete cycle/continuous map
- legend: box/placement
- figure: size
- subplots: vertical padding/horizontal padding
- other: layout compactness, display quality

Additional specifications for particular objects:
- scatter: marker size
- line: width/continuous error colour/continuous error linewidth
- bar: width/between-group spacing/errorbar colour/errorbar linewidth/errorbar cap width
- grouped bar: within-group spacing
- box: median colour/median width

Note: for each of the above, despite generic object defaults, one may additionally have to specify:
- edgecolour/edgewidth

Parameters to consider manually specifying for each (sub)plot:
- labels/title: on or off
- tick labels: rotation, which ticks
- axes: limits, which ticks
- grid: which lines
- object: alpha
- figure: size
- subplots: row/col position arrangement, (shared) labels/titles

Typical plotting function arguments (sketched below):
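A minimal sketch of what these per-plot specifications typically look like in practice; the data and specific values are placeholders:

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5, 3))      # figure: size
ax.plot([0, 1, 2], [0, 1, 4], alpha=.85)    # object: alpha
ax.set_title('Example')                     # labels/title: on
ax.set_xlabel('$x$'); ax.set_ylabel('$y$')
ax.set_xlim(0, 2)                           # axes: limits
ax.set_xticks([0, 1, 2])                    # axes: which ticks
ax.tick_params(axis='x', rotation=45)       # tick labels: rotation
ax.grid(axis='y')                           # grid: which lines
plt.show()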
References
- The matplotlibrc file - matplotlib.org

Note
- All lines in the matplotlibrc file start with a '#', so that removing all leading '#'s yields a valid style file.
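Given that note, a style file can in principle be bootstrapped from the base matplotlibrc by stripping one leading '#' per line; a rough sketch (the output filename is arbitrary):

import matplotlib

# Strip one leading '#' from every line of the base matplotlibrc;
# commented settings become active, '##' doc lines remain comments
with open(matplotlib.matplotlib_fname()) as src, open('custom.mplstyle', 'w') as dst:
    for line in src:
        dst.write(line[1:] if line.startswith('#') else line)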
### [SCRATCH] Get fonts
from pathlib import Path
from matplotlib import font_manager as fm
from fontTools.ttLib import TTFont
# !wget 'https://github.com/google/fonts/blob/main/ofl/allison/Allison-Regular.ttf'
font_file = Path(*fm.findSystemFonts('.'))
font_file_path = Path(font_file)
# fm.fontManager.addfont(font_file_path)  # [TODO] resolve this
### [SCRATCH]
import os
item = 'Abel-Regular.ttf'
overwrite = 1

source_path = os.path.join('/content/', item)
output_path = os.path.join(project_path, output_dir, item)

print(f"Copying: {item} from {source_path} to {output_path}")
if os.path.isfile(source_path) or os.path.isdir(source_path):
    # Check if the item already exists in the output path
    if os.path.exists(output_path):
        if overwrite:
            # Copy the item
            # !cp -rf "$source_path" "$output_path"
            print(f"Successfully copied {item} ->\n{output_path}\n")
        else:
            print(f"Skipped: {item} already exists in {output_path}\n")
    else:
        # Copy the item
        # !cp -rf "$source_path" "$output_path"
        print(f"Successfully copied {item} to {output_path}\n")
### [SCRATCH]
# !pip install pipreqs
# !pipreqs > requirements.txt --force
### View path to base matplotlibrc file
import matplotlib
matplotlib.matplotlib_fname()
### STYLESHEET TEMPLATE
# font.family : 'sans'
### Stylesheet. Contrast with https://matplotlib.org/stable/users/explain/customizing.html#the-matplotlibrc-file
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from matplotlib.colors import ListedColormap
## General
# Font face and sizes
mpl.rcParams['font.family'] = 'sans-serif'
# mpl.rcParams['font.sans-serif'] = "Helvetica"
mpl.rcParams['font.size'] = 8                # default font sizes
mpl.rcParams['axes.titlesize'] = 12          # large
mpl.rcParams['axes.labelsize'] = 9           # medium
mpl.rcParams['xtick.labelsize'] = 8          # medium
mpl.rcParams['ytick.labelsize'] = 8          # medium
mpl.rcParams['legend.fontsize'] = 9          # medium
mpl.rcParams['legend.title_fontsize'] = 9    # None (same as default axes)
mpl.rcParams['figure.titlesize'] = 15        # large (suptitle size)
mpl.rcParams['figure.labelsize'] = 12        # large (sup[x|y]label size)

# Spines and ticks
# mpl.rcParams['axes.spines.top'] = False
# mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.linewidth'] = .6
mpl.rcParams['axes.edgecolor'] = 'black'
mpl.rcParams['xtick.major.size'] = 0         # 3.5
mpl.rcParams['ytick.major.size'] = 0         # 3.5
# mpl.rcParams['xtick.major.width'] = 0.8
# mpl.rcParams['ytick.major.width'] = 0.8

# Grid
mpl.rcParams['axes.grid.which'] = 'major'    # lines at {major, minor, both} ticks
mpl.rcParams['grid.linestyle'] = '--'
mpl.rcParams['grid.color'] = '#CCCCCC'
mpl.rcParams['grid.linewidth'] = 0.2
# mpl.rcParams['grid.alpha'] = 1

# Label placement
mpl.rcParams['axes.titlelocation'] = 'center'  # {left, right, center}
mpl.rcParams['axes.titlepad'] = 7.5          # 6
mpl.rcParams['axes.labelpad'] = 7.5          # 4
# mpl.rcParams['xtick.major.pad'] = 3.5      # distance to major tick label in points
# mpl.rcParams['ytick.major.pad'] = 3.5

# Discrete color cycle (and continuous map)
# mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=sns.color_palette("PiYG", n_colors=6))

# Legend properties
mpl.rcParams['legend.loc'] = 'best'
mpl.rcParams['legend.frameon'] = False

# Legend padding
# mpl.rcParams['legend.borderpad'] = 0.4      # border whitespace
# mpl.rcParams['legend.labelspacing'] = 0.5   # vert space between legend entries
# mpl.rcParams['legend.handlelength'] = 2.0   # length of the legend lines
# mpl.rcParams['legend.handleheight'] = 0.7   # height of the legend handle
# mpl.rcParams['legend.handletextpad'] = 0.8  # space btwn legend line & legend text
# mpl.rcParams['legend.borderaxespad'] = 0.5  # border btwn axes and legend edge
# mpl.rcParams['legend.columnspacing'] = 2.0  # column separation

# Space-filling object properties (e.g., polygons/circles, scatter)
mpl.rcParams['patch.edgecolor'] = 'blue'     # if forced, else patch is not filled
mpl.rcParams['patch.force_edgecolor'] = 1
mpl.rcParams['patch.linewidth'] = 0          # edgewidth (.5)

## Specific objects
# Scatter properties
# mpl.rcParams['scatter.edgecolors'] = 'black'

# Line properties
mpl.rcParams['lines.markersize'] = 5
mpl.rcParams['lines.linewidth'] = 2

# Bar properties
# mpl.rcParams['bar.width'] = 0.8

# Error properties
mpl.rcParams['errorbar.capsize'] = 1
# mpl.rcParams['errorbar.color'] = 'black'
# mpl.rcParams['errorbar.linewidth'] = 1.5

# Box properties
# box
mpl.rcParams['boxplot.boxprops.linewidth'] = 0          # box outline (0.5)
# mpl.rcParams['boxplot.boxprops.color'] = 'none'       # 'black' (?)
# box line to cap
mpl.rcParams['boxplot.whiskerprops.linewidth'] = .65
mpl.rcParams['boxplot.whiskerprops.linestyle'] = '--'
# mpl.rcParams['boxplot.whiskerprops.color'] = 'black'  # (?)
# box cap line
mpl.rcParams['boxplot.capprops.linewidth'] = .75
# mpl.rcParams['boxplot.capprops.color'] = 'black'      # (?)
# box median line
mpl.rcParams['boxplot.medianprops.linewidth'] = 1
mpl.rcParams['boxplot.medianprops.linestyle'] = '-'
# mpl.rcParams['boxplot.medianprops.color'] = 'black'   # (?)
mpl.rcParams['boxplot.meanprops.linewidth'] = 1
mpl.rcParams['boxplot.meanprops.linestyle'] = '-'
# mpl.rcParams['boxplot.meanprops.color'] = 'black'     # (?)
# box scatter (fliers)
mpl.rcParams['boxplot.flierprops.markerfacecolor'] = 'none'
mpl.rcParams['boxplot.flierprops.markeredgewidth'] = .65
mpl.rcParams['boxplot.flierprops.marker'] = 'o'
# mpl.rcParams['boxplot.flierprops.markersize'] = 6     # (?)
# mpl.rcParams['boxplot.flierprops.linewidth'] = 0      # (?)
# mpl.rcParams['boxplot.flierprops.markeredgecolor'] = 'black'  # (?)
# mpl.rcParams['boxplot.flierprops.color'] = 'black'    # (?)

## Figure padding
# Figure layout ([NOTE] autolayout and constrained_layout conflict; enable only one)
mpl.rcParams['figure.autolayout'] = True               # auto-fit plot elements on fig
mpl.rcParams['figure.constrained_layout.use'] = True   # apply constrained layout

# Subplot padding (all dims are a fraction of the fig width and height).
# Not compatible with constrained_layout.
# mpl.rcParams['figure.subplot.left'] = 0.125  # left side of subplots of fig
# mpl.rcParams['figure.subplot.right'] = 0.9   # right side of subplots of fig
# mpl.rcParams['figure.subplot.bottom'] = 0.11 # bottom of subplots of fig
# mpl.rcParams['figure.subplot.top'] = 0.88    # top of subplots of fig
# mpl.rcParams['figure.subplot.wspace'] = 0.2  # w reserved space btwn subplots
# mpl.rcParams['figure.subplot.hspace'] = 0.2  # h reserved space btwn subplots

# Constrained layout padding. Not compatible with autolayout.
# mpl.rcParams['figure.constrained_layout.h_pad'] = 0.04167
# mpl.rcParams['figure.constrained_layout.w_pad'] = 0.04167

# Constrained layout spacing between subplots, relative to the subplot sizes.
# Much smaller than for tight_layout (figure.subplot.hspace, figure.subplot.wspace)
# as constrained_layout already takes surrounding text (titles, labels, ticklabels)
# into account. Not compatible with autolayout.
# mpl.rcParams['figure.constrained_layout.hspace'] = 0.02
# mpl.rcParams['figure.constrained_layout.wspace'] = 0.02

## Other
# Figure size and quality
mpl.rcParams['figure.dpi'] = 100          # [NOTE] alters figure size
mpl.rcParams['figure.figsize'] = (5, 5)   # (6, 4), (6.4, 4.8)

# Figure saving settings
mpl.rcParams['savefig.transparent'] = True
mpl.rcParams['savefig.format'] = 'png'
mpl.rcParams['savefig.dpi'] = 330

# %config InlineBackend.figure_format = 'svg'  # set inline figure format/quality
[TODO]
Colour palette generator utilities
[TODO] [def] Continuous generators (a sketch of the first follows this list):
(1 H node) Monochromatic (linear L)
- args: 1 H node, optional n L steps from it
- example: blues
- utility: topological
- use cases: surface/distribution/contour plot
(~2-6 H nodes) Sequential (constant L):
- args: 2-6 H nodes, optional n(k - 1) transition steps between them
- example: viridis/plasma
- utility: topological
- use cases: surface/distribution/contour plot
(2 H nodes) Diverging (triangular L):
- args: 2 H nodes, optional n*2 ascending/descending L steps between them
- example: PiYG/RdBu
- utility: 1D bipolar scale/2D binary probability surface
- use cases: heatmap, classifier decision surface
(~3-6 H nodes) Sequential diverging (~triangular L)
- args: k H nodes, optional n(k - 1)*2 ascending/descending L steps between them
- example: ~gnuplot2/gist_ncar/gist_rainbow/jet
- utility: 1D multipolar scale
- use cases: heatmap with 1+ intermediate nodes
(~6 H nodes) Spectral (~constant L, cyclic H)
- example: hsv
- utility: multiclass probability surface (using class probability-weighted H nodes)
- use cases: multiclass classifier decision surface
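A minimal sketch of the first (monochromatic) generator, assuming an HLS parameterization via the stdlib colorsys; the helper name, ranges, and defaults are illustrative only:

import colorsys
from matplotlib.colors import ListedColormap

def monochromatic_palette(h, n=10, l_min=.25, l_max=.9, s=.6):
    """1 H node, n (>= 2) linearly spaced L steps; h in [0, 1]."""
    l_steps = [l_min + i * (l_max - l_min) / (n - 1) for i in range(n)]
    return ListedColormap([colorsys.hls_to_rgb(h, l, s) for l in l_steps])

monochromatic_palette(h=210/360, n=20)  # a blues-like ramp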
[TODO] [def] Discrete generators:
(k H nodes) Discrete (~constant L)
- args: k H nodes
- example: tab10/Dark2
- utility: discrete objects with no within-group levels OR groups of objects with within-group levels but no order
- use cases: k objects (lines/hists/bars/boxplots) for k categories OR m objects (within groups) for m levels
(k H nodes) Discrete monochromatic (~constant L between * linear L within)
- args: k H nodes, optional n*k L steps from each node
- example: tab20/paired
> tab20c/tab20b
- utility: discrete groups of objects with a within-group order and aim to highlight between-group differences
- use cases: (group * time period) * value grouped objects (lines/hists/bars/boxplots)
Colour generator utility
Given (a) any H value and a k-iary scheme / (b) an H value (or set of H values) and an integer k:
- (a) [def] return complementary/triad/square/k-iary H nodes
- (b) [def] return k quantized H nodes (sketched below)

[TODO] For all of the above:
- args: accept k colours and optional n steps
- convert inputs to H values [0-359]
- (maximize S/L of input args)
- if multiple H nodes passed, quantize so each is evenly spaced on the colorwheel
- return: k H nodes (+ either colourmap or discrete colourset)
- plot: H nodes on colourwheel (+ colourbar)

[TODO] Other:
- demo how to convert any map's HSL values
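A minimal sketch of utility (b) above (k hue nodes quantized evenly around the colour wheel, returned as a discrete colour set); fixed S and L values are assumed for illustration:

import colorsys
from matplotlib.colors import ListedColormap

def quantize_hues(h0, k, s=1.0, l=.5):
    """Return k evenly spaced H nodes starting from h0 (hues in [0, 1])."""
    hues = [(h0 + i / k) % 1.0 for i in range(k)]
    return hues, ListedColormap([colorsys.hls_to_rgb(h, l, s) for h in hues])

hues, cmap = quantize_hues(h0=0.0, k=4)  # complementary pairs at k=2, square at k=4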
[REF] Tools:
- Adobe Colorwheel

Matplotlib:
- Choosing Colormaps
- Creating Colormaps

Other libraries:
- seaborn
- CMasher
- Colormaps
- Palettable

Theory:
- Standard color space: HSL and HSV - Wikipedia
- Perceptually uniform color space: HCL - hclwizard
- Human-friendly HSL!: HSLuv - HSLuv.org

Other:
- Reference of good perceptual palettes: HCL-Based Color Palettes - colorspace
#!pip install cmasher
### Continuous maps
# (1) Monochromatic (1 H, linear L)
#import cmasher as cmr
#cmr.get_sub_cmap('Blues_r', 0, 1, N=20)
#cmr.get_sub_cmap('Greens_r', 0, 1, N=20)
# (2) Sequential (multiple H, constant L)
#cmr.get_sub_cmap('plasma', 0, 1, N=20)
#cmr.get_sub_cmap('cmr.chroma', 0.2, .8, N=20)
### Constructing monochromatic (rot=.1)/sequential/cyclic (rot=1) maps w/ linear L
# starting H [0, 3] (ROYGBIV = 3/7*h), rotations around hue wheel (int)
# dark/light (intensity of darkest/lightest color in palette) [0, 1] or gamma
ListedColormap(sns.cubehelix_palette(start=(3/7)*1, rot=.1, reverse=True, gamma=.7,
                                     # dark=.25, light=.85,
                                     n_colors=20, as_cmap=False))
# (3) Diverging (~2 H, triangular L)
#cmr.get_sub_cmap('PiYG', 0, 1, N=20)
#cmr.get_sub_cmap('Spectral', 0, 1, N=20)
# (4) Sequentially diverging (3-6 H, ~triangular L)
#cmr.get_sub_cmap('gnuplot', 0, 1, N=20)
### Constructing diverging/sequentially diverging map
# starting/ending H [0, 359], S [0, 100], L [0, 100]
n_colors = 50
palette_1 = sns.diverging_palette(h_neg=100, h_pos=200, s=100, l=50, n=n_colors, as_cmap=False)
palette_1 = ListedColormap(palette_1)
palette_1

palette_2 = sns.diverging_palette(h_neg=200, h_pos=300, s=100, l=50, n=n_colors, as_cmap=False)
palette_2 = ListedColormap(palette_2)
palette_2

# Combine
import numpy as np
palette_combined = np.vstack((
    palette_1(np.linspace(0, 1, n_colors)),
    palette_2(np.linspace(1/n_colors, 1, n_colors - 1))))
ListedColormap(palette_combined, name='GrBlPu')
# (5) Spectral (cyclic H, ~constant L)
"husl", n_colors=10, as_cmap=False)) ListedColormap(sns.color_palette(
#cmr.get_sub_cmap('hsv', 0, 1, N=20)
### Discrete maps
#cmr.get_sub_cmap('tab20b', 0, 1, N=20)
# plt.cm.tab20b
### Constructing discrete monochromatic map (hue-grouped truncated discrete map)
n_groups = 3
n_bars = 3
colors = plt.cm.tab20b((
    [0, 1, 2,
     4, 5, 6,
     16, 17, 18])).reshape(n_groups, n_bars, 4)
ListedColormap(colors.reshape(n_groups * n_bars, 4))
# ListedColormap(colors[0, :])
### Obtain hex/rgb string codes for specified colourmap
# return_fmt {hex, float=rgb}
#cmr.take_cmap_colors('cmr.chroma', cmap_range=(0, 1), N=20, return_fmt='float')
###
# [TODO] legend has 1st colour of each group; xaxis has rotated labels of 3*3
import matplotlib.pyplot as plt
import numpy as np
= ("Adelie", "Chinstrap", "Gentoo")
species = {
data 'Bill Depth': (18.35, 18.43, 14.98),
'Bill Length': (38.79, 48.83, 47.50),
'Flipper Length': (189.95, 195.82, 217.19),
}
= np.arange(len(species)) # the label locations
x = 0.2 # the width of the bars
width = 0
multiplier
= plt.subplots(layout='constrained')
fig, ax
for group, (attribute, measurement) in enumerate(data.items()):
= width * multiplier
offset = ax.bar(x + offset, measurement, width, alpha=.85,
rects =attribute.lower().capitalize(),
label=colors[:, group])
color=3)
ax.bar_label(rects, padding+= 1
multiplier
='y'); ax.legend(ncols=3)
ax.grid(axis0, 250)
ax.set_ylim(+ width, species)
ax.set_xticks(x 'Length (mm)')
ax.set_ylabel('Whitepaper plot')
ax.set_title(
plt.show()
### [def] Utilities for visualizing df and LA operations
# [REF] Vanderplas; [MB] adapted to handle numpy arrays
import numpy as np
import numpy.linalg as la
import pandas as pd
class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}{1}
    """

    def __init__(self, *args):
        self.args = args

    def __repr__(self):
        return '\n\n'.join(
            '\n' + '\033[1m' + a + '\033[0m'
            + '\n' + ' ' + repr(np.shape(eval(a))) + ' '
            + '\n' + repr(np.round(eval(a), 2))
            for a in self.args
        )

def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
### Examples: Visualizing df operations
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')
### Examples: Visualizing matrix operations
# Matrix algebra & operations
X = np.array([[1, 2, 3], [4, 5, 6]])
Y = np.array([[1, 2, 3], [4, 5, 6]])
display('X', 'Y.T', 'X @ Y.T', 'la.inv(X @ Y.T)', 'la.det(X @ Y.T)')

# Outer products
x = np.array([1, 2])
y = x + 1
display('x', 'y', 'np.outer(x, y)')

# Eigendecomposition
X_square = np.vstack((X, np.array([7, 8, 9])))
eig, Evec = la.eig(X_square)
Eig = np.diag(eig)
display('X_square', 'Eig', 'Evec')

# Matrix decomposition
U, d, V_T = la.svd(X_square)
D = np.diag(d)
display('X_square', 'U', 'D', 'V_T')

# Forming tensors: From vectors
x_vec = x[:, np.newaxis]
x_ten = x_vec[:, :, np.newaxis]
display('x', 'x_vec', 'x_ten')

# Forming tensors: From matrices
X_3D1 = X[:, :, np.newaxis]
X_3D2 = X[:, np.newaxis, :]
X_3D3 = X[np.newaxis, :, :]
display('X', 'X_3D1', 'X_3D2', 'X_3D3')

# Tensor broadcasting: From vectors
c = np.array([0, 1, 2])
display('x_vec', 'x_ten', 'c', 'x_ten - c')

# Tensor broadcasting: From matrices
display('X', 'X_3D1', 'c', 'X_3D1 - c')

# Reshaping & aggregating: From vectors
display('x_vec', 'c', 'x_ten - c',
        '(x_ten - c) ** 2',
        'np.sum((x_ten - c) ** 2, axis=1)')

# Reshaping & aggregating: From tensors
# [TODO] add tensor dot product for summation over arbitrary dims
display('X', 'c', 'X_3D1 - c',
        '(X_3D1 - c) ** 2',
        '((X_3D1 - c) ** 2).reshape(len(X), 1, -1)',
        'np.sum(((X_3D1 - c) ** 2).reshape(len(X), 1, -1), axis=1)')  # EATS empty 1st dim
### [def] Tools for visualizing LA operations
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# [NOTE] attempted to resolve pyplot matrix rendering issues
# !pip install array_to_latex
# import array_to_latex as a2l
# mpl.rcParams['text.usetex'] = True
# mpl.rcParams['text.latex.preamble'] = r'\usepackage{{amsmath}}'
# plt.rcParams["text.latex.preamble"].join([
# r"\usepackage{dashbox}",
# r"\setmainfont{xcolor}"])
# plt.rcParams.update({"text.usetex": True})
# [REF] https://inakleinbottle.com/posts/formatting-matrices-with-python/
def format_matrix(matrix, environment="pmatrix", formatter=str):
    """Format a matrix using LaTeX syntax"""
    if not isinstance(matrix, np.ndarray):
        try:
            matrix = np.array(matrix)
        except Exception:
            raise TypeError("Could not convert to Numpy array")

    if len(shape := matrix.shape) == 1:
        matrix = matrix.reshape(1, shape[0])
    elif len(shape) > 2:
        raise ValueError("Array must be 2 dimensional")

    body_lines = [" & ".join(map(formatter, row)) for row in matrix]
    body = "\\\\\n".join(body_lines)
    return f"""\\begin{{{environment}}}
{body}
\\end{{{environment}}}"""

# [REF] Geron
def plot_vector2d(vector2d, origin=[0, 0], **options):
    return plt.arrow(origin[0], origin[1], vector2d[0], vector2d[1],
                     linewidth=1, head_width=0.1,
                     head_length=0.15, length_includes_head=True,
                     **options)

def plot_transformation(P_before, P_after, text_before, text_after,
                        axis=[0, 5, 0, 4], arrows=False, display_mapping=None):
    if arrows:
        for vector_before, vector_after in zip(P_before.T, P_after.T):
            plot_vector2d(vector_before, color="blue", linestyle="--")
            # plot_vector2d(vector_before, color="blue", linestyle="-")
            plot_vector2d(vector_after, color="red", linestyle="-")

    if display_mapping is not None:
        # M_before = r'$\begin{bmatrix} ' + '\\'.join([' & '.join(map(str, row)) for row in P_before]) + r'\end{bmatrix}$'
        plt.text(axis[1]*.7, axis[3]*.7,
                 f"{format_matrix(display_mapping, environment='bmatrix')}",  # the passed mapping, not a global
                 bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.3'),
                 fontsize=9, color="black")

    plt.gca().add_artist(Polygon(P_before.T, alpha=0.2))
    plt.gca().add_artist(Polygon(P_after.T, alpha=0.3, color="r"))
    plt.plot(P_before[0], P_before[1], "b--", alpha=0.5)
    plt.plot(P_after[0], P_after[1], "r--", alpha=0.5)

    plt.text(P_before[0].mean(), P_before[1].mean(), text_before,
             fontsize=12, color="blue")
    # plt.text(P_before[0].max()*.9, P_before[1].max()*.9, text_before,
    #          fontsize=12, color="blue")
    plt.text(P_after[0].mean(), P_after[1].mean(), text_after,
             fontsize=12, color="red")

    plt.axis(axis)
    plt.gca().set_aspect("equal")
    plt.grid()
### Examples: Vectors
u = np.array([2, 5])
v = np.array([3, 1])

plt.subplots(figsize=(4, 4))
plot_vector2d(u, color="r")
plot_vector2d(v, color="b")
plot_vector2d(v, origin=u, color="b")
plot_vector2d(u, origin=v, color="r")
plot_vector2d(u+v, color="g")

plt.axis([0, 6.5, 0, 6.5])
plt.gca().set_aspect("equal")
plt.text(0.7, 3, "u", color="r", fontsize=12)
plt.text(4, 3, "u", color="r", fontsize=12)
plt.text(1.8, 0.2, "v", color="b", fontsize=12)
plt.text(3.1, 5.6, "v", color="b", fontsize=12)
plt.text(2.4, 2.5, "u+v", color="g", fontsize=12)

plt.grid()
plt.show()
### Examples: Matrices
# [def] Helper to get min & max coordinates of start & end matrices
def mat_minmax_coords(*matrices, square=False):
    stacked_matrix = np.hstack(matrices)
    xy_min = np.min(stacked_matrix, axis=1) * 1.1
    xy_max = np.max(stacked_matrix, axis=1) * 1.1
    axis = [xy_min[0], xy_max[0], xy_min[1], xy_max[1]]

    if square:
        # np.array so the elementwise scaling below works
        axis_lengths = np.array([xy_max[0] - xy_min[0], xy_max[1] - xy_min[1]])
        scale_factor = max(axis_lengths)/axis_lengths
        axis = axis * np.repeat(scale_factor, 2)

    return axis

# Starting matrix
# P = np.array([[3.0, 4.0, 1.0, 4.6], [0.2, 3.5, 2.0, 0.5]])  # non-convex shape
# P = np.array([[0.0, 4.0, 6.0, 2.0], [0.0, 0.0, 5.0, 5.0]])  # neater shape
P = np.array([[0, 0, 1, 1], [0, 1, 1, 0]])  # square
P = P / 2
fig, axs = plt.subplots(2, 2, figsize=(8, 8))

# Scale
P_rescaled = .6 * P
plt.sca(axs[0, 0])  # [0, 3.5, 0, 3.5]
plot_transformation(P, P_rescaled, "$P$", "$0.6 P$",
                    axis=mat_minmax_coords(P, P_rescaled, square=0),
                    arrows=False)

# Rotate
angle30 = 30 * np.pi / 180    # angle in radians
angle120 = 120 * np.pi / 180  # angle in radians
V = np.array([
    [np.cos(angle30), np.sin(angle30)],
    [np.cos(angle120), np.sin(angle120)]])
P_rotated = V @ P
plt.sca(axs[0, 1])  # [-1.5, 4, -1.5, 4]
plot_transformation(P, P_rotated, "$P$", "$V_{rotate} P$",
                    axis=mat_minmax_coords(P, P_rotated, square=0),
                    display_mapping=V)

# Shear
F_shear = np.array([[1, 1.5], [0, 1]])
P_sheared = F_shear @ P
plt.sca(axs[1, 0])  # [0, 7, 0, 7]
plot_transformation(P, P_sheared, "$P$", "$F_{shear} P$",
                    axis=mat_minmax_coords(P, P_sheared, square=0))

# Horizontal reflection
F_reflect = np.array([[1, 0], [0, -1]])
P_reflected = F_reflect @ P
plt.sca(axs[1, 1])  # [-3, 4, -3, 4]
plot_transformation(P, P_reflected, "$P$", "$F_{reflect} P$",
                    axis=mat_minmax_coords(P, P_reflected, square=0))

plt.show()
### Example: SVD
# M = np.array([[1, 1.5], [0, 1]])  # F_shear
M = (np.array([[1, 1.5], [0, 1]]))*-1
Square = np.array([
    [0, 0, 1, 1],
    [0, 1, 1, 0]])
U, d, V_T = la.svd(M)  # 𝜎/Σ_diag, Σ
D = np.diag(d)
assert np.all(np.round(U @ D @ V_T, 1) == np.round(M, 1))

# SVD
fig, axs = plt.subplots(1, 3, figsize=(9, 6))
axis = mat_minmax_coords(
    Square, V_T @ Square, D @ V_T @ Square, U @ D @ V_T @ Square, square=True)

plt.sca(axs[0])  # no need to explicitly plot with axs handle for custom plotter
plot_transformation(Square, V_T @ Square, "$Square$", r"$V^T \cdot Square$",
                    axis=axis)
plt.title(r'$(V^T) \cdot Square$')

plt.sca(axs[1])
plot_transformation(V_T @ Square, D @ V_T @ Square,
                    r"$V^T \cdot Square$", r"$\Sigma \cdot V^T \cdot Square$",
                    axis=axis)
plt.title(r'$(\Sigma \cdot V^T) \cdot Square$')

plt.sca(axs[2])
plot_transformation(D @ V_T @ Square, U @ D @ V_T @ Square,
                    r"$\Sigma \cdot V^T \cdot Square$",
                    r"$U \cdot \Sigma \cdot V^T \cdot Square$",
                    axis=axis)
plt.title(r'$(U \cdot \Sigma \cdot V^T) \cdot Square$')

# plt.suptitle('SVD')
# plt.tight_layout()
plt.show()
### [def] Generate B/W image
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
np.random.seed(19680801)  # seed random number generator

def generate_stripe_image(size, stripe_nr, vertical=True):
    """Generate random (black) square image w/ `stripe_nr` (white) stripes"""
    img = np.zeros((size, size, 1), dtype="uint8")  # init size^2 img of 0's
    for i in range(0, stripe_nr):
        x, y = np.random.randint(0, size, 2)  # gen 2 rand ints btwn 0 & size
        l = int(np.random.randint(y, size))   # rand int btwn y & size (np.int was removed in NumPy 1.24)
        if vertical:  # set to True by default
            img[y:l, x, 0] = 255  # add vertical line from y:l in rand col x
        else:
            img[x, y:l, 0] = 255  # horizontal line in row x
    return img
### [fig] Example: Plot image
# plot example image for reference
img = generate_stripe_image(50, 10, vertical=True)
plt.imshow(img[:, :, 0], cmap='gray')
### Generate training and validation sets
n_img = 1000
# n_stripes = 10
# img_size = 50

# initialize empty arrays to store training/validation sets
# (separate allocations; a chained assignment would alias the two arrays)
X_train = np.empty([n_img, 50, 50, 1])
X_val = np.empty([n_img, 50, 50, 1])
vert = True  # initially generate vertical images
n_vert = .5*n_img

## generate images
for i in range(np.shape(X_train)[0]):
    # start generating horizontal images half-way through loop
    if i == n_vert - 1:
        vert = False

    X_train[i, :, :, :] = generate_stripe_image(50, 10, vert)
    X_val[i, :, :, :] = generate_stripe_image(50, 10, vert)

## (standard) normalize training and validation sets (based on training data)
X_mean = np.mean(X_train, axis=0)
X_std = np.std(X_train, axis=0) + 1e-7  # to deal w/ potential sd's of 0
X_train = (X_train - X_mean) / X_std
X_val = (X_val - X_mean) / X_std
### Store true class labels (0 = vert. stripes; 1 = horiz.)
Y_train = np.zeros(500)
Y_train = Y_val = np.append(Y_train, np.ones(500))
# Basics
fig, ax = plt.subplots()
ax.plot([1, 2, 3], [4, 5, 6], label='Example line 1')
ax.plot([1, 1.5, 2], [4, 5, 6], label='Example line 2')
ax.scatter([1, 2, 3], [4, 5, 6])
ax.scatter([1, 1.5, 2], [4, 5, 6])

ax.grid(); ax.legend()
ax.set_xlim(.75, 3.25)  # ax.set_ylim(0, 3)
ax.set_xticks([1, 1.5, 2, 2.5, 3])  # ax.set_yticks([1, 2, 3])
ax.set_xlabel('$x$'); ax.set_ylabel('$y$')
ax.set_title('Lines & scatter')

plt.show()
# [TODO] compute bar heights from randomly generated averages & add error bar
# fig, ax = plt.subplots(figsize=(6/1.2,4*1.2))
fig, ax = plt.subplots(layout='constrained')
ax.bar([1, 2, 3], [4, 5, 6], label='Example bar 1', width=.3, alpha=.85)
ax.bar([1.5, 2.5], [4, 6], label='Example bar 2', width=.3, alpha=.85)

ax.grid(which='major', axis='y'); ax.legend()
ax.set_xticks([1, 2, 3])
ax.set_xlabel('$x$'); ax.set_ylabel('$y$')
ax.set_title('Bars')

# plt.tight_layout()
plt.show()
### Pivot table for numeric column aggregated wrt 2 cats (group/level)
# [TODO] move to data exploration, above grouping/joins
import seaborn as sns
import pandas as pd
# Load the tips dataset
= sns.load_dataset("tips")
tips
# Specify the categorical variables for levels and groups
= "sex"
level_variable = "day"
group_variable
# Create a pivot table to structure the data
= tips.pivot_table(values="total_bill", index=level_variable, columns=group_variable, aggfunc="mean")
pivot_table
pivot_table
### Pivot table for num column w/ non-aggregated values wrt 2 cats (group/level)
# For quickly plotting group comparisons
# [TODO] move to data exploration, above grouping/joins
import seaborn as sns
import pandas as pd
# Load the tips dataset
= sns.load_dataset("tips")
tips
# Specify the categorical variables for levels and groups
= "sex"
level_variable = "day"
group_variable
# Group by the specified variables and collect total_bill values as arrays
= tips.groupby([level_variable, group_variable])["total_bill"].apply(list).unstack()
grouped_data
# Display the resulting DataFrame
grouped_data
### Stat + CI for num column w/ non-aggregated values wrt 2 cats (group/level)
# For quickly obtaining stats and err for group comparisons
# [TODO] move to data exploration, above grouping/joins
import numpy as np
import pandas as pd
from scipy import stats
# Assume grouped_data is a DataFrame with multi-index (group, level) and values as arrays
# Replace 'total_bill' with the actual column name containing arrays
grouped_data = tips.groupby(['sex', 'day'])['total_bill'].apply(np.array)

# Function to calculate mean and confidence interval
def calculate_mean_ci(data):
    # Convert non-numeric values to NaN and drop them
    numeric_data = pd.Series(data).apply(pd.to_numeric, errors='coerce').dropna()

    if len(numeric_data) > 0:
        mean = np.mean(numeric_data)
        ci = stats.t.interval(0.95, len(numeric_data) - 1,
                              loc=np.mean(numeric_data),
                              scale=stats.sem(numeric_data))
        return mean, ci
    else:
        return np.nan, (np.nan, np.nan)

# Apply the function to each group
result_list = []

for (group, level), data in grouped_data.items():
    mean, ci = calculate_mean_ci(data)
    result_list.append({'Group': group,
                        'Level': level,
                        'Mean': mean,
                        'Confidence Interval': ci})

# Display the list of means and confidence intervals
for result in result_list:
    print(result)
# [TODO] add option to plot sigbars for within group comparisons instead of btwn
# [TODO] add option to plot sigbars for betwn group comps of particular level
# [TODO] check accuracy of stats
# [TODO] clean up
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from itertools import combinations
# Specify the number of groups and bars per group
N_GROUPS = 4
N_LEVELS = 3

# Generate or use actual data for averages and confidence intervals
data = np.random.normal(loc=5, scale=1.0, size=(N_GROUPS, N_LEVELS, 10))  # adjusted the number of samples

# Modify a couple of bar values to demonstrate significant differences
data[0, 0, :] += 2.5  # increase the first bar in the first group
data[2, 2, :] -= 2.5  # decrease the third bar in the third group

# Calculate averages and confidence intervals
averages = np.mean(data, axis=2)
conf_intervals = np.zeros_like(averages, dtype=float)

for group_idx in range(N_GROUPS):
    for level_idx in range(N_LEVELS):
        interval = stats.t.interval(0.95, len(data[group_idx, level_idx]) - 1,
                                    loc=np.mean(data[group_idx, level_idx]),
                                    scale=stats.sem(data[group_idx, level_idx]))
        conf_intervals[group_idx, level_idx] = np.abs(interval[1] - averages[group_idx, level_idx])  # use upper bound

# Plot grouped bars with confidence intervals
width = 0.2
colors = plt.cm.viridis(np.linspace(0, 1, N_LEVELS))
line_thickness = 0.7   # adjust the line thickness
stagger_amount = 0.8   # increased the stagger amount

fig, ax = plt.subplots()

for level_idx in range(N_LEVELS):
    bars = ax.bar(np.arange(N_GROUPS) + level_idx * width - (width * (N_LEVELS - 1) / 2),
                  averages[:, level_idx],
                  yerr=conf_intervals[:, level_idx],
                  width=width,
                  alpha=0.85,
                  capsize=3,
                  color=colors[level_idx],
                  label=f'Level {level_idx + 1}',
                  error_kw={'elinewidth': line_thickness})  # set the line thickness for error bars

# Set labels and title
ax.set_xlabel('Groups')
ax.set_ylabel('Values')
ax.set_title('Grouped Bars with Confidence Intervals')

# Set x-axis ticks and labels
group_labels = [f'Group {i}' for i in range(1, N_GROUPS + 1)]
ax.set_xticks(np.arange(N_GROUPS))
ax.set_xticklabels(group_labels)

# Add legend
ax.legend(title='Levels', bbox_to_anchor=(1.05, 1), loc='upper left')

# Add staggered significance bars and asterisks for select between-group comparisons
significance_level = 0.05
stagger_index = 0

for comb in combinations(range(N_GROUPS), 2):
    group1_center = ax.get_xticks()[comb[0]]
    group2_center = ax.get_xticks()[comb[1]]

    t_stat, p_value = stats.ttest_ind(data[comb[0], :, :].flatten(), data[comb[1], :, :].flatten())
    if p_value < significance_level:
        tallest_bar_height = np.max(averages) + np.max(conf_intervals) + 0.5
        significance_height = tallest_bar_height + np.max(conf_intervals) * 0.07 + stagger_index * stagger_amount  # adjust the stagger amount

        # Plot staggered horizontal lines aligned with the midpoints of the compared groups
        ax.plot([group1_center, group2_center],
                [significance_height] * 2, color='black', lw=line_thickness)

        # Plot asterisks aligned with the center of the significance bars
        asterisks = '*' * sum([p_value < alpha for alpha in [0.01, 0.001, 0.0001]])
        ax.text((group1_center + group2_center) / 2, significance_height,
                asterisks, ha='center', va='bottom', fontsize=10)

        stagger_index += 1  # increment the index for staggered bars

        # Print significant comparisons, t-test results, and sample sizes
        sample_size1 = len(data[comb[0], :, :].flatten())
        sample_size2 = len(data[comb[1], :, :].flatten())
        print(f'Significant comparison between {group_labels[comb[0]]} and {group_labels[comb[1]]}: '
              f'p-value = {p_value}, \nt-statistic = {t_stat}, '
              f'Sample Size: {group_labels[comb[0]]} = {sample_size1}, {group_labels[comb[1]]} = {sample_size2}\n')

# Show the plot
ax.grid(axis='y')
plt.tight_layout()
plt.show()
import seaborn as sns
from pandas.plotting import scatter_matrix
from matplotlib.ticker import FormatStrFormatter
= sns.load_dataset("iris")
data = data.rename(columns=lambda name: name.replace('_', ' ').capitalize())
data_
= plt.subplots()
fig, axs = scatter_matrix(data_, ax=axs, diagonal='hist', alpha=.75,
axs =dict(alpha=.75),
hist_kwds
)
# Iterate through the axes to set the y-axis formatter
for ax in axs.flatten():
# ax.yaxis.set_major_formatter(FormatStrFormatter('%05.2f'))
'%.2f'))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
ax.xaxis.set_major_formatter(FormatStrFormatter(='both', labelsize=7)
ax.tick_params(axis8)
ax.xaxis.label.set_size(8)
ax.yaxis.label.set_size(# ax.grid()
"Scatter matrix")
fig.suptitle( plt.show()
# [TODO] Add example where axes flipped so text horizontal with bars
import matplotlib.pyplot as plt
import numpy as np
= ("Adelie", "Chinstrap", "Gentoo")
species = {
penguin_means 'Bill Depth': (18.35, 18.43, 14.98),
'Bill Length': (38.79, 48.83, 47.50),
'Flipper Length': (189.95, 195.82, 217.19),
}
= np.arange(len(species)) # the label locations
x = 0.2 # the width of the bars
width = 0.009 # the space between bars within groups
spacing = 0
multiplier
with sns.color_palette('viridis', n_colors=3, as_cmap=False):
# color=ListedColormap(plt.cm.tab20b((np.arange(0,9))))
= plt.subplots(layout='constrained')
fig, ax
for attribute, measurement in penguin_means.items():
= (width + spacing) * multiplier
offset = ax.bar(x + offset, measurement, width, alpha=.85,
rects =attribute.lower().capitalize())
label=3)
ax.bar_label(rects, padding+= 1
multiplier
='y'); ax.legend(ncols=3)
ax.grid(axis0, 250)
ax.set_ylim(# ax.set_xticks(x + width, species)
+ ((width + spacing) * (multiplier - 1) / 2), species)
ax.set_xticks(x 'Length (mm)')
ax.set_ylabel('Whitepaper plot')
ax.set_title(
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
= ("Adelie", "Chinstrap", "Gentoo")
species = {
penguin_means 'Bill Depth': (18.35, 18.43, 14.98),
'Bill Length': (38.79, 48.83, 47.50),
'Flipper Length': (189.95, 195.82, 217.19),
}
= np.arange(len(species)) # the label locations
x = 0.2 # the width of the bars
width = 0.0 # the space between bars within groups
spacing = 0
multiplier
with sns.color_palette('viridis', n_colors=3, as_cmap=False):
= plt.subplots(layout='constrained', figsize=(6, 3.5))
fig, ax
for attribute, measurement in penguin_means.items():
= (width + spacing) * multiplier
offset = ax.barh(x + offset, measurement, height=width, alpha=.85,
rects =attribute.lower().capitalize())
label=3)
ax.bar_label(rects, padding+= 1
multiplier
='x')
ax.grid(axis=3)
ax.legend(ncols0, 250)
ax.set_xlim(+ ((width + spacing) * (multiplier - 1) / 2))
ax.set_yticks(x
ax.set_yticklabels(species)'Length (mm)')
ax.set_xlabel('Transposed whitepaper plot')
ax.set_title(
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# At least one dimension must be > 1
N_ROWS = 2
N_COLS = 3

# Sample data for each subplot in groups of 2
x = np.linspace(0, 2 * np.pi, 100)
functions = []
titles = []
for i in range(N_ROWS*N_COLS * 2):
    # bind i as a default arg so each lambda keeps its own frequency
    functions.append(lambda x, i=i: np.sin((i + 1) * x))
    functions.append(lambda x, i=i: np.cos((i + 1) * x))
    if (i % 2 == 0): titles.append(f'$a = {int(i/2 + 1)}$')

# Create an N_ROWS x N_COLS subplot grid using a for loop
fig, axs = plt.subplots(N_ROWS, N_COLS, sharey=True)

# Flatten the axs array for easier iteration
axs = axs.flatten()

# Loop through subplots and plot data
for i, ax in enumerate(axs):
    y1 = functions[i * 2](x)
    y2 = functions[(i * 2) + 1](x)

    ax.plot(x, y1)
    ax.plot(x, y2)
    # To add legend to each frame
    # ax.legend(labels=[f'$sin({i + 1}x)$', f'$cos({i + 1}x)$'], loc=2)

    # Set title of each frame and label only rows and columns
    ax.set_title(titles[i])
    if (i > (N_ROWS - 1) * N_COLS - 1): ax.set_xlabel('$x$')
    if i % N_COLS == 0: ax.set_ylabel('$y$')
    ax.grid()

# Add legend for groups
fig.legend(labels=['$sin(ax)$', '$cos(ax)$'], bbox_to_anchor=(1, .89), loc=2)

# Use tight layout (default) to prevent clipping of titles & center
fig.supylabel('$y$')
fig.supxlabel('$x$', x=.58)
plt.suptitle('Subplot grid', x=.58)

plt.show()
import seaborn as sns
from matplotlib.ticker import StrMethodFormatter
= sns.load_dataset("iris")
data = data.rename(columns=lambda name: name.replace('_', ' ').capitalize())
data_
= dict(edgecolor='black', linewidth=.5) # bins
hist_kws = dict(linewidth=.5)
kde_kws = dict(alpha=.6, edgecolor='black', linewidth=.1) # s
scatter_kws = dict(linewidth=1)
line_kws = dict(scatter_kws=scatter_kws, line_kws=line_kws)
reg_kws
with sns.axes_style('whitegrid'), sns.color_palette('viridis'):
# plt.figure(figsize=(6, 6))
= sns.pairplot(data_, vars=['Petal width', 'Petal length', 'Sepal length'],
ax ='Species',
hue='reg', diag_kind='kde',
kind=reg_kws, diag_kws=kde_kws,
plot_kws
)
'Grouped pairplot')
plt.suptitle(
plt.tight_layout()
plt.show()# ax.axes.set_major_formatter(StrMethodFormatter(f'{{x:.2f}}'))
# sns.move_legend(ax, "upper left", bbox_to_anchor=(1, .75))
# Grouped boxplots
# [TODO] add sigstars to significant within-group differences
tips = sns.load_dataset('tips')

# with sns.axes_style('whitegrid'), sns.color_palette('viridis', n_colors=2):
# with sns.color_palette('viridis', n_colors=2):
plt.figure()
ax = sns.boxplot(data=tips, x='day', y='total_bill', hue='sex',
                 # dodge=1,
                 gap=.2, width=.6,
                 saturation=.7,
                 fliersize=3.75,
                 # whis=.5,
                 # linecolor='black', linewidth=.5,
                 )
# plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.grid(axis='y')
ax.legend(ncol=2, loc='best')
plt.xlabel('Category'); plt.ylabel('Value')
plt.title('Grouped boxplots')

plt.tight_layout()
plt.show()
= sns.load_dataset("titanic")
titanic
with sns.axes_style('whitegrid'):
plt.figure()= sns.FacetGrid(titanic, col="survived", row="sex")
ax = ax.map(plt.hist, "age", alpha=.85, bins=10)
ax
'Age')
ax.set_xlabels('Bifaceted wildcard plot')
plt.suptitle(# plt.tight_layout()
plt.show()
[TODO] Missing (a sketch of item 1 follows this list):
1. overlapping densities
2. overlapping scatter
3. jointplot
4. √ bars with error bar
5. timeseries w/ error bar
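As a starting point for item 1, a minimal sketch of overlapping densities with seaborn; the tips dataset and columns are placeholders:

import seaborn as sns
import matplotlib.pyplot as plt

tips = sns.load_dataset('tips')

plt.figure()
# One filled KDE per group, overlaid on shared axes
sns.kdeplot(data=tips, x='total_bill', hue='sex', fill=True, alpha=.4)
plt.title('Overlapping densities')
plt.show()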
tips = sns.load_dataset('tips')
# hist_kws = dict(edgecolor='black', linewidth=.5)  # bins
line_kws = dict(linewidth=1)
kde_kws = dict()  # linewidth=.5

with sns.axes_style('whitegrid'):
    plt.figure()
    ax = sns.histplot(data=tips, x='total_bill',  # hue='sex',
                      kde=True, element='bars', stat='count',
                      line_kws=line_kws, kde_kws=kde_kws,
                      )
    plt.xlim(-10, 60)
    plt.xlabel('Value')
    plt.title('Histogram with KDE')

    plt.tight_layout()
    plt.show()
# Correlation matrix and heatmap
# [TODO] Make these colours nicer
iris = sns.load_dataset('iris')
iris_ = iris.rename(columns=lambda name: name.replace('_', ' ').capitalize())
correlation_matrix = iris_.corr(numeric_only=True)

# with sns.plotting_context(font_scale=.1):
plt.figure()
ax = sns.heatmap(correlation_matrix,
                 annot=True, cbar=True, cmap=plt.cm.PiYG,
                 square=True,
                 linewidths=0,
                 # linecolor='black', edgecolor='black',
                 )
plt.title('Heatmap')
plt.show()
[TODO]
Missing:
1. 3D scatter
2. 2D cartesian
3. surface
4. curve
5. 2D vector field
6. 2D contour plot

ML plots:
7. [TODO] 3D data + PCs -> transformed 2D
8. [TODO] clean up generative (HoML), cluster/DR comparison

Functionalize:
12. [TODO] functionalize ML plots (args: fit model; X/y train/test (test optional); DR/manifold method)
13. [TODO] functionalize ML comparisons (args: model builds, labs, dataset lists; DR/man meth; dataset kwargs)

[Multi] For class/clust functions:
14. [TODO] add support for multi-class via appropriate custom colour generator calls
15. [TODO] add support for multi-feature class/clust via appropriate DR/manifold method (class DR, clust man)

[Prep] For class/clust/manifold functions:
16. [TODO] include appropriate prep for each feature (num: scale, cat: enc, ts: smooth, word: embed, sent: tfidf)

[Datasets] For ML comparison * class/clust functions:
17. [TODO] include std set of datasets (blob/circ/lin) * std args (blob cov kw: sphere/elip/ndiag, deg over, rand)

Later, for model-specific validations:
- fix standard dataset & DR/manifold method & prep
iris = sns.load_dataset('iris')

with sns.axes_style('whitegrid'):
    plt.figure()
    sns.kdeplot(data=iris, x='sepal_length', y='sepal_width',
                cmap='Blues', fill=True, levels=5,
                )
    plt.title('Joint contour plot')
    plt.xlabel('Feature 1'); plt.ylabel('Feature 2')

    plt.tight_layout()
    plt.show()
[TODO] (a hypothetical skeleton follows this spec)
def math_plotter(
    f: lambda {function(x, y), curve(t), surface(u, v), field(x, y, z)},
    type: string {'2D', '3D', 'contour', 'curve', 'surface', 'field'},
    coordinates: string = 'rect' {'rect', 'radial', 'cylindrical', 'spherical'},
    bounds: List(
        {x, t, u}: List(lo: float, hi: float) = [-10., 10.],
        {y, v}: List(lo: float, hi: float) = [-10., 10.],
        {z}: List(lo: float, hi: float) = [-10., 10.])
) -> plot: mpl_object {
    f_plot: mpl_subplot,
    gradient_plot: mpl_subplot = None,
    integral_{plot, text}: {mpl_subplot, string} = None
        (explicit 2D graph if '2D' type given and closed form exists;
         else numerical estimate wrt some specified bounds)}
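A minimal, hypothetical skeleton of the dispatcher above; only the '2D' and 'contour' branches are sketched, and `plot_type` stands in for the `type` argument to avoid shadowing the builtin:

import numpy as np
import matplotlib.pyplot as plt

def math_plotter(f, plot_type='2D', bounds=((-10., 10.), (-10., 10.))):
    """Hypothetical dispatcher sketch for the spec above."""
    fig, ax = plt.subplots()
    x = np.linspace(*bounds[0], 200)
    if plot_type == '2D':
        ax.plot(x, f(x))                      # explicit 2D graph of f(x)
    elif plot_type == 'contour':
        y = np.linspace(*bounds[1], 200)
        X, Y = np.meshgrid(x, y)
        ax.contourf(X, Y, f(X, Y), cmap='viridis')
    else:
        raise NotImplementedError(plot_type)  # 3D/curve/surface/field branches TBD
    return fig

math_plotter(np.sin, '2D', bounds=((-2*np.pi, 2*np.pi), (-1, 1)))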
import numpy as np
import matplotlib.pyplot as plt
def f(x, y):
return np.sin(x) * np.cos(y)
# Create a grid of points & corresponding smaller grid for a neater quiver plot
n = 100
X, Y = np.meshgrid(np.linspace(-5, 5, n), np.linspace(-5, 5, n))
# X_quiv, Y_quiv = np.meshgrid(np.linspace(-5, 5, int(n/10)),
#                              np.linspace(-5, 5, int(n/10)))

# Compute the derivative of the surface
Z = f(X, Y)
# Z_quiv = f(X_quiv, Y_quiv)
dx, dy = np.gradient(Z)
# dx, dy = np.gradient(Z_quiv)

# Plot the surface and its derivative
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap='viridis')
ax.quiver(X, Y, Z, dx, dy, np.zeros_like(dx),
          arrow_length_ratio=.3, color='red')

# Set the axes labels
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

# Show the plot
plt.show()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import scipy.integrate as integrate
# Calculate \int^\infty_0 e^{-x} dx
# invexp = lambda x: np.exp(-x)
# integrate.quad(invexp, 0, np.inf)
# function = lambda x, y: x**2 + y**2
# integrate.quad(function, 0, np.inf)
def integral(x, y):
    # return integrate.quad(lambda t: np.sqrt((x**2 + y**2 - 2*x*y*np.cos(np.pi*t*(np.sqrt(1/x**3) - np.sqrt(1/y**3))))/(x**3*y**3)), 0, np.sqrt(x**3*y**3))[0]
    return integrate.quad(lambda t: t*x**2 + t*y**2, 0, x**2 + y**2)[0]

# X = np.arange(0.1, 5, 0.1)
# Y = np.arange(0.1, 5, 0.1)
X = np.arange(-10, 10, .1)
Y = np.arange(-10, 10, .1)
X, Y = np.meshgrid(X, Y)
Z = np.vectorize(integral)(X, Y)

fig = plt.figure()
ax = plt.axes(projection="3d")
ax.plot_wireframe(X, Y, Z, color='green')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                norm=plt.Normalize(np.nanmin(Z), np.nanmax(Z)),
                cmap='winter', edgecolor='none')

# ax.set_xlim(-10, 10); ax.set_ylim(-10, 10); ax.set_zlim(-10, 10)
plt.show()
[TODO] Plot common 2/3D dists alongside estimated:
- mean/variance
- probability wrt some specified bounds (a sketch of the latter follows)
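For the probability-wrt-bounds piece, a minimal sketch for a 2D Gaussian using the joint CDF and inclusion-exclusion over the rectangle's corners; the bounds and covariance here are arbitrary:

import numpy as np
from scipy.stats import multivariate_normal

distr = multivariate_normal(mean=[0, 0], cov=[[1, .8], [.8, 1]])
(x_lo, x_hi), (y_lo, y_hi) = (-1, 1), (-1, 1)

# P(x_lo < X < x_hi, y_lo < Y < y_hi) by inclusion-exclusion on the joint CDF
p = (distr.cdf([x_hi, y_hi]) - distr.cdf([x_lo, y_hi])
     - distr.cdf([x_hi, y_lo]) + distr.cdf([x_lo, y_lo]))
print(round(p, 3))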
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
random_seed = 1000

# Distribution params (3 covariance options, shared mean)
cov_val = [-0.8, 0, 0.8]
mean = np.array([0, 0])

# Container for density functions for further analysis
pdf_list = []

## Plot densities
fig = plt.figure(figsize=(8, 6))

# iterate over different covariance values
for idx, val in enumerate(cov_val):
    # Initialize the covariance matrix
    cov = np.array([[1, val], [val, 1]])

    # Generate Gaussian bivariate dist with given mean and covariance matrix
    distr = multivariate_normal(cov=cov, mean=mean,
                                seed=random_seed)

    # Generate a meshgrid compliant with the 3-sigma boundary
    mean_1, mean_2 = mean[0], mean[1]
    sigma_1, sigma_2 = cov[0, 0], cov[1, 1]

    x = np.linspace(-3*sigma_1, 3*sigma_1, num=100)
    y = np.linspace(-3*sigma_2, 3*sigma_2, num=100)
    X, Y = np.meshgrid(x, y)

    # Evaluate density for each point in the meshgrid
    pdf = np.zeros(X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            pdf[i, j] = distr.pdf([X[i, j], Y[i, j]])
    pdf_list.append(pdf)

    # Plot density function values
    ax = plt.subplot(1, 3, idx + 1, projection='3d')
    ax.plot_surface(X, Y, pdf, cmap='viridis')
    ax.axes.zaxis.set_ticks([])
    # plt.xlabel("$x_1$")
    # plt.ylabel("$x_2$")
    # plt.title(f'Covariance between x1 and x2 = {val}')

plt.tight_layout()
plt.show()

## Plot contour maps
fig = plt.figure(figsize=(8, 6))
for idx, val in enumerate(pdf_list):
    plt.subplot(2, 3, idx + 1, aspect=1)
    plt.contourf(X, Y, val, cmap='viridis')

    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    # plt.title(f'Covariance between x1 and x2 = {cov_val[idx]}')

plt.tight_layout()
plt.show()
### Groundwork (data, preprocess, decompose, model, and datashape inspection)
import numpy as np
from timeit import default_timer as timer
from sklearn import datasets
from sklearn import linear_model, naive_bayes, svm, tree, ensemble
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
# Mutable parameters and obtain colourmap
import cmasher as cmr

n_features = 7
n_classes = 5

# higher resolutions reduce pixelation and thin out decision boundary (300)
resolution = 300

# higher values crowd plot (100)
n_samples = 1000

cmap = cmr.get_sub_cmap('gnuplot2', .1, .8, N=n_classes)
color_list = [cmap(i)[:3] for i in range(cmap.N)]
# ListedColormap(color_list)

#
# (1) Make data
X, y = datasets.make_classification(
    n_samples=n_samples, n_features=n_features,
    n_informative=n_features, n_redundant=0,
    n_clusters_per_class=1, n_classes=n_classes,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=42)

# (2) Preprocess, decompose and project feature space for later plotting
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
# dr_model = PCA(n_components=2)
dr_model = KernelPCA(n_components=2, fit_inverse_transform=True)
X2D = dr_model.fit_transform(scaler.fit_transform(X_train))

# (3) Build, fit
# model = linear_model.LogisticRegression()
model = naive_bayes.GaussianNB()
# model = svm.SVC(kernel='rbf', C=6, probability=True, random_state=42)
# model = tree.DecisionTreeClassifier(max_depth=100)
# model = ensemble.RandomForestClassifier(n_estimators=100, max_depth=100, criterion='gini')
# model = MLPClassifier(
#     hidden_layer_sizes=np.size(X_train, 1) * 1,
#     activation='relu', solver='adam',
#     batch_size=round(.01 * len(X)),
#     alpha=.001, momentum=.9
# )

t_start = timer()
model.fit(X_scaled, y_train)
t_end = timer()
t_elapsed = round(t_end - t_start, 4)

# Get projected scatter boundaries for plotting
x_min = X2D[:, 0].min() - 0.1 * (X2D[:, 0].max() - X2D[:, 0].min())
x_max = X2D[:, 0].max() + 0.1 * (X2D[:, 0].max() - X2D[:, 0].min())
y_min = X2D[:, 1].min() - 0.1 * (X2D[:, 1].max() - X2D[:, 1].min())
y_max = X2D[:, 1].max() + 0.1 * (X2D[:, 1].max() - X2D[:, 1].min())
xrg = [x_min, x_max]; yrg = [y_min, y_max]
axis = [x_min, x_max, y_min, y_max]

# (4a) Define projected inference grid wrt original feature space (inversion)
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 1. * (x_max - x_min) / resolution),
    np.arange(y_min, y_max, 1. * (y_max - y_min) / resolution)
)
X_grid = np.c_[xx.ravel(), yy.ravel()]
X_grid_inverse = scaler.inverse_transform(dr_model.inverse_transform(X_grid))
# assert np.all(list(zip(np.ravel(xx), np.ravel(yy))) == X_grid)

# (4b) Get probabilistic and deterministic predictions over inference grid
Zpp = model.predict_proba(X_grid_inverse)
Zp = model.predict(X_grid_inverse)
# Z = svm.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

# (P1) Obtain decision boundary line within projected feature space
# as grid cells whose predicted class differs from a neighbouring cell's
Zp_grid = Zp.reshape(xx.shape)
Zb = np.zeros(xx.shape)
Zb[:-1, :] = np.maximum((Zp_grid[:-1, :] != Zp_grid[1:, :]), Zb[:-1, :])
Zb[1:, :] = np.maximum((Zp_grid[:-1, :] != Zp_grid[1:, :]), Zb[1:, :])
Zb[:, :-1] = np.maximum((Zp_grid[:, :-1] != Zp_grid[:, 1:]), Zb[:, :-1])
Zb[:, 1:] = np.maximum((Zp_grid[:, :-1] != Zp_grid[:, 1:]), Zb[:, 1:])

# (P2) Obtain graded probability surface within projected feature space
# as [(n_samples, n_classes) probability surface] * [(n_classes, 3) color vec]
# yielding (Zp_grid size) grid with RGB color vector values
colors = np.array(color_list[0:n_classes])
zz = np.dot(Zpp, colors)
zz_r = zz.reshape(xx.shape[0], xx.shape[1], colors.shape[1])

#
# (1-2) Attributes, Targets > Features > Decomposed features >
# (4a) Inference grid, Ravelled into features, Inverted to og feature space >
# (4b) Probabilistic + Deterministic predictions,
# (P1) Unravelled into grid for plotting > Decision boundary line >
# (P2) Colour-graded probability surface
display('X', 'y', 'X_scaled', 'X2D',
        'xx', 'yy', 'X_grid', 'X_grid_inverse',
        'Zpp', 'Zp',
        'Zp_grid', '(Zp_grid[:-1, :])', '(Zp_grid[1:, :])', '(Zb[:-1, :])', 'Zb',
        'colors', 'zz', 'zz_r',
        )
### Plotting: illustrative plot of probability surface
# [TODO] implement cross-validation here instead of a single train/test split
# [TODO] define search grids for each model and complexity knobs for val
# [TODO] pipe prep & model & plot grid/rand search results below as heat or lol
# [TODO] plot training/validation curve & learning curve below on 1 line
# [TODO] incorporate all models from sklearn classifier comparison plot
# [TODO] repeat for clustering, manifold
import matplotlib.patches as mpatches

axis_lim = [min(xrg[0], yrg[0]), max(xrg[1], yrg[1])]
axis_equal = axis_lim + axis_lim

# Prep test data identically to training data, evaluate
X_scaled_test = scaler.transform(X_test)  # transform only: reuse the training fit
y_pred = model.predict(X_scaled_test)
accuracy_val = accuracy_score(y_pred=y_pred, y_true=y_test)

y_pred_train = model.predict(X_scaled)
accuracy_train = accuracy_score(y_pred=y_pred_train, y_true=y_train)

X2D_test = dr_model.transform(X_scaled_test)

#
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Boundary line
ax.imshow(Zb, origin='lower', interpolation=None, cmap='Greys',
          alpha=1.0, extent=axis_equal)

# Probability surface
ax.imshow(zz_r, origin='lower', interpolation=None,
          alpha=.7, extent=axis_equal)

# Scatter
ax.scatter(X2D[:, 0], X2D[:, 1], c=[colors[i, :] for i in y_train],
           # linewidths=.5, edgecolors='k',
           )

#
# Add legend
colors_bar = []
for v1 in colors[:n_classes, :]:
    v1 = list(v1)
    v1.append(.7)
    colors_bar.append(v1)

# create a patch (proxy artist) for every color
patches = [mpatches.Patch(color=colors_bar[i],
                          label="Class {k}".format(k=i))
           for i in range(n_classes)]

# put those patches as legend-handles into the legend
plt.legend(handles=patches, bbox_to_anchor=(1.05, 1),
           loc=2, borderaxespad=0., framealpha=0.5)

plt.grid()
# ax.set_aspect('equal', adjustable='box')
ax.set_xlim(axis_lim)
ax.set_ylim(axis_lim)

if dr_model is None:
    plt.xlabel('Raw axis $x_1$')
    plt.ylabel('Raw axis $x_2$')
else:
    plt.xlabel('Dimension reduced axis 1')
    plt.ylabel('Dimension reduced axis 2')

ax.set_title(f"{str(model)[:]}\ndecision surface", size=10)

ax.text(axis_lim[1] - 0.3, axis_lim[0] + 0.3,
        f"Training accuracy: {accuracy_train}, Validation accuracy: {accuracy_val}\n"
        f"Training time: {t_elapsed}s",
        size=9,
        horizontalalignment="right",
        bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
        # transform=ax.transAxes,
        )

plt.suptitle(f"{n_classes} classes by {n_features} features\n"
             f"{len(X_train)} training samples, {len(X_test)} testing samples",
             x=1 - .4, y=0 + .025,
             # ha='center', va='bottom',
             )
plt.show()

# confusion_matrix(y_pred=y_pred, y_true=y_test)
confusion_matrix(y_pred=y_pred_train, y_true=y_train)
# 2-class classifier plot with decision boundary
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
# Make data and preprocess
n_classes = 2
X, y = make_classification(n_samples=500, n_features=2,
                           n_informative=2, n_redundant=0,
                           n_clusters_per_class=1, n_classes=n_classes,
                           )
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build classifier, fit, and predict
svm = SVC(kernel='rbf', random_state=42, probability=True)
svm.fit(X_scaled, y)
xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
Z = svm.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

# Plot scatter with decision surface
with sns.axes_style('whitegrid'):
    plt.figure()
    plt.contourf(xx, yy, Z, cmap='PiYG', alpha=0.8)  # cmap='coolwarm'
    plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y,
                cmap=ListedColormap(["#b30065", "#178000"]),
                # cmap=cmr.get_sub_cmap('PiYG', .1, .9, N=n_classes),
                # cmap='viridis',
                )
    plt.grid()
    plt.xlim(-3, 3); plt.ylim(-3, 3)
    plt.xlabel('Feature 1'); plt.ylabel('Feature 2')
    plt.title('2-class classifier with scatter and decision surface')
    plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay
# Data
iris = datasets.load_iris()
X = iris.data[:, :2]  # only take first two features
Y = iris.target       # 3 classes

# Build classifier & fit
clf = svm.SVC(kernel='rbf')
clf.fit(X, Y)

# Plot model
# h = 0.02  # step size in the mesh
with sns.axes_style('whitegrid'):
    plt.figure()
    ax = plt.gca()
    DecisionBoundaryDisplay.from_estimator(
        clf, X, ax=ax,
        cmap=plt.cm.viridis_r, grid_resolution=100,
        response_method="predict", plot_method="pcolormesh", shading="auto",
        alpha=.85
    )

    # Plot scatter
    plt.scatter(X[:, 0], X[:, 1], c=Y, s=20,
                cmap=plt.cm.viridis_r, edgecolors="k", linewidths=1, alpha=.85,
                )
    plt.xlabel('Feature 1'); plt.ylabel('Feature 2')
    plt.title("3-class classifier with scatter and decision boundary")
    plt.axis("tight")
    plt.show()
# Generative kernels and graded densities
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
# Get data & preprocess
X, y = make_blobs(n_samples=500, centers=3, random_state=42, cluster_std=3.0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model & fit
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X_scaled)

# Build grid & predict
h = 0.02
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = -gmm.score_samples(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot model & scatter
with sns.axes_style('whitegrid'):
    plt.figure()
    plt.contourf(xx, yy, Z, levels=20, alpha=0.8, cmap='viridis')
    scatter = sns.scatterplot(
        x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y, s=20,
        palette='pastel', edgecolor='black', alpha=.9,
    )
    # Plot centers
    plt.scatter(gmm.means_[:, 0], gmm.means_[:, 1],
                # color='red', marker='o', s=30, alpha=.85,
                color='white', marker='o', s=75,
                edgecolor='black', linewidths=.5,
                )
    plt.xlabel('Feature 1'); plt.ylabel('Feature 2')
    plt.xlim(x_min, x_max); plt.ylim(y_min, y_max)
    plt.title('3-class clustering with scatter and densities')

    handles, labels = scatter.get_legend_handles_labels()
    plt.legend(handles, ['Cluster ' + item for item in labels],
               loc='best',  # bbox_to_anchor=(1, 0.5),
               frameon=1,
               )
    plt.show()
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Make models
names = [
    # "Nearest Neighbors",
    # "Linear SVM",
    "RBF SVM",
    # "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    # "AdaBoost",
    "Naive Bayes",
    # "QDA",
]
classifiers = [
    # KNeighborsClassifier(3),
    # SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    # GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    # AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    # QuadraticDiscriminantAnalysis(),
]

## Make datasets
X, y = make_classification(
    n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1
)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable,
]

## Plot
figure = plt.figure(figsize=(8, 6))  # figure = plt.figure(figsize=(27, 9))
# iterate over datasets
i = 1
for ds_cnt, ds in enumerate(datasets):
    ## Split into training and test part
    X, y = ds
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=42
    )

    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    ## Plot the dataset
    cm = plt.cm.PiYG  # cm = plt.cm.RdBu
    cm_bright = ListedColormap(["#b30065", "#178000"])  # cm_bright = ListedColormap(["#FF0000", "#0000FF"])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")

    # Plot the training points & testing points (latter slightly faded)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
               cmap=cm_bright, edgecolors="k",
               )
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
               cmap=cm_bright, edgecolors="k", alpha=0.6,
               )

    # Plotting adjustments
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)

        ## Preprocess, fit, and plot fit
        clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        DecisionBoundaryDisplay.from_estimator(
            clf, X, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Plot the training points & testing points (latter slightly faded)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                   cmap=cm_bright, edgecolors="k",
                   )
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                   cmap=cm_bright, edgecolors="k", alpha=0.6,
                   )

        # Plotting adjustments & annotation
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=12,
            horizontalalignment="right",
            bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
            # transform=ax.transAxes,
        )
        i += 1

figure.suptitle('Classifier comparison')
# figure.supxlabel('x')
# figure.supylabel('y')
plt.tight_layout()
plt.show()
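The in-panel accuracy scores are easy to misread at this figure size, so a small companion table can help. A sketch (assumes pandas is available; it refits the same pipelines outside the plotting loop):

# Collect the test scores into a tidy classifier-by-dataset table.
import pandas as pd

rows = []
for ds_cnt, (Xd, yd) in enumerate(datasets):
    X_train, X_test, y_train, y_test = train_test_split(
        Xd, yd, test_size=0.4, random_state=42
    )
    for name, clf in zip(names, classifiers):
        pipe = make_pipeline(StandardScaler(), clf)
        pipe.fit(X_train, y_train)
        rows.append({"dataset": ds_cnt, "classifier": name,
                     "accuracy": pipe.score(X_test, y_test)})
scores = pd.DataFrame(rows).pivot(index="classifier", columns="dataset",
                                  values="accuracy")
print(scores.round(2))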
# [TODO] polish this
import time
import warnings
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

# ============
# Generate datasets. The size is large enough to show how the algorithms
# scale, but small enough to keep running times short.
# ============
n_samples = 500
seed = 30
noisy_circles = datasets.make_circles(
    n_samples=n_samples, factor=0.5, noise=0.05, random_state=seed
)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=seed)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=seed)
rng = np.random.RandomState(seed)
no_structure = rng.rand(n_samples, 2), None

# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs with varied variances
varied = datasets.make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)

# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(12, 8))  # (9 * 2 + 3, 13)
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.95, wspace=0.05, hspace=0.01
)

plot_num = 1
default_base = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 3,
    "n_clusters": 3,
    "min_samples": 7,
    "xi": 0.05,
    "min_cluster_size": 0.1,
    "allow_single_cluster": True,
    "hdbscan_min_cluster_size": 15,
    "hdbscan_min_samples": 3,
    "random_state": 42,
}

datasets = [
    (
        noisy_circles,
        {
            "damping": 0.77,
            "preference": -240,
            "quantile": 0.2,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.08,
        },
    ),
    (
        noisy_moons,
        {
            "damping": 0.75,
            "preference": -220,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.1,
        },
    ),
    (
        varied,
        {
            "eps": 0.18,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.01,
            "min_cluster_size": 0.2,
        },
    ),
    (
        aniso,
        {
            "eps": 0.15,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.1,
            "min_cluster_size": 0.2,
        },
    ),
    (blobs, {"min_samples": 7, "xi": 0.1, "min_cluster_size": 0.2}),
    (no_structure, {}),
]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params["n_neighbors"], include_self=False
    )
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(
        n_clusters=params["n_clusters"],
        random_state=params["random_state"],
    )
    ward = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
    )
    spectral = cluster.SpectralClustering(
        n_clusters=params["n_clusters"],
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        random_state=params["random_state"],
    )
    dbscan = cluster.DBSCAN(eps=params["eps"])
    # hdbscan = cluster.HDBSCAN(
    #     min_samples=params["hdbscan_min_samples"],
    #     min_cluster_size=params["hdbscan_min_cluster_size"],
    #     allow_single_cluster=params["allow_single_cluster"],
    # )
    optics = cluster.OPTICS(
        min_samples=params["min_samples"],
        xi=params["xi"],
        min_cluster_size=params["min_cluster_size"],
    )
    affinity_propagation = cluster.AffinityPropagation(
        damping=params["damping"],
        preference=params["preference"],
        random_state=params["random_state"],
    )
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        metric="cityblock",
        n_clusters=params["n_clusters"],
        connectivity=connectivity,
    )
    birch = cluster.Birch(n_clusters=params["n_clusters"])
    gmm = mixture.GaussianMixture(
        n_components=params["n_clusters"],
        covariance_type="full",
        random_state=params["random_state"],
    )

    clustering_algorithms = (
        ("MiniBatch\nKMeans", two_means),
        ("Affinity\nPropagation", affinity_propagation),
        ("MeanShift", ms),
        ("Spectral\nClustering", spectral),
        ("Ward", ward),
        ("Agglomerative\nClustering", average_linkage),
        ("DBSCAN", dbscan),
        # ("HDBSCAN", hdbscan),
        ("OPTICS", optics),
        ("BIRCH", birch),
        ("Gaussian\nMixture", gmm),
    )
    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the "
                + "connectivity matrix is [0-9]{1,2}"
                + " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning,
            )
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding"
                + " may not work as expected.",
                category=UserWarning,
            )
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name)  # size=18

        colors = np.array(
            list(
                islice(
                    cycle(
                        [
                            "#377eb8",
                            "#ff7f00",
                            "#4daf4a",
                            "#f781bf",
                            "#a65628",
                            "#984ea3",
                            "#999999",
                            "#e41a1c",
                            "#dede00",
                        ]
                    ),
                    int(max(y_pred) + 1),
                )
            )
        )
        # add black color for outliers (if any)
        colors = np.append(colors, ["#000000"])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            # size=15,
            horizontalalignment="right",
        )
        plot_num += 1

plt.show()
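A quick quantitative companion to the visual grid, run after the loop — so it scores whichever dataset was fitted last; in practice you would compute this inside the loop, per dataset. A sketch:

# Silhouette scores for the most recently fitted dataset. The score needs at
# least two distinct labels, so guard against single-cluster results.
from sklearn.metrics import silhouette_score

for name, algorithm in clustering_algorithms:
    labels = (algorithm.labels_ if hasattr(algorithm, "labels_")
              else algorithm.predict(X))
    pretty = name.replace("\n", " ")
    if len(set(labels)) > 1:
        print("%25s  silhouette: %.2f" % (pretty, silhouette_score(X, labels)))
    else:
        print("%25s  silhouette: n/a (single cluster)" % pretty)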
# [TODO] polish this
# Author: Jaques Grobler <jaques.grobler@inria.fr>
# License: BSD 3 clause
from time import time

import matplotlib.pyplot as plt

# Unused but required import for doing 3d projections with matplotlib < 3.2
import mpl_toolkits.mplot3d  # noqa: F401
import numpy as np
from matplotlib.ticker import NullFormatter
from sklearn import manifold
from sklearn.utils import check_random_state

# Variables for manifold learning.
n_neighbors = 10
n_samples = 1000

# Create our sphere.
random_state = check_random_state(0)
p = random_state.rand(n_samples) * (2 * np.pi - 0.55)
t = random_state.rand(n_samples) * np.pi

# Sever the poles from the sphere.
indices = (t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8)))
colors = p[indices]
x, y, z = (
    np.sin(t[indices]) * np.cos(p[indices]),
    np.sin(t[indices]) * np.sin(p[indices]),
    np.cos(t[indices]),
)

# Plot our dataset.
fig = plt.figure(figsize=(10, 6))  # (15, 8)
plt.suptitle(
    "Manifold Learning with %i points, %i neighbors" % (1000, n_neighbors),
    # fontsize=14
)

ax = fig.add_subplot(251, projection="3d")
ax.scatter(x, y, z, c=p[indices], cmap=plt.cm.rainbow)
ax.view_init(40, -10)

sphere_data = np.array([x, y, z]).T
# Perform Locally Linear Embedding Manifold learning
methods = ["standard", "ltsa", "hessian", "modified"]
labels = ["LLE", "LTSA", "Hessian LLE", "Modified LLE"]

for i, method in enumerate(methods):
    t0 = time()
    trans_data = (
        manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors, n_components=2, method=method, random_state=42
        )
        .fit_transform(sphere_data)
        .T
    )
    t1 = time()
    print("%s: %.2g sec" % (methods[i], t1 - t0))

    ax = fig.add_subplot(252 + i)
    plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
    plt.title("%s (%.2g sec)" % (labels[i], t1 - t0))
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    plt.axis("tight")

# Perform Isomap Manifold learning.
t0 = time()
trans_data = (
    manifold.Isomap(n_neighbors=n_neighbors, n_components=2)
    .fit_transform(sphere_data)
    .T
)
t1 = time()
print("%s: %.2g sec" % ("ISO", t1 - t0))

ax = fig.add_subplot(257)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("%s (%.2g sec)" % ("Isomap", t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis("tight")

# Perform Multi-dimensional scaling.
t0 = time()
mds = manifold.MDS(2, max_iter=100, n_init=1, random_state=42)
trans_data = mds.fit_transform(sphere_data).T
t1 = time()
print("MDS: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(258)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("MDS (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis("tight")

# Perform Spectral Embedding.
t0 = time()
se = manifold.SpectralEmbedding(
    n_components=2, n_neighbors=n_neighbors, random_state=42
)
trans_data = se.fit_transform(sphere_data).T
t1 = time()
print("Spectral Embedding: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(259)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("Spectral Embedding (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis("tight")

# Perform t-distributed stochastic neighbor embedding.
t0 = time()
tsne = manifold.TSNE(n_components=2, random_state=0)
trans_data = tsne.fit_transform(sphere_data).T
t1 = time()
print("t-SNE: %.2g sec" % (t1 - t0))

ax = fig.add_subplot(2, 5, 10)
plt.scatter(trans_data[0], trans_data[1], c=colors, cmap=plt.cm.rainbow)
plt.title("t-SNE (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis("tight")

plt.show()
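t-SNE in particular is sensitive to initialization and perplexity; PCA initialization usually gives more stable, globally consistent layouts than random initialization. A sketch of that variant (`init` and `perplexity` are standard `TSNE` keyword arguments):

# PCA-initialized t-SNE on the same severed-sphere data.
tsne = manifold.TSNE(n_components=2, init="pca", perplexity=30, random_state=0)
trans_data = tsne.fit_transform(sphere_data).T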