Commit f2f2df74 authored by Leodegario Lorenzo II
Browse files

Add LSA analysis plots

parent 08618d0f
from ._word_cloud import word_cloud
from ._exp_var import exp_var
from ._lsa_analysis import lsa_analysis
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def lsa_analysis(df_lsa, lsa_components, column_1, column_2, margin=0.05,
                 hue=None, vals=None, figsize=(16, 5), wspace=0.03,
                 palette='default'):
    """
    Plot a three-panel LSA summary for a pair of singular vectors.

    The panels are, from left to right:

    1. a scatter plot of the rows of `df_lsa` projected on `column_1` and
       `column_2`, optionally colored by `hue`;
    2. the 20 features of `lsa_components` with the largest Euclidean norm
       in the (`column_1`, `column_2`) plane, drawn as labelled arrows
       starting at the origin;
    3. a horizontal bar chart of those same 20 norms ("Weights").

    Parameters
    ----------
    df_lsa : pandas Data Frame
        Data Frame containing the SVD reduced BoW
    lsa_components : pandas Data Frame
        The LSA components data frame. Its index supplies the feature
        labels, its columns must include `column_1` and `column_2`.
    column_1, column_2 : str
        Singular vector columns to be inspected
    margin : float, default=0.05
        Fraction of the data range added on every side of the vector plot
        so that the text labels are not clipped
    hue : array-like, default=None
        Discriminator array to use as hue, one entry per row of `df_lsa`.
        Lists and tuples are converted to a numpy array; other array types
        (numpy array, pandas Series) are used as given.
    vals : list, default=None
        Hue values to plot. If None and `hue` is given, all distinct hue
        values are plotted, in sorted order when they can be sorted and in
        order of first appearance otherwise.
    figsize : tuple, default=(16, 5)
        Specify the figure size of the lsa analysis plot
    wspace : float, default=0.03
        Specify the horizontal space between subplots
    palette : list of rgb, default='default'
        Colors to use for plotting. 'default' selects seaborn's 'tab10'.
        Colors are reused cyclically when there are more hue values than
        colors.

    Returns
    -------
    fig, axes : matplotlib Figure and Axes
        Figure and axes of the LSA analysis

    Raises
    ------
    ValueError
        If `palette` contains no colors.
    """
    # Only a string can be the 'default' sentinel; comparing an array
    # palette against a string would produce an elementwise result.
    if isinstance(palette, str) and palette == 'default':
        palette = sns.color_palette('tab10')
    n_colors = len(palette)
    if n_colors == 0:
        raise ValueError('palette must contain at least one color')

    # A plain list/tuple compared with a scalar gives a single bool rather
    # than a row mask, so turn it into an array first. Series are left
    # alone to keep pandas' index alignment.
    if isinstance(hue, (list, tuple)):
        hue = np.asarray(hue)

    # Derive the hue values in a reproducible order so that the color
    # given to each value does not change between runs.
    if vals is None and hue is not None:
        try:
            vals = sorted(set(hue))
        except TypeError:
            # Labels that cannot be ordered: keep order of first appearance
            vals = list(dict.fromkeys(hue))

    # Initialize figure
    fig, axes = plt.subplots(1, 3, figsize=figsize,
                             gridspec_kw={'wspace': wspace})
    scatter_ax, vector_ax, weight_ax = axes

    # Panel 1: documents projected on the two chosen singular vectors
    if hue is not None:
        for i, val in enumerate(vals):
            mask = hue == val
            scatter_ax.plot(df_lsa.loc[mask, column_1],
                            df_lsa.loc[mask, column_2], 'o',
                            color=palette[i % n_colors],
                            label=val)
        scatter_ax.legend()
    else:
        scatter_ax.plot(df_lsa.loc[:, column_1], df_lsa.loc[:, column_2],
                        'o')
    scatter_ax.set_xlabel(column_1, fontsize=12)
    scatter_ax.set_ylabel(column_2, fontsize=12)

    # Remove the top/right spines of the scatter and bar panels
    for spine in ['top', 'right']:
        scatter_ax.spines[spine].set_visible(False)
        weight_ax.spines[spine].set_visible(False)

    # Feature coordinates in the (column_1, column_2) plane, shape (n, 2)
    lsas = np.column_stack((lsa_components.loc[:, column_1],
                            lsa_components.loc[:, column_2]))

    # Rank features by vector length and keep the 20 largest (ascending,
    # so the largest ends up at the top of the bar chart)
    weights = np.linalg.norm(lsas, axis=1)
    indices = weights.argsort()[-20:]
    features = lsa_components.index

    # Panel 2: one arrow from the origin plus a label per top feature
    arrow_color = palette[0]
    text_color = palette[1 % n_colors]
    for feature, vec in zip(features[indices], lsas[indices]):
        # An empty annotation is used purely to draw the arrow
        vector_ax.annotate('', xy=(vec[0], vec[1]), xycoords='data',
                           xytext=(0, 0), textcoords='data',
                           arrowprops=dict(facecolor=arrow_color,
                                           edgecolor='none'))
        vector_ax.text(vec[0], vec[1], feature, ha='center',
                       color=text_color, fontsize=12, weight='bold',
                       zorder=10)

    # Limits span all components (not only the top ones), padded by
    # `margin` times the range on each side
    xlim = [np.min(lsa_components.loc[:, column_1]),
            np.max(lsa_components.loc[:, column_1])]
    xlim_range = xlim[1] - xlim[0]
    ylim = [np.min(lsa_components.loc[:, column_2]),
            np.max(lsa_components.loc[:, column_2])]
    ylim_range = ylim[1] - ylim[0]
    vector_ax.set_xlim(xlim[0] - xlim_range*margin,
                       xlim[1] + xlim_range*margin)
    vector_ax.set_ylim(ylim[0] - ylim_range*margin,
                       ylim[1] + ylim_range*margin)

    # The vector plot carries no ticks or frame, only axis labels
    vector_ax.tick_params(axis='both', which='both', top=False,
                          bottom=False, labelbottom=False, labelleft=False,
                          left=False)
    vector_ax.set_xlabel(column_1, fontsize=12)
    vector_ax.set_ylabel(column_2, fontsize=12)
    for spine in ['top', 'right', 'left', 'bottom']:
        vector_ax.spines[spine].set_visible(False)

    # Panel 3: norms of the top features
    weight_ax.barh(features[indices], weights[indices])
    weight_ax.set_xlabel('Weights')

    return fig, axes
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment