Commit 4b64b4f1 authored by Leodegario Lorenzo II

Remove mltools directory

parent f2f2df74
"""Machine Learning Tools Package"""
from ._word_cloud import word_cloud
from ._exp_var import exp_var
from ._lsa_analysis import lsa_analysis

# _exp_var.py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
def exp_var(exp_var_ratio, tol=0.90):
"""Return explained variance plot given explained variance ratio
Parameters
----------
exp_var_ratio : numpy array
Array containing the percentage of variance explained by
each of the singular vectors.
tol : float, default=0.90
Default tolerance value for the optimal threshold
Returns
-------
fig, ax : matplotlib figure and axes
Figure and axes of the plot
"""
    # Get cumulative sum of the explained variance ratio
    cum_var = np.cumsum(exp_var_ratio)
    # Get first index where cum_var exceeds the tolerance
    thresh = np.min(np.arange(len(cum_var))[cum_var >= tol])
    # Initialize figure
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    # Plot explained variance
    ax.plot(range(len(cum_var)), cum_var, lw=4.0)
    # Plot threshold line
    ax.axvline(thresh, linestyle='--', lw=2.5, color='tab:orange')
    # Annotate threshold
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    ax.text(1.05*thresh, 0.05, f"Threshold: {int(thresh)}", color='tab:orange',
            weight='bold', fontsize=12, transform=trans)
    # Set axis limits
    ax.set_ylim([min(cum_var), 1.05])
    ax.set_xlim([0., len(cum_var)])
# Remove spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Set axis labels
ax.set_xlabel('Number of components', fontsize=14)
ax.set_ylabel('Cumulative explained variance', fontsize=14)
return fig, ax
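
# Hypothetical usage sketch (not part of the package): any non-negative
# array that sums to ~1 works, e.g. the explained_variance_ratio_ of a
# fitted TruncatedSVD or PCA. The synthetic spectrum below is made up.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    ratios = np.sort(rng.exponential(size=50))[::-1]
    ratios /= ratios.sum()
    fig, ax = exp_var(ratios, tol=0.90)
    plt.show()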

# _lsa_analysis.py
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
def lsa_analysis(df_lsa, lsa_components, column_1, column_2, margin=0.05,
hue=None, vals=None, figsize=(16, 5), wspace=0.03,
palette='default'):
"""
Plot the LSA graphs given df_lsa, lsa_components, components
Plots the word scatter plot projected along the chosen components, word
vectors, and word counts given the LSA dataframe and components
Parameters
----------
df_lsa : pandas Data Frame
Data Frame containing the SVD reduced BoW
lsa_components : pandas Data Frame
The LSA components data frame
column_1, column 2 : str
Singular vector columns to be inspected
margin : float, default=0.05
Text margin on subplot 2 for prettification
hue : numpy array, default=None
Discriminator array to use as hue
vals : list, default=None
Specify to focus on specific hue values for pair plot.
figsize : tuple, default=(16, 5)
Specify the figure size of the lsa analysis plot
wspace : float, default=0.03
Specify the horizontal space between subplots
palette : list of rgb, default='default'
palette to use for plotting
Returns
-------
fig, axes : matplotlib Figure and Axes
Figure and axes of the LSA analysis
"""
# Set color palette
if palette == 'default':
palette = sns.color_palette('tab10')
# Initialize figure
fig, axes = plt.subplots(1, 3, figsize=figsize,
gridspec_kw={'wspace': wspace})
# Set vals if not specified
if vals is None:
if hue is not None:
vals = set(hue)
    # Plot the document scatter along the two chosen components
if hue is not None:
for i, val in enumerate(vals):
axes[0].plot(df_lsa.loc[hue == val, column_1],
df_lsa.loc[hue == val, column_2], 'o',
color=palette[i],
label=val)
axes[0].legend()
else:
axes[0].plot(df_lsa.loc[:, column_1], df_lsa.loc[:, column_2], 'o')
# Set axis labels
axes[0].set_xlabel(column_1, fontsize=12)
axes[0].set_ylabel(column_2, fontsize=12)
# Remove spines
for spine in ['top', 'right']:
axes[0].spines[spine].set_visible(False)
axes[2].spines[spine].set_visible(False)
    # Stack the two chosen component columns into an (n_terms, 2) array
    lsas = np.column_stack([lsa_components.loc[:, column_1],
                            lsa_components.loc[:, column_2]])
    # Compute word weights as vector norms, then take the top 20 indices
    weights = np.linalg.norm(lsas, axis=1)
    indices = weights.argsort()[-20:]
# Get features
features = lsa_components.index
# Iterate through all top features
for feature, vec in zip(features[indices], lsas[indices]):
# Draw vector representation
axes[1].annotate('', xy=(vec[0], vec[1]), xycoords='data',
xytext=(0, 0), textcoords='data',
arrowprops=dict(facecolor=palette[0],
edgecolor='none'))
# Draw corresponding word
axes[1].text(vec[0], vec[1], feature, ha='center', color=palette[1],
fontsize=12, weight='bold', zorder=10)
# Adjust xlim and ylim
xlim = [np.min(lsa_components.loc[:, column_1]),
np.max(lsa_components.loc[:, column_1])]
xlim_range = xlim[1] - xlim[0]
ylim = [np.min(lsa_components.loc[:, column_2]),
np.max(lsa_components.loc[:, column_2])]
ylim_range = ylim[1] - ylim[0]
axes[1].set_xlim(xlim[0] - xlim_range*margin,
xlim[1] + xlim_range*margin)
axes[1].set_ylim(ylim[0] - ylim_range*margin,
ylim[1] + ylim_range*margin)
    # Hide ticks and tick labels on the vector plot
    axes[1].tick_params(axis='both', which='both', top=False, bottom=False,
                        labelbottom=False, labelleft=False, left=False)
# Set axis labels
axes[1].set_xlabel(column_1, fontsize=12)
axes[1].set_ylabel(column_2, fontsize=12)
    # Plot the top word weights as horizontal bars
axes[2].barh(features[indices], weights[indices])
# Set axis labels for last axis
axes[2].set_xlabel('Weights')
# Remove spines
for spine in ['top', 'right', 'left', 'bottom']:
axes[1].spines[spine].set_visible(False)
return fig, axes
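
# Hypothetical usage sketch (not part of the package): random stand-ins
# for df_lsa and lsa_components; real inputs would come from an
# SVD-reduced BoW, with columns named like get_lsa's 'SV 1', 'SV 2'.
if __name__ == '__main__':
    import pandas as pd
    rng = np.random.default_rng(0)
    words = [f'word{i}' for i in range(30)]
    df_lsa = pd.DataFrame(rng.normal(size=(100, 2)),
                          columns=['SV 1', 'SV 2'])
    lsa_components = pd.DataFrame(rng.normal(size=(30, 2)),
                                  index=words, columns=['SV 1', 'SV 2'])
    labels = rng.integers(0, 2, size=100)
    fig, axes = lsa_analysis(df_lsa, lsa_components, 'SV 1', 'SV 2',
                             hue=labels)
    plt.show()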

# _word_cloud.py
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
def word_cloud(text, figsize=(17, 8), width=500, height=500,
stopwords=None, bg_color=None, ax=None,
**kwargs):
"""Generate wordcloud given text corpus
Parameters
----------
text : str
Text corpus to be plotted
figsize : tuple, default=(17, 8)
Figure size of the subplot
width : float, default=500
Desired width of the wordcloud
height : float, default=500
Desired height of the wordcloud
stopwords : iterable, default=None
Stopwords to be used, if None uses wordcloud's default
stopwords
bg_color : str, default=None
Background color to be used. If None, sets this as
"rgba(255, 255, 255, 0)"
ax : matplotlib.Axes, default=None
Set axes if wc is to be plotted on an axes
**kwargs : other keyword arguments
Other keyword arguments to pass on the word cloud
Returns
-------
fig, ax : matplotlib figure and axes
Figure and axes of the subplots
"""
# Initialize subplots
if ax is None:
fig, ax = plt.subplots(figsize=figsize)
else:
fig = plt.gcf()
# Set stopwords
if stopwords is None:
stopwords = set(STOPWORDS)
if bg_color is None:
bg_color = "rgba(255, 255, 255, 0)"
# Initialize WordCloud class
wc = WordCloud(
background_color=bg_color,
stopwords=stopwords,
width=width,
height=height,
**kwargs
)
wc.generate(text)
# Generate visualization
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
return fig, ax
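
# Hypothetical usage sketch (not part of the package): the corpus
# string below is made up; any str works.
if __name__ == '__main__':
    text = ('latent semantic analysis maps words and documents '
            'into a shared low dimensional space')
    fig, ax = word_cloud(text, width=800, height=400, bg_color='white')
    plt.show()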

# __init__.py
from .tfidf import get_tfidf
from .lsa import get_lsa

# lsa.py
import pandas as pd
from sklearn.decomposition import TruncatedSVD
def get_lsa(bow, random_state=1, **kwargs):
"""Return LSA and its components given bow/TF-IDF data
Parameters
----------
bow : pandas DataFrame
DataFrame containing bag of words or TF-IDF matrix
random_state : int, default=1
Random state to be used in the LSA
**kwargs : keyword arguments
Other keyword arguments to pass on TruncatedSVD
Returns
-------
df_lsa, lsa_components : pandas DataFrame
DataFrame containing LSA matrix and its components
explained_variance_ratio : numpy array
Explained variance ratio of each SV
"""
    # Perform truncated SVD with the maximum allowed number of
    # components (TruncatedSVD requires n_components < n_features)
    svd = TruncatedSVD(n_components=bow.shape[1] - 1,
                       random_state=random_state,
                       **kwargs)
lsa = svd.fit_transform(bow)
# Set columns for results data frame
cols = [f'SV {i + 1}' for i in range(lsa.shape[1])]
# Create results data frame
df_lsa = pd.DataFrame(lsa, columns=cols, index=bow.index)
lsa_components = pd.DataFrame(svd.components_, index=cols,
columns=bow.columns)
return df_lsa, lsa_components.T, svd.explained_variance_ratio_
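
# Hypothetical usage sketch (not part of the package): a tiny made-up
# bag of words; n_components is n_features - 1 = 2 here.
if __name__ == '__main__':
    bow = pd.DataFrame([[1, 0, 2], [0, 1, 1], [2, 1, 0]],
                       columns=['apple', 'banana', 'cherry'])
    df_lsa, lsa_components, evr = get_lsa(bow)
    print(df_lsa.shape)          # (3, 2): documents x singular vectors
    print(lsa_components.shape)  # (3, 2): terms x singular vectors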

# tfidf.py
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tfidf(docs, token_pattern=r'\S+', min_df=0.01,
max_features=2000, return_df=True, **kwargs):
"""Return TF-IDF matrix given documents
Uses TfidfVectorizer to get the TF-IDF weightedbag of words
representation of the given documents.
Parameters
----------
docs : iterable of str
Documents
token_pattern : regex pattern, default=r'\S+'
Token pattern to use in the token extraction
min_df : float, default=0.01
Min df setting for TF-IDF Vectorizer
max_features : int, default=2000
Number of features to consider
return_df : bool, default=True
Whether to return result as data frame
**kwargs : other keyword arguments
Other keyword arguments to pass on TfidfVectorizer
Returns
-------
tfidf : sparser matrix or pandas DataFrame
TF-IDF weighted bag of words representation
"""
# Initialize vectorizer
vectorizer = TfidfVectorizer(min_df=min_df, token_pattern=token_pattern,
max_features=max_features, **kwargs)
# Fit vectorizer
vectorizer.fit(docs)
    # Transform the documents into the TF-IDF matrix
    tfidf = vectorizer.transform(docs)
    # Convert to a DataFrame if requested
    # (get_feature_names_out requires scikit-learn >= 1.0)
    if return_df:
        tfidf = pd.DataFrame(
            data=tfidf.toarray(),
            columns=vectorizer.get_feature_names_out(),
        )
    return tfidf
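
# Hypothetical usage sketch (not part of the package): the corpus is
# made up, and min_df=1 keeps every token of this tiny corpus.
if __name__ == '__main__':
    docs = ['the cat sat on the mat',
            'the dog sat on the log',
            'cats and dogs make good pets',
            'mats and logs are things']
    tfidf = get_tfidf(docs, min_df=1)
    print(tfidf.shape)  # (4, n_unique_tokens)
    print(tfidf.round(2).head())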