Commit 55547cba authored by Leodegario Lorenzo II's avatar Leodegario Lorenzo II
Browse files

Add get tfidf module

parent ef9df90d
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tfidf(docs, token_pattern=r'\S+', min_df=0.01,
              max_features=2000, return_df=True, **kwargs):
    """Return the TF-IDF matrix of the given documents.

    Uses ``TfidfVectorizer`` to build the TF-IDF weighted bag-of-words
    representation of the given documents.

    Parameters
    ----------
    docs : iterable of str
        Documents to vectorize. The iterable is consumed exactly once,
        so one-shot iterables such as generators are supported.
    token_pattern : regex pattern, default=r'\S+'
        Token pattern used for token extraction; forwarded to
        ``TfidfVectorizer`` as ``token_pattern``.
    min_df : float, default=0.01
        Minimum document frequency; forwarded to ``TfidfVectorizer`` as
        ``min_df``.
    max_features : int, default=2000
        Maximum number of features to keep; forwarded to
        ``TfidfVectorizer`` as ``max_features``.
    return_df : bool, default=True
        Whether to return the result as a pandas DataFrame instead of
        the sparse matrix produced by the vectorizer.
    **kwargs : other keyword arguments
        Additional keyword arguments passed on to ``TfidfVectorizer``.

    Returns
    -------
    tfidf : sparse matrix or pandas DataFrame
        TF-IDF weighted bag-of-words representation. When ``return_df``
        is true, a dense DataFrame with one column per extracted
        feature; otherwise the vectorizer's sparse output unchanged.
    """
    # Initialize vectorizer
    vectorizer = TfidfVectorizer(min_df=min_df, token_pattern=token_pattern,
                                 max_features=max_features, **kwargs)

    # Fit and transform in a single pass. Calling fit() and then
    # transform() would iterate `docs` twice, which silently yields an
    # empty matrix when `docs` is a generator.
    tfidf = vectorizer.fit_transform(docs)

    if not return_df:
        return tfidf

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray rather than the legacy
    # numpy.matrix returned by todense().
    return pd.DataFrame(data=tfidf.toarray(), columns=feature_names)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment