Commit 5f0fe49e authored by Melvin Macapinlac's avatar Melvin Macapinlac
Browse files

Initial commit

parents
File added
# Auto detect text files and perform LF normalization
* text=auto
from flask import Flask, request, render_template
from utils import tokenize
import pickle
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
# Flask application object: templates are looked up in the "web" folder and
# static files are exposed under the "/data" URL prefix.
app = Flask(
    __name__,
    template_folder='web',
    static_url_path='/data',
)

# Deserialize one classifier per label, once, at import time so that the
# request handlers can reuse them. Paths are relative to the working directory.
model_toxic = joblib.load('./data/model_toxic.pkl')
model_severe_toxic = joblib.load('./data/model_severe_toxic.pkl')
model_identity_hate = joblib.load('./data/model_identity_hate.pkl')
model_insult = joblib.load('./data/model_insult.pkl')
model_obscene = joblib.load('./data/model_obscene.pkl')
model_threat = joblib.load('./data/model_threat.pkl')

# Vectorizer used to turn raw comment text into the models' input features.
tfidf = joblib.load('./data/tfidf.pkl')
@app.route('/')
def my_form():
    """Serve the site root by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/predict')
def load_predict():
    """Render ``predict.html`` without passing any template variables."""
    empty_form_page = render_template('predict.html')
    return empty_form_page
@app.route('/dataset')
def load_dataset():
    """Render the ``dataset.html`` template for the ``/dataset`` URL."""
    dataset_page = render_template('dataset.html')
    return dataset_page
@app.route('/predict', methods=['POST'])
def post_predict():
    """
    Score the submitted comment with every loaded model and show the results.

    Reads the ``text`` field of the posted form, vectorizes it with ``tfidf``
    and asks each model for ``predict_proba``. The value at column index 1 of
    the first row is used (presumably the positive-class probability - confirm
    against how the models were trained), converted to a percentage string
    with two decimals, and passed to ``predict.html`` together with the
    original text.
    """
    text = request.form['text']
    features = tfidf.transform([text])

    # Template variable name -> model that produces it, in evaluation order.
    scorers = (
        ('pred_toxic', model_toxic),
        ('pred_severe_toxic', model_severe_toxic),
        ('pred_identity_hate', model_identity_hate),
        ('pred_insult', model_insult),
        ('pred_obscene', model_obscene),
        ('pred_threat', model_threat),
    )

    # Run every model first, then format, mirroring a two-pass evaluation.
    raw_scores = [
        (label, model.predict_proba(features)[:, 1][0])
        for label, model in scorers
    ]
    percentages = {
        label: "{0:.2f}%".format(score * 100)
        for label, score in raw_scores
    }

    return render_template('predict.html', text=text, **percentages)
if __name__ == '__main__':
    # Only start the server when this file is executed as a script;
    # it listens on the loopback address, port 8082.
    app.run(port=8082, host='127.0.0.1')
\ No newline at end of file
File added
name: base
channels:
- conda-forge
- defaults
dependencies:
- _ipyw_jlab_nb_ext_conf=0.1.0=py37_0
- _tflow_select=2.3.0=mkl
- alabaster=0.7.12=py37_0
- anaconda-client=1.7.2=py37_0
- anaconda-navigator=1.9.6=py37_0
- anaconda-project=0.8.2=py37_0
- asn1crypto=0.24.0=py37_0
- astroid=2.1.0=py37_0
- astropy=3.1=py37he774522_0
- atomicwrites=1.2.1=py37_0
- attrs=18.2.0=py37h28b3542_0
- babel=2.6.0=py37_0
- backcall=0.1.0=py37_0
- backports=1.0=py37_1
- backports.os=0.1.1=py37_0
- backports.shutil_get_terminal_size=1.0.0=py37_2
- beautifulsoup4=4.6.3=py37_0
- bitarray=0.8.3=py37hfa6e2cd_0
- bkcharts=0.2=py37_0
- blas=1.0=mkl
- blaze=0.11.3=py37_0
- bleach=3.0.2=py37_0
- blosc=1.14.4=he51fdeb_0
- bokeh=1.0.2=py37_0
- boto=2.49.0=py37_0
- bottleneck=1.2.1=py37h452e1ab_1
- bzip2=1.0.6=hfa6e2cd_5
- ca-certificates=2019.11.27=0
- certifi=2019.11.28=py37_0
- cffi=1.11.5=py37h74b6da3_1
- chardet=3.0.4=py37_1
- click=7.0=py37_0
- click-plugins=1.1.1=py_0
- cligj=0.5.0=py37_0
- cloudpickle=0.6.1=py37_0
- clyent=1.2.2=py37_1
- colorama=0.4.1=py37_0
- comtypes=1.1.7=py37_0
- conda=4.8.1=py37_0
- conda-build=3.17.6=py37_0
- conda-env=2.6.0=1
- conda-package-handling=1.3.11=py37_0
- conda-verify=3.1.1=py37_0
- console_shortcut=0.1.1=3
- contextlib2=0.5.5=py37_0
- cryptography=2.4.2=py37h7a1dbc1_0
- curl=7.63.0=h2a8f88b_1000
- cycler=0.10.0=py37_0
- cython=0.29.2=py37ha925a31_0
- cytoolz=0.9.0.1=py37hfa6e2cd_1
- dask=1.0.0=py37_0
- dask-core=1.0.0=py37_0
- datashape=0.5.4=py37_1
- decorator=4.3.0=py37_0
- defusedxml=0.5.0=py37_1
- descartes=1.1.0=py37_0
- distributed=1.25.1=py37_0
- docutils=0.14=py37_0
- entrypoints=0.2.3=py37_2
- et_xmlfile=1.0.1=py37_0
- expat=2.2.5=he025d50_0
- fastcache=1.0.2=py37hfa6e2cd_2
- filelock=3.0.10=py37_0
- fiona=1.8.4=py37h22081e2_0
- flask=1.0.2=py37_1
- flask-cors=3.0.7=py37_0
- freetype=2.9.1=ha9979f8_1
- freexl=1.0.5=hfa6e2cd_0
- future=0.17.1=py37_0
- gdal=2.3.3=py37hdf43c64_0
- geopandas=0.4.1=py_0
- geos=3.7.1=h33f27b4_0
- get_terminal_size=1.0.0=h38e98db_0
- gevent=1.3.7=py37he774522_1
- glob2=0.6=py37_1
- graphviz=2.38=hfd603c8_2
- greenlet=0.4.15=py37hfa6e2cd_0
- hdf4=4.2.13=h712560f_2
- hdf5=1.10.4=h7ebc959_0
- heapdict=1.0.0=py37_2
- html5lib=1.0.1=py37_0
- icc_rt=2019.0.0=h0cc432a_1
- icu=58.2=ha66f8fd_1
- idna=2.8=py37_0
- imageio=2.4.1=py37_0
- imagesize=1.1.0=py37_0
- importlib_metadata=1.3.0=py37_0
- intel-openmp=2019.1=144
- ipykernel=5.1.0=py37h39e3cac_0
- ipython=7.2.0=py37h39e3cac_0
- ipython_genutils=0.2.0=py37_0
- ipywidgets=7.4.2=py37_0
- isort=4.3.4=py37_0
- itsdangerous=1.1.0=py37_0
- jdcal=1.4=py37_0
- jedi=0.13.2=py37_0
- jinja2=2.10=py37_0
- jpeg=9b=hb83a4c4_2
- jsonschema=2.6.0=py37_0
- jupyter=1.0.0=py37_7
- jupyter_client=5.2.4=py37_0
- jupyter_console=6.0.0=py37_0
- jupyter_core=4.4.0=py37_0
- jupyterlab=0.35.3=py37_0
- jupyterlab_server=0.2.0=py37_0
- kealib=1.4.7=h07cbb95_6
- keras-applications=1.0.8=py_0
- keras-preprocessing=1.1.0=py_1
- keyring=17.0.0=py37_0
- kiwisolver=1.0.1=py37h6538335_0
- krb5=1.16.1=hc04afaa_7
- lazy-object-proxy=1.3.1=py37hfa6e2cd_2
- libarchive=3.3.3=h0643e63_5
- libboost=1.67.0=hd9e427e_4
- libcurl=7.63.0=h2a8f88b_1000
- libgdal=2.3.3=h10f50ba_0
- libgpuarray=0.7.6=hfa6e2cd_0
- libiconv=1.15=h1df5818_7
- libkml=1.3.0=he5f2a48_4
- libmklml=2019.0.5=0
- libnetcdf=4.6.1=h411e497_2
- libpng=1.6.35=h2a8f88b_0
- libpq=11.2=h3235a2c_0
- libprotobuf=3.8.0=h7bd577a_0
- libpython=2.1=py37_0
- libsodium=1.0.16=h9d3ae62_0
- libspatialindex=1.8.5=h6538335_2
- libspatialite=4.3.0a=hc36aec2_19
- libssh2=1.8.0=h7a1dbc1_4
- libtiff=4.0.9=h36446d0_2
- libxml2=2.9.8=hadb2253_1
- libxslt=1.1.32=hf6f1972_0
- llvmlite=0.26.0=py37ha925a31_0
- locket=0.2.0=py37_1
- lxml=4.2.5=py37hef2cd61_0
- lz4-c=1.8.1.2=h2fa13f4_0
- lzo=2.10=h6df0209_2
- m2w64-binutils=2.25.1=5
- m2w64-bzip2=1.0.6=6
- m2w64-crt-git=5.0.0.4636.2595836=2
- m2w64-gcc=5.3.0=6
- m2w64-gcc-ada=5.3.0=6
- m2w64-gcc-fortran=5.3.0=6
- m2w64-gcc-libgfortran=5.3.0=6
- m2w64-gcc-libs=5.3.0=7
- m2w64-gcc-libs-core=5.3.0=7
- m2w64-gcc-objc=5.3.0=6
- m2w64-gmp=6.1.0=2
- m2w64-headers-git=5.0.0.4636.c0ad18a=2
- m2w64-isl=0.16.1=2
- m2w64-libiconv=1.14=6
- m2w64-libmangle-git=5.0.0.4509.2e5a9a2=2
- m2w64-libwinpthread-git=5.0.0.4634.697f757=2
- m2w64-make=4.1.2351.a80a8b8=2
- m2w64-mpc=1.0.3=3
- m2w64-mpfr=3.1.4=4
- m2w64-pkg-config=0.29.1=2
- m2w64-toolchain=5.3.0=7
- m2w64-tools-git=5.0.0.4592.90b8472=2
- m2w64-windows-default-manifest=6.4=3
- m2w64-winpthreads-git=5.0.0.4634.697f757=2
- m2w64-zlib=1.2.8=10
- mapclassify=2.0.1=py_0
- markdown=3.1.1=py37_0
- markupsafe=1.1.0=py37he774522_0
- matplotlib=3.0.2=py37hc8f65d3_0
- mccabe=0.6.1=py37_1
- menuinst=1.4.14=py37hfa6e2cd_0
- mistune=0.8.4=py37he774522_0
- mkl=2019.1=144
- mkl_fft=1.0.6=py37h6288b17_0
- mkl_random=1.0.2=py37h343c172_0
- mock=3.0.5=py37_0
- more-itertools=4.3.0=py37_0
- mpmath=1.1.0=py37_0
- msgpack-python=0.5.6=py37he980bc4_1
- msys2-conda-epoch=20160418=1
- multipledispatch=0.6.0=py37_0
- munch=2.3.2=py37_0
- navigator-updater=0.2.1=py37_0
- nbconvert=5.4.0=py37_1
- nbformat=4.4.0=py37_0
- networkx=2.2=py37_1
- nltk=3.4=py37_1
- nose=1.3.7=py37_2
- notebook=5.7.4=py37_0
- numba=0.41.0=py37hf9181ef_0
- numexpr=2.6.8=py37hdce8814_0
- numpydoc=0.8.0=py37_0
- odo=0.5.1=py37_0
- olefile=0.46=py37_0
- openpyxl=2.5.12=py37_0
- openssl=1.1.1c=he774522_1
- packaging=18.0=py37_0
- pandas=0.25.1=py37ha925a31_0
- pandoc=1.19.2.1=hb2460c7_1
- pandocfilters=1.4.2=py37_1
- parso=0.3.1=py37_0
- partd=0.3.9=py37_0
- path.py=11.5.0=py37_0
- pathlib2=2.3.3=py37_0
- patsy=0.5.1=py37_0
- pcre=8.43=ha925a31_0
- pep8=1.7.1=py37_0
- pickleshare=0.7.5=py37_0
- pillow=5.3.0=py37hdc69c19_0
- pkginfo=1.4.2=py37_1
- pluggy=0.8.0=py37_0
- ply=3.11=py37_0
- proj4=5.2.0=ha925a31_1
- prometheus_client=0.5.0=py37_0
- prompt_toolkit=2.0.7=py37_0
- psutil=5.4.8=py37he774522_0
- psycopg2=2.7.6.1=py37h7a1dbc1_0
- py=1.7.0=py37_0
- pycodestyle=2.4.0=py37_0
- pycosat=0.6.3=py37hfa6e2cd_0
- pycparser=2.19=py37_0
- pycrypto=2.6.1=py37hfa6e2cd_9
- pycurl=7.43.0.2=py37h7a1dbc1_0
- pydotplus=2.0.2=py37_1
- pyflakes=2.0.0=py37_0
- pygments=2.3.1=py37_0
- pygpu=0.7.6=py37h452e1ab_0
- pylint=2.2.2=py37_0
- pymc3=3.6=py37_0
- pyodbc=4.0.25=py37ha925a31_0
- pyopenssl=18.0.0=py37_0
- pyparsing=2.3.0=py37_0
- pyproj=1.9.6=py37h6782396_0
- pyqt=5.9.2=py37h6538335_2
- pyreadline=2.1=py37_1
- pysocks=1.6.8=py37_0
- pytables=3.4.4=py37h1da0976_0
- pytest=4.0.2=py37_0
- pytest-arraydiff=0.3=py37h39e3cac_0
- pytest-astropy=0.5.0=py37_0
- pytest-doctestplus=0.2.0=py37_0
- pytest-openfiles=0.3.1=py37_0
- pytest-remotedata=0.3.1=py37_0
- python=3.7.1=h8c8aaf0_6
- python-dateutil=2.7.5=py37_0
- python-graphviz=0.10.1=py_0
- python-libarchive-c=2.8=py37_6
- python-snappy=0.5.4=py37ha925a31_0
- pytz=2018.7=py37_0
- pywavelets=1.0.1=py37h8c2d366_0
- pywin32=223=py37hfa6e2cd_1
- pywinpty=0.5.5=py37_1000
- pyyaml=3.13=py37hfa6e2cd_0
- pyzmq=17.1.2=py37hfa6e2cd_0
- qt=5.9.7=vc14h73c81de_0
- qtawesome=0.5.3=py37_0
- qtconsole=4.4.3=py37_0
- qtpy=1.5.2=py37_0
- requests=2.21.0=py37_0
- rope=0.11.0=py37_0
- rtree=0.8.3=py37_0
- ruamel_yaml=0.15.46=py37hfa6e2cd_0
- scikit-image=0.14.1=py37ha925a31_0
- scipy=1.1.0=py37h29ff71c_2
- seaborn=0.9.0=py37_0
- send2trash=1.5.0=py37_0
- shapely=1.6.4=py37h222a598_0
- simplegeneric=0.8.1=py37_2
- singledispatch=3.4.0.3=py37_0
- sip=4.19.8=py37h6538335_0
- six=1.13.0=py37_0
- snappy=1.1.7=h777316e_3
- snowballstemmer=1.2.1=py37_0
- sortedcollections=1.0.1=py37_0
- sortedcontainers=2.1.0=py37_0
- sphinx=1.8.2=py37_0
- sphinxcontrib=1.0=py37_1
- sphinxcontrib-websupport=1.1.0=py37_1
- spyder=3.3.2=py37_0
- spyder-kernels=0.3.0=py37_0
- sqlalchemy=1.2.15=py37he774522_0
- sqlite=3.26.0=he774522_0
- statsmodels=0.9.0=py37h452e1ab_0
- sympy=1.3=py37_0
- tbb=2019.4=h74a9793_0
- tblib=1.3.2=py37_0
- tensorboard=1.13.1=py37h33f27b4_0
- tensorflow-base=1.13.1=mkl_py37hcaf7020_0
- tensorflow-estimator=1.13.0=py_0
- terminado=0.8.1=py37_1
- testpath=0.4.2=py37_0
- theano=1.0.4=py37h6538335_1000
- tk=8.6.8=hfa6e2cd_0
- toolz=0.9.0=py37_0
- tornado=5.1.1=py37hfa6e2cd_0
- tqdm=4.28.1=py37h28b3542_0
- traitlets=4.3.2=py37_0
- unicodecsv=0.14.1=py37_0
- urllib3=1.24.1=py37_0
- vc=14.1=h0510ff6_4
- vs2015_runtime=14.15.26706=h3a45250_0
- vs2015_win-64=14.0.25420=h55c1224_11
- wcwidth=0.1.7=py37_0
- webencodings=0.5.1=py37_1
- widgetsnbextension=3.4.2=py37_0
- win_inet_pton=1.0.1=py37_1
- win_unicode_console=0.5=py37_0
- wincertstore=0.2=py37_0
- winpty=0.4.3=4
- xerces-c=3.2.2=ha925a31_0
- xlrd=1.2.0=py37_0
- xlsxwriter=1.1.2=py37_0
- xlwings=0.15.1=py37_0
- xlwt=1.3.0=py37_0
- xz=5.2.4=h2fa13f4_4
- yaml=0.1.7=hc54c509_2
- zeromq=4.2.5=he025d50_1
- zict=0.1.3=py37_0
- zipp=0.6.0=py_0
- zlib=1.2.11=h62dcd97_3
- zstd=1.3.7=h508b16e_0
- pip:
- absl-py==0.8.1
- alembic==1.0.9
- asteval==0.9.13
- astor==0.8.1
- atari-py==0.2.6
- autograd==1.3
- autopep8==1.4.4
- blis==0.4.1
- boto3==1.10.37
- botocore==1.13.37
- bs4==0.0.1
- cachetools==4.0.0
- catalogue==0.0.8
- colorlover==0.3.0
- cufflinks==0.15
- cymem==2.0.3
- dash==1.6.1
- dash-core-components==1.5.1
- dash-html-components==1.0.2
- dash-renderer==1.2.1
- dash-table==4.5.1
- dataset==1.1.2
- dill==0.3.0
- flask-compress==1.4.0
- flask-socketio==4.2.1
- gast==0.2.2
- gensim==3.8.1
- gevent-websocket==0.10.1
- google-auth==1.10.0
- google-auth-oauthlib==0.4.1
- google-pasta==0.1.8
- grpcio==1.25.0
- gym==0.15.4
- h5py==2.10.0
- imbalanced-learn==0.4.3
- imblearn==0.0
- jmespath==0.9.4
- joblib==0.13.2
- jsonpickle==1.1
- keras==2.3.1
- kmodes==0.10.1
- lifetimes==0.11.1
- lmfit==0.9.12
- mako==1.0.9
- mglearn==0.1.7
- mpi4py==3.0.3
- murmurhash==1.0.2
- mysql-connector==2.2.9
- numpy==1.18.0
- oauthlib==3.0.1
- opencv-python==4.1.2.30
- opt-einsum==3.1.0
- pdvega==0.1
- pip==19.3.1
- plac==1.1.3
- plotly==4.1.0
- pomegranate==0.11.0
- powerlaw==1.4.6
- preshed==3.0.2
- protobuf==3.11.1
- pyasn1==0.4.8
- pyasn1-modules==0.2.7
- pydot==1.4.1
- pyglet==1.3.2
- python-editor==1.0.4
- python-engineio==3.10.0
- python-socketio==4.3.1
- requests-oauthlib==1.2.0
- retrying==1.3.3
- rsa==4.0
- s3transfer==0.2.1
- scikit-learn==0.22
- setuptools==42.0.2
- shap==0.29.3
- simpy==3.0.11
- sklearn-extensions==0.0.2
- smart-open==1.9.0
- spacy==2.2.3
- srsly==0.2.0
- stable-baselines==2.8.0
- tensorflow==2.0.0
- tensorflow-gpu==1.13.1
- termcolor==1.1.0
- thinc==7.3.1
- thinkbayes2==2.0.0
- thinkx==1.1.3
- tokenizer==2.0.3
- tweepy==3.7.0
- uncertainties==3.0.3
- utils==1.0.0
- vadersentiment==3.2.1
- vega3==0.13.0
- wasabi==0.4.2
- werkzeug==0.16.0
- wheel==0.33.6
- wordcloud==1.5.0
- wrapt==1.11.2
- wtforms==2.2.1
prefix: C:\Users\Melvin\Anaconda3
import re
import string
def tokenize(s):
    """Split *s* into tokens, treating ASCII punctuation as a separator.

    Every character of ``string.punctuation`` is replaced by a space and the
    result is split on runs of whitespace, so the returned list never
    contains empty strings.
    """
    # Map each punctuation character to a single space, one-for-one.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    cleaned = s.translate(punctuation_to_space)
    return cleaned.split()
\ No newline at end of file
File added
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Toxic or Not?</title>
<!-- Latest compiled and minified CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css">
<!-- Optional theme -->
<!-- <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css"> -->
<style>
/* This controls the style of the text area where the user provides input */
textarea {
position: absolute;
top: 20%;
left: 23%;
margin-top: 45%;
width: 50%;
line-height: 75px;
margin: 15px;
border: 2px solid #ccc;
border-radius: 6px;
box-sizing: border-box;
font-family: Arial, Helvetica, sans-serif;
font-size: 20px;
}
input[id='submit'],
select {
position: absolute;
top: 46%;
left: 23%;
margin-top: 45%;
width: 50%;
line-height: 25px;
margin: 15px;
border: 2px solid #1da1f2;
;
border-radius: 6px;
background-color: #1da1f2;
font-family: Arial, Helvetica, sans-serif;
color: white;
box-sizing: border-box;
}
input[id='clear'],
select {
position: absolute;
top: 51%;
left: 23%;
width: 50%;
line-height: 25px;
margin: 15px;
border: 1px solid #1da1f2;
;
border-radius: 6px;
background-color: white;
font-family: Arial, Helvetica, sans-serif;
color: #1da1f2;
box-sizing: border-box;
}
.results {
position: absolute;
top: 75%;
left: 24%;
}
label {
font-family: Arial, Helvetica, sans-serif;
position: absolute;
left: 25%;
top: 60%;
font-size: 21px;
}
</style>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>
<nav class="navbar navbar-default">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="/">Toxic Or Not?</a>
</div>
<div id="navbar" class="collapse navbar-collapse">
<ul class="nav navbar-nav">
<li><a href="/">Home</a></li>
<li><a href="/dataset">The Dataset</a></li>
<li><a href="/predict">Comment Classifier</a></li>
</ul>
</div><!--/.nav-collapse -->
</div>
</nav>
<div class="jumbotron">
<div class="container">
<h2>The Dataset</h2><br>
<h4>The data that was used in this project is the Jigsaw Toxic Comments Classification Dataset. The dataset contains 159,571 entries of Wikipedia comments which have been labelled by human raters for the comment’s toxicity.<br>
<br>
The plot below shows how the comments are distributed across the toxicity categories:
</h4><br>
<div id="tester" style="width:1130px;height: 500px;"></div>
<script>
var data = [
{
x: ['toxic', 'obscene', 'insult', 'identity hate', 'severe toxic', 'threat'],