| |
| """modeling.ipynb |
| |
| Automatically generated by Colaboratory. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1x78fRDZAuK5FaSTKHPGy8eSbZ_gYAFr6 |
| """ |
|
|
# Mount Google Drive so the dataset and model artifacts under /content/drive
# are reachable from this Colab runtime (prompts for interactive auth).
from google.colab import drive
drive.mount('/content/drive')
|
|
| |
|
|
| |
|
|
# Colab/IPython shell magics (only valid inside a notebook, not plain Python):
# download the large English spaCy model used below for 300-d word vectors.
!python -m spacy download en_core_web_lg

# Pin spaCy to 2.2.0.
# NOTE(review): upgrading spaCy *after* downloading the model may leave the
# model incompatible with the installed version — confirm the intended order.
!pip install -U SpaCy==2.2.0
|
|
| |
|
|
| |
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| |
| import numpy as np |
| import pandas as pd |
|
|
| |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| |
| from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
| |
| from yellowbrick.text import TSNEVisualizer |
| from sklearn import manifold |
|
|
| |
| from sklearn.model_selection import train_test_split |
|
|
| |
| from sklearn import feature_selection |
|
|
| |
| from sklearn.pipeline import Pipeline |
| import sklearn.metrics as skm |
| from sklearn.metrics import confusion_matrix, accuracy_score |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.svm import SVC |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.ensemble import RandomForestClassifier |
|
|
| |
| import pickle |
|
|
| |
| |
|
|
| |
| from nltk.tokenize.treebank import TreebankWordDetokenizer |
|
|
| |
| import gensim |
| import gensim.downloader as gensim_api |
| from gensim.models import Word2Vec |
| from gensim.models import KeyedVectors |
| from keras.preprocessing.text import Tokenizer |
| from keras.preprocessing.sequence import pad_sequences |
|
|
| |
| import spacy |
| import en_core_web_lg |
|
|
| |
| from keras.models import load_model |
| from keras.models import Model, Sequential |
| from keras.callbacks import EarlyStopping, ModelCheckpoint |
| from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D |
| from tensorflow.keras import models, layers, preprocessing as kprocessing |
| from tensorflow.keras import backend as K |
| import tensorflow as tf |
| import keras |
| from keras.layers import Lambda |
| import tensorflow as tf |
| from keras.models import model_from_json |
|
|
| |
| |
|
|
| """## Loading the dataset:""" |
|
|
# Load the pre-cleaned dataset produced by the data-cleaning notebook.
# NOTE(review): the file has a .csv extension but is read with sep='\t' —
# presumably it was written tab-separated; confirm against the writer.
df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv",
                     sep='\t', encoding='utf-8')

# Bare expression: displays the DataFrame in a notebook (no effect as a script).
df_all
|
|
| """## Classification models as well as LSTM with pretrained model(Spacy): |
| |
| In order to run a supervised learning model, we first need to convert the clean_text into feature representation. |
| """ |
|
|
# Load spaCy's large English model (provides 300-d word vectors).
nlp = en_core_web_lg.load()

# One 300-d feature vector per document: the mean of its token vectors.
# Fix: use `np` directly — `pd.np` was deprecated in pandas 0.24 and removed
# in pandas 2.0. The `* np.ones((300,))` broadcast is kept so a document with
# no tokens (mean over an empty array -> scalar NaN) still yields a 300-d row.
all_vectors = np.array([
    np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300,))
    for s in df_all['clean_text']
])

# Design matrix and labels for the supervised models.
Y = df_all["label"]
X = all_vectors
|
|
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
# Hold out 30% of the documents for testing; fixed seed for reproducibility.
validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Cross-validation settings shared by all candidate models below.
num_folds = 10
seed = 7
scoring = 'accuracy'
|
|
| |
# Candidate classifiers, all evaluated under identical CV settings below.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('NN', MLPClassifier()),
    ('RF', RandomForestClassifier()),
]
|
|
| |
# Evaluate every candidate with 10-fold CV on the training split, then fit on
# the full training split and record train/test accuracy for the comparison
# plots below.
results = []
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    # Fix: shuffle=True is required when passing random_state — recent
    # scikit-learn raises ValueError for random_state on an unshuffled KFold.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)

    res = model.fit(X_train, Y_train)

    # accuracy_score is symmetric; pass (y_true, y_pred) per convention.
    train_result = accuracy_score(Y_train, res.predict(X_train))
    train_results.append(train_result)

    test_result = accuracy_score(Y_test, res.predict(X_test))
    test_results.append(test_result)

    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)
    # Fix: confusion_matrix expects (y_true, y_pred); the original argument
    # order produced a transposed matrix (rows/columns swapped).
    print(confusion_matrix(Y_test, res.predict(X_test)))
| |
|
|
| |
# Grouped bar chart comparing train vs. test accuracy per model.
from matplotlib import pyplot
fig = pyplot.figure()
ind = np.arange(len(names))
width = 0.35
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# Fix: the plotted values come from accuracy_score, so label them as
# accuracy — the original legend mislabeled them as "Error".
pyplot.bar(ind - width/2, train_results, width=width, label='Train Accuracy')
pyplot.bar(ind + width/2, test_results, width=width, label='Test Accuracy')
fig.set_size_inches(15, 8)
pyplot.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
pyplot.show()
|
|
| """The best model with the highest accuracy is **Support Vector Machine(SVM)** with **85.79**% accuracy on test dataset. Logistic Regression performed good as well but we see overfitting problem with CART, NN and RF. |
| |
| ### LSTM model: |
| """ |
|
|
| |
# Integer-encode the documents for the LSTM: keep the 20k most frequent
# words, then pad/truncate every sequence to a fixed length of 50 tokens.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(df_all['clean_text'])
sequences = tokenizer.texts_to_sequences(df_all['clean_text'])
X_LSTM = pad_sequences(sequences, maxlen=50)

# Same 70/30 split and seed as the classical models, for a fair comparison.
Y_LSTM = df_all["label"]
X_train_LSTM, X_test_LSTM, Y_train_LSTM, Y_test_LSTM = train_test_split(X_LSTM, \
    Y_LSTM, test_size=validation_size, random_state=seed)
|
|
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_length=50, vocab_size=20000, embed_dim=300):
    """Build and compile a single-layer LSTM binary classifier.

    Args:
        input_length: padded sequence length fed to the Embedding layer.
        vocab_size: tokenizer vocabulary size (Embedding input dimension).
        embed_dim: dimensionality of the learned word embeddings.

    Returns:
        A compiled Keras Sequential model (binary cross-entropy, Adam).
    """
    # Fix: the original ignored `input_length` and hard-coded 50/20000 inside
    # the body; the parameters are now honored (defaults keep old behavior).
    model = Sequential()
    model.add(Embedding(vocab_size, embed_dim, input_length=input_length))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# 40% of the training split is held out for per-epoch validation.
model_LSTM = KerasClassifier(build_fn=create_model, epochs=3, verbose=1, validation_split=0.4)
model_LSTM.fit(X_train_LSTM, Y_train_LSTM)
|
|
# Train/test accuracy of the LSTM (accuracy_score is symmetric, so the
# (pred, true) argument order here does not change the value).
train_result_LSTM = accuracy_score(model_LSTM.predict(X_train_LSTM), Y_train_LSTM)

test_result_LSTM = accuracy_score(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)

print("train result:", train_result_LSTM)
print("test result:", test_result_LSTM)

# NOTE: argument order is (pred, true), so this matrix is transposed relative
# to the sklearn convention confusion_matrix(y_true, y_pred).
confusion_matrix(model_LSTM.predict(X_test_LSTM), Y_test_LSTM)
|
|
| """### Compare all the models:""" |
|
|
# Fold the LSTM scores into the comparison lists alongside the classical models.
train_results.append(train_result_LSTM)
test_results.append(test_result_LSTM)
names.append("LSTM")
|
|
| |
# Final comparison chart: all classical models plus the LSTM.
from matplotlib import pyplot
fig = pyplot.figure()
ind = np.arange(len(names))
width = 0.35
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# The plotted values are accuracies, so label them as such (not "Error").
pyplot.bar(ind - width/2, train_results, width=width, label='Train Accuracy')
pyplot.bar(ind + width/2, test_results, width=width, label='Test Accuracy')
fig.set_size_inches(15, 8)
pyplot.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
# Fix: save BEFORE show() — pyplot.show() releases the current figure, so the
# original savefig call (placed after show) wrote out a blank image. Saving
# via the figure handle also avoids relying on the "current figure" state.
fig.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/classification_comparision.png')
pyplot.show()
|
|
| """## Evaluate the performance: |
| |
| * **Accuracy:** the fraction of predictions the model got right. |
| * **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class. |
| * **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one. |
| * **Precision:** the fraction of relevant instances among the retrieved instances. |
| * **Recall:** the fraction of the total amount of relevant instances that were actually retrieved. |
| """ |
|
|
def conf_matrix_acc(y_true, y_pred, class_labels=None):
    """Plot a confusion-matrix heatmap and print accuracy plus a full report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        class_labels: tick labels for the matrix axes. Defaults to the sorted
            unique values of ``y_true`` — the same value the module-level
            ``classes`` global held, so existing calls behave identically.
    """
    # Fix: the original silently read a module-level `classes` global, which
    # made the function fail if called before that global was defined.
    if class_labels is None:
        class_labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=class_labels,
           yticklabels=class_labels, title="Confusion matrix")
    plt.yticks(rotation=0)
    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
|
|
| |
def roc_precision_auc():
    """Plot one-vs-rest ROC and precision-recall curves, then print AUC.

    Relies on module-level globals assigned just before the call site:
      - ``classes``: unique label values,
      - ``y_test_array``: dummy-encoded (one column per class) test labels,
      - ``probs``: classifier ``predict_proba`` output on the test set,
      - ``Y_test``: raw test labels (used only for the printed AUC).
    """
    fig, ax = plt.subplots(nrows=1, ncols=2)

    # Left panel: per-class ROC curve with its AUC in the legend.
    for i in range(len(classes)):
        fpr, tpr, thresholds = skm.roc_curve(y_test_array[:,i],
                           probs[:,i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                 skm.auc(fpr, tpr))
                   )
    # Chance diagonal for reference.
    ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    # Right panel: per-class precision-recall curve with its AUC.
    for i in range(len(classes)):
        precision, recall, thresholds = skm.precision_recall_curve(
            y_test_array[:,i], probs[:,i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                 skm.auc(recall, precision))
                   )
    ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)
    plt.show()

    # AUC using the positive class (column 1 of predict_proba).
    print(f'AUC score is : {skm.roc_auc_score(Y_test, probs[:,1])}')
|
|
| """## Support Vector Machine(SVM) with word embedding:""" |
|
|
# Reload the spaCy model and rebuild the document vectors for the final SVM
# (duplicates the earlier vectorization cell so this section runs standalone).
nlp = en_core_web_lg.load()

# Fix: use `np` directly — `pd.np` was deprecated in pandas 0.24 and removed
# in pandas 2.0. The `* np.ones((300,))` broadcast keeps an empty document
# (mean over zero tokens -> scalar NaN) shaped as a 300-d row.
all_vectors = np.array([
    np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300,))
    for s in df_all['clean_text']
])

# Design matrix and labels.
Y = df_all["label"]
X = all_vectors
|
|
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
# Re-create the same 70/30 split used earlier (same data, same seed).
validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# CV settings (kept for symmetry with the comparison section; not used below).
num_folds = 10
seed = 7
scoring = 'accuracy'

# probability=True enables predict_proba, required for the ROC/PR curves.
clf = SVC(probability=True)
|
|
| |
| |
| |
# Fit the final SVM and report accuracy on both splits.
res = clf.fit(X_train, Y_train)
# Pass (y_true, y_pred) per sklearn convention (accuracy_score is symmetric,
# so the values are unchanged).
train_result = accuracy_score(Y_train, res.predict(X_train))
test_result = accuracy_score(Y_test, res.predict(X_test))

# Fix: corrected the typo "test_resuld" in the printed label.
print("train_result:", "test_result:", train_result, test_result, sep=" ")
|
|
| |
# Location on Drive where the trained SVM is persisted.
SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm1.pkl"

# Serialize the fitted classifier so it can be reused without retraining.
with open(SVM, 'wb') as fh:
    pickle.dump(clf, fh)

# Round-trip check: reload the pickled model back into `clf`.
with open(SVM, 'rb') as fh:
    clf = pickle.load(fh)

# Display the reloaded estimator (notebook cell output).
clf
|
|
| |
| |
# Evaluate the SVM: confusion matrix + classification report, then ROC / PR.
y_pred_svm = res.predict(X_test)
# Module-level globals consumed by roc_precision_auc() below.
classes = np.unique(Y_test.to_list())
y_test_array = pd.get_dummies(Y_test, drop_first=False).values
probs = res.predict_proba(X_test)
conf_matrix_acc(Y_test.to_list(),y_pred_svm)
roc_precision_auc()
|
|
| """## Exploring False positive and False negative:""" |
|
|
| |
# Inspect misclassified documents (false positives and false negatives).
# NOTE(review): `y_test` and `y_pred_lr` are not defined anywhere in this
# notebook — presumably they should be `Y_test` and `y_pred_svm`; confirm
# before running (this cell looks copied from a TF-IDF/LogReg notebook).
y_test_1 = [x for x in y_test]
y_pred_lr_1 = [x for x in y_pred_lr]

# Fix: replaced the manual index loop (which also contained a useless `i+=1`
# inside the `for`) with a comprehension collecting the mismatch positions.
idx = [i for i, (true, pred) in enumerate(zip(y_test_1, y_pred_lr_1))
       if true != pred]

# Fix: the original print statement had mangled quoting
# (print('There are", {} "wrong preditions', len(idx))) that never formatted
# the count and misspelled "predictions".
print(f"There are {len(idx)} wrong predictions")

# NOTE(review): `cv` and `X_test_tfidf` come from a TF-IDF pipeline that is
# not defined in this file — TODO confirm where they originate.
wrong_arr = cv.inverse_transform(X_test_tfidf[idx])

# Detokenize the misclassified documents back into readable text.
detokenized = [TreebankWordDetokenizer().detokenize(x) for x in wrong_arr]

detokenized[:50]
| |
| """There is no specific patterns between false positive and false negative predictions.""" |