| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import pickle |
| import warnings |
| import numpy as np |
| import pandas as pd |
| import spacy |
| import en_core_web_lg |
| from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV |
| from sklearn.metrics import confusion_matrix, accuracy_score |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.svm import SVC |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.ensemble import RandomForestClassifier |
|
|
| |
| warnings.filterwarnings("ignore") |
|
|
def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingest raw data, extract features via word embeddings, and partition
    the dataset for model validation.

    Methodology:
        - TSV Ingestion: Data is loaded from the specified file
          (expects a 'clean_text' column).
        - Semantic Vectorization: spaCy's dense 300-dimensional
          word embeddings (centroid of token vectors via ``Doc.vector``).
        - Validation Partitioning: Random split of data into training
          and testing subsets.

    Args:
        filename (str): Path to the TSV/CSV dataset.
        targets (list | None): Column name(s) for the dependent variable.
            Defaults to ``['label']``.
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    # Avoid a shared mutable default argument.
    if targets is None:
        targets = ['label']

    # BUG FIX: the original printed the literal "(unknown)" instead of the path.
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    nlp_engine = en_core_web_lg.load()

    print("Extracting semantic features via spaCy embeddings...")
    # Doc.vector is the centroid of the token vectors (300-dim for
    # en_core_web_lg) and yields an all-zero vector for empty texts,
    # unlike the manual per-token mean which produced NaNs there.
    feature_vectors = np.array([nlp_engine(s).vector for s in df_all['clean_text']])

    y_labels = df_all.loc[:, targets]
    x_features = feature_vectors

    # NOTE(review): this is a plain random split; pass stratify=y_labels
    # here if class balance must be preserved across the partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        x_features, y_labels, test_size=validation_size, random_state=seed
    )

    return x_train, x_test, y_train, y_test
|
|
def classification(X_train, Y_train, model=""):
    """
    Train and serialize one of several classification architectures.

    Architectures Supported:
        - SVM: Support Vector Machine (Selected as the production primary).
        - LR: Logistic Regression.
        - DT: Decision Tree Classifier.
        - KNN: k-Nearest Neighbors (with automated k-optimization).
        - RF: Random Forest Classifier.
        - NN: Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier (one of the codes above).

    Returns:
        object: The trained Scikit-learn model instance.

    Raises:
        ValueError: If ``model`` is not a recognized identifier
            (previously this silently returned ``None``).
    """
    def _persist(clf, save_path):
        # Serialize the fitted estimator to disk and hand it back,
        # replacing the pickle boilerplate duplicated per branch.
        with open(save_path, 'wb') as file:
            pickle.dump(clf, file)
        return clf

    if model == "SVM":
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)

        train_accuracy = accuracy_score(clf.predict(X_train), Y_train)
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")
        return _persist(clf, "../assets/models/model_svm_pc.pkl")

    elif model == "LR":
        print("Initializing Logistic Regression training...")
        lr_model = LogisticRegression()
        lr_model.fit(X_train, Y_train)
        return _persist(lr_model, "../assets/models/model_LogReg.pkl")

    elif model == "DT":
        print("Initializing Decision Tree training...")
        dt_model = DecisionTreeClassifier()
        dt_model.fit(X_train, Y_train)
        return _persist(dt_model, "../assets/models/model_DTC.pkl")

    elif model == "KNN":
        print("Initializing kNN training with automated k-optimization...")
        # Evaluate k = 1..31 via 10-fold cross-validation and keep the best.
        k_values = range(1, 32, 1)
        k_scores = []
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k)
            score = np.mean(cross_val_score(knn, X_train, Y_train, cv=10))
            k_scores.append(score)

        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)
        return _persist(best_knn, "../assets/models/model_KNN.pkl")

    elif model == "RF":
        print("Initializing Random Forest training...")
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, Y_train)
        return _persist(rf_model, "../assets/models/model_RF.pkl")

    elif model == "NN":
        print("Initializing Neural Network (MLP) training...")
        nn_model = MLPClassifier()
        nn_model.fit(X_train, Y_train)
        return _persist(nn_model, "../assets/models/model_NN.pkl")

    # Fail loudly on an unrecognized identifier instead of returning None.
    raise ValueError(f"Unknown model identifier: {model!r}")
|
|
def LSTM(filename: str):
    """
    Execute a Deep Learning pipeline using Long Short-Term Memory (LSTM)
    recurrent neural networks for capturing temporal linguistic patterns.

    Methodology:
        - Tokenization: Integer encoding of sequences (top 20k words).
        - Padding: Uniform sequence length normalization (50 tokens).
        - Architecture: Embedding layer followed by LSTM with dropouts
          and a sigmoid output for binary classification.

    Args:
        filename (str): Path to the TSV dataset; expects 'clean_text'
            and 'label' columns.

    Side effects:
        Writes the model topology to ``model_LSTM.json`` and its weights
        to ``model_LSTM.h5`` in the current working directory.
    """
    # Keras is imported lazily so the rest of the module works without it.
    # (The unused KerasClassifier import was removed.)
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    # BUG FIX: the original printed the literal "(unknown)" instead of the path.
    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Persist topology (JSON) and learned weights (HDF5) separately.
    model_json = model.to_json()
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
|
|
|