| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import pickle |
| import warnings |
| import numpy as np |
| import pandas as pd |
| import spacy |
| import en_core_web_lg |
| from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV |
| from sklearn.metrics import confusion_matrix, accuracy_score |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.neighbors import KNeighborsClassifier |
| from sklearn.svm import SVC |
| from sklearn.tree import DecisionTreeClassifier |
| from sklearn.neural_network import MLPClassifier |
| from sklearn.ensemble import RandomForestClassifier |
|
|
| |
| warnings.filterwarnings("ignore") |
|
|
def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingest raw data, extract features via word embeddings, and partition
    the dataset for model validation.

    Methodology:
        - TSV Ingestion: Data is loaded from the specified file
          (expects a 'clean_text' column).
        - Semantic Vectorization: spaCy's dense 300-dimensional
          word embeddings (centroid of token vectors via ``Doc.vector``).
        - Validation Partitioning: Random split of data into training
          and testing subsets.

    Args:
        filename (str): Path to the TSV/CSV dataset.
        targets (list | None): Column name(s) for the dependent variable.
            Defaults to ``['label']``.
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    # Avoid a shared mutable default argument.
    if targets is None:
        targets = ['label']

    # BUG FIX: the original printed the literal "(unknown)" instead of the path.
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    nlp_engine = en_core_web_lg.load()

    print("Extracting semantic features via spaCy embeddings...")
    # Doc.vector is the centroid of the token vectors (300-dim for
    # en_core_web_lg) and yields an all-zero vector for empty texts,
    # unlike the manual per-token mean which produced NaNs there.
    feature_vectors = np.array([nlp_engine(s).vector for s in df_all['clean_text']])

    y_labels = df_all.loc[:, targets]
    x_features = feature_vectors

    # NOTE(review): this is a plain random split; pass stratify=y_labels
    # here if class balance must be preserved across the partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        x_features, y_labels, test_size=validation_size, random_state=seed
    )

    return x_train, x_test, y_train, y_test
|
|
def classification(X_train, Y_train, model=""):
    """
    Train and serialize one of several classification architectures.

    Architectures Supported:
        - SVM: Support Vector Machine (Selected as the production primary).
        - LR: Logistic Regression.
        - DT: Decision Tree Classifier.
        - KNN: k-Nearest Neighbors (with automated k-optimization).
        - RF: Random Forest Classifier.
        - NN: Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier (one of the codes above).

    Returns:
        object: The trained Scikit-learn model instance.

    Raises:
        ValueError: If ``model`` is not a recognized identifier
            (previously this silently returned ``None``).
    """
    def _persist(clf, save_path):
        # Serialize the fitted estimator to disk and hand it back,
        # replacing the pickle boilerplate duplicated per branch.
        with open(save_path, 'wb') as file:
            pickle.dump(clf, file)
        return clf

    if model == "SVM":
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)

        train_accuracy = accuracy_score(clf.predict(X_train), Y_train)
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")
        return _persist(clf, "../assets/models/model_svm_pc.pkl")

    elif model == "LR":
        print("Initializing Logistic Regression training...")
        lr_model = LogisticRegression()
        lr_model.fit(X_train, Y_train)
        return _persist(lr_model, "../assets/models/model_LogReg.pkl")

    elif model == "DT":
        print("Initializing Decision Tree training...")
        dt_model = DecisionTreeClassifier()
        dt_model.fit(X_train, Y_train)
        return _persist(dt_model, "../assets/models/model_DTC.pkl")

    elif model == "KNN":
        print("Initializing kNN training with automated k-optimization...")
        # Evaluate k = 1..31 via 10-fold cross-validation and keep the best.
        k_values = range(1, 32, 1)
        k_scores = []
        for k in k_values:
            knn = KNeighborsClassifier(n_neighbors=k)
            score = np.mean(cross_val_score(knn, X_train, Y_train, cv=10))
            k_scores.append(score)

        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)
        return _persist(best_knn, "../assets/models/model_KNN.pkl")

    elif model == "RF":
        print("Initializing Random Forest training...")
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, Y_train)
        return _persist(rf_model, "../assets/models/model_RF.pkl")

    elif model == "NN":
        print("Initializing Neural Network (MLP) training...")
        nn_model = MLPClassifier()
        nn_model.fit(X_train, Y_train)
        return _persist(nn_model, "../assets/models/model_NN.pkl")

    # Fail loudly on an unrecognized identifier instead of returning None.
    raise ValueError(f"Unknown model identifier: {model!r}")
|
|
def LSTM(filename: str):
    """
    Execute a Deep Learning pipeline using Long Short-Term Memory (LSTM)
    recurrent neural networks for capturing temporal linguistic patterns.

    Methodology:
        - Tokenization: Integer encoding of sequences (top 20k words).
        - Padding: Uniform sequence length normalization (50 tokens).
        - Architecture: Embedding layer followed by LSTM with dropouts
          and a sigmoid output for binary classification.

    Args:
        filename (str): Path to the TSV dataset; expects 'clean_text'
            and 'label' columns.

    Side effects:
        Writes the model topology to ``model_LSTM.json`` and its weights
        to ``model_LSTM.h5`` in the current working directory.
    """
    # Keras is imported lazily so the rest of the module works without it.
    # (The unused KerasClassifier import was removed.)
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    # BUG FIX: the original printed the literal "(unknown)" instead of the path.
    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Persist topology (JSON) and learned weights (HDF5) separately.
    model_json = model.to_json()
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
|
|
|