Update models/hate_speech_classifier.py
models/hate_speech_classifier.py  (+100 -100)  CHANGED
@@ -5,39 +5,15 @@ import os
 import re
 import torch
 from deep_translator import GoogleTranslator
-from pathlib import Path
+# from pathlib import Path


 class HateSpeechClassifier:
-    # def __init__(self):
-    #     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    #     models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
-
-    #     # Initialize translator
-    #     self.translator = GoogleTranslator(source='bn', target='en')
-
-    #     # Use multiple pretrained models for better accuracy
-    #     self.pretrained_models = {
-    #         "primary": {
-    #             "name": "facebook/roberta-hate-speech-dynabench-r4-target",
-    #             "pipeline": None,
-    #             "weight": 0.6
-    #         },
-    #         "secondary": {
-    #             "name": "cardiffnlp/twitter-roberta-base-hate-latest",
-    #             "pipeline": None,
-    #             "weight": 0.4
-    #         }
-    #     }
     def __init__(self):
-        # Get absolute path to model weights
-        base_dir = Path(__file__).parent
-        self.model_dir = base_dir / 'model_weights' / 'custom_models'
-
-        # For Hugging Face Spaces, also check environment
-        if not self.model_dir.exists():
-            # Try alternative paths for deployed environment
-            self.model_dir = Path('/app/models/model_weights/custom_models')
+        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
+
+        # Initialize translator
         self.translator = GoogleTranslator(source='bn', target='en')

         # Use multiple pretrained models for better accuracy
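The first hunk swaps which __init__ is active: the pathlib-based weight lookup (with a fallback to /app/models/model_weights/custom_models for the deployed Space) is commented out, and the os.path-based lookup rooted two directories above this file is restored. A minimal sketch of what the two resolutions evaluate to, assuming the module sits at <repo>/models/hate_speech_classifier.py; that layout and the example path are assumptions for illustration, not something the diff states:

import os
from pathlib import Path

# Hypothetical module location, used only to make the path arithmetic concrete.
this_file = "/repo/models/hate_speech_classifier.py"

# Restored resolution (os.path): two dirname() calls climb from the module to the repo root.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(this_file)))          # /repo
models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")  # /repo/models/model_weights/custom_models

# Commented-out resolution (pathlib): relative to the module's own directory,
# with an explicit fallback for the Hugging Face Spaces container.
model_dir = Path(this_file).parent / "model_weights" / "custom_models"           # /repo/models/model_weights/custom_models
if not model_dir.exists():
    model_dir = Path("/app/models/model_weights/custom_models")

Under this assumed layout both variants point at the same folder; the visible difference is that the restored version drops the existence check and the /app fallback.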
@@ -53,80 +29,104 @@
                 "weight": 0.4
             }
         }
+    # def __init__(self):
+    #     # Get absolute path to model weights
+    #     base_dir = Path(__file__).parent
+    #     self.model_dir = base_dir / 'model_weights' / 'custom_models'
+
+    #     # For Hugging Face Spaces, also check environment
+    #     if not self.model_dir.exists():
+    #         # Try alternative paths for deployed environment
+    #         self.model_dir = Path('/app/models/model_weights/custom_models')
+    #     self.translator = GoogleTranslator(source='bn', target='en')

-        print(f"Model directory: {self.model_dir}")
-        print(f"Model directory exists: {self.model_dir.exists()}")
-
-        if self.model_dir.exists():
-            print(f"Files in model directory: {list(self.model_dir.iterdir())}")
-
-        # English custom model paths
-        self.english_model_path = os.path.join(models_dir, "english_model.pkl")
-        self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
-        self.english_model = None
-        self.english_vectorizer = None
-        self.english_model_loaded = False
-
-        # Bengali custom model paths
-        self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
-        self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
-        self.bengali_model = None
-        self.bengali_vectorizer = None
-        self.bengali_model_loaded = False
-
-        # Load models
-        self._load_custom_models()
-
-        # Enhanced hate keywords
-        self.hate_keywords = {
-            "english": [
-                "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
-                "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
-                "terrorist", "racist", "sexist", "discrimination", "discriminate",
-                "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
-                "chamar", "bhangi", "sc/st", "reservation quota",
-                "no right to live", "don't deserve", "shouldn't exist", "subhuman",
-                "inferior", "worthless", "scum", "vermin", "parasite",
-                "should be killed", "must die", "deserve to die", "need to be eliminated",
-                "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
-                "nigger", "chink", "paki", "kike", "faggot", "tranny"
-            ],
-            "bengali": [
-                "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
-                "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
-                "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
-                "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
-            ]
-        }
+    #     # Use multiple pretrained models for better accuracy
+    #     self.pretrained_models = {
+    #         "primary": {
+    #             "name": "facebook/roberta-hate-speech-dynabench-r4-target",
+    #             "pipeline": None,
+    #             "weight": 0.6
+    #         },
+    #         "secondary": {
+    #             "name": "cardiffnlp/twitter-roberta-base-hate-latest",
+    #             "pipeline": None,
+    #             "weight": 0.4
+    #         }
+    #     }

-        self.hate_patterns = {
-            "english": [
-                r"no right to (live|exist|be here|survive)",
-                r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
-                r"don'?t deserve (to live|life|existence|to exist)",
-                r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
-                r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
-                r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
-                r"(send|throw|kick|drive) (them|back) (out|away|home)",
-                r"(all|these) .{0,30} (should die|must be killed|need to go)",
-                r"(death to|kill all|eliminate all) .{0,30}",
-                r"(inferior|subhuman|less than human|not human)",
-            ],
-            "bengali": [
-                r"বাঁচার অধিকার নেই",
-                r"মরে যাওয়া উচিত",
-                r"নিশ্চিহ্ন করা উচিত"
-            ]
-        }
+        # print(f"Model directory: {self.model_dir}")
+        # print(f"Model directory exists: {self.model_dir.exists()}")
+
+        # if self.model_dir.exists():
+        #     print(f"Files in model directory: {list(self.model_dir.iterdir())}")
+
+        # # English custom model paths
+        # self.english_model_path = os.path.join(models_dir, "english_model.pkl")
+        # self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
+        # self.english_model = None
+        # self.english_vectorizer = None
+        # self.english_model_loaded = False
+
+        # # Bengali custom model paths
+        # self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
+        # self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
+        # self.bengali_model = None
+        # self.bengali_vectorizer = None
+        # self.bengali_model_loaded = False
+
+        # # Load models
+        # self._load_custom_models()
+
+        # # Enhanced hate keywords
+        # self.hate_keywords = {
+        #     "english": [
+        #         "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
+        #         "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
+        #         "terrorist", "racist", "sexist", "discrimination", "discriminate",
+        #         "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
+        #         "chamar", "bhangi", "sc/st", "reservation quota",
+        #         "no right to live", "don't deserve", "shouldn't exist", "subhuman",
+        #         "inferior", "worthless", "scum", "vermin", "parasite",
+        #         "should be killed", "must die", "deserve to die", "need to be eliminated",
+        #         "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
+        #         "nigger", "chink", "paki", "kike", "faggot", "tranny"
+        #     ],
+        #     "bengali": [
+        #         "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
+        #         "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
+        #         "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
+        #         "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
+        #     ]
+        # }

-        self.offensive_keywords = {
-            "english": [
-                "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
-                "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
-                "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
-            ],
-            "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
-        }
+        # self.hate_patterns = {
+        #     "english": [
+        #         r"no right to (live|exist|be here|survive)",
+        #         r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
+        #         r"don'?t deserve (to live|life|existence|to exist)",
+        #         r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
+        #         r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
+        #         r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
+        #         r"(send|throw|kick|drive) (them|back) (out|away|home)",
+        #         r"(all|these) .{0,30} (should die|must be killed|need to go)",
+        #         r"(death to|kill all|eliminate all) .{0,30}",
+        #         r"(inferior|subhuman|less than human|not human)",
+        #     ],
+        #     "bengali": [
+        #         r"বাঁচার অধিকার নেই",
+        #         r"মরে যাওয়া উচিত",
+        #         r"নিশ্চিহ্ন করা উচিত"
+        #     ]
+        # }
+
+        # self.offensive_keywords = {
+        #     "english": [
+        #         "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
+        #         "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
+        #         "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
+        #     ],
+        #     "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
+        # }

     def _translate_to_english(self, text: str) -> Optional[str]:
         """Translate Bengali to English using deep-translator"""
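Both sides of the diff keep the pretrained_models table, which pairs two Hub models with blending weights of 0.6 and 0.4; the code that actually combines their scores is outside this hunk. The sketch below shows one plausible way such a weighted vote could be computed with transformers pipelines. The helper name ensemble_hate_score and the label normalization are assumptions, not code from this repository:

from transformers import pipeline

# Model names and weights taken from the diff; everything else is illustrative.
PRETRAINED = {
    "primary":   {"name": "facebook/roberta-hate-speech-dynabench-r4-target", "weight": 0.6},
    "secondary": {"name": "cardiffnlp/twitter-roberta-base-hate-latest",      "weight": 0.4},
}

# Build each pipeline once; reloading per call would be wasteful.
PIPELINES = {key: pipeline("text-classification", model=cfg["name"]) for key, cfg in PRETRAINED.items()}

def ensemble_hate_score(text: str) -> float:
    """Weighted probability-of-hate in [0, 1] (hypothetical helper, not in the diff)."""
    total = 0.0
    for key, cfg in PRETRAINED.items():
        result = PIPELINES[key](text)[0]  # e.g. {'label': 'hate', 'score': 0.93}
        # Label spellings differ between the two models ('hate', 'HATE', 'NOT-HATE', ...),
        # so normalize before deciding which side of the score to take.
        label = result["label"].lower().replace("-", "").replace("_", "")
        p_hate = result["score"] if label in {"hate", "hateful"} else 1.0 - result["score"]
        total += cfg["weight"] * p_hate
    return total

A text would then be treated as hate speech when ensemble_hate_score(text) crosses whatever threshold the surrounding code chooses.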
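The hate_keywords, hate_patterns and offensive_keywords tables (active on the removed side, commented out on the added side) read as a rule-based layer alongside the model scores. The sketch below shows how such tables are typically applied, using shortened copies of the lists; rule_based_flag is a hypothetical helper and the commit does not show how the real class consumes these tables:

import re

# Abbreviated copies of the tables from the diff; the full lists appear in __init__.
HATE_KEYWORDS = {"english": ["kill", "subhuman", "no right to live"]}
HATE_PATTERNS = {
    "english": [
        r"no right to (live|exist|be here|survive)",
        r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
    ]
}

def rule_based_flag(text: str, language: str = "english") -> bool:
    """True if any keyword or regex from the tables matches (hypothetical helper)."""
    lowered = text.lower()
    if any(keyword in lowered for keyword in HATE_KEYWORDS.get(language, [])):
        return True
    return any(re.search(pattern, lowered) for pattern in HATE_PATTERNS.get(language, []))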
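The hunk closes at the signature of _translate_to_english, whose body is not part of this commit. Given the GoogleTranslator(source='bn', target='en') instance created in __init__ and the Optional[str] return type, a likely shape for that method is sketched below as a standalone function; the try/except behaviour is an assumption:

from typing import Optional
from deep_translator import GoogleTranslator

# In the class this object is self.translator, built in __init__.
_translator = GoogleTranslator(source='bn', target='en')

def translate_to_english(text: str) -> Optional[str]:
    """Translate Bengali to English using deep-translator (sketch, not the repo's body)."""
    try:
        # translate() is the standard deep-translator call for a single string.
        return _translator.translate(text)
    except Exception:
        # Returning None matches the Optional[str] annotation when translation fails.
        return None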