Spaces:
Sleeping
Sleeping
Update models/hate_speech_classifier.py
Browse files- models/hate_speech_classifier.py +71 -96
models/hate_speech_classifier.py
CHANGED
|
@@ -29,104 +29,79 @@ class HateSpeechClassifier:
|
|
| 29 |
"weight": 0.4
|
| 30 |
}
|
| 31 |
}
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
# # Load models
|
| 78 |
-
# self._load_custom_models()
|
| 79 |
-
|
| 80 |
-
# # Enhanced hate keywords
|
| 81 |
-
# self.hate_keywords = {
|
| 82 |
-
# "english": [
|
| 83 |
-
# "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
|
| 84 |
-
# "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
|
| 85 |
-
# "terrorist", "racist", "sexist", "discrimination", "discriminate",
|
| 86 |
-
# "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
|
| 87 |
-
# "chamar", "bhangi", "sc/st", "reservation quota",
|
| 88 |
-
# "no right to live", "don't deserve", "shouldn't exist", "subhuman",
|
| 89 |
-
# "inferior", "worthless", "scum", "vermin", "parasite",
|
| 90 |
-
# "should be killed", "must die", "deserve to die", "need to be eliminated",
|
| 91 |
-
# "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
|
| 92 |
-
# "nigger", "chink", "paki", "kike", "faggot", "tranny"
|
| 93 |
-
# ],
|
| 94 |
-
# "bengali": [
|
| 95 |
-
# "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
|
| 96 |
-
# "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
|
| 97 |
-
# "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
|
| 98 |
-
# "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
|
| 99 |
-
# ]
|
| 100 |
-
# }
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
|
| 131 |
def _translate_to_english(self, text: str) -> Optional[str]:
|
| 132 |
"""Translate Bengali to English using deep-translator"""
|
|
|
|
| 29 |
"weight": 0.4
|
| 30 |
}
|
| 31 |
}
|
| 32 |
+
print(f"Model directory: {self.models_dir}")
|
| 33 |
+
print(f"Model directory exists: {self.models_dir.exists()}")
|
| 34 |
+
|
| 35 |
+
if self.model_dir.exists():
|
| 36 |
+
print(f"Files in model directory: {list(self.model_dir.iterdir())}")
|
| 37 |
+
|
| 38 |
+
# English custom model paths
|
| 39 |
+
self.english_model_path = os.path.join(models_dir, "english_model.pkl")
|
| 40 |
+
self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
|
| 41 |
+
self.english_model = None
|
| 42 |
+
self.english_vectorizer = None
|
| 43 |
+
self.english_model_loaded = False
|
| 44 |
+
|
| 45 |
+
# Bengali custom model paths
|
| 46 |
+
self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
|
| 47 |
+
self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
|
| 48 |
+
self.bengali_model = None
|
| 49 |
+
self.bengali_vectorizer = None
|
| 50 |
+
self.bengali_model_loaded = False
|
| 51 |
+
|
| 52 |
+
# Load models
|
| 53 |
+
self._load_custom_models()
|
| 54 |
+
|
| 55 |
+
# Enhanced hate keywords
|
| 56 |
+
self.hate_keywords = {
|
| 57 |
+
"english": [
|
| 58 |
+
"hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
|
| 59 |
+
"die", "dead", "shoot", "stab", "burn", "hang", "lynch",
|
| 60 |
+
"terrorist", "racist", "sexist", "discrimination", "discriminate",
|
| 61 |
+
"scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
|
| 62 |
+
"chamar", "bhangi", "sc/st", "reservation quota",
|
| 63 |
+
"no right to live", "don't deserve", "shouldn't exist", "subhuman",
|
| 64 |
+
"inferior", "worthless", "scum", "vermin", "parasite",
|
| 65 |
+
"should be killed", "must die", "deserve to die", "need to be eliminated",
|
| 66 |
+
"jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
|
| 67 |
+
"nigger", "chink", "paki", "kike", "faggot", "tranny"
|
| 68 |
+
],
|
| 69 |
+
"bengali": [
|
| 70 |
+
"শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
|
| 71 |
+
"ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
|
| 72 |
+
"বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
|
| 73 |
+
"দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
|
| 74 |
+
]
|
| 75 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
self.hate_patterns = {
|
| 78 |
+
"english": [
|
| 79 |
+
r"no right to (live|exist|be here|survive)",
|
| 80 |
+
r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
|
| 81 |
+
r"don'?t deserve (to live|life|existence|to exist)",
|
| 82 |
+
r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
|
| 83 |
+
r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
|
| 84 |
+
r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
|
| 85 |
+
r"(send|throw|kick|drive) (them|back) (out|away|home)",
|
| 86 |
+
r"(all|these) .{0,30} (should die|must be killed|need to go)",
|
| 87 |
+
r"(death to|kill all|eliminate all) .{0,30}",
|
| 88 |
+
r"(inferior|subhuman|less than human|not human)",
|
| 89 |
+
],
|
| 90 |
+
"bengali": [
|
| 91 |
+
r"বাঁচার অধিকার নেই",
|
| 92 |
+
r"মরে যাওয়া উচিত",
|
| 93 |
+
r"নিশ্চিহ্ন করা উচিত"
|
| 94 |
+
]
|
| 95 |
+
}
|
| 96 |
|
| 97 |
+
self.offensive_keywords = {
|
| 98 |
+
"english": [
|
| 99 |
+
"damn", "hell", "crap", "suck", "dumb", "loser", "trash",
|
| 100 |
+
"stupid", "idiot", "moron", "pathetic", "bad", "ugly",
|
| 101 |
+
"disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
|
| 102 |
+
],
|
| 103 |
+
"bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
|
| 104 |
+
}
|
| 105 |
|
| 106 |
def _translate_to_english(self, text: str) -> Optional[str]:
|
| 107 |
"""Translate Bengali to English using deep-translator"""
|