sgAtdbd committed on
Commit
b1a0040
·
verified ·
1 Parent(s): cb75d9a

Update models/hate_speech_classifier.py

Browse files
Files changed (1) hide show
  1. models/hate_speech_classifier.py +71 -96
models/hate_speech_classifier.py CHANGED
@@ -29,104 +29,79 @@ class HateSpeechClassifier:
29
  "weight": 0.4
30
  }
31
  }
32
- # def __init__(self):
33
- # # Get absolute path to model weights
34
- # base_dir = Path(__file__).parent
35
- # self.model_dir = base_dir / 'model_weights' / 'custom_models'
36
-
37
- # # For Hugging Face Spaces, also check environment
38
- # if not self.model_dir.exists():
39
- # # Try alternative paths for deployed environment
40
- # self.model_dir = Path('/app/models/model_weights/custom_models')
41
- # self.translator = GoogleTranslator(source='bn', target='en')
42
-
43
- # # Use multiple pretrained models for better accuracy
44
- # self.pretrained_models = {
45
- # "primary": {
46
- # "name": "facebook/roberta-hate-speech-dynabench-r4-target",
47
- # "pipeline": None,
48
- # "weight": 0.6
49
- # },
50
- # "secondary": {
51
- # "name": "cardiffnlp/twitter-roberta-base-hate-latest",
52
- # "pipeline": None,
53
- # "weight": 0.4
54
- # }
55
- # }
56
-
57
- # print(f"Model directory: {self.model_dir}")
58
- # print(f"Model directory exists: {self.model_dir.exists()}")
59
-
60
- # if self.model_dir.exists():
61
- # print(f"Files in model directory: {list(self.model_dir.iterdir())}")
62
-
63
- # # English custom model paths
64
- # self.english_model_path = os.path.join(models_dir, "english_model.pkl")
65
- # self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
66
- # self.english_model = None
67
- # self.english_vectorizer = None
68
- # self.english_model_loaded = False
69
-
70
- # # Bengali custom model paths
71
- # self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
72
- # self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
73
- # self.bengali_model = None
74
- # self.bengali_vectorizer = None
75
- # self.bengali_model_loaded = False
76
-
77
- # # Load models
78
- # self._load_custom_models()
79
-
80
- # # Enhanced hate keywords
81
- # self.hate_keywords = {
82
- # "english": [
83
- # "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
84
- # "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
85
- # "terrorist", "racist", "sexist", "discrimination", "discriminate",
86
- # "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
87
- # "chamar", "bhangi", "sc/st", "reservation quota",
88
- # "no right to live", "don't deserve", "shouldn't exist", "subhuman",
89
- # "inferior", "worthless", "scum", "vermin", "parasite",
90
- # "should be killed", "must die", "deserve to die", "need to be eliminated",
91
- # "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
92
- # "nigger", "chink", "paki", "kike", "faggot", "tranny"
93
- # ],
94
- # "bengali": [
95
- # "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
96
- # "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
97
- # "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
98
- # "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
99
- # ]
100
- # }
101
 
102
- # self.hate_patterns = {
103
- # "english": [
104
- # r"no right to (live|exist|be here|survive)",
105
- # r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
106
- # r"don'?t deserve (to live|life|existence|to exist)",
107
- # r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
108
- # r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
109
- # r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
110
- # r"(send|throw|kick|drive) (them|back) (out|away|home)",
111
- # r"(all|these) .{0,30} (should die|must be killed|need to go)",
112
- # r"(death to|kill all|eliminate all) .{0,30}",
113
- # r"(inferior|subhuman|less than human|not human)",
114
- # ],
115
- # "bengali": [
116
- # r"বাঁচার অধিকার নেই",
117
- # r"মরে যাওয়া উচিত",
118
- # r"নিশ্চিহ্ন করা উচিত"
119
- # ]
120
- # }
121
 
122
- # self.offensive_keywords = {
123
- # "english": [
124
- # "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
125
- # "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
126
- # "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
127
- # ],
128
- # "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
129
- # }
130
 
131
  def _translate_to_english(self, text: str) -> Optional[str]:
132
  """Translate Bengali to English using deep-translator"""
 
29
  "weight": 0.4
30
  }
31
  }
32
+ print(f"Model directory: {self.models_dir}")
33
+ print(f"Model directory exists: {self.models_dir.exists()}")
34
+
35
+ if self.model_dir.exists():
36
+ print(f"Files in model directory: {list(self.model_dir.iterdir())}")
37
+
38
+ # English custom model paths
39
+ self.english_model_path = os.path.join(models_dir, "english_model.pkl")
40
+ self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
41
+ self.english_model = None
42
+ self.english_vectorizer = None
43
+ self.english_model_loaded = False
44
+
45
+ # Bengali custom model paths
46
+ self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
47
+ self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
48
+ self.bengali_model = None
49
+ self.bengali_vectorizer = None
50
+ self.bengali_model_loaded = False
51
+
52
+ # Load models
53
+ self._load_custom_models()
54
+
55
+ # Enhanced hate keywords
56
+ self.hate_keywords = {
57
+ "english": [
58
+ "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
59
+ "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
60
+ "terrorist", "racist", "sexist", "discrimination", "discriminate",
61
+ "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
62
+ "chamar", "bhangi", "sc/st", "reservation quota",
63
+ "no right to live", "don't deserve", "shouldn't exist", "subhuman",
64
+ "inferior", "worthless", "scum", "vermin", "parasite",
65
+ "should be killed", "must die", "deserve to die", "need to be eliminated",
66
+ "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
67
+ "nigger", "chink", "paki", "kike", "faggot", "tranny"
68
+ ],
69
+ "bengali": [
70
+ "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
71
+ "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
72
+ "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
73
+ "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
74
+ ]
75
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
+ self.hate_patterns = {
78
+ "english": [
79
+ r"no right to (live|exist|be here|survive)",
80
+ r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
81
+ r"don'?t deserve (to live|life|existence|to exist)",
82
+ r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
83
+ r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
84
+ r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
85
+ r"(send|throw|kick|drive) (them|back) (out|away|home)",
86
+ r"(all|these) .{0,30} (should die|must be killed|need to go)",
87
+ r"(death to|kill all|eliminate all) .{0,30}",
88
+ r"(inferior|subhuman|less than human|not human)",
89
+ ],
90
+ "bengali": [
91
+ r"বাঁচার অধিকার নেই",
92
+ r"মরে যাওয়া উচিত",
93
+ r"নিশ্চিহ্ন করা উচিত"
94
+ ]
95
+ }
96
 
97
+ self.offensive_keywords = {
98
+ "english": [
99
+ "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
100
+ "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
101
+ "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
102
+ ],
103
+ "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
104
+ }
105
 
106
  def _translate_to_english(self, text: str) -> Optional[str]:
107
  """Translate Bengali to English using deep-translator"""