sgAtdbd committed on
Commit
cb75d9a
·
verified ·
1 Parent(s): 1e84de6

Update models/hate_speech_classifier.py

Browse files
Files changed (1) hide show
  1. models/hate_speech_classifier.py +100 -100
models/hate_speech_classifier.py CHANGED
@@ -5,39 +5,15 @@ import os
5
  import re
6
  import torch
7
  from deep_translator import GoogleTranslator
8
- from pathlib import Path
9
 
10
 
11
  class HateSpeechClassifier:
12
- # def __init__(self):
13
- # base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14
- # models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
15
-
16
- # # Initialize translator
17
- # self.translator = GoogleTranslator(source='bn', target='en')
18
-
19
- # # Use multiple pretrained models for better accuracy
20
- # self.pretrained_models = {
21
- # "primary": {
22
- # "name": "facebook/roberta-hate-speech-dynabench-r4-target",
23
- # "pipeline": None,
24
- # "weight": 0.6
25
- # },
26
- # "secondary": {
27
- # "name": "cardiffnlp/twitter-roberta-base-hate-latest",
28
- # "pipeline": None,
29
- # "weight": 0.4
30
- # }
31
- # }
32
  def __init__(self):
33
- # Get absolute path to model weights
34
- base_dir = Path(__file__).parent
35
- self.model_dir = base_dir / 'model_weights' / 'custom_models'
36
-
37
- # For Hugging Face Spaces, also check environment
38
- if not self.model_dir.exists():
39
- # Try alternative paths for deployed environment
40
- self.model_dir = Path('/app/models/model_weights/custom_models')
41
  self.translator = GoogleTranslator(source='bn', target='en')
42
 
43
  # Use multiple pretrained models for better accuracy
@@ -53,80 +29,104 @@ class HateSpeechClassifier:
53
  "weight": 0.4
54
  }
55
  }
 
 
 
 
 
 
 
 
 
 
56
 
57
- print(f"Model directory: {self.model_dir}")
58
- print(f"Model directory exists: {self.model_dir.exists()}")
59
-
60
- if self.model_dir.exists():
61
- print(f"Files in model directory: {list(self.model_dir.iterdir())}")
62
-
63
- # English custom model paths
64
- self.english_model_path = os.path.join(models_dir, "english_model.pkl")
65
- self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
66
- self.english_model = None
67
- self.english_vectorizer = None
68
- self.english_model_loaded = False
69
-
70
- # Bengali custom model paths
71
- self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
72
- self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
73
- self.bengali_model = None
74
- self.bengali_vectorizer = None
75
- self.bengali_model_loaded = False
76
-
77
- # Load models
78
- self._load_custom_models()
79
-
80
- # Enhanced hate keywords
81
- self.hate_keywords = {
82
- "english": [
83
- "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
84
- "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
85
- "terrorist", "racist", "sexist", "discrimination", "discriminate",
86
- "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
87
- "chamar", "bhangi", "sc/st", "reservation quota",
88
- "no right to live", "don't deserve", "shouldn't exist", "subhuman",
89
- "inferior", "worthless", "scum", "vermin", "parasite",
90
- "should be killed", "must die", "deserve to die", "need to be eliminated",
91
- "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
92
- "nigger", "chink", "paki", "kike", "faggot", "tranny"
93
- ],
94
- "bengali": [
95
- "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
96
- "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
97
- "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
98
- "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
99
- ]
100
- }
101
 
102
- self.hate_patterns = {
103
- "english": [
104
- r"no right to (live|exist|be here|survive)",
105
- r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
106
- r"don'?t deserve (to live|life|existence|to exist)",
107
- r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
108
- r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
109
- r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
110
- r"(send|throw|kick|drive) (them|back) (out|away|home)",
111
- r"(all|these) .{0,30} (should die|must be killed|need to go)",
112
- r"(death to|kill all|eliminate all) .{0,30}",
113
- r"(inferior|subhuman|less than human|not human)",
114
- ],
115
- "bengali": [
116
- r"বাঁচার অধিকার নেই",
117
- r"মরে যাওয়া উচিত",
118
- r"নিশ্চিহ্ন করা উচিত"
119
- ]
120
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- self.offensive_keywords = {
123
- "english": [
124
- "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
125
- "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
126
- "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
127
- ],
128
- "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
129
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  def _translate_to_english(self, text: str) -> Optional[str]:
132
  """Translate Bengali to English using deep-translator"""
 
5
  import re
6
  import torch
7
  from deep_translator import GoogleTranslator
8
+ # from pathlib import Path
9
 
10
 
11
  class HateSpeechClassifier:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def __init__(self):
13
+ base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
14
+ models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
15
+
16
+ # Initialize translator
 
 
 
 
17
  self.translator = GoogleTranslator(source='bn', target='en')
18
 
19
  # Use multiple pretrained models for better accuracy
 
29
  "weight": 0.4
30
  }
31
  }
32
+ # def __init__(self):
33
+ # # Get absolute path to model weights
34
+ # base_dir = Path(__file__).parent
35
+ # self.model_dir = base_dir / 'model_weights' / 'custom_models'
36
+
37
+ # # For Hugging Face Spaces, also check environment
38
+ # if not self.model_dir.exists():
39
+ # # Try alternative paths for deployed environment
40
+ # self.model_dir = Path('/app/models/model_weights/custom_models')
41
+ # self.translator = GoogleTranslator(source='bn', target='en')
42
 
43
+ # # Use multiple pretrained models for better accuracy
44
+ # self.pretrained_models = {
45
+ # "primary": {
46
+ # "name": "facebook/roberta-hate-speech-dynabench-r4-target",
47
+ # "pipeline": None,
48
+ # "weight": 0.6
49
+ # },
50
+ # "secondary": {
51
+ # "name": "cardiffnlp/twitter-roberta-base-hate-latest",
52
+ # "pipeline": None,
53
+ # "weight": 0.4
54
+ # }
55
+ # }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ # print(f"Model directory: {self.model_dir}")
58
+ # print(f"Model directory exists: {self.model_dir.exists()}")
59
+
60
+ # if self.model_dir.exists():
61
+ # print(f"Files in model directory: {list(self.model_dir.iterdir())}")
62
+
63
+ # # English custom model paths
64
+ # self.english_model_path = os.path.join(models_dir, "english_model.pkl")
65
+ # self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
66
+ # self.english_model = None
67
+ # self.english_vectorizer = None
68
+ # self.english_model_loaded = False
69
+
70
+ # # Bengali custom model paths
71
+ # self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
72
+ # self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
73
+ # self.bengali_model = None
74
+ # self.bengali_vectorizer = None
75
+ # self.bengali_model_loaded = False
76
+
77
+ # # Load models
78
+ # self._load_custom_models()
79
+
80
+ # # Enhanced hate keywords
81
+ # self.hate_keywords = {
82
+ # "english": [
83
+ # "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
84
+ # "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
85
+ # "terrorist", "racist", "sexist", "discrimination", "discriminate",
86
+ # "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
87
+ # "chamar", "bhangi", "sc/st", "reservation quota",
88
+ # "no right to live", "don't deserve", "shouldn't exist", "subhuman",
89
+ # "inferior", "worthless", "scum", "vermin", "parasite",
90
+ # "should be killed", "must die", "deserve to die", "need to be eliminated",
91
+ # "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
92
+ # "nigger", "chink", "paki", "kike", "faggot", "tranny"
93
+ # ],
94
+ # "bengali": [
95
+ # "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
96
+ # "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
97
+ # "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
98
+ # "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
99
+ # ]
100
+ # }
101
 
102
+ # self.hate_patterns = {
103
+ # "english": [
104
+ # r"no right to (live|exist|be here|survive)",
105
+ # r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
106
+ # r"don'?t deserve (to live|life|existence|to exist)",
107
+ # r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
108
+ # r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
109
+ # r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
110
+ # r"(send|throw|kick|drive) (them|back) (out|away|home)",
111
+ # r"(all|these) .{0,30} (should die|must be killed|need to go)",
112
+ # r"(death to|kill all|eliminate all) .{0,30}",
113
+ # r"(inferior|subhuman|less than human|not human)",
114
+ # ],
115
+ # "bengali": [
116
+ # r"বাঁচার অধিকার নেই",
117
+ # r"মরে যাওয়া উচিত",
118
+ # r"নিশ্চিহ্ন করা উচিত"
119
+ # ]
120
+ # }
121
+
122
+ # self.offensive_keywords = {
123
+ # "english": [
124
+ # "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
125
+ # "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
126
+ # "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
127
+ # ],
128
+ # "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
129
+ # }
130
 
131
  def _translate_to_english(self, text: str) -> Optional[str]:
132
  """Translate Bengali to English using deep-translator"""