Spaces:
Sleeping
fix filesystem permissions: use /tmp for checkpoints and database
Browse files
- src/advanced_reddit_scraper.py +14 -9
- src/streamlit_app.py +1 -2
src/advanced_reddit_scraper.py
CHANGED
|
@@ -330,10 +330,13 @@ class UserHistoryCollector:
|
|
| 330 |
|
| 331 |
class CheckpointManager:
|
| 332 |
"""Manages checkpoint saving and restoration for long-running operations"""
|
| 333 |
-
|
| 334 |
-
def __init__(self, checkpoint_dir: str =
|
|
|
|
|
|
|
|
|
|
| 335 |
self.checkpoint_dir = Path(checkpoint_dir)
|
| 336 |
-
self.checkpoint_dir.mkdir(exist_ok=True)
|
| 337 |
|
| 338 |
def save_checkpoint(self, state: Dict, checkpoint_name: str):
|
| 339 |
"""Save current state to checkpoint file"""
|
|
@@ -397,10 +400,10 @@ class AdvancedRedditScraper:
|
|
| 397 |
- Database persistence
|
| 398 |
"""
|
| 399 |
|
| 400 |
-
def __init__(self, client_id: str, client_secret: str, user_agent: str,
|
| 401 |
-
db_path: str =
|
| 402 |
"""Initialize advanced scraper with all components"""
|
| 403 |
-
|
| 404 |
# Reddit instance
|
| 405 |
self.reddit = praw.Reddit(
|
| 406 |
client_id=client_id,
|
|
@@ -408,14 +411,16 @@ class AdvancedRedditScraper:
|
|
| 408 |
user_agent=user_agent,
|
| 409 |
check_for_async=False
|
| 410 |
)
|
| 411 |
-
|
| 412 |
# Components
|
| 413 |
self.backoff = ExponentialBackoff(base_delay=1.0, max_delay=60.0)
|
| 414 |
self.hierarchy_tracker = CommentHierarchyTracker()
|
| 415 |
self.user_collector = UserHistoryCollector(self.reddit, self.backoff)
|
| 416 |
self.checkpoint_manager = CheckpointManager()
|
| 417 |
-
|
| 418 |
-
# Database setup
|
|
|
|
|
|
|
| 419 |
self.db_path = db_path
|
| 420 |
self._init_database()
|
| 421 |
|
|
|
|
| 330 |
|
| 331 |
class CheckpointManager:
|
| 332 |
"""Manages checkpoint saving and restoration for long-running operations"""
|
| 333 |
+
|
| 334 |
+
def __init__(self, checkpoint_dir: str = None):
|
| 335 |
+
# Use /tmp for HuggingFace Spaces compatibility (read-only filesystem)
|
| 336 |
+
if checkpoint_dir is None:
|
| 337 |
+
checkpoint_dir = os.environ.get('CHECKPOINT_DIR', '/tmp/checkpoints')
|
| 338 |
self.checkpoint_dir = Path(checkpoint_dir)
|
| 339 |
+
self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
|
| 340 |
|
| 341 |
def save_checkpoint(self, state: Dict, checkpoint_name: str):
|
| 342 |
"""Save current state to checkpoint file"""
|
|
|
|
| 400 |
- Database persistence
|
| 401 |
"""
|
| 402 |
|
| 403 |
+
def __init__(self, client_id: str, client_secret: str, user_agent: str,
|
| 404 |
+
db_path: str = None):
|
| 405 |
"""Initialize advanced scraper with all components"""
|
| 406 |
+
|
| 407 |
# Reddit instance
|
| 408 |
self.reddit = praw.Reddit(
|
| 409 |
client_id=client_id,
|
|
|
|
| 411 |
user_agent=user_agent,
|
| 412 |
check_for_async=False
|
| 413 |
)
|
| 414 |
+
|
| 415 |
# Components
|
| 416 |
self.backoff = ExponentialBackoff(base_delay=1.0, max_delay=60.0)
|
| 417 |
self.hierarchy_tracker = CommentHierarchyTracker()
|
| 418 |
self.user_collector = UserHistoryCollector(self.reddit, self.backoff)
|
| 419 |
self.checkpoint_manager = CheckpointManager()
|
| 420 |
+
|
| 421 |
+
# Database setup - use /tmp for HuggingFace Spaces
|
| 422 |
+
if db_path is None:
|
| 423 |
+
db_path = os.environ.get('DB_PATH', '/tmp/reddit_data.db')
|
| 424 |
self.db_path = db_path
|
| 425 |
self._init_database()
|
| 426 |
|
src/streamlit_app.py
CHANGED
|
@@ -824,8 +824,7 @@ def main():
|
|
| 824 |
)
|
| 825 |
if ADVANCED_FEATURES:
|
| 826 |
st.session_state.advanced_scraper = AdvancedRedditScraper(
|
| 827 |
-
client_id, client_secret, user_agent
|
| 828 |
-
db_path="reddit_research.db"
|
| 829 |
)
|
| 830 |
st.success("✅ Scrapers initialized successfully (with advanced features)!")
|
| 831 |
else:
|
|
|
|
| 824 |
)
|
| 825 |
if ADVANCED_FEATURES:
|
| 826 |
st.session_state.advanced_scraper = AdvancedRedditScraper(
|
| 827 |
+
client_id, client_secret, user_agent
|
|
|
|
| 828 |
)
|
| 829 |
st.success("✅ Scrapers initialized successfully (with advanced features)!")
|
| 830 |
else:
|