milwright committed on
Commit
abf06a0
·
1 Parent(s): ac1b924

fix filesystem permissions: use /tmp for checkpoints and database

Browse files
src/advanced_reddit_scraper.py CHANGED
@@ -330,10 +330,13 @@ class UserHistoryCollector:
330
 
331
  class CheckpointManager:
332
  """Manages checkpoint saving and restoration for long-running operations"""
333
-
334
- def __init__(self, checkpoint_dir: str = "./checkpoints"):
 
 
 
335
  self.checkpoint_dir = Path(checkpoint_dir)
336
- self.checkpoint_dir.mkdir(exist_ok=True)
337
 
338
  def save_checkpoint(self, state: Dict, checkpoint_name: str):
339
  """Save current state to checkpoint file"""
@@ -397,10 +400,10 @@ class AdvancedRedditScraper:
397
  - Database persistence
398
  """
399
 
400
- def __init__(self, client_id: str, client_secret: str, user_agent: str,
401
- db_path: str = "reddit_data.db"):
402
  """Initialize advanced scraper with all components"""
403
-
404
  # Reddit instance
405
  self.reddit = praw.Reddit(
406
  client_id=client_id,
@@ -408,14 +411,16 @@ class AdvancedRedditScraper:
408
  user_agent=user_agent,
409
  check_for_async=False
410
  )
411
-
412
  # Components
413
  self.backoff = ExponentialBackoff(base_delay=1.0, max_delay=60.0)
414
  self.hierarchy_tracker = CommentHierarchyTracker()
415
  self.user_collector = UserHistoryCollector(self.reddit, self.backoff)
416
  self.checkpoint_manager = CheckpointManager()
417
-
418
- # Database setup
 
 
419
  self.db_path = db_path
420
  self._init_database()
421
 
 
330
 
331
  class CheckpointManager:
332
  """Manages checkpoint saving and restoration for long-running operations"""
333
+
334
+ def __init__(self, checkpoint_dir: str = None):
335
+ # Use /tmp for HuggingFace Spaces compatibility (read-only filesystem)
336
+ if checkpoint_dir is None:
337
+ checkpoint_dir = os.environ.get('CHECKPOINT_DIR', '/tmp/checkpoints')
338
  self.checkpoint_dir = Path(checkpoint_dir)
339
+ self.checkpoint_dir.mkdir(exist_ok=True, parents=True)
340
 
341
  def save_checkpoint(self, state: Dict, checkpoint_name: str):
342
  """Save current state to checkpoint file"""
 
400
  - Database persistence
401
  """
402
 
403
+ def __init__(self, client_id: str, client_secret: str, user_agent: str,
404
+ db_path: str = None):
405
  """Initialize advanced scraper with all components"""
406
+
407
  # Reddit instance
408
  self.reddit = praw.Reddit(
409
  client_id=client_id,
 
411
  user_agent=user_agent,
412
  check_for_async=False
413
  )
414
+
415
  # Components
416
  self.backoff = ExponentialBackoff(base_delay=1.0, max_delay=60.0)
417
  self.hierarchy_tracker = CommentHierarchyTracker()
418
  self.user_collector = UserHistoryCollector(self.reddit, self.backoff)
419
  self.checkpoint_manager = CheckpointManager()
420
+
421
+ # Database setup - use /tmp for HuggingFace Spaces
422
+ if db_path is None:
423
+ db_path = os.environ.get('DB_PATH', '/tmp/reddit_data.db')
424
  self.db_path = db_path
425
  self._init_database()
426
 
src/streamlit_app.py CHANGED
@@ -824,8 +824,7 @@ def main():
824
  )
825
  if ADVANCED_FEATURES:
826
  st.session_state.advanced_scraper = AdvancedRedditScraper(
827
- client_id, client_secret, user_agent,
828
- db_path="reddit_research.db"
829
  )
830
  st.success("✅ Scrapers initialized successfully (with advanced features)!")
831
  else:
 
824
  )
825
  if ADVANCED_FEATURES:
826
  st.session_state.advanced_scraper = AdvancedRedditScraper(
827
+ client_id, client_secret, user_agent
 
828
  )
829
  st.success("✅ Scrapers initialized successfully (with advanced features)!")
830
  else: