| |
|
| | import subprocess
|
| | import sys
|
| | import time
|
| | from pathlib import Path
|
| |
|
def scale_training_data():
    """Download and process a 50,000-document dataset for training.

    Skips all work if the large dataset already exists. Otherwise backs up
    the current small dataset, runs ``data_prep.py`` as a subprocess, then
    renames its output and reports the resulting sequence count.

    Returns:
        str | None: Path to the large packed dataset as a string, or
        ``None`` if the subprocess failed, was interrupted, or produced
        no output file.
    """
    print("π MAP-NEO Mini Data Scaling")
    print("=" * 50)
    print("Target: 50,000 documents (10x current scale)")
    print("Expected result: ~25,000 training sequences")
    print("Estimated time: 45-60 minutes")
    print("=" * 50)

    # Fast path: nothing to do if the scaled dataset is already present.
    large_data = Path("data/tokens/packed_1024_large.txt")
    if large_data.exists():
        print("β Large dataset already exists!")
        print(f"Found: {large_data}")
        return str(large_data)

    # Preserve the existing small dataset before data_prep.py overwrites it.
    small_data = Path("data/tokens/packed_1024.txt")
    if small_data.exists():
        backup_path = Path("data/tokens/packed_1024_small_backup.txt")
        print(f"π Backing up current dataset to: {backup_path}")
        small_data.rename(backup_path)

    print("\nπ Starting data processing...")
    print("This will download and process 50,000 English documents")

    cmd = [
        sys.executable, "data_prep.py",
        "--num_docs", "50000",
        "--seq_length", "1024",
    ]

    start_time = time.time()

    try:
        # capture_output=False: stream the child's progress straight to the
        # console. The return value is unused; check=True raises on failure.
        subprocess.run(cmd, check=True, capture_output=False, text=True)

        elapsed = time.time() - start_time
        print(f"\nβ Data scaling completed in {elapsed/60:.1f} minutes!")

        # data_prep.py writes to the default path; rename so the small and
        # large datasets can coexist side by side.
        old_path = Path("data/tokens/packed_1024.txt")
        new_path = Path("data/tokens/packed_1024_large.txt")
        if old_path.exists():
            old_path.rename(new_path)
            print(f"π Large dataset saved as: {new_path}")

            # One packed sequence per line, so line count == sequence count.
            with open(new_path, 'r') as f:
                seq_count = sum(1 for _ in f)
            print(f"π Total sequences: {seq_count:,}")

            return str(new_path)
        else:
            print("β Expected output file not found")
            return None

    except subprocess.CalledProcessError as e:
        print("β Error in data processing:")
        print(f"Return code: {e.returncode}")
        return None
    except KeyboardInterrupt:
        print("\nβΉοΈ Process interrupted by user")
        return None
|
| |
|
def update_training_config():
    """Update train_neo.py to use large dataset.

    Rewrites the ``data_path`` default to point at the large dataset and
    doubles ``max_steps`` to 100,000. Each substitution is applied only if
    the exact original literal is present, so re-running is a harmless
    no-op; the file is rewritten unconditionally once it exists.
    """
    print("\nπ§ Updating training configuration...")

    train_file = Path("train_neo.py")
    if not train_file.exists():
        print("β train_neo.py not found")
        return

    content = train_file.read_text(encoding='utf-8')

    # Exact source literals to swap; str.replace is a no-op when absent.
    old_data_path = 'data_path: str = "data/tokens/packed_1024.txt"'
    new_data_path = 'data_path: str = "data/tokens/packed_1024_large.txt"'

    old_max_steps = 'max_steps: int = 50000'
    new_max_steps = 'max_steps: int = 100000'

    if old_data_path in content:
        content = content.replace(old_data_path, new_data_path)
        print("β Updated data_path to use large dataset")

    if old_max_steps in content:
        content = content.replace(old_max_steps, new_max_steps)
        print("β Updated max_steps to 100,000 for extended training")

    train_file.write_text(content, encoding='utf-8')
    print("πΎ Training configuration updated!")
|
| |
|
def main():
    """Run the scaling pipeline: build the dataset, then patch the config."""
    print("MAP-NEO Mini Data Scaling Pipeline")

    dataset_path = scale_training_data()

    # Guard clause: bail out early when scaling did not produce a dataset.
    if not dataset_path:
        print("\nβ Data scaling failed. Check the errors above.")
        return

    update_training_config()

    banner = "=" * 60
    print("\n" + banner)
    print("π DATA SCALING COMPLETE!")
    print(banner)
    print("Next steps:")
    for step in (
        "1. Your large dataset is ready for training",
        "2. Training config updated for 100k steps",
        "3. Run: python train_neo.py",
        "4. Expected training time: ~3-4 hours",
        "5. Expected quality: Much more coherent text!",
    ):
        print(step)
    print(banner)
|
| |
|
# Entry-point guard: lets the module be imported without side effects.
if __name__ == "__main__":
    main()
|
| |
|