| |
import os
import subprocess
import tarfile
import urllib.error
import urllib.request
from pathlib import Path
|
|
def run(cmd):
    """Run a command, echo its output, and raise if it fails.

    Args:
        cmd: The command to execute. A string is interpreted by the shell;
            a sequence of arguments is executed directly without a shell.

    Returns:
        The ``subprocess.CompletedProcess`` for the command, with stdout and
        stderr captured as text.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The exception carries the captured stdout and stderr so
            callers can inspect the failure.
    """
    print(f"Running: {cmd}")
    result = subprocess.run(
        cmd,
        # Only string commands need the shell; with shell=True an argument
        # list would have everything after its first item passed to the
        # shell itself instead of to the command.
        shell=isinstance(cmd, str),
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        # Attach the captured streams; otherwise they are lost to the caller.
        raise subprocess.CalledProcessError(
            result.returncode,
            cmd,
            output=result.stdout,
            stderr=result.stderr,
        )
    print(f"Success: {result.stdout}")
    return result
|
|
def install_spark_colab(spark_ver="3.5.6"):
    """Download Apache Spark, unpack it, and configure the current session.

    The Hadoop 3 binary distribution is downloaded into ``~/spark_install``,
    extracted there, and exposed through a ``~/spark`` symlink. ``SPARK_HOME``,
    ``PATH`` and ``PYSPARK_PYTHON`` are then set in ``os.environ`` for the
    running process. The process working directory is left untouched.

    Args:
        spark_ver: Spark release to install, e.g. ``"3.5.6"``.

    Returns:
        The Spark home path (the ``~/spark`` symlink) as a string.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded from
            either the primary mirror or the Apache archive.
        tarfile.TarError: If the downloaded archive cannot be extracted.
        OSError: If ``~/spark`` exists and is a real directory rather than a
            file or symlink, since it is never removed recursively.
    """
    spark_dir = f"spark-{spark_ver}-bin-hadoop3"
    archive_name = f"{spark_dir}.tgz"
    # downloads.apache.org only hosts current releases; superseded versions
    # are moved to archive.apache.org, so fall back to it on a 404.
    urls = (
        f"https://downloads.apache.org/spark/spark-{spark_ver}/{archive_name}",
        f"https://archive.apache.org/dist/spark/spark-{spark_ver}/{archive_name}",
    )

    # Work with absolute paths instead of os.chdir() so the caller's working
    # directory is not changed as a side effect.
    work_dir = Path.home() / "spark_install"
    work_dir.mkdir(parents=True, exist_ok=True)
    archive = work_dir / archive_name

    if not archive.exists():
        print(f"Downloading Spark {spark_ver}...")
        # Download to a temporary name and rename on success, so that an
        # interrupted download is never mistaken for a complete archive.
        partial = work_dir / f"{archive_name}.part"
        last = len(urls) - 1
        try:
            for index, url in enumerate(urls):
                try:
                    urllib.request.urlretrieve(url, str(partial))
                    break
                except urllib.error.HTTPError as err:
                    if err.code != 404 or index == last:
                        raise
            partial.replace(archive)
        finally:
            partial.unlink(missing_ok=True)

    print("Extracting archive...")
    with tarfile.open(archive, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuse members that would escape the destination directory.
            tar.extractall(path=work_dir, filter="data")
        else:
            # Older interpreters have no extraction filters.
            tar.extractall(path=work_dir)

    spark_home = Path.home() / "spark"
    # exists() follows symlinks and is False for a dangling link, which
    # would make symlink_to() fail; check for the link itself as well.
    if spark_home.is_symlink() or spark_home.exists():
        spark_home.unlink()
    spark_home.symlink_to(work_dir / spark_dir)

    os.environ["SPARK_HOME"] = str(spark_home)
    # Prepend bin/ and sbin/ only if absent so repeated calls do not keep
    # growing PATH with duplicate entries.
    path_entries = os.environ.get("PATH", "").split(os.pathsep)
    for sub in ("sbin", "bin"):
        entry = f"{spark_home}/{sub}"
        if entry not in path_entries:
            path_entries.insert(0, entry)
    os.environ["PATH"] = os.pathsep.join(path_entries)
    os.environ["PYSPARK_PYTHON"] = "python3"

    # The archive is no longer needed once extracted.
    archive.unlink(missing_ok=True)

    print(f"Spark installed to: {spark_home}")
    print("Environment variables set for this session!")
    print("\nTo verify installation:")
    print("!ls $SPARK_HOME")
    print("!spark-submit --version")

    return str(spark_home)
|
|
# Script entry point: when executed directly (rather than imported), perform
# the installation right away and keep the resulting Spark home path.
if __name__ == "__main__":
    spark_path = install_spark_colab()