"""Download one parquet part file and print a quick preview of it.

The file is fetched over HTTP into memory, parsed with pandas, and its
column names, shape, and first three rows are printed to stdout.
"""
import io

import httpx
import pandas as pd

# Part file from the Common Crawl cc-index table (crawl CC-MAIN-2013-20,
# subset "warc"), as encoded in the URL path.
url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"

print("Downloading...")
with httpx.stream("GET", url) as r:
    # Fail here on an HTTP error instead of handing an error page to the
    # parquet reader, which would report a confusing "not a parquet file".
    r.raise_for_status()
    # Join the streamed chunks once; repeated `bytes +=` re-copies the
    # accumulated payload on every chunk.
    data = b"".join(r.iter_bytes())

print("Reading parquet...")
# read_parquet needs a seekable file-like object, so wrap the raw bytes.
df = pd.read_parquet(io.BytesIO(data))

print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print(df.head(3))