"""Download the parquet file at ``url`` into memory and print a short summary.

The script streams the HTTP response body, loads it into a pandas DataFrame,
and prints the column names, the shape, and the first three rows.
"""

import io

import httpx
import pandas as pd

url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"

print("Downloading...")
with httpx.stream("GET", url) as r:
    # Fail fast on a non-success status; otherwise the error page's bytes
    # would be handed to the parquet reader and surface as a confusing
    # "not a parquet file" style error instead of an HTTP error.
    r.raise_for_status()
    # Join the chunks once at the end: repeated `bytes +=` re-copies the
    # whole accumulated buffer on every chunk.
    data = b"".join(r.iter_bytes())

print("Reading parquet...")
df = pd.read_parquet(io.BytesIO(data))
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print(df.head(3))