Crawl / app.py
Neon-AI's picture
Update app.py
16ece2f verified
raw
history blame contribute delete
489 Bytes
import httpx
import pandas as pd
import io
# Script: download one Parquet shard of the Common Crawl columnar index into
# memory and print a quick summary (column names, shape, first three rows).
url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"

print("Downloading...")
with httpx.stream("GET", url) as r:
    # Fail fast on a non-success status so an error page is never handed to
    # the parquet reader, where it would surface as a confusing parse error.
    r.raise_for_status()
    # Collect the chunks and join once: repeated `bytes +=` re-copies the
    # whole accumulated buffer on every chunk, which is quadratic for a
    # large download.
    chunks = list(r.iter_bytes())
data = b"".join(chunks)
# Drop the chunk list so only one copy of the payload stays referenced.
del chunks

print("Reading parquet...")
# read_parquet accepts a file-like object, so wrap the in-memory bytes.
df = pd.read_parquet(io.BytesIO(data))

print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print(df.head(3))