File size: 489 Bytes
7e80c08
16ece2f
 
3180e55
16ece2f
7e80c08
 
 
16ece2f
 
 
7e80c08
16ece2f
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import httpx
import pandas as pd
import io

# One parquet part file of the Common Crawl cc-index table; per the URL path
# this is crawl=CC-MAIN-2013-20, subset=warc, part-00000.
url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"

print("Downloading...")
with httpx.stream("GET", url) as r:
    # Fail fast on 4xx/5xx responses: without this check an HTTP error body
    # would be handed to the parquet reader and surface as a misleading
    # "not a parquet file" style error instead of the real HTTP failure.
    r.raise_for_status()
    # Collect the streamed chunks with a single join rather than repeated
    # `bytes +=`, which re-copies the accumulated payload on every chunk.
    data = b"".join(r.iter_bytes())

print("Reading parquet...")
# read_parquet needs a seekable file-like object, so wrap the raw bytes.
df = pd.read_parquet(io.BytesIO(data))

# Quick inspection of the schema, the size, and the first few rows.
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print(df.head(3))