Neon-AI committed on
Commit
16ece2f
·
verified ·
1 Parent(s): e9fa8ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -14
app.py CHANGED
@@ -1,20 +1,17 @@
1
  import httpx
2
- import gzip
 
3
 
4
- url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2013-20/cc-index-table.paths.gz"
5
 
6
  print("Downloading...")
7
  with httpx.stream("GET", url) as r:
8
- with open("/data/cc-index-table.paths.gz", "wb") as f:
9
- for chunk in r.iter_bytes():
10
- f.write(chunk)
11
 
12
- print("Unzipping...")
13
- with gzip.open("/data/cc-index-table.paths.gz", "rb") as gz:
14
- content = gz.read()
15
- with open("/data/cc-index-table.paths.txt", "wb") as f:
16
- f.write(content)
17
-
18
- print("First 2 lines:")
19
- for line in content.decode("utf-8", errors="ignore").splitlines()[:2]:
20
- print(line)
 
1
  import httpx
2
+ import pandas as pd
3
+ import io
4
 
5
+ url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"
6
 
7
  print("Downloading...")
8
  with httpx.stream("GET", url) as r:
9
+ data = b""
10
+ for chunk in r.iter_bytes():
11
+ data += chunk
12
 
13
+ print("Reading parquet...")
14
+ df = pd.read_parquet(io.BytesIO(data))
15
+ print("Columns:", df.columns.tolist())
16
+ print("Shape:", df.shape)
17
+ print(df.head(3))