Spaces:

Neon-AI
/

Crawl

Runtime error

Neon-AI committed 5 days ago

Commit

16ece2f

verified ·

1 Parent(s): e9fa8ed

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,20 +1,17 @@
 import httpx
-import gzip
-url = "https://data.commoncrawl.org/crawl-data/CC-MAIN-2013-20/cc-index-table.paths.gz"
 print("Downloading...")
 with httpx.stream("GET", url) as r:
-    with open("/data/cc-index-table.paths.gz", "wb") as f:
-        for chunk in r.iter_bytes():
-            f.write(chunk)
-print("Unzipping...")
-with gzip.open("/data/cc-index-table.paths.gz", "rb") as gz:
-    content = gz.read()
-    with open("/data/cc-index-table.paths.txt", "wb") as f:
-        f.write(content)
-print("First 2 lines:")
-for line in content.decode("utf-8", errors="ignore").splitlines()[:2]:
-    print(line)

 import httpx
+import pandas as pd
+import io
+url = "https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl=CC-MAIN-2013-20/subset=warc/part-00000-6ac52f25-05a1-4998-adf1-b8c830c08eec.c000.gz.parquet"
 print("Downloading...")
 with httpx.stream("GET", url) as r:
+    data = b""
+    for chunk in r.iter_bytes():
+        data += chunk
+print("Reading parquet...")
+df = pd.read_parquet(io.BytesIO(data))
+print("Columns:", df.columns.tolist())
+print("Shape:", df.shape)
+print(df.head(3))