{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading readme: 100%|██████████| 2.45k/2.45k [00:00<00:00, 5.76MB/s]\n",
      "Downloading data: 100%|██████████| 182M/182M [00:12<00:00, 14.3MB/s] \n",
      "Generating test split: 100%|██████████| 404/404 [00:01<00:00, 364.97 examples/s]\n"
     ]
    }
   ],
   "source": [
    "data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = data[\"test\"].to_pandas()\n",
    "df = df[df[\"checker\"].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "subtask\n",
       "Contextual Analysis     103\n",
       "Deeper Implications      97\n",
       "Broader Implications     79\n",
       "Concrete Recognition      59\n",
       "Further Insights         55\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.sample(frac=1).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df[\"id\"] != 35]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_425178/992929014.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  top_50_per_subtask = df.groupby(\"subtask\").apply(lambda x: x.nlargest(50, \"score\"))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "250"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_50_per_subtask = df.groupby(\"subtask\").apply(lambda x: x.nlargest(50, \"score\"))\n",
    "top_50_per_subtask.reset_index(drop=True, inplace=True)\n",
    "len(top_50_per_subtask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "subtask\n",
       "Concrete Recognition     50\n",
       "Broader Implications    50\n",
       "Contextual Analysis     50\n",
       "Deeper Implications     50\n",
       "Further Insights        50\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_50_per_subtask[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9.128"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.mean(top_50_per_subtask[\"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset, Features\n",
    "import datasets\n",
    "\n",
    "\n",
    "def gen():\n",
    "    for d in top_50_per_subtask:\n",
    "        yield d\n",
    "\n",
    "\n",
    "data = Dataset.from_pandas(\n",
    "    top_50_per_subtask,\n",
    "    features=Features(\n",
    "        {\n",
    "            \"id\": datasets.Value(\"int32\"),\n",
    "            \"images\": datasets.Sequence(datasets.Image()),\n",
    "            \"website\": datasets.Value(\"string\"),\n",
    "            \"question\": datasets.Value(\"string\"),\n",
    "            \"answer\": datasets.Value(\"string\"),\n",
    "            \"criteria\": datasets.Value(\"string\"),\n",
    "            \"subtask\": datasets.Value(\"string\"),\n",
    "            \"data_generator\": datasets.Value(\"string\"),\n",
    "            \"checker\": datasets.Value(\"string\"),\n",
    "            \"date_time\": datasets.Value(\"string\"),\n",
    "            \"screen_shoter\": datasets.Value(\"string\"),\n",
    "            \"screen_size\": datasets.Value(\"string\"),\n",
    "            \"score\": datasets.Value(\"int32\"),\n",
    "            \"reason\": datasets.Value(\"string\"),\n",
    "            \"scorer_name\": datasets.Value(\"string\"),\n",
    "        }\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'images', 'website', 'question', 'answer', 'criteria', 'subtask', 'data_generator', 'checker', 'date_time', 'screen_shoter', 'screen_size', 'score', 'reason', 'scorer_name'],\n",
       "    num_rows: 250\n",
       "})"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': Value(dtype='int32', id=None),\n",
       " 'images': Sequence(feature=Image(mode=None, decode=True, id=None), length=-1, id=None),\n",
       " 'website': Value(dtype='string', id=None),\n",
       " 'question': Value(dtype='string', id=None),\n",
       " 'answer': Value(dtype='string', id=None),\n",
       " 'criteria': Value(dtype='string', id=None),\n",
       " 'subtask': Value(dtype='string', id=None),\n",
       " 'data_generator': Value(dtype='string', id=None),\n",
       " 'checker': Value(dtype='string', id=None),\n",
       " 'date_time': Value(dtype='string', id=None),\n",
       " 'screen_shoter': Value(dtype='string', id=None),\n",
       " 'screen_size': Value(dtype='string', id=None),\n",
       " 'score': Value(dtype='int32', id=None),\n",
       " 'reason': Value(dtype='string', id=None),\n",
       " 'scorer_name': Value(dtype='string', id=None)}"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 250/250 [00:00<00:00, 314.90 examples/s]it/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  5.14ba/s]\n",
      "Uploading the dataset shards: 100%|██████████| 1/1 [00:19<00:00, 19.84s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/lmms-lab/LiveBench/commit/19dd279ffe05a6911e11f93a1e9699b6daf7be4e', commit_message='Upload dataset', commit_description='', oid='19dd279ffe05a6911e11f93a1e9699b6daf7be4e', pr_url=None, pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-07\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "live_bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}