{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LiveBench 2024-07: keep the top-scoring questions per subtask\n",
    "\n",
    "This notebook loads the `2024-07` config of `lmms-lab/LiveBench`, keeps the rows that have a `checker`, drops one excluded question id, selects the 50 highest-scoring questions of each subtask, and pushes the result to the Hub.\n",
    "\n",
    "**Warning:** the last cell pushes to the same repo, config and split that the first code cells load from. Re-running the notebook after a push therefore starts from the already-filtered data.\n",
    "\n",
    "Numbers from the recorded run of the original notebook (outputs were cleared because the code changed):\n",
    "\n",
    "- 404 examples were generated for the `test` split on load.\n",
    "- The selection contained 250 rows, 50 for each of the 5 subtasks.\n",
    "- The mean `score` of the selection was 9.128."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from datasets import Dataset, Features, load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "REPO_ID = \"lmms-lab/LiveBench\"\n",
    "CONFIG_NAME = \"2024-07\"\n",
    "SPLIT = \"test\"\n",
    "\n",
    "SEED = 42  # fixes the shuffle so the tie-break between equal scores is repeatable\n",
    "TOP_N = 50  # number of questions kept per subtask\n",
    "EXCLUDED_ID = 35  # question id dropped before selection; reason not recorded here -- TODO document"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the source data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = load_dataset(REPO_ID, CONFIG_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose `checker` field is filled in.\n",
    "all_df = raw[SPLIT].to_pandas()\n",
    "checked_df = all_df[all_df[\"checker\"].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "checked_df[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shuffle and drop the excluded id\n",
    "\n",
    "`nlargest` keeps the earliest rows among equal scores, so the row order decides which of the tied questions are selected. Shuffling first makes that choice random; the fixed seed makes it reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shuffled_df = checked_df.sample(frac=1, random_state=SEED).reset_index(drop=True)\n",
    "candidates_df = shuffled_df[shuffled_df[\"id\"] != EXCLUDED_ID]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select the top questions of every subtask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Iterating the groupby (instead of `groupby.apply`) avoids the pandas deprecation\n",
    "# about applying on the grouping column and keeps `subtask` in every group.\n",
    "top_per_subtask = pd.concat(\n",
    "    [group.nlargest(TOP_N, \"score\") for _, group in candidates_df.groupby(\"subtask\")],\n",
    "    ignore_index=True,\n",
    ")\n",
    "len(top_per_subtask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_per_subtask[\"subtask\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(top_per_subtask[\"score\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invariants of the selection, checked before anything is uploaded.\n",
    "assert (top_per_subtask[\"subtask\"].value_counts() <= TOP_N).all()\n",
    "assert (top_per_subtask[\"id\"] != EXCLUDED_ID).all()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert back to a `Dataset`\n",
    "\n",
    "The schema is spelled out explicitly so that `images` is stored as a sequence of images and `id` / `score` as `int32`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = Features(\n",
    "    {\n",
    "        \"id\": datasets.Value(\"int32\"),\n",
    "        \"images\": datasets.Sequence(datasets.Image()),\n",
    "        \"website\": datasets.Value(\"string\"),\n",
    "        \"question\": datasets.Value(\"string\"),\n",
    "        \"answer\": datasets.Value(\"string\"),\n",
    "        \"criteria\": datasets.Value(\"string\"),\n",
    "        \"subtask\": datasets.Value(\"string\"),\n",
    "        \"data_generator\": datasets.Value(\"string\"),\n",
    "        \"checker\": datasets.Value(\"string\"),\n",
    "        \"date_time\": datasets.Value(\"string\"),\n",
    "        \"screen_shoter\": datasets.Value(\"string\"),\n",
    "        \"screen_size\": datasets.Value(\"string\"),\n",
    "        \"score\": datasets.Value(\"int32\"),\n",
    "        \"reason\": datasets.Value(\"string\"),\n",
    "        \"scorer_name\": datasets.Value(\"string\"),\n",
    "    }\n",
    ")\n",
    "\n",
    "top_dataset = Dataset.from_pandas(top_per_subtask, features=features)\n",
    "top_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_dataset.features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Push to the Hub\n",
    "\n",
    "This uploads to the repo, config and split the data was loaded from (see the warning at the top)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_dataset.push_to_hub(REPO_ID, CONFIG_NAME, split=SPLIT)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "live_bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}