Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
·
d1a58c9
1
Parent(s):
37b8c09
curation part 1
Browse files
- app.py +0 -1
- datacards/curation.py +146 -3
- datacards/gem.py +0 -3
- datacards/overview.py +11 -1
app.py
CHANGED
|
@@ -77,7 +77,6 @@ def main():
|
|
| 77 |
|
| 78 |
def glance_page():
|
| 79 |
with st.expander("Dataset at a Glance", expanded=True):
|
| 80 |
-
st.markdown(f"### Dataset Name: {st.session_state.save_state.get('dataset_name', '')}")
|
| 81 |
dataset_summary = ""
|
| 82 |
dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
|
| 83 |
dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
|
|
|
|
| 77 |
|
| 78 |
def glance_page():
|
| 79 |
with st.expander("Dataset at a Glance", expanded=True):
|
|
|
|
| 80 |
dataset_summary = ""
|
| 81 |
dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
|
| 82 |
dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
|
datacards/curation.py
CHANGED
|
@@ -4,10 +4,153 @@ from .streamlit_utils import (
|
|
| 4 |
make_text_input
|
| 5 |
)
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def curation_page():
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def curation_summary():
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
make_text_input
|
| 5 |
)
|
| 6 |
|
| 7 |
+
from .streamlit_utils import (
|
| 8 |
+
make_multiselect,
|
| 9 |
+
make_selectbox,
|
| 10 |
+
make_text_area,
|
| 11 |
+
make_text_input,
|
| 12 |
+
make_radio,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# Number of form fields expected in each sub-section of the curation page.
# Sub-sections whose expander does not define any field yet are set to 0.
N_FIELDS_ORIGINAL = 4
N_FIELDS_LANGUAGE = 12
N_FIELDS_ANNOTATIONS = 0
N_FIELDS_CONSENT = 0
N_FIELDS_PII = 0
N_FIELDS_MAINTENANCE = 0
N_FIELDS_GEM = 0

# Total number of fields over all curation sub-sections.
N_FIELDS = (
    N_FIELDS_ORIGINAL
    + N_FIELDS_LANGUAGE
    + N_FIELDS_ANNOTATIONS
    + N_FIELDS_CONSENT
    + N_FIELDS_PII
    + N_FIELDS_MAINTENANCE
    + N_FIELDS_GEM
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
"""
|
| 33 |
+
What was the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
|
| 34 |
+
"""
|
| 35 |
|
| 36 |
def curation_page():
    """Render the dataset-curation form, one expander per sub-section.

    Ensures ``st.session_state.card_dict["curation"]`` and the entry of each
    sub-section exist, then draws the sub-section's widgets through the
    ``make_*`` helpers.  Every widget receives a ``key_list`` of the form
    ``["curation", <sub-section>, <field>]``.

    NOTE(review): the ``make_*`` helpers come from ``streamlit_utils``;
    presumably they store each answer in ``card_dict`` under ``key_list``
    -- confirm against that module.
    """
    # setdefault keeps an existing entry and only creates the dict when missing.
    st.session_state.card_dict.setdefault("curation", {})

    with st.expander("Original Curation", expanded=False):
        st.session_state.card_dict["curation"].setdefault("original", {})
        key_pref = ["curation", "original"]
        make_text_area(
            label="Original curation rationale",
            key_list=key_pref + ["rationale"],
            help="Describe the curation rationale behind the original dataset(s).",
        )
        make_text_area(
            label="What was the communicative goal?",
            key_list=key_pref + ["communicative"],
            help="Describe the communicative goal that the original dataset(s) was trying to represent.",
        )
        make_radio(
            label="Is the dataset aggregated from different data sources?",
            options=["no", "yes"],
            key_list=key_pref + ["is-aggregated"],
            help="e.g. Wikipedia, movie dialogues, etc.",
        )
        make_text_area(
            label="If yes, list the sources",
            key_list=key_pref + ["aggregated-sources"],
            help="Otherwise, type N/A",
        )

    with st.expander("Language Data", expanded=False):
        st.session_state.card_dict["curation"].setdefault("language", {})
        key_pref = ["curation", "language"]
        make_multiselect(
            label="How was the language data obtained?",
            options=["found", "created for the dataset", "crowdsourced", "machine-generated", "other"],
            key_list=key_pref + ["obtained"],
        )
        make_multiselect(
            label="If found, where from?",
            options=["website", "offline media collection", "other", "N/A"],
            key_list=key_pref + ["found"],
            help="select N/A if none of the language data was found",
        )
        make_multiselect(
            label="If crowdsourced, where from?",
            options=["Amazon Mechanical Turk", "other crowdworker platform", "participatory experiment", "other", "N/A"],
            key_list=key_pref + ["crowdsourced"],
            help="select N/A if none of the language data was crowdsourced",
        )
        make_text_area(
            label="If created for the dataset, describe the creation process.",
            key_list=key_pref + ["created"],
        )
        make_text_area(
            label="What further information do we have on the language producers?",
            key_list=key_pref + ["producers-description"],
            help="Provide a description of the context in which the language was produced and who produced it.",
        )
        make_text_input(
            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
            key_list=key_pref + ["machine-generated"],
            help="if the generation code is unavailable, enter N/A",
        )
        make_selectbox(
            label="Was the text validated by a different worker or a data curator?",
            options=["not validated", "validated by crowdworker", "validated by data curator", "other"],
            key_list=key_pref + ["validated"],
            help="this question is about human or human-in-the-loop validation only",
        )
        make_multiselect(
            label="In what kind of organization did the curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_area(
            label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
            key_list=key_pref + ["pre-processed"],
            help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken.",
        )
        make_selectbox(
            label="Were text instances selected or filtered?",
            options=["not filtered", "manually", "algorithmically", "hybrid"],
            key_list=key_pref + ["is-filtered"],
        )
        make_text_area(
            label="What were the selection criteria?",
            key_list=key_pref + ["filtered-criteria"],
            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
        )

    # The remaining sub-sections have no fields yet: each expander only
    # creates its entry in the card and sets the key prefix for later use.
    with st.expander("Structured Annotations", expanded=False):
        st.session_state.card_dict["curation"].setdefault("annotations", {})
        key_pref = ["curation", "annotations"]

    with st.expander("Consent", expanded=False):
        st.session_state.card_dict["curation"].setdefault("consent", {})
        key_pref = ["curation", "consent"]

    with st.expander("Private Identifying Information (PII)", expanded=False):
        st.session_state.card_dict["curation"].setdefault("pii", {})
        key_pref = ["curation", "pii"]

    with st.expander("Maintenance", expanded=False):
        st.session_state.card_dict["curation"].setdefault("maintenance", {})
        key_pref = ["curation", "maintenance"]

    with st.expander("GEM Additional Curation", expanded=False):
        st.session_state.card_dict["curation"].setdefault("gem", {})
        key_pref = ["curation", "gem"]
|
| 142 |
+
|
| 143 |
|
| 144 |
def curation_summary():
    """Display how many curation entries are present, overall and per sub-section.

    Counts the keys stored in each sub-section dict of
    ``st.session_state.card_dict["curation"]`` and renders the counts as a
    markdown bullet list inside a collapsed expander, next to the expected
    ``N_FIELDS*`` totals.  A missing "curation" entry or sub-section counts
    as zero.

    NOTE(review): a key being present is treated as "filled"; whether empty
    answers are stored as keys depends on the ``make_*`` helpers -- confirm.
    """
    curation = st.session_state.card_dict.get("curation", {})
    # (heading shown to the user, key inside the curation dict, expected count)
    sub_sections = (
        ("Original Curation", "original", N_FIELDS_ORIGINAL),
        ("Language Data", "language", N_FIELDS_LANGUAGE),
        ("Structured Annotations", "annotations", N_FIELDS_ANNOTATIONS),
        ("Consent", "consent", N_FIELDS_CONSENT),
        ("PII", "pii", N_FIELDS_PII),
        ("Maintenance", "maintenance", N_FIELDS_MAINTENANCE),
        ("GEM Curation", "gem", N_FIELDS_GEM),
    )
    total_filled = sum(len(dct) for dct in curation.values())
    with st.expander(f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False):
        parts = [f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"]
        parts.extend(
            f"- **Sub-section - {title}:**\n - {len(curation.get(key, {}))} of {expected} fields\n"
            for title, key, expected in sub_sections
        )
        st.markdown("".join(parts))
|
datacards/gem.py
CHANGED
|
@@ -5,10 +5,7 @@ from .streamlit_utils import (
|
|
| 5 |
)
|
| 6 |
|
| 7 |
from .streamlit_utils import (
|
| 8 |
-
make_multiselect,
|
| 9 |
-
make_selectbox,
|
| 10 |
make_text_area,
|
| 11 |
-
make_text_input,
|
| 12 |
make_radio,
|
| 13 |
)
|
| 14 |
|
|
|
|
| 5 |
)
|
| 6 |
|
| 7 |
from .streamlit_utils import (
|
|
|
|
|
|
|
| 8 |
make_text_area,
|
|
|
|
| 9 |
make_radio,
|
| 10 |
)
|
| 11 |
|
datacards/overview.py
CHANGED
|
@@ -12,7 +12,7 @@ from .streamlit_utils import (
|
|
| 12 |
)
|
| 13 |
|
| 14 |
N_FIELDS_WHERE = 9
|
| 15 |
-
N_FIELDS_LANGUAGES =
|
| 16 |
N_FIELDS_CREDIT = 3
|
| 17 |
N_FIELDS_STRUCTURE = 7
|
| 18 |
|
|
@@ -98,6 +98,16 @@ def overview_page():
|
|
| 98 |
],
|
| 99 |
help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
|
| 100 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
make_text_area(
|
| 102 |
label="What is the intended use of the dataset?",
|
| 103 |
key_list=key_pref + ["intended-use"],
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
N_FIELDS_WHERE = 9
|
| 15 |
+
N_FIELDS_LANGUAGES = 8
|
| 16 |
N_FIELDS_CREDIT = 3
|
| 17 |
N_FIELDS_STRUCTURE = 7
|
| 18 |
|
|
|
|
| 98 |
],
|
| 99 |
help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
|
| 100 |
)
|
| 101 |
+
make_text_area(
|
| 102 |
+
label="What dialects are covered? Are there multiple dialects per language?",
|
| 103 |
+
key_list=key_pref + ["language-dialects"],
|
| 104 |
+
help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
|
| 105 |
+
)
|
| 106 |
+
make_text_area(
|
| 107 |
+
label="Whose language is in the dataset?",
|
| 108 |
+
key_list=key_pref + ["language-speakers"],
|
| 109 |
+
help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
|
| 110 |
+
)
|
| 111 |
make_text_area(
|
| 112 |
label="What is the intended use of the dataset?",
|
| 113 |
key_list=key_pref + ["intended-use"],
|