Spaces:

GEM
/

DatasetCardForm

Runtime error

App Files Files Community

Yacine Jernite committed on Nov 15, 2021

Commit

d1a58c9

1 Parent(s): 37b8c09

curation part 1

Browse files

Files changed (4) hide show

app.py +0 -1
datacards/curation.py +146 -3
datacards/gem.py +0 -3
datacards/overview.py +11 -1

app.py CHANGED Viewed

@@ -77,7 +77,6 @@ def main():
 def glance_page():
     with st.expander("Dataset at a Glance", expanded=True):
-        st.markdown(f"### Dataset Name: {st.session_state.save_state.get('dataset_name', '')}")
         dataset_summary = ""
         dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
         dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"

 def glance_page():
     with st.expander("Dataset at a Glance", expanded=True):
         dataset_summary = ""
         dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
         dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"

datacards/curation.py CHANGED Viewed

@@ -4,10 +4,153 @@ from .streamlit_utils import (
     make_text_input
 )
-N_FIELDS = 1
 def curation_page():
-    return None
 def curation_summary():
-    return None

     make_text_input
 )
+from .streamlit_utils import (
+    make_multiselect,
+    make_selectbox,
+    make_text_area,
+    make_text_input,
+    make_radio,
+)
+N_FIELDS_ORIGINAL = 4
+N_FIELDS_LANGUAGE = 12
+N_FIELDS_ANNOTATIONS = 0
+N_FIELDS_CONSENT = 0
+N_FIELDS_PII = 0
+N_FIELDS_MAINTENANCE = 0
+N_FIELDS_GEM = 0
+N_FIELDS = N_FIELDS_ORIGINAL + \
+    N_FIELDS_LANGUAGE + \
+    N_FIELDS_ANNOTATIONS + \
+    N_FIELDS_CONSENT + \
+    N_FIELDS_PII + \
+    N_FIELDS_MAINTENANCE + \
+    N_FIELDS_GEM
+"""
+What were the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
+"""
 def curation_page():
+    st.session_state.card_dict["curation"] = st.session_state.card_dict.get("curation", {})
+    with st.expander("Original Curation", expanded=False):
+        key_pref = ["curation", "original"]
+        st.session_state.card_dict["curation"]["original"] = st.session_state.card_dict["curation"].get("original", {})
+        make_text_area(
+            label="Original curation rationale",
+            key_list=key_pref + ["rationale"],
+            help="Describe the curation rationale behind the original dataset(s)."
+        )
+        make_text_area(
+            label="What was the communicative goal?",
+            key_list=key_pref + ["communicative"],
+            help="Describe the communicative goal that the original dataset(s) was trying to represent."
+        )
+        make_radio(
+            label="Is the dataset aggregated from different data sources?",
+            options=["no", "yes"],
+            key_list=key_pref + ["is-aggregated"],
+            help="e.g. Wikipedia, movie dialogues, etc.",
+        )
+        make_text_area(
+            label="If yes, list the sources",
+            key_list=key_pref + ["aggregated-sources"],
+            help="Otherwise, type N/A"
+        )
+    with st.expander("Language Data", expanded=False):
+        key_pref = ["curation", "language"]
+        st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict["curation"].get("language", {})
+        make_multiselect(
+            label="How was the language data obtained?",
+            options=["found", "created for the dataset", "crowdsourced", "machine-generated", "other"],
+            key_list=key_pref+["obtained"],
+        )
+        make_multiselect(
+            label="If found, where from?",
+            options=["website", "offline media collection", "other", "N/A"],
+            key_list=key_pref+["found"],
+            help="select N/A if none of the language data was found"
+        )
+        make_multiselect(
+            label="If crowdsourced, where from?",
+            options=["Amazon Mechanical Turk", "other crowdworker platform", "participatory experiment", "other", "N/A"],
+            key_list=key_pref+["crowdsourced"],
+            help="select N/A if none of the language data was crowdsourced"
+        )
+        make_text_area(
+            label="If created for the dataset, describe the creation process.",
+            key_list=key_pref+["created"],
+        )
+        make_text_area(
+            label="What further information do we have on the language producers?",
+            key_list=key_pref+["producers-description"],
+            help="Provide a description of the context in which the language was produced and who produced it.",
+        )
+        make_text_input(
+            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
+            key_list=key_pref+["machine-generated"],
+            help="if the generation code is unavailable, enter N/A",
+        )
+        make_selectbox(
+            label="Was the text validated by a different worker or a data curator?",
+            options=["not validated", "validated by crowdworker", "validated by data curator", "other"],
+            key_list=key_pref+["validated"],
+            help="this question is about human or human-in-the-loop validation only"
+        )
+        make_multiselect(
+            label="In what kind of organization did the curation happen?",
+            options= ["industry",  "academic", "independent", "other"],
+            key_list=key_pref+["organization-type"],
+        )
+        make_text_input(
+            label="Name the organization(s).",
+            key_list=key_pref+["organization-names"],
+            help="comma-separated",
+        )
+        make_text_area(
+            label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
+            key_list=key_pref+["pre-processed"],
+            help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken."
+        )
+        make_selectbox(
+            label="Were text instances selected or filtered?",
+            options=["not filtered", "manually", "algorithmically", "hybrid"],
+            key_list=key_pref+["is-filtered"],
+        )
+        make_text_area(
+            label="What were the selection criteria?",
+            key_list=key_pref+["filtered-criteria"],
+            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A."
+        )
+    with st.expander("Structured Annotations", expanded=False):
+        key_pref = ["curation", "annotations"]
+        st.session_state.card_dict["curation"]["annotations"] = st.session_state.card_dict["curation"].get("annotations", {})
+    with st.expander("Consent", expanded=False):
+        key_pref = ["curation", "consent"]
+        st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict["curation"].get("consent", {})
+    with st.expander("Private Identifying Information (PII)", expanded=False):
+        key_pref = ["curation", "pii"]
+        st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict["curation"].get("pii", {})
+    with st.expander("Maintenance", expanded=False):
+        key_pref = ["curation", "maintenance"]
+        st.session_state.card_dict["curation"]["maintenance"] = st.session_state.card_dict["curation"].get("maintenance", {})
+    with st.expander("GEM Additional Curation", expanded=False):
+        key_pref = ["curation", "gem"]
+        st.session_state.card_dict["curation"]["gem"] = st.session_state.card_dict["curation"].get("gem", {})
 def curation_summary():
+    total_filled = sum([len(dct) for dct in st.session_state.card_dict.get('curation', {}).values()])
+    with st.expander(f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False):
+        completion_markdown = ""
+        completion_markdown += f"- **Overall completion:**\n  - {total_filled} of {N_FIELDS} fields\n"
+        completion_markdown += f"- **Sub-section - Original Curation:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
+        completion_markdown += f"- **Sub-section - Language Data:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
+        completion_markdown += f"- **Sub-section - Structured Annotations:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('annotations', {}))} of {N_FIELDS_ANNOTATIONS} fields\n"
+        completion_markdown += f"- **Sub-section - Consent:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
+        completion_markdown += f"- **Sub-section - PII:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
+        completion_markdown += f"- **Sub-section - Maintenance:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
+        completion_markdown += f"- **Sub-section - GEM Curation:**\n  - {len(st.session_state.card_dict.get('curation', {}).get('gem', {}))} of {N_FIELDS_GEM} fields\n"
+        st.markdown(completion_markdown)

datacards/gem.py CHANGED Viewed

@@ -5,10 +5,7 @@ from .streamlit_utils import (
 )
 from .streamlit_utils import (
-    make_multiselect,
-    make_selectbox,
     make_text_area,
-    make_text_input,
     make_radio,
 )

 )
 from .streamlit_utils import (
     make_text_area,
     make_radio,
 )

datacards/overview.py CHANGED Viewed

@@ -12,7 +12,7 @@ from .streamlit_utils import (
 )
 N_FIELDS_WHERE = 9
-N_FIELDS_LANGUAGES = 6
 N_FIELDS_CREDIT = 3
 N_FIELDS_STRUCTURE = 7
@@ -98,6 +98,16 @@ def overview_page():
             ],
             help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
         )
         make_text_area(
             label="What is the intended use of the dataset?",
             key_list=key_pref + ["intended-use"],

 )
 N_FIELDS_WHERE = 9
+N_FIELDS_LANGUAGES = 8
 N_FIELDS_CREDIT = 3
 N_FIELDS_STRUCTURE = 7
             ],
             help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
         )
+        make_text_area(
+            label="What dialects are covered? Are there multiple dialects per language?",
+            key_list=key_pref + ["language-dialects"],
+            help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
+        )
+        make_text_area(
+            label="Whose language is in the dataset?",
+            key_list=key_pref + ["language-speakers"],
+            help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
+        )
         make_text_area(
             label="What is the intended use of the dataset?",
             key_list=key_pref + ["intended-use"],