| | import transformers |
| | import pandas as pd |
| | import streamlit as st |
| | from preprocess import preprocess_data |
| |
|
def anonymize_text(text):
    """Return the five most likely replacement tokens for the first mask in *text*.

    The text is tokenized with the ``distilbert-base-uncased`` tokenizer and
    passed through the matching masked-language model. The logits at the first
    mask-token position are ranked and the five highest-scoring token ids are
    decoded back to strings.

    NOTE(review): despite its name, this function does not return an
    anonymized copy of *text*; it only returns candidate fillers for a mask
    token that must already be present in *text*.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.

    Returns:
        A list of five decoded token strings, best candidate first, or an
        empty list when *text* contains no mask token.
    """
    # torch is used below but is not among the imports at the top of the file,
    # so it is imported here to keep this function self-contained.
    import torch

    # NOTE: the tokenizer and model are loaded on every call; callers that
    # invoke this repeatedly may want to cache them.
    model_name = "distilbert-base-uncased"
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Second element of the where() tuple: positions along the sequence axis.
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        # Nothing to predict; indexing row 0 of the top-k result below would
        # otherwise raise IndexError.
        return []

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Row 0 corresponds to the first mask position; any further masks are
    # ignored, as in the original implementation.
    top_token_ids = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

    return [tokenizer.decode([token_id]) for token_id in top_token_ids]
| |
|
| | def run_app(): |
| | st.title("Text Anonymization App") |
| |
|
| | |
| | st.subheader("Upload your data") |
| | file = st.file_uploader("Upload CSV", type=["csv"]) |
| |
|
| | if file is not None: |
| | |
| | data = pd.read_csv(file) |
| |
|
| | |
| | preprocessed_data = preprocess_data(data) |
| |
|
| | |
| | st.subheader("Select columns to anonymize") |
| | selected_columns = [] |
| | for col in preprocessed_data.columns: |
| | if st.checkbox(col): |
| | selected_columns.append(col) |
| |
|
| | |
| |
|