| import transformers |
| import pandas as pd |
| import streamlit as st |
| from preprocess import preprocess_data |
|
|
# Loaded (tokenizer, model) pairs keyed by model name, so repeated calls do not
# re-read the weights from disk or the network.
_MASKED_LM_CACHE = {}


def _load_masked_lm(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once."""
    if model_name not in _MASKED_LM_CACHE:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        model = transformers.AutoModelForMaskedLM.from_pretrained(model_name)
        _MASKED_LM_CACHE[model_name] = (tokenizer, model)
    return _MASKED_LM_CACHE[model_name]


def anonymize_text(text):
    """Return the top-5 masked-LM predictions for the first mask token in *text*.

    The text is encoded with the ``distilbert-base-uncased`` tokenizer and run
    through the matching masked-language model. Only the first occurrence of
    the tokenizer's mask token is considered; any further mask tokens are
    ignored.

    Note that, despite its name, this function does not rewrite *text*: it
    only returns candidate replacement tokens.

    Args:
        text: Input string, expected to contain the tokenizer's mask token.

    Returns:
        A list of up to five decoded token strings, highest score first, or an
        empty list when *text* contains no mask token.
    """
    # Local import: the file's import block does not bring ``torch`` into scope.
    import torch

    tokenizer, model = _load_masked_lm("distilbert-base-uncased")

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # ``torch.where`` on the (1, seq_len) comparison yields (row, column)
    # index tensors; the column tensor holds the mask positions.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # Nothing to predict; indexing the empty selection would raise.
        return []

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        token_logits = model(input_ids)[0]

    first_mask_logits = token_logits[0, mask_positions[0], :]
    top_token_ids = torch.topk(first_mask_logits, 5).indices.tolist()

    return [tokenizer.decode([token_id]) for token_id in top_token_ids]
|
|
def run_app():
    """Render the Streamlit page: CSV upload followed by column selection.

    Shows the app title and a CSV uploader. Once a file is provided it is read
    into a DataFrame, passed through ``preprocess_data``, and one checkbox is
    rendered per resulting column.
    """
    st.title("Text Anonymization App")

    st.subheader("Upload your data")
    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if uploaded is None:
        # Nothing uploaded yet; the rest of the page depends on the data.
        return

    frame = pd.read_csv(uploaded)
    prepared = preprocess_data(frame)

    st.subheader("Select columns to anonymize")
    # One checkbox per column, rendered in column order; ticked ones are kept.
    # NOTE(review): the selection is collected but not used further here.
    selected_columns = [name for name in prepared.columns if st.checkbox(name)]
|
|
| |
|
|