"""Classify short texts as privacy violations using a pretrained model.

Loads the PL-RnD/privacy-moderation-large-4bit sequence-classification
checkpoint, runs a small batch of example texts through it, and prints a
DataFrame pairing each text with its predicted label.
"""

from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd


model_name = "PL-RnD/privacy-moderation-large-4bit"

# Prefer GPU when available; fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
# Inference mode: disables dropout and other train-time behavior.
model.eval()

texts = [
    "Here is my credit card number: 1234-5678-9012-3456",
    "This is a regular message without sensitive information.",
    "For homeowners insurance, select deductibles from $500 to $2,500. Higher deductibles lower premiums.",
    "Solidarity: My enrollment includes my kid's braces at $4,000 total—family strained. Push for orthodontic expansions. Email blast to reps starting now.",
]

# Tokenize the whole batch at once; pad to the longest example and
# truncate anything beyond the model's maximum sequence length.
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# No gradients needed for inference — saves memory and time.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Index 0 = non-violation, index 1 = violation.
# NOTE(review): assumes this matches the checkpoint's label order — confirm
# against model.config.id2label.
labels = ["non-violation", "violation"]

predicted_labels = [labels[pred] for pred in predictions.cpu().tolist()]

df = pd.DataFrame({"text": texts, "label": predicted_labels})
print(df)