# Hugging Face Hub id of the training dataset (ad/brand impersonation pairs).
dataset_id = "axrml/vlm2_ads_mpo" # training dataset.

# Role prompt: frames the model as a cybersecurity specialist comparing a
# Facebook ad against official evidence scraped from the brand's website.
role_prompt = """ As a cybersecurity specialist, your task is to analyze Facebook Ads to determine if the ads are impersonating a legitimate brand by comparing provided evidence. Check if the ads are using the same visual identity, logos, and products as the original site.\nUse only the data below: """

# Task instructions: the comparison criteria, the three impersonation levels
# (low / medium / high) with their decision rules, and the exact JSON schema
# the model must return (no extra text allowed around the JSON).
instructions = """ Your analysis must: • Compare the ad profile name, profile image, profile cover image (if present), ad image, and ad text with the original data from the brand website (HTML, Screenshot and logos (if provided)) • Focus on visual cues—logos (when available), layout, and content—without using any external knowledge. • Decide on an impersonation level: – “high” if the suspicious ad mimics the official branding (even if slightly different in style) using similar logos, visual identity, and/or offering similar products and services, – “medium” if the similarities are moderate, – “low” if the suspicious site shows different or unclear branding. • If any input is missing or not informative, treat it as low evidence. • Base your final decision solely on the provided evidence. • Always think, as a user, would you believe that this is an ad from the company (even remotely)? • Use all the information to provide a robust answer. • Pay more attention to ad image. • If the brand logo is present the minimum value is medium.
Return your answer as valid JSON exactly in this structure (with no extra text):
{"brand": "", "level": "low/medium/high", "explanation": "Brief explanation based solely on comparing the provided data."} """
Example of the `messages` structure produced for each training sample:
# One training sample: the rebuilt user turn followed by the assistant answer.
messages = [
    {
        "role": "user",
        "content": user_content,  # the user_content list rebuilt above
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": assistant_content_text}],
    },
]
Full preprocessing loop in Python:
# Rebuild each raw dataset row into a chat-template-ready sample and collect
# the results in dataset_list.
for row in dataset_to_process:
    # 1. Rebuild the user content ('prompt').
    user_content = []
    # The user content lives in row['prompt'][0]['content'].
    prompt_items = row['prompt'][0]['content']
    # row['images'] is a list of PIL Image objects, in order of appearance.
    image_list = row['images'] if row['images'] is not None else []
    image_index_counter = 0
    for item in prompt_items:
        if item['type'] == 'text':
            # Text passes through unchanged.
            user_content.append({"type": "text", "text": item['text']})
        elif item['type'] == 'image':
            # Pair this placeholder with the next image from 'images'.
            if image_index_counter < len(image_list):
                user_content.append({"type": "image", "image": image_list[image_index_counter]})
                image_index_counter += 1
            else:
                # More 'image' placeholders in the prompt than entries in 'images'.
                print(f"Aviso: placeholder de imagem encontrado sem imagem correspondente no índice {image_index_counter}.")
                # Keep the slot visible in the text so the sample stays aligned.
                user_content.append({"type": "text", "text": "IMAGE_MISSING_PLACEHOLDER"})

    # 2. Rebuild the assistant content ('chosen').
    # The assistant text is nested at row['chosen'][0]['content'][0]['text'].
    # BUG FIX: raw_text used to be assigned inside the try block, so an
    # IndexError/TypeError raised while extracting it left the name unbound
    # and the fallback below crashed with NameError. Initialize both first.
    assistant_content_text = ""
    raw_text = ""
    try:
        raw_text = row['chosen'][0]['content'][0]['text']
        # The text is a string rendering of a dict (single quotes), so use
        # ast.literal_eval to parse it safely into a Python dict…
        dict_representation = ast.literal_eval(raw_text)
        # …then re-serialize it as valid JSON (double quotes).
        assistant_content_text = json.dumps(dict_representation)
    except (ValueError, SyntaxError, IndexError, TypeError) as e:
        print(f"Erro ao analisar o conteúdo do assistente na linha {row.name if hasattr(row, 'name') else 'desconhecida'}: {e}")
        # Fall back to the raw string (empty if extraction itself failed).
        assistant_content_text = raw_text

    # 3. Assemble the 'messages' structure.
    messages = [
        {
            "role": "user",
            "content": user_content  # the user_content list rebuilt above
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": assistant_content_text}
            ]
        }
    ]
    dataset_list.append({
        "messages": messages
    })
def create_custom_collator_v2(processor, debug: bool = False):
    """
    Build a data collator for vision-language chat fine-tuning.

    Each example is ``{"messages": [user_msg, assistant_msg]}``. The collator
    tokenizes the full conversation with ``processor.apply_chat_template``,
    right-pads the batch to the longest sequence, and builds ``labels`` in
    which padding positions and the user (prompt) tokens are masked with -100
    so the loss is computed only on the assistant reply.

    Args:
        processor: multimodal processor exposing ``apply_chat_template`` and
            a ``tokenizer`` with ``pad_token_id``.
        debug: when True, print per-batch shape and masking statistics.

    Returns:
        ``collate_fn(examples) -> dict`` suitable as a DataLoader collator.
    """
    def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        processed_examples = []
        user_lengths = []
        for example in examples:
            # Tokenize the full conversation (user + assistant).
            inputs = processor.apply_chat_template(
                example["messages"],
                tokenize=True,
                add_generation_prompt=False,
                return_dict=True,
                return_tensors="pt"
            )
            processed_examples.append(inputs)
            # Tokenize the user turn alone, with the assistant-start tokens
            # appended, to learn how many leading tokens belong to the prompt.
            user_only = processor.apply_chat_template(
                [example["messages"][0]],  # just the user message
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt"
            )
            user_lengths.append(user_only["input_ids"].shape[1])

        # Right-pad every sequence to the longest one in the batch.
        max_length = max(ex["input_ids"].shape[1] for ex in processed_examples)
        batch_size = len(examples)
        pad_id = processor.tokenizer.pad_token_id
        input_ids = torch.full((batch_size, max_length), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((batch_size, max_length), dtype=torch.long)
        for i, ex in enumerate(processed_examples):
            seq_len = ex["input_ids"].shape[1]
            input_ids[i, :seq_len] = ex["input_ids"][0]
            attention_mask[i, :seq_len] = ex["attention_mask"][0]

        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        # Image features need no padding; concatenate along the patch axis.
        # Guarded so a text-only batch (processor emitted no pixels) does not
        # crash with KeyError.
        if all("pixel_values" in ex for ex in processed_examples):
            batch["pixel_values"] = torch.cat([ex["pixel_values"] for ex in processed_examples], dim=0)
            batch["image_grid_thw"] = torch.cat([ex["image_grid_thw"] for ex in processed_examples], dim=0)

        labels = batch["input_ids"].clone()
        # BUG FIX: mask padding by *position* (attention_mask == 0) rather
        # than by value (== pad_token_id). Value matching also masked genuine
        # content tokens that share the pad id — e.g. the final EOS when the
        # tokenizer uses pad_token == eos_token — silently removing them from
        # the training signal.
        labels[attention_mask == 0] = -100
        # Mask the user/prompt tokens so only the assistant reply is trained.
        for i, user_len in enumerate(user_lengths):
            labels[i, :user_len] = -100
        batch["labels"] = labels

        if debug:
            print("\n" + "="*80)
            print("COLLATOR DEBUG")
            print("="*80)
            print(f"Batch size: {batch_size}")
            print(f"Max sequence length: {max_length}")
            print(f"Input IDs shape: {batch['input_ids'].shape}")
            if "pixel_values" in batch:
                print(f"Pixel values shape: {batch['pixel_values'].shape}")
            for i in range(min(2, batch_size)):
                total = (attention_mask[i] == 1).sum().item()
                masked = (labels[i] == -100).sum().item()
                training = total - masked
                print(f"\nExample {i}:")
                print(f"  User message length: {user_lengths[i]}")
                print(f"  Total non-padding tokens: {total}")
                print(f"  Masked tokens: {masked}")
                print(f"  Training tokens: {training} ({training/total*100:.1f}%)")
        return batch
    return collate_fn
---
Verification (quick smoke test of the collator on the first sample):
# Smoke-test the collator on a single-sample batch.
collator = create_custom_collator_v2(processor, debug=False)
batch = collator([dataset_list[0]])
# Inspect which tensors the batch contains.
list(batch.keys())
# Keep only the unmasked (trained) label positions of the first sample.
labels = batch['labels'][0]
valid_labels = labels[labels != -100]
# Decode them: this should print only the assistant's JSON answer.
decoded = processor.tokenizer.decode(valid_labels, skip_special_tokens=False)
print(decoded)
Downloads last month: 2