# Hugging Face Hub id of the training dataset (ad/brand impersonation pairs).
dataset_id = "axrml/vlm2_ads_mpo" # training dataset.

# Role prompt: frames the model as a cybersecurity specialist comparing a
# Facebook ad against official evidence scraped from the brand's website.
role_prompt = """ As a cybersecurity specialist, your task is to analyze Facebook Ads to determine if the ads are impersonating a legitimate brand by comparing provided evidence. Check if the ads are using the same visual identity, logos, and products as the original site.\nUse only the data below: """

# Task instructions: the comparison criteria, the three impersonation levels
# (low / medium / high) with their decision rules, and the exact JSON schema
# the model must return (no extra text allowed around the JSON).
instructions = """ Your analysis must: • Compare the ad profile name, profile image, profile cover image (if present), ad image, and ad text with the original data from the brand website (HTML, Screenshot and logos (if provided)) • Focus on visual cues—logos (when available), layout, and content—without using any external knowledge. • Decide on an impersonation level: – “high” if the suspicious ad mimics the official branding (even if slightly different in style) using similar logos, visual identity, and/or offering similar products and services, – “medium” if the similarities are moderate, – “low” if the suspicious site shows different or unclear branding. • If any input is missing or not informative, treat it as low evidence. • Base your final decision solely on the provided evidence. • Always think, as a user, would you believe that this is an ad from the company (even remotely)? • Use all the information to provide a robust answer. • Pay more attention to ad image. • If the brand logo is present the minimum value is medium.
Return your answer as valid JSON exactly in this structure (with no extra text):
{"brand": "", "level": "low/medium/high", "explanation": "Brief explanation based solely on comparing the provided data."} """
Example of the `messages` structure produced for each training sample:
# One training sample: the rebuilt user turn followed by the assistant answer.
messages = [
    {
        "role": "user",
        "content": user_content,  # the user_content list rebuilt above
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": assistant_content_text}],
    },
]
Full preprocessing loop in Python:
# Rebuild each raw dataset row into a chat-template-ready sample and collect
# the results in dataset_list.
for row in dataset_to_process:
    # 1. Rebuild the user content ('prompt').
    user_content = []
    # The user content lives in row['prompt'][0]['content'].
    prompt_items = row['prompt'][0]['content']
    # row['images'] is a list of PIL Image objects, in order of appearance.
    image_list = row['images'] if row['images'] is not None else []
    image_index_counter = 0
    for item in prompt_items:
        if item['type'] == 'text':
            # Text passes through unchanged.
            user_content.append({"type": "text", "text": item['text']})
        elif item['type'] == 'image':
            # Pair this placeholder with the next image from 'images'.
            if image_index_counter < len(image_list):
                user_content.append({"type": "image", "image": image_list[image_index_counter]})
                image_index_counter += 1
            else:
                # More 'image' placeholders in the prompt than entries in 'images'.
                print(f"Aviso: placeholder de imagem encontrado sem imagem correspondente no índice {image_index_counter}.")
                # Keep the slot visible in the text so the sample stays aligned.
                user_content.append({"type": "text", "text": "IMAGE_MISSING_PLACEHOLDER"})

    # 2. Rebuild the assistant content ('chosen').
    # The assistant text is nested at row['chosen'][0]['content'][0]['text'].
    # BUG FIX: raw_text used to be assigned inside the try block, so an
    # IndexError/TypeError raised while extracting it left the name unbound
    # and the fallback below crashed with NameError. Initialize both first.
    assistant_content_text = ""
    raw_text = ""
    try:
        raw_text = row['chosen'][0]['content'][0]['text']
        # The text is a string rendering of a dict (single quotes), so use
        # ast.literal_eval to parse it safely into a Python dict…
        dict_representation = ast.literal_eval(raw_text)
        # …then re-serialize it as valid JSON (double quotes).
        assistant_content_text = json.dumps(dict_representation)
    except (ValueError, SyntaxError, IndexError, TypeError) as e:
        print(f"Erro ao analisar o conteúdo do assistente na linha {row.name if hasattr(row, 'name') else 'desconhecida'}: {e}")
        # Fall back to the raw string (empty if extraction itself failed).
        assistant_content_text = raw_text

    # 3. Assemble the 'messages' structure.
    messages = [
        {
            "role": "user",
            "content": user_content  # the user_content list rebuilt above
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": assistant_content_text}
            ]
        }
    ]
    dataset_list.append({
        "messages": messages
    })
def create_custom_collator_v2(processor, debug: bool = False):
    """
    Build a data collator for vision-language chat fine-tuning.

    Each example is ``{"messages": [user_msg, assistant_msg]}``. The collator
    tokenizes the full conversation with ``processor.apply_chat_template``,
    right-pads the batch to the longest sequence, and builds ``labels`` in
    which padding positions and the user (prompt) tokens are masked with -100
    so the loss is computed only on the assistant reply.

    Args:
        processor: multimodal processor exposing ``apply_chat_template`` and
            a ``tokenizer`` with ``pad_token_id``.
        debug: when True, print per-batch shape and masking statistics.

    Returns:
        ``collate_fn(examples) -> dict`` suitable as a DataLoader collator.
    """
    def collate_fn(examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        processed_examples = []
        user_lengths = []
        for example in examples:
            # Tokenize the full conversation (user + assistant).
            inputs = processor.apply_chat_template(
                example["messages"],
                tokenize=True,
                add_generation_prompt=False,
                return_dict=True,
                return_tensors="pt"
            )
            processed_examples.append(inputs)
            # Tokenize the user turn alone, with the assistant-start tokens
            # appended, to learn how many leading tokens belong to the prompt.
            user_only = processor.apply_chat_template(
                [example["messages"][0]],  # just the user message
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt"
            )
            user_lengths.append(user_only["input_ids"].shape[1])

        # Right-pad every sequence to the longest one in the batch.
        max_length = max(ex["input_ids"].shape[1] for ex in processed_examples)
        batch_size = len(examples)
        pad_id = processor.tokenizer.pad_token_id
        input_ids = torch.full((batch_size, max_length), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((batch_size, max_length), dtype=torch.long)
        for i, ex in enumerate(processed_examples):
            seq_len = ex["input_ids"].shape[1]
            input_ids[i, :seq_len] = ex["input_ids"][0]
            attention_mask[i, :seq_len] = ex["attention_mask"][0]

        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }
        # Image features need no padding; concatenate along the patch axis.
        # Guarded so a text-only batch (processor emitted no pixels) does not
        # crash with KeyError.
        if all("pixel_values" in ex for ex in processed_examples):
            batch["pixel_values"] = torch.cat([ex["pixel_values"] for ex in processed_examples], dim=0)
            batch["image_grid_thw"] = torch.cat([ex["image_grid_thw"] for ex in processed_examples], dim=0)

        labels = batch["input_ids"].clone()
        # BUG FIX: mask padding by *position* (attention_mask == 0) rather
        # than by value (== pad_token_id). Value matching also masked genuine
        # content tokens that share the pad id — e.g. the final EOS when the
        # tokenizer uses pad_token == eos_token — silently removing them from
        # the training signal.
        labels[attention_mask == 0] = -100
        # Mask the user/prompt tokens so only the assistant reply is trained.
        for i, user_len in enumerate(user_lengths):
            labels[i, :user_len] = -100
        batch["labels"] = labels

        if debug:
            print("\n" + "="*80)
            print("COLLATOR DEBUG")
            print("="*80)
            print(f"Batch size: {batch_size}")
            print(f"Max sequence length: {max_length}")
            print(f"Input IDs shape: {batch['input_ids'].shape}")
            if "pixel_values" in batch:
                print(f"Pixel values shape: {batch['pixel_values'].shape}")
            for i in range(min(2, batch_size)):
                total = (attention_mask[i] == 1).sum().item()
                masked = (labels[i] == -100).sum().item()
                training = total - masked
                print(f"\nExample {i}:")
                print(f"  User message length: {user_lengths[i]}")
                print(f"  Total non-padding tokens: {total}")
                print(f"  Masked tokens: {masked}")
                print(f"  Training tokens: {training} ({training/total*100:.1f}%)")
        return batch
    return collate_fn
---
Verification (quick smoke test of the collator on the first sample):
# Smoke-test the collator on a single-sample batch.
collator = create_custom_collator_v2(processor, debug=False)
batch = collator([dataset_list[0]])
# Inspect which tensors the batch contains.
list(batch.keys())
# Keep only the unmasked (trained) label positions of the first sample.
labels = batch['labels'][0]
valid_labels = labels[labels != -100]
# Decode them: this should print only the assistant's JSON answer.
decoded = processor.tokenizer.decode(valid_labels, skip_special_tokens=False)
print(decoded)
Downloads last month: 2