""" Example inference script for the fine-tuned Reading Steiner model. Replace with your actual LoRA adapter path after training. """ from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel import torch # Load base model base_model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen3.5-2B", torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, ) # Load LoRA adapter (replace with your actual adapter path) # model = PeftModel.from_pretrained(base_model, "OmAlve/reading-steiner-qwen3.5-2b") # Optionally merge adapter into base model for faster inference: # model = model.merge_and_unload() model = base_model # Replace with above after training # Tokenizer tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3.5-2B", trust_remote_code=True) # Example input messages = [ {"role": "system", "content": ( "You are Reading Steiner, a web content extraction model. " "Given a webpage split into indexed blocks, identify which blocks contain the main content. " "Output indices as a Python list of [start, end] intervals." )}, {"role": "user", "content": ( "URL: https://example.com\n" "Title: Example Page\n" "Blocks:\n" '[1] \n' '[2] \n' '[3]

This is the main article content.

\n' '[4]

More content here.

\n' '[5] ' )}, ] inputs = tokenizer.apply_chat_template( messages, tokenize=True, return_tensors="pt", add_generation_prompt=True, ) inputs = inputs.to(model.device) # Generate with torch.no_grad(): outputs = model.generate( inputs, max_new_tokens=128, temperature=0.1, do_sample=False, pad_token_id=tokenizer.pad_token_id, ) result = tokenizer.decode(outputs[0], skip_special_tokens=True) print("=== Generated Output ===") print(result)
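
# --- Post-processing: a minimal sketch, not part of the trained pipeline ---
# The system prompt asks the model for a Python list of [start, end]
# intervals, e.g. "[[3, 4]]". Assuming the model emits such a list literal
# somewhere in its output, it can be recovered with a regex plus
# ast.literal_eval. The parse_intervals helper below is a hypothetical
# illustration, not part of the original script.
import ast
import re


def parse_intervals(text: str) -> list[list[int]]:
    """Extract the first nested-list literal from model output and parse it."""
    match = re.search(r"\[\s*\[.*?\]\s*\]", text, re.DOTALL)
    if match is None:
        return []
    try:
        parsed = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError):
        return []
    return [list(pair) for pair in parsed]


intervals = parse_intervals(result)
print("Parsed intervals:", intervals)  # e.g. [[3, 4]] for the example input above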