mdok detector of machine-generated texts for PAN2025 (available for non-commercial research purposes only - by requesting access to this model you are agreeing to this condition). More info, as well as the training code, is available in the [repo](https://github.com/kinit-sk/mdok). ## Usage The model is a fine-tuned Qwen3-14B-Base; therefore, use the latest transformers library supporting it. The following assumes that the texts for evaluation are loaded in a Pandas dataframe 'test_df', in the column 'text'. First, anonymize the texts (not necessary, but the model has been trained in this way). Then, run the inference. ``` def preprocess(text): EMAIL_PATTERN = re.compile(r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b") # e.g., name@example.com USER_MENTION_PATTERN = re.compile(r"@[A-Za-z0-9_-]+") # e.g., @my_username PHONE_PATTERN = re.compile(r"(\+?\d{1,3})?[\s\*\.-]?\(?\d{1,4}\)?[\s\*\.-]?\d{2,4}[\s\*\.-]?\d{2,6}") #modified from https://stackabuse.com/python-regular-expressions-validate-phone-numbers/ text = re.sub(EMAIL_PATTERN, "[EMAIL]", text) text = re.sub(USER_MENTION_PATTERN, "[USER]", text) text = re.sub(PHONE_PATTERN, " [PHONE]", text).replace(' [PHONE]', ' [PHONE]') return text.lower().strip() def preprocess_function(examples, **fn_kwargs): return fn_kwargs['tokenizer'](examples["text"], truncation=True, max_length=512) f1_metric = evaluate.load("f1") def compute_metrics(eval_pred): predictions, labels = eval_pred probs = predictions[:,1] predictions = np.argmax(predictions, axis=1) results = {"AUC": roc_auc_score(labels, probs), "ACC": accuracy_score(labels, predictions), "MacroF1": f1_score(labels, predictions, average='macro'), "MAE": mean_absolute_error(labels, predictions)} return results def test(test_df, model_path, id2label, label2id): tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) model = AutoModelForSequenceClassification.from_pretrained( model_path, trust_remote_code=True, num_labels=len(label2id), id2label=id2label, 
label2id=label2id, torch_dtype=torch.float16 ) if tokenizer.pad_token is None: if tokenizer.eos_token is not None: tokenizer.pad_token = tokenizer.eos_token else: tokenizer.add_special_tokens({'pad_token': '[PAD]'}) model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32) try: model.config.pad_token_id = tokenizer.get_vocab()[tokenizer.pad_token] except: print("Warning: Exception occured while setting pad_token_id") test_dataset = Dataset.from_pandas(test_df) tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer}) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) trainer = Trainer( model=model, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) predictions = trainer.predict(tokenized_test_dataset) prob_pred = softmax(predictions.predictions, axis=-1) return prob_pred test_df['text'] = [preprocess(x) for x in test_df['text']] probs = test(test_df, "DominikMacko/mdok", {0: "human", 1: "machine"}, {"human": 0, "machine": 1}) ``` Now 'probs[:,1]' contains the probabilities of the texts belonging to the machine class. Either calibrate the classification threshold on your data, or use the default threshold, where a probability >0.5 represents the "machine" label. 
If GPU memory is limited, load the model in 4-bit: ``` bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) model = AutoModelForSequenceClassification.from_pretrained(model_path, trust_remote_code=True, num_labels=len(label2id), id2label=id2label, label2id=label2id, torch_dtype=torch.float16, quantization_config=bnb_config) ``` ## Cite If you use the model, code, or any information from this repository, please cite the paper(s): ``` @misc{macko2025mdokkinitrobustlyfinetuned, title={mdok of {KInIT}: Robustly Fine-tuned {LLM} for Binary and Multiclass {AI}-Generated Text Detection}, author={Dominik Macko}, year={2025}, eprint={2506.01702}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2506.01702}, } @misc{macko2025increasingrobustnessfinetunedmultilingual, title={Increasing the Robustness of the Fine-tuned Multilingual Machine-Generated Text Detectors}, author={Dominik Macko and Robert Moro and Ivan Srba}, year={2025}, eprint={2503.15128}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2503.15128}, } ```