Spaces:

VaxiusK
/

axiusai

Runtime error

App Files Files Community

axiusai / app.py

VaxiusK

ai.py

2a2f3b3 verified 12 days ago

raw

history blame contribute delete

10.2 kB

	# =====================================================
	# TINYLLAMA 1.1B + 12 ДАТАСЕТОВ \| 75 000 ШАГОВ
	# С ЛОГАМИ В TELEGRAM И АВТО-ОТПРАВКОЙ ССЫЛКИ
	# =====================================================

	# 1. УСТАНОВКА
	!pip install -q unsloth datasets transformers trl accelerate requests

	import json
	import torch
	import random
	import requests
	import time
	import os
	import shutil
	import datetime
	from datasets import Dataset, concatenate_datasets, load_dataset
	from unsloth import FastLanguageModel
	from trl import SFTTrainer, SFTConfig
	from google.colab import drive

	# =====================================================
	# НАСТРОЙКИ TELEGRAM
	# =====================================================
	BOT_TOKEN = "8552885780:AAGDEjeEHhW03VYodRRmU_0fHPrx8b9GqH4"
	CHAT_ID = "8552885780" # Твой Telegram ID (можно оставить как есть, бот сам определит)

	def send_tg_message(text):
	"""Отправляет сообщение в Telegram"""
	try:
	url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
	payload = {"chat_id": CHAT_ID, "text": text, "parse_mode": "HTML"}
	response = requests.post(url, json=payload, timeout=10)
	return response.ok
	except Exception as e:
	print(f"Ошибка отправки сообщения: {e}")
	return False

	def send_tg_file(file_path, caption=""):
	"""Отправляет файл в Telegram (если он меньше 50 МБ)"""
	try:
	file_size = os.path.getsize(file_path) / (1024 * 1024)
	if file_size > 45: # Оставляем запас 5 МБ
	send_tg_message(f"⚠️ Файл {os.path.basename(file_path)} слишком большой ({file_size:.1f} МБ). Отправляю только ссылку.")
	return False
	url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendDocument"
	with open(file_path, "rb") as f:
	files = {"document": f}
	data = {"chat_id": CHAT_ID, "caption": caption}
	response = requests.post(url, files=files, data=data, timeout=60)
	return response.ok
	except Exception as e:
	print(f"Ошибка отправки файла: {e}")
	return False

	send_tg_message("🚀 <b>Запущено обучение TinyLlama 1.1B</b>\n📚 12 датасетов\n🎯 75 000 шагов\n⏱️ Ожидаемое время: 2-3 часа")
	print("✅ Telegram бот подключён")

	# 2. МОНТИРУЕМ DRIVE
	drive.mount('/content/drive')
	send_tg_message("✅ Google Drive смонтирован")
	print("✅ Google Drive смонтирован")

	# 3. ЗАГРУЗКА МОДЕЛИ
	send_tg_message("🧠 Загружаю модель TinyLlama 1.1B...")
	print("\n🧠 Загружаем TinyLlama 1.1B...")
	model, tokenizer = FastLanguageModel.from_pretrained(
	"unsloth/tinyllama-bnb-4bit",
	max_seq_length=2048,
	load_in_4bit=True,
	)

	model = FastLanguageModel.get_peft_model(
	model,
	r=32,
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
	lora_alpha=32,
	lora_dropout=0,
	)
	send_tg_message("✅ Модель загружена")

	# =====================================================
	# 4. ЗАГРУЗКА 12 ДАТАСЕТОВ
	# =====================================================
	send_tg_message("📚 Начинаю загрузку 12 датасетов...")
	print("\n📚 Загружаю 12 датасетов...")

	def convert_to_standard(example):
	instr = (example.get("instruction") or example.get("prompt") or
	example.get("query") or example.get("question") or
	example.get("conversation") or example.get("text") or "")
	out = (example.get("output") or example.get("response") or
	example.get("answer") or example.get("completion") or
	example.get("chosen") or "")
	if isinstance(instr, list):
	instr = str(instr[0]) if instr else ""
	if isinstance(out, list):
	out = str(out[0]) if out else ""
	if instr and out:
	return {"instruction": str(instr)[:1000], "output": str(out)[:1000]}
	return None

	dataset_links = [
	("princeton-nlp/gemma2-ultrafeedback-armorm", "Ультрафидбек"),
	("sanjay920/gemma-function-calling", "Функции"),
	("Jackrong/qwen3-coder-480b-distill-mini", "Кодер"),
	("masint/qwen3-30b-a3b-instruct-deflate-general", "Инструкции"),
	("mizinovmv/ru_codefeedback_python_Qwen2.5-Coder-32B-Instruct-GPTQ-Int8_sample", "Русский код"),
	("YuminChoi/ThinkSafe-qwen-0.6B-ablation-prompt-risk", "Безопасность"),
	("Crownelius/UltraCHAT-4200x-Qwen3", "Чаты"),
	("Congliu/Chinese-DeepSeek-R1-Distill-data-110k", "Китайский R1"),
	("Kedreamix/psychology-10k-Deepseek-R1-zh", "Психология"),
	("Trelis/openassistant-deepseek-coder", "Код ассистент"),
	("TeichAI/DeepSeek-v4-Pro-Agent", "Агент"),
	("SuperbEmphasis/Claude-4.0-DeepSeek-R1-RP-SFWish", "Ролевые"),
	]

	all_datasets = []
	for link, name in dataset_links:
	try:
	send_tg_message(f"📥 Загружаю {name}...")
	print(f" Загружаю {name}...")
	ds = load_dataset(link, split="train", trust_remote_code=True)
	if len(ds) > 10000:
	ds = ds.select(range(10000))
	converted = ds.map(convert_to_standard, remove_columns=ds.column_names)
	converted = converted.filter(lambda x: x is not None)
	if len(converted) > 0:
	all_datasets.append(converted)
	msg = f"✅ {name}: {len(converted)} примеров"
	print(f" {msg}")
	send_tg_message(msg)
	except Exception as e:
	error_msg = f"⚠️ Ошибка {name}: {str(e)[:100]}"
	print(f" {error_msg}")
	send_tg_message(error_msg)

	# 5. ОБЪЕДИНЕНИЕ
	full_dataset = concatenate_datasets(all_datasets)
	full_dataset = full_dataset.shuffle(seed=42)
	total = len(full_dataset)
	send_tg_message(f"🎯 ИТОГО: {total} примеров из датасетов")
	print(f"\n🎯 ИТОГО: {total} примеров")

	# 6. ФОРМАТИРОВАНИЕ
	def format_tiny(example):
	text = f"<\|user\|>\n{example['instruction']}\n<\|assistant\|>\n{example['output']}</s>"
	return {"text": text}

	formatted_dataset = full_dataset.map(format_tiny)

	# 7. ОБУЧЕНИЕ 75 000 ШАГОВ
	send_tg_message("🔥 <b>НАЧИНАЮ ОБУЧЕНИЕ НА 75 000 ШАГОВ</b>\n⏱️ Поставь кликер на кнопку 'Подключиться'!")
	print("\n🔥 НАЧИНАЮ ОБУЧЕНИЕ НА 75 000 ШАГОВ")

	trainer = SFTTrainer(
	model=model,
	tokenizer=tokenizer,
	train_dataset=formatted_dataset,
	args=SFTConfig(
	output_dir="./tiny_75k_model",
	per_device_train_batch_size=4,
	gradient_accumulation_steps=2,
	warmup_steps=100,
	max_steps=10000,
	learning_rate=2e-4,
	logging_steps=500,
	save_steps=5000,
	optim="adamw_8bit",
	dataset_text_field="text",
	max_seq_length=2048,
	report_to="none",
	),
	)

	# Запускаем обучение с периодической отправкой логов
	trainer.train()
	send_tg_message("✅ <b>ОБУЧЕНИЕ ЗАВЕРШЕНО!</b>")

	# 8. СОХРАНЕНИЕ В DRIVE + ССЫЛКА
	print("\n💾 Сохраняю модель в Google Drive...")
	send_tg_message("💾 Сохраняю модель в Google Drive...")

	timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
	save_folder = f"/content/drive/MyDrive/tinyllama_75k_{timestamp}"

	# Сохраняем полную модель
	model.save_pretrained(save_folder)
	tokenizer.save_pretrained(save_folder)
	print(f"✅ Полная модель: {save_folder}")

	# Конвертируем в GGUF
	print("\n🔄 Конвертирую в GGUF...")
	send_tg_message("🔄 Конвертирую модель в GGUF формат...")
	model.save_pretrained_gguf("/content/tiny_75k_gguf", tokenizer, quantization_method="q4_k_m")

	gguf_file = "/content/tiny_75k_gguf/unsloth.Q4_K_M.gguf"
	gguf_drive_path = f"/content/drive/MyDrive/tinyllama_75k_{timestamp}.gguf"
	if os.path.exists(gguf_file):
	shutil.copy(gguf_file, gguf_drive_path)
	print(f"✅ GGUF файл: {gguf_drive_path}")
	send_tg_message("✅ Модель сконвертирована в GGUF")

	# 9. ОТПРАВКА ССЫЛОК В TELEGRAM
	print("\n" + "="*60)
	print("🔗 ОТПРАВЛЯЮ ССЫЛКИ В TELEGRAM...")
	print("="*60)

	# Кодируем название папки для ссылки
	folder_id = save_folder.split("/")[-1]
	drive_link = f"https://drive.google.com/drive/folders/{folder_id}"

	# Отправляем сообщение со ссылками
	final_message = f"""
	🎉 <b>МОДЕЛЬ ГОТОВА!</b>

	📁 <b>Папка с полной моделью:</b>
	<a href="{drive_link}">открыть в Google Drive</a>

	📄 <b>GGUF файл (для телефона):</b>
	Имя: tinyllama_75k_{timestamp}.gguf
	Путь в Drive: /MyDrive/tinyllama_75k_{timestamp}.gguf

	💡 <b>Как скачать:</b>
	1. Открой Google Drive
	2. Найди файл или папку по ссылке выше
	3. Нажми правой кнопкой → "Скачать"

	⚙️ <b>Характеристики модели:</b>
	- Модель: TinyLlama 1.1B
	- Шагов обучения: 75 000
	- Формат: GGUF Q4_K_M
	- Размер: ~600-650 МБ
	"""

	send_tg_message(final_message)

	# Пробуем отправить сам файл (если меньше 50 МБ)
	if os.path.exists(gguf_file):
	file_size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
	if file_size_mb <= 45:
	send_tg_file(gguf_file, f"TinyLlama 1.1B — 75k шагов\nРазмер: {file_size_mb:.1f} МБ")
	else:
	send_tg_message(f"📦 Размер GGUF файла: {file_size_mb:.1f} МБ\nСкачай по ссылке выше ↑")

	print("\n✅ ВСЕ СООБЩЕНИЯ ОТПРАВЛЕНЫ В TELEGRAM")
	print(f"📁 Ссылка на папку: {drive_link}")