# okto_version: "1.2"

PROJECT "MonitorFullExample"

DESCRIPTION "Demonstrates complete MONITOR block with all metrics"
|
|
|
# Execution environment requirements: a GPU with at least 16 GB of
# memory, running in fp16 (half) precision.
ENV {
    accelerator: "gpu"
    min_memory: "16GB"
    precision: "fp16"
}
|
|
|
# Data sources: chat-formatted JSONL files.
DATASET {
    train: "examples/datasets/demo_train.jsonl"
    # NOTE(review): validation points at the same file as train -- acceptable
    # for a demo, but a real run should use a held-out split.
    validation: "examples/datasets/demo_train.jsonl"
    format: "jsonl"
    type: "chat"
}
|
|
|
# Model definition: builds "monitor-full-model" from the
# oktoseek/base-mini base checkpoint on a CUDA device.
MODEL {
    name: "monitor-full-model"
    base: "oktoseek/base-mini"
    device: "cuda"
}
|
|
|
# Training hyperparameters.
TRAIN {
    epochs: 10
    batch_size: 32
    learning_rate: 0.0001
    # Matches the CUDA device declared in MODEL.
    device: "cuda"
}
|
|
|
# Runtime monitoring: tracked training metrics, alert thresholds,
# system/throughput logging, and log/export destinations.
MONITOR {
    # Model-quality metrics tracked during training.
    metrics: [
        "loss",
        "val_loss",
        "accuracy",
        "val_accuracy",
        "precision",
        "recall",
        "f1_score",
        "perplexity",
        "confidence",
        "hallucination_score"
    ]

    # Alert conditions; a notification fires when any of them holds.
    # NOTE(review): gpu_usage uses a "%" suffix while the other thresholds
    # are bare numbers -- confirm the parser accepts both forms.
    notify_if {
        loss > 2.0
        gpu_usage > 90%
        gpu_temperature > 85
        val_loss > 2.5
        hallucination_score > 0.5
    }

    # Hardware counters to sample.
    log_system: [
        "gpu_usage",
        "gpu_memory_used",
        "gpu_memory_free",
        "gpu_temperature",
        "cpu_usage",
        "ram_usage"
    ]

    # Throughput and latency measurements to record.
    log_speed: [
        "tokens_per_second",
        "samples_per_second",
        "throughput",
        "latency",
        "step_time"
    ]

    # Sampling period for the monitors above.
    refresh_interval: 2s
    # Destination file for the sampled system metrics.
    export_to: "runs/logs/system.json"
    # Enable the live dashboard.
    dashboard: true
    # Destination file for the training log.
    log_to: "logs/training.log"
}
|
|
|
# Adaptive control hooks evaluated during training.
CONTROL {
    on_epoch_end {
        # Thermal guard: throttle the batch size when the GPU runs hot
        # (same 85-degree threshold used by the MONITOR notify_if alert).
        IF gpu_temperature > 85 {
            SET batch_size = 16
            LOG "GPU temperature high, reducing batch size"
        }
    }
}
|
|
|
# Export the trained model in "okm" format under export/.
EXPORT {
    format: ["okm"]
    path: "export/"
}
|
|
|
|
|