DeepImagix committed on
Commit
1971de2
·
verified Β·
1 Parent(s): e155766

Upload neurones_vision.py

Browse files
Files changed (1) hide show
  1. models/neurones_vision.py +108 -31
models/neurones_vision.py CHANGED
@@ -1,66 +1,143 @@
1
  """
2
  Neurones Vision 1.0
3
  ===================
4
- NeuraPrompt's multimodal model. Powered by Llama 4 Scout (17B) on Groq.
5
- Can see and understand images, read text in images (OCR), describe scenes,
6
- and answer questions about visual content.
7
 
8
- To upgrade: change `groq_model` and bump `version`. That's it.
9
  """
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  MODEL = {
12
- # ── Identity ────────────────────────────────────────────────
13
  "id": "neurones-vision-1.0",
14
  "display_name": "Neurones Vision 1.0",
15
- "version": "1.0",
16
  "release_date": "2026-03-29",
17
- "tagline": "NeuraPrompt's eyes. Sees, reads, and understands images.",
18
 
19
- # ── Speed ───────────────────────────────────────────────────
20
  "speed": "balanced",
21
  "speed_label": "πŸ‘οΈ Vision",
22
 
23
- # ── Backend ─────────────────────────────────────────────────
24
- # llama-4-scout supports image input via Groq's vision API
25
- "groq_model": "openai/gpt-oss-120b",
26
- "groq_vision_model": "openai/gpt-oss-120b", # same model handles vision
27
- "max_tokens": 4096,
28
- "temperature": 0.6,
29
 
30
- # ── Capabilities ────────────────────────────────────────────
31
  "can_stream": False,
32
- "can_reason": False,
33
- "can_vision": True, # Image upload enabled for this model
34
- "can_generate_image": True, # Analysis only, not generation
 
35
  "can_search": False,
36
  "can_code": False,
37
- "can_translate": True,
38
  "can_summarise": True,
39
  "is_local": False,
 
40
 
41
- # ── Limits ──────────────────────────────────────────────────
42
  "context_window": 16384,
43
  "rate_limit_rpm": 10,
44
 
45
- # ── System Prompt ───────────────────────────────────────────
46
  "system_prompt": (
47
- "You are Neurones Vision 1.0, NeuraPrompt's multimodal AI model, "
48
  "created by Andile Mtolo (Toxic Dee Modder). "
49
- "When given an image, analyse it thoroughly: describe what you see, "
50
- "read and transcribe any text visible in the image, identify objects, "
51
- "people, locations, colours, and context. "
52
- "Answer questions about images with precision and detail. "
53
- "If text is present in the image, always extract and include it in your response."
 
 
 
 
 
54
  ),
55
 
56
- # ── UI Hints ─────────────────────────────────────────────────
57
  "badge_color": "#ff6d00",
58
  "icon": "πŸ‘οΈ",
59
  "recommended_for": [
60
- "image analysis", "OCR / text extraction",
61
- "visual Q&A", "document scanning", "photo description",
 
62
  ],
63
  "not_recommended_for": [
64
- "deep mathematical reasoning", "generating images",
65
  ],
66
  }
 
"""
Neurones Vision 1.0
===================
NeuraPrompt's multimodal model. Powered by meta-llama/llama-4-scout-17b on Groq.
Handles: images, documents (PDF/text/code), OCR, file analysis, visual Q&A.
NOT for: general chat, math, coding tasks — redirects user to another model.

Datasets: scans models/datasets/ for image-related datasets automatically.
"""
10
 
11
+ import pathlib
12
+ import json
13
+ import logging
14
+
15
# ── Dataset scanner ──────────────────────────────────────────────
# Folder holding dataset files, expected alongside this module
# (models/datasets/). Resolved relative to this file so the scan
# works regardless of the process's working directory.
DATASETS_DIR = pathlib.Path(__file__).parent / "datasets"
18
+ # Keywords that indicate a dataset is image/vision related
19
+ _IMAGE_KEYWORDS = {
20
+ "image", "vision", "visual", "photo", "picture", "img",
21
+ "caption", "scene", "object", "detection", "classify",
22
+ "ocr", "document", "diagram", "chart", "screenshot",
23
+ }
24
+
25
+ def _is_vision_dataset(filepath: pathlib.Path) -> bool:
26
+ """Heuristic: check filename for vision-related keywords."""
27
+ name = filepath.stem.lower()
28
+ return any(kw in name for kw in _IMAGE_KEYWORDS)
29
+
30
+ def load_vision_datasets(max_per_file: int = 2000) -> list[dict]:
31
+ """
32
+ Load vision/image-related datasets from models/datasets/.
33
+ Skips text-only datasets based on filename heuristics.
34
+ Returns list of {prompt, response} pairs for few-shot context.
35
+ """
36
+ if not DATASETS_DIR.exists():
37
+ return []
38
+
39
+ pairs = []
40
+ for fp in sorted(DATASETS_DIR.iterdir()):
41
+ if not fp.is_file():
42
+ continue
43
+ suffix = "".join(fp.suffixes).lower()
44
+ if suffix not in (".jsonl", ".jsonl.txt", ".json", ".txt"):
45
+ continue
46
+ if not _is_vision_dataset(fp):
47
+ logging.debug(f"[Vision] Skipping non-vision dataset: {fp.name}")
48
+ continue
49
+
50
+ count = 0
51
+ try:
52
+ with open(fp, "r", encoding="utf-8", errors="replace") as f:
53
+ for line in f:
54
+ line = line.strip()
55
+ if not line:
56
+ continue
57
+ try:
58
+ entry = json.loads(line)
59
+ except json.JSONDecodeError:
60
+ continue
61
+
62
+ # Extract prompt/response pair
63
+ prompt = (entry.get("question") or entry.get("prompt") or
64
+ entry.get("instruction") or entry.get("input") or "")
65
+ response = (entry.get("answer") or entry.get("response") or
66
+ entry.get("output") or entry.get("caption") or "")
67
+ if prompt and response and len(response) > 10:
68
+ pairs.append({"prompt": str(prompt)[:300], "response": str(response)[:500]})
69
+ count += 1
70
+ if count >= max_per_file:
71
+ break
72
+
73
+ if count:
74
+ logging.info(f"[Vision] Loaded {count} pairs from {fp.name}")
75
+ except Exception as e:
76
+ logging.warning(f"[Vision] Failed to read {fp.name}: {e}")
77
+
78
+ return pairs
79
+
80
+
81
# Registry entry for this model. Consumed elsewhere (e.g. main.py reads
# "vision_only" to block plain chat); keys/values must stay stable.
MODEL = {
    # ── Identity ─────────────────────────────────────────────────
    "id": "neurones-vision-1.0",
    "display_name": "Neurones Vision 1.0",
    "version": "1.1",
    "release_date": "2026-03-29",
    "tagline": "NeuraPrompt's eyes. Sees, reads, and understands images and files.",

    # ── Speed ────────────────────────────────────────────────────
    "speed": "balanced",
    "speed_label": "👁️ Vision",

    # ── Backend ──────────────────────────────────────────────────
    "groq_model": "meta-llama/llama-4-scout-17b-16e-instruct",
    "groq_vision_model": "meta-llama/llama-4-scout-17b-16e-instruct",
    "max_tokens": 4096,
    "temperature": 0.3,

    # ── Capabilities ─────────────────────────────────────────────
    "can_stream": False,
    "can_reason": True,
    "can_vision": True,
    "can_files": True,  # NEW: file analysis
    "can_generate_image": False,
    "can_search": False,
    "can_code": False,
    "can_translate": False,
    "can_summarise": True,
    "is_local": False,
    "vision_only": True,  # main.py uses this to block plain chat

    # ── Limits ───────────────────────────────────────────────────
    "context_window": 16384,
    "rate_limit_rpm": 10,

    # ── System Prompt ─────────────────────────────────────────────
    "system_prompt": (
        "You are Neurones Vision 1.0, NeuraPrompt's visual analysis model, "
        "created by Andile Mtolo (Toxic Dee Modder). "
        "Your specialty is images and files ONLY.\n\n"
        "For IMAGES: describe thoroughly, extract all visible text (OCR), "
        "identify objects, people, colours, scene type, and context. "
        "Answer any question about the visual content with precision.\n\n"
        "For FILES/DOCUMENTS: extract text content, summarise key points, "
        "identify structure (headings, tables, code), and answer questions.\n\n"
        "If a user sends a plain text message with NO image or file, respond:\n"
        "'I am Neurones Vision — I specialise in images and files. "
        "For general chat please switch to Neurones Pro or Flash using the model selector.'\n\n"
        "Never guess when you cannot see something clearly — say so."
    ),

    # ── UI Hints ──────────────────────────────────────────────────
    "badge_color": "#ff6d00",
    "icon": "👁️",
    "recommended_for": [
        "image analysis", "OCR / text extraction", "file reading",
        "document scanning", "photo description", "visual Q&A",
        "PDF summary", "screenshot analysis",
    ],
    "not_recommended_for": [
        "general chat", "math", "coding", "real-time search",
    ],
}