Spaces:
Sleeping
Sleeping
Initial commit - Code education RAG Space
Browse files- .gitignore +21 -0
- app.py +216 -0
- data/chunks_articles.jsonl +0 -0
- requirements.txt +13 -0
- src/rag_core.py +381 -0
.gitignore
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
|
| 5 |
+
# Local venv (si jamais)
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
llm_code_education_env/
|
| 9 |
+
|
| 10 |
+
# Modèles (jamais dans git)
|
| 11 |
+
models/
|
| 12 |
+
*.gguf
|
| 13 |
+
*.bin
|
| 14 |
+
models/*.gguf
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Secrets
|
| 18 |
+
.env
|
| 19 |
+
|
| 20 |
+
# OS
|
| 21 |
+
.DS_Store
|
app.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py — Gradio UI for hf-code-education (CPU / Hugging Face Spaces)
|
| 2 |
+
# This file must NOT change the validated RAG logic.
|
| 3 |
+
# It only calls src/rag_core.py:answer_query(query).
|
| 4 |
+
# to launch http://localhost:7860
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import traceback
|
| 10 |
+
import gradio as gr
|
| 11 |
+
from huggingface_hub import hf_hub_download
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def ensure_model_present():
    """Ensure models/mistral.gguf exists locally, downloading it if absent.

    Configuration comes from environment variables:
    - MODEL_REPO_ID: Hugging Face repo to download from (required when the
      local file is missing).
    - MODEL_FILENAME: file name inside that repo (default: "mistral.gguf").

    Raises:
        RuntimeError: if the model is absent and MODEL_REPO_ID is not set.
    """
    import shutil  # hoisted to the top of the function (was mid-body)

    os.makedirs("models", exist_ok=True)
    local_path = os.path.join("models", "mistral.gguf")
    if os.path.exists(local_path):
        return

    repo_id = os.environ.get("MODEL_REPO_ID")
    filename = os.environ.get("MODEL_FILENAME", "mistral.gguf")

    if not repo_id:
        # User-facing French message kept as-is.
        raise RuntimeError(
            "Modèle GGUF absent (models/mistral.gguf) et variable MODEL_REPO_ID non définie."
        )

    # hf_hub_download may return a symlink into the HF cache; copyfile follows
    # the link, so models/mistral.gguf ends up a real standalone file.
    downloaded = hf_hub_download(repo_id=repo_id, filename=filename)
    shutil.copyfile(downloaded, local_path)
|
| 31 |
+
|
| 32 |
+
# Download the GGUF model (if needed) BEFORE importing rag_core, which
# instantiates the Llama model at import time and needs the file on disk.
ensure_model_present()


# Ensure we can import src/rag_core.py without requiring src/ to be a package
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(ROOT_DIR, "src")
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Import the validated core; fail loudly with context if it is missing.
try:
    import rag_core  # src/rag_core.py
except Exception as e:
    # User-facing French message kept as-is.
    raise RuntimeError(
        "Impossible d'importer src/rag_core.py. "
        "Vérifie que le fichier existe bien et qu'il s'appelle exactement rag_core.py."
    ) from e
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _format_result(result) -> str:
|
| 53 |
+
"""
|
| 54 |
+
Formats output robustly WITHOUT assuming a strict schema.
|
| 55 |
+
We do NOT modify any RAG logic, just display what comes back.
|
| 56 |
+
"""
|
| 57 |
+
if result is None:
|
| 58 |
+
return "Aucune réponse (result=None)."
|
| 59 |
+
|
| 60 |
+
# Most common: a string answer
|
| 61 |
+
if isinstance(result, str):
|
| 62 |
+
return result
|
| 63 |
+
|
| 64 |
+
# If the core returns a dict-like object
|
| 65 |
+
if isinstance(result, dict):
|
| 66 |
+
# Try common keys while staying generic
|
| 67 |
+
parts = []
|
| 68 |
+
if "mode" in result:
|
| 69 |
+
parts.append(f"Mode: {result['mode']}")
|
| 70 |
+
if "answer" in result:
|
| 71 |
+
parts.append(str(result["answer"]))
|
| 72 |
+
elif "response" in result:
|
| 73 |
+
parts.append(str(result["response"]))
|
| 74 |
+
else:
|
| 75 |
+
# Fallback: dump dict (readable)
|
| 76 |
+
parts.append(str(result))
|
| 77 |
+
|
| 78 |
+
# Optional: sources / context / citations
|
| 79 |
+
for k in ["sources", "citations", "articles", "context_used", "context"]:
|
| 80 |
+
if k in result and result[k]:
|
| 81 |
+
parts.append(f"\n\n---\n{k}:\n{result[k]}")
|
| 82 |
+
return "\n\n".join(parts)
|
| 83 |
+
|
| 84 |
+
# If the core returns a tuple/list (e.g., (answer, meta))
|
| 85 |
+
if isinstance(result, (tuple, list)):
|
| 86 |
+
return "\n\n".join([str(x) for x in result])
|
| 87 |
+
|
| 88 |
+
# Fallback
|
| 89 |
+
return str(result)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def chat_once(user_query: str) -> str:
    """
    One-shot question -> answer through the validated RAG core.

    Empty input short-circuits with a hint. Any exception raised while
    answering is rendered as text so it is visible in the HF Spaces UI/logs.
    """
    query = (user_query or "").strip()
    if not query:
        return "Entre une question ou une demande (vide = rien à traiter)."

    try:
        # The RAG logic lives entirely in rag_core; we only relay the call.
        raw = rag_core.answer_query(query)
        return _format_result(raw)
    except Exception:
        # Surface the traceback transparently (useful on HF Spaces logs).
        return "Erreur côté application (pas côté utilisateur):\n\n" + traceback.format_exc()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
CSS = """
|
| 111 |
+
/* Police sérieuse, institutionnelle */
|
| 112 |
+
:root {
|
| 113 |
+
--font-sans: Inter, "Source Sans 3", Roboto, "Segoe UI", Arial, sans-serif;
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
body, .gradio-container {
|
| 117 |
+
font-family: var(--font-sans) !important;
|
| 118 |
+
font-size: 15px;
|
| 119 |
+
line-height: 1.5;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
/* Titres plus sobres */
|
| 123 |
+
h1, h2, h3 {
|
| 124 |
+
font-weight: 600;
|
| 125 |
+
letter-spacing: -0.01em;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
/* Page un peu plus compacte */
|
| 129 |
+
.gradio-container {
|
| 130 |
+
max-width: 980px !important;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
/* Réponse : hauteur max + scroll */
|
| 134 |
+
#answer textarea {
|
| 135 |
+
max-height: 360px !important;
|
| 136 |
+
overflow-y: auto !important;
|
| 137 |
+
font-size: 14px;
|
| 138 |
+
line-height: 1.55;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
/* Moins d'espacement vertical */
|
| 142 |
+
.wrap {
|
| 143 |
+
gap: 0.6rem !important;
|
| 144 |
+
}
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# Gradio UI — layout only; all answering goes through chat_once().
with gr.Blocks(
    title="Assistant Code de l’éducation (RAG)",
    css=CSS,
    theme=gr.themes.Soft(),
) as demo:
    # Intro / scope markdown (user-facing French — kept as-is).
    gr.Markdown(
        """
# Assistant Code de l’éducation
Cet outil recherche dans le Code de l’éducation (version du 7 janvier 2026) et répond uniquement à partir des articles retrouvés.

### Ce que l’outil fait
- Cite des articles (ou liste des articles pertinents)
- Répond à une question si le texte nécessaire est présent dans les articles retrouvés

### Ce que l’outil ne fait pas
- N’invente pas : si le contexte est insuffisant, il refuse et le dit clairement
- Ne remplace pas une validation juridique

Conseil : pour une citation exacte, demande “Donne l’intégralité de l’article …”.
""".strip()
    )

    # Operational notice (startup delay, sequential handling).
    gr.Markdown(
        """
> **Information importante**
> Lors du premier lancement, l’application peut nécessiter **1 à 2 minutes** d’initialisation.
> Ensuite, l’utilisation est immédiate.
> En cas d’utilisation simultanée, les demandes sont traitées **successivement** afin de garantir la fiabilité des réponses.
""".strip()
    )

    with gr.Row():
        # Question input.
        inp = gr.Textbox(
            label="Votre demande",
            placeholder="Ex : Donne l’intégralité de l’article D454-14",
            lines=2,
            max_lines=4,
        )

    with gr.Row():
        # Answer output; elem_id "answer" is targeted by the CSS above.
        out = gr.Textbox(
            label="Réponse",
            elem_id="answer",
            lines=10,
            max_lines=14,
        )

    with gr.Row():
        btn = gr.Button("Répondre", variant="primary")
        clear = gr.Button("Effacer")

    # Wire the buttons: answer via the core, clear both boxes.
    btn.click(chat_once, inputs=inp, outputs=out)
    clear.click(lambda: ("", ""), outputs=[inp, out])

    with gr.Accordion("Exemples de requêtes", open=False):
        gr.Examples(
            examples=[
                "Donne l'intégralité de l'article D454-14",
                "Liste les articles qui parlent de l'obligation scolaire",
                "Quelles sont les conditions de nomination d'un chef d'établissement ? Cite uniquement les articles fournis.",
            ],
            inputs=inp,
        )
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# HF Spaces expects launch on 0.0.0.0:7860
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
data/chunks_articles.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.1
|
| 2 |
+
faiss-cpu==1.8.0.post1
|
| 3 |
+
numpy==1.26.4
|
| 4 |
+
sentence-transformers==3.0.1
|
| 5 |
+
llama-cpp-python==0.3.7
|
| 6 |
+
langchain-community
|
| 7 |
+
torch
|
| 8 |
+
langchain-huggingface
|
| 9 |
+
huggingface_hub
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
src/rag_core.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
rag_core.py
|
| 6 |
+
|
| 7 |
+
Transposition FIDÈLE de rag_chat_llama.py (mêmes règles, mêmes seuils, même prompt,
|
| 8 |
+
même validation anti-hallucination), mais sans boucle interactive : on expose
|
| 9 |
+
une fonction answer_query(question) utilisable par une app Hugging Face.
|
| 10 |
+
|
| 11 |
+
ROUTAGE AUTO :
|
| 12 |
+
- FULLTEXT : demande "texte exact / intégral / article X" => impression exacte depuis JSONL (SANS LLM)
|
| 13 |
+
- LIST : demande "quels articles parlent ..." => liste articles + extrait (SANS LLM)
|
| 14 |
+
- QA : RAG => LLM (llama/Mistral) + prompt strict + VALIDATION (anti-hallucinations)
|
| 15 |
+
|
| 16 |
+
Prérequis :
|
| 17 |
+
- data/chunks_articles.jsonl (article-level)
|
| 18 |
+
- db/faiss_code_edu_by_article (FAISS)
|
| 19 |
+
- models/mistral.gguf (GGUF)
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import re
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from typing import List, Tuple, Optional, Dict, Iterable, Any
|
| 26 |
+
|
| 27 |
+
from langchain_community.vectorstores import FAISS
|
| 28 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 29 |
+
from llama_cpp import Llama
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# -------------------- CONFIG --------------------
CHUNKS_PATH = Path("data/chunks_articles.jsonl")  # article-level JSONL chunks
DB_DIR = Path("db/faiss_code_edu_by_article")     # persisted FAISS index

EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): MODEL_NAME appears unused here — the GGUF path is hard-coded
# in the Llama() init below. Confirm before removing.
MODEL_NAME = "mistral:latest"

TOP_K_FETCH = 30        # number of candidate docs retrieved
TOP_K_FINAL = 4         # max number of docs sent to the LLM
SCORE_THRESHOLD = 1.10  # tune as needed (see the score display)
MAX_CHARS_PER_DOC = 800
SNIPPET_CHARS = 260

# FULLTEXT triggers: phrases asking for the exact/full article text
FULLTEXT_TRIGGERS = [
    "contenu exact", "texte exact", "texte intégral", "texte integral",
    "intégral", "integral", "cite intégralement", "cite integralement",
    "donne l'intégralité", "donne l'integralite", "recopie", "reproduis",
    "affiche l'article", "donne l'article", "donne moi l'article",
]

# LIST triggers: phrases asking which articles cover a topic
LIST_TRIGGERS = [
    "quels articles", "quelles dispositions", "articles parlent",
    "articles qui parlent", "articles sur", "donne les articles",
    "cite les articles", "références", "references",
]

# Article id regex, e.g. "L131-1" or "D 422.15", optionally prefixed by "article"
ARTICLE_ID_RE = re.compile(
    r"\b(?:article\s+)?([LDR]\s?\d{1,4}(?:[.-]\d+){0,4})\b",
    flags=re.IGNORECASE
)

EPLE_RE = re.compile(r"\bEPLE\b", flags=re.IGNORECASE)

# Used to validate the mandatory "Articles cités : ..." output line
ARTICLES_CITES_RE = re.compile(r"Articles cités\s*:\s*(.*)$", flags=re.IGNORECASE | re.MULTILINE)
| 70 |
+
|
| 71 |
+
# -------------------- LLM INIT (faithful to the original script) --------------------
# Instantiated at import time: importing this module loads the GGUF model,
# so models/mistral.gguf must exist before the import happens.
# NOTE(review): n_threads=10 is hard-coded — confirm it matches the CPU quota.
llm = Llama(
    model_path="models/mistral.gguf",  # Mistral GGUF
    n_ctx=2048,    # context window (tokens)
    n_threads=10,
    n_batch=128,
    verbose=False,
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def llm_generate(prompt: str) -> str:
    """Run a single chat completion on the local llama.cpp model.

    Low temperature (0.1) and a 200-token cap keep answers short and
    conservative; returns the stripped message text.
    """
    completion = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        max_tokens=200,
    )
    message = completion["choices"][0]["message"]["content"]
    return message.strip()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# -------------------- UTILS (FIDÈLES) --------------------
|
| 91 |
+
|
| 92 |
+
def normalize_article_id(raw: str) -> str:
    """Canonicalise an article id: trim, uppercase, drop spaces, '.' -> '-'.

    e.g. " d 422.15 " -> "D422-15".
    """
    return raw.strip().upper().replace(" ", "").replace(".", "-")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def extract_article_id(q: str) -> Optional[str]:
    """Return the normalised article id found in *q*, or None if absent."""
    match = ARTICLE_ID_RE.search(q)
    return normalize_article_id(match.group(1)) if match else None
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def is_fulltext_request(q: str) -> bool:
    """Decide whether the user wants the exact/full text of an article.

    True when an explicit trigger phrase appears, or when the query is so
    short (<= 25 chars) that a bare article id means "print that article".
    """
    lowered = q.lower()
    for trigger in FULLTEXT_TRIGGERS:
        if trigger in lowered:
            return True
    return len(lowered) <= 25 and extract_article_id(q) is not None
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def is_list_request(q: str) -> bool:
    """True when the user asks which articles/dispositions cover a topic."""
    lowered = q.lower()
    for trigger in LIST_TRIGGERS:
        if trigger in lowered:
            return True
    return False
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Remove duplicates while preserving first-occurrence order.

    dicts preserve insertion order (Python 3.7+), so dict.fromkeys gives an
    ordered, C-speed dedupe instead of the manual seen-set loop.
    """
    return list(dict.fromkeys(items))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def safe_snippet(text: str, n: int) -> str:
    """Collapse runs of whitespace and truncate to *n* chars with an ellipsis.

    None/empty input yields "".
    """
    collapsed = " ".join((text or "").split())
    if len(collapsed) > n:
        return collapsed[:n].rstrip() + "…"
    return collapsed
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def load_article_text(article_id: str) -> Optional[str]:
    """Scan the JSONL chunks file for *article_id* and return its exact text.

    Linear scan on every call — acceptable for one-off FULLTEXT requests.
    Returns None when the article is not present.

    Raises:
        FileNotFoundError: when the chunks file is missing.
    """
    if not CHUNKS_PATH.exists():
        raise FileNotFoundError(f"Fichier chunks introuvable : {CHUNKS_PATH}")

    with CHUNKS_PATH.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # skip blank lines
            record = json.loads(raw_line)
            if normalize_article_id(record.get("article_id", "")) == article_id:
                return (record.get("text") or "").strip()
    return None
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def load_vectorstore() -> FAISS:
    """Load the persisted FAISS index with the embedding model it was built with.

    allow_dangerous_deserialization=True is needed because the index is a
    local pickle produced by our own build step (trusted input).

    Raises:
        FileNotFoundError: when the index directory is missing.
    """
    if not DB_DIR.exists():
        raise FileNotFoundError(f"Index FAISS introuvable : {DB_DIR}")
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    return FAISS.load_local(str(DB_DIR), embedder, allow_dangerous_deserialization=True)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def retrieve_scored(vs: FAISS, query: str) -> List[Tuple[object, float]]:
    """
    Return TOP_K_FETCH candidate (Document, score) pairs for *query*.

    Scores are FAISS distances: the SMALLER the score, the closer the match.
    """
    return vs.similarity_search_with_score(query, k=TOP_K_FETCH)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def filter_docs(scored: List[Tuple[object, float]]) -> List[Tuple[object, float]]:
    """
    Keep the best TOP_K_FINAL docs whose distance is within SCORE_THRESHOLD.

    Fallback: if nothing passes the threshold, take the overall best
    TOP_K_FINAL anyway — otherwise the pipeline refuses too often.
    """
    within = [pair for pair in scored if pair[1] <= SCORE_THRESHOLD]
    pool = within if within else scored
    return sorted(pool, key=lambda pair: pair[1])[:TOP_K_FINAL]
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def build_context(scored_docs: List[Tuple[object, float]]) -> Tuple[str, List[str], Dict[str, str], Dict[str, float]]:
    """Turn retrieved (Document, score) pairs into LLM context plus lookup maps.

    Returns a 4-tuple:
    - context: "[ID]\\ntext" blocks joined by blank lines, each doc capped
      at MAX_CHARS_PER_DOC for the prompt;
    - used: deduplicated article ids in retrieval order;
    - by_id: article id -> FULL (untruncated) text;
    - by_score: article id -> FAISS distance (smaller = closer).
    """
    seen_ids: List[str] = []
    full_text: Dict[str, str] = {}
    distances: Dict[str, float] = {}
    blocks: List[str] = []

    for doc, score in scored_docs:
        article = normalize_article_id(doc.metadata.get("article_id", "UNKNOWN"))
        seen_ids.append(article)

        body = (doc.page_content or "").strip()
        full_text[article] = body          # keep the full text for FULLTEXT/snippets
        distances[article] = float(score)

        shown = body
        if len(shown) > MAX_CHARS_PER_DOC:
            shown = shown[:MAX_CHARS_PER_DOC].rstrip() + "\n[.]"
        blocks.append(f"[{article}]\n{shown}")

    return "\n\n".join(blocks), dedupe_keep_order(seen_ids), full_text, distances
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def eple_context_ok(question: str, by_id: Dict[str, str]) -> bool:
    """
    Safety gate for the EPLE acronym.

    When the question mentions EPLE (établissement public local
    d'enseignement, i.e. collèges/lycées), require the retrieved context to
    contain explicit collège/lycée/EPLE wording; otherwise the QA step
    should refuse.
    """
    if not EPLE_RE.search(question):
        return True  # question is not about EPLE: nothing to check

    haystack = "\n".join(by_id.values()).lower()
    signals = (
        "établissement public local d'enseignement",
        "etablissement public local d'enseignement",
        "collège", "college", "lycée", "lycee",
        "chef d'établissement", "chef d'etablissement",
    )
    for signal in signals:
        if signal in haystack:
            return True
    return False
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def extract_cited_articles(answer: str) -> List[str]:
    """
    Parse the ids listed after "Articles cités :" in the LLM answer.

    Tolerates wrappers like "[D422-15]" or "D422-15."; returns normalised
    ids, deduplicated, in citation order. Empty list when the line is
    missing or empty.
    """
    found = ARTICLES_CITES_RE.search(answer)
    if not found:
        return []
    tail = found.group(1).strip()
    if not tail:
        return []

    cited: List[str] = []
    for token in re.split(r"[,\s]+", tail):
        cleaned = token.strip().strip("[]().;:")
        if not cleaned:
            continue
        if ARTICLE_ID_RE.match(cleaned) or re.match(r"^[LDR]\d", cleaned, flags=re.I):
            cited.append(normalize_article_id(cleaned))
    return dedupe_keep_order(cited)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def validate_answer(answer: str, allowed_articles: List[str]) -> bool:
    """
    Anti-hallucination gate on the LLM output.

    The answer must cite at least one article, and every cited article must
    come from the retrieved (allowed) set — otherwise the caller refuses.
    """
    cited = extract_cited_articles(answer)

    # No citations at all => the model may be free-wheeling: refuse.
    if not cited:
        return False

    # Citing an article outside the allowed list is forbidden.
    return set(cited).issubset(allowed_articles)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def build_prompt(question: str, context: str, allowed_articles: List[str]) -> str:
    """Assemble the strict French QA prompt sent to the local LLM.

    The prompt constrains citations to *allowed_articles* and mandates a
    final "Articles cités : ..." line, which validate_answer parses
    afterwards. The refusal sentence in rule 3 matches _REFUSAL exactly.
    """
    allowed = ", ".join(allowed_articles)

    # Model-facing French text — do not translate or reword.
    return f"""Tu es un assistant juridique spécialisé dans le Code de l'éducation (France).

RÈGLES ABSOLUES (non négociables) :
1) Tu réponds UNIQUEMENT à partir du CONTEXTE fourni ci-dessous.
2) Tu n'inventes rien, tu ne complètes pas, tu ne "supposes" pas. Interdiction d'utiliser :
   "on peut supposer", "il est possible que", "on peut déduire", "probablement", etc.
3) Si le CONTEXTE ne permet pas de répondre, tu dis exactement :
   "Je ne peux pas répondre avec certitude à partir des articles fournis."
4) Tu DOIS citer uniquement des articles présents dans la liste autorisée :
   {allowed}
5) Attention au sigle EPLE :
   - EPLE = établissement public local d'enseignement (collèges/lycées).
   - Ne confonds pas avec d'autres établissements.
   Si le CONTEXTE ne traite pas clairement des EPLE au sens collèges/lycées, tu refuses de conclure.

QUESTION :
{question}

CONTEXTE :
{context}

FORMAT DE SORTIE OBLIGATOIRE :
- Une réponse courte et factuelle.
- Dernière ligne STRICTE : "Articles cités : A, B, C" (uniquement parmi la liste autorisée).
"""
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# -------------------- CORE API (HF) --------------------
# Canonical refusal string — must match rule 3 of the prompt exactly.
_REFUSAL = "Je ne peux pas répondre avec certitude à partir des articles fournis."

# Cache so FAISS (and the embedding model) is loaded once per process.
_VS: Optional[FAISS] = None


def get_vectorstore() -> FAISS:
    """Return the FAISS vectorstore, loading it lazily on first use."""
    global _VS
    if _VS is None:
        _VS = load_vectorstore()
    return _VS
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def answer_query(q: str) -> Dict[str, Any]:
    """
    API equivalent of the interactive loop in rag_chat_llama.py.

    Routing order: FULLTEXT (exact article text, no LLM) -> LIST (matching
    articles + snippets, no LLM) -> QA (LLM + anti-hallucination validation).

    Returns a structured dict:
    - mode: "FULLTEXT" | "LIST" | "QA"
    - answer: str (final answer, or the refusal string)
    - articles: retrieved article ids (for debug/display)
    - scores: dict {article: FAISS distance} (for debug/display)
    - snippets: (LIST only) dict {article: snippet}
    - fulltext: (FULLTEXT only) exact text, or None when not found
    """
    q = (q or "").strip()
    if not q:
        # Empty input: refuse immediately without touching the index.
        return {"mode": "QA", "answer": _REFUSAL, "articles": [], "scores": {}}

    vs = get_vectorstore()

    # --- FULLTEXT: exact article text straight from the JSONL, no LLM ---
    aid = extract_article_id(q)
    if aid and is_fulltext_request(q):
        txt = load_article_text(aid)
        if not txt:
            return {
                "mode": "FULLTEXT",
                "answer": f"Je ne trouve pas l'article {aid} dans {CHUNKS_PATH}.",
                "articles": [],
                "scores": {},
                "fulltext": None,
            }
        return {
            "mode": "FULLTEXT",
            "answer": txt,
            "articles": [aid],
            "scores": {},
            "fulltext": txt,
        }

    # --- RETRIEVE (scored): fetch candidates, then threshold/cap them ---
    scored = retrieve_scored(vs, q)
    scored = filter_docs(scored)
    context, articles, by_id, by_score = build_context(scored)

    # --- LIST: article references + snippets, no LLM ---
    if is_list_request(q):
        snippets = {a: safe_snippet(by_id.get(a, ""), SNIPPET_CHARS) for a in articles}
        return {
            "mode": "LIST",
            "answer": "",
            "articles": articles,
            "scores": by_score,
            "snippets": snippets,
        }

    # --- EPLE safety gate: refuse when context lacks EPLE signals ---
    if not eple_context_ok(q, by_id):
        return {
            "mode": "QA",
            "answer": _REFUSAL,
            "articles": articles,
            "scores": by_score,
        }

    # --- QA (LLM): strict prompt over the retrieved context ---
    prompt = build_prompt(q, context, articles)
    answer = llm_generate(prompt)

    # --- VALIDATION: refuse when citations are missing or out-of-set ---
    if not validate_answer(answer, articles):
        return {
            "mode": "QA",
            "answer": _REFUSAL,
            "articles": articles,
            "scores": by_score,
        }

    return {
        "mode": "QA",
        "answer": answer,
        "articles": articles,
        "scores": by_score,
    }
|