111 lines
3.8 KiB
Python
111 lines
3.8 KiB
Python
import logging
|
|
from io import BytesIO
|
|
import httpx
|
|
|
|
# Module logger. Created before the optional-import guards so that both of
# them can report a missing dependency through the same logger (no temporary
# logger left behind in the module namespace).
log = logging.getLogger(__name__)

# pypdf is optional: without it, the flag below is False and a warning is logged.
try:
    import pypdf
    _pypdf_ok = True
except ImportError:
    _pypdf_ok = False
    log.warning("pypdf non installé — extraction PDF désactivée. pip install pypdf")

# chromadb is optional: without it, the flag below is False and a warning is logged.
try:
    import chromadb
    _chroma_ok = True
except ImportError:
    _chroma_ok = False
    log.warning("chromadb non installé — FAQ SAV désactivée. Installez : pip install chromadb")

# Ollama generation endpoint (local instance) and the model name sent with each request.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "phi3"
|
|
|
|
|
|
class SAVAssistant:
|
|
def __init__(self):
|
|
self._collection = None
|
|
if not _chroma_ok:
|
|
return
|
|
try:
|
|
db = chromadb.PersistentClient(path="./sav_knowledge")
|
|
self._collection = db.get_or_create_collection("docs_techniques")
|
|
except Exception as exc:
|
|
log.error("SAVAssistant init : %s", exc)
|
|
|
|
@property
|
|
def disponible(self) -> bool:
|
|
return self._collection is not None and self._collection.count() > 0
|
|
|
|
async def indexer_document(self, texte: str, source: str) -> int:
|
|
if not self._collection:
|
|
return 0
|
|
chunks = [texte[i:i + 500] for i in range(0, len(texte), 400)]
|
|
try:
|
|
self._collection.delete(where={"source": source})
|
|
except Exception:
|
|
pass
|
|
self._collection.add(
|
|
documents=chunks,
|
|
ids=[f"{source}_{i}" for i in range(len(chunks))],
|
|
metadatas=[{"source": source}] * len(chunks),
|
|
)
|
|
log.info("Indexé %d chunks depuis '%s'", len(chunks), source)
|
|
return len(chunks)
|
|
|
|
async def chercher(self, question: str, n_results: int = 3) -> str | None:
|
|
if not self.disponible:
|
|
return None
|
|
try:
|
|
results = self._collection.query(query_texts=[question], n_results=n_results)
|
|
extraits = results["documents"][0]
|
|
sources = [m["source"] for m in results["metadatas"][0]]
|
|
if not extraits:
|
|
return None
|
|
# Reformuler avec Ollama si disponible
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30) as client:
|
|
r = await client.post(
|
|
OLLAMA_URL,
|
|
json={
|
|
"model": OLLAMA_MODEL,
|
|
"prompt": (
|
|
f"Problème terrain : {question}\n\n"
|
|
"Documentation :\n" + "\n\n".join(extraits) +
|
|
"\n\nDonne 3 pistes de diagnostic en français, "
|
|
"en langage simple pour un technicien de terrain."
|
|
),
|
|
"stream": False,
|
|
},
|
|
)
|
|
if r.status_code == 200:
|
|
return r.json().get("response", "").strip() or None
|
|
except Exception:
|
|
pass
|
|
# Fallback : extraits bruts
|
|
return "\n\n---\n\n".join(f"📄 *{s}*\n{e}" for s, e in zip(sources, extraits))
|
|
except Exception as exc:
|
|
log.error("SAVAssistant.chercher : %s", exc)
|
|
return None
|
|
|
|
|
|
# Module-level shared instance, built at import time. The constructor logs
# initialisation errors instead of raising, leaving the instance disabled.
sav_assistant = SAVAssistant()
|
|
|
|
|
|
def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract the text of a PDF, page by page.

    Each page's text is stripped; pages left with no text are skipped and the
    remaining pages are joined with a blank line.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The extracted text, or "" when pypdf is not installed, the document
        cannot be read, or no page yields any text.
    """
    if not _pypdf_ok:
        return ""
    try:
        reader = pypdf.PdfReader(BytesIO(pdf_bytes))
        # Strip before testing for emptiness: a whitespace-only page must not
        # contribute an empty entry (and stray separators) to the result.
        textes = ((page.extract_text() or "").strip() for page in reader.pages)
        return "\n\n".join(t for t in textes if t)
    except Exception as exc:
        logging.getLogger(__name__).error("extract_pdf_text : %s", exc)
        return ""
|