52 lines
1.4 KiB
Python
52 lines
1.4 KiB
Python
"""
Full reindexing of the FAQ from data/docs/.

Usage: python scripts/reindex_all.py
"""
|
|
import os
|
|
import sys
|
|
|
|
# Make the parent of this script's directory importable so that the
# ``services`` package can be resolved. ``abspath`` is applied first because
# ``__file__`` may be a relative path when the script is run directly (the
# case on Python < 3.9); without it the inserted entry would depend on the
# current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, PROJECT_ROOT)
|
|
|
|
from services.faq_service import faq_service
|
|
from services.pdf_indexer import indexer_pdf
|
|
|
|
# Directory that reindexer_tout() walks recursively looking for PDF files.
# It is a relative path, so it is resolved against the current working
# directory at run time.
PDF_DIR = "./data/docs"
|
|
|
|
|
|
def reindexer_tout():
    """Rebuild the FAQ index from every PDF found under ``PDF_DIR``.

    Walks ``PDF_DIR`` recursively and collects the files whose name ends in
    ``.pdf`` (case-insensitive). When none are found, a message is printed
    and the function returns before the collection is reset. Otherwise the
    collection is reset with ``faq_service.reset_collection()`` and each PDF
    is handed to ``indexer_pdf`` in sorted path order; the value it returns
    is added to the running section total. An exception raised while
    handling one file is printed and recorded, and processing continues with
    the next file. A summary is printed at the end.

    Returns:
        None. Progress and results are reported on standard output.
    """
    # Discover and order the candidates up front, so the reset below only
    # happens when there is at least one file to index.
    pdf_paths = sorted(
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(PDF_DIR)
        for filename in filenames
        if filename.lower().endswith(".pdf")
    )
    if not pdf_paths:
        print(f"❌ Aucun PDF dans {PDF_DIR}/")
        return

    print("🗑️ Réinitialisation ChromaDB...")
    faq_service.reset_collection()

    pdf_count = len(pdf_paths)
    print(f"📚 {pdf_count} PDF(s) à indexer...\n")

    section_total = 0
    failed_names = []
    for path in pdf_paths:
        name = os.path.basename(path)
        try:
            section_count = indexer_pdf(faq_service, path)
            section_total += section_count
            print(f" ✅ {name} — {section_count} sections")
        except Exception as exc:
            # One bad file must not abort the whole run: note it and move on.
            failed_names.append(name)
            print(f" ❌ {name} : {exc}")

    separator = "=" * 40
    print("\n" + separator)
    print(f"✅ Terminé — {section_total} sections dans ChromaDB")
    print(f"📄 {pdf_count - len(failed_names)}/{pdf_count} PDFs indexés")
    if failed_names:
        print(f"❌ Échecs : {', '.join(failed_names)}")
|
|
|
|
|
|
# Entry point: run the full reindex only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    reindexer_tout()
|