# etm-terrain/scripts/reindex_all.py (52 lines, 1.4 KiB, Python)

"""
Réindexation complète de la FAQ depuis data/docs/.
Usage : python scripts/reindex_all.py
"""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from services.faq_service import faq_service
from services.pdf_indexer import indexer_pdf
# Root directory that is walked recursively for PDF files to index.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory — presumably the script is launched from the project
# root; confirm.
PDF_DIR = "./data/docs"
def reindexer_tout():
    """Rebuild the FAQ index from every PDF found under ``PDF_DIR``.

    Walks ``PDF_DIR`` recursively and collects every file whose name ends
    in ``.pdf`` (case-insensitive). If none is found, a message is printed
    and the function returns before the existing collection is reset.
    Otherwise ``faq_service.reset_collection()`` is called and each PDF is
    passed, in sorted path order, to ``indexer_pdf``; its return value is
    added to a running total of indexed sections.

    The run is best-effort: an exception raised while handling one PDF is
    printed and recorded, and processing continues with the next file. A
    summary (total sections, success ratio, failed file names) is printed
    at the end.

    Returns:
        None. Progress and results are reported on stdout only.
    """
    # Collect and order all PDF paths up front so the total is known
    # before the collection is reset.
    pdfs = sorted(
        os.path.join(root, name)
        for root, _, files in os.walk(PDF_DIR)
        for name in files
        if name.lower().endswith(".pdf")
    )
    if not pdfs:
        print(f"❌ Aucun PDF dans {PDF_DIR}/")
        return

    print("🗑️ Réinitialisation ChromaDB...")
    faq_service.reset_collection()
    print(f"📚 {len(pdfs)} PDF(s) à indexer...\n")

    total_sections = 0
    erreurs = []
    for pdf_path in pdfs:
        nom = os.path.basename(pdf_path)
        try:
            # The accumulation stays inside the try on purpose: an
            # unexpected return value from indexer_pdf is then reported as
            # a failure for this file instead of aborting the whole run.
            n = indexer_pdf(faq_service, pdf_path)
            total_sections += n
            # Separator added: the file name and the section count used to
            # be printed glued together (e.g. "guide.pdf12 sections").
            print(f"{nom} : {n} sections")
        except Exception as e:
            erreurs.append(nom)
            print(f"{nom} : {e}")

    print(f"\n{'='*40}")
    print(f"✅ Terminé — {total_sections} sections dans ChromaDB")
    print(f"📄 {len(pdfs) - len(erreurs)}/{len(pdfs)} PDFs indexés")
    if erreurs:
        print(f"❌ Échecs : {', '.join(erreurs)}")
# Run the full reindex only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    reindexer_tout()