# etm-terrain/scripts/reindex_all.py

"""
Réindexation complète de la FAQ depuis data/docs/.
Usage : python scripts/reindex_all.py
"""
import os
import sys

# Prepend the parent of this script's directory to the module search path,
# presumably so the `services` imports below resolve when the script is run
# directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from services.faq_service import faq_service
from services.pdf_indexer import indexer_pdf

# Root directory walked recursively for PDF files. The leading "./" makes it
# relative to the current working directory, not to this script's location.
PDF_DIR = "./data/docs"


def reindexer_tout():
    """Reset the FAQ collection and re-index every PDF found under PDF_DIR.

    PDF_DIR is walked recursively and every file whose name ends in ".pdf"
    (case-insensitive) is selected. If nothing is found, a message is
    printed and the function returns before the collection is reset.

    Otherwise ``faq_service.reset_collection()`` is called and each PDF is
    handed to ``indexer_pdf`` in sorted path order. The value it returns is
    summed and reported as a section count. An exception raised for one
    file is printed and recorded by base name; the remaining files are
    still processed. A summary is printed at the end.

    Returns:
        None. All results are reported on standard output.
    """
    pdf_paths = sorted(
        os.path.join(folder, name)
        for folder, _, names in os.walk(PDF_DIR)
        for name in names
        if name.lower().endswith(".pdf")
    )
    total_files = len(pdf_paths)

    # Guard clause: leave the existing collection untouched when there is
    # nothing to index.
    if total_files == 0:
        print(f"❌ Aucun PDF dans {PDF_DIR}/")
        return

    print("🗑️  Réinitialisation ChromaDB...")
    faq_service.reset_collection()

    print(f"📚 {total_files} PDF(s) à indexer...\n")
    section_count = 0
    failed = []

    for path in pdf_paths:
        label = os.path.basename(path)
        try:
            indexed = indexer_pdf(faq_service, path)
            section_count += indexed
            print(f"  ✅ {label} — {indexed} sections")
        except Exception as exc:
            # Best-effort: one bad file must not abort the whole run.
            failed.append(label)
            print(f"  ❌ {label} : {exc}")

    succeeded = total_files - len(failed)
    print("\n" + "=" * 40)
    print(f"✅ Terminé — {section_count} sections dans ChromaDB")
    print(f"📄 {succeeded}/{total_files} PDFs indexés")
    if failed:
        print(f"❌ Échecs : {', '.join(failed)}")


# Run the full re-indexing only when executed as a script, not on import.
if __name__ == "__main__":
    reindexer_tout()