Tech Stack Advisor - Code Viewer

← Back to File Tree

ingest_knowledge.py

Language: python | Path: scripts/ingest_knowledge.py | Lines: 127
"""Ingest knowledge base documents into Qdrant vector store."""
import json
import sys
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from backend.src.rag.vectorstore import VectorStore
from backend.src.core.logging import setup_logging, get_logger

# Initialise the project's logging (setup_logging comes from
# backend.src.core.logging) before the module logger is created.
# NOTE(review): presumably this installs handlers/formatters - confirm there.
setup_logging()
# Module-level logger named after this module; the functions below call it
# with an event name plus keyword fields.
logger = get_logger(__name__)


def load_documents_from_json(file_path: Path) -> list[dict]:
    """Load documents from a JSON file.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so documents containing non-ASCII text load the same way on
    every operating system.

    Args:
        file_path: Path to JSON file

    Returns:
        The parsed top-level JSON value, expected to be a list of document
        dictionaries

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    logger.info("loading_documents", file=str(file_path))
    # Explicit encoding: the default for open() depends on the locale
    # (e.g. cp1252 on Windows) and would corrupt or reject UTF-8 content.
    with open(file_path, encoding="utf-8") as f:
        documents = json.load(f)
    logger.info("documents_loaded", count=len(documents), file=file_path.name)
    return documents


def _ingest_files(
    vector_store: "VectorStore", json_files: list[Path]
) -> tuple[int, list[str]]:
    """Load each JSON file and add its documents to the vector store.

    A failure on one file is logged and printed, then processing continues
    with the next file.

    Args:
        vector_store: Store whose ``add_documents`` receives each file's
            documents
        json_files: JSON files to ingest, processed in the given order

    Returns:
        Tuple of (sum of the counts returned by ``add_documents``, names of
        the files that raised an error)
    """
    total_docs = 0
    failed_files: list[str] = []
    for json_file in json_files:
        try:
            documents = load_documents_from_json(json_file)
            count = vector_store.add_documents(documents)
            total_docs += count
            print(f"   ✅ {json_file.name}: {count} documents")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole run.
            failed_files.append(json_file.name)
            logger.error("ingestion_error", file=json_file.name, error=str(e))
            print(f"   ❌ {json_file.name}: Error - {e}")
    return total_docs, failed_files


def _run_search_smoke_test(vector_store: "VectorStore") -> None:
    """Run a few sample queries and print the top hit for each.

    Args:
        vector_store: Store to query; each hit is read as a mapping with
            ``text``, ``score`` and ``metadata`` keys
    """
    print("\n🔍 Testing Search...")
    test_queries = [
        "database for chat application",
        "kubernetes container orchestration",
        "GDPR compliance requirements",
    ]

    for query in test_queries:
        results = vector_store.search(query, limit=2)
        print(f"\n   Query: '{query}'")
        if not results:
            print("   → No results found")
            continue
        top_result = results[0]
        print(f"   → {top_result['text'][:100]}...")
        print(f"     Score: {top_result['score']:.3f}")
        print(f"     Category: {top_result['metadata'].get('category', 'N/A')}")


def ingest_all(use_local: bool = False) -> None:
    """Ingest all knowledge base documents.

    Looks for ``*.json`` files in the ``knowledge_base`` directory that sits
    next to this script's parent directory, adds their documents to the
    vector store, prints collection statistics and finally runs a few sample
    searches. Returns early (after logging and printing a message) when the
    directory is missing or contains no JSON files.

    Args:
        use_local: If True, use local in-memory Qdrant for testing
    """
    logger.info("starting_ingestion", use_local=use_local)

    # Initialize vector store
    vector_store = VectorStore(use_local=use_local)

    # Get knowledge base directory
    kb_dir = Path(__file__).parent.parent / "knowledge_base"

    if not kb_dir.exists():
        logger.error("knowledge_base_not_found", path=str(kb_dir))
        print(f"❌ Knowledge base directory not found: {kb_dir}")
        return

    # Find all JSON files; sorted because glob order is arbitrary and a
    # stable order keeps the output and ingestion sequence reproducible.
    json_files = sorted(kb_dir.glob("*.json"))

    if not json_files:
        logger.warning("no_json_files_found", path=str(kb_dir))
        print(f"⚠️  No JSON files found in {kb_dir}")
        return

    print(f"\n📚 Found {len(json_files)} knowledge base files:")
    for file in json_files:
        print(f"   - {file.name}")

    # Load and ingest documents
    total_docs, failed_files = _ingest_files(vector_store, json_files)

    # Get collection info
    info = vector_store.get_collection_info()

    if failed_files:
        # Do not announce plain success when some files could not be ingested.
        print(
            f"\n⚠️  Ingestion finished with {len(failed_files)} failed file(s): "
            f"{', '.join(failed_files)}"
        )
    else:
        print("\n✅ Ingestion Complete!")
    print("\n📊 Collection Stats:")
    print(f"   - Collection: {info['name']}")
    print(f"   - Documents: {info['vectors_count']}")
    print(f"   - Status: {info['status']}")

    logger.info(
        "ingestion_complete",
        total_documents=total_docs,
        collection=info['name'],
        failed_files=len(failed_files),
    )

    # Test search
    _run_search_smoke_test(vector_store)

    print("\n" + "=" * 70)
    print("\n🎉 Knowledge base is ready!")
    print("\n💡 The RAG system can now provide intelligent recommendations")
    print("   based on real technical documentation.\n")


if __name__ == "__main__":
    # argparse is only needed when this file is executed as a script.
    import argparse

    cli = argparse.ArgumentParser(
        description="Ingest knowledge base into vector store",
    )
    # --local is a plain on/off switch; it is False unless given.
    cli.add_argument("--local", action="store_true", help="Use local in-memory Qdrant (for testing)")
    use_local_store = cli.parse_args().local

    ingest_all(use_local=use_local_store)