← Back to File Tree
ingest_knowledge.py
Language: python |
Path: scripts/ingest_knowledge.py |
Lines: 127
"""Ingest knowledge base documents into Qdrant vector store."""
import json
import sys
from pathlib import Path
# Put the parent of this script's directory at the front of sys.path so the
# `backend` package imports below resolve when this file is run directly as a
# script. This must happen before those imports are executed.
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.src.rag.vectorstore import VectorStore
from backend.src.core.logging import setup_logging, get_logger
# Logging is set up at import time (presumably configuring the project's
# handlers/format -- confirm in backend.src.core.logging), then the
# module-level logger used by every function in this file is created.
setup_logging()
logger = get_logger(__name__)
def load_documents_from_json(file_path: Path) -> list[dict]:
    """Load knowledge base documents from a JSON file.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII content is read the same way on every OS.

    Args:
        file_path: Path to a JSON file whose top-level value is an array of
            document objects.

    Returns:
        List of document dictionaries, exactly as parsed from the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        ValueError: If the file is not valid UTF-8, or if the top-level JSON
            value is not an array.
    """
    logger.info("loading_documents", file=str(file_path))

    with open(file_path, encoding="utf-8") as f:
        documents = json.load(f)

    # Enforce the documented return type up front: without this check a
    # top-level JSON object would be passed on and len() below would count
    # its keys instead of documents.
    if not isinstance(documents, list):
        raise ValueError(
            f"{file_path.name}: expected a top-level JSON array of documents, "
            f"got {type(documents).__name__}"
        )

    logger.info("documents_loaded", count=len(documents), file=file_path.name)
    return documents
def ingest_all(use_local: bool = False) -> None:
    """Ingest all knowledge base documents into the vector store.

    Collects every ``*.json`` file in the ``knowledge_base`` directory located
    in the parent of this script's directory, loads each one and adds its
    documents to the vector store. A failure in one file is logged and
    reported, and the remaining files are still processed. Afterwards the
    collection statistics are printed and a few sample queries are run as a
    quick check of the search path.

    Progress is reported on stdout; nothing is returned. If the knowledge base
    directory is missing or contains no JSON files, a message is printed and
    the function returns early.

    Args:
        use_local: If True, use local in-memory Qdrant for testing. The flag
            is forwarded unchanged to ``VectorStore``.
    """
    logger.info("starting_ingestion", use_local=use_local)

    # Initialize vector store
    vector_store = VectorStore(use_local=use_local)

    # Get knowledge base directory
    kb_dir = Path(__file__).parent.parent / "knowledge_base"
    if not kb_dir.exists():
        logger.error("knowledge_base_not_found", path=str(kb_dir))
        print(f"❌ Knowledge base directory not found: {kb_dir}")
        return

    # Find all JSON files. Sorted so that files are listed and ingested in a
    # stable order; glob() itself makes no ordering promise.
    json_files = sorted(kb_dir.glob("*.json"))
    if not json_files:
        logger.warning("no_json_files_found", path=str(kb_dir))
        print(f"⚠️ No JSON files found in {kb_dir}")
        return

    print(f"\n📚 Found {len(json_files)} knowledge base files:")
    for file in json_files:
        print(f" - {file.name}")

    # Load and ingest documents
    total_docs = 0
    failed_files: list[str] = []
    for json_file in json_files:
        try:
            documents = load_documents_from_json(json_file)
            count = vector_store.add_documents(documents)
        except Exception as e:
            # Deliberately broad: one unreadable or rejected file must not
            # abort the rest of the batch. The failure is logged, shown to
            # the user and remembered for the summary below.
            logger.error("ingestion_error", file=json_file.name, error=str(e))
            print(f" ❌ {json_file.name}: Error - {e}")
            failed_files.append(json_file.name)
        else:
            total_docs += count
            print(f" ✅ {json_file.name}: {count} documents")

    # Get collection info
    info = vector_store.get_collection_info()
    print("\n✅ Ingestion Complete!")
    if failed_files:
        # Make partial failures visible in the summary instead of leaving
        # them buried in the per-file output above.
        print(
            f"⚠️ {len(failed_files)} file(s) failed to ingest: "
            f"{', '.join(failed_files)}"
        )
    print("\n📊 Collection Stats:")
    print(f" - Collection: {info['name']}")
    print(f" - Documents: {info['vectors_count']}")
    print(f" - Status: {info['status']}")
    logger.info(
        "ingestion_complete",
        total_documents=total_docs,
        failed_files=len(failed_files),
        collection=info['name'],
    )

    # Test search
    print("\n🔍 Testing Search...")
    test_queries = [
        "database for chat application",
        "kubernetes container orchestration",
        "GDPR compliance requirements",
    ]
    for query in test_queries:
        results = vector_store.search(query, limit=2)
        print(f"\n Query: '{query}'")
        if results:
            # Only the best hit is shown, with its text cut to 100 characters.
            top_result = results[0]
            print(f" → {top_result['text'][:100]}...")
            print(f" Score: {top_result['score']:.3f}")
            print(f" Category: {top_result['metadata'].get('category', 'N/A')}")
        else:
            print(" ❌ No results found")

    print("\n" + "=" * 70)
    print("\n🎉 Knowledge base is ready!")
    print("\n💡 The RAG system can now provide intelligent recommendations")
    print(" based on real technical documentation.\n")
if __name__ == "__main__":
    # Command-line entry point: argparse is imported here because it is only
    # needed when the file is executed as a script.
    import argparse

    cli = argparse.ArgumentParser(
        description="Ingest knowledge base into vector store",
    )
    # Boolean switch; absent means False.
    cli.add_argument("--local", action="store_true", help="Use local in-memory Qdrant (for testing)")
    cli_options = cli.parse_args()

    ingest_all(use_local=cli_options.local)