本部署方案包含以下组件:命令行工具安装(Azure CLI、kubectl、Helm)、AKS 集群创建、QDrant 向量数据库部署、LlamaIndex RAG 应用部署,以及配套的自动扩缩容、监控与运维脚本。
# Install the Azure CLI (Debian/Ubuntu one-shot installer)
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Install kubectl (latest stable release for linux/amd64)
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

# Install Helm from the official apt repository
curl https://baltocdn.com/helm/signing.asc | gpg --dearmor | sudo tee /usr/share/keyrings/helm.gpg > /dev/null
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list
sudo apt-get update
sudo apt-get install helm

# Log in to Azure
az login
# Select the subscription (replace with your subscription ID)
az account set --subscription "your-subscription-id"
# Create the resource group
az group create --name rg-llamaindex-prod --location eastus

# Create the AKS cluster
# Create a 3-node AKS cluster sized for vector workloads,
# with Azure Monitor and Azure CNI networking enabled.
az aks create \
  --resource-group rg-llamaindex-prod \
  --name aks-llamaindex-cluster \
  --node-count 3 \
  --node-vm-size Standard_D4s_v3 \
  --enable-addons monitoring \
  --generate-ssh-keys \
  --network-plugin azure \
  --service-cidr 10.0.0.0/16 \
  --dns-service-ip 10.0.0.10

# Merge the cluster credentials into the local kubeconfig
az aks get-credentials --resource-group rg-llamaindex-prod --name aks-llamaindex-cluster

重要参数说明:
- --node-vm-size Standard_D4s_v3:选择适合向量计算的VM规格(4vCPU, 16GB RAM)
- --enable-addons monitoring:启用 Azure Monitor 进行集群监控
- --network-plugin azure:使用 Azure CNI 网络插件,提供更好的网络性能

# namespace.yaml
# Dedicated namespace for all LlamaIndex/QDrant resources.
apiVersion: v1
kind: Namespace
metadata:
  name: llamaindex-system
  labels:
    name: llamaindex-system

# Apply with: kubectl apply -f namespace.yaml
# storage-class.yaml
# Premium SSD storage class for the QDrant data volume.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: qdrant-storage
provisioner: disk.csi.azure.com
parameters:
  storageaccounttype: Premium_LRS
  kind: Managed
reclaimPolicy: Retain          # keep the disk (and data) when the PVC is deleted
allowVolumeExpansion: true     # allow PVC size to be grown later
volumeBindingMode: WaitForFirstConsumer   # bind in the zone where the pod lands

# Apply with: kubectl apply -f storage-class.yaml

重要参数说明:
- Premium_LRS:高性能SSD存储,适合数据库工作负载
- reclaimPolicy: Retain:删除PVC时保留数据
- allowVolumeExpansion: true:允许动态扩展存储容量

# qdrant-config.yaml
# QDrant server configuration, mounted into the pod at /qdrant/config.
apiVersion: v1
kind: ConfigMap
metadata:
  name: qdrant-config
  namespace: llamaindex-system
data:
  config.yaml: |
    log_level: INFO
    storage:
      # Keep payloads on disk to reduce memory footprint.
      on_disk_payload: true
      performance:
        max_search_threads: 4
        max_optimization_threads: 2
    service:
      http_port: 6333
      grpc_port: 6334
      max_request_size_mb: 32
    cluster:
      # Single-node deployment; enable for distributed mode.
      enabled: false
    telemetry_disabled: true

# qdrant-deployment.yaml
# Single-replica QDrant deployment backed by a Premium SSD PVC.
# NOTE(review): with a ReadWriteOnce PVC the replica count must stay 1;
# for HA consider a StatefulSet + QDrant cluster mode instead.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qdrant
  namespace: llamaindex-system
  labels:
    app: qdrant
spec:
  replicas: 1
  selector:
    matchLabels:
      app: qdrant
  template:
    metadata:
      labels:
        app: qdrant
    spec:
      containers:
        - name: qdrant
          image: qdrant/qdrant:v1.7.3
          ports:
            - containerPort: 6333
              name: http
            - containerPort: 6334
              name: grpc
          env:
            - name: QDRANT__SERVICE__HTTP_PORT
              value: "6333"
            - name: QDRANT__SERVICE__GRPC_PORT
              value: "6334"
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "4Gi"
              cpu: "2000m"
          volumeMounts:
            - name: qdrant-storage
              mountPath: /qdrant/storage
            - name: config-volume
              mountPath: /qdrant/config
          livenessProbe:
            httpGet:
              path: /
              port: 6333
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /
              port: 6333
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: qdrant-storage
          persistentVolumeClaim:
            claimName: qdrant-pvc
        - name: config-volume
          configMap:
            name: qdrant-config

# qdrant-pvc.yaml
# Persistent volume claim for QDrant data (uses the qdrant-storage class).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: qdrant-pvc
  namespace: llamaindex-system
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: qdrant-storage
  resources:
    requests:
      storage: 50Gi

# qdrant-service.yaml
# Cluster-internal service exposing QDrant's HTTP and gRPC ports.
apiVersion: v1
kind: Service
metadata:
  name: qdrant-service
  namespace: llamaindex-system
  labels:
    app: qdrant
spec:
  selector:
    app: qdrant
  ports:
    - name: http
      port: 6333
      targetPort: 6333
      protocol: TCP
    - name: grpc
      port: 6334
      targetPort: 6334
      protocol: TCP
  type: ClusterIP

重要参数说明:
- replicas: 1:QDrant单实例部署,生产环境可考虑集群模式
- resources:内存和CPU限制,根据数据量和查询频率调整
- storage: 50Gi:存储大小,根据向量数据量预估

# llamaindex-app-config.yaml
# FastAPI RAG application and its Python dependencies, shipped as a ConfigMap.
apiVersion: v1
kind: ConfigMap
metadata:
  name: llamaindex-app-config
  namespace: llamaindex-system
data:
  app.py: |
    import os
    import logging
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    from llama_index.core import VectorStoreIndex, Document
    from llama_index.vector_stores.qdrant import QdrantVectorStore
    from llama_index.core import StorageContext
    from llama_index.embeddings.openai import OpenAIEmbedding
    from llama_index.llms.openai import OpenAI
    from qdrant_client import QdrantClient
    import uvicorn

    # Logging setup
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    app = FastAPI(title="LlamaIndex RAG API", version="1.0.0")

    # Configuration (injected via the Deployment's env vars)
    QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant-service")
    QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    COLLECTION_NAME = os.getenv("COLLECTION_NAME", "documents")

    # QDrant client and LlamaIndex storage context
    client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
    vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # LLM and embedding model
    llm = OpenAI(api_key=OPENAI_API_KEY)
    embed_model = OpenAIEmbedding(api_key=OPENAI_API_KEY)

    class QueryRequest(BaseModel):
        query: str
        top_k: int = 5

    class DocumentRequest(BaseModel):
        text: str
        metadata: dict = {}

    @app.get("/health")
    async def health_check():
        return {"status": "healthy", "qdrant_host": QDRANT_HOST}

    @app.post("/index/document")
    async def index_document(request: DocumentRequest):
        # Embed and store a single document in the QDrant collection.
        try:
            document = Document(text=request.text, metadata=request.metadata)
            VectorStoreIndex.from_documents(
                [document],
                storage_context=storage_context,
                embed_model=embed_model,
            )
            return {"message": "Document indexed successfully"}
        except Exception as e:
            logger.error(f"Error indexing document: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    @app.post("/query")
    async def query_documents(request: QueryRequest):
        # Retrieve top_k similar chunks and synthesize an answer with the LLM.
        try:
            index = VectorStoreIndex.from_vector_store(
                vector_store,
                embed_model=embed_model,
            )
            query_engine = index.as_query_engine(
                llm=llm,
                similarity_top_k=request.top_k,
            )
            response = query_engine.query(request.query)
            return {
                "query": request.query,
                "response": str(response),
                "source_nodes": [
                    {
                        "text": node.text,
                        "score": node.score,
                        "metadata": node.metadata,
                    }
                    for node in response.source_nodes
                ],
            }
        except Exception as e:
            logger.error(f"Error querying documents: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    if __name__ == "__main__":
        uvicorn.run(app, host="0.0.0.0", port=8000)
  requirements.txt: |
    fastapi==0.104.1
    uvicorn==0.24.0
    # NOTE(review): app.py imports llama_index.core, which only exists in
    # llama-index >= 0.10; the originally pinned 0.9.12 would fail at import.
    # Verify the exact compatible version set before building the image.
    llama-index==0.10.20
    llama-index-vector-stores-qdrant==0.2.8
    llama-index-embeddings-openai==0.1.7
    llama-index-llms-openai==0.1.13
    qdrant-client==1.7.0
    pydantic==2.5.0
    python-multipart==0.0.6

# llamaindex-dockerfile.yaml
# Dockerfile for the application image, kept alongside the manifests.
apiVersion: v1
kind: ConfigMap
metadata:
  name: llamaindex-dockerfile
  namespace: llamaindex-system
data:
  Dockerfile: |
    FROM python:3.11-slim
    WORKDIR /app

    # System build dependencies
    RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        && rm -rf /var/lib/apt/lists/*

    # Install Python dependencies first to leverage Docker layer caching
    COPY requirements.txt .
    RUN pip install --no-cache-dir -r requirements.txt

    # Application code
    COPY app.py .

    EXPOSE 8000
    CMD ["python", "app.py"]

# llamaindex-secret.yaml
# Secret holding the OpenAI API key consumed by the app Deployment.
apiVersion: v1
kind: Secret
metadata:
  name: llamaindex-secret
  namespace: llamaindex-system
type: Opaque
data:
  # Base64-encoded OpenAI API key (placeholder — fill in before applying)
  openai-api-key: <your-base64-encoded-openai-api-key>

# Create the Secret
# Base64-encode the key (only needed when editing the manifest by hand)
echo -n "your-openai-api-key" | base64
# Or create the Secret imperatively.
# NOTE(review): passing the key on the command line leaves it in shell
# history and visible in `ps`; prefer --from-file or --from-env-file.
kubectl create secret generic llamaindex-secret \
  --from-literal=openai-api-key=your-openai-api-key \
  -n llamaindex-system

# llamaindex-deployment.yaml
# LlamaIndex API deployment: 2 replicas, health-checked on /health.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llamaindex-app
  namespace: llamaindex-system
  labels:
    app: llamaindex-app
spec:
  replicas: 2
  selector:
    matchLabels:
      app: llamaindex-app
  template:
    metadata:
      labels:
        app: llamaindex-app
    spec:
      containers:
        - name: llamaindex-app
          # Replace with your registry image; prefer an immutable tag over :latest.
          image: your-registry/llamaindex-app:latest
          ports:
            - containerPort: 8000
          env:
            - name: QDRANT_HOST
              value: "qdrant-service"
            - name: QDRANT_PORT
              value: "6333"
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: llamaindex-secret
                  key: openai-api-key
            - name: COLLECTION_NAME
              value: "documents"
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 5

# llamaindex-service.yaml
# Public LoadBalancer service: external port 80 -> container port 8000.
apiVersion: v1
kind: Service
metadata:
  name: llamaindex-service
  namespace: llamaindex-system
  labels:
    app: llamaindex-app
spec:
  selector:
    app: llamaindex-app
  ports:
    - name: http
      port: 80
      targetPort: 8000
      protocol: TCP
  type: LoadBalancer

# ingress.yaml
# TLS ingress via Azure Application Gateway Ingress Controller (AGIC).
# NOTE(review): the kubernetes.io/ingress.class annotation is deprecated in
# favor of spec.ingressClassName — confirm which form your AGIC version expects.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: llamaindex-ingress
  namespace: llamaindex-system
  annotations:
    kubernetes.io/ingress.class: azure/application-gateway
    appgw.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  tls:
    - hosts:
        - llamaindex.yourdomain.com
      secretName: llamaindex-tls
  rules:
    - host: llamaindex.yourdomain.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: llamaindex-service
                port:
                  number: 80

# hpa.yaml
# Autoscale the app between 2 and 10 replicas on CPU (70%) and memory (80%).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llamaindex-hpa
  namespace: llamaindex-system
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llamaindex-app
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

重要参数说明:
- minReplicas: 2:最小副本数,保证高可用
- maxReplicas: 10:最大副本数,根据负载需求设置
- averageUtilization: 70:CPU使用率超过70%时触发扩容

#!/bin/bash
# deploy.sh — apply all manifests in dependency order and wait for rollout.
set -euo pipefail

echo "=== LlamaIndex + QDrant AKS 部署脚本 ==="

# Fail fast if required CLIs are missing
command -v kubectl >/dev/null 2>&1 || { echo "kubectl 未安装" >&2; exit 1; }
command -v az >/dev/null 2>&1 || { echo "Azure CLI 未安装" >&2; exit 1; }

# Constants (resource group / cluster names kept for reference)
readonly RESOURCE_GROUP="rg-llamaindex-prod"
readonly AKS_CLUSTER="aks-llamaindex-cluster"
readonly NAMESPACE="llamaindex-system"

echo "1. 创建命名空间..."
kubectl apply -f namespace.yaml

echo "2. 创建存储类..."
kubectl apply -f storage-class.yaml

echo "3. 部署 QDrant..."
kubectl apply -f qdrant-config.yaml
kubectl apply -f qdrant-pvc.yaml
kubectl apply -f qdrant-deployment.yaml
kubectl apply -f qdrant-service.yaml

echo "4. 等待 QDrant 启动..."
kubectl wait --for=condition=available --timeout=300s deployment/qdrant -n "$NAMESPACE"

echo "5. 创建 LlamaIndex 配置..."
kubectl apply -f llamaindex-app-config.yaml
kubectl apply -f llamaindex-secret.yaml

echo "6. 部署 LlamaIndex 应用..."
kubectl apply -f llamaindex-deployment.yaml
kubectl apply -f llamaindex-service.yaml

echo "7. 配置自动扩缩容..."
kubectl apply -f hpa.yaml

echo "8. 等待服务启动..."
kubectl wait --for=condition=available --timeout=300s deployment/llamaindex-app -n "$NAMESPACE"

echo "=== 部署完成 ==="
echo "获取服务访问地址:"
kubectl get service llamaindex-service -n "$NAMESPACE"

#!/bin/bash
# verify.sh — post-deployment smoke checks (pods, services, PVC, endpoints).
NAMESPACE="llamaindex-system"

echo "=== 验证部署状态 ==="
echo "1. 检查 Pod 状态:"
kubectl get pods -n "$NAMESPACE"

echo "2. 检查服务状态:"
kubectl get services -n "$NAMESPACE"

echo "3. 检查 PVC 状态:"
kubectl get pvc -n "$NAMESPACE"

echo "4. 测试 QDrant 连接:"
kubectl exec -n "$NAMESPACE" deployment/qdrant -- curl -s http://localhost:6333/ || echo "QDrant 连接失败"

echo "5. 测试 LlamaIndex 健康检查:"
LLAMAINDEX_IP=$(kubectl get service llamaindex-service -n "$NAMESPACE" -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
if [ -n "$LLAMAINDEX_IP" ]; then
  curl -s "http://$LLAMAINDEX_IP/health" || echo "LlamaIndex 健康检查失败"
else
  echo "LlamaIndex 服务 IP 未分配"
fi
echo "=== 验证完成 ==="

# View QDrant logs
kubectl logs -f deployment/qdrant -n llamaindex-system

# View LlamaIndex logs
kubectl logs -f deployment/llamaindex-app -n llamaindex-system

# Tail logs from all pods matching the label, every container
kubectl logs -f -l app=qdrant -n llamaindex-system --all-containers=true

# monitoring.yaml
# Headless-style metrics service annotated for Prometheus scraping.
apiVersion: v1
kind: Service
metadata:
  name: qdrant-metrics
  namespace: llamaindex-system
  labels:
    app: qdrant
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "6333"
    prometheus.io/path: "/metrics"
spec:
  selector:
    app: qdrant
  ports:
    - name: metrics
      port: 6333
      targetPort: 6333

QDrant 启动失败:
# Inspect storage binding / permission problems
kubectl describe pvc qdrant-pvc -n llamaindex-system
# Inspect the rendered configuration
kubectl describe configmap qdrant-config -n llamaindex-system

LlamaIndex 连接失败:
# Verify in-cluster DNS resolution of the QDrant service
kubectl exec -n llamaindex-system deployment/llamaindex-app -- nslookup qdrant-service
# Verify the environment variables the app sees
kubectl exec -n llamaindex-system deployment/llamaindex-app -- env | grep QDRANT

QDrant 性能优化:
- 调大 max_search_threads 可提高搜索性能
- 启用 on_disk_payload 可减少内存使用

LlamaIndex 性能优化:
- 合理设置 similarity_top_k 参数,在召回质量与响应延迟之间权衡

# 备份 QDrant 数据
kubectl exec -n llamaindex-system deployment/qdrant -- tar czf /tmp/qdrant-backup.tar.gz /qdrant/storage
# NOTE: replace "qdrant-pod" with the actual pod name (kubectl get pods -n llamaindex-system)
kubectl cp llamaindex-system/qdrant-pod:/tmp/qdrant-backup.tar.gz ./qdrant-backup.tar.gz

# Restart services
kubectl rollout restart deployment/qdrant -n llamaindex-system
kubectl rollout restart deployment/llamaindex-app -n llamaindex-system

# Expand storage (possible because the StorageClass sets allowVolumeExpansion: true)
kubectl patch pvc qdrant-pvc -n llamaindex-system -p '{"spec":{"resources":{"requests":{"storage":"100Gi"}}}}'

本部署指南提供了在 Azure AKS 上部署 LlamaIndex + QDrant 的完整解决方案,包括:
部署完成后,您将拥有一个生产就绪的 RAG 系统,支持文档索引和智能问答功能。