RAG / Vector Search on AEM Content
Building a RAG Chatbot on AEM Content using LangChain + OpenAI
Introduction
Imagine a chatbot that can answer questions like "What is our return policy?" or "How do I configure the Dispatcher?" — drawing answers directly from your AEM-managed content. This is exactly what Retrieval-Augmented Generation (RAG) enables.
RAG combines a vector database (to store and search your content semantically) with an LLM (to generate natural language answers). The result is an AI assistant grounded in your actual AEM content, not hallucinated facts.
In this post, we'll build a complete RAG pipeline that ingests AEM content fragments, stores them in a vector database, and serves answers via a chatbot API.
Architecture
AEM Content Fragments / Pages
↓
AEM Content API (JSON exporter)
↓
Python Ingestion Pipeline
(chunking + embedding)
↓
Vector Database (Pinecone / ChromaDB)
↓
Query → Semantic Search → Top-K chunks
↓
OpenAI LLM (GPT-4o) with retrieved context
↓
Answer returned to user
Step 1: Export AEM Content via JSON API
AEM's Sling JSON exporter makes content available as JSON out of the box.
# Fetch a content fragment as JSON
curl -u admin:admin \
"http://localhost:4502/content/dam/mysite/content-fragments/article-1/_jcr_content/data/master.infinity.json"
For pages:
curl -u admin:admin \
"http://localhost:4502/content/mysite/en/products/overview.model.json"
AEM Servlet — Bulk Content Exporter
Build a custom servlet that exports every content fragment directly under a given folder:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.Servlet;

import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.SlingHttpServletResponse;
import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.api.resource.ValueMap;
import org.apache.sling.api.servlets.SlingSafeMethodsServlet;
import org.apache.sling.servlets.annotations.SlingServletPaths;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;

import com.google.gson.Gson;

@Component(service = Servlet.class)
@SlingServletPaths("/bin/mysite/content-export")
public class ContentExportServlet extends SlingSafeMethodsServlet {

    @Reference
    private ResourceResolverFactory resolverFactory;

    @Override
    protected void doGet(SlingHttpServletRequest req,
                         SlingHttpServletResponse resp) throws IOException {
        String rootPath = req.getParameter("path");
        if (rootPath == null) {
            rootPath = "/content/dam/mysite/content-fragments";
        }

        // Repository access via a service user mapped to the "exportService" subservice
        Map<String, Object> param = new HashMap<>();
        param.put(ResourceResolverFactory.SUBSERVICE, "exportService");

        List<Map<String, String>> contentList = new ArrayList<>();
        try (ResourceResolver resolver =
                 resolverFactory.getServiceResourceResolver(param)) {
            Resource root = resolver.getResource(rootPath);
            if (root != null) {
                for (Resource child : root.getChildren()) {
                    // Content fragment element values live on jcr:content/data/master
                    Resource dataNode = child.getChild("jcr:content/data/master");
                    if (dataNode != null) {
                        ValueMap vm = dataNode.getValueMap();
                        Map<String, String> item = new HashMap<>();
                        item.put("path", child.getPath());
                        item.put("title", vm.get("jcr:title", ""));
                        item.put("body", vm.get("bodyText", ""));
                        contentList.add(item);
                    }
                }
            }
        } catch (LoginException e) {
            resp.sendError(500);
            return;
        }

        resp.setContentType("application/json");
        resp.setCharacterEncoding("UTF-8");
        resp.getWriter().write(new Gson().toJson(contentList));
    }
}
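The servlet responds with a flat JSON array, which the ingestion script in Step 2 consumes directly. Illustrative output:
[
  {
    "path": "/content/dam/mysite/content-fragments/article-1",
    "title": "Dispatcher Caching Guide",
    "body": "The Dispatcher cache TTL is controlled by..."
  }
]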
Step 2: Ingest AEM Content into Vector Database
# ingest.py
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

AEM_HOST = "http://localhost:4502"
EXPORT_URL = f"{AEM_HOST}/bin/mysite/content-export?path=/content/dam/mysite/content-fragments"

def fetch_aem_content():
    """Fetch all content from the AEM export servlet."""
    response = requests.get(
        EXPORT_URL,
        auth=("admin", "admin"),  # Use a service token / dedicated user in production
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

def chunk_content(content_list):
    """Split content into overlapping chunks sized for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", " "],
    )
    docs = []
    for item in content_list:
        if not item.get("body"):
            continue
        chunks = splitter.split_text(item["body"])
        for i, chunk in enumerate(chunks):
            docs.append({
                "text": chunk,
                "metadata": {
                    "path": item["path"],
                    "title": item.get("title", ""),
                    "chunk_index": i,
                },
            })
    return docs

def build_vector_store(docs):
    """Create (or append to) the persisted ChromaDB vector store."""
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"  # reads OPENAI_API_KEY from the environment
    )
    texts = [d["text"] for d in docs]
    metadatas = [d["metadata"] for d in docs]
    vectorstore = Chroma.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        persist_directory="./aem_vectorstore",
    )
    vectorstore.persist()
    print(f"Indexed {len(docs)} chunks into vector store.")
    return vectorstore

# Run ingestion (guarded so other scripts can import the helpers above)
if __name__ == "__main__":
    content = fetch_aem_content()
    docs = chunk_content(content)
    build_vector_store(docs)
Step 3: Build the RAG Query Pipeline
# rag_query.py
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

def load_rag_chain():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small"  # reads OPENAI_API_KEY from the environment
    )
    vectorstore = Chroma(
        persist_directory="./aem_vectorstore",
        embedding_function=embeddings,
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 4},  # Retrieve top 4 chunks
    )
    llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0.2,  # Low temp for factual answers
    )

    # Custom prompt that grounds answers in AEM content
    prompt_template = """You are a helpful assistant for our website.
Answer the question based ONLY on the following context from our content.
If the answer is not in the context, say "I don't have that information."

Context:
{context}

Question: {question}

Answer:"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    return chain

# Build the chain once and reuse it across calls
_chain = None

def answer_question(question: str) -> dict:
    global _chain
    if _chain is None:
        _chain = load_rag_chain()
    result = _chain.invoke({"query": question})
    sources = list({
        doc.metadata["path"]
        for doc in result["source_documents"]
    })
    return {
        "answer": result["result"],
        "sources": sources,
    }

# Test (guarded so app.py can import answer_question without triggering a call)
if __name__ == "__main__":
    result = answer_question("What are the benefits of AEM as a Cloud Service?")
    print("Answer:", result["answer"])
    print("Sources:", result["sources"])
Step 4: Serve as a REST API (Flask)
# app.py
from flask import Flask, request, jsonify
from rag_query import answer_question

app = Flask(__name__)

@app.route("/api/ask", methods=["POST"])
def ask():
    data = request.get_json(silent=True) or {}
    question = data.get("question", "").strip()
    if not question:
        return jsonify({"error": "Question is required"}), 400
    if len(question) > 500:
        return jsonify({"error": "Question too long"}), 400
    try:
        result = answer_question(question)
    except Exception:
        # Log the exception in production
        return jsonify({"error": "Failed to generate an answer"}), 500
    return jsonify(result)

@app.route("/health", methods=["GET"])
def health():
    return jsonify({"status": "ok"})

if __name__ == "__main__":
    app.run(port=5000)
Test the API:
curl -X POST http://localhost:5000/api/ask \
-H "Content-Type: application/json" \
-d '{"question": "How do I configure the AEM Dispatcher cache TTL?"}'
Response:
{
"answer": "To configure the cache TTL in AEM Dispatcher, you set the /timeout property in your farm configuration...",
"sources": [
"/content/dam/mysite/content-fragments/dispatcher-guide",
"/content/dam/mysite/content-fragments/caching-strategy"
]
}
Step 5: Embed the Chatbot in AEM (HTL Component)
<!-- apps/your-project/components/ai-chatbot/ai-chatbot.html -->
<div class="ai-chatbot">
  <div class="chatbot-header">
    <span>✨ Ask our AI Assistant</span>
  </div>
  <div class="chatbot-messages" id="chatMessages"></div>
  <div class="chatbot-input">
    <input type="text" id="chatInput"
           placeholder="Ask a question about our content..." />
    <button onclick="sendQuestion()">Ask</button>
  </div>
</div>

<script>
  async function sendQuestion() {
    const input = document.getElementById('chatInput');
    const question = input.value.trim();
    if (!question) return;

    appendMessage('user', question);
    input.value = '';

    try {
      const res = await fetch('/api/ask', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ question })
      });
      if (!res.ok) throw new Error('Request failed: ' + res.status);

      const data = await res.json();
      appendMessage('bot', data.answer);
      if (data.sources?.length) {
        appendMessage('sources', 'Sources: ' + data.sources.join(', '));
      }
    } catch (err) {
      appendMessage('bot', 'Sorry, something went wrong. Please try again.');
    }
  }

  function appendMessage(type, text) {
    const msgs = document.getElementById('chatMessages');
    const div = document.createElement('div');
    div.className = `message message-${type}`;
    div.textContent = text; // textContent avoids HTML injection
    msgs.appendChild(div);
    msgs.scrollTop = msgs.scrollHeight;
  }
</script>
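One wiring detail: the component posts to /api/ask as a relative URL, so the browser sends the request to the AEM origin, not to the Flask service on port 5000. In practice you either add a Dispatcher or reverse-proxy rule that forwards /api/ask to the Flask backend, or serve the API from its own origin and enable CORS. A minimal CORS sketch, assuming the flask-cors package and a hypothetical production origin:
# app.py (addition): only needed when the chatbot page and the API
# are served from different origins; assumes flask-cors is installed.
from flask_cors import CORS

CORS(app, resources={r"/api/*": {"origins": "https://www.mysite.com"}})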
Key Takeaways
- Always chunk content before embedding — large documents exceed token limits and reduce retrieval quality.
- Set temperature=0.2 for RAG — you want factual, consistent answers, not creative ones.
- Include source paths in the response — they build user trust and help editors see which content is being used.
- Re-run ingestion after content publishes to keep the vector store current; hook it into your AEM replication event listener (see the re-indexing sketch below).
- Use ChromaDB for local/dev and Pinecone or pgvector for production at scale (a backend swap sketch follows).
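For the re-ingestion takeaway, here is a hypothetical helper your publish hook could call. It assumes the export servlet from Step 1, the helpers in ingest.py, and LangChain's default "langchain" Chroma collection name:
# reindex.py -- refresh vectors for a single fragment after (re)publish.
import chromadb
from ingest import fetch_aem_content, chunk_content, build_vector_store

def reindex_path(path: str):
    # Drop stale chunks for this fragment from the persisted store.
    # "langchain" is the default collection name used by LangChain's Chroma wrapper.
    client = chromadb.PersistentClient(path="./aem_vectorstore")
    client.get_collection("langchain").delete(where={"path": path})

    # Re-fetch, re-chunk, and re-embed just this fragment's content.
    docs = [d for d in chunk_content(fetch_aem_content())
            if d["metadata"]["path"] == path]
    if docs:
        build_vector_store(docs)  # Chroma.from_texts appends to the existing store
And swapping the backend for production scale is mostly a one-class change. A sketch assuming the langchain-pinecone package, an existing index named "aem-content", and PINECONE_API_KEY set in the environment; texts, embeddings, and metadatas are the same values built in ingest.py:
# Hypothetical Pinecone swap inside build_vector_store()
from langchain_pinecone import PineconeVectorStore

vectorstore = PineconeVectorStore.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    index_name="aem-content",  # hypothetical index name
)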
What's Next?
Up next, the final post in this series: AEM Security in the Age of AI — new attack vectors introduced by AI and how to protect your AEM platform.
Published on aemrules.com | Tags: AEM, RAG, LangChain, OpenAI, Vector Database, Chatbot, Content Fragments, ChromaDB