RAG-Anything多模态混合检索实战教程

本教程将带你完整搭建一套RAG-Anything多模态检索工作流,深入探索如何在文本、表格、数学公式和图片等多种模态数据下实现高效检索。我们会从Colab环境配置开始,逐步完成依赖安装、安全录入OpenAI API密钥,接着创建合成多模态测试报告、生成图表与PDF文档,将内容转换为RAG-Anything支持的格式后导入检索系统。后续我们会配置基于OpenAI的对话、视觉和嵌入模型工具,初始化RAG-Anything实例,并测试朴素、本地、全局和混合四种不同的检索模式。
一、安装RAG-Anything依赖环境
首先我们需要完成整个Colab开发环境的配置,包括安装必要的Python库、修复Pillow依赖版本问题,同时导入绘图、PDF生成、OpenAI接口以及RAG-Anything相关的工具模块。我们还封装了可复用的Shell执行工具,让整个安装流程清晰易懂,方便后续重复运行。
import os
import re
import sys
import json
import time
import shutil
import hashlib
import asyncio
import inspect
import getpass
import subprocess
import importlib
import importlib.metadata
from pathlib import Path
from typing import List, Dict, Any
def run_shell(cmd, check=True):
    """Run ``cmd`` through the system shell and return its exit status.

    The command line is echoed (prefixed with ``$``) before it runs. Output is
    not captured, so it goes wherever the parent process's streams go.

    Args:
        cmd: Complete command line, handed to the shell verbatim.
        check: When true, a non-zero exit status raises instead of returning.

    Returns:
        The exit status of the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    # shell=True is deliberate: callers pass whole command lines as one string.
    completed = subprocess.run(cmd, shell=True, text=True)
    if check and completed.returncode != 0:
        raise RuntimeError(f"Command failed: {cmd}")
    return completed.returncode
# --- Step 1: install dependencies -------------------------------------------
print("=" * 80)
print("RAG-Anything Advanced Colab Tutorial")
print("=" * 80)
print("\n[1/10] Installing dependencies...")


def _purge_pil_modules():
    """Remove every cached ``PIL`` module from ``sys.modules``.

    After this runs, the next ``import PIL`` goes through the import system
    again instead of reusing an already-loaded module object.
    """
    for module_name in list(sys.modules):
        if module_name == "PIL" or module_name.startswith("PIL."):
            del sys.modules[module_name]


# Invoke pip through the interpreter that is running this script so packages
# land in the same environment the imports below will read from; a bare `pip`
# on PATH may belong to a different Python installation.
_PIP = f'"{sys.executable}" -m pip'

_purge_pil_modules()
run_shell(
    f"{_PIP} -q install -U "
    '"raganything[image,text]" '
    '"openai>=1.0.0" '
    '"python-dotenv" '
    '"reportlab" '
    '"pandas" '
    '"matplotlib" '
    '"tabulate"'
)
# Pin Pillow after the main install so the pinned version is the one left on disk.
run_shell(f'{_PIP} -q install --no-cache-dir --force-reinstall "pillow==11.3.0"')

# Forget any PIL modules loaded before the reinstall and refresh the import
# system's finder caches so later imports see the new files.
_purge_pil_modules()
importlib.invalidate_caches()

# Best-effort report only: failing to read the version must not stop the tutorial.
try:
    print("Pillow version:", importlib.metadata.version("Pillow"))
except Exception as e:
    print("Could not read Pillow version:", repr(e))
# --- Step 2: third-party imports ---------------------------------------------
# These imports sit after the pip commands on purpose: the packages are
# installed by this same script, so they cannot be imported at the top of the file.
print("\n[2/10] Importing libraries...")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from openai import AsyncOpenAI
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.utils import EmbeddingFunc
print("Imports successful.")
二、配置目录与运行参数
接下来我们会创建项目所需的工作目录、资源文件夹、输出目录和日志存储路径,并配置RAG-Anything运行时需要的环境变量。我们会通过隐藏输入框安全获取用户的OpenAI API密钥,对输入的密钥进行标准化清洗,同时验证对话和嵌入接口的连通性,最后定义本次教程使用的大模型、视觉模型和嵌入模型参数。
# --- Step 3: directories and runtime settings --------------------------------
print("\n[3/10] Preparing directories and runtime settings...")

# Use /content when it exists (as on Colab); otherwise work under the current directory.
if Path("/content").exists():
    BASE_DIR = Path("/content/raganything_advanced_tutorial")
else:
    BASE_DIR = Path.cwd() / "raganything_advanced_tutorial"

ASSET_DIR = BASE_DIR / "assets"
OUTPUT_DIR = BASE_DIR / "output"
WORKING_DIR = BASE_DIR / "rag_storage"
LOG_DIR = BASE_DIR / "logs"

# Tutorial switches.
RESET_STORAGE = True              # wipe WORKING_DIR on every run
RUN_FULL_DOCUMENT_PARSE = False   # enable the optional parser-based PDF ingestion at the end
PARSER_FOR_FULL_PARSE = "mineru"
PARSE_METHOD = "auto"

# Wipe the old storage first, then create everything once. The end state is the
# same as creating, deleting and recreating the storage directory.
if RESET_STORAGE and WORKING_DIR.exists():
    shutil.rmtree(WORKING_DIR)
for d in [BASE_DIR, ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Runtime settings exported as environment variables (values must be strings).
os.environ["LOG_DIR"] = str(LOG_DIR)
os.environ["SUMMARY_LANGUAGE"] = "English"
os.environ["ENABLE_LLM_CACHE"] = "false"
os.environ["ENABLE_LLM_CACHE_FOR_EXTRACT"] = "false"
os.environ["MAX_ASYNC"] = "2"
os.environ["CHUNK_SIZE"] = "900"
os.environ["CHUNK_OVERLAP_SIZE"] = "120"
os.environ["TIMEOUT"] = "240"

# Drop credentials/endpoints inherited from the environment so that only the
# key entered interactively later in this script is used.
for var in [
    "OPENAI_API_KEY",
    "OPENAI_ORG_ID",
    "OPENAI_ORGANIZATION",
    "OPENAI_PROJECT",
    "OPENAI_DEFAULT_HEADERS",
    "LLM_BINDING_API_KEY",
    "LLM_BINDING_HOST",
]:
    os.environ.pop(var, None)

print(f"Base directory: {BASE_DIR}")
print(f"Assets directory: {ASSET_DIR}")
print(f"Storage directory: {WORKING_DIR}")
# --- Step 4: capture and validate the OpenAI API key -------------------------
print("\n[4/10] Entering OpenAI API key securely...")
def clean_api_key(raw_value: str) -> str:
    """Normalise a pasted API key into a bare ASCII token.

    Handles the usual copy/paste artefacts: surrounding whitespace, a
    ``Bearer`` prefix in any casing, wrapping quotes or backticks in any
    nesting order, a leading ``NAME=`` assignment, embedded whitespace or line
    breaks, and non-ASCII characters.

    Args:
        raw_value: Whatever the user pasted; ``None`` is treated as empty.

    Returns:
        The cleaned key, or an empty string when nothing usable remains.
    """
    wrappers = "'\"` \t\r\n"  # quote characters and whitespace stripped from both ends

    cleaned = str(raw_value or "").strip()
    # Remove an Authorization-header style prefix regardless of casing.
    cleaned = re.sub(r"bearer\s+", "", cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.strip(wrappers)
    # Accept `OPENAI_API_KEY=sk-...` style input: keep only the value part.
    if "=" in cleaned:
        cleaned = cleaned.split("=", 1)[1].strip(wrappers)
    # Keys never contain whitespace; remove wrapped lines and stray spaces.
    cleaned = re.sub(r"\s+", "", cleaned)
    # Drop non-ASCII characters such as zero-width spaces.
    return cleaned.encode("ascii", errors="ignore").decode("ascii")
OPENAI_API_KEY_RAW = getpass.getpass("Paste your OpenAI API key here. Input is hidden: ")
OPENAI_API_KEY = clean_api_key(OPENAI_API_KEY_RAW)
if not OPENAI_API_KEY:
raise ValueError(
"No API key was captured. Paste the key into the hidden input box and press Enter."
)
print("Captured key length:", len(OPENAI_API_KEY))
print("Captured key prefix:", OPENAI_API_KEY[:12] + "...")
print("Captured key suffix:", "..." + OPENAI_API_KEY[-6:])
LLM_MODEL = "gpt-4o-mini"
VISION_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536
openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
os.environ["LLM_MODEL"] = LLM_MODEL
os.environ["VISION_MODEL"] = VISION_MODEL
os.environ["EMBEDDING_MODEL"] = EMBEDDING_MODEL
os.environ["EMBEDDING_DIM"] = str(EMBEDDING_DIM)
print("Testing OpenAI chat API with the captured key...")
try:
test_response = await openai_client.chat.completions.create(
model=LLM_MODEL,
messages=[{"role": "user", "content": "Reply with exactly: ok"}],
temperature=0,
)
print("Chat API test response:", test_response.choices[0].message.content)
except Exception as e:
raise RuntimeError(
"The key was captured, but OpenAI rejected the request or the account/model access failed. "
"Check billing, project permissions, and make sure this is an OpenAI Platform API key."
) from e
print("\nTesting OpenAI embedding API...")
try:
test_embedding = await openai_client.embeddings.create(
model=EMBEDDING_MODEL,
input=["RAG-Anything embedding test"],
)
print("Embedding vector length:", len(test_embedding.data[0].embedding))
except Exception as e:
raise RuntimeError(
"Chat worked, but embeddings failed. Make sure your API key has permission for embeddings."
) from e
print("OpenAI API key is working.")
print(f"Chat model: {LLM_MODEL}")
print(f"Vision model: {VISION_MODEL}")
print(f"Embedding model: {EMBEDDING_MODEL}")
print(f"Embedding dimension: {EMBEDDING_DIM}")
三、生成合成多模态测试报告
为了方便测试多模态检索能力,我们先创建一份可控的合成测试报告。这份报告包含结构化的性能数据表、趋势折线图、数学评分公式和对应的PDF文档,覆盖了文本、表格、公式和图片四种常见的文档模态,让我们可以清晰观察系统对不同类型内容的处理效果。
# --- Step 5: synthetic report data -------------------------------------------
print("\n[5/10] Creating a synthetic multimodal report...")

# One tuple per month: (month, query volume, hybrid accuracy, average latency in ms).
_monthly_rows = [
    ("Jan", 1200, 0.71, 980),
    ("Feb", 1700, 0.74, 920),
    ("Mar", 2100, 0.79, 850),
    ("Apr", 2600, 0.83, 790),
    ("May", 3300, 0.87, 760),
    ("Jun", 4100, 0.91, 730),
]
monthly_data = pd.DataFrame(
    _monthly_rows,
    columns=["Month", "Query Volume", "Hybrid Accuracy", "Average Latency ms"],
)
# Markdown rendering of the table; reused for the PDF, the content list and the queries.
table_md = monthly_data.to_markdown(index=False)

# Trend figure: query volume and hybrid accuracy on one axis. Accuracy is
# multiplied by 4000 so both series share a comparable vertical range.
fig, ax = plt.subplots(figsize=(8, 4.8))
ax.plot(
    monthly_data["Month"],
    monthly_data["Query Volume"],
    marker="o",
    label="Query Volume",
)
ax.plot(
    monthly_data["Month"],
    monthly_data["Hybrid Accuracy"] * 4000,
    marker="s",
    label="Hybrid Accuracy scaled",
)
ax.set_title("Multimodal RAG Usage and Quality Trend")
ax.set_xlabel("Month")
ax.set_ylabel("Volume / Scaled Accuracy")
ax.legend()
ax.grid(True, alpha=0.3)
# Annotation box anchored in axes coordinates (top-left corner of the plot area).
ax.text(
    0.02,
    0.95,
    "Synthetic figure: usage rises while latency falls",
    transform=ax.transAxes,
    fontsize=9,
    verticalalignment="top",
    bbox=dict(boxstyle="round", alpha=0.15),
)

chart_path = ASSET_DIR / "raganything_quality_trend.png"
fig.tight_layout()
fig.savefig(chart_path, dpi=180)
plt.close(fig)
# Two-page PDF version of the report, drawn with absolute coordinates.
# `y` is the current baseline and moves down the page as content is added.
report_pdf_path = ASSET_DIR / "synthetic_multimodal_rag_report.pdf"
c = canvas.Canvas(str(report_pdf_path), pagesize=letter)
width, height = letter
left_margin = 0.8 * inch

# ---- Page 1: title and introduction ----
c.setFont("Helvetica-Bold", 18)
c.drawString(left_margin, height - 0.8 * inch, "Synthetic Multimodal RAG Evaluation Report")

c.setFont("Helvetica", 10)
intro_lines = [
    "This report evaluates a synthetic multimodal RAG pipeline for enterprise documents.",
    "The knowledge base includes text, tables, equations, and visual evidence.",
    "The central hypothesis is that hybrid retrieval improves answer quality when evidence spans modalities.",
]
y = height - 1.25 * inch
for line in intro_lines:
    c.drawString(left_margin, y, line)
    y -= 0.22 * inch

# ---- Page 1: table, one Markdown row per line in a monospaced font ----
c.setFont("Helvetica-Bold", 12)
c.drawString(left_margin, y - 0.1 * inch, "Table 1. Monthly system measurements")
y -= 0.4 * inch

c.setFont("Courier", 7.5)
for row in table_md.splitlines():
    c.drawString(left_margin, y, row[:120])  # rows are cut at 120 characters
    y -= 0.17 * inch

# ---- Page 1: scoring equation as plain text ----
c.setFont("Helvetica-Bold", 12)
c.drawString(left_margin, y - 0.15 * inch, "Equation 1. Weighted multimodal score")
y -= 0.45 * inch

c.setFont("Helvetica", 9)
c.drawString(
    left_margin,
    y,
    "Score(q, d) = alpha * Sim_text(q, d) + beta * Sim_graph(q, d) + gamma * Sim_visual(q, d)",
)
y -= 0.5 * inch

# ---- Page 1: chart image placed below the equation ----
c.drawImage(
    str(chart_path),
    left_margin,
    y - 2.8 * inch,
    width=6.5 * inch,
    height=2.6 * inch,
)
c.showPage()

# ---- Page 2: findings as a dash-prefixed list ----
c.setFont("Helvetica-Bold", 16)
c.drawString(left_margin, height - 0.8 * inch, "Interpretation and Findings")

c.setFont("Helvetica", 10)
findings = [
    "Hybrid retrieval combines semantic similarity with graph-based relationship navigation.",
    "The synthetic table shows accuracy improving from 0.71 to 0.91 over six months.",
    "The generated figure shows query volume increasing while latency gradually decreases.",
    "Equation-level retrieval is useful when the question depends on scoring logic rather than plain prose.",
    "A multimodal system should preserve page index, captions, footnotes, and local image paths for traceability.",
]
y = height - 1.25 * inch
for finding in findings:
    c.drawString(left_margin, y, "- " + finding)
    y -= 0.28 * inch

c.save()

print(f"Created chart: {chart_path}")
print(f"Created PDF: {report_pdf_path}")
print("\nSynthetic table:")
display(monthly_data)
四、构建RAG-Anything多模态内容列表
我们将这份合成报告转换为RAG-Anything支持的结构化内容列表格式,把文本、表格、公式和图片分别封装为独立的内容块,为每个块添加页码索引、标题说明和注释信息,最后将这个内容列表保存为JSON文件,确保整个流程透明可复用。
# --- Step 6: direct multimodal content list ----------------------------------
print("\n[6/10] Building direct multimodal content_list...")

# Prose blocks are named up front so the list below shows only the structure.
_overview_text = (
    "This synthetic report evaluates a multimodal retrieval augmented generation system. "
    "The system indexes textual explanations, a structured performance table, a scoring equation, "
    "and a trend figure. The main goal is to answer questions whose evidence is distributed across "
    "several document modalities rather than one plain text passage."
)
_equation_text = (
    "Weighted multimodal retrieval score. Alpha controls text similarity, beta controls graph relationship "
    "similarity, and gamma controls visual similarity."
)
_findings_text = (
    "The key finding is that hybrid retrieval is preferred for cross-modal questions. "
    "Local retrieval is useful for entity-specific lookup, global retrieval is useful for broader themes, "
    "and naive retrieval is a simpler baseline. In this report, hybrid accuracy rises from 0.71 in January "
    "to 0.91 in June, while average latency drops from 980 milliseconds to 730 milliseconds."
)

# One dict per content block; "type" selects the modality and "page_idx" is the
# zero-based page the block is attributed to.
content_list: List[Dict[str, Any]] = [
    {"type": "text", "text": _overview_text, "page_idx": 0},
    {
        "type": "table",
        "table_body": table_md,
        "table_caption": ["Table 1: Monthly query volume, hybrid accuracy, and average latency."],
        "table_footnote": ["Synthetic measurements created for a Colab tutorial."],
        "page_idx": 0,
    },
    {
        "type": "equation",
        "latex": r"Score(q,d)=\alpha \cdot Sim_{text}(q,d)+\beta \cdot Sim_{graph}(q,d)+\gamma \cdot Sim_{visual}(q,d)",
        "text": _equation_text,
        "page_idx": 0,
    },
    {
        "type": "image",
        # Absolute path of the chart file generated earlier.
        "img_path": str(chart_path.resolve()),
        "image_caption": ["Figure 1: Multimodal RAG usage and quality trend."],
        "image_footnote": ["The line chart is synthetic and generated inside this tutorial."],
        "page_idx": 0,
    },
    {"type": "text", "text": _findings_text, "page_idx": 1},
]

# Keep a JSON copy next to the other assets so the exact input can be inspected.
content_list_path = ASSET_DIR / "content_list.json"
content_list_path.write_text(
    json.dumps(content_list, indent=2, ensure_ascii=False),
    encoding="utf-8",
)
print(f"Saved content list: {content_list_path}")
五、配置OpenAI模型与嵌入工具
这里我们会封装基于OpenAI的对话生成、视觉处理和文本嵌入工具函数,支持系统提示、对话历史、多模态图片输入以及自定义模型参数。我们会将嵌入函数适配为LightRAG标准格式,让RAG-Anything可以在索引和检索阶段正常调用。
# --- Step 7: OpenAI-backed chat, vision and embedding callables --------------
print("\n[7/10] Defining clean OpenAI model and embedding functions...")
async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
    """Send a text-only chat completion request to ``LLM_MODEL``.

    Args:
        prompt: User message; converted with ``str()``.
        system_prompt: Optional system message placed first.
        history_messages: Optional earlier turns. Only dict entries that carry
            both ``"role"`` and ``"content"`` are forwarded.
        **kwargs: Extra options. Only ``temperature``, ``top_p``,
            ``max_tokens`` and ``response_format`` are forwarded, and only when
            not ``None``; everything else is ignored.

    Returns:
        The content of the first choice, or ``""`` when it is empty/``None``.
    """
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": str(system_prompt)})
    conversation.extend(
        turn
        for turn in (history_messages or [])
        if isinstance(turn, dict) and "role" in turn and "content" in turn
    )
    conversation.append({"role": "user", "content": str(prompt)})

    # Allow-list of request options passed through to the API call.
    forwarded = ("temperature", "top_p", "max_tokens", "response_format")
    request_options = {
        name: kwargs[name] for name in forwarded if kwargs.get(name) is not None
    }

    completion = await openai_client.chat.completions.create(
        model=LLM_MODEL,
        messages=conversation,
        **request_options,
    )
    return completion.choices[0].message.content or ""
# Leading characters of base64-encoded image files, keyed to their MIME type.
_BASE64_IMAGE_SIGNATURES = (
    ("iVBORw0KGgo", "image/png"),
    ("/9j/", "image/jpeg"),
    ("R0lGOD", "image/gif"),
    ("UklGR", "image/webp"),
)


def _image_data_url(image_data):
    """Turn base64 image data into a ``data:`` URL with a matching MIME type.

    A value that already is a ``data:`` URL is returned unchanged. Otherwise
    the MIME type is chosen from the base64 signature, falling back to
    ``image/jpeg`` when the signature is not recognised.
    """
    payload = str(image_data).strip()
    if payload.startswith("data:"):
        return payload
    mime_type = "image/jpeg"
    for signature, candidate in _BASE64_IMAGE_SIGNATURES:
        if payload.startswith(signature):
            mime_type = candidate
            break
    return f"data:{mime_type};base64,{payload}"


async def vision_model_func(
    prompt,
    system_prompt=None,
    history_messages=None,
    image_data=None,
    messages=None,
    **kwargs,
):
    """Send a chat completion request to ``VISION_MODEL``, optionally with an image.

    Args:
        prompt: User message text; converted with ``str()``.
        system_prompt: Optional system message placed first.
        history_messages: Optional earlier turns. Only dict entries that carry
            both ``"role"`` and ``"content"`` are forwarded.
        image_data: Optional base64-encoded image attached to the user turn.
        messages: Optional ready-made message list. When non-empty it is sent
            as-is (minus ``None`` entries) and the other message arguments are
            ignored.
        **kwargs: Extra options. Only ``temperature``, ``top_p``,
            ``max_tokens`` and ``response_format`` are forwarded, and only when
            not ``None``.

    Returns:
        The content of the first choice, or ``""`` when it is empty/``None``.
    """
    allowed_kwargs = {
        key: kwargs[key]
        for key in ("temperature", "top_p", "max_tokens", "response_format")
        if kwargs.get(key) is not None
    }

    if messages:
        # The caller already built the conversation; just drop None placeholders.
        request_messages = [m for m in messages if m is not None]
    else:
        request_messages = []
        if system_prompt:
            request_messages.append({"role": "system", "content": str(system_prompt)})
        for msg in history_messages or []:
            if isinstance(msg, dict) and "role" in msg and "content" in msg:
                request_messages.append(msg)
        if image_data:
            # Multi-part user turn: the text plus the image as an inline data URL.
            user_content = [
                {"type": "text", "text": str(prompt)},
                {"type": "image_url", "image_url": {"url": _image_data_url(image_data)}},
            ]
        else:
            user_content = str(prompt)
        request_messages.append({"role": "user", "content": user_content})

    response = await openai_client.chat.completions.create(
        model=VISION_MODEL,
        messages=request_messages,
        **allowed_kwargs,
    )
    return response.choices[0].message.content or ""
async def openai_embedding_func(texts, **kwargs):
    """Embed one or more texts with ``EMBEDDING_MODEL``.

    Args:
        texts: A single string or an iterable of items; every item is
            converted with ``str()`` before the request.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        A ``float32`` NumPy array built from the embedding of each item in the
        response.
    """
    batch = [texts] if isinstance(texts, str) else texts
    batch = [str(item) for item in batch]
    result = await openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=batch,
    )
    return np.array([row.embedding for row in result.data], dtype=np.float32)
# Wrap the coroutine together with its vector size and token limit in the
# EmbeddingFunc container imported from lightrag.utils.
embedding_func = EmbeddingFunc(
    func=openai_embedding_func,
    embedding_dim=EMBEDDING_DIM,
    max_token_size=8192,
)
print("Model and embedding functions ready.")
六、初始化RAG-Anything并运行混合检索
我们使用配置好的工作目录、解析参数和多模态处理开关初始化RAG-Anything实例,将之前准备好的多模态内容列表导入系统,让系统自动处理文本、表格、公式和图片块。随后我们会用四种检索模式分别运行同一组问题,对比朴素检索、本地检索、全局检索和混合检索的返回结果差异。
# --- Step 8: build the RAG-Anything instance ---------------------------------
print("\n[8/10] Initializing RAG-Anything...")

# All three non-text modalities are switched on.
_modality_switches = dict(
    enable_image_processing=True,
    enable_table_processing=True,
    enable_equation_processing=True,
)
config = RAGAnythingConfig(
    working_dir=str(WORKING_DIR),
    parser=PARSER_FOR_FULL_PARSE,
    parse_method=PARSE_METHOD,
    **_modality_switches,
)

# Wire in the three OpenAI-backed callables defined earlier in this script.
rag = RAGAnything(
    config=config,
    llm_model_func=llm_model_func,
    vision_model_func=vision_model_func,
    embedding_func=embedding_func,
)
async def maybe_await(value):
    """Return ``value``, awaiting it first when it is awaitable.

    Lets the caller treat a function the same way whether it returns a plain
    result or a coroutine/future.

    Args:
        value: Any object.

    Returns:
        The awaited result when ``value`` is awaitable, otherwise ``value``.
    """
    if inspect.isawaitable(value):
        return await value
    return value
if hasattr(rag, "initialize_storages"):
try:
await maybe_await(rag.initialize_storages())
print("RAG-Anything storages initialized.")
except Exception as e:
print("Storage initialization skipped or already handled:", repr(e))
print(f"Working directory: {WORKING_DIR}")
print("\n[9/10] Inserting multimodal content and running retrieval queries...")
async def insert_demo_content():
    """Insert the prepared ``content_list`` into ``rag`` under a fixed document id.

    The PDF's file name (not its full path) is passed as ``file_path``. No
    character-based splitting is requested and statistics display is enabled.
    """
    insert_options = {
        "content_list": content_list,
        "file_path": str(report_pdf_path.name),
        "split_by_character": None,
        "split_by_character_only": False,
        "doc_id": "synthetic-multimodal-rag-report",
        "display_stats": True,
    }
    await rag.insert_content_list(**insert_options)
await insert_demo_content()
print("Insertion complete.")
queries = [
"What is the main purpose of the multimodal RAG report?",
"How did hybrid accuracy and latency change from January to June?",
"Why is hybrid retrieval better than naive retrieval for this report?",
"What does the weighted multimodal score equation mean?",
]
async def safe_aquery(question, mode="hybrid", vlm_enhanced=False):
    """Query ``rag``, retrying without ``vlm_enhanced`` if the first call raises ``TypeError``.

    Args:
        question: Query text.
        mode: Retrieval mode forwarded to ``rag.aquery``.
        vlm_enhanced: Forwarded on the first attempt only.

    Returns:
        Whatever ``rag.aquery`` returns.
    """
    try:
        answer = await rag.aquery(question, mode=mode, vlm_enhanced=vlm_enhanced)
    except TypeError:
        # Fallback call that omits the `vlm_enhanced` keyword.
        answer = await rag.aquery(question, mode=mode)
    return answer
async def run_query_suite():
    """Run every question in ``queries`` under each retrieval mode.

    Each answer is printed in full. A failing query does not stop the suite:
    its error text is recorded in place of the answer.

    Returns:
        A DataFrame with one row per (mode, question) and the columns
        ``mode``, ``question`` and ``answer_preview`` (first 700 characters).
    """
    banner = "=" * 80
    rows = []
    for retrieval_mode in ("naive", "local", "global", "hybrid"):
        print("\n" + banner)
        print(f"QUERY MODE: {retrieval_mode.upper()}")
        print(banner)
        for question in queries:
            print(f"\nQuestion: {question}")
            try:
                reply = await safe_aquery(question, mode=retrieval_mode, vlm_enhanced=False)
            except Exception as e:
                reply = f"Query failed in mode={retrieval_mode}: {repr(e)}"
            print("\nAnswer:")
            print(reply)
            print("-" * 80)
            rows.append(
                {
                    "mode": retrieval_mode,
                    "question": question,
                    "answer_preview": str(reply)[:700],
                }
            )
    return pd.DataFrame(rows)
# Run the full mode-by-question suite and show the collected previews as a table.
# (Top-level `await`: requires a notebook / IPython-style environment.)
query_results_df = await run_query_suite()
print("\nQuery result preview:")
display(query_results_df)
七、执行显式多模态查询测试
最后我们会直接传入表格和公式内容进行针对性的多模态查询,测试系统在结构化数值计算、评分逻辑解释以及跨模态内容关联分析场景下的表现。我们还保留了可选的完整文档解析路径,方便用户测试基于解析器的PDF导入流程。
# --- Step 10: queries that carry their own multimodal content ----------------
print("\n[10/10] Running explicit multimodal queries...")
async def run_multimodal_queries():
    """Run three hybrid-mode queries that pass table/equation content inline.

    The cases are: table only, equation only, and table plus equation. Each
    answer is printed in full. A failing query does not stop the run: its
    error text is recorded in place of the answer.

    Returns:
        A DataFrame with one row per case and the columns ``case``,
        ``question`` and ``answer_preview`` (first 900 characters).
    """
    equation_latex = r"Score(q,d)=\alpha Sim_{text}(q,d)+\beta Sim_{graph}(q,d)+\gamma Sim_{visual}(q,d)"

    # Factories return a fresh dict on every call so no case shares an object
    # with another one.
    def table_item():
        return {
            "type": "table",
            "table_data": table_md,
            "table_caption": "Monthly performance table",
        }

    def equation_item():
        return {
            "type": "equation",
            "latex": equation_latex,
            "equation_caption": "Weighted multimodal retrieval score",
        }

    multimodal_cases = [
        {
            "name": "Table-aware query",
            "question": (
                "Using the supplied table, identify the month with the highest hybrid accuracy, "
                "the month with the lowest latency, and explain whether the trend supports the report conclusion."
            ),
            "multimodal_content": [table_item()],
        },
        {
            "name": "Equation-aware query",
            "question": (
                "Explain how this scoring equation should affect retrieval when the user's question needs "
                "textual, graph, and visual evidence at the same time."
            ),
            "multimodal_content": [equation_item()],
        },
        {
            "name": "Combined multimodal query",
            "question": (
                "Connect the table, equation, and document conclusion into one explanation of why a multimodal "
                "hybrid retriever is useful."
            ),
            "multimodal_content": [table_item(), equation_item()],
        },
    ]

    banner = "=" * 80
    rows = []
    for case in multimodal_cases:
        title, question = case["name"], case["question"]
        print("\n" + banner)
        print(title)
        print(banner)
        print("Question:", question)
        try:
            reply = await rag.aquery_with_multimodal(
                question,
                multimodal_content=case["multimodal_content"],
                mode="hybrid",
            )
        except Exception as e:
            reply = f"Multimodal query failed: {repr(e)}"
        print("\nAnswer:")
        print(reply)
        rows.append(
            {
                "case": title,
                "question": question,
                "answer_preview": str(reply)[:900],
            }
        )
    return pd.DataFrame(rows)
# Run the three inline-content cases and show the collected previews as a table.
# (Top-level `await`: requires a notebook / IPython-style environment.)
multimodal_results_df = await run_multimodal_queries()
print("\nMultimodal result preview:")
display(multimodal_results_df)
# Announce whether the optional parser-based ingestion below is enabled.
print("\nOptional full-parser path:")
print("RUN_FULL_DOCUMENT_PARSE is currently:", RUN_FULL_DOCUMENT_PARSE)
async def optional_full_document_parse():
    """Optionally ingest the generated PDF through the configured parser.

    Does nothing except print a hint when ``RUN_FULL_DOCUMENT_PARSE`` is
    false. Otherwise the PDF is processed with ``rag.process_document_complete``
    under its own document id, and one hybrid query is run and printed.
    """
    if RUN_FULL_DOCUMENT_PARSE:
        print("Starting full document parsing.")
        parse_options = {
            "file_path": str(report_pdf_path),
            "output_dir": str(OUTPUT_DIR),
            "parse_method": PARSE_METHOD,
            "parser": PARSER_FOR_FULL_PARSE,
            "display_stats": True,
            "doc_id": "parser-processed-synthetic-report",
        }
        await rag.process_document_complete(**parse_options)
        answer = await safe_aquery(
            "After full parsing, what figures, tables, and equations are present in the report?",
            mode="hybrid",
            vlm_enhanced=False,
        )
        print(answer)
    else:
        print(
            "Skipping parser-based PDF ingestion. "
            "Set RUN_FULL_DOCUMENT_PARSE=True near the top to test MinerU/Docling/PaddleOCR parsing."
        )
await optional_full_document_parse()
print("\n" + "=" * 80)
print("Tutorial complete.")
print("=" * 80)
print(f"Assets directory: {ASSET_DIR}")
print(f"RAG storage directory: {WORKING_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print("\nGenerated files:")
for path in sorted(ASSET_DIR.glob("*")):
print(" -", path)
八、教程总结
本教程完整搭建了一套支持多模态内容的RAG-Anything检索流水线(pipeline),可以顺利导入文本、Markdown表格、LaTeX公式和生成的图片等多种类型的内容,并通过不同的检索模式返回对应的查询结果。我们验证了系统在处理单一模态和跨模态查询时的表现,展示了多模态检索在跨模态问题回答中的优势。
塔猴是一个专注于为用户提供系统学习、内容创作与商业连接的AIGC综合服务平台,致力于为每一位AI探索者打造理想的创作、成长家园。在塔猴,你不仅可以学习众多AIGC类实战课程,获得与时俱进的AIGC技能和视野,还有机会获得长期商业合作和接单机会!点击进入:https://www.tahou.com/
AI生成内容提示:本文由人工智能辅助创作,内容仅供参考,不代表平台观点。请注意核实信息的准确性,并理性判断。




