最佳实践｜“手搓” GraphRAG 应用，社区开发者实战经验全公开

TuGraph 2025-07-16

344

点击蓝字，关注我们

GraphRAG是一个创新的知识检索与问答增强框架，它巧妙地结合了图数据库技术与检索增强生成（RAG）方法。GraphRAG往往在处理复杂数据关系任务上取得比传统RAG更好的效果，是当下LLM领域热门的工程方向之一。

去年，TuGraph图数据库作为底层的图存储系统，联合AI工程框架DB-GPT、知识图谱系统OpenSPG，设计了一个通用的开源RAG框架，以兼容未来多样化的基础研究建设和工程化应用诉求。

本文来自TuGraph社区开发者张哲源，分享了他改进GraphRAG并实践应用的经验。作者目前就职于中科数睿，拥有4年AI开发经验，同时也是多个数据库产品认证的数据库专家。

一、行业与技术背景

随着人工智能、大数据与自然语言处理技术的快速发展，各行业中数据量呈爆炸式增长，不同来源（文档、数据库、API、知识图谱等）的信息呈现碎片化、多样化特征。传统的信息检索方法（关键词或向量检索）虽然能在一定程度上提升检索效率，但往往难以充分挖掘数据之间的“隐含”关联，难以支撑复杂推理和跨文档、多跳检索需求。

知识图谱作为一种以“实体—关系”形式刻画知识的结构化表示，能够直观表达多源异构数据中的语义关联，支持图算法进行路径查询和关系推理；而 Retrieval-Augmented Generation（RAG）技术则通过“检索 + 生成”模式，将外部知识注入预训练大模型，实现了对开放域问答、智能摘要、决策支持等场景下高质量答案的生成。将知识图谱与 RAG 相融合的 GraphRAG，便在此背景下应运而生。本文基于社区 GraphRAG 版本做了一定的改进。

二、方案验证的环境

Ubuntu 20.04.6 LTS

Python 3.12

TuGraph 4.5.1

三、算法逻辑

3.1 文档读取:

为了方便大家快速验证，我提供了 doc、docx、txt 三种格式的解析代码。

import os
import re
import struct
import sys
import olefile
from docx import Document as DocxDocument
from typing import List, Tuple, Dict, Any
# Byte value -> Unicode code point overrides for the 0x80-0x9F range, applied
# when decoding 8-bit ("compressed") text pieces of a .doc file. Bytes that are
# absent from this table are used as their own code point (see the
# `.get(code, code)` lookup in extract_paragraphs_from_doc).
# NOTE(review): the entries look like the Windows-1252 block minus 0x80, 0x8E
# and 0x9E — presumably mirroring the exception table of the MS-DOC
# FcCompressed structure; confirm against the spec before adding entries.
ANSI_TO_UNICODE: Dict[int, int] = {
    0x82: 0x201A, 0x83: 0x0192, 0x84: 0x201E, 0x85: 0x2026,
    0x86: 0x2020, 0x87: 0x2021, 0x88: 0x02C6, 0x89: 0x2030,
    0x8A: 0x0160, 0x8B: 0x2039, 0x8C: 0x0152, 0x91: 0x2018,
    0x92: 0x2019, 0x93: 0x201C, 0x94: 0x201D, 0x95: 0x2022,
    0x96: 0x2013, 0x97: 0x2014, 0x98: 0x02DC, 0x99: 0x2122,
    0x9A: 0x0161, 0x9B: 0x203A, 0x9C: 0x0153, 0x9F: 0x0178,
}
# Matches the whole words PAGE or MERGEFORMAT, case-insensitively. The .doc
# extractor drops every line in which this pattern is found.
_FIELD_RE = re.compile(r'\bPAGE\b|\bMERGEFORMAT\b', re.IGNORECASE)
# Lines matching this pattern in full are dropped by the .doc extractor
# (judging by the name, they are presumably numeric table-cell residue).
_TABLE_FRAGMENT_RE = re.compile(r'^[\d\.\-\t ]+$')  # digits, dots, hyphens, tabs and spaces only
def _clean_text(text: str) -> str:
    """Drop non-printable characters and collapse runs of spaces and tabs.

    Tab, carriage return, line feed and space always survive the filter, even
    though ``str.isprintable`` rejects the first three. Every run of spaces
    and tabs is then squeezed into a single space, and the result is stripped
    of leading and trailing whitespace.
    """
    always_kept = '\t\r\n '
    survivors = [ch for ch in text if ch in always_kept or ch.isprintable()]
    squeezed = re.sub(r'[ \t]+', ' ', ''.join(survivors))
    return squeezed.strip()
def parse_docx(path: str) -> List[Tuple[int, str]]:
    """Collect the non-empty paragraphs of a .docx file.

    Args:
        path: Location of the .docx file, passed to ``DocxDocument``.

    Returns:
        ``(paragraph_number, stripped_text)`` pairs. Numbering is 1-based and
        counts blank paragraphs too, so numbers may have gaps. Reading stops
        at the first paragraph whose stripped text starts with '附件'; that
        paragraph and everything after it are excluded.
    """
    numbered: List[Tuple[int, str]] = []
    for number, paragraph in enumerate(DocxDocument(path).paragraphs, start=1):
        content = paragraph.text.strip()
        if content.startswith('附件'):
            # Attachment marker reached: nothing from here on is wanted.
            break
        if content:
            numbered.append((number, content))
    return numbered
def parse_txt(path: str) -> List[Tuple[int, str]]:
    """Collect the non-empty lines of a UTF-8 text file.

    Args:
        path: Location of the text file.

    Returns:
        ``(line_number, stripped_text)`` pairs. Numbering is 1-based and
        counts blank lines too. Reading stops at the first line whose
        stripped text starts with '附件'; that line is not included.
    """
    lines = []
    # 'utf-8-sig' behaves like 'utf-8' but also swallows a leading BOM.
    # With plain 'utf-8' the BOM (U+FEFF) stays glued to the first line,
    # because str.strip() does not treat it as whitespace; that pollutes the
    # first chunk and makes the '附件' check below miss a first-line marker.
    with open(path, encoding='utf-8-sig', errors='ignore') as f:
        for idx, line in enumerate(f, start=1):
            txt = line.strip()
            if not txt:
                continue
            if txt.startswith('附件'):
                break
            lines.append((idx, txt))
    return lines
def _read_fib(stream) -> Dict[str, Any]:
    """Read selected little-endian fields from the start of a seekable stream.

    The stream is rewound, 64 bytes are skipped, and an 88-byte block is read
    from which the uint32 at offset 12 becomes ``ccpText``. A uint16 count
    then gives the size (count * 8 bytes) of the following blob, from which
    four uint32 values are taken at fixed offsets. Finally a uint16 count and
    that many trailing 16-bit words are consumed.
    NOTE(review): the offsets presumably follow the MS-DOC FIB layout
    (FibBase/FibRgW, FibRgLw97, FibRgFcLcb97) — confirm against the spec.

    Args:
        stream: Binary file-like object supporting ``seek`` and ``read``.

    Returns:
        Dict with keys ``ccpText``, ``fcClx``, ``lcbClx``, ``fcPlcfBtePapx``
        and ``lcbPlcfBtePapx``.

    Raises:
        struct.error: If the stream is too short for any of the fields.
    """
    stream.seek(0)
    stream.read(64)  # leading 64 bytes are skipped unread
    rg_lw = stream.read(88)
    (char_count,) = struct.unpack_from("<I", rg_lw, 12)

    (pair_count,) = struct.unpack("<H", stream.read(2))
    fc_lcb = stream.read(pair_count * 8)

    (trailing_words,) = struct.unpack("<H", stream.read(2))
    if trailing_words:
        # Consumed only to advance the stream past the trailing words.
        stream.read(trailing_words * 2)

    def u32(offset: int) -> int:
        return struct.unpack_from("<I", fc_lcb, offset)[0]

    result: Dict[str, Any] = {'ccpText': char_count}
    result['fcClx'] = u32(0x108)
    result['lcbClx'] = u32(0x10C)
    result['fcPlcfBtePapx'] = u32(0x68)
    result['lcbPlcfBtePapx'] = u32(0x6C)
    return result
def _read_clx(table_stream, fc_clx: int, lcb_clx: int) -> bytes:
    """Read the Clx region of the table stream and return its Pcdt part.

    A Clx may begin with zero or more Prc records (marker byte 0x01, a
    16-bit length, then that many bytes) before the Pcdt record (marker byte
    0x02). Rejecting every Clx that does not start with 0x02 would silently
    yield no text for documents that carry Prc records, so they are skipped.

    Args:
        table_stream: Seekable binary stream (the 0Table/1Table stream).
        fc_clx: Byte offset of the Clx inside ``table_stream``.
        lcb_clx: Size of the Clx in bytes.

    Returns:
        The bytes from the Pcdt marker (0x02) to the end of the Clx, or
        ``b''`` when no well-formed Pcdt is found.
    """
    table_stream.seek(fc_clx)
    data = table_stream.read(lcb_clx)
    pos = 0
    while pos < len(data) and data[pos] == 0x01:
        if pos + 3 > len(data):
            # Truncated Prc header: nothing trustworthy follows.
            return b''
        (cb_grpprl,) = struct.unpack_from("<H", data, pos + 1)
        pos += 3 + cb_grpprl  # always advances by >= 3, so the loop terminates
    if pos < len(data) and data[pos] == 0x02:
        return data[pos:]
    return b''
def _parse_plc_pcd(pcdt_data: bytes) -> List[Dict[str, Any]]:
    """Decode a Pcdt record into the list of text pieces it describes.

    Layout consumed here: one marker byte, a uint32 ``lcb``, then ``lcb``
    bytes holding ``n + 1`` uint32 character positions followed by ``n``
    8-byte piece descriptors, hence ``n = (lcb - 4) // 12``. Inside each
    descriptor the uint32 at offset 2 packs the file offset in bits 0-29 and
    the "compressed" (8-bit text) flag in bit 30, per the MS-DOC
    FcCompressed definition; for compressed pieces the stored offset is
    twice the real byte offset.

    Args:
        pcdt_data: Bytes starting at the Pcdt marker byte.

    Returns:
        One dict per piece with keys ``fc`` (byte offset of the piece's text
        in the WordDocument stream), ``start_cp``, ``end_cp`` (character
        range, end exclusive) and ``compressed`` (True for 1-byte-per-char
        text). Empty list when the input is too short to hold any piece.
    """
    if len(pcdt_data) < 5:
        return []
    (lcb,) = struct.unpack_from("<I", pcdt_data, 1)
    plc = pcdt_data[5:5 + lcb]
    # Derive the count from the bytes actually present so that a truncated
    # record yields fewer pieces instead of a struct.error.
    n = max(0, (len(plc) - 4) // 12)
    if n == 0:
        return []
    cps = struct.unpack_from(f"<{n + 1}I", plc, 0)
    pcd_base = (n + 1) * 4
    pieces = []
    for i in range(n):
        (raw_fc,) = struct.unpack_from("<I", plc, pcd_base + i * 8 + 2)
        compressed = bool(raw_fc & 0x40000000)
        fc = raw_fc & 0x3FFFFFFF
        if compressed:
            fc //= 2
        pieces.append({
            'fc': fc,
            'start_cp': cps[i],
            'end_cp': cps[i + 1],
            'compressed': compressed,
        })
    return pieces
def extract_paragraphs_from_doc(path: str) -> List[str]:
    """Extract cleaned text paragraphs from a legacy binary .doc file.

    The text pieces listed in the document's piece table are read from the
    ``WordDocument`` stream, split on CR/LF runs and cleaned. Empty lines,
    lines containing field keywords (``_FIELD_RE``) and purely numeric
    fragments (``_TABLE_FRAGMENT_RE``) are dropped. Extraction stops at the
    first line that starts with '附件'; that line is not included.

    Lines are split per piece, so a paragraph that straddles two pieces is
    returned as two entries.

    Args:
        path: Location of the .doc (OLE compound) file.

    Returns:
        The kept paragraphs in document order.

    Raises:
        Whatever ``olefile`` raises for unreadable files or missing streams,
        and ``struct.error`` for a header too short to parse.
    """
    ole = olefile.OleFileIO(path)
    # try/finally guarantees the OLE file is closed on every exit path,
    # including parse errors, which previously leaked the handle.
    try:
        wd = ole.openstream('WordDocument')

        # Bit 9 (0x0200) of the 16-bit flag word at offset 0x0A selects the
        # table stream (fWhichTblStm in MS-DOC). A file may contain both
        # streams, so prefer the flagged one and fall back to the other only
        # when the flagged one is absent.
        wd.seek(0x0A)
        flag_bytes = wd.read(2)
        use_table1 = (
            len(flag_bytes) == 2
            and bool(struct.unpack("<H", flag_bytes)[0] & 0x0200)
        )
        preferred, fallback = ('1Table', '0Table') if use_table1 else ('0Table', '1Table')
        ts = preferred if ole.exists(preferred) else fallback
        tb = ole.openstream(ts)

        fib = _read_fib(wd)  # rewinds wd itself before reading
        clx = _read_clx(tb, fib['fcClx'], fib['lcbClx'])
        pieces = _parse_plc_pcd(clx) if clx else []

        paragraphs: List[str] = []
        for piece in pieces:
            char_count = piece['end_cp'] - piece['start_cp']
            if char_count <= 0:
                continue
            # Read the whole piece in one call: far fewer seeks than reading
            # character by character, and UTF-16 surrogate pairs are decoded
            # together instead of being discarded half by half.
            wd.seek(piece['fc'])
            if piece['compressed']:
                raw = wd.read(char_count)
                raw_text = ''.join(chr(ANSI_TO_UNICODE.get(b, b)) for b in raw)
            else:
                raw_text = wd.read(char_count * 2).decode('utf-16-le', errors='ignore')

            for sub in re.split(r'[\r\n]+', raw_text):
                text = _clean_text(sub)
                if not text or _FIELD_RE.search(text) or _TABLE_FRAGMENT_RE.fullmatch(text):
                    continue
                if text.startswith('附件'):
                    return paragraphs
                paragraphs.append(text)
        return paragraphs
    finally:
        ole.close()
def parse_document(path: str) -> List[Tuple[int, str]]:
    """Parse a document into numbered text chunks, dispatching on extension.

    The extension is compared case-insensitively. ``.docx`` and ``.txt`` are
    delegated to ``parse_docx`` / ``parse_txt``; ``.doc`` paragraphs come
    from ``extract_paragraphs_from_doc`` and are numbered consecutively
    from 1.

    Args:
        path: Location of the document.

    Returns:
        ``(number, text)`` pairs.

    Raises:
        ValueError: If the extension is not .docx, .txt or .doc.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower()
    if ext == '.docx':
        result = parse_docx(path)
    elif ext == '.txt':
        result = parse_txt(path)
    elif ext == '.doc':
        result = list(enumerate(extract_paragraphs_from_doc(path), start=1))
    else:
        raise ValueError(f"Unsupported format: {ext}")
    return result

3.2 构建知识抽取并通过正则提取到 TuGraph 中:

注意：要根据自己的垂直领域修改提示词。为了方便大家验证，这里提供抽取三元组的代码；导入 TuGraph 的部分由于涉及业务数据，不做提供。

import asyncio
import re
import json
import tomllib
import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from openai import AsyncOpenAI
from document_parse import parse_document
# Runs at import time: load the "openai" table from config.toml. The path is
# relative, so it is resolved against the current working directory.
# Keys read in this file: "api_key", optional "base_url", and "model".
with open("config.toml", "rb") as f:
    cfg = tomllib.load(f)["openai"]
# Single async client shared by every request. base_url falls back to the
# public OpenAI endpoint when the config does not provide one.
client = AsyncOpenAI(
    api_key=cfg["api_key"],
    base_url=cfg.get("base_url", "https://api.openai.com/v1"),
)
# Prompt template (in Chinese) asking the model to return every
# subject/predicate/object relation of a text chunk as a strict JSON array,
# with one worked example. "{chunk}" marks where the chunk text goes.
# NOTE(review): the example output contains literal "{" and "}" characters,
# so str.format() would parse them as replacement fields; the template must
# be filled with a plain substitution of "{chunk}" (or the braces escaped).
PROMPT = """你是一个结构化信息提取助手，请从给定的中文文本中抽取所有实体及它们之间的关系。
将结果以严格的 JSON 数组格式输出，数组中的每个元素都是一个对象，包含 "subject"、"predicate"、"object" 三个字段。
不要输出任何多余说明或文本，只要合法的 JSON。
示例输入：
“Acme公司注册资本为500万元，法定代表人为张三。Acme公司的子公司为Beta公司。”
示例输出：
[
  { "subject": "Acme公司", "predicate": "注册资本", "object": "500万元" },
  { "subject": "张三",   "predicate": "法定代表人", "object": "Acme公司" },
  { "subject": "Acme公司", "predicate": "子公司", "object": "Beta公司" }
]
文本：
「{chunk}」
"""
def _parse_triples(content, chunk_index):
    """Turn one model reply into ``(subject, predicate, object, chunk_index)`` tuples."""
    # Models often wrap JSON in a markdown fence; remove it before parsing.
    text = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", content.strip())
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback for non-JSON replies: one "(subject, predicate, object)"
        # per line, separated by ASCII or full-width commas.
        found = []
        for line in text.splitlines():
            item = line.strip("（）() ")
            if not item:
                continue
            parts = re.split(r"[，,]", item, maxsplit=2)
            if len(parts) == 3:
                found.append((*map(str.strip, parts), chunk_index))
        return found

    if isinstance(data, dict):
        data = [data]  # tolerate a single object instead of an array
    if not isinstance(data, list):
        return []

    def field(obj, key):
        value = obj.get(key)
        return "" if value is None else str(value).strip()

    found = []
    for obj in data:
        if not isinstance(obj, dict):
            continue
        h, r, t = field(obj, "subject"), field(obj, "predicate"), field(obj, "object")
        if h and r and t:
            found.append((h, r, t, chunk_index))
    return found


async def _extract(chunks, model, concurrency=5):
    """Ask the chat model for relation triples of every chunk, concurrently.

    Args:
        chunks: Iterable of text chunks; each is truncated to 8000 characters
            before being placed into ``PROMPT``.
        model: Model name passed to the chat-completions endpoint.
        concurrency: Maximum number of requests in flight at once.

    Returns:
        List of ``(subject, predicate, object, chunk_index)`` tuples, where
        ``chunk_index`` is the 0-based position of the source chunk. Triples
        are ordered by chunk index, not by request completion time.

    Raises:
        Any exception raised by the API client propagates and aborts the run.
    """
    sem = asyncio.Semaphore(concurrency)

    async def job(i, txt):
        # PROMPT contains literal JSON braces in its example, so str.format()
        # raises KeyError on it; substitute the placeholder directly instead.
        prompt = PROMPT.replace("{chunk}", txt[:8000])
        async with sem:
            resp = await client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
        # message.content can be None (e.g. refusals); treat that as empty.
        content = (resp.choices[0].message.content or "").strip()
        return _parse_triples(content, i)

    # gather() returns results in submission order, which keeps the output
    # deterministic; each job returns its own list instead of appending to a
    # shared one in completion order.
    per_chunk = await tqdm_asyncio.gather(*[job(i, t) for i, t in enumerate(chunks)])
    return [triple for found in per_chunk for triple in found]
async def run_pipeline(file_path: str,
                       out_csv: str | None = None,
                       model: str | None = None):
    """Parse a document, extract relation triples, print and optionally save them.

    Args:
        file_path: Document handed to ``parse_document``.
        out_csv: When truthy, path of a CSV file to write the triples to
            (UTF-8 with BOM, columns subject/predicate/object/chunk, where
            ``chunk`` is the 0-based chunk index).
        model: Model name; falls back to ``cfg["model"]`` when falsy.

    Returns:
        None. Each triple is printed with its 1-based chunk number.
    """
    parsed = parse_document(file_path)
    texts = [entry[1] for entry in parsed]
    chosen_model = model or cfg["model"]
    triples = await _extract(texts, chosen_model)

    for head, rel, tail, idx in triples:
        print(f"({head}, {rel}, {tail})   [chunk #{idx+1}]")

    if not out_csv:
        return
    frame = pd.DataFrame(
        triples,
        columns=["subject", "predicate", "object", "chunk"],
    )
    frame.to_csv(out_csv, index=False, encoding="utf-8-sig")