feat(report): restructure Gemini analysis and overview documentation

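Update the Gemini response schema to extract a detailed overview, logic steps, data dependencies, and consultation insights, and render them as a consulting-style Markdown report. Add pipeline processing logic, file categorization explanations, and a cleaned-XAML file list to the generated OVERVIEW.md, and document the new output format in README.md.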
2026-04-02 10:33:19 +08:00
parent d6218d6bad
commit 5ba2e3217a
5 changed files with 302 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -43,8 +43,14 @@ GEMINI_MODEL=gemini-2.5-flash
 输出目录会保留原项目的相对目录结构，并额外生成：

 - `manifest.json`: 扫描、剪枝、告警和分析结果清单
- `OVERVIEW.md`: 总览摘要
- `*.analysis.md`: 每个最终保留文件对应的 Gemini 说明
+- `OVERVIEW.md`: 总览说明书，除了文件数量，还会解释“初次扫描 -> 复制到工作区 -> 清理 Comment Out -> 二次扫描 -> 剪枝 -> Gemini 分析”的整条流水线逻辑
+- `*.analysis.md`: 每个最终保留文件对应的 Gemini 咨询式说明，固定包含：
+  - 文件定位：这个文件在整个流程中的角色、触发方式、核心作用
+  - 流程拆解：按步骤说明它是怎么执行的
+  - 关键逻辑与异常边界：条件判断、调用关系，以及异常处理与边界情况（在输出中分为“关键逻辑”和“异常与边界”两节）
+  - 数据与依赖：输入、输出、变量、外部文件或系统
+  - 咨询视角：业务含义、风险点、无法确认的信息
+  - 场景范例：帮助交接和理解的示例

 ## 当前实现约束

--- a/src/uipath_explainator/gemini.py
+++ b/src/uipath_explainator/gemini.py
@@ -8,6 +8,9 @@ import json
 from .config import Settings


+UNKNOWN_TEXT = "无法从当前文件确定"
+
+
@dataclass(slots=True)
 class GeminiAnalyzer:
    settings: Settings
@@ -36,33 +39,108 @@ class GeminiAnalyzer:
            config=self._types.GenerateContentConfig(
                temperature=0.2,
                response_mime_type="application/json",
-                response_schema={
-                    "type": "OBJECT",
-                    "required": ["summary", "implementation", "data", "example"],
-                    "properties": {
-                        "summary": {"type": "STRING"},
-                        "implementation": {"type": "STRING"},
-                        "data": {"type": "STRING"},
-                        "example": {"type": "STRING"},
-                    },
-                },
+                response_schema=self._response_schema(),
            ),
        )

        payload = json.loads(response.text)
        return self._to_markdown(relative_path, payload)

+    def _response_schema(self) -> dict[str, Any]:
+        return {
+            "type": "OBJECT",
+            "required": ["overview", "logic", "data", "consultation"],
+            "properties": {
+                "overview": {
+                    "type": "OBJECT",
+                    "required": ["purpose", "role", "trigger"],
+                    "properties": {
+                        "purpose": {"type": "STRING"},
+                        "role": {"type": "STRING"},
+                        "trigger": {"type": "STRING"},
+                    },
+                },
+                "logic": {
+                    "type": "OBJECT",
+                    "required": ["steps", "decision_logic", "exceptions"],
+                    "properties": {
+                        "steps": {
+                            "type": "ARRAY",
+                            "items": {
+                                "type": "OBJECT",
+                                "required": ["title", "detail", "why", "result"],
+                                "properties": {
+                                    "title": {"type": "STRING"},
+                                    "detail": {"type": "STRING"},
+                                    "why": {"type": "STRING"},
+                                    "result": {"type": "STRING"},
+                                },
+                            },
+                        },
+                        "decision_logic": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                        "exceptions": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                    },
+                },
+                "data": {
+                    "type": "OBJECT",
+                    "required": ["inputs", "outputs", "variables", "external_dependencies"],
+                    "properties": {
+                        "inputs": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                        "outputs": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                        "variables": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                        "external_dependencies": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                    },
+                },
+                "consultation": {
+                    "type": "OBJECT",
+                    "required": ["business_meaning", "risks", "example", "unknowns"],
+                    "properties": {
+                        "business_meaning": {"type": "STRING"},
+                        "risks": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                        "example": {"type": "STRING"},
+                        "unknowns": {
+                            "type": "ARRAY",
+                            "items": {"type": "STRING"},
+                        },
+                    },
+                },
+            },
+        }
+
    def _build_prompt(self, relative_path: Path, content: str) -> str:
        language = "xml" if relative_path.suffix.lower() == ".xaml" else "vb"
        return f"""你是资深 UiPath 自动化架构师。请只基于给定文件内容分析，不要臆造未出现的系统、字段或业务规则。

-请用中文返回 JSON，字段固定为：
- summary: 这个文件在做什么
- implementation: 这个文件怎么做
- data: 这个文件涉及的数据、变量、参数、外部文件或系统
- example: 一个贴近当前代码的说明性范例
+请严格返回 JSON，并且完全匹配给定 schema。所有字段都必须填写。

-如果信息不足，请明确写“无法从当前文件确定”。
+输出要求：
+1. 用中文，写成“给交接人员/业务方看的咨询说明”，不要只给一句笼统概括。
+2. 先讲这个文件在整个流程中的定位，再按执行顺序拆解步骤。
+3. 重点解释判断逻辑、调用链、输入输出、关键变量、外部依赖。
+4. 如果代码里能看出顺序动作，请在 steps 里拆成明确步骤；不要只复述节点名称。
+5. 无法确认的信息必须明确写“无法从当前文件确定”，数组字段则写成仅包含这一项的数组。
+6. 不要输出 JSON 以外的任何文字。

 文件路径: {relative_path.as_posix()}
 代码:
@@ -70,22 +148,109 @@ class GeminiAnalyzer:
 {content}
 ```"""

-    def _to_markdown(self, relative_path: Path, payload: dict[str, str]) -> str:
+    def _to_markdown(self, relative_path: Path, payload: dict[str, Any]) -> str:
+        overview = payload.get("overview") or {}
+        logic = payload.get("logic") or {}
+        data = payload.get("data") or {}
+        consultation = payload.get("consultation") or {}
+
        return "\n".join(
            [
                f"# {relative_path.as_posix()}",
                "",
-                "## 做什么",
-                payload.get("summary", "").strip() or "无法从当前文件确定",
+                "## 文件定位",
+                f"- 核心作用：{self._clean_text(overview.get('purpose'))}",
+                f"- 流程角色：{self._clean_text(overview.get('role'))}",
+                f"- 触发方式：{self._clean_text(overview.get('trigger'))}",
                "",
-                "## 怎么做",
-                payload.get("implementation", "").strip() or "无法从当前文件确定",
+                "## 流程拆解",
+                self._render_steps(logic.get("steps")),
                "",
-                "## 涉及数据",
-                payload.get("data", "").strip() or "无法从当前文件确定",
+                "## 关键逻辑",
+                self._render_bullets(logic.get("decision_logic")),
                "",
-                "## 范例",
-                payload.get("example", "").strip() or "无法从当前文件确定",
+                "## 异常与边界",
+                self._render_bullets(logic.get("exceptions")),
+                "",
+                "## 数据与依赖",
+                "### 输入",
+                self._render_bullets(data.get("inputs")),
+                "",
+                "### 输出",
+                self._render_bullets(data.get("outputs")),
+                "",
+                "### 关键变量",
+                self._render_bullets(data.get("variables")),
+                "",
+                "### 外部依赖",
+                self._render_bullets(data.get("external_dependencies")),
+                "",
+                "## 咨询视角",
+                f"- 业务含义：{self._clean_text(consultation.get('business_meaning'))}",
+                "",
+                "### 风险与注意点",
+                self._render_bullets(consultation.get("risks")),
+                "",
+                "### 无法确认的点",
+                self._render_bullets(consultation.get("unknowns")),
+                "",
+                "## 场景范例",
+                self._clean_text(consultation.get("example")),
                "",
            ]
-        )
+        ).rstrip() + "\n"
+
+    def _render_steps(self, items: Any) -> str:
+        if not isinstance(items, list):
+            items = []
+
+        normalized: list[dict[str, str]] = []
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            normalized.append(
+                {
+                    "title": self._clean_text(item.get("title")),
+                    "detail": self._clean_text(item.get("detail")),
+                    "why": self._clean_text(item.get("why")),
+                    "result": self._clean_text(item.get("result")),
+                }
+            )
+
+        if not normalized:
+            normalized = [
+                {
+                    "title": UNKNOWN_TEXT,
+                    "detail": UNKNOWN_TEXT,
+                    "why": UNKNOWN_TEXT,
+                    "result": UNKNOWN_TEXT,
+                }
+            ]
+
+        lines: list[str] = []
+        for index, item in enumerate(normalized, start=1):
+            lines.extend(
+                [
+                    f"{index}. **{item['title']}**",
+                    f"   - 动作：{item['detail']}",
+                    f"   - 目的：{item['why']}",
+                    f"   - 结果：{item['result']}",
+                ]
+            )
+        return "\n".join(lines)
+
+    def _render_bullets(self, items: Any) -> str:
+        if not isinstance(items, list):
+            items = []
+
+        cleaned = [self._clean_text(item) for item in items if isinstance(item, str) and item.strip()]
+        if not cleaned:
+            cleaned = [UNKNOWN_TEXT]
+        return "\n".join(f"- {item}" for item in cleaned)
+
+    def _clean_text(self, value: Any) -> str:
+        if isinstance(value, str):
+            stripped = value.strip()
+            if stripped:
+                return stripped
+        return UNKNOWN_TEXT
--- a/src/uipath_explainator/pipeline.py
+++ b/src/uipath_explainator/pipeline.py
@@ -148,6 +148,7 @@ class ProjectPipeline:

    def _build_overview(self, report: PipelineReport) -> str:
        warnings = "\n".join(f"- {item}" for item in report.warnings) or "- 无"
+        cleaned = "\n".join(f"- {item.as_posix()}" for item in report.cleaned_files) or "- 无"
        pruned = "\n".join(f"- {item.as_posix()}" for item in report.pruned_files) or "- 无"
        analyses = "\n".join(f"- {item.as_posix()}" for item in report.analysis_files) or "- 未启用 Gemini 分析"
        return f"""# UiPath Explainator Overview
@@ -164,6 +165,24 @@ class ProjectPipeline:
 ## Final Files
 {chr(10).join(f"- {item.as_posix()}" for item in report.final_files)}

+## Processing Logic
+1. **Initial Scan**: 从入口文件开始递归扫描 `Invoke Workflow` 与 `Invoke VBA`，得到初始依赖集合。这个阶段会把 `Comment Out` 里的引用也一起算进去，因为此时还未清理失效代码。
+2. **Copy to Workspace**: 将初始依赖集合完整复制到输出目录，保持相对目录结构不变，确保后续裁剪只发生在工作区副本中。
+3. **Comment Cleanup**: 对已复制的 XAML 删除 `<ui:CommentOut>` 代码块，让后续扫描只看到当前真正生效的流程节点。
+4. **Rescan After Cleanup**: 以清理后的入口文件重新递归扫描，得到清理后的实际依赖集合。
+5. **Prune Unused Files**: 删除首次扫描能到达、但二次扫描已不可达的文件。这些文件通常来自被注释掉的工作流、VBA，或清理后失效的调用链。
+6. **Gemini Analysis**: 仅针对最终保留文件生成 `*.analysis.md`，让说明文档与实际可执行流程保持一致。
+
+## How To Read This Output
+- `Initial Files`: 清理 `<ui:CommentOut>` 前扫描到的所有文件，代表“理论上被引用过”的集合。
+- `Final Files`: 清理后仍可达的文件，代表“当前实际流程会用到”的集合。
+- `Cleaned XAML Files`: 被移除 `<ui:CommentOut>` 代码块的文件。
+- `Pruned Files`: 只在失效分支中出现、已从输出目录删除的文件。
+- `Analysis Files`: Gemini 生成的逐文件说明，重点解释该文件的流程、逻辑、数据与风险点。
+
+## Cleaned XAML Files
+{cleaned}
+
 ## Pruned Files
 {pruned}

--- a/tests/test_gemini.py
+++ b/tests/test_gemini.py
@@ -81,6 +81,84 @@ class GeminiAnalyzerTests(unittest.TestCase):
            {"base_url": "https://newapi.tootaio.com", "timeout": 120_000},
        )

+    def test_markdown_output_uses_consulting_format(self) -> None:
+        fake_types = SimpleNamespace(
+            HttpOptions=FakeHttpOptions,
+            GenerateContentConfig=FakeGenerateContentConfig,
+        )
+        fake_genai = ModuleType("google.genai")
+        fake_genai.Client = FakeClient
+        fake_genai.types = fake_types
+
+        fake_google = ModuleType("google")
+        fake_google.genai = fake_genai
+
+        with patch.dict(sys.modules, {"google": fake_google, "google.genai": fake_genai}):
+            analyzer = GeminiAnalyzer(Settings(api_key="test-key", base_url=None, model="gemini-test"))
+
+        markdown = analyzer._to_markdown(
+            Path("Flows/Active.xaml"),
+            {
+                "overview": {
+                    "purpose": "负责调用 VBA 并准备执行上下文",
+                    "role": "主流程中的子流程节点",
+                    "trigger": "由上游工作流通过 Invoke Workflow 调用",
+                },
+                "logic": {
+                    "steps": [
+                        {
+                            "title": "读取 VBA 文件路径",
+                            "detail": "从 `CodeFilePath` 读取脚本位置。",
+                            "why": "定位需要执行的 VBA 代码。",
+                            "result": "得到待执行的脚本文件。",
+                        }
+                    ],
+                    "decision_logic": ["如果 `CodeFilePath` 为空，则无法继续执行脚本。"],
+                    "exceptions": ["当前文件未展示脚本执行失败后的补偿逻辑。"],
+                },
+                "data": {
+                    "inputs": ["`Scripts/Keep.bas` 路径"],
+                    "outputs": ["VBA 执行结果未直接在当前文件中落盘"],
+                    "variables": ["`CodeFilePath`"],
+                    "external_dependencies": ["外部 VBA 文件 `Scripts/Keep.bas`"],
+                },
+                "consultation": {
+                    "business_meaning": "这是把业务动作下沉到 VBA 的桥接层。",
+                    "risks": ["脚本文件缺失会导致执行失败。"],
+                    "example": "例如：财务流程在这里调用 Excel VBA 完成批量格式整理。",
+                    "unknowns": ["无法从当前文件确定 VBA 内部实现逻辑。"],
+                },
+            },
+        )
+
+        self.assertIn("## 文件定位", markdown)
+        self.assertIn("## 流程拆解", markdown)
+        self.assertIn("1. **读取 VBA 文件路径**", markdown)
+        self.assertIn("### 输入", markdown)
+        self.assertIn("## 咨询视角", markdown)
+        self.assertIn("## 场景范例", markdown)
+
+    def test_prompt_requires_strict_structured_json(self) -> None:
+        fake_types = SimpleNamespace(
+            HttpOptions=FakeHttpOptions,
+            GenerateContentConfig=FakeGenerateContentConfig,
+        )
+        fake_genai = ModuleType("google.genai")
+        fake_genai.Client = FakeClient
+        fake_genai.types = fake_types
+
+        fake_google = ModuleType("google")
+        fake_google.genai = fake_genai
+
+        with patch.dict(sys.modules, {"google": fake_google, "google.genai": fake_genai}):
+            analyzer = GeminiAnalyzer(Settings(api_key="test-key", base_url=None, model="gemini-test"))
+
+        prompt = analyzer._build_prompt(Path("main.xaml"), "<Sequence />")
+
+        self.assertIn("请严格返回 JSON", prompt)
+        self.assertIn("先讲这个文件在整个流程中的定位", prompt)
+        self.assertIn("判断逻辑、调用链、输入输出、关键变量、外部依赖", prompt)
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -108,6 +108,12 @@ class PipelineTests(unittest.TestCase):
            self.assertFalse((output_root / "Scripts" / "Drop.bas").exists())
            self.assertTrue((output_root / "Flows" / "Active.xaml.analysis.md").exists())

+            overview = (output_root / "OVERVIEW.md").read_text(encoding="utf-8")
+            self.assertIn("## Processing Logic", overview)
+            self.assertIn("Initial Scan", overview)
+            self.assertIn("## How To Read This Output", overview)
+            self.assertIn("## Cleaned XAML Files", overview)
+

 if __name__ == "__main__":
    unittest.main()