diff --git a/README.md b/README.md index 4434434..0663a1b 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,14 @@ GEMINI_MODEL=gemini-2.5-flash 输出目录会保留原项目的相对目录结构,并额外生成: - `manifest.json`: 扫描、剪枝、告警和分析结果清单 -- `OVERVIEW.md`: 总览摘要 -- `*.analysis.md`: 每个最终保留文件对应的 Gemini 说明 +- `OVERVIEW.md`: 总览说明书,除了文件数量,还会解释“初次扫描 -> 清理 Comment Out -> 二次扫描 -> 剪枝 -> Gemini 分析”的整条流水线逻辑 +- `*.analysis.md`: 每个最终保留文件对应的 Gemini 咨询式说明,固定包含: + - 文件定位:这个文件在整个流程中的角色、触发方式、核心作用 + - 流程拆解:按步骤说明它是怎么执行的 + - 关键逻辑:条件判断、调用关系、边界情况 + - 数据与依赖:输入、输出、变量、外部文件或系统 + - 咨询视角:业务含义、风险点、无法确认的信息 + - 场景范例:帮助交接和理解的示例 ## 当前实现约束 diff --git a/src/uipath_explainator/gemini.py b/src/uipath_explainator/gemini.py index a9b12e0..99d9fa3 100644 --- a/src/uipath_explainator/gemini.py +++ b/src/uipath_explainator/gemini.py @@ -8,6 +8,9 @@ import json from .config import Settings +UNKNOWN_TEXT = "无法从当前文件确定" + + @dataclass(slots=True) class GeminiAnalyzer: settings: Settings @@ -36,33 +39,108 @@ class GeminiAnalyzer: config=self._types.GenerateContentConfig( temperature=0.2, response_mime_type="application/json", - response_schema={ - "type": "OBJECT", - "required": ["summary", "implementation", "data", "example"], - "properties": { - "summary": {"type": "STRING"}, - "implementation": {"type": "STRING"}, - "data": {"type": "STRING"}, - "example": {"type": "STRING"}, - }, - }, + response_schema=self._response_schema(), ), ) payload = json.loads(response.text) return self._to_markdown(relative_path, payload) + def _response_schema(self) -> dict[str, Any]: + return { + "type": "OBJECT", + "required": ["overview", "logic", "data", "consultation"], + "properties": { + "overview": { + "type": "OBJECT", + "required": ["purpose", "role", "trigger"], + "properties": { + "purpose": {"type": "STRING"}, + "role": {"type": "STRING"}, + "trigger": {"type": "STRING"}, + }, + }, + "logic": { + "type": "OBJECT", + "required": ["steps", "decision_logic", "exceptions"], + "properties": { + "steps": { + "type": "ARRAY", + "items": { + "type": "OBJECT", 
+ "required": ["title", "detail", "why", "result"], + "properties": { + "title": {"type": "STRING"}, + "detail": {"type": "STRING"}, + "why": {"type": "STRING"}, + "result": {"type": "STRING"}, + }, + }, + }, + "decision_logic": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + "exceptions": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + }, + }, + "data": { + "type": "OBJECT", + "required": ["inputs", "outputs", "variables", "external_dependencies"], + "properties": { + "inputs": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + "outputs": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + "variables": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + "external_dependencies": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + }, + }, + "consultation": { + "type": "OBJECT", + "required": ["business_meaning", "risks", "example", "unknowns"], + "properties": { + "business_meaning": {"type": "STRING"}, + "risks": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + "example": {"type": "STRING"}, + "unknowns": { + "type": "ARRAY", + "items": {"type": "STRING"}, + }, + }, + }, + }, + } + def _build_prompt(self, relative_path: Path, content: str) -> str: language = "xml" if relative_path.suffix.lower() == ".xaml" else "vb" return f"""你是资深 UiPath 自动化架构师。请只基于给定文件内容分析,不要臆造未出现的系统、字段或业务规则。 -请用中文返回 JSON,字段固定为: -- summary: 这个文件在做什么 -- implementation: 这个文件怎么做 -- data: 这个文件涉及的数据、变量、参数、外部文件或系统 -- example: 一个贴近当前代码的说明性范例 +请严格返回 JSON,并且完全匹配给定 schema。所有字段都必须填写。 -如果信息不足,请明确写“无法从当前文件确定”。 +输出要求: +1. 用中文,写成“给交接人员/业务方看的咨询说明”,不要只给一句笼统概括。 +2. 先讲这个文件在整个流程中的定位,再按执行顺序拆解步骤。 +3. 重点解释判断逻辑、调用链、输入输出、关键变量、外部依赖。 +4. 如果代码里能看出顺序动作,请在 steps 里拆成明确步骤;不要只复述节点名称。 +5. 无法确认的信息必须明确写“无法从当前文件确定”,数组字段则写成仅包含这一项的数组。 +6. 
不要输出 JSON 以外的任何文字。 文件路径: {relative_path.as_posix()} 代码: @@ -70,22 +148,109 @@ class GeminiAnalyzer: {content} ```""" - def _to_markdown(self, relative_path: Path, payload: dict[str, str]) -> str: + def _to_markdown(self, relative_path: Path, payload: dict[str, Any]) -> str: + overview = payload.get("overview") or {} + logic = payload.get("logic") or {} + data = payload.get("data") or {} + consultation = payload.get("consultation") or {} + return "\n".join( [ f"# {relative_path.as_posix()}", "", - "## 做什么", - payload.get("summary", "").strip() or "无法从当前文件确定", + "## 文件定位", + f"- 核心作用:{self._clean_text(overview.get('purpose'))}", + f"- 流程角色:{self._clean_text(overview.get('role'))}", + f"- 触发方式:{self._clean_text(overview.get('trigger'))}", "", - "## 怎么做", - payload.get("implementation", "").strip() or "无法从当前文件确定", + "## 流程拆解", + self._render_steps(logic.get("steps")), "", - "## 涉及数据", - payload.get("data", "").strip() or "无法从当前文件确定", + "## 关键逻辑", + self._render_bullets(logic.get("decision_logic")), "", - "## 范例", - payload.get("example", "").strip() or "无法从当前文件确定", + "## 异常与边界", + self._render_bullets(logic.get("exceptions")), + "", + "## 数据与依赖", + "### 输入", + self._render_bullets(data.get("inputs")), + "", + "### 输出", + self._render_bullets(data.get("outputs")), + "", + "### 关键变量", + self._render_bullets(data.get("variables")), + "", + "### 外部依赖", + self._render_bullets(data.get("external_dependencies")), + "", + "## 咨询视角", + f"- 业务含义:{self._clean_text(consultation.get('business_meaning'))}", + "", + "### 风险与注意点", + self._render_bullets(consultation.get("risks")), + "", + "### 无法确认的点", + self._render_bullets(consultation.get("unknowns")), + "", + "## 场景范例", + self._clean_text(consultation.get("example")), "", ] - ) + ).rstrip() + "\n" + + def _render_steps(self, items: Any) -> str: + if not isinstance(items, list): + items = [] + + normalized: list[dict[str, str]] = [] + for item in items: + if not isinstance(item, dict): + continue + normalized.append( + { + "title": 
self._clean_text(item.get("title")), + "detail": self._clean_text(item.get("detail")), + "why": self._clean_text(item.get("why")), + "result": self._clean_text(item.get("result")), + } + ) + + if not normalized: + normalized = [ + { + "title": UNKNOWN_TEXT, + "detail": UNKNOWN_TEXT, + "why": UNKNOWN_TEXT, + "result": UNKNOWN_TEXT, + } + ] + + lines: list[str] = [] + for index, item in enumerate(normalized, start=1): + lines.extend( + [ + f"{index}. **{item['title']}**", + f" - 动作:{item['detail']}", + f" - 目的:{item['why']}", + f" - 结果:{item['result']}", + ] + ) + return "\n".join(lines) + + def _render_bullets(self, items: Any) -> str: + if not isinstance(items, list): + items = [] + + cleaned = [self._clean_text(item) for item in items if isinstance(item, str) and item.strip()] + if not cleaned: + cleaned = [UNKNOWN_TEXT] + return "\n".join(f"- {item}" for item in cleaned) + + def _clean_text(self, value: Any) -> str: + if isinstance(value, str): + stripped = value.strip() + if stripped: + return stripped + return UNKNOWN_TEXT diff --git a/src/uipath_explainator/pipeline.py b/src/uipath_explainator/pipeline.py index 9ce7897..38f6060 100644 --- a/src/uipath_explainator/pipeline.py +++ b/src/uipath_explainator/pipeline.py @@ -148,6 +148,7 @@ class ProjectPipeline: def _build_overview(self, report: PipelineReport) -> str: warnings = "\n".join(f"- {item}" for item in report.warnings) or "- 无" + cleaned = "\n".join(f"- {item.as_posix()}" for item in report.cleaned_files) or "- 无" pruned = "\n".join(f"- {item.as_posix()}" for item in report.pruned_files) or "- 无" analyses = "\n".join(f"- {item.as_posix()}" for item in report.analysis_files) or "- 未启用 Gemini 分析" return f"""# UiPath Explainator Overview @@ -164,6 +165,24 @@ class ProjectPipeline: ## Final Files {chr(10).join(f"- {item.as_posix()}" for item in report.final_files)} +## Processing Logic +1. 
**Initial Scan**: 从入口文件开始递归扫描 `Invoke Workflow` 与 `Invoke VBA`,得到初始依赖集合。这个阶段会把 `Comment Out` 里的引用也一起算进去,因为此时还未清理失效代码。 +2. **Copy to Workspace**: 将初始依赖集合完整复制到输出目录,保持相对目录结构不变,确保后续裁剪只发生在工作区副本中。 +3. **Comment Cleanup**: 对已复制的 XAML 删除 `Comment Out` 代码块,让后续扫描只看到当前真正生效的流程节点。 +4. **Rescan After Cleanup**: 以清理后的入口文件重新递归扫描,得到清理后的实际依赖集合。 +5. **Prune Unused Files**: 删除首次扫描能到达、但二次扫描已不可达的文件。这些文件通常来自被注释掉的工作流、VBA,或清理后失效的调用链。 +6. **Gemini Analysis**: 仅针对最终保留文件生成 `*.analysis.md`,让说明文档与实际可执行流程保持一致。 + +## How To Read This Output +- `Initial Files`: 清理 `Comment Out` 前扫描到的所有文件,代表“理论上被引用过”的集合。 +- `Final Files`: 清理后仍可达的文件,代表“当前实际流程会用到”的集合。 +- `Cleaned XAML Files`: 被移除 `Comment Out` 代码块的文件。 +- `Pruned Files`: 只在失效分支中出现、已从输出目录删除的文件。 +- `Analysis Files`: Gemini 生成的逐文件说明,重点解释该文件的流程、逻辑、数据与风险点。 + +## Cleaned XAML Files +{cleaned} + ## Pruned Files {pruned} diff --git a/tests/test_gemini.py b/tests/test_gemini.py index 787af8a..83ac7a4 100644 --- a/tests/test_gemini.py +++ b/tests/test_gemini.py @@ -81,6 +81,84 @@ class GeminiAnalyzerTests(unittest.TestCase): {"base_url": "https://newapi.tootaio.com", "timeout": 120_000}, ) + def test_markdown_output_uses_consulting_format(self) -> None: + fake_types = SimpleNamespace( + HttpOptions=FakeHttpOptions, + GenerateContentConfig=FakeGenerateContentConfig, + ) + fake_genai = ModuleType("google.genai") + fake_genai.Client = FakeClient + fake_genai.types = fake_types + + fake_google = ModuleType("google") + fake_google.genai = fake_genai + + with patch.dict(sys.modules, {"google": fake_google, "google.genai": fake_genai}): + analyzer = GeminiAnalyzer(Settings(api_key="test-key", base_url=None, model="gemini-test")) + + markdown = analyzer._to_markdown( + Path("Flows/Active.xaml"), + { + "overview": { + "purpose": "负责调用 VBA 并准备执行上下文", + "role": "主流程中的子流程节点", + "trigger": "由上游工作流通过 Invoke Workflow 调用", + }, + "logic": { + "steps": [ + { + "title": "读取 VBA 文件路径", + "detail": "从 `CodeFilePath` 读取脚本位置。", + "why": "定位需要执行的 VBA 代码。", + "result": "得到待执行的脚本文件。", + } + ], + "decision_logic": ["如果 
`CodeFilePath` 为空,则无法继续执行脚本。"], + "exceptions": ["当前文件未展示脚本执行失败后的补偿逻辑。"], + }, + "data": { + "inputs": ["`Scripts/Keep.bas` 路径"], + "outputs": ["VBA 执行结果未直接在当前文件中落盘"], + "variables": ["`CodeFilePath`"], + "external_dependencies": ["外部 VBA 文件 `Scripts/Keep.bas`"], + }, + "consultation": { + "business_meaning": "这是把业务动作下沉到 VBA 的桥接层。", + "risks": ["脚本文件缺失会导致执行失败。"], + "example": "例如:财务流程在这里调用 Excel VBA 完成批量格式整理。", + "unknowns": ["无法从当前文件确定 VBA 内部实现逻辑。"], + }, + }, + ) + + self.assertIn("## 文件定位", markdown) + self.assertIn("## 流程拆解", markdown) + self.assertIn("1. **读取 VBA 文件路径**", markdown) + self.assertIn("### 输入", markdown) + self.assertIn("## 咨询视角", markdown) + self.assertIn("## 场景范例", markdown) + + def test_prompt_requires_strict_structured_json(self) -> None: + fake_types = SimpleNamespace( + HttpOptions=FakeHttpOptions, + GenerateContentConfig=FakeGenerateContentConfig, + ) + fake_genai = ModuleType("google.genai") + fake_genai.Client = FakeClient + fake_genai.types = fake_types + + fake_google = ModuleType("google") + fake_google.genai = fake_genai + + with patch.dict(sys.modules, {"google": fake_google, "google.genai": fake_genai}): + analyzer = GeminiAnalyzer(Settings(api_key="test-key", base_url=None, model="gemini-test")) + + prompt = analyzer._build_prompt(Path("main.xaml"), "") + + self.assertIn("请严格返回 JSON", prompt) + self.assertIn("先讲这个文件在整个流程中的定位", prompt) + self.assertIn("判断逻辑、调用链、输入输出、关键变量、外部依赖", prompt) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index ae5aeb4..06c39f8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -108,6 +108,12 @@ class PipelineTests(unittest.TestCase): self.assertFalse((output_root / "Scripts" / "Drop.bas").exists()) self.assertTrue((output_root / "Flows" / "Active.xaml.analysis.md").exists()) + overview = (output_root / "OVERVIEW.md").read_text(encoding="utf-8") + self.assertIn("## Processing Logic", overview) + self.assertIn("Initial Scan", 
overview) + self.assertIn("## How To Read This Output", overview) + self.assertIn("## Cleaned XAML Files", overview) + if __name__ == "__main__": unittest.main()