from __future__ import annotations from dataclasses import dataclass from pathlib import Path from shutil import copy2, rmtree import json import logging from time import perf_counter from .scanner import ScanResult, crawl_dependencies, find_entry_file, read_text, strip_comment_out_blocks logger = logging.getLogger(__name__) @dataclass(slots=True) class PipelineReport: project_root: Path output_root: Path code_root: Path docs_root: Path entry_file: Path initial_files: list[Path] final_files: list[Path] pruned_files: list[Path] cleaned_files: list[Path] warnings: list[str] analysis_files: list[Path] def to_json(self) -> str: return json.dumps( { "project_root": self.project_root.as_posix(), "output_root": self.output_root.as_posix(), "code_root": self.code_root.as_posix(), "docs_root": self.docs_root.as_posix(), "entry_file": self.entry_file.as_posix(), "initial_files": [item.as_posix() for item in self.initial_files], "final_files": [item.as_posix() for item in self.final_files], "pruned_files": [item.as_posix() for item in self.pruned_files], "cleaned_files": [item.as_posix() for item in self.cleaned_files], "warnings": self.warnings, "analysis_files": [item.as_posix() for item in self.analysis_files], }, ensure_ascii=False, indent=2, ) class ProjectPipeline: def __init__(self, project_root: Path, output_root: Path, entry_name: str, force: bool = False) -> None: self.project_root = project_root.resolve() self.output_root = output_root.resolve() self.code_root = self.output_root / "code" self.docs_root = self.output_root / "docs" self.entry_name = entry_name self.force = force def run(self, analyzer=None) -> PipelineReport: started = perf_counter() logger.info( "Starting pipeline: project_root=%s output_root=%s entry=%s analysis=%s force=%s", self.project_root, self.output_root, self.entry_name, analyzer is not None, self.force, ) entry_file = find_entry_file(self.project_root, self.entry_name) entry_rel = entry_file.relative_to(self.project_root) logger.debug("Using entry file %s", entry_rel.as_posix()) initial_scan = crawl_dependencies(self.project_root, entry_file) initial_rel_files = self._relative_files(initial_scan, self.project_root) logger.info( "Initial scan complete: files=%d warnings=%d", len(initial_rel_files), len(initial_scan.warnings), ) self._prepare_output_root() self._copy_files(initial_rel_files) cleaned_files = self._clean_copied_xaml_files(initial_rel_files) logger.info("Copied %d files and cleaned %d XAML files", len(initial_rel_files), len(cleaned_files)) final_scan = crawl_dependencies(self.code_root, self.code_root / entry_rel) final_rel_files = self._relative_files(final_scan, self.code_root) pruned_files = self._prune_unused_files(initial_rel_files, final_rel_files) logger.info( "Final scan complete: files=%d warnings=%d pruned=%d", len(final_rel_files), len(final_scan.warnings), len(pruned_files), ) analysis_files = self._write_analysis(final_rel_files, analyzer) warnings = initial_scan.warnings + final_scan.warnings report = PipelineReport( project_root=self.project_root, output_root=self.output_root, code_root=self.code_root, docs_root=self.docs_root, entry_file=entry_rel, initial_files=initial_rel_files, final_files=final_rel_files, pruned_files=pruned_files, cleaned_files=cleaned_files, warnings=warnings, analysis_files=analysis_files, ) self._write_report_files(report) logger.info( "Pipeline completed in %.2fs: final_files=%d analysis_files=%d warnings=%d", perf_counter() - started, len(report.final_files), len(report.analysis_files), len(report.warnings), ) return report def _prepare_output_root(self) -> None: if self.output_root.exists(): if not self.force: raise FileExistsError(f"Output directory already exists: {self.output_root}") logger.info("Removing existing output directory because force=True: %s", self.output_root) rmtree(self.output_root) self.code_root.mkdir(parents=True, exist_ok=True) self.docs_root.mkdir(parents=True, exist_ok=True) logger.debug("Prepared output directories: code=%s docs=%s", self.code_root, self.docs_root) def _copy_files(self, relative_files: list[Path]) -> None: for relative_path in relative_files: source = self.project_root / relative_path destination = self.code_root / relative_path destination.parent.mkdir(parents=True, exist_ok=True) copy2(source, destination) logger.debug("Copied file: %s -> %s", source, destination) def _clean_copied_xaml_files(self, relative_files: list[Path]) -> list[Path]: cleaned: list[Path] = [] for relative_path in relative_files: if relative_path.suffix.lower() != ".xaml": continue output_file = self.code_root / relative_path original = read_text(output_file) updated = strip_comment_out_blocks(original) if updated != original: output_file.write_text(updated, encoding="utf-8") cleaned.append(relative_path) logger.debug("Removed CommentOut blocks from %s", output_file) return cleaned def _prune_unused_files(self, initial_files: list[Path], final_files: list[Path]) -> list[Path]: final_set = set(final_files) pruned: list[Path] = [] for relative_path in initial_files: if relative_path in final_set: continue target = self.code_root / relative_path if target.exists(): target.unlink() pruned.append(relative_path) logger.debug("Pruned unreachable file: %s", target) self._cleanup_empty_dirs() return pruned def _cleanup_empty_dirs(self) -> None: directories = sorted( [path for path in self.code_root.rglob("*") if path.is_dir()], key=lambda item: len(item.parts), reverse=True, ) for directory in directories: if any(directory.iterdir()): continue directory.rmdir() logger.debug("Removed empty directory: %s", directory) def _write_analysis(self, final_files: list[Path], analyzer) -> list[Path]: if analyzer is None: logger.info("Skipping Gemini analysis because analyzer is disabled") return [] output_files: list[Path] = [] for relative_path in self._ordered_files(final_files): content = read_text(self.code_root / relative_path) analysis = analyzer.analyze(relative_path, content) analysis_path = self.docs_root / f"{relative_path.as_posix()}.analysis.md" analysis_path.parent.mkdir(parents=True, exist_ok=True) analysis_path.write_text(analysis, encoding="utf-8") output_files.append(Path(f"{relative_path.as_posix()}.analysis.md")) logger.debug("Wrote analysis file: %s", analysis_path) return output_files def _write_report_files(self, report: PipelineReport) -> None: (self.docs_root / "manifest.json").write_text(report.to_json(), encoding="utf-8") (self.docs_root / "OVERVIEW.md").write_text(self._build_overview(report), encoding="utf-8") logger.debug("Wrote report files to %s", self.docs_root) def _build_overview(self, report: PipelineReport) -> str: warnings = "\n".join(f"- {item}" for item in report.warnings) or "- 无" cleaned = "\n".join(f"- {item.as_posix()}" for item in report.cleaned_files) or "- 无" pruned = "\n".join(f"- {item.as_posix()}" for item in report.pruned_files) or "- 无" analyses = "\n".join(f"- {item.as_posix()}" for item in report.analysis_files) or "- 未启用 Gemini 分析" return f"""# UiPath Explainator Overview - Project Root: `{report.project_root.as_posix()}` - Output Root: `{report.output_root.as_posix()}` - Code Root: `{report.code_root.as_posix()}` - Docs Root: `{report.docs_root.as_posix()}` - Entry File: `{report.entry_file.as_posix()}` - Initial Files: {len(report.initial_files)} - Final Files: {len(report.final_files)} - Cleaned XAML Files: {len(report.cleaned_files)} - Pruned Files: {len(report.pruned_files)} - Analysis Files: {len(report.analysis_files)} ## Final Files {chr(10).join(f"- {item.as_posix()}" for item in report.final_files)} ## Processing Logic 1. **Initial Scan**: 从入口文件开始递归扫描 `Invoke Workflow` 与 `Invoke VBA`,得到初始依赖集合。这个阶段会把 `Comment Out` 里的引用也一起算进去,因为此时还未清理失效代码。 2. **Copy to Workspace**: 将初始依赖集合完整复制到 `code/` 目录,保持相对目录结构不变,确保后续裁剪只发生在代码副本中。 3. **Comment Cleanup**: 对已复制的 XAML 删除 `` 代码块,让后续扫描只看到当前真正生效的流程节点。 4. **Rescan After Cleanup**: 以清理后的入口文件重新递归扫描,得到清理后的实际依赖集合。 5. **Prune Unused Files**: 删除 `code/` 目录中首次扫描能到达、但二次扫描已不可达的文件。这些文件通常来自被注释掉的工作流、VBA,或清理后失效的调用链。 6. **Gemini Analysis**: 仅针对最终保留文件在 `docs/` 目录生成 `*.analysis.md`,让说明文档与实际可执行流程保持一致。 ## How To Read This Output - `Initial Files`: 清理 `` 前扫描到的所有文件,代表“理论上被引用过”的集合。 - `Final Files`: 清理后 `code/` 目录中仍可达的文件,代表“当前实际流程会用到”的集合。 - `Cleaned XAML Files`: 被移除 `` 代码块的文件。 - `Pruned Files`: 只在失效分支中出现、已从 `code/` 目录删除的文件。 - `Analysis Files`: `docs/` 目录下 Gemini 生成的逐文件说明,重点解释该文件的流程、逻辑、数据与风险点。 ## Cleaned XAML Files {cleaned} ## Pruned Files {pruned} ## Analysis Files {analyses} ## Warnings {warnings} """ def _relative_files(self, scan: ScanResult, root: Path) -> list[Path]: return sorted(path.relative_to(root) for path in scan.files) def _ordered_files(self, paths: list[Path]) -> list[Path]: return sorted(paths, key=lambda item: (item.suffix.lower() != ".xaml", item.as_posix().lower()))