From a4bf39a9584fa81c145418329fe0de5c0edc837a Mon Sep 17 00:00:00 2001 From: kingsmai Date: Tue, 22 Apr 2025 09:11:40 +0800 Subject: [PATCH] Open Source --- .gitignore | 52 +++++++++++ LICENSE | 21 +++++ README.md | 111 +++++++++++++++++++++++ frontend/v1/index.html | 67 ++++++++++++++ frontend/v1/script.js | 98 +++++++++++++++++++++ frontend/v1/style.css | 82 +++++++++++++++++ frontend/v2/index.html | 41 +++++++++ frontend/v2/script.js | 108 +++++++++++++++++++++++ frontend/v2/style.css | 115 ++++++++++++++++++++++++ requirements.txt | 11 +++ scrapper/1 main.py | 136 +++++++++++++++++++++++++++++ scrapper/2 asset_downloader.py | 72 +++++++++++++++ scrapper/3 image_downloader.py | 61 +++++++++++++ scrapper/4 kenney_data_to_local.py | 68 +++++++++++++++ 14 files changed, 1043 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 frontend/v1/index.html create mode 100644 frontend/v1/script.js create mode 100644 frontend/v1/style.css create mode 100644 frontend/v2/index.html create mode 100644 frontend/v2/script.js create mode 100644 frontend/v2/style.css create mode 100644 requirements.txt create mode 100644 scrapper/1 main.py create mode 100644 scrapper/2 asset_downloader.py create mode 100644 scrapper/3 image_downloader.py create mode 100644 scrapper/4 kenney_data_to_local.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..81914c3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# Python 缓存文件 +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.pyc + +# 编辑器配置 +.vscode/ +.idea/ +*.sublime-workspace +*.sublime-project + +# OS 系统文件 +.DS_Store +Thumbs.db + +# 虚拟环境 +.env/ +.venv/ +venv/ +ENV/ +env.bak/ + +# 日志与运行时生成 +*.log + +# 数据与下载目录 +scrapper/kenney_data.json +scrapper/kenney_data_local.json +scrapper/kenney_links.txt +kenney_assets/ +kenney_assets_images/ + +# 临时测试文件 +*.tmp +*.bak +*.old + +# node/npm/yarn(如前端使用过) +node_modules/ +dist/ +build/ +npm-debug.log* +yarn-debug.log* 
+yarn-error.log* +pnpm-debug.log* + +# 压缩包等导出文件 +*.zip +*.tar.gz +*.rar \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a5d92b8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 [kingsmai] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..eaeb0e6 --- /dev/null +++ b/README.md @@ -0,0 +1,111 @@ +# 🧲 Kenney Asset Scrapper + +一个为开发者而生的自动化工具,专为抓取、整理和本地化 [Kenney.nl](https://kenney.nl/assets) 免费资源库而设计。支持批量提取资源信息、下载素材包和图片,并可生成本地资源索引,方便在离线环境中高效使用。 + +--- + +## 📁 项目结构 + +```plaintext +Kenney-Asset-Scrapper/ +├── frontend/ # 已废弃的旧前端版本(v1~v3) +│ ├── v1/ +│ ├── v2/ +│ └── v3/ +├── scrapper/ # 爬虫与处理核心 +│ ├── main.py # 爬取所有资源链接与详情 +│ ├── asset_downloader.py # 批量下载 ZIP 素材包 +│ ├── image_downloader.py # 批量下载预览图 +│ ├── kenney_data_to_local.py # 替换 JSON 为本地路径 +``` + +--- + +## 🚀 功能亮点 + +| 模块 | 功能 | +|------|------| +| `main.py` | 爬取 Kenney 网站资源列表与详情,包括标题、分类、版本日志、预览图与下载链接,并保存为结构化 JSON | +| `asset_downloader.py` | 根据 JSON 批量下载 ZIP 资源,自动分类存储(按类型 + 系列) | +| `image_downloader.py` | 批量下载所有预览图,结构化存储至本地 | +| `kenney_data_to_local.py` | 将所有网络路径(资源/图片)替换为对应的本地路径,方便离线访问或嵌入其他系统 | + +--- + +## 🛠 使用说明 + +1. **抓取资源页面数据** + + ```bash + python scrapper/main.py + ``` + + ⏳ 会生成 `kenney_links.txt` 和 `kenney_data.json`。 + +2. **下载 ZIP 素材包** + + ```bash + python scrapper/asset_downloader.py + ``` + + ✅ 下载到 `kenney_assets/`,按分类 & 系列整理。 + +3. **下载图片预览** + + ```bash + python scrapper/image_downloader.py + ``` + + 🎨 所有图片将下载到 `kenney_assets_images/`,与资源信息匹配。 + +4. **本地路径替换** + + ```bash + python scrapper/kenney_data_to_local.py + ``` + + 📝 生成 `kenney_data_local.json`,将所有 `download` 和 `images` 字段替换为本地路径。 + +--- + +## 🗃 示例结构 + +```plaintext +kenney_assets/ +└── 2D/ + └── Roguelike RPG/ + └── Roguelike RPG V1.4.zip + +kenney_assets_images/ +└── 2D/ + └── Roguelike RPG/ + └── Roguelike RPG/ + ├── preview1.png + ├── preview2.jpg + └── ... 
+``` + +--- + +## 🧪 注意事项 + +- 所有请求均带有浏览器头(User-Agent)以防被屏蔽 +- 下载任务中加入了随机延迟以模拟人类操作,避免触发反爬机制 +- 项目未来将继续支持更多格式,如 CSV 导出、数据库同步、本地 Web UI 预览等 + +--- + +## 📦 前端说明 + +原始 `frontend/` 文件夹中的 v1~v3 版本已**弃用**,请转而使用新的 Vue 前端项目 👉 [`kenney-asset-frontend-vue`](https://github.com/kingsmai/kenney-asset-frontend-vue) + +--- + +## 📜 License + +MIT License © 2025 — Inspired by the generosity of **Kenney** and his contributions to the open game development community. + +--- + +如需增强或集成至你的工作流,欢迎提 Issue 或 PR! +Let's automate creativity! 🎮✨ diff --git a/frontend/v1/index.html b/frontend/v1/index.html new file mode 100644 index 0000000..e1b0162 --- /dev/null +++ b/frontend/v1/index.html @@ -0,0 +1,67 @@ + + + + + + Kenney Asset Gallery + + + + + + + + + +

🎮 Kenney Asset Gallery

+ + +
+ + + + +
/**
 * Fill the category and tag <select> elements with the distinct values
 * found in the asset data.
 *
 * Reads `properties.Category[0]` and `properties.Tags` from every record and
 * appends one <option> per distinct value (sorted with the default string
 * ordering) to the elements with ids "categoryFilter" and "tagFilter".
 *
 * @param {Array<Object>} data - Asset records loaded from the JSON index.
 */
function populateFilters(data) {
  const categories = new Set();
  const tags = new Set();

  data.forEach(item => {
    // A record without a category used to add `undefined` to the set, which
    // then showed up as a literal "undefined" entry in the dropdown.
    const category = item.properties?.Category?.[0];
    if (category) {
      categories.add(category);
    }

    // The scraper stores a list when the table cell contains links and a
    // plain string otherwise; a string has no forEach, so wrap it.
    const rawTags = item.properties?.Tags;
    const tagList = Array.isArray(rawTags) ? rawTags : (rawTags ? [rawTags] : []);
    tagList.forEach(tag => tags.add(tag));
  });

  // Shared by both dropdowns: append one <option> per value.
  const appendOptions = (selectId, values) => {
    const select = document.getElementById(selectId);
    [...values].sort().forEach(value => {
      const option = document.createElement("option");
      option.value = value;
      option.textContent = value;
      select.appendChild(option);
    });
  };

  appendOptions("categoryFilter", categories);
  appendOptions("tagFilter", tags);
}
+
${item.title}
+
Tags: ${tags}
+ +
/**
 * Re-render the gallery so that it only shows the assets matching the
 * currently selected category and tag.
 *
 * Reads the values of the "categoryFilter" and "tagFilter" selects; the value
 * 'all' disables the corresponding criterion. Uses the module-level `allData`
 * array as input and hands the matching records to `render`.
 */
function filterGallery() {
  const selectedCategory = document.getElementById("categoryFilter").value;
  const selectedTag = document.getElementById("tagFilter").value;

  const visible = [];
  for (const asset of allData) {
    const categoryOk =
      selectedCategory === 'all' ||
      asset.properties?.Category?.[0] === selectedCategory;
    const tagOk =
      selectedTag === 'all' ||
      (asset.properties?.Tags || []).includes(selectedTag);

    if (categoryOk && tagOk) {
      visible.push(asset);
    }
  }

  render(visible);
}
text-decoration: none; + font-size: 0.9rem; + transition: background 0.2s; +} + +.download-btn:hover { + background: #45a049; +} + +#filters { + display: flex; + justify-content: center; + gap: 1rem; + margin-bottom: 1.5rem; +} + +select { + padding: 0.5rem; + font-size: 1rem; +} diff --git a/frontend/v2/index.html b/frontend/v2/index.html new file mode 100644 index 0000000..72a170f --- /dev/null +++ b/frontend/v2/index.html @@ -0,0 +1,41 @@ + + + + + Kenney 资源库 + + + + + + + + + + +
+ + +
+ +
+ ← 请选择左侧资源以查看详情 +
+
+
+ + + + + + + diff --git a/frontend/v2/script.js b/frontend/v2/script.js new file mode 100644 index 0000000..bdfb75c --- /dev/null +++ b/frontend/v2/script.js @@ -0,0 +1,108 @@ +let allData = []; +let currentActive = null; + +// 定义一个函数,用于将字符串中的反斜杠替换为正斜杠 +function sanitize(str) { + return str.replaceAll("\\", "/"); +} + +function renderList(data) { + const list = document.getElementById("resourceList"); + list.innerHTML = ""; + + data.forEach((item, index) => { + const li = document.createElement("li"); + li.setAttribute("data-index", index); + + const thumb = item.images?.[0] ? sanitize(item.images[0]) : ""; + const name = item.title; + const assets = item.properties?.Assets || "未知"; + + li.innerHTML = ` + +
+
${name}
+
素材量: ${assets}
+
+ `; + + li.addEventListener("click", () => showDetails(item, li)); + list.appendChild(li); + }); +} + +function showDetails(item, li) { + if (currentActive) currentActive.classList.remove("active"); + currentActive = li; + currentActive.classList.add("active"); + + document.getElementById("placeholder").classList.add("hidden"); + document.getElementById("details").classList.remove("hidden"); + + document.getElementById("detailTitle").textContent = item.title; + document.getElementById("detailTags").textContent = `分类: ${ + item.properties?.Category?.[0] || "N/A" + } | 标签: ${(item.properties?.Tags || []).join(", ")}`; + + // 下载链接 + const download = sanitize(item.download); + document.getElementById( + "downloadBlock" + ).innerHTML = `⬇️ 下载资源`; + + // 图集 + const gallery = document.createElement("div"); + gallery.id = "gallery"; + item.images?.forEach((img) => { + img = sanitize(img); + const a = document.createElement("a"); + a.href = img; + a.innerHTML = ``; + gallery.appendChild(a); + }); + const galleryWrapper = document.getElementById("galleryWrapper"); + galleryWrapper.innerHTML = ""; + galleryWrapper.appendChild(gallery); + + lightGallery(gallery, { + selector: "a", + thumbnail: true, + zoom: true, + }); + + // 版本信息 + const versionBlock = document.getElementById("versionTableWrapper"); + if (item.changelog?.length > 0) { + let table = ``; + item.changelog.forEach((row) => { + table += ``; + }); + table += `
日期版本描述
${row.date}${row.version}${ + row.description || "" + }
/**
 * Filter the sidebar list by the text typed into the search box.
 *
 * The query is trimmed and lower-cased, then matched as a substring against
 * each record's lower-cased title. Reads the module-level `allData` array and
 * passes the matching records to `renderList`.
 */
function handleSearch() {
  const searchBox = document.getElementById("searchInput");
  const needle = searchBox.value.trim().toLowerCase();

  const matches = [];
  for (const asset of allData) {
    const haystack = asset.title.toLowerCase();
    if (haystack.includes(needle)) {
      matches.push(asset);
    }
  }

  renderList(matches);
}
def parse_resource_page(url):
    """Scrape one asset detail page into a plain dict.

    Args:
        url: Address of the asset detail page.

    Returns:
        A dict with the keys, in this order:
        ``title`` (str, ``'N/A'`` when the heading is missing),
        ``properties`` (dict mapping the label of each two-column row of the
        first table to either a list of link texts or the cell's plain text),
        ``changelog`` (list of ``{'date', 'version', 'description'}`` dicts
        taken from the second table),
        ``download`` (first link ending in ``.zip``, or ``'N/A'``) and
        ``images`` (cover image first, then the gallery images).
        If the page cannot be fetched, or answers with an HTTP error status,
        the failure is printed and the record is returned with those
        placeholder values so a batch crawl keeps going.
    """
    site_root = "https://www.kenney.nl"
    # Left column holds the text/table data, right column the pictures.
    info_column = '#content > section > div > div > div:nth-of-type(1)'
    media_column = '#content > section > div > div > div:nth-of-type(2)'

    def absolute(link):
        # Site-relative links ("/...") need the host; others pass through.
        return site_root + link if link.startswith("/") else link

    result = {
        'title': 'N/A',
        'properties': {},
        'changelog': [],
        'download': 'N/A',
        'images': [],
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
        # Do not parse an error page as if it were an asset page.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"❌ 页面获取失败: {url} - {exc}")
        return result

    soup = BeautifulSoup(response.text, "lxml")

    # Asset title
    title_tag = soup.select_one(info_column + ' > h1')
    if title_tag:
        result['title'] = title_tag.text.strip()

    # Property table: "Label:" in the first cell, value(s) in the second
    prop_table = soup.select_one(info_column + ' > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) != 2:
                continue
            key = cols[0].text.strip().rstrip(':')
            value_links = cols[1].find_all('a')
            if value_links:
                # Linked values are stored as a list of link texts
                value = [a.text.strip() for a in value_links]
            else:
                value = cols[1].text.strip()
            result['properties'][key] = value

    # Changelog table: date in the first cell, version/description spans in the second
    update_table = soup.select_one(info_column + ' > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) != 2:
                continue
            spans = cols[1].find_all('span')
            result['changelog'].append({
                'date': cols[0].text.strip(),
                'version': spans[0].text.strip() if len(spans) >= 1 else '',
                'description': spans[1].text.strip() if len(spans) >= 2 else '',
            })

    # Download link: the first anchor on the page that points at a .zip
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            result['download'] = absolute(href)
            break

    # Cover image
    cover_img = soup.select_one(media_column + ' > a > img')
    if cover_img and cover_img.get("src"):
        result['images'].append(absolute(cover_img["src"]))

    # Gallery images
    for div in soup.select(media_column + ' > div > div'):
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            result['images'].append(absolute(img_tag["src"]))

    return result
def sanitize_filename(name):
    """Return ``name`` reduced to characters that are safe in a file name.

    Keeps alphanumeric characters (as defined by ``str.isalnum``) plus dot,
    underscore-free punctuation ``-``, space and parentheses; every other
    character is dropped. Leading and trailing whitespace is stripped from
    the result.
    """
    allowed_punctuation = "._- ()"
    kept = [ch for ch in name if ch.isalnum() or ch in allowed_punctuation]
    return "".join(kept).strip()
def download_image(url, save_path):
    """Download one image to ``save_path`` unless that file already exists.

    The body is streamed into ``<save_path>.part`` and only renamed to
    ``save_path`` once the server answered with a success status and the
    whole payload was written. An interrupted or failed download therefore
    never leaves a truncated file that a later run would skip as
    "already exists".

    Args:
        url: Remote image URL.
        save_path: Destination file path; its directory must already exist.

    Returns:
        None. Network and file-system failures are printed, not raised, so a
        batch run continues with the next image. After every download attempt
        (successful or not) the function sleeps for a random 1.5-4.0 seconds;
        files that are skipped return immediately.
    """
    if os.path.exists(save_path):
        print(f"✅ 已存在,跳过: {save_path}")
        return

    partial_path = save_path + ".part"
    try:
        # Send the module-level browser headers and release the connection
        # deterministically via the context manager.
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            # Without this an HTML error page would be saved as the "image".
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, save_path)
        print(f"✅ 下载完成: {save_path}")
    except (requests.RequestException, OSError) as e:
        # Drop the incomplete temporary file so nothing half-written remains.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"❌ 下载失败: {url} - {e}")
    time.sleep(random.uniform(1.5, 4.0))  # random pause between requests
def build_zip_path(entry):
    """Return the local path where the ZIP archive of ``entry`` is expected.

    The path has the form ``<zip_root>/<Category>[/<Series>]/<Title> V<version>.zip``.
    Category and series are the first element of the corresponding
    ``properties`` value (category falls back to "Uncategorized", the series
    level is omitted when absent); the version comes from the first changelog
    row and falls back to "1.0" when the changelog is empty. Every path
    component derived from the entry is passed through ``sanitize_filename``.
    """
    title = entry["title"]
    log = entry["changelog"]
    if log:
        version = log[0]["version"]
    else:
        version = "1.0"

    props = entry["properties"]
    category = props.get("Category", ["Uncategorized"])[0]
    series = props.get("Series", [None])[0]

    parts = [zip_root, sanitize_filename(category)]
    if series:
        parts.append(sanitize_filename(series))
    parts.append(f"{sanitize_filename(title)} V{version}.zip")
    return os.path.join(*parts)
# === Main processing ===
# Load the scraped records.
with open(input_json, "r", encoding="utf-8") as src:
    data = json.load(src)

for record in data:
    # Swap the download URL for the local archive path, but only when that
    # archive is actually present on disk; otherwise the URL is kept.
    if record.get("download", "").endswith(".zip"):
        local_zip = build_zip_path(record)
        if os.path.exists(local_zip):
            record["download"] = local_zip

    # Image URLs are always replaced by their computed local paths.
    if isinstance(record.get("images"), list):
        record["images"] = build_image_paths(record)

# === Save the rewritten JSON ===
serialized = json.dumps(data, ensure_ascii=False, indent=2)
with open(output_json, "w", encoding="utf-8") as dst:
    dst.write(serialized)

print("✅ 已更新 JSON:本地路径写入完毕!")