Files
kenney-asset-scrapper/scrapper/3 image_downloader.py
2025-04-22 09:11:40 +08:00

62 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
import requests
import time
import random
from urllib.parse import urlparse, unquote
from tqdm import tqdm
# ========== Configuration ==========
json_path = "kenney_data.json" # Path to the JSON data file
output_root = "kenney_assets_images" # Root directory for downloaded images
# HTTP request headers (sets a browser-like User-Agent)
headers = {"User-Agent": "Mozilla/5.0"}
# ========== Utility functions ==========
def sanitize_filename(name):
    """Return ``name`` reduced to characters that are safe in a path component.

    Keeps alphanumeric characters (as judged by ``str.isalnum``) plus
    ``.``, ``_``, ``-``, space and parentheses, drops everything else, and
    trims surrounding whitespace.

    A result that is empty, or made up only of dots and spaces, is replaced
    by ``"_"``. An empty component would make ``os.path.join`` collapse a
    directory level, and ``"."`` / ``".."`` would resolve to the current or
    parent directory instead of naming a new one.
    """
    allowed_punctuation = "._- ()"
    cleaned = "".join(
        c for c in name if c.isalnum() or c in allowed_punctuation
    ).strip()
    if not cleaned.strip(". "):
        return "_"
    return cleaned
def download_image(url, save_path):
    """Download ``url`` to ``save_path`` unless that file already exists.

    The request is sent with the module-level ``headers`` and a 30 second
    timeout. A non-2xx response is treated as a failure instead of being
    written to disk. The body is streamed into ``<save_path>.part`` and only
    renamed to ``save_path`` once the transfer has completed, so a failed or
    interrupted download never leaves a truncated file that a later run
    would skip as "already exists".

    Failures are reported on stdout and swallowed so that a batch run keeps
    going. Every attempted download, successful or not, is followed by a
    random 1.5-4.0 second pause; the "already exists" path returns before
    the pause.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; its directory must already exist.

    Returns:
        None.
    """
    if os.path.exists(save_path):
        print(f"✅ 已存在,跳过: {save_path}")
        return

    partial_path = f"{save_path}.part"
    try:
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            # Without this an HTTP error page would be saved as the image.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after the whole body has been written.
        os.replace(partial_path, save_path)
        print(f"✅ 下载完成: {save_path}")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure, remove any
        # partial file, and let the caller continue with the next image.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        print(f"❌ 下载失败: {url} - {e}")
    time.sleep(random.uniform(1.5, 4.0))  # Pause to mimic a human visitor
# ========== Load JSON ==========
# The loop below reads each entry's "title", "properties" and "images" keys.
with open(json_path, "r", encoding="utf-8") as f:
    resources = json.load(f)


def _first_or(values, default):
    """Return the first item of ``values``, or ``default`` if it is missing or empty."""
    return values[0] if values else default


# ========== Batch processing ==========
for entry in tqdm(resources, desc="处理资源"):
    title = entry["title"]
    properties = entry["properties"]
    # A missing key and an empty list both fall back to the default.
    category = _first_or(properties.get("Category"), "Uncategorized")
    series = _first_or(properties.get("Series"), None)
    images = entry.get("images", [])

    # Build the target directory: Category/Series/Title/ (Series is optional).
    path = os.path.join(output_root, sanitize_filename(category))
    if series:
        path = os.path.join(path, sanitize_filename(series))
    path = os.path.join(path, sanitize_filename(title))
    os.makedirs(path, exist_ok=True)

    for img_url in images:
        # Decode percent-escapes (e.g. %20 -> space) BEFORE taking the
        # basename, so an encoded separator such as %2F cannot put a
        # directory component into the file name.
        img_name = os.path.basename(unquote(urlparse(img_url).path))
        if img_name in ("", ".", ".."):
            # Such a name would resolve to a directory, not an image file.
            print(f"❌ 无法从 URL 解析文件名,跳过: {img_url}")
            continue
        img_path = os.path.join(path, img_name)
        download_image(img_url, img_path)

print("\n🎉 所有图片处理完成!")