# kenney-asset-scrapper/scrapper/3 image_downloader.py

import os
import json
import requests
import time
import random
from urllib.parse import urlparse, unquote
from tqdm import tqdm

# ========== Configuration ==========
json_path = "kenney_data.json"         # Path of the JSON data file that is loaded below
output_root = "kenney_assets_images"   # Root directory under which images are stored
headers = {"User-Agent": "Mozilla/5.0"}  # HTTP request headers carrying a browser-like User-Agent

# ========== 工具函数 ==========


def sanitize_filename(name):
    """Return *name* reduced to characters usable in a single path component.

    Keeps alphanumeric characters (as judged by ``str.isalnum``, so non-ASCII
    letters and digits survive) plus the characters ``._- ()``, then trims
    surrounding whitespace.

    If nothing usable remains -- the cleaned text is empty or consists only
    of dots (e.g. ``"."`` or ``".."``) -- ``"_"`` is returned instead, so that
    joining the result onto a directory always names a real child entry
    rather than the directory itself or its parent.

    Args:
        name: The raw text (title, category, ...) to turn into a file or
            directory name.

    Returns:
        The cleaned, non-empty name.
    """
    cleaned = "".join(c for c in name if c.isalnum() or c in "._- ()").strip()
    # "", "." and ".." are not valid names for a new child entry.
    if not cleaned.strip("."):
        return "_"
    return cleaned


def download_image(url, save_path):
    """Download *url* to *save_path* unless that file already exists.

    The body is streamed into a sibling ``<save_path>.part`` file and only
    renamed onto *save_path* once the transfer has finished, so an
    interrupted download never leaves a truncated file that a later run
    would skip as "already exists". HTTP error responses (4xx/5xx) are
    treated as failures instead of being written to disk.

    Failures are reported on stdout and swallowed so that one bad URL does
    not abort the caller's batch. After every attempted download (successful
    or not) the function sleeps for a random 1.5-4.0 seconds; no sleep
    happens when the file is skipped.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; its directory must already exist.

    Returns:
        None.
    """
    if os.path.exists(save_path):
        print(f"✅ 已存在，跳过: {save_path}")
        return

    tmp_path = f"{save_path}.part"
    try:
        # The context manager releases the streamed connection back to the
        # pool even when the transfer fails part-way through.
        with requests.get(url, headers=headers, stream=True, timeout=30) as response:
            # Without this an HTML error page would be saved as the image.
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, save_path)
        print(f"✅ 下载完成: {save_path}")
    except Exception as e:
        # Deliberately broad: this is the per-item boundary of a best-effort
        # batch job, so any failure is reported and the run continues.
        print(f"❌ 下载失败: {url} - {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (the temp file was never created) or it
            # cannot be removed; either way the real error is already logged.
            pass
    time.sleep(random.uniform(1.5, 4.0))  # Throttle requests to mimic a human visitor


# ========== Load JSON ==========
with open(json_path, "r", encoding="utf-8") as f:
    resources = json.load(f)

# ========== Batch processing ==========
# For every resource entry, create Category/[Series/]Title under output_root
# and download each of the entry's image URLs into that directory.
for entry in tqdm(resources, desc="处理资源"):
    title = entry["title"]
    properties = entry.get("properties") or {}
    # `or` also covers a key that is present but holds an empty list, which
    # would otherwise raise IndexError on [0].
    category = (properties.get("Category") or ["Uncategorized"])[0]
    series = (properties.get("Series") or [None])[0]
    images = entry.get("images") or []

    # Build the target path: Category/Series/Title/ (Series is optional)
    path = os.path.join(output_root, sanitize_filename(category))
    if series:
        path = os.path.join(path, sanitize_filename(series))
    path = os.path.join(path, sanitize_filename(title))
    os.makedirs(path, exist_ok=True)

    for img_url in images:
        # Decode percent-escapes (e.g. %20 => space) BEFORE taking the
        # basename, so an encoded separator (%2F) cannot smuggle extra path
        # components into the file name.
        img_name = os.path.basename(unquote(urlparse(img_url).path))
        if img_name in ("", ".", ".."):
            # A URL whose path ends in "/" has no file name; joining it would
            # point at the directory itself and be skipped as "already exists".
            print(f"⚠️ 无法从 URL 解析文件名，跳过: {img_url}")
            continue
        img_path = os.path.join(path, img_name)
        download_image(img_url, img_path)

print("\n🎉 所有图片处理完成！")