Open Source
This commit is contained in:
61
scrapper/3 image_downloader.py
Normal file
61
scrapper/3 image_downloader.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from urllib.parse import urlparse, unquote
|
||||
from tqdm import tqdm
|
||||
|
||||
# ========== Configuration ==========
# Path of the JSON data file that this script loads.
json_path = "kenney_data.json"
# Root directory under which the per-resource folders are created.
output_root = "kenney_assets_images"
# Header dict carrying a browser-like User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0",
}
|
||||
|
||||
# ========== 工具函数 ==========
|
||||
|
||||
|
||||
def sanitize_filename(name):
    """Return ``name`` reduced to characters that are safe in a path component.

    Keeps alphanumeric characters (as judged by ``str.isalnum``, so non-ASCII
    letters and digits survive) plus ``.``, ``_``, ``-``, space and
    parentheses, then strips surrounding whitespace.

    Args:
        name: Raw label (for example a title or category) to clean up.

    Returns:
        The cleaned name. When nothing usable is left -- the result is empty
        or made only of dots and spaces (``"."``, ``".."``, ...) -- ``"_"`` is
        returned instead, so a caller joining the result into a path never
        gets a component that disappears or points at a parent directory.
    """
    allowed_punctuation = "._- ()"
    cleaned = "".join(
        ch for ch in name if ch.isalnum() or ch in allowed_punctuation
    ).strip()
    # "", "." and ".." are not real names: os.path.join would drop the level
    # or step out of the intended directory.
    if not cleaned.strip(". "):
        return "_"
    return cleaned
|
||||
|
||||
|
||||
def download_image(url, save_path):
    """Download ``url`` to ``save_path`` unless that file already exists.

    The body is streamed into a sibling ``<save_path>.part`` file and only
    renamed to ``save_path`` once the transfer has finished, so an interrupted
    or failed download never leaves a truncated file that a later run would
    treat as already downloaded. Non-2xx responses are treated as failures
    instead of being written to disk as if they were images.

    Failures are reported on stdout and swallowed so that one bad URL does not
    abort a batch run. After every attempted download (successful or not) the
    function sleeps for a random 1.5-4.0 seconds; the skip path returns
    immediately without sleeping.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; its directory must already exist.

    Returns:
        None.
    """
    if os.path.exists(save_path):
        print(f"✅ 已存在,跳过: {save_path}")
        return

    partial_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(
            url, headers=headers, stream=True, timeout=30
        ) as response:
            # Without this, a 404/500 error page would be saved as the image.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after the whole body has been written.
        os.replace(partial_path, save_path)
        print(f"✅ 下载完成: {save_path}")
    except Exception as e:
        # Deliberately broad: this is the per-item boundary of a best-effort
        # batch job, so any failure is logged and the run continues.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        print(f"❌ 下载失败: {url} - {e}")

    time.sleep(random.uniform(1.5, 4.0))  # pace requests like a human visitor
|
||||
|
||||
|
||||
# ========== Load JSON ==========
# The file is expected to hold a list of resource entries; each entry is read
# below via its "title", "properties" and "images" keys.
with open(json_path, "r", encoding="utf-8") as f:
    resources = json.load(f)

# ========== Batch processing ==========
for entry in tqdm(resources, desc="处理资源"):
    title = entry["title"]
    properties = entry.get("properties") or {}
    # `or [...]` also covers a key that is present but holds an empty list,
    # which would otherwise raise IndexError on the [0] lookup.
    category = (properties.get("Category") or ["Uncategorized"])[0]
    series = (properties.get("Series") or [None])[0]
    images = entry.get("images") or []

    # Build the target directory: Category/[Series/]Title/
    path = os.path.join(output_root, sanitize_filename(category))
    if series:
        path = os.path.join(path, sanitize_filename(series))
    path = os.path.join(path, sanitize_filename(title))
    os.makedirs(path, exist_ok=True)

    for img_url in images:
        # Decode percent-escapes first (e.g. %20 -> space) and take the
        # basename afterwards, so an encoded "/" cannot smuggle a directory
        # component into the file name.
        img_name = os.path.basename(unquote(urlparse(img_url).path))
        if not img_name:
            # A URL whose path ends in "/" has no file name; joining "" would
            # yield the directory itself and be reported as already existing.
            print(f"❌ 无法解析文件名,跳过: {img_url}")
            continue
        img_path = os.path.join(path, img_name)
        download_image(img_url, img_path)

print("\n🎉 所有图片处理完成!")
|
||||
Reference in New Issue
Block a user