refactor(core): replace JSON-based pipeline with MySQL database
This commit introduces a major architectural overhaul, migrating the data backend from a flat JSON file to a relational MySQL database.

- Replaced multiple scraping scripts with a unified `main.py` that handles crawling, parsing, and database synchronization.
- Introduced `mysql_helper.py` for robust database interaction with a connection pool.
- Added `queries.sql` defining the new database schema for assets, categories, tags, and changelogs.
- Removed all obsolete frontend code (v1, v2) and old scraping scripts.

This change provides a more scalable and maintainable foundation for managing asset data.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -50,3 +50,6 @@ pnpm-debug.log*
|
||||
*.zip
|
||||
*.tar.gz
|
||||
*.rar
|
||||
|
||||
# Output
|
||||
media
|
||||
1
all_asset_infos.json
Normal file
1
all_asset_infos.json
Normal file
File diff suppressed because one or more lines are too long
1
all_asset_infos_detailed.json
Normal file
1
all_asset_infos_detailed.json
Normal file
File diff suppressed because one or more lines are too long
@@ -1,67 +0,0 @@
|
||||
<!DOCTYPE html>
<html lang="en">
<head>
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <meta charset="UTF-8" />
  <title>Kenney Asset Gallery</title>
  <!-- lightGallery 2.8.3 core + zoom + thumbnail plugins, loaded from
       cdnjs with SRI integrity hashes -->
  <link
    rel="stylesheet"
    href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lightgallery.min.css"
    integrity="sha512-QMCloGTsG2vNSnHcsxYTapI6pFQNnUP6yNizuLL5Wh3ha6AraI6HrJ3ABBaw6SIUHqlSTPQDs/SydiR98oTeaQ=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  />
  <script
    src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/lightgallery.min.js"
    integrity="sha512-n02TbYimj64qb98ed5WwkNiSw/i9Xlvv4Ehvhg0jLp3qMAMWCYUHbOMbppZ0vimtyiyw9NqNqxUZC4hq86f4aQ=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  ></script>
  <link
    rel="stylesheet"
    href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lg-zoom.min.css"
    integrity="sha512-S/hU6dGSK3D7SRpCvRF/IEufIr6Ikgp5vDiJarhdeFGEnw36hWZ6gVBjnwBbzjA+NEP7D8Gdm+5LL1HEsyiB1w=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  />
  <script
    src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/plugins/zoom/lg-zoom.min.js"
    integrity="sha512-fwxc/NvaA3du4ZRE6J/Ilrqi2xwOB1QfHBR4neA+ha13/pkweiRfPgBiV4VbfAf/Vi3rXAXdQ3zexUJ1V2bWrg=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  ></script>
  <link
    rel="stylesheet"
    href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lg-thumbnail.min.css"
    integrity="sha512-rKuOh3xlF/027KUPuMok0ESsZ2zWPRzkniD3n5zZKCAtbiVkYw66DR4KtVAGf8dLPLr5DdyQs05BlSmEyXctkQ=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  />
  <script
    src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/plugins/thumbnail/lg-thumbnail.min.js"
    integrity="sha512-jZxB8WysJ6S6e4Hz5IZpAzR1WiflBl0hBxriHGlLkUN32T18+rD1aLNifa1KTll/zx8lIfWVP1NqEjHi/Khy5w=="
    crossorigin="anonymous"
    referrerpolicy="no-referrer"
  ></script>
  <link rel="stylesheet" href="style.css" />
</head>
<body>
  <h1>🎮 Kenney Asset Gallery</h1>

  <!-- Filter bar: category and tag dropdowns, populated by script.js -->
  <div id="filters">
    <label for="categoryFilter">选择分类:</label>
    <select name="categoryFilter" id="categoryFilter">
      <option value="all">📂 所有分类</option>
    </select>
    <label for="tagFilter">选择标签:</label>
    <select name="tagFilter" id="tagFilter">
      <option value="all">🏷️ 所有标签</option>
    </select>
  </div>

  <!-- Card grid filled by render() in script.js -->
  <div id="gallery"></div>

  <script src="script.js"></script>
</body>
</html>
|
||||
@@ -1,98 +0,0 @@
|
||||
// Full dataset, populated once by the fetch at the bottom of this file.
let allData = [];

/**
 * Normalize Windows-style path separators so paths work as URLs.
 * @param {string} str - path that may contain backslashes
 * @returns {string} the same path with every "\" turned into "/"
 */
function sanitize(str) {
  return str.split("\\").join("/");
}
|
||||
|
||||
/**
 * Fill the category and tag <select> elements from the dataset.
 * Collects the first Category entry and every Tag of each item,
 * de-duplicates them, and appends sorted <option>s.
 * @param {Array<Object>} data - asset records from the JSON feed
 */
function populateFilters(data) {
  const categories = new Set();
  const tags = new Set();

  for (const item of data) {
    categories.add(item.properties?.Category?.[0]);
    for (const tag of item.properties?.Tags || []) {
      tags.add(tag);
    }
  }

  // Append one <option> per distinct value, alphabetically sorted.
  const appendOptions = (selectId, values) => {
    const select = document.getElementById(selectId);
    for (const value of [...values].sort()) {
      const option = document.createElement("option");
      option.value = value;
      option.textContent = value;
      select.appendChild(option);
    }
  };

  appendOptions("categoryFilter", categories);
  appendOptions("tagFilter", tags);
}
|
||||
|
||||
/**
 * Rebuild the #gallery grid from `data`.
 * Each item becomes a card whose images form one lightGallery group
 * (first image visible, remaining images as hidden anchors), plus a
 * title/tags body and a download link; lightGallery is initialized
 * per card after insertion.
 * NOTE(review): card.innerHTML interpolates item fields unescaped —
 * acceptable for trusted local data, an XSS vector otherwise.
 * @param {Array<Object>} data - asset records to display
 */
function render(data) {
  const gallery = document.getElementById("gallery");
  gallery.innerHTML = ""; // clear previous cards

  data.forEach((item, index) => {
    const images = (item.images || []).map(sanitize);
    const tags = (item.properties?.Tags || []).join(', ');
    const category = item.properties?.Category?.[0] || 'Uncategorized';
    const downloadPath = sanitize(item.download);

    const card = document.createElement("div");
    card.className = "card";
    // data-* attributes mirror the filter state used by filterGallery()
    card.setAttribute("data-category", category);
    card.setAttribute("data-tags", tags);

    // Unique per-card id so each card gets its own lightGallery group.
    const galleryGroupId = `gallery-${index}`;
    card.innerHTML = `
      <div class="lg-gallery" id="${galleryGroupId}">
        <a href="${images[0]}" data-lg-size="1400-800">
          <img src="${images[0]}" alt="${item.title}">
        </a>
        ${images.slice(1).map(img => `
          <a href="${img}" data-lg-size="1400-800" style="display:none;"></a>
        `).join("")}
      </div>
      <div class="card-body">
        <div class="card-title">${item.title}</div>
        <div class="card-tags">Tags: ${tags}</div>
        <div class="card-footer">
          <a class="download-btn" href="${downloadPath}" download>⬇️ 下载资源</a>
        </div>
      </div>
    `;
    gallery.appendChild(card);

    // Initialize lightGallery for this card's image group
    lightGallery(document.getElementById(galleryGroupId), {
      selector: 'a',
      thumbnail: true,
      zoom: true
    });
  });
}
|
||||
|
||||
/**
 * Re-render the gallery using the currently selected category and tag.
 * The value "all" in either dropdown disables that criterion.
 */
function filterGallery() {
  const selectedCategory = document.getElementById("categoryFilter").value;
  const selectedTag = document.getElementById("tagFilter").value;

  const matches = (item) => {
    const categoryOk =
      selectedCategory === 'all' ||
      item.properties?.Category?.[0] === selectedCategory;
    const tagOk =
      selectedTag === 'all' ||
      (item.properties?.Tags || []).includes(selectedTag);
    return categoryOk && tagOk;
  };

  render(allData.filter(matches));
}
|
||||
|
||||
// Bootstrap: load the dataset, build the filter dropdowns, render all
// cards, then wire the dropdowns to re-filter on change.
fetch("data/kenney_data_local.json")
  .then(res => {
    // Surface HTTP errors instead of failing later inside res.json().
    if (!res.ok) throw new Error(`HTTP ${res.status} loading asset data`);
    return res.json();
  })
  .then(data => {
    allData = data;
    populateFilters(data);
    render(data);
    document.getElementById("categoryFilter").addEventListener("change", filterGallery);
    document.getElementById("tagFilter").addEventListener("change", filterGallery);
  })
  // BUGFIX: the chain had no rejection handler, so a missing/broken
  // data file failed silently with a blank page.
  .catch(err => console.error("Failed to load asset data:", err));
|
||||
@@ -1,82 +0,0 @@
|
||||
/* ---- Page shell ---- */
body {
  font-family: "Segoe UI", sans-serif;
  background-color: #f0f2f5;
  margin: 0;
  padding: 2rem;
  color: #333;
}

h1 {
  text-align: center;
  margin-bottom: 2rem;
  color: #444;
}

/* ---- Responsive card grid: as many 320px+ columns as fit ---- */
#gallery {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
  gap: 1.5rem;
}

/* ---- Asset cards ---- */
.card {
  background: white;
  border-radius: 12px;
  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
  overflow: hidden;
  transition: transform 0.2s;
}

/* Subtle lift on hover */
.card:hover {
  transform: translateY(-5px);
}

.card img {
  width: 100%;
  height: 200px;
  object-fit: cover;
}

.card-body {
  padding: 1rem;
}

.card-title {
  font-size: 1.2rem;
  margin-bottom: 0.5rem;
}

.card-tags {
  font-size: 0.85rem;
  color: #666;
}

.card-footer {
  margin-top: 1rem;
}

/* ---- Download button ---- */
.download-btn {
  display: inline-block;
  padding: 0.4rem 0.8rem;
  background: #4caf50;
  color: white;
  border-radius: 6px;
  text-decoration: none;
  font-size: 0.9rem;
  transition: background 0.2s;
}

.download-btn:hover {
  background: #45a049;
}

/* ---- Category/tag filter bar ---- */
#filters {
  display: flex;
  justify-content: center;
  gap: 1rem;
  margin-bottom: 1.5rem;
}

select {
  padding: 0.5rem;
  font-size: 1rem;
}
|
||||
@@ -1,41 +0,0 @@
|
||||
<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8" />
  <title>Kenney 资源库</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <!-- lightGallery 2.8.3 core + zoom + thumbnail from cdnjs (with SRI).
       NOTE(review): the same libraries appear to be loaded again from
       local libs/ at the bottom of <body> — confirm and drop one copy. -->
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lightgallery.min.css" integrity="sha512-QMCloGTsG2vNSnHcsxYTapI6pFQNnUP6yNizuLL5Wh3ha6AraI6HrJ3ABBaw6SIUHqlSTPQDs/SydiR98oTeaQ==" crossorigin="anonymous" referrerpolicy="no-referrer" />
  <script src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/lightgallery.min.js" integrity="sha512-n02TbYimj64qb98ed5WwkNiSw/i9Xlvv4Ehvhg0jLp3qMAMWCYUHbOMbppZ0vimtyiyw9NqNqxUZC4hq86f4aQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lg-zoom.min.css" integrity="sha512-S/hU6dGSK3D7SRpCvRF/IEufIr6Ikgp5vDiJarhdeFGEnw36hWZ6gVBjnwBbzjA+NEP7D8Gdm+5LL1HEsyiB1w==" crossorigin="anonymous" referrerpolicy="no-referrer" />
  <script src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/plugins/zoom/lg-zoom.min.js" integrity="sha512-fwxc/NvaA3du4ZRE6J/Ilrqi2xwOB1QfHBR4neA+ha13/pkweiRfPgBiV4VbfAf/Vi3rXAXdQ3zexUJ1V2bWrg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/css/lg-thumbnail.min.css" integrity="sha512-rKuOh3xlF/027KUPuMok0ESsZ2zWPRzkniD3n5zZKCAtbiVkYw66DR4KtVAGf8dLPLr5DdyQs05BlSmEyXctkQ==" crossorigin="anonymous" referrerpolicy="no-referrer" />
  <script src="https://cdnjs.cloudflare.com/ajax/libs/lightgallery/2.8.3/plugins/thumbnail/lg-thumbnail.min.js" integrity="sha512-jZxB8WysJ6S6e4Hz5IZpAzR1WiflBl0hBxriHGlLkUN32T18+rD1aLNifa1KTll/zx8lIfWVP1NqEjHi/Khy5w==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
  <link rel="stylesheet" href="style.css" />
</head>
<body>
  <div class="container">
    <!-- Sidebar: search box + resource list (filled by renderList) -->
    <aside class="sidebar">
      <input type="text" id="searchInput" placeholder="🔍 搜索资源名称..." />
      <ul id="resourceList"></ul>
    </aside>

    <!-- Detail pane: hidden until a resource is selected -->
    <main class="content">
      <div id="details" class="hidden">
        <h2 id="detailTitle"></h2>
        <div id="detailTags"></div>
        <div id="galleryWrapper"></div>
        <div id="downloadBlock"></div>
        <div id="versionTableWrapper"></div>
      </div>
      <div id="placeholder" class="placeholder">
        ← 请选择左侧资源以查看详情
      </div>
    </main>
  </div>

  <script src="libs/lightgallery.min.js"></script>
  <script src="libs/lg-thumbnail.min.js"></script>
  <script src="libs/lg-zoom.min.js"></script>
  <script src="script.js"></script>
</body>
</html>
|
||||
@@ -1,108 +0,0 @@
|
||||
// Full dataset, populated once by the fetch at the bottom of this file.
let allData = [];
// <li> element of the currently highlighted resource, if any.
let currentActive = null;

/**
 * Replace every backslash with a forward slash so local Windows paths
 * are usable in URLs and attributes.
 * @param {string} str
 * @returns {string}
 */
function sanitize(str) {
  return str.split("\\").join("/");
}
|
||||
|
||||
/**
 * Rebuild the sidebar resource list from `data`.
 * Each item shows its first image as a thumbnail, its title and its
 * asset count, and opens the detail pane via showDetails() on click.
 * NOTE(review): li.innerHTML interpolates item fields unescaped.
 * @param {Array<Object>} data - asset records to list
 */
function renderList(data) {
  const list = document.getElementById("resourceList");
  list.innerHTML = "";

  data.forEach((item, index) => {
    const li = document.createElement("li");
    li.setAttribute("data-index", index);

    // Fall back to an empty src when the item has no images.
    const thumb = item.images?.[0] ? sanitize(item.images[0]) : "";
    const name = item.title;
    const assets = item.properties?.Assets || "未知";

    li.innerHTML = `
      <img src="${thumb}" class="thumb" alt="">
      <div>
        <div><strong>${name}</strong></div>
        <div style="font-size:0.85rem; color: #666;">素材量: ${assets}</div>
      </div>
    `;

    li.addEventListener("click", () => showDetails(item, li));
    list.appendChild(li);
  });
}
|
||||
|
||||
/**
 * Show the detail pane for `item` and highlight its list entry `li`.
 * Fills title/tags, the download link, the image gallery (initializing
 * lightGallery on it) and the version-history table.
 * NOTE(review): lightGallery is re-initialized on every click without
 * destroying the previous instance — possible listener leak; confirm.
 * @param {Object} item - selected asset record
 * @param {HTMLLIElement} li - its sidebar list element
 */
function showDetails(item, li) {
  // Move the "active" highlight to the clicked entry.
  if (currentActive) currentActive.classList.remove("active");
  currentActive = li;
  currentActive.classList.add("active");

  document.getElementById("placeholder").classList.add("hidden");
  document.getElementById("details").classList.remove("hidden");

  document.getElementById("detailTitle").textContent = item.title;
  document.getElementById("detailTags").textContent = `分类: ${
    item.properties?.Category?.[0] || "N/A"
  } | 标签: ${(item.properties?.Tags || []).join(", ")}`;

  // Download link
  const download = sanitize(item.download);
  document.getElementById(
    "downloadBlock"
  ).innerHTML = `<a class="download-btn" href="${download}" download>⬇️ 下载资源</a>`;

  // Image gallery: one anchor per image, then hand the container to
  // lightGallery for the lightbox behavior.
  const gallery = document.createElement("div");
  gallery.id = "gallery";
  item.images?.forEach((img) => {
    img = sanitize(img);
    const a = document.createElement("a");
    a.href = img;
    a.innerHTML = `<img src="${img}" alt="">`;
    gallery.appendChild(a);
  });
  const galleryWrapper = document.getElementById("galleryWrapper");
  galleryWrapper.innerHTML = "";
  galleryWrapper.appendChild(gallery);

  lightGallery(gallery, {
    selector: "a",
    thumbnail: true,
    zoom: true,
  });

  // Version history table (date / version / description), or empty
  // when the item has no changelog.
  const versionBlock = document.getElementById("versionTableWrapper");
  if (item.changelog?.length > 0) {
    let table = `<table><tr><th>日期</th><th>版本</th><th>描述</th></tr>`;
    item.changelog.forEach((row) => {
      table += `<tr><td>${row.date}</td><td>${row.version}</td><td>${
        row.description || ""
      }</td></tr>`;
    });
    table += `</table>`;
    versionBlock.innerHTML = table;
  } else {
    versionBlock.innerHTML = "";
  }
}
|
||||
|
||||
/**
 * Filter the sidebar list to titles containing the search box text
 * (case-insensitive, surrounding whitespace ignored).
 */
function handleSearch() {
  const query = document
    .getElementById("searchInput")
    .value.trim()
    .toLowerCase();
  const matches = allData.filter((item) =>
    item.title.toLowerCase().includes(query)
  );
  renderList(matches);
}
|
||||
|
||||
// Bootstrap: load the dataset, populate the sidebar, and filter the
// list live as the user types in the search box.
fetch("data/kenney_data_local.json")
  .then((res) => {
    // Surface HTTP errors instead of failing later inside res.json().
    if (!res.ok) throw new Error(`HTTP ${res.status} loading asset data`);
    return res.json();
  })
  .then((data) => {
    allData = data;
    renderList(data);
    document
      .getElementById("searchInput")
      .addEventListener("input", handleSearch);
  })
  // BUGFIX: the chain had no rejection handler, so a missing/broken
  // data file failed silently with an empty sidebar.
  .catch((err) => console.error("Failed to load asset data:", err));
|
||||
@@ -1,115 +0,0 @@
|
||||
/* ---- Page shell: full-height two-pane layout ---- */
body {
  margin: 0;
  font-family: "Segoe UI", sans-serif;
  background-color: #f0f2f5;
}

.container {
  display: flex;
  height: 100vh;
}

/* ---- Sidebar: fixed-width scrollable resource list ---- */
.sidebar {
  width: 320px;
  background: #fff;
  border-right: 1px solid #ddd;
  padding: 1rem;
  overflow-y: auto;
}

.sidebar input {
  width: 100%;
  padding: 0.5rem;
  margin-bottom: 1rem;
  font-size: 1rem;
}

.sidebar ul {
  list-style: none;
  padding: 0;
  margin: 0;
}

.sidebar li {
  padding: 0.5rem;
  margin-bottom: 0.5rem;
  cursor: pointer;
  border-radius: 6px;
  display: flex;
  align-items: center;
  gap: 1rem;
  transition: background 0.2s;
}

/* Hover and the selected entry share the same highlight */
.sidebar li:hover,
.sidebar li.active {
  background-color: #e6f7ff;
}

.sidebar img.thumb {
  width: 48px;
  height: 48px;
  object-fit: cover;
  border-radius: 4px;
}

/* ---- Detail pane ---- */
.content {
  flex: 1;
  padding: 2rem;
  overflow-y: auto;
}

.placeholder {
  font-size: 1.2rem;
  color: #999;
}

/* Toggled by script.js to switch placeholder <-> details */
.hidden {
  display: none;
}

#galleryWrapper {
  margin-top: 1rem;
}

#galleryWrapper a img {
  height: 120px;
  margin: 5px;
  object-fit: cover;
  border-radius: 4px;
}

#detailTags {
  margin-bottom: 1rem;
  color: #666;
  font-size: 0.9rem;
}

#downloadBlock {
  margin: 1rem 0;
}

.download-btn {
  padding: 0.5rem 1rem;
  background-color: #4caf50;
  color: white;
  border-radius: 6px;
  text-decoration: none;
  font-size: 0.95rem;
}

.download-btn:hover {
  background-color: #45a049;
}

/* ---- Version history table ---- */
#versionTableWrapper table {
  margin-top: 1rem;
  width: 100%;
  border-collapse: collapse;
}

#versionTableWrapper th,
#versionTableWrapper td {
  padding: 0.5rem;
  border: 1px solid #ccc;
}
|
||||
364
main.py
Normal file
364
main.py
Normal file
@@ -0,0 +1,364 @@
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
import random
|
||||
import time
|
||||
import json
|
||||
from datetime import datetime
|
||||
from mysql_helper import MySQLHelper
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
KENNEY_ASSET_URL = "https://www.kenney.nl/assets/"


def get_headers():
    """Build browser-like request headers with a random desktop UA.

    Returns:
        dict[str, str]: headers mimicking a normal browser visit
        (random User-Agent, Google referer, keep-alive, etc.).
    """
    # Common desktop browser user agents; extend as needed.
    user_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36",

        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.3 Safari/605.1.15",

        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) "
        "Gecko/20100101 Firefox/128.0",
    )
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;"
            "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/",
        "Upgrade-Insecure-Requests": "1",
    }
    return headers
|
||||
|
||||
class SmartCrawler:
    """requests.Session wrapper that sleeps a random delay before every
    GET and reuses one set of browser-like headers for its lifetime."""

    def __init__(self, delay_range=(1, 3)):
        # (min, max) seconds slept before each request.
        self.delay_range = delay_range
        self.headers = get_headers()
        self.session = requests.Session()

    def get(self, url):
        """Politely GET `url`: random pause first, then fetch."""
        time.sleep(random.uniform(*self.delay_range))
        return self.session.get(url, headers=self.headers)
|
||||
|
||||
def parse_date(date_str):
    """Convert a dd/mm/yyyy date string to ISO yyyy-mm-dd.

    Raises:
        ValueError: if `date_str` does not match dd/mm/yyyy.
    """
    parsed = datetime.strptime(date_str, "%d/%m/%Y")
    return parsed.strftime("%Y-%m-%d")
|
||||
|
||||
def get_total_pages() -> int:
    """Return the number of asset listing pages on kenney.nl.

    Reads the pagination's last-page button and parses the page number
    from its href (e.g. ".../assets/page:13" -> 13).

    Raises:
        ValueError: if the href does not end in a number.
        Exception: if the pagination button (or its href) is missing.
    """
    crawler = SmartCrawler()
    soup = BeautifulSoup(crawler.get(KENNEY_ASSET_URL).text, "lxml")

    selector = "#content > section > div > div.row.text-center.margin-top > div > ul > li:last-child > a"
    last_page_button = soup.select_one(selector)
    if last_page_button is None or "href" not in last_page_button.attrs:
        raise Exception("Last page button or its href attribute not found")

    # Expected element: <a class="icon" href=".../assets/page:13">;
    # the page number is everything after the final ':'.
    page_number = last_page_button["href"].split(':')[-1]
    if not page_number.isdigit():
        raise ValueError(f"Expected int in page number, got {page_number}")
    return int(page_number)
|
||||
|
||||
def get_all_asset_infos(total_pages: int) -> list[dict[str, str]]:
    """Scrape name/category/series/page_link for every asset pack.

    Walks all listing pages (sorted by release) and extracts the grid
    cells from each page.

    Args:
        total_pages: number of listing pages (see get_total_pages()).

    Returns:
        One dict per asset with keys "name", "category", "series" and
        "page_link"; any field that cannot be found is None.
    """
    result = []
    # Reuse one session for all pages instead of building a new
    # SmartCrawler (and new headers) on every iteration.
    crawler = SmartCrawler(delay_range=(1, 3))
    for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
        asset_page_url = KENNEY_ASSET_URL + f"page:{page}?search=&sort=release"
        resp = crawler.get(asset_page_url)
        soup = BeautifulSoup(resp.text, "lxml")
        contents_div = soup.select_one("#content > section > div > div:nth-of-type(1)")
        if not contents_div:
            continue
        for item_div in contents_div.find_all("div", recursive=False):
            h2_tag = item_div.find("h2")
            a_tags = item_div.find_all("a")
            # BUGFIX: the guard used to inspect a_tags[0].attrs while
            # reading a_tags[1]["href"], which could raise IndexError
            # (single-link cells) or pass the wrong check.
            page_link = (
                a_tags[1]["href"]
                if len(a_tags) > 1 and "href" in a_tags[1].attrs
                else None
            )
            result.append({
                "name": h2_tag.text if h2_tag else None,
                "category": a_tags[2].text if len(a_tags) > 2 else None,
                "series": a_tags[3].text if len(a_tags) > 3 else None,
                "page_link": page_link,
            })
    return result
|
||||
|
||||
def get_asset_pack_info(asset: dict) -> None:
    """Fetch an asset pack's detail page and enrich `asset` in place.

    Adds keys: "tags", "changelog" (oldest first), "released_at",
    "updated_at" and "images" (cover first, then the gallery shots).

    Args:
        asset: record with at least "page_link" set.
    """
    crawler = SmartCrawler()
    resp = crawler.get(asset["page_link"])
    soup = BeautifulSoup(resp.text, "lxml")

    # ---- property table (Tags, Files, Features, ...) ----
    properties = {}
    prop_table = soup.select_one("#content > section > div > div > div.col-md-6.text-left > table:nth-of-type(1) > tbody")
    if prop_table:
        for row in prop_table.find_all("tr"):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                # Linked cells (tags, categories) become lists; plain
                # cells stay strings.
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    asset["tags"] = properties.get("Tags", [])

    # ---- first .zip link on the page = current download ----
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = "https://www.kenney.nl" + href if href.startswith("/") else href
            break

    # ---- changelog table (site lists newest first) ----
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        # BUGFIX: was properties["Features"], raising KeyError when a
        # pack has no Features row ("Files" was already guarded).
        features = properties.get("Features", [])
        for idx, row in enumerate(update_table.find_all('tr')):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                version_info = {
                    'date': parse_date(date),
                    'version': version,
                    'description': description,
                    'files': 0,
                    'feat_animation': False,
                    'feat_variation': False,
                    'orig_file_link': None
                }
                if idx == 0:
                    # Only the latest version (first table row) carries
                    # the file count, feature flags and download link.
                    version_info["files"] = int(''.join(ch for ch in properties["Files"] if ch.isdigit())) if "Files" in properties else 0
                    version_info["feat_animation"] = "Animation" in features
                    version_info["feat_variation"] = "Variation" in features
                    version_info["orig_file_link"] = zip_link if zip_link else None
                changelog.append(version_info)
    changelog.reverse()  # store oldest first; latest ends at index -1
    asset["changelog"] = changelog
    # BUGFIX: released_at/updated_at were swapped (released_at took the
    # NEWEST entry), and both crashed with IndexError on an empty
    # changelog. Oldest entry = first release, newest = last update.
    asset["released_at"] = changelog[0]["date"] if changelog else None
    asset["updated_at"] = changelog[-1]["date"] if changelog else None

    # ---- images: cover first, then every gallery thumbnail ----
    images = []
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)
    asset['images'] = images
|
||||
|
||||
def sync_table(
    db,
    table_name: str,
    column_name: str,
    items: set[str],
) -> dict[str, int]:
    """Ensure every value in `items` exists in `table_name`, then return
    a value -> id mapping read back from the database.

    Args:
        db: MySQLHelper-style object exposing fetch_all() and get_conn().
        table_name: target lookup table (trusted identifier only — it is
            interpolated into SQL).
        column_name: unique text column holding the values.
        items: values that must exist after this call.
    """
    existing = db.fetch_all(f"SELECT * FROM {table_name}")
    known = {row[column_name] for row in existing}

    # Insert whatever is missing; ids come from AUTO_INCREMENT.
    to_insert = sorted(items - known)
    if to_insert:
        insert_sql = f"INSERT INTO {table_name} ({column_name}) VALUES (%s)"
        with db.get_conn() as conn:
            cursor = conn.cursor()
            cursor.executemany(insert_sql, [(name,) for name in to_insert])
            conn.commit()
            cursor.close()

    # Re-read so the returned ids are authoritative.
    refreshed = db.fetch_all(f"SELECT * FROM {table_name}")
    return {row[column_name]: row["id"] for row in refreshed}
|
||||
|
||||
def build_id_map(db, table, name_field, values):
    """Sync the truthy entries of `values` into `table` and return the
    resulting name -> id map (thin wrapper over sync_table)."""
    non_empty = {value for value in values if value}
    return sync_table(db, table, name_field, non_empty)
|
||||
|
||||
def build_insert_sql(table: str, columns: list[str]):
    """Build a parameterized INSERT statement for `table`.

    Example:
        build_insert_sql("t", ["a", "b"])
        -> "INSERT INTO t (a, b) VALUES (%s, %s)"
    """
    placeholders = ', '.join('%s' for _ in columns)
    return f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({placeholders})"
|
||||
|
||||
# Characters allowed in generated file/directory names.
allowed_path_pattern = re.compile(r'[^a-zA-Z0-9._-]')


def sanitize_path(path: str):
    """Turn free-form text into a filesystem-safe name.

    Lowercases each whitespace-separated word, strips characters outside
    [a-zA-Z0-9._-], and joins the words with underscores. Returns None
    for empty/falsy input.
    """
    if not path:
        return None
    words = (allowed_path_pattern.sub('', word.lower()) for word in path.split())
    return '_'.join(words)
|
||||
|
||||
def main() -> None:
    """Entry point: load scraped asset data, download each pack's latest
    archive into `media/`, then sync everything into MySQL.

    The live crawl/parse stages are currently commented out; this run
    resumes from the cached `all_asset_infos_detailed.json` snapshot.
    Note the `exit()` below — the database stage is currently
    unreachable (work in progress).
    """
    # -- Stage 1 (disabled): live crawl of kenney.nl --------------------
    # total_pages = get_total_pages()
    # all_asset_infos = get_all_asset_infos(total_pages)

    # # Let the oldest become first in the array to make sure it can be inserted into database first
    # all_asset_infos.reverse()

    # for asset in tqdm(all_asset_infos, "Fetching asset pack info"):
    #     get_asset_pack_info(asset)

    # Resume from the cached crawl result instead of re-crawling.
    all_asset_infos: list[dict] = json.load(open("all_asset_infos_detailed.json"))
    # -- one-off data-repair pass, kept for reference --------------------
    # for asset in all_asset_infos:
    #     asset["name"] = None if asset["name"] == "" else asset["name"]
    #     asset["category"] = None if asset["category"] == "" else asset["category"]
    #     asset["series"] = None if asset["series"] == "" else asset["series"]
    #     asset["page_link"] = None if asset["page_link"] == "" else asset["page_link"]
    #     # for log in asset["changelog"]:
    #     #     log['files'] = 0 if "files" not in log else log["files"]
    #     #     log['feat_animation'] = False if "feat_animation" not in log else log["feat_animation"]
    #     #     log['feat_variation'] = False if "feat_variation" not in log else log["feat_variation"]
    #     #     log['orig_file_link'] = asset["download"] if log["files"] != 0 else None
    #     #     asset.pop('download')
    # json.dump(all_asset_infos, open("all_asset_infos_detailed_fix.json", "w"))
    # exit()

    # -- Stage 2: download each pack's latest archive --------------------
    # Download file and save to database
    output_dir = "media"

    for asset in tqdm(all_asset_infos, "Downloading assets and images"):
        asset_name = sanitize_path(asset["name"])
        asset_category = sanitize_path(asset["category"])
        # changelog is stored oldest-first, so index -1 is the latest.
        asset_version = asset["changelog"][-1]["version"]
        asset_dir_path = os.path.join(output_dir, asset_category)
        asset_dir_path = os.path.join(asset_dir_path, asset_name)
        # Remembered for the asset_pack DB row below.
        asset["base_asset_path"] = asset_dir_path
        os.makedirs(asset_dir_path, exist_ok=True)
        filename = f"{asset_name} V{asset_version}.zip"
        filepath = os.path.join(asset_dir_path, filename)
        if os.path.exists(filepath):
            # NOTE(review): "(unknown)" in these messages looks like a
            # lost f-string placeholder (probably {filename}) — confirm
            # against the original source and restore.
            print(f"✅ (unknown) exists, skipping...")
        else:
            try:
                # Only the latest changelog entry carries a download link.
                download_url = asset["changelog"][-1]["orig_file_link"]
                with requests.get(download_url, headers=get_headers()) as resp:
                    resp.raise_for_status()
                    with open(filepath, "wb") as f:
                        for chunk in resp.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                print(f"✅ (unknown) download completed.")
            except Exception as e:
                print(f"❌ Download failed: (unknown) - {e}")
        # Random pause between downloads to stay polite to the server.
        time.sleep(random.uniform(1.5, 4.0))
    # Download asset file
    # NOTE(review): everything below this exit() is currently
    # unreachable — remove it once the download stage is verified.
    exit()

    # -- Stage 3: push everything into MySQL -----------------------------
    # Initialize database
    # NOTE(review): hard-coded host/user/password in source — move to
    # environment variables or a config file before publishing.
    db = MySQLHelper(
        "10.147.20.103",
        "kenney-assets",
        "9a77caa2a5c705db7e8a93c6a3fbc46a",
        "kenney_assets"
    )

    # Lookup tables first, so foreign keys below can be resolved.
    category_id_map = build_id_map(db, "category", "name", (a["category"] for a in all_asset_infos))
    series_id_map = build_id_map(db, "series", "name", (a["series"] for a in all_asset_infos))
    tags_id_map = build_id_map(db, "tag", "name", (tag for a in all_asset_infos for tag in (a.get("tags") or [])))

    asset_pack_sql = build_insert_sql("asset_pack", ["name", "category_id", "series_id", "released_at", "updated_at", "base_asset_path", "orig_page_link"])
    asset_pack_values: list[tuple] = []
    # Attach resolved category/series ids to every asset row.
    for asset in all_asset_infos:
        asset["category_id"] = category_id_map.get(asset.get("category"))
        asset["series_id"] = series_id_map.get(asset.get("series"))
        asset_pack_values.append((
            asset["name"],
            asset["category_id"],
            asset["series_id"],
            asset["released_at"],
            asset["updated_at"],
            asset["base_asset_path"],
            asset["page_link"]
        ))
    # NOTE(review): bulk_insert is not visible in the mysql_helper.py
    # excerpt shown here — confirm it exists.
    db.bulk_insert(asset_pack_sql, asset_pack_values)

    # Map pack names back to their freshly assigned ids.
    asset_pack_records = db.fetch_all(f"SELECT id, name FROM asset_pack")
    asset_pack_id_map = {r["name"]: r["id"] for r in asset_pack_records}

    asset_pack_tag_sql = build_insert_sql("asset_pack_tag", ["asset_pack_id", "tag_id"])
    asset_pack_tag_values: list[tuple] = []

    update_log_sql = build_insert_sql("update_log", ["asset_pack_id", "released_date", "version", "description", "files_count", "feat_animations", "feat_variations", "orig_download_link"])
    update_log_values: list[tuple] = []

    asset_pack_image_sql = build_insert_sql("asset_pack_image", ["asset_pack_id", "orig_file_link"])
    asset_pack_image_values: list[tuple] = []

    # Collect child-table rows (tags, changelog entries, images).
    for asset in tqdm(all_asset_infos, "Preparing data to database"):
        asset_pack_id = asset_pack_id_map.get(asset.get("name"))
        tags = asset.get("tags")
        for tag in tags:
            asset_pack_tag_values.append((
                asset_pack_id,
                tags_id_map[tag]
            ))
        changelog = asset.get("changelog")
        for log in changelog:
            update_log_values.append((
                asset_pack_id,
                log["date"],
                log["version"],
                log["description"],
                log["files"],
                log["feat_animation"],
                log["feat_variation"],
                log["orig_file_link"]
            ))
        images = asset.get("images")
        for image in images:
            asset_pack_image_values.append((
                asset_pack_id,
                image
            ))

    # One connection/transaction for all three child tables.
    with db.get_conn() as conn:
        cursor = conn.cursor()
        cursor.executemany(asset_pack_tag_sql, asset_pack_tag_values)
        cursor.executemany(update_log_sql, update_log_values)
        cursor.executemany(asset_pack_image_sql, asset_pack_image_values)
        conn.commit()
        cursor.close()


if __name__ == "__main__":
    main()
|
||||
145
mysql_helper.py
Normal file
145
mysql_helper.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from mysql.connector import pooling, Error
|
||||
import logging
|
||||
from contextlib import contextmanager
|
||||
import time
|
||||
|
||||
# ========== Logging setup ==========
# Module-wide logger used by MySQLHelper; INFO level with timestamps.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger("MySQLHelper")
|
||||
|
||||
class MySQLHelper:
    """Convenience wrapper around a mysql-connector connection pool.

    Provides pooled connections, retrying execute/fetch helpers, a bulk
    insert, and a commit/rollback transaction context manager.
    """

    def __init__(self, host, user, password, database, pool_size=5):
        """Create the MySQL connection pool.

        Raises mysql.connector.Error when the pool cannot be created.
        """
        try:
            self.pool = pooling.MySQLConnectionPool(
                pool_name="mypool",
                pool_size=pool_size,
                pool_reset_session=True,
                host=host,
                user=user,
                password=password,
                database=database,
                charset="utf8mb4"
            )
            logger.info("✅ MySQL 连接池已创建,大小=%s", pool_size)
        except Error as e:
            logger.error("❌ 创建连接池失败: %s", e)
            raise

    @contextmanager
    def get_conn(self):
        """Yield a pooled connection and return it to the pool afterwards."""
        conn = None
        try:
            conn = self.pool.get_connection()
            yield conn
        except Error as e:
            logger.error("数据库连接错误: %s", e)
            raise
        finally:
            if conn:
                conn.close()

    def execute(self, sql, params=None, commit=False, retry=3):
        """Run an INSERT/UPDATE/DELETE, retrying up to `retry` times.

        Returns the affected row count. Re-raises the last error after
        the final attempt.
        """
        for attempt in range(1, retry + 1):
            try:
                with self.get_conn() as conn:
                    cursor = conn.cursor()
                    try:
                        cursor.execute(sql, params or ())
                        if commit:
                            conn.commit()
                        return cursor.rowcount
                    finally:
                        # Bug fix: close the cursor even when execute() raises
                        cursor.close()
            except Error as e:
                logger.warning("执行 SQL 失败 (尝试 %s/%s): %s", attempt, retry, e)
                time.sleep(1)
                if attempt == retry:
                    raise

    def fetch_all(self, sql, params=None, retry=3):
        """Run a SELECT and return all rows as dicts, retrying on errors."""
        for attempt in range(1, retry + 1):
            try:
                with self.get_conn() as conn:
                    cursor = conn.cursor(dictionary=True)
                    try:
                        cursor.execute(sql, params or ())
                        return cursor.fetchall()
                    finally:
                        # Bug fix: close the cursor even when the query raises
                        cursor.close()
            except Error as e:
                logger.warning("查询失败 (尝试 %s/%s): %s", attempt, retry, e)
                time.sleep(1)
                if attempt == retry:
                    raise

    def fetch_one(self, sql, params=None, retry=3):
        """Run a SELECT and return the first row, or None when empty."""
        result = self.fetch_all(sql, params, retry)
        return result[0] if result else None

    def bulk_insert(self, sql: str, rows: list[tuple]):
        """Insert many rows with executemany and commit once."""
        with self.get_conn() as conn:
            cursor = conn.cursor()
            try:
                cursor.executemany(sql, rows)
                conn.commit()
            finally:
                cursor.close()

    @contextmanager
    def transaction(self):
        """Transaction context manager: commit on success, rollback on error.

        Usage:
            with db.transaction() as cursor:
                cursor.execute(...)
                cursor.execute(...)
        """
        with self.get_conn() as conn:
            # Bug fix: create the cursor before the try block; previously a
            # failing conn.cursor() left `cursor` unbound in the finally.
            cursor = conn.cursor()
            try:
                yield cursor
                conn.commit()
            except BaseException:
                # Was a bare `except:`; BaseException keeps identical semantics
                # (rollback also on KeyboardInterrupt/SystemExit) while being
                # explicit.
                conn.rollback()
                raise
            finally:
                cursor.close()
|
||||
|
||||
# ================= Usage example =================
if __name__ == "__main__":
    helper = MySQLHelper(
        host="localhost",
        user="root",
        password="123456",
        database="test_db",
        pool_size=5
    )

    # Insert a row
    helper.execute("INSERT INTO users(name, age) VALUES (%s, %s)", ("Alice", 25), commit=True)

    # Query rows
    adults = helper.fetch_all("SELECT * FROM users WHERE age > %s", (18,))
    logger.info("查询结果: %s", adults)

    # Transaction example: both statements commit or roll back together
    try:
        with helper.transaction() as txn_cursor:
            txn_cursor.execute("UPDATE users SET age = age + 1 WHERE name = %s", ("Alice",))
            txn_cursor.execute("INSERT INTO logs(message) VALUES (%s)", ("Alice age updated",))
    except Error as e:
        logger.error("事务失败: %s", e)
|
||||
73
queries.sql
Normal file
73
queries.sql
Normal file
@@ -0,0 +1,73 @@
|
||||
-- Schema for the Kenney asset-pack database.
-- FK checks are disabled so the tables can be dropped in any order,
-- then re-enabled before the schema is recreated.
SET FOREIGN_KEY_CHECKS = 0;
DROP TABLE IF EXISTS category;
DROP TABLE IF EXISTS series;
DROP TABLE IF EXISTS tag;
DROP TABLE IF EXISTS asset_pack;
DROP TABLE IF EXISTS asset_pack_image;
DROP TABLE IF EXISTS asset_pack_tag;
DROP TABLE IF EXISTS update_log;
SET FOREIGN_KEY_CHECKS = 1;

-- Lookup table of category names.
CREATE TABLE category (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(100) NOT NULL UNIQUE
);

-- Lookup table of series names.
CREATE TABLE series (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(100) NOT NULL UNIQUE
);

-- Lookup table of tag names.
CREATE TABLE tag (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(100) NOT NULL UNIQUE
);

-- One row per asset pack; belongs to a category, optionally to a series.
CREATE TABLE asset_pack (
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(255) NOT NULL UNIQUE,
    category_id INT NOT NULL,
    series_id INT,  -- nullable: not every pack belongs to a series
    discovered_at DATETIME DEFAULT CURRENT_TIMESTAMP,  -- presumably when the row was first created — confirm
    released_at DATE,
    updated_at DATE,
    base_asset_path VARCHAR(255),
    orig_page_link VARCHAR(255),
    FOREIGN KEY (category_id) REFERENCES category(id) ON DELETE RESTRICT ON UPDATE CASCADE,
    FOREIGN KEY (series_id) REFERENCES series(id) ON DELETE SET NULL ON UPDATE CASCADE,
    INDEX idx_name (name),
    INDEX idx_category (category_id),
    INDEX idx_series (series_id)
);

-- Preview images per pack; rows are removed with their pack (CASCADE).
CREATE TABLE asset_pack_image (
    id INT PRIMARY KEY AUTO_INCREMENT,
    asset_pack_id INT NOT NULL,
    image_file_name VARCHAR(255),
    orig_file_link VARCHAR(255),
    FOREIGN KEY (asset_pack_id) REFERENCES asset_pack(id) ON DELETE CASCADE,
    INDEX idx_asset_pack_id (asset_pack_id)
);

-- Many-to-many join between asset packs and tags.
CREATE TABLE asset_pack_tag (
    asset_pack_id INT NOT NULL,
    tag_id INT NOT NULL,
    PRIMARY KEY (asset_pack_id, tag_id),
    FOREIGN KEY (asset_pack_id) REFERENCES asset_pack(id) ON DELETE CASCADE,
    FOREIGN KEY (tag_id) REFERENCES tag(id) ON DELETE CASCADE
);

-- Release history: one row per changelog entry of a pack.
CREATE TABLE update_log (
    id INT PRIMARY KEY AUTO_INCREMENT,
    asset_pack_id INT NOT NULL,
    released_date DATE NOT NULL,
    version VARCHAR(20),
    description VARCHAR(500),
    files_count INT UNSIGNED DEFAULT 0,
    feat_animations TINYINT(1) DEFAULT 0,  -- boolean flag from the changelog data
    feat_variations TINYINT(1) DEFAULT 0,  -- boolean flag from the changelog data
    zip_file_name VARCHAR(255),
    orig_download_link VARCHAR(255),
    FOREIGN KEY (asset_pack_id) REFERENCES asset_pack(id) ON DELETE CASCADE,
    INDEX idx_released_date (released_date)
);
|
||||
@@ -1,11 +0,0 @@
|
||||
beautifulsoup4==4.13.4
|
||||
certifi==2025.1.31
|
||||
charset-normalizer==3.4.1
|
||||
colorama==0.4.6
|
||||
idna==3.10
|
||||
lxml==5.3.2
|
||||
requests==2.32.3
|
||||
soupsieve==2.6
|
||||
tqdm==4.67.1
|
||||
typing_extensions==4.13.2
|
||||
urllib3==2.4.0
|
||||
@@ -1,136 +0,0 @@
|
||||
import requests
|
||||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
# Listing pages: https://www.kenney.nl/assets/page:1 .. page:13
base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []  # collected asset-page URLs

# Minimal User-Agent so requests are not served a bot response
headers = {
    "User-Agent": "Mozilla/5.0"
}
|
||||
|
||||
|
||||
def parse_resource_page(url):
    """Scrape a single Kenney asset page.

    Returns a dict with keys:
      'title'      - page heading text, or 'N/A'
      'properties' - {key: str or [str, ...]} from the first info table
      'changelog'  - [{'date', 'version', 'description'}, ...]
      'download'   - absolute .zip URL, or "N/A" when none found
      'images'     - image URLs, cover image first
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Resource title
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Property table: two columns per row — "Key:" | value
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    # Multi-valued cell (e.g. tags): keep link texts as a list
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Changelog table: date | <span>version</span> [<span>description</span>]
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                changelog.append({
                    'date': date,
                    'version': version,
                    'description': description
                })
    result['changelog'] = changelog

    # Download link: first anchor whose href ends in ".zip";
    # relative URLs are made absolute against www.kenney.nl
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = "https://www.kenney.nl" + \
                href if href.startswith("/") else href
            break
    result['download'] = zip_link if zip_link else "N/A"

    # Image extraction
    images = []

    # Cover image
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)

    # Gallery images
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)

    result['images'] = images

    return result
|
||||
|
||||
|
||||
# Crawl every listing page and collect the asset-page links.
# for page in range(1, total_pages + 1):
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Listing container: //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")

    if content_div:
        # Each direct child div is one asset card containing a link
        item_divs = content_div.find_all("div", recursive=False)

        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                full_link = link
                all_links.append(full_link)

print(f"总共提取到 {len(all_links)} 个链接 ✅")

# Persist the link list, one URL per line
with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")

# Scrape each collected asset page
all_resource_data = []

# for link in all_links:
for link in tqdm(all_links, desc="Fetching all assets' data"):
    resource_data = parse_resource_page(link)
    all_resource_data.append(resource_data)

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("数据爬取完成 ✅")
|
||||
@@ -1,72 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
|
||||
# === Paths ===
json_path = "kenney_data.json"  # scraped metadata produced by the crawler
output_dir = "kenney_assets"    # root folder for downloaded zips

# === Load the scraped metadata ===
with open(json_path, "r", encoding="utf-8") as f:
    resources = json.load(f)
|
||||
|
||||
# === 工具函数 ===
|
||||
|
||||
|
||||
def sanitize_filename(name):
|
||||
return "".join(c for c in name if c.isalnum() or c in "._- ()").strip()
|
||||
|
||||
|
||||
def download_zip(entry):
    """Download one asset pack's zip into output_dir/<Category>[/<Series>]/.

    Skips the download when the target file already exists; always
    sleeps a random 1.5-4.0s afterwards to pace requests.
    """
    title = entry["title"]
    # Latest version comes from the first changelog entry; default "1.0"
    version = entry["changelog"][0]["version"] if entry["changelog"] else "1.0"
    download_url = entry.get("download")

    # Category and optional series determine the folder layout
    category = entry["properties"].get("Category", ["Uncategorized"])[0]
    series = entry["properties"].get("Series", [None])[0]

    # Build the directory structure
    folder_path = os.path.join(output_dir, sanitize_filename(category))
    if series:
        folder_path = os.path.join(folder_path, sanitize_filename(series))
    os.makedirs(folder_path, exist_ok=True)

    # Build the target file path
    filename = f"{sanitize_filename(title)} V{version}.zip"
    filepath = os.path.join(folder_path, filename)

    if os.path.exists(filepath):
        # NOTE(review): the literal "(unknown)" in these messages looks like
        # a lost f-string placeholder (probably the file path) — confirm.
        print(f"✅ 已存在,跳过: (unknown)")
        return

    try:
        print(f"⬇️ 开始下载: (unknown)")
        with requests.get(download_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(filepath, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f"✅ 下载完成: (unknown)")
    except Exception as e:
        print(f"❌ 下载失败: (unknown) - {e}")

    # Random wait to mimic a human visitor
    time.sleep(random.uniform(1.5, 4.0))
|
||||
|
||||
|
||||
# === Start batch download ===
# Skips the first 156 entries (presumably resuming an earlier run — confirm)
idx = 0
for resource in tqdm(resources, desc="处理资源"):
    if idx < 156:
        idx += 1
        continue
    if "download" in resource and resource["download"].endswith(".zip"):
        download_zip(resource)
    idx += 1

print("\n✅ 所有资源处理完成")
|
||||
@@ -1,61 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from urllib.parse import urlparse, unquote
|
||||
from tqdm import tqdm
|
||||
|
||||
# ========== Configuration ==========
json_path = "kenney_data.json"        # scraped metadata path
output_root = "kenney_assets_images"  # root folder for downloaded images
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
|
||||
# ========== 工具函数 ==========
|
||||
|
||||
|
||||
def sanitize_filename(name):
|
||||
return "".join(c for c in name if c.isalnum() or c in "._- ()").strip()
|
||||
|
||||
|
||||
def download_image(url, save_path):
    """Download one image to save_path, skipping files that already exist.

    Failures are reported (not raised); every attempt is followed by a
    random 1.5-4.0s pause to pace requests.
    """
    if os.path.exists(save_path):
        print(f"✅ 已存在,跳过: {save_path}")
        return
    try:
        # Bug fix: check the HTTP status and close the response. The old
        # code saved error-page bodies (e.g. 404 HTML) as image files and
        # never released the connection.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"✅ 下载完成: {save_path}")
    except Exception as e:
        print(f"❌ 下载失败: {url} - {e}")
    time.sleep(random.uniform(1.5, 4.0))  # mimic a human visitor
|
||||
|
||||
|
||||
# ========== Load the scraped metadata ==========
with open(json_path, "r", encoding="utf-8") as f:
    resources = json.load(f)

# ========== Batch download ==========
for entry in tqdm(resources, desc="处理资源"):
    title = entry["title"]
    category = entry["properties"].get("Category", ["Uncategorized"])[0]
    series = entry["properties"].get("Series", [None])[0]
    images = entry.get("images", [])

    # Folder layout: Category/Series/Title/
    path = os.path.join(output_root, sanitize_filename(category))
    if series:
        path = os.path.join(path, sanitize_filename(series))
    path = os.path.join(path, sanitize_filename(title))
    os.makedirs(path, exist_ok=True)

    for img_url in images:
        parsed_url = urlparse(img_url)
        img_name = os.path.basename(parsed_url.path)
        img_name = unquote(img_name)  # decode URL escapes, e.g. %20 -> space
        img_path = os.path.join(path, img_name)
        download_image(img_url, img_path)

print("\n🎉 所有图片处理完成!")
|
||||
@@ -1,68 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
# === 配置路径 ===
|
||||
input_json = "kenney_data.json"
|
||||
output_json = "kenney_data_local.json"
|
||||
|
||||
zip_root = "kenney_assets"
|
||||
img_root = "kenney_assets_images"
|
||||
|
||||
# === 工具函数 ===
|
||||
|
||||
|
||||
def sanitize_filename(name):
|
||||
return "".join(c for c in name if c.isalnum() or c in "._- ()").strip()
|
||||
|
||||
|
||||
def build_zip_path(entry):
    """Return the expected local zip path: zip_root/<Category>[/<Series>]/<Title> V<version>.zip."""
    props = entry["properties"]
    category = props.get("Category", ["Uncategorized"])[0]
    series = props.get("Series", [None])[0]
    changelog = entry["changelog"]
    version = changelog[0]["version"] if changelog else "1.0"

    parts = [zip_root, sanitize_filename(category)]
    if series:
        parts.append(sanitize_filename(series))
    parts.append(f"{sanitize_filename(entry['title'])} V{version}.zip")
    return os.path.join(*parts)
|
||||
|
||||
|
||||
def build_image_paths(entry):
    """Return local paths for entry's images under img_root/<Category>[/<Series>]/<Title>/."""
    props = entry["properties"]
    category = props.get("Category", ["Uncategorized"])[0]
    series = props.get("Series", [None])[0]

    segments = [img_root, sanitize_filename(category)]
    if series:
        segments.append(sanitize_filename(series))
    segments.append(sanitize_filename(entry["title"]))
    folder = os.path.join(*segments)

    return [
        os.path.join(folder, unquote(os.path.basename(urlparse(img_url).path)))
        for img_url in entry.get("images", [])
    ]
|
||||
|
||||
|
||||
# === Main processing: rewrite remote URLs to local file paths ===
with open(input_json, "r", encoding="utf-8") as f:
    data = json.load(f)

for entry in data:
    if "download" in entry and entry["download"].endswith(".zip"):
        zip_path = build_zip_path(entry)
        if os.path.exists(zip_path):
            entry["download"] = zip_path  # replace with the local path
    if "images" in entry and isinstance(entry["images"], list):
        entry["images"] = build_image_paths(entry)

# === Save the modified JSON ===
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ 已更新 JSON:本地路径写入完毕!")
|
||||
33
思路.txt
33
思路.txt
@@ -1,33 +0,0 @@
|
||||
我想要爬取这个网页 https://www.kenney.nl/assets/page:1 下(共有 13 页)
|
||||
//*[@id="content"]/section/div/div[1] 这个元素(里边是 n 个 div 元素)
|
||||
我需要获取每个 Div 元素的
|
||||
//*[@id="content"]/section/div/div[1]/div[1]/div/a 这个 a 元素
|
||||
将它的链接保存成一个列表
|
||||
|
||||
然后依次访问这个列表中的所有页面,
|
||||
|
||||
CSV 格式如下:
|
||||
资源名称, 分类, 系列, 资源数, 标签(有多个),资源数量
|
||||
|
||||
访问这个界面,然后我需要获取它的所有信息:
|
||||
资源名称://*[@id="content"]/section/div/div/div[1]/h1
|
||||
|
||||
各种属性://*[@id="content"]/section/div/div/div[1]/table[1]/tbody
|
||||
上面这个表,他是个两列 N 行的表。
|
||||
表的左侧可以看作是 key: Category, Series, Assets, Variation(s), Tags, License 等……
|
||||
表的右侧是 Value,但它可能会有各种组成方式,下边是常见的:
|
||||
- 普通的 td > a
|
||||
- 只有 td
|
||||
- td 之后有多个 a(如 Tags)
|
||||
|
||||
更新记录://*[@id="content"]/section/div/div/div[1]/table[2]/tbody
|
||||
它也是个两列 N 行的表,
|
||||
左侧是更新日期 dd/mm/yyyy
|
||||
右侧是 td > 第一个 span 是版本,第二个 span 是更新内容,有时候没有第二个 span
|
||||
|
||||
继续增加 parse_resource_page(url) 函数,
|
||||
我需要获取它的封面图://*[@id="content"]/section/div/div/div[2]/a/img
|
||||
以及其他图片(如有)
|
||||
//*[@id="content"]/section/div/div/div[2]/div 这里子元素可能有 n 个 div,我需要拿到它的 img
|
||||
//*[@id="content"]/section/div/div/div[2]/div/div[1]/a/img
|
||||
封面图存了之后,也得加在图集的数组里。
|
||||
Reference in New Issue
Block a user