Files
kenney-asset-scrapper/scrapper/1 main.py
2025-04-22 09:11:40 +08:00

137 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm, trange
# Paginated asset listing; the 1-based page number is appended to this prefix.
base_url = "https://www.kenney.nl/assets/page:"
# Number of listing pages to walk — hard-coded to the site's pagination
# at the time of writing; TODO confirm it hasn't grown.
total_pages = 13
# Accumulates the URL of every asset detail page found across all pages.
all_links = []
# Minimal browser-like User-Agent so the site serves normal HTML.
headers = {
"User-Agent": "Mozilla/5.0"
}
_KENNEY_BASE = "https://www.kenney.nl"


def _absolutize(href):
    """Return *href* as an absolute URL, prefixing site-relative paths."""
    return _KENNEY_BASE + href if href.startswith("/") else href


def _parse_properties(soup):
    """Extract the key/value attribute table (first table on the page).

    Cells rendered as links become a list of link texts; plain cells
    become a single stripped string. Rows without exactly two <td>s are
    skipped, matching the page's layout.
    """
    properties = {}
    table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if table:
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) != 2:
                continue
            key = cols[0].text.strip().rstrip(':')
            links = cols[1].find_all('a')
            properties[key] = ([a.text.strip() for a in links]
                               if links else cols[1].text.strip())
    return properties


def _parse_changelog(soup):
    """Extract the update-history table (second table on the page)."""
    changelog = []
    table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if table:
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) != 2:
                continue
            # Second cell holds up to two spans: version, then description.
            spans = cols[1].find_all('span')
            changelog.append({
                'date': cols[0].text.strip(),
                'version': spans[0].text.strip() if len(spans) >= 1 else '',
                'description': spans[1].text.strip() if len(spans) >= 2 else '',
            })
    return changelog


def _find_download_link(soup):
    """Return the first .zip link on the page as an absolute URL, or "N/A"."""
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            return _absolutize(href)
    return "N/A"


def _collect_images(soup):
    """Collect the cover image plus every gallery image, as absolute URLs."""
    images = []
    # Cover image
    cover = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover and cover.get("src"):
        images.append(_absolutize(cover["src"]))
    # Gallery thumbnails
    for div in soup.select(
            '#content > section > div > div > div:nth-of-type(2) > div > div'):
        img = div.select_one("a > img")
        if img and img.get("src"):
            images.append(_absolutize(img["src"]))
    return images


def parse_resource_page(url):
    """Scrape a single Kenney asset detail page.

    Parameters:
        url: absolute URL of the asset page.

    Returns:
        dict with keys 'title', 'properties' (dict), 'changelog'
        (list of dicts), 'download' (zip URL or "N/A") and
        'images' (list of URLs).
    """
    # Timeout added so one stalled request cannot hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    # Asset title
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    return {
        'title': title_tag.text.strip() if title_tag else 'N/A',
        'properties': _parse_properties(soup),
        'changelog': _parse_changelog(soup),
        'download': _find_download_link(soup),
        'images': _collect_images(soup),
    }
# --- Stage 1: walk the paginated listing and collect asset-page links ---
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    # Timeout added so one stalled request cannot hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    # Listing container: //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")
    if content_div:
        for item_div in content_div.find_all("div", recursive=False):
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                # hrefs may be site-relative; resolve against the page URL
                # (urljoin is a no-op for already-absolute links).
                all_links.append(urljoin(url, a_tag["href"]))

print(f"总共提取到 {len(all_links)} 个链接 ✅")

# Persist the link list so a later run can skip stage 1.
with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")

# --- Stage 2: scrape every asset page's metadata ---
all_resource_data = []
for link in tqdm(all_links, desc="Fetching all assets' data"):
    all_resource_data.append(parse_resource_page(link))

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("数据爬取完成 ✅")