Open Source

This commit is contained in:
2025-04-22 09:11:40 +08:00
commit a4bf39a958
14 changed files with 1043 additions and 0 deletions

136
scrapper/1 main.py Normal file
View File

@@ -0,0 +1,136 @@
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm, trange
# Listing pages are paginated as .../assets/page:1 ... page:<total_pages>.
base_url = "https://www.kenney.nl/assets/page:"
# Number of listing pages to walk; update if the site grows.
total_pages = 13
# Accumulates every asset-detail URL found on the listing pages.
all_links = []
# Minimal browser-like UA so the site does not reject the requests.
headers = {
    "User-Agent": "Mozilla/5.0"
}
def parse_resource_page(url):
    """Scrape a single kenney.nl asset-detail page.

    Parameters:
        url -- absolute URL of the asset page.

    Returns a dict with keys:
        title      -- asset name, or 'N/A' if the heading is missing
        properties -- {label: str | list[str]} from the first info table
                      (multi-link cells become lists of link texts)
        changelog  -- list of {'date', 'version', 'description'} dicts
                      from the second info table
        download   -- absolute URL of the first .zip link, or 'N/A'
        images     -- cover image URL first, then gallery image URLs
    """
    def _absolute(u):
        # Site-relative paths need the host prepended; full URLs pass through.
        return "https://www.kenney.nl" + u if u.startswith("/") else u

    # timeout guards against one hung connection stalling the whole crawl
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Asset name
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Property table: two-column rows of "Label: value"
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    # e.g. a tag list — keep each link text separately
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Changelog table: date cell | <span>version</span> <span>description</span>
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                spans = cols[1].find_all('span')
                changelog.append({
                    'date': cols[0].text.strip(),
                    'version': spans[0].text.strip() if len(spans) >= 1 else '',
                    'description': spans[1].text.strip() if len(spans) >= 2 else '',
                })
    result['changelog'] = changelog

    # Download link: first anchor on the page whose href ends in .zip
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = _absolute(href)
            break
    result['download'] = zip_link if zip_link else "N/A"

    # Image extraction: cover image first, then the gallery thumbnails
    images = []
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        images.append(_absolute(cover_img["src"]))
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            images.append(_absolute(img_tag["src"]))
    result['images'] = images
    return result
# Phase 1: collect every asset-detail link from the paginated listing.
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    # timeout prevents one dead page from hanging the crawl
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    # Listing container: //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")
    if content_div:
        item_divs = content_div.find_all("div", recursive=False)
        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                # Site-relative hrefs must be made absolute, otherwise
                # parse_resource_page() cannot fetch them later.
                full_link = "https://www.kenney.nl" + link if link.startswith("/") else link
                all_links.append(full_link)

print(f"总共提取到 {len(all_links)} 个链接 ✅")

# Persist the link list so a failed scrape run can be resumed/inspected.
with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")

# Phase 2: scrape each asset page for its details.
all_resource_data = []
for link in tqdm(all_links, desc="Fetching all assets' data"):
    all_resource_data.append(parse_resource_page(link))

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)
print("数据爬取完成 ✅")