import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []

headers = {
    "User-Agent": "Mozilla/5.0"
}

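# Optional hardening (a sketch, not used by the original flow below):
# requests.get() has no default timeout, and kenney.nl may throttle rapid
# requests. A shared Session with retries is one common way to make the crawl
# more robust; the retry/backoff numbers here are illustrative assumptions,
# not tuned values.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(headers)
session.mount("https://", HTTPAdapter(
    max_retries=Retry(total=3, backoff_factor=1,
                      status_forcelist=[429, 500, 502, 503])))
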
def parse_resource_page(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Resource name
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Properties table (key/value rows; multi-valued cells hold <a> tags)
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Update history
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                changelog.append({
                    'date': date,
                    'version': version,
                    'description': description
                })
    result['changelog'] = changelog

    # Download link: take the first .zip anchor on the page
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = ("https://www.kenney.nl" + href) if href.startswith("/") else href
            break
    result['download'] = zip_link if zip_link else "N/A"

    # ✅ Image extraction
    images = []

    # Cover image
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)

    # Gallery images
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)

    result['images'] = images

    return result

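# Quick selector sanity check (the asset URL below is an assumption; any
# current kenney.nl asset page works). Handy when adjusting the CSS selectors
# above without re-crawling every listing page:
# print(parse_resource_page("https://www.kenney.nl/assets/platformer-kit"))
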
# for page in range(1, total_pages + 1):
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Locate //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")

    if content_div:
        item_divs = content_div.find_all("div", recursive=False)

        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                # Listing hrefs may be relative; resolve them here so that
                # parse_resource_page() can fetch each one directly.
                full_link = "https://www.kenney.nl" + link if link.startswith("/") else link
                all_links.append(full_link)

print(f"总共提取到 {len(all_links)} 个链接 ✅")
|
||
|
||
with open("kenney_links.txt", "w", encoding="utf-8") as f:
|
||
for link in all_links:
|
||
f.write(link + "\n")
|
||
|
||
# Scrape the details of each asset page
all_resource_data = []

# for link in all_links:
for link in tqdm(all_links, desc="Fetching all assets' data"):
    resource_data = parse_resource_page(link)
    all_resource_data.append(resource_data)

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("Scraping complete ✅")
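
# Follow-up sketch (commented out on purpose: it would download every archive).
# The collected "download" fields can feed a simple bulk downloader; streaming
# in chunks avoids holding whole zips in memory. The "kenney_zips" directory
# name is an arbitrary choice.
# import os
# os.makedirs("kenney_zips", exist_ok=True)
# for item in all_resource_data:
#     if item["download"] != "N/A":
#         r = requests.get(item["download"], headers=headers, stream=True)
#         fname = os.path.join("kenney_zips", item["download"].rsplit("/", 1)[-1])
#         with open(fname, "wb") as zf:
#             for chunk in r.iter_content(chunk_size=8192):
#                 zf.write(chunk)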