Open Source

scrapper/1 main.py (new file, 136 lines)

@@ -0,0 +1,136 @@
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []

headers = {
    "User-Agent": "Mozilla/5.0"
}
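
# Note: requests.get() is called below without a timeout, so a stalled
# connection would hang the crawl. A more defensive wrapper might look like
# this hypothetical fetch() helper (a sketch, not part of the original flow):
def fetch(url, retries=3, timeout=30):
    # Retry transient network failures a few times before giving up.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            if attempt == retries - 1:
                raise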

def parse_resource_page(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Resource title
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Properties table
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Changelog
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                changelog.append({
                    'date': date,
                    'version': version,
                    'description': description
                })
    result['changelog'] = changelog

    # Download link
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = "https://www.kenney.nl" + href if href.startswith("/") else href
            break
    result['download'] = zip_link if zip_link else "N/A"

    # ✅ Image extraction
    images = []

    # Cover image
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)

    # Images in the gallery
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)

    result['images'] = images

    return result
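
# For illustration, the dict returned above has this shape (all values here
# are hypothetical, shown only to document the structure):
# {
#     "title": "Pixel Platformer",
#     "properties": {"License": "CC0", "Tags": ["2D", "Tileset"]},
#     "changelog": [{"date": "1 June 2021", "version": "1.1",
#                    "description": "Added tiles"}],
#     "download": "https://www.kenney.nl/.../pixel-platformer.zip",
#     "images": ["https://www.kenney.nl/.../cover.png"]
# }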

# for page in range(1, total_pages + 1):
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Locate //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")

    if content_div:
        item_divs = content_div.find_all("div", recursive=False)

        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                # Normalize relative hrefs, mirroring the zip/image handling
                full_link = "https://www.kenney.nl" + link if link.startswith("/") else link
                all_links.append(full_link)

print(f"Extracted {len(all_links)} links in total ✅")

with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")
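
# If a listing page ever repeats an asset, deduplicating before the detail
# pass would save requests. A minimal, order-preserving sketch (not in the
# original script):
# all_links = list(dict.fromkeys(all_links))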

# Scrape the details from each asset page
all_resource_data = []

# for link in all_links:
for link in tqdm(all_links, desc="Fetching all assets' data"):
    resource_data = parse_resource_page(link)
    all_resource_data.append(resource_data)

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("Data scraping complete ✅")