"""Crawl kenney.nl asset listings and dump per-asset metadata to JSON.

Walks the paginated asset index, collects every asset-page link, then
scrapes each asset page for its title, property table, changelog,
download link, and preview images.  Results go to ``kenney_links.txt``
and ``kenney_data.json``.
"""
import json

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

# Site root, used to absolutize the site-relative hrefs/srcs the pages emit.
SITE_ROOT = "https://www.kenney.nl"
# Seconds before a single HTTP request is abandoned; without this a stalled
# server would hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 30

base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []
headers = {
    "User-Agent": "Mozilla/5.0"
}

# One session reuses the TCP connection across the dozens of sequential
# requests instead of re-handshaking every time.
_session = requests.Session()
_session.headers.update(headers)


def _absolutize(url):
    """Return *url* prefixed with the site root when it is site-relative."""
    return SITE_ROOT + url if url.startswith("/") else url


def _fetch_soup(url):
    """GET *url* and return a parsed BeautifulSoup tree.

    Raises requests.HTTPError on a non-2xx response so an error page is
    never silently parsed as asset data.
    """
    response = _session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")


def parse_resource_page(url):
    """Scrape one asset page and return its metadata as a dict.

    Keys: ``title`` (str), ``properties`` (dict of str -> str or list[str]),
    ``changelog`` (list of {date, version, description}), ``download``
    (zip URL or ``"N/A"``), ``images`` (list of absolute URLs).
    """
    soup = _fetch_soup(url)
    result = {}

    # Asset title.
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Property table: two-column rows; a cell with <a> tags becomes a list
    # of link texts, otherwise the plain cell text.
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > '
        'table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Changelog: date in the first cell, version/description spans in the
    # second.  Missing spans degrade to empty strings rather than raising.
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > '
        'table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                spans = cols[1].find_all('span')
                changelog.append({
                    'date': cols[0].text.strip(),
                    'version': spans[0].text.strip() if len(spans) >= 1 else '',
                    'description': spans[1].text.strip() if len(spans) >= 2 else '',
                })
    result['changelog'] = changelog

    # Download link: first anchor whose href ends in ".zip".
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = _absolutize(href)
            break
    result['download'] = zip_link if zip_link else "N/A"

    # Preview images: the cover image plus every gallery thumbnail.
    images = []
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        images.append(_absolutize(cover_img["src"]))
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            images.append(_absolutize(img_tag["src"]))
    result['images'] = images

    return result


# Phase 1: walk the paginated index and collect every asset-page link.
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    soup = _fetch_soup(base_url + str(page))
    # Corresponds to the XPath //*[@id="content"]/section/div/div[1].
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")
    if content_div:
        for item_div in content_div.find_all("div", recursive=False):
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                all_links.append(a_tag["href"])

print(f"总共提取到 {len(all_links)} 个链接 ✅")

with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")

# Phase 2: scrape each collected asset page.
all_resource_data = []
for link in tqdm(all_links, desc="Fetching all assets' data"):
    all_resource_data.append(parse_resource_page(link))

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("数据爬取完成 ✅")