import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []

headers = {
    "User-Agent": "Mozilla/5.0"
}

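# Optional hardening (a sketch, not used by the original flow below):
# requests.get() has no default timeout, and kenney.nl may throttle rapid
# requests. A shared Session with retries is one common way to make the crawl
# more robust; the retry/backoff numbers here are illustrative assumptions,
# not tuned values.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update(headers)
session.mount("https://", HTTPAdapter(
    max_retries=Retry(total=3, backoff_factor=1,
                      status_forcelist=[429, 500, 502, 503])))
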
def parse_resource_page(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Resource name
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Properties table (key/value rows; multi-valued cells hold <a> tags)
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Update history
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                changelog.append({
                    'date': date,
                    'version': version,
                    'description': description
                })
    result['changelog'] = changelog

    # Download link: take the first .zip anchor on the page
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = ("https://www.kenney.nl" + href) if href.startswith("/") else href
            break
    result['download'] = zip_link if zip_link else "N/A"

    # ✅ Image extraction
    images = []

    # Cover image
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)

    # Gallery images
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)

    result['images'] = images

    return result

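# Quick selector sanity check (the asset URL below is an assumption; any
# current kenney.nl asset page works). Handy when adjusting the CSS selectors
# above without re-crawling every listing page:
# print(parse_resource_page("https://www.kenney.nl/assets/platformer-kit"))
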
# for page in range(1, total_pages + 1):
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Locate //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")

    if content_div:
        item_divs = content_div.find_all("div", recursive=False)

        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                # Listing hrefs may be relative; resolve them here so that
                # parse_resource_page() can fetch each one directly.
                full_link = "https://www.kenney.nl" + link if link.startswith("/") else link
                all_links.append(full_link)

print(f"总共提取到 {len(all_links)} 个链接 ✅")
|
||
|
||
with open("kenney_links.txt", "w", encoding="utf-8") as f:
|
||
for link in all_links:
|
||
f.write(link + "\n")
|
||
|
||
# Scrape the details of each asset page
all_resource_data = []

# for link in all_links:
for link in tqdm(all_links, desc="Fetching all assets' data"):
    resource_data = parse_resource_page(link)
    all_resource_data.append(resource_data)

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("Scraping complete ✅")
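
# Follow-up sketch (commented out on purpose: it would download every archive).
# The collected "download" fields can feed a simple bulk downloader; streaming
# in chunks avoids holding whole zips in memory. The "kenney_zips" directory
# name is an arbitrary choice.
# import os
# os.makedirs("kenney_zips", exist_ok=True)
# for item in all_resource_data:
#     if item["download"] != "N/A":
#         r = requests.get(item["download"], headers=headers, stream=True)
#         fname = os.path.join("kenney_zips", item["download"].rsplit("/", 1)[-1])
#         with open(fname, "wb") as zf:
#             for chunk in r.iter_content(chunk_size=8192):
#                 zf.write(chunk)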