Open Source

scrapper/1 main.py (new file, 136 lines)

@@ -0,0 +1,136 @@
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

base_url = "https://www.kenney.nl/assets/page:"
total_pages = 13
all_links = []

headers = {
    "User-Agent": "Mozilla/5.0"
}
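
# Note: requests.get() is called below without a timeout, so a stalled
# connection would hang the crawl. A more defensive wrapper might look like
# this hypothetical fetch() helper (a sketch, not part of the original flow):
def fetch(url, retries=3, timeout=30):
    # Retry transient network failures a few times before giving up.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            if attempt == retries - 1:
                raise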

def parse_resource_page(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    result = {}

    # Resource title
    title_tag = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > h1')
    result['title'] = title_tag.text.strip() if title_tag else 'N/A'

    # Properties table
    properties = {}
    prop_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(1) > tbody')
    if prop_table:
        for row in prop_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                key = cols[0].text.strip().rstrip(':')
                value_links = cols[1].find_all('a')
                if value_links:
                    value = [a.text.strip() for a in value_links]
                else:
                    value = cols[1].text.strip()
                properties[key] = value
    result['properties'] = properties

    # Changelog
    changelog = []
    update_table = soup.select_one(
        '#content > section > div > div > div:nth-of-type(1) > table:nth-of-type(2) > tbody')
    if update_table:
        for row in update_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 2:
                date = cols[0].text.strip()
                spans = cols[1].find_all('span')
                version = spans[0].text.strip() if len(spans) >= 1 else ''
                description = spans[1].text.strip() if len(spans) >= 2 else ''
                changelog.append({
                    'date': date,
                    'version': version,
                    'description': description
                })
    result['changelog'] = changelog

    # Download link
    zip_link = None
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if href.endswith(".zip"):
            zip_link = "https://www.kenney.nl" + href if href.startswith("/") else href
            break
    result['download'] = zip_link if zip_link else "N/A"

    # ✅ Image extraction
    images = []

    # Cover image
    cover_img = soup.select_one(
        '#content > section > div > div > div:nth-of-type(2) > a > img')
    if cover_img and cover_img.get("src"):
        cover_url = cover_img["src"]
        if cover_url.startswith("/"):
            cover_url = "https://www.kenney.nl" + cover_url
        images.append(cover_url)

    # Images in the gallery
    gallery_divs = soup.select(
        '#content > section > div > div > div:nth-of-type(2) > div > div')
    for div in gallery_divs:
        img_tag = div.select_one("a > img")
        if img_tag and img_tag.get("src"):
            img_url = img_tag["src"]
            if img_url.startswith("/"):
                img_url = "https://www.kenney.nl" + img_url
            images.append(img_url)

    result['images'] = images

    return result
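
# For illustration, the dict returned above has this shape (all values here
# are hypothetical, shown only to document the structure):
# {
#     "title": "Pixel Platformer",
#     "properties": {"License": "CC0", "Tags": ["2D", "Tileset"]},
#     "changelog": [{"date": "1 June 2021", "version": "1.1",
#                    "description": "Added tiles"}],
#     "download": "https://www.kenney.nl/.../pixel-platformer.zip",
#     "images": ["https://www.kenney.nl/.../cover.png"]
# }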

# for page in range(1, total_pages + 1):
for page in trange(1, total_pages + 1, desc="Fetching all assets' page links"):
    url = base_url + str(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")

    # Locate //*[@id="content"]/section/div/div[1]
    content_div = soup.select_one(
        "#content > section > div > div:nth-of-type(1)")

    if content_div:
        item_divs = content_div.find_all("div", recursive=False)

        for item_div in item_divs:
            a_tag = item_div.find("a")
            if a_tag and "href" in a_tag.attrs:
                link = a_tag["href"]
                # Normalize relative hrefs, mirroring the zip/image handling
                full_link = "https://www.kenney.nl" + link if link.startswith("/") else link
                all_links.append(full_link)

print(f"Extracted {len(all_links)} links in total ✅")

with open("kenney_links.txt", "w", encoding="utf-8") as f:
    for link in all_links:
        f.write(link + "\n")
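
# If a listing page ever repeats an asset, deduplicating before the detail
# pass would save requests. A minimal, order-preserving sketch (not in the
# original script):
# all_links = list(dict.fromkeys(all_links))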

# Scrape the details from each asset page
all_resource_data = []

# for link in all_links:
for link in tqdm(all_links, desc="Fetching all assets' data"):
    resource_data = parse_resource_page(link)
    all_resource_data.append(resource_data)

with open("kenney_data.json", "w", encoding="utf-8") as f:
    json.dump(all_resource_data, f, ensure_ascii=False, indent=4)

print("Data scraping complete ✅")