Files
xkwsoftlist/process_urls.py
2026-02-09 08:59:54 +08:00

134 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
# Configuration: input URL list, output directory, request headers, concurrency.
URL_LIST_FILE = '/Users/liushuming/projects/xkwsoftlist/url_list.txt'
OUTPUT_DIR = '/Users/liushuming/projects/xkwsoftlist/output_html'
# Desktop-Chrome User-Agent so the site serves the normal HTML page.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
CONCURRENCY = 10 # Worker count; keep modest to avoid getting blocked by the server
def extract_id_from_url(url):
    """Return the numeric soft ID embedded in a ``/soft/<id>.html`` URL.

    Returns ``None`` when the URL does not match that pattern.
    """
    found = re.search(r'/soft/(\d+)\.html', url)
    return int(found.group(1)) if found else None
def fetch_publish_date(soft_id):
    """Fetch the publish date of one zxxk.com soft entry.

    Args:
        soft_id: Numeric soft ID to look up.

    Returns:
        A ``(soft_id, result)`` tuple where ``result`` is either a
        ``YYYY-MM-DD`` date string or one of the sentinel strings
        ``"blocked"``, ``"error_<status>"``, or ``"nodata"``.
    """
    url = f"https://www.zxxk.com/soft/{soft_id}.html"
    try:
        # Use a Session for cookie handling; the `with` block closes the
        # underlying connection pool (the original leaked it on every call).
        with requests.Session() as session:
            response = session.get(url, headers=HEADERS, timeout=10)
        # A page containing a JS challenge means we were rate-limited/blocked.
        if "check()" in response.text and "<script" in response.text:
            print(f"ID {soft_id}: Blocked by JS challenge (Status: {response.status_code})")
            return soft_id, "blocked"
        if response.status_code != 200:
            return soft_id, f"error_{response.status_code}"
        soup = BeautifulSoup(response.text, 'html.parser')
        # The publish metadata lives inside the "other-info" container.
        other_info = soup.find('div', class_='other-info')
        if not other_info:
            return soft_id, "nodata"
        # Require all three sub-nodes so a partially rendered page is not misread.
        time_node = other_info.find('div', class_='time')
        views_node = other_info.find('div', class_='views')
        download_node = other_info.find('div', class_='download')
        if time_node and views_node and download_node:
            # Extract the date, e.g. "2023-04-26 发布" -> "2023-04-26".
            date_text = time_node.get_text(strip=True)
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
            if date_match:
                return soft_id, date_match.group(1)
        return soft_id, "nodata"
    except Exception as e:
        # Degrade any network/parse failure to "nodata" so one bad ID
        # does not abort the whole batch.
        print(f"Error fetching ID {soft_id}: {e}")
        return soft_id, "nodata"
# Number of neighbouring IDs inspected on each side of every original ID.
_WINDOW = 20


def _read_original_ids(path, limit=2):
    """Parse soft IDs from the URL list file at *path*.

    Only the first *limit* IDs are kept (debug limit carried over from the
    original script); pass ``limit=None`` to read every URL.
    """
    ids = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if 'zxxk.com/soft/' not in url:
                continue
            soft_id = extract_id_from_url(url)
            if soft_id:
                ids.append(soft_id)
                if limit is not None and len(ids) >= limit:
                    break
    return ids


def _fetch_dates(needed_ids):
    """Concurrently fetch publish dates; returns ``{id: date_string}``."""
    id_to_date = {}
    total = len(needed_ids)
    done = 0
    print("Starting fetching dates...")
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        futures = {executor.submit(fetch_publish_date, sid): sid for sid in needed_ids}
        for future in as_completed(futures):
            sid, date = future.result()
            id_to_date[sid] = date
            done += 1
            if done % 100 == 0 or done == total:
                print(f"Progress: {done}/{total}")
    return id_to_date


def _write_report(oid, id_to_date):
    """Write one HTML table for *oid* covering oid-_WINDOW .. oid+_WINDOW."""
    filepath = os.path.join(OUTPUT_DIR, f"{oid}.html")
    # Build the document in memory and write it once instead of many
    # tiny write() calls.
    parts = [
        '<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n',
        f'<title>ID {oid} 记录</title>\n',
        '<style>table { border-collapse: collapse; width: 500px; } td { border: 1px solid #ccc; padding: 5px; } .current { color: red; font-weight: bold; }</style>\n',
        '</head>\n<body>\n',
        '<table>\n',
    ]
    for i in range(oid - _WINDOW, oid + _WINDOW + 1):
        date = id_to_date.get(i, "nodata")
        url = f"https://www.zxxk.com/soft/{i}.html"
        parts.append(' <tr>\n')
        # Highlight the original ID's own row.
        if i == oid:
            parts.append(f' <td><a href="{url}" class="current">{i}</a></td>\n')
        else:
            parts.append(f' <td><a href="{url}">{i}</a></td>\n')
        parts.append(f' <td>{date}</td>\n')
        parts.append(' </tr>\n')
    parts.append('</table>\n</body>\n</html>')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))


def main():
    """Fetch publish dates around each listed soft ID and emit HTML reports."""
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1. Read and parse the original IDs.
    original_ids = _read_original_ids(URL_LIST_FILE)
    print(f"Total original IDs: {len(original_ids)}")

    # 2. Collect the de-duplicated set of every ID to query.
    all_needed_ids = set()
    for oid in original_ids:
        all_needed_ids.update(range(oid - _WINDOW, oid + _WINDOW + 1))
    print(f"Total unique IDs to fetch: {len(all_needed_ids)}")

    # 3. Fetch the data concurrently.
    id_to_date = _fetch_dates(all_needed_ids)

    # 4. Generate one HTML file per original ID.
    print("Generating HTML files...")
    for oid in original_ids:
        _write_report(oid, id_to_date)
    print(f"Done! Generated {len(original_ids)} files in {OUTPUT_DIR}")


if __name__ == "__main__":
    main()