import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

# --- Configuration ---
URL_LIST_FILE = '/Users/liushuming/projects/xkwsoftlist/url_list.txt'
OUTPUT_DIR = '/Users/liushuming/projects/xkwsoftlist/output_html'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/120.0.0.0 Safari/537.36'
}
# Number of concurrent workers; keep this low to avoid getting banned.
CONCURRENCY = 10


def extract_id_from_url(url):
    """Return the numeric soft-ID from a zxxk.com ``/soft/<id>.html`` URL.

    Returns ``None`` when the URL does not match the expected pattern.
    """
    match = re.search(r'/soft/(\d+)\.html', url)
    if match:
        return int(match.group(1))
    return None


def fetch_publish_date(soft_id):
    """Fetch ``https://www.zxxk.com/soft/<soft_id>.html`` and extract the publish date.

    Returns a ``(soft_id, date)`` tuple where ``date`` is a ``"YYYY-MM-DD"``
    string on success or the literal ``"nodata"`` on any failure (network
    error, anti-bot interception, or no date found on the page).
    """
    url = f"https://www.zxxk.com/soft/{soft_id}.html"
    try:
        # Use a Session so cookies set by the server persist across redirects.
        session = requests.Session()
        response = session.get(url, headers=HEADERS, timeout=10)

        # Anti-bot detection: the site serves a JS challenge page containing
        # "check()" instead of the real content when it blocks a client.
        if "check()" in response.text:
            return soft_id, "nodata"

        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): the original CSS selector for the publish-date node was
        # lost in this file; as a conservative fallback we scan the visible page
        # text for the first YYYY-MM-DD date (e.g. "2023-04-26"). Confirm the
        # exact element against a live page and tighten the selector if needed.
        date_text = soup.get_text(" ", strip=True)
        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
        if date_match:
            return soft_id, date_match.group(1)
        return soft_id, "nodata"
    except Exception as e:
        # Best-effort scraper: log and report "nodata" rather than aborting the pool.
        print(f"Error fetching ID {soft_id}: {e}")
        return soft_id, "nodata"


def main():
    """Read soft-IDs from URL_LIST_FILE, fetch publish dates for each ID +/- 20,
    and write one HTML table per original ID into OUTPUT_DIR."""
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1. Read and parse the original IDs.
    original_ids = []
    with open(URL_LIST_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if 'zxxk.com/soft/' in url:
                soft_id = extract_id_from_url(url)
                if soft_id:
                    original_ids.append(soft_id)
                    if len(original_ids) >= 2:  # testing: only the first two entries
                        break
    print(f"Total original IDs: {len(original_ids)}")

    # 2. Collect every ID to query (each original ID +/- 20, de-duplicated).
    all_needed_ids = set()
    for oid in original_ids:
        for i in range(oid - 20, oid + 21):
            all_needed_ids.add(i)
    print(f"Total unique IDs to fetch: {len(all_needed_ids)}")

    # 3. Fetch the dates concurrently.
    id_to_date = {}
    total = len(all_needed_ids)
    done = 0
    print("Starting fetching dates...")
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        futures = {executor.submit(fetch_publish_date, sid): sid
                   for sid in all_needed_ids}
        for future in as_completed(futures):
            sid, date = future.result()
            id_to_date[sid] = date
            done += 1
            if done % 100 == 0 or done == total:
                print(f"Progress: {done}/{total}")
            # Slow down slightly to reduce pressure on the remote server.
            # time.sleep(0.05)

    # 4. Generate one HTML file per original ID.
    # NOTE(review): the HTML tag literals in the original f.write() calls were
    # stripped by a text extraction; the markup below is a faithful
    # reconstruction of a simple table page (the original id row highlighted,
    # each id linked to its zxxk.com page).
    print("Generating HTML files...")
    for oid in original_ids:
        filename = f"{oid}.html"
        filepath = os.path.join(OUTPUT_DIR, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n')
            f.write(f'<title>ID {oid} 记录</title>\n')
            f.write('<style>table{border-collapse:collapse}'
                    'td{border:1px solid #ccc;padding:4px 8px}'
                    '.hit{background:#ffec99;font-weight:bold}</style>\n')
            f.write('</head>\n<body>\n')
            f.write('<table>\n')
            for i in range(oid - 20, oid + 21):
                date = id_to_date.get(i, "nodata")
                url = f"https://www.zxxk.com/soft/{i}.html"
                f.write('  <tr>\n')
                if i == oid:
                    # Highlight the row for the original ID itself.
                    f.write(f'    <td class="hit"><a href="{url}">{i}</a></td>\n')
                else:
                    f.write(f'    <td><a href="{url}">{i}</a></td>\n')
                f.write(f'    <td>{date}</td>\n')
                f.write('  </tr>\n')
            f.write('</table>\n')
            f.write('</body>\n</html>\n')

    print(f"Done! Generated {len(original_ids)} files in {OUTPUT_DIR}")


if __name__ == "__main__":
    main()