first commit
This commit is contained in:
133
process_urls.py
Normal file
133
process_urls.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import os
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import time
|
||||
|
||||
# Configuration
URL_LIST_FILE = '/Users/liushuming/projects/xkwsoftlist/url_list.txt'
OUTPUT_DIR = '/Users/liushuming/projects/xkwsoftlist/output_html'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
CONCURRENCY = 10  # number of concurrent workers; keep it low to avoid being blocked
|
||||
|
||||
def extract_id_from_url(url):
    """Return the numeric soft ID embedded in a zxxk.com URL, or None.

    Recognizes paths of the form ``/soft/<digits>.html``; any other URL
    yields None.
    """
    found = re.search(r'/soft/(\d+)\.html', url)
    return int(found.group(1)) if found else None
|
||||
|
||||
def fetch_publish_date(soft_id, session=None):
    """Fetch the publish date shown on one zxxk.com soft page.

    Parameters
    ----------
    soft_id : int
        Numeric ID of the page (``https://www.zxxk.com/soft/<id>.html``).
    session : requests.Session, optional
        Reusable session so callers can share cookies and the connection
        pool across many calls. When omitted, a temporary session is
        created and closed before returning (the original leaked it:
        a fresh Session per call was never closed, so no cookies were
        actually kept either).

    Returns
    -------
    tuple[int, str]
        ``(soft_id, result)`` where ``result`` is a ``YYYY-MM-DD`` string
        on success, or a sentinel: ``"blocked"`` (JS challenge page),
        ``"error_<status>"`` (non-200 response), or ``"nodata"``
        (missing markup, no date, or any exception).
    """
    url = f"https://www.zxxk.com/soft/{soft_id}.html"
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        response = session.get(url, headers=HEADERS, timeout=10)

        # Debug check: the site serves a JS challenge page when it blocks us.
        if "check()" in response.text and "<script" in response.text:
            print(f"ID {soft_id}: Blocked by JS challenge (Status: {response.status_code})")
            return soft_id, "blocked"

        if response.status_code != 200:
            return soft_id, f"error_{response.status_code}"

        soup = BeautifulSoup(response.text, 'html.parser')
        # The publish metadata lives under the "other-info" container.
        other_info = soup.find('div', class_='other-info')
        if not other_info:
            return soft_id, "nodata"

        # Require time, views and download nodes so partial/placeholder
        # pages are treated as missing data rather than half-parsed.
        time_node = other_info.find('div', class_='time')
        views_node = other_info.find('div', class_='views')
        download_node = other_info.find('div', class_='download')

        if time_node and views_node and download_node:
            # Extract the date, e.g. "2023-04-26 发布" -> "2023-04-26".
            date_text = time_node.get_text(strip=True)
            date_match = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
            if date_match:
                return soft_id, date_match.group(1)

        return soft_id, "nodata"
    except Exception as e:
        # Best-effort: network/parse failures become "nodata" so the
        # caller's report generation never crashes on a single bad ID.
        print(f"Error fetching ID {soft_id}: {e}")
        return soft_id, "nodata"
    finally:
        # Close only the session we created ourselves.
        if owns_session:
            session.close()
|
||||
|
||||
def _read_original_ids(path, limit):
    """Parse soft IDs out of the URL list file; stop after *limit* IDs (None = no limit)."""
    ids = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if 'zxxk.com/soft/' not in url:
                continue
            soft_id = extract_id_from_url(url)
            if soft_id:
                ids.append(soft_id)
                if limit is not None and len(ids) >= limit:
                    break
    return ids


def _fetch_all_dates(needed_ids):
    """Concurrently fetch publish dates for *needed_ids*; return {id: date-or-sentinel}."""
    id_to_date = {}
    total = len(needed_ids)
    done = 0

    print("Starting fetching dates...")
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as executor:
        futures = {executor.submit(fetch_publish_date, sid): sid for sid in needed_ids}
        for future in as_completed(futures):
            sid, date = future.result()
            id_to_date[sid] = date
            done += 1
            if done % 100 == 0 or done == total:
                print(f"Progress: {done}/{total}")
            # Optionally slow down to reduce load on the remote server:
            # time.sleep(0.05)
    return id_to_date


def _write_report(oid, id_to_date, radius):
    """Write OUTPUT_DIR/<oid>.html: a table of IDs oid-radius..oid+radius and their dates."""
    filepath = os.path.join(OUTPUT_DIR, f"{oid}.html")
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write('<!DOCTYPE html>\n<html>\n<head>\n<meta charset="utf-8">\n')
        f.write(f'<title>ID {oid} 记录</title>\n')
        f.write('<style>table { border-collapse: collapse; width: 500px; } td { border: 1px solid #ccc; padding: 5px; } .current { color: red; font-weight: bold; }</style>\n')
        f.write('</head>\n<body>\n')
        f.write('<table>\n')

        for i in range(oid - radius, oid + radius + 1):
            date = id_to_date.get(i, "nodata")
            url = f"https://www.zxxk.com/soft/{i}.html"

            f.write(' <tr>\n')
            # Highlight the row of the original ID itself.
            if i == oid:
                f.write(f' <td><a href="{url}" class="current">{i}</a></td>\n')
            else:
                f.write(f' <td><a href="{url}">{i}</a></td>\n')
            f.write(f' <td>{date}</td>\n')
            f.write(' </tr>\n')

        f.write('</table>\n</body>\n</html>')


def main(limit=2, radius=20):
    """Fetch publish dates around each listed soft ID and emit one HTML report per ID.

    Parameters
    ----------
    limit : int or None
        Maximum number of original IDs to process. Defaults to 2 to match
        the original hard-coded debug limit ("test first two entries
        only"); pass None to process the whole list.
    radius : int
        Number of neighbouring IDs to inspect on each side of every
        original ID. Defaults to 20, the value previously duplicated in
        two places.
    """
    # exist_ok avoids the race between an exists() check and makedirs().
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1. Read and parse the original IDs.
    original_ids = _read_original_ids(URL_LIST_FILE, limit)
    print(f"Total original IDs: {len(original_ids)}")

    # 2. Collect every ID that needs querying (deduplicated neighbourhoods).
    all_needed_ids = set()
    for oid in original_ids:
        all_needed_ids.update(range(oid - radius, oid + radius + 1))
    print(f"Total unique IDs to fetch: {len(all_needed_ids)}")

    # 3. Fetch the dates concurrently.
    id_to_date = _fetch_all_dates(all_needed_ids)

    # 4. Generate the HTML files.
    print("Generating HTML files...")
    for oid in original_ids:
        _write_report(oid, id_to_date, radius)

    print(f"Done! Generated {len(original_ids)} files in {OUTPUT_DIR}")
|
||||
|
||||
# Script entry point: run the full pipeline when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user