186 lines
7.3 KiB
Python
Executable File
186 lines
7.3 KiB
Python
Executable File
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
import random
|
||
import sqlite3
|
||
import datetime
|
||
import threading
|
||
from selenium import webdriver
|
||
from selenium.webdriver.chrome.service import Service
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from webdriver_manager.chrome import ChromeDriverManager
|
||
from config import TOTAL_START_ID, THREAD_COUNT, STEP, ACTIVE_THREADS
|
||
|
||
# Absolute directory containing this script; worker() joins the per-thread
# SQLite file names onto this path.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||
|
||
def get_driver():
    """Create and return a headless Chrome WebDriver for scraping.

    The options use the 'eager' page-load strategy, a fixed desktop
    user-agent string, and content-setting prefs intended to disable
    images, CSS and fonts so pages load faster. The chromedriver binary
    is resolved through ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    opts = Options()
    opts.page_load_strategy = 'eager'

    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        opts.add_argument(flag)

    # Disable images, CSS and fonts to speed up page loading.
    opts.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.stylesheets": 2,
        "profile.managed_default_content_settings.fonts": 2
    })
    opts.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )
|
||
|
||
def init_thread_db(db_path):
    """Open a SQLite database and make sure the ``softinfo`` table exists.

    Args:
        db_path: Database location, passed straight to ``sqlite3.connect``.

    Returns:
        An open ``sqlite3.Connection``. The caller owns it and must close it.

    Raises:
        sqlite3.Error: If the table cannot be created. The connection is
            closed before the error propagates so the handle is not leaked.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS softinfo (
                softid INTEGER PRIMARY KEY,
                softname TEXT,
                softdate TEXT,
                createtime TEXT
            )
        ''')
        conn.commit()
    except Exception:
        # Do not leave a half-initialised connection open on failure.
        conn.close()
        raise
    return conn
|
||
|
||
def get_last_processed_id(conn, start_id, end_id):
    """Return the largest ``softid`` stored within ``[start_id, end_id]``.

    Args:
        conn: Open SQLite connection whose database has a ``softinfo`` table.
        start_id: Lower bound of the ID range (inclusive).
        end_id: Upper bound of the ID range (inclusive).

    Returns:
        The maximum stored ``softid`` inside the range, or ``start_id - 1``
        when the range has no rows yet.
    """
    # Look up the highest ID already recorded inside this thread's range.
    row = conn.execute(
        'SELECT MAX(softid) FROM softinfo WHERE softid >= ? AND softid <= ?',
        (start_id, end_id),
    ).fetchone()

    highest = row[0] if row else None
    if highest is None:
        return start_id - 1
    return highest
|
||
|
||
def _scrape_soft_page(driver, wait, url):
    """Load *url* once and extract the document's title and date.

    Returns:
        A ``(softname, softdate, createtime)`` tuple. All three are ``None``
        when the page has no ``document-basic-data`` container (i.e. the wait
        ended because the error text was present instead).

    Raises:
        Exception: Whatever Selenium raises on timeout or a missing element;
            the caller treats any exception as a failed attempt.
    """
    driver.get(url)
    # Wait until either the basic-data container or the site's error text shows up.
    wait.until(lambda d:
        d.find_elements(By.CLASS_NAME, "document-basic-data") or
        "页面出错了" in d.page_source
    )

    if not driver.find_elements(By.CLASS_NAME, "document-basic-data"):
        return None, None, None

    title_element = driver.find_element(By.CSS_SELECTOR, ".document-basic-data .title")
    softname = title_element.text.strip()

    time_element = driver.find_element(By.CSS_SELECTOR, ".document-basic-data .time")
    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', time_element.text)
    softdate = date_match.group(1) if date_match else None

    createtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return softname, softdate, createtime


def worker(thread_idx, start_id, end_id):
    """Scrape IDs ``start_id..end_id`` into this thread's own SQLite file.

    The thread resumes after the highest ID already stored for its range,
    visits each page with up to three attempts, and stores one row per ID
    that was reached (rows for error pages carry ``None`` fields). An ID
    whose attempts all fail is not stored and not skipped; it is retried
    after the browser has been restarted.

    Args:
        thread_idx: Index of the thread; used in log lines and the DB name.
        start_id: First ID of the range (inclusive).
        end_id: Last ID of the range (inclusive).
    """
    db_name = f'softlist_{thread_idx}.db'
    db_path = os.path.join(BASE_DIR, db_name)
    print(f"Thread-{thread_idx} started: range [{start_id}, {end_id}], DB: {db_name}")

    # Each thread works on its own database file.
    conn = init_thread_db(db_path)
    current_id = get_last_processed_id(conn, start_id, end_id) + 1

    if current_id > end_id:
        print(f"Thread-{thread_idx} has already finished its range.")
        conn.close()
        return

    print(f"Thread-{thread_idx} resuming from: {current_id}")

    driver = None
    try:
        driver = get_driver()
        wait = WebDriverWait(driver, 95)  # slightly increased wait time
        max_retries = 3

        while current_id <= end_id:
            url = f"https://www.zxxk.com/soft/{current_id}.html"

            # Each attempt yields a fresh record, so values read during a
            # failed attempt can never leak into a later successful one.
            record = None
            for attempt in range(max_retries):
                start_time = time.time()
                try:
                    record = _scrape_soft_page(driver, wait, url)
                    break  # got data or confirmed the error page
                except Exception as e:
                    if attempt < max_retries - 1:
                        wait_time = (attempt + 1) * 2
                        print(f"T{thread_idx} | ID {current_id}: Attempt {attempt+1} failed ({e}), retrying in {wait_time}s...")
                        time.sleep(wait_time)
                    else:
                        print(f"T{thread_idx} | ID {current_id}: All {max_retries} attempts failed. {e}")

            if record is None:
                # Nothing is stored for a completely failed ID, and the ID is
                # deliberately not advanced: resuming relies on MAX(softid),
                # so a skipped ID would never be revisited.
                print(f"T{thread_idx} | ID {current_id}: Skipping save due to persistent errors.")
                time.sleep(5)

                # Restart the browser so a crashed or hung session cannot keep
                # the thread retrying forever against a dead driver.
                stale, driver = driver, None
                try:
                    stale.quit()
                except Exception as quit_err:
                    print(f"T{thread_idx} | ID {current_id}: Failed to quit browser cleanly ({quit_err}).")
                driver = get_driver()
                wait = WebDriverWait(driver, 95)
                continue

            # Record only pages that were actually reached (data or error page).
            softname, softdate, createtime = record
            conn.execute('''
                INSERT OR REPLACE INTO softinfo (softid, softname, softdate, createtime)
                VALUES (?, ?, ?, ?)
            ''', (current_id, softname, softdate, createtime))
            conn.commit()

            elapsed = time.time() - start_time
            if softname or current_id % 100 == 0:
                print(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] T{thread_idx} | Saved: {current_id} | {elapsed:.2f}s | {softname}")

            current_id += 1

    except Exception as e:
        print(f"Thread-{thread_idx} fatal error: {e}")
    finally:
        try:
            if driver:
                driver.quit()
        finally:
            # Close the database even if shutting the browser down fails.
            conn.close()
            print(f"Thread-{thread_idx} finished.")
|
||
|
||
def main():
    """Start one worker thread per selected slot and wait for all of them.

    Slot ``i`` covers IDs from ``TOTAL_START_ID + i * STEP`` through the
    following ``STEP - 1`` IDs. When ``ACTIVE_THREADS`` is non-empty, only
    the slots it lists are started; otherwise every slot below
    ``THREAD_COUNT`` is started.
    """
    selected = set(ACTIVE_THREADS) if ACTIVE_THREADS else None
    started = []

    for slot in range(THREAD_COUNT):
        if selected is None or slot in selected:
            # Work out the ID range owned by this slot.
            first_id = TOTAL_START_ID + slot * STEP
            last_id = first_id + STEP - 1

            runner = threading.Thread(target=worker, args=(slot, first_id, last_id))
            started.append(runner)
            runner.start()

            # Stagger start-up so the browser instances are not all
            # created at the same moment.
            time.sleep(1.5)

    for runner in started:
        runner.join()

    print("All threads completed. Goodbye!")
|
||
|
||
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||
|