Files
xkwsoftlistnew/fill_missing.py
2026-02-27 15:30:59 +08:00

238 lines
8.6 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
fill_missing.py
分段检查每个 softlist_*.db 中是否存在缺失 ID若有则重新抓取并写入。
支持断点续传:进度保存在各 db 文件的 repair_progress 表中。
使用与 scrape_to_sqlite.py 相同的配置文件 config.py。
"""
import os
import re
import sqlite3
import datetime
import threading
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from scrape_to_sqlite import get_driver
from config import TOTAL_START_ID, THREAD_COUNT, STEP, ACTIVE_THREADS
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Size of each scanned ID interval (a slice of the softid range per query,
# not a number of rows fetched at once).
CHUNK_SIZE = 10
# ──────────────────────────────────────────────
# Progress-table operations
# ──────────────────────────────────────────────
def init_progress_table(conn):
    """Create the resume-progress table in the db if it does not exist yet."""
    ddl = '''
        CREATE TABLE IF NOT EXISTS repair_progress (
            key TEXT PRIMARY KEY,
            value INTEGER
        )
    '''
    conn.execute(ddl)
    conn.commit()
def get_progress(conn, start_id):
    """Return the last saved scan position; fall back to start_id if none stored."""
    result = conn.execute(
        "SELECT value FROM repair_progress WHERE key='scan_pos'"
    ).fetchone()
    if result is None:
        return start_id
    return result[0]
def save_progress(conn, pos):
    """Persist the current scan position so a later run can resume from it."""
    sql = "INSERT OR REPLACE INTO repair_progress (key, value) VALUES ('scan_pos', ?)"
    conn.execute(sql, (pos,))
    conn.commit()
def clear_progress(conn):
    """Drop the saved scan position after a complete pass, enabling a fresh full re-check next time."""
    cur = conn.cursor()
    cur.execute("DELETE FROM repair_progress WHERE key='scan_pos'")
    conn.commit()
# ──────────────────────────────────────────────
# Chunked missing-ID detection (queries only one small slice)
# ──────────────────────────────────────────────
def find_missing_in_chunk(conn, chunk_start, chunk_end):
    """Return the softids absent from softinfo within [chunk_start, chunk_end].

    Only this small slice of the table is queried — the whole table is never
    read into memory.
    """
    cursor = conn.execute(
        'SELECT softid FROM softinfo WHERE softid >= ? AND softid <= ? ORDER BY softid',
        (chunk_start, chunk_end)
    )
    present = set()
    for (softid,) in cursor:
        present.add(softid)
    return [sid for sid in range(chunk_start, chunk_end + 1) if sid not in present]
# ──────────────────────────────────────────────
# Page scraping
# ──────────────────────────────────────────────
def fetch_one(driver, wait, softid):
    """Scrape a single record and return (softname, softdate, createtime).

    When the page does not exist, softname/softdate are None while createtime
    is still filled in.  Raises on scrape failure (e.g. wait timeout).
    """
    driver.get(f"https://www.zxxk.com/soft/{softid}.html")
    # Wait until either the data block renders or the site's error page shows.
    wait.until(lambda d:
        d.find_elements(By.CLASS_NAME, "document-basic-data") or
        "页面出错了" in d.page_source
    )
    softname = None
    softdate = None
    if driver.find_elements(By.CLASS_NAME, "document-basic-data"):
        softname = driver.find_element(
            By.CSS_SELECTOR, ".document-basic-data .title"
        ).text.strip()
        raw_date = driver.find_element(
            By.CSS_SELECTOR, ".document-basic-data .time"
        ).text
        match = re.search(r'(\d{4}-\d{2}-\d{2})', raw_date)
        if match:
            softdate = match.group(1)
    createtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return softname, softdate, createtime
def repair_id(conn, driver, wait, thread_idx, softid):
    """Re-scrape a single softid, retrying up to 3 times.

    On success the row is written (INSERT OR REPLACE) and the function
    returns.  If all attempts fail, the last exception is re-raised so the
    caller can stop the thread — the saved progress is NOT advanced, which
    makes the id eligible for a retry after a restart.
    """
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            began = time.time()
            softname, softdate, createtime = fetch_one(driver, wait, softid)
            # The DB write sits inside the try on purpose: a write error is
            # retried just like a scrape error.
            conn.execute(
                'INSERT OR REPLACE INTO softinfo (softid, softname, softdate, createtime) VALUES (?, ?, ?, ?)',
                (softid, softname, softdate, createtime)
            )
            conn.commit()
            elapsed = time.time() - began
            print(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] T{thread_idx} | Repaired: {softid} | {elapsed:.2f}s | {softname}")
            return
        except Exception as e:
            backoff = attempt * 2  # linear back-off: 2s, then 4s
            if attempt == attempts:
                print(f"[T{thread_idx}] ID {softid} 重试 {attempts} 次均失败,退出线程(进度未推进,重启后可续传)。")
                raise  # propagate so repair_worker's except branch fires
            print(f"[T{thread_idx}] ID {softid}{attempt}次失败({e}),{backoff}s 后重试...")
            time.sleep(backoff)
# ──────────────────────────────────────────────
# Thread main logic
# ──────────────────────────────────────────────
def repair_worker(thread_idx, start_id, end_id):
    """Scan one thread's softlist_<idx>.db for missing softids and re-scrape them.

    Parameters:
        thread_idx: index used to select the db file and to label log lines.
        start_id / end_id: inclusive softid range owned by this thread.

    The scan position is persisted (repair_progress table) after each fully
    repaired chunk, so a crash or fatal scrape failure can resume later.
    """
    db_name = f'softlist_{thread_idx}.db'
    db_path = os.path.join(BASE_DIR, db_name)
    if not os.path.exists(db_path):
        print(f"[T{thread_idx}] DB 文件不存在,跳过: {db_name}")
        return
    conn = sqlite3.connect(db_path)
    init_progress_table(conn)
    # Effective upper bound for this scan: the largest softid already stored
    # in this thread's slice — no point probing beyond what was ever scraped.
    row = conn.execute(
        'SELECT MAX(softid) FROM softinfo WHERE softid >= ? AND softid <= ?',
        (start_id, end_id)
    ).fetchone()
    if not row or row[0] is None:
        print(f"[T{thread_idx}] 数据库为空,跳过。")
        conn.close()
        return
    actual_end = row[0]
    # Resume from the position saved by a previous (interrupted) run.
    scan_pos = get_progress(conn, start_id)
    if scan_pos > actual_end:
        print(f"[T{thread_idx}] 已全部扫描完毕(上次进度 {scan_pos} > 当前最大 {actual_end}),若需重新检查请清除进度。")
        conn.close()
        return
    print(f"[T{thread_idx}] 开始分段扫描,从 {scan_pos}{actual_end},分块大小 {CHUNK_SIZE}...")
    driver = None
    total_repaired = 0
    try:
        driver = get_driver()
        wait = WebDriverWait(driver, 95)
        chunk_start = scan_pos
        while chunk_start <= actual_end:
            chunk_end = min(chunk_start + CHUNK_SIZE - 1, actual_end)
            missing = find_missing_in_chunk(conn, chunk_start, chunk_end)
            if missing:
                print(f"[T{thread_idx}] [{chunk_start}, {chunk_end}] 缺失 {len(missing)} 个: {missing}")
                for mid in missing:
                    repair_id(conn, driver, wait, thread_idx, mid)  # returns only on success
                    total_repaired += 1
            # Advance (and persist) progress only after the entire chunk is
            # repaired, so a mid-chunk failure is re-scanned on the next run.
            chunk_start = chunk_end + 1
            save_progress(conn, chunk_start)
        # Full pass finished: clear the saved position so a future run
        # performs a fresh full check.
        clear_progress(conn)
        print(f"[T{thread_idx}] 扫描完成。共补充 {total_repaired} 条。")
    except Exception as e:
        # Progress for completed chunks is already saved; next run resumes.
        print(f"[T{thread_idx}] 致命错误: {e}(进度已保存,下次可续传)")
    finally:
        if driver:
            driver.quit()
        conn.close()
# ──────────────────────────────────────────────
# Entry point
# ──────────────────────────────────────────────
def main():
    """Launch one repair worker thread per active db shard and wait for all."""
    if ACTIVE_THREADS:
        active = set(ACTIVE_THREADS)
    else:
        active = set(range(THREAD_COUNT))
    workers = []
    for idx in sorted(active):
        lo = TOTAL_START_ID + idx * STEP
        hi = lo + STEP - 1
        worker = threading.Thread(target=repair_worker, args=(idx, lo, hi))
        workers.append(worker)
        worker.start()
        time.sleep(1.0)  # stagger browser start-up between threads
    for worker in workers:
        worker.join()
    print("所有线程检查/修补完毕。")
if __name__ == "__main__":
main()