diff --git a/config.py b/config.py
new file mode 100755
index 0000000..25099c4
--- /dev/null
+++ b/config.py
@@ -0,0 +1,16 @@
+# ===== Scraper configuration =====
+
+# First softid of the whole crawl
+TOTAL_START_ID = 6000000
+
+# Total number of threads (determines how many ID segments exist)
+THREAD_COUNT = 50
+
+# Number of IDs each thread is responsible for
+STEP = 1000000
+
+# Restrict the run to specific threads (a list of thread indices).
+# Example: [16, 17, 18, 19, 36, 37, 38, 39, 40] runs only those threads.
+# Set to None or an empty list [] to run every thread.
+ACTIVE_THREADS = [7,8,9,10,11,12,13,14,15,16, 17, 18, 19, 36, 37, 38, 39, 40]
+#ACTIVE_THREADS = [16, 17, 18, 19, 36, 37, 38, 39, 40,41,42,43,44]
diff --git a/fill_missing.py b/fill_missing.py
new file mode 100755
index 0000000..cbbb6a3
--- /dev/null
+++ b/fill_missing.py
@@ -0,0 +1,271 @@
+"""
+fill_missing.py
+
+Scan each softlist_*.db in small ID chunks, detect softids that are missing
+from the database, re-fetch them and write them back.
+
+Resumable: the scan position is kept in a repair_progress table inside each
+db file.  Uses the same configuration file (config.py) as scrape_to_sqlite.py.
+"""
+
+import os
+import re
+import sqlite3
+import datetime
+import threading
+import time
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+
+from scrape_to_sqlite import get_driver
+from config import TOTAL_START_ID, THREAD_COUNT, STEP, ACTIVE_THREADS
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Width of the softid interval examined per scan step (an ID range, not a
+# number of rows read at once).
+CHUNK_SIZE = 10
+
+
+# ──────────────────────────────────────────────
+# Progress table helpers
+# ──────────────────────────────────────────────
+
+def init_progress_table(conn):
+    """Create the resume-progress table in the db if it does not exist."""
+    conn.execute('''
+        CREATE TABLE IF NOT EXISTS repair_progress (
+            key TEXT PRIMARY KEY,
+            value INTEGER
+        )
+    ''')
+    conn.commit()
+
+
+def get_progress(conn, start_id):
+    """
+    Return the saved scan position, or start_id when none is stored.
+
+    A stored position below start_id (possible if config.py changed between
+    runs) is clamped to start_id so the scan stays inside the thread's range.
+    """
+    row = conn.execute(
+        "SELECT value FROM repair_progress WHERE key='scan_pos'"
+    ).fetchone()
+    if row is None or row[0] is None:
+        return start_id
+    return max(row[0], start_id)
+
+
+def save_progress(conn, pos):
+    """Persist pos as the current scan position."""
+    conn.execute(
+        "INSERT OR REPLACE INTO repair_progress (key, value) VALUES ('scan_pos', ?)",
+        (pos,)
+    )
+    conn.commit()
+
+
+def clear_progress(conn):
+    """Delete the saved position after a complete scan so the next run re-checks everything."""
+    conn.execute("DELETE FROM repair_progress WHERE key='scan_pos'")
+    conn.commit()
+
+
+# ──────────────────────────────────────────────
+# Chunked gap detection (queries one small interval at a time)
+# ──────────────────────────────────────────────
+
+def find_missing_in_chunk(conn, chunk_start, chunk_end):
+    """
+    Return, in ascending order, the softids in [chunk_start, chunk_end]
+    (inclusive) that have no row in softinfo.
+
+    Only this small interval is queried; the table is never read in full.
+    """
+    # No ORDER BY: the rows only feed a set; the result order comes from range().
+    rows = conn.execute(
+        'SELECT softid FROM softinfo WHERE softid >= ? AND softid <= ?',
+        (chunk_start, chunk_end)
+    ).fetchall()
+    existing = {row[0] for row in rows}
+    return [i for i in range(chunk_start, chunk_end + 1) if i not in existing]
+
+
+# ──────────────────────────────────────────────
+# Page fetching
+# ──────────────────────────────────────────────
+
+def fetch_one(driver, wait, softid):
+    """
+    Fetch one record and return (softname, softdate, createtime).
+
+    When the page shows the site's error text instead of the data container,
+    softname and softdate are None; createtime is always set.  softdate is
+    also None when the time element contains no YYYY-MM-DD date.  Selenium
+    errors (time-out, missing element, ...) propagate to the caller.
+    """
+    url = f"https://www.zxxk.com/soft/{softid}.html"
+    driver.get(url)
+    # Wait until either the data container or the site's error text appears.
+    wait.until(lambda d:
+        d.find_elements(By.CLASS_NAME, "document-basic-data") or
+        "页面出错了" in d.page_source
+    )
+
+    softname = softdate = None
+    elements = driver.find_elements(By.CLASS_NAME, "document-basic-data")
+    if elements:
+        softname = driver.find_element(
+            By.CSS_SELECTOR, ".document-basic-data .title"
+        ).text.strip()
+
+        date_text = driver.find_element(
+            By.CSS_SELECTOR, ".document-basic-data .time"
+        ).text
+        m = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
+        if m:
+            softdate = m.group(1)
+
+    createtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    return softname, softdate, createtime
+
+
+def repair_id(conn, driver, wait, thread_idx, softid):
+    """
+    Re-fetch a single missing softid, trying up to 3 times.
+
+    On success the row is written to softinfo and committed.  If every
+    attempt fails, the last exception is re-raised so the caller can stop the
+    thread without advancing the saved progress.
+    """
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            start_time = time.time()
+            softname, softdate, createtime = fetch_one(driver, wait, softid)
+            conn.execute(
+                'INSERT OR REPLACE INTO softinfo (softid, softname, softdate, createtime) VALUES (?, ?, ?, ?)',
+                (softid, softname, softdate, createtime)
+            )
+            conn.commit()
+            elapsed = time.time() - start_time
+            print(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] T{thread_idx} | Repaired: {softid} | {elapsed:.2f}s | {softname}")
+            return
+        except Exception as e:
+            if attempt < max_retries - 1:
+                wait_sec = (attempt + 1) * 2  # linear back-off: 2s, then 4s
+                print(f"[T{thread_idx}] ID {softid} 第{attempt+1}次失败({e}),{wait_sec}s 后重试...")
+                time.sleep(wait_sec)
+            else:
+                print(f"[T{thread_idx}] ID {softid} 重试 {max_retries} 次均失败,退出线程(进度未推进,重启后可续传)。")
+                raise  # propagate to repair_worker's except branch
+
+
+# ──────────────────────────────────────────────
+# Per-thread logic
+# ──────────────────────────────────────────────
+
+def repair_worker(thread_idx, start_id, end_id):
+    """
+    Scan softlist_<thread_idx>.db for gaps and repair them.
+
+    The scan covers saved-position .. MAX(softid) within [start_id, end_id],
+    CHUNK_SIZE ids at a time.  Progress is saved only after every missing id
+    of a chunk has been repaired, so an aborted run resumes at that chunk.
+    Any error is reported and ends the thread; the db connection and the
+    browser (started lazily, on the first gap found) are always released.
+    """
+    db_name = f'softlist_{thread_idx}.db'
+    db_path = os.path.join(BASE_DIR, db_name)
+
+    if not os.path.exists(db_path):
+        print(f"[T{thread_idx}] DB 文件不存在,跳过: {db_name}")
+        return
+
+    conn = sqlite3.connect(db_path)
+    driver = None
+    wait = None
+    total_repaired = 0
+
+    try:
+        init_progress_table(conn)
+
+        # Upper bound of this scan: the largest id already scraped.
+        row = conn.execute(
+            'SELECT MAX(softid) FROM softinfo WHERE softid >= ? AND softid <= ?',
+            (start_id, end_id)
+        ).fetchone()
+        if not row or row[0] is None:
+            print(f"[T{thread_idx}] 数据库为空,跳过。")
+            return
+        actual_end = row[0]
+
+        # Resume from the position saved by the previous run.
+        scan_pos = get_progress(conn, start_id)
+        if scan_pos > actual_end:
+            print(f"[T{thread_idx}] 已全部扫描完毕(上次进度 {scan_pos} > 当前最大 {actual_end}),若需重新检查请清除进度。")
+            return
+
+        print(f"[T{thread_idx}] 开始分段扫描,从 {scan_pos} 到 {actual_end},分块大小 {CHUNK_SIZE}...")
+
+        chunk_start = scan_pos
+        while chunk_start <= actual_end:
+            chunk_end = min(chunk_start + CHUNK_SIZE - 1, actual_end)
+
+            missing = find_missing_in_chunk(conn, chunk_start, chunk_end)
+            if missing:
+                print(f"[T{thread_idx}] [{chunk_start}, {chunk_end}] 缺失 {len(missing)} 个: {missing}")
+                if driver is None:
+                    # Start the browser only once there is something to fetch.
+                    driver = get_driver()
+                    wait = WebDriverWait(driver, 95)
+                for mid in missing:
+                    repair_id(conn, driver, wait, thread_idx, mid)  # returns only on success
+                    total_repaired += 1
+
+            # Advance the saved position only after the whole chunk is repaired.
+            chunk_start = chunk_end + 1
+            save_progress(conn, chunk_start)
+
+        # Full scan finished: drop the progress record.
+        clear_progress(conn)
+        print(f"[T{thread_idx}] 扫描完成。共补充 {total_repaired} 条。")
+
+    except Exception as e:
+        print(f"[T{thread_idx}] 致命错误: {e}(进度已保存,下次可续传)")
+    finally:
+        if driver:
+            driver.quit()
+        conn.close()
+
+
+# ──────────────────────────────────────────────
+# Entry point
+# ──────────────────────────────────────────────
+
+def main():
+    """Start one repair thread per active segment and wait for all of them."""
+    # Same selection rule as scrape_to_sqlite.main(): only indices inside
+    # range(THREAD_COUNT) are valid; ACTIVE_THREADS merely filters them.
+    active = set(ACTIVE_THREADS) if ACTIVE_THREADS else None
+
+    threads = []
+    for i in range(THREAD_COUNT):
+        if active is not None and i not in active:
+            continue
+        t_start = TOTAL_START_ID + i * STEP
+        t_end = t_start + STEP - 1
+        t = threading.Thread(target=repair_worker, args=(i, t_start, t_end))
+        threads.append(t)
+        t.start()
+        time.sleep(1.0)  # stagger thread start-up
+
+    for t in threads:
+        t.join()
+
+    print("所有线程检查/修补完毕。")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scrape_to_sqlite.py b/scrape_to_sqlite.py
new file mode 100755
index 0000000..b293fd8
--- /dev/null
+++ b/scrape_to_sqlite.py
@@ -0,0 +1,185 @@
+import os
+import re
+import sys
+import time
+import random
+import sqlite3
+import datetime
+import threading
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+from config import TOTAL_START_ID, THREAD_COUNT, STEP, ACTIVE_THREADS
+
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+def get_driver():
+    """Create and return a headless Chrome WebDriver configured for fast loads."""
+    chrome_options = Options()
+    chrome_options.page_load_strategy = 'eager'
+
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+
+    # Disable images, CSS and fonts to speed up page loading.
+    prefs = {
+        "profile.managed_default_content_settings.images": 2,
+        "profile.managed_default_content_settings.stylesheets": 2,
+        "profile.managed_default_content_settings.fonts": 2
+    }
+    chrome_options.add_experimental_option("prefs", prefs)
+    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+    service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=service, options=chrome_options)
+    return driver
+
+def init_thread_db(db_path):
+    """Open the db at db_path, create the softinfo table if missing, return the connection."""
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS softinfo (
+            softid INTEGER PRIMARY KEY,
+            softname TEXT,
+            softdate TEXT,
+            createtime TEXT
+        )
+    ''')
+    conn.commit()
+    return conn
+
+def get_last_processed_id(conn, start_id, end_id):
+    """Return the largest stored softid in [start_id, end_id], or start_id - 1 if none."""
+    cursor = conn.cursor()
+    cursor.execute('SELECT MAX(softid) FROM softinfo WHERE softid >= ? AND softid <= ?', (start_id, end_id))
+    result = cursor.fetchone()
+    if result and result[0] is not None:
+        return result[0]
+    return start_id - 1
+
+def worker(thread_idx, start_id, end_id):
+    """Scrape softids from the resume point up to end_id into this thread's db.
+
+    An ID whose retries all fail is never skipped: the browser is restarted
+    and the same ID is tried again."""
+    db_name = f'softlist_{thread_idx}.db'
+    db_path = os.path.join(BASE_DIR, db_name)
+    print(f"Thread-{thread_idx} started: range [{start_id}, {end_id}], DB: {db_name}")
+
+    # Open (and create if needed) this thread's own database.
+    conn = init_thread_db(db_path)
+    current_id = get_last_processed_id(conn, start_id, end_id) + 1
+
+    if current_id > end_id:
+        print(f"Thread-{thread_idx} has already finished its range.")
+        conn.close()
+        return
+
+    print(f"Thread-{thread_idx} resuming from: {current_id}")
+
+    driver = None
+    try:
+        driver = get_driver()
+        wait = WebDriverWait(driver, 95)  # generous timeout for slow pages
+
+        while current_id <= end_id:
+            url = f"https://www.zxxk.com/soft/{current_id}.html"
+            success = False
+            max_retries = 3
+            for attempt in range(max_retries):
+                # Reset per attempt: a failed attempt must not leak stale fields.
+                softname = softdate = createtime = None
+                start_time = time.time()
+                try:
+                    driver.get(url)
+                    # Wait for the data container or the site's error text.
+                    wait.until(lambda d:
+                        d.find_elements(By.CLASS_NAME, "document-basic-data") or
+                        "页面出错了" in d.page_source
+                    )
+
+                    elements = driver.find_elements(By.CLASS_NAME, "document-basic-data")
+                    if elements:
+                        title_element = driver.find_element(By.CSS_SELECTOR, ".document-basic-data .title")
+                        softname = title_element.text.strip()
+                        time_element = driver.find_element(By.CSS_SELECTOR, ".document-basic-data .time")
+                        date_text = time_element.text
+                        date_match = re.search(r'(\d{4}-\d{2}-\d{2})', date_text)
+                        if date_match:
+                            softdate = date_match.group(1)
+
+                    createtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                    success = True
+                    break  # data fetched, or page confirmed missing
+                except Exception as e:
+                    if attempt < max_retries - 1:
+                        wait_time = (attempt + 1) * 2
+                        print(f"T{thread_idx} | ID {current_id}: Attempt {attempt+1} failed ({e}), retrying in {wait_time}s...")
+                        time.sleep(wait_time)
+                    else:
+                        print(f"T{thread_idx} | ID {current_id}: All {max_retries} attempts failed. {e}")
+
+            # Store a row only when the page was reached (data found or error page seen).
+            if success:
+                cursor = conn.cursor()
+                cursor.execute('''
+                    INSERT OR REPLACE INTO softinfo (softid, softname, softdate, createtime)
+                    VALUES (?, ?, ?, ?)
+                ''', (current_id, softname, softdate, createtime))
+                conn.commit()
+
+                elapsed = time.time() - start_time
+                if softname or current_id % 100 == 0:
+                    print(f"[{datetime.datetime.now().strftime('%H:%M:%S')}] T{thread_idx} | Saved: {current_id} | {elapsed:.2f}s | {softname}")
+                current_id += 1
+            else:
+                # current_id is not advanced (skipping would lose the record).  A dead
+                # WebDriver session never recovers, so swap in a fresh browser first.
+                print(f"T{thread_idx} | ID {current_id}: Skipping save due to persistent errors.")
+                time.sleep(5)
+                stale_driver, driver = driver, None  # `finally` must not quit it again
+                stale_driver.quit()
+                driver = get_driver()
+                wait = WebDriverWait(driver, 95)
+
+    except Exception as e:
+        print(f"Thread-{thread_idx} fatal error: {e}")
+    finally:
+        if driver:
+            driver.quit()
+        conn.close()
+        print(f"Thread-{thread_idx} finished.")
+
+def main():
+    """Start one scraper thread per active segment and wait for all of them."""
+    threads = []
+    active = set(ACTIVE_THREADS) if ACTIVE_THREADS else None
+    for i in range(THREAD_COUNT):
+        if active is not None and i not in active:
+            continue
+        # ID range owned by this thread
+        t_start = TOTAL_START_ID + i * STEP
+        t_end = t_start + STEP - 1
+
+        t = threading.Thread(target=worker, args=(i, t_start, t_end))
+        threads.append(t)
+        t.start()
+
+        # Stagger start-up to avoid launching every browser instance at once.
+        time.sleep(1.5)
+
+    for t in threads:
+        t.join()
+
+    print("All threads completed. Goodbye!")
+
+if __name__ == "__main__":
+    main()
+
diff --git a/softlist_10.db b/softlist_10.db
new file mode 100755
index 0000000..406ea63
Binary files /dev/null and b/softlist_10.db differ
diff --git a/softlist_11.db b/softlist_11.db
new file mode 100755
index 0000000..b0016bb
Binary files /dev/null and b/softlist_11.db differ
diff --git a/softlist_12.db b/softlist_12.db
new file mode 100755
index 0000000..f821090
Binary files /dev/null and b/softlist_12.db differ
diff --git a/softlist_13.db b/softlist_13.db
new file mode 100755
index 0000000..c331dc0
Binary files /dev/null and b/softlist_13.db differ
diff --git a/softlist_14.db b/softlist_14.db
new file mode 100755
index 0000000..8663741
Binary files /dev/null and b/softlist_14.db differ
diff --git a/softlist_15.db b/softlist_15.db
new file mode 100755
index 0000000..8d548af
Binary files /dev/null and b/softlist_15.db differ
diff --git a/softlist_16.db b/softlist_16.db
new file mode 100755
index 0000000..a3a0abd
Binary files /dev/null and b/softlist_16.db differ
diff --git a/softlist_17.db b/softlist_17.db
new file mode 100755
index 0000000..3a39e42
Binary files /dev/null and b/softlist_17.db differ
diff --git a/softlist_18.db b/softlist_18.db
new file mode 100755
index 0000000..8f2d6c2
Binary files /dev/null and b/softlist_18.db differ
diff --git a/softlist_19.db b/softlist_19.db
new file mode 100755
index 0000000..e7ca3e7
Binary files /dev/null and b/softlist_19.db differ
diff --git a/softlist_20.db b/softlist_20.db
new file mode 100755
index 0000000..51869da
Binary files /dev/null and b/softlist_20.db differ
diff --git a/softlist_21.db b/softlist_21.db
new file mode 100755
index 0000000..e997e8f
Binary files /dev/null and b/softlist_21.db differ
diff --git a/softlist_22.db b/softlist_22.db
new file mode 100755
index 0000000..254bff3
Binary files /dev/null and b/softlist_22.db differ
diff --git a/softlist_23.db b/softlist_23.db
new file mode 100755
index 0000000..d9d8c10
Binary files /dev/null and
b/softlist_23.db differ diff --git a/softlist_24.db b/softlist_24.db new file mode 100755 index 0000000..4eafe29 Binary files /dev/null and b/softlist_24.db differ diff --git a/softlist_25.db b/softlist_25.db new file mode 100755 index 0000000..ada8b69 Binary files /dev/null and b/softlist_25.db differ diff --git a/softlist_26.db b/softlist_26.db new file mode 100755 index 0000000..131aa80 Binary files /dev/null and b/softlist_26.db differ diff --git a/softlist_27.db b/softlist_27.db new file mode 100755 index 0000000..f6d97dd Binary files /dev/null and b/softlist_27.db differ diff --git a/softlist_28.db b/softlist_28.db new file mode 100755 index 0000000..751eaa3 Binary files /dev/null and b/softlist_28.db differ diff --git a/softlist_29.db b/softlist_29.db new file mode 100755 index 0000000..d732cde Binary files /dev/null and b/softlist_29.db differ diff --git a/softlist_30.db b/softlist_30.db new file mode 100755 index 0000000..a811951 Binary files /dev/null and b/softlist_30.db differ diff --git a/softlist_31.db b/softlist_31.db new file mode 100755 index 0000000..5d413c2 Binary files /dev/null and b/softlist_31.db differ diff --git a/softlist_32.db b/softlist_32.db new file mode 100755 index 0000000..18d0390 Binary files /dev/null and b/softlist_32.db differ diff --git a/softlist_33.db b/softlist_33.db new file mode 100755 index 0000000..51d43fa Binary files /dev/null and b/softlist_33.db differ diff --git a/softlist_34.db b/softlist_34.db new file mode 100755 index 0000000..94a419b Binary files /dev/null and b/softlist_34.db differ diff --git a/softlist_35.db b/softlist_35.db new file mode 100755 index 0000000..a0928e4 Binary files /dev/null and b/softlist_35.db differ diff --git a/softlist_36.db b/softlist_36.db new file mode 100755 index 0000000..0af5969 Binary files /dev/null and b/softlist_36.db differ diff --git a/softlist_37.db b/softlist_37.db new file mode 100755 index 0000000..a2cec29 Binary files /dev/null and b/softlist_37.db differ diff --git 
a/softlist_38.db b/softlist_38.db new file mode 100755 index 0000000..b2400a4 Binary files /dev/null and b/softlist_38.db differ diff --git a/softlist_39.db b/softlist_39.db new file mode 100755 index 0000000..fbd3b2d Binary files /dev/null and b/softlist_39.db differ diff --git a/softlist_40.db b/softlist_40.db new file mode 100755 index 0000000..4b6e3fc Binary files /dev/null and b/softlist_40.db differ diff --git a/softlist_41.db b/softlist_41.db new file mode 100755 index 0000000..1aef41b Binary files /dev/null and b/softlist_41.db differ diff --git a/softlist_42.db b/softlist_42.db new file mode 100755 index 0000000..2e95e4d Binary files /dev/null and b/softlist_42.db differ diff --git a/softlist_43.db b/softlist_43.db new file mode 100755 index 0000000..3e75dff Binary files /dev/null and b/softlist_43.db differ diff --git a/softlist_44.db b/softlist_44.db new file mode 100755 index 0000000..220e125 Binary files /dev/null and b/softlist_44.db differ diff --git a/softlist_45.db b/softlist_45.db new file mode 100755 index 0000000..a5423b3 Binary files /dev/null and b/softlist_45.db differ diff --git a/softlist_46.db b/softlist_46.db new file mode 100755 index 0000000..62c7676 Binary files /dev/null and b/softlist_46.db differ diff --git a/softlist_47.db b/softlist_47.db new file mode 100755 index 0000000..de8a27e Binary files /dev/null and b/softlist_47.db differ diff --git a/softlist_48.db b/softlist_48.db new file mode 100755 index 0000000..98ad0d4 Binary files /dev/null and b/softlist_48.db differ diff --git a/softlist_49.db b/softlist_49.db new file mode 100755 index 0000000..4fff2a3 Binary files /dev/null and b/softlist_49.db differ diff --git a/softlist_7.db b/softlist_7.db new file mode 100755 index 0000000..dca8ee8 Binary files /dev/null and b/softlist_7.db differ diff --git a/softlist_8.db b/softlist_8.db new file mode 100755 index 0000000..4728601 Binary files /dev/null and b/softlist_8.db differ diff --git a/softlist_9.db b/softlist_9.db new file mode 
100755 index 0000000..90f0050 Binary files /dev/null and b/softlist_9.db differ