// 1056337019 / 股吧抓取工具

// ==UserScript==
// @name         股吧抓取工具
// @namespace    http://tampermonkey.net/
// @version      1.0
// @description  根据输入的目标股票代码和统计天数抓取股吧帖子数据,并在页面上显示实时进度和结果
// @author       YourName
// @license      MIT
// @match        https://guba.eastmoney.com/*
// @grant        none
// ==/UserScript==

(function() {
    'use strict';

    // Build the floating control panel and attach it to the page body.
    const container = document.createElement('div');
    container.id = 'tm-crawler-container';
    // Fixed position in the top-right corner with a high z-index.
    Object.assign(container.style, {
        position: 'fixed',
        top: '10px',
        right: '10px',
        zIndex: '9999',
        background: 'white',
        border: '1px solid #ccc',
        padding: '10px',
        maxWidth: '300px',
        fontSize: '14px',
        boxShadow: '0 0 10px rgba(0,0,0,0.3)',
    });
    // Panel markup: two inputs, the start button, progress/result areas and
    // a close button that stays hidden until a crawl has finished.
    container.innerHTML = `
        <h3 style="margin-top:0;">股吧抓取工具</h3>
        <label>目标股票代码: <input type="text" id="targetStockInput" value="002131"></label><br>
        <label>统计天数: <input type="number" id="daysToCheckInput" value="5" min="1"></label><br>
        <button id="startButton">开始抓取</button>
        <div id="progress-display" style="margin-top:10px; white-space: pre-wrap; min-height: 40px;"></div>
        <div id="result-display" style="margin-top:10px;"></div>
        <button id="closeButton" style="display:none; margin-top:10px;">关闭</button>
    `;
    document.body.appendChild(container);

    // Default settings. targetStock and daysToCheck are only fallbacks: they
    // are overwritten from the panel inputs when a crawl starts.
    const MS_PER_SECOND = 1000;
    const config = {
        targetStock: '002131',
        daysToCheck: 5,
        retryCount: 3,
        // Lower and upper bounds of the random pause between requests.
        minDelay: 8 * MS_PER_SECOND,
        maxDelay: 15 * MS_PER_SECOND,
        // A single request is aborted after this long.
        requestTimeout: 20 * MS_PER_SECOND,
    };

    // Resolve the panel controls once so later code can reuse the references.
    const [
        progressDisplay,
        resultDisplay,
        closeButton,
        startButton,
        targetStockInput,
        daysToCheckInput,
    ] = [
        'progress-display',
        'result-display',
        'closeButton',
        'startButton',
        'targetStockInput',
        'daysToCheckInput',
    ].map(id => document.getElementById(id));

    /**
     * Pause for a random duration between config.minDelay and config.maxDelay
     * milliseconds, used to space out requests.
     * @returns {Promise<void>} resolves once the pause has elapsed.
     */
    const antiBlockDelay = () => {
        const jitter = Math.random() * (config.maxDelay - config.minDelay);
        return new Promise(resolve => {
            setTimeout(resolve, jitter + config.minDelay);
        });
    };

    /**
     * List the last `days` calendar days, ending with today, as "MM-DD"
     * strings ordered from the earliest day to the latest.
     * @param {number} days - how many days to include.
     * @returns {string[]} zero-padded month-day labels, oldest first.
     */
    function generateDateRange(days) {
        const twoDigits = value => String(value).padStart(2, '0');
        // Offsets in days counted back from today, largest (oldest) first.
        const offsets = Array.from({ length: days }, (_, index) => index).reverse();
        return offsets.map(daysBack => {
            const day = new Date();
            day.setDate(day.getDate() - daysBack);
            const month = twoDigits(day.getMonth() + 1);
            const dayOfMonth = twoDigits(day.getDate());
            return `${month}-${dayOfMonth}`;
        });
    }

    /**
     * Fetch a URL as text with a per-attempt timeout and a bounded number of
     * retries, pausing with antiBlockDelay() between attempts.
     *
     * @param {string} url - address to request.
     * @returns {Promise<string>} the response body.
     * @throws the error of the last failed attempt: a network error, an abort
     *         caused by the timeout, or `HTTP <status>` for a non-2xx answer.
     */
    async function enhancedFetch(url) {
        // Always make at least one attempt, even if retryCount is misconfigured.
        const maxAttempts = Math.max(1, Number(config.retryCount) || 1);
        for (let attempt = 1; attempt <= maxAttempts; attempt++) {
            // Every attempt needs its own controller and timer: an aborted
            // signal stays aborted, so sharing one across attempts would make
            // each retry after a timeout fail immediately.
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), config.requestTimeout);
            try {
                const response = await fetch(url, {
                    // NOTE(review): browsers may ignore or override these headers
                    // (Referer is a forbidden header name in the Fetch spec).
                    headers: {
                        'User-Agent': navigator.userAgent,
                        'Referer': location.href
                    },
                    credentials: 'include',
                    signal: controller.signal
                });
                if (!response.ok) throw new Error(`HTTP ${response.status}`);
                // Read the body while the timer is still armed so a stalled
                // download is aborted as well.
                return await response.text();
            } catch (err) {
                if (attempt === maxAttempts) throw err;
            } finally {
                // Runs on success and failure, so no timer outlives its attempt.
                clearTimeout(timeoutId);
            }
            await antiBlockDelay();
        }
    }

    /**
     * Turn the leading date token of a post's update cell into a Date.
     *
     * Accepts "MM-DD", "MMDD" and "YYYY-MM-DD". When no year is given, the
     * current year is used unless that would put the post more than one day
     * after `today`; then the previous year is used. The one-day allowance is
     * a guess at time-zone skew between the site and the browser — TODO confirm.
     *
     * @param {string} rawText - text content of the cell.
     * @param {Date} today - reference date for resolving the year.
     * @returns {Date|null} local midnight of the post's day, or null when the
     *          text does not contain a valid date.
     */
    function resolvePostDate(rawText, today) {
        const token = rawText.trim().split(/\s+/)[0];
        const dashed = /^(?:(\d{4})-)?(\d{1,2})-(\d{1,2})$/.exec(token);
        const compact = dashed ? null : /^(\d{2})(\d{2})$/.exec(token);
        if (!dashed && !compact) return null;

        const month = Number(dashed ? dashed[2] : compact[1]);
        const day = Number(dashed ? dashed[3] : compact[2]);
        // Reject values the Date constructor would silently roll over (e.g. 02-30).
        const build = year => {
            const candidate = new Date(year, month - 1, day);
            const isExact = candidate.getMonth() === month - 1 && candidate.getDate() === day;
            return isExact ? candidate : null;
        };

        if (dashed && dashed[1]) return build(Number(dashed[1]));

        const latestPlausible = new Date(today.getFullYear(), today.getMonth(), today.getDate() + 1);
        const inCurrentYear = build(today.getFullYear());
        if (inCurrentYear && inCurrentYear <= latestPlausible) return inCurrentYear;
        return build(today.getFullYear() - 1);
    }

    /** Format a Date as a zero-padded "MM-DD" label. */
    function formatMonthDay(date) {
        const pad = value => String(value).padStart(2, '0');
        return `${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
    }

    /**
     * Walk the stock's post list page by page and count posts per day for the
     * configured date range.
     *
     * Crawling stops when a page request fails, when a page yields no
     * parseable post date, or when the newest post on a page is older than
     * the first day of the range. Progress is written to progressDisplay on a
     * timer; stop and failure messages are appended to resultDisplay.
     *
     * Limitation: counts are keyed by "MM-DD", so ranges longer than a year
     * cannot be told apart.
     *
     * @returns {Promise<Object<string, number>>} map of "MM-DD" to post count,
     *          with one entry per day of the range.
     * @throws {Error} if the first day of the range cannot be parsed.
     */
    async function crawlPosts() {
        const dateList = generateDateRange(config.daysToCheck);
        const countMap = Object.fromEntries(dateList.map(d => [d, 0]));
        // Nothing to count for an empty range; also avoids reading dateList[0].
        if (dateList.length === 0) {
            return countMap;
        }
        const trackedDates = new Set(dateList);

        // The page shows no year, so every date is resolved against "today";
        // this keeps the cutoff correct when the range crosses New Year.
        const today = new Date();
        const cutoffDate = resolvePostDate(dateList[0], today);
        if (!cutoffDate) {
            throw new Error(`无法解析截止日期: ${dateList[0]}`);
        }

        let currentPage = 1;
        let latestDate = '';
        const startTime = Date.now();
        const dateRange = `${dateList[0]} 至 ${dateList[dateList.length - 1]}`;

        // Refresh the progress line every 1.5 seconds while the crawl runs.
        const progressUpdater = setInterval(() => {
            const elapsed = Math.floor((Date.now() - startTime) / 1000);
            const currentTime = new Date().toLocaleTimeString();
            progressDisplay.textContent = `[${currentTime}] 正在统计 ${dateRange} 第${currentPage}页 | 已运行: ${elapsed}s${latestDate ? ' | 最新日期: ' + latestDate : ''}`;
        }, 1500);

        try {
            while (true) {
                const pageUrl = `https://guba.eastmoney.com/list,${config.targetStock},f${currentPage > 1 ? '_' + currentPage : ''}.html`;
                let html;
                try {
                    html = await enhancedFetch(pageUrl);
                } catch (err) {
                    resultDisplay.innerHTML += `<p style="color:red;">❌ 第${currentPage}页抓取失败: ${err.message}</p>`;
                    break;
                }
                const doc = new DOMParser().parseFromString(html, 'text/html');

                // Count each post and remember the newest date seen on this page.
                let newestOnPage = null;
                doc.querySelectorAll('.update').forEach(el => {
                    const postDate = resolvePostDate(el.textContent, today);
                    // Skip cells without a valid date so they cannot poison
                    // the stop condition below.
                    if (!postDate) return;

                    const dateStr = formatMonthDay(postDate);
                    latestDate = dateStr;
                    if (trackedDates.has(dateStr)) {
                        countMap[dateStr]++;
                    }
                    if (!newestOnPage || postDate > newestOnPage) {
                        newestOnPage = postDate;
                    }
                });

                // No posts, or none with a usable date: nothing more to learn.
                if (!newestOnPage) {
                    break;
                }
                // Only the newest post is compared, so a page is skipped only
                // when every post on it is older than the range.
                if (newestOnPage < cutoffDate) {
                    resultDisplay.innerHTML += `<p>✅ 提前终止: ${newestOnPage.toLocaleDateString()} 早于截止日期 ${cutoffDate.toLocaleDateString()}</p>`;
                    break;
                }

                // Pause only when another page will actually be requested.
                currentPage++;
                await antiBlockDelay();
            }
        } catch (err) {
            resultDisplay.innerHTML += `<p style="color:red;">❌ 抓取过程中发生错误: ${err.message}</p>`;
        } finally {
            clearInterval(progressUpdater);
        }

        return countMap;
    }

    /**
     * Read the settings from the panel, run the crawl and render the per-day
     * post counts as a table in resultDisplay.
     *
     * Errors thrown by the crawl are shown in resultDisplay instead of being
     * rethrown. The close button is revealed when the run ends either way.
     *
     * @returns {Promise<void>}
     */
    async function startCrawling() {
        // A blank stock code keeps the previously configured one.
        config.targetStock = targetStockInput.value.trim() || config.targetStock;

        // Accept only a day count of at least 1. Zero, negative or
        // non-numeric input would give an empty date range, so the previous
        // setting is kept instead.
        const requestedDays = parseInt(daysToCheckInput.value, 10);
        if (requestedDays >= 1) {
            config.daysToCheck = requestedDays;
        }
        // Show the value that is actually used for this run.
        daysToCheckInput.value = String(config.daysToCheck);

        // Clear the output of any previous run.
        progressDisplay.textContent = '开始抓取...';
        resultDisplay.innerHTML = '';

        try {
            const results = await crawlPosts();
            const rows = Object.entries(results)
                .map(([date, count]) => `<tr><td>${date}</td><td>${count}</td></tr>`)
                .join('');
            // Appended rather than assigned so messages written during the
            // crawl stay visible above the table.
            resultDisplay.innerHTML += '<table border="1" style="border-collapse: collapse; width: 100%;">'
                + '<tr><th>日期</th><th>帖子数量</th></tr>'
                + rows
                + '</table>';
        } catch (err) {
            resultDisplay.innerHTML += `<p style="color:red;">❌ 错误: ${err.message}</p>`;
        } finally {
            closeButton.style.display = 'block';
        }
    }

    // Start a crawl on click. The button is disabled while the crawl runs so
    // a second run cannot be started in parallel, and re-enabled afterwards
    // whether the crawl succeeded or not.
    startButton.addEventListener('click', async () => {
        startButton.disabled = true;
        try {
            await startCrawling();
        } finally {
            startButton.disabled = false;
        }
    });

    // Hide the whole panel when the close button is clicked.
    closeButton.addEventListener('click', function hidePanel() {
        container.style.display = 'none';
    });
})();