diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000..905b0b0b8914d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,35 @@ +.git/ +.gitignore +*.md +README.md + +output/ + +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +.pytest_cache/ + +.vscode/ +.idea/ +*.swp +*.swo +*~ + +.DS_Store +Thumbs.db + +docker/.env + +_image/ + +.github/ + +*.log +.env.local +.env.*.local +version +index.html \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/01-bug-report.yml b/.github/ISSUE_TEMPLATE/01-bug-report.yml new file mode 100644 index 0000000000000..f028116c45484 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01-bug-report.yml @@ -0,0 +1,208 @@ +# yaml-language-server: $schema=https://json.schemastore.org/github-issue-forms.json + +name: 🐛 遇到问题了 +description: 程序运行不正常或出现错误 +title: "[问题] " +labels: ["bug"] +assignees: + - sansan0 +body: + - type: markdown + attributes: + value: | + **详细清楚的问题描述能帮助项目作者更快理解和解决你遇到的困扰**。强烈建议上传截图,图文并茂会让问题更容易被理解和定位。 + + --- + + ### 📋 提交问题前,请先检查以下事项 + + #### 1️⃣ **建议先查看文档** 📖 + 大部分常见问题在文档中都有详细说明,建议先查看相关章节: + - [📝 配置教程](https://github.com/sansan0/TrendRadar#-快速开始) + - [❓ 常见问题](https://github.com/sansan0/TrendRadar#问题答疑与1元点赞) + - [🐳 Docker部署](https://github.com/sansan0/TrendRadar#-docker-部署) + + #### 2️⃣ **测试推送功能的注意事项** ⚠️ + 测试消息推送时,请确保以下配置正确: + + **必须检查的配置项:** + - ✅ `report.mode` 设置为 `daily` 或 `current`(请勿使用 `incremental`,该模式仅在有新内容时才推送) + - ✅ `notification.push_window.enabled` 设置为 `false`(关闭推送时间窗口控制) + - ✅ `notification.enable_notification` 设置为 `true`(确保通知功能已启用) + + **说明:** + - 推送时间窗口控制(`push_window`)是可选功能,如果开启会限制推送时间范围 + - 测试时建议关闭此功能,避免因不在推送时间窗口而收不到消息 + + #### 3️⃣ **检查配置细节** 🔍 + 部分问题可能是配置细节导致,建议检查: + - 配置文件的缩进格式是否正确(YAML 格式要求严格,必须使用空格而非 Tab) + - Webhook 地址是否完整复制(注意不要有多余或缺失的字符) + - 环境变量是否正确设置 + - 文件路径是否正确 + + #### 4️⃣ **遇到困难时的建议** 💡 + - 如果尝试 30 分钟以上仍无进展,可以考虑换个思路 + - 建议重新从头阅读相关文档章节 + - 或尝试其他部署方式(如从 Docker 切换到 GitHub Actions) + + #### 5️⃣ **根据部署方式提供完整信息** 📦 + + **如果是 GitHub Actions 部署:** + 1. **必须提供** Actions 工作流链接(如:`https://github.com/你的用户名/TrendRadar/actions/workflows/crawler.yml`) + 2. **查看执行日志的步骤:** + - 打开你的仓库页面 + - 点击顶部的 **Actions** 标签 + - 点击左侧的 **Crawler** 工作流 + - 点击最近一次运行记录 + - 点击 **Run crawler** 查看详细日志 + - **截图完整的日志内容**(特别是错误部分) + 3. 提供 `config.yaml` 配置内容(隐藏敏感信息) + + **如果是 Docker 部署:** + 1. 提供项目目录结构截图(运行 `ls -la` 或打开文件管理器) + 2. 提供 Docker 容器日志(运行 `docker logs 容器名`) + 3. 提供容器状态(运行 `docker ps -a`) + 4. 提供 `.env` 文件内容(隐藏敏感信息) + + **如果是本地运行:** + 1. 提供完整的错误日志截图 + 2. 提供 `config.yaml` 配置内容 + 3. 
提供 Python 版本(运行 `python --version`) + + - type: dropdown + id: bug-category + attributes: + label: 🏷️ 遇到了什么问题 + options: + - 数据获取问题(获取不到新闻、请求失败等) + - 关键词筛选问题(关键词不生效、匹配异常等) + - 通知推送问题(收不到消息、推送失败等) + - 配置设置问题(配置文件错误、参数不生效等) + - 部署运行问题(Docker、GitHub Actions等) + - 性能问题(运行慢、卡顿等) + - 其他问题 + validations: + required: true + + - type: dropdown + id: environment + attributes: + label: 🖥️ 使用环境 + options: + - 本地运行(直接在电脑上运行) + - Docker 容器运行 + - GitHub Actions 自动运行 + - 其他方式 + validations: + required: true + + - type: textarea + id: bug-description + attributes: + label: 📝 详细描述问题 + description: 请详细说明遇到的问题(建议配合截图说明) + placeholder: | + 请清楚地描述: + - 具体发生了什么问题 + - 问题出现时的情况 + - 这个问题影响了什么功能 + + 💡 提示:上传问题截图能提供更多信息。 + validations: + required: true + + - type: dropdown + id: system-info + attributes: + label: 💻 系统信息 + description: 你的电脑系统 + options: + - Windows 10 + - Windows 11 + - macOS + - Ubuntu/Linux + - 其他系统 + - 不确定 + validations: + required: false + + - type: textarea + id: reproduction-steps + attributes: + label: 🔄 怎么重现这个问题 + description: 如何让这个问题重新出现?(可选,但建议填写) + placeholder: | + 请按步骤描述(建议每个步骤都配截图): + 1. 我点击了... + 2. 然后设置了... + 3. 接着出现了... + + 💡 操作过程的截图特别有用! + validations: + required: false + + - type: textarea + id: expected-behavior + attributes: + label: ✅ 期望的正常情况 + description: 正常情况下应该是什么样的?(可选) + placeholder: | + 描述你期望看到的正常结果... + 如果有参考图片就更好了! + validations: + required: false + + - type: textarea + id: error-logs + attributes: + label: 📋 错误信息 + description: 程序显示的错误信息或日志(如果有的话) + placeholder: | + 如果程序显示了错误信息,请完整复制到这里: + + ``` + 错误信息内容... + ``` + + validations: + required: false + + - type: textarea + id: config-info + attributes: + label: ⚙️ 相关配置 + description: 与问题相关的配置内容(请隐藏敏感信息如 webhook 地址) + placeholder: | + 相关的配置内容(记得隐藏敏感信息): + + ```yaml + notification: + enable_notification: true + webhooks: + feishu_url: "***隐藏***" + ``` + + - type: textarea + id: screenshots + attributes: + label: 📷 截图补充 + description: 上传相关截图(强烈推荐!) + placeholder: | + 请拖拽截图到这里,建议包含: + - 错误界面截图 + - 配置设置截图 + - 操作步骤截图 + + 💡 截图是最直观的问题说明方式! + + - type: textarea + id: additional-context + attributes: + label: 📎 其他补充信息 + description: 其他可能有用的信息 + placeholder: | + - 网络环境特殊情况 + - 之前是否正常工作过 + - 最近有没有改动什么设置 + - 其他你觉得可能相关的信息 diff --git a/.github/ISSUE_TEMPLATE/02-feature-request.yml b/.github/ISSUE_TEMPLATE/02-feature-request.yml new file mode 100644 index 0000000000000..227625c8da8cc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02-feature-request.yml @@ -0,0 +1,98 @@ +# yaml-language-server: $schema=https://json.schemastore.org/github-issue-forms.json + +name: 💡 我有个想法 +description: 建议新功能或改进现有功能 +title: "[建议] " +labels: ["enhancement"] +assignees: + - sansan0 +body: + - type: markdown + attributes: + value: | + ### 💝 温馨提醒 + + 感谢你的创意想法!如果这个项目对你有帮助,欢迎给项目点个 ⭐ **Star**! + + 好的建议让项目变得更加实用。**欢迎用截图或示例**来展示你的想法! + + - type: dropdown + id: feature-category + attributes: + label: 🏷️ 建议类别 + options: + - 数据源相关(新增平台、改进抓取等) + - 分析功能相关(算法改进、筛选优化等) + - 通知方式相关(新增通知渠道、消息格式等) + - 配置管理相关(设置优化、界面改进等) + - 部署运维相关(安装简化、监控告警等) + - 数据展示相关(报告格式、图表可视化等) + - 性能优化相关 + - 用户体验改进 + - 其他想法 + validations: + required: true + + - type: textarea + id: feature-description + attributes: + label: 💭 详细描述你的想法 + description: 请详细描述你希望添加的功能 + placeholder: | + 请详细描述: + - 你希望增加什么功能 + - 这个功能应该怎么使用 + - 使用后能达到什么效果 + + 💡 提示:如果有类似功能的截图作为参考就更好了! + validations: + required: true + + - type: textarea + id: use-case + attributes: + label: 🎯 什么时候会用到这个功能 + description: 这个功能在什么场景下使用? 
+ placeholder: | + 例如: + - 当我需要...的时候 + - 在...情况下会很方便 + - 可以解决...问题 + - 能够帮助...用户 + validations: + required: true + + - type: textarea + id: implementation-ideas + attributes: + label: 🛠️ 实现想法(可选) + description: 如果你有实现思路,欢迎分享 + placeholder: | + - 功能界面应该怎么设计 + - 配置应该怎么设置 + - 参考哪些类似的工具或网站 + - 其他实现建议 + + - type: textarea + id: mockups-examples + attributes: + label: 📷 功能示意图(推荐) + description: 上传功能示意图、参考截图或手绘草图 + placeholder: | + 请上传: + - 功能界面的设计图(手绘也可以) + - 类似功能的参考截图 + - 使用流程的示意图 + + 💡 可视化的说明最容易理解! + + - type: textarea + id: additional-context + attributes: + label: 📎 其他补充说明 + description: 其他想要补充的内容 + placeholder: | + - 相关的参考资料链接 + - 类似功能的其他工具 + - 更多使用场景说明 + - 其他相关想法 diff --git a/.github/ISSUE_TEMPLATE/03-config-help.yml b/.github/ISSUE_TEMPLATE/03-config-help.yml new file mode 100644 index 0000000000000..bdaff53349e60 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03-config-help.yml @@ -0,0 +1,197 @@ +# yaml-language-server: $schema=https://json.schemastore.org/github-issue-forms.json + +name: ⚙️ 设置遇到困难 +description: 配置相关的问题或需要帮助 +title: "[设置] " +labels: ["配置", "帮助"] +assignees: + - sansan0 +body: + - type: markdown + attributes: + value: | + 遇到设置问题时,**请尽可能详细描述你的问题**,并上传配置文件和错误信息的截图,这样能帮助更快定位和解决问题。 + + 建议先查看项目说明文档,大部分常见问题都有详细说明。 + + --- + + ### 📋 配置问题自查清单(提问前建议阅读) + + #### 1️⃣ **优先查看文档** 📚 + 绝大部分配置问题在文档中都有详细说明,建议先查看相关章节: + - [🚀 快速开始](https://github.com/sansan0/TrendRadar#-快速开始) + - [📝 frequency_words.txt 配置](https://github.com/sansan0/TrendRadar#frequencywordstxt-配置教程) + - [🐳 Docker 部署指南](https://github.com/sansan0/TrendRadar#-docker-部署) + - [🤖 AI 分析配置](https://github.com/sansan0/TrendRadar#-ai-智能分析部署) + + #### 2️⃣ **测试推送的常见误区** ⚠️ + 测试消息推送时,请检查以下配置: + + **必须检查的配置项:** + - ❌ **错误**:`report.mode` 设置为 `incremental`(增量模式仅在有新内容时推送) + - ✅ **正确**:`report.mode` 设置为 `daily` 或 `current` + - ❌ **错误**:`notification.push_window.enabled` 设置为 `true`(推送时间窗口会限制推送时间) + - ✅ **正确**:`notification.push_window.enabled` 设置为 `false`(测试时建议关闭) + + **说明:** + - 推送时间窗口控制是可选功能,开启后只在指定时间范围内推送 + - 如果当前时间不在设定的窗口范围内,将不会收到推送消息 + - 测试时建议先关闭此功能 + + #### 3️⃣ **YAML 格式很严格** 📏 + 配置文件最常见的 3 个错误: + ```yaml + # ❌ 错误示例 1:缩进不对(必须用空格,不能用Tab) + notification: + enable_notification: true # ← 错误:缺少缩进 + + # ❌ 错误示例 2:冒号后面没有空格 + enable_notification:true # ← 错误:冒号后需要空格 + + # ✅ 正确示例: + notification: + enable_notification: true # ← 正确:2空格缩进 + 冒号后有空格 + ``` + + #### 4️⃣ **根据部署方式准备信息** 📦 + + **如果你是 GitHub Actions 部署:** + 1. **必须提供** Actions 工作流链接(格式:`https://github.com/你的用户名/TrendRadar/actions/workflows/crawler.yml`) + 2. **如何查看并截图执行日志:** + ``` + 步骤 1:打开你的仓库,点击顶部 "Actions" 标签 + 步骤 2:点击左侧 "Crawler" 工作流 + 步骤 3:点击最近一次运行记录(最上面的那个) + 步骤 4:点击展开 "Run crawler" 步骤 + 步骤 5:截图完整的日志内容(特别是红色错误部分) + ``` + 3. 提供 `config.yaml` 配置内容(记得隐藏 webhook 地址) + + **如果你是 Docker 部署:** + 1. 提供项目目录结构截图(运行 `ls -la` 或打开文件管理器) + 2. 提供 Docker 日志(运行 `docker logs 容器名`) + 3. 提供容器状态(运行 `docker ps -a`) + 4. 提供 `.env` 文件内容(隐藏敏感信息) + + **如果你是本地运行:** + 1. 提供完整的错误信息截图 + 2. 提供 `config.yaml` 配置内容 + 3. 提供 Python 版本(运行 `python --version`) + + #### 5️⃣ **遇到困难时的建议** 🤔 + - 如果尝试 30 分钟以上仍无进展,建议考虑换个思路 + - 可以尝试: + 1. 重新从头阅读相关文档章节 + 2. 尝试其他部署方式(如从 Docker 切换到 GitHub Actions) + 3. 
对比文档示例,检查差异之处 + + #### 6️⃣ **提问时请尽量提供以下信息** 📋 + 为了更快地帮你定位问题,建议提供: + - ✅ 配置文件内容(请隐藏 webhook 等敏感信息) + - ✅ 完整的错误日志截图 + - ✅ 部署方式(本地运行/Docker/GitHub Actions) + - ✅ 已经尝试过的解决方法 + - ✅ 具体的问题现象(请避免只说"不生效"或"没反应",尽量描述具体表现) + + - type: dropdown + id: config-type + attributes: + label: 🏷️ 配置问题类别 + options: + - 基础配置问题(config.yaml 设置) + - 通知配置问题(webhook、消息推送等) + - 部署配置问题(Docker、GitHub Actions等) + - 关键词配置问题(frequency_words.txt 设置) + - 环境配置问题(Python、依赖包等) + - 其他配置问题 + validations: + required: true + + - type: dropdown + id: environment + attributes: + label: 🖥️ 使用环境 + options: + - 本地运行(直接在电脑上运行) + - Docker 容器运行 + - GitHub Actions 自动运行 + - 其他方式 + validations: + required: true + + - type: textarea + id: problem-description + attributes: + label: 📝 详细描述问题 + description: 请详细描述你遇到的设置问题 + placeholder: | + 请详细描述: + - 遇到的具体问题是什么 + - 你希望达到什么效果 + - 已经尝试了哪些方法 + - 参考了哪些文档或教程 + + 💡 问题截图能提供更多信息! + validations: + required: true + + - type: textarea + id: config-content + attributes: + label: 📄 配置内容 + description: 请提供相关的配置内容(记得隐藏敏感信息如 webhook 地址) + placeholder: | + 请贴出相关的配置内容(记得隐藏 webhook 地址等敏感信息): + + ```yaml + notification: + enable_notification: true + webhooks: + feishu_url: "***隐藏***" + dingtalk_url: "***隐藏***" + ``` + + 💡 配置文件截图也很有用! + validations: + required: false + + - type: textarea + id: error-messages + attributes: + label: ❌ 错误信息(如果有的话) + description: 如果程序显示了错误信息,请贴出来 + placeholder: | + 如果有错误信息,请完整复制到这里: + + ``` + 错误信息内容... + ``` + + 💡 错误信息的截图也很重要! + + - type: textarea + id: screenshots + attributes: + label: 📷 相关截图(强烈推荐) + description: 上传配置界面、错误信息等截图 + placeholder: | + 请上传相关截图,特别是: + - 配置文件内容截图 + - 错误信息截图 + - 操作界面截图 + - 期望效果的参考图 + + 💡 截图是最直观的问题展示方式! + + - type: textarea + id: additional-info + attributes: + label: 📎 其他补充信息 + description: 其他可能有用的信息 + placeholder: | + - 操作系统版本(如 Windows 11、macOS) + - Python 版本信息 + - 网络环境特殊情况 + - 具体使用场景说明 + - 其他你觉得相关的信息 diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000..0443cb2b73e30 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,3 @@ +# yaml-language-server: $schema=https://json.schemastore.org/github-issue-config.json + +blank_issues_enabled: false \ No newline at end of file diff --git a/.github/workflows/crawler.yml b/.github/workflows/crawler.yml index e44e1cfdb4e5e..bec1508ce6161 100644 --- a/.github/workflows/crawler.yml +++ b/.github/workflows/crawler.yml @@ -2,7 +2,8 @@ name: Hot News Crawler on: schedule: - - cron: '*/50 * * * *' # 每50分钟运行一次 + # 我们使用的是 github 官方提供的资源来进行的推送,而每个账号的资源是限额的,为了不被官方判定为滥用而面临封号的风险,不建议比半小时更低 + - cron: "0 * * * *" # 每小时整点运行一次(实际有偏差) 或者 "*/30 * * * *" (每半小时执行一次) 或者 "*/30 0-14 * * *"(每天早上 8 点到晚上 10 点期间,每半小时运行一次) workflow_dispatch: # 添加权限设置 @@ -12,37 +13,60 @@ permissions: jobs: crawl: runs-on: ubuntu-latest - + steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install requests pytz - - - name: Create frequency_words.txt if not exists - run: | - if [ ! 
-f frequency_words.txt ]; then - echo "Creating empty frequency_words.txt file" - touch frequency_words.txt - fi - - - name: Run crawler - env: - FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }} - GITHUB_ACTIONS: true - run: python main.py - - - name: Commit and push if changes - run: | - git config --global user.name 'GitHub Actions' - git config --global user.email 'actions@github.com' - git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Auto update by GitHub Actions at $(TZ=Asia/Shanghai date)" && git push) + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Verify required files + run: | + echo "🔍 检查必需的配置文件..." + + if [ ! -f config/config.yaml ]; then + echo "❌ 错误: config/config.yaml 文件不存在" + echo "请参考项目文档创建配置文件" + exit 1 + fi + + if [ ! -f config/frequency_words.txt ]; then + echo "❌ 错误: config/frequency_words.txt 文件不存在" + echo "请参考项目文档创建频率词配置文件" + exit 1 + fi + + echo "✅ 配置文件检查通过" + + - name: Run crawler + env: + FEISHU_WEBHOOK_URL: ${{ secrets.FEISHU_WEBHOOK_URL }} + TELEGRAM_BOT_TOKEN: ${{ secrets.TELEGRAM_BOT_TOKEN }} + TELEGRAM_CHAT_ID: ${{ secrets.TELEGRAM_CHAT_ID }} + DINGTALK_WEBHOOK_URL: ${{ secrets.DINGTALK_WEBHOOK_URL }} + WEWORK_WEBHOOK_URL: ${{ secrets.WEWORK_WEBHOOK_URL }} + EMAIL_FROM: ${{ secrets.EMAIL_FROM }} + EMAIL_PASSWORD: ${{ secrets.EMAIL_PASSWORD }} + EMAIL_TO: ${{ secrets.EMAIL_TO }} + EMAIL_SMTP_SERVER: ${{ secrets.EMAIL_SMTP_SERVER }} + EMAIL_SMTP_PORT: ${{ secrets.EMAIL_SMTP_PORT }} + NTFY_TOPIC: ${{ secrets.NTFY_TOPIC }} + NTFY_SERVER_URL: ${{ secrets.NTFY_SERVER_URL }} + NTFY_TOKEN: ${{ secrets.NTFY_TOKEN }} + GITHUB_ACTIONS: true + run: python main.py + + - name: Commit and push if changes + run: | + git config --global user.name 'GitHub Actions' + git config --global user.email 'actions@github.com' + git add -A + git diff --quiet && git diff --staged --quiet || (git commit -m "Auto update by GitHub Actions at $(TZ=Asia/Shanghai date)" && git push) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000000000..3ba3e8d5a7f78 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,59 @@ +name: Build and Push Multi-Arch Docker Images + +on: + push: + tags: ["v*"] + workflow_dispatch: + +env: + REGISTRY: docker.io + IMAGE_NAME: wantcat/trendradar + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push + uses: docker/build-push-action@v5 + env: + BUILDKIT_PROGRESS: plain + with: + context: . 
+          file: ./docker/Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          build-args: |
+            BUILDKIT_INLINE_CACHE=1
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000..f288702d2fa16
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README-Cherry-Studio.md b/README-Cherry-Studio.md
new file mode 100644
index 0000000000000..d272716880dd1
--- /dev/null
+++ b/README-Cherry-Studio.md
@@ -0,0 +1,154 @@
+# TrendRadar × Cherry Studio 部署指南 🍒
+
+> **适合人群**:零编程基础的用户
+> **客户端**:Cherry Studio(免费开源 GUI 客户端)
+
+---
+
+## 📥 第一步:下载 Cherry Studio
+
+### Windows 用户
+
+访问官网下载:https://cherry-ai.com/
+或直接下载:[Cherry-Studio-Windows.exe](https://github.com/kangfenmao/cherry-studio/releases/latest)
+
+### Mac 用户
+
+访问官网下载:https://cherry-ai.com/
+或直接下载:[Cherry-Studio-Mac.dmg](https://github.com/kangfenmao/cherry-studio/releases/latest)
+
+
+---
+
+## 📦 第二步:获取项目代码
+
+为什么需要获取项目代码?
+
+AI 分析功能需要读取项目中的新闻数据才能工作。无论你使用 GitHub Actions 还是 Docker 部署,爬虫生成的新闻数据都保存在项目的 output 目录中。因此,在配置 MCP 服务器之前,需要先获取完整的项目代码(包含数据文件)。
+
+根据你的技术水平,可以选择以下任一方式获取:
+
+### 方法一:Git Clone(推荐给技术用户)
+
+如果你熟悉 Git,可以使用以下命令克隆项目:
+
+```bash
+git clone https://github.com/你的用户名/你的项目名.git
+cd 你的项目名
+```
+
+**优点**:
+
+- 随时只需一条命令(`git pull`)就可以把最新数据更新到本地
+
+### 方法二:直接下载 ZIP 压缩包(推荐给初学者)
+
+
+1. **访问 GitHub 项目页面**
+
+   - 项目链接:`https://github.com/你的用户名/你的项目名`
+
+2. 
**下载压缩包** + + - 点击绿色的 "Code" 按钮 + - 选择 "Download ZIP" + - 或直接访问:`https://github.com/你的用户名/你的项目名/archive/refs/heads/master.zip` + + +**注意事项**: + +- 步骤稍微麻烦,后续更新数据需要重复上面步骤,然后覆盖本地数据(output 目录) + +--- + +## 🚀 第三步:一键部署 MCP 服务器 + +### Windows 用户 + +1. **双击运行**项目文件夹中的 `setup-windows.bat` +2. **等待安装完成** +3. **记录显示的配置信息**(命令路径和参数) + +### Mac 用户 + +1. **打开终端**(在启动台搜索"终端") +2. **拖拽**项目文件夹中的 `setup-mac.sh` 到终端窗口 +3. **按回车键** +4. **记录显示的配置信息** + +--- + +## 🔧 第四步:配置 Cherry Studio + +### 1. 打开设置 + +启动 Cherry Studio,点击右上角 ⚙️ **设置** 按钮 + +### 2. 添加 MCP 服务器 + +在设置页面找到:**MCP** → 点击 **添加** + +### 3. 填写配置(重要!) + +根据刚才的安装脚本显示的信息填写 + +### 4. 保存并启用 + +- 点击 **保存** 按钮 +- 确保 MCP 服务器列表中的开关是 **开启** 状态 ✅ + +--- + +## ✅ 第五步:验证是否成功 + +### 1. 测试连接 + +在 Cherry Studio 的对话框中输入: + +``` +帮我爬取最新的新闻 +``` + +或者尝试其他测试命令: + +``` +搜索最近3天关于"人工智能"的新闻 +查找2025年1月的"特斯拉"相关报道 +分析"iPhone"的热度趋势 +``` + +**提示**:当你说"最近3天"时,AI会自动计算日期范围并搜索。 + +### 2. 成功标志 + +如果配置成功,AI 会: + +- ✅ 调用 TrendRadar 工具 +- ✅ 返回真实的新闻数据 +- ✅ 显示平台、标题、排名等信息 + + +--- + +## 🎯 进阶配置 + +### HTTP 模式(可选) + +如果需要远程访问或多客户端共享,可以使用 HTTP 模式: + +#### Windows + +双击运行 `start-http.bat` + +#### Mac + +```bash +./start-http.sh +``` + +然后在 Cherry Studio 中配置: + +``` +类型: streamableHttp +URL: http://localhost:3333/mcp +``` diff --git a/README-MCP-FAQ.md b/README-MCP-FAQ.md new file mode 100644 index 0000000000000..b1005925a68ca --- /dev/null +++ b/README-MCP-FAQ.md @@ -0,0 +1,471 @@ +# TrendRadar MCP 工具使用问答 + +> AI 提问指南 - 如何通过对话使用新闻热点分析工具 + +## ⚙️ 默认设置说明(重要!) + +默认采用以下优化策略,主要是为了节约 AI token 消耗: + +| 默认设置 | 说明 | 如何调整 | +| -------------- | --------------------------------------- | ------------------------------------- | +| **限制条数** | 默认返回 50 条新闻 | 对话中说"返回前 10 条"或"给我 100 条" | +| **时间范围** | 默认查询今天的数据 | 说"查询昨天"、"最近一周"或"1月1日到7日" | +| **URL 链接** | 默认不返回链接(节省约 160 tokens/条) | 说"需要链接"或"包含 URL" | +| **关键词列表** | 默认不使用 frequency_words.txt 过滤新闻 | 只有调用"趋势话题"工具时才使用 | + +**⚠️ 重要:** AI 模型的选择直接影响工具调用效果,AI 越智能,调用越准确。当你解除上面的限制,比如从今天的查询,放宽到一周的查询,首先你要在本地有一周的数据,其次,token 消耗量可能会倍增(为什么说可能,比如我查询 分析'苹果'最近一周的热度趋势,如果一周中没多少苹果的新闻,那么 token消耗量可能反而很少) + +**💡 提示:** 当你说"最近7天"时,AI会自动计算对应的日期范围(如 2025-10-18 至 2025-10-25)并传递给工具。 + + +## 💰 AI 模型 + +下面我以 **[硅基流动](https://cloud.siliconflow.cn)** 平台作为例子,里面有很多大模型可选择。在开发和测试本项目的过程中,我使用本平台进行了许多的功能测试和验证。 + +### 📊 注册方式对比 + +| 注册方式 | 无邀请链接直接注册 | 含有邀请链接注册 | +|:-------:|:-------:|:-----------------:| +| 注册链接 | [siliconflow.cn](https://cloud.siliconflow.cn) | [邀请链接](https://cloud.siliconflow.cn/i/fqnyVaIU) | +| 免费额度 | 0 tokens | **2000万 tokens** (≈14元) | +| 额外福利 | ❌ | ✅ 邀请者也获得2000万tokens | + +> 💡 **提示**:上面的赠送额度,应该可以询问 **200次以上** + + +### 🚀 快速开始 + +#### 1️⃣ 注册并获取 API 密钥 + +1. 使用上方链接完成注册 +2. 访问 [API 密钥管理页面](https://cloud.siliconflow.cn/me/account/ak) +3. 点击「新建 API 密钥」 +4. 复制生成的密钥(请妥善保管) + +#### 2️⃣ 在 Cherry Studio 中配置 + +1. 打开 **Cherry Studio** +2. 进入「模型服务」设置 +3. 找到「硅基流动」 +4. 将复制的密钥粘贴到 **[API密钥]** 输入框 +5. 确保右上角勾选框打开后显示为 **绿色** ✅ + +--- + +### ✨ 配置完成! + +现在你可以开始使用本项目,享受稳定快速的 AI 服务了! + +在你测试一次询问后,请立刻去 [硅基流动账单](https://cloud.siliconflow.cn/me/bills) 查询这一次的消耗量,心底有个估算。 + + +## 基础查询 + +### Q1: 如何查看最新的新闻? + +**你可以这样问:** + +- "给我看看最新的新闻" +- "查询今天的热点新闻" +- "获取知乎和微博的最新 10 条新闻" +- "查看最新新闻,需要包含链接" + +**调用的工具:** `get_latest_news` + +**工具返回行为:** + +- MCP 工具会返回所有平台的最新 50 条新闻给 AI +- 不包含 URL 链接(节省 token) + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会自动总结**,只展示部分新闻(如 TOP 10-20 条) +- ✅ 如果你想看全部 50 条,需要明确要求:"展示所有新闻"或"完整列出所有 50 条" +- 💡 这是 AI 模型的自然行为,不是工具的限制 + +**可以调整:** + +- 指定平台:如"只看知乎的" +- 调整数量:如"返回前 20 条" +- 包含链接:如"需要链接" +- **要求完整展示**:如"展示全部,不要总结" + +--- + +### Q2: 如何查询特定日期的新闻? 
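> 💡 在看具体问法之前,先用一段假设性的参数示意说明 AI 大致会把"昨天"这类相对日期换算成什么样的参数再传给工具(字段名只是便于理解的推测写法,并非工具的权威接口定义):
+
+```yaml
+# 假设性的调用参数示意:问"查询昨天知乎的新闻"时,AI 可能传入类似参数
+tool: get_news_by_date
+arguments:
+  date: "2025-10-24"    # "昨天"由 AI 换算为具体日期(示例值)
+  platforms: ["zhihu"]  # 可选:只查询部分平台(字段名为推测)
+  limit: 50             # 默认返回 50 条
+  include_url: false    # 默认不返回链接,以节省 token
+```
+
+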
+ +**你可以这样问:** + +- "查询昨天的新闻" +- "看看 3 天前知乎的新闻" +- "2025-10-10 的新闻有哪些" +- "上周一的新闻" +- "给我看看最新新闻"(自动查询今天) + +**调用的工具:** `get_news_by_date` + +**支持的日期格式:** + +- 相对日期:今天、昨天、前天、3 天前 +- 星期:上周一、本周三、last monday +- 绝对日期:2025-10-10、10 月 10 日 + +**工具返回行为:** + +- 不指定日期时自动查询今天(节省 token) +- MCP 工具会返回所有平台的 50 条新闻给 AI +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会自动总结**,只展示部分新闻(如 TOP 10-20 条) +- ✅ 如果你想看全部,需要明确要求:"展示所有新闻,不要总结" + +--- + +### Q3: 如何查看我关注的话题频率统计? + +**你可以这样问:** + +- "我关注的词今天出现了多少次" +- "看看我的关注词列表中哪些词最热门" +- "统计一下 frequency_words.txt 中的关注词频率" + +**调用的工具:** `get_trending_topics` + +**重要说明:** + +- 本工具**不是**自动提取新闻热点 +- 而是统计你在 `config/frequency_words.txt` 中设置的**个人关注词** +- 这是一个**可自定义**的列表,你可以根据兴趣添加关注词 + +--- + +## 搜索检索 + +### Q4: 如何搜索包含特定关键词的新闻? + +**你可以这样问:** + +- "搜索包含'人工智能'的新闻" +- "查找关于'特斯拉降价'的报道" +- "搜索马斯克相关的新闻,返回前 20 条" +- "查找最近7天关于'iPhone 16'的新闻" +- "查找2025年1月1日到7日'特斯拉'的相关新闻" +- "查找'iPhone 16 发布'这条新闻的链接" + +**调用的工具:** `search_news` + +**工具返回行为:** + +- 使用关键词模式搜索 +- 默认搜索今天的数据 +- AI会自动将"最近7天"、"上周"等相对时间转换为具体日期范围 +- MCP 工具会返回最多 50 条结果给 AI +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会自动总结**,只展示部分搜索结果 +- ✅ 如果你想看全部,需要明确要求:"展示所有搜索结果" + +**可以调整:** + +- 指定时间范围: + - 相对方式:"搜索最近一周的"(AI 自动计算日期) + - 绝对日期:"搜索2025年1月1日到7日的" +- 指定平台:如"只搜索知乎" +- 调整排序:如"按权重排序" +- 包含链接:如"需要链接" + +**示例对话:** + +``` +用户:搜索最近7天关于"人工智能突破"的新闻 +AI:(自动计算:date_range={"start": "2025-10-18", "end": "2025-10-25"}) + +用户:查找2025年1月的"特斯拉"报道 +AI:(date_range={"start": "2025-01-01", "end": "2025-01-31"}) +``` + +--- + +### Q5: 如何查找历史相关新闻? + +**你可以这样问:** + +- "查找昨天与'人工智能突破'相关的新闻" +- "搜索上周关于'特斯拉'的历史报道" +- "找出上个月与'ChatGPT'相关的新闻" +- "看看'iPhone 发布会'相关的历史新闻" + +**调用的工具:** `search_related_news_history` + +**工具返回行为:** + +- 搜索昨天的数据 +- 相似度阈值 0.4 +- MCP 工具会返回最多 50 条结果给 AI +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会自动总结**,只展示部分相关新闻 +- ✅ 如果你想看全部,需要明确要求:"展示所有相关新闻" + +--- + +## 趋势分析 + +### Q6: 如何分析话题的热度趋势? + +**你可以这样问:** + +- "分析'人工智能'最近一周的热度趋势" +- "看看'特斯拉'话题是昙花一现还是持续热点" +- "检测今天有哪些突然爆火的话题" +- "预测接下来可能的热点话题" +- "分析'比特币'在2024年12月的生命周期" + +**调用的工具:** `analyze_topic_trend` + +**工具返回行为:** + +- 支持多种分析模式:热度趋势、生命周期、异常检测、预测 +- AI会自动将"最近一周"等相对时间转换为具体日期范围 +- 默认分析最近7天数据 +- 按天粒度统计 + +**AI 展示行为:** + +- 通常会展示趋势分析结果和图表 +- AI 可能会总结关键发现 + +**示例对话:** + +``` +用户:分析'人工智能'最近一周的生命周期 +AI:(自动计算:date_range={"start": "2025-10-18", "end": "2025-10-25"}) + +用户:看看'比特币'在2024年12月是昙花一现还是持续热点 +AI:(date_range={"start": "2024-12-01", "end": "2024-12-31"}) +``` + +--- + +## 数据洞察 + +### Q7: 如何对比不同平台对话题的关注度? + +**你可以这样问:** + +- "对比各个平台对'人工智能'话题的关注度" +- "看看哪个平台更新最频繁" +- "分析一下哪些关键词经常一起出现" + +**调用的工具:** `analyze_data_insights` + +**三种洞察模式:** + +| 模式 | 功能 | 示例问法 | +| -------------- | ---------------- | -------------------------- | +| **平台对比** | 对比各平台关注度 | "对比各平台对'AI'的关注度" | +| **活跃度统计** | 统计平台发布频率 | "看看哪个平台更新最频繁" | +| **关键词共现** | 分析关键词关联 | "哪些关键词经常一起出现" | + +**工具返回行为:** + +- 平台对比模式 +- 分析今天的数据 +- 关键词共现最小频次 3 次 + +**AI 展示行为:** + +- 通常会展示分析结果和统计数据 +- AI 可能会总结洞察发现 + +--- + +## 情感分析 + +### Q8: 如何分析新闻的情感倾向? + +**你可以这样问:** + +- "分析一下今天新闻的情感倾向" +- "看看'特斯拉'相关新闻是正面还是负面的" +- "分析各平台对'人工智能'的情感态度" +- "看看'比特币'一周内的情感倾向,选择前 20 条最重要的" + +**调用的工具:** `analyze_sentiment` + +**工具返回行为:** + +- 分析今天的数据 +- MCP 工具会返回最多 50 条新闻给 AI +- 按权重排序(优先展示重要新闻) +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ 本工具返回 **AI 提示词**,不是直接的情感分析结果 +- AI 会根据提示词生成情感分析报告 +- 通常会展示情感分布、关键发现和代表性新闻 + +**可以调整:** + +- 指定话题:如"关于'特斯拉'" +- 指定时间:如"最近一周" +- 调整数量:如"返回前 20 条" + +--- + +### Q9: 如何查找相似的新闻报道? 
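> 💡 与 Q2 的示意类似,下面用一段假设性的 YAML 粗略展示相似新闻查找可能用到的参数(字段名为推测,非权威接口;默认值来自本节下文的说明):
+
+```yaml
+# 假设性的调用参数示意:问"找出和'特斯拉降价'相似的新闻"时
+tool: find_similar_news
+arguments:
+  title: "特斯拉降价"          # 作为相似度比对基准的新闻标题(字段名为推测)
+  similarity_threshold: 0.6   # 默认相似度阈值 0.6
+  limit: 50                   # 最多返回 50 条结果
+  include_url: false          # 默认不返回链接;说"需要链接"可放开
+```
+
+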
+ +**你可以这样问:** + +- "找出和'特斯拉降价'相似的新闻" +- "查找关于 iPhone 发布的类似报道" +- "看看有没有和这条新闻相似的报道" +- "找相似新闻,需要链接" + +**调用的工具:** `find_similar_news` + +**工具返回行为:** + +- 相似度阈值 0.6 +- MCP 工具会返回最多 50 条结果给 AI +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会自动总结**,只展示部分相似新闻 +- ✅ 如果你想看全部,需要明确要求:"展示所有相似新闻" + +--- + +### Q10: 如何生成每日或每周的热点摘要? + +**你可以这样问:** + +- "生成今天的新闻摘要报告" +- "给我一份本周的热点总结" +- "生成过去 7 天的新闻分析报告" + +**调用的工具:** `generate_summary_report` + +**报告类型:** + +- 每日摘要:总结当天的热点新闻 +- 每周摘要:总结一周的热点趋势 + +--- + +## 系统管理 + +### Q11: 如何查看系统配置? + +**你可以这样问:** + +- "查看当前系统配置" +- "显示配置文件内容" +- "有哪些可用的平台?" +- "当前的权重配置是什么?" + +**调用的工具:** `get_current_config` + +**可以查询:** + +- 可用平台列表 +- 爬虫配置(请求间隔、超时设置) +- 权重配置(排名权重、频次权重) +- 通知配置(钉钉、微信) + +--- + +### Q12: 如何检查系统运行状态? + +**你可以这样问:** + +- "检查系统状态" +- "系统运行正常吗?" +- "最后一次爬取是什么时候?" +- "有多少天的历史数据?" + +**调用的工具:** `get_system_status` + +**返回信息:** + +- 系统版本和状态 +- 最后爬取时间 +- 历史数据天数 +- 健康检查结果 + +--- + +### Q13: 如何手动触发爬取任务? + +**你可以这样问:** + +- "请你爬取当前的今日头条的新闻"(临时查询) +- "帮我抓取一下知乎和微博的最新新闻并保存"(持久化) +- "触发一次爬取并保存数据"(持久化) +- "获取 36 氪 的实时数据但不保存"(临时查询) + +**调用的工具:** `trigger_crawl` + +**两种模式:** + +| 模式 | 用途 | 示例 | +| -------------- | -------------------- | -------------------- | +| **临时爬取** | 只返回数据不保存 | "爬取今日头条的新闻" | +| **持久化爬取** | 保存到 output 文件夹 | "抓取并保存知乎新闻" | + +**工具返回行为:** + +- 临时爬取模式(不保存) +- 爬取所有平台 +- 不包含 URL 链接 + +**AI 展示行为(重要):** + +- ⚠️ **AI 通常会总结爬取结果**,只展示部分新闻 +- ✅ 如果你想看全部,需要明确要求:"展示所有爬取的新闻" + +**可以调整:** + +- 指定平台:如"只爬取知乎" +- 保存数据:说"并保存"或"保存到本地" +- 包含链接:说"需要链接" + +--- + +## 💡 使用技巧 + +### 1. 如何让 AI 展示全部数据而不是自动总结? + +**背景**: 有时 AI 会自动总结数据,只展示部分内容,即使工具返回了完整的 50 条数据。 + +**如果 AI 仍然总结,你可以**: + +- **方法 1 - 明确要求**: "请展示全部新闻,不要总结" +- **方法 2 - 指定数量**: "展示所有 50 条新闻" +- **方法 3 - 质疑行为**: "为什么只展示了 15 条?我要看全部" +- **方法 4 - 提前说明**: "查询今天的新闻,完整展示所有结果" + +**注意**: AI 仍可能根据上下文调整展示方式。 + + +### 2. 如何组合使用多个工具? + +**示例:深度分析某个话题** + +1. 先搜索:"搜索'人工智能'相关新闻" +2. 再分析趋势:"分析'人工智能'的热度趋势" +3. 最后情感分析:"分析'人工智能'新闻的情感倾向" + +**示例:追踪某个事件** + +1. 查看最新:"查询今天关于'iPhone'的新闻" +2. 查找历史:"查找上周与'iPhone'相关的历史新闻" +3. 
找相似报道:"找出和'iPhone 发布会'相似的新闻" diff --git a/_image/after.jpg b/_image/after.jpg new file mode 100644 index 0000000000000..d8e05b0ca9a9b Binary files /dev/null and b/_image/after.jpg differ diff --git a/_image/ai.png b/_image/ai.png new file mode 100644 index 0000000000000..f0c0c3a9d79e9 Binary files /dev/null and b/_image/ai.png differ diff --git a/_image/ai2.png b/_image/ai2.png new file mode 100644 index 0000000000000..620c07428a01d Binary files /dev/null and b/_image/ai2.png differ diff --git a/_image/banner.jpg b/_image/banner.jpg new file mode 100644 index 0000000000000..fec0bf8e5f68c Binary files /dev/null and b/_image/banner.jpg differ diff --git a/_image/before.jpg b/_image/before.jpg new file mode 100644 index 0000000000000..c71ebb304250d Binary files /dev/null and b/_image/before.jpg differ diff --git a/_image/feishu.jpg b/_image/feishu.jpg new file mode 100644 index 0000000000000..3c0c799108295 Binary files /dev/null and b/_image/feishu.jpg differ diff --git a/_image/github-pages.png b/_image/github-pages.png new file mode 100644 index 0000000000000..b52852b643a4d Binary files /dev/null and b/_image/github-pages.png differ diff --git a/image.png b/_image/image.png similarity index 100% rename from image.png rename to _image/image.png diff --git a/_image/next.jpg b/_image/next.jpg new file mode 100644 index 0000000000000..9ec2d7b1bc928 Binary files /dev/null and b/_image/next.jpg differ diff --git a/_image/secrets.png b/_image/secrets.png new file mode 100644 index 0000000000000..cb4b09521c64f Binary files /dev/null and b/_image/secrets.png differ diff --git a/_image/support.jpg b/_image/support.jpg new file mode 100644 index 0000000000000..6422d2cc78ca9 Binary files /dev/null and b/_image/support.jpg differ diff --git a/_image/weixin.png b/_image/weixin.png new file mode 100644 index 0000000000000..7c5588800ecea Binary files /dev/null and b/_image/weixin.png differ diff --git a/_image/wework.png b/_image/wework.png new file mode 100644 index 0000000000000..5528fc9445d7d Binary files /dev/null and b/_image/wework.png differ diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000000000..094a87d7d08de --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,107 @@ +app: + version_check_url: "https://raw.githubusercontent.com/sansan0/TrendRadar/refs/heads/master/version" + show_version_update: true # 控制显示版本更新提示,如果 false,则不接受新版本提示 + +crawler: + request_interval: 1000 # 请求间隔(毫秒) + enable_crawler: true # 是否启用爬取新闻功能,如果 false,则直接停止程序 + use_proxy: false # 是否启用代理,false 时为关闭 + default_proxy: "http://127.0.0.1:10086" + +# 🔸 daily(当日汇总模式) +# • 推送时机:按时推送(默认每小时推送一次) +# • 显示内容:当日所有匹配新闻 + 新增新闻区域 +# • 适用场景:日报总结、全面了解当日热点趋势 +# +# 🔸 current(当前榜单模式) +# • 推送时机:按时推送(默认每小时推送一次) +# • 显示内容:当前榜单匹配新闻 + 新增新闻区域 +# • 适用场景:实时热点追踪、了解当前最火的内容 +# +# 🔸 incremental(增量监控模式) +# • 推送时机:有新增才推送 +# • 显示内容:新出现的匹配频率词新闻 +# • 适用场景:避免重复信息干扰 + +# 推送模式选择 +report: + mode: "daily" # 可选: "daily"|"incremental"|"current" + rank_threshold: 5 # 排名高亮阈值 + +notification: + enable_notification: true # 是否启用通知功能,如果 false,则不发送手机通知 + message_batch_size: 4000 # 消息分批大小(字节)(这个配置别动) + dingtalk_batch_size: 20000 # 钉钉消息分批大小(字节)(这个配置也别动) + feishu_batch_size: 29000 # 飞书消息分批大小(字节) + batch_send_interval: 3 # 批次发送间隔(秒) + feishu_message_separator: "━━━━━━━━━━━━━━━━━━━" # feishu 消息分割线 + + # 🔄 企业微信微信兼容模式 + # 用途:解决企业微信推送到个人微信时格式不兼容的问题 + # true: 使用纯文本格式,微信用户可以正常查看 + # false: 使用 Markdown 格式,企业微信内显示更美观但微信用户无法正常显示 + wework_wechat_compatible: true + + # 🕐 推送时间窗口控制(可选功能) + # 用途:限制推送的时间范围,避免非工作时间打扰 + # 适用场景: + # - 只想在工作日白天接收推送(如 09:00-18:00) + 
# - 希望在晚上固定时间收到汇总(如 20:00-22:00) + push_window: + enabled: false # 是否启用推送时间窗口控制,默认关闭 + # 注意:GitHub Actions 执行时间不稳定,时间范围建议至少留足 2 小时 + # 如果想要精准的定时推送,建议使用 Docker 部署在个人服务器上 + time_range: + start: "20:00" # 推送时间窗口开始(北京时间) + end: "22:00" # 推送时间窗口结束(北京时间) + once_per_day: true # 每天在时间窗口内只推送一次,如果 false,则窗口内每次执行都推送 + push_record_retention_days: 7 # 推送记录保留天数 + + # 请务必妥善保管好 webhooks,不要公开 + # 如果你以 fork 的方式将本项目部署在 GitHub 上,请勿在此填写任何 webhooks,而是将 webhooks 填入 GitHub Secret + # 不然轻则手机上收到奇怪的广告推送,重则存在更严重的安全隐患 + webhooks: + feishu_url: "" # 飞书机器人的 webhook URL + dingtalk_url: "" # 钉钉机器人的 webhook URL + wework_url: "" # 企业微信机器人的 webhook URL + telegram_bot_token: "" # Telegram Bot Token + telegram_chat_id: "" # Telegram Chat ID + email_from: "" # 发件人邮箱地址 + email_password: "" # 发件人邮箱密码或授权码 + email_to: "" # 收件人邮箱地址,多个收件人用逗号分隔 + email_smtp_server: "" # SMTP服务器地址(可选,留空自动识别) + email_smtp_port: "" # SMTP端口(可选,留空自动识别) + ntfy_server_url: "https://ntfy.sh" # ntfy服务器地址,默认使用公共服务,可改为自托管地址 + ntfy_topic: "" # ntfy主题名称 + ntfy_token: "" # ntfy访问令牌(可选,用于私有主题) + +# 用于让关注度更高的新闻在更前面显示,即用算法重新组合不同平台的热搜排序形成你侧重的热搜,合起来是 1 就行 +weight: + rank_weight: 0.6 # 排名权重 + frequency_weight: 0.3 # 频次权重 + hotness_weight: 0.1 # 热度权重 + +# name 可以定义任意名称,只具有显示作用,即使项目运行了几天后,忽然改掉 name 也不会影响代码的正常运行 +platforms: + - id: "toutiao" + name: "今日头条" + - id: "baidu" + name: "百度热搜" + - id: "wallstreetcn-hot" + name: "华尔街见闻" + - id: "thepaper" + name: "澎湃新闻" + - id: "bilibili-hot-search" + name: "bilibili 热搜" + - id: "cls-hot" + name: "财联社热门" + - id: "ifeng" + name: "凤凰网" + - id: "tieba" + name: "贴吧" + - id: "weibo" + name: "微博" + - id: "douyin" + name: "抖音" + - id: "zhihu" + name: "知乎" diff --git a/frequency_words.txt b/config/frequency_words.txt similarity index 73% rename from frequency_words.txt rename to config/frequency_words.txt index b1e21f6f3164f..48f28ab636ed0 100644 --- a/frequency_words.txt +++ b/config/frequency_words.txt @@ -5,51 +5,59 @@ DeepSeek 梁文锋 华为 -任正非 鸿蒙 HarmonyOS +任正非 比亚迪 王传福 +大疆 +DJI + 宇树 王兴兴 -稚晖君 智元 +灵犀 +稚晖君 +彭志辉 黑神话 冯骥 +影之刃零 +梁其伟 + 哪吒 饺子 +杨宇 !车 !餐 -流浪地球 -郭帆 三体 +流浪地球 刘慈欣 +郭帆 -米哈游 -原神 -星穹铁道 +申奥 京东 刘强东 字节 +bytedance 张一鸣 -马斯克 特斯拉 +马斯克 微软 Microsoft -黄仁勋 英伟达 NVIDIA +黄仁勋 AMD @@ -60,6 +68,7 @@ deepmind chatgpt openai +sora claude Anthropic @@ -70,24 +79,35 @@ mac ios ai +!gai 人工智能 -汽车 自动驾驶 -l3 机器人 +国产 +中国 + +美国 +日本 +韩国 + 芯片 -半导体 光刻机 科技 核能 +水电站 +雅鲁藏布江 + +新质生产力 + 月球 登月 火星 宇宙 -飞船 \ No newline at end of file +飞船 +航空 diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000000000..c20194a7aa4b7 --- /dev/null +++ b/docker/.env @@ -0,0 +1,60 @@ +# ============================================ +# 核心配置(环境变量优先级 > config.yaml) +# ============================================ + +# 是否启用爬虫 (true/false) +ENABLE_CRAWLER= +# 是否启用通知 (true/false) +ENABLE_NOTIFICATION= +# 报告模式(daily|incremental|current) +REPORT_MODE= + +# ============================================ +# 推送时间窗口配置 +# ============================================ + +# 是否启用推送时间窗口 (true/false) +PUSH_WINDOW_ENABLED= +# 推送开始时间 (HH:MM 格式,如 08:00) +PUSH_WINDOW_START= +# 推送结束时间 (HH:MM 格式,如 22:00) +PUSH_WINDOW_END= +# 每天只推送一次 (true/false) +PUSH_WINDOW_ONCE_PER_DAY= +# 推送记录保留天数 (数字,如 7) +PUSH_WINDOW_RETENTION_DAYS= + +# ============================================ +# 通知渠道配置 +# ============================================ + +# 推送配置 +FEISHU_WEBHOOK_URL= +TELEGRAM_BOT_TOKEN= +TELEGRAM_CHAT_ID= +DINGTALK_WEBHOOK_URL= +WEWORK_WEBHOOK_URL= + +EMAIL_FROM= +EMAIL_PASSWORD= +EMAIL_TO= +EMAIL_SMTP_SERVER= +EMAIL_SMTP_PORT= + +# ntfy 推送配置 +NTFY_SERVER_URL=https://ntfy.sh +# ntfy主题名称 +NTFY_TOPIC= +# 可选:访问令牌(用于私有主题) 
+NTFY_TOKEN= + +# ============================================ +# 运行配置 +# ============================================ + +# 定时任务表达式,每 30 分钟执行一次(比如 8点,8点半,9点,9点半这种时间规律执行) +CRON_SCHEDULE=*/30 * * * * +# 运行模式:cron/once +RUN_MODE=cron +# 启动时立即执行一次 +IMMEDIATE_RUN=true \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000..f86f65c32f668 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,72 @@ +FROM python:3.10-slim + +WORKDIR /app + +# https://github.com/aptible/supercronic +ARG TARGETARCH +ENV SUPERCRONIC_VERSION=v0.2.34 + +RUN set -ex && \ + apt-get update && \ + apt-get install -y --no-install-recommends curl ca-certificates && \ + case ${TARGETARCH} in \ + amd64) \ + export SUPERCRONIC_URL=https://github.com/aptible/supercronic/releases/download/${SUPERCRONIC_VERSION}/supercronic-linux-amd64; \ + export SUPERCRONIC_SHA1SUM=e8631edc1775000d119b70fd40339a7238eece14; \ + export SUPERCRONIC=supercronic-linux-amd64; \ + ;; \ + arm64) \ + export SUPERCRONIC_URL=https://github.com/aptible/supercronic/releases/download/${SUPERCRONIC_VERSION}/supercronic-linux-arm64; \ + export SUPERCRONIC_SHA1SUM=4ab6343b52bf9da592e8b4bb7ae6eb5a8e21b71e; \ + export SUPERCRONIC=supercronic-linux-arm64; \ + ;; \ + *) \ + echo "Unsupported architecture: ${TARGETARCH}"; \ + exit 1; \ + ;; \ + esac && \ + echo "Downloading supercronic for ${TARGETARCH} from ${SUPERCRONIC_URL}" && \ + # 添加重试机制和超时设置 + for i in 1 2 3 4 5; do \ + echo "Download attempt $i/5"; \ + if curl --fail --silent --show-error --location --retry 3 --retry-delay 2 --connect-timeout 30 --max-time 120 -o "$SUPERCRONIC" "$SUPERCRONIC_URL"; then \ + echo "Download successful"; \ + break; \ + else \ + echo "Download attempt $i failed, exit code: $?"; \ + if [ $i -eq 5 ]; then \ + echo "All download attempts failed"; \ + exit 1; \ + fi; \ + sleep $((i * 2)); \ + fi; \ + done && \ + echo "${SUPERCRONIC_SHA1SUM} ${SUPERCRONIC}" | sha1sum -c - && \ + chmod +x "$SUPERCRONIC" && \ + mv "$SUPERCRONIC" "/usr/local/bin/${SUPERCRONIC}" && \ + ln -s "/usr/local/bin/${SUPERCRONIC}" /usr/local/bin/supercronic && \ + # 验证安装 + supercronic -version && \ + apt-get remove -y curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . +COPY docker/manage.py . + +# 复制 entrypoint.sh 并强制转换为 LF 格式 +COPY docker/entrypoint.sh /entrypoint.sh.tmp +RUN sed -i 's/\r$//' /entrypoint.sh.tmp && \ + mv /entrypoint.sh.tmp /entrypoint.sh && \ + chmod +x /entrypoint.sh && \ + chmod +x manage.py && \ + mkdir -p /app/config /app/output + +ENV PYTHONUNBUFFERED=1 \ + CONFIG_PATH=/app/config/config.yaml \ + FREQUENCY_WORDS_PATH=/app/config/frequency_words.txt + +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/docker/docker-compose-build.yml b/docker/docker-compose-build.yml new file mode 100644 index 0000000000000..5dc69d8c6fa94 --- /dev/null +++ b/docker/docker-compose-build.yml @@ -0,0 +1,44 @@ +services: + trend-radar: + build: + context: .. 
+ dockerfile: docker/Dockerfile + container_name: trend-radar + restart: unless-stopped + + volumes: + - ../config:/app/config:ro + - ../output:/app/output + + environment: + - TZ=Asia/Shanghai + # 核心配置 + - ENABLE_CRAWLER=${ENABLE_CRAWLER:-} + - ENABLE_NOTIFICATION=${ENABLE_NOTIFICATION:-} + - REPORT_MODE=${REPORT_MODE:-} + # 推送时间窗口 + - PUSH_WINDOW_ENABLED=${PUSH_WINDOW_ENABLED:-} + - PUSH_WINDOW_START=${PUSH_WINDOW_START:-} + - PUSH_WINDOW_END=${PUSH_WINDOW_END:-} + - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} + - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-} + # 通知渠道 + - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} + - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} + - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID:-} + - DINGTALK_WEBHOOK_URL=${DINGTALK_WEBHOOK_URL:-} + - WEWORK_WEBHOOK_URL=${WEWORK_WEBHOOK_URL:-} + # 邮件配置 + - EMAIL_FROM=${EMAIL_FROM:-} + - EMAIL_PASSWORD=${EMAIL_PASSWORD:-} + - EMAIL_TO=${EMAIL_TO:-} + - EMAIL_SMTP_SERVER=${EMAIL_SMTP_SERVER:-} + - EMAIL_SMTP_PORT=${EMAIL_SMTP_PORT:-} + # ntfy配置 + - NTFY_SERVER_URL=${NTFY_SERVER_URL:-https://ntfy.sh} + - NTFY_TOPIC=${NTFY_TOPIC:-} + - NTFY_TOKEN=${NTFY_TOKEN:-} + # 运行模式 + - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} + - RUN_MODE=${RUN_MODE:-cron} + - IMMEDIATE_RUN=${IMMEDIATE_RUN:-true} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000000000..94f47fb6b500b --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,42 @@ +services: + trend-radar: + image: wantcat/trendradar:latest + container_name: trend-radar + restart: unless-stopped + + volumes: + - ../config:/app/config:ro + - ../output:/app/output + + environment: + - TZ=Asia/Shanghai + # 核心配置 + - ENABLE_CRAWLER=${ENABLE_CRAWLER:-} + - ENABLE_NOTIFICATION=${ENABLE_NOTIFICATION:-} + - REPORT_MODE=${REPORT_MODE:-} + # 推送时间窗口 + - PUSH_WINDOW_ENABLED=${PUSH_WINDOW_ENABLED:-} + - PUSH_WINDOW_START=${PUSH_WINDOW_START:-} + - PUSH_WINDOW_END=${PUSH_WINDOW_END:-} + - PUSH_WINDOW_ONCE_PER_DAY=${PUSH_WINDOW_ONCE_PER_DAY:-} + - PUSH_WINDOW_RETENTION_DAYS=${PUSH_WINDOW_RETENTION_DAYS:-} + # 通知渠道 + - FEISHU_WEBHOOK_URL=${FEISHU_WEBHOOK_URL:-} + - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-} + - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID:-} + - DINGTALK_WEBHOOK_URL=${DINGTALK_WEBHOOK_URL:-} + - WEWORK_WEBHOOK_URL=${WEWORK_WEBHOOK_URL:-} + # 邮件配置 + - EMAIL_FROM=${EMAIL_FROM:-} + - EMAIL_PASSWORD=${EMAIL_PASSWORD:-} + - EMAIL_TO=${EMAIL_TO:-} + - EMAIL_SMTP_SERVER=${EMAIL_SMTP_SERVER:-} + - EMAIL_SMTP_PORT=${EMAIL_SMTP_PORT:-} + # ntfy配置 + - NTFY_SERVER_URL=${NTFY_SERVER_URL:-https://ntfy.sh} + - NTFY_TOPIC=${NTFY_TOPIC:-} + - NTFY_TOKEN=${NTFY_TOKEN:-} + # 运行模式 + - CRON_SCHEDULE=${CRON_SCHEDULE:-*/5 * * * *} + - RUN_MODE=${RUN_MODE:-cron} + - IMMEDIATE_RUN=${IMMEDIATE_RUN:-true} diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000000000..01d489db91fdf --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -e + +# 检查配置文件 +if [ ! -f "/app/config/config.yaml" ] || [ ! -f "/app/config/frequency_words.txt" ]; then + echo "❌ 配置文件缺失" + exit 1 +fi + +# 保存环境变量 +env >> /etc/environment + +case "${RUN_MODE:-cron}" in +"once") + echo "🔄 单次执行" + exec /usr/local/bin/python main.py + ;; +"cron") + # 生成 crontab + echo "${CRON_SCHEDULE:-*/30 * * * *} cd /app && /usr/local/bin/python main.py" > /tmp/crontab + + echo "📅 生成的crontab内容:" + cat /tmp/crontab + + if ! 
/usr/local/bin/supercronic -test /tmp/crontab; then + echo "❌ crontab格式验证失败" + exit 1 + fi + + # 立即执行一次(如果配置了) + if [ "${IMMEDIATE_RUN:-false}" = "true" ]; then + echo "▶️ 立即执行一次" + /usr/local/bin/python main.py + fi + + echo "⏰ 启动supercronic: ${CRON_SCHEDULE:-*/30 * * * *}" + echo "🎯 supercronic 将作为 PID 1 运行" + + exec /usr/local/bin/supercronic -passthrough-logs /tmp/crontab + ;; +*) + exec "$@" + ;; +esac \ No newline at end of file diff --git a/docker/manage.py b/docker/manage.py new file mode 100644 index 0000000000000..e72d553e040b7 --- /dev/null +++ b/docker/manage.py @@ -0,0 +1,472 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +新闻爬虫容器管理工具 - supercronic +""" + +import os +import sys +import subprocess +import time +from pathlib import Path + + +def run_command(cmd, shell=True, capture_output=True): + """执行系统命令""" + try: + result = subprocess.run( + cmd, shell=shell, capture_output=capture_output, text=True + ) + return result.returncode == 0, result.stdout, result.stderr + except Exception as e: + return False, "", str(e) + + +def manual_run(): + """手动执行一次爬虫""" + print("🔄 手动执行爬虫...") + try: + result = subprocess.run( + ["python", "main.py"], cwd="/app", capture_output=False, text=True + ) + if result.returncode == 0: + print("✅ 执行完成") + else: + print(f"❌ 执行失败,退出码: {result.returncode}") + except Exception as e: + print(f"❌ 执行出错: {e}") + + +def parse_cron_schedule(cron_expr): + """解析cron表达式并返回人类可读的描述""" + if not cron_expr or cron_expr == "未设置": + return "未设置" + + try: + parts = cron_expr.strip().split() + if len(parts) != 5: + return f"原始表达式: {cron_expr}" + + minute, hour, day, month, weekday = parts + + # 分析分钟 + if minute == "*": + minute_desc = "每分钟" + elif minute.startswith("*/"): + interval = minute[2:] + minute_desc = f"每{interval}分钟" + elif "," in minute: + minute_desc = f"在第{minute}分钟" + else: + minute_desc = f"在第{minute}分钟" + + # 分析小时 + if hour == "*": + hour_desc = "每小时" + elif hour.startswith("*/"): + interval = hour[2:] + hour_desc = f"每{interval}小时" + elif "," in hour: + hour_desc = f"在{hour}点" + else: + hour_desc = f"在{hour}点" + + # 分析日期 + if day == "*": + day_desc = "每天" + elif day.startswith("*/"): + interval = day[2:] + day_desc = f"每{interval}天" + else: + day_desc = f"每月{day}号" + + # 分析月份 + if month == "*": + month_desc = "每月" + else: + month_desc = f"在{month}月" + + # 分析星期 + weekday_names = { + "0": "周日", "1": "周一", "2": "周二", "3": "周三", + "4": "周四", "5": "周五", "6": "周六", "7": "周日" + } + if weekday == "*": + weekday_desc = "" + else: + weekday_desc = f"在{weekday_names.get(weekday, weekday)}" + + # 组合描述 + if minute.startswith("*/") and hour == "*" and day == "*" and month == "*" and weekday == "*": + # 简单的间隔模式,如 */30 * * * * + return f"每{minute[2:]}分钟执行一次" + elif hour != "*" and minute != "*" and day == "*" and month == "*" and weekday == "*": + # 每天特定时间,如 0 9 * * * + return f"每天{hour}:{minute.zfill(2)}执行" + elif weekday != "*" and day == "*": + # 每周特定时间 + return f"{weekday_desc}{hour}:{minute.zfill(2)}执行" + else: + # 复杂模式,显示详细信息 + desc_parts = [part for part in [month_desc, day_desc, weekday_desc, hour_desc, minute_desc] if part and part != "每月" and part != "每天" and part != "每小时"] + if desc_parts: + return " ".join(desc_parts) + "执行" + else: + return f"复杂表达式: {cron_expr}" + + except Exception as e: + return f"解析失败: {cron_expr}" + + +def show_status(): + """显示容器状态""" + print("📊 容器状态:") + + # 检查 PID 1 状态 + supercronic_is_pid1 = False + pid1_cmdline = "" + try: + with open('/proc/1/cmdline', 'r') as f: + pid1_cmdline = f.read().replace('\x00', ' ').strip() + 
print(f" 🔍 PID 1 进程: {pid1_cmdline}") + + if "supercronic" in pid1_cmdline.lower(): + print(" ✅ supercronic 正确运行为 PID 1") + supercronic_is_pid1 = True + else: + print(" ❌ PID 1 不是 supercronic") + print(f" 📋 实际的 PID 1: {pid1_cmdline}") + except Exception as e: + print(f" ❌ 无法读取 PID 1 信息: {e}") + + # 检查环境变量 + cron_schedule = os.environ.get("CRON_SCHEDULE", "未设置") + run_mode = os.environ.get("RUN_MODE", "未设置") + immediate_run = os.environ.get("IMMEDIATE_RUN", "未设置") + + print(f" ⚙️ 运行配置:") + print(f" CRON_SCHEDULE: {cron_schedule}") + + # 解析并显示cron表达式的含义 + cron_description = parse_cron_schedule(cron_schedule) + print(f" ⏰ 执行频率: {cron_description}") + + print(f" RUN_MODE: {run_mode}") + print(f" IMMEDIATE_RUN: {immediate_run}") + + # 检查配置文件 + config_files = ["/app/config/config.yaml", "/app/config/frequency_words.txt"] + print(" 📁 配置文件:") + for file_path in config_files: + if Path(file_path).exists(): + print(f" ✅ {Path(file_path).name}") + else: + print(f" ❌ {Path(file_path).name} 缺失") + + # 检查关键文件 + key_files = [ + ("/usr/local/bin/supercronic-linux-amd64", "supercronic二进制文件"), + ("/usr/local/bin/supercronic", "supercronic软链接"), + ("/tmp/crontab", "crontab文件"), + ("/entrypoint.sh", "启动脚本") + ] + + print(" 📂 关键文件检查:") + for file_path, description in key_files: + if Path(file_path).exists(): + print(f" ✅ {description}: 存在") + # 对于crontab文件,显示内容 + if file_path == "/tmp/crontab": + try: + with open(file_path, 'r') as f: + crontab_content = f.read().strip() + print(f" 内容: {crontab_content}") + except: + pass + else: + print(f" ❌ {description}: 不存在") + + # 检查容器运行时间 + print(" ⏱️ 容器时间信息:") + try: + # 检查 PID 1 的启动时间 + with open('/proc/1/stat', 'r') as f: + stat_content = f.read().strip().split() + if len(stat_content) >= 22: + # starttime 是第22个字段(索引21) + starttime_ticks = int(stat_content[21]) + + # 读取系统启动时间 + with open('/proc/stat', 'r') as stat_f: + for line in stat_f: + if line.startswith('btime'): + boot_time = int(line.split()[1]) + break + else: + boot_time = 0 + + # 读取系统时钟频率 + clock_ticks = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + + if boot_time > 0: + pid1_start_time = boot_time + (starttime_ticks / clock_ticks) + current_time = time.time() + uptime_seconds = int(current_time - pid1_start_time) + uptime_minutes = uptime_seconds // 60 + uptime_hours = uptime_minutes // 60 + + if uptime_hours > 0: + print(f" PID 1 运行时间: {uptime_hours} 小时 {uptime_minutes % 60} 分钟") + else: + print(f" PID 1 运行时间: {uptime_minutes} 分钟 ({uptime_seconds} 秒)") + else: + print(f" PID 1 运行时间: 无法精确计算") + else: + print(" ❌ 无法解析 PID 1 统计信息") + except Exception as e: + print(f" ❌ 时间检查失败: {e}") + + # 状态总结和建议 + print(" 📊 状态总结:") + if supercronic_is_pid1: + print(" ✅ supercronic 正确运行为 PID 1") + print(" ✅ 定时任务应该正常工作") + + # 显示当前的调度信息 + if cron_schedule != "未设置": + print(f" ⏰ 当前调度: {cron_description}") + + # 提供一些常见的调度建议 + if "分钟" in cron_description and "每30分钟" not in cron_description and "每60分钟" not in cron_description: + print(" 💡 频繁执行模式,适合实时监控") + elif "小时" in cron_description: + print(" 💡 按小时执行模式,适合定期汇总") + elif "天" in cron_description: + print(" 💡 每日执行模式,适合日报生成") + + print(" 💡 如果定时任务不执行,检查:") + print(" • crontab 格式是否正确") + print(" • 时区设置是否正确") + print(" • 应用程序是否有错误") + else: + print(" ❌ supercronic 状态异常") + if pid1_cmdline: + print(f" 📋 当前 PID 1: {pid1_cmdline}") + print(" 💡 建议操作:") + print(" • 重启容器: docker restart trend-radar") + print(" • 检查容器日志: docker logs trend-radar") + + # 显示日志检查建议 + print(" 📋 运行状态检查:") + print(" • 查看完整容器日志: docker logs trend-radar") + print(" • 查看实时日志: docker logs -f trend-radar") + print(" • 
手动执行测试: python manage.py run") + print(" • 重启容器服务: docker restart trend-radar") + + +def show_config(): + """显示当前配置""" + print("⚙️ 当前配置:") + + env_vars = [ + "CRON_SCHEDULE", + "RUN_MODE", + "IMMEDIATE_RUN", + "FEISHU_WEBHOOK_URL", + "DINGTALK_WEBHOOK_URL", + "WEWORK_WEBHOOK_URL", + "TELEGRAM_BOT_TOKEN", + "TELEGRAM_CHAT_ID", + "CONFIG_PATH", + "FREQUENCY_WORDS_PATH", + ] + + for var in env_vars: + value = os.environ.get(var, "未设置") + # 隐藏敏感信息 + if any(sensitive in var for sensitive in ["WEBHOOK", "TOKEN", "KEY"]): + if value and value != "未设置": + masked_value = value[:10] + "***" if len(value) > 10 else "***" + print(f" {var}: {masked_value}") + else: + print(f" {var}: {value}") + else: + print(f" {var}: {value}") + + crontab_file = "/tmp/crontab" + if Path(crontab_file).exists(): + print(" 📅 Crontab内容:") + try: + with open(crontab_file, "r") as f: + content = f.read().strip() + print(f" {content}") + except Exception as e: + print(f" 读取失败: {e}") + else: + print(" 📅 Crontab文件不存在") + + +def show_files(): + """显示输出文件""" + print("📁 输出文件:") + + output_dir = Path("/app/output") + if not output_dir.exists(): + print(" 📭 输出目录不存在") + return + + # 显示最近的文件 + date_dirs = sorted([d for d in output_dir.iterdir() if d.is_dir()], reverse=True) + + if not date_dirs: + print(" 📭 输出目录为空") + return + + # 显示最近2天的文件 + for date_dir in date_dirs[:2]: + print(f" 📅 {date_dir.name}:") + for subdir in ["html", "txt"]: + sub_path = date_dir / subdir + if sub_path.exists(): + files = list(sub_path.glob("*")) + if files: + recent_files = sorted( + files, key=lambda x: x.stat().st_mtime, reverse=True + )[:3] + print(f" 📂 {subdir}: {len(files)} 个文件") + for file in recent_files: + mtime = time.ctime(file.stat().st_mtime) + size_kb = file.stat().st_size // 1024 + print( + f" 📄 {file.name} ({size_kb}KB, {mtime.split()[3][:5]})" + ) + else: + print(f" 📂 {subdir}: 空") + + +def show_logs(): + """显示实时日志""" + print("📋 实时日志 (按 Ctrl+C 退出):") + print("💡 提示: 这将显示 PID 1 进程的输出") + try: + # 尝试多种方法查看日志 + log_files = [ + "/proc/1/fd/1", # PID 1 的标准输出 + "/proc/1/fd/2", # PID 1 的标准错误 + ] + + for log_file in log_files: + if Path(log_file).exists(): + print(f"📄 尝试读取: {log_file}") + subprocess.run(["tail", "-f", log_file], check=True) + break + else: + print("📋 无法找到标准日志文件,建议使用: docker logs trend-radar") + + except KeyboardInterrupt: + print("\n👋 退出日志查看") + except Exception as e: + print(f"❌ 查看日志失败: {e}") + print("💡 建议使用: docker logs trend-radar") + + +def restart_supercronic(): + """重启supercronic进程""" + print("🔄 重启supercronic...") + print("⚠️ 注意: supercronic 是 PID 1,无法直接重启") + + # 检查当前 PID 1 + try: + with open('/proc/1/cmdline', 'r') as f: + pid1_cmdline = f.read().replace('\x00', ' ').strip() + print(f" 🔍 当前 PID 1: {pid1_cmdline}") + + if "supercronic" in pid1_cmdline.lower(): + print(" ✅ PID 1 是 supercronic") + print(" 💡 要重启 supercronic,需要重启整个容器:") + print(" docker restart trend-radar") + else: + print(" ❌ PID 1 不是 supercronic,这是异常状态") + print(" 💡 建议重启容器以修复问题:") + print(" docker restart trend-radar") + except Exception as e: + print(f" ❌ 无法检查 PID 1: {e}") + print(" 💡 建议重启容器: docker restart trend-radar") + + +def show_help(): + """显示帮助信息""" + help_text = """ +🐳 TrendRadar 容器管理工具 + +📋 命令列表: + run - 手动执行一次爬虫 + status - 显示容器运行状态 + config - 显示当前配置 + files - 显示输出文件 + logs - 实时查看日志 + restart - 重启说明 + help - 显示此帮助 + +📖 使用示例: + # 在容器中执行 + python manage.py run + python manage.py status + python manage.py logs + + # 在宿主机执行 + docker exec -it trend-radar python manage.py run + docker exec -it trend-radar python manage.py status + docker logs 
trend-radar + +💡 常用操作指南: + 1. 检查运行状态: status + - 查看 supercronic 是否为 PID 1 + - 检查配置文件和关键文件 + - 查看 cron 调度设置 + + 2. 手动执行测试: run + - 立即执行一次新闻爬取 + - 测试程序是否正常工作 + + 3. 查看日志: logs + - 实时监控运行情况 + - 也可使用: docker logs trend-radar + + 4. 重启服务: restart + - 由于 supercronic 是 PID 1,需要重启整个容器 + - 使用: docker restart trend-radar +""" + print(help_text) + + +def main(): + if len(sys.argv) < 2: + show_help() + return + + command = sys.argv[1] + commands = { + "run": manual_run, + "status": show_status, + "config": show_config, + "files": show_files, + "logs": show_logs, + "restart": restart_supercronic, + "help": show_help, + } + + if command in commands: + try: + commands[command]() + except KeyboardInterrupt: + print("\n👋 操作已取消") + except Exception as e: + print(f"❌ 执行出错: {e}") + else: + print(f"❌ 未知命令: {command}") + print("运行 'python manage.py help' 查看可用命令") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000000000..90ab5b9d9fa37 --- /dev/null +++ b/index.html @@ -0,0 +1,747 @@ + + + + + + 热点新闻分析 + + + + +
+<!-- 示例报告页面正文(HTML 标签在提取中丢失,此处仅保留可恢复的页面信息):
+     报告类型:当日汇总;新闻总数:387 条;热点新闻:5 条;生成时间:06-16 07:17
+     频率词卡片 1/4 「ai 人工智能」3 条:财联社热门 排名7-8 00:23~07:17 15次;tieba 排名18-19 00:23~07:17 15次
+     频率词卡片 2/4 「DeepSeek 梁文锋」1 条:华尔街见闻 排名8-9 00:23~07:17 15次
+     频率词卡片 3/4 「哪吒 饺子」1 条:百度热搜 排名24-30 00:57~06:55 7次
+     频率词卡片 4/4 「米哈游 原神 星穹铁道」1 条:zhihu 排名5 06:55~07:17 2次 -->
+ + + + \ No newline at end of file diff --git a/main.py b/main.py index c2671ea12cedb..d2dac2162024a 100644 --- a/main.py +++ b/main.py @@ -2,67 +2,444 @@ import json import os -import time import random -from datetime import datetime +import re +import time import webbrowser +import smtplib +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from email.header import Header +from email.utils import formataddr, formatdate, make_msgid +from datetime import datetime +from pathlib import Path from typing import Dict, List, Tuple, Optional, Union -import requests import pytz - -# 配置常量 -CONFIG = { - "FEISHU_SEPARATOR": "==============================", # 飞书消息中,每个频率词之间的分割线,注意,其它类型的分割线可能会被飞书过滤而显示怪异 - "REQUEST_INTERVAL": 1000, # 毫秒 - "FEISHU_REPORT_TYPE": "daily", # 可选: "current", "daily", "both" - "RANK_THRESHOLD": 5, # 排名阈值,决定使用【】还是[]的界限 - "USE_PROXY": False, # 是否启用本地代理 - "DEFAULT_PROXY": "http://127.0.0.1:10086", - "CONTINUE_WITHOUT_FEISHU": True, # 控制是否在没有飞书webhook URL时继续执行爬虫, 如果True ,会依然进行爬虫行为,会在github上持续的生成爬取的新闻数据 - "FEISHU_WEBHOOK_URL": "", # 飞书机器人的webhook URL,大概长这样:https://www.feishu.cn/flow/api/trigger-webhook/xxxx, 默认为空,推荐通过GitHub Secrets设置 +import requests +import yaml + + +VERSION = "3.0.5" + + +# === SMTP邮件配置 === +SMTP_CONFIGS = { + # Gmail(使用 STARTTLS) + "gmail.com": {"server": "smtp.gmail.com", "port": 587, "encryption": "TLS"}, + # QQ邮箱(使用 SSL,更稳定) + "qq.com": {"server": "smtp.qq.com", "port": 465, "encryption": "SSL"}, + # Outlook(使用 STARTTLS) + "outlook.com": { + "server": "smtp-mail.outlook.com", + "port": 587, + "encryption": "TLS", + }, + "hotmail.com": { + "server": "smtp-mail.outlook.com", + "port": 587, + "encryption": "TLS", + }, + "live.com": {"server": "smtp-mail.outlook.com", "port": 587, "encryption": "TLS"}, + # 网易邮箱(使用 SSL,更稳定) + "163.com": {"server": "smtp.163.com", "port": 465, "encryption": "SSL"}, + "126.com": {"server": "smtp.126.com", "port": 465, "encryption": "SSL"}, + # 新浪邮箱(使用 SSL) + "sina.com": {"server": "smtp.sina.com", "port": 465, "encryption": "SSL"}, + # 搜狐邮箱(使用 SSL) + "sohu.com": {"server": "smtp.sohu.com", "port": 465, "encryption": "SSL"}, } -class TimeHelper: - """时间相关的辅助功能""" +# === 配置管理 === +def load_config(): + """加载配置文件""" + config_path = os.environ.get("CONFIG_PATH", "config/config.yaml") + + if not Path(config_path).exists(): + raise FileNotFoundError(f"配置文件 {config_path} 不存在") + + with open(config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + + print(f"配置文件加载成功: {config_path}") + + # 构建配置 + config = { + "VERSION_CHECK_URL": config_data["app"]["version_check_url"], + "SHOW_VERSION_UPDATE": config_data["app"]["show_version_update"], + "REQUEST_INTERVAL": config_data["crawler"]["request_interval"], + "REPORT_MODE": os.environ.get("REPORT_MODE", "").strip() + or config_data["report"]["mode"], + "RANK_THRESHOLD": config_data["report"]["rank_threshold"], + "USE_PROXY": config_data["crawler"]["use_proxy"], + "DEFAULT_PROXY": config_data["crawler"]["default_proxy"], + "ENABLE_CRAWLER": os.environ.get("ENABLE_CRAWLER", "").strip().lower() + in ("true", "1") + if os.environ.get("ENABLE_CRAWLER", "").strip() + else config_data["crawler"]["enable_crawler"], + "ENABLE_NOTIFICATION": os.environ.get("ENABLE_NOTIFICATION", "").strip().lower() + in ("true", "1") + if os.environ.get("ENABLE_NOTIFICATION", "").strip() + else config_data["notification"]["enable_notification"], + "MESSAGE_BATCH_SIZE": config_data["notification"]["message_batch_size"], + "DINGTALK_BATCH_SIZE": 
config_data["notification"].get( + "dingtalk_batch_size", 20000 + ), + "FEISHU_BATCH_SIZE": config_data["notification"].get("feishu_batch_size", 29000), + "BATCH_SEND_INTERVAL": config_data["notification"]["batch_send_interval"], + "FEISHU_MESSAGE_SEPARATOR": config_data["notification"][ + "feishu_message_separator" + ], + "PUSH_WINDOW": { + "ENABLED": os.environ.get("PUSH_WINDOW_ENABLED", "").strip().lower() + in ("true", "1") + if os.environ.get("PUSH_WINDOW_ENABLED", "").strip() + else config_data["notification"] + .get("push_window", {}) + .get("enabled", False), + "TIME_RANGE": { + "START": os.environ.get("PUSH_WINDOW_START", "").strip() + or config_data["notification"] + .get("push_window", {}) + .get("time_range", {}) + .get("start", "08:00"), + "END": os.environ.get("PUSH_WINDOW_END", "").strip() + or config_data["notification"] + .get("push_window", {}) + .get("time_range", {}) + .get("end", "22:00"), + }, + "ONCE_PER_DAY": os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip().lower() + in ("true", "1") + if os.environ.get("PUSH_WINDOW_ONCE_PER_DAY", "").strip() + else config_data["notification"] + .get("push_window", {}) + .get("once_per_day", True), + "RECORD_RETENTION_DAYS": int( + os.environ.get("PUSH_WINDOW_RETENTION_DAYS", "").strip() or "0" + ) + or config_data["notification"] + .get("push_window", {}) + .get("push_record_retention_days", 7), + }, + "WEIGHT_CONFIG": { + "RANK_WEIGHT": config_data["weight"]["rank_weight"], + "FREQUENCY_WEIGHT": config_data["weight"]["frequency_weight"], + "HOTNESS_WEIGHT": config_data["weight"]["hotness_weight"], + }, + "PLATFORMS": config_data["platforms"], + } + + # 通知渠道配置(环境变量优先) + notification = config_data.get("notification", {}) + webhooks = notification.get("webhooks", {}) + + config["FEISHU_WEBHOOK_URL"] = os.environ.get( + "FEISHU_WEBHOOK_URL", "" + ).strip() or webhooks.get("feishu_url", "") + config["DINGTALK_WEBHOOK_URL"] = os.environ.get( + "DINGTALK_WEBHOOK_URL", "" + ).strip() or webhooks.get("dingtalk_url", "") + config["WEWORK_WEBHOOK_URL"] = os.environ.get( + "WEWORK_WEBHOOK_URL", "" + ).strip() or webhooks.get("wework_url", "") + config["TELEGRAM_BOT_TOKEN"] = os.environ.get( + "TELEGRAM_BOT_TOKEN", "" + ).strip() or webhooks.get("telegram_bot_token", "") + config["TELEGRAM_CHAT_ID"] = os.environ.get( + "TELEGRAM_CHAT_ID", "" + ).strip() or webhooks.get("telegram_chat_id", "") + + # 邮件配置 + config["EMAIL_FROM"] = os.environ.get("EMAIL_FROM", "").strip() or webhooks.get( + "email_from", "" + ) + config["EMAIL_PASSWORD"] = os.environ.get( + "EMAIL_PASSWORD", "" + ).strip() or webhooks.get("email_password", "") + config["EMAIL_TO"] = os.environ.get("EMAIL_TO", "").strip() or webhooks.get( + "email_to", "" + ) + config["EMAIL_SMTP_SERVER"] = os.environ.get( + "EMAIL_SMTP_SERVER", "" + ).strip() or webhooks.get("email_smtp_server", "") + config["EMAIL_SMTP_PORT"] = os.environ.get( + "EMAIL_SMTP_PORT", "" + ).strip() or webhooks.get("email_smtp_port", "") + + # ntfy配置 + config["NTFY_SERVER_URL"] = os.environ.get( + "NTFY_SERVER_URL", "https://ntfy.sh" + ).strip() or webhooks.get("ntfy_server_url", "https://ntfy.sh") + config["NTFY_TOPIC"] = os.environ.get("NTFY_TOPIC", "").strip() or webhooks.get( + "ntfy_topic", "" + ) + config["NTFY_TOKEN"] = os.environ.get("NTFY_TOKEN", "").strip() or webhooks.get( + "ntfy_token", "" + ) + + # 企业微信微信兼容模式配置 + config["WEWORK_WECHAT_COMPATIBLE"] = os.environ.get( + "WEWORK_WECHAT_COMPATIBLE", "" + ).strip().lower() in ("true", "1") if os.environ.get("WEWORK_WECHAT_COMPATIBLE", "").strip() else 
notification.get( + "wework_wechat_compatible", False + ) + + # 输出配置来源信息 + notification_sources = [] + if config["FEISHU_WEBHOOK_URL"]: + source = "环境变量" if os.environ.get("FEISHU_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"飞书({source})") + if config["DINGTALK_WEBHOOK_URL"]: + source = "环境变量" if os.environ.get("DINGTALK_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"钉钉({source})") + if config["WEWORK_WEBHOOK_URL"]: + source = "环境变量" if os.environ.get("WEWORK_WEBHOOK_URL") else "配置文件" + notification_sources.append(f"企业微信({source})") + if config["TELEGRAM_BOT_TOKEN"] and config["TELEGRAM_CHAT_ID"]: + token_source = ( + "环境变量" if os.environ.get("TELEGRAM_BOT_TOKEN") else "配置文件" + ) + chat_source = "环境变量" if os.environ.get("TELEGRAM_CHAT_ID") else "配置文件" + notification_sources.append(f"Telegram({token_source}/{chat_source})") + if config["EMAIL_FROM"] and config["EMAIL_PASSWORD"] and config["EMAIL_TO"]: + from_source = "环境变量" if os.environ.get("EMAIL_FROM") else "配置文件" + notification_sources.append(f"邮件({from_source})") + + if config["NTFY_SERVER_URL"] and config["NTFY_TOPIC"]: + server_source = "环境变量" if os.environ.get("NTFY_SERVER_URL") else "配置文件" + notification_sources.append(f"ntfy({server_source})") + + if notification_sources: + print(f"通知渠道配置来源: {', '.join(notification_sources)}") + else: + print("未配置任何通知渠道") + + return config + + +print("正在加载配置...") +CONFIG = load_config() +print(f"TrendRadar v{VERSION} 配置加载完成") +print(f"监控平台数量: {len(CONFIG['PLATFORMS'])}") + + +# === 工具函数 === +def get_beijing_time(): + """获取北京时间""" + return datetime.now(pytz.timezone("Asia/Shanghai")) + + +def format_date_folder(): + """格式化日期文件夹""" + return get_beijing_time().strftime("%Y年%m月%d日") + + +def format_time_filename(): + """格式化时间文件名""" + return get_beijing_time().strftime("%H时%M分") + + +def clean_title(title: str) -> str: + """清理标题中的特殊字符""" + if not isinstance(title, str): + title = str(title) + cleaned_title = title.replace("\n", " ").replace("\r", " ") + cleaned_title = re.sub(r"\s+", " ", cleaned_title) + cleaned_title = cleaned_title.strip() + return cleaned_title + + +def ensure_directory_exists(directory: str): + """确保目录存在""" + Path(directory).mkdir(parents=True, exist_ok=True) - @staticmethod - def get_beijing_time() -> datetime: - """获取北京时间""" - return datetime.now(pytz.timezone("Asia/Shanghai")) - @staticmethod - def format_date_folder() -> str: - """返回日期文件夹名称格式""" - return TimeHelper.get_beijing_time().strftime("%Y年%m月%d日") +def get_output_path(subfolder: str, filename: str) -> str: + """获取输出路径""" + date_folder = format_date_folder() + output_dir = Path("output") / date_folder / subfolder + ensure_directory_exists(str(output_dir)) + return str(output_dir / filename) - @staticmethod - def format_time_filename() -> str: - """返回时间文件名格式""" - return TimeHelper.get_beijing_time().strftime("%H时%M分") +def check_version_update( + current_version: str, version_url: str, proxy_url: Optional[str] = None +) -> Tuple[bool, Optional[str]]: + """检查版本更新""" + try: + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "text/plain, */*", + "Cache-Control": "no-cache", + } + + response = requests.get( + version_url, proxies=proxies, headers=headers, timeout=10 + ) + response.raise_for_status() + + remote_version = response.text.strip() + print(f"当前版本: {current_version}, 远程版本: {remote_version}") + + # 比较版本 + def parse_version(version_str): + try: + parts = 
version_str.strip().split(".") + if len(parts) != 3: + raise ValueError("版本号格式不正确") + return int(parts[0]), int(parts[1]), int(parts[2]) + except: + return 0, 0, 0 + + current_tuple = parse_version(current_version) + remote_tuple = parse_version(remote_version) + + need_update = current_tuple < remote_tuple + return need_update, remote_version if need_update else None + + except Exception as e: + print(f"版本检查失败: {e}") + return False, None + + +def is_first_crawl_today() -> bool: + """检测是否是当天第一次爬取""" + date_folder = format_date_folder() + txt_dir = Path("output") / date_folder / "txt" + + if not txt_dir.exists(): + return True + + files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) + return len(files) <= 1 + + +def html_escape(text: str) -> str: + """HTML转义""" + if not isinstance(text, str): + text = str(text) + + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + + +# === 推送记录管理 === +class PushRecordManager: + """推送记录管理器""" + + def __init__(self): + self.record_dir = Path("output") / ".push_records" + self.ensure_record_dir() + self.cleanup_old_records() + + def ensure_record_dir(self): + """确保记录目录存在""" + self.record_dir.mkdir(parents=True, exist_ok=True) + + def get_today_record_file(self) -> Path: + """获取今天的记录文件路径""" + today = get_beijing_time().strftime("%Y%m%d") + return self.record_dir / f"push_record_{today}.json" + + def cleanup_old_records(self): + """清理过期的推送记录""" + retention_days = CONFIG["PUSH_WINDOW"]["RECORD_RETENTION_DAYS"] + current_time = get_beijing_time() + + for record_file in self.record_dir.glob("push_record_*.json"): + try: + date_str = record_file.stem.replace("push_record_", "") + file_date = datetime.strptime(date_str, "%Y%m%d") + file_date = pytz.timezone("Asia/Shanghai").localize(file_date) + + if (current_time - file_date).days > retention_days: + record_file.unlink() + print(f"清理过期推送记录: {record_file.name}") + except Exception as e: + print(f"清理记录文件失败 {record_file}: {e}") + + def has_pushed_today(self) -> bool: + """检查今天是否已经推送过""" + record_file = self.get_today_record_file() -class FileHelper: - """文件操作相关的辅助功能""" + if not record_file.exists(): + return False - @staticmethod - def ensure_directory_exists(directory: str) -> None: - """确保目录存在,如果不存在则创建""" - if not os.path.exists(directory): - os.makedirs(directory) + try: + with open(record_file, "r", encoding="utf-8") as f: + record = json.load(f) + return record.get("pushed", False) + except Exception as e: + print(f"读取推送记录失败: {e}") + return False - @staticmethod - def get_output_path(subfolder: str, filename: str) -> str: - """获取输出文件路径""" - date_folder = TimeHelper.format_date_folder() - output_dir = os.path.join("output", date_folder, subfolder) - FileHelper.ensure_directory_exists(output_dir) - return os.path.join(output_dir, filename) + def record_push(self, report_type: str): + """记录推送""" + record_file = self.get_today_record_file() + now = get_beijing_time() + record = { + "pushed": True, + "push_time": now.strftime("%Y-%m-%d %H:%M:%S"), + "report_type": report_type, + } + try: + with open(record_file, "w", encoding="utf-8") as f: + json.dump(record, f, ensure_ascii=False, indent=2) + print(f"推送记录已保存: {report_type} at {now.strftime('%H:%M:%S')}") + except Exception as e: + print(f"保存推送记录失败: {e}") + + def is_in_time_range(self, start_time: str, end_time: str) -> bool: + """检查当前时间是否在指定时间范围内""" + now = get_beijing_time() + current_time = now.strftime("%H:%M") + + def normalize_time(time_str: str) -> str: + """将时间字符串标准化为 
HH:MM 格式""" + try: + parts = time_str.strip().split(":") + if len(parts) != 2: + raise ValueError(f"时间格式错误: {time_str}") + + hour = int(parts[0]) + minute = int(parts[1]) + + if not (0 <= hour <= 23 and 0 <= minute <= 59): + raise ValueError(f"时间范围错误: {time_str}") + + return f"{hour:02d}:{minute:02d}" + except Exception as e: + print(f"时间格式化错误 '{time_str}': {e}") + return time_str + + normalized_start = normalize_time(start_time) + normalized_end = normalize_time(end_time) + normalized_current = normalize_time(current_time) + + result = normalized_start <= normalized_current <= normalized_end + + if not result: + print(f"时间窗口判断:当前 {normalized_current},窗口 {normalized_start}-{normalized_end}") + + return result + + +# === 数据获取 === class DataFetcher: - """数据获取相关功能""" + """数据获取器""" def __init__(self, proxy_url: Optional[str] = None): self.proxy_url = proxy_url @@ -74,20 +451,7 @@ def fetch_data( min_retry_wait: int = 3, max_retry_wait: int = 5, ) -> Tuple[Optional[str], str, str]: - """ - 同步获取指定ID的数据,失败时进行重试 - 接受'success'和'cache'两种状态,其他状态才会触发重试 - - Args: - id_info: ID信息,可以是ID字符串或(ID, 别名)元组 - max_retries: 最大重试次数 - min_retry_wait: 最小重试等待时间(秒) - max_retry_wait: 最大重试等待时间(秒) - - Returns: - (响应数据, ID, 别名)元组,如果请求失败则响应数据为None - """ - # 处理ID和别名 + """获取指定ID数据,支持重试""" if isinstance(id_info, tuple): id_value, alias = id_info else: @@ -96,16 +460,14 @@ def fetch_data( url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" - # 设置代理 proxies = None if self.proxy_url: proxies = {"http": self.proxy_url, "https": self.proxy_url} - # 添加随机性模拟真实用户 headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "Accept": "application/json, text/plain, */*", - "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Connection": "keep-alive", "Cache-Control": "no-cache", } @@ -113,42 +475,32 @@ def fetch_data( retries = 0 while retries <= max_retries: try: - print( - f"正在请求 {id_value} 数据... (尝试 {retries + 1}/{max_retries + 1})" - ) response = requests.get( url, proxies=proxies, headers=headers, timeout=10 ) - response.raise_for_status() # 检查HTTP状态码 + response.raise_for_status() - # 解析JSON并检查响应状态 data_text = response.text data_json = json.loads(data_text) - # 修改状态检查逻辑:接受success和cache两种状态 status = data_json.get("status", "未知") if status not in ["success", "cache"]: raise ValueError(f"响应状态异常: {status}") - # 记录状态信息 status_info = "最新数据" if status == "success" else "缓存数据" - print(f"成功获取 {id_value} 数据({status_info})") + print(f"获取 {id_value} 成功({status_info})") return data_text, id_value, alias except Exception as e: retries += 1 if retries <= max_retries: - # 计算重试等待时间:基础3-5秒,每次重试增加1-2秒 base_wait = random.uniform(min_retry_wait, max_retry_wait) additional_wait = (retries - 1) * random.uniform(1, 2) wait_time = base_wait + additional_wait - - print( - f"请求 {id_value} 失败: {e}. 将在 {wait_time:.2f} 秒后重试..." - ) + print(f"请求 {id_value} 失败: {e}. {wait_time:.2f}秒后重试...") time.sleep(wait_time) else: - print(f"请求 {id_value} 失败: {e}. 
已达到最大重试次数。") + print(f"请求 {id_value} 失败: {e}") return None, id_value, alias return None, id_value, alias @@ -157,931 +509,4121 @@ def crawl_websites( ids_list: List[Union[str, Tuple[str, str]]], request_interval: int = CONFIG["REQUEST_INTERVAL"], ) -> Tuple[Dict, Dict, List]: - """ - 爬取多个网站的数据,使用同步请求 - - Args: - ids_list: ID列表,每个元素可以是ID字符串或(ID, 别名)元组 - request_interval: 请求间隔(毫秒) - - Returns: - (results, id_to_alias, failed_ids)元组 - """ + """爬取多个网站数据""" results = {} - id_to_alias = {} + id_to_name = {} failed_ids = [] for i, id_info in enumerate(ids_list): - # 处理ID和别名 if isinstance(id_info, tuple): - id_value, alias = id_info + id_value, name = id_info else: id_value = id_info - alias = id_value + name = id_value - # 添加到ID-别名映射 - id_to_alias[id_value] = alias - - # 发送请求 + id_to_name[id_value] = name response, _, _ = self.fetch_data(id_info) - # 处理响应 if response: try: data = json.loads(response) - # 获取标题列表,同时记录排名 results[id_value] = {} for index, item in enumerate(data.get("items", []), 1): title = item["title"] + url = item.get("url", "") + mobile_url = item.get("mobileUrl", "") + if title in results[id_value]: - results[id_value][title].append(index) + results[id_value][title]["ranks"].append(index) else: - results[id_value][title] = [index] + results[id_value][title] = { + "ranks": [index], + "url": url, + "mobileUrl": mobile_url, + } except json.JSONDecodeError: - print(f"解析 {id_value} 的响应失败,不是有效的JSON") + print(f"解析 {id_value} 响应失败") failed_ids.append(id_value) except Exception as e: - print(f"处理 {id_value} 数据时出错: {e}") + print(f"处理 {id_value} 数据出错: {e}") failed_ids.append(id_value) else: failed_ids.append(id_value) - # 添加间隔时间,除非是最后一个请求 if i < len(ids_list) - 1: - # 添加一些随机性到间隔时间 actual_interval = request_interval + random.randint(-10, 20) - actual_interval = max(50, actual_interval) # 确保至少50毫秒 - print(f"等待 {actual_interval} 毫秒后发送下一个请求...") + actual_interval = max(50, actual_interval) time.sleep(actual_interval / 1000) - print(f"\n请求总结:") - print(f"- 成功获取数据的ID: {list(results.keys())}") - print(f"- 请求失败的ID: {failed_ids}") + print(f"成功: {list(results.keys())}, 失败: {failed_ids}") + return results, id_to_name, failed_ids - return results, id_to_alias, failed_ids +# === 数据处理 === +def save_titles_to_file(results: Dict, id_to_name: Dict, failed_ids: List) -> str: + """保存标题到文件""" + file_path = get_output_path("txt", f"{format_time_filename()}.txt") -class DataProcessor: - """数据处理相关功能""" + with open(file_path, "w", encoding="utf-8") as f: + for id_value, title_data in results.items(): + # id | name 或 id + name = id_to_name.get(id_value) + if name and name != id_value: + f.write(f"{id_value} | {name}\n") + else: + f.write(f"{id_value}\n") + + # 按排名排序标题 + sorted_titles = [] + for title, info in title_data.items(): + cleaned_title = clean_title(title) + if isinstance(info, dict): + ranks = info.get("ranks", []) + url = info.get("url", "") + mobile_url = info.get("mobileUrl", "") + else: + ranks = info if isinstance(info, list) else [] + url = "" + mobile_url = "" - @staticmethod - def save_titles_to_file(results: Dict, id_to_alias: Dict, failed_ids: List) -> str: - """将标题保存到文件,包括失败的请求信息""" - file_path = FileHelper.get_output_path( - "txt", f"{TimeHelper.format_time_filename()}.txt" - ) + rank = ranks[0] if ranks else 1 + sorted_titles.append((rank, cleaned_title, url, mobile_url)) - with open(file_path, "w", encoding="utf-8") as f: - # 先写入成功获取的数据 - for id_value, title_data in results.items(): - display_name = id_to_alias.get(id_value, id_value) - f.write(f"{display_name}\n") - for i, (title, ranks) 
in enumerate(title_data.items(), 1): - rank_str = ",".join(map(str, ranks)) - f.write(f"{i}. {title} (排名:{rank_str})\n") - f.write("\n") - - # 如果有失败的请求,写入失败信息 - if failed_ids: - f.write("==== 以下ID请求失败 ====\n") - for id_value in failed_ids: - display_name = id_to_alias.get(id_value, id_value) - f.write(f"{display_name} (ID: {id_value})\n") - - return file_path - - @staticmethod - def load_frequency_words( - frequency_file: str = "frequency_words.txt", - ) -> Tuple[List[List[str]], List[str]]: - """ - 加载频率词和过滤词,处理关联词 + sorted_titles.sort(key=lambda x: x[0]) - Returns: - (word_groups, filter_words)元组 - """ - if not os.path.exists(frequency_file): - print(f"频率词文件 {frequency_file} 不存在") - return [], [] + for rank, cleaned_title, url, mobile_url in sorted_titles: + line = f"{rank}. {cleaned_title}" - with open(frequency_file, "r", encoding="utf-8") as f: - content = f.read() + if url: + line += f" [URL:{url}]" + if mobile_url: + line += f" [MOBILE:{mobile_url}]" + f.write(line + "\n") - # 按双空行分割不同的词组 - word_groups = [ - group.strip() for group in content.split("\n\n") if group.strip() - ] + f.write("\n") - # 处理每个词组 - processed_groups = [] - filter_words = [] # 用于存储过滤词 + if failed_ids: + f.write("==== 以下ID请求失败 ====\n") + for id_value in failed_ids: + f.write(f"{id_value}\n") - for group in word_groups: - words = [word.strip() for word in group.split("\n") if word.strip()] + return file_path - # 分离频率词和过滤词 - group_frequency_words = [] - for word in words: - if word.startswith("!"): - # 去掉感叹号,添加到过滤词列表 - filter_words.append(word[1:]) - else: - # 正常的频率词 - group_frequency_words.append(word) +def load_frequency_words( + frequency_file: Optional[str] = None, +) -> Tuple[List[Dict], List[str]]: + """加载频率词配置""" + if frequency_file is None: + frequency_file = os.environ.get( + "FREQUENCY_WORDS_PATH", "config/frequency_words.txt" + ) - # 只有当词组中包含频率词时才添加到结果中 - if group_frequency_words: - processed_groups.append(group_frequency_words) + frequency_path = Path(frequency_file) + if not frequency_path.exists(): + raise FileNotFoundError(f"频率词文件 {frequency_file} 不存在") - return processed_groups, filter_words + with open(frequency_path, "r", encoding="utf-8") as f: + content = f.read() - @staticmethod - def read_all_today_titles() -> Tuple[Dict, Dict, Dict]: - """ - 读取当天所有txt文件的标题,并按来源合并,去除重复,记录时间和出现次数 + word_groups = [group.strip() for group in content.split("\n\n") if group.strip()] - Returns: - (all_results, id_to_alias, title_info)元组 - """ - date_folder = TimeHelper.format_date_folder() - txt_dir = os.path.join("output", date_folder, "txt") - - if not os.path.exists(txt_dir): - print(f"今日文件夹 {txt_dir} 不存在") - return {}, {}, {} - - all_results = {} # 所有源的所有标题 {source_id: {title: [ranks]}} - id_to_alias = {} # ID到别名的映射 - title_info = ( - {} - ) # 标题信息 {source_id: {title: {"first_time": 首次时间, "last_time": 最后时间, "count": 出现次数, "ranks": [排名列表]}}} - - # 读取所有txt文件,按时间排序确保早的时间优先处理 - files = sorted([f for f in os.listdir(txt_dir) if f.endswith(".txt")]) - - for file in files: - # 从文件名提取时间信息 (例如 "12时34分.txt") - time_info = file.replace(".txt", "") - - file_path = os.path.join(txt_dir, file) - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - - # 解析内容 - sections = content.split("\n\n") - for section in sections: - if not section.strip() or "==== 以下ID请求失败 ====" in section: - continue - - lines = section.strip().split("\n") - if len(lines) < 2: - continue - - # 第一行是来源名 - source_name = lines[0].strip() - - # 提取标题和排名 - title_ranks = {} - for line in lines[1:]: - if line.strip(): - try: - # 提取序号和正文部分 - 
match_num = None - title_part = line.strip() - - # 处理格式 "数字. 标题" - if ( - ". " in title_part - and title_part.split(". ")[0].isdigit() - ): - parts = title_part.split(". ", 1) - match_num = int(parts[0]) # 序号可能是排名 - title_part = parts[1] - - # 提取排名信息 "标题 (排名:1,2,3)" - ranks = [] - if " (排名:" in title_part: - title, rank_str = title_part.rsplit(" (排名:", 1) - rank_str = rank_str.rstrip(")") - ranks = [ - int(r) - for r in rank_str.split(",") - if r.strip() and r.isdigit() - ] - else: - title = title_part - - # 如果没找到排名但有序号,则使用序号 - if not ranks and match_num is not None: - ranks = [match_num] - - # 确保排名列表不为空 - if not ranks: - ranks = [99] # 默认排名 - - title_ranks[title] = ranks - - except Exception as e: - print(f"解析标题行出错: {line}, 错误: {e}") - - # 处理来源数据 - DataProcessor._process_source_data( - source_name, - title_ranks, - time_info, - all_results, - title_info, - id_to_alias, - ) + processed_groups = [] + filter_words = [] - # 将结果从 {source_name: {title: [ranks]}} 转换为 {source_id: {title: [ranks]}} - id_results = {} - id_title_info = {} - for name, titles in all_results.items(): - for id_value, alias in id_to_alias.items(): - if alias == name: - id_results[id_value] = titles - id_title_info[id_value] = title_info[name] - break - - return id_results, id_to_alias, id_title_info - - @staticmethod - def _process_source_data( - source_name: str, - title_ranks: Dict, - time_info: str, - all_results: Dict, - title_info: Dict, - id_to_alias: Dict, - ) -> None: - """处理来源数据,更新结果和标题信息""" - if source_name not in all_results: - # 首次遇到此来源 - all_results[source_name] = title_ranks - - # 初始化标题信息 - if source_name not in title_info: - title_info[source_name] = {} - - # 记录每个标题的时间、次数和排名 - for title, ranks in title_ranks.items(): - title_info[source_name][title] = { - "first_time": time_info, # 记录首次时间 - "last_time": time_info, # 最后时间初始同首次时间 - "count": 1, - "ranks": ranks, - } + for group in word_groups: + words = [word.strip() for word in group.split("\n") if word.strip()] - # 尝试反向生成ID - reversed_id = source_name.lower().replace(" ", "-") - id_to_alias[reversed_id] = source_name - else: - # 已有此来源,更新标题 - for title, ranks in title_ranks.items(): - if title not in all_results[source_name]: - all_results[source_name][title] = ranks - title_info[source_name][title] = { - "first_time": time_info, # 新标题的首次和最后时间都设为当前 - "last_time": time_info, - "count": 1, - "ranks": ranks, - } - else: - # 已存在的标题,更新最后时间,合并排名信息并增加计数 - existing_ranks = title_info[source_name][title]["ranks"] - merged_ranks = existing_ranks.copy() - for rank in ranks: - if rank not in merged_ranks: - merged_ranks.append(rank) - - title_info[source_name][title][ - "last_time" - ] = time_info # 更新最后时间 - title_info[source_name][title]["ranks"] = merged_ranks - title_info[source_name][title]["count"] += 1 - - -class StatisticsCalculator: - """统计计算相关功能""" - - @staticmethod - def count_word_frequency( - results: Dict, - word_groups: List[List[str]], - filter_words: List[str], - id_to_alias: Dict, - title_info: Optional[Dict] = None, - rank_threshold: int = CONFIG["RANK_THRESHOLD"], - ) -> Tuple[List[Dict], int]: - """ - 统计词频,处理关联词和大小写不敏感,每个标题只计入首个匹配词组,并应用过滤词 + group_required_words = [] + group_normal_words = [] + group_filter_words = [] - Returns: - (stats, total_titles)元组 - """ - word_stats = {} - total_titles = 0 - processed_titles = {} # 用于跟踪已处理标题 {source_id: {title: True}} + for word in words: + if word.startswith("!"): + filter_words.append(word[1:]) + group_filter_words.append(word[1:]) + elif word.startswith("+"): + group_required_words.append(word[1:]) + else: 
+ group_normal_words.append(word) - # 初始化title_info - if title_info is None: - title_info = {} + if group_required_words or group_normal_words: + if group_normal_words: + group_key = " ".join(group_normal_words) + else: + group_key = " ".join(group_required_words) - # 为每个词组创建统计对象 - for group in word_groups: - group_key = " ".join(group) - word_stats[group_key] = {"count": 0, "titles": {}} + processed_groups.append( + { + "required": group_required_words, + "normal": group_normal_words, + "group_key": group_key, + } + ) - # 遍历所有标题并统计 - for source_id, titles_data in results.items(): - total_titles += len(titles_data) + return processed_groups, filter_words - # 初始化该来源的处理记录 - if source_id not in processed_titles: - processed_titles[source_id] = {} - for title, source_ranks in titles_data.items(): - # 跳过已处理的标题 - if title in processed_titles.get(source_id, {}): - continue +def parse_file_titles(file_path: Path) -> Tuple[Dict, Dict]: + """解析单个txt文件的标题数据,返回(titles_by_id, id_to_name)""" + titles_by_id = {} + id_to_name = {} - title_lower = title.lower() # 转换为小写以实现大小写不敏感 + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + sections = content.split("\n\n") - # 检查是否包含任何过滤词 - contains_filter_word = any( - filter_word.lower() in title_lower for filter_word in filter_words - ) + for section in sections: + if not section.strip() or "==== 以下ID请求失败 ====" in section: + continue - # 如果包含过滤词,跳过这个标题 - if contains_filter_word: - continue - - # 按顺序检查每个词组 - for group in word_groups: - group_key = " ".join(group) - - # 检查是否有任何一个词在标题中 - matched = any(word.lower() in title_lower for word in group) - - # 如果匹配,增加计数并添加标题,然后标记为已处理 - if matched: - word_stats[group_key]["count"] += 1 - if source_id not in word_stats[group_key]["titles"]: - word_stats[group_key]["titles"][source_id] = [] - - # 获取标题信息 - first_time = "" - last_time = "" - count_info = 1 - ranks = source_ranks if source_ranks else [] - - if ( - title_info - and source_id in title_info - and title in title_info[source_id] - ): - info = title_info[source_id][title] - first_time = info.get("first_time", "") - last_time = info.get("last_time", "") - count_info = info.get("count", 1) - if "ranks" in info and info["ranks"]: - ranks = info["ranks"] - - # 添加带信息的标题 - word_stats[group_key]["titles"][source_id].append( - { - "title": title, - "first_time": first_time, - "last_time": last_time, - "count": count_info, - "ranks": ranks, - } - ) + lines = section.strip().split("\n") + if len(lines) < 2: + continue - # 标记该标题已处理,不再匹配其他词组 - if source_id not in processed_titles: - processed_titles[source_id] = {} - processed_titles[source_id][title] = True - break # 找到第一个匹配的词组后退出循环 - - # 转换统计结果 - stats = [] - for group_key, data in word_stats.items(): - titles_with_info = [] - for source_id, title_list in data["titles"].items(): - source_alias = id_to_alias.get(source_id, source_id) - for title_data in title_list: - title = title_data["title"] - first_time = title_data["first_time"] - last_time = title_data["last_time"] - count_info = title_data["count"] - ranks = title_data.get("ranks", []) + # id | name 或 id + header_line = lines[0].strip() + if " | " in header_line: + parts = header_line.split(" | ", 1) + source_id = parts[0].strip() + name = parts[1].strip() + id_to_name[source_id] = name + else: + source_id = header_line + id_to_name[source_id] = source_id - # 确保排名是有效的 - if not ranks: - ranks = [99] # 使用默认排名 + titles_by_id[source_id] = {} - # 格式化排名信息 - rank_display = StatisticsCalculator._format_rank_display( - ranks, rank_threshold - ) + for line in 
lines[1:]: + if line.strip(): + try: + title_part = line.strip() + rank = None - # 格式化时间信息 - time_display = StatisticsCalculator._format_time_display( - first_time, last_time - ) + # 提取排名 + if ". " in title_part and title_part.split(". ")[0].isdigit(): + rank_str, title_part = title_part.split(". ", 1) + rank = int(rank_str) - # 格式化标题信息 - formatted_title = f"[{source_alias}] {title}" - if rank_display: - formatted_title += f" {rank_display}" - if time_display: - formatted_title += f" - {time_display}" - if count_info > 1: - formatted_title += f" - {count_info}次" + # 提取 MOBILE URL + mobile_url = "" + if " [MOBILE:" in title_part: + title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1) + if mobile_part.endswith("]"): + mobile_url = mobile_part[:-1] - titles_with_info.append(formatted_title) + # 提取 URL + url = "" + if " [URL:" in title_part: + title_part, url_part = title_part.rsplit(" [URL:", 1) + if url_part.endswith("]"): + url = url_part[:-1] - stats.append( - { - "word": group_key, - "count": data["count"], - "titles": titles_with_info, - "percentage": ( - round(data["count"] / total_titles * 100, 2) - if total_titles > 0 - else 0 - ), - } - ) + title = clean_title(title_part.strip()) + ranks = [rank] if rank is not None else [1] - # 按出现次数从高到低排序 - stats.sort(key=lambda x: x["count"], reverse=True) + titles_by_id[source_id][title] = { + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } - return stats, total_titles + except Exception as e: + print(f"解析标题行出错: {line}, 错误: {e}") - @staticmethod - def _format_rank_display(ranks: List[int], rank_threshold: int) -> str: - """格式化排名显示""" - if not ranks: - return "" + return titles_by_id, id_to_name - # 排序排名并确保不重复 - unique_ranks = sorted(set(ranks)) - min_rank = unique_ranks[0] - max_rank = unique_ranks[-1] - # 根据最高排名判断使用哪种括号 - if min_rank <= rank_threshold: - # 使用【】 - if min_rank == max_rank: - return f"【{min_rank}】" - else: - return f"【{min_rank} - {max_rank}】" - else: - # 使用[] - if min_rank == max_rank: - return f"[{min_rank}]" - else: - return f"[{min_rank} - {max_rank}]" +def read_all_today_titles( + current_platform_ids: Optional[List[str]] = None, +) -> Tuple[Dict, Dict, Dict]: + """读取当天所有标题文件,支持按当前监控平台过滤""" + date_folder = format_date_folder() + txt_dir = Path("output") / date_folder / "txt" - @staticmethod - def _format_time_display(first_time: str, last_time: str) -> str: - """格式化时间显示,单次显示时间,多次显示时间范围""" - if not first_time: - return "" + if not txt_dir.exists(): + return {}, {}, {} - if first_time == last_time or not last_time: - # 只有一个时间点,直接显示 - return first_time - else: - # 有两个时间点,显示范围 - return f"[{first_time} ~ {last_time}]" + all_results = {} + final_id_to_name = {} + title_info = {} + files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) -class ReportGenerator: - """报告生成相关功能""" + for file_path in files: + time_info = file_path.stem - @staticmethod - def generate_html_report( - stats: List[Dict], - total_titles: int, - failed_ids: Optional[List] = None, - is_daily: bool = False, - ) -> str: - """ - 生成HTML报告,包括失败的请求信息 - Returns: - HTML文件路径 - """ - # 创建文件路径 - if is_daily: - filename = "当日统计.html" - else: - filename = f"{TimeHelper.format_time_filename()}.html" - file_path = FileHelper.get_output_path("html", filename) + titles_by_id, file_id_to_name = parse_file_titles(file_path) - # HTML模板和内容生成 - html_content = ReportGenerator._create_html_content( - stats, total_titles, failed_ids, is_daily - ) + if current_platform_ids is not None: + filtered_titles_by_id = {} + filtered_id_to_name = {} - # 写入文件 - with 
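The txt line format parsed here packs rank, title, and optional link markers into a single line. A worked walk-through with a made-up line (the URLs are placeholders):

```python
# Hypothetical line from output/<date>/txt/<HH时MM分>.txt; URLs are placeholders.
line = "3. 某热点标题 [URL:https://example.com/a] [MOBILE:https://m.example.com/a]"

title_part = line.strip()
rank = None
if ". " in title_part and title_part.split(". ")[0].isdigit():
    rank_str, title_part = title_part.split(". ", 1)  # leading "N. " is the rank
    rank = int(rank_str)

mobile_url = ""
if " [MOBILE:" in title_part:  # mobile link marker, if any
    title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1)
    if mobile_part.endswith("]"):
        mobile_url = mobile_part[:-1]

url = ""
if " [URL:" in title_part:  # desktop link marker, if any
    title_part, url_part = title_part.rsplit(" [URL:", 1)
    if url_part.endswith("]"):
        url = url_part[:-1]

# rank == 3, title_part == "某热点标题",
# url == "https://example.com/a", mobile_url == "https://m.example.com/a"
```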
open(file_path, "w", encoding="utf-8") as f: - f.write(html_content) + for source_id, title_data in titles_by_id.items(): + if source_id in current_platform_ids: + filtered_titles_by_id[source_id] = title_data + if source_id in file_id_to_name: + filtered_id_to_name[source_id] = file_id_to_name[source_id] - # 如果是当日统计,还需要在根目录下生成index.html - if is_daily: - root_file_path = "index.html" # 根目录下使用index.html作为文件名 - with open(root_file_path, "w", encoding="utf-8") as f: - f.write(html_content) - print( - f"当日统计报告已保存到根目录的index.html: {os.path.abspath(root_file_path)}" - ) + titles_by_id = filtered_titles_by_id + file_id_to_name = filtered_id_to_name - return file_path + final_id_to_name.update(file_id_to_name) - @staticmethod - def _create_html_content( - stats: List[Dict], - total_titles: int, - failed_ids: Optional[List] = None, - is_daily: bool = False, - ) -> str: - """创建HTML内容""" - # HTML头部 - html = """ - - - - - 频率词统计报告 - - - -

频率词统计报告

- """ + for source_id, title_data in titles_by_id.items(): + process_source_data( + source_id, title_data, time_info, all_results, title_info + ) - # 报告类型 - if is_daily: - html += "

报告类型: 当日汇总

" + return all_results, final_id_to_name, title_info - # 基本信息 - now = TimeHelper.get_beijing_time() - html += f"

总标题数: {total_titles}

" - html += f"

生成时间: {now.strftime('%Y-%m-%d %H:%M:%S')}

" - # 失败的请求信息 - if failed_ids and len(failed_ids) > 0: - html += """ -
-

请求失败的平台

-
    - """ - for id_value in failed_ids: - html += f"
  • {id_value}
" - html += """ -
-
- """ +def process_source_data( + source_id: str, + title_data: Dict, + time_info: str, + all_results: Dict, + title_info: Dict, +) -> None: + """处理来源数据,合并重复标题""" + if source_id not in all_results: + all_results[source_id] = title_data - # 表格头部 - html += """ - - - - - - - - - """ + if source_id not in title_info: + title_info[source_id] = {} - # 表格内容 - for i, stat in enumerate(stats, 1): - html += f""" - - - - - - - - """ - - # 表格结尾 - html += """ -
排名 | 频率词 | 出现次数 | 占比 | 相关标题
{i} | {stat['word']} | {stat['count']} | {stat['percentage']}% | {"<br/>".join(stat['titles'])}
- - - """ + for title, data in title_data.items(): + ranks = data.get("ranks", []) + url = data.get("url", "") + mobile_url = data.get("mobileUrl", "") + + title_info[source_id][title] = { + "first_time": time_info, + "last_time": time_info, + "count": 1, + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } + else: + for title, data in title_data.items(): + ranks = data.get("ranks", []) + url = data.get("url", "") + mobile_url = data.get("mobileUrl", "") + + if title not in all_results[source_id]: + all_results[source_id][title] = { + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } + title_info[source_id][title] = { + "first_time": time_info, + "last_time": time_info, + "count": 1, + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } + else: + existing_data = all_results[source_id][title] + existing_ranks = existing_data.get("ranks", []) + existing_url = existing_data.get("url", "") + existing_mobile_url = existing_data.get("mobileUrl", "") + + merged_ranks = existing_ranks.copy() + for rank in ranks: + if rank not in merged_ranks: + merged_ranks.append(rank) + + all_results[source_id][title] = { + "ranks": merged_ranks, + "url": existing_url or url, + "mobileUrl": existing_mobile_url or mobile_url, + } - return html + title_info[source_id][title]["last_time"] = time_info + title_info[source_id][title]["ranks"] = merged_ranks + title_info[source_id][title]["count"] += 1 + if not title_info[source_id][title].get("url"): + title_info[source_id][title]["url"] = url + if not title_info[source_id][title].get("mobileUrl"): + title_info[source_id][title]["mobileUrl"] = mobile_url + + +def detect_latest_new_titles(current_platform_ids: Optional[List[str]] = None) -> Dict: + """检测当日最新批次的新增标题,支持按当前监控平台过滤""" + date_folder = format_date_folder() + txt_dir = Path("output") / date_folder / "txt" + + if not txt_dir.exists(): + return {} + + files = sorted([f for f in txt_dir.iterdir() if f.suffix == ".txt"]) + if len(files) < 2: + return {} + + # 解析最新文件 + latest_file = files[-1] + latest_titles, _ = parse_file_titles(latest_file) + + # 如果指定了当前平台列表,过滤最新文件数据 + if current_platform_ids is not None: + filtered_latest_titles = {} + for source_id, title_data in latest_titles.items(): + if source_id in current_platform_ids: + filtered_latest_titles[source_id] = title_data + latest_titles = filtered_latest_titles + + # 汇总历史标题(按平台过滤) + historical_titles = {} + for file_path in files[:-1]: + historical_data, _ = parse_file_titles(file_path) + + # 过滤历史数据 + if current_platform_ids is not None: + filtered_historical_data = {} + for source_id, title_data in historical_data.items(): + if source_id in current_platform_ids: + filtered_historical_data[source_id] = title_data + historical_data = filtered_historical_data + + for source_id, titles_data in historical_data.items(): + if source_id not in historical_titles: + historical_titles[source_id] = set() + for title in titles_data.keys(): + historical_titles[source_id].add(title) + + # 找出新增标题 + new_titles = {} + for source_id, latest_source_titles in latest_titles.items(): + historical_set = historical_titles.get(source_id, set()) + source_new_titles = {} + + for title, title_data in latest_source_titles.items(): + if title not in historical_set: + source_new_titles[title] = title_data + + if source_new_titles: + new_titles[source_id] = source_new_titles + + return new_titles + + +# === 统计和分析 === +def calculate_news_weight( + title_data: Dict, rank_threshold: int = CONFIG["RANK_THRESHOLD"] +) -> float: + """计算新闻权重,用于排序""" + ranks = 
title_data.get("ranks", []) + if not ranks: + return 0.0 + + count = title_data.get("count", len(ranks)) + weight_config = CONFIG["WEIGHT_CONFIG"] + + # 排名权重:Σ(11 - min(rank, 10)) / 出现次数 + rank_scores = [] + for rank in ranks: + score = 11 - min(rank, 10) + rank_scores.append(score) + + rank_weight = sum(rank_scores) / len(ranks) if ranks else 0 + + # 频次权重:min(出现次数, 10) × 10 + frequency_weight = min(count, 10) * 10 + + # 热度加成:高排名次数 / 总出现次数 × 100 + high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold) + hotness_ratio = high_rank_count / len(ranks) if ranks else 0 + hotness_weight = hotness_ratio * 100 + + total_weight = ( + rank_weight * weight_config["RANK_WEIGHT"] + + frequency_weight * weight_config["FREQUENCY_WEIGHT"] + + hotness_weight * weight_config["HOTNESS_WEIGHT"] + ) - @staticmethod - def send_to_feishu( - stats: List[Dict], - failed_ids: Optional[List] = None, - report_type: str = "单次爬取", - ) -> bool: - """ - 将频率词统计结果发送到飞书 + return total_weight - Returns: - 成功发送返回True,否则返回False - """ - # 获取webhook URL,优先使用环境变量,其次使用配置中的URL - webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", CONFIG["FEISHU_WEBHOOK_URL"]) - # 检查webhook URL是否有效 - if not webhook_url: - print(f"警告: FEISHU_WEBHOOK_URL未设置或无效,跳过发送飞书通知") - return False +def matches_word_groups( + title: str, word_groups: List[Dict], filter_words: List[str] +) -> bool: + """检查标题是否匹配词组规则""" + # 如果没有配置词组,则匹配所有标题(支持显示全部新闻) + if not word_groups: + return True - headers = {"Content-Type": "application/json"} + title_lower = title.lower() - # 获取总标题数 - total_titles = sum(len(stat["titles"]) for stat in stats if stat["count"] > 0) + # 过滤词检查 + if any(filter_word.lower() in title_lower for filter_word in filter_words): + return False - # 构建文本内容 - text_content = ReportGenerator._build_feishu_content(stats, failed_ids) + # 词组匹配检查 + for group in word_groups: + required_words = group["required"] + normal_words = group["normal"] - # 构造消息体 - now = TimeHelper.get_beijing_time() - payload = { - "msg_type": "text", - "content": { - "total_titles": total_titles, - "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"), - "report_type": report_type, - "text": text_content, - }, - } + # 必须词检查 + if required_words: + all_required_present = all( + req_word.lower() in title_lower for req_word in required_words + ) + if not all_required_present: + continue + + # 普通词检查 + if normal_words: + any_normal_present = any( + normal_word.lower() in title_lower for normal_word in normal_words + ) + if not any_normal_present: + continue + + return True + + return False + + +def format_time_display(first_time: str, last_time: str) -> str: + """格式化时间显示""" + if not first_time: + return "" + if first_time == last_time or not last_time: + return first_time + else: + return f"[{first_time} ~ {last_time}]" + + +def format_rank_display(ranks: List[int], rank_threshold: int, format_type: str) -> str: + """统一的排名格式化方法""" + if not ranks: + return "" + + unique_ranks = sorted(set(ranks)) + min_rank = unique_ranks[0] + max_rank = unique_ranks[-1] + + if format_type == "html": + highlight_start = "" + highlight_end = "" + elif format_type == "feishu": + highlight_start = "**" + highlight_end = "**" + elif format_type == "dingtalk": + highlight_start = "**" + highlight_end = "**" + elif format_type == "wework": + highlight_start = "**" + highlight_end = "**" + elif format_type == "telegram": + highlight_start = "" + highlight_end = "" + else: + highlight_start = "**" + highlight_end = "**" + + if min_rank <= rank_threshold: + if min_rank == max_rank: + return 
f"{highlight_start}[{min_rank}]{highlight_end}" + else: + return f"{highlight_start}[{min_rank} - {max_rank}]{highlight_end}" + else: + if min_rank == max_rank: + return f"[{min_rank}]" + else: + return f"[{min_rank} - {max_rank}]" + + +def count_word_frequency( + results: Dict, + word_groups: List[Dict], + filter_words: List[str], + id_to_name: Dict, + title_info: Optional[Dict] = None, + rank_threshold: int = CONFIG["RANK_THRESHOLD"], + new_titles: Optional[Dict] = None, + mode: str = "daily", +) -> Tuple[List[Dict], int]: + """统计词频,支持必须词、频率词、过滤词,并标记新增标题""" + + # 如果没有配置词组,创建一个包含所有新闻的虚拟词组 + if not word_groups: + print("频率词配置为空,将显示所有新闻") + word_groups = [{"required": [], "normal": [], "group_key": "全部新闻"}] + filter_words = [] # 清空过滤词,显示所有新闻 + + is_first_today = is_first_crawl_today() + + # 确定处理的数据源和新增标记逻辑 + if mode == "incremental": + if is_first_today: + # 增量模式 + 当天第一次:处理所有新闻,都标记为新增 + results_to_process = results + all_news_are_new = True + else: + # 增量模式 + 当天非第一次:只处理新增的新闻 + results_to_process = new_titles if new_titles else {} + all_news_are_new = True + elif mode == "current": + # current 模式:只处理当前时间批次的新闻,但统计信息来自全部历史 + if title_info: + latest_time = None + for source_titles in title_info.values(): + for title_data in source_titles.values(): + last_time = title_data.get("last_time", "") + if last_time: + if latest_time is None or last_time > latest_time: + latest_time = last_time + + # 只处理 last_time 等于最新时间的新闻 + if latest_time: + results_to_process = {} + for source_id, source_titles in results.items(): + if source_id in title_info: + filtered_titles = {} + for title, title_data in source_titles.items(): + if title in title_info[source_id]: + info = title_info[source_id][title] + if info.get("last_time") == latest_time: + filtered_titles[title] = title_data + if filtered_titles: + results_to_process[source_id] = filtered_titles - # 发送请求 - try: - response = requests.post(webhook_url, headers=headers, json=payload) - if response.status_code == 200: - print(f"数据发送到飞书成功 [{report_type}]") - return True - else: print( - f"发送到飞书失败 [{report_type}],状态码:{response.status_code},响应:{response.text}" + f"当前榜单模式:最新时间 {latest_time},筛选出 {sum(len(titles) for titles in results_to_process.values())} 条当前榜单新闻" ) - return False - except Exception as e: - print(f"发送到飞书时出错 [{report_type}]:{e}") - return False + else: + results_to_process = results + else: + results_to_process = results + all_news_are_new = False + else: + # 当日汇总模式:处理所有新闻 + results_to_process = results + all_news_are_new = False + total_input_news = sum(len(titles) for titles in results.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词过滤" + ) + print(f"当日汇总模式:处理 {total_input_news} 条新闻,模式:{filter_status}") - @staticmethod - def _build_feishu_content( - stats: List[Dict], failed_ids: Optional[List] = None - ) -> str: - """构建飞书消息内容""" - text_content = "" + word_stats = {} + total_titles = 0 + processed_titles = {} + matched_new_count = 0 - # 添加频率词统计信息 - filtered_stats = [stat for stat in stats if stat["count"] > 0] - for i, stat in enumerate(filtered_stats): - word = stat["word"] - count = stat["count"] + if title_info is None: + title_info = {} + if new_titles is None: + new_titles = {} - text_content += f"【{word}】 : {count} 条\n" + for group in word_groups: + group_key = group["group_key"] + word_stats[group_key] = {"count": 0, "titles": {}} - # 添加相关标题 - for j, title in enumerate(stat["titles"], 1): - text_content += f"{j}. 
{title}\n" + for source_id, titles_data in results_to_process.items(): + total_titles += len(titles_data) - # 添加分割线 - if i < len(filtered_stats) - 1: - text_content += f"\n{CONFIG['FEISHU_SEPARATOR']}\n\n" + if source_id not in processed_titles: + processed_titles[source_id] = {} - if not text_content: - text_content = "无匹配频率词\n\n" + for title, title_data in titles_data.items(): + if title in processed_titles.get(source_id, {}): + continue - # 添加失败平台信息 - if failed_ids and len(failed_ids) > 0: - if text_content and text_content != "无匹配频率词\n\n": - text_content += f"\n{CONFIG['FEISHU_SEPARATOR']}\n\n" + # 使用统一的匹配逻辑 + matches_frequency_words = matches_word_groups( + title, word_groups, filter_words + ) - text_content += "失败平台:\n" - for i, id_value in enumerate(failed_ids, 1): - text_content += f"{i}. {id_value}\n" + if not matches_frequency_words: + continue + + # 如果是增量模式或 current 模式第一次,统计匹配的新增新闻数量 + if (mode == "incremental" and all_news_are_new) or ( + mode == "current" and is_first_today + ): + matched_new_count += 1 + + source_ranks = title_data.get("ranks", []) + source_url = title_data.get("url", "") + source_mobile_url = title_data.get("mobileUrl", "") + + # 找到匹配的词组 + title_lower = title.lower() + for group in word_groups: + required_words = group["required"] + normal_words = group["normal"] + + # 如果是"全部新闻"模式,所有标题都匹配第一个(唯一的)词组 + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻": + group_key = group["group_key"] + word_stats[group_key]["count"] += 1 + if source_id not in word_stats[group_key]["titles"]: + word_stats[group_key]["titles"][source_id] = [] + else: + # 原有的匹配逻辑 + if required_words: + all_required_present = all( + req_word.lower() in title_lower + for req_word in required_words + ) + if not all_required_present: + continue - return text_content + if normal_words: + any_normal_present = any( + normal_word.lower() in title_lower + for normal_word in normal_words + ) + if not any_normal_present: + continue + + group_key = group["group_key"] + word_stats[group_key]["count"] += 1 + if source_id not in word_stats[group_key]["titles"]: + word_stats[group_key]["titles"][source_id] = [] + + first_time = "" + last_time = "" + count_info = 1 + ranks = source_ranks if source_ranks else [] + url = source_url + mobile_url = source_mobile_url + + # 对于 current 模式,从历史统计信息中获取完整数据 + if ( + mode == "current" + and title_info + and source_id in title_info + and title in title_info[source_id] + ): + info = title_info[source_id][title] + first_time = info.get("first_time", "") + last_time = info.get("last_time", "") + count_info = info.get("count", 1) + if "ranks" in info and info["ranks"]: + ranks = info["ranks"] + url = info.get("url", source_url) + mobile_url = info.get("mobileUrl", source_mobile_url) + elif ( + title_info + and source_id in title_info + and title in title_info[source_id] + ): + info = title_info[source_id][title] + first_time = info.get("first_time", "") + last_time = info.get("last_time", "") + count_info = info.get("count", 1) + if "ranks" in info and info["ranks"]: + ranks = info["ranks"] + url = info.get("url", source_url) + mobile_url = info.get("mobileUrl", source_mobile_url) + + if not ranks: + ranks = [99] + + time_display = format_time_display(first_time, last_time) + + source_name = id_to_name.get(source_id, source_id) + + # 判断是否为新增 + is_new = False + if all_news_are_new: + # 增量模式下所有处理的新闻都是新增,或者当天第一次的所有新闻都是新增 + is_new = True + elif new_titles and source_id in new_titles: + # 检查是否在新增列表中 + new_titles_for_source = new_titles[source_id] + is_new = title in 
new_titles_for_source + + word_stats[group_key]["titles"][source_id].append( + { + "title": title, + "source_name": source_name, + "first_time": first_time, + "last_time": last_time, + "time_display": time_display, + "count": count_info, + "ranks": ranks, + "rank_threshold": rank_threshold, + "url": url, + "mobileUrl": mobile_url, + "is_new": is_new, + } + ) + if source_id not in processed_titles: + processed_titles[source_id] = {} + processed_titles[source_id][title] = True -class NewsAnalyzer: - """新闻分析主类""" + break - def __init__( - self, - request_interval: int = CONFIG["REQUEST_INTERVAL"], - feishu_report_type: str = CONFIG["FEISHU_REPORT_TYPE"], - rank_threshold: int = CONFIG["RANK_THRESHOLD"], - ): - """ - 初始化新闻分析器 + # 最后统一打印汇总信息 + if mode == "incremental": + if is_first_today: + total_input_news = sum(len(titles) for titles in results.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"增量模式:当天第一次爬取,{total_input_news} 条新闻中有 {matched_new_count} 条{filter_status}" + ) + else: + if new_titles: + total_new_count = sum(len(titles) for titles in new_titles.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 + and word_groups[0]["group_key"] == "全部新闻" + else "匹配频率词" + ) + print( + f"增量模式:{total_new_count} 条新增新闻中,有 {matched_new_count} 条{filter_status}" + ) + if matched_new_count == 0 and len(word_groups) > 1: + print("增量模式:没有新增新闻匹配频率词,将不会发送通知") + else: + print("增量模式:未检测到新增新闻") + elif mode == "current": + total_input_news = sum(len(titles) for titles in results_to_process.values()) + if is_first_today: + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"当前榜单模式:当天第一次爬取,{total_input_news} 条当前榜单新闻中有 {matched_new_count} 条{filter_status}" + ) + else: + matched_count = sum(stat["count"] for stat in word_stats.values()) + filter_status = ( + "全部显示" + if len(word_groups) == 1 and word_groups[0]["group_key"] == "全部新闻" + else "频率词匹配" + ) + print( + f"当前榜单模式:{total_input_news} 条当前榜单新闻中有 {matched_count} 条{filter_status}" + ) - Args: - request_interval: 请求间隔(毫秒) - feishu_report_type: 飞书报告类型,可选值: "current"(当前爬取), "daily"(当日汇总), "both"(两者都发送) - rank_threshold: 排名显示阈值 - """ - self.request_interval = request_interval - self.feishu_report_type = feishu_report_type - self.rank_threshold = rank_threshold + stats = [] + for group_key, data in word_stats.items(): + all_titles = [] + for source_id, title_list in data["titles"].items(): + all_titles.extend(title_list) + + # 按权重排序 + sorted_titles = sorted( + all_titles, + key=lambda x: ( + -calculate_news_weight(x, rank_threshold), + min(x["ranks"]) if x["ranks"] else 999, + -x["count"], + ), + ) - # 判断是否在GitHub Actions环境中 - self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true" + stats.append( + { + "word": group_key, + "count": data["count"], + "titles": sorted_titles, + "percentage": ( + round(data["count"] / total_titles * 100, 2) + if total_titles > 0 + else 0 + ), + } + ) - # 设置代理 - self.proxy_url = None - if not self.is_github_actions and CONFIG["USE_PROXY"]: - # 本地环境且启用代理时使用代理 - self.proxy_url = CONFIG["DEFAULT_PROXY"] - print("本地环境,使用代理") - elif not self.is_github_actions and not CONFIG["USE_PROXY"]: - print("本地环境,未启用代理") - else: - print("GitHub Actions环境,不使用代理") + stats.sort(key=lambda x: x["count"], reverse=True) + return stats, total_titles + + +# === 报告生成 === +def prepare_report_data( + stats: List[Dict], + failed_ids: Optional[List] = None, + new_titles: 
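The returned stats list has one entry per word group, with titles pre-sorted by weight. An illustrative entry (values invented; some per-title fields elided):

```python
# Illustrative shape of one element of the returned stats list (values invented).
stat_example = {
    "word": "华为",       # the group_key
    "count": 12,          # titles matched by this group
    "percentage": 4.62,   # count / total_titles * 100, rounded to 2 places
    "titles": [           # sorted by calculate_news_weight, best first
        {
            "title": "...", "source_name": "微博",
            "ranks": [1, 3], "count": 4, "rank_threshold": 10,
            "time_display": "[08时30分 ~ 12时00分]",
            "url": "", "mobileUrl": "", "is_new": False,
            # first_time / last_time fields elided
        },
    ],
}
```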
Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", +) -> Dict: + """准备报告数据""" + processed_new_titles = [] + + # 在增量模式下隐藏新增新闻区域 + hide_new_section = mode == "incremental" + + # 只有在非隐藏模式下才处理新增新闻部分 + if not hide_new_section: + filtered_new_titles = {} + if new_titles and id_to_name: + word_groups, filter_words = load_frequency_words() + for source_id, titles_data in new_titles.items(): + filtered_titles = {} + for title, title_data in titles_data.items(): + if matches_word_groups(title, word_groups, filter_words): + filtered_titles[title] = title_data + if filtered_titles: + filtered_new_titles[source_id] = filtered_titles + + if filtered_new_titles and id_to_name: + for source_id, titles_data in filtered_new_titles.items(): + source_name = id_to_name.get(source_id, source_id) + source_titles = [] + + for title, title_data in titles_data.items(): + url = title_data.get("url", "") + mobile_url = title_data.get("mobileUrl", "") + ranks = title_data.get("ranks", []) - # 初始化数据获取器 - self.data_fetcher = DataFetcher(self.proxy_url) + processed_title = { + "title": title, + "source_name": source_name, + "time_display": "", + "count": 1, + "ranks": ranks, + "rank_threshold": CONFIG["RANK_THRESHOLD"], + "url": url, + "mobile_url": mobile_url, + "is_new": True, + } + source_titles.append(processed_title) + + if source_titles: + processed_new_titles.append( + { + "source_id": source_id, + "source_name": source_name, + "titles": source_titles, + } + ) - def generate_daily_summary(self) -> Optional[str]: - """ - 生成当日统计报告 + processed_stats = [] + for stat in stats: + if stat["count"] <= 0: + continue + + processed_titles = [] + for title_data in stat["titles"]: + processed_title = { + "title": title_data["title"], + "source_name": title_data["source_name"], + "time_display": title_data["time_display"], + "count": title_data["count"], + "ranks": title_data["ranks"], + "rank_threshold": title_data["rank_threshold"], + "url": title_data.get("url", ""), + "mobile_url": title_data.get("mobileUrl", ""), + "is_new": title_data.get("is_new", False), + } + processed_titles.append(processed_title) + + processed_stats.append( + { + "word": stat["word"], + "count": stat["count"], + "percentage": stat.get("percentage", 0), + "titles": processed_titles, + } + ) - Returns: - HTML文件路径,如果生成失败则返回None - """ - print("开始生成当日统计报告...") + return { + "stats": processed_stats, + "new_titles": processed_new_titles, + "failed_ids": failed_ids or [], + "total_new_count": sum( + len(source["titles"]) for source in processed_new_titles + ), + } + + +def format_title_for_platform( + platform: str, title_data: Dict, show_source: bool = True +) -> str: + """统一的标题格式化方法""" + rank_display = format_rank_display( + title_data["ranks"], title_data["rank_threshold"], platform + ) - # 读取当天所有标题 - all_results, id_to_alias, title_info = DataProcessor.read_all_today_titles() + link_url = title_data["mobile_url"] or title_data["url"] - if not all_results: - print("没有找到当天的数据") - return None + cleaned_title = clean_title(title_data["title"]) - # 计算标题总数 - total_titles = sum(len(titles) for titles in all_results.values()) - print(f"读取到 {total_titles} 个标题") + if platform == "feishu": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title - # 加载频率词和过滤词 - word_groups, filter_words = DataProcessor.load_frequency_words() + title_prefix = "🆕 " if title_data.get("is_new") else "" - # 统计词频 - stats, total_titles = StatisticsCalculator.count_word_frequency( - all_results, - 
word_groups, - filter_words, - id_to_alias, - title_info, - self.rank_threshold, - ) + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "dingtalk": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "wework": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "telegram": + if link_url: + formatted_title = f'{html_escape(cleaned_title)}' + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + return result + + elif platform == "ntfy": + if link_url: + formatted_title = f"[{cleaned_title}]({link_url})" + else: + formatted_title = cleaned_title + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{formatted_title}" + else: + result = f"{title_prefix}{formatted_title}" + + if rank_display: + result += f" {rank_display}" + if title_data["time_display"]: + result += f" `- {title_data['time_display']}`" + if title_data["count"] > 1: + result += f" `({title_data['count']}次)`" + + return result + + elif platform == "html": + rank_display = format_rank_display( + title_data["ranks"], title_data["rank_threshold"], "html" + ) + + link_url = title_data["mobile_url"] or title_data["url"] + + escaped_title = html_escape(cleaned_title) + escaped_source_name = html_escape(title_data["source_name"]) + + if link_url: + escaped_url = html_escape(link_url) + formatted_title = f'[{escaped_source_name}] {escaped_title}' + else: + formatted_title = ( + f'[{escaped_source_name}] {escaped_title}' + ) + + if rank_display: + formatted_title += f" {rank_display}" + if title_data["time_display"]: + escaped_time = html_escape(title_data["time_display"]) + formatted_title += f" - {escaped_time}" + if title_data["count"] > 1: + formatted_title += f" ({title_data['count']}次)" + + if title_data.get("is_new"): + 
formatted_title = f"
🆕 {formatted_title}
" + + return formatted_title + + else: + return cleaned_title + + +def format_title_for_wechat_compatible(title_data: Dict, show_source: bool = True) -> str: + """微信兼容格式(纯文本,去除 Markdown)""" + cleaned_title = clean_title(title_data["title"]) + + # 排名显示(纯文本,不使用粗体) + ranks = title_data.get("ranks", []) + rank_threshold = title_data.get("rank_threshold", CONFIG["RANK_THRESHOLD"]) + rank_text = "" + + if ranks: + min_rank = min(ranks) + max_rank = max(ranks) + if min_rank == max_rank: + rank_text = f"[{min_rank}]" + else: + rank_text = f"[{min_rank}-{max_rank}]" + + title_prefix = "🆕 " if title_data.get("is_new") else "" + + if show_source: + result = f"[{title_data['source_name']}] {title_prefix}{cleaned_title}" + else: + result = f"{title_prefix}{cleaned_title}" + + if rank_text: + result += f" {rank_text}" + if title_data["time_display"]: + result += f" - {title_data['time_display']}" + if title_data["count"] > 1: + result += f" ({title_data['count']}次)" + + # 如果有链接,单独一行显示,方便微信用户点击 + link_url = title_data.get("mobile_url") or title_data.get("url", "") + if link_url: + result += f"\n🔗 {link_url}" + + return result + + +def generate_html_report( + stats: List[Dict], + total_titles: int, + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + mode: str = "daily", + is_daily_summary: bool = False, + update_info: Optional[Dict] = None, +) -> str: + """生成HTML报告""" + if is_daily_summary: + if mode == "current": + filename = "当前榜单汇总.html" + elif mode == "incremental": + filename = "当日增量.html" + else: + filename = "当日汇总.html" + else: + filename = f"{format_time_filename()}.html" + + file_path = get_output_path("html", filename) + + report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode) + + html_content = render_html_content( + report_data, total_titles, is_daily_summary, mode, update_info + ) + + with open(file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + if is_daily_summary: + root_file_path = Path("index.html") + with open(root_file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + return file_path + + +def render_html_content( + report_data: Dict, + total_titles: int, + is_daily_summary: bool = False, + mode: str = "daily", + update_info: Optional[Dict] = None, +) -> str: + """渲染HTML内容""" + html = """ + + + + + + 热点新闻分析 + + + + +
+
+
+ + +
+
热点新闻分析
+
+
+ 报告类型 + """ + + # 处理报告类型显示 + if is_daily_summary: + if mode == "current": + html += "当前榜单" + elif mode == "incremental": + html += "增量模式" + else: + html += "当日汇总" + else: + html += "实时分析" + + html += """ +
+
+ 新闻总数 + """ + + html += f"{total_titles} 条" + + # 计算筛选后的热点新闻数量 + hot_news_count = sum(len(stat["titles"]) for stat in report_data["stats"]) + + html += """ +
+
+ 热点新闻 + """ + + html += f"{hot_news_count} 条" + + html += """ +
+
+ 生成时间 + """ + + now = get_beijing_time() + html += now.strftime("%m-%d %H:%M") + + html += """ +
+
+
+ +
""" + + # 处理失败ID错误信息 + if report_data["failed_ids"]: + html += """ +
+
⚠️ 请求失败的平台
+
    """ + for id_value in report_data["failed_ids"]: + html += f'
  • {html_escape(id_value)}
' + html += """
+
""" + + # 处理主要统计数据 + if report_data["stats"]: + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"], 1): + count = stat["count"] + + # 确定热度等级 + if count >= 10: + count_class = "hot" + elif count >= 5: + count_class = "warm" + else: + count_class = "" + + escaped_word = html_escape(stat["word"]) + + html += f""" +
+
+
+
{escaped_word}
+
{count} 条
+
+
{i}/{total_count}
+
""" + + # 处理每个词组下的新闻标题,给每条新闻标上序号 + for j, title_data in enumerate(stat["titles"], 1): + is_new = title_data.get("is_new", False) + new_class = "new" if is_new else "" + + html += f""" +
+
{j}
+
+
+ {html_escape(title_data["source_name"])}""" + + # 处理排名显示 + ranks = title_data.get("ranks", []) + if ranks: + min_rank = min(ranks) + max_rank = max(ranks) + rank_threshold = title_data.get("rank_threshold", 10) + + # 确定排名等级 + if min_rank <= 3: + rank_class = "top" + elif min_rank <= rank_threshold: + rank_class = "high" + else: + rank_class = "" + + if min_rank == max_rank: + rank_text = str(min_rank) + else: + rank_text = f"{min_rank}-{max_rank}" + + html += f'{rank_text}' + + # 处理时间显示 + time_display = title_data.get("time_display", "") + if time_display: + # 简化时间显示格式,将波浪线替换为~ + simplified_time = ( + time_display.replace(" ~ ", "~") + .replace("[", "") + .replace("]", "") + ) + html += ( + f'{html_escape(simplified_time)}' + ) + + # 处理出现次数 + count_info = title_data.get("count", 1) + if count_info > 1: + html += f'{count_info}次' + + html += """ +
+
""" + + # 处理标题和链接 + escaped_title = html_escape(title_data["title"]) + link_url = title_data.get("mobile_url") or title_data.get("url", "") + + if link_url: + escaped_url = html_escape(link_url) + html += f'{escaped_title}' + else: + html += escaped_title + + html += """ +
+
+
""" + + html += """ +
""" + + # 处理新增新闻区域 + if report_data["new_titles"]: + html += f""" +
+
本次新增热点 (共 {report_data['total_new_count']} 条)
""" + + for source_data in report_data["new_titles"]: + escaped_source = html_escape(source_data["source_name"]) + titles_count = len(source_data["titles"]) + + html += f""" +
+
{escaped_source} · {titles_count}条
""" + + # 为新增新闻也添加序号 + for idx, title_data in enumerate(source_data["titles"], 1): + ranks = title_data.get("ranks", []) + + # 处理新增新闻的排名显示 + rank_class = "" + if ranks: + min_rank = min(ranks) + if min_rank <= 3: + rank_class = "top" + elif min_rank <= title_data.get("rank_threshold", 10): + rank_class = "high" + + if len(ranks) == 1: + rank_text = str(ranks[0]) + else: + rank_text = f"{min(ranks)}-{max(ranks)}" + else: + rank_text = "?" + + html += f""" +
+
{idx}
+
{rank_text}
+
+
""" + + # 处理新增新闻的链接 + escaped_title = html_escape(title_data["title"]) + link_url = title_data.get("mobile_url") or title_data.get("url", "") + + if link_url: + escaped_url = html_escape(link_url) + html += f'{escaped_title}' + else: + html += escaped_title + + html += """ +
+
+
""" + + html += """ +
""" + + html += """ +
""" + + html += """ +
+ + +
+ + + + + """ + + return html + + +def render_feishu_content( + report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily" +) -> str: + """渲染飞书内容""" + text_content = "" + + if report_data["stats"]: + text_content += f"📊 **热点词汇统计**\n\n" + + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + + sequence_display = f"[{i + 1}/{total_count}]" + + if count >= 10: + text_content += f"🔥 {sequence_display} **{word}** : {count} 条\n\n" + elif count >= 5: + text_content += f"📈 {sequence_display} **{word}** : {count} 条\n\n" + else: + text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" + + for j, title_data in enumerate(stat["titles"], 1): + formatted_title = format_title_for_platform( + "feishu", title_data, show_source=True + ) + text_content += f" {j}. {formatted_title}\n" + + if j < len(stat["titles"]): + text_content += "\n" + + if i < len(report_data["stats"]) - 1: + text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" + + if not text_content: + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == "current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + text_content = f"📭 {mode_text}\n\n" + + if report_data["new_titles"]: + if text_content and "暂无匹配" not in text_content: + text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" + + text_content += ( + f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + ) + + for source_data in report_data["new_titles"]: + text_content += ( + f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n" + ) + + for j, title_data in enumerate(source_data["titles"], 1): + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + text_content += f" {j}. 
{formatted_title}\n" + + text_content += "\n" + + if report_data["failed_ids"]: + if text_content and "暂无匹配" not in text_content: + text_content += f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" + + text_content += "⚠️ **数据获取失败的平台:**\n\n" + for i, id_value in enumerate(report_data["failed_ids"], 1): + text_content += f" • {id_value}\n" + + now = get_beijing_time() + text_content += ( + f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + ) + + if update_info: + text_content += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + + return text_content + + +def render_dingtalk_content( + report_data: Dict, update_info: Optional[Dict] = None, mode: str = "daily" +) -> str: + """渲染钉钉内容""" + text_content = "" + + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_beijing_time() + + text_content += f"**总新闻数:** {total_titles}\n\n" + text_content += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n" + text_content += f"**类型:** 热点分析报告\n\n" + + text_content += "---\n\n" + + if report_data["stats"]: + text_content += f"📊 **热点词汇统计**\n\n" + + total_count = len(report_data["stats"]) + + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + + sequence_display = f"[{i + 1}/{total_count}]" + + if count >= 10: + text_content += f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + elif count >= 5: + text_content += f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + else: + text_content += f"📌 {sequence_display} **{word}** : {count} 条\n\n" + + for j, title_data in enumerate(stat["titles"], 1): + formatted_title = format_title_for_platform( + "dingtalk", title_data, show_source=True + ) + text_content += f" {j}. {formatted_title}\n" + + if j < len(stat["titles"]): + text_content += "\n" + + if i < len(report_data["stats"]) - 1: + text_content += f"\n---\n\n" + + if not report_data["stats"]: + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == "current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + text_content += f"📭 {mode_text}\n\n" + + if report_data["new_titles"]: + if text_content and "暂无匹配" not in text_content: + text_content += f"\n---\n\n" + + text_content += ( + f"🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + ) + + for source_data in report_data["new_titles"]: + text_content += f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + + for j, title_data in enumerate(source_data["titles"], 1): + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + text_content += f" {j}. 
{formatted_title}\n" + + text_content += "\n" + + if report_data["failed_ids"]: + if text_content and "暂无匹配" not in text_content: + text_content += f"\n---\n\n" + + text_content += "⚠️ **数据获取失败的平台:**\n\n" + for i, id_value in enumerate(report_data["failed_ids"], 1): + text_content += f" • **{id_value}**\n" + + text_content += f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + + if update_info: + text_content += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + + return text_content + + +def split_content_into_batches( + report_data: Dict, + format_type: str, + update_info: Optional[Dict] = None, + max_bytes: int = None, + mode: str = "daily", +) -> List[str]: + """分批处理消息内容,确保词组标题+至少第一条新闻的完整性""" + if max_bytes is None: + if format_type == "dingtalk": + max_bytes = CONFIG.get("DINGTALK_BATCH_SIZE", 20000) + elif format_type == "feishu": + max_bytes = CONFIG.get("FEISHU_BATCH_SIZE", 29000) + elif format_type == "ntfy": + max_bytes = 3800 + else: + max_bytes = CONFIG.get("MESSAGE_BATCH_SIZE", 4000) + + batches = [] + + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_beijing_time() + + base_header = "" + if format_type == "wework": + base_header = f"**总新闻数:** {total_titles}\n\n\n\n" + elif format_type == "telegram": + base_header = f"总新闻数: {total_titles}\n\n" + elif format_type == "ntfy": + base_header = f"**总新闻数:** {total_titles}\n\n" + elif format_type == "feishu": + base_header = "" + elif format_type == "dingtalk": + base_header = f"**总新闻数:** {total_titles}\n\n" + base_header += f"**时间:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n" + base_header += f"**类型:** 热点分析报告\n\n" + base_header += "---\n\n" + + base_footer = "" + if format_type == "wework": + base_footer = f"\n\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + elif format_type == "telegram": + base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + elif format_type == "ntfy": + base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + elif format_type == "feishu": + base_footer = f"\n\n更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\nTrendRadar 发现新版本 {update_info['remote_version']},当前 {update_info['current_version']}" + elif format_type == "dingtalk": + base_footer = f"\n\n> 更新时间:{now.strftime('%Y-%m-%d %H:%M:%S')}" + if update_info: + base_footer += f"\n> TrendRadar 发现新版本 **{update_info['remote_version']}**,当前 **{update_info['current_version']}**" + + stats_header = "" + if report_data["stats"]: + if format_type == "wework": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "telegram": + stats_header = f"📊 热点词汇统计\n\n" + elif format_type == "ntfy": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "feishu": + stats_header = f"📊 **热点词汇统计**\n\n" + elif format_type == "dingtalk": + stats_header = f"📊 **热点词汇统计**\n\n" + + current_batch = base_header + current_batch_has_content = False + + if ( + not report_data["stats"] + and not report_data["new_titles"] + and not report_data["failed_ids"] + ): + if mode == "incremental": + mode_text = "增量模式下暂无新增匹配的热点词汇" + elif mode == 
"current": + mode_text = "当前榜单模式下暂无匹配的热点词汇" + else: + mode_text = "暂无匹配的热点词汇" + simple_content = f"📭 {mode_text}\n\n" + final_content = base_header + simple_content + base_footer + batches.append(final_content) + return batches + + # 处理热点词汇统计 + if report_data["stats"]: + total_count = len(report_data["stats"]) + + # 添加统计标题 + test_content = current_batch + stats_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + < max_bytes + ): + current_batch = test_content + current_batch_has_content = True + else: + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + current_batch_has_content = True + + # 逐个处理词组(确保词组标题+第一条新闻的原子性) + for i, stat in enumerate(report_data["stats"]): + word = stat["word"] + count = stat["count"] + sequence_display = f"[{i + 1}/{total_count}]" + + # 构建词组标题 + word_header = "" + if format_type == "wework": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "telegram": + if count >= 10: + word_header = f"🔥 {sequence_display} {word} : {count} 条\n\n" + elif count >= 5: + word_header = f"📈 {sequence_display} {word} : {count} 条\n\n" + else: + word_header = f"📌 {sequence_display} {word} : {count} 条\n\n" + elif format_type == "ntfy": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "feishu": + if count >= 10: + word_header = f"🔥 {sequence_display} **{word}** : {count} 条\n\n" + elif count >= 5: + word_header = f"📈 {sequence_display} **{word}** : {count} 条\n\n" + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + elif format_type == "dingtalk": + if count >= 10: + word_header = ( + f"🔥 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + elif count >= 5: + word_header = ( + f"📈 {sequence_display} **{word}** : **{count}** 条\n\n" + ) + else: + word_header = f"📌 {sequence_display} **{word}** : {count} 条\n\n" + + # 构建第一条新闻 + first_news_line = "" + if stat["titles"]: + first_title_data = stat["titles"][0] + if format_type == "wework": + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + formatted_title = format_title_for_wechat_compatible( + first_title_data, show_source=True + ) + else: + formatted_title = format_title_for_platform( + "wework", first_title_data, show_source=True + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", first_title_data, show_source=True + ) + elif format_type == "ntfy": + formatted_title = format_title_for_platform( + "ntfy", first_title_data, show_source=True + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", first_title_data, show_source=True + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", first_title_data, show_source=True + ) + else: + formatted_title = f"{first_title_data['title']}" + + first_news_line = f" 1. 
{formatted_title}\n" + if len(stat["titles"]) > 1: + first_news_line += "\n" + + # 原子性检查:词组标题+第一条新闻必须一起处理 + word_with_first_news = word_header + first_news_line + test_content = current_batch + word_with_first_news + + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + # 当前批次容纳不下,开启新批次 + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + word_with_first_news + current_batch_has_content = True + start_index = 1 + else: + current_batch = test_content + current_batch_has_content = True + start_index = 1 + + # 处理剩余新闻条目 + for j in range(start_index, len(stat["titles"])): + title_data = stat["titles"][j] + if format_type == "wework": + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + formatted_title = format_title_for_wechat_compatible( + title_data, show_source=True + ) + else: + formatted_title = format_title_for_platform( + "wework", title_data, show_source=True + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data, show_source=True + ) + elif format_type == "ntfy": + formatted_title = format_title_for_platform( + "ntfy", title_data, show_source=True + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data, show_source=True + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data, show_source=True + ) + else: + formatted_title = f"{title_data['title']}" + + news_line = f" {j + 1}. {formatted_title}\n" + if j < len(stat["titles"]) - 1: + news_line += "\n" + + test_content = current_batch + news_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + stats_header + word_header + news_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 词组间分隔符 + if i < len(report_data["stats"]) - 1: + separator = "" + if format_type == "wework": + separator = f"\n\n\n\n" + elif format_type == "telegram": + separator = f"\n\n" + elif format_type == "ntfy": + separator = f"\n\n" + elif format_type == "feishu": + separator = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n" + elif format_type == "dingtalk": + separator = f"\n---\n\n" + + test_content = current_batch + separator + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + < max_bytes + ): + current_batch = test_content + + # 处理新增新闻(同样确保来源标题+第一条新闻的原子性) + if report_data["new_titles"]: + new_header = "" + if format_type == "wework": + new_header = f"\n\n\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "telegram": + new_header = ( + f"\n\n🆕 本次新增热点新闻 (共 {report_data['total_new_count']} 条)\n\n" + ) + elif format_type == "ntfy": + new_header = f"\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "feishu": + new_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + elif format_type == "dingtalk": + new_header = f"\n---\n\n🆕 **本次新增热点新闻** (共 {report_data['total_new_count']} 条)\n\n" + + test_content = current_batch + new_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = 
base_header + new_header + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 逐个处理新增新闻来源 + for source_data in report_data["new_titles"]: + source_header = "" + if format_type == "wework": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "telegram": + source_header = f"{source_data['source_name']} ({len(source_data['titles'])} 条):\n\n" + elif format_type == "ntfy": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "feishu": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + elif format_type == "dingtalk": + source_header = f"**{source_data['source_name']}** ({len(source_data['titles'])} 条):\n\n" + + # 构建第一条新增新闻 + first_news_line = "" + if source_data["titles"]: + first_title_data = source_data["titles"][0] + title_data_copy = first_title_data.copy() + title_data_copy["is_new"] = False + + if format_type == "wework": + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + formatted_title = format_title_for_wechat_compatible( + title_data_copy, show_source=False + ) + else: + formatted_title = format_title_for_platform( + "wework", title_data_copy, show_source=False + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data_copy, show_source=False + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + else: + formatted_title = f"{title_data_copy['title']}" + + first_news_line = f" 1. {formatted_title}\n" + + # 原子性检查:来源标题+第一条新闻 + source_with_first_news = source_header + first_news_line + test_content = current_batch + source_with_first_news + + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + new_header + source_with_first_news + current_batch_has_content = True + start_index = 1 + else: + current_batch = test_content + current_batch_has_content = True + start_index = 1 + + # 处理剩余新增新闻 + for j in range(start_index, len(source_data["titles"])): + title_data = source_data["titles"][j] + title_data_copy = title_data.copy() + title_data_copy["is_new"] = False + + if format_type == "wework": + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + formatted_title = format_title_for_wechat_compatible( + title_data_copy, show_source=False + ) + else: + formatted_title = format_title_for_platform( + "wework", title_data_copy, show_source=False + ) + elif format_type == "telegram": + formatted_title = format_title_for_platform( + "telegram", title_data_copy, show_source=False + ) + elif format_type == "feishu": + formatted_title = format_title_for_platform( + "feishu", title_data_copy, show_source=False + ) + elif format_type == "dingtalk": + formatted_title = format_title_for_platform( + "dingtalk", title_data_copy, show_source=False + ) + else: + formatted_title = f"{title_data_copy['title']}" + + news_line = f" {j + 1}. 
{formatted_title}\n" + + test_content = current_batch + news_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + new_header + source_header + news_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + current_batch += "\n" + + if report_data["failed_ids"]: + failed_header = "" + if format_type == "wework": + failed_header = f"\n\n\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "telegram": + failed_header = f"\n\n⚠️ 数据获取失败的平台:\n\n" + elif format_type == "ntfy": + failed_header = f"\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "feishu": + failed_header = f"\n{CONFIG['FEISHU_MESSAGE_SEPARATOR']}\n\n⚠️ **数据获取失败的平台:**\n\n" + elif format_type == "dingtalk": + failed_header = f"\n---\n\n⚠️ **数据获取失败的平台:**\n\n" + + test_content = current_batch + failed_header + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + failed_header + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + for i, id_value in enumerate(report_data["failed_ids"], 1): + if format_type == "feishu": + failed_line = f" • {id_value}\n" + elif format_type == "dingtalk": + failed_line = f" • **{id_value}**\n" + else: + failed_line = f" • {id_value}\n" + + test_content = current_batch + failed_line + if ( + len(test_content.encode("utf-8")) + len(base_footer.encode("utf-8")) + >= max_bytes + ): + if current_batch_has_content: + batches.append(current_batch + base_footer) + current_batch = base_header + failed_header + failed_line + current_batch_has_content = True + else: + current_batch = test_content + current_batch_has_content = True + + # 完成最后批次 + if current_batch_has_content: + batches.append(current_batch + base_footer) + + return batches + + +def send_to_notifications( + stats: List[Dict], + failed_ids: Optional[List] = None, + report_type: str = "当日汇总", + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", + html_file_path: Optional[str] = None, +) -> Dict[str, bool]: + """发送数据到多个通知平台""" + results = {} + + if CONFIG["PUSH_WINDOW"]["ENABLED"]: + push_manager = PushRecordManager() + time_range_start = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["START"] + time_range_end = CONFIG["PUSH_WINDOW"]["TIME_RANGE"]["END"] + + if not push_manager.is_in_time_range(time_range_start, time_range_end): + now = get_beijing_time() + print( + f"推送窗口控制:当前时间 {now.strftime('%H:%M')} 不在推送时间窗口 {time_range_start}-{time_range_end} 内,跳过推送" + ) + return results + + if CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"]: + if push_manager.has_pushed_today(): + print(f"推送窗口控制:今天已推送过,跳过本次推送") + return results + else: + print(f"推送窗口控制:今天首次推送") + + report_data = prepare_report_data(stats, failed_ids, new_titles, id_to_name, mode) + + feishu_url = CONFIG["FEISHU_WEBHOOK_URL"] + dingtalk_url = CONFIG["DINGTALK_WEBHOOK_URL"] + wework_url = CONFIG["WEWORK_WEBHOOK_URL"] + telegram_token = CONFIG["TELEGRAM_BOT_TOKEN"] + telegram_chat_id = CONFIG["TELEGRAM_CHAT_ID"] + email_from = CONFIG["EMAIL_FROM"] + email_password = CONFIG["EMAIL_PASSWORD"] + email_to = CONFIG["EMAIL_TO"] + email_smtp_server = CONFIG.get("EMAIL_SMTP_SERVER", "") + 
email_smtp_port = CONFIG.get("EMAIL_SMTP_PORT", "") + ntfy_server_url = CONFIG["NTFY_SERVER_URL"] + ntfy_topic = CONFIG["NTFY_TOPIC"] + ntfy_token = CONFIG.get("NTFY_TOKEN", "") + + update_info_to_send = update_info if CONFIG["SHOW_VERSION_UPDATE"] else None + + # 发送到飞书 + if feishu_url: + results["feishu"] = send_to_feishu( + feishu_url, report_data, report_type, update_info_to_send, proxy_url, mode + ) + + # 发送到钉钉 + if dingtalk_url: + results["dingtalk"] = send_to_dingtalk( + dingtalk_url, report_data, report_type, update_info_to_send, proxy_url, mode + ) + + # 发送到企业微信 + if wework_url: + print("report_data", report_data) + results["wework"] = send_to_wework( + wework_url, report_data, report_type, update_info_to_send, proxy_url, mode + ) + + # 发送到 Telegram + if telegram_token and telegram_chat_id: + results["telegram"] = send_to_telegram( + telegram_token, + telegram_chat_id, + report_data, + report_type, + update_info_to_send, + proxy_url, + mode, + ) + + # 发送到 ntfy + if ntfy_server_url and ntfy_topic: + results["ntfy"] = send_to_ntfy( + ntfy_server_url, + ntfy_topic, + ntfy_token, + report_data, + report_type, + update_info_to_send, + proxy_url, + mode, + ) + + # 发送邮件 + if email_from and email_password and email_to: + results["email"] = send_to_email( + email_from, + email_password, + email_to, + report_type, + html_file_path, + email_smtp_server, + email_smtp_port, + ) + + if not results: + print("未配置任何通知渠道,跳过通知发送") + + # 如果成功发送了任何通知,且启用了每天只推一次,则记录推送 + if ( + CONFIG["PUSH_WINDOW"]["ENABLED"] + and CONFIG["PUSH_WINDOW"]["ONCE_PER_DAY"] + and any(results.values()) + ): + push_manager = PushRecordManager() + push_manager.record_push(report_type) + + return results + + +def send_to_feishu( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", +) -> bool: + """发送到飞书(支持分批发送)""" + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容,使用飞书专用的批次大小 + batches = split_content_into_batches( + report_data, + "feishu", + update_info, + max_bytes=CONFIG.get("FEISHU_BATCH_SIZE", 29000), + mode=mode, + ) + + print(f"飞书消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + batch_size = len(batch_content.encode("utf-8")) + print( + f"发送飞书第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" + ) + + # 添加批次标识 + if len(batches) > 1: + batch_header = f"**[第 {i}/{len(batches)} 批次]**\n\n" + # 将批次标识插入到适当位置(在统计标题之后) + if "📊 **热点词汇统计**" in batch_content: + batch_content = batch_content.replace( + "📊 **热点词汇统计**\n\n", f"📊 **热点词汇统计** {batch_header}" + ) + else: + # 如果没有统计标题,直接在开头添加 + batch_content = batch_header + batch_content + + total_titles = sum( + len(stat["titles"]) for stat in report_data["stats"] if stat["count"] > 0 + ) + now = get_beijing_time() + + payload = { + "msg_type": "text", + "content": { + "total_titles": total_titles, + "timestamp": now.strftime("%Y-%m-%d %H:%M:%S"), + "report_type": report_type, + "text": batch_content, + }, + } + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + # 检查飞书的响应状态 + if result.get("StatusCode") == 0 or result.get("code") == 0: + print(f"飞书第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) + else: + error_msg = result.get("msg") 
or result.get("StatusMessage", "未知错误") + print( + f"飞书第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{error_msg}" + ) + return False + else: + print( + f"飞书第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"飞书第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False + + print(f"飞书所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_dingtalk( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", +) -> bool: + """发送到钉钉(支持分批发送)""" + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容,使用钉钉专用的批次大小 + batches = split_content_into_batches( + report_data, + "dingtalk", + update_info, + max_bytes=CONFIG.get("DINGTALK_BATCH_SIZE", 20000), + mode=mode, + ) + + print(f"钉钉消息分为 {len(batches)} 批次发送 [{report_type}]") - # 生成HTML报告 - html_file = ReportGenerator.generate_html_report( - stats, total_titles, is_daily=True + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + batch_size = len(batch_content.encode("utf-8")) + print( + f"发送钉钉第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" ) - print(f"当日HTML统计报告已生成: {html_file}") - # 根据配置决定是否发送当日汇总到飞书 - if self.feishu_report_type in ["daily", "both"]: - ReportGenerator.send_to_feishu(stats, [], "当日汇总") + # 添加批次标识 + if len(batches) > 1: + batch_header = f"**[第 {i}/{len(batches)} 批次]**\n\n" + # 将批次标识插入到适当位置(在标题之后) + if "📊 **热点词汇统计**" in batch_content: + batch_content = batch_content.replace( + "📊 **热点词汇统计**\n\n", f"📊 **热点词汇统计** {batch_header}\n\n" + ) + else: + # 如果没有统计标题,直接在开头添加 + batch_content = batch_header + batch_content - return html_file + payload = { + "msgtype": "markdown", + "markdown": { + "title": f"TrendRadar 热点分析报告 - {report_type}", + "text": batch_content, + }, + } - def run(self) -> None: - """执行新闻分析流程""" - # 输出当前时间信息 - now = TimeHelper.get_beijing_time() - print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}") + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 + ) + if response.status_code == 200: + result = response.json() + if result.get("errcode") == 0: + print(f"钉钉第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) + else: + print( + f"钉钉第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" + ) + return False + else: + print( + f"钉钉第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"钉钉第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False - # 检查FEISHU_WEBHOOK_URL是否存在 - webhook_url = os.environ.get("FEISHU_WEBHOOK_URL", CONFIG["FEISHU_WEBHOOK_URL"]) - if not webhook_url and not CONFIG["CONTINUE_WITHOUT_FEISHU"]: - print( - "错误: FEISHU_WEBHOOK_URL未设置或无效,且CONTINUE_WITHOUT_FEISHU为False,程序退出" + print(f"钉钉所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_wework( + webhook_url: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", +) -> bool: + """发送到企业微信(支持分批发送)""" + headers = {"Content-Type": "application/json"} + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容 + batches = split_content_into_batches(report_data, "wework", update_info, mode=mode) + + 
print(f"企业微信消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + batch_size = len(batch_content.encode("utf-8")) + print( + f"发送企业微信第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" + ) + + # 添加批次标识 + if len(batches) > 1: + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + # 微信兼容模式:使用纯文本 + batch_header = f"[第 {i}/{len(batches)} 批次]\n\n" + else: + # 正常模式:使用 Markdown 粗体 + batch_header = f"**[第 {i}/{len(batches)} 批次]**\n\n" + batch_content = batch_header + batch_content + + # 根据微信兼容模式选择消息类型 + if CONFIG.get("WEWORK_WECHAT_COMPATIBLE", False): + payload = {"msgtype": "text", "text": {"content": batch_content}} + else: + payload = {"msgtype": "markdown", "markdown": {"content": batch_content}} + + try: + response = requests.post( + webhook_url, headers=headers, json=payload, proxies=proxies, timeout=30 ) - return + if response.status_code == 200: + result = response.json() + if result.get("errcode") == 0: + print(f"企业微信第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) + else: + print( + f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('errmsg')}" + ) + return False + else: + print( + f"企业微信第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"企业微信第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False - if not webhook_url: - print( - "警告: FEISHU_WEBHOOK_URL未设置或无效,将继续执行爬虫但不发送飞书通知" + print(f"企业微信所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_telegram( + bot_token: str, + chat_id: str, + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", +) -> bool: + """发送到Telegram(支持分批发送)""" + headers = {"Content-Type": "application/json"} + url = f"https://api.telegram.org/bot{bot_token}/sendMessage" + + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容 + batches = split_content_into_batches( + report_data, "telegram", update_info, mode=mode + ) + + print(f"Telegram消息分为 {len(batches)} 批次发送 [{report_type}]") + + # 逐批发送 + for i, batch_content in enumerate(batches, 1): + batch_size = len(batch_content.encode("utf-8")) + print( + f"发送Telegram第 {i}/{len(batches)} 批次,大小:{batch_size} 字节 [{report_type}]" + ) + + # 添加批次标识 + if len(batches) > 1: + batch_header = f"[第 {i}/{len(batches)} 批次]\n\n" + batch_content = batch_header + batch_content + + payload = { + "chat_id": chat_id, + "text": batch_content, + "parse_mode": "HTML", + "disable_web_page_preview": True, + } + + try: + response = requests.post( + url, headers=headers, json=payload, proxies=proxies, timeout=30 ) + if response.status_code == 200: + result = response.json() + if result.get("ok"): + print(f"Telegram第 {i}/{len(batches)} 批次发送成功 [{report_type}]") + # 批次间间隔 + if i < len(batches): + time.sleep(CONFIG["BATCH_SEND_INTERVAL"]) + else: + print( + f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],错误:{result.get('description')}" + ) + return False + else: + print( + f"Telegram第 {i}/{len(batches)} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + return False + except Exception as e: + print(f"Telegram第 {i}/{len(batches)} 批次发送出错 [{report_type}]:{e}") + return False - print(f"飞书报告类型: {self.feishu_report_type}") - print(f"排名阈值: {self.rank_threshold}") - - # 要爬取的网站ID列表 - ids = [ - ("toutiao", "今日头条"), - ("baidu", "百度热搜"), - ("wallstreetcn-hot", "华尔街见闻"), - ("thepaper", 
"澎湃新闻"), - ("bilibili-hot-search", "bilibili 热搜"), - ("cls-hot", "财联社热门"), - "tieba", - "weibo", - "douyin", - "zhihu", - ] - - print(f"开始爬取数据,请求间隔设置为 {self.request_interval} 毫秒") - - # 确保output目录存在 - FileHelper.ensure_directory_exists("output") - - # 爬取数据 - results, id_to_alias, failed_ids = self.data_fetcher.crawl_websites( - ids, self.request_interval + print(f"Telegram所有 {len(batches)} 批次发送完成 [{report_type}]") + return True + + +def send_to_email( + from_email: str, + password: str, + to_email: str, + report_type: str, + html_file_path: str, + custom_smtp_server: Optional[str] = None, + custom_smtp_port: Optional[int] = None, +) -> bool: + """发送邮件通知""" + try: + if not html_file_path or not Path(html_file_path).exists(): + print(f"错误:HTML文件不存在或未提供: {html_file_path}") + return False + + print(f"使用HTML文件: {html_file_path}") + with open(html_file_path, "r", encoding="utf-8") as f: + html_content = f.read() + + domain = from_email.split("@")[-1].lower() + + if custom_smtp_server and custom_smtp_port: + # 使用自定义 SMTP 配置 + smtp_server = custom_smtp_server + smtp_port = int(custom_smtp_port) + # 根据端口判断加密方式:465=SSL, 587=TLS + if smtp_port == 465: + use_tls = False # SSL 模式(SMTP_SSL) + elif smtp_port == 587: + use_tls = True # TLS 模式(STARTTLS) + else: + # 其他端口优先尝试 TLS(更安全,更广泛支持) + use_tls = True + elif domain in SMTP_CONFIGS: + # 使用预设配置 + config = SMTP_CONFIGS[domain] + smtp_server = config["server"] + smtp_port = config["port"] + use_tls = config["encryption"] == "TLS" + else: + print(f"未识别的邮箱服务商: {domain},使用通用 SMTP 配置") + smtp_server = f"smtp.{domain}" + smtp_port = 587 + use_tls = True + + msg = MIMEMultipart("alternative") + + # 严格按照 RFC 标准设置 From header + sender_name = "TrendRadar" + msg["From"] = formataddr((sender_name, from_email)) + + # 设置收件人 + recipients = [addr.strip() for addr in to_email.split(",")] + if len(recipients) == 1: + msg["To"] = recipients[0] + else: + msg["To"] = ", ".join(recipients) + + # 设置邮件主题 + now = get_beijing_time() + subject = f"TrendRadar 热点分析报告 - {report_type} - {now.strftime('%m月%d日 %H:%M')}" + msg["Subject"] = Header(subject, "utf-8") + + # 设置其他标准 header + msg["MIME-Version"] = "1.0" + msg["Date"] = formatdate(localtime=True) + msg["Message-ID"] = make_msgid() + + # 添加纯文本部分(作为备选) + text_content = f""" +TrendRadar 热点分析报告 +======================== +报告类型:{report_type} +生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} + +请使用支持HTML的邮件客户端查看完整报告内容。 + """ + text_part = MIMEText(text_content, "plain", "utf-8") + msg.attach(text_part) + + html_part = MIMEText(html_content, "html", "utf-8") + msg.attach(html_part) + + print(f"正在发送邮件到 {to_email}...") + print(f"SMTP 服务器: {smtp_server}:{smtp_port}") + print(f"发件人: {from_email}") + + try: + if use_tls: + # TLS 模式 + server = smtplib.SMTP(smtp_server, smtp_port, timeout=30) + server.set_debuglevel(0) # 设为1可以查看详细调试信息 + server.ehlo() + server.starttls() + server.ehlo() + else: + # SSL 模式 + server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=30) + server.set_debuglevel(0) + server.ehlo() + + # 登录 + server.login(from_email, password) + + # 发送邮件 + server.send_message(msg) + server.quit() + + print(f"邮件发送成功 [{report_type}] -> {to_email}") + return True + + except smtplib.SMTPServerDisconnected: + print(f"邮件发送失败:服务器意外断开连接,请检查网络或稍后重试") + return False + + except smtplib.SMTPAuthenticationError as e: + print(f"邮件发送失败:认证错误,请检查邮箱和密码/授权码") + print(f"详细错误: {str(e)}") + return False + except smtplib.SMTPRecipientsRefused as e: + print(f"邮件发送失败:收件人地址被拒绝 {e}") + return False + except smtplib.SMTPSenderRefused as e: + print(f"邮件发送失败:发件人地址被拒绝 
{e}") + return False + except smtplib.SMTPDataError as e: + print(f"邮件发送失败:邮件数据错误 {e}") + return False + except smtplib.SMTPConnectError as e: + print(f"邮件发送失败:无法连接到 SMTP 服务器 {smtp_server}:{smtp_port}") + print(f"详细错误: {str(e)}") + return False + except Exception as e: + print(f"邮件发送失败 [{report_type}]:{e}") + import traceback + + traceback.print_exc() + return False + + +def send_to_ntfy( + server_url: str, + topic: str, + token: Optional[str], + report_data: Dict, + report_type: str, + update_info: Optional[Dict] = None, + proxy_url: Optional[str] = None, + mode: str = "daily", +) -> bool: + """发送到ntfy(支持分批发送,严格遵守4KB限制)""" + # 避免 HTTP header 编码问题 + report_type_en_map = { + "当日汇总": "Daily Summary", + "当前榜单汇总": "Current Ranking", + "增量更新": "Incremental Update", + "实时增量": "Realtime Incremental", + "实时当前榜单": "Realtime Current Ranking", + } + report_type_en = report_type_en_map.get(report_type, "News Report") + + headers = { + "Content-Type": "text/plain; charset=utf-8", + "Markdown": "yes", + "Title": report_type_en, + "Priority": "default", + "Tags": "news", + } + + if token: + headers["Authorization"] = f"Bearer {token}" + + # 构建完整URL,确保格式正确 + base_url = server_url.rstrip("/") + if not base_url.startswith(("http://", "https://")): + base_url = f"https://{base_url}" + url = f"{base_url}/{topic}" + + proxies = None + if proxy_url: + proxies = {"http": proxy_url, "https": proxy_url} + + # 获取分批内容,使用ntfy专用的4KB限制 + batches = split_content_into_batches( + report_data, "ntfy", update_info, max_bytes=3800, mode=mode + ) + + total_batches = len(batches) + print(f"ntfy消息分为 {total_batches} 批次发送 [{report_type}]") + + # 反转批次顺序,使得在ntfy客户端显示时顺序正确 + # ntfy显示最新消息在上面,所以我们从最后一批开始推送 + reversed_batches = list(reversed(batches)) + + print(f"ntfy将按反向顺序推送(最后批次先推送),确保客户端显示顺序正确") + + # 逐批发送(反向顺序) + success_count = 0 + for idx, batch_content in enumerate(reversed_batches, 1): + # 计算正确的批次编号(用户视角的编号) + actual_batch_num = total_batches - idx + 1 + + batch_size = len(batch_content.encode("utf-8")) + print( + f"发送ntfy第 {actual_batch_num}/{total_batches} 批次(推送顺序: {idx}/{total_batches}),大小:{batch_size} 字节 [{report_type}]" ) - # 保存标题到文件 - title_file = DataProcessor.save_titles_to_file(results, id_to_alias, failed_ids) - print(f"标题已保存到: {title_file}") + # 检查消息大小,确保不超过4KB + if batch_size > 4096: + print(f"警告:ntfy第 {actual_batch_num} 批次消息过大({batch_size} 字节),可能被拒绝") + + # 添加批次标识(使用正确的批次编号) + current_headers = headers.copy() + if total_batches > 1: + batch_header = f"**[第 {actual_batch_num}/{total_batches} 批次]**\n\n" + batch_content = batch_header + batch_content + current_headers["Title"] = ( + f"{report_type_en} ({actual_batch_num}/{total_batches})" + ) + + try: + response = requests.post( + url, + headers=current_headers, + data=batch_content.encode("utf-8"), + proxies=proxies, + timeout=30, + ) + + if response.status_code == 200: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次发送成功 [{report_type}]") + success_count += 1 + if idx < total_batches: + # 公共服务器建议 2-3 秒,自托管可以更短 + interval = 2 if "ntfy.sh" in server_url else 1 + time.sleep(interval) + elif response.status_code == 429: + print( + f"ntfy第 {actual_batch_num}/{total_batches} 批次速率限制 [{report_type}],等待后重试" + ) + time.sleep(10) # 等待10秒后重试 + # 重试一次 + retry_response = requests.post( + url, + headers=current_headers, + data=batch_content.encode("utf-8"), + proxies=proxies, + timeout=30, + ) + if retry_response.status_code == 200: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次重试成功 [{report_type}]") + success_count += 1 + else: + print( + f"ntfy第 
{actual_batch_num}/{total_batches} 批次重试失败,状态码:{retry_response.status_code}" + ) + elif response.status_code == 413: + print( + f"ntfy第 {actual_batch_num}/{total_batches} 批次消息过大被拒绝 [{report_type}],消息大小:{batch_size} 字节" + ) + else: + print( + f"ntfy第 {actual_batch_num}/{total_batches} 批次发送失败 [{report_type}],状态码:{response.status_code}" + ) + try: + print(f"错误详情:{response.text}") + except: + pass + + except requests.exceptions.ConnectTimeout: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次连接超时 [{report_type}]") + except requests.exceptions.ReadTimeout: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次读取超时 [{report_type}]") + except requests.exceptions.ConnectionError as e: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次连接错误 [{report_type}]:{e}") + except Exception as e: + print(f"ntfy第 {actual_batch_num}/{total_batches} 批次发送异常 [{report_type}]:{e}") + + # 判断整体发送是否成功 + if success_count == total_batches: + print(f"ntfy所有 {total_batches} 批次发送完成 [{report_type}]") + return True + elif success_count > 0: + print(f"ntfy部分发送成功:{success_count}/{total_batches} 批次 [{report_type}]") + return True # 部分成功也视为成功 + else: + print(f"ntfy发送完全失败 [{report_type}]") + return False + + +# === 主分析器 === +class NewsAnalyzer: + """新闻分析器""" + + # 模式策略定义 + MODE_STRATEGIES = { + "incremental": { + "mode_name": "增量模式", + "description": "增量模式(只关注新增新闻,无新增时不推送)", + "realtime_report_type": "实时增量", + "summary_report_type": "当日汇总", + "should_send_realtime": True, + "should_generate_summary": True, + "summary_mode": "daily", + }, + "current": { + "mode_name": "当前榜单模式", + "description": "当前榜单模式(当前榜单匹配新闻 + 新增新闻区域 + 按时推送)", + "realtime_report_type": "实时当前榜单", + "summary_report_type": "当前榜单汇总", + "should_send_realtime": True, + "should_generate_summary": True, + "summary_mode": "current", + }, + "daily": { + "mode_name": "当日汇总模式", + "description": "当日汇总模式(所有匹配新闻 + 新增新闻区域 + 按时推送)", + "realtime_report_type": "", + "summary_report_type": "当日汇总", + "should_send_realtime": False, + "should_generate_summary": True, + "summary_mode": "daily", + }, + } + + def __init__(self): + self.request_interval = CONFIG["REQUEST_INTERVAL"] + self.report_mode = CONFIG["REPORT_MODE"] + self.rank_threshold = CONFIG["RANK_THRESHOLD"] + self.is_github_actions = os.environ.get("GITHUB_ACTIONS") == "true" + self.is_docker_container = self._detect_docker_environment() + self.update_info = None + self.proxy_url = None + self._setup_proxy() + self.data_fetcher = DataFetcher(self.proxy_url) + + if self.is_github_actions: + self._check_version_update() + + def _detect_docker_environment(self) -> bool: + """检测是否运行在 Docker 容器中""" + try: + if os.environ.get("DOCKER_CONTAINER") == "true": + return True + + if os.path.exists("/.dockerenv"): + return True + + return False + except Exception: + return False + + def _should_open_browser(self) -> bool: + """判断是否应该打开浏览器""" + return not self.is_github_actions and not self.is_docker_container + + def _setup_proxy(self) -> None: + """设置代理配置""" + if not self.is_github_actions and CONFIG["USE_PROXY"]: + self.proxy_url = CONFIG["DEFAULT_PROXY"] + print("本地环境,使用代理") + elif not self.is_github_actions and not CONFIG["USE_PROXY"]: + print("本地环境,未启用代理") + else: + print("GitHub Actions环境,不使用代理") + + def _check_version_update(self) -> None: + """检查版本更新""" + try: + need_update, remote_version = check_version_update( + VERSION, CONFIG["VERSION_CHECK_URL"], self.proxy_url + ) + + if need_update and remote_version: + self.update_info = { + "current_version": VERSION, + "remote_version": remote_version, + } + print(f"发现新版本: 
{remote_version} (当前: {VERSION})") + else: + print("版本检查完成,当前为最新版本") + except Exception as e: + print(f"版本检查出错: {e}") + + def _get_mode_strategy(self) -> Dict: + """获取当前模式的策略配置""" + return self.MODE_STRATEGIES.get(self.report_mode, self.MODE_STRATEGIES["daily"]) + + def _has_notification_configured(self) -> bool: + """检查是否配置了任何通知渠道""" + return any( + [ + CONFIG["FEISHU_WEBHOOK_URL"], + CONFIG["DINGTALK_WEBHOOK_URL"], + CONFIG["WEWORK_WEBHOOK_URL"], + (CONFIG["TELEGRAM_BOT_TOKEN"] and CONFIG["TELEGRAM_CHAT_ID"]), + ( + CONFIG["EMAIL_FROM"] + and CONFIG["EMAIL_PASSWORD"] + and CONFIG["EMAIL_TO"] + ), + (CONFIG["NTFY_SERVER_URL"] and CONFIG["NTFY_TOPIC"]), + ] + ) + + def _has_valid_content( + self, stats: List[Dict], new_titles: Optional[Dict] = None + ) -> bool: + """检查是否有有效的新闻内容""" + if self.report_mode in ["incremental", "current"]: + # 增量模式和current模式下,只要stats有内容就说明有匹配的新闻 + return any(stat["count"] > 0 for stat in stats) + else: + # 当日汇总模式下,检查是否有匹配的频率词新闻或新增新闻 + has_matched_news = any(stat["count"] > 0 for stat in stats) + has_new_news = bool( + new_titles and any(len(titles) > 0 for titles in new_titles.values()) + ) + return has_matched_news or has_new_news - # 从文件名中提取时间信息 - time_info = os.path.basename(title_file).replace(".txt", "") + def _load_analysis_data( + self, + ) -> Optional[Tuple[Dict, Dict, Dict, Dict, List, List]]: + """统一的数据加载和预处理,使用当前监控平台列表过滤历史数据""" + try: + # 获取当前配置的监控平台ID列表 + current_platform_ids = [] + for platform in CONFIG["PLATFORMS"]: + current_platform_ids.append(platform["id"]) + + print(f"当前监控平台: {current_platform_ids}") + + all_results, id_to_name, title_info = read_all_today_titles( + current_platform_ids + ) + + if not all_results: + print("没有找到当天的数据") + return None + + total_titles = sum(len(titles) for titles in all_results.values()) + print(f"读取到 {total_titles} 个标题(已按当前监控平台过滤)") + + new_titles = detect_latest_new_titles(current_platform_ids) + word_groups, filter_words = load_frequency_words() - # 创建标题信息字典 + return ( + all_results, + id_to_name, + title_info, + new_titles, + word_groups, + filter_words, + ) + except Exception as e: + print(f"数据加载失败: {e}") + return None + + def _prepare_current_title_info(self, results: Dict, time_info: str) -> Dict: + """从当前抓取结果构建标题信息""" title_info = {} for source_id, titles_data in results.items(): title_info[source_id] = {} - for title, ranks in titles_data.items(): + for title, title_data in titles_data.items(): + ranks = title_data.get("ranks", []) + url = title_data.get("url", "") + mobile_url = title_data.get("mobileUrl", "") + title_info[source_id][title] = { "first_time": time_info, "last_time": time_info, "count": 1, "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, } + return title_info - # 加载频率词和过滤词 - word_groups, filter_words = DataProcessor.load_frequency_words() + def _run_analysis_pipeline( + self, + data_source: Dict, + mode: str, + title_info: Dict, + new_titles: Dict, + word_groups: List[Dict], + filter_words: List[str], + id_to_name: Dict, + failed_ids: Optional[List] = None, + is_daily_summary: bool = False, + ) -> Tuple[List[Dict], str]: + """统一的分析流水线:数据处理 → 统计计算 → HTML生成""" - # 统计词频 - stats, total_titles = StatisticsCalculator.count_word_frequency( - results, + # 统计计算 + stats, total_titles = count_word_frequency( + data_source, word_groups, filter_words, - id_to_alias, + id_to_name, title_info, self.rank_threshold, + new_titles, + mode=mode, + ) + + # HTML生成 + html_file = generate_html_report( + stats, + total_titles, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + 
mode=mode, + is_daily_summary=is_daily_summary, + update_info=self.update_info if CONFIG["SHOW_VERSION_UPDATE"] else None, + ) + + return stats, html_file + + def _send_notification_if_needed( + self, + stats: List[Dict], + report_type: str, + mode: str, + failed_ids: Optional[List] = None, + new_titles: Optional[Dict] = None, + id_to_name: Optional[Dict] = None, + html_file_path: Optional[str] = None, + ) -> bool: + """统一的通知发送逻辑,包含所有判断条件""" + has_notification = self._has_notification_configured() + + if ( + CONFIG["ENABLE_NOTIFICATION"] + and has_notification + and self._has_valid_content(stats, new_titles) + ): + send_to_notifications( + stats, + failed_ids or [], + report_type, + new_titles, + id_to_name, + self.update_info, + self.proxy_url, + mode=mode, + html_file_path=html_file_path, + ) + return True + elif CONFIG["ENABLE_NOTIFICATION"] and not has_notification: + print("⚠️ 警告:通知功能已启用但未配置任何通知渠道,将跳过通知发送") + elif not CONFIG["ENABLE_NOTIFICATION"]: + print(f"跳过{report_type}通知:通知功能已禁用") + elif ( + CONFIG["ENABLE_NOTIFICATION"] + and has_notification + and not self._has_valid_content(stats, new_titles) + ): + mode_strategy = self._get_mode_strategy() + if "实时" in report_type: + print( + f"跳过实时推送通知:{mode_strategy['mode_name']}下未检测到匹配的新闻" + ) + else: + print( + f"跳过{mode_strategy['summary_report_type']}通知:未匹配到有效的新闻内容" + ) + + return False + + def _generate_summary_report(self, mode_strategy: Dict) -> Optional[str]: + """生成汇总报告(带通知)""" + summary_type = ( + "当前榜单汇总" if mode_strategy["summary_mode"] == "current" else "当日汇总" + ) + print(f"生成{summary_type}报告...") + + # 加载分析数据 + analysis_data = self._load_analysis_data() + if not analysis_data: + return None + + all_results, id_to_name, title_info, new_titles, word_groups, filter_words = ( + analysis_data + ) + + # 运行分析流水线 + stats, html_file = self._run_analysis_pipeline( + all_results, + mode_strategy["summary_mode"], + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + is_daily_summary=True, + ) + + print(f"{summary_type}报告已生成: {html_file}") + + # 发送通知 + self._send_notification_if_needed( + stats, + mode_strategy["summary_report_type"], + mode_strategy["summary_mode"], + failed_ids=[], + new_titles=new_titles, + id_to_name=id_to_name, + html_file_path=html_file, + ) + + return html_file + + def _generate_summary_html(self, mode: str = "daily") -> Optional[str]: + """生成汇总HTML""" + summary_type = "当前榜单汇总" if mode == "current" else "当日汇总" + print(f"生成{summary_type}HTML...") + + # 加载分析数据 + analysis_data = self._load_analysis_data() + if not analysis_data: + return None + + all_results, id_to_name, title_info, new_titles, word_groups, filter_words = ( + analysis_data + ) + + # 运行分析流水线 + _, html_file = self._run_analysis_pipeline( + all_results, + mode, + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + is_daily_summary=True, ) - # 根据配置决定发送哪种报告 - if self.feishu_report_type in ["current", "both"]: - # 发送当前爬取数据到飞书 - ReportGenerator.send_to_feishu(stats, failed_ids, "单次爬取") + print(f"{summary_type}HTML已生成: {html_file}") + return html_file + + def _initialize_and_check_config(self) -> None: + """通用初始化和配置检查""" + now = get_beijing_time() + print(f"当前北京时间: {now.strftime('%Y-%m-%d %H:%M:%S')}") + + if not CONFIG["ENABLE_CRAWLER"]: + print("爬虫功能已禁用(ENABLE_CRAWLER=False),程序退出") + return + + has_notification = self._has_notification_configured() + if not CONFIG["ENABLE_NOTIFICATION"]: + print("通知功能已禁用(ENABLE_NOTIFICATION=False),将只进行数据抓取") + elif not has_notification: + print("未配置任何通知渠道,将只进行数据抓取,不发送通知") + else: + 
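+            # 注意:此处只打印配置状态;消息最终是否发出还要经过
+            # send_to_notifications 开头的推送窗口(PUSH_WINDOW)判断,
+            # 以及 _send_notification_if_needed 的内容有效性检查。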
print("通知功能已启用,将发送通知") + + mode_strategy = self._get_mode_strategy() + print(f"报告模式: {self.report_mode}") + print(f"运行模式: {mode_strategy['description']}") + + def _crawl_data(self) -> Tuple[Dict, Dict, List]: + """执行数据爬取""" + ids = [] + for platform in CONFIG["PLATFORMS"]: + if "name" in platform: + ids.append((platform["id"], platform["name"])) + else: + ids.append(platform["id"]) + + print( + f"配置的监控平台: {[p.get('name', p['id']) for p in CONFIG['PLATFORMS']]}" + ) + print(f"开始爬取数据,请求间隔 {self.request_interval} 毫秒") + ensure_directory_exists("output") - # 生成HTML报告 - html_file = ReportGenerator.generate_html_report( - stats, total_titles, failed_ids + results, id_to_name, failed_ids = self.data_fetcher.crawl_websites( + ids, self.request_interval ) - print(f"HTML报告已生成: {html_file}") - # 生成当日统计报告 - daily_html = self.generate_daily_summary() + title_file = save_titles_to_file(results, id_to_name, failed_ids) + print(f"标题已保存到: {title_file}") + + return results, id_to_name, failed_ids + + def _execute_mode_strategy( + self, mode_strategy: Dict, results: Dict, id_to_name: Dict, failed_ids: List + ) -> Optional[str]: + """执行模式特定逻辑""" + # 获取当前监控平台ID列表 + current_platform_ids = [platform["id"] for platform in CONFIG["PLATFORMS"]] + + new_titles = detect_latest_new_titles(current_platform_ids) + time_info = Path(save_titles_to_file(results, id_to_name, failed_ids)).stem + word_groups, filter_words = load_frequency_words() + + # current模式下,实时推送需要使用完整的历史数据来保证统计信息的完整性 + if self.report_mode == "current": + # 加载完整的历史数据(已按当前平台过滤) + analysis_data = self._load_analysis_data() + if analysis_data: + ( + all_results, + historical_id_to_name, + historical_title_info, + historical_new_titles, + _, + _, + ) = analysis_data + + print( + f"current模式:使用过滤后的历史数据,包含平台:{list(all_results.keys())}" + ) + + stats, html_file = self._run_analysis_pipeline( + all_results, + self.report_mode, + historical_title_info, + historical_new_titles, + word_groups, + filter_words, + historical_id_to_name, + failed_ids=failed_ids, + ) + + combined_id_to_name = {**historical_id_to_name, **id_to_name} + + print(f"HTML报告已生成: {html_file}") + + # 发送实时通知(使用完整历史数据的统计结果) + summary_html = None + if mode_strategy["should_send_realtime"]: + self._send_notification_if_needed( + stats, + mode_strategy["realtime_report_type"], + self.report_mode, + failed_ids=failed_ids, + new_titles=historical_new_titles, + id_to_name=combined_id_to_name, + html_file_path=html_file, + ) + else: + print("❌ 严重错误:无法读取刚保存的数据文件") + raise RuntimeError("数据一致性检查失败:保存后立即读取失败") + else: + title_info = self._prepare_current_title_info(results, time_info) + stats, html_file = self._run_analysis_pipeline( + results, + self.report_mode, + title_info, + new_titles, + word_groups, + filter_words, + id_to_name, + failed_ids=failed_ids, + ) + print(f"HTML报告已生成: {html_file}") + + # 发送实时通知(如果需要) + summary_html = None + if mode_strategy["should_send_realtime"]: + self._send_notification_if_needed( + stats, + mode_strategy["realtime_report_type"], + self.report_mode, + failed_ids=failed_ids, + new_titles=new_titles, + id_to_name=id_to_name, + html_file_path=html_file, + ) + + # 生成汇总报告(如果需要) + summary_html = None + if mode_strategy["should_generate_summary"]: + if mode_strategy["should_send_realtime"]: + # 如果已经发送了实时通知,汇总只生成HTML不发送通知 + summary_html = self._generate_summary_html( + mode_strategy["summary_mode"] + ) + else: + # daily模式:直接生成汇总报告并发送通知 + summary_html = self._generate_summary_report(mode_strategy) + + # 打开浏览器(仅在非容器环境) + if self._should_open_browser() and html_file: + if 
summary_html: + summary_url = "file://" + str(Path(summary_html).resolve()) + print(f"正在打开汇总报告: {summary_url}") + webbrowser.open(summary_url) + else: + file_url = "file://" + str(Path(html_file).resolve()) + print(f"正在打开HTML报告: {file_url}") + webbrowser.open(file_url) + elif self.is_docker_container and html_file: + if summary_html: + print(f"汇总报告已生成(Docker环境): {summary_html}") + else: + print(f"HTML报告已生成(Docker环境): {html_file}") + + return summary_html + + def run(self) -> None: + """执行分析流程""" + try: + self._initialize_and_check_config() + + mode_strategy = self._get_mode_strategy() + + results, id_to_name, failed_ids = self._crawl_data() - # 在本地环境中自动打开HTML文件 - if not self.is_github_actions and html_file: - file_url = "file://" + os.path.abspath(html_file) - print(f"正在打开HTML报告: {file_url}") - webbrowser.open(file_url) + self._execute_mode_strategy(mode_strategy, results, id_to_name, failed_ids) - if daily_html: - daily_url = "file://" + os.path.abspath(daily_html) - print(f"正在打开当日统计报告: {daily_url}") - webbrowser.open(daily_url) + except Exception as e: + print(f"分析流程执行出错: {e}") + raise def main(): - """程序入口点""" - # 初始化并运行新闻分析器 - analyzer = NewsAnalyzer( - request_interval=CONFIG["REQUEST_INTERVAL"], - feishu_report_type=CONFIG["FEISHU_REPORT_TYPE"], - rank_threshold=CONFIG["RANK_THRESHOLD"], - ) - analyzer.run() + try: + analyzer = NewsAnalyzer() + analyzer.run() + except FileNotFoundError as e: + print(f"❌ 配置文件错误: {e}") + print("\n请确保以下文件存在:") + print(" • config/config.yaml") + print(" • config/frequency_words.txt") + print("\n参考项目文档进行正确配置") + except Exception as e: + print(f"❌ 程序运行错误: {e}") + raise if __name__ == "__main__": diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py new file mode 100644 index 0000000000000..352560e87a3ff --- /dev/null +++ b/mcp_server/__init__.py @@ -0,0 +1,7 @@ +""" +TrendRadar MCP Server + +提供基于MCP协议的新闻聚合数据查询和系统管理接口。 +""" + +__version__ = "1.0.0" diff --git a/mcp_server/server.py b/mcp_server/server.py new file mode 100644 index 0000000000000..be3ce1d30db3c --- /dev/null +++ b/mcp_server/server.py @@ -0,0 +1,696 @@ +""" +TrendRadar MCP Server - FastMCP 2.0 实现 + +使用 FastMCP 2.0 提供生产级 MCP 工具服务器。 +支持 stdio 和 HTTP 两种传输模式。 +""" + +import json +from typing import List, Optional, Dict + +from fastmcp import FastMCP + +from .tools.data_query import DataQueryTools +from .tools.analytics import AnalyticsTools +from .tools.search_tools import SearchTools +from .tools.config_mgmt import ConfigManagementTools +from .tools.system import SystemManagementTools + + +# 创建 FastMCP 2.0 应用 +mcp = FastMCP('trendradar-news') + +# 全局工具实例(在第一次请求时初始化) +_tools_instances = {} + + +def _get_tools(project_root: Optional[str] = None): + """获取或创建工具实例(单例模式)""" + if not _tools_instances: + _tools_instances['data'] = DataQueryTools(project_root) + _tools_instances['analytics'] = AnalyticsTools(project_root) + _tools_instances['search'] = SearchTools(project_root) + _tools_instances['config'] = ConfigManagementTools(project_root) + _tools_instances['system'] = SystemManagementTools(project_root) + return _tools_instances + + +# ==================== 数据查询工具 ==================== + +@mcp.tool +async def get_latest_news( + platforms: Optional[List[str]] = None, + limit: int = 50, + include_url: bool = False +) -> str: + """ + 获取最新一批爬取的新闻数据,快速了解当前热点 + + Args: + platforms: 平台ID列表,如 ['zhihu', 'weibo', 'douyin'] + - 不指定时:使用 config.yaml 中配置的所有平台 + - 支持的平台来自 config/config.yaml 的 platforms 配置 + - 每个平台都有对应的name字段(如"知乎"、"微博"),方便AI识别 + limit: 返回条数限制,默认50,最大1000 + 注意:实际返回数量可能少于请求值,取决于当前可用的新闻总数 + 
include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的新闻列表 + + **重要:数据展示建议** + 本工具会返回完整的新闻列表(通常50条)给你。但请注意: + - **工具返回**:完整的50条数据 ✅ + - **建议展示**:向用户展示全部数据,除非用户明确要求总结 + - **用户期望**:用户可能需要完整数据,请谨慎总结 + + **何时可以总结**: + - 用户明确说"给我总结一下"或"挑重点说" + - 数据量超过100条时,可先展示部分并询问是否查看全部 + + **注意**:如果用户询问"为什么只显示了部分",说明他们需要完整数据 + """ + tools = _get_tools() + result = tools['data'].get_latest_news(platforms=platforms, limit=limit, include_url=include_url) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def get_trending_topics( + top_n: int = 10, + mode: str = 'current' +) -> str: + """ + 获取个人关注词的新闻出现频率统计(基于 config/frequency_words.txt) + + 注意:本工具不是自动提取新闻热点,而是统计你在 config/frequency_words.txt 中 + 设置的个人关注词在新闻中出现的频率。你可以自定义这个关注词列表。 + + Args: + top_n: 返回TOP N关注词,默认10 + mode: 模式选择 + - daily: 当日累计数据统计 + - current: 最新一批数据统计(默认) + + Returns: + JSON格式的关注词频率统计列表 + """ + tools = _get_tools() + result = tools['data'].get_trending_topics(top_n=top_n, mode=mode) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def get_news_by_date( + date_query: Optional[str] = None, + platforms: Optional[List[str]] = None, + limit: int = 50, + include_url: bool = False +) -> str: + """ + 获取指定日期的新闻数据,用于历史数据分析和对比 + + Args: + date_query: 日期查询,可选格式: + - 自然语言: "今天", "昨天", "前天", "3天前" + - 标准日期: "2024-01-15", "2024/01/15" + - 默认值: "今天"(节省token) + platforms: 平台ID列表,如 ['zhihu', 'weibo', 'douyin'] + - 不指定时:使用 config.yaml 中配置的所有平台 + - 支持的平台来自 config/config.yaml 的 platforms 配置 + - 每个平台都有对应的name字段(如"知乎"、"微博"),方便AI识别 + limit: 返回条数限制,默认50,最大1000 + 注意:实际返回数量可能少于请求值,取决于指定日期的新闻总数 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的新闻列表,包含标题、平台、排名等信息 + + **重要:数据展示建议** + 本工具会返回完整的新闻列表(通常50条)给你。但请注意: + - **工具返回**:完整的50条数据 ✅ + - **建议展示**:向用户展示全部数据,除非用户明确要求总结 + - **用户期望**:用户可能需要完整数据,请谨慎总结 + + **何时可以总结**: + - 用户明确说"给我总结一下"或"挑重点说" + - 数据量超过100条时,可先展示部分并询问是否查看全部 + + **注意**:如果用户询问"为什么只显示了部分",说明他们需要完整数据 + """ + tools = _get_tools() + result = tools['data'].get_news_by_date( + date_query=date_query, + platforms=platforms, + limit=limit, + include_url=include_url + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + + +# ==================== 高级数据分析工具 ==================== + +@mcp.tool +async def analyze_topic_trend( + topic: str, + analysis_type: str = "trend", + date_range: Optional[Dict[str, str]] = None, + granularity: str = "day", + threshold: float = 3.0, + time_window: int = 24, + lookahead_hours: int = 6, + confidence_threshold: float = 0.7 +) -> str: + """ + 统一话题趋势分析工具 - 整合多种趋势分析模式 + + Args: + topic: 话题关键词(必需) + analysis_type: 分析类型,可选值: + - "trend": 热度趋势分析(追踪话题的热度变化) + - "lifecycle": 生命周期分析(从出现到消失的完整周期) + - "viral": 异常热度检测(识别突然爆火的话题) + - "predict": 话题预测(预测未来可能的热点) + date_range: 日期范围(trend和lifecycle模式),可选 + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-10-18", "end": "2025-10-25"} + - **说明**: AI需要根据用户的自然语言(如"最近7天")自动计算日期范围 + - **默认**: 不指定时默认分析最近7天 + granularity: 时间粒度(trend模式),默认"day"(仅支持 day,因为底层数据按天聚合) + threshold: 热度突增倍数阈值(viral模式),默认3.0 + time_window: 检测时间窗口小时数(viral模式),默认24 + lookahead_hours: 预测未来小时数(predict模式),默认6 + confidence_threshold: 置信度阈值(predict模式),默认0.7 + + Returns: + JSON格式的趋势分析结果 + + **AI使用说明:** + 当用户使用相对时间表达时(如"最近7天"、"过去一周"、"上个月"), + AI需要自动计算对应的日期范围并传递给 date_range 参数。 + + Examples: + - analyze_topic_trend(topic="人工智能", analysis_type="trend", date_range={"start": "2025-10-18", "end": "2025-10-25"}) + - analyze_topic_trend(topic="特斯拉", analysis_type="lifecycle", date_range={"start": "2025-10-18", "end": "2025-10-25"}) + - 
analyze_topic_trend(topic="比特币", analysis_type="viral", threshold=3.0) + - analyze_topic_trend(topic="ChatGPT", analysis_type="predict", lookahead_hours=6) + """ + tools = _get_tools() + result = tools['analytics'].analyze_topic_trend_unified( + topic=topic, + analysis_type=analysis_type, + date_range=date_range, + granularity=granularity, + threshold=threshold, + time_window=time_window, + lookahead_hours=lookahead_hours, + confidence_threshold=confidence_threshold + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def analyze_data_insights( + insight_type: str = "platform_compare", + topic: Optional[str] = None, + date_range: Optional[Dict[str, str]] = None, + min_frequency: int = 3, + top_n: int = 20 +) -> str: + """ + 统一数据洞察分析工具 - 整合多种数据分析模式 + + Args: + insight_type: 洞察类型,可选值: + - "platform_compare": 平台对比分析(对比不同平台对话题的关注度) + - "platform_activity": 平台活跃度统计(统计各平台发布频率和活跃时间) + - "keyword_cooccur": 关键词共现分析(分析关键词同时出现的模式) + topic: 话题关键词(可选,platform_compare模式适用) + date_range: **【对象类型】** 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-01-01", "end": "2025-01-07"} + - **重要**: 必须是对象格式,不能传递整数 + min_frequency: 最小共现频次(keyword_cooccur模式),默认3 + top_n: 返回TOP N结果(keyword_cooccur模式),默认20 + + Returns: + JSON格式的数据洞察分析结果 + + Examples: + - analyze_data_insights(insight_type="platform_compare", topic="人工智能") + - analyze_data_insights(insight_type="platform_activity", date_range={"start": "2025-01-01", "end": "2025-01-07"}) + - analyze_data_insights(insight_type="keyword_cooccur", min_frequency=5, top_n=15) + """ + tools = _get_tools() + result = tools['analytics'].analyze_data_insights_unified( + insight_type=insight_type, + topic=topic, + date_range=date_range, + min_frequency=min_frequency, + top_n=top_n + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def analyze_sentiment( + topic: Optional[str] = None, + platforms: Optional[List[str]] = None, + date_range: Optional[Dict[str, str]] = None, + limit: int = 50, + sort_by_weight: bool = True, + include_url: bool = False +) -> str: + """ + 分析新闻的情感倾向和热度趋势 + + Args: + topic: 话题关键词(可选) + platforms: 平台ID列表,如 ['zhihu', 'weibo', 'douyin'] + - 不指定时:使用 config.yaml 中配置的所有平台 + - 支持的平台来自 config/config.yaml 的 platforms 配置 + - 每个平台都有对应的name字段(如"知乎"、"微博"),方便AI识别 + date_range: **【对象类型】** 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-01-01", "end": "2025-01-07"} + - **重要**: 必须是对象格式,不能传递整数 + limit: 返回新闻数量,默认50,最大100 + 注意:本工具会对新闻标题进行去重(同一标题在不同平台只保留一次), + 因此实际返回数量可能少于请求的 limit 值 + sort_by_weight: 是否按热度权重排序,默认True + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的分析结果,包含情感分布、热度趋势和相关新闻 + + **重要:数据展示策略** + - 本工具返回完整的分析结果和新闻列表 + - **默认展示方式**:展示完整的分析结果(包括所有新闻) + - 仅在用户明确要求"总结"或"挑重点"时才进行筛选 + """ + tools = _get_tools() + result = tools['analytics'].analyze_sentiment( + topic=topic, + platforms=platforms, + date_range=date_range, + limit=limit, + sort_by_weight=sort_by_weight, + include_url=include_url + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def find_similar_news( + reference_title: str, + threshold: float = 0.6, + limit: int = 50, + include_url: bool = False +) -> str: + """ + 查找与指定新闻标题相似的其他新闻 + + Args: + reference_title: 新闻标题(完整或部分) + threshold: 相似度阈值,0-1之间,默认0.6 + 注意:阈值越高匹配越严格,返回结果越少 + limit: 返回条数限制,默认50,最大100 + 注意:实际返回数量取决于相似度匹配结果,可能少于请求值 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的相似新闻列表,包含相似度分数 + + **重要:数据展示策略** + - 本工具返回完整的相似新闻列表 + - 
**默认展示方式**:展示全部返回的新闻(包括相似度分数) + - 仅在用户明确要求"总结"或"挑重点"时才进行筛选 + """ + tools = _get_tools() + result = tools['analytics'].find_similar_news( + reference_title=reference_title, + threshold=threshold, + limit=limit, + include_url=include_url + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def generate_summary_report( + report_type: str = "daily", + date_range: Optional[Dict[str, str]] = None +) -> str: + """ + 每日/每周摘要生成器 - 自动生成热点摘要报告 + + Args: + report_type: 报告类型(daily/weekly) + date_range: **【对象类型】** 自定义日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-01-01", "end": "2025-01-07"} + - **重要**: 必须是对象格式,不能传递整数 + + Returns: + JSON格式的摘要报告,包含Markdown格式内容 + """ + tools = _get_tools() + result = tools['analytics'].generate_summary_report( + report_type=report_type, + date_range=date_range + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +# ==================== 智能检索工具 ==================== + +@mcp.tool +async def search_news( + query: str, + search_mode: str = "keyword", + date_range: Optional[Dict[str, str]] = None, + platforms: Optional[List[str]] = None, + limit: int = 50, + sort_by: str = "relevance", + threshold: float = 0.6, + include_url: bool = False +) -> str: + """ + 统一搜索接口,支持多种搜索模式 + + Args: + query: 搜索关键词或内容片段 + search_mode: 搜索模式,可选值: + - "keyword": 精确关键词匹配(默认,适合搜索特定话题) + - "fuzzy": 模糊内容匹配(适合搜索内容片段,会过滤相似度低于阈值的结果) + - "entity": 实体名称搜索(适合搜索人物/地点/机构) + date_range: 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-01-01", "end": "2025-01-07"} + - **说明**: AI需要根据用户的自然语言(如"最近7天")自动计算日期范围 + - **默认**: 不指定时默认查询今天的新闻 + - **注意**: start和end可以相同(表示单日查询) + platforms: 平台ID列表,如 ['zhihu', 'weibo', 'douyin'] + - 不指定时:使用 config.yaml 中配置的所有平台 + - 支持的平台来自 config/config.yaml 的 platforms 配置 + - 每个平台都有对应的name字段(如"知乎"、"微博"),方便AI识别 + limit: 返回条数限制,默认50,最大1000 + 注意:实际返回数量取决于搜索匹配结果(特别是 fuzzy 模式下会过滤低相似度结果) + sort_by: 排序方式,可选值: + - "relevance": 按相关度排序(默认) + - "weight": 按新闻权重排序 + - "date": 按日期排序 + threshold: 相似度阈值(仅fuzzy模式有效),0-1之间,默认0.6 + 注意:阈值越高匹配越严格,返回结果越少 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的搜索结果,包含标题、平台、排名等信息 + + **重要:数据展示策略** + - 本工具返回完整的搜索结果列表 + - **默认展示方式**:展示全部返回的新闻,无需总结或筛选 + - 仅在用户明确要求"总结"或"挑重点"时才进行筛选 + + **AI使用说明:** + 当用户使用相对时间表达时(如"最近7天"、"过去一周"、"最近半个月"), + AI需要自动计算对应的日期范围。计算规则: + - "最近7天" → {"start": "今天-6天", "end": "今天"} + - "过去一周" → {"start": "今天-6天", "end": "今天"} + - "最近30天" → {"start": "今天-29天", "end": "今天"} + + Examples: + - 今天的新闻: search_news(query="人工智能") + - 最近7天: search_news(query="人工智能", date_range={"start": "2025-10-18", "end": "2025-10-25"}) + - 精确日期: search_news(query="人工智能", date_range={"start": "2025-01-01", "end": "2025-01-07"}) + - 模糊搜索: search_news(query="特斯拉降价", search_mode="fuzzy", threshold=0.4) + """ + tools = _get_tools() + result = tools['search'].search_news_unified( + query=query, + search_mode=search_mode, + date_range=date_range, + platforms=platforms, + limit=limit, + sort_by=sort_by, + threshold=threshold, + include_url=include_url + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def search_related_news_history( + reference_text: str, + time_preset: str = "yesterday", + threshold: float = 0.4, + limit: int = 50, + include_url: bool = False +) -> str: + """ + 基于种子新闻,在历史数据中搜索相关新闻 + + Args: + reference_text: 参考新闻标题(完整或部分) + time_preset: 时间范围预设值,可选: + - "yesterday": 昨天 + - "last_week": 上周 (7天) + - "last_month": 上个月 (30天) + - "custom": 自定义日期范围(需要提供 start_date 和 end_date) + threshold: 
相关性阈值,0-1之间,默认0.4 + 注意:综合相似度计算(70%关键词重合 + 30%文本相似度) + 阈值越高匹配越严格,返回结果越少 + limit: 返回条数限制,默认50,最大100 + 注意:实际返回数量取决于相关性匹配结果,可能少于请求值 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的相关新闻列表,包含相关性分数和时间分布 + + **重要:数据展示策略** + - 本工具返回完整的相关新闻列表 + - **默认展示方式**:展示全部返回的新闻(包括相关性分数) + - 仅在用户明确要求"总结"或"挑重点"时才进行筛选 + """ + tools = _get_tools() + result = tools['search'].search_related_news_history( + reference_text=reference_text, + time_preset=time_preset, + threshold=threshold, + limit=limit, + include_url=include_url + ) + return json.dumps(result, ensure_ascii=False, indent=2) + + +# ==================== 配置与系统管理工具 ==================== + +@mcp.tool +async def get_current_config( + section: str = "all" +) -> str: + """ + 获取当前系统配置 + + Args: + section: 配置节,可选值: + - "all": 所有配置(默认) + - "crawler": 爬虫配置 + - "push": 推送配置 + - "keywords": 关键词配置 + - "weights": 权重配置 + + Returns: + JSON格式的配置信息 + """ + tools = _get_tools() + result = tools['config'].get_current_config(section=section) + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def get_system_status() -> str: + """ + 获取系统运行状态和健康检查信息 + + 返回系统版本、数据统计、缓存状态等信息 + + Returns: + JSON格式的系统状态信息 + """ + tools = _get_tools() + result = tools['system'].get_system_status() + return json.dumps(result, ensure_ascii=False, indent=2) + + +@mcp.tool +async def trigger_crawl( + platforms: Optional[List[str]] = None, + save_to_local: bool = False, + include_url: bool = False +) -> str: + """ + 手动触发一次爬取任务(可选持久化) + + Args: + platforms: 指定平台ID列表,如 ['zhihu', 'weibo', 'douyin'] + - 不指定时:使用 config.yaml 中配置的所有平台 + - 支持的平台来自 config/config.yaml 的 platforms 配置 + - 每个平台都有对应的name字段(如"知乎"、"微博"),方便AI识别 + - 注意:失败的平台会在返回结果的 failed_platforms 字段中列出 + save_to_local: 是否保存到本地 output 目录,默认 False + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + JSON格式的任务状态信息,包含: + - platforms: 成功爬取的平台列表 + - failed_platforms: 失败的平台列表(如有) + - total_news: 爬取的新闻总数 + - data: 新闻数据 + + Examples: + - 临时爬取: trigger_crawl(platforms=['zhihu']) + - 爬取并保存: trigger_crawl(platforms=['weibo'], save_to_local=True) + - 使用默认平台: trigger_crawl() # 爬取config.yaml中配置的所有平台 + """ + tools = _get_tools() + result = tools['system'].trigger_crawl(platforms=platforms, save_to_local=save_to_local, include_url=include_url) + return json.dumps(result, ensure_ascii=False, indent=2) + + +# ==================== 启动入口 ==================== + +def run_server( + project_root: Optional[str] = None, + transport: str = 'stdio', + host: str = '0.0.0.0', + port: int = 3333 +): + """ + 启动 MCP 服务器 + + Args: + project_root: 项目根目录路径 + transport: 传输模式,'stdio' 或 'http' + host: HTTP模式的监听地址,默认 0.0.0.0 + port: HTTP模式的监听端口,默认 3333 + """ + # 初始化工具实例 + _get_tools(project_root) + + # 打印启动信息 + print() + print("=" * 60) + print(" TrendRadar MCP Server - FastMCP 2.0") + print("=" * 60) + print(f" 传输模式: {transport.upper()}") + + if transport == 'stdio': + print(" 协议: MCP over stdio (标准输入输出)") + print(" 说明: 通过标准输入输出与 MCP 客户端通信") + elif transport == 'http': + print(f" 监听地址: http://{host}:{port}") + print(f" HTTP端点: http://{host}:{port}/mcp") + print(" 协议: MCP over HTTP (生产环境)") + + if project_root: + print(f" 项目目录: {project_root}") + else: + print(" 项目目录: 当前目录") + + print() + print(" 已注册的工具:") + print(" === 基础数据查询(P0核心)===") + print(" 1. get_latest_news - 获取最新新闻") + print(" 2. get_news_by_date - 按日期查询新闻(支持自然语言)") + print(" 3. get_trending_topics - 获取趋势话题") + print() + print(" === 智能检索工具 ===") + print(" 4. search_news - 统一新闻搜索(关键词/模糊/实体)") + print(" 5. 
search_related_news_history - 历史相关新闻检索") + print() + print(" === 高级数据分析 ===") + print(" 6. analyze_topic_trend - 统一话题趋势分析(热度/生命周期/爆火/预测)") + print(" 7. analyze_data_insights - 统一数据洞察分析(平台对比/活跃度/关键词共现)") + print(" 8. analyze_sentiment - 情感倾向分析") + print(" 9. find_similar_news - 相似新闻查找") + print(" 10. generate_summary_report - 每日/每周摘要生成") + print() + print(" === 配置与系统管理 ===") + print(" 11. get_current_config - 获取当前系统配置") + print(" 12. get_system_status - 获取系统运行状态") + print(" 13. trigger_crawl - 手动触发爬取任务") + print("=" * 60) + print() + + # 根据传输模式运行服务器 + if transport == 'stdio': + mcp.run(transport='stdio') + elif transport == 'http': + # HTTP 模式(生产推荐) + mcp.run( + transport='http', + host=host, + port=port, + path='/mcp' # HTTP 端点路径 + ) + else: + raise ValueError(f"不支持的传输模式: {transport}") + + +if __name__ == '__main__': + import sys + import argparse + + parser = argparse.ArgumentParser( + description='TrendRadar MCP Server - 新闻热点聚合 MCP 工具服务器', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +使用示例: + # STDIO 模式(用于 Cherry Studio) + uv run python mcp_server/server.py + + # HTTP 模式(适合远程访问) + uv run python mcp_server/server.py --transport http --port 3333 + +Cherry Studio 配置示例: + 设置 > MCP Servers > 添加服务器 + - 名称: TrendRadar + - 类型: STDIO + - 命令: [UV的完整路径] + - 参数: --directory [项目路径] run python mcp_server/server.py + +详细配置教程请查看: README-Cherry-Studio.md + """ + ) + parser.add_argument( + '--transport', + choices=['stdio', 'http'], + default='stdio', + help='传输模式:stdio (默认) 或 http (生产环境)' + ) + parser.add_argument( + '--host', + default='0.0.0.0', + help='HTTP模式的监听地址,默认 0.0.0.0' + ) + parser.add_argument( + '--port', + type=int, + default=3333, + help='HTTP模式的监听端口,默认 3333' + ) + parser.add_argument( + '--project-root', + help='项目根目录路径' + ) + + args = parser.parse_args() + + run_server( + project_root=args.project_root, + transport=args.transport, + host=args.host, + port=args.port + ) diff --git a/mcp_server/services/__init__.py b/mcp_server/services/__init__.py new file mode 100644 index 0000000000000..81fd84e56df3d --- /dev/null +++ b/mcp_server/services/__init__.py @@ -0,0 +1,5 @@ +""" +服务层模块 + +提供数据访问、缓存、解析等核心服务。 +""" diff --git a/mcp_server/services/cache_service.py b/mcp_server/services/cache_service.py new file mode 100644 index 0000000000000..ce09d00a5acd5 --- /dev/null +++ b/mcp_server/services/cache_service.py @@ -0,0 +1,136 @@ +""" +缓存服务 + +实现TTL缓存机制,提升数据访问性能。 +""" + +import time +from typing import Any, Optional +from threading import Lock + + +class CacheService: + """缓存服务类""" + + def __init__(self): + """初始化缓存服务""" + self._cache = {} + self._timestamps = {} + self._lock = Lock() + + def get(self, key: str, ttl: int = 900) -> Optional[Any]: + """ + 获取缓存数据 + + Args: + key: 缓存键 + ttl: 存活时间(秒),默认15分钟 + + Returns: + 缓存的值,如果不存在或已过期则返回None + """ + with self._lock: + if key in self._cache: + # 检查是否过期 + if time.time() - self._timestamps[key] < ttl: + return self._cache[key] + else: + # 已过期,删除缓存 + del self._cache[key] + del self._timestamps[key] + return None + + def set(self, key: str, value: Any) -> None: + """ + 设置缓存数据 + + Args: + key: 缓存键 + value: 缓存值 + """ + with self._lock: + self._cache[key] = value + self._timestamps[key] = time.time() + + def delete(self, key: str) -> bool: + """ + 删除缓存 + + Args: + key: 缓存键 + + Returns: + 是否成功删除 + """ + with self._lock: + if key in self._cache: + del self._cache[key] + del self._timestamps[key] + return True + return False + + def clear(self) -> None: + """清空所有缓存""" + with self._lock: + self._cache.clear() + self._timestamps.clear() + 
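+    # 使用示意(假设性调用,键名与数据仅作演示):过期判定发生在读取侧,
+    # set() 只记录写入时间戳,命中窗口由 get() 调用方传入的 ttl 决定:
+    #   cache = get_cache()
+    #   cache.set("latest_news", news_list)
+    #   hit = cache.get("latest_news", ttl=900)  # 15 分钟内命中,过期返回 None
+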
+ def cleanup_expired(self, ttl: int = 900) -> int: + """ + 清理过期缓存 + + Args: + ttl: 存活时间(秒) + + Returns: + 清理的条目数量 + """ + with self._lock: + current_time = time.time() + expired_keys = [ + key for key, timestamp in self._timestamps.items() + if current_time - timestamp >= ttl + ] + + for key in expired_keys: + del self._cache[key] + del self._timestamps[key] + + return len(expired_keys) + + def get_stats(self) -> dict: + """ + 获取缓存统计信息 + + Returns: + 统计信息字典 + """ + with self._lock: + return { + "total_entries": len(self._cache), + "oldest_entry_age": ( + time.time() - min(self._timestamps.values()) + if self._timestamps else 0 + ), + "newest_entry_age": ( + time.time() - max(self._timestamps.values()) + if self._timestamps else 0 + ) + } + + +# 全局缓存实例 +_global_cache = None + + +def get_cache() -> CacheService: + """ + 获取全局缓存实例 + + Returns: + 全局缓存服务实例 + """ + global _global_cache + if _global_cache is None: + _global_cache = CacheService() + return _global_cache diff --git a/mcp_server/services/data_service.py b/mcp_server/services/data_service.py new file mode 100644 index 0000000000000..9e409a1f7f0d4 --- /dev/null +++ b/mcp_server/services/data_service.py @@ -0,0 +1,604 @@ +""" +数据访问服务 + +提供统一的数据查询接口,封装数据访问逻辑。 +""" + +import re +from collections import Counter +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +from .cache_service import get_cache +from .parser_service import ParserService +from ..utils.errors import DataNotFoundError + + +class DataService: + """数据访问服务类""" + + def __init__(self, project_root: str = None): + """ + 初始化数据服务 + + Args: + project_root: 项目根目录 + """ + self.parser = ParserService(project_root) + self.cache = get_cache() + + def get_latest_news( + self, + platforms: Optional[List[str]] = None, + limit: int = 50, + include_url: bool = False + ) -> List[Dict]: + """ + 获取最新一批爬取的新闻数据 + + Args: + platforms: 平台ID列表,None表示所有平台 + limit: 返回条数限制 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 新闻列表 + + Raises: + DataNotFoundError: 数据不存在 + """ + # 尝试从缓存获取 + cache_key = f"latest_news:{','.join(platforms or [])}:{limit}:{include_url}" + cached = self.cache.get(cache_key, ttl=900) # 15分钟缓存 + if cached: + return cached + + # 读取今天的数据 + all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date( + date=None, + platform_ids=platforms + ) + + # 获取最新的文件时间 + if timestamps: + latest_timestamp = max(timestamps.values()) + fetch_time = datetime.fromtimestamp(latest_timestamp) + else: + fetch_time = datetime.now() + + # 转换为新闻列表 + news_list = [] + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 取第一个排名 + rank = info["ranks"][0] if info["ranks"] else 0 + + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "rank": rank, + "timestamp": fetch_time.strftime("%Y-%m-%d %H:%M:%S") + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + news_list.append(news_item) + + # 按排名排序 + news_list.sort(key=lambda x: x["rank"]) + + # 限制返回数量 + result = news_list[:limit] + + # 缓存结果 + self.cache.set(cache_key, result) + + return result + + def get_news_by_date( + self, + target_date: datetime, + platforms: Optional[List[str]] = None, + limit: int = 50, + include_url: bool = False + ) -> List[Dict]: + """ + 按指定日期获取新闻 + + Args: + target_date: 目标日期 + platforms: 平台ID列表,None表示所有平台 + limit: 返回条数限制 + include_url: 
是否包含URL链接,默认False(节省token) + + Returns: + 新闻列表 + + Raises: + DataNotFoundError: 数据不存在 + + Examples: + >>> service = DataService() + >>> news = service.get_news_by_date( + ... target_date=datetime(2025, 10, 10), + ... platforms=['zhihu'], + ... limit=20 + ... ) + """ + # 尝试从缓存获取 + date_str = target_date.strftime("%Y-%m-%d") + cache_key = f"news_by_date:{date_str}:{','.join(platforms or [])}:{limit}:{include_url}" + cached = self.cache.get(cache_key, ttl=1800) # 30分钟缓存 + if cached: + return cached + + # 读取指定日期的数据 + all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date( + date=target_date, + platform_ids=platforms + ) + + # 转换为新闻列表 + news_list = [] + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 计算平均排名 + avg_rank = sum(info["ranks"]) / len(info["ranks"]) if info["ranks"] else 0 + + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "rank": info["ranks"][0] if info["ranks"] else 0, + "avg_rank": round(avg_rank, 2), + "count": len(info["ranks"]), + "date": date_str + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + news_list.append(news_item) + + # 按排名排序 + news_list.sort(key=lambda x: x["rank"]) + + # 限制返回数量 + result = news_list[:limit] + + # 缓存结果(历史数据缓存更久) + self.cache.set(cache_key, result) + + return result + + def search_news_by_keyword( + self, + keyword: str, + date_range: Optional[Tuple[datetime, datetime]] = None, + platforms: Optional[List[str]] = None, + limit: Optional[int] = None + ) -> Dict: + """ + 按关键词搜索新闻 + + Args: + keyword: 搜索关键词 + date_range: 日期范围 (start_date, end_date) + platforms: 平台过滤列表 + limit: 返回条数限制(可选) + + Returns: + 搜索结果字典 + + Raises: + DataNotFoundError: 数据不存在 + """ + # 确定搜索日期范围 + if date_range: + start_date, end_date = date_range + else: + # 默认搜索今天 + start_date = end_date = datetime.now() + + # 收集所有匹配的新闻 + results = [] + platform_distribution = Counter() + + # 遍历日期范围 + current_date = start_date + while current_date <= end_date: + try: + all_titles, id_to_name, _ = self.parser.read_all_titles_for_date( + date=current_date, + platform_ids=platforms + ) + + # 搜索包含关键词的标题 + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + if keyword.lower() in title.lower(): + # 计算平均排名 + avg_rank = sum(info["ranks"]) / len(info["ranks"]) if info["ranks"] else 0 + + results.append({ + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "ranks": info["ranks"], + "count": len(info["ranks"]), + "avg_rank": round(avg_rank, 2), + "url": info.get("url", ""), + "mobileUrl": info.get("mobileUrl", ""), + "date": current_date.strftime("%Y-%m-%d") + }) + + platform_distribution[platform_id] += 1 + + except DataNotFoundError: + # 该日期没有数据,继续下一天 + pass + + # 下一天 + current_date += timedelta(days=1) + + if not results: + raise DataNotFoundError( + f"未找到包含关键词 '{keyword}' 的新闻", + suggestion="请尝试其他关键词或扩大日期范围" + ) + + # 计算统计信息 + total_ranks = [] + for item in results: + total_ranks.extend(item["ranks"]) + + avg_rank = sum(total_ranks) / len(total_ranks) if total_ranks else 0 + + # 限制返回数量(如果指定) + total_found = len(results) + if limit is not None and limit > 0: + results = results[:limit] + + return { + "results": results, + "total": len(results), + "total_found": total_found, + "statistics": { + "platform_distribution": 
dict(platform_distribution), + "avg_rank": round(avg_rank, 2), + "keyword": keyword + } + } + + def get_trending_topics( + self, + top_n: int = 10, + mode: str = "current" + ) -> Dict: + """ + 获取个人关注词的新闻出现频率统计 + + 注意:本工具基于 config/frequency_words.txt 中的个人关注词列表进行统计, + 而不是自动从新闻中提取热点话题。用户可以自定义这个关注词列表。 + + Args: + top_n: 返回TOP N关注词 + mode: 模式 - daily(当日累计), current(最新一批) + + Returns: + 关注词频率统计字典 + + Raises: + DataNotFoundError: 数据不存在 + """ + # 尝试从缓存获取 + cache_key = f"trending_topics:{top_n}:{mode}" + cached = self.cache.get(cache_key, ttl=1800) # 30分钟缓存 + if cached: + return cached + + # 读取今天的数据 + all_titles, id_to_name, timestamps = self.parser.read_all_titles_for_date() + + if not all_titles: + raise DataNotFoundError( + "未找到今天的新闻数据", + suggestion="请确保爬虫已经运行并生成了数据" + ) + + # 加载关键词配置 + word_groups = self.parser.parse_frequency_words() + + # 根据mode选择要处理的标题数据 + titles_to_process = {} + + if mode == "daily": + # daily模式:处理当天所有累计数据 + titles_to_process = all_titles + + elif mode == "current": + # current模式:只处理最新一批数据(最新时间戳的文件) + if timestamps: + # 找出最新的时间戳 + latest_timestamp = max(timestamps.values()) + + # 重新读取,只获取最新时间的数据 + # 这里我们通过timestamps字典反查找最新文件对应的平台 + latest_titles, _, _ = self.parser.read_all_titles_for_date() + + # 由于read_all_titles_for_date返回所有文件的合并数据, + # 我们需要通过timestamps来过滤出最新批次 + # 简化实现:使用当前所有数据作为最新批次 + # (更精确的实现需要解析服务支持按时间过滤) + titles_to_process = latest_titles + else: + titles_to_process = all_titles + + else: + raise ValueError( + f"不支持的模式: {mode}。支持的模式: daily, current" + ) + + # 统计词频 + word_frequency = Counter() + keyword_to_news = {} + + # 遍历要处理的标题 + for platform_id, titles in titles_to_process.items(): + for title in titles.keys(): + # 对每个关键词组进行匹配 + for group in word_groups: + all_words = group.get("required", []) + group.get("normal", []) + + for word in all_words: + if word and word in title: + word_frequency[word] += 1 + + if word not in keyword_to_news: + keyword_to_news[word] = [] + keyword_to_news[word].append(title) + + # 获取TOP N关键词 + top_keywords = word_frequency.most_common(top_n) + + # 构建话题列表 + topics = [] + for keyword, frequency in top_keywords: + matched_news = keyword_to_news.get(keyword, []) + + topics.append({ + "keyword": keyword, + "frequency": frequency, + "matched_news": len(set(matched_news)), # 去重后的新闻数量 + "trend": "stable", # TODO: 需要历史数据来计算趋势 + "weight_score": 0.0 # TODO: 需要实现权重计算 + }) + + # 构建结果 + result = { + "topics": topics, + "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "mode": mode, + "total_keywords": len(word_frequency), + "description": self._get_mode_description(mode) + } + + # 缓存结果 + self.cache.set(cache_key, result) + + return result + + def _get_mode_description(self, mode: str) -> str: + """获取模式描述""" + descriptions = { + "daily": "当日累计统计", + "current": "最新一批统计" + } + return descriptions.get(mode, "未知模式") + + def get_current_config(self, section: str = "all") -> Dict: + """ + 获取当前系统配置 + + Args: + section: 配置节 - all/crawler/push/keywords/weights + + Returns: + 配置字典 + + Raises: + FileParseError: 配置文件解析错误 + """ + # 尝试从缓存获取 + cache_key = f"config:{section}" + cached = self.cache.get(cache_key, ttl=3600) # 1小时缓存 + if cached: + return cached + + # 解析配置文件 + config_data = self.parser.parse_yaml_config() + word_groups = self.parser.parse_frequency_words() + + # 根据section返回对应配置 + if section == "all" or section == "crawler": + crawler_config = { + "enable_crawler": config_data.get("crawler", {}).get("enable_crawler", True), + "use_proxy": config_data.get("crawler", {}).get("use_proxy", False), + "request_interval": 
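+ # A possible stricter matcher for the word groups parsed from
+ # config/frequency_words.txt. Note: get_trending_topics above only counts
+ # individual word occurrences; this sketch shows how the group fields
+ # (required / normal / filter_words) could be combined instead.
+ # (A frequency_words.txt line such as "AI+,人工智能,机器学习,广告!" parses to
+ #  required=["AI"], normal=["人工智能", "机器学习"], filter_words=["广告"].)
+ def title_matches_group(title: str, group: dict) -> bool:
+     if any(w in title for w in group.get("filter_words", [])):
+         return False  # filter words veto the whole group
+     required = group.get("required", [])
+     if required and not all(w in title for w in required):
+         return False  # every required word must appear
+     normal = group.get("normal", [])
+     return any(w in title for w in normal) if normal else bool(required)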
config_data.get("crawler", {}).get("request_interval", 1), + "retry_times": 3, + "platforms": [p["id"] for p in config_data.get("platforms", [])] + } + + if section == "all" or section == "push": + push_config = { + "enable_notification": config_data.get("notification", {}).get("enable_notification", True), + "enabled_channels": [], + "message_batch_size": config_data.get("notification", {}).get("message_batch_size", 20), + "push_window": config_data.get("notification", {}).get("push_window", {}) + } + + # 检测已配置的通知渠道 + webhooks = config_data.get("notification", {}).get("webhooks", {}) + if webhooks.get("feishu_url"): + push_config["enabled_channels"].append("feishu") + if webhooks.get("dingtalk_url"): + push_config["enabled_channels"].append("dingtalk") + if webhooks.get("wework_url"): + push_config["enabled_channels"].append("wework") + + if section == "all" or section == "keywords": + keywords_config = { + "word_groups": word_groups, + "total_groups": len(word_groups) + } + + if section == "all" or section == "weights": + weights_config = { + "rank_weight": config_data.get("weight", {}).get("rank_weight", 0.6), + "frequency_weight": config_data.get("weight", {}).get("frequency_weight", 0.3), + "hotness_weight": config_data.get("weight", {}).get("hotness_weight", 0.1) + } + + # 组装结果 + if section == "all": + result = { + "crawler": crawler_config, + "push": push_config, + "keywords": keywords_config, + "weights": weights_config + } + elif section == "crawler": + result = crawler_config + elif section == "push": + result = push_config + elif section == "keywords": + result = keywords_config + elif section == "weights": + result = weights_config + else: + result = {} + + # 缓存结果 + self.cache.set(cache_key, result) + + return result + + def get_available_date_range(self) -> Tuple[Optional[datetime], Optional[datetime]]: + """ + 扫描 output 目录,返回实际可用的日期范围 + + Returns: + (最早日期, 最新日期) 元组,如果没有数据则返回 (None, None) + + Examples: + >>> service = DataService() + >>> earliest, latest = service.get_available_date_range() + >>> print(f"可用日期范围:{earliest} 至 {latest}") + """ + output_dir = self.parser.project_root / "output" + + if not output_dir.exists(): + return (None, None) + + available_dates = [] + + # 遍历日期文件夹 + for date_folder in output_dir.iterdir(): + if date_folder.is_dir() and not date_folder.name.startswith('.'): + # 解析日期(格式: YYYY年MM月DD日) + try: + date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_folder.name) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)) + ) + available_dates.append(folder_date) + except Exception: + pass + + if not available_dates: + return (None, None) + + return (min(available_dates), max(available_dates)) + + def get_system_status(self) -> Dict: + """ + 获取系统运行状态 + + Returns: + 系统状态字典 + """ + # 获取数据统计 + output_dir = self.parser.project_root / "output" + + total_storage = 0 + oldest_record = None + latest_record = None + total_news = 0 + + if output_dir.exists(): + # 遍历日期文件夹 + for date_folder in output_dir.iterdir(): + if date_folder.is_dir(): + # 解析日期 + try: + date_str = date_folder.name + # 格式: YYYY年MM月DD日 + date_match = re.match(r'(\d{4})年(\d{2})月(\d{2})日', date_str) + if date_match: + folder_date = datetime( + int(date_match.group(1)), + int(date_match.group(2)), + int(date_match.group(3)) + ) + + if oldest_record is None or folder_date < oldest_record: + oldest_record = folder_date + if latest_record is None or folder_date > latest_record: + latest_record = folder_date + + except: + pass + + 
# 计算存储大小 + for item in date_folder.rglob("*"): + if item.is_file(): + total_storage += item.stat().st_size + + # 读取版本信息 + version_file = self.parser.project_root / "version" + version = "unknown" + if version_file.exists(): + try: + with open(version_file, "r") as f: + version = f.read().strip() + except: + pass + + return { + "system": { + "version": version, + "project_root": str(self.parser.project_root) + }, + "data": { + "total_storage": f"{total_storage / 1024 / 1024:.2f} MB", + "oldest_record": oldest_record.strftime("%Y-%m-%d") if oldest_record else None, + "latest_record": latest_record.strftime("%Y-%m-%d") if latest_record else None, + }, + "cache": self.cache.get_stats(), + "health": "healthy" + } diff --git a/mcp_server/services/parser_service.py b/mcp_server/services/parser_service.py new file mode 100644 index 0000000000000..6bd296906930c --- /dev/null +++ b/mcp_server/services/parser_service.py @@ -0,0 +1,355 @@ +""" +文件解析服务 + +提供txt格式新闻数据和YAML配置文件的解析功能。 +""" + +import re +from pathlib import Path +from typing import Dict, List, Tuple, Optional +from datetime import datetime + +import yaml + +from ..utils.errors import FileParseError, DataNotFoundError +from .cache_service import get_cache + + +class ParserService: + """文件解析服务类""" + + def __init__(self, project_root: str = None): + """ + 初始化解析服务 + + Args: + project_root: 项目根目录,默认为当前目录的父目录 + """ + if project_root is None: + # 获取当前文件所在目录的父目录的父目录 + current_file = Path(__file__) + self.project_root = current_file.parent.parent.parent + else: + self.project_root = Path(project_root) + + # 初始化缓存服务 + self.cache = get_cache() + + @staticmethod + def clean_title(title: str) -> str: + """ + 清理标题文本 + + Args: + title: 原始标题 + + Returns: + 清理后的标题 + """ + # 移除多余空白 + title = re.sub(r'\s+', ' ', title) + # 移除特殊字符 + title = title.strip() + return title + + def parse_txt_file(self, file_path: Path) -> Tuple[Dict, Dict]: + """ + 解析单个txt文件的标题数据 + + Args: + file_path: txt文件路径 + + Returns: + (titles_by_id, id_to_name) 元组 + - titles_by_id: {platform_id: {title: {ranks, url, mobileUrl}}} + - id_to_name: {platform_id: platform_name} + + Raises: + FileParseError: 文件解析错误 + """ + if not file_path.exists(): + raise FileParseError(str(file_path), "文件不存在") + + titles_by_id = {} + id_to_name = {} + + try: + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + sections = content.split("\n\n") + + for section in sections: + if not section.strip() or "==== 以下ID请求失败 ====" in section: + continue + + lines = section.strip().split("\n") + if len(lines) < 2: + continue + + # 解析header: id | name 或 id + header_line = lines[0].strip() + if " | " in header_line: + parts = header_line.split(" | ", 1) + source_id = parts[0].strip() + name = parts[1].strip() + id_to_name[source_id] = name + else: + source_id = header_line + id_to_name[source_id] = source_id + + titles_by_id[source_id] = {} + + # 解析标题行 + for line in lines[1:]: + if line.strip(): + try: + title_part = line.strip() + rank = None + + # 提取排名 + if ". " in title_part and title_part.split(". ")[0].isdigit(): + rank_str, title_part = title_part.split(". 
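+ # Block format consumed by this parser (URLs illustrative):
+ #
+ #   zhihu | 知乎
+ #   1. 某条新闻标题 [URL:https://example.com/a] [MOBILE:https://m.example.com/a]
+ #
+ # parses to titles_by_id == {"zhihu": {"某条新闻标题": {"ranks": [1],
+ #   "url": "https://example.com/a", "mobileUrl": "https://m.example.com/a"}}}
+ # and id_to_name == {"zhihu": "知乎"}.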
", 1) + rank = int(rank_str) + + # 提取 MOBILE URL + mobile_url = "" + if " [MOBILE:" in title_part: + title_part, mobile_part = title_part.rsplit(" [MOBILE:", 1) + if mobile_part.endswith("]"): + mobile_url = mobile_part[:-1] + + # 提取 URL + url = "" + if " [URL:" in title_part: + title_part, url_part = title_part.rsplit(" [URL:", 1) + if url_part.endswith("]"): + url = url_part[:-1] + + title = self.clean_title(title_part.strip()) + ranks = [rank] if rank is not None else [1] + + titles_by_id[source_id][title] = { + "ranks": ranks, + "url": url, + "mobileUrl": mobile_url, + } + + except Exception as e: + # 忽略单行解析错误 + continue + + except Exception as e: + raise FileParseError(str(file_path), str(e)) + + return titles_by_id, id_to_name + + def get_date_folder_name(self, date: datetime = None) -> str: + """ + 获取日期文件夹名称 + + Args: + date: 日期对象,默认为今天 + + Returns: + 文件夹名称,格式: YYYY年MM月DD日 + """ + if date is None: + date = datetime.now() + return date.strftime("%Y年%m月%d日") + + def read_all_titles_for_date( + self, + date: datetime = None, + platform_ids: Optional[List[str]] = None + ) -> Tuple[Dict, Dict, Dict]: + """ + 读取指定日期的所有标题文件(带缓存) + + Args: + date: 日期对象,默认为今天 + platform_ids: 平台ID列表,None表示所有平台 + + Returns: + (all_titles, id_to_name, all_timestamps) 元组 + - all_titles: {platform_id: {title: {ranks, url, mobileUrl, ...}}} + - id_to_name: {platform_id: platform_name} + - all_timestamps: {filename: timestamp} + + Raises: + DataNotFoundError: 数据不存在 + """ + # 生成缓存键 + date_str = self.get_date_folder_name(date) + platform_key = ','.join(sorted(platform_ids)) if platform_ids else 'all' + cache_key = f"read_all_titles:{date_str}:{platform_key}" + + # 尝试从缓存获取 + # 对于历史数据(非今天),使用更长的缓存时间(1小时) + # 对于今天的数据,使用较短的缓存时间(15分钟),因为可能有新数据 + is_today = (date is None) or (date.date() == datetime.now().date()) + ttl = 900 if is_today else 3600 # 15分钟 vs 1小时 + + cached = self.cache.get(cache_key, ttl=ttl) + if cached: + return cached + + # 缓存未命中,读取文件 + date_folder = self.get_date_folder_name(date) + txt_dir = self.project_root / "output" / date_folder / "txt" + + if not txt_dir.exists(): + raise DataNotFoundError( + f"未找到 {date_folder} 的数据目录", + suggestion="请先运行爬虫或检查日期是否正确" + ) + + all_titles = {} + id_to_name = {} + all_timestamps = {} + + # 读取所有txt文件 + txt_files = sorted(txt_dir.glob("*.txt")) + + if not txt_files: + raise DataNotFoundError( + f"{date_folder} 没有数据文件", + suggestion="请等待爬虫任务完成" + ) + + for txt_file in txt_files: + try: + titles_by_id, file_id_to_name = self.parse_txt_file(txt_file) + + # 更新id_to_name + id_to_name.update(file_id_to_name) + + # 合并标题数据 + for platform_id, titles in titles_by_id.items(): + # 如果指定了平台过滤 + if platform_ids and platform_id not in platform_ids: + continue + + if platform_id not in all_titles: + all_titles[platform_id] = {} + + for title, info in titles.items(): + if title in all_titles[platform_id]: + # 合并排名 + all_titles[platform_id][title]["ranks"].extend(info["ranks"]) + else: + all_titles[platform_id][title] = info.copy() + + # 记录文件时间戳 + all_timestamps[txt_file.name] = txt_file.stat().st_mtime + + except Exception as e: + # 忽略单个文件的解析错误,继续处理其他文件 + print(f"Warning: 解析文件 {txt_file} 失败: {e}") + continue + + if not all_titles: + raise DataNotFoundError( + f"{date_folder} 没有有效的数据", + suggestion="请检查数据文件格式或重新运行爬虫" + ) + + # 缓存结果 + result = (all_titles, id_to_name, all_timestamps) + self.cache.set(cache_key, result) + + return result + + def parse_yaml_config(self, config_path: str = None) -> dict: + """ + 解析YAML配置文件 + + Args: + config_path: 配置文件路径,默认为 config/config.yaml + + Returns: + 
配置字典 + + Raises: + FileParseError: 配置文件解析错误 + """ + if config_path is None: + config_path = self.project_root / "config" / "config.yaml" + else: + config_path = Path(config_path) + + if not config_path.exists(): + raise FileParseError(str(config_path), "配置文件不存在") + + try: + with open(config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + return config_data + except Exception as e: + raise FileParseError(str(config_path), str(e)) + + def parse_frequency_words(self, words_file: str = None) -> List[Dict]: + """ + 解析关键词配置文件 + + Args: + words_file: 关键词文件路径,默认为 config/frequency_words.txt + + Returns: + 词组列表 + + Raises: + FileParseError: 文件解析错误 + """ + if words_file is None: + words_file = self.project_root / "config" / "frequency_words.txt" + else: + words_file = Path(words_file) + + if not words_file.exists(): + return [] + + word_groups = [] + + try: + with open(words_file, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + + # 使用 | 分隔符 + parts = [p.strip() for p in line.split("|")] + if not parts: + continue + + group = { + "required": [], + "normal": [], + "filter_words": [] + } + + for part in parts: + if not part: + continue + + words = [w.strip() for w in part.split(",")] + for word in words: + if not word: + continue + if word.endswith("+"): + # 必须词 + group["required"].append(word[:-1]) + elif word.endswith("!"): + # 过滤词 + group["filter_words"].append(word[:-1]) + else: + # 普通词 + group["normal"].append(word) + + if group["required"] or group["normal"]: + word_groups.append(group) + + except Exception as e: + raise FileParseError(str(words_file), str(e)) + + return word_groups diff --git a/mcp_server/tools/__init__.py b/mcp_server/tools/__init__.py new file mode 100644 index 0000000000000..699654084f259 --- /dev/null +++ b/mcp_server/tools/__init__.py @@ -0,0 +1,5 @@ +""" +MCP 工具模块 + +包含所有MCP工具的实现。 +""" diff --git a/mcp_server/tools/analytics.py b/mcp_server/tools/analytics.py new file mode 100644 index 0000000000000..e851d8bbd9529 --- /dev/null +++ b/mcp_server/tools/analytics.py @@ -0,0 +1,1996 @@ +""" +高级数据分析工具 + +提供热度趋势分析、平台对比、关键词共现、情感分析等高级分析功能。 +""" + +import re +from collections import Counter, defaultdict +from datetime import datetime, timedelta +from typing import Dict, List, Optional +from difflib import SequenceMatcher + +from ..services.data_service import DataService +from ..utils.validators import ( + validate_platforms, + validate_limit, + validate_keyword, + validate_top_n, + validate_date_range +) +from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError + + +def calculate_news_weight(news_data: Dict, rank_threshold: int = 5) -> float: + """ + 计算新闻权重(用于排序) + + 基于 main.py 的权重算法实现,综合考虑: + - 排名权重 (60%):新闻在榜单中的排名 + - 频次权重 (30%):新闻出现的次数 + - 热度权重 (10%):高排名出现的比例 + + Args: + news_data: 新闻数据字典,包含 ranks 和 count 字段 + rank_threshold: 高排名阈值,默认5 + + Returns: + 权重分数(0-100之间的浮点数) + """ + ranks = news_data.get("ranks", []) + if not ranks: + return 0.0 + + count = news_data.get("count", len(ranks)) + + # 权重配置(与 config.yaml 保持一致) + RANK_WEIGHT = 0.6 + FREQUENCY_WEIGHT = 0.3 + HOTNESS_WEIGHT = 0.1 + + # 1. 排名权重:Σ(11 - min(rank, 10)) / 出现次数 + rank_scores = [] + for rank in ranks: + score = 11 - min(rank, 10) + rank_scores.append(score) + + rank_weight = sum(rank_scores) / len(ranks) if ranks else 0 + + # 2. 频次权重:min(出现次数, 10) × 10 + frequency_weight = min(count, 10) * 10 + + # 3. 
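+ # Worked example for calculate_news_weight (rank_threshold=5):
+ #   ranks=[2, 4, 9], count=3
+ #   rank_weight      = ((11-2) + (11-4) + (11-9)) / 3 = 6.0
+ #   frequency_weight = min(3, 10) * 10 = 30
+ #   hotness_weight   = (2/3) * 100 ≈ 66.67   # ranks 2 and 4 are <= 5
+ #   total ≈ 6.0*0.6 + 30*0.3 + 66.67*0.1 ≈ 19.27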
热度加成:高排名次数 / 总出现次数 × 100 + high_rank_count = sum(1 for rank in ranks if rank <= rank_threshold) + hotness_ratio = high_rank_count / len(ranks) if ranks else 0 + hotness_weight = hotness_ratio * 100 + + # 综合权重 + total_weight = ( + rank_weight * RANK_WEIGHT + + frequency_weight * FREQUENCY_WEIGHT + + hotness_weight * HOTNESS_WEIGHT + ) + + return total_weight + + +class AnalyticsTools: + """高级数据分析工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化分析工具 + + Args: + project_root: 项目根目录 + """ + self.data_service = DataService(project_root) + + def analyze_data_insights_unified( + self, + insight_type: str = "platform_compare", + topic: Optional[str] = None, + date_range: Optional[Dict[str, str]] = None, + min_frequency: int = 3, + top_n: int = 20 + ) -> Dict: + """ + 统一数据洞察分析工具 - 整合多种数据分析模式 + + Args: + insight_type: 洞察类型,可选值: + - "platform_compare": 平台对比分析(对比不同平台对话题的关注度) + - "platform_activity": 平台活跃度统计(统计各平台发布频率和活跃时间) + - "keyword_cooccur": 关键词共现分析(分析关键词同时出现的模式) + topic: 话题关键词(可选,platform_compare模式适用) + date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + min_frequency: 最小共现频次(keyword_cooccur模式),默认3 + top_n: 返回TOP N结果(keyword_cooccur模式),默认20 + + Returns: + 数据洞察分析结果字典 + + Examples: + - analyze_data_insights_unified(insight_type="platform_compare", topic="人工智能") + - analyze_data_insights_unified(insight_type="platform_activity", date_range={...}) + - analyze_data_insights_unified(insight_type="keyword_cooccur", min_frequency=5) + """ + try: + # 参数验证 + if insight_type not in ["platform_compare", "platform_activity", "keyword_cooccur"]: + raise InvalidParameterError( + f"无效的洞察类型: {insight_type}", + suggestion="支持的类型: platform_compare, platform_activity, keyword_cooccur" + ) + + # 根据洞察类型调用相应方法 + if insight_type == "platform_compare": + return self.compare_platforms( + topic=topic, + date_range=date_range + ) + elif insight_type == "platform_activity": + return self.get_platform_activity_stats( + date_range=date_range + ) + else: # keyword_cooccur + return self.analyze_keyword_cooccurrence( + min_frequency=min_frequency, + top_n=top_n + ) + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def analyze_topic_trend_unified( + self, + topic: str, + analysis_type: str = "trend", + date_range: Optional[Dict[str, str]] = None, + granularity: str = "day", + threshold: float = 3.0, + time_window: int = 24, + lookahead_hours: int = 6, + confidence_threshold: float = 0.7 + ) -> Dict: + """ + 统一话题趋势分析工具 - 整合多种趋势分析模式 + + Args: + topic: 话题关键词(必需) + analysis_type: 分析类型,可选值: + - "trend": 热度趋势分析(追踪话题的热度变化) + - "lifecycle": 生命周期分析(从出现到消失的完整周期) + - "viral": 异常热度检测(识别突然爆火的话题) + - "predict": 话题预测(预测未来可能的热点) + date_range: 日期范围(trend和lifecycle模式),可选 + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **默认**: 不指定时默认分析最近7天 + granularity: 时间粒度(trend模式),默认"day"(hour/day) + threshold: 热度突增倍数阈值(viral模式),默认3.0 + time_window: 检测时间窗口小时数(viral模式),默认24 + lookahead_hours: 预测未来小时数(predict模式),默认6 + confidence_threshold: 置信度阈值(predict模式),默认0.7 + + Returns: + 趋势分析结果字典 + + Examples: + - analyze_topic_trend_unified(topic="人工智能", analysis_type="trend", date_range={"start": "2025-10-18", "end": "2025-10-25"}) + - analyze_topic_trend_unified(topic="特斯拉", analysis_type="lifecycle", date_range={"start": "2025-10-18", "end": "2025-10-25"}) + - analyze_topic_trend_unified(topic="比特币", analysis_type="viral", threshold=3.0) + - 
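+ # analyze_data_insights_unified above is a thin dispatcher: it validates
+ # insight_type and forwards to compare_platforms /
+ # get_platform_activity_stats / analyze_keyword_cooccurrence, wrapping any
+ # MCPError into the standard {"success": False, "error": ...} envelope.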
analyze_topic_trend_unified(topic="ChatGPT", analysis_type="predict", lookahead_hours=6) + """ + try: + # 参数验证 + topic = validate_keyword(topic) + + if analysis_type not in ["trend", "lifecycle", "viral", "predict"]: + raise InvalidParameterError( + f"无效的分析类型: {analysis_type}", + suggestion="支持的类型: trend, lifecycle, viral, predict" + ) + + # 根据分析类型调用相应方法 + if analysis_type == "trend": + return self.get_topic_trend_analysis( + topic=topic, + date_range=date_range, + granularity=granularity + ) + elif analysis_type == "lifecycle": + return self.analyze_topic_lifecycle( + topic=topic, + date_range=date_range + ) + elif analysis_type == "viral": + # viral模式不需要topic参数,使用通用检测 + return self.detect_viral_topics( + threshold=threshold, + time_window=time_window + ) + else: # predict + # predict模式不需要topic参数,使用通用预测 + return self.predict_trending_topics( + lookahead_hours=lookahead_hours, + confidence_threshold=confidence_threshold + ) + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def get_topic_trend_analysis( + self, + topic: str, + date_range: Optional[Dict[str, str]] = None, + granularity: str = "day" + ) -> Dict: + """ + 热度趋势分析 - 追踪特定话题的热度变化趋势 + + Args: + topic: 话题关键词 + date_range: 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **默认**: 不指定时默认分析最近7天 + granularity: 时间粒度,仅支持 day(天) + + Returns: + 趋势分析结果字典 + + Examples: + 用户询问示例: + - "帮我分析一下'人工智能'这个话题最近一周的热度趋势" + - "查看'比特币'过去一周的热度变化" + - "看看'iPhone'最近7天的趋势如何" + - "分析'特斯拉'最近一个月的热度趋势" + - "查看'ChatGPT'2024年12月的趋势变化" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> # 分析7天趋势 + >>> result = tools.get_topic_trend_analysis( + ... topic="人工智能", + ... date_range={"start": "2025-10-18", "end": "2025-10-25"}, + ... granularity="day" + ... ) + >>> # 分析历史月份趋势 + >>> result = tools.get_topic_trend_analysis( + ... topic="特斯拉", + ... date_range={"start": "2024-12-01", "end": "2024-12-31"}, + ... granularity="day" + ... 
) + >>> print(result['trend_data']) + """ + try: + # 验证参数 + topic = validate_keyword(topic) + + # 验证粒度参数(只支持day) + if granularity != "day": + from ..utils.errors import InvalidParameterError + raise InvalidParameterError( + f"不支持的粒度参数: {granularity}", + suggestion="当前仅支持 'day' 粒度,因为底层数据按天聚合" + ) + + # 处理日期范围(不指定时默认最近7天) + if date_range: + from ..utils.validators import validate_date_range + date_range_tuple = validate_date_range(date_range) + start_date, end_date = date_range_tuple + else: + # 默认最近7天 + end_date = datetime.now() + start_date = end_date - timedelta(days=6) + + # 收集趋势数据 + trend_data = [] + current_date = start_date + + while current_date <= end_date: + try: + all_titles, _, _ = self.data_service.parser.read_all_titles_for_date( + date=current_date + ) + + # 统计该时间点的话题出现次数 + count = 0 + matched_titles = [] + + for _, titles in all_titles.items(): + for title in titles.keys(): + if topic.lower() in title.lower(): + count += 1 + matched_titles.append(title) + + trend_data.append({ + "date": current_date.strftime("%Y-%m-%d"), + "count": count, + "sample_titles": matched_titles[:3] # 只保留前3个样本 + }) + + except DataNotFoundError: + trend_data.append({ + "date": current_date.strftime("%Y-%m-%d"), + "count": 0, + "sample_titles": [] + }) + + # 按天增加时间 + current_date += timedelta(days=1) + + # 计算趋势指标 + counts = [item["count"] for item in trend_data] + total_days = (end_date - start_date).days + 1 + + if len(counts) >= 2: + # 计算涨跌幅度 + first_non_zero = next((c for c in counts if c > 0), 0) + last_count = counts[-1] + + if first_non_zero > 0: + change_rate = ((last_count - first_non_zero) / first_non_zero) * 100 + else: + change_rate = 0 + + # 找到峰值时间 + max_count = max(counts) + peak_index = counts.index(max_count) + peak_time = trend_data[peak_index]["date"] + else: + change_rate = 0 + peak_time = None + max_count = 0 + + return { + "success": True, + "topic": topic, + "date_range": { + "start": start_date.strftime("%Y-%m-%d"), + "end": end_date.strftime("%Y-%m-%d"), + "total_days": total_days + }, + "granularity": granularity, + "trend_data": trend_data, + "statistics": { + "total_mentions": sum(counts), + "average_mentions": round(sum(counts) / len(counts), 2) if counts else 0, + "peak_count": max_count, + "peak_time": peak_time, + "change_rate": round(change_rate, 2) + }, + "trend_direction": "上升" if change_rate > 10 else "下降" if change_rate < -10 else "稳定" + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def compare_platforms( + self, + topic: Optional[str] = None, + date_range: Optional[Dict[str, str]] = None + ) -> Dict: + """ + 平台对比分析 - 对比不同平台对同一话题的关注度 + + Args: + topic: 话题关键词(可选,不指定则对比整体活跃度) + date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + + Returns: + 平台对比分析结果 + + Examples: + 用户询问示例: + - "对比一下各个平台对'人工智能'话题的关注度" + - "看看知乎和微博哪个平台更关注科技新闻" + - "分析各平台今天的热点分布" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.compare_platforms( + ... topic="人工智能", + ... date_range={"start": "2025-10-01", "end": "2025-10-11"} + ... 
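+ # change_rate as computed above compares the first non-zero count with the
+ # last count, e.g.:
+ #   counts=[0, 5, 9, 12] -> ((12 - 5) / 5) * 100 = 140.0
+ #   -> trend_direction "上升" (change_rate > 10)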
) + >>> print(result['platform_stats']) + """ + try: + # 参数验证 + if topic: + topic = validate_keyword(topic) + date_range_tuple = validate_date_range(date_range) + + # 确定日期范围 + if date_range_tuple: + start_date, end_date = date_range_tuple + else: + start_date = end_date = datetime.now() + + # 收集各平台数据 + platform_stats = defaultdict(lambda: { + "total_news": 0, + "topic_mentions": 0, + "unique_titles": set(), + "top_keywords": Counter() + }) + + # 遍历日期范围 + current_date = start_date + while current_date <= end_date: + try: + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date( + date=current_date + ) + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title in titles.keys(): + platform_stats[platform_name]["total_news"] += 1 + platform_stats[platform_name]["unique_titles"].add(title) + + # 如果指定了话题,统计包含话题的新闻 + if topic and topic.lower() in title.lower(): + platform_stats[platform_name]["topic_mentions"] += 1 + + # 提取关键词(简单分词) + keywords = self._extract_keywords(title) + platform_stats[platform_name]["top_keywords"].update(keywords) + + except DataNotFoundError: + pass + + current_date += timedelta(days=1) + + # 转换为可序列化的格式 + result_stats = {} + for platform, stats in platform_stats.items(): + coverage_rate = 0 + if stats["total_news"] > 0: + coverage_rate = (stats["topic_mentions"] / stats["total_news"]) * 100 + + result_stats[platform] = { + "total_news": stats["total_news"], + "topic_mentions": stats["topic_mentions"], + "unique_titles": len(stats["unique_titles"]), + "coverage_rate": round(coverage_rate, 2), + "top_keywords": [ + {"keyword": k, "count": v} + for k, v in stats["top_keywords"].most_common(5) + ] + } + + # 找出各平台独有的热点 + unique_topics = self._find_unique_topics(platform_stats) + + return { + "success": True, + "topic": topic, + "date_range": { + "start": start_date.strftime("%Y-%m-%d"), + "end": end_date.strftime("%Y-%m-%d") + }, + "platform_stats": result_stats, + "unique_topics": unique_topics, + "total_platforms": len(result_stats) + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def analyze_keyword_cooccurrence( + self, + min_frequency: int = 3, + top_n: int = 20 + ) -> Dict: + """ + 关键词共现分析 - 分析哪些关键词经常同时出现 + + Args: + min_frequency: 最小共现频次 + top_n: 返回TOP N关键词对 + + Returns: + 关键词共现分析结果 + + Examples: + 用户询问示例: + - "分析一下哪些关键词经常一起出现" + - "看看'人工智能'经常和哪些词一起出现" + - "找出今天新闻中的关键词关联" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.analyze_keyword_cooccurrence( + ... min_frequency=5, + ... top_n=15 + ... 
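+ # coverage_rate above = topic_mentions / total_news * 100, e.g. a platform
+ # with 12 topic hits out of 200 titles scores 6.0 (percent).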
) + >>> print(result['cooccurrence_pairs']) + """ + try: + # 参数验证 + min_frequency = validate_limit(min_frequency, default=3, max_limit=100) + top_n = validate_top_n(top_n, default=20) + + # 读取今天的数据 + all_titles, _, _ = self.data_service.parser.read_all_titles_for_date() + + # 关键词共现统计 + cooccurrence = Counter() + keyword_titles = defaultdict(list) + + for platform_id, titles in all_titles.items(): + for title in titles.keys(): + # 提取关键词 + keywords = self._extract_keywords(title) + + # 记录每个关键词出现的标题 + for kw in keywords: + keyword_titles[kw].append(title) + + # 计算两两共现 + if len(keywords) >= 2: + for i, kw1 in enumerate(keywords): + for kw2 in keywords[i+1:]: + # 统一排序,避免重复 + pair = tuple(sorted([kw1, kw2])) + cooccurrence[pair] += 1 + + # 过滤低频共现 + filtered_pairs = [ + (pair, count) for pair, count in cooccurrence.items() + if count >= min_frequency + ] + + # 排序并取TOP N + top_pairs = sorted(filtered_pairs, key=lambda x: x[1], reverse=True)[:top_n] + + # 构建结果 + result_pairs = [] + for (kw1, kw2), count in top_pairs: + # 找出同时包含两个关键词的标题样本 + titles_with_both = [ + title for title in keyword_titles[kw1] + if kw2 in self._extract_keywords(title) + ] + + result_pairs.append({ + "keyword1": kw1, + "keyword2": kw2, + "cooccurrence_count": count, + "sample_titles": titles_with_both[:3] + }) + + return { + "success": True, + "cooccurrence_pairs": result_pairs, + "total_pairs": len(result_pairs), + "min_frequency": min_frequency, + "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def analyze_sentiment( + self, + topic: Optional[str] = None, + platforms: Optional[List[str]] = None, + date_range: Optional[Dict[str, str]] = None, + limit: int = 50, + sort_by_weight: bool = True, + include_url: bool = False + ) -> Dict: + """ + 情感倾向分析 - 生成用于 AI 情感分析的结构化提示词 + + 本工具收集新闻数据并生成优化的 AI 提示词,你可以将其发送给 AI 进行深度情感分析。 + + Args: + topic: 话题关键词(可选),只分析包含该关键词的新闻 + platforms: 平台过滤列表(可选),如 ['zhihu', 'weibo'] + date_range: 日期范围(可选),格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + 不指定则默认查询今天的数据 + limit: 返回新闻数量限制,默认50,最大100 + sort_by_weight: 是否按权重排序,默认True(推荐) + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 包含 AI 提示词和新闻数据的结构化结果 + + Examples: + 用户询问示例: + - "分析一下今天新闻的情感倾向" + - "看看'特斯拉'相关新闻是正面还是负面的" + - "分析各平台对'人工智能'的情感态度" + - "看看'特斯拉'相关新闻是正面还是负面的,请选择一周内的前10条新闻来分析" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> # 分析今天的特斯拉新闻,返回前10条 + >>> result = tools.analyze_sentiment( + ... topic="特斯拉", + ... limit=10 + ... ) + >>> # 分析一周内的特斯拉新闻,返回前10条按权重排序 + >>> result = tools.analyze_sentiment( + ... topic="特斯拉", + ... date_range={"start": "2025-10-06", "end": "2025-10-13"}, + ... limit=10 + ... 
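+ # Equivalent pair counting to the nested loop above, using
+ # itertools.combinations (keyword lists are illustrative):
+ from collections import Counter
+ from itertools import combinations
+ pairs = Counter()
+ for kws in (["特斯拉", "降价", "电动车"], ["特斯拉", "降价"]):
+     for a, b in combinations(kws, 2):
+         pairs[tuple(sorted((a, b)))] += 1  # sort so (a, b) == (b, a)
+ # the (特斯拉, 降价) pair now has count 2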
) + >>> print(result['ai_prompt']) # 获取生成的提示词 + """ + try: + # 参数验证 + if topic: + topic = validate_keyword(topic) + platforms = validate_platforms(platforms) + limit = validate_limit(limit, default=50) + + # 处理日期范围 + if date_range: + date_range_tuple = validate_date_range(date_range) + start_date, end_date = date_range_tuple + else: + # 默认今天 + start_date = end_date = datetime.now() + + # 收集新闻数据(支持多天) + all_news_items = [] + current_date = start_date + + while current_date <= end_date: + try: + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date( + date=current_date, + platform_ids=platforms + ) + + # 收集该日期的新闻 + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + for title, info in titles.items(): + # 如果指定了话题,只收集包含话题的标题 + if topic and topic.lower() not in title.lower(): + continue + + news_item = { + "platform": platform_name, + "title": title, + "ranks": info.get("ranks", []), + "count": len(info.get("ranks", [])), + "date": current_date.strftime("%Y-%m-%d") + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + all_news_items.append(news_item) + + except DataNotFoundError: + # 该日期没有数据,继续下一天 + pass + + # 下一天 + current_date += timedelta(days=1) + + if not all_news_items: + time_desc = "今天" if start_date == end_date else f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + raise DataNotFoundError( + f"未找到相关新闻({time_desc})", + suggestion="请尝试其他话题、日期范围或平台" + ) + + # 去重(同一标题只保留一次) + unique_news = {} + for item in all_news_items: + key = f"{item['platform']}::{item['title']}" + if key not in unique_news: + unique_news[key] = item + else: + # 合并 ranks(如果同一新闻在多天出现) + existing = unique_news[key] + existing["ranks"].extend(item["ranks"]) + existing["count"] = len(existing["ranks"]) + + deduplicated_news = list(unique_news.values()) + + # 按权重排序(如果启用) + if sort_by_weight: + deduplicated_news.sort( + key=lambda x: calculate_news_weight(x), + reverse=True + ) + + # 限制返回数量 + selected_news = deduplicated_news[:limit] + + # 生成 AI 提示词 + ai_prompt = self._create_sentiment_analysis_prompt( + news_data=selected_news, + topic=topic + ) + + # 构建时间范围描述 + if start_date == end_date: + time_range_desc = start_date.strftime("%Y-%m-%d") + else: + time_range_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + + result = { + "success": True, + "method": "ai_prompt_generation", + "summary": { + "total_found": len(deduplicated_news), + "returned_count": len(selected_news), + "requested_limit": limit, + "duplicates_removed": len(all_news_items) - len(deduplicated_news), + "topic": topic, + "time_range": time_range_desc, + "platforms": list(set(item["platform"] for item in selected_news)), + "sorted_by_weight": sort_by_weight + }, + "ai_prompt": ai_prompt, + "news_sample": selected_news, + "usage_note": "请将 ai_prompt 字段的内容发送给 AI 进行情感分析" + } + + # 如果返回数量少于请求数量,增加提示 + if len(selected_news) < limit and len(deduplicated_news) >= limit: + result["note"] = "返回数量少于请求数量是因为去重逻辑(同一标题在不同平台只保留一次)" + elif len(deduplicated_news) < limit: + result["note"] = f"在指定时间范围内仅找到 {len(deduplicated_news)} 条匹配的新闻" + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def _create_sentiment_analysis_prompt( + self, + news_data: List[Dict], + topic: Optional[str] + ) 
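+ # Deduplication above keys on "platform::title"; when the same title
+ # appears on several days, the ranks lists are merged and count is
+ # recomputed as len(merged ranks) before weight sorting.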
-> str: + """ + 创建情感分析的 AI 提示词 + + Args: + news_data: 新闻数据列表(已排序和限制数量) + topic: 话题关键词 + + Returns: + 格式化的 AI 提示词 + """ + # 按平台分组 + platform_news = defaultdict(list) + for item in news_data: + platform_news[item["platform"]].append({ + "title": item["title"], + "date": item.get("date", "") + }) + + # 构建提示词 + prompt_parts = [] + + # 1. 任务说明 + if topic: + prompt_parts.append(f"请分析以下关于「{topic}」的新闻标题的情感倾向。") + else: + prompt_parts.append("请分析以下新闻标题的情感倾向。") + + prompt_parts.append("") + prompt_parts.append("分析要求:") + prompt_parts.append("1. 识别每条新闻的情感倾向(正面/负面/中性)") + prompt_parts.append("2. 统计各情感类别的数量和百分比") + prompt_parts.append("3. 分析不同平台的情感差异") + prompt_parts.append("4. 总结整体情感趋势") + prompt_parts.append("5. 列举典型的正面和负面新闻样本") + prompt_parts.append("") + + # 2. 数据概览 + prompt_parts.append(f"数据概览:") + prompt_parts.append(f"- 总新闻数:{len(news_data)}") + prompt_parts.append(f"- 覆盖平台:{len(platform_news)}") + + # 时间范围 + dates = set(item.get("date", "") for item in news_data if item.get("date")) + if dates: + date_list = sorted(dates) + if len(date_list) == 1: + prompt_parts.append(f"- 时间范围:{date_list[0]}") + else: + prompt_parts.append(f"- 时间范围:{date_list[0]} 至 {date_list[-1]}") + + prompt_parts.append("") + + # 3. 按平台展示新闻 + prompt_parts.append("新闻列表(按平台分类,已按重要性排序):") + prompt_parts.append("") + + for platform, items in sorted(platform_news.items()): + prompt_parts.append(f"【{platform}】({len(items)} 条)") + for i, item in enumerate(items, 1): + title = item["title"] + date_str = f" [{item['date']}]" if item.get("date") else "" + prompt_parts.append(f"{i}. {title}{date_str}") + prompt_parts.append("") + + # 4. 输出格式说明 + prompt_parts.append("请按以下格式输出分析结果:") + prompt_parts.append("") + prompt_parts.append("## 情感分布统计") + prompt_parts.append("- 正面:XX条 (XX%)") + prompt_parts.append("- 负面:XX条 (XX%)") + prompt_parts.append("- 中性:XX条 (XX%)") + prompt_parts.append("") + prompt_parts.append("## 平台情感对比") + prompt_parts.append("[各平台的情感倾向差异]") + prompt_parts.append("") + prompt_parts.append("## 整体情感趋势") + prompt_parts.append("[总体分析和关键发现]") + prompt_parts.append("") + prompt_parts.append("## 典型样本") + prompt_parts.append("正面新闻样本:") + prompt_parts.append("[列举3-5条]") + prompt_parts.append("") + prompt_parts.append("负面新闻样本:") + prompt_parts.append("[列举3-5条]") + + return "\n".join(prompt_parts) + + def find_similar_news( + self, + reference_title: str, + threshold: float = 0.6, + limit: int = 50, + include_url: bool = False + ) -> Dict: + """ + 相似新闻查找 - 基于标题相似度查找相关新闻 + + Args: + reference_title: 参考标题 + threshold: 相似度阈值(0-1之间) + limit: 返回条数限制,默认50 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 相似新闻列表 + + Examples: + 用户询问示例: + - "找出和'特斯拉降价'相似的新闻" + - "查找关于iPhone发布的类似报道" + - "看看有没有和这条新闻相似的报道" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.find_similar_news( + ... reference_title="特斯拉宣布降价", + ... threshold=0.6, + ... limit=10 + ... 
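+ # _calculate_similarity is not shown in this hunk; given the
+ # SequenceMatcher import at the top of this module, an equivalent
+ # standalone helper would presumably look like:
+ from difflib import SequenceMatcher
+ def title_similarity(a: str, b: str) -> float:
+     # ratio() returns 2*M/T over matching blocks, always in [0, 1]
+     return SequenceMatcher(None, a, b).ratio()
+ # title_similarity("特斯拉宣布降价", "特斯拉官宣降价") ≈ 0.86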
) + >>> print(result['similar_news']) + """ + try: + # 参数验证 + reference_title = validate_keyword(reference_title) + + if not 0 <= threshold <= 1: + raise InvalidParameterError( + "threshold 必须在 0 到 1 之间", + suggestion="推荐值:0.5-0.8" + ) + + limit = validate_limit(limit, default=50) + + # 读取数据 + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date() + + # 计算相似度 + similar_items = [] + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + if title == reference_title: + continue + + # 计算相似度 + similarity = self._calculate_similarity(reference_title, title) + + if similarity >= threshold: + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "similarity": round(similarity, 3), + "rank": info["ranks"][0] if info["ranks"] else 0 + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + + similar_items.append(news_item) + + # 按相似度排序 + similar_items.sort(key=lambda x: x["similarity"], reverse=True) + + # 限制数量 + result_items = similar_items[:limit] + + if not result_items: + raise DataNotFoundError( + f"未找到相似度超过 {threshold} 的新闻", + suggestion="请降低相似度阈值或尝试其他标题" + ) + + result = { + "success": True, + "summary": { + "total_found": len(similar_items), + "returned_count": len(result_items), + "requested_limit": limit, + "threshold": threshold, + "reference_title": reference_title + }, + "similar_news": result_items + } + + if len(similar_items) < limit: + result["note"] = f"相似度阈值 {threshold} 下仅找到 {len(similar_items)} 条相似新闻" + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def search_by_entity( + self, + entity: str, + entity_type: Optional[str] = None, + limit: int = 50, + sort_by_weight: bool = True + ) -> Dict: + """ + 实体识别搜索 - 搜索包含特定人物/地点/机构的新闻 + + Args: + entity: 实体名称 + entity_type: 实体类型(person/location/organization),可选 + limit: 返回条数限制,默认50,最大200 + sort_by_weight: 是否按权重排序,默认True + + Returns: + 实体相关新闻列表 + + Examples: + 用户询问示例: + - "搜索马斯克相关的新闻" + - "查找关于特斯拉公司的报道,返回前20条" + - "看看北京有什么新闻" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.search_by_entity( + ... entity="马斯克", + ... entity_type="person", + ... limit=20 + ... 
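+ # Results above come back sorted by similarity (descending); thresholds
+ # around 0.5-0.8 are the suggested working range, and values outside
+ # [0, 1] are rejected during validation.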
) + >>> print(result['related_news']) + """ + try: + # 参数验证 + entity = validate_keyword(entity) + limit = validate_limit(limit, default=50) + + if entity_type and entity_type not in ["person", "location", "organization"]: + raise InvalidParameterError( + f"无效的实体类型: {entity_type}", + suggestion="支持的类型: person, location, organization" + ) + + # 读取数据 + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date() + + # 搜索包含实体的新闻 + related_news = [] + entity_context = Counter() # 统计实体周边的词 + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + if entity in title: + url = info.get("url", "") + mobile_url = info.get("mobileUrl", "") + ranks = info.get("ranks", []) + count = len(ranks) + + related_news.append({ + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "url": url, + "mobileUrl": mobile_url, + "ranks": ranks, + "count": count, + "rank": ranks[0] if ranks else 999 + }) + + # 提取实体周边的关键词 + keywords = self._extract_keywords(title) + entity_context.update(keywords) + + if not related_news: + raise DataNotFoundError( + f"未找到包含实体 '{entity}' 的新闻", + suggestion="请尝试其他实体名称" + ) + + # 移除实体本身 + if entity in entity_context: + del entity_context[entity] + + # 按权重排序(如果启用) + if sort_by_weight: + related_news.sort( + key=lambda x: calculate_news_weight(x), + reverse=True + ) + else: + # 按排名排序 + related_news.sort(key=lambda x: x["rank"]) + + # 限制返回数量 + result_news = related_news[:limit] + + return { + "success": True, + "entity": entity, + "entity_type": entity_type or "auto", + "related_news": result_news, + "total_found": len(related_news), + "returned_count": len(result_news), + "sorted_by_weight": sort_by_weight, + "related_keywords": [ + {"keyword": k, "count": v} + for k, v in entity_context.most_common(10) + ] + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def generate_summary_report( + self, + report_type: str = "daily", + date_range: Optional[Dict[str, str]] = None + ) -> Dict: + """ + 每日/每周摘要生成器 - 自动生成热点摘要报告 + + Args: + report_type: 报告类型(daily/weekly) + date_range: 自定义日期范围(可选) + + Returns: + Markdown格式的摘要报告 + + Examples: + 用户询问示例: + - "生成今天的新闻摘要报告" + - "给我一份本周的热点总结" + - "生成过去7天的新闻分析报告" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.generate_summary_report( + ... report_type="daily" + ... 
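+ # entity_context above counts keywords that co-occur with the entity in
+ # matching titles; the entity itself is dropped and the top 10 are
+ # returned as related_keywords.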
) + >>> print(result['markdown_report']) + """ + try: + # 参数验证 + if report_type not in ["daily", "weekly"]: + raise InvalidParameterError( + f"无效的报告类型: {report_type}", + suggestion="支持的类型: daily, weekly" + ) + + # 确定日期范围 + if date_range: + date_range_tuple = validate_date_range(date_range) + start_date, end_date = date_range_tuple + else: + if report_type == "daily": + start_date = end_date = datetime.now() + else: # weekly + end_date = datetime.now() + start_date = end_date - timedelta(days=6) + + # 收集数据 + all_keywords = Counter() + all_platforms_news = defaultdict(int) + all_titles_list = [] + + current_date = start_date + while current_date <= end_date: + try: + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date( + date=current_date + ) + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + all_platforms_news[platform_name] += len(titles) + + for title in titles.keys(): + all_titles_list.append({ + "title": title, + "platform": platform_name, + "date": current_date.strftime("%Y-%m-%d") + }) + + # 提取关键词 + keywords = self._extract_keywords(title) + all_keywords.update(keywords) + + except DataNotFoundError: + pass + + current_date += timedelta(days=1) + + # 生成报告 + report_title = f"{'每日' if report_type == 'daily' else '每周'}新闻热点摘要" + date_str = f"{start_date.strftime('%Y-%m-%d')}" if report_type == "daily" else f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + + # 构建Markdown报告 + markdown = f"""# {report_title} + +**报告日期**: {date_str} +**生成时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + +--- + +## 📊 数据概览 + +- **总新闻数**: {len(all_titles_list)} +- **覆盖平台**: {len(all_platforms_news)} +- **热门关键词数**: {len(all_keywords)} + +## 🔥 TOP 10 热门话题 + +""" + + # 添加TOP 10关键词 + for i, (keyword, count) in enumerate(all_keywords.most_common(10), 1): + markdown += f"{i}. 
**{keyword}** - 出现 {count} 次\n" + + # 平台分析 + markdown += "\n## 📱 平台活跃度\n\n" + sorted_platforms = sorted(all_platforms_news.items(), key=lambda x: x[1], reverse=True) + + for platform, count in sorted_platforms: + markdown += f"- **{platform}**: {count} 条新闻\n" + + # 趋势变化(如果是周报) + if report_type == "weekly": + markdown += "\n## 📈 趋势分析\n\n" + markdown += "本周热度持续的话题(样本数据):\n\n" + + # 简单的趋势分析 + top_keywords = [kw for kw, _ in all_keywords.most_common(5)] + for keyword in top_keywords: + markdown += f"- **{keyword}**: 持续热门\n" + + # 添加样本新闻(按权重选择,确保确定性) + markdown += "\n## 📰 精选新闻样本\n\n" + + # 确定性选取:按标题的权重排序,取前5条 + # 这样相同输入总是返回相同结果 + if all_titles_list: + # 计算每条新闻的权重分数(基于关键词出现次数) + news_with_scores = [] + for news in all_titles_list: + # 简单权重:统计包含TOP关键词的次数 + score = 0 + title_lower = news['title'].lower() + for keyword, count in all_keywords.most_common(10): + if keyword.lower() in title_lower: + score += count + news_with_scores.append((news, score)) + + # 按权重降序排序,权重相同则按标题字母顺序(确保确定性) + news_with_scores.sort(key=lambda x: (-x[1], x[0]['title'])) + + # 取前5条 + sample_news = [item[0] for item in news_with_scores[:5]] + + for news in sample_news: + markdown += f"- [{news['platform']}] {news['title']}\n" + + markdown += "\n---\n\n*本报告由 TrendRadar MCP 自动生成*\n" + + return { + "success": True, + "report_type": report_type, + "date_range": { + "start": start_date.strftime("%Y-%m-%d"), + "end": end_date.strftime("%Y-%m-%d") + }, + "markdown_report": markdown, + "statistics": { + "total_news": len(all_titles_list), + "platforms_count": len(all_platforms_news), + "keywords_count": len(all_keywords), + "top_keyword": all_keywords.most_common(1)[0] if all_keywords else None + } + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def get_platform_activity_stats( + self, + date_range: Optional[Dict[str, str]] = None + ) -> Dict: + """ + 平台活跃度统计 - 统计各平台的发布频率和活跃时间段 + + Args: + date_range: 日期范围(可选) + + Returns: + 平台活跃度统计结果 + + Examples: + 用户询问示例: + - "统计各平台今天的活跃度" + - "看看哪个平台更新最频繁" + - "分析各平台的发布时间规律" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.get_platform_activity_stats( + ... date_range={"start": "2025-10-01", "end": "2025-10-11"} + ... 
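+ # Sample selection above is deterministic: each title is scored by the
+ # summed counts of the TOP-10 keywords it contains, ties are broken by
+ # title order, and the first 5 are listed.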
) + >>> print(result['platform_activity']) + """ + try: + # 参数验证 + date_range_tuple = validate_date_range(date_range) + + # 确定日期范围 + if date_range_tuple: + start_date, end_date = date_range_tuple + else: + start_date = end_date = datetime.now() + + # 统计各平台活跃度 + platform_activity = defaultdict(lambda: { + "total_updates": 0, + "days_active": set(), + "news_count": 0, + "hourly_distribution": Counter() + }) + + # 遍历日期范围 + current_date = start_date + while current_date <= end_date: + try: + all_titles, id_to_name, timestamps = self.data_service.parser.read_all_titles_for_date( + date=current_date + ) + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + platform_activity[platform_name]["news_count"] += len(titles) + platform_activity[platform_name]["days_active"].add(current_date.strftime("%Y-%m-%d")) + + # 统计更新次数(基于文件数量) + platform_activity[platform_name]["total_updates"] += len(timestamps) + + # 统计时间分布(基于文件名中的时间) + for filename in timestamps.keys(): + # 解析文件名中的小时(格式:HHMM.txt) + match = re.match(r'(\d{2})(\d{2})\.txt', filename) + if match: + hour = int(match.group(1)) + platform_activity[platform_name]["hourly_distribution"][hour] += 1 + + except DataNotFoundError: + pass + + current_date += timedelta(days=1) + + # 转换为可序列化的格式 + result_activity = {} + for platform, stats in platform_activity.items(): + days_count = len(stats["days_active"]) + avg_news_per_day = stats["news_count"] / days_count if days_count > 0 else 0 + + # 找出最活跃的时间段 + most_active_hours = stats["hourly_distribution"].most_common(3) + + result_activity[platform] = { + "total_updates": stats["total_updates"], + "news_count": stats["news_count"], + "days_active": days_count, + "avg_news_per_day": round(avg_news_per_day, 2), + "most_active_hours": [ + {"hour": f"{hour:02d}:00", "count": count} + for hour, count in most_active_hours + ], + "activity_score": round(stats["news_count"] / max(days_count, 1), 2) + } + + # 按活跃度排序 + sorted_platforms = sorted( + result_activity.items(), + key=lambda x: x[1]["activity_score"], + reverse=True + ) + + return { + "success": True, + "date_range": { + "start": start_date.strftime("%Y-%m-%d"), + "end": end_date.strftime("%Y-%m-%d") + }, + "platform_activity": dict(sorted_platforms), + "most_active_platform": sorted_platforms[0][0] if sorted_platforms else None, + "total_platforms": len(result_activity) + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def analyze_topic_lifecycle( + self, + topic: str, + date_range: Optional[Dict[str, str]] = None + ) -> Dict: + """ + 话题生命周期分析 - 追踪话题从出现到消失的完整周期 + + Args: + topic: 话题关键词 + date_range: 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **默认**: 不指定时默认分析最近7天 + + Returns: + 话题生命周期分析结果 + + Examples: + 用户询问示例: + - "分析'人工智能'这个话题的生命周期" + - "看看'iPhone'话题是昙花一现还是持续热点" + - "追踪'比特币'话题的热度变化" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.analyze_topic_lifecycle( + ... topic="人工智能", + ... date_range={"start": "2025-10-18", "end": "2025-10-25"} + ... 
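+ # Hourly activity is derived from txt filenames in HHMM.txt form, e.g.
+ #   "0930.txt" -> hour 9 -> reported as {"hour": "09:00", "count": ...}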
) + >>> print(result['lifecycle_stage']) + """ + try: + # 参数验证 + topic = validate_keyword(topic) + + # 处理日期范围(不指定时默认最近7天) + if date_range: + from ..utils.validators import validate_date_range + date_range_tuple = validate_date_range(date_range) + start_date, end_date = date_range_tuple + else: + # 默认最近7天 + end_date = datetime.now() + start_date = end_date - timedelta(days=6) + + # 收集话题历史数据 + lifecycle_data = [] + current_date = start_date + while current_date <= end_date: + try: + all_titles, _, _ = self.data_service.parser.read_all_titles_for_date( + date=current_date + ) + + # 统计该日的话题出现次数 + count = 0 + for _, titles in all_titles.items(): + for title in titles.keys(): + if topic.lower() in title.lower(): + count += 1 + + lifecycle_data.append({ + "date": current_date.strftime("%Y-%m-%d"), + "count": count + }) + + except DataNotFoundError: + lifecycle_data.append({ + "date": current_date.strftime("%Y-%m-%d"), + "count": 0 + }) + + current_date += timedelta(days=1) + + # 计算分析天数 + total_days = (end_date - start_date).days + 1 + + # 分析生命周期阶段 + counts = [item["count"] for item in lifecycle_data] + + if not any(counts): + time_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + raise DataNotFoundError( + f"在 {time_desc} 内未找到话题 '{topic}'", + suggestion="请尝试其他话题或扩大时间范围" + ) + + # 找到首次出现和最后出现 + first_appearance = next((item["date"] for item in lifecycle_data if item["count"] > 0), None) + last_appearance = next((item["date"] for item in reversed(lifecycle_data) if item["count"] > 0), None) + + # 计算峰值 + max_count = max(counts) + peak_index = counts.index(max_count) + peak_date = lifecycle_data[peak_index]["date"] + + # 计算平均值和标准差(简单实现) + non_zero_counts = [c for c in counts if c > 0] + avg_count = sum(non_zero_counts) / len(non_zero_counts) if non_zero_counts else 0 + + # 判断生命周期阶段 + recent_counts = counts[-3:] # 最近3天 + early_counts = counts[:3] # 前3天 + + if sum(recent_counts) > sum(early_counts): + lifecycle_stage = "上升期" + elif sum(recent_counts) < sum(early_counts) * 0.5: + lifecycle_stage = "衰退期" + elif max_count in recent_counts: + lifecycle_stage = "爆发期" + else: + lifecycle_stage = "稳定期" + + # 分类:昙花一现 vs 持续热点 + active_days = sum(1 for c in counts if c > 0) + + if active_days <= 2 and max_count > avg_count * 2: + topic_type = "昙花一现" + elif active_days >= total_days * 0.6: + topic_type = "持续热点" + else: + topic_type = "周期性热点" + + return { + "success": True, + "topic": topic, + "date_range": { + "start": start_date.strftime("%Y-%m-%d"), + "end": end_date.strftime("%Y-%m-%d"), + "total_days": total_days + }, + "lifecycle_data": lifecycle_data, + "analysis": { + "first_appearance": first_appearance, + "last_appearance": last_appearance, + "peak_date": peak_date, + "peak_count": max_count, + "active_days": active_days, + "avg_daily_mentions": round(avg_count, 2), + "lifecycle_stage": lifecycle_stage, + "topic_type": topic_type + } + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def detect_viral_topics( + self, + threshold: float = 3.0, + time_window: int = 24 + ) -> Dict: + """ + 异常热度检测 - 自动识别突然爆火的话题 + + Args: + threshold: 热度突增倍数阈值 + time_window: 检测时间窗口(小时) + + Returns: + 爆火话题列表 + + Examples: + 用户询问示例: + - "检测今天有哪些突然爆火的话题" + - "看看有没有热度异常的新闻" + - "预警可能的重大事件" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.detect_viral_topics( + ... threshold=3.0, + ... time_window=24 + ... 
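+ # Worked example for the stage heuristics above:
+ #   counts=[0, 1, 2, 2, 6, 7, 9] over 7 days
+ #   recent 3-day sum 22 > early 3-day sum 3 -> lifecycle_stage "上升期"
+ #   active_days=6 >= 7*0.6 -> topic_type "持续热点"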
) + >>> print(result['viral_topics']) + """ + try: + # 参数验证 + if threshold < 1.0: + raise InvalidParameterError( + "threshold 必须大于等于 1.0", + suggestion="推荐值:2.0-5.0" + ) + + time_window = validate_limit(time_window, default=24, max_limit=72) + + # 读取当前和之前的数据 + current_all_titles, _, _ = self.data_service.parser.read_all_titles_for_date() + + # 读取昨天的数据作为基准 + yesterday = datetime.now() - timedelta(days=1) + try: + previous_all_titles, _, _ = self.data_service.parser.read_all_titles_for_date( + date=yesterday + ) + except DataNotFoundError: + previous_all_titles = {} + + # 统计当前的关键词频率 + current_keywords = Counter() + current_keyword_titles = defaultdict(list) + + for _, titles in current_all_titles.items(): + for title in titles.keys(): + keywords = self._extract_keywords(title) + current_keywords.update(keywords) + + for kw in keywords: + current_keyword_titles[kw].append(title) + + # 统计之前的关键词频率 + previous_keywords = Counter() + + for _, titles in previous_all_titles.items(): + for title in titles.keys(): + keywords = self._extract_keywords(title) + previous_keywords.update(keywords) + + # 检测异常热度 + viral_topics = [] + + for keyword, current_count in current_keywords.items(): + previous_count = previous_keywords.get(keyword, 0) + + # 计算增长倍数 + if previous_count == 0: + # 新出现的话题 + if current_count >= 5: # 至少出现5次才认为是爆火 + growth_rate = float('inf') + is_viral = True + else: + continue + else: + growth_rate = current_count / previous_count + is_viral = growth_rate >= threshold + + if is_viral: + viral_topics.append({ + "keyword": keyword, + "current_count": current_count, + "previous_count": previous_count, + "growth_rate": round(growth_rate, 2) if growth_rate != float('inf') else "新话题", + "sample_titles": current_keyword_titles[keyword][:3], + "alert_level": "高" if growth_rate > threshold * 2 else "中" + }) + + # 按增长率排序 + viral_topics.sort( + key=lambda x: x["current_count"] if x["growth_rate"] == "新话题" else x["growth_rate"], + reverse=True + ) + + if not viral_topics: + return { + "success": True, + "viral_topics": [], + "total_detected": 0, + "message": f"未检测到热度增长超过 {threshold} 倍的话题" + } + + return { + "success": True, + "viral_topics": viral_topics, + "total_detected": len(viral_topics), + "threshold": threshold, + "time_window": time_window, + "detection_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def predict_trending_topics( + self, + lookahead_hours: int = 6, + confidence_threshold: float = 0.7 + ) -> Dict: + """ + 话题预测 - 基于历史数据预测未来可能的热点 + + Args: + lookahead_hours: 预测未来多少小时 + confidence_threshold: 置信度阈值 + + Returns: + 预测的潜力话题列表 + + Examples: + 用户询问示例: + - "预测接下来6小时可能的热点话题" + - "有哪些话题可能会火起来" + - "早期发现潜力话题" + + 代码调用示例: + >>> tools = AnalyticsTools() + >>> result = tools.predict_trending_topics( + ... lookahead_hours=6, + ... confidence_threshold=0.7 + ... 
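+ # Growth rule implemented above (counts are illustrative):
+ #   previous_count=4, current_count=14 -> growth_rate=3.5 >= threshold 3.0 -> viral
+ #   previous_count=0, current_count=5  -> new topic (needs at least 5 hits)
+ #   alert_level is "高" when growth_rate > 2 * threshold, else "中"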
) + >>> print(result['predicted_topics']) + """ + try: + # 参数验证 + lookahead_hours = validate_limit(lookahead_hours, default=6, max_limit=48) + + if not 0 <= confidence_threshold <= 1: + raise InvalidParameterError( + "confidence_threshold 必须在 0 到 1 之间", + suggestion="推荐值:0.6-0.8" + ) + + # 收集最近3天的数据用于预测 + keyword_trends = defaultdict(list) + + for days_ago in range(3, 0, -1): + date = datetime.now() - timedelta(days=days_ago) + + try: + all_titles, _, _ = self.data_service.parser.read_all_titles_for_date( + date=date + ) + + # 统计关键词 + keywords_count = Counter() + for _, titles in all_titles.items(): + for title in titles.keys(): + keywords = self._extract_keywords(title) + keywords_count.update(keywords) + + # 记录每个关键词的历史数据 + for keyword, count in keywords_count.items(): + keyword_trends[keyword].append(count) + + except DataNotFoundError: + pass + + # 添加今天的数据 + try: + all_titles, _, _ = self.data_service.parser.read_all_titles_for_date() + + keywords_count = Counter() + keyword_titles = defaultdict(list) + + for _, titles in all_titles.items(): + for title in titles.keys(): + keywords = self._extract_keywords(title) + keywords_count.update(keywords) + + for kw in keywords: + keyword_titles[kw].append(title) + + for keyword, count in keywords_count.items(): + keyword_trends[keyword].append(count) + + except DataNotFoundError: + raise DataNotFoundError( + "未找到今天的数据", + suggestion="请等待爬虫任务完成" + ) + + # 预测潜力话题 + predicted_topics = [] + + for keyword, trend_data in keyword_trends.items(): + if len(trend_data) < 2: + continue + + # 简单的线性趋势预测 + # 计算增长率 + recent_value = trend_data[-1] + previous_value = trend_data[-2] if len(trend_data) >= 2 else 0 + + if previous_value == 0: + if recent_value >= 3: + growth_rate = 1.0 + else: + continue + else: + growth_rate = (recent_value - previous_value) / previous_value + + # 判断是否是上升趋势 + if growth_rate > 0.3: # 增长超过30% + # 计算置信度(基于趋势的稳定性) + if len(trend_data) >= 3: + # 检查是否连续增长 + is_consistent = all( + trend_data[i] <= trend_data[i+1] + for i in range(len(trend_data)-1) + ) + confidence = 0.9 if is_consistent else 0.7 + else: + confidence = 0.6 + + if confidence >= confidence_threshold: + predicted_topics.append({ + "keyword": keyword, + "current_count": recent_value, + "growth_rate": round(growth_rate * 100, 2), + "confidence": round(confidence, 2), + "trend_data": trend_data, + "prediction": "上升趋势,可能成为热点", + "sample_titles": keyword_titles.get(keyword, [])[:3] + }) + + # 按置信度和增长率排序 + predicted_topics.sort( + key=lambda x: (x["confidence"], x["growth_rate"]), + reverse=True + ) + + return { + "success": True, + "predicted_topics": predicted_topics[:20], # 返回TOP 20 + "total_predicted": len(predicted_topics), + "lookahead_hours": lookahead_hours, + "confidence_threshold": confidence_threshold, + "prediction_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "note": "预测基于历史趋势,实际结果可能有偏差" + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + # ==================== 辅助方法 ==================== + + def _extract_keywords(self, title: str, min_length: int = 2) -> List[str]: + """ + 从标题中提取关键词(简单实现) + + Args: + title: 标题文本 + min_length: 最小关键词长度 + + Returns: + 关键词列表 + """ + # 移除URL和特殊字符 + title = re.sub(r'http[s]?://\S+', '', title) + title = re.sub(r'[^\w\s]', ' ', title) + + # 简单分词(按空格和常见分隔符) + words = re.split(r'[\s,。!?、]+', title) + + # 过滤停用词和短词 + stopwords = {'的', '了', '在', '是', '我', '有', '和', '就', '不', 
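上面的"预测"是一个朴素启发式:最近一天环比增长超过 30% 即视为上升趋势,置信度由序列是否单调不减决定(0.9/0.7/0.6);`lookahead_hours` 仅校验并回显,不参与计算。可用下面的独立片段(虚构数据)验证各分支:

```python
# 独立演示:predict_trending_topics 的增长率与置信度口径(虚构数据)
from typing import List, Optional

def predict(trend: List[int], confidence_threshold: float = 0.7) -> Optional[dict]:
    """trend 为按天排列的出现次数,最后一项是今天"""
    if len(trend) < 2:
        return None
    recent, previous = trend[-1], trend[-2]
    if previous == 0:
        if recent < 3:
            return None
        growth_rate = 1.0
    else:
        growth_rate = (recent - previous) / previous
    if growth_rate <= 0.3:                      # 增长不超过 30%,不预测
        return None
    if len(trend) >= 3:
        is_consistent = all(trend[i] <= trend[i + 1] for i in range(len(trend) - 1))
        confidence = 0.9 if is_consistent else 0.7
    else:
        confidence = 0.6
    if confidence < confidence_threshold:
        return None
    return {"growth_rate": round(growth_rate * 100, 2), "confidence": confidence}

print(predict([2, 3, 6]))   # 连续上升,环比 +100% -> {'growth_rate': 100.0, 'confidence': 0.9}
print(predict([5, 2, 3]))   # 有回落,置信度降为 0.7,仍达默认阈值
print(predict([4, 4]))      # 增长 0% -> None
```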
'人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这'} + + keywords = [ + word.strip() for word in words + if word.strip() and len(word.strip()) >= min_length and word.strip() not in stopwords + ] + + return keywords + + def _calculate_similarity(self, text1: str, text2: str) -> float: + """ + 计算两个文本的相似度 + + Args: + text1: 文本1 + text2: 文本2 + + Returns: + 相似度分数(0-1之间) + """ + # 使用 SequenceMatcher 计算相似度 + return SequenceMatcher(None, text1, text2).ratio() + + def _find_unique_topics(self, platform_stats: Dict) -> Dict[str, List[str]]: + """ + 找出各平台独有的热点话题 + + Args: + platform_stats: 平台统计数据 + + Returns: + 各平台独有话题字典 + """ + unique_topics = {} + + # 获取每个平台的TOP关键词 + platform_keywords = {} + for platform, stats in platform_stats.items(): + top_keywords = set([kw for kw, _ in stats["top_keywords"].most_common(10)]) + platform_keywords[platform] = top_keywords + + # 找出独有关键词 + for platform, keywords in platform_keywords.items(): + # 找出其他平台的所有关键词 + other_keywords = set() + for other_platform, other_kws in platform_keywords.items(): + if other_platform != platform: + other_keywords.update(other_kws) + + # 找出独有的 + unique = keywords - other_keywords + if unique: + unique_topics[platform] = list(unique)[:5] # 最多5个 + + return unique_topics diff --git a/mcp_server/tools/config_mgmt.py b/mcp_server/tools/config_mgmt.py new file mode 100644 index 0000000000000..25ab7f5d980a1 --- /dev/null +++ b/mcp_server/tools/config_mgmt.py @@ -0,0 +1,66 @@ +""" +配置管理工具 + +实现配置查询和管理功能。 +""" + +from typing import Dict, Optional + +from ..services.data_service import DataService +from ..utils.validators import validate_config_section +from ..utils.errors import MCPError + + +class ConfigManagementTools: + """配置管理工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化配置管理工具 + + Args: + project_root: 项目根目录 + """ + self.data_service = DataService(project_root) + + def get_current_config(self, section: Optional[str] = None) -> Dict: + """ + 获取当前系统配置 + + Args: + section: 配置节 - all/crawler/push/keywords/weights,默认all + + Returns: + 配置字典 + + Example: + >>> tools = ConfigManagementTools() + >>> result = tools.get_current_config(section="crawler") + >>> print(result['crawler']['platforms']) + """ + try: + # 参数验证 + section = validate_config_section(section) + + # 获取配置 + config = self.data_service.get_current_config(section=section) + + return { + "config": config, + "section": section, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } diff --git a/mcp_server/tools/data_query.py b/mcp_server/tools/data_query.py new file mode 100644 index 0000000000000..49504da0c73ec --- /dev/null +++ b/mcp_server/tools/data_query.py @@ -0,0 +1,284 @@ +""" +数据查询工具 + +实现P0核心的数据查询工具。 +""" + +from typing import Dict, List, Optional + +from ..services.data_service import DataService +from ..utils.validators import ( + validate_platforms, + validate_limit, + validate_keyword, + validate_date_range, + validate_top_n, + validate_mode, + validate_date_query +) +from ..utils.errors import MCPError + + +class DataQueryTools: + """数据查询工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化数据查询工具 + + Args: + project_root: 项目根目录 + """ + self.data_service = DataService(project_root) + + def get_latest_news( + self, + platforms: Optional[List[str]] = None, + limit: Optional[int] = None, + include_url: bool = False + ) -> 
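上面 `_find_unique_topics` 的"独有话题"就是一次集合差运算:每个平台的 TOP 关键词减去其余平台关键词的并集。下面用 `collections.Counter` 独立演示同一思路(平台与关键词均为虚构示例):

```python
# 独立演示:_find_unique_topics 的"独有关键词"集合运算(虚构数据)
from collections import Counter

platform_top = {
    "weibo":  Counter({"演唱会": 9, "地震": 7, "高考": 3}),
    "zhihu":  Counter({"大模型": 8, "高考": 6, "地震": 4}),
    "douyin": Counter({"演唱会": 12, "美食": 5}),
}

# 每个平台取 TOP 10 关键词集合
platform_keywords = {p: {kw for kw, _ in c.most_common(10)} for p, c in platform_top.items()}

unique_topics = {}
for platform, keywords in platform_keywords.items():
    # 其余平台关键词的并集
    others = set().union(*(kws for p, kws in platform_keywords.items() if p != platform))
    unique = keywords - others
    if unique:
        unique_topics[platform] = sorted(unique)[:5]   # 最多 5 个

print(unique_topics)  # {'zhihu': ['大模型'], 'douyin': ['美食']}
```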
Dict: + """ + 获取最新一批爬取的新闻数据 + + Args: + platforms: 平台ID列表,如 ['zhihu', 'weibo'] + limit: 返回条数限制,默认20 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 新闻列表字典 + + Example: + >>> tools = DataQueryTools() + >>> result = tools.get_latest_news(platforms=['zhihu'], limit=10) + >>> print(result['total']) + 10 + """ + try: + # 参数验证 + platforms = validate_platforms(platforms) + limit = validate_limit(limit, default=50) + + # 获取数据 + news_list = self.data_service.get_latest_news( + platforms=platforms, + limit=limit, + include_url=include_url + ) + + return { + "news": news_list, + "total": len(news_list), + "platforms": platforms, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def search_news_by_keyword( + self, + keyword: str, + date_range: Optional[Dict] = None, + platforms: Optional[List[str]] = None, + limit: Optional[int] = None + ) -> Dict: + """ + 按关键词搜索历史新闻 + + Args: + keyword: 搜索关键词(必需) + date_range: 日期范围,格式: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + platforms: 平台过滤列表 + limit: 返回条数限制(可选,默认返回所有) + + Returns: + 搜索结果字典 + + Example: + >>> tools = DataQueryTools() + >>> result = tools.search_news_by_keyword( + ... keyword="人工智能", + ... date_range={"start": "2025-10-01", "end": "2025-10-11"}, + ... limit=50 + ... ) + >>> print(result['total']) + """ + try: + # 参数验证 + keyword = validate_keyword(keyword) + date_range_tuple = validate_date_range(date_range) + platforms = validate_platforms(platforms) + + if limit is not None: + limit = validate_limit(limit, default=100) + + # 搜索数据 + search_result = self.data_service.search_news_by_keyword( + keyword=keyword, + date_range=date_range_tuple, + platforms=platforms, + limit=limit + ) + + return { + **search_result, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def get_trending_topics( + self, + top_n: Optional[int] = None, + mode: Optional[str] = None + ) -> Dict: + """ + 获取个人关注词的新闻出现频率统计 + + 注意:本工具基于 config/frequency_words.txt 中的个人关注词列表进行统计, + 而不是自动从新闻中提取热点话题。这是一个个人可定制的关注词列表, + 用户可以根据自己的兴趣添加或删除关注词。 + + Args: + top_n: 返回TOP N关注词,默认10 + mode: 模式 - daily(当日累计), current(最新一批), incremental(增量) + + Returns: + 关注词频率统计字典,包含每个关注词在新闻中出现的次数 + + Example: + >>> tools = DataQueryTools() + >>> result = tools.get_trending_topics(top_n=5, mode="current") + >>> print(len(result['topics'])) + 5 + >>> # 返回的是你在 frequency_words.txt 中设置的关注词的频率统计 + """ + try: + # 参数验证 + top_n = validate_top_n(top_n, default=10) + valid_modes = ["daily", "current", "incremental"] + mode = validate_mode(mode, valid_modes, default="current") + + # 获取趋势话题 + trending_result = self.data_service.get_trending_topics( + top_n=top_n, + mode=mode + ) + + return { + **trending_result, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def get_news_by_date( + self, + date_query: Optional[str] = None, + platforms: Optional[List[str]] = None, + limit: Optional[int] = None, + include_url: bool = False + ) -> Dict: + """ + 按日期查询新闻,支持自然语言日期 + + Args: + date_query: 日期查询字符串(可选,默认"今天"),支持: + - 相对日期:今天、昨天、前天、3天前、yesterday、3 days ago + - 星期:上周一、本周三、last 
monday、this friday + - 绝对日期:2025-10-10、10月10日、2025年10月10日 + platforms: 平台ID列表,如 ['zhihu', 'weibo'] + limit: 返回条数限制,默认50 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 新闻列表字典 + + Example: + >>> tools = DataQueryTools() + >>> # 不指定日期,默认查询今天 + >>> result = tools.get_news_by_date(platforms=['zhihu'], limit=20) + >>> # 指定日期 + >>> result = tools.get_news_by_date( + ... date_query="昨天", + ... platforms=['zhihu'], + ... limit=20 + ... ) + >>> print(result['total']) + 20 + """ + try: + # 参数验证 - 默认今天 + if date_query is None: + date_query = "今天" + target_date = validate_date_query(date_query) + platforms = validate_platforms(platforms) + limit = validate_limit(limit, default=50) + + # 获取数据 + news_list = self.data_service.get_news_by_date( + target_date=target_date, + platforms=platforms, + limit=limit, + include_url=include_url + ) + + return { + "news": news_list, + "total": len(news_list), + "date": target_date.strftime("%Y-%m-%d"), + "date_query": date_query, + "platforms": platforms, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + diff --git a/mcp_server/tools/search_tools.py b/mcp_server/tools/search_tools.py new file mode 100644 index 0000000000000..f393ec6abb9d6 --- /dev/null +++ b/mcp_server/tools/search_tools.py @@ -0,0 +1,701 @@ +""" +智能新闻检索工具 + +提供模糊搜索、链接查询、历史相关新闻检索等高级搜索功能。 +""" + +import re +from collections import Counter +from datetime import datetime, timedelta +from difflib import SequenceMatcher +from typing import Dict, List, Optional, Tuple + +from ..services.data_service import DataService +from ..utils.validators import validate_keyword, validate_limit +from ..utils.errors import MCPError, InvalidParameterError, DataNotFoundError + + +class SearchTools: + """智能新闻检索工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化智能检索工具 + + Args: + project_root: 项目根目录 + """ + self.data_service = DataService(project_root) + # 中文停用词列表 + self.stopwords = { + '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', + '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', + '看', '好', '自己', '这', '那', '来', '被', '与', '为', '对', '将', '从', + '以', '及', '等', '但', '或', '而', '于', '中', '由', '可', '可以', '已', + '已经', '还', '更', '最', '再', '因为', '所以', '如果', '虽然', '然而' + } + + def search_news_unified( + self, + query: str, + search_mode: str = "keyword", + date_range: Optional[Dict[str, str]] = None, + platforms: Optional[List[str]] = None, + limit: int = 50, + sort_by: str = "relevance", + threshold: float = 0.6, + include_url: bool = False + ) -> Dict: + """ + 统一新闻搜索工具 - 整合多种搜索模式 + + Args: + query: 查询内容(必需)- 关键词、内容片段或实体名称 + search_mode: 搜索模式,可选值: + - "keyword": 精确关键词匹配(默认) + - "fuzzy": 模糊内容匹配(使用相似度算法) + - "entity": 实体名称搜索(自动按权重排序) + date_range: 日期范围(可选) + - **格式**: {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + - **示例**: {"start": "2025-01-01", "end": "2025-01-07"} + - **默认**: 不指定时默认查询今天 + - **注意**: start和end可以相同(表示单日查询) + platforms: 平台过滤列表,如 ['zhihu', 'weibo'] + limit: 返回条数限制,默认50 + sort_by: 排序方式,可选值: + - "relevance": 按相关度排序(默认) + - "weight": 按新闻权重排序 + - "date": 按日期排序 + threshold: 相似度阈值(仅fuzzy模式有效),0-1之间,默认0.6 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 搜索结果字典,包含匹配的新闻列表 + + Examples: + - search_news_unified(query="人工智能", search_mode="keyword") + - search_news_unified(query="特斯拉降价", search_mode="fuzzy", threshold=0.4) + - search_news_unified(query="马斯克", search_mode="entity", limit=20) 
+ - search_news_unified(query="iPhone 16", date_range={"start": "2025-01-01", "end": "2025-01-07"}) + """ + try: + # 参数验证 + query = validate_keyword(query) + + if search_mode not in ["keyword", "fuzzy", "entity"]: + raise InvalidParameterError( + f"无效的搜索模式: {search_mode}", + suggestion="支持的模式: keyword, fuzzy, entity" + ) + + if sort_by not in ["relevance", "weight", "date"]: + raise InvalidParameterError( + f"无效的排序方式: {sort_by}", + suggestion="支持的排序: relevance, weight, date" + ) + + limit = validate_limit(limit, default=50) + threshold = max(0.0, min(1.0, threshold)) + + # 处理日期范围 + if date_range: + from ..utils.validators import validate_date_range + date_range_tuple = validate_date_range(date_range) + start_date, end_date = date_range_tuple + else: + # 不指定日期时,使用最新可用数据日期(而非 datetime.now()) + earliest, latest = self.data_service.get_available_date_range() + + if latest is None: + # 没有任何可用数据 + return { + "success": False, + "error": { + "code": "NO_DATA_AVAILABLE", + "message": "output 目录下没有可用的新闻数据", + "suggestion": "请先运行爬虫生成数据,或检查 output 目录" + } + } + + # 使用最新可用日期 + start_date = end_date = latest + + # 收集所有匹配的新闻 + all_matches = [] + current_date = start_date + + while current_date <= end_date: + try: + all_titles, id_to_name, timestamps = self.data_service.parser.read_all_titles_for_date( + date=current_date, + platform_ids=platforms + ) + + # 根据搜索模式执行不同的搜索逻辑 + if search_mode == "keyword": + matches = self._search_by_keyword_mode( + query, all_titles, id_to_name, current_date, include_url + ) + elif search_mode == "fuzzy": + matches = self._search_by_fuzzy_mode( + query, all_titles, id_to_name, current_date, threshold, include_url + ) + else: # entity + matches = self._search_by_entity_mode( + query, all_titles, id_to_name, current_date, include_url + ) + + all_matches.extend(matches) + + except DataNotFoundError: + # 该日期没有数据,继续下一天 + pass + + current_date += timedelta(days=1) + + if not all_matches: + # 获取可用日期范围用于错误提示 + earliest, latest = self.data_service.get_available_date_range() + + # 判断时间范围描述 + if start_date.date() == datetime.now().date() and start_date == end_date: + time_desc = "今天" + elif start_date == end_date: + time_desc = start_date.strftime("%Y-%m-%d") + else: + time_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + + # 构建错误消息 + if earliest and latest: + available_desc = f"{earliest.strftime('%Y-%m-%d')} 至 {latest.strftime('%Y-%m-%d')}" + message = f"未找到匹配的新闻(查询范围: {time_desc},可用数据: {available_desc})" + else: + message = f"未找到匹配的新闻({time_desc})" + + result = { + "success": True, + "results": [], + "total": 0, + "query": query, + "search_mode": search_mode, + "time_range": time_desc, + "message": message + } + return result + + # 统一排序逻辑 + if sort_by == "relevance": + all_matches.sort(key=lambda x: x.get("similarity_score", 1.0), reverse=True) + elif sort_by == "weight": + from .analytics import calculate_news_weight + all_matches.sort(key=lambda x: calculate_news_weight(x), reverse=True) + elif sort_by == "date": + all_matches.sort(key=lambda x: x.get("date", ""), reverse=True) + + # 限制返回数量 + results = all_matches[:limit] + + # 构建时间范围描述(正确判断是否为今天) + if start_date.date() == datetime.now().date() and start_date == end_date: + time_range_desc = "今天" + elif start_date == end_date: + time_range_desc = start_date.strftime("%Y-%m-%d") + else: + time_range_desc = f"{start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}" + + result = { + "success": True, + "summary": { + "total_found": len(all_matches), + "returned_count": len(results), + 
"requested_limit": limit, + "search_mode": search_mode, + "query": query, + "platforms": platforms or "所有平台", + "time_range": time_range_desc, + "sort_by": sort_by + }, + "results": results + } + + if search_mode == "fuzzy": + result["summary"]["threshold"] = threshold + if len(all_matches) < limit: + result["note"] = f"模糊搜索模式下,相似度阈值 {threshold} 仅匹配到 {len(all_matches)} 条结果" + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def _search_by_keyword_mode( + self, + query: str, + all_titles: Dict, + id_to_name: Dict, + current_date: datetime, + include_url: bool + ) -> List[Dict]: + """ + 关键词搜索模式(精确匹配) + + Args: + query: 搜索关键词 + all_titles: 所有标题字典 + id_to_name: 平台ID到名称映射 + current_date: 当前日期 + + Returns: + 匹配的新闻列表 + """ + matches = [] + query_lower = query.lower() + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 精确包含判断 + if query_lower in title.lower(): + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "date": current_date.strftime("%Y-%m-%d"), + "similarity_score": 1.0, # 精确匹配,相似度为1 + "ranks": info.get("ranks", []), + "count": len(info.get("ranks", [])), + "rank": info["ranks"][0] if info["ranks"] else 999 + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + matches.append(news_item) + + return matches + + def _search_by_fuzzy_mode( + self, + query: str, + all_titles: Dict, + id_to_name: Dict, + current_date: datetime, + threshold: float, + include_url: bool + ) -> List[Dict]: + """ + 模糊搜索模式(使用相似度算法) + + Args: + query: 搜索内容 + all_titles: 所有标题字典 + id_to_name: 平台ID到名称映射 + current_date: 当前日期 + threshold: 相似度阈值 + + Returns: + 匹配的新闻列表 + """ + matches = [] + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 模糊匹配 + is_match, similarity = self._fuzzy_match(query, title, threshold) + + if is_match: + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "date": current_date.strftime("%Y-%m-%d"), + "similarity_score": round(similarity, 4), + "ranks": info.get("ranks", []), + "count": len(info.get("ranks", [])), + "rank": info["ranks"][0] if info["ranks"] else 999 + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + matches.append(news_item) + + return matches + + def _search_by_entity_mode( + self, + query: str, + all_titles: Dict, + id_to_name: Dict, + current_date: datetime, + include_url: bool + ) -> List[Dict]: + """ + 实体搜索模式(自动按权重排序) + + Args: + query: 实体名称 + all_titles: 所有标题字典 + id_to_name: 平台ID到名称映射 + current_date: 当前日期 + + Returns: + 匹配的新闻列表 + """ + matches = [] + + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 实体搜索:精确包含实体名称 + if query in title: + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "date": current_date.strftime("%Y-%m-%d"), + "similarity_score": 1.0, + "ranks": info.get("ranks", []), + "count": len(info.get("ranks", [])), + "rank": info["ranks"][0] if info["ranks"] else 999 + } + + # 条件性添加 URL 字段 + if 
include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + matches.append(news_item) + + return matches + + def _calculate_similarity(self, text1: str, text2: str) -> float: + """ + 计算两个文本的相似度 + + Args: + text1: 文本1 + text2: 文本2 + + Returns: + 相似度分数 (0-1之间) + """ + # 使用 difflib.SequenceMatcher 计算序列相似度 + return SequenceMatcher(None, text1.lower(), text2.lower()).ratio() + + def _fuzzy_match(self, query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]: + """ + 模糊匹配函数 + + Args: + query: 查询文本 + text: 待匹配文本 + threshold: 匹配阈值 + + Returns: + (是否匹配, 相似度分数) + """ + # 直接包含判断 + if query.lower() in text.lower(): + return True, 1.0 + + # 计算整体相似度 + similarity = self._calculate_similarity(query, text) + if similarity >= threshold: + return True, similarity + + # 分词后的部分匹配 + query_words = set(self._extract_keywords(query)) + text_words = set(self._extract_keywords(text)) + + if not query_words or not text_words: + return False, 0.0 + + # 计算关键词重合度 + common_words = query_words & text_words + keyword_overlap = len(common_words) / len(query_words) + + if keyword_overlap >= 0.5: # 50%的关键词重合 + return True, keyword_overlap + + return False, similarity + + def _extract_keywords(self, text: str, min_length: int = 2) -> List[str]: + """ + 从文本中提取关键词 + + Args: + text: 输入文本 + min_length: 最小词长 + + Returns: + 关键词列表 + """ + # 移除URL和特殊字符 + text = re.sub(r'http[s]?://\S+', '', text) + text = re.sub(r'\[.*?\]', '', text) # 移除方括号内容 + + # 使用正则表达式分词(中文和英文) + words = re.findall(r'[\w]+', text) + + # 过滤停用词和短词 + keywords = [ + word for word in words + if word and len(word) >= min_length and word not in self.stopwords + ] + + return keywords + + def _calculate_keyword_overlap(self, keywords1: List[str], keywords2: List[str]) -> float: + """ + 计算两个关键词列表的重合度 + + Args: + keywords1: 关键词列表1 + keywords2: 关键词列表2 + + Returns: + 重合度分数 (0-1之间) + """ + if not keywords1 or not keywords2: + return 0.0 + + set1 = set(keywords1) + set2 = set(keywords2) + + # Jaccard 相似度 + intersection = len(set1 & set2) + union = len(set1 | set2) + + if union == 0: + return 0.0 + + return intersection / union + + def search_related_news_history( + self, + reference_text: str, + time_preset: str = "yesterday", + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, + threshold: float = 0.4, + limit: int = 50, + include_url: bool = False + ) -> Dict: + """ + 在历史数据中搜索与给定新闻相关的新闻 + + Args: + reference_text: 参考新闻标题或内容 + time_preset: 时间范围预设值,可选: + - "yesterday": 昨天 + - "last_week": 上周 (7天) + - "last_month": 上个月 (30天) + - "custom": 自定义日期范围(需要提供 start_date 和 end_date) + start_date: 自定义开始日期(仅当 time_preset="custom" 时有效) + end_date: 自定义结束日期(仅当 time_preset="custom" 时有效) + threshold: 相似度阈值 (0-1之间),默认0.4 + limit: 返回条数限制,默认50 + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 搜索结果字典,包含相关新闻列表 + + Example: + >>> tools = SearchTools() + >>> result = tools.search_related_news_history( + ... reference_text="人工智能技术突破", + ... time_preset="last_week", + ... threshold=0.4, + ... limit=50 + ... ) + >>> for news in result['results']: + ... 
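上面 `_fuzzy_match` 按"直接包含 → 整体相似度 → 关键词重合度"三级回退。下面用标准库独立复现这一判定链(简化分词、省略停用词,仅为示意;中文未做真正分词,通常在第二级即可命中):

```python
# 独立演示:_fuzzy_match 的三级回退判定(简化版)
import re
from difflib import SequenceMatcher
from typing import Tuple

def fuzzy_match(query: str, text: str, threshold: float = 0.3) -> Tuple[bool, float]:
    if query.lower() in text.lower():                       # 1) 直接包含
        return True, 1.0
    similarity = SequenceMatcher(None, query.lower(), text.lower()).ratio()
    if similarity >= threshold:                             # 2) 整体相似度
        return True, similarity
    qw = set(re.findall(r"\w+", query))                     # 3) 关键词重合度(简化分词)
    tw = set(re.findall(r"\w+", text))
    if qw and tw and len(qw & tw) / len(qw) >= 0.5:
        return True, len(qw & tw) / len(qw)
    return False, similarity

print(fuzzy_match("苹果", "苹果发布新款 iPhone"))          # (True, 1.0) 直接包含命中
print(fuzzy_match("特斯拉降价", "特斯拉宣布全系降价促销"))  # (True, 0.625) 整体相似度命中
```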
print(f"{news['date']}: {news['title']} (相似度: {news['similarity_score']})") + """ + try: + # 参数验证 + reference_text = validate_keyword(reference_text) + threshold = max(0.0, min(1.0, threshold)) + limit = validate_limit(limit, default=50) + + # 确定查询日期范围 + today = datetime.now() + + if time_preset == "yesterday": + search_start = today - timedelta(days=1) + search_end = today - timedelta(days=1) + elif time_preset == "last_week": + search_start = today - timedelta(days=7) + search_end = today - timedelta(days=1) + elif time_preset == "last_month": + search_start = today - timedelta(days=30) + search_end = today - timedelta(days=1) + elif time_preset == "custom": + if not start_date or not end_date: + raise InvalidParameterError( + "自定义时间范围需要提供 start_date 和 end_date", + suggestion="请提供 start_date 和 end_date 参数" + ) + search_start = start_date + search_end = end_date + else: + raise InvalidParameterError( + f"不支持的时间范围: {time_preset}", + suggestion="请使用 'yesterday', 'last_week', 'last_month' 或 'custom'" + ) + + # 提取参考文本的关键词 + reference_keywords = self._extract_keywords(reference_text) + + if not reference_keywords: + raise InvalidParameterError( + "无法从参考文本中提取关键词", + suggestion="请提供更详细的文本内容" + ) + + # 收集所有相关新闻 + all_related_news = [] + current_date = search_start + + while current_date <= search_end: + try: + # 读取该日期的数据 + all_titles, id_to_name, _ = self.data_service.parser.read_all_titles_for_date(current_date) + + # 搜索相关新闻 + for platform_id, titles in all_titles.items(): + platform_name = id_to_name.get(platform_id, platform_id) + + for title, info in titles.items(): + # 计算标题相似度 + title_similarity = self._calculate_similarity(reference_text, title) + + # 提取标题关键词 + title_keywords = self._extract_keywords(title) + + # 计算关键词重合度 + keyword_overlap = self._calculate_keyword_overlap( + reference_keywords, + title_keywords + ) + + # 综合相似度 (70% 关键词重合 + 30% 文本相似度) + combined_score = keyword_overlap * 0.7 + title_similarity * 0.3 + + if combined_score >= threshold: + news_item = { + "title": title, + "platform": platform_id, + "platform_name": platform_name, + "date": current_date.strftime("%Y-%m-%d"), + "similarity_score": round(combined_score, 4), + "keyword_overlap": round(keyword_overlap, 4), + "text_similarity": round(title_similarity, 4), + "common_keywords": list(set(reference_keywords) & set(title_keywords)), + "rank": info["ranks"][0] if info["ranks"] else 0 + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobileUrl"] = info.get("mobileUrl", "") + + all_related_news.append(news_item) + + except DataNotFoundError: + # 该日期没有数据,继续下一天 + pass + except Exception as e: + # 记录错误但继续处理其他日期 + print(f"Warning: 处理日期 {current_date.strftime('%Y-%m-%d')} 时出错: {e}") + + # 移动到下一天 + current_date += timedelta(days=1) + + if not all_related_news: + return { + "success": True, + "results": [], + "total": 0, + "query": reference_text, + "time_preset": time_preset, + "date_range": { + "start": search_start.strftime("%Y-%m-%d"), + "end": search_end.strftime("%Y-%m-%d") + }, + "message": "未找到相关新闻" + } + + # 按相似度排序 + all_related_news.sort(key=lambda x: x["similarity_score"], reverse=True) + + # 限制返回数量 + results = all_related_news[:limit] + + # 统计信息 + platform_distribution = Counter([news["platform"] for news in all_related_news]) + date_distribution = Counter([news["date"] for news in all_related_news]) + + result = { + "success": True, + "summary": { + "total_found": len(all_related_news), + "returned_count": len(results), + "requested_limit": limit, + "threshold": threshold, 
+ "reference_text": reference_text, + "reference_keywords": reference_keywords, + "time_preset": time_preset, + "date_range": { + "start": search_start.strftime("%Y-%m-%d"), + "end": search_end.strftime("%Y-%m-%d") + } + }, + "results": results, + "statistics": { + "platform_distribution": dict(platform_distribution), + "date_distribution": dict(date_distribution), + "avg_similarity": round( + sum([news["similarity_score"] for news in all_related_news]) / len(all_related_news), + 4 + ) if all_related_news else 0.0 + } + } + + if len(all_related_news) < limit: + result["note"] = f"相关性阈值 {threshold} 下仅找到 {len(all_related_news)} 条相关新闻" + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } diff --git a/mcp_server/tools/system.py b/mcp_server/tools/system.py new file mode 100644 index 0000000000000..2cf2248d788ab --- /dev/null +++ b/mcp_server/tools/system.py @@ -0,0 +1,465 @@ +""" +系统管理工具 + +实现系统状态查询和爬虫触发功能。 +""" + +from pathlib import Path +from typing import Dict, List, Optional + +from ..services.data_service import DataService +from ..utils.validators import validate_platforms +from ..utils.errors import MCPError, CrawlTaskError + + +class SystemManagementTools: + """系统管理工具类""" + + def __init__(self, project_root: str = None): + """ + 初始化系统管理工具 + + Args: + project_root: 项目根目录 + """ + self.data_service = DataService(project_root) + if project_root: + self.project_root = Path(project_root) + else: + # 获取项目根目录 + current_file = Path(__file__) + self.project_root = current_file.parent.parent.parent + + def get_system_status(self) -> Dict: + """ + 获取系统运行状态和健康检查信息 + + Returns: + 系统状态字典 + + Example: + >>> tools = SystemManagementTools() + >>> result = tools.get_system_status() + >>> print(result['system']['version']) + """ + try: + # 获取系统状态 + status = self.data_service.get_system_status() + + return { + **status, + "success": True + } + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e) + } + } + + def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict: + """ + 手动触发一次临时爬取任务(可选持久化) + + Args: + platforms: 指定平台列表,为空则爬取所有平台 + save_to_local: 是否保存到本地 output 目录,默认 False + include_url: 是否包含URL链接,默认False(节省token) + + Returns: + 爬取结果字典,包含新闻数据和保存路径(如果保存) + + Example: + >>> tools = SystemManagementTools() + >>> # 临时爬取,不保存 + >>> result = tools.trigger_crawl(platforms=['zhihu', 'weibo']) + >>> print(result['data']) + >>> # 爬取并保存到本地 + >>> result = tools.trigger_crawl(platforms=['zhihu'], save_to_local=True) + >>> print(result['saved_files']) + """ + try: + import json + import time + import random + import requests + from datetime import datetime + import pytz + import yaml + + # 参数验证 + platforms = validate_platforms(platforms) + + # 加载配置文件 + config_path = self.project_root / "config" / "config.yaml" + if not config_path.exists(): + raise CrawlTaskError( + "配置文件不存在", + suggestion=f"请确保配置文件存在: {config_path}" + ) + + # 读取配置 + with open(config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + + # 获取平台配置 + all_platforms = config_data.get("platforms", []) + if not all_platforms: + raise CrawlTaskError( + "配置文件中没有平台配置", + suggestion="请检查 config/config.yaml 中的 platforms 配置" + ) + + # 过滤平台 + if platforms: + 
target_platforms = [p for p in all_platforms if p["id"] in platforms] + if not target_platforms: + raise CrawlTaskError( + f"指定的平台不存在: {platforms}", + suggestion=f"可用平台: {[p['id'] for p in all_platforms]}" + ) + else: + target_platforms = all_platforms + + # 获取请求间隔 + request_interval = config_data.get("crawler", {}).get("request_interval", 100) + + # 构建平台ID列表 + ids = [] + for platform in target_platforms: + if "name" in platform: + ids.append((platform["id"], platform["name"])) + else: + ids.append(platform["id"]) + + print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}") + + # 爬取数据 + results = {} + id_to_name = {} + failed_ids = [] + + for i, id_info in enumerate(ids): + if isinstance(id_info, tuple): + id_value, name = id_info + else: + id_value = id_info + name = id_value + + id_to_name[id_value] = name + + # 构建请求URL + url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Accept": "application/json, text/plain, */*", + "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", + "Connection": "keep-alive", + "Cache-Control": "no-cache", + } + + # 重试机制 + max_retries = 2 + retries = 0 + success = False + + while retries <= max_retries and not success: + try: + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + data_text = response.text + data_json = json.loads(data_text) + + status = data_json.get("status", "未知") + if status not in ["success", "cache"]: + raise ValueError(f"响应状态异常: {status}") + + status_info = "最新数据" if status == "success" else "缓存数据" + print(f"获取 {id_value} 成功({status_info})") + + # 解析数据 + results[id_value] = {} + for index, item in enumerate(data_json.get("items", []), 1): + title = item["title"] + url_link = item.get("url", "") + mobile_url = item.get("mobileUrl", "") + + if title in results[id_value]: + results[id_value][title]["ranks"].append(index) + else: + results[id_value][title] = { + "ranks": [index], + "url": url_link, + "mobileUrl": mobile_url, + } + + success = True + + except Exception as e: + retries += 1 + if retries <= max_retries: + wait_time = random.uniform(3, 5) + print(f"请求 {id_value} 失败: {e}. 
{wait_time:.2f}秒后重试...") + time.sleep(wait_time) + else: + print(f"请求 {id_value} 失败: {e}") + failed_ids.append(id_value) + + # 请求间隔 + if i < len(ids) - 1: + actual_interval = request_interval + random.randint(-10, 20) + actual_interval = max(50, actual_interval) + time.sleep(actual_interval / 1000) + + # 格式化返回数据 + news_data = [] + for platform_id, titles_data in results.items(): + platform_name = id_to_name.get(platform_id, platform_id) + for title, info in titles_data.items(): + news_item = { + "platform_id": platform_id, + "platform_name": platform_name, + "title": title, + "ranks": info["ranks"] + } + + # 条件性添加 URL 字段 + if include_url: + news_item["url"] = info.get("url", "") + news_item["mobile_url"] = info.get("mobileUrl", "") + + news_data.append(news_item) + + # 获取北京时间 + beijing_tz = pytz.timezone("Asia/Shanghai") + now = datetime.now(beijing_tz) + + # 构建返回结果 + result = { + "success": True, + "task_id": f"crawl_{int(time.time())}", + "status": "completed", + "crawl_time": now.strftime("%Y-%m-%d %H:%M:%S"), + "platforms": list(results.keys()), + "total_news": len(news_data), + "failed_platforms": failed_ids, + "data": news_data, + "saved_to_local": save_to_local + } + + # 如果需要持久化,调用保存逻辑 + if save_to_local: + try: + import re + + # 辅助函数:清理标题 + def clean_title(title: str) -> str: + """清理标题中的特殊字符""" + if not isinstance(title, str): + title = str(title) + cleaned_title = title.replace("\n", " ").replace("\r", " ") + cleaned_title = re.sub(r"\s+", " ", cleaned_title) + cleaned_title = cleaned_title.strip() + return cleaned_title + + # 辅助函数:创建目录 + def ensure_directory_exists(directory: str): + """确保目录存在""" + Path(directory).mkdir(parents=True, exist_ok=True) + + # 格式化日期和时间 + date_folder = now.strftime("%Y年%m月%d日") + time_filename = now.strftime("%H时%M分") + + # 创建 txt 文件路径 + txt_dir = self.project_root / "output" / date_folder / "txt" + ensure_directory_exists(str(txt_dir)) + txt_file_path = txt_dir / f"{time_filename}.txt" + + # 创建 html 文件路径 + html_dir = self.project_root / "output" / date_folder / "html" + ensure_directory_exists(str(html_dir)) + html_file_path = html_dir / f"{time_filename}.html" + + # 保存 txt 文件(按照 main.py 的格式) + with open(txt_file_path, "w", encoding="utf-8") as f: + for id_value, title_data in results.items(): + # id | name 或 id + name = id_to_name.get(id_value) + if name and name != id_value: + f.write(f"{id_value} | {name}\n") + else: + f.write(f"{id_value}\n") + + # 按排名排序标题 + sorted_titles = [] + for title, info in title_data.items(): + cleaned = clean_title(title) + if isinstance(info, dict): + ranks = info.get("ranks", []) + url = info.get("url", "") + mobile_url = info.get("mobileUrl", "") + else: + ranks = info if isinstance(info, list) else [] + url = "" + mobile_url = "" + + rank = ranks[0] if ranks else 1 + sorted_titles.append((rank, cleaned, url, mobile_url)) + + sorted_titles.sort(key=lambda x: x[0]) + + for rank, cleaned, url, mobile_url in sorted_titles: + line = f"{rank}. 
{cleaned}" + if url: + line += f" [URL:{url}]" + if mobile_url: + line += f" [MOBILE:{mobile_url}]" + f.write(line + "\n") + + f.write("\n") + + if failed_ids: + f.write("==== 以下ID请求失败 ====\n") + for id_value in failed_ids: + f.write(f"{id_value}\n") + + # 保存 html 文件(简化版) + html_content = self._generate_simple_html(results, id_to_name, failed_ids, now) + with open(html_file_path, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"数据已保存到:") + print(f" TXT: {txt_file_path}") + print(f" HTML: {html_file_path}") + + result["saved_files"] = { + "txt": str(txt_file_path), + "html": str(html_file_path) + } + result["note"] = "数据已持久化到 output 文件夹" + + except Exception as e: + print(f"保存文件失败: {e}") + result["save_error"] = str(e) + result["note"] = "爬取成功但保存失败,数据仅在内存中" + else: + result["note"] = "临时爬取结果,未持久化到output文件夹" + + return result + + except MCPError as e: + return { + "success": False, + "error": e.to_dict() + } + except Exception as e: + import traceback + return { + "success": False, + "error": { + "code": "INTERNAL_ERROR", + "message": str(e), + "traceback": traceback.format_exc() + } + } + + def _generate_simple_html(self, results: Dict, id_to_name: Dict, failed_ids: List, now) -> str: + """生成简化的 HTML 报告""" + html = """ + + + + + MCP 爬取结果 + + + +
+

MCP 爬取结果

+""" + + # 添加时间戳 + html += f'

爬取时间: {now.strftime("%Y-%m-%d %H:%M:%S")}

\n\n' + + # 遍历每个平台 + for platform_id, titles_data in results.items(): + platform_name = id_to_name.get(platform_id, platform_id) + html += f'
\n' + html += f'
{platform_name}
\n' + + # 排序标题 + sorted_items = [] + for title, info in titles_data.items(): + ranks = info.get("ranks", []) + url = info.get("url", "") + mobile_url = info.get("mobileUrl", "") + rank = ranks[0] if ranks else 999 + sorted_items.append((rank, title, url, mobile_url)) + + sorted_items.sort(key=lambda x: x[0]) + + # 显示新闻 + for rank, title, url, mobile_url in sorted_items: + html += f'
\n' + html += f' {rank}.\n' + html += f' {self._html_escape(title)}\n' + if url: + html += f' 链接\n' + if mobile_url and mobile_url != url: + html += f' 移动版\n' + html += '
\n' + + html += '
\n\n' + + # 失败的平台 + if failed_ids: + html += '
\n' + html += '

请求失败的平台

\n' + html += '
    \n' + for platform_id in failed_ids: + html += f'
  • {self._html_escape(platform_id)}
  • \n' + html += '
\n' + html += '
\n' + + html += """
+ +""" + + return html + + def _html_escape(self, text: str) -> str: + """HTML 转义""" + if not isinstance(text, str): + text = str(text) + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) diff --git a/mcp_server/utils/__init__.py b/mcp_server/utils/__init__.py new file mode 100644 index 0000000000000..a485fe3a0c02b --- /dev/null +++ b/mcp_server/utils/__init__.py @@ -0,0 +1,5 @@ +""" +工具类模块 + +提供参数验证、错误处理等辅助功能。 +""" diff --git a/mcp_server/utils/date_parser.py b/mcp_server/utils/date_parser.py new file mode 100644 index 0000000000000..382a82bd11610 --- /dev/null +++ b/mcp_server/utils/date_parser.py @@ -0,0 +1,278 @@ +""" +日期解析工具 + +支持多种自然语言日期格式解析,包括相对日期和绝对日期。 +""" + +import re +from datetime import datetime, timedelta + +from .errors import InvalidParameterError + + +class DateParser: + """日期解析器类""" + + # 中文日期映射 + CN_DATE_MAPPING = { + "今天": 0, + "昨天": 1, + "前天": 2, + "大前天": 3, + } + + # 英文日期映射 + EN_DATE_MAPPING = { + "today": 0, + "yesterday": 1, + } + + # 星期映射 + WEEKDAY_CN = { + "一": 0, "二": 1, "三": 2, "四": 3, + "五": 4, "六": 5, "日": 6, "天": 6 + } + + WEEKDAY_EN = { + "monday": 0, "tuesday": 1, "wednesday": 2, "thursday": 3, + "friday": 4, "saturday": 5, "sunday": 6 + } + + @staticmethod + def parse_date_query(date_query: str) -> datetime: + """ + 解析日期查询字符串 + + 支持的格式: + - 相对日期(中文):今天、昨天、前天、大前天、N天前 + - 相对日期(英文):today、yesterday、N days ago + - 星期(中文):上周一、上周二、本周三 + - 星期(英文):last monday、this friday + - 绝对日期:2025-10-10、10月10日、2025年10月10日 + + Args: + date_query: 日期查询字符串 + + Returns: + datetime对象 + + Raises: + InvalidParameterError: 日期格式无法识别 + + Examples: + >>> DateParser.parse_date_query("今天") + datetime(2025, 10, 11) + >>> DateParser.parse_date_query("昨天") + datetime(2025, 10, 10) + >>> DateParser.parse_date_query("3天前") + datetime(2025, 10, 8) + >>> DateParser.parse_date_query("2025-10-10") + datetime(2025, 10, 10) + """ + if not date_query or not isinstance(date_query, str): + raise InvalidParameterError( + "日期查询字符串不能为空", + suggestion="请提供有效的日期查询,如:今天、昨天、2025-10-10" + ) + + date_query = date_query.strip().lower() + + # 1. 尝试解析中文常用相对日期 + if date_query in DateParser.CN_DATE_MAPPING: + days_ago = DateParser.CN_DATE_MAPPING[date_query] + return datetime.now() - timedelta(days=days_ago) + + # 2. 尝试解析英文常用相对日期 + if date_query in DateParser.EN_DATE_MAPPING: + days_ago = DateParser.EN_DATE_MAPPING[date_query] + return datetime.now() - timedelta(days=days_ago) + + # 3. 尝试解析 "N天前" 或 "N days ago" + cn_days_ago_match = re.match(r'(\d+)\s*天前', date_query) + if cn_days_ago_match: + days = int(cn_days_ago_match.group(1)) + if days > 365: + raise InvalidParameterError( + f"天数过大: {days}天", + suggestion="请使用小于365天的相对日期或使用绝对日期" + ) + return datetime.now() - timedelta(days=days) + + en_days_ago_match = re.match(r'(\d+)\s*days?\s+ago', date_query) + if en_days_ago_match: + days = int(en_days_ago_match.group(1)) + if days > 365: + raise InvalidParameterError( + f"天数过大: {days}天", + suggestion="请使用小于365天的相对日期或使用绝对日期" + ) + return datetime.now() - timedelta(days=days) + + # 4. 尝试解析星期(中文):上周一、本周三 + cn_weekday_match = re.match(r'(上|本)周([一二三四五六日天])', date_query) + if cn_weekday_match: + week_type = cn_weekday_match.group(1) # 上 或 本 + weekday_str = cn_weekday_match.group(2) + target_weekday = DateParser.WEEKDAY_CN[weekday_str] + return DateParser._get_date_by_weekday(target_weekday, week_type == "上") + + # 5. 
尝试解析星期(英文):last monday、this friday + en_weekday_match = re.match(r'(last|this)\s+(monday|tuesday|wednesday|thursday|friday|saturday|sunday)', date_query) + if en_weekday_match: + week_type = en_weekday_match.group(1) # last 或 this + weekday_str = en_weekday_match.group(2) + target_weekday = DateParser.WEEKDAY_EN[weekday_str] + return DateParser._get_date_by_weekday(target_weekday, week_type == "last") + + # 6. 尝试解析绝对日期:YYYY-MM-DD + iso_date_match = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', date_query) + if iso_date_match: + year = int(iso_date_match.group(1)) + month = int(iso_date_match.group(2)) + day = int(iso_date_match.group(3)) + try: + return datetime(year, month, day) + except ValueError as e: + raise InvalidParameterError( + f"无效的日期: {date_query}", + suggestion=f"日期值错误: {str(e)}" + ) + + # 7. 尝试解析中文日期:MM月DD日 或 YYYY年MM月DD日 + cn_date_match = re.match(r'(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日', date_query) + if cn_date_match: + year_str = cn_date_match.group(1) + month = int(cn_date_match.group(2)) + day = int(cn_date_match.group(3)) + + # 如果没有年份,使用当前年份 + if year_str: + year = int(year_str) + else: + year = datetime.now().year + # 如果月份大于当前月份,说明是去年 + current_month = datetime.now().month + if month > current_month: + year -= 1 + + try: + return datetime(year, month, day) + except ValueError as e: + raise InvalidParameterError( + f"无效的日期: {date_query}", + suggestion=f"日期值错误: {str(e)}" + ) + + # 8. 尝试解析斜杠格式:YYYY/MM/DD 或 MM/DD + slash_date_match = re.match(r'(?:(\d{4})/)?(\d{1,2})/(\d{1,2})', date_query) + if slash_date_match: + year_str = slash_date_match.group(1) + month = int(slash_date_match.group(2)) + day = int(slash_date_match.group(3)) + + if year_str: + year = int(year_str) + else: + year = datetime.now().year + current_month = datetime.now().month + if month > current_month: + year -= 1 + + try: + return datetime(year, month, day) + except ValueError as e: + raise InvalidParameterError( + f"无效的日期: {date_query}", + suggestion=f"日期值错误: {str(e)}" + ) + + # 如果所有格式都不匹配 + raise InvalidParameterError( + f"无法识别的日期格式: {date_query}", + suggestion=( + "支持的格式:\n" + "- 相对日期: 今天、昨天、前天、3天前、today、yesterday、3 days ago\n" + "- 星期: 上周一、本周三、last monday、this friday\n" + "- 绝对日期: 2025-10-10、10月10日、2025年10月10日" + ) + ) + + @staticmethod + def _get_date_by_weekday(target_weekday: int, is_last_week: bool) -> datetime: + """ + 根据星期几获取日期 + + Args: + target_weekday: 目标星期 (0=周一, 6=周日) + is_last_week: 是否是上周 + + Returns: + datetime对象 + """ + today = datetime.now() + current_weekday = today.weekday() + + # 计算天数差 + if is_last_week: + # 上周的某一天 + days_diff = current_weekday - target_weekday + 7 + else: + # 本周的某一天 + days_diff = current_weekday - target_weekday + if days_diff < 0: + days_diff += 7 + + return today - timedelta(days=days_diff) + + @staticmethod + def format_date_folder(date: datetime) -> str: + """ + 将日期格式化为文件夹名称 + + Args: + date: datetime对象 + + Returns: + 文件夹名称,格式: YYYY年MM月DD日 + + Examples: + >>> DateParser.format_date_folder(datetime(2025, 10, 11)) + '2025年10月11日' + """ + return date.strftime("%Y年%m月%d日") + + @staticmethod + def validate_date_not_future(date: datetime) -> None: + """ + 验证日期不在未来 + + Args: + date: 待验证的日期 + + Raises: + InvalidParameterError: 日期在未来 + """ + if date.date() > datetime.now().date(): + raise InvalidParameterError( + f"不能查询未来的日期: {date.strftime('%Y-%m-%d')}", + suggestion="请使用今天或过去的日期" + ) + + @staticmethod + def validate_date_not_too_old(date: datetime, max_days: int = 365) -> None: + """ + 验证日期不太久远 + + Args: + date: 待验证的日期 + max_days: 最大天数 + + Raises: + InvalidParameterError: 
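上文 `_get_date_by_weekday` 的核心只是 `weekday()` 差值运算:本周取非负差,上周在差值上再加 7。独立复现如下(以 2025-10-11 周六为"今天",与上文 docstring 示例一致):

```python
# 独立演示:_get_date_by_weekday 的天数差计算(纯函数复现)
from datetime import datetime, timedelta

def date_by_weekday(today: datetime, target_weekday: int, is_last_week: bool) -> datetime:
    current = today.weekday()                 # 0=周一 ... 6=周日
    if is_last_week:
        days_diff = current - target_weekday + 7
    else:
        days_diff = current - target_weekday
        if days_diff < 0:
            days_diff += 7
    return today - timedelta(days=days_diff)

today = datetime(2025, 10, 11)                # 周六,weekday() == 5
print(date_by_weekday(today, 0, True))        # "上周一" -> 2025-09-29
print(date_by_weekday(today, 2, False))       # "本周三" -> 2025-10-08
```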
日期太久远 + """ + days_ago = (datetime.now().date() - date.date()).days + if days_ago > max_days: + raise InvalidParameterError( + f"日期太久远: {date.strftime('%Y-%m-%d')} ({days_ago}天前)", + suggestion=f"请查询{max_days}天内的数据" + ) diff --git a/mcp_server/utils/errors.py b/mcp_server/utils/errors.py new file mode 100644 index 0000000000000..fe80d7eb7e413 --- /dev/null +++ b/mcp_server/utils/errors.py @@ -0,0 +1,93 @@ +""" +自定义错误类 + +定义MCP Server使用的所有自定义异常类型。 +""" + +from typing import Optional + + +class MCPError(Exception): + """MCP工具错误基类""" + + def __init__(self, message: str, code: str = "MCP_ERROR", suggestion: Optional[str] = None): + super().__init__(message) + self.code = code + self.message = message + self.suggestion = suggestion + + def to_dict(self) -> dict: + """转换为字典格式""" + error_dict = { + "code": self.code, + "message": self.message + } + if self.suggestion: + error_dict["suggestion"] = self.suggestion + return error_dict + + +class DataNotFoundError(MCPError): + """数据不存在错误""" + + def __init__(self, message: str, suggestion: Optional[str] = None): + super().__init__( + message=message, + code="DATA_NOT_FOUND", + suggestion=suggestion or "请检查日期范围或等待爬取任务完成" + ) + + +class InvalidParameterError(MCPError): + """参数无效错误""" + + def __init__(self, message: str, suggestion: Optional[str] = None): + super().__init__( + message=message, + code="INVALID_PARAMETER", + suggestion=suggestion or "请检查参数格式是否正确" + ) + + +class ConfigurationError(MCPError): + """配置错误""" + + def __init__(self, message: str, suggestion: Optional[str] = None): + super().__init__( + message=message, + code="CONFIGURATION_ERROR", + suggestion=suggestion or "请检查配置文件是否正确" + ) + + +class PlatformNotSupportedError(MCPError): + """平台不支持错误""" + + def __init__(self, platform: str): + super().__init__( + message=f"平台 '{platform}' 不受支持", + code="PLATFORM_NOT_SUPPORTED", + suggestion="支持的平台: zhihu, weibo, douyin, bilibili, baidu, toutiao, qq, 36kr, sspai, hellogithub, thepaper" + ) + + +class CrawlTaskError(MCPError): + """爬取任务错误""" + + def __init__(self, message: str, suggestion: Optional[str] = None): + super().__init__( + message=message, + code="CRAWL_TASK_ERROR", + suggestion=suggestion or "请稍后重试或查看日志" + ) + + +class FileParseError(MCPError): + """文件解析错误""" + + def __init__(self, file_path: str, reason: str): + super().__init__( + message=f"解析文件 {file_path} 失败: {reason}", + code="FILE_PARSE_ERROR", + suggestion="请检查文件格式是否正确" + ) diff --git a/mcp_server/utils/validators.py b/mcp_server/utils/validators.py new file mode 100644 index 0000000000000..a046929ae96b7 --- /dev/null +++ b/mcp_server/utils/validators.py @@ -0,0 +1,351 @@ +""" +参数验证工具 + +提供统一的参数验证功能。 +""" + +from datetime import datetime +from typing import List, Optional +import os +import yaml + +from .errors import InvalidParameterError +from .date_parser import DateParser + + +def get_supported_platforms() -> List[str]: + """ + 从 config.yaml 动态获取支持的平台列表 + + Returns: + 平台ID列表 + + Note: + - 读取失败时返回空列表,允许所有平台通过(降级策略) + - 平台列表来自 config/config.yaml 中的 platforms 配置 + """ + try: + # 获取 config.yaml 路径(相对于当前文件) + current_dir = os.path.dirname(os.path.abspath(__file__)) + config_path = os.path.join(current_dir, "..", "..", "config", "config.yaml") + config_path = os.path.normpath(config_path) + + with open(config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + platforms = config.get('platforms', []) + return [p['id'] for p in platforms if 'id' in p] + except Exception as e: + # 降级方案:返回空列表,允许所有平台 + print(f"警告:无法加载平台配置 ({config_path}): {e}") + return [] + + +def 
validate_platforms(platforms: Optional[List[str]]) -> List[str]: + """ + 验证平台列表 + + Args: + platforms: 平台ID列表,None表示使用 config.yaml 中配置的所有平台 + + Returns: + 验证后的平台列表 + + Raises: + InvalidParameterError: 平台不支持 + + Note: + - platforms=None 时,返回 config.yaml 中配置的平台列表 + - 会验证平台ID是否在 config.yaml 的 platforms 配置中 + - 配置加载失败时,允许所有平台通过(降级策略) + """ + supported_platforms = get_supported_platforms() + + if platforms is None: + # 返回配置文件中的平台列表(用户的默认配置) + return supported_platforms if supported_platforms else [] + + if not isinstance(platforms, list): + raise InvalidParameterError("platforms 参数必须是列表类型") + + if not platforms: + # 空列表时,返回配置文件中的平台列表 + return supported_platforms if supported_platforms else [] + + # 如果配置加载失败(supported_platforms为空),允许所有平台通过 + if not supported_platforms: + print("警告:平台配置未加载,跳过平台验证") + return platforms + + # 验证每个平台是否在配置中 + invalid_platforms = [p for p in platforms if p not in supported_platforms] + if invalid_platforms: + raise InvalidParameterError( + f"不支持的平台: {', '.join(invalid_platforms)}", + suggestion=f"支持的平台(来自config.yaml): {', '.join(supported_platforms)}" + ) + + return platforms + + +def validate_limit(limit: Optional[int], default: int = 20, max_limit: int = 1000) -> int: + """ + 验证数量限制参数 + + Args: + limit: 限制数量 + default: 默认值 + max_limit: 最大限制 + + Returns: + 验证后的限制值 + + Raises: + InvalidParameterError: 参数无效 + """ + if limit is None: + return default + + if not isinstance(limit, int): + raise InvalidParameterError("limit 参数必须是整数类型") + + if limit <= 0: + raise InvalidParameterError("limit 必须大于0") + + if limit > max_limit: + raise InvalidParameterError( + f"limit 不能超过 {max_limit}", + suggestion=f"请使用分页或降低limit值" + ) + + return limit + + +def validate_date(date_str: str) -> datetime: + """ + 验证日期格式 + + Args: + date_str: 日期字符串 (YYYY-MM-DD) + + Returns: + datetime对象 + + Raises: + InvalidParameterError: 日期格式错误 + """ + try: + return datetime.strptime(date_str, "%Y-%m-%d") + except ValueError: + raise InvalidParameterError( + f"日期格式错误: {date_str}", + suggestion="请使用 YYYY-MM-DD 格式,例如: 2025-10-11" + ) + + +def validate_date_range(date_range: Optional[dict]) -> Optional[tuple]: + """ + 验证日期范围 + + Args: + date_range: 日期范围字典 {"start": "YYYY-MM-DD", "end": "YYYY-MM-DD"} + + Returns: + (start_date, end_date) 元组,或 None + + Raises: + InvalidParameterError: 日期范围无效 + """ + if date_range is None: + return None + + if not isinstance(date_range, dict): + raise InvalidParameterError("date_range 必须是字典类型") + + start_str = date_range.get("start") + end_str = date_range.get("end") + + if not start_str or not end_str: + raise InvalidParameterError( + "date_range 必须包含 start 和 end 字段", + suggestion='例如: {"start": "2025-10-01", "end": "2025-10-11"}' + ) + + start_date = validate_date(start_str) + end_date = validate_date(end_str) + + if start_date > end_date: + raise InvalidParameterError( + "开始日期不能晚于结束日期", + suggestion=f"start: {start_str}, end: {end_str}" + ) + + # 检查日期是否在未来 + today = datetime.now().date() + if start_date.date() > today or end_date.date() > today: + # 获取可用日期范围提示 + try: + from ..services.data_service import DataService + data_service = DataService() + earliest, latest = data_service.get_available_date_range() + + if earliest and latest: + available_range = f"{earliest.strftime('%Y-%m-%d')} 至 {latest.strftime('%Y-%m-%d')}" + else: + available_range = "无可用数据" + except Exception: + available_range = "未知(请检查 output 目录)" + + future_dates = [] + if start_date.date() > today: + future_dates.append(start_str) + if end_date.date() > today and end_str != start_str: + 
future_dates.append(end_str) + + raise InvalidParameterError( + f"不允许查询未来日期: {', '.join(future_dates)}(当前日期: {today.strftime('%Y-%m-%d')})", + suggestion=f"当前可用数据范围: {available_range}" + ) + + return (start_date, end_date) + + +def validate_keyword(keyword: str) -> str: + """ + 验证关键词 + + Args: + keyword: 搜索关键词 + + Returns: + 处理后的关键词 + + Raises: + InvalidParameterError: 关键词无效 + """ + if not keyword: + raise InvalidParameterError("keyword 不能为空") + + if not isinstance(keyword, str): + raise InvalidParameterError("keyword 必须是字符串类型") + + keyword = keyword.strip() + + if not keyword: + raise InvalidParameterError("keyword 不能为空白字符") + + if len(keyword) > 100: + raise InvalidParameterError( + "keyword 长度不能超过100个字符", + suggestion="请使用更简洁的关键词" + ) + + return keyword + + +def validate_top_n(top_n: Optional[int], default: int = 10) -> int: + """ + 验证TOP N参数 + + Args: + top_n: TOP N数量 + default: 默认值 + + Returns: + 验证后的值 + + Raises: + InvalidParameterError: 参数无效 + """ + return validate_limit(top_n, default=default, max_limit=100) + + +def validate_mode(mode: Optional[str], valid_modes: List[str], default: str) -> str: + """ + 验证模式参数 + + Args: + mode: 模式字符串 + valid_modes: 有效模式列表 + default: 默认模式 + + Returns: + 验证后的模式 + + Raises: + InvalidParameterError: 模式无效 + """ + if mode is None: + return default + + if not isinstance(mode, str): + raise InvalidParameterError("mode 必须是字符串类型") + + if mode not in valid_modes: + raise InvalidParameterError( + f"无效的模式: {mode}", + suggestion=f"支持的模式: {', '.join(valid_modes)}" + ) + + return mode + + +def validate_config_section(section: Optional[str]) -> str: + """ + 验证配置节参数 + + Args: + section: 配置节名称 + + Returns: + 验证后的配置节 + + Raises: + InvalidParameterError: 配置节无效 + """ + valid_sections = ["all", "crawler", "push", "keywords", "weights"] + return validate_mode(section, valid_sections, "all") + + +def validate_date_query( + date_query: str, + allow_future: bool = False, + max_days_ago: int = 365 +) -> datetime: + """ + 验证并解析日期查询字符串 + + Args: + date_query: 日期查询字符串 + allow_future: 是否允许未来日期 + max_days_ago: 允许查询的最大天数 + + Returns: + 解析后的datetime对象 + + Raises: + InvalidParameterError: 日期查询无效 + + Examples: + >>> validate_date_query("昨天") + datetime(2025, 10, 10) + >>> validate_date_query("2025-10-10") + datetime(2025, 10, 10) + """ + if not date_query: + raise InvalidParameterError( + "日期查询字符串不能为空", + suggestion="请提供日期查询,如:今天、昨天、2025-10-10" + ) + + # 使用DateParser解析日期 + parsed_date = DateParser.parse_date_query(date_query) + + # 验证日期不在未来 + if not allow_future: + DateParser.validate_date_not_future(parsed_date) + + # 验证日期不太久远 + DateParser.validate_date_not_too_old(parsed_date, max_days=max_days_ago) + + return parsed_date + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..45528ddd54748 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "trendradar-mcp" +version = "1.0.1" +description = "TrendRadar MCP Server - 新闻热点聚合工具" +requires-python = ">=3.10" +dependencies = [ + "requests>=2.32.5,<3.0.0", + "pytz>=2025.2,<2026.0", + "PyYAML>=6.0.3,<7.0.0", + "fastmcp>=2.12.0,<2.14.0", + "websockets>=13.0,<14.0", +] + +[project.scripts] +trendradar = "mcp_server.server:run_server" + +[dependency-groups] +dev = [] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["mcp_server"] diff --git a/readme.md b/readme.md index c00165b4bfccc..7f828b6fd0630 100644 --- a/readme.md +++ b/readme.md @@ -1,399 +1,1935 @@ -# TrendRadar - 多平台热点资讯监控分析系统 - -TrendRadar 
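上面 pyproject.toml 的 `[project.scripts]` 把 `trendradar` 命令映射到 `mcp_server.server:run_server`,因此安装后在命令行执行 `trendradar` 等价于下面这个最小入口(假设包已按该 pyproject 安装):

```python
# 等价于安装后在命令行执行 `trendradar`
# (入口映射来自 pyproject.toml 的 [project.scripts])
from mcp_server.server import run_server

if __name__ == "__main__":
    run_server()
```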
是一款多平台热点资讯监控工具,可自动追踪主流媒体平台的热门话题,实时分析热点走势,根据自定义关键词进行筛选,并通过精美报表或飞书机器人实时推送到手机上。无论你是媒体从业者、市场分析师、还是信息爱好者,TrendRadar 都能帮你第一时间捕捉全网热点脉搏。 - -或者像我一样通过这个工具来反向减少对各种APP的使用依赖的。 +
+ + + TrendRadar Banner + + +🚀 最快30秒部署的热点助手 —— 告别无效刷屏,只看真正关心的新闻资讯 + +sansan0%2FTrendRadar | Trendshift + +[![GitHub Stars](https://img.shields.io/github/stars/sansan0/TrendRadar?style=flat-square&logo=github&color=yellow)](https://github.com/sansan0/TrendRadar/stargazers) +[![GitHub Forks](https://img.shields.io/github/forks/sansan0/TrendRadar?style=flat-square&logo=github&color=blue)](https://github.com/sansan0/TrendRadar/network/members) +[![License](https://img.shields.io/badge/license-GPL--3.0-blue.svg?style=flat-square)](LICENSE) +[![Version](https://img.shields.io/badge/version-v3.0.5-blue.svg)](https://github.com/sansan0/TrendRadar) +[![MCP](https://img.shields.io/badge/MCP-v1.0.1-green.svg)](https://github.com/sansan0/TrendRadar) + +[![企业微信通知](https://img.shields.io/badge/企业微信-通知-00D4AA?style=flat-square)](https://work.weixin.qq.com/) +[![Telegram通知](https://img.shields.io/badge/Telegram-通知-00D4AA?style=flat-square)](https://telegram.org/) +[![dingtalk通知](https://img.shields.io/badge/钉钉-通知-00D4AA?style=flat-square)](#) +[![飞书通知](https://img.shields.io/badge/飞书-通知-00D4AA?style=flat-square)](https://www.feishu.cn/) +[![邮件通知](https://img.shields.io/badge/Email-通知-00D4AA?style=flat-square)](#) +[![ntfy通知](https://img.shields.io/badge/ntfy-通知-00D4AA?style=flat-square)](https://github.com/binwiederhier/ntfy) + + +[![GitHub Actions](https://img.shields.io/badge/GitHub_Actions-自动化-2088FF?style=flat-square&logo=github-actions&logoColor=white)](https://github.com/sansan0/TrendRadar) +[![GitHub Pages](https://img.shields.io/badge/GitHub_Pages-部署-4285F4?style=flat-square&logo=github&logoColor=white)](https://sansan0.github.io/TrendRadar) +[![Docker](https://img.shields.io/badge/Docker-部署-2496ED?style=flat-square&logo=docker&logoColor=white)](https://hub.docker.com/r/wantcat/trendradar) +[![MCP Support](https://img.shields.io/badge/MCP-AI分析支持-FF6B6B?style=flat-square&logo=ai&logoColor=white)](https://modelcontextprotocol.io/) + +
+
+
+> 本项目以轻量、易部署为目标
+
+## 📑 快速导航
+
+ +| [🎯 核心功能](#-核心功能) | [🚀 快速开始](#-快速开始) | [🐳 Docker部署](#-docker-部署) | [🤖 AI分析专区](#-ai-智能分析部署) | +|:---:|:---:|:---:|:---:| +| [📝 更新日志](#-更新日志) | [🔌 MCP客户端](#-mcp-客户端) | [❓ 答疑与常见问题](#问题答疑与1元点赞) | [⭐ 项目相关](#项目相关) | +| [🔧 自定义监控平台](#自定义监控平台) | [📝 frequency_words.txt 配置](#frequencywordstxt-配置教程) | | | + +
+ +- 感谢**耐心反馈 bug** 的贡献者,你们的每一条反馈让项目更加完善😉; +- 感谢**为项目点 star** 的观众们,**fork** 你所欲也,**star** 我所欲也,两者得兼😍是对开源精神最好的支持; +- 感谢**关注[公众号](#问题答疑与1元点赞)** 的读者们,你们的留言、点赞、分享和推荐等积极互动让内容更有温度😎。 + +
+👉 点击查看致谢名单 (当前 🔥66🔥 位) + +### 数据支持 + +本项目使用了 [newsnow](https://github.com/ourongxing/newsnow) 项目提供的 API 接口获取多平台数据 + +### 推广助力 + +> 感谢以下平台和个人的推荐(按时间排列) + +- [小众软件](https://mp.weixin.qq.com/s/fvutkJ_NPUelSW9OGK39aA) - 开源软件推荐平台 +- [LinuxDo 社区](https://linux.do/) - 技术爱好者的聚集地 +- [阮一峰周刊](https://github.com/ruanyf/weekly) - 技术圈有影响力的周刊 + +### 观众支持 + +> 感谢**给予资金支持** 的朋友们,你们的慷慨已化身为键盘旁的零食饮料,陪伴着项目的每一次迭代 + +| 点赞人 | 金额 | 日期 | 备注 | +| :-------------------------: | :----: | :----: | :-----------------------: | +| *海 | 1 | 2025.11.15 | | +| *德 | 1.99 | 2025.11.15 | | +| *疏 | 8.8 | 2025.11.14 | 感谢开源,项目很棒,支持一下 | +| M*e | 10 | 2025.11.14 | 开源不易,大佬辛苦了 | +| **柯 | 1 | 2025.11.14 | | +| *云 | 88 | 2025.11.13 | 好项目,感谢开源 | +| *W | 6 | 2025.11.13 | | +| *凯 | 1 | 2025.11.13 | | +| 对*. | 1 | 2025.11.13 | Thanks for your TrendRadar | +| s*y | 1 | 2025.11.13 | | +| **翔 | 10 | 2025.11.13 | 好项目,相见恨晚,感谢开源! | +| *韦 | 9.9 | 2025.11.13 | TrendRadar超赞,请老师喝咖啡~ | +| h*p | 5 | 2025.11.12 | 支持中国开源力量,加油! | +| c*r | 6 | 2025.11.12 | | +| a*n | 5 | 2025.11.12 | | +| 。*c | 1 | 2025.11.12 | 感谢开源分享 | +| *记 | 1 | 2025.11.11 | | +| *主 | 1 | 2025.11.10 | | +| *了 | 10 | 2025.11.09 | | +| *杰 | 5 | 2025.11.08 | | +| *点 | 8.80 | 2025.11.07 | 开发不易,支持一下。 | +| Q*Q | 6.66 | 2025.11.07 | 感谢开源! | +| C*e | 1 | 2025.11.05 | | +| Peter Fan | 20 | 2025.10.29 | | +| M*n | 1 | 2025.10.27 | 感谢开源 | +| *许 | 8.88 | 2025.10.23 | 老师 小白一枚,摸了几天了还没整起来,求教 | +| Eason | 1 | 2025.10.22 | 还没整明白,但你在做好事 | +| P*n | 1 | 2025.10.20 | | +| *杰 | 1 | 2025.10.19 | | +| *徐 | 1 | 2025.10.18 | | +| *志 | 1 | 2025.10.17 | | +| *😀 | 10 | 2025.10.16 | 点赞 | +| **杰 | 10 | 2025.10.16 | | +| *啸 | 10 | 2025.10.16 | | +| *纪 | 5 | 2025.10.14 | TrendRadar | +| J*d | 1 | 2025.10.14 | 谢谢你的工具,很好玩... | +| *H | 1 | 2025.10.14 | | +| 那*O | 10 | 2025.10.13 | | +| *圆 | 1 | 2025.10.13 | | +| P*g | 6 | 2025.10.13 | | +| Ocean | 20 | 2025.10.12 | ...真的太棒了!!!小白级别也能直接用... | +| **培 | 5.2 | 2025.10.2 | github-yzyf1312:开源万岁 | +| *椿 | 3 | 2025.9.23 | 加油,很不错 | +| *🍍 | 10 | 2025.9.21 | | +| E*f | 1 | 2025.9.20 | | +| *记 | 1 | 2025.9.20 | | +| z*u | 2 | 2025.9.19 | | +| **昊 | 5 | 2025.9.17 | | +| *号 | 1 | 2025.9.15 | | +| T*T | 2 | 2025.9.15 | 点赞 | +| *家 | 10 | 2025.9.10 | | +| *X | 1.11 | 2025.9.3 | | +| *飙 | 20 | 2025.8.31 | 来自老童谢谢 | +| *下 | 1 | 2025.8.30 | | +| 2*D | 88 | 2025.8.13 下午 | | +| 2*D | 1 | 2025.8.13 上午 | | +| S*o | 1 | 2025.8.05 | 支持一下 | +| *侠 | 10 | 2025.8.04 | | +| x*x | 2 | 2025.8.03 | trendRadar 好项目 点赞 | +| *远 | 1 | 2025.8.01 | | +| *邪 | 5 | 2025.8.01 | | +| *梦 | 0.1 | 2025.7.30 | | +| **龙 | 10 | 2025.7.29 | 支持一下 | + + +
## ✨ 核心功能 -- **多平台覆盖** - 一次监控 10+主流平台(今日头条、百度热搜、微博、抖音、知乎、B 站等) -- **智能分析** - 自定义频率词和过滤词,精准捕捉你关心的热点 -- **数据可视化** - 生成美观的 HTML 统计报告,热点一目了然 -- **实时推送** - 支持飞书机器人通知,重要热点即时知晓 -- **全自动化** - 基于 GitHub Actions,定时运行无需服务器 - -## 🔍 支持的平台 - -目前已支持以下 10 个热门平台: +### **全网热点聚合** -- 今日头条 -- 百度热搜 -- 华尔街见闻 -- 澎湃新闻 +- 知乎 +- 抖音 - bilibili 热搜 -- 财联社热门 +- 华尔街见闻 - 贴吧 +- 百度热搜 +- 财联社热门 +- 澎湃新闻 +- 凤凰网 +- 今日头条 - 微博 -- 抖音 -- 知乎 +默认监控 11 个主流平台,也可自行增加额外的平台 +
+👉 点击展开:自定义监控平台 +
+本项目的资讯数据来源于 [newsnow](https://github.com/ourongxing/newsnow) ,你可以点击[网站](https://newsnow.busiyi.world/),点击[更多],查看是否有你想要的平台。 -## 🚀 使用方式 +具体添加可访问 [项目源代码](https://github.com/ourongxing/newsnow/tree/main/server/sources),根据里面的文件名,在 `config/config.yaml` 文件中修改 `platforms` 配置: -### 方式一:GitHub Actions 远程运行(推荐) +```yaml +platforms: + - id: "toutiao" + name: "今日头条" + - id: "baidu" + name: "百度热搜" + - id: "wallstreetcn-hot" + name: "华尔街见闻" + # 添加更多平台... +``` +如果不会看的话,就直接复制他人整理好的部分[平台配置](https://github.com/sansan0/TrendRadar/issues/95) -1. **Fork 本项目**到你的 GitHub 账户 +
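+
+> 这些平台 id 对应 newsnow 的数据接口。旧版文档记录的请求格式是 `https://newsnow.busiyi.world/api/s?id={平台ID}&latest`,返回含 `items`(每项有 `title`、`url`)的 JSON。下面是一段抓取示意(非项目源码,仅演示接口用法):
+
+```python
+import requests
+
+def fetch_platform(platform_id: str) -> list[dict]:
+    """按 newsnow 接口格式拉取单个平台的热搜榜单(示意)。"""
+    url = f"https://newsnow.busiyi.world/api/s?id={platform_id}&latest"
+    data = requests.get(url, timeout=10).json()
+    return data.get("items", [])  # 每项含 title / url 字段
+
+for item in fetch_platform("baidu")[:3]:
+    print(item["title"], item["url"])
+```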
- - 点击本页面右上角的"Fork"按钮 +### **智能推送策略** + +**三种推送模式**: + +| 模式 | 适用人群 | 推送时机 | 显示内容 | 适用场景 | +|------|----------|----------|----------|----------| +| **当日汇总**
`daily` | 📋 企业管理者/普通用户 | 按时推送(默认每小时推送一次) | 当日所有匹配新闻
+ 新增新闻区域 | 日报总结
全面了解当日热点趋势 | +| **当前榜单**
`current` | 📰 自媒体人/内容创作者 | 按时推送(默认每小时推送一次) | 当前榜单匹配新闻
+ 新增新闻区域 | 实时热点追踪
了解当前最火的内容 | +| **增量监控**
`incremental` | 📈 投资者/交易员 | 有新增才推送 | 新出现的匹配频率词新闻 | 避免重复信息干扰
高频监控场景 | + +**附加功能 - 推送时间窗口控制**(可选): + +此功能独立于上述三种推送模式,可与任意模式搭配使用: + +- **时间窗口限制**: 设定推送时间范围(如 09:00-18:00 或 20:00-22:00),只在指定时间内推送 +- **推送频率控制**: + - 窗口内多次推送: 时间窗口内每次执行都推送 + - 每天仅推送一次: 时间窗口内只推送一次(适合当日汇总或当前榜单模式) +- **典型场景**: + - 工作时间推送: 只在工作日 09:00-18:00 接收消息 + - 晚间汇总推送: 希望在晚上固定时间(如 20:00-22:00)收到汇总 + - 避免打扰: 防止非工作时间收到推送通知 + +> 提示: 此功能默认关闭,需在 `config/config.yaml` 中手动启用 `push_window.enabled` -2. **设置 GitHub Secrets**: +### **精准内容筛选** - - 在你 Fork 后的仓库中,进入`Settings` > `Secrets and variables` > `Actions` - - 点击"New repository secret" - - 名称填写`FEISHU_WEBHOOK_URL` - - 值填写你的飞书机器人 Webhook 地址(webhook 获取,请直接跳转到下方的 "🤖 飞书机器人设置") - - 点击"Add secret"保存 +设置个人关键词(如:AI、比亚迪、教育政策),只推送相关热点,过滤无关信息 -3. **自定义关键词**: +- 支持普通词、必须词(+)、过滤词(!)三种语法,见【frequency_words.txt 配置教程】 +- 词组化管理,独立统计不同主题热点 - - 修改`frequency_words.txt`文件,添加你需要监控的频率词和过滤词 +> 也可以不做筛选,完整的推送所有热点,具体见【历史更新】中的 v2.0.1 -4. **自动运行**: +
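+
+> 上面「推送时间窗口控制」的判断逻辑大致如下(Python 示意,非项目源码;配置键名按 `notification.push_window` 的描述假定):
+
+```python
+from datetime import datetime, time
+
+def is_within_push_window(config: dict, now: datetime) -> bool:
+    """判断本次执行是否落在推送时间窗口内(示意)。"""
+    window = config.get("notification", {}).get("push_window", {})
+    if not window.get("enabled", False):
+        return True  # 功能默认关闭:不限制推送时间
+    time_range = window.get("time_range", {})
+    start = time.fromisoformat(time_range.get("start", "09:00"))
+    end = time.fromisoformat(time_range.get("end", "18:00"))
+    return start <= now.time() <= end
+```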
+👉 点击展开:frequency_words.txt 配置教程 +
- - 项目已包含`.github/workflows/crawler.yml`配置文件,默认每 50 分钟自动运行一次 - - 你也可以在 GitHub 仓库的 Actions 页面手动触发运行 +在 `frequency_words.txt` 文件中配置监控的关键词,支持三种语法和词组功能。 -5. **查看结果**: - - 运行结果将自动保存在仓库的`output`目录中 - - 同时通过飞书机器人发送通知到你的群组 +关键词越靠前,新闻的优先级越高,你可以根据自己的关注度调整关键词顺序 -6. **增加或减少平台**: -如果想支持更多平台或者不想看某些歪屁股平台,可以访问newsnow的源代码:https://github.com/ourongxing/newsnow/tree/main/server/sources ,根据里面的文件名自己来修改 main.py 中的下面代码,可以在你 Fork 的项目上直接修改源码 +| 语法类型 | 符号 | 作用 | 示例 | 匹配逻辑 | +|---------|------|------|------|---------| +| **普通词** | 无 | 基础匹配 | `华为` | 包含任意一个即可 | +| **必须词** | `+` | 限定范围 | `+手机` | 必须同时包含 | +| **过滤词** | `!` | 排除干扰 | `!广告` | 包含则直接排除 | +### 📋 基础语法说明 + +#### 1. **普通关键词** - 基础匹配 +```txt +华为 +OPPO +苹果 ``` - ids = [ - ("toutiao", "今日头条"), - ("baidu", "百度热搜"), - ("wallstreetcn-hot", "华尔街见闻"), - ("thepaper", "澎湃新闻"), - ("bilibili-hot-search", "bilibili 热搜"), - ("cls-hot", "财联社热门"), - "tieba", - "weibo", - "douyin", - "zhihu", - ] +**作用:** 新闻标题包含其中**任意一个词**就会被捕获 + +#### 2. **必须词** `+词汇` - 限定范围 +```txt +华为 +OPPO ++手机 +``` +**作用:** 必须同时包含普通词**和**必须词才会被捕获 + +#### 3. **过滤词** `!词汇` - 排除干扰 +```txt +苹果 +华为 +!水果 +!价格 ``` +**作用:** 包含过滤词的新闻会被**直接排除**,即使包含关键词 -### 方式二:本地运行 +### 🔗 词组功能 - 空行分隔的重要作用 -1. **克隆项目**到本地: +**核心规则:** 用**空行**分隔不同的词组,每个词组独立统计 -```bash -git clone https://github.com/sansan0/TrendRadar.git -cd TrendRadar +#### 示例配置: +```txt +iPhone +华为 +OPPO ++发布 + +A股 +上证 +深证 ++涨跌 +!预测 + +世界杯 +欧洲杯 +亚洲杯 ++比赛 ``` -2. **安装依赖**: +#### 词组解释及匹配效果: + +**第1组 - 手机新品类:** +- 关键词:iPhone、华为、OPPO +- 必须词:发布 +- 效果:必须包含手机品牌名,同时包含"发布" + +**匹配示例:** +- ✅ "iPhone 15正式发布售价公布" ← 有"iPhone"+"发布" +- ✅ "华为Mate60系列发布会直播" ← 有"华为"+"发布" +- ✅ "OPPO Find X7发布时间确定" ← 有"OPPO"+"发布" +- ❌ "iPhone销量创新高" ← 有"iPhone"但缺少"发布" + +**第2组 - 股市行情类:** +- 关键词:A股、上证、深证 +- 必须词:涨跌 +- 过滤词:预测 +- 效果:包含股市相关词,同时包含"涨跌",但排除包含"预测"的内容 + +**匹配示例:** +- ✅ "A股今日大幅涨跌分析" ← 有"A股"+"涨跌" +- ✅ "上证指数涨跌原因解读" ← 有"上证"+"涨跌" +- ❌ "专家预测A股涨跌趋势" ← 有"A股"+"涨跌"但包含"预测" +- ❌ "A股成交量创新高" ← 有"A股"但缺少"涨跌" + +**第3组 - 足球赛事类:** +- 关键词:世界杯、欧洲杯、亚洲杯 +- 必须词:比赛 +- 效果:必须包含杯赛名称,同时包含"比赛" + +**匹配示例:** +- ✅ "世界杯小组赛比赛结果" ← 有"世界杯"+"比赛" +- ✅ "欧洲杯决赛比赛时间" ← 有"欧洲杯"+"比赛" +- ❌ "世界杯门票开售" ← 有"世界杯"但缺少"比赛" + +### 🎯 配置技巧 + +#### 1. **从宽到严的配置策略** +```txt +# 第一步:先用宽泛关键词测试 +人工智能 +AI +ChatGPT -```bash -pip install requests pytz +# 第二步:发现误匹配后,加入必须词限定 +人工智能 +AI +ChatGPT ++技术 + +# 第三步:发现干扰内容后,加入过滤词 +人工智能 +AI +ChatGPT ++技术 +!广告 +!培训 ``` -3. **配置飞书 Webhook URL**(两种方式): +#### 2. **避免过度复杂** +❌ **不推荐:** 一个词组包含太多词汇 +```txt +华为 +OPPO +苹果 +三星 +vivo +一加 +魅族 ++手机 ++发布 ++销量 +!假货 +!维修 +!二手 +``` - - 方式 1:直接在代码顶部的`CONFIG`字典中修改`FEISHU_WEBHOOK_URL`的值 - - 方式 2:设置环境变量`FEISHU_WEBHOOK_URL`(优先级更高) +✅ **推荐:** 拆分成多个精确的词组 +```txt +华为 +OPPO ++新品 -4. **创建或修改关键词**: +苹果 +三星 ++发布 - - 编辑`frequency_words.txt`文件,添加你需要监控的频率词和过滤词 +手机 +销量 ++市场 +``` -5. **运行程序**: +
-```bash -python main.py + +### **热点趋势分析** + +实时追踪新闻热度变化,让你不仅知道"什么在热搜",更了解"热点如何演变" + +- **时间轴追踪**:记录每条新闻从首次出现到最后出现的完整时间跨度 +- **热度变化**:统计新闻在不同时间段的排名变化和出现频次 +- **新增检测**:实时识别新出现的热点话题,用🆕标记第一时间提醒 +- **持续性分析**:区分一次性热点话题和持续发酵的深度新闻 +- **跨平台对比**:同一新闻在不同平台的排名表现,看出媒体关注度差异 + +> 不再错过重要新闻的完整发展过程,从话题萌芽到高峰热议,全程掌握 + +
+👉 点击展开:推送格式说明 +
+ +📊 热点词汇统计 + +🔥 [1/3] AI ChatGPT : 2 条 + + 1. [百度热搜] 🆕 ChatGPT-5正式发布 [**1**] - 09时15分 (1次) + + 2. [今日头条] AI芯片概念股暴涨 [**3**] - [08时30分 ~ 10时45分] (3次) + +━━━━━━━━━━━━━━━━━━━ + +📈 [2/3] 比亚迪 特斯拉 : 2 条 + + 1. [微博] 🆕 比亚迪月销量破纪录 [**2**] - 10时20分 (1次) + + 2. [抖音] 特斯拉降价促销 [**4**] - [07时45分 ~ 09时15分] (2次) + +━━━━━━━━━━━━━━━━━━━ + +📌 [3/3] A股 股市 : 1 条 + + 1. [华尔街见闻] A股午盘点评分析 [**5**] - [11时30分 ~ 12时00分] (2次) + +🆕 本次新增热点新闻 (共 2 条) + +**百度热搜** (1 条): + 1. ChatGPT-5正式发布 [**1**] + +**微博** (1 条): + 1. 比亚迪月销量破纪录 [**2**] + +更新时间:2025-01-15 12:30:15 + + +## **消息格式说明** + +| 格式元素 | 示例 | 含义 | 说明 | +| ------------- | --------------------------- | ------------ | --------------------------------------- | +| 🔥📈📌 | 🔥 [1/3] AI ChatGPT | 热度等级 | 🔥高热度(≥10条) 📈中热度(5-9条) 📌普通热度(<5条) | +| [序号/总数] | [1/3] | 排序位置 | 当前词组在所有匹配词组中的排名 | +| 频率词组 | AI ChatGPT | 关键词组 | 配置文件中的词组,标题必须包含其中词汇 | +| : N 条 | : 2 条 | 匹配数量 | 该词组匹配的新闻总数 | +| [平台名] | [百度热搜] | 来源平台 | 新闻所属的平台名称 | +| 🆕 | 🆕 ChatGPT-5正式发布 | 新增标记 | 本轮抓取中首次出现的热点 | +| [**数字**] | [**1**] | 高排名 | 排名≤阈值的热搜,红色加粗显示 | +| [数字] | [7] | 普通排名 | 排名>阈值的热搜,普通显示 | +| - 时间 | - 09时15分 | 首次时间 | 该新闻首次被发现的时间 | +| [时间~时间] | [08时30分 ~ 10时45分] | 持续时间 | 从首次出现到最后出现的时间范围 | +| (N次) | (3次) | 出现频率 | 在监控期间出现的总次数 | +| **新增区域** | 🆕 **本次新增热点新闻** | 新话题汇总 | 单独展示本轮新出现的热点话题 | + +
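+
+> 上表各元素拼成一条新闻行的方式,大致如下(Python 示意,非项目源码;排名高亮阈值可在配置中调整,这里假定为 5):
+
+```python
+def format_news_line(platform: str, title: str, rank: int, first_time: str,
+                     count: int, is_new: bool, rank_threshold: int = 5) -> str:
+    """按「消息格式说明」把一条新闻拼成推送行(示意)。"""
+    new_mark = "🆕 " if is_new else ""
+    # 排名 ≤ 阈值:加粗高亮;否则普通显示
+    rank_str = f"[**{rank}**]" if rank <= rank_threshold else f"[{rank}]"
+    return f"[{platform}] {new_mark}{title} {rank_str} - {first_time} ({count}次)"
+
+print(format_news_line("百度热搜", "ChatGPT-5正式发布", 1, "09时15分", 1, True))
+# [百度热搜] 🆕 ChatGPT-5正式发布 [**1**] - 09时15分 (1次)
+```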
+ + +### **个性化热点算法** + +不再被各个平台的算法牵着走,TrendRadar 会重新整理全网热搜: + +- **看重排名高的新闻**(占60%):各平台前几名的新闻优先显示 +- **关注持续出现的话题**(占30%):反复出现的新闻更重要 +- **考虑排名质量**(占10%):不仅多次出现,还经常排在前列 + +> 把分散在各个平台的热搜合并起来,按照你关心的热度重新排序,这三个比例可以选择适合自己的场景进行调整 + +
+👉 点击展开:热点权重调整 +
+ +当前默认的配置是平衡性配置 + +### 两个核心场景 + +**追实时热点型**: +```yaml +weight: + rank_weight: 0.8 # 主要看排名 + frequency_weight: 0.1 # 不太在乎持续性 + hotness_weight: 0.1 ``` +**适用人群**:自媒体博主、营销人员、想快速了解当下最火话题的用户 + +**追深度话题型**: +```yaml +weight: + rank_weight: 0.4 # 适度看排名 + frequency_weight: 0.5 # 重视当天内的持续热度 + hotness_weight: 0.1 +``` +**适用人群**:投资者、研究人员、新闻工作者、需要深度分析趋势的用户 -程序将自动爬取热点数据,生成报告,并在本地浏览器中打开 HTML 统计页面。 +### 调整的方法 +1. **三个数字加起来必须等于 1.0** +2. **哪个重要就调大哪个**:在乎排名就调大 rank_weight,在乎持续性就调大 frequency_weight +3. **建议每次只调 0.1-0.2**,观察效果 -## ⚙️ 配置说明 +核心思路:追求速度和时效性的用户提高排名权重,追求深度和稳定性的用户提高频次权重。 -### 全局配置项 +
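+
+> 合成分数的计算方式如下(Python 示意,非项目源码;假定三个分项都已折算到 0~1 区间):
+
+```python
+def combined_score(rank_score: float, frequency_score: float,
+                   hotness_score: float, weights: dict) -> float:
+    """按 config.yaml 的 weight 配置加权合成新闻热度分(示意)。"""
+    total = weights["rank_weight"] + weights["frequency_weight"] + weights["hotness_weight"]
+    assert abs(total - 1.0) < 1e-6, "三个权重之和必须等于 1.0"
+    return (weights["rank_weight"] * rank_score              # 排名:各平台排得越靠前分越高
+            + weights["frequency_weight"] * frequency_score  # 频次:当天反复出现越多分越高
+            + weights["hotness_weight"] * hotness_score)     # 质量:多次出现且常居前列
+
+# 默认的平衡性配置:排名60% + 频次30% + 热度10%
+weights = {"rank_weight": 0.6, "frequency_weight": 0.3, "hotness_weight": 0.1}
+print(combined_score(0.9, 0.5, 0.7, weights))  # ≈ 0.76
+```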
-代码顶部的`CONFIG`字典包含了所有可配置的选项: +### **多渠道实时推送** -```python -CONFIG = { - "FEISHU_SEPARATOR": "==============================", # 飞书消息分割线 - "REQUEST_INTERVAL": 1000, # 请求间隔(毫秒) - "FEISHU_REPORT_TYPE": "daily", # 可选: "current", "daily", "both" - "RANK_THRESHOLD": 5, # 排名阈值,决定使用【】还是[]的界限 - "USE_PROXY": False, # 是否启用本地代理 - "DEFAULT_PROXY": "http://127.0.0.1:10086", # 默认代理地址 - "CONTINUE_WITHOUT_FEISHU": False, # 是否在没有飞书webhook URL时继续执行爬虫 - "FEISHU_WEBHOOK_URL": "" # 飞书机器人的webhook URL,默认为空 -} +支持**企业微信**(+ 微信推送方案)、**飞书**、**钉钉**、**Telegram**、**邮件**、**ntfy**,消息直达手机和邮箱 + +### **多端适配** +- **GitHub Pages**:自动生成精美网页报告,PC/移动端适配 +- **Docker部署**:支持多架构容器化运行 +- **数据持久化**:HTML/TXT多格式历史记录保存 + + +### **AI 智能分析(v3.0.0 新增)** + +基于 MCP (Model Context Protocol) 协议的 AI 对话分析系统,让你用自然语言深度挖掘新闻数据 + +- **对话式查询**:用自然语言提问,如"查询昨天知乎的热点"、"分析比特币最近的热度趋势" +- **13 种分析工具**:涵盖基础查询、智能检索、趋势分析、数据洞察、情感分析等 +- **多客户端支持**:Cherry Studio(GUI 配置)、Claude Desktop、Cursor、Cline 等 +- **深度分析能力**: + - 话题趋势追踪(热度变化、生命周期、爆火检测、趋势预测) + - 跨平台数据对比(活跃度统计、关键词共现) + - 智能摘要生成、相似新闻查找、历史关联检索 + +> 告别手动翻阅数据文件,AI 助手帮你秒懂新闻背后的故事 + +### **零技术门槛部署** + +GitHub 一键 Fork 即可使用,无需编程基础。 + +> 30秒部署: GitHub Pages(网页浏览)支持一键保存成图片,随时分享给他人 +> +> 1分钟部署: 企业微信(手机通知) + +**💡 提示:** 想要**实时更新**的网页版?fork 后,进入你的仓库 Settings → Pages,启用 GitHub Pages。[效果预览](https://sansan0.github.io/TrendRadar/)。 + +### **减少 APP 依赖** + +从"被算法推荐绑架"变成"主动获取自己想要的信息" + +**适合人群:** 投资者、自媒体人、企业公关、关心时事的普通用户 + +**典型场景:** 股市投资监控、品牌舆情追踪、行业动态关注、生活资讯获取 + + +| Github Pages 效果(手机端适配、邮箱推送效果) | 飞书推送效果 | +|:---:|:---:| +| ![Github Pages效果](_image/github-pages.png) | ![飞书推送效果](_image/feishu.jpg) | + + +## 📝 更新日志 + +>**升级说明**: +- **提示**:不要通过 **Sync fork** 更新本项目, 建议查看【历史更新】,明确具体的【升级方式】和【功能内容】 +- **小版本更新**:从 v2.x 升级到 v2.y, 用本项目的 `main.py` 代码替换你 fork 仓库中的对应文件 +- **大版本升级**:从 v1.x 升级到 v2.y, 建议删除现有 fork 后重新 fork,这样更省力且避免配置冲突 + + +### 2025/11/12 - v3.0.5 + +- 修复邮件发送 SSL/TLS 端口配置逻辑错误 +- 优化邮箱服务商(QQ/163/126)默认使用 465 端口(SSL) +- **新增 Docker 环境变量支持**:核心配置项(`enable_crawler`、`report_mode`、`push_window` 等)支持通过环境变量覆盖,解决 NAS 用户修改配置文件不生效的问题(详见 [🐳 Docker 部署](#-docker-部署) 章节) + + + +### 2025/10/26 - mcp-v1.0.1 + + **MCP 模块更新:** + - 修复日期查询参数传递错误 + - 统一所有工具的时间参数格式 + + +
+👉 点击展开:历史更新 + + +### 2025/10/31 - v3.0.4 + +- 解决飞书因推送内容过长而产生的错误,实现了分批推送 + + +### 2025/10/23 - v3.0.3 + +- 扩大 ntfy 错误信息显示范围 + + +### 2025/10/21 - v3.0.2 + +- 修复 ntfy 推送编码问题 + +### 2025/10/20 - v3.0.0 + +**重大更新 - AI 分析功能上线** 🤖 + +- **核心功能**: + - 新增基于 MCP (Model Context Protocol) 的 AI 分析服务器 + - 支持13种智能分析工具:基础查询、智能检索、高级分析、系统管理 + - 自然语言交互:通过对话方式查询和分析新闻数据 + - 多客户端支持:Claude Desktop、Cherry Studio、Cursor、Cline 等 + +- **分析能力**: + - 话题趋势分析(热度追踪、生命周期、爆火检测、趋势预测) + - 数据洞察(平台对比、活跃度统计、关键词共现) + - 情感分析、相似新闻查找、智能摘要生成 + - 历史相关新闻检索、多模式搜索 + +- **更新提示**: + - 这是独立的 AI 分析功能,不影响现有的推送功能 + - 可选择性使用,无需升级现有部署 + + +### 2025/10/15 - v2.4.4 + +- **更新内容**: + - 修复 ntfy 推送编码问题 + 1 + - 修复推送时间窗口判断问题 + +- **更新提示**: + - 建议【小版本升级】 + + +### 2025/10/10 - v2.4.3 + +> 感谢 [nidaye996](https://github.com/sansan0/TrendRadar/issues/98) 发现的体验问题 + +- **更新内容**: + - 重构"静默推送模式"命名为"推送时间窗口控制",提升功能理解度 + - 明确推送时间窗口作为可选附加功能,可与三种推送模式搭配使用 + - 改进注释和文档描述,使功能定位更加清晰 + +- **更新提示**: + - 这个仅仅是重构,可以不用升级 + + +### 2025/10/8 - v2.4.2 + +- **更新内容**: + - 修复 ntfy 推送编码问题 + - 修复配置文件缺失问题 + - 优化 ntfy 推送效果 + - 增加 github page 图片分段导出功能 + +- **更新提示**: + - 建议使用【大版本更新】 + + +### 2025/10/2 - v2.4.0 + +**新增 ntfy 推送通知** + +- **核心功能**: + - 支持 ntfy.sh 公共服务和自托管服务器 + +- **使用场景**: + - 适合追求隐私的用户(支持自托管) + - 跨平台推送(iOS、Android、Desktop、Web) + - 无需注册账号(公共服务器) + - 开源免费(MIT 协议) + +- **更新提示**: + - 建议使用【大版本更新】 + + +### 2025/09/26 - v2.3.2 + +- 修正了邮件通知配置检查被遗漏的问题([#88](https://github.com/sansan0/TrendRadar/issues/88)) + +**修复说明**: +- 解决了即使正确配置邮件通知,系统仍提示"未配置任何webhook"的问题 + +### 2025/09/22 - v2.3.1 + +- **新增邮件推送功能**,支持将热点新闻报告发送到邮箱 +- **智能 SMTP 识别**:自动识别 Gmail、QQ邮箱、Outlook、网易邮箱等 10+ 种邮箱服务商配置 +- **HTML 精美格式**:邮件内容采用与网页版相同的 HTML 格式,排版精美,移动端适配 +- **批量发送支持**:支持多个收件人,用逗号分隔即可同时发送给多人 +- **自定义 SMTP**:可自定义 SMTP 服务器和端口 +- 修复Docker构建网络连接问题 + +**使用说明**: +- 适用场景:适合需要邮件归档、团队分享、定时报告的用户 +- 支持邮箱:Gmail、QQ邮箱、Outlook/Hotmail、163/126邮箱、新浪邮箱、搜狐邮箱等 + +**更新提示**: +- 此次更新的内容比较多,如果想升级,建议采用【大版本升级】 + +### 2025/09/17 - v2.2.0 + +- 新增一键保存新闻图片功能,让你轻松分享关注的热点 + +**使用说明**: +- 适用场景:当你按照教程开启了网页版功能后(GitHub Pages) +- 使用方法:用手机或电脑打开该网页链接,点击页面顶部的"保存为图片"按钮 +- 实际效果:系统会自动将当前的新闻报告制作成一张精美图片,保存到你的手机相册或电脑桌面 +- 分享便利:你可以直接把这张图片发给朋友、发到朋友圈,或分享到工作群,让别人也能看到你发现的重要资讯 + +### 2025/09/13 - v2.1.2 + +- 解决钉钉的推送容量限制导致的新闻推送失败问题(采用分批推送) + +### 2025/09/04 - v2.1.1 + +- 修复docker在某些架构中无法正常运行的问题 +- 正式发布官方 Docker 镜像 wantcat/trendradar,支持多架构 +- 优化 Docker 部署流程,无需本地构建即可快速使用 + +### 2025/08/30 - v2.1.0 + +**核心改进**: +- **推送逻辑优化**:从"每次执行都推送"改为"时间窗口内可控推送" +- **时间窗口控制**:可设定推送时间范围,避免非工作时间打扰 +- **推送频率可选**:时间段内支持单次推送或多次推送 + +**更新提示**: +- 本功能默认关闭,需手动在 config.yaml 中开启推送时间窗口控制 +- 升级需同时更新 main.py 和 config.yaml 两个文件 + +### 2025/08/27 - v2.0.4 + +- 本次版本不是功能修复,而是重要提醒 +- 请务必妥善保管好 webhooks,不要公开,不要公开,不要公开 +- 如果你以 fork 的方式将本项目部署在 GitHub 上,请将 webhooks 填入 GitHub Secret,而非 config.yaml +- 如果你已经暴露了 webhooks 或将其填入了 config.yaml,建议删除后重新生成 + +### 2025/08/06 - v2.0.3 + +- 优化 github page 的网页版效果,方便移动端使用 + +### 2025/07/28 - v2.0.2 + +- 重构代码 +- 解决版本号容易被遗漏修改的问题 + +### 2025/07/27 - v2.0.1 + +**修复问题**: + +1. docker 的 shell 脚本的换行符为 CRLF 导致的执行异常问题 +2. 
frequency_words.txt 为空时,导致新闻发送也为空的逻辑问题 + - 修复后,当你选择 frequency_words.txt 为空时,将**推送所有新闻**,但受限于消息推送大小限制,请做如下调整 + - 方案一:关闭手机推送,只选择 Github Pages 布置(这是能获得最完整信息的方案,将把所有平台的热点按照你**自定义的热搜算法**进行重新排序) + - 方案二:减少推送平台,优先选择**企业微信**或**Telegram**,这两个推送我做了分批推送功能(因为分批推送影响推送体验,且只有这两个平台只给一点点推送容量,所以才不得已做了分批推送功能,但至少能保证获得的信息完整) + - 方案三:可与方案二结合,模式选择 current 或 incremental 可有效减少一次性推送的内容 + +### 2025/07/17 - v2.0.0 + +**重大重构**: +- 配置管理重构:所有配置现在通过 `config/config.yaml` 文件管理(main.py 我依旧没拆分,方便你们复制升级) +- 运行模式升级:支持三种模式 - `daily`(当日汇总)、`current`(当前榜单)、`incremental`(增量监控) +- Docker 支持:完整的 Docker 部署方案,支持容器化运行 + +**配置文件说明**: +- `config/config.yaml` - 主配置文件(应用设置、爬虫配置、通知配置、平台配置等) +- `config/frequency_words.txt` - 关键词配置(监控词汇设置) + +### 2025/07/09 - v1.4.1 + +**功能新增**:增加增量推送(在 main.py 头部配置 FOCUS_NEW_ONLY),该开关只关心新话题而非持续热度,只在有新内容时才发通知。 + +**修复问题**: 某些情况下,由于新闻本身含有特殊符号导致的偶发性排版异常。 + +### 2025/06/23 - v1.3.0 + +企业微信 和 Telegram 的推送消息有长度限制,对此我采用将消息拆分推送的方式。开发文档详见[企业微信](https://developer.work.weixin.qq.com/document/path/91770) 和 [Telegram](https://core.telegram.org/bots/api) + +### 2025/06/21 - v1.2.1 + +在本版本之前的旧版本,不仅 main.py 需要复制替换, crawler.yml 也需要你复制替换 +https://github.com/sansan0/TrendRadar/blob/master/.github/workflows/crawler.yml + +### 2025/06/19 - v1.2.0 + +> 感谢 claude research 整理的各平台 api ,让我快速完成各平台适配(虽然代码更多冗余了~ + +1. 支持 telegram ,企业微信,钉钉推送渠道, 支持多渠道配置和同时推送 + +### 2025/06/18 - v1.1.0 + +> **200 star⭐** 了, 继续给大伙儿助兴~近期,在我的"怂恿"下,挺多人在我公众号点赞分享推荐助力了我,我都在后台看见了具体账号的鼓励数据,很多都成了天使轮老粉(我玩公众号才一个多月,虽然注册是七八年前的事了哈哈,属于上车早,发车晚),但因为你们没有留言或私信我,所以我也无法一一回应并感谢支持,在此一并谢谢! + +1. 重要的更新,加了权重,你现在看到的新闻都是最热点最有关注度的出现在最上面 +2. 更新文档使用,因为近期更新了很多功能,而且之前的使用文档我偷懒写的简单(见下面的 ⚙️ frequency_words.txt 配置完整教程) + +### 2025/06/16 - v1.0.0 + +1. 增加了一个项目新版本更新提示,默认打开,如要关掉,可以在 main.py 中把 "FEISHU_SHOW_VERSION_UPDATE": True 中的 True 改成 False 即可 + +### 2025/06/13+14 + +1. 去掉了兼容代码,之前 fork 的同学,直接复制代码会在当天显示异常(第二天会恢复正常) +2. feishu 和 html 底部增加一个新增新闻显示 + +### 2025/06/09 + +**100 star⭐** 了,写个小功能给大伙儿助助兴 +frequency_words.txt 文件增加了一个【必须词】功能,使用 + 号 + +1. 必须词语法如下: + 唐僧或者猪八戒必须在标题里同时出现,才会收录到推送新闻中 + +``` ++唐僧 ++猪八戒 ``` -主要配置项说明: +2. 过滤词的优先级更高: + 如果标题中过滤词匹配到唐僧念经,那么即使必须词里有唐僧,也不显示 + +``` ++唐僧 +!唐僧念经 +``` + +### 2025/06/02 + +1. **网页**和**飞书消息**支持手机直接跳转详情新闻 +2. 优化显示效果 + 1 + +### 2025/05/26 + +1. 飞书消息显示效果优化 + + + + + + +
+| 优化前 | 优化后 |
+|:---:|:---:|
+| (截图:飞书消息界面 - 优化前) | (截图:飞书消息界面 - 优化后) |
+
+ + +## 🚀 快速开始 + +> 配置完成后,新闻数据一小时后才会更新,如想加快,可参照【第4步】手动测试配置效果 + +1. **Fork 本项目**到你的 GitHub 账户 + + - 点击本页面右上角的"Fork"按钮 + +2. **设置 GitHub Secrets(选择你需要的平台)**: + + 在你 Fork 后的仓库中,进入 `Settings` > `Secrets and variables` > `Actions` > `New repository secret`,然后根据需要配置以下任一或多个通知平台: + + 可以同时配置多个平台,系统会向所有配置的平台发送通知。 + + 效果类似下图,一个 name 对应一个 secret,保存完就行,你重新编辑看不到 secret 是正常情况。 + + GitHub Secrets + + +
+ 👉 点击展开:企业微信机器人(配置最简单最迅速) +
+ + **GitHub Secret 配置:** + - 名称:`WEWORK_WEBHOOK_URL` + - 值:你的企业微信机器人 Webhook 地址 -- `REQUEST_INTERVAL`: 控制爬取不同平台之间的时间间隔 -- `FEISHU_REPORT_TYPE`: 控制发送到飞书的报告类型 - - `current`: 只发送当前爬取结果 - - `daily`: 只发送当日汇总 - - `both`: 两者都发送 -- `RANK_THRESHOLD`: 排名显示阈值,小于等于此值的排名使用【】,大于此值使用[] -- `USE_PROXY`: 是否在本地运行时使用代理 -- `DEFAULT_PROXY`: 本地代理地址 -- `CONTINUE_WITHOUT_FEISHU`: 如果为`True`,即使没有飞书 webhook URL 也会执行爬虫;如果为`False`,则不执行 -- `FEISHU_WEBHOOK_URL`: 飞书机器人的 webhook URL,可以直接在此设置 +
-### 频率词和过滤词 + **机器人设置步骤:** -在`frequency_words.txt`文件中配置监控的频率词和过滤词: + #### 手机端设置: + 1. 打开企业微信 App → 进入目标内部群聊 + 2. 点击右上角"…"按钮 → 选择"消息推送" + 3. 点击"添加" → 名称输入"TrendRadar" + 4. 复制 Webhook 地址,点击保存,复制的内容配置到上方的 GitHub Secret 中 -- 每组相关的频率词用换行分隔,不同组之间用空行分隔 -- 以`!`开头的词为过滤词 -- 如果一个标题既包含频率词又包含过滤词,则该标题不会被统计 + #### PC 端设置流程类似 +
-示例: +
+ 👉 点击展开:飞书机器人(消息显示最友好) +
+ **GitHub Secret 配置:** + - 名称:`FEISHU_WEBHOOK_URL` + - 值:你的飞书机器人 Webhook 地址(该链接开头类似 https://www.feishu.cn/flow/api/trigger-webhook/********) +
+ + 有两个方案,**方案一**配置简单,**方案二**配置复杂(但是稳定推送) + + 其中方案一,由 **ziventian**发现并提供建议,在这里感谢他,默认是个人推送,也可以配置群组推送操作[#97](https://github.com/sansan0/TrendRadar/issues/97) , + + **方案一:** + + > 对部分人存在额外操作,否则会报"系统错误"。需要手机端搜索下机器人,然后开启飞书机器人应用(该建议来自于网友,可参考) + + 1. 电脑浏览器打开 https://botbuilder.feishu.cn/home/my-command + + 2. 点击"新建机器人指令" + + 3. 点击"选择触发器",往下滑动,点击"Webhook 触发" + + 4. 此时你会看到"Webhook 地址",把这个链接先复制到本地记事本暂存,继续接下来的操作 + + 5. "参数"里面放上下面的内容,然后点击"完成" + + ```json + { + "message_type": "text", + "content": { + "total_titles": "{{内容}}", + "timestamp": "{{内容}}", + "report_type": "{{内容}}", + "text": "{{内容}}" + } + } + ``` + + 6. 点击"选择操作" > "通过官方机器人发消息" + + 7. 消息标题填写"TrendRadar 热点监控" + + 8. 最关键的部分来了,点击 + 按钮,选择"Webhook 触发",然后按照下面的图片摆放 + + ![飞书机器人配置示例](_image/image.png) + + 9. 配置完成后,将第 4 步复制的 Webhook 地址配置到 GitHub Secrets 中的 `FEISHU_WEBHOOK_URL` + +
+ + **方案二:** + + 1. 电脑浏览器打开 https://botbuilder.feishu.cn/home/my-app + + 2. 点击"新建机器人应用" + + 3. 进入创建的应用后,点击"流程涉及" > "创建流程" > "选择触发器" + + 4. 往下滑动,点击"Webhook 触发" + + 5. 此时你会看到"Webhook 地址",把这个链接先复制到本地记事本暂存,继续接下来的操作 + + 6. "参数"里面放上下面的内容,然后点击"完成" + + ```json + { + "message_type": "text", + "content": { + "total_titles": "{{内容}}", + "timestamp": "{{内容}}", + "report_type": "{{内容}}", + "text": "{{内容}}" + } + } + ``` + + 7. 点击"选择操作" > "发送飞书消息",勾选 "群消息",然后点击下面的输入框,点击"我管理的群组"(如果没有群组,你可以在飞书 app 上创建群组) + + 8. 消息标题填写"TrendRadar 热点监控" + + 9. 最关键的部分来了,点击 + 按钮,选择"Webhook 触发",然后按照下面的图片摆放 + + ![飞书机器人配置示例](_image/image.png) + + 10. 配置完成后,将第 5 步复制的 Webhook 地址配置到 GitHub Secrets 中的 `FEISHU_WEBHOOK_URL` + +
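+
+> 两个方案拿到的 Webhook 接收的都是上面"参数"里定义的那组字段,可用下面的小脚本手动触发一次、验证流程是否配置成功(示意,非项目源码):
+
+```python
+import requests
+
+webhook = "你的飞书 Webhook 地址"
+payload = {
+    "message_type": "text",
+    "content": {
+        "total_titles": "1",
+        "timestamp": "2025-01-15 12:30:15",
+        "report_type": "测试",
+        "text": "TrendRadar 测试消息",
+    },
+}
+print(requests.post(webhook, json=payload, timeout=10).status_code)  # 200 即触发成功
+```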
+ +
+ 👉 点击展开:钉钉机器人 +
+ + **GitHub Secret 配置:** + - 名称:`DINGTALK_WEBHOOK_URL` + - 值:你的钉钉机器人 Webhook 地址 + +
+ + **机器人设置步骤:** + + 1. **创建机器人(仅 PC 端支持)**: + - 打开钉钉 PC 客户端,进入目标群聊 + - 点击群设置图标(⚙️)→ 往下翻找到"机器人"点开 + - 选择"添加机器人" → "自定义" + + 2. **配置机器人**: + - 设置机器人名称 + - **安全设置**: + - **自定义关键词**:设置 "热点" + + 3. **完成设置**: + - 勾选服务条款协议 → 点击"完成" + - 复制获得的 Webhook URL + - 将 URL 配置到 GitHub Secrets 中的 `DINGTALK_WEBHOOK_URL` + + **注意**:移动端只能接收消息,无法创建新机器人。 +
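+
+> 可以用下面的小脚本验证钉钉 Webhook(示意,非项目源码;因为安全设置里填了自定义关键词"热点",消息内容必须包含这个词才能通过校验):
+
+```python
+import requests
+
+webhook = "你的钉钉机器人 Webhook 地址"
+payload = {"msgtype": "text", "text": {"content": "热点:TrendRadar 测试消息"}}
+print(requests.post(webhook, json=payload, timeout=10).json())  # errcode 为 0 即成功
+```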
+ +
+ 👉 点击展开:Telegram Bot +
+ + **GitHub Secret 配置:** + - 名称:`TELEGRAM_BOT_TOKEN` - 你的 Telegram Bot Token + - 名称:`TELEGRAM_CHAT_ID` - 你的 Telegram Chat ID + +
+ + **机器人设置步骤:** + + 1. **创建机器人**: + - 在 Telegram 中搜索 `@BotFather`(大小写注意,有蓝色徽章勾勾,有类似 37849827 monthly users,这个才是官方的,有一些仿官方的账号注意辨别) + - 发送 `/newbot` 命令创建新机器人 + - 设置机器人名称(必须以"bot"结尾,很容易遇到重复名字,所以你要绞尽脑汁想不同的名字) + - 获取 Bot Token(格式如:`123456789:AAHfiqksKZ8WmR2zSjiQ7_v4TMAKdiHm9T0`) + + 2. **获取 Chat ID**: + + **方法一:通过官方 API 获取** + - 先向你的机器人发送一条消息 + - 访问:`https://api.telegram.org/bot<你的Bot Token>/getUpdates` + - 在返回的 JSON 中找到 `"chat":{"id":数字}` 中的数字 + + **方法二:使用第三方工具** + - 搜索 `@userinfobot` 并发送 `/start` + - 获取你的用户 ID 作为 Chat ID + + 3. **配置到 GitHub**: + - `TELEGRAM_BOT_TOKEN`:填入第 1 步获得的 Bot Token + - `TELEGRAM_CHAT_ID`:填入第 2 步获得的 Chat ID +
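+
+> 两个 Secret 都拿到后,可以先用下面的小脚本验证 Bot Token 和 Chat ID 是否可用(示意,非项目源码;`sendMessage` 是 Telegram Bot API 的标准接口):
+
+```python
+import requests
+
+bot_token = "123456789:AAHfiqksKZ8WmR2zSjiQ7_v4TMAKdiHm9T0"  # 换成你的 Bot Token
+chat_id = "你的 Chat ID"
+url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+resp = requests.post(url, json={"chat_id": chat_id, "text": "TrendRadar 测试消息"}, timeout=10)
+print(resp.json())  # "ok": true 即发送成功
+```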
+ +
+ 👉 点击展开:邮件推送(支持所有主流邮箱) +
+ + - 注意事项:为防止邮件群发功能被**滥用**,当前的群发是所有收件人都能看到彼此的邮箱地址。 + - 如果你没有过配置下面这种邮箱发送的经历,不建议尝试 + +
+ + **GitHub Secret 配置:** + - 名称:`EMAIL_FROM` - 发件人邮箱地址 + - 名称:`EMAIL_PASSWORD` - 邮箱密码或授权码 + - 名称:`EMAIL_TO` - 收件人邮箱地址(多个收件人用英文逗号分隔)也可以和 EMAIL_FROM 一样,自己发送给自己 + - 名称:`EMAIL_SMTP_SERVER` - SMTP服务器地址(可选,留空则自动识别) + - 名称:`EMAIL_SMTP_PORT` - SMTP端口(可选,留空则自动识别) + +
+ + **支持的邮箱服务商**(自动识别 SMTP 配置): + + | 邮箱服务商 | 域名 | SMTP 服务器 | 端口 | 加密方式 | + |-----------|------|------------|------|---------| + | **Gmail** | gmail.com | smtp.gmail.com | 587 | TLS | + | **QQ邮箱** | qq.com | smtp.qq.com | 465 | SSL | + | **Outlook** | outlook.com | smtp-mail.outlook.com | 587 | TLS | + | **Hotmail** | hotmail.com | smtp-mail.outlook.com | 587 | TLS | + | **Live** | live.com | smtp-mail.outlook.com | 587 | TLS | + | **163邮箱** | 163.com | smtp.163.com | 465 | SSL | + | **126邮箱** | 126.com | smtp.126.com | 465 | SSL | + | **新浪邮箱** | sina.com | smtp.sina.com | 465 | SSL | + | **搜狐邮箱** | sohu.com | smtp.sohu.com | 465 | SSL | + + > **自动识别**:使用以上邮箱时,无需手动配置 `EMAIL_SMTP_SERVER` 和 `EMAIL_SMTP_PORT`,系统会自动识别。 + > + > **反馈说明**: + > - 如果您使用**其他邮箱**测试成功,欢迎开 [Issues](https://github.com/sansan0/TrendRadar/issues) 告知,我会添加到支持列表 + > - 如果上述邮箱配置有误或无法使用,也请开 [Issues](https://github.com/sansan0/TrendRadar/issues) 反馈,帮助改进项目 + + **常见邮箱设置:** + + #### QQ邮箱: + 1. 登录 QQ邮箱网页版 → 设置 → 账户 + 2. 开启 POP3/SMTP 服务 + 3. 生成授权码(16位字母) + 4. `EMAIL_PASSWORD` 填写授权码,而非 QQ 密码 + + #### Gmail: + 1. 开启两步验证 + 2. 生成应用专用密码 + 3. `EMAIL_PASSWORD` 填写应用专用密码 + + #### 163/126邮箱: + 1. 登录网页版 → 设置 → POP3/SMTP/IMAP + 2. 开启 SMTP 服务 + 3. 设置客户端授权码 + 4. `EMAIL_PASSWORD` 填写授权码 +
+ + **高级配置**: + 如果自动识别失败,可手动配置 SMTP: + - `EMAIL_SMTP_SERVER`:如 smtp.gmail.com + - `EMAIL_SMTP_PORT`:如 587(TLS)或 465(SSL) +
+ + **如果有多个收件人(注意是英文逗号分隔)**: + - EMAIL_TO="user1@example.com,user2@example.com,user3@example.com" + +
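+
+> 上表的端口规则可以概括为:465 端口直接走 SSL,587 端口先明文连接再 STARTTLS 升级加密。下面是一段发送示意(非项目源码,字段名沿用上面的 Secret 名称):
+
+```python
+import smtplib, ssl
+from email.mime.text import MIMEText
+
+def send_report(html: str, cfg: dict) -> None:
+    """按 465=SSL / 587=STARTTLS 的规则发送 HTML 报告(示意)。"""
+    msg = MIMEText(html, "html", "utf-8")
+    msg["Subject"] = "TrendRadar 热点报告"
+    msg["From"] = cfg["EMAIL_FROM"]
+    recipients = [addr.strip() for addr in cfg["EMAIL_TO"].split(",")]
+    msg["To"] = ", ".join(recipients)
+    context = ssl.create_default_context()
+    port = int(cfg.get("EMAIL_SMTP_PORT", 465))
+    if port == 465:
+        server = smtplib.SMTP_SSL(cfg["EMAIL_SMTP_SERVER"], port, context=context)
+    else:
+        server = smtplib.SMTP(cfg["EMAIL_SMTP_SERVER"], port)
+        server.starttls(context=context)
+    with server:
+        server.login(cfg["EMAIL_FROM"], cfg["EMAIL_PASSWORD"])
+        server.send_message(msg, to_addrs=recipients)
+```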
+ +
+ 👉 点击展开:ntfy 推送(开源免费,支持自托管) +
+ + **两种使用方式:** + + ### 方式一:免费使用(推荐新手) 🆓 + + **特点**: + - ✅ 无需注册账号,立即使用 + - ✅ 每天 250 条消息(足够 90% 用户) + - ✅ Topic 名称即"密码"(需选择不易猜测的名称) + - ⚠️ 消息未加密,不适合敏感信息, 但适合我们这个项目的不敏感信息 + + **快速开始:** + + 1. **下载 ntfy 应用**: + - Android:[Google Play](https://play.google.com/store/apps/details?id=io.heckel.ntfy) / [F-Droid](https://f-droid.org/en/packages/io.heckel.ntfy/) + - iOS:[App Store](https://apps.apple.com/us/app/ntfy/id1625396347) + - 桌面:访问 [ntfy.sh](https://ntfy.sh) + + 2. **订阅主题**(选择一个难猜的名称): + ``` + 建议格式:trendradar-{你的名字缩写}-{随机数字} + + 不能使用中文 + + ✅ 好例子:trendradar-zs-8492 + ❌ 坏例子:news、alerts(太容易被猜到) + ``` + + 3. **配置 GitHub Secret**: + - `NTFY_TOPIC`:填写你刚才订阅的主题名称 + - `NTFY_SERVER_URL`:留空(默认使用 ntfy.sh) + - `NTFY_TOKEN`:留空 + + 4. **测试**: + ```bash + curl -d "测试消息" ntfy.sh/你的主题名称 + ``` + + --- + + ### 方式二:自托管(完全隐私控制) 🔒 + + **适合人群**:有服务器、追求完全隐私、技术能力强 + + **优势**: + - ✅ 完全开源(Apache 2.0 + GPLv2) + - ✅ 数据完全自主控制 + - ✅ 无任何限制 + - ✅ 零费用 + + **Docker 一键部署**: + ```bash + docker run -d \ + --name ntfy \ + -p 80:80 \ + -v /var/cache/ntfy:/var/cache/ntfy \ + binwiederhier/ntfy \ + serve --cache-file /var/cache/ntfy/cache.db + ``` + + **配置 TrendRadar**: + ```yaml + NTFY_SERVER_URL: https://ntfy.yourdomain.com + NTFY_TOPIC: trendradar-alerts # 自托管可用简单名称 + NTFY_TOKEN: tk_your_token # 可选:启用访问控制 + ``` + + **在应用中订阅**: + - 点击"Use another server" + - 输入你的服务器地址 + - 输入主题名称 + - (可选)输入登录凭据 + + --- + + **常见问题:** + +
+ Q1: 免费版够用吗? + + 每天 250 条消息对大多数用户足够。按 30 分钟抓取一次计算,每天约 48 次推送,完全够用。 +
+ +
Q2: Topic 名称真的安全吗?
+
+  如果你选择随机的、足够长的名称(如 `trendradar-zs-8492-news`),暴力破解几乎不可能:
+  - ntfy 有严格的速率限制(1 秒 1 次请求)
+  - 可选字符共 64 个(A-Z, a-z, 0-9, _, -)
+  - 10 位随机字符串有 64^10(约 1.15×10^18)种组合,在该限速下穷举需要数百亿年
+
+ + --- + + **推荐选择:** + + | 用户类型 | 推荐方案 | 理由 | + |---------|---------|------| + | 普通用户 | 方式一(免费) | 简单快速,够用 | + | 技术用户 | 方式二(自托管) | 完全控制,无限制 | + | 高频用户 | 方式三(付费) | 这个自己去官网看吧 | + + **相关链接:** + - [ntfy 官方文档](https://docs.ntfy.sh/) + - [自托管教程](https://docs.ntfy.sh/install/) + - [GitHub 仓库](https://github.com/binwiederhier/ntfy) + +
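+
+> 对应上面 curl 命令的 Python 写法如下(示意,非项目源码;正文按 UTF-8 编码发送,中文消息不会乱码):
+
+```python
+import requests
+
+def push_to_ntfy(server: str, topic: str, message: str, token: str | None = None) -> None:
+    """向 ntfy 主题发布一条消息(示意)。"""
+    headers = {"Title": "TrendRadar"}
+    if token:  # 仅自托管且开启访问控制时需要
+        headers["Authorization"] = f"Bearer {token}"
+    requests.post(f"{server.rstrip('/')}/{topic}",
+                  data=message.encode("utf-8"), headers=headers, timeout=10)
+
+push_to_ntfy("https://ntfy.sh", "你的主题名称", "测试消息")
+```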
+ + > **💡 新手快速上手建议**: + > + > 第一次部署时,建议先完成 **GitHub Secrets** 配置(选择一个推送平台即可),然后直接跳到【第4步】测试推送是否成功。 + > + > **暂时不要修改** `config/config.yaml` 和 `frequency_words.txt`,等推送测试成功后再根据需要调整这些配置。 + + +3. **配置说明:**: + + - **推送设置**:在 [config/config.yaml](config/config.yaml) 中配置推送模式和通知选项 + - **关键词设置**:在 [config/frequency_words.txt](config/frequency_words.txt) 中添加你关心的关键词 + - **推送频率调整**:在 [.github/workflows/crawler.yml](.github/workflows/crawler.yml) 请谨慎调整,别贪心 + + **注意**:建议只调整文档中明确说明的配置项,其他选项主要供作者开发时测试使用 + +4. **手动测试新闻推送**: + + 我这里是拿我的项目举例,你要去你**fork**的项目做测试 + + 1. **进入 Actions**:https://github.com/sansan0/TrendRadar/actions + 2. 找到 "Hot News Crawler" 的点进去,如果看不到该字样,那么参照[#109](https://github.com/sansan0/TrendRadar/issues/109)解决 + 3. 点击 "Run workflow" 按钮运行,等待 1 分钟左右数据到你手机上 + + +## 🐳 Docker 部署 + +#### 方式一:快速体验(一行命令) + +**Linux/macOS 系统:** +```bash +# 创建配置目录并下载配置文件 +mkdir -p config output +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/ +wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/ ``` -人工智能 -AI -GPT -大模型 -!AI绘画 +或者**手动创建**: +1. 在当前目录创建 `config` 文件夹 +2. 下载配置文件: + - 访问 https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml → 右键"另存为" → 保存到 `config\config.yaml` + - 访问 https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt → 右键"另存为" → 保存到 `config\frequency_words.txt` + +完成后的目录结构应该是: +``` +当前目录/ +└── config/ + ├── config.yaml + └── frequency_words.txt +``` + +```bash +docker run -d --name trend-radar \ + -v ./config:/app/config:ro \ + -v ./output:/app/output \ + -e FEISHU_WEBHOOK_URL="你的飞书webhook" \ + -e DINGTALK_WEBHOOK_URL="你的钉钉webhook" \ + -e WEWORK_WEBHOOK_URL="你的企业微信webhook" \ + -e TELEGRAM_BOT_TOKEN="你的telegram_bot_token" \ + -e TELEGRAM_CHAT_ID="你的telegram_chat_id" \ + -e EMAIL_FROM="你的发件邮箱" \ + -e EMAIL_PASSWORD="你的邮箱密码或授权码" \ + -e EMAIL_TO="收件人邮箱" \ + -e CRON_SCHEDULE="*/30 * * * *" \ + -e RUN_MODE="cron" \ + -e IMMEDIATE_RUN="true" \ + wantcat/trendradar:latest +``` + +#### 方式二:使用 docker-compose(推荐) -芯片 -半导体 +1. **创建项目目录和配置**: + ```bash + # 创建目录结构 + mkdir -p trendradar/{config,docker} + cd trendradar + + # 下载配置文件模板 + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/config.yaml -P config/ + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/config/frequency_words.txt -P config/ + + # 下载 docker-compose 配置 + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/.env + wget https://raw.githubusercontent.com/sansan0/TrendRadar/master/docker/docker-compose.yml + ``` + +完成后的目录结构应该是: +``` +当前目录/ +├── config/ +│ ├── config.yaml +│ └── frequency_words.txt +└── docker/ + ├── .env + └── docker-compose.yml ``` -上述配置表示: +2. 
**配置文件说明**: + - `config/config.yaml` - 应用主配置(报告模式、推送设置等) + - `config/frequency_words.txt` - 关键词配置(设置你关心的热点词汇) + - `.env` - 环境变量配置(webhook URLs 和定时任务) -- 监控包含"人工智能"、"AI"、"GPT"或"大模型"的标题,但若同时包含"AI 绘画"则排除 -- 监控包含"芯片"或"半导体"的标题 + **⚙️ 环境变量覆盖机制(v3.0.5+)** -## 📊 输出示例 + 如果你在 NAS 或其他 Docker 环境中遇到**修改 `config.yaml` 后配置不生效**的问题,可以通过环境变量直接覆盖配置: -程序会生成两种报告: + | 环境变量 | 对应配置 | 示例值 | 说明 | + |---------|---------|-------|------| + | `ENABLE_CRAWLER` | `crawler.enable_crawler` | `true` / `false` | 是否启用爬虫 | + | `ENABLE_NOTIFICATION` | `notification.enable_notification` | `true` / `false` | 是否启用通知 | + | `REPORT_MODE` | `report.mode` | `daily` / `incremental` / `current`| 报告模式 | + | `PUSH_WINDOW_ENABLED` | `notification.push_window.enabled` | `true` / `false` | 推送时间窗口开关 | + | `PUSH_WINDOW_START` | `notification.push_window.time_range.start` | `08:00` | 推送开始时间 | + | `PUSH_WINDOW_END` | `notification.push_window.time_range.end` | `22:00` | 推送结束时间 | + | `FEISHU_WEBHOOK_URL` | `notification.webhooks.feishu_url` | `https://...` | 飞书 Webhook | -1. **单次爬取报告**:每次爬取后生成的报告,包含当次爬取的热点数据 -2. **当日汇总报告**:汇总当天所有爬取的数据,去重并统计出现频率 + **配置优先级**:环境变量 > config.yaml -### HTML 报告示例: + **使用方法**: + - 修改 `.env` 文件,取消注释并填写需要的配置 + - 或在 NAS/群晖 Docker 管理界面的"环境变量"中直接添加 + - 重启容器后生效:`docker-compose restart` -| 排名 | 频率词 | 出现次数 | 占比 | 相关标题 | -| ---- | ----------- | -------- | ----- | ------------------------------------------------------------------------------------------------------------------- | -| 1 | 人工智能 AI | 12 | 24.5% | [百度热搜] 科技巨头发布新 AI 模型 【1】- 12 时 30 分 - 4 次
[今日头条] AI 技术最新突破 【2】- 13 时 15 分 - 2 次 | -| 2 | 芯片 半导体 | 8 | 16.3% | [华尔街见闻] 半导体行业最新动态 【3】- 12 时 45 分 - 3 次
[财联社] 芯片设计新技术 [7] - 14 时 00 分 - 1 次 | -### 飞书通知示例: +3. **启动服务**: + ```bash + # 拉取最新镜像并启动 + docker-compose pull + docker-compose up -d + ``` +4. **查看运行状态**: + ```bash + # 查看日志 + docker logs -f trend-radar + + # 查看容器状态 + docker ps | grep trend-radar + ``` + +#### 方式三:本地构建(开发者选项) + +如果需要自定义修改代码或构建自己的镜像: + +```bash +# 克隆项目 +git clone https://github.com/sansan0/TrendRadar.git +cd TrendRadar + +# 修改配置文件 +vim config/config.yaml +vim config/frequency_words.txt + +# 使用构建版本的 docker-compose +cd docker +cp docker-compose-build.yml docker-compose.yml + +# 构建并启动 +docker-compose build +docker-compose up -d ``` -【人工智能 AI】 : 12 条 -1. [百度热搜] 科技巨头发布新AI模型 【1】- 12时30分 - 4次 -2. [今日头条] AI技术最新突破 【2】- 13时15分 - 2次 -============================== +#### 镜像更新 -【芯片 半导体】 : 8 条 -1. [华尔街见闻] 半导体行业最新动态 【3】- 12时45分 - 3次 -2. [财联社] 芯片设计新技术 [7] - 14时00分 - 1次 +```bash +# 方式一:手动更新 +docker pull wantcat/trendradar:latest +docker-compose down +docker-compose up -d + +# 方式二:使用 docker-compose 更新 +docker-compose pull +docker-compose up -d ``` -### 飞书消息格式说明 +#### 服务管理命令 -| 格式元素 | 示例 | 含义 | 说明 | -| ------------- | ------------------------------ | ------------ | ----------------------------------- | -| 【关键词】 | 【人工智能 AI】 | 频率词组 | 表示本组匹配的关键词 | -| : N 条 | : 12 条 | 匹配数量 | 该关键词组匹配的标题总数 | -| [平台名] | [百度热搜] | 来源平台 | 标题所属的平台名称 | -| 【数字】 | 【1】 | 高排名标记 | 排名 ≤ 阈值(默认 5)的热搜,重要性高 | -| [数字] | [7] | 普通排名标记 | 排名>阈值的热搜,重要性一般 | -| - 时间 | - 12 时 30 分 | 首次发现时间 | 标题首次被发现的时间 | -| [时间 ~ 时间] | [12 时 30 分 ~ 14 时 00 分] | 时间范围 | 标题出现的时间范围(首次~最后) | -| - N 次 | - 4 次 | 出现次数 | 标题在监控期间出现的总次数 | -| ====== | ============================== | 分隔线 | 不同频率词组之间的分隔符 | +```bash +# 查看运行状态 +docker exec -it trend-radar python manage.py status -## 🤖 飞书机器人设置 +# 手动执行一次爬虫 +docker exec -it trend-radar python manage.py run -1. 电脑浏览器打开 https://botbuilder.feishu.cn/home/my-app +# 查看实时日志 +docker exec -it trend-radar python manage.py logs -2. 点击"新建机器人应用" +# 显示当前配置 +docker exec -it trend-radar python manage.py config -3. 进入创建的应用后,点击"流程涉及" > "创建流程" > "选择触发器" +# 显示输出文件 +docker exec -it trend-radar python manage.py files -4. 往下滑动,点击"Webhook 触发" +# 查看帮助信息 +docker exec -it trend-radar python manage.py help -5. 此时你会看到"Webhook 地址",把这个链接先复制到本地记事本暂存,继续接下来的操作 +# 重启容器 +docker restart trend-radar -6. "参数"里面放上下面的内容,然后点击"完成" +# 停止容器 +docker stop trend-radar +# 删除容器(保留数据) +docker rm trend-radar ``` + +#### 数据持久化 + +生成的报告和数据默认保存在 `./output` 目录下,即使容器重启或删除,数据也会保留。 + +#### 故障排查 + +```bash +# 检查容器状态 +docker inspect trend-radar + +# 查看容器日志 +docker logs --tail 100 trend-radar + +# 进入容器调试 +docker exec -it trend-radar /bin/bash + +# 验证配置文件 +docker exec -it trend-radar ls -la /app/config/ +``` + + +## 🤖 AI 智能分析部署 + +TrendRadar v3.0.0 新增了基于 **MCP (Model Context Protocol)** 的 AI 分析功能,让你可以通过自然语言与新闻数据对话,进行深度分析。使用 **AI 功能** 的最佳前提是已使用本项目至少运行一天(积累新闻数据) + +### 1. 快速部署 + +Cherry Studio 提供 GUI 配置界面, 5 分钟快速部署, 复杂的部分是一键安装的。 + +**图文部署教程**:现已更新到我的[公众号](#问题答疑与1元点赞),回复 "mcp" 即可 + +**详细部署教程**:[README-Cherry-Studio.md](README-Cherry-Studio.md) + +### 2. 学习与 AI 对话的姿势 + +**详细对话教程**:[README-MCP-FAQ.md](README-MCP-FAQ.md) + +**提问效果**: + +> 实际不建议一次性问多个问题。如果你选择的 ai 模型连下图的按顺序调用都无法做到,建议换一个。 + +mcp 使用效果图2 + + +## 🔌 MCP 客户端 + +TrendRadar MCP 服务支持标准的 Model Context Protocol (MCP) 协议,可以接入各种支持 MCP 的 AI 客户端进行智能分析。 + +### 支持的客户端 + +**注意事项**: +- 将 `/path/to/TrendRadar` 替换为你的项目实际路径 +- Windows 路径使用双反斜杠:`C:\\Users\\YourName\\TrendRadar` +- 保存后记得重启 + +
+👉 点击展开:Claude Desktop + +#### 配置文件方式 + +编辑 Claude Desktop 的 MCP 配置文件: + +**Windows**: +`%APPDATA%\Claude\claude_desktop_config.json` + +**Mac**: +`~/Library/Application Support/Claude/claude_desktop_config.json` + +**配置内容**: +```json { -"message_type ":"text", -"content":{ - "total_titles": "{{内容}}", - "timestamp": "{{内容}}", - "report_type": "{{内容}}", - "text": "{{内容}}" -} + "mcpServers": { + "trendradar": { + "command": "uv", + "args": [ + "--directory", + "/path/to/TrendRadar", + "run", + "python", + "-m", + "mcp_server.server" + ], + "env": {}, + "disabled": false, + "alwaysAllow": [] + } + } } ``` -7. 点击"选择操作" > "发送飞书消息" ,勾选 "群消息", 然后点击下面的输入框,点击"我管理的群组"(如果没有群组,你可以在飞书 app 上创建群组) +
-8. 消息标题填写"我是热搜" +
+👉 点击展开:Cursor -9. 最关键的部分来了,点击 + 按钮,选择"Webhook 触发",然后按照下面的图片摆放 +#### 方式一:HTTP 模式 -![alt text](image.png) +1. **启动 HTTP 服务**: + ```bash + # Windows + start-http.bat + + # Mac/Linux + ./start-http.sh + ``` -```bash -# Linux/macOS -export FEISHU_WEBHOOK_URL="你的Webhook URL" +2. **配置 Cursor**: + + **项目级配置**(推荐): + 在项目根目录创建 `.cursor/mcp.json`: + ```json + { + "mcpServers": { + "trendradar": { + "url": "http://localhost:3333/mcp", + "description": "TrendRadar 新闻热点聚合分析" + } + } + } + ``` -# Windows -set FEISHU_WEBHOOK_URL="你的Webhook URL" -``` + **全局配置**: + 在用户目录创建 `~/.cursor/mcp.json`(同样内容) -## 📡 数据来源说明 +3. **使用步骤**: + - 保存配置文件后重启 Cursor + - 在聊天界面的 "Available Tools" 中查看已连接的工具 + - 开始使用:`搜索今天的"AI"相关新闻` -本项目使用的数据来自 [newsnow](https://github.com/ourongxing/newsnow) 的 API 服务。每个平台的数据通过以下格式的 API 请求获取: +#### 方式二:STDIO 模式(推荐) +创建 `.cursor/mcp.json`: +```json +{ + "mcpServers": { + "trendradar": { + "command": "uv", + "args": [ + "--directory", + "/path/to/TrendRadar", + "run", + "python", + "-m", + "mcp_server.server" + ] + } + } +} ``` -https://newsnow.busiyi.world/api/s?id={平台ID}&latest + +
+ +
+👉 点击展开:VSCode (Cline/Continue) + +#### Cline 配置 + +在 Cline 的 MCP 设置中添加: + +**HTTP 模式**: +```json +{ + "trendradar": { + "url": "http://localhost:3333/mcp", + "type": "streamableHttp", + "autoApprove": [], + "disabled": false + } +} ``` -其中`{平台ID}`为各平台的标识符,如`baidu`、`toutiao`等。 +**STDIO 模式**(推荐): +```json +{ + "trendradar": { + "command": "uv", + "args": [ + "--directory", + "/path/to/TrendRadar", + "run", + "python", + "-m", + "mcp_server.server" + ], + "type": "stdio", + "disabled": false + } +} +``` -API 返回的数据格式为 JSON,包含平台的热搜榜单: +#### Continue 配置 +编辑 `~/.continue/config.json`: ```json { - "status": "success", - "items": [ - { - "title": "热搜标题1", - "url": "https://example.com/news1" - }, - { - "title": "热搜标题2", - "url": "https://example.com/news2" - } - // ...更多条目 - ] + "experimental": { + "modelContextProtocolServers": [ + { + "transport": { + "type": "stdio", + "command": "uv", + "args": [ + "--directory", + "/path/to/TrendRadar", + "run", + "python", + "-m", + "mcp_server.server" + ] + } + } + ] + } } ``` -### 自建 API 服务 +**使用示例**: +``` +分析最近7天"特斯拉"的热度变化趋势 +生成今天的热点摘要报告 +搜索"比特币"相关新闻并分析情感倾向 +``` + +
+ +
+👉 点击展开:Claude Code CLI + +#### HTTP 模式配置 + +```bash +# 1. 启动 HTTP 服务 +# Windows: start-http.bat +# Mac/Linux: ./start-http.sh + +# 2. 添加 MCP 服务器 +claude mcp add --transport http trendradar http://localhost:3333/mcp + +# 3. 验证连接(确保服务已启动) +claude mcp list +``` + +#### 使用示例 + +```bash +# 查询新闻 +claude "搜索今天知乎的热点新闻,前10条" + +# 趋势分析 +claude "分析'人工智能'这个话题最近一周的热度趋势" + +# 数据对比 +claude "对比知乎和微博平台对'比特币'的关注度" +``` -如果你想自己部署 API 服务而不依赖第三方: +
-1. 克隆 [newsnow](https://github.com/ourongxing/newsnow) 仓库 +
+👉 点击展开:MCP Inspector(调试工具) +
+MCP Inspector 是官方调试工具,用于测试 MCP 连接: + +#### 使用步骤 + +1. **启动 TrendRadar HTTP 服务**: ```bash - git clone https://github.com/ourongxing/newsnow.git - cd newsnow + # Windows + start-http.bat + + # Mac/Linux + ./start-http.sh ``` -2. 按照该仓库的 README 说明部署 API 服务 - -3. 修改 TrendRadar 中的 API URL: +2. **启动 MCP Inspector**: + ```bash + npx @modelcontextprotocol/inspector + ``` - - 在`DataFetcher.fetch_data`方法中,将 - ```python - url = f"https://newsnow.busiyi.world/api/s?id={id_value}&latest" - ``` - 更改为你自己的 API 地址 - ```python - url = f"https://你的域名/api/s?id={id_value}&latest" - ``` +3. **在浏览器中连接**: + - 访问:`http://localhost:3333/mcp` + - 测试 "Ping Server" 功能验证连接 + - 检查 "List Tools" 是否返回 13 个工具: + - 基础查询:get_latest_news, get_news_by_date, get_trending_topics + - 智能检索:search_news, search_related_news_history + - 高级分析:analyze_topic_trend, analyze_data_insights, analyze_sentiment, find_similar_news, generate_summary_report + - 系统管理:get_current_config, get_system_status, trigger_crawl -4. 如需添加新的平台支持,请参考 newsnow 项目中的爬虫实现并添加到你的 API 服务中 +
-## 📁 代码结构 +
+👉 点击展开:其他支持 MCP 的客户端 +
-代码采用了面向对象设计模式,主要包含以下几个类: +任何支持 Model Context Protocol 的客户端都可以连接 TrendRadar: -- `TimeHelper`: 时间相关的辅助功能 -- `FileHelper`: 文件操作相关的辅助功能 -- `DataFetcher`: 负责从 API 获取数据 -- `DataProcessor`: 负责处理和转换数据 -- `StatisticsCalculator`: 负责统计计算 -- `ReportGenerator`: 负责生成 HTML 报告和飞书消息 -- `NewsAnalyzer`: 主类,协调整个流程的执行 +#### HTTP 模式 -## 🔧 高级用法 +**服务地址**:`http://localhost:3333/mcp` -### 自定义监控平台 +**基本配置模板**: +```json +{ + "name": "trendradar", + "url": "http://localhost:3333/mcp", + "type": "http", + "description": "新闻热点聚合分析" +} +``` -可以在`NewsAnalyzer.run`方法中修改`ids`列表来添加或移除监控的平台: +#### STDIO 模式(推荐) -```python -ids = [ - ("toutiao", '今日头条'), - ("baidu", '百度热搜'), - # 添加或移除平台 -] +**基本配置模板**: +```json +{ + "name": "trendradar", + "command": "uv", + "args": [ + "--directory", + "/path/to/TrendRadar", + "run", + "python", + "-m", + "mcp_server.server" + ], + "type": "stdio" +} ``` -### 飞书通知选项 +**注意事项**: +- 替换 `/path/to/TrendRadar` 为实际项目路径 +- Windows 路径使用反斜杠转义:`C:\\Users\\...` +- 确保已完成项目依赖安装(运行过 setup 脚本) + +
-你可以通过以下方式控制飞书通知行为: -1. `FEISHU_WEBHOOK_URL`: 设置为有效的 webhook URL 以启用飞书通知 -2. `CONTINUE_WITHOUT_FEISHU`: 控制在没有有效 webhook URL 时的行为 - - `True`: 执行爬虫但不发送通知(默认值) - - `False`: 完全不执行爬虫 -3. `FEISHU_REPORT_TYPE`: 控制发送哪种类型的报告 +## ☕问题答疑与1元点赞 -### 扩展功能 +> 心意到就行,收到的**点赞**用于提高开发者开源的积极性。**点赞**已收录于**致谢名单** -如果你想扩展功能,可以: +- **GitHub Issues**:适合针对性强的解答。提问时请提供完整信息(截图、错误日志、系统环境等)。 +- **公众号交流**:适合快速咨询。建议优先在相关文章下的公共留言区交流,如私信,请文明礼貌用语😉 -1. 继承已有类并重写特定方法 -2. 添加新的统计方法到`StatisticsCalculator`类 -3. 添加新的报告格式到`ReportGenerator`类 -4. 修改`NewsAnalyzer`类以支持新的工作流程 -## ❓ 常见问题 +|公众号关注 |微信点赞 | 支付宝点赞 | +|:---:|:---:|:---:| +| | | | -1. **GitHub Actions 不执行怎么办?** +### 常见问题 - - 检查`.github/workflows/crawler.yml`文件是否存在 - - 在 Actions 页面手动触发一次 workflow - - 确认你有足够的 GitHub Actions 免费分钟数 +
+👉 点击展开:Q1: HTTP 服务无法启动? +
-2. **本地运行失败怎么办?** +**检查步骤**: +1. 确认端口 3333 未被占用: + ```bash + # Windows + netstat -ano | findstr :3333 + + # Mac/Linux + lsof -i :3333 + ``` - - 检查网络连接 - - 尝试修改`CONFIG`中的`USE_PROXY`和`DEFAULT_PROXY`设置 - - 检查依赖是否正确安装 +2. 检查项目依赖是否安装: + ```bash + # 重新运行安装脚本 + # Windows: setup-windows.bat 或者 setup-windows-en.bat + # Mac/Linux: ./setup-mac.sh + ``` -3. **没有收到飞书通知怎么办?** +3. 查看详细错误日志: + ```bash + uv run python -m mcp_server.server --transport http --port 3333 + ``` +4. 尝试自定义端口: + ```bash + uv run python -m mcp_server.server --transport http --port 33333 + ``` - - 检查`FEISHU_WEBHOOK_URL`是否正确设置(环境变量或 CONFIG 中) - - 检查飞书机器人是否仍在群内且启用 - - 查看程序输出中是否有发送失败的错误信息 +
-4. **想要停止爬虫行为但保留仓库怎么办?** +
+👉 点击展开:Q2: 客户端无法连接到 MCP 服务? +
- - 将`CONTINUE_WITHOUT_FEISHU`设置为`False`并删除`FEISHU_WEBHOOK_URL`secret - - 或修改 GitHub Actions workflow 文件禁用自动执行 +**解决方案**: -5. **如何处理 API 限制或访问问题?** - - 适当增加`REQUEST_INTERVAL`值,避免频繁请求 - - 考虑使用上述"自建 API 服务"部分的说明部署自己的服务 - - 本地运行时可尝试启用或更换代理 +1. **STDIO 模式**: + - 确认 UV 路径正确(运行 `which uv` 或 `where uv`) + - 确认项目路径正确且无中文字符 + - 查看客户端错误日志 -## 💡 应用场景 +2. **HTTP 模式**: + - 确认服务已启动(访问 `http://localhost:3333/mcp`) + - 检查防火墙设置 + - 尝试使用 127.0.0.1 替代 localhost -- **媒体从业者**: 实时追踪热点,把握报道方向 -- **市场营销**: 及时发现与品牌相关的热点话题 -- **内容创作**: 获取热门话题灵感,提高内容曝光 -- **投资分析**: 追踪特定行业或公司的热点消息 -- **个人使用**: 不错过任何你关心领域的热点信息 +3. **通用检查**: + - 重启客户端应用 + - 查看 MCP 服务日志 + - 使用 MCP Inspector 测试连接 + +
+ +
+👉 点击展开:Q3: 工具调用失败或返回错误? +
+ +**可能原因**: + +1. **数据不存在**: + - 确认已运行过爬虫(有 output 目录数据) + - 检查查询日期范围是否有数据 + - 查看 output 目录的可用日期 + +2. **参数错误**: + - 检查日期格式:`YYYY-MM-DD` + - 确认平台 ID 正确:`zhihu`, `weibo` 等 + - 查看工具文档中的参数说明 + +3. **配置问题**: + - 确认 `config/config.yaml` 存在 + - 确认 `config/frequency_words.txt` 存在 + - 检查配置文件格式是否正确 + +
+ +### 项目相关 + +> **4 篇文章**: + +- [可在该文章下方留言,方便项目作者用手机答疑](https://mp.weixin.qq.com/s/KYEPfTPVzZNWFclZh4am_g) +- [2个月破 1000 star,我的GitHub项目推广实战经验](https://mp.weixin.qq.com/s/jzn0vLiQFX408opcfpPPxQ) +- [github fork 运行本项目的注意事项 ](https://mp.weixin.qq.com/s/C8evK-U7onG1sTTdwdW2zg) +- [基于本项目,如何开展公众号或者新闻资讯类文章写作](https://mp.weixin.qq.com/s/8ghyfDAtQZjLrnWTQabYOQ) + +>**AI 开发**: +- 如果你有小众需求,完全可以基于我的项目自行开发,零编程基础的也可以试试 +- 我所有的开源项目或多或少都使用了自己写的**AI辅助软件**来提升开发效率,这款工具已开源 +- **核心功能**:迅速筛选项目代码喂给AI,你只需要补充个人需求即可 +- **项目地址**:https://github.com/sansan0/ai-code-context-helper + +### 其余项目 + +> 📍 毛主席足迹地图 - 交互式动态展示1893-1976年完整轨迹。欢迎诸位同志贡献数据 + +- https://github.com/sansan0/mao-map + +> 哔哩哔哩(bilibili)评论区数据可视化分析软件 + +- https://github.com/sansan0/bilibili-comment-analyzer + + +
+👉 点击展开:微信推送通知方案 +
+ +> 由于该方案是基于企业微信的插件机制,推送样式也十分不同,所以相关实现我暂时不准备纳入当前项目 + +- fork 这位兄台的项目 https://github.com/jayzqj/TrendRadar +- 完成上方的企业微信推送设置 +- 按照下面图片操作 +- 配置好后,手机上的企业微信 app 删除掉也没事 + + + +
+ +### 本项目流程图 + +```mermaid +flowchart TD + A[👤 用户开始] --> B{🚀 选择部署方式} + + B -->|云端部署| C1[🍴 Fork 项目到 GitHub] + B -->|本地部署| C2[🐳 Docker 部署] + + C1 --> D[⚙️ 配置通知渠道
可同时配置多个] + C2 --> D + + D --> E[选择通知方式:
📱企业微信 💬飞书 🔔钉钉
📟Telegram 📧邮件] + + E --> F[🔑 填写通知参数
GitHub Secrets 或环境变量] + + F --> G[📝 配置关键词
config/frequency_words.txt
普通词/必须词+/过滤词!] + + G --> H[🎯 选择运行模式
config/config.yaml] + + H --> H1[📋 daily - 当日汇总
定时推送所有匹配新闻] + H --> H2[📰 current - 当前榜单
定时推送最新榜单] + H --> H3[📈 incremental - 增量监控
仅推送新增内容] + + H1 --> I[可选:推送时间窗口控制
⏰ 限制推送时间范围] + H2 --> I + H3 --> I + + I --> J[✅ 配置完成] + + J --> K[🤖 系统自动运行] + + K --> L[🕷️ 爬取11+平台热点] + L --> M[🔍 关键词筛选] + M --> N[⚖️ 权重算法排序
排名60% + 频次30% + 热度10%] + N --> O[📊 生成报告
HTML网页 + 推送消息] + O --> P[📱 多渠道推送通知] + + P --> Q[🎉 持续接收精准推送
告别信息过载] + + style A fill:#e3f2fd + style B fill:#f3e5f5 + style D fill:#fff3e0 + style F fill:#fff9c4 + style G fill:#e8f5e9 + style H fill:#e0f2f1 + style I fill:#fce4ec + style O fill:#e1bee7 + style Q fill:#c8e6c9 +``` -## 🙏 致谢 +[![Star History Chart](https://api.star-history.com/svg?repos=sansan0/TrendRadar&type=Date)](https://www.star-history.com/#sansan0/TrendRadar&Date) -本项目使用了 [newsnow](https://github.com/ourongxing/newsnow) 提供的 API 服务,感谢其提供的数据支持。 ## 📄 许可证 -MIT License \ No newline at end of file +GPL-3.0 License + +--- + +
+ +[🔝 回到顶部](#trendradar) + +
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000..7d66038683dfe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests>=2.32.5,<3.0.0 +pytz>=2025.2,<2026.0 +PyYAML>=6.0.3,<7.0.0 +fastmcp>=2.12.0,<2.14.0 +websockets>=13.0,<14.0 diff --git a/setup-mac.sh b/setup-mac.sh new file mode 100644 index 0000000000000..f6adec6ceac9e --- /dev/null +++ b/setup-mac.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +echo -e "${BOLD}╔════════════════════════════════════════╗${NC}" +echo -e "${BOLD}║ TrendRadar MCP 一键部署 (Mac) ║${NC}" +echo -e "${BOLD}╚════════════════════════════════════════╝${NC}" +echo "" + +# 获取项目根目录 +PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)" + +echo -e "📍 项目目录: ${BLUE}${PROJECT_ROOT}${NC}" +echo "" + +# 检查 UV 是否已安装 +if ! command -v uv &> /dev/null; then + echo -e "${YELLOW}[1/3] 🔧 UV 未安装,正在自动安装...${NC}" + echo "提示: UV 是一个快速的 Python 包管理器,只需安装一次" + echo "" + curl -LsSf https://astral.sh/uv/install.sh | sh + + echo "" + echo "正在刷新 PATH 环境变量..." + echo "" + + # 添加 UV 到 PATH + export PATH="$HOME/.cargo/bin:$PATH" + + # 验证 UV 是否真正可用 + if ! command -v uv &> /dev/null; then + echo -e "${RED}❌ [错误] UV 安装失败${NC}" + echo "" + echo "可能的原因:" + echo " 1. 网络连接问题,无法下载安装脚本" + echo " 2. 安装路径权限不足" + echo " 3. 安装脚本执行异常" + echo "" + echo "解决方案:" + echo " 1. 检查网络连接是否正常" + echo " 2. 手动安装: https://docs.astral.sh/uv/getting-started/installation/" + echo " 3. 或运行: curl -LsSf https://astral.sh/uv/install.sh | sh" + exit 1 + fi + + echo -e "${GREEN}✅ [成功] UV 已安装${NC}" + echo -e "${YELLOW}⚠️ 请重新运行此脚本以继续${NC}" + exit 0 +else + echo -e "${GREEN}[1/3] ✅ UV 已安装${NC}" + uv --version +fi + +echo "" +echo "[2/3] 📦 安装项目依赖..." +echo "提示: 这可能需要 1-2 分钟,请耐心等待" +echo "" + +# 创建虚拟环境并安装依赖 +uv sync + +if [ $? -ne 0 ]; then + echo "" + echo -e "${RED}❌ [错误] 依赖安装失败${NC}" + echo "请检查网络连接后重试" + exit 1 +fi + +echo "" +echo -e "${GREEN}[3/3] ✅ 检查配置文件...${NC}" +echo "" + +# 检查配置文件 +if [ ! -f "config/config.yaml" ]; then + echo -e "${YELLOW}⚠️ [警告] 未找到配置文件: config/config.yaml${NC}" + echo "请确保配置文件存在" + echo "" +fi + +# 添加执行权限 +chmod +x start-http.sh 2>/dev/null || true + +# 获取 UV 路径 +UV_PATH=$(which uv) + +echo "" +echo -e "${BOLD}╔════════════════════════════════════════╗${NC}" +echo -e "${BOLD}║ 部署完成! ║${NC}" +echo -e "${BOLD}╚════════════════════════════════════════╝${NC}" +echo "" +echo "📋 下一步操作:" +echo "" +echo " 1️⃣ 打开 Cherry Studio" +echo " 2️⃣ 进入 设置 > MCP Servers > 添加服务器" +echo " 3️⃣ 填入以下配置:" +echo "" +echo " 名称: TrendRadar" +echo " 描述: 新闻热点聚合工具" +echo " 类型: STDIO" +echo -e " 命令: ${BLUE}${UV_PATH}${NC}" +echo " 参数(每个占一行):" +echo -e " ${BLUE}--directory${NC}" +echo -e " ${BLUE}${PROJECT_ROOT}${NC}" +echo -e " ${BLUE}run${NC}" +echo -e " ${BLUE}python${NC}" +echo -e " ${BLUE}-m${NC}" +echo -e " ${BLUE}mcp_server.server${NC}" +echo "" +echo " 4️⃣ 保存并启用 MCP 开关" +echo "" +echo "📖 详细教程请查看: README-Cherry-Studio.md,本窗口别关,待会儿用于填入参数" +echo "" diff --git a/setup-windows-en.bat b/setup-windows-en.bat new file mode 100644 index 0000000000000..9edcdc6161054 --- /dev/null +++ b/setup-windows-en.bat @@ -0,0 +1,117 @@ +@echo off +:: 使用系统默认编码而不是强制 UTF-8 +setlocal enabledelayedexpansion + +echo ========================================== +echo TrendRadar MCP Setup (Windows) +echo ========================================== +echo: + +REM Get current directory +set "PROJECT_ROOT=%CD%" +echo Project Directory: %PROJECT_ROOT% +echo: + +REM Check Python +echo Checking Python... 
+python --version >nul 2>&1 +if %errorlevel% neq 0 ( + echo [ERROR] Python not detected. Please install Python 3.10+ + echo Download: https://www.python.org/downloads/ + pause + exit /b 1 +) +echo Python OK +echo: + +REM Check UV +echo Checking UV... +where uv >nul 2>&1 +if %errorlevel% neq 0 ( + echo [1/3] UV not installed, installing automatically... + echo: + + REM Use Bypass execution policy + powershell -ExecutionPolicy Bypass -Command "irm https://astral.sh/uv/install.ps1 | iex" + + if %errorlevel% neq 0 ( + echo [ERROR] UV installation failed + echo: + echo Please install UV manually: + echo Method 1: Visit https://docs.astral.sh/uv/getting-started/installation/ + echo Method 2: Use pip install uv + pause + exit /b 1 + ) + + echo: + echo [SUCCESS] UV installed successfully + echo [IMPORTANT] Please follow these steps: + echo 1. Close this window + echo 2. Reopen Command Prompt or PowerShell + echo 3. Navigate to project directory: cd "%PROJECT_ROOT%" + echo 4. Run this script again: setup-windows.bat + echo: + pause + exit /b 0 +) else ( + echo [1/3] UV already installed + uv --version +) +echo: + +echo [2/3] Installing project dependencies... +echo: + +REM Install dependencies with UV +uv sync +if %errorlevel% neq 0 ( + echo [ERROR] Dependency installation failed + echo: + echo Possible causes: + echo - Missing pyproject.toml file + echo - Network connection issues + echo - Incompatible Python version + pause + exit /b 1 +) +echo: + +echo [3/3] Checking configuration file... +if not exist "config\config.yaml" ( + echo [WARNING] Configuration file not found: config\config.yaml + if exist "config\config.example.yaml" ( + echo Tip: Example config found, please copy and modify: + echo copy config\config.example.yaml config\config.yaml + ) + echo: +) + +REM Get UV path +for /f "tokens=*" %%i in ('where uv 2^>nul') do set "UV_PATH=%%i" +if not defined UV_PATH ( + echo [WARNING] Unable to get UV path, please find it manually + set "UV_PATH=uv" +) + +echo: +echo ========================================== +echo Setup Complete! +echo ========================================== +echo: +echo MCP Server Configuration: +echo: +echo Command: %UV_PATH% +echo Working Directory: %PROJECT_ROOT% +echo: +echo Arguments (one per line): +echo --directory +echo %PROJECT_ROOT% +echo run +echo python +echo -m +echo mcp_server.server +echo: +echo Documentation: README-Cherry-Studio.md +echo: +pause \ No newline at end of file diff --git a/setup-windows.bat b/setup-windows.bat new file mode 100644 index 0000000000000..2e9ce0851f8ca --- /dev/null +++ b/setup-windows.bat @@ -0,0 +1,114 @@ +@echo off +chcp 65001 >nul +setlocal enabledelayedexpansion +echo ╔════════════════════════════════════════╗ +echo ║ TrendRadar MCP 一键部署 (Windows) ║ +echo ╚════════════════════════════════════════╝ +echo. + +REM 获取当前目录 +set "PROJECT_ROOT=%CD%" +echo 📍 项目目录: %PROJECT_ROOT% +echo. + +REM 检查 Python +python --version >nul 2>&1 +if %errorlevel% neq 0 ( + echo ❌ 未检测到 Python,请先安装 Python 3.10+ + echo 下载地址: https://www.python.org/downloads/ + pause + exit /b 1 +) + +REM 检查 UV +where uv >nul 2>&1 +if %errorlevel% neq 0 ( + echo [1/3] 🔧 UV 未安装,正在自动安装... + echo. + + REM 使用 Bypass 执行策略 + powershell -ExecutionPolicy Bypass -Command "irm https://astral.sh/uv/install.ps1 | iex" + + if %errorlevel% neq 0 ( + echo ❌ UV 安装失败 + echo. + echo 请手动安装 UV: + echo 方法1: 访问 https://docs.astral.sh/uv/getting-started/installation/ + echo 方法2: 使用 pip install uv + pause + exit /b 1 + ) + + echo. + echo ✅ UV 安装完成 + echo ⚠️ 重要: 请按照以下步骤操作: + echo 1. 
关闭此窗口 + echo 2. 重新打开命令提示符(或 PowerShell) + echo 3. 回到项目目录: cd "%PROJECT_ROOT%" + echo 4. 重新运行此脚本: setup-windows.bat + echo. + pause + exit /b 0 +) else ( + echo [1/3] ✅ UV 已安装 + uv --version +) + +echo. +echo [2/3] 📦 安装项目依赖... +echo. + +REM 使用 UV 安装依赖 +uv sync +if %errorlevel% neq 0 ( + echo ❌ 依赖安装失败 + echo. + echo 可能的原因: + echo - 缺少 pyproject.toml 文件 + echo - 网络连接问题 + echo - Python 版本不兼容 + pause + exit /b 1 +) + +echo. +echo [3/3] ✅ 检查配置文件... + +if not exist "config\config.yaml" ( + echo ⚠️ 配置文件不存在: config\config.yaml + if exist "config\config.example.yaml" ( + echo 提示: 发现示例配置文件,请复制并修改: + echo copy config\config.example.yaml config\config.yaml + ) + echo. +) + +REM 获取 UV 路径 +for /f "tokens=*" %%i in ('where uv 2^>nul') do set "UV_PATH=%%i" + +if not defined UV_PATH ( + echo ⚠️ 无法获取 UV 路径,请手动查找 + set "UV_PATH=uv" +) + +echo. +echo ╔════════════════════════════════════════╗ +echo ║ 部署完成! ║ +echo ╚════════════════════════════════════════╝ +echo. +echo 📋 MCP 服务器配置信息: +echo. +echo 命令: %UV_PATH% +echo 工作目录: %PROJECT_ROOT% +echo. +echo 参数(逐行填入): +echo --directory +echo %PROJECT_ROOT% +echo run +echo python +echo -m +echo mcp_server.server +echo. +echo 📖 详细教程: README-Cherry-Studio.md +echo. +pause diff --git a/start-http.bat b/start-http.bat new file mode 100644 index 0000000000000..ea3ca6c36ca9f --- /dev/null +++ b/start-http.bat @@ -0,0 +1,25 @@ +@echo off +chcp 65001 >nul + +echo ╔════════════════════════════════════════╗ +echo ║ TrendRadar MCP Server (HTTP 模式) ║ +echo ╚════════════════════════════════════════╝ +echo. + +REM 检查虚拟环境 +if not exist ".venv\Scripts\python.exe" ( + echo ❌ [错误] 虚拟环境未找到 + echo 请先运行 setup-windows.bat 或 setup-windows-en.bat 进行部署 + echo. + pause + exit /b 1 +) + +echo [模式] HTTP (适合远程访问) +echo [地址] http://localhost:3333/mcp +echo [提示] 按 Ctrl+C 停止服务 +echo. + +uv run python -m mcp_server.server --transport http --host 0.0.0.0 --port 3333 + +pause diff --git a/start-http.sh b/start-http.sh new file mode 100644 index 0000000000000..09e61a6e644cc --- /dev/null +++ b/start-http.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +echo "╔════════════════════════════════════════╗" +echo "║ TrendRadar MCP Server (HTTP 模式) ║" +echo "╚════════════════════════════════════════╝" +echo "" + +# 检查虚拟环境 +if [ ! -d ".venv" ]; then + echo "❌ [错误] 虚拟环境未找到" + echo "请先运行 ./setup-mac.sh 进行部署" + echo "" + exit 1 +fi + +echo "[模式] HTTP (适合远程访问)" +echo "[地址] http://localhost:3333/mcp" +echo "[提示] 按 Ctrl+C 停止服务" +echo "" + +uv run python -m mcp_server.server --transport http --host 0.0.0.0 --port 3333 diff --git a/version b/version new file mode 100644 index 0000000000000..7da3c16870e85 --- /dev/null +++ b/version @@ -0,0 +1 @@ +3.0.5 \ No newline at end of file