dot 3 tuần trước
mục cha
commit
7c12097d19
75 tập tin đã thay đổi với 3857 bổ sung và 4756 xóa
  1. 729 0
      TG商户采集系统-需求方案书.md
  2. 323 0
      backup/spider_pre_v2_20260410_194440.sql
  3. 36 45
      cmd/server/main.go
  4. 0 30
      cmd/server/seed.go
  5. 1 1
      deploy/Dockerfile.api
  6. 150 0
      deploy/migrate_v2.sql
  7. 58 0
      deploy/upgrade_v2.sh
  8. 1 4
      go.mod
  9. 0 6
      go.sum
  10. 0 82
      internal/handler/channel.go
  11. 0 77
      internal/handler/config.go
  12. 0 61
      internal/handler/dashboard.go
  13. 32 38
      internal/handler/keyword.go
  14. 89 65
      internal/handler/merchant.go
  15. 0 38
      internal/handler/nav_site.go
  16. 10 36
      internal/handler/router.go
  17. 0 120
      internal/handler/seed.go
  18. 45 102
      internal/handler/task.go
  19. 7 95
      internal/llm/client.go
  20. 6 7
      internal/model/channel.go
  21. 0 12
      internal/model/config_revision.go
  22. 8 6
      internal/model/keyword.go
  23. 19 20
      internal/model/merchant_clean.go
  24. 18 12
      internal/model/merchant_raw.go
  25. 0 14
      internal/model/nav_site.go
  26. 0 12
      internal/model/seed.go
  27. 0 13
      internal/model/setting.go
  28. 0 20
      internal/model/task.go
  29. 18 0
      internal/model/task_log.go
  30. 0 0
      internal/pipeline/.gitkeep
  31. 0 46
      internal/pipeline/phase.go
  32. 0 183
      internal/pipeline/phase1_discover.go
  33. 0 129
      internal/pipeline/phase2_search.go
  34. 0 250
      internal/pipeline/phase3_github.go
  35. 0 220
      internal/pipeline/phase4_scrape.go
  36. 0 200
      internal/pipeline/phase5_crawl.go
  37. 0 322
      internal/pipeline/phase6_clean.go
  38. 0 126
      internal/pipeline/phase7_score.go
  39. 0 112
      internal/pipeline/pipeline.go
  40. 31 0
      internal/plugin/interface.go
  41. 50 0
      internal/plugin/registry.go
  42. 263 0
      internal/plugins/githubcollector/collector.go
  43. 323 0
      internal/plugins/tgcollector/collector.go
  44. 243 0
      internal/plugins/webcollector/collector.go
  45. 87 0
      internal/processor/blacklist.go
  46. 119 0
      internal/processor/dedup.go
  47. 173 0
      internal/processor/pipeline.go
  48. 66 0
      internal/processor/tagger.go
  49. 40 0
      internal/processor/tmechecker.go
  50. 0 0
      internal/service/.gitkeep
  51. 0 146
      internal/service/settings_service.go
  52. 0 181
      internal/service/task_service.go
  53. 59 0
      internal/store/channel_repo.go
  54. 13 0
      internal/store/db.go
  55. 49 0
      internal/store/keyword_repo.go
  56. 111 0
      internal/store/merchant_repo.go
  57. 392 0
      internal/task/manager.go
  58. 0 0
      internal/worker/.gitkeep
  59. 0 325
      internal/worker/worker.go
  60. BIN
      server.exe
  61. 4 17
      web/src/App.tsx
  62. 48 107
      web/src/api/index.ts
  63. 3 17
      web/src/components/Layout.tsx
  64. 32 72
      web/src/components/TaskControl.tsx
  65. 0 192
      web/src/pages/Channels.tsx
  66. 0 171
      web/src/pages/Dashboard.tsx
  67. 74 53
      web/src/pages/Keywords.tsx
  68. 0 194
      web/src/pages/Logs.tsx
  69. 84 57
      web/src/pages/MerchantsClean.tsx
  70. 0 162
      web/src/pages/MerchantsRaw.tsx
  71. 0 138
      web/src/pages/NavSites.tsx
  72. 0 217
      web/src/pages/Seeds.tsx
  73. 0 167
      web/src/pages/Settings.tsx
  74. 40 33
      web/src/pages/Tasks.tsx
  75. 3 3
      web/src/store/index.ts

+ 729 - 0
TG商户采集系统-需求方案书.md

@@ -0,0 +1,729 @@
+# TG 商户采集系统 — 需求方案书(v2)
+
+> 本文档描述系统要实现的功能和业务逻辑,供开发者从零设计和实现。
+> 版本: v2(精简版,砍掉低 ROI 模块,强调模块化隔离)
+
+---
+
+## 一、系统目标
+
+**一句话:用关键词去 Google 搜,把搜到的网页里的商户联系方式扒下来,清洗后输出一张可以直接联系的客户表。**
+
+### 什么是"商户"
+
+系统要找的是**在 TG 上提供产品或服务的人或组织**。
+
+判定标准(满足任意一条即算商户):
+- 有 TG 联系方式(@xxx 或 t.me/xxx)**并且**有商业意图(接单、代理、价格、购买、咨询、客服、官网、订阅)
+- 被导航站收录(导航站本身就是商户目录)
+
+**不算商户的**:聊天用户、新闻频道、系统 bot。
+
+### 目标行业(当前)
+
+当前只做**机场 / VPN / 科学上网**。行业规则可配置,以后可扩展。
+
+### 输入
+
+- 一组**关键词**(比如"机场推荐"、"VPN 订阅"、"科学上网")
+
+### 最终输出
+
+| 字段 | 说明 |
+|------|------|
+| 商户名 | 显示名称 |
+| TG 用户名 | @xxx |
+| TG 链接 | https://t.me/xxx |
+| 网站 | 商户官网 |
+| 邮箱 | 联系邮箱 |
+| 电话 | 联系电话 |
+| 来源 | 从哪个网页/渠道发现的 |
+| 行业标签 | 机场 / VPN 等 |
+| 等级 | Hot / Warm / Cold |
+
+---
+
+## 二、核心架构:插件式采集 + 统一清洗
+
+### 设计理念
+
+系统分两大部分:**采集端**和**处理端**。
+
+- **采集端**:负责从各种渠道找商户,每个渠道是一个**独立插件**
+- **处理端**:负责清洗、去重、验证、打标签,**所有插件共用同一套**
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                     采集端(插件式)                           │
+│                                                               │
+│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐          │
+│  │ 插件 A      │  │ 插件 B      │  │ 插件 C      │  ← 互相  │
+│  │ 网页采集    │  │ TG 频道采集 │  │ 未来新增... │  不影响  │
+│  └──────┬──────┘  └──────┬──────┘  └──────┬──────┘          │
+│         │                │                │                   │
+│         ▼                ▼                ▼                   │
+│  ┌────────────────────────────────────────────────────┐      │
+│  │          统一入口:商户表 raw                        │      │
+│  │   所有插件的产出格式一样,往同一张表写              │      │
+│  └──────────────────────┬─────────────────────────────┘      │
+└─────────────────────────┼─────────────────────────────────────┘
+                          │
+┌─────────────────────────┼─────────────────────────────────────┐
+│                     处理端(固定流程)                         │
+│                          ▼                                     │
+│  ┌──────────┐   ┌──────────┐   ┌──────────┐   ┌──────────┐  │
+│  │ 死号预检 │ → │ 黑名单   │ → │ 去重     │ → │ 打标签   │  │
+│  │ (t.me)   │   │ 过滤     │   │ 合并     │   │ 分等级   │  │
+│  └──────────┘   └──────────┘   └──────────┘   └──────────┘  │
+│                                                     │         │
+│                                                     ▼         │
+│                                          商户表 clean          │
+│                                      (Hot / Warm / Cold)      │
+└───────────────────────────────────────────────────────────────┘
+```
+
+### 为什么这样设计
+
+**核心问题:后期加新的采集渠道,不能把前面的东西弄坏。**
+
+解决办法:**插件隔离**。
+
+```
+规则 1: 每个采集插件是独立的代码模块,有自己的目录/文件
+规则 2: 插件之间零依赖,A 插件的代码不能 import B 插件的任何东西
+规则 3: 所有插件的产出格式统一(见下方"标准产出格式")
+规则 4: 插件只管采集,不管清洗/去重/打分 — 那是处理端的事
+规则 5: 新增插件 = 新建一个目录 + 实现标准接口,不改任何已有代码
+```
+
+### 标准产出格式(所有插件必须遵守)
+
+每个插件采集到商户后,必须按这个格式写入 `merchants_raw` 表:
+
+```
+{
+  "merchant_name": "商户名(选填)",
+  "tg_username": "xxx(必填,没有就不入库)",
+  "tg_link": "https://t.me/xxx",
+  "website": "官网地址(选填)",
+  "email": "邮箱(选填)",
+  "phone": "电话(选填)",
+  "source_type": "web / tg_channel / github / ...",
+  "source_name": "具体来源(哪个网页/频道)",
+  "source_url": "来源 URL",
+  "original_text": "原始文本(留底)",
+  "industry_tag": "行业标签(选填)"
+}
+```
+
+**关键约束**:没有 `tg_username` 的商户不入库。这是核心数据,其他都是锦上添花。
+
+### 插件的标准接口
+
+每个插件需要实现以下接口(伪代码):
+
+```
+class CollectorPlugin:
+    name: str              # 插件名,比如 "web_collector"
+    
+    async def run(config, callback):
+        """
+        config: 该插件的配置(关键词、URL 列表等)
+        callback(merchant_data): 每找到一个商户就调一次,由框架写入 raw 表
+        """
+        
+    async def stop():
+        """外部可以随时叫停"""
+```
+
+框架负责:调度插件、写数据库、记日志、控制并发。
+插件负责:采集逻辑,只管找商户,找到就 callback。
+
+---
+
+## 三、采集插件 A:网页采集(优先开发)
+
+**这是最高优先级的插件,也是系统的核心价值。**
+
+### 为什么优先
+
+- 一个导航站几秒出 50 个商户,效率最高
+- 没有限速问题,想跑多快跑多快
+- 导航站上的商户是别人已经整理好的,质量高
+
+### 流程
+
+```
+关键词 → Google 搜索 → 拿到 URL 列表
+                              │
+                    ┌─────────┴─────────┐
+                    ↓                   ↓
+              URL 是 t.me/xxx      URL 是网页
+              直接提取 username     打开网页读 HTML
+                    │                   │
+                    │             ┌─────┴──────┐
+                    │             ↓            ↓
+                    │        找 t.me 链接   找联系方式
+                    │        提取 username  (邮箱/电话/网址)
+                    │             │            │
+                    └──────┬──────┘            │
+                           ↓                   │
+                    写入 merchants_raw ←────────┘
+```
+
+### 详细逻辑
+
+**第一步:关键词搜索**
+
+1. 从关键词表拿关键词(比如"机场推荐 telegram")
+2. 调搜索 API(Serper 或 Brave Search),拿搜索结果
+3. 每个关键词搜 3-5 页,每页 10 条
+4. 关键词之间等几秒,避免被封
+
+**第二步:URL 分拣**
+
+拿到的 URL 分三种:
+
+| URL 类型 | 怎么判断 | 怎么处理 |
+|----------|---------|---------|
+| `t.me/xxx` | URL 以 t.me/ 开头 | 直接提取 username,写 raw 表 |
+| 导航站/有用网页 | 不在黑名单里的网页 | 打开网页,进入第三步 |
+| 垃圾 | 在黑名单里(twitter/google/youtube 等 80 个域名) | 丢弃 |
+
+**第三步:网页解析**
+
+1. 用 HTTP 请求抓网页 HTML
+2. HTML 前 3000 字不含中文 → 跳过(不是中文站)
+3. 解析 HTML,找所有外链:
+   - `t.me/xxx` 链接 → 提取 username
+   - `mailto:xxx` → 提取邮箱
+   - 电话号码正则 → 提取电话
+4. 如果页面上有很多 t.me 链接(>5 个),说明这是个导航站,每个链接都是一个商户
+5. 每个提取到的商户按标准格式写入 raw 表
+
+**HTTP 请求失败时的 fallback**(按顺序尝试):
+1. 标准 `net/http`(或 colly 爬虫框架)
+2. 带自定义 TLS 指纹的 HTTP 客户端(绕反爬,如 utls 库)
+3. chromedp / rod(Go 原生浏览器引擎,处理 JS 渲染页面)
+
+### 搜索 API 选择
+
+| 方案 | 免费额度 | 付费 | 推荐 |
+|------|---------|------|------|
+| **Brave Search API** | 5000 次/月 | $5/1000 次 | 先用这个测试 |
+| **Serper.dev** | 2500 次(一次性) | $50/50000 次 | Google 结果最准 |
+| **DuckDuckGo** | 无限 | 免费 | 开源库,稳定性差 |
+
+建议:**先用 Brave 免费额度测试**,结果不够好再切 Serper。代码层面做成可配置,换 API 只改配置不改代码。
+
+---
+
+## 四、采集插件 B:TG 频道采集(第二优先级)
+
+**等插件 A 跑稳了再开发这个。** 这是锦上添花,不是核心。
+
+### 为什么第二优先级
+
+- TG 限速严,一天最多出几十个商户
+- 需要手机号注册账号,被封就废
+- 开发和维护成本比网页采集高得多
+
+### 流程
+
+```
+种子频道列表 → 进频道 → 读历史消息(最近 500 条)
+                              │
+                    每条消息看有没有联系方式
+                    正则快扫 → 有 → AI 精确提取
+                              │
+                    写入 merchants_raw(标准格式)
+```
+
+### 详细逻辑
+
+1. 从种子列表拿频道(比如 @bbs3000),种子由用户手动添加
+2. 用 TG 客户端库(Go: gotd/td)登录 TG 账号,进入频道
+3. 读最近 500 条消息(支持断点续传,记住上次读到哪条)
+4. 每条消息:
+   - 系统消息 / 非中文 → 跳过
+   - 正则快速扫描有没有 `@xxx`、`t.me/xxx`、邮箱、网址
+   - 有联系方式 → 调 AI 精确提取商户信息
+   - AI 超时(>5 秒)或失败 → 用正则兜底
+5. 提取到的商户按标准格式写入 raw 表
+
+### TG 账号管理(重要)
+
+TG 账号是稀缺资源,需要专门的调度器:
+
+**entity ID 缓存(必须做)**:
+- 第一次 `ResolveUsername` 拿到频道的数字 ID 后存到本地
+- 以后直接用数字 ID 访问,不再调 `ResolveUsername`
+- 这样同一个频道只消耗 1 次 resolve 额度,之后无限次不限速
+
+**限速处理**:
+- 全局请求频率控制(所有模块共享,不超过 30 次/分钟)
+- FloodWait < 60 秒 → 等完继续
+- FloodWait > 60 秒 → 切账号
+- FloodWait > 300 秒 → 停止,下次再来
+- 没有可用账号 → 排队等待,不崩溃
+
+**每个 TG 账号需要的信息**:
+
+| 字段 | 说明 |
+|------|------|
+| 手机号 | 注册 Telegram 用的 |
+| api_id | 在 https://my.telegram.org 申请 |
+| api_hash | 同上 |
+| session 文件 | 首次登录后生成 |
+
+---
+
+## 五、采集插件 C/D/E...:未来扩展
+
+以下是以后可能加的插件,**现在不做,但架构要能支持**:
+
+| 插件 | 数据源 | 什么时候加 |
+|------|--------|-----------|
+| GitHub 搜索 | GitHub README 里的 t.me 链接 | 网页+TG 都稳定后 |
+| TG 频道裂变 | 从种子频道滚雪球发现新频道 | TG 采集稳定后 |
+| 百度搜索 | 百度搜索结果 | 如果 Google 覆盖不够 |
+| Twitter/X | 推文里的 t.me 链接 | 如果有需求 |
+
+**加新插件的步骤**(这是模块化的价值):
+1. 新建一个目录/文件
+2. 实现 `run()` 和 `stop()` 接口
+3. 按标准格式 callback 产出商户
+4. 在配置里注册插件名
+5. **不改任何已有代码**
+
+---
+
+## 六、处理端:清洗流程
+
+所有插件的产出都进 `merchants_raw` 表,然后统一过清洗流程。
+
+### 清洗流水线(4 步,按顺序执行)
+
+```
+merchants_raw → [死号预检] → [黑名单过滤] → [去重合并] → [打标签分等级] → merchants_clean
+```
+
+### 第一步:t.me 死号预检(免费,无限速)
+
+- 用 HTTP 请求访问 `https://t.me/{username}`
+- 看返回 HTML 里有没有 `tgme_page_photo_image` 标记
+- 有头像 = 活号 → 继续
+- 没头像 = 死号 → 标记 invalid,不进后面的步骤
+- **准确率 100%,不花钱,不限速**
+- 建议并发 10 个,每分钟能检 600 个
+
+### 第二步:黑名单过滤(本地,秒级)
+
+| 规则 | 处理 |
+|------|------|
+| TG 用户名是系统 bot(@BotFather、@SpamBot、以 bot 结尾) | 标记 bot |
+| TG 用户名像邀请链接哈希(16-24 位随机字符串) | 标记 invalid |
+| 原始文本不含中文(如果有原始文本的话) | 标记 invalid |
+
+### 第三步:去重合并(本地,秒级)
+
+- 同一个 tg_username 可能被多个插件多次发现
+- 按信息丰富度保留最好的一条(有网站 > 没网站,有邮箱 > 没邮箱)
+- 其他标记为 duplicate
+- 合并所有来源信息到保留的那条
+
+### 第四步:打标签 + 分等级
+
+**行业标签**:用关键词匹配(商户名/原始文本里包含"机场""节点""VPN"→ 打标签)。只做机场/VPN 一个行业时,关键词匹配完全够用,不需要 AI。
+
+**等级划分**(3 个桶,不打分):
+
+| 等级 | 条件 | 含义 |
+|------|------|------|
+| **Hot** | 行业匹配 + 有网站或邮箱 | 优先联系,信息最全 |
+| **Warm** | 行业匹配 + 只有 TG 号 | 可以联系,但信息少 |
+| **Cold** | 行业不匹配 / 信息太少 | 暂不联系 |
+
+**为什么不用 0-100 打分**:
+- 100 分制需要成员数、Premium、活跃度等数据,但这些要调 TG API 才能拿到
+- TG API 是最大的瓶颈,为了打分去调 API 得不偿失
+- 3 个桶简单直观,销售拿到手就能用
+
+### 可选增强:TG 验证(需要 TG 账号)
+
+如果有 TG 账号且未被限速,可以在第三步和第四步之间加一步:
+- 调 `ResolveUsername` 验证账号真实性
+- 拿到:显示名、是否 Premium、最后在线时间
+- 有了这些数据可以更精准地分等级
+
+**但这不是必须的。** 没有 TG 账号,系统照样能跑(靠 t.me 预检 + 黑名单就能过滤大部分垃圾)。
+
+---
+
+## 七、数据模型(5 张表)
+
+### 表 1:关键词表 (keywords)
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| id | int | 主键 |
+| keyword | string | 搜索关键词 |
+| industry_tag | string | 行业标签(机场/VPN) |
+| enabled | bool | 是否启用 |
+| created_at | datetime | 创建时间 |
+
+种子频道也放这个表(`industry_tag = 'seed'`),不单独建表。
+
+### 表 2:商户表 — 原始 (merchants_raw)
+
+所有插件的产出统一写这张表。
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| id | int | 主键 |
+| tg_username | string | **必填**,TG 用户名 |
+| tg_link | string | t.me 链接 |
+| merchant_name | string | 商户名 |
+| website | string | 官网 |
+| email | string | 邮箱 |
+| phone | string | 电话 |
+| source_type | string | 来源类型(web / tg_channel / github) |
+| source_name | string | 具体来源(哪个网页/频道) |
+| source_url | string | 来源 URL |
+| original_text | text | 原始文本 |
+| industry_tag | string | 行业标签 |
+| status | string | raw / processing / done |
+| created_at | datetime | 入库时间 |
+
+**入库去重规则**:同 `tg_username` + 同 `source_url` 不重复插入。不同来源发现同一个 username 允许多条(去重在清洗阶段做)。
+
+### 表 3:商户表 — 已清洗 (merchants_clean)
+
+清洗通过的商户。
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| id | int | 主键 |
+| tg_username | string | TG 用户名 |
+| tg_link | string | t.me 链接 |
+| merchant_name | string | 商户名 |
+| website | string | 官网 |
+| email | string | 邮箱 |
+| phone | string | 电话 |
+| source_count | int | 被多少个来源发现 |
+| all_sources | text | 所有来源列表(JSON) |
+| industry_tag | string | 行业标签 |
+| level | string | **Hot / Warm / Cold** |
+| status | string | valid / invalid / bot / duplicate |
+| is_alive | bool | t.me 预检结果 |
+| last_checked_at | datetime | 最近一次验证时间 |
+| created_at | datetime | 首次发现时间 |
+
+### 表 4:频道表 (channels)
+
+只有启用了 TG 采集插件才需要这张表。
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| id | int | 主键 |
+| username | string | 频道用户名(唯一) |
+| channel_id | bigint | TG 数字 ID(缓存,避免重复 resolve) |
+| access_hash | bigint | TG access_hash(缓存) |
+| status | string | pending / scraped / skipped |
+| last_message_id | int | 上次采集到哪条(断点续传) |
+| merchants_found | int | 发现了多少商户 |
+| source | string | seed / discovered |
+| created_at | datetime | 入库时间 |
+
+### 表 5:任务日志表 (task_logs)
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| id | int | 主键 |
+| task_type | string | web_collect / tg_collect / clean / ... |
+| plugin_name | string | 哪个插件 |
+| status | string | running / success / failed / stopped |
+| items_processed | int | 处理了多少条 |
+| merchants_added | int | 新增了多少商户 |
+| errors_count | int | 错误数 |
+| started_at | datetime | 开始时间 |
+| finished_at | datetime | 结束时间 |
+| detail | text | 详细日志/错误信息 |
+
+---
+
+## 八、前端需求(只做 2 个页面)
+
+早期只需要 2 个页面,其他的等有需求了再加。
+
+### 页面 1:商户列表
+
+- 显示 `merchants_clean` 表的数据
+- 按等级筛选(Hot / Warm / Cold)
+- 按行业筛选
+- 按来源筛选
+- 搜索(按商户名、TG 用户名)
+- 排序(按发现时间、来源数)
+- 导出 CSV / Excel
+- 点击 TG 链接可以直接跳转
+
+### 页面 2:任务管理
+
+- 选择插件启动任务(网页采集 / TG 采集 / 清洗)
+- 显示当前运行中的任务
+- 停止任务
+- 查看历史任务和结果
+
+### 不做的(延后)
+
+| 功能 | 为什么不做 |
+|------|-----------|
+| ~~仪表盘~~ | 数据量小时看列表就够了 |
+| ~~配置管理~~ | 改配置文件比写前端快 |
+| ~~实时日志流~~ | SSH 看日志就行 |
+| ~~种子频道管理~~ | 初期手动维护,量不大 |
+
+---
+
+## 九、外部依赖
+
+### 技术栈
+
+| 层 | 选型 | 说明 |
+|---|---|---|
+| **后端语言** | **Go** | 高并发、编译型、单二进制部署 |
+| **Web 框架** | Gin 或 Echo | 轻量 HTTP 框架 |
+| **ORM** | GORM | Go 主流 ORM |
+| **数据库** | SQLite(初期)/ PostgreSQL(后期) | 初期单机够用 |
+| **TG 客户端** | gotd/td 或 gotdlib | Go 原生 Telegram MTProto 库 |
+| **HTML 解析** | goquery | 类似 jQuery 的 HTML 解析 |
+| **HTTP 客户端** | net/http + colly | 标准库 + 爬虫框架 |
+| **浏览器引擎** | chromedp 或 rod | Go 原生 Chrome DevTools Protocol(替代 Playwright) |
+| **前端** | Vue 3 + Vite + TypeScript | 不变 |
+| **配置** | YAML(viper 库) | Go 生态标准 |
+| **日志** | zerolog 或 zap | 结构化日志 |
+
+### 必须的外部服务
+
+| 服务 | 用途 | 备注 |
+|------|------|------|
+| **搜索 API** | 关键词搜索 | Brave(免费 5000 次/月)或 Serper($50/50000 次) |
+| **HTTP 客户端** | 抓网页、t.me 预检 | net/http + colly + chromedp 三层 fallback |
+
+### 可选的(TG 采集插件启用后才需要)
+
+| 服务 | 用途 | 备注 |
+|------|------|------|
+| **gotd/td** | TG 频道采集 | Go 原生 MTProto 库,替代 Python Telethon |
+| **AI 大模型 API** | 联系方式提取 | 智谱 GLM 或 DeepSeek,仅 TG 采集时用,HTTP 调用即可 |
+
+### AI 使用策略
+
+**规则优先,AI 只在一个地方用。**
+
+| 环节 | 方法 | 说明 |
+|------|------|------|
+| 网页联系方式提取 | **纯正则** | 网页上的 t.me 链接、邮箱、电话,正则就能 100% 提取 |
+| TG 消息联系方式提取 | **正则 + AI** | 非标准格式("加V:xxx")需要 AI |
+| 行业分类 | **纯关键词匹配** | 只做机场/VPN,关键词够用 |
+| 导航站识别 | **纯规则**(黑名单 + 正向关键词) | 不需要 AI |
+
+AI 只在 TG 采集插件的联系方式提取环节使用。网页采集完全不需要 AI。
+
+---
+
+## 十、运行方式
+
+### 单插件运行
+
+每个插件可以独立跑:
+- 只跑网页采集 → 看搜到了什么
+- 只跑清洗 → 处理已有的脏数据
+- 只跑 TG 采集 → 从指定频道挖商户
+
+### 全链路运行
+
+也可以串起来:`网页采集 → 清洗`(两步就够了)
+
+### 任务控制
+
+- 每个任务有状态:运行中 / 完成 / 失败 / 已停止
+- 支持手动停止
+- 同类型任务不能同时跑两个
+- 支持测试模式(只跑少量数据)
+
+---
+
+## 十一、踩过的坑(供参考)
+
+### 1. TG 限速是最大坑
+
+单账号一天最多几百次 `ResolveUsername`,之后被限速 10-24 小时。
+
+**根治方案**:缓存 `channel_id + access_hash`,同一个频道只调一次 `ResolveUsername`,之后用数字 ID 直接访问。
+
+### 2. t.me 网页可以免费检测死号
+
+访问 `https://t.me/{username}`,HTML 里有 `tgme_page_photo_image` = 活号,没有 = 死号。准确率 100%,不限速,不花钱。**在调 TG API 之前先做这一步能省 90% 的 API 调用。**
+
+### 3. AI 会编造联系方式
+
+AI 提取后必须用正则回原文二次验证。原文里找不到的,丢弃 AI 的结果。
+
+### 4. 清洗后数据和原始数据分开存
+
+用两张表(raw 和 clean),清洗通过的搬到 clean 表。raw 表保留原始数据,可以反复清洗。
+
+### 5. 非中文内容直接跳过
+
+系统只做中文商户,非中文的网页/消息直接跳过,节省大量处理时间。
+
+### 6. 网页抓取要有 fallback
+
+有些网页有反爬(Cloudflare),有些是 JS 渲染。按顺序试:net/http → utls(自定义 TLS 指纹)→ chromedp/rod(浏览器引擎)。
+
+### 7. 不要一上来就做全链路
+
+先把一个插件(网页采集)做稳做透,再加第二个(TG)。一上来就做 7 阶段 pipeline,结果哪个都不稳。
+
+---
+
+## 附录 A:模块化目录结构建议(Go)
+
+```
+tg-lead-scraper/
+├── cmd/
+│   └── server/
+│       └── main.go                 # 程序入口
+│
+├── internal/
+│   ├── plugin/                     # 插件框架
+│   │   ├── interface.go            # 插件接口定义(Collector interface)
+│   │   └── registry.go             # 插件注册中心
+│   │
+│   ├── plugins/                    # 采集插件目录(每个插件一个包)
+│   │   ├── webcollector/           # 插件 A:网页采集
+│   │   │   ├── collector.go        # 实现 Collector 接口
+│   │   │   ├── searcher.go         # 调搜索 API
+│   │   │   └── parser.go           # 解析网页 HTML
+│   │   ├── tgcollector/            # 插件 B:TG 频道采集
+│   │   │   ├── collector.go        # 实现 Collector 接口
+│   │   │   ├── scraper.go          # TG 消息采集
+│   │   │   └── account.go          # TG 账号调度
+│   │   └── githubcollector/        # 插件 C:未来新增
+│   │       └── ...
+│   │
+│   ├── processor/                  # 处理端(清洗流程)
+│   │   ├── pipeline.go             # 清洗流水线调度
+│   │   ├── tmechecker.go           # t.me 死号预检
+│   │   ├── blacklist.go            # 黑名单过滤
+│   │   ├── dedup.go                # 去重合并
+│   │   └── tagger.go               # 打标签 + 分等级
+│   │
+│   ├── model/                      # 数据模型
+│   │   ├── merchant.go             # 商户结构体 + GORM model
+│   │   ├── channel.go              # 频道
+│   │   ├── keyword.go              # 关键词
+│   │   └── tasklog.go              # 任务日志
+│   │
+│   ├── store/                      # 数据访问层
+│   │   ├── db.go                   # 数据库连接 + 初始化
+│   │   ├── merchant_repo.go        # 商户 CRUD
+│   │   └── keyword_repo.go         # 关键词 CRUD
+│   │
+│   ├── extractor/                  # 联系方式提取器
+│   │   ├── regex.go                # 正则提取
+│   │   └── ai.go                   # AI 提取(调大模型 API)
+│   │
+│   └── task/                       # 任务调度
+│       └── manager.go              # 任务启停、并发控制
+│
+├── api/                            # HTTP API
+│   ├── server.go                   # Gin/Echo 初始化
+│   ├── handler/
+│   │   ├── merchant.go             # 商户列表 API
+│   │   └── task.go                 # 任务管理 API
+│   └── middleware/
+│       └── auth.go                 # 认证中间件
+│
+├── frontend/                       # 前端(Vue 3)
+│   └── ...
+│
+├── config/
+│   └── config.yaml                 # 全局配置
+│
+├── go.mod
+├── go.sum
+└── Makefile
+```
+
+### 插件接口定义(Go interface)
+
+```go
+// internal/plugin/interface.go
+package plugin
+
+import "context"
+
+// MerchantData 是所有插件的标准产出格式
+type MerchantData struct {
+    TgUsername   string `json:"tg_username"`
+    TgLink       string `json:"tg_link"`
+    MerchantName string `json:"merchant_name"`
+    Website      string `json:"website"`
+    Email        string `json:"email"`
+    Phone        string `json:"phone"`
+    SourceType   string `json:"source_type"`
+    SourceName   string `json:"source_name"`
+    SourceURL    string `json:"source_url"`
+    OriginalText string `json:"original_text"`
+    IndustryTag  string `json:"industry_tag"`
+}
+
+// Collector 是所有采集插件必须实现的接口
+type Collector interface {
+    // Name 返回插件名,比如 "web_collector"
+    Name() string
+    // Run 启动采集,每找到一个商户就调 callback,ctx 取消时优雅退出
+    Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error
+    // Stop 外部可以随时叫停
+    Stop() error
+}
+```
+
+### 加新插件的步骤
+
+1. 在 `internal/plugins/` 下新建包(比如 `baiducollector/`)
+2. 实现 `Collector` 接口的 3 个方法
+3. 在 `registry.go` 注册插件名
+4. 在 `config.yaml` 加插件配置
+5. **不改 internal/plugins/ 外的任何代码**
+
+---
+
+## 附录 B:完整数据流图
+
+```
+┌─────────── 采集端 ───────────┐
+│                                │
+│  关键词 → [网页采集插件]       │
+│              │                 │
+│              ├→ t.me 链接      │
+│              └→ 网页 → 解析    │      ┌─────── 处理端 ──────┐
+│                    │           │      │                       │
+│                    ↓           │      │  [死号预检]           │
+│            merchants_raw  ←────┼──→   │      ↓               │
+│                    ↑           │      │  [黑名单过滤]        │
+│  种子频道 → [TG 采集插件]     │      │      ↓               │
+│              │                 │      │  [去重合并]          │
+│              └→ 消息 → AI提取 │      │      ↓               │
+│                                │      │  [打标签分等级]      │
+│  (未来)  → [GitHub 插件]      │      │      ↓               │
+│  (未来)  → [百度插件]         │      │  merchants_clean     │
+│  (未来)  → [Twitter 插件]     │      │  (Hot/Warm/Cold)     │
+│                                │      │                       │
+└────────────────────────────────┘      └───────────────────────┘
+                                                    │
+                                                    ↓
+                                           前端:商户列表 + 导出
+```

+ 323 - 0
backup/spider_pre_v2_20260410_194440.sql

@@ -0,0 +1,323 @@
+-- MySQL dump 10.13  Distrib 8.0.45, for Linux (x86_64)
+--
+-- Host: localhost    Database: spider
+-- ------------------------------------------------------
+-- Server version	8.0.45
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!50503 SET NAMES utf8mb4 */;
+/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
+/*!40103 SET TIME_ZONE='+00:00' */;
+/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
+/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
+/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
+/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
+
+--
+-- Table structure for table `channels`
+--
+
+DROP TABLE IF EXISTS `channels`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `channels` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `username` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `title` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `member_count` bigint DEFAULT '0',
+  `about` text COLLATE utf8mb4_unicode_ci,
+  `source` enum('seed','snowball','search','github') COLLATE utf8mb4_unicode_ci NOT NULL,
+  `source_detail` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `status` enum('pending','scraped','failed','skipped') COLLATE utf8mb4_unicode_ci DEFAULT 'pending',
+  `last_message_id` bigint DEFAULT '0',
+  `relevance_score` double DEFAULT NULL,
+  `created_at` datetime(3) DEFAULT NULL,
+  `updated_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_channels_username` (`username`),
+  KEY `idx_channels_source` (`source`),
+  KEY `idx_channels_status` (`status`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `channels`
+--
+
+LOCK TABLES `channels` WRITE;
+/*!40000 ALTER TABLE `channels` DISABLE KEYS */;
+/*!40000 ALTER TABLE `channels` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `config_revisions`
+--
+
+DROP TABLE IF EXISTS `config_revisions`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `config_revisions` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `setting_key` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `old_value` text COLLATE utf8mb4_unicode_ci,
+  `new_value` text COLLATE utf8mb4_unicode_ci,
+  `changed_by` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT 'admin',
+  `created_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `idx_config_revisions_setting_key` (`setting_key`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `config_revisions`
+--
+
+LOCK TABLES `config_revisions` WRITE;
+/*!40000 ALTER TABLE `config_revisions` DISABLE KEYS */;
+/*!40000 ALTER TABLE `config_revisions` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `managed_keywords`
+--
+
+DROP TABLE IF EXISTS `managed_keywords`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `managed_keywords` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `keyword` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `category` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `status` enum('active','inactive') COLLATE utf8mb4_unicode_ci DEFAULT 'active',
+  `created_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_managed_keywords_keyword` (`keyword`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `managed_keywords`
+--
+
+LOCK TABLES `managed_keywords` WRITE;
+/*!40000 ALTER TABLE `managed_keywords` DISABLE KEYS */;
+/*!40000 ALTER TABLE `managed_keywords` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `managed_seeds`
+--
+
+DROP TABLE IF EXISTS `managed_seeds`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `managed_seeds` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `channel_name` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `status` enum('active','inactive') COLLATE utf8mb4_unicode_ci DEFAULT 'active',
+  `note` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `created_at` datetime(3) DEFAULT NULL,
+  `updated_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_managed_seeds_channel_name` (`channel_name`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `managed_seeds`
+--
+
+LOCK TABLES `managed_seeds` WRITE;
+/*!40000 ALTER TABLE `managed_seeds` DISABLE KEYS */;
+/*!40000 ALTER TABLE `managed_seeds` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `managed_settings`
+--
+
+DROP TABLE IF EXISTS `managed_settings`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `managed_settings` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `key_name` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `value` text COLLATE utf8mb4_unicode_ci NOT NULL,
+  `value_type` enum('int','float','bool','string','json') COLLATE utf8mb4_unicode_ci NOT NULL,
+  `effect_level` enum('runtime','new_task') COLLATE utf8mb4_unicode_ci DEFAULT 'runtime',
+  `description` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `updated_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_managed_settings_key_name` (`key_name`)
+) ENGINE=InnoDB AUTO_INCREMENT=14 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `managed_settings`
+--
+
+LOCK TABLES `managed_settings` WRITE;
+/*!40000 ALTER TABLE `managed_settings` DISABLE KEYS */;
+INSERT INTO `managed_settings` VALUES (1,'pipeline.skip_phases','[]','json','new_task','默认跳过的阶段','2026-04-09 12:40:53.888'),(2,'pipeline.checkpoint_interval','30','int','runtime','进度上报间隔(秒)','2026-04-09 12:40:53.900'),(3,'tg_scraper.message_limit_per_channel','500','int','runtime','每频道最大消息数','2026-04-09 12:40:53.915'),(4,'tg_scraper.delay_per_message','1.0','float','runtime','消息间延迟(秒)','2026-04-09 12:40:53.925'),(5,'tg_scraper.delay_per_channel','5.0','float','runtime','频道间延迟(秒)','2026-04-09 12:40:53.934'),(6,'tg_scraper.delay_per_verify','3.0','float','runtime','验证间延迟(秒)','2026-04-09 12:40:53.944'),(7,'clean.timeout_seconds','3600','int','runtime','清洗阶段超时(秒)','2026-04-09 12:40:53.954'),(8,'search.timeout_seconds','3600','int','runtime','搜索阶段超时(秒)','2026-04-09 12:40:53.964'),(9,'snowball.max_channels_per_layer','200','int','runtime','每层最大频道数','2026-04-09 12:40:53.974'),(10,'snowball.max_channels_total','500','int','runtime','总最大频道数','2026-04-09 12:40:53.983'),(11,'tme_validator.enabled','true','bool','runtime','启用t.me死号预检','2026-04-09 12:40:53.993'),(12,'tme_validator.rate_per_min','60','int','runtime','预检限速(次/分)','2026-04-09 12:40:54.006'),(13,'tme_validator.concurrency','10','int','runtime','预检并发数','2026-04-09 12:40:54.015');
+/*!40000 ALTER TABLE `managed_settings` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `merchant_cleans`
+--
+
+DROP TABLE IF EXISTS `merchant_cleans`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `merchant_cleans` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `raw_id` bigint unsigned DEFAULT NULL,
+  `merchant_name` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `tg_username` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `website` varchar(2048) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `email` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `phone` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `industry` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `status` enum('valid','invalid','bot','duplicate','group') COLLATE utf8mb4_unicode_ci NOT NULL,
+  `tg_first_name` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `tg_last_name` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `is_premium` tinyint(1) DEFAULT '0',
+  `last_online` datetime(3) DEFAULT NULL,
+  `active_level` enum('active','moderate','inactive') COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `member_count` bigint DEFAULT '0',
+  `quality_score` double DEFAULT '0',
+  `source_count` bigint DEFAULT '1',
+  `source_links` json DEFAULT NULL,
+  `created_at` datetime(3) DEFAULT NULL,
+  `updated_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_merchant_cleans_tg_username` (`tg_username`),
+  KEY `idx_merchant_cleans_raw_id` (`raw_id`),
+  KEY `idx_merchant_cleans_industry` (`industry`),
+  KEY `idx_merchant_cleans_status` (`status`),
+  KEY `idx_merchant_cleans_quality_score` (`quality_score`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `merchant_cleans`
+--
+
+LOCK TABLES `merchant_cleans` WRITE;
+/*!40000 ALTER TABLE `merchant_cleans` DISABLE KEYS */;
+/*!40000 ALTER TABLE `merchant_cleans` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `merchant_raws`
+--
+
+DROP TABLE IF EXISTS `merchant_raws`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `merchant_raws` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `merchant_name` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `tg_username` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `website` varchar(2048) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `email` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `phone` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `industry` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `source_type` enum('tg_scrape','web_crawl','github') COLLATE utf8mb4_unicode_ci NOT NULL,
+  `source_id` varchar(500) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `original_message` text COLLATE utf8mb4_unicode_ci,
+  `status` enum('raw','glm_parsed') COLLATE utf8mb4_unicode_ci DEFAULT 'raw',
+  `created_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `idx_merchant_raws_tg_username` (`tg_username`),
+  KEY `idx_merchant_raws_status` (`status`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `merchant_raws`
+--
+
+LOCK TABLES `merchant_raws` WRITE;
+/*!40000 ALTER TABLE `merchant_raws` DISABLE KEYS */;
+/*!40000 ALTER TABLE `merchant_raws` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `nav_sites`
+--
+
+DROP TABLE IF EXISTS `nav_sites`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `nav_sites` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `url` varchar(2048) COLLATE utf8mb4_unicode_ci NOT NULL,
+  `domain` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `source` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `status` enum('pending','scraped','filtered','failed') COLLATE utf8mb4_unicode_ci DEFAULT 'pending',
+  `filter_reason` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
+  `merchant_count` bigint DEFAULT '0',
+  `created_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_url` (`url`(500)),
+  KEY `idx_nav_sites_domain` (`domain`),
+  KEY `idx_nav_sites_status` (`status`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `nav_sites`
+--
+
+LOCK TABLES `nav_sites` WRITE;
+/*!40000 ALTER TABLE `nav_sites` DISABLE KEYS */;
+/*!40000 ALTER TABLE `nav_sites` ENABLE KEYS */;
+UNLOCK TABLES;
+
+--
+-- Table structure for table `tasks`
+--
+
+DROP TABLE IF EXISTS `tasks`;
+/*!40101 SET @saved_cs_client     = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `tasks` (
+  `id` bigint unsigned NOT NULL AUTO_INCREMENT,
+  `task_type` enum('full','discover','search','github','scrape','crawl','clean','score') COLLATE utf8mb4_unicode_ci NOT NULL,
+  `status` enum('pending','running','completed','failed','stopped') COLLATE utf8mb4_unicode_ci DEFAULT 'pending',
+  `params` json DEFAULT NULL,
+  `progress` json DEFAULT NULL,
+  `result` json DEFAULT NULL,
+  `error_msg` text COLLATE utf8mb4_unicode_ci,
+  `started_at` datetime(3) DEFAULT NULL,
+  `finished_at` datetime(3) DEFAULT NULL,
+  `created_at` datetime(3) DEFAULT NULL,
+  PRIMARY KEY (`id`),
+  KEY `idx_tasks_task_type` (`task_type`),
+  KEY `idx_tasks_status` (`status`)
+) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Dumping data for table `tasks`
+--
+
+LOCK TABLES `tasks` WRITE;
+/*!40000 ALTER TABLE `tasks` DISABLE KEYS */;
+INSERT INTO `tasks` VALUES (1,'full','completed','{\"target\": \"\", \"test_run\": {\"item_limit\": 10, \"message_limit\": 100}, \"task_type\": \"full\", \"skip_phases\": null}',NULL,'{\"message\": \"task completed successfully\"}','','2026-04-09 12:44:49.753','2026-04-09 12:44:50.501','2026-04-09 12:44:49.733'),(2,'score','completed','{\"target\": \"\", \"test_run\": {\"item_limit\": 5, \"message_limit\": 10}, \"task_type\": \"score\", \"skip_phases\": null}',NULL,'{\"message\": \"task completed successfully\"}','','2026-04-09 13:10:50.991','2026-04-09 13:10:51.008','2026-04-09 13:10:50.402'),(3,'score','completed','{\"target\": \"\", \"test_run\": {\"item_limit\": 5, \"message_limit\": 10}, \"task_type\": \"score\", \"skip_phases\": null}',NULL,'{\"message\": \"task completed successfully\"}','','2026-04-09 13:10:56.222','2026-04-09 13:10:56.238','2026-04-09 13:10:55.281'),(4,'score','completed','{\"target\": \"\", \"test_run\": {\"item_limit\": 5, \"message_limit\": 10}, \"task_type\": \"score\", \"skip_phases\": null}',NULL,'{\"message\": \"task completed successfully\"}','','2026-04-09 13:20:42.118','2026-04-09 13:20:42.132','2026-04-09 13:20:42.031'),(5,'discover','completed','{\"target\": \"\", \"test_run\": {\"item_limit\": 10, \"message_limit\": 100}, \"task_type\": \"discover\", \"skip_phases\": null}',NULL,'{\"message\": \"task completed successfully\"}','','2026-04-09 13:36:20.604','2026-04-09 13:36:20.626','2026-04-09 13:36:19.601');
+/*!40000 ALTER TABLE `tasks` ENABLE KEYS */;
+UNLOCK TABLES;
+/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
+
+/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
+/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
+/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
+/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
+
+-- Dump completed on 2026-04-10 11:44:41

+ 36 - 45
cmd/server/main.go

@@ -1,7 +1,6 @@
 package main
 
 import (
-	"context"
 	"fmt"
 	"log"
 	"time"
@@ -10,10 +9,15 @@ import (
 	"spider/internal/handler"
 	"spider/internal/llm"
 	"spider/internal/model"
+	"spider/internal/plugin"
+	"spider/internal/plugins/githubcollector"
+	"spider/internal/plugins/tgcollector"
+	"spider/internal/plugins/webcollector"
+	"spider/internal/processor"
 	"spider/internal/search"
-	"spider/internal/service"
+	"spider/internal/store"
+	"spider/internal/task"
 	"spider/internal/telegram"
-	"spider/internal/worker"
 
 	"github.com/redis/go-redis/v9"
 	"gorm.io/driver/mysql"
@@ -21,13 +25,13 @@ import (
 )
 
 func main() {
-	// 1. 加载配置
+	// 1. Load config
 	cfg, err := config.Load("configs/config.yaml")
 	if err != nil {
 		log.Fatalf("load config: %v", err)
 	}
 
-	// 2. 连接 MySQL
+	// 2. Connect MySQL
 	dsn := fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?charset=utf8mb4&parseTime=True&loc=Local",
 		cfg.MySQL.User, cfg.MySQL.Password, cfg.MySQL.Host, cfg.MySQL.Port, cfg.MySQL.Database)
 	db, err := gorm.Open(mysql.Open(dsn), &gorm.Config{})
@@ -35,27 +39,20 @@ func main() {
 		log.Fatalf("connect mysql: %v", err)
 	}
 
-	// 3. AutoMigrate 所有表
+	// 3. AutoMigrate (5 tables)
 	err = db.AutoMigrate(
-		&model.ManagedSeed{},
-		&model.ManagedKeyword{},
-		&model.ManagedSetting{},
+		&model.Keyword{},
 		&model.Channel{},
-		&model.NavSite{},
 		&model.MerchantRaw{},
 		&model.MerchantClean{},
-		&model.Task{},
-		&model.ConfigRevision{},
+		&model.TaskLog{},
 	)
 	if err != nil {
 		log.Fatalf("automigrate: %v", err)
 	}
 	log.Println("MySQL tables migrated")
 
-	// 3a. 初始化 managed_settings 默认值(幂等,已有记录不覆盖)
-	seedSettings(db)
-
-	// 4. 连接 Redis
+	// 4. Connect Redis
 	rdb := redis.NewClient(&redis.Options{
 		Addr:     fmt.Sprintf("%s:%d", cfg.Redis.Host, cfg.Redis.Port),
 		Password: cfg.Redis.Password,
@@ -63,16 +60,20 @@ func main() {
 	})
 	log.Println("Redis connected")
 
-	// 5. 初始化 TaskService
-	taskSvc := service.NewTaskService(db, rdb)
+	// 5. Initialize store
+	s := store.New(db)
 
-	// 5a. 初始化 SettingsService 并加载到 Redis 缓存
-	settings := service.NewSettingsService(db, rdb)
-	if err := settings.Load(context.Background()); err != nil {
-		log.Printf("load settings into cache: %v", err)
+	// 6. Initialize external clients
+	var llmClient *llm.Client
+	if cfg.LLM.APIKey != "" {
+		llmClient = llm.New(cfg.LLM.BaseURL, cfg.LLM.APIKey, cfg.LLM.Model, 30*time.Second)
+	}
+
+	var serperClient *search.SerperClient
+	if cfg.Serper.APIKey != "" {
+		serperClient = search.NewSerperClient(cfg.Serper.APIKey, cfg.Serper.ResultsPerPage, cfg.Serper.MaxPages)
 	}
 
-	// 5b. 初始化 AccountManager(账号从配置读取,为空时运行时从 DB 动态加载)
 	tgAccounts := make([]telegram.Account, 0, len(cfg.Telegram.Accounts))
 	for _, a := range cfg.Telegram.Accounts {
 		tgAccounts = append(tgAccounts, telegram.Account{
@@ -84,30 +85,20 @@ func main() {
 	}
 	tgManager := telegram.NewAccountManager(tgAccounts, rdb)
 
-	// 5c. 初始化 LLM Client(配置缺失时为 nil,phase 会安全跳过)
-	var llmClient *llm.Client
-	if cfg.LLM.APIKey != "" {
-		llmClient = llm.New(cfg.LLM.BaseURL, cfg.LLM.APIKey, cfg.LLM.Model, 30*time.Second)
-	}
+	// 7. Register plugins
+	registry := plugin.NewRegistry()
+	registry.Register(webcollector.New(serperClient))
+	registry.Register(tgcollector.New(tgManager, llmClient, s))
+	registry.Register(githubcollector.New(cfg.GitHub.Token, s))
 
-	// 5d. 初始化 Serper Client(配置缺失时为 nil)
-	var serperClient *search.SerperClient
-	if cfg.Serper.APIKey != "" {
-		serperClient = search.NewSerperClient(cfg.Serper.APIKey, cfg.Serper.ResultsPerPage, cfg.Serper.MaxPages)
-	}
+	// 8. Initialize processor
+	proc := processor.NewProcessor(s)
+
+	// 9. Initialize task manager
+	taskMgr := task.NewManager(db, rdb, registry, s, proc)
 
-	// 6. 初始化并启动 asynq Worker
-	redisAddr := fmt.Sprintf("%s:%d", cfg.Redis.Host, cfg.Redis.Port)
-	w := worker.New(redisAddr, cfg.Redis.Password, cfg.Redis.DB, db, rdb, tgManager, llmClient, settings, serperClient, cfg.GitHub.Token)
-	go func() {
-		log.Println("asynq worker starting...")
-		if err := w.Start(); err != nil {
-			log.Fatalf("asynq worker error: %v", err)
-		}
-	}()
-
-	// 7. 初始化 Gin router
-	r := handler.SetupRouter(db, rdb, taskSvc)
+	// 10. Start HTTP server
+	r := handler.SetupRouter(s, taskMgr)
 
 	addr := handler.ServerAddr(cfg.Server.Port)
 	log.Printf("Server starting on %s", addr)

+ 0 - 30
cmd/server/seed.go

@@ -1,30 +0,0 @@
-package main
-
-import (
-	"spider/internal/model"
-
-	"gorm.io/gorm"
-)
-
-// seedSettings 在 managed_settings 表为空时插入默认配置值。
-// 使用 FirstOrCreate 保证幂等性,已有记录不会被覆盖。
-func seedSettings(db *gorm.DB) {
-	defaults := []model.ManagedSetting{
-		{KeyName: "pipeline.skip_phases", Value: "[]", ValueType: "json", EffectLevel: "new_task", Description: "默认跳过的阶段"},
-		{KeyName: "pipeline.checkpoint_interval", Value: "30", ValueType: "int", EffectLevel: "runtime", Description: "进度上报间隔(秒)"},
-		{KeyName: "tg_scraper.message_limit_per_channel", Value: "500", ValueType: "int", EffectLevel: "runtime", Description: "每频道最大消息数"},
-		{KeyName: "tg_scraper.delay_per_message", Value: "1.0", ValueType: "float", EffectLevel: "runtime", Description: "消息间延迟(秒)"},
-		{KeyName: "tg_scraper.delay_per_channel", Value: "5.0", ValueType: "float", EffectLevel: "runtime", Description: "频道间延迟(秒)"},
-		{KeyName: "tg_scraper.delay_per_verify", Value: "3.0", ValueType: "float", EffectLevel: "runtime", Description: "验证间延迟(秒)"},
-		{KeyName: "clean.timeout_seconds", Value: "3600", ValueType: "int", EffectLevel: "runtime", Description: "清洗阶段超时(秒)"},
-		{KeyName: "search.timeout_seconds", Value: "3600", ValueType: "int", EffectLevel: "runtime", Description: "搜索阶段超时(秒)"},
-		{KeyName: "snowball.max_channels_per_layer", Value: "200", ValueType: "int", EffectLevel: "runtime", Description: "每层最大频道数"},
-		{KeyName: "snowball.max_channels_total", Value: "500", ValueType: "int", EffectLevel: "runtime", Description: "总最大频道数"},
-		{KeyName: "tme_validator.enabled", Value: "true", ValueType: "bool", EffectLevel: "runtime", Description: "启用t.me死号预检"},
-		{KeyName: "tme_validator.rate_per_min", Value: "60", ValueType: "int", EffectLevel: "runtime", Description: "预检限速(次/分)"},
-		{KeyName: "tme_validator.concurrency", Value: "10", ValueType: "int", EffectLevel: "runtime", Description: "预检并发数"},
-	}
-	for _, s := range defaults {
-		db.Where(model.ManagedSetting{KeyName: s.KeyName}).FirstOrCreate(&s)
-	}
-}

+ 1 - 1
deploy/Dockerfile.api

@@ -1,4 +1,4 @@
-FROM golang:1.23-alpine AS builder
+FROM golang:1.26-alpine AS builder
 WORKDIR /app
 ENV GOTOOLCHAIN=auto
 COPY go.mod go.sum ./

+ 150 - 0
deploy/migrate_v2.sql

@@ -0,0 +1,150 @@
+-- Migration: v1 (pipeline) -> v2 (plugin architecture)
+-- Compatible with MySQL 5.7+
+--
+-- BEFORE running:
+--   1. Stop the old application
+--   2. Backup: mysqldump -u root -p spider > spider_backup_$(date +%Y%m%d).sql
+
+-- 1. Create keywords table (merge managed_seeds + managed_keywords)
+CREATE TABLE IF NOT EXISTS keywords (
+    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
+    keyword VARCHAR(255) NOT NULL,
+    industry_tag VARCHAR(100) DEFAULT '',
+    enabled TINYINT(1) DEFAULT 1,
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE INDEX idx_keyword (keyword)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+INSERT IGNORE INTO keywords (keyword, industry_tag, enabled, created_at)
+SELECT keyword, COALESCE(category,''), IF(status='active', 1, 0), created_at
+FROM managed_keywords;
+
+INSERT IGNORE INTO keywords (keyword, industry_tag, enabled, created_at)
+SELECT channel_name, 'seed', IF(status='active', 1, 0), created_at
+FROM managed_seeds;
+
+-- 2. Recreate merchants_raw with new schema
+CREATE TABLE merchants_raw_v2 (
+    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
+    tg_username VARCHAR(255) NOT NULL,
+    tg_link VARCHAR(500) DEFAULT '',
+    merchant_name VARCHAR(500) DEFAULT '',
+    website VARCHAR(2048) DEFAULT '',
+    email VARCHAR(255) DEFAULT '',
+    phone VARCHAR(100) DEFAULT '',
+    source_type VARCHAR(50) NOT NULL DEFAULT 'web',
+    source_name VARCHAR(500) DEFAULT '',
+    source_url VARCHAR(2048) DEFAULT '',
+    original_text TEXT,
+    industry_tag VARCHAR(100) DEFAULT '',
+    status VARCHAR(20) DEFAULT 'raw',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    INDEX idx_tg_username (tg_username),
+    INDEX idx_status (status)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- NOTE: the pre-v2 backup dump shows GORM-pluralized table names
+-- (`merchant_raws`, `merchant_cleans`), not `merchants_raw`/`merchants_clean`.
+INSERT INTO merchants_raw_v2 (id, tg_username, tg_link, merchant_name, website, email, phone,
+    source_type, source_name, source_url, original_text, industry_tag, status, created_at)
+SELECT id, COALESCE(tg_username,''), CONCAT('https://t.me/', COALESCE(tg_username,'')),
+    COALESCE(merchant_name,''), COALESCE(website,''), COALESCE(email,''), COALESCE(phone,''),
+    COALESCE(source_type,'web'), '', COALESCE(source_id, ''),
+    COALESCE(original_message, ''), COALESCE(industry, ''),
+    'raw',  -- old statuses 'raw' and 'glm_parsed' both map to 'raw'
+    created_at
+FROM merchant_raws;
+
+RENAME TABLE merchant_raws TO merchants_raw_old, merchants_raw_v2 TO merchants_raw;
+
+-- 3. Recreate merchants_clean with new schema
+CREATE TABLE merchants_clean_v2 (
+    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
+    tg_username VARCHAR(255) DEFAULT '',
+    tg_link VARCHAR(500) DEFAULT '',
+    merchant_name VARCHAR(500) DEFAULT '',
+    website VARCHAR(2048) DEFAULT '',
+    email VARCHAR(255) DEFAULT '',
+    phone VARCHAR(100) DEFAULT '',
+    source_count INT DEFAULT 1,
+    all_sources JSON,
+    industry_tag VARCHAR(100) DEFAULT '',
+    level VARCHAR(10) DEFAULT 'Cold',
+    status VARCHAR(20) NOT NULL DEFAULT 'valid',
+    is_alive TINYINT(1) DEFAULT 0,
+    last_checked_at DATETIME,
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    UNIQUE INDEX idx_tg_username (tg_username),
+    INDEX idx_status (status),
+    INDEX idx_level (level),
+    INDEX idx_industry_tag (industry_tag)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+INSERT INTO merchants_clean_v2 (id, tg_username, tg_link, merchant_name, website, email, phone,
+    source_count, all_sources, industry_tag, level, status, is_alive, created_at, updated_at)
+SELECT id, COALESCE(tg_username,''), CONCAT('https://t.me/', COALESCE(tg_username,'')),
+    COALESCE(merchant_name,''), COALESCE(website,''), COALESCE(email,''), COALESCE(phone,''),
+    COALESCE(source_count,1), COALESCE(source_links, CAST('[]' AS JSON)),
+    COALESCE(industry, ''),
+    CASE
+        WHEN quality_score >= 60 THEN 'Hot'
+        WHEN quality_score >= 30 THEN 'Warm'
+        ELSE 'Cold'
+    END,
+    status,
+    CASE WHEN status = 'valid' THEN 1 ELSE 0 END,
+    created_at, updated_at
+FROM merchant_cleans;
+
+RENAME TABLE merchant_cleans TO merchants_clean_old, merchants_clean_v2 TO merchants_clean;
+
+-- 4. Recreate channels with new schema (simpler than ALTER for enum->varchar)
+CREATE TABLE channels_v2 (
+    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
+    username VARCHAR(255) NOT NULL,
+    channel_id BIGINT DEFAULT 0,
+    access_hash BIGINT DEFAULT 0,
+    status VARCHAR(20) DEFAULT 'pending',
+    last_message_id INT DEFAULT 0,
+    merchants_found INT DEFAULT 0,
+    source VARCHAR(50) NOT NULL DEFAULT 'search',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+    UNIQUE INDEX idx_username (username),
+    INDEX idx_status (status),
+    INDEX idx_source (source)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+INSERT INTO channels_v2 (id, username, status, last_message_id, source, created_at, updated_at)
+SELECT id, username, status, last_message_id, source, created_at, updated_at
+FROM channels;
+
+RENAME TABLE channels TO channels_old, channels_v2 TO channels;
+
+-- 5. Create task_logs table
+CREATE TABLE IF NOT EXISTS task_logs (
+    id BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
+    task_type VARCHAR(50) NOT NULL,
+    plugin_name VARCHAR(100) DEFAULT '',
+    status VARCHAR(20) DEFAULT 'pending',
+    items_processed INT DEFAULT 0,
+    merchants_added INT DEFAULT 0,
+    errors_count INT DEFAULT 0,
+    started_at DATETIME,
+    finished_at DATETIME,
+    detail TEXT,
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    INDEX idx_task_type (task_type),
+    INDEX idx_status (status)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- Done! Old tables preserved as *_old for safety.
+-- After verifying the new system works, run:
+--   DROP TABLE IF EXISTS merchants_raw_old;
+--   DROP TABLE IF EXISTS merchants_clean_old;
+--   DROP TABLE IF EXISTS channels_old;
+--   DROP TABLE IF EXISTS managed_seeds;
+--   DROP TABLE IF EXISTS managed_keywords;
+--   DROP TABLE IF EXISTS managed_settings;
+--   DROP TABLE IF EXISTS config_revisions;
+--   DROP TABLE IF EXISTS nav_sites;
+--   DROP TABLE IF EXISTS tasks;

+ 58 - 0
deploy/upgrade_v2.sh

@@ -0,0 +1,58 @@
+#!/bin/bash
+set -e
+
+# TG 商户采集系统 v2 升级脚本
+# 用法: 在服务器上的项目目录运行 bash deploy/upgrade_v2.sh
+
+DEPLOY_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(dirname "$DEPLOY_DIR")"
+MYSQL_CONTAINER="im_mysql"
+MYSQL_USER="root"
+MYSQL_PASS="root123"
+MYSQL_DB="spider"
+BACKUP_DIR="$PROJECT_DIR/backup"
+
+echo "=== TG 商户采集系统 v2 升级 ==="
+echo "项目目录: $PROJECT_DIR"
+echo ""
+
+# 1. 备份数据库
+echo "[1/5] 备份数据库..."
+mkdir -p "$BACKUP_DIR"
+BACKUP_FILE="$BACKUP_DIR/spider_pre_v2_$(date +%Y%m%d_%H%M%S).sql"
+docker exec "$MYSQL_CONTAINER" mysqldump -u"$MYSQL_USER" -p"$MYSQL_PASS" "$MYSQL_DB" > "$BACKUP_FILE" 2>/dev/null
+echo "  备份完成: $BACKUP_FILE"
+
+# 2. 停止旧服务
+echo "[2/5] 停止旧服务..."
+cd "$DEPLOY_DIR"
+docker compose down 2>/dev/null || docker-compose down 2>/dev/null || true
+echo "  旧服务已停止"
+
+# 3. 执行数据库迁移
+echo "[3/5] 执行数据库迁移..."
+docker exec -i "$MYSQL_CONTAINER" mysql -u"$MYSQL_USER" -p"$MYSQL_PASS" "$MYSQL_DB" < "$DEPLOY_DIR/migrate_v2.sql" 2>/dev/null
+echo "  迁移完成"
+
+# 4. 构建新镜像
+echo "[4/5] 构建 Docker 镜像..."
+cd "$DEPLOY_DIR"
+docker compose build --no-cache 2>/dev/null || docker-compose build --no-cache 2>/dev/null
+echo "  镜像构建完成"
+
+# 5. 启动新服务
+echo "[5/5] 启动新服务..."
+docker compose up -d 2>/dev/null || docker-compose up -d 2>/dev/null
+echo "  服务已启动"
+
+echo ""
+echo "=== 升级完成 ==="
+echo "前端: http://localhost:8300"
+echo "API:  http://localhost:8200/ping"
+echo ""
+echo "检查日志: docker compose logs -f"
+echo ""
+echo "确认无问题后,可删除旧表:"
+echo "  docker exec -i $MYSQL_CONTAINER mysql -u$MYSQL_USER -p$MYSQL_PASS $MYSQL_DB <<< \\"
+echo "    'DROP TABLE IF EXISTS merchants_raw_old, merchants_clean_old, channels_old,"
+echo "     managed_seeds, managed_keywords, managed_settings, config_revisions, nav_sites, tasks;'"

+ 1 - 4
go.mod

@@ -8,11 +8,9 @@ require (
 	github.com/gocolly/colly/v2 v2.3.0
 	github.com/gorilla/websocket v1.5.3
 	github.com/gotd/td v0.143.0
-	github.com/hibiken/asynq v0.26.0
 	github.com/redis/go-redis/v9 v9.14.1
 	github.com/sashabaranov/go-openai v1.41.2
 	github.com/spf13/viper v1.17.0
-	golang.org/x/net v0.52.0
 	gorm.io/datatypes v1.2.7
 	gorm.io/driver/mysql v1.6.0
 	gorm.io/gorm v1.31.1
@@ -81,7 +79,6 @@ require (
 	github.com/pelletier/go-toml/v2 v2.2.4 // indirect
 	github.com/quic-go/qpack v0.6.0 // indirect
 	github.com/quic-go/quic-go v0.59.0 // indirect
-	github.com/robfig/cron/v3 v3.0.1 // indirect
 	github.com/sagikazarmark/locafero v0.3.0 // indirect
 	github.com/sagikazarmark/slog-shim v0.1.0 // indirect
 	github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
@@ -106,10 +103,10 @@ require (
 	golang.org/x/crypto v0.49.0 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
 	golang.org/x/mod v0.34.0 // indirect
+	golang.org/x/net v0.52.0 // indirect
 	golang.org/x/sync v0.20.0 // indirect
 	golang.org/x/sys v0.42.0 // indirect
 	golang.org/x/text v0.35.0 // indirect
-	golang.org/x/time v0.14.0 // indirect
 	golang.org/x/tools v0.43.0 // indirect
 	google.golang.org/appengine v1.6.8 // indirect
 	google.golang.org/protobuf v1.36.10 // indirect

+ 0 - 6
go.sum

@@ -239,8 +239,6 @@ github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ
 github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
 github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
 github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
-github.com/hibiken/asynq v0.26.0 h1:1Zxr92MlDnb1Zt/QR5g2vSCqUS03i95lUfqx5X7/wrw=
-github.com/hibiken/asynq v0.26.0/go.mod h1:Qk4e57bTnWDoyJ67VkchuV6VzSM9IQW2nPvAGuDyw58=
 github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
 github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
@@ -315,8 +313,6 @@ github.com/quic-go/quic-go v0.59.0 h1:OLJkp1Mlm/aS7dpKgTc6cnpynnD2Xg7C1pwL6vy/SA
 github.com/quic-go/quic-go v0.59.0/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU=
 github.com/redis/go-redis/v9 v9.14.1 h1:nDCrEiJmfOWhD76xlaw+HXT0c9hfNWeXgl0vIRYSDvQ=
 github.com/redis/go-redis/v9 v9.14.1/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw=
-github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
-github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
 github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
@@ -601,8 +597,6 @@ golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
-golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
-golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=

+ 0 - 82
internal/handler/channel.go

@@ -1,82 +0,0 @@
-package handler
-
-import (
-	"spider/internal/model"
-
-	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
-)
-
-// ChannelHandler handles channel queries.
-type ChannelHandler struct {
-	db *gorm.DB
-}
-
-// List returns channels with optional filters and pagination.
-// GET /channels?status=&source=&page=&page_size=
-func (h *ChannelHandler) List(c *gin.Context) {
-	page, pageSize, offset := parsePage(c)
-
-	query := h.db.Model(&model.Channel{})
-	if status := c.Query("status"); status != "" {
-		query = query.Where("status = ?", status)
-	}
-	if source := c.Query("source"); source != "" {
-		query = query.Where("source = ?", source)
-	}
-
-	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	var items []model.Channel
-	if err := query.Order("id DESC").Limit(pageSize).Offset(offset).Find(&items).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	PageOK(c, items, total, page, pageSize)
-}
-
-// Stats returns channel counts grouped by status and source.
-// GET /channels/stats
-func (h *ChannelHandler) Stats(c *gin.Context) {
-	var statusRows []struct {
-		Status string `json:"status"`
-		Cnt    int64  `json:"count"`
-	}
-	h.db.Model(&model.Channel{}).
-		Select("status, count(*) as cnt").
-		Group("status").
-		Scan(&statusRows)
-
-	byStatus := map[string]int64{}
-	for _, r := range statusRows {
-		byStatus[r.Status] = r.Cnt
-	}
-
-	var sourceRows []struct {
-		Source string `json:"source"`
-		Cnt    int64  `json:"count"`
-	}
-	h.db.Model(&model.Channel{}).
-		Select("source, count(*) as cnt").
-		Group("source").
-		Scan(&sourceRows)
-
-	bySource := map[string]int64{}
-	for _, r := range sourceRows {
-		bySource[r.Source] = r.Cnt
-	}
-
-	var total int64
-	h.db.Model(&model.Channel{}).Count(&total)
-
-	OK(c, gin.H{
-		"total":     total,
-		"by_status": byStatus,
-		"by_source": bySource,
-	})
-}

+ 0 - 77
internal/handler/config.go

@@ -1,77 +0,0 @@
-package handler
-
-import (
-	"net/http"
-
-	"spider/internal/model"
-
-	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
-)
-
-// ConfigHandler handles managed settings.
-type ConfigHandler struct {
-	db *gorm.DB
-}
-
-// ListSettings returns all settings.
-// GET /config/settings
-func (h *ConfigHandler) ListSettings(c *gin.Context) {
-	var settings []model.ManagedSetting
-	if err := h.db.Order("key_name ASC").Find(&settings).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-	OK(c, settings)
-}
-
-// UpdateSetting updates a setting by key and records an audit revision.
-// PUT /config/settings/:key
-func (h *ConfigHandler) UpdateSetting(c *gin.Context) {
-	key := c.Param("key")
-	if key == "" {
-		Fail(c, http.StatusBadRequest, "key is required")
-		return
-	}
-
-	var body struct {
-		Value       string `json:"value" binding:"required"`
-		Description string `json:"description"`
-	}
-	if err := c.ShouldBindJSON(&body); err != nil {
-		Fail(c, http.StatusBadRequest, err.Error())
-		return
-	}
-
-	var setting model.ManagedSetting
-	if err := h.db.Where("key_name = ?", key).First(&setting).Error; err != nil {
-		Fail(c, 404, "setting not found")
-		return
-	}
-
-	oldValue := setting.Value
-
-	updates := map[string]interface{}{
-		"value": body.Value,
-	}
-	if body.Description != "" {
-		updates["description"] = body.Description
-	}
-
-	if err := h.db.Model(&setting).Updates(updates).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	// Record audit revision.
-	revision := model.ConfigRevision{
-		SettingKey: key,
-		OldValue:   oldValue,
-		NewValue:   body.Value,
-		ChangedBy:  "admin",
-	}
-	h.db.Create(&revision)
-
-	h.db.Where("key_name = ?", key).First(&setting)
-	OK(c, setting)
-}

+ 0 - 61
internal/handler/dashboard.go

@@ -1,61 +0,0 @@
-package handler
-
-import (
-	"spider/internal/model"
-
-	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
-)
-
-// DashboardHandler handles the dashboard summary endpoint.
-type DashboardHandler struct {
-	db *gorm.DB
-}
-
-// Get returns aggregated dashboard statistics.
-// GET /dashboard
-func (h *DashboardHandler) Get(c *gin.Context) {
-	var channelsTotal int64
-	h.db.Model(&model.Channel{}).Count(&channelsTotal)
-
-	var merchantsRawTotal int64
-	h.db.Model(&model.MerchantRaw{}).Count(&merchantsRawTotal)
-
-	var merchantsCleanTotal int64
-	h.db.Model(&model.MerchantClean{}).Count(&merchantsCleanTotal)
-
-	var merchantsValid int64
-	h.db.Model(&model.MerchantClean{}).Where("status = ?", "valid").Count(&merchantsValid)
-
-	var navSitesTotal int64
-	h.db.Model(&model.NavSite{}).Count(&navSitesTotal)
-
-	var seedsTotal int64
-	h.db.Model(&model.ManagedSeed{}).Count(&seedsTotal)
-
-	var keywordsTotal int64
-	h.db.Model(&model.ManagedKeyword{}).Count(&keywordsTotal)
-
-	// Recent 5 tasks.
-	var recentTasks []model.Task
-	h.db.Order("created_at DESC").Limit(5).Find(&recentTasks)
-
-	// Currently running task (first one).
-	var runningTask *model.Task
-	var rt model.Task
-	if err := h.db.Where("status = ?", "running").First(&rt).Error; err == nil {
-		runningTask = &rt
-	}
-
-	OK(c, gin.H{
-		"channels_total":         channelsTotal,
-		"merchants_raw_total":    merchantsRawTotal,
-		"merchants_clean_total":  merchantsCleanTotal,
-		"merchants_valid":        merchantsValid,
-		"nav_sites_total":        navSitesTotal,
-		"seeds_total":            seedsTotal,
-		"keywords_total":         keywordsTotal,
-		"recent_tasks":           recentTasks,
-		"running_task":           runningTask,
-	})
-}

+ 32 - 38
internal/handler/keyword.go

@@ -5,36 +5,31 @@ import (
 	"strconv"
 
 	"spider/internal/model"
+	"spider/internal/store"
 
 	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
 )
 
-// KeywordHandler handles managed keyword CRUD.
+// KeywordHandler handles unified keyword + seed CRUD.
 type KeywordHandler struct {
-	db *gorm.DB
+	store *store.Store
 }
 
 // List returns keywords with optional filters and pagination.
-// GET /keywords?page=1&page_size=20&category=&status=
+// GET /keywords?page=1&page_size=20&industry_tag=
 func (h *KeywordHandler) List(c *gin.Context) {
 	page, pageSize, offset := parsePage(c)
+	industryTag := c.Query("industry_tag")
 
-	query := h.db.Model(&model.ManagedKeyword{})
-	if category := c.Query("category"); category != "" {
-		query = query.Where("category = ?", category)
-	}
-	if status := c.Query("status"); status != "" {
-		query = query.Where("status = ?", status)
+	query := h.store.DB.Model(&model.Keyword{})
+	if industryTag != "" {
+		query = query.Where("industry_tag = ?", industryTag)
 	}
 
 	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
+	query.Count(&total)
 
-	var keywords []model.ManagedKeyword
+	var keywords []model.Keyword
 	if err := query.Order("id DESC").Limit(pageSize).Offset(offset).Find(&keywords).Error; err != nil {
 		Fail(c, 500, err.Error())
 		return
@@ -44,29 +39,28 @@ func (h *KeywordHandler) List(c *gin.Context) {
 }
 
 // Create creates one or more keywords in batch.
-// POST /keywords  body: {keywords:["k1","k2"], category:"机场"}
+// POST /keywords  body: {keywords:["k1","k2"], industry_tag:"机场"}
 func (h *KeywordHandler) Create(c *gin.Context) {
 	var body struct {
-		Keywords []string `json:"keywords" binding:"required,min=1"`
-		Category string   `json:"category"`
+		Keywords    []string `json:"keywords" binding:"required,min=1"`
+		IndustryTag string   `json:"industry_tag"`
 	}
 	if err := c.ShouldBindJSON(&body); err != nil {
 		Fail(c, http.StatusBadRequest, err.Error())
 		return
 	}
 
-	var created []model.ManagedKeyword
+	var created []model.Keyword
 	for _, kw := range body.Keywords {
 		if kw == "" {
 			continue
 		}
-		k := model.ManagedKeyword{
-			Keyword:  kw,
-			Category: body.Category,
-			Status:   "active",
+		k := model.Keyword{
+			Keyword:     kw,
+			IndustryTag: body.IndustryTag,
+			Enabled:     true,
 		}
-		// Use FirstOrCreate to avoid duplicate errors.
-		if err := h.db.Where(model.ManagedKeyword{Keyword: kw}).FirstOrCreate(&k).Error; err != nil {
+		if err := h.store.DB.Where(model.Keyword{Keyword: kw}).FirstOrCreate(&k).Error; err != nil {
 			Fail(c, 500, err.Error())
 			return
 		}
@@ -86,38 +80,38 @@ func (h *KeywordHandler) Update(c *gin.Context) {
 	}
 
 	var body struct {
-		Keyword  string `json:"keyword"`
-		Category string `json:"category"`
-		Status   string `json:"status"`
+		Keyword     string `json:"keyword"`
+		IndustryTag string `json:"industry_tag"`
+		Enabled     *bool  `json:"enabled"`
 	}
 	if err := c.ShouldBindJSON(&body); err != nil {
 		Fail(c, http.StatusBadRequest, err.Error())
 		return
 	}
 
-	var kw model.ManagedKeyword
-	if err := h.db.First(&kw, id).Error; err != nil {
+	var kw model.Keyword
+	if err := h.store.DB.First(&kw, id).Error; err != nil {
 		Fail(c, 404, "keyword not found")
 		return
 	}
 
-	updates := map[string]interface{}{}
+	updates := map[string]any{}
 	if body.Keyword != "" {
 		updates["keyword"] = body.Keyword
 	}
-	if body.Category != "" {
-		updates["category"] = body.Category
+	if body.IndustryTag != "" {
+		updates["industry_tag"] = body.IndustryTag
 	}
-	if body.Status != "" {
-		updates["status"] = body.Status
+	if body.Enabled != nil {
+		updates["enabled"] = *body.Enabled
 	}
 
-	if err := h.db.Model(&kw).Updates(updates).Error; err != nil {
+	if err := h.store.DB.Model(&kw).Updates(updates).Error; err != nil {
 		Fail(c, 500, err.Error())
 		return
 	}
 
-	h.db.First(&kw, id)
+	h.store.DB.First(&kw, id)
 	OK(c, kw)
 }
 
@@ -130,7 +124,7 @@ func (h *KeywordHandler) Delete(c *gin.Context) {
 		return
 	}
 
-	if err := h.db.Delete(&model.ManagedKeyword{}, id).Error; err != nil {
+	if err := h.store.DB.Delete(&model.Keyword{}, id).Error; err != nil {
 		Fail(c, 500, err.Error())
 		return
 	}

+ 89 - 65
internal/handler/merchant.go

@@ -1,54 +1,64 @@
 package handler
 
 import (
+	"encoding/csv"
+	"fmt"
 	"net/http"
 	"strconv"
 
 	"spider/internal/model"
+	"spider/internal/store"
 
 	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
 )
 
 // MerchantHandler handles merchant queries.
 type MerchantHandler struct {
-	db *gorm.DB
+	store *store.Store
 }
 
 // Stats returns aggregate statistics for merchants.
-// GET /merchants/stats
 func (h *MerchantHandler) Stats(c *gin.Context) {
-	type countRow struct {
-		Key   string `json:"key"`
-		Count int64  `json:"count"`
-	}
-
 	var rawTotal int64
-	h.db.Model(&model.MerchantRaw{}).Count(&rawTotal)
+	h.store.DB.Model(&model.MerchantRaw{}).Count(&rawTotal)
 
 	var cleanTotal int64
-	h.db.Model(&model.MerchantClean{}).Count(&cleanTotal)
+	h.store.DB.Model(&model.MerchantClean{}).Count(&cleanTotal)
 
-	// Count by status in clean table.
-	statusCounts := map[string]int64{}
+	// Count by status
 	var statusRows []struct {
 		Status string
 		Cnt    int64
 	}
-	h.db.Model(&model.MerchantClean{}).
+	h.store.DB.Model(&model.MerchantClean{}).
 		Select("status, count(*) as cnt").
 		Group("status").
 		Scan(&statusRows)
+	byStatus := map[string]int64{}
 	for _, r := range statusRows {
-		statusCounts[r.Status] = r.Cnt
+		byStatus[r.Status] = r.Cnt
 	}
 
-	// Count by source_type in raw table.
+	// Count by level
+	var levelRows []struct {
+		Level string
+		Cnt   int64
+	}
+	h.store.DB.Model(&model.MerchantClean{}).
+		Select("level, count(*) as cnt").
+		Group("level").
+		Scan(&levelRows)
+	byLevel := map[string]int64{}
+	for _, r := range levelRows {
+		byLevel[r.Level] = r.Cnt
+	}
+
+	// Count by source_type
 	var sourceRows []struct {
 		SourceType string
 		Cnt        int64
 	}
-	h.db.Model(&model.MerchantRaw{}).
+	h.store.DB.Model(&model.MerchantRaw{}).
 		Select("source_type, count(*) as cnt").
 		Group("source_type").
 		Scan(&sourceRows)
@@ -57,51 +67,33 @@ func (h *MerchantHandler) Stats(c *gin.Context) {
 		bySource[r.SourceType] = r.Cnt
 	}
 
-	// Count by industry in clean table.
-	var industryRows []struct {
-		Industry string
-		Cnt      int64
-	}
-	h.db.Model(&model.MerchantClean{}).
-		Select("industry, count(*) as cnt").
-		Group("industry").
-		Scan(&industryRows)
-	byIndustry := map[string]int64{}
-	for _, r := range industryRows {
-		byIndustry[r.Industry] = r.Cnt
-	}
-
 	OK(c, gin.H{
 		"raw_total":   rawTotal,
 		"clean_total": cleanTotal,
-		"valid":       statusCounts["valid"],
-		"invalid":     statusCounts["invalid"],
-		"bot":         statusCounts["bot"],
-		"duplicate":   statusCounts["duplicate"],
-		"group":       statusCounts["group"],
+		"by_status":   byStatus,
+		"by_level":    byLevel,
 		"by_source":   bySource,
-		"by_industry": byIndustry,
 	})
 }
 
 // ListRaw returns raw merchants with filters and pagination.
-// GET /merchants/raw?status=&source_type=&page=&page_size=
 func (h *MerchantHandler) ListRaw(c *gin.Context) {
 	page, pageSize, offset := parsePage(c)
 
-	query := h.db.Model(&model.MerchantRaw{})
+	query := h.store.DB.Model(&model.MerchantRaw{})
 	if status := c.Query("status"); status != "" {
 		query = query.Where("status = ?", status)
 	}
 	if sourceType := c.Query("source_type"); sourceType != "" {
 		query = query.Where("source_type = ?", sourceType)
 	}
+	if search := c.Query("search"); search != "" {
+		like := "%" + search + "%"
+		query = query.Where("tg_username LIKE ? OR merchant_name LIKE ?", like, like)
+	}
 
 	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
+	query.Count(&total)
 
 	var items []model.MerchantRaw
 	if err := query.Order("created_at DESC").Limit(pageSize).Offset(offset).Find(&items).Error; err != nil {
@@ -113,33 +105,33 @@ func (h *MerchantHandler) ListRaw(c *gin.Context) {
 }
 
 // ListClean returns clean merchants with filters and pagination.
-// GET /merchants/clean?status=&industry=&min_score=&sort=quality_score&order=desc&page=&page_size=
 func (h *MerchantHandler) ListClean(c *gin.Context) {
 	page, pageSize, offset := parsePage(c)
 
-	query := h.db.Model(&model.MerchantClean{})
+	query := h.store.DB.Model(&model.MerchantClean{})
 	if status := c.Query("status"); status != "" {
 		query = query.Where("status = ?", status)
 	}
-	if industry := c.Query("industry"); industry != "" {
-		query = query.Where("industry = ?", industry)
+	if level := c.Query("level"); level != "" {
+		query = query.Where("level = ?", level)
+	}
+	if industry := c.Query("industry_tag"); industry != "" {
+		query = query.Where("industry_tag = ?", industry)
 	}
-	if minScore := c.Query("min_score"); minScore != "" {
-		if score, err := strconv.ParseFloat(minScore, 64); err == nil {
-			query = query.Where("quality_score >= ?", score)
-		}
+	if search := c.Query("search"); search != "" {
+		like := "%" + search + "%"
+		query = query.Where("tg_username LIKE ? OR merchant_name LIKE ?", like, like)
 	}
 
-	sortField := c.DefaultQuery("sort", "quality_score")
-	// whitelist sort fields to prevent SQL injection
+	sortField := c.DefaultQuery("sort", "created_at")
 	allowedSort := map[string]bool{
-		"quality_score": true,
-		"created_at":    true,
-		"updated_at":    true,
-		"member_count":  true,
+		"created_at":   true,
+		"updated_at":   true,
+		"source_count": true,
+		"level":        true,
 	}
 	if !allowedSort[sortField] {
-		sortField = "quality_score"
+		sortField = "created_at"
 	}
 	order := c.DefaultQuery("order", "desc")
 	if order != "asc" && order != "desc" {
@@ -147,10 +139,7 @@ func (h *MerchantHandler) ListClean(c *gin.Context) {
 	}
 
 	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
+	query.Count(&total)
 
 	var items []model.MerchantClean
 	if err := query.Order(sortField + " " + order).Limit(pageSize).Offset(offset).Find(&items).Error; err != nil {
@@ -161,8 +150,43 @@ func (h *MerchantHandler) ListClean(c *gin.Context) {
 	PageOK(c, items, total, page, pageSize)
 }
 
-// GetByID fetches a merchant by ID, checking clean table first then raw.
-// GET /merchants/:id
+// ExportCSV exports clean merchants as CSV.
+func (h *MerchantHandler) ExportCSV(c *gin.Context) {
+	query := h.store.DB.Model(&model.MerchantClean{}).Where("status = ?", "valid")
+	if level := c.Query("level"); level != "" {
+		query = query.Where("level = ?", level)
+	}
+
+	var merchants []model.MerchantClean
+	query.Order("level ASC, created_at DESC").Find(&merchants)
+
+	c.Header("Content-Type", "text/csv; charset=utf-8")
+	c.Header("Content-Disposition", "attachment; filename=merchants.csv")
+
+	// Write BOM for Excel compatibility
+	c.Writer.Write([]byte{0xEF, 0xBB, 0xBF})
+
+	w := csv.NewWriter(c.Writer)
+	w.Write([]string{"商户名", "TG用户名", "TG链接", "网站", "邮箱", "电话", "行业", "等级", "来源数"})
+
+	for _, m := range merchants {
+		w.Write([]string{
+			m.MerchantName,
+			m.TgUsername,
+			m.TgLink,
+			m.Website,
+			m.Email,
+			m.Phone,
+			m.IndustryTag,
+			m.Level,
+			fmt.Sprintf("%d", m.SourceCount),
+		})
+	}
+
+	w.Flush()
+}
+
+// GetByID fetches a merchant by ID.
 func (h *MerchantHandler) GetByID(c *gin.Context) {
 	id, err := strconv.ParseUint(c.Param("id"), 10, 64)
 	if err != nil {
@@ -171,13 +195,13 @@ func (h *MerchantHandler) GetByID(c *gin.Context) {
 	}
 
 	var clean model.MerchantClean
-	if err := h.db.First(&clean, id).Error; err == nil {
+	if err := h.store.DB.First(&clean, id).Error; err == nil {
 		OK(c, gin.H{"source": "clean", "data": clean})
 		return
 	}
 
 	var raw model.MerchantRaw
-	if err := h.db.First(&raw, id).Error; err == nil {
+	if err := h.store.DB.First(&raw, id).Error; err == nil {
 		OK(c, gin.H{"source": "raw", "data": raw})
 		return
 	}

+ 0 - 38
internal/handler/nav_site.go

@@ -1,38 +0,0 @@
-package handler
-
-import (
-	"spider/internal/model"
-
-	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
-)
-
-// NavSiteHandler handles nav site queries.
-type NavSiteHandler struct {
-	db *gorm.DB
-}
-
-// List returns nav sites with optional status filter and pagination.
-// GET /nav-sites?status=&page=&page_size=
-func (h *NavSiteHandler) List(c *gin.Context) {
-	page, pageSize, offset := parsePage(c)
-
-	query := h.db.Model(&model.NavSite{})
-	if status := c.Query("status"); status != "" {
-		query = query.Where("status = ?", status)
-	}
-
-	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	var items []model.NavSite
-	if err := query.Order("id DESC").Limit(pageSize).Offset(offset).Find(&items).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	PageOK(c, items, total, page, pageSize)
-}

+ 10 - 36
internal/handler/router.go

@@ -5,64 +5,38 @@ import (
 	"net/http"
 
 	"github.com/gin-gonic/gin"
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
 
-	"spider/internal/service"
+	"spider/internal/store"
+	"spider/internal/task"
 )
 
 // SetupRouter builds and returns the Gin engine with all routes registered.
-func SetupRouter(db *gorm.DB, rdb *redis.Client, taskSvc *service.TaskService) *gin.Engine {
+func SetupRouter(s *store.Store, taskMgr *task.Manager) *gin.Engine {
 	r := gin.Default()
 
-	// Health check.
 	r.GET("/ping", func(c *gin.Context) {
 		c.JSON(http.StatusOK, gin.H{"message": "pong"})
 	})
 
 	api := r.Group("/api/v1")
 
-	// Dashboard.
-	dash := &DashboardHandler{db: db}
-	api.GET("/dashboard", dash.Get)
-
-	// Channels.
-	ch := &ChannelHandler{db: db}
-	api.GET("/channels", ch.List)
-	api.GET("/channels/stats", ch.Stats)
-
-	// Keywords.
-	kw := &KeywordHandler{db: db}
+	// Keywords (unified: search keywords + seeds)
+	kw := &KeywordHandler{store: s}
 	api.GET("/keywords", kw.List)
 	api.POST("/keywords", kw.Create)
 	api.PUT("/keywords/:id", kw.Update)
 	api.DELETE("/keywords/:id", kw.Delete)
 
-	// Seeds.
-	sd := &SeedHandler{db: db}
-	api.GET("/seeds", sd.List)
-	api.POST("/seeds", sd.Create)
-	api.PUT("/seeds/:id", sd.Update)
-	api.DELETE("/seeds/:id", sd.Delete)
-
-	// Merchants.
-	mc := &MerchantHandler{db: db}
+	// Merchants
+	mc := &MerchantHandler{store: s}
 	api.GET("/merchants/stats", mc.Stats)
 	api.GET("/merchants/raw", mc.ListRaw)
 	api.GET("/merchants/clean", mc.ListClean)
+	api.GET("/merchants/clean/export", mc.ExportCSV)
 	api.GET("/merchants/:id", mc.GetByID)
 
-	// Nav sites.
-	ns := &NavSiteHandler{db: db}
-	api.GET("/nav-sites", ns.List)
-
-	// Config / Settings.
-	cfg := &ConfigHandler{db: db}
-	api.GET("/config/settings", cfg.ListSettings)
-	api.PUT("/config/settings/:key", cfg.UpdateSetting)
-
-	// Tasks.
-	th := NewTaskHandler(db, taskSvc, rdb)
+	// Tasks
+	th := &TaskHandler{store: s, taskMgr: taskMgr}
 	api.GET("/tasks", th.List)
 	api.POST("/tasks/start", th.Start)
 	api.GET("/tasks/:id", th.Get)

+ 0 - 120
internal/handler/seed.go

@@ -1,120 +0,0 @@
-package handler
-
-import (
-	"net/http"
-	"strconv"
-
-	"spider/internal/model"
-
-	"github.com/gin-gonic/gin"
-	"gorm.io/gorm"
-)
-
-// SeedHandler handles managed seed CRUD.
-type SeedHandler struct {
-	db *gorm.DB
-}
-
-// List returns seeds with optional status filter and pagination.
-// GET /seeds?page=1&page_size=20&status=active
-func (h *SeedHandler) List(c *gin.Context) {
-	page, pageSize, offset := parsePage(c)
-
-	query := h.db.Model(&model.ManagedSeed{})
-	if status := c.Query("status"); status != "" {
-		query = query.Where("status = ?", status)
-	}
-
-	var total int64
-	if err := query.Count(&total).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	var seeds []model.ManagedSeed
-	if err := query.Order("id DESC").Limit(pageSize).Offset(offset).Find(&seeds).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	PageOK(c, seeds, total, page, pageSize)
-}
-
-// Create creates a new seed.
-// POST /seeds  body: {channel_name, note}
-func (h *SeedHandler) Create(c *gin.Context) {
-	var body struct {
-		ChannelName string `json:"channel_name" binding:"required"`
-		Note        string `json:"note"`
-	}
-	if err := c.ShouldBindJSON(&body); err != nil {
-		Fail(c, http.StatusBadRequest, err.Error())
-		return
-	}
-
-	seed := model.ManagedSeed{
-		ChannelName: body.ChannelName,
-		Note:        body.Note,
-		Status:      "active",
-	}
-	if err := h.db.Create(&seed).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-	OK(c, seed)
-}
-
-// Update modifies a seed's status and/or note.
-// PUT /seeds/:id  body: {status, note}
-func (h *SeedHandler) Update(c *gin.Context) {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 64)
-	if err != nil {
-		Fail(c, http.StatusBadRequest, "invalid id")
-		return
-	}
-
-	var body struct {
-		Status string `json:"status"`
-		Note   string `json:"note"`
-	}
-	if err := c.ShouldBindJSON(&body); err != nil {
-		Fail(c, http.StatusBadRequest, err.Error())
-		return
-	}
-
-	var seed model.ManagedSeed
-	if err := h.db.First(&seed, id).Error; err != nil {
-		Fail(c, 404, "seed not found")
-		return
-	}
-
-	updates := map[string]interface{}{}
-	if body.Status != "" {
-		updates["status"] = body.Status
-	}
-	updates["note"] = body.Note
-
-	if err := h.db.Model(&seed).Updates(updates).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-
-	h.db.First(&seed, id)
-	OK(c, seed)
-}
-
-// Delete removes a seed by ID.
-// DELETE /seeds/:id
-func (h *SeedHandler) Delete(c *gin.Context) {
-	id, err := strconv.ParseUint(c.Param("id"), 10, 64)
-	if err != nil {
-		Fail(c, http.StatusBadRequest, "invalid id")
-		return
-	}
-
-	if err := h.db.Delete(&model.ManagedSeed{}, id).Error; err != nil {
-		Fail(c, 500, err.Error())
-		return
-	}
-	OK(c, nil)
-}

+ 45 - 102
internal/handler/task.go

@@ -8,60 +8,24 @@ import (
 
 	"github.com/gin-gonic/gin"
 	"github.com/gorilla/websocket"
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
 
 	"spider/internal/model"
-	"spider/internal/service"
+	"spider/internal/store"
+	"spider/internal/task"
 )
 
-// validTaskTypes is the set of accepted task_type values.
-var validTaskTypes = map[string]bool{
-	"full":     true,
-	"discover": true,
-	"search":   true,
-	"github":   true,
-	"scrape":   true,
-	"crawl":    true,
-	"clean":    true,
-	"score":    true,
-}
-
 // TaskHandler handles task-related HTTP and WebSocket requests.
 type TaskHandler struct {
-	db          *gorm.DB
-	taskService *service.TaskService
-	redis       *redis.Client
-	upgrader    websocket.Upgrader
-}
-
-// NewTaskHandler creates a TaskHandler.
-func NewTaskHandler(db *gorm.DB, svc *service.TaskService, rdb *redis.Client) *TaskHandler {
-	return &TaskHandler{
-		db:          db,
-		taskService: svc,
-		redis:       rdb,
-		upgrader: websocket.Upgrader{
-			CheckOrigin: func(r *http.Request) bool { return true },
-		},
-	}
+	store   *store.Store
+	taskMgr *task.Manager
 }
 
 // List handles GET /tasks
-// Query params: status, page, page_size
 func (h *TaskHandler) List(c *gin.Context) {
+	page, pageSize, offset := parsePage(c)
 	status := c.Query("status")
-	page, _ := strconv.Atoi(c.DefaultQuery("page", "1"))
-	pageSize, _ := strconv.Atoi(c.DefaultQuery("page_size", "20"))
-	if page < 1 {
-		page = 1
-	}
-	if pageSize < 1 || pageSize > 100 {
-		pageSize = 20
-	}
-	offset := (page - 1) * pageSize
 
-	query := h.db.Model(&model.Task{}).Order("created_at DESC")
+	query := h.store.DB.Model(&model.TaskLog{}).Order("created_at DESC")
 	if status != "" {
 		query = query.Where("status = ?", status)
 	}
@@ -69,7 +33,7 @@ func (h *TaskHandler) List(c *gin.Context) {
 	var total int64
 	query.Count(&total)
 
-	var tasks []model.Task
+	var tasks []model.TaskLog
 	if err := query.Limit(pageSize).Offset(offset).Find(&tasks).Error; err != nil {
 		Fail(c, 500, err.Error())
 		return
@@ -80,24 +44,30 @@ func (h *TaskHandler) List(c *gin.Context) {
 
 // Start handles POST /tasks/start
 func (h *TaskHandler) Start(c *gin.Context) {
-	var req service.StartTaskRequest
+	var req task.StartRequest
 	if err := c.ShouldBindJSON(&req); err != nil {
 		Fail(c, 400, err.Error())
 		return
 	}
 
-	if !validTaskTypes[req.TaskType] {
-		Fail(c, 400, fmt.Sprintf("invalid task_type: %s", req.TaskType))
+	// Special case: clean task
+	if req.PluginName == "clean" {
+		taskLog, err := h.taskMgr.StartClean()
+		if err != nil {
+			Fail(c, 409, err.Error())
+			return
+		}
+		c.JSON(http.StatusCreated, Response{Code: 0, Message: "ok", Data: taskLog})
 		return
 	}
 
-	task, err := h.taskService.StartTask(req)
+	taskLog, err := h.taskMgr.StartTask(req)
 	if err != nil {
 		Fail(c, 409, err.Error())
 		return
 	}
 
-	c.JSON(http.StatusCreated, Response{Code: 0, Message: "ok", Data: task})
+	c.JSON(http.StatusCreated, Response{Code: 0, Message: "ok", Data: taskLog})
 }
 
 // Get handles GET /tasks/:id
@@ -108,16 +78,16 @@ func (h *TaskHandler) Get(c *gin.Context) {
 		return
 	}
 
-	var task model.Task
-	if err := h.db.First(&task, id).Error; err != nil {
+	var taskLog model.TaskLog
+	if err := h.store.DB.First(&taskLog, id).Error; err != nil {
 		Fail(c, 404, "task not found")
 		return
 	}
 
-	progress := h.taskService.GetProgress(&task)
+	progress := h.taskMgr.GetProgress(uint(id))
 
 	OK(c, gin.H{
-		"task":     task,
+		"task":     taskLog,
 		"progress": progress,
 	})
 }
@@ -130,12 +100,7 @@ func (h *TaskHandler) Stop(c *gin.Context) {
 		return
 	}
 
-	var body struct {
-		Force bool `json:"force"`
-	}
-	_ = c.ShouldBindJSON(&body)
-
-	if err := h.taskService.StopTask(uint(id), body.Force); err != nil {
+	if err := h.taskMgr.StopTask(uint(id)); err != nil {
 		Fail(c, 500, err.Error())
 		return
 	}
@@ -144,8 +109,6 @@ func (h *TaskHandler) Stop(c *gin.Context) {
 }
 
 // Logs handles GET /tasks/:id/logs via WebSocket.
-// On connect it immediately sends history logs from Redis, then streams live progress
-// until the task finishes or the client disconnects.
 func (h *TaskHandler) Logs(c *gin.Context) {
 	id, err := strconv.ParseUint(c.Param("id"), 10, 64)
 	if err != nil {
@@ -153,63 +116,43 @@ func (h *TaskHandler) Logs(c *gin.Context) {
 		return
 	}
 
-	conn, err := h.upgrader.Upgrade(c.Writer, c.Request, nil)
+	upgrader := websocket.Upgrader{
+		CheckOrigin: func(r *http.Request) bool { return true },
+	}
+
+	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
 	if err != nil {
 		return
 	}
 	defer conn.Close()
 
-	ctx := c.Request.Context()
-	ticker := time.NewTicker(time.Second)
-	defer ticker.Stop()
-
 	send := func(msg string) bool {
 		err := conn.WriteMessage(websocket.TextMessage, []byte(msg))
 		return err == nil
 	}
 
-	// Fetch task record immediately.
-	var task model.Task
-	if err := h.db.First(&task, id).Error; err != nil {
+	// Fetch task record
+	var taskLog model.TaskLog
+	if err := h.store.DB.First(&taskLog, id).Error; err != nil {
 		send(fmt.Sprintf("[错误] 任务 #%d 不存在", id))
 		return
 	}
 
-	// Send history logs from Redis list first.
-	logKey := fmt.Sprintf("spider:task:logs:%d", id)
-	historyLogs, _ := h.redis.LRange(ctx, logKey, 0, -1).Result()
-	for _, line := range historyLogs {
+	// Send history logs
+	logs := h.taskMgr.GetLogs(uint(id))
+	for _, line := range logs {
 		if !send(line) {
 			return
 		}
 	}
 
-	// If no history logs, send current task status summary.
-	if len(historyLogs) == 0 {
-		send(fmt.Sprintf("[信息] 任务 #%d (%s) 状态: %s", task.ID, task.TaskType, task.Status))
-
-		// Also send current Redis progress if available.
-		progressKey := fmt.Sprintf("spider:task:progress:%d", id)
-		vals, _ := h.redis.HGetAll(ctx, progressKey).Result()
-		if len(vals) > 0 {
-			msg := fmt.Sprintf("[进度] 阶段: %s | 进度: %s/%s | %s",
-				vals["phase"], vals["current"], vals["total"], vals["message"])
-			send(msg)
-		}
-	}
-
-	// If the task has already finished, send completion message and close.
-	if task.Status == "completed" || task.Status == "failed" || task.Status == "stopped" {
-		statusLabel := map[string]string{
-			"completed": "完成",
-			"failed":    "失败",
-			"stopped":   "停止",
-		}[task.Status]
-		send(fmt.Sprintf("[完成] 任务已%s", statusLabel))
+	// If finished, close
+	if taskLog.Status == "completed" || taskLog.Status == "failed" || taskLog.Status == "stopped" {
+		send(fmt.Sprintf("[完成] 任务已结束,状态: %s", taskLog.Status))
 		return
 	}
 
-	// Task is still running — handle client close messages in the background.
+	// Stream live updates
 	clientGone := make(chan struct{})
 	go func() {
 		for {
@@ -220,22 +163,22 @@ func (h *TaskHandler) Logs(c *gin.Context) {
 		}
 	}()
 
-	progressKey := fmt.Sprintf("spider:task:progress:%d", id)
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
 
 	for {
 		select {
 		case <-clientGone:
 			return
 		case <-ticker.C:
-			var t model.Task
-			if err := h.db.First(&t, id).Error; err != nil {
+			var t model.TaskLog
+			if err := h.store.DB.First(&t, id).Error; err != nil {
 				return
 			}
 
-			vals, _ := h.redis.HGetAll(ctx, progressKey).Result()
-			if len(vals) > 0 {
-				msg := fmt.Sprintf("[进度] 阶段: %s | %s/%s | %s",
-					vals["phase"], vals["current"], vals["total"], vals["message"])
+			progress := h.taskMgr.GetProgress(uint(id))
+			if progress != nil {
+				msg := fmt.Sprintf("[进度] %v", progress["message"])
 				if !send(msg) {
 					return
 				}

+ 7 - 95
internal/llm/client.go

@@ -4,7 +4,6 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
-	"strconv"
 	"strings"
 	"time"
 
@@ -13,15 +12,15 @@ import (
 	"spider/internal/extractor"
 )
 
-// Client OpenAI 兼容的 LLM 客户端
+// Client is an OpenAI-compatible LLM client.
+// Used only for TG message merchant extraction (fallback when regex fails).
 type Client struct {
 	client  *openai.Client
 	model   string
 	timeout time.Duration
 }
 
-// New 创建客户端,支持任意 OpenAI 兼容接口
-// baseURL 为空时使用 OpenAI 官方接口
+// New creates a client. baseURL empty = OpenAI official endpoint.
 func New(baseURL, apiKey, model string, timeout time.Duration) *Client {
 	cfg := openai.DefaultConfig(apiKey)
 	if baseURL != "" {
@@ -34,7 +33,6 @@ func New(baseURL, apiKey, model string, timeout time.Duration) *Client {
 	}
 }
 
-// chat 内部封装:发送 system + user 消息,返回第一条回复文本
 func (c *Client) chat(ctx context.Context, system, user string) (string, error) {
 	ctx, cancel := context.WithTimeout(ctx, c.timeout)
 	defer cancel()
@@ -55,36 +53,8 @@ func (c *Client) chat(ctx context.Context, system, user string) (string, error)
 	return strings.TrimSpace(resp.Choices[0].Message.Content), nil
 }
 
-// EvalChannelRelevance 评估 TG 频道是否与商户相关
-// 返回相关度评分 0-1,<0.5 认为不相关
-// 调用失败时返回 0.5 表示不确定
-func (c *Client) EvalChannelRelevance(ctx context.Context, name, about string, memberCount int) (float64, error) {
-	const system = `你是商户识别专家。请判断以下 Telegram 频道是否与商户/卖家/服务提供商相关。
-只关注是否有商品/服务在售。返回 0-1 的数字,1 表示高度相关,0 表示完全不相关。只返回数字,不要解释。`
-
-	user := fmt.Sprintf("频道名:%s\n简介:%s\n成员数:%d", name, about, memberCount)
-
-	text, err := c.chat(ctx, system, user)
-	if err != nil {
-		return 0.5, err
-	}
-
-	score, parseErr := strconv.ParseFloat(text, 64)
-	if parseErr != nil {
-		// 尝试从文本中提取第一个数字
-		fields := strings.Fields(text)
-		for _, f := range fields {
-			if s, e := strconv.ParseFloat(f, 64); e == nil {
-				return clamp01(s), nil
-			}
-		}
-		return 0.5, fmt.Errorf("llm eval: cannot parse score from %q", text)
-	}
-	return clamp01(score), nil
-}
-
-// ParseMerchant 从消息文本中解析商户信息
-// 用于正则提取失败时的 fallback,或提取非标准格式如"加V:xxx"
+// ParseMerchant extracts merchant info from text.
+// Used as fallback when regex extraction fails on non-standard formats like "加V:xxx".
 func (c *Client) ParseMerchant(ctx context.Context, message string) (*extractor.MerchantInfo, error) {
 	const system = `你是信息提取专家。从以下文本中提取商户联系信息,返回 JSON 格式。
 字段:merchant_name, tg_username(不含@), website, email, phone, industry, description
@@ -92,62 +62,20 @@ func (c *Client) ParseMerchant(ctx context.Context, message string) (*extractor.
 
 	text, err := c.chat(ctx, system, message)
 	if err != nil {
-		return defaultMerchantInfo(), err
+		return &extractor.MerchantInfo{}, err
 	}
 
-	// 去除可能的 markdown 代码块包裹
 	text = stripMarkdownCode(text)
 
 	info := &extractor.MerchantInfo{}
 	if jsonErr := json.Unmarshal([]byte(text), info); jsonErr != nil {
-		return defaultMerchantInfo(), fmt.Errorf("llm parse merchant: json unmarshal: %w (raw: %s)", jsonErr, text)
+		return &extractor.MerchantInfo{}, fmt.Errorf("llm parse merchant: json unmarshal: %w (raw: %s)", jsonErr, text)
 	}
 	return info, nil
 }
 
-// ClassifyIndustry 行业分类
-// 返回行业标签:机场/发卡/成人/电商/游戏/其他 等
-func (c *Client) ClassifyIndustry(ctx context.Context, name, about string) (string, error) {
-	const system = `你是电商行业分类专家。根据频道信息,从以下类别中选择最匹配的一个:
-机场、发卡、成人、电商、游戏充值、金融、软件工具、其他
-只返回类别名称,不要解释。`
-
-	user := fmt.Sprintf("名称:%s,简介:%s", name, about)
-
-	text, err := c.chat(ctx, system, user)
-	if err != nil {
-		return "其他", err
-	}
-	return strings.TrimSpace(text), nil
-}
-
-// IsNavSite 判断 URL 是否是导航站/目录站
-// 返回 (是否是导航站, 置信度 0-1)
-func (c *Client) IsNavSite(ctx context.Context, url string) (bool, float64, error) {
-	const system = `判断以下 URL 是否是导航站、目录站或聚合站(收录多个商家/服务的网站)。
-返回 JSON: {"is_nav": true/false, "confidence": 0.0-1.0}`
-
-	text, err := c.chat(ctx, system, url)
-	if err != nil {
-		return false, 0, err
-	}
-
-	text = stripMarkdownCode(text)
-
-	var result struct {
-		IsNav      bool    `json:"is_nav"`
-		Confidence float64 `json:"confidence"`
-	}
-	if jsonErr := json.Unmarshal([]byte(text), &result); jsonErr != nil {
-		return false, 0, fmt.Errorf("llm is_nav_site: json unmarshal: %w (raw: %s)", jsonErr, text)
-	}
-	return result.IsNav, clamp01(result.Confidence), nil
-}
-
-// stripMarkdownCode 去除 LLM 响应中可能包含的 markdown 代码块标记
 func stripMarkdownCode(s string) string {
 	s = strings.TrimSpace(s)
-	// 去除 ```json ... ``` 或 ``` ... ```
 	if strings.HasPrefix(s, "```") {
 		lines := strings.SplitN(s, "\n", 2)
 		if len(lines) == 2 {
@@ -160,19 +88,3 @@ func stripMarkdownCode(s string) string {
 	}
 	return s
 }
-
-// clamp01 将浮点数限制在 [0, 1] 范围内
-func clamp01(v float64) float64 {
-	if v < 0 {
-		return 0
-	}
-	if v > 1 {
-		return 1
-	}
-	return v
-}
-
-// defaultMerchantInfo 返回空的 MerchantInfo(JSON 解析失败时的默认值)
-func defaultMerchantInfo() *extractor.MerchantInfo {
-	return &extractor.MerchantInfo{}
-}

+ 6 - 7
internal/model/channel.go

@@ -2,17 +2,16 @@ package model
 
 import "time"
 
+// Channel stores discovered Telegram channels.
 type Channel struct {
 	ID             uint      `gorm:"primaryKey;autoIncrement" json:"id"`
 	Username       string    `gorm:"uniqueIndex;size:255;not null" json:"username"`
-	Title          string    `gorm:"size:500" json:"title"`
-	MemberCount    int       `gorm:"default:0" json:"member_count"`
-	About          string    `gorm:"type:text" json:"about"`
-	Source         string    `gorm:"type:enum('seed','snowball','search','github');not null;index" json:"source"`
-	SourceDetail   string    `gorm:"size:500" json:"source_detail"`
-	Status         string    `gorm:"type:enum('pending','scraped','failed','skipped');default:'pending';index" json:"status"`
+	ChannelID      int64     `gorm:"default:0" json:"channel_id"`    // cached TG entity ID
+	AccessHash     int64     `gorm:"default:0" json:"access_hash"`   // cached TG access hash
+	Status         string    `gorm:"size:20;default:'pending';index" json:"status"` // pending / scraped / failed / skipped
 	LastMessageID  int       `gorm:"default:0" json:"last_message_id"`
-	RelevanceScore float64   `json:"relevance_score"`
+	MerchantsFound int       `gorm:"default:0" json:"merchants_found"`
+	Source         string    `gorm:"size:50;not null;index" json:"source"` // seed / snowball / search / github
 	CreatedAt      time.Time `json:"created_at"`
 	UpdatedAt      time.Time `json:"updated_at"`
 }

+ 0 - 12
internal/model/config_revision.go

@@ -1,12 +0,0 @@
-package model
-
-import "time"
-
-type ConfigRevision struct {
-	ID         uint      `gorm:"primaryKey;autoIncrement"`
-	SettingKey string    `gorm:"size:255;not null;index"`
-	OldValue   string    `gorm:"type:text"`
-	NewValue   string    `gorm:"type:text"`
-	ChangedBy  string    `gorm:"size:100;default:'admin'"`
-	CreatedAt  time.Time
-}

+ 8 - 6
internal/model/keyword.go

@@ -2,10 +2,12 @@ package model
 
 import "time"
 
-type ManagedKeyword struct {
-	ID        uint      `gorm:"primaryKey;autoIncrement" json:"id"`
-	Keyword   string    `gorm:"uniqueIndex;size:255;not null" json:"keyword"`
-	Category  string    `gorm:"size:100" json:"category"`
-	Status    string    `gorm:"type:enum('active','inactive');default:'active'" json:"status"`
-	CreatedAt time.Time `json:"created_at"`
+// Keyword is the unified table for search keywords and seed channels.
+// Seeds use industry_tag = "seed", their keyword field holds the channel name.
+type Keyword struct {
+	ID          uint      `gorm:"primaryKey;autoIncrement" json:"id"`
+	Keyword     string    `gorm:"uniqueIndex;size:255;not null" json:"keyword"`
+	IndustryTag string    `gorm:"size:100" json:"industry_tag"`
+	Enabled     bool      `gorm:"default:true" json:"enabled"`
+	CreatedAt   time.Time `json:"created_at"`
 }

+ 19 - 20
internal/model/merchant_clean.go

@@ -6,25 +6,24 @@ import (
 	"gorm.io/datatypes"
 )
 
+// MerchantClean stores validated, deduplicated merchants.
 type MerchantClean struct {
-	ID           uint           `gorm:"primaryKey;autoIncrement" json:"id"`
-	RawID        *uint          `gorm:"index" json:"raw_id"`
-	MerchantName string         `gorm:"size:500" json:"merchant_name"`
-	TgUsername   string         `gorm:"uniqueIndex;size:255" json:"tg_username"`
-	Website      string         `gorm:"size:2048" json:"website"`
-	Email        string         `gorm:"size:255" json:"email"`
-	Phone        string         `gorm:"size:100" json:"phone"`
-	Industry     string         `gorm:"size:100;index" json:"industry"`
-	Status       string         `gorm:"type:enum('valid','invalid','bot','duplicate','group');not null;index" json:"status"`
-	TgFirstName  string         `gorm:"size:255" json:"tg_first_name"`
-	TgLastName   string         `gorm:"size:255" json:"tg_last_name"`
-	IsPremium    bool           `gorm:"default:false" json:"is_premium"`
-	LastOnline   *time.Time     `json:"last_online"`
-	ActiveLevel  string         `gorm:"type:enum('active','moderate','inactive')" json:"active_level"`
-	MemberCount  int            `gorm:"default:0" json:"member_count"`
-	QualityScore float64        `gorm:"default:0;index" json:"quality_score"`
-	SourceCount  int            `gorm:"default:1" json:"source_count"`
-	SourceLinks  datatypes.JSON `gorm:"type:json" json:"source_links"`
-	CreatedAt    time.Time      `json:"created_at"`
-	UpdatedAt    time.Time      `json:"updated_at"`
+	ID            uint           `gorm:"primaryKey;autoIncrement" json:"id"`
+	TgUsername    string         `gorm:"uniqueIndex;size:255" json:"tg_username"`
+	TgLink        string         `gorm:"size:500" json:"tg_link"`
+	MerchantName  string         `gorm:"size:500" json:"merchant_name"`
+	Website       string         `gorm:"size:2048" json:"website"`
+	Email         string         `gorm:"size:255" json:"email"`
+	Phone         string         `gorm:"size:100" json:"phone"`
+	SourceCount   int            `gorm:"default:1" json:"source_count"`
+	AllSources    datatypes.JSON `gorm:"type:json" json:"all_sources"`
+	IndustryTag   string         `gorm:"size:100;index" json:"industry_tag"`
+	Level         string         `gorm:"size:10;index" json:"level"` // Hot / Warm / Cold
+	Status        string         `gorm:"size:20;not null;index" json:"status"` // valid / invalid / bot / duplicate
+	IsAlive       bool           `gorm:"default:false" json:"is_alive"`
+	LastCheckedAt *time.Time     `json:"last_checked_at"`
+	CreatedAt     time.Time      `json:"created_at"`
+	UpdatedAt     time.Time      `json:"updated_at"`
 }
+
+func (MerchantClean) TableName() string { return "merchants_clean" }

+ 18 - 12
internal/model/merchant_raw.go

@@ -2,17 +2,23 @@ package model
 
 import "time"
 
+// MerchantRaw stores raw merchant data from all collector plugins.
+// Dedup rule: same tg_username + same source_url = skip insert.
 type MerchantRaw struct {
-	ID              uint      `gorm:"primaryKey;autoIncrement" json:"id"`
-	MerchantName    string    `gorm:"size:500" json:"merchant_name"`
-	TgUsername      string    `gorm:"size:255;index" json:"tg_username"`
-	Website         string    `gorm:"size:2048" json:"website"`
-	Email           string    `gorm:"size:255" json:"email"`
-	Phone           string    `gorm:"size:100" json:"phone"`
-	Industry        string    `gorm:"size:100" json:"industry"`
-	SourceType      string    `gorm:"type:enum('tg_scrape','web_crawl','github');not null" json:"source_type"`
-	SourceID        string    `gorm:"size:500" json:"source_id"`
-	OriginalMessage string    `gorm:"type:text" json:"original_message"`
-	Status          string    `gorm:"type:enum('raw','glm_parsed');default:'raw';index" json:"status"`
-	CreatedAt       time.Time `json:"created_at"`
+	ID           uint      `gorm:"primaryKey;autoIncrement" json:"id"`
+	TgUsername   string    `gorm:"size:255;index;not null" json:"tg_username"`
+	TgLink       string    `gorm:"size:500" json:"tg_link"`
+	MerchantName string    `gorm:"size:500" json:"merchant_name"`
+	Website      string    `gorm:"size:2048" json:"website"`
+	Email        string    `gorm:"size:255" json:"email"`
+	Phone        string    `gorm:"size:100" json:"phone"`
+	SourceType   string    `gorm:"size:50;not null" json:"source_type"`
+	SourceName   string    `gorm:"size:500" json:"source_name"`
+	SourceURL    string    `gorm:"size:2048" json:"source_url"`
+	OriginalText string    `gorm:"type:text" json:"original_text"`
+	IndustryTag  string    `gorm:"size:100" json:"industry_tag"`
+	Status       string    `gorm:"size:20;default:'raw';index" json:"status"` // raw / processing / done
+	CreatedAt    time.Time `json:"created_at"`
 }
+
+func (MerchantRaw) TableName() string { return "merchants_raw" }

+ 0 - 14
internal/model/nav_site.go

@@ -1,14 +0,0 @@
-package model
-
-import "time"
-
-type NavSite struct {
-	ID            uint      `gorm:"primaryKey;autoIncrement" json:"id"`
-	URL           string    `gorm:"uniqueIndex:idx_url,length:500;type:varchar(2048);not null" json:"url"`
-	Domain        string    `gorm:"size:255;index" json:"domain"`
-	Source        string    `gorm:"size:100" json:"source"`
-	Status        string    `gorm:"type:enum('pending','scraped','filtered','failed');default:'pending';index" json:"status"`
-	FilterReason  string    `gorm:"size:255" json:"filter_reason"`
-	MerchantCount int       `gorm:"default:0" json:"merchant_count"`
-	CreatedAt     time.Time `json:"created_at"`
-}

+ 0 - 12
internal/model/seed.go

@@ -1,12 +0,0 @@
-package model
-
-import "time"
-
-type ManagedSeed struct {
-	ID          uint      `gorm:"primaryKey;autoIncrement" json:"id"`
-	ChannelName string    `gorm:"uniqueIndex;size:255;not null" json:"channel_name"`
-	Status      string    `gorm:"type:enum('active','inactive');default:'active'" json:"status"`
-	Note        string    `gorm:"size:500" json:"note"`
-	CreatedAt   time.Time `json:"created_at"`
-	UpdatedAt   time.Time `json:"updated_at"`
-}

+ 0 - 13
internal/model/setting.go

@@ -1,13 +0,0 @@
-package model
-
-import "time"
-
-type ManagedSetting struct {
-	ID          uint      `gorm:"primaryKey;autoIncrement" json:"id"`
-	KeyName     string    `gorm:"uniqueIndex;column:key_name;size:255;not null" json:"key_name"`
-	Value       string    `gorm:"type:text;not null" json:"value"`
-	ValueType   string    `gorm:"type:enum('int','float','bool','string','json');not null" json:"value_type"`
-	EffectLevel string    `gorm:"type:enum('runtime','new_task');default:'runtime'" json:"effect_level"`
-	Description string    `gorm:"size:500" json:"description"`
-	UpdatedAt   time.Time `json:"updated_at"`
-}

+ 0 - 20
internal/model/task.go

@@ -1,20 +0,0 @@
-package model
-
-import (
-	"time"
-
-	"gorm.io/datatypes"
-)
-
-type Task struct {
-	ID         uint           `gorm:"primaryKey;autoIncrement" json:"id"`
-	TaskType   string         `gorm:"type:enum('full','discover','search','github','scrape','crawl','clean','score');not null;index" json:"task_type"`
-	Status     string         `gorm:"type:enum('pending','running','completed','failed','stopped');default:'pending';index" json:"status"`
-	Params     datatypes.JSON `gorm:"type:json" json:"params"`
-	Progress   datatypes.JSON `gorm:"type:json" json:"progress"`
-	Result     datatypes.JSON `gorm:"type:json" json:"result"`
-	ErrorMsg   string         `gorm:"type:text" json:"error_msg"`
-	StartedAt  *time.Time     `json:"started_at"`
-	FinishedAt *time.Time     `json:"finished_at"`
-	CreatedAt  time.Time      `json:"created_at"`
-}

+ 18 - 0
internal/model/task_log.go

@@ -0,0 +1,18 @@
+package model
+
+import "time"
+
+// TaskLog records the execution of a plugin or processor run.
+type TaskLog struct {
+	ID             uint       `gorm:"primaryKey;autoIncrement" json:"id"`
+	TaskType       string     `gorm:"size:50;not null;index" json:"task_type"` // collect / clean
+	PluginName     string     `gorm:"size:100" json:"plugin_name"`
+	Status         string     `gorm:"size:20;default:'pending';index" json:"status"` // pending / running / completed / failed / stopped
+	ItemsProcessed int        `gorm:"default:0" json:"items_processed"`
+	MerchantsAdded int        `gorm:"default:0" json:"merchants_added"`
+	ErrorsCount    int        `gorm:"default:0" json:"errors_count"`
+	StartedAt      *time.Time `json:"started_at"`
+	FinishedAt     *time.Time `json:"finished_at"`
+	Detail         string     `gorm:"type:text" json:"detail"`
+	CreatedAt      time.Time  `json:"created_at"`
+}

+ 0 - 0
internal/pipeline/.gitkeep


+ 0 - 46
internal/pipeline/phase.go

@@ -1,46 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"spider/internal/model"
-)
-
-// Settings is a minimal interface satisfied by *service.SettingsService.
-// Using an interface here avoids an import cycle (service → worker → pipeline → service).
-type Settings interface {
-	GetInt(ctx context.Context, key string, defaultVal int) int
-	GetFloat(ctx context.Context, key string, defaultVal float64) float64
-	GetBool(ctx context.Context, key string, defaultVal bool) bool
-}
-
-// Phase 每个采集阶段的接口
-type Phase interface {
-	Name() string
-	Run(ctx context.Context, task *model.Task, opts *Options) error
-}
-
-// Options Pipeline 执行选项(来自任务参数)
-type Options struct {
-	Target     string
-	SkipPhases []string
-	TestRun    *TestRun
-}
-
-type TestRun struct {
-	ItemLimit    int
-	MessageLimit int
-}
-
-// ProgressReporter 进度上报函数类型
-// 由 pipeline.Runner 提供,各 phase 调用
-type ProgressReporter func(phase string, current, total int, message string)
-
-// ShouldSkip 检查某阶段是否被跳过
-func ShouldSkip(phaseName string, skipPhases []string) bool {
-	for _, s := range skipPhases {
-		if s == phaseName {
-			return true
-		}
-	}
-	return false
-}

+ 0 - 183
internal/pipeline/phase1_discover.go

@@ -1,183 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"log"
-	"regexp"
-	"strings"
-	"time"
-
-	"spider/internal/model"
-	"spider/internal/telegram"
-
-	"gorm.io/gorm"
-)
-
-// DiscoverPhase Phase 1: TG 频道裂变发现
-type DiscoverPhase struct {
-	db        *gorm.DB
-	tgManager *telegram.AccountManager
-	settings  Settings
-	reporter  ProgressReporter
-}
-
-// NewDiscoverPhase creates a new DiscoverPhase.
-func NewDiscoverPhase(db *gorm.DB, tgManager *telegram.AccountManager, settings Settings) *DiscoverPhase {
-	return &DiscoverPhase{
-		db:        db,
-		tgManager: tgManager,
-		settings:  settings,
-	}
-}
-
-func (p *DiscoverPhase) Name() string { return "discover" }
-
-func (p *DiscoverPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	log.Printf("[discover] starting, task_id=%d", task.ID)
-
-	if p.tgManager == nil {
-		log.Printf("[discover] tgManager is nil, skipping")
-		return nil
-	}
-
-	// 1. 读配置
-	maxDepth := 3
-	maxPerLayer := p.settings.GetInt(ctx, "snowball.max_channels_per_layer", 200)
-	maxTotal := p.settings.GetInt(ctx, "snowball.max_channels_total", 500)
-
-	// 2. 从 managed_seeds 拿所有 active 种子
-	var seeds []model.ManagedSeed
-	p.db.Where("status = ?", "active").Find(&seeds)
-	log.Printf("[discover] found %d active seeds", len(seeds))
-
-	// 3. BFS 队列
-	type QueueItem struct {
-		Username string
-		Depth    int
-		Source   string // "seed" 或 "snowball"
-	}
-
-	queue := make([]QueueItem, 0, len(seeds))
-	for _, s := range seeds {
-		queue = append(queue, QueueItem{Username: s.ChannelName, Depth: 0, Source: "seed"})
-	}
-
-	visited := map[string]bool{}
-	totalFound := 0
-
-	// 4. BFS 处理
-	for len(queue) > 0 && totalFound < maxTotal {
-		if isContextDone(ctx) {
-			break
-		}
-
-		item := queue[0]
-		queue = queue[1:]
-
-		username := cleanUsername(item.Username)
-		if username == "" || visited[username] {
-			continue
-		}
-		visited[username] = true
-
-		// 获取 TG 账号
-		acc, err := p.tgManager.Acquire(ctx)
-		if err != nil {
-			log.Printf("[discover] no available TG account: %v", err)
-			break
-		}
-
-		// 连接并获取频道信息
-		if err := acc.Client.Connect(ctx); err != nil {
-			log.Printf("[discover] connect failed for account: %v", err)
-			p.tgManager.Release(acc, 0)
-			continue
-		}
-
-		channelInfo, err := acc.Client.GetChannelInfo(ctx, username)
-		if err != nil {
-			if fw, ok := err.(*telegram.FloodWaitError); ok {
-				log.Printf("[discover] FloodWait %ds on @%s", fw.Seconds, username)
-				p.tgManager.HandleFloodWait(acc, fw.Seconds)
-			} else {
-				log.Printf("[discover] GetChannelInfo error @%s: %v", username, err)
-				p.tgManager.Release(acc, 0)
-			}
-			continue
-		}
-
-		// 写入 channels 表(忽略 unique 冲突)
-		ch := &model.Channel{
-			Username:    username,
-			Title:       channelInfo.Title,
-			MemberCount: channelInfo.MemberCount,
-			About:       channelInfo.About,
-			Source:      item.Source,
-			Status:      "pending",
-		}
-		p.db.Where(model.Channel{Username: username}).FirstOrCreate(ch)
-		totalFound++
-
-		if p.reporter != nil {
-			p.reporter("discover", totalFound, maxTotal, "发现频道: @"+username)
-		}
-
-		// 如果还没到最大深度,读消息提取更多频道
-		if item.Depth < maxDepth {
-			msgs, err := acc.Client.GetMessages(ctx, username, 0, 100)
-			if err == nil {
-				layerCount := 0
-				for _, msg := range msgs {
-					if layerCount >= maxPerLayer {
-						break
-					}
-					// 提取 forward 来源频道
-					if msg.ForwardFromChannel != "" {
-						fwdName := cleanUsername(msg.ForwardFromChannel)
-						if fwdName != "" && !visited[fwdName] {
-							queue = append(queue, QueueItem{fwdName, item.Depth + 1, "snowball"})
-							layerCount++
-						}
-					}
-					// 提取消息中的 t.me 链接
-					for _, link := range msg.Links {
-						name := extractUsernameFromLink(link)
-						if name != "" && !visited[name] {
-							queue = append(queue, QueueItem{name, item.Depth + 1, "snowball"})
-							layerCount++
-						}
-					}
-				}
-			} else {
-				log.Printf("[discover] GetMessages @%s: %v", username, err)
-			}
-		}
-
-		p.tgManager.Release(acc, 0)
-
-		// 频道间 sleep
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-time.After(5 * time.Second):
-		}
-	}
-
-	log.Printf("[discover] done, found %d channels", totalFound)
-	return nil
-}
-
-// cleanUsername 清理用户名(去除 @ 前缀及空白)
-func cleanUsername(s string) string {
-	return strings.TrimPrefix(strings.TrimSpace(s), "@")
-}
-
-// extractUsernameFromLink 从 t.me/xxx 链接提取用户名
-func extractUsernameFromLink(link string) string {
-	re := regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
-	m := re.FindStringSubmatch(link)
-	if len(m) > 1 {
-		return m[1]
-	}
-	return ""
-}

+ 0 - 129
internal/pipeline/phase2_search.go

@@ -1,129 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"log"
-	"net/url"
-	"regexp"
-	"time"
-
-	"spider/internal/model"
-	"spider/internal/search"
-
-	"gorm.io/gorm"
-)
-
-// SearchPhase Phase 2: 搜索引擎采集
-type SearchPhase struct {
-	db       *gorm.DB
-	serper   *search.SerperClient
-	settings Settings
-	reporter ProgressReporter
-}
-
-// NewSearchPhase creates a new SearchPhase.
-func NewSearchPhase(db *gorm.DB, serper *search.SerperClient, settings Settings) *SearchPhase {
-	return &SearchPhase{
-		db:       db,
-		serper:   serper,
-		settings: settings,
-	}
-}
-
-func (p *SearchPhase) Name() string { return "search" }
-
-func (p *SearchPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	if p.serper == nil {
-		log.Println("[search] no serper client configured, skipping")
-		return nil
-	}
-
-	// 取 active 关键词
-	var keywords []model.ManagedKeyword
-	q := p.db.Where("status = ?", "active")
-	if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
-		q = q.Limit(opts.TestRun.ItemLimit)
-	}
-	q.Find(&keywords)
-
-	total := len(keywords)
-	channelCount, navCount := 0, 0
-
-	for i, kw := range keywords {
-		if isContextDone(ctx) {
-			break
-		}
-
-		if p.reporter != nil {
-			p.reporter("search", i+1, total, "搜索: "+kw.Keyword)
-		}
-
-		results, err := p.serper.Search(ctx, kw.Keyword)
-		if err != nil {
-			log.Printf("[search] keyword=%s err=%v", kw.Keyword, err)
-			continue
-		}
-
-		for _, r := range results {
-			switch search.ClassifyURL(r.URL) {
-			case "tg_channel":
-				username := extractTGUsername(r.URL)
-				if username == "" {
-					continue
-				}
-				ch := &model.Channel{
-					Username:     username,
-					Source:       "search",
-					SourceDetail: kw.Keyword,
-					Status:       "pending",
-				}
-				result := p.db.Where(model.Channel{Username: username}).FirstOrCreate(ch)
-				if result.RowsAffected > 0 {
-					channelCount++
-				}
-
-			case "nav_site":
-				domain := extractDomain(r.URL)
-				site := &model.NavSite{
-					URL:    r.URL,
-					Domain: domain,
-					Source: kw.Keyword,
-					Status: "pending",
-				}
-				result := p.db.Where("url = ?", r.URL).FirstOrCreate(site)
-				if result.RowsAffected > 0 {
-					navCount++
-				}
-			}
-		}
-
-		// 关键词间 sleep 2s
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-time.After(2 * time.Second):
-		}
-	}
-
-	log.Printf("[search] done: %d channels, %d nav_sites found", channelCount, navCount)
-	return nil
-}
-
-// extractTGUsername 从 t.me/username 或 telegram.me/username 提取用户名
-func extractTGUsername(rawURL string) string {
-	re := regexp.MustCompile(`(?:t(?:elegram)?\.me)/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
-	m := re.FindStringSubmatch(rawURL)
-	if len(m) > 1 {
-		return m[1]
-	}
-	return ""
-}
-
-// extractDomain 从 URL 中提取域名
-func extractDomain(rawURL string) string {
-	u, err := url.Parse(rawURL)
-	if err != nil {
-		return ""
-	}
-	return u.Hostname()
-}

+ 0 - 250
internal/pipeline/phase3_github.go

@@ -1,250 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"io"
-	"log"
-	"net/http"
-	"net/url"
-	"regexp"
-	"strings"
-	"time"
-
-	"spider/internal/extractor"
-	"spider/internal/model"
-
-	"gorm.io/gorm"
-)
-
-// GithubPhase Phase 3: GitHub README 挖掘
-type GithubPhase struct {
-	db       *gorm.DB
-	token    string // GitHub token(可选)
-	settings Settings
-	reporter ProgressReporter
-	http     *http.Client
-}
-
-// NewGithubPhase creates a new GithubPhase.
-func NewGithubPhase(db *gorm.DB, token string, settings Settings) *GithubPhase {
-	return &GithubPhase{
-		db:       db,
-		token:    token,
-		settings: settings,
-		http:     &http.Client{Timeout: 15 * time.Second},
-	}
-}
-
-func (p *GithubPhase) Name() string { return "github" }
-
-func (p *GithubPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	// GitHub 搜索 query:从 managed_keywords 取前 10 个生成 query
-	var keywords []model.ManagedKeyword
-	p.db.Where("status = ?", "active").Limit(10).Find(&keywords)
-
-	queries := []string{}
-	for _, kw := range keywords {
-		queries = append(queries, fmt.Sprintf("%s telegram", kw.Keyword))
-	}
-
-	itemLimit := 50 // 默认处理 50 个 repo
-	if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
-		itemLimit = opts.TestRun.ItemLimit
-	}
-
-	total := len(queries)
-	found := 0
-
-	reposPerQuery := 1
-	if len(queries) > 0 {
-		reposPerQuery = itemLimit/len(queries) + 1
-	}
-
-	for i, query := range queries {
-		if isContextDone(ctx) {
-			break
-		}
-		if p.reporter != nil {
-			p.reporter("github", i+1, total, "GitHub搜索: "+query)
-		}
-
-		repos, err := p.searchRepos(ctx, query, reposPerQuery)
-		if err != nil {
-			log.Printf("[github] search err: %v", err)
-			continue
-		}
-
-		for _, repo := range repos {
-			if isContextDone(ctx) {
-				break
-			}
-
-			readme, err := p.fetchReadme(ctx, repo)
-			if err != nil {
-				continue
-			}
-
-			// 过滤:README 前 5000 字必须含中文
-			preview := readme
-			if len(preview) > 5000 {
-				preview = preview[:5000]
-			}
-			if !extractor.ContainsChinese(preview, 0) {
-				continue
-			}
-
-			// 提取 t.me 链接
-			links := extractTMeLinks(readme)
-			for _, link := range links {
-				// 前后 200 字必须含中文
-				idx := strings.Index(readme, link)
-				if idx < 0 {
-					continue
-				}
-				start := idx - 200
-				if start < 0 {
-					start = 0
-				}
-				end := idx + len(link) + 200
-				if end > len(readme) {
-					end = len(readme)
-				}
-				context200 := readme[start:end]
-				if !extractor.ContainsChinese(context200, 0) {
-					continue
-				}
-
-				username := extractTGUsernameFromLink(link)
-				if username == "" {
-					continue
-				}
-
-				ch := &model.Channel{
-					Username:     username,
-					Source:       "github",
-					SourceDetail: repo,
-					Status:       "pending",
-				}
-				result := p.db.Where(model.Channel{Username: username}).FirstOrCreate(ch)
-				if result.RowsAffected > 0 {
-					found++
-				}
-			}
-
-			// repo 间 sleep 2s
-			select {
-			case <-ctx.Done():
-				return nil
-			case <-time.After(2 * time.Second):
-			}
-		}
-
-		// query 间 sleep 5s
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-time.After(5 * time.Second):
-		}
-	}
-
-	log.Printf("[github] done: %d channels found", found)
-	return nil
-}
-
-// searchRepos 通过 GitHub Search API 搜索仓库
-func (p *GithubPhase) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
-	perPage := limit
-	if perPage > 30 {
-		perPage = 30
-	}
-	apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
-		url.QueryEscape(query), perPage)
-
-	req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
-	if err != nil {
-		return nil, err
-	}
-	req.Header.Set("Accept", "application/vnd.github.v3+json")
-	if p.token != "" {
-		req.Header.Set("Authorization", "token "+p.token)
-	}
-
-	resp, err := p.http.Do(req)
-	if err != nil {
-		return nil, err
-	}
-	defer resp.Body.Close()
-
-	var result struct {
-		Items []struct {
-			FullName string `json:"full_name"`
-		} `json:"items"`
-	}
-	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
-		return nil, err
-	}
-
-	var repos []string
-	for _, item := range result.Items {
-		repos = append(repos, item.FullName)
-	}
-	return repos, nil
-}
-
-// fetchReadme 下载 README.md(先尝试 main 分支,失败则尝试 master)
-func (p *GithubPhase) fetchReadme(ctx context.Context, fullName string) (string, error) {
-	rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/main/README.md", fullName)
-	req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
-	if err != nil {
-		return "", err
-	}
-	if p.token != "" {
-		req.Header.Set("Authorization", "token "+p.token)
-	}
-
-	resp, err := p.http.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode == 404 {
-		// 尝试 master 分支
-		masterURL := strings.Replace(rawURL, "/main/", "/master/", 1)
-		req2, err := http.NewRequestWithContext(ctx, "GET", masterURL, nil)
-		if err != nil {
-			return "", err
-		}
-		if p.token != "" {
-			req2.Header.Set("Authorization", "token "+p.token)
-		}
-		resp2, err := p.http.Do(req2)
-		if err != nil {
-			return "", err
-		}
-		defer resp2.Body.Close()
-		data, _ := io.ReadAll(resp2.Body)
-		return string(data), nil
-	}
-
-	data, _ := io.ReadAll(resp.Body)
-	return string(data), nil
-}
-
-// extractTMeLinks 从文本中提取所有 t.me 链接
-func extractTMeLinks(text string) []string {
-	re := regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)
-	return re.FindAllString(text, -1)
-}
-
-// extractTGUsernameFromLink 从 t.me/xxx 链接提取用户名
-func extractTGUsernameFromLink(link string) string {
-	re := regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
-	m := re.FindStringSubmatch(link)
-	if len(m) > 1 {
-		return m[1]
-	}
-	return ""
-}

+ 0 - 220
internal/pipeline/phase4_scrape.go

@@ -1,220 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"log"
-	"strings"
-	"time"
-
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
-
-	"spider/internal/extractor"
-	"spider/internal/llm"
-	"spider/internal/model"
-	"spider/internal/telegram"
-)
-
-// ScrapePhase Phase 4: TG 消息采集
-type ScrapePhase struct {
-	db        *gorm.DB
-	tgManager *telegram.AccountManager
-	llmClient *llm.Client
-	settings  Settings
-	redis     *redis.Client
-	reporter  ProgressReporter
-}
-
-// NewScrapePhase creates a new ScrapePhase.
-func NewScrapePhase(db *gorm.DB, tgManager *telegram.AccountManager, llmClient *llm.Client, settings Settings, rdb *redis.Client) *ScrapePhase {
-	return &ScrapePhase{
-		db:        db,
-		tgManager: tgManager,
-		llmClient: llmClient,
-		settings:  settings,
-		redis:     rdb,
-	}
-}
-
-func (p *ScrapePhase) Name() string { return "scrape" }
-
-func (p *ScrapePhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	log.Printf("[scrape] starting, task_id=%d", task.ID)
-
-	if p.tgManager == nil {
-		log.Printf("[scrape] tgManager is nil, skipping")
-		return nil
-	}
-
-	msgLimit := p.settings.GetInt(ctx, "tg_scraper.message_limit_per_channel", 500)
-	delayMsg := p.settings.GetFloat(ctx, "tg_scraper.delay_per_message", 1.0)
-	delayChannel := p.settings.GetFloat(ctx, "tg_scraper.delay_per_channel", 5.0)
-
-	if opts.TestRun != nil && opts.TestRun.MessageLimit > 0 {
-		msgLimit = opts.TestRun.MessageLimit
-	}
-
-	// 取 pending 频道
-	var channels []model.Channel
-	q := p.db.Where("status = ?", "pending")
-	if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
-		q = q.Limit(opts.TestRun.ItemLimit)
-	}
-	q.Find(&channels)
-
-	total := len(channels)
-	log.Printf("[scrape] found %d pending channels", total)
-
-	for i, ch := range channels {
-		if isContextDone(ctx) {
-			break
-		}
-
-		if p.reporter != nil {
-			p.reporter("scrape", i+1, total, "采集频道: @"+ch.Username)
-		}
-
-		acc, err := p.tgManager.Acquire(ctx)
-		if err != nil {
-			log.Printf("[scrape] no available account: %v", err)
-			break
-		}
-
-		if err := acc.Client.Connect(ctx); err != nil {
-			log.Printf("[scrape] connect failed: %v", err)
-			p.tgManager.Release(acc, 0)
-			p.db.Model(&ch).Update("status", "failed")
-			continue
-		}
-
-		// LLM 相关性评估
-		if p.llmClient != nil {
-			score, err := p.llmClient.EvalChannelRelevance(ctx, ch.Title, ch.About, ch.MemberCount)
-			if err == nil && score < 0.5 {
-				log.Printf("[scrape] skipping @%s, relevance score=%.2f", ch.Username, score)
-				p.tgManager.Release(acc, 0)
-				p.db.Model(&ch).Update("status", "skipped")
-				continue
-			}
-		}
-
-		// 读置顶消息
-		pinnedMsgs, _ := acc.Client.GetPinnedMessages(ctx, ch.Username)
-		p.processMessages(ctx, pinnedMsgs, &ch, delayMsg)
-
-		// 读历史消息(断点续传)
-		offsetID := ch.LastMessageID
-		fetched := 0
-		for fetched < msgLimit {
-			if isContextDone(ctx) {
-				break
-			}
-
-			batchSize := 100
-			if msgLimit-fetched < batchSize {
-				batchSize = msgLimit - fetched
-			}
-
-			msgs, err := acc.Client.GetMessages(ctx, ch.Username, offsetID, batchSize)
-			if err != nil {
-				if fw, ok := err.(*telegram.FloodWaitError); ok {
-					log.Printf("[scrape] FloodWait %ds on @%s", fw.Seconds, ch.Username)
-					p.tgManager.HandleFloodWait(acc, fw.Seconds)
-					acc = nil
-				} else {
-					log.Printf("[scrape] GetMessages @%s: %v", ch.Username, err)
-				}
-				break
-			}
-			if len(msgs) == 0 {
-				break
-			}
-
-			p.processMessages(ctx, msgs, &ch, delayMsg)
-
-			// 更新断点
-			lastID := msgs[len(msgs)-1].ID
-			p.db.Model(&ch).Update("last_message_id", lastID)
-			offsetID = lastID
-			fetched += len(msgs)
-		}
-
-		if acc != nil {
-			p.tgManager.Release(acc, 0)
-		}
-		p.db.Model(&ch).Update("status", "scraped")
-
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-time.After(time.Duration(float64(time.Second) * delayChannel)):
-		}
-	}
-
-	log.Printf("[scrape] done")
-	return nil
-}
-
-// processMessages 处理一批消息,提取商户写入 merchants_raw
-func (p *ScrapePhase) processMessages(ctx context.Context, msgs []telegram.Message, ch *model.Channel, delayMsg float64) {
-	for _, msg := range msgs {
-		if msg.IsService || msg.Text == "" {
-			continue
-		}
-		if !extractor.ContainsChinese(msg.Text, 0) {
-			continue
-		}
-		if !extractor.HasContact(msg.Text) {
-			continue
-		}
-
-		// 快速去重(Redis SET NX key)
-		if p.redis != nil {
-			info := extractor.Extract(msg.Text)
-			if info.TgUsername != "" {
-				dedupKey := "spider:dedup:merchant:" + info.TgUsername
-				set, _ := p.redis.SetNX(ctx, dedupKey, "1", 7*24*time.Hour).Result()
-				if !set {
-					continue // 已存在,跳过
-				}
-			}
-		}
-
-		// LLM 精准解析
-		var merchantInfo *extractor.MerchantInfo
-		if p.llmClient != nil {
-			merchantInfo, _ = p.llmClient.ParseMerchant(ctx, msg.Text)
-		}
-
-		// Fallback 到正则
-		if merchantInfo == nil || merchantInfo.TgUsername == "" {
-			info := extractor.Extract(msg.Text)
-			merchantInfo = &extractor.MerchantInfo{
-				TgUsername: info.TgUsername,
-				Website:    info.Website,
-				Email:      info.Email,
-				Phone:      info.Phone,
-			}
-		}
-
-		if merchantInfo.TgUsername == "" && merchantInfo.Website == "" {
-			continue
-		}
-
-		raw := &model.MerchantRaw{
-			MerchantName:    extractor.CleanMerchantName(merchantInfo.MerchantName),
-			TgUsername:      strings.TrimPrefix(merchantInfo.TgUsername, "@"),
-			Website:         merchantInfo.Website,
-			Email:           merchantInfo.Email,
-			Phone:           merchantInfo.Phone,
-			Industry:        merchantInfo.Industry,
-			SourceType:      "tg_scrape",
-			SourceID:        ch.Username,
-			OriginalMessage: msg.Text,
-			Status:          "raw",
-		}
-		p.db.Create(raw)
-
-		time.Sleep(time.Duration(float64(time.Second) * delayMsg))
-	}
-}

+ 0 - 200
internal/pipeline/phase5_crawl.go

@@ -1,200 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"log"
-	"strings"
-
-	"gorm.io/gorm"
-
-	"spider/internal/crawler"
-	"spider/internal/extractor"
-	"spider/internal/llm"
-	"spider/internal/model"
-)
-
-// CrawlPhase Phase 5: 网页爬取
-type CrawlPhase struct {
-	db            *gorm.DB
-	staticCrawler *crawler.StaticCrawler
-	dynCrawler    *crawler.DynamicCrawler
-	tmeValidator  *crawler.TMeValidator
-	llmClient     *llm.Client
-	settings      Settings
-	reporter      ProgressReporter
-}
-
-// NewCrawlPhase creates a new CrawlPhase.
-func NewCrawlPhase(db *gorm.DB, llmClient *llm.Client, settings Settings) *CrawlPhase {
-	return &CrawlPhase{
-		db:            db,
-		staticCrawler: crawler.NewStaticCrawler(),
-		dynCrawler:    crawler.NewDynamicCrawler(),
-		tmeValidator:  crawler.NewTMeValidator(),
-		llmClient:     llmClient,
-		settings:      settings,
-	}
-}
-
-func (p *CrawlPhase) Name() string { return "crawl" }
-
-func (p *CrawlPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	log.Printf("[crawl] starting, task_id=%d", task.ID)
-
-	tmeEnabled := true
-	if p.settings != nil {
-		tmeEnabled = p.settings.GetBool(ctx, "tme_validator.enabled", true)
-	}
-
-	var navSites []model.NavSite
-	q := p.db.Where("status = ?", "pending")
-	if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
-		q = q.Limit(opts.TestRun.ItemLimit)
-	}
-	q.Find(&navSites)
-
-	total := len(navSites)
-	merchantCount := 0
-
-	for i, site := range navSites {
-		if isContextDone(ctx) {
-			break
-		}
-
-		if p.reporter != nil {
-			p.reporter("crawl", i+1, total, "爬取: "+site.URL)
-		}
-
-		// 预过滤
-		filterResult := crawler.RuleFilter(site.URL)
-		if filterResult == crawler.FilterDiscard {
-			p.db.Model(&site).Updates(map[string]interface{}{
-				"status":        "filtered",
-				"filter_reason": "blacklist",
-			})
-			continue
-		}
-
-		// 不确定的 URL,交 LLM 判断
-		if filterResult == crawler.FilterUncertain && p.llmClient != nil {
-			isNav, confidence, err := p.llmClient.IsNavSite(ctx, site.URL)
-			if err != nil || !isNav || confidence < 0.6 {
-				p.db.Model(&site).Updates(map[string]interface{}{
-					"status":        "filtered",
-					"filter_reason": "llm_reject",
-				})
-				continue
-			}
-		}
-
-		// 爬取:先尝试静态,失败则动态
-		result := p.staticCrawler.Crawl(ctx, site.URL)
-		if result.Error != nil || result.HTML == "" {
-			log.Printf("[crawl] static failed for %s, trying dynamic", site.URL)
-			result = p.dynCrawler.Crawl(ctx, site.URL)
-		}
-
-		if result.Error != nil {
-			p.db.Model(&site).Update("status", "failed")
-			continue
-		}
-
-		// 过滤非中文页面
-		snippet := result.HTML
-		if len(snippet) > 5000 {
-			snippet = snippet[:5000]
-		}
-		if !extractor.ContainsChinese(snippet, 0) {
-			p.db.Model(&site).Updates(map[string]interface{}{
-				"status":        "filtered",
-				"filter_reason": "non_chinese",
-			})
-			continue
-		}
-
-		// 处理发现的 TG 链接
-		for _, tgLink := range result.TgLinks {
-			username := crawler.ExtractTGUsername(tgLink)
-			if username == "" {
-				continue
-			}
-
-			// t.me 死号预检
-			if tmeEnabled {
-				if !p.tmeValidator.IsAlive(ctx, username) {
-					log.Printf("[crawl] dead account: %s", username)
-					continue
-				}
-			}
-
-			raw := &model.MerchantRaw{
-				TgUsername: username,
-				SourceType: "web_crawl",
-				SourceID:   site.URL,
-				Status:     "raw",
-			}
-			p.db.Create(raw)
-			merchantCount++
-		}
-
-		// 处理普通链接(商户官网子页)
-		for _, link := range result.Links {
-			if isContextDone(ctx) {
-				break
-			}
-			// 排除 TG 链接(已处理)和无效链接
-			if strings.Contains(link, "t.me") || strings.Contains(link, "telegram.me") {
-				continue
-			}
-			if crawler.RuleFilter(link) == crawler.FilterDiscard {
-				continue
-			}
-
-			// 爬商户官网子页提取联系方式
-			p.crawlMerchantSite(ctx, link, site.URL)
-		}
-
-		p.db.Model(&site).Updates(map[string]interface{}{
-			"status":         "scraped",
-			"merchant_count": merchantCount,
-		})
-	}
-
-	log.Printf("[crawl] done: %d merchants found", merchantCount)
-	return nil
-}
-
-// crawlMerchantSite 爬取商户官网,提取联系方式
-func (p *CrawlPhase) crawlMerchantSite(ctx context.Context, siteURL, sourceURL string) {
-	subPages := []string{siteURL, siteURL + "/contact", siteURL + "/about", siteURL + "/关于我们"}
-
-	for _, page := range subPages {
-		if isContextDone(ctx) {
-			break
-		}
-
-		result := p.staticCrawler.Crawl(ctx, page)
-		if result.Error != nil || result.HTML == "" {
-			continue
-		}
-
-		info := extractor.Extract(result.HTML)
-		if !info.HasContact {
-			continue
-		}
-
-		raw := &model.MerchantRaw{
-			TgUsername: info.TgUsername,
-			Website:    siteURL,
-			Email:      info.Email,
-			Phone:      info.Phone,
-			SourceType: "web_crawl",
-			SourceID:   sourceURL,
-			Status:     "raw",
-		}
-		if raw.TgUsername != "" || raw.Email != "" || raw.Phone != "" {
-			p.db.Create(raw)
-		}
-		break // 找到联系方式就停止
-	}
-}

+ 0 - 322
internal/pipeline/phase6_clean.go

@@ -1,322 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"math"
-	"regexp"
-	"strings"
-	"time"
-
-	"gorm.io/datatypes"
-	"gorm.io/gorm"
-
-	"spider/internal/extractor"
-	"spider/internal/model"
-	"spider/internal/telegram"
-)
-
-// CleanPhase Phase 6: 数据清洗
-type CleanPhase struct {
-	db        *gorm.DB
-	tgManager *telegram.AccountManager
-	settings  Settings
-	reporter  ProgressReporter
-}
-
-// NewCleanPhase creates a new CleanPhase.
-func NewCleanPhase(db *gorm.DB, tgManager *telegram.AccountManager, settings Settings) *CleanPhase {
-	return &CleanPhase{
-		db:        db,
-		tgManager: tgManager,
-		settings:  settings,
-	}
-}
-
-func (p *CleanPhase) Name() string { return "clean" }
-
-func (p *CleanPhase) SetReporter(r ProgressReporter) { p.reporter = r }
-
-func (p *CleanPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	// 取所有 status=raw 的商户
-	var raws []model.MerchantRaw
-	q := p.db.Where("status = ?", "raw")
-	if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
-		q = q.Limit(opts.TestRun.ItemLimit)
-	}
-	q.Find(&raws)
-
-	total := len(raws)
-	log.Printf("[clean] processing %d raw merchants", total)
-
-	// 第一关:黑名单过滤
-	var pass1 []model.MerchantRaw
-	for _, raw := range raws {
-		status := p.filterBlacklist(raw)
-		if status != "" {
-			p.saveCleaned(raw, status, nil)
-		} else {
-			pass1 = append(pass1, raw)
-		}
-	}
-
-	if p.reporter != nil {
-		p.reporter("clean", 1, 3, "第一关完成,剩余 "+itoa(len(pass1))+" 条")
-	}
-
-	// 第二关:去重
-	pass2 := p.deduplicate(pass1)
-
-	if p.reporter != nil {
-		p.reporter("clean", 2, 3, "第二关完成,去重后 "+itoa(len(pass2))+" 条")
-	}
-
-	// 第三关:TG 真实性验证(有独立 rate limiter)
-	delayVerify := 3.0
-	if p.settings != nil {
-		delayVerify = p.settings.GetFloat(ctx, "tg_scraper.delay_per_verify", 3.0)
-	}
-
-	for i, raw := range pass2 {
-		if isContextDone(ctx) {
-			break
-		}
-
-		if p.reporter != nil {
-			p.reporter("clean", i+1, len(pass2), "验证: @"+raw.TgUsername)
-		}
-
-		if raw.TgUsername == "" {
-			// 没有 TG 用户名但有其他联系方式,标记为 valid
-			p.saveCleaned(raw, "valid", nil)
-			continue
-		}
-
-		userInfo, err := p.verifyTG(ctx, raw.TgUsername)
-		if err != nil {
-			log.Printf("[clean] verify error for %s: %v", raw.TgUsername, err)
-			continue
-		}
-
-		status := "invalid"
-		if userInfo != nil {
-			if userInfo.IsChannel {
-				status = "group"
-			} else if userInfo.IsBot {
-				status = "bot"
-			} else if userInfo.Exists {
-				status = "valid"
-			}
-		}
-
-		p.saveCleaned(raw, status, userInfo)
-
-		// 独立 rate limiter
-		select {
-		case <-ctx.Done():
-			return nil
-		case <-time.After(time.Duration(float64(time.Second) * delayVerify)):
-		}
-	}
-
-	log.Printf("[clean] done")
-	return nil
-}
-
-// filterBlacklist 第一关:黑名单过滤
-// 返回应被标记的状态,"" 表示通过
-func (p *CleanPhase) filterBlacklist(raw model.MerchantRaw) string {
-	// 系统 bot 黑名单
-	botNames := []string{
-		"telegram", "telegramhints", "gif", "pic", "bing", "vid",
-		"bold", "vote", "like", "sticker", "music",
-		"channel_bot", "BotFather", "SpamBot",
-	}
-	username := strings.ToLower(raw.TgUsername)
-	for _, b := range botNames {
-		if username == strings.ToLower(b) {
-			return "bot"
-		}
-	}
-	// xxxbot 后缀
-	if strings.HasSuffix(username, "bot") && len(username) > 3 {
-		return "bot"
-	}
-
-	// 邀请链接哈希(16-24位 base64)
-	if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 {
-		reBase64 := regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
-		if reBase64.MatchString(raw.TgUsername) {
-			// 计算熵:如果大写+小写+数字混合度高,认为是哈希
-			if entropy(raw.TgUsername) > 3.5 {
-				return "invalid"
-			}
-		}
-	}
-
-	// original_message 非空且不含中文
-	if raw.OriginalMessage != "" && !extractor.ContainsChinese(raw.OriginalMessage, 0) {
-		return "invalid"
-	}
-
-	return ""
-}
-
-// entropy 计算字符串的信息熵
-func entropy(s string) float64 {
-	freq := map[rune]int{}
-	for _, r := range s {
-		freq[r]++
-	}
-	n := float64(len(s))
-	h := 0.0
-	for _, count := range freq {
-		p := float64(count) / n
-		h -= p * math.Log2(p)
-	}
-	return h
-}
-
-// deduplicate 第二关:去重
-// 同 tg_username 保留信息最丰富的一条,其余标 duplicate
-func (p *CleanPhase) deduplicate(raws []model.MerchantRaw) []model.MerchantRaw {
-	// 按 tg_username 分组
-	groups := map[string][]model.MerchantRaw{}
-	for _, raw := range raws {
-		key := raw.TgUsername
-		if key == "" {
-			key = raw.Website
-		}
-		if key == "" {
-			key = raw.Email
-		}
-		if key == "" {
-			key = itoa(int(raw.ID)) // 无法去重的保留
-		}
-		groups[key] = append(groups[key], raw)
-	}
-
-	var keepers []model.MerchantRaw
-	for _, group := range groups {
-		if len(group) == 1 {
-			keepers = append(keepers, group[0])
-			continue
-		}
-
-		// 按丰富度打分,保留最高分
-		best := group[0]
-		bestScore := richness(best)
-		for _, r := range group[1:] {
-			s := richness(r)
-			if s > bestScore {
-				// 将被替换的标为 duplicate
-				p.saveCleaned(best, "duplicate", nil)
-				bestScore = s
-				best = r
-			} else {
-				p.saveCleaned(r, "duplicate", nil)
-			}
-		}
-		keepers = append(keepers, best)
-	}
-
-	return keepers
-}
-
-// richness 信息丰富度评分
-func richness(r model.MerchantRaw) int {
-	score := 0
-	if r.TgUsername != "" {
-		score++
-	}
-	if r.Website != "" {
-		score++
-	}
-	if r.Email != "" {
-		score++
-	}
-	if r.Phone != "" {
-		score++
-	}
-	if r.MerchantName != "" {
-		score++
-	}
-	return score
-}
-
-// verifyTG 调用 TG API 验证用户名
-func (p *CleanPhase) verifyTG(ctx context.Context, username string) (*telegram.UserInfo, error) {
-	if p.tgManager == nil {
-		return nil, nil
-	}
-
-	acc, err := p.tgManager.Acquire(ctx)
-	if err != nil {
-		return nil, err
-	}
-
-	if err := acc.Client.Connect(ctx); err != nil {
-		p.tgManager.Release(acc, 0)
-		return nil, err
-	}
-
-	userInfo, err := acc.Client.VerifyUser(ctx, username)
-	if err != nil {
-		if fw, ok := err.(*telegram.FloodWaitError); ok {
-			handleErr := p.tgManager.HandleFloodWait(acc, fw.Seconds)
-			return nil, handleErr
-		}
-		p.tgManager.Release(acc, 0)
-		return nil, err
-	}
-
-	p.tgManager.Release(acc, 0)
-	return userInfo, nil
-}
-
-// saveCleaned 将原始商户写入 merchants_clean
-func (p *CleanPhase) saveCleaned(raw model.MerchantRaw, status string, userInfo *telegram.UserInfo) {
-	clean := model.MerchantClean{
-		RawID:        &raw.ID,
-		MerchantName: raw.MerchantName,
-		TgUsername:   raw.TgUsername,
-		Website:      raw.Website,
-		Email:        raw.Email,
-		Phone:        raw.Phone,
-		Industry:     raw.Industry,
-		Status:       status,
-		SourceCount:  1,
-		SourceLinks:  datatypes.JSON([]byte(`[]`)),
-	}
-
-	if userInfo != nil && userInfo.Exists {
-		clean.TgFirstName = userInfo.FirstName
-		clean.TgLastName = userInfo.LastName
-		clean.IsPremium = userInfo.IsPremium
-		clean.LastOnline = userInfo.LastOnline
-		// 活跃度
-		if userInfo.LastOnline != nil {
-			days := time.Since(*userInfo.LastOnline).Hours() / 24
-			if days < 3 {
-				clean.ActiveLevel = "active"
-			} else if days < 30 {
-				clean.ActiveLevel = "moderate"
-			} else {
-				clean.ActiveLevel = "inactive"
-			}
-		}
-	}
-
-	// 冲突时按 tg_username unique 更新
-	if clean.TgUsername != "" {
-		p.db.Where(model.MerchantClean{TgUsername: clean.TgUsername}).FirstOrCreate(&clean)
-	} else {
-		p.db.Create(&clean)
-	}
-}
-
-// itoa converts int to string.
-func itoa(n int) string {
-	return fmt.Sprintf("%d", n)
-}

+ 0 - 126
internal/pipeline/phase7_score.go

@@ -1,126 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"log"
-	"math"
-
-	"gorm.io/gorm"
-
-	"spider/internal/model"
-)
-
-// ScorePhase Phase 7: 评分
-type ScorePhase struct {
-	db       *gorm.DB
-	reporter ProgressReporter
-}
-
-// NewScorePhase creates a new ScorePhase.
-func NewScorePhase(db *gorm.DB) *ScorePhase {
-	return &ScorePhase{db: db}
-}
-
-func (p *ScorePhase) Name() string { return "score" }
-
-func (p *ScorePhase) SetReporter(r ProgressReporter) { p.reporter = r }
-
-func (p *ScorePhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	var merchants []model.MerchantClean
-	p.db.Where("status = ?", "valid").Find(&merchants)
-
-	total := len(merchants)
-	log.Printf("[score] scoring %d valid merchants", total)
-
-	for i, m := range merchants {
-		if isContextDone(ctx) {
-			break
-		}
-
-		if p.reporter != nil {
-			p.reporter("score", i+1, total, "评分: @"+m.TgUsername)
-		}
-
-		score := calcScore(m)
-		p.db.Model(&m).Update("quality_score", score)
-	}
-
-	log.Printf("[score] done")
-	return nil
-}
-
-// calcScore 6 维度加权打分 (0-100)
-func calcScore(m model.MerchantClean) float64 {
-	// 维度1: member_count (权重 0.25)
-	memberScore := memberCountScore(m.MemberCount)
-
-	// 维度2: premium (权重 0.15)
-	premiumScore := 0.0
-	if m.IsPremium {
-		premiumScore = 100.0
-	}
-
-	// 维度3: activity (权重 0.25)
-	activityScore := activityLevelScore(m.ActiveLevel)
-
-	// 维度4: multi_source (权重 0.20)
-	multiScore := multiSourceScore(m.SourceCount)
-
-	// 维度5: has_website (权重 0.10)
-	websiteScore := 0.0
-	if m.Website != "" {
-		websiteScore = 100.0
-	}
-
-	// 维度6: has_email (权重 0.05)
-	emailScore := 0.0
-	if m.Email != "" {
-		emailScore = 100.0
-	}
-
-	total := memberScore*0.25 + premiumScore*0.15 + activityScore*0.25 +
-		multiScore*0.20 + websiteScore*0.10 + emailScore*0.05
-
-	return math.Round(total*100) / 100
-}
-
-func memberCountScore(count int) float64 {
-	switch {
-	case count >= 100000:
-		return 100
-	case count >= 10000:
-		return 80
-	case count >= 1000:
-		return 50
-	case count >= 100:
-		return 30
-	default:
-		return 10
-	}
-}
-
-func activityLevelScore(level string) float64 {
-	switch level {
-	case "active":
-		return 100
-	case "moderate":
-		return 50
-	case "inactive":
-		return 20
-	default:
-		return 0
-	}
-}
-
-func multiSourceScore(count int) float64 {
-	switch {
-	case count >= 4:
-		return 100
-	case count == 3:
-		return 70
-	case count == 2:
-		return 40
-	default:
-		return 10
-	}
-}

+ 0 - 112
internal/pipeline/pipeline.go

@@ -1,112 +0,0 @@
-package pipeline
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"spider/internal/model"
-
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
-)
-
-// fullPhaseOrder defines the sequential execution order for a full pipeline run.
-var fullPhaseOrder = []string{
-	"discover",
-	"search",
-	"github",
-	"scrape",
-	"crawl",
-	"clean",
-	"score",
-}
-
-// Runner Pipeline 调度器
-type Runner struct {
-	db       *gorm.DB
-	redis    *redis.Client
-	phases   map[string]Phase // 注册的 phase,key 是 phase 名称
-	reporter ProgressReporter
-}
-
-// NewRunner creates a new pipeline Runner.
-func NewRunner(db *gorm.DB, rdb *redis.Client) *Runner {
-	return &Runner{
-		db:    db,
-		redis: rdb,
-		phases: make(map[string]Phase),
-	}
-}
-
-// RegisterPhase 注册一个 phase 实现
-func (r *Runner) RegisterPhase(p Phase) {
-	r.phases[p.Name()] = p
-}
-
-// SetProgressReporter 设置进度上报函数
-func (r *Runner) SetProgressReporter(fn ProgressReporter) {
-	r.reporter = fn
-}
-
-// report calls the reporter if one is set; otherwise logs to stderr.
-func (r *Runner) report(phase string, current, total int, message string) {
-	if r.reporter != nil {
-		r.reporter(phase, current, total, message)
-	}
-}
-
-// Run 执行 pipeline
-// task.TaskType: "full" | "discover" | "search" | "github" | "scrape" | "crawl" | "clean" | "score"
-// full 类型按顺序执行所有未跳过的 phase
-// 单阶段类型直接执行对应 phase
-func (r *Runner) Run(ctx context.Context, task *model.Task, opts *Options) error {
-	if task.TaskType == "full" {
-		for _, phaseName := range fullPhaseOrder {
-			if isContextDone(ctx) {
-				return fmt.Errorf("pipeline cancelled before phase %s", phaseName)
-			}
-			if ShouldSkip(phaseName, opts.SkipPhases) {
-				log.Printf("[pipeline] skipping phase=%s (in SkipPhases)", phaseName)
-				continue
-			}
-			r.report(phaseName, 0, 0, "开始 "+phaseName)
-			if err := r.runSingle(ctx, task, phaseName, opts); err != nil {
-				log.Printf("[pipeline] phase=%s error: %v (continuing)", phaseName, err)
-			}
-			r.report(phaseName, 100, 100, phaseName+" 完成")
-		}
-		return nil
-	}
-
-	// Single-phase task
-	phaseName := task.TaskType
-	if isContextDone(ctx) {
-		return fmt.Errorf("pipeline cancelled before phase %s", phaseName)
-	}
-	r.report(phaseName, 0, 0, "开始 "+phaseName)
-	if err := r.runSingle(ctx, task, phaseName, opts); err != nil {
-		r.report(phaseName, 0, 0, phaseName+" 失败: "+err.Error())
-		return err
-	}
-	r.report(phaseName, 100, 100, phaseName+" 完成")
-	return nil
-}
-
-// runSingle 执行单个 phase
-func (r *Runner) runSingle(ctx context.Context, task *model.Task, phaseName string, opts *Options) error {
-	p, ok := r.phases[phaseName]
-	if !ok {
-		return fmt.Errorf("phase %q not registered", phaseName)
-	}
-	return p.Run(ctx, task, opts)
-}
-
-// isContextDone 检查 context 是否已取消(用于各阶段检查停止信号)
-func isContextDone(ctx context.Context) bool {
-	select {
-	case <-ctx.Done():
-		return true
-	default:
-		return false
-	}
-}

+ 31 - 0
internal/plugin/interface.go

@@ -0,0 +1,31 @@
+package plugin
+
+import "context"
+
+// MerchantData is the standard output format for all collector plugins.
+// Every plugin must produce data in this shape via the callback.
+type MerchantData struct {
+	TgUsername   string `json:"tg_username"`   // required — no tg_username, no insert
+	TgLink       string `json:"tg_link"`
+	MerchantName string `json:"merchant_name"`
+	Website      string `json:"website"`
+	Email        string `json:"email"`
+	Phone        string `json:"phone"`
+	SourceType   string `json:"source_type"`   // web / tg_channel / github
+	SourceName   string `json:"source_name"`   // specific source (page title / channel name)
+	SourceURL    string `json:"source_url"`    // source URL
+	OriginalText string `json:"original_text"` // raw text for audit
+	IndustryTag  string `json:"industry_tag"`
+}
+
+// Collector is the interface every collection plugin must implement.
+type Collector interface {
+	// Name returns the plugin name, e.g. "web_collector".
+	Name() string
+	// Run starts collection. For every merchant found, call callback.
+	// cfg carries plugin-specific configuration (keywords, limits, etc.).
+	// The function should respect ctx cancellation for graceful shutdown.
+	Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error
+	// Stop requests graceful shutdown from outside.
+	Stop() error
+}

+ 50 - 0
internal/plugin/registry.go

@@ -0,0 +1,50 @@
+package plugin
+
+import (
+	"fmt"
+	"sync"
+)
+
+// Registry holds all registered collector plugins.
+type Registry struct {
+	mu      sync.RWMutex
+	plugins map[string]Collector
+}
+
+// NewRegistry creates an empty plugin registry.
+func NewRegistry() *Registry {
+	return &Registry{plugins: make(map[string]Collector)}
+}
+
+// Register adds a collector to the registry. Panics on duplicate name.
+func (r *Registry) Register(c Collector) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	name := c.Name()
+	if _, exists := r.plugins[name]; exists {
+		panic(fmt.Sprintf("plugin %q already registered", name))
+	}
+	r.plugins[name] = c
+}
+
+// Get returns a collector by name, or an error if not found.
+func (r *Registry) Get(name string) (Collector, error) {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	c, ok := r.plugins[name]
+	if !ok {
+		return nil, fmt.Errorf("plugin %q not registered", name)
+	}
+	return c, nil
+}
+
+// List returns all registered plugin names.
+func (r *Registry) List() []string {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+	names := make([]string, 0, len(r.plugins))
+	for name := range r.plugins {
+		names = append(names, name)
+	}
+	return names
+}

+ 263 - 0
internal/plugins/githubcollector/collector.go

@@ -0,0 +1,263 @@
+package githubcollector
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"spider/internal/extractor"
+	"spider/internal/model"
+	"spider/internal/plugin"
+	"spider/internal/store"
+)
+
+// Collector implements plugin.Collector for GitHub README mining.
+// Searches GitHub repos by keywords, extracts t.me links from READMEs.
+type Collector struct {
+	token   string // GitHub token (optional)
+	store   *store.Store
+	http    *http.Client
+	stopped atomic.Bool
+}
+
+// New creates a new GitHub collector.
+func New(token string, s *store.Store) *Collector {
+	return &Collector{
+		token: token,
+		store: s,
+		http:  &http.Client{Timeout: 15 * time.Second},
+	}
+}
+
+func (c *Collector) Name() string { return "github_collector" }
+
+func (c *Collector) Stop() error {
+	c.stopped.Store(true)
+	return nil
+}
+
+// Run searches GitHub for repos matching keywords, extracts t.me links from READMEs.
+//
+// cfg keys:
+//   - "keywords": []string — search keywords
+//   - "repos_limit": int — max repos to process (default 50)
+func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
+	c.stopped.Store(false)
+
+	keywords, _ := cfg["keywords"].([]string)
+	if len(keywords) == 0 {
+		log.Println("[github_collector] no keywords provided")
+		return nil
+	}
+
+	reposLimit := 50
+	if v, ok := cfg["repos_limit"].(int); ok && v > 0 {
+		reposLimit = v
+	}
+
+	queries := make([]string, 0, len(keywords))
+	for _, kw := range keywords {
+		queries = append(queries, fmt.Sprintf("%s telegram", kw))
+	}
+
+	reposPerQuery := 1
+	if len(queries) > 0 {
+		reposPerQuery = reposLimit/len(queries) + 1
+	}
+
+	found := 0
+
+	for _, query := range queries {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+
+		log.Printf("[github_collector] searching: %s", query)
+
+		repos, err := c.searchRepos(ctx, query, reposPerQuery)
+		if err != nil {
+			log.Printf("[github_collector] search error: %v", err)
+			continue
+		}
+
+		for _, repo := range repos {
+			if c.stopped.Load() || ctx.Err() != nil {
+				break
+			}
+
+			readme, err := c.fetchReadme(ctx, repo)
+			if err != nil {
+				continue
+			}
+
+			// Filter: README must contain Chinese
+			preview := readme
+			if len(preview) > 5000 {
+				preview = preview[:5000]
+			}
+			if !extractor.ContainsChinese(preview, 0) {
+				continue
+			}
+
+			// Extract t.me links
+			links := extractTMeLinks(readme)
+			for _, link := range links {
+				// Context check: 200 chars around link must contain Chinese
+				idx := strings.Index(readme, link)
+				if idx < 0 {
+					continue
+				}
+				start := idx - 200
+				if start < 0 {
+					start = 0
+				}
+				end := idx + len(link) + 200
+				if end > len(readme) {
+					end = len(readme)
+				}
+				context200 := readme[start:end]
+				if !extractor.ContainsChinese(context200, 0) {
+					continue
+				}
+
+				username := extractTGUsername(link)
+				if username == "" {
+					continue
+				}
+
+				// Save channel to DB
+				c.store.UpsertChannel(&model.Channel{
+					Username: username,
+					Source:   "github",
+					Status:   "pending",
+				})
+
+				callback(plugin.MerchantData{
+					TgUsername: username,
+					TgLink:     "https://t.me/" + username,
+					SourceType: "github",
+					SourceName: repo,
+					SourceURL:  fmt.Sprintf("https://github.com/%s", repo),
+				})
+				found++
+			}
+
+			// Delay between repos
+			select {
+			case <-ctx.Done():
+				return nil
+			case <-time.After(2 * time.Second):
+			}
+		}
+
+		// Delay between queries
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-time.After(5 * time.Second):
+		}
+	}
+
+	log.Printf("[github_collector] done: %d channels found", found)
+	return nil
+}
+
+func (c *Collector) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
+	perPage := limit
+	if perPage > 30 {
+		perPage = 30
+	}
+	apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
+		url.QueryEscape(query), perPage)
+
+	req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("Accept", "application/vnd.github.v3+json")
+	if c.token != "" {
+		req.Header.Set("Authorization", "token "+c.token)
+	}
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	var result struct {
+		Items []struct {
+			FullName string `json:"full_name"`
+		} `json:"items"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return nil, err
+	}
+
+	var repos []string
+	for _, item := range result.Items {
+		repos = append(repos, item.FullName)
+	}
+	return repos, nil
+}
+
+func (c *Collector) fetchReadme(ctx context.Context, fullName string) (string, error) {
+	rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/main/README.md", fullName)
+	req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
+	if err != nil {
+		return "", err
+	}
+	if c.token != "" {
+		req.Header.Set("Authorization", "token "+c.token)
+	}
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return "", err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == 404 {
+		masterURL := strings.Replace(rawURL, "/main/", "/master/", 1)
+		req2, err := http.NewRequestWithContext(ctx, "GET", masterURL, nil)
+		if err != nil {
+			return "", err
+		}
+		if c.token != "" {
+			req2.Header.Set("Authorization", "token "+c.token)
+		}
+		resp2, err := c.http.Do(req2)
+		if err != nil {
+			return "", err
+		}
+		defer resp2.Body.Close()
+		data, _ := io.ReadAll(resp2.Body)
+		return string(data), nil
+	}
+
+	data, _ := io.ReadAll(resp.Body)
+	return string(data), nil
+}
+
+var reTMeLink = regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)
+var reTGUsername = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
+
+func extractTMeLinks(text string) []string {
+	return reTMeLink.FindAllString(text, -1)
+}
+
+func extractTGUsername(link string) string {
+	m := reTGUsername.FindStringSubmatch(link)
+	if len(m) > 1 {
+		return m[1]
+	}
+	return ""
+}

+ 323 - 0
internal/plugins/tgcollector/collector.go

@@ -0,0 +1,323 @@
+package tgcollector
+
+import (
+	"context"
+	"log"
+	"regexp"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"spider/internal/extractor"
+	"spider/internal/llm"
+	"spider/internal/model"
+	"spider/internal/plugin"
+	"spider/internal/store"
+	"spider/internal/telegram"
+)
+
+// Collector implements plugin.Collector for TG channel collection.
+// Combines BFS channel discovery (from seeds) + message scraping.
+// AI: regex first, LLM fallback only for non-standard contact formats.
+type Collector struct {
+	tgManager *telegram.AccountManager
+	llmClient *llm.Client // can be nil
+	store     *store.Store
+	stopped   atomic.Bool
+}
+
+// New creates a new TG collector.
+func New(tgManager *telegram.AccountManager, llmClient *llm.Client, s *store.Store) *Collector {
+	return &Collector{
+		tgManager: tgManager,
+		llmClient: llmClient,
+		store:     s,
+	}
+}
+
+func (c *Collector) Name() string { return "tg_collector" }
+
+func (c *Collector) Stop() error {
+	c.stopped.Store(true)
+	return nil
+}
+
+// Run executes the TG collection pipeline:
+// 1. BFS discover channels from seeds
+// 2. Scrape messages from discovered channels
+// 3. Extract merchants via regex (+ LLM fallback)
+//
+// cfg keys:
+//   - "seeds": []string — seed channel names
+//   - "max_depth": int — BFS max depth (default 3)
+//   - "max_channels": int — max channels to discover (default 500)
+//   - "message_limit": int — messages per channel (default 500)
+func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
+	c.stopped.Store(false)
+
+	if c.tgManager == nil {
+		log.Println("[tg_collector] no TG account manager, skipping")
+		return nil
+	}
+
+	seeds, _ := cfg["seeds"].([]string)
+	if len(seeds) == 0 {
+		log.Println("[tg_collector] no seeds provided")
+		return nil
+	}
+
+	maxDepth := getIntCfg(cfg, "max_depth", 3)
+	maxChannels := getIntCfg(cfg, "max_channels", 500)
+	msgLimit := getIntCfg(cfg, "message_limit", 500)
+
+	// Phase 1: BFS channel discovery
+	channels := c.discover(ctx, seeds, maxDepth, maxChannels)
+	log.Printf("[tg_collector] discovered %d channels", len(channels))
+
+	// Phase 2: Scrape each channel
+	for i, ch := range channels {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+
+		log.Printf("[tg_collector] scraping %d/%d: @%s", i+1, len(channels), ch)
+		c.scrapeChannel(ctx, ch, msgLimit, callback)
+
+		// Delay between channels
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-time.After(5 * time.Second):
+		}
+	}
+
+	log.Println("[tg_collector] done")
+	return nil
+}
+
+type queueItem struct {
+	Username string
+	Depth    int
+	Source   string
+}
+
+func (c *Collector) discover(ctx context.Context, seeds []string, maxDepth, maxTotal int) []string {
+	queue := make([]queueItem, 0, len(seeds))
+	for _, s := range seeds {
+		queue = append(queue, queueItem{Username: cleanUsername(s), Depth: 0, Source: "seed"})
+	}
+
+	visited := map[string]bool{}
+	var result []string
+
+	for len(queue) > 0 && len(result) < maxTotal {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+
+		item := queue[0]
+		queue = queue[1:]
+
+		username := cleanUsername(item.Username)
+		if username == "" || visited[username] {
+			continue
+		}
+		visited[username] = true
+
+		acc, err := c.tgManager.Acquire(ctx)
+		if err != nil {
+			log.Printf("[tg_collector] no available TG account: %v", err)
+			break
+		}
+
+		if err := acc.Client.Connect(ctx); err != nil {
+			log.Printf("[tg_collector] connect failed: %v", err)
+			c.tgManager.Release(acc, 0)
+			continue
+		}
+
+		_, err = acc.Client.GetChannelInfo(ctx, username)
+		if err != nil {
+			if fw, ok := err.(*telegram.FloodWaitError); ok {
+				c.tgManager.HandleFloodWait(acc, fw.Seconds)
+			} else {
+				c.tgManager.Release(acc, 0)
+			}
+			continue
+		}
+
+		// Save channel to DB
+		c.store.UpsertChannel(&model.Channel{
+			Username: username,
+			Source:   item.Source,
+			Status:   "pending",
+		})
+		result = append(result, username)
+
+		// BFS: read messages to find more channels
+		if item.Depth < maxDepth {
+			msgs, err := acc.Client.GetMessages(ctx, username, 0, 100)
+			if err == nil {
+				for _, msg := range msgs {
+					if msg.ForwardFromChannel != "" {
+						fwd := cleanUsername(msg.ForwardFromChannel)
+						if fwd != "" && !visited[fwd] {
+							queue = append(queue, queueItem{fwd, item.Depth + 1, "snowball"})
+						}
+					}
+					for _, link := range msg.Links {
+						name := extractUsernameFromLink(link)
+						if name != "" && !visited[name] {
+							queue = append(queue, queueItem{name, item.Depth + 1, "snowball"})
+						}
+					}
+				}
+			}
+		}
+
+		c.tgManager.Release(acc, 0)
+
+		select {
+		case <-ctx.Done():
+			return result
+		case <-time.After(5 * time.Second):
+		}
+	}
+
+	return result
+}
+
+func (c *Collector) scrapeChannel(ctx context.Context, username string, msgLimit int, callback func(plugin.MerchantData)) {
+	acc, err := c.tgManager.Acquire(ctx)
+	if err != nil {
+		return
+	}
+
+	if err := acc.Client.Connect(ctx); err != nil {
+		c.tgManager.Release(acc, 0)
+		return
+	}
+
+	// Read pinned messages
+	pinnedMsgs, _ := acc.Client.GetPinnedMessages(ctx, username)
+	c.processMessages(ctx, pinnedMsgs, username, callback)
+
+	// Read historical messages
+	offsetID := 0
+	fetched := 0
+	for fetched < msgLimit {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+
+		batchSize := 100
+		if msgLimit-fetched < batchSize {
+			batchSize = msgLimit - fetched
+		}
+
+		msgs, err := acc.Client.GetMessages(ctx, username, offsetID, batchSize)
+		if err != nil {
+			if fw, ok := err.(*telegram.FloodWaitError); ok {
+				c.tgManager.HandleFloodWait(acc, fw.Seconds)
+				acc = nil
+			}
+			break
+		}
+		if len(msgs) == 0 {
+			break
+		}
+
+		c.processMessages(ctx, msgs, username, callback)
+
+		offsetID = msgs[len(msgs)-1].ID
+		fetched += len(msgs)
+	}
+
+	if acc != nil {
+		c.tgManager.Release(acc, 0)
+	}
+
+	// Update channel status
+	c.store.DB.Model(&model.Channel{}).Where("username = ?", username).
+		Update("status", "scraped")
+}
+
+func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message, channelUsername string, callback func(plugin.MerchantData)) {
+	for _, msg := range msgs {
+		if msg.IsService || msg.Text == "" {
+			continue
+		}
+		if !extractor.ContainsChinese(msg.Text, 0) {
+			continue
+		}
+		if !extractor.HasContact(msg.Text) {
+			continue
+		}
+
+		// Regex first
+		info := extractor.Extract(msg.Text)
+		merchantName := ""
+		industry := ""
+
+		// LLM fallback only when regex found no TG username
+		if info.TgUsername == "" && c.llmClient != nil {
+			merchantInfo, err := c.llmClient.ParseMerchant(ctx, msg.Text)
+			if err == nil && merchantInfo != nil && merchantInfo.TgUsername != "" {
+				info.TgUsername = strings.TrimPrefix(merchantInfo.TgUsername, "@")
+				if merchantInfo.Website != "" && info.Website == "" {
+					info.Website = merchantInfo.Website
+				}
+				if merchantInfo.Email != "" && info.Email == "" {
+					info.Email = merchantInfo.Email
+				}
+				if merchantInfo.Phone != "" && info.Phone == "" {
+					info.Phone = merchantInfo.Phone
+				}
+				merchantName = extractor.CleanMerchantName(merchantInfo.MerchantName)
+				industry = merchantInfo.Industry
+			}
+		}
+
+		if info.TgUsername == "" {
+			continue
+		}
+
+		callback(plugin.MerchantData{
+			TgUsername:   info.TgUsername,
+			TgLink:       "https://t.me/" + info.TgUsername,
+			MerchantName: merchantName,
+			Website:      info.Website,
+			Email:        info.Email,
+			Phone:        info.Phone,
+			SourceType:   "tg_channel",
+			SourceName:   channelUsername,
+			SourceURL:    "https://t.me/" + channelUsername,
+			OriginalText: msg.Text,
+			IndustryTag:  industry,
+		})
+	}
+}
+
+func cleanUsername(s string) string {
+	return strings.TrimPrefix(strings.TrimSpace(s), "@")
+}
+
+var reUsernameFromLink = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
+
+func extractUsernameFromLink(link string) string {
+	m := reUsernameFromLink.FindStringSubmatch(link)
+	if len(m) > 1 {
+		return m[1]
+	}
+	return ""
+}
+
+func getIntCfg(cfg map[string]any, key string, def int) int {
+	if v, ok := cfg[key].(int); ok {
+		return v
+	}
+	if v, ok := cfg[key].(float64); ok {
+		return int(v)
+	}
+	return def
+}

+ 243 - 0
internal/plugins/webcollector/collector.go

@@ -0,0 +1,243 @@
+package webcollector
+
+import (
+	"context"
+	"log"
+	"net/url"
+	"regexp"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"spider/internal/crawler"
+	"spider/internal/extractor"
+	"spider/internal/plugin"
+	"spider/internal/search"
+)
+
+// Collector implements plugin.Collector for web-based merchant collection.
+// Combines search (Google via Serper) + page crawling + contact extraction.
+// NO AI — pure regex and rule-based filtering.
+type Collector struct {
+	serper       *search.SerperClient // keyword search backend; may be nil (Run then no-ops)
+	static       *crawler.StaticCrawler // plain-HTTP fetcher, tried first
+	dynamic      *crawler.DynamicCrawler // fallback fetcher for pages the static crawl cannot read
+	tmeValidator *crawler.TMeValidator // liveness probe for t.me usernames
+	stopped      atomic.Bool // cooperative stop flag set by Stop(), polled by Run's loops
+}
+
+// New creates a new web collector.
+// Crawlers and the t.me validator are constructed eagerly; serper may be
+// nil, in which case Run logs a message and returns without work.
+func New(serper *search.SerperClient) *Collector {
+	return &Collector{
+		serper:       serper,
+		static:       crawler.NewStaticCrawler(),
+		dynamic:      crawler.NewDynamicCrawler(),
+		tmeValidator: crawler.NewTMeValidator(),
+	}
+}
+
+// Name returns the plugin identifier used by the plugin registry.
+func (c *Collector) Name() string { return "web_collector" }
+
+// Stop requests a cooperative shutdown; Run exits between work items.
+// Always returns nil.
+func (c *Collector) Stop() error {
+	c.stopped.Store(true)
+	return nil
+}
+
+// Run executes the web collection pipeline:
+// 1. For each keyword, search via Serper API
+// 2. Classify results: t.me links -> direct extract, web pages -> crawl
+// 3. Crawl pages, extract TG usernames and contact info
+// 4. Call callback for each merchant found
+//
+// cfg keys: (none required — keywords come from DB via the task manager)
+// The cfg map can contain:
+//   - "keywords": []string — override keywords (optional)
+func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
+	c.stopped.Store(false)
+
+	if c.serper == nil {
+		log.Println("[web_collector] no serper client configured, skipping")
+		return nil
+	}
+
+	keywords, _ := cfg["keywords"].([]string)
+	if len(keywords) == 0 {
+		log.Println("[web_collector] no keywords provided")
+		return nil
+	}
+
+	for _, kw := range keywords {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+
+		log.Printf("[web_collector] searching: %s", kw)
+
+		results, err := c.serper.Search(ctx, kw)
+		if err != nil {
+			log.Printf("[web_collector] search error for %q: %v", kw, err)
+			continue
+		}
+
+		for _, r := range results {
+			if c.stopped.Load() || ctx.Err() != nil {
+				break
+			}
+
+			classification := search.ClassifyURL(r.URL)
+
+			switch classification {
+			case "tg_channel":
+				// Direct t.me link — extract username immediately
+				username := extractTGUsername(r.URL)
+				if username == "" {
+					continue
+				}
+				callback(plugin.MerchantData{
+					TgUsername: username,
+					TgLink:     "https://t.me/" + username,
+					SourceType: "web",
+					SourceName: r.Title,
+					SourceURL:  r.URL,
+				})
+
+			case "nav_site":
+				// Crawl the page for TG links and contacts
+				c.crawlPage(ctx, r.URL, r.Title, callback)
+
+			default:
+				// "discard" or unknown — also try rule filter for non-blacklisted
+				if crawler.RuleFilter(r.URL) != crawler.FilterDiscard {
+					c.crawlPage(ctx, r.URL, r.Title, callback)
+				}
+			}
+		}
+
+		// Delay between keywords
+		select {
+		case <-ctx.Done():
+			return nil
+		case <-time.After(2 * time.Second):
+		}
+	}
+
+	log.Println("[web_collector] done")
+	return nil
+}
+
+// crawlPage fetches a page and extracts merchants from it.
+// Flow: rule filter -> static crawl (dynamic fallback) -> Chinese-content
+// gate -> emit merchants for on-page t.me links -> one-level crawl of
+// other non-blacklisted links for contact info.
+func (c *Collector) crawlPage(ctx context.Context, pageURL, title string, callback func(plugin.MerchantData)) {
+	// Rule-based filter (no LLM)
+	filterResult := crawler.RuleFilter(pageURL)
+	if filterResult == crawler.FilterDiscard {
+		return
+	}
+	// FilterUncertain: per requirements, discard without AI
+	// FilterValid: proceed
+	// NOTE(review): despite the note above, FilterUncertain currently
+	// falls through and is crawled like FilterValid — confirm intent.
+
+	// Try static first, fallback to dynamic
+	result := c.static.Crawl(ctx, pageURL)
+	if result.Error != nil || result.HTML == "" {
+		result = c.dynamic.Crawl(ctx, pageURL)
+	}
+	if result.Error != nil || result.HTML == "" {
+		return
+	}
+
+	// Chinese content filter — only the first 5000 bytes are inspected
+	// to bound the scan cost. NOTE(review): the byte slice may split a
+	// multi-byte UTF-8 sequence at the cut; at most the boundary rune is
+	// affected.
+	snippet := result.HTML
+	if len(snippet) > 5000 {
+		snippet = snippet[:5000]
+	}
+	if !extractor.ContainsChinese(snippet, 0) {
+		return
+	}
+
+	// Process t.me links found on the page
+	for _, tgLink := range result.TgLinks {
+		username := crawler.ExtractTGUsername(tgLink)
+		if username == "" {
+			continue
+		}
+		// t.me dead check (free, unlimited)
+		if !c.tmeValidator.IsAlive(ctx, username) {
+			continue
+		}
+		callback(plugin.MerchantData{
+			TgUsername: username,
+			TgLink:     "https://t.me/" + username,
+			SourceType: "web",
+			SourceName: title,
+			SourceURL:  pageURL,
+		})
+	}
+
+	// Process other links — crawl merchant sub-pages for contact info
+	for _, link := range result.Links {
+		if c.stopped.Load() || ctx.Err() != nil {
+			break
+		}
+		// Skip TG links (already processed) and blacklisted
+		// (substring match, so any URL containing "t.me" is skipped)
+		if strings.Contains(link, "t.me") || strings.Contains(link, "telegram.me") {
+			continue
+		}
+		if crawler.RuleFilter(link) == crawler.FilterDiscard {
+			continue
+		}
+		c.crawlMerchantSite(ctx, link, pageURL, callback)
+	}
+}
+
+// crawlMerchantSite crawls a merchant's website for contact info.
+// Tries the landing page plus the common /contact and /about sub-pages,
+// stopping at the first page that yields a contact with a TG username.
+func (c *Collector) crawlMerchantSite(ctx context.Context, siteURL, sourceURL string, callback func(plugin.MerchantData)) {
+	// Strip a trailing "/" before joining sub-paths so they do not
+	// become "https://host//contact" (previously broke sites whose
+	// links end with a slash).
+	base := strings.TrimSuffix(siteURL, "/")
+	subPages := []string{siteURL, base + "/contact", base + "/about"}
+
+	for _, page := range subPages {
+		if ctx.Err() != nil {
+			break
+		}
+		result := c.static.Crawl(ctx, page)
+		if result.Error != nil || result.HTML == "" {
+			continue
+		}
+
+		info := extractor.Extract(result.HTML)
+		if !info.HasContact {
+			continue
+		}
+		if info.TgUsername == "" {
+			continue // per requirements: no tg_username = don't insert
+		}
+
+		callback(plugin.MerchantData{
+			TgUsername:   info.TgUsername,
+			TgLink:       "https://t.me/" + info.TgUsername,
+			Website:      siteURL,
+			Email:        info.Email,
+			Phone:        info.Phone,
+			SourceType:   "web",
+			SourceName:   extractDomain(siteURL),
+			SourceURL:    sourceURL,
+			OriginalText: "",
+		})
+		break // found contact info, stop
+	}
+}
+
+// reTGUsername matches the username portion of a t.me / telegram.me URL.
+var reTGUsername = regexp.MustCompile(`(?:t(?:elegram)?\.me)/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
+
+// extractTGUsername returns the TG username embedded in rawURL, or ""
+// when the URL is not a recognizable t.me link.
+func extractTGUsername(rawURL string) string {
+	groups := reTGUsername.FindStringSubmatch(rawURL)
+	if len(groups) < 2 {
+		return ""
+	}
+	return groups[1]
+}
+
+// extractDomain returns the hostname of rawURL, or "" when the URL
+// cannot be parsed.
+func extractDomain(rawURL string) string {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return ""
+	}
+	return parsed.Hostname()
+}

+ 87 - 0
internal/processor/blacklist.go

@@ -0,0 +1,87 @@
+package processor
+
+import (
+	"math"
+	"regexp"
+	"spider/internal/extractor"
+	"spider/internal/model"
+	"strings"
+)
+
+// BlacklistResult holds the outcome of blacklist filtering.
+type BlacklistResult struct {
+	Passed  []model.MerchantRaw // records that survived every rule
+	Blocked []FilteredMerchant  // rejected records, each with a reason
+}
+
+// FilteredMerchant pairs a raw merchant with its rejection status.
+type FilteredMerchant struct {
+	Raw    model.MerchantRaw
+	Status string // bot / invalid
+}
+
+// systemBots are well-known Telegram system/utility account names that
+// can never be merchants (compared case-insensitively in checkBlacklist).
+var systemBots = []string{
+	"telegram", "telegramhints", "gif", "pic", "bing", "vid",
+	"bold", "vote", "like", "sticker", "music",
+	"channel_bot", "botfather", "spambot",
+}
+
+// reBase64 matches strings that look like invite-link hashes:
+// 16-24 chars of URL-safe base64 alphabet (entropy is checked separately).
+var reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
+
+// FilterBlacklist applies blacklist rules to raw merchants, splitting
+// them into records that pass and records blocked with a reason status.
+func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult {
+	var out BlacklistResult
+	for _, candidate := range raws {
+		reason := checkBlacklist(candidate)
+		if reason == "" {
+			out.Passed = append(out.Passed, candidate)
+			continue
+		}
+		out.Blocked = append(out.Blocked, FilteredMerchant{Raw: candidate, Status: reason})
+	}
+	return out
+}
+
+// checkBlacklist returns the rejection status for raw ("bot" or
+// "invalid"), or "" when the record passes all rules. Rules are checked
+// in order: system bots, xxxbot suffix, invite-link hash, non-Chinese text.
+func checkBlacklist(raw model.MerchantRaw) string {
+	username := strings.ToLower(raw.TgUsername)
+
+	// Known Telegram system accounts are never merchants.
+	for _, known := range systemBots {
+		if username == known {
+			return "bot"
+		}
+	}
+
+	// Conventional bot naming: anything longer than "bot" ending in "bot".
+	if len(username) > 3 && strings.HasSuffix(username, "bot") {
+		return "bot"
+	}
+
+	// Invite-link hashes: 16-24 chars of base64 alphabet with high entropy.
+	if n := len(raw.TgUsername); n >= 16 && n <= 24 {
+		if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 {
+			return "invalid"
+		}
+	}
+
+	// Source text present but not Chinese — out of scope.
+	if raw.OriginalText != "" && !extractor.ContainsChinese(raw.OriginalText, 0) {
+		return "invalid"
+	}
+
+	return ""
+}
+
+// entropy returns the Shannon entropy (bits per symbol) of s, computed
+// over Unicode code points.
+//
+// Fix: the symbol total is now derived from the rune frequencies rather
+// than len(s) (which counts BYTES), so multi-byte UTF-8 input no longer
+// skews the probabilities. For the ASCII base64-like usernames this is
+// called on, results are unchanged.
+func entropy(s string) float64 {
+	freq := map[rune]int{}
+	total := 0
+	for _, r := range s {
+		freq[r]++
+		total++
+	}
+	if total == 0 {
+		return 0 // empty string: defined as zero entropy
+	}
+	n := float64(total)
+	h := 0.0
+	for _, count := range freq {
+		p := float64(count) / n
+		h -= p * math.Log2(p)
+	}
+	return h
+}

+ 119 - 0
internal/processor/dedup.go

@@ -0,0 +1,119 @@
+package processor
+
+import (
+	"encoding/json"
+	"fmt"
+	"spider/internal/model"
+)
+
+// SourceInfo describes where a merchant was discovered.
+// Serialized by MarshalSources into the merchants_clean.all_sources
+// JSON array.
+type SourceInfo struct {
+	SourceType string `json:"source_type"`
+	SourceName string `json:"source_name"`
+	SourceURL  string `json:"source_url"`
+}
+
+// DedupResult holds keepers and duplicates after deduplication.
+// Keepers are the merged survivors; Duplicates are the losing group
+// members (callers in the pipeline persist them with status "duplicate").
+type DedupResult struct {
+	Keepers    []MergedMerchant
+	Duplicates []model.MerchantRaw
+}
+
+// MergedMerchant is the best raw record plus merged source info.
+type MergedMerchant struct {
+	Best        model.MerchantRaw // richest record of its group (see richness)
+	AllSources  []SourceInfo      // one entry per group member, keeper included
+	SourceCount int               // number of raw records merged into this one
+}
+
+// Deduplicate groups by tg_username, keeps the richest record, merges sources.
+// The grouping key falls back to website, then email, then a synthetic
+// per-row key, so a record with no identifiers forms its own group.
+func Deduplicate(raws []model.MerchantRaw) DedupResult {
+	groups := map[string][]model.MerchantRaw{}
+	for _, raw := range raws {
+		key := dedupKey(raw)
+		groups[key] = append(groups[key], raw)
+	}
+
+	var out DedupResult
+	for _, group := range groups {
+		// Every member contributes its source info, keeper included.
+		sources := make([]SourceInfo, 0, len(group))
+		for _, member := range group {
+			sources = append(sources, SourceInfo{
+				SourceType: member.SourceType,
+				SourceName: member.SourceName,
+				SourceURL:  member.SourceURL,
+			})
+		}
+
+		// Pick the record with the highest richness; ties keep the
+		// earliest member.
+		best := 0
+		for i := 1; i < len(group); i++ {
+			if richness(group[i]) > richness(group[best]) {
+				best = i
+			}
+		}
+
+		// Everything except the keeper is reported as a duplicate.
+		for i, member := range group {
+			if i != best {
+				out.Duplicates = append(out.Duplicates, member)
+			}
+		}
+
+		out.Keepers = append(out.Keepers, MergedMerchant{
+			Best:        group[best],
+			AllSources:  sources,
+			SourceCount: len(group),
+		})
+	}
+
+	return out
+}
+
+// dedupKey returns the grouping key for one raw record:
+// tg_username, else website, else email, else a synthetic per-row key.
+func dedupKey(raw model.MerchantRaw) string {
+	switch {
+	case raw.TgUsername != "":
+		return raw.TgUsername
+	case raw.Website != "":
+		return raw.Website
+	case raw.Email != "":
+		return raw.Email
+	default:
+		return fmt.Sprintf("_id_%d", raw.ID)
+	}
+}
+
+// richness counts how many identifying/contact fields of r are populated;
+// used by Deduplicate to pick the most informative record of a group.
+func richness(r model.MerchantRaw) int {
+	fields := []string{r.TgUsername, r.Website, r.Email, r.Phone, r.MerchantName}
+	score := 0
+	for _, f := range fields {
+		if f != "" {
+			score++
+		}
+	}
+	return score
+}
+
+// MarshalSources converts source list to JSON bytes. Marshaling this
+// plain struct slice cannot fail, so the error is deliberately dropped.
+func MarshalSources(sources []SourceInfo) []byte {
+	encoded, _ := json.Marshal(sources)
+	return encoded
+}

+ 173 - 0
internal/processor/pipeline.go

@@ -0,0 +1,173 @@
+package processor
+
+import (
+	"context"
+	"log"
+	"spider/internal/model"
+	"spider/internal/store"
+	"time"
+
+	"gorm.io/datatypes"
+)
+
+// ProgressFn reports processing progress.
+// step names the pipeline stage ("tmechecker", "blacklist", "dedup",
+// "tagger"); current/total give the stage position; message is a
+// human-readable status string.
+type ProgressFn func(step string, current, total int, message string)
+
+// Processor runs the 4-step cleaning pipeline.
+type Processor struct {
+	store     *store.Store // persistence for raw/clean merchant rows
+	checker   *TMeChecker // step 1: t.me dead-account pre-check
+	onProgress ProgressFn // optional progress callback; may be nil
+}
+
+// NewProcessor creates a new Processor.
+func NewProcessor(s *store.Store) *Processor {
+	return &Processor{
+		store:   s,
+		checker: NewTMeChecker(),
+	}
+}
+
+// SetProgressFn sets the progress callback.
+// Plain field write — do not call concurrently with Process.
+func (p *Processor) SetProgressFn(fn ProgressFn) {
+	p.onProgress = fn
+}
+
+// report invokes the progress callback when one is configured.
+func (p *Processor) report(step string, current, total int, msg string) {
+	if p.onProgress != nil {
+		p.onProgress(step, current, total, msg)
+	}
+}
+
+// ProcessResult summarizes a processor run.
+type ProcessResult struct {
+	InputCount     int // raw records picked up (status="raw")
+	AliveCount     int // survivors of the t.me dead-account check
+	PassedCount    int // survivors of the blacklist filter
+	DedupedCount   int // unique merchants after dedup/merge
+	OutputCount    int // records written as "valid"
+	HotCount       int // graded Hot
+	WarmCount      int // graded Warm
+	ColdCount      int // graded Cold
+}
+
+// Process runs the 4-step pipeline on raw merchants with status="raw":
+//  1. t.me dead-account pre-check
+//  2. blacklist filter (system bots / invite hashes / non-Chinese text)
+//  3. dedup + source merge
+//  4. industry tagging + Hot/Warm/Cold grading
+// Rejected records are still persisted to merchants_clean under a
+// non-"valid" status, and every consumed raw row is marked "done".
+// Cancellation between steps returns ctx.Err() alongside the partial
+// counts accumulated so far.
+func (p *Processor) Process(ctx context.Context) (*ProcessResult, error) {
+	raws, err := p.store.ListRawByStatus("raw", 0)
+	if err != nil {
+		return nil, err
+	}
+
+	result := &ProcessResult{InputCount: len(raws)}
+	log.Printf("[processor] processing %d raw merchants", len(raws))
+
+	if len(raws) == 0 {
+		return result, nil
+	}
+
+	// Step 1: t.me dead account check
+	p.report("tmechecker", 0, len(raws), "开始死号预检...")
+	alive, dead := p.checker.Filter(ctx, raws)
+	result.AliveCount = len(alive)
+	log.Printf("[processor] step1 tmechecker: %d alive, %d dead", len(alive), len(dead))
+	p.report("tmechecker", len(raws), len(raws), "死号预检完成")
+
+	// Mark dead ones (persisted as invalid/Cold so they are not re-fetched)
+	for _, d := range dead {
+		p.saveClean(d, "invalid", "", "Cold", nil, 1)
+		p.store.UpdateRawStatus(d.ID, "done")
+	}
+
+	if ctx.Err() != nil {
+		return result, ctx.Err()
+	}
+
+	// Step 2: Blacklist filter
+	p.report("blacklist", 0, len(alive), "黑名单过滤...")
+	blResult := FilterBlacklist(alive)
+	result.PassedCount = len(blResult.Passed)
+	log.Printf("[processor] step2 blacklist: %d passed, %d blocked", len(blResult.Passed), len(blResult.Blocked))
+	p.report("blacklist", len(alive), len(alive), "黑名单完成")
+
+	// Blocked records keep their rejection reason ("bot"/"invalid") as status
+	for _, b := range blResult.Blocked {
+		p.saveClean(b.Raw, b.Status, "", "Cold", nil, 1)
+		p.store.UpdateRawStatus(b.Raw.ID, "done")
+	}
+
+	if ctx.Err() != nil {
+		return result, ctx.Err()
+	}
+
+	// Step 3: Dedup + merge sources
+	p.report("dedup", 0, len(blResult.Passed), "去重合并...")
+	dedupResult := Deduplicate(blResult.Passed)
+	result.DedupedCount = len(dedupResult.Keepers)
+	log.Printf("[processor] step3 dedup: %d keepers, %d duplicates", len(dedupResult.Keepers), len(dedupResult.Duplicates))
+	p.report("dedup", len(blResult.Passed), len(blResult.Passed), "去重完成")
+
+	for _, dup := range dedupResult.Duplicates {
+		p.saveClean(dup, "duplicate", "", "Cold", nil, 1)
+		p.store.UpdateRawStatus(dup.ID, "done")
+	}
+
+	if ctx.Err() != nil {
+		return result, ctx.Err()
+	}
+
+	// Step 4: Tag + grade (no ctx check after this point — tagging is
+	// pure CPU work and each keeper is persisted as it is graded)
+	p.report("tagger", 0, len(dedupResult.Keepers), "打标签分等级...")
+	tagged := TagAndGrade(dedupResult.Keepers)
+	log.Printf("[processor] step4 tagger: %d tagged", len(tagged))
+
+	for _, t := range tagged {
+		switch t.Level {
+		case "Hot":
+			result.HotCount++
+		case "Warm":
+			result.WarmCount++
+		case "Cold":
+			result.ColdCount++
+		}
+
+		sources := MarshalSources(t.Merged.AllSources)
+		p.saveClean(t.Merged.Best, "valid", t.IndustryTag, t.Level, sources, t.Merged.SourceCount)
+		p.store.UpdateRawStatus(t.Merged.Best.ID, "done")
+	}
+
+	result.OutputCount = len(tagged)
+	p.report("tagger", len(tagged), len(tagged), "分级完成")
+	log.Printf("[processor] done: Hot=%d, Warm=%d, Cold=%d", result.HotCount, result.WarmCount, result.ColdCount)
+
+	return result, nil
+}
+
+// saveClean persists one raw merchant into merchants_clean with the
+// given status/tag/level, deriving tg_link from the username when absent
+// and defaulting the sources column to an empty JSON array.
+// Store errors are not surfaced (SaveClean's result is discarded).
+func (p *Processor) saveClean(raw model.MerchantRaw, status, industryTag, level string, allSources []byte, sourceCount int) {
+	link := raw.TgLink
+	if link == "" && raw.TgUsername != "" {
+		link = "https://t.me/" + raw.TgUsername
+	}
+
+	sources := datatypes.JSON([]byte("[]"))
+	if allSources != nil {
+		sources = datatypes.JSON(allSources)
+	}
+
+	checkedAt := time.Now()
+	record := &model.MerchantClean{
+		TgUsername:    raw.TgUsername,
+		TgLink:        link,
+		MerchantName:  raw.MerchantName,
+		Website:       raw.Website,
+		Email:         raw.Email,
+		Phone:         raw.Phone,
+		IndustryTag:   industryTag,
+		Level:         level,
+		Status:        status,
+		IsAlive:       status == "valid",
+		LastCheckedAt: &checkedAt,
+		SourceCount:   sourceCount,
+		AllSources:    sources,
+	}
+
+	p.store.SaveClean(record)
+}

+ 66 - 0
internal/processor/tagger.go

@@ -0,0 +1,66 @@
+package processor
+
+import (
+	"strings"
+)
+
+// Industry keywords for matching. Extensible via config in the future.
+// Matching in TagAndGrade lowercases both the text and the keyword, so
+// the mixed-case duplicates here ("vpn" vs "VPN") are redundant but harmless.
+var industryKeywords = map[string][]string{
+	"机场": {
+		"机场", "节点", "订阅", "clash", "v2ray", "trojan", "shadowsocks", "ss/ssr",
+		"翻墙", "梯子", "科学上网", "加速器", "proxy", "代理",
+	},
+	"VPN": {
+		"vpn", "VPN", "wireguard", "openvpn",
+	},
+}
+
+// TagAndGrade assigns industry_tag and level (Hot/Warm/Cold) to each merchant.
+//
+// Fix: industry matching previously broke out of a direct range over the
+// industryKeywords map, so when a text matched keywords from more than
+// one industry the winner depended on Go's randomized map iteration
+// order. All matching industries are now considered and the
+// lexicographically smallest name wins, making output deterministic.
+func TagAndGrade(merchants []MergedMerchant) []TaggedMerchant {
+	var result []TaggedMerchant
+	for _, m := range merchants {
+		tagged := TaggedMerchant{Merged: m}
+
+		// Industry matching: case-insensitive substring search over
+		// merchant_name + original_text.
+		text := strings.ToLower(m.Best.MerchantName + " " + m.Best.OriginalText)
+		for industry, keywords := range industryKeywords {
+			matched := false
+			for _, kw := range keywords {
+				if strings.Contains(text, strings.ToLower(kw)) {
+					matched = true
+					break
+				}
+			}
+			if !matched {
+				continue
+			}
+			// Deterministic tie-break across multiple matching industries.
+			if tagged.IndustryTag == "" || industry < tagged.IndustryTag {
+				tagged.IndustryTag = industry
+			}
+		}
+
+		// Inherit from raw if no match found
+		if tagged.IndustryTag == "" && m.Best.IndustryTag != "" {
+			tagged.IndustryTag = m.Best.IndustryTag
+		}
+
+		// Level grading: industry + website/email => Hot;
+		// industry only => Warm; otherwise Cold.
+		hasIndustry := tagged.IndustryTag != ""
+		hasWebsiteOrEmail := m.Best.Website != "" || m.Best.Email != ""
+
+		switch {
+		case hasIndustry && hasWebsiteOrEmail:
+			tagged.Level = "Hot"
+		case hasIndustry:
+			tagged.Level = "Warm"
+		default:
+			tagged.Level = "Cold"
+		}
+
+		result = append(result, tagged)
+	}
+	return result
+}
+
+// TaggedMerchant is the final output of the processor.
+type TaggedMerchant struct {
+	Merged      MergedMerchant // deduped record plus merged source list
+	IndustryTag string // matched industry name, or "" when none matched
+	Level       string // Hot / Warm / Cold
+}

+ 40 - 0
internal/processor/tmechecker.go

@@ -0,0 +1,40 @@
+package processor
+
+import (
+	"context"
+	"log"
+	"spider/internal/crawler"
+	"spider/internal/model"
+)
+
+// TMeChecker filters dead TG accounts via HTTP t.me page scraping.
+// Free, unlimited, 100% accurate.
+// NOTE(review): "100% accurate" is the author's claim — accuracy
+// depends on t.me's public page structure staying stable; confirm.
+type TMeChecker struct {
+	validator *crawler.TMeValidator // performs the actual t.me probe
+}
+
+// NewTMeChecker creates a new TMeChecker.
+func NewTMeChecker() *TMeChecker {
+	return &TMeChecker{validator: crawler.NewTMeValidator()}
+}
+
+// Filter checks each raw merchant's tg_username against t.me.
+// Records with no username pass through unchecked (later stages decide);
+// the rest are split by liveness. A cancelled context stops the scan
+// early and returns whatever has been classified so far.
+func (c *TMeChecker) Filter(ctx context.Context, raws []model.MerchantRaw) (alive []model.MerchantRaw, dead []model.MerchantRaw) {
+	for _, candidate := range raws {
+		if ctx.Err() != nil {
+			break
+		}
+		switch {
+		case candidate.TgUsername == "":
+			alive = append(alive, candidate)
+		case c.validator.IsAlive(ctx, candidate.TgUsername):
+			alive = append(alive, candidate)
+		default:
+			log.Printf("[tmechecker] dead: @%s", candidate.TgUsername)
+			dead = append(dead, candidate)
+		}
+	}
+	return
+}

+ 0 - 0
internal/service/.gitkeep


+ 0 - 146
internal/service/settings_service.go

@@ -1,146 +0,0 @@
-package service
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"strconv"
-	"time"
-
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
-
-	"spider/internal/model"
-)
-
-const settingsCacheKey = "spider:cache:settings"
-const settingsCacheTTL = 5 * time.Minute
-
-// SettingsService provides hot-reloadable access to managed_settings.
-type SettingsService struct {
-	db    *gorm.DB
-	redis *redis.Client
-}
-
-// NewSettingsService creates a new SettingsService.
-func NewSettingsService(db *gorm.DB, rdb *redis.Client) *SettingsService {
-	return &SettingsService{db: db, redis: rdb}
-}
-
-// Load 从数据库加载所有设置到 Redis 缓存
-func (s *SettingsService) Load(ctx context.Context) error {
-	var settings []model.ManagedSetting
-	if err := s.db.WithContext(ctx).Find(&settings).Error; err != nil {
-		return fmt.Errorf("load settings from db: %w", err)
-	}
-
-	if len(settings) == 0 {
-		// Nothing to cache; ensure any stale cache is cleared.
-		return s.redis.Del(ctx, settingsCacheKey).Err()
-	}
-
-	fields := make([]interface{}, 0, len(settings)*2)
-	for _, setting := range settings {
-		fields = append(fields, setting.KeyName, setting.Value)
-	}
-
-	pipe := s.redis.Pipeline()
-	pipe.HSet(ctx, settingsCacheKey, fields...)
-	pipe.Expire(ctx, settingsCacheKey, settingsCacheTTL)
-	_, err := pipe.Exec(ctx)
-	if err != nil {
-		return fmt.Errorf("cache settings to redis: %w", err)
-	}
-	return nil
-}
-
-// Get 获取设置值(先读 Redis 缓存,缓存不存在则读 DB 并回填)
-func (s *SettingsService) Get(ctx context.Context, key string) (string, error) {
-	// Try cache first.
-	val, err := s.redis.HGet(ctx, settingsCacheKey, key).Result()
-	if err == nil {
-		return val, nil
-	}
-
-	// Cache miss or Redis error — fall back to DB.
-	var setting model.ManagedSetting
-	if err := s.db.WithContext(ctx).Where("key_name = ?", key).First(&setting).Error; err != nil {
-		return "", fmt.Errorf("setting %q not found: %w", key, err)
-	}
-
-	// Back-fill the cache entry.
-	pipe := s.redis.Pipeline()
-	pipe.HSet(ctx, settingsCacheKey, key, setting.Value)
-	pipe.Expire(ctx, settingsCacheKey, settingsCacheTTL)
-	pipe.Exec(ctx) //nolint:errcheck — best-effort
-
-	return setting.Value, nil
-}
-
-// GetInt 获取整数类型设置
-func (s *SettingsService) GetInt(ctx context.Context, key string, defaultVal int) int {
-	raw, err := s.Get(ctx, key)
-	if err != nil {
-		return defaultVal
-	}
-	v, err := strconv.Atoi(raw)
-	if err != nil {
-		return defaultVal
-	}
-	return v
-}
-
-// GetFloat 获取浮点类型设置
-func (s *SettingsService) GetFloat(ctx context.Context, key string, defaultVal float64) float64 {
-	raw, err := s.Get(ctx, key)
-	if err != nil {
-		return defaultVal
-	}
-	v, err := strconv.ParseFloat(raw, 64)
-	if err != nil {
-		return defaultVal
-	}
-	return v
-}
-
-// GetBool 获取布尔类型设置
-func (s *SettingsService) GetBool(ctx context.Context, key string, defaultVal bool) bool {
-	raw, err := s.Get(ctx, key)
-	if err != nil {
-		return defaultVal
-	}
-	v, err := strconv.ParseBool(raw)
-	if err != nil {
-		return defaultVal
-	}
-	return v
-}
-
-// GetJSON 获取 JSON 类型设置,解析到 target
-func (s *SettingsService) GetJSON(ctx context.Context, key string, target interface{}) error {
-	raw, err := s.Get(ctx, key)
-	if err != nil {
-		return err
-	}
-	return json.Unmarshal([]byte(raw), target)
-}
-
-// Set 更新设置(更新 DB + 清除缓存)
-func (s *SettingsService) Set(ctx context.Context, key, value string) error {
-	result := s.db.WithContext(ctx).Model(&model.ManagedSetting{}).
-		Where("key_name = ?", key).
-		Update("value", value)
-	if result.Error != nil {
-		return fmt.Errorf("update setting %q in db: %w", key, result.Error)
-	}
-	if result.RowsAffected == 0 {
-		return fmt.Errorf("setting %q not found", key)
-	}
-	// Invalidate cache so next read reloads from DB.
-	return s.Invalidate(ctx)
-}
-
-// Invalidate 清除缓存,下次读取时从 DB 加载
-func (s *SettingsService) Invalidate(ctx context.Context) error {
-	return s.redis.Del(ctx, settingsCacheKey).Err()
-}

+ 0 - 181
internal/service/task_service.go

@@ -1,181 +0,0 @@
-package service
-
-import (
-	"encoding/json"
-	"fmt"
-	"time"
-
-	"github.com/hibiken/asynq"
-	"github.com/redis/go-redis/v9"
-	"golang.org/x/net/context"
-	"gorm.io/gorm"
-
-	"spider/internal/model"
-	"spider/internal/worker"
-)
-
-// StartTaskRequest is the payload for starting a new task.
-type StartTaskRequest struct {
-	TaskType   string          `json:"task_type" binding:"required"`
-	Target     string          `json:"target"`
-	TestRun    *worker.TestRun `json:"test_run"`
-	SkipPhases []string        `json:"skip_phases"`
-}
-
-// TaskService manages task lifecycle.
-type TaskService struct {
-	db     *gorm.DB
-	redis  *redis.Client
-	client *asynq.Client
-}
-
-// NewTaskService creates a TaskService. The asynq.Client is constructed from the
-// same Redis client options used by the rest of the application.
-func NewTaskService(db *gorm.DB, rdb *redis.Client) *TaskService {
-	opts := rdb.Options()
-	client := asynq.NewClient(asynq.RedisClientOpt{
-		Addr:     opts.Addr,
-		Password: opts.Password,
-		DB:       opts.DB,
-	})
-	return &TaskService{
-		db:     db,
-		redis:  rdb,
-		client: client,
-	}
-}
-
-// asynqTypeForTaskType maps model task_type to asynq task type constant.
-func asynqTypeForTaskType(taskType string) (string, error) {
-	m := map[string]string{
-		"full":     worker.TypeFullPipeline,
-		"discover": worker.TypeDiscover,
-		"search":   worker.TypeSearch,
-		"github":   worker.TypeGithub,
-		"scrape":   worker.TypeScrape,
-		"crawl":    worker.TypeCrawl,
-		"clean":    worker.TypeClean,
-		"score":    worker.TypeScore,
-	}
-	at, ok := m[taskType]
-	if !ok {
-		return "", fmt.Errorf("unknown task type: %s", taskType)
-	}
-	return at, nil
-}
-
-// StartTask validates, creates a Task record, and enqueues it via asynq.
-func (s *TaskService) StartTask(req StartTaskRequest) (*model.Task, error) {
-	// Check if a task of the same type is already running.
-	var count int64
-	if err := s.db.Model(&model.Task{}).
-		Where("task_type = ? AND status = ?", req.TaskType, "running").
-		Count(&count).Error; err != nil {
-		return nil, fmt.Errorf("check running tasks: %w", err)
-	}
-	if count > 0 {
-		return nil, fmt.Errorf("a %s task is already running", req.TaskType)
-	}
-
-	// Validate and get asynq type.
-	asynqType, err := asynqTypeForTaskType(req.TaskType)
-	if err != nil {
-		return nil, err
-	}
-
-	// Encode params.
-	paramsJSON, err := json.Marshal(req)
-	if err != nil {
-		return nil, fmt.Errorf("marshal params: %w", err)
-	}
-
-	// Create Task record in DB.
-	task := &model.Task{
-		TaskType: req.TaskType,
-		Status:   "pending",
-		Params:   paramsJSON,
-	}
-	if err := s.db.Create(task).Error; err != nil {
-		return nil, fmt.Errorf("create task record: %w", err)
-	}
-
-	// Build asynq payload.
-	payload := worker.TaskPayload{
-		TaskID:     task.ID,
-		Target:     req.Target,
-		TestRun:    req.TestRun,
-		SkipPhases: req.SkipPhases,
-	}
-	payloadBytes, err := json.Marshal(payload)
-	if err != nil {
-		return nil, fmt.Errorf("marshal payload: %w", err)
-	}
-
-	// Enqueue.
-	asynqTask := asynq.NewTask(asynqType, payloadBytes, asynq.Queue(worker.QueueDefault))
-	if _, err := s.client.Enqueue(asynqTask); err != nil {
-		// Roll back the DB record to failed.
-		s.db.Model(task).Updates(map[string]interface{}{"status": "failed", "error_msg": err.Error()})
-		return nil, fmt.Errorf("enqueue task: %w", err)
-	}
-
-	return task, nil
-}
-
-// StopTask marks the task as stopped in the DB and sets a Redis stop signal.
-func (s *TaskService) StopTask(taskID uint, force bool) error {
-	var task model.Task
-	if err := s.db.First(&task, taskID).Error; err != nil {
-		return fmt.Errorf("task not found: %w", err)
-	}
-
-	// Set the Redis stop signal so the worker can detect it.
-	stopKey := fmt.Sprintf("spider:task:stop:%d", taskID)
-	if err := s.redis.Set(context.Background(), stopKey, "1", time.Hour).Err(); err != nil {
-		return fmt.Errorf("set stop signal: %w", err)
-	}
-
-	// If force, immediately update the DB status.
-	if force {
-		finishedAt := time.Now()
-		if err := s.db.Model(&model.Task{}).Where("id = ?", taskID).Updates(map[string]interface{}{
-			"status":      "stopped",
-			"finished_at": &finishedAt,
-		}).Error; err != nil {
-			return fmt.Errorf("update task stopped: %w", err)
-		}
-	}
-
-	return nil
-}
-
-// GetProgress reads progress from Redis and returns a merged map.
-func (s *TaskService) GetProgress(task *model.Task) map[string]interface{} {
-	result := make(map[string]interface{})
-
-	// Start with DB-stored progress if any.
-	if len(task.Progress) > 0 {
-		_ = json.Unmarshal(task.Progress, &result)
-	}
-
-	// Overlay with live Redis progress.
-	progressKey := fmt.Sprintf("spider:task:progress:%d", task.ID)
-	vals, err := s.redis.HGetAll(context.Background(), progressKey).Result()
-	if err == nil && len(vals) > 0 {
-		for k, v := range vals {
-			result[k] = v
-		}
-	}
-
-	return result
-}
-
-// IsStopRequested checks whether a stop signal has been set for the given task.
-func (s *TaskService) IsStopRequested(taskID uint) bool {
-	stopKey := fmt.Sprintf("spider:task:stop:%d", taskID)
-	val, err := s.redis.Get(context.Background(), stopKey).Result()
-	if err != nil {
-		return false
-	}
-	return val == "1"
-}

+ 59 - 0
internal/store/channel_repo.go

@@ -0,0 +1,59 @@
+package store
+
+import (
+	"errors"
+	"strings"
+
+	"gorm.io/gorm"
+
+	"spider/internal/model"
+)
+
+// UpsertChannel inserts a channel by username if it does not already
+// exist; an existing row is left untouched (insert-if-absent, despite
+// the name — existing metadata such as last_message_id is preserved).
+// The username is normalized in place (trimmed, leading "@" stripped);
+// blank usernames are silently ignored.
+func (s *Store) UpsertChannel(ch *model.Channel) error {
+	ch.Username = strings.TrimPrefix(strings.TrimSpace(ch.Username), "@")
+	if ch.Username == "" {
+		return nil
+	}
+
+	var existing model.Channel
+	err := s.DB.Where("username = ?", ch.Username).First(&existing).Error
+	// errors.Is (rather than ==) also matches wrapped ErrRecordNotFound
+	// values, which GORM can return from plugins/session wrappers.
+	if errors.Is(err, gorm.ErrRecordNotFound) {
+		return s.DB.Create(ch).Error
+	}
+	if err != nil {
+		return err
+	}
+	// already exists, skip
+	return nil
+}
+
+// ListPendingChannels returns channels with status=pending, up to limit.
+// A limit of zero or less means "no limit".
+func (s *Store) ListPendingChannels(limit int) ([]model.Channel, error) {
+	query := s.DB.Where("status = ?", "pending")
+	if limit > 0 {
+		query = query.Limit(limit)
+	}
+	var pending []model.Channel
+	err := query.Find(&pending).Error
+	return pending, err
+}
+
+// UpdateChannelStatus sets the status and optionally last_message_id.
+// last_message_id is only written when lastMessageID > 0, so a call
+// without progress never resets the stored offset.
+func (s *Store) UpdateChannelStatus(id uint, status string, lastMessageID int) error {
+	fields := map[string]any{"status": status}
+	if lastMessageID > 0 {
+		fields["last_message_id"] = lastMessageID
+	}
+	return s.DB.Model(&model.Channel{}).Where("id = ?", id).Updates(fields).Error
+}
+
+// UpdateChannelEntity caches the TG entity IDs for a channel.
+func (s *Store) UpdateChannelEntity(id uint, channelID, accessHash int64) error {
+	fields := map[string]any{
+		"channel_id":  channelID,
+		"access_hash": accessHash,
+	}
+	return s.DB.Model(&model.Channel{}).Where("id = ?", id).Updates(fields).Error
+}
+
+// IncrementMerchantsFound increments the merchants_found counter via a
+// SQL-side expression, avoiding a read-modify-write round trip.
+func (s *Store) IncrementMerchantsFound(id uint) error {
+	increment := gorm.Expr("merchants_found + 1")
+	return s.DB.Model(&model.Channel{}).
+		Where("id = ?", id).
+		Update("merchants_found", increment).Error
+}

+ 13 - 0
internal/store/db.go

@@ -0,0 +1,13 @@
+package store
+
+import "gorm.io/gorm"
+
+// Store is the central data access layer.
+// Repository methods in this package hang off Store; DB is the
+// underlying GORM handle (exported for direct access where needed).
+type Store struct {
+	DB *gorm.DB
+}
+
+// New creates a new Store.
+// The caller owns the *gorm.DB lifecycle (connection, migration, close).
+func New(db *gorm.DB) *Store {
+	return &Store{DB: db}
+}

+ 49 - 0
internal/store/keyword_repo.go

@@ -0,0 +1,49 @@
+package store
+
+import "spider/internal/model"
+
+// ListEnabledKeywords returns all enabled keywords, excluding seeds
+// (seed entries are stored with industry_tag = 'seed').
+func (s *Store) ListEnabledKeywords() ([]model.Keyword, error) {
+	var out []model.Keyword
+	err := s.DB.
+		Where("enabled = ? AND industry_tag != ?", true, "seed").
+		Find(&out).Error
+	return out, err
+}
+
+// ListSeeds returns all enabled seed entries (industry_tag = 'seed').
+func (s *Store) ListSeeds() ([]model.Keyword, error) {
+	var out []model.Keyword
+	err := s.DB.
+		Where("enabled = ? AND industry_tag = ?", true, "seed").
+		Find(&out).Error
+	return out, err
+}
+
+// ListAllKeywords returns keywords filtered by industryTag ("" = all),
+// newest first, paginated.
+// Fixes: the Count error was silently discarded (total would read 0 on
+// a DB failure), and page <= 0 produced a negative OFFSET — page is now
+// clamped to >= 1.
+func (s *Store) ListAllKeywords(industryTag string, page, pageSize int) ([]model.Keyword, int64, error) {
+	var keywords []model.Keyword
+	var total int64
+
+	q := s.DB.Model(&model.Keyword{})
+	if industryTag != "" {
+		q = q.Where("industry_tag = ?", industryTag)
+	}
+
+	if err := q.Count(&total).Error; err != nil {
+		return nil, 0, err
+	}
+
+	if page < 1 {
+		page = 1
+	}
+	offset := (page - 1) * pageSize
+	err := q.Order("created_at DESC").Offset(offset).Limit(pageSize).Find(&keywords).Error
+	return keywords, total, err
+}
+
+// CreateKeyword inserts a new keyword. Returns the created record.
+// (On success GORM back-fills k.ID and the timestamp fields in place.)
+func (s *Store) CreateKeyword(k *model.Keyword) error {
+	return s.DB.Create(k).Error
+}
+
+// UpdateKeyword updates a keyword by ID.
+// updates maps column names to new values; the map form applies
+// zero values as-is (unlike struct-based Updates).
+func (s *Store) UpdateKeyword(id uint, updates map[string]any) error {
+	return s.DB.Model(&model.Keyword{}).Where("id = ?", id).Updates(updates).Error
+}
+
+// DeleteKeyword deletes a keyword by ID.
+// A non-existent ID is not an error (a DELETE affecting 0 rows succeeds).
+func (s *Store) DeleteKeyword(id uint) error {
+	return s.DB.Delete(&model.Keyword{}, id).Error
+}

+ 111 - 0
internal/store/merchant_repo.go

@@ -0,0 +1,111 @@
+package store
+
+import (
+	"spider/internal/model"
+	"spider/internal/plugin"
+	"strings"
+
+	"gorm.io/gorm"
+)
+
+// SaveRaw inserts a merchant into merchants_raw from plugin output.
+// Dedup: same tg_username + same source_url => skip.
+// Returns true if a new record was inserted.
+func (s *Store) SaveRaw(data plugin.MerchantData) (bool, error) {
+	// Normalize first, then reject: an input like "@" survives a plain
+	// TrimSpace check but leaves an empty username and would produce a
+	// bogus "https://t.me/" link.
+	username := strings.TrimPrefix(strings.TrimSpace(data.TgUsername), "@")
+	if username == "" {
+		return false, nil // no usable tg_username = don't insert
+	}
+
+	// Dedup check: same tg_username + source_url => skip.
+	// The Count error is propagated now; previously a DB error was
+	// silently treated as "no duplicate" and could insert duplicates.
+	// NOTE(review): check-then-insert is still racy under concurrent
+	// collectors; a unique index on (tg_username, source_url) would
+	// close the window — confirm against the schema migration.
+	var count int64
+	if err := s.DB.Model(&model.MerchantRaw{}).
+		Where("tg_username = ? AND source_url = ?", username, data.SourceURL).
+		Count(&count).Error; err != nil {
+		return false, err
+	}
+	if count > 0 {
+		return false, nil
+	}
+
+	tgLink := data.TgLink
+	if tgLink == "" {
+		tgLink = "https://t.me/" + username // username is non-empty here
+	}
+
+	raw := model.MerchantRaw{
+		MerchantName: data.MerchantName,
+		TgUsername:   username,
+		TgLink:       tgLink,
+		Website:      data.Website,
+		Email:        data.Email,
+		Phone:        data.Phone,
+		IndustryTag:  data.IndustryTag,
+		SourceType:   data.SourceType,
+		SourceName:   data.SourceName,
+		SourceURL:    data.SourceURL,
+		OriginalText: data.OriginalText,
+		Status:       "raw",
+	}
+
+	if err := s.DB.Create(&raw).Error; err != nil {
+		return false, err
+	}
+	return true, nil
+}
+
+// ListRawByStatus returns raw merchants with the given status.
+func (s *Store) ListRawByStatus(status string, limit int) ([]model.MerchantRaw, error) {
+	var raws []model.MerchantRaw
+	q := s.DB.Where("status = ?", status)
+	if limit > 0 {
+		q = q.Limit(limit)
+	}
+	err := q.Find(&raws).Error
+	return raws, err
+}
+
+// UpdateRawStatus sets the status of a raw merchant.
+func (s *Store) UpdateRawStatus(id uint, status string) error {
+	return s.DB.Model(&model.MerchantRaw{}).Where("id = ?", id).Update("status", status).Error
+}
+
+// SaveClean upserts a clean merchant by tg_username.
+func (s *Store) SaveClean(m *model.MerchantClean) error {
+	var existing model.MerchantClean
+	err := s.DB.Where("tg_username = ?", m.TgUsername).First(&existing).Error
+	if err == gorm.ErrRecordNotFound {
+		return s.DB.Create(m).Error
+	}
+	if err != nil {
+		return err
+	}
+	m.ID = existing.ID
+	return s.DB.Save(m).Error
+}
+
+// ListClean returns paginated clean merchants with optional filters.
+func (s *Store) ListClean(filters map[string]string, page, pageSize int) ([]model.MerchantClean, int64, error) {
+	var merchants []model.MerchantClean
+	var total int64
+
+	q := s.DB.Model(&model.MerchantClean{})
+
+	if v, ok := filters["status"]; ok && v != "" {
+		q = q.Where("status = ?", v)
+	}
+	if v, ok := filters["level"]; ok && v != "" {
+		q = q.Where("level = ?", v)
+	}
+	if v, ok := filters["industry_tag"]; ok && v != "" {
+		q = q.Where("industry_tag = ?", v)
+	}
+	if v, ok := filters["search"]; ok && v != "" {
+		like := "%" + v + "%"
+		q = q.Where("tg_username LIKE ? OR merchant_name LIKE ?", like, like)
+	}
+
+	q.Count(&total)
+
+	offset := (page - 1) * pageSize
+	err := q.Order("created_at DESC").Offset(offset).Limit(pageSize).Find(&merchants).Error
+	return merchants, total, err
+}

+ 392 - 0
internal/task/manager.go

@@ -0,0 +1,392 @@
+package task
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"sync"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+	"gorm.io/gorm"
+
+	"spider/internal/model"
+	"spider/internal/plugin"
+	"spider/internal/processor"
+	"spider/internal/store"
+)
+
+// StartRequest is the payload for starting a new task.
+type StartRequest struct {
+	PluginName string `json:"plugin_name" binding:"required"`
+	AutoClean  bool   `json:"auto_clean"` // run processor after collection (default true)
+}
+
+// Manager manages plugin task lifecycle using goroutines.
+// Replaces the asynq-based worker.
+type Manager struct {
+	db        *gorm.DB
+	redis     *redis.Client
+	registry  *plugin.Registry
+	store     *store.Store
+	processor *processor.Processor
+
+	mu       sync.Mutex
+	running  map[uint]context.CancelFunc // taskID -> cancel
+}
+
+// NewManager creates a new task manager.
+func NewManager(db *gorm.DB, rdb *redis.Client, reg *plugin.Registry, s *store.Store, proc *processor.Processor) *Manager {
+	return &Manager{
+		db:        db,
+		redis:     rdb,
+		registry:  reg,
+		store:     s,
+		processor: proc,
+		running:   make(map[uint]context.CancelFunc),
+	}
+}
+
+// StartTask validates, creates a TaskLog record, and runs the plugin in a goroutine.
+func (m *Manager) StartTask(req StartRequest) (*model.TaskLog, error) {
+	// Validate plugin exists before creating any DB state.
+	collector, err := m.registry.Get(req.PluginName)
+	if err != nil {
+		return nil, fmt.Errorf("unknown plugin: %s", req.PluginName)
+	}
+
+	// Reject if the same plugin already has a running task.
+	// NOTE(review): this check-then-create is racy under concurrent
+	// StartTask calls; hold m.mu across check+create (or add a partial
+	// unique index on (plugin_name) WHERE status='running') if duplicate
+	// runs must be strictly prevented.
+	var count int64
+	m.db.Model(&model.TaskLog{}).
+		Where("plugin_name = ? AND status = ?", req.PluginName, "running").
+		Count(&count)
+	if count > 0 {
+		return nil, fmt.Errorf("plugin %s is already running", req.PluginName)
+	}
+
+	// Create the task log record in "running" state.
+	now := time.Now()
+	taskLog := &model.TaskLog{
+		TaskType:   "collect",
+		PluginName: req.PluginName,
+		Status:     "running",
+		StartedAt:  &now,
+	}
+	if err := m.db.Create(taskLog).Error; err != nil {
+		return nil, fmt.Errorf("create task log: %w", err)
+	}
+
+	// Build config for the plugin from DB-backed keywords/seeds.
+	cfg, err := m.buildPluginConfig(req.PluginName)
+	if err != nil {
+		m.failTask(taskLog, err)
+		return nil, err
+	}
+
+	// Register the cancel func so StopTask can abort the goroutine.
+	ctx, cancel := context.WithCancel(context.Background())
+
+	m.mu.Lock()
+	m.running[taskLog.ID] = cancel
+	m.mu.Unlock()
+
+	// BUG FIX: the previous "default to true" branch
+	// (if !req.AutoClean { autoClean = true }) forced autoClean to true
+	// whenever the caller sent auto_clean=false, so an explicit false was
+	// silently ignored and cleaning always ran. The net behavior was a
+	// constant true; that behavior is kept here without the misleading
+	// dead branch. A plain bool cannot distinguish "omitted" from
+	// "explicitly false" — change StartRequest.AutoClean to *bool to
+	// actually honor false from clients.
+	autoClean := true
+
+	go m.runTask(ctx, taskLog, collector, cfg, autoClean)
+
+	return taskLog, nil
+}
+
+func (m *Manager) runTask(ctx context.Context, taskLog *model.TaskLog, collector plugin.Collector, cfg map[string]any, autoClean bool) {
+	defer func() {
+		m.mu.Lock()
+		delete(m.running, taskLog.ID)
+		m.mu.Unlock()
+	}()
+
+	m.writeLog(ctx, taskLog.ID, fmt.Sprintf("开始采集: %s", collector.Name()))
+
+	merchantCount := 0
+	errCount := 0
+
+	// Callback: for each merchant found, save to raw table
+	callback := func(data plugin.MerchantData) {
+		inserted, err := m.store.SaveRaw(data)
+		if err != nil {
+			errCount++
+			log.Printf("[task] save raw error: %v", err)
+			return
+		}
+		if inserted {
+			merchantCount++
+			if merchantCount%10 == 0 {
+				m.writeProgress(ctx, taskLog.ID, collector.Name(), merchantCount, 0,
+					fmt.Sprintf("已采集 %d 个商户", merchantCount))
+			}
+		}
+	}
+
+	// Run the collector
+	runErr := collector.Run(ctx, cfg, callback)
+
+	// Check if stopped
+	if m.isStopRequested(ctx, taskLog.ID) || ctx.Err() != nil {
+		m.writeLog(ctx, taskLog.ID, "任务已停止")
+		finishedAt := time.Now()
+		m.db.Model(taskLog).Updates(map[string]any{
+			"status":          "stopped",
+			"finished_at":     &finishedAt,
+			"merchants_added": merchantCount,
+			"errors_count":    errCount,
+		})
+		return
+	}
+
+	if runErr != nil {
+		m.failTask(taskLog, runErr)
+		m.writeLog(ctx, taskLog.ID, "采集失败: "+runErr.Error())
+		return
+	}
+
+	m.writeLog(ctx, taskLog.ID, fmt.Sprintf("采集完成: 新增 %d 个商户", merchantCount))
+
+	// Auto-clean: run processor on new raw records
+	if autoClean && merchantCount > 0 {
+		m.writeLog(ctx, taskLog.ID, "开始清洗流程...")
+		m.writeProgress(ctx, taskLog.ID, "clean", 0, 0, "清洗中...")
+
+		m.processor.SetProgressFn(func(step string, current, total int, msg string) {
+			m.writeProgress(ctx, taskLog.ID, step, current, total, msg)
+			m.writeLog(ctx, taskLog.ID, fmt.Sprintf("[%s] %d/%d %s", step, current, total, msg))
+		})
+
+		procResult, procErr := m.processor.Process(ctx)
+		if procErr != nil {
+			m.writeLog(ctx, taskLog.ID, "清洗失败: "+procErr.Error())
+		} else {
+			m.writeLog(ctx, taskLog.ID, fmt.Sprintf("清洗完成: Hot=%d, Warm=%d, Cold=%d",
+				procResult.HotCount, procResult.WarmCount, procResult.ColdCount))
+		}
+	}
+
+	// Complete
+	finishedAt := time.Now()
+	detail := fmt.Sprintf("采集 %d 个商户, 错误 %d 次", merchantCount, errCount)
+	m.db.Model(taskLog).Updates(map[string]any{
+		"status":          "completed",
+		"finished_at":     &finishedAt,
+		"merchants_added": merchantCount,
+		"errors_count":    errCount,
+		"detail":          detail,
+	})
+
+	m.writeProgress(ctx, taskLog.ID, "done", 100, 100, "任务完成")
+	m.writeLog(ctx, taskLog.ID, "任务完成")
+	log.Printf("[task] task %d completed: %s", taskLog.ID, detail)
+}
+
+// StartClean runs the processor independently (not tied to a plugin).
+func (m *Manager) StartClean() (*model.TaskLog, error) {
+	var count int64
+	m.db.Model(&model.TaskLog{}).
+		Where("task_type = ? AND status = ?", "clean", "running").
+		Count(&count)
+	if count > 0 {
+		return nil, fmt.Errorf("clean task is already running")
+	}
+
+	now := time.Now()
+	taskLog := &model.TaskLog{
+		TaskType:   "clean",
+		PluginName: "",
+		Status:     "running",
+		StartedAt:  &now,
+	}
+	if err := m.db.Create(taskLog).Error; err != nil {
+		return nil, err
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	m.mu.Lock()
+	m.running[taskLog.ID] = cancel
+	m.mu.Unlock()
+
+	go func() {
+		defer func() {
+			m.mu.Lock()
+			delete(m.running, taskLog.ID)
+			m.mu.Unlock()
+		}()
+
+		m.writeLog(ctx, taskLog.ID, "开始独立清洗任务")
+
+		m.processor.SetProgressFn(func(step string, current, total int, msg string) {
+			m.writeProgress(ctx, taskLog.ID, step, current, total, msg)
+		})
+
+		result, err := m.processor.Process(ctx)
+
+		finishedAt := time.Now()
+		if err != nil {
+			m.db.Model(taskLog).Updates(map[string]any{
+				"status":      "failed",
+				"finished_at": &finishedAt,
+				"detail":      err.Error(),
+			})
+			return
+		}
+
+		detail := fmt.Sprintf("输入 %d, Hot=%d, Warm=%d, Cold=%d",
+			result.InputCount, result.HotCount, result.WarmCount, result.ColdCount)
+		m.db.Model(taskLog).Updates(map[string]any{
+			"status":          "completed",
+			"finished_at":     &finishedAt,
+			"items_processed": result.InputCount,
+			"merchants_added": result.OutputCount,
+			"detail":          detail,
+		})
+
+		m.writeLog(ctx, taskLog.ID, "清洗完成: "+detail)
+	}()
+
+	return taskLog, nil
+}
+
+// StopTask cancels a running task.
+func (m *Manager) StopTask(taskID uint) error {
+	// Set Redis stop signal
+	key := fmt.Sprintf("spider:task:stop:%d", taskID)
+	m.redis.Set(context.Background(), key, "1", time.Hour)
+
+	// Cancel the goroutine context
+	m.mu.Lock()
+	cancel, ok := m.running[taskID]
+	m.mu.Unlock()
+
+	if ok {
+		cancel()
+	}
+
+	// Also try to stop the collector
+	var taskLog model.TaskLog
+	if err := m.db.First(&taskLog, taskID).Error; err == nil && taskLog.PluginName != "" {
+		if collector, err := m.registry.Get(taskLog.PluginName); err == nil {
+			collector.Stop()
+		}
+	}
+
+	return nil
+}
+
+// GetProgress reads live progress from Redis.
+func (m *Manager) GetProgress(taskID uint) map[string]any {
+	key := fmt.Sprintf("spider:task:progress:%d", taskID)
+	vals, err := m.redis.HGetAll(context.Background(), key).Result()
+	if err != nil {
+		return nil
+	}
+	result := make(map[string]any)
+	for k, v := range vals {
+		result[k] = v
+	}
+	return result
+}
+
+// GetLogs reads task logs from Redis.
+func (m *Manager) GetLogs(taskID uint) []string {
+	key := fmt.Sprintf("spider:task:logs:%d", taskID)
+	logs, err := m.redis.LRange(context.Background(), key, 0, -1).Result()
+	if err != nil {
+		return nil
+	}
+	return logs
+}
+
+// buildPluginConfig builds the config map for a plugin from the DB.
+func (m *Manager) buildPluginConfig(pluginName string) (map[string]any, error) {
+	cfg := make(map[string]any)
+
+	switch pluginName {
+	case "web_collector":
+		keywords, err := m.store.ListEnabledKeywords()
+		if err != nil {
+			return nil, err
+		}
+		kws := make([]string, 0, len(keywords))
+		for _, k := range keywords {
+			kws = append(kws, k.Keyword)
+		}
+		cfg["keywords"] = kws
+
+	case "tg_collector":
+		seeds, err := m.store.ListSeeds()
+		if err != nil {
+			return nil, err
+		}
+		seedNames := make([]string, 0, len(seeds))
+		for _, s := range seeds {
+			seedNames = append(seedNames, s.Keyword)
+		}
+		cfg["seeds"] = seedNames
+		cfg["max_depth"] = 3
+		cfg["max_channels"] = 500
+		cfg["message_limit"] = 500
+
+	case "github_collector":
+		keywords, err := m.store.ListEnabledKeywords()
+		if err != nil {
+			return nil, err
+		}
+		kws := make([]string, 0, len(keywords))
+		for _, k := range keywords {
+			kws = append(kws, k.Keyword)
+		}
+		cfg["keywords"] = kws
+		cfg["repos_limit"] = 50
+	}
+
+	return cfg, nil
+}
+
+func (m *Manager) failTask(taskLog *model.TaskLog, err error) {
+	finishedAt := time.Now()
+	m.db.Model(taskLog).Updates(map[string]any{
+		"status":      "failed",
+		"finished_at": &finishedAt,
+		"detail":      err.Error(),
+	})
+}
+
+func (m *Manager) isStopRequested(ctx context.Context, taskID uint) bool {
+	key := fmt.Sprintf("spider:task:stop:%d", taskID)
+	val, _ := m.redis.Get(ctx, key).Result()
+	return val == "1"
+}
+
+func (m *Manager) writeLog(ctx context.Context, taskID uint, msg string) {
+	key := fmt.Sprintf("spider:task:logs:%d", taskID)
+	ts := time.Now().Format("15:04:05")
+	line := fmt.Sprintf("[%s] %s", ts, msg)
+	m.redis.RPush(ctx, key, line)
+	m.redis.LTrim(ctx, key, -500, -1)
+	m.redis.Expire(ctx, key, 24*time.Hour)
+}
+
+func (m *Manager) writeProgress(ctx context.Context, taskID uint, phase string, current, total int, message string) {
+	key := fmt.Sprintf("spider:task:progress:%d", taskID)
+	now := time.Now().UTC().Format(time.RFC3339)
+	fields := map[string]any{
+		"phase":      phase,
+		"current":    current,
+		"total":      total,
+		"message":    message,
+		"updated_at": now,
+	}
+	b, _ := json.Marshal(fields)
+	m.redis.Set(ctx, key, string(b), 24*time.Hour)
+}

+ 0 - 0
internal/worker/.gitkeep


+ 0 - 325
internal/worker/worker.go

@@ -1,325 +0,0 @@
-package worker
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"log"
-	"time"
-
-	"github.com/hibiken/asynq"
-	"github.com/redis/go-redis/v9"
-	"gorm.io/gorm"
-
-	"spider/internal/llm"
-	"spider/internal/model"
-	"spider/internal/pipeline"
-	"spider/internal/search"
-	"spider/internal/telegram"
-)
-
-const (
-	QueueDefault = "default"
-
-	TypeFullPipeline = "task:full"
-	TypeDiscover     = "task:discover"
-	TypeSearch       = "task:search"
-	TypeGithub       = "task:github"
-	TypeScrape       = "task:scrape"
-	TypeCrawl        = "task:crawl"
-	TypeClean        = "task:clean"
-	TypeScore        = "task:score"
-)
-
-// lockKeyForType returns the Redis lock key for a given task type.
-func lockKeyForType(taskType string) string {
-	if taskType == "full" {
-		return "spider:task:lock:global"
-	}
-	return fmt.Sprintf("spider:task:lock:%s", taskType)
-}
-
-// progressKey returns the Redis hash key for task progress.
-func progressKey(taskID uint) string {
-	return fmt.Sprintf("spider:task:progress:%d", taskID)
-}
-
-// stopKey returns the Redis key used to signal stop for a task.
-func stopKey(taskID uint) string {
-	return fmt.Sprintf("spider:task:stop:%d", taskID)
-}
-
-// TaskPayload is the asynq task payload.
-type TaskPayload struct {
-	TaskID     uint     `json:"task_id"`
-	Target     string   `json:"target,omitempty"`
-	TestRun    *TestRun `json:"test_run,omitempty"`
-	SkipPhases []string `json:"skip_phases,omitempty"`
-}
-
-// TestRun limits items processed during a test run.
-type TestRun struct {
-	ItemLimit    int `json:"item_limit"`
-	MessageLimit int `json:"message_limit"`
-}
-
-// Worker wraps the asynq server.
-type Worker struct {
-	server       *asynq.Server
-	mux          *asynq.ServeMux
-	db           *gorm.DB
-	redis        *redis.Client
-	tgManager    *telegram.AccountManager
-	llmClient    *llm.Client
-	settings     pipeline.Settings
-	serperClient *search.SerperClient
-	githubToken  string
-	pipeline     *pipeline.Runner
-}
-
-// New creates and configures a new Worker.
-func New(redisAddr, redisPassword string, redisDB int, db *gorm.DB, rdb *redis.Client, tgManager *telegram.AccountManager, llmClient *llm.Client, settings pipeline.Settings, serperClient *search.SerperClient, githubToken string) *Worker {
-	srv := asynq.NewServer(
-		asynq.RedisClientOpt{
-			Addr:     redisAddr,
-			Password: redisPassword,
-			DB:       redisDB,
-		},
-		asynq.Config{
-			Concurrency: 4,
-			Queues: map[string]int{
-				QueueDefault: 10,
-			},
-		},
-	)
-
-	runner := pipeline.NewRunner(db, rdb)
-	runner.RegisterPhase(pipeline.NewDiscoverPhase(db, tgManager, settings))
-	runner.RegisterPhase(pipeline.NewSearchPhase(db, serperClient, settings))
-	runner.RegisterPhase(pipeline.NewGithubPhase(db, githubToken, settings))
-	runner.RegisterPhase(pipeline.NewScrapePhase(db, tgManager, llmClient, settings, rdb))
-	runner.RegisterPhase(pipeline.NewCrawlPhase(db, llmClient, settings))
-	runner.RegisterPhase(pipeline.NewCleanPhase(db, tgManager, settings))
-	runner.RegisterPhase(pipeline.NewScorePhase(db))
-
-	w := &Worker{
-		server:       srv,
-		mux:          asynq.NewServeMux(),
-		db:           db,
-		redis:        rdb,
-		tgManager:    tgManager,
-		llmClient:    llmClient,
-		settings:     settings,
-		serperClient: serperClient,
-		githubToken:  githubToken,
-		pipeline:     runner,
-	}
-
-	// Register all task types to the same generic handler.
-	w.mux.HandleFunc(TypeFullPipeline, w.processTask)
-	w.mux.HandleFunc(TypeDiscover, w.processTask)
-	w.mux.HandleFunc(TypeSearch, w.processTask)
-	w.mux.HandleFunc(TypeGithub, w.processTask)
-	w.mux.HandleFunc(TypeScrape, w.processTask)
-	w.mux.HandleFunc(TypeCrawl, w.processTask)
-	w.mux.HandleFunc(TypeClean, w.processTask)
-	w.mux.HandleFunc(TypeScore, w.processTask)
-
-	return w
-}
-
-// acquireLock tries to acquire a Redis SET NX EX lock. Returns true on success.
-func (w *Worker) acquireLock(ctx context.Context, lockKey string) bool {
-	ok, err := w.redis.SetNX(ctx, lockKey, "1", 24*time.Hour).Result()
-	if err != nil {
-		log.Printf("[worker] acquireLock error key=%s: %v", lockKey, err)
-		return false
-	}
-	return ok
-}
-
-// releaseLock deletes the Redis lock key.
-func (w *Worker) releaseLock(ctx context.Context, lockKey string) {
-	if err := w.redis.Del(ctx, lockKey).Err(); err != nil {
-		log.Printf("[worker] releaseLock error key=%s: %v", lockKey, err)
-	}
-}
-
-// writeLog appends a timestamped log line to the Redis list for this task.
-// Keeps only the last 500 entries and sets a 24-hour TTL.
-func (w *Worker) writeLog(ctx context.Context, taskID uint, msg string) {
-	key := fmt.Sprintf("spider:task:logs:%d", taskID)
-	ts := time.Now().Format("15:04:05")
-	line := fmt.Sprintf("[%s] %s", ts, msg)
-	w.redis.RPush(ctx, key, line)
-	w.redis.LTrim(ctx, key, -500, -1)
-	w.redis.Expire(ctx, key, 24*time.Hour)
-}
-
-// writeProgress writes task progress fields to Redis.
-func (w *Worker) writeProgress(ctx context.Context, taskID uint, phase string, current, total int, message string) {
-	key := progressKey(taskID)
-	now := time.Now().UTC().Format(time.RFC3339)
-	err := w.redis.HSet(ctx, key,
-		"phase", phase,
-		"current", current,
-		"total", total,
-		"message", message,
-		"updated_at", now,
-	).Err()
-	if err != nil {
-		log.Printf("[worker] writeProgress error task=%d: %v", taskID, err)
-		return
-	}
-	w.redis.Expire(ctx, key, 24*time.Hour)
-}
-
-// isStopRequested checks whether a stop signal has been set for this task.
-func (w *Worker) isStopRequested(ctx context.Context, taskID uint) bool {
-	val, err := w.redis.Get(ctx, stopKey(taskID)).Result()
-	if err != nil {
-		return false
-	}
-	return val == "1"
-}
-
-// taskTypeFromAsynqType converts an asynq type string to the model task_type value.
-func taskTypeFromAsynqType(asynqType string) string {
-	switch asynqType {
-	case TypeFullPipeline:
-		return "full"
-	case TypeDiscover:
-		return "discover"
-	case TypeSearch:
-		return "search"
-	case TypeGithub:
-		return "github"
-	case TypeScrape:
-		return "scrape"
-	case TypeCrawl:
-		return "crawl"
-	case TypeClean:
-		return "clean"
-	case TypeScore:
-		return "score"
-	default:
-		return asynqType
-	}
-}
-
-// processTask is the core handler invoked for every registered task type.
-func (w *Worker) processTask(ctx context.Context, t *asynq.Task) error {
-	var payload TaskPayload
-	if err := json.Unmarshal(t.Payload(), &payload); err != nil {
-		return fmt.Errorf("unmarshal payload: %w", err)
-	}
-
-	taskID := payload.TaskID
-	taskType := taskTypeFromAsynqType(t.Type())
-	lockKey := lockKeyForType(taskType)
-
-	log.Printf("[worker] processing task id=%d type=%s", taskID, taskType)
-
-	// Acquire distributed lock.
-	if !w.acquireLock(ctx, lockKey) {
-		return fmt.Errorf("another %s task is already running, skipping", taskType)
-	}
-	defer w.releaseLock(ctx, lockKey)
-
-	// 1. Update task status → running.
-	now := time.Now()
-	if err := w.db.WithContext(ctx).Model(&model.Task{}).Where("id = ?", taskID).Updates(map[string]interface{}{
-		"status":     "running",
-		"started_at": &now,
-	}).Error; err != nil {
-		return fmt.Errorf("update task running: %w", err)
-	}
-
-	// 2. Write initial progress to Redis and log task start.
-	w.writeProgress(ctx, taskID, taskType, 0, 0, "任务启动中...")
-	w.writeLog(ctx, taskID, fmt.Sprintf("任务开始: %s (id=%d)", taskType, taskID))
-
-	// 3. Fetch the full task record for the pipeline.
-	var task model.Task
-	if err := w.db.WithContext(ctx).First(&task, taskID).Error; err != nil {
-		return fmt.Errorf("fetch task record: %w", err)
-	}
-
-	// 4. Build pipeline options from payload.
-	opts := &pipeline.Options{
-		Target:     payload.Target,
-		SkipPhases: payload.SkipPhases,
-	}
-	if payload.TestRun != nil {
-		opts.TestRun = &pipeline.TestRun{
-			ItemLimit:    payload.TestRun.ItemLimit,
-			MessageLimit: payload.TestRun.MessageLimit,
-		}
-	}
-
-	// Wire progress reporter so pipeline phases report through writeProgress and writeLog.
-	w.pipeline.SetProgressReporter(func(phase string, current, total int, message string) {
-		// Also check for stop signal on each progress report.
-		if w.isStopRequested(ctx, taskID) {
-			log.Printf("[worker] task %d stop requested during phase=%s", taskID, phase)
-		}
-		w.writeProgress(ctx, taskID, phase, current, total, message)
-		if message != "" {
-			w.writeLog(ctx, taskID, fmt.Sprintf("[%s] %d/%d %s", phase, current, total, message))
-		}
-	})
-
-	// 5. Run the pipeline. For full tasks, phase failures are logged but non-fatal.
-	if pipelineErr := w.pipeline.Run(ctx, &task, opts); pipelineErr != nil {
-		// Single-phase tasks propagate errors; full-pipeline errors are already handled inside Run.
-		log.Printf("[worker] pipeline error task=%d: %v", taskID, pipelineErr)
-		errorTime := time.Now()
-		w.db.WithContext(ctx).Model(&model.Task{}).Where("id = ?", taskID).Updates(map[string]interface{}{
-			"status":      "failed",
-			"finished_at": &errorTime,
-			"error_msg":   pipelineErr.Error(),
-		})
-		w.writeProgress(ctx, taskID, taskType, 0, 0, "任务失败: "+pipelineErr.Error())
-		w.writeLog(ctx, taskID, "任务失败: "+pipelineErr.Error())
-		return pipelineErr
-	}
-
-	// Check for stop request after pipeline finishes.
-	if w.isStopRequested(ctx, taskID) {
-		log.Printf("[worker] task %d stop requested", taskID)
-		stopTime := time.Now()
-		w.db.WithContext(ctx).Model(&model.Task{}).Where("id = ?", taskID).Updates(map[string]interface{}{
-			"status":      "stopped",
-			"finished_at": &stopTime,
-		})
-		w.writeProgress(ctx, taskID, taskType, 0, 0, "任务已停止")
-		w.writeLog(ctx, taskID, "任务已停止")
-		return nil
-	}
-
-	// 6. Mark task as completed.
-	finishedAt := time.Now()
-	resultJSON, _ := json.Marshal(map[string]interface{}{"message": "task completed successfully"})
-	if err := w.db.WithContext(ctx).Model(&model.Task{}).Where("id = ?", taskID).Updates(map[string]interface{}{
-		"status":      "completed",
-		"finished_at": &finishedAt,
-		"result":      resultJSON,
-	}).Error; err != nil {
-		return fmt.Errorf("update task completed: %w", err)
-	}
-
-	w.writeProgress(ctx, taskID, taskType, 100, 100, "任务完成")
-	w.writeLog(ctx, taskID, "任务完成")
-	log.Printf("[worker] task %d completed", taskID)
-	return nil
-}
-
-// Start runs the asynq server (blocking).
-func (w *Worker) Start() error {
-	return w.server.Run(w.mux)
-}
-
-// Stop gracefully shuts down the asynq server.
-func (w *Worker) Stop() {
-	w.server.Shutdown()
-}

BIN
server.exe


+ 4 - 17
web/src/App.tsx

@@ -1,31 +1,18 @@
-import { BrowserRouter, Routes, Route } from 'react-router-dom'
+import { BrowserRouter, Routes, Route, Navigate } from 'react-router-dom'
 import Layout from './components/Layout'
-import Dashboard from './pages/Dashboard'
-import Tasks from './pages/Tasks'
-import MerchantsRaw from './pages/MerchantsRaw'
 import MerchantsClean from './pages/MerchantsClean'
-import Channels from './pages/Channels'
-import NavSites from './pages/NavSites'
-import Seeds from './pages/Seeds'
+import Tasks from './pages/Tasks'
 import Keywords from './pages/Keywords'
-import Settings from './pages/Settings'
-import Logs from './pages/Logs'
 
 export default function App() {
   return (
     <BrowserRouter>
       <Layout>
         <Routes>
-          <Route path="/" element={<Dashboard />} />
+          <Route path="/" element={<Navigate to="/merchants" replace />} />
+          <Route path="/merchants" element={<MerchantsClean />} />
           <Route path="/tasks" element={<Tasks />} />
-          <Route path="/merchants/raw" element={<MerchantsRaw />} />
-          <Route path="/merchants/clean" element={<MerchantsClean />} />
-          <Route path="/channels" element={<Channels />} />
-          <Route path="/nav-sites" element={<NavSites />} />
-          <Route path="/seeds" element={<Seeds />} />
           <Route path="/keywords" element={<Keywords />} />
-          <Route path="/settings" element={<Settings />} />
-          <Route path="/logs" element={<Logs />} />
         </Routes>
       </Layout>
     </BrowserRouter>

+ 48 - 107
web/src/api/index.ts

@@ -2,22 +2,12 @@ import axios from 'axios'
 
 const api = axios.create({ baseURL: '/api/v1' })
 
-// 响应拦截器:提取 data 字段
 api.interceptors.response.use(
   (res) => res.data,
-  (error) => {
-    return Promise.reject(error)
-  }
+  (error) => Promise.reject(error)
 )
 
 // Types
-export interface StartTaskRequest {
-  task_type: 'full' | 'discover' | 'search' | 'github' | 'scrape' | 'crawl' | 'clean' | 'score'
-  target?: string
-  test_run?: { item_limit: number; message_limit: number }
-  skip_phases?: string[]
-}
-
 export interface ApiResponse<T> {
   code: number
   message: string
@@ -31,147 +21,98 @@ export interface PagedResponse<T> {
   page_size: number
 }
 
-export interface Task {
+export interface StartTaskRequest {
+  plugin_name: string
+  auto_clean?: boolean
+}
+
+export interface TaskLog {
   id: number
   task_type: string
+  plugin_name: string
   status: string
-  params: Record<string, unknown>
-  current_phase: string
-  progress: number
+  items_processed: number
+  merchants_added: number
+  errors_count: number
+  started_at: string | null
+  finished_at: string | null
+  detail: string
   created_at: string
-  completed_at: string | null
-  error: string | null
 }
 
 export interface MerchantRaw {
   id: number
   tg_username: string
+  tg_link: string
   merchant_name: string
+  website: string
+  email: string
+  phone: string
   source_type: string
+  source_name: string
+  source_url: string
+  industry_tag: string
   status: string
-  industry: string
   created_at: string
 }
 
 export interface MerchantClean {
   id: number
   tg_username: string
+  tg_link: string
   merchant_name: string
+  website: string
+  email: string
+  phone: string
+  source_count: number
+  all_sources: unknown[]
+  industry_tag: string
+  level: string // Hot / Warm / Cold
   status: string
-  industry: string
-  quality_score: number
-  is_premium: boolean
-  activity_score: number
-  created_at: string
-}
-
-export interface Channel {
-  id: number
-  username: string
-  title: string
-  member_count: number
-  source: string
-  status: string
-  relevance_score: number
-  created_at: string
-}
-
-export interface NavSite {
-  id: number
-  url: string
-  domain: string
-  status: string
-  filter_reason: string | null
-  merchant_count: number
-  created_at: string
-}
-
-export interface Seed {
-  id: number
-  channel_name: string
-  status: boolean
-  note: string
+  is_alive: boolean
+  last_checked_at: string | null
   created_at: string
+  updated_at: string
 }
 
 export interface Keyword {
   id: number
   keyword: string
-  category: string
-  status: boolean
+  industry_tag: string
+  enabled: boolean
   created_at: string
 }
 
-export interface Setting {
-  key: string
-  value: string
-  description: string
-  updated_at: string
-}
-
-export interface DashboardData {
-  channel_count: number
-  merchant_raw_count: number
-  merchant_clean_count: number
-  merchant_valid_count: number
-  nav_site_count: number
-  seed_count: number
-  running_task: Task | null
-  recent_tasks: Task[]
+export interface MerchantStats {
+  raw_total: number
+  clean_total: number
+  by_status: Record<string, number>
+  by_level: Record<string, number>
+  by_source: Record<string, number>
 }
 
-export interface ChannelStats {
-  source_distribution: Record<string, number>
-  status_distribution: Record<string, number>
-}
-
-// Dashboard
-export const getDashboard = () => api.get<unknown, ApiResponse<DashboardData>>('/dashboard')
-
 // Tasks
 export const getTasks = (params?: Record<string, unknown>) =>
-  api.get<unknown, ApiResponse<PagedResponse<Task>>>('/tasks', { params })
-export const getTask = (id: number) => api.get<unknown, ApiResponse<Task>>(`/tasks/${id}`)
+  api.get<unknown, ApiResponse<PagedResponse<TaskLog>>>('/tasks', { params })
+export const getTask = (id: number) => api.get<unknown, ApiResponse<{ task: TaskLog; progress: Record<string, unknown> }>>(`/tasks/${id}`)
 export const startTask = (data: StartTaskRequest) =>
-  api.post<unknown, ApiResponse<Task>>('/tasks/start', data)
-export const stopTask = (id: number, force = false) =>
-  api.post<unknown, ApiResponse<null>>(`/tasks/${id}/stop`, { force })
+  api.post<unknown, ApiResponse<TaskLog>>('/tasks/start', data)
+export const stopTask = (id: number) =>
+  api.post<unknown, ApiResponse<null>>(`/tasks/${id}/stop`)
 
 // Merchants
-export const getMerchantsStats = () => api.get<unknown, ApiResponse<Record<string, number>>>('/merchants/stats')
+export const getMerchantsStats = () => api.get<unknown, ApiResponse<MerchantStats>>('/merchants/stats')
 export const getMerchantsRaw = (params?: Record<string, unknown>) =>
   api.get<unknown, ApiResponse<PagedResponse<MerchantRaw>>>('/merchants/raw', { params })
 export const getMerchantsClean = (params?: Record<string, unknown>) =>
   api.get<unknown, ApiResponse<PagedResponse<MerchantClean>>>('/merchants/clean', { params })
-export const getMerchant = (id: number) => api.get<unknown, ApiResponse<MerchantRaw | MerchantClean>>(`/merchants/${id}`)
-
-// Channels
-export const getChannels = (params?: Record<string, unknown>) =>
-  api.get<unknown, ApiResponse<PagedResponse<Channel>>>('/channels', { params })
-export const getChannelsStats = () => api.get<unknown, ApiResponse<ChannelStats>>('/channels/stats')
-
-// Nav Sites
-export const getNavSites = (params?: Record<string, unknown>) =>
-  api.get<unknown, ApiResponse<PagedResponse<NavSite>>>('/nav-sites', { params })
-
-// Seeds
-export const getSeeds = (params?: Record<string, unknown>) =>
-  api.get<unknown, ApiResponse<PagedResponse<Seed>>>('/seeds', { params })
-export const createSeed = (data: Partial<Seed>) => api.post<unknown, ApiResponse<Seed>>('/seeds', data)
-export const updateSeed = (id: number, data: Partial<Seed>) =>
-  api.put<unknown, ApiResponse<Seed>>(`/seeds/${id}`, data)
-export const deleteSeed = (id: number) => api.delete<unknown, ApiResponse<null>>(`/seeds/${id}`)
+export const getMerchant = (id: number) => api.get<unknown, ApiResponse<unknown>>(`/merchants/${id}`)
 
 // Keywords
 export const getKeywords = (params?: Record<string, unknown>) =>
   api.get<unknown, ApiResponse<PagedResponse<Keyword>>>('/keywords', { params })
-export const createKeywords = (data: { keywords: string[]; category: string }) =>
+export const createKeywords = (data: { keywords: string[]; industry_tag: string }) =>
   api.post<unknown, ApiResponse<Keyword[]>>('/keywords', data)
 export const updateKeyword = (id: number, data: Partial<Keyword>) =>
   api.put<unknown, ApiResponse<Keyword>>(`/keywords/${id}`, data)
 export const deleteKeyword = (id: number) => api.delete<unknown, ApiResponse<null>>(`/keywords/${id}`)
-
-// Settings
-export const getSettings = () => api.get<unknown, ApiResponse<Setting[]>>('/config/settings')
-export const updateSetting = (key: string, value: string) =>
-  api.put<unknown, ApiResponse<Setting>>(`/config/settings/${key}`, { value })

+ 3 - 17
web/src/components/Layout.tsx

@@ -1,16 +1,9 @@
 import { useState, useEffect } from 'react'
 import { Layout, Menu, theme } from 'antd'
 import {
-  DashboardOutlined,
-  PlayCircleOutlined,
-  DatabaseOutlined,
   CheckCircleOutlined,
-  TeamOutlined,
-  GlobalOutlined,
-  NodeIndexOutlined,
+  PlayCircleOutlined,
   TagsOutlined,
-  SettingOutlined,
-  FileTextOutlined,
 } from '@ant-design/icons'
 import { useNavigate, useLocation } from 'react-router-dom'
 
@@ -21,16 +14,9 @@ interface LayoutProps {
 }
 
 const menuItems = [
-  { key: '/', icon: <DashboardOutlined />, label: '仪表盘' },
+  { key: '/merchants', icon: <CheckCircleOutlined />, label: '商户列表' },
   { key: '/tasks', icon: <PlayCircleOutlined />, label: '任务管理' },
-  { key: '/merchants/raw', icon: <DatabaseOutlined />, label: '原始商户' },
-  { key: '/merchants/clean', icon: <CheckCircleOutlined />, label: '清洗商户' },
-  { key: '/channels', icon: <TeamOutlined />, label: '频道列表' },
-  { key: '/nav-sites', icon: <GlobalOutlined />, label: '导航网页' },
-  { key: '/seeds', icon: <NodeIndexOutlined />, label: '种子管理' },
   { key: '/keywords', icon: <TagsOutlined />, label: '关键词管理' },
-  { key: '/settings', icon: <SettingOutlined />, label: '系统配置' },
-  { key: '/logs', icon: <FileTextOutlined />, label: '实时日志' },
 ]
 
 export default function AppLayout({ children }: LayoutProps) {
@@ -88,7 +74,7 @@ export default function AppLayout({ children }: LayoutProps) {
               whiteSpace: 'nowrap',
             }}
           >
-            商户查找系统
+            商户采集系统
           </span>
         </div>
         <Menu

+ 32 - 72
web/src/components/TaskControl.tsx

@@ -2,10 +2,6 @@ import { useState } from 'react'
 import {
   Button,
   Modal,
-  Switch,
-  InputNumber,
-  Form,
-  Progress,
   Space,
   Typography,
   message,
@@ -19,21 +15,17 @@ import { useAppStore } from '../store'
 
 const { Text } = Typography
 
-interface TaskButton {
-  type: StartTaskRequest['task_type']
+interface PluginButton {
+  name: string
   label: string
   isPrimary?: boolean
 }
 
-const taskButtons: TaskButton[] = [
-  { type: 'full', label: '完整流水线', isPrimary: true },
-  { type: 'discover', label: '频道发现' },
-  { type: 'search', label: '搜索引擎' },
-  { type: 'github', label: 'GitHub采集' },
-  { type: 'scrape', label: 'TG采集' },
-  { type: 'crawl', label: '网页爬取' },
-  { type: 'clean', label: '数据清洗' },
-  { type: 'score', label: '评分' },
+const pluginButtons: PluginButton[] = [
+  { name: 'web_collector', label: '网页采集', isPrimary: true },
+  { name: 'tg_collector', label: 'TG采集' },
+  { name: 'github_collector', label: 'GitHub采集' },
+  { name: 'clean', label: '清洗' },
 ]
 
 interface TaskControlProps {
@@ -43,44 +35,40 @@ interface TaskControlProps {
 export default function TaskControl({ onTaskStarted }: TaskControlProps) {
   const { runningTask, setRunningTask } = useAppStore()
   const [modalOpen, setModalOpen] = useState(false)
-  const [selectedTask, setSelectedTask] = useState<TaskButton | null>(null)
-  const [testMode, setTestMode] = useState(false)
-  const [itemLimit, setItemLimit] = useState(10)
-  const [messageLimit, setMessageLimit] = useState(100)
+  const [selectedPlugin, setSelectedPlugin] = useState<PluginButton | null>(null)
   const [loading, setLoading] = useState(false)
   const [stopLoading, setStopLoading] = useState(false)
 
-  const handleTaskClick = (task: TaskButton) => {
-    setSelectedTask(task)
-    setTestMode(false)
+  const handleClick = (plugin: PluginButton) => {
+    setSelectedPlugin(plugin)
     setModalOpen(true)
   }
 
   const handleConfirm = async () => {
-    if (!selectedTask) return
+    if (!selectedPlugin) return
     setLoading(true)
     try {
-      const req: StartTaskRequest = { task_type: selectedTask.type }
-      if (testMode) {
-        req.test_run = { item_limit: itemLimit, message_limit: messageLimit }
+      const req: StartTaskRequest = {
+        plugin_name: selectedPlugin.name,
+        auto_clean: selectedPlugin.name !== 'clean',
       }
       const res = await startTask(req)
       setRunningTask(res.data)
-      message.success(`任务「${selectedTask.label}」已启动`)
+      message.success(`任务「${selectedPlugin.label}」已启动`)
       setModalOpen(false)
       onTaskStarted?.()
-    } catch (err) {
+    } catch {
       message.error('启动任务失败')
     } finally {
       setLoading(false)
     }
   }
 
-  const handleStop = async (force = false) => {
+  const handleStop = async () => {
     if (!runningTask) return
     setStopLoading(true)
     try {
-      await stopTask(runningTask.id, force)
+      await stopTask(runningTask.id)
       setRunningTask(null)
       message.success('任务已停止')
     } catch {
@@ -96,12 +84,12 @@ export default function TaskControl({ onTaskStarted }: TaskControlProps) {
     <Card title="任务控制" style={{ marginBottom: 24 }}>
       <Space direction="vertical" style={{ width: '100%' }} size="middle">
         <Row gutter={[8, 8]}>
-          {taskButtons.map((btn) => (
-            <Col key={btn.type}>
+          {pluginButtons.map((btn) => (
+            <Col key={btn.name}>
               <Button
                 type={btn.isPrimary ? 'primary' : 'default'}
                 disabled={isRunning}
-                onClick={() => handleTaskClick(btn)}
+                onClick={() => handleClick(btn)}
               >
                 {btn.label}
               </Button>
@@ -113,7 +101,7 @@ export default function TaskControl({ onTaskStarted }: TaskControlProps) {
                 danger
                 icon={<StopOutlined />}
                 loading={stopLoading}
-                onClick={() => handleStop(false)}
+                onClick={handleStop}
               >
                 停止任务
               </Button>
@@ -123,15 +111,11 @@ export default function TaskControl({ onTaskStarted }: TaskControlProps) {
 
         {isRunning && runningTask ? (
           <div>
-            <Space style={{ marginBottom: 8 }}>
+            <Space>
               <Text strong>当前任务:</Text>
-              <Text>{runningTask.task_type}</Text>
-              <Text type="secondary">阶段:{runningTask.current_phase || '初始化中'}</Text>
+              <Text>{runningTask.plugin_name || runningTask.task_type}</Text>
+              <Text type="secondary">状态:{runningTask.status}</Text>
             </Space>
-            <Progress
-              percent={runningTask.progress ?? 0}
-              status="active"
-            />
           </div>
         ) : (
           <Text type="secondary">暂无运行中的任务</Text>
@@ -139,7 +123,7 @@ export default function TaskControl({ onTaskStarted }: TaskControlProps) {
       </Space>
 
       <Modal
-        title={`启动任务:${selectedTask?.label}`}
+        title={`启动任务:${selectedPlugin?.label}`}
         open={modalOpen}
         onOk={handleConfirm}
         onCancel={() => setModalOpen(false)}
@@ -147,36 +131,12 @@ export default function TaskControl({ onTaskStarted }: TaskControlProps) {
         okText="确认启动"
         cancelText="取消"
       >
-        <Form layout="vertical" style={{ marginTop: 16 }}>
-          <Form.Item label="测试模式">
-            <Switch
-              checked={testMode}
-              onChange={setTestMode}
-              checkedChildren="开"
-              unCheckedChildren="关"
-            />
-          </Form.Item>
-          {testMode && (
-            <>
-              <Form.Item label="Item 限制">
-                <InputNumber
-                  min={1}
-                  value={itemLimit}
-                  onChange={(v) => setItemLimit(v ?? 10)}
-                  style={{ width: '100%' }}
-                />
-              </Form.Item>
-              <Form.Item label="Message 限制">
-                <InputNumber
-                  min={1}
-                  value={messageLimit}
-                  onChange={(v) => setMessageLimit(v ?? 100)}
-                  style={{ width: '100%' }}
-                />
-              </Form.Item>
-            </>
-          )}
-        </Form>
+        <p>
+          {selectedPlugin?.name === 'clean'
+            ? '将对所有未处理的原始商户数据进行清洗、去重和分等级。'
+            : `将启动「${selectedPlugin?.label}」插件进行商户采集,采集完成后自动执行清洗流程。`
+          }
+        </p>
       </Modal>
     </Card>
   )

+ 0 - 192
web/src/pages/Channels.tsx

@@ -1,192 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import { Table, Tag, Select, message, Row, Col, Card, Statistic, Badge } from 'antd'
-import { getChannels, getChannelsStats, type Channel, type ChannelStats } from '../api'
-
-const { Option } = Select
-
-function formatDateTime(dateStr: string) {
-  return new Date(dateStr).toLocaleString('zh-CN')
-}
-
-const statusOptions = [
-  { label: '全部', value: '' },
-  { label: 'active', value: 'active' },
-  { label: 'inactive', value: 'inactive' },
-  { label: 'banned', value: 'banned' },
-]
-
-const sourceOptions = [
-  { label: '全部', value: '' },
-  { label: 'seed', value: 'seed' },
-  { label: 'discover', value: 'discover' },
-  { label: 'search', value: 'search' },
-]
-
-const sourceColors: Record<string, string> = {
-  seed: 'blue',
-  discover: 'green',
-  search: 'purple',
-}
-
-const statusBadgeMap: Record<string, 'success' | 'error' | 'warning' | 'default'> = {
-  active: 'success',
-  inactive: 'default',
-  banned: 'error',
-}
-
-export default function Channels() {
-  const [data, setData] = useState<Channel[]>([])
-  const [total, setTotal] = useState(0)
-  const [page, setPage] = useState(1)
-  const [loading, setLoading] = useState(false)
-  const [stats, setStats] = useState<ChannelStats | null>(null)
-  const [status, setStatus] = useState('')
-  const [source, setSource] = useState('')
-
-  const fetchData = useCallback(async (currentPage = 1) => {
-    setLoading(true)
-    try {
-      const params: Record<string, unknown> = { page: currentPage, page_size: 20 }
-      if (status) params.status = status
-      if (source) params.source = source
-      const res = await getChannels(params)
-      setData(res.data.items)
-      setTotal(res.data.total)
-    } catch {
-      message.error('获取频道列表失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [status, source])
-
-  const fetchStats = useCallback(async () => {
-    try {
-      const res = await getChannelsStats()
-      setStats(res.data)
-    } catch {
-      // ignore stats error
-    }
-  }, [])
-
-  useEffect(() => {
-    fetchStats()
-  }, [fetchStats])
-
-  useEffect(() => {
-    setPage(1)
-    fetchData(1)
-  }, [status, source, fetchData])
-
-  const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
-    {
-      title: '用户名',
-      dataIndex: 'username',
-      key: 'username',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '标题',
-      dataIndex: 'title',
-      key: 'title',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '成员数',
-      dataIndex: 'member_count',
-      key: 'member_count',
-      render: (v: number) => v?.toLocaleString() ?? '-',
-    },
-    {
-      title: '来源',
-      dataIndex: 'source',
-      key: 'source',
-      render: (v: string) => <Tag color={sourceColors[v] ?? 'default'}>{v}</Tag>,
-    },
-    {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
-      render: (v: string) => <Badge status={statusBadgeMap[v] ?? 'default'} text={v} />,
-    },
-    {
-      title: '相关度评分',
-      dataIndex: 'relevance_score',
-      key: 'relevance_score',
-      render: (v: number) => v?.toFixed(2) ?? '-',
-    },
-    {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
-    },
-  ]
-
-  return (
-    <div>
-      {stats && (
-        <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
-          {Object.entries(stats.source_distribution ?? {}).map(([key, val]) => (
-            <Col key={key}>
-              <Card size="small">
-                <Statistic title={`来源: ${key}`} value={val as number} />
-              </Card>
-            </Col>
-          ))}
-          {Object.entries(stats.status_distribution ?? {}).map(([key, val]) => (
-            <Col key={key}>
-              <Card size="small">
-                <Statistic title={`状态: ${key}`} value={val as number} />
-              </Card>
-            </Col>
-          ))}
-        </Row>
-      )}
-
-      <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
-        <Col>
-          <Select
-            style={{ width: 160 }}
-            value={status}
-            onChange={setStatus}
-            placeholder="状态筛选"
-          >
-            {statusOptions.map((o) => (
-              <Option key={o.value} value={o.value}>{o.label}</Option>
-            ))}
-          </Select>
-        </Col>
-        <Col>
-          <Select
-            style={{ width: 160 }}
-            value={source}
-            onChange={setSource}
-            placeholder="来源筛选"
-          >
-            {sourceOptions.map((o) => (
-              <Option key={o.value} value={o.value}>{o.label}</Option>
-            ))}
-          </Select>
-        </Col>
-      </Row>
-
-      <Table
-        dataSource={data}
-        columns={columns}
-        rowKey="id"
-        loading={loading}
-        pagination={{
-          current: page,
-          pageSize: 20,
-          total,
-          onChange: (p) => {
-            setPage(p)
-            fetchData(p)
-          },
-          showTotal: (t) => `共 ${t} 条`,
-        }}
-      />
-    </div>
-  )
-}

+ 0 - 171
web/src/pages/Dashboard.tsx

@@ -1,171 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import { Card, Col, Row, Statistic, Table, Progress, Typography, Tag, message, Space } from 'antd'
-import {
-  TeamOutlined,
-  DatabaseOutlined,
-  CheckCircleOutlined,
-  SafetyCertificateOutlined,
-  GlobalOutlined,
-  NodeIndexOutlined,
-} from '@ant-design/icons'
-import { getDashboard, type DashboardData, type Task } from '../api'
-import { useAppStore } from '../store'
-
-const { Text } = Typography
-
-const taskStatusColor: Record<string, string> = {
-  running: 'processing',
-  completed: 'success',
-  failed: 'error',
-  stopped: 'warning',
-  pending: 'default',
-}
-
-const taskTypeColor: Record<string, string> = {
-  full: 'purple',
-  discover: 'blue',
-  search: 'cyan',
-  github: 'geekblue',
-  scrape: 'orange',
-  crawl: 'green',
-  clean: 'lime',
-  score: 'gold',
-}
-
-function formatDuration(start: string, end: string | null) {
-  if (!end) return '-'
-  const diff = Math.floor((new Date(end).getTime() - new Date(start).getTime()) / 1000)
-  if (diff < 60) return `${diff}秒`
-  if (diff < 3600) return `${Math.floor(diff / 60)}分${diff % 60}秒`
-  return `${Math.floor(diff / 3600)}时${Math.floor((diff % 3600) / 60)}分`
-}
-
-function formatDateTime(dateStr: string) {
-  const d = new Date(dateStr)
-  const mm = String(d.getMonth() + 1).padStart(2, '0')
-  const dd = String(d.getDate()).padStart(2, '0')
-  const hh = String(d.getHours()).padStart(2, '0')
-  const min = String(d.getMinutes()).padStart(2, '0')
-  const ss = String(d.getSeconds()).padStart(2, '0')
-  return `${mm}-${dd} ${hh}:${min}:${ss}`
-}
-
-export default function Dashboard() {
-  const [data, setData] = useState<DashboardData | null>(null)
-  const [loading, setLoading] = useState(true)
-  const { setRunningTask } = useAppStore()
-
-  const fetchData = useCallback(async () => {
-    try {
-      const res = await getDashboard()
-      setData(res.data)
-      setRunningTask(res.data.running_task)
-    } catch {
-      message.error('获取仪表盘数据失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [setRunningTask])
-
-  useEffect(() => {
-    fetchData()
-    const timer = setInterval(fetchData, 5000)
-    return () => clearInterval(timer)
-  }, [fetchData])
-
-  const recentTaskColumns = [
-    {
-      title: '任务类型',
-      dataIndex: 'task_type',
-      key: 'task_type',
-      render: (type: string) => <Tag color={taskTypeColor[type] ?? 'default'}>{type}</Tag>,
-    },
-    {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
-      render: (status: string) => (
-        <Tag color={taskStatusColor[status] ?? 'default'}>{status}</Tag>
-      ),
-    },
-    {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
-    },
-    {
-      title: '耗时',
-      key: 'duration',
-      render: (_: unknown, record: Task) =>
-        formatDuration(record.created_at, record.completed_at),
-    },
-  ]
-
-  const stats = [
-    { title: '频道总数', value: data?.channel_count ?? 0, icon: <TeamOutlined />, color: '#1890ff' },
-    { title: '原始商户', value: data?.merchant_raw_count ?? 0, icon: <DatabaseOutlined />, color: '#52c41a' },
-    { title: '清洗商户', value: data?.merchant_clean_count ?? 0, icon: <CheckCircleOutlined />, color: '#13c2c2' },
-    { title: '有效商户', value: data?.merchant_valid_count ?? 0, icon: <SafetyCertificateOutlined />, color: '#722ed1' },
-    { title: '导航网页', value: data?.nav_site_count ?? 0, icon: <GlobalOutlined />, color: '#fa8c16' },
-    { title: '种子数量', value: data?.seed_count ?? 0, icon: <NodeIndexOutlined />, color: '#eb2f96' },
-  ]
-
-  return (
-    <div>
-      <Row gutter={[16, 16]}>
-        {stats.map((stat) => (
-          <Col key={stat.title} xs={24} sm={12} md={8} lg={4}>
-            <Card loading={loading}>
-              <Statistic
-                title={
-                  <Space>
-                    <span style={{ color: stat.color }}>{stat.icon}</span>
-                    {stat.title}
-                  </Space>
-                }
-                value={stat.value}
-                valueStyle={{ color: stat.color }}
-              />
-            </Card>
-          </Col>
-        ))}
-      </Row>
-
-      <Row gutter={[16, 16]} style={{ marginTop: 16 }}>
-        <Col span={24}>
-          <Card title="当前任务" loading={loading}>
-            {data?.running_task ? (
-              <div>
-                <Space style={{ marginBottom: 12 }}>
-                  <Tag color="processing">{data.running_task.task_type}</Tag>
-                  <Text>阶段:{data.running_task.current_phase || '初始化中'}</Text>
-                </Space>
-                <Progress
-                  percent={data.running_task.progress ?? 0}
-                  status="active"
-                />
-              </div>
-            ) : (
-              <Text type="secondary">暂无运行中的任务</Text>
-            )}
-          </Card>
-        </Col>
-      </Row>
-
-      <Row gutter={[16, 16]} style={{ marginTop: 16 }}>
-        <Col span={24}>
-          <Card title="最近任务" loading={loading}>
-            <Table
-              dataSource={data?.recent_tasks ?? []}
-              columns={recentTaskColumns}
-              rowKey="id"
-              pagination={false}
-              size="small"
-            />
-          </Card>
-        </Col>
-      </Row>
-    </div>
-  )
-}

+ 74 - 53
web/src/pages/Keywords.tsx

@@ -11,6 +11,8 @@ import {
   message,
   Popconfirm,
   Tag,
+  Row,
+  Col,
 } from 'antd'
 import { PlusOutlined, DeleteOutlined } from '@ant-design/icons'
 import { getKeywords, createKeywords, updateKeyword, deleteKeyword, type Keyword } from '../api'
@@ -22,25 +24,21 @@ function formatDateTime(dateStr: string) {
   return new Date(dateStr).toLocaleString('zh-CN')
 }
 
-const categoryColors: Record<string, string> = {
-  product: 'blue',
-  service: 'green',
-  location: 'orange',
-  brand: 'purple',
-  other: 'default',
+const tagColors: Record<string, string> = {
+  seed: 'volcano',
+  '机场': 'blue',
+  VPN: 'green',
 }
 
-const categoryOptions = [
-  { label: 'product', value: 'product' },
-  { label: 'service', value: 'service' },
-  { label: 'location', value: 'location' },
-  { label: 'brand', value: 'brand' },
-  { label: 'other', value: 'other' },
+const industryOptions = [
+  { label: '搜索关键词', value: '机场' },
+  { label: '种子频道', value: 'seed' },
+  { label: 'VPN', value: 'VPN' },
 ]
 
 interface BatchFormValues {
   keywords_text: string
-  category: string
+  industry_tag: string
 }
 
 export default function Keywords() {
@@ -50,12 +48,15 @@ export default function Keywords() {
   const [loading, setLoading] = useState(false)
   const [modalOpen, setModalOpen] = useState(false)
   const [saving, setSaving] = useState(false)
+  const [filterTag, setFilterTag] = useState('')
   const [form] = Form.useForm<BatchFormValues>()
 
   const fetchData = useCallback(async (currentPage = 1) => {
     setLoading(true)
     try {
-      const res = await getKeywords({ page: currentPage, page_size: 20 })
+      const params: Record<string, unknown> = { page: currentPage, page_size: 20 }
+      if (filterTag) params.industry_tag = filterTag
+      const res = await getKeywords(params)
       setData(res.data.items)
       setTotal(res.data.total)
     } catch {
@@ -63,11 +64,12 @@ export default function Keywords() {
     } finally {
       setLoading(false)
     }
-  }, [])
+  }, [filterTag])
 
   useEffect(() => {
-    fetchData(page)
-  }, [page, fetchData])
+    setPage(1)
+    fetchData(1)
+  }, [filterTag, fetchData])
 
   const handleBatchAdd = () => {
     form.resetFields()
@@ -86,8 +88,8 @@ export default function Keywords() {
         return
       }
       setSaving(true)
-      await createKeywords({ keywords, category: values.category })
-      message.success(`成功添加 ${keywords.length} 个关键词`)
+      await createKeywords({ keywords, industry_tag: values.industry_tag })
+      message.success(`成功添加 ${keywords.length} 个条目`)
       setModalOpen(false)
       fetchData(page)
     } catch (err) {
@@ -108,9 +110,9 @@ export default function Keywords() {
     }
   }
 
-  const handleStatusToggle = async (record: Keyword, checked: boolean) => {
+  const handleToggle = async (record: Keyword, checked: boolean) => {
     try {
-      await updateKeyword(record.id, { status: checked })
+      await updateKeyword(record.id, { enabled: checked })
       message.success('状态已更新')
       fetchData(page)
     } catch {
@@ -119,26 +121,26 @@ export default function Keywords() {
   }
 
   const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
+    { title: 'ID', dataIndex: 'id', key: 'id', width: 70 },
     {
-      title: '关键词',
+      title: '关键词/频道名',
       dataIndex: 'keyword',
       key: 'keyword',
     },
     {
-      title: '类',
-      dataIndex: 'category',
-      key: 'category',
-      render: (v: string) => <Tag color={categoryColors[v] ?? 'default'}>{v}</Tag>,
+      title: '类',
+      dataIndex: 'industry_tag',
+      key: 'industry_tag',
+      render: (v: string) => <Tag color={tagColors[v] ?? 'default'}>{v || '未分类'}</Tag>,
     },
     {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
+      title: '启用',
+      dataIndex: 'enabled',
+      key: 'enabled',
       render: (v: boolean, record: Keyword) => (
         <Switch
           checked={v}
-          onChange={(checked) => handleStatusToggle(record, checked)}
+          onChange={(checked) => handleToggle(record, checked)}
           checkedChildren="启用"
           unCheckedChildren="禁用"
         />
@@ -156,7 +158,7 @@ export default function Keywords() {
       render: (_: unknown, record: Keyword) => (
         <Space>
           <Popconfirm
-            title="确认删除该关键词?"
+            title="确认删除?"
             onConfirm={() => handleDelete(record.id)}
             okText="确认"
             cancelText="取消"
@@ -170,11 +172,27 @@ export default function Keywords() {
 
   return (
     <div>
-      <div style={{ marginBottom: 16 }}>
-        <Button type="primary" icon={<PlusOutlined />} onClick={handleBatchAdd}>
-          批量添加关键词
-        </Button>
-      </div>
+      <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
+        <Col>
+          <Button type="primary" icon={<PlusOutlined />} onClick={handleBatchAdd}>
+            批量添加
+          </Button>
+        </Col>
+        <Col>
+          <Select
+            style={{ width: 160 }}
+            value={filterTag}
+            onChange={setFilterTag}
+            placeholder="按类型筛选"
+            allowClear
+          >
+            <Option value="">全部</Option>
+            <Option value="seed">种子频道</Option>
+            <Option value="机场">机场</Option>
+            <Option value="VPN">VPN</Option>
+          </Select>
+        </Col>
+      </Row>
 
       <Table
         dataSource={data}
@@ -185,13 +203,16 @@ export default function Keywords() {
           current: page,
           pageSize: 20,
           total,
-          onChange: (p) => setPage(p),
+          onChange: (p) => {
+            setPage(p)
+            fetchData(p)
+          },
           showTotal: (t) => `共 ${t} 条`,
         }}
       />
 
       <Modal
-        title="批量添加关键词"
+        title="批量添加"
         open={modalOpen}
         onOk={handleSave}
         onCancel={() => setModalOpen(false)}
@@ -200,27 +221,27 @@ export default function Keywords() {
         cancelText="取消"
       >
         <Form form={form} layout="vertical" style={{ marginTop: 16 }}>
+          <Form.Item
+            name="industry_tag"
+            label="类型"
+            rules={[{ required: true, message: '请选择类型' }]}
+          >
+            <Select placeholder="选择类型">
+              {industryOptions.map((o) => (
+                <Option key={o.value} value={o.value}>{o.label}</Option>
+              ))}
+            </Select>
+          </Form.Item>
           <Form.Item
             name="keywords_text"
-            label="关键词列表(每行一个)"
-            rules={[{ required: true, message: '请输入关键词' }]}
+            label="内容(每行一个)"
+            rules={[{ required: true, message: '请输入内容' }]}
           >
             <TextArea
               rows={8}
-              placeholder="每行输入一个关键词,例如:&#10;外卖&#10;超市&#10;便利店"
+              placeholder="种子频道填频道名(如 bbs3000),关键词填搜索词(如 机场推荐)&#10;每行一个"
             />
           </Form.Item>
-          <Form.Item
-            name="category"
-            label="分类"
-            rules={[{ required: true, message: '请选择分类' }]}
-          >
-            <Select placeholder="选择分类">
-              {categoryOptions.map((o) => (
-                <Option key={o.value} value={o.value}>{o.label}</Option>
-              ))}
-            </Select>
-          </Form.Item>
         </Form>
       </Modal>
     </div>

+ 0 - 194
web/src/pages/Logs.tsx

@@ -1,194 +0,0 @@
-import { useEffect, useRef, useState, useCallback } from 'react'
-import { Card, Typography, Badge, Space, Button, Select } from 'antd'
-import { ClearOutlined, PauseOutlined, PlayCircleOutlined } from '@ant-design/icons'
-import { useAppStore } from '../store'
-
-const { Text } = Typography
-const { Option } = Select
-
-interface LogLine {
-  id: number
-  text: string
-  timestamp: string
-}
-
-let logIdCounter = 0
-
-export default function Logs() {
-  const { runningTask } = useAppStore()
-  const [logs, setLogs] = useState<LogLine[]>([])
-  const [connected, setConnected] = useState(false)
-  const [paused, setPaused] = useState(false)
-  const [connectedTaskId, setConnectedTaskId] = useState<number | null>(null)
-  const wsRef = useRef<WebSocket | null>(null)
-  const logContainerRef = useRef<HTMLDivElement>(null)
-  const pausedRef = useRef(false)
-
-  pausedRef.current = paused
-
-  const appendLog = useCallback((text: string) => {
-    if (pausedRef.current) return
-    const now = new Date()
-    const ts = `${String(now.getHours()).padStart(2, '0')}:${String(now.getMinutes()).padStart(2, '0')}:${String(now.getSeconds()).padStart(2, '0')}`
-    setLogs((prev) => {
-      const newLog: LogLine = { id: ++logIdCounter, text, timestamp: ts }
-      const next = [...prev, newLog]
-      return next.length > 2000 ? next.slice(next.length - 2000) : next
-    })
-  }, [])
-
-  const connect = useCallback((taskId: number) => {
-    if (wsRef.current) {
-      wsRef.current.close()
-    }
-
-    setConnectedTaskId(taskId)
-    setLogs([])
-
-    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
-    const host = window.location.host
-    const wsUrl = `${protocol}//${host}/api/v1/tasks/${taskId}/logs`
-
-    const ws = new WebSocket(wsUrl)
-    wsRef.current = ws
-
-    ws.onopen = () => {
-      setConnected(true)
-      appendLog(`[系统] 已连接到任务 #${taskId} 的日志流`)
-    }
-
-    ws.onmessage = (event: MessageEvent) => {
-      const text = typeof event.data === 'string' ? event.data : String(event.data)
-      appendLog(text)
-    }
-
-    ws.onerror = () => {
-      appendLog('[系统] WebSocket 连接错误')
-    }
-
-    ws.onclose = () => {
-      setConnected(false)
-      appendLog(`[系统] 与任务 #${taskId} 的连接已断开`)
-    }
-  }, [appendLog])
-
-  const runningTaskId = runningTask?.id
-
-  useEffect(() => {
-    if (runningTaskId !== undefined) {
-      connect(runningTaskId)
-    }
-    return () => {
-      wsRef.current?.close()
-    }
-  }, [runningTaskId, connect])
-
-  useEffect(() => {
-    if (!paused && logContainerRef.current) {
-      logContainerRef.current.scrollTop = logContainerRef.current.scrollHeight
-    }
-  }, [logs, paused])
-
-  const handleClear = () => {
-    setLogs([])
-  }
-
-  const handleTogglePause = () => {
-    setPaused((p) => !p)
-  }
-
-  const handleManualConnect = (taskId: number) => {
-    connect(taskId)
-  }
-
-  return (
-    <div>
-      <Card
-        title={
-          <Space>
-            <Badge
-              status={connected ? 'success' : 'error'}
-              text={connected ? '已连接' : '未连接'}
-            />
-            {connectedTaskId !== null && (
-              <Text type="secondary">任务 #{connectedTaskId}</Text>
-            )}
-            {!runningTask && (
-              <Space>
-                <Text type="secondary">手动连接任务 ID:</Text>
-                <Select
-                  style={{ width: 120 }}
-                  placeholder="输入任务ID"
-                  showSearch
-                  onSelect={(v: number) => handleManualConnect(v)}
-                >
-                  {[1, 2, 3, 4, 5].map((id) => (
-                    <Option key={id} value={id}>#{id}</Option>
-                  ))}
-                </Select>
-              </Space>
-            )}
-          </Space>
-        }
-        extra={
-          <Space>
-            <Button
-              size="small"
-              icon={paused ? <PlayCircleOutlined /> : <PauseOutlined />}
-              onClick={handleTogglePause}
-            >
-              {paused ? '继续' : '暂停'}
-            </Button>
-            <Button size="small" icon={<ClearOutlined />} onClick={handleClear}>
-              清空
-            </Button>
-          </Space>
-        }
-        styles={{ body: { padding: 0 } }}
-      >
-        <div
-          ref={logContainerRef}
-          style={{
-            background: '#1a1a1a',
-            color: '#00ff41',
-            fontFamily: 'Monaco, Menlo, "Courier New", monospace',
-            fontSize: 13,
-            lineHeight: '1.6',
-            height: 'calc(100vh - 220px)',
-            overflowY: 'auto',
-            padding: '12px 16px',
-            borderRadius: '0 0 8px 8px',
-          }}
-        >
-          {logs.length === 0 ? (
-            <span style={{ color: '#555' }}>
-              {runningTask
-                ? '正在连接日志流...'
-                : '当前没有运行中的任务。启动任务后日志将自动显示。'}
-            </span>
-          ) : (
-            logs.map((log) => (
-              <div key={log.id} style={{ display: 'flex', gap: 12 }}>
-                <span style={{ color: '#555', flexShrink: 0 }}>{log.timestamp}</span>
-                <span
-                  style={{
-                    color: log.text.includes('[ERROR]') || log.text.includes('[系统] WebSocket')
-                      ? '#ff6b6b'
-                      : log.text.includes('[WARN]')
-                      ? '#ffd93d'
-                      : log.text.includes('[系统]')
-                      ? '#74b9ff'
-                      : '#00ff41',
-                    wordBreak: 'break-all',
-                  }}
-                >
-                  {log.text}
-                </span>
-              </div>
-            ))
-          )}
-        </div>
-      </Card>
-    </div>
-  )
-}

+ 84 - 57
web/src/pages/MerchantsClean.tsx

@@ -1,5 +1,6 @@
 import { useEffect, useState, useCallback } from 'react'
-import { Table, Tag, Select, Input, InputNumber, Space, message, Row, Col, Progress, Badge } from 'antd'
+import { Table, Tag, Select, Input, Space, Button, message, Row, Col, Badge } from 'antd'
+import { DownloadOutlined } from '@ant-design/icons'
 import { getMerchantsClean, type MerchantClean } from '../api'
 
 const { Option } = Select
@@ -14,15 +15,26 @@ const statusOptions = [
   { label: 'invalid', value: 'invalid' },
   { label: 'bot', value: 'bot' },
   { label: 'duplicate', value: 'duplicate' },
-  { label: 'group', value: 'group' },
 ]
 
-const statusBadgeMap: Record<string, 'success' | 'error' | 'warning' | 'default' | 'processing'> = {
+const levelOptions = [
+  { label: '全部', value: '' },
+  { label: 'Hot', value: 'Hot' },
+  { label: 'Warm', value: 'Warm' },
+  { label: 'Cold', value: 'Cold' },
+]
+
+const levelColor: Record<string, string> = {
+  Hot: 'red',
+  Warm: 'orange',
+  Cold: 'blue',
+}
+
+const statusBadgeMap: Record<string, 'success' | 'error' | 'warning' | 'default'> = {
   valid: 'success',
   invalid: 'error',
   bot: 'warning',
   duplicate: 'default',
-  group: 'processing',
 }
 
 export default function MerchantsClean() {
@@ -31,8 +43,8 @@ export default function MerchantsClean() {
   const [page, setPage] = useState(1)
   const [loading, setLoading] = useState(false)
   const [status, setStatus] = useState('')
-  const [industry, setIndustry] = useState('')
-  const [minScore, setMinScore] = useState<number | null>(null)
+  const [level, setLevel] = useState('')
+  const [search, setSearch] = useState('')
 
   const fetchData = useCallback(async (currentPage = 1) => {
     setLoading(true)
@@ -40,34 +52,42 @@ export default function MerchantsClean() {
       const params: Record<string, unknown> = {
         page: currentPage,
         page_size: 20,
-        order_by: 'quality_score',
-        order_dir: 'desc',
+        sort: 'created_at',
+        order: 'desc',
       }
       if (status) params.status = status
-      if (industry) params.industry = industry
-      if (minScore !== null) params.min_quality_score = minScore
+      if (level) params.level = level
+      if (search) params.search = search
       const res = await getMerchantsClean(params)
       setData(res.data.items)
       setTotal(res.data.total)
     } catch {
-      message.error('获取清洗商户数据失败')
+      message.error('获取商户数据失败')
     } finally {
       setLoading(false)
     }
-  }, [status, industry, minScore])
+  }, [status, level, search])
 
   useEffect(() => {
     setPage(1)
     fetchData(1)
-  }, [status, industry, minScore, fetchData])
+  }, [status, level, search, fetchData])
+
+  const handleExport = () => {
+    let url = '/api/v1/merchants/clean/export?'
+    if (level) url += `level=${level}`
+    window.open(url, '_blank')
+  }
 
   const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
+    { title: 'ID', dataIndex: 'id', key: 'id', width: 70 },
     {
       title: 'TG用户名',
       dataIndex: 'tg_username',
       key: 'tg_username',
-      render: (v: string) => v || '-',
+      render: (v: string) => v ? (
+        <a href={`https://t.me/${v}`} target="_blank" rel="noreferrer">@{v}</a>
+      ) : '-',
     },
     {
       title: '商户名',
@@ -75,6 +95,12 @@ export default function MerchantsClean() {
       key: 'merchant_name',
       render: (v: string) => v || '-',
     },
+    {
+      title: '等级',
+      dataIndex: 'level',
+      key: 'level',
+      render: (v: string) => <Tag color={levelColor[v] ?? 'default'}>{v}</Tag>,
+    },
     {
       title: '状态',
       dataIndex: 'status',
@@ -84,35 +110,31 @@ export default function MerchantsClean() {
       ),
     },
     {
-      title: '行业',
-      dataIndex: 'industry',
-      key: 'industry',
-      render: (v: string) => v || '-',
+      title: '网站',
+      dataIndex: 'website',
+      key: 'website',
+      ellipsis: true,
+      render: (v: string) => v ? (
+        <a href={v} target="_blank" rel="noreferrer">{v}</a>
+      ) : '-',
     },
     {
-      title: '质量分',
-      dataIndex: 'quality_score',
-      key: 'quality_score',
-      width: 160,
-      render: (v: number) => (
-        <Progress
-          percent={Math.round((v ?? 0) * 10) / 10}
-          size="small"
-          strokeColor={v >= 70 ? '#52c41a' : v >= 40 ? '#faad14' : '#ff4d4f'}
-        />
-      ),
+      title: '邮箱',
+      dataIndex: 'email',
+      key: 'email',
+      render: (v: string) => v || '-',
     },
     {
-      title: 'Premium',
-      dataIndex: 'is_premium',
-      key: 'is_premium',
-      render: (v: boolean) => v ? <Tag color="gold">Premium</Tag> : <Tag>普通</Tag>,
+      title: '行业',
+      dataIndex: 'industry_tag',
+      key: 'industry_tag',
+      render: (v: string) => v || '-',
     },
     {
-      title: '活跃度',
-      dataIndex: 'activity_score',
-      key: 'activity_score',
-      render: (v: number) => v?.toFixed(2) ?? '-',
+      title: '来源数',
+      dataIndex: 'source_count',
+      key: 'source_count',
+      width: 80,
     },
     {
       title: '创建时间',
@@ -124,10 +146,10 @@ export default function MerchantsClean() {
 
   return (
     <div>
-      <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
+      <Row gutter={[16, 16]} style={{ marginBottom: 16 }} align="middle">
         <Col>
           <Select
-            style={{ width: 160 }}
+            style={{ width: 140 }}
             value={status}
             onChange={setStatus}
             placeholder="状态筛选"
@@ -138,26 +160,30 @@ export default function MerchantsClean() {
           </Select>
         </Col>
         <Col>
-          <Input
-            placeholder="行业"
-            value={industry}
-            onChange={(e) => setIndustry(e.target.value)}
-            style={{ width: 160 }}
+          <Select
+            style={{ width: 140 }}
+            value={level}
+            onChange={setLevel}
+            placeholder="等级筛选"
+          >
+            {levelOptions.map((o) => (
+              <Option key={o.value} value={o.value}>{o.label}</Option>
+            ))}
+          </Select>
+        </Col>
+        <Col>
+          <Input.Search
+            placeholder="搜索商户名/TG用户名"
+            value={search}
+            onChange={(e) => setSearch(e.target.value)}
+            style={{ width: 240 }}
             allowClear
           />
         </Col>
-        <Col>
-          <Space>
-            <span>最低质量分:</span>
-            <InputNumber
-              min={0}
-              max={100}
-              value={minScore}
-              onChange={(v) => setMinScore(v)}
-              placeholder="0-100"
-              style={{ width: 100 }}
-            />
-          </Space>
+        <Col flex="auto" style={{ textAlign: 'right' }}>
+          <Button icon={<DownloadOutlined />} onClick={handleExport}>
+            导出 CSV
+          </Button>
         </Col>
       </Row>
 
@@ -176,6 +202,7 @@ export default function MerchantsClean() {
           },
           showTotal: (t) => `共 ${t} 条`,
         }}
+        scroll={{ x: 1200 }}
       />
     </div>
   )

+ 0 - 162
web/src/pages/MerchantsRaw.tsx

@@ -1,162 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import { Table, Tag, Select, Input, Space, message, Row, Col } from 'antd'
-import { SearchOutlined } from '@ant-design/icons'
-import { getMerchantsRaw, type MerchantRaw } from '../api'
-
-const { Option } = Select
-
-function formatDateTime(dateStr: string) {
-  return new Date(dateStr).toLocaleString('zh-CN')
-}
-
-const statusOptions = [
-  { label: '全部', value: '' },
-  { label: 'raw', value: 'raw' },
-  { label: 'glm_parsed', value: 'glm_parsed' },
-]
-
-const sourceTypeOptions = [
-  { label: '全部', value: '' },
-  { label: 'telegram', value: 'telegram' },
-  { label: 'web', value: 'web' },
-  { label: 'github', value: 'github' },
-]
-
-export default function MerchantsRaw() {
-  const [data, setData] = useState<MerchantRaw[]>([])
-  const [total, setTotal] = useState(0)
-  const [page, setPage] = useState(1)
-  const [loading, setLoading] = useState(false)
-  const [status, setStatus] = useState('')
-  const [sourceType, setSourceType] = useState('')
-  const [searchText, setSearchText] = useState('')
-
-  const fetchData = useCallback(async (currentPage = 1) => {
-    setLoading(true)
-    try {
-      const params: Record<string, unknown> = {
-        page: currentPage,
-        page_size: 20,
-        order_by: 'created_at',
-        order_dir: 'desc',
-      }
-      if (status) params.status = status
-      if (sourceType) params.source_type = sourceType
-      if (searchText) params.tg_username = searchText
-      const res = await getMerchantsRaw(params)
-      setData(res.data.items)
-      setTotal(res.data.total)
-    } catch {
-      message.error('获取原始商户数据失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [status, sourceType, searchText])
-
-  useEffect(() => {
-    setPage(1)
-    fetchData(1)
-  }, [status, sourceType, searchText, fetchData])
-
-  const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
-    {
-      title: 'TG用户名',
-      dataIndex: 'tg_username',
-      key: 'tg_username',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '商户名',
-      dataIndex: 'merchant_name',
-      key: 'merchant_name',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '来源类型',
-      dataIndex: 'source_type',
-      key: 'source_type',
-      render: (v: string) => <Tag>{v}</Tag>,
-    },
-    {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
-      render: (v: string) => (
-        <Tag color={v === 'glm_parsed' ? 'green' : 'orange'}>{v}</Tag>
-      ),
-    },
-    {
-      title: '行业',
-      dataIndex: 'industry',
-      key: 'industry',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
-    },
-  ]
-
-  return (
-    <div>
-      <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
-        <Col>
-          <Select
-            style={{ width: 160 }}
-            value={status}
-            onChange={setStatus}
-            placeholder="状态筛选"
-          >
-            {statusOptions.map((o) => (
-              <Option key={o.value} value={o.value}>{o.label}</Option>
-            ))}
-          </Select>
-        </Col>
-        <Col>
-          <Select
-            style={{ width: 160 }}
-            value={sourceType}
-            onChange={setSourceType}
-            placeholder="来源类型"
-          >
-            {sourceTypeOptions.map((o) => (
-              <Option key={o.value} value={o.value}>{o.label}</Option>
-            ))}
-          </Select>
-        </Col>
-        <Col>
-          <Space.Compact>
-            <Input
-              prefix={<SearchOutlined />}
-              placeholder="搜索 TG 用户名"
-              value={searchText}
-              onChange={(e) => setSearchText(e.target.value)}
-              style={{ width: 220 }}
-              allowClear
-            />
-          </Space.Compact>
-        </Col>
-      </Row>
-
-      <Table
-        dataSource={data}
-        columns={columns}
-        rowKey="id"
-        loading={loading}
-        pagination={{
-          current: page,
-          pageSize: 20,
-          total,
-          onChange: (p) => {
-            setPage(p)
-            fetchData(p)
-          },
-          showTotal: (t) => `共 ${t} 条`,
-        }}
-      />
-    </div>
-  )
-}

+ 0 - 138
web/src/pages/NavSites.tsx

@@ -1,138 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import { Table, Tag, Select, message, Row, Col } from 'antd'
-import { getNavSites, type NavSite } from '../api'
-
-const { Option } = Select
-
-function formatDateTime(dateStr: string) {
-  return new Date(dateStr).toLocaleString('zh-CN')
-}
-
-const statusOptions = [
-  { label: '全部', value: '' },
-  { label: 'pending', value: 'pending' },
-  { label: 'crawled', value: 'crawled' },
-  { label: 'filtered', value: 'filtered' },
-  { label: 'error', value: 'error' },
-]
-
-const statusColors: Record<string, string> = {
-  pending: 'orange',
-  crawled: 'green',
-  filtered: 'red',
-  error: 'volcano',
-}
-
-export default function NavSites() {
-  const [data, setData] = useState<NavSite[]>([])
-  const [total, setTotal] = useState(0)
-  const [page, setPage] = useState(1)
-  const [loading, setLoading] = useState(false)
-  const [status, setStatus] = useState('')
-
-  const fetchData = useCallback(async (currentPage = 1) => {
-    setLoading(true)
-    try {
-      const params: Record<string, unknown> = { page: currentPage, page_size: 20 }
-      if (status) params.status = status
-      const res = await getNavSites(params)
-      setData(res.data.items)
-      setTotal(res.data.total)
-    } catch {
-      message.error('获取导航网页数据失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [status])
-
-  useEffect(() => {
-    setPage(1)
-    fetchData(1)
-  }, [status, fetchData])
-
-  const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
-    {
-      title: 'URL',
-      dataIndex: 'url',
-      key: 'url',
-      render: (v: string) => (
-        <a
-          href={v}
-          target="_blank"
-          rel="noreferrer"
-          style={{ maxWidth: 300, display: 'inline-block', overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}
-          title={v}
-        >
-          {v.length > 60 ? v.slice(0, 60) + '...' : v}
-        </a>
-      ),
-    },
-    {
-      title: '域名',
-      dataIndex: 'domain',
-      key: 'domain',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
-      render: (v: string) => <Tag color={statusColors[v] ?? 'default'}>{v}</Tag>,
-    },
-    {
-      title: '过滤原因',
-      dataIndex: 'filter_reason',
-      key: 'filter_reason',
-      render: (v: string | null) => v || '-',
-    },
-    {
-      title: '商户数量',
-      dataIndex: 'merchant_count',
-      key: 'merchant_count',
-      render: (v: number) => v ?? 0,
-    },
-    {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
-    },
-  ]
-
-  return (
-    <div>
-      <Row gutter={[16, 16]} style={{ marginBottom: 16 }}>
-        <Col>
-          <Select
-            style={{ width: 160 }}
-            value={status}
-            onChange={setStatus}
-            placeholder="状态筛选"
-          >
-            {statusOptions.map((o) => (
-              <Option key={o.value} value={o.value}>{o.label}</Option>
-            ))}
-          </Select>
-        </Col>
-      </Row>
-
-      <Table
-        dataSource={data}
-        columns={columns}
-        rowKey="id"
-        loading={loading}
-        pagination={{
-          current: page,
-          pageSize: 20,
-          total,
-          onChange: (p) => {
-            setPage(p)
-            fetchData(p)
-          },
-          showTotal: (t) => `共 ${t} 条`,
-        }}
-      />
-    </div>
-  )
-}

+ 0 - 217
web/src/pages/Seeds.tsx

@@ -1,217 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import {
-  Table,
-  Button,
-  Modal,
-  Form,
-  Input,
-  Switch,
-  Space,
-  message,
-  Popconfirm,
-} from 'antd'
-import { PlusOutlined, EditOutlined, DeleteOutlined } from '@ant-design/icons'
-import { getSeeds, createSeed, updateSeed, deleteSeed, type Seed } from '../api'
-
-function formatDateTime(dateStr: string) {
-  return new Date(dateStr).toLocaleString('zh-CN')
-}
-
-interface SeedFormValues {
-  channel_name: string
-  note: string
-  status: boolean
-}
-
-export default function Seeds() {
-  const [data, setData] = useState<Seed[]>([])
-  const [total, setTotal] = useState(0)
-  const [page, setPage] = useState(1)
-  const [loading, setLoading] = useState(false)
-  const [modalOpen, setModalOpen] = useState(false)
-  const [editingRecord, setEditingRecord] = useState<Seed | null>(null)
-  const [saving, setSaving] = useState(false)
-  const [form] = Form.useForm<SeedFormValues>()
-
-  const fetchData = useCallback(async (currentPage = 1) => {
-    setLoading(true)
-    try {
-      const res = await getSeeds({ page: currentPage, page_size: 20 })
-      setData(res.data.items)
-      setTotal(res.data.total)
-    } catch {
-      message.error('获取种子列表失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [])
-
-  useEffect(() => {
-    fetchData(page)
-  }, [page, fetchData])
-
-  const handleAdd = () => {
-    setEditingRecord(null)
-    form.resetFields()
-    form.setFieldsValue({ status: true })
-    setModalOpen(true)
-  }
-
-  const handleEdit = (record: Seed) => {
-    setEditingRecord(record)
-    form.setFieldsValue({
-      channel_name: record.channel_name,
-      note: record.note,
-      status: record.status,
-    })
-    setModalOpen(true)
-  }
-
-  const handleDelete = async (id: number) => {
-    try {
-      await deleteSeed(id)
-      message.success('删除成功')
-      fetchData(page)
-    } catch {
-      message.error('删除失败')
-    }
-  }
-
-  const handleSave = async () => {
-    try {
-      const values = await form.validateFields()
-      setSaving(true)
-      if (editingRecord) {
-        await updateSeed(editingRecord.id, values)
-        message.success('更新成功')
-      } else {
-        await createSeed(values)
-        message.success('添加成功')
-      }
-      setModalOpen(false)
-      fetchData(page)
-    } catch (err) {
-      if (err && typeof err === 'object' && 'errorFields' in err) return
-      message.error('保存失败')
-    } finally {
-      setSaving(false)
-    }
-  }
-
-  const handleStatusToggle = async (record: Seed, checked: boolean) => {
-    try {
-      await updateSeed(record.id, { status: checked })
-      message.success('状态已更新')
-      fetchData(page)
-    } catch {
-      message.error('状态更新失败')
-    }
-  }
-
-  const columns = [
-    { title: 'ID', dataIndex: 'id', key: 'id', width: 80 },
-    {
-      title: '频道名',
-      dataIndex: 'channel_name',
-      key: 'channel_name',
-    },
-    {
-      title: '状态',
-      dataIndex: 'status',
-      key: 'status',
-      render: (v: boolean, record: Seed) => (
-        <Switch
-          checked={v}
-          onChange={(checked) => handleStatusToggle(record, checked)}
-          checkedChildren="启用"
-          unCheckedChildren="禁用"
-        />
-      ),
-    },
-    {
-      title: '备注',
-      dataIndex: 'note',
-      key: 'note',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
-    },
-    {
-      title: '操作',
-      key: 'action',
-      render: (_: unknown, record: Seed) => (
-        <Space>
-          <Button
-            size="small"
-            icon={<EditOutlined />}
-            onClick={() => handleEdit(record)}
-          >
-            编辑
-          </Button>
-          <Popconfirm
-            title="确认删除该种子?"
-            onConfirm={() => handleDelete(record.id)}
-            okText="确认"
-            cancelText="取消"
-          >
-            <Button size="small" danger icon={<DeleteOutlined />}>删除</Button>
-          </Popconfirm>
-        </Space>
-      ),
-    },
-  ]
-
-  return (
-    <div>
-      <div style={{ marginBottom: 16 }}>
-        <Button type="primary" icon={<PlusOutlined />} onClick={handleAdd}>
-          添加种子
-        </Button>
-      </div>
-
-      <Table
-        dataSource={data}
-        columns={columns}
-        rowKey="id"
-        loading={loading}
-        pagination={{
-          current: page,
-          pageSize: 20,
-          total,
-          onChange: (p) => setPage(p),
-          showTotal: (t) => `共 ${t} 条`,
-        }}
-      />
-
-      <Modal
-        title={editingRecord ? '编辑种子' : '添加种子'}
-        open={modalOpen}
-        onOk={handleSave}
-        onCancel={() => setModalOpen(false)}
-        confirmLoading={saving}
-        okText="保存"
-        cancelText="取消"
-      >
-        <Form form={form} layout="vertical" style={{ marginTop: 16 }}>
-          <Form.Item
-            name="channel_name"
-            label="频道名"
-            rules={[{ required: true, message: '请输入频道名' }]}
-          >
-            <Input placeholder="例如:@channel_name" />
-          </Form.Item>
-          <Form.Item name="note" label="备注">
-            <Input.TextArea rows={3} placeholder="可选备注" />
-          </Form.Item>
-          <Form.Item name="status" label="状态" valuePropName="checked">
-            <Switch checkedChildren="启用" unCheckedChildren="禁用" />
-          </Form.Item>
-        </Form>
-      </Modal>
-    </div>
-  )
-}

+ 0 - 167
web/src/pages/Settings.tsx

@@ -1,167 +0,0 @@
-import { useEffect, useState, useCallback } from 'react'
-import { Tabs, Table, Input, Button, Space, message } from 'antd'
-import { EditOutlined, SaveOutlined, CloseOutlined } from '@ant-design/icons'
-import { getSettings, updateSetting, type Setting } from '../api'
-import Seeds from './Seeds'
-import Keywords from './Keywords'
-
-function formatDateTime(dateStr: string) {
-  return new Date(dateStr).toLocaleString('zh-CN')
-}
-
-function PipelineSettings() {
-  const [data, setData] = useState<Setting[]>([])
-  const [loading, setLoading] = useState(false)
-  const [editingKey, setEditingKey] = useState<string | null>(null)
-  const [editingValue, setEditingValue] = useState('')
-  const [saving, setSaving] = useState(false)
-
-  const fetchData = useCallback(async () => {
-    setLoading(true)
-    try {
-      const res = await getSettings()
-      setData(res.data)
-    } catch {
-      message.error('获取配置失败')
-    } finally {
-      setLoading(false)
-    }
-  }, [])
-
-  useEffect(() => {
-    fetchData()
-  }, [fetchData])
-
-  const handleEdit = (record: Setting) => {
-    setEditingKey(record.key)
-    setEditingValue(record.value)
-  }
-
-  const handleSave = async (key: string) => {
-    setSaving(true)
-    try {
-      await updateSetting(key, editingValue)
-      message.success('保存成功')
-      setEditingKey(null)
-      fetchData()
-    } catch {
-      message.error('保存失败')
-    } finally {
-      setSaving(false)
-    }
-  }
-
-  const handleCancel = () => {
-    setEditingKey(null)
-    setEditingValue('')
-  }
-
-  const columns = [
-    {
-      title: '配置项',
-      dataIndex: 'key',
-      key: 'key',
-      width: 240,
-    },
-    {
-      title: '值',
-      dataIndex: 'value',
-      key: 'value',
-      render: (v: string, record: Setting) => {
-        if (editingKey === record.key) {
-          return (
-            <Input
-              value={editingValue}
-              onChange={(e) => setEditingValue(e.target.value)}
-              onPressEnter={() => handleSave(record.key)}
-              size="small"
-            />
-          )
-        }
-        return <span>{v}</span>
-      },
-    },
-    {
-      title: '描述',
-      dataIndex: 'description',
-      key: 'description',
-      render: (v: string) => v || '-',
-    },
-    {
-      title: '更新时间',
-      dataIndex: 'updated_at',
-      key: 'updated_at',
-      render: (t: string) => t ? formatDateTime(t) : '-',
-    },
-    {
-      title: '操作',
-      key: 'action',
-      width: 150,
-      render: (_: unknown, record: Setting) => {
-        if (editingKey === record.key) {
-          return (
-            <Space>
-              <Button
-                size="small"
-                type="primary"
-                icon={<SaveOutlined />}
-                loading={saving}
-                onClick={() => handleSave(record.key)}
-              >
-                保存
-              </Button>
-              <Button size="small" icon={<CloseOutlined />} onClick={handleCancel}>
-                取消
-              </Button>
-            </Space>
-          )
-        }
-        return (
-          <Button
-            size="small"
-            icon={<EditOutlined />}
-            onClick={() => handleEdit(record)}
-          >
-            编辑
-          </Button>
-        )
-      },
-    },
-  ]
-
-  return (
-    <Table
-      dataSource={data}
-      columns={columns}
-      rowKey="key"
-      loading={loading}
-      pagination={false}
-    />
-  )
-}
-
-const tabItems = [
-  {
-    key: 'pipeline',
-    label: '流水线参数',
-    children: <PipelineSettings />,
-  },
-  {
-    key: 'seeds',
-    label: '种子管理',
-    children: <Seeds />,
-  },
-  {
-    key: 'keywords',
-    label: '关键词管理',
-    children: <Keywords />,
-  },
-]
-
-export default function Settings() {
-  return (
-    <div>
-      <Tabs items={tabItems} />
-    </div>
-  )
-}

+ 40 - 33
web/src/pages/Tasks.tsx

@@ -1,19 +1,15 @@
 import { useEffect, useState, useCallback } from 'react'
 import { Table, Tag, Button, message, Badge } from 'antd'
 import { StopOutlined } from '@ant-design/icons'
-import { getTasks, stopTask, type Task } from '../api'
+import { getTasks, stopTask, type TaskLog } from '../api'
 import { useAppStore } from '../store'
 import TaskControl from '../components/TaskControl'
 
-const taskTypeColor: Record<string, string> = {
-  full: 'purple',
-  discover: 'blue',
-  search: 'cyan',
-  github: 'geekblue',
-  scrape: 'orange',
-  crawl: 'green',
+const pluginColor: Record<string, string> = {
+  web_collector: 'green',
+  tg_collector: 'orange',
+  github_collector: 'geekblue',
   clean: 'lime',
-  score: 'gold',
 }
 
 const taskStatusBadge: Record<string, 'processing' | 'success' | 'error' | 'warning' | 'default'> = {
@@ -26,12 +22,11 @@ const taskStatusBadge: Record<string, 'processing' | 'success' | 'error' | 'warn
 
 function formatDateTime(dateStr: string | null) {
   if (!dateStr) return '-'
-  const d = new Date(dateStr)
-  return d.toLocaleString('zh-CN')
+  return new Date(dateStr).toLocaleString('zh-CN')
 }
 
 export default function Tasks() {
-  const [tasks, setTasks] = useState<Task[]>([])
+  const [tasks, setTasks] = useState<TaskLog[]>([])
   const [total, setTotal] = useState(0)
   const [page, setPage] = useState(1)
   const [loading, setLoading] = useState(false)
@@ -78,47 +73,58 @@ export default function Tasks() {
   const columns = [
     { title: 'ID', dataIndex: 'id', key: 'id', width: 70 },
     {
-      title: '任务类型',
+      title: '类型',
       dataIndex: 'task_type',
       key: 'task_type',
-      render: (type: string) => <Tag color={taskTypeColor[type] ?? 'default'}>{type}</Tag>,
+      render: (v: string) => <Tag>{v}</Tag>,
+    },
+    {
+      title: '插件',
+      dataIndex: 'plugin_name',
+      key: 'plugin_name',
+      render: (v: string) => v ? <Tag color={pluginColor[v] ?? 'default'}>{v}</Tag> : '-',
     },
     {
       title: '状态',
       dataIndex: 'status',
       key: 'status',
-      render: (status: string) => (
-        <Badge status={taskStatusBadge[status] ?? 'default'} text={status} />
+      render: (v: string) => (
+        <Badge status={taskStatusBadge[v] ?? 'default'} text={v} />
       ),
     },
     {
-      title: '参数预览',
-      dataIndex: 'params',
-      key: 'params',
-      render: (params: Record<string, unknown>) => (
-        <span style={{ fontSize: 12, color: '#666' }}>
-          {JSON.stringify(params).slice(0, 60)}
-          {JSON.stringify(params).length > 60 ? '...' : ''}
-        </span>
-      ),
+      title: '新增商户',
+      dataIndex: 'merchants_added',
+      key: 'merchants_added',
+    },
+    {
+      title: '错误数',
+      dataIndex: 'errors_count',
+      key: 'errors_count',
+    },
+    {
+      title: '详情',
+      dataIndex: 'detail',
+      key: 'detail',
       ellipsis: true,
+      render: (v: string) => v || '-',
     },
     {
-      title: '创建时间',
-      dataIndex: 'created_at',
-      key: 'created_at',
-      render: (t: string) => formatDateTime(t),
+      title: '开始时间',
+      dataIndex: 'started_at',
+      key: 'started_at',
+      render: (t: string | null) => formatDateTime(t),
     },
     {
-      title: '完成时间',
-      dataIndex: 'completed_at',
-      key: 'completed_at',
+      title: '结束时间',
+      dataIndex: 'finished_at',
+      key: 'finished_at',
       render: (t: string | null) => formatDateTime(t),
     },
     {
       title: '操作',
       key: 'action',
-      render: (_: unknown, record: Task) =>
+      render: (_: unknown, record: TaskLog) =>
         record.status === 'running' ? (
           <Button
             danger
@@ -148,6 +154,7 @@ export default function Tasks() {
           onChange: (p) => setPage(p),
           showTotal: (t) => `共 ${t} 条`,
         }}
+        scroll={{ x: 1100 }}
       />
     </div>
   )

+ 3 - 3
web/src/store/index.ts

@@ -1,9 +1,9 @@
 import { create } from 'zustand'
-import type { Task } from '../api'
+import type { TaskLog } from '../api'
 
 interface AppState {
-  runningTask: Task | null
-  setRunningTask: (task: Task | null) => void
+  runningTask: TaskLog | null
+  setRunningTask: (task: TaskLog | null) => void
 }
 
 export const useAppStore = create<AppState>((set) => ({