collector.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. package tgcollector
  2. import (
  3. "context"
  4. "fmt"
  5. "log"
  6. "regexp"
  7. "strings"
  8. "sync/atomic"
  9. "time"
  10. "spider/internal/extractor"
  11. "spider/internal/llm"
  12. "spider/internal/model"
  13. "spider/internal/plugin"
  14. proxypool "spider/internal/proxy"
  15. "spider/internal/store"
  16. "spider/internal/telegram"
  17. )
  18. // Collector implements plugin.Collector for TG channel collection.
  19. // Combines BFS channel discovery (from seeds) + message scraping.
  20. // AI: regex first, LLM fallback only for non-standard contact formats.
  21. type Collector struct {
  22. tgManager *telegram.AccountManager
  23. llmClient *llm.Client // can be nil
  24. store *store.Store
  25. stopped atomic.Bool
  26. logger plugin.TaskLogger
  27. proxyPool *proxypool.Pool
  28. }
  29. // New creates a new TG collector.
  30. func New(tgManager *telegram.AccountManager, llmClient *llm.Client, s *store.Store) *Collector {
  31. return &Collector{
  32. tgManager: tgManager,
  33. llmClient: llmClient,
  34. store: s,
  35. logger: plugin.NopLogger(),
  36. }
  37. }
  38. func (c *Collector) Name() string { return "tg_collector" }
  39. func (c *Collector) SetLogger(l plugin.TaskLogger) { c.logger = l }
  40. func (c *Collector) Stop() error {
  41. c.stopped.Store(true)
  42. return nil
  43. }
  44. // Run executes the TG collection pipeline:
  45. // 1. BFS discover channels from seeds
  46. // 2. Scrape messages from discovered channels
  47. // 3. Extract merchants via regex (+ LLM fallback)
  48. //
  49. // cfg keys:
  50. // - "seeds": []string — seed channel names
  51. // - "max_depth": int — BFS max depth (default 3)
  52. // - "max_channels": int — max channels to discover (default 500)
  53. // - "message_limit": int — messages per channel (default 500)
  54. func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
  55. c.stopped.Store(false)
  56. // Proxy pool rotation: rotate proxy on each account acquire
  57. var pool *proxypool.Pool
  58. if p, ok := cfg["proxy_pool"].(*proxypool.Pool); ok && p != nil {
  59. pool = p
  60. log.Printf("[tg_collector] using proxy pool with %d proxies", pool.Size())
  61. // Set initial proxy
  62. if next := pool.Next(); next != "" {
  63. c.tgManager.SetProxy(next)
  64. }
  65. } else if proxyURL, ok := cfg["proxy_url"].(string); ok && proxyURL != "" {
  66. log.Printf("[tg_collector] proxy configured: %s (pass to TG account manager)", proxyURL)
  67. c.tgManager.SetProxy(proxyURL)
  68. }
  69. c.proxyPool = pool
  70. if c.tgManager == nil {
  71. log.Println("[tg_collector] no TG account manager, skipping")
  72. return nil
  73. }
  74. seeds, _ := cfg["seeds"].([]string)
  75. if len(seeds) == 0 {
  76. log.Println("[tg_collector] no seeds provided")
  77. return nil
  78. }
  79. maxDepth := getIntCfg(cfg, "max_depth", 3)
  80. maxChannels := getIntCfg(cfg, "max_channels", 500)
  81. msgLimit := getIntCfg(cfg, "message_limit", 500)
  82. // Phase 1: BFS channel discovery
  83. t0 := time.Now()
  84. channels := c.discover(ctx, seeds, maxDepth, maxChannels)
  85. log.Printf("[tg_collector] discovered %d channels", len(channels))
  86. c.logger.LogSearchResult("BFS discover", 0, fmt.Sprintf("discovered %d channels from seeds", len(channels)), strings.Join(seeds, ","), fmt.Sprintf("duration: %v", time.Since(t0)))
  87. // Phase 2: Scrape each channel
  88. for i, ch := range channels {
  89. if c.stopped.Load() || ctx.Err() != nil {
  90. break
  91. }
  92. log.Printf("[tg_collector] scraping %d/%d: @%s", i+1, len(channels), ch)
  93. c.logger.LogCrawlPage("tg://"+ch, "", 0, fmt.Sprintf("scraping channel %d/%d", i+1, len(channels)), nil, 0, nil, 0)
  94. c.scrapeChannel(ctx, ch, msgLimit, callback)
  95. // Delay between channels
  96. select {
  97. case <-ctx.Done():
  98. return nil
  99. case <-time.After(5 * time.Second):
  100. }
  101. }
  102. log.Println("[tg_collector] done")
  103. return nil
  104. }
  // queueItem is one pending entry of the BFS discovery queue.
  // Field order matters: discover builds values with positional literals.
  type queueItem struct {
  	Username string // channel username to visit
  	Depth    int    // BFS depth; seeds are enqueued at 0
  	Source   string // how the channel was found: "seed" or "snowball"
  }
  110. func (c *Collector) discover(ctx context.Context, seeds []string, maxDepth, maxTotal int) []string {
  111. queue := make([]queueItem, 0, len(seeds))
  112. for _, s := range seeds {
  113. queue = append(queue, queueItem{Username: cleanUsername(s), Depth: 0, Source: "seed"})
  114. }
  115. visited := map[string]bool{}
  116. var result []string
  117. for len(queue) > 0 && len(result) < maxTotal {
  118. if c.stopped.Load() || ctx.Err() != nil {
  119. break
  120. }
  121. item := queue[0]
  122. queue = queue[1:]
  123. username := cleanUsername(item.Username)
  124. if username == "" || visited[username] {
  125. continue
  126. }
  127. visited[username] = true
  128. // Rotate proxy before each channel in BFS
  129. c.rotateProxy()
  130. acc, err := c.tgManager.Acquire(ctx)
  131. if err != nil {
  132. log.Printf("[tg_collector] no available TG account: %v, waiting 30s before retry", err)
  133. select {
  134. case <-ctx.Done():
  135. return result
  136. case <-time.After(30 * time.Second):
  137. }
  138. acc, err = c.tgManager.Acquire(ctx)
  139. if err != nil {
  140. log.Printf("[tg_collector] still no available TG account after retry: %v, stopping discovery", err)
  141. break
  142. }
  143. }
  144. if err := acc.Client.Connect(ctx); err != nil {
  145. log.Printf("[tg_collector] connect failed: %v", err)
  146. c.tgManager.Release(acc, 0)
  147. continue
  148. }
  149. _, err = acc.Client.GetChannelInfo(ctx, username)
  150. if err != nil {
  151. if fw, ok := err.(*telegram.FloodWaitError); ok {
  152. c.tgManager.HandleFloodWait(acc, fw.Seconds)
  153. } else {
  154. c.tgManager.Release(acc, 0)
  155. }
  156. continue
  157. }
  158. // Save channel to DB
  159. c.store.UpsertChannel(&model.Channel{
  160. Username: username,
  161. Source: item.Source,
  162. Status: "pending",
  163. })
  164. result = append(result, username)
  165. // BFS: read messages to find more channels
  166. if item.Depth < maxDepth {
  167. msgs, err := acc.Client.GetMessages(ctx, username, 0, 100)
  168. if err == nil {
  169. for _, msg := range msgs {
  170. if msg.ForwardFromChannel != "" {
  171. fwd := cleanUsername(msg.ForwardFromChannel)
  172. if fwd != "" && !visited[fwd] {
  173. queue = append(queue, queueItem{fwd, item.Depth + 1, "snowball"})
  174. }
  175. }
  176. for _, link := range msg.Links {
  177. name := extractUsernameFromLink(link)
  178. if name != "" && !visited[name] {
  179. queue = append(queue, queueItem{name, item.Depth + 1, "snowball"})
  180. }
  181. }
  182. }
  183. }
  184. }
  185. c.tgManager.Release(acc, 0)
  186. select {
  187. case <-ctx.Done():
  188. return result
  189. case <-time.After(5 * time.Second):
  190. }
  191. }
  192. return result
  193. }
  194. func (c *Collector) scrapeChannel(ctx context.Context, username string, msgLimit int, callback func(plugin.MerchantData)) {
  195. // Rotate proxy before each channel scrape
  196. c.rotateProxy()
  197. acc, err := c.tgManager.Acquire(ctx)
  198. if err != nil {
  199. log.Printf("[tg_collector] scrape %s: no available account: %v, waiting 30s", username, err)
  200. select {
  201. case <-ctx.Done():
  202. return
  203. case <-time.After(30 * time.Second):
  204. }
  205. acc, err = c.tgManager.Acquire(ctx)
  206. if err != nil {
  207. log.Printf("[tg_collector] scrape %s: still no account, skipping", username)
  208. return
  209. }
  210. }
  211. if err := acc.Client.Connect(ctx); err != nil {
  212. c.tgManager.Release(acc, 0)
  213. return
  214. }
  215. // Read pinned messages
  216. pinnedMsgs, _ := acc.Client.GetPinnedMessages(ctx, username)
  217. c.processMessages(ctx, pinnedMsgs, username, callback)
  218. // Read historical messages
  219. offsetID := 0
  220. fetched := 0
  221. for fetched < msgLimit {
  222. if c.stopped.Load() || ctx.Err() != nil {
  223. break
  224. }
  225. batchSize := 100
  226. if msgLimit-fetched < batchSize {
  227. batchSize = msgLimit - fetched
  228. }
  229. msgs, err := acc.Client.GetMessages(ctx, username, offsetID, batchSize)
  230. if err != nil {
  231. if fw, ok := err.(*telegram.FloodWaitError); ok {
  232. c.tgManager.HandleFloodWait(acc, fw.Seconds)
  233. acc = nil
  234. }
  235. break
  236. }
  237. if len(msgs) == 0 {
  238. break
  239. }
  240. c.processMessages(ctx, msgs, username, callback)
  241. offsetID = msgs[len(msgs)-1].ID
  242. fetched += len(msgs)
  243. }
  244. if acc != nil {
  245. c.tgManager.Release(acc, 0)
  246. }
  247. // Update channel status
  248. c.store.DB.Model(&model.Channel{}).Where("username = ?", username).
  249. Update("status", "scraped")
  250. }
  251. func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message, channelUsername string, callback func(plugin.MerchantData)) {
  252. for _, msg := range msgs {
  253. if msg.IsService || msg.Text == "" {
  254. continue
  255. }
  256. // Relaxed: allow messages with any contact info even without Chinese
  257. // Many merchants post in English or mixed language
  258. if !extractor.HasContact(msg.Text) {
  259. continue
  260. }
  261. // Regex first
  262. info := extractor.Extract(msg.Text)
  263. merchantName := ""
  264. industry := ""
  265. // LLM fallback only when regex found no TG username
  266. if info.TgUsername == "" && c.llmClient != nil {
  267. merchantInfo, err := c.llmClient.ParseMerchant(ctx, msg.Text)
  268. if err == nil && merchantInfo != nil && merchantInfo.TgUsername != "" {
  269. info.TgUsername = strings.TrimPrefix(merchantInfo.TgUsername, "@")
  270. if merchantInfo.Website != "" && info.Website == "" {
  271. info.Website = merchantInfo.Website
  272. }
  273. if merchantInfo.Email != "" && info.Email == "" {
  274. info.Email = merchantInfo.Email
  275. }
  276. if merchantInfo.Phone != "" && info.Phone == "" {
  277. info.Phone = merchantInfo.Phone
  278. }
  279. merchantName = extractor.CleanMerchantName(merchantInfo.MerchantName)
  280. industry = merchantInfo.Industry
  281. }
  282. }
  283. if info.TgUsername == "" {
  284. continue
  285. }
  286. md := plugin.MerchantData{
  287. TgUsername: info.TgUsername,
  288. TgLink: "https://t.me/" + info.TgUsername,
  289. MerchantName: merchantName,
  290. Website: info.Website,
  291. Email: info.Email,
  292. Phone: info.Phone,
  293. SourceType: "tg_channel",
  294. SourceName: channelUsername,
  295. SourceURL: "https://t.me/" + channelUsername,
  296. OriginalText: msg.Text,
  297. IndustryTag: industry,
  298. GroupUsername: channelUsername,
  299. }
  300. c.logger.LogMerchantFound(md, "tg_message_extract", 0, "tg://"+channelUsername)
  301. callback(md)
  302. }
  303. }
  304. // rotateProxy switches to the next proxy in the pool for the TG account manager.
  305. func (c *Collector) rotateProxy() {
  306. if c.proxyPool == nil {
  307. return
  308. }
  309. next := c.proxyPool.Next()
  310. if next != "" {
  311. c.tgManager.SetProxy(next)
  312. log.Printf("[tg_collector] rotated to proxy: %s (active: %d/%d)",
  313. next, c.proxyPool.ActiveCount(), c.proxyPool.Size())
  314. } else {
  315. log.Printf("[tg_collector] proxy pool exhausted, all proxies disabled")
  316. }
  317. }
  // cleanUsername normalizes a channel name: surrounding whitespace is trimmed
  // first, then a single leading "@" is removed.
  func cleanUsername(s string) string {
  	trimmed := strings.TrimSpace(s)
  	return strings.TrimPrefix(trimmed, "@")
  }
  // reUsernameFromLink matches a t.me / telegram.me link and captures the
  // public username in group 1.
  //
  //   - The host must not be the tail of a longer host label ("nott.me").
  //   - An optional "s/" segment (web preview links, t.me/s/<name>) is skipped.
  //   - The username is a letter followed by 4-31 letters, digits or
  //     underscores, and must end there: a longer run is rejected rather than
  //     cut off at 32 characters, which would yield a different name.
  var reUsernameFromLink = regexp.MustCompile(`(?i)(?:^|[^a-z0-9_-])t(?:elegram)?\.me/(?:s/)?([a-z][a-z0-9_]{4,31})(?:[^a-z0-9_]|$)`)

  // reservedLinkPaths lists first path segments of t.me links that look like a
  // username but denote a link type (invite, sticker set, share dialog, ...).
  var reservedLinkPaths = map[string]struct{}{
  	"joinchat":     {},
  	"addstickers":  {},
  	"addemoji":     {},
  	"addlist":      {},
  	"addtheme":     {},
  	"share":        {},
  	"proxy":        {},
  	"socks":        {},
  	"setlanguage":  {},
  	"login":        {},
  	"confirmphone": {},
  	"invoice":      {},
  	"giftcode":     {},
  	"contact":      {},
  	"boost":        {},
  }

  // extractUsernameFromLink returns the public username a t.me / telegram.me
  // link points to, or "" when the link has none: no match, a name outside the
  // 5-32 character range, or a reserved path such as "joinchat".
  func extractUsernameFromLink(link string) string {
  	m := reUsernameFromLink.FindStringSubmatch(link)
  	if len(m) < 2 {
  		return ""
  	}
  	if _, reserved := reservedLinkPaths[strings.ToLower(m[1])]; reserved {
  		return ""
  	}
  	return m[1]
  }
  // getIntCfg returns cfg[key] as an int, or def when the key is missing, holds
  // a non-numeric value, or holds a number that does not fit in an int.
  //
  // All built-in integer and float types are accepted, so values set from Go
  // code (int, int64, ...) and values decoded from JSON (float64) both work.
  // Floats are truncated toward zero; NaN and infinities yield def.
  func getIntCfg(cfg map[string]any, key string, def int) int {
  	const maxInt = int(^uint(0) >> 1)

  	fromInt := func(v int64) int {
  		if v > int64(maxInt) || v < -int64(maxInt)-1 {
  			return def
  		}
  		return int(v)
  	}
  	fromUint := func(v uint64) int {
  		if v > uint64(maxInt) {
  			return def
  		}
  		return int(v)
  	}
  	fromFloat := func(f float64) int {
  		// Written as a negated conjunction so that NaN, for which every
  		// comparison is false, is rejected together with out-of-range values.
  		if !(f > -float64(maxInt) && f < float64(maxInt)) {
  			return def
  		}
  		return int(f)
  	}

  	switch v := cfg[key].(type) {
  	case int:
  		return v
  	case int8:
  		return int(v)
  	case int16:
  		return int(v)
  	case int32:
  		return fromInt(int64(v))
  	case int64:
  		return fromInt(v)
  	case uint:
  		return fromUint(uint64(v))
  	case uint8:
  		return int(v)
  	case uint16:
  		return int(v)
  	case uint32:
  		return fromUint(uint64(v))
  	case uint64:
  		return fromUint(v)
  	case float32:
  		return fromFloat(float64(v))
  	case float64:
  		return fromFloat(v)
  	}
  	return def
  }