collector.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. package tgcollector
  2. import (
  3. "context"
  4. "log"
  5. "regexp"
  6. "strings"
  7. "sync/atomic"
  8. "time"
  9. "spider/internal/extractor"
  10. "spider/internal/llm"
  11. "spider/internal/model"
  12. "spider/internal/plugin"
  13. "spider/internal/store"
  14. "spider/internal/telegram"
  15. )
  16. // Collector implements plugin.Collector for TG channel collection.
  17. // Combines BFS channel discovery (from seeds) + message scraping.
  18. // AI: regex first, LLM fallback only for non-standard contact formats.
  19. type Collector struct {
  20. tgManager *telegram.AccountManager
  21. llmClient *llm.Client // can be nil
  22. store *store.Store
  23. stopped atomic.Bool
  24. }
  25. // New creates a new TG collector.
  26. func New(tgManager *telegram.AccountManager, llmClient *llm.Client, s *store.Store) *Collector {
  27. return &Collector{
  28. tgManager: tgManager,
  29. llmClient: llmClient,
  30. store: s,
  31. }
  32. }
  33. func (c *Collector) Name() string { return "tg_collector" }
  34. func (c *Collector) Stop() error {
  35. c.stopped.Store(true)
  36. return nil
  37. }
  38. // Run executes the TG collection pipeline:
  39. // 1. BFS discover channels from seeds
  40. // 2. Scrape messages from discovered channels
  41. // 3. Extract merchants via regex (+ LLM fallback)
  42. //
  43. // cfg keys:
  44. // - "seeds": []string — seed channel names
  45. // - "max_depth": int — BFS max depth (default 3)
  46. // - "max_channels": int — max channels to discover (default 500)
  47. // - "message_limit": int — messages per channel (default 500)
  48. func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
  49. c.stopped.Store(false)
  50. if c.tgManager == nil {
  51. log.Println("[tg_collector] no TG account manager, skipping")
  52. return nil
  53. }
  54. seeds, _ := cfg["seeds"].([]string)
  55. if len(seeds) == 0 {
  56. log.Println("[tg_collector] no seeds provided")
  57. return nil
  58. }
  59. maxDepth := getIntCfg(cfg, "max_depth", 3)
  60. maxChannels := getIntCfg(cfg, "max_channels", 500)
  61. msgLimit := getIntCfg(cfg, "message_limit", 500)
  62. // Phase 1: BFS channel discovery
  63. channels := c.discover(ctx, seeds, maxDepth, maxChannels)
  64. log.Printf("[tg_collector] discovered %d channels", len(channels))
  65. // Phase 2: Scrape each channel
  66. for i, ch := range channels {
  67. if c.stopped.Load() || ctx.Err() != nil {
  68. break
  69. }
  70. log.Printf("[tg_collector] scraping %d/%d: @%s", i+1, len(channels), ch)
  71. c.scrapeChannel(ctx, ch, msgLimit, callback)
  72. // Delay between channels
  73. select {
  74. case <-ctx.Done():
  75. return nil
  76. case <-time.After(5 * time.Second):
  77. }
  78. }
  79. log.Println("[tg_collector] done")
  80. return nil
  81. }
  82. type queueItem struct {
  83. Username string
  84. Depth int
  85. Source string
  86. }
  87. func (c *Collector) discover(ctx context.Context, seeds []string, maxDepth, maxTotal int) []string {
  88. queue := make([]queueItem, 0, len(seeds))
  89. for _, s := range seeds {
  90. queue = append(queue, queueItem{Username: cleanUsername(s), Depth: 0, Source: "seed"})
  91. }
  92. visited := map[string]bool{}
  93. var result []string
  94. for len(queue) > 0 && len(result) < maxTotal {
  95. if c.stopped.Load() || ctx.Err() != nil {
  96. break
  97. }
  98. item := queue[0]
  99. queue = queue[1:]
  100. username := cleanUsername(item.Username)
  101. if username == "" || visited[username] {
  102. continue
  103. }
  104. visited[username] = true
  105. acc, err := c.tgManager.Acquire(ctx)
  106. if err != nil {
  107. log.Printf("[tg_collector] no available TG account: %v", err)
  108. break
  109. }
  110. if err := acc.Client.Connect(ctx); err != nil {
  111. log.Printf("[tg_collector] connect failed: %v", err)
  112. c.tgManager.Release(acc, 0)
  113. continue
  114. }
  115. _, err = acc.Client.GetChannelInfo(ctx, username)
  116. if err != nil {
  117. if fw, ok := err.(*telegram.FloodWaitError); ok {
  118. c.tgManager.HandleFloodWait(acc, fw.Seconds)
  119. } else {
  120. c.tgManager.Release(acc, 0)
  121. }
  122. continue
  123. }
  124. // Save channel to DB
  125. c.store.UpsertChannel(&model.Channel{
  126. Username: username,
  127. Source: item.Source,
  128. Status: "pending",
  129. })
  130. result = append(result, username)
  131. // BFS: read messages to find more channels
  132. if item.Depth < maxDepth {
  133. msgs, err := acc.Client.GetMessages(ctx, username, 0, 100)
  134. if err == nil {
  135. for _, msg := range msgs {
  136. if msg.ForwardFromChannel != "" {
  137. fwd := cleanUsername(msg.ForwardFromChannel)
  138. if fwd != "" && !visited[fwd] {
  139. queue = append(queue, queueItem{fwd, item.Depth + 1, "snowball"})
  140. }
  141. }
  142. for _, link := range msg.Links {
  143. name := extractUsernameFromLink(link)
  144. if name != "" && !visited[name] {
  145. queue = append(queue, queueItem{name, item.Depth + 1, "snowball"})
  146. }
  147. }
  148. }
  149. }
  150. }
  151. c.tgManager.Release(acc, 0)
  152. select {
  153. case <-ctx.Done():
  154. return result
  155. case <-time.After(5 * time.Second):
  156. }
  157. }
  158. return result
  159. }
  160. func (c *Collector) scrapeChannel(ctx context.Context, username string, msgLimit int, callback func(plugin.MerchantData)) {
  161. acc, err := c.tgManager.Acquire(ctx)
  162. if err != nil {
  163. return
  164. }
  165. if err := acc.Client.Connect(ctx); err != nil {
  166. c.tgManager.Release(acc, 0)
  167. return
  168. }
  169. // Read pinned messages
  170. pinnedMsgs, _ := acc.Client.GetPinnedMessages(ctx, username)
  171. c.processMessages(ctx, pinnedMsgs, username, callback)
  172. // Read historical messages
  173. offsetID := 0
  174. fetched := 0
  175. for fetched < msgLimit {
  176. if c.stopped.Load() || ctx.Err() != nil {
  177. break
  178. }
  179. batchSize := 100
  180. if msgLimit-fetched < batchSize {
  181. batchSize = msgLimit - fetched
  182. }
  183. msgs, err := acc.Client.GetMessages(ctx, username, offsetID, batchSize)
  184. if err != nil {
  185. if fw, ok := err.(*telegram.FloodWaitError); ok {
  186. c.tgManager.HandleFloodWait(acc, fw.Seconds)
  187. acc = nil
  188. }
  189. break
  190. }
  191. if len(msgs) == 0 {
  192. break
  193. }
  194. c.processMessages(ctx, msgs, username, callback)
  195. offsetID = msgs[len(msgs)-1].ID
  196. fetched += len(msgs)
  197. }
  198. if acc != nil {
  199. c.tgManager.Release(acc, 0)
  200. }
  201. // Update channel status
  202. c.store.DB.Model(&model.Channel{}).Where("username = ?", username).
  203. Update("status", "scraped")
  204. }
  205. func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message, channelUsername string, callback func(plugin.MerchantData)) {
  206. for _, msg := range msgs {
  207. if msg.IsService || msg.Text == "" {
  208. continue
  209. }
  210. if !extractor.ContainsChinese(msg.Text, 0) {
  211. continue
  212. }
  213. if !extractor.HasContact(msg.Text) {
  214. continue
  215. }
  216. // Regex first
  217. info := extractor.Extract(msg.Text)
  218. merchantName := ""
  219. industry := ""
  220. // LLM fallback only when regex found no TG username
  221. if info.TgUsername == "" && c.llmClient != nil {
  222. merchantInfo, err := c.llmClient.ParseMerchant(ctx, msg.Text)
  223. if err == nil && merchantInfo != nil && merchantInfo.TgUsername != "" {
  224. info.TgUsername = strings.TrimPrefix(merchantInfo.TgUsername, "@")
  225. if merchantInfo.Website != "" && info.Website == "" {
  226. info.Website = merchantInfo.Website
  227. }
  228. if merchantInfo.Email != "" && info.Email == "" {
  229. info.Email = merchantInfo.Email
  230. }
  231. if merchantInfo.Phone != "" && info.Phone == "" {
  232. info.Phone = merchantInfo.Phone
  233. }
  234. merchantName = extractor.CleanMerchantName(merchantInfo.MerchantName)
  235. industry = merchantInfo.Industry
  236. }
  237. }
  238. if info.TgUsername == "" {
  239. continue
  240. }
  241. callback(plugin.MerchantData{
  242. TgUsername: info.TgUsername,
  243. TgLink: "https://t.me/" + info.TgUsername,
  244. MerchantName: merchantName,
  245. Website: info.Website,
  246. Email: info.Email,
  247. Phone: info.Phone,
  248. SourceType: "tg_channel",
  249. SourceName: channelUsername,
  250. SourceURL: "https://t.me/" + channelUsername,
  251. OriginalText: msg.Text,
  252. IndustryTag: industry,
  253. })
  254. }
  255. }
  256. func cleanUsername(s string) string {
  257. return strings.TrimPrefix(strings.TrimSpace(s), "@")
  258. }
  259. var reUsernameFromLink = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  260. func extractUsernameFromLink(link string) string {
  261. m := reUsernameFromLink.FindStringSubmatch(link)
  262. if len(m) > 1 {
  263. return m[1]
  264. }
  265. return ""
  266. }
  267. func getIntCfg(cfg map[string]any, key string, def int) int {
  268. if v, ok := cfg[key].(int); ok {
  269. return v
  270. }
  271. if v, ok := cfg[key].(float64); ok {
  272. return int(v)
  273. }
  274. return def
  275. }