static.go 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. package crawler
  2. import (
  3. "context"
  4. "strings"
  5. "sync"
  6. "time"
  7. "github.com/gocolly/colly/v2"
  8. )
  9. // StaticCrawler 静态网页爬取(colly)
  10. type StaticCrawler struct {
  11. mu sync.RWMutex
  12. proxyURL string
  13. }
  14. // NewStaticCrawler 创建 StaticCrawler
  15. func NewStaticCrawler() *StaticCrawler { return &StaticCrawler{} }
  16. // SetProxy sets the proxy URL for subsequent crawl requests.
  17. func (c *StaticCrawler) SetProxy(proxyURL string) {
  18. c.mu.Lock()
  19. c.proxyURL = proxyURL
  20. c.mu.Unlock()
  21. }
  22. // GetProxy returns the current proxy URL.
  23. func (c *StaticCrawler) GetProxy() string {
  24. c.mu.RLock()
  25. defer c.mu.RUnlock()
  26. return c.proxyURL
  27. }
  28. // CrawlResult 爬取结果
  29. type CrawlResult struct {
  30. Links []string // 发现的链接
  31. TgLinks []string // t.me 链接
  32. Emails []string
  33. HTML string
  34. Error error
  35. }
  36. // Crawl 爬取网页,提取所有链接
  37. func (c *StaticCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
  38. result := &CrawlResult{}
  39. collector := colly.NewCollector(
  40. colly.MaxDepth(1),
  41. colly.Async(false),
  42. )
  43. collector.SetRequestTimeout(15 * time.Second)
  44. // Snapshot proxy under lock
  45. proxyURL := c.GetProxy()
  46. if proxyURL != "" {
  47. collector.SetProxy(proxyURL)
  48. }
  49. // 提取所有 <a href> 链接
  50. collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
  51. href := e.Attr("href")
  52. if href == "" {
  53. return
  54. }
  55. // 绝对 URL
  56. absURL := e.Request.AbsoluteURL(href)
  57. if absURL == "" {
  58. return
  59. }
  60. result.Links = append(result.Links, absURL)
  61. if strings.Contains(absURL, "t.me/") || strings.Contains(absURL, "telegram.me/") {
  62. result.TgLinks = append(result.TgLinks, absURL)
  63. }
  64. })
  65. collector.OnResponse(func(r *colly.Response) {
  66. result.HTML = string(r.Body)
  67. })
  68. collector.OnError(func(r *colly.Response, err error) {
  69. result.Error = err
  70. })
  71. // 使用 channel 传递 context 取消
  72. done := make(chan struct{})
  73. go func() {
  74. collector.Visit(targetURL) //nolint:errcheck
  75. close(done)
  76. }()
  77. select {
  78. case <-ctx.Done():
  79. result.Error = ctx.Err()
  80. case <-done:
  81. }
  82. return result
  83. }