static.go 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. package crawler
  2. import (
  3. "context"
  4. "strings"
  5. "time"
  6. "github.com/gocolly/colly/v2"
  7. )
  8. // StaticCrawler 静态网页爬取(colly)
  9. type StaticCrawler struct{}
  10. // NewStaticCrawler 创建 StaticCrawler
  11. func NewStaticCrawler() *StaticCrawler { return &StaticCrawler{} }
  12. // CrawlResult 爬取结果
  13. type CrawlResult struct {
  14. Links []string // 发现的链接
  15. TgLinks []string // t.me 链接
  16. Emails []string
  17. HTML string
  18. Error error
  19. }
  20. // Crawl 爬取网页,提取所有链接
  21. func (c *StaticCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
  22. result := &CrawlResult{}
  23. collector := colly.NewCollector(
  24. colly.MaxDepth(1),
  25. colly.Async(false),
  26. )
  27. collector.SetRequestTimeout(15 * time.Second)
  28. // 提取所有 <a href> 链接
  29. collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
  30. href := e.Attr("href")
  31. if href == "" {
  32. return
  33. }
  34. // 绝对 URL
  35. absURL := e.Request.AbsoluteURL(href)
  36. if absURL == "" {
  37. return
  38. }
  39. result.Links = append(result.Links, absURL)
  40. if strings.Contains(absURL, "t.me/") || strings.Contains(absURL, "telegram.me/") {
  41. result.TgLinks = append(result.TgLinks, absURL)
  42. }
  43. })
  44. collector.OnResponse(func(r *colly.Response) {
  45. result.HTML = string(r.Body)
  46. })
  47. collector.OnError(func(r *colly.Response, err error) {
  48. result.Error = err
  49. })
  50. // 使用 channel 传递 context 取消
  51. done := make(chan struct{})
  52. go func() {
  53. collector.Visit(targetURL) //nolint:errcheck
  54. close(done)
  55. }()
  56. select {
  57. case <-ctx.Done():
  58. result.Error = ctx.Err()
  59. case <-done:
  60. }
  61. return result
  62. }