dynamic.go 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. package crawler
import (
	"context"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/chromedp"
)
// DynamicCrawler crawls pages with a headless Chrome browser (via chromedp),
// for content that only appears after JavaScript rendering.
// It holds no state; the zero value is ready to use.
type DynamicCrawler struct{}
  10. // NewDynamicCrawler 创建 DynamicCrawler
  11. func NewDynamicCrawler() *DynamicCrawler { return &DynamicCrawler{} }
  12. // Crawl 用无头浏览器爬取(用于 JS 渲染的页面)
  13. func (c *DynamicCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
  14. result := &CrawlResult{}
  15. // 创建 chromedp allocator context,最多等待 30s
  16. allocCtx, cancel := chromedp.NewExecAllocator(ctx,
  17. chromedp.Headless,
  18. chromedp.DisableGPU,
  19. chromedp.NoSandbox,
  20. chromedp.Flag("disable-web-security", true),
  21. )
  22. defer cancel()
  23. taskCtx, taskCancel := context.WithTimeout(allocCtx, 30*time.Second)
  24. defer taskCancel()
  25. chromeCtx, chromeCancel := chromedp.NewContext(taskCtx)
  26. defer chromeCancel()
  27. var html string
  28. var links []interface{}
  29. err := chromedp.Run(chromeCtx,
  30. chromedp.Navigate(targetURL),
  31. chromedp.Sleep(3*time.Second), // 等待 JS 渲染
  32. chromedp.OuterHTML("html", &html),
  33. chromedp.Evaluate(`Array.from(document.querySelectorAll('a[href]')).map(a => a.href)`, &links),
  34. )
  35. if err != nil {
  36. result.Error = err
  37. return result
  38. }
  39. result.HTML = html
  40. // 将 interface{} 切片转为字符串切片
  41. for _, item := range links {
  42. link, ok := item.(string)
  43. if !ok {
  44. continue
  45. }
  46. result.Links = append(result.Links, link)
  47. if strings.Contains(link, "t.me/") || strings.Contains(link, "telegram.me/") {
  48. result.TgLinks = append(result.TgLinks, link)
  49. }
  50. }
  51. return result
  52. }