| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
- package crawler
- import (
- "context"
- "strings"
- "time"
- "github.com/chromedp/chromedp"
- )
// DynamicCrawler crawls pages that require JavaScript execution by driving
// a headless Chrome instance through chromedp. The zero value is usable;
// all per-crawl state lives in Crawl, so a single instance is safe to reuse.
type DynamicCrawler struct{}
- // NewDynamicCrawler 创建 DynamicCrawler
- func NewDynamicCrawler() *DynamicCrawler { return &DynamicCrawler{} }
- // Crawl 用无头浏览器爬取(用于 JS 渲染的页面)
- func (c *DynamicCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
- result := &CrawlResult{}
- // 创建 chromedp allocator context,最多等待 30s
- allocCtx, cancel := chromedp.NewExecAllocator(ctx,
- chromedp.Headless,
- chromedp.DisableGPU,
- chromedp.NoSandbox,
- chromedp.Flag("disable-web-security", true),
- )
- defer cancel()
- taskCtx, taskCancel := context.WithTimeout(allocCtx, 30*time.Second)
- defer taskCancel()
- chromeCtx, chromeCancel := chromedp.NewContext(taskCtx)
- defer chromeCancel()
- var html string
- var links []interface{}
- err := chromedp.Run(chromeCtx,
- chromedp.Navigate(targetURL),
- chromedp.Sleep(3*time.Second), // 等待 JS 渲染
- chromedp.OuterHTML("html", &html),
- chromedp.Evaluate(`Array.from(document.querySelectorAll('a[href]')).map(a => a.href)`, &links),
- )
- if err != nil {
- result.Error = err
- return result
- }
- result.HTML = html
- // 将 interface{} 切片转为字符串切片
- for _, item := range links {
- link, ok := item.(string)
- if !ok {
- continue
- }
- result.Links = append(result.Links, link)
- if strings.Contains(link, "t.me/") || strings.Contains(link, "telegram.me/") {
- result.TgLinks = append(result.TgLinks, link)
- }
- }
- return result
- }
|