package crawler

import (
	"context"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

// StaticCrawler crawls static web pages using colly.
type StaticCrawler struct {
	mu       sync.RWMutex
	proxyURL string
}

// NewStaticCrawler creates a new StaticCrawler.
func NewStaticCrawler() *StaticCrawler {
	return &StaticCrawler{}
}

// SetProxy sets the proxy URL for subsequent crawl requests.
func (c *StaticCrawler) SetProxy(proxyURL string) {
	c.mu.Lock()
	c.proxyURL = proxyURL
	c.mu.Unlock()
}

// GetProxy returns the current proxy URL.
func (c *StaticCrawler) GetProxy() string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.proxyURL
}

// CrawlResult holds the outcome of a single crawl.
type CrawlResult struct {
	Links   []string // all links discovered on the page
	TgLinks []string // links pointing to t.me / telegram.me
	Emails  []string // reserved for callers; not populated by Crawl
	HTML    string
	Error   error
}

// Crawl fetches targetURL and extracts all links found on the page.
func (c *StaticCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
	result := &CrawlResult{}

	collector := colly.NewCollector(
		colly.MaxDepth(1),
		colly.Async(false),
	)
	collector.SetRequestTimeout(15 * time.Second)

	// Snapshot the proxy under the read lock.
	proxyURL := c.GetProxy()
	if proxyURL != "" {
		if err := collector.SetProxy(proxyURL); err != nil {
			result.Error = err
			return result
		}
	}

	// Extract every anchor link.
	collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if href == "" {
			return
		}
		// Resolve relative hrefs against the request URL.
		absURL := e.Request.AbsoluteURL(href)
		if absURL == "" {
			return
		}
		result.Links = append(result.Links, absURL)
		if strings.Contains(absURL, "t.me/") || strings.Contains(absURL, "telegram.me/") {
			result.TgLinks = append(result.TgLinks, absURL)
		}
	})

	collector.OnResponse(func(r *colly.Response) {
		result.HTML = string(r.Body)
	})

	collector.OnError(func(r *colly.Response, err error) {
		result.Error = err
	})

	// Run the visit in a goroutine so the select below can observe ctx
	// cancellation; colly itself does not take a context.
	done := make(chan struct{})
	go func() {
		defer close(done)
		collector.Visit(targetURL) //nolint:errcheck
	}()

	select {
	case <-ctx.Done():
		// The in-flight request keeps running until the request timeout
		// expires, but its result is discarded: returning a fresh value
		// avoids a data race between the caller and the colly callbacks,
		// which still append to the goroutine-owned result.
		return &CrawlResult{Error: ctx.Err()}
	case <-done:
	}

	return result
}
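
// Usage sketch (not part of the original file; the target URL and proxy
// address below are placeholders): a minimal caller that bounds the crawl
// with a deadline and reads the extracted Telegram links.
//
//	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
//	defer cancel()
//
//	c := NewStaticCrawler()
//	c.SetProxy("socks5://127.0.0.1:1080") // optional
//
//	res := c.Crawl(ctx, "https://example.com")
//	if res.Error != nil {
//		// handle network failure, non-2xx status, or ctx cancellation
//	}
//	for _, link := range res.TgLinks {
//		// every entry contains "t.me/" or "telegram.me/"
//	}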