| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- package crawler
- import (
- "context"
- "strings"
- "sync"
- "time"
- "github.com/gocolly/colly/v2"
- )
// StaticCrawler crawls static web pages using colly.
//
// The proxy setting may be changed while crawls are in progress; access to it
// is serialized by mu.
type StaticCrawler struct {
	// mu guards proxyURL.
	mu sync.RWMutex
	// proxyURL is the proxy applied to each crawl; empty means no proxy.
	proxyURL string
}
- // NewStaticCrawler 创建 StaticCrawler
- func NewStaticCrawler() *StaticCrawler { return &StaticCrawler{} }
- // SetProxy sets the proxy URL for subsequent crawl requests.
- func (c *StaticCrawler) SetProxy(proxyURL string) {
- c.mu.Lock()
- c.proxyURL = proxyURL
- c.mu.Unlock()
- }
- // GetProxy returns the current proxy URL.
- func (c *StaticCrawler) GetProxy() string {
- c.mu.RLock()
- defer c.mu.RUnlock()
- return c.proxyURL
- }
// CrawlResult holds the outcome of a single crawl.
type CrawlResult struct {
	// Links lists the links discovered on the page.
	Links []string
	// TgLinks lists the discovered links that point at t.me.
	TgLinks []string
	// Emails holds e-mail addresses associated with the page.
	Emails []string
	// HTML is the page body as a string.
	HTML string
	// Error is the error recorded for the crawl, or nil.
	Error error
}
- // Crawl 爬取网页,提取所有链接
- func (c *StaticCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
- result := &CrawlResult{}
- collector := colly.NewCollector(
- colly.MaxDepth(1),
- colly.Async(false),
- )
- collector.SetRequestTimeout(15 * time.Second)
- // Snapshot proxy under lock
- proxyURL := c.GetProxy()
- if proxyURL != "" {
- collector.SetProxy(proxyURL)
- }
- // 提取所有 <a href> 链接
- collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
- href := e.Attr("href")
- if href == "" {
- return
- }
- // 绝对 URL
- absURL := e.Request.AbsoluteURL(href)
- if absURL == "" {
- return
- }
- result.Links = append(result.Links, absURL)
- if strings.Contains(absURL, "t.me/") || strings.Contains(absURL, "telegram.me/") {
- result.TgLinks = append(result.TgLinks, absURL)
- }
- })
- collector.OnResponse(func(r *colly.Response) {
- result.HTML = string(r.Body)
- })
- collector.OnError(func(r *colly.Response, err error) {
- result.Error = err
- })
- // 使用 channel 传递 context 取消
- done := make(chan struct{})
- go func() {
- collector.Visit(targetURL) //nolint:errcheck
- close(done)
- }()
- select {
- case <-ctx.Done():
- result.Error = ctx.Err()
- case <-done:
- }
- return result
- }
|