// Taio/spider
							package crawler

import (
	"context"
	"strings"
	"time"

	"github.com/gocolly/colly/v2"
)

// StaticCrawler crawls static web pages using colly.
// It carries no state of its own.
type StaticCrawler struct{}

// NewStaticCrawler returns a ready-to-use StaticCrawler.
func NewStaticCrawler() *StaticCrawler {
	return new(StaticCrawler)
}

// CrawlResult holds everything collected while crawling a single page.
type CrawlResult struct {
	Links   []string // absolute URLs of the <a href> links discovered on the page
	TgLinks []string // subset of Links whose URL contains "t.me/" or "telegram.me/"
	Emails  []string // e-mail addresses; not filled in by StaticCrawler.Crawl in this file — presumably populated by other code (TODO confirm)
	HTML    string   // response body of the fetched page, as a string
	Error   error    // error reported for the crawl, or the context error when the crawl was cancelled; nil otherwise
}

// Crawl fetches targetURL and collects the links found on that page.
//
// Only targetURL itself is requested: discovered links are recorded, not
// followed. Each request is limited to 15 seconds, and ctx is handed to colly
// so that cancelling it also aborts the in-flight HTTP request.
//
// The returned *CrawlResult is never nil. On success it carries the page HTML,
// every <a href> resolved to an absolute URL, and the subset of those URLs
// that look like Telegram links. On failure Error is set. If ctx is done
// before the crawl finishes, a result containing only ctx.Err() is returned
// and any partially collected data is dropped.
func (c *StaticCrawler) Crawl(ctx context.Context, targetURL string) *CrawlResult {
	collector := colly.NewCollector(
		colly.MaxDepth(1),
		colly.Async(false),
		// Bind outgoing requests to ctx so cancellation stops the fetch
		// instead of leaving it running until the request timeout.
		colly.StdlibContext(ctx),
	)
	collector.SetRequestTimeout(15 * time.Second)

	// page is written only by the colly callbacks and the goroutine below,
	// all of which run on the crawl goroutine. It is handed to the caller
	// through the done channel, so the caller never reads it while it is
	// still being mutated.
	page := &CrawlResult{}

	// Collect every <a href> link.
	collector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if href == "" {
			return
		}

		// Resolve relative links against the page URL; an empty result means
		// the href could not be turned into a usable absolute URL.
		absURL := e.Request.AbsoluteURL(href)
		if absURL == "" {
			return
		}

		page.Links = append(page.Links, absURL)

		// NOTE(review): this is a plain substring match, so it also accepts
		// unrelated hosts ending in "t.me" and URLs that merely embed a
		// Telegram link in their path or query. Kept as-is for compatibility.
		if strings.Contains(absURL, "t.me/") || strings.Contains(absURL, "telegram.me/") {
			page.TgLinks = append(page.TgLinks, absURL)
		}
	})

	collector.OnResponse(func(r *colly.Response) {
		page.HTML = string(r.Body)
	})

	collector.OnError(func(r *colly.Response, err error) {
		page.Error = err
	})

	// Buffered so the crawl goroutine can always deliver its result and exit,
	// even when the caller has already returned because ctx was cancelled.
	done := make(chan *CrawlResult, 1)
	go func() {
		// Visit can fail before any request is issued (for example on an
		// unparsable URL), in which case OnError may never fire. Record the
		// returned error unless OnError already stored a more specific one.
		if err := collector.Visit(targetURL); err != nil && page.Error == nil {
			page.Error = err
		}
		done <- page
	}()

	select {
	case <-ctx.Done():
		// Return a fresh value rather than page: the crawl goroutine may
		// still be writing to page at this point.
		return &CrawlResult{Error: ctx.Err()}
	case res := <-done:
		return res
	}
}