// Package search provides a thin client for the Serper (google.serper.dev)
// search API plus a URL classifier used to triage crawl candidates.
package search

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"
)

const (
	serperEndpoint      = "https://google.serper.dev/search"
	serperVideoEndpoint = "https://google.serper.dev/videos"
)

// SerperClient is a client for the Serper search API.
type SerperClient struct {
	apiKey  string
	perPage int // results requested per page
	maxPage int // maximum number of pages Search will fetch
	http    *http.Client
}

// NewSerperClient creates a Serper client with a 15-second HTTP timeout.
func NewSerperClient(apiKey string, perPage, maxPage int) *SerperClient {
	return &SerperClient{
		apiKey:  apiKey,
		perPage: perPage,
		maxPage: maxPage,
		http:    &http.Client{Timeout: 15 * time.Second},
	}
}

// SearchResult is a single search hit.
type SearchResult struct {
	Title   string
	URL     string
	Snippet string
}

// Search queries the organic results for query, following pagination up to
// c.maxPage pages. Paging stops early when a page returns fewer than
// c.perPage results (no more pages) or when a page request fails, in which
// case results gathered from earlier pages are still returned. An error is
// returned only when the very first request fails, i.e. when nothing at all
// could be fetched.
func (c *SerperClient) Search(ctx context.Context, query string) ([]SearchResult, error) {
	var results []SearchResult
	for page := 1; page <= c.maxPage; page++ {
		pageResults, err := c.searchPage(ctx, query, page)
		if err != nil {
			// BUGFIX: the error used to be discarded unconditionally, so a
			// total API failure looked exactly like "no results" and the
			// returned error was always nil. Surface the error when we have
			// nothing; keep partial results when later pages fail.
			if len(results) == 0 {
				return nil, fmt.Errorf("searching %q page %d: %w", query, page, err)
			}
			break
		}
		results = append(results, pageResults...)
		if len(pageResults) < c.perPage {
			break // short page: no further pages exist
		}
	}
	return results, nil
}

// SearchVideos queries the video vertical — YouTube and similar video
// descriptions often contain Telegram contact handles, so the channel name
// is folded into the snippet to maximize downstream extraction.
func (c *SerperClient) SearchVideos(ctx context.Context, query string) ([]SearchResult, error) {
	payload := map[string]interface{}{
		"q":   query,
		"num": c.perPage,
		"gl":  "cn",
		"hl":  "zh-cn",
	}
	var result struct {
		Videos []struct {
			Title   string `json:"title"`
			Link    string `json:"link"`
			Snippet string `json:"snippet"`
			Channel string `json:"channel"`
		} `json:"videos"`
	}
	if err := c.postJSON(ctx, serperVideoEndpoint, "serper videos API", payload, &result); err != nil {
		return nil, err
	}
	out := make([]SearchResult, 0, len(result.Videos))
	for _, v := range result.Videos {
		// Combine snippet + channel for maximum extraction surface.
		snippet := v.Snippet
		if v.Channel != "" {
			snippet = v.Snippet + " " + v.Channel
		}
		out = append(out, SearchResult{Title: v.Title, URL: v.Link, Snippet: snippet})
	}
	return out, nil
}

// searchPage fetches a single page of organic results.
func (c *SerperClient) searchPage(ctx context.Context, query string, page int) ([]SearchResult, error) {
	payload := map[string]interface{}{
		"q":    query,
		"num":  c.perPage,
		"page": page,
		"gl":   "cn",
		"hl":   "zh-cn",
	}
	var result struct {
		Organic []struct {
			Title   string `json:"title"`
			Link    string `json:"link"`
			Snippet string `json:"snippet"`
		} `json:"organic"`
	}
	if err := c.postJSON(ctx, serperEndpoint, "serper API", payload, &result); err != nil {
		return nil, err
	}
	out := make([]SearchResult, 0, len(result.Organic))
	for _, r := range result.Organic {
		out = append(out, SearchResult{Title: r.Title, URL: r.Link, Snippet: r.Snippet})
	}
	return out, nil
}

// postJSON POSTs payload to endpoint with the API-key headers and decodes
// the JSON response into out. label is used verbatim in the non-200 error
// message so each endpoint keeps its original error text.
func (c *SerperClient) postJSON(ctx context.Context, endpoint, label string, payload map[string]interface{}, out interface{}) error {
	data, err := json.Marshal(payload)
	if err != nil {
		return err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(data))
	if err != nil {
		return err
	}
	req.Header.Set("X-API-KEY", c.apiKey)
	req.Header.Set("Content-Type", "application/json")
	resp, err := c.http.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("%s error: %d", label, resp.StatusCode)
	}
	return json.NewDecoder(resp.Body).Decode(out)
}

// Classification word lists, hoisted to package level so ClassifyURL does
// not rebuild them on every call.
var (
	// Social-media / big-site domains with nothing worth crawling.
	classifyBlacklistDomains = []string{
		"twitter.com", "x.com", "facebook.com", "instagram.com", "youtube.com",
		"google.com", "baidu.com", "weibo.com", "zhihu.com", "stackoverflow.com",
		"wikipedia.org", "amazon.com", "taobao.com", "jd.com", "tmall.com",
		"apple.com", "microsoft.com", "qq.com",
	}
	// Binary/media extensions with no extractable text.
	classifyBlacklistExts = []string{".apk", ".zip", ".pdf", ".exe", ".dmg", ".ipa", ".mp4", ".mp3"}
	// Positive signals for directory/aggregator pages.
	// NOTE(review): short substrings like "tg" and "dh" match very broadly
	// (e.g. any URL containing those letters) — this looks like a
	// deliberately loose heuristic; confirm before tightening.
	classifyNavSignals = []string{
		"nav", "directory", "catalog", "list", "daohang", "dh",
		"导航", "目录", "聚合", "推荐", "收录", "汇总",
		"telegram", "channel", "group", "tg",
	}
)

// ClassifyURL buckets a URL for the crawler.
// It returns one of: "tg_channel", "nav_site", "web_page", "discard".
func ClassifyURL(rawURL string) string {
	// Telegram links are the target itself. This check runs on the raw URL
	// (before lowercasing), preserving the original case-sensitive behavior.
	if strings.Contains(rawURL, "t.me/") || strings.Contains(rawURL, "telegram.me/") {
		return "tg_channel"
	}
	u := strings.ToLower(rawURL)
	for _, d := range classifyBlacklistDomains {
		if strings.Contains(u, d) {
			return "discard"
		}
	}
	for _, ext := range classifyBlacklistExts {
		if strings.HasSuffix(u, ext) {
			return "discard"
		}
	}
	for _, sig := range classifyNavSignals {
		if strings.Contains(u, sig) {
			return "nav_site"
		}
	}
	// Do not discard ordinary pages — they may still contain contact info,
	// so mark them crawlable.
	return "web_page"
}