| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204 |
- package search
- import (
- "bytes"
- "context"
- "encoding/json"
- "fmt"
- "net/http"
- "strings"
- "time"
- )
// Serper API endpoints used by SerperClient.
const (
	// serperEndpoint is the organic web-search endpoint.
	serperEndpoint = "https://google.serper.dev/search"
	// serperVideoEndpoint is the video-search endpoint.
	serperVideoEndpoint = "https://google.serper.dev/videos"
)
// SerperClient is a client for the Serper search API.
//
// apiKey is sent in the X-API-KEY header of every request, perPage is the
// number of results requested per call, maxPage caps how many pages Search
// will fetch, and http is the underlying HTTP client used for all requests.
type SerperClient struct {
	apiKey           string
	perPage, maxPage int
	http             *http.Client
}
- // NewSerperClient 创建 Serper 客户端
- func NewSerperClient(apiKey string, perPage, maxPage int) *SerperClient {
- return &SerperClient{
- apiKey: apiKey,
- perPage: perPage,
- maxPage: maxPage,
- http: &http.Client{Timeout: 15 * time.Second},
- }
- }
// SearchResult is a single search hit: the result's Title, the URL it points
// to, and the text Snippet returned alongside it.
type SearchResult struct {
	Title, URL, Snippet string
}
- // Search 搜索关键词,返回所有翻页结果(organic)
- func (c *SerperClient) Search(ctx context.Context, query string) ([]SearchResult, error) {
- var results []SearchResult
- for page := 1; page <= c.maxPage; page++ {
- pageResults, err := c.searchPage(ctx, query, page)
- if err != nil {
- break
- }
- results = append(results, pageResults...)
- if len(pageResults) < c.perPage {
- break
- }
- }
- return results, nil
- }
- // SearchVideos 搜索视频结果 — YouTube 等视频描述中经常包含 TG 联系方式
- func (c *SerperClient) SearchVideos(ctx context.Context, query string) ([]SearchResult, error) {
- body := map[string]interface{}{
- "q": query,
- "num": c.perPage,
- "gl": "cn",
- "hl": "zh-cn",
- }
- data, _ := json.Marshal(body)
- req, err := http.NewRequestWithContext(ctx, "POST", serperVideoEndpoint, bytes.NewReader(data))
- if err != nil {
- return nil, err
- }
- req.Header.Set("X-API-KEY", c.apiKey)
- req.Header.Set("Content-Type", "application/json")
- resp, err := c.http.Do(req)
- if err != nil {
- return nil, err
- }
- defer resp.Body.Close()
- if resp.StatusCode != 200 {
- return nil, fmt.Errorf("serper videos API error: %d", resp.StatusCode)
- }
- var result struct {
- Videos []struct {
- Title string `json:"title"`
- Link string `json:"link"`
- Snippet string `json:"snippet"`
- Channel string `json:"channel"`
- } `json:"videos"`
- }
- if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
- return nil, err
- }
- var out []SearchResult
- for _, v := range result.Videos {
- // Combine snippet + title + channel for maximum extraction
- combinedSnippet := v.Snippet
- if v.Channel != "" {
- combinedSnippet = v.Snippet + " " + v.Channel
- }
- out = append(out, SearchResult{
- Title: v.Title,
- URL: v.Link,
- Snippet: combinedSnippet,
- })
- }
- return out, nil
- }
- // searchPage 搜索单页
- func (c *SerperClient) searchPage(ctx context.Context, query string, page int) ([]SearchResult, error) {
- body := map[string]interface{}{
- "q": query,
- "num": c.perPage,
- "page": page,
- "gl": "cn",
- "hl": "zh-cn",
- }
- data, _ := json.Marshal(body)
- req, err := http.NewRequestWithContext(ctx, "POST", serperEndpoint, bytes.NewReader(data))
- if err != nil {
- return nil, err
- }
- req.Header.Set("X-API-KEY", c.apiKey)
- req.Header.Set("Content-Type", "application/json")
- resp, err := c.http.Do(req)
- if err != nil {
- return nil, err
- }
- defer resp.Body.Close()
- if resp.StatusCode != 200 {
- return nil, fmt.Errorf("serper API error: %d", resp.StatusCode)
- }
- var result struct {
- Organic []struct {
- Title string `json:"title"`
- Link string `json:"link"`
- Snippet string `json:"snippet"`
- } `json:"organic"`
- }
- if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
- return nil, err
- }
- var out []SearchResult
- for _, r := range result.Organic {
- out = append(out, SearchResult{Title: r.Title, URL: r.Link, Snippet: r.Snippet})
- }
- return out, nil
- }
// ClassifyURL buckets rawURL into one of four crawl categories. The checks
// run in this order and the first match wins:
//
//   - "tg_channel": the URL contains a t.me/ or telegram.me/ link.
//   - "discard": the host is (a subdomain of) a blacklisted social/big site,
//     or the URL points at a blacklisted file type.
//   - "nav_site": the URL contains a navigation/aggregator keyword.
//   - "web_page": everything else. Ordinary pages are no longer discarded
//     because they may still contain contact details worth crawling.
//
// All matching is case-insensitive. Domain matching is done on whole host
// labels, so "x.com" matches "x.com" and "m.x.com" but not "wix.com", and a
// blacklisted domain appearing only in the path or query does not discard
// the URL.
func ClassifyURL(rawURL string) string {
	u := strings.ToLower(rawURL)

	// Telegram links.
	if hasTelegramLink(u) {
		return "tg_channel"
	}

	host, path := splitHostPath(u)

	// Social media / large-site blacklist.
	blacklistDomains := []string{
		"twitter.com", "x.com", "facebook.com", "instagram.com", "youtube.com",
		"google.com", "baidu.com", "weibo.com", "zhihu.com",
		"stackoverflow.com", "wikipedia.org",
		"amazon.com", "taobao.com", "jd.com", "tmall.com",
		"apple.com", "microsoft.com", "qq.com",
	}
	// Surrounding dots make a substring search label-aligned; this also keeps
	// regional hosts such as www.google.com.hk matching "google.com".
	dottedHost := "." + host + "."
	for _, d := range blacklistDomains {
		if strings.Contains(dottedHost, "."+d+".") {
			return "discard"
		}
	}

	// Blacklisted file extensions. The path is checked so a trailing query or
	// fragment does not hide the extension; the full URL is checked as well so
	// links such as ?file=a.apk keep being discarded.
	blacklistExt := []string{".apk", ".zip", ".pdf", ".exe", ".dmg", ".ipa", ".mp4", ".mp3"}
	for _, ext := range blacklistExt {
		if strings.HasSuffix(path, ext) || strings.HasSuffix(u, ext) {
			return "discard"
		}
	}

	// Positive signals: navigation sites / aggregator pages.
	navSignals := []string{
		"nav", "directory", "catalog", "list", "daohang", "dh",
		"导航", "目录", "聚合", "推荐", "收录", "汇总",
		"telegram", "channel", "group", "tg",
	}
	for _, sig := range navSignals {
		if strings.Contains(u, sig) {
			return "nav_site"
		}
	}

	return "web_page"
}

// hasTelegramLink reports whether the lower-cased URL u contains "t.me/" or
// "telegram.me/" at a position where it is not merely the tail of a longer
// host label (so "chat.me/" and "internet.me/" do not count).
func hasTelegramLink(u string) bool {
	for _, marker := range []string{"t.me/", "telegram.me/"} {
		for start := 0; start < len(u); {
			i := strings.Index(u[start:], marker)
			if i < 0 {
				break
			}
			i += start
			if i == 0 || !isHostLabelByte(u[i-1]) {
				return true
			}
			start = i + 1
		}
	}
	return false
}

// isHostLabelByte reports whether b can appear inside a lower-case ASCII host
// label (letter, digit or hyphen).
func isHostLabelByte(b byte) bool {
	return (b >= 'a' && b <= 'z') || (b >= '0' && b <= '9') || b == '-'
}

// splitHostPath splits a lower-cased URL into its host (without scheme,
// userinfo or port) and its path (without query or fragment). It works on
// scheme-less input such as "example.com/a" too.
func splitHostPath(u string) (host, path string) {
	rest := u
	// Only treat "://" as a scheme separator when it precedes the path/query.
	if i := strings.Index(rest, "://"); i >= 0 && !strings.ContainsAny(rest[:i], "/?#") {
		rest = rest[i+3:]
	}

	host = rest
	if i := strings.IndexAny(rest, "/?#"); i >= 0 {
		host, path = rest[:i], rest[i:]
	}
	if i := strings.IndexAny(path, "?#"); i >= 0 {
		path = path[:i]
	}

	if i := strings.LastIndex(host, "@"); i >= 0 {
		host = host[i+1:]
	}
	// Strip a port, but leave bracketed IPv6 literals alone.
	if i := strings.LastIndex(host, ":"); i >= 0 && !strings.Contains(host[i:], "]") {
		host = host[:i]
	}
	return host, path
}
|