package pipeline

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"gorm.io/gorm"

	"spider/internal/extractor"
	"spider/internal/model"
)

// Tunables for the GitHub phase.
const (
	// githubDefaultItemLimit is the number of repositories processed per run
	// when the options carry no positive test-run item limit.
	githubDefaultItemLimit = 50
	// githubMaxPerPage caps the per_page value sent to the search endpoint.
	githubMaxPerPage = 30
	// githubPreviewBytes is how much of the README head (in bytes) must
	// contain Chinese text for the repository to be considered at all.
	githubPreviewBytes = 5000
	// githubContextBytes is the size (in bytes) of the window inspected on
	// each side of a link.
	githubContextBytes = 200
	// githubRepoPause and githubQueryPause throttle outgoing requests.
	githubRepoPause  = 2 * time.Second
	githubQueryPause = 5 * time.Second
)

// Compiled once at package scope instead of on every call.
var (
	githubTMeLinkRe    = regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)
	githubTGUsernameRe = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
)

// GithubPhase is pipeline phase 3: it searches GitHub repositories and mines
// their README files for Telegram links.
type GithubPhase struct {
	db       *gorm.DB
	token    string // GitHub token (optional)
	settings Settings
	reporter ProgressReporter
	http     *http.Client
}

// NewGithubPhase creates a new GithubPhase whose HTTP client has a 15 second
// timeout.
func NewGithubPhase(db *gorm.DB, token string, settings Settings) *GithubPhase {
	return &GithubPhase{
		db:       db,
		token:    token,
		settings: settings,
		http:     &http.Client{Timeout: 15 * time.Second},
	}
}

// Name returns the phase identifier, "github".
func (p *GithubPhase) Name() string { return "github" }

// Run builds search queries from up to 10 active managed keywords, searches
// GitHub for each query, and stores a pending channel for every Telegram link
// found in a README with Chinese text around it.
//
// At most itemLimit repositories are processed in total (50 by default, or
// opts.TestRun.ItemLimit when positive). Run returns an error only when the
// keyword query fails; search and README failures are logged and skipped.
// Cancellation of ctx stops the run early and still returns nil.
func (p *GithubPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
	// Search queries are derived from the first 10 active managed keywords.
	var keywords []model.ManagedKeyword
	if err := p.db.Where("status = ?", "active").Limit(10).Find(&keywords).Error; err != nil {
		return fmt.Errorf("loading active keywords: %w", err)
	}
	queries := make([]string, 0, len(keywords))
	for _, kw := range keywords {
		queries = append(queries, fmt.Sprintf("%s telegram", kw.Keyword))
	}

	itemLimit := githubDefaultItemLimit
	if opts != nil && opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
		itemLimit = opts.TestRun.ItemLimit
	}

	total := len(queries)
	reposPerQuery := 1
	if total > 0 {
		reposPerQuery = itemLimit/total + 1
	}

	found, processed := 0, 0
queryLoop:
	for i, query := range queries {
		if isContextDone(ctx) {
			break
		}
		if p.reporter != nil {
			p.reporter("github", i+1, total, "GitHub搜索: "+query)
		}

		repos, err := p.searchRepos(ctx, query, reposPerQuery)
		if err != nil {
			// Fall through with no repos so the pause below still applies;
			// a failed search is often a rate limit.
			log.Printf("[github] search err: %v", err)
		}

		for _, repo := range repos {
			if isContextDone(ctx) {
				break queryLoop
			}
			found += p.processRepo(ctx, repo)
			processed++
			// The per-query page size rounds up, so enforce the overall
			// limit explicitly.
			if processed >= itemLimit {
				break queryLoop
			}
			// Pause after every repository, including skipped ones: each
			// one cost at least one HTTP request.
			if !p.pause(ctx, githubRepoPause) {
				break queryLoop
			}
		}

		if !p.pause(ctx, githubQueryPause) {
			break
		}
	}

	log.Printf("[github] done: %d channels found", found)
	return nil
}

// processRepo downloads the README of repo, extracts Telegram links that have
// Chinese text nearby and stores them as pending channels. It returns the
// number of channels that were newly created.
func (p *GithubPhase) processRepo(ctx context.Context, repo string) int {
	readme, err := p.fetchReadme(ctx, repo)
	if err != nil {
		log.Printf("[github] readme %s: %v", repo, err)
		return 0
	}

	// Filter: the head of the README must contain Chinese text.
	preview := readme
	if len(preview) > githubPreviewBytes {
		preview = preview[:githubPreviewBytes]
	}
	if !extractor.ContainsChinese(preview, 0) {
		return 0
	}

	created := 0
	seen := make(map[string]struct{})
	// Use match offsets rather than searching for the link text again, so
	// that each occurrence is judged by its own surroundings.
	for _, loc := range githubTMeLinkRe.FindAllStringIndex(readme, -1) {
		linkStart, linkEnd := loc[0], loc[1]

		username := extractTGUsernameFromLink(readme[linkStart:linkEnd])
		if username == "" {
			continue
		}
		if _, dup := seen[username]; dup {
			continue
		}

		// The text around the link must contain Chinese.
		winStart := linkStart - githubContextBytes
		if winStart < 0 {
			winStart = 0
		}
		winEnd := linkEnd + githubContextBytes
		if winEnd > len(readme) {
			winEnd = len(readme)
		}
		if !extractor.ContainsChinese(readme[winStart:winEnd], 0) {
			continue
		}
		// Mark as seen only once accepted, so a later occurrence with
		// Chinese context can still qualify.
		seen[username] = struct{}{}

		ch := &model.Channel{
			Username:     username,
			Source:       "github",
			SourceDetail: repo,
			Status:       "pending",
		}
		result := p.db.Where(model.Channel{Username: username}).FirstOrCreate(ch)
		if result.Error != nil {
			log.Printf("[github] save channel %s: %v", username, result.Error)
			continue
		}
		if result.RowsAffected > 0 {
			created++
		}
	}
	return created
}

// pause blocks for d or until ctx is done. It reports whether the full
// duration elapsed, i.e. false means ctx was cancelled.
func (p *GithubPhase) pause(ctx context.Context, d time.Duration) bool {
	timer := time.NewTimer(d)
	defer timer.Stop()
	select {
	case <-ctx.Done():
		return false
	case <-timer.C:
		return true
	}
}

// searchRepos queries the GitHub repository search API for query, sorted by
// stars, and returns the full names ("owner/repo") of up to limit results
// (capped at 30). A non-200 response is reported as an error.
func (p *GithubPhase) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
	perPage := limit
	if perPage > githubMaxPerPage {
		perPage = githubMaxPerPage
	}
	if perPage < 1 {
		perPage = 1
	}
	apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
		url.QueryEscape(query), perPage)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", "application/vnd.github.v3+json")
	if p.token != "" {
		req.Header.Set("Authorization", "token "+p.token)
	}

	resp, err := p.http.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Without this check an error payload decodes into zero items and looks
	// like an empty result.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("github search %q: unexpected status %s", query, resp.Status)
	}

	var result struct {
		Items []struct {
			FullName string `json:"full_name"`
		} `json:"items"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decoding github search response: %w", err)
	}

	repos := make([]string, 0, len(result.Items))
	for _, item := range result.Items {
		repos = append(repos, item.FullName)
	}
	return repos, nil
}

// fetchReadme downloads README.md of the repository fullName ("owner/repo"),
// trying the main branch first and falling back to master when main answers
// 404. Any other non-200 status, a transport error, or a 404 on both
// branches is returned as an error.
func (p *GithubPhase) fetchReadme(ctx context.Context, fullName string) (string, error) {
	// The URL is built per branch; rewriting "/main/" inside a finished URL
	// would hit the owner segment for an owner named "main".
	for _, branch := range []string{"main", "master"} {
		rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/README.md", fullName, branch)
		body, status, err := p.fetchRawFile(ctx, rawURL)
		if err != nil {
			return "", fmt.Errorf("fetching %s: %w", rawURL, err)
		}
		switch status {
		case http.StatusOK:
			return body, nil
		case http.StatusNotFound:
			continue // try the next branch
		default:
			return "", fmt.Errorf("fetching %s: unexpected status %d", rawURL, status)
		}
	}
	return "", fmt.Errorf("readme not found for %s on main or master", fullName)
}

// fetchRawFile performs a GET on rawURL, sending the token when one is
// configured. It returns the response body (only for a 200 response), the
// HTTP status code, and any transport or read error.
func (p *GithubPhase) fetchRawFile(ctx context.Context, rawURL string) (string, int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", 0, err
	}
	if p.token != "" {
		req.Header.Set("Authorization", "token "+p.token)
	}

	resp, err := p.http.Do(req)
	if err != nil {
		return "", 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: drain a small error body so the connection can be
		// reused; a failure here is irrelevant to the caller.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
		return "", resp.StatusCode, nil
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", resp.StatusCode, err
	}
	return string(data), resp.StatusCode, nil
}

// extractTMeLinks returns every http(s) t.me / telegram.me link found in
// text, in order of appearance (duplicates included).
func extractTMeLinks(text string) []string {
	return githubTMeLinkRe.FindAllString(text, -1)
}

// extractTGUsernameFromLink returns the username part of a t.me/xxx or
// telegram.me/xxx link, or "" when link does not match.
func extractTGUsernameFromLink(link string) string {
	if m := githubTGUsernameRe.FindStringSubmatch(link); len(m) > 1 {
		return m[1]
	}
	return ""
}