| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- package pipeline
- import (
- "context"
- "encoding/json"
- "fmt"
- "io"
- "log"
- "net/http"
- "net/url"
- "regexp"
- "strings"
- "time"
- "spider/internal/extractor"
- "spider/internal/model"
- "gorm.io/gorm"
- )
- // GithubPhase Phase 3: GitHub README 挖掘
- type GithubPhase struct {
- db *gorm.DB
- token string // GitHub token(可选)
- settings Settings
- reporter ProgressReporter
- http *http.Client
- }
- // NewGithubPhase creates a new GithubPhase.
- func NewGithubPhase(db *gorm.DB, token string, settings Settings) *GithubPhase {
- return &GithubPhase{
- db: db,
- token: token,
- settings: settings,
- http: &http.Client{Timeout: 15 * time.Second},
- }
- }
- func (p *GithubPhase) Name() string { return "github" }
- func (p *GithubPhase) Run(ctx context.Context, task *model.Task, opts *Options) error {
- // GitHub 搜索 query:从 managed_keywords 取前 10 个生成 query
- var keywords []model.ManagedKeyword
- p.db.Where("status = ?", "active").Limit(10).Find(&keywords)
- queries := []string{}
- for _, kw := range keywords {
- queries = append(queries, fmt.Sprintf("%s telegram", kw.Keyword))
- }
- itemLimit := 50 // 默认处理 50 个 repo
- if opts.TestRun != nil && opts.TestRun.ItemLimit > 0 {
- itemLimit = opts.TestRun.ItemLimit
- }
- total := len(queries)
- found := 0
- reposPerQuery := 1
- if len(queries) > 0 {
- reposPerQuery = itemLimit/len(queries) + 1
- }
- for i, query := range queries {
- if isContextDone(ctx) {
- break
- }
- if p.reporter != nil {
- p.reporter("github", i+1, total, "GitHub搜索: "+query)
- }
- repos, err := p.searchRepos(ctx, query, reposPerQuery)
- if err != nil {
- log.Printf("[github] search err: %v", err)
- continue
- }
- for _, repo := range repos {
- if isContextDone(ctx) {
- break
- }
- readme, err := p.fetchReadme(ctx, repo)
- if err != nil {
- continue
- }
- // 过滤:README 前 5000 字必须含中文
- preview := readme
- if len(preview) > 5000 {
- preview = preview[:5000]
- }
- if !extractor.ContainsChinese(preview, 0) {
- continue
- }
- // 提取 t.me 链接
- links := extractTMeLinks(readme)
- for _, link := range links {
- // 前后 200 字必须含中文
- idx := strings.Index(readme, link)
- if idx < 0 {
- continue
- }
- start := idx - 200
- if start < 0 {
- start = 0
- }
- end := idx + len(link) + 200
- if end > len(readme) {
- end = len(readme)
- }
- context200 := readme[start:end]
- if !extractor.ContainsChinese(context200, 0) {
- continue
- }
- username := extractTGUsernameFromLink(link)
- if username == "" {
- continue
- }
- ch := &model.Channel{
- Username: username,
- Source: "github",
- SourceDetail: repo,
- Status: "pending",
- }
- result := p.db.Where(model.Channel{Username: username}).FirstOrCreate(ch)
- if result.RowsAffected > 0 {
- found++
- }
- }
- // repo 间 sleep 2s
- select {
- case <-ctx.Done():
- return nil
- case <-time.After(2 * time.Second):
- }
- }
- // query 间 sleep 5s
- select {
- case <-ctx.Done():
- return nil
- case <-time.After(5 * time.Second):
- }
- }
- log.Printf("[github] done: %d channels found", found)
- return nil
- }
- // searchRepos 通过 GitHub Search API 搜索仓库
- func (p *GithubPhase) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
- perPage := limit
- if perPage > 30 {
- perPage = 30
- }
- apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
- url.QueryEscape(query), perPage)
- req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
- if err != nil {
- return nil, err
- }
- req.Header.Set("Accept", "application/vnd.github.v3+json")
- if p.token != "" {
- req.Header.Set("Authorization", "token "+p.token)
- }
- resp, err := p.http.Do(req)
- if err != nil {
- return nil, err
- }
- defer resp.Body.Close()
- var result struct {
- Items []struct {
- FullName string `json:"full_name"`
- } `json:"items"`
- }
- if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
- return nil, err
- }
- var repos []string
- for _, item := range result.Items {
- repos = append(repos, item.FullName)
- }
- return repos, nil
- }
- // fetchReadme 下载 README.md(先尝试 main 分支,失败则尝试 master)
- func (p *GithubPhase) fetchReadme(ctx context.Context, fullName string) (string, error) {
- rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/main/README.md", fullName)
- req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
- if err != nil {
- return "", err
- }
- if p.token != "" {
- req.Header.Set("Authorization", "token "+p.token)
- }
- resp, err := p.http.Do(req)
- if err != nil {
- return "", err
- }
- defer resp.Body.Close()
- if resp.StatusCode == 404 {
- // 尝试 master 分支
- masterURL := strings.Replace(rawURL, "/main/", "/master/", 1)
- req2, err := http.NewRequestWithContext(ctx, "GET", masterURL, nil)
- if err != nil {
- return "", err
- }
- if p.token != "" {
- req2.Header.Set("Authorization", "token "+p.token)
- }
- resp2, err := p.http.Do(req2)
- if err != nil {
- return "", err
- }
- defer resp2.Body.Close()
- data, _ := io.ReadAll(resp2.Body)
- return string(data), nil
- }
- data, _ := io.ReadAll(resp.Body)
- return string(data), nil
- }
// tmeLinkRe matches an http(s) link to t.me or telegram.me whose first path
// segment looks like a username: a letter followed by 4 to 31 letters,
// digits or underscores. Compiled once at package scope instead of on every
// call.
var tmeLinkRe = regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)

// extractTMeLinks returns every t.me / telegram.me link found in text, in
// order of appearance. Each returned link ends right after the username
// segment. It returns nil when text contains no such link.
func extractTMeLinks(text string) []string {
	return tmeLinkRe.FindAllString(text, -1)
}
// tgUsernameRe captures the username segment of a t.me / telegram.me link:
// a letter followed by 4 to 31 letters, digits or underscores. Compiled once
// at package scope instead of on every call.
var tgUsernameRe = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)

// extractTGUsernameFromLink returns the username from a t.me/xxx or
// telegram.me/xxx link, with or without a scheme. It returns "" when the
// link has no username-shaped first path segment.
//
// It also returns "" when that segment is one of Telegram's reserved link
// prefixes (t.me/joinchat/<hash>, t.me/addstickers/<pack>, t.me/share/url,
// ...): those identify invite links, sticker packs and the like, not a
// channel or user. The reserved-word check is case-insensitive.
func extractTGUsernameFromLink(link string) string {
	m := tgUsernameRe.FindStringSubmatch(link)
	if len(m) < 2 {
		return ""
	}
	name := m[1]
	switch strings.ToLower(name) {
	case "joinchat", "addstickers", "addemoji", "addtheme", "addlist",
		"share", "proxy", "socks", "setlanguage", "login",
		"confirmphone", "invoice", "giftcode", "boost", "contact":
		return ""
	}
	return name
}
|