// Package githubcollector mines GitHub repository READMEs for Telegram
// (t.me / telegram.me) links.
package githubcollector

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync/atomic"
	"time"
	"unicode/utf8"

	"spider/internal/extractor"
	"spider/internal/model"
	"spider/internal/plugin"
	"spider/internal/store"
)

const (
	// defaultReposLimit is used when cfg["repos_limit"] is absent or not positive.
	defaultReposLimit = 50
	// maxSearchPerPage caps the per_page value sent to the search API.
	maxSearchPerPage = 30
	// readmePreviewBytes is how much of a README is inspected by the
	// repository-level Chinese-text filter.
	readmePreviewBytes = 5000
	// linkContextBytes is how many bytes on each side of a link are inspected
	// by the per-link Chinese-text filter.
	linkContextBytes = 200
	// maxReadmeBytes bounds how much of a README response is read into memory.
	maxReadmeBytes = 4 << 20

	// repoDelay is the pause after a repository that passed the README filter.
	repoDelay = 2 * time.Second
	// queryDelay is the pause after each successful search query.
	queryDelay = 5 * time.Second
)

// errReadmeNotFound is returned by fetchReadme when none of the probed
// branches has a README.md.
var errReadmeNotFound = errors.New("readme not found")

// Collector implements plugin.Collector for GitHub README mining.
// It searches GitHub repos by keywords and extracts t.me links from READMEs.
type Collector struct {
	token   string // GitHub token (optional); sent as an Authorization header when set
	store   *store.Store
	http    *http.Client
	stopped atomic.Bool // set by Stop, cleared at the start of Run
}

// New creates a new GitHub collector that uses an HTTP client with a
// 15-second timeout.
func New(token string, s *store.Store) *Collector {
	return &Collector{
		token: token,
		store: s,
		http:  &http.Client{Timeout: 15 * time.Second},
	}
}

// Name returns the collector's identifier.
func (c *Collector) Name() string { return "github_collector" }

// Stop asks a running Run to finish. The flag is polled between repositories
// and between queries, so an in-progress request or delay is not interrupted.
func (c *Collector) Stop() error {
	c.stopped.Store(true)
	return nil
}

// halted reports whether Stop was called or ctx is done.
func (c *Collector) halted(ctx context.Context) bool {
	return c.stopped.Load() || ctx.Err() != nil
}

// Run searches GitHub for repos matching keywords and extracts t.me links
// from their READMEs. Each accepted channel is upserted into the store and
// passed to callback.
//
// cfg keys:
//   - "keywords": []string (or []any of strings) — search keywords
//   - "repos_limit": int (or int64/float64) — max repos to process (default 50)
//
// Run always returns nil; search failures are logged and the query skipped,
// and cancellation of ctx ends the run early without an error.
func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
	c.stopped.Store(false)

	keywords := stringsFromConfig(cfg["keywords"])
	if len(keywords) == 0 {
		log.Println("[github_collector] no keywords provided")
		return nil
	}

	reposLimit := defaultReposLimit
	if v, ok := intFromConfig(cfg["repos_limit"]); ok && v > 0 {
		reposLimit = v
	}

	// Spread the budget across queries; the +1 rounds up so that a small
	// limit still yields at least one repo per query. The overall cap is
	// enforced separately through the processed counter.
	reposPerQuery := reposLimit/len(keywords) + 1

	found, processed := 0, 0
	for _, kw := range keywords {
		if c.halted(ctx) || processed >= reposLimit {
			break
		}

		query := fmt.Sprintf("%s telegram", kw)
		log.Printf("[github_collector] searching: %s", query)

		repos, err := c.searchRepos(ctx, query, reposPerQuery)
		if err != nil {
			log.Printf("[github_collector] search error: %v", err)
			continue
		}

		for _, repo := range repos {
			if c.halted(ctx) || processed >= reposLimit {
				break
			}
			processed++

			n, mined := c.mineRepo(ctx, repo, callback)
			found += n
			if !mined {
				// Repos that were skipped (no README, filter miss) do not
				// incur the inter-repo delay.
				continue
			}
			if !sleepCtx(ctx, repoDelay) {
				return nil
			}
		}

		if processed >= reposLimit {
			// Budget exhausted: no further query follows, so skip the delay.
			break
		}
		if !sleepCtx(ctx, queryDelay) {
			return nil
		}
	}

	log.Printf("[github_collector] done: %d channels found", found)
	return nil
}

// mineRepo fetches one repository's README, applies the Chinese-text filters
// and reports every distinct Telegram username that passes them. It returns
// the number of channels reported and whether the README passed the
// repository-level filter (i.e. whether links were actually examined).
func (c *Collector) mineRepo(ctx context.Context, repo string, callback func(plugin.MerchantData)) (found int, mined bool) {
	readme, err := c.fetchReadme(ctx, repo)
	if err != nil {
		// A missing README is routine; anything else is worth surfacing.
		if !errors.Is(err, errReadmeNotFound) {
			log.Printf("[github_collector] readme %s: %v", repo, err)
		}
		return 0, false
	}

	// Filter: the start of the README must contain Chinese. The cut is moved
	// back to a rune boundary so a multi-byte character is never split.
	// NOTE(review): the meaning of ContainsChinese's second argument (0) is
	// defined in the extractor package — confirm it is the intended value.
	preview := readme[:runeBoundary(readme, readmePreviewBytes)]
	if !extractor.ContainsChinese(preview, 0) {
		return 0, false
	}

	// Usernames already reported for this repo, keyed case-insensitively so a
	// link repeated in the README produces a single upsert/callback.
	seen := make(map[string]struct{})

	for _, loc := range reTMeLink.FindAllStringIndex(readme, -1) {
		username := extractTGUsername(readme[loc[0]:loc[1]])
		if username == "" {
			continue
		}
		key := strings.ToLower(username)
		if _, dup := seen[key]; dup {
			continue
		}

		// Context check: the bytes around THIS occurrence of the link must
		// contain Chinese. Using the match offsets (rather than searching for
		// the link text again) keeps repeated links tied to their own context.
		lo := runeBoundary(readme, loc[0]-linkContextBytes)
		hi := runeBoundary(readme, loc[1]+linkContextBytes)
		if !extractor.ContainsChinese(readme[lo:hi], 0) {
			continue
		}
		seen[key] = struct{}{}

		// Save channel to DB.
		// NOTE(review): any value returned by UpsertChannel is not inspected
		// here — confirm whether it can report an error worth logging.
		c.store.UpsertChannel(&model.Channel{
			Username: username,
			Source:   "github",
			Status:   "pending",
		})

		callback(plugin.MerchantData{
			TgUsername: username,
			TgLink:     "https://t.me/" + username,
			SourceType: "github",
			SourceName: repo,
			SourceURL:  fmt.Sprintf("https://github.com/%s", repo),
		})
		found++
	}
	return found, true
}

// searchRepos queries the GitHub repository search API, sorted by stars, and
// returns the full names ("owner/repo") of up to limit results (capped at
// maxSearchPerPage). A non-200 response is returned as an error instead of
// being decoded as an empty result.
func (c *Collector) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
	perPage := limit
	if perPage > maxSearchPerPage {
		perPage = maxSearchPerPage
	}
	if perPage < 1 {
		perPage = 1
	}

	apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
		url.QueryEscape(query), perPage)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", "application/vnd.github.v3+json")
	if c.token != "" {
		req.Header.Set("Authorization", "token "+c.token)
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Error responses (rate limiting, bad credentials, validation errors)
	// carry a JSON body without "items"; report them rather than returning
	// an empty, apparently successful, result.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("searching %q: unexpected status %s", query, resp.Status)
	}

	var result struct {
		Items []struct {
			FullName string `json:"full_name"`
		} `json:"items"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, fmt.Errorf("decoding search response for %q: %w", query, err)
	}

	repos := make([]string, 0, len(result.Items))
	for _, item := range result.Items {
		repos = append(repos, item.FullName)
	}
	return repos, nil
}

// fetchReadme downloads README.md for fullName ("owner/repo") from
// raw.githubusercontent.com, trying the "main" branch first and falling back
// to "master" on a 404. It returns errReadmeNotFound (wrapped) when neither
// branch has the file, and an error for any other non-200 status.
func (c *Collector) fetchReadme(ctx context.Context, fullName string) (string, error) {
	for _, branch := range [...]string{"main", "master"} {
		rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/README.md", fullName, branch)

		body, status, err := c.fetchRaw(ctx, rawURL)
		switch {
		case err != nil:
			return "", err
		case status == http.StatusOK:
			return body, nil
		case status == http.StatusNotFound:
			continue // try the next branch
		default:
			return "", fmt.Errorf("fetching %s: unexpected status %d", rawURL, status)
		}
	}
	return "", fmt.Errorf("%s: %w", fullName, errReadmeNotFound)
}

// fetchRaw performs a single GET and returns the response status together
// with the body. The body is only read (up to maxReadmeBytes) when the status
// is 200; for any other status the returned body is empty.
func (c *Collector) fetchRaw(ctx context.Context, rawURL string) (string, int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", 0, err
	}
	if c.token != "" {
		req.Header.Set("Authorization", "token "+c.token)
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return "", 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", resp.StatusCode, nil
	}

	data, err := io.ReadAll(io.LimitReader(resp.Body, maxReadmeBytes))
	if err != nil {
		return "", resp.StatusCode, fmt.Errorf("reading %s: %w", rawURL, err)
	}
	return string(data), resp.StatusCode, nil
}

// sleepCtx waits for d or until ctx is done. It reports true when the full
// delay elapsed and false when ctx ended first.
func sleepCtx(ctx context.Context, d time.Duration) bool {
	t := time.NewTimer(d)
	defer t.Stop()

	select {
	case <-ctx.Done():
		return false
	case <-t.C:
		return true
	}
}

// runeBoundary clamps i to [0, len(s)] and, when i falls inside a multi-byte
// UTF-8 sequence, moves it back to the first byte of that sequence so that
// slicing s at the result never splits a character.
func runeBoundary(s string, i int) int {
	if i <= 0 {
		return 0
	}
	if i >= len(s) {
		return len(s)
	}
	// A valid sequence has at most UTFMax-1 continuation bytes; bounding the
	// walk keeps this O(1) even on invalid input.
	for n := 0; n < utf8.UTFMax-1 && i > 0 && !utf8.RuneStart(s[i]); n++ {
		i--
	}
	return i
}

// stringsFromConfig extracts a string list from a config value. It accepts
// []string as-is and []any (as produced by generic JSON/YAML decoding), from
// which non-string and empty elements are dropped. Any other type yields nil.
func stringsFromConfig(v any) []string {
	switch vals := v.(type) {
	case []string:
		return vals
	case []any:
		out := make([]string, 0, len(vals))
		for _, item := range vals {
			if s, ok := item.(string); ok && s != "" {
				out = append(out, s)
			}
		}
		return out
	}
	return nil
}

// intFromConfig extracts an integer from a config value, accepting int,
// int64 and float64 (the type encoding/json uses for numbers decoded into
// any). The boolean is false for every other type.
func intFromConfig(v any) (int, bool) {
	switch n := v.(type) {
	case int:
		return n, true
	case int64:
		return int(n), true
	case float64:
		return int(n), true
	}
	return 0, false
}

var (
	// reTMeLink matches an http(s) t.me / telegram.me URL whose first path
	// segment is 5–32 characters: a letter followed by letters, digits or
	// underscores.
	reTMeLink = regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)
	// reTGUsername captures that first path segment.
	reTGUsername = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
)

// extractTMeLinks returns every t.me / telegram.me link found in text, in
// order of appearance (duplicates included).
func extractTMeLinks(text string) []string {
	return reTMeLink.FindAllString(text, -1)
}

// extractTGUsername returns the username part of a t.me / telegram.me link,
// or "" when link does not contain one or when the first path segment is a
// reserved t.me route (for example "joinchat" in t.me/joinchat/<hash>) rather
// than a username.
func extractTGUsername(link string) string {
	m := reTGUsername.FindStringSubmatch(link)
	if len(m) < 2 || isReservedTGPath(m[1]) {
		return ""
	}
	return m[1]
}

// isReservedTGPath reports whether name is a t.me route keyword that is long
// enough to be mistaken for a username by reTGUsername. The comparison is
// case-insensitive.
// NOTE(review): list compiled from Telegram's deep-link documentation —
// extend it if new routes appear.
func isReservedTGPath(name string) bool {
	switch strings.ToLower(name) {
	case "joinchat", "addstickers", "addemoji", "addtheme", "addlist",
		"share", "proxy", "socks", "setlanguage", "login",
		"confirmphone", "invoice", "boost", "giftcode", "contact":
		return true
	}
	return false
}