collector.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. package githubcollector
  2. import (
  3. "context"
  4. "encoding/json"
  5. "fmt"
  6. "io"
  7. "log"
  8. "net/http"
  9. "net/url"
  10. "regexp"
  11. "strings"
  12. "sync/atomic"
  13. "time"
  14. "spider/internal/extractor"
  15. "spider/internal/model"
  16. "spider/internal/plugin"
  17. "spider/internal/store"
  18. )
  19. // Collector implements plugin.Collector for GitHub README mining.
  20. // Searches GitHub repos by keywords, extracts t.me links from READMEs.
  21. type Collector struct {
  22. token string // GitHub token (optional)
  23. store *store.Store
  24. http *http.Client
  25. stopped atomic.Bool
  26. }
  27. // New creates a new GitHub collector.
  28. func New(token string, s *store.Store) *Collector {
  29. return &Collector{
  30. token: token,
  31. store: s,
  32. http: &http.Client{Timeout: 15 * time.Second},
  33. }
  34. }
  35. func (c *Collector) Name() string { return "github_collector" }
  36. func (c *Collector) Stop() error {
  37. c.stopped.Store(true)
  38. return nil
  39. }
  40. // Run searches GitHub for repos matching keywords, extracts t.me links from READMEs.
  41. //
  42. // cfg keys:
  43. // - "keywords": []string — search keywords
  44. // - "repos_limit": int — max repos to process (default 50)
  45. func (c *Collector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
  46. c.stopped.Store(false)
  47. keywords, _ := cfg["keywords"].([]string)
  48. if len(keywords) == 0 {
  49. log.Println("[github_collector] no keywords provided")
  50. return nil
  51. }
  52. reposLimit := 50
  53. if v, ok := cfg["repos_limit"].(int); ok && v > 0 {
  54. reposLimit = v
  55. }
  56. queries := make([]string, 0, len(keywords))
  57. for _, kw := range keywords {
  58. queries = append(queries, fmt.Sprintf("%s telegram", kw))
  59. }
  60. reposPerQuery := 1
  61. if len(queries) > 0 {
  62. reposPerQuery = reposLimit/len(queries) + 1
  63. }
  64. found := 0
  65. for _, query := range queries {
  66. if c.stopped.Load() || ctx.Err() != nil {
  67. break
  68. }
  69. log.Printf("[github_collector] searching: %s", query)
  70. repos, err := c.searchRepos(ctx, query, reposPerQuery)
  71. if err != nil {
  72. log.Printf("[github_collector] search error: %v", err)
  73. continue
  74. }
  75. for _, repo := range repos {
  76. if c.stopped.Load() || ctx.Err() != nil {
  77. break
  78. }
  79. readme, err := c.fetchReadme(ctx, repo)
  80. if err != nil {
  81. continue
  82. }
  83. // Filter: README must contain Chinese
  84. preview := readme
  85. if len(preview) > 5000 {
  86. preview = preview[:5000]
  87. }
  88. if !extractor.ContainsChinese(preview, 0) {
  89. continue
  90. }
  91. // Extract t.me links
  92. links := extractTMeLinks(readme)
  93. for _, link := range links {
  94. // Context check: 200 chars around link must contain Chinese
  95. idx := strings.Index(readme, link)
  96. if idx < 0 {
  97. continue
  98. }
  99. start := idx - 200
  100. if start < 0 {
  101. start = 0
  102. }
  103. end := idx + len(link) + 200
  104. if end > len(readme) {
  105. end = len(readme)
  106. }
  107. context200 := readme[start:end]
  108. if !extractor.ContainsChinese(context200, 0) {
  109. continue
  110. }
  111. username := extractTGUsername(link)
  112. if username == "" {
  113. continue
  114. }
  115. // Save channel to DB
  116. c.store.UpsertChannel(&model.Channel{
  117. Username: username,
  118. Source: "github",
  119. Status: "pending",
  120. })
  121. callback(plugin.MerchantData{
  122. TgUsername: username,
  123. TgLink: "https://t.me/" + username,
  124. SourceType: "github",
  125. SourceName: repo,
  126. SourceURL: fmt.Sprintf("https://github.com/%s", repo),
  127. })
  128. found++
  129. }
  130. // Delay between repos
  131. select {
  132. case <-ctx.Done():
  133. return nil
  134. case <-time.After(2 * time.Second):
  135. }
  136. }
  137. // Delay between queries
  138. select {
  139. case <-ctx.Done():
  140. return nil
  141. case <-time.After(5 * time.Second):
  142. }
  143. }
  144. log.Printf("[github_collector] done: %d channels found", found)
  145. return nil
  146. }
  147. func (c *Collector) searchRepos(ctx context.Context, query string, limit int) ([]string, error) {
  148. perPage := limit
  149. if perPage > 30 {
  150. perPage = 30
  151. }
  152. apiURL := fmt.Sprintf("https://api.github.com/search/repositories?q=%s&sort=stars&per_page=%d",
  153. url.QueryEscape(query), perPage)
  154. req, err := http.NewRequestWithContext(ctx, "GET", apiURL, nil)
  155. if err != nil {
  156. return nil, err
  157. }
  158. req.Header.Set("Accept", "application/vnd.github.v3+json")
  159. if c.token != "" {
  160. req.Header.Set("Authorization", "token "+c.token)
  161. }
  162. resp, err := c.http.Do(req)
  163. if err != nil {
  164. return nil, err
  165. }
  166. defer resp.Body.Close()
  167. var result struct {
  168. Items []struct {
  169. FullName string `json:"full_name"`
  170. } `json:"items"`
  171. }
  172. if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
  173. return nil, err
  174. }
  175. var repos []string
  176. for _, item := range result.Items {
  177. repos = append(repos, item.FullName)
  178. }
  179. return repos, nil
  180. }
  181. func (c *Collector) fetchReadme(ctx context.Context, fullName string) (string, error) {
  182. rawURL := fmt.Sprintf("https://raw.githubusercontent.com/%s/main/README.md", fullName)
  183. req, err := http.NewRequestWithContext(ctx, "GET", rawURL, nil)
  184. if err != nil {
  185. return "", err
  186. }
  187. if c.token != "" {
  188. req.Header.Set("Authorization", "token "+c.token)
  189. }
  190. resp, err := c.http.Do(req)
  191. if err != nil {
  192. return "", err
  193. }
  194. defer resp.Body.Close()
  195. if resp.StatusCode == 404 {
  196. masterURL := strings.Replace(rawURL, "/main/", "/master/", 1)
  197. req2, err := http.NewRequestWithContext(ctx, "GET", masterURL, nil)
  198. if err != nil {
  199. return "", err
  200. }
  201. if c.token != "" {
  202. req2.Header.Set("Authorization", "token "+c.token)
  203. }
  204. resp2, err := c.http.Do(req2)
  205. if err != nil {
  206. return "", err
  207. }
  208. defer resp2.Body.Close()
  209. data, _ := io.ReadAll(resp2.Body)
  210. return string(data), nil
  211. }
  212. data, _ := io.ReadAll(resp.Body)
  213. return string(data), nil
  214. }
  215. var reTMeLink = regexp.MustCompile(`https?://t(?:elegram)?\.me/[a-zA-Z][a-zA-Z0-9_]{4,31}`)
  216. var reTGUsername = regexp.MustCompile(`t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  217. func extractTMeLinks(text string) []string {
  218. return reTMeLink.FindAllString(text, -1)
  219. }
  220. func extractTGUsername(link string) string {
  221. m := reTGUsername.FindStringSubmatch(link)
  222. if len(m) > 1 {
  223. return m[1]
  224. }
  225. return ""
  226. }