Browse Source

feat(telegram): broader participant search coverage + jittered pacing

- Expand participantSearchQueries: add Japanese hiragana, Korean hangul,
  more Chinese surnames + common given-name characters + city prefixes
  (~96 → ~168 queries)
- Pace queries that surface new users with a random 2-4s sleep (was a fixed
  300ms) to dodge rate-limit pattern detection; pace result pages with a
  random 800-1500ms sleep (was a fixed 500ms)
- FloodWait / other errors during search now break the query loop and
  return partial results instead of silently continuing
- Early-exit when we've found all known participants

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
dot 2 tuần trước cách đây
mục cha
commit
8fb2ca4bce
1 tập tin đã thay đổi với 71 bổ sung và 17 xóa
  1. 71 17
      internal/telegram/client.go

+ 71 - 17
internal/telegram/client.go

@@ -6,6 +6,7 @@ import (
 	"encoding/base64"
 	"fmt"
 	"log"
+	"math/rand/v2"
 	"net"
 	"net/url"
 	"regexp"
@@ -428,22 +429,35 @@ func (c *Client) GetGroupParticipants(ctx context.Context, username string) ([]G
 		return nil, err
 	}
 
-	// Phase 2: If group has more members than we found, search by character sets to discover more
+	// Phase 2: If group has more members than we found, search by character sets to discover more.
+	// We pace queries with jitter (2–4s) to avoid looking like a bot scanner and triggering FloodWait.
+	// If FloodWait does hit, stop early and return what we already have — the calling task can
+	// re-attempt later after the account cools down.
 	if totalCount > len(allParticipants) && totalCount <= 10000 {
 		queries := participantSearchQueries()
 		for _, q := range queries {
 			if ctx.Err() != nil {
 				break
 			}
+			if len(allParticipants) >= totalCount {
+				break // already collected everyone visible
+			}
 			beforeCount := len(allParticipants)
-			_ = c.fetchParticipantPages(ctx, api, inputChannel, q, seen, extractUsers, nil)
+			err := c.fetchParticipantPages(ctx, api, inputChannel, q, seen, extractUsers, nil)
+			if err != nil {
+				if fwe, ok := err.(*FloodWaitError); ok {
+					log.Printf("[tg_client] flood wait %ds during search q=%q for %s; returning %d/%d",
+						fwe.Seconds, q, username, len(allParticipants), totalCount)
+				} else {
+					log.Printf("[tg_client] search q=%q for %s: %v (returning partial)", q, username, err)
+				}
+				break
+			}
 			if len(allParticipants) == beforeCount {
-				continue // No new results for this query
+				continue // no new results; skip sleep and try next query
 			}
-			select {
-			case <-ctx.Done():
-				return allParticipants, ctx.Err()
-			case <-time.After(300 * time.Millisecond):
+			if err := jitterSleep(ctx, 2*time.Second, 4*time.Second); err != nil {
+				return allParticipants, err
 			}
 		}
 	}
@@ -452,9 +466,25 @@ func (c *Client) GetGroupParticipants(ctx context.Context, username string) ([]G
 	return allParticipants, nil
 }
 
-// participantSearchQueries returns search queries covering Latin, Cyrillic, CJK and other scripts.
+// jitterSleep blocks for a random duration in [lo, hi) or until ctx is done,
+// whichever comes first. It returns ctx.Err() when the context ends the wait
+// and nil when the timer fires. It is used to spread out TG API calls so the
+// request cadence does not look like a deterministic scanner.
+//
+// When hi <= lo there is no range to draw from, so the wait is exactly lo:
+// rand.Int64N panics on a non-positive argument and is skipped in that case.
+func jitterSleep(ctx context.Context, lo, hi time.Duration) error {
+	// Fail fast on an already-finished context instead of arming a timer.
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+	d := lo
+	if span := hi - lo; span > 0 {
+		d += time.Duration(rand.Int64N(int64(span)))
+	}
+	// An explicit timer (rather than time.After) can be stopped as soon as the
+	// context wins the race, so it does not linger for the rest of d.
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-timer.C:
+		return nil
+	}
+}
+
+// participantSearchQueries returns search queries covering Latin, Cyrillic, Japanese,
+// Korean, and Chinese scripts. TG's ChannelParticipantsSearch does substring matching on
+// first_name + last_name + username, so more starter-character coverage = more users
+// surfaced on groups beyond the 200-per-query cap. Total ~170 queries.
 func participantSearchQueries() []string {
-	queries := make([]string, 0, 80)
+	queries := make([]string, 0, 170)
 	// Latin a-z
 	for c := 'a'; c <= 'z'; c++ {
 		queries = append(queries, string(c))
@@ -467,11 +497,36 @@ func participantSearchQueries() []string {
 	for c := 'а'; c <= 'я'; c++ {
 		queries = append(queries, string(c))
 	}
-	// Common CJK first characters (high frequency Chinese surnames and words)
-	cjk := []string{"王", "李", "张", "刘", "陈", "杨", "黄", "赵", "周", "吴",
+	// Japanese Hiragana — common name-starter syllables
+	queries = append(queries,
+		"あ", "い", "う", "え", "お",
+		"か", "さ", "た", "な", "ま",
+	)
+	// Korean Hangul — common initial syllables
+	queries = append(queries,
+		"가", "나", "다", "라", "마", "바", "사", "아", "자", "차",
+		"카", "타", "파", "하",
+	)
+	// CJK: top Chinese surnames (百家姓 high frequency)
+	surnames := []string{
+		"王", "李", "张", "刘", "陈", "杨", "黄", "赵", "周", "吴",
 		"徐", "孙", "马", "朱", "胡", "林", "何", "高", "郭", "罗",
-		"大", "小", "新", "老", "中", "天", "金", "一"}
-	queries = append(queries, cjk...)
+		"谢", "宋", "唐", "许", "邓", "梁", "韩", "曹", "彭", "余",
+		"潘", "袁", "蒋", "蔡", "卢", "田", "董", "叶", "程", "姜",
+	}
+	queries = append(queries, surnames...)
+	// CJK: common given-name characters (高频二字名)
+	given := []string{
+		"伟", "芳", "娜", "秀", "敏", "静", "丽", "强", "磊", "军",
+		"洋", "勇", "艳", "杰", "涛", "明", "超", "霞", "平", "刚",
+	}
+	queries = append(queries, given...)
+	// CJK: common modifiers and city prefixes (covers nicknames/titles)
+	misc := []string{
+		"大", "小", "新", "老", "中", "天", "金", "一", "龙", "虎",
+		"京", "沪", "深", "广", "杭", "苏",
+	}
+	queries = append(queries, misc...)
 	return queries
 }
 
@@ -517,10 +572,9 @@ func (c *Client) fetchParticipantPages(
 			break
 		}
 
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case <-time.After(500 * time.Millisecond):
+		// Page interval: jittered to avoid a detectable request cadence.
+		if err := jitterSleep(ctx, 800*time.Millisecond, 1500*time.Millisecond); err != nil {
+			return err
 		}
 	}
 	return nil