package processor

import (
	"math"
	"regexp"
	"strings"

	"spider/internal/extractor"
	"spider/internal/model"
)

// BlacklistResult holds the outcome of blacklist filtering.
type BlacklistResult struct {
	// Passed contains the merchants that matched no blacklist rule,
	// in input order.
	Passed []model.MerchantRaw
	// Blocked contains the rejected merchants together with the reason,
	// in input order.
	Blocked []FilteredMerchant
}

// FilteredMerchant pairs a raw merchant with its rejection status.
type FilteredMerchant struct {
	Raw model.MerchantRaw
	// Status is the rejection reason: "bot" or "invalid".
	Status string
}

// systemBots lists usernames that are rejected outright with status "bot".
// Entries are lowercase because they are compared against the lowercased
// username.
var systemBots = []string{
	"telegram",
	"telegramhints",
	"gif",
	"pic",
	"bing",
	"vid",
	"bold",
	"vote",
	"like",
	"sticker",
	"music",
	"channel_bot",
	"botfather",
	"spambot",
}

// reBase64 matches a whole string of 16 to 24 characters drawn from the
// URL-safe base64 alphabet (letters, digits, '_' and '-').
var reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)

// FilterBlacklist applies the blacklist rules to raw merchants and splits
// them into those that passed and those that were blocked. Relative order is
// preserved within each group. Both slices of the result stay nil when
// nothing is appended to them.
func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult {
	var result BlacklistResult
	for _, raw := range raws {
		status := checkBlacklist(raw)
		if status == "" {
			result.Passed = append(result.Passed, raw)
			continue
		}
		result.Blocked = append(result.Blocked, FilteredMerchant{Raw: raw, Status: status})
	}
	return result
}

// checkBlacklist returns the rejection status for raw: "bot", "invalid", or
// the empty string when no rule matches. Rules are evaluated in order and the
// first match wins.
func checkBlacklist(raw model.MerchantRaw) string {
	username := strings.ToLower(raw.TgUsername)

	// Exact match against the known system bot names (case-insensitive).
	for _, b := range systemBots {
		if username == b {
			return "bot"
		}
	}

	// Any username ending in "bot"; the length guard keeps the bare name
	// "bot" itself from matching this rule.
	if strings.HasSuffix(username, "bot") && len(username) > 3 {
		return "bot"
	}

	// Invite-link hash: 16-24 base64-alphabet characters with high entropy.
	// The original (case-preserved) username is used on purpose, since
	// lowercasing would reduce the measured entropy. The length test is a
	// cheap pre-filter before running the regexp.
	if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 {
		if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 {
			return "invalid"
		}
	}

	// Non-empty original text that extractor.ContainsChinese rejects.
	// NOTE(review): the meaning of the second argument (0) is defined by the
	// extractor package and is not visible here; presumably a minimum count
	// or ratio. Confirm against extractor.ContainsChinese.
	if raw.OriginalText != "" && !extractor.ContainsChinese(raw.OriginalText, 0) {
		return "invalid"
	}

	return ""
}

// entropy returns the Shannon entropy of s in bits per character, computed
// over the frequency distribution of its runes. It returns 0 for the empty
// string.
func entropy(s string) float64 {
	freq := make(map[rune]int)
	total := 0
	for _, r := range s {
		freq[r]++
		total++
	}
	if total == 0 {
		return 0
	}

	// Divide by the number of runes counted above, not len(s): len(s) is a
	// byte count, which would make the probabilities sum to less than 1 for
	// any multi-byte (non-ASCII) input and understate the entropy.
	n := float64(total)
	h := 0.0
	for _, count := range freq {
		p := float64(count) / n
		h -= p * math.Log2(p)
	}
	return h
}