package processor

import (
	"math"
	"regexp"
	"strings"

	"spider/internal/extractor"
	"spider/internal/model"
)

type (
	// BlacklistResult is what a blacklist pass produces: the merchants that
	// survived every rule and the ones that were rejected.
	BlacklistResult struct {
		Passed  []model.MerchantRaw
		Blocked []FilteredMerchant
	}

	// FilteredMerchant is a rejected raw merchant together with the status
	// naming the kind of rule that rejected it.
	FilteredMerchant struct {
		Raw    model.MerchantRaw
		Status string // bot / invalid
	}
)

var (
	// systemBots lists usernames that are rejected on exact match. Entries
	// are lowercase because they are compared against the lowercased username.
	systemBots = []string{
		"telegram", "telegramhints", "gif", "pic", "bing", "vid", "bold",
		"vote", "like", "sticker", "music", "channel_bot", "botfather", "spambot",
		// Common false positives from HTML/code extraction
		"github", "gmail", "email", "admin", "login", "signup", "about",
		"contact", "support", "https", "style", "script", "header", "footer",
		"button", "input", "image", "video", "media", "share", "click",
		"undefined", "object", "string", "number", "function", "return",
		"const", "class", "export", "import", "ssage", "messages", "channel",
		"username",
	}

	// reBase64 matches a whole string of 16 to 24 letters, digits,
	// underscores or hyphens; it is used to spot invite-link hashes that were
	// picked up as usernames.
	reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
)

// FilterBlacklist applies blacklist rules to raw merchants.
func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult { var result BlacklistResult for _, raw := range raws { status := checkBlacklist(raw) if status != "" { result.Blocked = append(result.Blocked, FilteredMerchant{Raw: raw, Status: status}) } else { result.Passed = append(result.Passed, raw) } } return result } func checkBlacklist(raw model.MerchantRaw) string { username := strings.ToLower(raw.TgUsername) // System bot names for _, b := range systemBots { if username == b { return "bot" } } // xxxbot suffix if strings.HasSuffix(username, "bot") && len(username) > 3 { return "bot" } // Invite link hash (16-24 char base64 with high entropy) if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 { if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 { return "invalid" } } // Original text non-Chinese: only filter if text is long and has NO Chinese at all // Short texts and mixed-language texts are allowed if len(raw.OriginalText) > 200 && !extractor.ContainsChinese(raw.OriginalText, 0) { return "invalid" } return "" } func entropy(s string) float64 { freq := map[rune]int{} for _, r := range s { freq[r]++ } n := float64(len(s)) h := 0.0 for _, count := range freq { p := float64(count) / n h -= p * math.Log2(p) } return h }