| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- package processor
- import (
- "math"
- "regexp"
- "spider/internal/extractor"
- "spider/internal/model"
- "strings"
- )
- // BlacklistResult holds the outcome of blacklist filtering.
- type BlacklistResult struct {
- Passed []model.MerchantRaw
- Blocked []FilteredMerchant
- }
- // FilteredMerchant pairs a raw merchant with its rejection status.
- type FilteredMerchant struct {
- Raw model.MerchantRaw
- Status string // bot / invalid
- }
var (
	// systemBots lists usernames that are always classified as bots.
	// checkBlacklist compares them against the lower-cased username, so every
	// entry here must be lower case.
	systemBots = []string{
		"telegram",
		"telegramhints",
		"gif",
		"pic",
		"bing",
		"vid",
		"bold",
		"vote",
		"like",
		"sticker",
		"music",
		"channel_bot",
		"botfather",
		"spambot",
	}

	// reBase64 matches a whole string made of 16 to 24 characters from the
	// URL-safe base64 alphabet (ASCII letters, digits, '_' and '-').
	reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
)
- // FilterBlacklist applies blacklist rules to raw merchants.
- func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult {
- var result BlacklistResult
- for _, raw := range raws {
- status := checkBlacklist(raw)
- if status != "" {
- result.Blocked = append(result.Blocked, FilteredMerchant{Raw: raw, Status: status})
- } else {
- result.Passed = append(result.Passed, raw)
- }
- }
- return result
- }
- func checkBlacklist(raw model.MerchantRaw) string {
- username := strings.ToLower(raw.TgUsername)
- // System bot names
- for _, b := range systemBots {
- if username == b {
- return "bot"
- }
- }
- // xxxbot suffix
- if strings.HasSuffix(username, "bot") && len(username) > 3 {
- return "bot"
- }
- // Invite link hash (16-24 char base64 with high entropy)
- if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 {
- if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 {
- return "invalid"
- }
- }
- // Original text non-Chinese
- if raw.OriginalText != "" && !extractor.ContainsChinese(raw.OriginalText, 0) {
- return "invalid"
- }
- return ""
- }
// entropy returns the Shannon entropy of s in bits per symbol, where a symbol
// is one rune as produced by ranging over the string. The result is 0 for an
// empty string and for a string made of a single repeated rune.
//
// The symbol total is counted during iteration rather than taken from len(s):
// len reports bytes, so dividing rune counts by it would make the
// probabilities sum to less than 1 for any non-ASCII input and under-report
// the entropy.
func entropy(s string) float64 {
	counts := make(map[rune]int)
	total := 0
	for _, r := range s {
		counts[r]++
		total++
	}
	if total == 0 {
		return 0
	}

	n := float64(total)
	var h float64
	for _, c := range counts {
		p := float64(c) / n
		h -= p * math.Log2(p)
	}
	return h
}
|