blacklist.go 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. package processor
  2. import (
  3. "math"
  4. "regexp"
  5. "spider/internal/extractor"
  6. "spider/internal/model"
  7. "strings"
  8. )
  9. // BlacklistResult holds the outcome of blacklist filtering.
  10. type BlacklistResult struct {
  11. Passed []model.MerchantRaw
  12. Blocked []FilteredMerchant
  13. }
  14. // FilteredMerchant pairs a raw merchant with its rejection status.
  15. type FilteredMerchant struct {
  16. Raw model.MerchantRaw
  17. Status string // bot / invalid
  18. }
  19. var systemBots = []string{
  20. "telegram", "telegramhints", "gif", "pic", "bing", "vid",
  21. "bold", "vote", "like", "sticker", "music",
  22. "channel_bot", "botfather", "spambot",
  23. // Common false positives from HTML/code extraction
  24. "github", "gmail", "email", "admin", "login", "signup",
  25. "about", "contact", "support", "https", "style",
  26. "script", "header", "footer", "button", "input",
  27. "image", "video", "media", "share", "click",
  28. "undefined", "object", "string", "number", "function",
  29. "return", "const", "class", "export", "import",
  30. "ssage", "messages", "channel", "username",
  31. }
  32. var reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
  33. // FilterBlacklist applies blacklist rules to raw merchants.
  34. func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult {
  35. var result BlacklistResult
  36. for _, raw := range raws {
  37. status := checkBlacklist(raw)
  38. if status != "" {
  39. result.Blocked = append(result.Blocked, FilteredMerchant{Raw: raw, Status: status})
  40. } else {
  41. result.Passed = append(result.Passed, raw)
  42. }
  43. }
  44. return result
  45. }
  46. func checkBlacklist(raw model.MerchantRaw) string {
  47. username := strings.ToLower(raw.TgUsername)
  48. // System bot names
  49. for _, b := range systemBots {
  50. if username == b {
  51. return "bot"
  52. }
  53. }
  54. // xxxbot suffix
  55. if strings.HasSuffix(username, "bot") && len(username) > 3 {
  56. return "bot"
  57. }
  58. // Invite link hash (16-24 char base64 with high entropy)
  59. if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 {
  60. if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 {
  61. return "invalid"
  62. }
  63. }
  64. // Original text non-Chinese: only filter if text is long and has NO Chinese at all
  65. // Short texts and mixed-language texts are allowed
  66. if len(raw.OriginalText) > 200 && !extractor.ContainsChinese(raw.OriginalText, 0) {
  67. return "invalid"
  68. }
  69. return ""
  70. }
  71. func entropy(s string) float64 {
  72. freq := map[rune]int{}
  73. for _, r := range s {
  74. freq[r]++
  75. }
  76. n := float64(len(s))
  77. h := 0.0
  78. for _, count := range freq {
  79. p := float64(count) / n
  80. h -= p * math.Log2(p)
  81. }
  82. return h
  83. }