blacklist.go 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. package processor
  2. import (
  3. "math"
  4. "regexp"
  5. "spider/internal/extractor"
  6. "spider/internal/model"
  7. "strings"
  8. )
// BlacklistResult holds the outcome of blacklist filtering: the input
// merchants split into those that passed and those that were rejected.
type BlacklistResult struct {
	// Passed lists the merchants that no blacklist rule rejected.
	Passed []model.MerchantRaw
	// Blocked lists the rejected merchants, each paired with its rejection status.
	Blocked []FilteredMerchant
}
// FilteredMerchant pairs a raw merchant with its rejection status.
type FilteredMerchant struct {
	// Raw is the merchant record exactly as it was passed to the filter.
	Raw model.MerchantRaw
	// Status names the rule category that rejected the merchant.
	Status string // bot / invalid
}
// systemBots lists usernames that are rejected as bots on an exact match.
// Entries are compared against the lowercased username, so they must be
// written in lowercase here.
// NOTE(review): presumably these are Telegram built-in/service accounts —
// confirm before extending the list.
var systemBots = []string{
	"telegram", "telegramhints", "gif", "pic", "bing", "vid",
	"bold", "vote", "like", "sticker", "music",
	"channel_bot", "botfather", "spambot",
}
// reBase64 matches a string made up entirely of 16 to 24 characters drawn
// from A-Z, a-z, 0-9, '_' and '-' (the URL-safe base64 alphabet). It is
// compiled once at package scope so it is not rebuilt on every check.
var reBase64 = regexp.MustCompile(`^[A-Za-z0-9_-]{16,24}$`)
  25. // FilterBlacklist applies blacklist rules to raw merchants.
  26. func FilterBlacklist(raws []model.MerchantRaw) BlacklistResult {
  27. var result BlacklistResult
  28. for _, raw := range raws {
  29. status := checkBlacklist(raw)
  30. if status != "" {
  31. result.Blocked = append(result.Blocked, FilteredMerchant{Raw: raw, Status: status})
  32. } else {
  33. result.Passed = append(result.Passed, raw)
  34. }
  35. }
  36. return result
  37. }
  38. func checkBlacklist(raw model.MerchantRaw) string {
  39. username := strings.ToLower(raw.TgUsername)
  40. // System bot names
  41. for _, b := range systemBots {
  42. if username == b {
  43. return "bot"
  44. }
  45. }
  46. // xxxbot suffix
  47. if strings.HasSuffix(username, "bot") && len(username) > 3 {
  48. return "bot"
  49. }
  50. // Invite link hash (16-24 char base64 with high entropy)
  51. if len(raw.TgUsername) >= 16 && len(raw.TgUsername) <= 24 {
  52. if reBase64.MatchString(raw.TgUsername) && entropy(raw.TgUsername) > 3.5 {
  53. return "invalid"
  54. }
  55. }
  56. // Original text non-Chinese
  57. if raw.OriginalText != "" && !extractor.ContainsChinese(raw.OriginalText, 0) {
  58. return "invalid"
  59. }
  60. return ""
  61. }
  62. func entropy(s string) float64 {
  63. freq := map[rune]int{}
  64. for _, r := range s {
  65. freq[r]++
  66. }
  67. n := float64(len(s))
  68. h := 0.0
  69. for _, count := range freq {
  70. p := float64(count) / n
  71. h -= p * math.Log2(p)
  72. }
  73. return h
  74. }