regex.go 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package extractor
  2. import (
  3. "regexp"
  4. "strings"
  5. "unicode"
  6. )
  7. // TG 用户名正则模式:
  8. // 标准: @username 或 t.me/username
  9. // 变体: t点me/xxx, t . me/xxx, t.me/xxx, tg:xxx, telegram.me/xxx
  10. var (
  11. reTgAt = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  12. reTgLink = regexp.MustCompile(`(?:https?://)?t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  13. reTgDot = regexp.MustCompile(`t[点.·\s]*me[/\s]*([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  14. reTgColon = regexp.MustCompile(`(?i)tg[::\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  15. reEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
  16. rePhone = regexp.MustCompile(`\+?[0-9]{7,15}`)
  17. reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`)
  18. reWeChat = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[::\s]*([a-zA-Z0-9_\-]{5,20})`)
  19. reHTMLTag = regexp.MustCompile(`<[^>]+>`)
  20. )
  21. // Extract 从文本提取联系方式(正则,快速)
  22. // 优先级:先提取 t.me 链接(更精确),再提取 @用户名,避免重复
  23. func Extract(text string) *ContactInfo {
  24. info := &ContactInfo{}
  25. // 1. 优先提取标准 t.me 链接
  26. if m := reTgLink.FindStringSubmatch(text); m != nil {
  27. info.TgUsername = m[1]
  28. info.TgLink = "t.me/" + m[1]
  29. }
  30. // 2. 若无 t.me 链接,尝试中文变体 (t点me / t.me)
  31. if info.TgUsername == "" {
  32. if m := reTgDot.FindStringSubmatch(text); m != nil {
  33. info.TgUsername = m[1]
  34. info.TgLink = "t.me/" + m[1]
  35. }
  36. }
  37. // 3. 若无 t.me 变体,尝试 tg: 前缀
  38. if info.TgUsername == "" {
  39. if m := reTgColon.FindStringSubmatch(text); m != nil {
  40. info.TgUsername = m[1]
  41. info.TgLink = "t.me/" + m[1]
  42. }
  43. }
  44. // 4. 最后尝试 @用户名(避免与已提取用户名重复)
  45. if info.TgUsername == "" {
  46. if m := reTgAt.FindStringSubmatch(text); m != nil {
  47. info.TgUsername = m[1]
  48. info.TgLink = "t.me/" + m[1]
  49. }
  50. }
  51. // 5. 提取 Email(过滤掉 TG 用户名中的 @ 误匹配)
  52. if m := reEmail.FindString(text); m != "" {
  53. info.Email = m
  54. }
  55. // 6. 提取网站
  56. if m := reWebsite.FindString(text); m != "" {
  57. // 过滤掉 t.me 本身
  58. if !strings.Contains(strings.ToLower(m), "t.me/") && !strings.Contains(strings.ToLower(m), "telegram.me/") {
  59. info.Website = strings.TrimRight(m, ".,;)")
  60. }
  61. }
  62. // 7. 提取电话(过滤纯数字短于7位)
  63. if m := rePhone.FindString(text); m != "" {
  64. cleaned := strings.TrimPrefix(m, "+")
  65. if len(cleaned) >= 7 {
  66. info.Phone = m
  67. }
  68. }
  69. // 8. 提取微信
  70. if m := reWeChat.FindStringSubmatch(text); m != nil {
  71. info.WeChat = m[1]
  72. }
  73. info.HasContact = info.TgUsername != "" || info.Email != "" ||
  74. info.Website != "" || info.Phone != "" || info.WeChat != ""
  75. return info
  76. }
  77. // HasContact 快速判断文本是否含任何联系方式(无需完整提取)
  78. func HasContact(text string) bool {
  79. return reTgAt.MatchString(text) ||
  80. reTgLink.MatchString(text) ||
  81. reTgDot.MatchString(text) ||
  82. reTgColon.MatchString(text) ||
  83. reEmail.MatchString(text) ||
  84. reWebsite.MatchString(text) ||
  85. reWeChat.MatchString(text)
  86. }
  87. // ContainsChinese 检查文本是否包含中文
  88. // threshold: 中文字符占总字符的最低比例 (0-1),0表示只要有中文就返回true
  89. func ContainsChinese(text string, threshold float64) bool {
  90. if threshold <= 0 {
  91. for _, r := range text {
  92. if unicode.Is(unicode.Han, r) {
  93. return true
  94. }
  95. }
  96. return false
  97. }
  98. return ChineseRatio(text) >= threshold
  99. }
  100. // ChineseRatio 返回中文字符比例
  101. func ChineseRatio(text string) float64 {
  102. runes := []rune(text)
  103. if len(runes) == 0 {
  104. return 0
  105. }
  106. var count int
  107. for _, r := range runes {
  108. if unicode.Is(unicode.Han, r) {
  109. count++
  110. }
  111. }
  112. return float64(count) / float64(len(runes))
  113. }
  114. // CleanMerchantName 清洗商户名(去除HTML标签、多余空白)
  115. func CleanMerchantName(name string) string {
  116. name = reHTMLTag.ReplaceAllString(name, "")
  117. return strings.TrimSpace(name)
  118. }