regex.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. package extractor
  2. import (
  3. "regexp"
  4. "strings"
  5. "unicode"
  6. )
  7. // TG 用户名正则模式:
  8. // 标准: @username 或 t.me/username
  9. // 变体: t点me/xxx, t . me/xxx, t.me/xxx, tg:xxx, telegram.me/xxx
  10. var (
  11. reTgAt = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  12. reTgLink = regexp.MustCompile(`(?:https?://)?t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  13. reTgDot = regexp.MustCompile(`t[点.·\s]*me[/\s]*([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  14. reTgColon = regexp.MustCompile(`(?i)tg[::\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`)
  15. reEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
  16. rePhone = regexp.MustCompile(`\+?[0-9]{7,15}`)
  17. reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`)
  18. reWeChat = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[::\s]*([a-zA-Z0-9_\-]{5,20})`)
  19. reHTMLTag = regexp.MustCompile(`<[^>]+>`)
  20. )
  21. // Extract 从文本提取联系方式(正则,快速)— 只返回第一个TG用户名
  22. // 优先级:先提取 t.me 链接(更精确),再提取 @用户名,避免重复
  23. func Extract(text string) *ContactInfo {
  24. info := &ContactInfo{}
  25. // 1. 优先提取标准 t.me 链接
  26. if m := reTgLink.FindStringSubmatch(text); m != nil {
  27. info.TgUsername = m[1]
  28. info.TgLink = "t.me/" + m[1]
  29. }
  30. // 2. 若无 t.me 链接,尝试中文变体 (t点me / t.me)
  31. if info.TgUsername == "" {
  32. if m := reTgDot.FindStringSubmatch(text); m != nil {
  33. info.TgUsername = m[1]
  34. info.TgLink = "t.me/" + m[1]
  35. }
  36. }
  37. // 3. 若无 t.me 变体,尝试 tg: 前缀
  38. if info.TgUsername == "" {
  39. if m := reTgColon.FindStringSubmatch(text); m != nil {
  40. info.TgUsername = m[1]
  41. info.TgLink = "t.me/" + m[1]
  42. }
  43. }
  44. // 4. 最后尝试 @用户名(避免与已提取用户名重复)
  45. if info.TgUsername == "" {
  46. if m := reTgAt.FindStringSubmatch(text); m != nil {
  47. info.TgUsername = m[1]
  48. info.TgLink = "t.me/" + m[1]
  49. }
  50. }
  51. extractContactFields(text, info)
  52. return info
  53. }
  54. // commonFalsePositives are words that regex matches as TG usernames but aren't.
  55. var commonFalsePositives = map[string]bool{
  56. // TG system bots
  57. "telegram": true, "telegramhints": true, "botfather": true, "spambot": true,
  58. // HTML/CSS/JS keywords that regex can match
  59. "github": true, "gmail": true, "email": true, "admin": true,
  60. "login": true, "signup": true, "about": true, "contact": true,
  61. "support": true, "https": true, "style": true, "script": true,
  62. "header": true, "footer": true, "button": true, "input": true,
  63. "image": true, "video": true, "media": true, "share": true,
  64. "click": true, "undefined": true, "object": true, "string": true,
  65. "number": true, "function": true, "return": true, "const": true,
  66. "class": true, "export": true, "import": true, "ssage": true,
  67. "messages": true, "channel": true, "username": true,
  68. // CSS properties/values commonly found in HTML pages
  69. "context": true, "graph": true, "supports": true, "keyframes": true,
  70. "container": true, "ffmpeg": true, "original": true, "wrapped": true,
  71. "newrelic": true, "viewport": true, "charset": true, "content": true,
  72. "inherit": true, "initial": true, "normal": true, "center": true,
  73. "inline": true, "block": true, "fixed": true, "static": true,
  74. "relative": true, "absolute": true, "hidden": true, "visible": true,
  75. "transparent": true, "important": true, "default": true,
  76. // Common web tech terms
  77. "jquery": true, "webpack": true, "babel": true, "eslint": true,
  78. "prettier": true, "typescript": true, "javascript": true,
  79. "angular": true, "vuejs": true, "nextjs": true, "nuxtjs": true,
  80. "nodejs": true, "express": true, "python": true, "django": true,
  81. "docker": true, "nginx": true, "redis": true, "mysql": true,
  82. "mongodb": true, "postgresql": true, "firebase": true,
  83. "google": true, "apple": true, "microsoft": true, "amazon": true,
  84. "twitter": true, "facebook": true, "instagram": true, "tiktok": true,
  85. "linkedin": true, "pinterest": true, "reddit": true, "discord": true,
  86. }
  87. func isValidTgUsername(username string) bool {
  88. lower := strings.ToLower(username)
  89. if commonFalsePositives[lower] {
  90. return false
  91. }
  92. // Reject if ends with "bot" (except very short ones which might be real names)
  93. if strings.HasSuffix(lower, "bot") && len(lower) > 5 {
  94. return false
  95. }
  96. return true
  97. }
  98. // ExtractAll 从文本提取所有不同的TG用户名及联系方式
  99. // 用于导航站/聚合页面,一个页面可能包含多个商户
  100. func ExtractAll(text string) []*ContactInfo {
  101. seen := map[string]bool{}
  102. var results []*ContactInfo
  103. // Collect all usernames from all regex patterns
  104. addUsername := func(username string) {
  105. if username == "" || seen[strings.ToLower(username)] {
  106. return
  107. }
  108. if !isValidTgUsername(username) {
  109. return
  110. }
  111. seen[strings.ToLower(username)] = true
  112. info := &ContactInfo{
  113. TgUsername: username,
  114. TgLink: "t.me/" + username,
  115. }
  116. results = append(results, info)
  117. }
  118. // t.me links (highest priority, most accurate)
  119. for _, m := range reTgLink.FindAllStringSubmatch(text, -1) {
  120. addUsername(m[1])
  121. }
  122. // Chinese variants: t点me, t.me
  123. for _, m := range reTgDot.FindAllStringSubmatch(text, -1) {
  124. addUsername(m[1])
  125. }
  126. // tg: prefix
  127. for _, m := range reTgColon.FindAllStringSubmatch(text, -1) {
  128. addUsername(m[1])
  129. }
  130. // @username
  131. for _, m := range reTgAt.FindAllStringSubmatch(text, -1) {
  132. addUsername(m[1])
  133. }
  134. // If no results, return nil
  135. if len(results) == 0 {
  136. return nil
  137. }
  138. // Attach shared contact fields to the first result
  139. extractContactFields(text, results[0])
  140. return results
  141. }
  142. // extractContactFields fills in email, website, phone, wechat fields.
  143. func extractContactFields(text string, info *ContactInfo) {
  144. // 提取 Email(过滤掉 TG 用户名中的 @ 误匹配)
  145. if m := reEmail.FindString(text); m != "" {
  146. info.Email = m
  147. }
  148. // 提取网站
  149. if m := reWebsite.FindString(text); m != "" {
  150. // 过滤掉 t.me 本身
  151. if !strings.Contains(strings.ToLower(m), "t.me/") && !strings.Contains(strings.ToLower(m), "telegram.me/") {
  152. info.Website = strings.TrimRight(m, ".,;)")
  153. }
  154. }
  155. // 提取电话(过滤纯数字短于7位)
  156. if m := rePhone.FindString(text); m != "" {
  157. cleaned := strings.TrimPrefix(m, "+")
  158. if len(cleaned) >= 7 {
  159. info.Phone = m
  160. }
  161. }
  162. // 提取微信
  163. if m := reWeChat.FindStringSubmatch(text); m != nil {
  164. info.WeChat = m[1]
  165. }
  166. info.HasContact = info.TgUsername != "" || info.Email != "" ||
  167. info.Website != "" || info.Phone != "" || info.WeChat != ""
  168. }
  169. // HasContact 快速判断文本是否含任何联系方式(无需完整提取)
  170. func HasContact(text string) bool {
  171. return reTgAt.MatchString(text) ||
  172. reTgLink.MatchString(text) ||
  173. reTgDot.MatchString(text) ||
  174. reTgColon.MatchString(text) ||
  175. reEmail.MatchString(text) ||
  176. reWebsite.MatchString(text) ||
  177. reWeChat.MatchString(text)
  178. }
  179. // ContainsChinese 检查文本是否包含中文
  180. // threshold: 中文字符占总字符的最低比例 (0-1),0表示只要有中文就返回true
  181. func ContainsChinese(text string, threshold float64) bool {
  182. if threshold <= 0 {
  183. for _, r := range text {
  184. if unicode.Is(unicode.Han, r) {
  185. return true
  186. }
  187. }
  188. return false
  189. }
  190. return ChineseRatio(text) >= threshold
  191. }
  192. // ChineseRatio 返回中文字符比例
  193. func ChineseRatio(text string) float64 {
  194. runes := []rune(text)
  195. if len(runes) == 0 {
  196. return 0
  197. }
  198. var count int
  199. for _, r := range runes {
  200. if unicode.Is(unicode.Han, r) {
  201. count++
  202. }
  203. }
  204. return float64(count) / float64(len(runes))
  205. }
  206. // CleanMerchantName 清洗商户名(去除HTML标签、多余空白)
  207. func CleanMerchantName(name string) string {
  208. name = reHTMLTag.ReplaceAllString(name, "")
  209. return strings.TrimSpace(name)
  210. }