| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- package extractor
- import (
- "regexp"
- "strings"
- "unicode"
- )
// Telegram handle patterns.
//
// Standard forms: @username or t.me/username.
// Obfuscated forms: t点me/xxx, t . me/xxx, tg:xxx, telegram.me/xxx.
//
// Every handle capture group is a letter followed by 4-31 letters, digits or
// underscores.
var (
	// reTgAt matches a bare "@username" mention. It has no way to tell a
	// mention from the domain part of an email address; callers must filter.
	reTgAt = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`)

	// reTgLink matches t.me/<user> and telegram.me/<user>, with an optional
	// http(s) scheme. The \b keeps hosts that merely end in "t.me"
	// (about.me, chat.me, ...) from being read as Telegram links.
	reTgLink = regexp.MustCompile(`(?:https?://)?\bt(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)

	// reTgDot matches obfuscated spellings of t.me such as "t点me/xxx" or
	// "t . me xxx". The leading \b and the mandatory separator after "me"
	// stop ordinary English ("contact me anytime", "don't message") from
	// being parsed as a handle.
	reTgDot = regexp.MustCompile(`\bt[点.·\s]*me[/\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`)

	// reTgColon matches a case-insensitive "tg" prefix followed by an ASCII
	// colon, a full-width colon (U+FF1A) or whitespace, then the handle.
	reTgColon = regexp.MustCompile(`(?i)tg[:\x{FF1A}\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`)

	// reEmail matches a conventional local@domain.tld address.
	reEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)

	// rePhone matches an optional "+" followed by 7 to 15 digits.
	rePhone = regexp.MustCompile(`\+?[0-9]{7,15}`)

	// reWebsite matches an http(s) URL, stopping at whitespace, angle
	// brackets, quotes, or a character in the U+4E00-U+9FA5 range.
	reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`)

	// reWeChat matches a WeChat keyword (微信, vx, wx, 加v, weixin), an
	// optional ASCII or full-width colon / whitespace, then a 5-20 character ID.
	reWeChat = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[:\x{FF1A}\s]*([a-zA-Z0-9_\-]{5,20})`)

	// reHTMLTag matches a single HTML/XML tag.
	reHTMLTag = regexp.MustCompile(`<[^>]+>`)
)
- // Extract 从文本提取联系方式(正则,快速)
- // 优先级:先提取 t.me 链接(更精确),再提取 @用户名,避免重复
- func Extract(text string) *ContactInfo {
- info := &ContactInfo{}
- // 1. 优先提取标准 t.me 链接
- if m := reTgLink.FindStringSubmatch(text); m != nil {
- info.TgUsername = m[1]
- info.TgLink = "t.me/" + m[1]
- }
- // 2. 若无 t.me 链接,尝试中文变体 (t点me / t.me)
- if info.TgUsername == "" {
- if m := reTgDot.FindStringSubmatch(text); m != nil {
- info.TgUsername = m[1]
- info.TgLink = "t.me/" + m[1]
- }
- }
- // 3. 若无 t.me 变体,尝试 tg: 前缀
- if info.TgUsername == "" {
- if m := reTgColon.FindStringSubmatch(text); m != nil {
- info.TgUsername = m[1]
- info.TgLink = "t.me/" + m[1]
- }
- }
- // 4. 最后尝试 @用户名(避免与已提取用户名重复)
- if info.TgUsername == "" {
- if m := reTgAt.FindStringSubmatch(text); m != nil {
- info.TgUsername = m[1]
- info.TgLink = "t.me/" + m[1]
- }
- }
- // 5. 提取 Email(过滤掉 TG 用户名中的 @ 误匹配)
- if m := reEmail.FindString(text); m != "" {
- info.Email = m
- }
- // 6. 提取网站
- if m := reWebsite.FindString(text); m != "" {
- // 过滤掉 t.me 本身
- if !strings.Contains(strings.ToLower(m), "t.me/") && !strings.Contains(strings.ToLower(m), "telegram.me/") {
- info.Website = strings.TrimRight(m, ".,;)")
- }
- }
- // 7. 提取电话(过滤纯数字短于7位)
- if m := rePhone.FindString(text); m != "" {
- cleaned := strings.TrimPrefix(m, "+")
- if len(cleaned) >= 7 {
- info.Phone = m
- }
- }
- // 8. 提取微信
- if m := reWeChat.FindStringSubmatch(text); m != nil {
- info.WeChat = m[1]
- }
- info.HasContact = info.TgUsername != "" || info.Email != "" ||
- info.Website != "" || info.Phone != "" || info.WeChat != ""
- return info
- }
- // HasContact 快速判断文本是否含任何联系方式(无需完整提取)
- func HasContact(text string) bool {
- return reTgAt.MatchString(text) ||
- reTgLink.MatchString(text) ||
- reTgDot.MatchString(text) ||
- reTgColon.MatchString(text) ||
- reEmail.MatchString(text) ||
- reWebsite.MatchString(text) ||
- reWeChat.MatchString(text)
- }
- // ContainsChinese 检查文本是否包含中文
- // threshold: 中文字符占总字符的最低比例 (0-1),0表示只要有中文就返回true
- func ContainsChinese(text string, threshold float64) bool {
- if threshold <= 0 {
- for _, r := range text {
- if unicode.Is(unicode.Han, r) {
- return true
- }
- }
- return false
- }
- return ChineseRatio(text) >= threshold
- }
// ChineseRatio returns the fraction of characters in text that are Han
// characters, as a value between 0 and 1. Empty text yields 0.
func ChineseRatio(text string) float64 {
	var total, han int
	// Ranging over the string visits one rune per iteration, so total ends up
	// as the rune count of text.
	for _, r := range text {
		total++
		if unicode.Is(unicode.Han, r) {
			han++
		}
	}
	if total == 0 {
		return 0
	}
	return float64(han) / float64(total)
}
- // CleanMerchantName 清洗商户名(去除HTML标签、多余空白)
- func CleanMerchantName(name string) string {
- name = reHTMLTag.ReplaceAllString(name, "")
- return strings.TrimSpace(name)
- }
|