package extractor import ( "regexp" "strings" "unicode" ) // TG 用户名正则模式: // 标准: @username 或 t.me/username // 变体: t点me/xxx, t . me/xxx, t.me/xxx, tg:xxx, telegram.me/xxx var ( reTgAt = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgLink = regexp.MustCompile(`(?:https?://)?t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgDot = regexp.MustCompile(`t[点.·\s]*me[/\s]*([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgColon = regexp.MustCompile(`(?i)tg[::\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`) reEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`) rePhone = regexp.MustCompile(`\+?[0-9]{7,15}`) reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`) reWeChat = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[::\s]*([a-zA-Z0-9_\-]{5,20})`) reHTMLTag = regexp.MustCompile(`<[^>]+>`) ) // Extract 从文本提取联系方式(正则,快速) // 优先级:先提取 t.me 链接(更精确),再提取 @用户名,避免重复 func Extract(text string) *ContactInfo { info := &ContactInfo{} // 1. 优先提取标准 t.me 链接 if m := reTgLink.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } // 2. 若无 t.me 链接,尝试中文变体 (t点me / t.me) if info.TgUsername == "" { if m := reTgDot.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } // 3. 若无 t.me 变体,尝试 tg: 前缀 if info.TgUsername == "" { if m := reTgColon.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } // 4. 最后尝试 @用户名(避免与已提取用户名重复) if info.TgUsername == "" { if m := reTgAt.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } // 5. 提取 Email(过滤掉 TG 用户名中的 @ 误匹配) if m := reEmail.FindString(text); m != "" { info.Email = m } // 6. 提取网站 if m := reWebsite.FindString(text); m != "" { // 过滤掉 t.me 本身 if !strings.Contains(strings.ToLower(m), "t.me/") && !strings.Contains(strings.ToLower(m), "telegram.me/") { info.Website = strings.TrimRight(m, ".,;)") } } // 7. 提取电话(过滤纯数字短于7位) if m := rePhone.FindString(text); m != "" { cleaned := strings.TrimPrefix(m, "+") if len(cleaned) >= 7 { info.Phone = m } } // 8. 提取微信 if m := reWeChat.FindStringSubmatch(text); m != nil { info.WeChat = m[1] } info.HasContact = info.TgUsername != "" || info.Email != "" || info.Website != "" || info.Phone != "" || info.WeChat != "" return info } // HasContact 快速判断文本是否含任何联系方式(无需完整提取) func HasContact(text string) bool { return reTgAt.MatchString(text) || reTgLink.MatchString(text) || reTgDot.MatchString(text) || reTgColon.MatchString(text) || reEmail.MatchString(text) || reWebsite.MatchString(text) || reWeChat.MatchString(text) } // ContainsChinese 检查文本是否包含中文 // threshold: 中文字符占总字符的最低比例 (0-1),0表示只要有中文就返回true func ContainsChinese(text string, threshold float64) bool { if threshold <= 0 { for _, r := range text { if unicode.Is(unicode.Han, r) { return true } } return false } return ChineseRatio(text) >= threshold } // ChineseRatio 返回中文字符比例 func ChineseRatio(text string) float64 { runes := []rune(text) if len(runes) == 0 { return 0 } var count int for _, r := range runes { if unicode.Is(unicode.Han, r) { count++ } } return float64(count) / float64(len(runes)) } // CleanMerchantName 清洗商户名(去除HTML标签、多余空白) func CleanMerchantName(name string) string { name = reHTMLTag.ReplaceAllString(name, "") return strings.TrimSpace(name) }