package extractor import ( "regexp" "strings" "unicode" ) // TG 用户名正则模式: // 标准: @username 或 t.me/username // 变体: t点me/xxx, t . me/xxx, t.me/xxx, tg:xxx, telegram.me/xxx var ( reTgAt = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgLink = regexp.MustCompile(`(?:https?://)?t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgDot = regexp.MustCompile(`t[点.·\s]*me[/\s]*([a-zA-Z][a-zA-Z0-9_]{4,31})`) reTgColon = regexp.MustCompile(`(?i)tg[::\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`) reEmail = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`) rePhone = regexp.MustCompile(`\+?[0-9]{7,15}`) reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`) reWeChat = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[::\s]*([a-zA-Z0-9_\-]{5,20})`) reHTMLTag = regexp.MustCompile(`<[^>]+>`) ) // Extract 从文本提取联系方式(正则,快速)— 只返回第一个TG用户名 // 优先级:先提取 t.me 链接(更精确),再提取 @用户名,避免重复 func Extract(text string) *ContactInfo { info := &ContactInfo{} // 1. 优先提取标准 t.me 链接 if m := reTgLink.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } // 2. 若无 t.me 链接,尝试中文变体 (t点me / t.me) if info.TgUsername == "" { if m := reTgDot.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } // 3. 若无 t.me 变体,尝试 tg: 前缀 if info.TgUsername == "" { if m := reTgColon.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } // 4. 最后尝试 @用户名(避免与已提取用户名重复) if info.TgUsername == "" { if m := reTgAt.FindStringSubmatch(text); m != nil { info.TgUsername = m[1] info.TgLink = "t.me/" + m[1] } } extractContactFields(text, info) return info } // commonFalsePositives are words that regex matches as TG usernames but aren't. var commonFalsePositives = map[string]bool{ // TG system bots "telegram": true, "telegramhints": true, "botfather": true, "spambot": true, // HTML/CSS/JS keywords that regex can match "github": true, "gmail": true, "email": true, "admin": true, "login": true, "signup": true, "about": true, "contact": true, "support": true, "https": true, "style": true, "script": true, "header": true, "footer": true, "button": true, "input": true, "image": true, "video": true, "media": true, "share": true, "click": true, "undefined": true, "object": true, "string": true, "number": true, "function": true, "return": true, "const": true, "class": true, "export": true, "import": true, "ssage": true, "messages": true, "channel": true, "username": true, // CSS properties/values commonly found in HTML pages "context": true, "graph": true, "supports": true, "keyframes": true, "container": true, "ffmpeg": true, "original": true, "wrapped": true, "newrelic": true, "viewport": true, "charset": true, "content": true, "inherit": true, "initial": true, "normal": true, "center": true, "inline": true, "block": true, "fixed": true, "static": true, "relative": true, "absolute": true, "hidden": true, "visible": true, "transparent": true, "important": true, "default": true, // Common web tech terms "jquery": true, "webpack": true, "babel": true, "eslint": true, "prettier": true, "typescript": true, "javascript": true, "angular": true, "vuejs": true, "nextjs": true, "nuxtjs": true, "nodejs": true, "express": true, "python": true, "django": true, "docker": true, "nginx": true, "redis": true, "mysql": true, "mongodb": true, "postgresql": true, "firebase": true, "google": true, "apple": true, "microsoft": true, "amazon": true, "twitter": true, "facebook": true, "instagram": true, "tiktok": true, "linkedin": true, "pinterest": true, "reddit": true, "discord": true, } func isValidTgUsername(username string) bool { lower := strings.ToLower(username) if commonFalsePositives[lower] { return false } // Reject if ends with "bot" (except very short ones which might be real names) if strings.HasSuffix(lower, "bot") && len(lower) > 5 { return false } return true } // ExtractAll 从文本提取所有不同的TG用户名及联系方式 // 用于导航站/聚合页面,一个页面可能包含多个商户 func ExtractAll(text string) []*ContactInfo { seen := map[string]bool{} var results []*ContactInfo // Collect all usernames from all regex patterns addUsername := func(username string) { if username == "" || seen[strings.ToLower(username)] { return } if !isValidTgUsername(username) { return } seen[strings.ToLower(username)] = true info := &ContactInfo{ TgUsername: username, TgLink: "t.me/" + username, } results = append(results, info) } // t.me links (highest priority, most accurate) for _, m := range reTgLink.FindAllStringSubmatch(text, -1) { addUsername(m[1]) } // Chinese variants: t点me, t.me for _, m := range reTgDot.FindAllStringSubmatch(text, -1) { addUsername(m[1]) } // tg: prefix for _, m := range reTgColon.FindAllStringSubmatch(text, -1) { addUsername(m[1]) } // @username for _, m := range reTgAt.FindAllStringSubmatch(text, -1) { addUsername(m[1]) } // If no results, return nil if len(results) == 0 { return nil } // Attach shared contact fields to the first result extractContactFields(text, results[0]) return results } // extractContactFields fills in email, website, phone, wechat fields. func extractContactFields(text string, info *ContactInfo) { // 提取 Email(过滤掉 TG 用户名中的 @ 误匹配) if m := reEmail.FindString(text); m != "" { info.Email = m } // 提取网站 if m := reWebsite.FindString(text); m != "" { // 过滤掉 t.me 本身 if !strings.Contains(strings.ToLower(m), "t.me/") && !strings.Contains(strings.ToLower(m), "telegram.me/") { info.Website = strings.TrimRight(m, ".,;)") } } // 提取电话(过滤纯数字短于7位) if m := rePhone.FindString(text); m != "" { cleaned := strings.TrimPrefix(m, "+") if len(cleaned) >= 7 { info.Phone = m } } // 提取微信 if m := reWeChat.FindStringSubmatch(text); m != nil { info.WeChat = m[1] } info.HasContact = info.TgUsername != "" || info.Email != "" || info.Website != "" || info.Phone != "" || info.WeChat != "" } // HasContact 快速判断文本是否含任何联系方式(无需完整提取) func HasContact(text string) bool { return reTgAt.MatchString(text) || reTgLink.MatchString(text) || reTgDot.MatchString(text) || reTgColon.MatchString(text) || reEmail.MatchString(text) || reWebsite.MatchString(text) || reWeChat.MatchString(text) } // ContainsChinese 检查文本是否包含中文 // threshold: 中文字符占总字符的最低比例 (0-1),0表示只要有中文就返回true func ContainsChinese(text string, threshold float64) bool { if threshold <= 0 { for _, r := range text { if unicode.Is(unicode.Han, r) { return true } } return false } return ChineseRatio(text) >= threshold } // ChineseRatio 返回中文字符比例 func ChineseRatio(text string) float64 { runes := []rune(text) if len(runes) == 0 { return 0 } var count int for _, r := range runes { if unicode.Is(unicode.Han, r) { count++ } } return float64(count) / float64(len(runes)) } // CleanMerchantName 清洗商户名(去除HTML标签、多余空白) func CleanMerchantName(name string) string { name = reHTMLTag.ReplaceAllString(name, "") return strings.TrimSpace(name) }