// Taio/spider

							package extractor

import (
	"regexp"
	"strings"
	"unicode"
)

// Telegram username patterns.
// Standard forms: @username or t.me/username
// Variants: t点me/xxx, t . me/xxx, t．me/xxx, tg:xxx, telegram.me/xxx
//
// In every Telegram pattern the captured username (submatch 1) is one ASCII
// letter followed by 4-31 letters, digits or underscores.
//
//   - reTgAt:    "@" directly followed by a username.
//   - reTgLink:  optional "http://" or "https://", then "t.me/" or
//     "telegram.me/", then a username. Matching is case-sensitive.
//   - reTgDot:   "t", any number (including zero) of 点, ．, · or whitespace,
//     "me", any number of "/" or whitespace, then a username.
//   - reTgColon: case-insensitive "tg" followed by at least one ":", "：" or
//     whitespace character, then a username.
//   - reEmail:   local part, "@", domain, "." and a TLD of two or more letters.
//   - rePhone:   optional "+" followed by 7-15 digits.
//   - reWebsite: "http://" or "https://" followed by characters that are not
//     whitespace, "<", ">", quotes, or code points in U+4E00..U+9FA5.
//   - reWeChat:  case-insensitive keyword (微信, vx, wx, 加v, weixin), optional
//     ":", "：" or whitespace, then an ID of 5-20 letters, digits, "_" or "-"
//     (submatch 1).
//   - reHTMLTag: "<", one or more characters other than ">", then ">".

var (
	reTgAt    = regexp.MustCompile(`@([a-zA-Z][a-zA-Z0-9_]{4,31})`)
	reTgLink  = regexp.MustCompile(`(?:https?://)?t(?:elegram)?\.me/([a-zA-Z][a-zA-Z0-9_]{4,31})`)
	reTgDot   = regexp.MustCompile(`t[点．·\s]*me[/\s]*([a-zA-Z][a-zA-Z0-9_]{4,31})`)
	reTgColon = regexp.MustCompile(`(?i)tg[:：\s]+([a-zA-Z][a-zA-Z0-9_]{4,31})`)
	reEmail   = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
	rePhone   = regexp.MustCompile(`\+?[0-9]{7,15}`)
	reWebsite = regexp.MustCompile(`https?://[^\s<>"'` + "\u4e00-\u9fa5" + `]+`)
	reWeChat  = regexp.MustCompile(`(?i)(?:微信|vx|wx|加v|加V|weixin)[:：\s]*([a-zA-Z0-9_\-]{5,20})`)

	reHTMLTag = regexp.MustCompile(`<[^>]+>`)
)

// Extract pulls contact details out of text with the package regexes and
// reports at most one Telegram username.
//
// The username comes from the first pattern that matches, tried in this order:
// t.me / telegram.me link, obfuscated "t点me"-style variant, "tg:" prefix, and
// finally a bare "@username". TgLink is always written in the normalized
// "t.me/<username>" form. The remaining fields are filled in by
// extractContactFields.
//
// Unlike ExtractAll, the username is not checked with isValidTgUsername.
// The returned pointer is never nil.
func Extract(text string) *ContactInfo {
	// Fall through the patterns from most to least specific; the first hit wins.
	match := reTgLink.FindStringSubmatch(text)
	if match == nil {
		match = reTgDot.FindStringSubmatch(text)
	}
	if match == nil {
		match = reTgColon.FindStringSubmatch(text)
	}
	if match == nil {
		match = reTgAt.FindStringSubmatch(text)
	}

	result := new(ContactInfo)
	if match != nil {
		result.TgUsername = match[1]
		result.TgLink = "t.me/" + match[1]
	}

	extractContactFields(text, result)
	return result
}

// commonFalsePositives is the set of lower-case words that the username
// patterns can capture but that should not be treated as Telegram usernames.
// Every listed word maps to true; any other key yields false.
var commonFalsePositives = func() map[string]bool {
	words := []string{
		// TG system bots
		"telegram", "telegramhints", "botfather", "spambot",
		// HTML/CSS/JS keywords that regex can match
		"github", "gmail", "email", "admin",
		"login", "signup", "about", "contact",
		"support", "https", "style", "script",
		"header", "footer", "button", "input",
		"image", "video", "media", "share",
		"click", "undefined", "object", "string",
		"number", "function", "return", "const",
		"class", "export", "import", "ssage",
		"messages", "channel", "username",
		// CSS properties/values commonly found in HTML pages
		"context", "graph", "supports", "keyframes",
		"container", "ffmpeg", "original", "wrapped",
		"newrelic", "viewport", "charset", "content",
		"inherit", "initial", "normal", "center",
		"inline", "block", "fixed", "static",
		"relative", "absolute", "hidden", "visible",
		"transparent", "important", "default",
		// Common web tech terms
		"jquery", "webpack", "babel", "eslint",
		"prettier", "typescript", "javascript",
		"angular", "vuejs", "nextjs", "nuxtjs",
		"nodejs", "express", "python", "django",
		"docker", "nginx", "redis", "mysql",
		"mongodb", "postgresql", "firebase",
		"google", "apple", "microsoft", "amazon",
		"twitter", "facebook", "instagram", "tiktok",
		"linkedin", "pinterest", "reddit", "discord",
	}

	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()

// isValidTgUsername reports whether username should be kept as a Telegram
// username. The comparison is done on the lower-cased name, and a name is
// rejected when either
//   - it is listed in commonFalsePositives, or
//   - it is longer than five characters and ends in "bot".
func isValidTgUsername(username string) bool {
	name := strings.ToLower(username)
	blocked := commonFalsePositives[name]
	// Names of five characters or fewer are kept even when they end in "bot".
	botLike := len(name) > 5 && strings.HasSuffix(name, "bot")
	return !blocked && !botLike
}

// ExtractAll returns one ContactInfo per distinct Telegram username found in
// text. It is meant for directory / aggregator pages where a single page may
// list several merchants.
//
// Usernames are collected in pattern priority order (t.me / telegram.me links,
// obfuscated "t点me"-style variants, "tg:" prefixes, then "@username"),
// de-duplicated case-insensitively, and filtered through isValidTgUsername.
// The first spelling encountered is the one that is kept.
//
// Every result has TgUsername, TgLink ("t.me/<username>") and HasContact set.
// The page-level fields (email, website, phone, WeChat) cannot be attributed
// to a particular username, so they are attached to the first result only.
//
// It returns nil when no acceptable username is found.
func ExtractAll(text string) []*ContactInfo {
	seen := map[string]bool{}
	var results []*ContactInfo

	addUsername := func(username string) {
		key := strings.ToLower(username)
		if username == "" || seen[key] || !isValidTgUsername(username) {
			return
		}
		seen[key] = true
		results = append(results, &ContactInfo{
			TgUsername: username,
			TgLink:     "t.me/" + username,
			// A Telegram username is itself a contact, so flag every result,
			// not only the one later passed to extractContactFields.
			HasContact: true,
		})
	}

	// Most accurate pattern first, so its spelling wins de-duplication.
	for _, re := range []*regexp.Regexp{reTgLink, reTgDot, reTgColon, reTgAt} {
		for _, m := range re.FindAllStringSubmatch(text, -1) {
			addUsername(m[1])
		}
	}

	if len(results) == 0 {
		return nil
	}

	// Attach the shared, page-level contact fields to the first result.
	extractContactFields(text, results[0])

	return results
}

// extractContactFields fills info.Email, info.Website, info.Phone and
// info.WeChat from text and then recomputes info.HasContact.
//
// A field is only overwritten when a match is found, so values already present
// on info are otherwise kept. HasContact becomes true when any of TgUsername,
// Email, Website, Phone or WeChat is non-empty.
func extractContactFields(text string, info *ContactInfo) {
	// Email: first address-shaped match in the text.
	if email := reEmail.FindString(text); email != "" {
		info.Email = email
	}

	// Website: first URL that does not point at Telegram itself. All matches
	// are examined so that a leading t.me link does not hide a real website
	// that appears later in the text.
	for _, raw := range reWebsite.FindAllString(text, -1) {
		// Drop sentence punctuation that the pattern swallowed with the URL.
		site := strings.TrimRight(raw, ".,;)")
		if isTelegramURL(site) {
			continue
		}
		info.Website = site
		break
	}

	// Phone: rePhone already guarantees at least 7 digits, so no further
	// length check is needed.
	if phone := rePhone.FindString(text); phone != "" {
		info.Phone = phone
	}

	// WeChat: the ID is the first capture group.
	if m := reWeChat.FindStringSubmatch(text); m != nil {
		info.WeChat = m[1]
	}

	info.HasContact = info.TgUsername != "" || info.Email != "" ||
		info.Website != "" || info.Phone != "" || info.WeChat != ""
}

// isTelegramURL reports whether rawURL's host is t.me or telegram.me
// (optionally prefixed with "www."). The host is compared as a whole, so
// unrelated hosts that merely end in "t.me", such as about.me, are not treated
// as Telegram links.
func isTelegramURL(rawURL string) bool {
	host := strings.ToLower(rawURL)
	if i := strings.Index(host, "://"); i >= 0 {
		host = host[i+len("://"):]
	}
	// Cut at the first path, query, fragment or port delimiter.
	if i := strings.IndexAny(host, "/?#:"); i >= 0 {
		host = host[:i]
	}
	host = strings.TrimPrefix(host, "www.")
	return host == "t.me" || host == "telegram.me"
}

// HasContact reports whether text matches any of the Telegram, email, website
// or WeChat patterns, without building a ContactInfo.
//
// NOTE(review): rePhone is not consulted here, so text whose only contact is a
// phone number yields false — confirm whether that is intentional.
func HasContact(text string) bool {
	// Cases are evaluated in order and stop at the first match.
	switch {
	case reTgAt.MatchString(text),
		reTgLink.MatchString(text),
		reTgDot.MatchString(text),
		reTgColon.MatchString(text),
		reEmail.MatchString(text),
		reWebsite.MatchString(text),
		reWeChat.MatchString(text):
		return true
	default:
		return false
	}
}

// ContainsChinese reports whether text contains Chinese (Han) characters.
//
// threshold is the minimum share of Han characters among all characters
// (0-1). When threshold is zero or negative, a single Han character anywhere
// in text is enough; otherwise the result is ChineseRatio(text) >= threshold.
func ContainsChinese(text string, threshold float64) bool {
	if threshold <= 0 {
		isHan := func(r rune) bool { return unicode.Is(unicode.Han, r) }
		return strings.IndexFunc(text, isHan) >= 0
	}
	return ChineseRatio(text) >= threshold
}

// ChineseRatio returns the fraction of characters in text that are Han
// characters, counted per rune. It returns 0 for empty text.
func ChineseRatio(text string) float64 {
	total, han := 0, 0
	for _, r := range text {
		total++
		if unicode.Is(unicode.Han, r) {
			han++
		}
	}
	if total == 0 {
		return 0
	}
	return float64(han) / float64(total)
}

// CleanMerchantName strips HTML tags from name and trims leading and trailing
// whitespace. Whitespace inside the name is left untouched.
func CleanMerchantName(name string) string {
	return strings.TrimSpace(reHTMLTag.ReplaceAllString(name, ""))
}