|
@@ -293,17 +293,37 @@ func (c *Collector) scrapeChannel(ctx context.Context, username string, msgLimit
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message, channelUsername string, callback func(plugin.MerchantData)) {
|
|
func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message, channelUsername string, callback func(plugin.MerchantData)) {
|
|
|
|
|
+ channelLower := strings.ToLower(channelUsername)
|
|
|
|
|
+
|
|
|
for _, msg := range msgs {
|
|
for _, msg := range msgs {
|
|
|
- if msg.IsService || msg.Text == "" {
|
|
|
|
|
|
|
+ if msg.IsService {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // ── Path 1: sender-based (group chats) ──
|
|
|
|
|
+ // If the message has a sender username, record that person directly.
|
|
|
|
|
+ // This is the correct way to collect users who post in a chat group.
|
|
|
|
|
+ if msg.SenderUsername != "" && strings.ToLower(msg.SenderUsername) != channelLower {
|
|
|
|
|
+ md := plugin.MerchantData{
|
|
|
|
|
+ TgUsername: msg.SenderUsername,
|
|
|
|
|
+ TgLink: "https://t.me/" + msg.SenderUsername,
|
|
|
|
|
+ SourceType: "tg_group",
|
|
|
|
|
+ SourceName: channelUsername,
|
|
|
|
|
+ SourceURL: "https://t.me/" + channelUsername,
|
|
|
|
|
+ OriginalText: msg.Text,
|
|
|
|
|
+ GroupUsername: channelUsername,
|
|
|
|
|
+ }
|
|
|
|
|
+ c.logger.LogMerchantFound(md, "tg_sender", 0, "tg://"+channelUsername)
|
|
|
|
|
+ callback(md)
|
|
|
continue
|
|
continue
|
|
|
}
|
|
}
|
|
|
- // Relaxed: allow messages with any contact info even without Chinese
|
|
|
|
|
- // Many merchants post in English or mixed language
|
|
|
|
|
- if !extractor.HasContact(msg.Text) {
|
|
|
|
|
|
|
+
|
|
|
|
|
+ // ── Path 2: text-based extraction (broadcast channels) ──
|
|
|
|
|
+ // Reached when the message has no sender username, or the sender is the channel itself (broadcast channel posts); skipped unless the text is non-empty and contains contact info.
|
|
|
|
|
+ if msg.Text == "" || !extractor.HasContact(msg.Text) {
|
|
|
continue
|
|
continue
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Regex first
|
|
|
|
|
info := extractor.Extract(msg.Text)
|
|
info := extractor.Extract(msg.Text)
|
|
|
merchantName := ""
|
|
merchantName := ""
|
|
|
industry := ""
|
|
industry := ""
|
|
@@ -327,22 +347,23 @@ func (c *Collector) processMessages(ctx context.Context, msgs []telegram.Message
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- if info.TgUsername == "" {
|
|
|
|
|
|
|
+ // Skip if still no username, or if it's the channel itself (self-referencing link)
|
|
|
|
|
+ if info.TgUsername == "" || strings.ToLower(info.TgUsername) == channelLower {
|
|
|
continue
|
|
continue
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
md := plugin.MerchantData{
|
|
md := plugin.MerchantData{
|
|
|
TgUsername: info.TgUsername,
|
|
TgUsername: info.TgUsername,
|
|
|
- TgLink: "https://t.me/" + info.TgUsername,
|
|
|
|
|
- MerchantName: merchantName,
|
|
|
|
|
- Website: info.Website,
|
|
|
|
|
- Email: info.Email,
|
|
|
|
|
- Phone: info.Phone,
|
|
|
|
|
- SourceType: "tg_channel",
|
|
|
|
|
- SourceName: channelUsername,
|
|
|
|
|
- SourceURL: "https://t.me/" + channelUsername,
|
|
|
|
|
- OriginalText: msg.Text,
|
|
|
|
|
- IndustryTag: industry,
|
|
|
|
|
|
|
+ TgLink: "https://t.me/" + info.TgUsername,
|
|
|
|
|
+ MerchantName: merchantName,
|
|
|
|
|
+ Website: info.Website,
|
|
|
|
|
+ Email: info.Email,
|
|
|
|
|
+ Phone: info.Phone,
|
|
|
|
|
+ SourceType: "tg_channel",
|
|
|
|
|
+ SourceName: channelUsername,
|
|
|
|
|
+ SourceURL: "https://t.me/" + channelUsername,
|
|
|
|
|
+ OriginalText: msg.Text,
|
|
|
|
|
+ IndustryTag: industry,
|
|
|
GroupUsername: channelUsername,
|
|
GroupUsername: channelUsername,
|
|
|
}
|
|
}
|
|
|
c.logger.LogMerchantFound(md, "tg_message_extract", 0, "tg://"+channelUsername)
|
|
c.logger.LogMerchantFound(md, "tg_message_extract", 0, "tg://"+channelUsername)
|