package plugin import ( "context" "time" ) // MerchantData is the standard output format for all collector plugins. type MerchantData struct { TgUsername string `json:"tg_username"` TgLink string `json:"tg_link"` MerchantName string `json:"merchant_name"` Website string `json:"website"` Email string `json:"email"` Phone string `json:"phone"` SourceType string `json:"source_type"` SourceName string `json:"source_name"` SourceURL string `json:"source_url"` OriginalText string `json:"original_text"` IndustryTag string `json:"industry_tag"` // GroupUsername is set when this merchant was found inside a TG group/channel. // Used to build group-member relationships. GroupUsername string `json:"group_username,omitempty"` GroupTitle string `json:"group_title,omitempty"` } // TaskLogger records detailed per-operation logs within a task. // Every important node is logged with full content for auditability. type TaskLogger interface { // LogSearchResult records each individual serper search result. // query: the search query, position: result index (1-based) // title, link, snippet: raw serper fields LogSearchResult(query string, position int, title, link, snippet string) // LogCrawlPage records a page fetch attempt with content summary. // parentURL: which page led here (empty for top-level serper results) // depth: 0=serper result page, 1=link from depth-0, 2=sub-link, etc. // htmlSummary: first N chars of HTML body for audit // tgLinks: t.me links found in href attributes LogCrawlPage(url, parentURL string, depth int, htmlSummary string, tgLinks []string, allLinksCount int, err error, dur time.Duration) // LogSnippetExtract records extraction from a snippet/title text. // rawText: the full snippet+title text that was analyzed // extracted: what was found (usernames, websites, etc.) LogSnippetExtract(sourceURL, rawText string, extracted []string) // LogPageExtract records extraction from a crawled page body. // contentSample: representative text from the page // extracted: what was found LogPageExtract(pageURL, parentURL string, depth int, contentSample string, extracted []string) // LogMerchantFound records a merchant being produced. // All fields are stored for full audit trail. LogMerchantFound(data MerchantData, sourceAction string, depth int, parentURL string) // LogCleanStep records a cleaning pipeline decision for a single merchant. // step: tmechecker / blacklist / dedup / tagger // decision: alive/dead, passed/blocked, keeper/duplicate, Hot/Warm/Cold LogCleanStep(tgUsername, step, decision, reason string) // LogSkip records a skipped URL or item with the reason. LogSkip(action, url, reason string) // LogError records an error at any stage. LogError(action, url, errMsg string) } // nopLogger is a no-op logger for when no logger is set. type nopLogger struct{} func (nopLogger) LogSearchResult(string, int, string, string, string) {} func (nopLogger) LogCrawlPage(string, string, int, string, []string, int, error, time.Duration) {} func (nopLogger) LogSnippetExtract(string, string, []string) {} func (nopLogger) LogPageExtract(string, string, int, string, []string) {} func (nopLogger) LogMerchantFound(MerchantData, string, int, string) {} func (nopLogger) LogCleanStep(string, string, string, string) {} func (nopLogger) LogSkip(string, string, string) {} func (nopLogger) LogError(string, string, string) {} // NopLogger returns a no-op logger. func NopLogger() TaskLogger { return nopLogger{} } // Collector is the interface every collection plugin must implement. type Collector interface { Name() string Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error Stop() error SetLogger(logger TaskLogger) }