```go
package plugin

import (
	"context"
	"time"
)

// MerchantData is the standard output format for all collector plugins.
type MerchantData struct {
	TgUsername   string `json:"tg_username"`
	TgLink       string `json:"tg_link"`
	MerchantName string `json:"merchant_name"`
	Website      string `json:"website"`
	Email        string `json:"email"`
	Phone        string `json:"phone"`
	SourceType   string `json:"source_type"`
	SourceName   string `json:"source_name"`
	SourceURL    string `json:"source_url"`
	OriginalText string `json:"original_text"`
	IndustryTag  string `json:"industry_tag"`

	// GroupUsername is set when this merchant was found inside a TG group/channel.
	// Used to build group-member relationships.
	GroupUsername string `json:"group_username,omitempty"`
	GroupTitle    string `json:"group_title,omitempty"`
}

// TaskLogger records detailed per-operation logs within a task.
// Every important node is logged with full content for auditability.
type TaskLogger interface {
	// LogSearchResult records each individual serper search result.
	// query: the search query, position: result index (1-based)
	// title, link, snippet: raw serper fields
	LogSearchResult(query string, position int, title, link, snippet string)

	// LogCrawlPage records a page fetch attempt with content summary.
	// parentURL: which page led here (empty for top-level serper results)
	// depth: 0=serper result page, 1=link from depth-0, 2=sub-link, etc.
	// htmlSummary: first N chars of HTML body for audit
	// tgLinks: t.me links found in href attributes
	LogCrawlPage(url, parentURL string, depth int, htmlSummary string, tgLinks []string, allLinksCount int, err error, dur time.Duration)

	// LogSnippetExtract records extraction from a snippet/title text.
	// rawText: the full snippet+title text that was analyzed
	// extracted: what was found (usernames, websites, etc.)
	LogSnippetExtract(sourceURL, rawText string, extracted []string)

	// LogPageExtract records extraction from a crawled page body.
	// contentSample: representative text from the page
	// extracted: what was found
	LogPageExtract(pageURL, parentURL string, depth int, contentSample string, extracted []string)

	// LogMerchantFound records a merchant being produced.
	// All fields are stored for full audit trail.
	LogMerchantFound(data MerchantData, sourceAction string, depth int, parentURL string)

	// LogCleanStep records a cleaning pipeline decision for a single merchant.
	// step: tmechecker / blacklist / dedup / tagger
	// decision: alive/dead, passed/blocked, keeper/duplicate, Hot/Warm/Cold
	LogCleanStep(tgUsername, step, decision, reason string)

	// LogSkip records a skipped URL or item with the reason.
	LogSkip(action, url, reason string)

	// LogError records an error at any stage.
	LogError(action, url, errMsg string)
}

// nopLogger is a no-op logger for when no logger is set.
type nopLogger struct{}

func (nopLogger) LogSearchResult(string, int, string, string, string)                           {}
func (nopLogger) LogCrawlPage(string, string, int, string, []string, int, error, time.Duration) {}
func (nopLogger) LogSnippetExtract(string, string, []string)                                    {}
func (nopLogger) LogPageExtract(string, string, int, string, []string)                          {}
func (nopLogger) LogMerchantFound(MerchantData, string, int, string)                            {}
func (nopLogger) LogCleanStep(string, string, string, string)                                   {}
func (nopLogger) LogSkip(string, string, string)                                                {}
func (nopLogger) LogError(string, string, string)                                               {}

// NopLogger returns a no-op logger.
func NopLogger() TaskLogger { return nopLogger{} }

// Collector is the interface every collection plugin must implement.
type Collector interface {
	Name() string
	Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error
	Stop() error
	SetLogger(logger TaskLogger)
}
```
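To make the contract concrete, here is a minimal sketch of a plugin written against these interfaces. It is illustrative only: the `echo` package, the `echoCollector` type, the import path, and the hard-coded merchant values are assumptions, not part of the package above.

```go
package echo

import (
	"context"

	"example.com/yourapp/plugin" // assumed import path for the plugin package above
)

// echoCollector is a hypothetical plugin that emits a single hard-coded merchant.
type echoCollector struct {
	logger plugin.TaskLogger
}

// New defaults to the no-op logger so the plugin never has to nil-check before logging.
func New() plugin.Collector { return &echoCollector{logger: plugin.NopLogger()} }

func (c *echoCollector) Name() string { return "echo" }

func (c *echoCollector) SetLogger(l plugin.TaskLogger) {
	if l != nil {
		c.logger = l
	}
}

func (c *echoCollector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
	// Respect cancellation before doing any work.
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	data := plugin.MerchantData{
		TgUsername: "example_merchant",
		TgLink:     "https://t.me/example_merchant",
		SourceType: "demo",
		SourceName: c.Name(),
	}

	// Log the find for the audit trail, then hand it to the host via the callback.
	c.logger.LogMerchantFound(data, "echo", 0, "")
	callback(data)
	return nil
}

func (c *echoCollector) Stop() error { return nil }
```

The host side would typically call SetLogger before Run and consume results through the callback; that flow is implied by the Collector interface but not shown here.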