```go
package plugin

import (
	"context"
	"time"
)

// MerchantData is the standard output format for all collector plugins.
type MerchantData struct {
	TgUsername   string `json:"tg_username"`
	TgLink       string `json:"tg_link"`
	MerchantName string `json:"merchant_name"`
	Website      string `json:"website"`
	Email        string `json:"email"`
	Phone        string `json:"phone"`
	SourceType   string `json:"source_type"`
	SourceName   string `json:"source_name"`
	SourceURL    string `json:"source_url"`
	OriginalText string `json:"original_text"`
	IndustryTag  string `json:"industry_tag"`

	// GroupUsername is set when this merchant was found inside a TG group/channel.
	// Used to build group-member relationships.
	GroupUsername string `json:"group_username,omitempty"`
	GroupTitle    string `json:"group_title,omitempty"`
}

// TaskLogger records detailed per-operation logs within a task.
// Every important node is logged with full content for auditability.
type TaskLogger interface {
	// LogSearchResult records each individual serper search result.
	// query: the search query, position: result index (1-based)
	// title, link, snippet: raw serper fields
	LogSearchResult(query string, position int, title, link, snippet string)

	// LogCrawlPage records a page fetch attempt with content summary.
	// parentURL: which page led here (empty for top-level serper results)
	// depth: 0=serper result page, 1=link from depth-0, 2=sub-link, etc.
	// htmlSummary: first N chars of HTML body for audit
	// tgLinks: t.me links found in href attributes
	LogCrawlPage(url, parentURL string, depth int, htmlSummary string, tgLinks []string, allLinksCount int, err error, dur time.Duration)

	// LogSnippetExtract records extraction from a snippet/title text.
	// rawText: the full snippet+title text that was analyzed
	// extracted: what was found (usernames, websites, etc.)
	LogSnippetExtract(sourceURL, rawText string, extracted []string)

	// LogPageExtract records extraction from a crawled page body.
	// contentSample: representative text from the page
	// extracted: what was found
	LogPageExtract(pageURL, parentURL string, depth int, contentSample string, extracted []string)

	// LogMerchantFound records a merchant being produced.
	// All fields are stored for full audit trail.
	LogMerchantFound(data MerchantData, sourceAction string, depth int, parentURL string)

	// LogCleanStep records a cleaning pipeline decision for a single merchant.
	// step: tmechecker / blacklist / dedup / tagger
	// decision: alive/dead, passed/blocked, keeper/duplicate, Hot/Warm/Cold
	LogCleanStep(tgUsername, step, decision, reason string)

	// LogSkip records a skipped URL or item with the reason.
	LogSkip(action, url, reason string)

	// LogError records an error at any stage.
	LogError(action, url, errMsg string)
}

// nopLogger is a no-op logger for when no logger is set.
type nopLogger struct{}

func (nopLogger) LogSearchResult(string, int, string, string, string)                           {}
func (nopLogger) LogCrawlPage(string, string, int, string, []string, int, error, time.Duration) {}
func (nopLogger) LogSnippetExtract(string, string, []string)                                    {}
func (nopLogger) LogPageExtract(string, string, int, string, []string)                          {}
func (nopLogger) LogMerchantFound(MerchantData, string, int, string)                            {}
func (nopLogger) LogCleanStep(string, string, string, string)                                   {}
func (nopLogger) LogSkip(string, string, string)                                                {}
func (nopLogger) LogError(string, string, string)                                               {}

// NopLogger returns a no-op logger.
func NopLogger() TaskLogger { return nopLogger{} }

// Collector is the interface every collection plugin must implement.
type Collector interface {
	Name() string
	Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error
	Stop() error
	SetLogger(logger TaskLogger)
}
```
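To make the contract concrete, here is a minimal sketch of a plugin written against these interfaces. It is illustrative only: the `echo` package, the `echoCollector` type, the import path, and the hard-coded merchant values are assumptions, not part of the package above.

```go
package echo

import (
	"context"

	"example.com/yourapp/plugin" // assumed import path for the plugin package above
)

// echoCollector is a hypothetical plugin that emits a single hard-coded merchant.
type echoCollector struct {
	logger plugin.TaskLogger
}

// New defaults to the no-op logger so the plugin never has to nil-check before logging.
func New() plugin.Collector { return &echoCollector{logger: plugin.NopLogger()} }

func (c *echoCollector) Name() string { return "echo" }

func (c *echoCollector) SetLogger(l plugin.TaskLogger) {
	if l != nil {
		c.logger = l
	}
}

func (c *echoCollector) Run(ctx context.Context, cfg map[string]any, callback func(plugin.MerchantData)) error {
	// Respect cancellation before doing any work.
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	data := plugin.MerchantData{
		TgUsername: "example_merchant",
		TgLink:     "https://t.me/example_merchant",
		SourceType: "demo",
		SourceName: c.Name(),
	}

	// Log the find for the audit trail, then hand it to the host via the callback.
	c.logger.LogMerchantFound(data, "echo", 0, "")
	callback(data)
	return nil
}

func (c *echoCollector) Stop() error { return nil }
```

The host side would typically call SetLogger before Run and consume results through the callback; that flow is implied by the Collector interface but not shown here.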