interface.go 3.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. package plugin
  2. import (
  3. "context"
  4. "time"
  5. )
  6. // MerchantData is the standard output format for all collector plugins.
  7. type MerchantData struct {
  8. TgUsername string `json:"tg_username"`
  9. TgLink string `json:"tg_link"`
  10. MerchantName string `json:"merchant_name"`
  11. Website string `json:"website"`
  12. Email string `json:"email"`
  13. Phone string `json:"phone"`
  14. SourceType string `json:"source_type"`
  15. SourceName string `json:"source_name"`
  16. SourceURL string `json:"source_url"`
  17. OriginalText string `json:"original_text"`
  18. IndustryTag string `json:"industry_tag"`
  19. // GroupUsername is set when this merchant was found inside a TG group/channel.
  20. // Used to build group-member relationships.
  21. GroupUsername string `json:"group_username,omitempty"`
  22. GroupTitle string `json:"group_title,omitempty"`
  23. }
  24. // TaskLogger records detailed per-operation logs within a task.
  25. // Every important node is logged with full content for auditability.
  26. type TaskLogger interface {
  27. // LogSearchResult records each individual serper search result.
  28. // query: the search query, position: result index (1-based)
  29. // title, link, snippet: raw serper fields
  30. LogSearchResult(query string, position int, title, link, snippet string)
  31. // LogCrawlPage records a page fetch attempt with content summary.
  32. // parentURL: which page led here (empty for top-level serper results)
  33. // depth: 0=serper result page, 1=link from depth-0, 2=sub-link, etc.
  34. // htmlSummary: first N chars of HTML body for audit
  35. // tgLinks: t.me links found in href attributes
  36. LogCrawlPage(url, parentURL string, depth int, htmlSummary string, tgLinks []string, allLinksCount int, err error, dur time.Duration)
  37. // LogSnippetExtract records extraction from a snippet/title text.
  38. // rawText: the full snippet+title text that was analyzed
  39. // extracted: what was found (usernames, websites, etc.)
  40. LogSnippetExtract(sourceURL, rawText string, extracted []string)
  41. // LogPageExtract records extraction from a crawled page body.
  42. // contentSample: representative text from the page
  43. // extracted: what was found
  44. LogPageExtract(pageURL, parentURL string, depth int, contentSample string, extracted []string)
  45. // LogMerchantFound records a merchant being produced.
  46. // All fields are stored for full audit trail.
  47. LogMerchantFound(data MerchantData, sourceAction string, depth int, parentURL string)
  48. // LogCleanStep records a cleaning pipeline decision for a single merchant.
  49. // step: tmechecker / blacklist / dedup / tagger
  50. // decision: alive/dead, passed/blocked, keeper/duplicate, Hot/Warm/Cold
  51. LogCleanStep(tgUsername, step, decision, reason string)
  52. // LogSkip records a skipped URL or item with the reason.
  53. LogSkip(action, url, reason string)
  54. // LogError records an error at any stage.
  55. LogError(action, url, errMsg string)
  56. }
  57. // nopLogger is a no-op logger for when no logger is set.
  58. type nopLogger struct{}
  59. func (nopLogger) LogSearchResult(string, int, string, string, string) {}
  60. func (nopLogger) LogCrawlPage(string, string, int, string, []string, int, error, time.Duration) {}
  61. func (nopLogger) LogSnippetExtract(string, string, []string) {}
  62. func (nopLogger) LogPageExtract(string, string, int, string, []string) {}
  63. func (nopLogger) LogMerchantFound(MerchantData, string, int, string) {}
  64. func (nopLogger) LogCleanStep(string, string, string, string) {}
  65. func (nopLogger) LogSkip(string, string, string) {}
  66. func (nopLogger) LogError(string, string, string) {}
  67. // NopLogger returns a no-op logger.
  68. func NopLogger() TaskLogger { return nopLogger{} }
  69. // Collector is the interface every collection plugin must implement.
  70. type Collector interface {
  71. Name() string
  72. Run(ctx context.Context, cfg map[string]any, callback func(MerchantData)) error
  73. Stop() error
  74. SetLogger(logger TaskLogger)
  75. }