// File: dedup.go (2.3 KB)

  1. package processor
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "spider/internal/model"
  6. )
  7. // SourceInfo describes where a merchant was discovered.
  8. type SourceInfo struct {
  9. SourceType string `json:"source_type"`
  10. SourceName string `json:"source_name"`
  11. SourceURL string `json:"source_url"`
  12. }
  13. // DedupResult holds keepers and duplicates after deduplication.
  14. type DedupResult struct {
  15. Keepers []MergedMerchant
  16. Duplicates []model.MerchantRaw
  17. }
  18. // MergedMerchant is the best raw record plus merged source info.
  19. type MergedMerchant struct {
  20. Best model.MerchantRaw
  21. AllSources []SourceInfo
  22. SourceCount int
  23. }
  24. // Deduplicate groups by tg_username, keeps the richest record, merges sources.
  25. func Deduplicate(raws []model.MerchantRaw) DedupResult {
  26. groups := map[string][]model.MerchantRaw{}
  27. for _, raw := range raws {
  28. key := raw.TgUsername
  29. if key == "" {
  30. key = raw.Website
  31. }
  32. if key == "" {
  33. key = raw.Email
  34. }
  35. if key == "" {
  36. key = fmt.Sprintf("_id_%d", raw.ID)
  37. }
  38. groups[key] = append(groups[key], raw)
  39. }
  40. var result DedupResult
  41. for _, group := range groups {
  42. // Collect all sources
  43. var sources []SourceInfo
  44. for _, r := range group {
  45. sources = append(sources, SourceInfo{
  46. SourceType: r.SourceType,
  47. SourceName: r.SourceName,
  48. SourceURL: r.SourceURL,
  49. })
  50. }
  51. if len(group) == 1 {
  52. result.Keepers = append(result.Keepers, MergedMerchant{
  53. Best: group[0],
  54. AllSources: sources,
  55. SourceCount: 1,
  56. })
  57. continue
  58. }
  59. // Find richest record
  60. bestIdx := 0
  61. bestScore := richness(group[0])
  62. for i := 1; i < len(group); i++ {
  63. s := richness(group[i])
  64. if s > bestScore {
  65. bestIdx = i
  66. bestScore = s
  67. }
  68. }
  69. // Mark all others as duplicate
  70. for i, r := range group {
  71. if i != bestIdx {
  72. result.Duplicates = append(result.Duplicates, r)
  73. }
  74. }
  75. result.Keepers = append(result.Keepers, MergedMerchant{
  76. Best: group[bestIdx],
  77. AllSources: sources,
  78. SourceCount: len(group),
  79. })
  80. }
  81. return result
  82. }
  83. func richness(r model.MerchantRaw) int {
  84. score := 0
  85. if r.TgUsername != "" {
  86. score++
  87. }
  88. if r.Website != "" {
  89. score++
  90. }
  91. if r.Email != "" {
  92. score++
  93. }
  94. if r.Phone != "" {
  95. score++
  96. }
  97. if r.MerchantName != "" {
  98. score++
  99. }
  100. return score
  101. }
  102. // MarshalSources converts source list to JSON bytes.
  103. func MarshalSources(sources []SourceInfo) []byte {
  104. b, _ := json.Marshal(sources)
  105. return b
  106. }