| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- package processor
- import (
- "encoding/json"
- "fmt"
- "spider/internal/model"
- )
// SourceInfo records one place where a merchant was discovered. It is the
// element type of the source list that MarshalSources encodes as JSON.
type SourceInfo struct {
	SourceType string `json:"source_type"` // copied from the raw record's SourceType
	SourceName string `json:"source_name"` // copied from the raw record's SourceName
	SourceURL  string `json:"source_url"`  // copied from the raw record's SourceURL
}
- // DedupResult holds keepers and duplicates after deduplication.
- type DedupResult struct {
- Keepers []MergedMerchant
- Duplicates []model.MerchantRaw
- }
- // MergedMerchant is the best raw record plus merged source info.
- type MergedMerchant struct {
- Best model.MerchantRaw
- AllSources []SourceInfo
- SourceCount int
- }
- // Deduplicate groups by tg_username, keeps the richest record, merges sources.
- func Deduplicate(raws []model.MerchantRaw) DedupResult {
- groups := map[string][]model.MerchantRaw{}
- for _, raw := range raws {
- key := raw.TgUsername
- if key == "" {
- key = raw.Website
- }
- if key == "" {
- key = raw.Email
- }
- if key == "" {
- key = fmt.Sprintf("_id_%d", raw.ID)
- }
- groups[key] = append(groups[key], raw)
- }
- var result DedupResult
- for _, group := range groups {
- // Collect all sources
- var sources []SourceInfo
- for _, r := range group {
- sources = append(sources, SourceInfo{
- SourceType: r.SourceType,
- SourceName: r.SourceName,
- SourceURL: r.SourceURL,
- })
- }
- if len(group) == 1 {
- result.Keepers = append(result.Keepers, MergedMerchant{
- Best: group[0],
- AllSources: sources,
- SourceCount: 1,
- })
- continue
- }
- // Find richest record
- bestIdx := 0
- bestScore := richness(group[0])
- for i := 1; i < len(group); i++ {
- s := richness(group[i])
- if s > bestScore {
- bestIdx = i
- bestScore = s
- }
- }
- // Mark all others as duplicate
- for i, r := range group {
- if i != bestIdx {
- result.Duplicates = append(result.Duplicates, r)
- }
- }
- result.Keepers = append(result.Keepers, MergedMerchant{
- Best: group[bestIdx],
- AllSources: sources,
- SourceCount: len(group),
- })
- }
- return result
- }
- func richness(r model.MerchantRaw) int {
- score := 0
- if r.TgUsername != "" {
- score++
- }
- if r.Website != "" {
- score++
- }
- if r.Email != "" {
- score++
- }
- if r.Phone != "" {
- score++
- }
- if r.MerchantName != "" {
- score++
- }
- return score
- }
- // MarshalSources converts source list to JSON bytes.
- func MarshalSources(sources []SourceInfo) []byte {
- b, _ := json.Marshal(sources)
- return b
- }
|