package processor

import (
	"encoding/json"
	"fmt"

	"spider/internal/model"
)

// SourceInfo describes where a merchant was discovered. Its fields are copied
// from the SourceType, SourceName and SourceURL fields of a raw record.
type SourceInfo struct {
	SourceType string `json:"source_type"`
	SourceName string `json:"source_name"`
	SourceURL  string `json:"source_url"`
}

// DedupResult holds keepers and duplicates after deduplication.
type DedupResult struct {
	// Keepers contains one merged entry per group of records sharing a key.
	Keepers []MergedMerchant
	// Duplicates contains every record that lost to a richer record in its group.
	Duplicates []model.MerchantRaw
}

// MergedMerchant is the best raw record plus merged source info.
type MergedMerchant struct {
	// Best is the record of the group with the highest richness score.
	Best model.MerchantRaw
	// AllSources lists the source of every record in the group, in input order.
	AllSources []SourceInfo
	// SourceCount is the number of records in the group.
	SourceCount int
}

// Deduplicate groups records by a key, keeps the richest record of each group
// and merges the source info of all group members.
//
// The grouping key is the first non-empty value among TgUsername, Website and
// Email; a record with none of them gets a key derived from its ID.
//
// Within a group the record with the highest richness score is kept; on a tie
// the record that appears first in raws wins. All other records of the group
// are reported in DedupResult.Duplicates.
//
// The output is deterministic: Keepers are ordered by the first appearance of
// their key in raws, and Duplicates follow the same group order, preserving
// input order inside each group.
func Deduplicate(raws []model.MerchantRaw) DedupResult {
	groups := make(map[string][]model.MerchantRaw, len(raws))
	// order remembers keys in first-seen order. Ranging over the map directly
	// would yield a different Keepers/Duplicates order on every run, because Go
	// randomizes map iteration.
	order := make([]string, 0, len(raws))

	for i := range raws {
		raw := raws[i]

		var key string
		switch {
		case raw.TgUsername != "":
			key = raw.TgUsername
		case raw.Website != "":
			key = raw.Website
		case raw.Email != "":
			key = raw.Email
		default:
			key = fmt.Sprintf("_id_%d", raw.ID)
		}

		if _, seen := groups[key]; !seen {
			order = append(order, key)
		}
		groups[key] = append(groups[key], raw)
	}

	var result DedupResult
	for _, key := range order {
		group := groups[key]

		// Collect the source of every member and find the richest record in a
		// single pass. The strict '>' keeps the earliest record on ties.
		sources := make([]SourceInfo, 0, len(group))
		bestIdx := 0
		bestScore := -1
		for i := range group {
			sources = append(sources, SourceInfo{
				SourceType: group[i].SourceType,
				SourceName: group[i].SourceName,
				SourceURL:  group[i].SourceURL,
			})
			if s := richness(group[i]); s > bestScore {
				bestIdx = i
				bestScore = s
			}
		}

		// Everything except the winner is a duplicate. For a single-record
		// group this loop appends nothing.
		for i := range group {
			if i != bestIdx {
				result.Duplicates = append(result.Duplicates, group[i])
			}
		}

		result.Keepers = append(result.Keepers, MergedMerchant{
			Best:        group[bestIdx],
			AllSources:  sources,
			SourceCount: len(group),
		})
	}
	return result
}

// richness scores a record by counting how many of its contact/identity
// fields (TgUsername, Website, Email, Phone, MerchantName) are non-empty.
// The result is in the range 0..5.
func richness(r model.MerchantRaw) int {
	score := 0
	if r.TgUsername != "" {
		score++
	}
	if r.Website != "" {
		score++
	}
	if r.Email != "" {
		score++
	}
	if r.Phone != "" {
		score++
	}
	if r.MerchantName != "" {
		score++
	}
	return score
}

// MarshalSources converts source list to JSON bytes.
//
// A nil slice encodes as the JSON literal null, while an empty non-nil slice
// encodes as [].
func MarshalSources(sources []SourceInfo) []byte {
	// The error is deliberately ignored: SourceInfo consists solely of string
	// fields, which encoding/json can always encode.
	b, _ := json.Marshal(sources)
	return b
}