【问题标题】:Inverse Association Rules逆关联规则
【发布时间】:2025-12-25 23:10:16
【问题描述】:

当您想弄清楚哪些事件一起发生时,关联规则是一种非常常见的技术(例如汉堡和面包大多一起销售)。在营销中,这种技术被用来找出免费的产品。

我正在寻找一种提取“替代产品”的技术,并像逆关联规则一样找出哪些事件不太可能一起发生。 Spark、R、Python 等中是否有任何算法或技术可用于此?

谢谢, 阿米尔

【问题讨论】:

    标签: associations rules inverse


    【解决方案1】:

    我已经使用Teng, Hsieh and Chen (2002) for R 为替代规则挖掘做了一个非常实用的实现。也许它可以帮助你:

    # Used packages:
    library(arules)
    
    
    SRM <- function(TransData, MinSup, MinConf, pMin, pChi, itemLabel, nTID){
    
    # Packages ----------------------------------------------------------------
    
    if (sum(search() %in% "package:arules") == 0) {
    stop("Please load package arules")
    }  
    
    # Checking Input data -----------------------------------------------------
     if (missing(TransData)) {
      stop("Transaction data is missing")
    }
    
    if (is.numeric(nTID) == FALSE) {
      stop("nTID has to be one numeric number for the count of      
     Transactions")
     }
    
      if (length(nTID) > 1) {
       stop("nTID has to be one number for the count of Transactions")
      }
    
      if (is.character(itemLabel) == FALSE) {
       stop("itemLabel has to be a character")
      }
      # Concrete Item sets  ---------------------------------------------------
    
      # adding complements to transaction data
      compl_trans <- addComplement(TransData,labels = itemLabel)
      compl_tab <- crossTable(compl_trans,"support")
      compl_tab_D <- as.data.frame(compl_tab)
      # ordering matrix
      compl_tab_D <-           compl_tab_D[order(rownames((compl_tab))),order(colnames((compl_tab)))]
    
    
      # Chi Value ---------------------------------------------------------------
    
    
      # empty data frame for loop
    
      complement_data <- data.frame(Chi = as.numeric(),
                               Sup_X.Y = as.numeric(),
                               X = as.character(),
                               Sup_X = as.numeric(),
                               Y = as.character(),
                               Sup_Y = as.numeric(),
                               CX = as.character(),
                               SupCX = as.numeric(),
                               CY = as.character(),
                               Sup_CY = as.numeric(),
                               Conf_X.CY = as.numeric(),
                               Sup_X.CY = as.numeric(),
                               Conf_Y.CX = as.numeric(),
                               SupY_CX = as.numeric())
    
    
    
      # first loop for one item
      for ( i in 1 : (length(itemLabel) - 1)) {
       # second loop combines it with all other items
       for (u in (i + 1) : length(itemLabel)) {
    
    
        # getting chi value from Teng
        a <-  itemLabel[i]
        b <-  itemLabel[u]
        ca <- paste0("!", itemLabel[i])
        cb <- paste0("!", itemLabel[u])
    
        chiValue <- nTID * (
         compl_tab[ca, cb] ^ 2 / (compl_tab[ca, ca] * compl_tab[cb, cb]) +
          compl_tab[ca, b] ^ 2 / (compl_tab[ca, ca] * compl_tab[b, b]) +
          compl_tab[a, cb] ^ 2 / (compl_tab[a, a] * compl_tab[cb, cb]) +
          compl_tab[a, b] ^ 2 / (compl_tab[a, a] * compl_tab[b, b]) - 1)
    
    
    
        # condition to be dependent
        if (compl_tab[a, b] > compl_tab[a, a] * compl_tab[b, b] &&      chiValue >= qchisq(pChi, 1) && 
            compl_tab[a, a] >= MinSup && compl_tab[b, b] >= MinSup ) {
    
    
    
         chi_sup <- data.frame(Chi = chiValue,
                          Sup_X.Y = compl_tab[a, b],
                          X = a,
                          Sup_X = compl_tab[a, a],
                          Y = b,
                          Sup_Y = compl_tab[b, b],
                          CX = ca,
                          SupCX = compl_tab[ca, ca],
                          CY = cb,
                          Sup_CY = compl_tab[cb, cb],
                          Conf_X.CY = compl_tab[a, cb] / compl_tab[a, a],
                          Sup_X.CY = compl_tab[a, cb],
                          Conf_Y.CX = compl_tab[ca, b] / compl_tab[b, b],
                          SupY_CX = compl_tab[ca, b])
    
    
         try(complement_data <- rbind(complement_data, chi_sup))
    
        }
    
    
       }
      }
      if (nrow(complement_data) == 0) {
       stop("No complement item sets could have been found")
      }
    
    
      #  changing mode of 
      complement_data$X <- as.character(complement_data$X)
      complement_data$Y <- as.character(complement_data$Y)
    
    
      # calculating support for concrete itemsets with all others and their complements -------------------
    
    
      ## with complements
      matrix_trans <- as.data.frame(as(compl_trans, "matrix"))
    
      sup_three <- data.frame(Items = as.character(),
                         Support = as.numeric()) 
    
    
      setCompl <- names(matrix_trans)
      # 1. extracts all other values than that are not in the itemset
      for (i in 1 : nrow(complement_data)) {
       value <- setCompl[ !setCompl %in% c(complement_data$X[i], 
                                      complement_data$Y[i], 
                                      paste0("!", complement_data$X[i]), 
                                      paste0("!",complement_data$Y[i]))]
    
    
       # 2. calculation of support
       for (u in value) {
        count <- sum(rowSums(matrix_trans[, c(complement_data$X[i],      complement_data$Y[i], u )]) == 3)
        sup <- count / nTID  
        sup_three_items <- data.frame(Items =      paste0(complement_data$X[i], complement_data$Y[i], u),
                                 Support=sup) 
        sup_three <- rbind(sup_three, sup_three_items)
       }
    
      }
    
      # Correlation of single items-------------------------------------------------------------
    
    
      # all items of concrete itemsets should be mixed for correlation
      combis <- unique(c(complement_data$X, complement_data$Y))
    
      # empty object
      rules<- data.frame(
       Substitute = as.character(),
       Product = as.character(),
       Support = as.numeric(),
       Confidence = as.numeric(),
       Correlation = as.numeric())
    
      # first loop for one item
      for (i in 1 : (length(combis) - 1)) {
       # second loop combines it with all other items
       for (u in (i + 1) : length(combis)) {
    
        first <- combis[i]
        second <- combis[u]
    
        corXY <- (compl_tab[first, second] - (compl_tab[first, first] *      compl_tab[second, second])) /
    (sqrt((compl_tab[first, first] * (1 - compl_tab[first,first])) *
           (compl_tab[second, second] * (1 - compl_tab[second, second]))))
    
    
        # confidence
        conf1 <- compl_tab[first, paste0("!", second)] / compl_tab[first, first]
        conf2 <- compl_tab[second, paste0("!", first)] / compl_tab[second, second]
    
        two_rules <- data.frame(
         Substitute = c(paste("{", first, "}"), 
                   paste("{", second, "}")),
         Product = c(paste("=>", "{", second, "}"),
                paste("=>", "{", first, "}")),
         Support = c(compl_tab[first, paste0("!", second)], compl_tab[second, paste0("!", first)]),
         Confidence = c(conf1, conf2),
         Correlation = c(corXY, corXY)
        )
    
        # conditions
        try({
         if (two_rules$Correlation[1] < pMin) {
          if (two_rules$Support[1] >= MinSup && two_rules$Confidence[1] >= MinConf) {
           rules <- rbind(rules, two_rules[1, ])
     }
          if (two_rules$Support[2] >= MinSup && two_rules$Confidence[2] >= MinConf) {
           rules <- rbind(rules, two_rules[2, ])
          }
    
         } })
    
       }
      }
    
    
      # Correlation of concrete item pairs with single items --------------------
      # adding variable for loop
      complement_data$XY <- paste0(complement_data$X, complement_data$Y)
    
      # combination of items
      for (i in 1 : nrow(complement_data)){
    
       # set of combinations from dependent items with single items
       univector <- c(as.vector(unique(complement_data$X)),      as.vector(unique(complement_data$Y)))
       univector <- univector[!univector %in% c(complement_data$X[i], complement_data$Y[i])]
    
       combis <- c(complement_data[i,"XY"], univector)
    
    
    
       for (u in 2 : length(combis)) {
        corXYZ <-(sup_three[sup_three$Items == paste0(combis[1], combis[u]),2] - 
                   complement_data[complement_data$XY == combis[1],"Sup_X.Y"] *
              compl_tab[combis[u],combis[u]]) /
    (sqrt((complement_data[complement_data$XY == combis[1],"Sup_X.Y"] * 
             (1 - complement_data[complement_data$XY == combis[1],"Sup_X.Y"]) *
            compl_tab[combis[u],combis[u]] * (1 - compl_tab[combis[u],combis[u]]))))
    
        dataXYZ <- data.frame(
    Substitute = paste("{", combis[1], "}"), 
    Product = paste("=>", "{", combis[u], "}"),
    Support = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2],
    Confidence = sup_three[sup_three$Items == paste0(combis[1], "!", combis[u]),2] /
     complement_data[complement_data$XY == combis[1],"Sup_X.Y"],
    Correlation = corXYZ)
    
    
        # conditions
        if (dataXYZ$Correlation < pMin && dataXYZ$Support >= MinSup && dataXYZ$Confidence >= MinConf) {
    
         try(rules <- rbind(rules, dataXYZ))
        }
       }
      }
      if (nrow(rules) == 0) {
       message("Sorry no rules could have been calculated. Maybe change input conditions.")
      }      else {
       return(rules)
      }
    
      # end
     }
    

    我认为我的博客中有更好的解释: http://mattimeyer.github.io/2016-12-21-Substitution-Rule-Mining/

    【讨论】: