数据挖掘算法之 Apriori

一、什么是Apriori算法？

Apriori算法是寻找所有支持度不小于minsup的项集。项集的支持度指的是包含该项集的事务占所有事务的比例。频繁项集指的是满足给定最下支持度的项集

Apriori算法是由Agrawal等人于1993提出的，它采用逐层搜索策略（层次搜索策略）产生所有的频繁项集。

Apriori性质：

1：若A是一个频繁项集，则A的每一个子集都是一个频繁项集。

2：如果一个项集不是频繁的，则它的所有超集也一定不是频繁的。例如如果{ac}不是频繁集、{abc}、{acd}也一定不是频繁集。

二、Apriori算法思想

Apriori算法的基本思路是采用层次搜索的迭代方法，由候选(k-1)-项集来寻找候选k-项集，并逐一判断产生的候选k-项集是否是频繁的。

设Ck是长度为k的候选项集的集合，Lk是长度为k的频繁项集的集合。为了简单，设最小支持度阈值min_sup为最小元组数，即采用最小支持度计数。

首先，找出频繁1-项集，用L1表示。

由L1寻找C2，由C2产生L2，即产生频繁2-项集的集合。

由L2寻找C3，由C3产生L3。

以此类推，直至没有新的频繁k-项集被发现。求每个Lk时都要对事务数据库D作一次完全扫描。

举例如下：在所示的事务数据库，设min_sup=2，产生所有频繁项集的过程如图5.1所示，最后L4=Ф，算法结束，产生的所有频繁项集为L1∪L2∪L3。

算法需要解决两个问题

①如何由Lk-1构建Ck （解决方案：自连接）

②如何由Ck产生Lk （解决方案：剪枝操作，从Ck中删除明显不是频繁项集的项集）

三、实现代码：

package dataMining;

import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

/**
 * @author xcy
 * @version 2016 10:30:40 AM
 */
public class Apriori {

    private double min_sup = 0.6;
    private double min_conf = 0.2;
    private int max_size = 40;
    private String[] transSet = { "abc", "abc", "acde", "bcdf", "abcd", "abcdf" };
    private int frequencyIndex;
 
//  re  是指为了在写到项目中输出方便
    private String re = "";

    private IdentityHashMap ruleMap = new IdentityHashMap();
    private int itemCounts = 0;
    private TreeSet[] frequencySet = new TreeSet[max_size];
    private TreeSet maxFrequency = new TreeSet();
    private TreeSet canditate = new TreeSet();
    private TreeSet[] canditateSet = new TreeSet[max_size];

    public Apriori() {
        maxFrequency = new TreeSet();
        canditate = new TreeSet();
        itemCounts = counts();
        for (int i = 0; i < transSet.length; i++) {
            frequencySet[i] = new TreeSet();
            canditateSet[i] = new TreeSet();
        }
        canditateSet[0] = canditate;
    }

    public Apriori(String[] tranSet, double min_sup, double min_conf) {
        this.transSet = tranSet;
        this.min_sup = min_sup;
        this.min_conf = min_conf;

        maxFrequency = new TreeSet();
        canditate = new TreeSet();
        itemCounts = counts();
        for (int i = 0; i < transSet.length; i++) {
            frequencySet[i] = new TreeSet();
            canditateSet[i] = new TreeSet();
        }
        canditateSet[0] = canditate;
    }

    public void gen_item1() {
        String temp1 = "";
        Iterator temp = canditateSet[0].iterator();
        while (temp.hasNext()) {
            temp1 = temp.next() + "";
            if (count_sup(temp1) >= min_sup)
                frequencySet[0].add(temp1);

        }
    }

    public void gen_canditatek(int k) {
        String t1 = "", t2 = "", t3 = "";
        char c1 = 'a', c2 = 'a';
        Iterator it1 = frequencySet[k - 2].iterator();
        Iterator it2 = frequencySet[0].iterator();
        TreeSet tmp = new TreeSet();

        while (it1.hasNext()) {

            t1 = it1.next() + "";
            c1 = t1.charAt(t1.length() - 1);
            while (it2.hasNext()) {
                t2 = it2.next() + "";
                c2 = t2.charAt(0);
                if (c1 >= c2)
                    continue;
                else {
                    t3 = t1 + t2;
                    tmp.add(t3);
                }
            }

            it2 = frequencySet[0].iterator();

        }
        canditateSet[k - 1] = tmp;

    }

    public void gen_frequentk(int k) {
        String s1 = "";
        Iterator it = canditateSet[k - 1].iterator();
        frequencySet[k - 1] = new TreeSet();
        while (it.hasNext()) {
            s1 = it.next() + "";
            if (count_sup(s1) > min_sup) {
                frequencySet[k - 1].add(s1);
            }
        }
    }

    public boolean is_frequentk_empty(int k) {
        return frequencySet[k - 1].isEmpty() ? true : false;
    }

    public boolean included(String s1, String s2) {
        boolean flag = true;
        for (int i = 0; i < s1.length(); i++) {
            if (s2.indexOf(s1.charAt(i)) == -1)
                return false;
            if (i == s1.length() - 1)
                return true;
        }
        return flag;
    }

    public void gen_maxFrenquent() {
        for (int i = 0; i < frequencyIndex; i++) {
            maxFrequency.addAll(frequencySet[i]);
        }
    }

    public void gen_sub(String s) {
        String s1 = "", s2 = "";
        String tl = "";
        int sl = s.length();
        for (int i = 0; i < s.length() + 1; i++) {
            tl = tl + "0";
        }
        for (int i = 1; i < Math.pow(2, sl) - 1; i++) {
            String tmp = Integer.toBinaryString(i);
            tmp = tl + tmp;
            tmp = tmp.substring(tmp.length() - sl, tmp.length());
            for (int j = 0; j < tmp.length(); j++) {
                if (tmp.charAt(j) == '0') {
                    s1 = s1 + s.charAt(j);
                } else {
                    s2 = s2 + s.charAt(j);
                }
            }

            if (count_sup(s1 + s2) / count_sup(s1) >= min_conf) {
                ruleMap.put(s1, s2);
            }
            s1 = "";
            s2 = "";
        }
    }

    public void gen_rule() {
        String s = "";
        Iterator it = maxFrequency.iterator();
        while (it.hasNext()) {
            s = it.next() + "";
            gen_sub(s);
        }
    }

    public void print() {
        print_maxFrequency();
        gen_rule();
        print_rule();
    }

    public void print_maxFrequency() {
        Iterator it = maxFrequency.iterator();
        String k = null;
        re += "关联规则频繁项集" + "\n";
        System.out.println("关联规则频繁项集");
        while (it.hasNext()) {
            k = (String) it.next();
            re += k + "\t";
            System.out.print(k + "\t");
        }
        re += "\n";
        System.out.println();
    }

    public void print_rule() {
        String s1, s2;
        double sup = 0;
        Set hs = ruleMap.keySet();
        Iterator it = hs.iterator();

        //        StringBuffer sb = new StringBuffer();
        re += "关联规则" + "\n";
        System.out.println("关联规则");
        while (it.hasNext()) {

            s1 = (String) it.next();
            s2 = ruleMap.get(s1) + "";

            sup = count_sup(s1 + s2) / count_sup(s1);
            re += s1 + "-->" + s2 + "\t" + sup + "\n";
            System.out.println(s1 + "-->" + s2 + "\t" + sup);

        }

    }

    public void print_canditate() {

        String k = null;
        for (int i = 0; i < frequencySet[0].size(); i++) {
            Iterator ix = canditateSet[i].iterator();
            re += "候选集" + (i + 1) + ":" + "\n";
            System.out.println("候选集" + (i + 1) + ":");
            while (ix.hasNext()) {
                k = (String) ix.next();
                re += k + "\t";
                System.out.print(k + "\t");
            }
            re += "\n";
            System.out.println();

            re += "频繁项集" + (i + 1) + ":" + "\n";
            System.out.println("频繁项集" + (i + 1) + ":");
            if (frequencySet[i] != null) {
                Iterator iy = frequencySet[i].iterator();
                while (iy.hasNext()) {
                    k = (String) iy.next();
                    re += k + "\t";
                    System.out.print(k + "\t");
                }
            }
            re += "\n";
            System.out.println();
        }

    }

    public int counts() {
        String temp1 = null;
        char temp2 = 'a';
        
        for (int i = 0; i < transSet.length; i++) {
            temp1 = transSet[i];
            for (int j = 0; j < temp1.length(); j++) {
                temp2 = temp1.charAt(j);
                canditate.add(temp2 + "");
            }
        }
        return canditate.size();

    }

    public double count_sup(String t) {
        int temp = 0;
        String temp1 = "";
        for (int i = 0; i < transSet.length; i++) {
            temp1 = transSet[i];
            for (int j = 0; j < t.length(); j++) {
                if (temp1.indexOf(t.charAt(j)) == -1)
                    break;
                if (j == (t.length() - 1))
                    temp++;
            }
        }
        return 1.0 * temp / transSet.length;
    }

    public double getMax_size() {
        return max_size;
    }

    public void setMax_size(int max_size) {
        this.max_size = max_size;
    }

    public double getMin_conf() {
        return min_conf;
    }

    public double getMin_sup() {
        return min_sup;
    }

    public void setMin_sup(double min_sup) {
        this.min_sup = min_sup;
    }

    public void setMin_conf(double min_conf) {
        this.min_conf = min_conf;
    }

    public String[] getTransSet() {
        return transSet;
    }

    public void setTransSet(String[] transSet) {
        this.transSet = transSet;
    }

    public String run() {
        int k = 1;
        gen_item1();
        while (!is_frequentk_empty(k)) {
            k++;
            gen_canditatek(k);
            if (canditateSet[k - 1].size() == 0) {
                break;
            }
            gen_frequentk(k);
        }

        frequencyIndex = k - 1;
        print_canditate();
        gen_maxFrenquent();
        print();
        return re;

    }
}

1、用IdentityHashMap 存放关联规则是因为和HashMap相比，IdentityHashMap 的key可以重复。

2、用treeset来存放频繁项集和候选集是因为TreeSet 实现了set接口，元素不会重复、TreeSet中的数据是自动排序的，不允许放入null.

原文链接：https://blog.csdn.net/u011060119/article/details/52886975