一、什么是Apriori算法?
Apriori算法是寻找所有支持度不小于minsup的项集。项集的支持度指的是包含该项集的事务占所有事务的比例。频繁项集指的是满足给定最下支持度的项集
Apriori算法是由Agrawal等人于1993提出的,它采用逐层搜索策略(层次搜索策略)产生所有的频繁项集。
Apriori性质:
1: 若A是一个频繁项集,则A的每一个子集都是一个频繁项集。
2:如果一个项集不是频繁的,则它的所有超集也一定不是频繁的。 例如 如果{ac}不是频繁集、{abc}、{acd}也一定不是频繁集。
二、Apriori算法思想
Apriori算法的基本思路是采用层次搜索的迭代方法,由候选(k-1)-项集来寻找候选k-项集,并逐一判断产生的候选k-项集是否是频繁的。
设Ck是长度为k的候选项集的集合,Lk是长度为k的频繁项集的集合。为了简单,设最小支持度阈值min_sup为最小元组数,即采用最小支持度计数。
首先,找出频繁1-项集,用L1表示。
由L1寻找C2,由C2产生L2,即产生频繁2-项集的集合。
由L2寻找C3,由C3产生L3。
以此类推,直至没有新的频繁k-项集被发现。求每个Lk时都要对事务数据库D作一次完全扫描。
举例如下: 在所示的事务数据库,设min_sup=2,产生所有频繁项集的过程如图5.1所示,最后L4=Ф,算法结束,产生的所有频繁项集为L1∪L2∪L3。
算法需要解决两个问题
①如何由Lk-1构建Ck (解决方案: 自连接)
②如何由Ck产生Lk (解决方案:剪枝操作,从Ck中删除明显不是频繁项集的项集)
三、实现代码:
package dataMining;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
/**
* @author xcy
* @version 2016 10:30:40 AM
*/
public class Apriori {
private double min_sup = 0.6;
private double min_conf = 0.2;
private int max_size = 40;
private String[] transSet = { "abc", "abc", "acde", "bcdf", "abcd", "abcdf" };
private int frequencyIndex;
// re 是指为了在写到项目中输出方便
private String re = "";
private IdentityHashMap ruleMap = new IdentityHashMap();
private int itemCounts = 0;
private TreeSet[] frequencySet = new TreeSet[max_size];
private TreeSet maxFrequency = new TreeSet();
private TreeSet canditate = new TreeSet();
private TreeSet[] canditateSet = new TreeSet[max_size];
public Apriori() {
maxFrequency = new TreeSet();
canditate = new TreeSet();
itemCounts = counts();
for (int i = 0; i < transSet.length; i++) {
frequencySet[i] = new TreeSet();
canditateSet[i] = new TreeSet();
}
canditateSet[0] = canditate;
}
public Apriori(String[] tranSet, double min_sup, double min_conf) {
this.transSet = tranSet;
this.min_sup = min_sup;
this.min_conf = min_conf;
maxFrequency = new TreeSet();
canditate = new TreeSet();
itemCounts = counts();
for (int i = 0; i < transSet.length; i++) {
frequencySet[i] = new TreeSet();
canditateSet[i] = new TreeSet();
}
canditateSet[0] = canditate;
}
public void gen_item1() {
String temp1 = "";
Iterator temp = canditateSet[0].iterator();
while (temp.hasNext()) {
temp1 = temp.next() + "";
if (count_sup(temp1) >= min_sup)
frequencySet[0].add(temp1);
}
}
public void gen_canditatek(int k) {
String t1 = "", t2 = "", t3 = "";
char c1 = 'a', c2 = 'a';
Iterator it1 = frequencySet[k - 2].iterator();
Iterator it2 = frequencySet[0].iterator();
TreeSet tmp = new TreeSet();
while (it1.hasNext()) {
t1 = it1.next() + "";
c1 = t1.charAt(t1.length() - 1);
while (it2.hasNext()) {
t2 = it2.next() + "";
c2 = t2.charAt(0);
if (c1 >= c2)
continue;
else {
t3 = t1 + t2;
tmp.add(t3);
}
}
it2 = frequencySet[0].iterator();
}
canditateSet[k - 1] = tmp;
}
public void gen_frequentk(int k) {
String s1 = "";
Iterator it = canditateSet[k - 1].iterator();
frequencySet[k - 1] = new TreeSet();
while (it.hasNext()) {
s1 = it.next() + "";
if (count_sup(s1) > min_sup) {
frequencySet[k - 1].add(s1);
}
}
}
public boolean is_frequentk_empty(int k) {
return frequencySet[k - 1].isEmpty() ? true : false;
}
public boolean included(String s1, String s2) {
boolean flag = true;
for (int i = 0; i < s1.length(); i++) {
if (s2.indexOf(s1.charAt(i)) == -1)
return false;
if (i == s1.length() - 1)
return true;
}
return flag;
}
public void gen_maxFrenquent() {
for (int i = 0; i < frequencyIndex; i++) {
maxFrequency.addAll(frequencySet[i]);
}
}
public void gen_sub(String s) {
String s1 = "", s2 = "";
String tl = "";
int sl = s.length();
for (int i = 0; i < s.length() + 1; i++) {
tl = tl + "0";
}
for (int i = 1; i < Math.pow(2, sl) - 1; i++) {
String tmp = Integer.toBinaryString(i);
tmp = tl + tmp;
tmp = tmp.substring(tmp.length() - sl, tmp.length());
for (int j = 0; j < tmp.length(); j++) {
if (tmp.charAt(j) == '0') {
s1 = s1 + s.charAt(j);
} else {
s2 = s2 + s.charAt(j);
}
}
if (count_sup(s1 + s2) / count_sup(s1) >= min_conf) {
ruleMap.put(s1, s2);
}
s1 = "";
s2 = "";
}
}
public void gen_rule() {
String s = "";
Iterator it = maxFrequency.iterator();
while (it.hasNext()) {
s = it.next() + "";
gen_sub(s);
}
}
public void print() {
print_maxFrequency();
gen_rule();
print_rule();
}
public void print_maxFrequency() {
Iterator it = maxFrequency.iterator();
String k = null;
re += "关联规则频繁项集" + "\n";
System.out.println("关联规则频繁项集");
while (it.hasNext()) {
k = (String) it.next();
re += k + "\t";
System.out.print(k + "\t");
}
re += "\n";
System.out.println();
}
public void print_rule() {
String s1, s2;
double sup = 0;
Set hs = ruleMap.keySet();
Iterator it = hs.iterator();
// StringBuffer sb = new StringBuffer();
re += "关联规则" + "\n";
System.out.println("关联规则");
while (it.hasNext()) {
s1 = (String) it.next();
s2 = ruleMap.get(s1) + "";
sup = count_sup(s1 + s2) / count_sup(s1);
re += s1 + "-->" + s2 + "\t" + sup + "\n";
System.out.println(s1 + "-->" + s2 + "\t" + sup);
}
}
public void print_canditate() {
String k = null;
for (int i = 0; i < frequencySet[0].size(); i++) {
Iterator ix = canditateSet[i].iterator();
re += "候选集" + (i + 1) + ":" + "\n";
System.out.println("候选集" + (i + 1) + ":");
while (ix.hasNext()) {
k = (String) ix.next();
re += k + "\t";
System.out.print(k + "\t");
}
re += "\n";
System.out.println();
re += "频繁项集" + (i + 1) + ":" + "\n";
System.out.println("频繁项集" + (i + 1) + ":");
if (frequencySet[i] != null) {
Iterator iy = frequencySet[i].iterator();
while (iy.hasNext()) {
k = (String) iy.next();
re += k + "\t";
System.out.print(k + "\t");
}
}
re += "\n";
System.out.println();
}
}
public int counts() {
String temp1 = null;
char temp2 = 'a';
for (int i = 0; i < transSet.length; i++) {
temp1 = transSet[i];
for (int j = 0; j < temp1.length(); j++) {
temp2 = temp1.charAt(j);
canditate.add(temp2 + "");
}
}
return canditate.size();
}
public double count_sup(String t) {
int temp = 0;
String temp1 = "";
for (int i = 0; i < transSet.length; i++) {
temp1 = transSet[i];
for (int j = 0; j < t.length(); j++) {
if (temp1.indexOf(t.charAt(j)) == -1)
break;
if (j == (t.length() - 1))
temp++;
}
}
return 1.0 * temp / transSet.length;
}
public double getMax_size() {
return max_size;
}
public void setMax_size(int max_size) {
this.max_size = max_size;
}
public double getMin_conf() {
return min_conf;
}
public double getMin_sup() {
return min_sup;
}
public void setMin_sup(double min_sup) {
this.min_sup = min_sup;
}
public void setMin_conf(double min_conf) {
this.min_conf = min_conf;
}
public String[] getTransSet() {
return transSet;
}
public void setTransSet(String[] transSet) {
this.transSet = transSet;
}
public String run() {
int k = 1;
gen_item1();
while (!is_frequentk_empty(k)) {
k++;
gen_canditatek(k);
if (canditateSet[k - 1].size() == 0) {
break;
}
gen_frequentk(k);
}
frequencyIndex = k - 1;
print_canditate();
gen_maxFrenquent();
print();
return re;
}
}
1、用IdentityHashMap 存放关联规则是因为和HashMap相比,IdentityHashMap 的key可以重复。
2、用treeset来存放频繁项集和候选集是因为TreeSet 实现了set接口,元素不会重复、TreeSet中的数据是自动排序的,不允许放入null.
版权声明:本文为u011060119原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。