背景:需要把元素较多的 list 按其对应的子矩阵进行聚类,再对聚类结果做后续处理。
目录
一、创建子矩阵
1.1 子矩阵定义
https://zhidao.baidu.com/question/1609139445669797947.html
https://blog.csdn.net/qq_16964363/article/details/79497917
设原矩阵为m行n列,则取原矩阵的第 a1,a2,...,ak行(0<a1<a2<...<ak≤m,数列各项均为整数)和第b1,b2,...,bq列(0<b1<b2<...<bq≤n,数列各项均为整数)的元素,这些元素之间保持同一行列关系不变,收缩为一个矩阵,则该矩阵为原矩阵的子矩阵。
也就是取所选各行与所选各列交叉位置上的元素。
1.2 创建子矩阵
# Construct sub_correlation_matrix: entry (row_idx, col_idx) is the entry of
# coco_correlation_A_B at (classes_in_current_split_group[row_idx],
# classes_in_current_split_group[col_idx]).
sub_correlation_matrix = np.zeros([len(classes_in_current_split_group), len(classes_in_current_split_group)])
# print("sub_correlation_matrix before initialize: \n",sub_correlation_matrix)
for row_idx in range(len(classes_in_current_split_group)):
    for col_idx in range(len(classes_in_current_split_group)):
        sub_correlation_matrix[row_idx, col_idx] = coco_correlation_A_B[classes_in_current_split_group[row_idx], classes_in_current_split_group[col_idx]]
# print("sub_correlation_matrix after initialize: \n",sub_correlation_matrix)
做法:先创建一个 list,其中存放需要从原矩阵取出的行下标(这里行、列使用同一组下标);
然后用两层 for 循环遍历该 list;
子矩阵的行下标为 row_idx,对应原矩阵的行下标为 classes_in_current_split_group[row_idx](列下标同理)。
二、字典、list、array的添加与遍历
2.1 添加元素
# Start from an empty dict.
split_groups={}
# dict[key] = value: store classes_number under key 1.
split_groups[1]=classes_number
直接写 字典[key]=value 即可;
中括号内为键(key),等号后为该键对应的值(value)。
print该字典为
{1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
2.2 遍历字典
直接写 for 变量 in 字典,
则变量依次取到每个 key,字典[变量] 即该 key 对应的 value。
# Iterating a dict yields its keys; split_groups[key] is the matching value.
for key in split_groups:
    print("\ngroup:", key,
          " group element numbers", len(split_groups[key]),
          "\ngroup_elements", split_groups[key])
2.3 添加list元素
直接 listname.append(需要加入的元素)
# Label class indices 0, 1, 2, ..., 79, appended one at a time.
classes_number = []
for class_idx in range(0, 80):
    classes_number.append(class_idx)
2.4 遍历list
# Walk the list by position: range(len(...)) yields 0 .. len-1, and each
# position is mapped back to a class index with list[position].
for row_idx in range(len(classes_in_current_split_group)):
    for col_idx in range(len(classes_in_current_split_group)):
        sub_correlation_matrix[row_idx, col_idx] = coco_correlation_A_B[classes_in_current_split_group[row_idx], classes_in_current_split_group[col_idx]]
这里需要注意:遍历下标要写 for index in range(len(list_name));如果直接写 for index in len(list_name),由于 len() 返回的是整数而不是可迭代对象,会报 TypeError: 'int' object is not iterable。
同时,取 list 中的元素用中括号 [];多维的 np.array 可以在同一个中括号内用逗号分隔各维下标(如 a[i, j]),而嵌套 list 需要写成 a[i][j]。
2.5 np.array
创建数组
https://www.cnblogs.com/hezhefly/p/8278842.html
# Square array of zeros with one row and one column per entry of
# classes_in_current_split_group.
sub_correlation_matrix=np.zeros([len(classes_in_current_split_group),len(classes_in_current_split_group)])
数组名 = np.zeros([各维度的大小]),得到一个该形状、元素全为 0 的数组。
2.6 array中元素
array_name[维度0下标, 维度1下标]
例如:
# array[i, j] addresses one element of a 2-D array, on both sides of the
# assignment; the indices into coco_correlation_A_B come from the class list.
for row_idx in range(len(classes_in_current_split_group)):
    for col_idx in range(len(classes_in_current_split_group)):
        sub_correlation_matrix[row_idx, col_idx] = coco_correlation_A_B[classes_in_current_split_group[row_idx], classes_in_current_split_group[col_idx]]
三、完整程序
#-*-coding:utf-8 -*-
"""
created by xingxinangrui on 2019.5.7
this program is to perform spectral clustering on coco dataset labels
Iteratively split oversized groups in two until no group has more than max_classes_per_group (default 10) elements
-----------------------1.----------------------------
load coco_correlations.pkl
load coco_names.pkl
in which:
names is a 80 dimension list contains label names
names : ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
......
'hot dog', 'pizza', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
A_B is a correlation matrix
A_B [[1.00000000e+00 8.26410144e-01 7.04392284e-01 ... 4.03311258e-01
4.45312500e-01 5.40000000e-01]
...
[8.36764511e-03 1.74901618e-03 6.97188008e-04 ... 3.97350993e-03
1.40625000e-01 1.00000000e+00]]
A_B.shape (80, 80)
notA_B.shape (80, 80)
A_notB.shape (80, 80)
notA_notB.shape (80, 80)
correlations = {}
correlations.update(pp=A_B) # P(A|B)
correlations.update(fp=notA_B) # P(not A|B)
correlations.update(pf=A_notB)
correlations.update(ff=notA_notB)
----------------------2.------------------------
split one big group into two smaller groups, repeating until every group has at most max_classes_per_group elements
"""
import numpy as np
from sklearn import datasets
from sklearn.cluster import SpectralClustering
from sklearn import metrics
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import json
import os
import argparse
import warnings
# Suppress every warning for the rest of the process.
warnings.filterwarnings("ignore")
# Example invocation:
# env/bin/python sk_spectral_cluster/coco_SpecCluster_iter2group.py --max_classes_per_group 10 --probability_filter_threshold 0.1 --show_cluster_process 0
# Command-line options; they are parsed inside
# coco_label_spectral_clustering_iter_two_groups().
parser = argparse.ArgumentParser(description='coco_label_spectral_clustering_iter_two_groups')
# A group holding more classes than this value is split again.
parser.add_argument('--max_classes_per_group', '-i', default=10, type=int,
metavar='N', help='Max classes in each group after cluster')
# Correlation entries below this value are set to 0 before clustering.
parser.add_argument('--probability_filter_threshold', default=0.1, type=float,
help='filter probilities which less than probability_filter_threshold set to 0')
# When equal to 1, the group table is printed after every split.
parser.add_argument('--show_cluster_process', default=1, type=int,
help='if 1 show cluster process, else just show final result')
def _filter_weak_correlations(correlation, threshold):
    """Return a copy of *correlation* with every entry below *threshold* set to 0."""
    # np.where builds a new array, so the matrix loaded from disk is left untouched.
    return np.where(correlation < threshold, 0, correlation)


def _extract_sub_matrix(correlation, class_indices):
    """Return the square sub-matrix of *correlation* restricted to *class_indices*.

    Entry (i, j) of the result equals
    correlation[class_indices[i], class_indices[j]], i.e. the same values an
    explicit double loop over the index list would copy.
    """
    return np.array(correlation[np.ix_(class_indices, class_indices)], dtype=float)


def _partition_by_label(class_indices, labels):
    """Split *class_indices* into (label == 0, label != 0) lists, keeping order."""
    group_0 = [cls for cls, label in zip(class_indices, labels) if label == 0]
    group_1 = [cls for cls, label in zip(class_indices, labels) if label != 0]
    return group_0, group_1


def coco_label_spectral_clustering_iter_two_groups():
    """Repeatedly bisect label groups with spectral clustering and print the result.

    Reads the command-line options from the module-level ``parser``, loads
    ``sk_spectral_cluster/coco_correlations.pkl`` (uses its ``'pp'`` matrix) and
    ``sk_spectral_cluster/coco_names.pkl`` relative to the working directory,
    zeroes correlation entries below ``--probability_filter_threshold``, then
    starts from one group holding every class index and keeps splitting the
    first group that has more than ``--max_classes_per_group`` members into two
    until no such group is left. Progress and the final groups (as class names)
    are printed to stdout; nothing is returned.

    Raises:
        ValueError: if ``--max_classes_per_group`` is smaller than 1.
        RuntimeError: if a split leaves one side empty, because the oversized
            group could then never shrink and the loop would not terminate.
    """
    # Kept global so the module-level name `args` is still defined after the call.
    global args
    args = parser.parse_args()
    max_classes_per_group = args.max_classes_per_group
    show_cluster_process = args.show_cluster_process
    probability_filter_threshold = args.probability_filter_threshold
    if max_classes_per_group < 1:
        raise ValueError(
            "max_classes_per_group must be at least 1, got {}".format(max_classes_per_group))

    print("-----------------------------------------------------")
    print("-----------------------------------------------------")
    print("-----------------------------------------------------")

    # ---------- load coco_correlations.pkl and coco_names.pkl ----------
    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only load .pkl files that come from a trusted source.
    with open('sk_spectral_cluster/coco_correlations.pkl', 'rb') as f:
        print("loading coco_correlations.pkl ")
        correlations = pickle.load(f)
    with open('sk_spectral_cluster/coco_names.pkl', 'rb') as f:
        print("loading coco_names.pkl")
        names = pickle.load(f)

    # Entries below the threshold are set to 0, all others are kept as they are.
    coco_correlation_A_B = _filter_weak_correlations(
        correlations['pp'], probability_filter_threshold)

    # Groups are stored as {group_id: [class_idx, ...]}, with ids starting at 1.
    # The initial group holds one index per row of the correlation matrix
    # (80 for the COCO matrix described in the module docstring).
    split_groups = {1: list(range(coco_correlation_A_B.shape[0]))}

    iter_times = 0
    while True:
        # First group (in insertion order) that is still too large, if any.
        current_split_group_idx = next(
            (group_idx for group_idx, members in split_groups.items()
             if len(members) > max_classes_per_group),
            None)

        if current_split_group_idx is None:
            print(" All group element number less than ten! program end!")
            print(" Final split groups: ")
            for key in split_groups:
                print("group:", key, " group element numbers", len(split_groups[key]), " group_elements : ", split_groups[key])
            break

        iter_times = iter_times + 1
        print("Not all element number less than", max_classes_per_group, " split start, iteration : ", iter_times, "split group: ", current_split_group_idx)

        # ---------------- split the selected group ----------------
        classes_in_current_split_group = split_groups[current_split_group_idx]
        sub_correlation_matrix = _extract_sub_matrix(
            coco_correlation_A_B, classes_in_current_split_group)

        # pred_y holds one cluster label (0 or 1) per class of the group.
        # NOTE(review): with scikit-learn's default affinity ('rbf') every row of
        # the sub-matrix is treated as a feature vector. If the correlations are
        # meant to act as similarities directly, affinity='precomputed' (on a
        # symmetric matrix) would be required - confirm the intent.
        pred_y = SpectralClustering(n_clusters=2, gamma=1).fit_predict(sub_correlation_matrix)

        splited_class_idx_list_0, splited_class_idx_list_1 = _partition_by_label(
            classes_in_current_split_group, pred_y)
        if not splited_class_idx_list_0 or not splited_class_idx_list_1:
            # Without this guard the same group would be selected forever while
            # an empty group is appended on every iteration.
            raise RuntimeError(
                "spectral clustering put all {} classes of group {} into one cluster; "
                "the group cannot be split further".format(
                    len(classes_in_current_split_group), current_split_group_idx))

        # One half replaces the split group, the other gets the next free id.
        split_groups[current_split_group_idx] = splited_class_idx_list_0
        split_groups[len(split_groups) + 1] = splited_class_idx_list_1

        if show_cluster_process == 1:
            print("Updated split_groups: ")
            for group_idx in split_groups:
                print("group", group_idx, " group element numbers: ", len(split_groups[group_idx]), " group_elements : ", split_groups[group_idx])
        # Closes the "split start" message printed above for this iteration.
        print("split ended !\n")

    # --------------- split ended, write final results ---------------
    # Translate every class index into its label name, group by group.
    class_names_in_groups = {
        group_idx: [names[class_idx] for class_idx in members]
        for group_idx, members in split_groups.items()
    }

    print("\nFinal results,group numbers: ", len(split_groups), " max_classes_per_group: ", max_classes_per_group, " probability filter threshold: ", probability_filter_threshold)
    for group_idx in class_names_in_groups:
        print("group:", group_idx, " group element numbers: ", len(class_names_in_groups[group_idx]), "\ngroup_elements : ", class_names_in_groups[group_idx])
# Run the clustering only when the file is executed as a script.
if __name__ == '__main__':
    coco_label_spectral_clustering_iter_two_groups()
版权声明:本文为weixin_36474809原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。