YOLOv2、v3使用K-means聚类计算anchor boxes的具体方法

k-means需要有数据，中心点个数是需要人为指定的，位置可以随机初始化，但是还需要度量到聚类中心的距离。这里怎么度量这个距离是很关键的。
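距离度量如果使用标准的欧氏距离，大盒子会比小盒子产生更多的错误。例如，宽度为100的大框被估计为95时平方误差为 $(100-95)^2=25$，而宽度为5的小框被估计为2.5时平方误差只有 $(5-2.5)^2=6.25$，尽管后者的相对误差要大得多。因此这里使用其他的距离度量公式。聚类的目的是让anchor boxes和临近的ground truth有更大的IOU值，这和anchor box的尺寸没有直接关系。自定义的距离度量公式：$d(\text{box},\text{centroid}) = 1-\text{IOU}(\text{box},\text{centroid})$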

到聚类中心的距离越小越好，但IOU值是越大越好，所以使用 1 - IOU，这样就保证距离越小，IOU值越大。

使用的聚类原始数据是只有标注框的检测数据集，YOLOv2、v3都会生成一个包含标注框位置和类别的TXT文件，其中每行都包含 $(x_j, y_j, w_j, h_j),\ j\in\{1,2,\dots,N\}$，即ground truth boxes相对于原图的坐标，$(x_j, y_j)$ 是框的中心点，$(w_j, h_j)$ 是框的宽和高，$N$ 是所有标注框的个数；
首先给定k个聚类中心点 $(W_i, H_i),\ i\in\{1,2,\dots,k\}$，这里的 $W_i, H_i$ 是anchor boxes的宽和高尺寸，由于anchor boxes位置不固定，所以没有(x,y)的坐标，只有宽和高；
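计算每个标注框和每个聚类中心点的距离 d=1-IOU(标注框,聚类中心)，计算时每个标注框的中心点都与聚类中心重合，这样才能计算IOU值，即 $d = 1-\text{IOU}\big[(x_j, y_j, w_j, h_j),\ (x_j, y_j, W_i, H_i)\big],\ j\in\{1,2,\dots,N\},\ i\in\{1,2,\dots,k\}$，其中 $(x_j, y_j, w_j, h_j)$ 是第j个标注框的中心点和宽高，$(W_i, H_i)$ 是第i个聚类中心的宽高。将标注框分配给“距离”最近的聚类中心；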
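所有标注框分配完毕以后，对每个簇重新计算聚类中心点，计算方式为 $W_i' = \frac{1}{N_i}\sum w,\quad H_i' = \frac{1}{N_i}\sum h$（求和遍历分配到第i个簇的所有标注框的宽 $w$ 和高 $h$），$N_i$ 是第i个簇的标注框个数，就是求该簇中所有标注框的宽和高的平均值。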
重复第3、4步（即“把标注框分配给最近的聚类中心”和“重新计算聚类中心”这两步），直到聚类中心改变量很小。

代码实现主要是AlexeyAB/darknet中scripts/gen_anchors.py，这里根据yolov2，yolov3的版本不同进行部分修改。yolov2的配置文件yolov2.cfg需要的anchors是相对特征图的，值很小基本都小于13；yolov3的配置文件yolov3.cfg需要的anchors（共9个，每个检测尺度3个）是相对于原图来说的，相对都比较大。还有输入图片的大小（32的倍数）对于输出也是有影响的。
例：
yolov2.cfg中[region] anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
yolov3.cfg中[yolo] anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326

from os import listdir

from os.path import isfile, join

import argparse

#import cv2

import numpy as np

import sys

import os

import shutil

import random

import math



def IOU(x,centroids):
    '''
    Compute the IoU of one ground-truth box against every anchor box.

    Only widths and heights are compared; the formulas treat the two boxes as
    aligned, so the intersection is the overlap of the widths times the
    overlap of the heights.

    :param x: (w, h) of a single ground-truth box
    :param centroids: collection of k anchor sizes [(w, h), ...]
    :return: numpy array of shape (k,) with the IoU of ``x`` and each anchor
    '''
    box_w, box_h = x

    def _iou_with(anchor):
        # IoU of the ground-truth box with a single anchor (w, h).
        a_w, a_h = anchor
        if a_w >= box_w and a_h >= box_h:
            # The anchor encloses the ground-truth box.
            return box_w*box_h/(a_w*a_h)
        if a_w >= box_w and a_h <= box_h:
            # The anchor is wider but shorter than the ground-truth box.
            return box_w*a_h/(box_w*box_h + (a_w-box_w)*a_h)
        if a_w <= box_w and a_h >= box_h:
            # The anchor is narrower but taller than the ground-truth box.
            return a_w*box_h/(box_w*box_h + a_w*(a_h-box_h))
        # The ground-truth box encloses the anchor.
        return (a_w*a_h)/(box_w*box_h)

    return np.array([_iou_with(anchor) for anchor in centroids])



def avg_IOU(X,centroids):
    '''
    Average, over all ground-truth boxes, of the best IoU each box reaches
    with any anchor.

    :param X: 2-D array of ground-truth sizes [(w, h), ...]
    :param centroids: collection of k anchor sizes [(w, h), ...]
    :return: mean of the per-box maximum IoU
    '''
    box_count, _ = X.shape
    total = 0.
    for row in range(box_count):
        # Best IoU this ground-truth box achieves over all anchors.
        best = max(IOU(X[row], centroids))
        total += best
    return total/box_count



def write_anchors_to_file(centroids,X,anchor_file,input_shape,yolo_version):
    '''
    Scale the clustered anchors for the requested YOLO version and write them,
    sorted by width, to ``anchor_file`` followed by the average IoU.

    The file gets two lines: ``w,h, w,h, ...`` (two decimals, no trailing
    comma) and the value returned by ``avg_IOU(X, centroids)``.

    :param centroids: 2-D array of k anchor sizes [(w, h), ...]; not modified
    :param X: 2-D array of ground-truth sizes [(w, h), ...]
    :param anchor_file: output path for the anchors and the average IoU
    :param input_shape: network input size; anchors are multiplied by
        ``input_shape/32`` for yolov2 and by ``input_shape`` for yolov3
    :param yolo_version: ``'yolov2'`` or ``'yolov3'``
    :raises SystemExit: with code -1 when ``yolo_version`` is not recognised
    '''
    anchors = centroids.copy()
    print(anchors.shape)

    if yolo_version=='yolov2':
        # YOLO downsamples the image by a factor of 32, so dividing by 32
        # expresses the anchors in feature-map cells. Adjust the factor if the
        # network architecture uses a different stride.
        scale = input_shape/32.
    elif yolo_version=='yolov3':
        # yolov3 anchors are expressed relative to the input image.
        scale = input_shape
    else:
        # Reject the version before the output file is opened so that no
        # empty anchor file is left behind.
        print("the yolo version is not right!")
        raise SystemExit(-1)

    anchors = anchors * scale

    # Order the anchors by ascending width.
    sorted_anchors = anchors[np.argsort(anchors[:,0])]
    print('Anchors = ', sorted_anchors)

    # Compute the score first so a failure cannot leave a half-written file.
    mean_iou = avg_IOU(X,centroids)

    with open(anchor_file,'w') as f:
        # Joining avoids a trailing comma after the last anchor and formats
        # plain scalars rather than one-element arrays.
        f.write(', '.join('%0.2f,%0.2f'%(w,h) for w,h in sorted_anchors))
        f.write('\n')
        f.write('%f\n'%(mean_iou))

    print()



def kmeans(X,centroids,eps,anchor_file,input_shape,yolo_version):
    '''
    Cluster ground-truth box sizes with k-means using ``1 - IoU`` as distance,
    then write the resulting anchors with ``write_anchors_to_file``.

    Iteration stops when the assignment of boxes to clusters is identical to
    the previous iteration.

    :param X: 2-D array of ground-truth sizes [(w, h), ...]
    :param centroids: 2-D array with the k initial anchor sizes; it is updated
        in place with the new cluster means
    :param eps: accepted for interface compatibility; not used by the
        stopping rule
    :param anchor_file: output path passed to ``write_anchors_to_file``
    :param input_shape: network input size passed to ``write_anchors_to_file``
    :param yolo_version: ``'yolov2'`` or ``'yolov3'``, passed through as well
    :return: None
    '''
    N = X.shape[0]  # number of ground-truth boxes
    print("centroids.shape",centroids)
    k,_ = centroids.shape  # number of anchors; second axis holds (w, h)
    prev_assignments = np.ones(N)*(-1)  # no box is assigned to a cluster yet
    iteration = 0
    old_D = np.zeros((N,k))  # distances from the previous iteration

    while True:
        iteration += 1

        # D has shape (N, k): distance of every box to every anchor.
        D = np.array([1 - IOU(X[i],centroids) for i in range(N)])

        # Total change of the distance matrix since the previous iteration.
        print("iter {}: dists = {}".format(iteration,np.sum(np.abs(old_D-D))))

        # Assign each box to the anchor with the smallest distance.
        assignments = np.argmin(D,axis=1)

        if (assignments == prev_assignments).all():
            # Assignments are stable: report and store the anchors.
            print("Centroids = ",centroids)
            write_anchors_to_file(centroids,X,anchor_file,input_shape,yolo_version)
            return

        # Move each centroid to the mean (w, h) of the boxes assigned to it.
        for j in range(k):
            members = X[assignments==j]
            if len(members) > 0:
                centroids[j] = members.mean(axis=0)
            # An empty cluster keeps its previous centroid, which avoids both
            # a division by zero and a biased mean.

        prev_assignments = assignments.copy()
        old_D = D.copy()



def main(argv):
    '''
    Command-line entry point: collect the (w, h) of every labelled box and run
    k-means to produce anchor files.

    Each line of the file list is an image path. The label file is found by
    replacing ``JPEGImages`` with ``labels`` and the ``.jpg``/``.png``
    extension with ``.txt``. The width and height are read from the 4th and
    5th whitespace-separated fields of every label line.

    With ``-num_clusters 0`` (the default) anchors are generated for 1 through
    10 clusters, otherwise only for the requested number. Results go to
    ``anchors<k>.txt`` inside the output directory.

    :param argv: full argument vector including the program name, e.g.
        ``sys.argv``; ``None`` makes argparse read ``sys.argv`` itself
    :raises ValueError: when no annotation boxes were found, or a label line
        does not have exactly five fields
    '''
    parser = argparse.ArgumentParser()
    # The file list is produced by scripts/voc_label.py (python voc_label.py),
    # which writes several TXT files into the scripts folder. Labels can also
    # be created with labelImg in YOLO format.
    parser.add_argument('-filelist', default = r'scripts\train.txt',
                        help='path to filelist\n' )
    parser.add_argument('-output_dir', default = r'\scripts', type = str,
                        help='Output anchor directory\n' )
    parser.add_argument('-num_clusters', default = 0, type = int,
                        help='number of clusters\n' )
    # yolov2 anchors are relative to the feature map and therefore small;
    # yolov3 anchors are relative to the input image and therefore large,
    # so the two versions produce different output.
    parser.add_argument('-yolo_version', default='yolov2', type=str,
                        help='yolov2 or yolov3\n')
    parser.add_argument('-yolo_input_shape', default=416, type=int,
                        help='input images shape，multiples of 32. etc. 416*416\n')
    # Parse the arguments that were passed in, skipping the program name.
    args = parser.parse_args(argv[1:] if argv is not None else None)

    # Also creates missing parent directories; no error if it already exists.
    os.makedirs(args.output_dir, exist_ok=True)

    with open(args.filelist) as filelist:
        image_paths = [entry.rstrip('\n') for entry in filelist]

    annotation_dims = []
    for image_path in image_paths:
        if not image_path.strip():
            continue  # skip blank lines in the file list
        label_path = image_path.replace('JPEGImages','labels')
        label_path = label_path.replace('.jpg','.txt')
        label_path = label_path.replace('.png','.txt')
        print(label_path)
        with open(label_path) as label_file:
            for row in label_file:
                fields = row.split()
                if not fields:
                    continue  # skip blank lines in the label file
                # Fields 0-2 are skipped; the last two are width and height.
                w,h = fields[3:]
                annotation_dims.append((float(w),float(h)))

    if not annotation_dims:
        raise ValueError('no annotation boxes found via %s' % args.filelist)

    annotation_dims = np.array(annotation_dims)  # (w, h) of every box

    eps = 0.005

    if args.num_clusters == 0:
        cluster_counts = range(1,11)  # generate anchors for 1 through 10 clusters
    else:
        cluster_counts = [args.num_clusters]

    for num_clusters in cluster_counts:
        anchor_file = join( args.output_dir,'anchors%d.txt'%(num_clusters))

        # Initialise the centroids with randomly drawn ground-truth boxes.
        indices = [ random.randrange(annotation_dims.shape[0]) for _ in range(num_clusters)]
        centroids = annotation_dims[indices]
        kmeans(annotation_dims,centroids,eps,anchor_file,args.yolo_input_shape,args.yolo_version)
        print('centroids.shape', centroids.shape)



# Run the command-line entry point only when this file is executed as a
# script; the full argument vector is handed to main().
if __name__=="__main__":

    main(sys.argv)

这是其中的yolov3的结果

参考的内容：https://github.com/AlexeyAB/darknet#how-to-mark-bounded-boxes-of-objects-and-create-annotation-files

原文链接：https://blog.csdn.net/m0_50617544/article/details/120639193