1.先把数据集分开
import numpy as np
import pandas as pd
# Split ./data/train.csv into a training file and a hold-out test file.
with open('./data/train.csv') as file:
    array_of_lines = file.readlines()
number_of_lines = len(array_of_lines)  # author's note: the file has 100000 lines of data

# Roughly an 8:2 split. The first 80000 lines (line 0 included, which is
# reused below as the header of the test file) go to the training file and
# the remaining lines go to the test file.
# The outputs are opened with 'w' instead of 'a' so that re-running the script
# overwrites them; appending would duplicate every row and leave a second
# header line in the middle of each file.
with open('./data/train_train.csv', 'w') as file_train:
    file_train.writelines(array_of_lines[0:80000])
with open('./data/train_test.csv', 'w') as file_test:
    # Repeat the first line so the test file has the same header; the slice
    # (rather than [0]) keeps an empty input file from raising IndexError.
    file_test.writelines(array_of_lines[:1])
    file_test.writelines(array_of_lines[80000:])
2.把训练集、测试集的 id、输入 heartbeat_signals(x)、输出 label(y) 分割开
# Load the two CSV files written by the split step and pull out, for each of
# them, the sample id column, the raw signal strings (x) and the labels (y).
df_train = pd.read_csv('./data/train_train.csv')
df_test = pd.read_csv('./data/train_test.csv')
train_id, train_x_str, train_y = (
    df_train[column] for column in ('id', 'heartbeat_signals', 'label')
)
test_id, test_x_str, test_y = (
    df_test[column] for column in ('id', 'heartbeat_signals', 'label')
)
def split_heartbeat_signals(array_str):
    """Parse comma-separated signal strings into a 2-D float matrix.

    Args:
        array_str: A sized iterable (list, NumPy array, pandas Series, ...)
            whose elements are strings of comma-separated numbers.

    Returns:
        A float ``numpy.ndarray`` with one row per input element. The number
        of columns is the number of fields in the first element; longer rows
        are truncated to that width. An empty input yields a ``(0, 0)`` array.

    Raises:
        ValueError: If a field is not numeric or a row cannot be fitted to
            the width taken from the first row.
    """
    # Iterate over the values instead of indexing with [0]: on a pandas
    # Series, [0] is a label lookup and fails when the index does not
    # contain 0 (for example after slicing the Series).
    rows = [line.strip().split(',') for line in array_str]
    if not rows:
        return np.zeros((0, 0))

    number_of_str = len(rows[0])
    return_mat = np.zeros((len(rows), number_of_str))
    for index, list_from_line in enumerate(rows):
        return_mat[index, :] = np.asarray(list_from_line[0:number_of_str], dtype=float)
    return return_mat
3.将数据转换成适合classify0输入的类型
# Turn the raw columns into the inputs classify0 takes: one numeric matrix of
# signals per split, plus a NumPy array of the matching labels.
train_dataset, test_dataset = (
    split_heartbeat_signals(signal_column) for signal_column in (train_x_str, test_x_str)
)
train_label, test_label = (
    np.array(label_column) for label_column in (train_y, test_y)
)
4.试用classify0算法
def classify0(in_x, dataset, labels, k):
    """Classify ``in_x`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``in_x`` and every row of
    ``dataset``.

    Args:
        in_x: The sample to classify; one value per column of ``dataset``.
        dataset: 2-D array-like of known samples, one per row.
        labels: Sequence indexable by row position, giving the label of each
            row in ``dataset``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of rows are clamped to the number of rows.

    Returns:
        The label with the most votes among the k nearest rows. On a tie, the
        label that was encountered first (i.e. closest) wins.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is smaller than 1.
    """
    dataset = np.asarray(dataset)
    dataset_size = dataset.shape[0]
    if dataset_size == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Asking for more neighbours than there are samples used to raise
    # IndexError; let every available sample vote instead.
    k = min(k, dataset_size)

    # Broadcasting subtracts in_x from every row without materialising a
    # tiled copy of in_x.
    diff_mat = dataset - in_x
    distances = np.sqrt((diff_mat ** 2).sum(axis=1))
    sort_dist_indicies = np.argsort(distances)

    class_count = {}
    for neighbour in sort_dist_indicies[:k]:
        vote_ilabel = labels[neighbour]
        class_count[vote_ilabel] = class_count.get(vote_ilabel, 0) + 1
    # max() returns the first maximal item, matching the tie-breaking of the
    # stable descending sort it replaces.
    return max(class_count.items(), key=lambda kv: kv[1])[0]
# Quick try-out: classify the third-from-last test sample against the first
# 100 training samples with k=10. The return value is not stored, so it is
# only visible where a bare expression is echoed (e.g. an interactive session).
classify0(
    in_x=test_dataset[-3],
    dataset=train_dataset[:100],
    labels=train_label[:100],
    k=10,
)
5.测试算法
def test_classify0_rate(test_dataset, test_label, train_size=10000, k=8):
    """Run classify0 over a test set and print the fraction classified correctly.

    Each test sample is classified against the first ``train_size`` rows of
    the module-level ``train_dataset`` / ``train_label``. One line is printed
    per sample, followed by the overall correct rate.

    Args:
        test_dataset: Indexable, sized collection of samples to classify.
        test_label: Expected label for each sample, indexable by position.
        train_size: How many leading training rows to classify against.
        k: Number of neighbours passed to classify0.

    Returns:
        None. Results are reported through ``print`` only.
    """
    num_of_test = len(test_dataset)
    if num_of_test == 0:
        # Avoid the ZeroDivisionError the rate computation would hit.
        print("no test samples were given, so the correct rate is undefined")
        return

    # The training slices do not change between samples, so take them once.
    reference_dataset = train_dataset[:train_size]
    reference_label = train_label[:train_size]

    correct_count = 0
    for i in range(num_of_test):
        classifier_result = classify0(test_dataset[i], reference_dataset, reference_label, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifier_result, test_label[i]))
        if classifier_result == test_label[i]:
            correct_count += 1
    print("the total correct rate is: %f" % (correct_count / float(num_of_test)))
# Evaluate on test samples 1000 through 1999 (1000 samples).
test_classify0_rate(
    test_dataset=test_dataset[1000:2000],
    test_label=test_label[1000:2000],
)
版权声明:本文为Atticus_zhang原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。