In this part of the exercise, you will implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance (QA).
Suppose you are the product manager of the factory and you have the test results for some microchips on two different tests. From these two tests, you would like to determine whether the microchips should be accepted or rejected. To help you make the decision, you have a dataset of test results on past microchips, from which you can build a logistic regression model.
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report  # evaluation report
df = pd.read_csv('ex2data2.txt',names=['test1','test2','accepted'])
# print(df.head())
# sns.set(context='notebook', style='ticks', font_scale=1.5)
# sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
#            height=6,
#            fit_reg=False,
#            scatter_kws={'s': 50}
#            )
# plt.title('Regularized Logistic Regression')
# plt.show()
# Feature mapping: map the raw two-dimensional inputs to polynomial feature combinations,
# so that a more expressive hypothesis can be fit
def feature_mapping(x, y, power, as_ndarray=False):
    # Build every term x^(i-p) * y^p for 0 <= p <= i <= power, keyed as 'f{i-p}{p}'
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in range(power + 1)
            for p in range(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values  # .as_matrix() is deprecated; use .values
    else:
        return pd.DataFrame(data)
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1,x2,power=6)
# print(data.shape)
# print(data.head())
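As a quick sanity check (assuming ex2data2.txt contains the standard 118 samples), power=6 should produce 28 columns, since each degree i contributes i + 1 terms and 1 + 2 + ... + 7 = 28:
# Columns follow the 'f{i-p}{p}' naming scheme from feature_mapping above
assert data.shape == (118, 28)
print(data.columns.tolist()[:6])  # ['f00', 'f10', 'f01', 'f20', 'f11', 'f02']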
# Regularized cost function; here data has shape (118, 28)
theta = np.zeros(data.shape[1])
X = feature_mapping(x1,x2,power=6,as_ndarray=True)
# print(X.shape)
def get_Y(df):
    # df.iloc[:, -1] selects the last column of df (the 'accepted' label)
    return np.array(df.iloc[:, -1])
y = get_Y(df)
# print(y.shape)
def sigmoid(z):
return 1 / (1+np.exp(-z))
def cost(theta, X, y):
    # Unregularized cross-entropy cost: mean of -y*log(h) - (1 - y)*log(1 - h)
    return np.mean(-y * np.log(sigmoid(X.dot(theta))) - (1 - y) * np.log(1 - sigmoid(X.dot(theta))))
def regularized_cost(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]  # theta_0 is not regularized
    # Note the parentheses: the penalty factor is l / (2 * m), not (l / 2) * m
    regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()
    return cost(theta, X, y) + regularized_term
# print(regularized_cost(theta,X,y))
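For reference, the quantity regularized_cost computes is the cross-entropy loss plus an L2 penalty that leaves \theta_0 unpenalized (with m = len(X) and \lambda = l):

J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log h_\theta(x^{(i)}) - \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2

where h_\theta(x) = g(\theta^{T}x) and g is the sigmoid defined above.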
def gradient(theta, X, y):
    # Unregularized gradient: (1/m) * X^T (h - y)
    return (1 / len(X)) * np.dot(X.T, (sigmoid(X.dot(theta)) - y))
def regularized_gradient(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]
    regularized_theta = (l / len(X)) * theta_j1_to_n
    # Prepend a 0 so that theta_0 receives no regularization term
    regularized_term = np.concatenate([np.array([0]), regularized_theta])
    return gradient(theta, X, y) + regularized_term
# print(regularized_gradient(theta,X,y))
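The corresponding gradient, matching the prepended zero in regularized_gradient, leaves \theta_0 unpenalized:

\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)}, \qquad \frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)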
import scipy.optimize as opt
# print('init cost = {}'.format(regularized_cost(theta,X,y)))
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X,y),
# method='Newton-CG',
method='TNC',
jac=regularized_gradient)
final_theta = res.x
print(res)
# Prediction
def predict(x, theta):
    prob = sigmoid(x.dot(theta))
    return (prob >= 0.5).astype(int)
y_pred = predict(X, final_theta)  # use the optimized theta, not the initial zeros
print(classification_report(y, y_pred))
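Beyond the per-class report, a one-line overall training accuracy check (a minimal sketch using the y and y_pred defined above):
accuracy = (y_pred == y).mean()  # fraction of training samples classified correctly
print('train accuracy = {:.2%}'.format(accuracy))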
""" 决策边界未解决
# 画出决策边界
def draw_boundary(power, l):
density = 1000
threshhold = 2 * 10**-3
final_theta = feature_mapped_logistic_regression(power, l)
x, y = find_decision_boundary(density, power, final_theta, threshhold)
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
sns.lmplot('test1', 'test2', hue='accepted', data=df, size=6, fit_reg=False, scatter_kws={"s": 100})
# plt.scatter(x, y, corlor='R', s=10)
plt.scatter(x, y, s=10)
plt.title('Decision boundary')
plt.show()
def feature_mapped_logistic_regression(power, l):
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
x1 = np.array(df.test1)
x2 = np.array(df.test2)
y = get_Y(df)
X = feature_mapping(x1, x2, power, as_ndarray=True)
theta = np.zeros(X.shape[1])
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X, y, l),
method='TNC',
jac=regularized_gradient)
final_theta = res.x
return final_theta
def find_decision_boundary(density, power, theta, threshhold):
t1 = np.linspace(-1, 1.5, density) #1000个样本
t2 = np.linspace(-1, 1.5, density)
cordinates = [(x, y) for x in t1 for y in t2]
x_cord, y_cord = zip(*cordinates)
mapped_cord = feature_mapping(x_cord, y_cord, power) # this is a dataframe
# inner_product = mapped_cord.as_matrix() @ theta
inner_product = mapped_cord.values @ theta
decision = mapped_cord[np.abs(inner_product) < threshhold]
return decision.f10, decision.f01
draw_boundary(6,1)
"""
Copyright notice: this is an original article by weixin_51444827, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.