In this part of the exercise, you will implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance (QA).
Suppose you are the product manager of the factory and you have the test results for some microchips on two different tests. From these two tests, you would like to determine whether the microchips should be accepted or rejected. To help you make the decision, you have a dataset of test results on past microchips, from which you can build a logistic regression model.
Code:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report  # evaluation report
df = pd.read_csv('ex2data2.txt',names=['test1','test2','accepted'])
# print(df.head())
# sns.set(context='notebook', style='ticks', font_scale=1.5)
# sns.lmplot(x='test1', y='test2', hue='accepted', data=df,
#            height=6,
#            fit_reg=False,
#            scatter_kws={'s': 50}
#            )
# plt.title('Regularized Logistic Regression')
# plt.show()
# Feature mapping: map the raw two-dimensional inputs to polynomial feature combinations,
# so that a more expressive hypothesis can be fit
def feature_mapping(x, y, power, as_ndarray=False):
    # Build every term x^(i-p) * y^p for 0 <= p <= i <= power, keyed as 'f{i-p}{p}'
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in range(power + 1)
            for p in range(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values  # .as_matrix() is deprecated; use .values
    else:
        return pd.DataFrame(data)
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1,x2,power=6)
# print(data.shape)
# print(data.head())
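As a quick sanity check (assuming ex2data2.txt contains the standard 118 samples), power=6 should produce 28 columns, since each degree i contributes i + 1 terms and 1 + 2 + ... + 7 = 28:
# Columns follow the 'f{i-p}{p}' naming scheme from feature_mapping above
assert data.shape == (118, 28)
print(data.columns.tolist()[:6])  # ['f00', 'f10', 'f01', 'f20', 'f11', 'f02']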
# Regularized cost function; here data has shape (118, 28)
theta = np.zeros(data.shape[1])
X = feature_mapping(x1,x2,power=6,as_ndarray=True)
# print(X.shape)
def get_Y(df):
    # df.iloc[:, -1] selects the last column of df (the 'accepted' label)
    return np.array(df.iloc[:, -1])
y = get_Y(df)
# print(y.shape)
def sigmoid(z):
return 1 / (1+np.exp(-z))
def cost(theta, X, y):
    # Unregularized cross-entropy cost: mean of -y*log(h) - (1 - y)*log(1 - h)
    return np.mean(-y * np.log(sigmoid(X.dot(theta))) - (1 - y) * np.log(1 - sigmoid(X.dot(theta))))
def regularized_cost(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]  # theta_0 is not regularized
    # Note the parentheses: the penalty factor is l / (2 * m), not (l / 2) * m
    regularized_term = (l / (2 * len(X))) * np.power(theta_j1_to_n, 2).sum()
    return cost(theta, X, y) + regularized_term
# print(regularized_cost(theta,X,y))
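For reference, the quantity regularized_cost computes is the cross-entropy loss plus an L2 penalty that leaves \theta_0 unpenalized (with m = len(X) and \lambda = l):

J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[-y^{(i)}\log h_\theta(x^{(i)}) - \left(1-y^{(i)}\right)\log\left(1-h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2

where h_\theta(x) = g(\theta^{T}x) and g is the sigmoid defined above.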
def gradient(theta, X, y):
    # Unregularized gradient: (1/m) * X^T (h - y)
    return (1 / len(X)) * np.dot(X.T, (sigmoid(X.dot(theta)) - y))
def regularized_gradient(theta, X, y, l=1):
    theta_j1_to_n = theta[1:]
    regularized_theta = (l / len(X)) * theta_j1_to_n
    # Prepend a 0 so that theta_0 receives no regularization term
    regularized_term = np.concatenate([np.array([0]), regularized_theta])
    return gradient(theta, X, y) + regularized_term
# print(regularized_gradient(theta,X,y))
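The corresponding gradient, matching the prepended zero in regularized_gradient, leaves \theta_0 unpenalized:

\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)}, \qquad \frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \ge 1)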
import scipy.optimize as opt
# print('init cost = {}'.format(regularized_cost(theta,X,y)))
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X,y),
# method='Newton-CG',
method='TNC',
jac=regularized_gradient)
final_theta = res.x
print(res)
# Prediction
def predict(x, theta):
    prob = sigmoid(x.dot(theta))
    return (prob >= 0.5).astype(int)
y_pred = predict(X, final_theta)  # use the optimized theta, not the initial zeros
print(classification_report(y, y_pred))
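Beyond the per-class report, a one-line overall training accuracy check (a minimal sketch using the y and y_pred defined above):
accuracy = (y_pred == y).mean()  # fraction of training samples classified correctly
print('train accuracy = {:.2%}'.format(accuracy))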
""" 决策边界未解决
# 画出决策边界
def draw_boundary(power, l):
density = 1000
threshhold = 2 * 10**-3
final_theta = feature_mapped_logistic_regression(power, l)
x, y = find_decision_boundary(density, power, final_theta, threshhold)
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
sns.lmplot('test1', 'test2', hue='accepted', data=df, size=6, fit_reg=False, scatter_kws={"s": 100})
# plt.scatter(x, y, corlor='R', s=10)
plt.scatter(x, y, s=10)
plt.title('Decision boundary')
plt.show()
def feature_mapped_logistic_regression(power, l):
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
x1 = np.array(df.test1)
x2 = np.array(df.test2)
y = get_Y(df)
X = feature_mapping(x1, x2, power, as_ndarray=True)
theta = np.zeros(X.shape[1])
res = opt.minimize(fun=regularized_cost,
x0=theta,
args=(X, y, l),
method='TNC',
jac=regularized_gradient)
final_theta = res.x
return final_theta
def find_decision_boundary(density, power, theta, threshhold):
t1 = np.linspace(-1, 1.5, density) #1000个样本
t2 = np.linspace(-1, 1.5, density)
cordinates = [(x, y) for x in t1 for y in t2]
x_cord, y_cord = zip(*cordinates)
mapped_cord = feature_mapping(x_cord, y_cord, power) # this is a dataframe
# inner_product = mapped_cord.as_matrix() @ theta
inner_product = mapped_cord.values @ theta
decision = mapped_cord[np.abs(inner_product) < threshhold]
return decision.f10, decision.f01
draw_boundary(6,1)
"""
Copyright notice: this is an original article by weixin_51444827, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.