@rianusr
2019-08-13T10:15:30.000000Z
字数 6086
阅读 2112
06-机器学习
- 快速构建你的第一个简单分类模型
- 简单了解分类模型的优化方向
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")
# Load the training data and take a first look at it.
train_src = pd.read_csv("./../data/titanic_data/train.csv")
train_src.info()
train_src.head()

# Age distribution
train_src.hist(column="Age", bins=50)

# Sex distribution
train_src["Sex"].value_counts().plot(kind="bar")

# Relationship between Sex and the target (Survived)
pd.crosstab(train_src["Sex"], train_src["Survived"]).plot(kind="bar")

# Relationship between Pclass and the target (Survived)
pd.crosstab(train_src["Pclass"], train_src["Survived"]).plot(kind="bar")

# Relationship between binned Age and the target (Survived).
# NOTE: the original did `train_src.age = pd.cut(...)`, which sets a plain
# Python attribute on the DataFrame (not a column) and triggers a pandas
# UserWarning; a local variable is the intended, warning-free equivalent.
age_band = pd.cut(train_src.Age, [0, 5, 15, 20, 35, 50, 60, 100])
pd.crosstab(age_band, train_src.Survived).plot(kind="bar")
# Select Pclass, Sex, Age, SibSp, Parch and Fare as predictor features
# (plus the Survived target). Take a copy so the assignments below modify
# an independent frame instead of a view of train_src
# (avoids pandas SettingWithCopyWarning).
train = train_src[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()

# Encode Sex numerically: male -> 1, female -> 0
train["Sex"] = train["Sex"].replace({"male": 1, "female": 0})

# Age has missing values; fill them with the mean age.
# Keep age_mean so the same value can be reused for the test set.
age_mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(age_mean)

# Inspect the prepared dataset.
train.head(10)
train.describe()
# Split into features X and target y.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; positional
# `.iloc` is the correct replacement here.
train_X = train.iloc[:, 1:]   # all feature columns (everything after Survived)
train_y = train["Survived"]   # target

# Train a logistic-regression model with default parameters.
lr = LogisticRegression()
lr.fit(train_X, train_y)

# Inspect the fitted coefficients, paired with their column names.
print(lr.coef_)
print(train_X.columns)
pd.DataFrame(list(zip(np.transpose(lr.coef_), train_X.columns)),
             columns=["coef", "columns"])

# Predict on the training set: hard labels and class probabilities.
train_y_pred = lr.predict(train_X)
train_y_pred_prob = lr.predict_proba(train_X)
print(train_y_pred)
print(train_y_pred_prob)
# Confusion matrix on the training set.
cnf_matrix = metrics.confusion_matrix(train_y, train_y_pred)
print(cnf_matrix)

# Accuracy. The original named this variable `precision`, but
# metrics.accuracy_score computes accuracy, not precision.
accuracy = metrics.accuracy_score(train_y, train_y_pred)
print(accuracy)
# 更直观一点的展现误分类矩阵
def show_confusion_matrix(cnf_matrix, class_labels):
    """Render a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cnf_matrix : 2-D array-like of int
        Confusion matrix; rows are actual labels, columns predicted labels.
    class_labels : sequence
        Tick labels for both axes, in matrix order.
    """
    # (Indentation of this function was lost in the original extraction;
    # restored here to valid Python.)
    plt.matshow(cnf_matrix, cmap=plt.cm.YlGn, alpha=0.7)
    ax = plt.gca()
    ax.set_xlabel("Predicted Label", fontsize=16)
    ax.set_xticks(range(0, len(class_labels)))
    ax.set_xticklabels(class_labels, rotation=45)
    ax.set_ylabel("Actual Label", fontsize=16, rotation=90)
    ax.set_yticks(range(0, len(class_labels)))
    ax.set_yticklabels(class_labels)
    # Put the x-axis label and ticks on top, matshow-style.
    ax.xaxis.set_label_position("top")
    ax.xaxis.tick_top()
    # Write each cell's count in the middle of the cell.
    for row in range(len(cnf_matrix)):
        for col in range(len(cnf_matrix[row])):
            ax.text(col, row, cnf_matrix[row][col],
                    va="center", ha="center", fontsize=16)

class_labels = [0, 1]
show_confusion_matrix(cnf_matrix, class_labels)
# Prepare the test data exactly as the training data was prepared.
test_src = pd.read_csv("./../data/titanic_data/test.csv")
# Copy so the in-place fills below do not hit a view of test_src
# (avoids pandas SettingWithCopyWarning).
test = test_src[["PassengerId", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()
test["Sex"].replace({"male": 1, "female": 0}, inplace=True)
# Fill missing Age with the training-set mean (same imputation as training).
test["Age"].fillna(age_mean, inplace=True)
# Fare has missing value(s) in the test set; fill with the rounded
# training-set mean.
test["Fare"].fillna(round(train["Fare"].mean()), inplace=True)

# Predict on the test set.
# `.ix` was removed from pandas; positional `.iloc` replaces it.
test_X = test.iloc[:, 1:]
test_y_pred = lr.predict(test_X)
test_pred = pd.DataFrame({"PassengerId": test["PassengerId"],
                          "Survived": test_y_pred.astype(int)})
test_pred.to_csv("./../data/titanic_data/test_pred_0601.csv", index=False)

# Inspect the predictions.
test_pred.head()
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import model_selectiom
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")
# Load the training data and take a first look at it.
train_src = pd.read_csv("./../data/titanic_data/train.csv")
train_src.info()
train_src.head()

# Age distribution
train_src.hist(column="Age", bins=50)

# Sex distribution
train_src["Sex"].value_counts().plot(kind="bar")

# Relationship between Sex and the target (Survived)
pd.crosstab(train_src["Sex"], train_src["Survived"]).plot(kind="bar")

# Relationship between Pclass and the target (Survived)
pd.crosstab(train_src["Pclass"], train_src["Survived"]).plot(kind="bar")

# Relationship between binned Age and the target (Survived).
# NOTE: the original did `train_src.age = pd.cut(...)`, which sets a plain
# Python attribute on the DataFrame (not a column) and triggers a pandas
# UserWarning; a local variable is the intended, warning-free equivalent.
age_band = pd.cut(train_src.Age, [0, 5, 15, 20, 35, 50, 60, 100])
pd.crosstab(age_band, train_src.Survived).plot(kind="bar")
# Select Pclass, Sex, Age, SibSp, Parch and Fare as predictor features
# (plus the Survived target). Take a copy so the assignments below modify
# an independent frame instead of a view of train_src
# (avoids pandas SettingWithCopyWarning).
train = train_src[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()

# Encode Sex numerically: male -> 1, female -> 0
train["Sex"] = train["Sex"].replace({"male": 1, "female": 0})

# Age has missing values; fill them with the mean age.
age_mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(age_mean)

# Inspect the prepared dataset.
train.head(10)
train.describe()
# Split features X and target y.
# `.ix` was removed from pandas; positional `.iloc` replaces it.
train_X = train.iloc[:, 1:]   # feature columns
train_y = train["Survived"]   # target

# Hold out 30% of the rows as a test split (fixed seed for reproducibility).
# Import the function directly so this cell works even though the earlier
# `from sklearn import model_selectiom` line is misspelled.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    train_X, train_y, test_size=0.3, random_state=42)

# Train logistic regression with default parameters and compare accuracy
# on the training split vs. the held-out split.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_train_pre = lr.predict(X_train)
metrics.accuracy_score(y_train, y_train_pre)  # training-split accuracy
y_test_pre = lr.predict(X_test)
metrics.accuracy_score(y_test, y_test_pre)    # held-out-split accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fit several classifiers on the same split and print train vs. test
# accuracy for each (a large gap indicates overfitting). The repeated
# fit/score boilerplate is collapsed into one loop, and the "accurary"
# typo in the original output strings is fixed to "accuracy".
for model in (SVC(),
              DecisionTreeClassifier(),
              RandomForestClassifier(n_estimators=10),
              KNeighborsClassifier(n_neighbors=3)):
    model.fit(X_train, y_train)
    print("train accuracy:", model.score(X_train, y_train))
    print("test accuracy:", model.score(X_test, y_test))
#E:\Jupyter_workspace\Scikit-learn video learning\监督学习\准备一个更好的训练集.ipynb
以上各个模型均存在过拟合现象(训练集准确率明显高于测试集准确率),这是后续模型优化的一个方向。