Chapter 7 Logistic Regression
7.1 Basic Concepts
- Dependent variable (Y): a categorical variable;
- Independent variables (X): continuous or discrete variables;
- Binary classification models: e.g., email filtering (ham vs. spam) and disease diagnosis (whether a patient has Covid-19).
7.2 The Logistic Model
Odds: \(\text{odds}=\frac{p(x)}{1-p(x)}\). For example, \(p(x)=0.8\) gives \(\text{odds}=0.8/0.2=4\).
Sigmoid function: \(\sigma(z)=\frac{1}{1+e^{-z}}\), which maps any real \(z\) to a probability in \((0,1)\).
library(ggplot2)
x <- seq(-10, 10, 0.01)
sigmoid <- function(x){
  1/(1 + exp(-x))
}
y <- sigmoid(x)
df_sig <- data.frame(x = x, y = y)
# plot(x, sigmoid(x), col = "blue")  # base-R alternative
ggplot(data = df_sig, aes(x = x, y = y)) +
  geom_line(color = "blue") +
  annotate("text", x = 2, y = 0.3, parse = TRUE,
           label = "frac(1,1+e^{-z})") +
  labs(title = expression(paste("sigmoid function ", sigma(z) == frac(1, 1 + e^{-z})))) +
  geom_hline(aes(yintercept = 0.5), color = "red", linetype = "dashed")
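Two properties visible in the plot are worth checking numerically: \(\sigma(0)=0.5\) (the red dashed decision threshold) and the symmetry \(\sigma(-z)=1-\sigma(z)\). A minimal check, reusing the sigmoid() defined above:
sigmoid(0)                               # 0.5, the decision threshold
all.equal(sigmoid(-2), 1 - sigmoid(2))   # TRUE: symmetric about (0, 0.5)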
- Logistic regression model: a machine-learning algorithm for classification, used mainly for binary classification problems. The predicted class is determined by the conditional probability together with a decision boundary: \[p(x)=P(y=1|X=x)=\frac{1}{1+e^{-X\beta}}\] Equivalently, the log-odds is linear in the predictors: \(\log\frac{p(x)}{1-p(x)}=X\beta\).
7.3 Model Estimation
7.3.1 Loss Function and Cost Function
For a single training example, the cross-entropy loss is \[L(\hat{y},y)=-\big(y\log(\hat{y})+(1-y)\log(1-\hat{y})\big)\]
Averaging the loss over all \(m\) training examples gives the cost function \[J(\theta)=\frac{1}{m}\sum\limits_{i=1}^mL(\hat{y}^{(i)},y^{(i)})\]
7.3.2 Model Estimation
The parameters are estimated by gradient descent: each \(\theta_j\) is repeatedly moved against the gradient of the cost, with learning rate \(\alpha\):
\[\theta_j:=\theta_j-\alpha\frac{\partial}{\partial \theta_j}J(\theta)\]
For logistic regression the partial derivative takes the simple form \(\frac{\partial}{\partial \theta_j}J(\theta)=\frac{1}{m}\sum\limits_{i=1}^m(\hat{y}^{(i)}-y^{(i)})x_j^{(i)}\); a minimal implementation sketch is given below.
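The sketch below is illustrative only (the function logreg_gd and its defaults are mine, not from the text); in practice R's glm(), used in the next subsection, fits the model by iteratively reweighted least squares rather than plain gradient descent.
# a minimal batch-gradient-descent sketch for logistic regression
sigmoid <- function(z) 1/(1 + exp(-z))
logreg_gd <- function(X, y, alpha = 0.1, n_iter = 5000){
  X <- cbind(1, X)                    # prepend an intercept column
  theta <- rep(0, ncol(X))            # start every parameter at 0
  m <- nrow(X)
  for(i in 1:n_iter){
    y_hat <- sigmoid(X %*% theta)     # current predicted probabilities
    grad <- t(X) %*% (y_hat - y) / m  # (1/m) * sum_i (y_hat - y) * x_j
    theta <- theta - alpha * grad     # simultaneous update of all theta_j
  }
  as.vector(theta)
}
# quick check on simulated data: estimates should approach c(1, 2, -1)
set.seed(1)
X_sim <- matrix(rnorm(2000), ncol = 2)
y_sim <- rbinom(1000, 1, sigmoid(1 + X_sim %*% c(2, -1)))
logreg_gd(X_sim, y_sim)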
7.3.3 A Model on Simulated Data
library(ggplot2)
# simulate 1000 observations from a known logistic model
set.seed(520)
x1 <- rnorm(1000, 0, 1)
x2 <- rnorm(1000, 0, 1)
z <- 3 + 2*x1 + 5*x2        # true linear predictor
pr <- 1/(1 + exp(-z))       # true success probabilities
y <- rbinom(1000, 1, pr)

data_logreg <- data.frame(y = y, x1 = x1, x2 = x2)
data_logreg$y_factor <- as.factor(data_logreg$y)
head(data_logreg)
## y x1 x2 y_factor
## 1 0 -1.07545511 -0.7586247 0
## 2 1 -1.28608682 0.7911798 1
## 3 1 -1.21028388 0.7932912 1
## 4 1 0.08780203 -0.1222843 1
## 5 1 -0.48698853 0.6076338 1
## 6 1 2.39494281 -0.9934082 1
# visualize data
ggplot(data_logreg,aes(x=x1,y=y,color=y_factor))+
geom_point()+
stat_smooth(method="glm",color="blue",se=FALSE,method.args=list(family=binomial))
## `geom_smooth()` using formula 'y ~ x'
# model estimate
log_fit <- glm(y_factor ~ x1 + x2, family = binomial(link = "logit"), data = data_logreg)
summary(log_fit)
##
## Call:
## glm(formula = y_factor ~ x1 + x2, family = binomial(link = "logit"),
## data = data_logreg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5699 -0.0877 0.0247 0.1788 3.1761
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.3905 0.2848 11.90 <2e-16 ***
## x1 2.1482 0.1878 11.44 <2e-16 ***
## x2 5.7433 0.4348 13.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1276.69 on 999 degrees of freedom
## Residual deviance: 377.26 on 997 degrees of freedom
## AIC: 383.26
##
## Number of Fisher Scoring iterations: 7
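Because the data were simulated from a known model, the estimates can be checked directly against the true coefficients \((\beta_0, \beta_1, \beta_2) = (3, 2, 5)\) used in the simulation:
coef(log_fit)
# about (3.39, 2.15, 5.74) here, close in sign and scale to the true (3, 2, 5)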
# Result analysis
#install.packages("caret")
library(caret)
## Loading required package: lattice
pdata <- predict(log_fit, newdata = data_logreg, type = "response")
head(pdata)
## 1 2 3 4 5 6
## 0.03637469 0.99435703 0.99525852 0.94668772 0.99708267 0.94427164
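As a quick sanity check (a sketch reusing the objects above), type = "response" simply applies the sigmoid to the linear predictor \(X\hat{\beta}\):
# recompute the fitted probabilities by hand from the coefficients
eta <- cbind(1, data_logreg$x1, data_logreg$x2) %*% coef(log_fit)  # linear predictor
p_manual <- as.numeric(1/(1 + exp(-eta)))
all.equal(p_manual, unname(pdata))   # expected: TRUE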
# use caret to calculate the confusion matrix
data_logreg$pre_glm <- ifelse(pdata > 0.5, "1", "0")   # classify at the 0.5 threshold
data_logreg$pre_glm <- as.factor(data_logreg$pre_glm)
head(data_logreg)
## y x1 x2 y_factor pre_glm
## 1 0 -1.07545511 -0.7586247 0 0
## 2 1 -1.28608682 0.7911798 1 1
## 3 1 -1.21028388 0.7932912 1 1
## 4 1 0.08780203 -0.1222843 1 1
## 5 1 -0.48698853 0.6076338 1 1
## 6 1 2.39494281 -0.9934082 1 1
confusionMatrix(data_logreg$pre_glm, data_logreg$y_factor)  # predictions first, then the reference (true) labels
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 295 35
## 1 41 629
##
## Accuracy : 0.924
## 95% CI : (0.9058, 0.9397)
## No Information Rate : 0.664
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8289
##
## Mcnemar's Test P-Value : 0.5663
##
## Sensitivity : 0.8780
## Specificity : 0.9473
## Pos Pred Value : 0.8939
## Neg Pred Value : 0.9388
## Prevalence : 0.3360
## Detection Rate : 0.2950
## Detection Prevalence : 0.3300
## Balanced Accuracy : 0.9126
##
## 'Positive' Class : 0
##
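Note that caret treats the first factor level ("0" here) as the positive class by default, which is why the output reports 'Positive' Class : 0. To compute sensitivity and specificity with the event "1" as the positive class instead, pass the positive argument:
# report statistics with "1" as the positive class
confusionMatrix(data_logreg$pre_glm, data_logreg$y_factor, positive = "1")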
# plot the ROC curve
# install.packages("ROCR")
library(ROCR)
pred <- prediction(pdata, data_logreg$y_factor)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col = rainbow(2), main = "ROC curve",
     xlab = "False positive rate (1 - Specificity)",
     ylab = "True positive rate (Sensitivity)")
abline(0, 1)   # 45-degree reference line (a random classifier)
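The area under the ROC curve (AUC) summarizes discrimination in a single number (1 = perfect, 0.5 = random guessing); ROCR computes it from the same prediction object built above:
# extract the AUC from the ROCR prediction object
auc <- performance(pred, measure = "auc")
auc@y.values[[1]]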
7.3.4 A Logistic Regression Example
- Create a small hand-made dataset
# create a small hand-made dataset
df_logsim <- data.frame(x1 = c(8, 3, 4, 5, 9), x2 = c(6, 5, 9, 8, 9), y = factor(c(1, 0, 0, 1, 1)))
head(df_logsim)
## x1 x2 y
## 1 8 6 1
## 2 3 5 0
## 3 4 9 0
## 4 5 8 1
## 5 9 9 1
# visualize the data
library(ggplot2)
ggplot(data = df_logsim, aes(x = x1, y = x2, color = y, shape = y)) +
  geom_point(size = 3)
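A natural next step is to fit the model with glm() as in the previous section. One caveat worth knowing for this particular toy dataset (an observation of mine, not from the text): the two classes are perfectly separable on x1 alone (y = 1 exactly when x1 >= 5), so maximum likelihood breaks down:
# fit a logistic regression on the five hand-made points
fit_sim <- glm(y ~ x1 + x2, family = binomial(link = "logit"), data = df_logsim)
# expect a warning that fitted probabilities numerically 0 or 1 occurred:
# with perfect separation the coefficient estimates diverge
predict(fit_sim, type = "response")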