## ---- lasso-example ----
# To obtain glmnet, install it directly from CRAN:
# install.packages("glmnet", repos = "http://cran.us.r-project.org")

# load the glmnet package:
library(glmnet)

# The default model used in the package is the Gaussian linear
# model, or "least squares", which we demonstrate in this
# section. We load a set of data created beforehand for
# illustration. Users can either load their own data or use
# those saved in the workspace.
getwd()
load("Bardet.rda")
# The command loads an input matrix x and a response
# vector y from this saved R data archive.

# We fit the model using the most basic call to glmnet.
fit = glmnet(x, y)
# "fit" is an object of class glmnet that contains all the
# relevant information of the fitted model for further use.
# We do not encourage users to extract the components directly.
# Instead, various methods are provided for the object, such
# as plot, print, coef and predict, that enable us to execute
# those tasks more elegantly.

# We can visualize the coefficients by executing the plot function:
plot(fit)
# Each curve corresponds to a variable. It shows the path of
# its coefficient against the l1-norm of the whole
# coefficient vector as lambda varies. The axis above
# indicates the number of nonzero coefficients at the
# current lambda, which is the effective degrees of freedom
# (df) for the lasso. Users may also wish to annotate
# the curves; this can be done by setting label = TRUE
# in the plot command.

# A summary of the glmnet path at each step is displayed
# if we just enter the object name or use
# the print function:
print(fit)
# It shows, from left to right, the number of nonzero
# coefficients (Df), the percent of (null) deviance explained
# (%dev) and the value of lambda (Lambda).
# Although by default glmnet calls for 100 values of
# lambda, the program stops early if %dev does not
# change sufficiently from one lambda to the next
# (typically near the end of the path).

# We can obtain the actual coefficients at one or more lambda's
# within the range of the sequence:
coef0 = coef(fit, s = 0.1)

# The function glmnet returns a sequence of models
# for the users to choose from. In many cases, users
# may prefer the software to select one of them.
# Cross-validation is perhaps the simplest and most
# widely used method for that task.
#
# cv.glmnet is the main function to do cross-validation
# here, along with various supporting methods such as
# plotting and prediction. We still act on the sample
# data loaded before.
cvfit = cv.glmnet(x, y)
# cv.glmnet returns a cv.glmnet object, which is "cvfit"
# here, a list with all the ingredients of the
# cross-validation fit. As for glmnet, we do not
# encourage users to extract the components directly,
# except for viewing the selected values of lambda.
# The package provides well-designed functions
# for potential tasks.

# We can plot the object.
plot(cvfit)
# It includes the cross-validation curve (red dotted line),
# and upper and lower standard deviation curves along the
# lambda sequence (error bars). Two selected lambda's are
# indicated by the vertical dotted lines (see below).

# We can view the selected lambda's and the corresponding
# coefficients. For example,
cvfit$lambda.min
# lambda.min is the value of lambda that gives minimum
# mean cross-validated error. The other lambda saved is
# lambda.1se, which gives the most regularized model
# such that error is within one standard error of
# the minimum. To use that, we only need to replace
# lambda.min with lambda.1se above, for example:
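# A quick sketch of that substitution, using the cvfit object from above:
cvfit$lambda.1se
coef(cvfit, s = "lambda.1se")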
coef1 = coef(cvfit, s = "lambda.min")
# Note that the coefficients are represented in the
# sparse matrix format. The reason is that the
# solutions along the regularization path are
# often sparse, and hence it is more efficient
# in time and space to use a sparse format.
# If you prefer a non-sparse format,
# pipe the output through as.matrix().

# Predictions can be made based on the fitted
# cv.glmnet object. Let's see a toy example.
predict(cvfit, newx = x[1:5, ], s = "lambda.min")
# newx is the new input matrix and s,
# as before, is the value(s) of lambda at which
# predictions are made.

## ---- adaptive-lasso-example ----
################################################
# penalty.factor example
################################################
# The [penalty.factor] argument allows users to apply separate penalty
# factors to each coefficient. Its default is 1 for each parameter,
# but other values can be specified. In particular,
# any variable with penalty.factor equal to zero is not penalized at all!
# Let v_j denote the penalty factor for the j-th variable, so the penalty
# becomes lambda * sum_j v_j * |beta_j|.
# Note that the penalty factors are internally rescaled to sum to nvars.
# This is very useful when people have prior knowledge or
# preference over the variables. In many cases, some
# variables may be so important that one wants to keep
# them all the time, which can be achieved by setting the
# corresponding penalty factors to 0:
p.fac = rep(1, 200)
p.fac[c(31, 174, 151)] = 0
pfit = glmnet(x, y, penalty.factor = p.fac)
plot(pfit, xvar = "lambda", label = TRUE)
# We see from the labels that the three variables with 0
# penalty factors always stay in the model, while the
# others follow typical regularization paths and are
# eventually shrunken to 0.

################################################
# Adaptive lasso example
################################################
# Some other useful arguments: [exclude] allows one to block
# certain variables from being in the model at all. Of course,
# one could simply subset these out of x, but sometimes
# exclude is more useful, since it returns a full vector
# of coefficients, just with the excluded ones set to zero.
# There is also an intercept argument which defaults to
# TRUE; if FALSE the intercept is forced to be zero.

## The adaptive lasso needs a first stage that is consistent.
## Zou (2006) recommends OLS or ridge; here we use the lasso.
## First stage: lasso
thelasso.cv <- cv.glmnet(x, y, family = "gaussian", alpha = 1)
## Second stage weights from the coefficients of the first stage.
## coef() returns a sparse matrix.
bhat <- as.matrix(coef(thelasso.cv, s = "lambda.1se"))[-1, 1]
if (all(bhat == 0)) {
  ## If bhat is all zero, assign a weight very close to zero to all.
  ## Amounts to penalizing all of the second stage to zero.
  bhat <- rep(.Machine$double.eps * 2, length(bhat))
}
## the adaptive lasso weights
adpen <- 1 / pmax(abs(bhat), .Machine$double.eps)
## Second stage lasso (the adaptive lasso)
m_adlasso <- glmnet(x, y, family = "gaussian", alpha = 1,
                    exclude = which(bhat == 0), penalty.factor = adpen)
plot(m_adlasso)
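## One could also cross-validate the second stage to pick a lambda for the
## adaptive lasso. This is only a sketch, reusing the weights and exclusions
## computed above; cv.glmnet passes extra arguments on to glmnet, and the
## object name m_adlasso.cv is just for illustration.
m_adlasso.cv <- cv.glmnet(x, y, family = "gaussian", alpha = 1,
                          exclude = which(bhat == 0), penalty.factor = adpen)
coef(m_adlasso.cv, s = "lambda.min")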
## ---- elastic-net ----
################################################
# Elastic net example
################################################
# glmnet provides various options for users to customize
# the fit. We introduce some commonly used options here;
# they can be specified in the glmnet function.
# [family = "gaussian"] is the default family option in
# the function glmnet.
# [alpha] is the elastic-net mixing parameter alpha,
# with range alpha in [0,1]. alpha = 1 is the lasso
# (default) and alpha = 0 is the ridge.
# [nlambda] is the number of lambda values in the sequence.
# Default is 100.

# As an example, we set alpha = 0.2 (closer to ridge regression)
# and give double weight to the latter half of the observations.
# To avoid too long a display here, we set nlambda to 20.
# In practice, however, the number of values of lambda is
# recommended to be 100 (the default) or more. In most cases,
# it does not come with extra cost because of the warm starts
# used in the algorithm, and for nonlinear models it leads to
# better convergence properties.
wts = c(rep(1, floor(nrow(x) / 2)), rep(2, ceiling(nrow(x) / 2)))
fit = glmnet(x, y, alpha = 0.2, weights = wts, nlambda = 20, family = "gaussian")
plot(fit, xvar = "lambda", label = TRUE)

## ---- group-lasso ----
################################################
# Group lasso example
################################################
# install gglasso directly from CRAN if needed:
# install.packages("gglasso", repos = "http://cran.us.r-project.org")
# load the gglasso library
library(gglasso)
# load the bardet data set
data(bardet)
# define 20 groups of 5 variables each
group1 <- rep(1:20, each = 5)
# fit group lasso penalized least squares
m1 <- gglasso(x = bardet$x, y = bardet$y, group = group1, loss = "ls")
plot(m1)
# 5-fold cross-validation using group lasso
# penalized ls regression
cv <- cv.gglasso(x = bardet$x, y = bardet$y, group = group1, loss = "ls",
                 pred.loss = "L2", lambda.factor = 0.05, nfolds = 5)
plot(cv)
# load the colon data set
data(colon)
# define the group index
group2 <- rep(1:20, each = 5)
# fit group lasso penalized logistic regression
m2 <- gglasso(x = colon$x, y = colon$y, group = group2, loss = "logit")
plot(m2)
# 5-fold cross-validation using group lasso
# penalized logistic regression
cv2 <- cv.gglasso(x = colon$x, y = colon$y, group = group2, loss = "logit",
                  pred.loss = "misclass", lambda.factor = 0.05, nfolds = 5)
plot(cv2)
# the coefficients at lambda = lambda.1se
pre = coef(cv$gglasso.fit, s = cv$lambda.1se)
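# Predictions from a fitted group-lasso path work much like in glmnet.
# A sketch for the logistic fit above (type = "class" returns predicted labels):
predict(cv2$gglasso.fit, newx = colon$x[1:5, ], s = cv2$lambda.1se, type = "class")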
## ---- simulation-demo-1 ----
# Compare the lasso, adaptive lasso, SCAD and
# MCP in least squares and logistic regression.
#
# Load R libraries:
# glmnet: for lasso, adaptive lasso and elastic-net
#   penalized least squares and logistic regression (it also
#   supports Poisson, multinomial and Cox models)
# ncvreg: for SCAD and MCP penalized least squares and logistic regression
library(glmnet)
library(ncvreg)
library(MASS)

##############################
# PART I Least squares
n = 100
p = 200
# true beta
truebeta <- c(4, 4, 4, -6 * sqrt(2), 4/3, rep(0, p - 5))
# error variance
sigma2 <- 0.3
# The covariance between X_j and X_k is cov(X_j, X_k) = rho^|j-k|
# covariance matrix
covmat <- function(rho, p) {
  rho^(abs(outer(seq(p), seq(p), "-")))
}
# rho = 0.1, generate the covariance matrix for X
sigma <- covmat(0.1, p)
# X ~ N(0, sigma)
# epsilon ~ N(0, sigma2)
# The true model: y = x * truebeta + epsilon
x <- mvrnorm(n, rep(0, p), sigma)
epsilon <- rnorm(n, 0, sd = sqrt(sigma2))
y <- x %*% truebeta + epsilon

# fit the lasso and use five-fold CV to select lambda
cvfit <- cv.glmnet(x = x, y = y, alpha = 1, family = "gaussian", nfolds = 5)
plot(cvfit)
# lambda selected by CV
cvfit$lambda.min
# plot the solution paths
plot(cvfit$glmnet.fit, label = TRUE, xvar = "lambda")
# mark lambda.min
abline(v = log(cvfit$lambda.min), lty = 1)
# mark lambda.1se
abline(v = log(cvfit$lambda.1se), lty = 2)

model_compare <- matrix(NA, nrow = 5, ncol = p,
                        dimnames = list(c("true model", "lasso", "adaptive lasso", "mcp", "scad"),
                                        paste("V", seq(p), sep = "")))
# save the true model
model_compare[1, ] <- truebeta

## coefficients estimated by the lasso
cvfit <- cv.glmnet(x = x, y = y, alpha = 1, family = "gaussian")
tmp <- cvfit$glmnet.fit$beta
lasso_beta <- as.matrix(tmp[, cvfit$lambda == cvfit$lambda.min])
model_compare[2, ] <- lasso_beta

# Compute the weights and fit an adaptive lasso
weight = 1 / (lasso_beta)^2
# Some estimated coefficients are zero, so the corresponding weights are Inf;
# to prevent numerical error, convert Inf to a large number (e.g. 1e6)
weight[weight == Inf] = 1e6
cvfit <- cv.glmnet(x = x, y = y, alpha = 1, family = "gaussian",
                   nfolds = 5, penalty.factor = weight)
## coefficients estimated by the adaptive lasso
tmp <- cvfit$glmnet.fit$beta
adaptive_lasso_beta <- as.matrix(tmp[, cvfit$lambda == cvfit$lambda.min])
model_compare[3, ] <- adaptive_lasso_beta

## coefficients estimated by MCP
cvfit <- cv.ncvreg(X = x, y = y, penalty = "MCP", family = "gaussian")
mcp_beta <- cvfit$fit$beta[, cvfit$min]
model_compare[4, ] <- mcp_beta[-1]

## coefficients estimated by SCAD
cvfit <- cv.ncvreg(X = x, y = y, penalty = "SCAD", family = "gaussian")
scad_beta <- cvfit$fit$beta[, cvfit$min]
model_compare[5, ] <- scad_beta[-1]

# Compare the estimated coefficients from the four methods.
# We see that the lasso over-selects, while the adaptive lasso, SCAD and MCP fix the problem.
model_compare
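# A quick numeric summary of the comparison above (a sketch using base R):
# the number of nonzero coefficients each method keeps, and the l2 distance
# of each estimate from the true coefficient vector.
rowSums(model_compare != 0)
sqrt(rowSums((model_compare - matrix(truebeta, 5, p, byrow = TRUE))^2))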
## ---- simulation-demo-2 ----
##############################
# PART II Logistic regression
# generate data
n <- 200
p <- 8
# true beta
truebeta <- c(6, 3.5, 0, 5, rep(0, p - 4))
truebeta
# generate x and y from the true model
# The true model: P(y = 1 | x) = exp(x * truebeta) / (exp(x * truebeta) + 1)
x <- matrix(rnorm(n * p), n, p)
feta <- x %*% truebeta
fprob <- ifelse(feta < 0, exp(feta) / (1 + exp(feta)), 1 / (1 + exp(-feta)))
y <- rbinom(n, 1, fprob)

model_compare <- matrix(NA, nrow = 5, ncol = p,
                        dimnames = list(c("true model", "lasso", "adaptive lasso", "mcp", "scad"),
                                        paste("V", seq(p), sep = "")))
# save the true model
model_compare[1, ] <- truebeta

# lasso case
# cv.glmnet fits the lasso model and uses cross-validation for lambda selection
# family = "binomial": logistic regression
# family = "gaussian": least squares
# alpha controls the elastic-net mixing:
#   alpha = 1: lasso
#   alpha = 0: ridge
#   alpha in (0,1): elastic net
# nfolds = 5: five-fold cross-validation (CV)
cvfit <- cv.glmnet(x = x, y = y, alpha = 1, family = "binomial", nfolds = 5)
# make a plot of the CV result:
# the left vertical line (lambda.min) corresponds to the lambda
# that gives the smallest deviance;
# the right vertical line (lambda.1se) corresponds to the lambda
# from the one-standard-error rule
plot(cvfit)
# plot the solution paths
plot(cvfit$glmnet.fit, label = TRUE, xvar = "lambda")
# mark lambda.min
abline(v = log(cvfit$lambda.min), lty = 1)
# mark lambda.1se
abline(v = log(cvfit$lambda.1se), lty = 2)

# save the lasso coefficients from the solution path; each column
# represents an estimate for one lambda value
tmp <- cvfit$glmnet.fit$beta
tmp
# the beta that corresponds to the lambda.min selected by CV
lasso_beta <- as.matrix(tmp[, cvfit$lambda == cvfit$lambda.min])
model_compare[2, ] <- lasso_beta

# Compute the weights and fit an adaptive lasso
weight = 1 / (lasso_beta)^2
# Some estimated coefficients are zero, so the corresponding weights are Inf;
# to prevent numerical error, convert Inf to a large number (e.g. 1e6)
weight[weight == Inf] = 1e6
cvfit <- cv.glmnet(x = x, y = y, alpha = 1, family = "binomial",
                   nfolds = 5, penalty.factor = weight)
## coefficients estimated by the adaptive lasso
tmp <- cvfit$glmnet.fit$beta
adaptive_lasso_beta <- as.matrix(tmp[, cvfit$lambda == cvfit$lambda.min])
model_compare[3, ] <- adaptive_lasso_beta

## coefficients estimated by MCP
cvfit <- cv.ncvreg(X = x, y = y, penalty = "MCP", family = "binomial")
mcp_beta <- cvfit$fit$beta[, cvfit$min]
model_compare[4, ] <- mcp_beta[-1]

## coefficients estimated by SCAD
cvfit <- cv.ncvreg(X = x, y = y, penalty = "SCAD", family = "binomial")
scad_beta <- cvfit$fit$beta[, cvfit$min]
model_compare[5, ] <- scad_beta[-1]

# compare the estimated coefficients from the four methods
model_compare
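# Which variables does each method select? A quick sketch; from the truebeta
# defined above, the truly nonzero coefficients are V1, V2 and V4.
apply(model_compare != 0, 1, which)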