#------------------------------------------------------------------- # 3.1 MODEL SELECTION #------------------------------------------------------------------- m <- matrix(scan("/glim/datasets/rents1"), ncol = 4, byrow = T) > m [,1] [,2] [,3] [,4] [1,] 1 1.5 170 215 [2,] 1 2.5 200 450 [3,] 1 3.5 230 295 [4,] 1 3.5 285 325 [5,] 1 3.5 375 295 [6,] 1 3.5 285 225 [7,] 1 3.5 400 173 [8,] 1 3.5 250 300 [9,] 1 4.5 270 375 [10,] 1 4.5 375 370 [11,] 1 4.5 330 460 [12,] 2 3.5 395 425 [13,] 2 3.5 350 325 [14,] 2 3.5 270 370 [127,] 9 5.5 485 410 [128,] 9 6.5 850 725 [129,] 9 7.5 850 715 > dist <- factor(m[, 1]) > room <- m[, 2] > rntc <- m[, 3] > rntv <- m[, 4] > par(mfrow = c(2, 2)) > plot(dist, rntc, main = "Figure 3.1 Rent vs. district") > plot(room, rntc, main = "Figure 3.2 Rent vs. rooms") > sse <- rep(0, 10) > p <- sse > msep1 <- sse > msep2 <- sse > glm <- glm(rntc ~ 1) > glm Call: glm(formula = rntc ~ 1) Coefficients: (Intercept) 386.4186 Degrees of Freedom: 129 Total; 128 Residual Residual Deviance: 2496537 > sse[1] <- glm$deviance > p[1] <- length(glm$coefficients) > msep1[1] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[1] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ room) > glm Call: glm(formula = rntc ~ room) Coefficients: (Intercept) room 88.36561 68.47522 Degrees of Freedom: 129 Total; 127 Residual Residual Deviance: 1389022 > sse[2] <- glm$deviance > p[2] <- length(glm$coefficients) > msep1[2] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[2] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ dist) > glm Call: glm(formula = rntc ~ dist) Coefficients: (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 388.819 52.40909 38.8447 -4.650568 -4.06415 2.531367 -4.780934 dist7 dist8 -5.924986 26.27263 Degrees of Freedom: 129 Total; 120 Residual Residual Deviance: 1691595 > sse[3] <- glm$deviance > p[3] <- length(glm$coefficients) > msep1[3] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[3] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ dist + room) > glm Call: glm(formula = rntc ~ dist + room) Coefficients: (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 96.50139 38.90942 32.09486 -20.68143 -12.7184 -2.251958 -5.943476 dist7 dist8 room -3.783573 18.93839 67.49836 Degrees of Freedom: 129 Total; 119 Residual Residual Deviance: 735829.1 > sse[4] <- glm$deviance > p[4] <- length(glm$coefficients) > msep1[4] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[4] <- sum((glm$residuals/(1 - vl/sc))^2) > room2 <- room^2 > glm <- glm(rntc ~ dist + room + room2) > glm Call: glm(formula = rntc ~ dist + room + room2) Coefficients: (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 170.4111 41.15573 30.47321 -20.31532 -13.56629 -2.006493 -5.570079 dist7 dist8 room room2 -3.896641 18.91089 32.00944 3.923987 Degrees of Freedom: 129 Total; 118 Residual Residual Deviance: 721066.7 > sse[5] <- glm$deviance > p[5] <- length(glm$coefficients) > msep1[5] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[5] <- sum((glm$residuals/(1 - vl/sc))^2) > Room <- factor(room) > glm <- glm(rntc ~ dist + room + room2 + Room) > glm Call: glm(formula = rntc ~ dist + room + room2 + Room) Coefficients: (2 not defined because of singularities) (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 -275.7035 38.82937 31.73855 -18.6531 -14.37587 -1.441776 -5.995388 dist7 dist8 room room2 Room1 Room2 Room3 -3.531613 19.17884 234.4999 -15.36088 -88.8612 -40.37063 -33.2221 Room4 Room5 Room6 -27.94773 NA NA Degrees of Freedom: 129 Total; 114 Residual Residual Deviance: 673898.2 > sse[6] <- glm$deviance > p[6] <- length(glm$coefficients) > msep1[6] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[6] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ dist + room + dist:room) > glm Call: glm(formula = rntc ~ dist + room + dist:room) Coefficients: (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 47.63733 -118.9347 60.74293 -29.08606 9.955246 22.19292 1.833318 dist7 dist8 room dist1room dist2room dist3room dist4room 20.20184 -17.13752 77.43597 41.14583 -7.176816 0.9403115 -5.2848 dist5room dist6room dist7room dist8room -5.570043 -1.88576 -5.977849 6.870091 Degrees of Freedom: 129 Total; 111 Residual Residual Deviance: 628708.6 > sse[7] <- glm$deviance > p[7] <- length(glm$coefficients) > msep1[7] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[7] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ dist + Room + dist:room) > glm Call: glm(formula = rntc ~ dist + Room + dist:room) Coefficients: (1 not defined because of singularities) (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 624.5589 -147.4465 61.78422 -37.31161 19.33201 16.96856 -0.3743489 dist7 dist8 Room1 Room2 Room3 Room4 Room5 Room6 17.93636 -14.72725 35.08207 77.30663 69.6349 61.62162 78.65954 64.45962 dist1room dist2room dist3room dist4room dist5room dist6room dist7room -83.45259 12.74035 -56.3697 -30.00452 -76.58234 -72.44228 -61.17556 dist8room dist9room -94.48416 NA Degrees of Freedom: 129 Total; 106 Residual Residual Deviance: 576393.5 > sse[8] <- glm$deviance > p[8] <- length(glm$coefficients) > msep1[8] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[8] <- sum((glm$residuals/(1 - vl/sc))^2) > glm <- glm(rntc ~ dist + Room + dist:room + dist:room2) > glm Call: glm(formula = rntc ~ dist + Room + dist:room + dist:room2) Coefficients: (3 not defined because of singularities) (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 1156.411 -222.9261 100.2875 35.98757 71.00673 24.50395 69.1886 dist7 dist8 Room1 Room2 Room3 Room4 Room5 Room6 20.55217 -92.62196 111.9196 143.7912 124.758 97.77201 99.1814 70.37783 dist1room dist2room dist3room dist4room dist5room dist6room dist7room -224.9935 -74.7829 -249.7592 -327.8935 -403.5763 -320.2131 -488.7992 dist8room dist9room dist1room2 dist2room2 dist3room2 dist4room2 -372.2498 NA 3.891184 NA 15.40776 26.84546 dist5room2 dist6room2 dist7room2 dist8room2 dist9room2 30.68451 22.04969 38.6687 25.88491 NA Degrees of Freedom: 129 Total; 99 Residual Residual Deviance: 531540.8 > sse[9] <- glm$deviance > p[9] <- length(glm$coefficients) > msep1[9] <- sum((rntv - glm$fitted.values)^2) > vl <- predict.glm(glm, se.fit = T)$se.fit^2 > sc <- summary(glm)$dispersion > msep2[9] <- sum((glm$residuals/(1 - vl/sc))^2) > sse[10] <- 0 > p[10] <- length(rntc) > msep1[10] <- sum((rntv - rntc)^2) > msep2[10] <- NA > msep3 <- sse + 2 * p * sc > cbind(p, sse, msep1, msep2, msep3) p sse msep1 msep2 msep3 [1,] 1 2496537.4 2512488 2535698.2 2507275.6 [2,] 2 1389022.3 1442669 1434300.7 1410498.7 [3,] 9 1691595.1 2282656 1938382.6 1788238.9 [4,] 10 735829.1 1294490 864764.3 843211.0 [5,] 11 721066.7 1293582 859487.7 839186.9 [6,] 17 673898.2 1343720 880915.9 856447.6 [7,] 18 628708.6 1355654 838805.7 821996.2 [8,] 24 576393.5 1448617 887167.3 834110.3 [9,] 33 531540.8 1411650 Inf 885901.4 [10,] 129 0.0 2054300 NA 1385227.7 > m <- matrix(scan("/glim/datasets/rents2"), ncol = 3, byrow = T) > dist <- factor(m[, 1]) > room <- m[, 2] > rent <- m[, 3] > w <- c(rep(1, 129), rep(0, 129)) > glm <- glm(rent ~ dist + room, weight = w) > glm Call: glm(formula = rent ~ dist + room, weights = w) Coefficients: (Intercept) dist1 dist2 dist3 dist4 dist5 dist6 96.50139 38.90942 32.09486 -20.68143 -12.7184 -2.251958 -5.943476 dist7 dist8 room -3.783573 18.93839 67.49836 Degrees of Freedom: 258 Total; 119 Residual Residual Deviance: 735829.1 > cbind(rent, glm$fitted.values, glm$residuals) rent 1 170 153.1851 16.81490071 2 200 220.6835 -20.68345873 3 230 288.1818 -58.18181818 4 285 288.1818 -3.18181818 5 375 288.1818 86.81818182 256 655 619.2495 35.75049217 257 515 619.2495 -104.24950783 258 790 686.7479 103.25213272 > msep <- sum((1 - w) * glm$residuals^2) > msep [1] 1357235 #------------------------------------------------------------------- # 3.2 STEIN SHRINKAGE: PREDICTING HOCKEY SCORES #------------------------------------------------------------------- > m <- matrix(scan("/glim/datasets/nhl8586"), ncol = 2, byrow = T) > m [,1] [,2] [1,] 82 86 [2,] 90 80 [3,] 94 89 [4,] 83 86 [5,] 66 40 [6,] 109 119 [7,] 69 84 [8,] 82 54 [9,] 62 85 [10,] 94 87 [11,] 54 59 [12,] 86 90 [13,] 62 78 [14,] 113 110 [15,] 53 76 [16,] 91 92 [17,] 86 85 [18,] 48 57 [19,] 59 59 [20,] 101 107 [21,] 96 59 > ycal <- m[, 1] > yval <- m[, 2] > plot(ycal, yval, main = "Figure 3.3 Hockey") > abline(0, 1) > msep <- sum((ycal - yval)^2) > msep [1] 4836 > glm <- glm(yval ~ ycal) > glm Call: glm(formula = yval ~ ycal) Coefficients: (Intercept) ycal 23.48999 0.7075656 Degrees of Freedom: 21 Total; 19 Residual Residual Deviance: 4223.159 > abline(coef(glm), lty = 2) > mean(ycal) [1] 80 > abline(80, 0, lty = 3) > msep <- sum((80 - yval)^2) > msep [1] 7810 > m <- matrix(scan("/glim/datasets/nhl82_84"), ncol = 3, byrow = T) > m [,1] [,2] [,3] [1,] 96 110 104 [2,] 93 89 103 [3,] 75 78 82 [4,] 72 104 68 [5,] 54 57 69 [6,] 111 106 119 [7,] 60 45 66 [8,] 63 66 59 [9,] 94 96 88 [10,] 109 98 75 [11,] 49 48 41 [12,] 118 96 104 [13,] 92 80 93 [14,] 87 106 98 [15,] 75 45 38 [16,] 82 80 94 [17,] 72 65 71 [18,] 56 68 61 [19,] 77 75 73 [20,] 65 94 101 [21,] 80 74 73 > y1 <- m[, 1] > y2 <- m[, 2] > y3 <- m[, 3] > ym <- (y1 + y2 + y3)/3 > sigma2 <- sum((y1 - ym)^2 + (y2 - ym)^2 + (y3 - ym)^2)/(2 * 20) > sigma2 [1] 111 > f <- sum((ycal - 80)^2)/20/sigma2 > f [1] 3.227027 > k <- 1 - ((1/f * 18)/20 * 40)/42 > k [1] 0.7343862 > yshrink <- (1 - k) * 80 + k * ycal > abline((1 - k) * 80, k, lty = 4) > legend(47, 120, c("Calib", "Regres", "80", "Shrink"), lty = 1:4) > cbind(ycal, yshrink, yval) ycal yshrink yval [1,] 82 81.46877 86 [2,] 90 87.34386 80 [3,] 94 90.28141 89 [4,] 83 82.20316 86 [5,] 66 69.71859 40 [6,] 109 101.29720 119 [7,] 69 71.92175 84 [8,] 82 81.46877 54 [9,] 62 66.78105 85 [10,] 94 90.28141 87 [11,] 54 60.90596 59 [12,] 86 84.40632 90 [13,] 62 66.78105 78 [14,] 113 104.23475 110 [15,] 53 60.17157 76 [16,] 91 88.07825 92 [17,] 86 84.40632 85 [18,] 48 56.49964 57 [19,] 59 64.57789 59 [20,] 101 95.42211 107 [21,] 96 91.75018 59 > msep <- sum((yshrink - yval)^2) > msep [1] 4228.503