> #------------------------------------------------------------------- > # 3.1 MODEL SELECTION > #------------------------------------------------------------------- > > m<-matrix(scan("/glim/datasets/rents1"),ncol=4,byrow=T) Read 516 items > m [,1] [,2] [,3] [,4] [1,] 1 1.5 170 215 [2,] 1 2.5 200 450 [3,] 1 3.5 230 295 [4,] 1 3.5 285 325 [5,] 1 3.5 375 295 [6,] 1 3.5 285 225 [7,] 1 3.5 400 173 [8,] 1 3.5 250 300 [9,] 1 4.5 270 375 [10,] 1 4.5 375 370 [126,] 9 5.5 800 575 [127,] 9 5.5 485 410 [128,] 9 6.5 850 725 [129,] 9 7.5 850 715 > dist<-factor(m[,1]) > room<-m[,2] > rntc<-m[,3] > rntv<-m[,4] > par(mfrow=c(2,2)) > plot(dist,rntc,main="Figure 3.1 Rent vs. district") > plot(room,rntc,main="Figure 3.2 Rent vs. rooms") > > sse<-rep(0,10) > p<-sse > msep1<-sse > msep2<-sse > > glm<-glm(rntc~1); glm Call: glm(formula = rntc ~ 1) Coefficients: (Intercept) 386.4 Degrees of Freedom: 128 Total (i.e. Null); 128 Residual Null Deviance: 2497000 Residual Deviance: 2497000 AIC: 1643 > sse[1]<-deviance(glm); p[1]<-length(coef(glm)) > msep1[1]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[1]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~room); glm Call: glm(formula = rntc ~ room) Coefficients: (Intercept) room 88.37 68.48 Degrees of Freedom: 128 Total (i.e. Null); 127 Residual Null Deviance: 2497000 Residual Deviance: 1389000 AIC: 1570 > sse[2]<-deviance(glm); p[2]<-length(coef(glm)) > msep1[2]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[2]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~dist); glm Call: glm(formula = rntc ~ dist) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 288.18 104.82 168.94 72.65 66.28 97.73 dist7 dist8 dist9 51.60 32.89 310.82 Degrees of Freedom: 128 Total (i.e. Null); 120 Residual Null Deviance: 2497000 Residual Deviance: 1692000 AIC: 1609 > sse[3]<-deviance(glm); p[3]<-length(coef(glm)) > msep1[3]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[3]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~dist+room); glm Call: glm(formula = rntc ~ dist + room) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 51.9376 77.8188 135.1940 -11.7214 -13.2692 24.0927 dist7 dist8 dist9 room -6.2518 -0.8596 196.0710 67.4984 Degrees of Freedom: 128 Total (i.e. Null); 119 Residual Null Deviance: 2497000 Residual Deviance: 735800 AIC: 1504 > sse[4]<-deviance(glm); p[4]<-length(coef(glm)) > msep1[4]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[4]<-sum((resid(glm)/(1-vl/sc))^2) > > room2<-room^2 > glm<-glm(rntc~dist+room+room2); glm Call: glm(formula = rntc ~ dist + room + room2) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 125.226 82.311 132.575 -9.632 -16.518 25.708 dist7 dist8 dist9 room room2 -3.250 -1.002 196.472 32.009 3.924 Degrees of Freedom: 128 Total (i.e. Null); 118 Residual Null Deviance: 2497000 Residual Deviance: 721100 AIC: 1503 > sse[5]<-deviance(glm); p[5]<-length(coef(glm)) > msep1[5]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[5]<-sum((resid(glm)/(1-vl/sc))^2) > > Room<-factor(room) > glm<-glm(rntc~dist+room+room2+Room); glm Call: glm(formula = rntc ~ dist + room + room2 + Room) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 -12.050 77.659 134.045 -4.044 -19.965 28.888 dist7 dist8 dist9 room room2 Room2.5 -5.871 1.849 199.180 145.646 -9.014 -114.255 Room3.5 Room4.5 Room5.5 Room6.5 Room7.5 -95.732 -109.799 -124.484 NA NA Degrees of Freedom: 128 Total (i.e. Null); 114 Residual Null Deviance: 2497000 Residual Deviance: 673900 AIC: 1502 > sse[6]<-deviance(glm); p[6]<-length(coef(glm)) > msep1[6]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[6]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~dist+room+dist:room); glm Call: glm(formula = rntc ~ dist + room + dist:room) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 97.869 -237.869 63.294 -174.536 -37.502 55.835 dist7 dist8 dist9 room dist2.room dist3.room -42.296 108.318 -187.332 54.375 82.292 19.615 dist4.room dist5.room dist6.room dist7.room dist8.room dist9.room 37.730 8.485 -3.796 10.854 -25.654 78.022 Degrees of Freedom: 128 Total (i.e. Null); 111 Residual Null Deviance: 2497000 Residual Deviance: 628700 AIC: 1500 > sse[7]<-deviance(glm); p[7]<-length(coef(glm)) > msep1[7]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[7]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~dist+Room+dist:room); glm Call: glm(formula = rntc ~ dist + Room + dist:room) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 321.633 -294.893 37.906 -234.909 -26.314 -1.830 dist7 dist8 dist9 Room2.5 Room3.5 Room4.5 -89.294 56.443 -201.657 70.164 267.002 390.928 Room5.5 Room6.5 Room7.5 dist1.room dist2.room dist3.room 490.132 715.602 773.522 -83.453 12.740 -56.370 dist4.room dist5.room dist6.room dist7.room dist8.room dist9.room -30.005 -76.582 -72.442 -61.176 -94.484 NA Degrees of Freedom: 128 Total (i.e. Null); 106 Residual Null Deviance: 2497000 Residual Deviance: 576400 AIC: 1498 > sse[8]<-deviance(glm); p[8]<-length(coef(glm)) > msep1[8]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[8]<-sum((resid(glm)/(1-vl/sc))^2) > > glm<-glm(rntc~dist+Room+dist:room+dist:room2); glm Call: glm(formula = rntc ~ dist + Room + dist:room + dist:room2) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 502.632 -445.852 77.936 21.312 268.383 131.379 dist7 dist8 dist9 Room2.5 Room3.5 Room4.5 493.180 242.466 -734.997 223.839 543.293 754.743 Room5.5 Room6.5 Room7.5 dist1.room dist2.room dist3.room 869.329 1073.329 1070.067 -224.994 -74.783 -249.759 dist4.room dist5.room dist6.room dist7.room dist8.room dist9.room -327.894 -403.576 -320.213 -488.799 -372.250 NA dist1.room2 dist2.room2 dist3.room2 dist4.room2 dist5.room2 dist6.room2 3.891 NA 15.408 26.845 30.685 22.050 dist7.room2 dist8.room2 dist9.room2 38.669 25.885 NA Degrees of Freedom: 128 Total (i.e. Null); 99 Residual Null Deviance: 2497000 Residual Deviance: 531500 AIC: 1502 > sse[9]<-deviance(glm); p[9]<-length(coef(glm)) > msep1[9]<-sum((rntv-fitted(glm))^2) > vl<-predict.lm(glm,se.fit=T)$se.fit^2 > sc<-summary(glm)$dispersion > msep2[9]<-sum((resid(glm)/(1-vl/sc))^2) > > sse[10]<-0; p[10]<-length(rntc) > msep1[10]<-sum((rntv-rntc)^2) > msep2[10]<-NA > > msep3<-sse+2*p*sc > aic<-129*(log(2*pi*sse/129)+1)+2*p > cbind(p,sse,msep1,msep2,msep3,aic) p sse msep1 msep2 msep3 aic [1,] 1 2496537.4 2512488 2535698.2 2507275.6 1641.394 [2,] 2 1389022.3 1442669 1434300.7 1410498.7 1567.761 [3,] 9 1691595.1 2282656 1938382.6 1788238.9 1607.183 [4,] 10 735829.1 1294490 864764.3 843211.0 1501.799 [5,] 11 721066.7 1293582 859487.7 839186.9 1501.185 [6,] 17 673898.2 1343720 880915.9 856447.6 1504.458 [7,] 18 628708.6 1355654 838805.7 821996.2 1497.504 [8,] 24 576393.5 1448617 887167.3 834110.3 1498.297 [9,] 33 531540.8 1411650 1494367.6 885901.4 1505.846 [10,] 129 0.0 2054300 NA 1385227.7 -Inf > > > m<-matrix(scan("/glim/datasets/rents2"),ncol=3,byrow=T) Read 774 items > dist<-factor(m[,1]) > room<-m[,2] > rent<-m[,3] > w<-c(rep(1,129),rep(0,129)) > glm<-glm(rent~dist+room,weight=w) > glm Call: glm(formula = rent ~ dist + room, weights = w) Coefficients: (Intercept) dist2 dist3 dist4 dist5 dist6 51.9376 77.8188 135.1940 -11.7214 -13.2692 24.0927 dist7 dist8 dist9 room -6.2518 -0.8596 196.0710 67.4984 Degrees of Freedom: 128 Total (i.e. Null); 119 Residual Null Deviance: 2497000 Residual Deviance: 735800 AIC: 1504 > cbind(rent,fitted(glm),resid(glm)) rent 1 170 153.1851 16.8149007 2 200 220.6835 -20.6834587 3 230 288.1818 -58.1818182 4 285 288.1818 -3.1818182 126 800 619.2495 180.7504922 127 485 619.2495 -134.2495078 128 850 686.7479 163.2521327 129 850 754.2462 95.7537733 130 185 288.1818 -103.1818182 131 320 355.6802 -35.6801776 132 700 423.1785 276.8214629 133 518 490.6769 27.3231035 255 525 551.7511 -26.7511484 256 655 619.2495 35.7504922 257 515 619.2495 -104.2495078 258 790 686.7479 103.2521327 > msep<-sum((1-w)*resid(glm)^2) > msep [1] 1357235 > > > #------------------------------------------------------------------- > # 3.2 STEIN SHRINKAGE: PREDICTING HOCKEY SCORES > #------------------------------------------------------------------- > > m<-matrix(scan("/glim/datasets/nhl8586"),ncol=2,byrow=T) Read 42 items > m [,1] [,2] [1,] 82 86 [2,] 90 80 [3,] 94 89 [4,] 83 86 [5,] 66 40 [6,] 109 119 [7,] 69 84 [8,] 82 54 [9,] 62 85 [10,] 94 87 [11,] 54 59 [12,] 86 90 [13,] 62 78 [14,] 113 110 [15,] 53 76 [16,] 91 92 [17,] 86 85 [18,] 48 57 [19,] 59 59 [20,] 101 107 [21,] 96 59 > ycal<-m[,1] > yval<-m[,2] > plot(ycal,yval,main="Figure 3.3 Hockey") > abline(0,1) > msep<-sum((ycal-yval)^2) > msep [1] 4836 > glm<-glm(yval~ycal) > glm Call: glm(formula = yval ~ ycal) Coefficients: (Intercept) ycal 23.4900 0.7076 Degrees of Freedom: 20 Total (i.e. Null); 19 Residual Null Deviance: 7810 Residual Deviance: 4223 AIC: 177 > abline(coef(glm),lty=2) > > mean(ycal) [1] 80 > abline(80,0,lty=3) > msep<-sum((80-yval)^2) > msep [1] 7810 > > m<-matrix(scan("/glim/datasets/nhl82_84"),ncol=3,byrow=T) Read 63 items > m [,1] [,2] [,3] [1,] 96 110 104 [2,] 93 89 103 [3,] 75 78 82 [4,] 72 104 68 [5,] 54 57 69 [6,] 111 106 119 [7,] 60 45 66 [8,] 63 66 59 [9,] 94 96 88 [10,] 109 98 75 [11,] 49 48 41 [12,] 118 96 104 [13,] 92 80 93 [14,] 87 106 98 [15,] 75 45 38 [16,] 82 80 94 [17,] 72 65 71 [18,] 56 68 61 [19,] 77 75 73 [20,] 65 94 101 [21,] 80 74 73 > y1<-m[,1] > y2<-m[,2] > y3<-m[,3] > ym<-(y1+y2+y3)/3 > sigma2<-sum((y1-ym)^2+(y2-ym)^2+(y3-ym)^2)/(2*20) > sigma2 [1] 111 > f<-sum((ycal-80)^2)/20/sigma2 > f [1] 3.227027 > k<-1-1/f*18/20*40/42 > k [1] 0.7343862 > yshrink<-(1-k)*80+k*ycal > abline((1-k)*80,k,lty=4) > legend(47,120,c("Calib","Regres","80","Shrink"),lty=1:4) > cbind(ycal,yshrink,yval) ycal yshrink yval [1,] 82 81.46877 86 [2,] 90 87.34386 80 [3,] 94 90.28141 89 [4,] 83 82.20316 86 [5,] 66 69.71859 40 [6,] 109 101.29720 119 [7,] 69 71.92175 84 [8,] 82 81.46877 54 [9,] 62 66.78105 85 [10,] 94 90.28141 87 [11,] 54 60.90596 59 [12,] 86 84.40632 90 [13,] 62 66.78105 78 [14,] 113 104.23475 110 [15,] 53 60.17157 76 [16,] 91 88.07825 92 [17,] 86 84.40632 85 [18,] 48 56.49964 57 [19,] 59 64.57789 59 [20,] 101 95.42211 107 [21,] 96 91.75018 59 > msep<-sum((yshrink-yval)^2) > msep [1] 4228.503 >