Chapter Contents Previous Next
 General Statistics Examples

## Example 8.6: Regression of Subsets of Variables

The following example performs regression with variable selection similar to some of the features in the REG procedure.

```      /*       Initialization                               */
/* C,CSAVE the crossproducts matrix                   */
/* N       number of observations                     */
/* K       total number of variables to consider      */
/* L       number of variables currently in model     */
/* IN      a 0-1 vector of whether variable is in     */
/* B       print collects results (L MSE RSQ BETAS )  */

start initial;
n=nrow(x);
k=ncol(x);
k1=k+1;
ik=1:k;
bnames={nparm mse rsquare} ||varnames;

/* Correct by mean, adjust out intercept parameter */
y=y-y[+,]/n;                         /* correct y by mean */
x=x-repeat(x[+,]/n,n,1);             /* correct x by mean */
xpy=x`*y;                                /* crossproducts */
ypy=y`*y;
xpx=x`*x;
free x y;                      /* no longer need the data */

/* Save a copy of crossproducts matrix */
csave=(xpx || xpy) // (xpy`|| ypy);
finish initial;
```

```      /* Forward method */
start forward;
print / "FORWARD SELECTION METHOD";
free bprint;
c=csave;
in=repeat(0,k,1);
l=0;                                /* No variables are in */
dfe=n-1;
mse=ypy/dfe;
sprob=0;
do while(sprob<.15 & l<k);
indx=loc(^in);       /* where are the variables not in? */
cd=vecdiag(c)[indx,];                  /* xpx diagonals */
tsqr=cb#cb/(cd#mse);              /* squares of t tests */
imax=tsqr[<:>,];         /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then

do;                            /* if t-test significant */
ii=indx[,imax];             /* pick most significant */
run swp;                         /* routine to sweep */
run bpr;               /* routine to collect results */
end;
end;
print  bprint[colname=bnames] ;
finish forward;

/* Backward method */
start backward;
print / "BACKWARD ELIMINATION ";
free bprint;
c=csave;
in=repeat(0,k,1);
ii=1:k;
run swp;
sprob=1;
do while(sprob>.15 & l>0);
indx=loc(in);            /* where are the variables in? */
cd=vecdiag(c)[indx,];                  /* xpx diagonals */
cb=c[indx,k1];                               /* bvalues */
tsqr=cb#cb/(cd#mse);              /* squares of t tests */
imin=tsqr[>:<,];         /* location of minimum in indx */
sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if sprob>.15 then
do;                         /* if t-test nonsignificant */
ii=indx[,imin];            /* pick least significant */
run swp;             /* routine to sweep in variable */
run bpr;               /* routine to collect results */
end;
end;
print  bprint[colname=bnames] ;
finish backward;
```

```      /* Stepwise method */
start stepwise;
print /"STEPWISE METHOD";
free bprint;
c=csave;
in=repeat(0,k,1);
l=0;
dfe=n-1;
mse=ypy/dfe;
sprob=0;

do while(sprob<.15 & l<k);
indx=loc(^in);       /* where are the variables not in? */
nindx=loc(in);           /* where are the variables in? */
cd=vecdiag(c)[indx,];                  /* xpx diagonals */
tsqr=cb#cb/cd/mse;                /* squares of t tests */
imax=tsqr[<:>,];         /* location of maximum in indx */
sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
if sprob<.15 then
do;                            /* if t-test significant */
ii=indx[,imax];                 /* find index into c */
run swp;                         /* routine to sweep */
run backstep;           /* check if remove any terms */
run bpr;               /* routine to collect results */
end;
end;
print  bprint[colname=bnames] ;
finish stepwise;

/* Routine to backwards-eliminate for stepwise  */
start backstep;
if nrow(nindx)=0 then return;
bprob=1;
do while(bprob>.15 & l<k);
cd=vecdiag(c)[nindx,];                /* xpx diagonals */
cb=c[nindx,k1];                             /* bvalues */
tsqr=cb#cb/(cd#mse);             /* squares of t tests */
imin=tsqr[>:<,];       /* location of minimum in nindx */
bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
if bprob>.15 then
do;
ii=nindx[,imin];
run swp;
run bpr;
end;
end;
finish backstep;
```

```      /* Search all possible models */
start all;

/* Use method of schatzoff et al. for search technique */
betak=repeat(0,k,k); /* rec. ests. for best l-param model */
msek=repeat(1e50,k,1);     /* record best mse per # parms */
rsqk=repeat(0,k,1);                /* record best rsquare */
ink=repeat(0,k,k);         /* record best set per # parms */
limit=2##k-1;              /* number of models to examine */
c=csave;
in=repeat(0,k,1); /* start out with no variables in model */

do kk=1 to limit;
run ztrail;                 /* find which one to sweep */
run swp;                                /* sweep it in */
bb=bb//(l||mse||rsq||(c[ik,k1]#in)`);
if mse<msek[l,] then
msek[l,]=mse;                       /* record mse */
rsqk[l,]=rsq;                   /* record rsquare */
ink[,l]=in;        /* record which parms in model */
betak[l,]=(c[ik,k1]#in)`;     /* record estimates */
end;
end;
print / "ALL POSSIBLE MODELS" " IN SEARCH ORDER";
print bb[colname=bnames]; free bb;
bprint=ik`||msek||rsqk||betak;
print ,"THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
print bprint[colname=bnames];
/* Mallows CP plot */
cp=msek#(n-ik`-1)/min(msek)-(n-2#ik`);
cp=ik`||cp;
cpname={"nparm" "cp"};
/* output cp out=cp colname=cpname; */
finish all;

/*  Subroutine to find number of           */
/*  trailing zeros in binary number        */
/*  on entry: kk is the number to examine  */
/*  on exit:  ii has the result            */
start ztrail;
ii=1;
zz=kk;
do while(mod(zz,2)=0);
ii=ii+1;
zz=zz/2;
end;
finish ztrail;
```

```      /* Subroutine to sweep in a pivot              */
/* on entry: ii has the position(s) to pivot   */
/* on exit:  in, l, dfe, mse, rsq recalculated */
start swp;
if abs(c[ii,ii])<1e-9 then
do;
print , "FAILURE", c;
stop;
end;
c=sweep(c,ii);
in[ii,]=^in[ii,];
l=sum(in);
dfe=n-1-l;
sse=c[k1,k1];
mse=sse/dfe;
rsq=1-sse/ypy;
finish swp;

/* Subroutine to collect bprint results                   */
/* on entry: l,mse,rsq, and c set up to collect           */
/* on exit:  bprint has another row                       */

start bpr;
bprint=bprint//(l||mse||rsq||(c [ik,k1]#in)`);
finish bpr;

/*              Stepwise Methods                          */
/* After a run to the initial routine, which sets up      */
/* the data, four different routines can be called        */
/* to do four different model-selection methods.          */

start seq;
run initial;          /* initialization                   */
run all;              /* all possible models              */
run forward;          /* foreward selection method        */
run backward;         /* backward elimination method      */
run stepwise;         /* stepwise method                  */
finish seq;
```

```   /*              Data on physical fitness                     */
/* these measurements were made on men involved in a physical*/
/* fitness course at n.c.state univ.  the variables are      */
/* age(years), weight(kg), oxygen uptake rate(ml per kg body */
/* weight per minute), time to run 1.5 miles(minutes), heart */
/* rate while resting, heart rate while running (same time   */
/* oxygen rate measured), and maximum heart rate recorded    */
/* while running. certain values of maxpulse were modified   */
/* for consistency.        data courtesy dr. a.c. linnerud   */

data=
{ 44 89.47  44.609 11.37 62 178 182  ,
40 75.07  45.313 10.07 62 185 185  ,
44 85.84  54.297  8.65 45 156 168  ,
42 68.15  59.571  8.17 40 166 172  ,
38 89.02  49.874  9.22 55 178 180  ,
47 77.45  44.811 11.63 58 176 176  ,
40 75.98  45.681 11.95 70 176 180  ,
43 81.19  49.091 10.85 64 162 170  ,
44 81.42  39.442 13.08 63 174 176  ,
38 81.87  60.055  8.63 48 170 186  ,
44 73.03  50.541 10.13 45 168 168  ,
45 87.66  37.388 14.03 56 186 192  ,
45 66.45  44.754 11.12 51 176 176  ,
47 79.15  47.273 10.60 47 162 164  ,
54 83.12  51.855 10.33 50 166 170  ,
49 81.42  49.156  8.95 44 180 185  ,
51 69.63  40.836 10.95 57 168 172  ,
51 77.91  46.672 10.00 48 162 168  ,
48 91.63  46.774 10.25 48 162 164  ,
49 73.37  50.388 10.08 67 168 168  ,
57 73.37  39.407 12.63 58 174 176  ,
54 79.38  46.080 11.17 62 156 165  ,
52 76.32  45.441  9.63 48 164 166  ,
50 70.87  54.625  8.92 48 146 155  ,
51 67.25  45.118 11.08 48 172 172  ,
54 91.63  39.203 12.88 44 168 172  ,
51 73.71  45.790 10.47 59 186 188  ,
57 59.08  50.545  9.93 49 148 155  ,
49 76.32  48.673  9.40 56 186 188  ,
48 61.24  47.920 11.50 52 170 176  ,
52 82.78  47.467 10.50 53 170 172
};

x=data[,{1 2 4 5 6 7 }];
y=data[,3];
free data;
varnames={age weight runtime rstpulse runpulse maxpulse};
reset fw=8 linesize=90;
run seq;
```
The results are shown below.

 ALL POSSIBLE MODELS IN SEARCH ORDER NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE 1 26.63425 0.092777 -0.31136 0 0 0 0 0 2 25.82619 0.150635 -0.37042 -0.15823 0 0 0 0 1 28.58034 0.026488 0 -0.1041 0 0 0 0 2 7.755636 0.744935 0 -0.02548 -3.2886 0 0 0 3 7.226318 0.770831 -0.17388 -0.05444 -3.14039 0 0 0 2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0 1 7.533843 0.74338 0 0 -3.31056 0 0 0 2 7.798261 0.743533 0 0 -3.28661 -0.00968 0 0 3 7.336089 0.767349 -0.16755 0 -3.07925 -0.04549 0 0 4 7.366649 0.775033 -0.19603 -0.05915 -2.9889 -0.05326 0 0 3 8.037314 0.745111 0 -0.02569 -3.26268 -0.01041 0 0 2 24.91487 0.180607 0 -0.09305 0 -0.27474 0 0 3 20.28031 0.356847 -0.44698 -0.15647 0 -0.32186 0 0 2 21.27632 0.30027 -0.38882 0 0 -0.32286 0 0 1 24.67582 0.159485 0 0 0 -0.27921 0 0 2 23.26003 0.235031 0 0 0 -0.20684 -0.15262 0 3 16.81799 0.466648 -0.52338 0 0 -0.22524 -0.23769 0 4 16.26146 0.503398 -0.56317 -0.12697 0 -0.22981 -0.2246 0 3 23.81815 0.244651 0 -0.06381 0 -0.20843 -0.14279 0 4 7.785151 0.762252 0 -0.01231 -3.16759 0.016669 -0.0749 0 5 6.213174 0.817556 -0.28528 -0.05184 -2.70392 -0.02711 -0.12628 0 4 6.166944 0.81167 -0.26213 0 -2.77733 -0.01981 -0.12874 0 3 7.507972 0.761898 0 0 -3.17665 0.017616 -0.07658 0 2 7.254263 0.761424 0 0 -3.14019 0 -0.07351 0 3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0 4 6.009033 0.816493 -0.27642 -0.04932 -2.77237 0 -0.12932 0 3 7.510162 0.761829 0 -0.01315 -3.13261 0 -0.07189 0 2 25.333 0.166855 0 -0.05987 0 0 -0.19797 0 3 18.63184 0.409126 -0.54408 -0.12049 0 0 -0.28248 0 2 18.97378 0.375995 -0.50665 0 0 0 -0.29382 0 1 24.70817 0.158383 0 0 0 0 -0.2068 0 2 21.60626 0.289419 0 0 0 0 -0.6818 0.571538 3 18.21725 0.422273 -0.4214 0 0 0 -0.57966 0.361557 4 17.29877 0.47172 -0.45243 -0.14944 0 0 -0.61723 0.426862 3 21.41763 0.320779 0 -0.11815 0 0 -0.71745 0.635395 4 6.030105 0.815849 0 -0.05159 -2.9255 0 -0.39529 0.38537 5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908 4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513 3 5.991568 0.809988 0 0 -2.97019 0 -0.37511 0.354219 4 6.208523 0.8104 0 0 -3.00426 0.016412 -0.37778 0.353998 5 5.549941 0.837031 -0.20154 0 -2.7386 -0.01208 -0.34562 0.269064 6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217 5 6.263348 0.816083 0 -0.05091 -2.95182 0.01239 -0.39704 0.384793 4 20.11235 0.385797 0 -0.1194 0 -0.19092 -0.64584 0.609632 5 15.1864 0.554066 -0.47923 -0.1527 0 -0.21555 -0.53045 0.385424 4 16.29247 0.502451 -0.44717 0 0 -0.21266 -0.49323 0.319267 3 20.37729 0.353772 0 0 0 -0.18993 -0.61019 0.545236 2 25.11456 0.174039 0 0 0 -0.25219 0 -0.07364 3 19.2347 0.390007 -0.52736 0 0 -0.26492 0 -0.20024 4 18.80875 0.425607 -0.55881 -0.12604 0 -0.27056 0 -0.17799 3 25.59719 0.188232 0 -0.07874 0 -0.25524 0 -0.05502 4 8.311496 0.746179 0 -0.02053 -3.25232 -0.00393 0 -0.02064 5 7.19584 0.788701 -0.25795 -0.04936 -2.86147 -0.04121 0 -0.08153 4 7.091611 0.783432 -0.23928 0 -2.92597 -0.0339 0 -0.08777 3 8.033673 0.745227 0 0 -3.26805 -0.00193 0 -0.02526 2 7.746932 0.745221 0 0 -3.27232 0 0 -0.02561 3 6.882626 0.78173 -0.22923 0 -3.01222 0 0 -0.09094 4 7.00018 0.786224 -0.24436 -0.04525 -2.97011 0 0 -0.08585 3 8.00441 0.746155 0 -0.02027 -3.26114 0 0 -0.02139 2 28.35356 0.067516 0 -0.07074 0 0 0 -0.12159 3 22.38148 0.290212 -0.54076 -0.11605 0 0 0 -0.24445 2 22.50135 0.259982 -0.5121 0 0 0 0 -0.2637 1 27.71259 0.056046 0 0 0 0 0 -0.13762

 THE BEST MODEL FOR EACH NUMBER OF PARAMETERS NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE 1 7.533843 0.74338 0 0 -3.31056 0 0 0 2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0 3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0 4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513 5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908 6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217

 FORWARD SELECTION METHOD NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE 1 7.533843 0.74338 0 0 -3.31056 0 0 0 2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0 3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0 4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

 BACKWARD ELIMINATION NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE 6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217 5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908 4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

 STEPWISE METHOD NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE 1 7.533843 0.74338 0 0 -3.31056 0 0 0 2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0 3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0 4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

 Chapter Contents Previous Next Top