Chapter Contents
Chapter Contents
Previous
Previous
Next
Next
General Statistics Examples

Example 8.6: Regression of Subsets of Variables

The following example performs regression with variable selection similar to some of the features in the REG procedure.

      /*       Initialization                               */
      /* C,CSAVE the crossproducts matrix                   */
      /* N       number of observations                     */
      /* K       total number of variables to consider      */
      /* L       number of variables currently in model     */
      /* IN      a 0-1 vector of whether variable is in     */
      /* B       print collects results (L MSE RSQ BETAS )  */

   start initial;
      n=nrow(x);
      k=ncol(x);
      k1=k+1;
      ik=1:k;
      bnames={nparm mse rsquare} ||varnames;

         /* Correct by mean, adjust out intercept parameter */
      y=y-y[+,]/n;                         /* correct y by mean */
      x=x-repeat(x[+,]/n,n,1);             /* correct x by mean */
      xpy=x`*y;                                /* crossproducts */
      ypy=y`*y;
      xpx=x`*x;
      free x y;                      /* no longer need the data */

         /* Save a copy of crossproducts matrix */
      csave=(xpx || xpy) // (xpy`|| ypy);
   finish initial;

      /* Forward method */
   start forward;
     print / "FORWARD SELECTION METHOD";
     free bprint;
     c=csave;
     in=repeat(0,k,1);
     l=0;                                /* No variables are in */
     dfe=n-1;
     mse=ypy/dfe;
     sprob=0;
     do while(sprob<.15 & l<k);
        indx=loc(^in);       /* where are the variables not in? */
        cd=vecdiag(c)[indx,];                  /* xpx diagonals */
        cb=c[indx,k1];                          /* adjusted xpy */
        tsqr=cb#cb/(cd#mse);              /* squares of t tests */
        imax=tsqr[<:>,];         /* location of maximum in indx */
        sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
        if sprob<.15 then

        do;                            /* if t-test significant */
           ii=indx[,imax];             /* pick most significant */
           run swp;                         /* routine to sweep */
           run bpr;               /* routine to collect results */
        end;
     end;
     print  bprint[colname=bnames] ;
   finish forward;

      /* Backward method */
   start backward;
      print / "BACKWARD ELIMINATION ";
      free bprint;
      c=csave;
      in=repeat(0,k,1);
      ii=1:k;
      run swp;
      run bpr;                   /* start with all variables in */
      sprob=1;
      do while(sprob>.15 & l>0);
        indx=loc(in);            /* where are the variables in? */
        cd=vecdiag(c)[indx,];                  /* xpx diagonals */
        cb=c[indx,k1];                               /* bvalues */
        tsqr=cb#cb/(cd#mse);              /* squares of t tests */
        imin=tsqr[>:<,];         /* location of minimum in indx */
        sprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
        if sprob>.15 then
        do;                         /* if t-test nonsignificant */
           ii=indx[,imin];            /* pick least significant */
           run swp;             /* routine to sweep in variable */
           run bpr;               /* routine to collect results */
        end;
      end;
      print  bprint[colname=bnames] ;
   finish backward;

      /* Stepwise method */
   start stepwise;
      print /"STEPWISE METHOD";
      free bprint;
      c=csave;
      in=repeat(0,k,1);
      l=0;
      dfe=n-1;
      mse=ypy/dfe;
      sprob=0;

      do while(sprob<.15 & l<k);
        indx=loc(^in);       /* where are the variables not in? */
        nindx=loc(in);           /* where are the variables in? */
        cd=vecdiag(c)[indx,];                  /* xpx diagonals */
        cb=c[indx,k1];                          /* adjusted xpy */
        tsqr=cb#cb/cd/mse;                /* squares of t tests */
        imax=tsqr[<:>,];         /* location of maximum in indx */
        sprob=(1-probt(sqrt(tsqr[imax,]),dfe))*2;
        if sprob<.15 then
        do;                            /* if t-test significant */
           ii=indx[,imax];                 /* find index into c */
           run swp;                         /* routine to sweep */
           run backstep;           /* check if remove any terms */
           run bpr;               /* routine to collect results */
           end;
      end;
      print  bprint[colname=bnames] ;
   finish stepwise;

      /* Routine to backwards-eliminate for stepwise  */
   start backstep;
      if nrow(nindx)=0 then return;
      bprob=1;
      do while(bprob>.15 & l<k);
         cd=vecdiag(c)[nindx,];                /* xpx diagonals */
         cb=c[nindx,k1];                             /* bvalues */
         tsqr=cb#cb/(cd#mse);             /* squares of t tests */
         imin=tsqr[>:<,];       /* location of minimum in nindx */
         bprob=(1-probt(sqrt(tsqr[imin,]),dfe))*2;
         if bprob>.15 then
         do;
            ii=nindx[,imin];
            run swp;
            run bpr;
         end;
      end;
   finish backstep;

      /* Search all possible models */
   start all;

      /* Use method of schatzoff et al. for search technique */
      betak=repeat(0,k,k); /* rec. ests. for best l-param model */
      msek=repeat(1e50,k,1);     /* record best mse per # parms */
      rsqk=repeat(0,k,1);                /* record best rsquare */
      ink=repeat(0,k,k);         /* record best set per # parms */
      limit=2##k-1;              /* number of models to examine */
      c=csave;
      in=repeat(0,k,1); /* start out with no variables in model */

      do kk=1 to limit;
         run ztrail;                 /* find which one to sweep */
         run swp;                                /* sweep it in */
         bb=bb//(l||mse||rsq||(c[ik,k1]#in)`);
         if mse<msek[l,] then
         do;                      /* was this best for l parms? */
              msek[l,]=mse;                       /* record mse */
              rsqk[l,]=rsq;                   /* record rsquare */
              ink[,l]=in;        /* record which parms in model */
              betak[l,]=(c[ik,k1]#in)`;     /* record estimates */
         end;
      end;
      print / "ALL POSSIBLE MODELS" " IN SEARCH ORDER";
      print bb[colname=bnames]; free bb;
      bprint=ik`||msek||rsqk||betak;
      print ,"THE BEST MODEL FOR EACH NUMBER OF PARAMETERS";
      print bprint[colname=bnames];
         /* Mallows CP plot */
      cp=msek#(n-ik`-1)/min(msek)-(n-2#ik`);
      cp=ik`||cp;
      cpname={"nparm" "cp"};
         /* output cp out=cp colname=cpname; */
   finish all;

      /*  Subroutine to find number of           */
      /*  trailing zeros in binary number        */
      /*  on entry: kk is the number to examine  */
      /*  on exit:  ii has the result            */
   start ztrail;
       ii=1;
       zz=kk;
       do while(mod(zz,2)=0);
          ii=ii+1;
          zz=zz/2;
       end;
   finish ztrail;

      /* Subroutine to sweep in a pivot              */
      /* on entry: ii has the position(s) to pivot   */
      /* on exit:  in, l, dfe, mse, rsq recalculated */
   start swp;
      if abs(c[ii,ii])<1e-9 then
      do;
         print , "FAILURE", c;
         stop;
      end;
      c=sweep(c,ii);
      in[ii,]=^in[ii,];
      l=sum(in);
      dfe=n-1-l;
      sse=c[k1,k1];
      mse=sse/dfe;
      rsq=1-sse/ypy;
   finish swp;


      /* Subroutine to collect bprint results                   */
      /* on entry: l,mse,rsq, and c set up to collect           */
      /* on exit:  bprint has another row                       */

   start bpr;
      bprint=bprint//(l||mse||rsq||(c [ik,k1]#in)`);
   finish bpr;

      /*              Stepwise Methods                          */
      /* After a run to the initial routine, which sets up      */
      /* the data, four different routines can be called        */
      /* to do four different model-selection methods.          */

   start seq;
      run initial;          /* initialization                   */
      run all;              /* all possible models              */
      run forward;          /* foreward selection method        */
      run backward;         /* backward elimination method      */
      run stepwise;         /* stepwise method                  */
   finish seq;

   /*              Data on physical fitness                     */
   /* these measurements were made on men involved in a physical*/ 
   /* fitness course at n.c.state univ.  the variables are      */
   /* age(years), weight(kg), oxygen uptake rate(ml per kg body */
   /* weight per minute), time to run 1.5 miles(minutes), heart */
   /* rate while resting, heart rate while running (same time   */
   /* oxygen rate measured), and maximum heart rate recorded    */
   /* while running. certain values of maxpulse were modified   */
   /* for consistency.        data courtesy dr. a.c. linnerud   */

   data=
      { 44 89.47  44.609 11.37 62 178 182  ,
        40 75.07  45.313 10.07 62 185 185  ,
        44 85.84  54.297  8.65 45 156 168  ,
        42 68.15  59.571  8.17 40 166 172  ,
        38 89.02  49.874  9.22 55 178 180  ,
        47 77.45  44.811 11.63 58 176 176  ,
        40 75.98  45.681 11.95 70 176 180  ,
        43 81.19  49.091 10.85 64 162 170  ,
        44 81.42  39.442 13.08 63 174 176  ,
        38 81.87  60.055  8.63 48 170 186  ,
        44 73.03  50.541 10.13 45 168 168  ,
        45 87.66  37.388 14.03 56 186 192  ,
        45 66.45  44.754 11.12 51 176 176  ,
        47 79.15  47.273 10.60 47 162 164  ,
        54 83.12  51.855 10.33 50 166 170  ,
        49 81.42  49.156  8.95 44 180 185  ,
        51 69.63  40.836 10.95 57 168 172  ,
        51 77.91  46.672 10.00 48 162 168  ,
        48 91.63  46.774 10.25 48 162 164  ,
        49 73.37  50.388 10.08 67 168 168  ,
        57 73.37  39.407 12.63 58 174 176  ,
        54 79.38  46.080 11.17 62 156 165  ,
        52 76.32  45.441  9.63 48 164 166  ,
        50 70.87  54.625  8.92 48 146 155  ,
        51 67.25  45.118 11.08 48 172 172  ,
        54 91.63  39.203 12.88 44 168 172  ,
        51 73.71  45.790 10.47 59 186 188  ,
        57 59.08  50.545  9.93 49 148 155  ,
        49 76.32  48.673  9.40 56 186 188  ,
        48 61.24  47.920 11.50 52 170 176  ,
        52 82.78  47.467 10.50 53 170 172
      };

   x=data[,{1 2 4 5 6 7 }];
   y=data[,3];
   free data;
   varnames={age weight runtime rstpulse runpulse maxpulse};
   reset fw=8 linesize=90;
   run seq;
The results are shown below.

ALL POSSIBLE MODELS IN SEARCH ORDER
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 26.63425 0.092777 -0.31136 0 0 0 0 0
2 25.82619 0.150635 -0.37042 -0.15823 0 0 0 0
1 28.58034 0.026488 0 -0.1041 0 0 0 0
2 7.755636 0.744935 0 -0.02548 -3.2886 0 0 0
3 7.226318 0.770831 -0.17388 -0.05444 -3.14039 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.798261 0.743533 0 0 -3.28661 -0.00968 0 0
3 7.336089 0.767349 -0.16755 0 -3.07925 -0.04549 0 0
4 7.366649 0.775033 -0.19603 -0.05915 -2.9889 -0.05326 0 0
3 8.037314 0.745111 0 -0.02569 -3.26268 -0.01041 0 0
2 24.91487 0.180607 0 -0.09305 0 -0.27474 0 0
3 20.28031 0.356847 -0.44698 -0.15647 0 -0.32186 0 0
2 21.27632 0.30027 -0.38882 0 0 -0.32286 0 0
1 24.67582 0.159485 0 0 0 -0.27921 0 0
2 23.26003 0.235031 0 0 0 -0.20684 -0.15262 0
3 16.81799 0.466648 -0.52338 0 0 -0.22524 -0.23769 0
4 16.26146 0.503398 -0.56317 -0.12697 0 -0.22981 -0.2246 0
3 23.81815 0.244651 0 -0.06381 0 -0.20843 -0.14279 0
4 7.785151 0.762252 0 -0.01231 -3.16759 0.016669 -0.0749 0
5 6.213174 0.817556 -0.28528 -0.05184 -2.70392 -0.02711 -0.12628 0
4 6.166944 0.81167 -0.26213 0 -2.77733 -0.01981 -0.12874 0
3 7.507972 0.761898 0 0 -3.17665 0.017616 -0.07658 0
2 7.254263 0.761424 0 0 -3.14019 0 -0.07351 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 6.009033 0.816493 -0.27642 -0.04932 -2.77237 0 -0.12932 0
3 7.510162 0.761829 0 -0.01315 -3.13261 0 -0.07189 0
2 25.333 0.166855 0 -0.05987 0 0 -0.19797 0
3 18.63184 0.409126 -0.54408 -0.12049 0 0 -0.28248 0
2 18.97378 0.375995 -0.50665 0 0 0 -0.29382 0
1 24.70817 0.158383 0 0 0 0 -0.2068 0
2 21.60626 0.289419 0 0 0 0 -0.6818 0.571538
3 18.21725 0.422273 -0.4214 0 0 0 -0.57966 0.361557
4 17.29877 0.47172 -0.45243 -0.14944 0 0 -0.61723 0.426862
3 21.41763 0.320779 0 -0.11815 0 0 -0.71745 0.635395
4 6.030105 0.815849 0 -0.05159 -2.9255 0 -0.39529 0.38537
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513
3 5.991568 0.809988 0 0 -2.97019 0 -0.37511 0.354219
4 6.208523 0.8104 0 0 -3.00426 0.016412 -0.37778 0.353998
5 5.549941 0.837031 -0.20154 0 -2.7386 -0.01208 -0.34562 0.269064
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217
5 6.263348 0.816083 0 -0.05091 -2.95182 0.01239 -0.39704 0.384793
4 20.11235 0.385797 0 -0.1194 0 -0.19092 -0.64584 0.609632
5 15.1864 0.554066 -0.47923 -0.1527 0 -0.21555 -0.53045 0.385424
4 16.29247 0.502451 -0.44717 0 0 -0.21266 -0.49323 0.319267
3 20.37729 0.353772 0 0 0 -0.18993 -0.61019 0.545236
2 25.11456 0.174039 0 0 0 -0.25219 0 -0.07364
3 19.2347 0.390007 -0.52736 0 0 -0.26492 0 -0.20024
4 18.80875 0.425607 -0.55881 -0.12604 0 -0.27056 0 -0.17799
3 25.59719 0.188232 0 -0.07874 0 -0.25524 0 -0.05502
4 8.311496 0.746179 0 -0.02053 -3.25232 -0.00393 0 -0.02064
5 7.19584 0.788701 -0.25795 -0.04936 -2.86147 -0.04121 0 -0.08153
4 7.091611 0.783432 -0.23928 0 -2.92597 -0.0339 0 -0.08777
3 8.033673 0.745227 0 0 -3.26805 -0.00193 0 -0.02526
2 7.746932 0.745221 0 0 -3.27232 0 0 -0.02561
3 6.882626 0.78173 -0.22923 0 -3.01222 0 0 -0.09094
4 7.00018 0.786224 -0.24436 -0.04525 -2.97011 0 0 -0.08585
3 8.00441 0.746155 0 -0.02027 -3.26114 0 0 -0.02139
2 28.35356 0.067516 0 -0.07074 0 0 0 -0.12159
3 22.38148 0.290212 -0.54076 -0.11605 0 0 0 -0.24445
2 22.50135 0.259982 -0.5121 0 0 0 0 -0.2637
1 27.71259 0.056046 0 0 0 0 0 -0.13762


THE BEST MODEL FOR EACH NUMBER OF PARAMETERS
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217

FORWARD SELECTION METHOD
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

BACKWARD ELIMINATION
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
6 5.368247 0.848672 -0.22697 -0.07418 -2.62865 -0.02153 -0.36963 0.303217
5 5.176338 0.848002 -0.21962 -0.0723 -2.68252 0 -0.3734 0.304908
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

STEPWISE METHOD
NPARM MSE RSQUARE AGE WEIGHT RUNTIME RSTPULSE RUNPULSE MAXPULSE
1 7.533843 0.74338 0 0 -3.31056 0 0 0
2 7.168422 0.764247 -0.15037 0 -3.20395 0 0 0
3 5.956692 0.811094 -0.2564 0 -2.82538 0 -0.13091 0
4 5.343462 0.836818 -0.19773 0 -2.76758 0 -0.34811 0.270513

Chapter Contents
Chapter Contents
Previous
Previous
Next
Next
Top
Top

Copyright © 1999 by SAS Institute Inc., Cary, NC, USA. All rights reserved.