The average correlation coefficient of a correlation matrix is a useful measure of the internal reliability of the set of variables in the matrix. It is also a measure of the degree of multicollinearity among the predictor variables in a model. The smaller the value of AVG_CORR, the better: 1) the model's predictiveness; and 2) the assessment of each predictor variable's contribution to the dependent variable.
This report provides a SAS program for calculating the average correlation coefficient. The program should be a welcome addition to the tool kit of data analysts who frequently work with big data.
/************ First create data set IN ************/
/* Reads ten fixed-column records: columns 1-2 = ID, column 3 = GENDER,   */
/* column 4 = MARITAL. Short records leave GENDER and/or MARITAL blank.   */
/* Each ID is unique (01-10); the sixth record previously repeated ID 08, */
/* which gave the later BY-ID merges a duplicated key.                    */
data IN;
input ID 2.0 GENDER $1. MARITAL $1.;
cards;
01MS
02MM
03M
04
05FS
06FM
07F
08 M
09 S
10MD
;
run;
/* Add GENDER_ and MARITAL_: copies of GENDER and MARITAL in which a */
/* blank value is replaced by the explicit level 'x'.                */
data IN;
  set IN;

  if GENDER = ' ' then GENDER_ = 'x';
  else GENDER_ = GENDER;

  if MARITAL = ' ' then MARITAL_ = 'x';
  else MARITAL_ = MARITAL;
run;
/* Dummy-code GENDER_ with PROC TRANSREG. ZERO='x' makes 'x' the     */
/* reference level, so indicator columns are produced only for the   */
/* remaining levels (used later as GENDER_M and GENDER_F).           */
proc transreg data=IN design;
  model class (GENDER_ / zero='x');
  id ID;
  output out=GENDER_ (drop=_TYPE_ _NAME_ Intercept);
run;

proc print data=GENDER_;
run;

/* Put both data sets in ID order, then attach the indicators to IN. */
proc sort data=IN;
  by ID;
run;

proc sort data=GENDER_;
  by ID;
run;

data IN;
  merge IN GENDER_;
  by ID;
run;

proc print data=IN;
run;
/* Dummy-code MARITAL_ with PROC TRANSREG. ZERO='x' makes 'x' the    */
/* reference level, so indicator columns are produced only for the   */
/* remaining levels (used later as MARITAL_M, MARITAL_S, MARITAL_D). */
proc transreg data=IN design;
  model class (MARITAL_ / zero='x');
  id ID;
  output out=MARITAL_ (drop=_TYPE_ _NAME_ Intercept);
run;

proc print data=MARITAL_;
run;

/* Put both data sets in ID order, then attach the indicators to IN. */
proc sort data=IN;
  by ID;
run;

proc sort data=MARITAL_;
  by ID;
run;

data IN;
  merge IN MARITAL_;
  by ID;
run;

proc print data=IN;
run;
/*********** End of creating data set IN ******/
/************ SAS program for calculating the average correlation coefficient **********/

/* Step 1: Pearson correlation matrix of the five indicator variables. */
proc corr data=IN outp=out;
  var GENDER_M GENDER_F MARITAL_M MARITAL_S MARITAL_D;
run;

/* Step 2: discard the MEAN, STD and N summary rows, then store the    */
/* absolute value of every remaining coefficient in x1-x5 (one column  */
/* per variable, in VAR-statement order). Only _NAME_ and x1-x5 are    */
/* kept.                                                               */
data out1;
  set out;
  if _type_ in ('MEAN', 'STD', 'N') then delete;

  array corr_val {*} GENDER_M GENDER_F MARITAL_M MARITAL_S MARITAL_D;
  array abs_val  {5} x1 - x5;

  do k = 1 to dim(corr_val);
    abs_val{k} = abs(corr_val{k});
  end;

  drop _type_ k GENDER_M GENDER_F MARITAL_M MARITAL_S MARITAL_D;
run;
/* Step 3: blank out the diagonal of the absolute-correlation matrix   */
/* so that only the off-diagonal coefficients enter the average.       */
/*                                                                     */
/* out1 holds one row per variable in the same order as columns x1-x5, */
/* so the diagonal cell of row _n_ is column _n_. Masking by position  */
/* (instead of testing for the value 1) keeps any off-diagonal pair    */
/* with |r| = 1 in the sum; such a pair is still counted in the        */
/* n(n-1)/2 denominator, so dropping it would understate the average.  */
data out2;
  set out1;
  array poss {5} x1 - x5;
  if _n_ <= dim(poss) then poss{_n_} = .;
run;

proc print data=out2;
run;
/* Step 4: column totals of the masked absolute-correlation matrix.    */
/* _FREQ_ in out3 is the number of rows in out2.                       */
proc means data=out2 sum;
  output out=out3 sum=;
run;

proc print data=out3;
run;

/* Step 5: every pair appears twice in the full matrix, so halve the   */
/* grand total and divide by the number of distinct pairs,             */
/* (n*n - n)/2 with n = _FREQ_.                                        */
data out4;
  set out3;
  sum_     = sum(of x1-x5);
  sum_div2 = sum_ / 2;
  bot      = (_freq_*_freq_ - _freq_) / 2;
  avg_corr = sum_div2 / bot;
run;

/* Final one-column result. */
data avg_corr (keep=avg_corr);
  set out4;
run;

proc print data=avg_corr;
run;