/* Parallel Analysis Program For Raw Data and Data Permutations

  This program conducts parallel analyses on data files in which
  the rows of the data matrix are cases/individuals and the
  columns are variables;  Data are read/entered into the program
  using the READ command (see the READ command below);
  Alternative procedures for entering data in PROC IML include
  the USE, READ, INFILE, INPUT, and EDIT commands;
  There can be no missing values;

  You must also specify:
  -- the # of parallel data sets for the analyses;
  -- the desired percentile of the distribution and random
     data eigenvalues;
  -- whether principal components analyses or principal axis/common
     factor analysis are to be conducted, and
  -- whether normally distributed random data generation or
     permutations of the raw data set are to be used in the
     parallel analyses;

  Permutations of the raw data set are time consuming;
  Each parallel data set is based on column-wise random shufflings
  of the values in the raw data matrix using Castellan's (1992,
  BRMIC, 24, 72-77) algorithm; The distributions of the original
  raw variables are exactly preserved in the shuffled versions used
  in the parallel analyses; Permutations of the raw data set are
  thus highly accurate and most relevant, especially in cases where
  the raw data are not normally distributed or when they do not meet
  the assumption of multivariate normality (see Longman & Holden,
  1992, BRMIC, 24, 493, for a Fortran version); If you would
  like to go this route, it is perhaps best to (1) first run a
  normally distributed random data generation parallel analysis to
  familiarize yourself with the program and to get a ballpark
  reference point for the number of factors/components;
  (2) then run a permutations of the raw data parallel analysis
  using a small number of datasets (e.g., 10), just to see how long
  the program takes to run; then (3) run a permutations of the raw
  data parallel analysis using the number of parallel data sets that
  you would like use for your final analyses; 1000 datasets are
  usually sufficient, although more datasets should be used
  if there are close calls. */

/* These next commands generate artificial data that can be used for
   a trial-run of the program.  Just run this whole file.
   However, make sure to delete these commands before attempting
   to run your own data. */

/* Start of artificial data commands.   */
options nocenter nodate nonumber linesize=90;   title;
data artific;
do cases = 1 to 500;
com1 = 10 * normal (1);
com2 = 10 * normal (1);
com3 = 10 * normal (1);
var1 = 10 * normal (1) + com1;
var2 = 10 * normal (1) + com1;
var3 = 10 * normal (1) + com1;
var4 = 10 * normal (1) + com2;
var5 = 10 * normal (1) + com2;
var6 = 10 * normal (1) + com2;
var7 = 10 * normal (1) + com3;
var8 = 10 * normal (1) + com3;
var9 = 10 * normal (1) + com3;
output;
end;
proc factor ; var var1 - var9; run;
/* End of artificial data commands.   */



options nocenter nodate nonumber  linesize=100 pagesize=500;  title;
proc iml;
reset noname; 

/* Enter your specifications: */

/* Enter or read a raw data matrix, where rows = cases,
   & columns = variables
   Use the following name for the raw data matrix: "raw".
   Cases with missing values are not permitted in the data file. */
use artific;
/* read all var _num_ into raw;  */
read all var {var1 var2 var3 var4 var5 var6 var7 var8 var9}  into raw;


/* Enter the desired number of parallel data sets here */
ndatsets = 100;

/* Enter the desired percentile here */
percent  = 95;

/* Specify the desired kind of parellel analysis, where:
   1 = principal components analysis
   2 = principal axis/common factor analysis */
kind = 2 ;

/* Enter either
  1 for normally distributed random data generation parallel analysis, or
  2 for permutations of the raw data set */
randtype = 1 ;

/* When seed = 0, the clock is used as the seed for the random
   number generations. This produces different random numbers
   on different runs of the program. To use the same random 
   numbers on different runs of the program, set seed to a value
   other than 0 */
seed = 0;

/************* End of user specifications ***********************/


ncases   = nrow(raw);
nvars    = ncol(raw);

/* set diagonal to a column vector module */
start setdiag(matname,vector);
do i = 1 to nrow(matname);
do j = 1 to ncol(matname);
if (i = j) then;  matname[i,j] = vector[i,1];
end;end;
finish;

/* row sums module */
start rsum(matname);
rsums =j(nrow(matname),1);
do rows = 1 to nrow(matname);
dumr = matname[rows,];
rsums[rows,1]=sum(dumr);
end;
return(rsums);
finish;

/* Pearson correlation matrix module */
start corrcoef(matname);
ncases   = nrow(matname);
nm1 = 1 / (ncases-1);
vcv = nm1 * (t(matname)*matname - ((t(matname[+,])*matname[+,])/ncases));
d = inv(diag(sqrt(vecdiag(vcv))));
r = d * vcv * d;
return(r);
finish;


/* principal components analysis & random normal data generation */
if kind = 1 & randtype = 1 then do;
realeval = eigval(corrcoef(raw));
evals = j(nvars,ndatsets,-9999);
do nds = 1 to ndatsets;
evals[,nds] = eigval(corrcoef(normal(j(ncases,nvars,seed))));
end;
end;

/* principal components analysis & raw data permutation */
if kind = 1 & randtype = 2 then do;
realeval = eigval(corrcoef(raw));
evals = j(nvars,ndatsets,-9999);
do nds = 1 to ndatsets;
x = raw;
do lupec = 1 to nvars;
do luper = 1 to (ncases -1);
k = int( (ncases - luper + 1) * uniform(seed) + 1 )  + luper - 1;
d = x[luper,lupec];
x[luper,lupec] = x[k,lupec];
x[k,lupec] = d;
end;
end;
evals[,nds] = eigval(corrcoef(x));
end;
end;

/*  PAF/common factor analysis & random normal data generation  */
if kind = 2 & randtype = 1 then do;
r = corrcoef(raw);
smc = 1 - (1 / vecdiag(inv(r)) );
run setdiag(r,smc);
realeval = eigval(r);
evals = j(nvars,ndatsets,-9999);
do nds = 1 to ndatsets;
r = corrcoef(normal(j(ncases,nvars,seed)));
smc = 1 - (1 / vecdiag(inv(r)) );
run setdiag(r,smc);
evals[,nds] = eigval(r);
end;
end;

/*  PAF/common factor analysis & raw data permutation  */
if kind = 2 & randtype = 2 then do;
r = corrcoef(raw);
smc = 1 - (1 / vecdiag(inv(r)) );
run setdiag(r,smc);
realeval = eigval(r);
evals = j(nvars,ndatsets,-9999);
do nds = 1 to ndatsets;
x = raw;
do lupec = 1 to nvars;
do luper = 1 to (ncases -1);
k = int( (ncases - luper + 1) * uniform(seed) + 1 )  + luper - 1;
d = x[luper,lupec];
x[luper,lupec] = x[k,lupec];
x[k,lupec] = d;
end;
end;
r = corrcoef(x);
smc = 1 - (1 / vecdiag(inv(r)) );
run setdiag(r,smc);
evals[,nds] = eigval(r);
end;
end;

/* identifying the eigenvalues corresponding to the desired
percentile */
num = round((percent*ndatsets)/100);
results = j(nvars,4,-9999);
results[,1] = t(1:nvars);
results[,2] = realeval;
do root = 1 to nvars;
ranks = rank(evals[root,]);
do col = 1 to ndatsets;
if (ranks[1,col] = num) then do;
results[root,4] = evals[root,col];
col = ndatsets;
end;
end;
end;
results[,3] = evals[,+] / ndatsets;

print, "Parallel Analysis:";
if (kind = 1 & randtype = 1) then;
print, "Principal Components & Random Normal Data Generation";
if (kind = 1 & randtype = 2) then;
print, "Principal Components & Raw Data Permutation";
if (kind = 2 & randtype = 1) then;
print, "PAF/Common Factor Analysis & Random Normal Data Generation";
if (kind = 2 & randtype = 2) then;
print, "PAF/Common Factor Analysis & Raw Data Permutation";
specifs = (ncases // nvars // ndatsets // percent);
rlabels = {"Ncases" "Nvars" "Ndatsets" "Percent"};
print,  "Specifications for this Run:", specifs[rowname=rlabels];
clabels={"Root" "Raw Data" "Means" "Prcntyle"};
print,  "Raw Data Eigenvalues, & Mean & Percentile Random Data Eigenvalues",
       results[colname=clabels format=12.6];

if (kind = 2) then do;
print,, "Warning: Parallel analyses of adjusted correlation matrices";
print, "eg, with SMCs on the diagonal, tend to indicate more factors";
print, "than warranted (Buja, A., & Eyuboglu, N., 1992, Remarks on parallel";
print, "analysis. Multivariate Behavioral Research, 27, 509-540.).";
print, "The eigenvalues for trivial, negligible factors in the real";
print, "data commonly surpass corresponding random data eigenvalues";
print, "for the same roots. The eigenvalues from parallel analyses";
print, "can be used to determine the real data eigenvalues that are";
print, "beyond chance, but additional procedures should then be used";
print, "to trim trivial factors.";
print,, "Principal components eigenvalues are often used to determine";
print, "the number of common factors. This is the default in most";
print, "statistical software packages, and it is the primary practice";
print, "in the literature. It is also the method used by many factor";
print, "analysis experts, including Cattell, who often examined";
print, "principal components eigenvalues in his scree plots to determine";
print, "the number of common factors. But others believe this common";
print, "practice is wrong. Principal components eigenvalues are based";
print, "on all of the variance in correlation matrices, including both";
print, "the variance that is shared among variables and the variances";
print, "that are unique to the variables. In contrast, principal";
print, "axis eigenvalues are based solely on the shared variance";
print, "among the variables. The two procedures are qualitatively";
print, "different. Some therefore claim that the eigenvalues from one";
print, "extraction method should not be used to determine";
print, "the number of factors for the other extraction method.";
print, "The issue remains neglected and unsettled.";
end;

quit;