/*  SEQGROUPS */


/*  This program computes a variety of basic sequential analysis statistics
    for more than one group/segment.
    The data are assumed to be a series of integer codes with values ranging
    from "1" to what ever value the user specifies in the "ncodes"
    computation at the start of the program. */


options nocenter nodate nonumber  linesize=90;  title;

proc iml;

/* Enter the number of possible code values here. */
ncodes = 6;

/* Enter labels for the codes (5 characters maximum), if desired, here. */
labels={Code1 Code2 Code3 Code4   Code5  Code6  Code7
        Code8 Code9 Code10 Code11 Code12 Code13 Code14 Code15};

/* Enter the lag number for the analyses here. */
lag = 1;

/* Can adjacent events be coded the same? 
   Enter "0" if adjacent events can NEVER be the same;
   Enter "1" if adjacent events can ALWAYS be the same;
   Enter "2" if some adjacent events can, and others cannot, be the same; then
                   scroll down and enter the appropriate "ONEZERO" matrix for your data. */
adjacent = 2;

/* Enter "1" for one-tailed tests; enter "2" for two-tailed tests. */
tailed = 2;

/* Desired test?  Enter "1" for homogeneity,  enter "2" for stationarity. */
test = 1;

/* Desired output? Enter "1" for pooled data only, enter "2" for all data sets*/
output = 2;

/* Which results do you want to have saved in an outfile?
  Enter "1" for transitional frequencies, 
  Enter "2" for transitional probabilities, 
  Enter "3" for Adjusted residuals,
  Enter "4" for Yule's Q values
  Enter "5" for kappas. */
outfile = 1;

/* to save the outfile, see the CREATE command at the end of this file */

/* Enter/read the data into a matrix with the name "alldata", with
  code value > 999 indicating different groups or segments. */


alldata = {
1000,
2,5,2,4,2,5,2,4,2,4,2,2,5,2,4,2,5,3,4,3,4,1,4,4,1,
2,5,2,1,5,4,2,5,1,4,4,2,3,6,6,2,2,6,5,5,5,6,3,6,3,
3,6,3,2,5,3,5,2,6,2,2,2,5,2,4,4,5,2,5,2,5,5,4,2,5,
5,5,2,5,2,5,2,5,2,2,5,2,5,2,2,5,5,2,2,2,5,2,5,2,5,
2,5,2,2,5,2,5,3,5,2,5,5,2,2,5,2,2,5,2,6,2,5,4,2,5,
2,4,4,2,1,2,
2000,
5,2,5,2,5,5,5,3,5,2,5,5,2,5,5,2,5,2,5,
2,4,2,4,2,5,4,2,2,5,2,5,5,5,2,5,2,5,5,2,5,2,5,5,2,
1,2,5,1,5,1,5,5,4,2,2,2,3,6,3,6,3,6,3,6,3,6,3,3,6,
6,6,5,1,2,5,2,5,5,2,4,5,5,2,5,5,2,5,2,5,2,5,1,5,4,
2,2,5,2,5,2,5,4,1,4,4,2,4,4,2,4,2,3,5,4,2,2,5,2,6,
1,4,1,4,5,5,5,4,5,2,4,5,
3000,
5,2,5,2,2,2,5,2,2,5,2,2,4,
2,2,2,6,2,4,3,3,3,3,2,3,6,3,5,3,3,5,3,2,2,2,2,6,3,
6,3,6,2,6,1,6,3,3,3,3,5,3,5,2,5,3,3,6,3,1,4,1,5,1,
5,6,3,3,6,3,6,3,6,3,6,3,6,3,3,6,3,6,3,2,5,2,5,2,5,
2,5,2,2,2,3,5,1,6,1,6,6,6,1,6,6,1,6,1,6,2,5,1,6,6,
6,2,2,6,6,6,2,6,2,6,2,5,5,5,5,5,5,6 };



/* row sums module */
start rsum(matname);
rsums =j(nrow(matname),1);
do rows = 1 to nrow(matname);
dumr = matname[rows,];
rsums[rows,1]=sum(dumr);
end;
return(rsums);
finish;

/* collumn sums module */
start csum(matname);
csums =j(1,ncol(matname));
do cols = 1 to ncol(matname);
dumc = matname[,cols];
csums[1,cols]=sum(dumc);
end;
return(csums);
finish;

/* pooling data for stationarity analyses. */
if test = 2    then  do;
totdata = {0};
do x = 1 to nrow(alldata);
if (alldata[x,1] < 1000) then totdata=(totdata // alldata[x,1]);
end;
totdata = totdata[2:nrow(totdata),1];
freqst = j(ncodes,ncodes,0);
do y = 1 to nrow(totdata);
if ( y + lag <= nrow(totdata) )    then;
freqst[(totdata[y,1]),(totdata[(y+lag),1])] =
        freqst[(totdata[y,1]),(totdata[(y+lag),1])]  + 1;
end;
end;

out     = j(1,((ncodes*ncodes)+1),0);
each    = j(1,(ncodes*ncodes),0);
start   = 2;
lastone = 0;
do a = 1 to nrow(alldata);
freqs  = j(ncodes,ncodes,0);
data = 0;
do b = start to nrow(alldata);
if ( alldata[b,1] < 1000 )    then  do;
data = ( data // alldata[b,1] );
fin = b;
end;
if ( alldata[b,1] >= 1000 )    then  do;
start = b + 1;
b = nrow(alldata);
end;
end;
data = data[2:nrow(data),1];
lastone = ( lastone // data[nrow(data),1] );

freqs = j(ncodes,ncodes,0);
do c = 1 to nrow(data);
if ( c + lag <= nrow(data) )    then 
freqs[(data[c,1]),(data[(c+lag),1])] =
        freqs[(data[c,1]),(data[(c+lag),1])]  + 1;
end;

each = ( each // shape(freqs,1,(ncodes*ncodes)) );

if ( fin = nrow(alldata) )    then  a = nrow(alldata);
end;


lastone = ( lastone[2:nrow(lastone),1] // lastone[nrow(lastone),1] );
csumeach = j(1,ncol(each));
do t = 1 to ncol(each);
dumc = each[,t];
csumeach[1,t]=sum(dumc);
end;
if ( test = 1)   then  freqst = shape(csumeach,ncodes,ncodes);
rowtott =j(nrow(freqst),1);
do t = 1 to ncodes;
dumr = freqst[t,];
rowtott[t,1]=sum(dumr);
end;
pp      = j(ncodes,ncodes,0);
each  = ( each[2:nrow(each),] // shape(freqst,1,ncol(each)) );


/* initializing. */
lrx2sh = {0};
do d = 1 to nrow(each);
lrx2t  = {0};
freqs   = shape(each[d,],ncodes,ncodes);
rowtots = rsum(freqs);
coltots = csum(freqs);
ntrans   = sum(rowtots);
prows    = rowtots / ntrans;
pcols    = coltots / ntrans;
p        = j(ncodes,ncodes,-9999);
et       = j(ncodes,ncodes,-9999);
expfreq  = j(ncodes,ncodes,-9999);
zadjres  = j(ncodes,ncodes,-9999);
pzadjres = j(ncodes,ncodes,1);
yulesq   = j(ncodes,ncodes,-9999);
var      = j(ncodes,ncodes,-9999);
min      = j(ncodes,ncodes,-9999);
kappa    = j(ncodes,ncodes,-9999);
zkappa   = j(ncodes,ncodes,-9999);
pzkappa  = j(ncodes,ncodes,1);
n = ntrans + 1;
nr  = rowtots;
nr[lastone[d,1]] = nr[lastone[d,1]] + 1;
pnr  = nr / n;

/* Warning message for specification error. */
if ( adjacent=0 & any(diag(freqs)) = 1)   then do;
print,"WARNING";
print"You have indicated that consecutive or adjacent codes can never repeat,";
print"(adjacent = 0), yet repeating codes have been found in the data. See the";
print"main diagonal of the frequency matrix.  This specification error will ";
print"result in faulty computations for LRX2, z-values, and adjusted residuals.";
end;


do i = 1 to ncodes;
do j = 1 to ncodes;

/* Note: more refined computations for when adjacent codes cannot
   repeat appear below, after the above 2 loops are completed. */
if (adjacent = 0 & (ntrans - rowtots[i,1]) > 0 )   then;
pcols[1,j] = coltots[1,j] / (ntrans - rowtots[i,1] );
if (adjacent = 0 & (ntrans - rowtots[j,1]) > 0 )   then;
expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / (ntrans - rowtots[j,1]);
if (adjacent = 0 & (n - nr[i,1]) > 0)   then;
et[i,j]  = (nr[i,1] * nr[j,1]) / (n - nr[i,1]) ;

if (adjacent = 1)   then do;
et[i,j]  =    (nr[i,1] * nr[j,1]) / n;
expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / ntrans ;
end;

/* transitional probabilities */
if (rowtots[i,1] > 0)   then;
p[i,j] = freqs[i,j] / rowtots[i,1] ;

/* tablewise LRX2 */
if ( freqs[i,j] > 0 & expfreq[i,j] > 0)   then;
lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j]));

/* LRX2 for stationarity/homogeneity. */
if ( d < nrow(each) )    then  do;
if ( rowtott[i,1] > 0)    then
pp[i,j] = freqst[i,j] / rowtott[i,1];
if ( (p[i,j] > 0 &  pp[i,j]) > 0) then;
lrx2sh = lrx2sh + 2 * ( freqs[i,j] * log( p[i,j] / pp[i,j] ) );
end;

/* adjusted residuals (z values) & sig levels */
if ( (expfreq[i,j]*(1-pcols[1,j])*(1-prows[i,1])) > 0)   then do;
zadjres[i,j]=(freqs[i,j]-expfreq[i,j])
  /sqrt( expfreq[i,j]*(1-pcols[1,j]) * (1-prows[i,1]) );
pzadjres[i,j] = (1 - probnorm(abs(zadjres[i,j])) ) * tailed;
end;

/* Yule's Q; see Bakeman & Gottman, 1997, p 129 */
aaa = freqs[i,j];
bbb = rowtots[i,1] - freqs[i,j];
ccc = coltots[1,j] - freqs[i,j];
ddd = ntrans - rowtots[i,1] - coltots[1,j] + freqs[i,j];
if ( (aaa*ddd + bbb*ccc) > 0 )   then;
yulesq[i,j] = ( aaa*ddd - bbb*ccc ) / (aaa*ddd + bbb*ccc);

/* kappas, z values & sig levels */ 
var[i,j]=(nr[i,1]*nr[j,1]*(n-nr[j,1])*(n-nr[i,1]))/(n##2*(n-1));
if (var[i,j] > 0)   then do;
zkappa[i,j]  = ( freqs[i,j] - et[i,j] ) / sqrt(var[i,j]);
if ( nr[i,1] <= nr[j,1] )    then  min[i,j] = nr[i,1];
else                               min[i,j] = nr[j,1];
kappa[i,j] = ( freqs[i,j] - et[i,j] ) / ( min[i,j] - et[i,j] );
if ( kappa[i,j] < 0 )    then do;
kappa[i,j] = ( freqs[i,j] - et[i,j] ) /  et[i,j];
pzkappa[i,j] = (1 - probnorm(abs(zkappa[i,j]))) * tailed;
end;
end;

end;
end;


if (adjacent = 0 | adjacent = 2) then do;

/* maximum likelihood estimation of the expected cell frequencies using
   iterative proportional fitting (Wickens, 1989, pp. 107-112). */

rsumsf = rsum(freqs);
csumsf = csum(freqs);

onezero = j(ncodes,ncodes,1);
do i = 1 to ncodes;
do j = 1 to ncodes;
if (i = j) then  onezero[i,j] = 0;
end;
end;

/* the 6 previous commands create a matrix of ones and zeros that
   is used in estimating the expected cell frequencies.  A "1" indicates
   that the expected frequency for a given cell is to be estimated, 
   whereas a "0" indicates that the expected frequency for the cell
   should NOT be estimated, typically because it is a structural zero
   (codes that cannot follow one another).  By default, the matrix that
   is created by the above commands has zeros along the main diagonal,
   and ones everywhere else, which will be appropriate for most data 
   sets.  However, if your data happen to involve structural zeros
   that occur in cells other than the cells along the main diagonal,
   then you must create a ONEZERO matrix with ones and zeros that is
   appropriate for your data before running any of the commands 
   below.  Enter your ONEZERO matrix now. */


expfreq = onezero;

rdiffs = rsumsf - rsum(expfreq);
cdiffs = csumsf - csum(expfreq);

do ipfloop=1 to 100 until ( max(rdiffs) < .0001 & max(cdiffs) < .0001);

/* adjusting by row. */
xr = j(ncodes,1,0);
rsumse = rsum(expfreq);
do r = 1 to ncodes;
if (rsumse[r,1] > 0) then xr[r,1] = rsumsf[r,1] / rsumse[r,1];
end;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xr[i,1];
end;
end;

/* adjusting by collumn. */
xc = j(1,ncodes,0);
csumse = csum(expfreq);
do c = 1 to ncodes;
if (csumse[1,c] > 0)  then xc[1,c] = csumsf[1,c] / csumse[1,c];
end;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xc[1,j];
end;
end;

rdiffs = rsumsf - rsum(expfreq);
cdiffs = csumsf - csum(expfreq);
end;

print "Maximum likelihood estimation of the expected cell";
print "frequencies using iterative proportional fitting";
if ( max(rdiffs) < .0001 & max(cdiffs) < .0001) then;
print "converged after the following number of interations:", ipfloop;
else 
print "did NOT converge after the following number of interations:",ipfloop;


/* tablewise LRX2 */
lrx2t    = {0};
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( freqs[i,j] > 0 &  expfreq[i,j] > 0) then;
lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j]));
end;
end;

/* adjusted residuals for matrices with structural zeros (Christensen, 1997, p. 357) */

/* constructing the design matrix */
x = j(ncodes**2,1,1);
y = j(ncodes**2,ncodes-1,0);
z = j(ncodes**2,ncodes-1,0);
do i = 1 to ncodes - 1;
do j = 1 to ncodes;
y[i*ncodes+j,i] = 1;
z[ (((j-1)*ncodes)+(i+1)),i] = 1;
end;
end;
des1 = ( x || y || z );

/* pruning values corresponding to cells with structural zeros */
onezero2 = shape(onezero,ncodes**2,1);
dm1  = shape(expfreq,ncodes**2,1);
dm2  = j(1,1,-9999);
des2 = j(1,ncol(des1),-9999);
do ppp = 1 to ncodes**2;
if (onezero2[ppp,1] = 1) then do;
dm2  = ( dm2  // dm1 [ppp,1] );
des2 = ( des2 // des1[ppp,] );
end;
end;
dm2  = dm2[2:nrow(dm2),1];
des2 = des2[2:nrow(des2),];

dm2 = diag(dm2);
if ( det(t(des2) * dm2 * des2) ^= 0) then do;
a = des2 * (inv(t(des2) * dm2 * des2)) * t(des2) * dm2;
acounter = 1;
do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] ^= 0) then do;
zadjres[i,j] = ( freqs[i,j] - expfreq[i,j] ) /
                 sqrt( expfreq[i,j] * (1 - a[acounter,acounter]) );
acounter = acounter + 1;
end;
end;
end;
end;
if ( det(t(des2) * dm2 * des2) = 0) then do;
reset spaces = 2;
print, "A nonsingular matrix has been identified, which means";
print "that proper adjusted residuals cannot be computed for";
print "this data, probably because there are no values for";
print "one or more codes.  Try recoding using sequential";
print "integers, and redo the analyses.  The adjusted";
print "residuals that are printed below are based on";
print "equation 5 from Bakeman & Quera (1995, p. 274), and";
print "are close approximations to the proper values.";
print "The procedures recommended by Bakeman & Quera";
print "(1995, p. 276), Haberman (1979), and Christensen";
print "(1997) cannot be conducted with nonsingular matrices.";
end;

do i = 1 to ncodes;
do j = 1 to ncodes;
if ( onezero[i,j] = 0) then do;
zadjres [i,j] = 0;
yulesq  [i,j] = 0;
kappa   [i,j] = 0;
zkappa  [i,j] = 0;
pzadjres[i,j] = 1;
pzkappa [i,j] = 1;
end;
end;
end;

end;


if (outfile = 1)   then  outf = freqs;
if (outfile = 2)   then  outf = p;
if (outfile = 3)   then  outf = zadjres;
if (outfile = 4)   then  outf = yulesq;
if (outfile = 5)   then  outf = kappa;
out = ( out // (d || (shape(outf,1,(ncodes*ncodes)))) );

print, "Requested 'tail' (1 or 2) for Significance Tests =", tailed;
b = labels[1,1:ncodes];
if (d < nrow(each) & output = 2)    then
print, "Group/Segment Number:", d;
if (d = nrow(each) )   then   do;
print,,,,, "Pooled Data:";
N =(nrow(each)-1);
print, "Number of cases/groups/segments:", N;
dfsh = ncodes * (ncodes - 1) * ((nrow(each)-1) - 1);
plrx2sh  = 1 -  probchi(abs(lrx2sh),dfsh);
lrx2 = (lrx2sh || dfsh || plrx2sh);
print, "Likelihood Ratio (Chi-Square) Test of Homogeneity/Stationarity",
        lrx2[colname={LRX2 df sig} format=12.4];;
end;

if (d = nrow(each) | output = 2)    then  do;
b = labels[1,1:ncodes];
bb = ( b || {Totals});
transf = ( (freqs || rowtots) // (coltots || ntrans) );
print , "Cell Frequencies, Row & Collumn Totals, & N" ,
         transf [rowname=bb colname=bb format=7.0];
if (adjacent = 0 | adjacent = 2) then do;
print, "The processed ONEZERO matrix appears below.";
print, "In the ONEZERO matrix, a 0 indicates a structural";
print, "zero, and a 1 indicates that an expected cell";
print, "frequency will be estimated.";
print, "ONEZERO matrix:", onezero[rowname=b colname=b format=5.4];
end;
print , "Expected Values/Frequencies", expfreq[rowname=b colname=b format=5.4];
print , "Transitional Probabilities" , p[rowname=b colname=b format=5.4];
if (adjacent = 1)    then  dft = (ncodes - 1)##2;
else                       dft = (ncodes - 1)##2 - (ncodes**2 - sum(onezero));
plrx2t = 1 -  probchi(abs(lrx2t),dft);
lrx =  (lrx2t || dft || plrx2t);
print, "Tablewise Likelihood Ratio (Chi-Square) Test",
        lrx[colname={LRX2 df sig} format=12.4];
print, "Adjusted Residuals", zadjres[rowname=b colname=b format=6.3];
print, "Significance Levels for the Adjusted Residuals",
        pzadjres[rowname=b colname=b format=5.4];
print, "Yule's Q Values", yulesq[rowname=b colname=b format=6.3];
print, "Unidirectional Kappas", kappa[rowname=b colname=b format=5.4];
print, "z values for the Unidirectional Kappas",
        zkappa[rowname=b colname=b format=6.3];
print, "Significance Levels for the Unidirectional Kappas",
        pzkappa[rowname=b colname=b format=5.4];
end;
end;

out = out[2:nrow(out),];
print, "Requested output data", out[format=7.0];


/* The following CREATE command will save the above-specified matrix
   output data in a SAS file that can be accessed for further processing.
   You must provide a legal file name for your system and then "activate"
   the CREATE command, which is currently  a mere comment. */

create dataset from out;
append from out;

quit;


proc means data=dataset;
run;