/* SEQGROUPS */ /* This program computes a variety of basic sequential analysis statistics for more than one group/segment. The data are assumed to be a series of integer codes with values ranging from "1" to what ever value the user specifies in the "ncodes" computation at the start of the program. */ options nocenter nodate nonumber linesize=90; title; proc iml; /* Enter the number of possible code values here. */ ncodes = 6; /* Enter labels for the codes (5 characters maximum), if desired, here. */ labels={Code1 Code2 Code3 Code4 Code5 Code6 Code7 Code8 Code9 Code10 Code11 Code12 Code13 Code14 Code15}; /* Enter the lag number for the analyses here. */ lag = 1; /* Can adjacent events be coded the same? Enter "0" if adjacent events can NEVER be the same; Enter "1" if adjacent events can ALWAYS be the same; Enter "2" if some adjacent events can, and others cannot, be the same; then scroll down and enter the appropriate "ONEZERO" matrix for your data. */ adjacent = 2; /* Enter "1" for one-tailed tests; enter "2" for two-tailed tests. */ tailed = 2; /* Desired test? Enter "1" for homogeneity, enter "2" for stationarity. */ test = 1; /* Desired output? Enter "1" for pooled data only, enter "2" for all data sets*/ output = 2; /* Which results do you want to have saved in an outfile? Enter "1" for transitional frequencies, Enter "2" for transitional probabilities, Enter "3" for Adjusted residuals, Enter "4" for Yule's Q values Enter "5" for kappas. */ outfile = 1; /* to save the outfile, see the CREATE command at the end of this file */ /* Enter/read the data into a matrix with the name "alldata", with code value > 999 indicating different groups or segments. */ alldata = { 1000, 2,5,2,4,2,5,2,4,2,4,2,2,5,2,4,2,5,3,4,3,4,1,4,4,1, 2,5,2,1,5,4,2,5,1,4,4,2,3,6,6,2,2,6,5,5,5,6,3,6,3, 3,6,3,2,5,3,5,2,6,2,2,2,5,2,4,4,5,2,5,2,5,5,4,2,5, 5,5,2,5,2,5,2,5,2,2,5,2,5,2,2,5,5,2,2,2,5,2,5,2,5, 2,5,2,2,5,2,5,3,5,2,5,5,2,2,5,2,2,5,2,6,2,5,4,2,5, 2,4,4,2,1,2, 2000, 5,2,5,2,5,5,5,3,5,2,5,5,2,5,5,2,5,2,5, 2,4,2,4,2,5,4,2,2,5,2,5,5,5,2,5,2,5,5,2,5,2,5,5,2, 1,2,5,1,5,1,5,5,4,2,2,2,3,6,3,6,3,6,3,6,3,6,3,3,6, 6,6,5,1,2,5,2,5,5,2,4,5,5,2,5,5,2,5,2,5,2,5,1,5,4, 2,2,5,2,5,2,5,4,1,4,4,2,4,4,2,4,2,3,5,4,2,2,5,2,6, 1,4,1,4,5,5,5,4,5,2,4,5, 3000, 5,2,5,2,2,2,5,2,2,5,2,2,4, 2,2,2,6,2,4,3,3,3,3,2,3,6,3,5,3,3,5,3,2,2,2,2,6,3, 6,3,6,2,6,1,6,3,3,3,3,5,3,5,2,5,3,3,6,3,1,4,1,5,1, 5,6,3,3,6,3,6,3,6,3,6,3,6,3,3,6,3,6,3,2,5,2,5,2,5, 2,5,2,2,2,3,5,1,6,1,6,6,6,1,6,6,1,6,1,6,2,5,1,6,6, 6,2,2,6,6,6,2,6,2,6,2,5,5,5,5,5,5,6 }; /* row sums module */ start rsum(matname); rsums =j(nrow(matname),1); do rows = 1 to nrow(matname); dumr = matname[rows,]; rsums[rows,1]=sum(dumr); end; return(rsums); finish; /* collumn sums module */ start csum(matname); csums =j(1,ncol(matname)); do cols = 1 to ncol(matname); dumc = matname[,cols]; csums[1,cols]=sum(dumc); end; return(csums); finish; /* pooling data for stationarity analyses. */ if test = 2 then do; totdata = {0}; do x = 1 to nrow(alldata); if (alldata[x,1] < 1000) then totdata=(totdata // alldata[x,1]); end; totdata = totdata[2:nrow(totdata),1]; freqst = j(ncodes,ncodes,0); do y = 1 to nrow(totdata); if ( y + lag <= nrow(totdata) ) then; freqst[(totdata[y,1]),(totdata[(y+lag),1])] = freqst[(totdata[y,1]),(totdata[(y+lag),1])] + 1; end; end; out = j(1,((ncodes*ncodes)+1),0); each = j(1,(ncodes*ncodes),0); start = 2; lastone = 0; do a = 1 to nrow(alldata); freqs = j(ncodes,ncodes,0); data = 0; do b = start to nrow(alldata); if ( alldata[b,1] < 1000 ) then do; data = ( data // alldata[b,1] ); fin = b; end; if ( alldata[b,1] >= 1000 ) then do; start = b + 1; b = nrow(alldata); end; end; data = data[2:nrow(data),1]; lastone = ( lastone // data[nrow(data),1] ); freqs = j(ncodes,ncodes,0); do c = 1 to nrow(data); if ( c + lag <= nrow(data) ) then freqs[(data[c,1]),(data[(c+lag),1])] = freqs[(data[c,1]),(data[(c+lag),1])] + 1; end; each = ( each // shape(freqs,1,(ncodes*ncodes)) ); if ( fin = nrow(alldata) ) then a = nrow(alldata); end; lastone = ( lastone[2:nrow(lastone),1] // lastone[nrow(lastone),1] ); csumeach = j(1,ncol(each)); do t = 1 to ncol(each); dumc = each[,t]; csumeach[1,t]=sum(dumc); end; if ( test = 1) then freqst = shape(csumeach,ncodes,ncodes); rowtott =j(nrow(freqst),1); do t = 1 to ncodes; dumr = freqst[t,]; rowtott[t,1]=sum(dumr); end; pp = j(ncodes,ncodes,0); each = ( each[2:nrow(each),] // shape(freqst,1,ncol(each)) ); /* initializing. */ lrx2sh = {0}; do d = 1 to nrow(each); lrx2t = {0}; freqs = shape(each[d,],ncodes,ncodes); rowtots = rsum(freqs); coltots = csum(freqs); ntrans = sum(rowtots); prows = rowtots / ntrans; pcols = coltots / ntrans; p = j(ncodes,ncodes,-9999); et = j(ncodes,ncodes,-9999); expfreq = j(ncodes,ncodes,-9999); zadjres = j(ncodes,ncodes,-9999); pzadjres = j(ncodes,ncodes,1); yulesq = j(ncodes,ncodes,-9999); var = j(ncodes,ncodes,-9999); min = j(ncodes,ncodes,-9999); kappa = j(ncodes,ncodes,-9999); zkappa = j(ncodes,ncodes,-9999); pzkappa = j(ncodes,ncodes,1); n = ntrans + 1; nr = rowtots; nr[lastone[d,1]] = nr[lastone[d,1]] + 1; pnr = nr / n; /* Warning message for specification error. */ if ( adjacent=0 & any(diag(freqs)) = 1) then do; print,"WARNING"; print"You have indicated that consecutive or adjacent codes can never repeat,"; print"(adjacent = 0), yet repeating codes have been found in the data. See the"; print"main diagonal of the frequency matrix. This specification error will "; print"result in faulty computations for LRX2, z-values, and adjusted residuals."; end; do i = 1 to ncodes; do j = 1 to ncodes; /* Note: more refined computations for when adjacent codes cannot repeat appear below, after the above 2 loops are completed. */ if (adjacent = 0 & (ntrans - rowtots[i,1]) > 0 ) then; pcols[1,j] = coltots[1,j] / (ntrans - rowtots[i,1] ); if (adjacent = 0 & (ntrans - rowtots[j,1]) > 0 ) then; expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / (ntrans - rowtots[j,1]); if (adjacent = 0 & (n - nr[i,1]) > 0) then; et[i,j] = (nr[i,1] * nr[j,1]) / (n - nr[i,1]) ; if (adjacent = 1) then do; et[i,j] = (nr[i,1] * nr[j,1]) / n; expfreq[i,j] = ( rowtots[i,1] * coltots[1,j]) / ntrans ; end; /* transitional probabilities */ if (rowtots[i,1] > 0) then; p[i,j] = freqs[i,j] / rowtots[i,1] ; /* tablewise LRX2 */ if ( freqs[i,j] > 0 & expfreq[i,j] > 0) then; lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j])); /* LRX2 for stationarity/homogeneity. */ if ( d < nrow(each) ) then do; if ( rowtott[i,1] > 0) then pp[i,j] = freqst[i,j] / rowtott[i,1]; if ( (p[i,j] > 0 & pp[i,j]) > 0) then; lrx2sh = lrx2sh + 2 * ( freqs[i,j] * log( p[i,j] / pp[i,j] ) ); end; /* adjusted residuals (z values) & sig levels */ if ( (expfreq[i,j]*(1-pcols[1,j])*(1-prows[i,1])) > 0) then do; zadjres[i,j]=(freqs[i,j]-expfreq[i,j]) /sqrt( expfreq[i,j]*(1-pcols[1,j]) * (1-prows[i,1]) ); pzadjres[i,j] = (1 - probnorm(abs(zadjres[i,j])) ) * tailed; end; /* Yule's Q; see Bakeman & Gottman, 1997, p 129 */ aaa = freqs[i,j]; bbb = rowtots[i,1] - freqs[i,j]; ccc = coltots[1,j] - freqs[i,j]; ddd = ntrans - rowtots[i,1] - coltots[1,j] + freqs[i,j]; if ( (aaa*ddd + bbb*ccc) > 0 ) then; yulesq[i,j] = ( aaa*ddd - bbb*ccc ) / (aaa*ddd + bbb*ccc); /* kappas, z values & sig levels */ var[i,j]=(nr[i,1]*nr[j,1]*(n-nr[j,1])*(n-nr[i,1]))/(n##2*(n-1)); if (var[i,j] > 0) then do; zkappa[i,j] = ( freqs[i,j] - et[i,j] ) / sqrt(var[i,j]); if ( nr[i,1] <= nr[j,1] ) then min[i,j] = nr[i,1]; else min[i,j] = nr[j,1]; kappa[i,j] = ( freqs[i,j] - et[i,j] ) / ( min[i,j] - et[i,j] ); if ( kappa[i,j] < 0 ) then do; kappa[i,j] = ( freqs[i,j] - et[i,j] ) / et[i,j]; pzkappa[i,j] = (1 - probnorm(abs(zkappa[i,j]))) * tailed; end; end; end; end; if (adjacent = 0 | adjacent = 2) then do; /* maximum likelihood estimation of the expected cell frequencies using iterative proportional fitting (Wickens, 1989, pp. 107-112). */ rsumsf = rsum(freqs); csumsf = csum(freqs); onezero = j(ncodes,ncodes,1); do i = 1 to ncodes; do j = 1 to ncodes; if (i = j) then onezero[i,j] = 0; end; end; /* the 6 previous commands create a matrix of ones and zeros that is used in estimating the expected cell frequencies. A "1" indicates that the expected frequency for a given cell is to be estimated, whereas a "0" indicates that the expected frequency for the cell should NOT be estimated, typically because it is a structural zero (codes that cannot follow one another). By default, the matrix that is created by the above commands has zeros along the main diagonal, and ones everywhere else, which will be appropriate for most data sets. However, if your data happen to involve structural zeros that occur in cells other than the cells along the main diagonal, then you must create a ONEZERO matrix with ones and zeros that is appropriate for your data before running any of the commands below. Enter your ONEZERO matrix now. */ expfreq = onezero; rdiffs = rsumsf - rsum(expfreq); cdiffs = csumsf - csum(expfreq); do ipfloop=1 to 100 until ( max(rdiffs) < .0001 & max(cdiffs) < .0001); /* adjusting by row. */ xr = j(ncodes,1,0); rsumse = rsum(expfreq); do r = 1 to ncodes; if (rsumse[r,1] > 0) then xr[r,1] = rsumsf[r,1] / rsumse[r,1]; end; do i = 1 to ncodes; do j = 1 to ncodes; if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xr[i,1]; end; end; /* adjusting by collumn. */ xc = j(1,ncodes,0); csumse = csum(expfreq); do c = 1 to ncodes; if (csumse[1,c] > 0) then xc[1,c] = csumsf[1,c] / csumse[1,c]; end; do i = 1 to ncodes; do j = 1 to ncodes; if ( onezero[i,j] = 1 ) then expfreq[i,j] = expfreq[i,j] * xc[1,j]; end; end; rdiffs = rsumsf - rsum(expfreq); cdiffs = csumsf - csum(expfreq); end; print "Maximum likelihood estimation of the expected cell"; print "frequencies using iterative proportional fitting"; if ( max(rdiffs) < .0001 & max(cdiffs) < .0001) then; print "converged after the following number of interations:", ipfloop; else print "did NOT converge after the following number of interations:",ipfloop; /* tablewise LRX2 */ lrx2t = {0}; do i = 1 to ncodes; do j = 1 to ncodes; if ( freqs[i,j] > 0 & expfreq[i,j] > 0) then; lrx2t = lrx2t + 2 * (freqs[i,j] * log(freqs[i,j]/expfreq[i,j])); end; end; /* adjusted residuals for matrices with structural zeros (Christensen, 1997, p. 357) */ /* constructing the design matrix */ x = j(ncodes**2,1,1); y = j(ncodes**2,ncodes-1,0); z = j(ncodes**2,ncodes-1,0); do i = 1 to ncodes - 1; do j = 1 to ncodes; y[i*ncodes+j,i] = 1; z[ (((j-1)*ncodes)+(i+1)),i] = 1; end; end; des1 = ( x || y || z ); /* pruning values corresponding to cells with structural zeros */ onezero2 = shape(onezero,ncodes**2,1); dm1 = shape(expfreq,ncodes**2,1); dm2 = j(1,1,-9999); des2 = j(1,ncol(des1),-9999); do ppp = 1 to ncodes**2; if (onezero2[ppp,1] = 1) then do; dm2 = ( dm2 // dm1 [ppp,1] ); des2 = ( des2 // des1[ppp,] ); end; end; dm2 = dm2[2:nrow(dm2),1]; des2 = des2[2:nrow(des2),]; dm2 = diag(dm2); if ( det(t(des2) * dm2 * des2) ^= 0) then do; a = des2 * (inv(t(des2) * dm2 * des2)) * t(des2) * dm2; acounter = 1; do i = 1 to ncodes; do j = 1 to ncodes; if ( onezero[i,j] ^= 0) then do; zadjres[i,j] = ( freqs[i,j] - expfreq[i,j] ) / sqrt( expfreq[i,j] * (1 - a[acounter,acounter]) ); acounter = acounter + 1; end; end; end; end; if ( det(t(des2) * dm2 * des2) = 0) then do; reset spaces = 2; print, "A nonsingular matrix has been identified, which means"; print "that proper adjusted residuals cannot be computed for"; print "this data, probably because there are no values for"; print "one or more codes. Try recoding using sequential"; print "integers, and redo the analyses. The adjusted"; print "residuals that are printed below are based on"; print "equation 5 from Bakeman & Quera (1995, p. 274), and"; print "are close approximations to the proper values."; print "The procedures recommended by Bakeman & Quera"; print "(1995, p. 276), Haberman (1979), and Christensen"; print "(1997) cannot be conducted with nonsingular matrices."; end; do i = 1 to ncodes; do j = 1 to ncodes; if ( onezero[i,j] = 0) then do; zadjres [i,j] = 0; yulesq [i,j] = 0; kappa [i,j] = 0; zkappa [i,j] = 0; pzadjres[i,j] = 1; pzkappa [i,j] = 1; end; end; end; end; if (outfile = 1) then outf = freqs; if (outfile = 2) then outf = p; if (outfile = 3) then outf = zadjres; if (outfile = 4) then outf = yulesq; if (outfile = 5) then outf = kappa; out = ( out // (d || (shape(outf,1,(ncodes*ncodes)))) ); print, "Requested 'tail' (1 or 2) for Significance Tests =", tailed; b = labels[1,1:ncodes]; if (d < nrow(each) & output = 2) then print, "Group/Segment Number:", d; if (d = nrow(each) ) then do; print,,,,, "Pooled Data:"; N =(nrow(each)-1); print, "Number of cases/groups/segments:", N; dfsh = ncodes * (ncodes - 1) * ((nrow(each)-1) - 1); plrx2sh = 1 - probchi(abs(lrx2sh),dfsh); lrx2 = (lrx2sh || dfsh || plrx2sh); print, "Likelihood Ratio (Chi-Square) Test of Homogeneity/Stationarity", lrx2[colname={LRX2 df sig} format=12.4];; end; if (d = nrow(each) | output = 2) then do; b = labels[1,1:ncodes]; bb = ( b || {Totals}); transf = ( (freqs || rowtots) // (coltots || ntrans) ); print , "Cell Frequencies, Row & Collumn Totals, & N" , transf [rowname=bb colname=bb format=7.0]; if (adjacent = 0 | adjacent = 2) then do; print, "The processed ONEZERO matrix appears below."; print, "In the ONEZERO matrix, a 0 indicates a structural"; print, "zero, and a 1 indicates that an expected cell"; print, "frequency will be estimated."; print, "ONEZERO matrix:", onezero[rowname=b colname=b format=5.4]; end; print , "Expected Values/Frequencies", expfreq[rowname=b colname=b format=5.4]; print , "Transitional Probabilities" , p[rowname=b colname=b format=5.4]; if (adjacent = 1) then dft = (ncodes - 1)##2; else dft = (ncodes - 1)##2 - (ncodes**2 - sum(onezero)); plrx2t = 1 - probchi(abs(lrx2t),dft); lrx = (lrx2t || dft || plrx2t); print, "Tablewise Likelihood Ratio (Chi-Square) Test", lrx[colname={LRX2 df sig} format=12.4]; print, "Adjusted Residuals", zadjres[rowname=b colname=b format=6.3]; print, "Significance Levels for the Adjusted Residuals", pzadjres[rowname=b colname=b format=5.4]; print, "Yule's Q Values", yulesq[rowname=b colname=b format=6.3]; print, "Unidirectional Kappas", kappa[rowname=b colname=b format=5.4]; print, "z values for the Unidirectional Kappas", zkappa[rowname=b colname=b format=6.3]; print, "Significance Levels for the Unidirectional Kappas", pzkappa[rowname=b colname=b format=5.4]; end; end; out = out[2:nrow(out),]; print, "Requested output data", out[format=7.0]; /* The following CREATE command will save the above-specified matrix output data in a SAS file that can be accessed for further processing. You must provide a legal file name for your system and then "activate" the CREATE command, which is currently a mere comment. */ create dataset from out; append from out; quit; proc means data=dataset; run;