/*------------------------------------------------------------------------------ File: ProcFreqCubed.sas Author: John Sabel Washington State Education Research and Data Center john.sabel@ofm.wa.gov Creation date: 2011-11-07 Version: 1.4 Purpose Analyzes a SAS dataset by running PROC FREQ against every column in association with the user defined Key column. Whereas PROC FREQ by itself produces 1st and 2nd order contingency tables, this macro can produce a sort of 3rd order contingency table. If there are columns that you do not want included in the analysis, then use a drop statement for DsnIn. See examples at bottom of this source code listing. Parameters DsnIn: The one or two-level dataset name that is inputted. DsnOut: The one or two-level dataset name that is created. Key: (Optional) The key variable name. It can be either numeric or character. When not included, the result is a single record of data that contains a series of 1st order contingency tables one for each variable in DsnIn. RegExFilter: (Optional) A Perl Regular Expression used to filter the which columns %ProcFreqCubed will analyze. In essence, it works as a customizable, programmable drop statement. The default is /./, which acts as no filter at all, so by default all columns in the DsnIn data set will be analyzed. When you do use this regular expression filter consider using the "I" option in order to make the filter case insensitive. RegExFilter_2: (Optional) Like RegExFilter. Sometimes writing two complemtary RegEx expressions is easier than writing one comprehensive RegEx expresion. ExcludeColumns: (Optional) One or more column names that you do not want to be analyzed using ProcFreqCubed. FieldLength: (Optional) The column length of the columns that contain frequency information in the output dataset. If a field contains more information than what can be contained in the given FieldLength, than the resulsts are truncated. Default is 512. Debug: (Optional) Y = Don't delete intermidiate datasets. N = Delete intermidiate datasets. Default is N. Signature %ProcFreqCubed(DsnIn=, DsnOut=, Key=, RegExFilter=/./I, RegExFilter_2=/./I, FieldLength=512, Debug=N) Modification Log Version Person Date Description 1.0 JRS 2011-11-07 Macro posted to website. 1.01 JRS 2011-12-08 Name of macro changed from "FrequeciesCubed.sas" to "ProcFreqCubed.sas." 1.1 JRS 2012-05-08 Added _FREQ_ column to outputted data set. 1.2 JRS 2014-02-11 Added the RegExFilter and RegExFilter_2 parameters. 1.3 JRS 2014-03-03 Made the Key parameter optional. 1.4 JRS 2015-05-20 Increased capacity for number of potential "Key" column values from 999 to 9999. ------------------------------------------------------------------------------*/ %macro ProcFreqCubed(DsnIn=, DsnOut=, Key=, RegExFilter=/./I, RegExFilter_2=/./I, FieldLength=512, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %if &Debug.~=N %then %put _local_; options noquotelenmax; %global KeyType2; %local FreqDsn; %local QuoteOrNot QuotedExcludeColumns; %if %sysfunc(exist(%scan(&DsnIn.,1,%str(%()))) ne 1 %then %do; %put %str(ERR)OR: (&sysmacroname.) The %upcase(%scan(&DsnIn.,1,%str(%())) dataset; %put %str(ERR)OR: (&sysmacroname.) (DsnIn parameter) does not exist.; %goto exit; %end; /* %if %length(&ExcludeColumns.) > 0 %then %do;*/ /* %let QuotedExcludeColumns=%sysfunc(COMPBL(&ExcludeColumns.));*/ /* %let QuotedExcludeColumns=%sysfunc(prxchange(s/ /%str(",")/, -1, &QuotedExcludeColumns.));*/ /* %let QuotedExcludeColumns="%upcase(&QuotedExcludeColumns.)";*/ /* %end;*/ %if &Key.=%then %do; %let Key=NULL; %end; %if &Key.=NULL %then %do; data __ProcFreqDsnIn; length __FauxKey $4; set &DsnIn.; /* __FauxKey='NULL';*/ run; %let DsnIn=__ProcFreqDsnIn; %let Key=__FauxKey; %end; /* This program uses proc sql against SAS's dictionary tables to gather some of the metadata of the parameter DsnIn.. Since DsnIn might be a two-level name, rather than having to parse it, the program just creates an empty temporary table just like DsnIn, and puts it in the WORK directory. */ proc sql; CREATE TABLE __TempDsnIn LIKE &DsnIn. ; quit; proc sql noprint; title "&DsnIn. - &Key. Variable Type"; SELECT UPCASE(type) INTO :KeyType FROM DICTIONARY.COLUMNS WHERE UPCASE(LIBNAME) = 'WORK' AND UPCASE(MEMNAME) = "%upcase(__TempDsnIn)" AND UPCASE(NAME) = "%upcase(&Key.)" ; quit; %let KeyType2=&KeyType.; %if &sqlobs=0 %then %do; %put %str(ERR)OR:(&sysmacroname.) The Key = &Key. parameter does not exist.; %put %str(ERR)OR: (&sysmacroname.) This macro will now terminate.; %goto exit; %end; proc sql noprint; title "&DsnIn. Column Names and Formats"; SELECT NAME, FORMAT INTO :Name1-:Name9999, :Format1-:Format9999 FROM DICTIONARY.COLUMNS WHERE UPCASE(LIBNAME) = 'WORK' AND UPCASE(MEMNAME) = "%upcase(__TempDsnIn)" AND UPCASE(NAME) <> "%upcase(&Key.)" AND prxmatch("&RegExFilter.", NAME) > 0 AND prxmatch("&RegExFilter_2.", NAME) > 0 /* %if %length(&ExcludeColumns.) > 0 %then %do;*/ /* AND UPCASE(NAME) NOT IN (&QuotedExcludeColumns.)*/ /* %end;*/ ; %let NumVars=&sqlobs.; quit; %put &NumVars.; proc sql; CREATE TABLE __TempDsnOut ( &Key. CHAR(20), %do I = 1 %to &NumVars.; &&Name&I CHAR(&FieldLength.) %if &I.~=&NumVars. %then ,; %end; ) ; quit; /* Fill in table */ /* Loop over the variables */ %do I = 1 %to &NumVars.; proc sql noprint; title "&DsnIn. &key. Values"; SELECT DISTINCT &key. INTO :KeyValue1-:KeyValue9999 FROM &DsnIn. WHERE &&Name&I IS NOT NULL ; %let NumKeyValues=&sqlobs.; quit; %put NumKeyValues: &NumKeyValues.; /* Loop over the key values in the key field (e.g. the years in a school year field). */ %do J = 1 %to &NumKeyValues.; %if &KeyType.=CHAR %then %do; proc freq data=&DsnIn. noprint; where &key. = "&&KeyValue&J"; /* Could create a macro variable that either has quotes or not depending on if Key is numeric or character. */ title "&&KeyValue&J: &&Name&I"; tables &key.*&&Name&I /norow missing nocol out=__TempFreq; run; %end; %else %do; proc freq data=&DsnIn. noprint; where &key. = &&KeyValue&J; /* Could create a macro variable that either has quotes or not depending on if Key is numeric or character. */ title "&&KeyValue&J: &&Name&I"; tables &key.*&&Name&I /norow missing nocol out=__TempFreq; run; %end; /* For debugging purposes, we want to give a unique name to each PROC FREQ dataset, but we must ensure that each PROC FREQ dataset name does not exceed the SAS limit of 32 characters. When not in debug mode, the program will delete these files. */ %let FreqDsn=__Freq&J._&&Name&I; %if %length(&FreqDsn.) > 32 %then %let FreqDsn=%substr(&FreqDsn.,1,32); data &FreqDsn.; set __tempfreq; if PERCENT >= 10 then Percent_Formattted=cats('(', round(PERCENT, 0.1), '%)'); else if PERCENT > 0.01 then Percent_Formattted=cats('(', round(PERCENT, 0.01), '%)'); else Percent_Formattted=cats('[', COUNT, ']'); %if "&&Format&I" ="" %then %do; Percent_Formattted = cat(&&Name&I, ' ', Percent_Formattted); Count_Formattted = cat(&&Name&I, ' [', COUNT, ']'); %end; %else %do; Percent_Formattted = cat(put(&&Name&I, &&Format&I), ' ', Percent_Formattted); Count_Formattted = cat(put(&&Name&I, &&Format&I), ' [', COUNT, ']'); %end; run; proc sql noprint; title "&DsnIn. Distinct &&Name&I, COUNT, PERCENT, Percent_Formattted, Count_Formattted"; SELECT distinct &&Name&I, COUNT, PERCENT, Percent_Formattted, Count_Formattted INTO :VarValue1-:VarValue9999, :Count1-:Count9999, :Percentage1-:Percentage9999, :Percent_Formattted1-:Percent_Formattted9999, :Count_Formattted1-:Count_Formattted9999 FROM &FreqDsn. ORDER BY &&Name&I ; %let NumVarsInAKey=&sqlobs.; quit; %let ValueStatement=; %do K = 1 %to &NumVarsInAKey.; %let ValueStatement=%bquote(%unquote(&ValueStatement. &&Percent_Formattted&K)); /* %put _user_;*/ %if &K.~=&NumVarsInAKey.%then %let ValueStatement=%bquote(%unquote(&ValueStatement.,)); %if %LENGTH((&ValueStatement.))>&FieldLength. %then %do; %let ValueStatement =RESULTS TRUNCATED: %QSUBSTR((&ValueStatement.),1,%EVAL(%LENGTH((&ValueStatement.))-19)); %let K=&NumVarsInAKey.; %end; %end; proc sql; INSERT INTO __TempDsnOut (&key., &&Name&I) VALUES ("&&KeyValue&J", "&ValueStatement."); /* VALUES ("%bquote(&&KeyValue&J)", "%bquote(&ValueStatement.)");*/ quit; %if &Debug.=N %then %do; proc sql; DROP TABLE &FreqDsn.; quit; %end; %end; %end; proc sort data=__TempDsnOut; by &Key.; run; data __TempDsnOut01; set __TempDsnOut (rename= ( %do I=1 %to &NumVars.; &&Name&I..=__Temp&I. %end; )); by &key.; length %do I=1 %to &NumVars.; &&Name&I $&FieldLength. %end;; retain _CHAR_; %do I=1 %to &NumVars.; if __Temp&I. ~= '' then &&Name&I.. = __Temp&I.; %end; if last.&Key. then do; output; %do I=1 %to &NumVars.; &&Name&I.. = ''; %end; end; drop %do I=1 %to &NumVars.; __Temp&I. %end;; run; /* data _JustTheKey(rename=(KeyAsChar=&Key.)); set &DsnIn.(keep=&Key.); dummy=1; if vtype(&Key.) = 'C' then KeyAsChar = &Key.; else KeyAsChar = strip(put(&Key., 16.)); drop &Key.; run; */ proc sql noprint; SELECT type INTO :KeyType FROM DICTIONARY.COLUMNS WHERE UPCASE(LIBNAME) = 'WORK' AND UPCASE(MEMNAME) = "%upcase(__TempDsnIn)" AND UPCASE(NAME) = "%upcase(&Key.)" ; quit; %put &KeyType; proc sql; CREATE TABLE &DsnOut. %if &Key.=__FauxKey %then %do; (drop=__FauxKey) %end; AS SELECT RecordCountsByKey.&Key., RecordCountsByKey._FREQ_ %do I=1 %to &NumVars.; , __TempDsnOut01.&&Name&I %end; FROM __TempDsnOut01 RIGHT JOIN ( SELECT &Key., COUNT(*) AS _FREQ_ FROM &DsnIn. GROUP BY &Key. ) RecordCountsByKey /* If Key in DsnIn is a character field */ %if %upcase(&KeyType.) =CHAR %then ON __TempDsnOut01.&Key. = RecordCountsByKey.&Key.; /* Else Key is numeric */ %else ON input(__TempDsnOut01.&Key., 32.) = RecordCountsByKey.&Key.; ; quit; %if &Debug.=N %then %do; proc sql; DROP TABLE __tempdsnin; DROP TABLE __tempdsnout; DROP TABLE __tempdsnout01; DROP TABLE __tempfreq; quit; %if &Key.=__FauxKey %then %do; proc sql; DROP TABLE __ProcFreqDsnIn; quit; %end; %end; %exit: options quotelenmax; %if &Debug.~=N %then %put _local_; %put --- End of %upcase(&sysmacroname) macro; %mend ProcFreqCubed; /* data SampleDataIn; input id type $ sex $ age shoesize price animal $ bdate : mmddyy10.; *anydtdte.; format bdate date9.; datalines; 0 . F 20 . . . . . 1 flat M 17 11 3.01 cow 05/30/1965 2 flat F 101 9 99.01 sheep 05/30/1965 3 mountain F 19 9 3.02 goat 10/25/1915 4 mountain M 19 13 77.98 yak 05/30/1965 5 flat F 17 22 3.02 cow 10/14/1582 6 flat F 19 22 3.02 cow 10/14/1582 7 mountain F 3 1 77.98 yak 05/30/1965 ; run; * This example shows that the "Key" parameter can be a character variable. It also shows that for the "DsnIn" parameter, you can can use any of the options available in a datastep "SET" statement.; %ProcFreqCubed(DsnIn=SampleDataIn (drop=id), DsnOut=SampleDataOut_ByType, Key=type, FieldLength=512, Debug=N) * This shows not using a Key parameter. This results in single record of information.; %ProcFreqCubed(DsnIn=SampleDataIn (drop=id), DsnOut=SampleDataOut_NoKey, Key=, FieldLength=512, Debug=N) * Same example as above, but filtering which variables are analyzed using a regular expression. In this case, the hat, "^", means begining with, so the combination of "/^a/" means all variables that being with "a". The following "I" makes the regular expression case insensitive.; %ProcFreqCubed(DsnIn=SampleDataIn (drop=id), DsnOut=SampleDataOut_ByType2, Key=type, RegExFilter=/^a/I, FieldLength=512, Debug=N) * This example shows that the "Key" parameter can be a numeric variable.; %ProcFreqCubed(DsnIn=SampleDataIn, DsnOut=SampleDataOut_ByAge, Key=age, FieldLength=512, Debug=N) */