/*----------------------------------------------------------------------------- File: DeleteSuperfluousRecords.sas Author: John Sabel Washington State Education Research and Data Center Creation date: 10/8/13 Version: 0.91 Purpose Deletes superfluous records from a data set. A superfluous record here means a record that contains information that is a strict subset of another record. A typical use is with name data, where one record has a name with a complete middle name, and other records have just the middle initial, or no initial at all. In this case, the record with the middle name has more complete information than the other records, so this macro will delete the other records. For example, say you have two records with the same person id. These records differ only in that one has a middle initial and one does not. In this case, the record without the middle initial is a subset of the record with an initial, so it is deleted. Parameters DsnIn: The one or two-level dataset name that is inputted. ColumnsIn: The list of columns that that defines a record, in effect the list of columns that defines the primary key. The last column contains what is incomplete information on some records, for example, the middle name. Except for the last column being the last column, he order of the other columns do not matter. DsnOut: The one or two-level dataset name that is created. Debug: (Optional) Y = Don't delete intermidiate datasets. Also, don't delete the data set, __Temp_TheSuperfluousRecords, which contains the superfluous, deleted records. N = Delete intermidiate datasets as well as __Temp_TheSuperfluousRecords. Default is N. Revisions: Date Version Author Comments 2013-11-25 0.91 John 1) Version 0.9 had a bug where if two records the same up to and including the last column, instead of one of these two records being deleted, another, unrelated record was deleted. This version fixes that bug. 2)Changed macro name from DeleteSuperfluousNames.sas to DeleteSuperfluousRecords.sas 3) When the debug parameter is set to Y added the list of local macro variables and their values are outputted to the log. -----------------------------------------------------------------------------*/ %macro DeleteSuperfluousRecords(DsnIn=, ColumnsIn=, DsnOut=, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %local _UltimateColumn _PenultimateColumn _AllButUltimateColumn; %let _UltimateColumn = %scan(&ColumnsIn, -1, %str( )); %let _PenultimateColumn = %sysfunc(reverse(%scan(%sysfunc(reverse(&ColumnsIn.)), 2, %str( )))); %let _AllButUltimateColumn = %substr(&ColumnsIn, 1, %EVAL(%INDEX(&ColumnsIn,&_UltimateColumn.) - 2)); data __Temp_01_DeleteSuperfluousRecs; set &DsnIn.; run; proc sort data=__Temp_01_DeleteSuperfluousRecs; by &ColumnsIn.; run; data __Temp_02_DeleteSuperfluousRecs; length _RcrdNummie 8.; set __Temp_01_DeleteSuperfluousRecs; by &ColumnsIn.; retain _UltimateColumnCount; _RcrdNummie = _n_; _UltimateColumnLag = lag(&_UltimateColumn.); if first.&_PenultimateColumn then _UltimateColumnCount = 0; else do; if strip(&_UltimateColumn.) =: strip(_UltimateColumnLag) or strip(_UltimateColumnLag) = '' then _UltimateColumnCount = _UltimateColumnCount + 1; else _UltimateColumnCount = 0; end; run; /* proc sort data=__Temp_02_DeleteSuperfluousRecs;*/ /* by &_AllButUltimateColumn. descending &_UltimateColumn.;*/ /* run;*/ proc sort data=__Temp_02_DeleteSuperfluousRecs; by &_AllButUltimateColumn. descending _RcrdNummie; run; data &DsnOut. %if &Debug.=N %then %do; (drop= _RcrdNummie _UltimateColumnCount _UltimateColumnLag _UltimateColumnCount _UltmateColumnCountLag) %end; __Temp_TheSuperfluousRecords; set __Temp_02_DeleteSuperfluousRecs; _UltmateColumnCountLag = lag(_UltimateColumnCount); if _UltimateColumnCount < _UltmateColumnCountLag then output __Temp_TheSuperfluousRecords; else output &DsnOut.; run; %if &Debug.=N %then %do; proc sql; DROP TABLE __Temp_01_DeleteSuperfluousRecs; DROP TABLE __Temp_02_DeleteSuperfluousRecs; DROP TABLE __Temp_TheSuperfluousRecords; quit; %end; %if &Debug.~=N %then %put _local_; %put --- End of %upcase(&sysmacroname) macro; %mend DeleteSuperfluousRecords; /* * Sample data and %DeleteSuperfluousRecords() macro calls.; * This sample data contains three superfluous records: 1) The Karen White record that does not have a middle name. 2) The Jack Brown record that does not have a middle name. 3) The Jack Brown record that has only the middle initial "B". 4) Two Sal Orange records, one with a shorter middle name than the other two, another where is an duplicate of the other. data SampleDataMiddleName; input PersonID LastName $ FirstName $ MiddleName $ DOB $; datalines; 1 White Karen . 5/5/2005 1 White Karen M 5/5/2005 2 Brown Jack . 6/6/2006 2 Brown Jack B 6/6/2006 2 Brown Jack BJ 6/6/2006 2 Brown Jack ZY 6/6/2006 3 Orange Sal A 6/6/2006 3 Orange Sal AB 6/6/2006 3 Orange Sal AB 6/6/2006 4 Green Lisa May 7/7/2007 ; run; %DeleteSuperfluousRecords(DsnIn=SampleDataMiddleName, ColumnsIn=PersonID DOB LastName FirstName MiddleName, DsnOut=SampleDataMiddleName_Out, Debug=N) * LastName, FirstName only example; data SampleDataFirstName; input PersonID LastName $ FirstName $ DOB $ FipsCountyCode $; datalines; 1 White K 5/5/2005 53007 1 White Karen 5/5/2005 53007 2 Brown Jack 6/6/2006 53027 2 Brown J 6/6/2006 53027 2 Brown Ja 6/6/2006 53027 2 Brown JK 6/6/2006 53031 3 Green Lisa 7/7/2007 53035 ; run; %DeleteSuperfluousRecords(DsnIn=SampleDataFirstName, ColumnsIn=PersonID DOB FipsCountyCode LastName FirstName, DsnOut=SampleDataFirstName_Out, Debug=N) */