/*----------------------------------------------------------------------------- File: DivineDisparateDedupLinks.sas Author: John Sabel Washington State Education Research & Data Center Creation date: 4/28/11 Version: 0.91 Purpose Compares two different ways that had been used to deduplicate a dataset. Creates a list of instances were the two methods did not link the same records. An example with sample data and a call to DivineDisparateDedupLinks.sas is commented out at the bottom of this file. Parameters KeyName: Name of a primary key. Field can be numeric or character. DedupKeyName_1: Name of the the 1st method's duduplication key. Field can be numeric or character. DedupKeyName_2: Name of the the 2nd method's duduplication key. Field can be numeric or character. DsnIn_1: One or two-level name of the dataset that contains the 1st method's deduplicated key. DsnIn_2: One or two-level name of the dataset that contains the 2nd method's deduplicated key. If your deduplicated dataset, DsnIn_1, contains both methods duduplication keys, then DsnIn_2 would be equal to DsnIn_1. But if you do use different datasets, they must use the same primary key defined by 'Key' above. DsnOut: One or two-level name of the output dataset. Debug: (Optional) Y = Don't delete intermidiate datasets. N = Delete intermidiate datasets. Default is N. Signature %DivineDisparateDedupLinks(KeyName=, DedupKeyName_1=, DedupKeyName_2= , DsnIn_1=, DsnIn_2=, DsnOut=, Debug=N) Modification Log Version Person Date Description 1.0 JRS 12/24/13 There was a bug when DsnIn_1 and DsnIn_2 pointed to different datasets, and the datasets had non-overlapping keys. This fixes the bug. -----------------------------------------------------------------------------*/ %macro DivineDisparateDedupLinks(KeyName=, DedupKeyName_1=, DedupKeyName_2= , DsnIn_1=, DsnIn_2=, DsnOut=, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %local KeyStatement NumOrChar; proc sql; CREATE TABLE __Dataset_1 AS SELECT DISTINCT &DedupKeyName_1., &KeyName. FROM &DsnIn_1. WHERE &KeyName. IN ( SELECT &KeyName. FROM &DsnIn_2. ) ORDER BY &DedupKeyName_1., &KeyName. ; CREATE TABLE __Dataset_2 AS SELECT DISTINCT &DedupKeyName_2., &KeyName. FROM &DsnIn_2. WHERE &KeyName. IN ( SELECT &KeyName. FROM &DsnIn_1. ) ORDER BY &DedupKeyName_2., &KeyName. ; quit; /* This section allows KeyName to be either a num field or a char field. */ proc sql noprint; /* Because DsnIn_1 might be a two-level name, first create the empty dataset __Temp_EmptyShell as a one level name. */ CREATE TABLE __Temp_EmptyShell LIKE &DsnIn_1. ; /* Determine if KeyName is a Num or Char field from empty dataset __Temp_EmptyShell. */ SELECT UPCASE(TYPE) INTO :NumOrChar FROM DICTIONARY.COLUMNS WHERE UPCASE(LIBNAME) = 'WORK' AND UPCASE(MEMNAME) = '__Temp_EmptyShell' AND UPCASE(NAME) = "%upcase(&KeyName.)" ; quit; %if &NumOrChar.=NUM %then %let KeyStatement=put(&KeyName., 10.); %else %let KeyStatement=&KeyName.; data __Denormalized_1 (drop = &KeyName.); set __Dataset_1; by &DedupKeyName_1.; length AllKeys $100; retain KeyCount_1 0 AllKeys ''; if first.&DedupKeyName_1. then do; AllKeys = strip(&KeyStatement.); KeyCount_1 = 1; end; else do; AllKeys = catx('-', AllKeys, &KeyStatement.); KeyCount_1 = KeyCount_1 + 1; end; if last.&DedupKeyName_1. then output; run; data __Denormalized_2 (drop = &KeyName.); set __Dataset_2; by &DedupKeyName_2.; length AllKeys $100; retain KeyCount_2 0 AllKeys ''; if first.&DedupKeyName_2. then do; AllKeys = strip(&KeyStatement.); KeyCount_2 = 1; end; else do; AllKeys = catx('-', AllKeys, &KeyStatement.); KeyCount_2 = KeyCount_2 + 1; end; if last.&DedupKeyName_2. then Output; run; proc sql; CREATE TABLE __CommonLinks AS SELECT __Denormalized_1.&DedupKeyName_1., __Denormalized_2.&DedupKeyName_2. FROM __Denormalized_1 INNER JOIN __Denormalized_2 ON __Denormalized_1.AllKeys = __Denormalized_2.AllKeys ; quit; proc sql; CREATE TABLE &DsnOut. AS SELECT distinct __Denormalized_1.&DedupKeyName_1., Denormalized_2a.&DedupKeyName_2., __Denormalized_1.KeyCount_1, Denormalized_2a.KeyCount_2 FROM &DsnIn_1. DsnIn_1 INNER JOIN __Denormalized_1 ON DsnIn_1.&DedupKeyName_1. = __Denormalized_1.&DedupKeyName_1. INNER JOIN ( SELECT DsnIn_2.&KeyName., __Denormalized_2.&DedupKeyName_2., __Denormalized_2.KeyCount_2 FROM &DsnIn_2. DsnIn_2 INNER JOIN __Denormalized_2 ON DsnIn_2.&DedupKeyName_2. = __Denormalized_2.&DedupKeyName_2. ) Denormalized_2a ON DsnIn_1.&KeyName. = Denormalized_2a.&KeyName. LEFT JOIN __CommonLinks ON DsnIn_1.&DedupKeyName_1. = __CommonLinks.&DedupKeyName_1. WHERE __CommonLinks.&DedupKeyName_1. IS NULL ; quit; %if &Debug.=N %then %do; proc sql; DROP TABLE __Temp_EmptyShell, __Dataset_1, __Dataset_2, __Denormalized_1, __Denormalized_2, __CommonLinks; quit; %end; %else %put _user_; %put --- End of %upcase(&sysmacroname) macro; %mend DivineDisparateDedupLinks; /* data SampleDedupData; input DedupId_1 $ DedupId_2 ID; datalines; A100 111 1 A101 112 2 A102 113 3 A103 113 4 A104 114 5 A104 115 6 A105 116 7 A105 116 8 A106 116 9 ; run; %DivineDisparateDedupLinks(KeyName=ID, DedupKeyName_1=DedupId_1, DedupKeyName_2=DedupId_2, DsnIn_1=SampleDedupData, DsnIn_2=SampleDedupData, DsnOut=Difference, Debug=N) */