/*------------------------------------------------------------------------------ File: CreateCrosslinkID.sas Author: John Sabel john.sabel@ofm.wa.gov Washington State Education Research & Data Center Creation date: 2014-02-07 Version: 1.0 Purpose Resolves many-to-many relationships between two identifiers into a single identifier, the "CrossLinkID". When linking data sets together, many identity matching packages utilize the use of one-to-many matches. For example an ID=1 might match to ID=A and ID=B, and ID=2 might match to ID=C and ID=D. But it could happen where along with ID=1, ID=2 also matchesID=B, converting these two one-to-many matches into a single many-to-many match. Because of the crosslinking between these IDs, they must represent the same "thing", therefore the same "CrossLinkID" should assigned to each of these IDs. That is what this program does. For further explanation, see example code at bottom of source code listing. In the outputted data set, this program also produces some summary statistics showing how much each ID has been deduplicated in the process of creating the crosslink id. This file contains two macros: 1) %CreateCrosslinkID() - The main macro. It is called by the user. 2) %_CreateCrosslinkID_RecurseFunc() - A helper macro, initially called by %CreateCrosslinkID(), and then called recursively by itself. This macro should not be called by the user. Parameters DsnIn: The name of the data set containing two ID columns. CrosslinkID: The name of the outputted crosslink ID. It is a numeric field. ID_A: The name of the first ID. It can either be a character or a numeric field. ID_B: The name of the second ID. It can either be a character or a numeric field. DsnOut: The name of the outputted data set. Along with the Crosslink_ID ID_A, and ID_B parameter fields, it also contains two fields. 1) &ID_A._Count - A count of the number ID_A's per CrosslinkID. This shows to what extent &ID_A. has been deduplicated. 2) &ID_B._Count - A count of the number ID_B's per CrosslinkID. This shows to what extent &ID_B. has been deduplicated. Debug (Optional) When Debug=Y, the intermediate data sets _TempTable, _ID_A_Counts, and _ID_B_Counts are not deleted. Signature %CreateCrosslinkID(DsnIn=, DsnOut=, CrosslinkID=, ID_A=, ID_B=, Debug=N) Revisions: Date Version Author Comments 2014-04-15 0.91 John To %CreateCrosslinkID() added a proc sort with NODUPLICATES clause. To %_CreateCrosslinkID_RecurseFunc() added to existing proc sort statement a NODUPLICATES clause. 2015-05-11 0.911 John Added added "%if &Debug.~=N %then %put _local_;" statements to beginning and end of macros. 2015-07-02 1.0 John Where ID_A has no mate, or ID_B has no mate, made their respective CrosslinkIDs negative numbers. Also, when ID_A has no mate, made the &ID_B._Count "0", and analogously for ID_B. ------------------------------------------------------------------------------*/ %macro CreateCrosslinkID(DsnIn=, CrosslinkID=, ID_A=, ID_B=, DsnOut=, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %if &Debug.~=N %then %put _local_; /* %let DsnIn=SampleData_2; %let CrosslinkID=CrosslinkID; %let ID_A=A; %let ID_B=B; %let DsnOut=SampleData_2_Out; %let Debug=N; */ /* Some ID_As might not be paired with any ID_Bs, and some ID_Bs might not be paired with any ID_As. So pull these out, and give them negative CrosslinkIDs. */ proc sql; CREATE TABLE _CreateCrosslink_NoMate_01 AS SELECT DISTINCT &ID_A., &ID_B., 1 AS &ID_A._Count, 0 AS &ID_B._Count FROM &DsnIn. WHERE &ID_B. IS NULL UNION ALL SELECT DISTINCT &ID_A., &ID_B., 0 AS &ID_A._Count, 1 AS &ID_B._Count FROM &DsnIn. WHERE &ID_A. IS NULL ORDER BY &ID_A. DESC, &ID_B. DESC ; quit; data _CreateCrosslink_NoMate_02; length &CrosslinkID. 8; set _CreateCrosslink_NoMate_01; &CrosslinkID.=-_n_; run; proc sql; CREATE TABLE _TempTable AS SELECT DISTINCT . AS &CrosslinkID., &ID_A., &ID_B. FROM &DsnIn. WHERE &ID_A. IS NOT NULL AND &ID_B. IS NOT NULL ORDER BY &ID_A., &ID_B. ; quit; data _TempTable (drop=_CrosslinkID_Temp); set _TempTable; by &ID_A.; retain _CrosslinkID_Temp 0; if first.&ID_A. then _CrosslinkID_Temp = _CrosslinkID_Temp + 1; &CrosslinkID. = _CrosslinkID_Temp; run; %_CreateCrosslinkID_RecurseFunc(Dsn=_TempTable, ID_A=&ID_B., ID_B=&ID_A., CrosslinkID=&CrosslinkID., Debug=&Debug.) proc sort In=_TempTable out=_TempTable_Deduped noduplicates; by &ID_A. &CrosslinkID.; run; proc sql; CREATE TABLE _ID_A_Counts AS SELECT &CrosslinkID., COUNT(&CrosslinkID.) AS &ID_A._Count FROM ( SELECT DISTINCT &CrosslinkID., &ID_A. FROM _TempTable_Deduped ) GROUP BY &CrosslinkID. ; quit; proc sql; CREATE TABLE _ID_B_Counts AS SELECT &CrosslinkID., COUNT(&CrosslinkID.) AS &ID_B._Count FROM ( SELECT DISTINCT &CrosslinkID., &ID_B. FROM _TempTable_Deduped ) GROUP BY &CrosslinkID. ; quit; proc sql; CREATE TABLE &DsnOut. AS SELECT _TempTable_Deduped.*, &ID_A._Count, &ID_B._Count FROM _TempTable_Deduped INNER JOIN _ID_A_Counts ON _TempTable_Deduped.&CrosslinkID. = _ID_A_Counts.&CrosslinkID. INNER JOIN _ID_B_Counts ON _TempTable_Deduped.&CrosslinkID. = _ID_B_Counts.&CrosslinkID. UNION ALL SELECT * FROM _CreateCrosslink_NoMate_02 ORDER BY &CrosslinkID., &ID_A., &ID_B. ; quit; proc sql; DROP TABLE _CreateCrosslink_NoMate_01, _CreateCrosslink_NoMate_02, _TempTable, _TempTable_Deduped, _ID_A_Counts, _ID_B_Counts; quit; %if &Debug.~=N %then %put _local_; %put --- End of %upcase(&sysmacroname) macro; %mend CreateCrosslinkID; /*----------------------------------------------------------------------------- macro: _CreateCrosslinkID_RecurseFunc() - A helper macro called by CreateCrosslinkID(). ------------------------------------------------------------------------------*/ %macro _CreateCrosslinkID_RecurseFunc(Dsn=, CrosslinkID=, ID_A=, ID_B=, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; %if &Debug.~=N %then %put _local_; /* Initially assume this call will result in no changes. */ %let RecurseAgain=No; proc sort data = &Dsn. noduplicates; by &ID_A. &CrosslinkID.; run; data &Dsn. (drop= _GroupCrosslinkID %if &Debug.=N %then %do;_&CrosslinkID._Old %end;); set &Dsn.; by &ID_A. &CrosslinkID.; retain _GroupCrosslinkID; if first.&ID_A. then _GroupCrosslinkID = &CrosslinkID.; _&CrosslinkID._Old = &CrosslinkID.; if &CrosslinkID. ~= _GroupCrosslinkID then do; &CrosslinkID. = _GroupCrosslinkID; call symputx('RecurseAgain', 'Yes', 'F'); end; run; /* If a change has occurred then make recursive call to this function. Flop ID_A and ID_B */ %if &RecurseAgain.=Yes %then %_CreateCrosslinkID_RecurseFunc(Dsn=&Dsn., ID_A=&ID_B., ID_B=&ID_A., CrosslinkID=&CrosslinkID. ,Debug=&Debug.); %put --- End of %upcase(&sysmacroname) macro; %if &Debug.~=N %then %put _local_; %mend _CreateCrosslinkID_RecurseFunc; /* * Contains a single many-to-many relationship between every ID in data set; data SampleData_1; input A B $; datalines; 1 A 2 B 2 C 3 D 4 D ; run; %CreateCrosslinkID(DsnIn=SampleData_1, CrosslinkID=Crosslink_ID, ID_A=A, ID_B=B, DsnOut=SampleData_1_Out, Debug=N) * Represents one many-to-many relationship, one one-to-many relationship, and one many-to-one relationship; data SampleData_2; input A B $; datalines; -2 . -1 . 0 . 1 A 1 B 2 B 2 C 3 D 3 E 3 F 4 G 5 G . H . I . J ; run; %CreateCrosslinkID(DsnIn=SampleData_2, DsnOut=SampleData_2_Out, CrosslinkID=Crosslink_ID, ID_A=A, ID_B=B, Debug=N) */