/*----------------------------------------------------------------------------- File: StandardizeNames.sas Author: John Sabel john.sabel@ofm.wa.gov Washington State Education Research & Data Center Creation date: 2/14/11 Version: 1.3 Purpose This macro convert the first, last and middle names into a "standard form". Among other things, this means all names are upper case, names contain only alphabetic characters, if a suffix (JR, III, etc.) is embedded in a first or last name, the suffix is placed into its own field, etc. This macro requires that the DsnIn data set contain at least a first name and a last name variable. Other variables for middle name, suffix and nickname, are not required, but will be in the outputted DsnOut data set. Parameters DsnIn: The two level name of a data set that contains first, last and middle names. DsnOut: The name of the outputted data set. FirstName: The name of the first name fieldin the "In" dataset. The default is "FirstName." MiddleName: (Optional) The name of the middle name field in the "In" dataset. The default is "MiddleName." If the DsnIn dataset doesn't contain a MiddleName field, one will be created for the DsnOut dataset. LastName: The name of the last name field in the "In" dataset. The default is "LastName." Suffix: (Optional) The name of the suffix field in the "In" dataset. The default is "Suffix." If the DsnIn dataset doesn't contain a Suffix field, one will be created for the DsnOut dataset. Nickname: (Optional) The name of the Nickname field in the "In" dataset. The default is "Nickname." If the DsnIn dataset doesn't contain a Nickname field, one will be created for the DsnOut dataset. Debug: (Optional) Y = Don't delete intermidiate datasets. Keep instrumentation variables. N = Delete intermidiate datasets and instrumentation variables. Default is "N". Signature %StandardizeNames(DsnIn=, DsnOut=, FirstName=FirstName, MiddleName=MiddleName, LastName=LastName, Nickname=Nickname, Suffix=Suffix, Debug=N) Revisions: Date Version Author Comments 2012-03-26 1.1 John 1) Changed macro name from %StandardizeName to %NormalizeNames. 2) Added code to convert extended ASCII latin letters (e.g. "Ñ") to alphabetic characters. Added code to convert embedded zeros to O's. Added code to delete 2ND's, 3RD's, 4TH's, etc. 2012-04-18 1.11 John Changed name from %NormalizeNames to %StandardizeNames. 2012-10-19 1.12 John Added code to strip names again near end of data step "__ImprovedNames_NoSuffix." Also, added code to make out non-printable characters into spaces. 2012-11-15 1.13 John Added code to strip suffixes in middle names that follow a word in the middle name, and then place these suffixes in the Suffix field. 2013-02-20 1.14 John Where records in DsnIn already have a valid suffix, will not try to strip any suffix candidates from the name fields. Also, reformatted test data into a CSV format, and created an expected data set. 2013-03-01 1.2 John In some data (NSC) some first names have suffixes directly appended to them. Added code to account for these known cases (see format $FirstNameWithSuffix below). 2014-11-05 1.3 John Added code to dynamically create MiddleName and/or Nickname fields when one or both don't exist in the DsnIn data set. -----------------------------------------------------------------------------*/ %macro StandardizeNames(DsnIn=, DsnOut=, FirstName=FirstName, MiddleName=MiddleName, LastName=LastName, Nickname=Nickname, Suffix=Suffix, Debug=N); %put --- Start of %upcase(&sysmacroname) macro; /* Generally ERDC data sources don't append suffixes directly to first names. The one occasional exception has been NSC data. These are the NSC exceptions to date. */ proc format; value $FirstNameWithSuffix 'ABELJR' = 'ABEL JR' 'ALEXANDRJR' = 'ALEXANDR JR' 'ALFREDOJR' = 'ALFREDO JR' 'AMADOJR' = 'AMADO JR' 'ANDREWII' = 'ANDREW II' 'ANTRONJR' = 'ANTRON JR' 'BERNARDIV' = 'BERNARD IV' 'BLUEJR' = 'BLUE JR' 'CHALLANII' = 'CHALLAN II' 'CRAIGJR' = 'CRAIG JR' 'DALEJR' = 'DALE JR' 'DANILOJR' = 'DANILO JR' 'DAVIDJR' = 'DAVID JR' 'DEREKJR' = 'DEREK JR' 'DERRICKII' = 'DERRICK II' 'DOMINGOJR' = 'DOMINGO JR' 'DONALDJR' = 'DONALD JR' 'DONII' = 'DON II' 'DWAYNEJR' = 'DWAYNE JR' 'EDWARDJR' = 'EDWARD JR' 'ERICJR' = 'ERIC JR' 'ERNESTJR' = 'ERNEST JR' 'FAUESEJR' = 'FAUESE JR' 'FELIPEJR' = 'FELIPE JR' 'FRANKLINJR' = 'FRANKLIN JR' 'FREDERICKJR' = 'FREDERICK JR' 'FREDJR' = 'FRED JR' 'GABRIELJR' = 'GABRIEL JR' 'GEORGEJR' = 'GEORGE JR' 'GERARDJR' = 'GERARD JR' 'IGNACIOJR' = 'IGNACIO JR' 'ISTVANJR' = 'ISTVAN JR' 'JAMESII' = 'JAMES II' 'JAMESJR' = 'JAMES JR' 'JEFFREYJR' = 'JEFFREY JR' 'JIMMYII' = 'JIMMY II' 'JOHNJR' = 'JOHN JR' 'JONATHANJR' = 'JONATHAN JR' 'JOSEJR' = 'JOSE JR' 'JOSEPHJR' = 'JOSEPH JR' 'JUANJR' = 'JUAN JR' 'KEVINJR' = 'KEVIN JR' 'KURTNATHANJR' = 'KURTNATHAN JR' 'LAWRENCEJR' = 'LAWRENCE JR' 'LIEULOJR' = 'LIEULO JR' 'LUCIANOJR' = 'LUCIANO JR' 'LUKEJR' = 'LUKE JR' 'MANUELJR' = 'MANUEL JR' 'MARKJR' = 'MARK JR' 'MATTHEWJR' = 'MATTHEW JR' 'MICHAELJR' = 'MICHAEL JR' 'PATRICKJR' = 'PATRICK JR' 'PAULJR' = 'PAUL JR' 'PETERJR' = 'PETER JR' 'RAFAELJR' = 'RAFAEL JR' 'RAULJR' = 'RAUL JR' 'RAYMONDJR' = 'RAYMOND JR' 'RICHARDJR' = 'RICHARD JR' 'ROBERTJR' = 'ROBERT JR' 'ROBERTOJR' = 'ROBERTO JR' 'ROGELIOII' = 'ROGELIO II' 'SCOTTJR' = 'SCOTT JR' 'SERGEJR' = 'SERGE JR' 'STEPHENJR' = 'STEPHEN JR' 'STEVENJR' = 'STEVEN JR' 'TERRYNCEJR' = 'TERRYNCE JR' 'TOUSSAINTJR' = 'TOUSSAINT JR' 'VIKTORJR' = 'VIKTOR JR' 'WILLIAMJR' = 'WILLIAM JR'; run; /* Determine if &Suffix. exists in DsnIn. If not, create &Suffix. below. */ proc sql noprint; CREATE TABLE __DsnInVariables LIKE &DsnIn.; quit; proc sql noprint; title "Does &MiddleName. Exist in &DsnIn?"; SELECT * FROM DICTIONARY.COLUMNS WHERE LIBNAME = 'WORK' AND MEMNAME = "%upcase(__DsnInVariables)" AND UPCASE(NAME) = "%upcase(&MiddleName.)" ; %let _DsnInHasMiddleNameField = &sqlobs.; quit; proc sql noprint; title "Does &Suffix. Exist in &DsnIn?"; SELECT * FROM DICTIONARY.COLUMNS WHERE LIBNAME = 'WORK' AND MEMNAME = "%upcase(__DsnInVariables)" AND UPCASE(NAME) = "%upcase(&Suffix.)" ; %let _DsnInHasSuffixField = &sqlobs.; quit; proc sql noprint; title "Does &Nickname. Exist in &DsnIn?"; SELECT * FROM DICTIONARY.COLUMNS WHERE LIBNAME = 'WORK' AND MEMNAME = "%upcase(__DsnInVariables)" AND UPCASE(NAME) = "%upcase(&Nickname.)" ; %let _DsnInHasNicknameField = &sqlobs.; quit; title; data __ImprovedNames_NoSuffix (drop = _i _aname __re1 __re2); set &DsnIn.; /* Dynamically creates MiddleName and/or Suffix and/or Nickname variables when not extant in the DsnIn data set. */ %if &_DsnInHasMiddleNameField. ~= 1 | &_DsnInHasSuffixField. ~= 1 | &_DsnInHasNicknameField. ~= 1 %then %do; length %if &_DsnInHasMiddleNameField. ~= 1 %then %do; &MiddleName. $30 %end; %if &_DsnInHasSuffixField. ~= 1 %then %do; &Suffix. $5 %end; %if &_DsnInHasNicknameField. ~= 1 %then %do; &Nickname. $30 %end; ; /* Closes out the "length" statement. */ %if &_DsnInHasMiddleNameField. ~= 1 %then %do; &MiddleName. = ''; %end; %if &_DsnInHasSuffixField. ~= 1 %then %do; &Suffix. = ''; %end; %if &_DsnInHasNicknameField. ~= 1 %then %do; &Nickname. = ''; %end; %end; /* %if &_DsnInHasSuffixField. ~= 1 %then %do;*/ /* length &Suffix. $5;*/ /* &Suffix. = ''; * Kind of a kludge. Prevents an uninitialized*/ /* variable note occurring in the log;*/ /* %end;*/ /**/ /* %if &_DsnInHasNicknameField. ~= 1 %then %do;*/ /* length &Nickname. $5;*/ /* &Nickname. = ''; * Kind of a kludge. Prevents an uninitialized*/ /* variable note occurring in the log;*/ /* %end;*/ retain __re1 __re2 _reParen; if _N_ =1 then do; __re1 = prxparse("s/[^\sA-Z]//"); __re2 = prxparse("s/-+/ /"); end; /* &Suffix. = ''; */ _OrigLast = &LastName.; _OrigFirst = &FirstName.; _OrigMiddle = &MiddleName.; _OrigSuffix = &Suffix.; /* Initialize instrumentation variables. */ _n = _N_; _NonPrintable = 0; _Upcased = 0; _Striped = 0; _Nth = 0; _DeLatinized = 0; _OpenParenthesis = 0; _ConvertedDashes = 0; _DeletedApostrophe = 0; _DeletedPairedParen = 0; _DeletedNonAlphaZero = 0; _NULLsInParens = 0; _OpenLeftParen = 0; _DeleteParenARoundName = 0; _SprFlsMI = 0; _Nickname = 0; _SprflsParenLstNmWrd = 0; _ZeroToO = 0; _DeletedZero = 0; _CHS = 0; _Compbled = 0; _Other = 0; _NullName = 0; /* The following instrumentation variables are not used in this data step, but in the data steps below. */ _SquishedLast = 0; _NewSuffix = 0; _MovedToMI = 0; _DeletedSecondFirstName = 0; array __name{4} &FirstName. &MiddleName. &Nickname. &LastName.; /* Make all characters upper case. */ do _i = 1 to 4; _aname = __name{_i}; _aname = upcase(_aname); if _aname ~= __name{_i} then _Upcased = _Upcased + 1; __name{_i} = _aname; end; /* Strip all names. */ do _i = 1 to 4; _aname = __name{_i}; _aname = strip(_aname); if _aname ~= __name{_i} then _Striped = _Striped + 1; __name{_i} = _aname; end; /* JRS: Added 3/13/12 */ /* Delete these sorts or words: 2nd, 3rd, 4th, 5th, etc. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/([\(\s]|^)(1ST|2ND|3RD|[4-9]TH)([\)\s]|$)//', -1, _aname); if _aname ~= __name{_i} then _Nth = _Nth + 1; __name{_i} = _aname; end; /* Convert extended ASCII letters in alphabetic characters. */ do _i = 1 to 4; _aname = __name{_i}; _aname = translate(_aname, 'AAAAAA', 'ÀÁÂÃÄÅ'); _aname = translate(_aname, 'C', 'Ç'); _aname = translate(_aname, 'EEEE', 'ÈÉÊË'); _aname = translate(_aname, 'IIII', 'ÌÍÎÏ'); _aname = translate(_aname, 'N', 'Ñ'); _aname = translate(_aname, 'OOOOO', 'ÒÓÔÕÖ'); _aname = translate(_aname, 'UUUU', 'ÙÚÛÜ'); if _aname ~= __name{_i} then _DeLatinized = _DeLatinized + 1; __name{_i} = _aname; end; /* Delete all open parentheses followed by zero or more characters. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/(\([^\)]*)$//', -1, strip(_aname)); if _aname ~= __name{_i} then _OpenLeftParen = _OpenLeftParen + 1; __name{_i} = _aname; end; /* Convert dashes to spaces. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/-/ /', -1, _aname); if _aname ~= __name{_i} then _ConvertedDashes = _ConvertedDashes + 1; __name{_i} = _aname; end; /* JRS: Added 11/25/12 */ /* Delete apostrophes */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange("s/'//", -1, _aname); if _aname ~= __name{_i} then _DeletedApostrophe = _DeletedApostrophe + 1; __name{_i} = _aname; end; /* Any character that isn't an alphabetic character a white space, character, a parenthesis or a zero, convert to a space. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/[^\s0A-Z\(\)]/ /', -1, _aname); if _aname ~= __name{_i} then _DeletedNonAlphaZero = _DeletedNonAlphaZero + 1; __name{_i} = _aname; end; /* JRS: Added 10/19/12, modified 11/15/12. Make all non-printable characters spaces. Not just stripping mostly because of tabs. Needs to be before compbl() statement below but after the delatinization (extended ASCII) statements above. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/[\x00-\x1F|\x7F-\xFF]/ /', -1, _aname); if _aname ~= __name{_i} then _NonPrintable = _NonPrintable + 1; __name{_i} = _aname; end; /* Parentheses... */ /* Single word names enclosed by parentheses, delete parentheses. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/(^\()(\.*)(\))$/$2/', -1, _aname); if _aname ~= __name{_i} then _DeleteParenARoundName= _DeleteParenARoundName + 1; __name{_i} = _aname; end; /* Deletes superfluous MI if MI is the first letter in following string enclosed by parentheses. */ if length(scanq(&MiddleName., 1, ' ()')) = 1 and index(&MiddleName., '(') then do; _MI = scanq(&MiddleName., 1, ' ()'); /* put _MI=;*/ _temp = scanq(&MiddleName., 2, '()'); /* put _temp=;*/ if _MI = substr(scanq(&MiddleName., 2, '()'), 1, 1) then do; &MiddleName. = scanq(&MiddleName., 2, '()'); _SprFlsMI = _SprFlsMI+1; end; end; if prxmatch('/\w+.*\(.*\)/', &FirstName.) then do; _MOOO = &FirstName.; /* put _MOOO=;*/ &Nickname. = scanq(strip(&FirstName.), -1, '()'); /* &Nickname. = prxchange('s/\((.*)\)/$1/', 1, &FirstName.);*/ &FirstName. = prxchange('s/\(.*\)//', -1, &FirstName.); _Nickname = _Nickname + 1; end; /* Last name contains superfluous word in parentheses, delete word and parentheses. */ if _N_ = 1 then _reParen = prxparse('/\(.*\)/'); /* CASTERLINE SABEL (CASTERLINE) becomes CASTERLINE SABEL */ if prxmatch(_reParen, &LastName.) then do; _PossibleSuperfluousWord=scanq(strip(&LastName.), -1, '()'); /* put _PossibleSuperfluousWord=;*/ /* _PossibleSuperfluousWord = prxposn(_reParen, 1, &LastName.);*/ /* put _PossibleSuperfluousWord=;*/ /* _PossibleSuperfluousWord2 = compress(prxposn(_reParen, 1, &LastName.), "()");*/ /* put _PossibleSuperfluousWord2=;*/ if indexw(&LastName., _PossibleSuperfluousWord) then do; &LastName. = prxchange('s/\(.*\)//', -1, &LastName.); _SprflsParenLstNmWrd = _SprflsParenLstNmWrd + 1; end; end; /* Delete (NULL) from all names. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/\(NULL\)/ /', -1, _aname); if _aname ~= __name{_i} then _NULLsInParens = _NULLsInParens + 1; __name{_i} = _aname; end; /* Convert remaining parentheses to spaces. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/(\(|\))/ /', -1, _aname); if _aname ~= __name{_i} then _NULLsInParens = _NULLsInParens + 1; __name{_i} = _aname; end; /**/ /* &Nickname. = prxchange('s/(\()(.*)(\))/$2/', -1, &LastName.);*/ /* &LastName. = prxchange('s/\(.*\)//', -1, &LastName.;*/ /* _NicknameFromMiddleName = _NicknameFromMiddleName + 1;*/ /* end;*/ /* Convert embedded zeros to O's */ do _i = 1 to 4; _aname = __name{_i}; /* A letter followed by two zeros. */ _aname = prxchange('s/([A-Z])(00)/$1OO/', -1, _aname); /* A single zero embedded within letters. */ _aname = prxchange('s/([A-Z])(0)([A-Z])/$1O$3/', -1, _aname); /* Two or more letters followed by a zero. */ _aname = prxchange('s/([A-Z]{2,})(0)/$1O/', -1, _aname); /* A zero followed by two or more letters. */ _aname = prxchange('s/(0)([A-Z]{2,})/O$2/', -1, _aname); if _aname ~= __name{_i} then _ZeroToO = _ZeroToO + 1; __name{_i} = _aname; end; /* Delete all remaining zeros. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/0//', -1, _aname); if _aname ~= __name{_i} then _DeletedZero = _DeletedZero + 1; __name{_i} = _aname; end; /* Delete "CHS ##" where ## is a number. Seen in last names of ERDC K12 data. This operation is not in the the Normalize Names document since this is specific to ERDC data. */ do _i = 1 to 4; _aname = __name{_i}; _aname = prxchange('s/^CHS\s\d+//', -1, _aname); if _aname ~= __name{_i} then _CHS = _CHS + 1; __name{_i} = _aname; end; /* Convert strings of spaces into single spaces. */ do _i = 1 to 4; _aname = __name{_i}; _aname = compbl(_aname); if _aname ~= __name{_i} then _Compbled = _Compbled + 1; __name{_i} = _aname; end; /* Delete UNKNOWNs and ESTATEs. */ do _i = 1 to 4; _aname = __name{_i}; _aname = tranwrd(_aname,'UNKNOWN',''); if prxmatch('/ESTATE/', _aname) then _aname = ''; if _aname ~= __name{_i} then _Other = _Other + 1; __name{_i} = _aname; end; /* Delete paired ("()") parentheses*/ /* do _i = 1 to 4;*/ /* _aname = __name{_i};*/ /* _aname = prxchange('s/\(\s*\)//', -1, _aname);*/ /* if _aname ~= __name{_i} then _DeletedPairedParen = _DeletedPairedParen + 1;*/ /* __name{_i} = _aname;*/ /* end;*/ /* JRS: Added 10/19/12 Strip all names once more. */ do _i = 1 to 4; _aname = __name{_i}; _aname = strip(_aname); if _aname ~= __name{_i} then _Striped = _Striped + 1; __name{_i} = _aname; end; /* Delete NULLs from first, middle, and nick names. Note: NULL can be a legitamite last name. */ do _i = 1 to 3; _aname = __name{_i}; if _aname= 'NULL' then _aname = ''; if _aname ~= __name{_i} then _NullName = _NullName + 1; __name{_i} = _aname; end; /* The "s" at beginning of expression means substitute. \b = word boundry, such as a space or the beginning of a field. \s = a space character. $1 is the word, be it "AL" or "DE", etc. that lies in the parenthesis. So this expression removes the space that follows any of the "words" listed within the paranthesis. */ _aname = &LastName.; _aname = prxchange('s/\b(AL|DE|DEL|DER|EL|MC|LA|LE|MAC|ST|VON|VONDER|VAN|VANDER)\s/$1/',-1,_aname); _aname = prxchange('s/\b(AL|DE|DEL|DER|EL|MC|LA|LE|MAC|ST|VON|VONDER|VAN|VANDER)\s/$1/',-1,_aname); if _aname ~= &LastName. then _SquishedLast = 1; &LastName. = _aname; run; /* Moves &Suffix.es from first and last names to their own field. */ data __ImprovedNames_Suffix (drop= __re __reIII pattern position length); set __ImprovedNames_NoSuffix; /* Pattern for a name that ends with a space plus a &Suffix.. Used with UB04 data. */ if _N_ = 1 then do; retain __re __reIII; pattern = "/( II| III| IV| V| VI| VII| VIII| ESQ| JR| SR)$/i"; __re = prxparse(pattern); __reIII = prxparse('/III$/'); end; /* JRS: Added 11/26/12. */ /* Delete anything that isn't a generational suffix from the suffix field. */ &Suffix. = strip(upcase(&Suffix.)); &Suffix. = prxchange('s/[^A-Z]//', -1, &Suffix.); if &Suffix. not in('II','III','IV','V','VI','VII','VIII','JR','SR','ESQ') then &Suffix. = ''; /* JRS: Revised 3/1/2013 In some NSC data, suffixes were directly appended to first name. The fomrat $FirstNameWithSuffix. seperates out the suffixes from these known cases. The longest member of $FirstNameWithSuffix is 13 characters, so we only use the format when the length of &FirstName. is <= 13. (I found that the longest length SAS allows in the format is $FirstNameWithSuffix40., so though simplier, you can't just apply the format to all FirstNames because theoretically you would truncate some of them to 40 characters.) */ if length(&FirstName.) <= 13 then &FirstName. = put(&FirstName., $FirstNameWithSuffix40.); /* put &FirstName.=;*/ /* If a good suffix doesn't already exist, extract what can be extracted from name fields. */ if &Suffix. = '' then do; call prxsubstr(__re, TRIM(&FirstName.), position, length); if position ^= 0 then do; &Suffix. = substr(&FirstName., position + 1, length - 1); &FirstName. = substr(&FirstName., 1, position - 1); end; /* Strip out suffixes that follow a word in the middle name and place in Suffix field. JRS: Added 11/15/12. */ call prxsubstr(__re, TRIM(&MiddleName.), position, length); if position ^= 0 then do; &Suffix. = substr(&MiddleName., position + 1, length - 1); &MiddleName. = substr(&MiddleName., 1, position - 1); end; /* Used where only one or two characters make up first name. All UB92 data, a little UB04 data. Intentionally excludes II, IV, V, VI and SR since these could be abbreviated first names. */ else if TRIM(&FirstName.) in ('III','VII','VIII','JR') then do; &Suffix. = &FirstName.; &FirstName. = &MiddleName.; &MiddleName. = ''; end; call prxsubstr(__re, TRIM(&LastName.), position, length); if position ^= 0 and &LastName. ne 'NASR' then do; &Suffix. = substr(&LastName., position + 1, length - 1); &LastName. = substr(&LastName., 1, position - 1); end; /* Traditionally, the 'II' is used to differentiate between members in the same family that have the same name, but do not have a father/son relationship. That said, the 'II' suffix is sometimes used in a father/son relationship. These exceptions seem to be concentrated amoung hispanics. As a result, the line below is commented out in order to preserve the suffix 'II'. */ *if &Suffix.='II' then &Suffix.='JR'; /* For cases where the first name terminates in 'III'. */ if &Suffix. = '' then do; call prxsubstr(__reIII, TRIM(&FirstName.), position, length); if position ^= 0 and lengthn(&Firstname.) >= 7 then do; &Suffix. = 'III'; &FirstName. = trim(substr(&FirstName., 1, position - 1)); end; end; /* For cases where the last name terminates in 'III'. */ if &Suffix. = '' then do; call prxsubstr(__reIII, TRIM(&LastName.), position, length); if position ^= 0 and lengthn(&LastName.) >= 7 then do; &Suffix. = 'III'; &LastName. = trim(substr(&LastName., 1, position - 1)); end; end; if &Suffix. = '' and &MiddleName. in ('III','VII','VIII','JR') then do; &Suffix. = &MiddleName.; &MiddleName. = ''; end; end; if upcase(_OrigSuffix) ~= &Suffix. then _NewSuffix = 1; run; %if &Debug. = N %then %do; proc sql; DROP TABLE __DsnInVariables, __ImprovedNames_NoSuffix; quit; %end; /* This does two things: 1) If second word of a first name is a single letter that doesn't follow a dash and middle name is missing, moves that single letter to the middle name. 2) If second word of a first name is same as the middle name, then delete the second word of the first name. */ data __ImprovedNames_FirstLast; set __ImprovedNames_Suffix; /* Causes the _SecondFirstName to contain all the words from the second and beyond. */ if index(strip(&FirstName.), ' ') > 1 then _SecondFirstName = strip(substr(&FirstName., index(&FirstName., ' ') + 1)); if lengthn(_SecondFirstName) = 1 and indexc(_OrigFirst, '-') = 0 then do; &FirstName. = scanq(&FirstName.,1); /* if &MiddleName. doesn't contain any letters, move single letter 2nd first name to middle name. */ if not(anyalpha(&MiddleName.)) then do; &MiddleName. = _SecondFirstName; _MovedToMI = 1; end; end; /* e.g. (First name:)T JOSEPH (Middle name:)JOSEPH will become (First name:)T (Middle name:)JOSEPH. */ if lengthn(_SecondFirstName) > 1 and _SecondFirstName = compress(&MiddleName.) then do; &FirstName. = scanq(&FirstName.,1); _DeletedSecondFirstName = 1; end; if &MiddleName. in ('NMI', 'NMN') then &MiddleName. = ''; run; %if &Debug. = N %then %do; proc sql; DROP TABLE __ImprovedNames_Suffix; quit; %end; data &DsnOut.; set __ImprovedNames_FirstLast; %if &Debug. = N %then %do; drop _:; %end; run; %if &Debug. = N %then %do; proc sql; DROP TABLE __ImprovedNames_FirstLast; quit; %end; %put --- End of %upcase(&sysmacroname) macro; %mend StandardizeNames; /* * Test data: ; data TestName_In; infile datalines delimiter=',' dsd truncover; input first_name :$20. middle_name :$20. last_name :$30. nick_name :$20. suffix :$5. problem :$96.; datalines; Robert,Lewis,Stevenson,,,Names have lower case letters JOAN D'ARC,,FRANCAIS,,,Apostrophe JILL,ADEL,ABAR--KLEMENS,,,Two dashes in LastName JOHN (2ND),RUDULF,REINDEER,,,FirstName contains '(2nd)'. Preserve parentheses ÀÁÂÃÄÅ Ç,ÈÉÊË Ñ,ÒÓÔÕÖ ÙÚÛÜ,,,Diacritics in all names (OH MY!),OH BE,JOYFUL!,,,Exclamation points in FirstName and LastName. Preserve parentheses JOHN,CHS 10 R,SABEL,,,CHS 10 in MiddleName JOHN,DEV0N,MO0 Z00,,,Zeros which should be O's in MiddleName and LastName S00 Y00,5TH,TRAN,,,Zeros which should be O's in FirstName PAC0 AMJE0,D,0LLIE 0'SHAUN,,,Zeros which should be O's in FirstName and LastName JOSEPH,T0,CONRAD,,,Zero concatenated to middle initial JOHN,NULL,NULL,,,Middle name contains 'Null'. No change to LastName JOHN,UNKNOWN,SOLDIER,,,Middle name contains 'UNKNOWN' JOHN JR,ROBERT,SABEL,,,Suffix in FirstName JOHN,JR,SABEL,,,Suffix is MiddleName JOHN,R.,SABEL JR,,,Suffix in LastName JOHN,,SABEL II,,,Suffix in LastName JILL,NMI,SABEL,,,NMI as MiddleName FRANCISO,GUERILLO,DE CASTILLOIII,,,Suffix concantenated to word in last name FRANCISO ESQ,GUERILLO,DE LA TRISTE,,,Suffix in FirstName JUSTICE,A IV,SMITH,,,Suffix in MiddleName JUSTICE,B JR,BROWN,,,Suffix in MiddleName ISR,,MOO,,,No change (FirstName contains a faux suffix) WILLEM,,VAN ORANJE,,,Space after 'van' T JOSEPH,JOSEPH,QUINTO,,,Middle name embedded in FirstName KING,,HENRY,,IV,No change KING,,HENRY,,Jr.,Period and lower case letter in suffix KING,,HENRY,,Sr.,Period and lower case letter in suffix KING,,HENRY,,Mr.,Non-suffix word in Suffix field KING,JR,HENRY,,IV,No change WILLIAMJR,,HENRY,,,Suffix appended to FirstName BILL (WILLIAM),MARC,ALLEN,,JR,Nickname in FirstName BILL (WILLIAM),MARC,ALLEN,,,Nickname in FirstName LUNA(LUC,,ADELE,,,FirstName contains a word with an only a left parethesis LUNA (LUC,,ADELE,,,FirstName contains a word with an only a left parethesis LUNA (LUCILLE),,ADELE,,,FirstName Nickname enclosed by parentheses BILL,MARC (WILLIAM),ALLEN,,JR,Alternate MiddleName encloses by parentheses BILL,MARC (WILLIAM),ALLEN,,,Alternate MiddleName encloses by parentheses NGUYET,(,TRAN,,,Open left parenthesis in MiddleName NGUYET,(SUSA,TRAN,,,Open parenthesis with additional letters in MiddleName NGUYET,(SUSAN),TRAN,,,MiddleName enclosed by parentheses NGUYET,JOY (SUSAN),TRAN,,,Second word in MiddleName enclosed by parentheses NGUYET,S(SUSAN),TRAN,,,"Superfluous ""S"" in MIddleName" AJ,C (CJ),BJ,,JR,"Superfluous ""C"" in MIddleName" AJ,(NULL),BJ,,JR,Parentheses enclosed NULL in MiddleName JOHN,ROBERT,CASTERLINE (SABEL),,,Second work in LastName is enclosed by parentheses JOHN,ROBERT,CASTERLINE SABEL (CASTERLINE),,,Third word in LastName is enclosed by parentheses and is superfluous WALTER,J0,WINK,,,Supefluous '0' in MiddleName ; run; %StandardizeNames(DsnIn=TestName_In, DsnOut=TestName_Out, FirstName=first_name, MiddleName=middle_name, LastName=last_name, Nickname=nick_name, Debug=N) * What the test data should look like after being fed through %StandardizeNames(). data TestName_Expected; infile datalines delimiter=',' dsd truncover; input first_name :$20. middle_name :$20. last_name :$30. nick_name :$20. suffix :$5. problem :$96.; datalines; ROBERT,LEWIS,STEVENSON,,,Names have lower case letters JOAN DARC,,FRANCAIS,,,Apostrophe JILL,ADEL,ABAR KLEMENS,,,Two dashes in LastName JOHN,RUDULF,REINDEER,,,FirstName contains '(2nd)'. Preserve parentheses AAAAAA,EEEE N,OOOOO UUUU,,,Diacritics in all names OH MY,OH BE,JOYFUL,,,Exclamation points in FirstName and LastName. Preserve parentheses JOHN,CHS R,SABEL,,,CHS 10 in MiddleName JOHN,DEVON,MOO ZOO,,,Zeros which should be O's in MiddleName and LastName SOO YOO,,TRAN,,,Zeros which should be O's in FirstName PACO AMJEO,D,OLLIE OSHAUN,,,Zeros which should be O's in FirstName and LastName JOSEPH,T,CONRAD,,,Zero concatenated to middle initial JOHN,,NULL,,,Middle name contains 'Null'. No change to LastName JOHN,,SOLDIER,,,Middle name contains 'UNKNOWN' JOHN,ROBERT,SABEL,,JR,Suffix in FirstName JOHN,,SABEL,,JR,Suffix is MiddleName JOHN,R,SABEL,,JR,Suffix in LastName JOHN,,SABEL,,II,Suffix in LastName JILL,,SABEL,,,NMI as MiddleName FRANCISO,GUERILLO,DECASTILLO,,III,Suffix concantenated to word in last name FRANCISO,GUERILLO,DELATRISTE,,ESQ,Suffix in FirstName JUSTICE,A,SMITH,,IV,Suffix in MiddleName JUSTICE,B,BROWN,,JR,Suffix in MiddleName ISR,,MOO,,,No change (FirstName contains a faux suffix) WILLEM,,VANORANJE,,,Space after 'van' T,JOSEPH,QUINTO,,,Middle name embedded in FirstName KING,,HENRY,,IV,No change KING,,HENRY,,JR,Period and lower case letter in suffix KING,,HENRY,,SR,Period and lower case letter in suffix KING,,HENRY,,,Non-suffix word in Suffix field KING,JR,HENRY,,IV,No change WILLIAM,,HENRY,,JR,Suffix appended to FirstName BILL,MARC,ALLEN,WILLIAM,JR,Nickname in FirstName BILL,MARC,ALLEN,WILLIAM,,Nickname in FirstName LUNA,,ADELE,,,FirstName contains a word with an only a left parethesis LUNA,,ADELE,,,FirstName contains a word with an only a left parethesis LUNA,,ADELE,LUCILLE,,FirstName Nickname enclosed by parentheses BILL,MARC WILLIAM,ALLEN,,JR,Alternate MiddleName encloses by parentheses BILL,MARC WILLIAM,ALLEN,,,Alternate MiddleName encloses by parentheses NGUYET,,TRAN,,,Open left parenthesis in MiddleName NGUYET,,TRAN,,,Open parenthesis with additional letters in MiddleName NGUYET,SUSAN,TRAN,,,MiddleName enclosed by parentheses NGUYET,JOY SUSAN,TRAN,,,Second word in MiddleName enclosed by parentheses NGUYET,SUSAN,TRAN,,,"Superfluous ""S"" in MIddleName" AJ,CJ,BJ,,JR,"Superfluous ""C"" in MIddleName" AJ,,BJ,,JR,Parentheses enclosed NULL in MiddleName JOHN,ROBERT,CASTERLINE SABEL,,,Second work in LastName is enclosed by parentheses JOHN,ROBERT,CASTERLINE SABEL,,,Third word in LastName is enclosed by parentheses and is superfluous WALTER,J,WINK,,,Supefluous '0' in MiddleName ; run; * Comparison of the test data with the results of feeding the test data thorugh %StandardizeNames(); proc compare base=TestName_Expected compare=TestName_Out briefsummary; run; */