/*-----------------------------------------------------------------------------

File: StandardizeNames.sas

Author:         John Sabel
                john.sabel@ofm.wa.gov
                Washington State Education Research & Data Center
Creation date:  2/14/11
Version:        1.3

Purpose
    This macro convert the first, last and middle names into a "standard
    form".  Among other things, this means all names are upper case, names
    contain only alphabetic characters, if a suffix (JR, III, etc.) is embedded
    in a first or last name, the suffix is placed into its own field, etc.

	This macro requires that the DsnIn data set contain at least a first name
    and a last name variable. Other variables for middle name, suffix and
    nickname, are not required, but will be in the outputted DsnOut data set.

Parameters
    DsnIn:      The two level name of a data set that contains first, last and
                middle names.
    DsnOut:     The name of the outputted data set.
    FirstName:  The name of the first name fieldin the "In" dataset.
                The default is "FirstName."
    MiddleName: (Optional) The name of the middle name field in the "In" dataset.
                The default is "MiddleName." If the DsnIn dataset doesn't contain
                a MiddleName field, one will be created for the DsnOut dataset.
    LastName:   The name of the last name field in the "In" dataset.
                The default is "LastName."
    Suffix:     (Optional) The name of the suffix field in the "In" dataset.
                The default is "Suffix." If the DsnIn dataset doesn't contain
                a Suffix field, one will be created for the DsnOut dataset.
    Nickname:   (Optional) The name of the Nickname field in the "In" dataset.
                The default is "Nickname."  If the DsnIn dataset doesn't contain
                a Nickname field, one will be created for the DsnOut dataset.
    Debug:      (Optional) Y = Don't delete intermidiate datasets. Keep 
                instrumentation variables. N = Delete intermidiate datasets and 
                instrumentation variables. Default is "N".

Signature
%StandardizeNames(DsnIn=,
                  DsnOut=,
                  FirstName=FirstName,
                  MiddleName=MiddleName,
                  LastName=LastName,
                  Nickname=Nickname,
                  Suffix=Suffix,
                  Debug=N)
Revisions:
Date        Version  Author  Comments
2012-03-26  1.1      John    1) Changed macro name from %StandardizeName to 
                                %NormalizeNames.
                             2) Added code to convert extended ASCII latin 
                                letters (e.g. "Ñ") to alphabetic characters.   
                                Added code to convert embedded zeros to O's.  
                                Added code to delete 2ND's, 3RD's, 4TH's, etc.
2012-04-18  1.11     John    Changed name from %NormalizeNames to
                             %StandardizeNames.
2012-10-19  1.12     John    Added code to strip names again near end of data
                             step "__ImprovedNames_NoSuffix."
                             Also, added code to make out non-printable
                             characters into spaces.
2012-11-15  1.13     John    Added code to strip suffixes in middle names that
                             follow a word in the middle name, and then place
                             these suffixes in the Suffix field.
2013-02-20  1.14     John    Where records in DsnIn already have a valid
                             suffix, will not try to strip any suffix
                             candidates from the name fields.  Also,
                             reformatted test data into a CSV format, and 
                             created an expected data set.
2013-03-01  1.2      John    In some data (NSC) some first names have suffixes
                             directly appended to them. Added code to account 
                             for these known cases (see format
                             $FirstNameWithSuffix below).
2014-11-05  1.3      John    Added code to dynamically create MiddleName and/or
                             Nickname fields when one or both don't exist in
                             the DsnIn data set.
-----------------------------------------------------------------------------*/
%macro StandardizeNames(DsnIn=,
						DsnOut=,
						FirstName=FirstName,
						MiddleName=MiddleName,
						LastName=LastName,
						Nickname=Nickname,
						Suffix=Suffix,
						Debug=N);
	%put --- Start of %upcase(&sysmacroname) macro;
	/* 
	Generally ERDC data sources don't append suffixes directly to first names.
	The one occasional exception has been NSC data.  These are the NSC 
	exceptions to date.
	*/
	proc format;
		value $FirstNameWithSuffix
			'ABELJR'       = 'ABEL JR'
			'ALEXANDRJR'   = 'ALEXANDR JR'
			'ALFREDOJR'    = 'ALFREDO JR'
			'AMADOJR'      = 'AMADO JR'
			'ANDREWII'     = 'ANDREW II'
			'ANTRONJR'     = 'ANTRON JR'
			'BERNARDIV'    = 'BERNARD IV'
			'BLUEJR'       = 'BLUE JR'
			'CHALLANII'    = 'CHALLAN II'
			'CRAIGJR'      = 'CRAIG JR'
			'DALEJR'       = 'DALE JR'
			'DANILOJR'     = 'DANILO JR'
			'DAVIDJR'      = 'DAVID JR'
			'DEREKJR'      = 'DEREK JR'
			'DERRICKII'    = 'DERRICK II'
			'DOMINGOJR'    = 'DOMINGO JR'
			'DONALDJR'     = 'DONALD JR'
			'DONII'        = 'DON II'
			'DWAYNEJR'     = 'DWAYNE JR'
			'EDWARDJR'     = 'EDWARD JR'
			'ERICJR'       = 'ERIC JR'
			'ERNESTJR'     = 'ERNEST JR'
			'FAUESEJR'     = 'FAUESE JR'
			'FELIPEJR'     = 'FELIPE JR'
			'FRANKLINJR'   = 'FRANKLIN JR'
			'FREDERICKJR'  = 'FREDERICK JR'
			'FREDJR'       = 'FRED JR'
			'GABRIELJR'    = 'GABRIEL JR'
			'GEORGEJR'     = 'GEORGE JR'
			'GERARDJR'     = 'GERARD JR'
			'IGNACIOJR'    = 'IGNACIO JR'
			'ISTVANJR'     = 'ISTVAN JR'
			'JAMESII'      = 'JAMES II'
			'JAMESJR'      = 'JAMES JR'
			'JEFFREYJR'    = 'JEFFREY JR'
			'JIMMYII'      = 'JIMMY II'
			'JOHNJR'       = 'JOHN JR'
			'JONATHANJR'   = 'JONATHAN JR'
			'JOSEJR'       = 'JOSE JR'
			'JOSEPHJR'     = 'JOSEPH JR'
			'JUANJR'       = 'JUAN JR'
			'KEVINJR'      = 'KEVIN JR'
			'KURTNATHANJR' = 'KURTNATHAN JR'
			'LAWRENCEJR'   = 'LAWRENCE JR'
			'LIEULOJR'     = 'LIEULO JR'
			'LUCIANOJR'    = 'LUCIANO JR'
			'LUKEJR'       = 'LUKE JR'
			'MANUELJR'     = 'MANUEL JR'
			'MARKJR'       = 'MARK JR'
			'MATTHEWJR'    = 'MATTHEW JR'
			'MICHAELJR'    = 'MICHAEL JR'
			'PATRICKJR'    = 'PATRICK JR'
			'PAULJR'       = 'PAUL JR'
			'PETERJR'      = 'PETER JR'
			'RAFAELJR'     = 'RAFAEL JR'
			'RAULJR'       = 'RAUL JR'
			'RAYMONDJR'    = 'RAYMOND JR'
			'RICHARDJR'    = 'RICHARD JR'
			'ROBERTJR'     = 'ROBERT JR'
			'ROBERTOJR'    = 'ROBERTO JR'
			'ROGELIOII'    = 'ROGELIO II'
			'SCOTTJR'      = 'SCOTT JR'
			'SERGEJR'      = 'SERGE JR'
			'STEPHENJR'    = 'STEPHEN JR'
			'STEVENJR'     = 'STEVEN JR'
			'TERRYNCEJR'   = 'TERRYNCE JR'
			'TOUSSAINTJR'  = 'TOUSSAINT JR'
			'VIKTORJR'     = 'VIKTOR JR'
			'WILLIAMJR'    = 'WILLIAM JR';
	run;
	/* Determine if &Suffix. exists in DsnIn. If not, create &Suffix. below. */
	proc sql noprint;
		CREATE TABLE __DsnInVariables LIKE &DsnIn.;
	quit;

	proc sql noprint;
		title "Does &MiddleName. Exist in &DsnIn?";
		SELECT *
		FROM DICTIONARY.COLUMNS
		WHERE LIBNAME = 'WORK' AND
		MEMNAME = "%upcase(__DsnInVariables)" AND
		UPCASE(NAME) = "%upcase(&MiddleName.)"
		;
		%let _DsnInHasMiddleNameField = &sqlobs.;
	quit;

	proc sql noprint;
		title "Does &Suffix. Exist in &DsnIn?";
		SELECT *
		FROM DICTIONARY.COLUMNS
		WHERE LIBNAME = 'WORK' AND
		MEMNAME = "%upcase(__DsnInVariables)" AND
		UPCASE(NAME) = "%upcase(&Suffix.)"
		;
		%let _DsnInHasSuffixField = &sqlobs.;
	quit;

	proc sql noprint;
		title "Does &Nickname. Exist in &DsnIn?";
		SELECT *
		FROM DICTIONARY.COLUMNS
		WHERE LIBNAME = 'WORK' AND
		MEMNAME = "%upcase(__DsnInVariables)" AND
		UPCASE(NAME) = "%upcase(&Nickname.)"
		;
		%let _DsnInHasNicknameField = &sqlobs.;
	quit;
	title;
	

	data __ImprovedNames_NoSuffix (drop = _i _aname __re1 __re2);
		set &DsnIn.;

		/* Dynamically creates MiddleName and/or Suffix and/or 
		Nickname variables when not extant in the DsnIn data set. */
		%if &_DsnInHasMiddleNameField. ~= 1 | 
			&_DsnInHasSuffixField. ~= 1 | 
			&_DsnInHasNicknameField. ~= 1 %then %do;
				length 
				%if &_DsnInHasMiddleNameField. ~= 1 %then %do;
					&MiddleName. $30 
				%end;
				%if &_DsnInHasSuffixField. ~= 1 %then %do;
					&Suffix. $5 
				%end;
				%if &_DsnInHasNicknameField. ~= 1 %then %do;
					&Nickname. $30
				%end;
				; /* Closes out the "length" statement. */
				%if &_DsnInHasMiddleNameField. ~= 1 %then %do;
					&MiddleName. = '';
				%end;
				%if &_DsnInHasSuffixField. ~= 1 %then %do;
					&Suffix. = '';
				%end;
				%if &_DsnInHasNicknameField. ~= 1 %then %do;
					&Nickname. = '';
				%end;
		%end;
		
/*		%if &_DsnInHasSuffixField. ~= 1 %then %do;*/
/*			length &Suffix. $5;*/
/*			&Suffix. = ''; * Kind of a kludge. Prevents an uninitialized*/
/*		                     variable note occurring in the log;*/
/*		%end;*/
/**/
/*		%if &_DsnInHasNicknameField. ~= 1 %then %do;*/
/*			length &Nickname. $5;*/
/*			&Nickname. = ''; * Kind of a kludge. Prevents an uninitialized*/
/*		                     variable note occurring in the log;*/
/*		%end;*/
		
		
		retain __re1 __re2 _reParen;

		if _N_ =1 then do;
			__re1 = prxparse("s/[^\sA-Z]//");
			__re2 = prxparse("s/-+/ /");
		end;

/*		&Suffix. = ''; */
		_OrigLast = &LastName.;
		_OrigFirst = &FirstName.;
		_OrigMiddle = &MiddleName.;
		_OrigSuffix = &Suffix.;

		/* Initialize instrumentation variables. */
		_n = _N_;
		_NonPrintable = 0;
		_Upcased = 0;
		_Striped = 0;
		_Nth = 0;
		_DeLatinized = 0;
		_OpenParenthesis = 0;
		_ConvertedDashes = 0;
		_DeletedApostrophe = 0;
		_DeletedPairedParen = 0;
		_DeletedNonAlphaZero = 0;
		_NULLsInParens = 0;
		_OpenLeftParen = 0;
		_DeleteParenARoundName = 0;
		_SprFlsMI = 0;
		_Nickname = 0;
		_SprflsParenLstNmWrd = 0;
		_ZeroToO = 0;
		_DeletedZero = 0;
		_CHS = 0;
		_Compbled = 0;
		_Other = 0;
		_NullName = 0;
		/* 
		The following instrumentation variables are not used in this data step,
		but in the data steps below.
		*/
		_SquishedLast = 0;
		_NewSuffix = 0;
		_MovedToMI = 0;
		_DeletedSecondFirstName = 0;
		
		array __name{4} &FirstName. &MiddleName. &Nickname. &LastName.;
		
		/* Make all characters upper case. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			_aname  = upcase(_aname);
			if _aname ~= __name{_i} then _Upcased = _Upcased + 1;
			__name{_i} = _aname;
		end;
		/* Strip all names. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			_aname  = strip(_aname);
			if _aname ~= __name{_i} then _Striped = _Striped + 1;
			__name{_i} = _aname;
		end;
		/* JRS: Added 3/13/12 */
		/* Delete these sorts or words: 2nd, 3rd, 4th, 5th, etc. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/([\(\s]|^)(1ST|2ND|3RD|[4-9]TH)([\)\s]|$)//', -1, _aname);
			if _aname ~= __name{_i} then _Nth = _Nth + 1;
			__name{_i} = _aname;
		end;
		/* Convert extended ASCII letters in alphabetic characters. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			_aname  = translate(_aname, 'AAAAAA', 'ÀÁÂÃÄÅ');
			_aname  = translate(_aname, 'C', 'Ç');
			_aname  = translate(_aname, 'EEEE', 'ÈÉÊË');
			_aname  = translate(_aname, 'IIII', 'ÌÍÎÏ');
			_aname  = translate(_aname, 'N', 'Ñ');
			_aname  = translate(_aname, 'OOOOO', 'ÒÓÔÕÖ');
			_aname  = translate(_aname, 'UUUU', 'ÙÚÛÜ');
			if _aname ~= __name{_i} then _DeLatinized = _DeLatinized + 1;
			__name{_i} = _aname;
		end;
		/* Delete all open parentheses followed by zero or more characters. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/(\([^\)]*)$//', -1, strip(_aname));
			if _aname ~= __name{_i} then _OpenLeftParen = _OpenLeftParen + 1;
			__name{_i} = _aname;
		end;
		/* Convert dashes to spaces. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/-/ /', -1, _aname);
			if _aname ~= __name{_i} then _ConvertedDashes = _ConvertedDashes + 1;
			__name{_i} = _aname;
		end;
		/* JRS: Added 11/25/12 */
		/* Delete apostrophes */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange("s/'//", -1, _aname);
			if _aname ~= __name{_i} then _DeletedApostrophe = _DeletedApostrophe + 1;
			__name{_i} = _aname;
		end;
		/*
		Any character that isn't an alphabetic character a white space, 
		character, a parenthesis or a zero, convert to a space.
		*/
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/[^\s0A-Z\(\)]/ /', -1, _aname);
			if _aname ~= __name{_i} then _DeletedNonAlphaZero = _DeletedNonAlphaZero + 1;
			__name{_i} = _aname;
		end;
		/*
		JRS: Added 10/19/12, modified 11/15/12.
		Make all non-printable characters spaces.  Not just stripping mostly
		because of tabs.  Needs to be before compbl() statement below but after
		the delatinization (extended ASCII) statements above.
		*/
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/[\x00-\x1F|\x7F-\xFF]/ /', -1, _aname);
			if _aname ~= __name{_i} then _NonPrintable = _NonPrintable + 1;
			__name{_i} = _aname;
		end;


		/* Parentheses... */
		/* Single word names enclosed by parentheses, delete parentheses. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/(^\()(\.*)(\))$/$2/', -1, _aname);
			if _aname ~= __name{_i} then _DeleteParenARoundName= _DeleteParenARoundName + 1;
			__name{_i} = _aname;
		end;
		
		/* Deletes superfluous MI if MI is the first letter in following string enclosed by parentheses. */
		if length(scanq(&MiddleName., 1, ' ()')) = 1 and index(&MiddleName., '(') then do;
			_MI = scanq(&MiddleName., 1, ' ()');
/*			put _MI=;*/
			_temp = scanq(&MiddleName., 2, '()');
/*			put _temp=;*/
			if _MI = substr(scanq(&MiddleName., 2, '()'), 1, 1) then do;
				&MiddleName. = scanq(&MiddleName., 2, '()');
				_SprFlsMI = _SprFlsMI+1;
			end;
		end;

		if prxmatch('/\w+.*\(.*\)/', &FirstName.) then do;
			_MOOO = &FirstName.;
/*			put _MOOO=;*/
			&Nickname. = scanq(strip(&FirstName.), -1, '()');
/*			&Nickname. = prxchange('s/\((.*)\)/$1/', 1, &FirstName.);*/
			&FirstName. = prxchange('s/\(.*\)//', -1, &FirstName.);
			_Nickname = _Nickname + 1;
		end;
		/* Last name contains superfluous word in parentheses, delete word and parentheses. */
		if _N_ = 1 then _reParen = prxparse('/\(.*\)/');
		/* CASTERLINE SABEL (CASTERLINE) becomes CASTERLINE SABEL */
		if prxmatch(_reParen, &LastName.) then do;
			_PossibleSuperfluousWord=scanq(strip(&LastName.), -1, '()');
/*			put _PossibleSuperfluousWord=;*/
/*			_PossibleSuperfluousWord = prxposn(_reParen, 1, &LastName.);*/
/*			put _PossibleSuperfluousWord=;*/
/*			_PossibleSuperfluousWord2 = compress(prxposn(_reParen, 1, &LastName.), "()");*/
/*			put _PossibleSuperfluousWord2=;*/
			if indexw(&LastName., _PossibleSuperfluousWord) then do;
				&LastName. = prxchange('s/\(.*\)//', -1, &LastName.);
				_SprflsParenLstNmWrd = _SprflsParenLstNmWrd + 1;
			end;
		end;

		/* Delete (NULL) from all names. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/\(NULL\)/ /', -1, _aname);
			if _aname ~= __name{_i} then _NULLsInParens = _NULLsInParens + 1;
			__name{_i} = _aname;
		end;

		/* Convert remaining parentheses to spaces. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/(\(|\))/ /', -1, _aname);
			if _aname ~= __name{_i} then _NULLsInParens = _NULLsInParens + 1;
			__name{_i} = _aname;
		end;
		
/**/
/*			&Nickname. = prxchange('s/(\()(.*)(\))/$2/', -1, &LastName.);*/
/*			&LastName. = prxchange('s/\(.*\)//', -1, &LastName.;*/
/*			_NicknameFromMiddleName = _NicknameFromMiddleName + 1;*/
/*		end;*/




		/* Convert embedded zeros to O's */
		do _i = 1 to 4;
			_aname = __name{_i};
			/* A letter followed by two zeros. */
			_aname = prxchange('s/([A-Z])(00)/$1OO/', -1, _aname);
			/* A single zero embedded within letters. */
			_aname = prxchange('s/([A-Z])(0)([A-Z])/$1O$3/', -1, _aname);
			/* Two or more letters followed by a zero. */
			_aname = prxchange('s/([A-Z]{2,})(0)/$1O/', -1, _aname);
			/* A zero followed by two or more letters. */
			_aname = prxchange('s/(0)([A-Z]{2,})/O$2/', -1, _aname);
			if _aname ~= __name{_i} then _ZeroToO = _ZeroToO + 1;
			__name{_i} = _aname;
		end;
		/* Delete all remaining zeros. */
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/0//', -1, _aname);
			if _aname ~= __name{_i} then _DeletedZero = _DeletedZero + 1;
			__name{_i} = _aname;
		end;
		/* 
		Delete "CHS ##" where ## is a number.  Seen in last names of ERDC K12 data.
		This operation is not in the the Normalize Names document since this is
		specific to ERDC data.
		*/
		do _i = 1 to 4;
			_aname = __name{_i};
			_aname = prxchange('s/^CHS\s\d+//', -1, _aname);
			if _aname ~= __name{_i} then _CHS = _CHS + 1;
			__name{_i} = _aname;
		end;

		/* Convert strings of spaces into single spaces. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			 _aname  = compbl(_aname);
			if _aname ~= __name{_i} then _Compbled = _Compbled + 1;
			__name{_i} = _aname;
		end;
		/* Delete UNKNOWNs and ESTATEs. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			 	_aname  = tranwrd(_aname,'UNKNOWN','');
				if prxmatch('/ESTATE/', _aname) then _aname = '';
			if _aname ~= __name{_i} then _Other = _Other + 1;
			__name{_i} = _aname;
		end;
		/* Delete paired ("()") parentheses*/
/*		do _i = 1 to 4;*/
/*			_aname = __name{_i};*/
/*			_aname = prxchange('s/\(\s*\)//', -1, _aname);*/
/*			if _aname ~= __name{_i} then _DeletedPairedParen = _DeletedPairedParen + 1;*/
/*			__name{_i} = _aname;*/
/*		end;*/
		/* 
		JRS: Added 10/19/12
		Strip all names once more. */
		do _i = 1 to 4;
			_aname  = __name{_i};
			_aname  = strip(_aname);
			if _aname ~= __name{_i} then _Striped = _Striped + 1;
			__name{_i} = _aname;
		end;
		
		/*
		Delete NULLs from first, middle, and nick names.  Note: NULL can be a
		legitamite last name.
		*/
		do _i = 1 to 3;
			_aname  = __name{_i};
			 if _aname= 'NULL' then _aname = '';
			if _aname ~= __name{_i} then _NullName = _NullName + 1;
			__name{_i} = _aname;
		end;
		/* The "s" at beginning of expression means substitute.  \b = word boundry, such
		 as a space or the beginning of a field. \s = a space character.  $1 is the word, be it
		 "AL" or "DE", etc. that lies in the parenthesis.  So this expression removes
		 the space that follows any of the "words" listed within the
		 paranthesis. */

		_aname = &LastName.;
		_aname = prxchange('s/\b(AL|DE|DEL|DER|EL|MC|LA|LE|MAC|ST|VON|VONDER|VAN|VANDER)\s/$1/',-1,_aname);
		_aname = prxchange('s/\b(AL|DE|DEL|DER|EL|MC|LA|LE|MAC|ST|VON|VONDER|VAN|VANDER)\s/$1/',-1,_aname);
		if _aname ~= &LastName. then _SquishedLast = 1;
		&LastName. = _aname;
	run;


	/* Moves &Suffix.es from first and last names to their own field. */
	data __ImprovedNames_Suffix (drop= __re __reIII pattern position length);
		set __ImprovedNames_NoSuffix;

		/*
		Pattern for a name that ends with a space plus a &Suffix..
		Used with UB04 data.
		*/
		if _N_ = 1 then do;
			retain __re __reIII;
			pattern = "/( II| III| IV| V| VI| VII| VIII| ESQ| JR| SR)$/i";
			__re = prxparse(pattern);
			__reIII = prxparse('/III$/');
		end;

		/* JRS: Added 11/26/12. */
		/* Delete anything that isn't a generational suffix from the suffix field. */
		&Suffix. = strip(upcase(&Suffix.));
		&Suffix. = prxchange('s/[^A-Z]//', -1, &Suffix.);
		if &Suffix. not in('II','III','IV','V','VI','VII','VIII','JR','SR','ESQ') then
			&Suffix. = '';
		
		/*
		JRS: Revised 3/1/2013
		In some NSC data, suffixes were directly appended to first name.  The fomrat $FirstNameWithSuffix.
		seperates out the suffixes from these known cases.  The longest member of $FirstNameWithSuffix
		is 13 characters, so we only use the format when the length of &FirstName. is <= 13.
		(I found that the longest length SAS allows in the format is $FirstNameWithSuffix40., so though
		simplier, you can't just apply the format to all FirstNames because theoretically you
		would truncate some of them to 40 characters.)
		*/
		if length(&FirstName.) <= 13 then 
			&FirstName. = put(&FirstName., $FirstNameWithSuffix40.);
/*		put &FirstName.=;*/
		/* 
		If a good suffix doesn't already exist, extract what can be extracted from name fields.
		*/
		if &Suffix. = '' then do;
			call prxsubstr(__re, TRIM(&FirstName.), position, length);
			if position ^= 0 then do;
				&Suffix. = substr(&FirstName., position + 1, length - 1);
				&FirstName. = substr(&FirstName., 1, position - 1);
			end;
			
			/*
			Strip out suffixes that follow a word in the middle name and place
			in Suffix field.  JRS: Added 11/15/12.
			*/
			call prxsubstr(__re, TRIM(&MiddleName.), position, length);
			if position ^= 0 then do;
				&Suffix. = substr(&MiddleName., position + 1, length - 1);
				&MiddleName. = substr(&MiddleName., 1, position - 1);
			end;
			/*
			Used where only one or two characters make up first name.  All UB92
			data, a little UB04 data.  Intentionally excludes II, IV, V, VI and
			SR since these could be abbreviated first names.
			*/
			else if TRIM(&FirstName.) in ('III','VII','VIII','JR') then do;
				&Suffix. = &FirstName.;
				&FirstName. = &MiddleName.;
				&MiddleName. = '';
			end;

			call prxsubstr(__re, TRIM(&LastName.), position, length);
			if position ^= 0 and &LastName. ne 'NASR' then do;
				&Suffix. = substr(&LastName., position + 1, length - 1);
				&LastName. = substr(&LastName., 1, position - 1);
			end;
			/*
			Traditionally, the 'II' is used to differentiate between
			members in the same family that have the same name, but do not
			have a father/son relationship.  That said, the 'II' suffix is
			sometimes used in a father/son relationship.  These exceptions 
			seem to be concentrated amoung hispanics. 
			
			As a result, the line below is commented out in order to preserve
			the suffix 'II'.
			*/  
			*if &Suffix.='II' then &Suffix.='JR';

			/* For cases where the first name terminates in 'III'. */
			if &Suffix. = '' then do;
				call prxsubstr(__reIII, TRIM(&FirstName.), position, length);
				if position ^= 0 and lengthn(&Firstname.) >= 7 then do;
					&Suffix. = 'III';
				&FirstName. = trim(substr(&FirstName., 1, position - 1));
			end;
			end;
			/* For cases where the last name terminates in 'III'. */
			if &Suffix. = '' then do;
				call prxsubstr(__reIII, TRIM(&LastName.), position, length);
				if position ^= 0 and lengthn(&LastName.) >= 7 then do;
					&Suffix. = 'III';
					&LastName. = trim(substr(&LastName., 1, position - 1));
				end;
			end;
			if &Suffix. = '' and &MiddleName. in ('III','VII','VIII','JR') then do;
				&Suffix. = &MiddleName.;
				&MiddleName. = '';
			end;
		end;

		if upcase(_OrigSuffix) ~= &Suffix. then _NewSuffix = 1;
	run;
	
	%if &Debug. = N %then %do;
		proc sql;
			DROP TABLE __DsnInVariables, __ImprovedNames_NoSuffix;
		quit;
	%end;
	
	/*
	This does two things:
		1) If second word of a first name is a single letter that doesn't follow
		a dash and middle name is missing, moves that single letter to the middle name.
		2) If second word of a first name is same as the middle name, then delete
		the second word of the first name.
	*/
	data __ImprovedNames_FirstLast;
		set __ImprovedNames_Suffix;
				
		/* Causes the _SecondFirstName to contain all the words from the second and beyond. */ 
		if index(strip(&FirstName.), ' ') > 1 then
			_SecondFirstName = strip(substr(&FirstName., index(&FirstName., ' ') + 1));

		if lengthn(_SecondFirstName) = 1 and indexc(_OrigFirst, '-') = 0 then do;
			&FirstName. = scanq(&FirstName.,1);
			/* if &MiddleName. doesn't contain any letters, move single letter 2nd first
			name to middle name. */
			if not(anyalpha(&MiddleName.)) then do;
				&MiddleName. = _SecondFirstName;
				_MovedToMI = 1;
			end;
		end;
		/*
		e.g. (First name:)T JOSEPH (Middle name:)JOSEPH will become
		     (First name:)T        (Middle name:)JOSEPH.
		*/

		if lengthn(_SecondFirstName) > 1 and _SecondFirstName = compress(&MiddleName.) then do;
			&FirstName. = scanq(&FirstName.,1);
			_DeletedSecondFirstName = 1;
		end;
		
		if &MiddleName. in ('NMI', 'NMN') then &MiddleName. = '';
	run;
	
	%if &Debug. = N %then %do;
		proc sql;
			DROP TABLE __ImprovedNames_Suffix;
		quit;
	%end;
	

	data &DsnOut.;
		set __ImprovedNames_FirstLast;
		
		%if &Debug. = N %then %do;
			drop _:;
		%end;
	run;

	%if &Debug. = N %then %do;
		proc sql;
			DROP TABLE __ImprovedNames_FirstLast;
		quit;
	%end;

	%put --- End of %upcase(&sysmacroname) macro;
%mend StandardizeNames;

/*
* Test data: ;
data TestName_In;
	infile datalines delimiter=',' dsd truncover;
	input 
		first_name  :$20.
		middle_name :$20.
		last_name   :$30.
		nick_name   :$20.
		suffix      :$5.
	    problem     :$96.;
	datalines;
Robert,Lewis,Stevenson,,,Names have lower case letters
JOAN D'ARC,,FRANCAIS,,,Apostrophe
JILL,ADEL,ABAR--KLEMENS,,,Two dashes in LastName
JOHN (2ND),RUDULF,REINDEER,,,FirstName contains '(2nd)'. Preserve parentheses
ÀÁÂÃÄÅ Ç,ÈÉÊË Ñ,ÒÓÔÕÖ ÙÚÛÜ,,,Diacritics in all names
(OH MY!),OH BE,JOYFUL!,,,Exclamation points in FirstName and LastName. Preserve parentheses
JOHN,CHS 10 R,SABEL,,,CHS 10 in MiddleName
JOHN,DEV0N,MO0 Z00,,,Zeros which should be O's in MiddleName and LastName
S00 Y00,5TH,TRAN,,,Zeros which should be O's in FirstName
PAC0 AMJE0,D,0LLIE 0'SHAUN,,,Zeros which should be O's in FirstName and LastName
JOSEPH,T0,CONRAD,,,Zero concatenated to middle initial
JOHN,NULL,NULL,,,Middle name contains 'Null'. No change to LastName
JOHN,UNKNOWN,SOLDIER,,,Middle name contains 'UNKNOWN'
JOHN JR,ROBERT,SABEL,,,Suffix in FirstName
JOHN,JR,SABEL,,,Suffix is MiddleName
JOHN,R.,SABEL JR,,,Suffix in LastName
JOHN,,SABEL II,,,Suffix in LastName
JILL,NMI,SABEL,,,NMI as MiddleName
FRANCISO,GUERILLO,DE CASTILLOIII,,,Suffix concantenated to word in last name
FRANCISO ESQ,GUERILLO,DE LA TRISTE,,,Suffix in FirstName
JUSTICE,A IV,SMITH,,,Suffix in MiddleName
JUSTICE,B JR,BROWN,,,Suffix in MiddleName
ISR,,MOO,,,No change (FirstName contains a faux suffix)
WILLEM,,VAN ORANJE,,,Space after 'van'
T JOSEPH,JOSEPH,QUINTO,,,Middle name embedded in FirstName
KING,,HENRY,,IV,No change
KING,,HENRY,,Jr.,Period and lower case letter in suffix
KING,,HENRY,,Sr.,Period and lower case letter in suffix
KING,,HENRY,,Mr.,Non-suffix word in Suffix field
KING,JR,HENRY,,IV,No change
WILLIAMJR,,HENRY,,,Suffix appended to FirstName
BILL (WILLIAM),MARC,ALLEN,,JR,Nickname in FirstName
BILL (WILLIAM),MARC,ALLEN,,,Nickname in FirstName
LUNA(LUC,,ADELE,,,FirstName contains a word with an only a left parethesis
LUNA (LUC,,ADELE,,,FirstName contains a word with an only a left parethesis
LUNA (LUCILLE),,ADELE,,,FirstName Nickname enclosed by parentheses
BILL,MARC (WILLIAM),ALLEN,,JR,Alternate MiddleName encloses by parentheses
BILL,MARC (WILLIAM),ALLEN,,,Alternate MiddleName encloses by parentheses
NGUYET,(,TRAN,,,Open left parenthesis in MiddleName
NGUYET,(SUSA,TRAN,,,Open parenthesis with additional letters in MiddleName
NGUYET,(SUSAN),TRAN,,,MiddleName enclosed by parentheses
NGUYET,JOY (SUSAN),TRAN,,,Second word in MiddleName enclosed by parentheses
NGUYET,S(SUSAN),TRAN,,,"Superfluous ""S"" in MIddleName"
AJ,C (CJ),BJ,,JR,"Superfluous ""C"" in MIddleName"
AJ,(NULL),BJ,,JR,Parentheses enclosed NULL in MiddleName
JOHN,ROBERT,CASTERLINE (SABEL),,,Second work in LastName is enclosed by parentheses
JOHN,ROBERT,CASTERLINE SABEL (CASTERLINE),,,Third word in LastName is enclosed by parentheses and is superfluous
WALTER,J0,WINK,,,Supefluous '0' in MiddleName
;
run;

%StandardizeNames(DsnIn=TestName_In,
						DsnOut=TestName_Out,
						FirstName=first_name,
						MiddleName=middle_name,
						LastName=last_name,
						Nickname=nick_name,
						Debug=N)

* What the test data should look like after being fed through %StandardizeNames().
data TestName_Expected;
	infile datalines delimiter=',' dsd truncover;
	input 
		first_name  :$20.
		middle_name :$20.
		last_name   :$30.
		nick_name   :$20.
		suffix      :$5.
	    problem     :$96.;
	datalines;
ROBERT,LEWIS,STEVENSON,,,Names have lower case letters
JOAN DARC,,FRANCAIS,,,Apostrophe
JILL,ADEL,ABAR KLEMENS,,,Two dashes in LastName
JOHN,RUDULF,REINDEER,,,FirstName contains '(2nd)'. Preserve parentheses
AAAAAA,EEEE N,OOOOO UUUU,,,Diacritics in all names
OH MY,OH BE,JOYFUL,,,Exclamation points in FirstName and LastName. Preserve parentheses
JOHN,CHS R,SABEL,,,CHS 10 in MiddleName
JOHN,DEVON,MOO ZOO,,,Zeros which should be O's in MiddleName and LastName
SOO YOO,,TRAN,,,Zeros which should be O's in FirstName
PACO AMJEO,D,OLLIE OSHAUN,,,Zeros which should be O's in FirstName and LastName
JOSEPH,T,CONRAD,,,Zero concatenated to middle initial
JOHN,,NULL,,,Middle name contains 'Null'. No change to LastName
JOHN,,SOLDIER,,,Middle name contains 'UNKNOWN'
JOHN,ROBERT,SABEL,,JR,Suffix in FirstName
JOHN,,SABEL,,JR,Suffix is MiddleName
JOHN,R,SABEL,,JR,Suffix in LastName
JOHN,,SABEL,,II,Suffix in LastName
JILL,,SABEL,,,NMI as MiddleName
FRANCISO,GUERILLO,DECASTILLO,,III,Suffix concantenated to word in last name
FRANCISO,GUERILLO,DELATRISTE,,ESQ,Suffix in FirstName
JUSTICE,A,SMITH,,IV,Suffix in MiddleName
JUSTICE,B,BROWN,,JR,Suffix in MiddleName
ISR,,MOO,,,No change (FirstName contains a faux suffix)
WILLEM,,VANORANJE,,,Space after 'van'
T,JOSEPH,QUINTO,,,Middle name embedded in FirstName
KING,,HENRY,,IV,No change
KING,,HENRY,,JR,Period and lower case letter in suffix
KING,,HENRY,,SR,Period and lower case letter in suffix
KING,,HENRY,,,Non-suffix word in Suffix field
KING,JR,HENRY,,IV,No change
WILLIAM,,HENRY,,JR,Suffix appended to FirstName
BILL,MARC,ALLEN,WILLIAM,JR,Nickname in FirstName
BILL,MARC,ALLEN,WILLIAM,,Nickname in FirstName
LUNA,,ADELE,,,FirstName contains a word with an only a left parethesis
LUNA,,ADELE,,,FirstName contains a word with an only a left parethesis
LUNA,,ADELE,LUCILLE,,FirstName Nickname enclosed by parentheses
BILL,MARC WILLIAM,ALLEN,,JR,Alternate MiddleName encloses by parentheses
BILL,MARC WILLIAM,ALLEN,,,Alternate MiddleName encloses by parentheses
NGUYET,,TRAN,,,Open left parenthesis in MiddleName
NGUYET,,TRAN,,,Open parenthesis with additional letters in MiddleName
NGUYET,SUSAN,TRAN,,,MiddleName enclosed by parentheses
NGUYET,JOY SUSAN,TRAN,,,Second word in MiddleName enclosed by parentheses
NGUYET,SUSAN,TRAN,,,"Superfluous ""S"" in MIddleName"
AJ,CJ,BJ,,JR,"Superfluous ""C"" in MIddleName"
AJ,,BJ,,JR,Parentheses enclosed NULL in MiddleName
JOHN,ROBERT,CASTERLINE SABEL,,,Second work in LastName is enclosed by parentheses
JOHN,ROBERT,CASTERLINE SABEL,,,Third word in LastName is enclosed by parentheses and is superfluous
WALTER,J,WINK,,,Supefluous '0' in MiddleName
;
run;

* Comparison of the test data with the results of feeding the test data thorugh
  %StandardizeNames();
proc compare base=TestName_Expected compare=TestName_Out briefsummary;
run;
*/