标题: Dropping Automatically Variables with Only Missing Values [打印本页] 作者: shiyiming 时间: 2010-11-4 21:21 标题: Dropping Automatically Variables with Only Missing Values From LCChien's blog on blogspot
Link: <a href="https://docs.google.com/viewer?url=http%3A%2F%2Fsupport.sas.com%2Fresources%2Fpapers%2Fproceedings10%2F048-2010.pdf"><!-- m --><a class="postlink" href="http://support.sas.com/resources/papers/proceedings10/048-2010.pdf">http://support.sas.com/resources/papers ... 8-2010.pdf</a><!-- m --></a><br /><br />在進行資料分析前,有些人會習慣把一些含有missing data的樣本給清除掉,雖然這不會影響到分析結果,因為大部分的SAS程序都是用CCA(Complete Case Analysis)來處理含有missing data的數據,不過若要用手動的方法來清掉missing data的話,在遇到龐大數量的變數時,需要消耗很多時間在輸入變數名稱上。美國人口普查局的Selvaratnam Sridharma發表了一個macro程序於SAS Global Forum 2010,讓這個程式撰寫的過程只需要幾秒鐘的時間就可以完成。<br /><br /><a name='more'></a><br />這個macro如下所示:<br /><pre><code>%DROPMISS(DSNIN, DSNOUT, NODROP=);</code></pre>裡面只需要定義兩個必要的macro參數,另外還有一個選擇性參數:<br /><br /><ul><li>DSNIN: 原始資料的名稱</li><li>DSNOUT: 新資料的名稱</li><li>NODROP: 不要處理missing data的變數名稱,此為optional選項。</li></ul><br />其中,DSNIN和DSNOUT應該不用多做解釋。NODROP如果沒有指定特定的變數名稱的話,這個macro就會針對所有在DSNIN所指定的資料裡面的變數進行missing data的處理。但如果想要讓這個macro不處理所有數值型變數或文字型變數,則可以用 _NUMERIC_ 或 _CHARACTER_ 來限制。如下所示:<br /><pre><code>%DROPMISS( olddata, newdata, NODROP=_NUMERIC_ );<br />%DROPMISS( olddata, newdata, NODROP=_CHARACTER_ );</code></pre>此macro的原始碼如下:<br /><pre><code>/******************/
/* NOTE(review): MLOGIC switches macro execution tracing on for the whole   */
/* session - confirm that this is wanted outside of debugging.              */
options nomprint noSYMBOLGEN MLOGIC;
/****************************/
%macro DROPMISS( DSNIN   /* name of input SAS dataset */
               , DSNOUT  /* name of output SAS dataset */
               , NODROP= /* [optional] variables that are never dropped, even
                            if they have only missing values */
               ) ;
/*---------------------------------------------------------------------------
 * PURPOSE: Copy &DSNIN to &DSNOUT, leaving out every character or numeric
 *          variable that has only missing values, unless the variable is
 *          listed in &NODROP.
 *
 * PARAMETERS:
 *   DSNIN   (positional) input dataset
 *   DSNOUT  (positional) output dataset; must not be spelled the same as
 *           DSNIN
 *   NODROP= (keyword, optional) variable list that is excluded from the
 *           check; any list accepted by the DROP= dataset option works
 *
 * NOTES:
 *   - If no variable is left to examine (the dataset has no variables, or
 *     all of them are listed in NODROP) an error box is written to the log
 *     and no output dataset is created.
 *   - The helper dataset _CNTNTS_ is created and is not deleted.
 *   - The variable names are stored in the global macro variables
 *     NUM1..NUMn and CHAR1..CHARn (kept global for backward compatibility).
 *   - Emptiness is decided with COUNT(variable), the number of nonmissing
 *     values, so no data value is ever handed to the macro processor.
 *
 * EXAMPLE OF USE:
 *   %DROPMISS( DSNIN, DSNOUT )
 *   %DROPMISS( DSNIN, DSNOUT, NODROP=A B C D--H X1-X100 )
 *   %DROPMISS( DSNIN, DSNOUT, NODROP=_numeric_ )
 *   %DROPMISS( DSNIN, DSNOUT, NODROP=_character_ )
 *--------------------------------------------------------------------------*/

   /* keep every scalar work variable local so that same-named macro
      variables of the caller are not overwritten */
   %local I N_CHAR N_NUM N_OBS DROP_NUM DROP_CHAR ;

   %if "&DSNIN" = "&DSNOUT"
   %then %do ;
      %put /------------------------------------------------\ ;
      %put | ERROR from DROPMISS: | ;
      %put | Input Dataset has same name as Output Dataset. | ;
      %put | Execution terminating forthwith. | ;
      %put \------------------------------------------------/ ;
      %goto L9999 ;
   %end ;

   /*=====================================================================*/
   /* List the candidate variables (everything except &NODROP) together   */
   /* with their type: 1 = numeric, 2 = character                         */
   /*=====================================================================*/
   proc contents data=&DSNIN( drop=&NODROP ) memtype=data noprint
                 out=_cntnts_( keep=name type ) ;
   run ;

   /*=====================================================================*/
   /* Count the candidates per type; both counters stay 0 when _CNTNTS_   */
   /* has no rows because the step then never reaches the last record     */
   /*=====================================================================*/
   %let N_CHAR = 0 ;
   %let N_NUM  = 0 ;
   data _null_ ;
      set _cntnts_ end=lastobs nobs=nobs ;
      if nobs = 0 then stop ;
      n_char + ( type = 2 ) ;
      n_num  + ( type = 1 ) ;
      if lastobs then do ;
         call symput( 'N_CHAR', left( put( n_char, 5. ))) ;
         call symput( 'N_NUM' , left( put( n_num , 5. ))) ;
      end ;
   run ;

   /*=====================================================================*/
   /* Nothing to examine: report and leave                                */
   /*=====================================================================*/
   %if %eval( &N_NUM + &N_CHAR ) = 0
   %then %do ;
      %put /----------------------------------\ ;
      %put | ERROR from DROPMISS: | ;
      %put | No variables in dataset. | ;
      %put | Execution terminating forthwith. | ;
      %put \----------------------------------/ ;
      %goto L9999 ;
   %end ;

   /*=====================================================================*/
   /* Declare the indexed macro variables before PROC SQL fills them.     */
   /* NUMn / CHARn : variable names (global, as before)                   */
   /* NUMCNTn / CHARCNTn : number of nonmissing values (local)            */
   /* A loop from 1 to 0 does not execute, so no guard is needed.         */
   /*=====================================================================*/
   %do I = 1 %to &N_NUM ;
      %global NUM&I ;
      %local NUMCNT&I ;
   %end ;
   %do I = 1 %to &N_CHAR ;
      %global CHAR&I ;
      %local CHARCNT&I ;
   %end ;

   /*=====================================================================*/
   /* Load the variable names into CHAR1..CHARn and NUM1..NUMn            */
   /*=====================================================================*/
   proc sql noprint ;
      %if &N_CHAR > 0 %then %do ;
         select name into :CHAR1 - :CHAR&N_CHAR from _cntnts_ where type = 2 ;
      %end ;
      %if &N_NUM > 0 %then %do ;
         select name into :NUM1 - :NUM&N_NUM from _cntnts_ where type = 1 ;
      %end ;
   quit ;

   /*=====================================================================*/
   /* One pass over &DSNIN: count the nonmissing values of every          */
   /* candidate. The trailing count(*) closes both comma-separated lists, */
   /* so the query is valid for any mix of numeric/character candidates.  */
   /* NOPRINT keeps the one-row result out of the listing.                */
   /*=====================================================================*/
   proc sql noprint ;
      select
         %do I = 1 %to &N_NUM ;  count( &&NUM&I ) ,  %end ;
         %do I = 1 %to &N_CHAR ; count( &&CHAR&I ) , %end ;
         count(*)
      into
         %do I = 1 %to &N_NUM ;  :NUMCNT&I ,  %end ;
         %do I = 1 %to &N_CHAR ; :CHARCNT&I , %end ;
         :N_OBS
      from &DSNIN ;
   quit ;

   /*=====================================================================*/
   /* Collect the names of the variables without any nonmissing value     */
   /*=====================================================================*/
   %let DROP_NUM  = ;
   %let DROP_CHAR = ;
   %do I = 1 %to &N_NUM ;
      %if &&NUMCNT&I = 0 %then %let DROP_NUM = &DROP_NUM &&NUM&I ;
   %end ;
   %do I = 1 %to &N_CHAR ;
      %if &&CHARCNT&I = 0 %then %let DROP_CHAR = &DROP_CHAR &&CHAR&I ;
   %end ;

   /*=====================================================================*/
   /* Create the output dataset. %length() is used for the emptiness      */
   /* tests so that the variable names are never evaluated as a macro     */
   /* expression (a variable called OR or GT would break "&X ^=").        */
   /*=====================================================================*/
   data &DSNOUT ;
      %if %length( &DROP_CHAR ) > 0 %then %do ;
         drop &DROP_CHAR ;   /* character variables with only missing values */
      %end ;
      %if %length( &DROP_NUM ) > 0 %then %do ;
         drop &DROP_NUM ;    /* numeric variables with only missing values */
      %end ;
      set &DSNIN ;
   run ;

   /* report the result in the log */
   %if %length( &DROP_CHAR&DROP_NUM ) > 0 %then %do ;
      %put /----------------------------------\ ;
      %put | Variables dropped are &DROP_CHAR &DROP_NUM | ;
      %put \----------------------------------/ ;
   %end ;
   %else %do ;
      %put /----------------------------------\ ;
      %put | No variables are dropped |;
      %put \----------------------------------/ ;
   %end ;

%L9999:
%mend DROPMISS ;</code></pre><br /><b>CONTACT INFORMATION</b><br /><br />Selvaratnam Sridharma<br />Economic Planning and Coordination Division<br />U.S. 
Bureau of the Census<br />Address<br />Washington, DC 20233-6100<br />301-763-6774<br />Email: <!-- e --><a href="mailto:selvaratnam.sridharma@census.gov">selvaratnam.sridharma@census.gov</a><!-- e --><div class="blogger-post-footer"><img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/6268919072942670865-6312028245537720713?l=sugiclub.blogspot.com' alt='' /></div>