SAS中文论坛

标题: Bayesian Computation with SAS 1. [打印本页]

作者: shiyiming 时间: 2011-1-4 21:05
标题: Bayesian Computation with SAS 1.
From oloolo's blog on SasProgramming

<p><a href="http://feedads.g.doubleclick.net/~a/T_SJikcyef9C6jgJlkySU_RAe48/0/da"><img src="http://feedads.g.doubleclick.net/~a/T_SJikcyef9C6jgJlkySU_RAe48/0/di" border="0" ismap="true"></img></a><br/>
<a href="http://feedads.g.doubleclick.net/~a/T_SJikcyef9C6jgJlkySU_RAe48/1/da"><img src="http://feedads.g.doubleclick.net/~a/T_SJikcyef9C6jgJlkySU_RAe48/1/di" border="0" ismap="true"></img></a></p>The book "<a href="http://bayes.bgsu.edu/bcwr/">Bayesian Computation with R</a>" by <a href="http://bayes.bgsu.edu/">Jim Albert</a> is an easy to read entry level book on applied Bayesian Statistics. While the book was written for R users, it is not difficult to translate the languages between R and SAS and I believe it is a good way to show Bayesian capability of SAS and I am going to do this for next several month to cover all chapters in the book. Readers are encouraged to buy this book to understand what's behind the code. The post here will only spend minimum effects for that purpose.<br />
<br />
This post will cover Chapter 1.<br />
<pre style="background-color: #ebebeb; border-bottom: #999999 1px dashed; border-left: #999999 1px dashed; border-right: #999999 1px dashed; border-top: #999999 1px dashed; color: #000001; font-family: Andale Mono, Lucida Console, Monaco, fixed, monospace; font-size: 12px; line-height: 14px; overflow: auto; padding-bottom: 5px; padding-left: 5px; padding-right: 5px; padding-top: 5px; width: 100%;"><code>
/* Chapter 1 of Bayesian Computation with R */

%let folder=C:\Documents\SAS for Bayesian Compu with R;
%let datafolder=&folder\data;
/* Section 1.2.2 Read Tab Deliminated Data into SAS */
libname data "&datafolder";

options missing='.' formchar = "|----|+|---+=|-/\<>*"  errors=2;

proc import datafile="&datafolder\studentdata.txt"  out=student dbms=dlm replace;
   datarow=2;
guessingrows=2;  /* Only use first 2 rows of data to guess the type of data */
   delimiter='09'x;  /* this is redudent if we specify DBMS=TAB in PROC IMPORT option */
getnames=YES;  /* This is the same as 'header=True' in R's table.read function */
run;

ods pdf file="&folder\Chapter1.pdf";
/* section 1.2.2 To see Variable Names and print first row of this data */
proc print data=student(obs=1)  split="*"; run;

/* section 1.2.3 To summarize and graph a single Batch */
proc freq data=student;
   table Drink;
run;

/* section 1.2.3 Barplot on single variable */
proc freq data=student  noprint;
   table Drink /out=t;
run;
proc gbarline data=t;
   bar  Drink /discrete space=4  sumvar=count;
run;quit;

/* Actually, in SAS, it is better off to directly use PROC GBARLINE */
proc gbarline data=student;
   bar Drink /discrete space=3 ;
run;quit;

/* section 1.2.4 Compare Batch */
data student;
   set student;
hours_of_sleep=WakeUp-ToSleep;
label hours_of_sleep="Hours Of Sleep";
run;

proc means data=student min q1 median q3 max  nmiss  maxdec=2;
   var hours_of_sleep;
run;

/* section 1.2.4 Histogram */
proc univariate data=student  noprint;
   var hours_of_sleep;
histogram /midpoints=2.5 to 12.5 by 1;
run;

proc sgplot data=student;
   histogram hours_of_sleep/scale=count;
run;

/* section 1.2.4 Boxplot comparison */
proc sort data=student out=studentsorted; by Gender; run;
proc boxplot data=studentsorted;
   plot  hours_of_sleep*Gender;
run;

proc means data=student
         min q1 median q3 max nmiss
         nway  maxdec=2;
   class Gender;
  var  Haircut;
run;

%macro jitter(var, newname, data=last, factor=1, amount=);
/* follow the JITTER function in R.
if amount is given, then use this value for disturbance
else if amount is 0, then use the range of &var for disturbance
else if amount is NULL, then use the smallest difference among
      distinct values of &var for disturbance. if all values are
      the same, then use the value obtained from range

if range of &var is 0, then use the lower range for disturbance
otherwise
*/
%let blank=%str( );
%if &data=' ' %then %let data=last;
%local fid;

data _null_;
   fid=round(ranuni(0)*10000, 1);
  call symput('fid', compress(fid));
run;
proc means data=&data  noprint;
   var &var;
  output  out=_util&fid  range(&var)=range  min(&var)=min  max(&var)=max;
run;
data _util&fid;
   set _util&fid;
  if range^=0 then z=range; else z=min;
  if z=0 then z=1;
run;

%if %eval(&amount=&blank) %then %do;
proc sort data=&data.(keep=&var  where=(&var^=.)) out=_xuti&fid  nodupkey;
   by &var;
run;
data _duti&fid;
   set _xuti&fid  nobs=ntotal  end=eof;
array _x{2} _temporary_;
if ntotal=1 then do;
   amount=&factor/50*abs(&var);
keep amount;
output;
stop;
end;
else do;
   if _n_=1 then do; _x[1]=&var; _x[2]=constant('BIG'); end;
else do;
            _x[2]=min(_x[2], &var - _x[1]);
   _x[1]=&var;
   if eof then do;
      amount=&factor/5*abs(_x[2]);
   keep amount;
   output;
   end;
end;
end;
  run;

%end;
%else %if %eval(&amount=0) %then %do;
data _duti&fid;
   set _util&fid;
amount=&factor*z/50;
keep amount;
output;
run;
%end;
%else %do;
data _duti&fid;
   amount=&amount;
keep amount;
output;
run;
%end;

proc sql noprint;
   select name into :keepvars separated by ' '
  from sashelp.vcolumn
  where  libname='WORK' and memname=%upcase("&data") and memtype="DATA"
  ;
quit;
data &data;
  array _x{1} _temporary_;
   if _n_=1 then do;
   set _duti&fid;
  _x[1]=amount;
  end;
   set &data;
  &newname=&var + ranuni(0)*(2*_x[1])-_x[1];
  label &newname="jitter(&var)";
  keep &keepvars  &newname;
run;
proc datasets library=work nolist;
   delete _duti&fid _xuti&fid _util&fid;
run;quit;
%mend;

%jitter(hours_of_sleep, xhos, data=student, factor=1, amount=);
%jitter(ToSleep, xts, data=student, factor=1, amount=);

goptions border  reset=all;
axis1  label=("jitter(ToSleep)") major=(height=2  number=5) minor=none;
axis2  label=(angle=90  "jitter(Hours of Sleep")    major=(height=2  number=5)  minor=none;
symbol1 value=circle  interpol=none  color=black;
proc gplot data=student;
   plot xhos*xts/haxis=axis1  vaxis=axis2;
run;quit;

/* old school */

proc reg data=student  lineprinter;
   title "Line Printer Style";
   model Hours_of_Sleep=ToSleep /noprint;
var    xhos  xts;
plot pred.*xts='X'  xhos*xts/overlay  symbol='o';
run;quit;
title;

ods select none;
title "ODS Graphics";
ods graphics on;
proc reg data=student ;
   model Hours_of_Sleep=ToSleep;
var    xhos  xts;
output  out=student  pred=p;
   ods select FitPlot;
run;quit;
ods graphics off;
title;
ods select all;

proc sort data=student  out=studentsorted;
   by ToSleep;
run;
goptions border  reset=all;
axis1  label=("jitter(ToSleep)")    order=(-2 to 6 by 2)  major=(height=2  number=5) minor=none;
axis2  label=(angle=90  "jitter(Hours of Sleep") offset=(, 2 pct)  major=(height=2  number=5)  minor=none;
symbol1 value=circle  interpol=none  color=black;
symbol2 value=none  interpol=line  color=black;
proc gplot data=studentsorted;
   plot xhos*xts  p*xts/overlay  haxis=axis1  vaxis=axis2;
run;quit;

proc sgplot data=student;
   scatter  x=xts  y=xhos;
reg  x=ToSleep  y=Hours_of_Sleep;
run;

/* 1.2.5 Study the relationship between variables */

ods select all ;
proc sgplot data=student;
   scatter x=xts  y=xhos;
reg x=ToSleep  y=Hours_of_sleep;
run;
ods html off;

/* section 1.3 Robustness of T-Statistics */
/* section 1.3.2 Write a function to compute T-Statistics */
%macro tstatistics(vars, dsn=last, outdsn=, class=);
%if &class='' %then %let classstmt=;
%else %let classstmt=class &class;

%let nvars=%sysfunc(count(&vars, ' '));

%let var1=%scan(&vars, 1, ' ');
%let var2=%scan(&vars, 2, ' ');
proc means data=&dsn noprint  nway;
   &classstmt;
  var &vars;
  output  out=tdata(where=(_STAT_ in ('STD', 'MEAN', 'N')));
run;
data &outdsn;
   set tdata; by &class _TYPE_;
  retain  m  n  mean1 mean2  std:;
  select(compress(_STAT_));
      when('N') do;
      m=&var1;
   n=&var2;
end;
when('MEAN') do;
            mean1=&var1;
   mean2=&var2;
end;
      when('STD') do;
      std1=&var1;
   std2=&var2;
end;
otherwise;
  end;
   if last._TYPE_ then do;
      sp=sqrt(((m-1)*std1**2 + (n-1)*std2**2)/(m+n-2));
   t= (mean1-mean2)/(sp * sqrt(1/m + 1/n));
  keep &class _TYPE_ m  n std:  mean:  sp t;
  output;
  end;
run;
%mend;

filename BCR "&folder";
%inc BCR(tstatistics);

data test;
   input x y;
cards;
1  5
4  4
3  7
6  6
5  10
;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=);

/* section 1.3.3 Monte Carlo study on the Robustness of T-Statistics */
%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 99787  seed2 76568;
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then call rannor(seed1, x); else x=.;
if j<=&n then call rannor(seed2, y); else y=.;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* section 1.3.4 Study the robustness of T-Statistics */

/* 1.) both are standard normal */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 987687  seed2 76568;
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then call rannor(seed1, x); else x=.;
if j<=&n then call rannor(seed2, y); else y=.;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* 2.) both are normal with std=1 and 10 */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 99787  seed2 76568;
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then call rannor(seed1, x); else x=.;
if j<=&n then call rannor(seed2, y); else y=.;
y=y*10;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* 2.) both are normal with std=1 and 10 */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 99787  seed2 76568;
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then call rannor(seed1, x); else x=.;
if j<=&n then call rannor(seed2, y); else y=.;
y=y*10;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* 3.) both are T-distribution with df=4 */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 99787  seed2 76568;
call streaminit(seed1);
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then x=rand('T', 4); else x=.;
if j<=&n then y=rand('T', 4); else y=.;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* 4.) one normal with mu=10 and std=2 while the other is exponential with rate=0.10 */

%let alpha=0.1;
%let m=10;
%let n=10;
%let Niter=10000;
data test;
   retain seed1 99787  seed2 76568;
do iter=1 to &Niter;
   do j=1 to max(&n, &m);
         if j<=&m then call rannor(seed1, x); else x=.;
if j<=&n then call ranexp(seed2, y); else y=.;
x=10+x*2;
y=y/0.1;
keep iter x y;
output;
end;
end;
run;

%tstatistics(x y, dsn=test, outdsn=tdata2, class=iter);

data _null_;
   set tdata2  end=eof;
retain n_reject 0;
if abs(t)>quantile('T', 1-&alpha/2, n+m-2) then n_reject+1;
if eof then do;
   call symput('n_reject', n_reject/_n_);
end;
run;
%put &n_reject;

/* draw the overlay density curves of empirical T-statistics and T-distribution */
proc sort data=tdata2;
   by t;
run;
data tdata2;
   set tdata2;
dt=pdf('T', t, 18);
run;

ods select none;
ods output Controls=Controls;
ods output UnivariateStatistics=UniStat;
proc kde data=tdata2  ;
   univar  t /out=density  method=snr  unistats;
run;
ods select all;

data density;
   set density;
dt=pdf('T', value, 18);
run;

data _null_;
   set UniStat;
if Descr='Bandwidth' then call symput ('bw', compress(round(t, 0.0001)));
run;

goptions reset=all border;

title "Comparing Empirical and Theoretical Densities";
legend label=none position=(top right inside)  mode=share;
axis1  label=("N=&Niter Bandwidth=&bw") order=(-4 to 8 by 2) offset=(1,1)  major=(height=2) minor=none;
axis2  label=(angle=90  "Density")    order=(0 to 0.4 by 0.1)  offset=(0, 0.1) major=(height=2)  minor=none;
symbol1  interpol=join  color=black  value=none  width=3;
symbol2  interpol=join  color=grey value=none  width=1;
proc gplot data=density;
   plot density*value  dt*value/overlay  haxis=axis1  vaxis=axis2  legend=legend;
   label dt='t(18)' density='Exact';
run;quit;

ods pdf close;
</code></pre><br />
<a imageanchor="1" target="_blank"  href="http://www.amazon.com/Bayesian-Computation-R-Use/dp/0387922970?ie=UTF8&tag=xie1978&link_code=bil&camp=213689&creative=392969"><img alt="Bayesian Computation with R (Use R)" src="http://ws.amazon.com/widgets/q?MarketPlace=US&ServiceVersion=20070822&ID=AsinImage&WS=1&Format=_SL160_&ASIN=0387922970&tag=xie1978" /></a><img src="http://www.assoc-amazon.com/e/ir?t=xie1978&l=bil&camp=213689&creative=392969&o=1&a=0387922970" width="1" height="1" border="0" alt="" style="border:none !important; margin:0px !important; padding: 0px !important" /><div class="blogger-post-footer"><img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/29815492-6025913615236309735?l=www.sas-programming.com' alt='' /></div><img src="http://feeds.feedburner.com/~r/SasProgramming/~4/yL6bIedyqD4" height="1" width="1"/>

欢迎光临 SAS中文论坛 (https://mysas.net/forum/)