/***************
 * TSLSFECLUS.sas
 * Siyi Luo, Wenjia Zhu, Randy Ellis
 * Current version: January 18, 2017
 *
 * Program to estimate OLS and TSLS models with multiple high-dimensional fixed effects
 * and clustered errror corrections.
 * This program contains two main macros.
 *
 * The first main macro %TSLSCLUS_iterFE (core macro) sequentially absorbs all the fixed effects
 * and estimates both OLS and TSLS models using cluster errror corrections.
 * Components:
 *   %COUNTVAR()        counts the number of elements in an array list such as ENDOG, INST and EXOG
 *   %CLUSTERDATASTEP() replaces the need to do slow SURVEYREG, and writes out results compactly
 *   %TSLSCLUS_iterFE() performs either OLS or TSLS estimation with clustered errror corrections;
 *                      calls the above and iterates over absorbing each mean in order,
 *                      ITERFE number of times
 *   %MAKEFITTED()      makes fitted values of RHS endogenous variables
 *   %FE()              absorbs one FE after sorting
 *   %SECONDSTAGE()     redoes second stage estimation with cluster correction
 * Notes:
 *   Variable lists INST and EXOG do not have to be explicit, and can use y1-y3 array notation.
 *   Fixed effects do not have to be already absorbed.
 *   If they are, put the number of fixed effects absorbed in the SUMFE macro variable.
 *   The program will be slow for large n, large NFE, or a large number of iterations.
 * Limitations:
 *   Assumes all exogenous variables are included in the list of the instruments.
 *   Requires that the ENDOG endogenous variable list be explicit, such as y1 y2 y3, not y1-y3.
 *
 * The second main macro %auto_iter calls %TSLSCLUS_iterFE iteratively and checks for convergence.
 *
 * Instructions on use:
 *   When there are multiple high-dimensional fixed effects:
 *     call %auto_iter only, which has %TSLSCLUS_iterFE embedded in it.
 *   When there is only one high-dimensional fixed effect:
 *     call %TSLSCLUS_iterFE directly because only one iteration is desired.
 *   Commented-out sample calls of each of these two macros are included at the end of this program.
 *
 * This header is a block comment on purpose: in open code the macro processor still acts on
 * percent-sign and ampersand triggers that appear inside an asterisk-style comment.
 *******************/
options dlcreatedir; options source;
*options nonotes;
options mprint ls = 200 ps = 9999 nocenter;
options obs=max;
*options obs = 100000;

/* Libref used for temporary files. The quoted path is a placeholder and must be replaced
   with a real directory before running. %FE passes this libref (hard-coded as "junk")
   to %bigsort. */
libname junk "directory for storing temporary files";

/* NOTE: comments in this file that mention a macro by name use block-comment syntax.
   In open code the macro processor still executes macro triggers found inside an
   asterisk-style comment, so naming an already-compiled macro there would invoke it. */

/* Macros called by main macro %TSLSCLUS_iterFE (core macro) */

%macro countvar(list=,_count=);
    /*---------------------------------------------------------------------
      Counts the number of members of a list of variables, allowing for
      array notation (e.g. y1-y3).

      list   : variable list to count
      _count : NAME of the global macro variable that receives the count

      The list is expanded by declaring it as an implicit DATA step array
      and iterating over it. If the list is empty a message is written to
      the log and processing continues.
    ---------------------------------------------------------------------*/
    %if %length(&list) = 0 %then %put ERRrOR NO VARIABLES IN VARLIST &list.;

    data _null_;
        array _list &list;
        i=0;
        do over _list;
            i=i+1;
        end;
        call symputx('__count',i);
    run;

    %global &_count;
    %let &_count = &__count;
    %put &_count = &__count.;
%mend;

%macro bigsort(indata=in,outdata=out, byvar=id, parts=10, junkdir=);
    /*---------------------------------------------------------------------
      Breaks a big file into PARTS pieces, sorts each piece, and then
      interleaves them back together by BYVAR. It is inefficient, but
      minimizes the risk of overflowing available disk space.

      indata  : input data set (data set options allowed)
      outdata : sorted output data set
      byvar   : sort key(s)
      parts   : number of pieces
      junkdir : libref that holds the pieces

      The pieces are written as <junkdir>.temp1 ... <junkdir>.temp<parts>;
      members with those names are overwritten and are deleted at the end.
    ---------------------------------------------------------------------*/
    %local ibs _nobs _partobs;   /* keep work variables out of the caller's scope */

    /* count number of observations in indata */
    data _null_;
        dsid=open("&indata.");
        z=attrn(dsid,"nobs");
        call symputx('_nobs', z);          /* symputx: no implicit numeric-to-char conversion */
        partobs=z/&parts.;
        call symputx('_partobs', partobs);
        if dsid > 0 then rc=close(dsid);   /* release the handle obtained from open() */
    run;

    %put 'nobs in full dataset ' &_nobs ;
    %put 'parts used for bigsort ' &parts ;
    %put 'nobs in each part in bigsort' &_partobs ;

    /* split dataset into the pieces in the junkdir library */
    data %do ibs = 1 %to &parts; &junkdir..temp&ibs. %end; ;
        drop _done;
        set &indata;
        %do ibs = 1 %to &parts;
            if (&ibs-1)*&_partobs < _n_ <= &ibs*&_partobs then do;
                output &junkdir..temp&ibs.;
                _done=1;
            end;
        %end;
        /* this line worries that the last obs may be dropped due to rounding errrors */
        if _done ne 1 then output &junkdir..temp&parts. ;
    run;

    /* sort each part, overwriting the original */
    %do ibs = 1 %to &parts;
        proc sort data=&junkdir..temp&ibs.;
            by &byvar.;
        run;
    %end;

    /* merge together pieces (SET with BY interleaves the sorted pieces) */
    data &outdata;
        set %do ibs = 1 %to &parts; &junkdir..temp&ibs. %end; ;
        by &byvar.;
    run;

    proc datasets library =&junkdir. noprint;
        delete %do ibs = 1 %to &parts; temp&ibs. %end; ;
    quit;
    run;
%mend;

%macro ClusterDataStep(
    _indata=fitdata,
    _depvar=outcome,
    _rhs= ,
    _usebeta=betavcov2SLS,
    _cluster = cluster,
    _sumfe =0,
    tableformat=
);
    /*---------------------------------------------------------------------
      Does clustered standard error calculations for the final model using
      a series of DATA steps instead of PROC SURVEYREG.

      _indata     : analysis data set
      _depvar     : dependent variable
      _rhs        : right-hand-side variables of the model that is re-run
      _usebeta    : OUTEST= data set whose first row supplies the reported
                    coefficients
      _cluster    : cluster variable
      _sumfe      : number of absorbed fixed effects (degrees-of-freedom
                    correction)
      tableformat : output data set for the compact table (used when
                    &wide is not "yes")

      Also reads these macro variables from the calling environment:
        &_k (number of parameters incl. intercept), &endog, &exog,
        &regtype, &_depmean, &_depstd, &wide.

      Work data sets written: clusterlist, ctrl, betavcovagain,
      finalcorrect, varlist, varobs; format CLUSFORMAT in WORK.

      NOTE(review): the p-value is cdf('t', -abs(t), g-1), i.e. a single
      tail probability. Confirm whether a two-sided value was intended.
    ---------------------------------------------------------------------*/
    proc freq data = &_indata. (keep=&_cluster) noprint;
        table &_cluster/ out=clusterlist (drop=percent) all nopercent;
    run;

    data _null_;
        set clusterlist end=eof;
        countg+1;
        if eof then call symputx('_g',countg);   /* save g as count of clusters */
    run;

    %let _gminus1 = %eval(&_g - 1);
    %let _ktimesk=%eval(&_k * &_k);
    %let _ktimesg=%eval(&_k * &_g);
    %let _kplus1 = %eval(&_k + 1);

    /* numeric format CLUSFORMAT maps each cluster value to its sequence number 1..g */
    data ctrl;
        length label $ 8 ;
        set clusterlist (rename=(&_cluster = start)) end=eof;
        label= put(_n_, 8.);
        retain fmtname 'clusformat' type 'n';
        output;
        if eof then do;
            start=.;
            hlo='O';
            label='***ERRrOR';
            output;
        end;
    run;

    proc format library=work cntlin=ctrl;
    run;

    /* create macro variable _nnn, the number of obs in the data set */
    data _null_;
        dsid=open("&_indata.");
        z=attrn(dsid,"nobs");
        call symputx('_nnn', z);
    run;
    %put number of obs nnn = &_nnn.;

    proc reg data = &_indata outest=betavcovagain covout noprint;
        title recalculation of beta and vcov using final model just before datastep;
        model &_depvar = &_rhs;
    run;

    /* model degrees of freedom: n - k - absorbed fixed effects
       (macro statement; evaluated while the step below is being generated) */
    %let _modelDF = %eval(%eval(&_nnn - &_k) - &_sumfe.);

    data finalcorrect ;
        length label $ 30 star1-star&_k $ 5 v1-v5 $ 12;
        retain i j k g gg kcounter dofirst 1
               one 1
               index 0
               kcounter 0
               ncounter 0
               _b1-_b&_k. _se1-_se&_k. 0
               nnn &_nnn.
               varlab1-varlab&_k. ;
        keep label v1 v2 v3 v4 v5;
        array coef {&_k} _b1-_b&_k.;
        array true_se {&_k} _se1-_se&_k.;
        array tratio {&_k} _tr1-_tr&_k.;
        array pvalue {&_k} _pv1-_pv&_k.;
        array star {&_k} star1-star&_k;
        length varlab1-varlab&_k. $15;
        array varlab {&_k} varlab1-varlab&_k.;
        array betavar {&_k} intercept &_rhs.;
        array rhsvar {&_k} one &_rhs.;
        array truevar {&_k} one &endog. &exog.;
        array xpxinv {&_k., &_k.} xpxinv1-xpxinv&_ktimesk;    /* X prime X inverse */
        array xgxgmat {&_k., &_k.} xgxg1-xgxg&_ktimesk;       /* product of XG*XG */
        array xgegmat {&_k., &_g.} xgeg1-xgeg&_ktimesg;       /* cross product of xk and residuals by g */
        array xekmean {&_k.} xk1-xk&_k.;                      /* mean for each xk*e value in group */
        array ng {&_g.} ng1-ng&_g.;                           /* number of people in each group */
        array vc {&_k., &_k.} vc1-vc&_ktimesk;
        array tempvc {&_k., &_k.} tempvc1-tempvc&_ktimesk;
        retain RMSE_true 0
               RMSE_BIASED 0
               true_sse 0
               xpxinv1-xpxinv&_ktimesk xgxg1-xgxg&_ktimesk xgeg1-xgeg&_ktimesg
               xk1-xk&_k. ng1-ng&_g vc1-vc&_ktimesk tempvc1-tempvc&_ktimesk 0
               _depvar_;

        /* read in the coef and varcov matrix and convert to x prime x inverse */
        if dofirst=1 then do;
            set betavcovagain ;
            retain countrows 0;
            countrows=countrows+1;
            if _type_='PARMS' then do;
                RMSE_BIASED =_RMSE_;
                do i = 1 to &_k;
                    coef[i]=betavar[i];
                end;
            end;
            else do;   /* var cov matrix terms */
                kcounter=kcounter+1;
                if RMSE_BIASED=. then put "problem is RMSE_BIASED";
                if RMSE_BIASED > 0 then do j=1 to &_k;
                    xpxinv[kcounter,j]=betavar[j]/(RMSE_BIASED*RMSE_BIASED);
                end;
            end;       /* end reading varcov terms */
            /* put xpxinv1= xpxinv2= xpxinv3=;  for test only */
            if countrows=&_k+1 then dofirst=0;
        end;           /* end do group for dofirst=1 */
        else do;       /* do group for reading in and processing data */
            ncounter=ncounter+1;
            set &_indata end=eof;
            g=input(put(&_cluster,CLUSFORMAT.),8.);
            ng(g)=ng(g) + 1;                     /* count people in the cluster */
            yhat=0;
            do k = 1 to &_k;                     /* calculate yhat */
                yhat=yhat+coef[k]*truevar[k];
            end;
            err=&_depvar - yhat;
            do k = 1 to &_k;                     /* sum up cross product of Xk and err */
                xgegmat[k,g]=xgegmat[k,g] + err*rhsvar[k];
            end;
            true_sse=true_sse+err*err;           /* accumulate SSE for use at end */
            sumerr+err;
        end;

        if eof then do;
            RMSE_true=sqrt(true_sse/(&_modeldf.));

            /* convert XgEg matrix to means and then deviations from means */
            do k=1 to &_k;
                xekmean[k]=0;
                do gg=1 to &_g;
                    xekmean[k]=xekmean[k]+ xgegmat[k,gg];
                end;
                xekmean[k]=xekmean[k]/&_g.;
                do gg=1 to &_g;
                    xgegmat[k,gg]=xgegmat[k,gg]-xekmean[k];
                end;
            end;

            /* calculate xgxg matrix */
            do i=1 to &_k;
                if coef[i] ne 0 then do k=1 to &_k;
                    if coef[k] ne 0 then do gg=1 to &_g;
                        xgxgmat[i,k]=xgxgmat[i,k]+ xgegmat[i,gg]*xgegmat[k,gg];
                    end;
                end;
            end;

            /* multiply out xpxinv*xgxg */
            do i=1 to &_k;
                do k=1 to &_k;
                    do j=1 to &_k;
                        tempvc[i,k]=tempvc[i,k]+ xpxinv[i,j]*xgxgmat[j,k];
                    end;
                end;
            end;

            /* multiply out result*xpxinv */
            do i=1 to &_k;
                do k=1 to &_k;
                    do j=1 to &_k;
                        vc[i,k]=vc[i,k]+ tempvc[i,j]*xpxinv[j,k];
                    end;
                end;
            end;

            /* finite-sample scaling: g/(g-1) * (n-1)/(n-k-sumfe) */
            do k = 1 to &_k;
                if vc[k,k]>0 then true_se[k]=sqrt(vc[k,k]*(&_g./(&_g.-1))*(&_nnn. - 1)/(&_nnn. - &_k.- &_sumfe.));
                else true_se[k]=.;
            end;
            meanerr=sumerr/nnn;

            /* eleven header rows; later steps rely on this count (_n_ > 11) */
            label='dependent variable:';
            v1 = "&_depvar";
            v2 = "&regtype";
            output finalcorrect ;

            label='Dependent mean';
            v1=put(&_depmean,best12.);
            v2="";
            output finalcorrect ;

            label='Dependent std dev';
            v1=put(&_depstd,best12.);
            output finalcorrect ;

            %put &_nnn &_sumfe;

            /* put() writes the number as text directly; same right-aligned result as
               the implicit conversions it replaces, without the conversion notes */
            label='samplesize';
            v1=put(&_nnn,12.);
            output finalcorrect ;

            label='number of fixed effects';
            v1=put(&_sumfe,12.);
            output finalcorrect ;

            label='number of model parameters';
            v1=put(&_k,12.);
            output finalcorrect ;

            label='number of cluster values';
            v1=put(&_g,12.);
            output finalcorrect ;

            label='total model degrees of freedom';
            v1=put(&_modeldf.,12.);
            output finalcorrect ;

            label='SSE';
            v1=put(true_sse,best12.);
            output finalcorrect ;

            label ='RMSE';
            v1=put(RMSE_true, best12.);
            output finalcorrect ;

            label='variable';
            v1 = 'Coef.';
            v2 = 'st. err.';
            v3 = 't-ratio';
            v4 = 'p value';
            v5 = 'sign.';
            output finalcorrect ;

            put // "dependent variable = " _depvar_ /;
            /* put 'varlab[k] coef[k] true_se[k] tratio[k] pvalue[k] star[k] ';  for test only */

            do k = 1 to &_k;
                if true_se[k]>0 then do;
                    tratio[k]=coef[k]/true_se[k];
                    pvalue[k]=cdf('t',-abs(tratio[k]),&_gminus1.);
                    if pvalue[k] <.01 then star[k]='***';
                    else if pvalue[k] <.05 then star[k]='** ';
                    else if pvalue[k] <.10 then star[k]='* ';
                    else star[k]=' ';
                end;
                /* put varlab[k] coef[k] true_se[k] tratio[k] pvalue[k] star[k];  for test only */
                label= varlab[k];
                v1=coef[k];
                v2=true_se[k];
                v3=tratio[k] ;
                v4=pvalue[k];
                v5=star[k];
                output finalcorrect ;
            end;
        end;
    run;

    title Cluster regression results using &_indata.;
    title2 Cluster =&_cluster;
    title3 model &_depvar = &_rhs;

    /* coefficients to report: first row of &_usebeta, one row per parameter after transpose */
    data varlist;
        set &_usebeta (obs=1);
        keep intercept &_rhs;
    run;

    proc transpose data=varlist out=varobs;
    run;
    /* proc print data = varlist;  for test only */
    run;

    data finalcorrect;
        length v1 $20;
        set finalcorrect;
        if _n_ >11 then do;
            set varobs (rename =(_name_ = varname));
            v1=put(col1,12.8);
            if col1 ne 0 then do;
                /* informat 12. (not 12.8): with d=8 a standard error whose text has no
                   decimal point would be divided by 10**8 */
                tratio=col1/input(v2, 12.);
                pvalue=cdf('t',-abs(tratio),&_gminus1.);
                if pvalue <.01 then star='***';
                else if pvalue <.05 then star='** ';
                else if pvalue <.10 then star='* ';
                else star=' ';
                v2=v2;
                v3=tratio;
                v4=pvalue;
                v5=star;
            end;
            if label = "" then do;
                label = varname;
                if col1=0 then label = trim(label) || ' B';
            end;
        end;
        output;
    run;

    %if &wide = yes %then %do;
        proc print data = finalcorrect;
            var label v1-v5;
        run;
    %end;
    %else %do;
        /* proc print data = varobs;  for test only */

        /* compact table: a coefficient row followed by a "(std err)" row per parameter */
        data &tableformat;
            length v1 $20;
            set finalcorrect;
            if _n_ >11 then do;
                set varobs (rename =(_name_ = varname));
                stderr=v2;
                v1=put(col1,12.8);
                if col1 ne 0 then do;
                    tratio=col1/input(v2, 12.);   /* 12. for the same reason as above */
                    pvalue=cdf('t',-abs(tratio),&_gminus1.);
                    if pvalue <.01 then star='***';
                    else if pvalue <.05 then star='** ';
                    else if pvalue <.10 then star='* ';
                    else star=' ';
                end;
                v2=star;
                if label = "" then do;
                    label = varname;
                    if col1=0 then label = trim(label) || ' B';
                end;
            end;
            output;
            if _n_>11 then do;
                label="";
                if col1 ne 0 then v1="("||stderr||")";
                else v1="";
                v2="";
                output;
            end;
            drop varname;
        run;

        proc print data = &tableformat;
            var label v1 v2 ;
        run;
    %end;
%mend;

%macro FE(FEVAR=, FEdata=, outdata=);
    /*---------------------------------------------------------------------
      Absorbs one fixed effect defined by FEVAR after sorting.

      FEVAR   : variable that defines the fixed effect
      FEdata  : input data set; it is overwritten with its sorted copy
      outdata : output data set with the model variables centred on zero
                within each level of FEVAR

      Reads &allvar and &modelvar from the calling environment and sets
      &datanow there. The sort pieces go to the libref "junk".
    ---------------------------------------------------------------------*/
    /* older callers relied on &fevar_now from the enclosing scope; honour that
       only when FEVAR itself was not supplied */
    %if %length(&fevar) = 0 and %symexist(fevar_now) %then %let fevar = &fevar_now;

    %let datanow=&fedata;
    %bigsort(indata=&fedata (keep=&allvar), outdata=&datanow, byvar=&fevar, parts=5, junkdir=junk);

    /* standardize by the fixed-effect variable, one var at a time */
    proc standard data = &datanow. out =&outdata mean=0;
        var &modelvar. ;
        by &fevar;
    run;
%mend;

/* Begin the first main macro %TSLSCLUS_iterFE */
%macro TSLSCLUS_iterFE(
    runtitle = "insert descriptive title", /* running title */
    indata = ,       /* input data for each iteration */
    depvar = ,       /* dependent variable */
    endog = ,        /* endogenous variable */
    inst = ,         /* instrumental variable */
    exog = ,         /* exogenous variable */
    fe = ,           /* variables defining absorbed fixed effects */
    FE_iter = ,      /* incremental on iteration number */
    cluster = ,      /* variable defining cluster level */
    othervar = ,     /* other variables to be carried along to final dataset for final analysis*/
    tempdir = ,      /* directory for storing temporary data sets */
    regtype = ,      /* TSLS or OLS */
    showmeans = ,    /* yes or no to showing sample summary statistics */
    showrf = ,       /* yes or no to showing reduced form results of TSLS model */
    showols = ,      /* yes or no to OLS without cluster correction */
    dosurveyreg = ,  /* yes or no to doing PROC SURVEYREG */
    wide = ,         /* yes or no to wide format table */
    estresult =      /* data set for outputting estimates from current iteration */
);
    /*---------------------------------------------------------------------
      Core macro: absorbs each fixed effect in &fe in turn (repeated
      &FE_iter times), then estimates OLS or TSLS with cluster-corrected
      standard errors through %secondstage / %ClusterDataStep.

      Intermediate data sets are written to &tempdir as ttemp1..ttemp<n>
      (n = number of fixed effects); the last one holds the data after all
      absorptions. %makefitted and %secondstage are defined inside this
      macro and read its parameters from the enclosing scope.
    ---------------------------------------------------------------------*/
    %let allvar= &depvar &endog &exog &inst &fe &cluster &othervar ;
    %let modelvar= &depvar &endog &exog &inst;
    %let fitvar=;
    %let nnn=;
    %let sumFE=0;

    data _temp;
        runtitle = &runtitle.;
        title &runtitle.;
    run;
    proc print data = _temp;
    run;
    title;

    /* these steps count how many of each type of variable there are,
       and also create a list of variable names.
       Each array starts with ONE so that an empty list still declares a valid array;
       the counter therefore starts at -1. */
    data varlist;
        one=1;
        array dep one &depvar.;
        i=-1;
        do over dep;
            i=i+1;
        end;
        call symputx('depvarct',i);

        array endog one &endog.;
        i=-1;
        do over endog;
            i=i+1;
        end;
        call symputx('endogvarct',i);

        array exog one &exog.;
        i=-1;
        do over exog;
            i=i+1;
        end;
        call symputx('exogvarct',i);

        array inst one &inst.;
        i=-1;
        do over inst;
            i=i+1;
        end;
        call symputx('instvarct',i);

        array fe one &fe.;
        i=-1;
        do over fe;
            i=i+1;
        end;
        call symputx('fevarct',i);

        drop i one &depvar. &fe. &inst. ;
    run;

    proc transpose data=varlist out=varobs;
    run;

    /* this datastep adds 12 obs at the beginning to match up with final output */
    data varobs;
        ncounter+1;
        if _n_=1 then do i = 1 to 12;
            output;
            ncounter=ncounter+1;
        end;
        set varobs;
        drop i col1;
        output;
    run;

    %let k=%eval(%eval(&endogvarct + &exogvarct) + 1);
    %let ktimesk=%eval(&k. * &k.);
    %put '&depvarct. &endogvarct. &exogvarct. &instvarct. &k. &ktimesk.';
    %put &depvarct. &endogvarct. &exogvarct. &instvarct. &k. &ktimesk.;

    %if &showmeans=no %then %let np=noprint;
    %else %let np = ;

    proc means data = &indata &np. ;
        title1 proc means ONE data = &indata on original input data (can be standardized already);
        var &modelvar.;
        output out=origmeans;
    run;

    /*********************************************************************************
     * Iterate FE_ITER times over removing fixed effects
     * Does not care whether already sorted by FE1
     * Any number of dimensions of FE
     ********************************************/
    data &tempdir..ttemp&fevarct.;
        set &indata;
    run;

    %do iii=1 %to &FE_iter;
        %do i=1 %to &fevarct.;
            %let ii=%eval(&i+1);
            /* the first fixed effect of a pass starts from the last data set of the previous pass */
            %if &i. = 1 %then %let datanow=&tempdir..ttemp&fevarct.;
            %else %let datanow = &lastdata.;
            %let fevar_now=%scan(&fe,&i);

            %FE(FEVAR=&fevar_now, FEdata=&datanow., outdata=&tempdir..ttemp&i.);
            %let lastdata = &tempdir..ttemp&i.;

            /* count unique values of fixed effects to be added to model degrees of freedom */
            %if &iii. = 1 %then %do;
                data _null_;
                    set &tempdir..ttemp&i. (keep=&fevar_now) end=eof;
                    by &fevar_now;
                    if first.&fevar_now then countfe+1;
                    if eof then call symputx('kkk',countfe);
                run;
                %let sumFE=%eval(&sumfe + &kkk);
                %put kkk &kkk sumfe &sumfe.;
            %end;   /* end do group for iii = 1 */
        %end;       /* end do loop over fevarct */
    %end;           /* end do loop over iii */

    %let datanow = &lastdata;

    %if &showmeans=yes %then %do;
        title proc means on &datanow after removing all fixed effects ;
        proc means data = &datanow;
        run;
    %end;

    %macro makefitted(stddata=&datanow, outdata=&datanow.fit);
        /*-----------------------------------------------------------------
          Builds the SSCP matrix (xprimex) of all model variables and, when
          &showOLS is yes, prints OLS regressions for each dependent
          variable. Also turns &showrf into the PROC REG print option used
          later for the first-stage regressions (blank or NOPRINT).
          OUTDATA is accepted but not used.
        -----------------------------------------------------------------*/
        %if &showrf=yes %then %let showrf = ;
        %else %let showrf = noprint;
        %put 'debug showrf' &showrf;

        /* Create the XprimeX matrix for endogenous and exogenous variables */
        proc reg data =&stddata. outsscp = xprimex plots= none noprint;
            var &depvar &endog &exog &inst;
        run;

        /* show OLS if requested */
        %if &showOLS=yes %then %do i=1 %to &depvarct;
            %let depvar_now=%scan(&depvar,&i);
            title OLS regressions using &stddata with &FE. ;
            title2 proc reg data = xprimex;
            title3 model &depvar_now = &endog &exog;
            proc reg data = xprimex ;
                &depvar_now: model &depvar_now = &endog &exog;
            run;
        %end;   /* end do loop over depvarct */
    %mend;

    %macro secondstage(datasecond=&datanow.);
        /*-----------------------------------------------------------------
          Second-stage estimation. For regtype OLS it runs
          %ClusterDataStep on the absorbed data; otherwise it runs the
          first-stage regressions, merges the fitted values, estimates
          TSLS and then calls %ClusterDataStep.

          NOTE: depmean/depstd are extracted in a loop over the dependent
          variables that ends before the regressions, so with several
          dependent variables the values of the last one are reported, and
          &estresult is overwritten for each dependent variable.
        -----------------------------------------------------------------*/

        /* count and identify the groups in each cluster */
        proc freq data = &datasecond. (keep=&cluster) noprint;
            table &cluster/ out=clusterlist (drop=percent) all nopercent;
        run;

        data _null_;
            set clusterlist end=eof;
            countg+1;
            if eof then call symputx('g',countg);
        run;

        %let gminus1 = %eval(&g - 1);
        %let ktimesg = %eval(&k * &g);
        %put &ktimesg;

        data ctrl;
            length label $ 8 ;
            set clusterlist (rename=(&cluster = start)) end=eof;
            label= put(_n_, 8.);   /* explicit conversion, same as in ClusterDataStep */
            retain fmtname 'clusformat' type 'n';
            output;
            if eof then do;
                start=.;
                hlo='O';
                label='***ERRrOR';
                output;
            end;
        run;

        proc format library=work cntlin=ctrl;
        run;

        %let kplus1 = %eval(&k + 1);

        /* now do all of the second stage regressions */
        %do i=1 %to &depvarct;
            %let depvar_now=%scan(&depvar,&i);
            %let modelvar_now= &depvar_now. &endog. &exog.;
            %if &showmeans=yes %then %let np= simple ;
            %else %let np=noprint;

            /* extract means and std of original depvar: rows 4 and 5 of the PROC MEANS
               OUTPUT data set (expected to be the MEAN and STD rows) */
            data _null_;
                set origmeans(keep=&depvar_now firstobs=4 obs=5);
                if _n_=1 then call symputx('depmean', &depvar_now.);
                if _n_=2 then call symputx('depstd', &depvar_now.);
            run;
            %put &depmean &depstd;
        %end;   /* end loop over depvar */

        /*********************************************************
         * The following segment of the Macro TSLSCLUS does either OLS or TSLS with
         * clustered errrors using Proc Surveyreg and also using datastep
         * assumes post standardization
         *********************************************************/
        data _null_;
            one=1;
            array dep one &depvar.;
            i=-1;
            do over dep;
                i=i+1;
            end;
            call symputx('_ctdepvar',i);

            array endog one &endog.;
            i=-1;
            do over endog;
                i=i+1;
            end;
            call symputx('_ctendog',i);

            array exog one &exog.;
            i=-1;
            do over exog;
                i=i+1;
            end;
            call symputx('_ctexog',i);

            array inst one &inst.;
            i=-1;
            do over inst;
                i=i+1;
            end;
            call symputx('_ctinst',i);

            array fe one &fe.;
            i=-1;
            do over fe;
                i=i+1;
            end;
            call symputx('fevarct',i);
        run;

        %countvar(list=one &endog &exog, _count=_k);
        %let _depstd = &depstd;
        %let _depmean= &depmean.;
        %put depvar= &depvar;
        %put endog = &endog;
        %put exog = &exog;
        %put inst = &inst;
        %put depvar &_ctdepvar endog &_ctendog exog &_ctexog inst &_ctinst total k &_k;
        %put;

        %if %upcase(&regtype) = OLS %then %do;
            %do _i=1 %to &_ctdepvar.;
                %let depvar_now=%scan(&depvar,&_i);

                %if &dosurveyreg=yes %then %do;
                    proc surveyreg data = &datanow.;
                        cluster &cluster;
                        title &depvar_now = &endog. &exog. using data = &indata from &datanow.;
                        title2 this is the one to match;
                        model &depvar_now = &endog. &exog. /solution;
                    run;
                %end;   /* end do group for dosurveyreg=yes */

                /* the following should yield the same results as the above but faster */
                %ClusterDatastep(
                    _indata     = &datanow.,
                    _depvar     = &depvar_now.,
                    _rhs        = &endog &exog,
                    _usebeta    = betavcovagain,
                    _cluster    = &cluster,
                    _sumfe      = &sumfe.,
                    tableformat = &estresult
                );
                run;
            %end;   /* end do loop over _ctdepvar */
        %end;       /* end do group for regtype=OLS with cluster correction */

        %if %upcase(&regtype) ne OLS %then %do;
            /* Do 2SLS using Proc surveyreg and clusterdatastep() */
            %let fitvar=;

            proc reg data = &datanow. outsscp = xprimex plots = none noprint;
                var &depvar &endog &exog &inst;
            run;

            %do _i=1 %to &_ctendog;
                %let endog_now=%scan(&endog,&_i);
                %let tempvar=&fitvar;
                %let fitvar=&tempvar &endog_now.fit;

                title first stage regressions on rhs endogenous variables using &datanow. with &FE. fixed effects;
                title2 proc reg data = xprimex outest=beta&_i &showRF.;
                title3 model &endog_now = &inst &exog;
                proc reg data = xprimex outest=beta&_i &showRF.;
                    &endog_now.fit: model &endog_now = &inst &exog;
                run;

                proc score data=&datanow. score=beta&_i type=parms
                           out=&tempdir..fit&_i (keep=&endog_now.fit) predict;
                    var &inst. &exog.;
                run;
            %end;   /* end do loop over endogenous variables */

            /* Now merge all of the fitted values back in with the original data */
            data &tempdir..two (compress=no);
                merge &datanow. end=eof
                    %do _i=1 %to &_ctendog; &tempdir..fit&_i %end; ;
                if eof then call symputx('_nnn', _n_);   /* macro variable _nnn is count of obs */
            run;

            proc reg data = &tempdir..two outsscp = xprimex plots = none noprint;
                var &depvar &endog &exog &inst &fitvar;
            run;

            /* it is faster to do TSLS using this sequence than to just use Proc SYSLIN on the original data */
            %do _i=1 %to &_ctdepvar.;
                %let depvar_now=%scan(&depvar,&_i);

                proc reg data = xprimex outest=betavcov2sls covout /*noprint */;
                    title TSLS result with correct beta estimates but wrong standard errrors;
                    &depvar_now.res: model &depvar_now = &fitvar &exog ;
                run;

                proc score data = &tempdir..two score = betavcov2sls (drop=_name_) type=PARMS
                           out=fitdata residual;
                    var &depvar_now. &fitvar. &exog.;
                run;

                %if &dosurveyreg=yes %then %do;
                    proc surveyreg data = fitdata;
                        cluster &cluster;
                        title Surveyreg using &depvar = &fitvar. &exog. using data =fitdata from &datanow.;
                        title2 the betas are wrong but standard errrors are correct;
                        model &depvar_now.res = &fitvar. &exog. /solution;
                    run;
                %end;

                /* the following should yield the same results as the above but faster */
                %ClusterDatastep(
                    _indata     = fitdata,
                    _depvar     = &depvar_now.,
                    _rhs        = &fitvar. &exog,
                    _usebeta    = betavcov2SLS,
                    _cluster    = &cluster,
                    _sumfe      = &sumfe.,
                    tableformat = &estresult
                );
                run;
            %end;   /* end do loop over _ctdepvar */
        %end;       /* end do group for regtype ne OLS */

        proc datasets library = &tempdir memtype = data noprint;
            delete fit1-fit&_ctendog. two;
        run;
        quit;
    %mend;

    %makefitted(stddata=&datanow.,outdata=&datanow.fit);
    %secondstage();
%mend;

/* Second main macro %auto_iter                                   */
/* Note that it calls %TSLSCLUS_iterFE one time or iteratively    */
%macro auto_iter(
    indsn = ,            /* input data */
    tol = ,              /* tolerance level for convergence */
    maxiter = ,          /* maximum number of iteration */
    betasefinal = ,      /* output data for storing estimates from all iterations */
    fevarcount = ,       /* number of absorbed fixed effects */
    auto_tempdir = ,     /* directory for storing temporary data sets */
    auto_depvar = ,      /* dependent variable */
    auto_endog = ,       /* endogenous variable */
    auto_inst = ,        /* instrumental variable */
    auto_exog = ,        /* exogenous variable */
    auto_fe = ,          /* variables defining absorbed fixed effects */
    auto_FE_iter = ,     /* incremental on iteration number */
    auto_cluster = ,     /* variable defining cluster level */
    auto_othervar = ,    /* other variables to be carried along to final dataset for final analysis */
    auto_regtype = ,     /* TSLS or OLS */
    auto_showmeans = ,   /* yes or no to showing sample summary statistics */
    auto_showrf = ,      /* yes or no to showing reduced form results of TSLS model */
    auto_showols = ,     /* yes or no to OLS without cluster correction */
    auto_dosurveyreg = , /* yes or no to doing PROC SURVEYREG */
    auto_wide =          /* yes or no to wide format table */
);
    /*---------------------------------------------------------------------
      Runs %TSLSCLUS_iterFE on &indsn, then re-runs it on the absorbed data
      (&auto_tempdir..ttemp&fevarcount) until the coefficient in row 14 of
      the result table changes by less than &tol (relative change) or
      &maxiter is reached.

      &betasefinal receives the estimates of the last iteration plus a
      "number of iterations" row. It is written only inside the loop that
      starts at iteration 2, so maxiter must be at least 2.
      Row 14 is the first right-hand-side coefficient of the compact
      (wide=no) table, which is the only table written to &estresult.
    ---------------------------------------------------------------------*/
    %let totiter = 1;

    %TSLSCLUS_iterFE(
        runtitle = "&auto_regtype.: &totiter. iteration(s)", /* running title */
        indata = &indsn.,                 /* input data for each iteration: &indsn. for the first iteration, standardized data for subsequent iterations */
        depvar = &auto_depvar.,           /* dependent variable */
        endog = &auto_endog.,             /* endogenous variable */
        inst = &auto_inst.,               /* instrumental variable */
        exog = &auto_exog.,               /* exogenous variable */
        fe = &auto_fe.,                   /* variables defining absorbed fixed effects */
        FE_iter = &auto_FE_iter.,         /* incremental on iteration number */
        cluster = &auto_cluster.,         /* variable defining cluster level */
        othervar = &auto_othervar.,       /* other variables to be carried along to final dataset for final analysis */
        tempdir = &auto_tempdir.,         /* directory for storing temporary data sets */
        regtype = &auto_regtype.,         /* TSLS or OLS */
        showmeans = &auto_showmeans.,     /* yes or no to showing sample summary statistics */
        showrf = &auto_showrf.,           /* yes or no to showing reduced form results of TSLS model */
        showols = &auto_showols.,         /* yes or no to OLS without cluster correction */
        dosurveyreg = &auto_dosurveyreg., /* yes or no to doing PROC SURVEYREG */
        wide = &auto_wide.,               /* yes or no to wide format table */
        estresult = coreprev              /* data set for outputting estimates from current iteration */
    );

    /***************************************************************
      REPORT stores the main results for each iteration
    ***************************************************************/
    data report;
        set coreprev;
        keep label v1;
        rename v1=iter1;
        id=_N_;
        if id<10 then delete;
        drop id;
    run;

    data report;
        set report;
        id=_n_;
        output;
    run;

    /***************************************************************
      Save the result of the 1st iteration in prev as the basecase
    ***************************************************************/
    data coreprev;
        set coreprev;
        keep v1;
        rename v1=iterprev;
        id=_N_;
        if id~=14 then delete;   /* only keep RHS variable of interest */
        /* for now it assumes we check convergence of coefficient of interest (here only x) */
        drop id;
    run;

    /* COREPREV stores the core values of the previous iteration for checking convergence */
    data coreprev;
        set coreprev;   /* 2 var: id and v1 */
        id = _n_;       /* 1 obs: RMSE */
        output;
    run;
    /* proc print data=coreprev;  for test only */

    %do totiter = 2 %to &maxiter;
        %TSLSCLUS_iterFE(
            runtitle = "&auto_regtype.: &totiter. iteration(s)", /* running title */
            indata = &auto_tempdir..ttemp&fevarcount., /* input data for each iteration: &indsn. for the first iteration, standardized data for subsequent iterations */
            depvar = &auto_depvar.,           /* dependent variable */
            endog = &auto_endog.,             /* endogenous variable */
            inst = &auto_inst.,               /* instrumental variable */
            exog = &auto_exog.,               /* exogenous variable */
            fe = &auto_fe.,                   /* variables defining absorbed fixed effects */
            FE_iter = &auto_FE_iter.,         /* incremental on iteration number */
            cluster = &auto_cluster.,         /* variable defining cluster level */
            othervar = &auto_othervar.,       /* other variables to be carried along to final dataset for final analysis */
            tempdir = &auto_tempdir.,         /* directory for storing temporary data sets */
            regtype = &auto_regtype.,         /* TSLS or OLS */
            showmeans = &auto_showmeans.,     /* yes or no to showing sample summary statistics */
            showrf = &auto_showrf.,           /* yes or no to showing reduced form results of TSLS model */
            showols = &auto_showols.,         /* yes or no to OLS without cluster correction */
            dosurveyreg = &auto_dosurveyreg., /* yes or no to doing PROC SURVEYREG */
            wide = &auto_wide.,               /* yes or no to wide format table */
            estresult = betasecurr            /* data set for outputting estimates from current iteration */
        );

        /**********************************************************************
          Add results to REPORT
        **********************************************************************/
        data reportcurr;
            set betasecurr;
            keep v1;
            rename v1=iter&totiter.;
            id=_N_;
            if id<10 then delete;
            drop id;
        run;

        data reportcurr;
            set reportcurr;
            id=_n_;
            output;
        run;

        data report;
            merge report reportcurr;
            by id;
        run;

        data corecurr;
            set betasecurr;
            keep v1;
            rename v1=itercurr;
            id=_N_;
            if id~=14 then delete;   /* only keep RHS variable of interest */
            /* for now it assumes we check convergence of coefficient of interest (here only x) */
            drop id;
        run;

        /* CORECURR stores the main information of the current iteration, same format as COREPREV */
        data corecurr;
            set corecurr;
            id = _n_;
            output;
        run;
        /* proc print data=corecurr;  for test only */

        /******************************************************
          Merge Prev and Curr to check for convergence
        ******************************************************/
        %let diff = 0;

        data check;
            merge coreprev corecurr;
            if iterprev~=0 then pctdiff = abs(iterprev-itercurr)/abs(iterprev);
            by id;
        run;

        data _null_;
            set check end=eof;
            retain maxpctdiff 0;
            if pctdiff gt maxpctdiff then maxpctdiff=pctdiff;
            if eof then call symputx('diff',maxpctdiff);
            /* Accommodate multiple attributes (e.g., RMSE, X and Y) */
        run;

        %if %sysevalf(&diff < &tol) or &totiter.=&maxiter %then %do;
            /* Two criteria for stopping increasing number of iterations */
            data niter_temp;
                label = "number of iterations";
                v1 = "&totiter.";
                output;
            run;

            /* &betasefinal is the dataset of final results, plus the number of iterations where it stops */
            data &betasefinal;
                set betasecurr niter_temp;
            run;

            proc datasets noprint;
                delete coreprev corecurr betasecurr niter_temp check;
            run;
            quit;

            proc print data=report;
                var label iter1-iter&totiter.;
                title "Iteration and Convergence Process";
            run;

            %return;   /* terminate macro */
        %end;
        %else %do;
            data corecurr;
                set corecurr;
                rename itercurr=iterprev;
            run;

            /* Replace the name of dataset and iterate */
            proc datasets noprint;
                delete coreprev betasecurr check;
                change corecurr=coreprev;
            run;
            quit;
        %end;
    %end;
%mend;

/*****************
 ** Sample call **
 *****************
 Below are sample calls of the two main macros.
 The first main macro, TSLSCLUS_iterFE, can be called directly if only one iteration is
 desired, such as if there is only one high-dimensional fixed effect.
 Otherwise, call the second main macro, auto_iter, only; it has TSLSCLUS_iterFE embedded in it.

 The calls are kept inside this block comment so that they do not execute when the file
 is submitted. Parameter meanings are documented in the macro definitions above.
 To use one, copy it out of the comment.

%auto_iter(
    indsn            = in_data,
    tol              = 0.0001,
    maxiter          = 10,
    betasefinal      = out_data,
    fevarcount       = 3,
    auto_tempdir     = junk,
    auto_depvar      = y,
    auto_endog       = x,
    auto_inst        = z,
    auto_exog        = ,
    auto_fe          = i c t,
    auto_FE_iter     = 1,
    auto_cluster     = c,
    auto_othervar    = ,
    auto_regtype     = TSLS,
    auto_showmeans   = no,
    auto_showrf      = no,
    auto_showols     = no,
    auto_dosurveyreg = no,
    auto_wide        = no
);

%TSLSCLUS_iterFE(
    runtitle    = "TSLS: one fixed effect",
    indata      = in_data,
    depvar      = y,
    endog       = x,
    inst        = z,
    exog        = ,
    fe          = i,
    FE_iter     = 1,
    cluster     = c,
    othervar    = ,
    tempdir     = junk,
    regtype     = TSLS,
    showmeans   = no,
    showrf      = no,
    showols     = no,
    dosurveyreg = no,
    wide        = no,
    estresult   = out_data
);
*****************/