/****************************************************************************
 *                                                                           *
 * SAS Macro Distrib_Bivariate estimates the bivariate distribution of true  *
 * usual intake of two foods/nutrients using Monte Carlo simulation.         *
 * The first food/nutrient can be episodically consumed or consumed every    *
 * day, while the second food/nutrient is assumed to be consumed every day.  *
 *                                                                           *
 * Macro Distrib_Bivariate reads data sets parms_b and pred_x_b output by    *
 * SAS macro NLMixed_Bivariate, and uses Monte Carlo simulation of the       *
 * random effects to generate the distribution of true usual intake.         *
 *                                                                           *
 * The method for two foods/nutrients that are consumed every day is         *
 * described in:                                                             *
 *    Freedman LS, et al. The population distribution of ratios of usual     *
 *    intake of dietary components that are consume every day can be         *
 *    estimated from repeated 24-hour recalls. J Nutr. 2010;140:111-116.     *
 *                                                                           *
 *****************************************************************************
 *                                                                           *
 * Macro Parameters:                                                         *
 *                                                                           *
 *    Required Parameters:                                                   *
 *       param      = name of SAS data set containing the estimated model    *
 *                    parameters from macro NLMixed_Bivariate.               *
 *       predicted  = name of SAS data set containing the predicted values   *
 *                    calculated by macro NLMixed_Bivariate.                 *
 *                    typically, each subject has only one predicted value,  *
 *                    so has only one observation in the data set.           *
 *                    some models, however, allow a subject to have          *
 *                    different predicted values on different days (e.g.,    *
 *                    subjects may eat differently on weekends than on       *
 *                    weekdays). for such models, each subject should have   *
 *                    one observation for each unique predicted value.       *
 *                    macro Distrib_Univariate will then calculate true      *
 *                    usual intake as a weighted average of usual intake on  *
 *                    the different days (see day_wht, below).               *
 *       subject    = name of the variable that uniquely identifies each     *
 *                    subject in the predicted data set (i.e., ID variable). *
 *       modeltype  = model for first food/nutrient:                         *
 *                       to fit the two-part (epsisodic) model, specify      *
 *                          modeltype = TWOPART                              *
 *                       to fit the one-part (every day) model, specify      *
 *                          modeltype = ONEPART                              *
 *                                                                           *
 *    Optional Parameters:                                                   *
 *       link       = link function for the probability part of the two-     *
 *                    part model. to fit a logistic model, specify           *
 *                       link = logit                                        *
 *                    to fit a probit model, specify                         *
 *                       link = probit                                       *
 *                    by default, link = logit.                              *
 *                    if modeltype = ONEPART, then link is ignored.          *
 *       nsim_mc    = number of pseudo-individuals to simulate for each      *
 *                    real individual in the data set. if the data set has   *
 *                    n subjects, then the monte carlo distribution will     *
 *                    have nsim_mc * n pesudo-individuals.                   *
 *                    by default, nsim_mc = 1.                               *
 *       day_wgt    = name of the "day weight" variable in the predicted     *
 *                    data set. if the model allows subjects to have         *
 *                    different predicted values on different days (see      *
 *                    predicted, above), then macro distrib will calculate   *
 *                    each subject's true usual intake as a weighted average *
 *                    of usual intake on the different days (using the       *
 *                    weights in the "day weight" variable) . for example,   *
 *                    if each subject has one predicted value for weekends   *
 *                    (Friday-Sunday) and one predicted value for weekdays   *
 *                    (Monday-Thursday), then the day weight variable should *
 *                    equal 3/7 for the weekend predicted value and 4/7 for  *
 *                    the weekday predicted value.                           *
 *                    by default, if subjects have multiple predicted        *
 *                    values, then true usual intake is calculated as an     *
 *                    unweighted average of usual intake on different days.  *
 *       min_a1     = minimum true usual intake for a one-part model, or     *
 *                    mininum true usual amount on consumption day for a     *
 *                    two-part model, for the first food/nutrient.            *
 *                    the monte carlo method generates amount consumed on    *
 *                    a transformed scale, then back-transforms it to the    *
 *                    original scale. occasionally, the generated amount is  *
 *                    too small to be back-transformed. when this happens,   *
 *                    amount on the original scale is set to min_a1.         *
 *                    by default, min_a1 = 0.                                *
 *       min_a2     = minimum true usual intake for the second food/nutrient.*
 *                    by default, min_a2 = 0.                                *
 *       backtran   = 1 to use a numerical integration method to integrate   *
 *                    back-transformed reported intake over the distribution *
 *                    of within-person error.                                *
 *                  = 2 to use a Taylor linearization approximation to the   *
 *                    integral (not recommended).                            *
 *                  = 3 to back-transform without integrating over the       *
 *                    distribution of within-person error. (not recommended).*
 *                    by default, backtran = 1.                              *
 *       print      = Y to print summary of the monte carlo distribution.    *
 *                  = N to supress printing summary of the distribution.     *
 *                    by default, print = Y.                                 *
 *       ntitle     = number of titles defined by the user.                  *
 *                    by default, ntitle = 2.                                *
 *                                                                           *
 *****************************************************************************
 *                                                                           *
 * Output Data Set:                                                          *
 *                                                                           *
 *   _mcsim = data set containing the monte carlo distribution of true usual *
 *            intake of the food/nutrient.  if the the predicted data set    *
 *            has n subjects, _mcsim will have nsim_mc * n pseudo-subjects.  *
 *            _mcsim contains all the numeric variables in the predicted     *
 *            data set, plus the following variables:                        *
 *                                                                           *
 *               t1    = true usual intake of first food/nutrient.           *
 *               t2    = true usual intake of second food/nutrient.          *
 *               a1    = true usual amount on consumption day for first      *
 *                       food/nutrient.                                      *
 *               bc_a1 = Box-Cox transformed true usual amount a1.           *
 *               bc_t2 = Box-Cox transformed true usual intake t2.           *
 *                                                                           *
 *            if fitting the two-part model, then _mcsim also contains       *
 *            the following variables:                                       *
 *                                                                           *
 *               p1             = true probability to consume.               *
 *               linear_pred_p1 = transformed true probability to consume:   *
 *                                 = log(p1/(1-p1))  for logistic model      *
 *                                 = probit(p1)      for probit model        *
 *                                                                           *
 ****************************************************************************/

%macro Distrib_Bivariate (param      = ,
                          predicted  = ,
                          subject    = ,
                          modeltype  = ,
                          link       = LOGIT,
                          nsim_mc    = 1,
                          day_wgt    = ,
                          min_a1     = 0,
                          min_a2     = 0,
                          backtran   = 1,
                          print      = Y,
                          ntitle     = 2);

%global seed;  /* seed for random number generator. */

%let print     = %upcase(%substr(&print,1,1));
%let modeltype = %upcase(&modeltype);
%let link      = %upcase(&link);

%if (&backtran ^= 2 & &backtran ^= 3) %then %let backtran = 1;
%if (&link      ^= PROBIT)  %then %let link = LOGIT;
%if (&modeltype ^= ONEPART) %then %let modeltype = TWOPART;

 /* get predicted values and parameter estimates. */

data _predicted;
  set &predicted;
  run;

proc sort data=_predicted;  by &subject;

data _predicted;
  set _predicted end=eof;
    by &subject;
  if (first.&subject) then _id + 1;

  retain multiobs 0;
  if (first.&subject & not last.&subject) then multiobs = 1;
  if (eof) then call symput("multiobs",trim(left(put(multiobs, 1.))));
  run;

 /* determine number and order of random effects u. */

%let dim_u   = 0;
%let index_u = ;
%let name_u  = ;

%if (&modeltype ^= ONEPART) %then %do;
  %let dim_u   = %eval(&dim_u + 1);
  %let index_u = &index_u 1;
  %let name_u  = &name_u u1;
  %end;

%let dim_u   = %eval(&dim_u + 1);
%let index_u = &index_u 2;
%let name_u  = &name_u u2;

%let dim_u   = %eval(&dim_u + 1);
%let index_u = &index_u 3;
%let name_u  = &name_u u3;

 /* generate monte carlo random effects u. */

%put initial seed = &seed;

proc iml;
  seed = &seed;

  use _predicted;
    read all var _num_ into pred [colname=varnames];
    read all var{_id} into id;
    close _predicted;

  use &param;
    read all var{%do i = 1 %to &dim_u;
                   %let ii = %scan(&index_u,&i,%str( ));
                   %do j = 1 %to %eval(&i-1);
                     %let jj = %scan(&index_u,&j,%str( ));
                     COV_U&jj.U&ii
                     %end;
                   VAR_U&ii
                   %end;
                } into var_u;
    close &param;

  var_u     = sqrsym(var_u);
  u_zero    = (vecdiag(var_u) = 0);
  u_nonzero = (vecdiag(var_u) > 0);
  var_u     = var_u # (u_nonzero * u_nonzero`);
  var_u     = var_u + diag(u_zero);

  root_u = root(var_u);
  root_u = root_u # (u_nonzero * u_nonzero`);

  n    = max(id);
  n_u  = nrow(var_u);
  nsim = &nsim_mc;

 /* create sas data set of random effects. */

  names = {"mc_id" "_sim"} || varnames || {%do i = 1 %to &dim_u;
                   %let ii = %scan(&index_u,&i,%str( ));
                   "u&ii"
                   %end;};

  rec = repeat(., 1, ncol(pred) + n_u + 2);
  create _mcsim from rec [colname=names];

  offset = 0;

  do i = 1 to n;
    u = repeat(.,nsim,n_u);
    call rannor (seed, u);
    u = u * root_u;

    predi = pred[loc(id = i),];
    npredi = nrow(predi);

    sim = (1:nsim)`;

    u     = u   @ repeat(1,npredi,1);
    sim   = sim @ repeat(1,npredi,1);
    mc_id = offset + sim;

    rec = mc_id || sim || repeat(predi,nsim,1) || u;
    append from rec;

    offset = offset + nsim;
    end;

  close _mcsim;

  call symput("seed", trim(left(char(seed))));
  quit;

%put final seed = &seed;

 /* generate monte carlo distribution of true usual intake. */
 /* add random effects to predicted values to generate true usual intake. */

data _mcsim (drop=a1_lambda a2_lambda var_e2 var_e3 temp
             %if (&backtran = 2) %then ginv_a1 ginv_a2 d2ginv_a1 d2ginv_a2;
             %if (&backtran = 1) %then c1-c9 w1-w9 ai sumw; );
  if (_n_ = 1) then set &param(keep=a1_lambda a2_lambda var_e2 var_e3);
  set _mcsim;
    by mc_id;

 /* generate p = probability to consume (probit or logit link). */

  %if (&modeltype = ONEPART) %then %do;
    linear_pred_p1 = .;
    p1 = 1;
    %end;
  %else %do;
    linear_pred_p1 = pred_x_p1 + u1;
    %if (&link = PROBIT) %then p1 = probnorm(linear_pred_p1) %str(;);
    %else p1 = 1 / (1 + exp(-linear_pred_p1)) %str(;);
    %end;

 /* generate A = amount on consumption day. */

  bc_a1 = pred_x_a1 + u2;
  bc_a2 = pred_x_a2 + u3;

 /* back-transform amount to original scale. */

 /* integrate back-transformation over distr. of within-person error (exact for log transformation). */

  %if (&backtran ^= 3) %then %do;

    if (bc_a1 > .Z & a1_lambda = 0) then do;
      a1 = exp(bc_a1 + var_e2 / 2);
      end;
 
    if (bc_a2 > .Z & a2_lambda = 0) then do;
      a2 = exp(bc_a2 + var_e3 / 2);
      end;

    %end;   

 /* integrate back-transformation over distr. of within-person error (gauss-hermite quadrature). */

  %if (&backtran = 1) %then %do;

    c1 = -2.1;  w1 = 0.063345;
    c2 = -1.3;  w2 = 0.080255;
    c3 = -0.8;  w3 = 0.070458;
    c4 = -0.5;  w4 = 0.159698;
    c5 =  0.0;  w5 = 0.252489;
    c6 =  0.5;  w6 = 0.159698;
    c7 =  0.8;  w7 = 0.070458;
    c8 =  1.3;  w8 = 0.080255;
    c9 =  2.1;  w9 = 0.063345;

    array c (*) c1-c9;
    array w (*) w1-w9;

    if (bc_a1 > .Z & a1_lambda ^= 0) then do;
      std_e2 = sqrt(var_e2);
      a1     = 0;
      sumw   = 0;

      do i = 1 to dim(c);
        bc_ai = bc_a1 + std_e2 * c(i);
        temp = a1_lambda * bc_ai + 1;
        if (temp > 0) then ai = temp**(1/a1_lambda);
        else ai = 0;
        ai = max(ai, &min_a1);
        a1 = a1 + w(i) * ai;
        sumw = sumw + w(i);
        end;

      a1 = a1 / sumw;
      end;

    if (bc_a2 > .Z & a2_lambda ^= 0) then do;
      std_e3 = sqrt(var_e3);
      a2     = 0;
      sumw   = 0;

      do i = 1 to dim(c);
        bc_ai = bc_a2 + std_e3 * c(i);
        temp = a2_lambda * bc_ai + 1;
        if (temp > 0) then ai = temp**(1/a2_lambda);
        else ai = 0;
        ai = max(ai, &min_a2);
        a2 = a2 + w(i) * ai;
        sumw = sumw + w(i);
        end;

      a2 = a2 / sumw;
      end;

    %end;   /* %if (&backtran = 1) %then %do */

 /* integrate back-transformation over distr. of within-person error (taylor linearization). */

  %if (&backtran = 2) %then %do;

    if (bc_a1 > .Z & a1_lambda ^= 0) then do;
      temp = a1_lambda * bc_a1 + 1;
      if (temp > 0) then do;
        ginv_a1   = temp**(1/a1_lambda);
        d2ginv_a1 = (1 - a1_lambda) * temp**(1/a1_lambda - 2);
        a1 = ginv_a1 + d2ginv_a1 * var_e2 / 2;
        end;
      else a1 = 0;
      end;

    if (bc_a2 > .Z & a2_lambda ^= 0) then do;
      temp = a2_lambda * bc_a2 + 1;
      if (temp > 0) then do;
        ginv_a2   = temp**(1/a2_lambda);
        d2ginv_a2 = (1 - a2_lambda) * temp**(1/a2_lambda - 2);
        a2 = ginv_a2 + d2ginv_a2 * var_e3 / 2;
        end;
      else a2 = 0;
      end;

    %end;   /* %if (&backtran = 2) %then %do */

 /* don't integrate back-transformation over distr. of within-person error. */

  %if (&backtran = 3) %then %do;

      if (bc_a1 > .Z) then do;
        if (a1_lambda = 0) then a1 = exp(bc_a1);
        else  do;
          temp = a1_lambda * bc_a1 + 1;
          if (temp > 0) then a1 = temp**(1/a1_lambda);
          else a1 = 0;
          end;
        end;

      if (bc_a2 > .Z) then do;
        if (a2_lambda = 0) then a2 = exp(bc_a2);
        else  do;
          temp = a2_lambda * bc_a2 + 1;
          if (temp > 0) then a2 = temp**(1/a2_lambda);
          else a2 = 0;
          end;
        end;

    %end;   /* %if (&backtran = 3) %then %do */

 /* ensure positive amounts. */

  if (a1 ^= . & a1_lambda ^= 0) then a1 = max(a1, &min_a1);
  if (a2 ^= . & a2_lambda ^= 0) then a2 = max(a2, &min_a2);

 /* generate t1 = p1 * a1. */

  t1 = p1 * a1;
  t2 = a2;
  bc_t2 = bc_a2;

  drop a2 bc_a2;

  label %if (&modeltype ^= ONEPART) %then pred_x_p1 = " "; pred_x_a1 = " " pred_x_a2 = " ";
  run;

 /* if multiple observations per subject, then average them. */

%if (&multiobs = 1) %then %do;

  proc means data=_mcsim noprint;
    by mc_id;
    %if (&day_wgt ^= %str()) %then weight &day_wgt%str(;);
    var t1 t2 a1 bc_a1 bc_t2 
        %if (&modeltype ^= ONEPART) %then p1 linear_pred_p1 pred_x_p1;
        pred_x_a1 pred_x_a2 &name_u;
    output out=_outmeans(keep=mc_id t1 t2 a1 bc_a1 bc_t2
               %if (&modeltype ^= ONEPART) %then p1 linear_pred_p1 pred_x_p1;
               pred_x_a1 pred_x_a2 &name_u) mean=;
    run;

  data _mcsim;
    merge _mcsim(drop=t1 t2 a1 bc_a1 bc_t2
                 %if (&modeltype ^= ONEPART) %then p1 linear_pred_p1 pred_x_p1;
                 pred_x_a1 pred_x_a2 &name_u)
          _outmeans;
      by mc_id;
    if (first.mc_id);
    run;

  %end;   /* %if (&multiobs = 1) %then %do */

%if (&print = Y) %then %do;

  title%eval(&ntitle + 1) "Monte Carlo Distribution of True Usual Intake";

  proc means data=_mcsim n mean var std stderr min max;
    var t1 t2 %if (&modeltype ^= ONEPART) %then p1; a1
        %if (&modeltype ^= ONEPART) %then linear_pred_p1;
        bc_a1 bc_t2
        %if (&modeltype ^= ONEPART) %then pred_x_p1;
        pred_x_a1 pred_x_a2 &name_u;
    run;

  proc corr data=_mcsim;
    var t1 t2 &name_u;
    run;

  proc univariate data=_mcsim plot normal nextrval=10 nextrobs=0;
    var t1 t2;
    run;

  title%eval(&ntitle + 1);

  %end;  /* %if (&print = Y) %then %do */

%mend Distrib_Bivariate;

 /*** end of macro Distrib_Bivariate ****************************************/