****************************************************************************************************
*                                           PROGRAM OVERVIEW
****************************************************************************************************
*
* PROGRAM: ms_HDPS.sas  
*
* Created (mm/dd/yyyy): 01/26/2016
* Last modified: 06/30/2019
* Version: 1.1
*
*--------------------------------------------------------------------------------------------------
* PURPOSE:
*   This macro will execute the HDPS variable selection algorithm
*   
*  Program inputs:                                                                                   
*    - 5 dimension datasets 
*  - dataset containing the list of patients, groups and outcomes
*
*  Program outputs:  
*    - dataset containing variables created/selected at the PatId level 
*                                                                                                  
*  PARAMETERS: 
*  - var_patient_id         = name of the Patid variable
*  - var_exposure           = name of exposure variable
*  - var_outcome            = name of the outcome variable
*  - top_n                  = 100 
*  - k                      = 200
*  - ranking_method         = ranking_method EXP_ASSOC (or BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING}
*  - outcome_zero_cell_corr = outcome_zero_cell_corr in {0, 1}
*  - outcome_type           = outcome_type in {DICHOTOMOUS, BINARY, COUNT, CONTINUOUS}
*  - trunc_9                = # of characters to keep from ICD-9 code
*  - trunc_10               = # of characters to keep from ICD-10 code
*  - input_cohort           = dataset containing the patid level information (level 1)
*  - input_dim1-10          = name of the dimension files (i.e., codes files from ms_cidacov)
*  - path_output            = To output .txt files
*            
*  Programming Notes:                                                                                
*                                                                           
*
*
*--------------------------------------------------------------------------------------------------
* CONTACT INFO: 
*  Sentinel Coordinating Center
*  info@sentinelsystem.org
*
*--------------------------------------------------------------------------------------------------
*  CHANGE LOG: 
*
*   Version   Date       Initials      Comment (reference external documentation when available)
*   -------   --------   --------   ---------------------------------------------------------------
*    1.1      06/30/2019    AP      Added analysisgrp to output_all_vars dataset to facilitate append
*
***************************************************************************************************;


%macro ms_HDVariableSelection(
  var_patient_id         = patId,
  var_exposure           = exposure,
  var_outcome            = outcome,
  top_n                  = 100,
  k                      = 200,
  ranking_method         = 'EXP_ASSOC',         /* ranking_method in {BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING} */
  outcome_zero_cell_corr = 1,                               /* outcome_zero_cell_corr in {0, 1} */
  outcome_type           = 'DICHOTOMOUS',       /* outcome_type in {DICHOTOMOUS, BINARY, COUNT, CONTINUOUS} */
  trunc_9 = .,                                   /* # of characters to keep from ICD-9 code */
  trunc_10 = .,                                  /* # of characters to keep from ICD-10 code */
  input_cohort = ,
  input_dim1  = ,   input_dim2  = ,
  input_dim3  = ,   input_dim4  = ,
  input_dim5  = ,   input_dim6  = ,
  input_dim7  = ,   input_dim8  = ,
  input_dim9  = ,   input_dim10 = ,
  path_output =
);

    %put NOTE: start ms_HDVariableSelection;
    %put NOTE: macro variables;
    %put var_patient_id         = &var_patient_id;
    %put var_exposure           = &var_exposure;
    %put var_outcome            = &var_outcome;
    %put top_n                  = &top_n;
    %put k                      = &k;
    %put ranking_method         = &ranking_method;
    %put outcome_zero_cell_corr = &outcome_zero_cell_corr;
    %put outcome_type           = &outcome_type;
    %put input_cohort           = &input_cohort;
    %put input_dim1             = &input_dim1;
    %put path_output            = &path_output;

    * clean up - MUST delete with each run.;
    *                        datasets are appended to as each dimension is processed;
    proc datasets library = work nowarn nolist;
        delete allVars allCodes allOccurrenceTypes allLinks allCodesHistogram allCodeMaps allDimNames;
    quit;

    * cumulative data sets need to be created before using %ms_appendfiles;
    data allVars; stop; run;
    data allCodes; stop; run;
    data allOccurrenceTypes; stop; run;
    data allLinks; stop; run;
    data allCodesHistogram; stop; run;
    data allCodeMaps; stop; run;
    data allDimNames; stop; run;

    /** 2) Read Patients **********************************************************************/
    data patients;
        set &input_cohort.;
        if &var_exposure. ~= 0 then exposed = 'T';
        else exposed = 'F';

        * if outcome does not exist then outcome = 0;
        outcome = coalesce(&var_outcome., 0);
        if outcome ~= 0 then outcomeDichotomous = 'T';
        else outcomeDichotomous = 'F';

        outcomeCount = int(outcome);
        if outcomeCount = . then outcomeCount = 0;

        outcomeContinuous = outcome;
        if outcomeContinuous = . then outcomeContinuous = 0;

        * if futime column doesnt exist then futime = .;
        futime = coalesce(futime, .);
        if futime ~= . then followUpTime = futime;
        else followUpTime = 1;

        keep &var_patient_id. exposed outcomeDichotomous outcomeCount outcomeContinuous followUpTime;
        rename &var_patient_id. = patId;
    run;

    %let totalPatients = &SYSNOBS;
    %put totalPatients = &totalPatients.;

    * set count variables;
    proc means noprint data = patients;
        output out = patientTotals
            sum(followUpTime) = ptTotal
            sum(outcomeContinuous) = sumOfOutcomes
            sum(outcomeCount) = numEvents;
    run;

    proc sql noprint;
    create table patientTotals2 as
        select sum(followUpTime) as ptExposed,
               count(*) as nExposed
        from patients
        where exposed = 'T';
    quit;

    proc sql noprint;
    create table patientTotals3 as
        select coalesce(sum(followUpTime), 0) as nOutcome
        from patients
        where outcomeDichotomous = 'T';
    quit;

    data patientTotals;
        set patientTotals;
        set patientTotals2;
        set patientTotals3;
        drop _TYPE_ _FREQ_;
    run;

    /** 3) Read Dimensions ********************************************************************/
    /** 3) a) Build Code Database *************************************************************/

    %macro readDimension(dimName, dimDataSet, dimCodeVarName);
        %put "readDimension(&dimName, &dimDataSet, &dimCodeVarName)";
        * build a mapping: codeNum -> codeId;
        * maintain original order;

        data &dimName.Copy;
            set &dimDataSet.;
            n = _N_;
        %if %sysfunc(prxmatch(m/icddx09/i,&dimDataSet)) and %str("&TRUNC_9.") ^= %str(".") %then %do;
            &dimCodeVarName. = substr(&dimCodeVarName.,1,&TRUNC_9.);
        %end;
        %if %sysfunc(prxmatch(m/icddx10/i,&dimDataSet)) and %str("&TRUNC_10.") ^= %str(".") %then %do;
            &dimCodeVarName. = substr(&dimCodeVarName.,1,&TRUNC_10.);
        %end;
            keep &var_patient_id. &dimCodeVarName. n;
            rename &var_patient_id. = patId  &dimCodeVarName. = code;
        run;

        proc sql noprint;
        create table &dimName.Codes as
            select code, min(n) as minN
            from &dimName.Copy
            group by code
            order by minN;
        quit;

        proc sql noprint;
            select count(*)
            into :numDistinctCodes
            from &dimName.Codes;
        quit;

        data _NULL_;
            put "&dimDataSet. numDistinctCodes=&numDistinctCodes.";
            digits = length(strip("&numDistinctCodes."));
            digits = max(3, digits);
            call symput('maxCodeDigits', put(digits, best.));           
        run;

        data &dimName.Codes;
            set &dimName.Codes;
            format codeId1 $&maxCodeDigits..;
            if _N_ <= 1000 then codeId1 = put((_N_ - 1), z3.);
            else codeId1 = strip(put(_N_ - 1, best.));
        run;

        * filter out rows that have a patId not in patients;
        proc sql noprint;
        create table &dimName.Filtered as
            select t2.* from
            patients t1,
            &dimName.Copy t2
            where t1.patId = t2.patId;
        quit;

        *generate codeIds for dimName;
        *add codeId1 to dimName;
        proc sql noprint;
        create table &dimName.SQL as
            select d.patId,
                   d.code,
                   c.codeId1 
            from &dimName.Filtered d,
                 &dimName.Codes c
            where d.code = c.code;
        quit;

        data &dimName.Codes;
            set &dimName.SQL;
            codeId = "&dimName.V" || strip(codeId1);
            codeType = 'STANDARD';
            considerForPS = 'false';
        run;

        * generate links;
        proc sql noprint;
        create table &dimName.Links as
        select %if %eval(&scdmver1. > 7) %then %do; compress(codeId || '|' || put(patId,8.)) as linkId, %end;
		       %else %do; strip(codeId) || '|' || patId as linkId, %end;
               patId, 
               codeId, 
               count(patId) as numOccurrences
        from &dimName.Codes
        group by patId, codeId;
        quit;

        *create histogram of numOccurrences for each code;
        data &dimName.LinksHistogramT;
            set &dimName.Links;
            h0 = 0;
            h1 = 0;
            h2 = 0;
            h3 = 0;
            h4 = 0;
            h5 = 0;
            h6 = 0;
            h7 = 0;
            h8 = 0;
            h9 = 0;
            h10 = 0;
        run;

        data &dimName.LinksHistogramT;
            set &dimName.LinksHistogramT;
            if numOccurrences = 0 then h0 = 1;
            if numOccurrences = 1 then h1 = 1;
            if numOccurrences = 2 then h2 = 1;
            if numOccurrences = 3 then h3 = 1;
            if numOccurrences = 4 then h4 = 1;
            if numOccurrences = 5 then h5 = 1;
            if numOccurrences = 6 then h6 = 1;
            if numOccurrences = 7 then h7 = 1;
            if numOccurrences = 8 then h8 = 1;
            if numOccurrences = 9 then h9 = 1;
            if numOccurrences > 9 then h10 = 1;
        run;

        proc sql noprint;
        create table &dimName.LinksHistogram as
        select distinct(codeId),
               sum(h0) as h0,
               sum(h1) as h1,
               sum(h2) as h2,
               sum(h3) as h3,
               sum(h4) as h4,
               sum(h5) as h5,
               sum(h6) as h6,
               sum(h7) as h7,
               sum(h8) as h8,
               sum(h9) as h9,
               sum(h10) as h10
        from &dimName.LinksHistogramT
        group by codeId;
        quit;

        * calculate numPatientCodes and numUniquePatientCodes;
        proc sql noprint;
        create table &dimName.Counts as
        select distinct(d.patId),
               count(d.code) as numPatientCodes, 
               count(distinct(d.code)) as numUniquePatientCodes
        from patients, &dimName.Filtered d 
        where d.patId = patients.patid
        group by d.patId;
        quit;

        *calculate numUniqueOccurrences;
        proc sql noprint;
        create table &dimName.CodeMap as
        select distinct(code) as code, 
               codeId, 
               codeType, 
               considerForPS, 
               count(distinct(patId)) as numUniqueOccurrences 
        from &dimName.Codes 
        group by code;
        quit;

    /** 3) b) Filter Codes for Prevalence *****************************************************/

        data &dimName.CodeMap;
            set &dimName.CodeMap;
            prevalence = numUniqueOccurrences / &totalPatients.;

            if prevalence > 0.5 then prevalence = 1.0 - prevalence;
        run;

        proc sort data = &dimName.CodeMap;
            key prevalence / descending;
        run;

        * set considerFopPs = 'true' for top_n codes and remove others;
        data &dimName.CodeMapTopN;
            set &dimName.CodeMap;
            considerForPs = 'true';
            if _N_ <= &top_n.;
        run;

    /** 3) c) i. create service intensity variables *******************************************/
        %macro createServiceIntensityVariables(Unique);
            data &dimName.Frequency&Unique.;
                set &dimName.Counts;
                keep patId num&Unique.PatientCodes;
            run;

            proc sort data=&dimName.Frequency&Unique.;
                key num&Unique.PatientCodes;
            run;

            %let frequencyTotal = &SYSNOBS;

            * quartile indices;
            %let q1i = %sysfunc(round(&frequencyTotal. * 0.25));
            %let q2i = %sysfunc(round(&frequencyTotal. * 0.5));
            %let q3i = %sysfunc(round(&frequencyTotal. * 0.75));
            %let q4i = &frequencyTotal.;

            %let codeIdUnique = .;
            %if &Unique. = Unique %then %do;
                %let codeIdUnique = UNIQ;
            %end;
            %if &Unique. =  %then %do;
                %let codeIdUnique = ALL;
            %end;

						* initializing if qXi. = 0;
						%let q1=0;
						%let q2=0;
						%let q3=0;
						%let q4=0;

            * keep only quartiles and associated patId;
            data &dimName.Quartiles&Unique.;
                set &dimName.Frequency&Unique.;
                if _N_ = &q1i. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q1";
                    code = codeId;
                    codeType = 'INTENSITY';
                    considerForPS = 'true';
                    call symput('q1', strip(put(num&Unique.PatientCodes, best.)));
                    output;
                end;
                if _N_ = &q2i. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q2";
                    code = codeId;
                    codeType = 'INTENSITY';
                    considerForPS = 'true';
                    call symput('q2', strip(put(num&Unique.PatientCodes, best.)));
                    output;
                end;
                if _N_ = &q3i. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q3";
                    code = codeId;
                    codeType = 'INTENSITY';
                    considerForPS = 'true';
                    call symput('q3', strip(put(num&Unique.PatientCodes, best.)));
                    output;
                end;
                if _N_ = &q4i. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q4";
                    code = codeId;
                    codeType = 'INTENSITY';
                    considerForPS = 'true';
                    call symput('q4', strip(put(num&Unique.PatientCodes, best.)));
                    output;
                end;
            run;

            * add quartiles to codeMap;
            %ms_appendfiles(&dimName.CodeMapTopN,
                    &dimName.Quartiles&Unique., 
                    &dimName.CodeMapTopN(drop=patId num&Unique.PatientCodes));

            * create links for quartiles;
            data &dimName.QuartileLinks;
                set &dimName.Frequency&Unique.;
                if num&Unique.PatientCodes <= &q1. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q1";
					%if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end;
		            %else %do; linkId = strip(codeId) || '|' || patId; %end;
                    intensityVarValue = 1;
                end; 
                else if &q1. < num&Unique.PatientCodes <= &q2. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q2";
                    %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end;
		            %else %do; linkId = strip(codeId) || '|' || patId; %end;
                    intensityVarValue = 1;
                end;
                else if &q2. < num&Unique.PatientCodes <= &q3. then do;
                    codeId = "&dimName._INT_&codeIdUnique._Q3";
                    %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end;
		            %else %do; linkId = strip(codeId) || '|' || patId; %end;
                    intensityVarValue = 1;
                end;
                else do;
                    codeId = "&dimName._INT_&codeIdUnique._Q4";
                    %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end;
		            %else %do; linkId = strip(codeId) || '|' || patId; %end;
                    intensityVarValue = 1;
                end;
            run;

            *add quartile links to dimNameLinks;
            data &dimName.Links;
                format linkId $50. codeId $20.;
                set &dimName.Links &dimName.QuartileLinks;
            run;

            * create service intensity vars for quartiles;
            proc sort data = &dimName.Frequency&Unique.;
                by patId;
            run;

            proc sort data = patients;
                by patId;
            run;

            data &dimName.ServiceIntensityVars&Unique.T;
                merge &dimName.Frequency&Unique. patients;
                by patId;
                c1MeanOutcome = outcomeContinuous;
                c1NumEvents = outcomeCount;
                pt_c1 = followUpTime;
                
                e1c1 = 0;
                e1Missing = 0;
                e0c1 = 0;
                e0Missing = 0;
                d1c1 = 0;
                d1Missing = 0;
                d0c1 = 0;
                d0Missing = 0;

                if exposed = 'T' then do;
                    e1c1 = 1;
                end; else do;
                    e0c1 = 1;
                end;

                if outcomeDichotomous = 'T' then do;
                    d1c1 = 1;
                end; else do;
                    d0c1 = 1;
                end;
                keep patId c1MeanOutcome c1NumEvents pt_c1 e1c1 e1Missing 
                         e0c1 e0Missing d1c1 d1Missing d0c1 d0Missing;
            run;

            proc sql noprint;
            create table &dimName.ServiceIntensityVars&Unique.T2 as
                select codeId,
                       sum(c1MeanOutcome) as c1MeanOutcome,
                       sum(c1NumEvents) as c1NumEvents,
                       sum(pt_c1) as pt_c1,
                       sum(e1c1) as e1c1,
                       sum(e1Missing) as e1Missing,
                       sum(e0c1) as e0c1,
                       sum(e0Missing) as e0Missing,
                       sum(d1c1) as d1c1,
                       sum(d1Missing) as d1Missing,
                       sum(d0c1) as d0c1,
                       sum(d0Missing) as d0Missing
                from &dimName.ServiceIntensityVars&Unique.T t1,
                     &dimName.QuartileLinks t2
                where t1.patId = t2.patId
                group by codeId;
            quit;

            *add quartile codes that have no associated patients;
            data &dimName.Quartiles&Unique.T;
                set &dimName.Quartiles&Unique.;
                keep codeId;
            run;

            proc sql noprint;
            create table &dimName.ServiceIntensityVars&Unique. as
                select q.codeId,
                       coalesce(s.c1MeanOutcome, 0) as c1MeanOutcome,
                       coalesce(s.c1NumEvents, 0) as c1NumEvents,
                       coalesce(s.pt_c1, 0) as pt_c1,
                       coalesce(s.e1c1, 0) as e1c1,
                       coalesce(s.e1Missing, 0) as e1Missing,
                       coalesce(s.e0c1, 0) as e0c1,
                       coalesce(s.e0Missing, 0) as e0Missing,
                       coalesce(s.d1c1, 0) as d1c1,
                       coalesce(s.d1Missing, 0) as d1Missing,
                       coalesce(s.d0c1, 0) as d0c1,
                       coalesce(s.d0Missing, 0) as d0Missing
                from &dimName.Quartiles&Unique.T q
                full outer join &dimName.ServiceIntensityVars&Unique.T2 s
                on q.codeId = s.codeId;
            quit;

        %mend createServiceIntensityVariables;

        /** 3) c) i.  create service intensity variables ******************************************/
        %createServiceIntensityVariables(Unique);
        /** 3) c) ii. create service intensity variables ******************************************/
        %createServiceIntensityVariables();

        /** 3) c. Calculate Medians and Bias ... processing codes ... *****************************/

        * calculate cumulative histogram;
        data &dimName.LinksHistogram;
            set &dimName.LinksHistogram;
            c0  = h0;
            c1  = sum(c0, h1);
            c2  = sum(c1, h2);
            c3  = sum(c2, h3);
            c4  = sum(c3, h4);
            c5  = sum(c4, h5);
            c6  = sum(c5, h6);
            c7  = sum(c6, h7);
            c8  = sum(c7, h8);
            c9  = sum(c8, h9);
            c10 = sum(c9, h10);
        run;

        /** 3) c. iii. Calculate Median - median, q3 **********************************************/
        %macro findPoint(cumulative, search, start);
            point = .;
            if &search. <= &cumulative.[&start.] then do;
                point = &start.;
            end;
            else do;
                do i = 2 to 10;
                    if &search. > &cumulative.[i - 1] and &search. <= &cumulative.[i] then leave;
                end;
                point = (i - 1);
            end;
        %mend findPoint;

        %macro calcPercentile(percentile, cumulative, total, storeIn);
            center = &percentile. * &total.;
            fCenter = floor(center);
            isEven = (fCenter = center);
            
            if isEven then do;
                *find the two center points;
                %findPoint(&cumulative., fCenter, 1);
                v1 = point;
                %findPoint(&cumulative., fCenter + 1, v1);
                v2 = point;
                *average them;
                &storeIn. = (v1 + v2) / 2;
            end;
            else do;
                *only one center point;
                cCenter = ceil(center);
                %findPoint(&cumulative., cCenter, 1);
                &storeIn. = point;
            end;
        %mend calcPercentile;

        data &dimName.LinksHistogram;
            set &dimName.LinksHistogram;
            array cHist c0 - c10;
            %calcPercentile(0.5, cHist, c10, median);
            %calcPercentile(0.75, cHist, c10, q3);
            keep codeId median q3;
        run;

        /** 3) c. iv. Mark Occurrence Type ********************************************************/

        proc sql noprint;
        create table &dimName.OccurrenceTypes as 
            select l.linkId,
                   l.codeId,
                   l.patId,
                   l.numOccurrences,
                   c.median,
                   c.q3
            from &dimName.Links l,
                 &dimName.LinksHistogram c
            where l.codeId = c.codeId;
        quit;

        data &dimName.OccurrenceTypes;
            set &dimName.OccurrenceTypes;

            setanyVar = (numOccurrences >= 1); 
            setoftenVar = (numOccurrences >= median);
            setFrequentVar = (numOccurrences >= q3);

            oftenVarMissing = (median = 1);
            frequentVarMissing = (q3 = median);

            anyVarValue = 0;
            oftenVarValue = 0;
            frequentVarValue = 0;

            if setanyVar then anyVarValue = 1;
            if setoftenVar then do;
                if ~oftenVarMissing then oftenVarValue = 1;
                else oftenVarValue = .;
            end;
            if setFrequentVar then do;
                if ~frequentVarMissing then frequentVarValue = 1;
                else frequentVarValue = .;
            end;

            drop setanyVar setoftenVar setFrequentVar oftenVarMissing frequentVarMissing;
        run;

        proc sql noprint;
        create table &dimName.CodeVars as 
            select o.codeId,
                   o.anyVarValue,
                   o.oftenVarValue,
                   o.frequentVarValue, 
                   p.patId,
                   p.outcomeContinuous,
                   p.outcomeCount,
                   p.followUpTime,
                   p.exposed,
                   p.outcomeDichotomous
            from &dimName.OccurrenceTypes o,
                 patients p
            where o.patId = p.patId;
        quit;

        data &dimName.CodeVars;
            set &dimName.CodeVars;
            
            c1MeanOutcome = 0;
            c1NumEvents = 0;
            pt_c1 = 0;
            e1Missing = 0;
            e1c1 = 0;
            e0Missing = 0;
            e0c1 = 0;
            d1Missing = 0;
            d1c1 = 0;
            d0Missing = 0;
            d0c1 = 0;
        run;

        %macro updateVarCounts(type, value);
            data &dimName.CodeVars&Type.T;
                set &dimName.CodeVars;

                if &value. ~= 0 then do;
                    c1MeanOutcome = c1MeanOutcome + outcomeContinuous;
                    c1NumEvents = c1NumEvents + outcomeCount;

                    if &value. = 1 then pt_c1 = pt_c1 + followUpTime;

                    if exposed = 'T' then do;
                        if &value. = . then e1Missing = e1Missing + 1;
                        else if &value. = 1 then e1c1 = e1c1 + 1;
                    end;
                    else do;
                        if &value. = . then e0Missing = e0Missing + 1;
                        else if &value. = 1 then e0c1 = e0c1 + 1;
                    end;

                    if outcomeDichotomous = 'T' then do;
                        if &value. = . then d1Missing = d1Missing + 1;
                        else if &value. = 1 then d1c1 = d1c1 + 1;
                    end;
                    else do;
                        if &value. = . then d0Missing = d0Missing + 1;
                        else if &value. = 1 then d0c1 = d0c1 + 1;
                    end;
                end;
            run;

            proc sql noprint;
            create table &dimName.CodeVars&Type. as
                select distinct(codeId) as codeId,
                       sum(c1MeanOutcome) as c1MeanOutcome,
                       sum(c1NumEvents) as c1NumEvents,
                       sum(pt_c1) as pt_c1,
                       sum(e1c1) as e1c1,
                       sum(e1Missing) as e1Missing,
                       sum(e0c1) as e0c1,
                       sum(e0Missing) as e0Missing,
                       sum(d1c1) as d1c1,
                       sum(d1Missing) as d1Missing,
                       sum(d0c1) as d0c1,
                       sum(d0Missing) as d0Missing
                from &dimName.CodeVars&Type.T
                group by codeId;
            quit;

        %mend updateVarCounts;

        %updateVarCounts(Any, anyVarValue);
        %updateVarCounts(Often, oftenVarValue);
        %updateVarCounts(Frequent, frequentVarValue);

        /** 3) c. v. Calculate Bias ***************************************************************/
        data patientTotals;
            set patientTotals;

            nTotal = &totalPatients.;
            e1Total = nExposed;
            e0Total = nTotal - e1Total;
            d1Total = nOutcome;
            d0Total = nTotal - d1Total;
        run;

        %macro calculateBias(datasetName);
            data &datasetName.;
                set &datasetName.;
                if _N_ = 1 then set patientTotals;
                
                bias = 0;
                nMissing = e1Missing + e0Missing;
                N = nTotal - nMissing;

                e1 = e1Total - e1Missing;
                e1c0 = e1 - e1c1;

                e0 = e0Total - e0Missing;
                e0c0 = e0 - e0c1;

                d1 = d1Total - d1Missing;
                d1c0 = d1 - d1c1;

                d0 = d0Total - d0Missing;
                d0c0 = d0 - d0c1;

                c1 = e1c1 + e0c1;
                c0 = N - c1;
                
                pt = ptTotal;
                pt_e1 = ptExposed;
                pt_e0 = ptTotal - ptExposed;
                pt_c0 = ptTotal - pt_c1;
                
                numEvents = numEvents;
                c0NumEvents = numEvents - c1NumEvents;

                *so far, these are running totals.  divide to get means.;
                meanOutcome = sumOfOutcomes / nTotal;
                if c1 > 0 then c1MeanOutcome = c1MeanOutcome / c1;
                else c1MeanOutcome = -1;
                if c0 > 0 then c0MeanOutcome = (((meanOutcome * nTotal) - (c1MeanOutcome * c1)) / c0);
                else c0MeanOutcome = -1;

                if e1 > 0 then pc_e1 = e1c1 / e1;
                else pc_e1 = 0;
                
                if e0 > 0 then  pc_e0 = e0c1 / e0;
                else pc_e0 = 0;
                
                if pc_e1 > 0.5 then pc_e1 = 1.0 - pc_e1;
                if pc_e0 > 0.5 then pc_e0 = 1.0 - pc_e0;

                * TODO use an "i.n.v.a.l.i.d." value like java instead of . ?;
                rrCe = .;
                expAssocRankingVariable = .;
                rrCd = .;
                bias = .;
                biasRankingVariable = .;
                outcomeAssocRankingVariable = .;

                if pc_e0 > 0 and pc_e1 > 0 then do;
                    rrCe = pc_e1 / pc_e0;
                    expAssocRankingVariable = abs(log(rrCe));
                end;

                if &outcome_type. = 'DICHOTOMOUS' or &outcome_type. = 'BINARY' or &outcome_type. = 'COUNT' then do;
                    if &outcome_zero_cell_corr. = 1 then do;
                        rrCd = (((c1NumEvents + 0.1) / (pt_c1 + 0.1)) / ((c0NumEvents + 0.1) / (pt_c0 + 0.1)));
                    end;
                    else if ((c1NumEvents > 0) and (pt_c1 > 0) and (c0NumEvents > 0) and (pt_c0 > 0)) then do;
                        rrCd = ((c1NumEvents / pt_c1) / (c0NumEvents / pt_c0));
                    end;

                    if (rrCd > 0) then outcomeAssocRankingVariable = abs(log(rrCd));
                end;
                else if &outcome_type. = 'CONTINUOUS' then do;
                    rrCd = c1MeanOutcome - c0MeanOutcome;
                    outcomeAssocRankingVariable = abs(rrCd);
                end;

                if ((pc_e0 > 0) && (pc_e1 > 0) && (rrCd > 0)) then do;
                    biasA = pc_e1 * (rrCd - 1.0) + 1.0;
                    biasB = pc_e0 * (rrCd - 1.0) + 1.0;
                    bias = biasA / biasB;
                    biasRankingVariable = abs(log(bias));
                end;

            run;

        %mend calculateBias;

        %calculateBias(&dimName.CodeVarsany);
        %calculateBias(&dimName.CodeVarsoften);
        %calculateBias(&dimName.CodeVarsFrequent);
        %calculateBias(&dimName.ServiceIntensityVarsUnique);
        %calculateBias(&dimName.ServiceIntensityVars);

        * update cumulative data sets;
        * cannot use proc datasets append because it truncates character variables;

        data allVarsT;
            set &dimName.CodeVarsany(in = any)
                    &dimName.CodeVarsoften(in = often)
                    &dimName.CodeVarsFrequent(in = frequent)
                    &dimName.ServiceIntensityVarsUnique(in = intensity)
                    &dimName.ServiceIntensityVars(in = unique);
            format type $10.;
            if any = 1 then type = 'Any';
            if often = 1 then type = 'Often';
            if frequent = 1 then type = 'Frequent';
            if unique = 1 then type = 'ServiceInt';
            if intensity = 1 then type = 'ServiceInt';
        run;

        %ms_appendfiles(allVars,
                    allVarsT, 
                    allVars);

        %ms_appendfiles(allCodes,
                    &dimName.CodeMapTopN, 
                    allCodes);

        %ms_appendfiles(allOccurrenceTypes,
                    &dimName.OccurrenceTypes, 
                    allOccurrenceTypes);

        %ms_appendfiles(allLinks,
                    &dimName.Links, 
                    allLinks);

        %ms_appendfiles(allCodesHistogram,
                    &dimName.LinksHistogram, 
                    allCodesHistogram);

        %ms_appendfiles(allCodeMaps,
                    &dimName.CodeMap, 
                    allCodeMaps);

        data allDimNamesT;
            dimName = "&dimName.";
            dimDataSet = "&dimDataSet.";
            output;
        run;

        %ms_appendfiles(allDimNames,
                    allDimNamesT, 
                    allDimNames);

    %mend;

  *This is where all the processing is done using the dimension files that were passed on (typically 7 in QRP);
    data _NULL_;
        %do dimIndex = 1 %to 10;
            %let tDataSetName = %scan(&&input_dim&dimIndex, 1, ' ');
            %let tCodeVar = %scan(&&input_dim&dimIndex, 2, ' ');

            %if &tDataSetName ~= and %sysfunc(exist(&tDataSetName)) %then %do;
                %if &dimIndex < 10 %then %do;
                    %let dimName = D0&dimIndex.;
                %end;
                %else %do;
                    %let dimName = D&dimIndex.;
                %end;
                %readDimension(&dimName., &tDataSetName, &tCodeVar);
            %end;
            %else %if &&input_dim&dimIndex ~= %then %do;
                %put ERROR: (Sentinel) readDimension(&&input_dim&dimIndex), "&&input_dim&dimIndex" is invalid.;
            %end;
        %end;
    run;

    /** 4) Select and Output Variables: *******************************************************/
    /** 4) a) Choose Variables to Consider: ***************************************************/

    data allConsideredCodes;
        set allCodes;
        if considerForPS = 'true' then output;
    run;

    proc sql noprint;
    create table variablesToConsider as
        select v.codeId,
               c.code,
               v.e1,
               v.e0,
               v.pt_e1,
               v.pt_e0,
               v.d1,
               v.d0,
               v.c1,
               v.c0,
               v.pt_c1,
               v.pt_c0,
               v.e1c1,
               v.e1c0,
               v.e0c1,
               v.e0c0,
               v.d1c1,
               v.d1c0,
               v.d0c1,
               v.d0c0,
               v.pc_e1,
               v.pc_e0,
               v.numEvents,
               v.c1NumEvents,
               v.c0NumEvents,
               v.meanOutcome,
               v.c1MeanOutcome,
               v.c0MeanOutcome,
               v.rrCe,
               v.rrCd,
               v.bias,
               v.expAssocRankingVariable,
               v.outcomeAssocRankingVariable,
               v.biasRankingVariable,
               v.type
        from allVars v, allConsideredCodes c
        where v.codeId = c.codeId;
    quit;

    * ranking_method in {BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING};
    data variablesToConsider;
        set variablesToConsider;
        
        if &ranking_method. = 'EXP_ASSOC' then do;
            activeRankingVariable = expAssocRankingVariable;
        end;
        else if &ranking_method. = 'OUTCOME_ASSOC' then do;
            activeRankingVariable = outcomeAssocRankingVariable;
        end;
        else if &ranking_method. = 'BIAS' then do;
            activeRankingVariable = biasRankingVariable;
        end;
        else do;
            activeRankingVariable = 0;
        end;
    run;

    *keep only k variables with greatest activeRankingVariable;
    proc sort data = variablesToConsider;
        by descending activeRankingVariable code;
    run;

    data variablesToConsider;
        set variablesToConsider;
        format selectedForPs $5.;
        if _N_ <= &k. and activeRankingVariable ~= . then selectedForPs = 'true';
        else selectedForPs = 'false';       
    run;

    data selectedVariables;
        set variablesToConsider;
        if selectedForPs = 'true';
    run;

    /** 4) b) Calculate Z Bias: ***************************************************************/
    data outcomeStrengths;
        set selectedVariables;
        if expAssocRankingVariable ~= . and outcomeAssocRankingVariable ~= .;
        keep outcomeAssocRankingVariable code codeId type;
    run;

    proc sort data = outcomeStrengths;
        by outcomeAssocRankingVariable descending code;
    run;

    data outcomeStrengths;
        set outcomeStrengths;
        i = _N_;
    run;

    proc univariate data = outcomeStrengths noprint PCTLDEF=4;
        var outcomeAssocRankingVariable;
        output pctlpre=P pctlpts=50 out=outcomeStrengthsP;
    run;

    data _NULL_;
        set outcomeStrengthsP;
        if _N_ = 1 then do;
            call symput('P50', strip(put(P50, best.)));
        end;
    run;

    %macro bSearch (array=,key=,return=);
        %local l m h;
        %let l=l%substr(%sysfunc(ranuni(0)),3,7);
        %let m=m%substr(%sysfunc(ranuni(0)),3,7);
        %let h=h%substr(%sysfunc(ranuni(0)),3,7);
        drop &l &m &h &return;
        &l=lbound(&array);
        &h=hbound(&array);
        &return=.;
        do until (&h < &l);
            &m = floor((&l + &h) * 0.5);
            if          round(&key, 0.0000000001) < round(&array(&m), 0.0000000001) then &h=&m-1;
            else if round(&key, 0.0000000001) > round(&array(&m), 0.0000000001) then &l=&m+1;
            else do;
                &return=&m;
                leave;
            end;
        end;
        * no exact match found. return same value as java Arrays.binarySearch();
        *** the index of the first element greater than the key;
        if &return = . then do;
            do i = &l to &h;
                if &array(i) > &key  then do;
                    leave;
                end;
            end;
            &return = i;
        end;
    %mend;

    proc transpose data = outcomeStrengths out = outcomeStrengthsT prefix = ost;
        var outcomeAssocRankingVariable;
    run;

    data outcomeStrengthsT;
        set outcomeStrengthsT;
        drop _NAME_;
    run;

    data _NULL_;
        set outcomeStrengthsT;

        array ost _ALL_;
        %bSearch(array=ost, key=&P50., return=x);
        * one based arry to zero based array;
        call symput('zMedian', strip(put((x - 1), best.)));
    run;

    data weakOutcomeStrengths;
        set outcomeStrengths;
        if _N_ <= &zMedian.;
    run;

    proc univariate data = weakOutcomeStrengths noprint PCTLDEF = 4;
        var outcomeAssocRankingVariable;
        output pctlpre = P pctlpts = 20, 40, 60, 80 out = weakOutcomeStrengthsP;
    run;

    data _NULL_;
        set weakOutcomeStrengthsP;
        if _N_ = 1 then do;
            call symput('P20', strip(put(P20, best.)));
            call symput('P40', strip(put(P40, best.)));
            call symput('P60', strip(put(P60, best.)));
            call symput('P80', strip(put(P80, best.)));
        end;
    run;

    data _NULL_;
        set outcomeStrengthsT;

        if _N_ = 1 then do;
            array ost _ALL_;

            %bSearch(array=ost, key=&P20., return=i20);
            %bSearch(array=ost, key=&P40., return=i40);
            %bSearch(array=ost, key=&P60., return=i60);
            %bSearch(array=ost, key=&P80., return=i80);

            array startsOfQuintile{7};
            * first elements is always 0;
            startsOfQuintile{1} = 0;
            * one based arry to zero based array;
            startsOfQuintile{2} = i20 - 1;
            startsOfQuintile{3} = i40 - 1;
            startsOfQuintile{4} = i60 - 1;
            startsOfQuintile{5} = i80 - 1;
            startsOfQuintile{6} = &zMedian.;

            declare hash zBiasScores();
            zBiasScores.defineKey('i');
            zBiasScores.defineData('i', 'score');
            zBiasScores.defineDone();
            
            do quintile = 1 to 5;
                do i = startsOfQuintile{quintile} to (startsOfQuintile{quintile + 1} - 1);
                    score = 6 - quintile;
                    zBiasScores.add();
                end;
            end;

            zBiasScores.output(dataset: "zBiasScores");
        end;
    run;

    * zero based array to one based array;
    data zBiasScores;
        set zBiasScores;
        i = i + 1;
    run;

    proc sql noprint;
        create table zBiasScoresMerged as
            select z.score,
                   o.codeId,
                   o.type
            from zBiasScores z,
                 outcomeStrengths o
            where z.i = o.i;
    quit;

    proc sql noprint;
        create table variablesToConsiderScored as
        select v.*,
               coalesce(z.score, 0) as zBiasScore
        from variablesToConsider v
        full outer join zBiasScoresMerged z
        on v.codeId = z.codeId and
             v.type = z.type;
    quit;

    /** 4) c) Output Variables and Dimensions: ************************************************/
    /** output_all_vars ***********************************************************************/
    proc sql noprint;
    create table output_all_varsT as 
        select substr(codeId, 1, 3) as dimensionLU,
               code as code_id,
               strip(codeId) || type as var_name,
               selectedForPs as selected_for_ps,
               e1,
               e0,
               pt_e1,
               pt_e0,
               d1,
               d0,
               c1,
               c0,
               pt_c1,
               pt_c0,
               e1c1,
               e1c0,
               e0c1,
               e0c0,
               d1c1,
               d1c0,
               d0c1,
               d0c0,
               pc_e1,
               pc_e0,
               numEvents as num_events,
               c1NumEvents as c1_num_events,
               c0NumEvents as c0_num_events,
               meanOutcome as mean_outcome,
               c1MeanOutcome as c1_mean_outcome,
               c0MeanOutcome as c0_mean_outcome,
               rrCe as rr_ce,
               rrCd as rr_cd,
               bias,
               expAssocRankingVariable as exp_assoc_ranking_var,
               outcomeAssocRankingVariable as outcome_assoc_ranking_var,
               biasRankingVariable as bias_assoc_ranking_var,
               zBiasScore as z_bias_score
        from variablesToConsiderScored
        order by var_name;

    * lookup name of original dimension dataset;
    create table output_all_vars as
        select d.dimDataSet as dimension, 
               v.code_id,
               v.var_name,
               v.selected_for_ps,
               v.e1,
               v.e0,
               v.pt_e1,
               v.pt_e0,
               v.d1,
               v.d0,
               v.c1,
               v.c0,
               v.pt_c1,
               v.pt_c0,
               v.e1c1,
               v.e1c0,
               v.e0c1,
               v.e0c0,
               v.d1c1,
               v.d1c0,
               v.d0c1,
               v.d0c0,
               v.pc_e1,
               v.pc_e0,
               v.num_events,
               v.c1_num_events,
               v.c0_num_events,
               v.mean_outcome,
               v.c1_mean_outcome,
               v.c0_mean_outcome,
               v.rr_ce,
               v.rr_cd,
               v.bias,
               v.exp_assoc_ranking_var,
               v.outcome_assoc_ranking_var,
               v.bias_assoc_ranking_var,
               v.z_bias_score
        from output_all_varsT v,
             allDimNames d
        where d.dimName = v.dimensionLU;
    quit;

    *match java formatting for easy comparison;
    data output_all_vars;
        set output_all_vars;
        format psestimategrp $40. e1 e0 pt_e1 pt_e0 d1 d0 c1 c0 pt_c1 pt_c0 e1c1 e1c0 e0c1 e0c0 
               d1c1 d1c0 d0c1 d0c0 pc_e1 pc_e0 num_events c1_num_events c0_num_events
               mean_outcome c1_mean_outcome c0_mean_outcome rr_ce rr_cd bias 
               exp_assoc_ranking_var outcome_assoc_ranking_var bias_assoc_ranking_var 18.10;
        psestimategrp = "&psestimategrp.";
    run;

    /** output_dimension_codes ****************************************************************/
    proc sql noprint;
    create table output_dimension_codesT as
        select coalesce(h.codeId, c.codeId) as codeId,
               substr(coalesce(h.codeId, c.codeId), 2, 2) as dimension_num,
               substr(coalesce(h.codeId, c.codeId), 1, 3) as dimension,
               c.considerForPs as consider_for_selection,
               c.prevalence as prevalence,
               h.median,
               h.q3
        from allCodes c
        full outer join allCodesHistogram h
        on c.codeId = h.codeId;

    create table output_dimension_codesT2 as
        select o.dimension_num,
               o.dimension as dimensionLU,
               coalesce(a.code, o.codeId) as code_id,     /* for INTENSITY codes */
               a.numUniqueOccurrences as num_patients,
               o.consider_for_selection,
               a.prevalence,
               o.median,
               o.q3
        from output_dimension_codesT o
        full outer join allCodeMaps a
        on o.codeId = a.codeId;

    * lookup name of original dimension dataset;
    create table output_dimension_codes as
        select c.dimension_num,
               d.dimDataSet as dimension, 
               c.code_id,
               c.num_patients,
               c.consider_for_selection,
               c.prevalence,
               c.median,
               c.q3       
        from output_dimension_codesT2 c,
             allDimNames d
        where d.dimName = c.dimensionLU;
    quit;

    data output_dimension_codes;
        set output_dimension_codes;
        format prevalence median q3 18.10;
        if num_patients = . then num_patients = 0;
        if prevalence = . then prevalence = -1;
        if median = . then median = 0;
        if q3 = . then q3 = 0;
        if consider_for_selection ~= 'true' then do;
            consider_for_selection = 'false';
            median = 0;
            q3 = 0;
        end;
    run;
	
    /** generate cohorts ****************************************************************/
    *** create an adjacency matrix with selectedVariables on top and ALL patIds on left;
    proc sql noprint;
    create table allLinksWithOccurrenceTypes as
        select l.linkId,
               l.codeId,
               l.patId,
               o.anyVarValue,
               o.oftenVarValue,
               o.frequentVarValue,
               l.intensityVarValue
        from allLinks l
        full outer join allOccurrenceTypes o
        on l.linkId = o.linkId;
    quit;

    proc sql noprint;
    create table cohort as
        select trim(s.codeId) || s.type as code,
               s.type,
               l.patId,
               l.anyVarValue,
               l.oftenVarValue,
               l.frequentVarValue,
               l.intensityVarValue
        from selectedVariables s
        full outer join allLinksWithOccurrenceTypes l
        on s.codeId = l.codeId;
    quit;

    * filter output_full_cohort, remove all rows where {type}VarValue ~= 1;
    data cohortT;
        set cohort;
        
        if type = 'Any' then varValue = anyVarValue;
        else if type = 'Often' then varValue = oftenVarValue;
        else if type = 'Frequent' then varValue = frequentVarValue;
        else if type = 'ServiceInt' then varValue = intensityVarValue;
        
        outputVar = 0;
        if varValue = . then outputVar = 0;
        else if varValue = 1 then outputVar = 1;
        
        if outputVar = 1;
    run;

    * add patients without links;
    proc sql noprint;
    create table allPatIds as
        select distinct(patId)
        from patients;

    create table cohort as
        select p.patId, 
               c.code
        from cohortT c
        full outer join allPatIds p
        on c.patId = p.patId;
    quit;

    * build an adjacency matrix;
    * get column names;
    proc sql noprint;
    create table meta as
        select distinct(trim(left(codeId)) || trim(left(type))) as code
        from selectedVariables
        order by code;
    quit;

    * build case and sum statements for the following two SQL blocks;
    *   case when code = 'D01V000Any' then 1 else 0 end as D01V000Any, ;
    *   sum(D01V000Any) as D01V000Any, ;
    data meta;
        set meta;   
        metaSQLCase = "case when code = '" || trim(left(code)) || "' then 1 else 0 end as " || trim(left(code)) || " length = 3";
        metaSQLSum = 'max(' || trim(left(code)) || ') as ' || trim(left(code)) ||' length = 3';
    run;

    proc sql noprint;
        select metaSQLCase
        into :metaSQLCaseAll separated by ','
        from meta;

        select metaSQLSum
        into :metaSQLSumAll separated by ','
        from meta;
    quit;

    proc sql noprint;
    create table adjacencyCohort as
        select patId,
               &metaSQLCaseAll.
        from cohort;
    quit;

    * remove duplicates using sum() as values are in {0,1};
    proc sql noprint;
    create table output_full_cohort as
        select distinct(patId) as patient_id,
               &metaSQLSumAll.
        from adjacencyCohort
        group by patId;
    quit;
	
%mend ms_HDVariableSelection;