****************************************************************************************************
*                                           PROGRAM OVERVIEW
****************************************************************************************************
*
* PROGRAM: baseline_aggregate.sas  
* Created (mm/dd/yyyy): 10/20/2020
*
*--------------------------------------------------------------------------------------------------
* PURPOSE: The macro imports and concatenates both Level 1 and Level 2 baseline tables
*
*  Program inputs:                                                                                   
*   - Level 1 tables in the format: [runid]_baseline_[cohort]_[periodid].sas7bdat
*   - Level 2 tables in the format: [runid]_adjusted_baseline_[periodid].sas7bdat
* 
*  Program outputs:                                                                                                                                       
*  	- Dataset where each row represents a baseline metric and each column represents a DP value.
*     Groups in the GROUPTABLE are stacked
* 
*  PARAMETERS:        
*   - dpsiteid: name of DP to import. Will be used as libname
    - dpnumber: number to assign to each DP after transpose
*   - level: 1 or 2
*   - grouptable: name of the dataset that contains groups, runIDs, and infomation pertaining to the 
*     baseline table name to include in the report.
*   - outdata: name of the aggregated dataset
*   - periodid: monitoring period #
*
*
*  Programming Notes:                                                                                
*   - This macro is designed to be called within a loop that loops through each data partner
*
*   - For level 1 requests, the baseline table file name will be dynamically derived using 
*     &dpsiteid as the libname and the file name as: [runid]_baseline_[cohort]_&periodid
*
*   - GROUPTABLE must contain the following variables: GROUP, RUNID, COHORT, and MERGEVAR
*     For level 2 requests, only GROUP and RUNID need to be populated. GROUP will represent the ANALYSISGRP.
*     For level 1 requests, COHORT is optional and will be used to correctly name the associated baseline 
*     table ([runid]_baseline_[cohort]_[periodid].sas7bdat), MERGEVAR is required and is the name of 
*     the key variable in the baseline table (group or analysisgrp)
*
*   - It is assumed the user intends to import and aggregate all groups in the GROUPTABLE input file.
*     The user should restrict this file to only groups with a baseline table prior to executing 
*     this macro.
*
*--------------------------------------------------------------------------------------------------
* CONTACT INFO: 
*  Sentinel Coordinating Center
*  info@sentinelsystem.org
*
***************************************************************************************************;

%macro baseline_aggregate(dpsiteid = ,
                          maskedid = ,
                          dpnumber = ,
                          level = ,
                          grouptable = ,
                          outdata = ,
                          periodid = );

	%put =====> MACRO CALLED: baseline_aggregate;

    /*********************************************************************************************/
    /* Level 1 baseline tables
    /*********************************************************************************************/

    %if %eval(&level.=1) %then %do;

        /*Use GROUPTABLE to create list of baseline tables and create dataset in statement*/
        data _temp_baseline_tablenames;
            set &GROUPTABLE.;
            format baselinetablename $40.;
            if missing(cohort) then do;
                baselinetablename = lowcase(cats("&dpsiteid..",runid, "_baseline_&periodid"));
            end;
            else do;
                baselinetablename = lowcase(cats("&dpsiteid..",runid, "_baseline_", cohort, "_&periodid"));
            end;
        run;

        proc sql noprint;
            select distinct baselinetablename into: baselinetables separated by ' '
            from _temp_baseline_tablenames;
            select count(distinct baselinetablename) into: num_unique_baseline_tables
            from _temp_baseline_tablenames;
        quit;

        %put Extracting baseline tables: &baselinetables.;

        /*Loop through each baseline table, import, stack groups*/

        %do b = 1 %to %eval(&num_unique_baseline_tables.);
            %let infile = %scan(&baselinetables., &b., ' ');

            /*Ensure table exists: if it does not exist, write error to the log and abort*/
            %if %sysfunc(exist(&infile.))=0 %then %do;
		 		%put ERROR: &infile. does not exist. Program will abort;
			 	%abort; 
		 	%end;

            /*Merge table with GROUPSTABLE and only keep Groups/Analysisgrps in the input file*/
            data _temp_baseline_tablenames&b.;
                set _temp_baseline_tablenames(where=(baselinetablename="&infile."));
                call symputx('mergevar', mergevar);
            run;

            /*If lab covariates specified, create comma separated list to ensure they exist in data */
            %let checkbaselinelabvars=;
            %if %length(&labcharacteristics) > 0 %then %do;
                proc contents data = &infile noprint out=_labvarsname(keep=name);
                run;

                %create_comma_charlist(inlist=%qsysfunc(compress(&labcharacteristics,%str(%"))), outlist=labcharscomma);
                
                /* Check to see if specified lab covariates exist */
                proc sql noprint;
                select name 
                into :checkbaselinelabvars 
                from _labvarsname
                where upper(name) in (&labcharscomma);
                quit;
            %end;

            proc sql noprint;
                create table _temp_baseline_tablenum&b. as
                select x.*
                     , y.runid
                     , y.order
                     , y.cohort
                    %if %length(&labcharacteristics) > 0 and %length(&checkbaselinelabvars) > 0 %then %do;
                        %do labvars = 1 %to %sysfunc(countw(&labcharacteristics));
                        %let labvar = %scan(&labcharacteristics,&labvars);
                        , n_episodes - &labvar as &labvar._notestrecord label="No test record"
                        %end;
                    %end;
                     %if "&mergevar" ne "analysisgrp" %then %do;
                     , y.analysisgrp
                     %end;
                     , y.group as group1
                from &infile. as x,
                     _temp_baseline_tablenames&b. as y
                where x.&mergevar. = y.group;
            quit;
        %end;

        /*Stack baseline tables*/
        data _temp_baseline_stacked;
            set _temp_baseline_tablenum:;
        run;

		/*if stratifying by data partner and collapse_vars = race, recode values 1-10 to Unknown*/
		%if "&stratifybydp." = "Y" and "&collapse_vars." = "race" %then %do;
            /*confirm race vars exist on dataset*/
            proc contents noprint data = _temp_baseline_stacked 
                                   out = content_out;
            run;

            proc sql noprint;
    		    select count(name) into :race_cats 
    		    from content_out
    		    where lowcase(name) like 'race_%';
            quit; 

            proc datasets nowarn noprint lib=work;
                delete content_out;
            quit;

            %if %eval(&race_cats>0) %then %do;

                /*type 6 - if any switch is collapsed, collapse all switches*/
                proc sort data=_temp_baseline_stacked;
                    by order %if &reporttype.=T6 %then %do; descending switchstep %end;;
                run;

                %let collapse_comparator = N;

                data _temp_baseline_stacked(drop=recode_:);
                    missing R;
                    set _temp_baseline_stacked;
                    by order;

                    /* Remove 0 from list and count */
                    %let race_cats_nozero = %sysfunc(tranwrd(&race_cats,0,%str()));
                    %let race_cats_count = %sysfunc(countw(&race_cats_nozero));

                    /*Recode*/
                    if first.order then do;
                        %do c_r = 1 %to &race_cats_count;
                            %let race_value = %scan(&race_cats_nozero,&c_r);
                        recode_&race_value. = 'N'; /*to track if another anchor switch or cohort collapsed*/
        		        if 1 <= race_&race_value <= 10 then do;
                            race_0 = sum(race_0, race_&race_value.);
                            /*set recoded value to special missing value to track in code and final table*/
                            race_&race_value. = .R;
                            recode_&race_value. = 'Y';
                        end;
                        retain recode_&race_value.;
                        %end;             
                    end;
                    else do;
                        %do c_r = 1 %to &race_cats_count;
                            %let race_value = %scan(&race_cats_nozero,&c_r);
        		        if 1 <= race_&race_value <= 10 | recode_&race_value. = 'Y' then do;
                            race_0 = sum(race_0, race_&race_value.);
                            /*set recoded value to special missing value to track in code and final table*/
                            race_&race_value. = .R;
                            recode_&race_value. = 'Y';
                            retain recode_&race_value.;
                        end;
                        %end;
                        /*mark if comparator cohort*/
                        %if &reporttype. ne T6 %then %do; call symputx('collapse_comparator', 'Y'); %end;
                    end;
                run; 

                %if &collapse_comparator = Y %then %do;
                /*if there is a comparator cohort, need to recode other group if a row has been collapsed in 1 group.
                  This is not needed for Type 6 because the at each switch, the # of patients will inherantly decrease, 
                  whereas for comparator cohorts this is not the case */
                    proc sort data=_temp_baseline_stacked out=_temp_baseline_stacked;
                        by order descending &mergevar.;
                    run;

                    data _temp_baseline_stacked(drop=recode_:);
                        set _temp_baseline_stacked;
                        by order;

                       /*Determine if other cohort was recoded*/
                        if first.order then do;
                            %do c_r = 1 %to &race_cats_count;
                                %let race_value = %scan(&race_cats_nozero,&c_r);
                            recode_&race_value. = 'N';
            		        if race_&race_value =.R then do;
                                recode_&race_value. = 'Y';
                            end;
                            retain recode_&race_value.;
                            %end;             
                        end;
                        else do;
                            %do c_r = 1 %to &race_cats_count;
                                %let race_value = %scan(&race_cats_nozero,&c_r);
            		        if recode_&race_value. = 'Y' then do;
                                race_0 = sum(race_0, race_&race_value.);
                                /*set recoded value to special missing value to track in code and final table*/
                                race_&race_value. = .R;
                            end;
                            %end;
                        end;
                    run; 
                %end;
            %end; /*race exists on dataset*/
        %end; /*collapse race categories*/

        /*Transpose and rename variable holding metrics to DP&DPNUMBER*/
        proc sort data=_temp_baseline_stacked;
            by analysisgrp group1 runid order cohort &switch_s;
        run;
		
		/*Determine the total number of episodes on the stacked dataset. A value of 0 indicates a default baseline table*/
		proc sql noprint;
		  select sum(n_episodes) into: total_episodes 
		  from _temp_baseline_stacked;
		quit;

        proc transpose data=_temp_baseline_stacked out=_temp_baseline_transposed;
            by analysisgrp group1 runid order cohort &switch_s; 
		run;

        proc datasets library=WORK nowarn nolist;
            modify _temp_baseline_transposed;  
            rename col1=dp&dpnumber.;
            rename _name_ = metvar;
        quit;

        proc sort data=_temp_baseline_transposed;
            by analysisgrp group1 runid order cohort metvar &switch_s;
        run;
		
		/* Change case of metvar from n_episodes to N_episodes for baseline datasets that have 0 total N_episodes */
		%if &total_episodes. = 0 %then %do;
		   data _temp_baseline_transposed;
		     set _temp_baseline_transposed;
             length _label_ $&baselinelabellength;
			 if metvar = 'n_episodes' then metvar = 'N_episodes';
             /* Initialize _label_ variable when there are no patients in the cohort */
            _label_='';
		   run;
		%end;

        /*if DPNUMBER =1 or &outdata does not exist, then output &outdata, else merge into existing outdata*/
        %if %eval(&dpnumber.=1) | %sysfunc(exist(&outdata.))=0 %then %do;
            data &outdata.;
                set _temp_baseline_transposed;
            run;
            data _baseline_agg_&periodid.;
                set _temp_baseline_stacked;
				length dpidsiteid $6;
                dpidsiteid = "&maskedid."; 
            run;
        %end;
        %else %do;
            data &outdata.;
                length metvar $32;
                merge &outdata.
                      _temp_baseline_transposed(in=a);
                by analysisgrp group1 runid order cohort metvar &switch_s;
            run;
            data _baseline_agg_&periodid.;
                set _baseline_agg_&periodid.
				    _temp_baseline_stacked (in=b);
                if b then dpidsiteid = "&maskedid.";  
            run;
        %end;

		
    %end; /*level 1 baseline tables*/


    /*********************************************************************************************/
    /* Level 2 baseline tables
    /*********************************************************************************************/

    %if %eval(&level.=2) %then %do;

        /*Loop through each runID, import, stack analysisgrps*/
				
        proc sql noprint;
            select distinct runid into: unique_baseline_runids separated by ' '
            from &GROUPTABLE.;
            select count(distinct runid) into: num_unique_baseline_runids
            from &GROUPTABLE.;
        quit;

        %do b = 1 %to %eval(&num_unique_baseline_runids.);
            %let runid = %scan(&unique_baseline_runids., &b., ' ');

            /*Ensure table exists: if it does not exist, write error to the log and abort*/
            %if %sysfunc(exist(&dpsiteid..&runid._adjusted_baseline_&periodid.))=0 %then %do;
		 		%put ERROR: &runid._adjusted_baseline_&periodid. for &dpsiteid. does not exist. Program will abort;
			 	%abort; 
		 	%end;

            /*Merge table with GROUPSTABLE and only keep Analysisgrps in the input file*/
            proc sql noprint;
                create table _temp_baseline_tablenum&b. as
                select x.*
                     , y.runid
                     , y.order
                     , &periodid as monitoringperiod length=3
                from &dpsiteid..&runid._adjusted_baseline_&periodid. as x,
                     &GROUPTABLE.(where=(runid="&runid.")) as y
                where x.analysisgrp = y.group;
            quit;

			/* Get lab covariate labels from L1 baseline table variables */
			%if %length(&labcharacteristics) > 0 %then %do;
				data _temp_baseline_labcovars_&b.;
				%if %str("&reporttype") = %str("T4L2") %then %do;
					set &dpsiteid..&runid._baseline_mi_&periodid.(obs=1);
				%end;
				%else %do;
					set &dpsiteid..&runid._baseline_&periodid.(obs=1);
				%end;				
				run;
			%end;
        %end;		

        /*Stack baseline tables*/
        data _temp_baseline_stacked;
            set _temp_baseline_tablenum:;

            /*defensive: set metvar to uppercase*/
            metvar=upcase(metvar);

            /*replace TOTAL with N_EPISODES and assign vartype to match L1 tables*/
            if metvar = 'TOTAL' then do;
                metvar = 'N_EPISODES';
                vartype = 'dichotomous';
                if exp_mean=. then exp_mean = 0;
                if comp_mean=. then comp_mean = 0;
            end;
        run;

		/* If lab covariates specified, create comma separated list to ensure they exist in data */
		%let checkbaselinelabvars=;
		%if %length(&labcharacteristics) > 0 %then %do;
			data _temp_baseline_labcovars;
			set _temp_baseline_labcovars_:;
			run;

		    proc contents data = _temp_baseline_labcovars noprint out=_labvarsname(keep=name label);
		    run;

		    %create_comma_charlist(inlist=%qsysfunc(compress(&labcharacteristics,%str(%"))), outlist=labcharscomma);
		    
		    /* Check to see if specified lab covariates exist */
		    proc sql noprint;
		    select name 
		    into :checkbaselinelabvars 
		    from _labvarsname
		    where upper(name) in (&labcharscomma);
		    quit;
		%end;

		%if %length(&labcharacteristics) > 0 and %length(&checkbaselinelabvars) > 0 %then %do;		
			proc sort data=_temp_baseline_stacked;
			by metvar;
			run;
	
			* Get lab covariate labels;
			data _labvarsname;
			set _labvarsname;
			format metvar $30.;
			name=upcase(name);
			if index(name,"COVAR")>0;
			metvar=tranwrd(name, "MEAN_", "");
			metvar=strip(tranwrd(metvar, "STD_", ""));
			drop name;
			run;			

			proc sort nodupkey data=_labvarsname;
			by metvar;
			run;

			data _temp_baseline_stacked(rename=label=_label_);
			merge _temp_baseline_stacked(in=a)
				  _labvarsname;
			by metvar;
			if a;
			if index(metvar, "_NOTESTRECORD")>0 then label="No test record";
			run;		
		%end;

        /*Add &DPNUMBER suffix to variables*/
        proc datasets library=work noprint;
                modify _temp_baseline_stacked;    
                rename  exp_mean = exp_mean&dpnumber.
                        exp_std = exp_std&dpnumber.
                        comp_mean = comp_mean&dpnumber.
                        comp_std = comp_std&dpnumber.
                        exp_s2 = exp_s2_&dpnumber.
                        comp_s2 = comp_s2_&dpnumber.
                        ad   = ad&dpnumber.
                        sd   = sd&dpnumber.
                        exp_w = exp_w1_&dpnumber.
                        exp_w2 = exp_w2_&dpnumber.
                        comp_w = comp_w1_&dpnumber.
                        comp_w2 = comp_w2_&dpnumber.;
        quit;

        proc sort data=_temp_baseline_stacked; 
            by analysisgrp runid order table group1 group2 weight subgroup subgroupcat vartype metvar;                 
        run;

        /*if DPNUMBER =1 or &outdata does not exist, then output &outdata, else merge into existing outdata*/
        %if %eval(&dpnumber.=1) | %sysfunc(exist(&outdata.))=0 %then %do;
            data &outdata. (drop=dpidsiteid) 
			     _baseline_agg_&periodid. (rename=(exp_mean&dpnumber.=exp_mean
												   exp_std&dpnumber.=exp_std
												   comp_mean&dpnumber.=comp_mean
												   comp_std&dpnumber.=comp_std
												   exp_s2_&dpnumber.=exp_s2
												   comp_s2_&dpnumber.=comp_s2
												   ad&dpnumber.=ad
												   sd&dpnumber.=sd
												   exp_w1_&dpnumber.=exp_w
												   exp_w2_&dpnumber.=exp_w2
												   comp_w1_&dpnumber.=comp_w
												   comp_w2_&dpnumber.=comp_w2));
                set _temp_baseline_stacked;
				length dpidsiteid $6 monitoringperiod 3;
                dpidsiteid = "&maskedid."; 
                monitoringperiod=&periodid;
            run;
        %end;
        %else %do;
            data &outdata.;
                merge &outdata.(in=a)
                      _temp_baseline_stacked;
                by analysisgrp runid order table group1 group2 weight subgroup subgroupcat vartype metvar; 
            run;
            data _baseline_agg_&periodid.;
                set _baseline_agg_&periodid.
				    _temp_baseline_stacked (in=b rename=(exp_mean&dpnumber.=exp_mean
													     exp_std&dpnumber.=exp_std
						  							     comp_mean&dpnumber.=comp_mean
													     comp_std&dpnumber.=comp_std
													     exp_s2_&dpnumber.=exp_s2
													     comp_s2_&dpnumber.=comp_s2
													     ad&dpnumber.=ad
													     sd&dpnumber.=sd
													     exp_w1_&dpnumber.=exp_w
													     exp_w2_&dpnumber.=exp_w2
													     comp_w1_&dpnumber.=comp_w
													     comp_w2_&dpnumber.=comp_w2));
                if b then do;
                dpidsiteid = "&maskedid.";
                monitoringperiod=&periodid;
                end;  
            run;
        %end;
		
        proc sort data=_baseline_agg_&periodid.; 
            by dpidsiteid analysisgrp runid order table group1 group2 weight subgroup subgroupcat vartype metvar;                 
        run;
			
    %end; /*level 2 baseline tables*/
						
    /*Clean up*/
    proc datasets nowarn noprint lib=work;
        delete _temp_baseline_:;
    quit;
    
	%put =====> END MACRO: baseline_aggregate;

%mend baseline_aggregate;