/****************************************************************************************
*                                         SENTINEL PROGRAM
*****************************************************************************************
* NAME:             scdm_data_qa_review-maskresults.sas
*
* PURPOSE:          This program evaluates select QA package output based on user-defined parameter
*                   to identify whether conditions for masking apply (results specific to select TabID)
*                   and numeric value between 1 and user defined max inclusive. If conditions met,
*                   the numeric value will be masked alongside other related statistics to prevent back
*                   calculation
* 
* MAJOR STEPS: 
*                   1) Evaluate QA package output to identify files where masking may be indicated
*                   2) Evaluate each file to define conditions for and variables to mask
*                   3) Evaluate masking conditions and if met, mask numeric variable values
*                   4) Evaluate count of masking updates made per file. If only one, mask additional records in file
*                      per conditions previously defined to prevent back calculation
*
* KEY DEPENDENCIES/CONSTRAINTS/CAVEATS:
*
*   PROGRAM INPUT:  QA Package MSOC folder output   
*
*   PROGRAM OUTPUT: Masked QA Package Output (note file output to MSOC directory will be 
*                   evaluated for masking but only updated to masked if conditions met to do so  
*
****************************************************************************************/


	/* Create temp subdirectories in dplocal for organizing masking process */
	options DLCREATEDIR;	
            libname mmsoc "&MSOC.mask";
    options NODLCREATEDIR;

    /* create concatenated libname to reference data for masking, selecting mask version if it exists rather than original version from msoc */
    libname maskin (mmsoc msoc) access=readonly; /* concatenated libref to mask multiple tabids - if masked version exists, this version will be selected when macro iterates otherwise dataset from qapackage msoc will be selected*/
  
    %let maskMin = 10;/* max value for matching - masking will be triggered if value between 1 and this value inclusive */ 


    /* known numeric variables that should not be used to determine masking requirement but masked if requirement met by other variable within row of results */
    %LET EXACTMATCH_STR = %STR("MIN" "P1" "P5" "P25" "MEDIAN" "P75" "P95" "P99" "MAX" "MODE", "PROPORTION_FLAGGED"); /* if exact match on variable name, space delimited quoted list*/


*** process data to apply matching                                                                 ***;

    /* Select list of datasets and variable where masking could apply i.e. not signature or metadata  */
    Proc sql NOPRINT ;  
        create table work._DS_SELECT as 
        select distinct   upcase(memname) as memname
                        , upcase(name) as name
                        , upcase(type) as type
                        , length 
        from dictionary.columns 
        where      libname='MSOC'
               and memtype = 'DATA' 
               and memname not in ('ALL_L1_SCDM_COMP', 'ALL_L1_CONT', 'DTH_MODULE_ALL_L1_CONT')
        order by memname, name;
    quit;


    *** Macro to define and apply masking conditions per qa dataset                                    ***;
    %macro filter_memlist (tabid_list = );

        %do m = 1 %to %sysfunc(countw(&tabid_list.));

            %let mask_tabid = %scan(&tabid_list., &m.);   

            %* Process list to define datasets that should be reviewed for masking and variables within that ;
            %* will be evaluated to do so                                                                    ;
            DATA dplocal.mask_list_&mask_tabid.;

                length ind_tabid_memname ind_tabid_var ind_tabid_index ind_id_mask ind_apply_mask 3;

                SET work._DS_SELECT ;
                by memname;

                /* limit to datasets that are based on ALL SCDM tables or specific table specified by user parameter */
                if not(index(memname, 'SIGNATURE'))
                   and (   substr(memname,1,3) = 'ALL' 
                        or index(memname, %upcase("&mask_tabid._")));
    
                /* dataset level indicators for whether masking logic should be applied to table  */
    
                /* filename references specific SCDM table */
                ind_tabid_memname = (index(memname, %upcase("&mask_tabid._")));
        
                /* variable name references specific SCDM table */
                ind_tabid_var    = (     not(ind_tabid_memname)
                                     and (    index(name, %upcase("&mask_tabid._"))
                                           or name = %upcase("&mask_tabid.")));

                /* variable value has potential to reference specific SCDM table - note additional processing required to ascertain */
                ind_tabid_index  = (     not(ind_tabid_var)
                                     and type = 'CHAR' 
                                     and name ne 'DP'
                                     and length >= 3); 

                /* variable level indicator */

                /* variable value that should be used to identify whether masking required i.e. does not mean minimum n requirement */
                ind_id_mask  = (    type = 'NUM' 
                                 and name not in (&EXACTMATCH_STR.)
                                 and not(index(name, 'DATE'))
                                 and not(index(name, 'LENGTH')));

                /* indicator for variable value to be masked if ind_id_mask triggered */
                ind_apply_mask = (    type = 'NUM' );

            run;

            %* Create parameter with list of all datasets where masking may be required;
            %local list_memfilter;
            %let list_memfilter = ;
            proc sql NOPRINT;

                select distinct memname
                    into :list_memfilter
                    separated by ' '
                    from ( select  memname
                                 , sum(ind_tabid_memname) as sum_ind_tabid_memname
                                 , sum(ind_tabid_var)     as sum_ind_tabid_var
                                 , sum(ind_tabid_index)   as sum_ind_tabid_index
                                 , sum(ind_id_mask)       as sum_ind_id_mask
                            from dplocal.mask_list_&mask_tabid.
                            group by memname)
                    where (sum_ind_tabid_memname or sum_ind_tabid_var or sum_ind_tabid_index)
                           and sum_ind_id_mask; 
            quit;
           
            %* iterate through each distinct memname to evaluate, mask as required, and save result to dplocal directory;
            %if %length(&list_memfilter.) > 0
            %then %do i = 1 %to %sysfunc(countw(&list_memfilter.));

                %let memfilter = %scan(&list_memfilter., &i.);         
     
                %let list_ind =  tabid_memname tabid_index tabid_var id_mask apply_mask;
                %* Select metadata for single dsn and populate parameters to identify and apply masking;
                %do j = 1 %to %sysfunc(countw(&list_ind.));        

                    %let indtype = %scan(&list_ind., &j.);
                    %let list_&indtype. =;

                    proc sql NOPRINT;
                        select name 
                        into :list_&indtype. 
                        separated by ' '
                        from dplocal.mask_list_&mask_tabid.
                        where memname = "&memfilter."
                              and ind_&indtype.;
                    quit;        
                %end; %* end j do loop;
 
 
                %* Create temp masked dataset;
                data _&memfilter. (drop = _ind:) ;
                    
                    set maskin.&memfilter. end = eof; 
                    
                    /* dummy var */
                    _ind_dummy=.;

                    /* if tabid in filename, set indicator */
                    _&mask_tabid._memname= 0;
                    %if %length(&list_tabid_memname.) > 0
                        %then _&mask_tabid._memname= 1;;
    
                    /* filter any variables based on variable name to identify if masking applies */
                    _&mask_tabid._varname=0;
                    %if %length(&list_tabid_var.)  > 0
                    %then %do;
                        %do b = 1 %to %sysfunc(countw(&list_tabid_var.));
                            %let var_name = %scan(&list_tabid_var., &b.); 
                            _ind_var_name_&b. = (&var_name. = '1');
                        %end; %* end b do loop;
                        _&mask_tabid._varname = sum(of _ind_var_name_:);
                    %end; %* end tabid var assignment;
    
                    /* filter any variable based on value to identify if masking applies and apply within if/then do loop */
                    _&mask_tabid._index =0;
                    %if %length(&list_tabid_index.) > 0
                    %then %do;
                        %do a = 1 %to %sysfunc(countw(&list_tabid_index.));
                            %let tabid_index = %scan(&list_tabid_index., &a.); 
                            _ind_tabid_index_&a. =(   upcase(&tabid_index) = %upcase("&mask_tabid.")
                                                   or index(upcase(&tabid_index), %upcase("&mask_tabid._"))
                                                   or index(upcase(&tabid_index), %upcase("&mask_tabid.-")));
                        %end; 
                        _&mask_tabid._index =sum(of _ind_tabid_index_:);
                    %end; %* end tabid variable value assignment;

                    /* evaluate mask id variables and apply making to all applicable values if min threshold not met */
                    _masked_&mask_tabid. = .;
                    if sum(of _&mask_tabid._memname, _&mask_tabid._varname, _&mask_tabid._index) > 0
                    then do;    
                        _masked_&mask_tabid. = 0;             
                        %do c = 1 %to %sysfunc(countw(&list_id_mask.));   
                            %let id_mask = %scan(&list_id_mask., &c.); 
                            if  0< &id_mask. <= &maskMin.
                            then do;
                                _masked_&mask_tabid. = 1;
                                %do d = 1 %to %sysfunc(countw(&list_apply_mask.));     
                                    %let apply_mask = %scan(&list_apply_mask., &d.);
                                    &apply_mask. = .M;
                                %end; %* end d loop;

                            end; 
                        %end; %* end c loop;
                    end;

                run;
                
                %let SumMaskN = ;
                proc sql NOPRINT;
                    select count(*)
                    into :SumMaskN trimmed
                    from _&memfilter.
                    where _masked_&mask_tabid. = 1;
                quit;
                   
                proc sort data = _&memfilter.;
                    by &list_id_mask.;
                run;

                /* if single record masked, mask additional records to prevent ability to back calculate */
                data mmsoc.&memfilter.; 

                    set _&memfilter.;

                    /* dummy var */
                    _ind_dummy = .;
 
                    %if &SumMaskN.= 1
                    %then %do;  
                        retain _addmaskn_&mask_tabid.;   
                        if _masked_&mask_tabid. > . then do;                 
                            /* evaluate mask id variables and apply masking to all applicable values if min threshold not met */
                            %do c = 1 %to %sysfunc(countw(&list_id_mask.));   
                                %let id_mask = %scan(&list_id_mask., &c.);     
                                if      &id_mask. > 0
                                   and _addmaskn_&mask_tabid. < 1
                                then do;
                                    _addmaskn_&mask_tabid. = sum(_addmaskn_&mask_tabid., 1);
                                    %do d = 1 %to %sysfunc(countw(&list_apply_mask.));     
                                        %let apply_mask = %scan(&list_apply_mask., &d.);
                                        &apply_mask. = .R;
                                    %end; %* end d loop;
                                end; 
                            %end; %* end c loop;      
                        end;
                    %end; %* end additional masking do statement;
                run;     
            %end; %* end i do loop;             
        %end; %* end m do loop;

    %mend filter_memlist;



    /* move files created in dplocal to msoc output */
    %macro move_masked;

        %* identify intermediate dataset created in dplocal directory ;
        %let list_memmove = ;
        proc sql NOPRINT;
            select distinct memname
                into :list_memmove
                separated by ' '
                from dictionary.columns 
                where      upcase(libname)='MMSOC'
                      and upcase(memtype) = 'DATA';
        quit;
   
        %* iterate through each dataset to create version in msoc directory renamed to add prefix indicating masking evaluated and applied if indicated;   
        %if %length(&list_memmove.) > 0 
        %then %do k = 1 %to %sysfunc(countw(&list_memmove.));

            %let memmove = %scan(&list_memmove., &k.); 

            %* preserve unmasked/original version of dataset;
            data dplocal.&memmove.;
                set msoc.&memmove.;
            run;

            %* move reviewed and potentially masked dataset to msoc directory;
            data msoc.&memmove.;
                set MMSOC.&memmove. (drop = _:);
            run;

        %end; %* end k loop;


    %mend move_masked;

    %filter_memlist (tabid_list = &MaskTabid_List.);

    %move_masked;

    proc datasets lib=MMSOC nolist nowarn kill;
    run; quit;
     
    filename tempdir "&MSOC.mask";
    data _null_;
        rc=fdelete('tempdir');
        put rc=;
        msg=sysmsg();
        put msg=;
    run;

*** ---------------------------------- END PROGRAM ----------------------------------------------- ***;