/*-------------------------------------------------------------------------------------*\
|  PROGRAM NAME:                                                                        |
|     scdm_data_qa_review-demographic.sas                                               |
|                                                                                       |
|---------------------------------------------------------------------------------------|
|  PURPOSE:                                                                             |
|     The purpose of this program is to perform data quality checks on the Demographic  |
|     table.                                                                            |
|---------------------------------------------------------------------------------------|
|  PROGRAM INPUT:                                                                       |
|     see 00.0_scdm_data_qa_review_master_file.sas                                      |
|                                                                                       |
|  PROGRAM OUTPUT:                                                                      |
|     see Workplan PDF                                                                  |
|---------------------------------------------------------------------------------------|
|  CONTACT:                                                                             |
|     Sentinel Coordinating Center                                                      |
|     info@sentinelsystem.org                                                           |
\*-------------------------------------------------------------------------------------*/

*-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-;
*  PLEASE DO NOT EDIT BELOW WITHOUT CONTACTING THE SENTINEL OPERATIONS CENTER           ;
*-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-;

/*-------------------------------------------------------------------------------------*/
/*  START Level 2                                                                      */
/*-------------------------------------------------------------------------------------*/
%level2;

/*-------------------------------------------------------------------------------------*/
/*  START Level 3                                                                      */
/*-------------------------------------------------------------------------------------*/
%let level=3;

/* Define catvars_a and catvars_b from lkl_all_l1 **/
proc sql noprint;
  select Variable into: catvars_a separated by ", "
  from infolder.lkp_all_l1
  where upcase(tabid)=upcase("&module.") and upcase(VarType)="C" and upcase(Variable) ne "POSTALCODE";

  select Variable into: catvars_b separated by " "
  from infolder.lkp_all_l1
  where  upcase(tabid)=upcase("&module.") and upcase(VarType)="C" and upcase(Variable) ne "POSTALCODE";
quit;

* birth_date: list with # of records for use in the L3 module (after DP maxdate is created);
%let qa_table= ;
%macro get_qa_table ;
  %if &do_partitions and not %eval(&module in &tabid_exc) %then %do ;
    data t_view /view=t_view ;
      set qadata.&&&module.table.: (keep=birth_date &catvars_b.) ;
    run;
    %let qa_table = t_view ;
  %end ;
  %else %do;
    %let qa_table =  qadata.&&&module.table (keep=birth_date &catvars_b.) ;
  %end;
%mend;
%get_qa_table ;

proc sql noprint;
  create table dplocal.&module._temp as
  select birth_date
       , &catvars_a.
       , count(*) as n
  from &qa_table
  group by birth_date, &catvars_a.
  ;
quit;
%remove_labels(dplocal,&module._temp);

proc sql noprint;
  create table dplocal.&module._birthd as
  select birth_date
       , sum(n) as n
  from dplocal.&module._temp (keep=birth_date n)
  group by birth_date
  ;
quit;

proc sql noprint;
  create table msoc.&module._l3_catvars as
  select &catvars_a.
       , sum(n) as count format=comma15.
  from dplocal.&module._temp (keep=&catvars_b. n)
  group by &catvars_a.
  ;
quit;

/** Delete catvars_a and catvars_b from global symbol table */
data _null_;
  call symdel("catvars_a", "nowarn");
  call symdel("catvars_b", "nowarn");
run;

%macro l3_zip;
/* Load the data from the zip lookup table into a hash table and merge SCDM table on key variable ZIP to assign STATECODE*/
%if %sysfunc(exist(infolder.lkp_dem_zip,data)) %then %do;

%if &do_partitions and not %eval(&module in &tabid_exc) %then %let pn=: ; %else %let pn= ;

data dplocal.zip1 (drop=rc);
    declare Hash postalstate (); /* declare the name POSTALSTATE for hash */
    rc = postalstate.DefineKey ('postalcode'); /* identify fields to use as keys */
    rc = postalstate.DefineData ('statecode'); /* identify fields to use as data */
    rc = postalstate.DefineDone (); /* complete hash table definition */
   do until (eof1) ; /* loop to read records from lookup file */
	  set infolder.lkp_dem_zip end = eof1;
      rc = postalstate.add (); /* add each record to the hash table */
    end;
    do until (eof2) ; /* loop to read records from WORK.ZIP1 */
      set qadata.&&&module.table.&pn (keep=postalcode postalcode_date)  end = eof2;
      call missing(statecode); /* initialize the variable to fill */
      rc = postalstate.find (); /* lookup each ZIP in hash POSTALSTATE */
	output; /* write record WORK.POSTALSTATE*/
    end;
    stop;
  run;

/* create masked (pseudo) zip code variable ZIP-masked */
  data work.&module._temp1;
    set dplocal.zip1;
    length z1-z5 $1 temp $20;
    temp=tranwrd(postalcode, " ", "_"); *change spaces to underscores;
  %do j=1 %to 5;
    z&j.= substr(temp,&j.,1); *parse zip characters into 5 separate variables;
  %end;
  run;

  proc datasets lib=dplocal memtype=data nolist;
    delete zip1 ;
  run; quit;

  data work.&module._temp2;
    set work.&module._temp1;
  %do i=1 %to 5;
    if anydigit(z&i.)=1 then z2_&i.="N"; *convert numeric characters to "N";
    else if z&i="_" then z2_&i.="_"; *keep "-" for spaces;
    else if notdigit(z&i.)=1 then z2_&i.="a"; *convert alpha characters to "a";
  %end;
    drop temp z1-z5;
  run;

  data work.zip;
    length Zip_masked $5;
    set work.&module._temp2;
    zip_masked=z2_1||z2_2||z2_3||z2_4||z2_5; *concatenate 5 characters create to a pseudo zip variable;
    drop z2_1-z2_5;
  run;

  proc sql noprint;
    create table work.&module._temp3 as
       select PostalCode_Date 
       , count(*) as n
    from work.zip (keep = PostalCode_Date )
    group by 1
    ;
  quit;

%remove_labels( work , &module._temp3 ) ;

%date_percentiles (libin=work, dsin=&module._temp3, libout=dplocal, dsout=date_dist_&module., vars=PostalCode_Date);

  proc datasets lib=work nolist;
    delete &module._temp: ;
  run; quit;

  proc sql noprint;
/* create new variables ZIP-CHECK and ZIP-STATE to characterize ZIP*/
    create table work.zip1 as
    select put(postalcode_date,year4.) as Year
         , zip_masked
         , case when postalcode=' ' then 'Blank'
                when statecode ne ' ' then 'Valid'
                else 'Invalid'
           end as Zip_Check length=7
         , case when calculated zip_check='Blank' then 'Missing'
                when calculated zip_check='Valid' then statecode
                when calculated zip_check='Invalid' then 'Invalid'
           end as Zip_State length=7
    from work.zip
    ;
/* output table to characterize ZIP by year */
    create table msoc.dem_l3_zip_y as
    select *
         , count(*) as count format=comma15.
    from work.zip1 (keep=year zip_masked)
    group by year, zip_masked
    ;
/* output table to characterize the validity of zip values by year*/
    create table msoc.dem_l3_zip_verify_y as
    select *
         , count(*) as count format=comma15.
    from work.zip1 (keep=year zip_check)
    group by year, zip_check
    ;
/* output table to characterize distribution of Zip State */
    create table msoc.dem_l3_zip_state as
    select *
         , count(*) as count format=comma15.
    from work.zip1 (keep=zip_state)
    group by zip_state
    ;
  quit;

%end;
%mend;
%l3_zip;

proc sql ;
  drop table dplocal.&module._temp ;
;quit;
proc datasets kill lib=work nolist nowarn nodetails;
quit;

*-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-;
*  END scdm-data-qa-review-demographic.sas                                              ;
*-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-;