****************************************************************************************************
*                                           PROGRAM OVERVIEW
****************************************************************************************************
*
* PROGRAM: ms_partition_patients.sas  
* Created (mm/dd/yyyy): 08/18/2020
*
*--------------------------------------------------------------------------------------------------
* PURPOSE:
*  This program determines the number of partitions needed and assigns each patID to a partition
*
*  Program inputs:
*   - dataset to determine number of partitions
*
*  Program outputs:                                                                                                                                       
*   - dataset containing the partition number assigned to each patID
* 
*  PARAMETERS:                                                                       
*   - datalib: library for input dataset
*   - datafile: source data to be split
*   - split_vars: variables needed to identify unique records
*   - outfile: file containing patient partitions
*
*  Programming Notes:    
*
*--------------------------------------------------------------------------------------------------
* CONTACT INFO: 
*  Sentinel Coordinating Center
*  info@sentinelsystem.org
*
***************************************************************************************************;

%macro ms_partition_patients(datalib = , datafile = , split_vars = , outfile =);

 /*----------------------------------------------------------------------------------------------------------
   Assigns every patid in &datalib..&datafile. to one of N partitions in a seeded random order, where N is
   one partition per 2,000,000,000 bytes of dataset file size (never fewer than 1).

   Parameters:
     datalib    - library of the input dataset
     datafile   - input dataset to be split
     split_vars - blank-delimited list of variables to carry into the output; it must contain patid,
                  because patid is the de-duplication key used below
     outfile    - output dataset: one row per patid holding &split_vars. plus the numeric variable
                  PARTITION (values 1..N)

   Side effects:
     - assigns macro variable NUM_PARTITIONS
     - creates and then deletes WORK._pts and WORK._unique_patids
   ----------------------------------------------------------------------------------------------------------*/

 /*----------------------------------------------------------------------------------------------------------
   Determine the number of partitions needed for the dataset based on filesize. Create one partition for every
   2G of data
   ----------------------------------------------------------------------------------------------------------*/

   /* Default to one partition: when the dataset has no row in dictionary.tables the query below assigns
      nothing, and PROC RANK would otherwise receive an unresolved or stale GROUPS= value. */
   %let num_partitions = 1;

   proc sql noprint;
     /* MAX with two arguments is the row-level SAS function (not the SQL aggregate): it ignores a
        missing filesize and floors the result at 1 so GROUPS= is always a positive integer. */
     select max(ceil(filesize/2000000000), 1) into: num_partitions trimmed
        from dictionary.tables
        where libname = "%upcase(&datalib.)" and memname = "%upcase(&datafile.)";
   quit;

 /*----------------------------------------------------------------------------------------------------------
   Create a partition-id for each patid. Only keep the necessary split_vars for the partition.
   ----------------------------------------------------------------------------------------------------------*/

   /* Create unique patient list: one record per patid, restricted to split_vars */
   proc sort nodupkey data = &datalib..&datafile. (keep = &split_vars.) out = _unique_patids;
     by patid;
   run;

   /* Attach a random number (RANUNI, seed 1776) to each patient and order the patients by it.
      The select list is built by turning the blanks in split_vars into commas; COMPBL first collapses
      runs of blanks so that repeated blanks cannot produce empty select-list items. */
   proc sql noprint;
     create table _pts as
       select %sysfunc(tranwrd(%sysfunc(compbl(&split_vars.)),%str( ),%str(, )))
             ,ranuni(1776) as rn
       from _unique_patids
       order by calculated rn;
   quit;

   /* Number the patients 1..n in their random order (sum statement retains the running count) */
   data _pts (drop = rn);
     set _pts;
     by rn;
     patient_number + 1;
   run;

   /* Split the numbered patients into &num_partitions. groups; PROC RANK writes the group id to GROUP */
   proc rank group=&num_partitions. data=_pts out=_pts (drop=patient_number);
     var patient_number;
     ranks group;
   run;

   data &outfile.(drop = group);
     set _pts;
     length partition 3.;
     partition = group + 1;   /* PROC RANK groups start at 0, so shift to make partitions start at 1 */
   run;

   /* Remove the intermediate work datasets */
   proc datasets nowarn noprint lib=work;
     delete _pts _unique_patids;
   quit;

%mend ms_partition_patients;