****************************************************************************************************
*                                           PROGRAM OVERVIEW
****************************************************************************************************
*
* PROGRAM: ms_partition_patients.sas
* Created (mm/dd/yyyy): 08/18/2020
*
*--------------------------------------------------------------------------------------------------
* PURPOSE:
*   This program determines the number of partitions and assigns each patID to a partition
*
*  Program inputs:
*   - dataset to determine number of partitions
*
*  Program outputs:
*   - dataset containing each partition number
*
*  PARAMETERS:
*   - datalib: library for input dataset
*   - datafile: source data to be split
*   - split_vars: variables needed to identify unique records
*   - outfile: file containing patient partitions
*
*  Programming Notes:
*   - One partition is created for every 2,000,000,000 bytes of file size reported by
*     dictionary.tables. At least one partition is always created
*   - The input is de-duplicated by patid, so the output holds one row per patid
*   - Work datasets _pts and _unique_patids are created and deleted by this macro
*
*--------------------------------------------------------------------------------------------------
* CONTACT INFO:
*  Sentinel Coordinating Center
*  info@sentinelsystem.org
*
***************************************************************************************************;

%macro ms_partition_patients(datalib = , datafile = , split_vars = , outfile =);

    /*----------------------------------------------------------------------------------------------------------
      Determine the number of partitions needed for the dataset based on filesize.
      Create one partition for every 2G of data.

      num_partitions is pre-set to 1 so that it is always defined, even when the dictionary query
      returns no row. It is deliberately NOT declared %local: a caller may have declared it in an
      enclosing scope in order to read the partition count back (assumption - confirm against callers).

      max(1, ...) guards against a file size of 0 or missing, which would otherwise hand
      PROC RANK an invalid number of groups.
    ----------------------------------------------------------------------------------------------------------*/
    %let num_partitions = 1;

    proc sql noprint;
        select max(1, ceil(filesize/2000000000))
          into :num_partitions trimmed
        from dictionary.tables
        where libname = "%upcase(&datalib.)"
          and memname = "%upcase(&datafile.)";
    quit;

    /*----------------------------------------------------------------------------------------------------------
      Create a partition-id for each patid (done even when only one partition is required, in which
      case every patid is assigned to partition 1).
      Only keep the necessary split_vars for the partition.
    ----------------------------------------------------------------------------------------------------------*/

    /* Create unique patient list: one row per patid */
    proc sort nodupkey
        data = &datalib..&datafile. (keep = &split_vars.)
        out  = _unique_patids;
        by patid;
    run;

    /* Attach a seeded pseudo-random number to every patient and order by it.
       compbl collapses repeated blanks in split_vars so the blank-to-comma translation
       cannot produce empty items in the SELECT list. */
    proc sql noprint;
        create table _pts as
        select %sysfunc(tranwrd(%sysfunc(compbl(&split_vars.)),%str( ),%str(, )))
               ,ranuni(1776) as rn
        from _unique_patids
        order by calculated rn;
    quit;

    /* Number the patients sequentially in their randomized order */
    data _pts (drop = rn);
        set _pts;
        by rn;
        patient_number + 1;
    run;

    /* Split the sequence numbers into num_partitions groups (numbered from 0) */
    proc rank group=&num_partitions. data=_pts out=_pts (drop=patient_number);
        var patient_number;
        ranks group;
    run;

    /* Convert the 0-based rank group into a 1-based partition number */
    data &outfile.(drop = group);
        set _pts;
        length partition 3.;
        partition = group + 1; /* first group starts at 0 */
    run;

    /* Remove intermediate work datasets */
    proc datasets nowarn noprint lib=work;
        delete _pts _unique_patids;
    quit;

%mend ms_partition_patients;