**************************************************************************************************** * PROGRAM OVERVIEW **************************************************************************************************** * * PROGRAM: ms_HDPS.sas * * Created (mm/dd/yyyy): 01/26/2016 * Last modified: 06/30/2019 * Version: 1.1 * *-------------------------------------------------------------------------------------------------- * PURPOSE: * This macro will execute the HDPS variable selection algorithm * * Program inputs: * - 5 dimension datasets * - dataset containing the list of patients, groups and outcomes * * Program outputs: * - dataset containing variables created/selected at the PatId level * * PARAMETERS: * - var_patient_id = name of the Patid variable * - var_exposure = name of exposure variable * - var_outcome = name of the outcome variable * - top_n = 100 * - k = 200 * - ranking_method = ranking_method EXP_ASSOC (or BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING} * - outcome_zero_cell_corr = outcome_zero_cell_corr in {0, 1} * - outcome_type = outcome_type in {DICHOTOMOUS, BINARY, COUNT, CONTINUOUS} * - trunc_9 = # of characters to keep from ICD-9 code * - trunc_10 = # of characters to keep from ICD-10 code * - input_cohort = dataset containing the patid level information (level 1) * - input_dim1-10 = name of the dimension files (i.e., codes files from ms_cidacov) * - path_output = To output .txt files * * Programming Notes: * * * *-------------------------------------------------------------------------------------------------- * CONTACT INFO: * Sentinel Coordinating Center * info@sentinelsystem.org * *-------------------------------------------------------------------------------------------------- * CHANGE LOG: * * Version Date Initials Comment (reference external documentation when available) * ------- -------- -------- --------------------------------------------------------------- * 1.1 06/30/2019 AP Added analysisgrp to output_all_vars dataset to facilitate append * ***************************************************************************************************; %macro ms_HDVariableSelection( var_patient_id = patId, var_exposure = exposure, var_outcome = outcome, top_n = 100, k = 200, ranking_method = 'EXP_ASSOC', /* ranking_method in {BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING} */ outcome_zero_cell_corr = 1, /* outcome_zero_cell_corr in {0, 1} */ outcome_type = 'DICHOTOMOUS', /* outcome_type in {DICHOTOMOUS, BINARY, COUNT, CONTINUOUS} */ trunc_9 = ., /* # of characters to keep from ICD-9 code */ trunc_10 = ., /* # of characters to keep from ICD-10 code */ input_cohort = , input_dim1 = , input_dim2 = , input_dim3 = , input_dim4 = , input_dim5 = , input_dim6 = , input_dim7 = , input_dim8 = , input_dim9 = , input_dim10 = , path_output = ); %put NOTE: start ms_HDVariableSelection; %put NOTE: macro variables; %put var_patient_id = &var_patient_id; %put var_exposure = &var_exposure; %put var_outcome = &var_outcome; %put top_n = &top_n; %put k = &k; %put ranking_method = &ranking_method; %put outcome_zero_cell_corr = &outcome_zero_cell_corr; %put outcome_type = &outcome_type; %put input_cohort = &input_cohort; %put input_dim1 = &input_dim1; %put path_output = &path_output; * clean up - MUST delete with each run.; * datasets are appended to as each dimension is processed; proc datasets library = work nowarn nolist; delete allVars allCodes allOccurrenceTypes allLinks allCodesHistogram allCodeMaps allDimNames; quit; * cumulative data sets need to be created before using %ms_appendfiles; data allVars; stop; run; data allCodes; stop; run; data allOccurrenceTypes; stop; run; data allLinks; stop; run; data allCodesHistogram; stop; run; data allCodeMaps; stop; run; data allDimNames; stop; run; /** 2) Read Patients **********************************************************************/ data patients; set &input_cohort.; if &var_exposure. ~= 0 then exposed = 'T'; else exposed = 'F'; * if outcome does not exist then outcome = 0; outcome = coalesce(&var_outcome., 0); if outcome ~= 0 then outcomeDichotomous = 'T'; else outcomeDichotomous = 'F'; outcomeCount = int(outcome); if outcomeCount = . then outcomeCount = 0; outcomeContinuous = outcome; if outcomeContinuous = . then outcomeContinuous = 0; * if futime column doesnt exist then futime = .; futime = coalesce(futime, .); if futime ~= . then followUpTime = futime; else followUpTime = 1; keep &var_patient_id. exposed outcomeDichotomous outcomeCount outcomeContinuous followUpTime; rename &var_patient_id. = patId; run; %let totalPatients = &SYSNOBS; %put totalPatients = &totalPatients.; * set count variables; proc means noprint data = patients; output out = patientTotals sum(followUpTime) = ptTotal sum(outcomeContinuous) = sumOfOutcomes sum(outcomeCount) = numEvents; run; proc sql noprint; create table patientTotals2 as select sum(followUpTime) as ptExposed, count(*) as nExposed from patients where exposed = 'T'; quit; proc sql noprint; create table patientTotals3 as select coalesce(sum(followUpTime), 0) as nOutcome from patients where outcomeDichotomous = 'T'; quit; data patientTotals; set patientTotals; set patientTotals2; set patientTotals3; drop _TYPE_ _FREQ_; run; /** 3) Read Dimensions ********************************************************************/ /** 3) a) Build Code Database *************************************************************/ %macro readDimension(dimName, dimDataSet, dimCodeVarName); %put "readDimension(&dimName, &dimDataSet, &dimCodeVarName)"; * build a mapping: codeNum -> codeId; * maintain original order; data &dimName.Copy; set &dimDataSet.; n = _N_; %if %sysfunc(prxmatch(m/icddx09/i,&dimDataSet)) and %str("&TRUNC_9.") ^= %str(".") %then %do; &dimCodeVarName. = substr(&dimCodeVarName.,1,&TRUNC_9.); %end; %if %sysfunc(prxmatch(m/icddx10/i,&dimDataSet)) and %str("&TRUNC_10.") ^= %str(".") %then %do; &dimCodeVarName. = substr(&dimCodeVarName.,1,&TRUNC_10.); %end; keep &var_patient_id. &dimCodeVarName. n; rename &var_patient_id. = patId &dimCodeVarName. = code; run; proc sql noprint; create table &dimName.Codes as select code, min(n) as minN from &dimName.Copy group by code order by minN; quit; proc sql noprint; select count(*) into :numDistinctCodes from &dimName.Codes; quit; data _NULL_; put "&dimDataSet. numDistinctCodes=&numDistinctCodes."; digits = length(strip("&numDistinctCodes.")); digits = max(3, digits); call symput('maxCodeDigits', put(digits, best.)); run; data &dimName.Codes; set &dimName.Codes; format codeId1 $&maxCodeDigits..; if _N_ <= 1000 then codeId1 = put((_N_ - 1), z3.); else codeId1 = strip(put(_N_ - 1, best.)); run; * filter out rows that have a patId not in patients; proc sql noprint; create table &dimName.Filtered as select t2.* from patients t1, &dimName.Copy t2 where t1.patId = t2.patId; quit; *generate codeIds for dimName; *add codeId1 to dimName; proc sql noprint; create table &dimName.SQL as select d.patId, d.code, c.codeId1 from &dimName.Filtered d, &dimName.Codes c where d.code = c.code; quit; data &dimName.Codes; set &dimName.SQL; codeId = "&dimName.V" || strip(codeId1); codeType = 'STANDARD'; considerForPS = 'false'; run; * generate links; proc sql noprint; create table &dimName.Links as select %if %eval(&scdmver1. > 7) %then %do; compress(codeId || '|' || put(patId,8.)) as linkId, %end; %else %do; strip(codeId) || '|' || patId as linkId, %end; patId, codeId, count(patId) as numOccurrences from &dimName.Codes group by patId, codeId; quit; *create histogram of numOccurrences for each code; data &dimName.LinksHistogramT; set &dimName.Links; h0 = 0; h1 = 0; h2 = 0; h3 = 0; h4 = 0; h5 = 0; h6 = 0; h7 = 0; h8 = 0; h9 = 0; h10 = 0; run; data &dimName.LinksHistogramT; set &dimName.LinksHistogramT; if numOccurrences = 0 then h0 = 1; if numOccurrences = 1 then h1 = 1; if numOccurrences = 2 then h2 = 1; if numOccurrences = 3 then h3 = 1; if numOccurrences = 4 then h4 = 1; if numOccurrences = 5 then h5 = 1; if numOccurrences = 6 then h6 = 1; if numOccurrences = 7 then h7 = 1; if numOccurrences = 8 then h8 = 1; if numOccurrences = 9 then h9 = 1; if numOccurrences > 9 then h10 = 1; run; proc sql noprint; create table &dimName.LinksHistogram as select distinct(codeId), sum(h0) as h0, sum(h1) as h1, sum(h2) as h2, sum(h3) as h3, sum(h4) as h4, sum(h5) as h5, sum(h6) as h6, sum(h7) as h7, sum(h8) as h8, sum(h9) as h9, sum(h10) as h10 from &dimName.LinksHistogramT group by codeId; quit; * calculate numPatientCodes and numUniquePatientCodes; proc sql noprint; create table &dimName.Counts as select distinct(d.patId), count(d.code) as numPatientCodes, count(distinct(d.code)) as numUniquePatientCodes from patients, &dimName.Filtered d where d.patId = patients.patid group by d.patId; quit; *calculate numUniqueOccurrences; proc sql noprint; create table &dimName.CodeMap as select distinct(code) as code, codeId, codeType, considerForPS, count(distinct(patId)) as numUniqueOccurrences from &dimName.Codes group by code; quit; /** 3) b) Filter Codes for Prevalence *****************************************************/ data &dimName.CodeMap; set &dimName.CodeMap; prevalence = numUniqueOccurrences / &totalPatients.; if prevalence > 0.5 then prevalence = 1.0 - prevalence; run; proc sort data = &dimName.CodeMap; key prevalence / descending; run; * set considerFopPs = 'true' for top_n codes and remove others; data &dimName.CodeMapTopN; set &dimName.CodeMap; considerForPs = 'true'; if _N_ <= &top_n.; run; /** 3) c) i. create service intensity variables *******************************************/ %macro createServiceIntensityVariables(Unique); data &dimName.Frequency&Unique.; set &dimName.Counts; keep patId num&Unique.PatientCodes; run; proc sort data=&dimName.Frequency&Unique.; key num&Unique.PatientCodes; run; %let frequencyTotal = &SYSNOBS; * quartile indices; %let q1i = %sysfunc(round(&frequencyTotal. * 0.25)); %let q2i = %sysfunc(round(&frequencyTotal. * 0.5)); %let q3i = %sysfunc(round(&frequencyTotal. * 0.75)); %let q4i = &frequencyTotal.; %let codeIdUnique = .; %if &Unique. = Unique %then %do; %let codeIdUnique = UNIQ; %end; %if &Unique. = %then %do; %let codeIdUnique = ALL; %end; * initializing if qXi. = 0; %let q1=0; %let q2=0; %let q3=0; %let q4=0; * keep only quartiles and associated patId; data &dimName.Quartiles&Unique.; set &dimName.Frequency&Unique.; if _N_ = &q1i. then do; codeId = "&dimName._INT_&codeIdUnique._Q1"; code = codeId; codeType = 'INTENSITY'; considerForPS = 'true'; call symput('q1', strip(put(num&Unique.PatientCodes, best.))); output; end; if _N_ = &q2i. then do; codeId = "&dimName._INT_&codeIdUnique._Q2"; code = codeId; codeType = 'INTENSITY'; considerForPS = 'true'; call symput('q2', strip(put(num&Unique.PatientCodes, best.))); output; end; if _N_ = &q3i. then do; codeId = "&dimName._INT_&codeIdUnique._Q3"; code = codeId; codeType = 'INTENSITY'; considerForPS = 'true'; call symput('q3', strip(put(num&Unique.PatientCodes, best.))); output; end; if _N_ = &q4i. then do; codeId = "&dimName._INT_&codeIdUnique._Q4"; code = codeId; codeType = 'INTENSITY'; considerForPS = 'true'; call symput('q4', strip(put(num&Unique.PatientCodes, best.))); output; end; run; * add quartiles to codeMap; %ms_appendfiles(&dimName.CodeMapTopN, &dimName.Quartiles&Unique., &dimName.CodeMapTopN(drop=patId num&Unique.PatientCodes)); * create links for quartiles; data &dimName.QuartileLinks; set &dimName.Frequency&Unique.; if num&Unique.PatientCodes <= &q1. then do; codeId = "&dimName._INT_&codeIdUnique._Q1"; %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end; %else %do; linkId = strip(codeId) || '|' || patId; %end; intensityVarValue = 1; end; else if &q1. < num&Unique.PatientCodes <= &q2. then do; codeId = "&dimName._INT_&codeIdUnique._Q2"; %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end; %else %do; linkId = strip(codeId) || '|' || patId; %end; intensityVarValue = 1; end; else if &q2. < num&Unique.PatientCodes <= &q3. then do; codeId = "&dimName._INT_&codeIdUnique._Q3"; %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end; %else %do; linkId = strip(codeId) || '|' || patId; %end; intensityVarValue = 1; end; else do; codeId = "&dimName._INT_&codeIdUnique._Q4"; %if %eval(&scdmver1. > 7) %then %do; linkId = compress(codeId || '|' || put(patId,8.)); %end; %else %do; linkId = strip(codeId) || '|' || patId; %end; intensityVarValue = 1; end; run; *add quartile links to dimNameLinks; data &dimName.Links; format linkId $50. codeId $20.; set &dimName.Links &dimName.QuartileLinks; run; * create service intensity vars for quartiles; proc sort data = &dimName.Frequency&Unique.; by patId; run; proc sort data = patients; by patId; run; data &dimName.ServiceIntensityVars&Unique.T; merge &dimName.Frequency&Unique. patients; by patId; c1MeanOutcome = outcomeContinuous; c1NumEvents = outcomeCount; pt_c1 = followUpTime; e1c1 = 0; e1Missing = 0; e0c1 = 0; e0Missing = 0; d1c1 = 0; d1Missing = 0; d0c1 = 0; d0Missing = 0; if exposed = 'T' then do; e1c1 = 1; end; else do; e0c1 = 1; end; if outcomeDichotomous = 'T' then do; d1c1 = 1; end; else do; d0c1 = 1; end; keep patId c1MeanOutcome c1NumEvents pt_c1 e1c1 e1Missing e0c1 e0Missing d1c1 d1Missing d0c1 d0Missing; run; proc sql noprint; create table &dimName.ServiceIntensityVars&Unique.T2 as select codeId, sum(c1MeanOutcome) as c1MeanOutcome, sum(c1NumEvents) as c1NumEvents, sum(pt_c1) as pt_c1, sum(e1c1) as e1c1, sum(e1Missing) as e1Missing, sum(e0c1) as e0c1, sum(e0Missing) as e0Missing, sum(d1c1) as d1c1, sum(d1Missing) as d1Missing, sum(d0c1) as d0c1, sum(d0Missing) as d0Missing from &dimName.ServiceIntensityVars&Unique.T t1, &dimName.QuartileLinks t2 where t1.patId = t2.patId group by codeId; quit; *add quartile codes that have no associated patients; data &dimName.Quartiles&Unique.T; set &dimName.Quartiles&Unique.; keep codeId; run; proc sql noprint; create table &dimName.ServiceIntensityVars&Unique. as select q.codeId, coalesce(s.c1MeanOutcome, 0) as c1MeanOutcome, coalesce(s.c1NumEvents, 0) as c1NumEvents, coalesce(s.pt_c1, 0) as pt_c1, coalesce(s.e1c1, 0) as e1c1, coalesce(s.e1Missing, 0) as e1Missing, coalesce(s.e0c1, 0) as e0c1, coalesce(s.e0Missing, 0) as e0Missing, coalesce(s.d1c1, 0) as d1c1, coalesce(s.d1Missing, 0) as d1Missing, coalesce(s.d0c1, 0) as d0c1, coalesce(s.d0Missing, 0) as d0Missing from &dimName.Quartiles&Unique.T q full outer join &dimName.ServiceIntensityVars&Unique.T2 s on q.codeId = s.codeId; quit; %mend createServiceIntensityVariables; /** 3) c) i. create service intensity variables ******************************************/ %createServiceIntensityVariables(Unique); /** 3) c) ii. create service intensity variables ******************************************/ %createServiceIntensityVariables(); /** 3) c. Calculate Medians and Bias ... processing codes ... *****************************/ * calculate cumulative histogram; data &dimName.LinksHistogram; set &dimName.LinksHistogram; c0 = h0; c1 = sum(c0, h1); c2 = sum(c1, h2); c3 = sum(c2, h3); c4 = sum(c3, h4); c5 = sum(c4, h5); c6 = sum(c5, h6); c7 = sum(c6, h7); c8 = sum(c7, h8); c9 = sum(c8, h9); c10 = sum(c9, h10); run; /** 3) c. iii. Calculate Median - median, q3 **********************************************/ %macro findPoint(cumulative, search, start); point = .; if &search. <= &cumulative.[&start.] then do; point = &start.; end; else do; do i = 2 to 10; if &search. > &cumulative.[i - 1] and &search. <= &cumulative.[i] then leave; end; point = (i - 1); end; %mend findPoint; %macro calcPercentile(percentile, cumulative, total, storeIn); center = &percentile. * &total.; fCenter = floor(center); isEven = (fCenter = center); if isEven then do; *find the two center points; %findPoint(&cumulative., fCenter, 1); v1 = point; %findPoint(&cumulative., fCenter + 1, v1); v2 = point; *average them; &storeIn. = (v1 + v2) / 2; end; else do; *only one center point; cCenter = ceil(center); %findPoint(&cumulative., cCenter, 1); &storeIn. = point; end; %mend calcPercentile; data &dimName.LinksHistogram; set &dimName.LinksHistogram; array cHist c0 - c10; %calcPercentile(0.5, cHist, c10, median); %calcPercentile(0.75, cHist, c10, q3); keep codeId median q3; run; /** 3) c. iv. Mark Occurrence Type ********************************************************/ proc sql noprint; create table &dimName.OccurrenceTypes as select l.linkId, l.codeId, l.patId, l.numOccurrences, c.median, c.q3 from &dimName.Links l, &dimName.LinksHistogram c where l.codeId = c.codeId; quit; data &dimName.OccurrenceTypes; set &dimName.OccurrenceTypes; setanyVar = (numOccurrences >= 1); setoftenVar = (numOccurrences >= median); setFrequentVar = (numOccurrences >= q3); oftenVarMissing = (median = 1); frequentVarMissing = (q3 = median); anyVarValue = 0; oftenVarValue = 0; frequentVarValue = 0; if setanyVar then anyVarValue = 1; if setoftenVar then do; if ~oftenVarMissing then oftenVarValue = 1; else oftenVarValue = .; end; if setFrequentVar then do; if ~frequentVarMissing then frequentVarValue = 1; else frequentVarValue = .; end; drop setanyVar setoftenVar setFrequentVar oftenVarMissing frequentVarMissing; run; proc sql noprint; create table &dimName.CodeVars as select o.codeId, o.anyVarValue, o.oftenVarValue, o.frequentVarValue, p.patId, p.outcomeContinuous, p.outcomeCount, p.followUpTime, p.exposed, p.outcomeDichotomous from &dimName.OccurrenceTypes o, patients p where o.patId = p.patId; quit; data &dimName.CodeVars; set &dimName.CodeVars; c1MeanOutcome = 0; c1NumEvents = 0; pt_c1 = 0; e1Missing = 0; e1c1 = 0; e0Missing = 0; e0c1 = 0; d1Missing = 0; d1c1 = 0; d0Missing = 0; d0c1 = 0; run; %macro updateVarCounts(type, value); data &dimName.CodeVars&Type.T; set &dimName.CodeVars; if &value. ~= 0 then do; c1MeanOutcome = c1MeanOutcome + outcomeContinuous; c1NumEvents = c1NumEvents + outcomeCount; if &value. = 1 then pt_c1 = pt_c1 + followUpTime; if exposed = 'T' then do; if &value. = . then e1Missing = e1Missing + 1; else if &value. = 1 then e1c1 = e1c1 + 1; end; else do; if &value. = . then e0Missing = e0Missing + 1; else if &value. = 1 then e0c1 = e0c1 + 1; end; if outcomeDichotomous = 'T' then do; if &value. = . then d1Missing = d1Missing + 1; else if &value. = 1 then d1c1 = d1c1 + 1; end; else do; if &value. = . then d0Missing = d0Missing + 1; else if &value. = 1 then d0c1 = d0c1 + 1; end; end; run; proc sql noprint; create table &dimName.CodeVars&Type. as select distinct(codeId) as codeId, sum(c1MeanOutcome) as c1MeanOutcome, sum(c1NumEvents) as c1NumEvents, sum(pt_c1) as pt_c1, sum(e1c1) as e1c1, sum(e1Missing) as e1Missing, sum(e0c1) as e0c1, sum(e0Missing) as e0Missing, sum(d1c1) as d1c1, sum(d1Missing) as d1Missing, sum(d0c1) as d0c1, sum(d0Missing) as d0Missing from &dimName.CodeVars&Type.T group by codeId; quit; %mend updateVarCounts; %updateVarCounts(Any, anyVarValue); %updateVarCounts(Often, oftenVarValue); %updateVarCounts(Frequent, frequentVarValue); /** 3) c. v. Calculate Bias ***************************************************************/ data patientTotals; set patientTotals; nTotal = &totalPatients.; e1Total = nExposed; e0Total = nTotal - e1Total; d1Total = nOutcome; d0Total = nTotal - d1Total; run; %macro calculateBias(datasetName); data &datasetName.; set &datasetName.; if _N_ = 1 then set patientTotals; bias = 0; nMissing = e1Missing + e0Missing; N = nTotal - nMissing; e1 = e1Total - e1Missing; e1c0 = e1 - e1c1; e0 = e0Total - e0Missing; e0c0 = e0 - e0c1; d1 = d1Total - d1Missing; d1c0 = d1 - d1c1; d0 = d0Total - d0Missing; d0c0 = d0 - d0c1; c1 = e1c1 + e0c1; c0 = N - c1; pt = ptTotal; pt_e1 = ptExposed; pt_e0 = ptTotal - ptExposed; pt_c0 = ptTotal - pt_c1; numEvents = numEvents; c0NumEvents = numEvents - c1NumEvents; *so far, these are running totals. divide to get means.; meanOutcome = sumOfOutcomes / nTotal; if c1 > 0 then c1MeanOutcome = c1MeanOutcome / c1; else c1MeanOutcome = -1; if c0 > 0 then c0MeanOutcome = (((meanOutcome * nTotal) - (c1MeanOutcome * c1)) / c0); else c0MeanOutcome = -1; if e1 > 0 then pc_e1 = e1c1 / e1; else pc_e1 = 0; if e0 > 0 then pc_e0 = e0c1 / e0; else pc_e0 = 0; if pc_e1 > 0.5 then pc_e1 = 1.0 - pc_e1; if pc_e0 > 0.5 then pc_e0 = 1.0 - pc_e0; * TODO use an "i.n.v.a.l.i.d." value like java instead of . ?; rrCe = .; expAssocRankingVariable = .; rrCd = .; bias = .; biasRankingVariable = .; outcomeAssocRankingVariable = .; if pc_e0 > 0 and pc_e1 > 0 then do; rrCe = pc_e1 / pc_e0; expAssocRankingVariable = abs(log(rrCe)); end; if &outcome_type. = 'DICHOTOMOUS' or &outcome_type. = 'BINARY' or &outcome_type. = 'COUNT' then do; if &outcome_zero_cell_corr. = 1 then do; rrCd = (((c1NumEvents + 0.1) / (pt_c1 + 0.1)) / ((c0NumEvents + 0.1) / (pt_c0 + 0.1))); end; else if ((c1NumEvents > 0) and (pt_c1 > 0) and (c0NumEvents > 0) and (pt_c0 > 0)) then do; rrCd = ((c1NumEvents / pt_c1) / (c0NumEvents / pt_c0)); end; if (rrCd > 0) then outcomeAssocRankingVariable = abs(log(rrCd)); end; else if &outcome_type. = 'CONTINUOUS' then do; rrCd = c1MeanOutcome - c0MeanOutcome; outcomeAssocRankingVariable = abs(rrCd); end; if ((pc_e0 > 0) && (pc_e1 > 0) && (rrCd > 0)) then do; biasA = pc_e1 * (rrCd - 1.0) + 1.0; biasB = pc_e0 * (rrCd - 1.0) + 1.0; bias = biasA / biasB; biasRankingVariable = abs(log(bias)); end; run; %mend calculateBias; %calculateBias(&dimName.CodeVarsany); %calculateBias(&dimName.CodeVarsoften); %calculateBias(&dimName.CodeVarsFrequent); %calculateBias(&dimName.ServiceIntensityVarsUnique); %calculateBias(&dimName.ServiceIntensityVars); * update cumulative data sets; * cannot use proc datasets append because it truncates character variables; data allVarsT; set &dimName.CodeVarsany(in = any) &dimName.CodeVarsoften(in = often) &dimName.CodeVarsFrequent(in = frequent) &dimName.ServiceIntensityVarsUnique(in = intensity) &dimName.ServiceIntensityVars(in = unique); format type $10.; if any = 1 then type = 'Any'; if often = 1 then type = 'Often'; if frequent = 1 then type = 'Frequent'; if unique = 1 then type = 'ServiceInt'; if intensity = 1 then type = 'ServiceInt'; run; %ms_appendfiles(allVars, allVarsT, allVars); %ms_appendfiles(allCodes, &dimName.CodeMapTopN, allCodes); %ms_appendfiles(allOccurrenceTypes, &dimName.OccurrenceTypes, allOccurrenceTypes); %ms_appendfiles(allLinks, &dimName.Links, allLinks); %ms_appendfiles(allCodesHistogram, &dimName.LinksHistogram, allCodesHistogram); %ms_appendfiles(allCodeMaps, &dimName.CodeMap, allCodeMaps); data allDimNamesT; dimName = "&dimName."; dimDataSet = "&dimDataSet."; output; run; %ms_appendfiles(allDimNames, allDimNamesT, allDimNames); %mend; *This is where all the processing is done using the dimension files that were passed on (typically 7 in QRP); data _NULL_; %do dimIndex = 1 %to 10; %let tDataSetName = %scan(&&input_dim&dimIndex, 1, ' '); %let tCodeVar = %scan(&&input_dim&dimIndex, 2, ' '); %if &tDataSetName ~= and %sysfunc(exist(&tDataSetName)) %then %do; %if &dimIndex < 10 %then %do; %let dimName = D0&dimIndex.; %end; %else %do; %let dimName = D&dimIndex.; %end; %readDimension(&dimName., &tDataSetName, &tCodeVar); %end; %else %if &&input_dim&dimIndex ~= %then %do; %put ERROR: (Sentinel) readDimension(&&input_dim&dimIndex), "&&input_dim&dimIndex" is invalid.; %end; %end; run; /** 4) Select and Output Variables: *******************************************************/ /** 4) a) Choose Variables to Consider: ***************************************************/ data allConsideredCodes; set allCodes; if considerForPS = 'true' then output; run; proc sql noprint; create table variablesToConsider as select v.codeId, c.code, v.e1, v.e0, v.pt_e1, v.pt_e0, v.d1, v.d0, v.c1, v.c0, v.pt_c1, v.pt_c0, v.e1c1, v.e1c0, v.e0c1, v.e0c0, v.d1c1, v.d1c0, v.d0c1, v.d0c0, v.pc_e1, v.pc_e0, v.numEvents, v.c1NumEvents, v.c0NumEvents, v.meanOutcome, v.c1MeanOutcome, v.c0MeanOutcome, v.rrCe, v.rrCd, v.bias, v.expAssocRankingVariable, v.outcomeAssocRankingVariable, v.biasRankingVariable, v.type from allVars v, allConsideredCodes c where v.codeId = c.codeId; quit; * ranking_method in {BIAS, EXP_ASSOC, OUTCOME_ASSOC, NO_RANKING}; data variablesToConsider; set variablesToConsider; if &ranking_method. = 'EXP_ASSOC' then do; activeRankingVariable = expAssocRankingVariable; end; else if &ranking_method. = 'OUTCOME_ASSOC' then do; activeRankingVariable = outcomeAssocRankingVariable; end; else if &ranking_method. = 'BIAS' then do; activeRankingVariable = biasRankingVariable; end; else do; activeRankingVariable = 0; end; run; *keep only k variables with greatest activeRankingVariable; proc sort data = variablesToConsider; by descending activeRankingVariable code; run; data variablesToConsider; set variablesToConsider; format selectedForPs $5.; if _N_ <= &k. and activeRankingVariable ~= . then selectedForPs = 'true'; else selectedForPs = 'false'; run; data selectedVariables; set variablesToConsider; if selectedForPs = 'true'; run; /** 4) b) Calculate Z Bias: ***************************************************************/ data outcomeStrengths; set selectedVariables; if expAssocRankingVariable ~= . and outcomeAssocRankingVariable ~= .; keep outcomeAssocRankingVariable code codeId type; run; proc sort data = outcomeStrengths; by outcomeAssocRankingVariable descending code; run; data outcomeStrengths; set outcomeStrengths; i = _N_; run; proc univariate data = outcomeStrengths noprint PCTLDEF=4; var outcomeAssocRankingVariable; output pctlpre=P pctlpts=50 out=outcomeStrengthsP; run; data _NULL_; set outcomeStrengthsP; if _N_ = 1 then do; call symput('P50', strip(put(P50, best.))); end; run; %macro bSearch (array=,key=,return=); %local l m h; %let l=l%substr(%sysfunc(ranuni(0)),3,7); %let m=m%substr(%sysfunc(ranuni(0)),3,7); %let h=h%substr(%sysfunc(ranuni(0)),3,7); drop &l &m &h &return; &l=lbound(&array); &h=hbound(&array); &return=.; do until (&h < &l); &m = floor((&l + &h) * 0.5); if round(&key, 0.0000000001) < round(&array(&m), 0.0000000001) then &h=&m-1; else if round(&key, 0.0000000001) > round(&array(&m), 0.0000000001) then &l=&m+1; else do; &return=&m; leave; end; end; * no exact match found. return same value as java Arrays.binarySearch(); *** the index of the first element greater than the key; if &return = . then do; do i = &l to &h; if &array(i) > &key then do; leave; end; end; &return = i; end; %mend; proc transpose data = outcomeStrengths out = outcomeStrengthsT prefix = ost; var outcomeAssocRankingVariable; run; data outcomeStrengthsT; set outcomeStrengthsT; drop _NAME_; run; data _NULL_; set outcomeStrengthsT; array ost _ALL_; %bSearch(array=ost, key=&P50., return=x); * one based arry to zero based array; call symput('zMedian', strip(put((x - 1), best.))); run; data weakOutcomeStrengths; set outcomeStrengths; if _N_ <= &zMedian.; run; proc univariate data = weakOutcomeStrengths noprint PCTLDEF = 4; var outcomeAssocRankingVariable; output pctlpre = P pctlpts = 20, 40, 60, 80 out = weakOutcomeStrengthsP; run; data _NULL_; set weakOutcomeStrengthsP; if _N_ = 1 then do; call symput('P20', strip(put(P20, best.))); call symput('P40', strip(put(P40, best.))); call symput('P60', strip(put(P60, best.))); call symput('P80', strip(put(P80, best.))); end; run; data _NULL_; set outcomeStrengthsT; if _N_ = 1 then do; array ost _ALL_; %bSearch(array=ost, key=&P20., return=i20); %bSearch(array=ost, key=&P40., return=i40); %bSearch(array=ost, key=&P60., return=i60); %bSearch(array=ost, key=&P80., return=i80); array startsOfQuintile{7}; * first elements is always 0; startsOfQuintile{1} = 0; * one based arry to zero based array; startsOfQuintile{2} = i20 - 1; startsOfQuintile{3} = i40 - 1; startsOfQuintile{4} = i60 - 1; startsOfQuintile{5} = i80 - 1; startsOfQuintile{6} = &zMedian.; declare hash zBiasScores(); zBiasScores.defineKey('i'); zBiasScores.defineData('i', 'score'); zBiasScores.defineDone(); do quintile = 1 to 5; do i = startsOfQuintile{quintile} to (startsOfQuintile{quintile + 1} - 1); score = 6 - quintile; zBiasScores.add(); end; end; zBiasScores.output(dataset: "zBiasScores"); end; run; * zero based array to one based array; data zBiasScores; set zBiasScores; i = i + 1; run; proc sql noprint; create table zBiasScoresMerged as select z.score, o.codeId, o.type from zBiasScores z, outcomeStrengths o where z.i = o.i; quit; proc sql noprint; create table variablesToConsiderScored as select v.*, coalesce(z.score, 0) as zBiasScore from variablesToConsider v full outer join zBiasScoresMerged z on v.codeId = z.codeId and v.type = z.type; quit; /** 4) c) Output Variables and Dimensions: ************************************************/ /** output_all_vars ***********************************************************************/ proc sql noprint; create table output_all_varsT as select substr(codeId, 1, 3) as dimensionLU, code as code_id, strip(codeId) || type as var_name, selectedForPs as selected_for_ps, e1, e0, pt_e1, pt_e0, d1, d0, c1, c0, pt_c1, pt_c0, e1c1, e1c0, e0c1, e0c0, d1c1, d1c0, d0c1, d0c0, pc_e1, pc_e0, numEvents as num_events, c1NumEvents as c1_num_events, c0NumEvents as c0_num_events, meanOutcome as mean_outcome, c1MeanOutcome as c1_mean_outcome, c0MeanOutcome as c0_mean_outcome, rrCe as rr_ce, rrCd as rr_cd, bias, expAssocRankingVariable as exp_assoc_ranking_var, outcomeAssocRankingVariable as outcome_assoc_ranking_var, biasRankingVariable as bias_assoc_ranking_var, zBiasScore as z_bias_score from variablesToConsiderScored order by var_name; * lookup name of original dimension dataset; create table output_all_vars as select d.dimDataSet as dimension, v.code_id, v.var_name, v.selected_for_ps, v.e1, v.e0, v.pt_e1, v.pt_e0, v.d1, v.d0, v.c1, v.c0, v.pt_c1, v.pt_c0, v.e1c1, v.e1c0, v.e0c1, v.e0c0, v.d1c1, v.d1c0, v.d0c1, v.d0c0, v.pc_e1, v.pc_e0, v.num_events, v.c1_num_events, v.c0_num_events, v.mean_outcome, v.c1_mean_outcome, v.c0_mean_outcome, v.rr_ce, v.rr_cd, v.bias, v.exp_assoc_ranking_var, v.outcome_assoc_ranking_var, v.bias_assoc_ranking_var, v.z_bias_score from output_all_varsT v, allDimNames d where d.dimName = v.dimensionLU; quit; *match java formatting for easy comparison; data output_all_vars; set output_all_vars; format psestimategrp $40. e1 e0 pt_e1 pt_e0 d1 d0 c1 c0 pt_c1 pt_c0 e1c1 e1c0 e0c1 e0c0 d1c1 d1c0 d0c1 d0c0 pc_e1 pc_e0 num_events c1_num_events c0_num_events mean_outcome c1_mean_outcome c0_mean_outcome rr_ce rr_cd bias exp_assoc_ranking_var outcome_assoc_ranking_var bias_assoc_ranking_var 18.10; psestimategrp = "&psestimategrp."; run; /** output_dimension_codes ****************************************************************/ proc sql noprint; create table output_dimension_codesT as select coalesce(h.codeId, c.codeId) as codeId, substr(coalesce(h.codeId, c.codeId), 2, 2) as dimension_num, substr(coalesce(h.codeId, c.codeId), 1, 3) as dimension, c.considerForPs as consider_for_selection, c.prevalence as prevalence, h.median, h.q3 from allCodes c full outer join allCodesHistogram h on c.codeId = h.codeId; create table output_dimension_codesT2 as select o.dimension_num, o.dimension as dimensionLU, coalesce(a.code, o.codeId) as code_id, /* for INTENSITY codes */ a.numUniqueOccurrences as num_patients, o.consider_for_selection, a.prevalence, o.median, o.q3 from output_dimension_codesT o full outer join allCodeMaps a on o.codeId = a.codeId; * lookup name of original dimension dataset; create table output_dimension_codes as select c.dimension_num, d.dimDataSet as dimension, c.code_id, c.num_patients, c.consider_for_selection, c.prevalence, c.median, c.q3 from output_dimension_codesT2 c, allDimNames d where d.dimName = c.dimensionLU; quit; data output_dimension_codes; set output_dimension_codes; format prevalence median q3 18.10; if num_patients = . then num_patients = 0; if prevalence = . then prevalence = -1; if median = . then median = 0; if q3 = . then q3 = 0; if consider_for_selection ~= 'true' then do; consider_for_selection = 'false'; median = 0; q3 = 0; end; run; /** generate cohorts ****************************************************************/ *** create an adjacency matrix with selectedVariables on top and ALL patIds on left; proc sql noprint; create table allLinksWithOccurrenceTypes as select l.linkId, l.codeId, l.patId, o.anyVarValue, o.oftenVarValue, o.frequentVarValue, l.intensityVarValue from allLinks l full outer join allOccurrenceTypes o on l.linkId = o.linkId; quit; proc sql noprint; create table cohort as select trim(s.codeId) || s.type as code, s.type, l.patId, l.anyVarValue, l.oftenVarValue, l.frequentVarValue, l.intensityVarValue from selectedVariables s full outer join allLinksWithOccurrenceTypes l on s.codeId = l.codeId; quit; * filter output_full_cohort, remove all rows where {type}VarValue ~= 1; data cohortT; set cohort; if type = 'Any' then varValue = anyVarValue; else if type = 'Often' then varValue = oftenVarValue; else if type = 'Frequent' then varValue = frequentVarValue; else if type = 'ServiceInt' then varValue = intensityVarValue; outputVar = 0; if varValue = . then outputVar = 0; else if varValue = 1 then outputVar = 1; if outputVar = 1; run; * add patients without links; proc sql noprint; create table allPatIds as select distinct(patId) from patients; create table cohort as select p.patId, c.code from cohortT c full outer join allPatIds p on c.patId = p.patId; quit; * build an adjacency matrix; * get column names; proc sql noprint; create table meta as select distinct(trim(left(codeId)) || trim(left(type))) as code from selectedVariables order by code; quit; * build case and sum statements for the following two SQL blocks; * case when code = 'D01V000Any' then 1 else 0 end as D01V000Any, ; * sum(D01V000Any) as D01V000Any, ; data meta; set meta; metaSQLCase = "case when code = '" || trim(left(code)) || "' then 1 else 0 end as " || trim(left(code)) || " length = 3"; metaSQLSum = 'max(' || trim(left(code)) || ') as ' || trim(left(code)) ||' length = 3'; run; proc sql noprint; select metaSQLCase into :metaSQLCaseAll separated by ',' from meta; select metaSQLSum into :metaSQLSumAll separated by ',' from meta; quit; proc sql noprint; create table adjacencyCohort as select patId, &metaSQLCaseAll. from cohort; quit; * remove duplicates using sum() as values are in {0,1}; proc sql noprint; create table output_full_cohort as select distinct(patId) as patient_id, &metaSQLSumAll. from adjacencyCohort group by patId; quit; %mend ms_HDVariableSelection;