From 224eef9ffa6273cb1a535cf8183284969767a864 Mon Sep 17 00:00:00 2001 From: Roy Pardee Date: Tue, 30 Dec 2014 06:34:37 -0500 Subject: [PATCH] Added support for SAS syntax. --- .gitmodules | 3 + grammars.yml | 3 + lib/linguist/languages.yml | 2 +- samples/SAS/detect_phi.sas | 275 +++++++++++++++++++++++++++++++++++ vendor/grammars/sas.tmbundle | 1 + 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 samples/SAS/detect_phi.sas create mode 160000 vendor/grammars/sas.tmbundle diff --git a/.gitmodules b/.gitmodules index 405a88e5..d9d11dee 100644 --- a/.gitmodules +++ b/.gitmodules @@ -520,3 +520,6 @@ [submodule "vendor/grammars/IDL-Syntax"] path = vendor/grammars/IDL-Syntax url = https://github.com/andik/IDL-Syntax +[submodule "vendor/grammars/sas.tmbundle"] + path = vendor/grammars/sas.tmbundle + url = https://github.com/rpardee/sas.tmbundle diff --git a/grammars.yml b/grammars.yml index d6c679ff..5946966a 100644 --- a/grammars.yml +++ b/grammars.yml @@ -366,6 +366,9 @@ vendor/grammars/ruby-slim.tmbundle: vendor/grammars/ruby.tmbundle: - source.ruby - text.html.erb +vendor/grammars/sas.tmbundle: +- source.SASLog +- source.sas vendor/grammars/scala.tmbundle: - source.sbt - source.scala diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index aa263926..bc7345fc 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -2585,7 +2585,7 @@ SAS: color: "#1E90FF" extensions: - .sas - tm_scope: none + tm_scope: source.sas ace_mode: text SCSS: diff --git a/samples/SAS/detect_phi.sas b/samples/SAS/detect_phi.sas new file mode 100644 index 00000000..5784d74f --- /dev/null +++ b/samples/SAS/detect_phi.sas @@ -0,0 +1,275 @@ +%macro check_dataset(dset =, obs_lim = max, eldest_age = 89) ; + %local i ; + %local inset_name ; + %let inset_name = &dset ; + + %if %lowcase(&obs_lim) = max %then %do ; + %** Nothing ; + %end ; + %else %do ; + proc surveyselect + data = &inset_name + out = __sub_dset + method = srs + sampsize = &obs_lim SELECTALL + seed = 1234567 + noprint + ; + run; + %let dset = __sub_dset ; + %end ; + + %macro check_varname(regx, msg) ; + create table possible_bad_vars as + select name, label + from these_vars + where prxmatch(compress("/(®x)/i"), name) + ; + + %if &sqlobs > 0 %then %do ; + insert into phi_warnings(dset, variable, label, warning) + select "&inset_name" as dset, name, label, "&msg" + from possible_bad_vars + ; + %end ; + + %mend check_varname ; + + %macro check_vars_for_mrn(length_limit = 6, obs_lim = max) ; + %local char ; + %let char = 2 ; + proc sql noprint ; + select name + into :mrn_array separated by ' ' + from these_vars + where type = &char and length ge &length_limit + ; + quit ; + %if &sqlobs > 0 %then %do ; + %put Checking these vars for possible MRN contents: &mrn_array ; + data __gnu ; + retain + mrn_regex_handle + badcount + ; + set &inset_name (obs = &obs_lim keep = &mrn_array) ; + if _n_ = 1 then do ; + mrn_regex_handle = prxparse("/&mrn_regex/") ; + badcount = 0 ; + end ; + array p &mrn_array ; + do i = 1 to dim(p) ; + if prxmatch(mrn_regex_handle, p{i}) then do ; + badvar = vname(p{i}) ; + badvalue = p{i} ; + badcount = _n_ ; + output ; + end ; + keep badvar badvalue badcount ; + end ; + run ; + proc sql noprint ; + select compress(put(max(badcount), best.)) + into :badcount + from __gnu + ; + insert into phi_warnings(dset, variable, warning) + select distinct "&inset_name", badvar, "Could this var hold MRN values? Contents of %trim(&badcount) records match the pattern given for MRN values. MRNs should never move across sites." + from __gnu ; + drop table __gnu ; + quit ; + %end ; + %mend check_vars_for_mrn ; + + %macro check_vars_for_oldsters(eldest_age = 89, obs_lim = max) ; + %local dtfmts ; + %let dtfmts = 'B8601DA','B8601DN','B8601DT','B8601DZ','B8601LZ','B8601TM','B8601TZ','DATE','DATEAMPM','DATETIME','DAY','DDMMYY', + 'DDMMYYB','DDMMYYC','DDMMYYD','DDMMYYN','DDMMYYP','DDMMYYS','DOWNAME','DTDATE','DTMONYY','DTWKDATX','DTYEAR', + 'DTYYQC','E8601DA','E8601DN','E8601DT','E8601DZ','E8601LZ','E8601TM','E8601TZ','HHMM','HOUR','JULDAY','JULIAN', + 'MMDDYY','MMDDYYB','MMDDYYC','MMDDYYD','MMDDYYN','MMDDYYP','MMDDYYS','MMSS','MMYY','MMYY','MONNAME','MONTH','MONYY', + 'PDJULG','PDJULI','QTR','QTRR','WEEKDATE','WEEKDATX','WEEKDAY','WEEKU','WEEKV','WEEKW','WORDDATE','WORDDATX', + 'YEAR','YYMM','YYMMC','YYMMD','YYMMN','YYMMP','YYMMS','YYMMDD','YYMMDDB','YYMMDDC','YYMMDDD','YYMMDDN','YYMMDDP', + 'YYMMDDS','YYMON','YYQ','YYQC','YYQD','YYQN','YYQP','YYQS','YYQR','YYQRC','YYQRD','YYQRN','YYQRP','YYQRS' ; + + %local num ; + %let num = 1 ; + + proc sql noprint ; + select name + into :dat_array separated by ' ' + from these_vars + where type = &num and (format in (&dtfmts) or lowcase(name) like '%date%') + ; + /* added by cb to shorten the process of looking at all dates */ + %if &sqlobs > 0 %then %do ; + %put Checking these vars for possible DOB contents: &dat_array ; + select 'min(' || trim(name) || ') as ' || name into :var_list separated by ',' + from these_vars + where type = &num and (format in (&dtfmts) or lowcase(name) like '%date%') + ; + create table __gnu as + select &var_list from &inset_name + ; + /* end cb additions */ + quit ; + data __gnu ; + set __gnu (obs = &obs_lim keep = &dat_array) ; + array d &dat_array ; + do i = 1 to dim(d) ; + if n(d{i}) then maybe_age = %calcage(bdtvar = d{i}, refdate = "&sysdate9."d) ; + if maybe_age ge &eldest_age then do ; + badvar = vname(d{i}) ; + badvalue = d{i} ; + output ; + end ; + keep badvar badvalue maybe_age ; + end ; + run ; + proc sql outobs = 30 nowarn ; + insert into phi_warnings(dset, variable, warning) + select distinct "&inset_name", badvar, "If this is a date, at least one value is " || compress(put(maybe_age, best.)) || " years ago, which is older than &eldest_age.. " || + "If this date applies to a person, the record is probably PHI." + from __gnu ; + drop table __gnu ; + quit ; + %end ; + %else %do ; + %put No obvious date variables found in &inset_name.--skipping age checks. ; + %end ; + %mend check_vars_for_oldsters ; + + proc contents noprint data = &inset_name out = these_vars ; + run ; + + proc sql noprint ; + create table phi_warnings (dset char(50), variable char(256), label char(256), warning char(200)) ; + + %check_varname(regx = mrn|hrn , msg = %str(Name suggests this var may be an MRN, which should never move across sites.)) ; + %check_varname(regx = birth_date|BirthDate|DOB|BDate , msg = %str(Name suggests this var may be a date of birth.)) ; + %check_varname(regx = SSN|SocialSecurityNumber|social_security_number|socsec, msg = %str(Name suggests this var may be a social security number.)) ; + + %if %symexist(locally_forbidden_varnames) %then %do ; + %check_varname(regx = &locally_forbidden_varnames, msg = %str(May be on the locally defined list of variables not allowed to be sent to other sites.)) ; + %end ; + + quit ; + + %check_vars_for_mrn(obs_lim = &obs_lim) ; + %check_vars_for_oldsters(obs_lim = &obs_lim, eldest_age = &eldest_age) ; + + title3 "WARNINGS for dataset &inset_name:" ; + + proc sql noprint ; + select count(*) as num_warns into :num_warns from phi_warnings ; + + %if &num_warns = 0 %then %do ; + reset print outobs = 5 NOWARN ; + select "No obvious PHI-like data elements in &inset_name--BUT PLEASE INSPECT THE CONTENTS AND PRINTs TO FOLLOW" as x label = "No warnings for &inset_name" + from &inset_name + ; + %do i = 1 %to 5 ; + %put No obvious phi-like data elements in &inset_name. BUT PLEASE INSPECT THE CONTENTS AND PRINTs CAREFULLY TO MAKE SURE OF THIS! ; + %end ; + %end ; + %else %do ; + reset print ; + select variable, warning from phi_warnings + order by variable, warning + ; + quit ; + %end ; + title3 "Dataset &inset_name" ; + proc contents data = &inset_name varnum ; + run ; + /* + proc print data = &inset_name (obs = 20) ; + run ; + */ + ** TODO: make the print print out recs that trip the value warnings. ; + proc sql number ; + select * + from &inset_name (obs = 20) + ; + quit ; + + quit ; + + %RemoveDset(dset = __sub_dset) ; + %RemoveDset(dset = possible_bad_vars) ; + %RemoveDset(dset = phi_warnings) ; + %RemoveDset(dset = these_vars) ; + +%mend check_dataset ; + +%macro detect_phi(transfer_lib, obs_lim = max, eldest_age = 89) ; + + %put ; + %put ; + %put ============================================================== ; + %put ; + %put Macro detect_phi: ; + %put ; + %put Checking all datasets found in %sysfunc(pathname(&transfer_lib)) for the following signs of PHI: ; + %put - Variable names signifying sensitive items like 'MRN', 'birth_date', 'SSN' and so forth. ; + %if %symexist(locally_forbidden_varnames) %then %do ; + %put - Variable names on the list defined in the standard macro variable locally_forbidden_varnames (here those names are: &locally_forbidden_varnames). ; + %end ; + %put - Contents of CHARACTER variables that match the pattern given in the standard macro variable mrn_regex (here that var is &mrn_regex) ; + %put Please note that numeric variables ARE NOT CHECKED FOR MRN-LIKE CONTENT. ; + %put - The contents of date variables (as divined by their formats) for values that, if they were DOBs, would indicate a person older than &eldest_age years. ; + %put ; + %put THIS IS BETA SOFTWARE-PLEASE SCRUTINIZE THE RESULTS AND REPORT PROBLEMS TO pardee.r@ghc.org. ; + %put ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put THIS MACRO IS NOT A SUBSTITUTE FOR HUMAN INSPECTION AND THOUGHT--PLEASE CAREFULLY INSPECT ALL VARIABLES--WHETHER ; + %put OR NOT THEY TRIP A WARNING--TO MAKE SURE THE DATA COMPORTS WITH YOUR DATA SHARING AGREEMENT!!! ; + %put ; + %put ; + %put ============================================================== ; + %put ; + %put ; + + title1 "PHI-Detection Report for the datasets in %sysfunc(pathname(&transfer_lib))." ; + title2 "please inspect all output carefully to make sure it comports with your data sharing agreement!!!" ; + + proc sql noprint ; + ** describe table dictionary.tables ; + + select trim(libname) || '.' || memname as dset + into :d1-:d999 + from dictionary.tables + where libname = "%upcase(&transfer_lib)" AND + memtype = 'DATA' + ; + %local num_dsets ; + %let num_dsets = &sqlobs ; + quit ; + + %local i ; + + %if &num_dsets = 0 %then %do i = 1 %to 10 ; + %put ERROR: NO DATASETS FOUND IN &transfer_lib!!!! ; + %end ; + + %do i = 1 %to &num_dsets ; + %put about to check &&d&i ; + %check_dataset(dset = &&d&i, obs_lim = &obs_lim, eldest_age = &eldest_age) ; + %end ; + +%mend detect_phi ; diff --git a/vendor/grammars/sas.tmbundle b/vendor/grammars/sas.tmbundle new file mode 160000 index 00000000..0d864a6b --- /dev/null +++ b/vendor/grammars/sas.tmbundle @@ -0,0 +1 @@ +Subproject commit 0d864a6bc89ca418e9516249774fc9ab7c792a79