diff --git a/SDG - Generate Synthetic Data through SMOTE/README.md b/SDG - Generate Synthetic Data through SMOTE/README.md index 6ed13ab3..1833d134 100755 --- a/SDG - Generate Synthetic Data through SMOTE/README.md +++ b/SDG - Generate Synthetic Data through SMOTE/README.md @@ -39,6 +39,20 @@ This video (click on below image to play) provides a basic idea: 3. [hnswlib](https://pypi.org/project/hnswlib/) 4. [protobuf](https://pypi.org/project/protobuf/) +### (OPTIONAL) Prerequisites for Singling Out Risk calculation + +If you want to measure singling out risk (provided as an option in this step), note the following additional prerequisites: + +1. The SAS compute session should be configured to access a Python runtime with a version greater than 3.7 and less than 3.12. + +2. A Python package - [anonymeter](https://pypi.org/project/anonymeter/) - should be installed in the above runtime. Make note of details about anonymeter at https://pypi.org/project/anonymeter/ + +3. As a further dependency of the above, anonymeter requires a NumPy version of at least 1.22 and less than 1.27 (specifically, "numpy >=1.22, <1.27", # limited by Numba support) + +Note terms of anonymeter license here: https://github.com/statice/anonymeter/blob/main/LICENSE.md + +Note citation in [Privacy Risk](#privacy-risk) section below. + ----- ## Parameters ---- @@ -55,6 +69,40 @@ This video (click on below image to play) provides a basic idea: 5. Select a class column (column selector, optional): select a column if you wish to use SMOTE in order to balance or augment a level within the class column. Be judicious in the choice of this column since a column with a high number of levels may slow down or even fail the process. Your class column is required to be in the inputs column list. 6. Class to augment (drop-down list, values from class column if selected): select the level of the class variable you wish to augment. 
The values that appear here depend on the data that's contained in the class column, so may take time to populate based on actual data and number of levels. +---- +### Privacy Risk +Synthetic data requires assurances on data privacy. One aspect of privacy risk is singling out risk, which evolved alongside the General Data Protection Regulation (GDPR). **This is an optional step.** If you wish to measure singling out risk, enter the parameters below. + +1. **Measure Singling Out Risk** (check box, default not checked): select this option if you want to measure singling out risk. Be aware of the Python dependencies (in Prerequisites section) and the fact that this involves a longer runtime in addition to the generation operation. + +2. **Evaluation mode** (drop-down list): select either univariate or multivariate to define the type of attack query to be tested. + +3. **Confidence interval** (percentage, numeric stepper): select a number from 90 to 99 to define the confidence level while providing privacy risk estimates. + +4. **Number of attacks** (numeric stepper, default 100): enter the number of attacks (queries) to simulate. + +5. **Singling Out Risk Results table** (output port): attach a CAS table to the so_results_tbl output port to hold results. + +6. **Singling Out Risk Queries table** (output port): attach a CAS table to the so_queries_tbl output port to hold the singling out queries. + +#### Citation for anonymeter + +As we make use of an open-source package, anonymeter, to perform these calculations, we note the following citation: + +"A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi et al., PoPETS 2023. 
+ +This bibtex entry refers to the paper: + +``` +@misc{anonymeter, + doi = {https://doi.org/10.56553/popets-2023-0055}, + url = {https://petsymposium.org/popets/2023/popets-2023-0055.php}, + journal = {Proceedings of Privacy Enhancing Technologies Symposium}, + year = {2023}, + author = {Giomi, Matteo and Boenisch, Franziska and Wehmeyer, Christoph and Tasnádi, Borbála}, + title = {A Unified Framework for Quantifying Privacy Risk in Synthetic Data}, +} +``` ---- @@ -118,6 +166,7 @@ IMPORTANT: Be aware that disabling this step means that none of its main executi 3. PyPi page for [hnswlib](https://pypi.org/project/hnswlib/) 4. PyPi page for [protobuf](https://pypi.org/project/protobuf/) +5. PyPi page for [anonymeter](https://pypi.org/project/anonymeter/) ---- ## SAS Program @@ -133,6 +182,7 @@ Refer [here](./extras/SDG_SMOTE_Synthetic_Data.sas) for the SAS program used by ## Created/contact: - Sundaresh Sankaran (sundaresh.sankaran@sas.com) +- Josiah Chua (josiah.chua@sas.com) Acknowledgements to others for their help on details, testing or exploring the area: - David Olaleye (david.olaleye@sas.com) @@ -143,6 +193,8 @@ Acknowledgements to others for their help on details, testing or exploring the a ---- ## Change Log +* Version 1.3.1 (10DEC2024) + * Add calculation for privacy risk (singling out risk) * Version 1.2 (11NOV2024) * Add provenance flag and sampling for assessment * Version 1.1 (02NOV2024) diff --git a/SDG - Generate Synthetic Data through SMOTE/SDG - Generate Synthetic Data through SMOTE.step b/SDG - Generate Synthetic Data through SMOTE/SDG - Generate Synthetic Data through SMOTE.step index e3a3e831..a7ee9c0f 100755 --- a/SDG - Generate Synthetic Data through SMOTE/SDG - Generate Synthetic Data through SMOTE.step +++ b/SDG - Generate Synthetic Data through SMOTE/SDG - Generate Synthetic Data through SMOTE.step @@ -1 +1 @@ -{"type":"code","name":"SDG - Generate Synthetic Data through SMOTE.step","displayName":"SDG - Generate Synthetic Data through 
SMOTE.step","description":"","templates":{"SAS":"/* SAS templated code goes here */\n\n/* -------------------------------------------------------------------------------------------* \n Synthetic Data Generation (SDG) - Generate Synthetic Data through SMOTE\n\n v 1.2 (11NOV2024)\n\n This program generates synthetic data using the Synthetic Minority Oversampling TEchnique\n and is meant for use within a SAS Studio Custom Step. Please modify requisite macro variables\n (hint: use the debug section as a reference) to run this through other interfaces, such as \n a SAS Program editor or the SAS extension for Visual Studio Code.\n\n Sundaresh Sankaran (sundaresh.sankaran@sas.com|sundaresh.sankaran@gmail.com)\n*-------------------------------------------------------------------------------------------- */\n\n/*-----------------------------------------------------------------------------------------*\n DEBUG Section\n Code under the debug section SHOULD ALWAYS remain commented unless you are tinkering with \n or testing the step!\n*------------------------------------------------------------------------------------------*/\n\n/* Provide test values for the parameters */\n\n/*\n%let CLASSTOAUGMENT =1;\n%let CLASSVAR =BAD;\n%let CLASSVAR_1_TYPE =Numeric;\n%let INPUTTABLE =PUBLIC.HMEQ;\n%let INPUTTABLE_ENGINE=V9;\n%let INPUTTABLE_LIB=PUBLIC;\n%let INPUTTABLE_NAME=HMEQ;\n%let INPUTTABLE_NAME_BASE=HMEQ;\n%let INPUTTABLE_TBLTYPE=table;\n%let INPUTTABLE_TYPE=dataTable;\n%let INPUTVARS=BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC;\n%let NOMINALVARS=BAD REASON JOB;\n%let NOMINALVARS_COUNT=3;\n%let CLASSVAR_COUNT=1;\n%let NUMK=5;\n%let NUMSAMPLES=100;\n%let NUMTHREADS=0;\n%let OUTPUTTABLE=PUBLIC.HMEQ_SYNTH;\n%let OUTPUTTABLE_ENGINE=V9;\n%let OUTPUTTABLE_LIB=PUBLIC;\n%let OUTPUTTABLE_NAME=HMEQ_SYNTH;\n%let OUTPUTTABLE_NAME_BASE=HMEQ_SYNTH;\n%let SEEDNUMBER=123;\n%let extrapolationFactor=0;\n%let 
sampling_percent=30;\n\n*/;\n\n/*-----------------------------------------------------------------------------------------*\n END DEBUG Section\n*------------------------------------------------------------------------------------------*/\n\n\n/*-----------------------------------------------------------------------------------------*\n MACROS\n*------------------------------------------------------------------------------------------*/\n\n\n/* -------------------------------------------------------------------------------------------* \n Macro to initialize a run-time trigger global macro variable to run SAS Studio Custom Steps. \n A value of 1 (the default) enables this custom step to run. A value of 0 (provided by \n upstream code) sets this to disabled.\n\n Input:\n 1. triggerName: The name of the runtime trigger you wish to create. Ensure you provide a \n unique value to this parameter since it will be declared as a global variable.\n\n Output:\n 2. &triggerName : A global variable which takes the name provided to triggerName.\n*-------------------------------------------------------------------------------------------- */\n\n%macro _create_runtime_trigger(triggerName);\n\n %global &triggerName.;\n\n %if %sysevalf(%superq(&triggerName.)=, boolean) %then %do;\n \n %put NOTE: Trigger macro variable &triggerName. does not exist. Creating it now.;\n %let &triggerName.=1;\n\n %end;\n\n%mend _create_runtime_trigger;\n\n\n/* -----------------------------------------------------------------------------------------* \n Macro to create an error flag for capture during code execution.\n\n Input:\n 1. errorFlagName: The name of the error flag you wish to create. Ensure you provide a \n unique value to this parameter since it will be declared as a global variable.\n 2. errorFlagDesc: A description to add to the error flag.\n\n Output:\n 1. &errorFlagName : A global variable which takes the name provided to errorFlagName.\n 2. 
&errorFlagDesc : A global variable which takes the name provided to errorFlagDesc.\n*------------------------------------------------------------------------------------------ */\n\n%macro _create_error_flag(errorFlagName, errorFlagDesc);\n\n %global &errorFlagName.;\n %let &errorFlagName.=0;\n %global &errorFlagDesc.;\n\n%mend _create_error_flag;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to capture indicator and UUIDof any currently active CAS session.\n UUID is not expensive and can be used in future to consider graceful reconnect.\n\n Input:\n 1. errorFlagName: name of an error flag that gets populated in case the connection is \n not active. Provide this value in quotes when executing the macro.\n Define this as a global macro variable in order to use downstream.\n 2. errorFlagDesc: Name of a macro variable which can hold a descriptive message output\n from the check.\n \n Output:\n 1. Informational note as required. We explicitly don't provide an error note since \n there is an easy recourse(of being able to connect to CAS)\n 2. UUID of the session: macro variable which gets created if a session exists.\n 3. errorFlagName: populated\n 4. errorFlagDesc: populated\n*------------------------------------------------------------------------------------------*/\n\n%macro _env_cas_checkSession(errorFlagName, errorFlagDesc);\n\n %if %sysfunc(symexist(_current_uuid_)) %then %do;\n %symdel _current_uuid_;\n %end;\n %if %sysfunc(symexist(_SESSREF_)) %then %do;\n %let casSessionExists= %sysfunc(sessfound(&_SESSREF_.));\n %if &casSessionExists.=1 %then %do;\n %global _current_uuid_;\n %let _current_uuid_=; \n proc cas;\n session.sessionId result = sessresults;\n call symputx(\"_current_uuid_\", sessresults[1]);\n quit;\n %put NOTE: A CAS session &_SESSREF_. is currently active with UUID &_current_uuid_. 
;\n data _null_;\n call symputx(&errorFlagName., 0);\n call symput(&errorFlagDesc., \"CAS session is active.\");\n run;\n %end;\n %else %do;\n %put NOTE: Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream. ;\n data _null_;\n call symputx(&errorFlagName., 1);\n call symput(&errorFlagDesc., \"Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream.\");\n run;\n %end;\n %end;\n %else %do;\n %put NOTE: No active CAS session ;\n data _null_;\n call symputx(&errorFlagName., 1);\n call symput(&errorFlagDesc., \"No active CAS session. Connect to a CAS session upstream.\");\n run;\n %end;\n\n%mend _env_cas_checkSession; \n \n\n/*-----------------------------------------------------------------------------------------*\n Caslib for a Libname macro\n \n This macro creates a global macro variable called _usr_nameCaslib\n that contains the caslib name (aka. caslib-reference-name) associated with the libname\n and assumes that the libname is using the CAS engine.\n \n As sysvalue has a length of 1024 chars, we use the trimmed option in proc sql\n to remove leading and trailing blanks in the caslib name.\n \n From macro provided by Wilbram Hazejager (wilbram.hazejager@sas.com)\n\n Inputs:\n - _usr_LibrefUsingCasEngine : A library reference provided by the user which is based \n on a CAS engine.\n \n Outputs:\n - _usr_nameCaslib : Global macro variable containing the caslib name.\n*------------------------------------------------------------------------------------------*/\n \n%macro _usr_getNameCaslib(_usr_LibrefUsingCasEngine);\n \n %global _usr_nameCaslib;\n %let _usr_nameCaslib=;\n \n proc sql noprint;\n select sysvalue into :_usr_nameCaslib trimmed from dictionary.libnames\n where libname = upcase(\"&_usr_LibrefUsingCasEngine.\") and upcase(sysname)=\"CASLIB\";\n quit;\n\n /*--------------------------------------------------------------------------------------*\n Note that we output a NOTE 
instead of an ERROR for the below condition since the \n execution context determines whether this is an error or just an informational note.\n *---------------------------------------------------------------------------------------*/\n %if \"&_usr_nameCaslib.\" = \"\" %then %put NOTE: The caslib name for the &_usr_LibrefUsingCasEngine. is blank.;\n \n%mend _usr_getNameCaslib;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to check if a given libref belongs to a SAS or CAS engine.\n\n Input:\n 1. sasCasLibref: a libref to be checked. Do not quote.\n 2. tableEngine: a flag to hold the table Engine value.\n 3. errorFlagName: a flag to populate an error code with.\n 4. errorFlagDesc: a flag to describe the error if one occurs.\n 5. sessionExists: an indicator (1) whether an active CAS session exists. If not(0),\n it will be created.\n \n Output:\n 1. tableEngine: populated with SAS or CAS\n 2. errorFlagName: populated with 1 if an error and 0 if not\n 3. errorFlagDesc: populated in case of an error\n*------------------------------------------------------------------------------------------*/\n\n%macro _sas_or_cas(sasCasLibref, tableEngine, errorFlagName, errorFlagDesc, sessionExists);\n\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ ;\n caslib _ALL_ assign;\n %end;\n\n proc sql noprint;\n select distinct Engine into:&&tableEngine. 
from dictionary.libnames where libname = upcase(\"&sasCasLibref.\");\n quit;\n\n %put \"&&&tableEngine.\";\n\n %if %sysfunc(compress(\"&&&tableEngine.\")) = \"V9\" %THEN %DO;\n data _null_;\n call symput(\"&tableEngine.\",\"SAS\");\n call symputx(\"&errorFlag.\",0);\n call symput(\"&errorFlagDesc.\",\"\");\n run;\n %end;\n %else %if %sysfunc(compress(\"&&&tableEngine.\")) = \"CAS\" %THEN %DO;\n data _null_;\n call symputx(\"&errorFlagName.\",0);\n call symput(\"&errorFlagDesc.\",\"\");\n run;\n %END;\n %else %do;\n data _null_;\n call symputx(\"&errorFlagName.\",1);\n call symput(\"&errorFlagDesc.\",\"Unable to associate libref with either SAS or CAS. Check the input libref provided.\");\n run;\n %end;\n\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ terminate;\n %end;\n \n%mend _sas_or_cas;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to check if an in-memory table exists.\n\n Input:\n 1. tableName: name of the in-memory table\n 2. tableLib: caslib backing the in-memory table\n 3. sessionExists: an indicator (1) whether an active CAS session exists. If not(0),\n it will be created.\n \n Output:\n 1. tableExists: populated with 0 if does not exist, 1 if exists with local scope, \n 2 if exists with global scope\n\n*------------------------------------------------------------------------------------------*/ \n\n%macro _cas_table_exists(tableName, tableLib, sessionExists, tableExists);\n\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ ;\n caslib _ALL_ assign;\n %end;\n\n proc cas;\n table.tableExists result = rc /\n name=\"&tableName.\",\n caslib=\"&tableLib.\"\n ;\n call symputx(\"&tableExists.\",rc.exists);\n quit;\n\n %if &sessionExists. 
= 0 %then %do;\n cas _temp_ss_ terminate;\n %end;\n \n%mend _cas_table_exists;\n \n\n/*-----------------------------------------------------------------------------------------*\n EXECUTION CODE MACRO \n\n _smt prefix stands for SMOTE\n*------------------------------------------------------------------------------------------*/\n\n%macro _smt_execution_code;\n\n/*-----------------------------------------------------------------------------------------*\n Create an error flag. \n*------------------------------------------------------------------------------------------*/\n\n %_create_error_flag(_smt_error_flag, _smt_error_desc);\n\n/*-----------------------------------------------------------------------------------------*\n Check if an active CAS session exists. \n*------------------------------------------------------------------------------------------*/\n\n %_env_cas_checkSession(\"_smt_error_flag\", \"_smt_error_desc\");\n\n/*-----------------------------------------------------------------------------------------*\n Check Input table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n\n %global inputCaslib;\n %_usr_getNameCaslib(&inputTable_lib.);\n %let inputCaslib=&_usr_nameCaslib.;\n %put NOTE: &inputCaslib. is the caslib for the input table.;\n %let _usr_nameCaslib=;\n\n %if \"&inputCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Input table caslib is blank. Check if Base table is a valid CAS table.\");\n run;\n %put ERROR: Input table caslib is blank. Check if Base table is a valid CAS table. 
;\n %end;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check if input table exists.\n*------------------------------------------------------------------------------------------*/\n \n %global casTableExists;\n\n %if &_smt_error_flag. = 0 %then %do;\n %_cas_table_exists(&inputTable_name_base.,&inputTable_lib.,1,casTableExists);\n %if &casTableExists.=0 %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: The given CAS table does not seem to exist. Please check if it is loaded to CAS.\");\n run;\n %put ERROR: The given CAS table does not seem to exist. Please check if it is loaded to CAS.;\n %end; \n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check Output table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n\n %global outputCaslib;\n %_usr_getNameCaslib(&outputTable_lib.);\n %let outputCaslib=&_usr_nameCaslib.;\n %put NOTE: &outputCaslib. is the caslib for the output table.;\n %let _usr_nameCaslib=;\n\n %if \"&outputCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Output table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Output table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Obtain list of input & nominal variables and store them in macro variables.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. 
= 0 %then %do;\n %let blankSeparatedInputVars = %_flw_get_column_list(_flw_prefix=inputVars);\n %let blankSeparatedNominalVars = %_flw_get_column_list(_flw_prefix=nominalVars);\n %end;\n\n %put NOTE: Input variables selected - &blankSeparatedInputVars.;\n %put NOTE: Nominal variables selected - &blankSeparatedNominalVars.;\n\n\n/*-----------------------------------------------------------------------------------------*\n Create a program string based on selection of nominal variables.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n %if &nominalVars_count.=0 %then %do;\n data _null_;\n call symput(\"nominalString\",\"\");\n run;\n %end;\n %else %do;\n data _null_;\n call symput(\"nominalString\",\"nominals=${&blankSeparatedNominalVars.},\");\n run;\n %end;\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Create a program string based on selection of class variables.\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if &classVar_count.=0 %then %do;\n data _null_;\n call symput(\"classString\",\"\");\n call symput(\"classToAugment\",\"\");\n call symput(\"classAugmentString\",\"\");\n run;\n %end;\n %else %do;\n data _null_;\n call symput(\"classString\",\"classColumn=classColumnVar,\");\n call symput(\"classAugmentString\",\"classToAugment=class_to_augment,\");\n run;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Check if provenance flag name has been provided otherwise code as default\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if %sysfunc(compress(\"&prov_flag_name.\"))=\"\" %then %do;\n %put NOTE: Value not provided for provenance variable. 
Using default.;\n data _null_;\n call symput(\"prov_flag_name\",\"Synthetic_Data_Provenance\");\n run;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Check if assessment table (optional) has been provided otherwise code as default\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if &sampling_percent. > 0 %then %do;\n %put NOTE: Assessment table value is - &assessmentTable. ;\n %if %sysevalf(%superq(assessmentTable)=, boolean) %then %do;\n %put ERROR: An assessment table has not been attached. Please attach the same.;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"An assessment table has not been attached. Please attach the same.\");\n run;\n %end;\n %else %if \"%sysfunc(substr(&assessmentTable.,1,9))\"=\"WORK._flw\" %then %do;\n %put NOTE: Value not provided for assessment table. Using default.;\n data _null_;\n call symput(\"assessmentTable\",\"PUBLIC.SMOTE_ASSESSMENT\");\n call symput(\"assessmentTable_lib\",\"PUBLIC\");\n call symput(\"assessmentTable_name_base\",\"SMOTE_ASSESSMENT\");\n run;\n %end;\n %else %if %sysfunc(compress(\"&assessmentTable.\"))=\"\" %then %do;\n %put NOTE: Value not provided for assessment table. Using default.;\n data _null_;\n call symput(\"assessmentTable\",\"PUBLIC.SMOTE_ASSESSMENT\");\n call symput(\"assessmentTable_lib\",\"PUBLIC\");\n call symput(\"assessmentTable_name_base\",\"SMOTE_ASSESSMENT\");\n run;\n %end;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Test data set created based on percent\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. 
= 0 %then %do;\n %if &sampling_percent.=0 %then %do;\n data &outputTable_lib..__temp_smote;\n set &inputTable.;\n _PartInd_ = 0;\n run;\n %end;\n %else %do;\n proc partition data=&inputTable. partind samppct= &sampling_percent. seed=10 ;\n output out=&outputTable_lib..__temp_smote copyvars=(_all_);\n display 'SRSFreq';\n run;\n data &outputTable_lib..__temp_smote &outputTable_lib..__assess_orig;\n set &outputTable_lib..__temp_smote;\n if _PartInd_=0 then output &outputTable_lib..__temp_smote;\n else output &outputTable_lib..__assess_orig;\n run;\n/*-----------------------------------------------------------------------------------------*\n Add a provenance flag\n*------------------------------------------------------------------------------------------*/\n data &outputTable_lib..__assess_orig;\n length &prov_flag_name. $9.;\n set &outputTable_lib..__assess_orig;\n &prov_flag_name. = \"Original\";\n run;\n\n %end;\n\n %end;\n/*-----------------------------------------------------------------------------------------*\n Run SMOTE action\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. 
= 0 %then %do;\n proc cas; \n numK = symget(\"numK\");\n inputTableCaslib = symget(\"inputCaslib\");\n inputTableName = symget(\"inputTable_name_base\");\n blankSeparatedNominalVars = symget(\"blankSeparatedNominalVars\");\n blankSeparatedInputVars = symget(\"blankSeparatedInputVars\");\n classColumnVar = symget(\"classVar\");\n classVarType = symget(\"classVar_1_Type\");\n classToAugment = symget(\"classToAugment\");\n numSamplesVar = symget(\"numSamples\");\n outputTableCaslib = symget(\"outputCaslib\");\n outputTableName = symget(\"outputTable_name_base\");\n seedNumber = symget(\"seedNumber\");\n numThreads = symget(\"numThreads\");\n extrapolation_factor = symget(\"extrapolationFactor\");\n\n if classVarType = \"Numeric\" then class_to_augment = classToAugment*1; \n else class_to_augment = classToAugment;\n\n smote.smoteSample result=r/\n table={name=\"__temp_smote\", caslib=outputTableCaslib, where='_PartInd_=0'},\n/* table={name=inputTableName, caslib=inputTableCaslib}, */\n k = numK,\n inputs=${&blankSeparatedInputVars.},\n &nominalString.\n &classString.\n &classAugmentString.\n seed=seedNumber,\n nThreads = numThreads,\n numSamples=numSamplesVar,\n extrapolationFactor=extrapolation_factor,\n casout={name=outputTableName,caslib= outputTableCaslib, replace=\"TRUE\"}\n ;\n print r;\n run;\n quit;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Add a provenance flag\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n data &outputTable.;\n length &prov_flag_name. $9.;\n set &outputTable.;\n &prov_flag_name. = \"Synthetic\";\n run;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Take a sample from synthetic data and merge with original data\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. 
= 0 %then %do;\n proc sql noprint;\n select count(*) into: synth_records from &outputTable.;\n select count(*) into: orig_records from &inputTable.;\n quit;\n\n %put NOTE: Number of synthetic records - &synth_records.;\n %put NOTE: Number of original records - &orig_records.;\n %put NOTE: Sampling Percent provided - &sampling_percent.;\n \n data _null_;\n call symputx(\"synth_sampling_percent\",100*((&sampling_percent./100) * &orig_records. )/&synth_records.);\n run;\n %put NOTE: Synthetic Sampling Percent - &synth_sampling_percent.;\n\n %if &sampling_percent.=0 %then %do;\n/*-----------------------------------------------------------------------------------------*\n Block deliberately left empty for a future consideration\n*------------------------------------------------------------------------------------------*/\n %end;\n %else %do;\n proc partition data=&outputTable. partind samppct= &synth_sampling_percent. seed=10 ;\n output out=&outputTable_lib..__assess_synth copyvars=(_all_);\n display 'SRSFreq';\n run;\n data &assessmentTable.;\n set &outputTable_lib..__assess_orig &outputTable_lib..__assess_synth (where=(_PartInd_=1));\n keep &prov_flag_name. 
&blankSeparatedInputVars.;\n run;\n proc datasets lib=&outputTable_lib.;\n delete __assess_orig __assess_synth ;\n quit;\n %end;\n proc datasets lib=&outputTable_lib.;\n delete __temp_smote;\n quit;\n %end;\n\n\n%mend _smt_execution_code; \n\n/*-----------------------------------------------------------------------------------------*\n END MACROS\n*------------------------------------------------------------------------------------------*/\n\n/*-----------------------------------------------------------------------------------------*\n EXECUTION CODE\n*------------------------------------------------------------------------------------------*/\n \n/*-----------------------------------------------------------------------------------------*\n Create Runtime Trigger\n*------------------------------------------------------------------------------------------*/\n%_create_runtime_trigger(_smt_run_trigger);\n\n/*-----------------------------------------------------------------------------------------*\n Execute \n*------------------------------------------------------------------------------------------*/\n\n%if &_smt_run_trigger. = 1 %then %do;\n\n %_smt_execution_code;\n\n%end;\n\n%if &_smt_run_trigger. = 0 %then %do;\n\n %put NOTE: This step has been disabled. Nothing to do.;\n\n%end;\n\n\n%put NOTE: Final summary;\n%put NOTE: Status of error flag - &_smt_error_flag. ;\n%put &_smt_error_desc.;\n%put NOTE: Error desc - &_smt_error_desc. 
;\n\n/*-----------------------------------------------------------------------------------------*\n END EXECUTION CODE\n*------------------------------------------------------------------------------------------*/\n/*-----------------------------------------------------------------------------------------*\n Clean up existing macro variables and macro definitions.\n*------------------------------------------------------------------------------------------*/\n\n%if %symexist(inputCaslib) %then %do;\n %symdel inputCaslib;\n%end;\n\n%if %symexist(outputCaslib) %then %do;\n %symdel outputCaslib;\n%end;\n\n%if %symexist(casTableExists) %then %do;\n %symdel casTableExists;\n%end;\n\n%if %symexist(prov_flag_name) %then %do;\n %symdel prov_flag_name;\n%end;\n\n%if %symexist(_smt_run_trigger) %then %do;\n %symdel _smt_run_trigger;\n%end;\n\n%if %symexist(_smt_error_flag) %then %do;\n %symdel _smt_error_flag;\n%end;\n\n%if %symexist(_smt_error_desc) %then %do;\n %symdel _smt_error_desc;\n%end;\n\n%sysmacdelete _create_error_flag;\n%sysmacdelete _create_runtime_trigger;\n%sysmacdelete _env_cas_checkSession;\n%sysmacdelete _usr_getNameCaslib;\n%sysmacdelete _sas_or_cas;\n%sysmacdelete _cas_table_exists;\n%sysmacdelete _smt_execution_code;\n"},"properties":{},"ui":"{\n\t\"showPageContentOnly\": true,\n\t\"pages\": [\n\t\t{\n\t\t\t\"id\": \"page1\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"Parameters\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section1\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Input Table\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text2\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Connect a SAS Cloud Analytics Services (CAS) table to the input port of this step.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"inputTable\",\n\t\t\t\t\t\t\t\"type\": \"inputtable\",\n\t\t\t\t\t\t\t\"label\": \"Select input 
table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"parameters_section\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Parameters\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"numK\",\n\t\t\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\t\t\"label\": \"Select number of nearest neighbors:\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"integer\": true,\n\t\t\t\t\t\t\t\"min\": 1,\n\t\t\t\t\t\t\t\"max\": 1000,\n\t\t\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"inputVars\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select input columns:\",\n\t\t\t\t\t\t\t\"include\": null,\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 1,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": \"inputTable\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"nominalVars\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select nominal columns:\",\n\t\t\t\t\t\t\t\"include\": \"inputVars\",\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": null\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"classVar\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select a class column to augment:\",\n\t\t\t\t\t\t\t\"include\": \"inputVars\",\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": 1,\n\t\t\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": null\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": 
\"classToAugment\",\n\t\t\t\t\t\t\t\"type\": \"dropdown\",\n\t\t\t\t\t\t\t\"label\": \"Select a class value to augment:\",\n\t\t\t\t\t\t\t\"items\": {\n\t\t\t\t\t\t\t\t\"ref\": \"classVar\"\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": \"$classVar\",\n\t\t\t\t\t\t\t\"enabled\": \"$classVar\",\n\t\t\t\t\t\t\t\"indent\": 1\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section9\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Output specification\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"numSamples\",\n\t\t\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\t\t\"label\": \"Number of synthetic observations:\",\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 10,\n\t\t\t\t\t\t\t\"excludemin\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"integer\": true\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text1\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Connect a SAS Cloud Analytics Services (CAS) table to the output port of this step.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"outputTable\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide output table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"assessmentTable\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide assessment table (optional):\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"placeholder\": \"PUBLIC.SMOTE_ASSESSMENT\",\n\t\t\t\t\t\t\t\"visible\": 
[\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text3\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Note: Assessment table will be populated only if a value greater than 0 has been selected for the sampling percentage (Configuration tab).\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t}\n\t\t\t]\n\t\t},\n\t\t{\n\t\t\t\"id\": \"page2\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"Configuration\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"numThreads\",\n\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\"label\": \"Select number of threads:\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"integer\": true,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"max\": 1024,\n\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"seedNumber\",\n\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\"label\": \"Select a seed:\",\n\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\"min\": null,\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"extrapolationFactor\",\n\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\"label\": \"Provide extrapolation factor (to perturb input data boundary):\",\n\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"excludemin\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"integer\": false\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"prov_flag_name\",\n\t\t\t\t\t\"type\": 
\"textfield\",\n\t\t\t\t\t\"label\": \"Edit provenance variable name:\",\n\t\t\t\t\t\"placeholder\": \"Synthetic_Data_Provenance\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"sampling_percent\",\n\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\"label\": \"Provide sampling percentage for assessment:\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"integer\": false,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"max\": 99,\n\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t}\n\t\t\t]\n\t\t},\n\t\t{\n\t\t\t\"id\": \"about\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"About\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_description\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Synthetic Minority Oversampling TEchnique (SMOTE) \\n=====================================\\n\\nThis custom step helps you generate synthetic data based on an input table, using the Synthetic Minority Oversampling TEchnique (SMOTE). SMOTE is an oversampling technique which identifies new data observations in the neighborhood of closely associated original observations. \\n\\nSMOTE is an alternative approach to Generative Adversarial Networks (GANs) for generating synthetic tabular data. Access to synthetic data helps you make better, data-informed decisions in situations where you have imbalanced, scant, poor quality, unobservable, or restricted data.\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section8\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Prerequisites\",\n\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text4\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"1. A SAS Viya 4 environment, preferably monthly stable 2024.10 or later\\n\\n2. A Visual Data Mining and Machine Learning (VDMML) license, usually provided with SAS Viya, SAS Viya Enterprise or Advanced.\\n\\n3. 
An active SAS Cloud Analytics Services (CAS) connection during runtime.\\n\\n4. The smote.smoteSample CAS action requires Python configuration, as specified in SAS documentation. Please work with your SAS administrator to have the same configured. Specifically, ensure the following:\\n\\n 1. The correct version of Python is installed (as of version 2024.10, this was 3.11.x) \\n 2. [sas-ipc-queue](https://pypi.org/project/sas-ipc-queue/), version 0.7.0 or later \\n 3. [hnswlib](https://pypi.org/project/hnswlib/)\\n 4. [protobuf](https://pypi.org/project/protobuf/)\\n\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section10\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Assumptions\",\n\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text6\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Current assumptions for this initial version (future versions may improve upon the same):\\n\\n1. Users choose either an existing Chroma DB vector database collection or load PDF or CSV files to an existing or new Chroma DB collection.\\n\\n2. Users may load all PDFs in a directory on the SAS Server (filesystem), or select a PDF / CSV of their choice.\\n\\n3. The code assumes use of a Chroma DB vector store. Users may choose to replace this with other supported vector stores.\\n\\n4. The code uses the langchain LLM framework. \\n\\n5. PDFs (containing text) and single CSV files are currently the only loadable file format allowed. Users are however free to ingest various other document types into a Chroma DB collection beforehand, using the \\\"Vector Databases - Hydrate Chroma DB collection\\\" SAS Studio Custom Step (refer documentation)\\n\\n6. User has already configured Azure OpenAI to deploy both an embedding function and LLM service, or knows the deployment names. 
\\n\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_parameters\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Parameters\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_input\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Input parameters\",\n\t\t\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"input_parameters_text\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Input table (input port, required): connect a CAS table to the input port.\\n\\n2. Nearest neighbors (numeric stepper, default 5): select the number of nearest neighbours to be used by the SMOTE algorithm as the basis for identifying candidate synthetic points.\\n\\n3. Input columns (column selector): select all inputs for the SMOTE process. You would also need to include the class and any nominal columns.\\n\\n4. Nominal variables (column selector): select any nominal variables you wish to use. Your nominal variables are required to be in the inputs column list.\\n\\n5. Select a class column (column selector, optional): select a column if you wish to use SMOTE in order to balance or augment a level within the class column. Be judicious in the choice of this column since a column with a high number of levels may slow down or even fail the process. Your class column is required to be in the inputs column list.\\n\\n6. Class to augment (drop-down list, values from class column if selected): select the level of the class variable you wish to augment. 
The values that appear here depend on the data that's contained in the class column, so may take time to populate based on actual data and number of levels.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_output_specs\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Output specifications\",\n\t\t\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"output_parameters_text\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Number of synthetic observations (numeric field): specify the number of synthetic observations you would like in the output table.\\n\\n2. Output table (output port): attach a CAS table to the output port to hold results.\\n\\n3. Assessment table (output port, optional): Attach a CAS table to the second output port (assessmentTable) of this step in case you select a sampling percentage (refer Configuration tab, pt #5) greater than 0. A default value of PUBLIC.SMOTE_ASSESSMENT is assigned in case you do not specify a table.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_config_1\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Configuration \",\n\t\t\t\t\t\t\t\"open\": 1,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"output_parameters_text_1\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Number of threads: (numeric stepper, optional): most of the time, you do not need to modify this. Change if you need to especially control the number of threads in which the process runs.\\n\\n2. 
Select a seed (numeric field, optional): specify a seed number to establish (but not completely guarantee) some level of reproducibility with respect to results.\\n\\n3. Select extrapolation factor: specify a number (double) to use as a standard deviation in order to perturb (add noise or randomness) the input data boundaries.\\n\\n4. Name for synthetic data provenance variable (optional, default provided): insert a value only if you want to give a special name to a flag that indicates this data is synthetic. Otherwise, a default name of Synthetic_Data_Provenance will be used instead.\\n\\n5. Provide sampling percentage for assessment (numeric stepper, default is 0): Enter a value between 0 and 100 to sample a test dataset before running SMOTE. Note that the test dataset will NOT be used to generate data. The test dataset will be appended with an equal number of observations from the synthetic dataset (along with the provenance flag described in #4) and can be used for assessment purposes.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_runtimecontrol\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Run-time Control\",\n\t\t\t\t\t\"open\": 0,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"runtimecontrol_text\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Note: Run-time control is optional. You may choose whether to execute the main code of this step or not, based on upstream conditions set by earlier SAS programs. 
This includes nodes run prior to this custom step earlier in a SAS Studio Flow, or a previous program in the same session.\\n\\nRefer this blog (https://communities.sas.com/t5/SAS-Communities-Library/Switch-on-switch-off-run-time-control-of-SAS-Studio-Custom-Steps/ta-p/885526) for more details on the concept.\\n\\nThe following macro variable,\\n\\n_smt_run_trigger\\n\\nwill initialize with a value of 1 by default, indicating an \\\"enabled\\\" status and allowing the custom step to run.\\n\\nIf you wish to control execution of this custom step, include code in an upstream SAS program to set this variable to 0. This \\\"disables\\\" execution of the custom step.\\n\\nTo \\\"disable\\\" this step, run the following code upstream:\\n\\n%global _smt_run_trigger;\\n%let _smt_run_trigger =0;\\n\\nTo \\\"enable\\\" this step again, run the following (it's assumed that this has already been set as a global variable):\\n\\n%let _smt_run_trigger =1;\\n\\nIMPORTANT: Be aware that disabling this step means that none of its main execution code will run, and any downstream code which was dependent on this code may fail. Change this setting only if it aligns with the objective of your SAS Studio program.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_documentation\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Documentation\",\n\t\t\t\t\t\"open\": 0,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"documentation_text\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"1. Documentation on the smote.smoteSample CAS action: https://go.documentation.sas.com/doc/en/pgmsascdc/default/casactml/casactml_smote_details01.htm\\n\\n2. PyPi page for sas-ipc-queue. 
While an implementation detail for the smoteSample action, this is a good place to highlight efficient memory handling techniques, for interested programmers: https://pypi.org/project/sas-ipc-queue/\\n\\n3. PyPi page for hnswlib, an algorithm for fast approximate nearest neighbour search, used in smoteSample: https://pypi.org/project/hnswlib/\\n\\n4. PyPi page for protobuf (Python package used within action): https://pypi.org/project/protobuf/\\n\\n\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"version_text\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Version: 1.2 (11NOV2024)\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"contact_text\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Created/contact: \\n\\n- Sundaresh Sankaran (sundaresh.sankaran@sas.com)\\n\\nAcknowledgements to others for their help on details, testing or exploring the area: \\n- David Olaleye (david.olaleye@sas.com)\\n- Suneel Grover (suneel.grover@sas.com)\\n- Reza Nazari (reza.nazari@sas.com)\\n- SAS Analytics R&D team\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t}\n\t\t\t]\n\t\t}\n\t],\n\t\"syntaxversion\": \"1.3.0\",\n\t\"values\": {\n\t\t\"inputTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"numK\": 5,\n\t\t\"inputVars\": [],\n\t\t\"nominalVars\": [],\n\t\t\"classVar\": [],\n\t\t\"classToAugment\": null,\n\t\t\"numSamples\": 100,\n\t\t\"outputTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"assessmentTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"numThreads\": 0,\n\t\t\"seedNumber\": 123,\n\t\t\"extrapolationFactor\": 0,\n\t\t\"prov_flag_name\": \"Synthetic_Data_Provenance\",\n\t\t\"sampling_percent\": 
0\n\t}\n}","flowMetadata":{"inputPorts":[{"name":"inputTable","displayName":"inputTable","minEntries":1,"maxEntries":1,"type":"table"}],"outputPorts":[{"name":"outputTable","displayName":"outputTable","minEntries":1,"maxEntries":1,"columnDelta":null,"type":"table"},{"name":"assessmentTable","displayName":"assessmentTable","minEntries":0,"maxEntries":1,"columnDelta":null,"type":"table"}]}} \ No newline at end of file +{"type":"code","name":"SDG - Generate Synthetic Data through SMOTE.step","displayName":"SDG - Generate Synthetic Data through SMOTE.step","description":"","templates":{"SAS":"/* SAS templated code goes here */\n\n/* -------------------------------------------------------------------------------------------* \n Synthetic Data Generation (SDG) - Generate Synthetic Data through SMOTE\n\n v 1.3.1 (10DEC2024)\n\n This program generates synthetic data using the Synthetic Minority Oversampling TEchnique\n and is meant for use within a SAS Studio Custom Step. Please modify requisite macro variables\n (hint: use the debug section as a reference) to run this through other interfaces, such as \n a SAS Program editor or the SAS extension for Visual Studio Code.\n\n Sundaresh Sankaran (sundaresh.sankaran@sas.com|sundaresh.sankaran@gmail.com)\n*-------------------------------------------------------------------------------------------- */\n\n/*-----------------------------------------------------------------------------------------*\n DEBUG Section\n Code under the debug section SHOULD ALWAYS remain commented unless you are tinkering with \n or testing the step!\n*------------------------------------------------------------------------------------------*/\n\n/* Provide test values for the parameters */\n\n/*\n%let CLASSTOAUGMENT =1;\n%let CLASSVAR =BAD;\n%let CLASSVAR_1_TYPE =Numeric;\n%let INPUTTABLE =PUBLIC.HMEQ;\n%let INPUTTABLE_ENGINE=V9;\n%let INPUTTABLE_LIB=PUBLIC;\n%let INPUTTABLE_NAME=HMEQ;\n%let INPUTTABLE_NAME_BASE=HMEQ;\n%let 
INPUTTABLE_TBLTYPE=table;\n%let INPUTTABLE_TYPE=dataTable;\n%let INPUTVARS=BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC;\n%let NOMINALVARS=BAD REASON JOB;\n%let NOMINALVARS_COUNT=3;\n%let CLASSVAR_COUNT=1;\n%let NUMK=5;\n%let NUMSAMPLES=100;\n%let NUMTHREADS=0;\n%let OUTPUTTABLE=PUBLIC.HMEQ_SYNTH;\n%let OUTPUTTABLE_ENGINE=V9;\n%let OUTPUTTABLE_LIB=PUBLIC;\n%let OUTPUTTABLE_NAME=HMEQ_SYNTH;\n%let OUTPUTTABLE_NAME_BASE=HMEQ_SYNTH;\n%let SEEDNUMBER=123;\n%let extrapolationFactor=0;\n%let sampling_percent=30;\n\n*/;\n\n/*-----------------------------------------------------------------------------------------*\n END DEBUG Section\n*------------------------------------------------------------------------------------------*/\n\n\n/*-----------------------------------------------------------------------------------------*\n Python Block Definition\n*------------------------------------------------------------------------------------------*/\n\n/*-----------------------------------------------------------------------------------------*\n The following block of code has been created for the purpose of allowing proc python \n to execute within a macro. 
Execution within a macro allows for other checks to be carried \n out through SAS prior to handing off to the Python step.\n\n In this example, a temporary file is created containing the requisite Python commands, which \n are then executed through infile reference.\n\n Note that Python code is pasted as-is and may be out of line with the SAS indentation followed.\n\n This Python block comes into operation only upon the selection of Privacy Risk (Singling\n Out Risk) metrics.\n\n*------------------------------------------------------------------------------------------*/\nfilename smtcode temp;\n\ndata _null_;\n\n length line $32767; * max SAS character size ;\n infile datalines4 truncover pad;\n input ; \n file smtcode;\n line = strip(_infile_); * line without leading and trailing blanks ;\n l1 = length(trimn(_infile_)); * length of line without trailing blanks ;\n l2 = length(line); * length of line without leading and trailing blanks ;\n first_position=l1-l2+1; * position where the line should start (alignment) ;\n if (line eq ' ') then put @1; * empty line ;\n else put @first_position line; * line without leading and trailing blanks correctly aligned ;\n\n datalines4;\n# Imports\n_smt_error_flag = int(SAS.symget(\"_smt_error_flag\"))\n_smt_error_desc = SAS.symget(\"_smt_error_desc\")\n\n\ncitation = \"\"\"\n\n Calculated using anonymeter (https://pypi.org/project/anonymeter/)\n\n \"A Unified Framework for Quantifying Privacy Risk in Synthetic Data\", M. Giomi et al, PoPETS 2023. 
\n\n\n\"\"\"\n\n\ntry:\n import os\n import swat\n import json \n from anonymeter.evaluators import SinglingOutEvaluator\nexcept ImportError as ie:\n _smt_error_flag = 1\n _smt_error_desc = ie\n SAS.symput(\"_smt_error_flag\",_smt_error_flag)\n SAS.symput(\"_smt_error_desc\",_smt_error_desc)\n SAS.logMessage(_smt_error_desc,\"error\")\n\nif _smt_error_flag ==0:\n # Obtain values from UI & SAS macro variables\n evaluation_mode = SAS.symget('evaluation_mode')\n conf_interval = float(SAS.symget('conf_interval'))\n s_o_attacks = int(SAS.symget('s_o_attacks'))\n singling_out_results_tbl = SAS.symget('singling_out_results_tbl')\n singling_out_queries_tbl = SAS.symget('singling_out_queries_tbl')\n cas_session_exists = SAS.symget('casSessionExists')\n assessment_table_name = SAS.symget('assessmentTable_name_base')\n assessment_table_caslib = SAS.symget('assessmentCaslib')\n input_caslib = SAS.symget('inputCaslib')\n input_table_name = SAS.symget('inputTable_name_base')\n so_queries_tbl = SAS.symget('so_queries_tbl_name_base')\n so_results_tbl = SAS.symget('so_results_tbl_name_base')\n so_queries_caslib = SAS.symget('so_queries_caslib')\n so_results_caslib = SAS.symget('so_results_caslib')\n\n # Retrieve values for SAS options cashost and casport, these are needed by SWAT connection \n cas_host_name = SAS.sasfnc('getoption','cashost')\n cas_host_port = SAS.sasfnc('getoption','casport')\n\n # Add certificate location to operating system list of trusted certs\n os.environ['CAS_CLIENT_SSL_CA_LIST'] = os.environ['SSLCALISTLOC']\n \n \n # Connect to CAS\n if cas_session_exists == '1':\n cas_session_uuid = SAS.symget('casSessionUUID')\n SAS.logMessage(f\"CAS connection exists. Session UUID is {cas_session_uuid}\") \n conn = swat.CAS(hostname = cas_host_name, port = cas_host_port, password = os.environ['SAS_SERVICES_TOKEN'], session = cas_session_uuid)\n if conn:\n SAS.logMessage('SWAT connection established.')\n else:\n SAS.logMessage('ERROR: No active CAS session. 
Connect to a CAS session in upstream step in the flow.')\n _smt_error_flag = 1\n _smt_error_desc = \"ERROR: No active CAS session. Connect to a CAS session in upstream step in the flow.\"\n\n df_org = conn.CASTable(name=input_table_name, caslib=input_caslib).to_frame()\n df_syn = conn.CASTable(name=assessment_table_name, caslib=assessment_table_caslib, where=\"Synthetic_Data_Provenance='Synthetic'\").to_frame()\n df_con = conn.CASTable(name=assessment_table_name, caslib=assessment_table_caslib, where=\"Synthetic_Data_Provenance='Original'\").to_frame()\n\n\n evaluator = SinglingOutEvaluator(ori=df_org, syn=df_syn, control=df_con, n_attacks=s_o_attacks)\n\n try:\n evaluator.evaluate(mode=evaluation_mode)\n risk = evaluator.risk(confidence_level=conf_interval)\n print(risk)\n\n except RuntimeError as ex: \n _smt_error_flag = 1\n _smt_error_desc = f\"Singling out evaluation failed with {ex}. Please re-run this operation. For more stable results increase `n_attacks`. Note that this will make the evaluation slower.\"\n SAS.symput(\"_smt_error_flag\",1)\n SAS.symput(\"_smt_error_desc\",_smt_error_desc)\n\n# Create a summary (title section will be modified in future version based on adding more metrics)\n# SAS.submit(\"title 'Singling Out Risk: Summary'; run;\")\n\nif _smt_error_flag == 0:\n summary = f\"Singling out privacy risk has been found to be {risk.value} between a confidence interval of {risk.ci[0]} and {risk.ci[1]}\"\n query_status = f\"{len(evaluator.queries())} queries were successful attacks.\"\n # Print to SAS results window\n SAS.submit(f\"ods text = 'Singling Out Risk: Summary';\")\n SAS.submit(f\"ods text = '{summary}';\")\n SAS.submit(f\"ods text = '{query_status}';\")\n SAS.submit(f\"ods text = '{citation}';\")\n SAS.logMessage(citation)\n citation_col = []\n for a in range(0,len(evaluator.queries())):\n citation_col.append(citation)\n # Define table for results and queries\n so_results_table = conn.CASTable(name=so_results_tbl, 
caslib=so_results_caslib, replace=True)\n so_queries_table = conn.CASTable(name=so_queries_tbl, caslib=so_queries_caslib, replace=True)\n # Create a Results dict\n so_res = evaluator.results()\n res_dict = {\"Citation\":[citation], \"Privacy_Risk\": [risk.value], \"Privacy_Risk_Conf_Interval_Lower\": [risk.ci[0]],\"Privacy_Risk_Conf_Interval_Upper\": [risk.ci[1]],\"Attack_Rate\":[so_res.attack_rate.value],\"Attack_Rate_Error\":[so_res.attack_rate.error], \"Baseline_Rate\":[so_res.baseline_rate.value],\"Baseline_Rate_Error\":[so_res.baseline_rate.error], \"Control_Rate\":[so_res.control_rate.value],\"Control_Rate_Error\":[so_res.control_rate.error], \"N_Attacks\":[so_res.n_attacks],\"N_Success\":[so_res.n_success], \"N_Baseline\": [so_res.n_baseline],\"N_Control\":[so_res.n_control] }\n # Load Results to a CAS table\n so_results_table.from_dict(data=res_dict, connection=conn, casout=so_results_table)\n SAS.logMessage(\"Results table loaded to CAS.\")\n # Load Queries to a CAS table\n so_queries_table.from_dict(data={\"Query\":evaluator.queries(), \"Citation\": citation_col}, connection=conn, casout=so_queries_table)\n SAS.logMessage(\"Queries table loaded to CAS.\")\n\n\n;;;;\n \n\nrun;\n/*-----------------------------------------------------------------------------------------*\n MACROS\n*------------------------------------------------------------------------------------------*/\n\n\n/* -------------------------------------------------------------------------------------------* \n Macro to initialize a run-time trigger global macro variable to run SAS Studio Custom Steps. \n A value of 1 (the default) enables this custom step to run. A value of 0 (provided by \n upstream code) sets this to disabled.\n\n Input:\n 1. triggerName: The name of the runtime trigger you wish to create. Ensure you provide a \n unique value to this parameter since it will be declared as a global variable.\n\n Output:\n 2. 
&triggerName : A global variable which takes the name provided to triggerName.\n*-------------------------------------------------------------------------------------------- */\n\n%macro _create_runtime_trigger(triggerName);\n\n %global &triggerName.;\n\n %if %sysevalf(%superq(&triggerName.)=, boolean) %then %do;\n \n %put NOTE: Trigger macro variable &triggerName. does not exist. Creating it now.;\n %let &triggerName.=1;\n\n %end;\n\n%mend _create_runtime_trigger;\n\n\n/* -----------------------------------------------------------------------------------------* \n Macro to create an error flag for capture during code execution.\n\n Input:\n 1. errorFlagName: The name of the error flag you wish to create. Ensure you provide a \n unique value to this parameter since it will be declared as a global variable.\n 2. errorFlagDesc: The name of the macro variable that will hold the error description.\n\n Output:\n 1. &errorFlagName : A global variable which takes the name provided to errorFlagName.\n 2. &errorFlagDesc : A global variable which takes the name provided to errorFlagDesc.\n*------------------------------------------------------------------------------------------ */\n\n%macro _create_error_flag(errorFlagName, errorFlagDesc);\n\n %global &errorFlagName.;\n %let &errorFlagName.=0;\n %global &errorFlagDesc.;\n\n%mend _create_error_flag;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to capture an existence indicator and the UUID of any currently active CAS session.\n UUID is not expensive and can be used in future to consider graceful reconnect.\n\n Input:\n 1. errorFlagName: name of an error flag that gets populated in case the connection is \n not active. Provide this value in quotes when executing the macro.\n Define this as a global macro variable in order to use downstream.\n 2. errorFlagDesc: Name of a macro variable which can hold a descriptive message output\n from the check.\n \n Output:\n 1. Informational note as required. 
We explicitly don't provide an error note since \n there is an easy recourse(of being able to connect to CAS)\n 2. UUID of the session: macro variable which gets created if a session exists.\n 3. errorFlagName: populated\n 4. errorFlagDesc: populated\n*------------------------------------------------------------------------------------------*/\n\n%macro _env_cas_checkSession(errorFlagName, errorFlagDesc);\n %global casSessionExists;\n %if %sysfunc(symexist(_current_uuid_)) %then %do;\n %symdel _current_uuid_;\n %end;\n %if %sysfunc(symexist(_SESSREF_)) %then %do;\n %let casSessionExists= %sysfunc(sessfound(&_SESSREF_.));\n %if &casSessionExists.=1 %then %do;\n %global _current_uuid_;\n %let _current_uuid_=; \n proc cas;\n session.sessionId result = sessresults;\n call symputx(\"_current_uuid_\", sessresults[1]);\n quit;\n %put NOTE: A CAS session &_SESSREF_. is currently active with UUID &_current_uuid_. ;\n data _null_;\n call symputx(&errorFlagName., 0);\n call symput(&errorFlagDesc., \"CAS session is active.\");\n run;\n %end;\n %else %do;\n %put NOTE: Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream. ;\n data _null_;\n call symputx(&errorFlagName., 1);\n call symput(&errorFlagDesc., \"Unable to find a currently active CAS session. Reconnect or connect to a CAS session upstream.\");\n run;\n %end;\n %end;\n %else %do;\n %put NOTE: No active CAS session ;\n data _null_;\n call symputx(&errorFlagName., 1);\n call symput(&errorFlagDesc., \"No active CAS session. Connect to a CAS session upstream.\");\n run;\n %end;\n\n%mend _env_cas_checkSession; \n \n\n/*-----------------------------------------------------------------------------------------*\n Caslib for a Libname macro\n \n This macro creates a global macro variable called _usr_nameCaslib\n that contains the caslib name (aka. 
caslib-reference-name) associated with the libname\n and assumes that the libname is using the CAS engine.\n \n As sysvalue has a length of 1024 chars, we use the trimmed option in proc sql\n to remove leading and trailing blanks in the caslib name.\n \n From macro provided by Wilbram Hazejager (wilbram.hazejager@sas.com)\n\n Inputs:\n - _usr_LibrefUsingCasEngine : A library reference provided by the user which is based \n on a CAS engine.\n \n Outputs:\n - _usr_nameCaslib : Global macro variable containing the caslib name.\n*------------------------------------------------------------------------------------------*/\n \n%macro _usr_getNameCaslib(_usr_LibrefUsingCasEngine);\n \n %global _usr_nameCaslib;\n %let _usr_nameCaslib=;\n \n proc sql noprint;\n select sysvalue into :_usr_nameCaslib trimmed from dictionary.libnames\n where libname = upcase(\"&_usr_LibrefUsingCasEngine.\") and upcase(sysname)=\"CASLIB\";\n quit;\n\n /*--------------------------------------------------------------------------------------*\n Note that we output a NOTE instead of an ERROR for the below condition since the \n execution context determines whether this is an error or just an informational note.\n *---------------------------------------------------------------------------------------*/\n %if \"&_usr_nameCaslib.\" = \"\" %then %put NOTE: The caslib name for the &_usr_LibrefUsingCasEngine. is blank.;\n \n%mend _usr_getNameCaslib;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to check if a given libref belongs to a SAS or CAS engine.\n\n Input:\n 1. sasCasLibref: a libref to be checked. Do not quote.\n 2. tableEngine: a flag to hold the table Engine value.\n 3. errorFlagName: a flag to populate an error code with.\n 4. errorFlagDesc: a flag to describe the error if one occurs.\n 5. sessionExists: an indicator (1) whether an active CAS session exists. If not(0),\n it will be created.\n \n Output:\n 1. 
tableEngine: populated with SAS or CAS\n 2. errorFlagName: populated with 1 if an error and 0 if not\n 3. errorFlagDesc: populated in case of an error\n*------------------------------------------------------------------------------------------*/\n\n%macro _sas_or_cas(sasCasLibref, tableEngine, errorFlagName, errorFlagDesc, sessionExists);\n\n /* Start a temporary CAS session and assign all caslibs when the caller reports that none exists. */\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ ;\n caslib _ALL_ assign;\n %end;\n\n /* Read the engine of the libref into the macro variable whose name is held in tableEngine. */\n proc sql noprint;\n select distinct Engine into:&&tableEngine. from dictionary.libnames where libname = upcase(\"&sasCasLibref.\");\n quit;\n\n %put \"&&&tableEngine.\";\n\n /* Engine V9 is reported back as SAS. The error flag is written to the variable named by errorFlagName, as in the other branches. */\n %if %sysfunc(compress(\"&&&tableEngine.\")) = \"V9\" %THEN %DO;\n data _null_;\n call symput(\"&tableEngine.\",\"SAS\");\n call symputx(\"&errorFlagName.\",0);\n call symput(\"&errorFlagDesc.\",\"\");\n run;\n %end;\n %else %if %sysfunc(compress(\"&&&tableEngine.\")) = \"CAS\" %THEN %DO;\n data _null_;\n call symputx(\"&errorFlagName.\",0);\n call symput(\"&errorFlagDesc.\",\"\");\n run;\n %END;\n %else %do;\n data _null_;\n call symputx(\"&errorFlagName.\",1);\n call symput(\"&errorFlagDesc.\",\"Unable to associate libref with either SAS or CAS. Check the input libref provided.\");\n run;\n %end;\n\n /* Terminate the temporary session started above. */\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ terminate;\n %end;\n \n%mend _sas_or_cas;\n\n\n/*-----------------------------------------------------------------------------------------*\n Macro to check if an in-memory table exists.\n\n Input:\n 1. tableName: name of the in-memory table\n 2. tableLib: caslib backing the in-memory table\n 3. sessionExists: an indicator (1) whether an active CAS session exists. If not(0),\n it will be created.\n \n Output:\n 1. tableExists: populated with 0 if does not exist, 1 if exists with local scope, \n 2 if exists with global scope\n\n*------------------------------------------------------------------------------------------*/ \n\n%macro _cas_table_exists(tableName, tableLib, sessionExists, tableExists);\n\n %if &sessionExists. 
= 0 %then %do;\n cas _temp_ss_ ;\n caslib _ALL_ assign;\n %end;\n\n proc cas;\n table.tableExists result = rc /\n name=\"&tableName.\",\n caslib=\"&tableLib.\"\n ;\n call symputx(\"&tableExists.\",rc.exists);\n quit;\n\n %if &sessionExists. = 0 %then %do;\n cas _temp_ss_ terminate;\n %end;\n \n%mend _cas_table_exists;\n\n/*-----------------------------------------------------------------------------------------*\n Macro to calculate singling out risk\n\n Input: invoked with current state of macro variables \n Output (implicit):\n 1. Singling Out Risk Results table\n 2. Singling Out Risk Queries table\n\n As the calculation of Singling Out Risk is based on an open-source Python package (anonymeter),\n we note the following citation: \n\n \"A Unified Framework for Quantifying Privacy Risk in Synthetic Data\", M. Giomi et al, PoPETS 2023. \n \n This bibtex entry can be used to refer to the paper:\n\n @misc{anonymeter,\n doi = {https://doi.org/10.56553/popets-2023-0055},\n url = {https://petsymposium.org/popets/2023/popets-2023-0055.php},\n journal = {Proceedings of Privacy Enhancing Technologies Symposium},\n year = {2023},\n author = {Giomi, Matteo and Boenisch, Franziska and Wehmeyer, Christoph and Tasnádi, Borbála},\n title = {A Unified Framework for Quantifying Privacy Risk in Synthetic Data},\n }\n\n\n*------------------------------------------------------------------------------------------*/ \n\n%macro _smt_singling_out_risk;\n\n %put NOTE: Singling out risk macro;\n/*-----------------------------------------------------------------------------------------*\n Check Results table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %global so_results_caslib;\n %_usr_getNameCaslib(&so_results_tbl_lib.);\n %let so_results_caslib=&_usr_nameCaslib.;\n %put NOTE: &so_results_caslib. 
is the caslib for the Singling Out Risk results table.;\n %let _usr_nameCaslib=;\n %if \"&so_results_caslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Singling Out Results table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Singling Out Results table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Check Queries table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %global so_queries_caslib;\n %_usr_getNameCaslib(&so_queries_tbl_lib.);\n %let so_queries_caslib=&_usr_nameCaslib.;\n %put NOTE: &so_queries_caslib. is the caslib for the Singling Out Risk queries table.;\n %let _usr_nameCaslib=;\n %if \"&so_queries_caslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Singling Out Queries table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Singling Out Queries table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n %end;\n %if &_smt_error_flag. = 0 %then %do;\n proc python infile=smtcode;\n quit;\n %end;\n\n\n%mend _smt_singling_out_risk;\n\n/*-----------------------------------------------------------------------------------------*\n EXECUTION CODE MACRO \n\n _smt prefix stands for SMOTE\n*------------------------------------------------------------------------------------------*/\n\n%macro _smt_execution_code;\n\n/*-----------------------------------------------------------------------------------------*\n Create an error flag. 
\n*------------------------------------------------------------------------------------------*/\n\n %_create_error_flag(_smt_error_flag, _smt_error_desc);\n\n/*-----------------------------------------------------------------------------------------*\n Account for edge cases where singling out risk has been requested even without a sample. \n*------------------------------------------------------------------------------------------*/\n data _null_;\n call symputx(\"singling_out_risk\",min(1, &singling_out_risk. * &sampling_percent.));\n run;\n\n %if &singling_out_risk.=0 %then %do;\n %put NOTE: Privacy risk assessment will not be carried out because a sample has not been specified.;\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check if an active CAS session exists. \n*------------------------------------------------------------------------------------------*/\n\n %_env_cas_checkSession(\"_smt_error_flag\", \"_smt_error_desc\");\n\n/*-----------------------------------------------------------------------------------------*\n Check Input table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n\n %global inputCaslib;\n %_usr_getNameCaslib(&inputTable_lib.);\n %let inputCaslib=&_usr_nameCaslib.;\n %put NOTE: &inputCaslib. is the caslib for the input table.;\n %let _usr_nameCaslib=;\n\n %if \"&inputCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Input table caslib is blank. Check if Base table is a valid CAS table.\");\n run;\n %put ERROR: Input table caslib is blank. Check if Base table is a valid CAS table. 
;\n %end;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check if input table exists.\n*------------------------------------------------------------------------------------------*/\n \n %global casTableExists;\n\n %if &_smt_error_flag. = 0 %then %do;\n %_cas_table_exists(&inputTable_name_base.,&inputTable_lib.,1,casTableExists);\n %if &casTableExists.=0 %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: The given CAS table does not seem to exist. Please check if it is loaded to CAS.\");\n run;\n %put ERROR: The given CAS table does not seem to exist. Please check if it is loaded to CAS.;\n %end; \n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check Output table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n\n %global outputCaslib;\n %_usr_getNameCaslib(&outputTable_lib.);\n %let outputCaslib=&_usr_nameCaslib.;\n %put NOTE: &outputCaslib. is the caslib for the output table.;\n %let _usr_nameCaslib=;\n\n %if \"&outputCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Output table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Output table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Check Assessment table libref to ensure it points to a valid caslib.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n\n %global assessmentCaslib;\n %_usr_getNameCaslib(&outputTable_lib.);\n %let assessmentCaslib=&_usr_nameCaslib.;\n %put NOTE: &assessmentCaslib. 
is the caslib for the assessment table.;\n %let _usr_nameCaslib=;\n\n %if \"&assessmentCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Assessment table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Assessment table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n\n %end;\n\n\n/*-----------------------------------------------------------------------------------------*\n Obtain list of input & nominal variables and store them in macro variables.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n %let blankSeparatedInputVars = %_flw_get_column_list(_flw_prefix=inputVars);\n %let blankSeparatedNominalVars = %_flw_get_column_list(_flw_prefix=nominalVars);\n %put NOTE: Input variables selected - &blankSeparatedInputVars.;\n %put NOTE: Nominal variables selected - &blankSeparatedNominalVars.;\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Create a program string based on selection of nominal variables.\n*------------------------------------------------------------------------------------------*/\n\n %if &_smt_error_flag. = 0 %then %do;\n %if &nominalVars_count.=0 %then %do;\n data _null_;\n call symput(\"nominalString\",\"\");\n run;\n %end;\n %else %do;\n data _null_;\n call symput(\"nominalString\",\"nominals=${&blankSeparatedNominalVars.},\");\n run;\n %end;\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Create a program string based on selection of class variables.\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. 
= 0 %then %do;\n %if &classVar_count.=0 %then %do;\n data _null_;\n call symput(\"classString\",\"\");\n call symput(\"classToAugment\",\"\");\n call symput(\"classAugmentString\",\"\");\n run;\n %end;\n %else %do;\n data _null_;\n call symput(\"classString\",\"classColumn=classColumnVar,\");\n call symput(\"classAugmentString\",\"classToAugment=class_to_augment,\");\n run;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Check if provenance flag name has been provided otherwise code as default\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if %sysfunc(compress(\"&prov_flag_name.\"))=\"\" %then %do;\n %put NOTE: Value not provided for provenance variable. Using default.;\n data _null_;\n call symput(\"prov_flag_name\",\"Synthetic_Data_Provenance\");\n run;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Check if assessment table (optional) has been provided otherwise code as default\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if &sampling_percent. > 0 %then %do;\n %put NOTE: Assessment table value is - &assessmentTable. ;\n %if %sysevalf(%superq(assessmentTable)=, boolean) %then %do;\n %put ERROR: An assessment table has not been attached. Please attach the same.;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"An assessment table has not been attached. Please attach the same.\");\n run;\n %end;\n %else %if \"%sysfunc(substr(&assessmentTable.,1,9))\"=\"WORK._flw\" %then %do;\n %put NOTE: Value not provided for assessment table. 
Using default.;\n data _null_;\n call symput(\"assessmentTable\",\"PUBLIC.SMOTE_ASSESSMENT\");\n call symput(\"assessmentTable_lib\",\"PUBLIC\");\n call symput(\"assessmentTable_name_base\",\"SMOTE_ASSESSMENT\");\n run;\n %end;\n %else %if %sysfunc(compress(\"&assessmentTable.\"))=\"\" %then %do;\n %put NOTE: Value not provided for assessment table. Using default.;\n data _null_;\n call symput(\"assessmentTable\",\"PUBLIC.SMOTE_ASSESSMENT\");\n call symput(\"assessmentTable_lib\",\"PUBLIC\");\n call symput(\"assessmentTable_name_base\",\"SMOTE_ASSESSMENT\");\n run;\n %end;\n %else %do;\n/*-----------------------------------------------------------------------------------------*\n Check Assessment table libref to ensure it points to a valid caslib.\n The libref of the attached assessment table (assessmentTable_lib) is checked here,\n not the output table libref.\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %global assessmentCaslib;\n %_usr_getNameCaslib(&assessmentTable_lib.);\n %let assessmentCaslib=&_usr_nameCaslib.;\n %put NOTE: &assessmentCaslib. is the caslib for the assessment table.;\n %let _usr_nameCaslib=;\n %if \"&assessmentCaslib.\" = \"\" %then %do;\n data _null_;\n call symputx(\"_smt_error_flag\",1);\n call symput(\"_smt_error_desc\",\"ERROR: Assessment table caslib is blank. Check if table is a valid CAS table.\");\n run;\n %put ERROR: Assessment table caslib is blank. Check if table is a valid CAS table. ;\n %end;\n %end;\n %end;\n %end;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Test data set created based on percent\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n %if &sampling_percent.=0 %then %do;\n data &outputTable_lib..__temp_smote;\n set &inputTable.;\n _PartInd_ = 0;\n run;\n %end;\n %else %do;\n proc partition data=&inputTable. partind samppct= &sampling_percent. 
seed=10 ;\n output out=&outputTable_lib..__temp_smote copyvars=(_all_);\n display 'SRSFreq';\n run;\n data &outputTable_lib..__temp_smote &outputTable_lib..__assess_orig;\n set &outputTable_lib..__temp_smote;\n if _PartInd_=0 then output &outputTable_lib..__temp_smote;\n else output &outputTable_lib..__assess_orig;\n run;\n/*-----------------------------------------------------------------------------------------*\n Add a provenance flag\n*------------------------------------------------------------------------------------------*/\n data &outputTable_lib..__assess_orig;\n length &prov_flag_name. $9.;\n set &outputTable_lib..__assess_orig;\n &prov_flag_name. = \"Original\";\n run;\n\n %end;\n\n %end;\n/*-----------------------------------------------------------------------------------------*\n Run SMOTE action\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n proc cas; \n numK = symget(\"numK\");\n inputTableCaslib = symget(\"inputCaslib\");\n inputTableName = symget(\"inputTable_name_base\");\n blankSeparatedNominalVars = symget(\"blankSeparatedNominalVars\");\n blankSeparatedInputVars = symget(\"blankSeparatedInputVars\");\n classColumnVar = symget(\"classVar\");\n classVarType = symget(\"classVar_1_Type\");\n classToAugment = symget(\"classToAugment\");\n numSamplesVar = symget(\"numSamples\");\n outputTableCaslib = symget(\"outputCaslib\");\n outputTableName = symget(\"outputTable_name_base\");\n seedNumber = symget(\"seedNumber\");\n numThreads = symget(\"numThreads\");\n extrapolation_factor = symget(\"extrapolationFactor\");\n\n if classVarType = \"Numeric\" then class_to_augment = classToAugment*1; \n else class_to_augment = classToAugment;\n\n smote.smoteSample result=r/\n table={name=\"__temp_smote\", caslib=outputTableCaslib, where='_PartInd_=0'},\n/* table={name=inputTableName, caslib=inputTableCaslib}, */\n k = numK,\n inputs=${&blankSeparatedInputVars.},\n 
&nominalString.\n &classString.\n &classAugmentString.\n seed=seedNumber,\n nThreads = numThreads,\n numSamples=numSamplesVar,\n extrapolationFactor=extrapolation_factor,\n casout={name=outputTableName,caslib= outputTableCaslib, replace=\"TRUE\"}\n ;\n print r;\n run;\n quit;\n %end;\n/*-----------------------------------------------------------------------------------------*\n Add a provenance flag\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n data &outputTable.;\n length &prov_flag_name. $9.;\n set &outputTable.;\n &prov_flag_name. = \"Synthetic\";\n run;\n\n %end;\n\n/*-----------------------------------------------------------------------------------------*\n Take a sample from synthetic data and merge with original data\n*------------------------------------------------------------------------------------------*/\n %if &_smt_error_flag. = 0 %then %do;\n proc sql noprint;\n select count(*) into: synth_records from &outputTable.;\n select count(*) into: orig_records from &inputTable.;\n quit;\n\n %put NOTE: Number of synthetic records - &synth_records.;\n %put NOTE: Number of original records - &orig_records.;\n %put NOTE: Sampling Percent provided - &sampling_percent.;\n \n data _null_;\n call symputx(\"synth_sampling_percent\",100*((&sampling_percent./100) * &orig_records. )/&synth_records.);\n run;\n %put NOTE: Synthetic Sampling Percent - &synth_sampling_percent.;\n\n %if &sampling_percent.=0 %then %do;\n/*-----------------------------------------------------------------------------------------*\n Block deliberately left empty for a future consideration\n*------------------------------------------------------------------------------------------*/\n %end;\n %else %do;\n proc partition data=&outputTable. partind samppct= &synth_sampling_percent. 
seed=10 ;\n output out=&outputTable_lib..__assess_synth copyvars=(_all_);\n display 'SRSFreq';\n run;\n data &assessmentTable.;\n set &outputTable_lib..__assess_orig &outputTable_lib..__assess_synth (where=(_PartInd_=1));\n keep &prov_flag_name. &blankSeparatedInputVars.;\n run;\n proc datasets lib=&outputTable_lib. nolist nodetails;\n delete __assess_orig __assess_synth ;\n quit;\n/*-----------------------------------------------------------------------------------------*\n Check and address singling out risk\n*------------------------------------------------------------------------------------------*/\n %if &singling_out_risk.=1 %then %do;\n\n %_smt_singling_out_risk;\n\n %end;\n\n %end;\n proc datasets lib=&outputTable_lib. nolist nodetails;\n delete __temp_smote;\n quit;\n %end;\n\n\n%mend _smt_execution_code; \n\n/*-----------------------------------------------------------------------------------------*\n END MACROS\n*------------------------------------------------------------------------------------------*/\n\n/*-----------------------------------------------------------------------------------------*\n EXECUTION CODE\n*------------------------------------------------------------------------------------------*/\n \n/*-----------------------------------------------------------------------------------------*\n Create Runtime Trigger\n*------------------------------------------------------------------------------------------*/\n%_create_runtime_trigger(_smt_run_trigger);\n\n/*-----------------------------------------------------------------------------------------*\n Execute \n*------------------------------------------------------------------------------------------*/\n\n\n\n%if &_smt_run_trigger. = 1 %then %do;\n\n %_smt_execution_code;\n\n%end;\n\n%if &_smt_run_trigger. = 0 %then %do;\n\n %put NOTE: This step has been disabled. Nothing to do.;\n\n%end;\n\n\n%put NOTE: Final summary;\n%put NOTE: Status of error flag - &_smt_error_flag. 
;\n%put &_smt_error_desc.;\n%put NOTE: Error desc - &_smt_error_desc. ;\n\n/*-----------------------------------------------------------------------------------------*\n END EXECUTION CODE\n*------------------------------------------------------------------------------------------*/\n/*-----------------------------------------------------------------------------------------*\n Clean up existing macro variables and macro definitions.\n*------------------------------------------------------------------------------------------*/\n\n%if %symexist(inputCaslib) %then %do;\n %symdel inputCaslib;\n%end;\n\n%if %symexist(outputCaslib) %then %do;\n %symdel outputCaslib;\n%end;\n\n%if %symexist(assessmentCaslib) %then %do;\n %symdel assessmentCaslib;\n%end;\n\n%if %symexist(so_results_caslib) %then %do;\n %symdel so_results_caslib;\n%end;\n\n%if %symexist(so_queries_caslib) %then %do;\n %symdel so_queries_caslib;\n%end;\n\n%if %symexist(casTableExists) %then %do;\n %symdel casTableExists;\n%end;\n\n%if %symexist(prov_flag_name) %then %do;\n %symdel prov_flag_name;\n%end;\n\n%if %symexist(_smt_run_trigger) %then %do;\n %symdel _smt_run_trigger;\n%end;\n\n%if %symexist(_smt_error_flag) %then %do;\n %symdel _smt_error_flag;\n%end;\n\n%if %symexist(_smt_error_desc) %then %do;\n %symdel _smt_error_desc;\n%end;\n\n%if %symexist(casSessionExists) %then %do;\n %symdel casSessionExists;\n%end;\n\n%sysmacdelete _create_error_flag;\n%sysmacdelete _create_runtime_trigger;\n%sysmacdelete _env_cas_checkSession;\n%sysmacdelete _usr_getNameCaslib;\n%sysmacdelete _sas_or_cas;\n%sysmacdelete _cas_table_exists;\n%sysmacdelete _smt_execution_code;\n%sysmacdelete _smt_singling_out_risk;\n\nfilename smtcode clear;\n"},"properties":{},"ui":"{\n\t\"showPageContentOnly\": true,\n\t\"pages\": [\n\t\t{\n\t\t\t\"id\": \"page1\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"Parameters\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section1\",\n\t\t\t\t\t\"type\": 
\"section\",\n\t\t\t\t\t\"label\": \"Input Table\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text2\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Connect a SAS Cloud Analytics Services (CAS) table to the input port of this step.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"inputTable\",\n\t\t\t\t\t\t\t\"type\": \"inputtable\",\n\t\t\t\t\t\t\t\"label\": \"Select input table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"parameters_section\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Parameters\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"numK\",\n\t\t\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\t\t\"label\": \"Select number of nearest neighbors:\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"integer\": true,\n\t\t\t\t\t\t\t\"min\": 1,\n\t\t\t\t\t\t\t\"max\": 1000,\n\t\t\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"inputVars\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select input columns:\",\n\t\t\t\t\t\t\t\"include\": null,\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 1,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": \"inputTable\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"nominalVars\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select nominal columns:\",\n\t\t\t\t\t\t\t\"include\": \"inputVars\",\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": 
null\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"classVar\",\n\t\t\t\t\t\t\t\"type\": \"columnselector\",\n\t\t\t\t\t\t\t\"label\": \"Select a class column to augment:\",\n\t\t\t\t\t\t\t\"include\": \"inputVars\",\n\t\t\t\t\t\t\t\"order\": false,\n\t\t\t\t\t\t\t\"columntype\": \"a\",\n\t\t\t\t\t\t\t\"max\": 1,\n\t\t\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"table\": null\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"classToAugment\",\n\t\t\t\t\t\t\t\"type\": \"dropdown\",\n\t\t\t\t\t\t\t\"label\": \"Select a class value to augment:\",\n\t\t\t\t\t\t\t\"items\": {\n\t\t\t\t\t\t\t\t\"ref\": \"classVar\"\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": \"$classVar\",\n\t\t\t\t\t\t\t\"enabled\": \"$classVar\",\n\t\t\t\t\t\t\t\"indent\": 1\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section9\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Output specification\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"numSamples\",\n\t\t\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\t\t\"label\": \"Number of synthetic observations:\",\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\t\t\"min\": 10,\n\t\t\t\t\t\t\t\"excludemin\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"integer\": true\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text1\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Connect a SAS Cloud Analytics Services (CAS) table to the output port of this step.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"outputTable\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide output table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": 
\"\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"assessmentTable\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide assessment table (optional):\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"placeholder\": \"PUBLIC.SMOTE_ASSESSMENT\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text3\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Note: Assessment table will be populated only if a value greater than 0 has been selected for the sampling percentage (Configuration tab).\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"privacyRisk\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Privacy Risk\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t0\n\t\t\t\t\t],\n\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t0\n\t\t\t\t\t],\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"singling_out_risk\",\n\t\t\t\t\t\t\t\"type\": \"checkbox\",\n\t\t\t\t\t\t\t\"label\": \"Measure Singling Out Risk\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": 
[\n\t\t\t\t\t\t\t\t\"$sampling_percent\",\n\t\t\t\t\t\t\t\t\">\",\n\t\t\t\t\t\t\t\t0\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"evaluation_mode\",\n\t\t\t\t\t\t\t\"type\": \"dropdown\",\n\t\t\t\t\t\t\t\"label\": \"Select evaluation mode:\",\n\t\t\t\t\t\t\t\"items\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"value\": \"univariate\",\n\t\t\t\t\t\t\t\t\t\"label\": \"univariate\"\n\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"value\": \"multivariate\",\n\t\t\t\t\t\t\t\t\t\"label\": \"multivariate\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"indent\": 2,\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"conf_interval\",\n\t\t\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\t\t\"label\": \"Select confidence level (as a proportion, 0.90 to 0.99):\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"integer\": false,\n\t\t\t\t\t\t\t\"min\": 0.9,\n\t\t\t\t\t\t\t\"max\": 0.99,\n\t\t\t\t\t\t\t\"stepsize\": 0.01,\n\t\t\t\t\t\t\t\"indent\": 2,\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"s_o_attacks\",\n\t\t\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\t\t\"label\": \"Provide number of singling out attacks to simulate:\",\n\t\t\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\t\t\"integer\": true,\n\t\t\t\t\t\t\t\"min\": 10,\n\t\t\t\t\t\t\t\"max\": 2000,\n\t\t\t\t\t\t\t\"stepsize\": 1,\n\t\t\t\t\t\t\t\"visible\": 
[\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"indent\": 2,\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"so_results_tbl\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide singling out risk results table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"indent\": 2\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"so_queries_tbl\",\n\t\t\t\t\t\t\t\"type\": \"outputtable\",\n\t\t\t\t\t\t\t\"label\": \"Provide singling out risk queries table:\",\n\t\t\t\t\t\t\t\"required\": true,\n\t\t\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\t\t\"visible\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"enabled\": [\n\t\t\t\t\t\t\t\t\"$singling_out_risk\",\n\t\t\t\t\t\t\t\t\"=\",\n\t\t\t\t\t\t\t\ttrue\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"indent\": 2\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t}\n\t\t\t]\n\t\t},\n\t\t{\n\t\t\t\"id\": \"page2\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"Configuration\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"numThreads\",\n\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\"label\": \"Select number of threads:\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"integer\": true,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"max\": 1024,\n\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"seedNumber\",\n\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\"label\": \"Select a 
seed:\",\n\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\"min\": null,\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"extrapolationFactor\",\n\t\t\t\t\t\"type\": \"numberfield\",\n\t\t\t\t\t\"label\": \"Provide extrapolation factor (to perturb input data boundary):\",\n\t\t\t\t\t\"placeholder\": \"\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"max\": null,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"excludemin\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"integer\": false\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"prov_flag_name\",\n\t\t\t\t\t\"type\": \"textfield\",\n\t\t\t\t\t\"label\": \"Edit provenance variable name:\",\n\t\t\t\t\t\"placeholder\": \"Synthetic_Data_Provenance\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"sampling_percent\",\n\t\t\t\t\t\"type\": \"numstepper\",\n\t\t\t\t\t\"label\": \"Provide sampling percentage for assessment:\",\n\t\t\t\t\t\"required\": false,\n\t\t\t\t\t\"integer\": false,\n\t\t\t\t\t\"min\": 0,\n\t\t\t\t\t\"max\": 99,\n\t\t\t\t\t\"stepsize\": 1\n\t\t\t\t}\n\t\t\t]\n\t\t},\n\t\t{\n\t\t\t\"id\": \"about\",\n\t\t\t\"type\": \"page\",\n\t\t\t\"label\": \"About\",\n\t\t\t\"children\": [\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_description\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Synthetic Minority Oversampling TEchnique (SMOTE) \\n=====================================\\n\\nThis custom step helps you generate synthetic data based on an input table, using the Synthetic Minority Oversampling TEchnique (SMOTE). SMOTE is an oversampling technique which identifies new data observations in the neighborhood of closely associated original observations. \\n\\nSMOTE is an alternative approach to Generative Adversarial Networks (GANs) for generating synthetic tabular data. 
Access to synthetic data helps you make better, data-informed decisions in situations where you have imbalanced, scant, poor quality, unobservable, or restricted data.\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section8\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Prerequisites\",\n\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text4\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"1. A SAS Viya 4 environment, preferably monthly stable 2024.10 or later\\n\\n2. A Visual Data Mining and Machine Learning (VDMML) license, usually provided with SAS Viya, SAS Viya Enterprise or Advanced.\\n\\n3. An active SAS Cloud Analytics Services (CAS) connection during runtime.\\n\\n4. The smote.smoteSample CAS action requires Python configuration, as specified in SAS documentation. Please work with your SAS administrator to have the same configured. Specifically, ensure the following:\\n\\n 1. The correct version of Python is installed (as of version 2024.10, this was 3.11.x) \\n 2. [sas-ipc-queue](https://pypi.org/project/sas-ipc-queue/) , version 0.7.0 or later \\n 3. [hnswlib](https://pypi.org/project/hnswlib/)\\n 4. [protobuf](https://pypi.org/project/protobuf/)\\n\\n(OPTIONAL) Prerequisites for Singling Out Risk calculation\\n\\nIf you want to measure singling out risk (provided as an option in this step), note the following additional prerequisites:\\n\\n1. SAS compute session should be configured to access a Python runtime of version > 3.7 and < 3.12.\\n\\n2. A Python package - anonymeter - should be installed in the above runtime. Make note of details about anonymeter at https://pypi.org/project/anonymeter/\\n\\n3. 
As a further dependency on above, anonymeter requires NumPy version 1.22 or later and earlier than 1.27 (specifically, \\\"numpy >=1.22, <1.27\\\", # limited by Numba support)\\n\\nNote terms of anonymeter license here: https://github.com/statice/anonymeter/blob/main/LICENSE.md\\n\\nNote citation in Privacy Risk section below.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"section10\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Assumptions\",\n\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"text6\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Current assumptions for this initial version (future versions may improve upon the same):\\n\\n1. Users choose either an existing Chroma DB vector database collection or load PDF or CSV files to an existing or new Chroma DB collection.\\n\\n2. Users may load all PDFs in a directory on the SAS Server (filesystem), or select a PDF / CSV of their choice.\\n\\n3. The code assumes use of a Chroma DB vector store. Users may choose to replace this with other supported vector stores.\\n\\n4. The code uses the langchain LLM framework. \\n\\n5. PDFs (containing text) and single CSV files are currently the only loadable file format allowed. Users are however free to ingest various other document types into a Chroma DB collection beforehand, using the \\\"Vector Databases - Hydrate Chroma DB collection\\\" SAS Studio Custom Step (refer documentation)\\n\\n6. User has already configured Azure OpenAI to deploy both an embedding function and LLM service, or knows the deployment names. 
\\n\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_parameters\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Parameters\",\n\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_input\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Input parameters\",\n\t\t\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"input_parameters_text\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Input table (input port, required): connect a CAS table to the input port.\\n\\n2. Nearest neighbors (numeric stepper, default 5): select the number of nearest neighbours to be used by the SMOTE algorithm as the basis for identifying candidate synthetic points.\\n\\n3. Input columns (column selector): select all inputs for the SMOTE process. You would also need to include the class and any nominal columns.\\n\\n4. Nominal variables (column selector): select any nominal variables you wish to use. Your nominal variables are required to be in the inputs column list.\\n\\n5. Select a class column (column selector, optional): select a column if you wish to use SMOTE in order to balance or augment a level within the class column. Be judicious in the choice of this column since a column with a high number of levels may slow down or even fail the process. Your class column is required to be in the inputs column list.\\n\\n6. Class to augment (drop-down list, values from class column if selected): select the level of the class variable you wish to augment. 
The values that appear here depend on the data that's contained in the class column, so may take time to populate based on actual data and number of levels.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_output_specs\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Output specifications\",\n\t\t\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"output_parameters_text\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Number of synthetic observations (numeric field): specify the number of synthetic observations you would like in the output table.\\n\\n2. Output table (output port): attach a CAS table to the output port to hold results.\\n\\n3. Assessment table (output port, optional): Attach a CAS table to the second output port (assessmentTable) of this step in case you select a sampling percentage (refer Configuration tab, pt #5) greater than 0. A default value of PUBLIC.SMOTE_ASSESSMENT is assigned in case you do not specify a table.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_privacy_risk\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Privacy Risk\",\n\t\t\t\t\t\t\t\"open\": false,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"pr_parameters_text\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"Synthetic data requires assurances on data privacy. One aspect of privacy risk is singling out risk, which evolved alongside General Data Protection Regulation (GDPR). This is an optional step. If you wish to measure singling out risk, enter the parameters below.\\n\\n1. 
Measure Singling Out Risk (check box, default not checked): select this option if you want to measure singling out risk. Be aware of the Python dependencies (in Prerequisites section) and the fact that this involves a longer runtime in addition to the generation operation.\\n\\n2. Evaluation mode (drop-down list): select either univariate or multivariate to define the type of attack query to be tested.\\n\\n3. Confidence interval (numeric stepper, entered as a fraction, default 0.95): select a value from 0.90 to 0.99 to define the confidence level while providing privacy risk estimates.\\n\\n4. Number of attacks (numeric stepper, default 100): enter number of attacks (queries) to simulate.\\n\\n5. Singling Out Risk Results table (output port): attach a CAS table to the so_results_tbl output port to hold results.\\n\\n6. Singling Out Risk Queries table (output port): attach a CAS table to the so_queries_tbl output port to hold the attack queries.\\n\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"section2\",\n\t\t\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\t\t\"label\": \"Citation\",\n\t\t\t\t\t\t\t\t\t\"open\": true,\n\t\t\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\t\t\"id\": \"text5\",\n\t\t\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\t\t\"text\": \"Since we make use of an open-source package, anonymeter, to perform these calculations, we provide the following citation as mentioned in package documentation: \\n\\n\\\"A Unified Framework for Quantifying Privacy Risk in Synthetic Data\\\", M. Giomi et al, PoPETS 2023. 
\\n\\nThis bibtex entry can be used to refer to the paper:\\n\\n@misc{anonymeter,\\n doi = {https://doi.org/10.56553/popets-2023-0055},\\n url = {https://petsymposium.org/popets/2023/popets-2023-0055.php},\\n journal = {Proceedings of Privacy Enhancing Technologies Symposium},\\n year = {2023},\\n author = {Giomi, Matteo and Boenisch, Franziska and Wehmeyer, Christoph and Tasnádi, Borbála},\\n title = {A Unified Framework for Quantifying Privacy Risk in Synthetic Data},\\n}\\n\",\n\t\t\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t\t]\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t},\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"parameters_config_1\",\n\t\t\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\t\t\"label\": \"Configuration \",\n\t\t\t\t\t\t\t\"open\": 1,\n\t\t\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"id\": \"output_parameters_text_1\",\n\t\t\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\t\t\"text\": \"1. Number of threads (numeric stepper, optional): most of the time, you do not need to modify this. Change if you need to especially control the number of threads in which the process runs.\\n\\n2. Select a seed (numeric field, optional): specify a seed number to establish (but not completely guarantee) some level of reproducibility with respect to results.\\n\\n3. Select extrapolation factor: specify a number (double) to use as a standard deviation in order to perturb (add noise or randomness) the input data boundaries.\\n\\n4. Name for synthetic data provenance variable (optional, default provided): insert a value only if you want to give a special name to a flag that indicates this data is synthetic. Otherwise, a default name of Synthetic_Data_Provenance will be used instead.\\n\\n5. Provide sampling percentage for assessment (numeric stepper, default is 0): Enter a value between 0 and 99 to sample a test dataset before running SMOTE. 
Note that the test dataset will NOT be used to generate data. The test dataset will be appended with an equal number of observations from the synthetic dataset (along with the provenance flag described in #4) and can be used for assessment purposes.\",\n\t\t\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_runtimecontrol\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Run-time Control\",\n\t\t\t\t\t\"open\": 0,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"runtimecontrol_text\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"Note: Run-time control is optional. You may choose whether to execute the main code of this step or not, based on upstream conditions set by earlier SAS programs. This includes nodes run prior to this custom step earlier in a SAS Studio Flow, or a previous program in the same session.\\n\\nRefer this blog (https://communities.sas.com/t5/SAS-Communities-Library/Switch-on-switch-off-run-time-control-of-SAS-Studio-Custom-Steps/ta-p/885526) for more details on the concept.\\n\\nThe following macro variable,\\n\\n_smt_run_trigger\\n\\nwill initialize with a value of 1 by default, indicating an \\\"enabled\\\" status and allowing the custom step to run.\\n\\nIf you wish to control execution of this custom step, include code in an upstream SAS program to set this variable to 0. This \\\"disables\\\" execution of the custom step.\\n\\nTo \\\"disable\\\" this step, run the following code upstream:\\n\\n%global _smt_run_trigger;\\n%let _smt_run_trigger =0;\\n\\nTo \\\"enable\\\" this step again, run the following (it's assumed that this has already been set as a global variable):\\n\\n%let _smt_run_trigger =1;\\n\\nIMPORTANT: Be aware that disabling this step means that none of its main execution code will run, and any downstream code which was dependent on this code may fail. 
Change this setting only if it aligns with the objective of your SAS Studio program.\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"about_documentation\",\n\t\t\t\t\t\"type\": \"section\",\n\t\t\t\t\t\"label\": \"Documentation\",\n\t\t\t\t\t\"open\": 0,\n\t\t\t\t\t\"visible\": \"\",\n\t\t\t\t\t\"children\": [\n\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\"id\": \"documentation_text\",\n\t\t\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\t\t\"text\": \"1. Documentation on the smote.smoteSample CAS action: https://go.documentation.sas.com/doc/en/pgmsascdc/default/casactml/casactml_smote_details01.htm\\n\\n2. PyPi page for sas-ipc-queue. While an implementation detail for the smoteSample action, this is a good place to highlight efficient memory handling techniques, for interested programmers: https://pypi.org/project/sas-ipc-queue/\\n\\n3. PyPi page for hnswlib, an algorithm for fast approximate nearest neighbour search, used in smoteSample: https://pypi.org/project/hnswlib/\\n\\n4. 
PyPi page for protobuf (Python package used within action): https://pypi.org/project/protobuf/\\n\\n\",\n\t\t\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t\t\t}\n\t\t\t\t\t]\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"version_text\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Version: 1.3.1 (10DEC2024)\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t},\n\t\t\t\t{\n\t\t\t\t\t\"id\": \"contact_text\",\n\t\t\t\t\t\"type\": \"text\",\n\t\t\t\t\t\"text\": \"Created/contact: \\n\\n- Sundaresh Sankaran (sundaresh.sankaran@sas.com)\\n- Josiah Chua (josiah.chua@sas.com)\\n\\nAcknowledgements to others for their help on details, testing or exploring the area: \\n- David Olaleye (david.olaleye@sas.com)\\n- Suneel Grover (suneel.grover@sas.com)\\n- Reza Nazari (reza.nazari@sas.com)\\n- SAS Analytics R&D team\",\n\t\t\t\t\t\"visible\": \"\"\n\t\t\t\t}\n\t\t\t]\n\t\t}\n\t],\n\t\"syntaxversion\": \"1.3.0\",\n\t\"values\": {\n\t\t\"inputTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"numK\": 5,\n\t\t\"inputVars\": [],\n\t\t\"nominalVars\": [],\n\t\t\"classVar\": [],\n\t\t\"classToAugment\": null,\n\t\t\"numSamples\": 100,\n\t\t\"outputTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"assessmentTable\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"singling_out_risk\": false,\n\t\t\"evaluation_mode\": {\n\t\t\t\"value\": \"multivariate\",\n\t\t\t\"label\": \"multivariate\"\n\t\t},\n\t\t\"conf_interval\": 0.95,\n\t\t\"s_o_attacks\": 100,\n\t\t\"so_results_tbl\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"so_queries_tbl\": {\n\t\t\t\"library\": \"\",\n\t\t\t\"table\": \"\"\n\t\t},\n\t\t\"numThreads\": 0,\n\t\t\"seedNumber\": 123,\n\t\t\"extrapolationFactor\": 0,\n\t\t\"prov_flag_name\": \"Synthetic_Data_Provenance\",\n\t\t\"sampling_percent\": 
0\n\t}\n}","flowMetadata":{"inputPorts":[{"name":"inputTable","displayName":"inputTable","minEntries":1,"maxEntries":1,"type":"table"}],"outputPorts":[{"name":"outputTable","displayName":"outputTable","minEntries":1,"maxEntries":1,"columnDelta":null,"type":"table"},{"name":"assessmentTable","displayName":"assessmentTable","minEntries":0,"maxEntries":1,"columnDelta":null,"type":"table"},{"name":"so_results_tbl","displayName":"so_results_tbl","minEntries":1,"maxEntries":1,"columnDelta":null,"type":"table"},{"name":"so_queries_tbl","displayName":"so_queries_tbl","minEntries":1,"maxEntries":1,"columnDelta":null,"type":"table"}]}} \ No newline at end of file diff --git a/SDG - Generate Synthetic Data through SMOTE/extras/SDG - Generate Synthetic Data through SMOTE.sas b/SDG - Generate Synthetic Data through SMOTE/extras/SDG - Generate Synthetic Data through SMOTE.sas index e2459d7a..ec6208df 100755 --- a/SDG - Generate Synthetic Data through SMOTE/extras/SDG - Generate Synthetic Data through SMOTE.sas +++ b/SDG - Generate Synthetic Data through SMOTE/extras/SDG - Generate Synthetic Data through SMOTE.sas @@ -3,7 +3,7 @@ /* -------------------------------------------------------------------------------------------* Synthetic Data Generation (SDG) - Generate Synthetic Data through SMOTE - v 1.2 (11NOV2024) + v 1.3.1 (10DEC2024) This program generates synthetic data using the Synthetic Minority Oversampling TEchnique and is meant for use within a SAS Studio Custom Step. 
Please modify requisite macro variables @@ -55,6 +55,155 @@ *------------------------------------------------------------------------------------------*/ +/*-----------------------------------------------------------------------------------------* + Python Block Definition +*------------------------------------------------------------------------------------------*/ + +/*-----------------------------------------------------------------------------------------* + The following block of code has been created for the purpose of allowing proc python + to execute within a macro. Execution within a macro allows for other checks to be carried + out through SAS prior to handing off to the Python step. + + In this example, a temporary file is created containing the requisite Python commands, which + are then executed through infile reference. + + Note that Python code is pasted as-is and may be out of line with the SAS indentation followed. + + This Python block comes into operation only upon the selection of Privacy Risk (Singling + Out Risk) metrics. 
+ +*------------------------------------------------------------------------------------------*/ +filename smtcode temp; + +data _null_; + + length line $32767; * max SAS character size ; + infile datalines4 truncover pad; + input ; + file smtcode; + line = strip(_infile_); * line without leading and trailing blanks ; + l1 = length(trimn(_infile_)); * length of line without trailing blanks ; + l2 = length(line); * length of line without leading and trailing blanks ; + first_position=l1-l2+1; * position where the line should start (alignment) ; + if (line eq ' ') then put @1; * empty line ; + else put @first_position line; * line without leading and trailing blanks correctly aligned ; + + datalines4; +# Imports +_smt_error_flag = int(SAS.symget("_smt_error_flag")) +_smt_error_desc = SAS.symget("_smt_error_desc") + + +citation = """ + + Calculated using anonymeter (https://pypi.org/project/anonymeter/) + + "A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi et al, PoPETS 2023. 
+ + +""" + + +try: + import os + import swat + import json + from anonymeter.evaluators import SinglingOutEvaluator +except ImportError as ie: + _smt_error_flag = 1 + _smt_error_desc = ie + SAS.symput("_smt_error_flag",_smt_error_flag) + SAS.symput("_smt_error_desc",_smt_error_desc) + SAS.logMessage(_smt_error_desc,"error") + +if _smt_error_flag ==0: + # Obtain values from UI & SAS macro variables + evaluation_mode = SAS.symget('evaluation_mode') + conf_interval = float(SAS.symget('conf_interval')) + s_o_attacks = int(SAS.symget('s_o_attacks')) + singling_out_results_tbl = SAS.symget('singling_out_results_tbl') + singling_out_queries_tbl = SAS.symget('singling_out_queries_tbl') + cas_session_exists = SAS.symget('casSessionExists') + assessment_table_name = SAS.symget('assessmentTable_name_base') + assessment_table_caslib = SAS.symget('assessmentCaslib') + input_caslib = SAS.symget('inputCaslib') + input_table_name = SAS.symget('inputTable_name_base') + so_queries_tbl = SAS.symget('so_queries_tbl_name_base') + so_results_tbl = SAS.symget('so_results_tbl_name_base') + so_queries_caslib = SAS.symget('so_queries_caslib') + so_results_caslib = SAS.symget('so_results_caslib') + + # Retrieve values for SAS options cashost and casport, these are needed by SWAT connection + cas_host_name = SAS.sasfnc('getoption','cashost') + cas_host_port = SAS.sasfnc('getoption','casport') + + # Add certificate location to operating system list of trusted certs + os.environ['CAS_CLIENT_SSL_CA_LIST'] = os.environ['SSLCALISTLOC'] + + + # Connect to CAS + if cas_session_exists == '1': + cas_session_uuid = SAS.symget('casSessionUUID') + SAS.logMessage(f"CAS connection exists. Session UUID is {cas_session_uuid}") + conn = swat.CAS(hostname = cas_host_name, port = cas_host_port, password = os.environ['SAS_SERVICES_TOKEN'], session = cas_session_uuid) + if conn: + SAS.logMessage('SWAT connection established.') + else: + SAS.logMessage('ERROR: No active CAS session. 
Connect to a CAS session in upstream step in the flow.') + _smt_error_flag = 1 + _smt_error_desc = "ERROR: No active CAS session. Connect to a CAS session in upstream step in the flow." + + df_org = conn.CASTable(name=input_table_name, caslib=input_caslib).to_frame() + df_syn = conn.CASTable(name=assessment_table_name, caslib=assessment_table_caslib, where="Synthetic_Data_Provenance='Synthetic'").to_frame() + df_con = conn.CASTable(name=assessment_table_name, caslib=assessment_table_caslib, where="Synthetic_Data_Provenance='Original'").to_frame() + + + evaluator = SinglingOutEvaluator(ori=df_org, syn=df_syn, control=df_con, n_attacks=s_o_attacks) + + try: + evaluator.evaluate(mode=evaluation_mode) + risk = evaluator.risk(confidence_level=conf_interval) + print(risk) + + except RuntimeError as ex: + _smt_error_flag = 1 + _smt_error_desc = f"Singling out evaluation failed with {ex}. Please re-run this operation. For more stable results increase `n_attacks`. Note that this will make the evaluation slower." + SAS.symput("_smt_error_flag",1) + SAS.symput("_smt_error_desc",_smt_error_desc) + +# Create a summary (title section will be modified in future version based on adding more metrics) +# SAS.submit("title 'Singling Out Risk: Summary'; run;") + +if _smt_error_flag == 0: + summary = f"Singling out privacy risk has been found to be {risk.value} between a confidence interval of {risk.ci[0]} and {risk.ci[1]}" + query_status = f"{len(evaluator.queries())} queries were successful attacks." 
+ # Print to SAS results window + SAS.submit(f"ods text = 'Singling Out Risk: Summary';") + SAS.submit(f"ods text = '{summary}';") + SAS.submit(f"ods text = '{query_status}';") + SAS.submit(f"ods text = '{citation}';") + SAS.logMessage(citation) + citation_col = [] + for a in range(0,len(evaluator.queries())): + citation_col.append(citation) + # Define table for results and queries + so_results_table = conn.CASTable(name=so_results_tbl, caslib=so_results_caslib, replace=True) + so_queries_table = conn.CASTable(name=so_queries_tbl, caslib=so_queries_caslib, replace=True) + # Create a Results dict + so_res = evaluator.results() + res_dict = {"Citation":[citation], "Privacy_Risk": [risk.value], "Privacy_Risk_Conf_Interval_Lower": [risk.ci[0]],"Privacy_Risk_Conf_Interval_Upper": [risk.ci[1]],"Attack_Rate":[so_res.attack_rate.value],"Attack_Rate_Error":[so_res.attack_rate.error], "Baseline_Rate":[so_res.baseline_rate.value],"Baseline_Rate_Error":[so_res.baseline_rate.error], "Control_Rate":[so_res.control_rate.value],"Control_Rate_Error":[so_res.control_rate.error], "N_Attacks":[so_res.n_attacks],"N_Success":[so_res.n_success], "N_Baseline": [so_res.n_baseline],"N_Control":[so_res.n_control] } + # Load Results to a CAS table + so_results_table.from_dict(data=res_dict, connection=conn, casout=so_results_table) + SAS.logMessage("Results table loaded to CAS.") + # Load Queries to a CAS table + so_queries_table.from_dict(data={"Query":evaluator.queries(), "Citation": citation_col}, connection=conn, casout=so_queries_table) + SAS.logMessage("Queries table loaded to CAS.") + + +;;;; + + +run; /*-----------------------------------------------------------------------------------------* MACROS *------------------------------------------------------------------------------------------*/ @@ -129,7 +278,7 @@ *------------------------------------------------------------------------------------------*/ %macro _env_cas_checkSession(errorFlagName, errorFlagDesc); - + %global 
casSessionExists; %if %sysfunc(symexist(_current_uuid_)) %then %do; %symdel _current_uuid_; %end; @@ -298,7 +447,78 @@ %end; %mend _cas_table_exists; - + +/*-----------------------------------------------------------------------------------------* + Macro to calculate singling out risk + + Input: invoked with current state of macro variables + Output (implicit): + 1. Singling Out Risk Results table + 2. Singling Out Risk Queries table + + As the calculation of Singling Out Risk is based on an open-source Python package (anonymeter), + we note the following citation: + + "A Unified Framework for Quantifying Privacy Risk in Synthetic Data", M. Giomi et al, PoPETS 2023. + + This bibtex entry can be used to refer to the paper: + + @misc{anonymeter, + doi = {https://doi.org/10.56553/popets-2023-0055}, + url = {https://petsymposium.org/popets/2023/popets-2023-0055.php}, + journal = {Proceedings of Privacy Enhancing Technologies Symposium}, + year = {2023}, + author = {Giomi, Matteo and Boenisch, Franziska and Wehmeyer, Christoph and Tasnádi, Borbála}, + title = {A Unified Framework for Quantifying Privacy Risk in Synthetic Data}, + } + + +*------------------------------------------------------------------------------------------*/ + +%macro _smt_singling_out_risk; + + %put NOTE: Singling out risk macro; +/*-----------------------------------------------------------------------------------------* + Check Results table libref to ensure it points to a valid caslib. +*------------------------------------------------------------------------------------------*/ + %if &_smt_error_flag. = 0 %then %do; + %global so_results_caslib; + %_usr_getNameCaslib(&so_results_tbl_lib.); + %let so_results_caslib=&_usr_nameCaslib.; + %put NOTE: &so_results_caslib. is the caslib for the Singling Out Risk results table.; + %let _usr_nameCaslib=; + %if "&so_results_caslib." 
= "" %then %do; + data _null_; + call symputx("_smt_error_flag",1); + call symput("_smt_error_desc","ERROR: Singling Out Results table caslib is blank. Check if table is a valid CAS table."); + run; + %put ERROR: Singling Out Results table caslib is blank. Check if table is a valid CAS table. ; + %end; + %end; +/*-----------------------------------------------------------------------------------------* + Check Queries table libref to ensure it points to a valid caslib. +*------------------------------------------------------------------------------------------*/ + %if &_smt_error_flag. = 0 %then %do; + %global so_queries_caslib; + %_usr_getNameCaslib(&so_queries_tbl_lib.); + %let so_queries_caslib=&_usr_nameCaslib.; + %put NOTE: &so_queries_caslib. is the caslib for the Singling Out Risk queries table.; + %let _usr_nameCaslib=; + %if "&so_queries_caslib." = "" %then %do; + data _null_; + call symputx("_smt_error_flag",1); + call symput("_smt_error_desc","ERROR: Singling Out Queries table caslib is blank. Check if table is a valid CAS table."); + run; + %put ERROR: Singling Out Queries table caslib is blank. Check if table is a valid CAS table. ; + %end; + %end; + %if &_smt_error_flag. = 0 %then %do; + proc python infile=smtcode; + quit; + %end; + + +%mend _smt_singling_out_risk; /*-----------------------------------------------------------------------------------------* EXECUTION CODE MACRO @@ -314,6 +534,17 @@ %_create_error_flag(_smt_error_flag, _smt_error_desc); +/*-----------------------------------------------------------------------------------------* + Account for edge cases where singling out risk has been requested even without a sample. +*------------------------------------------------------------------------------------------*/ + data _null_; + call symputx("singling_out_risk",min(1, &singling_out_risk. 
* &sampling_percent.)); + run; + + %if &singling_out_risk.=0 %then %do; + %put NOTE: Privacy risk assessment will not be carried out because a sample has not been specified.; + %end; + /*-----------------------------------------------------------------------------------------* Check if an active CAS session exists. *------------------------------------------------------------------------------------------*/ @@ -381,6 +612,29 @@ %end; +/*-----------------------------------------------------------------------------------------* + Check Assessment table libref to ensure it points to a valid caslib. +*------------------------------------------------------------------------------------------*/ + + %if &_smt_error_flag. = 0 %then %do; + + %global assessmentCaslib; + %_usr_getNameCaslib(&outputTable_lib.); + %let assessmentCaslib=&_usr_nameCaslib.; + %put NOTE: &assessmentCaslib. is the caslib for the assessment table.; + %let _usr_nameCaslib=; + + %if "&assessmentCaslib." = "" %then %do; + data _null_; + call symputx("_smt_error_flag",1); + call symput("_smt_error_desc","ERROR: Assessment table caslib is blank. Check if table is a valid CAS table."); + run; + %put ERROR: Assessment table caslib is blank. Check if table is a valid CAS table. ; + %end; + + %end; + + /*-----------------------------------------------------------------------------------------* Obtain list of input & nominal variables and store them in macro variables. *------------------------------------------------------------------------------------------*/ @@ -388,12 +642,10 @@ %if &_smt_error_flag. 
= 0 %then %do; %let blankSeparatedInputVars = %_flw_get_column_list(_flw_prefix=inputVars); %let blankSeparatedNominalVars = %_flw_get_column_list(_flw_prefix=nominalVars); + %put NOTE: Input variables selected - &blankSeparatedInputVars.; + %put NOTE: Nominal variables selected - &blankSeparatedNominalVars.; %end; - %put NOTE: Input variables selected - &blankSeparatedInputVars.; - %put NOTE: Nominal variables selected - &blankSeparatedNominalVars.; - - /*-----------------------------------------------------------------------------------------* Create a program string based on selection of nominal variables. *------------------------------------------------------------------------------------------*/ @@ -469,6 +721,25 @@ call symput("assessmentTable_name_base","SMOTE_ASSESSMENT"); run; %end; + %else %do; +/*-----------------------------------------------------------------------------------------* + Check Assessment table libref to ensure it points to a valid caslib. +*------------------------------------------------------------------------------------------*/ + %if &_smt_error_flag. = 0 %then %do; + %global assessmentCaslib; + %_usr_getNameCaslib(&outputTable_lib.); + %let assessmentCaslib=&_usr_nameCaslib.; + %put NOTE: &assessmentCaslib. is the caslib for the assessment table.; + %let _usr_nameCaslib=; + %if "&assessmentCaslib." = "" %then %do; + data _null_; + call symputx("_smt_error_flag",1); + call symput("_smt_error_desc","ERROR: Assessment table caslib is blank. Check if table is a valid CAS table."); + run; + %put ERROR: Assessment table caslib is blank. Check if table is a valid CAS table. ; + %end; + %end; + %end; %end; %end; /*-----------------------------------------------------------------------------------------* @@ -588,11 +859,20 @@ set &outputTable_lib..__assess_orig &outputTable_lib..__assess_synth (where=(_PartInd_=1)); keep &prov_flag_name. 
&blankSeparatedInputVars.; run; - proc datasets lib=&outputTable_lib.; + proc datasets lib=&outputTable_lib. nolist nodetails; delete __assess_orig __assess_synth ; quit; +/*-----------------------------------------------------------------------------------------* + Check and address singling out risk +*------------------------------------------------------------------------------------------*/ + %if &singling_out_risk.=1 %then %do; + + %_smt_singling_out_risk; + + %end; + %end; - proc datasets lib=&outputTable_lib.; + proc datasets lib=&outputTable_lib. nolist nodetails; delete __temp_smote; quit; %end; @@ -617,6 +897,8 @@ Execute *------------------------------------------------------------------------------------------*/ + + %if &_smt_run_trigger. = 1 %then %do; %_smt_execution_code; @@ -650,6 +932,18 @@ %symdel outputCaslib; %end; +%if %symexist(assessmentCaslib) %then %do; + %symdel assessmentCaslib; +%end; + +%if %symexist(so_results_caslib) %then %do; + %symdel so_results_caslib; +%end; + +%if %symexist(so_queries_caslib) %then %do; + %symdel so_queries_caslib; +%end; + %if %symexist(casTableExists) %then %do; %symdel casTableExists; %end; @@ -670,6 +964,10 @@ %symdel _smt_error_desc; %end; +%if %symexist(casSessionExists) %then %do; + %symdel casSessionExists; +%end; + %sysmacdelete _create_error_flag; %sysmacdelete _create_runtime_trigger; %sysmacdelete _env_cas_checkSession; @@ -677,3 +975,6 @@ %sysmacdelete _sas_or_cas; %sysmacdelete _cas_table_exists; %sysmacdelete _smt_execution_code; +%sysmacdelete _smt_singling_out_risk; + +filename smtcode clear;