diff --git a/.gitignore b/.gitignore index dd6d4f7..29d522b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,95 +1,131 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover - -# Translations -*.mo -*.pot - -# Django stuff: -*.log - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# DotEnv configuration -.env - -# Database -*.db -*.rdb - -# Pycharm -.idea - -# VS Code -.vscode/ - -# Spyder -.spyproject/ - -# Jupyter NB Checkpoints -.ipynb_checkpoints/ - -# Mac OS-specific storage files -.DS_Store - -# vim -*.swp -*.swo - -# Mypy cache -.mypy_cache/ - -# Jupyter Book -_build/ - -# python-dotenv -.env - -# setuptools-scm/ -src/*/_version.py +######################################################################## +# +# Based on DIME .gitignore template version 2.0 +# +######################################################################## + +####################### +# Start by ignoring everything, and below we are explicitly saying +# what to not ignore +* + +####################### +# List of files with GitHub functionality anywhere in the repo +# that we do not want to ignore + +# These files include GitHub settings +!.gitignore +!.gitattributes +!.env.example + +!.reproducibility_report_RR_2024_WLD_175-v02.pdf + +# Git placeholder file (to commit empty folders) +!/**/*.gitkeep + +# Keep markdown files used for documentation on GitHub +!README.md +!CONTRIBUTING.md +!LICENSE* + 
+# Unignore reproot files - see https://dime-worldbank.github.io/repkit/articles/reproot-files.html +!reproot*.yaml + +####################### +# For performance reasons, if a folder is already ignored, then +# GitHub does not check the content for that folder for matches +# with additional rules. The line below includes folder in the +# top folder (but not their content), so that anything matching +# the rules below will still not be ignored. +!*/ + +####################### +# The following file types are code that should always be +# included no matter where in the repository folder they are +# located unless you explicitly ignore that folder + +# Stata +!/**/*.do +!/**/*.ado +!/**/*.sthlp +!/**/*.smcl + +# R +!/**/*.R +!/**/*.Rmd +!/**/*.Rproj +!/**/*.qmd +!/**/*.Rprofile +!/**/renv/ + +# Still ignore user file for R projects +.Rproj.user + +# LaTeX +!/**/*.tex +!/**/*.bib + +# Python +!/**/*.py +!/**/*.ipynb +!/**/requirements.txt +!/**/Pipfile +!/**/Pipfile.lock +!/**/*.toml +# Still ignore .ipynb files in checkpoint folders +.ipynb_checkpoints + +# Matlab +!/**/*.m + +# Markdown +!/**/*.md + +# Julia +!/**/*.jl + +#Files to create custom layout and functionality +# in dashboards, markdown notebooks, bookdown etc. +!/**/*.css +!/**/*.js +!/**/*.json + +#Bash scripts +!/**/*.sh + +# Citations +!/**/*.cff + +# .yml and .yaml files +# These files sometimes have credentials, so only unignore these lines by removing +# the "#" below after you checked your .yml/.yaml files for sensitive content +!/**/*.yml +!/**/*.yaml + +####################### +# Include some additional file formats in any output folder. You might have +# to change the name of the Output folder to whatever it is called in your +# project, but we strongly recommend that you only include these files in +# a subset of the folders where you are certain no private data is ever stored. 
+!/**/Output/**/*.txt +!/**/Output/**/*.csv +!/**/Output/**/*.xml +!/**/images/**/*.eps +!/**/images/**/*.svg +!/**/images/**/*.png +!/**/images/**/*.ico + +####################### +# Include all the files with passwords or tokens here. All files named +# password or passwords are with this template ignored no matter which +# format you are using. Additionally, all content in any folder called +# password or passwords are also ignored. NOTE that your project might be +# using different names and then you must edit the lines below accordingly. +password.* +passwords.* +password/ +passwords/ +token.* +tokens.* +token/ +tokens/ diff --git a/01.code/R/.Rprofile b/01.code/R/.Rprofile deleted file mode 100644 index 81b960f..0000000 --- a/01.code/R/.Rprofile +++ /dev/null @@ -1 +0,0 @@ -source("renv/activate.R") diff --git a/01.code/R/00_MASTER.R b/01.code/R/00_MASTER.R old mode 100644 new mode 100755 index 26e2714..7a81f53 --- a/01.code/R/00_MASTER.R +++ b/01.code/R/00_MASTER.R @@ -1,27 +1,27 @@ -#------------------------------------------------------------------------------# -# Population at high-risk from climate related hazards # -# R master script # -#------------------------------------------------------------------------------# - -# install packages using renv -renv::restore() - -# set directory to root replication folder -setwd("../../") - -# run from intermediate data? -from_intermediate = TRUE - -# !!! Running the MASTER R script from source data is not recommended !!! # -# !!! (~1 TB storage required, > 14 days run time) !!! 
# - -# run scripts -if (from_intermediate) { - source("01.code/R/10_extract_exposed_pop.R") - source("01.code/R/11_clean_exposed_pop.R") -} else { - script_list <- list.files("01.code/R", ".R$", full.names = TRUE) - for (code in setdiff(script_list, "01.code/R/00_MASTER.R")) { - source(code) - } +#------------------------------------------------------------------------------# +# Population at high-risk from climate related hazards # +# R master script # +#------------------------------------------------------------------------------# + +# install packages using renv +# just the first time you run the package +renv::restore() + +setwd("C:/Temp/08.replication package/") + +# run from intermediate data? +from_intermediate = TRUE + +# !!! Running the MASTER R script from source data is not recommended !!! # +# !!! (~1 TB storage required, > 14 days run time) !!! # + +# run scripts +if (from_intermediate) { + source("01.code/R/10_extract_exposed_pop.R") + source("01.code/R/11_clean_exposed_pop.R") +} else { + script_list <- list.files("01.code/R", ".R$", full.names = TRUE) + for (code in setdiff(script_list, "01.code/R/00_MASTER.R")) { + source(code) + } } \ No newline at end of file diff --git a/01.code/R/01_prep_flood_fathom.R b/01.code/R/01_prep_flood_fathom.R old mode 100644 new mode 100755 diff --git a/01.code/R/02_prep_flood_deltares.R b/01.code/R/02_prep_flood_deltares.R old mode 100644 new mode 100755 diff --git a/01.code/R/03_prep_flood_combine.R b/01.code/R/03_prep_flood_combine.R old mode 100644 new mode 100755 diff --git a/01.code/R/04_prep_cyclone_drought_heatwave.R b/01.code/R/04_prep_cyclone_drought_heatwave.R old mode 100644 new mode 100755 diff --git a/01.code/R/05_prep_degurban.R b/01.code/R/05_prep_degurban.R old mode 100644 new mode 100755 diff --git a/01.code/R/06_prep_rai.R b/01.code/R/06_prep_rai.R old mode 100644 new mode 100755 diff --git a/01.code/R/07_classify_hazards.R b/01.code/R/07_classify_hazards.R old mode 100644 new mode 100755 diff 
--git a/01.code/R/08_combine_hazards.R b/01.code/R/08_combine_hazards.R old mode 100644 new mode 100755 diff --git a/01.code/R/09_combine_hazards_degurban.R b/01.code/R/09_combine_hazards_degurban.R old mode 100644 new mode 100755 diff --git a/01.code/R/10_extract_exposed_pop.R b/01.code/R/10_extract_exposed_pop.R old mode 100644 new mode 100755 diff --git a/01.code/R/11_clean_exposed_pop.R b/01.code/R/11_clean_exposed_pop.R old mode 100644 new mode 100755 diff --git a/01.code/R/renv.lock b/01.code/R/renv.lock deleted file mode 100644 index eec84bd..0000000 --- a/01.code/R/renv.lock +++ /dev/null @@ -1,828 +0,0 @@ -{ - "R": { - "Version": "4.4.1", - "Repositories": [ - { - "Name": "CRAN", - "URL": "https://packagemanager.posit.co/cran/latest" - } - ] - }, - "Packages": { - "DBI": { - "Package": "DBI", - "Version": "1.2.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "065ae649b05f1ff66bb0c793107508f5" - }, - "KernSmooth": { - "Package": "KernSmooth", - "Version": "2.23-24", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "stats" - ], - "Hash": "9f33a1ee37bbe8919eb2ec4b9f2473a5" - }, - "MASS": { - "Package": "MASS", - "Version": "7.3-60.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "methods", - "stats", - "utils" - ], - "Hash": "2f342c46163b0b54d7b64d1f798e2c78" - }, - "R6": { - "Package": "R6", - "Version": "2.5.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "470851b6d5d0ac559e9d01bb352b4021" - }, - "Rcpp": { - "Package": "Rcpp", - "Version": "1.0.12", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "methods", - "utils" - ], - "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" - }, - "askpass": { - "Package": "askpass", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "sys" - ], - "Hash": 
"cad6cf7f1d5f6e906700b9d3e718c796" - }, - "bit": { - "Package": "bit", - "Version": "4.0.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "d242abec29412ce988848d0294b208fd" - }, - "bit64": { - "Package": "bit64", - "Version": "4.0.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "bit", - "methods", - "stats", - "utils" - ], - "Hash": "9fe98599ca456d6552421db0d6772d8f" - }, - "class": { - "Package": "class", - "Version": "7.3-22", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "MASS", - "R", - "stats", - "utils" - ], - "Hash": "f91f6b29f38b8c280f2b9477787d4bb2" - }, - "classInt": { - "Package": "classInt", - "Version": "0.4-10", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "KernSmooth", - "R", - "class", - "e1071", - "grDevices", - "graphics", - "stats" - ], - "Hash": "f5a40793b1ae463a7ffb3902a95bf864" - }, - "cli": { - "Package": "cli", - "Version": "3.6.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "utils" - ], - "Hash": "1216ac65ac55ec0058a6f75d7ca0fd52" - }, - "clipr": { - "Package": "clipr", - "Version": "0.8.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "utils" - ], - "Hash": "3f038e5ac7f41d4ac41ce658c85e3042" - }, - "cpp11": { - "Package": "cpp11", - "Version": "0.4.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "5a295d7d963cc5035284dcdbaf334f4e" - }, - "crayon": { - "Package": "crayon", - "Version": "1.5.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "grDevices", - "methods", - "utils" - ], - "Hash": "859d96e65ef198fd43e82b9628d593ef" - }, - "curl": { - "Package": "curl", - "Version": "5.2.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" - }, - "data.table": { - "Package": "data.table", - "Version": 
"1.15.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "8ee9ac56ef633d0c7cab8b2ca87d683e" - }, - "dplyr": { - "Package": "dplyr", - "Version": "1.1.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "cli", - "generics", - "glue", - "lifecycle", - "magrittr", - "methods", - "pillar", - "rlang", - "tibble", - "tidyselect", - "utils", - "vctrs" - ], - "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" - }, - "e1071": { - "Package": "e1071", - "Version": "1.7-14", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "class", - "grDevices", - "graphics", - "methods", - "proxy", - "stats", - "utils" - ], - "Hash": "4ef372b716824753719a8a38b258442d" - }, - "exactextractr": { - "Package": "exactextractr", - "Version": "0.10.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "Rcpp", - "methods", - "raster", - "sf" - ], - "Hash": "3db813596387e90573ad092d5e3fde37" - }, - "fansi": { - "Package": "fansi", - "Version": "1.0.6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "utils" - ], - "Hash": "962174cf2aeb5b9eea581522286a911f" - }, - "generics": { - "Package": "generics", - "Version": "0.1.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "15e9634c0fcd294799e9b2e929ed1b86" - }, - "glue": { - "Package": "glue", - "Version": "1.7.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "e0b3a53876554bd45879e596cdb10a52" - }, - "gtools": { - "Package": "gtools", - "Version": "3.9.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "methods", - "stats", - "utils" - ], - "Hash": "588d091c35389f1f4a9d533c8d709b35" - }, - "hms": { - "Package": "hms", - "Version": "1.1.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "lifecycle", - 
"methods", - "pkgconfig", - "rlang", - "vctrs" - ], - "Hash": "b59377caa7ed00fa41808342002138f9" - }, - "httr": { - "Package": "httr", - "Version": "1.4.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "curl", - "jsonlite", - "mime", - "openssl" - ], - "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" - }, - "jsonlite": { - "Package": "jsonlite", - "Version": "1.8.8", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "methods" - ], - "Hash": "e1b9c55281c5adc4dd113652d9e26768" - }, - "lattice": { - "Package": "lattice", - "Version": "0.22-6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "stats", - "utils" - ], - "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" - }, - "lifecycle": { - "Package": "lifecycle", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "rlang" - ], - "Hash": "b8552d117e1b808b09a832f589b79035" - }, - "lubridate": { - "Package": "lubridate", - "Version": "1.9.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "generics", - "methods", - "timechange" - ], - "Hash": "680ad542fbcf801442c83a6ac5a2126c" - }, - "magrittr": { - "Package": "magrittr", - "Version": "2.0.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "7ce2733a9826b3aeb1775d56fd305472" - }, - "mime": { - "Package": "mime", - "Version": "0.12", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "tools" - ], - "Hash": "18e9c28c1d3ca1560ce30658b22ce104" - }, - "openssl": { - "Package": "openssl", - "Version": "2.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "askpass" - ], - "Hash": "2bcca3848e4734eb3b16103bc9aa4b8e" - }, - "pillar": { - "Package": "pillar", - "Version": "1.9.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "cli", - "fansi", - 
"glue", - "lifecycle", - "rlang", - "utf8", - "utils", - "vctrs" - ], - "Hash": "15da5a8412f317beeee6175fbc76f4bb" - }, - "pkgconfig": { - "Package": "pkgconfig", - "Version": "2.0.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "utils" - ], - "Hash": "01f28d4278f15c76cddbea05899c5d6f" - }, - "prettyunits": { - "Package": "prettyunits", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" - }, - "progress": { - "Package": "progress", - "Version": "1.2.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "crayon", - "hms", - "prettyunits" - ], - "Hash": "f4625e061cb2865f111b47ff163a5ca6" - }, - "proxy": { - "Package": "proxy", - "Version": "0.4-27", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "stats", - "utils" - ], - "Hash": "e0ef355c12942cf7a6b91a6cfaea8b3e" - }, - "purrr": { - "Package": "purrr", - "Version": "1.0.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "lifecycle", - "magrittr", - "rlang", - "vctrs" - ], - "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" - }, - "raster": { - "Package": "raster", - "Version": "3.6-26", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "Rcpp", - "methods", - "sp", - "terra" - ], - "Hash": "7d6eda494f34a644420ac1bfd2a8023a" - }, - "readr": { - "Package": "readr", - "Version": "2.1.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "cli", - "clipr", - "cpp11", - "crayon", - "hms", - "lifecycle", - "methods", - "rlang", - "tibble", - "tzdb", - "utils", - "vroom" - ], - "Hash": "9de96463d2117f6ac49980577939dfb3" - }, - "renv": { - "Package": "renv", - "Version": "1.0.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "utils" - ], - "Hash": "397b7b2a265bc5a7a06852524dabae20" - }, - "rlang": { - "Package": 
"rlang", - "Version": "1.1.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "utils" - ], - "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1" - }, - "s2": { - "Package": "s2", - "Version": "1.1.6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "Rcpp", - "wk" - ], - "Hash": "32f7b1a15bb01ae809022960abad5363" - }, - "sf": { - "Package": "sf", - "Version": "1.0-16", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "Rcpp", - "classInt", - "grDevices", - "graphics", - "grid", - "magrittr", - "methods", - "s2", - "stats", - "tools", - "units", - "utils" - ], - "Hash": "ad57b543f7c3fca05213ba78ff63df9b" - }, - "sp": { - "Package": "sp", - "Version": "2.1-4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "lattice", - "methods", - "stats", - "utils" - ], - "Hash": "75940133cca2e339afce15a586f85b11" - }, - "stringi": { - "Package": "stringi", - "Version": "1.8.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "stats", - "tools", - "utils" - ], - "Hash": "39e1144fd75428983dc3f63aa53dfa91" - }, - "stringr": { - "Package": "stringr", - "Version": "1.5.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "lifecycle", - "magrittr", - "rlang", - "stringi", - "vctrs" - ], - "Hash": "960e2ae9e09656611e0b8214ad543207" - }, - "sys": { - "Package": "sys", - "Version": "3.4.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" - }, - "terra": { - "Package": "terra", - "Version": "1.7-78", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "Rcpp", - "methods" - ], - "Hash": "8f020def0792119cb98bd8f696dab22d" - }, - "tibble": { - "Package": "tibble", - "Version": "3.2.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "fansi", - 
"lifecycle", - "magrittr", - "methods", - "pillar", - "pkgconfig", - "rlang", - "utils", - "vctrs" - ], - "Hash": "a84e2cc86d07289b3b6f5069df7a004c" - }, - "tidyr": { - "Package": "tidyr", - "Version": "1.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "cpp11", - "dplyr", - "glue", - "lifecycle", - "magrittr", - "purrr", - "rlang", - "stringr", - "tibble", - "tidyselect", - "utils", - "vctrs" - ], - "Hash": "915fb7ce036c22a6a33b5a8adb712eb1" - }, - "tidyselect": { - "Package": "tidyselect", - "Version": "1.2.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "lifecycle", - "rlang", - "vctrs", - "withr" - ], - "Hash": "829f27b9c4919c16b593794a6344d6c0" - }, - "timechange": { - "Package": "timechange", - "Version": "0.3.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cpp11" - ], - "Hash": "c5f3c201b931cd6474d17d8700ccb1c8" - }, - "tzdb": { - "Package": "tzdb", - "Version": "0.4.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cpp11" - ], - "Hash": "f561504ec2897f4d46f0c7657e488ae1" - }, - "units": { - "Package": "units", - "Version": "0.8-5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "Rcpp" - ], - "Hash": "119d19da480e873f72241ff6962ffd83" - }, - "utf8": { - "Package": "utf8", - "Version": "1.2.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "62b65c52671e6665f803ff02954446e9" - }, - "vctrs": { - "Package": "vctrs", - "Version": "0.6.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "lifecycle", - "rlang" - ], - "Hash": "c03fa420630029418f7e6da3667aac4a" - }, - "vroom": { - "Package": "vroom", - "Version": "1.6.5", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "bit64", - "cli", - "cpp11", - "crayon", - "glue", - "hms", - "lifecycle", - 
"methods", - "progress", - "rlang", - "stats", - "tibble", - "tidyselect", - "tzdb", - "vctrs", - "withr" - ], - "Hash": "390f9315bc0025be03012054103d227c" - }, - "wbstats": { - "Package": "wbstats", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "dplyr", - "httr", - "jsonlite", - "lifecycle", - "lubridate", - "magrittr", - "readr", - "rlang", - "stringr", - "tibble", - "tidyr" - ], - "Hash": "503678a5ad5d99378960224904a76c47" - }, - "withr": { - "Package": "withr", - "Version": "3.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics" - ], - "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" - }, - "wk": { - "Package": "wk", - "Version": "0.9.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "5d4545e140e36476f35f20d0ca87963e" - } - } -} diff --git a/01.code/R/renv/.gitignore b/01.code/R/renv/.gitignore old mode 100644 new mode 100755 diff --git a/01.code/R/renv/activate.R b/01.code/R/renv/activate.R old mode 100644 new mode 100755 diff --git a/01.code/R/renv/settings.json b/01.code/R/renv/settings.json old mode 100644 new mode 100755 diff --git a/01.code/R/replication package.Rproj b/01.code/R/replication package.Rproj old mode 100644 new mode 100755 index 8e3c2eb..3af27f6 --- a/01.code/R/replication package.Rproj +++ b/01.code/R/replication package.Rproj @@ -1,13 +1,13 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/01.code/ado/_/_gadjl.ado b/01.code/ado/_/_gadjl.ado new file mode 100755 index 0000000..a347afa --- 
/dev/null +++ b/01.code/ado/_/_gadjl.ado @@ -0,0 +1,25 @@ +*! 1.1.0 NJC 2 June 2004 +*! 1.0.0 NJC 17 May 2004 +program _gadjl + version 8.0 + syntax newvarname =/exp [if] [in] /// + [, BY(varlist) FACTor(numlist max=1 >=0) ] + quietly { + tempvar touse group + tempname l + mark `touse' `if' `in' + sort `touse' `by' + by `touse' `by' : gen long `group' = _n == 1 if `touse' + replace `group' = sum(`group') + local max = `group'[_N] + gen double `varlist' = . + if "`factor'" == "" local factor = 1.5 + + forval i = 1/`max' { + su `exp' if `group' == `i', detail + scalar `l' = r(p25) - `factor' * (r(p75) - r(p25)) + su `exp' if `group' == `i' & `exp' >= `l', meanonly + replace `varlist' = r(min) if `group' == `i' + } + } +end diff --git a/01.code/ado/_/_gadju.ado b/01.code/ado/_/_gadju.ado new file mode 100755 index 0000000..f1f4d60 --- /dev/null +++ b/01.code/ado/_/_gadju.ado @@ -0,0 +1,25 @@ +*! 1.1.0 NJC 2 June 2004 +*! 1.0.0 NJC 17 May 2004 +program _gadju + version 8.0 + syntax newvarname =/exp [if] [in] /// + [, BY(varlist) FACTor(numlist max=1 >=0) ] + quietly { + tempvar touse group + tempname u + mark `touse' `if' `in' + sort `touse' `by' + by `touse' `by' : gen long `group' = _n == 1 if `touse' + replace `group' = sum(`group') + local max = `group'[_N] + gen double `varlist' = . + if "`factor'" == "" local factor = 1.5 + + forval i = 1/`max' { + su `exp' if `group' == `i', detail + scalar `u' = r(p75) + `factor' * (r(p75) - r(p25)) + su `exp' if `group' == `i' & `exp' <= `u', meanonly + replace `varlist' = r(max) if `group' == `i' + } + } +end diff --git a/01.code/ado/_/_gaxis.ado b/01.code/ado/_/_gaxis.ado new file mode 100755 index 0000000..3213b55 --- /dev/null +++ b/01.code/ado/_/_gaxis.ado @@ -0,0 +1,70 @@ +*! 
NJC 1.0.0 6 February 2004 +program _gaxis + version 8 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] /// + [, gap Missing BY(string) REVerse label(varlist)] + + if `"`by'"' != "" { + _egennoby egroup() `"`by'"' + /* NOTREACHED */ + } + + tempvar touse order + quietly { + mark `touse' `if' `in' + if "`missing'" == "" markout `touse' `varlist', strok + + sort `touse' `varlist' + gen long `order' = _n + + if "`label'" == "" local label "`varlist'" + + if "`gap'" != "" { + local nvars : word count `varlist' + gen `type' `g' = 1 - `nvars' if `touse' + foreach v of local varlist { + replace `g' = `g' + sum(`v' != `v'[_n-1]) if `touse' + } + } + else { + by `touse' `varlist' : gen `type' `g' = _n == 1 if `touse' + replace `g' = sum(`g') if `touse' + } + + su `g', meanonly + if "`reverse'" != "" { + replace `g' = `r(max)' - `g' + 1 + su `g', meanonly + } + + if "`label'" == "" local label "`varlist'" + + forval i = 1/`r(max)' { + su `order' if `g' == `i', meanonly + if r(N) > 0 { + local value + local first = `r(min)' + local prev = `first' - 1 + +// offset so that it is readable +foreach v of local label { + if (`v'[`first'] != `v'[`prev']) | !`touse'[`prev'] { + if "`: value label `v''" != "" { + local value `"`value' `: label (`v') `=`v'[`first']''"' + } + else local value `"`value' `=`v'[`first']'"' + } +} +// end offset + label def $EGEN_Varname `i' `"`value'"', modify + } + } + + label val `g' $EGEN_Varname + label var `g' "`varlist'" + } +end diff --git a/01.code/ado/_/_gbase.ado b/01.code/ado/_/_gbase.ado new file mode 100755 index 0000000..e9995a2 --- /dev/null +++ b/01.code/ado/_/_gbase.ado @@ -0,0 +1,47 @@ +*! 
1.0.1 NJC 2 Oct 2002 +* 1.0.0 NJC 29 Oct 2001 +program define _gbase + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(numeric) [if] [in] [ , Base(numlist max=1 int <=9 >1) ] + + marksample touse + * ignores type passed from -egen- + local type "str1" + if "`base'" == "" { local base = 2 } + + capture assert `varlist' == int(`varlist') if `touse' + if _rc { + di in r "`varlist' invalid: not integer" + exit 459 + } + capture assert `varlist' >= 0 if `touse' + local sign = _rc != 0 + + quietly { + tempvar work digit + gen `type' `g' = "" + gen long `work' = `varlist' if `touse' + gen int `digit' = . + su `work', meanonly + local max = max(`r(max)',-`r(min)') + local power = 0 + while `max' >= (`base'^(`power' + 1)) { + local power = `power' + 1 + } + if `sign' { + replace `g' = `g' + cond(`work' < 0, "-","+") if `touse' + replace `work' = abs(`work') + } + while `power' >= 0 { + replace `digit' = int(`work' / `base'^`power') + replace `work' = mod(`work', `base'^`power') + replace `g' = `g' + string(`digit') if `touse' + local power = `power' - 1 + } + } +end diff --git a/01.code/ado/_/_gbom.ado b/01.code/ado/_/_gbom.ado new file mode 100755 index 0000000..9adda88 --- /dev/null +++ b/01.code/ado/_/_gbom.ado @@ -0,0 +1,70 @@ +*! 
1.1.0 NJC 7 December 2000 +* 1.0.0 NJC 12 July 2000 +program define _gbom + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("(), ") /* "(" */ + + gettoken month 0 : 0, parse("(), ") + gettoken year 0 : 0, parse("(), ") + if `"`year'"' == "," { + gettoken year 0 : 0, parse("(), ") + } + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) Lag(str) Work ] + + quietly { + tempvar touse + mark `touse' `if' `in' + + if "`lag'" == "" { local lag = 0 } + else { + capture assert `lag' == int(`lag') + if _rc { + di in r "`lag' contains non-integer value(s)" + exit 410 + } + } + + capture assert `month' > 0 & `month' < 13 if `touse' + if _rc { + di in r "`month' contains value(s) not 1 to 12" + exit 198 + } + + capture assert `month' == int(`month') if `touse' + if _rc { + di in r "`month' contains non-integer value(s)" + exit 410 + } + + capture assert `year' == int(`year') if `touse' + if _rc { + di in r "`year' contains non-integer value(s)" + exit 410 + } + + gen long `g' = dofm(ym(`year', `month') - `lag') if `touse' + + * Sunday? add 1; Saturday? add 2 + if "`work'" != "" { + replace `g' = `g' + 1 if dow(`g') == 0 + replace `g' = `g' + 2 if dow(`g') == 6 + } + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gbomd.ado b/01.code/ado/_/_gbomd.ado new file mode 100755 index 0000000..a7a22b7 --- /dev/null +++ b/01.code/ado/_/_gbomd.ado @@ -0,0 +1,43 @@ +*! 
1.1.0 NJC 7 December 2000 +* 1.0.0 NJC 12 July 2000 +program define _gbomd + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(numeric) [if] [in] [ , Format(str) Lag(str) Work ] + local d "`varlist'" + + quietly { + tempvar touse + mark `touse' `if' `in' + + if "`lag'" == "" { local lag = 0 } + else { + capture assert `lag' == int(`lag') + if _rc { + di in r "`lag' contains non-integer value(s)" + exit 410 + } + } + + gen long `g' = dofm(ym(year(`d'), month(`d')) - `lag') /* + */ if `touse' + + * Sunday? add 1; Saturday? add 2 + if "`work'" != "" { + replace `g' = `g' + 1 if dow(`g') == 0 + replace `g' = `g' + 2 if dow(`g') == 6 + } + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gclsst.ado b/01.code/ado/_/_gclsst.ado new file mode 100755 index 0000000..77e92c0 --- /dev/null +++ b/01.code/ado/_/_gclsst.ado @@ -0,0 +1,28 @@ +*! 1.0.0 NJC 11 January 2000 +program define _gclsst + version 6.0 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname(numeric) [if] [in], Values(numlist) [ Later ] + local eq = cond("`later'" != "", "=", "") + marksample touse + tokenize `values' + tempvar gdiff ldiff + quietly { + gen `gdiff' = . + gen `ldiff' = . + gen `g' = . + while "`1'" != "" { + replace `ldiff' = abs(`varlist' - `1') if `touse' + replace `g' = `1' if `ldiff' <`eq' `gdiff' + replace `gdiff' = min(`gdiff', `ldiff') + mac shift + } + } + if length("`varlist': closest of `values'") > 80 { + note `g' : `varlist' closest of `values' + label var `g' "`varlist': see notes" + } + else label var `g' "`varlist' closest of `values'" +end diff --git a/01.code/ado/_/_gcorr.ado b/01.code/ado/_/_gcorr.ado new file mode 100755 index 0000000..1459995 --- /dev/null +++ b/01.code/ado/_/_gcorr.ado @@ -0,0 +1,83 @@ +*! NJGW 09jun2005 +*! syntax: [by varlist:] egen newvar = var1 var2 [if exp] [in exp] +*! 
[ , covariance spearman taua taub ] +*! computes correlation (or covariance, or spearman correlation) between var1 and var2, optionally by: varlist +*! and stores the result in newvar. +program define _gcorr + version 8 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varlist(min=2 max=2) [if] [in] [, BY(string) Covariance Spearman taua taub ] + + if "`taua'`taub'`spearman'"!="" & "`covariance'"!="" { + di as error "`taua'`taub'`spearman' and covariance are mutually exclusive" + exit 198 + } + + local x : word count `taua' `taub' `spearman' + if `x'> 1 { + di as error "may only specify one of `taua' `taub' `spearman'" + exit 198 + } + + if `"`by'"'!="" { + local by `"by `by':"' + } + + quietly { + gen `type' `g' = . + `by' GenCorr `varlist' `if' `in', thevar(`g') `covariance' `spearman' `taua' `taub' + } + + if "`spearman'"!="" { + local lab "Spearman Correlation" + } + else if "`taua'"!="" { + local lab "Tau-A Correlation" + } + else if "`taub'"!="" { + local lab "Tau-B Correlation" + } + else if "`covariance'" != "" { + local lab "Covariance" + } + else { + local lab "Correlation" + } + + capture label var `g' "`lab' of `varlist'" +end + +program define GenCorr, byable(recall) + syntax varlist [if] [in] , thevar(string) [ covariance spearman taua taub ] + marksample touse + if "`covariance'"!="" { + local stat "r(cov_12)" + } + else if "`taua'"!="" { + local stat "r(tau_a)" + } + else if "`taub'"!="" { + local stat "r(tau_b)" + } + else { /* correlation and spearman */ + local stat "r(rho)" + } + + if "`spearman'"!="" { + local cmd spearman + } + else if "`taua'`taub'"!="" { + local cmd ktau + } + else { + local cmd corr /* correlation and covariance */ + } + + cap `cmd' `varlist' if `touse' , `covariance' + if !_rc { + qui replace `thevar'=``stat'' if `touse' + } +end diff --git a/01.code/ado/_/_gd2.ado b/01.code/ado/_/_gd2.ado new file mode 100755 index 0000000..2c17981 --- /dev/null +++ b/01.code/ado/_/_gd2.ado @@ -0,0 +1,30 @@ +*! 
1.0.1 Pablo A. Mitnik, April 2009
+*! d2(exp) is an egen function that returns the mean absolute deviation
+*! from the median (within varlist) of exp; d2 accepts weights.
+*! Requires that _gwpctile (which is part of egenmore) be installed.
+* Options: weights(varname) weights each observation; by(varlist) groups.
+program define _gd2
+
+    version 10.1
+    syntax newvarname = /exp [if] [in] [, Weights(varname) BY(varlist)]
+    * unweighted calls use a constant weight of 1
+    tempvar med d2 x w sumw wadev touse
+    marksample touse, novarlist
+
+    quietly {
+
+        if "`weights'"=="" {
+            gen `w' = 1
+            local weights `w'
+        }
+        * total weight per group, then the median with the same weights
+        bysort `touse' `by': egen double `sumw' = total(`weights')
+        gen `x' = `exp' if `touse'
+        egen double `med' = wpctile(`x'), p(50) weights(`weights') by(`by')
+        gen double `wadev' = abs(`x' - `med') * (`weights'/`sumw')
+        egen double `d2' = total(`wadev') if `touse', missing by(`by')
+        gen `typlist' `varlist' = `d2'
+
+    }
+end
+
diff --git a/01.code/ado/_/_gdayofyear.ado b/01.code/ado/_/_gdayofyear.ado
new file mode 100755
index 0000000..bf1e39f
--- /dev/null
+++ b/01.code/ado/_/_gdayofyear.ado
@@ -0,0 +1,24 @@
+*! 1.0.0 NJC 21 March 2006
+* egen function dayofyear(): day number of a daily date within a year that
+* starts on month() day() (default 1 January); the start day itself is day 1
+program _gdayofyear
+    version 8
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    syntax varname(numeric) [if] [in] [, Month(int 1) Day(int 1) ]
+    local v "`varlist'"
+    local m = `month'
+    local d = `day'
+    marksample touse
+
+    quietly {
+        tempvar y
+        gen `y' = year(`v') if `touse'
+        * dates before the start day of their calendar year belong to the
+        * year that started in the previous calendar year
+        replace `y' = `y' - 1 if mdy(`m', `d', `y') > `v'
+        gen `g' = `v' - mdy(`m', `d', `y') + 1 if `touse'
+    }
+end
+
diff --git a/01.code/ado/_/_gdecimal.ado b/01.code/ado/_/_gdecimal.ado
new file mode 100755
index 0000000..fb9de58
--- /dev/null
+++ b/01.code/ado/_/_gdecimal.ado
@@ -0,0 +1,38 @@
+*! 
1.0.0 NJC 26 Oct 2001 +program define _gdecimal + version 7.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(numeric min=1) [if] [in] [, Base(numlist max=1 int >1) ] + + marksample touse + * ignores type passed from -egen- + local type "long" + if "`base'" == "" { local base = 2 } + + foreach v of varlist `varlist' { + capture assert `v' == int(`v') if `touse' + if _rc == 0 { + capture assert `v' >= 0 & `v' < `base' if `touse' + } + if _rc { + di as err "invalid syntax: `v' not base `base'" + exit 198 + } + } + + local nvars : word count `varlist' + tokenize `varlist' + + quietly { + gen `type' `g' = 0 if `touse' + forval i = 1/`nvars' { + local power = `nvars' - `i' + replace `g' = `g' + ``i'' * `base'^`power' + } + compress `g' + } +end diff --git a/01.code/ado/_/_gdensity.ado b/01.code/ado/_/_gdensity.ado new file mode 100755 index 0000000..b3076e0 --- /dev/null +++ b/01.code/ado/_/_gdensity.ado @@ -0,0 +1,54 @@ +*! NJC 1.1.0 23jan2004 +*! NJC 1.0.0 28nov2003 +program _gdensity + version 8 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname(numeric) [if] [in] /// + [, Width(numlist max=1 >0) BY(varlist) STart(numlist max=1) /// + FREQuency PERCENT FRACtion DENsity] + + local opts `density' `fraction' `frequency' `percent' + local nopts : word count `opts' + + if `nopts' >= 2 { + di as err "options `opts' may not be combined" + exit 198 + } + else if `nopts' == 1 local option `opts' + else if `nopts' == 0 local option "density" + + if "`width'" == "" local width 1 + + tempvar ry sum + quietly { + marksample touse + if "`start'" == "" { + su `varlist' if `touse', meanonly + local start = r(min) + } + gen double `ry' = /// + `width' * floor((`varlist' - `start') / `width') + bysort `touse' `by' `ry' : gen `type' `g' = _N if `touse' + + if "`option'" == "frequency" exit 0 + else if "`option'" == "fraction" { + by `touse' `by' `ry' : gen double `sum' = `g' * (_n == 1) + by `touse' `by' : replace `sum' = 
sum(`sum') + by `touse' `by' : replace `g' = `g' / `sum'[_N] + exit 0 + } + else if "`option'" == "percent" { + by `touse' `by' `ry' : gen double `sum' = `g' * (_n == 1) + by `touse' `by' : replace `sum' = sum(`sum') + by `touse' `by' : replace `g' = 100 * `g' / `sum'[_N] + exit 0 + } + else if "`option'" == "density" { + by `touse' `by' : replace `g' = `g' / (`width' * _N) + exit 0 + } + // not reached + } +end diff --git a/01.code/ado/_/_gdhms.ado b/01.code/ado/_/_gdhms.ado new file mode 100755 index 0000000..e8ca0d4 --- /dev/null +++ b/01.code/ado/_/_gdhms.ado @@ -0,0 +1,82 @@ +*! 1.1.0 NJC 27 Feb 2006 +* 1.0.0 CFB 29 Sep 2002 +* 1.1.0 NJC 7 December 2000 _gbom +* 1.0.0 NJC 12 July 2000 +program define _gdhms + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + + gettoken date 0 : 0, parse("() ") + gettoken hour 0 : 0, parse("() ") + gettoken min 0 : 0, parse("() ") + gettoken sec 0 : 0, parse("() ") + + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) ] + + quietly { + tempvar touse + mark `touse' `if' `in' + replace `touse' = 0 if missing(`date', `hour', `min', `sec') + + capture assert `date' == int(`date') if `touse' + if _rc { + di in r "`date' contains non-integer value(s)" + exit 410 + } + + capture assert `hour' >= 0 & `hour' < 24 if `touse' + if _rc { + di in r "`hour' contains value(s) not 0 to 23" + exit 198 + } + + capture assert `hour' == int(`hour') if `touse' + if _rc { + di in r "`hour' contains non-integer value(s)" + exit 410 + } + + capture assert `min' >= 0 & `min' < 60 if `touse' + if _rc { + di in r "`min' contains value(s) not 0 to 59" + exit 198 + } + + capture assert `min' == int(`min') if `touse' + if _rc { + di in r "`min' contains non-integer value(s)" + exit 410 + } + + capture assert `sec' >= 0 & `sec' < 60 if `touse' + if _rc { + di in r "`sec' contains value(s) not 0 to 59" + 
exit 198 + } + + capture assert `sec' == int(`sec') if `touse' + if _rc { + di in r "`sec' contains non-integer value(s)" + exit 410 + } + + gen double `g' = `date' + (`sec' + `min'*60 + `hour'*3600)/86400 if `touse' + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gegroup.ado b/01.code/ado/_/_gegroup.ado new file mode 100755 index 0000000..4cc0aec --- /dev/null +++ b/01.code/ado/_/_gegroup.ado @@ -0,0 +1,101 @@ +*! NJC 1.0.0 10 July 2002 +* _ggroup 2.0.4 19oct2000 +program define _gegroup + version 7 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] [, Missing BY(string) Label Label2(varlist) /* + */ Truncate(numlist max=1 int >= 1)] + + if `"`by'"' != "" { + _egennoby egroup() `"`by'"' + /* NOTREACHED */ + } + + if "`label'" != "" & "`label2'" != "" { + di as err "may not combine label and label() options" + exit 198 + } + + if "`truncate'" != "" & "`label'`label2'" == "" { + di as err "truncate() option requires a label option" + exit 198 + } + + tempvar touse + quietly { + mark `touse' `if' `in' + if "`missing'" == "" { + markout `touse' `varlist', strok + } + + sort `touse' `varlist' + quietly by `touse' `varlist': /* + */ gen `type' `g' = 1 if _n == 1 & `touse' + replace `g' = sum(`g') + replace `g' = . 
if `touse' != 1 + + if "`label2'" != "" { + local label "label" + local varlist "`label2'" + } + + if "`label'" != "" { + local dfltfmt : set dp + local dfltfmt = /* + */ cond("`dfltfmt'" == "period","%9.0g","%9,0g") + local truncate=cond("`truncate'" == "","80","`truncate'") + + count if !`touse' + local j = 1 + r(N) + sum `g', meanonly + local max `r(max)' + forval i = 1 / `max' { + tokenize `varlist' + local vtmp " " + local x 1 + while "`1'" != "" { + local vallab : value label `1' + local val = `1'[`j'] + if "`vallab'" != "" { +local vtmp2 : label `vallab' `val' `truncate' + } + else { + cap confirm numeric var `1' + if _rc == 0 { +local vtmp2 = string(`1'[`j'],"`dfltfmt'") + } + else { +local vtmp2 = trim(substr(trim(`1'[`j']),1,`truncate')) + } + } + local x = `x' + length("`vtmp2'") + 1 + local vtmp "`vtmp' `vtmp2'" + mac shift + } + + if `x' >= 80 { + local over = "over" + } + local val `vtmp' + label def $EGEN_Varname `i' "`val'", modify + count if `g' == `i' + local j = `j' + r(N) + } + label val `g' $EGEN_Varname + if "`over'" != "" { + noi di as txt _n /* +*/ "note: value labels exceed 80 characters and were truncated;" _n /* +*/ " use the truncate() option to control this" + } + } + } + + if length("group(`varlist')") > 80 { + note `g' : group(`varlist') + label var `g' "see notes" + } + else label var `g' "group(`varlist')" +end diff --git a/01.code/ado/_/_gelap.ado b/01.code/ado/_/_gelap.ado new file mode 100755 index 0000000..c39367a --- /dev/null +++ b/01.code/ado/_/_gelap.ado @@ -0,0 +1,54 @@ +*! 
1.0.0 CFB 29 Sep 2002 +* 1.1.0 NJC 7 December 2000 _gbom +* 1.0.0 NJC 12 July 2000 +program define _gelap + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + + gettoken time 0 : 0, parse("() ") + + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) ] + + quietly { + tempvar touse dd tyme hh mm ss + mark `touse' `if' `in' + + capture assert `time' >= 0 if `touse' + if _rc { + di in r "`time' contains non time-of-day value(s)" + exit 198 + } + + capture assert `time' == int(`time') if `touse' + if _rc { + di in r "`time' contains non-integer value(s)" + exit 410 + } + + gen int `dd' = int(`time'/86400) if `touse' + gen long `tyme' = `time' - 86400*`dd' if `touse' + gen int `hh' = int(`tyme'/3600) if `touse' + gen int `mm' = int((`tyme' - `hh'*3600)/60) if `touse' + gen int `ss' = int(`tyme' - `hh'*3600 - `mm'*60) if `touse' + gen str15 `g' = string(`dd',"%6.0f") + ":"+ string(`hh',"%02.0f") /* + */ + ":" + string(`mm',"%02.0f") + /* + */ ":" + string(`ss',"%02.0f") if `touse' + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gelap2.ado b/01.code/ado/_/_gelap2.ado new file mode 100755 index 0000000..a7299d0 --- /dev/null +++ b/01.code/ado/_/_gelap2.ado @@ -0,0 +1,60 @@ +*! 
1.0.0 CFB 29 Sep 2002 +* 1.1.0 NJC 7 December 2000 _gbom +* 1.0.0 NJC 12 July 2000 +program define _gelap2 + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + + gettoken time1 0 : 0, parse("() ") + gettoken time2 0 : 0, parse("() ") + + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) ] + + quietly { + tempvar touse diff tyme dd hh mm ss + mark `touse' `if' `in' + + capture assert `time1' >= 0 if `touse' + if _rc { + di in r "`time1' contains invalid value(s)" + exit 198 + } + capture assert `time2' >= 0 if `touse' + if _rc { + di in r "`time2' contains invalid value(s)" + exit 198 + } + capture assert `time2' >= `time1' if `touse' + if _rc { + di in r "`time2' - `time1' must be non-negative" + exit 198 + } + + gen double `diff' = `time2' - `time1' if `touse' + gen int `dd' = int(`diff') if `touse' + gen long `tyme' = 86400*(`diff'-`dd') if `touse' + gen int `hh' = int(`tyme'/3600) if `touse' + gen int `mm' = int((`tyme' - `hh'*3600)/60) if `touse' + gen int `ss' = int(`tyme' - `hh'*3600 - `mm'*60) if `touse' + gen str15 `g' = string(`dd',"%6.0f") + ":"+ string(`hh',"%02.0f") /* + */ + ":" + string(`mm',"%02.0f") + /* + */ ":" + string(`ss',"%02.0f") if `touse' + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_geom.ado b/01.code/ado/_/_geom.ado new file mode 100755 index 0000000..7b51402 --- /dev/null +++ b/01.code/ado/_/_geom.ado @@ -0,0 +1,71 @@ +*! 
1.1.0 NJC 7 December 2000 +* 1.0.0 NJC 12 July 2000 +program define _geom + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("(), ") /* "(" */ + + gettoken month 0 : 0, parse("(), ") + gettoken year 0 : 0, parse("(), ") + if `"`year'"' == "," { + gettoken year 0 : 0, parse("(), ") + } + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) Lag(str) Work ] + + quietly { + tempvar touse + mark `touse' `if' `in' + + if "`lag'" == "" { local lag = 0 } + else { + capture assert `lag' == int(`lag') + if _rc { + di in r "`lag' contains non-integer value(s)" + exit 410 + } + } + + capture assert `month' > 0 & `month' < 13 if `touse' + if _rc { + di in r "`month' contains value(s) not 1 to 12" + exit 198 + } + + capture assert `month' == int(`month') if `touse' + if _rc { + di in r "`month' contains non-integer value(s)" + exit 410 + } + + capture assert `year' == int(`year') if `touse' + if _rc { + di in r "`year' contains non-integer value(s)" + exit 410 + } + + gen long `g' = dofm(ym(`year', `month') - `lag' + 1) - 1 /* + */ if `touse' + + * Sunday? subtract 2; Saturday? subtract 1 + if "`work'" != "" { + replace `g' = `g' - 2 if dow(`g') == 0 + replace `g' = `g' - 1 if dow(`g') == 6 + } + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_geomd.ado b/01.code/ado/_/_geomd.ado new file mode 100755 index 0000000..899a92b --- /dev/null +++ b/01.code/ado/_/_geomd.ado @@ -0,0 +1,44 @@ +*! 
1.1.1 NJC 13 October 2002
+* 1.1.0 NJC 7 December 2000
+* 1.0.0 NJC 12 July 2000
+program define _geomd
+    version 6
+    * egen function eomd(): last day of the month of each daily date
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    * lag() steps back that many months; work moves weekend results to Friday
+    syntax varname(numeric) [if] [in] [ , Format(str) Lag(str) Work ]
+    local d "`varlist'"
+
+    quietly {
+        tempvar touse
+        mark `touse' `if' `in'
+
+        if "`lag'" == "" { local lag = 0 }
+        else {
+            capture assert `lag' == int(`lag')
+            if _rc {
+                di in r "`lag' contains non-integer value(s)"
+                exit 410
+            }
+        }
+
+        gen long `g' = dofm(ym(year(`d'),month(`d'))- `lag' + 1) - 1 /*
+        */ if `touse'
+
+        * Sunday? subtract 2; Saturday? subtract 1
+        if "`work'" != "" {
+            replace `g' = `g' - 2 if dow(`g') == 0
+            replace `g' = `g' - 1 if dow(`g') == 6
+        }
+
+        if "`format'" != "" {
+            capture format `g' `format'
+            if _rc {
+                noi di in bl "`format' invalid format"
+            }
+        }
+    }
+end
+
diff --git a/01.code/ado/_/_gewma.ado b/01.code/ado/_/_gewma.ado
new file mode 100755
index 0000000..5072c69
--- /dev/null
+++ b/01.code/ado/_/_gewma.ado
@@ -0,0 +1,24 @@
+*! 1.0.0 NJC 20 February 2001
+* egen function ewma(): exponentially weighted moving average of a tsset
+* series, g[t] = a * x[t] + (1 - a) * g[t-1], started from the first usable x
+program define _gewma
+    version 6.0
+    qui tsset /* error if not set as time series */
+
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    syntax varname [if] [in] , a(real)
+
+    marksample touse
+
+    qui {
+        gen `type' `g' = `varlist' if `touse'
+        * replace works through the data in order, so L.g is already smoothed;
+        * the touse condition keeps observations excluded by if/in missing
+        * instead of letting the recursion run on into them
+        replace `g' = /*
+        */ `a' * `varlist' + (1 - `a') * L.`g' if L.`g' < . & `touse'
+    }
+
+end
diff --git a/01.code/ado/_/_gfilter.ado b/01.code/ado/_/_gfilter.ado
new file mode 100755
index 0000000..7ce532b
--- /dev/null
+++ b/01.code/ado/_/_gfilter.ado
@@ -0,0 +1,51 @@
+*! 1.1.1 NJC 19 March 2006
+*! 
1.1.0 NJC 5 December 2003 +* 1.0.1 NJC 26 June 2001 +* promoted to 7 to avoid problems with -normali?e- +* 1.0.0 NJC 25 January 2000 aided and abetted by CFB +program _gfilter + version 8 + qui tsset /* error if not set as time series */ + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname(ts) [if] [in] , Lags(numlist int min=1) /// + [ Coef(numlist min=1) Normalise Normalize ] + + local nlags : word count `lags' + local ncoef : word count `coef' + + if `ncoef' == 0 { + local coef : di _dup(`nlags') "1 " + local ncoef `nlags' + } + else if `nlags' != `ncoef' { + di as err "lags() and coef() not consistent" + exit 198 + } + + marksample touse + tokenize `coef' + + if "`normalise'`normalize'" != "" { + local total = 0 + forval i = 1/`ncoef' { + local total = `total' + (``i'') + } + forval i = 1/`ncoef' { + local `i' = ``i'' / `total' + } + } + + local rhs "0" + + forval i = 1/`nlags' { + local l : word `i' of `lags' + local L = -`l' + local op = cond(`l' < 0, "F`L'", "L`l'") + local rhs "`rhs' + (``i'') * `op'.`varlist'" + } + + qui gen `type' `g' = `rhs' if `touse' +end diff --git a/01.code/ado/_/_gfirst.ado b/01.code/ado/_/_gfirst.ado new file mode 100755 index 0000000..ce5c61f --- /dev/null +++ b/01.code/ado/_/_gfirst.ado @@ -0,0 +1,15 @@ +*! 1.0.0 NJC 31 May 2000 +program define _gfirst + version 6.0 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname [if] [in] [, BY(varlist) ] + marksample touse, strok + tempvar order + gen long `order' = _n + sort `touse' `by' `order' + * ignore user-supplied `type' + local type : type `varlist' + qui by `touse' `by' : gen `type' `g' = `varlist'[1] if `touse' +end diff --git a/01.code/ado/_/_gfoy.ado b/01.code/ado/_/_gfoy.ado new file mode 100755 index 0000000..73a0d43 --- /dev/null +++ b/01.code/ado/_/_gfoy.ado @@ -0,0 +1,46 @@ +*! 
2.0.0 NJC 21 March 2006 +* 1.1.0 NJC 24 August 2005 +* 1.0.0 NJC 9 April 2002 +program _gfoy + version 8 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname(numeric) [if] [in] [, Month(int 1) Day(int 1) ] + local v "`varlist'" + local m = `month' + local d = `day' + marksample touse + + quietly { + tempvar y + gen `y' = year(`v') if `touse' + replace `y' = `y' - 1 if mdy(`m', `d', `y') > `v' + gen `g' = `v' - mdy(`m', `d', `y') + 1 if `touse' + + local limits mdy(`m', `d', `y'), mdy(`m',`d', `y' + 1) - 1 + local leap inrange(mdy(2, 29, `y'), `limits') + local leap (`leap' | inrange(mdy(2, 29, `y' + 1), `limits')) + + cap assert `v' == int(`v') if `touse' + + if _rc replace `g' = (`g' - 1) / (365 + `leap') + else replace `g' = (`g' - 0.5) / (365 + `leap') + } +end + +/* + + days are integers: + + fraction of year = (day of year - 0.5) / # days in year + day of year = 1 on 1 January, ... ,365 or 366 on 31 December + # days in year = day of year of December 31 in same year + + days are not integers: + + fractional part = time after midnight as fraction of day + integer part gives day + +*/ + diff --git a/01.code/ado/_/_ggmean.ado b/01.code/ado/_/_ggmean.ado new file mode 100755 index 0000000..fe86599 --- /dev/null +++ b/01.code/ado/_/_ggmean.ado @@ -0,0 +1,14 @@ +*! NJC 1.0.0 9 December 1999 +program define _ggmean + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + + tempvar touse + quietly { + gen byte `touse' = 1 `if' `in' + sort `touse' `by' + by `touse' `by': gen `typlist' `varlist' = /* + */ sum(log(`exp')) / sum((log(`exp'))!=.) if `touse'==1 + by `touse' `by': replace `varlist' = exp(`varlist'[_N]) + } +end diff --git a/01.code/ado/_/_ggroup2.ado b/01.code/ado/_/_ggroup2.ado new file mode 100755 index 0000000..4db5457 --- /dev/null +++ b/01.code/ado/_/_ggroup2.ado @@ -0,0 +1,107 @@ +*! NJC 20 Sept 2001 +*! 
version 2.0.4 19oct2000 +program define _ggroup2 + version 7 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] [, Missing BY(string) Label /* + */ Truncate(numlist max=1 int >= 1) SOrt(str) ] + + if `"`by'"' != "" { + _egennoby group() `"`by'"' + /* NOTREACHED */ + } + + if "`truncate'" != "" & "`label'" == "" { + di as err "truncate() option requires the label option" + exit 198 + } + + tempvar touse + quietly { + mark `touse' `if' `in' + if "`missing'"=="" { + markout `touse' `varlist', strok + } + + if "`sort'" != "" { + tempvar sresult + local hold "$EGEN_Varname" + capture egen `sresult' = `sort', by(`touse' `varlist') + global EGEN_Varname `hold' + if _rc { + di as err "invalid sort(`sort')" + exit 198 + } + local stext " by `sort'" + } + * `sresult' will be blank if -sort()- not used + * `stext' ditto + + sort `touse' `sresult' `varlist' + quietly by `touse' `sresult' `varlist': /* + */ gen `type' `g'=1 if _n==1 & `touse' + replace `g'=sum(`g') + replace `g'=. 
if `touse'!=1 + + if "`label'"!="" { + local dfltfmt : set dp + local dfltfmt = /* + */ cond("`dfltfmt'"=="period","%9.0g","%9,0g") + local truncate=cond("`truncate'"=="","80","`truncate'") + + count if !`touse' + local j = 1 + r(N) + sum `g', meanonly + local max `r(max)' + local i 1 + while `i' <= `max' { + tokenize `varlist' + local vtmp " " + local x 1 + while "`1'"!="" { + local vallab : value label `1' + local val = `1'[`j'] + if "`vallab'" != "" { +local vtmp2 : label `vallab' `val' `truncate' + } + else { + cap confirm numeric var `1' + if _rc==0 { +local vtmp2 = string(`1'[`j'],"`dfltfmt'") + } + else { +local vtmp2 = trim(substr(trim(`1'[`j']),1,`truncate')) + } + } + local x = `x' + length("`vtmp2'") + 1 + local vtmp "`vtmp' `vtmp2'" + mac shift + } + + if `x' >= 80 { + local over = "over" + } + local val `vtmp' + label def $EGEN_Varname `i' "`val'", modify + count if `g' == `i' + local j = `j' + r(N) + local i = `i' + 1 + } + label val `g' $EGEN_Varname + if "`over'" != "" { + noi di as txt _n /* +*/ "note: value labels exceed 80 characters and were truncated;" _n /* +*/ " use the truncate() option to control this" + } + } + } + + if length("group(`varlist'`stext')") > 80 { + note `g' : group(`varlist'`stext') + label var `g' "see notes" + } + else label var `g' "group(`varlist'`stext')" +end diff --git a/01.code/ado/_/_ghmean.ado b/01.code/ado/_/_ghmean.ado new file mode 100755 index 0000000..c9e4502 --- /dev/null +++ b/01.code/ado/_/_ghmean.ado @@ -0,0 +1,16 @@ +*! NJC 1.0.0 9 December 1999 +program define _ghmean + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + + tempvar touse + quietly { + gen byte `touse' = 1 `if' `in' + sort `touse' `by' + by `touse' `by': gen `typlist' `varlist' = /* + */ cond((`exp') > 0, 1 / (`exp'), . ) if `touse' == 1 + by `touse' `by': replace `varlist' = /* + */ sum(`varlist') / sum(`varlist' != .) 
+ by `touse' `by': replace `varlist' = 1 / (`varlist'[_N]) + } +end diff --git a/01.code/ado/_/_ghmm.ado b/01.code/ado/_/_ghmm.ado new file mode 100755 index 0000000..2b397ed --- /dev/null +++ b/01.code/ado/_/_ghmm.ado @@ -0,0 +1,37 @@ +*! 1.0.0 NJC 12 March 2000 +program define _ghmm + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(numeric) [if] [in] [, Round(numlist >0 max=1) Trim ] + marksample touse, strok + local type "str1" /* ignores type passed from -egen- */ + + quietly { + gen `type' `g' = "" + tempvar wrk + + * minutes + gen `wrk' = int(`varlist'/ 60) + replace `g' = string(`wrk') + ":" if `touse' + + * seconds + replace `wrk' = mod(`varlist', 60) + if "`round'" != "" { replace `wrk' = round(`wrk', `round') } +replace `g' = `g' + cond(`wrk' < 10, "0", "") + string(`wrk') if `touse' + if "`trim'" != "" { + local goon 1 + while `goon' { +count if `touse' & (substr(`g',1,1) == "0" | substr(`g',1,1) == ":") & `g' != "0" + local goon = r(N) + if `goon' { +replace `g' = substr(`g',2,.) if `touse' & (substr(`g',1,1) == "0" | substr(`g',1,1) == ":") & `g' != "0" + } + } + compress `g' + } + } +end diff --git a/01.code/ado/_/_ghmmss.ado b/01.code/ado/_/_ghmmss.ado new file mode 100755 index 0000000..4fcbd57 --- /dev/null +++ b/01.code/ado/_/_ghmmss.ado @@ -0,0 +1,41 @@ +*! 
1.0.0 NJC 12 March 2000 +program define _ghmmss + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(numeric) [if] [in] [, Round(numlist >0 max=1) Trim ] + marksample touse, strok + local type "str1" /* ignores type passed from -egen- */ + + quietly { + gen `type' `g' = "" + tempvar wrk + + * hours + gen `wrk' = int(`varlist'/ 3600) + replace `g' = string(`wrk') + ":" if `touse' & `wrk' + + * minutes + replace `wrk' = int(mod(`varlist', 3600) / 60) +replace `g' = `g' + cond(`wrk' < 10, "0", "") + string(`wrk') + ":" if `touse' + + * seconds + replace `wrk' = mod(`varlist', 60) + if "`round'" != "" { replace `wrk' = round(`wrk', `round') } +replace `g' = `g' + cond(`wrk' < 10, "0", "") + string(`wrk') if `touse' + if "`trim'" != "" { + local goon 1 + while `goon' { +count if `touse' & (substr(`g',1,1) == "0" | substr(`g',1,1) == ":") & `g' != "0" + local goon = r(N) + if `goon' { +replace `g' = substr(`g',2,.) if `touse' & (substr(`g',1,1) == "0" | substr(`g',1,1) == ":") & `g' != "0" + } + } + compress `g' + } + } +end diff --git a/01.code/ado/_/_ghms.ado b/01.code/ado/_/_ghms.ado new file mode 100755 index 0000000..06df063 --- /dev/null +++ b/01.code/ado/_/_ghms.ado @@ -0,0 +1,75 @@ +*! 
1.1.0 NJC 27 Feb 2006 +* 1.0.0 CFB 29 Sep 2002 +* 1.1.0 NJC 7 December 2000 _gbom +* 1.0.0 NJC 12 July 2000 +program define _ghms + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + + gettoken hour 0 : 0, parse("() ") + gettoken min 0 : 0, parse("() ") + gettoken sec 0 : 0, parse("() ") + + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) ] + + quietly { + tempvar touse + mark `touse' `if' `in' + replace `touse' = 0 if missing(`hour', `min', `sec') + + capture assert `hour' >= 0 & `hour' < 24 if `touse' + if _rc { + di in r "`hour' contains value(s) not 0 to 23" + exit 198 + } + + capture assert `hour' == int(`hour') if `touse' + if _rc { + di in r "`hour' contains non-integer value(s)" + exit 410 + } + + capture assert `min' >= 0 & `min' < 60 if `touse' + if _rc { + di in r "`min' contains value(s) not 0 to 59" + exit 198 + } + + capture assert `min' == int(`min') if `touse' + if _rc { + di in r "`min' contains non-integer value(s)" + exit 410 + } + + capture assert `sec' >= 0 & `sec' < 60 if `touse' + if _rc { + di in r "`sec' contains value(s) not 0 to 59" + exit 198 + } + + capture assert `sec' == int(`sec') if `touse' + if _rc { + di in r "`sec' contains non-integer value(s)" + exit 410 + } + + gen long `g' = `sec' + `min'*60 + `hour'*3600 if `touse' + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gifirst.ado b/01.code/ado/_/_gifirst.ado new file mode 100755 index 0000000..cf914ce --- /dev/null +++ b/01.code/ado/_/_gifirst.ado @@ -0,0 +1,38 @@ +*! 
1.0.0 NJC 8 Feb 2000 +program define _gifirst + version 6.0 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname(numeric) [if] [in], Value(int) /* + */ [ BY(varlist) BEfore After ] + + if ("`before'" != "") & ("`after'" != "") { + di in r "must choose between -before- and -after-" + exit 198 + } + + tempvar touse order neqval + mark `touse' `if' `in' + gen long `order' = _n + sort `touse' `by' `order' + + quietly { + by `touse' `by' : gen `neqval' = /* + */ sum(`varlist' == `value') if `touse' + + /* ignore user-supplied `type' */ + if "`before'`after'" == "" { + by `touse' `by': gen byte `g' = /* + */ `neqval' == 1 & (_n == 1 | `neqval'[_n-1] == 0) if `touse' + } + else if "`before'" != "" { + by `touse' `by': gen byte `g' = /* + */ `neqval' == 0 if `touse' + } + else if "`after'" != "" { + by `touse' `by': gen byte `g' = /* + */ (`neqval' > 0) & (_n > 1 & `neqval'[_n-1] > 0) if `touse' + } + } +end diff --git a/01.code/ado/_/_gilast.ado b/01.code/ado/_/_gilast.ado new file mode 100755 index 0000000..b5a1c1d --- /dev/null +++ b/01.code/ado/_/_gilast.ado @@ -0,0 +1,42 @@ +*! 
1.0.0 NJC 8 Feb 2000
+program define _gilast
+    version 6.0
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    syntax varname(numeric) [if] [in], Value(int) /*
+    */ [ BY(varlist) BEfore After ]
+    * ilast(): 1 marks the last obs in each by() group with varname == value()
+    if ("`before'" != "") & ("`after'" != "") {
+        di in r "must choose between -before- and -after-"
+        exit 198
+    }
+    * with before or after, 1 marks the obs before or after that last match
+    tempvar touse order neqval
+    mark `touse' `if' `in'
+    gen long `order' = _n
+    sort `touse' `by' `order'
+    * running count of matches within each group, in the original order
+    quietly {
+        by `touse' `by' : gen `neqval' = /*
+        */ sum(`varlist' == `value') if `touse'
+        * the result stays missing outside if/in in all three branches
+        /* ignore user-supplied `type' */
+        if "`before'`after'" == "" {
+            by `touse' `by': gen byte `g' = /*
+    */ `neqval' == `neqval'[_N] & `neqval' != `neqval'[_n-1] if `touse'
+            by `touse' `by': replace `g' = 0 /*
+            */ if `neqval'[_N] == 0
+        }
+        else if "`before'" != "" {
+            by `touse' `by': gen byte `g' = /*
+            */ `neqval' < `neqval'[_N] | `neqval'[_N] == 0 if `touse'
+        }
+        else if "`after'" != "" {
+            by `touse' `by': gen byte `g' = /*
+    */ `neqval' == `neqval'[_N] & `neqval' == `neqval'[_n-1] if `touse'
+            by `touse' `by': replace `g' = 0 /*
+            */ if `neqval'[_N] == 0
+        }
+    }
+end
diff --git a/01.code/ado/_/_gincss.ado b/01.code/ado/_/_gincss.ado
new file mode 100755
index 0000000..31eeb6f
--- /dev/null
+++ b/01.code/ado/_/_gincss.ado
@@ -0,0 +1,24 @@
+*! 
1.0.1 NJC 4 July 2000
+* 1.0.0 NJC 20 March 2000
+program define _gincss
+    version 6.0
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    syntax varlist(string) [if] [in] , Substr(str) [ Insensitive ]
+    tempvar touse
+    mark `touse' `if' `in'
+    tokenize `varlist'
+    if "`insensitive'" != "" {
+        local substr = lower(`"`substr'"')
+        local lower "lower"
+    }
+    quietly {
+        gen byte `g' = 0 /* ignore user-supplied `type' */
+        while "`1'" != "" {
+            replace `g' = 1 /*
+            */ if index(`lower'(`1'),`"`substr'"') & `touse'
+            mac shift
+        }
+    }
+end
diff --git a/01.code/ado/_/_giso3166.ado b/01.code/ado/_/_giso3166.ado
new file mode 100755
index 0000000..182d585
--- /dev/null
+++ b/01.code/ado/_/_giso3166.ado
@@ -0,0 +1,108 @@
+* 1.0.2 UK 31 Aug 2007 (new url of codelist)
+* 1.0.1 NJC 1 Feb 2007
+*! version 1.0.0 January 31, 2007 @ 11:36:16
+*! Generates ISO 3166 country codes and country names
+* egen function iso3166(): translates country names to ISO 3166 codes
+* (origin(names), the default) or codes to names (origin(codes)), using a
+* code list kept in the PERSONAL ado directory and downloaded when needed
+program _giso3166
+version 9.2
+
+    gettoken type 0 : 0
+    gettoken h 0 : 0
+    gettoken eqs 0 : 0
+
+    syntax varname [, Origin(string) Update Language(string) Verbose]
+
+    // Default Settings etc.
+    if "`origin'" == "" local origin "names"
+    local destination = cond("`origin'" == "names","codes","names")
+    local language = cond("`language'"=="","en","`language'")
+
+
+    // Error-Checks
+    if "`origin'" != "codes" & "`origin'" != "names" {
+        di `"{err}origin(`origin') invalid: use -origin(names)- or -origin(codes)-"'
+        exit 198
+    }
+
+    if "`language'" != "fr" & "`language'" != "en" {
+        di `"{err}language(`language') invalid: use -language(fr)- or -language(en)-"'
+        exit 198
+    }
+
+    // Declarations
+    tempvar _iso`origin'
+
+    // Take care the Code-lists exist
+    local url `"http://www.iso.org/iso"'
+    local codelist `"`c(sysdir_personal)'/iso3166`language'.txt"'
+
+    // Download only on request (-update-) or when no local copy exists.
+    // The option macro is tested, not a literal: a literal string is never
+    // empty and would force a download on every call.
+    capture confirm file `"`codelist'"'
+    if "`update'" != "" | _rc {
+        capture mkdir `"`c(sysdir_personal)'"'
+        copy `"`url'/iso3166_`language'_code_lists.txt"' `"`codelist'"', replace text public
+    }
+
+    quietly {
+
+        preserve
+
+        // Prepare ISO codelist
+        insheet _isonames _isocodes using `"`codelist'"', clear delimit(";")
+        drop in 1
+        compress
+        ren _iso`origin' `_iso`origin''
+        sort `_iso`origin''
+        tempfile isocodes
+        save `isocodes'
+
+        // Prepare user file
+        restore
+        capture confirm string variable `varlist'
+        if _rc {
+            decode `varlist', gen(`_iso`origin'')
+            replace `_iso`origin'' = trim(upper(`_iso`origin''))
+        }
+        else gen `_iso`origin'' = trim(upper(`varlist'))
+
+        // Correct some frequent user errors (is this a good idea?)
+        // capture assert `_iso`origin'' != "RUSSIA"
+        // if _rc {
+        // noi di `"{txt}`varlist' contains "{res}Russia{txt}". "{res}Russian Federation{txt}" assumed."'
+        // replace `_iso`origin'' = "RUSSIAN FEDERATION" if `_iso`origin''=="RUSSIA"
+        // }
+        // capture assert `_iso`origin'' != "GREAT BRITAIN"
+        // if _rc {
+        // noi di `"{txt}`varlist' contains "{res}Great Britain{txt}". "{res}United Kingdom{txt}" assumed ."'
+        // replace `_iso`origin'' = "UNITED KINGDOM" if `_iso`origin''=="GREAT BRITAIN"
+        // }
+        // if _rc {
+        // noi di `"{txt}`varlist' contains "{res}Taiwan{txt}". "{res}Taiwan, Province of China{txt}" assumed ."'
+        // replace `_iso`origin'' = "TAIWAN" if `_iso`origin''=="TAIWAN, PROVINCE OF CHINA"
+        // }
+
+        // Merge ISO codes to user file
+        sort `_iso`origin''
+        merge `_iso`origin'' using `isocodes', nokeep
+        ren _iso`destination' `h'
+        if "`origin'" == "codes" {
+            replace `h' = trim(itrim(proper(`h')))
+            compress `h'
+        }
+
+        // Produce verbose output
+        capture assert _merge == 3
+        if "`verbose'" != "" & _rc {
+            noi di _n "{txt}note: could not find ISO 3166 information for "
+            tempvar marker
+            by `varlist', sort: gen `marker' = _n==1 & _merge==1
+            noi list `varlist' if `marker' , noobs
+            noi di `"{txt}check spelling ({view `c(sysdir_personal)'/iso3166`language'.txt:show codelist})"'
+        }
+        drop _merge
+    }
+end
diff --git a/01.code/ado/_/_glastnm.ado b/01.code/ado/_/_glastnm.ado
new file mode 100755
index 0000000..2f7c123
--- /dev/null
+++ b/01.code/ado/_/_glastnm.ado
@@ -0,0 +1,17 @@
+*! 1.0.0 NJC 31 May 2000
+* egen function lastnm(): last nonmissing value of varname within each
+* by() group, taking observations in their current order
+program define _glastnm
+    version 6.0
+    gettoken type 0 : 0
+    gettoken g 0 : 0
+    gettoken eqs 0 : 0
+    syntax varname [if] [in] [, BY(varlist) ]
+    marksample touse, strok
+    tempvar order
+    gen long `order' = _n
+    sort `touse' `by' `order'
+    * ignore user-supplied `type'
+    local type : type `varlist'
+    qui by `touse' `by' : gen `type' `g' = `varlist'[_N] if `touse'
+end
diff --git a/01.code/ado/_/_gminutes.ado b/01.code/ado/_/_gminutes.ado
new file mode 100755
index 0000000..7240b27
--- /dev/null
+++ b/01.code/ado/_/_gminutes.ado
@@ -0,0 +1,85 @@
+*! 
1.0.1 NJC 20 February 2003 +* 1.0.0 NJC 24 January 2003 +program _gminutes + version 8 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + gettoken time 0 : 0, parse("() ") + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + confirm str var `time' + syntax [if] [in] [ , maxhour(int 24) ] + + quietly { + tempvar touse shour hour smin min + mark `touse' `if' `in' + markout `touse' `time', strok + + capture assert index(`time',":") if `touse' + if _rc { + di as err "missing colon(s) in `time'" + exit 459 + } + else { + gen `shour' = /// + substr(`time',1,index(`time',":")-1) if `touse' + gen `hour' = real(`shour') + + count if mi(`hour') & `touse' + if r(N) { + di as err "problematic characters in `time'" + exit 459 + } + + gen `smin' = substr(`time',index(`time',":")+1,.) + } + + capture assert !index(`smin',":") if `touse' + if _rc { + di as err "too many colons in `time'" + exit 459 + } + else { + gen `min' = real(`smin') + + count if mi(`min') & `touse' + if r(N) { + di as err "problematic characters in `time'" + exit 459 + } + } + + capture assert `hour' >= 0 & `hour' < `maxhour' if `touse' + if _rc { + di as err "hour value(s) not 0 to `--maxhour'" + exit 459 + } + + capture assert `hour' == int(`hour') if `touse' + if _rc { + di as err "hours contain non-integer value(s)" + exit 459 + } + + capture assert `min' >= 0 & `min' < 60 if `touse' + if _rc { + di as err "minute value(s) not 0 to 59" + exit 459 + } + + capture assert `min' == int(`min') if `touse' + if _rc { + di as err "minutes contain non-integer value(s)" + exit 459 + } + + gen long `g' = `min' + `hour' * 60 if `touse' + } +end + diff --git a/01.code/ado/_/_gmixnorm.ado b/01.code/ado/_/_gmixnorm.ado new file mode 100755 index 0000000..f86a135 --- /dev/null +++ b/01.code/ado/_/_gmixnorm.ado @@ -0,0 +1,30 @@ +*! 
_gmixnorm CFBaum 09may2005 function to generate mixture of normals +* with differing means and variances (invoke as mixnorm() ) +* MU1, MU2: means of distributions 1 and 2 (default 0 0) +* VAR1, VAR2 = variances of distributions 1 and 2 (default 1 1) +* Frac: fraction of sample with Low variance (default 0.5) + +capt program drop _gmixnorm +program _gmixnorm, rclass + version 8 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax anything [, Frac(real 0.5) MU1(real 0) MU2(real 0) VAR1(real 1) VAR2(real 1) ] + + if `frac' < 0.01 | `frac' > 0.99 { + di as err "frac must be in unit interval" + error 198 + } + if `var1' <= 0 | `var2' <= 0 { + di as err "var1 and var2 must be > 0" + error 198 + } + tempname s1 s2 + scalar `s1' = sqrt(`var1') + scalar `s2' = sqrt(`var2') + qui g `type' `g' = cond(uniform() < `frac', /// + `s1'*invnorm(uniform()) + `mu1', /// + `s2'*invnorm(uniform()) + `mu2') +end diff --git a/01.code/ado/_/_gmlabvpos.ado b/01.code/ado/_/_gmlabvpos.ado new file mode 100755 index 0000000..9d70f37 --- /dev/null +++ b/01.code/ado/_/_gmlabvpos.ado @@ -0,0 +1,90 @@ +* _gmllabvpos version 1.0.1 NJC 19 March 2006 +*! 
_gmllabvpos version 1.0 UK 19 Feb 04 +* Generates Clock-Positions for scatter-option mlabvpos() as proposed by Cleveland +program _gmlabvpos + version 8.2 + gettoken type 0 : 0 + gettoken h 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(min=2 max=2) [if] [in] [, POLYnomial(int 1) LOG MATrix(string)] + gettoken yvar varlist: varlist + gettoken xvar: varlist + + quietly { + // Clock-Position Matrix + if "`matrix'" == "" { + matrix input clock = (11 12 12 12 1 \\ /// + 10 11 12 1 2 \\ /// + 9 9 12 3 3 \\ /// + 8 7 6 5 4 \\ /// + 7 6 6 6 5 ) + } + if "`matrix'" ~= "" { + matrix input clock = (`matrix') + } + + // Log-Option + if "`log'" != "" { + tempvar log + gen `log' = log(`xvar') + } + + // Polynomials + local indep = cond("`log'" == "","`xvar'","`log'") + if `polynomial' > 1 { + forv i = 1/`polynomial' { + tempvar poly`i' + gen `poly`i'' = `xvar'^`i' + local polyterm `polyterm' `poly`i'' + } + } + + // Calculate Residuals + tempvar yhat resid leverage + regress `yvar' `indep' `polyterm' `if' `in' + predict `yhat' + predict `resid', resid + + // Categorize xvar into 5 groups + tempvar `xvar'g + sum `xvar' `if' `in', meanonly + local urange = r(max) - r(mean) + local lrange = r(mean) - r(min) + gen ``xvar'g' = 1 /// + if inrange(`xvar',r(min),r(mean)-`lrange'/5 * 3) + replace ``xvar'g' = 2 /// + if inrange(`xvar',r(mean)-`lrange'/5*3,r(mean)-`lrange'/5) + replace ``xvar'g' = 3 /// + if inrange(`xvar',r(mean)-`lrange'/5,r(mean)+`urange'/5) + replace ``xvar'g' = 4 /// + if inrange(`xvar',r(mean)+`urange'/5,r(mean)+`urange'/5*3) + replace ``xvar'g' = 5 /// + if inrange(`xvar',r(mean)+`urange'/5*3,r(max)) + + // Categorize yvar into 5 groups, according to reg-residuals + tempvar `yvar'g + sum `resid', meanonly + local urange = r(max) - r(mean) + local lrange = r(mean) - r(min) + gen ``yvar'g' = 1 /// + if inrange(`resid',r(min),r(mean)-`lrange'/5 * 3) + replace ``yvar'g' = 2 /// + if inrange(`resid',r(mean)-`lrange'/5*3,r(mean)-`lrange'/5) + replace ``yvar'g' = 3 /// + 
if inrange(`resid',r(mean)-`lrange'/5,r(mean)+`urange'/5) + replace ``yvar'g' = 4 /// + if inrange(`resid',r(mean)+`urange'/5,r(mean)+`urange'/5*3) + replace ``yvar'g' = 5 /// + if inrange(`resid',r(mean)+`urange'/5*3,r(max)) + + // generate clock-position according to Matrix + gen `type' `h' = . + forv i=1/5 { + forv j=1/5 { + replace `h' = clock[`i',`j'] /// + if (5 -``yvar'g') +1 == `i' & ``xvar'g' == `j' + } + } + } +end diff --git a/01.code/ado/_/_gmsub.ado b/01.code/ado/_/_gmsub.ado new file mode 100755 index 0000000..a838b11 --- /dev/null +++ b/01.code/ado/_/_gmsub.ado @@ -0,0 +1,53 @@ +*! 1.0.0 NJC 11 December 2000 +program define _gmsub + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(max=1 string) [if] [in], /* + */ Find(str asis) [ Replace(str asis) N(int -1) Word ] + + local fcn = cond("`word'" != "", "subinword", "subinstr") + + * doesn't work => user needs to update Stata + capture local bar = `fcn'("foo","foo","bar",.) + if _rc { + di in r "Your version of Stata doesn't recognise `fcn'( )." + di in r "I guess that you need to update." 
+ exit 198 + } + + local nfind : word count `find' + local nrepl : word count `replace' + + if `nrepl' == 0 { /* no replacement => delete */ + local nrepl = `nfind' + } + else if `nrepl' == 1 { /* many to one replacements allowed */ + local nrepl = `nfind' + local replace : di _dup(`nfind') `"`replace' "' + } + else if `nfind' != `nrepl' { + di in r "number of find and replace arguments not equal" + exit 198 + } + + marksample touse, strok + local type "str1" /* ignores type passed from -egen- */ + local n = cond(`n' == -1, ., `n') + + quietly { + gen `type' `g' = "" + replace `g' = `varlist' if `touse' + + local i = 1 + while `i' <= `nfind' { + local f : word `i' of `find' + local r : word `i' of `replace' + replace `g' = `fcn'(`g', `"`f'"', `"`r'"', `n') + local i = `i' + 1 + } + } +end diff --git a/01.code/ado/_/_gncyear.ado b/01.code/ado/_/_gncyear.ado new file mode 100755 index 0000000..4039c54 --- /dev/null +++ b/01.code/ado/_/_gncyear.ado @@ -0,0 +1,56 @@ +*! NJC 1.2.0 9 March 2004 fix v.8 labelling problem +*! NJC 1.1.0 23 February 2000 after a WWG idea +program define _gncyear + version 6 + syntax newvarname =/exp [if] [in] , Month(int) [ Day(int 1)] + + if `month' == 1 & `day' == 1 { + di in r "calendar years requested" + exit 198 + } + + * test month and day + local test = mdy(`month',`day',2000) + if `test' == . { + di in r "invalid start date" + exit 198 + } + + quietly { + tempvar touse date + gen byte `touse' = 1 `if' `in' + replace `touse' = 0 if `touse' == . + gen `date' = `exp' + + * ignore any user-specified type + gen int `varlist' = . + + * version 8 handles labelling differently + if "$EGEN_Varname" != "" { + local vlabel "$EGEN_Varname" + } + else tempname vlabel + + su `date' if `touse', meanonly + local ymin = year(r(min)) + local ymax = year(r(max)) + + local y = `ymin' - 1 + while `y' <= `ymax' { + local start = mdy(`month',`day',`y') + if `start' == . 
{ /* 29 February */ + local start = mdy(3,1,`y') + } + local end = mdy(`month',`day',`y' + 1) + if `end' == . { /* 29 February: use 1 March, as for start */ + local end = mdy(3,1,`y' + 1) + } + local Yp1 = mod(`y' + 1,100) + replace `varlist' = `y' /* + */ if `date' >= `start' & `date' < `end' & `touse' + label def `vlabel' `y' "`y'/`Yp1'", modify + local y = `y' + 1 + } + label val `varlist' `vlabel' + } +end diff --git a/01.code/ado/_/_gnmiss.ado b/01.code/ado/_/_gnmiss.ado new file mode 100755 index 0000000..ac875da --- /dev/null +++ b/01.code/ado/_/_gnmiss.ado @@ -0,0 +1,13 @@ +*! 1.1.0 NJC 7 January 2000 +program define _gnmiss + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + tempvar touse + quietly { + mark `touse' `if' `in' + sort `touse' `by' + by `touse' `by': gen `typlist' `varlist' = /* + */ sum(missing(`exp')) if `touse' + by `touse' `by': replace `varlist' = `varlist'[_N] + } +end diff --git a/01.code/ado/_/_gnoccur.ado b/01.code/ado/_/_gnoccur.ado new file mode 100755 index 0000000..8686535 --- /dev/null +++ b/01.code/ado/_/_gnoccur.ado @@ -0,0 +1,38 @@ +*! Nick Winter 1.0.2 10 Oct 2002 +program define _gnoccur + version 7 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(string) [if] [in] , String(string) + + local size = length(`"`string'"') + tempvar new pos count + qui { + gen str1 `new' = "" + replace `new' = `varlist' + gen int `count' = 0 + gen int `pos' = index(`new',`"`string'"') + capture assert `pos' == 0 `if' `in' + + while _rc { + replace `count' = `count' + (`pos' != 0) + replace `new' = substr(`new',`pos'+`size',.) + replace `pos' = index(`new',`"`string'"') + capture assert `pos' == 0 `if' `in' + } + + * ignore user-specified type; a byte will often be enough + gen int `g' = `count' `if' `in' + compress `g' + } +end + +/* +There is a trade-off on `if' `in': +0. We only to generate `if' `in' +1. We don't want to loop if there is nothing left to count +2. 
We suspect that `if' can slow things down +*/ diff --git a/01.code/ado/_/_gnss.ado b/01.code/ado/_/_gnss.ado new file mode 100755 index 0000000..021fd27 --- /dev/null +++ b/01.code/ado/_/_gnss.ado @@ -0,0 +1,50 @@ +*! 1.0.0 NJC 12 July 2000 +program define _gnss + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(string) [if] [in], Find(str) [ Insensitive ] + + marksample touse, strok + tempvar lower ndx rmndr + + quietly { + /* ignores type passed from -egen- */ + gen byte `g' = 0 if `touse' + local flen = length(`"`find'"') + local type : type `varlist' + + if "`insensitive'" != "" { + gen `type' `lower' = lower(`varlist') + local varlist `lower' + local find = lower(`"`find'"') + local flen2 = length(`"`find'"') + local dflen = `flen' - `flen2' + if `dflen' { + local spaces : di _dup(`dflen') " " + local find `"`spaces'`find'"' + } + } + + gen long `ndx' = index(`varlist', `"`find'"') * `touse' /* long, not byte: a match position above 100 would be stored as missing */ + count if `ndx' + if r(N) == 0 { exit 0 } + + gen `type' `rmndr' = `varlist' if `touse' + + while 1 { + replace `g' = `g' + (`ndx' > 0) + replace `rmndr' = /* + */ cond(`ndx', substr(`rmndr', `ndx'+`flen', .),"") + cap assert `rmndr' == "" + if _rc != 0 { + replace `ndx' = index(`rmndr', `"`find'"') + } + else exit 0 + } + } +end + diff --git a/01.code/ado/_/_gntos.ado b/01.code/ado/_/_gntos.ado new file mode 100755 index 0000000..666820c --- /dev/null +++ b/01.code/ado/_/_gntos.ado @@ -0,0 +1,31 @@ +*! 
1.0.0 NJC 21 January 2001 +program define _gntos + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(numeric min=1) [if] [in], From(numlist) To(str asis) + marksample touse + + local nfrom : word count `from' + local nto : word count `to' + if `nfrom' != `nto' { + di in r "from( ) and to( ) do not match one to one" + exit 198 + } + + quietly { + /* ignore `type' passed from -egen- */ + gen str1 `g' = "" + tokenize `"`to'"' + local i = 1 + while `i' <= `nfrom' { + local fval : word `i' of `from' + replace `g' = `"``i''"' /* + */ if `varlist' == `fval' & `touse' + local i = `i' + 1 + } + } +end diff --git a/01.code/ado/_/_gnvals.ado b/01.code/ado/_/_gnvals.ado new file mode 100755 index 0000000..afff3e0 --- /dev/null +++ b/01.code/ado/_/_gnvals.ado @@ -0,0 +1,23 @@ +*! 1.0.1 NJC 20 November 2000 +*! 1.0.0 NJC 20 July 2000 +program define _gnvals + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] [, by(varlist) MISSing] + tempvar touse + quietly { + mark `touse' `if' `in' + if "`missing'" == "" { + markout `touse' `varlist', strok + } + sort `touse' `by' `varlist' + by `touse' `by' `varlist': gen `type' `g' = _n == 1 if `touse' + by `touse' `by' : replace `g' = sum(`g') if `touse' + by `touse' `by' : replace `g' = `g'[_N] if `touse' + + } +end + diff --git a/01.code/ado/_/_gnwords.ado b/01.code/ado/_/_gnwords.ado new file mode 100755 index 0000000..6419a4c --- /dev/null +++ b/01.code/ado/_/_gnwords.ado @@ -0,0 +1,27 @@ +*! 1.0.0 NJC 17 July 2000 +program define _gnwords + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(max=1 string) [if] [in] + + marksample touse, strok + local type "byte" /* ignores type passed from -egen- */ + + quietly { + gen `type' `g' = . 
+ local i = 1 + while `i' <= _N { + if `touse'[`i'] { + local value = `varlist'[`i'] + local nw : word count `value' + replace `g' = `nw' in `i' + } + local i = `i' + 1 + } + } +end + diff --git a/01.code/ado/_/_goutside.ado b/01.code/ado/_/_goutside.ado new file mode 100755 index 0000000..cd55cb0 --- /dev/null +++ b/01.code/ado/_/_goutside.ado @@ -0,0 +1,26 @@ +*! 1.1.0 NJC 2 June 2004 +*! 1.0.0 NJC 17 May 2004 +program _goutside + version 8.0 + syntax newvarname =/exp [if] [in] /// + [, BY(varlist) FACTor(numlist max=1 >=0) ] + quietly { + tempvar touse group + tempname u l + mark `touse' `if' `in' + sort `touse' `by' + by `touse' `by' : gen long `group' = _n == 1 if `touse' + replace `group' = sum(`group') + local max = `group'[_N] + gen double `varlist' = . + if "`factor'" == "" local factor = 1.5 + + forval i = 1/`max' { + su `exp' if `group' == `i', detail + scalar `u' = r(p75) + `factor' * (r(p75) - r(p25)) + scalar `l' = r(p25) - `factor' * (r(p75) - r(p25)) + replace `varlist' = `exp' /// + if `group' == `i' & (`exp' > `u' | `exp' < `l') + } + } +end diff --git a/01.code/ado/_/_grall.ado b/01.code/ado/_/_grall.ado new file mode 100755 index 0000000..c62dd1e --- /dev/null +++ b/01.code/ado/_/_grall.ado @@ -0,0 +1,32 @@ +*! 
1.0.0 NJC 12 February 2001 +program define _grall + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] , Cond(string asis) [ SYmbol(str) ] + + if "`symbol'" == "" { local symbol "@" } + if index(`"`cond'"',"`symbol'") == 0 { + di in r `"`cond' does not contain `symbol'"' + exit 198 + } + + tempvar touse + mark `touse' `if' `in' + + quietly { + * ignore user-supplied `type' + gen int `g' = 0 if `touse' + tokenize `varlist' + local nvars : word count `varlist' + while "`1'" != "" { + local Cond : subinstr local cond "`symbol'" "`1'", all + replace `g' = `g' + (`Cond') + mac shift + } + replace `g' = `g' == `nvars' if `touse' + compress `g' + } +end diff --git a/01.code/ado/_/_grany.ado b/01.code/ado/_/_grany.ado new file mode 100755 index 0000000..a676a60 --- /dev/null +++ b/01.code/ado/_/_grany.ado @@ -0,0 +1,31 @@ +*! 1.0.0 NJC 12 February 2001 +program define _grany + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] , Cond(string asis) [ SYmbol(str) ] + + if "`symbol'" == "" { local symbol "@" } + if index(`"`cond'"',"`symbol'") == 0 { + di in r `"`cond' does not contain `symbol'"' + exit 198 + } + + tempvar touse + mark `touse' `if' `in' + + quietly { + * ignore user-supplied `type' + gen int `g' = 0 if `touse' + tokenize `varlist' + while "`1'" != "" { + local Cond : subinstr local cond "`symbol'" "`1'", all + replace `g' = `g' + (`Cond') + mac shift + } + replace `g' = `g' > 0 if `touse' + compress `g' + } +end diff --git a/01.code/ado/_/_grcount.ado b/01.code/ado/_/_grcount.ado new file mode 100755 index 0000000..014a3e9 --- /dev/null +++ b/01.code/ado/_/_grcount.ado @@ -0,0 +1,30 @@ +*! 
1.0.0 NJC 12 February 2001 +program define _grcount + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist [if] [in] , Cond(string asis) [ SYmbol(str) ] + + if "`symbol'" == "" { local symbol "@" } + if index(`"`cond'"',"`symbol'") == 0 { + di in r `"`cond' does not contain `symbol'"' + exit 198 + } + + tempvar touse + mark `touse' `if' `in' + + quietly { + * just in case someone tries something like -cond(@^2)- + gen `type' `g' = 0 if `touse' + tokenize `varlist' + while "`1'" != "" { + local Cond : subinstr local cond "`symbol'" "`1'", all + replace `g' = `g' + (`Cond') + mac shift + } + compress `g' + } +end diff --git a/01.code/ado/_/_grecord.ado b/01.code/ado/_/_grecord.ado new file mode 100755 index 0000000..4e3fc05 --- /dev/null +++ b/01.code/ado/_/_grecord.ado @@ -0,0 +1,18 @@ +*! 1.2.1 CFB/NJC 8 Oct 2001 +* 1.2.0 CFB/NJC 8 Oct 2001 +* 1.1.0 CFB 06 Oct 2001 +program define _grecord + version 6.0 + syntax newvarname =/exp [if] [in] [, BY(varlist) ORDER(varlist) MIN ] + tempvar touse obsno + local op = cond("`min'" == "min", "min", "max") + quietly { + mark `touse' `if' `in' + gen `typlist' `varlist' = `exp' if `touse' + gen long `obsno' = _n + sort `touse' `by' `order' `obsno' + by `touse' `by': /* + */ replace `varlist' = `op'(`varlist',`varlist'[_n-1]) if `touse' + } +end + diff --git a/01.code/ado/_/_grepeat.ado b/01.code/ado/_/_grepeat.ado new file mode 100755 index 0000000..1c82ff1 --- /dev/null +++ b/01.code/ado/_/_grepeat.ado @@ -0,0 +1,52 @@ +program _grepeat +*! 
1.0.0 NJC 21 July 2003 + version 8.0 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + gettoken lparen 0 : 0, parse("(") + gettoken rparen 0 : 0, parse(")") + + syntax [if] [in] , Values(str asis) [ by(varlist) Block(int 1) ] + + if `block' < 1 { + di as err "block should be at least 1" + exit 498 + } + + marksample touse + tempvar order obs which + qui { + gen long `order' = _n + bysort `touse' `by' (`order') : /// + gen long `obs' = _n if `touse' + + capture numlist "`values'" + local isstr = _rc + // ignore user type + if `isstr' { + gen `g' = "" + local nvals : word count `values' + tokenize `"`values'"' + } + else { + gen double `g' = . + local nvals : word count `r(numlist)' + tokenize "`r(numlist)'" + } + + gen long `which' = 1 + int(mod((`obs' - 1) / `block', `nvals')) + + if `isstr' { + forval i = 1 / `nvals' { + replace `g' = "``i''" if `which' == `i' + } + } + else { + forval i = 1 / `nvals' { + replace `g' = ``i'' if `which' == `i' + } + compress `g' + } + } +end diff --git a/01.code/ado/_/_gridit.ado b/01.code/ado/_/_gridit.ado new file mode 100755 index 0000000..0aaf75e --- /dev/null +++ b/01.code/ado/_/_gridit.ado @@ -0,0 +1,19 @@ +*! 
NJC 1.0.0 19 Oct 2000 +program define _gridit + version 6.0 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + syntax varname [if] [in] [, by(varlist) MISSing REVerse PERCent] + marksample touse + if "`missing'" == "" & "`by'" != "" { markout `touse' `by', strok } + sort `touse' `by' `varlist' + tempvar total pr + qui by `touse' `by': gen `total' = _N + qui by `touse' `by' `varlist': gen `pr' = _N / `total' + qui by `touse' `by': gen `type' `g' = 0.5 * `pr' if `touse' + qui by `touse' `by' `varlist': replace `pr' = `pr' * (_n == _N) + qui by `touse' `by': replace `g' = `g' + sum(`pr'[_n-1]) + if "`reverse'" != "" { replace `g' = 1 - `g' } + if "`percent'" != "" { replace `g' = 100 * `g' } +end diff --git a/01.code/ado/_/_grndint.ado b/01.code/ado/_/_grndint.ado new file mode 100755 index 0000000..da1c0dd --- /dev/null +++ b/01.code/ado/_/_grndint.ado @@ -0,0 +1,30 @@ +*! NJC 1.0.0 21 January 2000 +program define _grndint + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + gettoken lparen 0 : 0, parse("(") + gettoken rparen 0 : 0, parse(")") + syntax [if] [in] , MAx(int) [ MIn(int 1) ] + + if `max' <= `min' { + di in r "max(`max') does not exceed min(`min')" + exit 198 + } + + tempvar touse + quietly { + mark `touse' `if' `in' + /* ignore user `type'; pick the smallest type whose nonmissing range (byte -127..100, int -32767..32740) holds min..max */ + if `max' <= 100 & `min' >= -127 { + local type "byte" + } + else if `max' <= 32740 & `min' >= -32767 { + local type "int" + } + else local type "long" + gen `type' `g' = /* + */ `min' + int((`max' - `min' + 1) * uniform( )) if `touse' + } +end diff --git a/01.code/ado/_/_grndsub.ado b/01.code/ado/_/_grndsub.ado new file mode 100755 index 0000000..877887f --- /dev/null +++ b/01.code/ado/_/_grndsub.ado @@ -0,0 +1,59 @@ +*! 
NJC 1.2.0 11 January 2000 +* NJC 1.1.0 7 January 2000 +* John Moran suggested -frac( )- +program define _grndsub + version 6 + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + gettoken lparen 0 : 0, parse("(") + gettoken rparen 0 : 0, parse(")") + syntax [if] [in] [, by(varlist) NGroup(int 2) Frac(str) Percent(str)] + + if "`frac'" != "" | "`percent'" != "" { + if "`frac'" != "" & "`percent'" != "" { + di in r "may not combine frac( ) and percent( )" + exit 198 + } + else { + local opt = cond("`frac'" != "", "frac", "percent") + if `ngroup' != 2 { + di in r "`opt'( ) allowed only with ngroup(2)" + exit 198 + } + } + + if "`percent'" != "" { + confirm number `percent' + capture assert `percent' > 0 & `percent' < 100 + if _rc { + di in r "percent( ) should be between 0 and 100" + exit 198 + } + local frac = 100 / `percent' + } + else { + confirm number `frac' + capture assert `frac' > 0 + if _rc { + di in r "frac( ) should be > 0" + exit 198 + } + } + } + + tempvar touse random + quietly { + mark `touse' `if' `in' + gen `random' = uniform( ) + sort `touse' `by' `random' + if "`frac'" != "" { + by `touse' `by' : gen `type' `g' = /* + */ 1 + (_n > (_N / `frac')) if `touse' + } + else { + by `touse' `by': gen `type' `g' = /* + */ group(`ngroup') if `touse' + } + } +end diff --git a/01.code/ado/_/_growmedian.ado b/01.code/ado/_/_growmedian.ado new file mode 100755 index 0000000..758df6d --- /dev/null +++ b/01.code/ado/_/_growmedian.ado @@ -0,0 +1,50 @@ +*! 
NJC 1.2.0 17 Feb 2007 +* NJC 1.1.0 16 Feb 2007 +* NJC 1.0.0 15 Feb 2007 +program _growmedian + version 9 + gettoken type 0 : 0 + gettoken h 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(numeric) [if] [in] [, BY(string)] + if `"`by'"' != "" { + _egennoby rowmedian() `"`by'"' + /* NOTREACHED */ + } + + marksample touse, novarlist + quietly { + mata : row_median("`varlist'", "`touse'", "`h'", "`type'") + } +end + +mata : + +void row_median(string scalar varnames, + string scalar tousename, + string scalar medianname, + string scalar type) +{ + real matrix y + real colvector median, row + real scalar n, i + + st_view(y, ., tokens(varnames), tousename) + median = J(rows(y), 1, .) + + for(i = 1; i <= rows(y); i++) { + row = y[i,]' + if (n = colnonmissing(row)) { // sic + _sort(row, 1) + median[i] = + (row[ceil(n / 2)] + row[floor((n + 2) / 2)]) / 2 // mean of the two middle sorted values; both indices coincide when n is odd + } + } + + st_addvar(type, medianname) + st_store(., medianname, tousename, median) +} + +end + diff --git a/01.code/ado/_/_grownvals.ado b/01.code/ado/_/_grownvals.ado new file mode 100755 index 0000000..5ea82c8 --- /dev/null +++ b/01.code/ado/_/_grownvals.ado @@ -0,0 +1,54 @@ +* NJC 1.0.1 28 Jan 2009 +* NJC 1.0.0 7 Jan 2009 +program _grownvals + version 9 + gettoken type 0 : 0 + gettoken h 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(numeric) [if] [in] [, BY(string) MISSing] + if `"`by'"' != "" { + _egennoby rownvals() `"`by'"' + /* NOTREACHED */ + } + + marksample touse, novarlist + local miss = "`missing'" != "" + quietly { + mata : row_nvals("`varlist'", "`touse'", "`h'", "`type'", `miss') + } +end + +mata : + +void row_nvals(string scalar varnames, + string scalar tousename, + string scalar nvalsname, + string scalar type, + real scalar miss) +{ + real matrix y + real colvector nvals, row + + st_view(y, ., tokens(varnames), tousename) + nvals = J(rows(y), 1, .) 
+ + if (miss) { + for(i = 1; i <= rows(y); i++) { + row = y[i,]' + nvals[i] = length(uniqrows(row)) + } + } + else { + for(i = 1; i <= rows(y); i++) { + row = y[i,]' + nvals[i] = length(uniqrows(select(row, (row :< .)))) + } + } + + st_addvar(type, nvalsname) + st_store(., nvalsname, tousename, nvals) +} + +end + diff --git a/01.code/ado/_/_growsvals.ado b/01.code/ado/_/_growsvals.ado new file mode 100755 index 0000000..ac699d1 --- /dev/null +++ b/01.code/ado/_/_growsvals.ado @@ -0,0 +1,55 @@ +* NJC 1.0.1 28 Jan 2009 +* NJC 1.0.0 7 Jan 2009 +program _growsvals + version 9 + gettoken type 0 : 0 + gettoken h 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(string) [if] [in] [, BY(string) MISSing] + if `"`by'"' != "" { + _egennoby rowsvals() `"`by'"' + /* NOTREACHED */ + } + + marksample touse, novarlist + local miss = "`missing'" != "" + quietly { + mata : row_svals("`varlist'", "`touse'", "`h'", "`type'", `miss') + } +end + +mata : + +void row_svals(string scalar varnames, + string scalar tousename, + string scalar svalsname, + string scalar type, + real scalar miss) +{ + string matrix y + string colvector row + real colvector svals + + st_sview(y, ., tokens(varnames), tousename) + svals = J(rows(y), 1, .) + + if (miss) { + for(i = 1; i <= rows(y); i++) { + row = y[i,]' + svals[i] = length(uniqrows(row)) + } + } + else { + for(i = 1; i <= rows(y); i++) { + row = y[i,]' + svals[i] = length(uniqrows(select(row, (row :!= "")))) + } + } + + st_addvar(type, svalsname) + st_store(., svalsname, tousename, svals) +} + +end + diff --git a/01.code/ado/_/_grsum2.ado b/01.code/ado/_/_grsum2.ado new file mode 100755 index 0000000..3b7acce --- /dev/null +++ b/01.code/ado/_/_grsum2.ado @@ -0,0 +1,47 @@ +*! 
1.0.1 16jul2002 Steven Stillman +* NJC minor edits 16 July 2002 +* version 1.0 12jul2002 Steven Stillman +* created as an extension to version 2.1.3 26jun2000 of _grsum +* adds options to exclude observations with missing values on +* either any or all of the variables chosen +program define _grsum2 + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(numeric) [if] [in] [, BY(string) ANYMiss ALLMiss ] + if `"`by'"' != "" { + _egennoby rsum() `"`by'"' + /* NOTREACHED */ + } + + if "`anymiss'" != "" & "`allmiss'" != "" { + di as err "cannot use anymiss and allmiss options together" + exit 198 + } + + quietly { + tempvar nmiss + local nvar: word count `varlist' + tokenize `varlist' + gen `nmiss' = `1' == . `if' `in' + gen `type' `g' = cond(`1' == . , 0, `1') `if' `in' + mac shift + + while "`1'" != "" { + replace `nmiss' = `nmiss' + (`1' == .) `if' `in' + replace `g' = `g' + cond(`1' == ., 0, `1') `if' `in' + mac shift + } + + if "`anymiss'" != "" { + replace `g' = . if `nmiss' > 0 + } + else if "`allmiss'" != "" { + replace `g' = . if `nmiss' == `nvar' + } + } +end + diff --git a/01.code/ado/_/_gseconds.ado b/01.code/ado/_/_gseconds.ado new file mode 100755 index 0000000..5ad4745 --- /dev/null +++ b/01.code/ado/_/_gseconds.ado @@ -0,0 +1,118 @@ +*! 
1.0.1 NJC 20 February 2003 +* 1.0.0 NJC 24 January 2003 +program _gseconds + version 8 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + gettoken time 0 : 0, parse("() ") + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + confirm str var `time' + syntax [if] [in] [ , maxhour(int 24) ] + + quietly { + tempvar touse shour hour smin min ssec sec work + mark `touse' `if' `in' + markout `touse' `time', strok + + capture assert index(`time',":") if `touse' + if _rc { + di as err "missing colons in `time'" + exit 459 + } + else { + gen `shour' = /// + substr(`time',1,index(`time',":")-1) if `touse' + gen `hour' = real(`shour') + + count if mi(`hour') & `touse' + if r(N) { + di as err "problematic characters in `time'" + exit 459 + } + + gen `work' = /// + substr(`time',index(`time',":")+1,.) if `touse' + } + + capture assert index(`work',":") if `touse' + if _rc { + di as err "missing colons in `time'" + exit 459 + } + else { + gen `smin' = /// + substr(`work',1,index(`work',":")-1) if `touse' + gen `min' = real(`smin') + + count if mi(`min') & `touse' + if r(N) { + di as err "problematic characters in `time'" + exit 459 + } + + gen `ssec' = /// + substr(`work',index(`work',":")+1,.) 
if `touse' + } + + capture assert !index(`ssec',":") if `touse' + if _rc { + di as err "too many colons in `time'" + exit 459 + } + else { + gen `sec' = real(`ssec') + + count if mi(`sec') & `touse' + if r(N) { + di as err "problematic characters in `time'" + exit 459 + } + } + + capture assert `hour' >= 0 & `hour' < `maxhour' if `touse' + if _rc { + di as err "hour value(s) not 0 to `--maxhour'" + exit 459 + } + + capture assert `hour' == int(`hour') if `touse' + if _rc { + di as err "hours contain non-integer value(s)" + exit 459 + } + + capture assert `min' >= 0 & `min' < 60 if `touse' + if _rc { + di as err "minute value(s) not 0 to 59" + exit 459 + } + + capture assert `min' == int(`min') if `touse' + if _rc { + di as err "minutes contain non-integer value(s)" + exit 459 + } + + capture assert `sec' >= 0 & `sec' < 60 if `touse' + if _rc { + di as err "second value(s) not 0 to 59" + exit 459 + } + + capture assert `sec' == int(`sec') if `touse' + if _rc { + di as err "seconds contain non-integer value(s)" + exit 459 + } + + gen long `g' = `sec' + `min' * 60 + `hour' * 3600 if `touse' + } +end + diff --git a/01.code/ado/_/_gsemean.ado b/01.code/ado/_/_gsemean.ado new file mode 100755 index 0000000..f4cc7ba --- /dev/null +++ b/01.code/ado/_/_gsemean.ado @@ -0,0 +1,19 @@ +*! 1.1.0 NJC 24 May 2007 +*! 1.0.0 NJC 5 December 2000 +program define _gsemean + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + tempvar touse mean n + quietly { + gen byte `touse' = 1 `if' `in' + sort `touse' `by' + by `touse' `by': gen double `mean' = /* + */ sum(`exp')/sum((`exp') < .) if `touse' == 1 + by `touse' `by': gen long `n' = sum((`exp') < .) if `touse' == 1 + by `touse' `by': replace `n' = `n'[_N] + by `touse' `by': gen `typlist' `varlist' = /* + */ sqrt(sum(((`exp')-`mean'[_N])^2)/(sum((`exp') < .) - 1)) /* + */ if `touse'==1 & sum(`exp' < .) 
+ by `touse' `by': replace `varlist' = `varlist'[_N] / sqrt(`n') + } +end diff --git a/01.code/ado/_/_gsieve.ado b/01.code/ado/_/_gsieve.ado new file mode 100755 index 0000000..73718dd --- /dev/null +++ b/01.code/ado/_/_gsieve.ado @@ -0,0 +1,96 @@ +*! 1.0.0 NJC 23 Sept 2002 +program define _gsieve + version 7.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(string) [if] [in] /* + */ [, KEEP(str) CHAR(str asis) OMIT(str asis) ] + + local nopts = ("`keep'" != "") + (`"`char'"' != "") + (`"`omit'"' != "") + if `nopts' != 1 { + di as err "specify keep() or char() or omit()" + exit 198 + } + if `"`omit'"' != "" { + local char `"`omit'"' + local not "!" + } + + marksample touse, strok + local type "str1" /* ignores type passed from -egen- */ + qui gen `type' `g' = "" + local length : type `varlist' + local length = substr("`length'",4,.) + + if "`keep'" != "" { + local a 0 + local n 0 + local o 0 + local s 0 + + foreach w of local keep { + local l = length("`w'") + if substr("alphabetic",1,max(1,`l')) == "`w'" { + local a 1 + } + else if substr("numeric",1,max(1,`l')) == "`w'" { + local n 1 + } + else if substr("other",1,max(1,`l')) == "`w'" { + local o 1 + } + else if substr("spaces",1,max(1,`l')) == "`w'" { + local s 1 + } + else { + di as err "keep() invalid" + exit 198 + } + } + + tempvar c + + quietly { + gen str1 `c' = "" + + forval i = 1 / `length' { + replace `c' = substr(`varlist',`i',1) + + if `a' { + replace `g' = `g' + `c' /* + */ if ((`c' >= "A" & `c' <= "Z") /* + */ | (`c' >= "a" & `c' <= "z")) + } + if `n' { + replace `g' = `g' + `c' /* + */ if (`c' >= "0" & `c' <= "9") + } + if `s' { + replace `g' = `g' + `c' if `c' == " " + } + if `o' { + replace `g' = `g' + `c' /* + */ if !( (`c' >= "A" & `c' <= "Z") /* + */ | (`c' >= "a" & `c' <= "z") /* + */ | (`c' >= "0" & `c' <= "9") /* + */ | (`c' == " ") ) + } + } + } + } + else { /* char() or omit() */ + forval i = 1 / `length' { + qui replace `g' = `g' + 
substr(`varlist',`i',1) /* + */ if `not'index(`"`char'"', substr(`varlist',`i',1)) + } + } + + qui { + replace `g' = "" if !`touse' + compress `g' + } +end + diff --git a/01.code/ado/_/_gston.ado b/01.code/ado/_/_gston.ado new file mode 100755 index 0000000..03a1da0 --- /dev/null +++ b/01.code/ado/_/_gston.ado @@ -0,0 +1,30 @@ +*! 1.0.0 NJC 21 January 2001 +program define _gston + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(string min=1) [if] [in], From(str asis) To(numlist) + marksample touse, strok + + local nfrom : word count `from' + local nto : word count `to' + if `nfrom' != `nto' { + di in r "from( ) and to( ) do not match one to one" + exit 198 + } + + quietly { + gen `type' `g' = . + tokenize `"`from'"' + local i = 1 + while `i' <= `nfrom' { + local toval : word `i' of `to' + replace `g' = `toval' /* + */ if `varlist' == `"``i''"' & `touse' + local i = `i' + 1 + } + } +end diff --git a/01.code/ado/_/_gsumoth.ado b/01.code/ado/_/_gsumoth.ado new file mode 100755 index 0000000..04bfb86 --- /dev/null +++ b/01.code/ado/_/_gsumoth.ado @@ -0,0 +1,14 @@ +*! 1.0.0 NJC 16 Oct 2001 +program define _gsumoth + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + tempvar touse + quietly { + gen byte `touse'=1 `if' `in' + sort `touse' `by' + by `touse' `by': gen `typlist' `varlist' = sum(`exp') /* + */ if `touse'==1 + by `touse' `by': replace `varlist' = `varlist'[_N] + replace `varlist' = `varlist' - `exp' + } +end diff --git a/01.code/ado/_/_gtod.ado b/01.code/ado/_/_gtod.ado new file mode 100755 index 0000000..1795e09 --- /dev/null +++ b/01.code/ado/_/_gtod.ado @@ -0,0 +1,51 @@ +*! 
1.0.0 CFB 29 Sep 2002 +* 1.1.0 NJC 7 December 2000 _gbom +* 1.0.0 NJC 12 July 2000 +program define _gtod + version 6 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 /* "=" */ + gettoken paren 0 : 0, parse("() ") /* "(" */ + + gettoken time 0 : 0, parse("() ") + + gettoken paren 0 : 0, parse("(), ") /* ")" */ + if `"`paren'"' != ")" { + error 198 + } + + syntax [if] [in] [ , Format(str) ] + + quietly { + tempvar touse hh mm ss + mark `touse' `if' `in' + + capture assert `time' >= 0 & `time' < 86400 if `touse' + if _rc { + di in r "`time' contains non time-of-day value(s)" + exit 198 + } + + capture assert `time' == int(`time') if `touse' + if _rc { + di in r "`time' contains non-integer value(s)" + exit 410 + } + + gen int `hh' = int(`time'/3600) if `touse' + gen int `mm' = int((`time' - `hh'*3600)/60) if `touse' + gen int `ss' = int(`time' - `hh'*3600 - `mm'*60) if `touse' + gen str8 `g' = string(`hh',"%02.0f") + ":" + string(`mm',"%02.0f") + /* + */ ":" + string(`ss',"%02.0f") if `touse' + + if "`format'" != "" { + capture format `g' `format' + if _rc { + noi di in bl "`format' invalid format" + } + } + } +end + diff --git a/01.code/ado/_/_gtools_internal.ado b/01.code/ado/_/_gtools_internal.ado new file mode 100755 index 0000000..c6b37c6 --- /dev/null +++ b/01.code/ado/_/_gtools_internal.ado @@ -0,0 +1,8055 @@ +*! version 1.10.1 05Dec2022 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
gtools function internals + +* rc 17000 +* rc 17001 - no observations +* rc 17002 - strL variables and version < 14 +* rc 17003 - strL variables and version >= 14 +* rc 17004 - strL variables could not be compressed +* rc 17005 - strL contains binary data +* rc 17006 - strL variables unknown error +* rc 17800 - More than 2^31-1 obs +* rc 17801 - gtools has not been compiled for a X-bit architecture +* rc 18101 - greshape long id variables not unique +* rc 18102 - greshape wide j variables not unique within id +* rc 18103 - greshape wide xi variables not unique within id +* rc 18201 - gstats all variables are non-numeric (soft exit) +* rc 18301 - gstats transform; unexpected number of stats passed to transform +* rc 18401 - gregress k > N (too many vars/absorb levels) +* rc 18402 - hdfe maximum number of iterations +* -------- +* rc 17459 - isid special error +* rc 17900 - multi-threading not available +* rc 17901 - generic not implemented +* rc 17902 - gtools API OOM +* rc 17999 - collision error +* -------- +* > 0 to < 100 strict gives quantiles +* -1 to -100 stats are regular stats +* -101 to -200 stats are analogues with special nan handling +* -201 to -300 stats are analogues with special gstats handling +* 1000 + # selects the #th smallest +* - 1000 - # selects the #th largest +* 1000.5 + # rawselects the #th smallest +* - 1000.5 - # rawselects the #th largest + +capture program drop _gtools_internal +program _gtools_internal, rclass + version 13.1 + + if ( `"`0'"' == "_check" ) { + cap noi plugin call gtools_plugin, check + exit _rc + } + + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile gregfile + tempfile gregbfile + tempfile gregsefile + tempfile gregvcovfile + tempfile gregclusfile + tempfile gregabsfile + tempfile ghdfeabsfile + tempfile gstatsfile + tempfile gbyvarfile + tempfile gbycolfile + tempfile gbynumfile + tempfile gtopnumfile + tempfile gtopmatfile + } + else { + GtoolsTempFile gregfile + GtoolsTempFile gregbfile + GtoolsTempFile gregsefile + 
GtoolsTempFile gregvcovfile + GtoolsTempFile gregclusfile + GtoolsTempFile gregabsfile + GtoolsTempFile ghdfeabsfile + GtoolsTempFile gstatsfile + GtoolsTempFile gbyvarfile + GtoolsTempFile gbycolfile + GtoolsTempFile gbynumfile + GtoolsTempFile gtopnumfile + GtoolsTempFile gtopmatfile + } + + global GTOOLS_GREG_FILE: copy local gregfile + global GTOOLS_GREGB_FILE: copy local gregbfile + global GTOOLS_GREGSE_FILE: copy local gregsefile + global GTOOLS_GREGVCOV_FILE: copy local gregvcovfile + global GTOOLS_GREGCLUS_FILE: copy local gregclusfile + global GTOOLS_GREGABS_FILE: copy local gregabsfile + global GTOOLS_GHDFEABS_FILE: copy local ghdfeabsfile + global GTOOLS_GSTATS_FILE: copy local gstatsfile + global GTOOLS_BYVAR_FILE: copy local gbyvarfile + global GTOOLS_BYCOL_FILE: copy local gbycolfile + global GTOOLS_BYNUM_FILE: copy local gbynumfile + global GTOOLS_GTOPNUM_FILE: copy local gtopnumfile + global GTOOLS_GTOPMAT_FILE: copy local gtopmatfile + + global GTOOLS_USER_INTERNAL_VARABBREV `c(varabbrev)' + * set varabbrev off + + if ( inlist("${GTOOLS_FORCE_PARALLEL}", "17900") ) { + di as txt "(note: multi-threading is not available on this platform)" + } + + if ( `c(bit)' != 64 ) { + di as err "(warning: gtools has not been tested on a `c(bit)'-bit architecture)" + * di as err "gtools has not been compiled on a `c(bit)'-bit architecture" + * exit 17801 + } + + local GTOOLS_CALLER $GTOOLS_CALLER + local GTOOLS_CALLERS gegen /// + gcollapse /// + gisid /// 2 + hashsort /// 3 + glevelsof /// + gunique /// + gtoplevelsof /// + gcontract /// 8 + gquantiles /// + gstats /// + greshape /// 11 + gregress /// + ghash + + if ( !(`:list GTOOLS_CALLER in GTOOLS_CALLERS') | ("$GTOOLS_CALLER" == "") ) { + di as err "_gtools_internal is not meant to be called directly." 
/// + " See {help gtools}" + clean_all 198 + exit 198 + } + + if ( `=_N < 1' ) { + di as err "no observations" + clean_all 17001 + exit 17001 + } + + if ( `=_N > 2^31-1' ) { + local nmax = trim("`: disp %21.0gc 2^31-1'") + di as err `"too many observations"' + di as err `""' + di as err `"A Stata bug prevents gtools from working with more than `nmax' observations."' + di as err `"See {browse "https://www.statalist.org/forums/forum/general-stata-discussion/general/1457637"}"' + di as err `"and {browse "https://github.com/mcaceresb/stata-gtools/issues/43"}"' + clean_all 17800 + exit 17800 + } + + local 00: copy local 0 + + * Time the entire function execution + FreeTimer + local t99: copy local FreeTimer + global GTOOLS_T99: copy local t99 + gtools_timer on `t99' + + FreeTimer + local t98: copy local FreeTimer + global GTOOLS_T98: copy local t98 + gtools_timer on `t98' + + *********************************************************************** + * Syntax parsing * + *********************************************************************** + + syntax [anything] [if] [in] , /// + [ /// + DEBUG_level(int 0) /// debugging + Verbose /// info + _subtract /// (Undocumented) Subtract result from source variabes + _keepgreshape /// (Undocumented) Keep greshape scalars + _CTOLerance(real 0) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// print function benchmark info + BENCHmarklevel(int 0) /// print plugin benchmark info + HASHmethod(str) /// hashing method + oncollision(str) /// On collision, fall back or throw error + gfunction(str) /// Program to handle collision + replace /// Replace variables, if they exist + noinit /// Do not initialize targets with missing values + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + /// + /// General options + /// --------------- + /// + /// keeptouse(str) /// generate sample indicator + seecount /// print group info to console + COUNTonly /// 
report group info and exit + MISSing /// Include missing values + KEEPMISSing /// Summary stats are . if all inputs are . + unsorted /// Do not sort hash values; faster + countmiss /// count # missing in output + /// (only w/certain targets) + NODS DS /// Parse - as varlist (ds) or negative (nods) + /// + /// Generic stats options + /// --------------------- + /// + sources(str) /// varlist must exist + targets(str) /// varlist must exist + stats(str) /// stats, 1 per target. w/multiple targets, + /// # targets must = # sources + freq(str) /// also collapse frequencies to variable + rawstat(str) /// Ignore weights for these targets + /// + /// Capture options + /// --------------- + /// + greshape(str) /// options for greshape (to parse later) + gregress(str) /// options for gregress (to parse later) + gstats(str) /// options for gstats (to parse later) + gquantiles(str) /// options for gquantiles (to parse later) + gcontract(str) /// options for gcontract (to parse later) + gcollapse(str) /// options for gcollapse (to parse later) + gtop(str) /// options for gtop (to parse later) + recast(str) /// bulk recast + sumcheck(str) /// absolute sum + weights(str) /// weight_type weight_var + /// + /// gegen group options + /// ------------------- + /// + tag(str) /// 1 for first obs of group in range, 0 otherwise + GENerate(str) /// variable where to store encoded index + counts(str) /// variable where to store group counts + fill(str) /// for counts(); group fill order or value + /// + /// gisid options + /// ------------- + /// + EXITMissing /// Throw error if any missing values (by row). 
+ /// + /// hashsort options + /// ---------------- + /// + invertinmata /// invert sort index using mata + sortindex(str) /// keep sort index in memory + sortgen /// sort by generated variable (hashsort only) + skipcheck /// skip is sorted check + mlast /// sort missing values last, as a group + /// + /// glevelsof options + /// ----------------- + /// + glevelsof(str) /// extra options for glevelsof (parse later) + Separate(str) /// Levels separator + COLSeparate(str) /// Columns separator + Clean /// Clean strings + numfmt(str) /// Number format (how to print the numbers) + ] + + * Startup! + * -------- + + * if ( ("`replace'" != "") & ("${GTOOLS_USER_INTERNAL_VARABBREV}" == "on") ) { + * disp as err "Option {opt replace} not allowed with varabbrev on." + * disp as err "Run {stata set varabbrev off} to use this feature." + * exit 198 + * } + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local gen `generate' + mata st_local("ifin", strtrim(st_local("if") + " " + st_local("in"))) + + local hashmethod `hashmethod' + if ( `"`hashmethod'"' == "" ) local hashmethod 0 + + local hashmethod_list 0 1 2 default biject spooky + if ( !`:list hashmethod in hashmethod_list' ) { + di as err `"hash method '`hashmethod'' not known;"' /// + " specify 0 (default), 1 (biject), or 2 (spooky)" + clean_all 198 + exit 198 + } + + if ( `"`hashmethod'"' == "default" ) local hashmethod 0 + if ( `"`hashmethod'"' == "biject" ) local hashmethod 1 + if ( `"`hashmethod'"' == "spooky" ) local hashmethod 2 + + *********************************************************************** + * debug!
* + *********************************************************************** + + if ( `debug_level' ) { + local gopts1 tag(`tag') + local gopts1 `gopts1' generate(`generate') + local gopts1 `gopts1' counts(`counts') + local gopts1 `gopts1' fill(`fill') + + local gopts2 `exitmissing' + + local gopts3 `invertinmata' + local gopts3 `gopts3' sortindex(`sortindex') + local gopts3 `gopts3' `sortgen' + local gopts3 `gopts3' `skipcheck' + local gopts3 `gopts3' `mlast' + + local gopts4 glevelsof(`glevelsof') + local gopts4 `gopts4' separate(`separate') + local gopts4 `gopts4' colseparate(`colseparate') + local gopts4 `gopts4' clean + local gopts4 `gopts4' numfmt(`numfmt') + + disp as txt `""' + disp as txt "{cmd:_gtools_internal} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" anything: `anything'"' + disp as txt `" [if] [in]: `if' `in'"' + disp as txt `" weights: `weights'"' + disp as txt `" gfunction: `gfunction'"' + disp as txt `" GTOOLS_CALLER: $GTOOLS_CALLER"' + disp as txt `""' + disp as txt `" compress: `compress'"' + disp as txt `" forcestrl: `forcestrl'"' + disp as txt `" verbose: `verbose'"' + disp as txt `" benchmark: `benchmark'"' + disp as txt `" hashmethod: `hashmethod'"' + disp as txt `" oncollision: `oncollision'"' + disp as txt `" replace: `replace'"' + disp as txt `""' + disp as txt `" seecount: `seecount'"' + disp as txt `" countonly: `countonly'"' + disp as txt `" missing: `missing'"' + disp as txt `" keepmissing: `keepmissing'"' + disp as txt `" unsorted: `unsorted'"' + disp as txt `" countmiss: `countmiss'"' + disp as txt `""' + disp as txt `" sources: `sources'"' + disp as txt `" targets: `targets'"' + disp as txt `" stats: `stats'"' + disp as txt `" freq: `freq'"' + disp as txt `" rawstat: `rawstat'"' + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" gegen: `gopts1'"' + disp as txt `" gisid: `gopts2'"' + disp as txt `" hashsort: `gopts3'"' + disp as txt `" glevelsof: 
`gopts4'"' + disp as txt `" gquantiles: `gquantiles'"' + disp as txt `" gcontract: `gcontract'"' + disp as txt `" gstats: `gstats'"' + disp as txt `" gregress: `gregress'"' + disp as txt `" greshape: `greshape'"' + disp as txt `" gcollapse: `gcollapse'"' + disp as txt `" gtop: `gtop'"' + disp as txt `" recast: `recast'"' + disp as txt `" sumcheck: `sumcheck'"' + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + } + + *********************************************************************** + * Sum of absolute values * + *********************************************************************** + + if ( "`sumcheck'" != "" ) { + gettoken wtype wvar: weights + local wtype `wtype' + local wvar `wvar' + local 0 , checkvars(`sumcheck') + syntax, checkvars(varlist) + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/sumcheck} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" checkvars: `checkvars'"' + disp as txt `" __gtools_sum_k: `:list sizeof checkvars'"' + } + + scalar __gtools_sum_k = `:list sizeof checkvars' + scalar __gtools_sum_w = "`wvar'" != "" + matrix __gtools_sumcheck = J(1, `:list sizeof checkvars', .) 
+ if ( inlist(`"`wtype'"', "fweight", "") ) { + cap noi plugin call gtools_plugin `checkvars' `wvar', sumcheck + local rc = _rc + } + else local rc = 0 + return matrix sumcheck = __gtools_sumcheck + cap scalar drop __gtools_sum_k + cap matrix drop __gtools_sumcheck + clean_all `rc' + exit `rc' + } + + *********************************************************************** + * Bulk recast * + *********************************************************************** + + if ( "`recast'" != "" ) { + local 0 , `recast' + syntax, sources(varlist) targets(varlist) + + if ( `:list sizeof sources' != `:list sizeof targets' ) { + di as err "Must specify the same number of sources and targets" + clean_all 198 + exit 198 + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/recast} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" sources: `sources'"' + disp as txt `" targets: `targets'"' + disp as txt `" __gtools_k_recast: `:list sizeof sources'"' + } + + scalar __gtools_k_recast = `:list sizeof sources' + cap noi plugin call gtools_plugin `targets' `sources', recast + local rc = _rc + cap scalar drop __gtools_k_recast + clean_all `rc' + exit `rc' + } + + *********************************************************************** + * Execute the function normally * + *********************************************************************** + + * What to do + * ---------- + + local gfunction_list hash /// + egen /// + levelsof /// + isid /// + sort /// + unique /// + collapse /// + top /// + contract /// + stats /// + regress /// + reshape /// + quantiles + + if ( "`gfunction'" == "" ) local gfunction hash + if ( !(`:list gfunction in gfunction_list') ) { + di as err "{opt gfunction()} was '`gfunction''; expected one of:" /// + " `gfunction_list'" + clean_all 198 + exit 198 + } + + * Switches, options + * ----------------- + + local website_url https://github.com/mcaceresb/stata-gtools/issues + local website_disp
github.com/mcaceresb/stata-gtools + + if ( "`oncollision'" == "" ) local oncollision fallback + if ( !inlist("`oncollision'", "fallback", "error") ) { + di as err "option {opt oncollision()} must be 'fallback' or 'error'" + clean_all 198 + exit 198 + } + + * Check options compatibility + * --------------------------- + + * Unsorted is passed automagically for isid and unique, where we + * don't care about sort order. + + if ( inlist("`gfunction'", "isid", "unique") ) { + if ( "`unsorted'" == "" ) { + di as txt "({opt gfunction(`gfunction')} sets option" /// + " {opt unsorted} automatically)" + local unsorted unsorted + } + } + + * isid exits with error if any variables have a missing value; the + * function needs to know whether to obey this rule or skip it (i.e. + * -missok- option in the caller) + + if ( inlist("`gfunction'", "isid") ) { + if ( "`exitmissing'`missing'" == "" ) { + di as err "{opt gfunction(`gfunction')} must set either" /// + " {opt exitmissing} or {opt missing}" + clean_all 198 + exit 198 + } + } + + * If the caller is sort, then + * - It must be applied to the entire data set (no partial sorts) + * - It does not exit if any observations are missing + * - It also sorts rows with any missing observations + * - The output cannot be unsorted! + + if ( inlist("`gfunction'", "sort") ) { + if ( `"`if'"' != "" ) { + di as err "Cannot sort data with if condition" + clean_all 198 + exit 198 + } + if ( "`exitmissing'" != "" ) { + di as err "Cannot specify {opt exitmissing} with" /// + " {opt gfunction(sort)}" + clean_all 198 + exit 198 + } + if ( "`missing'" == "" ) { + di as txt "({opt gfunction(`gfunction')} sets option" /// + " {opt missing} automatically)" + local missing missing + } + if ( "`unsorted'" != "" ) { + di as err "Cannot specify {opt unsorted} with {opt gfunction(sort)}" + clean_all 198 + exit 198 + } + } + + * You cannot both exit if any observation is missing and not exit + * if any observation is missing. 
For several group functions, stata + * ignores a row if the by variable has a missing observation. This + * controls whether to exclude the row/throw an error or whether to + * include it as a new group. + + if ( ("`exitmissing'" != "") & ("`missing'" != "") ) { + di as err "Cannot specify {opt exitmissing} with option {opt missing}" + clean_all 198 + exit 198 + } + + * If the caller is sort, you can request a sort index. + if ( "`sortindex'" != "" ) { + if ( !inlist("`gfunction'", "sort") ) { + di as err "sort index only allowed with {opt gfunction(sort)}" + clean_all 198 + exit 198 + } + } + + * Counts, gen, and tag are generic options that were specially + * coded to work with egen count, group, and tag, espectively. Hence + * they are handled sepparately. However, we only allow them to be + * requested with egen, unique, sort, levelsof, or quantiles as the + * caller. + + if ( "`counts'`gen'`tag'" != "" ) { + if ( "`countonly'" != "" ) { + di as err "cannot generate targets with option {opt countonly}" + clean_all 198 + exit 198 + } + + local gen_list hash egen unique sort levelsof quantiles + if ( !`:list gfunction in gen_list' ) { + di as err "cannot generate targets with" /// + " {opt gfunction(`gfunction')}" + clean_all 198 + exit 198 + } + + if ( ("`gen'" == "") & !inlist("`gfunction'", "sort", "levelsof") ) { + if ( "`unsorted'" == "" ) { + di as txt "({opt tag} and {opt counts} without {opt gen}" /// + " sets option {opt unsorted} automatically)" + local unsorted unsorted + } + } + } + + * Sources, targets, and stats are coded as generic options but they + * are basically only allowed with egen and collapse as callers. The + * generic "hash" caller will also accept it but it will not run any + * of the optimization checks that gegen and gcollapse do (specially + * gcollapse). 
+ + if ( "`sources'`targets'`stats'" != "" ) { + if ( !inlist("`gfunction'", "hash", "egen", "collapse", "unique") ) { + di as err "cannot generate targets with {opt gfunction(`gfunction')}" + clean_all 198 + exit 198 + } + } + + * -fill()- is an option that was included at Sergio Correia's + * request. It allows the user to specify how certain output is to + * be filled (group: merge back to the data; missing: only the first + * observation of each group; adata: sequentially without merging + * back to the data). I believe he uses this internally in reghdfe. + + if ( "`fill'" != "" ) { + if ( "`counts'`targets'" == "" ) { + di as err "{opt fill()} only allowed with {opth counts(newvarname)}" + clean_all 198 + exit 198 + } + } + + * The levelsof caller's options were implemented before I got the + * idea of capturing each caller's options. Hence they are parsed + * here! Yay for legacy support. + * - separate is the character that delimits each group + * - colseparate is the char that delimits each column within a group + * - clean is whether the strings should be left unquoted + * - numfmt is how to print the numbers + + if ( "`separate'`colseparate'`clean'`numfmt'" != "" ) { + local errmsg "" + if ( "`separate'" != "" ) local errmsg "`errmsg' separate()," + if ( "`colseparate'" != "" ) local errmsg "`errmsg' colseparate(), " + if ( "`clean'" != "" ) local errmsg "`errmsg' -clean-, " + if ( "`numfmt'" != "" ) local errmsg "`errmsg' -numfmt()-, " + if ( !inlist("`gfunction'", "levelsof", "top") ) { + di as err "`errmsg' only allowed with {opt gfunction(levelsof)}" + clean_all 198 + exit 198 + } + } + + * Parse weights + * ------------- + + * Some functions allow weights, which are parsed here. 
+ + gettoken wtype wvar: weights + + if ( `"`wtype'"' == "" ) { + local wcode 0 + } + else { + if ( `"`wvar'"' == "" ) { + di as err "Passed option {opt weights(`wtype')} without a weighting variable" + clean_all 198 + exit 198 + } + + if ( `"`wtype'"' == "aweight" ) local wcode 1 + else if ( `"`wtype'"' == "fweight" ) local wcode 2 + else if ( `"`wtype'"' == "iweight" ) local wcode 3 + else if ( `"`wtype'"' == "pweight" ) local wcode 4 + else { + di as err "unknown weight type {opt `wtype'}" + clean_all 198 + exit 198 + } + } + + * Interestingly, stata allows for rawsum, but someone gave me the + * idea of implementing a generic -rawstat()- option, so weights are + * selectively applied to each individual target, if the user so + * chooses to specify it. + + local wstats: copy local stats + local wselective 0 + local skipstats percent + + if ( "`rawstat'" != "" ) { + cap matrix drop wselmat + foreach var in `targets' { + gettoken wstat wstats: wstats + local inraw: list posof `"`var'"' in rawstat + local statskip: list posof `"`wstat'"' in skipstats + if ( (`inraw' > 0) & (`statskip' == 0) ) { + local ++wselective + matrix wselmat = nullmat(wselmat), 1 + } + else if ( (`inraw' > 0) & (`statskip' > 0) ) { + disp as err "{opt rawstat} cannot be requested for {opt percent}" + exit 198 + } + else { + matrix wselmat = nullmat(wselmat), 0 + } + } + + if ( `wselective' == 0 ) { + disp as err "{bf:Warning:} {opt rawstat} requested but none of the variables are targets" + } + else { + if ( `"`wtype'"' != "" ) { + disp "{bf:Warning:} 0 or missing weights are dropped for {bf:all} variables." 
+ } + } + } + else { + matrix wselmat = J(1, 1, 0) + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/weights} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" wtype: `wtype'"' + disp as txt `" wcode: `wcode'"' + disp as txt `" wstats: `wstats'"' + disp as txt `" wselective: `wselective'"' + disp as txt `" skipstats: `skipstats'"' + disp as txt `" rawstat: `rawstat'"' + matrix list wselmat + } + + * Parse options into scalars, etc. for C + * -------------------------------------- + + * C is great! It's fast, it's...well, it's fast. The compiler is + * cool too, but it's not the friendliest language to write stuff in. + * And Stata's C API is limited. It's awesome and amazing that it + * even exists, to be honest, but the functionality is wanting. + * + * Anyway, the easiest way to pass info to and from C is to use + * scalars and matrices. Moreover, it's easier to define EVERY + * variable that we could possibly set and read it from C every + * time vs going through the hassle of writing 16 pairs of if-else + * statements. + * + * Here I initialize all the relevant scalars and such to empty or + * dummy values as applicable. 
+ + local any_if = ( `"`if'"' != "" ) + local verbose = ( "`verbose'" != "" ) + local benchmark = ( "`benchmark'" != "" ) + + mata: st_numscalar("__gtools_gfile_byvar", strlen(st_local("gbyvarfile")) + 1) + mata: st_numscalar("__gtools_gfile_bycol", strlen(st_local("gbycolfile")) + 1) + mata: st_numscalar("__gtools_gfile_bynum", strlen(st_local("gbynumfile")) + 1) + + mata: st_numscalar("__gtools_gfile_topnum", strlen(st_local("gtopnumfile")) + 1) + mata: st_numscalar("__gtools_gfile_topmat", strlen(st_local("gtopmatfile")) + 1) + + mata: st_numscalar("__gtools_gfile_gregb", strlen(st_local("gregbfile")) + 1) + mata: st_numscalar("__gtools_gfile_gregse", strlen(st_local("gregsefile")) + 1) + mata: st_numscalar("__gtools_gfile_gregvcov", strlen(st_local("gregvcovfile")) + 1) + mata: st_numscalar("__gtools_gfile_gregclus", strlen(st_local("gregclusfile")) + 1) + mata: st_numscalar("__gtools_gfile_gregabs", strlen(st_local("gregabsfile")) + 1) + mata: st_numscalar("__gtools_gfile_ghdfeabs", strlen(st_local("ghdfeabsfile")) + 1) + + scalar __gtools_init_targ = 0 + scalar __gtools_any_if = `any_if' + scalar __gtools_verbose = `verbose' + scalar __gtools_debug = `debug_level' + scalar __gtools_benchmark = cond(`benchmarklevel' > 0, `benchmarklevel', 0) + scalar __gtools_keepmiss = ( "`keepmissing'" != "" ) + scalar __gtools_missing = ( "`missing'" != "" ) + scalar __gtools_unsorted = ( "`unsorted'" != "" ) + scalar __gtools_countonly = ( "`countonly'" != "" ) + scalar __gtools_seecount = ( "`seecount'" != "" ) + scalar __gtools_nomiss = ( "`exitmissing'" != "" ) + scalar __gtools_replace = ( "`replace'" != "" ) + scalar __gtools_countmiss = ( "`countmiss'" != "" ) + scalar __gtools_invertix = ( "`invertinmata'" == "" ) + scalar __gtools_skipcheck = ( "`skipcheck'" != "" ) + scalar __gtools_mlast = ( "`mlast'" != "" ) + scalar __gtools_subtract = ( "`_subtract'" != "" ) + scalar __gtools_ctolerance = `_ctolerance' + scalar __gtools_hash_method = `hashmethod' + scalar
__gtools_weight_code = `wcode' + scalar __gtools_weight_pos = 0 + scalar __gtools_weight_sel = `wselective' + scalar __gtools_nunique = ( `:list posof "nunique" in stats' > 0 ) + + scalar __gtools_top_nrows = 0 + scalar __gtools_top_ntop = 0 + scalar __gtools_top_pct = 0 + scalar __gtools_top_freq = 0 + scalar __gtools_top_mataname = "" + scalar __gtools_top_matasave = 0 + scalar __gtools_top_silent = 0 + scalar __gtools_top_vlab = 1 + scalar __gtools_top_invert = 0 + scalar __gtools_top_alpha = 0 + scalar __gtools_top_miss = 0 + scalar __gtools_top_groupmiss = 0 + scalar __gtools_top_other = 0 + scalar __gtools_top_lmiss = 0 + scalar __gtools_top_lother = 0 + scalar __gtools_top_Jmiss = 0 + scalar __gtools_top_Jother = 0 + matrix __gtools_contract_which = J(1, 4, 0) + matrix __gtools_invert = 0 + matrix __gtools_weight_smat = wselmat + cap matrix drop wselmat + + scalar __gtools_levels_mataname = `""' + scalar __gtools_levels_matasave = 0 + scalar __gtools_levels_silent = 0 + scalar __gtools_levels_return = 1 + scalar __gtools_levels_gen = 0 + scalar __gtools_levels_replace = 0 + + scalar __gtools_xtile_xvars = 0 + scalar __gtools_xtile_nq = 0 + scalar __gtools_xtile_nq2 = 0 + scalar __gtools_xtile_cutvars = 0 + scalar __gtools_xtile_ncuts = 0 + scalar __gtools_xtile_qvars = 0 + scalar __gtools_xtile_gen = 0 + scalar __gtools_xtile_pctile = 0 + scalar __gtools_xtile_genpct = 0 + scalar __gtools_xtile_pctpct = 0 + scalar __gtools_xtile_altdef = 0 + scalar __gtools_xtile_missing = 0 + scalar __gtools_xtile_strict = 0 + scalar __gtools_xtile_min = 0 + scalar __gtools_xtile_max = 0 + scalar __gtools_xtile_method = 0 + scalar __gtools_xtile_bincount = 0 + scalar __gtools_xtile__pctile = 0 + scalar __gtools_xtile_dedup = 0 + scalar __gtools_xtile_cutifin = 0 + scalar __gtools_xtile_cutby = 0 + scalar __gtools_xtile_imprecise = 0 + matrix __gtools_xtile_quantiles = J(1, 1, .) + matrix __gtools_xtile_cutoffs = J(1, 1, .) + matrix __gtools_xtile_quantbin = J(1, 1, .) 
+ matrix __gtools_xtile_cutbin = J(1, 1, .) + + gregress_scalars init + gstats_scalars init + greshape_scalars init + + * Parse glevelsof options + * ----------------------- + + * Again, glevelsof is parsed in the open since I defined the options + * before moving to capturing each caller's options. + + if ( `"`separate'"' == "" ) local sep `" "' + else local sep: copy local separate + + if ( `"`colseparate'"' == "" ) local colsep `" | "' + else local colsep: copy local colseparate + + local numfmt_empty = 0 + if ( `"`numfmt'"' == "" ) { + local numfmt_empty = 1 + local numfmt `"%.16g"' + } + + if regexm(`"`numfmt'"', "%([0-9]+)\.([0-9]+)([gf])") { + local numlen = max(`:di regexs(1)', `:di regexs(2)' + 5) + cond(regexs(3) == "f", 23, 0) + } + else if regexm(`"`numfmt'"', "%\.([0-9]+)([gf])") { + local numlen = `:di regexs(1)' + 5 + cond(regexs(2) == "f", 23, 0) + } + else { + di as err "Number format must be %(width).(digits)(f|g);" /// + " e.g. %.16g (default), %20.5f" + clean_all 198 + exit 198 + } + + scalar __gtools_numfmt_max = `numlen' + scalar __gtools_numfmt_len = length(`"`numfmt'"') + scalar __gtools_cleanstr = ( "`clean'" != "" ) + scalar __gtools_sep_len = length(`"`sep'"') + scalar __gtools_colsep_len = length(`"`colsep'"') + + * Parse target names and group fill + * --------------------------------- + + * tag, gen, and counts are set up as generic options. Here we figure + * out whether to generate each of them as empty variables or whether + * to over-write existing variables (if -replace- was specified by + * the user). 
+ + * confirm new variable `gen_name' + * local 0 `gen_name' + * syntax newvarname + + if ( "`tag'" != "" ) { + gettoken tag_type tag_name: tag + local tag_name `tag_name' + local tag_type `tag_type' + if ( "`tag_name'" == "" ) { + local tag_name `tag_type' + local tag_type byte + } + cap noi confirm_var `tag_name', `replace' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + local new_tag = `r(newvar)' + } + + if ( "`gen'" != "" ) { + gettoken gen_type gen_name: gen + local gen_name `gen_name' + local gen_type `gen_type' + if ( "`gen_name'" == "" ) { + local gen_name `gen_type' + if ( `=_N < maxlong()' ) { + local gen_type long + } + else { + local gen_type double + } + } + cap noi confirm_var `gen_name', `replace' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + local new_gen = `r(newvar)' + } + + * counts is a bit convoluted because it must obey the fill() option. + * Depending on the set up, we specify whether counts will be filled + * sequentially 1 / number of groups, whether they will be merged + * back to the data, or whether only the first entry within a group + * will be filled. + + scalar __gtools_group_data = 0 + scalar __gtools_group_fill = 0 + scalar __gtools_group_val = . + if ( "`counts'" != "" ) { + { + gettoken counts_type counts_name: counts + local counts_name `counts_name' + local counts_type `counts_type' + if ( "`counts_name'" == "" ) { + local counts_name `counts_type' + if ( `=_N < maxlong()' ) { + local counts_type long + } + else { + local counts_type double + } + } + cap noi confirm_var `counts_name', `replace' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + local new_counts = `r(newvar)' + } + if ( "`fill'" != "" ) { + if ( "`fill'" == "group" ) { + scalar __gtools_group_fill = 0 + scalar __gtools_group_val = . + } + else if ( "`fill'" == "data" ) { + scalar __gtools_group_data = 1 + scalar __gtools_group_fill = 0 + scalar __gtools_group_val = .
+ } + else { + cap confirm number `fill' + cap local fill_value = `fill' + if ( _rc ) { + di as error "'`fill'' found where number expected" + clean_all 7 + exit 7 + } + * local 0 , fill(`fill') + * syntax , [fill(real 0)] + scalar __gtools_group_fill = 1 + scalar __gtools_group_val = `fill' + } + } + } + else if ( "`targets'" != "" ) { + if ( "`fill'" != "" ) { + if ( "`fill'" == "missing" ) { + scalar __gtools_group_fill = 1 + scalar __gtools_group_val = . + } + else if ( "`fill'" == "data" ) { + scalar __gtools_group_data = 1 + scalar __gtools_group_fill = 0 + scalar __gtools_group_val = . + } + } + } + else if ( "`fill'" != "" ) { + di as err "{opt fill} only allowed with option {opt count()} or {opt targets()}" + clean_all 198 + exit 198 + } + + * Generate new variables + * ---------------------- + + * Here is where we actually generate the variables. If the target + * already exists we skip it; otherwise we add an empty variable. + + local kvars_group = 0 + scalar __gtools_encode = 1 + mata: __gtools_group_targets = J(1, 3, 0) + mata: __gtools_group_init = J(1, 3, 0) + mata: __gtools_togen_k = 0 + + if ( "`counts'`gen'`tag'" != "" ) { + local topos 1 + local etargets `gen_name' `counts_name' `tag_name' + mata: __gtools_togen_types = J(1, `:list sizeof etargets', "") + mata: __gtools_togen_names = J(1, `:list sizeof etargets', "") + + * 111 = 8 + * 101 = 6 + * 011 = 7 + * 001 = 5 + * 110 = 4 + * 010 = 3 + * 100 = 2 + * 000 = 1 + + if ( "`gen'" != "" ) { + local ++kvars_group + scalar __gtools_encode = __gtools_encode + 1 + if ( `new_gen' ) { + mata: __gtools_togen_types[`topos'] = "`gen_type'" + mata: __gtools_togen_names[`topos'] = "`gen_name'" + local ++topos + } + else { + mata: __gtools_group_init[1] = 1 + } + mata: __gtools_group_targets = J(1, 3, 1) + } + + if ( "`counts'" != "" ) { + local ++kvars_group + scalar __gtools_encode = __gtools_encode + 2 + if ( `new_counts' ) { + mata: __gtools_togen_types[`topos'] = "`counts_type'" + mata: 
__gtools_togen_names[`topos'] = "`counts_name'" + local ++topos + } + else { + mata: __gtools_group_init[2] = 1 + } + mata: __gtools_group_targets[2] = __gtools_group_targets[2] + 1 + mata: __gtools_group_targets[3] = __gtools_group_targets[3] + 1 + } + else { + mata: __gtools_group_targets[2] = 0 + } + + if ( "`tag'" != "" ) { + local ++kvars_group + scalar __gtools_encode = __gtools_encode + 4 + if ( `new_tag' ) { + mata: __gtools_togen_types[`topos'] = "`tag_type'" + mata: __gtools_togen_names[`topos'] = "`tag_name'" + local ++topos + } + else { + mata: __gtools_group_init[3] = 1 + } + mata: __gtools_group_targets[3] = __gtools_group_targets[3] + 1 + } + else { + mata: __gtools_group_targets[3] = 0 + } + + qui mata: __gtools_togen_k = sum(__gtools_togen_names :!= missingof(__gtools_togen_names)) + qui mata: __gtools_togen_s = 1::((__gtools_togen_k > 0)? __gtools_togen_k: 1) + qui mata: (__gtools_togen_k > 0)? st_addvar(__gtools_togen_types[__gtools_togen_s], __gtools_togen_names[__gtools_togen_s]): "" + + local msg "Generated targets" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') + } + else local etargets "" + + scalar __gtools_k_group = `kvars_group' + mata: st_matrix("__gtools_group_targets", __gtools_group_targets) + mata: st_matrix("__gtools_group_init", __gtools_group_init) + mata: mata drop __gtools_group_targets + mata: mata drop __gtools_group_init + + * Parse by types + * -------------- + + * Finally parse the by variables We process the set of by variables. + * differently depending on their type. If any are strings, then we + * use the spooky hash regardless. If all are numbers, we may use a + * bijection, which is faster, instead. + * + * Here we obtain the number of string variables, the number of + * numeric variables, and the length of each string variables (to + * adequately allocate memory internally). For numeric variables + * we also need the min and the max, but we will find that out + * internally later on. 
+ * + * Last, we parse whether or not to invert the sort orner of a given + * by variable ("-" preceding it). If option -ds- is passed, then "-" + * is interpret as the "to" operator in Stata's varlist notation. + + if ( `"`anything'"' != "" ) { + local clean_anything: copy local anything + local clean_anything: subinstr local clean_anything "+" " ", all + if ( strpos(`"`clean_anything'"', "-") & ("`ds'`nods'" == "") ) { + disp as txt "'-' interpreted as negative; use option -ds- to interpret as varlist" + disp as txt "(to suppress this warning, use option -nods-)" + } + if ( "`ds'" != "" ) { + local clean_anything `clean_anything' + if ( "`clean_anything'" == "" ) { + di as err "Invalid varlist: `anything'" + clean_all 198 + exit 198 + } + cap ds `clean_anything' + if ( _rc ) { + cap noi ds `clean_anything' + local rc = _rc + clean_all `rc' + exit `rc' + } + local clean_anything `r(varlist)' + } + else { + local clean_anything: subinstr local clean_anything "-" " ", all + local clean_anything `clean_anything' + if ( "`clean_anything'" == "" ) { + di as err "Invalid list: '`anything''" + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + clean_all 198 + exit 198 + } + cap ds `clean_anything' + if ( _rc ) { + local notfound + foreach var of local clean_anything { + cap ds `var' + if ( _rc ) { + local notfound `notfound' `var' + } + } + if ( `:list sizeof notfound' > 0 ) { + if ( `:list sizeof notfound' > 1 ) { + di as err "Variables not found: `notfound'" + } + else { + di as err "Variable `notfound' not found" + } + } + clean_all 111 + exit 111 + } + qui ds `clean_anything' + local clean_anything `r(varlist)' + } + cap noi check_matsize `clean_anything' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + if ( "`ds'" == "" ) local nods nods + + local opts `compress' `forcestrl' glevelsof(`glevelsof') `ds' + cap noi parse_by_types `anything' `ifin', clean_anything(`clean_anything') `opts' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit 
`rc' + } + + local invert = `r(invert)' + local byvars = "`r(varlist)'" + local bynum = "`r(varnum)'" + local bystr = "`r(varstr)'" + local bystrL = "`r(varstrL)'" + global GTOOLS_BYNAMES: copy local byvars + + * Unfortunately, the number of by variables we can process is + * limited by the number of entries we can store in a Stata matrix. + * We _could_ hack our way around this, but it would be very + * cumbersome for very little payoff. (Is it that common to request + * more than 800p by variables, sources, or targets? Or 11,000 in the + * case of MP?) + * + * Anyway, we check whether the largest allowed number of entries + * in a matrix is at least as large as the number of variables. If + * it's not, we try to set matsize to that number so we don't get any + * errors. If we reach Stata's limit then we throw an error and let + * the user know about this limitation. + + if ( "`byvars'" != "" ) { + cap noi check_matsize `byvars' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + + if ( "`targets'" != "" ) { + cap noi check_matsize `targets' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + + if ( "`sources'" != "" ) { + cap noi check_matsize `sources' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + + if ( inlist("`gfunction'", "levelsof") & ("`byvars'" == "") ) { + di as err "gfunction(`gfunction') requires at least one variable." 
+ clean_all 198 + exit 198 + } + + * Parse position of by variables + * ------------------------------ + + * __gtools_strpos and __gtools_numpos collect, for each variable in + * bystr and bynum respectively, its position within byvars. Both are + * set to 0 when there are no by variables. + + if ( "`byvars'" != "" ) { + cap matrix drop __gtools_strpos + cap matrix drop __gtools_numpos + + foreach var of local bystr { + matrix __gtools_strpos = nullmat(__gtools_strpos), /// + `:list posof `"`var'"' in byvars' + } + + foreach var of local bynum { + matrix __gtools_numpos = nullmat(__gtools_numpos), /// + `:list posof `"`var'"' in byvars' + } + } + else { + matrix __gtools_strpos = 0 + matrix __gtools_numpos = 0 + } + + * Parse sources, targets, stats (sources and targets MUST exist!) + * --------------------------------------------------------------- + + * Here we code the position of each source and each target relative + * to each source. A single source can be the base of multiple + * targets. That is, consider: + * + * source1 source2 source3 source4 + * target1 target2 target3 target4 + * + * It could be the case that, for example, + * + * source1 = source3 + * source2 = source4 + * + * Hence we pass the variable list as + * + * source1 source2 target1 target2 target3 target4 + * + * And the source of each target is (1, 2, 1, 2). + * + * We also need to encode the stat requested. It's inconsequential + * for a few groups, but if there are a large number of groups + * then it's much more efficient to use numbers to determine which + * statistic to compute than strings.
+ + matrix __gtools_stats = 0 + matrix __gtools_pos_targets = 0 + scalar __gtools_k_vars = 0 + scalar __gtools_k_targets = 0 + scalar __gtools_k_stats = 0 + + if ( "`sources'`targets'`stats'" != "" ) { + if ( "`gfunction'" == "collapse" ) { + if regexm("`gcollapse'", "^(forceio|switch)") { + local k_exist k_exist(sources) + } + if regexm("`gcollapse'", "^read") { + local k_exist k_exist(targets) + } + } + + * Capture the call so that _rc below is the return code of + * parse_targets itself (not of an earlier capture) and so that + * a failure goes through clean_all before exiting. + cap noi parse_targets, sources(`sources') /// + targets(`targets') /// + stats(`stats') /// + `k_exist' `replace' `keepmissing' + + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + if ( "`freq'" != "" ) { + cap confirm variable `freq' + if ( _rc ) { + di as err "Target `freq' has to exist." + clean_all 198 + exit 198 + } + + cap confirm numeric variable `freq' + if ( _rc ) { + di as err "Target `freq' must be numeric." + clean_all 198 + exit 198 + } + + scalar __gtools_k_targets = __gtools_k_targets + 1 + scalar __gtools_k_stats = __gtools_k_stats + 1 + matrix __gtools_stats = __gtools_stats, -14 + matrix __gtools_pos_targets = __gtools_pos_targets, 0 + } + + local intersection: list __gtools_targets & byvars + if ( "`intersection'" != "" ) { + if ( "`replace'" == "" ) { + di as error "targets in are also in by(): `intersection'" + * Clean up before aborting, as every other error path here does + clean_all 110 + error 110 + } + } + + local extravars `__gtools_sources' `__gtools_targets' `freq' + } + else local extravars "" + + local msg "Parsed by variables" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') + + *********************************************************************** + * Debug!
* + *********************************************************************** + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/setup} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" sep: `sep' "' + disp as txt `" colsep: `colsep' "' + disp as txt `" numfmt: `numfmt' "' + disp as txt `" numlen: `numlen' "' + disp as txt `""' + disp as txt `" tag_name: `tag_name' "' + disp as txt `" tag_type: `tag_type' "' + disp as txt `" gen_name: `gen_name' "' + disp as txt `" gen_type: `gen_type' "' + disp as txt `" counts_name: `counts_name'"' + disp as txt `" counts_type: `counts_type'"' + disp as txt `""' + disp as txt `" clean_anything: `clean_anything'"' + disp as txt `" invert: `invert'"' + disp as txt `" byvars: `byvars'"' + disp as txt `" bynum: `bynum'"' + disp as txt `" bystr: `bystr'"' + disp as txt `""' + disp as txt `" __gtools_sources: `__gtools_sources'"' + disp as txt `" __gtools_targets: `__gtools_targets'"' + disp as txt `" extravars: `extravars'"' + + scalar list + matrix dir + } + + *********************************************************************** + * Call the plugin * + *********************************************************************** + + local rset = 1 + local opts oncollision(`oncollision') + if ( "`gfunction'" == "sort" ) { + + * Sorting using plugins internally involves several steps: + * + * 1) Make a copy of the data in memory + * 2) Sort the copy of the data in place + * 3) Copy the sorted copy back into Stata + * + * While step 2, the sort itself, is much faster in C, steps + * 1 and 3 make it so such an implementation is actually much + * slower than sorting in Stata. This involves only one step: + * Sort the copy of the data in place. + * + * Hence we use a trick! 
+ * 1) Generate an index + * 2) Make a copy of the indexed sort variables + * 3) Sort the indexed copy + * 4) Copy the index to Stata + * 5) Re-arrange the data in place using the index + * + * This is still a multi-step process that is not particularly + * fast. Hence Stata, specially Stata/MP, can often still sort + * faster (since it's only one step). + + * Andrew Mauer's trick? From ftools + * --------------------------------- + + local contained 0 + local sortvar : sortedby + forvalues k = 1 / `:list sizeof byvars' { + if ( "`:word `k' of `byvars''" == "`:word `k' of `sortvar''" ) { + local ++contained + } + } + * di "`contained'" + + * Check if already sorted + if ( "`skipcheck'" == "" ) { + if ( !`invert' & ("`sortvar'" == "`byvars'") ) { + if ( "`verbose'" != "" ) di as txt "(already sorted)" + clean_all 0 + exit 0 + } + else if ( !`invert' & (`contained' == `:list sizeof byvars') ) { + * If the first k sorted variables equal byvars, just call sort + if ( "`verbose'" != "" ) di as txt "(already sorted)" + sort `byvars', `:disp cond("`bystrL'" == "", "", "stable")' + clean_all 0 + exit 0 + } + else if ( "`sortvar'" != "" ) { + * Andrew Maurer's trick to clear `: sortedby' + qui set obs `=_N + 1' + loc sortvar : word 1 of `sortvar' + loc sortvar_type : type `sortvar' + loc sortvar_is_str = strpos("`sortvar_type'", "str") == 1 + + if ( `sortvar_is_str' ) { + qui replace `sortvar' = `"."' in `=_N' + } + else { + qui replace `sortvar' = 0 in `=_N' + } + qui drop in `=_N' + } + } + else { + if ( "`sortvar'" != "" ) { + * Andrew Maurer's trick to clear `: sortedby' + qui set obs `=_N + 1' + loc sortvar : word 1 of `sortvar' + loc sortvar_type : type `sortvar' + loc sortvar_is_str = strpos("`sortvar_type'", "str") == 1 + + if ( `sortvar_is_str' ) { + qui replace `sortvar' = `"."' in `=_N' + } + else { + qui replace `sortvar' = 0 in `=_N' + } + qui drop in `=_N' + } + } + + * Use sortindex for the shuffle + * ----------------------------- + + if ( "`bystrL'" !=
"" ) { + disp as txt "({bf:warning}: hashsort with strL variables is {bf:slow})" + } + + local hopts benchmark(`benchmark') `invertinmata' + cap noi hashsort_inner `byvars' `etargets', `hopts' + cap noi rc_dispatch `byvars', rc(`=_rc') `opts' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + if ( ("`gen_name'" == "") | ("`sortgen'" == "") ) { + if ( `invert' ) { + mata: st_numscalar("__gtools_first_inverted", /// + selectindex(st_matrix("__gtools_invert"))[1]) + if ( `=scalar(__gtools_first_inverted)' > 1 ) { + local sortvars "" + forvalues i = 1 / `=scalar(__gtools_first_inverted) - 1' { + local sortvars `sortvars' `:word `i' of `byvars'' + } + scalar drop __gtools_first_inverted + sort `sortvars', `:disp cond("`bystrL'" == "", "", "stable")' + } + } + else { + sort `byvars', `:disp cond("`bystrL'" == "", "", "stable")' + } + } + else if ( ("`gen_name'" != "") & ("`sortgen'" != "") ) { + sort `gen_name', `:disp cond("`bystrL'" == "", "", "stable")' + } + + local msg "Stata reshuffle" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') off + + * Storage type for the optional sortindex variable, which is filled + * with _n: long while _N is below maxlong(), double otherwise. + if ( `=_N < maxlong()' ) { + local stype long + } + else { + local stype double + } + if ( "`sortindex'" != "" ) gen `stype' `sortindex' = _n + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/sort} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" contained: `contained'"' + disp as txt `" skipcheck: `skipcheck'"' + disp as txt `" sortvar: `sortvar'"' + disp as txt `" sortvar_type: `sortvar_type'"' + disp as txt `" sortvar_is_str: `sortvar_is_str'"' + disp as txt `" gen_name: `gen_name'"' + disp as txt `" sortgen: `sortgen'"' + disp as txt `" sortindex: `sortindex'"' + disp as txt `""' + disp as txt `" byvars: `byvars'"' + disp as txt `" etargets: `etargets'"' + disp as txt `" hopts: `hopts'"' + disp as txt `""' + } + } + else if ( "`gfunction'" == "collapse" ) { + + * Collapse is a convoluted function.
It would be simpler if + * Stata's C API was nicer, but due to the way it's written, + * we require a number of workarounds. See gcollapse.ado for + * details. + + local 0 `gcollapse' + syntax anything, [st_time(real 0) fname(str) ixinfo(str) merge] + scalar __gtools_st_time = `st_time' + scalar __gtools_used_io = 0 + scalar __gtools_ixfinish = 0 + scalar __gtools_J = _N + scalar __gtools_init_targ = (`"`ifin'"' != "") & ("`merge'" != "") & ("`init'" == "") + if ( (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" != "") ) NoInitWarning + + if inlist("`anything'", "forceio", "switch") { + local extravars `__gtools_sources' `__gtools_sources' `freq' + } + if inlist("`anything'", "read") { + local extravars `: list __gtools_targets - __gtools_sources' `freq' + } + + local plugvars `byvars' `etargets' `extravars' `ixinfo' + scalar __gtools_weight_pos = `:list sizeof plugvars' + 1 + + cap noi plugin call gtools_plugin `plugvars' `wvar' `ifin', /// + collapse `anything' `"`fname'"' + + cap noi rc_dispatch `byvars', rc(`=_rc') `opts' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + if ( "`anything'" != "read" ) { + scalar __gtools_J = `r_J' + return scalar N = `r_N' + return scalar J = `r_J' + return scalar minJ = `r_minJ' + return scalar maxJ = `r_maxJ' + local rset = 0 + } + + if ( `=scalar(__gtools_ixfinish)' ) { + local msg "Switch code runtime" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') + + qui mata: st_addvar(__gtools_gc_addtypes, __gtools_gc_addvars, 1) + local msg "Added targets" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') + + local extravars `__gtools_sources' `__gtools_targets' `freq' + local plugvars `byvars' `etargets' `extravars' `ixinfo' + scalar __gtools_weight_pos = `:list sizeof plugvars' + 1 + + cap noi plugin call gtools_plugin `plugvars' `wvar' `ifin', /// + collapse ixfinish `"`fname'"' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + local msg "Finished collapse" + gtools_timer 
info `t98' `"`msg'"', prints(`benchmark') off + } + else { + local msg "C plugin runtime" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') off + } + + return scalar used_io = `=scalar(__gtools_used_io)' + local runtxt " (internals)" + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/collapse} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" byvars: `byvars'"' + disp as txt `" etargets: `etargets'"' + disp as txt `" extravars: `extravars'"' + disp as txt `" ixinfo: `ixinfo'"' + disp as txt `""' + disp as txt `" [if] [in]: `if' `in'"' + disp as txt `" wvar: `wvar'"' + disp as txt `" fname: `fname'"' + disp as txt `" anything: `anything'"' + disp as txt `""' + + scalar list __gtools_st_time + scalar list __gtools_used_io + scalar list __gtools_ixfinish + scalar list __gtools_J + scalar list __gtools_init_targ + scalar list __gtools_weight_pos + scalar list __gtools_J + } + } + else { + + * The rest of the functions can be easily dispatched using + * a similar set of steps. Internally: + * + * 1. Hash, index + * 2. Sort indexed hash + * 3. Determine group sizes and cut points + * 4. Use index and group info to compute the function + * + * NOTE: If there are targets (as with egen, collapse, or generic + * hash), they are replaced with missing values internally right + * before writing the output. Special functions tag, group, + * and count are initialized as well, should they have been + * requested. 
+ + if ( inlist("`gfunction'", "unique", "egen", "hash") ) { + local gcall hash + scalar __gtools_init_targ = (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" == "") + if ( (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" != "") ) NoInitWarning + } + else if ( inlist("`gfunction'", "reshape") ) { + local 0: copy local greshape + syntax anything, xij(str) [j(str) xi(str) File(str) STRing(int 0) DROPMISSing] + + gettoken shape readwrite: anything + local readwrite `readwrite' + if !inlist(`"`shape'"', "long", "wide") { + disp "`shape' unknown: only long and wide are supported" + exit 198 + } + if !inlist(`"`readwrite'"', "fwrite", "write", "read") { + disp "`readwrite' unknown: only fwrite, write, and read are supported" + exit 198 + } + + if ( inlist(`"`readwrite'"', "fwrite", "write") ) { + if ( `"`shape'"' == "long" ) { + local reshapevars `xi' `xij' + } + else { + local reshapevars `xij' `xi' + } + } + else { + local reshapevars `xij' `xi' + } + + local gcall `gfunction' `readwrite' `"`file'"' + + scalar __gtools_greshape_code = cond(`"`shape'"' == "wide", 2, 1) + if ( (`"`shape'"' == "wide") | ("`readwrite'" == "read") ) { + local reshapevars `j' `reshapevars' + } + scalar __gtools_greshape_str = `string' + scalar __gtools_greshape_kxi = `:list sizeof xi' + scalar __gtools_greshape_dropmiss = ( `"`dropmissing'"' != "" ) + } + else if ( inlist("`gfunction'", "regress") ) { + local gcall `gfunction' `"${GTOOLS_GREG_FILE}"' + local 0: copy local gregress + // syntax varlist(numeric ts fv), [ /// TODO: ts fv not yet supported! + syntax varlist(numeric), [ /// TODO: ts fv not yet supported! 
+ Robust /// Robust SE + cluster(str) /// Cluster by varlist + absorb(varlist) /// Absorb each var in varlist as FE + interval(str) /// Interval for rolling regressions + window(str) /// Window for moving regressions + hdfetol(real 1e-8) /// Tolerance for hdfe convergence + STANdardize /// standardize before applying transform + TRACEiter /// trace iteration progress (internal hdfe) + maxiter(real 100000) /// maximum number of hdfe iterations + algorithm(str) /// alias for method + method(str) /// projection method for hdfe + /// map (method of alternating projections) + /// squarem + /// conjugate gradient|cg (default) + /// it|irons tuck + noConstant /// Whether to add a constant + /// + ivkendog(int 0) /// IV endogenous + ivkexog(int 0) /// IV exogenous + ivkz(int 0) /// IV instruments + /// + glmtol(real 1e-8) /// Tolerance for GLM (IRLS) convergence + glmiter(int 1000) /// Max iterations for GLM convergence + glmfam(str) /// GLM family + glmlink(str) /// GLM link function + /// + mata(str) /// save in mata (default) + GENerate(str) /// save in varlist + prefix(str) /// save prepending prefix + PREDict(str) /// save fit in `predict' + resid /// save residuals in _resid_`yvarlist' + RESIDuals(str) /// save residuals in `residuals' + replace /// Replace targets, if they exist + noinit /// Do not initialize targets with missing values + ] + + if ( ("`algorithm'" != "") & ("`method'" != "") ) { + disp as err "gregress: method() is an alias for algorithm(); specify only one" + clean_all 198 + exit 198 + } + if ( `"`algorithm'"' == "" ) local algorithm cg + if ( `"`method'"' != "" ) local algorithm: copy local method + local method: copy local algorithm + + if ( `maxiter' < 1 ) { + disp as err "gregress: maxiter() must be >= 1" + clean_all 198 + exit 198 + } + + if ( missing(`maxiter') ) local maxiter 0 + local maxiter = floor(`maxiter') + + if ( lower(`"`method'"') == "map" ) { + local method_code 1 + local method map + } + else if ( lower(`"`method'"') == "squarem" 
) { + local method_code 2 + local method squarem + } + else if ( inlist(lower(`"`method'"'), "conjugate gradient", "conjugate_gradient", "cg") ) { + local method_code 3 + local method cg + } + else if ( inlist(lower(`"`method'"'), "irons and tuck", "irons tuck", "irons_tuck", "it") ) { + local method_code 5 + local method it + } + else if ( inlist(lower(`"`method'"'), "bit", "berge_it", "berge it") ) { + * TODO: gives segfault on some runs last I checked; debug someday. + * Option is undocumented but I leave it here for myself. + local method_code 6 + local method bit + } + else { + disp as err "gstats_hdfe: method() must be one of: map, squarem, cg, it" + clean_all 198 + exit 198 + } + + local ivregress + if ( `ivkendog' > 0 ) { + if ( `ivkz' >= `ivkendog' ) { + local ivregress ivregress + } + else { + disp as error "Need at least as many instruments as endogenous variables (received `ivkz' < `ivkendog')" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + else if ( `ivkz' > 0 ) { + disp as error "Detected instruments but no endogenous variables for IV regresssion" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`"`window'"' != "") & (`"`interval'"' != "") ) { + disp as err "moving() and window() are mutually exclusive options" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`"`window'"' != "") | (`"`interval'"' != "") ) { + if ( `"`window'"' != "" ) local what window + if ( `"`interval'"' != "" ) local what interval + + disp as err "option `what'() is planned for the next release" + local rc = 198 + clean_all `rc' + exit `rc' + + if ( `"`cluster'"' != "" ) { + disp as err "cluster() cannot yet be combined with `what'(); this is planned for the next release" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( `"`absorb'"' != "" ) { + disp as err "absorb() cannot yet be combined with `what'(); this is planned for the next release" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + + if ( `"`window'"' != "" ) { + encode_moving 
moving regress `window' + * NOTE: this branch only runs when window() is non-empty, and window() + * and interval() are mutually exclusive, so the window specification + * (not the interval) is what has to be parsed here. + if ( `r(warn)' ) { + disp as txt "{bf:note:} requested window() without a window; will ignore" + } + else if ( `r(match)' ) { + scalar __gtools_gregress_moving = `r(scode)' + scalar __gtools_gregress_moving_l = `r(lower)' + scalar __gtools_gregress_moving_u = `r(upper)' + } + else { + disp as err "window() incorrectly specified" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + + local intervalvar + if ( `"`interval'"' != "" ) { + encode_range range regress `interval' + if ( `r(warn)' ) { + disp as txt "{bf:note:} requested interval() without an interval; will ignore" + } + else if ( `r(match)' & (`"`r(var)'"' == "") ) { + disp as err "interval() requires a variable; interval(lower upper varname)" + local rc = 198 + clean_all `rc' + exit `rc' + } + else if ( `r(match)' ) { + scalar __gtools_gregress_range = 1 + scalar __gtools_gregress_range_l = `r(lower)' + scalar __gtools_gregress_range_u = `r(upper)' + scalar __gtools_gregress_range_ls = `r(lcode)' + scalar __gtools_gregress_range_us = `r(ucode)' + local intervalvar `r(var)' + } + else { + disp as err "interval() incorrectly specified" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + + * TODO: strL support + if ( `"`cluster'"' != "" ) { + GenericParseTypes `cluster', mat(__gtools_gregress_clustyp) + } + + if ( (`"`cluster'"' == "") & (`"`robust'"' == "") & (`wcode' == 4) ) { + disp as txt "{bf:note:} robust SE will be computed with pweights" + } + + if ( (`"`cluster'"' == "") & (`"`robust'"' == "") & (`"`glmfam'"' != "") ) { + disp as txt "{bf:note:} robust SE will be computed with GLM (`glmfam')" + } + + if ( (`wcode' == 4) | (`"`glmfam'"' != "") ) { + local robust robust + } + + if ( `:list sizeof residuals' > 1 ) { + disp as err "resid() must specify a single variable name" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( `:list sizeof predict' > 1 ) { + disp as err "predict() must specify a single variable name" + local rc = 198 + clean_all `rc' + exit `rc'
+ } + + local regressvars `varlist' `cluster' `absorb' `intervalvar' + + scalar __gtools_gregress_hdfemethnm = cond(`:list sizeof absorb' > 1, "`method'", "direct") + scalar __gtools_gregress_hdfemethod = `method_code' + scalar __gtools_gregress_kvars = `:list sizeof varlist' + scalar __gtools_gregress_cons = `"`constant'"' != "noconstant" + scalar __gtools_gregress_robust = `"`robust'"' != "" + scalar __gtools_gregress_cluster = `:list sizeof cluster' + scalar __gtools_gregress_absorb = `:list sizeof absorb' + scalar __gtools_gregress_hdfetol = `hdfetol' + scalar __gtools_gregress_hdfemaxiter = `maxiter' + scalar __gtools_gregress_hdfetraceiter = "`traceiter'" != "" + scalar __gtools_gregress_hdfestandard = "`standardize'" != "" + scalar __gtools_gregress_glmfam = `"`glmfam'"' != "" + scalar __gtools_gregress_glmlogit = (`"`glmfam'"' == "binomial") & (`"`glmlink'"' == "logit") + scalar __gtools_gregress_glmpoisson = (`"`glmfam'"' == "poisson") & (`"`glmlink'"' == "log") + scalar __gtools_gregress_glmiter = `glmiter' + scalar __gtools_gregress_glmtol = `glmtol' + scalar __gtools_gregress_ivreg = `"`ivregress'"' != "" + scalar __gtools_gregress_ivkendog = `ivkendog' + scalar __gtools_gregress_ivkexog = `ivkexog' + scalar __gtools_gregress_ivkz = `ivkz' + + if ( scalar(__gtools_gregress_glmlogit) ) { + local Caller Logit + local caller glogit + } + else if ( scalar(__gtools_gregress_glmpoisson) ) { + local Caller Poisson + local caller gpoisson + } + else if ( scalar(__gtools_gregress_ivreg) ) { + local Caller IV + local caller givregress + } + else { + local Caller Regress + local caller gregress + } + + if ( scalar(__gtools_gregress_glmfam) & scalar(__gtools_gregress_ivreg) ) { + disp as err "Parsing error: GLM (`caller') and givregress cannot be run at the same time" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( scalar(__gtools_gregress_cluster) > 1 ) { + disp as txt "({bf:warning}: cluster() with multiple variables is assumed to be nested)" + } + + 
if ( scalar(__gtools_gregress_kvars) < 2 ) { + disp as err "2 or more variables required: depvar indepvar [indepvar ...]" + local rc = 198 + clean_all `rc' + exit `rc' + } + + local zvarlist + local yxvarlist: copy local varlist + gettoken yvarlist xvarlist: yxvarlist + scalar __gtools_gregress_kv = __gtools_gregress_kvars - 1 - __gtools_gregress_ivkz + if ( scalar(__gtools_gregress_ivreg) ) { + local _xvarlist + forvalues i = 1 / `=scalar(__gtools_gregress_kv)' { + local _xvarlist `_xvarlist' `:word `i' of `xvarlist'' + } + local zvarlist + forvalues i = `=scalar(__gtools_gregress_kv) + 1' / `=scalar(__gtools_gregress_kvars) - 1' { + local zvarlist `zvarlist' `:word `i' of `xvarlist'' + } + local xvarlist: copy local _xvarlist + } + scalar __gtools_gregress_kv = __gtools_gregress_kv + __gtools_gregress_cons * (__gtools_gregress_absorb == 0) + + if ( `"`mata'`generate'`prefix'"' == "" ) { + scalar __gtools_gregress_savemata = 1 + scalar __gtools_gregress_savemb = 1 + scalar __gtools_gregress_savemse = 1 + local saveGregressMata Gtools`Caller' + mata: `saveGregressMata' = GtoolsRegressOutput() + mata: `saveGregressMata'.whoami = `"`saveGregressMata'"' + } + else { + if ( `"`mata'"' != "" ) { + local 0 `mata' + cap noi syntax [namelist(max = 1)], [noB noSE] + if ( `"`namelist'"' == "" ) local namelist Gtools`Caller' + + scalar __gtools_gregress_savemata = 1 + scalar __gtools_gregress_savemb = `"`b'"' != "nob" + scalar __gtools_gregress_savemse = `"`se'"' != "nose" + + local saveGregressMata `namelist' + mata: `saveGregressMata' = GtoolsRegressOutput() + mata: `saveGregressMata'.whoami = `"`saveGregressMata'"' + } + + if ( (`"`generate'"' != "") & (`"`prefix'"' != "") ) { + local 0, `generate' `prefix' + cap syntax, [b(str) se(str) hdfe(str)] + if ( _rc ) { + disp as err "cannot specify multiple saves across gen() and prefix()" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + + if ( `"`generate'"' != "" ) { + local 0, `generate' + cap noi syntax, [b(str) 
se(str) hdfe(str)] + if ( _rc ) { + disp as err "error parsing gen()" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`:list sizeof b' != scalar(__gtools_gregress_kv)) & (`"`b'"' != "") ) { + disp as err "number of output variables in gen(b()) does not match number of inputs" + if ( scalar(__gtools_gregress_cons) ) { + if ( `:list sizeof b' == (scalar(__gtools_gregress_kv) - 1) ) { + disp as err "Did you forget the constant?" + } + } + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`:list sizeof se' != scalar(__gtools_gregress_kv)) & (`"`se'"' != "") ) { + disp as err "number of output variables in gen(se()) does not match number of inputs" + if ( scalar(__gtools_gregress_cons) ) { + if ( `:list sizeof se' == (scalar(__gtools_gregress_kv) - 1) ) { + disp as err "Did you forget the constant?" + } + } + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`"`hdfe'"' != "") & (scalar(__gtools_gregress_absorb) == 0) ) { + disp as err "gen(hdfe()) without absorb() just makes a copy of the variables" + } + else if ( (`:list sizeof hdfe' != scalar(__gtools_gregress_kvars)) & (`"`hdfe'"' != "") ) { + disp as err "number of output variables in gen(hdfe()) does not match number of inputs" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( "`replace'" == "" ) { + cap noi confirm new var `b' `se' `hdfe' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + + local nvar = 0 + local togen + foreach var in `b' `se' `hdfe' { + cap confirm new var `var' + if ( _rc == 0 ) { + local togen `togen' `var' + local ++nvar + } + } + + scalar __gtools_gregress_savegb = `"`b'"' != "" + scalar __gtools_gregress_savegse = `"`se'"' != "" + scalar __gtools_gregress_saveghdfe = `"`hdfe'"' != "" + + if ( `nvar' > 0 ) { + qui mata: (void) st_addvar(J(1, `nvar', `"`:set type'"'), tokens(`"`togen'"')) + } + local regressvars `regressvars' `b' `se' `hdfe' + } + + if ( `"`prefix'"' != "" ) { + local 0, `prefix' + cap noi syntax, [b(str) se(str) 
hdfe(str)] + if ( _rc ) { + disp as err "error parsing prefix()" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`:list sizeof b' != 1) & (`"`b'"' != "") ) { + disp as err "specify a single prefix in prefix(b())" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`:list sizeof se' != 1) & (`"`se'"' != "") ) { + disp as err "specify a single prefix in prefix(se())" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`:list sizeof hdfe' != 1) & (`"`hdfe'"' != "") ) { + disp as err "specify a single prefix in prefix(hdfe())" + local rc = 198 + clean_all `rc' + exit `rc' + } + + if ( (`"`hdfe'"' != "") & (scalar(__gtools_gregress_absorb) == 0) ) { + disp as err "prefix(hdfe()) without absorb() just makes a copy of the variables" + } + + local bvars + local sevars + if ( `"`hdfe'"' != "" ) { + local hdfevars `hdfe'`yvarlist' + cap confirm name `hdfe'`yvarlist' + if ( _rc ) { + disp as err "prefix(hdfe()) results in invalid variable name, `hdfe'`yvarlist'" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + + if ( scalar(__gtools_gregress_cons) * (scalar(__gtools_gregress_absorb) == 0) ) { + local cons cons + } + else local cons + + foreach xvar in `xvarlist' `cons' { + if ( `"`b'"' != "" ) { + local bvars `bvars' `b'`xvar' + cap confirm name `b'`xvar' + if ( _rc ) { + disp as err "prefix(b()) results in invalid variable name, `b'`xvar'" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + if ( `"`se'"' != "" ) { + local sevars `sevars' `se'`xvar' + cap confirm name `se'`xvar' + if ( _rc ) { + disp as err "prefix(se()) results in invalid variable name, `se'`xvar'" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + if ( `"`hdfe'"' != "" ) { + local hdfevars `hdfevars' `hdfe'`xvar' + cap confirm name `hdfe'`xvar' + if ( _rc ) { + disp as err "prefix(hdfe()) results in invalid variable name, `hdfe'`xvar'" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + } + + foreach zvar in `zvarlist' { + if ( `"`hdfe'"' != "" ) { + local 
hdfevars `hdfevars' `hdfe'`zvar' + cap confirm name `hdfe'`zvar' + if ( _rc ) { + disp as err "prefix(hdfe()) results in invalid variable name, `hdfe'`zvar'" + local rc = 198 + clean_all `rc' + exit `rc' + } + } + } + + if ( "`replace'" == "" ) { + cap noi confirm new var `bvars' `sevars' `hdfevars' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + + local nvar = 0 + local togen + foreach var in `bvars' `sevars' `hdfevars' { + cap confirm new var `var' + if ( _rc == 0 ) { + local togen `togen' `var' + local ++nvar + } + } + + scalar __gtools_gregress_savegb = `"`b'"' != "" + scalar __gtools_gregress_savegse = `"`se'"' != "" + scalar __gtools_gregress_saveghdfe = `"`hdfe'"' != "" + + if ( `nvar' > 0 ) { + qui mata: (void) st_addvar(J(1, `nvar', `"`:set type'"'), tokens(`"`togen'"')) + } + + local regressvars `regressvars' `bvars' `sevars' `hdfevars' + } + } + + scalar __gtools_gregress_savegresid = `"`resid'`residuals'"' != "" + if ( scalar(__gtools_gregress_savegresid) ) { + if ( ("`resid'" != "") & ("`residuals'" != "") ) { + disp as txt "warning: option -resid- ignored with option resid()" + } + if ( "`residuals'" == "" ) { + local residuals _resid_`yvarlist' + } + if ( "`replace'" == "" ) { + cap noi confirm new var `residuals' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + else { + cap confirm new var `residuals' + } + if ( _rc == 0 ) { + qui mata: (void) st_addvar(`"`:set type'"', `"`residuals'"') + } + local regressvars `regressvars' `residuals' + } + + scalar __gtools_gregress_savegpred = `"`predict'"' != "" + if ( scalar(__gtools_gregress_savegpred) ) { + disp as txt "{bf:Warning}: The behavior of predict() is different cross functions." + disp as txt "Do not use unless you understand the code and know what it does." 
+ if ( "`replace'" == "" ) { + cap noi confirm new var `predict' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + } + else { + cap confirm new var `predict' + } + if ( _rc == 0 ) { + qui mata: (void) st_addvar(`"`:set type'"', `"`predict'"') + } + local regressvars `regressvars' `predict' + } + + * TODO: strL support + if ( `"`absorb'"' != "" ) { + local 0: copy local absorb + syntax varlist, [save(str)] + local absorb: copy local varlist + GenericParseTypes `absorb', mat(__gtools_gregress_abstyp) + scalar __gtools_gregress_savegabs = `"`save'"' != "" + } + + * --------------------------- + * TODO: xx What was this for? + * --------------------------- + * if ( scalar(__gtools_gregress_savegabs) ) { + * if ( "`replace'" == "" ) { + * cap noi confirm new var `save' + * if ( _rc ) { + * local rc = _rc + * clean_all `rc' + * exit `rc' + * } + * } + * else { + * cap confirm new var `save' + * } + * if ( _rc == 0 ) { + * qui mata: (void) st_addvar(`"`:set type'"', `"`save'"') + * } + * local regressvars `regressvars' `save' + * } + + if ( `"`saveGregressMata'"' != "" ) { + mata: `saveGregressMata'.init() + } + + if ( `wcode' == 3 ) { + disp as txt "{bf:note:} iweights mimic the behavior of aweights" + } + + scalar __gtools_init_targ = (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" == "") + if ( (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" != "") ) NoInitWarning + } + else if ( inlist("`gfunction'", "stats") ) { + local gcall `gfunction' `"${GTOOLS_GSTATS_FILE}"' + gettoken gstat gstats: gstats + cap noi gstats_`gstat' `gstats' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + local statvars `varlist' + + * Note: This seems inefficient; if in ought to be done one + * level above in gstats... not to mention that it will force + * initializing the targets every time. 
+ if ( "`gstat'" == "hdfe" ) { + tempvar touse + mark `touse' `ifin' + markout `touse' `__gtools_hdfe_markvars', strok + local if if `touse' + mata st_local("ifin", st_local("if") + " " + st_local("in")) + } + + scalar __gtools_init_targ = (`"`ifin'"' != "") & ("`gstats_replace'" != "") & ("`gstats_init'" == "") + if ( ("`gstat'" == "winsor") & `:list sizeof gstats_replace_anysrc' & `=scalar(__gtools_init_targ)' ) { + disp as err "gstats winsor: -replace- with source as target not allowed with if/in" + clean_all 198 + exit 198 + } + + if ( ("`gstat'" == "transform") & ("`gstats_greedy'" != "") & `:list sizeof gstats_replace_anysrc' & `=scalar(__gtools_init_targ)' ) { + disp as err "gstats transform: -replace- source as target not allowed with if/in and -nogreedy-" + clean_all 198 + exit 198 + } + + if ( (`"`ifin'"' != "") & ("`gstats_replace'" != "") & ("`gstats_init'" != "") ) NoInitWarning + } + else if ( inlist("`gfunction'", "contract") ) { + local 0 `gcontract' + syntax varlist, contractwhich(numlist) + local gcall `gfunction' + local contractvars `varlist' + mata: st_matrix("__gtools_contract_which", /// + strtoreal(tokens(`"`contractwhich'"'))) + local runtxt " (internals)" + } + else if ( inlist("`gfunction'", "levelsof") ) { + local 0, `glevelsof' + syntax, [ /// + noLOCALvar /// + freq(str) /// + store(str) /// + gen(str) /// + silent /// + MATAsave /// + MATAsavename(str) /// + ] + local gcall `gfunction' + scalar __gtools_levels_return = ( `"`localvar'"' == "" ) + + if ( "`store'" != "" ) { + di as err "store() is planned for a future release." + clean_all 198 + exit 198 + } + + if ( "`freq'" != "" ) { + di as err "freq() is planned for a future release." 
+ clean_all 198 + exit 198 + } + + local replace_ `replace' + local 0 `gen' + syntax [anything], [replace] + + scalar __gtools_levels_mataname = `"`matasavename'"' + scalar __gtools_levels_matasave = ( `"`matasave'"' != "" ) + scalar __gtools_levels_silent = ( `"`silent'"' != "" ) + scalar __gtools_levels_gen = ( `"`gen'"' != "" ) + scalar __gtools_levels_replace = ( `"`replace'"' != "" ) + + local k1: list sizeof anything + local k2: list sizeof byvars + + // 1. gen(, replace) -> replaces existing varlist + // 2. gen(prefix) -> generates prefix* + // 4. gen(newvarlist) -> generates newvarlist + + if ( "`gen'" != "" ) { + if ( ("`replace'" == "") & (`k1' == 0) ) { + disp as err "{opt gen()} requires a prefix, target names, or {opt gen(, replace)}." + clean_all 198 + exit 198 + } + + if ( ("`replace'" != "") & (`k1' > 0) ) { + disp as err "{opt gen(, replace)} can only replace the source variables, not arbitrary targets." + clean_all 198 + exit 198 + } + + local level_targets + if ( `k1' > 0 ) { + cap confirm name `anything' + if ( _rc ) { + disp as err "{opt gen()} must specify a variable name or prefix" + clean_all 198 + exit 198 + } + + if ( `k1' > 1 ) { + cap assert (`k1') == (`k2') + if ( _rc ) { + disp as err "{opt gen()} must specify a single prefix or one name per target." + clean_all 198 + exit 198 + } + + cap confirm new var `anything' + if ( _rc ) { + disp as err "{opt gen()} must specify new variable names." + clean_all 198 + exit 198 + } + local level_targets `anything' + } + else { + local level_targets + foreach var of varlist `byvars' { + local level_targets `level_targets' `anything'`var' + } + + cap confirm new var `level_targets' + if ( _rc ) { + disp as err "{opt gen()} must specify new variable names." 
+ clean_all 198 + exit 198 + } + } + + local level_types + foreach var of varlist `byvars' { + local level_types `level_types' `:type `var'' + } + + qui mata: st_addvar(tokens(`"`level_types'"'), tokens(`"`level_targets'"')) + qui mata: __gtools_level_targets = tokens(`"`level_targets'"') + + local plugvars `byvars' `etargets' `extravars' + scalar __gtools_levels_gen = `:list sizeof plugvars' + 1 + } + } + + local 0, `store' + syntax, [GENerate(str) genpre(str) MATrix(str) replace(str)] + + local 0, `freq' + syntax, [GENerate(str) MATrix(str) replace(str)] + + * Check which exist (w/replace) and create empty vars + * Pass to plugin call + + * store(matrix(name)) <- only numeric + * store(data(varlist)) <- any type; must be same length as by vars + * store(data prefix(prefix) [truncate]) <- prefix; must be valid stata names + * freq(matrix(name)) + * freq(mata(name)) + + local replace `replace_' + } + else if ( inlist("`gfunction'", "top") ) { + local 0, `gtop' + syntax, ntop(real) /// + pct(real) /// + freq(real) /// + [ /// + misslab(str) /// + otherlab(str) /// + groupmiss /// + MATAsave /// + MATAsavename(str) /// + alpha /// + invert /// + silent /// + noVALUELABels /// + ] + local gcall `gfunction' + + scalar __gtools_top_ntop = `ntop' + scalar __gtools_top_pct = `pct' + scalar __gtools_top_freq = `freq' + scalar __gtools_top_mataname = `"`matasavename'"' + scalar __gtools_top_matasave = ( `"`matasave'"' != "" ) + scalar __gtools_top_silent = ( `"`silent'"' != "" ) + scalar __gtools_top_vlab = ( `"`valuelabels'"' == "" ) + scalar __gtools_top_invert = ( `"`invert'"' != "" ) + scalar __gtools_top_alpha = ( `"`alpha'"' != "" ) + scalar __gtools_top_miss = ( `"`misslab'"' != "" ) + scalar __gtools_top_groupmiss = ( `"`groupmiss'"' != "" ) + scalar __gtools_top_other = ( `"`otherlab'"' != "" ) + scalar __gtools_top_Jmiss = 0 + scalar __gtools_top_Jother = 0 + scalar __gtools_top_lmiss = length(`"`misslab'"') + scalar __gtools_top_lother = length(`"`otherlab'"') + 
scalar __gtools_top_nrows = abs(__gtools_top_ntop) /* + */ + __gtools_top_miss /* + */ + __gtools_top_other + + cap noi check_matsize, nvars(`=scalar(__gtools_kvars_num)') + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + local nrows = scalar(__gtools_top_nrows) + } + else if ( inlist("`gfunction'", "quantiles") ) { + + * gquantiles is the only complex function in this portion + * of the program. While it involves the same initial steps, + * it also requires additional work. In particular we need + * run a selection algorithm on the sources to compute the + * percentiles or xtile. + * + * The function does a number of other things, which I will + * not repeat here. For details see the documentation online: + * + * https://gtools.readthedocs.io/en/latest/usage/gquantiles/index.html + * + * In particular, the "examples" section. + + local 0 `gquantiles' + syntax [name], /// + [ /// + xsources(varlist numeric) /// + /// + Nquantiles(real 0) /// + /// + Quantiles(numlist) /// + cutoffs(numlist) /// + /// + quantmatrix(str) /// + cutmatrix(str) /// + /// + Cutpoints(varname numeric) /// + cutquantiles(varname numeric) /// + /// + pctile(name) /// + GENp(name) /// + BINFREQvar(name) /// + replace /// + /// + returnlimit(real 1001) /// + dedup /// + cutifin /// + cutby /// + _pctile /// + binfreq /// + method(int 0) /// + XMISSing /// + ALTdef /// + strict /// + minmax /// + ] + + local gcall `gfunction' + local xvars `namelist' /// + `pctile' /// + `binfreqvar' /// + `genp' /// + `cutpoints' /// + `cutquantiles' /// + `xsources' + + *************************** + * quantiles and cutoffs * + *************************** + + * First we need to parse quantmatrix and cutmatrix to find + * out how many quantiles or cutoffs we may have. 
+ + if ( "`quantmatrix'" != "" ) { + if ( "`quantiles'" != "" ) { + disp as err "Specify only one of quantiles() or quantmatrix()" + clean_all 198 + exit 198 + } + + tempname m c r + mata: `m' = st_matrix("`quantmatrix'") + mata: `c' = cols(`m') + mata: `r' = rows(`m') + cap mata: assert(min((`c', `r')) == 1) + if ( _rc ) { + disp as err "quantmatrix() must be a N by 1 or 1 by N matrix." + clean_all 198 + exit 198 + } + + cap mata: assert(all(`m' :> 0) & all(`m' :< 100)) + if ( _rc ) { + disp as err "quantmatrix() must contain all values" /// + " strictly between 0 and 100" + clean_all 198 + exit 198 + } + mata: st_local("xhow_nq2", strofreal(max((`c', `r')) > 0)) + mata: st_matrix("__gtools_xtile_quantiles", rowshape(`m', 1)) + mata: st_numscalar("__gtools_xtile_nq2", max((`c', `r'))) + } + else { + local xhow_nq2 = ( `:list sizeof quantiles' > 0 ) + scalar __gtools_xtile_nq2 = `:list sizeof quantiles' + } + + if ( "`cutmatrix'" != "" ) { + if ( "`cutoffs'" != "" ) { + disp as err "Specify only one of cutoffs() or cutmatrix()" + clean_all 198 + exit 198 + } + + tempname m c r + mata: `m' = st_matrix("`cutmatrix'") + mata: `c' = cols(`m') + mata: `r' = rows(`m') + cap mata: assert(min((`c', `r')) == 1) + if ( _rc ) { + disp as err "cutmatrix() must be a N by 1 or 1 by N matrix." + clean_all 198 + exit 198 + } + mata: st_local("xhow_cuts", strofreal(max((`c', `r')) > 0)) + mata: st_matrix("__gtools_xtile_cutoffs", rowshape(`m', 1)) + mata: st_numscalar("__gtools_xtile_ncuts", max((`c', `r'))) + } + else { + local xhow_cuts = ( `:list sizeof cutoffs' > 0 ) + scalar __gtools_xtile_ncuts = `:list sizeof cutoffs' + } + + ****************************** + * Rest of quantile parsing * + ****************************** + + * Make sure cutoffs/quantiles are correctly requested (can + * only specify 1 method!) 
+ + local xhow_nq = ( `nquantiles' > 0 ) + local xhow_cutvars = ( `:list sizeof cutpoints' > 0 ) + local xhow_qvars = ( `:list sizeof cutquantiles' > 0 ) + local xhow_total = `xhow_nq' /// + + `xhow_nq2' /// + + `xhow_cuts' /// + + `xhow_cutvars' /// + + `xhow_qvars' + + local early_rc = 0 + if ( "`_pctile'" != "" ) { + if ( `nquantiles' > `returnlimit' ) { + di as txt "Warning: {opt nquantiles()} > returnlimit" /// + " (`nquantiles' > `returnlimit')." /// + _n(1) "Will not store return values beyond" /// + " `returnlimit'. Try {opt pctile()}" /// + _n(1) "(Note: you can also pass {opt returnlimit(.)}" /// + " but that is very slow.)" + } + + if ( `:list sizeof quantiles' > `returnlimit' ) { + di as txt "Warning: # quantiles in" /// + " {opt quantiles()} > returnlimit" /// + " (`:list sizeof quantiles' > `returnlimit')." /// + _n(1) "Will not store return values beyond" /// + " `returnlimit'. Try {opt pctile()}" /// + _n(1) "(Note: you can also pass {opt returnlimit(.)}" /// + " but that is very slow.)" + } + + if ( `:list sizeof cutoffs' > `returnlimit' ) { + di as txt "Warning: # of cutoffs in" /// + " {opt cutoffs()} > returnlimit" /// + " (`:list sizeof cutoffs' > `returnlimit')." /// + _n(1) "Will not store return values beyond" /// + " `returnlimit'. 
Try {opt pctile()}" /// + _n(1) "(Note: you can also pass {opt returnlimit(.)}" /// + " but that is very slow.)" + } + } + + if ( `xhow_total' == 0 ) { + local nquantiles = 2 + } + else if (`xhow_total' > 1) { + if ( `nquantiles' > 0 ) local olist "`olist' nquantiles()" + if ( "`quantiles'" != "" ) local olist "`olist', quantiles()" + if ( "`quantmatrix'" != "" ) local olist "`olist', quantmatrix()" + if ( "`cutpoints'" != "" ) local olist "`olist', cutpoints()" + if ( "`cutmatrix'" != "" ) local olist "`olist', cutmatrix()" + if ( "`cutquantiles'" != "" ) local olist "`olist', cutquantiles()" + if ( "`cutoffs'" != "" ) local olist "`olist', cutoffs()" + di as err "Specify only one of: `olist'" + local early_rc = 198 + } + + if ( `xhow_nq' & (`nquantiles' < 2) ) { + di as err "{opt nquantiles()} must be greater than or equal to 2" + local early_rc = 198 + } + + foreach quant of local quantiles { + if ( `quant' < 0 ) | ( `quant' > 100 ) { + di as err "{opt quantiles()} must all be strictly" /// + " between 0 and 100" + local early_rc = 198 + } + if ( `quant' == 0 ) | ( `quant' == 100 ) { + di as err "{opt quantiles()} cannot be 0 or 100" /// + " (note: try passing option {opt minmax})" + local early_rc = 198 + } + } + + local xgen_ix = ( "`namelist'" != "" ) + local xgen_p = ( "`pctile'" != "" ) + local xgen_gp = ( "`genp'" != "" ) + local xgen_bf = ( "`binfreqvar'" != "" ) + local xgen_tot = `xgen_p' + `xgen_gp' + `xgen_bf' + + local xgen_required = `xhow_cutvars' + `xhow_qvars' + local xgen_any = `xgen_ix' | `xgen_p' | `xgen_gp' | `xgen_bf' + if ( (`xgen_required' > 0) & !(`xgen_any') ) { + if ( "`cutpoints'" != "" ) local olist "cutpoints()" + if ( "`cutquantiles'" != "" ) local olist "cutquantiles()" + di as err "Option {opt `olist'} requires xtile or pctile" + local early_rc = 198 + } + + local xbin_any = ("`binfreq'" != "") & ("`binfreqvar'" == "") + if ( (`xgen_required' > 0) & `xbin_any' ) { + if ( "`cutpoints'" != "" ) local olist "cutpoints()" + if ( 
"`cutquantiles'" != "" ) local olist "cutquantiles()" + di as err "{opt binfreq} not allowed with {opt `olist'};" /// + " try {opth binfreq(newvarname)}" + local early_rc = 198 + } + + if ( ("`cutoffs'" != "") & ("`binfreq'" == "") & !(`xgen_any') ) { + di as err "Nothing to do: Option {opt cutoffs()} requires" /// + " {opt binfreq}, {opt xtile}, or {opt pctile}" + local early_rc = 198 + } + + local xgen_maxdata = `xgen_p' | `xgen_gp' | `xgen_bf' + if ( (`nquantiles' > `=_N + 1') & `xgen_maxdata' ) { + di as err "{opt nquantiles()} must be less than or equal to" /// + " `=_N +1' (# obs + 1) with {opt pctile()} or {opt binfreq()}" + local early_rc = 198 + } + + if ( (`=scalar(__gtools_xtile_nq2)' > `=_N') & `xgen_maxdata' ) { + di as err "Number of {opt quantiles()} must be" /// + " less than or equal to `=_N' (# obs)" /// + " with options {opt pctile()} or {opt binfreq()}" + local early_rc = 198 + } + + if ( (`=scalar(__gtools_xtile_ncuts)' > `=_N') & `xgen_maxdata' ) { + di as err "Number of {opt cutoffs()} must be " /// + " less than or equal to `=_N' (# obs)" /// + " with options {opt pctile()} or {opt binfreq()}" + local early_rc = 198 + } + + if ( `early_rc' ) { + clean_all `early_rc' + exit `early_rc' + } + + scalar __gtools_xtile_xvars = `:list sizeof xsources' + + scalar __gtools_xtile_nq = `nquantiles' + scalar __gtools_xtile_cutvars = `:list sizeof cutpoints' + scalar __gtools_xtile_qvars = `:list sizeof cutquantiles' + + scalar __gtools_xtile_gen = `xgen_ix' + scalar __gtools_xtile_pctile = `xgen_p' + scalar __gtools_xtile_genpct = `xgen_gp' + scalar __gtools_xtile_pctpct = `xgen_bf' + + scalar __gtools_xtile_altdef = ( "`altdef'" != "" ) + scalar __gtools_xtile_missing = ( "`xmissing'" != "" ) + scalar __gtools_xtile_strict = ( "`strict'" != "" ) + scalar __gtools_xtile_min = ( "`minmax'" != "" ) + scalar __gtools_xtile_max = ( "`minmax'" != "" ) + scalar __gtools_xtile_method = `method' + scalar __gtools_xtile_bincount = ( "`binfreq'" != "" ) + scalar 
__gtools_xtile__pctile = ( "`_pctile'" != "" ) + scalar __gtools_xtile_dedup = ( "`dedup'" != "" ) + scalar __gtools_xtile_cutifin = ( "`cutifin'" != "" ) + scalar __gtools_xtile_cutby = ( "`cutby'" != "" ) + + cap noi check_matsize, nvars(`=scalar(__gtools_xtile_nq2)') + if ( _rc ) { + local rc = _rc + di as err _n(1) "Note: bypass matsize and specify quantiles" /// + " using a variable via {opt cutquantiles()}" + clean_all `rc' + exit `rc' + } + + cap noi check_matsize, nvars(`=scalar(__gtools_xtile_ncuts)') + if ( _rc ) { + local rc = _rc + di as err _n(1) "Note: bypass matsize and specify cutoffs" /// + " using a variable via {opt cutpoints()}" + clean_all `rc' + exit `rc' + } + + * I don't think it's possible to preserve numerical precision + * with numlist. And I asked... + * + * https://stackoverflow.com/questions/47336278 + * https://www.statalist.org/forums/forum/general-stata-discussion/general/1418513 + * + * Hance I should have added other ways to request quantiles: + * + * - cutquantiles + * - quantmatrix + * + * and other ways to request cut points: + * + * - cutoffs + * - cutmatrix + + scalar __gtools_xtile_imprecise = 0 + matrix __gtools_xtile_quantbin = /// + J(1, cond(`xhow_nq2', `=scalar(__gtools_xtile_nq2)', 1), 0) + matrix __gtools_xtile_cutbin = /// + J(1, cond(`xhow_cuts', `=scalar(__gtools_xtile_ncuts)', 1), 0) + + if ( `xhow_nq2' & ("`quantiles'" != "") & ("`quantmatrix'" == "") ) { + matrix __gtools_xtile_quantiles = /// + J(1, cond(`xhow_nq2', `=scalar(__gtools_xtile_nq2)', 1), 0) + + local k = 0 + foreach quant of numlist `quantiles' { + local ++k + matrix __gtools_xtile_quantiles[1, `k'] = `quant' + if ( strpos("`quant'", ".") & (length("`quant'") >= 13) & ("`altdef'" == "") ) { + scalar __gtools_xtile_imprecise = 1 + } + } + if ( `=scalar(__gtools_xtile_imprecise)' ) { + disp as err "Warning: Loss of numerical precision" /// + " with option {opth quantiles(numlist)}." 
/// + _n(1) "Stata's numlist truncates decimals with" /// + " more than 13 significant digits." /// + _n(1) "Consider using {cmd:altdef} or " /// + " {opth quantmatrix(name)}." + } + } + + if ( `xhow_cuts' & ("`cutoffs'" != "") & ("`cutmatrix'" == "") ) { + matrix __gtools_xtile_cutoffs = /// + J(1, cond(`xhow_cuts', `=scalar(__gtools_xtile_ncuts)', 1), 0) + + local k = 0 + foreach cut of numlist `cutoffs' { + local ++k + matrix __gtools_xtile_cutoffs[1, `k'] = `cut' + if ( strpos("`cut'", ".") & (length("`cut'") >= 13) ) { + scalar __gtools_xtile_imprecise = 1 + } + } + if ( `=scalar(__gtools_xtile_imprecise)' ) { + disp as err "Warning: Loss of numerical precision" /// + " with option {opth cutoffs(numlist)}." /// + _n(1) "Stata's numlist truncates decimals with" /// + " more than 13 significant digits." /// + _n(1) "Consider using {cmd:altdef} or " /// + " {opth cutmatrix(name)}." + } + } + + * So, I don't really know why I imposed this restriction or + * why I thought it was a good idea. If you request binfreq + * you should get the matrix, and you should only not get it + * if the number of quantiles is not allowed by matsize... + * But throughout the code I consistently only allow either + * binfreq OR binfreqvar! 
+ + local xbin_any = ("`binfreq'" != "") & ("`binfreqvar'" == "") + if ( (`nquantiles' > 0) & `xbin_any' ) { + cap noi check_matsize, nvars(`=`nquantiles' - 1') + if ( _rc ) { + local rc = _rc + di as err _n(1) "Note: You can bypass matsize and" /// + " save binfreq to a variable via binfreq()" + clean_all `rc' + exit `rc' + } + matrix __gtools_xtile_quantbin = /// + J(1, max(`=scalar(__gtools_xtile_nq2)', `nquantiles' - 1), 0) + local __gtools_xtile_nq_extra bin + } + else if ( "`binfreq'" != "" ) { + disp as txt "(option binfreq ignored)" + } + + if ( (`nquantiles' > 0) & ("`_pctile'" != "") ) { + cap noi check_matsize, nvars(`=`nquantiles' - 1') + if ( _rc ) { + local rc = _rc + di as err _n(1) "Note: You can bypass matsize and" /// + " save quantiles to a variable via pctile()" + clean_all `rc' + exit `rc' + } + matrix __gtools_xtile_quantiles = /// + J(1, max(`=scalar(__gtools_xtile_nq2)', `nquantiles' - 1), 0) + local __gtools_xtile_nq_extra `__gtools_xtile_nq_extra' quantiles + } + else if ( (`=scalar(__gtools_xtile_nq2)' > 0) & ("`_pctile'" != "") ) { + * matsize for nq2 was already checked + } + else if ( "`_pctile'" != "" ) { + disp as txt "(option _pctile ignored)" + } + + scalar __gtools_xtile_size = `nquantiles' + scalar __gtools_xtile_size = /// + max(__gtools_xtile_size, __gtools_xtile_nq2 + 1) + scalar __gtools_xtile_size = /// + max(__gtools_xtile_size, __gtools_xtile_ncuts + 1) + scalar __gtools_xtile_size = /// + max(__gtools_xtile_size, cond(__gtools_xtile_cutvars, `=_N+1', 1)) + scalar __gtools_xtile_size = /// + max(__gtools_xtile_size, cond(__gtools_xtile_qvars, `=_N+1', 1)) + + local toadd 0 + qui mata: __gtools_xtile_addlab = J(1, 0, "") + qui mata: __gtools_xtile_addnam = J(1, 0, "") + foreach xgen in xgen_ix xgen_p xgen_gp xgen_bf { + if ( ``xgen'' > 0 ) { + if ( "`xgen'" == "xgen_ix" ) { + if ( `=scalar(__gtools_xtile_size)' < maxbyte() ) { + local qtype byte + } + else if ( `=scalar(__gtools_xtile_size)' < maxint() ) { + local qtype int 
+ } + else if ( `=scalar(__gtools_xtile_size)' < maxlong() ) { + local qtype long + } + else local qtype double + local qvar `namelist' + } + else { + if ( "`:type `xsources''" == "double" ) local qtype double + else local qtype: set type + + if ( "`xgen'" == "xgen_p" ) local qvar `pctile' + if ( "`xgen'" == "xgen_gp" ) local qvar `genp' + if ( "`xgen'" == "xgen_bf" ) { + if ( "`wvar'" == "" ) { + if ( `=_N' < maxbyte() ) { + local qtype byte + } + else if ( `=_N' < maxint() ) { + local qtype int + } + else if ( `=_N' < maxlong() ) { + local qtype long + } + else local qtype double + } + else local qtype double + local qvar `binfreqvar' + } + } + cap confirm new var `qvar' + if ( _rc & ("`replace'" == "") ) { + di as err "Variable `qvar' exists with no replace." + clean_all 198 + exit 198 + } + else if ( _rc & ("`replace'" != "") & ("`init'" == "") ) { + qui replace `qvar' = . + } + else if ( _rc == 0 ) { + local ++toadd + mata: __gtools_xtile_addlab = __gtools_xtile_addlab, "`qtype'" + mata: __gtools_xtile_addnam = __gtools_xtile_addnam, "`qvar'" + } + } + } + + if ( `toadd' > 0 ) { + qui mata: st_addvar(__gtools_xtile_addlab, __gtools_xtile_addnam) + } + + * This is superseded by the replace qvar above: + * scalar __gtools_init_targ = (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" == "") + if ( (`"`ifin'"' != "") & ("`replace'" != "") & ("`init'" != "") ) NoInitWarning + + local msg "Parsed quantiles and added targets" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') + } + else local gcall `gfunction' + + local plugvars `byvars' `etargets' `extravars' `level_targets' + local plugvars `plugvars' `statvars' `contractvars' `xvars' + local plugvars `plugvars' `reshapevars' `regressvars' + + scalar __gtools_weight_pos = `:list sizeof plugvars' + 1 + cap noi plugin call gtools_plugin `plugvars' `wvar' `ifin', `gcall' + local rc = _rc + cap noi rc_dispatch `byvars', rc(`=_rc') `opts' + if ( _rc ) { + local rc = _rc + clean_all `rc' + exit `rc' + } + + 
local msg "C plugin runtime" + gtools_timer info `t98' `"`msg'"', prints(`benchmark') off + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:_gtools_internal/`gfunction'} (debug level `debug_level')" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" gcall: `gcall'"' + disp as txt `""' + disp as txt `" contractvars: `contractvars'"' + disp as txt `" statvars: `statvars'"' + disp as txt `""' + disp as txt `" nolocalvar: `nolocalvar'"' + disp as txt `" freq: `freq'"' + disp as txt `" store: `store'"' + disp as txt `""' + disp as txt `" ntop: `ntop'"' + disp as txt `" pct: `pct'"' + disp as txt `" freq: `freq'"' + disp as txt `" misslab: `misslab'"' + disp as txt `" otherlab: `otherlab'"' + disp as txt `" groupmiss: `groupmiss'"' + disp as txt `" nrows: `nrows'"' + disp as txt `""' + disp as txt `" xvars: `xvars'"' + disp as txt `" xsources: `xsources'"' + disp as txt `" nquantiles: `nquantiles'"' + disp as txt `" quantiles: `quantiles'"' + disp as txt `" cutoffs: `cutoffs'"' + disp as txt `" quantmatrix: `quantmatrix'"' + disp as txt `" cutmatrix: `cutmatrix'"' + disp as txt `" cutpoints: `cutpoints'"' + disp as txt `" cutquantiles: `cutquantiles'"' + disp as txt `" pctile: `pctile'"' + disp as txt `" genp: `genp'"' + disp as txt `" binfreqvar: `binfreqvar'"' + disp as txt `" replace: `replace'"' + disp as txt `" returnlimit: `returnlimit'"' + disp as txt `" dedup: `dedup'"' + disp as txt `" cutifin: `cutifin'"' + disp as txt `" cutby: `cutby'"' + disp as txt `" _pctile: `_pctile'"' + disp as txt `" binfreq: `binfreq'"' + disp as txt `" method: `method'"' + disp as txt `" xmissing: `xmissing'"' + disp as txt `" altdef: `altdef'"' + disp as txt `" strict: `strict'"' + disp as txt `" minmax: `minmax'"' + disp as txt `""' + disp as txt `" xhow_nq: `xhow_nq'"' + disp as txt `" xhow_cutvars: `xhow_cutvars'"' + disp as txt `" xhow_qvars: `xhow_qvars'"' + disp as txt `" xhow_total: `xhow_total'"' + disp as txt `" xhow_cuts: `xhow_cuts'"' + disp as 
txt `" xhow_nq2: `xhow_nq2'"' + disp as txt `" xgen_ix: `xgen_ix'"' + disp as txt `" xgen_p: `xgen_p'"' + disp as txt `" xgen_gp: `xgen_gp'"' + disp as txt `" xgen_bf: `xgen_bf'"' + disp as txt `" xgen_tot: `xgen_tot'"' + disp as txt `" xgen_required: `xgen_required'"' + disp as txt `" xgen_any: `xgen_any'"' + disp as txt `" xbin_any: `xbin_any'"' + disp as txt `" xgen_maxdata: `xgen_maxdata'"' + disp as txt `""' + + cap matrix list __gtools_contract_which + cap matrix list __gtools_xtile_cutoffs + cap matrix list __gtools_xtile_quantbin + cap matrix list __gtools_xtile_cutbin + cap matrix list __gtools_xtile_quantiles + + cap scalar list __gtools_top_nrows + cap scalar list __gtools_top_ntop + cap scalar list __gtools_top_pct + cap scalar list __gtools_top_freq + cap scalar list __gtools_top_mataname + cap scalar list __gtools_top_matasave + cap scalar list __gtools_top_silent + cap scalar list __gtools_top_vlab + cap scalar list __gtools_top_invert + cap scalar list __gtools_top_alpha + cap scalar list __gtools_top_miss + cap scalar list __gtools_top_groupmiss + cap scalar list __gtools_top_other + cap scalar list __gtools_top_lmiss + cap scalar list __gtools_top_lother + cap scalar list __gtools_top_Jmiss + cap scalar list __gtools_top_Jother + + cap scalar list __gtools_xtile_xvars + cap scalar list __gtools_xtile_nq + cap scalar list __gtools_xtile_nq2 + cap scalar list __gtools_xtile_cutvars + cap scalar list __gtools_xtile_qvars + cap scalar list __gtools_xtile_gen + cap scalar list __gtools_xtile_ncuts + cap scalar list __gtools_xtile_pctile + cap scalar list __gtools_xtile_genpct + cap scalar list __gtools_xtile_pctpct + cap scalar list __gtools_xtile_altdef + cap scalar list __gtools_xtile_missing + cap scalar list __gtools_xtile_strict + cap scalar list __gtools_xtile_min + cap scalar list __gtools_xtile_max + cap scalar list __gtools_xtile_method + cap scalar list __gtools_xtile_bincount + cap scalar list __gtools_xtile__pctile + cap scalar list 
__gtools_xtile_dedup + cap scalar list __gtools_xtile_cutifin + cap scalar list __gtools_xtile_cutby + cap scalar list __gtools_xtile_imprecise + cap scalar list __gtools_xtile_size + cap scalar list __gtools_weight_pos + } + } + + local msg "Internal gtools runtime`runtxt'" + gtools_timer info `t99' `"`msg'"', prints(`benchmark') off + + * Return values + * ------------- + + * generic + if ( `rset' ) { + return scalar N = `r_N' + return scalar J = `r_J' + return scalar minJ = `r_minJ' + return scalar maxJ = `r_maxJ' + } + + return scalar kvar = `=scalar(__gtools_kvars)' + return scalar knum = `=scalar(__gtools_kvars_num)' + return scalar kint = `=scalar(__gtools_kvars_int)' + return scalar kstr = `=scalar(__gtools_kvars_str)' + return scalar kstrL = `=scalar(__gtools_kvars_strL)' + + return local byvars = "`byvars'" + return local bynum = "`bynum'" + return local bystr = "`bystr'" + + * gstats + if ( inlist("`gfunction'", "stats") ) { + return scalar gstats_winsor_cutlow = __gtools_winsor_cutl + return scalar gstats_winsor_cuthigh = __gtools_winsor_cuth + + if ( `=scalar(__gtools_gstats_code)' == 2 ) { + if ( `=scalar(__gtools_summarize_matasave)' ) { + mata: `GstatsMataSave' = __gstats_summarize_results() + disp as txt _n "(note: raw results saved in `GstatsMataSave';" /* + */ " see {stata mata `GstatsMataSave'.desc()})" + } + else { + mata: (void) __gstats_summarize_results() + cap mata: mata drop `GstatsMataSave' + } + } + + tempname ghdfeabsmatrix + if ( `=scalar(__gtools_gstats_code)' == 4 ) { + return scalar hdfe_nonmiss = `=scalar(__gtools_hdfe_nonmiss)' + if ( ("`byvars'" != "") & `=scalar(__gtools_hdfe_matasave)' ) { + + mata: `=scalar(__gtools_hdfe_mataname)' = GtoolsByLevels() + mata: `=scalar(__gtools_hdfe_mataname)'.whoami = st_strscalar("__gtools_hdfe_mataname") + mata: `=scalar(__gtools_hdfe_mataname)'.caller = "gstats hdfe" + mata: `=scalar(__gtools_hdfe_mataname)'.read(`""', 0) + + mata: `ghdfeabsmatrix' = 
GtoolsReadMatrix(st_local("ghdfeabsfile"), /* + */ 1 + st_numscalar("__gtools_hdfe_absorb"), `=scalar(__gtools_hdfe_mataname)'.J) + mata: `=scalar(__gtools_hdfe_mataname)'.nj = `ghdfeabsmatrix'[1, .]' + mata: `=scalar(__gtools_hdfe_mataname)'.njabsorb = colshape( /* + */ `ghdfeabsmatrix'[2::rows(`ghdfeabsmatrix'), .], st_numscalar("__gtools_hdfe_absorb")) + + disp as txt "(note: by() info saved in `=scalar(__gtools_hdfe_mataname)';" /* + */ " see {stata mata `=scalar(__gtools_hdfe_mataname)'.desc()})" + } + else if ( ("`byvars'" == "") & `=scalar(__gtools_hdfe_matasave)' ) { + disp as txt "(warning: matasave() without by() is ignored)" + } + + return local hdfe_method = "`=scalar(__gtools_hdfe_methodname)'" + return scalar hdfe_saveinfo = 0 + return scalar hdfe_saveabs = 0 + if ( "`byvars'" == "" ) { + return scalar hdfe_saveabs = 1 + return matrix hdfe_nabsorb = __gtools_hdfe_nabsorb + + return scalar hdfe_saveinfo = 1 + if ( "`=scalar(__gtools_hdfe_methodname)'" == "direct" ) { + return scalar hdfe_iter = 1 + return scalar hdfe_feval = 1 + } + else { + return scalar hdfe_iter = __gtools_hdfe_iter + return scalar hdfe_feval = __gtools_hdfe_feval + } + } + } + + if ( `=scalar(__gtools_summarize_pooled)' ) { + return local statvars: copy local statvars + } + + return scalar gstats_summarize_pooled = __gtools_summarize_pooled + return scalar gstats_summarize_normal = __gtools_summarize_normal + return scalar gstats_summarize_detail = __gtools_summarize_detail + return scalar gstats_summarize_tabstat = __gtools_summarize_tabstat + + return scalar gstats_summarize_N = __gtools_summarize_N + return scalar gstats_summarize_sum_w = __gtools_summarize_sum_w + return scalar gstats_summarize_sum = __gtools_summarize_sum + return scalar gstats_summarize_mean = __gtools_summarize_mean + return scalar gstats_summarize_min = __gtools_summarize_min + return scalar gstats_summarize_max = __gtools_summarize_max + return scalar gstats_summarize_Var = __gtools_summarize_Var + return 
scalar gstats_summarize_sd = __gtools_summarize_sd + return scalar gstats_summarize_p1 = __gtools_summarize_p1 + return scalar gstats_summarize_p5 = __gtools_summarize_p5 + return scalar gstats_summarize_p10 = __gtools_summarize_p10 + return scalar gstats_summarize_p25 = __gtools_summarize_p25 + return scalar gstats_summarize_p50 = __gtools_summarize_p50 + return scalar gstats_summarize_p75 = __gtools_summarize_p75 + return scalar gstats_summarize_p90 = __gtools_summarize_p90 + return scalar gstats_summarize_p95 = __gtools_summarize_p95 + return scalar gstats_summarize_p99 = __gtools_summarize_p99 + return scalar gstats_summarize_skewness = __gtools_summarize_skewness + return scalar gstats_summarize_kurtosis = __gtools_summarize_kurtosis + return scalar gstats_summarize_smallest1 = __gtools_summarize_smallest1 + return scalar gstats_summarize_smallest2 = __gtools_summarize_smallest2 + return scalar gstats_summarize_smallest3 = __gtools_summarize_smallest3 + return scalar gstats_summarize_smallest4 = __gtools_summarize_smallest4 + return scalar gstats_summarize_largest4 = __gtools_summarize_largest4 + return scalar gstats_summarize_largest3 = __gtools_summarize_largest3 + return scalar gstats_summarize_largest2 = __gtools_summarize_largest2 + return scalar gstats_summarize_largest1 = __gtools_summarize_largest1 + } + + * levelsof + if ( inlist("`gfunction'", "levelsof", "top") & `=scalar(__gtools_levels_return)' ) { + cap disp `"`vals'"' + if ( _rc ) { + error _rc + } + return local levels: copy local vals + return local sep: copy local sep + return local colsep: copy local colsep + } + + if ( inlist("`gfunction'", "levelsof") ) { + if ( `=scalar(__gtools_levels_matasave)' ) { + mata: `=scalar(__gtools_levels_mataname)' = GtoolsByLevels() + mata: `=scalar(__gtools_levels_mataname)'.whoami = st_strscalar("__gtools_levels_mataname") + mata: `=scalar(__gtools_levels_mataname)'.caller = "glevelsof" + mata: `=scalar(__gtools_levels_mataname)'.read( /* + */ 
st_numscalar("__gtools_levels_silent")? `""': (`numfmt_empty'? "%16.0g": `"`numfmt'"'), 1) + + disp as txt "(note: raw levels saved in `=scalar(__gtools_levels_mataname)';" /* + */ " see {stata mata `=scalar(__gtools_levels_mataname)'.desc()})" + } + } + + * top matrix + if ( inlist("`gfunction'", "top") ) { + if ( `=scalar(__gtools_top_matasave)' ) { + mata: `=scalar(__gtools_top_mataname)' = GtoolsByLevels() + mata: `=scalar(__gtools_top_mataname)'.whoami = st_strscalar("__gtools_top_mataname") + mata: `=scalar(__gtools_top_mataname)'.caller = "gtop" + + mata: `=scalar(__gtools_top_mataname)'.read( /* + */ st_numscalar("__gtools_top_silent")? `""': `"`numfmt'"', /* + */ st_numscalar("__gtools_top_vlab")) + + c_local _post_msg_gtop_matanote /* + */ (note: raw levels saved in `=scalar(__gtools_top_mataname)'; /* + */ see {stata mata `=scalar(__gtools_top_mataname)'.desc()}) + + mata `=scalar(__gtools_top_mataname)'.toplevels = GtoolsReadMatrix( /* + */ st_local("gtopmatfile"), /* + */ st_numscalar("__gtools_top_nrows"), 5) + } + else { + if ( `=scalar(__gtools_top_ntop)' > `c(matsize)' ) { + c_local _post_msg_gtop_matawarn /* + */ {bf:performance warning:} # levels > matsize /* + */ (`=scalar(__gtools_top_ntop)' > `c(matsize)'); try option -mata- + } + + mata __gtools_top_matrix = GtoolsReadMatrix( /* + */ st_local("gtopmatfile"), /* + */ st_numscalar("__gtools_top_nrows"), 5) + + mata __gtools_top_num = GtoolsReadMatrix( /* + */ st_local("gtopnumfile"), /* + */ st_numscalar("__gtools_top_ntop"), /* + */ st_numscalar("__gtools_kvars_num")) + } + + return scalar alpha = __gtools_top_alpha + return scalar ntop = __gtools_top_ntop + return scalar nrows = __gtools_top_nrows + return scalar Jmiss = __gtools_top_Jmiss + return scalar Jother = __gtools_top_Jother + + * return matrix toplevels = __gtools_top_matrix + * return matrix numlevels = __gtools_top_num + } + + * regress results + if ( inlist("`gfunction'", "regress") ) { + if ( scalar(__gtools_gregress_savemata) ) 
{ + mata: `saveGregressMata'.readMatrices() + mata: `saveGregressMata'.ByLevels = GtoolsByLevels() + mata: `saveGregressMata'.ByLevels.whoami = "ByLevels" + mata: `saveGregressMata'.ByLevels.caller = `"`caller'"' + mata: `saveGregressMata'.ByLevels.read("%16.0g", 1) + c_local saveGregressMata: copy local saveGregressMata + disp as txt "Results in `saveGregressMata'; see {stata mata `saveGregressMata'.desc()}" + } + } + + * quantile info + if ( inlist("`gfunction'", "quantiles") ) { + return local quantiles = "`quantiles'" + return local cutoffs = "`cutoffs'" + return local nqextra = "`__gtools_xtile_nq_extra'" + return local Nxvars = scalar(__gtools_xtile_xvars) + + return scalar min = scalar(__gtools_xtile_min) + return scalar max = scalar(__gtools_xtile_max) + return scalar method_ratio = scalar(__gtools_xtile_method) + return scalar imprecise = scalar(__gtools_xtile_imprecise) + + return scalar nquantiles = scalar(__gtools_xtile_nq) + return scalar nquantiles2 = scalar(__gtools_xtile_nq2) + return scalar ncutpoints = scalar(__gtools_xtile_cutvars) + return scalar ncutoffs = scalar(__gtools_xtile_ncuts) + return scalar nquantpoints = scalar(__gtools_xtile_qvars) + + return matrix quantiles_used = __gtools_xtile_quantiles + return matrix quantiles_bincount = __gtools_xtile_quantbin + return matrix cutoffs_used = __gtools_xtile_cutoffs + return matrix cutoffs_bincount = __gtools_xtile_cutbin + } + + return matrix invert = __gtools_invert + clean_all 0 + exit 0 +end + +*********************************************************************** +* hashsort * +*********************************************************************** + +capture program drop hashsort_inner +program hashsort_inner, sortpreserve + syntax varlist [in], benchmark(int) [invertinmata] + cap noi plugin call gtools_plugin `varlist' `_sortindex' `in', hashsort + if ( _rc ) exit _rc + if ( "`invertinmata'" != "" ) { + mata: st_store(., "`_sortindex'", invorder(st_data(., "`_sortindex'"))) + } + * else 
{ + * mata: st_store(., "`_sortindex'", st_data(., "`_sortindex'")) + * } + + c_local r_N = `r_N' + c_local r_J = `r_J' + c_local r_minJ = `r_minJ' + c_local r_maxJ = `r_maxJ' + + local msg "C plugin runtime" + gtools_timer info ${GTOOLS_T98} `"`msg'"', prints(`benchmark') +end + +*********************************************************************** +* Cleanup * +*********************************************************************** + +capture program drop clean_all +program clean_all + args rc + if ( `"`rc'"' == "" ) local rc = 0 + + foreach f of global GTOOLS_TEMPFILES_INTERNAL { + cap erase `"${GTOOLS_TEMPDIR}/`f'"' + } + global GTOOLS_TEMPFILES_INTERNAL + global GTOOLS_TEMPFILES_INTERNAL_I + + set varabbrev ${GTOOLS_USER_INTERNAL_VARABBREV} + global GTOOLS_USER_INTERNAL_VARABBREV + global GTOOLS_GREG_FILE + global GTOOLS_GREGB_FILE + global GTOOLS_GREGSE_FILE + global GTOOLS_GREGVCOV_FILE + global GTOOLS_GREGCLUS_FILE + global GTOOLS_GREGABS_FILE + global GTOOLS_GHDFEABS_FILE + global GTOOLS_GSTATS_FILE + global GTOOLS_BYVAR_FILE + global GTOOLS_BYCOL_FILE + global GTOOLS_BYNUM_FILE + global GTOOLS_GTOPNUM_FILE + global GTOOLS_GTOPMAT_FILE + global GTOOLS_BYNAMES + + cap scalar drop __gtools_gfile_byvar + cap scalar drop __gtools_gfile_bycol + cap scalar drop __gtools_gfile_bynum + cap scalar drop __gtools_gfile_topnum + cap scalar drop __gtools_gfile_topmat + cap scalar drop __gtools_gfile_gregb + cap scalar drop __gtools_gfile_gregse + cap scalar drop __gtools_gfile_gregclus + cap scalar drop __gtools_gfile_gregabs + cap scalar drop __gtools_gfile_hdfeabs + cap scalar drop __gtools_init_targ + cap scalar drop __gtools_any_if + cap scalar drop __gtools_verbose + cap scalar drop __gtools_debug + cap scalar drop __gtools_benchmark + cap scalar drop __gtools_countonly + cap scalar drop __gtools_seecount + cap scalar drop __gtools_unsorted + cap scalar drop __gtools_invertix + cap scalar drop __gtools_nomiss + cap scalar drop __gtools_keepmiss + cap scalar drop 
__gtools_missing + cap scalar drop __gtools_hash + cap scalar drop __gtools_encode + cap scalar drop __gtools_replace + cap scalar drop __gtools_countmiss + cap scalar drop __gtools_skipcheck + cap scalar drop __gtools_mlast + cap scalar drop __gtools_subtract + cap scalar drop __gtools_ctolerance + cap scalar drop __gtools_hash_method + cap scalar drop __gtools_weight_code + cap scalar drop __gtools_weight_pos + cap scalar drop __gtools_weight_sel + cap scalar drop __gtools_nunique + + cap scalar drop __gtools_top_nrows + cap scalar drop __gtools_top_ntop + cap scalar drop __gtools_top_pct + cap scalar drop __gtools_top_freq + cap scalar drop __gtools_top_mataname + cap scalar drop __gtools_top_matasave + cap scalar drop __gtools_top_silent + cap scalar drop __gtools_top_vlab + cap scalar drop __gtools_top_invert + cap scalar drop __gtools_top_alpha + cap scalar drop __gtools_top_miss + cap scalar drop __gtools_top_groupmiss + cap scalar drop __gtools_top_other + cap scalar drop __gtools_top_lmiss + cap scalar drop __gtools_top_lother + cap scalar drop __gtools_top_Jmiss + cap scalar drop __gtools_top_Jother + cap matrix drop __gtools_contract_which + + cap scalar drop __gtools_levels_mataname + cap scalar drop __gtools_levels_matasave + cap scalar drop __gtools_levels_silent + cap scalar drop __gtools_levels_return + cap scalar drop __gtools_levels_gen + cap scalar drop __gtools_levels_replace + + cap scalar drop __gtools_xtile_xvars + cap scalar drop __gtools_xtile_nq + cap scalar drop __gtools_xtile_nq2 + cap scalar drop __gtools_xtile_cutvars + cap scalar drop __gtools_xtile_ncuts + cap scalar drop __gtools_xtile_qvars + cap scalar drop __gtools_xtile_gen + cap scalar drop __gtools_xtile_pctile + cap scalar drop __gtools_xtile_genpct + cap scalar drop __gtools_xtile_pctpct + cap scalar drop __gtools_xtile_altdef + cap scalar drop __gtools_xtile_missing + cap scalar drop __gtools_xtile_strict + cap scalar drop __gtools_xtile_min + cap scalar drop 
__gtools_xtile_max + cap scalar drop __gtools_xtile_method + cap scalar drop __gtools_xtile_bincount + cap scalar drop __gtools_xtile__pctile + cap scalar drop __gtools_xtile_dedup + cap scalar drop __gtools_xtile_cutifin + cap scalar drop __gtools_xtile_cutby + cap scalar drop __gtools_xtile_imprecise + cap matrix drop __gtools_xtile_quantiles + cap matrix drop __gtools_xtile_cutoffs + cap matrix drop __gtools_xtile_quantbin + cap matrix drop __gtools_xtile_cutbin + cap scalar drop __gtools_xtile_size + + cap scalar drop __gtools_kvars + cap scalar drop __gtools_kvars_num + cap scalar drop __gtools_kvars_int + cap scalar drop __gtools_kvars_str + cap scalar drop __gtools_kvars_strL + + cap scalar drop __gtools_group_data + cap scalar drop __gtools_group_fill + cap scalar drop __gtools_group_val + + cap scalar drop __gtools_cleanstr + cap scalar drop __gtools_sep_len + cap scalar drop __gtools_colsep_len + cap scalar drop __gtools_numfmt_len + cap scalar drop __gtools_numfmt_max + + cap scalar drop __gtools_k_vars + cap scalar drop __gtools_k_targets + cap scalar drop __gtools_k_stats + cap scalar drop __gtools_k_group + + cap scalar drop __gtools_st_time + cap scalar drop __gtools_used_io + cap scalar drop __gtools_ixfinish + cap scalar drop __gtools_J + + cap matrix drop __gtools_weight_smat + cap matrix drop __gtools_invert + cap matrix drop __gtools_bylens + cap matrix drop __gtools_strL + cap matrix drop __gtools_numpos + cap matrix drop __gtools_strpos + + cap matrix drop __gtools_group_targets + cap matrix drop __gtools_group_init + + cap matrix drop __gtools_stats + cap matrix drop __gtools_pos_targets + + gregress_scalars drop + gstats_scalars drop + greshape_scalars drop `_keepgreshape' + + * NOTE(mauricio): You had the urge to make sure you were dropping + * variables at one point. Don't. This is fine for gquantiles but not so + * with gegen or gcollapse. 
In the case of gcollapse, if the user ran w/o + * fast then they were willing to leave the data in a bad state in case + * there was an error. In the case of gegen, the main variable is a dummy + * that is renamed later on. + + * On a non-zero rc, try to drop the variables recorded in these mata objects + if ( `rc' ) { + cap mata: st_dropvar(__gtools_xtile_addnam) + cap mata: st_dropvar(__gtools_level_targets) + * cap mata: st_dropvar(__gtools_togen_names[__gtools_togen_s]) + * cap mata: st_dropvar(__gtools_gc_addvars) + } + + * Drop the internal mata objects; capture ignores the ones that do not exist + cap mata: mata drop __gtools_togen_k + cap mata: mata drop __gtools_togen_s + + cap mata: mata drop __gtools_togen_types + cap mata: mata drop __gtools_togen_names + + cap mata: mata drop __gtools_xtile_addlab + cap mata: mata drop __gtools_xtile_addnam + + cap mata: mata drop __gtools_level_targets + + * Stop and clear the timers stored in GTOOLS_T99 and GTOOLS_T98, then unset both globals + cap timer off $GTOOLS_T99 + cap timer clear $GTOOLS_T99 + + cap timer off $GTOOLS_T98 + cap timer clear $GTOOLS_T98 + + global GTOOLS_T99 + global GTOOLS_T98 +end + +*********************************************************************** +* Parse by types * +*********************************************************************** + +capture program drop parse_by_types +program parse_by_types, rclass + syntax [anything] [if] [in], [clean_anything(str) compress forcestrl glevelsof(str) ds] + + * ifin holds the if and in clauses joined by a single space + mata st_local("ifin", st_local("if") + " " + st_local("in")) + * No by-variables: zero the invert/bylens/strL matrices and the type counts, + * return empty lists, and stop + if ( "`anything'" == "" ) { + matrix __gtools_invert = 0 + matrix __gtools_bylens = 0 + matrix __gtools_strL = 0 + + return local invert = 0 + return local varlist = "" + return local varnum = "" + return local varstr = "" + return local varstrL = "" + + scalar __gtools_kvars = 0 + scalar __gtools_kvars_int = 0 + scalar __gtools_kvars_num = 0 + scalar __gtools_kvars_str = 0 + scalar __gtools_kvars_strL = 0 + + exit 0 + } + + cap matrix drop __gtools_invert + cap matrix drop __gtools_bylens + cap matrix drop __gtools_strL + + * Parse whether to invert sort order + * ---------------------------------- + + local parse `anything' + local varlist "" + local skip = 0 + local 
invert = 0 + if ( strpos("`anything'", "-") & ("`ds'" == "") ) { + while ( trim("`parse'") != "" ) { + gettoken var parse: parse, p(" -+") + if inlist("`var'", "-", "+") { + local skip = 1 + local invert = ( "`var'" == "-" ) + } + else { + cap ds `var' + if ( _rc ) { + local rc = _rc + di as err "Variable '`var'' does not exist." + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + clean_all `rc' + exit `rc' + } + if ( `skip' ) { + local skip = 0 + foreach var in `r(varlist)' { + matrix __gtools_invert = nullmat(__gtools_invert), /// + `invert' + } + } + else { + foreach var in `r(varlist)' { + matrix __gtools_invert = nullmat(__gtools_invert), 0 + } + } + local varlist `varlist' `r(varlist)' + } + } + } + else { + local varlist `clean_anything' + matrix __gtools_invert = J(1, max(`:list sizeof varlist', 1), 0) + } + + * Compress strL variables if requested + * ------------------------------------ + + * gcollapse, gcontract, greshape, need to write to variables, + * and so cannot support strL variables + + local GTOOLS_CALLER $GTOOLS_CALLER + local GTOOLS_STRL gcollapse gcontract greshape hashsort + local GTOOLS_STRL_FAIL: list GTOOLS_CALLER in GTOOLS_STRL + + * glevelsof, gen() needs to write to variables, and so cannot + * support strL variables + + local varlist_ `varlist' + local anything_ `anything' + local 0, `glevelsof' + syntax, [ /// + noLOCALvar /// + freq(str) /// + store(str) /// + gen(str) /// + silent /// + MATAsave /// + MATAsavename(str) /// + ] + local varlist `varlist_' + local anything `anything_' + + if ( `"`gen'"' != "" ) { + local GTOOLS_CALLER "`GTOOLS_CALLER', gen()" + local GTOOLS_STRL_FAIL = 1 + } + + * Any strL? 
+ local varstrL "" + if ( "`varlist'" != "" ) { + cap confirm variable `varlist' + if ( _rc ) { + di as err "{opt varlist} requried but received: `varlist'" + exit 198 + } + + foreach byvar of varlist `varlist' { + if regexm("`:type `byvar''", "str([1-9][0-9]*|L)") { + if (regexs(1) == "L") { + local varstrL `varstrL' `byvar' + } + } + } + } + + local need_compress = `GTOOLS_STRL_FAIL' | (`c(stata_version)' < 14.1) + if ( ("`varstrL'" != "") & `need_compress' & ("`compress'" != "") ) { + qui compress `varstrL', nocoalesce + } + + local varstrL "" + if ( "`varlist'" != "" ) { + cap confirm variable `varlist' + if ( _rc ) { + di as err "{opt varlist} requried but received: `varlist'" + exit 198 + } + + foreach byvar of varlist `varlist' { + if regexm("`:type `byvar''", "str([1-9][0-9]*|L)") { + if (regexs(1) == "L") { + local varstrL `varstrL' `byvar' + } + } + } + } + + local cpass = cond("`GTOOLS_CALLER'" == "gduplicates", "gtools(compress)", "compress") + if ( ("`varstrL'" != "") & `need_compress' & ("`compress'" != "") ) { + if ( `GTOOLS_STRL_FAIL' ) { + disp as err _n(1) "{cmd:`GTOOLS_CALLER'} does not support strL variables. I tried" /// + _n(1) "" /// + _n(1) " {stata compress `varstrL'}" /// + _n(1) "" /// + _n(1) "But these variables could not be recast as str#. This limitation comes" /// + _n(1) "from the Stata Plugin Interface, which does not allow writing to strL" /// + _n(1) "variables from a plugin." + } + else if ( `c(stata_version)' < 14.1 ) { + disp as err _n(1) "gtools for Stata 13 and earlier does not support strL variables. I tried" /// + _n(1) "" /// + _n(1) " {stata compress `varstrL'}" /// + _n(1) "" /// + _n(1) "But these variables could not be compressed as str#. Please note {cmd:gcollapse}," /// + _n(1) " {cmd:gcontract}, {cmd:greshape}, and {cmd:hashsort} do not support strL variables." /// + _n(1) "Further, binary strL variables are not yet supported in any Stata version." 
/// + _n(1) "" /// + _n(1) "However, if your strL variables do not contain binary data, gtools 0.14" /// + _n(1) "and above can read strL variables in Stata 14 or later." + } + exit 17004 + } + else if ( ("`varstrL'" != "") & `need_compress' ) { + if ( `GTOOLS_STRL_FAIL' ) { + disp as err _n(1) "{cmd:`GTOOLS_CALLER'} does not support strL variables. If your strL variables are str#, try" /// + _n(1) "" /// + _n(1) " {stata compress `varstrL'}" /// + _n(1) "" /// + _n(1) "or passing {opt `cpass'} to {opt `GTOOLS_CALLER'}. If this does not work or if you have" /// + _n(1) "have binary data, you will not be able to use {opt `GTOOLS_CALLER'}. This limitation" /// + _n(1) "comes from the Stata Plugin Interface, which does not allow writing to" /// + _n(1) "strL variables from a plugin." + } + else if ( `c(stata_version)' < 14.1 ) { + disp as err _n(1) "gtools for Stata 13 and earlier does not support strL variables. If your" /// + _n(1) "strL variables are string-only, try" /// + _n(1) "" /// + _n(1) " {stata compress `varstrL'}" /// + _n(1) "" /// + _n(1) "or passing {opt `cpass'} to {opt `GTOOLS_CALLER'}. Please note {cmd:gcollapse}, {cmd:gcontract}, " /// + _n(1) "{cmd:greshape}, and {cmd:hashsort} do not support strL variables in any version. Further, binary" /// + _n(1) "strL variables are not yet supported in any Stata version." /// + _n(1) "" /// + _n(1) "However, if your strL variables do not contain binary data, gtools" /// + _n(1) "0.14 and above can read strL variables in Stata 14 or later." + } + exit 17002 + } + else if ( ("`varstrL'" != "") & (`c(stata_version)' >= 14.1) & ("`forcestrl'" == "") ) { + scalar __gtools_k_strL = `:list sizeof varstrL' + cap noi plugin call gtools_plugin `varstrL', checkstrL + if ( _rc ) { + cap scalar drop __gtools_k_strL + disp as err _n(1) "gtools does not yet support binary data in strL variables." 
+ if ( strpos(lower("`c(os)'"), "windows") ) { + disp as txt /// + _n(1) "On some Windows systems Stata detects binary data in strL variables even" /// + _n(1) "when there is none. You can try the experimental option {opt forcestrl} to skip" /// + _n(1) "the binary data check. {opt Forcing gtools to work with binary data gives wrong}" /// + _n(1) "results, so only use this option if you are certain your strL variables" /// + _n(1) "do no contain binary data." + } + exit 17005 + } + cap scalar drop __gtools_k_strL + * disp as txt "(note: performance with strL variables is not optimized)" + } + else if ( ("`varstrL'" != "") & ("`forcestrl'" == "") ) { + disp as err _n(1) "gtools failed to parse strL variables." + exit 17006 + } + + tempvar strlen + if ( "`varstrL'" != "" ) qui gen long `strlen' = . + + * Check how many of each variable type we have + * -------------------------------------------- + + local kint = 0 + local knum = 0 + local kstr = 0 + local kstrL = 0 + local kvars = 0 + + local varint "" + local varnum "" + local varstr "" + local varstrL "" + + if ( "`varlist'" != "" ) { + cap confirm variable `varlist' + if ( _rc ) { + di as err "{opt varlist} requried but received: `varlist'" + exit 198 + } + + foreach byvar of varlist `varlist' { + local ++kvars + if inlist("`:type `byvar''", "byte", "int", "long") { + local ++kint + local ++knum + local varint `varint' `byvar' + local varnum `varnum' `byvar' + matrix __gtools_strL = nullmat(__gtools_strL), 0 + matrix __gtools_bylens = nullmat(__gtools_bylens), 0 + } + else if inlist("`:type `byvar''", "float", "double") { + local ++knum + local varnum `varnum' `byvar' + matrix __gtools_strL = nullmat(__gtools_strL), 0 + matrix __gtools_bylens = nullmat(__gtools_bylens), 0 + } + else { + local ++kstr + local varstr `varstr' `byvar' + if regexm("`:type `byvar''", "str([1-9][0-9]*|L)") { + if (regexs(1) == "L") { + local ++kstrL + local varstrL `varstrL' `byvar' + qui replace `strlen' = length(`byvar') + qui sum 
`strlen', meanonly + matrix __gtools_strL = nullmat(__gtools_strL), 1 + matrix __gtools_bylens = nullmat(__gtools_bylens), /// + `=r(max) + 1' + } + else { + matrix __gtools_strL = nullmat(__gtools_strL), 0 + matrix __gtools_bylens = nullmat(__gtools_bylens), /// + `:di regexs(1)' + } + } + else { + di as err "variable `byvar' has unknown type" /// + " '`:type `byvar'''" + exit 198 + } + } + } + + cap assert `kvars' == `:list sizeof varlist' + if ( _rc ) { + di as err "Error parsing syntax call; variable list was:" /// + _n(1) "`anything'" + exit 198 + } + } + + * Parse which hashing strategy to use + * ----------------------------------- + + scalar __gtools_kvars = `kvars' + scalar __gtools_kvars_int = `kint' + scalar __gtools_kvars_num = `knum' + scalar __gtools_kvars_str = `kstr' + scalar __gtools_kvars_strL = `kstrL' + + * Return hash info + * ---------------- + + return local invert = `invert' + return local varlist = "`varlist'" + return local varnum = "`varnum'" + return local varstr = "`varstr'" + return local varstrL = "`varstrL'" +end + +*********************************************************************** +* Generic hash helpers * +*********************************************************************** + +* confirm_var name [, replace] +* +* Check that -name- can be used as a target variable. Returns r(newvar): +* 1 when -name- passes -confirm new variable-, 0 when that check fails under +* -replace- but -name- is still a valid name. On any other failure it calls +* clean_all and exits with a non-zero return code. +capture program drop confirm_var +program confirm_var, rclass + syntax anything, [replace] + local newvar = 1 + if ( "`replace'" != "" ) { + cap confirm new variable `anything' + if ( _rc ) { + * -confirm new variable- failed; that alone does not tell us the + * variable exists, so make sure the name is valid before + * reporting newvar = 0 to the caller. + cap noi confirm name `anything' + if ( _rc ) { + local rc = _rc + clean_all + exit `rc' + } + local newvar = 0 + } + } + else { + cap confirm new variable `anything' + if ( _rc ) { + * Keep the rc of the failed check; clean_all runs before any message + local rc = _rc + clean_all + cap noi confirm name `anything' + if ( _rc ) { + exit `rc' + } + else { + di as err "Variable `anything' exists;" /// + " try a different name or run with -replace-" + exit `rc' + } + } + } + return scalar newvar = `newvar' + exit 0 +end + +capture program drop rc_dispatch +program rc_dispatch + syntax [varlist], rc(int) oncollision(str) + 
+ local website_url https://github.com/mcaceresb/stata-gtools/issues + local website_disp github.com/mcaceresb/stata-gtools + + if ( `rc' == 17000 ) { + di as err "There may be 128-bit hash collisions!" + di as err `"This is a bug. Please report to"' /// + `" {browse "`website_url'":`website_disp'}"' + if ( "`oncollision'" == "fallback" ) { + exit 17999 + } + else { + exit 17000 + } + } + else if ( `rc' == 17001 ) { + exit 17001 + } + else if ( `rc' == 459 ) { + local kvars : word count `varlist' + local s = cond(`kvars' == 1, "", "s") + di as err "variable`s' `varlist' should never be missing" + exit 459 + } + else if ( `rc' == 17459 ) { + local kvars : word count `varlist' + local var = cond(`kvars'==1, "variable", "variables") + local does = cond(`kvars'==1, "does", "do") + di as err "`var' `varlist' `does' not uniquely" /// + " identify the observations" + exit 459 + } + else { + * error `rc' + exit `rc' + } +end + +* gtools_timer action timer [message] [, prints(#) end off] +* +* action start/on restarts the timer; info stores the elapsed time in +* r(t#) and r(pretty#), shows it when prints() is non-zero, and restarts +* the timer. Options end/off stop and clear the timer. +capture program drop gtools_timer +program gtools_timer, rclass + syntax anything, [prints(int 0) end off] + + * Positional arguments: action, timer number, text shown before the time + tokenize `"`anything'"' + local action `1' + local tid `2' + local prefix `"`3'; "' + + * Timer 0 means no timer was free, so this benchmark is skipped + if ( `tid' == 0 ) exit 0 + + if ( ("`action'" == "start") | ("`action'" == "on") ) { + * Restart the timer from zero + cap timer off `tid' + cap timer clear `tid' + timer on `tid' + } + else if ( "`action'" == "info" ) { + * Read the elapsed time, then restart the timer from zero + timer off `tid' + qui timer list + return scalar t`tid' = `r(t`tid')' + return local pretty`tid' = trim("`:di %21.4gc r(t`tid')'") + if ( `prints' ) { + di `"`prefix'`:di trim("`:di %21.4gc r(t`tid')'")' seconds"' + } + timer off `tid' + timer clear `tid' + timer on `tid' + } + + * Either option stops and clears the timer + if ( ("`end'" != "") | ("`off'" != "") ) { + timer off `tid' + timer clear `tid' + } +end + +capture program drop check_matsize +program check_matsize + syntax [anything], [nvars(int 0)] + if ( `nvars' == 0 ) local nvars `:list sizeof anything' + if ( `nvars' > `c(matsize)' ) { + cap set matsize `=`nvars'' + if ( _rc ) { + di as err 
/// + _n(1) "{bf:# variables > matsize (`nvars' > `c(matsize)').}" /// + _n(2) " {stata set matsize `=`nvars''}" /// + _n(2) "{bf:failed. Try setting matsize manually.}" + exit 908 + } + } +end + +* NOTE(mauricio): Replace does nothing here atm; it shouldn't because +* _gtools_internal expects everything to exist already! +capture program drop parse_targets +program parse_targets + syntax, sources(str) targets(str) stats(str) [replace k_exist(str) KEEPMISSing] + local k_vars = `:list sizeof sources' + local k_targets = `:list sizeof targets' + local k_stats = `:list sizeof stats' + + local uniq_sources: list uniq sources + local uniq_targets: list uniq targets + + cap assert `k_targets' == `k_stats' + if ( _rc ) { + di as err " `k_targets' target(s) require(s) `k_targets' stat(s)," /// + " but user passed `k_stats'" + exit 198 + } + + if ( `k_targets' > 1 ) { + cap assert `k_targets' == `k_vars' + if ( _rc ) { + di as err " `k_targets' targets require `k_targets' sources," /// + " but user passed `k_vars'" + exit 198 + } + } + else if ( `k_targets' == 1 ) { + cap assert `k_vars' > 0 + if ( _rc ) { + di as err "Specify at least one source variable" + exit 198 + } + cap assert `:list sizeof uniq_sources' == `k_vars' + if ( _rc ) { + di as txt "(warning: repeat sources ignored with 1 target)" + } + } + else { + di as err "Specify at least one target" + exit 198 + } + + local stats: subinstr local stats "total" "sum", all + cap assert `:list sizeof uniq_targets' == `k_targets' + if ( _rc ) { + di as err "Cannot specify multiple targets with the same name." + exit 198 + } + + if ( "`k_exist'" != "targets" ) { + foreach var of local uniq_sources { + cap confirm variable `var' + if ( _rc ) { + di as err "Source `var' has to exist." + exit 198 + } + + cap confirm numeric variable `var' + if ( _rc ) { + di as err "Source `var' must be numeric." + exit 198 + } + } + } + + mata: __gtools_stats = J(1, `k_stats', .) 
+ mata: __gtools_pos_targets = J(1, `k_targets', 0) + + cap noi check_matsize `targets' + if ( _rc ) exit _rc + + local keepadd = cond("`keepmissing'" == "", 0, 100) + forvalues k = 1 / `k_targets' { + local src: word `k' of `sources' + local trg: word `k' of `targets' + local st: word `k' of `stats' + + encode_stat_allowed `st' `keepadd' + mata: __gtools_stats[`k'] = `r(statcode)' + + if ( "`k_exist'" != "sources" ) { + cap confirm variable `trg' + if ( _rc ) { + di as err "Target `trg' has to exist." + exit 198 + } + + cap confirm numeric variable `trg' + if ( _rc ) { + di as err "Target `trg' must be numeric." + exit 198 + } + } + + mata: __gtools_pos_targets[`k'] = `:list posof `"`src'"' in uniq_sources' - 1 + } + + scalar __gtools_k_vars = `:list sizeof uniq_sources' + scalar __gtools_k_targets = `k_targets' + scalar __gtools_k_stats = `k_stats' + + c_local __gtools_sources `uniq_sources' + c_local __gtools_targets `targets' + + mata: st_matrix("__gtools_stats", __gtools_stats) + mata: st_matrix("__gtools_pos_targets", __gtools_pos_targets) + + cap mata: mata drop __gtools_stats + cap mata: mata drop __gtools_pos_targets +end + +capture program drop encode_stat_allowed +program encode_stat_allowed, rclass + args st keepadd + local allowed sum /// + nansum /// + mean /// + geomean /// + sd /// + variance /// + cv /// + max /// + min /// + range /// + count /// + median /// + iqr /// + percent /// + first /// + last /// + firstnm /// + lastnm /// + freq /// + semean /// + sebinomial /// + sepoisson /// + nunique /// + nmissing /// + skewness /// + kurtosis /// + gini /// + gini|dropneg /// + gini|keepneg /// + rawsum /// + rawnansum + + encode_aliases `st' + local st `r(stat)' + + if ( `:list st in allowed' ) { + encode_stat `st' `keepadd' + local statcode `r(statcode)' + } + else { + cap noi encode_regex `st' + if ( `r(statcode)' == 0 ) { + disp as err "_gtools_internal/encode_stat_allowed: unknown stat `st'" + error 110 + } + else local statcode `r(statcode)' 
+ local st `r(stat)' + } + + return local statname = `"`st'"' + return scalar statcode = `statcode' +end + +capture program drop encode_stat +program encode_stat, rclass + args stat keepadd + local statcode 0 + if ( "`stat'" == "sum" ) local statcode = -1 - `keepadd' + if ( "`stat'" == "nansum" ) local statcode = -101 + if ( "`stat'" == "mean" ) local statcode = -2 + if ( "`stat'" == "geomean" ) local statcode = -26 + if ( "`stat'" == "sd" ) local statcode = -3 + if ( "`stat'" == "variance" ) local statcode = -23 + if ( "`stat'" == "cv" ) local statcode = -24 + if ( "`stat'" == "max" ) local statcode = -4 + if ( "`stat'" == "min" ) local statcode = -5 + if ( "`stat'" == "range" ) local statcode = -25 + if ( "`stat'" == "count" ) local statcode = -6 + if ( "`stat'" == "percent" ) local statcode = -7 + if ( "`stat'" == "median" ) local statcode = 50 + if ( "`stat'" == "iqr" ) local statcode = -9 + if ( "`stat'" == "first" ) local statcode = -10 + if ( "`stat'" == "firstnm" ) local statcode = -11 + if ( "`stat'" == "last" ) local statcode = -12 + if ( "`stat'" == "lastnm" ) local statcode = -13 + if ( "`stat'" == "freq" ) local statcode = -14 + if ( "`stat'" == "semean" ) local statcode = -15 + if ( "`stat'" == "sebinomial" ) local statcode = -16 + if ( "`stat'" == "sepoisson" ) local statcode = -17 + if ( "`stat'" == "nunique" ) local statcode = -18 + if ( "`stat'" == "nmissing" ) local statcode = -22 + if ( "`stat'" == "skewness" ) local statcode = -19 + if ( "`stat'" == "kurtosis" ) local statcode = -20 + if ( "`stat'" == "gini" ) local statcode = -27 + if ( "`stat'" == "gini|dropneg" ) local statcode = -27.1 + if ( "`stat'" == "gini|keepneg" ) local statcode = -27.2 + if ( "`stat'" == "rawsum" ) local statcode = -21 - `keepadd' + if ( "`stat'" == "rawnansum" ) local statcode = -121 + return scalar statcode = `statcode' +end + +capture program drop encode_regex +program encode_regex, rclass + args st + local rc = 0 + local statcode = 0 + local stat: copy local st + 
if regexm("`st'", "rawselect") { + local stat rawselect + local select = regexm("`st'", "^rawselect(-|)([0-9]+)$") + if ( `select' == 0 ) { + di as error "Invalid stat: (`st'; did you mean rawselect# or rawselect-#?)" + local rc = 110 + } + else if ( `=regexs(2)' == 0 ) { + di as error "Invalid stat: (`st' not allowed; selection must be 1 or larger)" + local rc = 110 + } + else { + local statcode = `:di regexs(1)' (1000.5 + `=regexs(2)') + } + } + else if regexm("`st'", "select") { + local stat select + local select = regexm("`st'", "^select(-|)([0-9]+)$") + if ( `select' == 0 ) { + di as error "Invalid stat: (`st'; did you mean select# or select-#?)" + local rc = 110 + } + else if ( `=regexs(2)' == 0 ) { + di as error "Invalid stat: (`st' not allowed; selection must be 1 or larger)" + local rc = 110 + } + else { + local statcode = `:di regexs(1)' (1000 + `=regexs(2)') + } + } + else if regexm("`st'", "^p([0-9][0-9]?(\.[0-9]+)?)$") { + local stat pctile + if ( `:di regexs(1)' == 0 ) { + di as error "Invalid stat: (`st'; maybe you meant 'min'?)" + local rc = 110 + } + else { + local statcode = `:di regexs(1)' + } + } + else if ( "`st'" == "p100" ) { + di as error "Invalid stat: (`st'; maybe you meant 'max'?)" + local rc = 110 + } + else { + di as error "Invalid stat: `st'" + local rc = 110 + } + return local stat = `"`stat'"' + return scalar statcode = `statcode' + exit `rc' +end + +capture program drop encode_aliases +program encode_aliases, rclass + args st + local allowed sum /// + nansum /// + mean /// + geomean /// + sd /// + variance /// + cv /// + max /// + min /// + range /// + count /// + median /// + iqr /// + percent /// + first /// + last /// + firstnm /// + lastnm /// + freq /// + semean /// + sebinomial /// + sepoisson /// + nunique /// + nmissing /// + skewness /// + kurtosis /// + gini /// + gini|dropneg /// + gini|keepneg /// + rawsum /// + rawnansum + + local alias_sum su + local alias_nansum nansu + local alias_mean me mea + local alias_geomean + 
local alias_gini + local alias_gini_dropneg + local alias_gini_keepneg + local alias_sd + local alias_variance var vari varia varian varianc + local alias_cv + local alias_max ma max + local alias_min mi min + local alias_range r ra ran rang range + local alias_count co cou coun + local alias_median med medi media + local alias_iqr + local alias_percent perc perce percen + local alias_first + local alias_last + local alias_firstnm + local alias_lastnm + local alias_freq + local alias_semean sem seme semea + local alias_sebinomial seb sebi sebin sebino sebinom sebinomi sebinomia + local alias_sepoisson sep sepo sepoi sepois sepoiss sepoisso + local alias_nunique nuniq nuniqu + local alias_nmissing nmiss nmissi nmissin + local alias_skewness sk ske skew skewn skewne skewnes + local alias_kurtosis k ku kur kurt kurto kurtos kurtosi + local alias_rawsum rawsu + local alias_rawnansum rawnansu + + if ( !`:list st in allowed' ) { + foreach stat of local allowed { + if ( `:list st in alias_`:subinstr local stat "|" "_", all'' ) { + local st: copy local stat + } + } + } + + return local stat: copy local st +end + +capture program drop encode_stat_types +program encode_stat_types, rclass + args stat stype ttype + + * default type for summary stats + if ( inlist("`stype'", "double", "long") ) { + local deftype double + } + else { + local deftype: set type + } + + * next-biggest type + if ( `"`stype'"' == "byte" ) { + local nexttype int + } + else if ( `"`stype'"' == "int" ) { + local nexttype long + } + else if ( `"`stype'"' == "long" ) { + local nexttype double + } + else if ( `"`stype'"' == "float" ) { + local nexttype double + } + else if ( `"`stype'"' == "double" ) { + local nexttype double + } + + * minimum OK type for counts + if ( `=_N' < maxbyte() ) { + local mintype_count byte + } + else if ( `=_N' < maxint() ) { + local mintype_count int + } + else if ( `=_N' < maxlong() ) { + local mintype_count long + } + else { + local mintype_count double + } + + 
encode_stat_allowed `stat' 0 + local stat `r(statname)' + + if ( "`stat'" == "sum" ) local type double + if ( "`stat'" == "nansum" ) local type double + if ( "`stat'" == "mean" ) local type `deftype' + if ( "`stat'" == "geomean" ) local type `deftype' + if ( "`stat'" == "sd" ) local type `deftype' + if ( "`stat'" == "variance" ) local type `deftype' + if ( "`stat'" == "cv" ) local type `deftype' + if ( "`stat'" == "max" ) local type `stype' + if ( "`stat'" == "min" ) local type `stype' + if ( "`stat'" == "range" ) local type `nexttype' + if ( "`stat'" == "count" ) local type `mintype_count' + if ( "`stat'" == "percent" ) local type `deftype' + if ( "`stat'" == "median" ) local type `deftype' + if ( "`stat'" == "iqr" ) local type `deftype' + if ( "`stat'" == "first" ) local type `stype' + if ( "`stat'" == "firstnm" ) local type `stype' + if ( "`stat'" == "last" ) local type `stype' + if ( "`stat'" == "lastnm" ) local type `stype' + if ( "`stat'" == "freq" ) local type `mintype_count' + if ( "`stat'" == "semean" ) local type `deftype' + if ( "`stat'" == "sebinomial" ) local type `deftype' + if ( "`stat'" == "sepoisson" ) local type `deftype' + if ( "`stat'" == "nunique" ) local type `mintype_count' + if ( "`stat'" == "nmissing" ) local type `mintype_count' + if ( "`stat'" == "skewness" ) local type `deftype' + if ( "`stat'" == "kurtosis" ) local type `deftype' + if ( "`stat'" == "rawsum" ) local type double + if ( "`stat'" == "rawnansum" ) local type double + if ( "`stat'" == "pctile" ) local type `deftype' + if ( "`stat'" == "select" ) local type `stype' + if ( "`stat'" == "rawselect" ) local type `stype' + if ( "`stat'" == "gini" ) local type `deftype' + if ( "`stat'" == "gini|dropneg" ) local type `deftype' + if ( "`stat'" == "gini|keepneg" ) local type `deftype' + + if ( `"`ttype'"' == "double" ) { + local retype = 0 + } + else if ( `"`ttype'"' == "byte" ) { + local retype = !inlist(`"`type'"', "byte") + } + else if ( `"`ttype'"' == "int" ) { + local retype = 
!inlist(`"`type'"', "byte", "int") + } + else if ( `"`ttype'"' == "long" ) { + local retype = !inlist(`"`type'"', "byte", "int", "long") + if ( (`retype') & (`"`type'"' == "float") ) local type double + } + else if ( `"`ttype'"' == "float" ) { + local retype = !inlist(`"`type'"', "byte", "int", "float") + if ( (`retype') & (`"`type'"' == "long") ) local type double + } + else local retype = 1 + + return local type: copy local type + return local retype: copy local retype +end + +capture program drop FreeTimer +program FreeTimer + qui { + timer list + local i = 99 + while ( (`i' > 0) & ("`r(t`i')'" != "") ) { + local --i + } + } + c_local FreeTimer `i' +end + +capture program drop GenericParseTypes +program GenericParseTypes + syntax varlist, mat(name) [matstrl(name)] + + cap disp ustrregexm("a", "a") + if ( _rc ) local regex regex + else local regex ustrregex + + tempvar strlen + local types + local strl + foreach var of varlist `varlist' { + if ( `regex'm("`:type `var''", "str([1-9][0-9]*|L)") ) { + if ( (`regex's(1) == "L") & (`"`matstrl'"' == "") ) { + disp as err "ParseTypes(`mat'): Unsupported type `:type `var''" + exit 198 + } + else if ( `regex's(1) == "L" ) { + cap confirm var `strlen' + if ( _rc ) { + qui gen `strlen' = length(`var') + } + else { + qui replace `strlen' = length(`var') + } + qui sum `strlen', meanonly + + local strl `strl' 1 + local types `types' `r(max)' + } + else { + local strl `strl' 0 + local types `types' `=`regex's(1)' + } + } + else if inlist("`:type `var''", "byte", "int", "long") { + local strl `strl' 0 + local types `types' -1 + } + else if inlist("`:type `var''", "float", "double") { + local strl `strl' 0 + local types `types' 0 + } + else { + disp as err "ParseTypes(`mat'): Unknown type `:type `var''" + exit 198 + } + } + mata: st_matrix(st_local("mat"), strtoreal(tokens(st_local("types")))) + if ( `"`matstrl'"' != "" ) { + mata: st_matrix(st_local("matstrl"), strtoreal(tokens(st_local("strlen")))) + } +end + +capture program 
* encode_moving: parse a moving-window statistic request.
*
* anything is either "moving stat [lower [upper]]" (spaces or pipes as
* separators) or any other text, which is passed through unmatched.
* window() supplies default lower and upper bounds; a dot means unbounded.
* Sets caller local stat (via c_local) and returns r(stat), r(name),
* r(warn) (1 when both bounds are missing), r(scode), r(match), r(lower),
* and r(upper).
capture program drop encode_moving
program encode_moving, rclass
    syntax anything, [window(str)]

    gettoken lwindow uwindow: window
    if ( `"`window'"' != "" ) {
        if ( (`"`lwindow'"' == "") | (`"`uwindow'"' == "") ) {
            disp as err "moving: option window() requires a lower and upper bound"
            exit 198
        }
        cap confirm integer number `lwindow'
        if ( _rc & (`lwindow' != .) ) {
            disp as err "moving: option window() requires integer inputs"
            exit 7
        }
        cap confirm integer number `uwindow'
        if ( _rc & (`uwindow' != .) ) {
            disp as err "moving: option window() requires integer inputs"
            exit 7
        }
    }
    else {
        local lwindow .
        local uwindow .
    }

    local rwarn = 0
    if ( regexm(`"`anything'"', "^moving[ |]+([^ |]+)[ |]*([^ |]+)?[ |]*([^ |]+)?$") ) {
        local rmatch = 1
        local rstat = regexs(1)
        * Optional groups: regexs() errors when a group did not take part
        cap local rlower = regexs(2)
        cap local rupper = regexs(3)

        if ( `"`rlower'"' == "" ) local rlower `lwindow'
        if ( `"`rupper'"' == "" ) local rupper `uwindow'

        cap confirm integer number `rlower'
        if ( _rc & (`rlower' != .) ) {
            disp as err "moving: option window requires integer inputs"
            exit 7
        }

        cap confirm integer number `rupper'
        if ( _rc & (`rupper' != .) ) {
            disp as err "moving: option window requires integer inputs"
            exit 7
        }

        local rwarn = `rwarn' | ((`rupper' == .) & (`rlower' == .))

        encode_aliases `rstat'
        local rstat `r(stat)'
        local stat moving|`rstat'|`rlower'|`rupper'

        * Check the return code before touching r(): after a failure
        * r(statcode) may be empty and the assignment below would abort
        * before the message is shown.
        cap encode_stat_allowed `rstat' 0
        if ( _rc ) {
            disp as err "moving: unknown sub-statistic `rstat'"
            exit 198
        }
        local scode = `r(statcode)'

        if inlist("`rstat'", "percent", "nunique") {
            disp as err "moving: `rstat' not implemented"
            exit 198
        }
    }
    else {
        local rwarn = 0
        local rmatch = 0
        local scode = 0
        local stat: copy local anything
        local rstat: copy local anything
        local rlower: copy local lwindow
        local rupper: copy local uwindow
    }

    c_local stat: copy local stat
    return local stat: copy local rstat
    return local name = strtoname(`"`stat'"')
    return scalar warn = `rwarn'
    return scalar scode = `scode'
    return scalar match = `rmatch'
    return scalar lower = `rlower'
    return scalar upper = `rupper'
end

* encode_range: parse a range statistic request.
*
* anything is either "range stat [lower [upper [var]]]" or any other
* text, which is passed through unmatched. interval() and var() supply
* defaults for the bounds and the range variable. Each bound may be an
* integer, a dot, or a signed scalar followed by a statistic name (parsed
* by encode_range_parse). Sets caller local stat (via c_local) and returns
* r(stat), r(lower), r(lstat), r(upper), r(ustat), r(var), r(rangestr),
* r(name), r(scode), r(lcode), r(ucode), r(warn), and r(match).
capture program drop encode_range
program encode_range, rclass
    syntax anything, [interval(str) var(str)]

    encode_range_parse `interval'

    local linterval `r(linterval)'
    local uinterval `r(uinterval)'
    local variable `r(variable)'
    local lstat `r(lstat)'
    local ustat `r(ustat)'

    if ( `"`linterval'"' == "" ) local linterval .
    if ( `"`uinterval'"' == "" ) local uinterval .
    if ( `"`r(lsign)'"' == "-" ) local linterval -`linterval'
    if ( `"`r(usign)'"' == "-" ) local uinterval -`uinterval'
    if ( `"`variable'"' == "" ) local variable `var'

    local iwarn = 0
    if ( regexm(`"`anything'"', "^range[ |]+([^ |]+)[ |]*([^ |]+)?[ |]*([^ |]+)?[ |]*([^ ]+)?$") ) {
        local imatch = 1
        local istat = regexs(1)
        * Optional groups: regexs() errors when a group did not take part
        cap local ilower = regexs(2)
        cap local iupper = regexs(3)
        cap local ivar = regexs(4)

        if ( `"`ilower'"' == "" ) local ilower `linterval'`lstat'
        if ( `"`iupper'"' == "" ) local iupper `uinterval'`ustat'
        if ( `"`ivar'"' == "" ) local ivar `variable'

        encode_range_parse `ilower' `iupper' `ivar'

        local ilower `r(linterval)'
        local iupper `r(uinterval)'
        local ivar `r(variable)'
        local ilstat `r(lstat)'
        local iustat `r(ustat)'
        local ilsign `r(lsign)'
        local iusign `r(usign)'

        local iwarn = `iwarn' | ((`iupper' == .) & (`ilower' == .))

        * Encode the main statistic and the optional bound statistics;
        * checkcodes is consumed in step with the loop below.
        local checkcodes scode lcode ucode
        foreach checkstat in istat ilstat iustat {
            gettoken checkcode checkcodes: checkcodes
            if ( `"``checkstat''"' != "" ) {
                encode_aliases ``checkstat''
                local `checkstat' `r(stat)'

                * Check the return code before touching r(): after a
                * failure r(statcode) may be empty and the assignment
                * below would abort before the message is shown.
                cap encode_stat_allowed ``checkstat'' 0
                if ( _rc ) {
                    disp as err "range: unknown sub-statistic ``checkstat''"
                    exit 198
                }
                local `checkcode' = `r(statcode)'

                if inlist("``checkstat''", "percent", "nunique") {
                    disp as err "range: ``checkstat'' not implemented"
                    exit 198
                }
            }
            else {
                local `checkcode' = 0
            }
        }

        local irangestr `ivar' within `ivar'[i] `ilsign' `ilower'`ilstat' to `ivar'[i] `iusign' `iupper'`iustat'

        * Signs are stored separately; fold them back in for the encoding
        if ( `"`ilsign'"' == "-" ) local ilower -`ilower'
        if ( `"`iusign'"' == "-" ) local iupper -`iupper'

        local stat range|`istat'|`ilower'`ilstat'|`iupper'`iustat'|`ivar'
    }
    else {
        local irangestr
        local scode = 0
        local lcode = 0
        local ucode = 0
        local iwarn = 0
        local imatch = 0
        local stat: copy local anything
        local istat: copy local anything
        local ilower
        local ilstat
        local iupper
        local iustat
        local ivar
    }

    c_local stat: copy local stat

    return local stat: copy local istat
    return local lower: copy local ilower
    return local lstat: copy local ilstat
    return local upper: copy local iupper
    return local ustat: copy local iustat
    return local var: copy local ivar
    return local rangestr: copy local irangestr

    return local name = strtoname(`"`stat'"')
    return scalar scode = `scode'
    return scalar lcode = `lcode'
    return scalar ucode = `ucode'
    return scalar warn = `iwarn'
    return scalar match = `imatch'
end

* encode_range_parse: split "lower upper [variable]" into its parts.
*
* Each bound is an integer, a dot, or a signed scalar with a statistic
* suffix (delegated to encode_range_stat). Returns r(linterval),
* r(uinterval), r(variable), r(lstat), r(ustat), r(lsign), and r(usign).
* Exits with 198 when a bound is missing and with 7 when one is malformed.
capture program drop encode_range_parse
program encode_range_parse, rclass

    gettoken linterval uinterval: 0
    gettoken uinterval variable: uinterval

    * Re-assigning strips leading and trailing blanks
    local linterval `linterval'
    local uinterval `uinterval'
    local variable `variable'
    local lstat
    local ustat

    if ( `"`0'"' != "" ) {
        if ( (`"`linterval'"' == "") | (`"`uinterval'"' == "") ) {
            disp as err "range: option interval() requires a lower and upper bound"
            exit 198
        }

        cap confirm integer number `linterval'
        if ( _rc & (`"`linterval'"' != ".") ) {
            encode_range_stat `linterval'
            if ( `r(imatch)' == 0 ) {
                disp as err "range: option interval() incorrectly specified"
                exit 7
            }
            local linterval `r(iscalar)'
            local lstat `r(istat)'
            local lsign `r(isign)'
        }
        else if ( _rc == 0 ) {
            * negative numbers would have a '-' sign prepended already
            local lsign = cond(`linterval' < 0, "", "+")
        }

        cap confirm integer number `uinterval'
        if ( _rc & (`"`uinterval'"' != ".") ) {
            encode_range_stat `uinterval'
            if ( `r(imatch)' == 0 ) {
                disp as err "range: option interval() incorrectly specified"
                exit 7
            }
            local uinterval `r(iscalar)'
            local ustat `r(istat)'
            local usign `r(isign)'
        }
        else if ( _rc == 0 ) {
            * negative numbers would have a '-' sign prepended already
            local usign = cond(`uinterval' < 0, "", "+")
        }
    }

    return local linterval: copy local linterval
    return local uinterval: copy local uinterval
    return local variable: copy local variable
    return local lstat: copy local lstat
    return local ustat: copy local ustat
    return local lsign: copy local lsign
    return local usign: copy local usign
end

* encode_range_stat: split a bound such as -2.5sd into sign, scalar, stat.
*
* Returns r(imatch), r(isign) (a plus sign unless the input starts with a
* minus), r(iscalar) (1 when no number is given), and r(istat) (whatever
* follows the number).
capture program drop encode_range_stat
program encode_range_stat, rclass
    local imatch = regexm(`"`0'"', "^(\+|-)?([0-9]+\.[0-9]+|\.[0-9]+|[0-9]+)?(.*)$")
    if ( `imatch' ) {
        * Optional groups: regexs() errors when a group did not take part
        cap local isign = regexs(1)
        cap local iscalar = regexs(2)
        cap local istat = regexs(3)
        local isign = cond(`"`isign'"' != "-", "+", "-")
        if ( `"`iscalar'"' == "" ) local iscalar 1
    }
    return scalar imatch = `imatch'
    return local isign : copy local isign
    return local iscalar : copy local iscalar
    return local istat : copy local istat
end

* encode_cumsum: parse a cumulative-sum request.
*
* anything is "cumsum" optionally followed by a sign (+ or -) and a list
* of ordering variables, with spaces or pipes as separators; cumby()
* supplies the default sign and variables. Returns r(stat), r(match),
* r(cumsign) (0 none, 1 for plus, 2 for minus), r(cumvars), and
* r(cumother) (1 when ordering variables were given). The caller local
* stat is only set (via c_local) outside the bare cumsum case.
capture program drop encode_cumsum
program encode_cumsum, rclass
    syntax anything, [cumby(str) var(str)]

    local var `var'
    local anything `anything'
    local stat: copy local anything
    local match 0
    local cumsign 0
    local cumvars

    if ( (`"`anything'"' == "cumsum") & (`"`cumby'"' == "") ) {
        local match 1
    }
    else {
        local anything: subinstr local anything "|" " ", all
        local anything `anything'
        if ( regexm(`"`anything'"', "^cumsum(.*)$") ) {
            local _cumby = regexs(1)
            local _cumby `_cumby'
            if ( `"`_cumby'"' == "" ) local _cumby: copy local cumby

            gettoken cumsign cumvars: _cumby
            local cumvars `cumvars'
            local cumsign `cumsign'

            if inlist(`"`cumsign'"', "+", "-") {
                * if ( `"`cumvars'"' == "" ) local cumvars: copy local var

                local match 1
                local stat cumsum|`cumsign'|`:subinstr local cumvars " " "|", all'
                local cumsign = cond(`"`cumsign'"' == "+", 1, 2)
            }
            else {
                disp as err "cumsum: cumby() misspecified; expected '+/- [varlist]' but got '`_cumby''"
                exit 7
            }
        }

        c_local stat: copy local stat
    }

    return local stat: copy local stat
    return local match: copy local match
    return local cumsign: copy local cumsign
    return local cumvars: copy local cumvars
    return local cumother = `"`cumvars'"' != ""
end

* encode_shift: parse a shift (lag or lead) request.
*
* anything is "shift" optionally followed by a signed integer; without
* one, shiftby() must hold an integer. Sets caller local stat (via
* c_local) and returns r(stat), r(match), and r(shift). Exits with 198
* when no shift is available and with 7 when shiftby() is not an integer.
capture program drop encode_shift
program encode_shift, rclass
    syntax anything, [shiftby(str)]

    local anything `anything'
    local stat: copy local anything
    local match 0
    local shift 0

    if regexm(`"`anything'"', "^shift[ |]*([+-]?[0-9]+)[ |]*$") {
        local shift = `=regexs(1)'
        local match 1
        local stat shift|`shift'
    }
    else if regexm(`"`anything'"', "^shift[ |]*$") {
        if ( `"`shiftby'"' == "" ) {
            disp as err "shift: shiftby() required if no individual shift is specified"
            exit 198
        }
        else {
            cap confirm integer number `shiftby'
            if ( _rc ) {
                disp as err "shift: shiftby() misspecified; expected integer but got '`shiftby''"
                exit 7
            }
            local shift = `=`shiftby''
            local match 1
            local stat shift|`shift'
        }
    }

    c_local stat: copy local stat
    return local stat: copy local stat
    return local match: copy local match
    return local shift: copy local shift
end

***********************************************************************
*                              greshape                               *
***********************************************************************

* greshape_scalars: create or drop the scalars and matrices used by greshape.
*
* First argument gen, init, or alloc: reset the per-call scalars to 0 and
* create the persistent matrices and scalars only when they do not exist.
* Any other first argument: drop the per-call scalars unless the second
* argument is _keepgreshape; the persistent objects are kept while the
* global GTOOLS_CALLER is greshape.
capture program drop greshape_scalars
program greshape_scalars
    local percall code kxi str dropmiss
    local keepmat xitypes types maplevel
    local keepsca jfile kxij kout klvls

    * 1 = long, 2 = wide
    if ( inlist(`"`1'"', "gen", "init", "alloc") ) {
        foreach nm of local percall {
            scalar __gtools_greshape_`nm' = 0
        }
        foreach nm of local keepmat {
            cap matrix list __gtools_greshape_`nm'
            if ( _rc ) matrix __gtools_greshape_`nm' = 0
        }
        foreach nm of local keepsca {
            cap scalar dir __gtools_greshape_`nm'
            if ( _rc ) scalar __gtools_greshape_`nm' = 0
        }
    }
    else if ( `"`2'"' != "_keepgreshape" ) {
        foreach nm of local percall {
            cap scalar drop __gtools_greshape_`nm'
        }
        if ( `"${GTOOLS_CALLER}"' != "greshape" ) {
            foreach nm of local keepmat {
                cap matrix drop __gtools_greshape_`nm'
            }
            foreach nm of local keepsca {
                cap scalar drop __gtools_greshape_`nm'
            }
        }
    }
end

***********************************************************************
*                              gregress                               *
***********************************************************************

* gregress_scalars: create or drop the scalars and matrices used by gregress.
*
* Argument gen, init, or alloc: numeric scalars are set to 0, the string
* scalar hdfemethnm to the empty string, and both matrices to missing.
* Any other argument: everything is dropped, ignoring objects that do
* not exist.
capture program drop gregress_scalars
program gregress_scalars
    local numeric kv kvars cons robust cluster absorb                      ///
                  hdfetol hdfemaxiter hdfetraceiter hdfestandard hdfemethod ///
                  glmlogit glmpoisson glmfam glmiter glmtol                 ///
                  ivreg ivkendog ivkexog ivkz                               ///
                  savemata savemb savemse savegb savegse saveghdfe          ///
                  savegresid savegpred savegabs                             ///
                  moving moving_l moving_u                                  ///
                  range range_l range_u range_ls range_us
    local matrices clustyp abstyp

    if ( inlist(`"`0'"', "gen", "init", "alloc") ) {
        foreach nm of local numeric {
            scalar __gtools_gregress_`nm' = 0
        }
        scalar __gtools_gregress_hdfemethnm = ""
        foreach nm of local matrices {
            matrix __gtools_gregress_`nm' = .
        }
    }
    else {
        foreach nm of local numeric {
            cap scalar drop __gtools_gregress_`nm'
        }
        cap scalar drop __gtools_gregress_hdfemethnm
        foreach nm of local matrices {
            cap matrix drop __gtools_gregress_`nm'
        }
    }
end

***********************************************************************
*                               gstats                                *
***********************************************************************

* NOTE(review): gstats_scalars continues past the end of this chunk; the
* visible head is reproduced unchanged.
capture program drop gstats_scalars
program gstats_scalars
    scalar __gtools_gstats_code = .
    if ( inlist(`"`0'"', "gen", "init", "alloc") ) {
        scalar __gtools_winsor_trim = .
        scalar __gtools_winsor_cutl = .
        scalar __gtools_winsor_cuth = .
        scalar __gtools_winsor_kvars = .

        scalar __gtools_hdfe_nonmiss = 0
        scalar __gtools_hdfe_kvars = 0
        scalar __gtools_hdfe_absorb = 0
        scalar __gtools_hdfe_method = 1
        scalar __gtools_hdfe_maxiter = 0
        scalar __gtools_hdfe_traceiter = 0
        scalar __gtools_hdfe_standard = 0
        scalar __gtools_hdfe_hdfetol = 0
        scalar __gtools_hdfe_matasave = 0
        scalar __gtools_hdfe_mataname = ""
        scalar __gtools_hdfe_iter = 0
        scalar __gtools_hdfe_feval = 0
        scalar __gtools_hdfe_methodname = ""

        scalar __gtools_summarize_matasave = 0
        scalar __gtools_summarize_pretty = 0
        scalar __gtools_summarize_colvar = 0
        scalar __gtools_summarize_noprint = 0
        scalar __gtools_summarize_nosep = 0
        scalar __gtools_summarize_pooled = 0
        scalar __gtools_summarize_normal = 0
        scalar __gtools_summarize_detail = 0
        scalar __gtools_summarize_kvars = 0
        scalar __gtools_summarize_kstats = 0
        scalar __gtools_summarize_tabstat = 0
        scalar __gtools_summarize_lwidth = 16
        scalar __gtools_summarize_separator = 0
        scalar __gtools_summarize_format = 0
        scalar __gtools_summarize_dfmt = "%9.0g"

        scalar __gtools_summarize_N = .
        scalar __gtools_summarize_sum_w = .
        scalar __gtools_summarize_sum = .
        scalar __gtools_summarize_mean = .
        scalar __gtools_summarize_min = .
+ scalar __gtools_summarize_max = . + scalar __gtools_summarize_Var = . + scalar __gtools_summarize_sd = . + scalar __gtools_summarize_p1 = . + scalar __gtools_summarize_p5 = . + scalar __gtools_summarize_p10 = . + scalar __gtools_summarize_p25 = . + scalar __gtools_summarize_p50 = . + scalar __gtools_summarize_p75 = . + scalar __gtools_summarize_p90 = . + scalar __gtools_summarize_p95 = . + scalar __gtools_summarize_p99 = . + scalar __gtools_summarize_skewness = . + scalar __gtools_summarize_kurtosis = . + scalar __gtools_summarize_smallest1 = . + scalar __gtools_summarize_smallest2 = . + scalar __gtools_summarize_smallest3 = . + scalar __gtools_summarize_smallest4 = . + scalar __gtools_summarize_largest4 = . + scalar __gtools_summarize_largest3 = . + scalar __gtools_summarize_largest2 = . + scalar __gtools_summarize_largest1 = . + + scalar __gtools_transform_greedy = 0 + scalar __gtools_transform_kvars = 1 + scalar __gtools_transform_ktargets = 1 + scalar __gtools_transform_kgstats = 1 + scalar __gtools_transform_cumsum_k = 0 + scalar __gtools_transform_range_k = 0 + scalar __gtools_transform_range_xs = 0 + scalar __gtools_transform_range_xb = 0 + + matrix __gtools_transform_rank_ties = 1 + matrix __gtools_summarize_codes = . + matrix __gtools_transform_varfuns = . + matrix __gtools_transform_statcode = . + matrix __gtools_transform_statmap = . + matrix __gtools_hdfe_abstyp = . + matrix __gtools_hdfe_nabsorb = . + + matrix __gtools_transform_moving = 0 + matrix __gtools_transform_moving_l = . + matrix __gtools_transform_moving_u = . + + matrix __gtools_transform_range = 0 + matrix __gtools_transform_range_pos = 0 + matrix __gtools_transform_range_l = . + matrix __gtools_transform_range_u = . 
+ matrix __gtools_transform_range_ls = 0 + matrix __gtools_transform_range_us = 0 + + matrix __gtools_transform_cumtypes = 0 + matrix __gtools_transform_cumsum = 0 + matrix __gtools_transform_cumsign = 0 + matrix __gtools_transform_cumvars = 0 + matrix __gtools_transform_aux8_shift = 0 + + mata: __gtools_transform_cumsum = . + mata: __gtools_transform_cumsign = . + mata: __gtools_transform_cumvars = . + mata: __gtools_transform_aux8_shift = . + mata: __gtools_summarize_codes = . + } + else { + cap scalar drop __gtools_gstats_code + cap scalar drop __gtools_winsor_trim + cap scalar drop __gtools_winsor_cutl + cap scalar drop __gtools_winsor_cuth + cap scalar drop __gtools_winsor_kvars + + cap scalar drop __gtools_hdfe_nonmiss + cap scalar drop __gtools_hdfe_kvars + cap scalar drop __gtools_hdfe_absorb + cap scalar drop __gtools_hdfe_method + cap scalar drop __gtools_hdfe_maxiter + cap scalar drop __gtools_hdfe_traceiter + cap scalar drop __gtools_hdfe_standard + cap scalar drop __gtools_hdfe_hdfetol + cap scalar drop __gtools_hdfe_matasave + cap scalar drop __gtools_hdfe_mataname + cap scalar drop __gtools_hdfe_iter + cap scalar drop __gtools_hdfe_feval + cap scalar drop __gtools_hdfe_methodname + + cap scalar drop __gtools_summarize_matasave + cap scalar drop __gtools_summarize_pretty + cap scalar drop __gtools_summarize_colvar + cap scalar drop __gtools_summarize_noprint + cap scalar drop __gtools_summarize_nosep + cap scalar drop __gtools_summarize_pooled + cap scalar drop __gtools_summarize_normal + cap scalar drop __gtools_summarize_detail + cap scalar drop __gtools_summarize_kvars + cap scalar drop __gtools_summarize_kstats + cap scalar drop __gtools_summarize_tabstat + cap scalar drop __gtools_summarize_lwidth + cap scalar drop __gtools_summarize_separator + cap scalar drop __gtools_summarize_format + cap scalar drop __gtools_summarize_dfmt + + cap scalar drop __gtools_summarize_N + cap scalar drop __gtools_summarize_sum_w + cap scalar drop 
__gtools_summarize_sum + cap scalar drop __gtools_summarize_mean + cap scalar drop __gtools_summarize_min + cap scalar drop __gtools_summarize_max + cap scalar drop __gtools_summarize_Var + cap scalar drop __gtools_summarize_sd + cap scalar drop __gtools_summarize_p1 + cap scalar drop __gtools_summarize_p5 + cap scalar drop __gtools_summarize_p10 + cap scalar drop __gtools_summarize_p25 + cap scalar drop __gtools_summarize_p50 + cap scalar drop __gtools_summarize_p75 + cap scalar drop __gtools_summarize_p90 + cap scalar drop __gtools_summarize_p95 + cap scalar drop __gtools_summarize_p99 + cap scalar drop __gtools_summarize_skewness + cap scalar drop __gtools_summarize_kurtosis + cap scalar drop __gtools_summarize_smallest1 + cap scalar drop __gtools_summarize_smallest2 + cap scalar drop __gtools_summarize_smallest3 + cap scalar drop __gtools_summarize_smallest4 + cap scalar drop __gtools_summarize_largest4 + cap scalar drop __gtools_summarize_largest3 + cap scalar drop __gtools_summarize_largest2 + cap scalar drop __gtools_summarize_largest1 + + cap scalar drop __gtools_transform_greedy + cap scalar drop __gtools_transform_kvars + cap scalar drop __gtools_transform_ktargets + cap scalar drop __gtools_transform_kgstats + cap scalar drop __gtools_transform_cumsum_k + cap scalar drop __gtools_transform_range_k + cap scalar drop __gtools_transform_range_xs + cap scalar drop __gtools_transform_range_xb + + cap mata st_dropvar(__gtools_gst_dropvars) + + cap matrix drop __gtools_transform_rank_ties + cap matrix drop __gtools_summarize_codes + cap matrix drop __gtools_transform_varfuns + cap matrix drop __gtools_transform_statcode + cap matrix drop __gtools_transform_statmap + cap matrix drop __gtools_hdfe_abstyp + cap matrix drop __gtools_hdfe_nabsorb + + cap matrix drop __gtools_transform_moving + cap matrix drop __gtools_transform_moving_l + cap matrix drop __gtools_transform_moving_u + + cap matrix drop __gtools_transform_range + cap matrix drop 
__gtools_transform_range_pos + cap matrix drop __gtools_transform_range_l + cap matrix drop __gtools_transform_range_u + cap matrix drop __gtools_transform_range_ls + cap matrix drop __gtools_transform_range_us + + cap matrix drop __gtools_transform_cumtypes + cap matrix drop __gtools_transform_cumsum + cap matrix drop __gtools_transform_cumsign + cap matrix drop __gtools_transform_cumvars + cap matrix drop __gtools_transform_aux8_shift + + cap mata: mata drop __gtools_transform_cumsum + cap mata: mata drop __gtools_transform_cumsign + cap mata: mata drop __gtools_transform_cumvars + cap mata: mata drop __gtools_transform_aux8_shift + + cap mata: mata drop __gtools_transform_rank_ties + cap mata: mata drop __gtools_summarize_codes + cap mata: mata drop __gtools_gst_labels + cap mata: mata drop __gtools_gst_formats + cap mata: mata drop __gtools_gst_dropvars + + cap mata: mata drop __gtools_transform_varfuns + cap mata: mata drop __gtools_transform_statcode + cap mata: mata drop __gtools_transform_statmap + + cap mata: mata drop __gtools_transform_moving + cap mata: mata drop __gtools_transform_moving_l + cap mata: mata drop __gtools_transform_moving_u + + cap mata: mata drop __gtools_transform_range + cap mata: mata drop __gtools_transform_range_pos + cap mata: mata drop __gtools_transform_range_l + cap mata: mata drop __gtools_transform_range_u + cap mata: mata drop __gtools_transform_range_ls + cap mata: mata drop __gtools_transform_range_us + } +end + +capture program drop gstats_transform +program gstats_transform + syntax anything(equalok), /// + [ /// + /// TODO: Maybe add rawstat at some point... 
+ replace /// replace variables, if they exist + noinit /// do not initialize targets with missing values + nogreedy /// use memory-heavy algorithm + TYPEs(str) /// override automatic types + /// + WILDparse /// parse assuming wildcard renaming + AUTOrename /// automagically name targets if no target is specified + AUTOrenameformat(passthru) /// + LABELFormat(passthru) /// Custom label engine: (#stat#) #sourcelabel# is the default + LABELProgram(passthru) /// Program to parse labelformat (see examples) + statprefix(passthru) /// add prefix to every stat + /// + ties(str) /// how to resolve ties (one per target; use . for non-rank targets) + window(passthru) /// moving window if not specified in the stat + interval(passthru) /// interval if not specified in the stat + cumby(passthru) /// Cummulative sum by +/- and varlst + shiftby(passthru) /// Shift by +/-# + excludeself /// exclude current obs from statistic + excludebounds /// interval is strict (do not include bounds) + ] + + * Parse transforms and variables + * ------------------------------ + + gstats_transform_parse `anything', /// + `wildparse' /// + `labelformat' /// + `labelprogram' /// + `autorename' /// + `autorenameformat' /// + `window' `interval' `cumby' `shiftby' `statprefix' + + local transforms rank /// + standardize /// + normalize /// + demean /// + demedian // + + local unknown + foreach stat of local __gtools_gst_stats { + if ( !`:list stat in transforms' ) { + encode_moving `stat' + local rmatch = `r(match)' + encode_range `stat' + local rmatch = `r(match)' | `rmatch' + encode_cumsum `stat' + local rmatch = `r(match)' | `rmatch' + encode_shift `stat' + local rmatch = `r(match)' | `rmatch' + if ( `rmatch' == 0 ) { + local unknown `unknown' `stat' + } + } + } + + if ( `"`unknown'"' != "" ) { + disp as err `"Unknown transformations: `unknown'"' + exit 198 + } + + * if ( !`:list __gtools_gst_uniq_vars === __gtools_gst_vars' ) { + * if ( `"`greedy'"' == "nogreedy" ) { + * disp as err 
"gstats_transform: nogreedy not allowed with repeat sources" + * exit 198 + * } + * } + + gstats_transform_types, /// + vars(`__gtools_gst_vars') /// + targets(`__gtools_gst_targets') /// + stats(`__gtools_gst_stats') /// + types(`types') /// + ties(`ties') /// + prefix(__gtools_gst) + + local kvars: list sizeof __gtools_gst_vars + local ktargets: list sizeof __gtools_gst_targets + local kstat: list sizeof __gtools_gst_stats + local ktype: list sizeof __gtools_gst_types + local kretype: list sizeof __gtools_gst_retype + local ktcodes: list sizeof __gtools_gst_tcodes + + local kbad = 0 + local kbad = `kbad' | (`kvars' != `ktargets') + local kbad = `kbad' | (`kvars' != `kstat') + local kbad = `kbad' | (`kvars' != `ktype') + local kbad = `kbad' | (`kvars' != `kretype') + local kbad = `kbad' | (`kvars' != `ktcodes') + + if ( `kbad' ) { + disp as err "gstats_transform: parsing error (inconsistent number of inputs)" + exit 198 + } + + * Parse variables to add + * ---------------------- + + * A variable needs to be "retyped" only if a target exists already + * and it has an unsuitable type. One of two things happens: + * + * a) The target is also a source. Source gets renamed, used as + * input, dropped. + * + * b) The target is not a source. Target is renamed, dropped. 
+ + local __gtools_gst_i = 0 + local __gtools_gst_dropvars + local __gtools_gst_vars: subinstr local __gtools_gst_vars " " " ", all + + local krecast = 0 + local recast_sources + local recast_targets + + forvalues k = 1 / `ktargets' { + local retype: word `k' of `__gtools_gst_retype' + local target: word `k' of `__gtools_gst_targets' + + cap confirm new variable `target' + if ( _rc ) { + if ( `"`replace'"' == "" ) { + disp as err "gstats_transform: target `target' exists without replace" + exit 198 + } + + if ( `retype' ) { + cap confirm new variable __gtools_gst`__gtools_gst_i' + while ( _rc ) { + local ++__gtools_gst_i + cap confirm new variable __gtools_gst`__gtools_gst_i' + } + rename `target' __gtools_gst`__gtools_gst_i' + local __gtools_gst_dropvars `__gtools_gst_dropvars' __gtools_gst`__gtools_gst_i' + + if ( `:list target in __gtools_gst_vars' ) { + local __gtools_gst_vars: subinstr local __gtools_gst_vars " `target' " " __gtools_gst`__gtools_gst_i' ", all + local recast_sources `recast_sources' __gtools_gst`__gtools_gst_i' + local recast_targets `recast_targets' `target' + local ++krecast + } + } + + local __gtools_gst_vars: subinstr local __gtools_gst_vars " " " ", all + } + } + + local kadd = 0 + local __gtools_gst_addvars + local __gtools_gst_addtypes + forvalues k = 1 / `ktargets' { + local target: word `k' of `__gtools_gst_targets' + local type: word `k' of `__gtools_gst_types' + cap confirm new variable `target' + if ( _rc == 0 ) { + local ++kadd + local __gtools_gst_addvars `__gtools_gst_addvars' `target' + local __gtools_gst_addtypes `__gtools_gst_addtypes' `type' + } + } + + * Group stat codes + * ---------------- + + * -1 // sum + * -2 // mean + * -3 // sd + * -4 // max + * -5 // min + * -6 // count, n + * -7 // percent + * 50 // median + * -9 // iqr + * -10 // first + * -11 // firstnm + * -12 // last + * -13 // lastnm + * -14 // freq + * -15 // semean + * -16 // sebinomial + * -17 // sepoisson + * -18 // nunique + * -19 // skewness + * -20 // 
kurtosis + * -21 // rawsum + * -22 // nmissing + * -23 // variance + * -24 // cv + * -25 // range + * -26 // geomean + * -27 // gini + * -27.1 // gini|dropneg + * -27.2 // gini|keepneg + * -101 // nansum + * -121 // rawnansum + * -206 // sum weight + * -203 // variance + * 1000 + # // #th smallest + * -1000 - # // #th largest + * 1000.5 + # // raw #th smallest + * -1000.5 - # // raw #th largest + + * Transform codes + * --------------- + + * -1 // standardize normalize + * -2 // demean + * -3 // demedian + * -4 // moving + * // syntax via stat call + * // + * // (moving stat lower upper) + * // + * // and/or via window() option + * // + * // window(lower upper) + * // + * // window() fills stat calls w/o lower/upper. + * // + * -5 // range + * // syntax via stat call + * // + * // (range stat lower upper [reference]) + * // + * // and/or via interval() option + * // + * // interval(lower upper [reference]) + * // + * // interval() fills stat calls w/o lower/upper. + * // reference is the reference variable; if empty + * // the source is taken as its own reference. lower + * // and upper can be statistical transformations: + * // + * // (range mean -2sd 0.5sd) + * // (range skew -2 1.5cv) + * // interval(-sd sd varname) + * // + * // if either lower or upper are not numbers then + * // they will try to be parsed in the format above. + * // the number in front of the stat is multiplied by + * // the stat requested. so (range mean -2sd 0.5sd) + * // will compute for x[i] the average over j s.t. + * // x[i] - 2 * sd(x) <= x[j] <= x[i] + 0.5 * sd(x) + * -6 // rank + * -7 // cumsum + * // syntax via stat call + * // + * // (cumsum) + * // (cumsum +/-) + * // (cumsum +/- varlist) + * // + * // and/or via cumby() option + * // + * // cumby(+/-) + * // cumby(+/- varlist) + * // + * // cumsum happens in the order the data appears or + * // in ascending/descending order. if varlist then + * // cumsum happens in ascending or descending order + * // of varlist. 
+ * -8 // shift + * // syntax via stat call + * // + * // (shift) for use with shiftby() + * // (shift -#) for lags + * // (shift #) for leads + * // + * // where shiftby() is an integer + * // + * // shiftby(-#) for lags + * // shiftby(#) for leads + + * moving stats + * ------------ + + * __gtools_transform_moving + * stat code with the statistic to compute in the moving window. + * e.g. (-2, 0, -3, 0, 75) means moving mean, non-moving stat, + * moving sd, non-moving stat, and moving 75th percentile. + * + * __gtools_transform_moving_l + * lower window bound. -1 means from the prior obs, . means all + * obs before, 0 means start at current obs, etc. + * + * __gtools_transform_moving_u + * upper window bound. 1 means up to the next obs, . means all + * obs after, 0 means end at current obs, etc. + + * Range stats + * ----------- + + * __gtools_transform_range + * stat code with the statistic to compute in the interval window. + * e.g. (-2, 0, -3, 0, 75) means range mean, non-range stat, + * range sd, non-range stat, and range 75th percentile. + * + * __gtools_transform_range_xs + * 1 if option excludeself was specified, 0 otherwise + * + * __gtools_transform_range_xb + * 1 if option excludebounds was specified, 0 otherwise + * + * __gtools_transform_range_k + * number of reference range variables + * + * __gtools_transform_range_pos + * position of reference range variable. i.e. the input is + * + * [byvars] sources targets [rangevars] [weightvar] + * + * the kth entry of this matrix maps the kth source with the + * corresponding reference range variable. if the reference + * variable is the source, this is 0. + * + * __gtools_transform_range_l + * __gtools_transform_range_u + * + * lower and upper range windows for the kth statistic, if it is + * a range statistic. The ith observation is computed over + * sources[j, k] s.t. 
+ * + * lower <= sources[j, k] <= upper + * + * where + * + * l = __gtools_transform_range_pos[k] + * lower = rangevars[i, l] + __gtools_transform_range_l[k] + * upper = rangevars[i, l] + __gtools_transform_range_u[k] + * + * If l is 0 then this is computed with + * + * lower = sources[i, k] + __gtools_transform_range_l[k] + * upper = sources[i, k] + __gtools_transform_range_u[k] + * + * Note that both lower and upper are ADDED, so lower must be + * passed as a negative number if you want to subtract it. + * + * Last, if the range has a reference statistic attached to + * it, then lower/upper are the scalars that multiply it: + * + * lscalar = __gtools_transform_range_l[k] + * uscalar = __gtools_transform_range_u[k] + * lower = rangevars[i, l] + lscalar * lstat[k] + * upper = rangevars[i, l] + uscalar * ustat[k] + * + * or + * + * lower = sources[i, k] + lscalar * lstat[k] + * upper = sources[i, k] + uscalar * ustat[k] + * + * as applicable. For details on lstat and ustat see below. + * + * __gtools_transform_range_ls + * __gtools_transform_range_us + * + * lower and upper range statistics for the kth statistic, + * if it is a range statistic and if a statistical + * transformation was requested. Lower and upper bounds + * lstat[k] and ustat[k] referenced above are obtained from + * rangevars[i, l] or sources[i, k], as applicable. The + * requested statistic is computed over all i. + * + * these vectors contain the statistic's code. if there is + * no reference statistic, the entry is just 0. 
For example, + * this computes the mean price within a standard deviation: + * + * (range mean -sd sd) price + * + * l = __gtools_transform_range_pos[k] <- 0 + * lscalar = __gtools_transform_range_l[k] <- -1 + * uscalar = __gtools_transform_range_u[k] <- 1 + * + * lcode = __gtools_transform_range_ls[k] <- -3 + * ucode = __gtools_transform_range_us[k] <- -3 + * lstat = sd(sources[i, k]) over all i + * ustat = sd(sources[i, k]) over all i + * + * lower = sources[i, k] + lscalar * lstat + * upper = sources[i, k] + uscalar * ustat + * + * ith output obs = mean(sources[j, k]) s.t. lower <= sources[j, k] <= upper + + * Cumulative Sum + * -------------- + + * __gtools_transform_cumsum_k + * + * number of aux variables for cumsum + * + * __gtools_transform_cumtypes + * + * types for cumvars + * + * __gtools_transform_cumsum + * + * 0/1 for whether kth target stat is cumsum + * + * __gtools_transform_cumsign + * + * whether cumsum should be in data (0), ascending (1), or + * descending (2) order + * + * __gtools_transform_cumvars + * + * start and end position of cumvars for each target + + * Shift (lags and leads) + * ---------------------- + + * __gtools_transform_aux8_shift + * + * number (positive or negative) to shift kth target by + + * Transform mappings + * ------------------ + + * There are two sets of stats: The transformations and the group + * stats that the transformations use. For example, normalizing a + * variable uses the mean and standard deviation. Each transform has + * an internal code for its group stats. normalize has codes 1 and 2 + * for the mean and standard deviation, demedian has code 1 for the + * median, and so on. Hence we create a matrix with mappings from + * each transform to its group stats' positions in the array of group stats. 
If + * we have stats(demedian demean normalize) we get + * + * __gtools_transform_varfuns // code for variable transforms + * demedian demean normalize + * -3 -2 -1 + * + * __gtools_transform_statcode // code for group stats + * 50 -2 -3 + * median mean sd + * + * __gtools_transform_statmap // mapping from transforms to the group stats + * 1 0 0 + * 2 0 0 + * 2 3 0 + * + * The group stat array will have first the median, then the mean, + * then the standard deviation. Hence demedian will use the first + * stat, demean the second, and normalize the second and third. + + * Generate matrices for plugin internals + * -------------------------------------- + + local gs_nostats_codes -4 -5 -6 -7 -8 + + local gs_standardize mean sd + local gs_normalize mean sd + local gs_demean mean + local gs_demedian median + local gs_moving + local gs_range + local gs_rank + local gs_cumsum + local gs_shift + + local gs + local rangevars + + foreach stat of local __gtools_gst_stats { + local gs `gs' `gs_`stat'' + encode_range `stat' + local rangevars `rangevars' `r(var)' + } + + local gs: list uniq gs + local rangevars: list uniq rangevars + + mata: __gtools_transform_varfuns = J(1, `:list sizeof __gtools_gst_stats', .) + mata: __gtools_transform_statcode = J(1, max((`:list sizeof gs', 1)), 0) + mata: __gtools_transform_statmap = J(`:list sizeof __gtools_gst_stats', max((`:list sizeof gs', 1)), 0) + + mata: __gtools_transform_moving = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_moving_l = J(1, `:list sizeof __gtools_gst_stats', .) + mata: __gtools_transform_moving_u = J(1, `:list sizeof __gtools_gst_stats', .) + + mata: __gtools_transform_range = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_range_pos = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_range_l = J(1, `:list sizeof __gtools_gst_stats', .) + mata: __gtools_transform_range_u = J(1, `:list sizeof __gtools_gst_stats', .) 
+ mata: __gtools_transform_range_ls = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_range_us = J(1, `:list sizeof __gtools_gst_stats', 0) + + mata: __gtools_transform_cumsum = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_cumsign = J(1, `:list sizeof __gtools_gst_stats', 0) + mata: __gtools_transform_cumvars = J(1, `:list sizeof __gtools_gst_stats' + 1, 0) + + mata: __gtools_transform_aux8_shift = J(1, `:list sizeof __gtools_gst_stats', 0) + + forvalues l = 1 / `:list sizeof gs' { + local gstat: word `l' of `gs' + encode_stat_allowed `gstat' 0 + mata: __gtools_transform_statcode[`l'] = `r(statcode)' + } + + local bwarn4 = 0 + local bwarn5 = 0 + + local rwarn = 0 + local iwarn = 0 + + local cumvars + forvalues k = 1 / `:list sizeof __gtools_gst_stats' { + local stat: word `k' of `__gtools_gst_stats' + + if ( "`stat'" == "standardize" ) local statcode -1 + else if ( "`stat'" == "normalize" ) local statcode -1 + else if ( "`stat'" == "demean" ) local statcode -2 + else if ( "`stat'" == "demedian" ) local statcode -3 + else if ( "`stat'" == "rank" ) local statcode -6 + else local statcode 0 + + * moving matrices + encode_moving `stat' + local rwarn = `rwarn' | `r(warn)' + if ( `r(match)' ) { + if ( `r(scode)' == 0 ) { + disp as err "gstats_transform: moving parsing error; unknown substat" + exit 198 + } + local statcode -4 + local bwarn4 = 1 + mata: __gtools_transform_moving[`k'] = `r(scode)' + mata: __gtools_transform_moving_l[`k'] = `r(lower)' + mata: __gtools_transform_moving_u[`k'] = `r(upper)' + } + + * interval matrices + encode_range `stat' + local iwarn = `iwarn' | `r(warn)' + if ( `r(match)' ) { + if ( `r(scode)' == 0 ) { + disp as err "gstats_transform: range parsing error; unknown substat" + exit 198 + } + local statcode -5 + local bwarn5 = 1 + mata: __gtools_transform_range[`k'] = `r(scode)' + mata: __gtools_transform_range_l[`k'] = `r(lower)' + mata: __gtools_transform_range_u[`k'] = `r(upper)' + mata: 
__gtools_transform_range_ls[`k'] = `r(lcode)' + mata: __gtools_transform_range_us[`k'] = `r(ucode)' + mata: __gtools_transform_range_pos[`k'] = `:list posof "`r(var)'" in rangevars' + } + + * cumsum matrices + encode_cumsum `stat' + if ( `r(match)' ) { + local statcode -7 + local cumvars `cumvars' `r(cumvars)' + if ( !inlist(`:word count `r(cumvars)'', 0, 1) ) { + disp as err "gstats_transform: cumby for multiple variables not implemented" + exit 198 + } + mata: __gtools_transform_cumsum[`k'] = 1 + mata: __gtools_transform_cumsign[`k'] = `r(cumsign)' + mata: __gtools_transform_cumvars[`=`k'+1'] = `:list sizeof cumvars' + } + + * shift matrices + encode_shift `stat' + if ( `r(match)' ) { + local statcode -8 + mata: __gtools_transform_aux8_shift[`k'] = `r(shift)' + } + + * other matrices + if ( `statcode' == 0 ) { + disp as err "gstats_transform: unknown stat `stat'" + exit 198 + } + + mata: __gtools_transform_varfuns[`k'] = `statcode' + if ( !`:list statcode in gs_nostats_codes' ) { + forvalues l = 1 / `:list sizeof gs' { + local gstat: word `l' of `gs' + forvalues m = 1 / `:list sizeof gs_`stat'' { + mata: __gtools_transform_statmap[`k', `m'] = `:list posof "`:word `m' of `gs_`stat'''" in gs' + } + } + + cap mata: assert(all(rowsum(__gtools_transform_statmap[`k', .] :> 0) :> 0)) + if ( _rc ) { + disp as err "gstats_transform: error parsing transform mappings" + exit 198 + } + } + } + + * if ( `bwarn4' ) { + * disp as txt "{bf:warning}: requested transform {bf:'moving'} is in beta" + * } + * + * if ( `bwarn5' ) { + * disp as txt "{bf:warning}: requested transform {bf:'range'} is in beta" + * } + + if ( `rwarn' ) { + disp as txt "{bf:note:} requested moving statistic without a window" + } + + if ( `iwarn' ) { + disp as txt "{bf:note:} requested range statistic without an interval" + } + + * NOTE(mauricio): Unlike gcollapse, here we can't really have a set + * of unique sources that get mapped to multiple targets because each + * source gets transformed! 
So you will need to read each source in + * unmodified for as many targets as you have. + + * TODO: strL support + local cumvars `cumvars' + if ( `"`cumvars'"' != "" ) { + GenericParseTypes `cumvars', mat(__gtools_transform_cumtypes) + forvalues i = 1 / `:list sizeof cumvars' { + cap mata assert(st_matrix("__gtools_transform_cumtypes")[`i'] :<= 0) + if ( _rc ) { + disp as err "gstats_transform: cumby for string types not implemented" + exit 198 + } + } + } + + mata { + if ( (`"`excludeself'"' != "") & any(__gtools_transform_varfuns :!= -5) ) { + if ( all(__gtools_transform_varfuns :!= -5) ) { + printf("gstats_transform: option -excludeself- not allowed (only with transform range)\n") + _error(198) + } + else { + printf("gstats_transform: excludeself ignored for stats other than range\n") + } + } + } + + * Return varlist for plugin internals + * ----------------------------------- + scalar __gtools_transform_greedy = (`"`greedy'"' != "nogreedy") + scalar __gtools_transform_kvars = `:list sizeof __gtools_gst_vars' + scalar __gtools_transform_ktargets = `:list sizeof __gtools_gst_targets' + scalar __gtools_transform_kgstats = `:list sizeof gs' + scalar __gtools_gstats_code = 3 + scalar __gtools_transform_cumsum_k = `:list sizeof cumvars' + scalar __gtools_transform_range_k = `:list sizeof rangevars' + scalar __gtools_transform_range_xs = (`"`excludeself'"' != "") + scalar __gtools_transform_range_xb = (`"`excludebounds'"' != "") + + mata: st_matrix("__gtools_transform_rank_ties", strtoreal(tokens(st_local("__gtools_gst_tcodes")))) + mata: st_matrix("__gtools_transform_varfuns", __gtools_transform_varfuns) + mata: st_matrix("__gtools_transform_statmap", __gtools_transform_statmap) + mata: st_matrix("__gtools_transform_statcode", __gtools_transform_statcode) + + mata: st_matrix("__gtools_transform_moving", __gtools_transform_moving) + mata: st_matrix("__gtools_transform_moving_l", __gtools_transform_moving_l) + mata: st_matrix("__gtools_transform_moving_u", 
__gtools_transform_moving_u) + + mata: st_matrix("__gtools_transform_range", __gtools_transform_range) + mata: st_matrix("__gtools_transform_range_pos", __gtools_transform_range_pos) + mata: st_matrix("__gtools_transform_range_l", __gtools_transform_range_l) + mata: st_matrix("__gtools_transform_range_u", __gtools_transform_range_u) + mata: st_matrix("__gtools_transform_range_ls", __gtools_transform_range_ls) + mata: st_matrix("__gtools_transform_range_us", __gtools_transform_range_us) + + mata: st_matrix("__gtools_transform_cumsum", __gtools_transform_cumsum) + mata: st_matrix("__gtools_transform_cumsign", __gtools_transform_cumsign) + mata: st_matrix("__gtools_transform_cumvars", __gtools_transform_cumvars) + + mata: st_matrix("__gtools_transform_aux8_shift", __gtools_transform_aux8_shift) + + c_local varlist `__gtools_gst_vars' `__gtools_gst_targets' `rangevars' `cumvars' + + * Check if any of the target is any type of source + local common: list cumvars | rangevars + local common: list common | __gtools_gst_vars + c_local gstats_replace_anysrc: list common & __gtools_gst_targets + + * Potential intensive operations: Add, recast targets + * --------------------------------------------------- + + if ( `kadd' ) { + mata: (void) st_addvar(tokens(`"`__gtools_gst_addtypes'"'), tokens(`"`__gtools_gst_addvars'"')) + } + + if ( `krecast' ) { + scalar __gtools_k_recast = `krecast' + cap noi plugin call gtools_plugin `recast_targets' `recast_sources', recast + local rc = _rc + cap scalar drop __gtools_k_recast + if ( `rc' ) { + exit `rc' + } + } + + mata __gtools_gst_dropvars = tokens(`"`__gtools_gst_dropvars'"') + forvalues k = 1 / `ktargets' { + mata: st_varlabel( `"`:word `k' of `__gtools_gst_targets''"', __gtools_gst_labels[`k']) + mata: st_varformat(`"`:word `k' of `__gtools_gst_targets''"', __gtools_gst_formats[`k']) + } + + c_local gstats_replace: copy local replace + c_local gstats_init: copy local init + c_local gstats_greedy: copy local greedy +end + +* NOTE: 
Copy/paste from gcollapse.ado/parse_vars + +capture program drop gstats_transform_parse +program gstats_transform_parse + syntax anything(equalok), /// + [ /// + WILDparse /// parse assuming wildcard renaming + autorename /// automagically name targets if no target is specified + autorenameformat(str) /// + window(passthru) /// moving window if not specified in the stat + interval(passthru) /// interval if not specified in the stat + cumby(passthru) /// cummulative sum by +/- and varlst if not specified in the stat + shiftby(passthru) /// Shift by +/-# + statprefix(passthru) /// add prefix to every stat + labelformat(str) /// label prefix + labelprogram(str) /// label program + ] + + * Parse call into list of sources, targets, stats + * ----------------------------------------------- + + local opts prefix(__gtools_gst) default(demean) + local passthru `window' `interval' `cumby' `shiftby' `statprefix' + if ( "`wildparse'" != "" ) { + local rc = 0 + + ParseListWild `anything', loc(__gtools_gst_call) `opts' `passthru' + + local __gtools_bak_stats : copy local __gtools_gst_stats + local __gtools_bak_vars : copy local __gtools_gst_vars + local __gtools_bak_targets : copy local __gtools_gst_targets + local __gtools_bak_uniq_stats : copy local __gtools_gst_uniq_stats + local __gtools_bak_uniq_vars : copy local __gtools_gst_uniq_vars + + ParseList `__gtools_gst_call', `opts' `passthru' + + cap assert ("`__gtools_gst_stats'" == "`__gtools_bak_stats'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gst_vars'" == "`__gtools_bak_vars'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gst_targets'" == "`__gtools_bak_targets'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gst_uniq_stats'" == "`__gtools_bak_uniq_stats'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gst_uniq_vars'" == "`__gtools_bak_uniq_vars'") + local rc = max(_rc, `rc') + + if ( `rc' ) { + disp as error "gstats_transform_parse: Wild parsing inconsistent with standard 
parsing." + exit 198 + } + } + else { + ParseList `anything', `opts' `passthru' + } + + if ( `"`autorenameformat'"' != "" ) local autorename autorename + if ( `"`autorenameformat'"' == "" ) local autorenameformat #source#_#stat# + + if ( `"`autorename'"' != "" ) { + local targets + forvalues k = 1 / `:list sizeof __gtools_gst_vars' { + local stat: word `k' of `__gtools_gst_stats' + local var: word `k' of `__gtools_gst_vars' + local target: word `k' of `__gtools_gst_targets' + if ( `"`var'"' == `"`target'"' ) { + local sname = strtoname(`"`stat'"') + local autoname: subinstr local autorenameformat "#source#" "`var'", all + local autoname: subinstr local autoname "#stat#" "`sname'", all + local targets `targets' `autoname' + } + else { + local targets `targets' `target' + } + } + local __gtools_gst_targets: copy local targets + } + + unab __gtools_gst_vars: `__gtools_gst_vars' + unab __gtools_gst_uniq_vars: `__gtools_gst_uniq_vars' + local __gtools_gst_uniq_targets: list uniq __gtools_gst_targets + + if ( !`:list __gtools_gst_uniq_targets === __gtools_gst_targets' ) { + disp as err "gstats_transform_parse: repeat targets found in function call" + exit 198 + } + + * Get format and labels from sources + * ---------------------------------- + + if ( "`labelformat'" == "") local labelformat "(#stat#) #sourcelabel#" + + local lnice_regex "(.*)(#stat:pretty#)(.*)" + local lpre_regex "(.*)(#stat#)(.*)" + local lPre_regex "(.*)(#Stat#)(.*)" + local lPRE_regex "(.*)(#STAT#)(.*)" + local ltxt_regex "(.*)(#sourcelabel#)(.*)" + local lsub_regex "(.*)#sourcelabel:([0-9]+):([.0-9]+)#(.*)" + + mata: __gtools_gst_labels = J(1, `:list sizeof __gtools_gst_targets', "") + mata: __gtools_gst_formats = J(1, `:list sizeof __gtools_gst_targets', "") + forvalues k = 1 / `:list sizeof __gtools_gst_targets' { + local vl = `"`:variable label `:word `k' of `__gtools_gst_vars'''"' + local vl = cond(`"`vl'"' == "", `"`:word `k' of `__gtools_gst_vars''"', `"`vl'"') + local vp = `"`:word `k' of 
`__gtools_gst_stats''"' + + if ( "`labelprogram'" == "" ) GtoolsPrettyStat `vp' + else `labelprogram' `vp' + local vpretty = `"`r(prettystat)'"' + + if ( `"`vpretty'"' == "#default#" ) { + GtoolsPrettyStat `vp' + local vpretty = `"`r(prettystat)'"' + } + + local lfmt_k = `"`labelformat'"' + + if ( "`vp'" == "freq" ) { + if !regexm(`"`vl'"', "`ltxt_regex'") { + while regexm(`"`lfmt_k'"', "`ltxt_regex'") { + local lfmt_k = regexs(1) + `""' + regexs(3) + } + } + if !regexm(`"`vl'"', "`lsub_regex'") { + while regexm(`"`lfmt_k'"', "`lsub_regex'") { + local lfmt_k = regexs(1) + `""' + regexs(4) + } + } + } + else { + if !regexm(`"`vl'"', "`ltxt_regex'") { + while regexm(`"`lfmt_k'"', "`ltxt_regex'") { + local lfmt_k = regexs(1) + `"`vl'"' + regexs(3) + } + } + if !regexm(`"`vl'"', "`lsub_regex'") { + while regexm(`"`lfmt_k'"', "`lsub_regex'") { + local lfmt_k = regexs(1) + substr(`"`vl'"', `:di regexs(2)', `:di regexs(3)') + regexs(4) + } + } + } + + if !regexm(`"`vpretty'"', "`lnice_regex'") { + while regexm(`"`lfmt_k'"', "`lnice_regex'") { + local lfmt_k = regexs(1) + `"`vpretty'"' + regexs(3) + } + } + if !regexm(`"`vp'"', "`lpre_regex'") { + while regexm(`"`lfmt_k'"', "`lpre_regex'") { + local lfmt_k = regexs(1) + `"`vp'"' + regexs(3) + } + } + if !regexm(`"`vp'"', "`lPre_regex'") { + while regexm(`"`lfmt_k'"', "`lPre_regex'") { + local lfmt_k = regexs(1) + proper(`"`vp'"') + regexs(3) + } + } + if !regexm(`"`vp'"', "`lPRE_regex'") { + while regexm(`"`lfmt_k'"', "`lPRE_regex'") { + local lfmt_k = regexs(1) + upper(`"`vp'"') + regexs(3) + } + } + mata: __gtools_gst_labels[`k'] = `"`lfmt_k'"' + + local vf = "`:format `:word `k' of `__gtools_gst_vars'''" + local vf = cond(inlist(`"`:word `k' of `__gtools_gst_stats''"', "count", "freq", "nunique", "nmissing"), "%8.0g", "`vf'") + mata: __gtools_gst_formats[`k'] = "`vf'" + } + + * Locals one level up + * ------------------- + + c_local __gtools_gst_targets : copy local __gtools_gst_targets + c_local __gtools_gst_vars : 
copy local __gtools_gst_vars + c_local __gtools_gst_stats : copy local __gtools_gst_stats + c_local __gtools_gst_uniq_vars : copy local __gtools_gst_uniq_vars + c_local __gtools_gst_uniq_stats : copy local __gtools_gst_uniq_stats +end +
+* ---------------------------------------------------------------------
+* gstats_transform_types
+*
+* For every (source, stat, target) triple, work out the storage type of
+* the target, whether an existing target has an unsuitable type, and
+* the tie-break code used by the rank statistic.
+*
+* Options
+*     vars(str)     source variables; must exist and be numeric
+*     targets(str)  target names, read in parallel with vars()
+*     stats(str)    statistic names, read in parallel with vars()
+*     prefix(str)   prefix of the locals handed back to the caller
+*     types(str)    optional; one type for all targets or one per target
+*     ties(str)     optional; one tie-break rule or one per target
+*
+* Returns, via c_local in the caller's scope
+*     `prefix'_tcodes   tie-break codes (1 default, 2 field, 3 track,
+*                       4 unique, 5 stableunique), one per target
+*     `prefix'_types    target storage types
+*     `prefix'_retype   1 if the target does not exist or its current
+*                       type is unsuitable, 0 otherwise
+*
+* Exits with code 198 on any invalid input.
+* ---------------------------------------------------------------------
+
+capture program drop gstats_transform_types
+program gstats_transform_types
+    syntax, vars(str) targets(str) stats(str) prefix(str) [types(str) ties(str)]
+
+    * Check all inputs are numeric
+    * ----------------------------
+
+    cap confirm var `vars'
+    if ( _rc ) {
+        disp as err "gstats_transform_types: sources must exist"
+        exit 198
+    }
+
+    cap confirm numeric var `vars'
+    if ( _rc ) {
+        disp as err "gstats_transform_types: numeric sources required"
+        exit 198
+    }
+
+    local sametype standardize ///
+                   normalize   ///
+                   demean      ///
+                   demedian
+
+    * Keep the user's types() aside before the working list is reset;
+    * clearing `types' directly discarded the option, which made every
+    * explicit-type branch below unreachable.
+    local usertypes `types'
+
+    local upgrade
+    local types
+    local retype
+
+    * targets is consumed token by token further down, so count it now
+    local ktargets: list sizeof targets
+    local kties:    list sizeof ties
+
+    * Special parsing for rank type
+    * -----------------------------
+
+    if ( (`kties' > 1) & (`kties' != `ktargets') ) {
+        disp as err "gstats_transform_types: only one tie-break or one tie-break per target in ties()"
+        exit 198
+    }
+
+    if ( scalar(__gtools_weight_code) > 0 ) {
+        local mintype_rank double
+    }
+    else if ( `=_N' < maxbyte() ) {
+        local mintype_rank byte
+    }
+    else if ( `=_N' < maxint() ) {
+        local mintype_rank int
+    }
+    else if ( `=_N' < maxlong() ) {
+        local mintype_rank long
+    }
+    else {
+        local mintype_rank double
+    }
+
+    local tcodes
+    local rtypes
+    local default      d de def defa defau defaul default .
+    local field        f fi fie fiel field
+    local track        t tr tra trac track
+    local unique       u un uni uniq uniqu unique
+    local stableunique s st sta stab stabl stable stableu stableun stableuni stableuniq stableuniqu stableunique
+
+    * A single rule (or none, meaning default) applies to every target;
+    * expand it so that both cases go through the same loop.
+    if ( `kties' > 1 ) {
+        local tielist `ties'
+    }
+    else {
+        local tie1 `ties'
+        if ( `"`tie1'"' == "" ) local tie1 default
+        local tielist
+        forvalues k = 1 / `ktargets' {
+            local tielist `tielist' `tie1'
+        }
+    }
+
+    * Result type for the default rule: double whenever an integer rank
+    * would need long or double, else the session's default type. This is
+    * the rule the single-rule path already used; the per-target path
+    * previously fell back to the default type for long as well.
+    local rtype_avg = cond(inlist(`"`mintype_rank'"', "long", "double"), "double", "`:set type'")
+
+    foreach t of local tielist {
+        if ( `:list t in default' ) {
+            local ties_code = 1
+            local rtype `rtype_avg'
+        }
+        else if ( `:list t in field' ) {
+            local ties_code = 2
+            local rtype `mintype_rank'
+        }
+        else if ( `:list t in track' ) {
+            local ties_code = 3
+            local rtype `mintype_rank'
+        }
+        else if ( `:list t in unique' ) {
+            local ties_code = 4
+            local rtype `mintype_rank'
+        }
+        else if ( `:list t in stableunique' ) {
+            * was tested against the whole ties list, which never
+            * matched when one rule per target was given
+            local ties_code = 5
+            local rtype `mintype_rank'
+        }
+        else {
+            disp as err "gstats_transform_types: ties(`t') not allowed"
+            exit 198
+        }
+        local tcodes `tcodes' `ties_code'
+        local rtypes `rtypes' `rtype'
+    }
+
+    * If types are empty, autoretype; else use user input
+    * ---------------------------------------------------
+
+    * NOTE(mauricio): retype is 1 if the target exists and the type is
+    * unsuitable or if the target does not exist (since "" will not
+    * equal any named type). In the former case retype is necessary,
+    * in the latter retype will get ignored and a new variable will be
+    * created.
+
+    if ( `"`usertypes'"' == "" ) {
+        forvalues k = 1 / `:list sizeof vars' {
+            gettoken var    vars:    vars
+            gettoken target targets: targets
+            gettoken stat   stats:   stats
+            gettoken rtype  rtypes:  rtypes
+
+            * re-assigning strips surrounding blanks left by gettoken
+            local var    `var'
+            local target `target'
+            local stat   `stat'
+            local rtype  `rtype'
+            local type: type `var'
+
+            cap confirm var `target'
+            if ( _rc ) local ttype
+            else local ttype: type `target'
+
+            * rmatch records whether any of the special stats matched
+            encode_moving `stat'
+            local rmatch = `r(match)'
+            if ( `r(match)' ) {
+                encode_stat_types `r(stat)' `type' `ttype'
+                local types  `types'  `r(type)'
+                local retype `retype' `r(retype)'
+            }
+
+            encode_range `stat'
+            local rmatch = `r(match)' | `rmatch'
+            if ( `r(match)' ) {
+                encode_stat_types `r(stat)' `type' `ttype'
+                local types  `types'  `r(type)'
+                local retype `retype' `r(retype)'
+            }
+
+            encode_cumsum `stat'
+            local rmatch = `r(match)' | `rmatch'
+            if ( `r(match)' ) {
+                local types  `types'  double
+                local retype `retype' `=!inlist("`ttype'", "double")'
+            }
+
+            * shift follows the same retype logic as min, max, first, last, etc.
+            encode_shift `stat'
+            local rmatch = `r(match)' | `rmatch'
+            if ( `r(match)' ) {
+                encode_stat_types first `type' `ttype'
+                local types  `types'  `type'
+                local retype `retype' `r(retype)'
+            }
+
+            if ( `"`stat'"' == "rank" ) {
+                local types `types' `rtype'
+                if ( inlist("`ttype'", "`rtype'", "double") ) {
+                    local retype `retype' 0
+                }
+                else if ( "`ttype'" == "float" ) {
+                    local retype `retype' `=inlist("`rtype'", "long", "double")'
+                }
+                else if ( "`ttype'" == "long" ) {
+                    local retype `retype' `=inlist("`rtype'", "double")'
+                }
+                else if ( "`ttype'" == "int" ) {
+                    local retype `retype' `=!inlist("`rtype'", "int", "byte")'
+                }
+                else if ( "`ttype'" == "byte" ) {
+                    local retype `retype' `=!inlist("`rtype'", "byte")'
+                }
+                else if ( "`ttype'" == "" ) {
+                    local retype `retype' 1
+                }
+                else {
+                    disp as err "gstats_transform_types: Unable to parse type '`ttype''"
+                    exit 198
+                }
+                local rmatch = 1
+            }
+
+            * Plain transforms: the type follows the source type
+            if ( `rmatch' == 0 ) {
+                if inlist(`"`type'"', "long") {
+                    local types  `types'  double
+                    local retype `retype' `=!inlist("`ttype'", "double")'
+                }
+                else if inlist(`"`type'"', "int", "byte") {
+                    local types  `types'  `:set type'
+                    local retype `retype' `=!inlist("`ttype'", "`:set type'", "double")'
+                }
+                else {
+                    if ( `:list stat in sametype' ) {
+                        local types  `types'  `type'
+                        local retype `retype' `=!inlist("`ttype'", "`type'", "double")'
+                    }
+                    else if ( `:list stat in upgrade' ) {
+                        local types  `types'  double
+                        local retype `retype' `=!inlist("`ttype'", "double")'
+                    }
+                    else {
+                        disp as err "gstats_transform_types: Unknown stat found in function call"
+                        exit 198
+                    }
+                }
+            }
+        }
+    }
+    else if ( `:list sizeof usertypes' == 1 ) {
+        * One type for every target. Append the single user type once per
+        * target; appending the working list to itself doubled it on each
+        * pass and broke the comparison against the target's type.
+        forvalues k = 1 / `ktargets' {
+            local target: word `k' of `targets'
+            cap confirm var `target'
+            if ( _rc ) local ttype
+            else local ttype: type `target'
+
+            local types  `types'  `usertypes'
+            local retype `retype' `=("`ttype'" != "`usertypes'")'
+        }
+    }
+    else if ( `:list sizeof usertypes' != `ktargets' ) {
+        disp as err "gstats_transform_types: types() must be a single input or one input per target"
+        exit 198
+    }
+    else {
+        * One type per target, taken verbatim from the user
+        local types `usertypes'
+        forvalues k = 1 / `ktargets' {
+            local tcmp:   word `k' of `usertypes'
+            local target: word `k' of `targets'
+            cap confirm var `target'
+            if ( _rc ) local ttype
+            else local ttype: type `target'
+
+            local retype `retype' `=("`ttype'" != "`tcmp'")'
+        }
+    }
+
+    c_local `prefix'_tcodes: copy local tcodes
+    c_local `prefix'_types:  copy local types
+    c_local `prefix'_retype: copy local retype
+end
+ +capture program drop gstats_hdfe +program gstats_hdfe + syntax anything(equalok), /// + absorb(varlist) /// + [ /// + PREfix(str) /// generate variables with specified prefix + GENerate(str) /// generate specified variables + replace /// replace variables, if they exist + noinit /// do not initialize targets with missing values + WILDparse /// parse assuming wildcard renaming + /// + ABSORBMISSing /// absorb missing levels + algorithm(str) /// alias for method + method(str) /// projection method + /// map (method of alternating projections) + /// squarem + /// conjugate gradient|cg + /// it|irons tuck + STANdardize /// standardize before applying transform + TRACEiter /// trace iteration progress + maxiter(real 100000) /// maximum number of iterations + TOLerance(real 1e-8) /// tolerance for hdfe convergence + MATAsave /// save by vars/levels in mata + MATAsavename(str) /// name of mata object + /// + individual /// do not drop missing rows case-wise + ] + + if ( ("`algorithm'" != "") & ("`method'" != "") ) { + disp as err "gstats_hdfe: method() is an alias for algorithm(); specify only one" + exit 198 + } + if ( `"`algorithm'"' == "" ) local algorithm cg + if ( `"`method'"' != "" ) local algorithm: copy local method + local method: copy local algorithm + + if ( "`individual'" != "" ) { + disp as err "gstats_hdfe: option -individual- not implemented; values dropped row-wise" + exit 198 + } + + if ( `maxiter' < 1 ) { + disp as err "gstats_hdfe: maxiter() must be
>= 1" + exit 198 + } + + if ( missing(`maxiter') ) local maxiter 0 + local maxiter = floor(`maxiter') + + local __gtools_byvars: copy global GTOOLS_BYNAMES + * NB: This is tested internally against gregress; should be OK? + * if ( "${GTOOLS_HDFEBY}" != "1" ) { + * if ( "`__gtools_byvars'" != "" ) { + * disp as err "gstats hdfe with by() has {bf:NOT} been tested. Try it at your own risk via" + * disp as err "" + * disp as err " global GTOOLS_HDFEBY = 1" + * exit 198 + * } + * } + + if ( `"`matasavename'"' != "" ) local matasave matasave + if ( `"`matasavename'"' == "" ) local matasavename GtoolsByLevels + + if ( lower(`"`method'"') == "map" ) { + local method_code 1 + local method map + } + else if ( lower(`"`method'"') == "squarem" ) { + local method_code 2 + local method squarem + } + else if ( inlist(lower(`"`method'"'), "conjugate gradient", "conjugate_gradient", "cg") ) { + local method_code 3 + local method cg + } + else if ( inlist(lower(`"`method'"'), "irons and tuck", "irons tuck", "irons_tuck", "it") ) { + local method_code 5 + local method it + } + else if ( inlist(lower(`"`method'"'), "bit", "berge_it", "berge it") ) { + * TODO: gives segfault on some runs last I checked; debug someday. + * Option is undocumented but I leave it here for myself. + local method_code 6 + local method bit + } + else { + disp as err "gstats_hdfe: method() must be one of: map, squarem, cg, it" + exit 198 + } + + * --------------------------------------------------------------------- + * Parse absorb + * --------------------------------------------------------------------- + + local absorb_uniq: list uniq absorb + if ( `:list sizeof absorb_uniq' < `:list sizeof absorb' ) { + disp as txt "warning: duplicate variables in absorb()" + } + + * TODO: strL support + GenericParseTypes `absorb', mat(__gtools_hdfe_abstyp) + matrix __gtools_hdfe_nabsorb = J(1, `:list sizeof absorb', .) 
+ + * --------------------------------------------------------------------- + * Parse variable targets + * --------------------------------------------------------------------- + + local gen_clist = strpos(`"`anything'"', "=") > 0 + local gen_prefix = ("`prefix'" != "") + local gen_direct = ("`generate'" != "") + local gen_replace = ("`replace'" != "") + + if ( !`gen_clist' & !`gen_direct' & !`gen_prefix' & !`gen_replace' ) { + disp as err "gstats_hdfe: No targets specified and no replace." + exit 198 + } + + if ( `gen_clist' & `gen_direct' ) { + disp as err "gstats_hdfe: Cannot specify both generate() and target=source syntax" + exit 198 + } + + + if ( `gen_replace' & ((`gen_clist' + `gen_direct' + `gen_prefix') == 0) ) { + confirm numeric var `anything' + unab __gtools_hdfe_vars: `anything' + local __gtools_hdfe_targets: copy local __gtools_hdfe_vars + } + + if ( `gen_direct' ) { + confirm numeric var `anything' + unab __gtools_hdfe_vars: `anything' + local __gtools_hdfe_targets: copy local generate + } + + local opts prefix(__gtools_hdfe) default(hdfe) + if ( `gen_clist' ) { + if ( "`wildparse'" != "" ) { + local rc = 0 + + ParseListWild `anything', loc(__gtools_hdfe_call) `opts' + + local __gtools_bak_stats : copy local __gtools_hdfe_stats + local __gtools_bak_vars : copy local __gtools_hdfe_vars + local __gtools_bak_targets : copy local __gtools_hdfe_targets + local __gtools_bak_uniq_stats : copy local __gtools_hdfe_uniq_stats + local __gtools_bak_uniq_vars : copy local __gtools_hdfe_uniq_vars + + ParseList `__gtools_hdfe_call', `opts' + + cap assert ("`__gtools_hdfe_stats'" == "`__gtools_bak_stats'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_hdfe_vars'" == "`__gtools_bak_vars'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_hdfe_targets'" == "`__gtools_bak_targets'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_hdfe_uniq_stats'" == "`__gtools_bak_uniq_stats'") + local rc = max(_rc, `rc') + + cap assert 
("`__gtools_hdfe_uniq_vars'" == "`__gtools_bak_uniq_vars'") + local rc = max(_rc, `rc') + + if ( `rc' ) { + disp as error "gstats_hdfe: Wild parsing inconsistent with standard parsing." + exit 198 + } + } + else { + ParseList `anything', `opts' + } + } + + if ( `gen_clist' & `gen_prefix' ) { + local _targets + forvalues k = 1 / `:list sizeof __gtools_hdfe_vars' { + local target: word `k' of `__gtools_hdfe_targets' + local source: word `k' of `__gtools_hdfe_vars' + if ( "`target'" == "`source'" ) { + local _targets `_targets' `prefix'`target' + } + else { + local _targets `_targets' `target' + } + } + local __gtools_hdfe_targets: copy local _targets + } + else if ( `gen_prefix' ) { + confirm numeric var `anything' + unab __gtools_hdfe_vars: `anything' + local __gtools_hdfe_targets + foreach var of local __gtools_hdfe_vars { + local __gtools_hdfe_targets `__gtools_hdfe_targets' `prefix'`var' + } + } + + foreach var of local __gtools_hdfe_targets { + cap confirm new var `var' + if ( _rc & !`gen_replace' ) { + di as err "Variable `var' exists with no replace." 
+ exit 198 + } + } + + * --------------------------------------------------------------------- + * Parse variable types + * --------------------------------------------------------------------- + + local __gtools_hdfe_types + foreach var of local __gtools_hdfe_vars { + if inlist("`:type `var''", "float", "double") { + local __gtools_hdfe_types `__gtools_hdfe_types' `:type `var'' + } + else { + local __gtools_hdfe_types `__gtools_hdfe_types' `:set type' + } + } + + local kvars: list sizeof __gtools_hdfe_vars + local ktargets: list sizeof __gtools_hdfe_targets + local ktype: list sizeof __gtools_hdfe_types + + local kbad = 0 + local kbad = `kbad' | (`kvars' != `ktargets') + local kbad = `kbad' | (`kvars' != `ktype') + + if ( `kbad' ) { + disp as err "gstats_hdfe: parsing error (inconsistent number of inputs)" + exit 198 + } + + local __gtools_hdfe_uniq: list uniq __gtools_hdfe_vars + if ( `:list sizeof __gtools_hdfe_uniq' != `kvars' ) { + disp as err "gstats_hdfe: Repeat sources not allowed" + exit 198 + } + + * --------------------------------------------------------------------- + * Recast or drop + * --------------------------------------------------------------------- + + local krecast = 0 + local recast_sources + local recast_targets + + local __gtools_hdfe_i = 0 + local __gtools_hdfe_dropvars + + forvalues k = 1 / `ktargets' { + local typek: word `k' of `__gtools_hdfe_types' + local targetk: word `k' of `__gtools_hdfe_targets' + local sourcek: word `k' of `__gtools_hdfe_vars' + + cap confirm new variable `targetk' + if ( _rc ) { + if !inlist("`:type `targetk''", "float", "double") { + cap confirm new variable __gtools_hdfe`__gtools_hdfe_i' + while ( _rc ) { + local ++__gtools_hdfe_i + cap confirm new variable __gtools_hdfe`__gtools_hdfe_i' + } + rename `targetk' __gtools_hdfe`__gtools_hdfe_i' + local __gtools_hdfe_dropvars `__gtools_hdfe_dropvars' __gtools_hdfe`__gtools_hdfe_i' + + if ( "`targetk'" == "`sourcek'" ) { + local recast_sources `recast_sources' 
__gtools_hdfe`__gtools_hdfe_i' + local recast_targets `recast_targets' `targetk' + local ++krecast + } + } + } + } + + * --------------------------------------------------------------------- + * Add target variables + * --------------------------------------------------------------------- + + local kadd = 0 + local __gtools_hdfe_addvars + local __gtools_hdfe_addtypes + forvalues k = 1 / `ktargets' { + local target: word `k' of `__gtools_hdfe_targets' + local type: word `k' of `__gtools_hdfe_types' + cap confirm new variable `target' + if ( _rc == 0 ) { + local ++kadd + local __gtools_hdfe_addvars `__gtools_hdfe_addvars' `target' + local __gtools_hdfe_addtypes `__gtools_hdfe_addtypes' `type' + } + } + + if ( `kadd' ) { + mata: (void) st_addvar(tokens(`"`__gtools_hdfe_addtypes'"'), tokens(`"`__gtools_hdfe_addvars'"')) + } + + if ( `krecast' ) { + scalar __gtools_k_recast = `krecast' + cap noi plugin call gtools_plugin `recast_targets' `recast_sources', recast + local rc = _rc + cap scalar drop __gtools_k_recast + if ( `rc' ) { + exit `rc' + } + } + + mata st_dropvar(tokens(`"`__gtools_hdfe_dropvars'"')) + + * --------------------------------------------------------------------- + * Scalars and locals for C internals + * --------------------------------------------------------------------- + + scalar __gtools_hdfe_methodname = cond(`:list sizeof absorb' > 1, "`method'", "direct") + scalar __gtools_hdfe_method = `method_code' + scalar __gtools_hdfe_mataname = `"`matasavename'"' + scalar __gtools_hdfe_matasave = `"`matasave'"' != "" + scalar __gtools_hdfe_kvars = `kvars' + scalar __gtools_hdfe_absorb = `:list sizeof absorb' + scalar __gtools_hdfe_hdfetol = `tolerance' + scalar __gtools_hdfe_maxiter = `maxiter' + scalar __gtools_hdfe_traceiter = "`traceiter'" != "" + scalar __gtools_hdfe_standard = "`standardize'" != "" + scalar __gtools_gstats_code = 4 + scalar __gtools_hdfe_iter = cond(`:list sizeof absorb' > 1, 1, 0) + scalar __gtools_hdfe_feval = cond(`:list sizeof 
absorb' > 1, 1, 0) + + if "`absorbmissing'" != "" c_local __gtools_hdfe_markvars `__gtools_hdfe_vars' + else c_local __gtools_hdfe_markvars `__gtools_hdfe_vars' `absorb' + + c_local varlist `__gtools_hdfe_vars' `__gtools_hdfe_targets' `absorb' + + c_local gstats_replace: copy local replace + c_local gstats_init: copy local init + +* TODO: xx formats and labels for targets? +* forvalues k = 1 / `ktargets' { +* mata: st_varlabel( `"`:word `k' of `__gtools_hdfe_targets''"', __gtools_hdfe_labels[`k']) +* mata: st_varformat(`"`:word `k' of `__gtools_hdfe_targets''"', __gtools_hdfe_formats[`k']) +* } + +end +
+* ---------------------------------------------------------------------
+* gstats_winsor
+*
+* Parse a winsorize/trim request: validate the cut points, decide the
+* target variable names, create the targets that do not exist yet and
+* copy label and format from each source to its target.
+*
+* Options
+*     Suffix(str), Prefix(str)  name targets as prefix + source + suffix
+*     GENerate(str)             explicit target names, one per source
+*     Trim                      trim instead of winsorize
+*     Cuts(str)                 lower and upper percentile (default 1 99)
+*     Label                     append the cut points to the target label
+*     replace                   allow existing targets to be overwritten
+*     noinit                    passed back to the caller as gstats_init
+*
+* Sets scalars __gtools_winsor_trim, __gtools_winsor_cutl,
+* __gtools_winsor_cuth, __gtools_winsor_kvars and __gtools_gstats_code (1).
+*
+* Returns, via c_local: varlist (sources followed by targets),
+* gstats_replace, gstats_init and gstats_replace_anysrc (variables that
+* are both a source and a target).
+* ---------------------------------------------------------------------
+
+capture program drop gstats_winsor
+program gstats_winsor
+    syntax varlist(numeric), [ ///
+        Suffix(str)            ///
+        Prefix(str)            ///
+        GENerate(str)          ///
+        Trim                   ///
+        Cuts(str)              ///
+        Label                  ///
+        replace                ///
+        noinit                 ///
+    ]
+
+    * Default is winsorize or trim 1st or 99th pctile
+    local trim = ( `"`trim'"' != "" )
+    if ( `"`cuts'"' == "" ) {
+        local cutl = 1
+        local cuth = 99
+    }
+    else {
+        gettoken cutl cuth: cuts
+        cap noi confirm number `cutl'
+        if ( _rc ) {
+            disp as err "you must pass two percentiles to option -cuts()-"
+            exit _rc
+        }
+
+        cap noi confirm number `cuth'
+        if ( _rc ) {
+            disp as err "you must pass two percentiles to option -cuts()-"
+            exit _rc
+        }
+
+        if ( (`cutl' < 0) | (`cutl' > 100) | (`cuth' < 0) | (`cuth' > 100) ) {
+            disp as err "percentiles in -cuts()- must be between 0 and 100"
+            exit 198
+        }
+
+        if ( `cutl' > `cuth' ) {
+            disp as err "specify the lower cutpoint first in -cuts()-"
+            exit 198
+        }
+    }
+    local kvars: list sizeof varlist
+
+    scalar __gtools_winsor_trim  = `trim'
+    scalar __gtools_winsor_cutl  = `cutl'
+    scalar __gtools_winsor_cuth  = `cuth'
+    scalar __gtools_winsor_kvars = `kvars'
+    scalar __gtools_gstats_code  = 1
+
+    * Default is to generate vars with suffix (_w or _tr)
+    if ( `"`prefix'`suffix'`generate'"' == "" ) {
+        local ngen = 0
+        if ( `trim' ) {
+            local suffix _tr
+        }
+        else {
+            local suffix _w
+        }
+    }
+    else local ngen = (`"`prefix'`suffix'"' != "") + (`"`generate'"' != "")
+
+    * Can only generate variables in one way
+    if ( `ngen' > 1 ) {
+        disp as err "Specify only one of prefix()/suffix() or generate."
+        exit 198
+    }
+
+    * Generate same targets as sources
+    if ( (`"`replace'"' != "") & (`ngen' == 0) ) {
+        local targetvars: copy local varlist
+    }
+    else {
+        * without replace an existing target is an error, so let
+        * -confirm- print its own message in that case
+        if ( `"`replace'"' == "" ) local noi noi
+        if ( `"`prefix'`suffix'"' != "" ) {
+            local genvars
+            local gentypes
+            local targetvars
+            foreach var of varlist `varlist' {
+                local targetvars `targetvars' `prefix'`var'`suffix'
+                cap `noi' confirm new var `prefix'`var'`suffix'
+                if ( _rc & (`"`replace'"' == "") ) {
+                    exit _rc
+                }
+                else if ( _rc == 0 ) {
+                    * only targets that do not exist yet are created
+                    local genvars  `genvars'  `prefix'`var'`suffix'
+                    local gentypes `gentypes' `:type `var''
+                }
+            }
+        }
+        else if ( `"`generate'"' != "" ) {
+            local kgen: list sizeof generate
+            if ( `kgen' != `kvars' ) {
+                disp as err "Specify the same number of targets as sources with -generate()-"
+                exit 198
+            }
+
+            local targetvars: copy local generate
+            local genvars
+            local gentypes
+            forvalues i = 1 / `kvars' {
+                local var:  word `i' of `varlist'
+                local gvar: word `i' of `generate'
+                cap `noi' confirm new var `gvar'
+                if ( _rc & (`"`replace'"' == "") ) {
+                    exit _rc
+                }
+                else if ( _rc == 0 ) {
+                    local genvars  `genvars'  `gvar'
+                    local gentypes `gentypes' `:type `var''
+                }
+            }
+        }
+        else {
+            disp as err "Invalid call in gtools/gstats/winsor"
+            exit 198
+        }
+
+        mata: (void) st_addvar(tokens(`"`gentypes'"'), tokens(`"`genvars'"'))
+    }
+
+    * Add to label if applicable
+
+    * gettoken leaves the separating blank at the front of cuth; strip it
+    * before looking at the first character, otherwise a cut such as .9
+    * is never padded to 0.9 in the label.
+    local cutl `cutl'
+    local cuth `cuth'
+    if ( substr("`cutl'", 1, 1) == "." ) local cutl 0`cutl'
+    if ( substr("`cuth'", 1, 1) == "." ) local cuth 0`cuth'
+    if ( "`label'" != "" ) {
+        if ( `trim' ) {
+            local glab `" - Trimmed (p`cutl', p`cuth')"'
+        }
+        else {
+            local glab `" - Winsor (p`cutl', p`cuth')"'
+        }
+    }
+    else local glab `""'
+
+    * Label and copy formats
+    forvalues i = 1 / `kvars' {
+        local var:  word `i' of `varlist'
+        local gvar: word `i' of `targetvars'
+        local vlab: var label `var'
+        * fall back to the variable name when the source has no label
+        if ( `"`vlab'"' == "" ) local vlab `var'
+        label var `gvar' `"`=`"`vlab'"' + `"`glab'"''"'
+        format `:format `var'' `gvar'
+    }
+
+    c_local varlist `varlist' `targetvars'
+    c_local gstats_replace: copy local replace
+    c_local gstats_init: copy local init
+    c_local gstats_replace_anysrc: list varlist & targetvars
+end
+ +capture program drop gstats_summarize +program gstats_summarize + syntax [varlist], [ /// + noDetail /// + Meanonly /// + TABstat /// + /// + SEParator(int 5) /// + /// + COLumns(str) /// + Format /// + POOLed /// + PRETTYstats /// + noPRINT /// + MATAsave /// + MATAsavename(str) /// + save /// + * /// + ] + + if ( `"`matasavename'"' != "" ) local matasave matasave + if ( `"`matasavename'"' == "" ) local matasavename GstatsOutput + + if ( `"`options'"' != "" ) { + disp as err "Unknown options (note not all display options are not allowed):" + disp as err " `options'" + exit 198 + } + + if ( `"`pooled'"' == "pooled" & (`=scalar(__gtools_weight_code)' > 0) ) { + disp as err "Option -pooled- not allowed with weights" + exit 198 + } + + scalar __gtools_summarize_separator = `separator' + scalar __gtools_summarize_noprint = (`"`print'"' == "noprint") + scalar __gtools_summarize_format = (`"`format'"' == "format") + scalar __gtools_summarize_matasave = (`"`matasave'"' == "matasave") + + if ( "`save'" != "" ) { + disp as err "{bf:Warning}: Option save not implemented; try -matasave-" + } + + * Number of stats to compute + * -------------------------- + + local kstats = 25 + if ( `"`meanonly'"' != "") { + local kstats = 6 + local detail nodetail + } + else
if ( `"`detail'"' == "nodetail" ) { + local kstats = 8 + } + + * Switch to tabstat + * ----------------- + + if ( "`tabstat'" != "" ) { + scalar __gtools_summarize_tabstat = 1 + } + + * Ignore string vars + * ------------------ + + local ignorelist + local statlist + foreach var of varlist `varlist' { + cap confirm numeric variable `var' + if ( _rc ) { + local ignorelist `ignorelist' `var' + } + else { + local statlist `statlist' `var' + } + } + + if ( `:list sizeof statlist' == 0 ) { + disp as err "No numeric variables; nothing to do." + exit 18201 + } + + if ( `:list sizeof ignorelist' > 0 ) { + disp "Ignoring non-numeric variables:" + foreach var of varlist `ignorelist' { + disp _skip(4) `"`var'"' + } + } + + * Stats to compute + * ---------------- + + c_local GstatsMataSave: copy local matasavename + c_local varlist: copy local statlist + + scalar __gtools_gstats_code = 2 + scalar __gtools_summarize_pretty = (`"`prettystats'"' == "prettystats") + scalar __gtools_summarize_pooled = (`"`pooled'"' == "pooled") + scalar __gtools_summarize_normal = (`"`meanonly'"' == "") + scalar __gtools_summarize_detail = (`"`detail'"' != "nodetail") + scalar __gtools_summarize_kvars = `:list sizeof statlist' + scalar __gtools_summarize_kstats = `kstats' + + * -1 // sum + * -2 // mean + * -3 // sd + * -4 // max + * -5 // min + * -6 // count, n + * -7 // percent + * 50 // median + * -9 // iqr + * -10 // first + * -11 // firstnm + * -12 // last + * -13 // lastnm + * -14 // freq + * -15 // semean + * -16 // sebinomial + * -17 // sepoisson + * -18 // nunique + * -19 // skewness + * -20 // kurtosis + * -21 // rawsum + * -22 // nmissing + * -23 // variance + * -24 // cv + * -25 // range + * -26 // geomean + * -27 // gini + * -27.1 // gini|dropneg + * -27.2 // gini|keepneg + * -101 // nansum + * -121 // rawnansum + * -206 // sum weight + * -203 // variance + * 1000 + # // #th smallest + * -1000 - # // #th largest + * 1000.5 + # // raw #th smallest + * -1000.5 - # // raw #th largest + + * 
N sum sum_w mean min max, -299 + * N sum sum_w mean min max sd var, -298 + * N sum mean min max sd, -297 (mainly for defailt tabstat) + + mata: __gtools_summarize_codes = J(1, `kstats', .) + // mata: __gtools_summarize_codes[1] = -299 + mata: __gtools_summarize_codes[1] = -6 // N + mata: __gtools_summarize_codes[2] = -206 // sum_w + mata: __gtools_summarize_codes[3] = -1 // sum + mata: __gtools_summarize_codes[4] = -2 // mean + mata: __gtools_summarize_codes[5] = -5 // min + mata: __gtools_summarize_codes[6] = -4 // max + + if ( `kstats'> 6 ) { + // mata: __gtools_summarize_codes[1] = -298 + mata: __gtools_summarize_codes[7] = -3 // sd + mata: __gtools_summarize_codes[8] = -203 // var, copy previous entry^2 + } + + if ( `kstats' > 8 ) { + mata: __gtools_summarize_codes[9] = 1 // 1st percentile + mata: __gtools_summarize_codes[10] = 5 // 5th percentile + mata: __gtools_summarize_codes[11] = 10 // 10th percentile + mata: __gtools_summarize_codes[12] = 25 // 25th percentile + mata: __gtools_summarize_codes[13] = 50 // 50th percentile + mata: __gtools_summarize_codes[14] = 75 // 75th percentile + mata: __gtools_summarize_codes[15] = 90 // 90th percentile + mata: __gtools_summarize_codes[16] = 95 // 95th percentile + mata: __gtools_summarize_codes[17] = 99 // 99th percentile + mata: __gtools_summarize_codes[18] = -19 // skewness + mata: __gtools_summarize_codes[19] = -20 // kurtosis + mata: __gtools_summarize_codes[20] = 1002 // 2nd smallest + mata: __gtools_summarize_codes[21] = 1003 // 3rd smallest + mata: __gtools_summarize_codes[22] = 1004 // 4th smallest + mata: __gtools_summarize_codes[23] = -1004 // 4th largest + mata: __gtools_summarize_codes[24] = -1003 // 3rd largest + mata: __gtools_summarize_codes[25] = -1002 // 2nd largest + } + + mata: st_matrix("__gtools_summarize_codes", __gtools_summarize_codes) + + * Auto-columns for tab + * -------------------- + + if ( (`kstats' > 8) & (`"`tabstat'"' != "") ) { + disp as txt "({bf:note}: making table with 25 
statistics from {cmd:summarize, detail})" + local coldef variables + } + else { + local coldef statistics + } + + if ( `"`columns'"' == "" ) local columns `coldef' + if ( `"`columns'"' == "var" ) local columns variables + if ( `"`columns'"' == "stat" ) local columns statistics + if ( !inlist(`"`columns'"', "variables", "statistics") ) { + disp as err `"columns(`columns') not allowed. Available: variables, statistics"' + exit 198 + } + scalar __gtools_summarize_colvar = (`"`columns'"' == "variables") +end +
+* ---------------------------------------------------------------------
+* gstats_tabstat
+*
+* Parse a tabstat-style request: validate the options, pick the numeric
+* variables to summarize, translate the requested statistics into the
+* numeric codes stored in matrix __gtools_summarize_codes and set the
+* __gtools_summarize_* scalars.
+*
+* Options (as declared in -syntax- below)
+*     Statistics(str) / stats(str)  aliases; statistics to compute.
+*                                   "n" maps to code -6 and "q" expands
+*                                   to the 25th, 50th and 75th percentile
+*     _sum                          summarize-style call: non-numeric
+*                                   variables are skipped with a note
+*                                   instead of raising an error
+*     noDetail, Meanonly            only meaningful together with _sum
+*     COLumns(str)                  variables|var or statistics|stat
+*     Format(str)                   display format (default %9.0g)
+*     MATAsavename(str)             implies MATAsave; default GstatsOutput
+*
+* Returns, via c_local: GstatsMataSave and varlist (the numeric
+* variables kept).
+* ---------------------------------------------------------------------
+
+capture program drop gstats_tabstat
+program gstats_tabstat
+    syntax [varlist], [ ///
+        noDetail           ///
+        Meanonly           ///
+        TABstat            ///
+                           ///
+        _sum               ///
+        Statistics(str)    ///
+        stats(str)         ///
+        LABELWidth(int 16) ///
+                           ///
+        COLumns(str)       ///
+        Formatvar          ///
+        Format(str)        ///
+        POOLed             ///
+        PRETTYstats        ///
+        noSEParator        ///
+        noPRINT            ///
+        MATAsave           ///
+        MATAsavename(str)  ///
+        save               ///
+        *                  ///
+    ]
+
+    if ( `"`matasavename'"' != "" ) local matasave matasave
+    if ( `"`matasavename'"' == "" ) local matasavename GstatsOutput
+    if ( `"`format'"' == "" ) local format %9.0g
+
+    scalar __gtools_summarize_tabstat  = 1
+    scalar __gtools_summarize_lwidth   = `labelwidth'
+    scalar __gtools_summarize_nosep    = (`"`separator'"' == "noseparator")
+    scalar __gtools_summarize_noprint  = (`"`print'"' == "noprint")
+    scalar __gtools_summarize_format   = (`"`formatvar'"' == "formatvar")
+    scalar __gtools_summarize_dfmt     = `"`format'"'
+    scalar __gtools_summarize_matasave = (`"`matasave'"' == "matasave")
+
+    * Anything caught by the * wildcard above is an option we do not know
+    if ( `"`options'"' != "" ) {
+        disp as err "Unknown options (note: not all display options are allowed):"
+        disp as err " `options'"
+        exit 198
+    }
+
+    if ( `"`pooled'"' == "pooled" & (`=scalar(__gtools_weight_code)' > 0) ) {
+        disp as err "Option -pooled- not allowed with weights"
+        exit 198
+    }
+
+    if ( "`save'" != "" ) {
+        disp as err "{bf:Warning}: Option save not implemented; try -matasave-"
+    }
+
+    * Ignore string vars
+    * ------------------
+
+    if ( `"`_sum'"' == "" ) {
+        * plain tabstat call: every variable must be numeric
+        confirm numeric var `varlist'
+        local statlist: copy local varlist
+    }
+    else {
+        local ignorelist
+        local statlist
+        foreach var of varlist `varlist' {
+            cap confirm numeric variable `var'
+            if ( _rc ) {
+                local ignorelist `ignorelist' `var'
+            }
+            else {
+                local statlist `statlist' `var'
+            }
+        }
+
+        if ( `:list sizeof statlist' == 0 ) {
+            disp as err "No numeric variables; nothing to do."
+            exit 18201
+        }
+
+        if ( `:list sizeof ignorelist' > 0 ) {
+            disp "Ignoring non-numeric variables:"
+            foreach var of varlist `ignorelist' {
+                disp _skip(4) `"`var'"'
+            }
+        }
+    }
+
+    * Stats to compute
+    * ----------------
+
+    if ( (`"`_sum'"' == "") & (`"`detail'`meanonly'"' != "") ) {
+        disp as err "Options -nodetail- and -meanonly- only allowed with {cmd:gstats_summarize}"
+        exit 198
+    }
+
+    scalar __gtools_gstats_code      = 2
+    scalar __gtools_summarize_pretty = (`"`prettystats'"' == "prettystats")
+    scalar __gtools_summarize_pooled = (`"`pooled'"' == "pooled")
+    scalar __gtools_summarize_normal = (`"`meanonly'"' == "")
+    scalar __gtools_summarize_detail = (`"`detail'"' != "nodetail")
+    scalar __gtools_summarize_kvars  = `:list sizeof statlist'
+
+    if ( `"`statistics'`stats'"' == "" ) {
+        * No statistics requested: fall back to a fixed set of codes
+        if ( `"`_sum'"' == "" ) {
+            if ( `"`detail'"' == "nodetail" ) {
+                disp as txt "({bf:warning}:option -nodetail- ignored)"
+            }
+            if ( `"`meanonly'"' != "" ) {
+                disp as txt "({bf:warning}:option -meanonly- ignored)"
+            }
+            // local scode -297
+            local scode -6 -1 -2 -5 -4 -3
+            local kstats = 6
+        }
+        else if ( `"`meanonly'"' != "" ) {
+            // local scode -299
+            local scode -6 -206 -1 -2 -5 -4
+            local kstats = 6
+        }
+        else if ( `"`detail'"' == "nodetail" ) {
+            // local scode -298
+            local scode -6 -206 -1 -2 -5 -4 -3 -203
+            local kstats = 8
+        }
+        else {
+            disp as err "parsing error: _gtools_internal failed to parse input"
+            exit 198
+        }
+    }
+    else {
+        if ( `"`detail'"' == "nodetail" ) {
+            disp as txt "({bf:warning}:option -nodetail- ignored)"
+        }
+        if ( `"`meanonly'"' != "" ) {
+            disp as txt "({bf:warning}:option -meanonly- ignored)"
+        }
+        if ( (`"`statistics'"' != "") & (`"`stats'"' != "") ) {
+            disp as err "statistics() and stats() are aliases; use only one"
+            exit 198
+        }
+        else if ( `"`stats'"' != "" ) {
+            local statistics: copy local stats
+        }
+        local scode
+        local kstats = `:list sizeof statistics'
+        foreach st of local statistics {
+            if ( "`st'" == "n" ) {
+                local scode `scode' -6
+            }
+            else if ( "`st'" == "q" ) {
+                * one token expands to three codes, hence two extra stats
+                local kstats = `kstats' + 2
+                local scode `scode' 25 50 75
+            }
+            else {
+                encode_aliases `st'
+                local st `r(stat)'
+                encode_stat `st' 0
+                if ( `r(statcode)' == 0 ) {
+                    * not a named stat; try the pattern-based parser
+                    cap noi encode_regex `st'
+                    if ( `r(statcode)' == 0 ) {
+                        error 110
+                    }
+                    else {
+                        local scode `scode' `r(statcode)'
+                        if ( `r(statcode)' == -18 ) scalar __gtools_nunique = 1
+                    }
+                }
+                else {
+                    local scode `scode' `r(statcode)'
+                    if ( `r(statcode)' == -18 ) scalar __gtools_nunique = 1
+                }
+            }
+        }
+    }
+
+    mata: __gtools_summarize_codes = strtoreal(tokens(st_local("scode")))
+    mata: st_matrix("__gtools_summarize_codes", __gtools_summarize_codes)
+    scalar __gtools_summarize_kstats = `kstats'
+
+    c_local GstatsMataSave: copy local matasavename
+    c_local varlist: copy local statlist
+
+    * Auto-columns for _sum
+    * ---------------------
+
+    if ( (`kstats' > 8) & (`"`_sum'"' != "") ) {
+        local coldef variables
+    }
+    else {
+        local coldef statistics
+    }
+
+    if ( `"`columns'"' == "" ) local columns `coldef'
+    if ( `"`columns'"' == "var" ) local columns variables
+    if ( `"`columns'"' == "stat" ) local columns statistics
+    if ( !inlist(`"`columns'"', "variables", "statistics") ) {
+        disp as err `"columns(`columns') not allowed. Available: variables, statistics"'
+        exit 198
+    }
+    scalar __gtools_summarize_colvar = (`"`columns'"' == "variables")
+end
+ +* findfile "_gtools_internal.mata" +* include `"`r(fn)'"' + +cap mata: mata drop __gstats_summarize_results() +cap mata: mata drop __gstats_summarize_sprintf() +cap mata: mata drop __gstats_summarize_prettysplit() +cap mata: mata drop __gstats_tabstat_results() + +mata: +class GtoolsResults scalar function __gstats_summarize_results() +{ + class GtoolsResults GtoolsByLevels + string scalar fname, var, varlabel, fmt, vfmt, dfmt + string colvector varlabelsplit + string matrix printstr, statvars + real scalar k, l, J, tabstat, sep, usevfmt, maxl, pool, wcode + real scalar kvars + real scalar nrow + real scalar ncol + real matrix output + + GtoolsByLevels = GtoolsResults() + GtoolsByLevels.readScalars() + pool = GtoolsByLevels.pool + usevfmt = GtoolsByLevels.usevfmt + maxl = GtoolsByLevels.maxl + dfmt = GtoolsByLevels.dfmt + + wcode = (st_numscalar("__gtools_weight_code") > 0)? 2: 0 + sep = st_numscalar("__gtools_summarize_separator") + if ( sep <= 0 ) { + sep = 0 + } + + tabstat = st_numscalar("__gtools_summarize_tabstat") + if ( tabstat ) { + return(__gstats_tabstat_results()) + } + + fname = st_global("GTOOLS_GSTATS_FILE") + J = strtoreal(st_local("r_J")) + kvars = pool?
1: st_numscalar("__gtools_summarize_kvars"); + nrow = kvars * J + ncol = st_numscalar("__gtools_summarize_kstats") + output = GtoolsReadMatrix(fname, nrow, ncol) + + if ( ncol >= 6 ) { + st_numscalar("__gtools_summarize_N", output[nrow, 1]) + st_numscalar("__gtools_summarize_sum_w", output[nrow, 2]) + st_numscalar("__gtools_summarize_sum", output[nrow, 3]) + st_numscalar("__gtools_summarize_mean", output[nrow, 4]) + st_numscalar("__gtools_summarize_min", output[nrow, 5]) + st_numscalar("__gtools_summarize_max", output[nrow, 6]) + } + + if ( ncol >= 8 ) { + st_numscalar("__gtools_summarize_sd", output[nrow, 7]) + st_numscalar("__gtools_summarize_Var", output[nrow, 8]) + } + + if ( ncol >= 25 ) { + st_numscalar("__gtools_summarize_p1", output[nrow, 9]) + st_numscalar("__gtools_summarize_p5", output[nrow, 10]) + st_numscalar("__gtools_summarize_p10", output[nrow, 11]) + st_numscalar("__gtools_summarize_p25", output[nrow, 12]) + st_numscalar("__gtools_summarize_p50", output[nrow, 13]) + st_numscalar("__gtools_summarize_p75", output[nrow, 14]) + st_numscalar("__gtools_summarize_p90", output[nrow, 15]) + st_numscalar("__gtools_summarize_p95", output[nrow, 16]) + st_numscalar("__gtools_summarize_p99", output[nrow, 17]) + st_numscalar("__gtools_summarize_skewness", output[nrow, 18]) + st_numscalar("__gtools_summarize_kurtosis", output[nrow, 19]) + st_numscalar("__gtools_summarize_smallest1", output[nrow, 5]) + st_numscalar("__gtools_summarize_smallest2", output[nrow, 20]) + st_numscalar("__gtools_summarize_smallest3", output[nrow, 21]) + st_numscalar("__gtools_summarize_smallest4", output[nrow, 22]) + st_numscalar("__gtools_summarize_largest4", output[nrow, 23]) + st_numscalar("__gtools_summarize_largest3", output[nrow, 24]) + st_numscalar("__gtools_summarize_largest2", output[nrow, 25]) + st_numscalar("__gtools_summarize_largest1", output[nrow, 6]) + } + + // do sum non-detail style summaries + statvars = tokens(st_local("statvars")) + if ( J == 1 & 
st_numscalar("__gtools_summarize_detail") ) { + if ( st_numscalar("__gtools_summarize_noprint") == 0 ) { + for (k = 1; k <= kvars; k++) { + var = pool? "[Pooled Variables]": statvars[k] + vfmt = pool? dfmt: (usevfmt? st_varformat(var): dfmt) + + printstr = J(wcode? 14: 12, 5, " ") + + printstr[1 , 2] = "Percentiles" + printstr[1 , 3] = "Smallest" + printstr[8 + (wcode > 0) , 3] = "Largest" + printstr[4 + (wcode > 0) , 4] = "Obs" + printstr[5 + (wcode > 0) , 4] = "Sum of Wgt." + printstr[7 + (wcode > 0) , 4] = "Mean" + printstr[8 + (wcode > 0) , 4] = "Std. Dev." + printstr[10 + wcode , 4] = "Variance" + printstr[11 + wcode , 4] = "Skewness" + printstr[12 + wcode , 4] = "Kurtosis" + printstr[2 + (wcode > 0) , 1] = "1%" + printstr[3 + (wcode > 0) , 1] = "5%" + printstr[4 + (wcode > 0) , 1] = "10%" + printstr[5 + (wcode > 0) , 1] = "25%" + printstr[7 + (wcode > 0) , 1] = "50%" + printstr[9 + wcode , 1] = "75%" + printstr[10 + wcode , 1] = "90%" + printstr[11 + wcode , 1] = "95%" + printstr[12 + wcode , 1] = "99%" + if ( wcode ) { + printstr[2, 3] = "(weighted)" + printstr[10, 3] = "(weighted)" + } + + for(l = 1; l <= 4; l++) { + printstr[1 + l + (wcode > 0), 2] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 8 + l]) + } + printstr[7 + (wcode > 0), 2] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 13]) + for(l = 1; l <= 4; l++) { + printstr[8 + l + wcode, 2] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 13 + l]) + } + printstr[2 + (wcode > 0), 3] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 5]) + printstr[12 + wcode, 3] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 6]) + for(l = 1; l <= 3; l++) { + printstr[2 + l + (wcode > 0), 3] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 19 + l]) + } + for(l = 1; l <= 3; l++) { + printstr[8 + l + wcode, 3] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 22 + l]) + } + printstr[4 + (wcode > 0), 5] = strtrim((output[k, 1] == round(output[k, 1]))? 
/* + */ sprintf("%15.0gc", output[k, 1]): /* + */ sprintf(" " + dfmt, output[k, 1])) + printstr[5 + (wcode > 0), 5] = strtrim((output[k, 2] == round(output[k, 2]))? /* + */ sprintf("%15.0gc", output[k, 2]): /* + */ sprintf(" " + dfmt, output[k, 2])) + printstr[7 + (wcode > 0), 5] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 4]) + printstr[8 + (wcode > 0), 5] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 7]) + printstr[10 + wcode, 5] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 8]) + printstr[11 + wcode, 5] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 18]) + printstr[12 + wcode, 5] = __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 19]) + + varlabel = pool? "": st_varlabel(var) + printf("\n"); + if ( varlabel == "" ) { + printf("%~61s\n", var); + } + else { + varlabelsplit = __gstats_summarize_prettysplit(varlabel, 50) + for(l = 1; l <= rows(varlabelsplit); l++) { + printf("%~61s\n", varlabelsplit[l]); + } + } + printf( "{hline %g}\n", 61); + for(l = 1; l <= (12 + wcode); l++) { + printf("%4s", printstr[l, 1]); + printf(" "); + printf("%12s", printstr[l, 2]); + printf(" "); + printf("%12s", printstr[l, 3]); + printf(" "); + printf("%-12s", printstr[l, 4]); + printf("%12s", printstr[l, 5]); + printf("\n"); + } + } + } + GtoolsByLevels.read() + } + else if ( J == 1 & (st_numscalar("__gtools_summarize_normal") == 1) ) { + l = max((strlen(statvars), 12)) + fmt = sprintf("%%%gs", l) + + if ( st_numscalar("__gtools_summarize_noprint") == 0 ) { + printf("\n") + printf(fmt, "Variable") + printf(" | ") + printf("%12s", "Obs") + printf(" ") + printf("%11s", "Mean") + printf("%11s", " Std. Dev.") + printf("%11s", "Min") + printf("%11s", "Max") + printf("\n") + + printf(sprintf("{hline %g}", l + 1)) + printf("+") + printf("{hline 58}") + printf("\n") + + for (k = 1; k <= kvars; k++) { + var = pool? "[Pooled Var]": statvars[k] + vfmt = pool? dfmt: (usevfmt? 
st_varformat(var): dfmt) + printf(fmt, var) + printf(" | ") + printf((output[k, 1] == round(output[k, 1]))? "%12.0gc": " " + dfmt, output[k, 1]) + printf(" ") + printf("%11s", __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 4])) + printf("%11s", __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 7])) + printf("%11s", __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 5])) + printf("%11s", __gstats_summarize_sprintf(vfmt, dfmt, maxl, output[k, 6])) + printf("\n") + if ( mod(k, sep) == 0 ) { + printf(sprintf("{hline %g}\n", 58 + 1 + l + 1)) + } + } + } + GtoolsByLevels.read() + } + else if ( J > 1 ) { + return(__gstats_tabstat_results()) + } + + GtoolsByLevels.tabstat = 0 + GtoolsByLevels.output = output + GtoolsByLevels.colvar = st_numscalar("__gtools_summarize_colvar") + GtoolsByLevels.ksources = kvars + GtoolsByLevels.kstats = ncol + GtoolsByLevels.statvars = statvars + GtoolsByLevels.scodes = st_matrix("__gtools_summarize_codes") + GtoolsByLevels.whoami = st_local("GstatsMataSave") + GtoolsByLevels.readStatnames() + + return (GtoolsByLevels) +} + +string scalar function __gstats_summarize_sprintf( + string scalar vfmt, + string scalar dfmt, + real scalar maxl, + real scalar x) +{ + string scalar s + s = sprintf(vfmt, x) + if ( strlen(s) > maxl ) { + s = sprintf(dfmt, x) + } + return(s) +} + +string colvector function __gstats_summarize_prettysplit( + string scalar txt, + real scalar maxl) +{ + real scalar len, lenmax, lenbuf, badsplit + string colvector splitxt + string scalar bufleft, bufright + + if ( maxl < 1 ) { + return(txt) + } + + lenmax = floor(maxl) + len = strlen(txt) + if ( len <= maxl ) { + return(txt) + } + + splitxt = J(0, 1, "") + bufright = txt + while ( bufright != "" ) { + if ( strlen(bufright) > lenmax ) { + lenbuf = lenmax + 1 + do { + badsplit = substr(bufright, lenbuf--, 1) != " " + } while ( badsplit & lenbuf > 0 ) + } + else { + lenbuf = 0 + } + if ( lenbuf > 0 ) { + bufleft = substr(bufright, 1, lenbuf) + bufright = 
substr(bufright, lenbuf + 2, .) + } + else { + bufleft = substr(bufright, 1, lenmax) + bufright = substr(bufright, lenmax + 1, .) + } + splitxt = splitxt \ bufleft + } + + return(splitxt) +} + +class GtoolsResults scalar function __gstats_tabstat_results() +{ + class GtoolsResults scalar GtoolsByLevels + + GtoolsByLevels = GtoolsResults() + GtoolsByLevels.readScalars() + GtoolsByLevels.read() + + GtoolsByLevels.tabstat = 1 + GtoolsByLevels.colvar = st_numscalar("__gtools_summarize_colvar") + GtoolsByLevels.ksources = GtoolsByLevels.pool? 1: st_numscalar("__gtools_summarize_kvars") + GtoolsByLevels.kstats = st_numscalar("__gtools_summarize_kstats") + GtoolsByLevels.statvars = tokens(st_local("statvars")) + GtoolsByLevels.scodes = st_matrix("__gtools_summarize_codes") + GtoolsByLevels.whoami = st_local("GstatsMataSave") + + GtoolsByLevels.readStatnames() + GtoolsByLevels.readOutput(st_global("GTOOLS_GSTATS_FILE")) + if ( st_numscalar("__gtools_summarize_noprint") == 0 ) { + GtoolsByLevels.printOutput() + } + + return (GtoolsByLevels) +} +end + +capture program drop GtoolsTempFile +program GtoolsTempFile + if ( `"${GTOOLS_TEMPFILES_INTERNAL_I}"' == "" ) { + local GTOOLS_TEMPFILES_INTERNAL_I = 1 + global GTOOLS_TEMPFILES_INTERNAL_I = 1 + } + else { + local GTOOLS_TEMPFILES_INTERNAL_I = ${GTOOLS_TEMPFILES_INTERNAL_I} + 1 + global GTOOLS_TEMPFILES_INTERNAL_I = ${GTOOLS_TEMPFILES_INTERNAL_I} + 1 + } + local f ${GTOOLS_TEMPDIR}/__gtools_tmpfile_internal_`GTOOLS_TEMPFILES_INTERNAL_I' + global GTOOLS_TEMPFILES_INTERNAL ${GTOOLS_TEMPFILES_INTERNAL} __gtools_tmpfile_internal_`GTOOLS_TEMPFILES_INTERNAL_I' + c_local `0': copy local f +end + +*********************************************************************** +* Input parsing (copy/paste from gcollapse.ado * +*********************************************************************** + +capture program drop GtoolsPrettyStat +program GtoolsPrettyStat, rclass + + * Group stats + * ----------- + + if ( `"`0'"' == "sum" ) local 
prettystat "Sum" + if ( `"`0'"' == "nansum" ) local prettystat "Sum" + if ( `"`0'"' == "mean" ) local prettystat "Mean" + if ( `"`0'"' == "geomean" ) local prettystat "Geometric mean" + if ( `"`0'"' == "sd" ) local prettystat "St Dev." + if ( `"`0'"' == "variance" ) local prettystat "Variance" + if ( `"`0'"' == "cv" ) local prettystat "Coef. of variation" + if ( `"`0'"' == "max" ) local prettystat "Max" + if ( `"`0'"' == "min" ) local prettystat "Min" + if ( `"`0'"' == "range" ) local prettystat "Range" + if ( `"`0'"' == "count" ) local prettystat "Count" + if ( `"`0'"' == "freq" ) local prettystat "Group size" + if ( `"`0'"' == "percent" ) local prettystat "Percent" + if ( `"`0'"' == "median" ) local prettystat "Median" + if ( `"`0'"' == "iqr" ) local prettystat "IQR" + if ( `"`0'"' == "first" ) local prettystat "First" + if ( `"`0'"' == "firstnm" ) local prettystat "First Non-Miss." + if ( `"`0'"' == "last" ) local prettystat "Last" + if ( `"`0'"' == "lastnm" ) local prettystat "Last Non-Miss." 
+ if ( `"`0'"' == "semean" ) local prettystat "SE Mean" + if ( `"`0'"' == "sebinomial" ) local prettystat "SE Mean (Binom)" + if ( `"`0'"' == "sepoisson" ) local prettystat "SE Mean (Pois)" + if ( `"`0'"' == "nunique" ) local prettystat "N Unique" + if ( `"`0'"' == "nmissing" ) local prettystat "N Missing" + if ( `"`0'"' == "skewness" ) local prettystat "Skewness" + if ( `"`0'"' == "kurtosis" ) local prettystat "Kurtosis" + if ( `"`0'"' == "rawsum" ) local prettystat "Unweighted sum" + if ( `"`0'"' == "rawnansum" ) local prettystat "Unweighted sum" + if ( `"`0'"' == "gini" ) local prettystat "Gini Coefficient" + if ( `"`0'"' == "gini|dropneg" ) local prettystat "Gini Coefficient (drop neg)" + if ( `"`0'"' == "gini|keepneg" ) local prettystat "Gini Coefficient (keep neg)" + + local match = 0 + if regexm(`"`0'"', "^rawselect(-|)([0-9]+)$") { + if ( `"`:di regexs(1)'"' == "-" ) { + local Pretty Largest (Unweighted) + } + else { + local Pretty Smallest (Unweighted) + } + local p = `=regexs(2)' + local match = 1 + } + else if regexm(`"`0'"', "^select(-|)([0-9]+)$") { + if ( `"`:di regexs(1)'"' == "-" ) { + local Pretty Largest + } + else { + local Pretty Smallest + } + local p = `=regexs(2)' + local match = 1 + } + else if regexm(`"`0'"', "^p([0-9][0-9]?(\.[0-9]+)?)$") { + local p = `:di regexs(1)' + local Pretty Pctile + local match = 1 + } + + if ( `match' ) { + if ( inlist(substr(`"`p'"', -2, 2), "11", "12", "13") ) { + local prettystat "`s'th `Pretty'" + } + else { + if ( mod(`p', 10) == 1 ) local prettystat "`p'st `Pretty'" + else if ( mod(`p', 10) == 2 ) local prettystat "`p'nd `Pretty'" + else if ( mod(`p', 10) == 3 ) local prettystat "`p'rd `Pretty'" + else local prettystat "`p'th `Pretty'" + } + } + + * Transforms + * ---------- + + if ( `"`0'"' == "standardize" ) local prettystat "Standardized" + if ( `"`0'"' == "normalize" ) local prettystat "Normalized" + if ( `"`0'"' == "demean" ) local prettystat "De-meaned" + if ( `"`0'"' == "demedian" ) local prettystat 
"De-medianed" + + encode_moving `0' + if ( `r(match)' ) { + local range `r(lower)' to `r(upper)' + GtoolsPrettyStat `r(stat)' + local prettystat "Moving `r(prettystat)' (`range')" + } + + encode_range `0' + if ( `r(match)' ) { + local rangestr `r(rangestr)' + GtoolsPrettyStat `r(stat)' + local prettystat "`r(prettystat)' for `rangestr'" + } + + encode_cumsum `0' + if ( `r(match)' ) { + if ( `r(cumsign)' == 0 ) { + local prettystat "Cummulative sum" + } + else if ( `r(cumsign)' == 1 ) { + if ( `r(cumother)' ) { + local prettystat "Cummulative sum (ascending by `r(cumvars)')" + } + else { + local prettystat "Cummulative sum (ascending)" + } + } + else if ( `r(cumsign)' == 2 ) { + if ( `r(cumother)' ) { + local prettystat "Cummulative sum (descending by `r(cumvars)')" + } + else { + local prettystat "Cummulative sum (descending)" + } + } + } + + encode_shift `0' + if ( `r(match)' ) { + if ( `r(shift)' == 0 ) { + local prettystat "" + } + else if ( `r(shift)' > 0 ) { + local prettystat "Lead (`r(shift)')" + } + else if ( `r(shift)' < 0 ) { + local prettystat "Lag (`=abs(`r(shift)')')" + } + } + + return local prettystat = `"`prettystat'"' +end + +capture program drop ParseListWild +program ParseListWild + local opts window(passthru) interval(passthru) cumby(passthru) shiftby(passthru) statprefix(str) + syntax anything(equalok), LOCal(str) PREfix(str) default(str) [`opts'] + local stat `default' + + * Trim spaces + local 0: copy local anything + while strpos("`0'", " ") { + local 0: subinstr local 0 " " " ", all + } + local 0 `0' + + * Parse each portion of the collapse call + while (trim("`0'") != "") { + GetStat stat 0 : `0' + GetTarget target 0 : `0' + gettoken vars 0 : 0 + + * Must specify stat (if blank, we do the mean) + if ( "`stat'" == "" ) { + disp as err "option stat() requried" + exit 198 + } + + if ( `"`stat'"' == "var" ) local stat variance + if ( `"`stat'"' == "sem" ) local stat semean + if ( `"`stat'"' == "seb" ) local stat sebinomial + if ( `"`stat'"' == 
"sep" ) local stat sepoisson + if ( `"`stat'"' == "skew" ) local stat skewness + if ( `"`stat'"' == "kurt" ) local stat kurtosis + if ( regexm(`"`stat'"', " ") ) local stat: subinstr local stat " " "|", all + + if ( substr(`"`stat'"', 1, length(`"`statprefix'"')) != `"`statprefix'"' ) { + local stat `statprefix'`stat' + } + encode_moving `stat', `window' + + * Parse bulk rename if applicable + unab usources : `vars' + if ( "`eqsign'" == "=" ) { + cap noi rename `vars' `target' + if ( _rc ) { + disp as err "Targets cannot exist with option {opt wildparse}." + exit `=_rc' + } + unab utargets : `target' + rename (`utargets') (`usources') + + local full_vars `full_vars' `usources' + local full_targets `full_targets' `utargets' + + foreach svar of varlist `usources' { + gettoken tvar utargets: utargets + + * Parsed here because each interval call can specify a + * different reference variable. If no reference variable + * is specified then it is assumed to be the source. + + encode_range `stat', `interval' var(`svar') + encode_cumsum `stat', `cumby' var(`svar') + encode_shift `stat', `shiftby' + + local call `call' (`stat') `tvar' = `svar' + local full_stats `full_stats' `stat' + } + } + else { + local full_vars `full_vars' `usources' + local full_targets `full_targets' `usources' + + foreach svar of varlist `usources' { + encode_range `stat', `interval' var(`svar') + encode_cumsum `stat', `cumby' var(`svar') + encode_shift `stat', `shiftby' + + local call `call' (`stat') `svar' + local full_stats `full_stats' `stat' + } + } + + local target + } + + * Check that targets don't repeat + local dups : list dups targets + if ("`dups'" != "") { + di as error "repeated targets in collapse: `dups'" + error 110 + } + + c_local `local' : copy local call + c_local `prefix'_targets `full_targets' + c_local `prefix'_stats `full_stats' + c_local `prefix'_vars `full_vars' + c_local `prefix'_uniq_stats : list uniq full_stats + c_local `prefix'_uniq_vars : list uniq full_vars +end + +* 
NOTE: Regular parsing is adapted from Sergio Correia's fcollapse.ado + +capture program drop ParseList +program define ParseList + local opts window(passthru) interval(passthru) cumby(passthru) shiftby(passthru) statprefix(str) + syntax anything(equalok), PREfix(str) default(str) [`opts'] + local stat `default' + + * Trim spaces + local 0: copy local anything + while strpos("`0'", " ") { + local 0: subinstr local 0 " " " " + } + local 0 `0' + + while (trim("`0'") != "") { + GetStat stat 0 : `0' + GetTarget target 0 : `0' + gettoken vars 0 : 0 + unab vars : `vars' + + * Must specify stat (if blank, we do the mean) + if ( "`stat'" == "" ) { + disp as err "option stat() requried" + exit 198 + } + + if ( `"`stat'"' == "var" ) local stat variance + if ( `"`stat'"' == "sem" ) local stat semean + if ( `"`stat'"' == "seb" ) local stat sebinomial + if ( `"`stat'"' == "sep" ) local stat sepoisson + if ( `"`stat'"' == "skew" ) local stat skewness + if ( `"`stat'"' == "kurt" ) local stat kurtosis + if ( regexm(`"`stat'"', " ") ) local stat: subinstr local stat " " "|", all + + if ( substr(`"`stat'"', 1, length(`"`statprefix'"')) != `"`statprefix'"' ) { + local stat `statprefix'`stat' + } + encode_moving `stat', `window' + + foreach var of local vars { + if ("`target'" == "") local target `var' + + encode_range `stat', `interval' var(`var') + encode_cumsum `stat', `cumby' var(`var') + encode_shift `stat', `shiftby' + + local full_vars `full_vars' `var' + local full_targets `full_targets' `target' + local full_stats `full_stats' `stat' + + local target + } + } + + * Check that targets don't repeat + local dups : list dups targets + if ("`dups'" != "") { + di as error "repeated targets in collapse: `dups'" + error 110 + } + + c_local `prefix'_targets `full_targets' + c_local `prefix'_stats `full_stats' + c_local `prefix'_vars `full_vars' + c_local `prefix'_uniq_stats : list uniq full_stats + c_local `prefix'_uniq_vars : list uniq full_vars +end + +capture program drop GetStat 
+program define GetStat + _on_colon_parse `0' + local before `s(before)' + gettoken lhs rhs : before + local rest `s(after)' + + gettoken stat rest : rest , match(parens) + if ("`parens'" != "") { + c_local `lhs' `stat' + c_local `rhs' `rest' + } +end + +capture program drop GetTarget +program define GetTarget + _on_colon_parse `0' + local before `s(before)' + gettoken lhs rhs : before + local rest `s(after)' + + local rest : subinstr local rest "=" "= ", all + gettoken target rest : rest, parse("= ") + gettoken eqsign rest : rest + if ("`eqsign'" == "=") { + c_local `lhs': copy local target + c_local `rhs': copy local rest + c_local eqsign "=" + } + else { + c_local eqsign + } +end + +capture program drop NoInitWarning +program NoInitWarning + if !inlist(`"${GTOOLS_NOINIT_WARNING}"', "0") { + disp as txt "WARNING: You have chosen to use the undocumented option -noinit-" + disp as txt "with -replace- and if/in. Variables that exist and will be replaced" + disp as txt "WITHOUT modifying any observations not tagged by if/in. Please make" + disp as txt "sure you understand the implications of doing this. 
To supress this" + disp as txt "warning, set" + disp as txt "" + disp as txt " global GTOOLS_NOINIT_WARNING = 0" + disp as txt "" + } +end + +*********************************************************************** +* Load plugin * +*********************************************************************** + +if ( inlist("`c(os)'", "MacOSX") | strpos("`c(machine_type)'", "Mac") ) local c_os_ macosx +else local c_os_: di lower("`c(os)'") + +if ( `c(stata_version)' < 14.1 ) local spiver v2 +else local spiver v3 + +cap program drop gtools_plugin +if ( inlist("${GTOOLS_FORCE_PARALLEL}", "1") ) { + cap program gtools_plugin, plugin using("gtools_`c_os_'_multi_`spiver'.plugin") + if ( _rc ) { + global GTOOLS_FORCE_PARALLEL 17900 + program gtools_plugin, plugin using("gtools_`c_os_'_`spiver'.plugin") + } +} +else program gtools_plugin, plugin using("gtools_`c_os_'_`spiver'.plugin") diff --git a/01.code/ado/_/_gtotal0.ado b/01.code/ado/_/_gtotal0.ado new file mode 100755 index 0000000..5e9d005 --- /dev/null +++ b/01.code/ado/_/_gtotal0.ado @@ -0,0 +1,15 @@ +*! version 3.1.4 CFBaum 25mar2006 from 3.1.1 _gsum +program define _gtotal0 +version 8.2 +syntax newvarname =/exp [if] [in] [, BY(varlist)] +tempvar touse temp1 +quietly { + gen byte `touse'=1 `if' `in' + bys `touse' `by': gen `typlist' `varlist' = sum(`exp') if `touse'==1 + by `touse' `by': gen `temp1' = sum((`exp')<.) + by `touse' `by': replace `varlist' = cond(`temp1'[_N]==_N,`varlist'[_N],.) + } +end + + + diff --git a/01.code/ado/_/_gtruncdig.ado b/01.code/ado/_/_gtruncdig.ado new file mode 100755 index 0000000..e0c1df4 --- /dev/null +++ b/01.code/ado/_/_gtruncdig.ado @@ -0,0 +1,13 @@ +*! 
egen truncdig() cfb 3nov2016 +capt prog drop _gtruncdig +prog def _gtruncdig + version 12 + syntax newvarname =/exp [if] [in] , DIG(integer) + tempvar touse + tempname pwr + qui{ + gen byte `touse'=1 `if' `in' + sca `pwr' = 10^`dig' + gen `varlist' = 1/`pwr' * trunc(`exp' * `pwr') if `touse' == 1 + } +end diff --git a/01.code/ado/_/_gvar.ado b/01.code/ado/_/_gvar.ado new file mode 100755 index 0000000..bcf47ef --- /dev/null +++ b/01.code/ado/_/_gvar.ado @@ -0,0 +1,17 @@ +*! 1.0.0 NJC 31 Dec 2002 +* _gsd version 3.1.0 30jun1998 +program define _gvar + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + tempvar touse mean + quietly { + gen byte `touse'=1 `if' `in' + sort `touse' `by' + by `touse' `by': gen double `mean' = /* + */ sum(`exp')/sum((`exp')!=.) if `touse'==1 + by `touse' `by': gen `typlist' `varlist' = /* + */ sum(((`exp')-`mean'[_N])^2)/(sum((`exp')!=.)-1) /* + */ if `touse'==1 & sum(`exp'!=.) + by `touse' `by': replace `varlist' = `varlist'[_N] + } +end diff --git a/01.code/ado/_/_gwordof.ado b/01.code/ado/_/_gwordof.ado new file mode 100755 index 0000000..6d3a783 --- /dev/null +++ b/01.code/ado/_/_gwordof.ado @@ -0,0 +1,31 @@ +*! 1.0.0 NJC 17 July 2000 +program define _gwordof + version 6.0 + + gettoken type 0 : 0 + gettoken g 0 : 0 + gettoken eqs 0 : 0 + + syntax varlist(max=1 string) [if] [in] , Word(int) + + marksample touse, strok + local type "str1" /* ignores type passed from -egen- */ + + quietly { + gen `type' `g' = "" + local i = 1 + while `i' <= _N { + if `touse'[`i'] { + local value = `varlist'[`i'] + local nw : word count `value' + local which = cond(`word' < 0, `nw' + `word' + 1, `word') + if `which' > 0 { + local value : word `which' of `value' + replace `g' = `"`value'"' in `i' + } + } + local i = `i' + 1 + } + } +end + diff --git a/01.code/ado/_/_gwpctile.ado b/01.code/ado/_/_gwpctile.ado new file mode 100755 index 0000000..10f29c0 --- /dev/null +++ b/01.code/ado/_/_gwpctile.ado @@ -0,0 +1,49 @@ +* 1.1.0 24 May 2007 +*! 
Uli Kohler 1.0.0 April 4, 2007 @ 18:47:24 +*! egen function -pctile()- with weights and altdef +program _gwpctile + version 8.2 + syntax newvarname =/exp [if] [in] /// + [, Weights(string) p(real 50) ALTdef BY(varlist) ] + + if `p'<=0 | `p'>=100 { + di as err "p(`p') must be between 0 and 100" + exit 198 + } + + tempvar touse x + + if "`weights'" != "" { + local weights "[aweight=`weights']" + if "`altdef'" != "" { + di as err "altdef not allowed with weights()" + exit 198 + } + } + + quietly { + mark `touse' `if' `in' + gen double `x' = `exp' if `touse' + + if "`by'"=="" { + _pctile `x' `weights' if `touse', p(`p') `altdef' + gen `typlist' `varlist' = r(r1) if `touse' + exit 0 + } + + sort `touse' `by' `x' + tempvar N + by `touse' `by': gen long `N' = _n == 1 if `touse' + by `touse': replace `N' = sum(`N') if `touse' + sum `N', meanonly + local maxby = r(max) + + gen `typlist' `varlist' = . + forv i = 1/`maxby' { + _pctile `x' `weights' /// + if `touse' & `N' == `i', p(`p') `altdef' + replace `varlist' = r(r1) if `touse' & `N' == `i' + } + } +end + diff --git a/01.code/ado/_/_gwtfreq.ado b/01.code/ado/_/_gwtfreq.ado new file mode 100755 index 0000000..fea8fcc --- /dev/null +++ b/01.code/ado/_/_gwtfreq.ado @@ -0,0 +1,16 @@ +*! NJC 1.0.0 31 Dec 2002 +program define _gwtfreq + version 6 + syntax newvarname =/exp [if] [in] [, BY(varlist)] + quietly { + marksample touse, novarlist + tempvar wt + gen double `wt' = `exp' + sort `touse' `by' + by `touse' `by': gen `typlist' `varlist' = sum(`wt') /* + */ if `touse' + by `touse' `by': replace `varlist' = `varlist'[_N] + su `wt' if `touse', meanonly + replace `varlist' = `varlist' / r(mean) + } +end diff --git a/01.code/ado/_/_gxtile.ado b/01.code/ado/_/_gxtile.ado new file mode 100755 index 0000000..f41acb3 --- /dev/null +++ b/01.code/ado/_/_gxtile.ado @@ -0,0 +1,104 @@ +*! 2.1 UK/NJC 14 Jan 2019 +*! _gxtile version 2.0 UK 28 AUG 2016 +*! 
Categorizes exp by its quantiles - byable + +* Version history +// 2.0: Use levelsof instead of levels +// 1.2: Bug: Opt percentiles were treated incorrectely after implement. of option nq +// Allows By-Variables that are strings +// 1.1: Bug: weights are treated incorectelly in version 1.0. -> fixed +// New option nquantiles() implemented +// 1.0: initial version + +* Main program +program _gxtile, byable(onecall) sortpreserve + version 8.2 + +** Syntax + gettoken type 0 : 0 + gettoken h 0 : 0 + gettoken eqs 0 : 0 + + syntax varname(numeric) [if] [in] [, /// + Percentiles(string) /// + Nquantiles(string) /// + Weights(string) ALTdef by(varlist) ] + + marksample touse + +** Error Checks + + if "`altdef'" ~= "" & "`weights'" ~= "" { + di as error "weights are not allowed with altdef" + exit 111 + } + + if "`percentiles'" != "" & "`nquantiles'" != "" { + di as error "do not specify percentiles and nquantiles" + exit 198 + } + +** Default Settings etc. + + if "`weights'" ~= "" { + local weight "[aw = `weights']" + } + + if "`percentiles'" != "" { + local pctopt percentiles(`percentiles') + local pct "`percentiles'" + } + + else if "`nquantiles'" != "" { + local pctopt nquantiles(`nquantiles') + local pct "1/`=`nquantiles'-1'" + } + + else if "`nquantiles'" == "" & "`percentiles'" == "" { + local pctoption percentiles(50) + local pct "1/2" + } + + quietly { + + gen `type' `h' = . 
+ +*** Without by + + if "`by'"=="" { + + local i 1 + _pctile `varlist' `weight' if `touse', `pctopt' `altdef' + foreach p of numlist `pct' { + if `i' == 1 { + replace `h' = `i' if `varlist' <= r(r`i') & `touse' + } + replace `h' = `++i' if `varlist' > r(r`--i') & `touse' + local i = `i' + 1 + } + exit + } + +*** With by + + tempvar byvar + by `touse' `by', sort: gen `byvar' = 1 if _n==1 & `touse' + by `touse' (`by'): replace `byvar' = sum(`byvar') + + sum `byvar', meanonly + forval k = 1/`r(max)' { + local i 1 + _pctile `varlist' `weight' if `byvar' == `k' & `touse' , `pctopt' `altdef' + foreach p of numlist `pct' { + if `i' == 1 { + replace `h' = `i' if `varlist' <= r(r`i') & `byvar' == `k' & `touse' + } + replace `h' = `++i' if `varlist' > r(r`--i') & `byvar' == `k' & `touse' + local i = `i' + 1 + } + } + } + +end +exit + diff --git a/01.code/ado/d/ds3.ado b/01.code/ado/d/ds3.ado new file mode 100755 index 0000000..2e6c8ff --- /dev/null +++ b/01.code/ado/d/ds3.ado @@ -0,0 +1,385 @@ +*! NJC 1.2.3 5 February 2002 +*! 
NJC 1.2.2 24 September 2001 +* each tab replaced by spaces +program define ds3, rclass /* ds3: lists the variables in varlist that pass the requested type, has()/not() and any()/all()/none() filters; a non-empty result is displayed and saved in r(varlist) */ + version 7 + + * update for Stata/SE 1 February 2002 + local smax = cond("$S_StataSE" == "SE", 245, 81) /* exclusive upper limit for the str# widths accepted by string2() */ + + syntax [varlist] [if] [in] [, NUMeric STRing1 byte int long float double /* + */ STRing2(numlist int >0 <`smax') COmplement ANY(str asis) ALL(str asis) /* + */ NONE(str asis) HAS(str) NOT(str) case Detail /* + */ COLs(numlist int max=1 >0 <13) PLACEholder(str) LOcal(str) GLobal(str) ] + + * checking for number of has(), not(), any(), all(), none() options + local ntopts = 0 + foreach opt in has not any all none { + if `"``opt''"' != "" { + local testopts "`testopts'`opt'" + local Testopts "`Testopts'`opt' " + local ntopts = `ntopts' + 1 + } + } + + if `ntopts' > 1 { + di as err "may not combine `Testopts'options" + exit 198 + } + + if "`placeholder'" != "" { + local np : word count `placeholder' + if `np' > 1 { + di as err "placeholder must be single symbol or word" + exit 198 + } + local X "`placeholder'" + } + else local X "X" /* default placeholder is X */ + + if `"`any'`all'`none'"' != "" { + if !index(`"`any'`all'`none'"',"`X'") { + di as err "`testopts' does not contain `X'" + exit 198 + } + + * identifying conditions which will fail with numbers | strings + * test condition, first substituting the number 1 + local thistest : subinstr local `testopts' "`X'" "1", all + capture count if `thistest' + local numtest = cond(_rc, 0, 1) /* 1 if the count ran without error, else 0 */ + + * next substituting string "a" + local thistest : subinstr local `testopts' "`X'" `""a""', all + capture count if `thistest' + local strtest = cond(_rc, 0, 1) + if !`numtest' & !`strtest' { /* failed both */ + di as err "invalid `testopts' option" + exit 198 + } + } + + if `"`not'"' != "" { + local has "`not'" + local not 1 /* not() is processed as has() with the selection inverted */ + } + else local not 0 + + if `"`has'"' != "" { + * what kind of thing? which particular thing(s)? 
+ tokenize `has' + args what + mac shift + local which "`*'" + + * first element should start var | val | f | c + local what = lower("`what'") + + local l = length("`what'") + + if "`what'" == substr("varlabel",1,max(4,`l')) { + local what "varl" + } + else if "`what'" == substr("vallabel",1,max(4,`l')) { + local what "vall" + } + else if "`what'" == substr("format",1,max(1,`l')) { + if "`which'" == "" { BadHas `not' } + local what "f" + } + else if "`what'" == substr("char",1,max(1,`l')) { + local what "c" + } + else BadHas `not' + + * to lower case: fewer problems if `which' is longer than 80 chars + if "`case'" != "" { + local case 1 + foreach word in `which' { + local lower = lower("`word'") + local which2 "`which2' `lower'" + } + local which "`which2'" + } + else local case 0 + } + + * what restrictions on types of variables? + local nopts : word count /* + */ `numeric' `string1' `byte' `int' `long' `float' `double' `string2' + + if `nopts' == 0 { /* none */ + local list "`varlist'" + } + else { /* some */ + foreach opt in numeric string1 { + local `opt' = "``opt''" != "" /* turn option text into a 0/1 flag */ + } + + local types "`byte' `int' `long' `float' `double' " /* trailing blank stops the first str# below from fusing with double */ + if "`string2'" != "" { + foreach n of local string2 { + local types "`types'str`n' " + } + } + local ntypes : word count `types' + + foreach x of local varlist { + local OK 0 + if `string1' | `numeric' { + capture confirm string variable `x' + local isstr = _rc == 0 + if `isstr' & `string1' { + local OK 1 + } + if !`isstr' & `numeric' { + local OK 1 + } + } + if `ntypes' { + local type : type `x' + foreach t of local types { + if "`t'" == "`type'" { + local OK 1 + } + } + } + if `OK' { local list "`list'`x' " } + } + } + + * complement of varlist? 
+ if "`complement'" != "" { + foreach y of varlist _all { + local found 0 + foreach x of local list { + if "`x'" == "`y'" { + local found 1 + continue, break + } + } + if !`found' { local clist "`clist'`y' " } + } + local list "`clist'" + } + + if `ntopts' == 0 { tokenize `list' } /* no has(), not(), any(), all() or none(): list is already the final result */ + + * implementation of has() or not() + if `"`has'"' != "" { + * variable or value labels + if "`what'" == "varl" | "`what'" == "vall" { + local kind = /* + */ cond("`what'" == "varl", "variable", "value") + if `"`which'"' == "" { /* any label */ + local op = cond(`not', "==", "!=") + foreach x of local list { + local lbl : `kind' label `x' + if `"`lbl'"' `op' "" { + local list2 "`list2'`x' " + } + } + } + else { /* some label pattern */ + if `not' { /* must match no pattern */ + foreach x of local list { + local lbl : `kind' label `x' + if `case' { + local lbl = lower(`"`lbl'"') + } + local found 0 + foreach w of local which { + if match(`"`lbl'"',`"`w'"') { + local found 1 + continue, break + } + } + if !`found' { + local list2 "`list2' `x'" + } + } + } + else { /* can match any pattern */ + foreach x of local list { + local lbl : `kind' label `x' + if `case' { + local lbl = lower(`"`lbl'"') + } + foreach w of local which { + if match(`"`lbl'"',`"`w'"') { + local list2 "`list2'`x' " + continue, break + } + } + } + } + } + } /* end of code for variable or value labels */ + * formats + else if "`what'" == "f" { + if `not' { /* must match no pattern */ + foreach x of local list { + local fmt : format `x' + if `case' { + local fmt = lower(`"`fmt'"') + } + local found 0 + foreach w of local which { + if match(`"`fmt'"',`"`w'"') | /* + */ match(`"`fmt'"',`"%`w'"') { + local found 1 + continue, break + } + } + if !`found' { + local list2 "`list2' `x'" + } + } + } + else { /* can match any pattern */ + foreach x of local list { + local fmt : format `x' + if `case' { + local fmt = lower(`"`fmt'"') + } + foreach w of local which { + if match(`"`fmt'"',`"`w'"') | /* + */ match(`"`fmt'"',`"%`w'"') 
{ + local list2 "`list2'`x' " + continue, break + } + } + } + } + } /* end of code for formats */ + * characteristics + else { + if `"`which'"' == "" { /* any char */ + local op = cond(`not', "==", "!=") + foreach x of local list { + local chr : char `x'[] + if `"`chr'"' `op' "" { + local list2 "`list2'`x' " + } + } + } + else { /* some char pattern */ + if `not' { /* must match no pattern */ + foreach x of local list { + local chr : char `x'[] + local found 0 + foreach c of local chr { + if `case' { + local c = lower(`"`c'"') + } + foreach w of local which { + if match(`"`c'"',`"`w'"') { + local found 1 + continue, break + } + } + } + if !`found' { + local list2 "`list2' `x'" + } + } + } + else { /* can match any pattern */ + foreach x of local list { + local chr : char `x'[] + local found 0 + foreach c of local chr { + if `case' { + local c = lower(`"`c'"') + } + foreach w of local which { + if match(`"`c'"',`"`w'"') { + local found 1 + local list2 "`list2'`x' " + continue, break + } + } + if `found' { continue, break } + } + } + } + } + } /* end of code for characteristics */ + + tokenize `list2' + } + + * implementation of any(), all(), none() + if `"`any'`all'`none'"' != "" { + marksample touse, strok novarlist + qui count if `touse' + local N = r(N) + + foreach v of local list { + capture confirm string variable `v' + local isstr = _rc == 0 + if (`isstr' & `strtest') | (!`isstr' & `numtest') { + local thistest : subinstr local `testopts' "`X'" "`v'", all + qui count if `thistest' & `touse' + if "`testopts'" == "any" & r(N) > 0 { + local list2 "`list2'`v' " + } + else if "`testopts'" == "none" & r(N) == 0 { + local list2 "`list2'`v' " + } + else if "`testopts'" == "all" & r(N) == `N' { + local list2 "`list2'`v' " + } + } + } + + tokenize `list2' + } + + if "`*'" != "" { + if "`detail'" != "" { describe `*' } + else { + if "`cols'" == "" { local cols = 8 } /* default: 8 names per row */ + + if `cols' == 1 { local length = 78 } + else if `cols' == 2 { local length = 38 } + else if `cols' == 3 
{ local length = 24 } + else if `cols' == 4 { local length = 18 } + else if `cols' == 5 { local length = 14 } + else if `cols' == 6 { local length = 11 } + else if `cols' == 7 { local length = 9 } + else if `cols' == 8 { local length = 8 } + else if `cols' == 9 { local length = 6 } + else if `cols' == 10 { local length = 6 } + else if `cols' == 11 { local length = 5 } + else if `cols' == 12 { local length = 4 } + + local lp1 = `length' + 1 + local i 1 + while "``i''" != "" { + if (mod(`i' - 1, `cols') == 0 & `i' != 1) { di } + local abname = abbrev("``i''",`length') + local l = `lp1' - length("`abname'") + (mod(`i', `cols') != 0) + di in gr "`abname'" _skip(`l') _c + local i = `i' + 1 + } + di + } + return local varlist `*' + } + + * undocumented, for now + if "`local'" != "" { c_local `local' "`*'" } + if "`global'" != "" { global `global' "`*'" } +end + +program def BadHas /* BadHas: shows the invalid not() or invalid has() message, chosen by the 0/1 argument, then exits with error 198 */ + args not + if `not' { + di as err "invalid not() option" + } + else di as err "invalid has() option" + exit 198 +end +exit + diff --git a/01.code/ado/e/egenmore.sthlp b/01.code/ado/e/egenmore.sthlp new file mode 100755 index 0000000..a361888 --- /dev/null +++ b/01.code/ado/e/egenmore.sthlp @@ -0,0 +1,1548 @@ +{smcl} +{* 21mar2006/1feb2007/21feb2007/24may2007/9jan2009/8feb2010/29feb2012/14feb2014/20nov2016}{...} +{hline} +help for {cmd:egenmore} +{hline} + +{title:Extensions to generate (more extras)} + +{p 8 17 2}{cmd:egen} +[{it:type}] +{it:newvar} +{cmd:=} +{it:fcn}{cmd:(}{it:arguments}{cmd:)} +[{cmd:if} {it:exp}] +[{cmd:in} {it:range}] +[{cmd:,} {it:options}] + + +{title:Description} + +{p 4 4 2} +{help egen} creates {it:newvar} of the optionally specified storage type +equal to {it:fcn}{cmd:(}{it:arguments}{cmd:)}. Depending on +{it:fcn}{cmd:()}, {it:arguments} refers to an expression, a +{help varlist}, a {help numlist}, or an empty string. The options are +similarly function dependent. 
+ + +{title:Functions} + +{p 4 4 2} +(The option {cmd:by(}{it:byvarlist}{cmd:)} means that computations are +performed separately for each group defined by {it:byvarlist}.) + +{p 4 4 2} +Functions are grouped thematically as follows:{p_end} +{space 8}Grouping and graphing +{space 8}Strings, numbers and conversions +{space 8}Dates, times and time series +{space 8}Summaries and estimates +{space 8}First and last +{space 8}Random numbers +{space 8}Row operations + + +{title:Grouping and graphing} + +{p 4 8 2} +{cmd:axis(}{it:varlist}{cmd:)} +[ +{cmd:, gap} +{cmd:label(}{it:lblvarlist}{cmd:)} +{cmdab:miss:ing} +{cmdab:rev:erse} +] +resembles {help egen}'s {cmd:group()}, but is specifically designed for +constructing categorical axis variables for graphs, hence the name. It +creates a single variable taking on values 1, 2, ... for the groups +formed by {it:varlist}. {it:varlist} may contain string, numeric, or +both string and numeric variables. The order of the groups is that of +the sort order of {it:varlist}. {cmd:gap} overrides the default +numbering of 1 up by adding a gap of 1 whenever a variable changes. +{cmd:label()} specifies that labels are to be assigned based on the +value labels or values of {it:lblvarlist}; if not specified, +{it:lblvarlist} defaults to {it:varlist}. {cmd:missing} indicates that +missing values in {it:varlist} (either numeric missing or {cmd:""}) are to be +treated like any other value when assigning groups, instead of missing +values being assigned to the group missing. {cmd:reverse} reverses +labelling so that groups that would have been assigned values of 1 ... +whatever are instead assigned values of whatever ... 1. (Stata 8 +required.) + +{p 4 4 2} +To order groups of a categorical variable according +to their values of another variable, in preparation +for a graph or table: + +{p 4 8 2}{cmd:. egen meanmpg = mean(-mpg), by(rep78)}{p_end} +{p 4 8 2}{cmd:. egen Rep78 = axis(meanmpg rep78), label(rep78)}{p_end} +{p 4 8 2}{cmd:. 
tabstat mpg, by(Rep78) s(min mean max)} + +{p 4 4 2}Note: the function author considers this approach superseded by +his {cmd:seqvar} and {cmd:labmask} (Cox 2008). + +{p 4 8 2} +{cmd:clsst(}{it:varname}{cmd:)} +{cmd:,} +{cmdab:v:alues(}{it:numlist}{cmd:)} +[ +{cmdab:l:ater} +] +returns whichever of the {it:numlist} in {cmd:values()} is closest +(differs by least, disregarding sign) to the numeric variable +{it:varname}. {cmd:later} specifies that in the event of ties values +specified later in the list overwrite values specified earlier. If +varname is 15 then 10 and 20 specified by {cmd:values(10 20)} are +equally close. For any observation containing 15 the default is that 10 +is reported, whereas with {cmd:later} 20 is reported. For a {it:numlist} +containing an increasing sequence, {cmd:later} implies choosing the +higher of two equally close values. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen mpgclass = clsst(mpg), v(10(5)40)} + +{p 4 8 2} +{cmd:egroup(}{it:varlist}{cmd:)} is an extension of {help egen}'s +{cmd:group()} function with the extra option +{cmd:label(}{it:lblvarlist}{cmd:)}, which will attach the original +values (or value labels if they exist) of {it:lblvarlist} as value +labels. This option may not be combined with the {cmd:label} option. +(Stata 7 required; superseded by {cmd:axis()} above.) + +{p 4 8 2} +{cmdab:group2(}{it:varlist}{cmd:)} is a generalisation of +{help egen}'s {cmd:group()} with the extra option +{cmd:sort(}{it:egen_call}{cmd:)}. Groups of {it:varlist} will have +values 1 upwards according to their values on the results of a specified +{it:egen_call}. For example, {cmd:group2(rep78) sort(mean(mpg))} will +produce a variable such that the group of {cmd:rep78} with the lowest +mean of {cmd:mpg} will have value 1, that with the second lowest mean +will have value 2, and so forth. As with {cmd:group()}, the +{cmd:label} option will attach the original values of {it:varlist} (or +value labels if they exist) as value labels. 
The argument of +{cmd:sort()} must be a valid call to an {cmd:egen} function, official or +otherwise. (Stata 7 required; use of {cmd:egroup()} or +{cmd:axis()} above is now considered better style.) + +{p 4 8 2}{cmd:mlabvpos(}{it:yvar xvar}{cmd:)} +[ +{cmd:,} +{cmd:log} +{cmdab:poly:nomial(}{it:#}{cmd:)} +{cmdab:mat:rix(}{it:5x5 matrix}{cmd:)} +] +automatically generates a variable giving clock positions of marker labels +given names of variables {it:yvar} and {it:xvar} defining the axes of a scatter +plot. Thus the command generates a variable to be used in the {help scatter} +option {cmd:mlabvpos()}. + +{p 8 8 2} +The general idea is to pull marker labels away from the +data region. So, marker labels in the lower left of the +region are at clock positions 7 or 8, and those in the upper right +are at clock positions 1 or 2, etc. +More precisely, considering the following rectangle as the data region, +then marker labels are placed as follows: + +{col 9}{c TLC}{hline 14}{c TRC} +{col 9}{c |}11 12 12 12 1{c |} +{col 9}{c |}10 11 12 1 2{c |} +{col 9}{c |} 9 9 12 3 3{c |} +{col 9}{c |} 8 7 6 5 4{c |} +{col 9}{c |} 7 6 6 6 5{c |} +{col 9}{c BLC}{hline 14}{c BRC} + +{p 8 8 2} +Note that there is no attempt to prevent marker labels from overplotting, +which is likely in any dataset with many observations. In such situations +you might be better off simply randomizing clock positions with say +{cmd:ceil(uniform() * 12)}. + +{p 8 8 2} +If {it:yvar} and {it:xvar} are highly correlated, then the clock positions are generated +as follows (which is however the same general idea): + +{col 9}{c TLC}{hline 14}{c TRC} +{col 9}{c |} 12 1 3{c |} +{col 9}{c |} 12 12 3 4{c |} +{col 9}{c |}11 11 12 5 5{c |} +{col 9}{c |}10 9 6 6 {c |} +{col 9}{c |} 9 7 6 {c |} +{col 9}{c BLC}{hline 14}{c BRC} + +{p 8 8 2} +To calculate the positions, the x axis is first categorized into 5 equal +intervals around the mean of {it:xvar}. 
Afterwards the residuals from regression of +{it:yvar} on {it:xvar} are categorized into 5 equal intervals. Both categorized +variables are then used to calculate the positions according to the first +table above. The rule can be changed with the option {cmd:matrix()}. + +{p 8 8 2} +{cmd:log} indicates that residuals from regression are to be calculated +using the logarithms of {it:xvar}. This might be useful if the scatter +shows a strong curvilinear relationship. + +{p 8 8 2} +{cmd:polynomial(}{it:#}{cmd:)} indicates that residuals are to be calculated +from a regression of {it:yvar} on a polynomial of {it:xvar}. For example, use +{cmd:poly(2)} if the scatter shows a U-shaped relationship. + +{p 8 8 2} +{cmd:matrix(}{it:#}{cmd:)} is used to change the general rule for the plot positions. +The positions are specified by a 5 x 5 matrix, in which cell [1,1] gives the clock +position of marker labels in the upper left part of the data region, and so forth. +(Stata 8.2 required.) + +{p 4 8 2}{cmd:. egen clock = mlabvpos(mpg weight)}{p_end} +{p 4 8 2}{cmd:. scatter mpg weight, mlab(make) mlabvpos(clock)}{p_end} +{p 4 8 2}{cmd:. egen clock2 = mlabvpos(mpg weight), matrix(11 1 12 11 1 \\ 10 2 12 10 2 \\ 9 3 12 9 3 \\ 8 4 6 8 4 \\ 7 5 6 7 5)}{p_end} +{p 4 8 2}{cmd:. sc mpg weight, mlab(make) mlabvpos(clock2)} + +{title:Strings, numbers and conversions} + +{p 4 8 2} +{cmd:base(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmdab:b:ase(}{it:#}{cmd:)} +] +produces a string variable containing the digits of a base {it:#} +(default 2, possible values 2(1)9) representation of {it:varname}, which +must contain integers. Thus if {it:varname} contains values 0, 1, 2, 3, +4, and the default base is used, then the result will contain the +strings {cmd:"000"}, {cmd:"001"}, {cmd:"010"}, {cmd:"011"}, {cmd:"100"}. +If any integer values are negative, all string values will start with +{cmd:-} if negative and {cmd:+} otherwise. See also {cmd:decimal()}. 
The +examples show how to unpack this string into individual digits if +desired. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen binary = base(code)} + +{p 4 4 2}Suppose {cmd:binary} is {cmd:str5}. +To get individual {cmd:str1} variables, + +{p 4 8 2}{cmd:. forval i = 1/5 {c -(}}{p_end} +{p 4 8 2}{cmd:. {space 8}gen str1 code`i' = substr(binary, `i',1)}{p_end} +{p 4 8 2}{cmd:. {c )-}} + +{p 4 4 2}and to get individual numeric variables, + +{p 4 8 2}{cmd:. forval i = 1/5 {c -(}}{p_end} +{p 4 8 2}{cmd:. {space 8}gen byte code`i' = real(substr(binary, `i', 1))}{p_end} +{p 4 8 2}{cmd:. {c )-}} + +{p 4 8 2} +{cmd:decimal(}{it:varlist}{cmd:)} +[ +{cmd:,} +{cmdab:b:ase(}{it:#}{cmd:)} +] +treats the values of {it:varlist} as indicating digits in a base {it:#} +(default 2, possible values integers >=2) representation of a number and +produces the decimal equivalent. Thus if three variables are given with +values in a single observation of 1 1 0, and the default base is used, +the decimal result is 1 * 2^2 + 1 * 2^1 + 0 * 2^0 = 4 + 2 + 0 = 6. +Similarly if base 5 is used, the decimal equivalent of 2 3 4 is 2 * 5^2 ++ 3 * 5^1 + 4 * 5^0 = 50 + 15 + 4 = 59. Note that the order of variables +in {it:varlist} is crucial. (Stata 7 required.) + +{p 4 8 2}{cmd:. egen decimal = decimal(q1-q8)} + +{p 4 8 2} +{cmd:incss(}{it:strvarlist}{cmd:)} +{cmd:,} +{cmdab:s:ubstr(}{it:substring}{cmd:)} +[ +{cmdab:i:nsensitive} +] +indicates occurrences of {it:substring} within any of the variables in a list +of string variables by 1 and other observations by 0. {cmd:insensitive} +makes comparison case-insensitive. (Stata 6 required; an alternative is +now just to use {help foreach}.) + +{p 4 8 2}{cmd:. 
egen buick = incss(make), sub(buick) i} + +{p 4 8 2} +{cmd:iso3166(}{it:varname}{cmd:)} +[{cmd:,} +{cmdab:o:rigin(}{cmd:codes}|{cmd:names}{cmd:)} +{cmdab:l:anguage(}{cmd:en}|{cmd:fr}{cmd:)} +{cmdab:v:erbose} +{cmdab:u:pdate}] +maps {it:varname} containing "official short country names" into +a new variable containing the ISO 3166-1-alpha-2 code elements +(e.g. DE for "Germany", GB for "United Kingdom" and HM for "Heard +Island and McDonald Islands") and vice versa. The official short +country names can be in English (default) or French. Correspondingly +the function produces country names from ISO 3166-1-alpha-2 codes in +English or French. (Version 9.2 required.) + +{p 8 8 2}{cmdab:o:rigin(}{cmd:codes}|{cmd:names}{cmd:)} declares the +character of +the country variable that is already in the data. The default is +{cmd:names}, meaning that {it:varname} holds the "official short country +names". This information may be stored as a string variable or as a +numeric variable that is labeled accordingly. This default setting +produces ISO 3166-1-alpha-2 codes from the country names. If country +names should be produced from the two letter codes, use +{cmd:egen} {it:newvar} {cmd:= iso3166(}{it:varname}{cmd:), origin(codes)}. + +{p 8 8 2}{cmdab:l:anguage(}{cmd:en}|{cmd:fr}{cmd:)} defines the language in +which the country names are stored, or should be +produced. {cmd:language(en)} is for English names (default); +{cmd:language(fr)} is for French names. + +{p 8 8 2}{cmdab:v:erbose} For the mapping from country names to +ISO 3166-1-alpha2 codes the program expects official short +country names. It cannot handle unofficial country names such as +"Great Britain", "Taiwan" or "Russia". Such unofficial country names +result in the generation of missing values for the respective +countries. By default {cmd:iso3166()} only returns the number of +missing values it has produced. 
With {cmd:verbose} Stata also provides +the list of unofficial country names in {it:varname} and a clickable +link to the list of official country names. This is convenient +if one wants to correct the information stored in {it:varname} before +using {cmd:iso3166()}. For the transformation of ISO 3166-1-alpha2 +codes into country names, {cmd:verbose} does something +equivalent. + +{p 8 8 2}{cmdab:u:pdate} The ISO 3166-1-alpha2 codes are automatically +looked up in information provided by the ISO 3166 Maintenance +Agency of the International Organization for Standardization. The +information is automatically downloaded from the internet when the user +specifies {cmd:iso3166()} the first time, or +whenever {cmd:update} is specified. Note: Updating the matching list +regularly will guarantee that {cmd:iso3166()} always produces +up-to-date country names. However, updating the match list may also +produce missing values when running older do-files for data sets with +countries that no longer exist (for example, Yugoslavia). + +{p 8 8 2}Note the implications: This function will only work if your copy of +Stata can access the internet, at least for the first time it is called. +The results of the function might be not fully reproducible in the +future. + +{p 4 8 2} +{cmd:msub(}{it:strvar}{cmd:)} +{cmd:,} +{cmdab:f:ind(}{it:findstr}{cmd:)} +[ +{cmdab:r:eplace(}{it:replacestr}{cmd:)} +{cmd:n(}{it:#}{cmd:)} +{cmdab:w:ord} +] +replaces occurrences of the words of {it:findstr} by the words of +{it:replacestr} in the string variable {it:strvar}. The words of +{it:findstr} and of {it:replacestr} are separated by spaces or bound by +{cmd:" "}: thus {cmd:find(a b "c d")} includes three words, in turn +{cmd:"a"}, {cmd:"b"} and {cmd:"c d"}, and double quotation marks +{cmd:" "} should be used to delimit any word including one or more spaces. 
The +number of words in {it:findstr} should equal that in {it:replacestr}, +except that (1) an empty {it:replacestr} is taken to specify deletion; +(2) a single word in {it:replacestr} is taken to mean that each word of +{it:findstr} is to be replaced by that word. As quotation marks are used +for delimiting, literal quotation marks should be included in +compound double quotation marks, as in {cmd:`"""'}. By default all occurrences +are changed. {cmd:n(}{it:#}{cmd:)} specifies that the first {it:#} +occurrences only should be changed. {cmd:word} specifies that words in +{it:findstr} are to be replaced only if they occur as separate words in +{it:strvar}. The substitutions of {cmd:msub()} are made in sequence. +(Stata 6 required; {cmd:msub()} depends on the built-in functions +{help subinstr()} and {help subinword()}.) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(A B C) r(1 2 3)}{p_end} +{p 4 4 2}(replaces {cmd:"A"} by {cmd:"1"}, {cmd:"B"} by {cmd:"2"}, {cmd:"C"} by {cmd:"3"}) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(A B C) r(1 2 3) n(1)}{p_end} +{p 4 4 2}(replaces {cmd:"A"} by {cmd:"1"}, {cmd:"B"} by {cmd:"2"}, {cmd:"C"} by {cmd:"3"}, first occurrence only) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(A B C) r(1)}{p_end} +{p 4 4 2}(replaces {cmd:"A"} by {cmd:"1"}, {cmd:"B"} by {cmd:"1"}, {cmd:"C"} by {cmd:"1"}) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(A B C)}{p_end} +{p 4 4 2}(deletes {cmd:"A"}, {cmd:"B"}, {cmd:"C"}) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(" ")}{p_end} +{p 4 4 2}(deletes spaces) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(`"""')}{p_end} +{p 4 4 2}(deletes quotation mark {cmd:"}) + +{p 4 8 2}{cmd:. egen newstr = msub(strvar), f(frog) w}{p_end} +{p 4 4 2}(deletes {cmd:"frog"} only if occurring as single word) + +{p 4 8 2} +{cmd:noccur(}{it:strvar}{cmd:)} +{cmd:,} +{cmdab:s:tring(}{it:substr}{cmd:)} +creates a variable containing the number of occurrences of the string +{it:substr} in string variable {it:strvar}. 
Note that occurrences must +be disjoint (non-overlapping): thus there are two occurrences of +{cmd:"aa"} within {cmd:"aaaaa"}. (Stata 7 required.) + +{p 4 8 2} +{cmd:nss(}{it:strvar}{cmd:)} +{cmd:,} +{cmdab:f:ind(}{it:substr}{cmd:)} +[ +{cmdab:i:nsensitive} +] +returns the number +of occurrences of {it:substr} within the string variable {it:strvar}. +{cmd:insensitive} makes counting case-insensitive. (Stata 6 required.) + +{p 4 4 2}The inclusion of {cmd:noccur()} and {cmd:nss()}, two almost +identical functions, was an act of sheer inadvertence by the maintainer. + +{p 4 8 2} +{cmd:ntos(}{it:numvar}{cmd:)} +{cmd:,} +{cmdab:f:rom(}{it:numlist}{cmd:)} +{cmdab:t:o(}{it:list of string values}{cmd:)} +generates a string variable from a numeric variable {it:numvar}, mapping +each numeric value in {it:numlist} to the corresponding string value. +The number of elements in each list must be the same. String values +containing blanks should be delimited by double quotation marks +{cmd:" "}. Values not defined by the mapping are generated as missing. The type +of the string variable is determined automatically. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen grade = ntos(Grade), from(1/5) to(Poor Fair Good "Very good" Excellent)} + +{p 4 8 2} +{cmd:nwords(}{it:strvar}{cmd:)} returns the number of words within the string +variable {it:strvar}. Words are separated by spaces, unless bound by double +quotation marks {cmd:" "}. (Stata 6 required; superseded by +{help wordcount()}). + +{p 4 8 2} +{cmd:repeat()} +{cmd:,} +{cmdab:v:alues(}{it:value_list}{cmd:)} +[ +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:b:lock(}{it:#}{cmd:)} +] +produces a repeated sequence of {it:value_list}. The items of {it:value_list}, +which may be a {it:numlist} or a set of string values, are assigned cyclically to +successive observations. 
The order of observations is determined (1) after +noting any {cmd:if} or {cmd:in} restrictions; (2) within groups specified by +{cmd:by()}, if issued; (3) by the current sort order. {cmd:block()} specifies +that values should be repeated in blocks of the specified size: the default is +1. The variable type is determined smartly, and need not be specified. (Stata 8 +required.) + +{p 4 8 2}{cmd:. egen quarter = repeat(), v(1/4) block(3)}{p_end} +{p 4 8 2}{cmd:. egen months = repeat(), v(`c(Months)')}{p_end} +{p 4 8 2}{cmd:. egen levels = repeat(), v(10 50 200 500)} + +{p 4 8 2} +{cmd:sieve(}{it:strvar}{cmd:)} +{cmd:,} +{c -(} +{cmd:keep(}{it:classes}{cmd:)} +{c |} +{cmd:char(}{it:chars}{cmd:)} +{c |} +{cmd:omit(}{it:chars}{cmd:)} +{c )-} +selects characters from {it:strvar} according to a specified criterion and +generates a new string variable containing only those characters. This may be +done in three ways. First, characters are classified using the keywords +{cmd:alphabetic} (any of {cmd:a-z} or {cmd:A-Z}), {cmd:numeric} (any of +{cmd:0-9}), {cmd:space} or {cmd:other}. {cmd:keep()} specifies one or more of +those classes: keywords may be abbreviated by as little as one letter. Thus +{cmd:keep(a n)} selects alphabetic and numeric characters and omits spaces and +other characters. Note that keywords must be separated by spaces. +Alternatively, {cmd:char()} specifies each character to be selected or +{cmd:omit()} specifies each character to be omitted. Thus +{cmd:char(0123456789.)} selects numeric characters and the stop (presumably as +decimal point); {cmd:omit(" ")} strips spaces and {cmd:omit(`"""')} strips +double quotation marks. (Stata 7 required.) + +{p 4 8 2} +{cmd:ston(}{it:strvar}{cmd:)} +{cmd:,} +{cmdab:f:rom(}{it:list of string values}{cmd:)} +{cmdab:t:o(}{it:numlist}{cmd:)} +generates a numeric variable from a string variable {it:strvar}, mapping each +string value to the corresponding numeric value in {it:numlist}. 
The number of +elements in each list must be the same. String values containing blanks should +be delimited by {cmd:" "}. Values not defined by the mapping are generated as +missing. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen Grade = ston(grade), to(1/5) from(Poor Fair Good "Very good" Excellent)} + +{p 4 8 2} +{cmd:truncdig(}{it:varname}{cmd:), dig(}{it:#}{cmd:)} +truncates a numeric variable at the specified number of decimal digits. +It applies the {cmd:trunc()} or {cmd:int()} function to the variable +times 10^{cmd:dig}, then divides by 10^{cmd:dig}. The {cmd:dig()} +argument may be positive, zero or negative. If negative, it creates a +binned variable: for instance, with income in dollars, +{cmd:egen inck = truncdig(income), dig(-3)} creates a measure of income +expressed in whole thousands of dollars. (Stata 12 required.) + +{p 4 8 2} +{cmd:wordof(}{it:strvar}{cmd:)} +{cmd:,} +{cmdab:w:ord(}{it:#}{cmd:)} returns the {it:#}th word of string variable +{it:strvar}. {cmd:word(1)} is the first word, {cmd:word(2)} the second word, +{cmd:word(-1)} the last word, and so forth. Words are separated by spaces, +unless bound by quotation marks {cmd:" "}. (Stata 6 required; superseded +by {help word()}.) + + +{title:Dates, times and time series} + +{p 4 8 2} +{cmd:bom(}{it:m y}{cmd:)} +[ +{cmd:,} +{cmdab:l:ag(}{it:lag}{cmd:)} +{cmdab:f:ormat(}{it:format}{cmd:)} +{cmdab:w:ork} +] +creates an elapsed date variable containing the date of the beginning of +month {it:m} and year {it:y}. {it:m} can be a variable containing +integers between 1 and 12 inclusive or a single integer in that range. +{it:y} can be a variable containing integers within the range covered by +elapsed dates or a single integer within that range. Optionally +{cmd:lag()} specifies a lag: the beginning of the month will be given +for {cmd:lag} months before the current date. {cmd:lag(1)} refers to the +previous month, {cmd:lag(3)} to 3 months ago and {cmd:lag(-3)} to 3 +months hence. 
The {cmd:lag} may also be specified by a variable +containing integers. Optionally a format, usually but not necessarily a +date format, can be specified. {cmd:work} specifies that the first day +must also be one of Monday to Friday. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen bom = bom(month year), f(%dd_m_y)}{p_end} + +{p 4 8 2} +{cmd:bomd(}{it:datevar}{cmd:)} +[ +{cmd:,} +{cmdab:l:ag}{cmd:(}{it:lag}{cmd:)} +{cmdab:f:ormat}{cmd:(}{it:format}{cmd:)} +{cmdab:w:ork} +] +creates an elapsed date variable containing the date of the beginning of +the month containing the date in an elapsed date variable {it:datevar}. +Optionally {cmd:lag()} specifies a lag: the beginning of the month will +be given for {cmd:lag} months before the current date. {cmd:lag(1)} +refers to the previous month, {cmd:lag(3)} to 3 months ago and +{cmd:lag(-3)} to 3 months hence. The {cmd:lag} may also be specified by +a variable containing integers. Optionally a format, usually but not +necessarily a date format, can be specified. {cmd:work} specifies that +the first day must also be one of Monday to Friday. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen bomd = bomd(date), f(%dd_m_y)} + +{p 4 4 2} +Note that {cmd:work} knows nothing about holidays or any special days. + +{p 4 8 2} +{cmd:dayofyear(}{it:daily_date_variable}{cmd:)} +[ +{cmd:,} +{cmdab:m:onth(}{it:#}{cmd:)} +{cmdab:d:ay(}{it:#}{cmd:)} +] +generates the day of the year, counting from the +start of the year, from a daily date variable. The +start of the year is 1 January by default: {cmd:month()} +and/or {cmd:day()} may be used to specify an alternative. +This function thus is a generalisation of the date +function {help doy()}. +(Stata 8 required.) + +{p 4 8 2}{cmd:. 
egen dayofyear = dayofyear(date), m(10)} + +{p 4 8 2} +{cmd:dhms(}{it:d h m s}{cmd:)} +[ +{cmd:,} +{cmdab:f:ormat(}{it:format}{cmd:)} +] +creates a date variable from Stata date variable or date {it:d} with a +fractional part reflecting the number of hours, minutes and seconds past +midnight. {it:h} can be a variable containing integers between 0 and 23 +inclusive or a single integer in that range. {it:m} and {it:s} can be +variables containing integers between 0 and 59 or single integer(s) in +that range. Optionally a format, usually but not necessarily a date +format, can be specified. The resulting variable, which is by default +stored as a double, may be used in date and time arithmetic in which the +time of day is taken into account. (Stata 6 required.) + +{p 4 8 2} +{cmd:elap(}{it:time}{cmd:)} +[ +{cmd:,} +{cmdab:f:ormat(}{it:format}{cmd:)} +] +creates a string variable which contains the number of days, hours, +minutes and seconds associated with an integer variable containing a +number of elapsed seconds. Such a variable might be the result of +date/time arithmetic, where a time interval between two timestamps has +been expressed in terms of elapsed seconds. Leading zeroes are included +in the hours, minutes, and seconds fields. Optionally, a format can be +specified. (Stata 6 required.) + +{p 4 8 2} +{cmd:elap2(}{it:time1 time2}{cmd:)} +[ +{cmd:,} +{cmdab:f:ormat(}{it:format}{cmd:)} +] +creates a string variable which contains the number of days, hours, +minutes and seconds associated with a pair of time values, expressed as +fractional days, where {it:time1} is no greater than {it:time2}. Such +time values may be generated by function {cmd:dhms()}. {cmd:elap2()} +expresses the interval between these time values in readable form. +Leading zeroes are included in the hours, minutes, and seconds fields. +Optionally, a format can be specified. (Stata 6 required.) 
+ +{p 4 8 2} +{cmd:eom(}{it:m y}{cmd:)} +[ +{cmd:,} +{cmdab:l:ag(}{it:lag}{cmd:)} +{cmdab:f:ormat(}{it:format}{cmd:)} +{cmdab:w:ork} +] +creates an elapsed date variable containing the date of the end of month +{it:m} and year {it:y}. {it:m} can be a variable containing integers between 1 and 12 +inclusive or a single integer in that range. {it:y} can be a variable +containing integers within the range covered by elapsed dates or a +single integer within that range. Optionally {cmd:lag()} specifies a +lag: the end of the month will be given for {cmd:lag} months before the +current date. {cmd:lag(1)} refers to the previous month, {cmd:lag(3)} to +3 months ago and {cmd:lag(-3)} to 3 months hence. The {cmd:lag} may also +be specified by a variable containing integers. Optionally a format, +usually but not necessarily a date format, can be specified. {cmd:work} +specifies that the last day must also be one of Monday to Friday. +(Stata 6 required.) + +{p 4 8 2}{cmd:. egen eom = eom(month year), f(%dd_m_y)} + +{p 4 8 2} +{cmd:eomd(}{it:datevar}{cmd:)} +[ +{cmd:,} +{cmdab:l:ag(}{it:lag}{cmd:)} +{cmdab:f:ormat(}{it:format}{cmd:)} +{cmdab:w:ork} +] +creates an elapsed date variable containing the date of the end of the +month containing the date in an elapsed date variable {it:datevar}. +Optionally {cmd:lag()} specifies a lag: the end of the month will be +given for {cmd:lag} months before the current date. {cmd:lag(1)} refers +to the previous month, {cmd:lag(3)} to 3 months ago and {cmd:lag(-3)} to +3 months hence. The {cmd:lag} may also be specified by a variable +containing integers. Optionally a format, usually but not necessarily a +date format, can be specified. {cmd:work} specifies that the last day +must also be one of Monday to Friday. (Stata 6 required.) + +{p 4 4 2}Note that {cmd:work} knows nothing about holidays +or any special days. + +{p 4 8 2}{cmd:. egen eom = eomd(date), f(%dd_m_y)}{p_end} +{p 4 8 2}{cmd:. 
egen eopm = eomd(date), f(%dd_m_y) lag(1)} + +{p 4 8 2} +{cmd:ewma(}{it:timeseriesvar}{cmd:)} +{cmd:,} +{cmd:a(}{it:#}{cmd:)} +calculates the exponentially weighted moving average, which is + +{p 8 8 2} +{it:ewma} = {it:timeseriesvar} for the first observation + +{p 13 8 2} += {cmd:a * }{it:timeseriesvar} + {cmd:(1 - a) * L.}{it:ewma} otherwise + +{p 8 8 2} +The data must have been declared time series data by {help tsset}. +Calculations start afresh after any gap with missing values. +(Stata 6 required; superseded by {help tssmooth}.) + +{p 4 8 2} +{cmd:filter(}{it:timeseriesvar}{cmd:) ,} +{cmdab:l:ags(}{it:numlist}{cmd:)} +[ +{cmdab:c:oef(}{it:numlist}{cmd:)} +{c -(} +{cmdab:n:ormalise} +{c |} +{cmdab:n:ormalize} +{c )-} +] +calculates the linear filter which is the sum of terms + +{p 8 8 2} +{it:coef_i} {cmd:* L}{it:i.timeseriesvar} or {it:coef_i} {cmd:* F}{it:i.timeseriesvar} + +{p 8 8 2} +{cmd:coef()} defaults to a vector the same length as {cmd:lags()} with each +element 1. + +{p 8 8 2} +{cmd:filter(y), l(0/3) c(0.4(0.1)0.1)} calculates + +{p 8 8 2} +{cmd:0.4 * y + 0.3 * L1.y + 0.2 * L2.y + 0.1 * L3.y} + +{p 8 8 2} +{cmd:filter(y), l(0/3)} calculates + +{p 8 8 2} +{cmd:1 * y + 1 * L1.y + 1 * L2.y + 1 * L3.y} or {cmd:y + L1.y + L2.y + L3.y} + +{p 8 8 2} +Leads are specified as negative lags. {cmd:normalise} (or {cmd:normalize}, +according to taste) specifies that coefficients are to be divided by +their sum so that they add to 1 and thus specify a weighted mean. + +{p 8 8 2} +{cmd:filter(y), l(-2/2) c(1 4 6 4 1) n} calculates + +{p 8 8 2} +{cmd:(1/16) * F2.y + (4/16) * F1.y + (6/16) * y} +{cmd:+ (4/16) * L1.y + (1/16) * L2.y} + +{p 8 8 2} +The data must have been declared time series data by {help tsset}. +Note that this may include panel data, which are automatically +filtered separately within each panel. + +{p 8 8 2} +The order of terms in {cmd:coef()} is taken to be the same as that in +{cmd:lags}. (Stata 8 required; see also {help tssmooth}.) 
+ +{p 4 8 2}{cmd:. egen f2y = filter(y), l(-1/1) c(0.25 0.5 0.25)}{p_end} +{p 4 8 2}{cmd:. egen f2y = filter(y), l(-1/1) c(1 2 1) n} + +{p 4 8 2} +{cmd:filter7(}{it:timeseriesvar}{cmd:) ,} +{cmdab:l:ags(}{it:numlist}{cmd:)} +{cmdab:c:oef(}{it:numlist}{cmd:)} +[ +{c -(} +{cmdab:n:ormalise} +{c |} +{cmdab:n:ormalize} +{c )-} +] +calculates the linear filter which is the sum of terms + +{p 8 8 2} +{it:coef_i} {cmd:* L}{it:i.timeseriesvar} or {it:coef_i }{cmd:* F}{it:i.timeseriesvar} + +{p 8 8 2} +{cmd:filter7(y), l(0/3) c(0.4(0.1)0.1)} calculates + +{p 8 8 2} +{cmd:0.4 * y + 0.3 * L1.y + 0.2 * L2.y + 0.1 * L3.y} + +{p 8 8 2} +Leads are specified as negative lags. {cmd:normalise} (or {cmd:normalize}, +according to taste) specifies that coefficients are to be divided by +their sum so that they add to 1 and thus specify a weighted mean. + +{p 8 8 2} +{cmd:filter7(y), l(-2/2) c(1 4 6 4 1) n} calculates + +{p 8 8 2} +{cmd:(1/16) * F2.y + (4/16) * F1.y + (6/16) * y} +{cmd:+ (4/16) * L1.y + (1/16) * L2.y} + +{p 8 8 2} +The data must have been declared time series data by {help tsset}. +Note that this may include panel data, which are automatically +filtered separately within each panel. + +{p 8 8 2} +The order of terms in {cmd:coef()} is taken to be the same as that in +{cmd:lags()}. (Stata 7 required; see also {help tssmooth}.) + +{p 4 8 2} +{cmd:foy(}{it:daily_date_variable}{cmd:)} +[ +{cmd:,} +{cmdab:m:onth(}{it:#}{cmd:)} +{cmdab:d:ay(}{it:#}{cmd:)} +] +generates the fraction of the year elapsed since the +start of the year from a daily date variable. The +start of the year is 1 January by default: {cmd:month()} +and/or {cmd:day()} may be used to specify an alternative. +If {it:daily_date_variable} +is all integers, then the result is {bind:(day of year - 0.5)} / +number of days in year. If {it:daily_date_variable} +contains non-integers, then the result is +{bind:(day of year - 1)} / number of days in year. +(Stata 8 required.) + +{p 4 8 2}{cmd:. 
egen frac = foy(date), m(10)} + +{p 4 8 2} +{cmd:hmm(}{it:timevar}{cmd:)} +[ +{cmd:,} +{cmdab:r:ound(}{it:#}{cmd:)} +{cmdab:t:rim} +] +generates a string variable showing {it:timevar}, interpreted as +indicating time in minutes, represented as hours and minutes in the form +{cmd:"}[...{it:h}]{it:h}{cmd::}{it:mm}{cmd:"}. For example, times of +{cmd:9}, {cmd:90}, {cmd:900} and {cmd:9000} minutes would be represented +as {cmd:"0:09"},{cmd:"1:30"}, {cmd:"15:00"} and {cmd:"150:00"}. The +option {cmd:round(}{it:#}{cmd:)} rounds the result: {cmd:round(1)} +rounds the time to the nearest minute. The option {cmd:trim} trims the +result of leading zeros and colons, except that an isolated {cmd:0} is +not trimmed. With {cmd:trim} {cmd:"0:09"} is trimmed to {cmd:"9"} and +{cmd:"0:00"} is trimmed to {cmd:"0"}. + +{p 8 8 2} +{cmd:hmm()} serves equally well for representing times in seconds in +minutes and seconds in the form +{cmd:"}[...{it:m}]{it:m}{cmd::}{it:ss}{cmd:"}. (Stata 6 required.) + +{p 4 8 2} +{cmd:hmmss(}{it:timevar}{cmd:)} +[ +{cmd:,} +{cmdab:r:ound(}{it:#}{cmd:)} +{cmdab:t:rim} +] +generates a string variable showing {it:timevar}, interpreted as +indicating time in seconds, represented as hours, minutes and seconds in +the form {cmd:"}[...{it:h}{cmd::}]{it:mm}{cmd::}{it:ss}{cmd:"}. For +example, times of {cmd:9}, {cmd:90}, {cmd:900} and {cmd:9000} seconds +would be represented as {cmd:"00:09"},{cmd:"01:30"}, {cmd:"15:00"} and +{cmd:"2:30:00"}. The option {cmd:round(}{it:#}{cmd:)} rounds the result: +{cmd:round(1)} rounds the time to the nearest second. The option +{cmd:trim} trims the result of leading zeros and colons, except that an +isolated {cmd:0} is not trimmed. With {cmd:trim} {cmd:"00:09"} is +trimmed to {cmd:"9"} and {cmd:"00:00"} is trimmed to {cmd:"0"}. (Stata 6 +required.) + +{p 4 8 2} +{cmd:hms(}{it:h m s}{cmd:)} +[ +{cmd:,} +{cmdab:f:ormat(}{it:format}{cmd:)} +] +creates an elapsed time variable containing the number of seconds past +midnight. 
{it:h} can be a variable containing integers between 0 and 23 +inclusive or a single integer in that range. {it:m} and {it:s} can be variables +containing integers between 0 and 59 or single integer(s) in that range. +Optionally a format can be specified. (Stata 6 required.) + +{p 4 8 2} +{cmd:minutes(}{it:strvar}{cmd:)} +[ +{cmd:,} +{cmd:maxhour(}{it:#}{cmd:)} +] +returns time in minutes given a string variable {it:strvar} containing a +time in hours and minutes in the form +{cmd:"}[..{it:h}]{it:hh}:{it:mm}{cmd:"}. In particular, minutes are +given as two digits between 00 and 59 and hours by default are given as +two digits between 00 and 23. The {cmd:maxhour()} option may be used to +change the (unreachable) limit: its default is 24. Note that, strange +though it may seem, this function rather than {cmd:seconds()} is +appropriate for converting times in the form +{cmd:"}{it:mm}:{it:ss}{cmd:"} to seconds. The maximum number of minutes +acceptable may need then to be specified by {cmd:maxhour()} [sic]. +(Stata 8 required.) + +{p 4 8 2} +{cmd:ncyear(}{it:datevar}{cmd:)} +{cmd:,} +{cmdab:m:onth(}{it:#}{cmd:)} +[ +{cmdab:d:ay(}{it:#}{cmd:)} +] +returns an integer variable labelled with labels such as {cmd:"1952/53"} +for non-calendar years starting on the specified month and day. The day +defaults to 1. {it:datevar} is treated as indicating elapsed dates. For +more on dates, see help on {help dates}. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen wtryear = ncyear(date), m(10)}{p_end} +{p 4 4 2}(years starting on 1 October) + +{p 4 8 2}{cmd:. egen wwgyear = ncyear(date), m(1) d(21)}{p_end} +{p 4 4 2}(years starting on 21 January) + +{p 4 8 2} +{cmd:record(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmd:min} +{cmd:order(}{it:varlist}{cmd:)} +] +produces the maximum (with {cmd:min} the minimum) value observed "to date" of +the specified {it:exp}. 
Thus {cmd:record(wage), by(id) order(year)} produces +the maximum wage so far in a worker's career, calculations being separate for +each {cmd:id} and records being determined within each {cmd:id} in {cmd:year} +order. Although explanation and example here refer to dates, nothing in +{cmd:record()} restricts its use to data ordered in time. If not otherwise +specified with {cmd:by()} and/or {cmd:order()}, records are determined with +respect to the current order of observations. No special action is required for +missing values, as internally {cmd:record()} uses either the {cmd:max()} or the +{cmd:min()} function, both of which return results of missing only if all +values are missing. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen hiwage = record(exp(lwage)), by(id) order(year)}{p_end} +{p 4 8 2}{cmd:. egen lowage = record(exp(lwage)), by(id) order(year) min} + +{p 4 8 2} +{cmd:seconds(}{it:strvar}{cmd:)} +[ +{cmd:,} +{cmd:maxhour(}{it:#}{cmd:)} +] +returns time in seconds given a string variable containing a time in hours, +minutes and seconds in the form +{cmd:"}[..{it:h}]{it:hh}{cmd::}{it:mm}{cmd::}{it:ss}{cmd:"}. +In particular, minutes and seconds are each given as two digits between +00 and 59 and hours by default are given as two digits between 00 and +23. The {cmd:maxhour()} option may be used to change the (unreachable) +limit: its default is 24. (Stata 8 required.) + +{p 4 8 2} +{cmd:tod(}{it:time}{cmd:)} +[ +{cmd:,} +{cmdab:f:ormat(}{it:format}{cmd:)} +] +creates a string variable which contains the number of hours, minutes and +seconds associated with an integer in the range 0 to 86399, one less than the +number of seconds in a day. Such a variable is produced by {cmd:hms()}, which +see above. Leading zeroes are included in the hours, minutes, and seconds +fields. Colons are used as separators. Optionally a format can be specified. +(Stata 6 required.) 
+ + +{title:Summaries and estimates} + +{p 4 8 2} +{cmd:adjl(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:fact:or(}{it:#}{cmd:)} +] +calculates adjacent lower values. These are the smallest values within +{cmd:factor()} times the interquartile range of the lower quartile. +By default {cmd:factor()} is 1.5, defining the default lower value +of a so-called whisker on a Stata box plot. (Stata 8 required.) + +{p 4 8 2} +{cmd:adju(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:fact:or(}{it:#}{cmd:)} +] +calculates adjacent upper values. These are the largest values within +{cmd:factor()} times the interquartile range of the upper quartile. +By default {cmd:factor()} is 1.5, defining the default upper value +of a so-called whisker on a Stata box plot. (Stata 8 required.) + +{p 4 8 2}{cmd:. egen adjl = adjl(mpg), by(foreign)}{p_end} +{p 4 8 2}{cmd:. egen adju = adju(mpg), by(foreign)} + +{p 4 8 2} +{cmd:corr(}{it:varname1 varname2}{cmd:)} +[ +{cmd:,} +{cmdab:c:ovariance} +{cmdab:s:pearman} +{cmd:taua} +{cmd:taub} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the correlation of {it:varname1} with {it:varname2}. By +default, this returns the Pearson correlation coefficient. {cmd:covariance} +indicates that covariances should be calculated; {cmd:spearman} +indicates that Spearman's rank correlation coefficient should be +calculated; {cmd:taua} and {cmd:taub} return Kendall's tau-A and tau-B, +respectively. (Stata 8 required.) + +{p 4 8 2} +{cmd:d2(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmdab:w:eights(}{it:exp}{cmd:)} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the mean absolute deviation from the median (within varlist) of {it:exp}, +allowing specification of weights. The function creates a constant (within {it:byvarlist}) +containing the mean of abs({it:exp} - median({it:exp})). (Stata 10.1 required.) 
+ +{p 4 8 2} +{cmd:density(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmdab:w:idth(}{it:#}{cmd:)} +{cmdab:st:art(}{it:#}{cmd:)} +{cmdab:freq:uency} +{cmd:percent} +{cmdab:frac:tion} +{cmd:by(}{it:byvarlist}{cmd:)} +] +calculates the density (or optionally the {cmd:frequency}, +{cmd:fraction} or {cmd:percent}) of values in bins of width +{cmd:width()} (default 1) starting at {cmd:start()} (default minimum of +the data). Note that each value produced will be identical for all +observations in the same bin. Commonly for further use it will be +desired to select one value from each bin, say by using {help egen}'s +{cmd:tag()} function. (Stata 8 required.) + +{p 4 8 2} +{cmd:gmean(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the geometric mean of {it:exp}. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen gmean = gmean(mpg), by(rep78)} + +{p 4 8 2} +{cmd:hmean(}{it:exp}{cmd:)} +[ +{cmd:, by(}{it:byvarlist}{cmd:)} +] +returns the harmonic mean of {it:exp}. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen hmean = hmean(mpg), by(rep78)} + +{p 4 8 2} +{cmd:nmiss(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the number of missing values in {it:exp}. (Stata 6 required.) +Remark: Why this was written is a mystery. The one-line command +{cmd:egen nmiss = sum(missing(}{it:exp}{cmd:))} +(in Stata 9 {cmd:egen nmiss = total(missing(}{it:exp}{cmd:))}) +shows that it is unnecessary. + +{p 4 8 2}{cmd:. egen nmiss = nmiss(rep78), by(foreign)} + +{p 4 8 2} +{cmd:nvals(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:miss:ing} +] +returns the number of distinct values in {it:varname}. Missing values +are ignored unless {cmd:missing} is specified. +Remark: Much can be done by using {help egen} function {cmd:tag()} +and then summing values as desired. See also {cmd:distinct} (Cox and Longton 2008). +(Stata 6 required.) 
+ +{p 4 8 2} +{cmd:outside(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:fact:or(}{it:#}{cmd:)} +] +calculates outside values. These are any values more than {cmd:factor()} +times the interquartile range from the nearer quartile, that is above +the upper quartile or below the lower quartile. By default +{cmd:factor()} is 1.5, defining the default outside values, those +plotted separately, on a Stata box plot. +Values not outside are returned as missing. +(Stata 8 required.) + +{p 4 8 2} +{cmd:ridit(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +{cmdab:miss:ing} +{cmdab:perc:ent} +{cmdab:rev:erse} +] +calculates the ridit for {it:varname}, which is + +{space 8}(1/2) count at this value + SUM counts in values below +{space 8}{hline 54} +{space 23}SUM counts of all values + +{p 8 8 2} +With terminology from Tukey (1977, pp.496-497), this could be called a +`split fraction below'. The name `ridit' was used by Bross (1958): +see also Fleiss (1981, pp.150-7) or Flora (1988). The numerator is a +`split count'. + +{p 8 8 2} +{cmd:missing} specifies that observations for which values of {it:byvarlist} +are missing will be included in calculations if {cmd:by()} is specified. The +default is to exclude them. {cmd:percent} scales the numbers to percents by +multiplying by 100. {cmd:reverse} specifies the use of reverse cumulative +probabilities (1 - fraction above). (Stata 6 required.) + +{p 4 8 2} +{cmd:semean(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +calculates the standard error of the mean of {it:exp}. (Stata 6 +required.) + +{p 4 8 2} +{cmd:sumoth(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the sum of the other values of {it:exp} in the same group. If +{cmd:by()} is specified, distinct combinations of {it:byvarlist} define groups; +otherwise all observations define one group. (Stata 6 required.) 
+ +{p 4 8 2} +{cmd:var(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +creates a constant (within {it:byvarlist}) containing the variance of {it:exp}. +Note also the {help egen} function {cmd:sd()}. (Stata 6 required.) + +{p 4 8 2} +{cmd:wpctile(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:p(}{it:#}{cmd:)} +{cmdab:w:eights(}{it:varname}{cmd:)} +{cmdab:alt:def} +{cmd:by(}{it:byvarlist}{cmd:)} +] +is a hack on official Stata's {cmd:egen} function {cmd:pctile()} +allowing specification of weights in the calculation of percentiles. By +default, the function creates a constant (within {it:byvarlist}) +containing the {it:#}th percentile of {it:varname}. If {cmd:p()} is not +specified, 50 is assumed, meaning medians. {cmd:weights()} requests +weighted calculation of percentiles. {cmd:altdef} uses an alternative +formula for calculating percentiles, which is not applicable with +weights present. {cmd:by()} requests calculation by groups. You may +also use the {cmd:by:} construct. (Stata 8.2 required.) + +{p 4 8 2} +{cmd:wtfreq(}{it:exp}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +creates a constant (within {it:byvarlist}) +containing the weighted frequency using {it:exp} as weights. (Such +frequencies sum to {cmd:_N}.) (Stata 6 required.) + +{p 4 8 2} +{cmd:xtile(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmdab:p:ercentiles(}{it:numlist}{cmd:)} +{cmdab:n:quantiles(}{it:#}{cmd:)} +{cmdab:w:eights(}{it:varname}{cmd:)} +{cmdab:alt:def} +{cmd:by(}{it:byvarlist}{cmd:)} +] +categorizes {it:varname} by specific percentiles. The function works +like {help xtile}. By default {it:varname} is dichotomized at the +median. {cmd:percentiles()} requests percentiles corresponding to +{it:numlist}: for example, {cmd:p(25(25)75)} is used to create a +variable according to quartiles. Alternatively you may +specify {cmd:n(4)} to create a variable according to quartiles. +{cmd:weights()} requests weighted calculation of percentiles. 
+{cmd:altdef} uses an alternative formula for calculating percentiles. +See {help xtile}. {cmd:by()} requests calculation by groups. You may +also use the {cmd:by:} construct. (Stata 8.2 required.) + +{p 4 8 2}{cmd:. egen mpg4 = xtile(mpg), by(foreign) p(25(25)75)}{p_end} +{p 4 8 2}{cmd:. egen mpg10 = xtile(mpg), by(foreign) nq(10)} + + +{title:First and last} + +{p 4 8 2} +{cmd:first(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the first non-missing value of {it:varname}. `First' depends on the +existing order of observations. {it:varname} may be numeric or string. +(Stata 6 required.) + +{p 4 8 2} +{cmd:ifirst(}{it:numvar}{cmd:)} +{cmd:,} +{cmdab:v:alue(}{it:#}{cmd:)} +[ +{c -(} +{cmdab:be:fore} +{c |} +{cmdab:a:fter} +{c )-} +{cmd:by(}{it:byvarlist}{cmd:)} +] +indicates the first occurrence of integer {it:#} within {it:numvar} +by 1 and other observations by 0. + +{p 8 8 2} +{cmd:before} indicates observations before the first occurrence by 1 +and other observations by 0. +{cmd:after} indicates observations after the first occurrence by 1 +and other observations by 0. +The default, the value {cmd:before} and the value {cmd:after} +always sum to 1 for observations analysed. + +{p 8 8 2} +First occurrence is determined as follows: (1) if {cmd:if} or {cmd:in} is +specified, any observations excluded are ignored; (2) if {cmd:by()} is +specified, first is determined separately for each distinct group of +observations; (3) first is first in current sort order. +If {it:#} does not occur, all observations +are before the first occurrence. (Stata 6 required.) + +{p 4 8 2}{cmd:. gen warm = celstemp > 20}{p_end} +{p 4 8 2}{cmd:. 
egen fwarm = ifirst(warm), v(1) by(year)} + +{p 4 8 2} +{cmd:ilast(}{it:numvar}{cmd:)} +{cmd:,} +{cmdab:v:alue(}{it:#}{cmd:)} +[ +{c -(} +{cmdab:be:fore} +{c |} +{cmdab:a:fter} +{c )-} +{cmd:by(}{it:byvarlist}{cmd:)} +] +indicates the last occurrence of integer {it:#} within {it:numvar} by 1 and +other observations by 0. + +{p 8 8 2} +{cmd:before} indicates observations before the last occurrence by 1 +and other observations by 0. +{cmd:after} indicates observations after the last occurrence by 1 +and other observations by 0. +The default, the value {cmd:before} and the value {cmd:after} +always sum to 1 for observations analysed. + +{p 8 8 2} +Last occurrence is determined as follows: (1) if {cmd:if} or {cmd:in} is +specified, any observations excluded are ignored; (2) if {cmd:by()} is +specified, last is determined separately for each distinct group of +observations; (3) last is last in current sort order. +If {it:#} does not occur, all +observations are before the last occurrence. (Stata 6 required.) + +{p 4 8 2} +{cmd:lastnm(}{it:varname}{cmd:)} +[ +{cmd:,} +{cmd:by(}{it:byvarlist}{cmd:)} +] +returns the last non-missing value of {it:varname}. `Last' depends on +the existing order of observations. {it:varname} may be numeric or +string. Remark: {cmd:lastnm()} would have been better called +{cmd:last()}, except that an {cmd:egen} program with that name for +selecting the last `word' in a string was published in STB-50. +(Stata 6 required.) + + +{title:Random numbers} + +{p 4 8 2} +{cmd:mixnorm()} +[ +{cmd:,} +{cmd:frac(}{it:#}{cmd:)} +{cmd:mu1(}{it:#}{cmd:)} +{cmd:mu2(}{it:#}{cmd:)} +{cmd:var1(}{it:#}{cmd:)} +{cmd:var2(}{it:#}{cmd:)} +] +generates a new variable of specified type as +a mixture of two Normal distributions, with the fraction +{cmd:frac(}{it:#}{cmd:)} of the observations defined by the first +distribution. 
Both options for means {cmd:mu1(}{it:#}{cmd:)} and +{cmd:mu2(}{it:#}{cmd:)} default to 0; both options for variances +{cmd:var1(}{it:#}{cmd:)} and {cmd:var2(}{it:#}{cmd:)} default to 1, +while {cmd:frac(}{it:#}{cmd:)} defaults to 0.5. Only non-default +parameters of the desired mixture need be specified. (Stata 8 required.) + +{p 4 8 2}{cmd:. egen mixture = mixnorm(), frac(0.9) mu2(10) var2(4)} + +{p 4 8 2} +{cmd:rndint()} +{cmd:,} +{cmdab:ma:x(}{it:#}{cmd:)} +[ +{cmdab:mi:n(}{it:#}{cmd:)} +] +generates random integers from a uniform distribution on {cmd:min()} to +{cmd:max()}, inclusive. {cmd:min(1)} is the default. +Remark: Note that {cmd:ceil(uniform() * }{it:#}{cmd:)} is a direct way +to get random integers from 1 to {it:#}. (Stata 6 required.) + +{p 4 8 2}{cmd:. egen integ = rndint(), min(100) max(199)}{p_end} + +{p 4 8 2} +{cmd:rndsub()} +[ +{cmd:,} +{cmdab:ng:roup(}{it:#}{cmd:)} +{c -(} +{cmdab:f:rac(}{it:#}{cmd:)} +{c |} +{cmdab:p:ercent(}{it:#}{cmd:)} +{c )-} +{cmd:by(}{it:byvarlist}{cmd:)} +] +randomly splits observations into groups or subsamples. The result is a +categorical variable taking values from 1 upward labelling distinct groups. + +{p 8 8 2} +{cmd:ngroup(}{it:#}{cmd:)} (default 2) defines the number of groups. + +{p 8 8 2} +{cmd:frac(}{it:#}{cmd:)}, which is only allowed with {cmd:ngroup(2)}, specifies that +the first group should contain 1 / {it:#} of the observations and thus that +the second group should contain the remaining observations. + +{p 8 8 2} +{cmd:percent(}{it:#}{cmd:)}, which is only allowed with {cmd:ngroup(2)}, +specifies that the first group should contain {it:#}% of the observations and thus that + the second group should contain the remaining observations. + +{p 8 8 2} +{cmd:frac()} and {cmd:percent()} may not be specified together. +(Stata 6 required.) + +{p 4 8 2}{cmd:. egen group = rndsub(), by(foreign)}{p_end} + +{p 4 8 2}{cmd:. 
egen group = rndsub(), by(foreign) f(3)}{p_end} +{p 4 4 2}(first group contains 1/3 of observations, second group contains 2/3) + +{p 4 8 2}{cmd:. egen group = rndsub(), by(foreign) p(25)}{p_end} +{p 4 8 2}(first group contains 25% of observations, second group contains 75%) + +{p 4 4 2} +For reproducible results, set the seed of the random number generator +beforehand and document your choice. + +{p 4 4 2} +Note that to generate {it:#} random numbers the number of observations must be +at least {it:#}. If there are no data in memory and you want 100 random +numbers, type {cmd:set obs 100} before using these functions. + + +{title:Row operations} + +{p 4 8 2} +{cmd:rall(}{it:varlist}{cmd:)} +{cmd:,} +{cmdab:c:ond(}{it:condition}{cmd:)} +[ +{cmdab:sy:mbol(}{it:symbol}{cmd:)} +] +returns 1 for observations for which the condition specified is true for +all variables in {it:varlist} and 0 otherwise. The condition should be +specified using {cmd:symbol()}, by default {cmd:@}, as a placeholder for each +variable. Thus, for example, +{cmd:rall(}{it:varlist}{cmd:), c(@ > 0 & @ < .)} +tests whether all variables in {it:varlist} are positive and +non-missing. Note that conditions typically make sense only if variables +are either all numeric or all string: one exception is {cmd:missing(@)}. +(Stata 6 required.) + +{p 4 8 2} +{cmd:rany(}{it:varlist}{cmd:)} +{cmd:,} +{cmdab:c:ond(}{it:condition}{cmd:)} +[ +{cmdab:sy:mbol(}{it:symbol}{cmd:)} +] +returns 1 for observations for which the condition specified is true for +any variable in {it:varlist} and 0 otherwise. The condition should be +specified using {cmd:symbol()}, by default {cmd:@}, as a placeholder for each +variable. Thus, for example, {cmd:rany(}{it:varlist}{cmd:), c(@ > 0 & @ < .)} +tests whether any variable in {it:varlist} is positive and non-missing. +Note that conditions typically make sense only if variables are either +all numeric or all string: one exception is {cmd:missing(@)}. +(Stata 6 required.) 
+ +{p 4 8 2} +{cmd:rcount(}{it:varlist}{cmd:)} +{cmd:,} +{cmdab:c:ond(}{it:condition}{cmd:)} +[ +{cmdab:sy:mbol(}{it:symbol}{cmd:)} +] +returns the number of variables in {it:varlist} for which the condition +specified is true. The condition should be specified using {cmd:symbol()}, by +default {cmd:@}, as a placeholder for each variable. Thus, for example, +{cmd:rcount(}{it:varlist}{cmd:), c(@ > 0 & @ < .)} counts for each observation how +many variables in {it:varlist} are positive and non-missing. Note that +conditions typically make sense only if variables are either all numeric or all +string: one exception is {cmd:missing(@)}. More precisely, {cmd:rcount()} +gives the sum across {it:varlist} of condition, evaluated in turn for each +variable. (Stata 6 required.) + +{p 4 4 2} +For {cmd:rall()}, {cmd:rany()}, and {cmd:rcount()}, the {cmd:symbol()} option +may be used to set an alternative to {cmd:@} whenever the latter is +inappropriate. For example, if string variables were being searched for literal +occurrences of {cmd:"@"}, some other symbol not appearing in text or in +variable names should be used. + +{p 4 8 2}{cmd:. egen any = rany(b c d e f) , c(@ == a)}{p_end} +{p 4 8 2}{cmd:. egen all = rall(b c d e f) , c(@ == a)}{p_end} +{p 4 8 2}{cmd:. egen count = rcount(b c d e f) , c(@ == a)}{p_end} +{p 4 4 2}(values of {cmd:b c d e f} matched by (equal to) those of {cmd:a}?) + +{p 4 8 2}{cmd:. egen anyw1 = rany(b c d e f) , c(abs(@ - a) <= 1)}{p_end} +{p 4 8 2}{cmd:. egen allw1 = rall(b c d e f) , c(abs(@ - a) <= 1)}{p_end} +{p 4 8 2}{cmd:. egen countw1 = rcount(b c d e f) , c(abs(@ - a) <= 1)}{p_end} +{p 4 4 2}(values of {cmd:b c d e f} within 1 of those of {cmd:a}?) + +{p 4 4 2} +From Stata 7, {help foreach} provides an alternative that would now be +considered better style: + +{p 4 8 2}{cmd:. gen any = 0}{p_end} +{p 4 8 2}{cmd:. gen all = 1}{p_end} +{p 4 8 2}{cmd:. gen count = 0}{p_end} +{p 4 8 2}{cmd:. 
foreach v of var a b c d e f {c -(}}{p_end} +{p 4 8 2}{cmd:. {space 8}replace any = max(any, inrange(`v', 0, .))}{p_end} +{p 4 8 2}{cmd:. {space 8}replace all = min(all, inrange(`v', 0, .))}{p_end} +{p 4 8 2}{cmd:. {space 8}replace count = count + inrange(`v', 0, .)}{p_end} +{p 4 8 2}{cmd:. {c )-}}{p_end} + +{p 4 8 2} +{cmd:rowmedian(}{it:varlist}{cmd:)} +returns the median across observations of the variables in {it:varlist}. +(Stata 9 required.) (Note: official Stata added a {cmd:rowmedian()} +function in Stata 11, which always trumps this one.) + +{p 4 8 2} +{cmd:rownvals(}{it:numvarlist}{cmd:)} [ {cmd:,} {cmdab:miss:ing} ] +returns the number of distinct values in each observation for a set of +numeric variables {it:numvarlist}. Thus if the values in one observation for +five numeric variables are 1, 1, 2, 2, 3 the function returns 3 for +that observation. Missing values, i.e. any of . .a ... .z, are ignored +unless the {cmd:missing} option is specified. (Stata 9 required.) + +{p 4 8 2} +{cmd:rowsvals(}{it:strvarlist}{cmd:)} [ {cmd:,} {cmdab:miss:ing} ] +returns the number of distinct values in each observation for a set of +string variables {it:strvarlist}. Thus if the values in one observation for +five string variables are "frog", "frog", "toad", "toad", "newt" the function returns 3 for +that observation. Missing values, i.e. empty strings "", are ignored +unless the {cmd:missing} option is specified. (Stata 9 required.) + +{p 4 8 2} +{cmd:rsum2(}{it:varlist}{cmd:)} is a generalisation of {help egen}'s +{cmd:rsum()} (from Stata 9: {cmd:rowtotal()}) function with the extra +options {cmdab:allm:iss} and {cmdab:anym:iss}. +As with {cmd:rsum()}, it creates the (row) sum of the variables in {it:varlist}, +treating missing as 0. However, if the option {cmd:allmiss} is selected, the +(row) sum for any observation for which all variables in {it:varlist} are +missing is set equal to missing. 
Similarly, if the option {cmd:anymiss} is +selected the (row) sum for any observation for which any variable in +{it:varlist} is missing is set equal to missing. (Stata 6 required.) + + +{title:References} + +{p 4 8 2} +Bross, I.D.J. 1958. How to use ridit analysis. {it:Biometrics} 14: 18{c -}38. + +{p 4 8 2} +Cox, N.J. 2008. Speaking Stata: Between tables and graphs. +{it:Stata Journal} 8(2): 269{c -}289. + +{p 4 8 2} +Cox, N.J. and G. M. Longton. 2008. +Speaking Stata: Distinct observations. +{it:Stata Journal} 8(4): 557{c -}568. + +{p 4 8 2} +Fleiss, J.L. 1981. {it:Statistical Methods for Rates and Proportions.} +New York: John Wiley. + +{p 4 8 2} +Flora, J.D. 1988. Ridit analysis. In Kotz, S. and Johnson, N.L. (eds) +{it:Encyclopedia of Statistical Sciences.} New York: John Wiley. 8: 136{c -}139. + +{p 4 8 2} +Tukey, J.W. 1977. {it:Exploratory Data Analysis.} Reading, MA: Addison-Wesley. + + +{title:Maintainer} + +{p 4 4 2}Nicholas J. Cox, Durham University, U.K.{break} + n.j.cox@durham.ac.uk + + +{title:Acknowledgements} + +{p 4 4 2} +Kit Baum (baum@bc.edu) is the first author of {cmd:record()} and the +author of {cmd:dhms()}, {cmd:elap()}, {cmd:elap2()}, {cmd:hms()}, +{cmd:tod()}, {cmd:mixnorm()} and {cmd:truncdig()}. + +{p 4 4 2} +Ulrich Kohler (kohler@wzb.eu) is the author of {cmd:xtile()}, +{cmd:mlabvpos()}, {cmd:iso3166()} and {cmd:wpctile()}. + +{p 4 4 2} +Pablo A. Mitnik (pmitnik@stanford.edu) is the author of {cmd:d2()}. + +{p 4 4 2} +Steven Stillman (s.stillman@verizon.net) is the author of {cmd:rsum2()}. + +{p 4 4 2} +Nick Winter (njw3x@virginia.edu) is the author of {cmd:corr()} and +{cmd:noccur()}. + +{p 4 4 2} +Kit Baum, Sascha Becker, Ron{c a'}n Conroy, William Gould, Syed Islam, +Ariel Linden, +John Moran, +Stephen Soldz, Richard Williams, Fred Wolfe and Gerald Wright +provided stimulating and helpful comments. 
+ + +{title:Also see} + +{p 4 13 2}STB: STB-50 dm70 for {cmd:atan2()}, {cmd:pp()}, {cmd:rev()}, {cmd:rindex()}, {cmd:rmed()}, {cmd:rotate()} + +{p 4 13 2}Manual: [D] egen (before Stata 9 [R] egen) + +{p 4 13 2}On-line: help for +{help egen}, +{help dates}, +{help functions}, +{help means}, +{help numlist}, +{help seed}, +{help tsset}, +{help varlist} (timeseries operators), +{help circular} (if installed), +{help ntimeofday} (if installed), +{help stimeofday} (if installed) + diff --git a/01.code/ado/f/fasterxtile.ado b/01.code/ado/f/fasterxtile.ado new file mode 100755 index 0000000..5b4f883 --- /dev/null +++ b/01.code/ado/f/fasterxtile.ado @@ -0,0 +1,96 @@ +*! version 1.0.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! faster implementation of xtile and fastxtile using C for faster processing +*! (note: this is a wrapper for gquantiles) + +capture program drop fasterxtile +program define fasterxtile + version 13.1 + + if ( `=_N < 1' ) { // no observations in memory: stop before parsing anything + error 2000 + } + + _parsewt "aweight fweight pweight" `0' // split any weight clause off the command line before -syntax- sees it + local 0 `"`s(newcmd)'"' /* command minus weight statement */ + local wgt `"`s(weight)'"' /* contains [weight=exp] or nothing */ + + syntax newvarname =/exp /// newvar = exp + [if] [in] , /// [if condition] [in start / end] + [ /// + by(passthru) /// By variables: [+|-]varname [[+|-]varname ...] 
+ replace /// Replace newvar, if it exists + Nquantiles(str) /// Number of quantiles + Cutpoints(varname numeric) /// Use cutpoints instead of percentiles + ALTdef /// Alternative definition + /// + method(passthru) /// Quantile method: (1) qsort, (2) qselect + strict /// Exit if nquantiles > # non-missing obs + /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// print function benchmark info + BENCHmarklevel(passthru) /// print plugin benchmark info + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + /// + debug(passthru) /// + GROUPid(passthru) /// + tag(passthru) /// + counts(passthru) /// + fill(passthru) /// + ] + + if ( (`"`wgt'"' != "") & ("`altdef'" != "") ) { // test wgt (set from _parsewt above); -syntax- declares no weights, so local weight is never filled in + di in err "altdef option cannot be used with weights" + exit 198 + } + + if ( "`nquantiles'" != "" ) { + if ( "`cutpoints'" != "" ) { + di as err "both nquantiles() and cutpoints() " /// + "cannot be specified" + exit 198 + } + + if ( `nquantiles' < 2 ) { + di as err "nquantiles() must be greater than or " /// + "equal to 2" + exit 198 + } + + * if ( `nquantiles' > `=_N + 1' ) { + * di as err "nquantiles() must be less than or equal to " /// + * "number of observations plus one" + * exit 198 + * } + local nquantiles nquantiles(`nquantiles') // re-wrap the value as an option string to hand to gquantiles + } + else if ( "`cutpoints'" == "" ) { // neither nquantiles() nor cutpoints() given: default to 2 quantiles + local nquantiles nquantiles(2) + } + + if ( "`cutpoints'" != "" ) { + unab cutpoints: `cutpoints' // expand any variable-name abbreviation + local cutpoints cutpoints(`cutpoints') + } + + local opts `verbose' /// + `_ctolerance' /// + `benchmark' /// + `benchmarklevel' /// + `oncollision' /// + `hashmethod' /// + `compress' /// + `forcestrl' /// + `replace' /// + `groupid' /// + `debug' /// + `tag' /// + `counts' /// + 
`fill' + + local gqopts `nquantiles' `cutpoints' `altdef' `strict' `opts' `method' + gquantiles `varlist' = `exp' `if' `in' `wgt', xtile `gqopts' `by' // all the work is delegated to gquantiles in xtile mode +end diff --git a/01.code/ado/f/fasterxtile.sthlp b/01.code/ado/f/fasterxtile.sthlp new file mode 100755 index 0000000..5d20826 --- /dev/null +++ b/01.code/ado/f/fasterxtile.sthlp @@ -0,0 +1,472 @@ +{smcl} +{* *! version 1.0.2 23Jan2019}{...} +{viewerdialog gquantiles "dialog gquantiles"}{...} +{vieweralsosee "[R] gquantiles" "mansection R gquantiles"}{...} +{viewerjumpto "Syntax" "gquantiles##syntax"}{...} +{viewerjumpto "Description" "gquantiles##description"}{...} +{viewerjumpto "Options" "gquantiles##options"}{...} +{viewerjumpto "Stored results" "gegen##results"}{...} +{title:Title} + +{p2colset 5 19 23 2}{...} +{p2col :{cmd:gquantiles} {hline 2}}Efficiently compute percentiles (quantiles), categories, and frequencies.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{pstd} +gquantiles can function as a fast, by-able, alternative to {cmd:xtile}, +{cmd:pctile}, and {cmd:_pctile}, though it offers more functionality +than those Stata commands (e.g. this function accepts {opth by(varlist)} +with {cmd:xtile[()]} and {cmd:pctile[()]}, it can compute arbitrary +quantiles and an arbitrary number in a reasonable amount of time, +it computes frequencies, and more). 
+ +{phang} +Create variable containing percentiles (equivalent to {cmd:pctile}) + +{p 8 15 2} +{cmd:gquantiles} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +pctile +[{opth nquantiles(int)} +{opth genp(newvarname)} +{opt altdef}] + +{phang} +Create variable containing quantile categories (equivalent to {cmd:xtile}) + +{p 8 15 2} +{cmd:gquantiles} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +xtile +[{opth nquantiles(int)} +{opth cutpoints(varname)} +{opt altdef}] + +{p 8 15 2} +{cmd:fasterxtile} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +[{opth nquantiles(int)} +{opth cutpoints(varname)} +{opt altdef}] + +{phang} +Compute percentiles and store them in r() (equivalent to {cmd:_pctile}) + +{p 8 15 2} +{cmd:gquantiles} +{it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +_pctile +[{opth nquantiles(int)} +{opth percentiles(numlist)} +{opt altdef}] + +{pstd} +The full syntax, however, is + +{p 8 15 2} +{cmd:gquantiles} +[{newvar} {cmd:=}] {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +{c -(}{cmd:pctile}{c |}{cmd:xtile}{c |}{cmd:_pctile}{c )-} +{it:{help gquantiles##quantiles_method:quantiles_method}} +[{it:{help gquantiles##gquantiles_options:gquantiles_options}}] + +{synoptset 22 tabbed}{...} +{marker quantiles_method}{...} +{synopthdr} +{synoptline} +{syntab :Quantiles method (choose only one)} + +{synopt :{opt n:quantiles(#)}}number of quantiles; default is {cmd:nquantiles(2)} +{p_end} +{synopt :{opth p:ercentiles(numlist)}}calculate percentiles corresponding to the specified percentages +{p_end} +{synopt :{opth c:utpoints(varname)}}use values of {it:varname} as cutpoints +{p_end} +{synopt :{opth cutoffs(numlist)}}use values of {it:numlist} as cutpoints +{p_end} +{synopt :{opth cutquantiles(varname)}}calculate percentiles corresponding to the values of {it:varname} +{p_end} 
+{synopt :{opth quantmatrix(matrix)}}use values of {it:matrix} as quantiles +{p_end} +{synopt :{opth cutmatrix(matrix)}}use values of {it:matrix} as cutpoints +{p_end} + +{synoptset 18 tabbed}{...} +{marker gquantiles_options}{...} +{synopthdr} +{synoptline} +{syntab :Options} + +{synopt :{opth g:enp(newvar:newvarp)}}generate {it:newvarp} variable containing percentages +{p_end} +{synopt :{opt alt:def}}use alternative formula for calculating percentiles +{p_end} + +{syntab:Extras} +{synopt :{opth by(varlist)}}Compute quantiles by groups ({cmd:pctile} and {cmd:xtile} only). +{p_end} +{synopt :{opth groupid(varname)}}Store group ID in {it:varname}. +{p_end} +{synopt :{opt _pctile}}(Not with by.) Do the computation in the style of {cmd:_pctile} +{p_end} +{synopt :{cmd:pctile}[{cmd:(}{newvar}{cmd:)}]}Store percentiles in {it:newvar}. If {it:newvar} is not specified, then this indicates to do the computations in the style of {cmd:pctile}. +{p_end} +{synopt :{cmd:xtile}[{cmd:(}{newvar}{cmd:)}]}Store quantile categories in {it:newvar}. If {it:newvar} is not specified, then this indicates to do the computations in the style of {cmd:xtile}. +{p_end} +{synopt :{cmd:binfreq}[{cmd:(}{newvar}{cmd:)}]}Store the frequency counts of the source variable in the quantile categories in {it:newvar}. If {it:newvar} is not specified (not with by), this is stored in {hi:r(quantiles_bincount)} or {hi:r(cutoffs_bincount)} +{p_end} + +{syntab:Switches} +{synopt :{opt method(#)}}(Not with by.) Algorithm to use to compute quantiles. +{p_end} +{synopt :{opt dedup}}Drop duplicate values of variables specified via {opt cutpoints} or {opt cutquantiles} +{p_end} +{synopt :{opt cutifin}}Exclude values outside {ifin} of variables specified via {opt cutpoints} or {opt cutquantiles} +{p_end} +{synopt :{opt cutby}}Use {opt cutquantiles()} or {opt cutpoints()} by group. 
+{p_end} +{synopt :{opt returnlimit(#)}}Maximum return values that can be set via {opt _pctile} +{p_end} +{synopt :{opt strict}}Without by, exit with error when the number of quantiles requested exceeds the number of non-missing observations. With by, skip groups where this happens. +{p_end} +{synopt :{opt minmax}}(Not with by.) Additionally store the min and max in {hi:r(min)} and {hi:r(max)} +{p_end} +{synopt :{opt replace}}Replace targets, should they exist. +{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed (see +{manhelp weight U:11.1.6 weight}), except with option {opt altdef}, in +which case no weights are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gquantiles} replaces {cmd:xtile}, {cmd:pctile}, and {cmd:_pctile}. +gquantiles offers several additional options above the three built-in +Stata commands: an arbitrary number of quantiles, arbitrary cutoffs, +frequency counts of the xtile categories, computing {cmd:pctile} and +{cmd:xtile} at the same time, and so on. + +{pstd} +gquantiles is also faster than the user-written fastxtile, so an alias, +fasterxtile, is provided. + +{pstd} +{opt gquantiles} is part of the {manhelp gtools R:gtools} project. 
+ +{marker options}{...} +{title:Options} + +{dlgtab:Quantiles method} + +{phang} +{opt n:quantiles(#)} specifies the number of quantiles. +It computes percentiles corresponding to percentages 100*k/m +for k=1, 2, ..., m-1, where m={it:#}. For example, {cmd:nquantiles(10)} +requests that the 10th, 20th, ..., 90th percentiles be computed. The default +is {cmd:nquantiles(2)}; that is, the median is computed. + +{phang} +{opth p:ercentiles(numlist)} requests +percentiles corresponding to the specified percentages. For example, +{cmd:percentiles(10(20)90)} requests that the 10th, 30th, 50th, 70th, and 90th +percentiles be computed. With {opt _pctile} these are placed into {cmd:r(r1)}, +{cmd:r(r2)}, {cmd:r(r3)}, {cmd:r(r4)}, and {cmd:r(r5)} up to 1,001. With +{opt xtile} these are the quantiles that define the categories and with +{opt pctile} these are the quantiles to compute. + +{phang} +{opth c:utpoints(varname)} requests that the values of {it:varname} +be used to define the categories, rather than quantiles. This is natural +to use with {opt xtile}. With {opt pctile} and {opt _pctile} this is +redundant unless you also request {cmd:binfreq}[{cmd:(}{newvar}{cmd:)}]. +By default, all values of {it:varname} are used, regardless of any {opt if} +or {opt in} restriction. You can specify {opt cutifin} to obey the +restrictions and {opt dedup} to exclude duplicates. + +{phang} +{opth cutoffs(numlist)} Use values of {it:numlist} as cutpoints. + +{phang} +{opth cutquantiles(varname)} Calculate percentiles corresponding to the values of +{it:varname}. This is an alternative to {opt percentiles()}. + +{phang} +{opth quantmatrix(matrix)} +Requests percentiles (quantiles) corresponding to the entries of the +matrix. This must be a column vector or a row vector. The behavior of +gquantiles using this option is otherwise equivalent to its behavior +when passing {opt quantiles()}. + +{phang} +{opth cutmatrix(matrix)} +Requests cutoffs corresponding to the entries of the matrix. 
This must +be a column vector or a row vector. The behavior of gquantiles using +this option is otherwise equivalent to its behavior when passing +{opt cutoffs()}. + +{dlgtab:Standard Options} + +{phang}{opth genp(newvar)} +specifies a new variable to be generated +containing the percentages corresponding to the percentiles. + +{phang}{opt altdef} uses an alternative formula for calculating percentiles +(not with weights). +The default method is to invert the empirical distribution function by using +averages, where the function is flat (the default is the same method used by +{cmd:summarize}; see {manhelp summarize R}). +The alternative formula uses an interpolation method. See +{mansection D pctileMethodsandformulas:{it:Methods and formulas}} in +{bf:[D] pctile}. + +{dlgtab:Extras} + +{phang} +{opth by(varlist)} +Compute quantiles by group. {cmd:pctile[()]} requires option +{cmd:strict}, which has the effect of ignoring groups where the number +of quantiles requested is larger than the number of non-missing +observations within the group. {opt by()} is most useful with option +{opth groupid(varname)}. + +{phang} +{opth groupid(varname)} Store group ID in {it:varname}. This +is equivalent to {cmd:gegen, group} + +{phang} +{opt _pctile} (Not with by.) Do the computation in the style of {cmd:_pctile}. It +stores return values in r(r1), r(r2), and so on, as well as a matrix called +{hi:r(quantiles_used)} or {hi:r(cutoffs_used)} in case quantiles or cutoffs +are requested. This can be combined with other options listed in this section. + +{phang} +{cmd:pctile}[{cmd:(}{newvar}{cmd:)}] Store percentiles in {it:newvar}. If +{it:newvar} is not specified, then this indicates to do the computations in +the style of {cmd:pctile}. This can be combined with other options listed in +this section. + +{phang} +{cmd:xtile}[{cmd:(}{newvar}{cmd:)}] Store quantile categories in +{it:newvar}. 
If {it:newvar} is not specified, then this indicates to do the +computations in the style of {cmd:xtile}. This can be combined with other +options listed in this section. + +{phang} +{cmd:binfreq}[{cmd:(}{newvar}{cmd:)}] Store the frequency counts of +the source variable in the quantile categories in {it:newvar}. When +weights are specified, this stores the sum of the weights within +that category. If {it:newvar} is not specified, this is stored in +{hi:r(quantiles_bincount)} or {hi:r(cutoffs_bincount)}. This can be +combined with other options listed in this section. + +{dlgtab:Switches} + +{phang} +{opt method(#)} (Not with by.) Algorithm to use to compute quantiles. If you have many +duplicates or are computing many quantiles, you should specify {opt +method(1)}. If you have few duplicates or are computing few quantiles you +should specify {opt method(2)}. By default, {cmd:gquantiles} tries to guess +which method will run faster. + +{phang} +{opt dedup} Drop duplicate values of variables specified via {opt cutpoints()} +or {opt cutquantiles()}. For instance, if the user asks for +quantiles 1, 90, 10, 10, and 1, then quantiles 1, 1, 10, 10, and 90 are +used. With this option only 1, 10, and 90 would be used. + +{phang} +{opt cutifin} Exclude values outside {ifin} of variables specified via +{opt cutpoints()} or {opt cutquantiles()}. The restriction that all +values are used is artificial (the option was originally written to +allow {cmd:xtile} to use {cmd:pctile} internally). + +{phang} +{opt cutby} By default all values of the variable requested via {opt cutpoints()} +or {opt cutquantiles()} are used. With this option, each group uses a different +set of quantiles or cutoffs (note this automatically sets option {cmd:cutifin}) + +{phang} +{opt returnlimit(#)} Maximum return values that can be set via {opt _pctile}. +Since {cmd:gquantiles} can compute a large number of quantiles very quickly, +the function allows the user to request an arbitrary number. 
But setting +1,000s of return values is computationally infeasible. Consider {opt pctile} +in this case. + +{phang} +{opt strict} Without {opt by()}, exit with error if the number of quantiles +is greater than the number of non-missing observations plus one. With +{opt by()}, skip groups where this happens. This restriction for {opt pctile} +is sensible, but for {opt xtile} it is artificial. It exists because it uses +{opt pctile} internally, but {cmd:gquantiles} does not have this issue. + +{phang} +{opt minmax} (Not with by.) Additionally store the min and max in {hi:r(min)} and {hi:r(max)} + +{phang} +{opt replace} Replace targets, should they exist. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. 
{opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gquantiles/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gquantiles} stores the following in {cmd:r()}: + +{synoptset 22 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }}Number of observations {p_end} +{synopt:{cmd:r(min) }}Min (only if minmax was requested) {p_end} +{synopt:{cmd:r(max) }}Max (only if minmax was requested) {p_end} +{synopt:{cmd:r(nqused) }}Number of quantiles/cutoffs {p_end} +{synopt:{cmd:r(method_ratio)}}Rule used to decide between methods 1 and 2{p_end} + +{synopt:{cmd:r(nquantiles) }}Number of quantiles (only w nquantiles()) {p_end} +{synopt:{cmd:r(ncutpoints) }}Number of cutpoints (only w cutpoints()) {p_end} +{synopt:{cmd:r(nquantiles_used)}}Number of quantiles (only w quantiles()) {p_end} +{synopt:{cmd:r(nquantpoints) }}Number of quantiles (only w cutquantiles()){p_end} +{synopt:{cmd:r(ncutoffs_used) }}Number of cutoffs (only w cutoffs()) {p_end} + +{synopt:{cmd:r(r#)}}The #th quantile requested (only w _pctile){p_end} +{p2colreset}{...} + +{synoptset 22 tabbed}{...} +{p2col 5 15 19 2: Macros}{p_end} +{synopt:{cmd:r(quantiles)}}Quantiles used (only w percentiles() or quantiles()){p_end} +{synopt:{cmd:r(cutoffs) }}Cutoffs used (only w option cutoffs()) {p_end} +{p2colreset}{...} + +{synoptset 22 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(quantiles_used) }}With _pctile or with quantiles() {p_end} +{synopt:{cmd:r(quantiles_binfreq)}}With option binfreq and any quantiles requested{p_end} + 
+{synopt:{cmd:r(cutoffs_used) }}With _pctile or with cutoffs() {p_end} +{synopt:{cmd:r(cutoffs_binfreq)}}With option binfreq and any cutoffs requested{p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gquantiles} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:pctile} +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help pctile}, +{help gtools}; +{help fastxtile} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/gcollapse.ado b/01.code/ado/g/gcollapse.ado new file mode 100755 index 0000000..0dc09c7 --- /dev/null +++ b/01.code/ado/g/gcollapse.ado @@ -0,0 +1,2013 @@ +*! version 1.3.1 03Nov2021 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
-collapse- implementation using C for faster processing + +capture program drop gcollapse +program gcollapse, rclass + version 13.1 + global GTOOLS_USER_VARABBREV `c(varabbrev)' + local 00 `0' + + * Grab some free timers + FreeTimer + local t97: copy local FreeTimer + global GTOOLS_T97: copy local t97 + gtools_timer on `t97' + + FreeTimer + local t96: copy local FreeTimer + if ( `t96' == 0 ) { + disp as txt "(note: at least one timer required; overriding timer 96)" + local t96 96 + } + global GTOOLS_T96: copy local t96 + gtools_timer on `t96' + + global GTOOLS_CALLER gcollapse + syntax [anything(equalok)] /// Main function call: + /// [(stat)] varlist [ [(stat)] ... ] + /// [(stat)] target = source [target = source ...] [ [(stat)] ...] + [if] [in] /// [if condition] [in start / end] + [aw fw iw pw] , /// [weight type = exp] + [ /// + by(str) /// Collapse by variabes: [+|-]varname [[+|-]varname ...] + cw /// Drop ocase-wise bservations where sources are missing. + fast /// Do not preserve and restore the original dataset. Saves speed + /// but leaves data unusable if the user hits Break. 
+ /// + merge /// Merge statistics back to original data, replacing if applicable + replace /// Allow replacing existing variables with output with merge + noinit /// Do not initialize targets with missing values + freq(passthru) /// Include frequency count with observations per group + /// + LABELFormat(passthru) /// Custom label engine: (#stat#) #sourcelabel# is the default + LABELProgram(passthru) /// Program to parse labelformat (see examples) + /// + missing /// Preserve missing values for sums + rawstat(passthru) /// Ignore weights for selected variables + /// + /// + recast /// Recast source variables to save memory + WILDparse /// parse assuming wildcard renaming + unsorted /// Do not sort the data; faster + forceio /// Use disk temp drive for writing/reading collapsed data + forcemem /// Use memory for writing/reading collapsed data + double /// Generate all targets as doubles + sumcheck /// Check whether sum will overflow + NODS DS /// Parse - as varlist (ds) or negative (nods) + /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _subtract /// (Undocumented) Subtract result from source variable + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// print function benchmark info + BENCHmarklevel(int 0) /// print plugin benchmark info + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + /// + debug /// (internal) Debug + DEBUG_level(int 0) /// (internal) Debug (passed to internals) + debug_replaceby /// (internal) Allow replacing by variables with output + debug_io_read(int 1) /// (internal) Read IO data using mata or C + debug_io_check(real 1e6) /// (internal) Threshold to check for I/O speed gains + debug_io_threshold(real 10) /// (internal) Threshold to switch to I/O 
instead of RAM + ] + + * Pre-option parsing + * ------------------ + + if ( "`debug'" != "" ) local debug_level 9 + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + if ( "`missing'" != "" ) { + local keepmissing keepmissing + disp "Option -missing- is deprecated. Use (nansum) or (rawnansum) instead." + } + + local replaceby = cond("`debug_replaceby'" == "", "", "replaceby") + local gfallbackok = `"`replaceby'`replace'`init'`freq'`merge'`labelformat'`labelprogram'`rawstat'"' == `""' + + if ( ("`ds'" != "") & ("`nods'" != "") ) { + di as err "-ds- and -nods- mutually exclusive" + exit 198 + } + + * Parse by call (make sure varlist is valid) + * ------------------------------------------ + + if ( `"`by'"' != "" ) { + local clean_by: copy local by + local clean_by: subinstr local clean_by "+" " ", all + if ( strpos(`"`clean_by'"', "-") & ("`ds'`nods'" == "") ) { + disp as txt "'-' interpreted as negative; use option -ds- to interpret as varlist" + disp as txt "(to suppress this warning, use option -nods-)" + } + if ( "`ds'" != "" ) { + local clean_by `clean_by' + if ( "`clean_by'" == "" ) { + di as err "Invalid varlist: `by'" + clean_all 198 + exit 198 + } + cap ds `clean_by' + if ( _rc ) { + cap noi ds `clean_by' + local rc = _rc + clean_all `rc' + exit `rc' + } + local clean_by `r(varlist)' + } + else { + local clean_by: subinstr local clean_by "-" " ", all + local clean_by `clean_by' + if ( "`clean_by'" == "" ) { + di as err "Invalid list: `by'" + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + CleanExit + exit 198 + } + cap ds `clean_by' + if ( _rc ) { + local notfound + foreach var of local clean_by { + cap ds `var' + if ( _rc ) { + local notfound `notfound' `var' + } + } + if ( `:list sizeof notfound' > 0 ) { + if ( `:list sizeof notfound' > 1 ) { + di as err "Variables not found: `notfound'" + } + else { + di as err "Variable `notfound' not found" + } + } + CleanExit + exit 111 + } + qui ds 
`clean_by' + local clean_by `r(varlist)' + } + } + if ( "`ds'" == "" ) local nods nods + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "Running {cmd:gcollapse} with debug level `debug_level'" + disp as txt "{hline 72}" + disp as txt `""' + disp as txt `" anything: `anything'"' + disp as txt `" [if] [in]: `if' `in'"' + disp as txt `""' + disp as txt `" by: `by'"' + disp as txt `" cw: `cw'"' + disp as txt `" fast: `fast'"' + disp as txt `""' + disp as txt `" merge: `merge'"' + disp as txt `" replace: `replace'"' + disp as txt `" compress: `compress'"' + disp as txt `" forcestrl: `forcestrl'"' + disp as txt `" freq: `freq'"' + disp as txt `" labelformat: `labelformat'"' + disp as txt `" labelprogram: `labelprogram'"' + disp as txt `" unsorted: `unsorted'"' + disp as txt `" forceio: `forceio'"' + disp as txt `" forcemem: `forcemem'"' + disp as txt `" double: `double'"' + disp as txt `""' + disp as txt `" verbose: `verbose'"' + disp as txt `" benchmark: `benchmark'"' + disp as txt `" benchmarklevel: `benchmarklevel'"' + disp as txt `" hashmethod: `hashmethod'"' + disp as txt `" oncollision: `oncollision'"' + disp as txt `""' + disp as txt `" debug_replaceby: `debug_replaceby'"' + disp as txt `" debug_io_read: `debug_io_read'"' + disp as txt `" debug_io_check: `debug_io_check'"' + disp as txt `" debug_io_threshold: `debug_io_threshold'"' + disp as txt "{hline 72}" + disp as txt `""' + } + + * Parse options + * ------------- + + if ( ("`forceio'" != "") & ("`merge'" != "") ) { + di as err "{opt merge} with {opt forceio} is" /// + " inefficient and hence not allowed." + CleanExit + exit 198 + } + + if ( ("`forceio'" != "") & ("`forcemem'" != "") ) { + di as err "only specify one of {opt forceio} and {opt forcemem};" /// + " cannot do both at the same time." 
+ CleanExit + exit 198 + } + + local verb = ( "`verbose'" != "" ) + local bench = ( "`benchmark'" != "" ) + + if ( "`fast'" == "" ) preserve + + * Parse collapse statement to get sources, targets, and stats + * ----------------------------------------------------------- + + cap noi parse_vars `anything' `if' `in', /// + `labelformat' `labelprogram' `freq' `wildparse' + + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + local __gtools_gc_uniq_targets : list uniq __gtools_gc_targets + local nonunique: list __gtools_gc_targets - __gtools_gc_uniq_targets + if ( `:list sizeof nonunique' != 0 ) { + di as err "Repeat targets not allowed: `:list uniq nonunique'" + CleanExit + exit 198 + } + + foreach var of local __gtools_gc_uniq_vars { + cap noi confirm numeric variable `var' + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:gcollapse} debug level `debug_level'" + disp as txt "{hline 72}" + disp as txt `"parse_vars"' + disp as txt `" anything: `anything'"' + disp as txt `" [if] [in]: `if' `in'"' + disp as txt `""' + disp as txt `" cw: `cw'"' + disp as txt `" fast: `fast'"' + disp as txt `""' + disp as txt `" freq: `freq'"' + disp as txt `" labelformat: `labelformat'"' + disp as txt `" labelprogram: `labelprogram'"' + disp as txt `""' + disp as txt " __gtools_gc_targets: `__gtools_gc_targets'" + disp as txt " __gtools_gc_vars: `__gtools_gc_vars'" + disp as txt " __gtools_gc_stats: `__gtools_gc_stats'" + disp as txt " __gtools_gc_uniq_vars: `__gtools_gc_uniq_vars'" + disp as txt " __gtools_gc_uniq_stats: `__gtools_gc_uniq_stats'" + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + } + + * Parse weights + * ------------- + + if ( `:list posof "count" in __gtools_gc_uniq_stats' > 0 ) { + if ( `"`weight'"' == "aweight" ) { + local awnote 1 + } + else local awnote 0 + } + else if ( `:list posof "nmissing" in __gtools_gc_uniq_stats' > 0 ) { + if ( `"`weight'"' == "aweight" 
) { + local awnote 1 + } + else local awnote 0 + } + else local awnote 0 + + if ( `:list posof "variance" in __gtools_gc_uniq_stats' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "variance not allowed with pweights" + exit 135 + } + } + if ( `:list posof "cv" in __gtools_gc_uniq_stats' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "cv not allowed with pweights" + exit 135 + } + } + if ( `:list posof "sd" in __gtools_gc_uniq_stats' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "sd not allowed with pweights" + exit 135 + } + } + if ( `:list posof "semean" in __gtools_gc_uniq_stats' > 0 ) { + if ( inlist(`"`weight'"', "pweight", "iweight") ) { + di as err "semean not allowed with `weight's" + exit 135 + } + } + if ( `:list posof "sebinomial" in __gtools_gc_uniq_stats' > 0 ) { + if ( inlist(`"`weight'"', "aweight", "iweight", "pweight") ) { + di as err "sebinomial not allowed with `weight's" + exit 135 + } + } + if ( `:list posof "sepoisson" in __gtools_gc_uniq_stats' > 0 ) { + if ( inlist(`"`weight'"', "aweight", "iweight", "pweight") ) { + di as err "sepoisson not allowed with `weight's" + exit 135 + } + } + if ( regexm("^select", `"`__gtools_gc_uniq_stats'"') ) { + if ( inlist(`"`weight'"', "iweight") ) { + di as err "select not allowed with `weight's" + exit 135 + } + } + + if ( `"`weight'"' != "" ) { + tempvar w + qui gen double `w' `exp' `if' `in' + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + } + else local weights + + * Subset if requested + * ------------------- + + if ( (`"`if'`wgt'"' != `""') | ("`cw'" != "") ) { + * marksample touse, strok novarlist + tempvar touse + mark `touse' `if' `in' `wgt' + if ( "`cw'" != "" ) { + markout `touse' `__gtools_gc_uniq_vars', strok + } + if ( "`merge'" == "" ) { + qui keep if `touse' + local ifin "" + } + else local ifin if `touse' `in' + } + else { + local ifin `in' + } + + if ( `=_N' == 0 ) { + di as err "no observations" + CleanExit + exit 2000 + } + + * Parse 
variables to keep, drop, rename, recast + * --------------------------------------------- + + * Parse variables to keep (by variables, sources) and drop (all else). + * Also parse which source variables to recast (see below; we try to use + * source variables as their first target to save memory) + + set varabbrev off + cap noi parse_keep_drop, by(`clean_by') `double' /// + `merge' `replace' `replaceby' `sumcheck' `weights' /// + __gtools_gc_targets(`__gtools_gc_targets') /// + __gtools_gc_vars(`__gtools_gc_vars') /// + __gtools_gc_stats(`__gtools_gc_stats') /// + __gtools_gc_uniq_vars(`__gtools_gc_uniq_vars') /// + __gtools_gc_uniq_stats(`__gtools_gc_uniq_stats') + + set varabbrev ${GTOOLS_USER_VARABBREV} + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + local dropme "" + local keepvars "`r(keepvars)' `w'" + local added "`r(added)'" + local memvars "`r(memvars)'" + local check_recast "`r(check_recast)'" + + scalar __gtools_gc_k_targets = `:list sizeof __gtools_gc_targets' + scalar __gtools_gc_k_vars = `:list sizeof __gtools_gc_vars' + scalar __gtools_gc_k_stats = `:list sizeof __gtools_gc_stats' + scalar __gtools_gc_k_uniq_vars = `:list sizeof __gtools_gc_uniq_vars' + scalar __gtools_gc_k_uniq_stats = `:list sizeof __gtools_gc_uniq_stats' + + mata: gtools_vars = tokens(`"`__gtools_gc_vars'"') + mata: gtools_targets = tokens(`"`__gtools_gc_targets'"') + mata: gtools_stats = tokens(`"`__gtools_gc_stats'"') + + cap noi CheckMatsize `clean_by' + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + cap noi CheckMatsize `__gtools_gc_vars' + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + cap noi CheckMatsize `__gtools_gc_targets' + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + cap noi CheckMatsize `__gtools_gc_stats' + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:gcollapse} debug level `debug_level'" + disp as txt "{hline 72}" + disp as txt 
`"parse_keep_drop"' + disp as txt `""' + disp as txt `" by: `by'"' + disp as txt `" clean_by: `clean_by'"' + disp as txt `""' + disp as txt `" merge: `merge'"' + disp as txt `" double: `double'"' + disp as txt `" replace: `replace'"' + disp as txt `" compress: `compress'"' + disp as txt `" forcestrl: `forcestrl'"' + disp as txt `" replaceby: `replaceby'"' + disp as txt `""' + disp as txt `" __gtools_gc_targets: `__gtools_gc_targets'"' + disp as txt `" __gtools_gc_vars: `__gtools_gc_vars'"' + disp as txt `" __gtools_gc_stats: `__gtools_gc_stats'"' + disp as txt `" __gtools_gc_uniq_vars: `__gtools_gc_uniq_vars'"' + disp as txt `" __gtools_gc_uniq_stats: `__gtools_gc_uniq_stats'"' + disp as txt `""' + disp as txt `" dropme: `dropme'"' + disp as txt `" keepvars: `keepvars'"' + disp as txt `" added: `added'"' + disp as txt `" memvars: `memvars'"' + disp as txt `" check_recast: `check_recast'"' + disp as txt `""' + disp as txt `" scalar __gtools_gc_k_targets = `=scalar(__gtools_gc_k_targets)'"' + disp as txt `" scalar __gtools_gc_k_vars = `=scalar(__gtools_gc_k_vars)'"' + disp as txt `" scalar __gtools_gc_k_stats = `=scalar(__gtools_gc_k_stats)'"' + disp as txt `" scalar __gtools_gc_k_uniq_vars = `=scalar(__gtools_gc_k_uniq_vars)'"' + disp as txt `" scalar __gtools_gc_k_uniq_stats = `=scalar(__gtools_gc_k_uniq_stats)'"' + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + } + + * Timers! 
+ * ------- + + local msg "Parsed by variables, sources, and targets" + gtools_timer info `t97' `"`msg'"', prints(`bench') + + *********************************************************************** + * Recast variables to save memory * + *********************************************************************** + + * Recast sources, if applicable + mata: st_numscalar("__gtools_gc_k_recast", cols(__gtools_gc_recastvars)) + if ( `=scalar(__gtools_gc_k_recast)' > 0 ) { + local gtools_recastvars "" + local gtools_recastsrc "" + forvalues k = 1 / `=scalar(__gtools_gc_k_recast)' { + mata: st_local("var", __gtools_gc_recastvars[`k']) + tempvar dropvar + rename `var' `dropvar' + local dropme `dropme' `dropvar' + local gtools_recastvars `gtools_recastvars' `var' + local gtools_recastsrc `gtools_recastsrc' `dropvar' + } + + qui mata: st_addvar(__gtools_gc_recasttypes, __gtools_gc_recastvars, 1) + if ( `=_N > 0' ) { + cap noi _gtools_internal, /// + recast(targets(`gtools_recastvars') sources(`gtools_recastsrc')) + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + } + + local msg `"Recast source variables to save memory"' + gtools_timer info `t97' `"`msg'"', prints(`bench') + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:gcollapse} debug level `debug_level'" + disp as txt "{hline 72}" + disp as txt `"recast"' + disp as txt `""' + disp as txt `" gtools_recastvars `gtools_recastvars'"' + disp as txt `" gtools_recastsrc `gtools_recastsrc'"' + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + } + + *********************************************************************** + * Reorder * + *********************************************************************** + + local _: list memvars - __gtools_gc_uniq_vars + local memorder: list memvars - _ + + mata: gtools_vars_mem = tokens("`memorder'") + mata: gtools_pos = gtools_vars :== gtools_targets + mata: gtools_io_order = selectindex(gtools_pos), selectindex(!gtools_pos) + + * First, make sure 
that the sources used as targets appear first + mata: gtools_vars = gtools_vars [gtools_io_order] + mata: gtools_targets = gtools_targets [gtools_io_order] + mata: gtools_stats = gtools_stats [gtools_io_order] + mata: __gtools_gc_labels = __gtools_gc_labels [gtools_io_order] + mata: __gtools_gc_formats = __gtools_gc_formats [gtools_io_order] + + * Now make sure that the sources are in memory order + tempname k1 k2 ord + mata: `k1' = cols(gtools_vars_mem) + mata: `k2' = cols(gtools_vars) + mata: `ord' = gtools_vars[1::`k1'] + mata: gtools_mem_order = J(1, 0, .) + mata: for(k = 1; k <= `k1'; k++) gtools_mem_order = gtools_mem_order, selectindex(gtools_vars_mem[k] :== `ord') + mata: gtools_mem_order = (`k2' > `k1')? gtools_mem_order, ((`k1' + 1)::`k2')': gtools_mem_order + cap mata: mata drop `k' + cap mata: mata drop `ord' + + mata: gtools_vars = gtools_vars [gtools_mem_order] + mata: gtools_targets = gtools_targets [gtools_mem_order] + mata: gtools_stats = gtools_stats [gtools_mem_order] + mata: __gtools_gc_labels = __gtools_gc_labels [gtools_mem_order] + mata: __gtools_gc_formats = __gtools_gc_formats [gtools_mem_order] + + * At each step we reordered stats, soruces, and targets! 
+ local __gtools_gc_order `__gtools_gc_targets' + local __gtools_gc_vars "" + local __gtools_gc_targets "" + local __gtools_gc_stats "" + forvalues k = 1 / `=scalar(__gtools_gc_k_targets)' { + mata: st_local("var", gtools_vars [`k']) + mata: st_local("targ", gtools_targets[`k']) + mata: st_local("stat", gtools_stats [`k']) + local __gtools_gc_vars `__gtools_gc_vars' `var' + local __gtools_gc_targets `__gtools_gc_targets' `targ' + local __gtools_gc_stats `__gtools_gc_stats' `stat' + } + local __gtools_gc_uniq_stats: list uniq __gtools_gc_stats + local __gtools_gc_uniq_vars: list uniq __gtools_gc_vars + + *********************************************************************** + * I/O switch * + *********************************************************************** + + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile __gtools_gc_file + } + else { + GcollapseTempFile __gtools_gc_file + } + scalar __gtools_gc_k_extra = __gtools_gc_k_targets - __gtools_gc_k_uniq_vars + + local sources sources(`__gtools_gc_vars') + local stats stats(`__gtools_gc_stats') + local targets targets(`__gtools_gc_targets') + local opts missing replace `init' `keepmissing' `compress' `forcestrl' `_subtract' `_ctolerance' + local opts `opts' `verbose' `benchmark' `benchmarklevel' `hashmethod' `ds' `nods' + local opts `opts' `oncollision' debug(`debug_level') `rawstat' + local action `sources' `targets' `stats' + + local switch = (`=scalar(__gtools_gc_k_extra)' > 3) & (`debug_io_check' < `=_N') + local mem = ("`forcemem'" != "") /// + | ("`merge'" != "") /// + | (`=scalar(__gtools_gc_k_extra)' == 0) + local io = ("`forceio'" != "") & (`=scalar(__gtools_gc_k_extra)' > 0) + + if ( `io' ) { + * Drop rest of vars + local plugvars `clean_by' `__gtools_gc_uniq_vars' + local dropme `dropme' `:list memvars - keepvars' + local dropme `:list dropme - plugvars' + if ( "`dropme'" != "" ) mata: st_dropvar(tokens(`"`dropme'"')) + + local gcollapse gcollapse(forceio, fname(`__gtools_gc_file')) + local action 
`action' fill(data) `unsorted' + } + else if ( !`mem' & `switch' ) { + + * Replace source vars in memory, since they already exist + local plugvars `clean_by' `__gtools_gc_uniq_vars' + + * It will be faster to add targets with fewer variables in + * memory. Dropping superfluous variables also saves memory. + local dropme `dropme' `:list memvars - keepvars' + local dropme `:list dropme - plugvars' + + * Drop extra vars + if ( "`dropme'" != "" ) mata: st_dropvar(tokens(`"`dropme'"')) + local msg `"Dropped superfluous variables"' + gtools_timer info `t97' `"`msg'"', prints(`bench') + + * Benchmark adding 2 variables to gauge how long it might take to + * add __gtools_gc_k_extra targets. + tempvar __gtools_gc_index __gtools_gc_ix __gtools_gc_info + cap noi benchmark_memvars, /// + index(`__gtools_gc_index') /// + ix(`__gtools_gc_ix') /// + info(`__gtools_gc_info') + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + local st_time = `r(st_time)' + gtools_timer info `t97' `"`r(st_str)'"', prints(`bench') + + if ( `st_time' > 0 ) { + * Call the plugin with switch option + * ---------------------------------- + + local st_time st_time(`=`st_time' / `debug_io_threshold'') + local ixinfo ixinfo(`__gtools_gc_index' `__gtools_gc_ix' `__gtools_gc_info') + local gcollapse gcollapse(switch, `st_time' fname(`__gtools_gc_file') `ixinfo') + local action `action' fill(data) `unsorted' + } + else { + + * If benchmark was 0, add the vars right now + * ------------------------------------------ + + qui mata: st_addvar(__gtools_gc_addtypes, __gtools_gc_addvars, 1) + local msg "Generated additional targets" + gtools_timer info `t97' `"`msg'"', prints(`bench') + + local gcollapse gcollapse(memory) + local action `action' fill(data) `unsorted' + } + } + else { + + local plugvars `clean_by' `__gtools_gc_uniq_vars' + if ( "`merge'" == "" ) local dropme `dropme' `:list memvars - keepvars' + local dropme `:list dropme - plugvars' + + if ( "`dropme'" != "" ) mata: 
st_dropvar(tokens(`"`dropme'"')) + local msg `"Dropped superfluous variables"' + gtools_timer info `t97' `"`msg'"', prints(`bench') + + if ( ("`forceio'" == "forceio") & (`=scalar(__gtools_gc_k_extra)' == 0) ) { + if ( `verb' ) { + di as text "(ignored -forceio- because sources are being used as targets)" + } + } + + if ( "`added'" != "" ) { + qui mata: st_addvar(__gtools_gc_addtypes, __gtools_gc_addvars, 1) + } + local msg "Generated additional targets" + gtools_timer info `t97' `"`msg'"', prints(`bench') + + local gcollapse gcollapse(memory, `merge') + local action `action' `:di cond("`merge'" == "", "fill(data)", "unsorted")' + } + + if ( `debug_level' ) { + disp as txt `""' + disp as txt "{cmd:gcollapse} debug level `debug_level'" + disp as txt "{hline 72}" + disp as txt `"recast"' + disp as txt `""' + disp as txt `" scalar __gtools_gc_k_extra = `=scalar(__gtools_gc_k_extra)'"' + disp as txt `""' + disp as txt `" plugvars: `plugvars'"' + disp as txt `" dropme: `dropme'"' + disp as txt `" memvars: `memvars'"' + disp as txt `""' + disp as txt `" sources: `sources'"' + disp as txt `" stats: `stats'"' + disp as txt `" targets: `targets'"' + disp as txt `" unsorted: `unsorted'"' + disp as txt `" opts: `opts'"' + disp as txt `""' + disp as txt `" switch: `switch'"' + disp as txt `" mem: `mem'"' + disp as txt `" io: `io'"' + disp as txt `""' + disp as txt `" gtools_stats: `gtools_stats'"' + disp as txt `""' + disp as txt `" action: `action'"' + disp as txt `" gcollapse: `gcollapse'"' + disp as txt `""' + disp as txt "{hline 72}" + disp as txt `""' + + disp `"_gtools_internal `by' `ifin', `opts' `weights' `action' `gcollapse' gfunction(collapse)"' + } + + local msg `"Ready for plugin execution"' + gtools_timer info `t97' `"`msg'"', prints(`bench') + + cap noi _gtools_internal `by' `ifin', `opts' `weights' `action' `gcollapse' gfunction(collapse) + if ( _rc == 17999 ) { + if ( "`gfallbackok'" != "" ) { + di as err "Cannot use fallback with gtools-only options" + exit 
17000 + } + local 0 `00' + syntax [anything(equalok)] [if] [in] , [ by(passthru) cw fast *] + collapse `anything' `if' `in', `by' `cw' `fast' + exit 0 + } + else if ( _rc == 17001 ) { + local rc = _rc + CleanExit + error 2000 + } + else if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + local used_io = `r(used_io)' + local r_N = `r(N)' + local r_J = `r(J)' + local r_minJ = `r(minJ)' + local r_maxJ = `r(maxJ)' + matrix __gtools_invert = r(invert) + + * Return values + * ------------- + + return scalar N = `r_N' + return scalar J = `r_J' + return scalar minJ = `r_minJ' + return scalar maxJ = `r_maxJ' + + *********************************************************************** + * Finish * + *********************************************************************** + + gtools_timer on `t97' + if ( "`merge'" == "" ) { + + * Keep only the collapsed data + * ---------------------------- + + qui { + if ( `=`r_J' > 0' ) keep in 1 / `:di %32.0f `r_J'' + else if ( `=`r_J' == 0' ) { + keep in 1 + drop in 1 + } + else if ( `=`r_J' < 0' ) { + di as err "The plugin returned a negative number of groups." + di as err `"This is a bug. 
Please report to {browse "`website_url'":`website_disp'}"' + CleanExit + exit 17200 + } + ds * + } + if ( `=_N' == 0 ) di as txt "(no observations)" + + * Make sure no extra variables are present + * ---------------------------------------- + + local memvars `r(varlist)' + local keepvars `clean_by' `__gtools_gc_targets' + local dropme `:list memvars - keepvars' + if ( "`dropme'" != "" ) mata: st_dropvar(tokens(`"`dropme'"')) + + * If we collapsed to disk, read back the data + * ------------------------------------------- + + local ifcond (`=_N > 0') /// + & (`=scalar(__gtools_gc_k_extra)' > 0) /// + & ( `used_io' | ("`forceio'" == "forceio") ) + if ( `ifcond' ) { + qui mata: st_addvar(__gtools_gc_addtypes, __gtools_gc_addvars, 1) + gtools_timer info `t97' `"Added extra targets after collapse"', prints(`bench') + + local __gtools_gc_iovars: list __gtools_gc_targets - __gtools_gc_uniq_vars + local gcollapse gcollapse(read, fname(`__gtools_gc_file')) + if ( `debug_io_read' ) { + cap noi _gtools_internal, `gcollapse' `action' gfunction(collapse) + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + } + else { + local nrow = `=_N' + local ncol = `=scalar(__gtools_gc_k_extra)' + mata: __gtools_gc_data = gtools_get_collapsed (`"`__gtools_gc_file'"', `nrow', `ncol') + mata: st_store(., tokens(`"`__gtools_gc_iovars'"'), __gtools_gc_data) + cap mata: mata drop __gtools_gc_data + } + + gtools_timer info `t97' `"Read extra targets from disk"', prints(`bench') + } + + * Order variables if they are not in user-requested order + * ------------------------------------------------------- + + local order = 0 + qui ds * + local varorder `r(varlist)' + local varsort `clean_by' `__gtools_gc_order' + foreach varo in `varorder' { + gettoken svar varsort: varsort + if ("`varo'" != "`vars'") local order = 1 + } + if ( `order' ) order `clean_by' `__gtools_gc_order' + + * Label the things in the style of collapse + * ----------------------------------------- + + forvalues k = 1 / 
`:list sizeof __gtools_gc_targets' { + mata: st_varlabel(gtools_targets[`k'], __gtools_gc_labels[`k']) + mata: GtoolsFormatDefaultFallback(gtools_targets[`k'], __gtools_gc_formats[`k']) + } + } + else { + forvalues k = 1 / `:list sizeof __gtools_gc_targets' { + mata: st_varlabel(gtools_targets[`k'], __gtools_gc_labels[`k']) + mata: GtoolsFormatDefaultFallback(gtools_targets[`k'], __gtools_gc_formats[`k']) + } + } + + *********************************************************************** + * Program Exit * + *********************************************************************** + + if ( ("`unsorted'" == "") & ("`merge'" == "") ) { + mata: st_local("invert", strofreal(sum(st_matrix("__gtools_invert")))) + if ( `invert' ) { + mata: st_numscalar("__gtools_first_inverted", /// + selectindex(st_matrix("__gtools_invert"))[1]) + if ( `=scalar(__gtools_first_inverted)' > 1 ) { + local sortvars "" + forvalues i = 1 / `=scalar(__gtools_first_inverted) - 1' { + local sortvars `sortvars' `:word `i' of `clean_by'' + } + sort `sortvars' + } + } + else if ( "`clean_by'" != "" ) { + sort `clean_by' + } + } + + if ( "`fast'" == "" ) restore, not + + local msg "Program exit executed" + gtools_timer info `t97' `"`msg'"', prints(`bench') off + + if ( `awnote' ) { + di as txt "(note: {bf:aweight}s not used to compute {bf:count}s or {bf:nmissing})" + } + + CleanExit + exit 0 +end + +*********************************************************************** +* Generic helpers * +*********************************************************************** + +* gtools_timer <start|on|info> <timer> [<msg>] [, prints(#) end off] +* Thin wrapper around -timer-. -start-/-on- clear and start the timer; +* -info- stops it, returns r(t#) and r(pretty#), prints "<msg>; <secs> seconds" +* when prints() is nonzero, then clears and restarts it; -end-/-off- stop +* and clear the timer. +capture program drop gtools_timer +program gtools_timer, rclass + syntax anything, [prints(int 0) end off] + tokenize `"`anything'"' + local what `1' + local timer `2' + local msg `"`3'; "' + + * If timer is 0, then there were no free timers; skip this benchmark + if ( `timer' == 0 ) exit 0 + + if ( inlist("`what'", "start", "on") ) { + cap timer off `timer' + cap timer clear `timer' + timer on `timer' + } + else if ( inlist("`what'", 
"info") ) { + * Stop the timer so -timer list- reports the elapsed time in r(t#) + timer off `timer' + qui timer list + return scalar t`timer' = `r(t`timer')' + return local pretty`timer' = trim("`:di %21.4gc r(t`timer')'") + if ( `prints' ) di `"`msg'`:di trim("`:di %21.4gc r(t`timer')'")' seconds"' + * Reset and restart so the next -info- call measures from this point + timer off `timer' + timer clear `timer' + timer on `timer' + } + + * -end- or -off-: stop and clear the timer + if ( "`end'`off'" != "" ) { + timer off `timer' + timer clear `timer' + } +end + +* FreeTimer: run -timer list- and scan i = 99 downwards, stopping at the +* first i whose r(t`i') is empty (or at 0 when none is). The result is +* stored in the caller's local FreeTimer via c_local. +capture program drop FreeTimer +program FreeTimer + qui { + timer list + local i = 99 + while ( (`i' > 0) & ("`r(t`i')'" != "") ) { + local --i + } + } + c_local FreeTimer `i' +end + +*********************************************************************** +* Gcollapse helpers * +*********************************************************************** + +* gtools_get_collapsed(fname, nrow, ncol): open fname read-only, read an +* nrow x ncol real matrix with fbufget() in "%8z" format, close the file, +* and return the matrix. +cap mata: mata drop gtools_get_collapsed() +mata +real matrix function gtools_get_collapsed( + string scalar fname, + real scalar nrow, + real scalar ncol) +{ + real scalar fh + real matrix X + colvector C + fh = fopen(fname, "r") + C = bufio() + X = fbufget(C, fh, "%8z", nrow, ncol) + fclose(fh) + return (X) +} +end + +capture program drop parse_vars +program parse_vars + syntax [anything(equalok)] /// + [if] [in] , /// subset + [ /// + WILDparse /// parse assuming wildcard renaming + freq(str) /// include number of observations in group + labelformat(str) /// label prefix + labelprogram(str) /// label prefix + ] + + * Parse gcollapse call into list of sources, targets, stats + * --------------------------------------------------------- + + if ( "`anything'" == "" ) { + di as err "invalid syntax" + exit 198 + } + else { + if ( "`wildparse'" != "" ) { + local rc = 0 + ParseListWild `anything', loc(__gtools_gc_call) + + local __gtools_bak_stats `__gtools_gc_stats' + local __gtools_bak_vars `__gtools_gc_vars' + local __gtools_bak_targets `__gtools_gc_targets' + local __gtools_bak_uniq_stats `__gtools_gc_uniq_stats' + local __gtools_bak_uniq_vars `__gtools_gc_uniq_vars' + + ParseList `__gtools_gc_call' + + cap assert ("`__gtools_gc_stats'" == 
"`__gtools_bak_stats'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gc_vars'" == "`__gtools_bak_vars'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gc_targets'" == "`__gtools_bak_targets'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gc_uniq_stats'" == "`__gtools_bak_uniq_stats'") + local rc = max(_rc, `rc') + + cap assert ("`__gtools_gc_uniq_vars'" == "`__gtools_bak_uniq_vars'") + local rc = max(_rc, `rc') + + if ( `rc' ) { + disp as error "Wild parsing inconsistent with standard parsing." + exit 198 + } + } + else { + ParseList `anything' + } + } + + if ( "`freq'" != "" ) { + local __gtools_gc_targets `__gtools_gc_targets' `freq' + local __gtools_gc_stats `__gtools_gc_stats' freq + local __gtools_gc_vars `__gtools_gc_vars' `:word 1 of `__gtools_gc_vars'' + } + + * Get format and labels from sources + * ---------------------------------- + + if ( "`labelformat'" == "") local labelformat "(#stat#) #sourcelabel#" + local lnice_regex "(.*)(#stat:pretty#)(.*)" + local lpre_regex "(.*)(#stat#)(.*)" + local lPre_regex "(.*)(#Stat#)(.*)" + local lPRE_regex "(.*)(#STAT#)(.*)" + local ltxt_regex "(.*)(#sourcelabel#)(.*)" + local lsub_regex "(.*)#sourcelabel:([0-9]+):([.0-9]+)#(.*)" + + mata: __gtools_gc_formats = J(1, `:list sizeof __gtools_gc_targets', "") + mata: __gtools_gc_labels = J(1, `:list sizeof __gtools_gc_targets', "") + forvalues k = 1 / `:list sizeof __gtools_gc_targets' { + local vl = `"`:variable label `:word `k' of `__gtools_gc_vars'''"' + local vl = cond(`"`vl'"' == "", `"`:word `k' of `__gtools_gc_vars''"', `"`vl'"') + local vp = `"`:word `k' of `__gtools_gc_stats''"' + + if ( "`labelprogram'" == "" ) GtoolsPrettyStat `vp' + else `labelprogram' `vp' + local vpretty = `"`r(prettystat)'"' + + if ( `"`vpretty'"' == "#default#" ) { + GtoolsPrettyStat `vp' + local vpretty = `"`r(prettystat)'"' + } + + local lfmt_k = `"`labelformat'"' + + if ( "`vp'" == "freq" ) { + if !regexm(`"`vl'"', "`ltxt_regex'") { + while 
regexm(`"`lfmt_k'"', "`ltxt_regex'") { + local lfmt_k = regexs(1) + `""' + regexs(3) + } + } + if !regexm(`"`vl'"', "`lsub_regex'") { + while regexm(`"`lfmt_k'"', "`lsub_regex'") { + local lfmt_k = regexs(1) + `""' + regexs(4) + } + } + } + else { + if !regexm(`"`vl'"', "`ltxt_regex'") { + while regexm(`"`lfmt_k'"', "`ltxt_regex'") { + local lfmt_k = regexs(1) + `"`vl'"' + regexs(3) + } + } + if !regexm(`"`vl'"', "`lsub_regex'") { + while regexm(`"`lfmt_k'"', "`lsub_regex'") { + local lfmt_k = regexs(1) + substr(`"`vl'"', `:di regexs(2)', `:di regexs(3)') + regexs(4) + } + } + } + + if !regexm(`"`vpretty'"', "`lnice_regex'") { + while regexm(`"`lfmt_k'"', "`lnice_regex'") { + local lfmt_k = regexs(1) + `"`vpretty'"' + regexs(3) + } + } + if !regexm(`"`vp'"', "`lpre_regex'") { + while regexm(`"`lfmt_k'"', "`lpre_regex'") { + local lfmt_k = regexs(1) + `"`vp'"' + regexs(3) + } + } + if !regexm(`"`vp'"', "`lPre_regex'") { + while regexm(`"`lfmt_k'"', "`lPre_regex'") { + local lfmt_k = regexs(1) + proper(`"`vp'"') + regexs(3) + } + } + if !regexm(`"`vp'"', "`lPRE_regex'") { + while regexm(`"`lfmt_k'"', "`lPRE_regex'") { + local lfmt_k = regexs(1) + upper(`"`vp'"') + regexs(3) + } + } + mata: __gtools_gc_labels[`k'] = `"`lfmt_k'"' + + local vf = "`:format `:word `k' of `__gtools_gc_vars'''" + local vf = cond(inlist("`:word `k' of `__gtools_gc_stats''", "count", "freq", "nunique", "nmissing"), "%8.0g", "`vf'") + mata: __gtools_gc_formats[`k'] = "`vf'" + } + + * Available Stats + * --------------- + + local stats sum /// + nansum /// if every entry is missing, output . 
instead of 0 + mean /// + geomean /// + sd /// + variance /// + cv /// + max /// + min /// + range /// + count /// + median /// + iqr /// + percent /// + first /// + last /// + firstnm /// + lastnm /// + semean /// + sebinomial /// + sepoisson /// + nunique /// + nmissing /// + skewness /// + kurtosis /// + gini /// + gini|dropneg /// + gini|keepneg /// + rawsum /// + rawnansum // if every entry is missing, output . instead of 0 + + * Parse quantiles + * Every requested stat that is not in the list above must be a selection + * (select#, select-#, rawselect#, rawselect-#) or a percentile (p##); + * anything else exits with error 110. + local anyquant = 0 + local quantiles : list __gtools_gc_uniq_stats - stats + + foreach quantile of local quantiles { + if regexm("`quantile'", "rawselect") { + local select = regexm("`quantile'", "^rawselect(-|)([0-9]+)$") + if ( `select' == 0 ) { + di as error "Invalid stat: (`quantile'; did you mean rawselect# or rawselect-#?)" + error 110 + } + else if ( `=regexs(2)' == 0 ) { + di as error "Invalid stat: (`quantile' not allowed; selection must be 1 or larger)" + error 110 + } + } + else if regexm("`quantile'", "select") { + local select = regexm("`quantile'", "^select(-|)([0-9]+)$") + if ( `select' == 0 ) { + di as error "Invalid stat: (`quantile'; did you mean select# or select-#?)" + error 110 + } + else if ( `=regexs(2)' == 0 ) { + di as error "Invalid stat: (`quantile' not allowed; selection must be 1 or larger)" + error 110 + } + } + else { + * Check p0 and p100 before the generic pattern: the regex below + * only admits one or two integer digits, so p100 would otherwise + * be rejected with the generic message and never reach its hint. + if ("`quantile'" == "p0") { + di as error "Invalid stat: (`quantile'; maybe you meant 'min'?)" + error 110 + } + if ("`quantile'" == "p100") { + di as error "Invalid stat: (`quantile'; maybe you meant 'max'?)" + error 110 + } + local quantbad = !regexm("`quantile'", "^p([0-9][0-9]?(\.[0-9]+)?)$") + if ( `quantbad' ) { + di as error "Invalid stat: (`quantile')" + error 110 + } + } + } + + if ( "`freq'" != "" ) { + local __gtools_gc_uniq_stats `__gtools_gc_uniq_stats' freq + } + + * Locals one level up + * ------------------- + + * unab __gtools_gc_targets: `__gtools_gc_targets' + unab __gtools_gc_vars: `__gtools_gc_vars' + unab __gtools_gc_uniq_vars: 
`__gtools_gc_uniq_vars' + + c_local __gtools_gc_targets `__gtools_gc_targets' + c_local __gtools_gc_vars `__gtools_gc_vars' + c_local __gtools_gc_stats `__gtools_gc_stats' + c_local __gtools_gc_uniq_vars `__gtools_gc_uniq_vars' + c_local __gtools_gc_uniq_stats `__gtools_gc_uniq_stats' +end + +capture program drop parse_keep_drop +program parse_keep_drop, rclass + syntax, /// + [ /// + weights(str) /// + replace /// + replaceby /// + merge /// + double /// + sumcheck /// + by(varlist) /// + __gtools_gc_targets(str) /// + __gtools_gc_vars(str) /// + __gtools_gc_stats(str) /// + __gtools_gc_uniq_vars(str) /// + __gtools_gc_uniq_stats(str) /// + ] + + * The code assumes targets either do not exist or are named the same as + * the source variable. If a target exists in memory but is not one of the + * sources, rename the target to a dummy + + local __gtools_gc_i = 0 + if ( "`merge'" == "" ) { + foreach var in `__gtools_gc_targets' { + cap confirm variable `var' + if ( (_rc == 0) & !`:list var in __gtools_gc_vars' ) { + cap confirm variable __gtools_gc`__gtools_gc_i' + while ( _rc == 0 ) { + local ++__gtools_gc_i + cap confirm variable __gtools_gc`__gtools_gc_i' + } + rename `var' __gtools_gc`__gtools_gc_i' + } + } + } + + * Try to be smart about creating target variables + * ----------------------------------------------- + + local __gtools_gc_keepvars `__gtools_gc_uniq_vars' + + * If not merging, then be smart about creating new variable columns + if ( "`merge'" == "" ) { + scalar __gtools_gc_merge = 0 + + local __gtools_gc_vars " `__gtools_gc_vars' " + local __gtools_gc_uniq_vars " `__gtools_gc_uniq_vars' " + local __gtools_gc_keepvars " `__gtools_gc_keepvars' " + + local __gtools_gc_vars: subinstr local __gtools_gc_vars " " " ", all + local __gtools_gc_uniq_vars: subinstr local __gtools_gc_uniq_vars " " " ", all + local __gtools_gc_keepvars: subinstr local __gtools_gc_keepvars " " " ", all + + local by: subinstr local by " " " ", all + local K: list sizeof 
__gtools_gc_targets + forvalues k = 1 / `K' { + unab memvars : _all + + local k_target: word `k' of `__gtools_gc_targets' + local k_var: word `k' of `__gtools_gc_vars' + local k_stat: word `k' of `__gtools_gc_stats' + + * Only use as target if the type matches + * parse_ok_astarget, sourcevar(`k_var') targetvar(`k_target') stat(`k_stat') `double' + * if ( `:list k_var in __gtools_gc_uniq_vars' & `r(ok_astarget)' ) { + + * Always try to use as target; will recast if necessary + if ( `:list k_var in __gtools_gc_uniq_vars' ) { + local __gtools_gc_uniq_vars: list __gtools_gc_uniq_vars - k_var + if ( !`:list k_var in __gtools_gc_targets' & !`:list k_target in memvars' & !`:list k_var in by' ) { + * local by " `by' " + * local by: subinstr local by " `k_var' " " `k_target' ", all + * local by `by' + + local __gtools_gc_vars " `__gtools_gc_vars' " + local __gtools_gc_uniq_vars " `__gtools_gc_uniq_vars' " + local __gtools_gc_keepvars " `__gtools_gc_keepvars' " + local __gtools_gc_vars: subinstr local __gtools_gc_vars " `k_var' " " `k_target' ", all + local __gtools_gc_uniq_vars: subinstr local __gtools_gc_uniq_vars " `k_var' " " `k_target' ", all + local __gtools_gc_keepvars: subinstr local __gtools_gc_keepvars " `k_var' " " `k_target' ", all + local __gtools_gc_vars `__gtools_gc_vars' + local __gtools_gc_uniq_vars `__gtools_gc_uniq_vars' + local __gtools_gc_keepvars `__gtools_gc_keepvars' + + rename `k_var' `k_target' + } + } + } + + local by " `by' " + local by: subinstr local by " " " ", all + local by `by' + + local __gtools_gc_vars " `__gtools_gc_vars' " + local __gtools_gc_uniq_vars " `__gtools_gc_uniq_vars' " + local __gtools_gc_keepvars " `__gtools_gc_keepvars' " + local __gtools_gc_vars: subinstr local __gtools_gc_vars " " " ", all + local __gtools_gc_uniq_vars: subinstr local __gtools_gc_uniq_vars " " " ", all + local __gtools_gc_keepvars: subinstr local __gtools_gc_keepvars " " " ", all + local __gtools_gc_vars `__gtools_gc_vars' + local __gtools_gc_uniq_vars 
`__gtools_gc_uniq_vars' + local __gtools_gc_keepvars `__gtools_gc_keepvars' + + local keepvars `by' `__gtools_gc_keepvars' + } + else { + scalar __gtools_gc_merge = 1 + if ( "`replace'" == "" ) { + local intersection: list __gtools_gc_targets & __gtools_gc_vars + if ( "`intersection'" != "" ) { + di as error "merge targets also sources with no replace: `intersection'" + error 110 + } + + unab memvars: _all + local intersection: list memvars - __gtools_gc_vars + local intersection: list intersection - by + local intersection: list __gtools_gc_targets & intersection + if ( "`intersection'" != "" ) { + di as error "merge targets exist with no replace: `intersection'" + error 110 + } + } + } + + local intersection: list __gtools_gc_targets & by + if ( "`intersection'" != "" ) { + if ( "`replaceby'" == "" ) { + di as error "targets also in by(): `intersection'" + error 110 + } + } + + * Variables in memory; will compare to keepvars + * --------------------------------------------- + + * Unfortunately, this is necessary for C. We cannot create variables + * from C, and we cannot halt the C execution, create the final data + * in Stata, and then go back to C. + + unab memvars : _all + local added "" + + mata: __gtools_gc_addvars = J(1, 0, "") + mata: __gtools_gc_addtypes = J(1, 0, "") + mata: __gtools_gc_recastvars = J(1, 0, "") + mata: __gtools_gc_recasttypes = J(1, 0, "") + + c_local __gtools_gc_vars `__gtools_gc_vars' + c_local __gtools_gc_uniq_vars `__gtools_gc_keepvars' + + * If any of the other requested stats are not counts, freq, nunique, + * or nmissing, upgrade! Otherwise you'll get the wrong result. 
+ + * __gtools_upgrade_vars collects every source that feeds at least one + * statistic other than freq, count, nunique, or nmissing. + local __gtools_upgrade + local __gtools_upgrade_vars + local __gtools_upgrade_list freq count nunique nmissing + forvalues i = 1 / `:list sizeof __gtools_gc_targets' { + local src: word `i' of `__gtools_gc_vars' + local stat: word `i' of `__gtools_gc_stats' + if ( !`:list stat in __gtools_upgrade_list' ) { + local __gtools_upgrade_vars `__gtools_upgrade_vars' `src' + } + } + local __gtools_upgrade_vars: list uniq __gtools_upgrade_vars + + * If requested, check whether sum will overflow. Assign smallest + * possible type given sum. + + local __gtools_sumok + local __gtools_sumcheck + * Only done with no weights or with fweights, and only for sources + * stored as byte, int, or long. + gettoken wtype wvar: weights + if ( ("`sumcheck'" != "") & inlist(`"`wtype'"', "fweight", "") ) { + foreach var in `: list uniq __gtools_gc_vars' { + if ( inlist("`:type `var''", "byte", "int", "long") ) { + local __gtools_sumcheck `__gtools_sumcheck' `var' + } + } + + if ( `:list sizeof __gtools_sumcheck' > 0 ) { + cap noi _gtools_internal, sumcheck(`__gtools_sumcheck') weights(`weights') + if ( _rc ) { + local rc = _rc + CleanExit + exit `rc' + } + + * One entry of r(sumcheck) is read per checked variable, in the + * same order as __gtools_sumcheck; __gtools_sumok gets the + * matching type for each. + * NOTE(review): only upper bounds are compared below; presumably + * r(sumcheck) is already safe for negative sums -- confirm + * against the plugin. + matrix __gtools_sumcheck = r(sumcheck) + forvalues i = 1 / `:list sizeof __gtools_sumcheck' { + local s = __gtools_sumcheck[1, `i'] + if ( `s' > maxlong() ) { + local __gtools_sumok `__gtools_sumok' double + if ( mi(`s') ) { + disp as err "{bf:Overflow warning:} (sum) `:word `i' of `__gtools_sumcheck''" + } + } + else if ( `s' > maxint() ) { + local __gtools_sumok `__gtools_sumok' long + } + else if ( `s' > maxbyte() ) { + local __gtools_sumok `__gtools_sumok' int + } + else { + local __gtools_sumok `__gtools_sumok' byte + } + } + } + } + + * Loop through all the targets to determine which type is most + * appropriate. Also check whether we can use the source variable for the + * first target; if not, we will recast the source variable. 
+ + local check_recast "" + foreach var of local __gtools_gc_targets { + gettoken sourcevar __gtools_gc_vars: __gtools_gc_vars + gettoken collstat __gtools_gc_stats: __gtools_gc_stats + local upgrade = `:list sourcevar in __gtools_upgrade_vars' + local sumtype = "double" + + * I try to match Stata's types when possible + if regexm("`collstat'", "first|last|min|max|select|rawselect") { + * First, last, min, max, and select can preserve type, clearly + local targettype: type `sourcevar' + } + else if regexm("`collstat'", "range") { + * Upgrade type by one + local targettype: type `sourcevar' + if ( `"`targettype'"' == "byte" ) { + local targettype int + } + else if ( `"`targettype'"' == "int" ) { + local targettype long + } + else if ( `"`targettype'"' == "long" ) { + local targettype double + } + else if ( `"`targettype'"' == "float" ) { + local targettype double + } + else if ( `"`targettype'"' == "double" ) { + local targettype double + } + } + else if ( inlist("`collstat'", "freq", "nunique") & ( `=_N < maxlong()' ) ) { + * freqs can be long if we have fewer than 2^31 observations + * (largest signed integer in long variables can be 2^31-1) + local targettype = cond(`upgrade', "double", "long") + } + else if ( inlist("`collstat'", "freq", "nunique") & !( `=_N < maxlong()' ) ) { + local targettype double + } + else if ( "`double'" != "" ) { + local targettype double + } + else if ( inlist("`collstat'", "count", "nmissing") & (`=_N < maxlong()') & (`"`weights'"' == "") ) { + * Counts can be long if we have fewer than 2^31 observations + * (largest signed integer in long variables can be 2^31-1). + * With weights, however, count is sum w_i, so the rules are + * as with sums in that case. 
+ local targettype = cond(`upgrade', "double", "long") + } + else if ( inlist("`collstat'", "count", "nmissing") & !((`=_N < maxlong()') & (`"`weights'"' == "")) ) { + local targettype double + } + else if ( inlist("`collstat'", "sum", "nansum", "rawsum", "rawnansum") ) { + * Sums are double so we don't overflow; however, if the + * user requested sumcheck we assign byte, int, and long the + * smallest possible type. + local targettype double + if ( `:list sourcevar in __gtools_sumcheck' ) { + local pos: list posof "`sourcevar'" in __gtools_sumcheck + local targettype: word `pos' of `__gtools_sumok' + } + local sumtype: copy local targettype + } + else if ( "`:type `sourcevar''" == "long" ) { + * Some operations on long variables with target float can be + * inaccurate + local targettype double + } + else if inlist("`:type `sourcevar''", "double") { + * If variable is double, then keep that type + local targettype double + } + else { + * Otherwise, store results in specified user-default type + local targettype `c(type)' + } + + * Create target variables as applicable. If it's the first instance, + * we use it to store the first summary statistic requested for that + * variable and recast as applicable. + + cap confirm variable `var' + if ( _rc ) { + mata: __gtools_gc_addvars = __gtools_gc_addvars, "`var'" + mata: __gtools_gc_addtypes = __gtools_gc_addtypes, "`targettype'" + local added `added' `var' + } + else { + * We only recast integers. Floats and doubles are preserved unless + * requested or the target is a sum. 
+ parse_ok_astarget, /// + sourcevar(`var') /// + targetvar(`var') /// + stat(`collstat') /// + sumtype(`sumtype') /// + `double' weights(`weights') + local recast = !(`r(ok_astarget)') + + if ( `recast' ) { + mata: __gtools_gc_recastvars = __gtools_gc_recastvars, "`var'" + mata: __gtools_gc_recasttypes = __gtools_gc_recasttypes, "`targettype'" + } + } + } + + return local keepvars = "`keepvars'" + return local added = "`added'" + return local memvars = "`memvars'" + return local check_recast = "`check_recast'" +end + +capture program drop parse_ok_astarget +program parse_ok_astarget, rclass + syntax, sourcevar(varlist) targetvar(str) stat(str) sumtype(str) [double weights(str)] + local ok_astarget = 0 + local sourcetype = "`:type `sourcevar''" + + * I try to match Stata's types when possible + if regexm("`stat'", "first|last|min|max|select|rawselect") { + * First, last, min, max, and select can preserve type, clearly + local targettype `sourcetype' + local ok_astarget = 1 + } + else if regexm("`stat'", "range") { + * Upgrade type by one + local ok_astarget = 0 + if ( `"`sourcetype'"' == "byte" ) { + local targettype int + } + else if ( `"`sourcetype'"' == "int" ) { + local targettype long + } + else if ( `"`sourcetype'"' == "long" ) { + local targettype double + } + else if ( `"`sourcetype'"' == "float" ) { + local targettype double + } + else if ( `"`sourcetype'"' == "double" ) { + local targettype double + local ok_astarget = 1 + } + } + else if ( "`double'" != "" ) { + local targettype double + local ok_astarget = ("`:type `sourcevar''" == "double") + } + else if ( inlist("`stat'", "freq", "nunique") & ( `=_N < maxlong()' ) ) { + local targettype long + local ok_astarget = inlist("`:type `sourcevar''", "long", "double") + } + else if ( inlist("`stat'", "freq", "nunique") & !( `=_N < maxlong()' ) ) { + local targettype double + } + else if ( inlist("`stat'", "count", "nmissing") & (`=_N < maxlong()') & (`"`weights'"' == "") ) { + local targettype long + local 
ok_astarget = inlist("`:type `sourcevar''", "long", "double") + } + else if ( inlist("`stat'", "count", "nmissing") & !((`=_N < maxlong()') & (`"`weights'"' == "")) ) { + local targettype double + } + else if ( inlist("`stat'", "sum", "nansum", "rawsum", "rawnansum") ) { + * Sums are double so we don't overflow; however, if the + * user requested sumcheck we assign byte, int, and long the + * smallest possible type. + local targettype double + local ok_astarget = ("`:type `sourcevar''" == "double") + if ( !`ok_astarget' & ("`sumtype'" != "double") & ("`:type `sourcevar''" != "float") ) { + if ( ("`:type `sourcevar''" == "long") & inlist("`sumtype'", "byte", "int", "long") ) { + local ok_astarget = 1 + } + else if ( ("`:type `sourcevar''" == "int") & inlist("`sumtype'", "byte", "int") ) { + local ok_astarget = 1 + } + else if ( ("`:type `sourcevar''" == "byte") & inlist("`sumtype'", "byte") ) { + local ok_astarget = 1 + } + else { + local ok_astarget = 0 + } + } + } + else if ( "`:type `sourcevar''" == "long" ) { + * Some operations on long variables with target float can be + * inaccurate + local targettype double + } + else if inlist("`:type `sourcevar''", "double") { + local targettype double + local ok_astarget = 1 + } + else { + * Otherwise, store results in specified user-default type + local targettype `c(type)' + if ( "`targettype'" == "float" ) { + local ok_astarget = inlist("`:type `sourcevar''", "float", "double") + } + else { + local ok_astarget = inlist("`:type `sourcevar''", "double") + } + } + return local ok_astarget = `ok_astarget' +end + +capture program drop benchmark_memvars +program benchmark_memvars, rclass + syntax, index(str) ix(str) info(str) + if ( `=_N < maxlong()' ) { + local itype long + local factor = 2 / 3 + local bytes = 12 + } + else { + local itype double + local factor = 1 / 3 + local bytes = 24 + } + + { + cap timer off ${GTOOLS_T96} + cap timer clear ${GTOOLS_T96} + timer on ${GTOOLS_T96} + } + qui mata: st_addvar(("`itype'"), 
("`index'"), 1) + { + cap timer off ${GTOOLS_T96} + qui timer list + local total_time = r(t${GTOOLS_T96}) + cap timer clear ${GTOOLS_T96} + timer on ${GTOOLS_T96} + } + qui mata: st_addvar(("`itype'"), ("`ix'"), 1) + { + cap timer off ${GTOOLS_T96} + qui timer list + local total_time = `total_time' + r(t${GTOOLS_T96}) + cap timer clear ${GTOOLS_T96} + timer on ${GTOOLS_T96} + } + + qui mata: st_addvar(("`itype'"), ("`info'"), 1) + { + cap timer off ${GTOOLS_T96} + qui timer list + local total_time = `total_time' + r(t${GTOOLS_T96}) + cap timer clear ${GTOOLS_T96} + } + + local mib = `=_N * 8 / 1024 / 1024' + local mib_str = trim("`:di %15.2gc 2 * `mib''") + local n_str = trim("`:di %15.0gc `=_N''") + return local st_str = `"Added index and info (`n_str' obs; approx `mib_str'MiB)"' + return local st_time = max(`total_time', 0.001) * scalar(__gtools_gc_k_extra) * `factor' + * return local st_time = `total_time' * scalar(__gtools_gc_k_extra) * `factor' +end + +capture program drop CleanExit +program CleanExit + foreach f of global GTOOLS_TEMPFILES_GCOLLAPSE { + cap erase `"${GTOOLS_TEMPDIR}/`f'"' + } + global GTOOLS_TEMPFILES_GCOLLAPSE + global GTOOLS_TEMPFILES_GCOLLAPSE_I + + set varabbrev ${GTOOLS_USER_VARABBREV} + global GTOOLS_USER_VARABBREV + global GTOOLS_CALLER + + cap mata: mata drop __gtools_gc_formats + cap mata: mata drop __gtools_gc_labels + + cap mata: mata drop __gtools_gc_addvars + cap mata: mata drop __gtools_gc_addtypes + cap mata: mata drop __gtools_gc_recastvars + cap mata: mata drop __gtools_gc_recasttypes + + cap mata: mata drop gtools_vars + cap mata: mata drop gtools_targets + cap mata: mata drop gtools_stats + + cap mata: mata drop gtools_pos + cap mata: mata drop gtools_vars_mem + cap mata: mata drop gtools_io_order + cap mata: mata drop gtools_mem_order + + cap mata: mata drop __gtools_gc_asfloat + cap mata: mata drop __gtools_gc_checkrecast + cap mata: mata drop __gtools_gc_norecast + cap mata: mata drop __gtools_gc_keeprecast + + cap mata: 
mata drop __gtools_gc_iovars + + cap scalar drop __gtools_gc_k_recast + cap scalar drop __gtools_gc_merge + + cap scalar drop __gtools_gc_k_extra + cap scalar drop __gtools_gc_k_targets + cap scalar drop __gtools_gc_k_vars + cap scalar drop __gtools_gc_k_stats + cap scalar drop __gtools_gc_k_uniq_vars + cap scalar drop __gtools_gc_k_uniq_stats + + cap scalar drop __gtools_first_inverted + cap matrix drop __gtools_invert + + cap timer off $GTOOLS_T97 + cap timer clear $GTOOLS_T97 + + cap timer off $GTOOLS_T96 + cap timer clear $GTOOLS_T96 + + global GTOOLS_T97 + global GTOOLS_T96 +end + +capture program drop GcollapseTempFile +program GcollapseTempFile + if ( `"${GTOOLS_TEMPFILES_GCOLLAPSE_I}"' == "" ) { + local GTOOLS_TEMPFILES_GCOLLAPSE_I = 1 + global GTOOLS_TEMPFILES_GCOLLAPSE_I = 1 + } + else { + local GTOOLS_TEMPFILES_GCOLLAPSE_I = ${GTOOLS_TEMPFILES_GCOLLAPSE_I} + 1 + global GTOOLS_TEMPFILES_GCOLLAPSE_I = ${GTOOLS_TEMPFILES_GCOLLAPSE_I} + 1 + } + local f ${GTOOLS_TEMPDIR}/__gtools_tmpfile_gcollapse_`GTOOLS_TEMPFILES_GCOLLAPSE_I' + global GTOOLS_TEMPFILES_GCOLLAPSE ${GTOOLS_TEMPFILES_GCOLLAPSE} __gtools_tmpfile_gcollapse_`GTOOLS_TEMPFILES_GCOLLAPSE_I' + c_local `0': copy local f +end + +capture program drop CheckMatsize +program CheckMatsize + syntax [anything], [nvars(int 0)] + if ( `nvars' == 0 ) local nvars `:list sizeof anything' + if ( `nvars' > `c(matsize)' ) { + cap set matsize `=`nvars'' + if ( _rc ) { + di as err _n(1) "{bf:# variables > matsize (`nvars' > `c(matsize)'). Tried to run}" + di _n(1) " {stata set matsize `=`nvars''}" + di _n(1) "{bf:but the command failed. 
Try setting matsize manually.}" + exit 908 + } + } +end + +* GtoolsPrettyStat: translate the statistic name given as the program argument +* into a display label returned in r(prettystat); names that are not matched +* below return an empty label. select#, select-#, rawselect#, rawselect-#, and +* p# are rendered as ordinals (1st, 2nd, 3rd, 11th, ...). +capture program drop GtoolsPrettyStat +program GtoolsPrettyStat, rclass + if ( `"`0'"' == "sum" ) local prettystat "Sum" + if ( `"`0'"' == "nansum" ) local prettystat "Sum" + if ( `"`0'"' == "mean" ) local prettystat "Mean" + if ( `"`0'"' == "geomean" ) local prettystat "Geometric mean" + if ( `"`0'"' == "sd" ) local prettystat "St Dev." + if ( `"`0'"' == "variance" ) local prettystat "Variance" + if ( `"`0'"' == "cv" ) local prettystat "Coef. of variation" + if ( `"`0'"' == "max" ) local prettystat "Max" + if ( `"`0'"' == "min" ) local prettystat "Min" + if ( `"`0'"' == "range" ) local prettystat "Range" + if ( `"`0'"' == "count" ) local prettystat "Count" + if ( `"`0'"' == "freq" ) local prettystat "Group size" + if ( `"`0'"' == "percent" ) local prettystat "Percent" + if ( `"`0'"' == "median" ) local prettystat "Median" + if ( `"`0'"' == "iqr" ) local prettystat "IQR" + if ( `"`0'"' == "first" ) local prettystat "First" + if ( `"`0'"' == "firstnm" ) local prettystat "First Non-Miss." + if ( `"`0'"' == "last" ) local prettystat "Last" + if ( `"`0'"' == "lastnm" ) local prettystat "Last Non-Miss." 
+ if ( `"`0'"' == "semean" ) local prettystat "SE Mean" + if ( `"`0'"' == "sebinomial" ) local prettystat "SE Mean (Binom)" + if ( `"`0'"' == "sepoisson" ) local prettystat "SE Mean (Pois)" + if ( `"`0'"' == "nunique" ) local prettystat "N Unique" + if ( `"`0'"' == "nmissing" ) local prettystat "N Missing" + if ( `"`0'"' == "skewness" ) local prettystat "Skewness" + if ( `"`0'"' == "kurtosis" ) local prettystat "Kurtosis" + if ( `"`0'"' == "rawsum" ) local prettystat "Unweighted sum" + if ( `"`0'"' == "rawnansum" ) local prettystat "Unweighted sum" + if ( `"`0'"' == "gini" ) local prettystat "Gini Coefficient" + if ( `"`0'"' == "gini|dropneg" ) local prettystat "Gini Coefficient (drop neg)" + if ( `"`0'"' == "gini|keepneg" ) local prettystat "Gini Coefficient (keep neg)" + + * Ordinal-style statistics: capture the number in p and the noun in Pretty + local match = 0 + if regexm(`"`0'"', "^rawselect(-|)([0-9]+)$") { + if ( `"`:di regexs(1)'"' == "-" ) { + local Pretty Largest (Unweighted) + } + else { + local Pretty Smallest (Unweighted) + } + local p = `=regexs(2)' + local match = 1 + } + else if regexm(`"`0'"', "^select(-|)([0-9]+)$") { + if ( `"`:di regexs(1)'"' == "-" ) { + local Pretty Largest + } + else { + local Pretty Smallest + } + local p = `=regexs(2)' + local match = 1 + } + else if regexm(`"`0'"', "^p([0-9][0-9]?(\.[0-9]+)?)$") { + local p = `:di regexs(1)' + local Pretty Pctile + local match = 1 + } + + if ( `match' ) { + if ( inlist(substr(`"`p'"', -2, 2), "11", "12", "13") ) { + * Numbers ending in 11, 12, 13 take th rather than st, nd, rd + local prettystat "`p'th `Pretty'" + } + else { + if ( mod(`p', 10) == 1 ) local prettystat "`p'st `Pretty'" + else if ( mod(`p', 10) == 2 ) local prettystat "`p'nd `Pretty'" + else if ( mod(`p', 10) == 3 ) local prettystat "`p'rd `Pretty'" + else local prettystat "`p'th `Pretty'" + } + } + + return local prettystat = `"`prettystat'"' +end + +*********************************************************************** +* Parse assuming the call includes wildcard renaming * +*********************************************************************** + +capture
program drop ParseListWild +program ParseListWild + * Parse a collapse call that may use rename-style wildcards in targets. + * Results go back to the caller via c_local: the rebuilt call (in the local + * named by option local, default gcollapse_call) plus __gtools_gc_targets, + * __gtools_gc_stats, __gtools_gc_vars and their uniq lists. + syntax [anything(equalok)], [LOCal(str)] + local stat mean + + if ( "`local'" == "" ) local local gcollapse_call + + * Trim spaces (collapse runs of blanks into a single blank) + local 0 `anything' + while strpos("`0'", "  ") { + local 0: subinstr local 0 "  " " ", all + } + local 0 `0' + + * Parse each portion of the collapse call + while (trim("`0'") != "") { + GetStat stat 0 : `0' + GetTarget target 0 : `0' + gettoken vars 0 : 0 + + * Must specify stat (if blank, we do the mean) + if ( "`stat'" == "" ) { + disp as err "option stat() required" + exit 198 + } + + if ( `"`stat'"' == "var" ) local stat variance + if ( `"`stat'"' == "sem" ) local stat semean + if ( `"`stat'"' == "seb" ) local stat sebinomial + if ( `"`stat'"' == "sep" ) local stat sepoisson + if ( `"`stat'"' == "skew" ) local stat skewness + if ( `"`stat'"' == "kurt" ) local stat kurtosis + if ( regexm(`"`stat'"', " ") ) local stat: subinstr local stat " " "|", all + + * Parse bulk rename if applicable + unab usources : `vars' + if ( "`eqsign'" == "=" ) { + cap noi rename `vars' `target' + if ( _rc ) { + disp as err "Targets cannot exist with option {opt wildparse}." 
+ exit `=_rc' + } + unab utargets : `target' + rename (`utargets') (`usources') + + local full_vars `full_vars' `usources' + local full_targets `full_targets' `utargets' + + local call `call' (`stat') + foreach svar of varlist `usources' { + gettoken tvar utargets: utargets + local call `call' `tvar' = `svar' + local full_stats `full_stats' `stat' + } + } + else { + local call `call' (`stat') `usources' + local full_vars `full_vars' `usources' + local full_targets `full_targets' `usources' + + foreach svar of varlist `usources' { + local full_stats `full_stats' `stat' + } + } + + local target + } + + * Check that targets don't repeat (targets are accumulated in full_targets) + local dups : list dups full_targets + if ("`dups'" != "") { + di as error "repeated targets in collapse: `dups'" + error 110 + } + + * disp "`call'" + c_local `local' `call' + c_local __gtools_gc_targets `full_targets' + c_local __gtools_gc_stats `full_stats' + c_local __gtools_gc_vars `full_vars' + c_local __gtools_gc_uniq_stats : list uniq full_stats + c_local __gtools_gc_uniq_vars : list uniq full_vars +end + +*********************************************************************** +* Parsing is adapted from Sergio Correia's fcollapse.ado * +*********************************************************************** + +capture program drop ParseList +program define ParseList + syntax [anything(equalok)] + local stat mean + + * Trim spaces (collapse runs of blanks into a single blank) + while strpos("`0'", "  ") { + local 0: subinstr local 0 "  " " " + } + local 0 `0' + + while (trim("`0'") != "") { + GetStat stat 0 : `0' + GetTarget target 0 : `0' + gettoken vars 0 : 0 + unab vars : `vars' + + * Must specify stat (if blank, we do the mean) + if ( "`stat'" == "" ) { + disp as err "option stat() required" + exit 198 + } + + foreach var of local vars { + if ("`target'" == "") local target `var' + + if ( `"`stat'"' == "var" ) local stat variance + if ( `"`stat'"' == "sem" ) local stat semean + if ( `"`stat'"' == "seb" ) local stat sebinomial + if ( `"`stat'"' == "sep" ) local stat sepoisson
+ if ( `"`stat'"' == "skew" ) local stat skewness + if ( `"`stat'"' == "kurt" ) local stat kurtosis + if ( regexm(`"`stat'"', " ") ) local stat: subinstr local stat " " "|", all + + local full_vars `full_vars' `var' + local full_targets `full_targets' `target' + local full_stats `full_stats' `stat' + + local target + } + } + + * Check that targets don't repeat (targets are accumulated in full_targets) + local dups : list dups full_targets + if ("`dups'" != "") { + di as error "repeated targets in collapse: `dups'" + error 110 + } + + c_local __gtools_gc_targets `full_targets' + c_local __gtools_gc_stats `full_stats' + c_local __gtools_gc_vars `full_vars' + c_local __gtools_gc_uniq_stats : list uniq full_stats + c_local __gtools_gc_uniq_vars : list uniq full_vars +end + +* GetStat: if the text after the colon starts with a parenthesized token, store +* that token in the first caller local named before the colon and the rest of +* the text in the second; otherwise leave both caller locals untouched. +capture program drop GetStat +program define GetStat + _on_colon_parse `0' + local before `s(before)' + gettoken lhs rhs : before + local rest `s(after)' + + gettoken stat rest : rest , match(parens) + if ("`parens'" != "") { + c_local `lhs' `stat' + c_local `rhs' `rest' + } +end + +* GetTarget: if the text after the colon has the form target = rest, store the +* target and the rest in the caller locals named before the colon and set the +* caller local eqsign to =; otherwise clear the caller local eqsign. +capture program drop GetTarget +program define GetTarget + _on_colon_parse `0' + local before `s(before)' + gettoken lhs rhs : before + local rest `s(after)' + + local rest : subinstr local rest "=" "= ", all + gettoken target rest : rest, parse("= ") + gettoken eqsign rest : rest + if ("`eqsign'" == "=") { + c_local `lhs' `target' + c_local `rhs' `rest' + c_local eqsign "=" + } + else { + c_local eqsign + } +end diff --git a/01.code/ado/g/gcollapse.sthlp b/01.code/ado/g/gcollapse.sthlp new file mode 100755 index 0000000..f33b9d8 --- /dev/null +++ b/01.code/ado/g/gcollapse.sthlp @@ -0,0 +1,417 @@ +{smcl} +{* *!
version 1.2.1 30Jan2020}{...} +{viewerdialog gcollapse "dialog gcollapse"}{...} +{vieweralsosee "[R] gcollapse" "mansection R gcollapse"}{...} +{viewerjumpto "Syntax" "gcollapse##syntax"}{...} +{viewerjumpto "Description" "gcollapse##description"}{...} +{viewerjumpto "Options" "gcollapse##options"}{...} +{viewerjumpto "Stored results" "gcollapse##results"}{...} +{title:Title} + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gcollapse} {hline 2}}Efficiently +make dataset of summary statistics using C.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Note}: Stata 17+, MP version, introduced significant speed improvements +to the native {cmd:collapse} command, especially with many cores. Depending +on the collapse, it can be up to twice as fast as {cmd:gcollapse}; however, +it remained slower for some use cases. YMMV. + +{marker syntax}{...} +{title:Syntax} + +{phang} +This is a fast option to Stata's {opt collapse} (9-300 times faster +in IC and 4-120 times faster in MP), with several additions.
+ +{p 8 17 2} +{cmd:gcollapse} +{it:clist} +{ifin} +[{it:{help gcollapse##weight:weight}}] +[{cmd:,} +{it:{help gcollapse##table_options:options}}] + +{pstd}where {it:clist} is either + +{p 8 17 2} +[{opt (stat)}] +{varlist} +[ [{opt (stat)}] {it:...} ]{p_end} + +{p 8 17 2} +[{opt (stat)}] {it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + [ [{opt (stat)}] {it:...}] + +{p 4 4 2}or any combination of the {it:varlist} or {it:target_var} forms, and +{it:stat} is one of{p_end} + +{p2colset 9 22 24 2}{...} +{p2col :{opt mean}}means (default){p_end} +{p2col :{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{p2col :{opt count}}number of nonmissing observations{p_end} +{p2col :{opt nmissing}}number of missing observations{p_end} +{p2col :{opt percent}}percentage of nonmissing observations{p_end} +{p2col :{opt nunique}}number of unique elements{p_end} +{p2col :{opt sum}}sums{p_end} +{p2col :{opt rawsum}}sums, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{p2col :{opt nansum}}sum; returns . instead of 0 if all entries are missing{p_end} +{p2col :{opt rawnansum}}rawsum; returns . 
instead of 0 if all entries are missing{p_end} +{p2col :{opt median}}medians (same as {opt p50}){p_end} +{p2col :{opt p#.#}}arbitrary quantiles{p_end} +{p2col :{opt p1}}1st percentile{p_end} +{p2col :{opt p2}}2nd percentile{p_end} +{p2col :{it:...}}3rd{hline 1}49th percentiles{p_end} +{p2col :{opt p50}}50th percentile (same as {cmd:median}){p_end} +{p2col :{it:...}}51st{hline 1}97th percentiles{p_end} +{p2col :{opt p98}}98th percentile{p_end} +{p2col :{opt p99}}99th percentile{p_end} +{p2col :{opt iqr}}interquartile range{p_end} +{p2col :{opt sd}}standard deviation{p_end} +{p2col :{opt var:iance}}variance{p_end} +{p2col :{opt cv}}coefficient of variation ({cmd:sd/mean}){p_end} +{p2col :{opt select#}}#th smallest{p_end} +{p2col :{opt select-#}}#th largest{p_end} +{p2col :{opt rawselect#}}#th smallest, ignoring weights{p_end} +{p2col :{opt rawselect-#}}#th largest, ignoring weights{p_end} +{p2col :{opt max}}maximums{p_end} +{p2col :{opt min}}minimums{p_end} +{p2col :{opt range}}range = {opt max} - {opt min}{p_end} +{p2col :{opt first}}first value{p_end} +{p2col :{opt last}}last value{p_end} +{p2col :{opt firstnm}}first nonmissing value{p_end} +{p2col :{opt lastnm}}last nonmissing value{p_end} +{p2col :{opt sem:ean}}standard error of the mean ({cmd:sd/sqrt(n)}){p_end} +{p2col :{opt seb:inomial}}standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}) (missing if source not 0, 1){p_end} +{p2col :{opt sep:oisson}}standard error of the mean, Poisson ({cmd:sqrt(mean / n)}) (result rounded to nearest integer){p_end} +{p2col :{opt skewness}}Skewness{p_end} +{p2col :{opt kurtosis}}Kurtosis{p_end} +{p2col :{opt gini}}Gini coefficient (negative truncated to 0){p_end} +{p2col :{opt gini dropneg}}Gini coefficient (negative values dropped){p_end} +{p2col :{opt gini keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{p2colreset}{...} + +{synoptset 18 tabbed}{...} +{marker table_options}{...} 
+{synopthdr} +{synoptline} +{syntab :Options} +{synopt :{opth by(varlist)}}groups over which {it:stat} is to be calculated. Prepend "-" to invert final sort order. +{p_end} +{synopt :{opt cw}}Drop observations case-wise where sources are missing. +{p_end} +{synopt :{opt fast}}do not preserve and restore the original dataset; +saves speed but leaves the data in an unusable state should the +user press {hi:Break} +{p_end} + +{syntab:Extras} +{synopt :{opth rawstat(varlist)}}Sequence of target names for which to ignore weights. +{p_end} +{synopt :{opt merge}}Merge statistics back to original data, replacing if applicable. +{p_end} +{synopt :{opt wild:parse}}Allow rename-style syntax in target naming +{p_end} +{synopt :{opt replace}}Allow replacing existing variables with output with {opt merge}. +{p_end} +{synopt :{opth freq(varname)}}Include frequency count with observations per group in {opt freq}. +{p_end} +{synopt :{opt labelf:ormat}}Custom label engine: {bf:(#stat#) #sourcelabel#} is the default. +{p_end} +{synopt :{opth labelp:rogram(str)}}Program to parse {opt labelformat} (see examples). +{p_end} +{synopt :{opt unsorted}}Do not sort resulting dataset. Saves speed. +{p_end} + +{syntab:Switches} +{synopt :{opt forceio}}Use disk temp drive for writing/reading collapsed data. +{p_end} +{synopt :{opt forcemem}}Use memory for writing/reading collapsed data. +{p_end} +{synopt :{opt double}}Generate all targets as doubles. +{p_end} +{synopt :{opt sumcheck}}Check whether byte, int, or long sum will overflow. +{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky).
Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s +are allowed and mimic {cmd:collapse}; see {help weight} and +{help collapse##weights:Weights (collapse)}. {opt pweight}s may not be used +with {opt sd}, {opt variance}, {opt cv}, {opt semean}, {opt sebinomial}, or {opt sepoisson}. +{opt iweight}s may not be used with {opt semean}, {opt sebinomial}, or +{opt sepoisson}. {opt aweight}s may not be used with {opt sebinomial} or +{opt sepoisson}.{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{opt gcollapse} converts the dataset in memory into a dataset of means, +sums, medians, etc. {it:clist} can refer only to numeric variables. + +{pstd} +first, last, firstnm, lastnm for string variables are not supported. + +{marker options}{...} +{title:Options} + +{dlgtab:Options} + +{phang} +{opth by(varlist)} specifies the groups over which the means, etc., are +to be calculated. It can contain any mix of string or numeric variables. + +{phang} +{opt cw} specifies casewise deletion. If {opt cw} is not specified, all +possible observations are used for each calculated statistic. + +{phang} +{opt fast} specifies that {opt gcollapse} not restore the original dataset +should the user press {hi:Break}. + +{dlgtab:Extras} + +{phang} +{opth rawstat(varlist)}Sequence of target names for which to ignore +weights, except observations with a weight of zero or missing, which are +excluded. This is a generalization of {opt rawsum}, but it is specified +for each individual target (if no target is specified, the source +variable name is what we call target). + +{phang} +{opt merge} merges the collapsed data back to the original data set. +Note that if you want to replace the source variable(s) then you need +to specify {opt replace}. 
+ +{phang} +{opt wildparse} specifies that the function call should be parsed +assuming targets are named using rename-style syntax. For example, +{cmd:gcollapse (sum) s_x* = x*, wildparse} + +{phang} +{opt replace} Replace allows replacing existing variables with {opt merge}. + +{phang} +{opth freq(varname)} stores the group frequency count in {opt freq}. It +differs from count because it merely stores the number of occurrences of +the group in the data, rather than the non-missing count. Hence it is +equivalent to summing a dummy variable equal to 1 everywhere. + +{phang} +{opth freq(varname)} Specifies that the row count of each group be stored +in {opt freq} after the collapse. + +{phang} +{opth labelformat(str)} Specifies the label format of the output. #stat# +is replaced with the statistic: #Stat# for titlecase, #STAT# +for uppercase, #stat:pretty# for a custom replacement; #sourcelabel# +for the source label and #sourcelabel:start:nchars# to extract a substring +from the source label. The default is (#stat#) #sourcelabel#. #stat# +placeholders in the source label are also replaced. + +{phang} +{opth labelprogram(str)} Specifies the program to use with #stat:pretty#. +This is an {opt rclass} program that must set {opt prettystat} as a return value. The +program must specify a value for each summary stat or return #default# to +use the default engine. The program is passed the requested stat by {opt gcollapse}. + +{phang} +{opt unsorted} Do not sort resulting data set. Saves speed. + +{dlgtab:Switches} + +{phang} +{opt forceio} By default, when there are more than 3 additional targets +(i.e. the number of targets is greater than the number of source +variables plus 3) the function tries to be smart about whether adding +empty variables in Stata before the collapse is faster or slower than +collapsing the data to disk and reading them back after keeping only the +first J observations (assuming J is the number of groups).
For J small +relative to N, collapsing to disk will be faster. This check involves +some overhead, however, so if J is known to be small {opt forceio} will +be faster. + +{phang} +{opt forcemem} The opposite of {opt forceio}. The check for whether to use +memory or disk check involves some overhead, so if J is known to +be large {opt forcemem} will be faster. + +{phang} +{opt double} stores data in double precision. + +{phang} +{opt sumcheck} Check whether byte, int, or long sum will overflow. By +default sum targets are double; in this case, sum targets check the +smallest integer type that will be suitable and only assigns a double if +the sum would overflow. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. 
{opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker memory}{...} +{title:Out of memory} + +{pstd} +(See also Stata's own discussion in {help memory:help memory}.) + +{pstd} +There are many reasons for why an OS may run out of memory. The best-case +scenario is that your system is running some other memory-intensive program. +This is specially likely if you are running your program on a server, where +memory is shared across all users. In this case, you should attempt to re-run +{it:gcollapse} once other memory-intensive programs finish. + +{pstd} +If no memory-intensive programs were running concurrently, the second best-case +scenario is that your user has a memory cap that your programs can use. Again, +this is specially likely on a server, and even more likely on a computing grid. +If you are on a grid, see if you can increase the amount of memory your programs +can use (there is typically a setting for this). If your cap was set by a system +administrator, consider contacting them and asking for a higher memory cap. + +{pstd} +If you have no memory cap imposed on your user, the likely scenario is that +your system cannot allocate enough memory for {it:gcollapse}. At this point +you have two options: One option is to try {it:fcollapse} or {it:collapse}, +which are slower but using either should require a trivial one-letter change +to the code; another option is to re-write the code to collapse the data in +segments (the easiest way to do this would be to collapse a portion of all +variables at a time and perform a series of 1:1 merges at the end). + +{pstd} +Replacing {it:gcollapse} with {it:fcollapse} or plain {it:collapse} is an +option because {it:gcollapse} often uses more memory. 
This is a consequence +of Stata's inability to create variables via C plugins. This forces +{it:gcollapse} to create variables before collapsing, meaning that if there +are {it:J} groups and {it:N} observations, {it:gcollapse} uses {it:N} - {it:J} +more rows than the ideal collapse program, per variable. + +{pstd} +{it:gcollapse} was written with this limitation in mind and tries to save +memory in various ways (for example, if {it:J} is small relative to {it:N}, +gcollapse will use free disk space instead of memory, which not only saves +memory but is also much faster). Nevertheless, it is possible that your system +will allocate enough memory for {it:fcollapse} or {it:collapse} in situations +where it cannot allocate enough memory for {it:gcollapse}. + +{marker example}{...} +{title:Examples} + +{pstd} +See {help collapse##examples} or the +{browse "http://gtools.readthedocs.io/en/latest/usage/gcollapse/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gcollapse} stores the following in {cmd:r()}: + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(minJ)}} smallest group size {p_end} +{synopt:{cmd:r(maxJ)}} largest group size {p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gcollapse} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file +for {it:collapse} and Sergio Correia's help
file for {it:fcollapse}. +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gcontract}, +{help gtoplevelsof}, +{help gtools}; +{help fcollapse} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/gcontract.ado b/01.code/ado/g/gcontract.ado new file mode 100755 index 0000000..9d2c409 --- /dev/null +++ b/01.code/ado/g/gcontract.ado @@ -0,0 +1,384 @@ +*! version 1.0.2 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Frequency counts using C-plugins for a speedup. + +* gcontract: replace the data in memory with one row per combination of the +* requested variables plus a frequency variable and, optionally, cumulative +* frequency, percentage and cumulative percentage variables. Counting is +* delegated to _gtools_internal with gfunction(contract); return code 17999 +* falls back to the native -contract- command. Stores N, J, minJ, maxJ in r(). +cap program drop gcontract +program gcontract, rclass + version 13.1 + + if ( `=_N' == 0 ) { + di as err "no observations" + exit 2000 + } + + global GTOOLS_CALLER gcontract + syntax anything [if] [in] [fw], /// [if condition] [in start / end] [fw = exp] + [ /// + Freq(string) /// Name of frequency variable + CFreq(name) /// Add cumulative frequency in cfreq + Percent(name) /// Add percentages in percent + CPercent(name) /// Add cumulative percentages in cpercent + FLOAT /// Store percentages in float variables + FORMat(string) /// Format for percentage variables + Zero /// Include varlist combinations with 0 frequency + noMISS /// Exclude rows with missing values in varlist + NODS DS /// Parse - as varlist (ds) or negative (nods) + /// + fast /// Do not preserve and restore the original dataset. Saves speed + /// but leaves data unusable if the user hits Break. 
+ unsorted /// Do not sort the data; faster + /// + debug(passthru) /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// print function benchmark info + BENCHmarklevel(int 0) /// print plugin benchmark info + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + ] + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + local missing = cond("`miss'" == "nomiss", "", "missing") + + if ( ("`ds'" != "") & ("`nods'" != "") ) { + di as err "-ds- and -nods- mutually exclusive" + exit 198 + } + + * Set type and format for generated numeric variables + * --------------------------------------------------- + + if ( (`"`percent'"' == "") & (`"`cpercent'"' == "") & (`"`float'"' != "") ) { + di as error "percent or cpercent must be specified" + exit 198 + } + else if ( `"`float'"' == "" ) { + local numtype "double" + } + else { + local numtype "float" + } + + if ( `=_N < maxlong()' ) { + local freqtype long + } + else { + local freqtype double + } + + if ( (`"`percent'"' == "") & (`"`cpercent'"' == "") & (`"`format'"' != "") ) { + di as error "percent or cpercent must be specified" + exit 198 + } + else if `"`format'"' == "" { + local format "%8.2f" + } + + * Check generated variables + * ------------------------- + + if ( "`zero'" != "" ) { + capture confirm new variable _fillin + if ( _rc != 0 ) { + di as error "_fillin already defined" + exit 110 + } + } + + * Parse variable names + * -------------------- + + if ( `"`freq'"' == "" ) { + capture confirm new variable _freq + if ( _rc == 0 ) { + local freq "_freq" + } + else { + di as error "_freq already defined: " /// + "use freq() 
option to specify frequency variable" + exit 110 + } + } + else { + confirm new variable `freq' + } + + local types `freqtype' + local newvars `freq' + local cwhich 1 + + if ( `"`cfreq'"' != "" ) { + confirm new variable `cfreq' + local newvars `newvars' `cfreq' + local types `types' `freqtype' + local cwhich `cwhich' 1 + } + else { + local cwhich `cwhich' 0 + } + + if ( `"`percent'"' != "" ) { + confirm new variable `percent' + local newvars `newvars' `percent' + local types `types' `numtype' + local cwhich `cwhich' 1 + } + else { + local cwhich `cwhich' 0 + } + + if ( `"`cpercent'"' != "" ) { + confirm new variable `cpercent' + local newvars `newvars' `cpercent' + local types `types' `numtype' + local cwhich `cwhich' 1 + } + else { + local cwhich `cwhich' 0 + } + + * Get varlist + * ----------- + + if ( `"`anything'"' != "" ) { + local varlist: copy local anything + local varlist: subinstr local varlist "+" " ", all + if ( strpos(`"`varlist'"', "-") & ("`ds'`nods'" == "") ) { + disp as txt "'-' interpreted as negative; use option -ds- to interpret as varlist" + disp as txt "(to suppress this warning, use option -nods-)" + } + if ( "`ds'" != "" ) { + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid varlist: `anything'" + exit 198 + } + cap ds `varlist' + if ( _rc ) { + cap noi ds `varlist' + exit _rc + } + local varlist `r(varlist)' + local anything: copy local varlist + } + else { + local parse: copy local varlist + local varlist: subinstr local varlist "-" " ", all + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid list: `anything'" + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit 198 + } + cap ds `varlist' + if ( _rc ) { + local notfound + foreach var of local varlist { + cap confirm var `var' + if ( _rc ) { + local notfound `notfound' `var' + } + } + if ( `:list sizeof notfound' > 0 ) { + if ( `:list sizeof notfound' > 1 ) { + di as err "Variables not found: `notfound'" + } + else { + di as err 
"Variable `notfound' not found" + } + } + exit 111 + } + local varlist + local anything + while ( `:list sizeof parse' ) { + gettoken var parse: parse, p(" -") + local neg + if inlist("`var'", "-") { + gettoken var parse: parse, p(" -") + local neg - + } + cap ds `var' + if ( _rc ) { + local rc = _rc + di as err "Variable '`var'' does not exist." + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit `rc' + } + foreach v of varlist `var' { + local anything `anything' `neg'`v' + local varlist `varlist' `v' + } + } + } + } + if ( "`ds'" == "" ) local nods nods + + * Create variables + * ---------------- + + if ( "`fast'" == "" ) preserve + gtools_timer on 97 + + if ( `"`if'`in'"' != "" ) qui keep `if' `in' + if ( `"`weight'"' != "" ) { + tempvar w touse + qui gen double `w' `exp' + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + mark `touse' `wgt' + qui keep if `touse' + } + else local weights + + qui ds * + local memvars `r(varlist)' + local keepvars `varlist' `w' + local dropvars: list memvars - keepvars + if ( "`dropvars'" != "" ) qui mata: st_dropvar(tokens(`"`dropvars'"')) + qui mata: st_addvar(tokens(`"`types'"'), tokens(`"`newvars'"')) + + local bench = ( "`benchmark'" != "" ) + local msg "Added target variables" + gtools_timer info 97 `"`msg'"', prints(`bench') off + + * Call the plugin + * --------------- + + local opts `weights' `missing' `unsorted' `compress' `forcestrl' `ds' `nods' + local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance' + local opts `opts' `oncollision' `hashmethod' `debug' + + local gcontract gcontract(`newvars', contractwhich(`cwhich')) + cap noi _gtools_internal `anything', `opts' gfunction(contract) `gcontract' + + local rc = _rc + global GTOOLS_CALLER "" + if ( `rc' == 17999 ) { + if ( strpos("`anything'", "-") & ("`ds'" == "") ) { + di as err "Cannot use fallback with inverted sorting." 
+ exit 17000 + } + else { + local copts f(`freq') /// + cf(`cfreq') /// + p(`percent') /// + cp(`cpercent') /// + `float' /// + format(`format') /// + `zero' /// + `miss' + contract `varlist', `copts' + if ( "`fast'" == "" ) restore, not + exit 0 + } + } + else if ( `rc' == 17001 ) { + error 2000 + } + else if ( `rc' ) { + exit `rc' + } + + local r_N = `r(N)' + local r_J = `r(J)' + local r_minJ = `r(minJ)' + local r_maxJ = `r(maxJ)' + matrix __gtools_invert = r(invert) + + return scalar N = `r_N' + return scalar J = `r_J' + return scalar minJ = `r_minJ' + return scalar maxJ = `r_maxJ' + + * Exit in the style of contract + * ----------------------------- + + qui keep in 1 / `:di %21.0g `r_J'' + if ( "`zero'" != "" ) { + qui fillin `varlist' + qui replace `freq' = 0 if `freq' >= . + qui drop _fillin + * The percent variable exists only when percent() was given; test the option itself + if ( `"`percent'"' != "" ) { + qui replace `percent' = 0 if `percent' >= . + } + if ( "`cpercent'`cfreq'" != "" ) { + foreach var of varlist `cfreq' `cpercent' { + qui replace `var' = 0 in 1 if `var'[1] >= . + if ( `=_N' > 1 ) { + qui replace `var' = `var'[_n - 1] in 2 / `=_N' if `var' >= . 
+ } + } + } + } + + qui compress `freq' `cfreq' `percent' `cpercent' + + if ( "`percent'`cpercent'" != "" ) { + format `format' `percent' `cpercent' + } + + * Set sort var using varlist + * -------------------------- + + if ( "`unsorted'" == "" ) { + mata: st_local("invert", strofreal(sum(st_matrix("__gtools_invert")))) + if ( `invert' ) { + mata: st_numscalar("__gtools_first_inverted", /// + selectindex(st_matrix("__gtools_invert"))[1]) + if ( `=scalar(__gtools_first_inverted)' > 1 ) { + local sortvars "" + forvalues i = 1 / `=scalar(__gtools_first_inverted) - 1' { + local sortvars `sortvars' `:word `i' of `varlist'' + } + sort `sortvars' + } + } + else { + sort `varlist' + } + } + + * Drop the helper scalar and matrix on every path, including under -unsorted- + cap scalar drop __gtools_first_inverted + cap matrix drop __gtools_invert + + if ( "`fast'" == "" ) restore, not +end + + +*********************************************************************** +* Generic helpers * +*********************************************************************** + +* gtools_timer: small wrapper around -timer-. Subcommands on or start restart +* the given timer; info stores the elapsed seconds in r(t#) and r(pretty#), +* optionally prints them after the message, and restarts the timer. Options +* end or off stop and clear the timer. +capture program drop gtools_timer +program gtools_timer, rclass + syntax anything, [prints(int 0) end off] + tokenize `"`anything'"' + local what `1' + local timer `2' + local msg `"`3'; "' + + if ( inlist("`what'", "start", "on") ) { + cap timer off `timer' + cap timer clear `timer' + timer on `timer' + } + else if ( inlist("`what'", "info") ) { + timer off `timer' + qui timer list + return scalar t`timer' = `r(t`timer')' + return local pretty`timer' = trim("`:di %21.4gc r(t`timer')'") + if ( `prints' ) di `"`msg'`:di trim("`:di %21.4gc r(t`timer')'")' seconds"' + timer off `timer' + timer clear `timer' + timer on `timer' + } + + if ( "`end'`off'" != "" ) { + timer off `timer' + timer clear `timer' + } +end diff --git a/01.code/ado/g/gcontract.sthlp b/01.code/ado/g/gcontract.sthlp new file mode 100755 index 0000000..834f725 --- /dev/null +++ b/01.code/ado/g/gcontract.sthlp @@ -0,0 +1,247 @@ +{smcl} +{* *! 
version 1.0.2 23Jan2019}{...} +{viewerdialog gcontract "dialog gcontract"}{...} +{vieweralsosee "[R] gcontract" "mansection R gcontract"}{...} +{viewerjumpto "Syntax" "gcontract##syntax"}{...} +{viewerjumpto "Description" "gcontract##description"}{...} +{viewerjumpto "Options" "gcontract##options"}{...} +{viewerjumpto "Stored results" "gcontract##results"}{...} +{title:Title} + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gcontract} {hline 2}}Efficiently make dataset of frequencies and percentages using C plugins.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{phang} This is a fast option to Stata's {opt contract}. +It is 5-7 times faster in Stata/IC and 2.5-4 times faster in MP + +{p 8 17 2} +{cmd:gcontract} +{varlist} +{ifin} +[{it:{help gcontract##weight:weight}}] +[{cmd:,} {it:{help gcontract##table_options:options}}] + +{pstd} +Instead of {varlist}, it is possible to specify + +{p 8 17 2} +[{cmd:+}|{cmd:-}] +{varname} +[[{cmd:+}|{cmd:-}] +{varname} {it:...}] + +{pstd} +This will not affect the results, but it will affect the sort order of the +final data. 
+{synoptset 18 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{synopt :{opth f:req(newvar)}}name of frequency variable; default is {opt _freq}{p_end} +{synopt :{opth cf:req(newvar)}}create cumulative frequency variable{p_end} +{synopt :{opth p:ercent(newvar)}}create percentage variable{p_end} +{synopt :{opth cp:ercent(newvar)}}create cumulative percentage variable{p_end} +{synopt :{opt float}}generate percentage variables as type {opt float}{p_end} +{synopt :{opth form:at(format)}}display format for new percentage variables; default is {cmd:format(%8.2f)}{p_end} +{synopt :{opt z:ero}}include combinations with frequency zero (VERY SLOW){p_end} +{synopt :{opt nomiss}}drop observations with missing values{p_end} + +{syntab :Options} +{synopt :{opt unsorted}}Do not sort resulting dataset. Saves speed. {p_end} +{synopt :{opt fast}} Mirrors the same option in {opt collapse}. Do not preserve and restore the original dataset; saves speed but leaves the data in an unusable state should the user press {hi:Break} {p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt fweight}s are allowed; see {help weight}. 
+{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{opt gcontract} replaces the dataset in memory with a new dataset consisting +of all combinations of {varlist} that exist in the data and a new variable +that contains the frequency of each combination. The user can optionally request +percentages and cumulative counts and percentages. + +{pstd} +{opt gcontract} is part of the {manhelp gtools R:gtools} project. + +{marker options}{...} +{title:Options} + +{dlgtab:Options} + +{phang} +{opth freq(newvar)} specifies a name for the frequency +variable. If not specified, {opt _freq} is used. + +{phang} +{opth cfreq(newvar)} specifies a name for the +cumulative frequency variable. If not specified, no cumulative frequency +variable is created. + +{phang} +{opth percent(newvar)} specifies a name for the percentage variable. +If not specified, no percent variable is created. + +{phang} +{opth cpercent(newvar)} specifies a name for the +cumulative percentage variable. If not specified, no cumulative percentage +variable is created. + +{phang} +{opt float} specifies that the percentage variables specified by +{opt percent()} and {opt cpercent()} will be stored as variables of type +{helpb data types:float}. This only affects the Stata storage type; +{opt gtools} does all computations internally in double precision. If +{opt float} is not specified, these variables will be generated as variables +of type {helpb double}. All generated variables are compressed to the +smallest storage type possible without loss of precision; see {manhelp compress D}. + +{phang} +{opth format(format)} specifies a +display format for the generated percentage variables specified +by {opt percent()} and {opt cpercent()}. If {opt format()} is not specified, +these variables will have the display format {cmd:%8.2f}. + +{phang} +{opt zero} specifies that combinations with frequency zero be included. +This is VERY slow. 
+{phang} +{opt nomiss} specifies that observations with missing values on any +variable in {varlist} be dropped. If {opt nomiss} is not specified, all +observations possible are used. + +{dlgtab:Extras} + +{phang} +{opt fast} specifies that {opt gcontract} not restore the original dataset +should the user press {hi:Break}. + +{phang} +{opt unsorted} Do not sort resulting data set. Saves speed. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some Windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. 
+The user can request that it throw an error instead by passing {opt oncollision(error)}. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gcontract/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gcontract} stores the following in {cmd:r()}: + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(minJ)}} smallest group size {p_end} +{synopt:{cmd:r(maxJ)}} largest group size {p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gcontract} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:contract}. +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gcollapse}, +{help gtools}; +{help fcollapse} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/gdistinct.ado b/01.code/ado/g/gdistinct.ado new file mode 100755 index 0000000..5dd30a2 --- /dev/null +++ b/01.code/ado/g/gdistinct.ado @@ -0,0 +1,220 @@ +*! version 1.0.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
-distinct- implementation using C for faster processing + +* TODO: xx make sort take multiple inputs to decide tie-breaks; add 'memory' option +* gdistinct: report the total and distinct observation counts of each variable +* in varlist, or of the varlist jointly with option joint, by calling +* _gtools_internal with gfunction(unique). Return code 17999 falls back to the +* user-written -distinct- command. Results are returned in r(), including the +* matrix r(distinct) with columns N and Distinct. +capture program drop gdistinct +program gdistinct, rclass + version 13.1 + + if ( `=_N < 1' ) { + di as err "no observations" + exit 2000 + } + + global GTOOLS_CALLER gunique + syntax [varlist] [if] [in] , /// + [ /// + MISSing /// Include missing values + Joint /// Report distinct values for varlist jointly + MINimum(int 0) /// Report distinct only for groups with at least min + MAXimum(int -1) /// Report distinct only for groups with at most max + Abbrev(int -1) /// Abbrev print of var names + SORTby(str) /// Sort output matrix (alpha, distinct, total) + /// + debug(passthru) /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + ] + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + if ( `maximum' == -1 ) local maximum . 
+ + if ( `minimum' > `maximum' ) { + local swap `minimum' + local minimum `maximum' + local maximum `swap' + di as txt "min(`maximum') max(`minimum') interpreted as min(`minimum') max(`maximum')" + } + + local keepvars "" + tempname ndistinct + + local opts `missing' `compress' `forcestrl' countonly unsorted + local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance' + local opts `opts' `oncollision' `hashmethod' `debug' + + local sortalpha a al alp alph alpha + local sortdistinct d di dis dist disti distin distinc distinct + local sorttotal t to tot tota total + if ( `"`sortby'"' != "" ) { + local sortby `sortby' + local pm = regexm(`"`sortby'"', "([+-]?)(.+)") + if ( `pm' ) { + local sortby = regexs(2) + local sortdesc = cond(regexs(1) == "-", "-", "") + } + + local sortby `sortby' + if ( (`:list sortby in sortalpha' + `:list sortby in sortdistinct' + `:list sortby in sorttotal') == 0 ) { + disp as err "Option sort() incorrectly specified; must be one of: alpha, distinct, total" + exit 198 + } + + if ( "`joint'" != "" ) { + disp as txt "Option sort() ignored with joint" + } + } + + * --------------------------------------------------- + * Joint or individual distinct for all vars requested + * --------------------------------------------------- + + if ( "`joint'" != "" ) { + cap noi _gtools_internal `varlist' `if' `in', `opts' gfunction(unique) + + local rc = _rc + global GTOOLS_CALLER "" + if ( `rc' == 17999 ) { + distinct `varlist' `if' `in', /// + `missing' `joint' min(`minimum') max(`maximum') a(`abbrev') + exit 0 + } + else if ( `rc' == 17001 ) { + local r_N = 0 + local r_J = 0 + local r_ndistinct = 0 + local r_minJ = 0 + local r_maxJ = 0 + matrix `ndistinct' = (0, 0) + exit 0 + } + else if ( `rc' ) { + exit `rc' + } + else { + local r_N = `r(N)' + local r_J = `r(J)' + local r_ndistinct = `r(J)' + local r_minJ = `r(minJ)' + local r_maxJ = `r(maxJ)' + matrix `ndistinct' = (`r(N)', `r(J)') + } + + di + di in text " Observations" + di in text " 
total distinct" + if ( (`r_J' >= `minimum') & (`r_J' <= `maximum') ) { + di as res %11.0g `r_N' " " %9.0g `r_J' + } + } + else { + if ( `abbrev' == -1 ) { + foreach v of local varlist { + local abbrev = max(`abbrev', length("`v'")) + } + } + + local abbrev = max(`abbrev', 5) + local abbp2 = `abbrev' + 2 + local abbp3 = `abbrev' + 3 + + local k = 0 + mata: __gtools_distinct = J(2, `:list sizeof varlist', "") + + foreach v of local varlist { + cap noi _gtools_internal `v' `if' `in', `opts' gfunction(unique) + + local rc = _rc + if ( `rc' == 17999 ) { + * Collision fallback: -distinct- reports on the whole varlist at once, + * so clean up and stop here, as the joint branch does + global GTOOLS_CALLER "" + cap mata: mata drop __gtools_distinct + distinct `varlist' `if' `in', /// + `missing' `joint' min(`minimum') max(`maximum') a(`abbrev') + exit 0 + } + else if ( `rc' == 17001 ) { + local r_N = 0 + local r_J = 0 + local r_ndistinct = 0 + local r_minJ = 0 + local r_maxJ = 0 + } + else if ( `rc' ) { + global GTOOLS_CALLER "" + cap mata: mata drop __gtools_distinct + exit `rc' + } + else { + local r_N = `r(N)' + local r_J = `r(J)' + local r_ndistinct = `r(J)' + local r_minJ = `r(minJ)' + local r_maxJ = `r(maxJ)' + } + + if ( (`r_J' >= `minimum') & (`r_J' <= `maximum') ) { + local keepvars `keepvars' `v' + local ++k + mata: __gtools_distinct[1, `k'] = `"" " as txt %`abbrev's abbrev("`v'", `abbrev')"' + mata: __gtools_distinct[2, `k'] = `"" {c |} " as res %9.0g `r_N' " " %9.0g `r_J'"' + matrix `ndistinct' = nullmat(`ndistinct') \ (`r_N', `r_J') + } + } + + * All plugin calls are done; clear the caller flag on the normal path too + global GTOOLS_CALLER "" + + * ------------------------------- + * Custom sort order, if requested + * ------------------------------- + + if ( `"`sortby'"' == "" ) { + mata __gtools_order = 1::`k' + } + else if ( `:list sortby in sortalpha' ) { + mata __gtools_order = order(tokens("`keepvars'")', `sortdesc'1) + } + else if ( `:list sortby in sortdistinct' ) { + mata __gtools_order = order(st_matrix("`ndistinct'")[., 2], `sortdesc'1) + } + else if ( `:list sortby in sorttotal' ) { + mata __gtools_order = order(st_matrix("`ndistinct'")[., 1], `sortdesc'1) + } + + mata: st_local("keepvars", 
invtokens(tokens("`keepvars'")[__gtools_order])) + mata: __gtools_distinct = __gtools_distinct[., __gtools_order] + mata: st_matrix("`ndistinct'", st_matrix("`ndistinct'")[__gtools_order, .]) + + * -------------- + * Display output + * -------------- + + di + di as txt _col(`abbp3') "{c |} Observations" + di as txt _col(`abbp3') "{c |} total distinct" + di as txt "{hline `abbp2'}{c +}{hline 22}" + forvalues i = 1 / `k' { + mata: st_local("d1", __gtools_distinct[1, `i']) + mata: st_local("d2", __gtools_distinct[2, `i']) + di `d1' /// + `d2' + } + cap mata: mata drop __gtools_distinct + cap mata: mata drop __gtools_order + } + + if ( ("`joint'" == "") & ("`keepvars'" != "") ) { + matrix rownames `ndistinct' = `keepvars' + } + matrix colnames `ndistinct' = N Distinct + + return scalar N = `r_N' + return scalar J = `r_J' + return scalar ndistinct = `r_J' + return scalar minJ = `r_minJ' + return scalar maxJ = `r_maxJ' + return matrix distinct = `ndistinct' +end diff --git a/01.code/ado/g/gdistinct.sthlp b/01.code/ado/g/gdistinct.sthlp new file mode 100755 index 0000000..a413851 --- /dev/null +++ b/01.code/ado/g/gdistinct.sthlp @@ -0,0 +1,202 @@ +{smcl} +{* *! version 1.0.2 23Jan2019}{...} +{viewerdialog gdistinct "dialog gdistinct"}{...} +{vieweralsosee "[D] gdistinct" "mansection D gdistinct"}{...} +{viewerjumpto "Syntax" "gdistinct##syntax"}{...} +{viewerjumpto "Description" "gdistinct##description"}{...} +{viewerjumpto "Options" "gdistinct##options"}{...} +{title:Title} + + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gdistinct} {hline 2}}Efficiently report number(s) of distinct observations or values{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{phang} +This is a fast option to the user command {help distinct}, +additionally storing the results in a matrix. 
+It is 4 to 26 times faster in Stata/IC and 4-12 times faster in MP + +{p 8 17 2}{cmd:gdistinct} [{varlist}] +{ifin} +[{cmd:,} {opt miss:ing} +{opt a:bbrev(#)} +{opt j:oint} +{opt min:imum(#)} +{opt max:imum(#)} +] + + +{marker description}{...} +{title:Description} + +{pstd} +{opt gdistinct} is a faster alternative to {help distinct}. It displays the +number of distinct observations with respect to the variables in {varlist}. +By default, each variable is considered separately (excluding missing values) +so that the number of distinct observations for each variable is reported and +in this case the results are stored in a matrix. + +{pstd} +The number of distinct observations is the same as the number of distinct +values. Optionally, variables can be considered jointly so that the number +of distinct groups defined by the values of variables in {it:varlist} is +reported. + +{pstd} +{opt gdistinct} is part of the {manhelp gtools R:gtools} project. + +{marker options}{...} +{title:Options} + +{dlgtab:Options} + +{p 4 4 2}{cmd:missing} specifies that missing values are to be included +in counting distinct observations. + +{p 4 4 2}{opt abbrev(#)} specifies that variable names are to be +displayed abbreviated to at most {it:#} characters. This option has no +effect with {cmd:joint}. + +{p 4 4 2}{cmd:joint} specifies that distinctness is to be determined +jointly for the variables in {it:varlist}. + +{p 4 4 2}{opt minimum(#)} specifies that numbers of distinct values are to be displayed only if they are +equal to or greater than a specified minimum. + +{p 4 4 2}{opt maximum(#)} specifies that numbers of distinct values are to be displayed only if they are +less than or equal to a specified maximum. + +{p 4 4 2}{opt sort(order)} sort output. {it:order} may be {opt a:lpha} +(alphabetical by variable name), {opt d:istinct} (number of distinct values), +or {opt t:otal} (number of non-missing values, unless option {cmd:missing} is +specified). 
Optionally prepend a negative sign to sort in descending order. +Tie-breaks are resolved arbitrarily. This is ignored with option {cmd:joint}. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker examples}{...} +{title:Examples} + +{p 4 4 2}{cmd:. sysuse auto}{p_end} +{p 4 4 2}{cmd:. gdistinct} {p_end} +{p 4 4 2}{cmd:. gdistinct, max(10)} {p_end} +{p 4 4 2}{cmd:. gdistinct make-headroom}{p_end} +{p 4 4 2}{cmd:. 
gdistinct make-headroom, missing abbrev(6)}{p_end} +{p 4 4 2}{cmd:. gdistinct foreign rep78, joint}{p_end} +{p 4 4 2}{cmd:. gdistinct foreign rep78, joint missing} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gdistinct/index.html#examples":online documentation} +for more examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gdistinct} stores the following in {cmd:r()}: + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations (last variable or joint) {p_end} +{synopt:{cmd:r(J) }} number of groups (last variable or joint) {p_end} +{synopt:{cmd:r(ndistinct)}} number of groups (last variable or joint) {p_end} +{synopt:{cmd:r(minJ) }}smallest group size (last variable or joint) {p_end} +{synopt:{cmd:r(maxJ) }}largest group size (last variable or joint) {p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(distinct)}}number of non-missing observations and number of distinct values, in columns N and Distinct (one row per variable or joint){p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gdistinct} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{cmd:gdistinct} was written largely to mimic the functionality of the community-contributed command {cmd:distinct}, +written by + +{p 8 8 2} +Gary Longton, Fred Hutchinson Cancer Research Center, USA{break} +glongton@fhcrc.org + +{p 8 8 2} +Nicholas J. 
Cox, Durham University, UK{break} +n.j.cox@durham.ac.uk + +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + + +{title:Also see} + +{p 4 13 2} +help for +{help gunique}, +{help gtools}; +{help distinct} (if installed) + diff --git a/01.code/ado/g/gduplicates.ado b/01.code/ado/g/gduplicates.ado new file mode 100755 index 0000000..a61bbd3 --- /dev/null +++ b/01.code/ado/g/gduplicates.ado @@ -0,0 +1,372 @@ +*! version 1.0.0 20Sep2018 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! -duplicates- implementation using -gegen tag- for faster processing + +capture program drop gduplicates +program gduplicates, rclass + version 13.1 + + local 00 `0' + gettoken cmd 0 : 0, parse(" ,") + local l = length("`cmd'") + + * Get subcommand (any leading abbreviation is accepted, except for -drop-, which must be typed in full) + * -------------- + + if ( `l' == 0 ) { + di "{err}subcommand needed; see help on {help gduplicates##|_new:gduplicates}" + exit 198 + } + + if ( substr("report", 1, max(1, `l')) == "`cmd'" ) { + local cmd "report" + } + else if ( substr("examples", 1, max(1, `l')) == "`cmd'" ) { + local cmd "examples" + } + else if ( substr("list", 1, max(1, `l')) == "`cmd'" ) { + local cmd "list" + } + else if ( substr("browse", 1, max(1, `l')) == "`cmd'" ) { + local cmd "browse" + disp "{p 0 0 2}As of Stata 11.0, browse is no longer a valid " /// + "{cmd}duplicates subcommand; hence gtools will not support it. "
/// + "{result}See {help duplicates##remarks:Remarks} under help " /// + "{helpb duplicates} for an explanation.{p_end}" + exit 198 + } + else if ( substr("tag", 1, max(1, `l')) == "`cmd'" ) { + local cmd "tag" + } + else if ( "drop" == "`cmd'" ) { + * OK + } + else { + di "{err}illegal {cmd}gduplicates {err}subcommand" + exit 198 + } + + * Check syntax + * ------------ + + if ( "`cmd'" == "drop" ) { + capture syntax varlist [if] [in], [gtools(str)] + if ( _rc == 0 ) { + di "{err}force option required with {cmd}gduplicates drop {it}varlist{rm}" + exit 198 + } + + capture syntax varlist [if] [in], force [gtools(str)] + if ( _rc ) { + syntax [varlist] [if] [in], [gtools(str)] + unab varlist : _all + * local varlist : subinstr local varlist "`_sortindex'" "" + local vartext "{txt} all variables" + } + else local vartext "{res} `varlist'" + } + else if "`cmd'" == "tag" { + syntax [varlist(default=none)] [if] [in], Generate(str) [gtools(str)] + capture confirm new variable `generate' + if ( _rc ) { + di as err "generate() must specify new variable" + exit _rc + } + + if ( "`varlist'" == "" ) { + unab varlist : _all + * local varlist : subinstr local varlist "`_sortindex'" "" + local vartext "{txt} all variables" + } + else local vartext "{res} `varlist'" + } + else { + syntax [varlist(default=none)] [if] [in] [ , SORTed UNSORTed gtools(str) * ] + if ( "`varlist'" == "" ) { + unab varlist : _all + * local varlist : subinstr local varlist "`_sortindex'" "" + local vartext "{txt} all variables" + } + else local vartext "{res} `varlist'" + } + + * Dedup algorithm + * --------------- + + tempvar example Ngroup freq surplus dgroup order + /* + order 1 up _n when called + dgroup 0 if unique on varlist (not a "duplicated" group) + 1 up labels groups which share identical values on varlist + Ngroup 1 if unique on varlist + 2 up is # in each dgroup + example 1 to show if showing examples -- and to keep if -drop- + 0 to drop if -drop- + freq # # in each group + surplus # # of
surplus observations + */ + + di _n "{p 0 4}{txt}Duplicates in terms of `vartext'{p_end}" + + * tag - count duplicates by group + * ------------------------------- + + if ( "`cmd'" == "tag" ) { + global GTOOLS_DUPS gduplicates + cap noi gegen `generate' = count(1) `if' `in', by(`varlist') missing `gtools' + global GTOOLS_DUPS "" + + if ( _rc == 2000 ) { + error 2000 + } + else if ( _rc ) { + error _rc + } + + qui replace `generate' = `generate' - 1 + qui compress `generate' + exit 0 + } + + * report - stats on duplicates + * ---------------------------- + + if ( "`cmd'" == "report" ) { + if ( `"`if'"' != "" ) { + marksample touse, novarlist + local ifin if `touse' `in' + } + else { + mata st_local("ifin", st_local("if") + " " + st_local("in")) + } + + global GTOOLS_DUPS gduplicates + * cap noi gegen `Ngroup' = count(1) `ifin', by(`varlist') missing `gtools' + cap noi gegen `example' = tag(`varlist') `ifin', counts(`Ngroup') missing `gtools' + global GTOOLS_DUPS "" + + if ( _rc == 2000 ) { + error 2000 + } + else if ( _rc ) { + error _rc + } + + return scalar unique_value = `r(J)' + + global GTOOLS_DUPS gduplicates + cap noi gegen `freq' = count(1) `ifin', by(`Ngroup') missing `gtools' + global GTOOLS_DUPS "" + + if ( _rc == 2000 ) { + error 2000 + } + else if ( _rc ) { + error _rc + } + + gen `surplus' = `freq' - ( `freq' / `Ngroup' ) + + label var `Ngroup' "copies" + label var `freq' "observations" + label var `surplus' "surplus" + + tabdisp `Ngroup' if `example', cell(`freq' `surplus') + local varcount: word count `varlist' + + exit 0 + } + + * drop + * ---- + + if ( "`cmd'" == "drop" ) { + if ( `"`if'`in'"' != "" ) { + marksample touse, novarlist + local ifin if `touse' `in' + } + + global GTOOLS_DUPS gduplicates + cap noi gegen `example' = tag(`varlist') `ifin', missing `gtools' + global GTOOLS_DUPS "" + + if ( _rc == 2000 ) { + error 2000 + } + else if ( _rc ) { + error _rc + } + + * bail out now if no duplicates + if ( `r(N)' == `r(J)' ) { + di _n as txt "(0
observations are duplicates)" + exit 0 + } + + di + if ( `"`if'`in'"' == "" ) { + noisily keep if `example' + } + else { + noisily keep if `example' | !`touse' + } + exit 0 + } + + * examples or list + * ---------------- + + local opts varlist(`varlist') ifin(`if' `in') cmd(`cmd') + * if ( "`unsorted'" == "" ) { + if ( "`sorted'" != "" ) { + cap noi examplesList, `opts' gtools(`gtools') `options' + exit _rc + } + else { + cap noi examplesListUnsorted, `opts' gtools(`gtools') `options' + exit _rc + } +end + +* Examples and list: helpers gduplicates calls for -examples- and -list- (unsorted unless option sorted is given) +* ----------------- + +capture program drop examplesListUnsorted +program examplesListUnsorted + syntax, varlist(str) cmd(str) [ifin(str asis) gtools(str) noWARNing *] + + tempvar example Ngroup freq surplus dgroup order + + global GTOOLS_CALLER ghash + local opts missing gfunction(hash) `gtools' + local gopts gen(`dgroup') counts(`Ngroup') tag(`example') + + cap noi _gtools_internal `varlist' `ifin', `gopts' `opts' + global GTOOLS_CALLER "" + + if ( _rc == 17999 ) { + duplicates `cmd' `varlist' `ifin', `options' // fallback: rebuild a native -duplicates- call (`0' holds this helper's own option list) + exit 0 + } + else if ( _rc == 17001 ) { + error 2000 + } + else if ( _rc ) { + exit _rc + } + + * bail out now if no duplicates + if ( `r(J)' == `r(N)' ) { + di _n as txt "(0 observations are duplicates)" + exit 0 + } + else { + di _n as txt "`=`r(N)' - `r(J)'' observations are duplicates. Examples:" + } + + if ( `"`warning'"' != "nowarning" ) { + disp "({cmd}note: {cmd}`cmd' {txt}left unsorted to improve performance; use option {cmd}sort {txt}to mimic {cmd}duplicates)" + } + + qui replace `dgroup' = 0 if ( `Ngroup' == 1 ) | mi(`dgroup') + gen long `order' = _n + + if ( "`cmd'" == "examples" ) { + char `order'[varname] "e.g.
obs:" + char `dgroup'[varname] "group:" + char `Ngroup'[varname] "#" + if ( `r(J)' > 1 ) { + local lopts subvarname noobs `options' + local lvars `dgroup' `Ngroup' `order' `varlist' + list `lvars' if `example' & `dgroup', `lopts' + } + else { + local lopts subvarname noobs `options' + local lvars `Ngroup' `order' `varlist' + list `lvars' if `example' & `dgroup', `lopts' + } + } + else if ( "`cmd'" == "list" ) { + char `order'[varname] "obs:" + char `dgroup'[varname] "group:" + * char `order'[varname] "obs:" + if ( `r(J)' > 1 ) { + local lopts subvarname noobs `options' + local lvars `dgroup' `order' `varlist' + list `lvars' if `dgroup', `lopts' + } + else { + list `order' `varlist' if `dgroup', subvarname noobs `options' + } + } +end + +capture program drop examplesList +program examplesList, sortpreserve + syntax, varlist(str) cmd(str) [ifin(str asis) gtools(str) noWARNing *] + + tempvar example Ngroup freq surplus dgroup order + + global GTOOLS_CALLER ghash + local opts missing gfunction(hash) `gtools' + local gopts gen(`dgroup') counts(`Ngroup') tag(`example') + + cap noi _gtools_internal `varlist' `ifin', `gopts' `opts' + global GTOOLS_CALLER "" + + if ( _rc == 17999 ) { + duplicates `cmd' `varlist' `ifin', `options' // fallback: rebuild a native -duplicates- call (`0' holds this helper's own option list) + exit 0 + } + else if ( _rc == 17001 ) { + error 2000 + } + else if ( _rc ) { + exit _rc + } + + * bail out now if no duplicates + if ( `r(J)' == `r(N)' ) { + di _n as txt "(0 observations are duplicates)" + exit 0 + } + else { + di _n as txt "`=`r(N)' - `r(J)'' observations are duplicates. Examples:" + } + + qui replace `dgroup' = 0 if ( `Ngroup' == 1 ) | mi(`dgroup') + gen long `order' = _n + sort `dgroup' `order' + + if ( "`cmd'" == "examples" ) { + char `order'[varname] "e.g.
obs:" + char `dgroup'[varname] "group:" + char `Ngroup'[varname] "#" + if ( `r(J)' > 1 ) { + local lopts subvarname noobs `options' + local lvars `dgroup' `Ngroup' `order' `varlist' + list `lvars' if `example' & `dgroup', `lopts' + } + else { + local lopts subvarname noobs `options' + local lvars `Ngroup' `order' `varlist' + list `lvars' if `example' & `dgroup', `lopts' + } + } + else if ( "`cmd'" == "list" ) { + char `order'[varname] "obs:" + char `dgroup'[varname] "group:" + * char `order'[varname] "obs:" + if ( `r(J)' > 1 ) { + local lopts subvarname noobs `options' + local lvars `dgroup' `order' `varlist' + list `lvars' if `dgroup', `lopts' + } + else { + local lopts subvarname noobs `options' + local lvars `order' `varlist' + list `lvars' if `dgroup', `lopts' + } + } + + * disp "{cmd}Warning: {txt}Performance gains are negligible without option {cmd}unsorted" +end diff --git a/01.code/ado/g/gduplicates.sthlp b/01.code/ado/g/gduplicates.sthlp new file mode 100755 index 0000000..0f23357 --- /dev/null +++ b/01.code/ado/g/gduplicates.sthlp @@ -0,0 +1,270 @@ +{smcl} +{* *! version 1.0.2 23Jan2019}{...} +{viewerdialog gduplicates "dialog gduplicates"}{...} +{vieweralsosee "[D] gduplicates" "mansection D gduplicates"}{...} +{viewerjumpto "Syntax" "gduplicates##syntax"}{...} +{viewerjumpto "Description" "gduplicates##description"}{...} +{viewerjumpto "Commands" "gduplicates##commands"}{...} +{viewerjumpto "Options" "gduplicates##options"}{...} +{title:Title} + + +{p2colset 5 23 25 2}{...} +{p2col :{cmd:gduplicates} {hline 2}}Efficiently report, tag, or drop duplicate observations using C plugins.{p_end} + +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{phang} +This is a fast option to Stata's {opt duplicates}.
+It is XX to XX times faster in Stata/IC and XX to XX times faster in MP + + +{phang} +Report duplicates + +{p 8 10 2} +{cmd:gduplicates} {opt r:eport} [{varlist}] {ifin} + + +{phang} +List one example for each group of duplicates + +{p 8 10 2} +{cmd:gduplicates} {opt e:xamples} [{varlist}] {ifin} +[{cmd:,} sorted {it:{help gduplicates##options:options}}] + + +{phang} +List all duplicates + +{p 8 10 2} +{cmd:gduplicates} {opt l:ist} [{varlist}] {ifin} +[{cmd:,} sorted {it:{help gduplicates##options:options}}] + +{pstd} +Option {opt sorted} is required to fully mimic {opt duplicates}; +otherwise, {opt gduplicates} will not sort the list of examples or the +full list of duplicates. This default behavior improves performance but +may be harder to read. + + +{phang} +Tag duplicates + +{p 8 10 2} +{cmd:gduplicates} {opt t:ag} [{varlist}] {ifin} +{cmd:,} {opth g:enerate(newvar)} + + +{phang} +Drop duplicates + +{p 8 10 2} +{cmd:gduplicates} {opt drop} {ifin} + +{p 8 10 2} +{cmd:gduplicates} {opt drop} {varlist} {ifin} +{cmd:, force} + + +{synoptset 23 tabbed}{...} +{marker options}{...} +{synopthdr} +{synoptline} +{syntab :Main} +{synopt :{opt c:ompress}}compress width of columns in both table and display formats{p_end} +{synopt :{opth forcestrl}}Skip binary variable check and force gtools to read strL variables.{p_end} +{synopt :{opt noc:ompress}}use display format of each variable{p_end} +{synopt :{opt fast}}synonym for {opt nocompress}; no delay in output of large datasets{p_end} +{synopt :{opt ab:breviate(#)}}abbreviate variable names to {it:#} characters; default is {cmd:ab(8)}{p_end} +{synopt :{opt str:ing(#)}}truncate string variables to {it:#} characters; default is {cmd:string(10)}{p_end} + +{syntab :Options} +{synopt :{opt t:able}}force table format{p_end} +{synopt :{opt d:isplay}}force display format{p_end} +{synopt :{opt h:eader}}display variable header once; default is table mode{p_end} +{synopt :{opt noh:eader}}suppress variable header{p_end} +{synopt :{opt 
h:eader(#)}}display variable header every {it:#} lines{p_end} +{synopt :{opt clean}}force table format with no divider or separator lines{p_end} +{synopt :{opt div:ider}}draw divider lines between columns{p_end} +{synopt :{opt sep:arator(#)}}draw a separator line every {it:#} lines; default is {cmd:separator(5)}{p_end} +{synopt :{opth sepby(varlist)}}draw a separator line whenever {it:varlist} values change{p_end} +{synopt :{opt nol:abel}}display numeric codes rather than label values{p_end} + +{syntab :Summary} +{synopt :{opt mean}[{cmd:(}{varlist}{cmd:)}]}add line reporting the mean for each of the (specified) variables{p_end} +{synopt :{opt sum}[{cmd:(}{varlist}{cmd:)}]}add line reporting the sum for each of the (specified) variables{p_end} +{synopt :{opt N}[{cmd:(}{varlist}{cmd:)}]}add line reporting the number of nonmissing values for each of the (specified) variables{p_end} +{synopt :{opth lab:var(varname)}}substitute {opt Mean}, {opt Sum}, or {opt N} for {it:varname} in last row of table{p_end} + +{syntab :Advanced} +{synopt :{opt con:stant}[{cmd:(}{varlist}{cmd:)}]}separate and list variables that are constant only once{p_end} +{synopt :{opt notr:im}}suppress string trimming{p_end} +{synopt :{opt abs:olute}}display overall observation numbers when using {opt by} {varlist}{cmd::}{p_end} +{synopt :{opt nodotz}}display numerical values equal to {opt .z} as field of blanks{p_end} +{synopt :{opt subvar:name}}substitute characteristic for variable name in header{p_end} +{synopt :{opt line:size(#)}}columns per line; default is {cmd:linesize(79)}{p_end} +{synoptline} +{p2colreset}{...} + + + +{marker description}{...} +{title:Description} + +{pstd} +{opt gduplicates} is a faster alternative to {help duplicates}. It can +replicate every sub-command of {opt duplicates}; that is, it reports, +displays, lists, tags, or drops duplicate observations, depending on +the subcommand. 
Duplicates are observations with identical values +either on all variables if no {varlist} is specified or on a specified +{it:varlist}. + +{pstd} +Note that for sub-commands {opt examples} and {opt list} the output is +{opt NOT} sorted by default. To mimic {opt duplicates} entirely, pass +option {opt sorted} when using those sub-commands. + +{pstd} +{opt gduplicates} is part of the {manhelp gtools R:gtools} project. +In order to pass {opt gtools} options, use {opth gtools(str)}. + +{marker commands}{...} +{title:Commands} + +{pstd} +{cmd:gduplicates report} produces a table showing observations +that occur as one or more copies and indicating how many observations are +"surplus" in the sense that they are the second (third, ...) copy of the first +of each group of duplicates. + +{pstd} +{cmd:gduplicates examples} lists one example for each group of +duplicated observations. Each example represents the first occurrence of each +group in the dataset. + +{pstd} +{cmd:gduplicates list} lists all duplicated observations. + +{pstd} +{cmd:gduplicates tag} generates a variable representing the number of +duplicates for each observation. This will be 0 for all +unique observations. + +{pstd} +{cmd:gduplicates drop} drops all but the first occurrence of each group +of duplicated observations. The word {opt drop} may not be abbreviated. + +{pstd} +Any observations that do not satisfy specified {opt if} and/or {opt in} +conditions are ignored when you use {opt report}, {opt examples}, {opt list}, +or {opt drop}. The variable created by {opt tag} will have +missing values for such observations. + + +{marker options_duplicates_examples}{...} +{title:Options for duplicates examples and duplicates list} + +{phang}{opt sort:ed} Sort the output list. By default the list is left +unsorted to improve performance. + +{dlgtab:Main} + +{phang} +{opt compress}, {opt nocompress}, {opt fast}, {opt abbreviate(#)}, +{opt string(#)}; see {manhelp list D}. 
+ +{dlgtab:Options} + +{phang} +{opt table}, {opt display}, {opt header}, {opt noheader}, {opt header(#)}, +{opt clean}, {opt divider}, {opt separator(#)}, {opth sepby(varlist)}, +{opt nolabel}; see {manhelp list D}. + +{dlgtab:Summary} + +{phang} +{opt mean}[{cmd:(}{varlist}{cmd:)}], {opt sum}[{cmd:(}{it:varlist}{cmd:)}], +{opt N}[{cmd:(}{it:varlist}{cmd:)}], {opt labvar(varname)}; see +{manhelp list D}. + +{dlgtab:Advanced} + +{phang} +{opt constant}[{cmd:(}{varlist}{cmd:)}], {opt notrim}, {opt absolute}, {opt nodotz}, {opt subvarname}, {opt linesize(#)}; see {manhelp list D}. + + +{marker option_duplicates_tag}{...} +{title:Option for duplicates tag} + +{phang} +{opth generate(newvar)} is required and specifies the name of a new variable +that will tag duplicates. + + +{marker option_duplicates_drop}{...} +{title:Option for duplicates drop} + +{phang} +{opt force} specifies that observations duplicated with respect to a named +{varlist} be dropped. The {cmd:force} option is required when such +a {it:varlist} is given as a reminder that information may be lost by dropping +observations, given that those observations may differ on any variable +not included in {it:varlist}. + + +{marker examples}{...} +{title:Examples} + +{pstd} +See {help duplicates##examples} or the +{browse "http://gtools.readthedocs.io/en/latest/usage/gduplicates/index.html#examples":online documentation} +for examples. 
+ + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + + +{title:Website} + +{pstd}{cmd:gduplicates} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:duplicates} +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + + +{title:Also see} + +{p 4 13 2} +help for +{help gduplicates}, +{help gisid}, +{help gtools} + diff --git a/01.code/ado/g/gegen.ado b/01.code/ado/g/gegen.ado new file mode 100755 index 0000000..4c175cc --- /dev/null +++ b/01.code/ado/g/gegen.ado @@ -0,0 +1,1029 @@ +* version 1.4.1 26Jan2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! implementation -egen- using C for faster processing + +/* + * syntax: + * gegen [type] varname = fun(args) [if] [in], [options] + * passed to fun are + * [type] varname = fun(args) [if] [in], [options] + */ + +/* + * stata's egen does not parse types correctly. If the requested result is + * a sum, stata will happily create a float, despite the risk of overflow. + * If the source variable is a double, stata will also create a float, even + * though that might cause a loss in precision. I do not imitate this behavior + * because I consider it flawed. I upgrade types whenever necessary. 
+ * + */ + +/* + * TODO: implement label, lname, and truncate for group + */ + +capture program drop gegen +program define gegen, byable(onecall) rclass + version 13.1 + + local 00 `0' + qui syntax anything(equalok) [if] [in] [aw fw iw pw], [by(str) *] + local byvars `by' + local 0 `00' + + * Parse weights + * ------------- + + local wgt = cond(`"`weight'"' != "", `"[`weight' `exp']"', "") + + * Parse egen call + * --------------- + + gettoken type 0 : 0, parse(" =(") + gettoken name 0 : 0, parse(" =(") + + if ( `"`name'"' == "=" ) { + local name `"`type'"' + local type : set type + local retype = 1 + local btype double + } + else { + gettoken eqsign 0 : 0, parse(" =(") + if ( `"`eqsign'"' != "=" ) { + error 198 + } + local btype `type' + local retype = 0 + } + + confirm name `name' + gettoken fcn 0: 0, parse(" =(") + gettoken args 0: 0, parse(" ,") match(par) + local fcn `fcn' + + if ( `"`par'"' != "(" ) exit 198 + if ( `"`fcn'"' == "total" ) local fcn sum + if ( `"`fcn'"' == "var" ) local fcn variance + if ( `"`fcn'"' == "sem" ) local fcn semean + if ( `"`fcn'"' == "seb" ) local fcn sebinomial + if ( `"`fcn'"' == "sep" ) local fcn sepoisson + if ( `"`fcn'"' == "kurt" ) local fcn kurtosis + if ( `"`fcn'"' == "skew" ) local fcn skewness + if ( `"`fcn'"' == "sum" ) local type `btype' + if ( regexm(`"`fcn'"', " ") ) local fcn: subinstr local fcn " " "|", all + if ( regexm(`"`fcn'"', "_") ) local fcn: subinstr local fcn "_" "|", all + + * Parse by call + * ------------- + + local warnby = 0 + if ( _by() ) { + local byvars `_byvars' + local warnby = 1 + } + + * Pre-compiled functions + * ---------------------- + + local funcs tag /// + group /// + total /// + sum /// + nansum /// + mean /// + geomean /// + sd /// + variance /// + cv /// + max /// + min /// + range /// + count /// + median /// + iqr /// + percent /// + first /// + last /// + firstnm /// + lastnm /// + semean /// + sebinomial /// + sepoisson /// + nunique /// + pctile /// + select /// + nmissing /// + 
skewness /// + gini /// + gini|dropneg /// + gini|keepneg /// + kurtosis + + * gegen aliases for other gtools functions + * ---------------------------------------- + + * NOTE: With retype, let the captured functions determine type + + local transforms rank /// + standardize /// + normalize /// + demean /// + demedian // + + local direct winsorize /// + winsor /// + residualize /// + hdfe // + + * NOTE(mauricio): Though you would want to allow by as prefix, it's + * difficult because the user can try to pass inputs assuming by: + * will produce a particular result, and it might not e.g. + * + * by var: gegen x = fcn(y[_n - 2]) + * + * fails with these (in this example, for non-transforms y[_n - 2], + * or whichever expression is passed, is generated into a variable + * with the correct by: predix. + + if ( `"`fcn'"' == "xtile" ) { + if ( _by() ) { + * Note I leave this here because I want to allow expressions. So + * I don't allow by... + disp as err "by: prefix not allowed with `fcn'" + exit 198 + } + if ( `retype' ) { + disp as err "warning: type ignored with gegen function xtile" + } + cap noi fasterxtile `name' = `args' `if' `in' `wgt', by(`byvars') `options' + exit _rc + } + + local shift = regexm(`"`fcn'"', "^shift[ |]*([+-]?[0-9]+)?[ |]*$") + local cumsum = regexm(`"`fcn'"', "^cumsum(.*)$") + local moving = regexm(`"`fcn'"', "^moving[ |]+([^ |]+)[ |]*([^ |]+)?[ |]*([^ |]+)?$") + local range = regexm(`"`fcn'"', "^range[ |]+([^ |]+)[ |]*([^ |]+)?[ |]*([^ |]+)?[ |]*([^ ]+)?$") + if ( `:list fcn in transforms' | `moving' | `range' | `cumsum' | `shift' ) { + cap confirm var `args' + if ( _rc ) { + disp as err `"`fcn' requires single variable input"' + exit _rc + } + unab args: `args' + if ( `:list sizeof args' != 1 ) { + disp as err `"`fcn' requires single variable input"' + exit 198 + } + if ( _by() ) { + disp as txt "performance wrning: -by- prefix may be slower than -by()-" + * disp as err "by: prefix not allowed with `fcn'" + * exit 198 + } + if ( `retype' ) 
local type + local options types(`type') `options' + cap noi gstats transform (`fcn') `name' = `args' `if' `in' `wgt', by(`byvars') `options' + exit _rc + } + + if ( `:list fcn in direct' ) { + if ( `"`fcn'"' == "winsorize" ) local fcn winsor + if ( `"`fcn'"' == "residualize" ) local fcn hdfe + cap confirm var `args' + if ( _rc ) { + disp as err `"`fcn' requires single variable input"' + exit _rc + } + unab args: `args' + if ( `:list sizeof args' != 1 ) { + disp as err `"`fcn' requires single variable input"' + exit 198 + } + if ( _by() ) { + disp as txt "performance wrning: -by- prefix may be slower than -by()-" + * disp as err "by: prefix not allowed with `fcn'" + * exit 198 + } + if ( `retype' ) { + disp as err "warning: type ignored with gegen function `fcn'" + } + local options gen(`name') `options' + cap noi gstats `fcn' `args' `if' `in' `wgt', by(`byvars') `options' + exit _rc + } + + * If function does not exist, fall back on egen + * --------------------------------------------- + + if !( `:list fcn in funcs' ) { + confirm new variable `name' + + if ( `"`c(adoarchive)'"' == "1" ) { + capture qui _stfilearchive find _g`fcn'.ado + if ( _rc ) { + di as error "`fcn'() is neither a gtools nor an egen function" + exit 133 + } + } + else { + capture qui findfile _g`fcn'.ado + if ( `"`r(fn)'"' == "" ) { + di as error "`fcn'() is neither a gtools nor an egen function" + exit 133 + } + } + + if ( `"`weight'"' != "" ) { + di as txt "`fcn'() is not a gtools function; falling back on egen" + di as err "weights are not allowed for egen-only functions" + exit 101 + } + + if ( `"`args'"' == "_all" ) | ( `"`args'"' == "*" ) { + unab args : _all + } + + local gtools_args HASHmethod(passthru) /// + oncollision(passthru) /// + Verbose /// + _subtract /// + _CTOLerance(passthru) /// + compress /// + forcestrl /// + NODS DS /// Parse - as varlist (ds) or negative (nods) + BENCHmark /// + BENCHmarklevel(passthru) /// + gtools_capture(str) + syntax [if] [in] [, `gtools_args' *] + 
+ if ( "`byvars'" == "" ) { + di as txt "`fcn'() is not a gtools function and no by(); falling back on egen" + cap noi egen `type' `name' = `fcn'(`args') `if' `in', `options' `gtools_capture' + exit _rc + } + else { + di as txt "`fcn'() is not a gtools function; will hash and use egen" + + local gopts `hashmethod' `oncollision' `verbose' `_subtract' `_ctolerance' + local gopts `gopts' `compress' `forcestrl' `benchmark' `benchmarklevel' `ds' `nods' + local popts _type(`type') _name(`name') _fcn(`fcn') _args(`args') _byvars(`byvars') + + * NOTE(mauricio): I don't think you need to do anything special if by() + * here because L50 to L67 of egen.ado just pass by() as an argument. + + cap noi egen_fallback `if' `in', kwargs(`gopts') `popts' `options' `gtools_capture' + exit _rc + } + } + + FreeTimer + local t97: copy local FreeTimer + gtools_timer on `t97' + global GTOOLS_CALLER gegen + + if ( `warnby' ) { + disp as txt "performance warning: -by- prefix may be slower than -by()-" + } + + * Parse syntax call if function is known + * -------------------------------------- + + * gegen [type] varname = fun(args) [if] [in], [options] + + syntax /// Main call was parsed manually + [if] [in] /// [if condition] [in start / end] + [aw fw iw pw] , /// [weight type = exp] + [ /// + by(str) /// Collapse by variabes: [+|-]varname [[+|-]varname ...] + /// + p(real 50) /// Percentile to compute, #.# (only with pctile). e.g. 
97.5 + n(int 0) /// nth smallest to select (negative for largest) + /// + missing /// for group(), tag(); does not get rid of missing values + counts(passthru) /// for group(), tag(); create `counts' with group counts + fill(str) /// for group(), tag(); fills rest of group with `fill' + /// + replace /// Replace target variable with output, if target already exists + noinit /// Do not initialize targets with missing values + /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + NODS DS /// Parse - as varlist (ds) or negative (nods) + Verbose /// Print info during function execution + _subtract /// (Undocumented) Subtract result from source variable + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// print function benchmark info + BENCHmarklevel(int 0) /// print plugin benchmark info + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + gtools_capture(passthru) /// Ignored (captures fcn options if fcn is not known) + /// + /// Unsupported egen options + /// ------------------------ + /// + Label /// + lname(passthru) /// + Truncate(passthru) /// + ] + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + local keepmissing = cond("`missing'" == "", "", "keepmissing") + + foreach opt in label lname truncate { + if ( `"``opt''"' != "" ) { + di as txt "Option -`opt'- is not implemented." 
+ exit 198 + } + } + + if ( "`gtools_capture'" != "" ) { + di as txt ("option -gtools_capture()- ignored with supported function `fcn')" + } + + local bench = ( "`benchmark'" != "" ) + + if ( ("`ds'" != "") & ("`nods'" != "") ) { + di as err "-ds- and -nods- mutually exclusive" + exit 198 + } + + * Parse weights + * ------------- + + if ( `:list posof "variance" in fcn' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "variance not allowed with pweights" + exit 135 + } + } + if ( `:list posof "cv" in fcn' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "cv not allowed with pweights" + exit 135 + } + } + if ( `:list posof "sd" in fcn' > 0 ) { + if ( `"`weight'"' == "pweight" ) { + di as err "sd not allowed with pweights" + exit 135 + } + } + if ( `:list posof "select" in fcn' > 0 ) { + if ( inlist(`"`weight'"', "iweight") ) { + di as err "select not allowed with `weight's" + exit 135 + } + } + if ( `:list posof "semean" in fcn' > 0 ) { + if ( inlist(`"`weight'"', "pweight", "iweight") ) { + di as err "semean not allowed with `weight's" + exit 135 + } + } + if ( `:list posof "sebinomial" in fcn' > 0 ) { + if ( inlist(`"`weight'"', "aweight", "iweight", "pweight") ) { + di as err "sebinomial not allowed with `weight's" + exit 135 + } + } + if ( `:list posof "sepoisson" in fcn' > 0 ) { + if ( inlist(`"`weight'"', "aweight", "iweight", "pweight") ) { + di as err "sepoisson not allowed with `weight's" + exit 135 + } + } + + if ( `"`weight'"' != "" ) { + tempvar w touse + qui gen double `w' `exp' `if' `in' + + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + local anywgt anywgt + + mark `touse' `if' `in' `wgt' + local ifin if `touse' `in' + } + else { + local wgt + local weights + local anywgt + mata st_local("ifin", st_local("if") + " " + st_local("in")) + } + + * Parse quantiles + * --------------- + + local ofcn `fcn' + if ( "`fcn'" == "pctile" ) { + local quantbad = !( (`p' < 100) & (`p' > 0) ) + if ( `quantbad' ) { + di as error 
"Invalid quantile: `p'; p() should be in (0, 100)" + cap timer clear `t97' + global GTOOLS_CALLER "" + exit 110 + } + local fcn p`p' + } + else if ( `p' != 50 ) { + di as err "Option {opt p()} not allowed" + cap timer clear `t97' + global GTOOLS_CALLER "" + exit 198 + } + + * Parse selection + * --------------- + + if ( "`fcn'" == "select" ) { + if ( `n' == 0 ) { + di as error "n() should be a positive or negative integer" + cap timer clear `t97' + global GTOOLS_CALLER "" + exit 110 + } + local fcn select`n' + } + else if ( `n' != 0 ) { + di as err "Option {opt n()} not allowed" + cap timer clear `t97' + global GTOOLS_CALLER "" + exit 198 + } + + * Target and stats + * ---------------- + + if ( "`replace'" == "" ) { + confirm new variable `name' + tempvar dummy + local rename rename `dummy' `name' + local addvar qui mata: st_addvar("`type'", "`dummy'") + local noobs "" + local retype = `retype' & 1 + } + else { + + * NOTE: Addvar should be "" with replace; the problem was that + * the internals did not empty the variable before writing to + * it. With if/in conditions, this caused problems because the + * variable was not set to missing outside the range, as it + * should. + * + * As a quickfix I thought I could just empty it before calling + * internals. However, this causesd two issues: The variable + * would be missing on error, and if the target is also a source, + * the source would be all misssing when read by the plugin! + * + * The easiest fix was to require the target to not be in the + * sources, but there was an easier fix! I already empty the + * targets fot gcollapse, so I simply set that boolean to true + * (init_targ) when gegen was called with replace! This impacts + * the check in lines 489-492. + + cap confirm new variable `name' + if ( _rc ) { + local dummy `name' + local rename "" + local addvar "" + local retype = `retype' & 0 + if "`init'" == "" local noobs qui replace `dummy' = . 
+ else local noobs "" + } + else { + tempvar dummy + local rename rename `dummy' `name' + local addvar qui mata: st_addvar("`type'", "`dummy'") + local retype = `retype' & 1 + local noobs "" + } + } + + local targets targets(`dummy') + local stats stats(`fcn') + + * If tag or group requested, then do that right away + * -------------------------------------------------- + + local opts `compress' `forcestrl' `_subtract' `_ctolerance' + local opts `opts' `verbose' `benchmark' `benchmarklevel' + local opts `opts' `oncollision' `hashmethod' `ds' `nods' + local sopts `counts' + + if ( inlist("`fcn'", "tag", "group") | (("`fcn'" == "count") & ("`args'" == "1")) ) { + if ( "`fill'" != "" ) local fill fill(`fill') + + if ( `"`weight'"' != "" ) { + di as txt "(weights are ignored for egen function {opt `fcn'})" + } + + gtools_timer info `t97' `"Plugin setup"', prints(`bench') off + + if ( "`fcn'" == "tag" ) { + local action tag(`type' `dummy') gfunction(hash) unsorted + local noobs qui replace `dummy' = 0 + } + + if ( inlist("`fcn'", "group", "count") ) { + if ( `=_N' < maxbyte() ) { + * All types are OK + } + else if ( `=_N' < `=2^24' ) { + if inlist("`type'", "byte") { + * byte is no longer OK; int, float still OK + local upgraded = cond(`retype', "", "`type'") + local type int + } + } + else if ( `=_N' < maxint() ) { + if inlist("`type'", "byte", "float") { + * byte and float no longer OK; int still OK + local upgraded = cond(`retype', "", "`type'") + local type int + } + } + else if ( `=_N' < maxlong() ) { + if inlist("`type'", "byte", "int", "float") { + * byte, float, int no longer OK; must upgrade to long + local upgraded = cond(`retype', "", "`type'") + local type long + } + } + else { + if ( "`type'" != "double" ) { + * Only double can maintain precision + local upgraded = cond(`retype', "", "`type'") + local type double + } + } + } + + if ( "`upgraded'" != "" ) { + disp "(warning: user-requested type '`upgraded'' upgraded to '`type'')" + } + + if ( "`fcn'" == 
"group" ) { + local action gen(`type' `dummy') gfunction(hash) countmiss + if ( `=_N' > 1 ) local s s + local noobs qui replace `dummy' = . + local notxt di as txt "(`=_N' missing value`s' generated)" + } + + if ( "`fcn'" == "count" ) { + local missing missing + local fill fill(group) + local action counts(`type' `dummy') gfunction(hash) countmiss unsorted + if ( `=_N' > 1 ) local s s + local noobs qui replace `dummy' = . + local notxt di as txt "(`=_N' missing value`s' generated)" + } + + if ( ("`byvars'" != "") & inlist("`fcn'", "tag", "group") ) { + di as err "egen ... `fcn'() may not be combined with with by" + global GTOOLS_CALLER "" + exit 190 + } + + if ( ("`byvars'" == "") & inlist("`fcn'", "tag", "group") ) { + local byvars `args' + } + + cap noi _gtools_internal `byvars' `ifin', `opts' `sopts' `action' `missing' `replace' `init' `fill' + local rc = _rc + global GTOOLS_CALLER "" + + if ( `rc' == 17999 ) { + local gtools_args `hashmethod' /// + `oncollision' /// + `verbose' /// + `_subtract' /// + `_ctolerance' /// + `compress' /// + `forcestrl' /// + `nods' `ds' /// + `benchmark' /// + `benchmarklevel' /// + * `gtools_capture' + local gtools_opts `counts' fill(`fill') `replace' `init' p(`p') `missing' + collision_fallback, gtools_call(`"`type' `name' = `fcn'(`args') `ifin'"') `gtools_args' `gtools_opts' + exit 0 + } + else if ( `rc' == 17001 ) { + if ( "${GTOOLS_DUPS}" == "" ) { + if ( `=_N' > 0 ) { + `noobs' + `notxt' + } + `rename' + exit 0 + } + else { + error 2000 + } + } + else if ( `rc' ) { + exit `rc' + } + + return scalar N = `r(N)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' + + `rename' + exit 0 + } + + * Parse source(s) + * --------------- + + unab memvars: _all + + local rc = 0 + if ( !((`:list sizeof args' == 1) & (`:list args in memvars')) ) { + tempvar exp + if ( _by() ) { + cap by `_byvars': gen double `exp' = `args' + } + else { + cap gen double `exp' = `args' + if ( (_rc == 0) & ("`byvars'" 
!= "") ) { + mata printf("{bf:warning}: gegen is {bf:NOT} parsing the expression '%s' by group.\n", st_local("args")) + mata printf("To parse this expression by group, call gegen using the -by:- prefix.\n") + } + } + local rc = _rc + } + + if ( ((`:list sizeof args' == 1) & (`:list args in memvars')) | `rc' ) { + cap ds `args' + if ( _rc ) { + global GTOOLS_CALLER "" + di as error "Invalid call; please specify {opth `ofcn'(varlist)} or {opth `ofcn'(exp)}." + exit 198 + } + else { + local sametype 1 + local sources `r(varlist)' + cap confirm numeric v `sources' + if ( _rc ) { + global GTOOLS_CALLER "" + di as err "{opth `ofcn'(varlist)} must call a numeric variable list." + exit _rc + } + + * See notes in lines 294-310 + * if ( "`:list sources & dummy'" != "" ) { + * if ( "`replace'" != "" ) local extra " even with -replace-" + * di as error "Variable `dummy' canot be a source and a target`extra'" + * exit 198 + * } + } + } + else if ( `rc' == 0 ) { + local sources `exp' + local sametype 0 + } + + if ( `"`ofcn'"' == "nunique" ) { + if ( `:list sizeof sources' != 1 ) { + global GTOOLS_CALLER "" + disp as err `"`fcn' requires single variable input"' + exit 198 + } + } + + * cap ds `args' + * if ( _rc == 0 ) { + * local sametype 1 + * local sources `r(varlist)' + * cap confirm numeric v `sources' + * if ( _rc ) { + * global GTOOLS_CALLER "" + * di as err "{opth `ofcn'(varlist)} must call a numeric variable list." + * exit _rc + * } + * } + * else { + * local sametype 0 + * tempvar exp + * cap gen double `exp' = `args' + * if ( _rc ) { + * global GTOOLS_CALLER "" + * di as error "Invalid call; please specify {opth `ofcn'(varlist)} or {opth `ofcn'(exp)}." 
+ * + * exit 198 + * } + * local sources `exp' + * } + + * Parse target type + * ----------------- + + * if ( ("`addvar'" != "") & `retype' ) { + if ( `retype' ) { + parse_target_type `sources', fcn(`ofcn') sametype(`sametype') `anywgt' + local type = "`r(retype)'" + local addvar qui mata: st_addvar("`type'", "`dummy'") + } + + + * Parse counts into freq for gfunction call + * ----------------------------------------- + + if ( "`counts'" != "" ) { + local 0, `counts' + syntax, [counts(str)] + + gettoken ftype fname: counts + if ( "`fname'" == "" ) { + local fname `ftype' + if ( `=_N < maxlong()' ) local ftype long + else local ftype double + } + + cap confirm new variable `fname' + if ( _rc ) { + local rc = _rc + if ( "`replace'" == "" ) { + global GTOOLS_CALLER "" + di as err "Variable `fname' exists; try a different name or run with -replace-" + exit `rc' + } + else if ( ("`replace'" != "") & ("`addvar'" != "") ) { + qui replace `fname' = . + local replace "" + } + } + else { + if ( "`addvar'" == "" ) { + local addvar qui mata: st_addvar("`ftype'", "`counts'") + } + else { + local addvar qui mata: st_addvar(("`type'", "`ftype'"), ("`name'", "`counts'")) + local replace "" + } + } + + local counts freq(`counts') + } + + * Call the plugin + * --------------- + + local unsorted = cond("`fill'" == "data", "", "unsorted") + gtools_timer info `t97' `"Plugin setup"', prints(`bench') off + + `addvar' + local action sources(`sources') `targets' `stats' fill(`fill') `counts' countmiss + cap noi _gtools_internal `byvars' `ifin', `unsorted' `opts' `action' `weights' missing `keepmissing' `replace' `init' + local rc = _rc + global GTOOLS_CALLER "" + + if ( `rc' == 17999 ) { + if ( `"`weight'"' != "" ) { + di as err "Cannot use fallback with weights." 
+ exit 17000 + } + local gtools_args `hashmethod' /// + `oncollision' /// + `verbose' /// + `_subtract' /// + `_ctolerance' /// + `compress' /// + `forcestrl' /// + `nods' `ds' /// + `benchmark' /// + `benchmarklevel' /// + `gtools_capture' + local gtools_opts `counts' fill(`fill') `replace' `init' p(`p') `missing' + collision_fallback, gtools_call(`"`type' `name' = `fcn'(`args') `ifin'"') `gtools_args' `gtools_opts' + exit 0 + } + else if ( `rc' == 17001 ) { + if ( "${GTOOLS_DUPS}" == "" ) { + `noobs' + `rename' + exit 0 + } + else { + error 2000 + } + } + else if ( `rc' ) exit `rc' + + return scalar N = `r(N)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' + + `rename' + exit 0 +end + +capture program drop egen_fallback +program egen_fallback, sortpreserve + syntax [if] [in], /// + [ /// + _type(str) /// + _name(str) /// + _fcn(str) /// + _args(str) /// + _byvars(str) /// + by(passthru) /// + kwargs(str) /// + * /// + ] + + tempvar dummy + global EGEN_Varname `_name' + global EGEN_SVarname `_sortindex' + + local cvers = _caller() + if ( "`_fcn'" == "mode" | "`_fcn'" == "concat" ) { + local vv : display "version " string(`cvers') ", missing:" + } + + if ( "`: sortedby'" == "`_byvars'" ) { + local byid `: sortedby' + } + else { + tempvar byid + hashsort `_byvars', gen(`byid') sortgen skipcheck `kwargs' + } + + capture noisily `vv' _g`_fcn' `_type' `dummy' = (`_args') `if' `in', by(`byid') `options' + global EGEN_SVarname + global EGEN_Varname + if ( _rc ) exit _rc + + quietly count if missing(`dummy') + if ( `r(N)' ) { + local s = cond(r(N) > 1, "s", "") + di in bl "(" r(N) " missing value`s' generated)" + } + rename `dummy' `_name' + exit 0 +end + +capture program drop gtools_timer +program gtools_timer, rclass + syntax anything, [prints(int 0) end off] + tokenize `"`anything'"' + local what `1' + local timer `2' + local msg `"`3'; "' + + * If timer is 0, then there were no free timers; skip this benchmark + if ( 
`timer' == 0 ) exit 0 + + if ( inlist("`what'", "start", "on") ) { + cap timer off `timer' + cap timer clear `timer' + timer on `timer' + } + else if ( inlist("`what'", "info") ) { + timer off `timer' + qui timer list + return scalar t`timer' = `r(t`timer')' + return local pretty`timer' = trim("`:di %21.4gc r(t`timer')'") + if ( `prints' ) { + di `"`msg'`:di trim("`:di %21.4gc r(t`timer')'")' seconds"' + } + timer off `timer' + timer clear `timer' + timer on `timer' + } + + if ( "`end'`off'" != "" ) { + timer off `timer' + timer clear `timer' + } +end + +capture program drop parse_target_type +program parse_target_type, rclass + syntax varlist, fcn(str) sametype(int) [anywgt] + + gettoken var restvars: varlist + + local maxtype: type `var' + encode_vartype `maxtype' + local maxcode `r(typecode)' + + foreach var in `restvars' { + local stype: type `var' + encode_vartype `stype' + local scode `r(typecode)' + if ( `scode' > `maxcode' ) { + local maxtype `stype' + local maxcode `scode' + } + } + + if ( `sametype' ) local retype_A `maxtype' + else local retype_A: set type + + if ( "`maxtype'" == "double" ) local retype_B double + else local retype_B: set type + + if ( `=_N < maxlong()' & ("`anywgt'" == "") ) local retype_C long + else local retype_C double + + if ( `"`maxtype'"' == "byte" ) { + local retype_D int + } + else if ( `"`maxtype'"' == "int" ) { + local retype_D long + } + else if ( `"`maxtype'"' == "long" ) { + local retype_D double + } + else if ( `"`maxtype'"' == "float" ) { + local retype_D double + } + else if ( `"`maxtype'"' == "double" ) { + local retype_D double + } + + if ( "`fcn'" == "tag" ) return local retype = "byte" + if ( "`fcn'" == "group" ) return local retype = "`retype_C'" + if ( "`fcn'" == "total" ) return local retype = "double" + if ( "`fcn'" == "sum" ) return local retype = "double" + if ( "`fcn'" == "nansum" ) return local retype = "double" + if ( "`fcn'" == "mean" ) return local retype = "`retype_B'" + if ( "`fcn'" == "geomean" ) 
return local retype = "`retype_B'" + if ( "`fcn'" == "sd" ) return local retype = "`retype_B'" + if ( "`fcn'" == "variance" ) return local retype = "`retype_B'" + if ( "`fcn'" == "cv" ) return local retype = "`retype_B'" + if ( "`fcn'" == "max" ) return local retype = "`retype_A'" + if ( "`fcn'" == "min" ) return local retype = "`retype_A'" + if ( "`fcn'" == "range" ) return local retype = "`retype_D'" + if ( "`fcn'" == "select" ) return local retype = "`retype_A'" + if ( "`fcn'" == "count" ) return local retype = "`retype_C'" + if ( "`fcn'" == "median" ) return local retype = "`retype_B'" + if ( "`fcn'" == "iqr" ) return local retype = "`retype_B'" + if ( "`fcn'" == "percent" ) return local retype = "`retype_B'" + if ( "`fcn'" == "first" ) return local retype = "`retype_A'" + if ( "`fcn'" == "last" ) return local retype = "`retype_A'" + if ( "`fcn'" == "firstnm" ) return local retype = "`retype_A'" + if ( "`fcn'" == "lastnm" ) return local retype = "`retype_A'" + if ( "`fcn'" == "semean" ) return local retype = "`retype_B'" + if ( "`fcn'" == "sebinomial" ) return local retype = "`retype_B'" + if ( "`fcn'" == "sepoisson" ) return local retype = "`retype_B'" + if ( "`fcn'" == "pctile" ) return local retype = "`retype_B'" + if ( "`fcn'" == "nunique" ) return local retype = "`retype_C'" + if ( "`fcn'" == "nmissing" ) return local retype = "`retype_C'" + if ( "`fcn'" == "skewness" ) return local retype = "`retype_B'" + if ( "`fcn'" == "kurtosis" ) return local retype = "`retype_B'" + if ( "`fcn'" == "gini" ) return local retype = "`retype_B'" + if ( "`fcn'" == "gini|dropneg" ) return local retype = "`retype_B'" + if ( "`fcn'" == "gini|keepneg" ) return local retype = "`retype_B'" +end + +capture program drop encode_vartype +program encode_vartype, rclass + args vtype + if ( "`vtype'" == "byte" ) return scalar typecode = 1 + else if ( "`vtype'" == "int" ) return scalar typecode = 2 + else if ( "`vtype'" == "long" ) return scalar typecode = 3 + else if ( "`vtype'" == 
"float" ) return scalar typecode = 4 + else if ( "`vtype'" == "double" ) return scalar typecode = 5 + else return scalar typecode = 0 +end + +capture program drop collision_fallback +program collision_fallback + local gtools_args HASHmethod(passthru) /// + oncollision(passthru) /// + Verbose /// + _subtract /// + _CTOLerance(passthru) /// + compress /// + forcestrl /// + NODS DS /// + BENCHmark /// + BENCHmarklevel(passthru) /// + gtools_capture(str) + + syntax, [`gtools_args' gtools_call(str) counts(str) fill(str) replace *] + foreach opt in counts fill replace { + if ( `"``opt''"' != "" ) { + di as err "Cannot use fallback with option {opt `opt'}." + exit 17000 + } + } + egen `gtools_call', `options' +end + +capture program drop FreeTimer +program FreeTimer + qui { + timer list + local i = 99 + while ( (`i' > 0) & ("`r(t`i')'" != "") ) { + local --i + } + } + c_local FreeTimer `i' +end diff --git a/01.code/ado/g/gegen.sthlp b/01.code/ado/g/gegen.sthlp new file mode 100755 index 0000000..78ff0f2 --- /dev/null +++ b/01.code/ado/g/gegen.sthlp @@ -0,0 +1,383 @@ +{smcl} +{* *! version 1.4.2 30Jan2020}{...} +{viewerdialog gegen "dialog gegen"}{...} +{vieweralsosee "[R] gegen" "mansection R gegen"}{...} +{viewerjumpto "Syntax" "gegen##syntax"}{...} +{viewerjumpto "Description" "gegen##description"}{...} +{viewerjumpto "Options" "gegen##options"}{...} +{viewerjumpto "Stored results" "gegen##results"}{...} +{title:Title} + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gegen} {hline 2}}Efficient implementation of by-able egen functions using C.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. 
+ +{marker syntax}{...} +{title:Syntax} + +{p 8 14 2} +{cmd:gegen} {dtype} {newvar} {cmd:=} {it:fcn}({it:arguments}) {ifin} +[{it:{help gegen##weight:weight}}] +[{cmd:,} +{opt replace} +{it:fcn_options} +{help gegen##gtools_options:gtools_options}] + +{synoptset 21 tabbed}{...} +{marker gtools_options}{...} +{synopthdr} +{synoptline} +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench:mark}}Benchmark various steps of the plugin. +{p_end} +{synopt :{opt bench:marklevel(int)}}Benchmark various steps of the plugin. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} +{synopt :{opth gtools_capture(str)}}The above options are captured and not passed to {opt egen} in case the requested function is not internally supported by gtools. You can pass extra arguments here if their names conflict with captured gtools options. +{p_end} +{synoptline} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s are +allowed for the functions listed below and mimic {cmd:collapse} and +{cmd:gcollapse}; see {help weight} and {help collapse##weights:Weights (collapse)}. +{opt pweight}s may not be used with {opt sd}, {opt variance}, {opt cv}, {opt semean}, +{opt sebinomial}, or {opt sepoisson}. {opt iweight}s may not be used +with {opt semean}, {opt sebinomial}, or {opt sepoisson}. {opt aweight}s +may not be used with {opt sebinomial} or {opt sepoisson}.{p_end} + +{pstd} +The following are simply wrappers for other {it:gtools} functions. They +all allow {opth by(varlist)} as an option. Consult each command's +corresponding help files for details. 
(Note that {cmd:gstats transform} +in particular allows embedding options in the statistic call rather +than program arguments; while this is technically also possible to do +through {cmd:gegen}, I do not recommend it. Instead, use {opt window()} +with {it:moving_stat}, {opt interval()} with {it:range_stat}, {opt cumby()} +with {it:cumsum}, and so on.) In the table, {it:stat} can be replaced +with any stat available to {cmd:gcollapse} except percent, {it:nunique}: + + {opt function} -> {opt calls} + {hline 40} + {opth xtile(exp)} -> {help fasterxtile} + {opth standardize(varname)} -> {help gstats transform} + {opth normalize(varname)} -> {help gstats transform} + {opth demean(varname)} -> {help gstats transform} + {opth demedian(varname)} -> {help gstats transform} + {opth moving_stat(varname)} -> {help gstats transform} + {opth range_stat(varname)} -> {help gstats transform} + {opth cumsum(varname)} -> {help gstats transform} + {opth shift(varname)} -> {help gstats transform} + {opth rank(varname)} -> {help gstats transform} + {opth winsor(varname)} -> {help gstats winsor} + {opth winsorize(varname)} -> {help gstats winsor} + +{pstd} +The functions listed below have been compiled and hence will run very quickly. +Functions not listed here hash the data and then call {opt egen} with {opth by(varlist)} +set to the hash, which is often faster than calling {opt egen} directly, but not always. +Natively supported functions should always be faster, however. They are: + +{phang2} +{opth group(varlist)} [{cmd:,} {opt m:issing} {opth counts(newvarname)} {opth fill(real)}]{p_end} +{pmore2} +may not be combined with {cmd:by}. It creates one variable taking on +values 1, 2, ... for the groups formed by {it:varlist}. {it:varlist} may +contain numeric variables, string variables, or a combination of the two. +The default order of the groups is the sort order of the {it:varlist}. 
+However, the user can specify: + +{pmore3} +[{cmd:+}|{cmd:-}] +{varname} +[[{cmd:+}|{cmd:-}] +{varname} {it:...}] + +{pmore2} +And the order will be inverted for variables that have {cmd:-} prepended. +{opt missing} indicates that missing values in {it:varlist} +{bind:(either {cmd:.} or {cmd:""}}) are to be treated like any other value +when assigning groups, instead of as missing values being assigned to the +group missing. + +{pmore2} +You can also specify {opt counts()} to generate a new variable with the number +of observations per group; by default all observations within a group are +filled with the count, but via {opt fill()} the user can specify the value +the variable will take after the first observation that appears within a +group. The user can also specify {opt fill(data)} to fill the first J{it:th} +observations with the count per group (in the sorted group order) or +{opt fill(group)} to keep the default behavior. + +{phang2} +{opth tag(varlist)} [{cmd:,} {opt m:issing}]{p_end} +{pmore2} +may not be combined with {cmd:by}. It tags just 1 observation in each +distinct group defined by {it:varlist}. When all observations in a group have +the same value for a summary variable calculated for the group, it will be +sufficient to use just one value for many purposes. The result will be 1 if +the observation is tagged and never missing, and 0 otherwise. + +{pmore2} +Note values for any observations excluded by either {helpb if} or {helpb in} +are set to 0 (not missing). Hence, if {opt tag} is the variable +produced by {cmd:egen tag =} {opt tag(varlist)}, the idiom {opt if tag} +is always safe. {opt missing} specifies that missing values of {it:varlist} +may be included. + + {opth first|last|firstnm|lastnm(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the first, last, first non-missing, and last non-missing +observation. 
The functions are analogous to those in {opt collapse} and {opt not} to those in {opt egenmore}. + + {opth count(exp)} {right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the number of nonmissing +observations of {it:exp}. + + {opth nunique(exp)} {right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the number of unique +observations of {it:exp}. + + {opth iqr(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the interquartile range of +{it:exp}. Also see {help gegen##pctile():{bf:pctile()}}. + + {opth max(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the maximum value +of {it:exp}. + +{marker mean()}{...} + {opth mean(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the mean of +{it:exp}. + +{marker geomean()}{...} + {opth geomean(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the geometric mean of +{it:exp}. If {it:exp} has negative values, the function returns missing (.). +If {it:exp} has any zeros, the function returns zero. + +{marker median()}{...} + {opth median(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the median of +{it:exp}. Also see {help gegen##pctile():{bf:pctile()}}. + + {opth min(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the minimum value +of {it:exp}. + + {opth range(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the value range of {it:exp}. 
+ +{marker select()}{...} + {opth select(exp)} {cmd:, n(}{it:#}|{it:-#}{cmd:)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the {it:#}th smallest +value of {it:exp}. To compute the {it:#}th largest +value, prefix a negative sign, {it:-#}. Note that without weights, +{opt n(1)} and {opt n(-1)} will give the same value as {opt min} and +{opt max}, respectively. + +{marker pctile()}{...} + {opth pctile(exp)} [{cmd:, p(}{it:#}{cmd:)}]{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the {it:#}th percentile +of {it:exp}. If {opt p(#)} is not specified, 50 is assumed, meaning medians. +Also see {help gegen##median():{bf:median()}}. + + {opth sd(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the standard +deviation of {it:exp}. Also see {help gegen##mean():{bf:mean()}}. + + {opth variance(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the variance +of {it:exp}. Also see {help gegen##sd():{bf:sd()}}. + + {opth cv(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the coefficient +of variation of {it:exp}; {opt sd/mean}. Also see {help gegen##sd():{bf:sd()}} and +{help gegen##mean():{bf:mean()}}. + + {opth percent(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the percent of +non-missing observations of {it:exp} in the group relative to the sample. + + {opth semean(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the standard +error of the mean of {it:exp}, (sd/sqrt(n)). 
+ + {opth sebinomial(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the standard +error of the mean of {it:exp}, binomial (sqrt(p(1-p)/n)) (missing if +{it:exp} not 0, 1). + + {opth sepoisson(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the standard +error of the mean of {it:exp}, Poisson (sqrt(mean / n)) (missing if +{it:exp} is negative; result rounded to nearest integer) + + {opth skewness(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the skewness of {it:exp} + + {opth kurtosis(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the kurtosis of {it:exp} + + {opth sum(exp)} [{cmd:,} {opt m:issing}] {right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } + {opth total(exp)} [{cmd:,} {opt m:issing}] {right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within {it:varlist}) containing the sum of {it:exp} +treating missing as 0. If {opt missing} is specified and all values in +{it:exp} are missing, {it:newvar} is set to missing. Also see +{help gegen##mean():{bf:mean()}}. + + {opth gini(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } + {opth gini|dropneg(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } + {opth gini|keepneg(exp)}{right:(allows {help by:{bf:by} {it:varlist}{bf::}}) } +{pmore2} +creates a constant (within varlist) containing the Gini coefficient of +exp, truncating negative values to 0. {opt gini|dropneg} drops negative +values, and {opt gini|keepneg} keeps negative values as is (the user +is responsible for the interpretation of the Gini coefficient in this case). 
+{marker description}{...} +{title:Description} + +{pstd} +{cmd:gegen} creates {newvar} of the optionally specified storage type +equal to {it:fcn}{cmd:(}{it:arguments}{cmd:)}. Here {it:fcn}{cmd:()} is either +one of the internally supported commands above or a by-able function written +for {cmd:egen}, as documented above. Only {cmd:egen} functions or internally +supported functions may be used with {cmd:gegen}. If you want to generate +multiple summary statistics from a single variable it may be faster to use +{opt gcollapse} with the {opt merge} option. + +{pstd} +Depending on {it:fcn}{cmd:()}, {it:arguments}, if present, refers to an +expression, {varlist}, or a {it:{help numlist}}, and the {it:options} +are similarly {it:fcn} dependent. + +{marker memory}{...} +{title:Out of memory} + +{pstd} +(See also Stata's own discussion: {help memory:help memory}.) + +{pstd} +There are many reasons for why an OS may run out of memory. The best-case +scenario is that your system is running some other memory-intensive program. +This is especially likely if you are running your program on a server, where +memory is shared across all users. In this case, you should attempt to re-run +{it:gegen} once other memory-intensive programs finish. + +{pstd} +If no memory-intensive programs were running concurrently, the second best-case +scenario is that your user has a memory cap that your programs can use. Again, +this is especially likely on a server, and even more likely on a computing grid. +If you are on a grid, see if you can increase the amount of memory your programs +can use (there is typically a setting for this). If your cap was set by a system +administrator, consider contacting them and asking for a higher memory cap. + +{pstd} +If you have no memory cap imposed on your user, the likely scenario is that +your system cannot allocate enough memory for {it:gegen}. 
At this point you +have two options: One option is to try {it:fegen} or {it:egen}, which are +slower but using either should require a trivial one-letter change to the +code; another option is to run egen on the data in segments (the easiest +way to do this would be to run egen on a portion of the rows at a time and +perform a series of append statements at the end). + +{pstd}If you have no memory cap imposed on your user, the likely scenario is +that your system cannot allocate enough memory for {it:gegen}. At this +point you can try {it:fegen} or {it:egen}, which are slower but using +either should require a trivial one-letter change to the code. Note, +however, that replacing {it:gegen} with {it:fegen} or plain {it:egen} +is not guaranteed to use less memory. I have not benchmarked memory use +very extensively, so {it:gegen} might use less memory (I doubt that is +the case in most scenarios, but it is possible). + +{pstd} +You can also try to process the data by segments. However, if you are +doing group operations you would need to first sort the data and make +sure you are not splitting groups apart. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gegen/index.html#examples":online documentation} +for examples. + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gegen} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file +for {it:egen}. 
+{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gcollapse}, +{help gtools}; +{help fegen} (if installed), +{help fcollapse} (if installed), +{help ftools} (if installed) +{p_end} + diff --git a/01.code/ado/g/gglm.ado b/01.code/ado/g/gglm.ado new file mode 100755 index 0000000..e1d6611 --- /dev/null +++ b/01.code/ado/g/gglm.ado @@ -0,0 +1,28 @@ +*! version 0.2.0 15Jun2021 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Estimate GLM via IRLS by group and with HDFE + +cap program drop gglm +program gglm, rclass + version 13.1 + + local 00: copy local 0 + if ( strpos(`"`0'"', ",") > 0 ) { + local comma + } + else { + local comma , + } + gregress `0' `comma' glm + if ( ${GREG_RC} ) { + global GREG_RC + exit 0 + } + local 0: copy local 00 + + return local levels `"`r(levels)'"' + return scalar N = r(N) + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) +end + diff --git a/01.code/ado/g/gglm.sthlp b/01.code/ado/g/gglm.sthlp new file mode 100755 index 0000000..0f3072f --- /dev/null +++ b/01.code/ado/g/gglm.sthlp @@ -0,0 +1,287 @@ +{smcl} +{* *! 
version 0.2.0 15Jun2021}{...} +{viewerdialog gglm "dialog gglm"}{...} +{vieweralsosee "[R] gglm" "mansection R gglm"}{...} +{viewerjumpto "Syntax" "gglm##syntax"}{...} +{viewerjumpto "Description" "gglm##description"}{...} +{viewerjumpto "Methods and Formulas" "gglm##methods_and_formulas"}{...} +{viewerjumpto "Examples" "gglm##examples"}{...} +{title:Title} + +{p2colset 5 18 24 2}{...} +{p2col :{cmd:gglm} {hline 2}} GLM (via IRLS) by group with weights, clustering, and HDFE{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Warning}: {opt gglm} is in beta and meant for testing; use in production {bf:NOT} recommended. (To enable beta features, define {cmd:global GTOOLS_BETA = 1}.) + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt gglm} +{depvar} +{indepvars} +{ifin} +[{it:{help gglm##weight:weight}}] +[{cmd:,} {opth by(varlist)} {opth absorb(varlist)} {it:{help gglm##table_options:options}}] + +{pstd} +Support for different link functions may be added in future releases. +At the moment only the canonical link for each {opt family()} is available. + +{pstd} +By default, results are saved into a mata class object named after the +model estimated: {opt GtoolsLogit} when the logit link is used (default +with {opt family(binomial)}), {opt GtoolsPoisson} when the poisson link +is used (default with {opt family(poisson)}), etc. For details, the +{opt desc()} method is available, e.g. {opt mata GtoolsLogit.desc()}. +The name and contents can be modified via the {opt mata()} option. The +results can also be saved into variables via {opt gen()} or {opt prefix()} +(either can be combined with {opt mata()}, but not each other). + +{pstd} +Run {opt mata GtoolsLogit.desc()} for details; the name and contents can be +modified via {opt mata()}. 
The results can also be saved into variables +via {opt gen()} or {opt prefix()} (either can be combined with {opt +mata()}, but not each other). + +{pstd} +Note that extended varlist syntax is {bf:not} supported. Further, +{opt fweight}s behave differently than other weighting schemes; that +is, this assumes that the weight refers to the number of available +{it:observations}. + +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Save Results} +{synopt:{opt mata(name, [nob nose])}}Specify name of output mata object and whether to save {bf:b} and {bf:se} +{p_end} +{synopt:{opt gen(...)}}Specify any of {opth b(varlist)}, {opth se(varlist)}, and {opth hdfe(varlist)}. One per covariate is required ({opt hdfe()} also requires one for the dependent variable). +{p_end} +{synopt:{opt prefix(...)}}Specify any of {opth b(str)}, {opth se(str)}, and {opth hdfe(str)}. A single prefix is allowed. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} + +{syntab :Options} +{synopt:{opth by(varname)}}Group statistics by variable. +{p_end} +{synopt:{opt robust}}Robust SE. +{p_end} +{synopt:{opth cluster(varlist)}}One-way or nested cluster SE. +{p_end} +{synopt:{opth absorb(varlist)}}Multi-way high-dimensional fixed effects. +{p_end} +{synopt:{opth hdfetol(real)}}Tolerance level for HDFE algorithm (default 1e-8). +{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm. 
+{p_end} +{synopt:{opt noc:onstant}}Whether to add a constant (cannot be combined with {opt absorb()}). +{p_end} +{synopt:{opth glmtol(real)}}Tolerance level for IRLS algorithm (default 1e-8). +{p_end} +{synopt:{opth glmiter(int)}}Maximum number of iterations for IRLS (default 1000). +{p_end} + +{syntab:Gtools} +{synopt:{opt compress}}Try to compress strL to str#. +{p_end} +{synopt:{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt:{opt v:erbose}}Print info during function execution. +{p_end} +{synopt:{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt:{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt:{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gglm} estimates GLM via IRLS, optionally weighted, by group, with +cluster SE, and/or with multi-way high-dimensional fixed effects. +The results are by default saved into a mata object (e.g. {opt GtoolsLogit}, +{opt GtoolsPoisson}, and so on; run {opt mata GtoolsLogit.desc()} for details). +The following data is stored: + + regression info + --------------- + + string scalar caller + model used; "glogit", "gpoisson", etc. 
+ + real scalar kx + number of (non-absorbed) covariates + + real scalar cons + whether a constant was added automagically + + real scalar saveb + whether b was stored + + real matrix b + J by kx matrix with regression coefficients + + real scalar savese + whether se was stored + + real matrix se + J by kx matrix with corresponding standard errors + + string scalar setype + type of SE computed (homoskedastic, robust, or cluster) + + real scalar absorb + whether any FE were absorbed + + string colvector absorbvars + variables absorbed as fixed effects + + string colvector njabsorb + number of FE to be absorbed for each variable and by level + + string colvector savenjabsorb + whether njabsorb is stored + + string colvector clustervars + cluster variables + + string colvector njcluster + number of clusters per by level + + string colvector savenjcluster + whether njcluster is stored + + real scalar by + whether there were any grouping variables + + string rowvector byvars + grouping variable names + + real scalar J + number of levels defined by grouping variables + + class GtoolsByLevels ByLevels + grouping variable levels; see GtoolsLogit.ByLevels.desc() for details + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar ByLevels.anyvars + 1: any by variables; 0: no by variables + + real scalar ByLevels.anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector ByLevels.byvars + by variable names + + real scalar ByLevels.kby + number of by variables + + real scalar ByLevels.rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar ByLevels.J + number of levels + + real matrix ByLevels.numx + numeric by variables + + string matrix ByLevels.charx + string by variables + + real scalar ByLevels.knum + number of numeric by variables + + real scalar ByLevels.kchar + number of string by variables + + real rowvector ByLevels.lens + > 0: length of string by 
variables; <= 0: internal code for numeric variables + + real rowvector ByLevels.map + map from index to numx and charx + +{marker methods_and_formulas}{...} +{title:Methods and Formulas} + +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gglm/index.html#methods-and-formulas":online documentation} +for details. + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gglm/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gglm} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/gglm/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/gisid.ado b/01.code/ado/g/gisid.ado new file mode 100755 index 0000000..34f561b --- /dev/null +++ b/01.code/ado/g/gisid.ado @@ -0,0 +1,60 @@ +*! version 1.1.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
-isid- implementation using C for faster processing + +capture program drop gisid +program gisid // parses isid-style options and hands the check to _gtools_internal with gfunction(isid) + version 13.1 + + global GTOOLS_CALLER gisid // caller name kept in a global; cleared again after the internal call + syntax varlist /// Variables to check + [if] [in] , /// [if condition] [in start / end] + [ /// + Missok /// Missing values in varlist are OK + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _keepgreshape /// (Undocumented) Keep greshape scalars + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + debug(passthru) /// Print debugging info to console + /// + /// Unsupported isid options + /// ------------------------ + Sort /// + ] + + if ( `benchmarklevel' > 0 ) local benchmark benchmark // a positive level switches -benchmark- on + local benchmarklevel benchmarklevel(`benchmarklevel') // re-wrap the integer as an option string for pass-through + + if ( "`sort'" != "" ) { // -sort- is parsed only so it can be rejected with a clear message + di as err "Option -sort- is not implemented" + exit 198 + } + + if ( "`missok'" == "" ) { // without -missok- the internal call gets -exitmissing- + local miss exitmissing + } + else { // with -missok- the internal call gets -missing- + local miss missing + } + + local opts `miss' `compress' `forcestrl' `_ctolerance' `_keepgreshape' + local opts `opts' `verbose' `benchmark' `benchmarklevel' + local opts `opts' `oncollision' `hashmethod' `debug' + cap noi _gtools_internal `varlist' `if' `in', unsorted `opts' gfunction(isid) + local rc = _rc // save the captured return code for the checks below + global GTOOLS_CALLER "" // clear the caller marker set above + + if ( `rc' == 17999 ) { // rc 17999: run native -isid- on the same inputs, then exit cleanly + isid `varlist' `if' `in', `missok' + exit 0 + } + else if ( `rc' == 17001 ) { // rc 17001: print a note and exit without error + di as txt "(no observations)" + exit 0 + } + else if ( `rc' ) exit `rc' // any other nonzero code is passed on to the caller +end diff --git a/01.code/ado/g/gisid.sthlp b/01.code/ado/g/gisid.sthlp new file mode 100755 index 0000000..ec4fb5e --- /dev/null +++ b/01.code/ado/g/gisid.sthlp @@ -0,0 +1,135 @@ +{smcl} +{* *! 
version 1.1.1 23Jan2019}{...} +{viewerdialog gisid "dialog gisid"}{...} +{vieweralsosee "[D] gisid" "mansection D gisid"}{...} +{viewerjumpto "Syntax" "gisid##syntax"}{...} +{viewerjumpto "Description" "gisid##description"}{...} +{viewerjumpto "Options" "gisid##options"}{...} +{title:Title} + + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gisid} {hline 2}}Efficiently check for unique identifiers using C plugins.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{phang} +This is a fast option to Stata's {opt isid}. +It is 8 to 30 times faster in Stata/IC and 4-14 times faster in MP + +{p 8 13 2} +{cmd:gisid} +{varlist} +{ifin} +[{cmd:,} +{opt m:issok}] + + +{marker description}{...} +{title:Description} + +{pstd} +{opt gisid} is a faster alternative to {help isid}. It can check for an ID in +a subset of the data, but it can't do it for an external dataset or sort the data. + +{pstd} +{opt gisid} is part of the {manhelp gtools R:gtools} project. + + +{marker options}{...} +{title:Options} + +{phang}{opt missok} indicates that missing values are permitted in {varlist}. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. 
+ +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker examples}{...} +{title:Examples} + +{pstd} +See {help isid##examples} or the +{browse "http://gtools.readthedocs.io/en/latest/usage/gisid/index.html#examples":online documentation} +for examples. + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + + +{title:Website} + +{pstd}{cmd:gisid} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file +for {it:isid} and Sergio Correia's help file for {it:fisid}. +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. 
+{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + + +{title:Also see} + +{p 4 13 2} +help for +{help gisid}, +{help gtools}; +{help fisid} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/givregress.ado b/01.code/ado/g/givregress.ado new file mode 100755 index 0000000..55abafd --- /dev/null +++ b/01.code/ado/g/givregress.ado @@ -0,0 +1,27 @@ +*! version 0.1.1 14Apr2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Estimate IV regression via 2SLS by group and with HDFE + +cap program drop givregress +program givregress, rclass // thin wrapper: forwards the command line to -gregress- with -ivregress- appended + version 13.1 + + local 00: copy local 0 // back up the raw command line; restored after the call + if ( strpos(`"`0'"', ",") > 0 ) { // a comma already appears in the input, so no separator is added + local comma + } + else { // no comma found: add one so -ivregress- lands in the options + local comma , + } + gregress `0' `comma' ivregress + if ( ${GREG_RC} ) { // nonzero GREG_RC: clear the global and stop without returning results + global GREG_RC + exit 0 + } + local 0: copy local 00 // restore the original command line + + return local levels `"`r(levels)'"' // re-post the r() results in memory (presumably left by -gregress-; confirm) + return scalar N = r(N) + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) +end diff --git a/01.code/ado/g/givregress.sthlp b/01.code/ado/g/givregress.sthlp new file mode 100755 index 0000000..3b55bcc --- /dev/null +++ b/01.code/ado/g/givregress.sthlp @@ -0,0 +1,272 @@ +{smcl} +{* *! version 0.1.1 14Apr2020}{...} +{viewerdialog givregress "dialog givregress"}{...} +{vieweralsosee "[R] givregress" "mansection R givregress"}{...} +{viewerjumpto "Syntax" "givregress##syntax"}{...} +{viewerjumpto "Description" "givregress##description"}{...} +{viewerjumpto "Methods and Formulas" "givregress##methods_and_formulas"}{...} +{viewerjumpto "Examples" "givregress##examples"}{...} +{title:Title} + +{p2colset 5 18 24 2}{...} +{p2col :{cmd:givregress} {hline 2}} 2SLS linear regressions by group with weights, clustering, and HDFE{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. 
+ +{pstd} +{it:Warning}: {opt givregress} is in beta and meant for testing; use in production {bf:NOT} recommended. (To enable beta features, define {cmd:global GTOOLS_BETA = 1}.) + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt givregress} +{depvar} +{cmd:(}{it:endogenous}{cmd:=}{it:instruments}{cmd:)} +[{it:exogenous}] +{ifin} +[{it:{help givregress##weight:weight}}] +[{cmd:,} {opth by(varlist)} {opth absorb(varlist)} {it:{help givregress##table_options:options}}] + +{pstd} +By default, results are saved into a mata class object named +{opt GtoolsIV}. Run {opt mata GtoolsIV.desc()} for +details; the name and contents can be modified via {opt mata()}. +The results can also be saved into variables via {opt gen()} +or {opt prefix()} (either can be combined with {opt mata()}, but not +each other). + +{pstd} +Note that extended varlist syntax is {bf:not} supported. Further, +{opt fweight}s behave differently than other weighting schemes; that +is, this assumes that the weight refers to the number of available +{it:observations}. + +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Save Results} +{synopt:{opt mata(name, [nob nose])}}Specify name of output mata object and whether to save {bf:b} and {bf:se} +{p_end} +{synopt:{opt gen(...)}}Specify any of {opth b(varlist)}, {opth se(varlist)}, and {opth hdfe(varlist)}. One per covariate is required ({opt hdfe()} also requires one for the dependent variable). +{p_end} +{synopt:{opt prefix(...)}}Specify any of {opth b(str)}, {opth se(str)}, and {opth hdfe(str)}. A single prefix is allowed. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} + +{syntab :Options} +{synopt:{opth by(varname)}}Group statistics by variable. +{p_end} +{synopt:{opt robust}}Robust SE. +{p_end} +{synopt:{opth cluster(varlist)}}One-way or nested cluster SE. 
+{p_end} +{synopt:{opth absorb(varlist)}}Multi-way high-dimensional fixed effects. +{p_end} +{synopt:{opth hdfetol(real)}}Tolerance level for HDFE algorithm (default 1e-8). +{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm. +{p_end} +{synopt:{opt noc:onstant}}Whether to add a constant (cannot be combined with {opt absorb()}). +{p_end} + +{syntab:Gtools} +{synopt:{opt compress}}Try to compress strL to str#. +{p_end} +{synopt:{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt:{opt v:erbose}}Print info during function execution. +{p_end} +{synopt:{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt:{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt:{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:givregress} estimates a linear IV model via 2SLS, +optionally weighted, by group, with cluster SE, and/or with multi-way +high-dimensional fixed effects. The results are by default saved +into a mata object (default {opt GtoolsIV}). 
Run {opt mata +GtoolsIV.desc()} for details; the following data is stored: + + regression info + --------------- + + string scalar caller + model used; should be "givregress" + + real scalar kx + number of (non-absorbed) covariates + + real scalar cons + whether a constant was added automagically + + real scalar saveb + whether b was stored + + real matrix b + J by kx matrix with regression coefficients + + real scalar savese + whether se was stored + + real matrix se + J by kx matrix with corresponding standard errors + + string scalar setype + type of SE computed (homoskedastic, robust, or cluster) + + real scalar absorb + whether any FE were absorbed + + string colvector absorbvars + variables absorbed as fixed effects + + string colvector njabsorb + number of FE to be absorbed for each variable and by level + + string colvector savenjabsorb + whether njabsorb is stored + + string colvector clustervars + cluster variables + + string colvector njcluster + number of clusters per by level + + string colvector savenjcluster + whether njcluster is stored + + real scalar by + whether there were any grouping variables + + string rowvector byvars + grouping variable names + + real scalar J + number of levels defined by grouping variables + + class GtoolsByLevels ByLevels + grouping variable levels; see GtoolsIV.ByLevels.desc() for details + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar ByLevels.anyvars + 1: any by variables; 0: no by variables + + real scalar ByLevels.anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector ByLevels.byvars + by variable names + + real scalar ByLevels.kby + number of by variables + + real scalar ByLevels.rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar ByLevels.J + number of levels + + real matrix ByLevels.numx + numeric by variables + + string matrix ByLevels.charx + string by variables + + real scalar 
ByLevels.knum + number of numeric by variables + + real scalar ByLevels.kchar + number of string by variables + + real rowvector ByLevels.lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector ByLevels.map + map from index to numx and charx + +{marker methods_and_formulas}{...} +{title:Methods and Formulas} + +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/givregress/index.html#methods-and-formulas":online documentation} +for details. + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/givregress/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:givregress} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/givregress/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/glevelsof.ado b/01.code/ado/g/glevelsof.ado new file mode 100755 index 0000000..5310405 --- /dev/null +++ b/01.code/ado/g/glevelsof.ado @@ -0,0 +1,222 @@ +*! 
version 1.2.0 23Mar2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! -levelsof- implementation using C for faster processing + +capture program drop glevelsof +program glevelsof, rclass // parses [+|-]varname lists and options, then calls _gtools_internal with gfunction(levelsof) + version 13.1 + + if ( `=_N < 1' ) { // empty dataset: error out before any parsing + di as err "no observations" + exit 2000 + } + + global GTOOLS_CALLER glevelsof // caller name kept in a global; cleared again after the internal call + syntax anything /// Variables to get levels of: [+|-]varname [[+|-]varname ...] + [if] [in] , /// [if condition] [in start / end] + [ /// + Separate(passthru) /// Levels separator + COLSeparate(passthru) /// Columns separator + MISSing /// Include missing values + LOCal(str) /// Store results in local + Clean /// Clean strings + /// + unsorted /// Do not sort levels (faster) + noLOCALvar /// Do not store levels in a local macro (or in r(levels)) + numfmt(passthru) /// Number format + freq(passthru) /// (not implemented) compute frequency counts + store(passthru) /// (not implemented) store in matrix or mata object + gen(passthru) /// Save unique levels in varlist + NODS DS /// Parse - as varlist (ds) or negative (nods) + silent /// Do not try to display levels in console + MATAsave /// Save results in mata + MATAsavename(str) /// mata save name + /// + debug(passthru) /// Print debugging info to console + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + /// + GROUPid(str) /// + tag(passthru) /// + counts(passthru) /// + fill(passthru) /// + replace /// + ] + + if ( (`"`matasave'"' != "") & (`"`local'"' != "") ) { // local() cannot be combined with -matasave- + disp as err "Option local() not allowed with option -matasave-" + 
exit 198 + } + + if ( (`"`matasavename'"' != "") & (`"`local'"' != "") ) { // same restriction when a mata save name is given + disp as err "Option local() not allowed with option -matasave()-" + exit 198 + } + + if ( `"`matasavename'"' != "" ) local matasave matasave // giving a save name switches -matasave- on + if ( `"`matasavename'"' == "" ) local matasavename GtoolsByLevels // default mata object name + + if ( `benchmarklevel' > 0 ) local benchmark benchmark // a positive level switches -benchmark- on + local benchmarklevel benchmarklevel(`benchmarklevel') // re-wrap the integer as an option string for pass-through + + if ( (`"`localvar'"' != "") & (`"`local'"' != "") ) { // -nolocalvar- wins over local(); only a note is shown + disp as txt "(option {opt local} ignored with option {nolocalvar})" + } + + if ( ("`ds'" != "") & ("`nods'" != "") ) { + di as err "-ds- and -nods- mutually exclusive" + exit 198 + } + + * Get varlist + * ----------- + + if ( `"`anything'"' != "" ) { + local varlist: copy local anything + local varlist: subinstr local varlist "+" " ", all + if ( strpos(`"`varlist'"', "-") & ("`ds'`nods'" == "") ) { // a "-" with neither -ds- nor -nods-: warn about the interpretation + disp as txt "'-' interpreted as negative; use option -ds- to interpret as varlist" + disp as txt "(to suppress this warning, use option -nods-)" + } + if ( "`ds'" != "" ) { // -ds-: hand the list to -ds- as an ordinary varlist + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid varlist: `anything'" + exit 198 + } + cap ds `varlist' + if ( _rc ) { // on failure, rerun with -noisily- so the message is displayed + cap noi ds `varlist' + exit _rc + } + local varlist `r(varlist)' + local anything: copy local varlist + } + else { // default: "-" is a per-variable prefix, parsed token by token below + local parse: copy local varlist + local varlist: subinstr local varlist "-" " ", all + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid list: `anything'" + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit 198 + } + cap ds `varlist' + if ( _rc ) { // collect the names that fail -confirm var- for the error message + local notfound + foreach var of local varlist { + cap confirm var `var' + if ( _rc ) { + local notfound `notfound' `var' + } + } + if ( `:list sizeof notfound' > 0 ) { + if ( `:list sizeof notfound' > 1 ) { + di as err "Variables not found: `notfound'" + } + else { + di as err "Variable `notfound' not found" + } + } + exit 111 + } + local varlist + local anything + while ( `:list sizeof parse' ) { // rebuild both lists, one token at a time + gettoken 
var parse: parse, p(" -") + local neg + if inlist("`var'", "-") { // a lone "-" token: take the next token as the name and prefix it with "-" + gettoken var parse: parse, p(" -") + local neg - + } + cap ds `var' + if ( _rc ) { + local rc = _rc + di as err "Variable '`var'' does not exist." + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit `rc' + } + foreach v of varlist `var' { // -anything- keeps the "-" prefix; -varlist- holds bare names + local anything `anything' `neg'`v' + local varlist `varlist' `v' + } + } + } + } + if ( "`ds'" == "" ) local nods nods // without -ds-, -nods- is always set + + * Run levelsof + * ------------ + + local opts `separate' `missing' `clean' `unsorted' `ds' `nods' + + local sopts `colseparate' `numfmt' `compress' `forcestrl' + local sopts `sopts' `verbose' `benchmark' `benchmarklevel' `_ctolerance' + local sopts `sopts' `oncollision' `hashmethod' `debug' + + local gopts gen(`groupid') `tag' `counts' `fill' `replace' + local gopts `gopts' glevelsof(`localvar' `freq' `store' /* + */ `gen' `silent' `matasave' matasavename(`matasavename')) + + cap noi _gtools_internal `anything' `if' `in', `opts' `sopts' `gopts' gfunction(levelsof) + local rc = _rc // save the captured return code for the checks below + global GTOOLS_CALLER "" // clear the caller marker set above + + if ( `rc' == 17999 ) { // rc 17999: try native -levelsof-, which the checks below restrict + if ( `:list sizeof varlist' > 1 ) { + di as err "Cannot use fallback with more than one variable." + exit 17000 + } + else if ( `"`localvar'`gen'`numfmt'`matasave'"' != "" ) { + di as err `"Cannot use fallback with option(s): `localvar' `gen' `numfmt' `matasave'."' + exit 17000 + } + else if ( strpos("`anything'", "-") & ("`ds'" == "") ) { + di as err "Cannot use fallback with inverse order." 
+ exit 17000 + } + else { // NOTE(review): `opts' still carries -ds-/-nods-/-unsorted- here; confirm native -levelsof- accepts them + levelsof `varlist' `if' `in', `opts' + exit 0 + } + } + else if ( `rc' == 17001 ) { // rc 17001: print a note and exit without error + di as txt "(no observations)" + exit 0 + } + else if ( `rc' == 920 ) { // rc 920: add a hint pointing to gen()/mata with -nolocal-, then exit with the same code + disp as err _n(1) "try {opt gen(prefix)} {opt nolocal} or {opt mata(name)} {opt nolocal};" /* + */ " see {help glevelsof:help glevelsof} for details" + exit `rc' + } + else if ( `rc' ) exit `rc' // any other nonzero code is passed on to the caller + + if ( (`"`localvar'"' == "") & (`"`matasave'"' == "") ) { // levels are returned as a macro only without -nolocalvar- and -matasave- + mata st_local("vals", st_global("r(levels)")) + mata st_local("sep", st_global("r(sep)")) + mata st_local("colsep", st_global("r(colsep)")) + if ( `:list sizeof varlist' == 1 ) { + cap confirm numeric variable `varlist' + if ( _rc == 0 ) { // single numeric variable: rewrite " 0." as " ." and "-0." as "-." + local vals: subinstr local vals " 0." " .", all + local vals: subinstr local vals "-0." "-.", all + } + } + return local levels: copy local vals + return local sep: copy local sep + return local colsep: copy local colsep + if ( "`local'" != "" ) c_local `local': copy local vals + if ( "`silent'" == "" ) di as txt `"`vals'"' + * if ( "`silent'" == "" ) mata st_global("r(levels)") + } + + return scalar N = `r(N)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' +end diff --git a/01.code/ado/g/glevelsof.sthlp b/01.code/ado/g/glevelsof.sthlp new file mode 100755 index 0000000..275783d --- /dev/null +++ b/01.code/ado/g/glevelsof.sthlp @@ -0,0 +1,357 @@ +{smcl} +{* *! 
version 1.2.0 23Mar2019}{...} +{vieweralsosee "[P] glevelsof" "mansection P glevelsof"}{...} +{vieweralsosee "" "--"}{...} +{vieweralsosee "[P] foreach" "help foreach"}{...} +{vieweralsosee "" "--"}{...} +{viewerjumpto "Syntax" "glevelsof##syntax"}{...} +{viewerjumpto "Description" "glevelsof##description"}{...} +{viewerjumpto "Options" "glevelsof##options"}{...} +{viewerjumpto "Remarks" "glevelsof##remarks"}{...} +{viewerjumpto "Stored results" "glevelsof##results"}{...} +{title:Title} + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:glevelsof} {hline 2}}Efficiently get levels of variable using C plugins{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{phang} +This is a fast option to Stata's {opt levelsof}. It can additionally take +multiple variables. +It is 3 to 13 times faster in Stata/IC and 2.5-7 times faster in MP + +{p 8 17 2} +{cmd:glevelsof} +{varlist} +{ifin} +[{cmd:,} {it:options}] + +{pstd} +Instead of {varlist}, it is possible to specify + +{p 8 17 2} +[{cmd:+}|{cmd:-}] +{varname} +[[{cmd:+}|{cmd:-}] +{varname} {it:...}] + +{pstd} +To change the sort order of the results. + +{synoptset 25 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Options} +{synopt:{opt c:lean}}display string values without compound double quotes{p_end} +{synopt:{opt l:ocal(macname)}}insert the list of values in the local macro {it:macname}{p_end} +{synopt:{opt miss:ing}}include missing values of {varlist} in calculation{p_end} +{synopt:{opt s:eparate(separator)}}separator to serve as punctuation for the values of returned list; default is a space{p_end} + +{syntab:Extras} +{synopt:{opt nolocal:var}}Do not store the levels of {opt varlist} in a local macro.{p_end} +{synopt:{opt silent}}Do not display the levels of varlist. 
For use with {opt gen()} and {opt mata:save}{p_end} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GtoolsByLevels}){p_end} +{synopt:{opt gen([prefix], [replace])}}Store the levels of {it:varlist} in new varlist ({opt prefix}) or {opt replace} {it:varlist} with its levels{p_end} +{synopt:{opt cols:eparate(separator)}}separator to serve as punctuation for the columns of returned list; default is a pipe{p_end} +{synopt:{opth numfmt(format)}}Number format for numeric variables. Default is {opt %.16g} (or {opt %16.0g} with {opt matasave}).{p_end} +{synopt:{opt unsorted}}do not sort levels (ignored if inputs are integers){p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:glevelsof} displays a sorted list of the distinct values of {varlist}. +It is meant to be a fast replacement of {cmd:levelsof}. Unlike {cmd:levelsof}, +it can take a single variable or multiple variables. + +{pstd} +{cmd:glevelsof} is part of the {manhelp gtools R:gtools} project. + + +{marker options}{...} +{title:Options} + +{dlgtab:Options} + +{phang} +{cmd:clean} displays string values without compound double quotes. +By default, each distinct string value is displayed within compound double +quotes, as these are the most general delimiters. 
If you know that the +string values in {varlist} do not include embedded spaces or embedded +quotes, this is an appropriate option. {cmd:clean} +does not affect the display of values from numeric variables. + +{phang} +{cmd:local(}{it:macname}{cmd:)} inserts the list of values in +local macro {it:macname} within the calling program's space. Hence, +that macro will be accessible after {cmd:glevelsof} has finished. +This is helpful for subsequent use, especially with {helpb foreach}. + +{phang} +{cmd:missing} specifies that missing values of {varlist} +should be included in the calculation. The default is to exclude them. + +{phang} +{cmd:separate(}{it:separator}{cmd:)} specifies a separator +to serve as punctuation for the values of the returned list. +The default is a space. A useful alternative is a comma. + +{phang} +{cmd:colseparate(}{it:separator}{cmd:)} specifies a separator +to serve as punctuation for the columns of the returned list. +The default is a pipe. Specifying a {varlist} instead of a +{varname} is only useful for double loops or for use with +{helpb gettoken}. + +{phang} +{opth numfmt(format)} Number format for printing. By default numbers +are printed to 16 digits of precision, but the user can specify +the number format here. By default, only "%.#g|f" and "%#.#g|f" are +accepted since this is formatted internally in C. However, with option +{opt matasave} this is formatted in mata and has to be a mata format. + +{phang} +{opth unsorted} Do not sort levels. This option is experimental and +only affects the output when the input is not an integer (for integers, +the levels are sorted internally regardless; the user would request the +spooky hash method via {opt hash()}, which obeys the {opt unsorted} +option, but this is intended for debugging). While not sorting the +levels is faster, {cmd:glevelsof} is typically used when the number +of levels is small (10s, 100s, 1000s) and thus speed savings will be +minimal. 
+ +{phang} +{opt nolocalvar} Do not store the levels of {opt varlist} in a local macro. +This is especially useful with option {opt gen()}. + +{phang} +{opt silent} Do not display the levels of varlist. Mainly for use with +{opt gen()} and {opt mata:save}. With {opt mata:save}, the levels are +not separately stored as a string matrix, but the raw levels {it:are} +kept. + +{phang} +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] Save results in mata object (default +name is {bf:GtoolsByLevels}). See {opt GtoolsByLevels.desc()} for more. +This object contains the raw variable levels in {opt numx} and {opt charx} +(since mata does not allow matrices of mixed-type). The levels are saved +as a string in {opt printed} (with value labels correctly applied) unless +option {opt silent} is also specified. + +{phang} +{opt gen([prefix], [replace])} Store the levels of {it:varlist} in new +varlist ({opt prefix}) or {opt replace} {it:varlist} with its levels. +These options are mutually exclusive. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. 
{opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + + +{marker remarks}{...} +{title:Remarks} + +{pstd} +{cmd:glevelsof} serves two different functions. First, it gives a +compact display of the distinct values of {it:varlist}. More commonly, it is +useful when you desire to cycle through the distinct values of +{it:varlist} with (say) {cmd:foreach}; see {helpb foreach:[P] foreach}. +{cmd:glevelsof} leaves behind a list in {cmd:r(levels)} that may be used in a +subsequent command. + +{pstd} +{cmd:glevelsof} may hit the {help limits} imposed by your Stata. However, +it is typically used when the number of distinct values of +{it:varlist} is modest. If you have many levels in varlist then +an alternative may be {help gtoplevelsof}, which shows the largest or smallest +levels of a varlist by their frequency count. + + +{marker examples}{...} +{title:Examples} + +{phang}{cmd:. sysuse auto} + +{phang}{cmd:. glevelsof rep78}{p_end} +{phang}{cmd:. display "`r(levels)'"}{p_end} + +{phang}{cmd:. glevelsof rep78, miss local(mylevs)}{p_end} +{phang}{cmd:. display "`mylevs'"}{p_end} + +{phang}{cmd:. glevelsof rep78, sep(,)}{p_end} +{phang}{cmd:. display "`r(levels)'"}{p_end} + +{phang}{cmd:. glevelsof foreign rep78, sep(,)}{p_end} +{phang}{cmd:. display `"`r(levels)'"'}{p_end} + +{phang}{cmd:. glevelsof foreign rep78, gen(uniq_) nolocal}{p_end} +{phang}{cmd:. desc uniq_*}{p_end} +{phang}{cmd:. 
glevelsof foreign rep78, mata(uniq) nolocal}{p_end} +{phang}{cmd:. mata uniq.desc()}{p_end} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/glevelsof/index.html#examples":online documentation} +for more examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:glevelsof} stores the following in {cmd:r()}: + +{synoptset 15 tabbed}{...} +{p2col 5 15 19 2: Macros}{p_end} +{synopt:{cmd:r(levels)}}list of distinct values{p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(minJ)}} smallest group size {p_end} +{synopt:{cmd:r(maxJ)}} largest group size {p_end} +{p2colreset}{...} + +{pstd} +With {opt matasave}, the following data is stored in {opt GtoolsByLevels}: + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + real rowvector charpos + position of kth character variable + + string matrix printed + formatted (printf-ed) variable levels (not with option -silent-) + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse 
"mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:glevelsof} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:levelsof}. +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gtoplevelsof}, +{help gtools}; +{help flevelsof} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/gpoisson.ado b/01.code/ado/g/gpoisson.ado new file mode 100755 index 0000000..1b50103 --- /dev/null +++ b/01.code/ado/g/gpoisson.ado @@ -0,0 +1,27 @@ +*! version 0.2.1 14Apr2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Estimate poisson regression via IRLS by group and with HDFE + +cap program drop gpoisson +program gpoisson, rclass + version 13.1 + + local 00: copy local 0 + if ( strpos(`"`0'"', ",") > 0 ) { + local comma + } + else { + local comma , + } + gregress `0' `comma' glm family(poisson) + if ( ${GREG_RC} ) { + global GREG_RC + exit 0 + } + local 0: copy local 00 + + return local levels `"`r(levels)'"' + return scalar N = r(N) + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) +end diff --git a/01.code/ado/g/gquantiles.ado b/01.code/ado/g/gquantiles.ado new file mode 100755 index 0000000..86dfb7d --- /dev/null +++ b/01.code/ado/g/gquantiles.ado @@ -0,0 +1,559 @@ +*! 
version 1.0.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! faster implementation of pctile, xtile, and _pctile using C plugins + +capture program drop gquantiles +program gquantiles, rclass + version 13.1 + + if ( `=_N < 1' ) { + error 2000 + } + + gtools_timer on 97 + + global GTOOLS_CALLER gquantiles + syntax [anything(equalok)] /// newvar = exp, exp + [if] [in] /// [if condition] [in start / end] + [aw fw pw] , /// [weight type = exp] + [ /// + /// Standard options + /// ---------------- + /// + Nquantiles(int 0) /// Number of quantiles + Cutpoints(passthru) /// Use cutpoints instead of percentiles of `exp' + Percentiles(str) /// Percentiles to compute + ALTdef /// Alternative definition + /// + /// Extras (ways to specify cutoffs) + /// -------------------------------- + /// + Quantiles(passthru) /// Alias for percentiles + cutoffs(passthru) /// Use specified cutoffs instead of quantiles + quantmatrix(passthru) /// Name of matrix with quantiles + cutmatrix(passthru) /// Name of matrix with cutoffs + CUTQuantiles(passthru) /// Use percentiles specified in cutquantiles + /// + /// Augmented standard options + /// -------------------------- + /// + returnlimit(real 1001) /// Set to missing (.) to have no return limit; REALLY SLOW to tinker + cutifin /// Read quantiles() or cutquantiles() within [if] [in] + cutby /// Read quantiles() or cutquantiles() within [if] [in] + dedup /// Remove duplicates from quantiles() or cutquantiles() + _pctile /// Set return values in the style of pctile + pctile /// Call pctile + xtile /// Call xtile + PCTILEvar(name) /// Compute pctile when xtile is main call + XTILEvar(name) /// Compute xtile when pctile is main call + GENp(name) /// Store bin counts for nq/cutpoints/percentiles/etc. 
+ /// + /// Extras (Other) + /// -------------- + /// + binfreq /// Return bin frequencies + BINFREQvar(name) /// Return bin count with options quantiles or cutoffs + /// + by(str) /// By variabes: [+|-]varname [[+|-]varname ...] + replace /// Replace newvar, if it exists + noinit /// Do not initialize targets with missing values + strict /// Exit with error if nq < # if in and non-missing + minmax /// Store r(min) and r(max) (pctiles must be in (0, 100)) + method(passthru) /// Method to compute quantiles: (1) qsort, (2) qselect + /// + /// Standard gtools options + /// ----------------------- + /// + debug(passthru) /// Print debugging info to console + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + /// + GROUPid(str) /// + tag(passthru) /// + counts(passthru) /// + fill(passthru) /// + ] + + local if0: copy local if + local in0: copy local in + local weight0: copy local weight + local exp0: copy local exp + mata st_local("ifin", st_local("if") + " " + st_local("in")) + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + local gfallbackmaybe = "`replace'`init'`by'`pctilevar'`xtilevar'`minmax'`cutoffs'`quantmatrix'`cutmatrix'`cutquantiles'`binfreq'`binfreqvar'" == "" + + local gen_pctile = ("`pctile'" != "") | ("`pctilevar'" != "") + local gen_xtile = ("`xtile'" != "") | ("`xtilevar'" != "") + local gen_any = `gen_pctile' | `gen_xtile' + + if ( (`gen_any' == 0) & ("`_pctile'" == "") ) { + di as err "Nothing to do. 
Specify _pctile, xtile[()], or pctile[()]" + CleanExit + exit 198 + } + + if ( ("`pctilevar'" == "") & ("`xtilevar'" == "") & ("`_pctile'" == "") & (strpos(`"`anything'"', "=") == 0) ) { + di in txt "(note: no targets will be generated)" + } + + local early_rc = 0 + if ( (`"`weight'"' != "") & ("`altdef'" != "") ) { + di in err "altdef option cannot be used with weights" + local early_rc = 198 + } + + if ("`pctile'" != "") & ("`xtile'" != "") { + di as err "Specify only one of -xtile- or -pctile-." + di as err "To specify a second variable, try -xtile()- or -pctile()-." + local early_rc = 198 + } + + if ( ("`pctile'" != "") | ("`xtile'" != "") ) { + if ( "`_pctile'" != "" ) { + di as err "You should specify -xtile()- or -pctile()- if you want to call _pctile" + local early_rc = 198 + } + } + + if ( "`by'" != "" ) { + if ( `gen_pctile' & ("`strict'" != "strict") ) { + di as err "by() with -pctile- requires option -strict-" + local early_rc = 198 + } + + if ( !(`gen_xtile') & !(`gen_pctile') ) { + di as err "by() requires -xtile- or -pctile-" + local early_rc = 198 + } + + if ( "`minmax'" != "" ) { + di as err "by() does not allow option -minmax-" + di as err"(note: you might be looking for {help gcollapse} with the -merge- option)" + local early_rc = 198 + } + + if ( "`_pctile'" != "" ) { + di as err "by() does not allow _pctile; try pctile[()]" + local early_rc = 198 + } + + if ( "`binfreq'" != "" ) { + di as err "by() does not allow -binfreq-" + local early_rc = 198 + } + + if ( ("`binfreqvar'" != "") & ("`strict'" != "strict") ) { + di as err "by() with -binfreq()- requires option -strict-" + local early_rc = 198 + } + + if ( `gen_pctile' & ("`groupid'" == "") ) { + disp as txt "Suggestion: by() with pctile() is more useful with groupid()" + } + + if ( ("`binfreqvar'" != "") & ("`groupid'" == "") ) { + disp as txt "Suggestion: by() with binfreq() is more useful with groupid()" + } + } + + * Can only specify one way of determining which quantiles to compute + * 
------------------------------------------------------------------ + + if ( ("`quantiles'" != "") & ("`percentiles'" != "") ) { + di as err "Options quantile() and percentiles() are redundant." + local early_rc = 198 + } + else if ("`quantiles'" == "") { + local quantiles quantiles(`percentiles') + } + + if ( ("`cutpoints'" != "") | ("`cutoffs'" != "") ) { + if ( "`genp'" != "" ) { + di as err "Option genp() not allowed with cutpoints() or cutoffs()" + local early_rc = 198 + } + } + + * Make sure variables to generate don't exist (or replace) + * -------------------------------------------------------- + + foreach togen in genp pctilevar xtilevar { + if ( "``togen''" != "" ) { + cap confirm new variable ``togen'' + if ( _rc & ("`replace'" == "") ) { + di as err "Option `togen'() invalid; ``togen'' already exists." + local early_rc = 198 + } + local nvar : list sizeof `togen' + if ( `nvar' > 1 ) { + di as err "only one variable allowed in `togen'()" + local early_rc = 198 + } + } + } + + if ( "`binfreqvar'" != "" ) { + local binaddvar binfreqvar(`binfreqvar') + } + + * Parse number of quantiles + * ------------------------- + + if ( (`nquantiles' > `=_N + 1') & ("`strict'" != "") ) { + di in red "nquantiles() must be less than or equal to " /* + */ "number of observations plus one" + local early_rc = 198 + } + if ( `nquantiles' >= 2 ) local fall_nq nquantiles(`nquantiles') + local nquantiles nquantiles(`nquantiles') + + if ( `early_rc' ) { + CleanExit + exit `early_rc' + } + + * Parse main call + * --------------- + + local 0 `anything' + cap syntax newvarname =/exp + if ( _rc ) { + cap syntax varname =/exp + if ( _rc ) { + cap confirm numeric var `anything' + if ( _rc ) { + tempvar touse xsources + mark `touse' `ifin' + cap gen double `xsources' = `anything' if `touse' + if ( _rc ) { + if ( ("`xtile'" != "") | ("`pctile'" != "") ) { + CleanExit + di as err "Invalid syntax. 
Requried: newvarname = exp" + exit 198 + } + else { + di as err "Invalid expression" + CleanExit + exit 198 + } + } + local ifin if `touse' `in0' + } + else { + local xsources `anything' + local ifin: copy local ifin + } + } + else if ( "`replace'" == "" ) { + di as err "Variable `varlist' already exists" + CleanExit + exit 110 + } + else if ( ("`xtile'" == "") & ("`pctile'" == "") ) { + di as err "varname = exp requires option -xtile- or -pctile-" + CleanExit + exit 198 + } + else { + cap confirm numeric var `exp' + if ( _rc ) { + tempvar touse xsources + mark `touse' `ifin' + cap gen double `xsources' = `exp' if `touse' + if ( _rc ) { + di as err "Invalid expression" + CleanExit + exit 198 + } + local ifin if `touse' `in0' + } + else { + local xsources `exp' + local ifin: copy local ifin + } + } + } + else { + if ( ("`xtile'" == "") & ("`pctile'" == "") ) { + di as err "newvarname = exp requires option -xtile- or -pctile-" + CleanExit + exit 198 + } + cap confirm numeric var `exp' + if ( _rc ) { + tempvar touse xsources + mark `touse' `ifin' + cap gen double `xsources' = `exp' if `touse' + if ( _rc ) { + di as err "Invalid expression" + CleanExit + exit 198 + } + local ifin if `touse' `in0' + } + else { + local xsources `exp' + local ifin: copy local ifin + } + } + + cap unab xsources: `xsources' + if ( _rc ) { + disp "unable to parse source variables or expression" + CleanExit + exit 198 + } + + if ( `:list sizeof xsources' > 1 ) { + disp "multiple sources not allowed" + CleanExit + exit 198 + } + + if ( "`binfreq'" != "" ) { + local binadd binfreq + } + + if ( "`pctile'" != "" ) local pctilevar `varlist' + if ( "`xtile'" != "" ) local xtilevar `varlist' + + local genp genp(`genp') + local pctile pctile(`pctilevar') + local varlist `xtilevar' + + * Pass arguments to internals + * --------------------------- + + if ( `"`weight0'"' != "" ) { + tempvar touse0 w + qui gen double `w' `exp0' `ifin' + local wgt `"[`weight0'=`w']"' + local weights weights(`weight0' `w') 
+ mark `touse0' `ifin' `wgt' + local ifin if `touse0' `in0' + } + else local weights + + if ( ("`xtile'" != "") | ("`pctile'" != "") ) { + local fallback `xtile'`pctile' `varlist' = `exp' `ifin' `wgt', `fall_nq' `cutpoints' `altdef' + } + else { + local fallback _pctile `xsources' `ifin' `wgt', `fall_nq' `altdef' p(`percentiles') + } + + local bench = ( "`benchmark'" != "" ) + local msg "Parsed quantile call" + gtools_timer info 97 `"`msg'"', prints(`bench') off + + local opts `compress' `forcestrl' `_ctolerance' + local opts `opts' `verbose' `benchmark' `benchmarklevel' + local opts `opts' `oncollision' `hashmethod' `debug' + local opts `opts' gen(`groupid') `tag' `counts' `fill' `weights' + + local gqopts `varlist', xsources(`xsources') `_pctile' `pctile' `genp' + local gqopts `gqopts' `binadd' `binaddvar' `nquantiles' `quantiles' + local gqopts `gqopts' `cutoffs' `cutpoints' `quantmatrix' + local gqopts `gqopts' `cutmatrix' `cutquantiles' `cutifin' `cutby' + local gqopts `gqopts' `dedup' `replace' `init' `altdef' `method' `strict' + local gqopts `gqopts' `minmax' returnlimit(`returnlimit') + + cap noi _gtools_internal `by' `ifin', missing unsorted `opts' gquantiles(`gqopts') gfunction(quantiles) + local rc = _rc + + if ( `rc' == 17999 ) { + CleanExit + if ( `gfallbackmaybe' ) { + `fallback' + exit 0 + } + else { + disp as err "(note: cannot use fallback)" + exit 17000 + } + } + else if ( `rc' == 17001 ) { + CleanExit + exit 2000 + } + else if ( `rc' ) { + CleanExit + exit `rc' + } + + * Return values + * ------------- + + if ( "`binfreq'" == "" ) local bin pct + if ( "`binfreq'" != "" ) local bin freq + + if ( "`by'" != "" ) { + return scalar N = `r(N)' + return scalar Nx = `r(Nxvars)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' + CleanExit + exit 0 + } + else { + return scalar N = `r(Nxvars)' + } + local Nx = `r(Nxvars)' + + if ( "`minmax'" != "" ) { + return scalar min = r(min) + return scalar max = r(max) + } + 
+ if ( "`quantiles'" != "" ) { + return local quantiles = "`r(quantiles)'" + } + + if ( "`cutoffs'" != "" ) { + return local cutoffs = "`r(cutoffs)'" + } + + if ( `r(nquantiles)' > 0 ) { + return scalar nquantiles = `r(nquantiles)' + local Nout = `r(nquantiles)' - 1 + local nqextra = "`r(nqextra)'" + if ( `: list posof "quantiles" in nqextra' ) { + mata: st_matrix("__gtools_r_qused", st_matrix("r(quantiles_used)")[1::`Nout']') + return matrix quantiles_used = __gtools_r_qused + } + if ( `: list posof "bin" in nqextra' ) { + mata: st_matrix("__gtools_r_qbin", st_matrix("r(quantiles_bincount)")[1::`Nout']') + if ("`binfreq'" != "") { + return matrix quantiles_binfreq = __gtools_r_qbin + } + } + if ( "`_pctile'" != "" ) { + local nreturn = cond(`returnlimit' > 0, min(`Nout', `returnlimit'), `Nout') + if ( "`pctilevar'" != "" ) { + forvalues i = 1 / `nreturn' { + return scalar r`i' = `pctilevar'[`i'] + } + } + else if ( `: list posof "quantiles" in nqextra' ) { + mata: st_matrix("__gtools_r_qused", st_matrix("r(quantiles_used)")[1::`Nout']') + forvalues i = 1 / `nreturn' { + return scalar r`i' = __gtools_r_qused[`i', 1] + } + cap scalar drop `rscalar' + } + else { + di as err "Cannot set _pctile return values with nquantiles() but no pctile()" + CleanExit + exit 198 + } + } + } + + if ( `r(nquantiles2)' > 0 ) { + return scalar nquantiles_used = `r(nquantiles2)' + local Nout = `r(nquantiles2)' + mata: st_matrix("__gtools_r_qused", st_matrix("r(quantiles_used)")[1::`r(nquantiles2)']') + mata: st_matrix("__gtools_r_qbin", st_matrix("r(quantiles_bincount)")[1::`r(nquantiles2)']') + return matrix quantiles_used = __gtools_r_qused + if ("`binfreq'" != "") { + return matrix quantiles_binfreq = __gtools_r_qbin + } + if ( "`_pctile'" != "" ) { + mata: st_matrix("__gtools_r_qused", st_matrix("r(quantiles_used)")[1::`Nout']') + local nreturn = cond(`returnlimit' > 0, min(`r(nquantiles2)', `returnlimit'), `r(nquantiles2)') + forvalues i = 1 / `nreturn' { + return scalar r`i' = 
__gtools_r_qused[`i', 1] + } + } + } + + if ( `r(ncutpoints)' > 0 ) { + return scalar ncutpoints = `r(ncutpoints)' + local Nout = `r(ncutpoints)' + } + + if ( `r(ncutoffs)' > 0 ) { + return scalar ncutoffs_used = `r(ncutoffs)' + local Nout = `r(ncutoffs)' + mata: st_matrix("__gtools_r_qused", st_matrix("r(cutoffs_used)")[1::`r(ncutoffs)']') + mata: st_matrix("__gtools_r_qbin", st_matrix("r(cutoffs_bincount)")[1::`r(ncutoffs)']') + return matrix cutoffs_used = __gtools_r_qused + if ("`binfreq'" != "") { + return matrix cutoffs_binfreq = __gtools_r_qbin + } + } + + if ( `r(nquantpoints)' > 0 ) { + return scalar nquantpoints = `r(nquantpoints)' + local Nout = `r(nquantpoints)' + if ( "`_pctile'" != "" ) { + if ( "`pctilevar'" != "" ) { + local nreturn = cond(`returnlimit' > 0, min(`r(nquantpoints)', `returnlimit'), `r(nquantpoints)') + forvalues i = 1 / `nreturn' { + return scalar r`i' = `pctilevar'[`i'] + } + } + else { + di as err "Cannot set _pctile return values with cutquantiles() but no pctile()" + CleanExit + exit 198 + } + } + } + + return scalar nqused = `Nout' + return scalar method_ratio = `r(method_ratio)' + + CleanExit + exit 0 +end + +capture program drop gtools_timer +program gtools_timer, rclass + syntax anything, [prints(int 0) end off] + tokenize `"`anything'"' + local what `1' + local timer `2' + local msg `"`3'; "' + + if ( inlist("`what'", "start", "on") ) { + cap timer off `timer' + cap timer clear `timer' + timer on `timer' + } + else if ( inlist("`what'", "info") ) { + timer off `timer' + qui timer list + return scalar t`timer' = `r(t`timer')' + return local pretty`timer' = trim("`:di %21.4gc r(t`timer')'") + if ( `prints' ) di `"`msg'`:di trim("`:di %21.4gc r(t`timer')'")' seconds"' + timer off `timer' + timer clear `timer' + timer on `timer' + } + + if ( "`end'`off'" != "" ) { + timer off `timer' + timer clear `timer' + } +end + +capture program drop CleanExit +program CleanExit + global GTOOLS_CALLER "" + + cap matrix drop __gtools_r_qused + 
cap matrix drop __gtools_r_qbin + cap matrix drop __gtools_r_qpct + + cap timer off 97 + cap timer clear 97 +end diff --git a/01.code/ado/g/gquantiles.sthlp b/01.code/ado/g/gquantiles.sthlp new file mode 100755 index 0000000..5d20826 --- /dev/null +++ b/01.code/ado/g/gquantiles.sthlp @@ -0,0 +1,472 @@ +{smcl} +{* *! version 1.0.2 23Jan2019}{...} +{viewerdialog gquantiles "dialog gquantiles"}{...} +{vieweralsosee "[R] gquantiles" "mansection R gquantiles"}{...} +{viewerjumpto "Syntax" "gquantiles##syntax"}{...} +{viewerjumpto "Description" "gquantiles##description"}{...} +{viewerjumpto "Options" "gquantiles##options"}{...} +{viewerjumpto "Stored results" "gegen##results"}{...} +{title:Title} + +{p2colset 5 19 23 2}{...} +{p2col :{cmd:gquantiles} {hline 2}}Efficiently compute percentiles (quantiles), categories, and frequencies.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{pstd} +gquantiles can function as a fast, by-able, alternative to {cmd:xtile}, +{cmd:pctile}, and {cmd:_pctile}, though it offers more functionality +that those Stata commands (e.g. this function accepts {opth by(varlist)} +with {cmd:xtile[()]} and {cmd:pctile[()]}, it can compute arbitrary +quantiles and an arbitrary number in a reasonable amount of time, +it computes frequencies, and more). 
+ +{phang} +Create variable containing percentiles (equivalent to {cmd:pctile}) + +{p 8 15 2} +{cmd:gquantiles} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +pctile +[{opth nquantiles(int)} +{opth genp(newvarname)} +{opt altdef}] + +{phang} +Create variable containing quantile categories (equivalent to {cmd:xtile}) + +{p 8 15 2} +{cmd:gquantiles} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +xtile +[{opth nquantiles(int)} +{opth cutpoints(varname)} +{opt altdef}] + +{p 8 15 2} +{cmd:fasterxtile} +{newvar} {cmd:=} {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +[{opth nquantiles(int)} +{opth cutpoints(varname)} +{opt altdef}] + +{phang} +Compute percentiles and store them in r() (equivalent to {cmd:_pctile}) + +{p 8 15 2} +{cmd:gquantiles} +{it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +_pctile +[{opth nquantiles(int)} +{opth percentiles(numlist)} +{opt altdef}] + +{pstd} +The full syntax, however, is + +{p 8 15 2} +{cmd:gquantiles} +[{newvar} {cmd:=}] {it:{help exp}} +{ifin} +[{it:{help gquantiles##weight:weight}}] +{cmd:,} +{c -(}{cmd:pctile}{c |}{cmd:xtile}{c |}{cmd:_pctile}{c )-} +{it:{help gquantiles##quantiles_method:quantiles_method}} +[{it:{help gquantiles##gquantiles_options:gquantiles_options}}] + +{synoptset 22 tabbed}{...} +{marker quantiles_method}{...} +{synopthdr} +{synoptline} +{syntab :Quantiles method (choose only one)} + +{synopt :{opt n:quantiles(#)}}number of quantiles; default is {cmd:nquantiles(2)} +{p_end} +{synopt :{opth p:ercentiles(numlist)}}calculate percentiles corresponding to the specified percentages +{p_end} +{synopt :{opth c:utpoints(varname)}}use values of {it:varname} as cutpoints +{p_end} +{synopt :{opth cutoffs(numlist)}}use values of {it:numlist} as cutpoints +{p_end} +{synopt :{opth cutquantiles(numlist)}}calculate percentiles corresponding to the values of {it:varname} +{p_end} 
+{synopt :{opth quantmatrix(matrix)}}use values of {it:matrix} as quantiles +{p_end} +{synopt :{opth cutmatrix(matrix)}}use values of {it:matrix} as cutpoints +{p_end} + +{synoptset 18 tabbed}{...} +{marker gquantiles_options}{...} +{synopthdr} +{synoptline} +{syntab :Options} + +{synopt :{opth g:enp(newvar:newvarp)}}generate {it:newvarp} variable containing percentages +{p_end} +{synopt :{opt alt:def}}use alternative formula for calculating percentiles +{p_end} + +{syntab:Extras} +{synopt :{opth by(varlist)}}Compute quantiles by groups ({cmd:pctile} and {cmd:xtile} only). +{p_end} +{synopt :{opth groupid(varname)}}Store group ID in {it:varname}. +{p_end} +{synopt :{opt _pctile}}(Not with by.) Do the computation in the style of {cmd:_pctile} +{p_end} +{synopt :{cmd:pctile}[{cmd:(}{newvar}{cmd:)}]}Store percentiles in {it:newvar}. If {it:newvar} is not specified, then this indicates to do the computations in the style of {cmd:pctile}. +{p_end} +{synopt :{cmd:xtile}[{cmd:(}{newvar}{cmd:)}]}Store quantile categories in {it:newvar}. If {it:newvar} is not specified, then this indicates to do the computations in the style of {cmd:xtile}. +{p_end} +{synopt :{cmd:binfreq}[{cmd:(}{newvar}{cmd:)}]}Store the frequency counts of the source variable in the quantile categories in {it:newvar}. If {it:newvar} is not specified (not with by), this is stored in {hi:r(quantiles_bincount)} or {hi:r(cutoffs_bincount)} +{p_end} + +{syntab:Switches} +{synopt :{opt method(#)}}(Not with by.) Algorithm to use to compute quantiles. +{p_end} +{synopt :{opt dedup}}Drop duplicate values of variables specified via {opt cutpoints} or {opt cutquantiles} +{p_end} +{synopt :{opt cutifin}}Exclude values outside {ifin} of variables specified via {opt cutpoints} or {opt cutquantiles} +{p_end} +{synopt :{opt cutby}}Use {opt cutquantiles()} or {opt cutpoints()} by group. 
+{p_end} +{synopt :{opt returnlimit(#)}}Maximum return values that can be set via {opt _pctile} +{p_end} +{synopt :{opt strict}}Without by, exit with error when the number of quantiles requested exceeds the number non-missing. With by, skip groups where this happens. +{p_end} +{synopt :{opt minmax}}(Not with by.) Additionally store the min and max in {hi:r(min)} and {hi:r(max)} +{p_end} +{synopt :{opt replace}}Replace targets, should they exist. +{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed (see +{manhelp weight U:11.1.6 weight}), except with option {opt altdef}, in +which case no weights are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gquantiles} replaces {cmd:xtile}, {cmd:pctile}, and {cmd:_pctile}. +gquantiles offers several additional options above the three built-in +Stata commands: an arbitrary number of quantiles, arbitrary cutoffs, +frequency counts of the xtile categories, computing {cmd:pctile} and +{cmd:xtile} at the same time, and so on. + +{pstd} +gquantiles is also faster than the user-written fastxtile, so an alias, +fasterxtile, is provided. + +{pstd} +{opt gquantiles} is part of the {manhelp gtools R:gtools} project. 
+ +{marker options}{...} +{title:Options} + +{dlgtab:Quantiles method} + +{phang} +{opt n:quantiles(#)} specifies the number of quantiles. +It computes percentiles corresponding to percentages 100*k/m +for k=1, 2, ..., m-1, where m={it:#}. For example, {cmd:nquantiles(10)} +requests that the 10th, 20th, ..., 90th percentiles be computed. The default +is {cmd:nquantiles(2)}; that is, the median is computed. + +{phang} +{opth p:ercentiles(numlist)} requests +percentiles corresponding to the specified percentages. For example, +{cmd:percentiles(10(20)90)} requests that the 10th, 30th, 50th, 70th, and 90th +percentiles be computed. With {opt _pctile} these are placed into {cmd:r(r1)}, +{cmd:r(r2)}, {cmd:r(r3)}, {cmd:r(r4)}, and {cmd:r(r5)} up to 1,001. With +{opt xtile} these are the quantiles that define the categories and with +{opt pctile} these are the quantiles to compute. + +{phang} +{opth c:utpoints(varname)} requests that the values of {it:varname} +be used to define the categories, rather than quantiles. This is natural +to use with {opt xtile}. With {opt pctile} and {opt _pctile} this is +redundant unless you also request {cmd:binfreq}[{cmd:(}{newvar}{cmd:)}]. +By default, all values of {it:varname} are used, regardless of any {opt if} +or {opt in} restriction. You can specify {opt cutifin} to obey the +restrictions and {opt dedup} to exclude duplicates. + +{phang} +{opth cutoffs(numlist)} Use values of {it:numlist} as cutpoints. + +{phang} +{opth cutquantiles(varname)} Calculate percentiles corresponding to the values of +{it:varname}. This is an alternative to {opt percentiles()}. + +{phang} +{opth quantmatrix(matrix)} +Requests percentiles (quantiles) corresponding to the entries of the +matrix. This must be a column vector or a row vector. The behavior of +gquantiles using this option is otherwise equivalent to its behavior +when passing {opt quantiles()}. + +{phang} +{opth cutmatrix(matrix)} +Requests cutoffs corresponding to the entries of the matrix. 
This must +be a column vector or a row vector. The behavior of gquantiles using +this option is otherwise equivalent to its behavior when passing +{opt cutoffs()}. + +{dlgtab:Standard Options} + +{phang}{opth genp(newvar)} +specifies a new variable to be generated +containing the percentages corresponding to the percentiles. + +{phang}{opt altdef} uses an alternative formula for calculating percentiles +(not with weights). +The default method is to invert the empirical distribution function by using +averages, where the function is flat (the default is the same method used by +{cmd:summarize}; see {manhelp summarize R}). +The alternative formula uses an interpolation method. See +{mansection D pctileMethodsandformulas:{it:Methods and formulas}} in +{bf:[D] pctile}. + +{dlgtab:Extras} + +{phang} +{opth by(varlist)} +Compute quantiles by group. {cmd:pctile[()]} requires option +{cmd:strict}, which has the effect of ignoring groups where the number +of quantiles requested is larger than the number of non-missing +observations within the group. {opt by()} is most useful with option +{opth groupid(varname)}. + +{phang} +{opth groupid(varname)} Store group ID in {it:varname}. This +is equivalent to {cmd:gegen, group} + +{phang} +{opt _pctile} (Not with by.) Do the computation in the style of {cmd:_pctile}. It +stores return values in r(r1), r(r2), and so on, as well as a matrix called +{hi:r(quantiles_used)} or {hi:r(cutoffs_used)} in case quantiles or cutoffs +are requested. This can be combined with other options listed in this section. + +{phang} +{cmd:pctile}[{cmd:(}{newvar}{cmd:)}] Store percentiles in {it:newvar}. If +{it:newvar} is not specified, then this indicates to do the computations in +the style of {cmd:pctile}. This can be combined with other options listed in +this section. + +{phang} +{cmd:xtile}[{cmd:(}{newvar}{cmd:)}] Store quantile categories in +{it:newvar}.
If {it:newvar} is not specified, then this indicates to do the +computations in the style of {cmd:xtile}. This can be combined with other +options listed in this section. + +{phang} +{cmd:binfreq}[{cmd:(}{newvar}{cmd:)}] Store the frequency counts of +the source variable in the quantile categories in {it:newvar}. When +weights are specified, this stores the sum of the weights within +that category. If {it:newvar} is not specified, this is stored in +{hi:r(quantiles_bincount)} or {hi:r(cutoffs_bincount)}. This can be +combined with other options listed in this section. + +{dlgtab:Switches} + +{phang} +{opt method(#)} (Not with by.) Algorithm to use to compute quantiles. If you have many +duplicates or are computing many quantiles, you should specify {opt +method(1)}. If you have few duplicates or are computing few quantiles you +should specify {opt method(2)}. By default, {cmd:gquantiles} tries to guess +which method will run faster. + +{phang} +{opt dedup} Drop duplicate values of variables specified via {opt cutpoints()} +or {opt cutquantiles()}. For instance, if the user asks for +quantiles 1, 90, 10, 10, and 1, then quantiles 1, 1, 10, 10, and 90 are +used. With this option only 1, 10, and 90 would be used. + +{phang} +{opt cutifin} Exclude values outside {ifin} of variables specified via +{opt cutpoints()} or {opt cutquantiles()}. The restriction that all +values are used is artificial (the option was originally written to +allow {cmd:xtile} to use {cmd:pctile} internally). + +{phang} +{opt cutby} By default all values of the variable requested via {opt cutpoints()} +or {opt cutquantiles()} are used. With this option, each group uses a different +set of quantiles or cutoffs (note this automatically sets option {cmd:cutifin}) + +{phang} +{opt returnlimit(#)} Maximum return values that can be set via {opt _pctile}. +Since {cmd:gquantiles} can compute a large number of quantiles very quickly, +the function allows the user to request an arbitrary number. 
But setting +1,000s of return values is computationally infeasible. Consider {opt pctile} +in this case. + +{phang} +{opt strict} Without {opt by()}, exit with error if the number of quantiles +is greater than the number of non-missing observations plus one. With +{opt by()}, skip groups where this happens. This restriction for {opt pctile} +is sensible, but for {opt xtile} it is artificial. It exists because it uses +{opt pctile} internally, but {cmd:gquantiles} does not have this issue. + +{phang} +{opt minmax} (Not with by.) Additionally store the min and max in {hi:r(min)} and {hi:r(max)} + +{phang} +{opt replace} Replace targets, should they exist. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. 
{opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gquantiles/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gquantiles} stores the following in {cmd:r()}: + +{synoptset 22 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }}Number of observations {p_end} +{synopt:{cmd:r(min) }}Min (only if minmax was requested) {p_end} +{synopt:{cmd:r(max) }}Max (only if minmax was requested) {p_end} +{synopt:{cmd:r(nqused) }}Number of quantiles/cutoffs {p_end} +{synopt:{cmd:r(method_ratio)}}Rule used to decide between methods 1 and 2{p_end} + +{synopt:{cmd:r(nquantiles) }}Number of quantiles (only w nquantiles()) {p_end} +{synopt:{cmd:r(ncutpoints) }}Number of cutpoints (only w cutpoints()) {p_end} +{synopt:{cmd:r(nquantiles_used)}}Number of quantiles (only w quantiles()) {p_end} +{synopt:{cmd:r(nquantpoints) }}Number of quantiles (only w cutquantiles()){p_end} +{synopt:{cmd:r(ncutoffs_used) }}Number of cutoffs (only w cutoffs()) {p_end} + +{synopt:{cmd:r(r#)}}The #th quantile requested (only w _pctile){p_end} +{p2colreset}{...} + +{synoptset 22 tabbed}{...} +{p2col 5 15 19 2: Macros}{p_end} +{synopt:{cmd:r(quantiles)}}Quantiles used (only w percentiles() or quantiles()){p_end} +{synopt:{cmd:r(cutoffs) }}Cutoffs used (only w option cutoffs()) {p_end} +{p2colreset}{...} + +{synoptset 22 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(quantiles_used) }}With _pctile or with quantiles() {p_end} +{synopt:{cmd:r(quantiles_binfreq)}}With option binfreq and any quantiles requested{p_end} + 
+{synopt:{cmd:r(cutoffs_used) }}With _pctile or with cutoffs() {p_end} +{synopt:{cmd:r(cutoffs_binfreq)}}With option binfreq and any cutoffs requested{p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gquantiles} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:pctile} +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help pctile}, +{help gtools}; +{help fastxtile} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/greg.ado b/01.code/ado/g/greg.ado new file mode 100755 index 0000000..811d803 --- /dev/null +++ b/01.code/ado/g/greg.ado @@ -0,0 +1,22 @@ +*! version 0.2.1 14Apr2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
Estimate linear regression via OLS by group and with HDFE + +* greg: short alias for gregress. Forwards all arguments to gregress and +* re-posts its r() results (cmd, mata, N, J, minJ, maxJ) as its own. +cap program drop greg +program greg, rclass + version 13.1 + + local 00: copy local 0 + gregress `0' + * gregress flags a handled early exit (e.g. no observations) through the + * global GREG_RC. The global can be undefined here (gregress exits on + * replay before setting it, and it is cleared below), so prefix a 0 to + * keep the condition a valid number instead of an empty expression. + if ( 0${GREG_RC} ) { + global GREG_RC + exit 0 + } + local 0: copy local 00 + + return local cmd `"`r(cmd)'"' + return local mata `"`r(mata)'"' + return scalar N = r(N) + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) +end diff --git a/01.code/ado/g/greg.sthlp b/01.code/ado/g/greg.sthlp new file mode 100755 index 0000000..496cae9 --- /dev/null +++ b/01.code/ado/g/greg.sthlp @@ -0,0 +1,271 @@ +{smcl} +{* *! version 0.1.1 14Apr2020}{...} +{viewerdialog gregress "dialog gregress"}{...} +{vieweralsosee "[R] gregress" "mansection R gregress"}{...} +{viewerjumpto "Syntax" "gregress##syntax"}{...} +{viewerjumpto "Description" "gregress##description"}{...} +{viewerjumpto "Methods and Formulas" "gregress##methods_and_formulas"}{...} +{viewerjumpto "Examples" "gregress##examples"}{...} +{title:Title} + +{p2colset 5 18 24 2}{...} +{p2col :{cmd:gregress} {hline 2}} OLS linear regressions by group with weights, clustering, and HDFE{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Warning}: {opt gregress} is in beta and meant for testing; use in production {bf:NOT} recommended. (To enable beta features, define {cmd:global GTOOLS_BETA = 1}.) + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt gregress} +{depvar} +{indepvars} +{ifin} +[{it:{help gregress##weight:weight}}] +[{cmd:,} {opth by(varlist)} {opth absorb(varlist)} {it:{help gregress##table_options:options}}] + +{pstd} +By default, results are saved into a mata class object named +{opt GtoolsRegress}. Run {opt mata GtoolsRegress.desc()} for +details; the name and contents can be modified via {opt mata()}.
+The results can also be saved into variables via {opt gen()} +or {opt prefix()} (either can be combined with {opt mata()}, but not +each other). + +{pstd} +Note that extended varlist syntax is {bf:not} supported. Further, +{opt fweight}s behave differently than other weighting schemes; that +is, this assumes that the weight refers to the number of available +{it:observations}. Other weights run WLS. + +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Save Results} +{synopt:{opt mata(name, [nob nose])}}Specify name of output mata object and whether to save {bf:b} and {bf:se} +{p_end} +{synopt:{opt gen(...)}}Specify any of {opth b(varlist)}, {opth se(varlist)}, and {opth hdfe(varlist)}. One per covariate is required ({opt hdfe()} also requires one for the dependent variable). +{p_end} +{synopt:{opt prefix(...)}}Specify any of {opth b(str)}, {opth se(str)}, and {opth hdfe(str)}. A single prefix is allowed. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} + +{syntab :Options} +{synopt:{opth by(varlist)}}Group statistics by variable. +{p_end} +{synopt:{opt robust}}Robust SE. +{p_end} +{synopt:{opth cluster(varlist)}}One-way or nested cluster SE. +{p_end} +{synopt:{opth absorb(varlist)}}Multi-way high-dimensional fixed effects. +{p_end} +{synopt:{opth hdfetol(real)}}Tolerance level for HDFE algorithm (default 1e-8). +{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm.
+{p_end} +{synopt:{opt noc:onstant}}Whether to add a constant (cannot be combined with {opt absorb()}). +{p_end} + +{syntab:Gtools} +{synopt:{opt compress}}Try to compress strL to str#. +{p_end} +{synopt:{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt:{opt v:erbose}}Print info during function execution. +{p_end} +{synopt:{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt:{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt:{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gregress} estimates a linear regression model via OLS, +optionally weighted, by group, with cluster SE, and/or with multi-way +high-dimensional fixed effects. The results are by default saved +into a mata object (default {opt GtoolsRegress}). 
Run {opt mata +GtoolsRegress.desc()} for details; the following data is stored: + + regression info + --------------- + + string scalar caller + model used; should be "gregress" + + real scalar kx + number of (non-absorbed) covariates + + real scalar cons + whether a constant was added automagically + + real scalar saveb + whether b was stored + + real matrix b + J by kx matrix with regression coefficients + + real scalar savese + whether se was stored + + real matrix se + J by kx matrix with corresponding standard errors + + string scalar setype + type of SE computed (homoskedastic, robust, or cluster) + + real scalar absorb + whether any FE were absorbed + + string colvector absorbvars + variables absorbed as fixed effects + + string colvector njabsorb + number of FE to be absorbed for each variable and by level + + string colvector savenjabsorb + whether njabsorb is stored + + string colvector clustervars + cluster variables + + string colvector njcluster + number of clusters per by level + + string colvector savenjcluster + whether njcluster is stored + + real scalar by + whether there were any grouping variables + + string rowvector byvars + grouping variable names + + real scalar J + number of levels defined by grouping variables + + class GtoolsByLevels ByLevels + grouping variable levels; see GtoolsRegress.ByLevels.desc() for details + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar ByLevels.anyvars + 1: any by variables; 0: no by variables + + real scalar ByLevels.anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector ByLevels.byvars + by variable names + + real scalar ByLevels.kby + number of by variables + + real scalar ByLevels.rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar ByLevels.J + number of levels + + real matrix ByLevels.numx + numeric by variables + + string matrix ByLevels.charx + string by variables + + real scalar
ByLevels.knum + number of numeric by variables + + real scalar ByLevels.kchar + number of string by variables + + real rowvector ByLevels.lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector ByLevels.map + map from index to numx and charx + +{marker methods_and_formulas}{...} +{title:Methods and Formulas} + +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#methods-and-formulas":online documentation} +for details. + +{marker example}{...} +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gregress} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/gregress.ado b/01.code/ado/g/gregress.ado new file mode 100755 index 0000000..555cd49 --- /dev/null +++ b/01.code/ado/g/gregress.ado @@ -0,0 +1,431 @@ +*!
version 0.2.1 14Apr2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Estimate linear regression via OLS by group and with HDFE + +* gregress: main entry point. Checks the beta flag, handles replay, parses +* the (optionally IV or GLM) regression syntax, marks the estimation sample, +* and calls _gtools_internal. On success r() holds N, J, minJ, maxJ, cmd, and +* mata. A handled early exit (no observations) is flagged via global GREG_RC. +capture program drop gregress +program gregress, rclass + + if !inlist(`"${GTOOLS_BETA}"', "1", "I KNOW WHAT I AM DOING") { + disp as err `"This function is in beta; to use, you must enable beta features via"' + disp as err `""' + disp as err `" global GTOOLS_BETA = "I KNOW WHAT I AM DOING""' + disp as err `""' + disp as err `"gtools functions in beta are subject to change."' + exit 198 + } + + if ( ("${GTOOLS_GREGTABLE}" == "1") & replay() ) { + Replay `0' + exit 0 + } + + version 13.1 + global GREG_RC 0 + global GTOOLS_CALLER gregress + + if ( `=_N < 1' ) { + global GREG_RC 17001 + di as txt "no observations" + exit 0 + } + + * syntax varlist(numeric ts fv) // a way to support this would be to filter it through mata + syntax anything(equalok) /// depvar indepvars + [if] [in] /// [if condition] [in start / end] + [aw fw pw] , /// [weight type = exp] + [ /// + by(str) /// Group by variables (unabbreviated below) + noMISSing /// Exclude groups with any missing values by level + Robust /// Robust SE + cluster(str) /// Cluster by varlist + absorb(varlist) /// Absorb each var in varlist as FE + glm /// estimate glm + family(str) /// glm family + IVregress /// IV regression + * /// Regress options + /// + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + debug(passthru) /// Print debugging info to console + ] + + disp as txt "{bf:warning:} gregress is beta software and meant for
testing." + disp as txt "Use in production is {bf:NOT} recommended; proceed with caution." + + if ( `"`missing'"' == "nomissing" ) local missing + else local missing missing + + if ( `"`by'"' != "" ) unab by: `by' + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + * Parse IV syntax + * --------------- + + * NOTE(mauricio): IV will only be allowed with input colmajor. + + * NOTE(mauricio): I put the instruments at the start so I can add a + * constant. I will only have one memory alloc to X and then point to + * ivz = X, ivendog = X + kz * N, ivexog = X + (kz + kendog) * N + + * NOTE(mauricio): Confirm var does not apparently allow for + * wildcards; expand before confirm var. A consequence of this is + * that the first variable listed is assumed to be the dependent + * variable. Warn the user this might not be their intended behavior + * if the first token passed is a wildcard. + + local ivok 0 + if regexm(`"`anything'"', ".+\((.+=.+)\)") { + + * Here I am rather inflexible with the notation. I think the + * danger of bugs from unforeseen mistakes are greater than the + * upside of flexible notation. + + local iveq = regexr(regexs(1), "\(|\)", "") + local ivexog = trim(regexr("`anything'", "\(.+=.+\)", "")) + + * In keeping with this idea, the syntax _must_ be + * + * depvar [exog] (endog = instrument) [exog] + * + * where depvar is a single variable and exog is optional.
+ + unab ivexog: `ivexog' + cap noi confirm var `ivexog' + if ( _rc ) { + disp as err "Error parsing IV syntax: No dependent variable detected" + exit 198 + } + + * Split "endog = instruments" at the equal sign. The second call + * must also parse on "=" so that only the equal sign is discarded; + * with the default (space) parser "(x=z1 z2)" would drop z1 along + * with the equal sign. + + gettoken ivendog ivinstruments: iveq, p(=) + gettoken _ ivinstruments: ivinstruments, p(=) + + unab ivinstruments: `ivinstruments' + cap noi confirm var `ivinstruments' + if ( _rc ) { + disp as err "Instruments required for IV" + exit 198 + } + + unab ivendog: `ivendog' + cap noi confirm var `ivendog' + if ( _rc ) { + disp as err "Endogenous covariates required for IV" + exit 198 + } + + * Note we expanded each set of variables above so this _should_ + * be an accurate count of how many variables there are. + + gettoken ivdepvar ivexog: ivexog + local ivkendog: list sizeof ivendog + local ivkexog: list sizeof ivexog + local ivkz: list sizeof ivinstruments + + * There is a slight issue here in that there is no collinearity + * check implemented _before_ the under-identification check. + * However, after the collinearity check if there are not enough + * instruments both beta and se are set to missing.
+ + if ( `ivkz' < `ivkendog' ) { + disp as error "Need at least as many instruments as endogenous variables (received `ivkz' < `ivkendog')" + exit 198 + } + + * Finally, you can't have a variable that is both + * + * - dependent variable _and_ instrumented + * - dependent variable _and_ instrument + * - dependent variable _and_ exogenous + * - instrumented _and_ instrument + * - instrumented _and_ exogenous + * - instrument _and_ exogenous + + local problems: list ivdepvar & ivendog + if ( `"`problems'"' != `""' ) { + disp as error "`problems' included as both regressand and endogenous variable" + exit 198 + } + + local problems: list ivdepvar & ivexog + if ( `"`problems'"' != `""' ) { + disp as error "`problems' included as both regressand and exogenous variable" + exit 198 + } + + local problems: list ivdepvar & ivinstruments + if ( `"`problems'"' != `""' ) { + disp as error "`problems' included as both regressand and instrument" + exit 198 + } + + local problems: list ivendog & ivexog + if ( `"`problems'"' != `""' ) { + disp as error "included as both an endogenous and exogenous variable: `problems'" + exit 198 + } + + local problems: list ivendog & ivinstruments + if ( `"`problems'"' != `""' ) { + disp as error "included as both an endogenous variable and an instrument: `problems'" + exit 198 + } + + local problems: list ivexog & ivinstruments + if ( `"`problems'"' != `""' ) { + disp as error "included as both an exogenous variable and an instrument: `problems'" + exit 198 + } + + * Note that each set of variables is passed, unabbreviated + * already, as options so that no further parsing is necessary. + + unab varlist: `ivdepvar' `ivendog' `ivexog' `ivinstruments' + local ivopts ivkendog(`ivkendog') ivkexog(`ivkexog') ivkz(`ivkz') + local ivregress ivregress + local ivok 1 + } + else { + + * Without IV, the only issue is that of unabbreviating the varlist.
+ unab varlist: `anything' + } + + * Parse rest of regression syntax + * ------------------------------- + + * gegen and gcollapse are better suited for implicit constant-only + * models. The user can also generate a variable of ones and request + * that the constant be suppressed. However, I will not allow an + * implicit constant-only model because it's annoying to code and has + * little added value. + + confirm var `varlist' + if ( `:list sizeof varlist' == 1 ) { + disp as err "constant-only models not allowed; varlist required" + exit 198 + } + + * If ivregress requested, implicitly or otherwise, then check the + * parsing. Again, hard stop because it's probably not good to be + * overly flexible. + + if ( (`ivok' == 0) & ("`ivregress'" != "") ) { + disp as err "Could not parse input into IV syntax" + exit 198 + } + + * NOTE(mauricio): ivpoisson is not as straightforward as adapting + * the poisson code, which iterates over OLS. Don't try it. + + local glmfamilies binomial poisson + local nglm: list sizeof family + + if ( (`"`glm'"' == "") & (`nglm' > 0) ) { + disp as err "Input error: GLM family requested without specifying glm" + exit 198 + } + + if ( (`"`glm'"' != "") & (`nglm' == 0) ) { + disp as err "Input error: GLM requires specifying model family()" + exit 198 + } + + if ( `nglm' > 1 ) { + disp as err "Input error: Cannot request multiple GLM models: `family'" + exit 198 + } + + if ( (`"`glm'"' != "") & (!`:list family in glmfamilies') ) { + disp as err "Input error: GLM family() must be one of: `glmfamilies'" + exit 198 + } + + if ( ("`ivregress'" != "") & `nglm' ) { + disp as err "Input error: IV and GLM (`family') requested at the same time" + exit 198 + } + + * TODO: xx the idea is to eventually allow other links even for the + * same family, I think + + if ( `"`family'"' == "binomial" ) local glmlink logit + if ( `"`family'"' == "poisson" ) local glmlink log + + * NOTE(mauricio): We always make a touse variable because we want + * to exclude missing
values in varlist. Furthermore, I think this + * is the place where we ought to exclude observations once the + * -dropsingletons- option is added (to drop singleton groups) + * and the program to automagically detect colinear groups (with + * multi-way hdfe). + + if ( `"`weight'"' != "" ) { + tempvar touse w + qui gen double `w' `exp' `if' `in' + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + mark `touse' `if' `in' `wgt' + markout `touse' `varlist' `cluster' `absorb', strok + local if if `touse' + } + else { + local weights + local _varlist: copy local varlist + local varlist `varlist' `cluster' `absorb' + marksample touse, strok + local varlist: copy local _varlist + local if if `touse' + } + + * binary models require the variable be 0/1: + + if ( `"`family'"' == "binomial" ) { + gettoken y x: varlist + qui count if !inlist(`y', 0, 1) & `touse' + if ( `r(N)' > 0 ) { + disp as err "`y' must be binary (0/1)" + exit 198 + } + } + + * Recall that the poisson model is a count model, so the variable + * must be a natural number (i.e. non-negative integer). However, I + * think I ought to allow users to have non-count variables if they + * deem it necessary---the warning should be enough. The algorithm + * fails, however, with negative numbers, so that _is_ a hard stop. + + if ( `"`family'"' == "poisson" ) { + gettoken y x: varlist + qui count if (`y' < 0) & `touse' + if ( `r(N)' > 0 ) { + disp as err "`y' must be non-negative" + exit 198 + } + qui count if (`y' != int(`y')) & `touse' + if ( `r(N)' > 0 ) { + disp as txt "{bf:note} you are responsible for interpretation of non-count dep. variable" + } + } + + * NOTE(mauricio): I don't think this warning is necessary anymore. The + * main issue is no longer the collinearity check taking forever (it's + * fairly quick now) but with the X' X matrix multiplication. It's + * _very_ slow and the main bottleneck, but not unreasonably slow given + * the other speed gains.
+ * + * local kall: list sizeof varlist + * local ratio = log(1e10 / _N)^2 + * if ( `kall' > max(`ratio', 16) ) { + * disp as txt "{bf:beta warning}: 'wide' model (large # of regressors) detected; performance may suffer" + * } + + * Standard call to internals + * -------------------------- + + local options `options' `robust' cluster(`cluster') absorb(`absorb') glmfam(`family') glmlink(`glmlink') + local opts `weights' `compress' `forcestrl' nods unsorted `missing' + local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance' + local opts `opts' `oncollision' `hashmethod' `debug' + local greg gfunction(regress) gregress(`varlist', `options' `ivopts') + + cap noi _gtools_internal `by' `if' `in', `opts' `greg' + local rc = _rc + global GTOOLS_CALLER "" + + * Cleanup + * ------- + + if ( `rc' == 17999 ) { + exit 17000 + } + else if ( `rc' == 18401 ) { + exit 2001 + } + else if ( `rc' == 17001 ) { + global GREG_RC 17001 + di as txt "(no observations)" + exit 0 + } + else if ( `rc' ) exit `rc' + + * Returns + * ------- + + return scalar N = `r(N)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' + return local cmd = "gregress" + return local mata: copy local saveGregressMata + + if ( "${GTOOLS_GREGTABLE}" == "1" ) Display `saveGregressMata', touse(`touse') +end + +* Replay: re-display the table of the last gregress run. Relies on r(cmd) +* and r(mata) left behind by that run; any options typed by the user are +* forwarded to Display. +capture program drop Replay +program Replay, eclass + * Copy what is needed from r() first, then parse the replay options so + * that `options' is actually defined when forwarded below. + local cmd `"`r(cmd)'"' + local mata `"`r(mata)'"' + syntax [, *] + if ( (`"`cmd'"' != "gregress") | (`"`mata'"' == "") ) error 301 + Display `mata', repost `options' +end + +* Display: post (or repost) b and V from the mata results object named in +* namelist to e() and print the coefficient table. Only available when the +* regression was run without by(). +capture program drop Display +program Display, eclass + syntax [namelist(max = 1)], [repost touse(str) *] + tempname by + if ( "`namelist'" == "" ) { + disp as txt "Cannot display table without cached results; use option -mata()- to save" + } + else { + mata st_numscalar("`by'", `namelist'.by) + if ( `=scalar(`by')' == 0) { + tempname colnames sel nmiss + FreeMatrix b V + mata st_local("caller", `namelist'.caller) + mata st_local("setype", `namelist'.setype) + mata st_matrix("`b'",
`namelist'.b[1, .]) + mata st_matrix("`V'", `namelist'.Vcov) + mata `colnames' = `namelist'.xvarlist, J(1, `namelist'.cons, "_cons") + mata `nmiss' = missing(`namelist'.se) + mata `sel' = selectindex(`namelist'.se :>= .) + mata `colnames'[`sel'] = J(1, `nmiss', "o.") :+ `colnames'[`sel'] + mata st_matrixcolstripe("`b'", (J(cols(`colnames'), 1, ""), `colnames'')) + mata st_matrixrowstripe("`b'", ("", `namelist'.yvarlist[1])) + mata st_matrixcolstripe("`V'", (J(cols(`colnames'), 1, ""), `colnames'')) + mata st_matrixrowstripe("`V'", (J(cols(`colnames'), 1, ""), `colnames'')) + * The mata helpers above are global objects; drop them so they do + * not linger after the table is built. + cap mata mata drop `colnames' `sel' `nmiss' + if "`repost'" == "" { + * Only pass esample() when a sample marker was supplied. + local esample + if ( "`touse'" != "" ) { + qui count if `touse' + local esample esample(`touse') + } + else qui count + ereturn post `b' `V', `esample' obs(`r(N)') + } + else { + ereturn repost b = `b' V = `V' + } + if ( "`setype'" == "cluster" ) ereturn local vcetype "Cluster" + if ( "`setype'" == "robust" ) ereturn local vcetype "Robust" + if ( "`setype'" != "homoskedastic" ) ereturn local vce "`setype'" + disp _n(1) "`caller' with `setype' SE" + _coef_table, `options' + } + else { + disp as txt "Cannot display table with by(); use {stata mata `namelist'.print()}" + } + } +end + +* FreeMatrix: for each name passed, find the next Gtools# (counter shared +* across names) that is not an existing Stata matrix and store that matrix +* name in the caller's local macro of the given name (via c_local). +capture program drop FreeMatrix +program FreeMatrix + local FreeCounter 0 + foreach FM of local 0 { + * Reset _rc so the search loop runs at least once per name. + cap error 0 + while ( _rc == 0 ) { + cap confirm matrix Gtools`++FreeCounter' + c_local `FM' Gtools`FreeCounter' + } + } +end diff --git a/01.code/ado/g/gregress.sthlp b/01.code/ado/g/gregress.sthlp new file mode 100755 index 0000000..496cae9 --- /dev/null +++ b/01.code/ado/g/gregress.sthlp @@ -0,0 +1,271 @@ +{smcl} +{* *!
version 0.1.1 14Apr2020}{...} +{viewerdialog gregress "dialog gregress"}{...} +{vieweralsosee "[R] gregress" "mansection R gregress"}{...} +{viewerjumpto "Syntax" "gregress##syntax"}{...} +{viewerjumpto "Description" "gregress##description"}{...} +{viewerjumpto "Methods and Formulas" "gregress##methods_and_formulas"}{...} +{viewerjumpto "Examples" "gregress##examples"}{...} +{title:Title} + +{p2colset 5 18 24 2}{...} +{p2col :{cmd:gregress} {hline 2}} OLS linear regressions by group with weights, clustering, and HDFE{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Warning}: {opt gregress} is in beta and meant for testing; use in production {bf:NOT} recommended. (To enable beta features, define {cmd:global GTOOLS_BETA = 1}.) + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt gregress} +{depvar} +{indepvars} +{ifin} +[{it:{help gregress##weight:weight}}] +[{cmd:,} {opth by(varlist)} {opth absorb(varlist)} {it:{help gregress##table_options:options}}] + +{pstd} +By default, results are saved into a mata class object named +{opt GtoolsRegress}. Run {opt mata GtoolsRegress.desc()} for +details; the name and contents can be modified via {opt mata()}. +The results can also be saved into variables via {opt gen()} +or {opt prefix()} (either can be combined with {opt mata()}, but not +each other). + +{pstd} +Note that extended varlist syntax is {bf:not} supported. Further, +{opt fweight}s behave differently than other weighting schemes; that +is, this assumes that the weight refers to the number of available +{it:observations}. Other weights run WLS.
+ +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Save Results} +{synopt:{opt mata(name, [nob nose])}}Specify name of output mata object and whether to save {bf:b} and {bf:se} +{p_end} +{synopt:{opt gen(...)}}Specify any of {opth b(varlist)}, {opth se(varlist)}, and {opth hdfe(varlist)}. One per covariate is required ({opt hdfe()} also requires one for the dependent variable). +{p_end} +{synopt:{opt prefix(...)}}Specify any of {opth b(str)}, {opth se(str)}, and {opth hdfe(str)}. A single prefix is allowed. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} + +{syntab :Options} +{synopt:{opth by(varlist)}}Group statistics by variable. +{p_end} +{synopt:{opt robust}}Robust SE. +{p_end} +{synopt:{opth cluster(varlist)}}One-way or nested cluster SE. +{p_end} +{synopt:{opth absorb(varlist)}}Multi-way high-dimensional fixed effects. +{p_end} +{synopt:{opth hdfetol(real)}}Tolerance level for HDFE algorithm (default 1e-8). +{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm. +{p_end} +{synopt:{opt noc:onstant}}Whether to add a constant (cannot be combined with {opt absorb()}). +{p_end} + +{syntab:Gtools} +{synopt:{opt compress}}Try to compress strL to str#. +{p_end} +{synopt:{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt:{opt v:erbose}}Print info during function execution.
+{p_end} +{synopt:{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt:{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt:{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gregress} estimates a linear regression model via OLS, +optionally weighted, by group, with cluster SE, and/or with multi-way +high-dimensional fixed effects. The results are by default saved +into a mata object (default {opt GtoolsRegress}). Run {opt mata +GtoolsRegress.desc()} for details; the following data is stored: + + regression info + --------------- + + string scalar caller + model used; should be "gregress" + + real scalar kx + number of (non-absorbed) covariates + + real scalar cons + whether a constant was added automagically + + real scalar saveb + whether b was stored + + real matrix b + J by kx matrix with regression coefficients + + real scalar savese + whether se was stored + + real matrix se + J by kx matrix with corresponding standard errors + + string scalar setype + type of SE computed (homoskedastic, robust, or cluster) + + real scalar absorb + whether any FE were absorbed + + string colvector absorbvars + variables absorbed as fixed effects + + string colvector njabsorb + number of FE to be absorbed for each variable and by level + + string colvector savenjabsorb + whether njabsorb is stored + + string colvector clustervars + cluster variables + + string colvector njcluster + number of clusters per by level + + string colvector savenjcluster + whether njcluster is stored + + real scalar by + whether there were any grouping variables + + string rowvector byvars + grouping variable names
+ + real scalar J + number of levels defined by grouping variables + + class GtoolsByLevels ByLevels + grouping variable levels; see GtoolsRegress.ByLevels.desc() for details + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar ByLevels.anyvars + 1: any by variables; 0: no by variables + + real scalar ByLevels.anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector ByLevels.byvars + by variable names + + real scalar ByLevels.kby + number of by variables + + real scalar ByLevels.rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar ByLevels.J + number of levels + + real matrix ByLevels.numx + numeric by variables + + string matrix ByLevels.charx + string by variables + + real scalar ByLevels.knum + number of numeric by variables + + real scalar ByLevels.kchar + number of string by variables + + real rowvector ByLevels.lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector ByLevels.map + map from index to numx and charx + +{marker methods_and_formulas}{...} +{title:Methods and Formulas} + +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#methods-and-formulas":online documentation} +for details. + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#examples":online documentation} +for examples.
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gregress} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/gregress/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/greshape.ado b/01.code/ado/g/greshape.ado new file mode 100755 index 0000000..ddec7c6 --- /dev/null +++ b/01.code/ado/g/greshape.ado @@ -0,0 +1,2915 @@ +*! version 0.5.0 26Jan2020 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Fast implementation of reshape using C plugins + +capture program drop greshape +program greshape, rclass + version 13.1 + + if ( inlist(`"`1'"', "clear", "query", "error", "i", "xij", "j", "xi") ) { + disp as err "-reshape `1'- syntax is not supported; see {help greshape:help greshape}" _n + picture err cmd + exit 198 + } + + if ( inlist(`"`1'"', "") ) { + disp as err `"Nothing to do.
Specify long or gather, wide or spread"' _n + picture err cmd + exit 198 + } + else if ( !inlist(`"`1'"', "long", "wide", "gather", "spread") ) { + disp as err `"Unknown subcommand '`1''; supported: long or gather, wide or spread"' _n + picture err cmd + exit 198 + } + + if ( `=_N' == 0 ) { + disp "(no observations)" + exit + } + + *********************************************************************** + * Reshape wide or long * + *********************************************************************** + + if ( inlist(`"`1'"', "wide", "spread") ) { + local cmd Wide + } + else if ( inlist(`"`1'"', "long", "gather") ) { + local cmd Long + } + else { + disp as err `"Unknown subcommand '`1''; supported: long or gather, wide or spread"' _n + picture err cmd + exit 198 + } + + * ---------------------------------- + * Handle wide/spread and long/gather + * ---------------------------------- + + ClearReshape + global GTOOLS_PARSE /// + unsorted /// Do not sort the data + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// Print info during function execution + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// error|fallback: On collision, use native command or throw error + debug(passthru) // Print debugging info to console + + * gettoken sub args: 0 + cap noi `cmd' `*' + local rc = _rc + if ( `rc' == 17999 ) { + CleanExit + if inlist( `"`1'"', "spread", "gather") { + di as err `"Cannot use fallback with `1'"' + exit 17000 + } + else { + reshape `0' + exit + } + } + else if ( `rc' == 17001 ) { + di as txt "(no observations)" + local rc 0 + } + else if ( `rc' == 18101 ) { + NonUniqueLongID + local rc 9 + } + else if ( `rc' == 18102 ) { + NonUniqueWideJ + local rc 
9 + } + else if ( `rc' == 18103 ) { + NonUniqueWideXi + local rc 9 + } + CleanExit + exit `rc' +end + +* L | W | G +* X | X | ReS_Xij +* X | X | ReS_str +* X | | ReS_uselabels +* X | X | ReS_i +* X | X | ReS_j +* X | X | ReS_jname +* X | X | ReS_nodupcheck +* X | X | ReS_nomisscheck +* X | X | ReS_cmd +* | X | ReS_prefix +* | X | ReS_labelformat +* X | X | ReS_Xij_stubs +* X | | ReS_Xij_regex +* X | | ReS_Xij_add +* X | X | ReS_Xij_keep +* X | X | ReS_Xij_keepnames +* X | X | ReS_Xij_names +* | X | ReS_Xij_addtypes +* | X | ReS_Xij_addvars +* X | X | ReS_atwl +* X | X | ReS_match +* X | X | ReS_jfile +* | X | ReS_jcode +* X | X | ReS_jlen +* X | X | ReS_jv +* X | X | ReS_jv2 +* X | X | ReS_Xi +* X | X | S_1 +* ? | ? | S_1_full +* ? | ? | S_2 +* X | X | rVANS + +* --------------------------------------------------------------------- +* Reshape long + +capture program drop Long +program define Long /* reshape long */ + + *********************************************************************** + * Parse Long Syntax * + *********************************************************************** + + gettoken ReS_cmd 0: 0 + global ReS_cmd: copy local ReS_cmd + global ReS_jname j + global ReS_iname i + + local long_opts /// + [ /// + by(varlist) /// reshape by groups of -by()- + i(varlist) /// reshape by groups of -i()- + j(name) String /// varnames by levels -j()-; look for string-like names + KEYs(name) /// varnames by levels -key()- + xi(str) /// Handle extraneous variables + fast /// Do not preserve and restore the original dataset. Saves speed + nochecks /// Do not do any checks + CHECKlevel(real 4) /// Check level + DROPMISSing /// Drop missing values from reshape + nodupcheck /// Do not check for duplicates + nomisscheck /// Do not check for missing values or blanks in j + match(str) /// a string (e.g. 
@) to match or 'regex' + /// with match(regex), stubs must be of the form + /// + /// regex/# regex/# + /// + /// where # is the group to be captured for the levels of j; the + /// default is 1. If no groups are found, the level is assumed to + /// be stub suffix (as would be the case w/o regex). For example: + /// + /// st(.+)ub stub (foo|bar)stub([0-9]+)(alice|bob)/2 + /// + /// is almost the the same as + /// + /// st@ub stub@ foostub@alice barstub@bob foostub@alice barstub@bob + /// + /// The only difference is that @ in the latter 4 stubs will only + /// match numbers. Note that several parts of a variable name + /// match the group, the first match will be replaced even if the + /// user captures both separately. If this is really a concern, + /// you can specify ustrregex (Stata 14 and above only) and use + /// lookarounds: + /// + /// (?<=(foo|bar)[0-9]{0,2}stub)([0-9]+)(?=alice|bob) + /// + /// everything other than the levels to match to j must be captured + /// in a lookbehind or lookahead. Note Stata does not support matches + /// of indeterminate length inside lookarounds (this is a limitation + /// that is not uncommon across several regex implementations). + /// + ${GTOOLS_PARSE} /// + ] + + local gather_opts /// + VALUEs(name) /// out variaable name + [ /// + by(varlist) /// reshape by groups of -i()- + i(varlist) /// reshape by groups of -i()- + j(name) /// varnames by levels -j()- + KEYs(name) /// varnames by levels -key()- + xi(str) /// Handle extraneous variables + fast /// Do not preserve and restore the original dataset. 
Saves speed + DROPMISSing /// Drop missing values from reshape + USELabels /// Use labels as values instead of variable names + USELabelsvars(str) /// Use labels as values instead of variable names + ${GTOOLS_PARSE} /// + ] + + syntax anything, ``ReS_cmd'_opts' + local key: copy local keys + + * ------------------ + * Parse i, j aliases + * ------------------ + + if ( (`"`by'"' == "") & (`"`i'"' == "") & (`"`ReS_cmd'"' == "long") ) { + disp as err "option {opt i()} (id variable) required" + exit 198 + } + + if ( (`"`by'"' != "") & (`"`i'"' != "") ) { + disp as err "i() and by() are aliases for the same option; use only one" + exit 198 + } + else if ( `"`by'"' != "" ) { + global ReS_iname by + local i: copy local by + } + + if ( (`"`key'"' != "") & (`"`j'"' != "") ) { + disp as err "j() and key() are aliases for the same option; use only one" + exit 198 + } + else if ( `"`key'"' != "" ) { + global ReS_jname key + local j: copy local key + } + + * ------------------- + * Parse other options + * ------------------- + + c_local 0 `ReS_cmd' `anything', i(`i') j(`j') `string' + + if ( `"`checklevel'"' == "" ) local checklevel 4 + if ( `"`values'"' == "" ) local values: copy local anything + + if ( `checklevel' > 3 ) { + } + else if ( `checklevel' > 2 ) { + local misscheck nomisscheck + } + else if ( `checklevel' > 1 ) { + local fast fast + local misscheck nomisscheck + } + else if ( `checklevel' > 0 ) { + local fast fast + local unsorted unsorted + local misscheck nomisscheck + } + else if ( `checklevel' == 0 ) { + local checks nochecks + } + + if ( `"`checks'"' == "nochecks" ) { + local fast fast + local dupcheck nodupcheck + local unsorted unsorted + local misscheck nomisscheck + } + + if ( `"`ReS_cmd'"' == "gather" ) { + local dupcheck nodupcheck + local unsorted unsorted + local misscheck nomisscheck + local string string + global ReS_jname key + } + + if ( "`fast'" == "" ) preserve + + unab oldlist: _all + if ( `"`i'"' == "" ) { + unab anything: `anything' + local 
restvars: list oldlist - anything + local restvars: list restvars - j + local i: copy local restvars + if ( `"`xi'"' != "" ) { + disp as txt "(note: -xi()- ignored without -$ReS_iname()-)" + } + } + else { + unab i: `i' + } + + if ( `"`j'"' == "" ) local j _$ReS_jname + + if ( `"`match'"' == "" ) local match @ + + if ( `"`uselabelsvars'"' != "" ) { + local uselabels uselabels + } + else if ( `"`uselabels'"' != "" ) { + unab uselabelsvars: _all + } + + global ReS_uselabels: copy local uselabelsvars + global ReS_str = ( `"`string'"' != "" ) + global ReS_atwl `atwl' + global ReS_match `match' + global ReS_Xij `values' + global ReS_i `i' + global ReS_j `j' + + * This defines $ReS_Xij_stubs and potentially overwrites ReS_Xij + ParseStubsByMatch long + + * Validate gather: targets must be existing variables and, unless xi(drop) was given, every remaining variable must be an id + if ( `"`ReS_cmd'"' == "gather" ) { + unab ReS_Xij_names: `anything' + cap noi confirm var `ReS_Xij_names' + if _rc { + disp as err "greshape gather requires explicit variable names" + exit 198 + } + global ReS_Xij_names: copy local ReS_Xij_names + + local restvars: list oldlist - ReS_Xij_names + local restvars: list restvars - j + cap assert `:list i == restvars' + if ( _rc & (`"`xi'"' != "drop") ) { + disp as err "greshape gather does not allow extraneous variables (Xi):" + disp as err "" + disp as err " Xij -> `ReS_Xij_names'" + disp as err " $ReS_iname -> `i'" + disp as err " Xi -> `:list restvars - i'" + disp as err "" + disp as err "Specify xi(drop), leave $ReS_iname() blank, or include Xi somewhere in the reshape."
+ exit 198 + } + + if ( `:list sizeof values' > 1 ) { + disp as err "values() must be a new variable name" + exit 198 + } + } + + local opts `unsorted' /// + `compress' /// + `forcestrl' /// + `verbose' /// + `_ctolerance' /// + `benchmark' /// + bench(`benchmarklevel') /// + `oncollision' /// + `hashmethod' /// + `debug' + + global ReS_nodupcheck = ( `"`dupcheck'"' == "nodupcheck" ) + global ReS_nomisscheck = ( `"`misscheck'"' == "nomisscheck" ) + if ( `"`ReS_cmd'"' != "gather" ) { + if ( "`unsorted'" == "unsorted" ) { + if ( $ReS_nodupcheck ) { + disp as txt "(note: reshape left unsorted; duplicates check is skipped)" + } + else { + disp as txt "(note: reshape left unsorted; original order not preserved)" + } + } + else { + if ( $ReS_nodupcheck ) { + disp as txt "(note: reshape will be sorted; -nodupcheck- ignored)" + } + } + } + + local oldobs = _N + quietly describe, short + local oldvars = r(k) + + *********************************************************************** + * Macros and J values * + *********************************************************************** + + Macros long + confirm var $ReS_i $ReS_Xi + capture confirm new var $ReS_j + if ( _rc ) { + di in blu "Target $ReS_jname($ReS_j) already exists (is the data already long?)" + exit 198 + } + + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile ReS_jfile + } + else { + GreshapeTempFile ReS_jfile + } + global ReS_jfile `ReS_jfile' + scalar __greshape_jfile = length(`"`ReS_jfile'"') + 1 + + GetJLevels + + confirm var $ReS_i $ReS_Xi + if ( $ReS_str ) { + local string str($ReS_jlen) + local jtype str$ReS_jlen + } + else { + local string str(0) + local jtype long + } + + * xi() accepts keep (the default) or drop; drop clears the extraneous variable list + if ( inlist(`"`xi'"', "keep", "") ) { + } + else if ( `"`xi'"' == "drop" ) { + global ReS_Xi + } + else { + disp as err `"Invalid syntax -xi(`xi')-; specify keep or drop"' + exit 198 + } + + if ( `"`ReS_cmd'"' == "gather" & `"${ReS_Xi}"' != "" ) { + disp as err 
"Error parsing varlist.
xi() should be blank" + exit 198 + } + + GetXiTypes + CopyScalars + + *********************************************************************** + * Do the reshape * + *********************************************************************** + + * ------------------------ + * Reshape the data to disk + * ------------------------ + + if ( $ReS_nodupcheck ) local cmd long fwrite + else local cmd long write + + if ( `benchmarklevel' > 0 | `"`benchmark'"' != "" ) disp as txt "Writing reshape to disk:" + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile ReS_Data + } + else { + GreshapeTempFile ReS_Data + } + mata: __greshape_w2l_meta = WideToLongMetaSave() + global GTOOLS_CALLER greshape + local gopts xij($ReS_Xij_names) xi($ReS_Xi) f(`ReS_Data') `string' `dropmissing' + local gopts greshape(`cmd', `gopts') gfunction(reshape) `opts' + cap noi _gtools_internal ${ReS_i}, `gopts' missing + global GTOOLS_CALLER "" + if ( _rc ) exit _rc + + * ---------------------------- + * Allocate space for long data + * ---------------------------- + + FreeTimer + if ( `FreeTimer' ) timer on `FreeTimer' + keep $ReS_i $ReS_Xij_keep $ReS_Xi + * disp "debug: ($ReS_Xij_keep) ($ReS_Xij_keepnames)" + mata __greshape_addtypes = ("`jtype'", J(1, `:word count $ReS_Xij_add', "double")) + mata __greshape_addvars = "$ReS_j", tokens(st_global("ReS_Xij_add")) + mata (void) st_addvar(__greshape_addtypes, __greshape_addvars, 0) + if ( (`"$ReS_Xij_keep"' != "") &(`"$ReS_Xij_keepnames"' != "") ) { + rename ($ReS_Xij_keep) ($ReS_Xij_keepnames) + } + order $ReS_i $ReS_j $ReS_Xij_stubs $ReS_Xi + if ( `"`dropmissing'"' != "" ) { + * disp as txt "({bf:warning:} -dropmiss- will remove IDs with all missing values)" + if ( `=scalar(__gtools_greshape_nrows)' <= `=_N' ) { + qui keep in 1 / `=scalar(__gtools_greshape_nrows)' + } + else { + qui set obs `=scalar(__gtools_greshape_nrows)' + } + } + else { + qui set obs `=_N * scalar(__greshape_klvls)' + * qui expand `=scalar(__greshape_klvls)' + } + if ( `FreeTimer' ) { + qui 
timer off `FreeTimer' + qui timer list + local s `:disp %9.3f `r(t`FreeTimer')'' + if ( `benchmarklevel' > 2 ) { + disp _char(9) "reshape long step 4: allocated target dataset; `s' seconds." + } + timer clear `FreeTimer' + } + else if ( `benchmarklevel' > 2 ) { + disp _char(9) "reshape long step 4: allocated target dataset; ??? seconds." + } + + * ------------------ + * Read reshaped data + * ------------------ + + if ( `benchmarklevel' > 0 | `"`benchmark'"' != "" ) disp as txt _n "Reading reshape from disk:" + local cmd long read + global GTOOLS_CALLER greshape + local gopts j($ReS_j) xij($ReS_Xij_stubs) xi($ReS_Xi) f(`ReS_Data') `string' + local gopts greshape(`cmd', `gopts') gfunction(reshape) `opts' + cap noi _gtools_internal ${ReS_i}, `gopts' missing + global GTOOLS_CALLER "" + if ( _rc ) exit _rc + + * ---------------------------------------- + * Finish in the same style as reshape.Long + * ---------------------------------------- + + cap disp bsubstr(" ", 1, 1) + if ( _rc ) local substr substr + else local substr bsubstr + + /* Apply J value label and to variable label for LONG Format*/ + local isstr: copy global ReS_str + local labn : char _dta[__JValLabName] + if `"`labn'"' != "" & `"`isstr'"' == "0" { + local lab : char _dta[__JValLab] + capture label define `labn' `lab' + label values $ReS_j `labn' + char define _dta[__JValLab] `""' + char define _dta[__JValLabName] `""' + } + + local jvlab : char _dta[__JVarLab] + if `"`jvlab'"' != "" { + label variable $ReS_j `"`jvlab'"' + char define _dta[__JVarLab] `""' + } + + * -------------------------------------------------- + * TODO: Is this of any value? 
Done in WideToLongMeta + * -------------------------------------------------- + * /* Apply Xij variable label for LONG*/ + * local iii : char _dta[__XijVarLabTotal] + * if `"`iii'"' == "" { + * local iii = -1 + * } + * foreach var of global ReS_Xij_stubs { + * local var = subinstr(`"`var'"', `"$ReS_match"', "$ReS_atwl", 1) + * if (length(`"`var'"') < 21 ) { + * local xijlab : char _dta[__XijVarLab`var'] + * if `"`xijlab'"' != "" { + * label variable `var' `"`xijlab'"' + * char define _dta[__XijVarLab`var'] `""' + * } + * } + * else { + * local ii = 1 + * while `ii' <= `iii' { + * local xijlab : char _dta[__XijVarLab`ii'] + * if (`"`xijlab'"' != "") { + * local v = /// + * `substr'(`"`xijlab'"',1, /// + * strpos(`"`xijlab'"', " ")-1) + * if `"`v'"' == `"`var'"' { + * local tlab : /// + * subinstr local /// + * xijlab `"`v' "' "" + * capture label variable /// + * `var' `"`tlab'"' + * capture char define /// + * _dta[__XijVarLab`ii'] `""' + * continue, break + * } + * } + * local ii = `ii' + 1 + * } + * } + * } + * -------------------------------------------------- + + ReportL `oldobs' `oldvars' + mata: WideToLongMetaApply(__greshape_w2l_meta) + + if ( "`fast'" == "" ) restore, not +end + +* --------------------------------------------------------------------- +* Reshape wide + +capture program drop Wide +program define Wide /* reshape wide */ + + *********************************************************************** + * Parse Wide Syntax * + *********************************************************************** + + gettoken ReS_cmd 0: 0 + global ReS_cmd: copy local ReS_cmd + global ReS_jname j + global ReS_iname i + + local wide_opts /// + [ /// + i(varlist) /// reshape by groups of -i()- + by(varlist) /// reshape by groups of -by()- + j(varlist) /// varnames by levels -j()- + KEYs(varlist) /// varnames by levels -key()- + String /// look for string-like names + COLSeparate(str) /// Columns sepparator for levels of j + xi(str) /// Handle extraneous variables + 
fast /// Do not preserve and restore the original dataset. Saves speed + nochecks /// Do not do any checks + CHECKlevel(real 4) /// Check level + nomisscheck /// Do not check for missing values or blanks in j + match(str) /// a string (e.g. @) to match + LABELFormat(str) /// label format; default is '#keyvalue# #stublabel#' + prefix(str) /// a list with the variable prefix format. default + /// + /// #stub# [#stub# #blank# ...] + /// + /// where #stub# simply uses the stub as the variable prefix. + /// @ syntax allowed. Examples of valid prefixes: + /// + /// #stub# combo#stub# #stub#combo prefix mid@dle @suffix + /// + /// + ${GTOOLS_PARSE} /// + ] + + local spread_opts /// + [ /// + j(varlist) /// varnames by levels -j()- + KEYs(varlist) /// varnames by levels -key()- + COLSeparate(str) /// Columns sepparator for levels of j + by(varlist) /// reshape by groups of -by()- + i(varlist) /// reshape by groups of -i()- + xi(str) /// Handle extraneous variables + LABELFormat(str) /// label format; default is '#keyvalue# #stublabel#' + prefix(str) /// a list with the variable prefix format + fast /// Do not preserve and restore the original dataset. 
Saves speed + ${GTOOLS_PARSE} /// + ] + + syntax anything(everything), ``ReS_cmd'_opts' + local key: copy local keys + + * ------------------ + * Parse i, j aliases + * ------------------ + + if ( (`"`by'"' == "") & (`"`i'"' == "") & (`"`ReS_cmd'"' == "wide") ) { + disp as err "option {opt i()} (grouping variable) required" + exit 198 + } + + if ( (`"`by'"' != "") & (`"`i'"' != "") ) { + disp as err "i() and by() are aliases for the same option; use only one" + exit 198 + } + else if ( `"`by'"' != "" ) { + global ReS_iname by + local i: copy local by + } + + if ( (`"`key'"' == "") & (`"`j'"' == "") ) { + if ( `"`ReS_cmd'"' == "spread" ) { + disp as err "option {opt key:s()} required" + } + else { + disp as err "option {opt j()} (keys) required" + } + exit 198 + } + if ( (`"`key'"' != "") & (`"`j'"' != "") ) { + disp as err "j() and keys() are aliases for the same option; use only one" + exit 198 + } + else if ( `"`key'"' != "" ) { + global ReS_jname keys + local j: copy local key + } + + * ------------------- + * Parse other options + * ------------------- + + c_local 0 `ReS_cmd' `anything', i(`i') j(`j') `string' + if ( `"`checklevel'"' == "" ) local checklevel 4 + + if ( `checklevel' > 3 ) { + } + else if ( `checklevel' > 2 ) { + local misscheck nomisscheck + } + else if ( `checklevel' > 1 ) { + local fast fast + local misscheck nomisscheck + } + else if ( `checklevel' > 0 ) { + local fast fast + local unsorted unsorted + local misscheck nomisscheck + } + else if ( `checklevel' == 0 ) { + local checks nochecks + } + + if ( `"`checks'"' == "nochecks" ) { + local fast fast + local unsorted unsorted + local misscheck nomisscheck + } + + if ( `"`ReS_cmd'"' == "spread" ) { + local misscheck nomisscheck + global ReS_jname keys + } + + if ( "`fast'" == "" ) preserve + + if ( `"`match'"' == "" ) local match @ + + if ( `"`labelformat'"' == "" ) { + local labelformat #keyvalue# #stublabel# + } + else if ( `:list sizeof j' > 1 ) { + disp as txt "(warning: labelformat() 
ignored with multiple $ReS_jname() variables)" + } + + global ReS_atwl `atwl' + global ReS_match `match' + global ReS_Xij `anything' + global ReS_Xij_k `:list sizeof anything' + global ReS_jsep: copy local colseparate + global ReS_labelformat: copy local labelformat + + * This defines $ReS_Xij_stubs and potentially overwrites ReS_Xij + ParseStubsByMatch wide + + * This is mainly for spread; i are all the excluded variables + unab oldlist: _all + unab ReS_Xij_stubs: $ReS_Xij_stubs + local restvars: list oldlist - ReS_Xij_stubs + local restvars: list restvars - j + if ( `"`i'"' == "" ) { + local i: copy local restvars + if ( `"`xi'"' != "" ) { + disp as txt "(note: -xi()- ignored without -$ReS_iname()-)" + } + } + else { + unab i: `i' + } + + global ReS_j `j' + global ReS_i `i' + + * If there are multiple prefixes, you must specify the same number + * of prefixes as stubs; otherwise the prefix is taken for every + * variable + + local ReS_Xij_stubs: copy global ReS_Xij_stubs + local ReS_prefix: copy local prefix + + local k1: list sizeof prefix + local k2: list sizeof ReS_Xij_stubs + if ( `k1' > 1 ) { + if ( `k1' != `k2' ) { + disp as err `"mismatch: `k1' prefixes for `k2' stubs"' + exit 198 + } + } + else if ( (`k1' == 1) & (`k2' > 1) ) { + local ReS_prefix + forvalues kk = 1 / `k2' { + local ReS_prefix `ReS_prefix' `prefix' + } + } + global ReS_prefix: copy local ReS_prefix + + * Cannot have multiple stubs with the same name + if ( `"`:list uniq ReS_Xij_stubs'"' != `"`ReS_Xij_stubs'"' ) { + disp as err `"repeated variables not allowed"' + exit 198 + } + + * Check that the spread call is sane + + if ( `"`ReS_cmd'"' == "spread" ) { + cap assert `:list i == restvars' + if ( _rc & (`"`xi'"' != "drop") ) { + disp as err "greshape spread does not allow extraneous variables (Xi):" + disp as err "" + disp as err " Xij -> $ReS_Xij" + disp as err " $ReS_jname -> $ReS_j" + disp as err " $ReS_iname -> $ReS_i" + disp as err " Xi -> `:list restvars - i'" + disp as err "" + disp 
as err "Specify xi(drop), leave $ReS_iname() blank, or include Xi somewhere in the reshape." + exit 198 + } + } + + * gtools options! + + cap confirm str var `j' + global ReS_str = (_rc == 0) + + local opts `unsorted' /// + `compress' /// + `forcestrl' /// + `verbose' /// + `_ctolerance' /// + `benchmark' /// + bench(`benchmarklevel') /// + `oncollision' /// + `hashmethod' /// + `debug' + + global ReS_nodupcheck = 0 + global ReS_nomisscheck = ( `"`misscheck'"' == "nomisscheck" ) + if ( "`unsorted'" == "unsorted" ) { + disp as txt "(note: reshape left unsorted; original order not preserved)" + } + + if ( `"`string'"' != "" ) { + disp as txt "Option -string- ignored with {cmd:greshape wide}" + local string + } + + local oldobs = _N + quietly describe, short + local oldvars = r(k) + + *********************************************************************** + * Macros and J values * + *********************************************************************** + + Macros wide + local rc = 0 + foreach var in $ReS_j { + capture ConfVar `var' + if ( _rc ) { + di in blu "Source $ReS_jname(`var') does not exist (is the data already wide?)" + exit 198 + } + ConfVar `var' + } + confirm var $ReS_j $rVANS $ReS_i $ReS_Xi + + if ( `:list sizeof j' > 1 ) { + disp as txt "({bf:warning}: labels of $ReS_jname() not saved with multiple variables)" + } + else { + /* Save J value and variable label for LONG */ + local jlab : value label $ReS_j + if "`jlab'" != "" { + char define _dta[__JValLabName] `"`jlab'"' + capture label list `jlab' + if _rc == 0 & !missing(`r(min)') & !missing(`r(max)') { + forvalues i = `r(min)'/`r(max)' { + local label : label `jlab' `i', strict + if `"`label'"' != "" { + local char `"`char' `i' `"`label'"' "' + } + } + char define _dta[__JValLab] `"`char'"' + } + } + local jvlab : variable label $ReS_j + if `"`jvlab'"' != "" { + char define _dta[__JVarLab] `"`jvlab'"' + } + } + + * -------------------------------------------------- + * TODO: Is this of any value? 
Done in LongToWideMeta + * -------------------------------------------------- + * /* Save xij variable labels for LONG */ + * local iii = 1 + * foreach var of global ReS_Xij { + * local var = subinstr(`"`var'"', `"$ReS_match"', "$ReS_atwl", 1) + * local xijlab : variable label `var' + * if `"`xijlab'"' != "" { + * if (length(`"`var'"') < 21) { + * char define _dta[__XijVarLab`var'] `"`xijlab'"' + * } + * else { + * char define _dta[__XijVarLab`iii'] /// + * `"`var' `xijlab'"' + * char define _dta[__XijVarLabTotal] `"`iii'"' + * local iii = `iii' + 1 + * } + * } + * } + * -------------------------------------------------- + + tempvar ReS_jcode + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile ReS_jfile + } + else { + GreshapeTempFile ReS_jfile + } + global ReS_jcode: copy local ReS_jcode + global ReS_jfile: copy local ReS_jfile + scalar __greshape_jfile = length(`"`ReS_jfile'"') + 1 + + GetJLevels + foreach var in $ReS_j { + ConfVar `var' + } + confirm var $ReS_j $ReS_Xi + local ReS_Xi: copy global ReS_Xi + local ReS_Xi: list ReS_Xi - ReS_jcode + global ReS_Xi: copy local ReS_Xi + + if ( inlist(`"`xi'"', "keep", "") ) { + } + else if ( `"`xi'"' == "drop" ) { + global ReS_Xi + } + else { + disp as err `"Invalid sytax -xi(`xi')-; specify first, keep, drop"' + exit 198 + } + + if ( `"`ReS_cmd'"' == "spread" & `"${ReS_Xi}"' != "" ) { + disp as err "Error parsing varlist. 
xi() should be blank" + exit 198 + } + + GetXiTypes + CopyScalars + + *********************************************************************** + * Do the reshape * + *********************************************************************** + + * ------------------------ + * Reshape the data to disk + * ------------------------ + + if ( `benchmarklevel' > 0 | `"`benchmark'"' != "" ) disp as txt "Writing reshape to disk:" + local cmd wide write + keep $ReS_i $ReS_j $ReS_jcode $ReS_Xi $rVANS + if ( `"${GTOOLS_TEMPDIR}"' == "" ) { + tempfile ReS_Data + } + else { + GreshapeTempFile ReS_Data + } + * disp "debug 1: $ReS_Xij" + mata: __greshape_l2w_meta = LongToWideMetaSave(`"$ReS_cmd"' == "spread") + global GTOOLS_CALLER greshape + local gopts j($ReS_jcode) xij($rVANS) xi($ReS_Xi) f(`ReS_Data') `string' + local gopts greshape(`cmd', `gopts') gfunction(reshape) `opts' + cap noi _gtools_internal ${ReS_i}, `gopts' missing + global GTOOLS_CALLER "" + if ( _rc ) exit _rc + + * ---------------------------- + * Allocate space for wide data + * ---------------------------- + + qui keep in 1 / `:di %32.0f `r(J)'' + global S_FN + global S_FNDATE + + FreeTimer + if ( `FreeTimer' ) timer on `FreeTimer' + rename ($ReS_Xij_keep) ($ReS_Xij_keepnames) + mata __greshape_addtypes = tokens(st_global("ReS_Xij_addtypes")) + mata __greshape_addvars = tokens(st_global("ReS_Xij_addvars")) + mata (void) st_addvar(__greshape_addtypes, __greshape_addvars, 0) + keep $ReS_i $ReS_Xij_names $ReS_Xi + order $ReS_i $ReS_Xij_names $ReS_Xi + if ( `FreeTimer' ) { + qui timer off `FreeTimer' + qui timer list + local s `:disp %9.3f `r(t`FreeTimer')'' + if ( `benchmarklevel' > 2 ) { + disp _char(9) "reshape wide step 4: allocated target dataset; `s' seconds." + } + timer clear `FreeTimer' + } + else if ( `benchmarklevel' > 2 ) { + disp _char(9) "reshape wide step 4: allocated target dataset; ??? seconds." 
+ } + + * ------------------ + * Read reshaped data + * ------------------ + + if ( `benchmarklevel' > 0 | `"`benchmark'"' != "" ) disp as txt _n "Reading reshape from disk:" + local cmd wide read + global GTOOLS_CALLER greshape + local gopts xij($ReS_Xij_names) xi($ReS_Xi) f(`ReS_Data') `string' + local gopts greshape(`cmd', `gopts') gfunction(reshape) `opts' + cap noi _gtools_internal ${ReS_i}, `gopts' missing + global GTOOLS_CALLER "" + if ( _rc ) exit _rc + + * ---------------------------------------- + * Finish in the same style as reshape.Wide + * ---------------------------------------- + + ReportW `oldobs' `oldvars' + mata: LongToWideMetaApply(__greshape_l2w_meta, `"$ReS_cmd"' == "spread") + + if ( "`fast'" == "" ) restore, not +end + +* --------------------------------------------------------------------- +* Stub matches + +capture program drop ParseStubsByMatch +program ParseStubsByMatch + if ( inlist(`"$ReS_match"', "regex", "ustrregex") ) { + if ( `"`1'"' != "long" ) { + disp as err `"match($ReS_match) only allowed when reshaping wide to long"' + exit 198 + } + + unab allvars: _all + local ReS_Xi + local ReS_Xij_regex + local ReS_Xij_stubs + if ( `"$ReS_match"' == "ustrregex" ) { + cap disp ustrregexm("a", "a") + if ( _rc ) { + disp as err "ustrregex is only available in Stata 14+" + exit 198 + } + foreach stub of global ReS_Xij { + local any 0 + local rep 0 + if ( `"`group'"' == "" ) local group 1 + foreach var of varlist `allvars' { + if ustrregexm(`"`var'"', `"`stub'"') { + local new `=ustrregexrf(`"`var'"', `"`stub'"', "@")' + if ( !`:list new in ReS_Xij' ) { + local ReS_Xij `ReS_Xij' `=ustrregexrf(`"`var'"', `"`stub'"', "@")' + local ReS_Xij_regex `ReS_Xij_regex' `stub' + local ReS_Xij_stubs `ReS_Xij_stubs' `=ustrregexrf(`"`var'"', `"`stub'"', "")' + local any 1 + } + else { + local rep 1 + } + * disp `"`var'"', `"`stub'"', `any' + } + } + if ( `any' == 0 ) { + if ( `rep' ) { + disp as err "no new variables matched regex: `stub' (you probably have 
repeated stubs)" + exit 198 + } + else { + disp as err "no variables matched stub regex: `stub'" + exit 198 + } + } + } + } + else { + foreach stub of global ReS_Xij { + local any 0 + local rep 0 + gettoken stub group: stub, p(/) + gettoken slash group: group, p(/) + local group `group' + if ( `"`group'"' == "" ) local group 1 + foreach var of varlist `allvars' { + if regexm(`"`var'"', `"`stub'"') { + cap local rg = regexs(`group') + if ( `"`rg'"' != "" ) { + local new `=regexr(`"`var'"', `"`rg'"', "@")' + if ( !`:list new in ReS_Xij' ) { + local ReS_Xij `ReS_Xij' `=regexr(`"`var'"', `"`rg'"', "@")' + local ReS_Xij_regex `ReS_Xij_regex' `stub' + local ReS_Xij_stubs `ReS_Xij_stubs' `=regexr(`"`var'"', `"`rg'"', "")' + local any 1 + } + else { + local rep 1 + } + } + } + * disp `"`var'"', `"`stub'"', `any' + } + if ( `any' == 0 ) { + if ( `rep' ) { + disp as err "no new variables matched regex: `stub' (you probably have repeated stubs)" + exit 198 + } + else { + disp as err "no variables matched stub regex: `stub'" + exit 198 + } + } + } + } + + global ReS_match @ + global ReS_Xij: copy local ReS_Xij + global ReS_Xij_regex: copy local ReS_Xij_regex + global ReS_Xij_stubs: copy local ReS_Xij_stubs + + } + else { + global ReS_Xij_stubs: subinstr global ReS_Xij `"$ReS_match"' "", all + if ( `"`1'"' == "wide" ) { + local ReS_Xij + local ReS_Xij_stubs + foreach stub of global ReS_Xij { + local var: subinstr local stub `"$ReS_match"' "" + unab vars: `var' + if ( index(`"`stub'"', `"$ReS_match"') & (`"`var'"' != `"`vars'"') ) { + disp as err "error parsing stubs; cannot specify a custom match with varlist syntax" + exit 198 + } + else if ( index(`"`stub'"', `"$ReS_match"') == 0 ) { + foreach var of local vars { + local ReS_Xij `ReS_Xij' `var' + } + } + else { + local ReS_Xij `ReS_Xij' `stub' + } + foreach var of local vars { + local ReS_Xij_stubs `ReS_Xij_stubs' `var' + } + } + global ReS_Xij: copy local ReS_Xij + global ReS_Xij_stubs: copy local ReS_Xij_stubs + } + } + + * 
disp "$ReS_Xij" + * disp "$ReS_Xij_stubs" + * disp "$ReS_Xij_regex" +end + +* --------------------------------------------------------------------- +* GetJLevels + +capture program drop GetJLevels +program define GetJLevels + + * --------------------------- + * TODO: Is this of any value? + * --------------------------- + * local varlist "req ex" + * parse "_all" + * { + * local n : word count `varlist' + * local __greshape_dsname + * parse "`varlist'", parse(" ") + * local i 0 + * while `++i' <= `n' { + * disp `"`i', ``i''"' + * local __greshape_dsname `"`__greshape_dsname'"' `"``i''"' + * } + * } + * mata: __greshape_dsname = tokens(st_local("__greshape_dsname"))' + * --------------------------- + + unab varlist: _all + mata: __greshape_dsname = tokens(st_local("varlist"))' + + if inlist("$ReS_cmd", "wide") { + FillvalL + FillXi 1 + } + else if inlist("$ReS_cmd", "spread" ) { + FillvalL + } + else if inlist("$ReS_cmd", "long") { + FillvalW + FillXi 0 + } + else if inlist("$ReS_cmd", "gather") { + FillvalW + } + else { + disp as err "Uknown subcommand: $ReS_cmd" + exit 198 + } + + global S_1 1 +end + +capture program drop FillvalW +program define FillvalW + tempname jlen + mata: `jlen' = 0 + if ( "$ReS_cmd" != "gather" ) { + FindVariablesByCharacter + + capture mata: assert(all(__greshape_res :== "")) + if _rc == 0 { + no_xij_found + /* NOTREACHED */ + } + + if ( !$ReS_str ) { + mata: __greshape_res = strtoreal(__greshape_res) + mata: __greshape_sel = selectindex(__greshape_res :< .) 
+ } + else { + mata: __greshape_sel = selectindex(__greshape_res :!= "") + } + + mata: __greshape_res = sort(uniqrows(__greshape_res[__greshape_sel]), 1) + mata: st_numscalar("__greshape_kout", cols(tokens(st_global(`"ReS_Xij"')))) + mata: st_numscalar("__greshape_klvls", rows(__greshape_res)) + if ( `=(__greshape_klvls)' == 0 ) { + disp as err "variable j contains all missing values" + exit 498 + } + + if ( !$ReS_str ) { + mata: SaveJValuesReal(__greshape_res) + mata: __greshape_res = strofreal(__greshape_res) + } + else { + mata: (void) SaveJValuesString(__greshape_res, "") + } + mata: __greshape_xijname = sort(uniqrows(__greshape_dsname[__greshape_sel]), 1) + } + else { + mata: __greshape_res = tokens(st_global("ReS_Xij_names"))' + mata: st_numscalar("__greshape_kout", cols(tokens(st_global(`"ReS_Xij"')))) + mata: st_numscalar("__greshape_klvls", rows(__greshape_res)) + if ( `=(__greshape_klvls)' == 0 ) { + disp as err "variable j contains all missing values" + exit 498 + } + + if ( `"$ReS_uselabels"' != "" ) { + local 0: copy global ReS_uselabels + cap syntax varlist, [exclude] + if ( _rc ) { + disp as err "option uselabels[()] incorrectly specified" + syntax varlist, [exclude] + exit 198 + } + else { + if ( `"`exclude'"' != "" ) { + unab ReS_uselabels: _all + local ReS_uselabels: list ReS_uselabels - varlist + global ReS_uselabels: copy local ReS_uselabels + } + else { + global ReS_uselabels: copy local varlist + } + } + } + + mata: `jlen' = SaveJValuesString(__greshape_res, tokens(`"${ReS_uselabels}"')) - 1 + mata: __greshape_xijname = __greshape_res + } + + mata: __greshape_maplevel = MakeMapLevel( /* + */ __greshape_xijname, /* + */ __greshape_res, /* + */ tokens(st_global(`"ReS_Xij"')), /* + */ (`"$ReS_cmd"' == "gather")) + + mata: st_matrix("__greshape_maplevel", __greshape_maplevel) + + mata: __greshape_rc = CheckVariableTypes( /* + */ tokens(st_global(`"ReS_Xij_names"')), /* + */ __greshape_res, /* + */ tokens(st_global(`"ReS_Xij"')), /* + */ 
tokens(st_global(`"ReS_Xij_stubs"')), /* + */ (`"$ReS_cmd"' == "gather")) + + mata: st_numscalar("__greshape_rc", __greshape_rc) + if ( `=scalar(__greshape_rc)' ) { + mata: mata drop `jlen' + exit 198 + } + + scalar __greshape_nrows = . + scalar __greshape_ncols = . + + mata: st_global("ReS_jv", invtokens(__greshape_res')) + mata: st_global("ReS_jlen", strofreal(`jlen' > 0? `jlen': max(strlen(__greshape_res)))) + mata: mata drop `jlen' + + di in gr "(note: $ReS_jname = $ReS_jv)" + global ReS_jv2: copy global ReS_jv +end + +capture program drop FindVariablesByCharacter +program FindVariablesByCharacter + cap disp bsubstr(" ", 1, 1) + if ( _rc ) local substr substr + else local substr bsubstr + + cap mata ustrregexm("a", "a") + if ( _rc ) local regex regex + else local regex ustrregex + + local ReS_Xij_regex: copy global ReS_Xij_regex + + parse "$ReS_Xij", parse(" ") + local i 1 + mata: __greshape_res = J(rows(__greshape_dsname), 1, "") + mata: __greshape_u = J(rows(__greshape_dsname), 1, "") + while "``i''" != "" { + gettoken exp ReS_Xij_regex: ReS_Xij_regex + + local m = length(`"$ReS_match"') + local _l = index("``i''", `"$ReS_match"') + local l = cond(`_l' == 0, length("``i''") + 1, `_l') + local lft = `substr'("``i''", 1, `l' - 1) + local rgt = `substr'("``i''", `l' + `m', .) + local rgtl = length("`rgt'") + local minl = length("`lft'") + `rgtl' + + if ( `"`exp'"' == "" ) { + mata: __greshape_u = selectindex( /* + */ (strlen(__greshape_dsname) :> `minl') :& /* + */ (`substr'(__greshape_dsname, 1, `l' - 1) :== `"`lft'"') :& /* + */ (`substr'(__greshape_dsname, -`rgtl', .) :== `"`rgt'"')) + } + else { + mata: __greshape_u = selectindex( /* + */ (strlen(__greshape_dsname) :> `minl') :& /* + */ (`substr'(__greshape_dsname, 1, `l' - 1) :== `"`lft'"') :& /* + */ (`substr'(__greshape_dsname, -`rgtl', .) 
:== `"`rgt'"') :& /* + */ (`regex'm(__greshape_dsname, `"`exp'"'))) + } + + mata: st_local("any", strofreal(length(__greshape_u) > 0)) + if ( `any' ) { + mata: __greshape_res[__greshape_u] = `substr'( /* + */ __greshape_dsname[__greshape_u], `l', .) + + mata: __greshape_res[__greshape_u] = `substr'( /* + */ __greshape_res[__greshape_u], 1, /* + */ strlen(__greshape_res[__greshape_u]) :- `rgtl') + + capture mata: assert(all(__greshape_res[__greshape_u] :!= "")) + } + else cap error 0 + + if _rc { + di in red as smcl /// + "variable {bf:`lft'`rgt'} already defined" + exit 110 + } + local i = `i' + 1 + } +end + +capture mata: mata drop MakeMapLevel() +capture mata: mata drop GetVariableFromStub() +capture mata: mata drop GetVariableFromStubPrefix() +mata: +real matrix function MakeMapLevel( + string colvector dsname, + string colvector res, + string rowvector xij, + real scalar gather) +{ + real scalar i, j, k, l + real matrix maplevel + string rowvector ordered + string scalar sr + + k = 1 + ordered = J(1, cols(xij) * rows(res), "") + maplevel = J(cols(xij), rows(res), 0) + + for (i = 1; i <= cols(xij); i++) { + for (j = 1; j <= rows(res); j++) { + sr = gather? res[j]: GetVariableFromStub(xij[i], res[j]) + if ( any(dsname :== sr) ) { + maplevel[i, j] = k + ordered[k] = sr + k++ + } + } + } + + st_global("ReS_Xij_names", invtokens(ordered)) + st_numscalar("__greshape_kxij", cols(ordered)) + return(maplevel) +} + +string scalar function GetVariableFromStub(string scalar s, string scalar r) +{ + real scalar l, m + string scalar left, right + + m = strlen(st_global(`"ReS_match"')) + l = strpos(s, st_global(`"ReS_match"')) + l = (l == 0)? strlen(s) + 1: l + left = substr(s, 1, l - 1) + right = substr(s, l + m, .) + return(left + r + right) +} + +string scalar function GetVariableFromStubPrefix( + string scalar stub, + string scalar level, + string scalar prefix) +{ + real scalar l, m + string scalar left, right, out + + s = subinstr(prefix, "#blank#", "", .) 
+ s = subinstr(s, "#stub#", stub, .) + + m = strlen(st_global(`"ReS_match"')) + l = strpos(s, st_global(`"ReS_match"')) + l = (l == 0)? strlen(s) + 1: l + left = substr(s, 1, l - 1) + right = substr(s, l + m, .) + return(left + level + right) +} + +end + +cap mata ustrregexm("a", "a") +if ( _rc ) local regex regex +else local regex ustrregex + +capture mata: mata drop CheckVariableTypes() +mata: +real scalar function CheckVariableTypes( + string rowvector dsname, + string colvector res, + string rowvector xij, + string rowvector xij_stubs, + real scalar gather) +{ + real scalar i, j, k, t, ix + real colvector sel + real rowvector types + string colvector keep + string colvector keepnames + string colvector add + string scalar sr, v + + k = 0 + types = J(1, cols(dsname), 0) + highest = J(cols(xij), 2, 0) + for (i = 1; i <= cols(xij); i++) { + for (j = 1; j <= rows(res); j++) { + sr = gather? res[j]: GetVariableFromStub(xij[i], res[j]) + ix = selectindex(dsname :== sr) + t = highest[i, 1] + if ( length(ix) > 1 ) { + errprintf("stub %s had repeated matches (do you have repeated stubs?)\n", + xij[i]) + return(198) + } + else if ( length(ix) ) { + v = st_vartype(sr) + if ( `regex'm(v, "str([1-9][0-9]*|L)") ) { + if ( t > 0 ) { + errprintf("%s type mismatch with other %s variables\n", + sr, xij[i]) + return(198) + } + l = `regex's(1) + if ( l == "L" ) { + errprintf("strL variables not supported\n") + return(198) + } + if ( t > -strtoreal(l) ) { + highest[i, 1] = -strtoreal(l) + highest[i, 2] = ix + } + types[ix] = strtoreal(l) + } + else { + if ( v == "byte" ) { + types[ix] = 0 + if ( t < 1 ) { + highest[i, 1] = 1 + highest[i, 2] = ix + } + } + else if ( v == "int" ) { + types[ix] = 0 + if ( t < 2 ) { + highest[i, 1] = 2 + highest[i, 2] = ix + } + } + else if ( v == "long" ) { + types[ix] = 0 + if ( t < 3 ) { + highest[i, 1] = 3 + highest[i, 2] = ix + } + if ( t == 4 ) { + highest[i, 1] = . + highest[i, 2] = . 
+ } + } + else if ( v == "float" ) { + types[ix] = 0 + if ( t < 2 ) { + highest[i, 1] = 4 + highest[i, 2] = ix + } + if ( t == 3 ) { + highest[i, 1] = . + highest[i, 2] = . + } + } + else if ( v == "double" ) { + types[ix] = 0 + highest[i, 1] = 5 + highest[i, 2] = ix + } + else { + errprintf("unknown variable type: %s\n", v) + return(198) + } + } + } + } + } + + sel = highest[., 2] + keepnames = xij_stubs[selectindex(sel :!= .)] + keep = dsname[sel[selectindex(sel :!= .)]] + add = xij_stubs[selectindex(sel :== .)] + + st_matrix("__greshape_types", types) + + st_global("ReS_Xij_keepnames", invtokens(keepnames)) + st_global("ReS_Xij_keep", invtokens(keep)) + st_global("ReS_Xij_add", invtokens(add)) + + return(0) +} +end + +capture mata: mata drop SaveJValuesReal() +capture mata: mata drop SaveJValuesString() +mata: +void function SaveJValuesReal(real colvector res) +{ + real scalar i, fh + colvector C + fh = fopen(st_global("ReS_jfile"), "w") + C = bufio() + for(i = 1; i <= rows(res); i++) { + fbufput(C, fh, "%8z", res[i]) + } + fclose(fh) +} + +real scalar function SaveJValuesString(string colvector res, string rowvector uselabelsvars) +{ + real scalar i, fh, max, uselabels + string scalar fmt, vlbl + string colvector reslbl + colvector C + + uselabels = length(uselabelsvars) > 0 + + fh = fopen(st_global("ReS_jfile"), "w") + C = bufio() + if ( uselabels ) { + reslbl = J(rows(res), 1, "") + for(i = 1; i <= rows(res); i++) { + if ( length(selectindex(res[i] :== uselabelsvars)) > 0 ) { + vlbl = st_varlabel(res[i]) + reslbl[i] = (strtrim(vlbl) == "")? 
res[i]: vlbl + } + else { + reslbl[i] = res[i] + } + } + max = max(strlen(reslbl)) + 1 + fmt = sprintf("%%%gS", max) + for(i = 1; i <= rows(reslbl); i++) { + fbufput(C, fh, fmt, reslbl[i] + (max - strlen(reslbl[i])) * char(0)) + } + } + else { + max = max(strlen(res)) + 1 + fmt = sprintf("%%%gS", max) + for(i = 1; i <= rows(res); i++) { + fbufput(C, fh, fmt, res[i] + (max - strlen(res[i])) * char(0)) + } + } + fclose(fh) + + return(max) +} +end + +capture program drop no_xij_found +program no_xij_found + di as smcl in red "no xij variables found" + di as smcl in red "{p 4 4 2}" + di as smcl in red "You typed something like" + di as smcl in red "{bf:reshape wide a b, $ReS_iname(i) $ReS_jname(j)}.{break}" + di as smcl in red "{bf:reshape} looked for existing variables" + di as smcl in red "named {bf:a}{it:#} and {bf:b}{it:#} but" + di as smcl in red "could not find any. Remember this picture:" + di as smcl in red + picture err cmds + exit 111 +end + +capture program drop FillvalL +program define FillvalL + cap mata bsubstr(" ", 1, 1) + if ( _rc ) local substr substr + else local substr bsubstr + local ReS_j: copy global ReS_j + + if ( `"$ReS_jsep"' == "" ) local ReS_jsep `" "' + else local ReS_jsep: copy global ReS_jsep + + if ( `:list sizeof ReS_j' > 1 ) local clean clean + glevelsof $ReS_j, silent local(ReS_jv) cols(`"`ReS_jsep'"') group($ReS_jcode) missing `clean' + scalar __greshape_klvls = `r(J)' + mata: st_global("ReS_jvraw", st_local("ReS_jv")) + + if ( ("$ReS_cmd" != "spread") | ($ReS_Xij_k > 1) ) { + mata: __greshape_jv = strtoname("_" :+ tokens(st_local("ReS_jv"))') + mata: __greshape_jv = `substr'(__greshape_jv, 2, strlen(__greshape_jv)) + } + else { + mata: __greshape_jv = strtoname(tokens(st_local("ReS_jv"))') + } + + * Not sure if blank selectindex is 1 by 0 or 0 by 1 or 0 by 0 + mata: __greshape_jv_ = selectindex(__greshape_jv :== "") + mata: st_numscalar("__greshape_jv_", /* + */ min((rows(__greshape_jv_), cols(__greshape_jv_)))) + if ( 
`=scalar(__greshape_jv_)' ) { + mata: __greshape_jv[__greshape_jv_] = J(1, rows(__greshape_jv_), "_") + } + + mata: st_global("ReS_jv", invtokens(__greshape_jv')) + cap mata: assert(sort(__greshape_jv, 1) == sort(uniqrows(__greshape_jv), 1)) + if _rc { + disp as err "j defines non-unique or invalid names" + exit 198 + } + + * mata: (void) SaveJValuesString(__greshape_jv, "") + di in gr "(note: $ReS_jname = $ReS_jv)" + global ReS_jv2: copy global ReS_jv + + CheckVariableTypes +end + +capture program drop CheckVariableTypes +program CheckVariableTypes + cap disp ustrregexm("a", "a") + if ( _rc ) local regex regex + else local regex ustrregex + + local i: copy global rVANS + local k: copy global ReS_Xij + local j: copy global ReS_jv + local p: copy global ReS_prefix + gettoken j1 jrest: j + + global ReS_Xij_keep: copy global rVANS + global ReS_Xij_keepnames + + global ReS_Xij_names + global ReS_Xij_addvars + global ReS_Xij_addtypes + + if ( ("$ReS_cmd" != "spread") | ($ReS_Xij_k > 1) ) { + + * Allow for custom user-defined prefixes and such. This is + * useful in gather but in wide it's basically a convoluted + * rename scheme... 
+ + foreach stub of local k { + gettoken var i: i + gettoken prefix p: p + + local prefix `prefix' + if ( `"`prefix'"' != "" ) { + local prefix: subinstr local prefix `"#blank#"' `""' + local prefix: subinstr local prefix `"#stub#"' `"`stub'"' + local stub: copy local prefix + } + + if ( index(`"`stub'"', `"$ReS_match"') > 0 ) { + local _var: subinstr local stub `"$ReS_match"' `"`j1'"' + } + else { + local _var `stub'`j1' + } + + global ReS_Xij_keepnames $ReS_Xij_keepnames `_var' + global ReS_Xij_names $ReS_Xij_names `_var' + + foreach jv of local jrest { + + if ( index(`"`stub'"', `"$ReS_match"') > 0 ) { + local _var: subinstr local stub `"$ReS_match"' `"`jv'"' + } + else { + local _var `stub'`jv' + } + + global ReS_Xij_addtypes $ReS_Xij_addtypes `:type `var'' + global ReS_Xij_addvars $ReS_Xij_addvars `_var' + global ReS_Xij_names $ReS_Xij_names `_var' + } + } + } + else { + + * Allow for custom user-defined prefixes and such. This is + * useful in gather but in wide it's basically a convoluted + * rename scheme... 
+ + foreach var of local i { + + local stub: copy local k + local prefix: copy local p + + local prefix `prefix' + if ( `"`prefix'"' != "" ) { + local prefix: subinstr local prefix `"#blank#"' `""' + local prefix: subinstr local prefix `"#stub#"' `"`stub'"' + local stub: copy local prefix + } + else local stub + + if ( index(`"`stub'"', `"$ReS_match"') > 0 ) { + local _var: subinstr local stub `"$ReS_match"' `"`j1'"' + } + else { + local _var `stub'`j1' + } + + global ReS_Xij_keepnames $ReS_Xij_keepnames `_var' + global ReS_Xij_names $ReS_Xij_names `_var' + + foreach jv of local jrest { + + if ( index(`"`stub'"', `"$ReS_match"') > 0 ) { + local _var: subinstr local stub `"$ReS_match"' `"`jv'"' + } + else { + local _var `stub'`jv' + } + + global ReS_Xij_addtypes $ReS_Xij_addtypes `:type `var'' + global ReS_Xij_addvars $ReS_Xij_addvars `_var' + global ReS_Xij_names $ReS_Xij_names `_var' + } + } + } + + scalar __greshape_kout = `:list sizeof k' + scalar __greshape_kxij = `:list sizeof k' * `:list sizeof j' + scalar __greshape_nrows = . + scalar __greshape_ncols = . 
+ matrix __greshape_maplevel = 0 + + local __greshape_types + capture matrix drop __greshape_types + foreach var of varlist $rVANS { + if ( `regex'm("`:type `var''", "str([1-9][0-9]*|L)") ) { + if ( `regex's(1) == "L" ) { + disp as err "Unsupported type `:type `var''" + exit 198 + } + local __greshape_types `__greshape_types' `=`regex's(1)' + * matrix __greshape_types = nullmat(__greshape_types), `=`regex's(1)' + } + else if ( inlist("`:type `var''", "byte", "int", "long", "float", "double") ) { + local __greshape_types `__greshape_types' 0 + * matrix __greshape_types = nullmat(__greshape_types), 0 + } + else { + disp as err "Unknown type `:type `var''" + exit 198 + } + } + mata: st_matrix("__greshape_types", /* + */ strtoreal(tokens(st_local("__greshape_types")))) +end + +capture program drop FillXi +program define FillXi /* {1|0} */ /* 1 if islong currently */ + local islong `1' + if `islong' { /* long to wide */ + unab ReS_Xi: _all + unab ReS_i: $ReS_i + unab ReS_j: $ReS_j + unab ReS_Xij: $rVANS + local ReS_Xi: list ReS_Xi - ReS_i + local ReS_Xi: list ReS_Xi - ReS_j + local ReS_Xi: list ReS_Xi - ReS_Xij + global ReS_Xi: copy local ReS_Xi + } + else { /* wide to long */ + unab ReS_Xi: _all + unab ReS_i: $ReS_i + local ReS_j: copy global ReS_j + unab ReS_Xij: $ReS_Xij_names + local ReS_Xi: list ReS_Xi - ReS_i + local ReS_Xi: list ReS_Xi - ReS_j + local ReS_Xi: list ReS_Xi - ReS_Xij + global ReS_Xi: copy local ReS_Xi + } + + * --------------------------- + * TODO: Is this of any value? 
+ * --------------------------- + * local name __greshape_dsname + * quietly { + * if `islong' { + * Dropout __greshape_dsname $ReS_j $ReS_i + * parse "", parse(" ") + * local i 1 + * while "``i''" != "" { + * Subname ``i'' $ReS_atwl + * mata: `name' = `name'[selectindex(`name' :!= `"$S_1"')] + * local i = `i' + 1 + * } + * } + * else { /* wide */ + * Dropout __greshape_dsname $ReS_j $ReS_i + * parse "$ReS_Xij", parse(" ") + * local i 1 + * while "``i''" != "" { + * local j 1 + * local jval : word `j' of $ReS_jv + * while "`jval'"!="" { + * Subname ``i'' `jval' + * mata: `name' = `name'[selectindex(`name' :!= `"$S_1"')] + * local j = `j' + 1 + * local jval : word `j' of $ReS_jv + * } + * local i = `i' + 1 + * } + * } + * mata: st_local("N", strofreal(length(`name'))) + * local i 1 + * while ( `i' <= `=`N'' ) { + * mata: st_local("nam", `name'[`i']) + * global ReS_Xi $ReS_Xi `nam' + * local i = `i' + 1 + * } + * } + * --------------------------- +end + +capture program drop GetXiTypes +program GetXiTypes + cap disp ustrregexm("a", "a") + if ( _rc ) local regex regex + else local regex ustrregex + + if ( "$ReS_Xi" != "" ) { + local __greshape_xitypes + cap matrix drop __greshape_xitypes + foreach var of varlist $ReS_Xi { + if ( `regex'm("`:type `var''", "str([1-9][0-9]*|L)") ) { + if ( `regex's(1) == "L" ) { + disp as err "Unsupported type `:type `var''" + exit 198 + } + local __greshape_xitypes `__greshape_xitypes' `=`regex's(1)' + * matrix __greshape_xitypes = nullmat(__greshape_xitypes), `=`regex's(1)' + } + else if ( inlist("`:type `var''", "byte", "int", "long", "float", "double") ) { + local __greshape_xitypes `__greshape_xitypes' 0 + * matrix __greshape_xitypes = nullmat(__greshape_xitypes), 0 + } + else { + disp as err "Unknown type `:type `var''" + exit 198 + } + } + mata: st_matrix("__greshape_xitypes", /* + */ strtoreal(tokens(st_local("__greshape_xitypes")))) + } + else { + matrix __greshape_xitypes = . 
+ } +end + +capture program drop Dropout +program define Dropout /* varname varnames */ + local name "`1'" + local i 2 + while `"``i''"' != "" { + mata: `name' = `name'[selectindex(`name' :!= `"``i''"')] + local i = `i' + 1 + } +end + +capture program drop FreeTimer +program FreeTimer + qui { + timer list + local i = 99 + while ( (`i' > 0) & ("`r(t`i')'" != "") ) { + local --i + } + } + c_local FreeTimer `i' +end + +capture program drop CheckMatsize +program CheckMatsize + syntax [anything], [nvars(int 0)] + if ( `nvars' == 0 ) local nvars `:list sizeof anything' + if ( `nvars' > `c(matsize)' ) { + cap set matsize `=`nvars'' + if ( _rc ) { + di as err _n(1) "{bf:# variables > matsize (`nvars' > `c(matsize)'). Tried to run}" + di _n(1) " {stata set matsize `=`nvars''}" + di _n(1) "{bf:but the command failed. Try setting matsize manually.}" + exit 908 + } + } +end + +capture program drop CopyScalars +program CopyScalars + scalar __gtools_greshape_klvls = __greshape_klvls + scalar __gtools_greshape_kout = __greshape_kout + scalar __gtools_greshape_kxij = __greshape_kxij + scalar __gtools_greshape_ncols = __greshape_ncols + scalar __gtools_greshape_nrows = __greshape_nrows + scalar __gtools_greshape_jfile = __greshape_jfile + + matrix __gtools_greshape_xitypes = __greshape_xitypes + matrix __gtools_greshape_types = __greshape_types + matrix __gtools_greshape_maplevel = __greshape_maplevel +end + +capture program drop CleanExit +program CleanExit + foreach f of global GTOOLS_TEMPFILES_GRESHAPE { + cap erase `"${GTOOLS_TEMPDIR}/`f'"' + } + global GTOOLS_TEMPFILES_GRESHAPE + global GTOOLS_TEMPFILES_GRESHAPE_I + + Macdrop + mac drop GTOOLS_PARSE + + capture mata mata drop __greshape_maplevel + capture mata mata drop __greshape_dsname + capture mata mata drop __greshape_jv + capture mata mata drop __greshape_jv_ + capture mata mata drop __greshape_res + capture mata mata drop __greshape_sel + capture mata mata drop __greshape_addtypes + capture mata mata drop 
__greshape_addvars + capture mata mata drop __greshape_u + capture mata mata drop __greshape_xijname + capture mata mata drop __greshape_rc + capture mata mata drop __greshape_l2w_meta + capture mata mata drop __greshape_w2l_meta + + capture scalar drop __greshape_jv_ + capture scalar drop __greshape_rc + capture scalar drop __greshape_klvls + capture scalar drop __greshape_kout + capture scalar drop __greshape_kxij + capture scalar drop __greshape_ncols + capture scalar drop __greshape_nrows + capture scalar drop __greshape_jfile + + capture scalar drop __gtools_greshape_klvls + capture scalar drop __gtools_greshape_kout + capture scalar drop __gtools_greshape_kxij + capture scalar drop __gtools_greshape_ncols + capture scalar drop __gtools_greshape_nrows + capture scalar drop __gtools_greshape_jfile + + capture matrix drop __greshape_xitypes + capture matrix drop __greshape_types + capture matrix drop __greshape_maplevel + + capture matrix drop __gtools_greshape_xitypes + capture matrix drop __gtools_greshape_types + capture matrix drop __gtools_greshape_maplevel +end + +* --------------------------------------------------------------------- +* Helpers taken near-verbatim from reshape.ado + +capture program drop Macdrop +program define Macdrop + mac drop ReS_cmd /// + ReS_Xij /// + ReS_Xij_regex /// + ReS_Xij_stubs /// + ReS_Xij_k /// + ReS_Xij_add /// + ReS_Xij_keep /// + ReS_Xij_keepnames /// + ReS_Xij_names /// + ReS_Xij_addtypes /// + ReS_Xij_addvars /// + ReS_nodupcheck /// + ReS_nomisscheck /// + ReS_match /// + ReS_atwl /// + ReS_uselabels /// + ReS_i /// + ReS_iname /// + ReS_j /// + ReS_jname /// + ReS_jfile /// + ReS_jsep /// + ReS_jcode /// + ReS_jlen /// + ReS_jv /// + ReS_jv2 /// + ReS_jvraw /// + ReS_prefix /// + ReS_labelformat /// + ReS_str /// + ReS_Xi /// + S_1 /// + S_1_full /// + S_2 /// + rVANS +end + +capture program drop picture +program picture + args how cmds + + if ("`how'"=="err") { + local how "as smcl in red" + } + else { + local how 
"as smcl in green" + } + +/* +----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8 + long wide + +---------------+ +------------------+ + | i j a b | | i a1 a2 b1 b2 | + |---------------| <---greshape ---> |------------------| + | 1 2 1 2 | | 1 1 3 2 4 | + | 1 2 3 4 | | 2 5 7 6 8 | + | 2 1 5 6 | +------------------+ + | 2 2 7 8 | + +---------------+ + j existing variable + / + long to wide: greshape wide a b, i(i) j(j) + + wide to long: greshape long a b, i(i) j(j) + \ + j new variable + + 123456789012345 123456789012345678 + +---------------+1234567890123456789+------------------+ + | i j a b | | i a1 a2 b1 b2 | + |---------------| <---greshape ---> |------------------| + | 1 2 1 2 | | 1 1 3 2 4 | + | 1 2 3 4 | | 2 5 7 6 8 | + | 2 1 5 6 | +------------------+ + | 2 2 7 8 | + +---------------+ +*/ + + di `how' _col(9) " {it:long}" _skip(33) "{it:wide}" + di `how' _col(9) /// + "{c TLC}{hline 15}{c TRC}" _skip(20) /// + "{c TLC}{hline 18}{c TRC}" + di `how' _col(9) "{c |} {it:i j} a b {c |}" _skip(20) /// + "{c |} {it:i} a1 a2 b1 b2 {c |}" + di `how' _col(9) "{c |}{hline 15}{c |}" /// + " <--- {bf:greshape} ---> " /// + "{c |}{hline 18}{c |}" + di `how' _col(9) "{c |} 1 1 1 2 {c |}" _skip(20) /// + "{c |} 1 1 3 2 4 {c |}" + di `how' _col(9) "{c |} 1 2 3 4 {c |}" _skip(20) /// + "{c |} 2 5 7 6 8 {c |}" + di `how' _col(9) "{c |} 2 1 5 6 {c |}" _skip(20) /// + "{c BLC}{hline 18}{c BRC}" + di `how' _col(9) "{c |} 2 2 7 8 {c |}" + di `how' _col(9) "{c BLC}{hline 15}{c BRC}" + if ("`cmds'" != "") { + di `how' + di `how' _col(9) /// + "long to wide: " /// + "{bf:greshape wide a b, $ReS_iname(}{it:i}{bf:) j(}{it:j}{bf:)} " /// + " ({it:j} existing variable)" + di `how' _col(9) /// + "wide to long: " /// + "{bf:greshape long a b, $ReS_iname(}{it:i}{bf:) j(}{it:j}{bf:)} " /// + " ({it:j} new variable)" + } +end + +capture program drop ConfVar +program define ConfVar /* varname */ + capture syntax varname + if ( _rc == 0 ) { + gettoken lhs : 0 + if 
( `"`lhs'"' == `"`varlist'"' ) { + exit 0 + } + } + di in red as smcl `"variable {bf:`0'} not found"' + exit 111 +end + +capture program drop ReportL +program define ReportL /* old_obs old_vars */ + Report1 `1' `2' wide long + + local n : word count $ReS_jv + di in gr "$ReS_jname (`n' values) " _col(43) "->" _col(48) /* + */ in ye "$ReS_j" + di in gr "xij variables:" + parse "$ReS_Xij", parse(" ") + local xijn : char _dta[ReS_Xij_n] + if `"`xijn'"' != "" { + forvalues i = 1/`xijn' { + char _dta[ReS_Xij_wide`i'] + char _dta[ReS_Xij_long`i'] + } + char _dta[ReS_Xij_n] + } + local i 0 + while ( `"`1'"' != "" ) { + RepF "`1'" + local skip = 39 - length("$S_1") + di in ye _skip(`skip') "$S_1" _col(43) in gr "->" /* + */ in ye _col(48) "$S_2" + local ++i + char _dta[ReS_Xij_wide`i'] "$S_1_full" + char _dta[ReS_Xij_long`i'] "$S_2" + mac shift + } + char _dta[ReS_Xij_n] "`i'" + di in smcl in gr "{hline 77}" +end + +capture program drop RepF +program define RepF /* element from ReS_Xij */ + local v "`1'" + if "$ReS_jv2" != "" { + local n : word count $ReS_jv2 + parse "$ReS_jv2", parse(" ") + } + else { + local n : word count $ReS_jv + parse "$ReS_jv", parse(" ") + } + if `n'>=1 { + Subname `v' `1' + local list $S_1 + } + if `n'>=2 { + Subname `v' `2' + local list `list' $S_1 + } + if `n'==3 { + Subname `v' ``n'' + local list `list' $S_1 + } + else if `n'>3 { + Subname `v' ``n'' + local list `list' ... $S_1 + } + + local flist + forvalues i=1/`n' { + Subname `v' ``i'' + local flist `flist' $S_1 + } + global S_1_full `flist' + + Subname `v' $ReS_atwl + global S_2 $S_1 + global S_1 `list' +end + +capture program drop Report1 +program define Report1 /* <#oobs> <#ovars> {wide|long} {long|wide} */ + local oobs "`1'" + local ovars "`2'" + local wide "`3'" + local long "`4'" + + di in smcl _n in gr "Data" _col(36) "`wide'" _col(43) /* + */ "->" _col(48) "`long'" _n "{hline 77}" + + di in gr "Number of obs." 
_col(19) in ye %21.0gc `oobs' /* + */ in gr _col(43) "-> " in ye %-21.0gc _N + + quietly desc, short + + di in gr "Number of variables" _col(19) in ye %21.0gc `ovars' /* + */ in gr _col(43) "-> " in ye %-21.0gc r(k) +end + +capture program drop ReportW +program define ReportW /* old_obs old_vars */ + Report1 `1' `2' long wide + + local n : word count $ReS_jv2 + local col = 31+(9-length("$ReS_j")) + di in gr "$ReS_jname (`n' values) " /* + */ _col(`col') in ye "$ReS_j" in gr _col(43) "->" /* + */ _col(48) "(dropped)" + di in gr "xij variables:" + parse "$ReS_Xij", parse(" ") + if ( `"`xijn'"' != "" ) { + forvalues i = 1/`xijn' { + char _dta[ReS_Xij_wide`i'] + char _dta[ReS_Xij_long`i'] + } + char _dta[ReS_Xij_n] + } + local i 0 + while ( `"`1'"' != "" ) { + RepF "`1'" + local skip = 39 - length("$S_2") + di in ye _skip(`skip') "$S_2" _col(43) in gr "->" /* + */ in ye _col(48) "$S_1" + local ++i + char _dta[ReS_Xij_wide`i'] "$S_1_full" + char _dta[ReS_Xij_long`i'] "$S_2" + mac shift + } + char _dta[ReS_Xij_n] "`i'" + di in smcl in gr "{hline 77}" +end + +capture program drop Macros +program define Macros /* reshape macro check utility */ + capture ConfVar $ReS_j + if ( _rc == 0 ) { + if ( $ReS_nomisscheck == 0 ) { + if ( $ReS_str == 0 ) { + capture assert $ReS_j<. 
+ if _rc { + di in red as smcl /// + "variable {bf:$ReS_j} contains missing values" + exit 498 + } + } + else { + capture assert trim($ReS_j) != "" + if _rc { + di in red as smcl /// + "variable {bf:$ReS_j} contains missing values" + exit 498 + } + capture assert $ReS_j == trim($ReS_j) + if _rc { + di in red as smcl /// + "variable {bf:$ReS_j} has leading or trailing blanks" + exit 498 + } + } + } + } + + if ( "$ReS_i" == "" ) { + NotDefd "reshape i" + } + + if ( "$ReS_Xij" == "" ) { + NotDefd "reshape xij" + } + + cap disp bsubstr(" ", 1, 1) + if ( _rc ) local substr substr + else local substr bsubstr + + global rVANS: copy global ReS_Xij_stubs + global S_1 + + * --------------------------- + * TODO: Is this of any value? + * --------------------------- + * global rVANS + * parse "$ReS_Xij", parse(" ") + * local i 1 + * while "``i''"!="" { + * Subname ``i'' + * global rVANS "$rVANS $S_1" + * local i = `i' + 1 + * } + * global S_1 + * --------------------------- + * TODO: Is this of any value? + * --------------------------- +end + +capture program drop NotDefd +program define NotDefd /* */ + hasanyinfo hasinfo + if (`hasinfo') { + di in red as smcl `"{bf:`*'} not defined"' + exit 111 + } + di as err "data have not been reshaped yet" + di as err in smcl "{p 4 4 2}" + di as err in smcl "What you typed is a syntax error because" + di as err in smcl "the data have not been {bf:reshape}d" + di as err in smcl "previously. 
The basic syntax of + di as err in smcl "{bf:reshape} is" + di as err in smcl + picture err cmds + exit 111 +end + +capture program drop hasanyinfo +program define hasanyinfo + args macname + + local cons : char _dta[ReS_i] + local grpvar : char _dta[ReS_j] + local values : char _dta[ReS_jv] + local vars : char _dta[ReS_Xij] + local car : char _dta[Res_Xi] + local atwl : char _dta[ReS_atwl] + local isstr : char _dta[ReS_str] + + local hasinfo 0 + local hasinfo = `hasinfo' | (`"`cons'"' != "") + local hasinfo = `hasinfo' | (`"`grpvar'"' != "") + local hasinfo = `hasinfo' | (`"`values'"' != "") + local hasinfo = `hasinfo' | (`"`values'"' != "") + local hasinfo = `hasinfo' | (`"`vars'"' != "") + local hasinfo = `hasinfo' | (`"`car'"' != "") + local hasinfo = `hasinfo' | (`"`atwl'"' != "") + local hasinfo = `hasinfo' | (`"`isstr'"' != "") + + c_local `macname' `hasinfo' +end + +capture program drop Subname +program define Subname /* */ + cap disp bsubstr(" ", 1, 1) + if ( _rc ) local substr substr + else local substr bsubstr + local name "`1'" + local sub "`2'" + local m = length(`"$ReS_match"') + local l = index("`name'", `"$ReS_match"') + local l = cond(`l' == 0, length("`name'") + 1, `l') + local a = `substr'("`name'", 1, `l' - 1) + local c = `substr'("`name'", `l' + `m', .) + global S_1 "`a'`sub'`c'" +end + +capture program drop NonUniqueLongID +program define NonUniqueLongID + di in red as smcl /// + "variable {bf:id} does not uniquely identify the observations" + di in red as smcl "{p 4 4 2}" + di in red as smcl "Your data are currently wide." + di in red as smcl "You are performing a {bf:reshape long}." + di in red as smcl "You specified {bf:$ReS_iname($ReS_i)} and {bf:$ReS_jname($ReS_j)}." + di in red as smcl "In the current wide form, variable {bf:$ReS_i}" + di in red as smcl "should uniquely identify the observations." 
+ di in red as smcl "Remember this picture:" + di in red + picture err + di in red as smcl "{p 4 4 2}" + di in red as smcl "Type {stata gduplicates examples $ReS_i} for examples of" + di in red as smcl "problem observations." + di in red as smcl "{p_end}" +end + +capture program drop NonUniqueWideJ +program NonUniqueWideJ + di in red as smcl /// + "values of variable {bf:$ReS_j} not unique within {bf:$ReS_i}" + di in red as smcl "{p 4 4 2}" + di in red as smcl "Your data are currently long." + di in red as smcl "You are performing a {bf:reshape wide}." + di in red as smcl "You specified {bf:$ReS_iname($ReS_i)} and" + di in red as smcl "{bf:$ReS_jname($ReS_j)}." + di in red as smcl "There are observations within" + di in red as smcl "{bf:$ReS_iname($ReS_i)} with the same value of" + di in red as smcl "{bf:$ReS_jname($ReS_j)}. In the long data," + di in red as smcl "variables {bf:$ReS_iname()} and {bf:$ReS_jname()} together" + di in red as smcl "must uniquely identify the observations." + di in red as smcl + picture err + di in red as smcl "{p 4 4 2}" + di in red as smcl "Type {stata gduplicates examples $ReS_i $ReS_j} for examples of" + di in red as smcl "problem observations." + di in red as smcl "{p_end}" +end + +capture program drop NonUniqueWideXi +program NonUniqueWideXi + * TODO: List problem variables + * forvalues i = 1 / `nxi' { + * if ( __greshape_xiproblem[`i'] ) { + * di in red as smcl /// + * "variable {bf:`1'} not constant within {bf:$ReS_i}" + * } + * } + di in red as smcl "{p 4 4 2}" + di in red as smcl "Your data are currently long." + di in red as smcl "You are performing a {bf:reshape wide}." + di in red as smcl "You typed something like" + di in red + di in red as smcl "{p 8 8 2}" + di in red as smcl "{bf:. reshape wide a b, $ReS_iname($ReS_i) $ReS_jname($ReS_j)}" + di in red + di in red as smcl "{p 4 4 2}" + di in red as smcl "There are variables other than {bf:a}," + di in red as smcl "{bf:b}, {bf:$ReS_i}, {bf:$ReS_j} in your data." 
+ di in red as smcl "They must be constant within" + di in red as smcl "{bf:$ReS_i} because that is the only way they can" + di in red as smcl "fit into wide data without loss of information." + di in red + di in red as smcl "{p 4 4 2}" + di in red as smcl "The variable or variables listed above are" + di in red as smcl "not constant within {bf:$ReS_i}. + di in red + di in red as smcl "{p 4 4 2}" + di in red as smcl "You must either add the variables" + di in red as smcl "to the list of xij variables to be reshaped," + di in red as smcl "or {bf:drop} them." + di in red as smcl "{p_end}" +end + +capture program drop ClearReshape +program ClearReshape + char _dta[ReS_ver] + char _dta[ReS_i] + char _dta[ReS_j] + char _dta[ReS_jv] + char _dta[ReS_Xij] + char _dta[Res_Xi] + char _dta[ReS_atwl] + char _dta[ReS_str] + local xijn : char _dta[ReS_Xij_n] + if "`xijn'" != "" { + forvalues i = 1/`xijn' { + char _dta[ReS_Xij_wide`i'] + char _dta[ReS_Xij_long`i'] + } + char _dta[ReS_Xij_n] + } + CleanExit +end + +*********************************************************************** +* Labels, etc. * +*********************************************************************** + +cap mata ustrregexm("a", "a") +if ( _rc ) local regex regex +else local regex ustrregex + +capture mata: mata drop LongToWideMetaSave() +capture mata: mata drop LongToWideMetaApply() +capture mata: mata drop WideToLongMetaSave() +capture mata: mata drop WideToLongMetaApply() +capture mata: mata drop ApplyDefaultFormat() +capture mata: mata drop ApplyCustomLabelFormat() + +mata: +transmorphic scalar LongToWideMetaSave(real scalar spread) +{ + transmorphic scalar LongToWideMeta + string rowvector rVANS, ReS_Xij,ReS_jv, ReS_jvraw, ReS_prefix + string scalar ReS_j, ReS_jvlb, ReS_labelformat + string scalar newvar, var, stub, lvl, fmt, lbl, fmtlbl + string matrix chars, _chars + real scalar i, j, k, prefix, njvars + + // Get all the meta info! 
Note that the label formatting only + // happens with single-variable input for keys()/j(), so we + // only grab the "first" element (because they get ignored + // if there are any other elements). + + LongToWideMeta = asarray_create() + fmt = "%s[%s]" + rVANS = tokens(st_global("rVANS")) + ReS_Xij = tokens(st_global("ReS_Xij")) + ReS_jv = tokens(st_global("ReS_jv")) + ReS_jvraw = tokens(st_global("ReS_jvraw")) + ReS_prefix = tokens(st_global("ReS_prefix")) + ReS_j = tokens(st_global("ReS_j"))[1] + njvars = cols(tokens(st_global("ReS_j"))) + ReS_jvlb = st_varvaluelabel(ReS_j) + ReS_labelformat = st_global("ReS_labelformat") + prefix = (length(ReS_prefix) > 0) + + asarray(LongToWideMeta, "rVANS", rVANS) + asarray(LongToWideMeta, "ReS_Xij", ReS_Xij) + asarray(LongToWideMeta, "ReS_jv", ReS_jv) + asarray(LongToWideMeta, "ReS_jvraw", ReS_jvraw) + asarray(LongToWideMeta, "ReS_prefix", ReS_prefix) + + // Keep labels, value labels, formats, and characteristics of + // each source variable. All will be applied to (copied to) each + // corresponding wide variable. + + spread = (spread & (cols(ReS_Xij) == 1)) + for (i = 1; i <= cols(ReS_Xij); i++) { + stub = ReS_Xij[i] + var = rVANS[i] + for (j = 1; j <= cols(ReS_jv); j++) { + lvl = ReS_jv[j] + lbl = ReS_jvraw[j] + chars = J(0, 2, "") + _chars = st_dir("char", var, "*") + if ( prefix ) { + newvar = GetVariableFromStubPrefix(stub, lvl, ReS_prefix[i]) + } + else { + newvar = spread? 
lvl: GetVariableFromStub(stub, lvl) + } + for (k = 1; k <= rows(_chars); k++) { + chars = chars \ ( + sprintf(fmt, newvar, _chars[k]), + st_global(sprintf(fmt, var, _chars[k])) + ) + } + if ( njvars > 1 ) { + fmtlbl = lbl + " " + st_varlabel(var) + } + else { + fmtlbl = ApplyCustomLabelFormat( + ReS_labelformat, + var, + st_varlabel(var), + ReS_j, + st_varlabel(ReS_j), + lbl, + ReS_jvlb + ) + } + // asarray(LongToWideMeta, newvar + "lbl", lbl + " " + st_varlabel(var)) + asarray(LongToWideMeta, newvar + "lbl", fmtlbl) + asarray(LongToWideMeta, newvar + "fmt", st_varformat(var)) + asarray(LongToWideMeta, newvar + "vlb", st_varvaluelabel(var)) + asarray(LongToWideMeta, newvar + "chr", chars) + } + } + + return (LongToWideMeta) +} + +void LongToWideMetaApply(transmorphic scalar LongToWideMeta, real scalar spread) +{ + + string rowvector ReS_Xij,ReS_jv, ReS_prefix + string scalar newvar, stub, lvl + string matrix chars + real scalar i, j, k, prefix + + ReS_Xij = asarray(LongToWideMeta, "ReS_Xij") + ReS_jv = asarray(LongToWideMeta, "ReS_jv") + ReS_prefix = asarray(LongToWideMeta, "ReS_prefix") + prefix = (length(ReS_prefix) > 0) + + spread = (spread & (cols(ReS_Xij) == 1)) + for (i = 1; i <= cols(ReS_Xij); i++) { + stub = ReS_Xij[i] + for (j = 1; j <= cols(ReS_jv); j++) { + lvl = ReS_jv[j] + if ( prefix ) { + newvar = GetVariableFromStubPrefix(stub, lvl, ReS_prefix[i]) + } + else { + newvar = spread? 
lvl: GetVariableFromStub(stub, lvl) + } + st_varlabel(newvar, asarray(LongToWideMeta, newvar + "lbl")) + st_varformat(newvar, asarray(LongToWideMeta, newvar + "fmt")) + + // Value labels only for numeric + if ( `regex'm(st_vartype(newvar), "str([1-9][0-9]*|L)") == 0 ) { + st_varvaluelabel(newvar, asarray(LongToWideMeta, newvar + "vlb")) + } + + chars = asarray(LongToWideMeta, newvar + "chr") + for (k = 1; k <= rows(chars); k++) { + st_global(chars[k, 1], chars[k, 2]) + } + + } + } +} + +transmorphic scalar WideToLongMetaSave() +{ + transmorphic scalar WideToLongMeta + string rowvector ReS_Xij, ReS_Xij_names, ReS_jv + string scalar var, nam, fmt, what, lvl + string scalar _lb2, _lbl, _fmt, _vlb + string matrix chars, _chars + string matrix notes, _notes, note0, _note0 + real matrix maplevel + real scalar i, j, k, notek, noten + real scalar any_lbl, any_lb2, any_fmt, any_vlb + real scalar ever_lbl, ever_fmt, ever_vlb + + maplevel = st_matrix("__greshape_maplevel") + + WideToLongMeta = asarray_create() + fmt = "%s[%s]" + ReS_Xij = tokens(st_global("ReS_Xij_stubs")) + ReS_jv = tokens(st_global("ReS_jv")) + ReS_Xij_names = tokens(st_global("ReS_Xij_names")) + + asarray(WideToLongMeta, "ReS_Xij", ReS_Xij) + asarray(WideToLongMeta, "ReS_jv", ReS_jv) + + ever_lbl = 0 + ever_fmt = 0 + ever_vlb = 0 + + // Keep labels, value labels, formats, and characteristics of each + // set of source variables. All will be applied to (copied to) each + // corresponding single long variable. Since this is a many to one, + // Labels, value labels, and formats are discarded if there is more + // than one (TODO: Add optionfor which to keep?). + // + // Notes are appended in the order they appear. We keep a unique set + // of notes from the source variables in the target variable. 
Note + // that variable notes are saved as variable characteristics, so we + // apply all variable characteristics first and all the (unique) + // notes second (we do not mind collisions and such, in part because + // it's a hassle but also because variable characteristics are such + // an advanced feature anyway that any user using them ought to + // be defining their behavior explicitly, so I don't even want to + // tinker with that too much). + + what = "" + for (i = 1; i <= rows(maplevel); i++) { + var = ReS_Xij[i] + chars = J(0, 2, "") + notes = J(0, 2, "") + _notes = J(0, 1, "") + noten = 0 + for (j = 1; j <= cols(maplevel); j++) { + if ( maplevel[i, j] ) { + lvl = ReS_jv[j] + nam = ReS_Xij_names[maplevel[i, j]] + _chars = st_dir("char", nam, "*") + for (k = 1; k <= rows(_chars); k++) { + chars = chars \ ( + sprintf(fmt, var, _chars[k]), + st_global(sprintf(fmt, nam, _chars[k])) + ) + } + + note0 = st_global(sprintf(fmt, nam, "note0")) + if ( note0 != "" ) { + notek = strtoreal(note0) + if ( notek < . ) { + for (k = 1; k <= notek; k++) { + _note0 = st_global(sprintf(fmt, nam, "note" + strofreal(k))) + if ( any(_note0 :== _notes) == 0 ) { + _notes = _notes \ _note0 + } + } + } + } + } + } + + any_lbl = 0 + any_lb2 = 0 + any_fmt = 0 + any_vlb = 0 + + _lbl = st_varlabel(nam) + _lb2 = substr(st_varlabel(nam), strlen(lvl) + 1, .) + _fmt = st_varformat(nam) + _vlb = st_varvaluelabel(nam) + for (j = 1; j <= cols(maplevel); j++) { + if ( maplevel[i, j] ) { + lvl = ReS_jv[j] + nam = ReS_Xij_names[maplevel[i, j]] + any_lbl = any_lbl | (_lbl != st_varlabel(nam)) + any_lb2 = any_lb2 | (_lb2 != substr(st_varlabel(nam), strlen(lvl) + 1, .)) + any_fmt = any_fmt | (_fmt != st_varformat(nam)) + any_vlb = any_vlb | (_vlb != st_varvaluelabel(nam)) + } + } + + // _notes = uniqrows(_notes) + if ( rows(_notes) > 0 ) { + noten = rows(_notes) + notes = J(noten + 1, 2, "") + notes[1, .] 
= (sprintf(fmt, var, "note0"), strofreal(noten)) + for (k = 1; k <= noten; k++) { + notes[1 + k, .] = (sprintf(fmt, var, "note" + strofreal(k)), _notes[k]) + } + } + + if ( any_lbl == 0 ) { + asarray(WideToLongMeta, var + "lbl", _lbl) + } + else if ( any_lb2 == 0 ) { + any_lbl = 0 + asarray(WideToLongMeta, var + "lbl", _lb2) + } + else { + any_lbl = 1 + any_lb2 = 1 + asarray(WideToLongMeta, var + "lbl", "") + } + + asarray(WideToLongMeta, var + "fmt", any_fmt? "": _fmt) + asarray(WideToLongMeta, var + "vlb", any_vlb? "": _vlb) + asarray(WideToLongMeta, var + "chr", chars) + asarray(WideToLongMeta, var + "nts", notes) + + ever_lbl = ever_lbl | any_lbl + ever_fmt = ever_fmt | any_fmt + ever_vlb = ever_vlb | any_vlb + } + + if ( ever_lbl ) + what = "labels" + + if ( ever_vlb ) + what = (what == "")? "value labels": what + ", value labels" + + if ( ever_fmt ) + what = (what == "")? "variable formats": what + ", variable formats" + + if ( what != "" ) + printf("(note: cannot preserve %s when reshaping long)\n", what) + + return (WideToLongMeta) +} + +void WideToLongMetaApply(transmorphic scalar WideToLongMeta) +{ + string rowvector ReS_Xij + string scalar var + string matrix chars, notes + real scalar i, k, f + + ReS_Xij = asarray(WideToLongMeta, "ReS_Xij") + for (i = 1; i <= cols(ReS_Xij); i++) { + var = ReS_Xij[i] + st_varlabel(var, asarray(WideToLongMeta, var + "lbl")) + + if ( `regex'm(st_vartype(var), "str([1-9][0-9]*|L)") == 0 ) { + st_varvaluelabel(var, asarray(WideToLongMeta, var + "vlb")) + } + + f = asarray(WideToLongMeta, var + "fmt") + if ( f == "" ){ + ApplyDefaultFormat(var) + } + else { + st_varformat(var, asarray(WideToLongMeta, var + "fmt")) + } + + chars = asarray(WideToLongMeta, var + "chr") + for (k = 1; k <= rows(chars); k++) { + st_global(chars[k, 1], chars[k, 2]) + } + + notes = asarray(WideToLongMeta, var + "nts") + for (k = 1; k <= rows(notes); k++) { + st_global(notes[k, 1], notes[k, 2]) + } + } + +} + +// If variable formats collide, reset to 
default format +void function ApplyDefaultFormat(string scalar var) +{ + string scalar v, l, f + v = st_vartype(var) + f = "" + if ( `regex'm(v, "str([1-9][0-9]*|L)") ) { + l = `regex's(1) + if ( l == "L" ) { + f = "%9s" + } + else { + f = "%" + `regex's(1) + "s" + } + } + else { + if ( v == "byte" ) { + f = "%8.0g" + } + else if ( v == "int" ) { + f = "%8.0g" + } + else if ( v == "long" ) { + f = "%12.0g" + } + else if ( v == "float" ) { + f = "%9.0g" + } + else if ( v == "double" ) { + f = "%10.0g" + } + else { + f = "" + } + } + if ( f != "" ) { + st_varformat(var, f) + } +} + +string scalar function ApplyCustomLabelFormat( + string scalar fmt, // Label format + string scalar stbnam, // stub variable name + string scalar stblbl, // stub variable label + string scalar varnam, // Key variable name + string scalar varlbl, // Key variable label + string scalar varval, // Key variable value + string scalar varvlbnam) // Key variable value label name +{ + string scalar regstbnam + string scalar regstblbl + string scalar regvarnam + string scalar regvarlbl + string scalar regvarval + string scalar regvarvlb + string scalar varvlb + string scalar out + real scalar numlbl + + numlbl = st_isnumvar(varnam)? strtoreal(varval): . + varvlb = varvlbnam == ""? "": st_vlmap(varvlbnam, numlbl) + + regstbnam = "#stubname#" + regstblbl = "#stublabel#" + regvarnam = "#keyname#" + regvarlbl = "#keylabel#" + regvarval = "#keyvalue#" + regvarvlb = "#keyvaluelabel#" + + // Fallbacks + if ( stblbl == "" ) stblbl = stbnam + if ( varlbl == "" ) varlbl = varnam + if ( varvlb == "" ) varvlb = varval + + out = subinstr(fmt, regstbnam, stbnam, .) + out = subinstr(out, regstblbl, stblbl, .) + out = subinstr(out, regvarnam, varnam, .) + out = subinstr(out, regvarlbl, varlbl, .) + out = subinstr(out, regvarval, varval, .) + out = subinstr(out, regvarvlb, varvlb, .) 
+ + return(out) +} +end + +capture program drop GreshapeTempFile +program GreshapeTempFile + if ( `"${GTOOLS_TEMPFILES_GRESHAPE_I}"' == "" ) { + local GTOOLS_TEMPFILES_GRESHAPE_I = 1 + global GTOOLS_TEMPFILES_GRESHAPE_I = 1 + } + else { + local GTOOLS_TEMPFILES_GRESHAPE_I = ${GTOOLS_TEMPFILES_GRESHAPE_I} + 1 + global GTOOLS_TEMPFILES_GRESHAPE_I = ${GTOOLS_TEMPFILES_GRESHAPE_I} + 1 + } + local f ${GTOOLS_TEMPDIR}/__gtools_tmpfile_greshape_`GTOOLS_TEMPFILES_GRESHAPE_I' + global GTOOLS_TEMPFILES_GRESHAPE ${GTOOLS_TEMPFILES_GRESHAPE} __gtools_tmpfile_greshape_`GTOOLS_TEMPFILES_GRESHAPE_I' + c_local `0': copy local f +end + +*********************************************************************** +* TODO Eventually * +*********************************************************************** + +* -------------------------------- +* TODO: Add collapse syntax for xi +* -------------------------------- + +* -------------------------------------------------------------------------------------------------------- +* TODO: for gather +* +* greshape gather (type target1 "Label" = varlist1) [(type target2 "label" = varlist2) ...], i(i) j(j) +* * 1 problem var +* disp as err "Incomparible types: taget1 is type but source var1 is type" +* * N problem var +* disp as err "Incomparible types for variables:" +* "taget1 is type but source var1 is type" +* "..." +* * With option force: Just go! No type checking. Set to missing if type is not compat. +* * Option for converting numeric to string? sprintf(...) +* -------------------------------------------------------------------------------------------------------- + +* ---------------------------------------------------- +* TODO: Add values of j to subset? j(j 2005 2006 2007) +* j(j 2005-2007) +* j(j a b c ...) +* ---------------------------------------------------- + +* ------------------------------------------------------------- +* TODO: better atwl, basically. 
match(num|str|anything|/regex/) +* ------------------------------------------------------------- diff --git a/01.code/ado/g/greshape.sthlp b/01.code/ado/g/greshape.sthlp new file mode 100755 index 0000000..29a3af2 --- /dev/null +++ b/01.code/ado/g/greshape.sthlp @@ -0,0 +1,309 @@ +{smcl} +{* *! version 0.5.0 26Jan2020}{...} +{viewerdialog greshape "dialog greshape"}{...} +{vieweralsosee "[R] greshape" "mansection R greshape"}{...} +{viewerjumpto "Syntax" "greshape##syntax"}{...} +{viewerjumpto "Description" "greshape##description"}{...} +{title:Title} + +{p2colset 5 17 23 2}{...} +{p2col :{cmd:greshape} {hline 2}} Fast alternative to reshape using C for speed. {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{pstd} +{opt greshape} is a fast alternative to {opt reshape} that additionally +implements the equivalents to R's {cmd:spread} and {cmd:gather} from +{cmd:tidyr}. Further, it allows an arbitrary number of grouping by variables +({opt i()}) and keys ({opt j()}). + +{p 4 8 2} +Normal syntax + +{p 8 8 2} In R parlance, {opt j()} is called {opt keys()}, which is +frankly much clearer to understand. Hence, while regular Stata syntax is +also supported, {cmd:greshape} provides the aliases {opt by()} and {opt keys()} +for {opt i()} and {opt j()} respectively (note {it:spread} and {it:wide} +accept multiple {it:j} keys). 
+ +{p 8 8 2} +wide->long + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:gather} +{varlist}{cmd:,} +{cmd:keys(}{varlist}{cmd:)} +{cmd:values(}{varname}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:long} +{it:stubnames}{cmd:,} +{cmd:by(}{varlist}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 8 8 2} +long->wide + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:spread} +{varlist}{cmd:,} +{cmd:keys(}{varname}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:wide} +{it:stubnames}{cmd:,} +{cmd:by(}{varlist}{cmd:)} +{cmd:keys(}{varname}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 8 8 2} {it:i} and {it:j} not only look similar, but even if you are +familiar with conventional matrix notation it's not immediately obvious +which is the grouping variable and which defines the column keys. In any +case, note that in {it:spread} and {it:gather} the {opt by()} option is +implicitly defined as every variable remaining in the data. + +{p 4 8 2} +Stata syntax + +{p 8 8 2} +I think the above syntax is clearer (happy to receive feedback otherwise) +but {cmd:greshape} also accepts the traditional Stata {it:i, j} syntax: + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:long} +{it:stubnames}{cmd:,} +{cmd:i(}{varlist}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 12 16 2} +{cmd:greshape} {helpb greshape##overview:wide} +{it:stubnames}{cmd:,} +{cmd:i(}{varlist}{cmd:)} +{cmd:j(}{varname}{cmd:)} +[{it:{help greshape##options_table:options}}] + +{p 4 8 2} +Details + +{p 8 8 2} +The {it:stubnames} are a list of variable {it:prefixes}. The suffixes are either +saved or taken from {opt keys()}, depending on the shape of the data. 
Remember +this picture: + + {it:long} + {c TLC}{hline 12}{c TRC} {it:wide} + {c |} {it:i j} {it:stub} {c |} {c TLC}{hline 16}{c TRC} + {c |}{hline 12}{c |} {c |} {it:i} {it:stub}{bf:1} {it:stub}{bf:2} {c |} + {c |} 1 {bf:1} 4.1 {c |} greshape {c |}{hline 16}{c |} + {c |} 1 {bf:2} 4.5 {c |} <{hline 10}> {c |} 1 4.1 4.5 {c |} + {c |} 2 {bf:1} 3.3 {c |} {c |} 2 3.3 3.0 {c |} + {c |} 2 {bf:2} 3.0 {c |} {c BLC}{hline 16}{c BRC} + {c BLC}{hline 12}{c BRC} + + long->wide + + {col 54}{it:j} existing variable(s) + {col 53}/ + {cmd:greshape wide} {it:stub}{cmd:, by(}{it:i}{cmd:) keys(}{it:j}{cmd:)} + + wide->long + + {cmd:greshape long} {it:stub}{cmd:, by(}{it:i}{cmd:) keys(}{it:j}{cmd:)} + {col 52}\ + {col 53}{it:j} new variable + +{p 8 8 2} +Additionally, the user can reshape in the style of R's {cmd:tidyr} package. +To go from long to wide: + + {cmd:greshape spread} {it:varlist}{cmd:, keys(}{it:j}{cmd:)} + +{p 8 8 2} +Note that {cmd:spread} (and {cmd:gather}) both require variable {it:names}, not prefixes. +Further, all variables not specified in the reshape are assumed to be +part of {cmd:by()} and the new variables are simply named after the values of +{cmd:keys()}. From wide to long: + + {cmd:greshape gather} {it:varlist}{cmd:, keys(}{it:j}{cmd:) values(}{it:values}{cmd:)} + +{p 8 8 2} +This does {opt not} check for duplicates or sorts the data. Variables not +named are assumed to be part of {cmd:by()}). The values of the variables in +{it:varlist} are saved in {cmd:values()}, with their names saved in {it:keys()}. + +{p 8 8 2} +{cmd:reshape}'s extended syntax is not supported; that is, {cmd:greshape} does +not implement "reshape mode" where a user can type {cmd:reshape long} or +{cmd:reshape wide} after the first reshape. This syntax is cumbersome to +support and prone to errors given the degree to which {cmd:greshape} had +to rewrite the base code. This also means the "advanced" commands +are not supported, including: clear, error, query, i, j, xij, and xi. 
+ +{synoptset 19 tabbed}{...} +{marker options_table}{...} +{synopthdr} +{synoptline} +{syntab :Long} +{synopt :* {opth by(varlist)}} use {it:varlist} as the ID variables (alias {opt i()}). +{p_end} +{synopt :{opth keys(varname)}} wide->long: {it:varname}, new variable to store stub suffixes (default {it:_j}; alias {opt j()}). +{p_end} +{synopt :{opt s:tring}} Whether to allow for string matches to each {it:stub} +{p_end} +{synopt :{opt match(str)}} Where to match levels of {opt keys()} in stub (default {opt @}). Use {opt match(regex)} for complex matches. +{p_end} +{synopt :{opt dropm:iss}} Drop missing observations for reshaped variables. +{p_end} + +{syntab :Wide} +{synopt :* {opth by(varlist)}} use {it:varlist} as the ID variables (alias {opt i()}). +{p_end} +{synopt :* {opth keys(varlist)}} long->wide: {it:varlist}, existing variable with stub suffixes (alias {opt j()}). +{p_end} +{synopt :{opth cols:epparate(str)}} Column separator when multiple variables are passed to {opt keys()}. +{p_end} +{synopt :{opt match(str)}} Where to replace levels of {opt keys()} in stub (default {opt @}). +{p_end} +{synopt :{opt labelf:ormat(str)}} Custom label format using placeholders (only with single variable input for {opt keys()}/{opt j()}). Default {it:#keyvalue# #stublabel#}. Other placeholders are {it:#stubname#}, {it:#keyname#}, {it:#keylabel#}, {it:#keyvaluelabel#}. +{p_end} +{synopt :{opt prefix(str)}} Custom renaming of reshaped variables (one rename per stub; {opt @} syntax allowed). +{p_end} + + +{syntab :Common long and wide options} +{synopt :{opt fast}} Do not wrap the reshape in preserve/restore pairs. +{p_end} +{synopt :{opt unsorted}} Leave the data unsorted (faster). Original sort order is {opt not} preserved. +{p_end} +{synopt :{opt nodupcheck}} wide->long, allow duplicate {opt by()} values (faster). +{p_end} +{synopt :{opt nomisscheck}} long->wide, allow missing values and/or leading blanks in {opt keys()} (faster). 
+{p_end} +{synopt :{opt nochecks}} This is equivalent to all 4 of the above options (fastest). +{p_end} +{synopt :{opt xi(drop)}} Drop variables not in the reshape, {opt by()}, or {opt keys()}. +{p_end} + +{synoptline} +{syntab :Gather} +{synopt :* {opth values(varname)}} Store values in {it:varname}. +{p_end} +{synopt :{opth keys(varname)}} wide->long: {it:varname}, new variable to store variable names (default {it:_key}). +{p_end} +{synopt :{opt usel:abels[{cmd:(}{it:str}{cmd:)}]}} Store variable labels instead of their names (optionally specify which variables to do this for). +{p_end} +{synopt :{opt dropm:iss}} Drop missing observations for reshaped variables. +{p_end} + +{syntab :Spread} +{synopt :* {opth keys(varlist)}} long->wide: {it:varlist}, existing variable with variable names. +{p_end} +{synopt :{opt labelf:ormat(str)}} Custom label format using placeholders (only with single variable input for {opt keys()}/{opt j()}). Default {it:#keyvalue# #stublabel#}. Other placeholders are {it:#stubname#}, {it:#keyname#}, {it:#keylabel#}, {it:#keyvaluelabel#}. +{p_end} +{synopt :{opt prefix(str)}} Custom renaming of reshaped variables (one common rename; {opt @} syntax allowed). +{p_end} + +{syntab :Common gather and spread options} +{synopt :{opth by(varlist)}} Check that {it:varlist} are the ID variables. Throws an error otherwise. +{p_end} +{synopt :{opt xi(drop)}} Drop variables not in the reshape or in {opt by()}. +{p_end} +{synopt :{opt fast}} Do not wrap the reshape in preserve/restore pairs. +{p_end} + +{synoptline} +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). 
Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{syntab:* options are required for that subcommand.} +{p2colreset}{...} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:greshape} converts data from wide to long form and vice versa. It +is a fast alternative to {cmd:reshape}, and it additionally implements +{cmd:greshape spread} and {cmd:greshape gather}, both of which are +marginally faster and in the style of the equivalent R commands from +{cmd:tidyr}. + +{pstd} +It is well-known that {cmd:reshape} is a slow command, and there are +several alternatives that I have encountered to speed up reshape, +including: {opt fastreshape}, {opt parallel}, {opt sreshape}, and various +custom solutions (e.g. {browse "http://www.nber.org/stata/efficient/reshape.html":here}). +While these alternatives are slower than {cmd:greshape}, the speed gains are +not uniform. + +{pstd} +If {opt j()} is numeric, the data is already sorted by {opt i()}, and +there are not too many variables to reshape, then {cmd:fastreshape} comes +closest to achieving comparable speeds to {cmd:greshape}. Under most +circumstances, however, {cmd:greshape} is typically 20-60% faster than +{cmd:fastreshape} on sorted data, and up to 90% faster if {opt j()} has +string values {it:or} if the data is unsorted (by default, {cmd:greshape} will +output the data in the correct sort order). In other words, {cmd:greshape}'s +speed gains are very robust, while other solutions' are not. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/greshape/index.html#examples":online documentation} +for examples. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:greshape} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:reshape}. +{p_end} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gtools}; {help reshape} diff --git a/01.code/ado/g/groupfunction.ado b/01.code/ado/g/groupfunction.ado new file mode 100755 index 0000000..1876323 --- /dev/null +++ b/01.code/ado/g/groupfunction.ado @@ -0,0 +1,795 @@ +*!version 2.2 (28 Jan 2021) +* Fixed python code, added from __main__ import gini +* version 2.1.1 (27 Jan 2021) +* fix python indentation +* version 2.0 (04 April 2020) groupfunction +* option [in] [if] added +* version 2.0 (04 Apr 2020) +* python Gini added +* version 1.0 (05 Dec 2017) +* +* Paul Corral - World Bank Group +* Minh Nguyen - World Bank Group +* Joao Pedro Azevedo - World Bank Group + + +* groupfunction: aggregates the listed numeric variables within by() groups +* (sum, rawsum, mean, first, max, min, count, sd, variance, gini, theil). +* With the merge option the results are stored back into the current data as new variables; +* otherwise the data in memory are cleared and replaced by one row per group. +cap prog drop groupfunction +program define groupfunction, eclass + version 11.2, missing + #delimit; + syntax [if] [in] [aw pw fw] , + [ + sum(varlist numeric) + rawsum(varlist numeric) + mean(varlist numeric) + first(varlist numeric) + max(varlist numeric) + min(varlist numeric) + count(varlist numeric) + sd(varlist numeric) + gini(varlist numeric) + theil(varlist numeric) + VARiance(varlist numeric) + by(varlist) 
+ norestore + xtile(varlist numeric) + nq(numlist max=1 int >0) + missing + slow + merge + ]; +#delimit cr +qui{ +if ("`by'"==""){ + tempvar myby + gen `myby' = 1 + local by `myby' + +} +if ("`xtile'"!="") local forby forby +local wvar : word 2 of `exp' +if ("`norestore'"!="") keep `wvar' `by' `sum' `rawsum' `mean' `first' `max' `min' `count' `sd' `variance' `gini' `theil' `xtile' + tempvar _useit _gr0up _thesort + marksample _useit + + //save label + qui: label dir + local _allv `r(names)' + foreach x of varlist `by' { + local Nm: val lab `x' + + if ("`Nm'"!=""){ + local l: value label `x' + local vallabs "`vallabs' `l'" + } + } + local vallabs: list vallabs & _allv + if ("`vallabs'"!=""){ + tempfile labeldo + label save `vallabs' using `labeldo', replace + } + + //Weights + local wvar : word 2 of `exp' + if "`wvar'"==""{ + tempvar peso + gen `peso'=1 + local wvar `peso' + } + else{ + replace `_useit'=0 if missing(`wvar') + } + if ("`forby'"!=""){ + //Only one var can be specified + tokenize `xtile' + if ("`2'"!=""){ + dis as error "When specifying xtile variables with forby option, only one is allowed" + error 111 + exit + } + if ("`nq'"==""){ + dis as error "When specifying xtile, nq needs to be specified" + error 111 + exit + } + replace `_useit'=0 if missing(`xtile') + } + + local procs sum mean first max min count rawsum sd variance gini theil xtile + + local empty=1 + + foreach ss of local procs{ + if ("``ss''"!="") local empty = 0 + } + + if (`empty'==1){ + display as error "Please specify variables for sum, rawsum, mean, min, max, count, sd, variance, or first" + error 301 + exit + } + + // NOTE(review): by was already defaulted to a constant tempvar at the top of the program, so this branch looks unreachable -- confirm + if ("`by'"==""){ + gen `_gr0up' =1 + local by `_gr0up' + } + + + //adjust here when adding new functions + local procs sum mean first max min count rawsum sd variance gini theil + local check + foreach x of local procs{ + local `x': list uniq `x' + local check: list check & `x' + } + + + // NOTE(review): the loop above fills local check, but the test below reads local checks, which is not set anywhere in the visible code; the message presumably never fires -- confirm + if ("`checks'"!=""){ + display as error "Please specify unique names for variables in 
sum, first, mean, max, min, gini, theil options" + } + + local numby = wordcount("`by'") + if ("`forby'"=="") local ++numby + + // NOTE(review): local strs is only filled in the loop over the by variables further down; at this point it appears to be empty -- confirm + if (`numby'>1|"`strs'"!=""){ + tempvar _1x1 + egen `_1x1' = group(`by') + local thearea `_1x1' + } + else{ + local thearea `by' + } + + if ("`forby'"=="") sort `thearea' + else{ + levelsof `thearea' if `_useit'==1, local(myforby) + + foreach hi of local myforby{ + tempvar _myby + gen `_myby' = `thearea'==`hi' + mata: w=st_data(.,tokens("`wvar'"),"`_myby'") + mata: st_view(__i=.,.,tokens("`xtile'"),"`_myby'") + mata:__i[.,.] =_fpctilebig(__i,1,`nq',w) + } + sort `thearea' `xtile' + local thearea `thearea' `xtile' + local by `by' `xtile' + } + + //Account for more than one by variable, and strings + foreach x of local by{ + if ("`:val lab `x''"!="") local lbl_`x' : val lab `x' + cap confirm string variable `x' + if (_rc==0){ + local strpres =1 + local strs `strs' `x' + mata: st_sview(strs=.,.,tokens("`strs'"),"`_useit'") + } + + } + + local by2: list by - strs + mata: st_view(nostrs=.,.,tokens("`by2'"),"`_useit'") + + + //Import data into mata + mata: w=st_data(.,tokens("`wvar'"),"`_useit'") + + if ("`forby'"=="") mata: st_view(area=.,.,tokens("`thearea'"),"`_useit'") + else mata: st_view(area=.,.,tokens("`xtile'"),"`_useit'") + /* + mata: if (allof(x:==.)) st_local(cont, 0) + if ("`cont'"=="0"){ + display as error "Your by group has all values missing" + error 119 + exit + } + */ + mata: info = panelsetup(area,1) + + //mata: rows(info) + //Get area matrix + + if ("`slow'"!="") local slow=1 + else local slow=0 + + if ("`strs'"!=""){ + mata: strs=strs[info[.,1],.] + mata: nostrs=nostrs[info[.,1],.] + } + else{ + mata: nostrs=nostrs[info[.,1],.] 
+ } + + if ("`missing'"=="") local miss = 0 + else local miss = 1 + + // each block below views the requested variables, aggregates them by panel in mata and, unless merge is set, drops the source variables + if ("`sum'"!=""){ + mata: st_view(x=.,.,tokens("`sum'"),"`_useit'") + mata: xsum = _fastsum(x,w,info, `miss') + local todrop: list sum - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`rawsum'"!=""){ + mata: st_view(x=.,.,tokens("`rawsum'"),"`_useit'") + + mata: w2=J(rows(w),1,1) + mata: xrawsum = _fastsum(x,w2,info, `miss') + local todrop: list rawsum - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`count'"!="") { + mata: st_view(x=.,.,tokens("`count'"),"`_useit'") + mata: xcount = _fastcount(x,info) + local todrop: list count - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`mean'"!=""){ + mata: st_view(x=.,.,tokens("`mean'"),"`_useit'") + mata: xmean = _fastmean(x,w,info, `slow') + local todrop: list mean - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`sd'"!=""){ + mata: st_view(x=.,.,tokens("`sd'"),"`_useit'") + mata: xsd = sqrt(_fastvariance(x,w,info)) + local todrop: list sd - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`variance'"!=""){ + mata: st_view(x=.,.,tokens("`variance'"),"`_useit'") + mata: xvariance = (_fastvariance(x,w,info)) + local todrop: list variance - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`first'"!=""){ + mata: st_view(x=.,.,tokens("`first'"),"`_useit'") + mata: xfirst = _fastfirst(x,info) + local todrop: list first - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`max'"!=""){ + mata: st_view(x=.,.,tokens("`max'"),"`_useit'") + mata: xmax = _fastmax(x,info) + local todrop: list max - wvar + local todrop: list todrop - thearea + if 
("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + if ("`min'"!=""){ + mata: st_view(x=.,.,tokens("`min'"),"`_useit'") + mata: xmin = _fastmin(x,info) + local todrop: list min - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + + // gini and theil are computed one group at a time by calling CpBCaLC; the scalar results are stacked into mata matrices + // NOTE(review): levelsof and the CpBCaLC call below do not condition on the _useit sample marker, unlike the mata views above -- confirm this is intended + if ("`gini'"!=""|"`theil'"!=""){ + levelsof `thearea', local(misareas) + foreach indic in gini theil{ + local mivars=1 + foreach y of local `indic'{ + local careas =1 + foreach geo of local misareas{ + CpBCaLC `y' if `thearea'==`geo' [aw=`wvar'], `indic' + if (`careas'==1) mata: x`indic'1 = `r(`indic')' + else mata: x`indic'1 = x`indic'1 \ `r(`indic')' + + local ++careas + } + if (`mivars'==1) mata: x`indic' = x`indic'1 + else mata: x`indic' = x`indic',x`indic'1 + local ++mivars + } + + local todrop: list `indic' - wvar + local todrop: list todrop - thearea + if ("`todrop'"!="" & "`merge'"=="") drop `todrop' + } + } + + foreach x of local procs{ + if ("``x''"!=""){ + local finmat `finmat' x`x' + local procs2 `procs2' `x' + } + } + local finmat=subinstr("`finmat'"," ",",",.) + mata: xx=(`finmat') + //Save number of observations + mata: st_matrix("_obs_",rows(info)) + + if ("`merge'"!=""){ + local lasvar `procs2' + foreach x of local lasvar{ + foreach y of local `x'{ + if ("`exp'"=="") local WwW + else local WwW w + gen double `WwW'`x'_`y' = . 
+ if ("`exp'"=="") lab var `WwW'`x'_`y' "`x' of `y'" + else lab var `WwW'`x'_`y' " Weighted `x' of `y'" + local _all `_all' `WwW'`x'_`y' + } + } + // theexpanse() is not defined in the visible part of this file; presumably it expands the per-group rows of xx back to observation level -- TODO confirm + noi:mata: st_store(.,tokens(st_local("_all")),"`_useit'",theexpanse(info,xx)) + } + else{ + //SAVE RESULTS in STATA + clear + local lasvar by2 `procs2' + local o=_obs_[1,1] + + set obs `o' + if "`strs'"!=""{ + local ccc=1 + foreach x of local strs{ + mata:st_local("ha",strofreal(max(strlen(strs[.,`ccc'])))) + if (`ha'!=0) gen str`ha' `x'= "" + else gen str1 `x'= "" + lab var `x' "Group by `x'" + local ccc=`ccc'+1 + } + mata: st_sstore(.,tokens(st_local("strs")),.,strs) + } + + foreach x of local lasvar{ + foreach y of local `x'{ + if ("`x'"!="by2"){ + gen double `y' = . + lab var `y' "`x' of `y'" + local _all `_all' `y' + } + else{ + gen double `y' = . + lab var `y' "Group by `y'" + //if `"`_val`y''"'!=""{ + //lab def `y' `_val`y'' + //lab val `y' `y' + //} + } + } + } + + if ("`by2'"!="") mata: st_store(.,tokens(st_local("by2")),.,nostrs) + mata: st_store(.,tokens(st_local("_all")),.,xx) + //apply label back + if ("`vallabs'"!=""){ + do `labeldo' + foreach x of local by2 { + if "`lbl_`x''"~="" lab val `x' `lbl_`x'' + } + } + } + mata: mata drop area info nostrs + cap mata: mata drop x + cap mata: mata drop y + cap mata: mata drop w + cap mata: mata drop xmean + cap mata: mata drop xgini + cap mata: mata drop xgini1 + cap mata: mata drop xsum + cap mata: mata drop xrawsum + cap mata: mata drop w2 + cap mata: mata drop xx + +} + +end + +mata +mata set matastrict off + +// weighted mean of (y/m):*ln(y/m), where m is the weighted mean of y +function _TTheil(y,w){ + one=ln(y:/mean(y,w)) + two=one:*(y:/mean(y,w)) + return(mean(two,w)) +} + +// applies _TTheil to every column of x within each panel (row of info); not called anywhere in the visible code +function _fasttheil(real matrix x, real matrix w, real matrix info){ + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + + for(i=1; i<=r;i++){ + panelsubview(xi=.,x,i,info) + panelsubview(wi=.,w,i,info) + for(j=1;j<=jj;j++){ + X1[i,j] = _TTheil(xi[.,j],wi) + } + } + return(X1) +} + + +// Gini-type index of x with weights w, computed from running sums of x:*w after sorting the rows by x +function _GGini(x, w) { + t = x,w + _sort(t,1) + x=t[.,1] 
+ w=t[.,2] + xw = x:*w + rxw = quadrunningsum(xw) :- (xw:/2) + return(1- 2*((quadcross(rxw,w)/quadcross(x,w))/quadcolsum(w))) +} + +// applies _GGini to every column of x within each panel (row of info); not called anywhere in the visible code +function _fastgini(real matrix x, real matrix w, real matrix info){ + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + + + for(i=1; i<=r;i++){ + panelsubview(xi=.,x,i,info) + panelsubview(wi=.,w,i,info) + for(j=1;j<=jj;j++){ + X1[i,j] = _GGini(xi[.,j],wi) + } + } + return(X1) +} + + +//data should have been previously sorted +// per-panel weighted mean of each column of x; column-by-column panel subviews are used when x or w has missings or slow==1, range subscripts otherwise +function _fastmean(real matrix x, real matrix w, real matrix info, slow){ + + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + if (((hasmissing(x)+hasmissing(w))!=0)|slow==1){ + //slow option + for(i=1; i<=r;i++){ + panelsubview(xi=.,x,i,info) + panelsubview(wi=.,w,i,info) + for(j=1;j<=jj;j++){ + X1[i,j] = mean(xi[.,j],wi) + } + } + } + else{ + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + rr2 = info[i,1],1 \ info[i,2],1 + X1[i,.] = mean(x[|rr|],w[|rr2|]) + } + } + return(X1) +} + +// per-panel weighted variance of each column of x (diagonal of quadvariance) +function _fastvariance(real matrix x, real matrix w, real matrix info){ + + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + if ((hasmissing(x)+hasmissing(w))!=0){ + //slow option + for(i=1; i<=r;i++){ + panelsubview(xi=.,x,i,info) + panelsubview(wi=.,w,i,info) + for(j=1;j<=jj;j++){ + X1[i,j] = diagonal(quadvariance(xi[.,j],wi))' + } + } + } + else{ + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + rr2 = info[i,1],1 \ info[i,2],1 + X1[i,.] 
= diagonal(quadvariance(x[|rr|],w[|rr2|]))' + } + } + return(X1) +} + + +//data should have been previously sorted +// per-panel sum of x:*w; when x or w has missings and miss==1, a column with no nonmissing values in a panel returns missing +function _fastsum(real matrix x, real matrix w, real matrix info, miss){ + //ww = strtoreal(stlocal("rawsum") + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + if ((hasmissing(x)+hasmissing(w))!=0){ + //slow option + for(i=1; i<=r;i++){ + panelsubview(xi=.,x,i,info) + panelsubview(wi=.,w,i,info) + + for(j=1;j<=jj;j++){ + if (miss==1){ + if (colnonmissing(xi[.,j])!=0){ + if(j==1) X1[i,1] = quadcolsum(xi[.,j]:*wi) + else X1[i,j] = quadcolsum(xi[.,j]:*wi) + } + else{ + if(j==1) X1[i,1] = . + else X1[i,j] = . + } + } + else{ + if(j==1) X1[i,1] = quadcolsum(xi[.,j]:*wi) + else X1[i,j] = quadcolsum(xi[.,j]:*wi) + } + } + } + } + else{ + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + rr2 = info[i,1],1 \ info[i,2],1 + X1[i,.] = quadcolsum(x[|rr|]:*w[|rr2|]) + } + } + return(X1) +} + +// row of x at the first observation of each panel +function _fastfirst(real matrix x, real matrix info){ + + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + X1[i,.] = x[info[i,1],.] + } + + + return(X1) +} + +// column maxima of x within each panel +function _fastmax(real matrix x, real matrix info){ + + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + X1[i,.] = colmax(x[|rr|]) + } + + + return(X1) +} + +// column minima of x within each panel +function _fastmin(real matrix x, real matrix info){ + + r = rows(info) + jj = cols(x) + X1 = J(rows(info),cols(x),0) + //check to see if we can use block + + for(i=1; i<=r; i++){ + rr = info[i,1],. \info[i,2],. + X1[i,.] = colmin(x[|rr|]) + } + + + return(X1) +} + +//data should have been previously sorted +// per-panel count of nonmissing values (x:<.) in each column of x +function _fastcount(real matrix x, real matrix info) { + r = rows(info) + jj = cols(x) + X1 = J(r,jj,0) + //fest = (st_local("std")~="" ? 
J(2,3,NULL) : J(1,3,NULL)) + for(i=1; i<=r; i++){ + rr = info[i,1],. \ info[i,2],. + rr2 = info[i,1],1 \ info[i,2],1 + X1[i,.] = quadcolsum((x[|rr|]:<.)) + } + return(X1) +} + +end + +*! version 2.0 (21 March 2020) cpbcalc +* Paul Corral - World Bank Group +* Jose Montes - World Bank Group +* Joao Pedro Azevedo - World Bank Group + +* CpBCaLC: returns in r() the gini, theil and/or FGT poverty measure of one numeric variable over the marked sample +cap prog drop CpBCaLC +program define CpBCaLC, rclass + version 11.2 + syntax varlist(max=1 numeric) [if] [in] [aw pw fw] , [ gini theil poverty(numlist >=0 max=1 integer) line(numlist >0 max=1)] + +marksample touse1 +local vlist: list uniq varlist + +//Get weights matrix +mata:st_view(y=., .,"`vlist'","`touse1'") + +//Weights + local wvar : word 2 of `exp' + if "`wvar'"=="" { + tempvar w + gen `w' = 1 + local wvar `w' + } + +mata:st_view(w=., .,"`wvar'","`touse1'") + +if ("`poverty'"!=""){ + if ("`line'"==""){ + dis as error "You need to specify a threshold for poverty calculation" + error 198 + exit + } +} + +if ("`line'"!=""){ + if ("`poverty'"==""){ + dis as error "You specified a poverty line, but no FGT value" + error 198 + exit + } +} + + + + +local options gini theil poverty + +foreach x of local options{ + if ("``x''"!=""){ + if ("`x'"=="poverty"){ + mata:p=_CPBfgt(y,w,`line',``x'') + mata: st_local("_x0",strofreal(p)) + return local fgt``x'' = `_x0' + + dis in green "fgt``x'' : `_x0'" + } + else{ + if ("`x'"=="gini"){ + // try a Python gini() taken from Python's __main__ first (it is not defined in the visible code); on any nonzero return code fall back to the mata implementation + // NOTE(review): both branches print their message with -as error-, including the success message -- confirm this is intended + cap python: from __main__ import gini ; gini("`vlist'","`wvar'","`touse1'") + if _rc!=0{ + display as error "No Python :(" + mata:p=_CPB`x'(y,w) + mata: st_local("_x0",strofreal(p)) + return local `x' = `_x0' + } + else{ + display as error "Yay Python!" 
+ return local `x' = r(gini) + } + } + else{ + mata:p=_CPB`x'(y,w) + mata: st_local("_x0",strofreal(p)) + return local `x' = `_x0' + + dis in green "`x' : `_x0'" + } + } + } +} + + +end + +mata +function _CPBtheil(y,w){ + one=ln(y:/mean(y,w)) + two=one:*(y:/mean(y,w)) + return(mean(two,w)) +} + + function _CPBtheils(x,w){ + + for(i=1;i<=cols(x);i++){ + if (i==1) out = _CPBtheil(x[.,i],w) + else out = out,_CPBtheil(x[.,i],w) + } + return(out) + } + +function _CPBgini(x, w) { + t = x,w + _sort(t,1) + x=t[.,1] + w=t[.,2] + xw = x:*w + rxw = quadrunningsum(xw) :- (xw:/2) + return(1- 2*((quadcross(rxw,w)/quadcross(x,w))/quadcolsum(w))) +} + +function _CPBginis(x,w){ + + for(i=1;i<=cols(x);i++){ + if (i==1) out = _CPBgini(x[.,i],w) + else out = out,_CPBgini(x[.,i],w) + } +return(out) +} + +// NOTE(review): the text from here on looks truncated in this extract: the body of _CPBfgt breaks off and runs into the middle of a different program (gstats) -- check against the original files +function _CPBfgt(x,w,z,a){ + return(mean((x: 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + if ( ("`weight'" == "iweight") & ("`stat'" == "hdfe") ) { + disp as err "iweight not allowed" + exit 101 + } + + if ( `"`weight'"' != "" ) { + tempvar touse w + qui gen double `w' `exp' `if' `in' + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + mark `touse' `if' `in' `wgt' + local if if `touse' + } + else local weights + + local opts `weights' `compress' `forcestrl' nods `unsorted' `missing' + local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance' + local opts `opts' `oncollision' `hashmethod' `debug' + local gstats gfunction(stats) gstats(`stat' `anything', `options' `statprefix') + + cap noi _gtools_internal `by' `if' `in', `opts' `gstats' + local rc = _rc + global GTOOLS_CALLER "" + + * Special handling of exit behavior + * --------------------------------- + + if ( `"`stat'"' == "summarize" ) { + if ( inlist(`rc', 17001, 18201) ) { + return scalar N = 0 + return scalar sum_w = 0 + return scalar sum = 0 + } + } + + * Cleanup + * ------- + + if ( `rc' == 17999 ) { + exit 17000 + } + else if ( `rc' == 17001 ) { + 
di as txt "(no observations)" + exit 0 + } + else if ( `rc' == 18201 ) { + exit 0 + } + else if ( `rc' == 18402 ) { + di as txt "gstats_hdfe: maximum number of iterations exceeded; convergence not achieved" + exit 430 + } + else if ( `rc' == 18301 ) { + di as txt "gstats_transform: internal parsing error (unexpected number of stats in transform)" + exit `rc' + } + else if ( `rc' ) exit `rc' + + * Returns + * ------- + + * return scalar N = `r(N)' + return scalar J = `r(J)' + return scalar minJ = `r(minJ)' + return scalar maxJ = `r(maxJ)' + + * Extra returns + * ------------- + + if ( `"`stat'"' == "hdfe" ) { + tempname hdfe_nabsorb + matrix `hdfe_nabsorb' = r(hdfe_nabsorb) + return scalar N = `r(hdfe_nonmiss)' + if `r(hdfe_saveabs)' { + return matrix nabsorb = `hdfe_nabsorb' + } + if `r(hdfe_saveinfo)' { + return scalar iter = `r(hdfe_iter)' + return scalar feval = `r(hdfe_feval)' + } + return local algorithm = "`r(hdfe_method)'" + } + + if ( `"`stat'"' == "winsor" ) { + return scalar cutlow = r(gstats_winsor_cutlow) + return scalar cuthigh = r(gstats_winsor_cuthigh) + } + + if ( `"`stat'"' == "summarize" ) { + if ( `r(gstats_summarize_tabstat)' ) { + * disp as txt "({bf:warning}: r() results not currently saved)" + } + + { + return scalar N = r(gstats_summarize_N) // number of observations + return scalar sum_w = r(gstats_summarize_sum_w) // sum of the weights + return scalar sum = r(gstats_summarize_sum) // sum of variable + return scalar mean = r(gstats_summarize_mean) // mean + return scalar min = r(gstats_summarize_min) // minimum + return scalar max = r(gstats_summarize_max) // maximum + + if ( `r(gstats_summarize_normal)' ) { + return scalar Var = r(gstats_summarize_Var) // variance + return scalar sd = r(gstats_summarize_sd) // standard deviation + } + + if ( `r(gstats_summarize_detail)' ) { + return scalar p1 = r(gstats_summarize_p1) // 1st percentile (detail only) + return scalar p5 = r(gstats_summarize_p5) // 5th percentile (detail only) + return scalar 
p10 = r(gstats_summarize_p10) // 10th percentile (detail only) + return scalar p25 = r(gstats_summarize_p25) // 25th percentile (detail only) + return scalar p50 = r(gstats_summarize_p50) // 50th percentile (detail only) + return scalar p75 = r(gstats_summarize_p75) // 75th percentile (detail only) + return scalar p90 = r(gstats_summarize_p90) // 90th percentile (detail only) + return scalar p95 = r(gstats_summarize_p95) // 95th percentile (detail only) + return scalar p99 = r(gstats_summarize_p99) // 99th percentile (detail only) + return scalar skewness = r(gstats_summarize_skewness) // skewness (detail only) + return scalar kurtosis = r(gstats_summarize_kurtosis) // kurtosis (detail only) + + return scalar smallest1 = r(gstats_summarize_smallest1) // smallest + return scalar smallest2 = r(gstats_summarize_smallest2) // 2nd smallest + return scalar smallest3 = r(gstats_summarize_smallest3) // 3rd smallest + return scalar smallest4 = r(gstats_summarize_smallest4) // 4th smallest + return scalar largest4 = r(gstats_summarize_largest4) // 4th largest + return scalar largest3 = r(gstats_summarize_largest3) // 3rd largest + return scalar largest2 = r(gstats_summarize_largest2) // 2nd largest + return scalar largest1 = r(gstats_summarize_largest1) // largest + } + } + + if ( `r(gstats_summarize_pooled)' ) { + return local varlist `r(statvars)' + } + } +end diff --git a/01.code/ado/g/gstats.sthlp b/01.code/ado/g/gstats.sthlp new file mode 100755 index 0000000..d61fb06 --- /dev/null +++ b/01.code/ado/g/gstats.sthlp @@ -0,0 +1,92 @@ +{smcl} +{* *! version 0.4.0 09Jun2019}{...} +{viewerdialog gstats "dialog gstats"}{...} +{vieweralsosee "[R] gstats" "mansection R gstats"}{...} +{viewerjumpto "Syntax" "gstats##syntax"}{...} +{viewerjumpto "Description" "gstats##description"}{...} +{title:Title} + +{p2colset 5 15 23 2}{...} +{p2col :{cmd:gstats} {hline 2}} Various statistical fucntions and transformations. 
{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats} +{it:subcommand} +{varlist} +{ifin} +[{it:{help gstats##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats##table_options:subcommand_options}}] + +{phang} +{opt gstats} is a wrapper for various statistical functions and +transformations, including: + +{p 8 17 2} +{help gstats hdfe:{bf:hdfe}} +(alias {help gstats hdfe:{bf:residualize}}) is a fast utility for residualizing variables (i.e. HDFE transform; accepts weights). {p_end} + +{p 8 17 2} +{help gstats winsor:{bf:winsor}} +as a fast {opt winsor2} alternative (accepts weights). {p_end} + +{p 8 17 2} +{help gstats summarize:{bf:{ul:sum}marize}} and +{help gstats summarize:{bf:{ul:tab}stat}} are fast, +by-able alternatives to {opt summarize, detail} and {opt tabstat} (accept weights). {p_end} + +{p 8 17 2} +{help gstats transform:{bf:transform}} +to apply various statistical transformations (accepts weights). {p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats} is a wrapper to several statistical functions and +transformations. In theory {opt gegen} would be the place to expand +{opt gtools}; however, {opt gegen}'s internally implemented functions +were written with two assumptions: first, the output is unique at the +group level; second, there is always a target variable. {opt gstats} +is written to be more flexible and allow arbitrary functions and +transformations. + +{pstd} +Weights are supported for the following subcommands: {it:winsor}, {it:summarize}, {it:tabstat}, {it:residualize}. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gtools} diff --git a/01.code/ado/g/gstats_hdfe.sthlp b/01.code/ado/g/gstats_hdfe.sthlp new file mode 100755 index 0000000..f45c29a --- /dev/null +++ b/01.code/ado/g/gstats_hdfe.sthlp @@ -0,0 +1,270 @@ +{smcl} +{* *! version 0.1.0 14Mar2022}{...} +{viewerdialog gstats_hdfe "dialog gstats_hdfe"}{...} +{vieweralsosee "[R] gstats_hdfe" "mansection R gstats_hdfe"}{...} +{viewerjumpto "Syntax" "gstats_hdfe##syntax"}{...} +{viewerjumpto "Description" "gstats_hdfe##description"}{...} +{viewerjumpto "Statistics" "gstats_hdfe##statistics"}{...} +{title:Title} + +{p2colset 5 20 23 2}{...} +{p2col :{cmd:gstats hdfe} {hline 2}} Absorb HDFE (residualize variables) {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Warning}: {opt gstats hdfe} is in beta; see {help gstats hdfe##missing:missing features}. +(To enable beta, define {cmd:global GTOOLS_BETA = 1}.) 
+{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats hdfe} +{varlist} +{ifin} +[{it:{help gstats hdfe##weight:weight}}] +[ +{cmd:,} {opth absorb(varlist)} +{c -(}{opth gen(newvarlist)}{c |}{opt prefix(str)}{c |}{cmd:replace}{c )-} +{it:{help gstats hdfe##table_options:options}} +] + +{pstd} If none of {cmd:gen()}, {cmd:prefix()}, or {cmd:replace} are +specified then {it:target}{cmd:=}{it:source} syntax must be supplied +instead of {varlist}: + +{p 8 17 2} +{it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + +{pstd} +{cmd:gstats hdfe} (alias {cmd:gstats residualize}) provides a fast way of +absorbing high-dimensional fixed effects (HDFE). It saves the number of levels +in each absorbed variable, accepts weights, and optionally takes {opt by()} +as an argument (in this case ancillary information is not saved by +default and must be accessed via {opt mata()}). Missing values in the +source and absorb variables are skipped row-wise (the latter can be +optionally retained via {opt absorbmissing}). + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Specify Targets} +{synopt:{opth pre:fix(str)}}Generate all variables with prefix (e.g. residualized {it:x} saved to {it:prefix_x}, etc). +{p_end} +{synopt:{opth gen:erate(newvarlist)}}List of targets; must specify one per source. +{p_end} +{synopt:{opt replace}}Replace variables as applicable. (If no targets are specified, this replaces the sources.) +{p_end} +{synopt:{opt wild:parse}}Allow rename-style syntax if {it:target}{cmd:=}{it:source} is specified (e.g. {it:x*}{cmd:=}{it:prefix_x*}). +{p_end} + +{syntab :HDFE Options} +{synopt:{opth by(varlist)}}Group by variables. +{p_end} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save {opt by()} info (and absorb info by group) in mata object (default name is {bf:GtoolsByLevels}) +{p_end} +{synopt:{opt absorbmi:ssing}}Treat missing absorb levels as a group instead of dropping them. 
+{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL {cmd:by()} variables to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary {cmd:by()} variables check and force gtools to read strL {cmd:by()} variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method for {cmd:by()} variables (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats hdfe} (alias {opt gstats residualize}) is designed as a +utility to embed in programs that require absorbing high-dimensional +fixed effects, optionally taking in weights. The number of non-missing +observations and the number of levels in each absorb variable are +returned (see {it:{help gstats hdfe##results:stored results}}). + +{pstd} +Mainly as a side-effect of being a {cmd:gtools} program, {opt by()} is +also allowed. 
In this case, the fixed effects are absorbed separately +for each group defined by {opt by()}. Note in this case the number of +non-missing observations and the number of absorb levels vary by group. +This is {bf:NOT} saved by default. The user can optionally specify +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] to save information on the by levels, +including the number of non-missing rows per level and the number of +levels per absorb variable per level. + +{pstd} +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] by default is stored in +{opt GtoolsByLevels} but the user may specify any name desired. +Run {opt mata GtoolsByLevels.desc()} for details on the stored +objects (also see {it:{help gstats hdfe##results:stored results}} below). + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_hdfe/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gstats hdfe} stores the following in {cmd:r()}: + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Macros}{p_end} +{synopt:{cmd:r(algorithm)}} algorithm used for HDFE absorption{p_end} +{p2colreset}{...} + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of {opt by()} groups {p_end} +{synopt:{cmd:r(minJ) }} smallest {opt by()} group size {p_end} +{synopt:{cmd:r(maxJ) }} largest {opt by()} group size {p_end} +{synopt:{cmd:r(iter) }} (without {opt by()}) iterations of absorption algorithm {p_end} +{synopt:{cmd:r(feval)}} (without {opt by()}) function evaluations in absorption algorithm {p_end} +{p2colreset}{...} + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(nabsorb)}} (without {opt by()}) vector with number of levels in each absorb variable{p_end} +{p2colreset}{...} + +{pstd} +With {opt mata:save}[{cmd:(}{it:str}{cmd:)}], the following data is +stored in 
the mata object: + + string matrix nj + non-missing observations in each -by- group + + string matrix njabsorb + number of absorbed levels in each -by- group by each absorb variable + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + real rowvector charpos + position of kth character variable + + string matrix printed + formatted (printf-ed) variable levels (not with option -silent-) + +{marker missing}{...} +{title:Missing Features} + +{pstd} +Check whether it's mathematically OK to apply SQUAREM. In general it's meant +for contractions but my understanding is that it can be applied to any +monotonically convergent algorithm. + +{pstd} +Improve convergence criterion. Current criterion may not be sensible. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}, and this specific +function was inspired by Sergio Correia's {it:reghdfe}: +{browse "https://github.com/sergiocorreia/reghdfe"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_hdfe/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/gstats_moving.sthlp b/01.code/ado/g/gstats_moving.sthlp new file mode 100755 index 0000000..e656aaf --- /dev/null +++ b/01.code/ado/g/gstats_moving.sthlp @@ -0,0 +1,292 @@ +{smcl} +{* *! 
version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_transform "dialog gstats_transform"}{...} +{vieweralsosee "[R] gstats_transform" "mansection R gstats_transform"}{...} +{viewerjumpto "Syntax" "gstats_transform##syntax"}{...} +{viewerjumpto "Description" "gstats_transform##description"}{...} +{viewerjumpto "Statistics" "gstats_transform##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats transform} {hline 2}} Apply statistical functions by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats transform} +{it:clist} +{ifin} +[{it:{help gstats transform##weight:weight}}] +[{cmd:,} +{it:{help gstats transform##table_options:options}}] + +{pstd}where {it:clist} is either + +{p 8 17 2} +[{opt (stat)}] +{varlist} +[ [{opt (stat)}] {it:...} ]{p_end} + +{p 8 17 2} +[{opt (stat)}] {it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + [ [{opt (stat)}] {it:...}] + +{p 4 4 2}or any combination of the {it:varlist} or {it:target_var} forms, and +{it:stat} is one of{p_end} + +{p2colset 9 28 30 2}{...} +{p2col :{opt demean}}subtract the mean (default){p_end} +{p2col :{opt demedian}}subtract the median{p_end} +{p2col :{opt normalize}}(x - mean) / sd{p_end} +{p2col :{opt standardize}}same as {opt normalize}{p_end} +{p2col :{opt moving stat [# #]}}moving statistic {it:stat}; # specify the relative bounds ({help gstats transform##moving_format:see below}){p_end} +{p2col :{opt range stat [...]}}range statistic {it:stat} for observations within specified interval ({help gstats transform##interval_format:see below}){p_end} +{p2col :{opt cumsum [+/- [varname]]}}cumulative sum, optionally ascending (+) or descending (-) (optionally +/- by varname){p_end} +{p2col :{opt shift [[+/-]#]}}lags (-#) and leads (+#); unsigned numbers are positive (i.e. 
leads){p_end} +{p2col :{opt rank}}rank observations; use option {opt ties()} to specify how ties are handled{p_end} +{p2colreset}{...} + +{p 4 4 2} Some of the above transformations allow specifying various +options as part of their name. This is done to allow the user to request +various versions of the same transformation. However, this is not +required. The user can specify a global option that will be used for +all the corresponding transformations: + +{p2colset 9 28 30 2}{...} +{p2col :{opt moving stat}}{opt window()}{p_end} +{p2col :{opt range stat}}{opt interval()}{p_end} +{p2col :{opt cumsum}}{opt cumby()}{p_end} +{p2col :{opt shift}}{opt shiftby()}{p_end} +{p2colreset}{...} + +{p 4 4 2} Note {cmd:gstats moving} and {cmd:gstats range} are aliases +for {cmd:gstats transform}. In this case all the requested statistics +are assumed to be moving or range statistics, respectively. Finally, +{cmd:moving} and {bf:range} may be combined with any one of the +following:{p_end} + +{p2colset 9 22 24 2}{...} +{p2col :{opt mean}}means (default){p_end} +{p2col :{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{p2col :{opt count}}number of nonmissing observations{p_end} +{p2col :{opt nmissing}}number of missing observations{p_end} +{p2col :{opt sum}}sums{p_end} +{p2col :{opt rawsum}}sums, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{p2col :{opt nansum}}sum; returns . instead of 0 if all entries are missing{p_end} +{p2col :{opt rawnansum}}rawsum; returns . 
instead of 0 if all entries are missing{p_end} +{p2col :{opt median}}medians (same as {opt p50}){p_end} +{p2col :{opt p#.#}}arbitrary quantiles{p_end} +{p2col :{opt p1}}1st percentile{p_end} +{p2col :{opt p2}}2nd percentile{p_end} +{p2col :{it:...}}3rd{hline 1}49th percentiles{p_end} +{p2col :{opt p50}}50th percentile (same as {cmd:median}){p_end} +{p2col :{it:...}}51st{hline 1}97th percentiles{p_end} +{p2col :{opt p98}}98th percentile{p_end} +{p2col :{opt p99}}99th percentile{p_end} +{p2col :{opt iqr}}interquartile range{p_end} +{p2col :{opt sd}}standard deviation{p_end} +{p2col :{opt var:iance}}variance{p_end} +{p2col :{opt cv}}coefficient of variation ({cmd:sd/mean}){p_end} +{p2col :{opt select#}}#th smallest{p_end} +{p2col :{opt select-#}}#th largest{p_end} +{p2col :{opt rawselect#}}#th smallest, ignoring weights{p_end} +{p2col :{opt rawselect-#}}#th largest, ignoring weights{p_end} +{p2col :{opt max}}maximums{p_end} +{p2col :{opt min}}minimums{p_end} +{p2col :{opt range}}range = {opt max} - {opt min}{p_end} +{p2col :{opt first}}first value{p_end} +{p2col :{opt last}}last value{p_end} +{p2col :{opt firstnm}}first nonmissing value{p_end} +{p2col :{opt lastnm}}last nonmissing value{p_end} +{p2col :{opt sem:ean}}standard error of the mean ({cmd:sd/sqrt(n)}){p_end} +{p2col :{opt seb:inomial}}standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}) (missing if source not 0, 1){p_end} +{p2col :{opt sep:oisson}}standard error of the mean, Poisson ({cmd:sqrt(mean / n)}) (result rounded to nearest integer){p_end} +{p2col :{opt skewness}}Skewness{p_end} +{p2col :{opt kurtosis}}Kurtosis{p_end} +{p2col :{opt gini}}Gini coefficient (negative truncated to 0){p_end} +{p2col :{opt gini dropneg}}Gini coefficient (negative values dropped){p_end} +{p2col :{opt gini keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{p2colreset}{...} + +{marker interval_format}{...} +{dlgtab:Interval format} + 
+{pstd} +{cmd:range stat} must specify an interval or use the {opt interval(...)} +option. The interval must be of the form + +{p 8 17 2} +{bf:#}[{it:statlow}] {bf:#}[{it:stathigh}] [{it:var}] + +{pstd} +This computes, for each observation {it:i}, the summary statistic {it:stat} +among all observations {it:j} of the source variable such that + +{p 8 17 2} +var[i] + # * statlow(var) <= var[j] <= var[i] + # * stathigh(var) + +{pstd} +If {it:var} is not specified, it is taken to be the source variable itself. +{it:statlow} and {it:stathigh} are summary statistics computed based on +{it:every} value of {it:var}. If they are not specified, then {bf:#} is used by +itself to construct the bounds, but {bf:#} may be missing ({bf:.}) to mean no +upper or lower bound. For example, given some variable {it:x} with {it:N} observations, +we have{p_end} + + Input -> Meaning + {hline 55} + -2 2 time -> j: time[i] - 2 <= time[j] <= time[i] + 2 + i.e. {it:stat} within a 2-period time window + + -sd sd -> j: x[i] - sd(x) <= x[j] <= x[i] + sd(x) + i.e. {it:stat} for obs within a standard dev + +{marker moving_format}{...} +{dlgtab:Moving window format} + +{pstd}{bf:moving stat} must specify a relative range or use the {opt window(# #)} +option. The relative range uses a window defined by the {it:observations}. This +would be equivalent to computing time series rolling window statistics +using the time variable set to {it:_n}. For example, given some variable +{it:x} with {it:N} observations, we have{p_end} + + Input -> Range + {hline 31} + -3 3 -> x[i - 3] to x[i + 3] + -3 . -> x[i - 3] to x[N] + . 3 -> x[1] to x[i + 3] + -3 -1 -> x[i - 3] to x[i - 1] + -3 0 -> x[i - 3] to x[i] + 5 10 -> x[i + 5] to x[i + 10] + +{pstd}and so on. If the observation is outside of the admissible range +(e.g. {it:-10 10} but {it:i = 5}) the output is set to missing. If you +don't specify a range in ({it:moving stat}) then the range in {opt window(# #)} +is used. 
+ +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Common Options} +{synopt:{opth by(varlist)}}Group statistics by variable. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} +{synopt :{opt wild:parse}}Allow rename-style syntax in target naming. +{p_end} +{synopt:{opt labelf:ormat}}Custom label engine: {bf:(#stat#) #sourcelabel#} is the default. +{p_end} +{synopt:{opth labelp:rogram(str)}}Program to parse {opt labelformat} (see examples). +{p_end} +{synopt :{opth auto:rename}[{cmd:(}{str}{cmd:)}]}Automatically name targets based on requested stats. Default is {it:#source#_#stat#}. +{p_end} +{synopt:{opt nogreedy}}Use slower but memory-efficient (non-greedy) algorithm. +{p_end} +{synopt:{opth type:s(str)}}Override variable types for targets ({bf:use with caution}). +{p_end} + +{syntab :Command Options} +{synopt:{opt window(lower upper)}}With {it:moving stat}. Relative observation range for moving statistics (if not specified in call). E.g. {opt window(-3 1)} means from 3 lags to 1 lead. {opt window(. #)} and {opt window(# .)} mean from the start and through the end. +{p_end} +{synopt:{opt interval(#[stat] #[stat] [var])}}With {it:range stat}. Interval for range statistics that don't specify their own interval. +{p_end} +{synopt:{opt cumby([+/- [varname]])}}With {it:cumsum}. Sort options for cumsum variables that don't specify their own. +{p_end} +{synopt:{opt shiftby([+/-]#)}}With {it:shift}. Lag or lead to use when {bf:shift} is requested without specifying a number. +{p_end} +{synopt:{opt ties(str)}}With {it:rank}. How to break ties for {opt rank}. {opt d:efault} assigns the average rank; {opt u:nique} breaks ties arbitrarily; {opt stableunique} breaks ties using the order values appear in the data; {opt f:ield} counts the number of values greater than; {opt t:rack} counts the number of values less than. 
+{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s +are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gstats transform} applies various statistical transformations +to input data. It is similar to {cmd:gcollapse, merge} or {cmd:gegen} but +for individual-level transformations. That is, {cmd:gcollapse} takes an +input variable and produces a single statistic; {cmd:gstats transform} +applies a function to each element of the input variable. For example, +subtracting the mean. + +{pstd} +Every function available to {cmd:gstats transform} can be called via +{cmd:gegen}. Further, note that while not every function will use weights +in their computations (e.g. {it:shift} ignores weights in the actual +transformation), if weights are specified they will be used to flag +acceptable observations (i.e. missing, zero, and, except for {opt iweights}, +negative observations get excluded). + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_transform/index.html#examples":online documentation} +for examples. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help gegen}; +{help gcollapse}; +{help gtools} diff --git a/01.code/ado/g/gstats_range.sthlp b/01.code/ado/g/gstats_range.sthlp new file mode 100755 index 0000000..e656aaf --- /dev/null +++ b/01.code/ado/g/gstats_range.sthlp @@ -0,0 +1,292 @@ +{smcl} +{* *! version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_transform "dialog gstats_transform"}{...} +{vieweralsosee "[R] gstats_transform" "mansection R gstats_transform"}{...} +{viewerjumpto "Syntax" "gstats_transform##syntax"}{...} +{viewerjumpto "Description" "gstats_transform##description"}{...} +{viewerjumpto "Statistics" "gstats_transform##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats transform} {hline 2}} Apply statistical functions by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. 
+ +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats transform} +{it:clist} +{ifin} +[{it:{help gstats transform##weight:weight}}] +[{cmd:,} +{it:{help gstats transform##table_options:options}}] + +{pstd}where {it:clist} is either + +{p 8 17 2} +[{opt (stat)}] +{varlist} +[ [{opt (stat)}] {it:...} ]{p_end} + +{p 8 17 2} +[{opt (stat)}] {it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + [ [{opt (stat)}] {it:...}] + +{p 4 4 2}or any combination of the {it:varlist} or {it:target_var} forms, and +{it:stat} is one of{p_end} + +{p2colset 9 28 30 2}{...} +{p2col :{opt demean}}subtract the mean (default){p_end} +{p2col :{opt demedian}}subtract the median{p_end} +{p2col :{opt normalize}}(x - mean) / sd{p_end} +{p2col :{opt standardize}}same as {opt normalize}{p_end} +{p2col :{opt moving stat [# #]}}moving statistic {it:stat}; # specify the relative bounds ({help gstats transform##moving_format:see below}){p_end} +{p2col :{opt range stat [...]}}range statistic {it:stat} for observations within specified interval ({help gstats transform##interval_format:see below}){p_end} +{p2col :{opt cumsum [+/- [varname]]}}cumulative sum, optionally ascending (+) or descending (-) (optionally +/- by varname){p_end} +{p2col :{opt shift [[+/-]#]}}lags (-#) and leads (+#); unsigned numbers are positive (i.e. leads){p_end} +{p2col :{opt rank}}rank observations; use option {opt ties()} to specify how ties are handled{p_end} +{p2colreset}{...} + +{p 4 4 2} Some of the above transformations allow specifying various +options as part of their name. This is done to allow the user to request +various versions of the same transformation. However, this is not +required. 
The user can specify a global option that will be used for +all the corresponding transformations: + +{p2colset 9 28 30 2}{...} +{p2col :{opt moving stat}}{opt window()}{p_end} +{p2col :{opt range stat}}{opt interval()}{p_end} +{p2col :{opt cumsum}}{opt cumby()}{p_end} +{p2col :{opt shift}}{opt shiftby()}{p_end} +{p2colreset}{...} + +{p 4 4 2} Note {cmd:gstats moving} and {cmd:gstats range} are aliases +for {cmd:gstats transform}. In this case all the requested statistics +are assumed to be moving or range statistics, respectively. Finally, +{cmd:moving} and {bf:range} may be combined with any one of the +following:{p_end} + +{p2colset 9 22 24 2}{...} +{p2col :{opt mean}}means (default){p_end} +{p2col :{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{p2col :{opt count}}number of nonmissing observations{p_end} +{p2col :{opt nmissing}}number of missing observations{p_end} +{p2col :{opt sum}}sums{p_end} +{p2col :{opt rawsum}}sums, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{p2col :{opt nansum}}sum; returns . instead of 0 if all entries are missing{p_end} +{p2col :{opt rawnansum}}rawsum; returns . 
instead of 0 if all entries are missing{p_end} +{p2col :{opt median}}medians (same as {opt p50}){p_end} +{p2col :{opt p#.#}}arbitrary quantiles{p_end} +{p2col :{opt p1}}1st percentile{p_end} +{p2col :{opt p2}}2nd percentile{p_end} +{p2col :{it:...}}3rd{hline 1}49th percentiles{p_end} +{p2col :{opt p50}}50th percentile (same as {cmd:median}){p_end} +{p2col :{it:...}}51st{hline 1}97th percentiles{p_end} +{p2col :{opt p98}}98th percentile{p_end} +{p2col :{opt p99}}99th percentile{p_end} +{p2col :{opt iqr}}interquartile range{p_end} +{p2col :{opt sd}}standard deviation{p_end} +{p2col :{opt var:iance}}variance{p_end} +{p2col :{opt cv}}coefficient of variation ({cmd:sd/mean}){p_end} +{p2col :{opt select#}}#th smallest{p_end} +{p2col :{opt select-#}}#th largest{p_end} +{p2col :{opt rawselect#}}#th smallest, ignoring weights{p_end} +{p2col :{opt rawselect-#}}#th largest, ignoring weights{p_end} +{p2col :{opt max}}maximums{p_end} +{p2col :{opt min}}minimums{p_end} +{p2col :{opt range}}range = {opt max} - {opt min}{p_end} +{p2col :{opt first}}first value{p_end} +{p2col :{opt last}}last value{p_end} +{p2col :{opt firstnm}}first nonmissing value{p_end} +{p2col :{opt lastnm}}last nonmissing value{p_end} +{p2col :{opt sem:ean}}standard error of the mean ({cmd:sd/sqrt(n)}){p_end} +{p2col :{opt seb:inomial}}standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}) (missing if source not 0, 1){p_end} +{p2col :{opt sep:oisson}}standard error of the mean, Poisson ({cmd:sqrt(mean / n)}) (result rounded to nearest integer){p_end} +{p2col :{opt skewness}}Skewness{p_end} +{p2col :{opt kurtosis}}Kurtosis{p_end} +{p2col :{opt gini}}Gini coefficient (negative truncated to 0){p_end} +{p2col :{opt gini dropneg}}Gini coefficient (negative values dropped){p_end} +{p2col :{opt gini keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{p2colreset}{...} + +{marker interval_format}{...} +{dlgtab:Interval format} + 
+{pstd} +{cmd:range stat} must specify an interval or use the {opt interval(...)} +option. The interval must be of the form + +{p 8 17 2} +{bf:#}[{it:statlow}] {bf:#}[{it:stathigh}] [{it:var}] + +{pstd} +This computes, for each observation {it:i}, the summary statistic {it:stat} +among all observations {it:j} of the source variable such that + +{p 8 17 2} +var[i] + # * statlow(var) <= var[j] <= var[i] + # * stathigh(var) + +{pstd} +If {it:var} is not specified, it is taken to be the source variable itself. +{it:statlow} and {it:stathigh} are summary statistics computed based on +{it:every} value of {it:var}. If they are not specified, then {bf:#} is used by +itself to construct the bounds, but {bf:#} may be missing ({bf:.}) to mean no +upper or lower bound. For example, given some variable {it:x} with {it:N} observations, +we have{p_end} + + Input -> Meaning + {hline 55} + -2 2 time -> j: time[i] - 2 <= time[j] <= time[i] + 2 + i.e. {it:stat} within a 2-period time window + + -sd sd -> j: x[i] - sd(x) <= x[j] <= x[i] + sd(x) + i.e. {it:stat} for obs within a standard dev + +{marker moving_format}{...} +{dlgtab:Moving window format} + +{pstd}{bf:moving stat} must specify a relative range or use the {opt window(# #)} +option. The relative range uses a window defined by the {it:observations}. This +would be equivalent to computing time series rolling window statistics +using the time variable set to {it:_n}. For example, given some variable +{it:x} with {it:N} observations, we have{p_end} + + Input -> Range + {hline 31} + -3 3 -> x[i - 3] to x[i + 3] + -3 . -> x[i - 3] to x[N] + . 3 -> x[1] to x[i + 3] + -3 -1 -> x[i - 3] to x[i - 1] + -3 0 -> x[i - 3] to x[i] + 5 10 -> x[i + 5] to x[i + 10] + +{pstd}and so on. If the observation is outside of the admissible range +(e.g. {it:-10 10} but {it:i = 5}) the output is set to missing. If you +don't specify a range in ({it:moving stat}) then the range in {opt window(# #)} +is used. 
+ +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Common Options} +{synopt:{opth by(varlist)}}Group statistics by variable. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} +{synopt :{opt wild:parse}}Allow rename-style syntax in target naming. +{p_end} +{synopt:{opt labelf:ormat}}Custom label engine: {bf:(#stat#) #sourcelabel#} is the default. +{p_end} +{synopt:{opth labelp:rogram(str)}}Program to parse {opt labelformat} (see examples). +{p_end} +{synopt :{opth auto:rename}[{cmd:(}{str}{cmd:)}]}Automatically name targets based on requested stats. Default is {it:#source#_#stat#}. +{p_end} +{synopt:{opt nogreedy}}Use slower but memory-efficient (non-greedy) algorithm. +{p_end} +{synopt:{opth type:s(str)}}Override variable types for targets ({bf:use with caution}). +{p_end} + +{syntab :Command Options} +{synopt:{opt window(lower upper)}}With {it:moving stat}. Relative observation range for moving statistics (if not specified in call). E.g. {opt window(-3 1)} means from 3 lags to 1 lead. {opt window(. #)} and {opt window(# .)} mean from the start and through the end. +{p_end} +{synopt:{opt interval(#[stat] #[stat] [var])}}With {it:range stat}. Interval for range statistics that don't specify their own interval. +{p_end} +{synopt:{opt cumby([+/- [varname]])}}With {it:cumsum}. Sort options for cumsum variables that don't specify their own. +{p_end} +{synopt:{opt shiftby([+/-]#)}}With {it:shift}. Lag or lead to use when {bf:shift} is requested without specifying a number. +{p_end} +{synopt:{opt ties(str)}}With {it:rank}. How to break ties for {opt rank}. {opt d:efault} assigns the average rank; {opt u:nique} breaks ties arbitrarily; {opt stableunique} breaks ties using the order values appear in the data; {opt f:ield} counts the number of values greater than; {opt t:rack} counts the number of values less than. 
+{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s +are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gstats transform} applies various statistical transformations +to input data. It is similar to {cmd:gcollapse, merge} or {cmd:gegen} but +for individual-level transformations. That is, {cmd:gcollapse} takes an +input variable and produces a single statistic; {cmd:gstats transform} +applies a function to each element of the input variable. For example, +subtracting the mean. + +{pstd} +Every function available to {cmd:gstats transform} can be called via +{cmd:gegen}. Further, note that while not every function will use weights +in their computations (e.g. {it:shift} ignores weights in the actual +transformation), if weights are specified they will be used to flag +acceptable observations (i.e. missing, zero, and, except for {opt iweights}, +negative observations get excluded). + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_transform/index.html#examples":online documentation} +for examples. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help gegen}; +{help gcollapse}; +{help gtools} diff --git a/01.code/ado/g/gstats_residualize.sthlp b/01.code/ado/g/gstats_residualize.sthlp new file mode 100755 index 0000000..f45c29a --- /dev/null +++ b/01.code/ado/g/gstats_residualize.sthlp @@ -0,0 +1,270 @@ +{smcl} +{* *! version 0.1.0 14Mar2022}{...} +{viewerdialog gstats_hdfe "dialog gstats_hdfe"}{...} +{vieweralsosee "[R] gstats_hdfe" "mansection R gstats_hdfe"}{...} +{viewerjumpto "Syntax" "gstats_hdfe##syntax"}{...} +{viewerjumpto "Description" "gstats_hdfe##description"}{...} +{viewerjumpto "Statistics" "gstats_hdfe##statistics"}{...} +{title:Title} + +{p2colset 5 20 23 2}{...} +{p2col :{cmd:gstats hdfe} {hline 2}} Absorb HDFE (residualize variables) {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{pstd} +{it:Warning}: {opt gstats hdfe} is in beta; see {help gstats hdfe##missing:missing features}. +(To enable beta, define {cmd:global GTOOLS_BETA = 1}.) 
+ +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats hdfe} +{varlist} +{ifin} +[{it:{help gstats hdfe##weight:weight}}] +[ +{cmd:,} {opth absorb(varlist)} +{c -(}{opth gen(newvarlist)}{c |}{opt prefix(str)}{c |}{cmd:replace}{c )-} +{it:{help gstats hdfe##table_options:options}} +] + +{pstd} If none of {cmd:gen()}, {cmd:prefix()}, or {cmd:replace} are +specified then {it:target}{cmd:=}{it:source} syntax must be supplied +instead of {varlist}: + +{p 8 17 2} +{it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + +{pstd} +{cmd:gstats hdfe} (alias {cmd:gstats residualize}) provides a fast way of +absorbing high-dimensional fixed effects (HDFE). It saves the number of levels +in each absorbed variable, accepts weights, and optionally takes {opt by()} +as an argument (in this case ancillary information is not saved by +default and must be accessed via {opt mata()}). Missing values in the +source and absorb variables are skipped row-wise (the latter can be +optionally retained via {opt absorbmissing}). + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Specify Targets} +{synopt:{opth pre:fix(str)}}Generate all variables with prefix (e.g. residualized {it:x} saved to {it:prefix_x}, etc). +{p_end} +{synopt:{opth gen:erate(newvarlist)}}List of targets; must specify one per source. +{p_end} +{synopt:{opt replace}}Replace variables as applicable. (If no targets are specified, this replaces the sources.) +{p_end} +{synopt:{opt wild:parse}}Allow rename-style syntax if {it:target}{cmd:=}{it:source} is specified (e.g. {it:x*}{cmd:=}{it:prefix_x*}). +{p_end} + +{syntab :HDFE Options} +{synopt:{opth by(varlist)}}Group by variables. +{p_end} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save {opt by()} info (and absorb info by group) in mata object (default name is {bf:GtoolsByLevels}) +{p_end} +{synopt:{opt absorbmi:ssing}}Treat missing absorb levels as a group instead of dropping them. 
+{p_end} +{synopt:{opth algorithm(str)}}Algorithm used to absorb HDFE: CG (conjugate gradient), MAP (alternating projections), SQUAREM (squared extrapolation), IT (Irons and Tuck). +{p_end} +{synopt:{opth maxiter(int)}}Maximum number of algorithm iterations (default 100,000). Pass {it:.} for unlimited iterations. +{p_end} +{synopt:{opth tol:erance(real)}}Convergence tolerance (default 1e-8). +{p_end} +{synopt:{opth trace:iter}}Trace algorithm iterations. +{p_end} +{synopt:{opth stan:dardize}}Standardize variables before algorithm. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL {cmd:by()} variables to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary {cmd:by()} variables check and force gtools to read strL {cmd:by()} variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method for {cmd:by()} variables (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats hdfe} (alias {opt gstats residualize}) is designed as a +utility to embed in programs that require absorbing high-dimensional +fixed effects, optionally taking in weights. The number of non-missing +observations and the number of levels in each absorb variable are +returned (see {it:{help gstats hdfe##results:stored results}}). + +{pstd} +Mainly as a side-effect of being a {cmd:gtools} program, {opt by()} is +also allowed. 
In this case, the fixed effects are absorbed separately +for each group defined by {opt by()}. Note in this case the number of +non-missing observations and the number of absorb levels vary by group. +This is {bf:NOT} saved by default. The user can optionally specify +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] to save information on the by levels, +including the number of non-missing rows per level and the number of +levels per absorb variable per level. + +{pstd} +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] by default is stored in +{opt GtoolsByLevels} but the user may specify any name desired. +Run {opt mata GtoolsByLevels.desc()} for details on the stored +objects (also see {it:{help gstats hdfe##results:stored results}} below). + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_hdfe/index.html#examples":online documentation} +for examples. + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gstats hdfe} stores the following in {cmd:r()}: + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Macros}{p_end} +{synopt:{cmd:r(algorithm)}} algorithm used for HDFE absorption{p_end} +{p2colreset}{...} + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of {opt by()} groups {p_end} +{synopt:{cmd:r(minJ) }} smallest {opt by()} group size {p_end} +{synopt:{cmd:r(maxJ) }} largest {opt by()} group size {p_end} +{synopt:{cmd:r(iter) }} (without {opt by()}) iterations of absorption algorithm {p_end} +{synopt:{cmd:r(feval)}} (without {opt by()}) function evaluations in absorption algorithm {p_end} +{p2colreset}{...} + +{synoptset 15 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(nabsorb)}} (without {opt by()}) vector with number of levels in each absorb variable{p_end} +{p2colreset}{...} + +{pstd} +With {opt mata:save}[{cmd:(}{it:str}{cmd:)}], the following data is +stored in 
the mata object: + + string matrix nj + non-missing observations in each -by- group + + string matrix njabsorb + number of absorbed levels in each -by- group by each absorb variable + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + real rowvector charpos + position of kth character variable + + string matrix printed + formatted (printf-ed) variable levels (not with option -silent-) + +{marker missing}{...} +{title:Missing Features} + +{pstd} +Check whether it's mathematically OK to apply SQUAREM. In general it's meant +for contractions but my understanding is that it can be applied to any +monotonically convergent algorithm. + +{pstd} +Improve convergence criterion. Current criterion may not be sensible. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}, and this specific +function was inspired by Sergio Correia's {it:reghdfe}: +{browse "https://github.com/sergiocorreia/reghdfe"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{marker references}{...} +{title:References} + +{pstd} +See +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_hdfe/index.html#references":online documentation} +for the list of references. + +{title:Also see} + +{pstd} +help for +{help gtools} diff --git a/01.code/ado/g/gstats_sum.sthlp b/01.code/ado/g/gstats_sum.sthlp new file mode 100755 index 0000000..c67d75a --- /dev/null +++ b/01.code/ado/g/gstats_sum.sthlp @@ -0,0 +1,368 @@ +{smcl} +{* *!
version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_summarize "dialog gstats_summarize"}{...} +{vieweralsosee "[R] gstats_summarize" "mansection R gstats_summarize"}{...} +{viewerjumpto "Syntax" "gstats_summarize##syntax"}{...} +{viewerjumpto "Description" "gstats_summarize##description"}{...} +{viewerjumpto "Statistics" "gstats_summarize##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats summarize} {hline 2}} Summary statistics by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats {ul:sum}marize} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{p 8 17 2} +{cmd:gstats {ul:tab}stat} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{pstd} +{cmd:gstats {ul:tab}stat} and {cmd:gstats {ul:sum}marize} are fast, by-able +alternatives to {opt tabstat} and {opt summarize, detail}. +If {cmd:gstats summarize} is called with {opt by()} or {opt tab}, a table +in the style of {opt tabstat} is produced that includes all the summary +statistics included by default in {opt summarize, detail}. + +{pstd} +Note the {it:prefixes} {cmd:by}, {cmd:rolling}, {cmd:statsby} are +{cmd:{it:not}} supported. To compute a table of statistics by a group +use the option {opt by()}. With {opt by()}, {opt gstats tab} is also +faster than {cmd:gcollapse}. + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Tabstat Options} +{synopt:{opth by(varlist)}}Group statistics by variable. 
+{p_end} +{synopt:{cmdab:s:tatistics:(}{it:{help gstats_summarize##statname:stat}} [{it:...}]{cmd:)}}Report +specified statistics; default for {opt tabstat} is count, sum, mean, sd, min, max. +{p_end} +{synopt:{opt col:umns(stat|var)}}Columns are statistics (default) or variables. +{p_end} +{synopt:{opt pretty:stats}}Pretty statistic header names +{p_end} +{synopt:{opth labelw:idth(int)}}Max by variable label/value width. +{p_end} +{synopt:{opt f:ormat}[{cmd:(%}{it:{help format:fmt}}{cmd:)}]} +Use format to display summary stats; default %9.0g +{p_end} + +{syntab :Summarize Options} +{synopt:{opt nod:etail}}Do not display the full set of statistics. +{p_end} +{synopt:{opt mean:only}}Calculate only the count, sum, mean, min, max. +{p_end} +{synopt:{opth by(varlist)}}Group by variable; all stats are computed but output is in the style of tabstat. +{p_end} +{synopt:{opt sep:arator(#)}}Draw separator line after every {it:#} variables; default is {cmd:separator(5)}. +{p_end} +{synopt:{opt tab:stat}}Compute and display statistics in the style of {opt tabstat}. +{p_end} + +{syntab :Common Options} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GstatsOutput}) +{p_end} +{synopt:{opt pool:ed}}Pool varlist +{p_end} +{synopt:{opt noprint}}Do not print +{p_end} +{synopt:{opt f:ormat}}Use variable's display format. +{p_end} +{synopt:{opt nomiss:ing}}With {opt by()}, ignore groups with missing entries. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. 
+{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats tab} and {opt gstats sum} are mainly designed to report +statistics by group. It does not modify the data in memory, +so it is a nice alternative to {opt gcollapse} when there are few +groups and you want to compute summary stats more quickly. + +{pstd} +{opt gstats sum} by default computes the statistics that are reported by +{opt sum, detail} and without {opt by()} it is anywhere from 5 to 40 +times faster. The lower end of the speed gains are for Stata/MP, but +{opt sum, detail} is very slow in versions of Stata that are not multi-threaded. +The behavior of plain {opt summarize} and {opt summarize, meanonly} +can be recovered via options {opt nodetail} and {opt meanonly}, but Stata +is not especially slow in this case. Hence they are mainly included for +use with {opt by()}, where {opt gstats sum} is again faster. + +{pstd} +{opt gstats tab} should be faster than {opt tabstat} even without +groups, but the speed gains are largest with even a modest number of +levels in {opt by()}. Furthermore, an arbitrary number of grouping +variables are allowed. Note that with a very large number of groups, +{opt tabstat}'s runtime seems to scale non-linearly, while {opt gstats tab} +will execute in a reasonable time. + +{pstd} +{opt gstats tab} does not store results in {opt r()}. Rather, the option {opt matasave} +is provided to store the full set of summary statistics and the by variable +levels in a mata class object called {opt GstatsOutput} (the name of the object +can be changed via {opt matasave(name)}). 
Run {opt mata GstatsOutput.desc()} +after {opt gstats tab, matasave} for details. The following helper functions are provided: + + string scalar getf(j, l, maxlbl) + get formatted (j, l) entry from by variables up to maxlbl characters + + real matrix getnum(j, l) + get (j, l) numeric entry from by variables + + string matrix getchar(j, l,| raw) + get (j, l) string entry from by variables; raw controls whether to null-pad entries + + real rowvector getOutputRow(j) + get jth output row + + real colvector getOutputCol(j) + get jth output column by position + + real matrix getOutputVar(var) + get jth output var by name + + real matrix getOutputGroup(j) + get jth output group + +{pstd} +The following data is stored in {opt GstatsOutput}: + + summary statistics + ------------------ + + real matrix output + matrix with output statistics; J x kstats x kvars + + real scalar colvar + 1: columns are variables, rows are statistics; 0: the converse + + real scalar ksources + number of variable sources (0 if pool is true) + + real scalar kstats + number of statistics + + real matrix tabstat + 1: used tabstat; 0: used summarize + + string rowvector statvars + variables summarized + + string rowvector statnames + statistics computed + + real rowvector scodes + internal code for summary statistics + + real scalar pool + pooled source variables + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + 
number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + printing options + ---------------- + + void printOutput() + print summary table + + real scalar maxlbl + max by variable label/value width + + real scalar pretty + print pretty statistic names + + real scalar usevfmt + use variable format for printing + + string scalar dfmt + fallback printing format + + real scalar maxl + maximum column length + + void readDefaults() + reset printing defaults + +{marker statistics}{...} +{title:Statistics} + +{phang} +{cmd:statistics(}{it:statname} [{it:...}]{cmd:)} + specifies the statistics to be displayed; the default with {opt tabstat} + is equivalent to specifying {cmd:statistics(mean)}. ({opt stats()} + is a synonym for {opt statistics()}.) Multiple statistics + may be specified and are separated by white space, such as + {cmd:statistics(mean sd)}. Available statistics are + +{marker statname}{...} +{synoptset 17}{...} +{synopt:{space 4}{it:statname}}Definition{p_end} +{space 4}{synoptline} +{synopt:{space 4}{opt me:an}} mean{p_end} +{synopt:{space 4}{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{synopt:{space 4}{opt co:unt}} count of nonmissing observations{p_end} +{synopt:{space 4}{opt n}} same as {cmd:count}{p_end} +{synopt:{space 4}{opt nmiss:ing}} number of missing observations{p_end} +{synopt:{space 4}{opt perc:ent}} percentage of nonmissing observations{p_end} +{synopt:{space 4}{opt nuniq:ue}} number of unique elements{p_end} +{synopt:{space 4}{opt su:m}} sum{p_end} +{synopt:{space 4}{opt rawsu:m}} sum, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{synopt:{space 4}{opt nansu:m}} sum; returns . 
instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt rawnansu:m}} rawsum; returns . instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt med:ian}} median (same as {opt p50}){p_end} +{synopt:{space 4}{opt p#.#}} arbitrary quantiles{p_end} +{synopt:{space 4}{opt p1}} 1st percentile{p_end} +{synopt:{space 4}{opt p2}} 2nd percentile{p_end} +{synopt:{space 4}{it:...}} 3rd-49th percentiles{p_end} +{synopt:{space 4}{opt p50}} 50th percentile (same as {opt median}){p_end} +{synopt:{space 4}{it:...}} 51st-97th percentiles{p_end} +{synopt:{space 4}{opt p98}} 98th percentile{p_end} +{synopt:{space 4}{opt p99}} 99th percentile{p_end} +{synopt:{space 4}{opt iqr}} interquartile range = {opt p75} - {opt p25}{p_end} +{synopt:{space 4}{opt q}} equivalent to specifying {cmd:p25 p50 p75}{p_end} +{synopt:{space 4}{opt sd}} standard deviation{p_end} +{synopt:{space 4}{opt v:ariance}} variance{p_end} +{synopt:{space 4}{opt cv}} coefficient of variation ({cmd:sd/mean}){p_end} +{synopt:{space 4}{opt select#}} #th smallest{p_end} +{synopt:{space 4}{opt select-#}} #th largest{p_end} +{synopt:{space 4}{opt mi:n}} minimum (same as {opt select1}){p_end} +{synopt:{space 4}{opt ma:x}} maximum (same as {opt select-1}){p_end} +{synopt:{space 4}{opt r:ange}} range = {opt max} - {opt min}{p_end} +{synopt:{space 4}{opt first}} first value{p_end} +{synopt:{space 4}{opt last}} last value{p_end} +{synopt:{space 4}{opt firstnm}} first nonmissing value{p_end} +{synopt:{space 4}{opt lastnm}} last nonmissing value{p_end} +{synopt:{space 4}{opt sem:ean}} standard error of mean ({cmd:sd/sqrt(n)}){p_end} +{synopt:{space 4}{opt seb:inomial}} standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}){p_end} +{synopt:{space 4}{opt sep:oisson}} standard error of the mean, Poisson ({cmd:sqrt(mean)}){p_end} +{synopt:{space 4}{opt sk:ewness}} skewness{p_end} +{synopt:{space 4}{opt k:urtosis}} kurtosis{p_end} +{synopt:{space 4}{opt gini}}Gini coefficient (negative truncated to 
0){p_end} +{synopt:{space 4}{opt gini|dropneg}}Gini coefficient (negative values dropped){p_end} +{synopt:{space 4}{opt gini|keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{space 4}{synoptline} +{p2colreset}{...} + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_summarize/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help summarize}; +{help tabstat}; +{help gtools} diff --git a/01.code/ado/g/gstats_summarize.sthlp b/01.code/ado/g/gstats_summarize.sthlp new file mode 100755 index 0000000..c67d75a --- /dev/null +++ b/01.code/ado/g/gstats_summarize.sthlp @@ -0,0 +1,368 @@ +{smcl} +{* *! 
version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_summarize "dialog gstats_summarize"}{...} +{vieweralsosee "[R] gstats_summarize" "mansection R gstats_summarize"}{...} +{viewerjumpto "Syntax" "gstats_summarize##syntax"}{...} +{viewerjumpto "Description" "gstats_summarize##description"}{...} +{viewerjumpto "Statistics" "gstats_summarize##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats summarize} {hline 2}} Summary statistics by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats {ul:sum}marize} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{p 8 17 2} +{cmd:gstats {ul:tab}stat} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{pstd} +{cmd:gstats {ul:tab}stat} and {cmd:gstats {ul:sum}marize} are fast, by-able +alternatives to {opt tabstat} and {opt summarize, detail}. +If {cmd:gstats summarize} is called with {opt by()} or {opt tab}, a table +in the style of {opt tabstat} is produced that includes all the summary +statistics included by default in {opt summarize, detail}. + +{pstd} +Note the {it:prefixes} {cmd:by}, {cmd:rolling}, {cmd:statsby} are +{cmd:{it:not}} supported. To compute a table of statistics by a group +use the option {opt by()}. With {opt by()}, {opt gstats tab} is also +faster than {cmd:gcollapse}. + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Tabstat Options} +{synopt:{opth by(varlist)}}Group statistics by variable. 
+{p_end} +{synopt:{cmdab:s:tatistics:(}{it:{help gstats_summarize##statname:stat}} [{it:...}]{cmd:)}}Report +specified statistics; default for {opt tabstat} is count, sum, mean, sd, min, max. +{p_end} +{synopt:{opt col:umns(stat|var)}}Columns are statistics (default) or variables. +{p_end} +{synopt:{opt pretty:stats}}Pretty statistic header names +{p_end} +{synopt:{opth labelw:idth(int)}}Max by variable label/value width. +{p_end} +{synopt:{opt f:ormat}[{cmd:(%}{it:{help format:fmt}}{cmd:)}]} +Use format to display summary stats; default %9.0g +{p_end} + +{syntab :Summarize Options} +{synopt:{opt nod:etail}}Do not display the full set of statistics. +{p_end} +{synopt:{opt mean:only}}Calculate only the count, sum, mean, min, max. +{p_end} +{synopt:{opth by(varlist)}}Group by variable; all stats are computed but output is in the style of tabstat. +{p_end} +{synopt:{opt sep:arator(#)}}Draw separator line after every {it:#} variables; default is {cmd:separator(5)}. +{p_end} +{synopt:{opt tab:stat}}Compute and display statistics in the style of {opt tabstat}. +{p_end} + +{syntab :Common Options} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GstatsOutput}) +{p_end} +{synopt:{opt pool:ed}}Pool varlist +{p_end} +{synopt:{opt noprint}}Do not print +{p_end} +{synopt:{opt f:ormat}}Use variable's display format. +{p_end} +{synopt:{opt nomiss:ing}}With {opt by()}, ignore groups with missing entries. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. 
+{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats tab} and {opt gstats sum} are mainly designed to report +statistics by group. It does not modify the data in memory, +so it is a nice alternative to {opt gcollapse} when there are few +groups and you want to compute summary stats more quickly. + +{pstd} +{opt gstats sum} by default computes the statistics that are reported by +{opt sum, detail} and without {opt by()} it is anywhere from 5 to 40 +times faster. The lower end of the speed gains are for Stata/MP, but +{opt sum, detail} is very slow in versions of Stata that are not multi-threaded. +The behavior of plain {opt summarize} and {opt summarize, meanonly} +can be recovered via options {opt nodetail} and {opt meanonly}, but Stata +is not especially slow in this case. Hence they are mainly included for +use with {opt by()}, where {opt gstats sum} is again faster. + +{pstd} +{opt gstats tab} should be faster than {opt tabstat} even without +groups, but the speed gains are largest with even a modest number of +levels in {opt by()}. Furthermore, an arbitrary number of grouping +variables are allowed. Note that with a very large number of groups, +{opt tabstat}'s runtime seems to scale non-linearly, while {opt gstats tab} +will execute in a reasonable time. + +{pstd} +{opt gstats tab} does not store results in {opt r()}. Rather, the option {opt matasave} +is provided to store the full set of summary statistics and the by variable +levels in a mata class object called {opt GstatsOutput} (the name of the object +can be changed via {opt matasave(name)}). 
Run {opt mata GstatsOutput.desc()} +after {opt gstats tab, matasave} for details. The following helper functions are provided: + + string scalar getf(j, l, maxlbl) + get formatted (j, l) entry from by variables up to maxlbl characters + + real matrix getnum(j, l) + get (j, l) numeric entry from by variables + + string matrix getchar(j, l,| raw) + get (j, l) string entry from by variables; raw controls whether to null-pad entries + + real rowvector getOutputRow(j) + get jth output row + + real colvector getOutputCol(j) + get jth output column by position + + real matrix getOutputVar(var) + get jth output var by name + + real matrix getOutputGroup(j) + get jth output group + +{pstd} +The following data is stored in {opt GstatsOutput}: + + summary statistics + ------------------ + + real matrix output + matrix with output statistics; J x kstats x kvars + + real scalar colvar + 1: columns are variables, rows are statistics; 0: the converse + + real scalar ksources + number of variable sources (0 if pool is true) + + real scalar kstats + number of statistics + + real matrix tabstat + 1: used tabstat; 0: used summarize + + string rowvector statvars + variables summarized + + string rowvector statnames + statistics computed + + real rowvector scodes + internal code for summary statistics + + real scalar pool + pooled source variables + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + 
number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + printing options + ---------------- + + void printOutput() + print summary table + + real scalar maxlbl + max by variable label/value width + + real scalar pretty + print pretty statistic names + + real scalar usevfmt + use variable format for printing + + string scalar dfmt + fallback printing format + + real scalar maxl + maximum column length + + void readDefaults() + reset printing defaults + +{marker statistics}{...} +{title:Statistics} + +{phang} +{cmd:statistics(}{it:statname} [{it:...}]{cmd:)} + specifies the statistics to be displayed; the default with {opt tabstat} + is equivalent to specifying {cmd:statistics(mean)}. ({opt stats()} + is a synonym for {opt statistics()}.) Multiple statistics + may be specified and are separated by white space, such as + {cmd:statistics(mean sd)}. Available statistics are + +{marker statname}{...} +{synoptset 17}{...} +{synopt:{space 4}{it:statname}}Definition{p_end} +{space 4}{synoptline} +{synopt:{space 4}{opt me:an}} mean{p_end} +{synopt:{space 4}{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{synopt:{space 4}{opt co:unt}} count of nonmissing observations{p_end} +{synopt:{space 4}{opt n}} same as {cmd:count}{p_end} +{synopt:{space 4}{opt nmiss:ing}} number of missing observations{p_end} +{synopt:{space 4}{opt perc:ent}} percentage of nonmissing observations{p_end} +{synopt:{space 4}{opt nuniq:ue}} number of unique elements{p_end} +{synopt:{space 4}{opt su:m}} sum{p_end} +{synopt:{space 4}{opt rawsu:m}} sum, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{synopt:{space 4}{opt nansu:m}} sum; returns . 
instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt rawnansu:m}} rawsum; returns . instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt med:ian}} median (same as {opt p50}){p_end} +{synopt:{space 4}{opt p#.#}} arbitrary quantiles{p_end} +{synopt:{space 4}{opt p1}} 1st percentile{p_end} +{synopt:{space 4}{opt p2}} 2nd percentile{p_end} +{synopt:{space 4}{it:...}} 3rd-49th percentiles{p_end} +{synopt:{space 4}{opt p50}} 50th percentile (same as {opt median}){p_end} +{synopt:{space 4}{it:...}} 51st-97th percentiles{p_end} +{synopt:{space 4}{opt p98}} 98th percentile{p_end} +{synopt:{space 4}{opt p99}} 99th percentile{p_end} +{synopt:{space 4}{opt iqr}} interquartile range = {opt p75} - {opt p25}{p_end} +{synopt:{space 4}{opt q}} equivalent to specifying {cmd:p25 p50 p75}{p_end} +{synopt:{space 4}{opt sd}} standard deviation{p_end} +{synopt:{space 4}{opt v:ariance}} variance{p_end} +{synopt:{space 4}{opt cv}} coefficient of variation ({cmd:sd/mean}){p_end} +{synopt:{space 4}{opt select#}} #th smallest{p_end} +{synopt:{space 4}{opt select-#}} #th largest{p_end} +{synopt:{space 4}{opt mi:n}} minimum (same as {opt select1}){p_end} +{synopt:{space 4}{opt ma:x}} maximum (same as {opt select-1}){p_end} +{synopt:{space 4}{opt r:ange}} range = {opt max} - {opt min}{p_end} +{synopt:{space 4}{opt first}} first value{p_end} +{synopt:{space 4}{opt last}} last value{p_end} +{synopt:{space 4}{opt firstnm}} first nonmissing value{p_end} +{synopt:{space 4}{opt lastnm}} last nonmissing value{p_end} +{synopt:{space 4}{opt sem:ean}} standard error of mean ({cmd:sd/sqrt(n)}){p_end} +{synopt:{space 4}{opt seb:inomial}} standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}){p_end} +{synopt:{space 4}{opt sep:oisson}} standard error of the mean, Poisson ({cmd:sqrt(mean)}){p_end} +{synopt:{space 4}{opt sk:ewness}} skewness{p_end} +{synopt:{space 4}{opt k:urtosis}} kurtosis{p_end} +{synopt:{space 4}{opt gini}}Gini coefficient (negative truncated to 
0){p_end} +{synopt:{space 4}{opt gini|dropneg}}Gini coefficient (negative values dropped){p_end} +{synopt:{space 4}{opt gini|keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{space 4}{synoptline} +{p2colreset}{...} + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_summarize/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help summarize}; +{help tabstat}; +{help gtools} diff --git a/01.code/ado/g/gstats_tab.sthlp b/01.code/ado/g/gstats_tab.sthlp new file mode 100755 index 0000000..c67d75a --- /dev/null +++ b/01.code/ado/g/gstats_tab.sthlp @@ -0,0 +1,368 @@ +{smcl} +{* *! 
version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_summarize "dialog gstats_summarize"}{...} +{vieweralsosee "[R] gstats_summarize" "mansection R gstats_summarize"}{...} +{viewerjumpto "Syntax" "gstats_summarize##syntax"}{...} +{viewerjumpto "Description" "gstats_summarize##description"}{...} +{viewerjumpto "Statistics" "gstats_summarize##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats summarize} {hline 2}} Summary statistics by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats {ul:sum}marize} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{p 8 17 2} +{cmd:gstats {ul:tab}stat} +{varlist} +{ifin} +[{it:{help gstats summarize##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats summarize##table_options:options}}] + +{pstd} +{cmd:gstats {ul:tab}stat} and {cmd:gstats {ul:sum}marize} are fast, by-able +alternatives to {opt tabstat} and {opt summarize, detail}. +If {cmd:gstats summarize} is called with {opt by()} or {opt tab}, a table +in the style of {opt tabstat} is produced that includes all the summary +statistics included by default in {opt summarize, detail}. + +{pstd} +Note the {it:prefixes} {cmd:by}, {cmd:rolling}, {cmd:statsby} are +{cmd:{it:not}} supported. To compute a table of statistics by a group +use the option {opt by()}. With {opt by()}, {opt gstats tab} is also +faster than {cmd:gcollapse}. + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Tabstat Options} +{synopt:{opth by(varlist)}}Group statistics by variable. 
+{p_end} +{synopt:{cmdab:s:tatistics:(}{it:{help gstats_summarize##statname:stat}} [{it:...}]{cmd:)}}Report +specified statistics; default for {opt tabstat} is count, sum, mean, sd, min, max. +{p_end} +{synopt:{opt col:umns(stat|var)}}Columns are statistics (default) or variables. +{p_end} +{synopt:{opt pretty:stats}}Pretty statistic header names +{p_end} +{synopt:{opth labelw:idth(int)}}Max by variable label/value width. +{p_end} +{synopt:{opt f:ormat}[{cmd:(%}{it:{help format:fmt}}{cmd:)}]} +Use format to display summary stats; default %9.0g +{p_end} + +{syntab :Summarize Options} +{synopt:{opt nod:etail}}Do not display the full set of statistics. +{p_end} +{synopt:{opt mean:only}}Calculate only the count, sum, mean, min, max. +{p_end} +{synopt:{opth by(varlist)}}Group by variable; all stats are computed but output is in the style of tabstat. +{p_end} +{synopt:{opt sep:arator(#)}}Draw separator line after every {it:#} variables; default is {cmd:separator(5)}. +{p_end} +{synopt:{opt tab:stat}}Compute and display statistics in the style of {opt tabstat}. +{p_end} + +{syntab :Common Options} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GstatsOutput}) +{p_end} +{synopt:{opt pool:ed}}Pool varlist +{p_end} +{synopt:{opt noprint}}Do not print +{p_end} +{synopt:{opt f:ormat}}Use variable's display format. +{p_end} +{synopt:{opt nomiss:ing}}With {opt by()}, ignore groups with missing entries. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. 
+{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). + +{marker description}{...} +{title:Description} + +{pstd} +{opt gstats tab} and {opt gstats sum} are mainly designed to report +statistics by group. It does not modify the data in memory, +so it is a nice alternative to {opt gcollapse} when there are few +groups and you want to compute summary stats more quickly. + +{pstd} +{opt gstats sum} by default computes the statistics that are reported by +{opt sum, detail} and without {opt by()} it is anywhere from 5 to 40 +times faster. The lower end of the speed gains are for Stata/MP, but +{opt sum, detail} is very slow in versions of Stata that are not multi-threaded. +The behavior of plain {opt summarize} and {opt summarize, meanonly} +can be recovered via options {opt nodetail} and {opt meanonly}, but Stata +is not especially slow in this case. Hence they are mainly included for +use with {opt by()}, where {opt gstats sum} is again faster. + +{pstd} +{opt gstats tab} should be faster than {opt tabstat} even without +groups, but the speed gains are largest with even a modest number of +levels in {opt by()}. Furthermore, an arbitrary number of grouping +variables are allowed. Note that with a very large number of groups, +{opt tabstat}'s runtime seems to scale non-linearly, while {opt gstats tab} +will execute in a reasonable time. + +{pstd} +{opt gstats tab} does not store results in {opt r()}. Rather, the option {opt matasave} +is provided to store the full set of summary statistics and the by variable +levels in a mata class object called {opt GstatsOutput} (the name of the object +can be changed via {opt matasave(name)}). 
Run {opt mata GstatsOutput.desc()} +after {opt gstats tab, matasave} for details. The following helper functions are provided: + + string scalar getf(j, l, maxlbl) + get formatted (j, l) entry from by variables up to maxlbl characters + + real matrix getnum(j, l) + get (j, l) numeric entry from by variables + + string matrix getchar(j, l,| raw) + get (j, l) string entry from by variables; raw controls whether to null-pad entries + + real rowvector getOutputRow(j) + get jth output row + + real colvector getOutputCol(j) + get jth output column by position + + real matrix getOutputVar(var) + get jth output var by name + + real matrix getOutputGroup(j) + get jth output group + +{pstd} +The following data is stored in {opt GstatsOutput}: + + summary statistics + ------------------ + + real matrix output + matrix with output statistics; J x kstats x kvars + + real scalar colvar + 1: columns are variables, rows are statistics; 0: the converse + + real scalar ksources + number of variable sources (0 if pool is true) + + real scalar kstats + number of statistics + + real matrix tabstat + 1: used tabstat; 0: used summarize + + string rowvector statvars + variables summarized + + string rowvector statnames + statistics computed + + real rowvector scodes + internal code for summary statistics + + real scalar pool + pooled source variables + + variable levels (empty if without -by()-) + ----------------------------------------- + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + 
number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + printing options + ---------------- + + void printOutput() + print summary table + + real scalar maxlbl + max by variable label/value width + + real scalar pretty + print pretty statistic names + + real scalar usevfmt + use variable format for printing + + string scalar dfmt + fallback printing format + + real scalar maxl + maximum column length + + void readDefaults() + reset printing defaults + +{marker statistics}{...} +{title:Statistics} + +{phang} +{cmd:statistics(}{it:statname} [{it:...}]{cmd:)} + specifies the statistics to be displayed; the default with {opt tabstat} + is equivalent to specifying {cmd:statistics(mean)}. ({opt stats()} + is a synonym for {opt statistics()}.) Multiple statistics + may be specified and are separated by white space, such as + {cmd:statistics(mean sd)}. Available statistics are + +{marker statname}{...} +{synoptset 17}{...} +{synopt:{space 4}{it:statname}}Definition{p_end} +{space 4}{synoptline} +{synopt:{space 4}{opt me:an}} mean{p_end} +{synopt:{space 4}{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{synopt:{space 4}{opt co:unt}} count of nonmissing observations{p_end} +{synopt:{space 4}{opt n}} same as {cmd:count}{p_end} +{synopt:{space 4}{opt nmiss:ing}} number of missing observations{p_end} +{synopt:{space 4}{opt perc:ent}} percentage of nonmissing observations{p_end} +{synopt:{space 4}{opt nuniq:ue}} number of unique elements{p_end} +{synopt:{space 4}{opt su:m}} sum{p_end} +{synopt:{space 4}{opt rawsu:m}} sum, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{synopt:{space 4}{opt nansu:m}} sum; returns . 
instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt rawnansu:m}} rawsum; returns . instead of 0 if all entries are missing{p_end} +{synopt:{space 4}{opt med:ian}} median (same as {opt p50}){p_end} +{synopt:{space 4}{opt p#.#}} arbitrary quantiles{p_end} +{synopt:{space 4}{opt p1}} 1st percentile{p_end} +{synopt:{space 4}{opt p2}} 2nd percentile{p_end} +{synopt:{space 4}{it:...}} 3rd-49th percentiles{p_end} +{synopt:{space 4}{opt p50}} 50th percentile (same as {opt median}){p_end} +{synopt:{space 4}{it:...}} 51st-97th percentiles{p_end} +{synopt:{space 4}{opt p98}} 98th percentile{p_end} +{synopt:{space 4}{opt p99}} 99th percentile{p_end} +{synopt:{space 4}{opt iqr}} interquartile range = {opt p75} - {opt p25}{p_end} +{synopt:{space 4}{opt q}} equivalent to specifying {cmd:p25 p50 p75}{p_end} +{synopt:{space 4}{opt sd}} standard deviation{p_end} +{synopt:{space 4}{opt v:ariance}} variance{p_end} +{synopt:{space 4}{opt cv}} coefficient of variation ({cmd:sd/mean}){p_end} +{synopt:{space 4}{opt select#}} #th smallest{p_end} +{synopt:{space 4}{opt select-#}} #th largest{p_end} +{synopt:{space 4}{opt mi:n}} minimum (same as {opt select1}){p_end} +{synopt:{space 4}{opt ma:x}} maximum (same as {opt select-1}){p_end} +{synopt:{space 4}{opt r:ange}} range = {opt max} - {opt min}{p_end} +{synopt:{space 4}{opt first}} first value{p_end} +{synopt:{space 4}{opt last}} last value{p_end} +{synopt:{space 4}{opt firstnm}} first nonmissing value{p_end} +{synopt:{space 4}{opt lastnm}} last nonmissing value{p_end} +{synopt:{space 4}{opt sem:ean}} standard error of mean ({cmd:sd/sqrt(n)}){p_end} +{synopt:{space 4}{opt seb:inomial}} standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}){p_end} +{synopt:{space 4}{opt sep:oisson}} standard error of the mean, Poisson ({cmd:sqrt(mean)}){p_end} +{synopt:{space 4}{opt sk:ewness}} skewness{p_end} +{synopt:{space 4}{opt k:urtosis}} kurtosis{p_end} +{synopt:{space 4}{opt gini}}Gini coefficient (negative truncated to 
0){p_end} +{synopt:{space 4}{opt gini|dropneg}}Gini coefficient (negative values dropped){p_end} +{synopt:{space 4}{opt gini|keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{space 4}{synoptline} +{p2colreset}{...} + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_summarize/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help summarize}; +{help tabstat}; +{help gtools} diff --git a/01.code/ado/g/gstats_transform.sthlp b/01.code/ado/g/gstats_transform.sthlp new file mode 100755 index 0000000..e656aaf --- /dev/null +++ b/01.code/ado/g/gstats_transform.sthlp @@ -0,0 +1,292 @@ +{smcl} +{* *! 
version 0.2.1 30Jan2020}{...} +{viewerdialog gstats_transform "dialog gstats_transform"}{...} +{vieweralsosee "[R] gstats_transform" "mansection R gstats_transform"}{...} +{viewerjumpto "Syntax" "gstats_transform##syntax"}{...} +{viewerjumpto "Description" "gstats_transform##description"}{...} +{viewerjumpto "Statistics" "gstats_transform##statistics"}{...} +{title:Title} + +{p2colset 5 25 28 2}{...} +{p2col :{cmd:gstats transform} {hline 2}} Apply statistical functions by group using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gstats transform} +{it:clist} +{ifin} +[{it:{help gstats transform##weight:weight}}] +[{cmd:,} +{it:{help gstats transform##table_options:options}}] + +{pstd}where {it:clist} is either + +{p 8 17 2} +[{opt (stat)}] +{varlist} +[ [{opt (stat)}] {it:...} ]{p_end} + +{p 8 17 2} +[{opt (stat)}] {it:target_var}{cmd:=}{varname} + [{it:target_var}{cmd:=}{varname} {it:...}] + [ [{opt (stat)}] {it:...}] + +{p 4 4 2}or any combination of the {it:varlist} or {it:target_var} forms, and +{it:stat} is one of{p_end} + +{p2colset 9 28 30 2}{...} +{p2col :{opt demean}}subtract the mean (default){p_end} +{p2col :{opt demedian}}subtract the median{p_end} +{p2col :{opt normalize}}(x - mean) / sd{p_end} +{p2col :{opt standardize}}same as {opt normalize}{p_end} +{p2col :{opt moving stat [# #]}}moving statistic {it:stat}; # specify the relative bounds ({help gstats transform##moving_format:see below}){p_end} +{p2col :{opt range stat [...]}}range statistic {it:stat} for observations within specified interval ({help gstats transform##interval_format:see below}){p_end} +{p2col :{opt cumsum [+/- [varname]]}}cumulative sum, optionally ascending (+) or descending (-) (optionally +/- by varname){p_end} +{p2col :{opt shift [[+/-]#]}}lags (-#) and leads (+#); unsigned numbers are positive (i.e. 
leads){p_end} +{p2col :{opt rank}}rank observations; use option {opt ties()} to specify how ties are handled{p_end} +{p2colreset}{...} + +{p 4 4 2} Some of the above transformations allow specifying various +options as part of their name. This is done to allow the user to request +various versions of the same transformation. However, this is not +required. The user can specify a global option that will be used for +all the corresponding transformations: + +{p2colset 9 28 30 2}{...} +{p2col :{opt moving stat}}{opt window()}{p_end} +{p2col :{opt range stat}}{opt interval()}{p_end} +{p2col :{opt cumsum}}{opt cumby()}{p_end} +{p2col :{opt shift}}{opt shiftby()}{p_end} +{p2colreset}{...} + +{p 4 4 2} Note {cmd:gstats moving} and {cmd:gstats range} are aliases +for {cmd:gstats transform}. In this case all the requested statistics +are assumed to be moving or range statistics, respectively. Finally, +{cmd:moving} and {bf:range} may be combined with any one of the +following:{p_end} + +{p2colset 9 22 24 2}{...} +{p2col :{opt mean}}means (default){p_end} +{p2col :{opt geomean}}geometric mean (missing if var has any negative values){p_end} +{p2col :{opt count}}number of nonmissing observations{p_end} +{p2col :{opt nmissing}}number of missing observations{p_end} +{p2col :{opt sum}}sums{p_end} +{p2col :{opt rawsum}}sums, ignoring optionally specified weights ({bf:note}: zero-weighted obs are still excluded){p_end} +{p2col :{opt nansum}}sum; returns . instead of 0 if all entries are missing{p_end} +{p2col :{opt rawnansum}}rawsum; returns . 
instead of 0 if all entries are missing{p_end} +{p2col :{opt median}}medians (same as {opt p50}){p_end} +{p2col :{opt p#.#}}arbitrary quantiles{p_end} +{p2col :{opt p1}}1st percentile{p_end} +{p2col :{opt p2}}2nd percentile{p_end} +{p2col :{it:...}}3rd{hline 1}49th percentiles{p_end} +{p2col :{opt p50}}50th percentile (same as {cmd:median}){p_end} +{p2col :{it:...}}51st{hline 1}97th percentiles{p_end} +{p2col :{opt p98}}98th percentile{p_end} +{p2col :{opt p99}}99th percentile{p_end} +{p2col :{opt iqr}}interquartile range{p_end} +{p2col :{opt sd}}standard deviation{p_end} +{p2col :{opt var:iance}}variance{p_end} +{p2col :{opt cv}}coefficient of variation ({cmd:sd/mean}){p_end} +{p2col :{opt select#}}#th smallest{p_end} +{p2col :{opt select-#}}#th largest{p_end} +{p2col :{opt rawselect#}}#th smallest, ignoring weights{p_end} +{p2col :{opt rawselect-#}}#th largest, ignoring weights{p_end} +{p2col :{opt max}}maximums{p_end} +{p2col :{opt min}}minimums{p_end} +{p2col :{opt range}}range = {opt max} - {opt min}{p_end} +{p2col :{opt first}}first value{p_end} +{p2col :{opt last}}last value{p_end} +{p2col :{opt firstnm}}first nonmissing value{p_end} +{p2col :{opt lastnm}}last nonmissing value{p_end} +{p2col :{opt sem:ean}}standard error of the mean ({cmd:sd/sqrt(n)}){p_end} +{p2col :{opt seb:inomial}}standard error of the mean, binomial ({cmd:sqrt(p(1-p)/n)}) (missing if source not 0, 1){p_end} +{p2col :{opt sep:oisson}}standard error of the mean, Poisson ({cmd:sqrt(mean / n)}) (result rounded to nearest integer){p_end} +{p2col :{opt skewness}}Skewness{p_end} +{p2col :{opt kurtosis}}Kurtosis{p_end} +{p2col :{opt gini}}Gini coefficient (negative truncated to 0){p_end} +{p2col :{opt gini dropneg}}Gini coefficient (negative values dropped){p_end} +{p2col :{opt gini keepneg}}Gini coefficient (negative values kept; the user is responsible for the interpretation of the Gini in this case){p_end} +{p2colreset}{...} + +{marker interval_format}{...} +{dlgtab:Interval format} + 
+{pstd} +{cmd:range stat} must specify an interval or use the {opt interval(...)} +option. The interval must be of the form + +{p 8 17 2} +{bf:#}[{it:statlow}] {bf:#}[{it:stathigh}] [{it:var}] + +{pstd} +This computes, for each observation {it:i}, the summary statistic {it:stat} +among all observations {it:j} of the source variable such that + +{p 8 17 2} +var[i] + # * statlow(var) <= var[j] <= var[i] + # * stathigh(var) + +{pstd} +If {it:var} is not specified, it is taken to be the source variable itself. +{it:statlow} and {it:stathigh} are summary statistics computed based on +{it:every} value of {it:var}. If they are not specified, then {bf:#} is used by +itself to construct the bounds, but {bf:#} may be missing ({bf:.}) to mean no +upper or lower bound. For example, given some variable {it:x} with {it:N} observations, +we have{p_end} + + Input -> Meaning + {hline 55} + -2 2 time -> j: time[i] - 2 <= time[j] <= time[i] + 2 + i.e. {it:stat} within a 2-period time window + + -sd sd -> j: x[i] - sd(x) <= x[j] <= x[i] + sd(x) + i.e. {it:stat} for obs within a standard dev + +{marker moving_format}{...} +{dlgtab:Moving window format} + +{pstd}{bf:moving stat} must specify a relative range or use the {opt window(# #)} +option. The relative range uses a window defined by the {it:observations}. This +would be equivalent to computing time series rolling window statistics +using the time variable set to {it:_n}. For example, given some variable +{it:x} with {it:N} observations, we have{p_end} + + Input -> Range + {hline 31} + -3 3 -> x[i - 3] to x[i + 3] + -3 . -> x[i - 3] to x[N] + . 3 -> x[1] to x[i + 3] + -3 -1 -> x[i - 3] to x[i - 1] + -3 0 -> x[i - 3] to x[i] + 5 10 -> x[i + 5] to x[i + 10] + +{pstd}and so on. If the observation is outside of the admissible range +(e.g. {it:-10 10} but {it:i = 5}) the output is set to missing. If you +don't specify a range in ({it:moving stat}) then the range in {opt window(# #)} +is used. 
+ +{marker options}{...} +{title:Options} + +{synoptset 23 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Common Options} +{synopt:{opth by(varlist)}}Group statistics by variable. +{p_end} +{synopt:{opt replace}}Allow replacing existing variables. +{p_end} +{synopt :{opt wild:parse}}Allow rename-style syntax in target naming. +{p_end} +{synopt:{opt labelf:ormat}}Custom label engine: {bf:(#stat#) #sourcelabel#} is the default. +{p_end} +{synopt:{opth labelp:rogram(str)}}Program to parse {opt labelformat} (see examples). +{p_end} +{synopt :{opth auto:rename}[{cmd:(}{str}{cmd:)}]}Automatically name targets based on requested stats. Default is {it:#source#_#stat#}. +{p_end} +{synopt:{opt nogreedy}}Use slower but memory-efficient (non-greedy) algorithm. +{p_end} +{synopt:{opth type:s(str)}}Override variable types for targets ({bf:use with caution}). +{p_end} + +{syntab :Command Options} +{synopt:{opt window(lower upper)}}With {it:moving stat}. Relative observation range for moving statistics (if not specified in call). E.g. {opt window(-3 1)} means from 3 lags to 1 lead. {opt window(. #)} and {opt window(# .)} mean from the start and through the end. +{p_end} +{synopt:{opt interval(#[stat] #[stat] [var])}}With {it:range stat}. Interval for range statistics that don't specify their own interval. +{p_end} +{synopt:{opt cumby([+/- [varname]])}}With {it:cumsum}. Sort options for cumsum variables that don't specify their own. +{p_end} +{synopt:{opt shiftby([+/-]#)}}With {it:shift}. Lag or lead to use when {bf:shift} is requested without specifying a number. +{p_end} +{synopt:{opt ties(str)}}With {it:rank}. How to break ties for {opt rank}. {opt d:efault} assigns the average rank; {opt u:nique} breaks ties arbitrarily; {opt stableunique} breaks ties using the order values appear in the data; {opt f:ield} counts the number of values greater than; {opt t:rack} counts the number of values less than. 
+{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s +are allowed. +{p_end} + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gstats transform} applies various statistical transformations +to input data. It is similar to {cmd:gcollapse, merge} or {cmd:gegen} but +for individual-level transformations. That is, {cmd:gcollapse} takes an +input variable and produces a single statistic; {cmd:gstats transform} +applies a function to each element of the input variable. For example, +subtracting the mean. + +{pstd} +Every function available to {cmd:gstats transform} can be called via +{cmd:gegen}. Further, note that while not every function will use weights +in their computations (e.g. {it:shift} ignores weights in the actual +transformation), if weights are specified they will be used to flag +acceptable observations (i.e. missing, zero, and, except for {opt iweights}, +negative observations get excluded). + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_transform/index.html#examples":online documentation} +for examples. 
+ +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help gegen}; +{help gcollapse}; +{help gtools} diff --git a/01.code/ado/g/gstats_winsor.sthlp b/01.code/ado/g/gstats_winsor.sthlp new file mode 100755 index 0000000..b951766 --- /dev/null +++ b/01.code/ado/g/gstats_winsor.sthlp @@ -0,0 +1,202 @@ +{smcl} +{* *! version 0.1.4 23Jan2019}{...} +{viewerdialog gstats_winsor "dialog gstats_winsor"}{...} +{vieweralsosee "[R] gstats_winsor" "mansection R gstats_winsor"}{...} +{viewerjumpto "Syntax" "gstats_winsor##syntax"}{...} +{viewerjumpto "Description" "gstats_winsor##description"}{...} +{title:Title} + +{p2colset 5 22 34 2}{...} +{p2col :{cmd:gstats winsor} {hline 2}} Winsorize data using C for speed {p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to +the latest stable version. + +{marker syntax}{...} +{title:Syntax} + +{pstd} +{it:gstats winsor} was written as a fast {opt winsor2} alternative. It +additionally accepts weights. 
{p_end} + +{p 8 17 2} +{cmd:gstats winsor} +{varlist} +{ifin} +[{it:{help gstats winsor##weight:weight}}] +[{cmd:,} {opth by(varlist)} {it:{help gstats##table_options:options}}] + +{synoptset 19 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Winsor Options} +{synopt :{opth p:refix(str)}} Generate targets as {it:prefix}source (default empty). +{p_end} +{synopt :{opth s:uffix(str)}} Generate targets as source{it:suffix} (default {it:_w} with cut and {it:_tr} with {opt trim}). +{p_end} +{synopt :{opth gen:erate(namelist)}} Named targets to generate; one per source. +{p_end} +{synopt :{opt c:uts(#.# #.#)}} Cut points (default 1.0 and 99.0 for 1st and 99th percentiles). +{p_end} +{synopt :{opt t:rim}} Trim instead of Winsorize (i.e. replace outliers with missing values). +{p_end} +{synopt :{opt l:abel}} Add Winsorized/trimming note to target labels. +{p_end} +{synopt :{opt replace}} Replace targets if they exist. +{p_end} +{synopt :{opt nomiss:ing}} With {opt by()}, ignore groups with missing entries. +{p_end} + +{syntab:Gtools Options} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{opt bench}{it:[(int)]}}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, {opt iweight}s, and {opt pweight}s are +allowed (see {manhelp weight U:11.1.6 weight} for more on the way Stata +uses weights). 
+ +{marker description}{...} +{title:Description} + +{pstd} +{it:gstats winsor} winsorizes or trims (if the trim option is specified) +the variables in varlist at particular percentiles specified by option +{opt cuts(#1 #2)}. By default, new variables will be generated with a +suffix "_w" or "_tr", respectively. The user can control this via the +{opt suffix()} option. The replace option replaces the variables with +their winsorized or trimmed ones. + +{error}{dlgtab:Difference between winsorizing and trimming}{text} + +{pstd} +{it:Important}: This section is nearly verbatim from the equivalent help +section from {help winsor2}. + +{pstd} +Winsorizing is not equivalent to simply excluding data, which is +a simpler procedure, called trimming or truncation. In a trimmed +estimator, the extreme values are discarded; in a Winsorized estimator, +the extreme values are instead replaced by certain percentiles, +specified by option cuts(# #). For details, see {help winsor} (if +installed), and {help trimmean} (if installed). + +{pstd} +For example, you type the following commands to get the 1st and 99th +percentiles of the variable wage, 1.930993 and 38.70926. + +{phang2} {bf: . sysuse nlsw88, clear} {p_end} +{phang2} {bf: . sum wage, detail} {p_end} + +{pstd} +By default, {cmd:gstats winsor} winsorizes wage at 1st and 99th percentiles, + +{phang2} {bf: . gstats winsor wage, replace cuts(1 99)} {p_end} + +{pstd} +which can be done by hand: + +{phang2} {bf: . replace wage=1.930993 if wage<1.930993} {p_end} +{phang2} {bf: . replace wage=38.70926 if wage>38.70926} {p_end} + +{pstd} + +Note that values smaller than the 1st percentile are replaced by that +value, and similarly with values above the 99th percentile. When the +-{bf:trim}- option is specified, those values are set to missing instead +(which are discarded by most commands): + +{phang2} {bf: . gstats winsor wage, replace cuts(1 99) trim} {p_end} + +{pstd} +which can also be done by hand: + +{phang2} {bf: . 
replace wage=. if wage<1.930993} {p_end} +{phang2} {bf: . replace wage=. if wage>38.70926} {p_end} + +{pstd} +In this case, we discard values smaller than the 1st percentile or greater +than the 99th percentile. This is trimming. + +{marker example}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gstats_winsor/index.html#examples":online documentation} +for examples. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gstats} is maintained as part of the {manhelp gtools R:gtools} project at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{opt gstats winsor} was written largely to mimic the functionality of the community-contributed command {opt winsor2}. + +{p 8 8 2} +{cmd:Yujun,Lian (Arlion)} Department of Finance, Lingnan College, Sun Yat-Sen University.{break} +E-mail: {browse "mailto:arlionn@163.com":arlionn@163.com}.{break} +Blog: {browse "http://blog.cnfol.com/arlion":http://blog.cnfol.com/arlion}.{break} +Homepage: {browse "http://www.lingnan.sysu.edu.cn/lnshizi/faculty_vch.asp?name=lianyj":http://www.lingnan.sysu.edu.cn/lnshizi/faculty_vch.asp?name=lianyj}. {break} + +{pstd} +This, in turn, had incorporated some code from {opt winsor}, by + +{p 8 8 2} +Nicholas J. Cox, Durham University, U.K.{break} +n.j.cox@durham.ac.uk + +{p 4 4 2} +and {opt winsorizeJ.ado}, by + +{p 8 8 2} +Judson Caskey + +{pstd} +{opt gstats winsor}'s options and this helpfile borrow heavily from {opt winsor2}. +{p_end} + +{pstd} +{opt gtools} was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. 
+{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{pstd} +help for +{help gtools}; +{help winsor2} (if installed) diff --git a/01.code/ado/g/gtools.ado b/01.code/ado/g/gtools.ado new file mode 100755 index 0000000..e03f2c9 --- /dev/null +++ b/01.code/ado/g/gtools.ado @@ -0,0 +1,491 @@ +*! version 1.10.1 05Dec2022 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Program for managing the gtools package installation + +capture program drop gtools +program gtools + version 13.1 + * Normalized OS name: macosx on Macs, lower-cased c(os) otherwise + if ( inlist("`c(os)'", "MacOSX") | strpos("`c(machine_type)'", "Mac") ) local c_os_ macosx + else local c_os_: di lower("`c(os)'") + + syntax, [ /// + LICENSEs /// + Verbose /// + Install_latest /// + Upgrade /// + showcase /// + examples /// + test /// + TESTs(str) /// + branch(str) /// + ] + * Default to master; any branch other than develop or master only raises a warning + if ( `"`branch'"' == "" ) local branch master + if !inlist(`"`branch'"', "develop", "master") { + disp as err "{bf:Warning}: Branch `branch' is not intended for normal use." + * exit 198 + } + + local cwd `c(pwd)' + local github https://raw.githubusercontent.com/mcaceresb/stata-gtools/`branch' + + if ( "`licenses'" == "licenses" ) { + disp `"gtools is {browse "https://github.com/mcaceresb/stata-gtools/blob/master/LICENSE":MIT-licensed }"' + disp "" + disp `"The GNU C library is GPL-licensed. 
See the {browse "http://www.gnu.org/licenses/":GNU lesser GPL for more details}."' + disp "" + disp `"The implementation of quicksort used is authored by the FreeBSD project and is BSD3-licensed."' + disp "" + disp `"The implementation of spookyhash used is authored by Guillaume Voirin and is {browse "https://github.com/centaurean/spookyhash/blob/master/LICENSE.md":BSD3-licensed}."' + + if ( "`verbose'" != "" ) { + gtools_licenses + } + + if ( `"`install_latest'`upgrade'`showcase'`examples'`test'`tests'"' == `""' ) { + exit 0 + } + } + * Reinstall gtools from the build folder of the requested branch + if ( ("`install_latest'" == "install_latest") | ("`upgrade'" == "upgrade") ) { + cap net uninstall gtools + net install gtools, from(`github'/build) replace + if ( `"`showcase'`examples'`test'`tests'"' == `""' ) { + exit 0 + } + } + + if ( "`showcase'`examples'" != "" ) { + gtools_showcase + if ( "`test'`tests'" == "" ) { + exit 0 + } + } + * Warn about the expected run time, then ask for confirmation before running unit tests + if ( `"`test'`tests'"' != "" ) { + local t_hours comparisons + local t_days bench_full + local t_known dependencies basic_checks comparisons switches bench_test bench_full + local t_extra: list tests - t_known + + if ( `:list sizeof t_extra' ) { + disp `"(unknown tests detected: `t_extra'; will try to run anyway)"' + } + + if ( `"`tests'"' == "" ) { + disp as txt "{bf:WARNING:} Default unit tests from branch `branch' can take several" + disp as txt "hours. See {help gtools:help gtools} for details on unit testing." + } + else if ( `:list t_hours in tests' ) { + disp as txt "{bf:WARNING:} Unit tests" + disp as txt _n(1) " `tests'" _n(1) + disp as txt "from branch `branch' can take several hours. See {help gtools:help gtools} for details." + } + else if ( `:list t_days in tests' ) { + disp as txt "{bf:WARNING:} Unit tests" + disp as txt _n(1) " `tests'" _n(1) + disp as txt "from branch `branch' can take more than a day. See {help gtools:help gtools} for details." 
+ } + else { + disp as txt "{bf:Note:} Unit tests '`tests'' from branch `branch'." + } + disp as txt "Are you sure you want to run them? 
(yes/no)", _request(GTOOLS_TESTS) + if inlist(`"${GTOOLS_TESTS}"', "y", "yes") { + global GTOOLS_TESTS + cap noi do `github'/build/gtools_tests.do `tests' + exit _rc + } + else { + global GTOOLS_TESTS + exit 0 + } + } + * No action requested: point to the help, show version info, and check the plugin + display "Nothing to do. See {stata help gtools} or {stata gtools, examples} for usage. Version info:" + * mata mata mlib index + which gtools + cap noi _gtools_internal _check + if ( _rc ) { + disp as err "({bf:warning}: gtools_plugin internal check failed)" + } +end + +capture program drop gtools_licenses +program gtools_licenses + disp _n(1) `"{hline 79}"' /// + _n(1) `"gtools license"' /// + _n(1) `""' /// + _n(1) `"MIT License"' /// + _n(1) `""' /// + _n(1) `"Copyright (c) 2017 Mauricio Caceres Bravop"' /// + _n(1) `""' /// + _n(1) `"Permission is hereby granted, free of charge, to any person obtaining a copy"' /// + _n(1) `"of this software and associated documentation files (the "Software"), to"' /// + _n(1) `"deal in the Software without restriction, including without limitation the"' /// + _n(1) `"rights to use, copy, modify, merge, publish, distribute, sublicense, and/or"' /// + _n(1) `"sell copies of the Software, and to permit persons to whom the Software is"' /// + _n(1) `"furnished to do so, subject to the following conditions:"' /// + _n(1) `""' /// + _n(1) `"The above copyright notice and this permission notice shall be included in all"' /// + _n(1) `"copies or substantial portions of the Software."' /// + _n(1) `""' /// + _n(1) `"THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR"' /// + _n(1) `"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,"' /// + _n(1) `"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL"' /// + _n(1) `"THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER"' /// + _n(1) `"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,"' /// + _n(1) `"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE"' /// + _n(1) `"SOFTWARE."' /// + _n(1) `""' + disp _n(1) `"{hline 79}"' /// + _n(1) `"spookyhash license"' /// + _n(1) `""' /// + _n(1) `"Copyright (c) 2015, Guillaume Voirin"' /// + _n(1) `""' /// + _n(1) `"All rights reserved."' /// + _n(1) `""' /// + _n(1) `"Redistribution and use in source and binary forms, with or without"' /// + _n(1) `"modification, are permitted provided that the following conditions are met:"' /// + _n(1) `""' /// + _n(1) `"1. Redistributions of source code must retain the above copyright notice, this"' /// + _n(1) `" list of conditions and the following disclaimer."' /// + _n(1) `""' /// + _n(1) `"2. Redistributions in binary form must reproduce the above copyright notice,"' /// + _n(1) `" this list of conditions and the following disclaimer in the documentation"' /// + _n(1) `" and/or other materials provided with the distribution."' /// + _n(1) `""' /// + _n(1) `"3. Neither the name of the copyright holder nor the names of its"' /// + _n(1) `" contributors may be used to endorse or promote products derived from"' /// + _n(1) `" this software without specific prior written permission."' /// + _n(1) `""' /// + _n(1) `"THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS""' /// + _n(1) `"AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE"' /// + _n(1) `"IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE"' /// + _n(1) `"DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE"' /// + _n(1) `"FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL"' /// + _n(1) `"DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR"' /// + _n(1) `"SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER"' /// + _n(1) `"CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,"' /// + _n(1) `"OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE"' /// + _n(1) `"OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."' /// + _n(1) `""' + disp _n(1) `"{hline 79}"' /// + _n(1) `"quicksort license"' /// + _n(1) `""' /// + _n(1) `"Copyright (c) 1992, 1993"' /// + _n(1) `" The Regents of the University of California. All rights reserved."' /// + _n(1) `""' /// + _n(1) `"Redistribution and use in source and binary forms, with or without"' /// + _n(1) `"modification, are permitted provided that the following conditions"' /// + _n(1) `"are met:"' /// + _n(1) `"1. Redistributions of source code must retain the above copyright"' /// + _n(1) `" notice, this list of conditions and the following disclaimer."' /// + _n(1) `"2. Redistributions in binary form must reproduce the above copyright"' /// + _n(1) `" notice, this list of conditions and the following disclaimer in the"' /// + _n(1) `" documentation and/or other materials provided with the distribution."' /// + _n(1) `"4. 
Neither the name of the University nor the names of its contributors"' /// + _n(1) `" may be used to endorse or promote products derived from this software"' /// + _n(1) `" without specific prior written permission."' /// + _n(1) `""' /// + _n(1) `"THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND"' /// + _n(1) `"ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE"' /// + _n(1) `"IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE"' /// + _n(1) `"ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE"' /// + _n(1) `"FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL"' /// + _n(1) `"DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS"' /// + _n(1) `"OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)"' /// + _n(1) `"HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT"' /// + _n(1) `"LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY"' /// + _n(1) `"OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF"' /// + _n(1) `"SUCH DAMAGE."' /// + _n(1) `""' + disp _n(1) `"{hline 79}"' /// + _n(1) `"GNU C library license"' /// + _n(1) `""' /// + _n(1) `" GNU LESSER GENERAL PUBLIC LICENSE"' /// + _n(1) `" Version 3, 29 June 2007"' /// + _n(1) `""' /// + _n(1) `" Copyright (C) 2007 Free Software Foundation, Inc. "' /// + _n(1) `" Everyone is permitted to copy and distribute verbatim copies"' /// + _n(1) `" of this license document, but changing it is not allowed."' /// + _n(1) `""' /// + _n(1) `""' /// + _n(1) `" This version of the GNU Lesser General Public License incorporates"' /// + _n(1) `"the terms and conditions of version 3 of the GNU General Public"' /// + _n(1) `"License, supplemented by the additional permissions listed below."' /// + _n(1) `""' /// + _n(1) `" 0. 
Additional Definitions."' /// + _n(1) `""' /// + _n(1) `" As used herein, "this License" refers to version 3 of the GNU Lesser"' /// + _n(1) `"General Public License, and the "GNU GPL" refers to version 3 of the GNU"' /// + _n(1) `"General Public License."' /// + _n(1) `""' /// + _n(1) `" "The Library" refers to a covered work governed by this License,"' /// + _n(1) `"other than an Application or a Combined Work as defined below."' /// + _n(1) `""' /// + _n(1) `" An "Application" is any work that makes use of an interface provided"' /// + _n(1) `"by the Library, but which is not otherwise based on the Library."' /// + _n(1) `"Defining a subclass of a class defined by the Library is deemed a mode"' /// + _n(1) `"of using an interface provided by the Library."' /// + _n(1) `""' /// + _n(1) `" A "Combined Work" is a work produced by combining or linking an"' /// + _n(1) `"Application with the Library. The particular version of the Library"' /// + _n(1) `"with which the Combined Work was made is also called the "Linked"' /// + _n(1) `"Version"."' /// + _n(1) `""' /// + _n(1) `" The "Minimal Corresponding Source" for a Combined Work means the"' /// + _n(1) `"Corresponding Source for the Combined Work, excluding any source code"' /// + _n(1) `"for portions of the Combined Work that, considered in isolation, are"' /// + _n(1) `"based on the Application, and not on the Linked Version."' /// + _n(1) `""' /// + _n(1) `" The "Corresponding Application Code" for a Combined Work means the"' /// + _n(1) `"object code and/or source code for the Application, including any data"' /// + _n(1) `"and utility programs needed for reproducing the Combined Work from the"' /// + _n(1) `"Application, but excluding the System Libraries of the Combined Work."' /// + _n(1) `""' + disp _n(1) `" 1. 
Exception to Section 3 of the GNU GPL."' /// + _n(1) `""' /// + _n(1) `" You may convey a covered work under sections 3 and 4 of this License"' /// + _n(1) `"without being bound by section 3 of the GNU GPL."' /// + _n(1) `""' /// + _n(1) `" 2. Conveying Modified Versions."' /// + _n(1) `""' /// + _n(1) `" If you modify a copy of the Library, and, in your modifications, a"' /// + _n(1) `"facility refers to a function or data to be supplied by an Application"' /// + _n(1) `"that uses the facility (other than as an argument passed when the"' /// + _n(1) `"facility is invoked), then you may convey a copy of the modified"' /// + _n(1) `"version:"' /// + _n(1) `""' /// + _n(1) `" a) under this License, provided that you make a good faith effort to"' /// + _n(1) `" ensure that, in the event an Application does not supply the"' /// + _n(1) `" function or data, the facility still operates, and performs"' /// + _n(1) `" whatever part of its purpose remains meaningful, or"' /// + _n(1) `""' /// + _n(1) `" b) under the GNU GPL, with none of the additional permissions of"' /// + _n(1) `" this License applicable to that copy."' /// + _n(1) `""' /// + _n(1) `" 3. Object Code Incorporating Material from Library Header Files."' /// + _n(1) `""' /// + _n(1) `" The object code form of an Application may incorporate material from"' /// + _n(1) `"a header file that is part of the Library. 
You may convey such object"' /// + _n(1) `"code under terms of your choice, provided that, if the incorporated"' /// + _n(1) `"material is not limited to numerical parameters, data structure"' /// + _n(1) `"layouts and accessors, or small macros, inline functions and templates"' /// + _n(1) `"(ten or fewer lines in length), you do both of the following:"' /// + _n(1) `""' /// + _n(1) `" a) Give prominent notice with each copy of the object code that the"' /// + _n(1) `" Library is used in it and that the Library and its use are"' /// + _n(1) `" covered by this License."' /// + _n(1) `""' /// + _n(1) `" b) Accompany the object code with a copy of the GNU GPL and this license"' /// + _n(1) `" document."' /// + _n(1) `""' + disp _n(1) `" 4. Combined Works."' /// + _n(1) `""' /// + _n(1) `" You may convey a Combined Work under terms of your choice that,"' /// + _n(1) `"taken together, effectively do not restrict modification of the"' /// + _n(1) `"portions of the Library contained in the Combined Work and reverse"' /// + _n(1) `"engineering for debugging such modifications, if you also do each of"' /// + _n(1) `"the following:"' /// + _n(1) `""' /// + _n(1) `" a) Give prominent notice with each copy of the Combined Work that"' /// + _n(1) `" the Library is used in it and that the Library and its use are"' /// + _n(1) `" covered by this License."' /// + _n(1) `""' /// + _n(1) `" b) Accompany the Combined Work with a copy of the GNU GPL and this license"' /// + _n(1) `" document."' /// + _n(1) `""' /// + _n(1) `" c) For a Combined Work that displays copyright notices during"' /// + _n(1) `" execution, include the copyright notice for the Library among"' /// + _n(1) `" these notices, as well as a reference directing the user to the"' /// + _n(1) `" copies of the GNU GPL and this license document."' /// + _n(1) `""' /// + _n(1) `" d) Do one of the following:"' /// + _n(1) `""' /// + _n(1) `" 0) Convey the Minimal Corresponding Source under the terms of this"' /// + _n(1) 
`" License, and the Corresponding Application Code in a form"' /// + _n(1) `" suitable for, and under terms that permit, the user to"' /// + _n(1) `" recombine or relink the Application with a modified version of"' /// + _n(1) `" the Linked Version to produce a modified Combined Work, in the"' /// + _n(1) `" manner specified by section 6 of the GNU GPL for conveying"' /// + _n(1) `" Corresponding Source."' /// + _n(1) `""' /// + _n(1) `" 1) Use a suitable shared library mechanism for linking with the"' /// + _n(1) `" Library. A suitable mechanism is one that (a) uses at run time"' /// + _n(1) `" a copy of the Library already present on the user's computer"' /// + _n(1) `" system, and (b) will operate properly with a modified version"' /// + _n(1) `" of the Library that is interface-compatible with the Linked"' /// + _n(1) `" Version."' /// + _n(1) `""' /// + _n(1) `" e) Provide Installation Information, but only if you would otherwise"' /// + _n(1) `" be required to provide such information under section 6 of the"' /// + _n(1) `" GNU GPL, and only to the extent that such information is"' /// + _n(1) `" necessary to install and execute a modified version of the"' /// + _n(1) `" Combined Work produced by recombining or relinking the"' /// + _n(1) `" Application with a modified version of the Linked Version. (If"' /// + _n(1) `" you use option 4d0, the Installation Information must accompany"' /// + _n(1) `" the Minimal Corresponding Source and Corresponding Application"' /// + _n(1) `" Code. If you use option 4d1, you must provide the Installation"' /// + _n(1) `" Information in the manner specified by section 6 of the GNU GPL"' /// + _n(1) `" for conveying Corresponding Source.)"' /// + _n(1) `""' + disp _n(1) `" 5. 
Combined Libraries."' /// + _n(1) `""' /// + _n(1) `" You may place library facilities that are a work based on the"' /// + _n(1) `"Library side by side in a single library together with other library"' /// + _n(1) `"facilities that are not Applications and are not covered by this"' /// + _n(1) `"License, and convey such a combined library under terms of your"' /// + _n(1) `"choice, if you do both of the following:"' /// + _n(1) `""' /// + _n(1) `" a) Accompany the combined library with a copy of the same work based"' /// + _n(1) `" on the Library, uncombined with any other library facilities,"' /// + _n(1) `" conveyed under the terms of this License."' /// + _n(1) `""' /// + _n(1) `" b) Give prominent notice with the combined library that part of it"' /// + _n(1) `" is a work based on the Library, and explaining where to find the"' /// + _n(1) `" accompanying uncombined form of the same work."' /// + _n(1) `""' /// + _n(1) `" 6. Revised Versions of the GNU Lesser General Public License."' /// + _n(1) `""' /// + _n(1) `" The Free Software Foundation may publish revised and/or new versions"' /// + _n(1) `"of the GNU Lesser General Public License from time to time. Such new"' /// + _n(1) `"versions will be similar in spirit to the present version, but may"' /// + _n(1) `"differ in detail to address new problems or concerns."' /// + _n(1) `""' /// + _n(1) `" Each version is given a distinguishing version number. If the"' /// + _n(1) `"Library as you received it specifies that a certain numbered version"' /// + _n(1) `"of the GNU Lesser General Public License "or any later version""' /// + _n(1) `"applies to it, you have the option of following the terms and"' /// + _n(1) `"conditions either of that published version or of any later version"' /// + _n(1) `"published by the Free Software Foundation. 
If the Library as you"' /// + _n(1) `"received it does not specify a version number of the GNU Lesser"' /// + _n(1) `"General Public License, you may choose any version of the GNU Lesser"' /// + _n(1) `"General Public License ever published by the Free Software Foundation."' /// + _n(1) `""' /// + _n(1) `" If the Library as you received it specifies that a proxy can decide"' /// + _n(1) `"whether future versions of the GNU Lesser General Public License shall"' /// + _n(1) `"apply, that proxy's public statement of acceptance of any version is"' /// + _n(1) `"permanent authorization for you to choose that version for the"' /// + _n(1) `"Library."' +end + +capture program drop gtools_showcase +program gtools_showcase + * preserve + gtools_cmd sysuse auto, clear + + gtools_head gstats {hdfe|residualize} varlist [if] [in] [weight], [absorb(varlist) options] + gtools_cmd gstats hdfe hdfe_price = price, absorb(foreign rep78) + gtools_cmd gstats residualize price mpg [w = gear_ratio], absorb(foreign rep78) prefix(res_) + + gtools_head gstats {sum|tab} varlist [if] [in] [weight], [by(varlist) options] + gtools_cmd gstats sum price [pw = gear_ratio / 4] + gtools_cmd gstats tab price mpg, by(foreign) matasave + + gtools_head gquantiles [newvarname =] exp [if] [in] [weight], {_pctile|xtile|pctile} [options] + gtools_cmd gquantiles 2 * price, _pctile nq(10) + gtools_cmd gquantiles p10 = 2 * price, pctile nq(10) + gtools_cmd gquantiles x10 = 2 * price, xtile nq(10) by(rep78) + gtools_cmd fasterxtile xx = log(price) [w = weight], cutpoints(p10) by(foreign) + + gtools_head gstats winsor varlist [if] [in] [weight], [by(varlist) cuts(# #) options] + gtools_cmd gstats winsor price gear_ratio mpg, cuts(5 95) s(_w1) + gtools_cmd gstats winsor price gear_ratio mpg, cuts(5 95) by(foreign) s(_w2) + + gtools_head hashsort varlist, [options] + gtools_cmd hashsort -make + gtools_cmd hashsort foreign -rep78, benchmark verbose mlast + + gtools_head gegen target = stat(source) [if] [in] 
[weight], by(varlist) [options] + gtools_cmd gegen tag = tag(foreign) + gtools_cmd gegen group = tag(-price make) + gtools_cmd gegen p2_5 = pctile(price) [w = weight], by(foreign) p(2.5) + + gtools_head gisid varlist [if] [in], [options] + gtools_cmd gisid make, missok + gtools_cmd gisid price in 1 / 2 + + gtools_head gduplicates varlist [if] [in], [options gtools(gtools_options)] + gtools_cmd gduplicates report foreign + gtools_cmd gduplicates report rep78 if foreign, gtools(bench(3)) + + gtools_head glevelsof varlist [if] [in], [options] + gtools_cmd glevelsof rep78, local(levels) sep(" | ") + gtools_cmd glevelsof foreign mpg if price < 4000, loc(lvl) sep(" | ") colsep(", ") + gtools_cmd glevelsof foreign mpg in 10 / 70, gen(uniq_) nolocal + + gtools_head gtop varlist [if] [in] [weight], [options] + disp "gtoplevelsof varlist [if] [in] [weight], [options]" _n(1) + gtools_cmd gtoplevelsof foreign rep78 + gtools_cmd gtop foreign rep78 [w = weight], ntop(5) missrow groupmiss pctfmt(%6.4g) colmax(3) + + gtools_head gregress depvar indepvars [if] [in] [weight], [by(varlist) options] + gtools_cmd gregress price mpg rep78, mata(coefs) prefix(b(_b_) se(_se_)) + gtools_cmd gregress price mpg [fw = rep78], by(foreign) absorb(rep78 headroom) cluster(rep78) + + gtools_head givregress depvar (endog = instruments) exog [if] [in] [weight], [by(varlist) options] + gtools_cmd givregress price (mpg = gear_ratio) rep78, mata(coefs) prefix(b(_b_) se(_se_)) replace + gtools_cmd givregress price (mpg = gear_ratio) [fw = rep78], by(foreign) absorb(rep78 headroom) cluster(rep78) + + gtools_head gglm depvar indepvars [if] [in] [weight], family(...) 
[by(varlist) options] + gtools_cmd gglm price mpg rep78, family(poisson) mata(coefs) prefix(b(_b_) se(_se_)) replace + gtools_cmd gglm price mpg [fw = trunk], family(poisson) by(foreign) absorb(rep78 headroom) cluster(rep78) + gtools_cmd + gtools_cmd gglm foreign price rep78 [fw = trunk], family(binomial) absorb(headroom) mata(coefs) + gtools_cmd gglm foreign price if rep78 > 2, family(binomial) by(rep78) prefix(b(_b_) se(_se_)) replace + + gtools_head gcollapse (stat) out = src [(stat) out = src ...] [if] [if] [weight], by(varlist) [options] + gtools_cmd gen h1 = headroom + gtools_cmd gen h2 = headroom + gtools_cmd local lbl labelformat(#stat:pretty# #sourcelabel#) + gtools_cmd + gtools_cmd gcollapse (mean) mean = price (median) p50 = gear_ratio, by(make) merge v `lbl' + disp `"disp "\`:var label mean', \`:var label p50'""' + gtools_cmd gcollapse (iqr) irq? = h? (nunique) turn (p97.5) mpg, by(foreign rep78) bench(2) wild + + gtools_head gcontract varlist [if] [if] [fweight], [options] + gtools_cmd gcontract foreign [fw = turn], freq(f) percent(p) + * restore + + gtools_head greshape subcommand list, i(i) j(j) [options] + disp " greshape wide varlist, i(i) j(j) [options]" + disp " greshape long prefixlist, i(i) [j(j) string options]" _n(1) + disp " greshape spread varlist, j(j) [options]" + disp " greshape gather varlist, j(j) value(value) [options]" _n(1) + + gtools_cmd gen j = _n + gtools_cmd greshape wide f p, i(foreign) j(j) + gtools_cmd greshape long f p, i(foreign) j(j) + gtools_cmd + gtools_cmd greshape spread f p, j(j) + gtools_cmd greshape gather f? p?, j(j) value(fp) + + gtools_head gstats transform (stat) out = src [(stat) out = src ...] [if] [if] [weight], by(varlist) [options] + disp " gstats range (stat) out = src [...] [if] [if] [weight], by(varlist) [options]" + disp " gstats moving (stat) out = src [...] 
[if] [if] [weight], by(varlist) [options]" + + gtools_cmd sysuse auto, clear + gtools_cmd gstats transform (normalize) price (demean) price (range mean -sd sd) price, auto + gtools_cmd gstats range (mean) mean_r = price (sd) sd_r = price, interval(-10 10 mpg) + gtools_cmd gstats moving (mean) mean_m = price (sd) sd_m = price, by(foreign) window(-5 5) +end + +capture program drop gtools_head +program gtools_head + gettoken cmd _: 0 + disp _n(1) `"`cmd'"' _n(1) `"{hline `=length(`"`cmd'"')'}"' _n(2) `"`0'"' _n(1) +end + +capture program drop gtools_cmd +program gtools_cmd + disp `"`0'"' + * disp `"{stata `0'}"' + * `0' + * disp "" +end + +if ( inlist("`c(os)'", "MacOSX") | strpos("`c(machine_type)'", "Mac") ) local c_os_ macosx +else local c_os_: di lower("`c(os)'") + +if ( `c(stata_version)' < 14.1 ) local spiver v2 +else local spiver v3 diff --git a/01.code/ado/g/gtools.sthlp b/01.code/ado/g/gtools.sthlp new file mode 100755 index 0000000..b42b8aa --- /dev/null +++ b/01.code/ado/g/gtools.sthlp @@ -0,0 +1,239 @@ +{smcl} +{* *! version 1.10.1 05Dec2022}{...} +{viewerdialog gtools "dialog gtools"}{...} +{vieweralsosee "[R] gtools" "mansection R gtools"}{...} +{viewerjumpto "Syntax" "gtools##syntax"}{...} +{viewerjumpto "Description" "gtools##description"}{...} +{viewerjumpto "Options" "gtools##options"}{...} +{viewerjumpto "Examples" "gtools##examples"}{...} +{title:Title} + +{p2colset 5 18 23 2}{...} +{p2col :{cmd:gtools} {hline 2}}Manage {opt gtools} package installation.{p_end} +{p2colreset}{...} + +{pstd} +{it:Important}: Please run {stata gtools, upgrade} to update +{cmd:gtools} to the latest stable version. + +{pstd} +{opt gtools} is a suite of commands that use hashes for a speedup +over traditional stata commands. Syntax is largely analogous to each +command's Stata counterparts. 
The following are available as part of +gtools (also see the {help gtools##examples:examples} below): + +{p 8 17 2} +{manhelp gcollapse R:gcollapse} and {manhelp gcontract R:gcontract} as {opt collapse} and {opt contract} replacements. {p_end} + +{p 8 17 2} +{manhelp gquantiles R:gquantiles} as {opt pctile}, {opt xtile}, and {opt _pctile} replacements. {manhelp fasterxtile R:fasterxtile} is also provided as an alias +{p_end} + +{p 8 17 2} +{manhelp gegen R:gegen} as a {opt egen} alternative. {p_end} + +{p 8 17 2} +{manhelp gisid R:gisid} as an {opt isid} replacement. {p_end} + +{p 8 17 2} +{manhelp gduplicates R:gduplicates} as a {opt duplicates} replacement. {p_end} + +{p 8 17 2} +{manhelp glevelsof R:glevelsof} as a {opt levelsof} replacement. {p_end} + +{p 8 17 2} +{manhelp gtoplevelsof R:gtoplevelsof} ({opt gtop}): Frequency count of top levels of a {opt varlist}. {p_end} + +{p 8 17 2} +{manhelp gunique R:gunique} and {manhelp gdistinct R:gdistinct}: Count unique levels of a set of variables. {p_end} + +{p 8 17 2} +{manhelp gstats R:gstats}: Wrapper for several statistical functions and transformations. {p_end} + +{p 8 17 2} +{manhelp hashsort R:hashsort}: (Experimental) Hash-based sorting. {p_end} + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{cmd:gtools} +[{cmd:,} {it:{help gtools##table_options:options}}] + +{synoptset 15 tabbed}{...} +{marker table_options}{...} +{synopthdr} +{synoptline} +{syntab :Options} +{synopt :{opt u:pgrade}}Install latest version from Github. +{p_end} +{synopt :{opt i:nstall_latest}}Alias for {opt upgrade}. +{p_end} +{synopt :{opt license:s}}Prints the open source projects used in gtools +{p_end} +{synopt :{opt v:erbose}}With {opt licenses}, prints the licenses of the open source projects used in gtools +{p_end} +{synopt :{opt examples}}Print examples of how to use various gtools functions. +{p_end} +{synopt :{opt showcase}}Alias for {opt examples}. 
+{p_end} +{synopt :{bf:test[({it:tests})]}}Run unit tests, optionally specifying which tests to run. +{p_end} +{synopt :{opth branch(str)}}Github branch to use (default is master). +{p_end} + + +{synoptline} +{p2colreset}{...} +{p 4 6 2} + +{marker description}{...} +{title:Description} + +{pstd} +{opt gtools} is a Stata package that provides a fast implementation of +common commands like collapse, egen, xtile, isid, levelsof, contract, +distinct, and so on using C plugins for a massive speed improvement. + +{pstd} +This program helps the user manage their gtools installation. While +unnecessary in Linux or OSX, when trying to compile the plugin on Windows +it became apparent that I would need to include a DLL with the package +(in particular the DLL for the hash library). While I try to do this +automatically, I ran into enough problems while developing the plugin that I +felt compelled to include this program. + +{marker options}{...} +{title:Options} + +{phang} +{opt upgrade} Upgrades {opt gtools} to the latest github version. + +{phang} +{opt install_latest} Alias for {opt upgrade}. + +{phang} +{opt license} Prints the open source projects used in {cmd:gtools}. With +{opt verbose} it also prints the licenses. + +{phang} +{opt examples} (alias {opt showcase}) prints examples of how to use +various gtools functions. + +{phang} +{bf:test[({it:tests})]} Run unit tests, optionally specifying which tests +to run. Tests available are: dependencies, basic_checks, bench_test, +comparisons, switches, bench_full. A good set of "small" tests which +take 10-20 minutes are {cmd: dependencies basic_checks bench_test}. +By default, however, the first 5 tests are run, which take 1-3h. The +bulk of that time is from {bf:comparisons}, which compares the results +from gtools to that of various native counterparts under several +different conditions. {bf:bench_full} is not run by default because this +benchmarks gtools against stata using modestly-sized data (millions).
+Some stata commands are very slow under some of the benchmarks, meaning +this can take well over a day. + +{phang} +{opth branch(str)} Github branch to use (default is master). + +{marker examples}{...} +{title:Examples} + +{p 4 4 2}{stata sysuse auto, clear}{p_end} + +{p 4 4 2}{it:gstats {sum|tab} varlist [if] [in] [weight], [by(varlist) options]}{p_end} + +{p 8 4 2}{stata gstats sum price [pw = gear_ratio / 4] }{p_end} +{p 8 4 2}{stata gstats tab price mpg, by(foreign) matasave }{p_end} + +{p 4 4 2}{it:gquantiles [newvarname =] exp [if] [in] [weight], {_pctile|xtile|pctile} [options]}{p_end} + +{p 8 4 2}{stata gquantiles 2 * price, _pctile nq(10) }{p_end} +{p 8 4 2}{stata gquantiles p10 = 2 * price, pctile nq(10) }{p_end} +{p 8 4 2}{stata gquantiles x10 = 2 * price, xtile nq(10) by(rep78) }{p_end} +{p 8 4 2}{stata fasterxtile xx = log(price) [w = weight], cutpoints(p10) by(foreign)}{p_end} + +{p 4 4 2}{it:gstats winsor varlist [if] [in] [weight], [by(varlist) cuts(# #) options]}{p_end} + +{p 8 4 2}{stata gstats winsor price gear_ratio mpg, cuts(5 95) s(_w1) }{p_end} +{p 8 4 2}{stata gstats winsor price gear_ratio mpg, cuts(5 95) by(foreign) s(_w2) }{p_end} + +{p 4 4 2}{it:hashsort varlist, [options] }{p_end} + +{p 8 4 2}{stata hashsort -make }{p_end} +{p 8 4 2}{stata hashsort foreign -rep78, benchmark verbose mlast}{p_end} + +{p 4 4 2}{it:gegen target = stat(source) [if] [in] [weight], by(varlist) [options]}{p_end} + +{p 8 4 2}{stata gegen tag = tag(foreign) }{p_end} +{p 8 4 2}{stata gegen group = tag(-price make) }{p_end} +{p 8 4 2}{stata gegen p2_5 = pctile(price) [w = weight], by(foreign) p(2.5) }{p_end} + +{p 4 4 2}{it:gisid varlist [if] [in], [options] }{p_end} + +{p 8 4 2}{stata gisid make, missok }{p_end} +{p 8 4 2}{stata gisid price in 1 / 2 }{p_end} + +{p 4 4 2}{it:gduplicates varlist [if] [in], [options gtools(gtools_options)] }{p_end} + +{p 8 4 2}{stata gduplicates report foreign }{p_end} +{p 8 4 2}{stata gduplicates report rep78 if foreign,
gtools(bench(3)) }{p_end} + +{p 4 4 2}{it:glevelsof varlist [if] [in], [options] }{p_end} + +{p 8 4 2}{stata glevelsof rep78, local(levels) sep(" | ") }{p_end} +{p 8 4 2}{stata glevelsof foreign mpg if price < 4000, loc(lvl) sep(" | ") colsep(", ") }{p_end} +{p 8 4 2}{stata glevelsof foreign mpg in 10 / 70, gen(uniq_) nolocal }{p_end} + +{p 4 4 2}{it:gtop varlist [if] [in] [weight], [options] }{p_end} +{p 4 4 2}{it:gtoplevelsof varlist [if] [in] [weight], [options] }{p_end} + +{p 8 4 2}{stata gtoplevelsof foreign rep78 }{p_end} +{p 8 4 2}{stata gtop foreign rep78 [w = weight], ntop(5) missrow groupmiss pctfmt(%6.4g) colmax(3) }{p_end} + +{p 4 4 2}{it:gcollapse (stat) out = src [(stat) out = src ...] [if] [if] [weight], by(varlist) [options]}{p_end} + +{p 8 4 2}{stata gen h1 = headroom }{p_end} +{p 8 4 2}{stata gen h2 = headroom }{p_end} +{p 8 4 2}{stata pretty# #sourcelabel#) }{p_end} + +{p 8 4 2}{stata gcollapse (mean) mean = price (median) p50 = gear_ratio, by(make) merge v }{p_end} +{p 8 4 2}{stata disp "`:var label mean', `:var label p50'" }{p_end} +{p 8 4 2}{stata gcollapse (iqr) irq? = h? (nunique) turn (p97.5) mpg, by(foreign rep78) bench(2) wild }{p_end} + +{p 4 4 2}{it:gcontract varlist [if] [if] [fweight], [options]}{p_end} + +{p 8 4 2}{stata gcontract foreign [fw = turn], freq(f) percent(p)}{p_end} + +{p 4 4 2}{it:greshape subcommand list, i(i) j(j) [options]}{p_end} + +{p 8 4 2}{stata gen j = _n }{p_end} +{p 8 4 2}{stata greshape wide f p, i(foreign) j(j) }{p_end} +{p 8 4 2}{stata greshape long f p, i(foreign) j(j) }{p_end} +{p 8 4 2}{stata greshape spread f p, j(j) }{p_end} +{p 8 4 2}{stata greshape gather f? 
p?, j(j) value(fp) }{p_end} + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gtools} is maintained at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} diff --git a/01.code/ado/g/gtop.ado b/01.code/ado/g/gtop.ado new file mode 100755 index 0000000..486a07d --- /dev/null +++ b/01.code/ado/g/gtop.ado @@ -0,0 +1,36 @@ +*! version 1.2.0 23Mar2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Calculate the top groups by count of a varlist (jointly). 
+ +cap program drop gtop +program gtop, rclass + version 13.1 + + local 00 `0' + gtoplevelsof `0' + if ( ${GTOP_RC} ) { + global GTOP_RC + exit 0 + } + local 0 `00' + + qui syntax [anything] [if] [in] [aw fw pw], [LOCal(str) MATrix(str) *] + if ( "`local'" != "" ) c_local `local' `"`r(levels)'"' + if ( "`matrix'" != "" ) matrix `matrix' = r(toplevels) + return local levels `"`r(levels)'"' + return scalar N = r(N) + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) + return scalar alpha = r(alpha) + return scalar ntop = r(ntop) + return scalar nrows = r(nrows) + + if ( `"`r(matalevels)'"' == "" ) { + tempname gmat + matrix `gmat' = r(toplevels) + return matrix toplevels = `gmat' + } + else { + return local matalevels = `"`r(matalevels)'"' + } +end diff --git a/01.code/ado/g/gtop.sthlp b/01.code/ado/g/gtop.sthlp new file mode 100755 index 0000000..4b3e235 --- /dev/null +++ b/01.code/ado/g/gtop.sthlp @@ -0,0 +1,412 @@ +{smcl} +{* *! version 1.2.0 20Mar2019}{...} +{vieweralsosee "[P] gtoplevelsof" "mansection P gtoplevelsof"}{...} +{viewerjumpto "Syntax" "gtoplevelsof##syntax"}{...} +{viewerjumpto "Description" "gtoplevelsof##description"}{...} +{viewerjumpto "Options" "gtoplevelsof##options"}{...} +{viewerjumpto "Remarks" "gtoplevelsof##remarks"}{...} +{viewerjumpto "Stored results" "gtoplevelsof##results"}{...} +{title:Title} + +{p2colset 5 23 23 2}{...} +{p2col :{cmd:gtoplevelsof} {hline 2}}Quickly tabulate most common levels of variable list.{p_end} +{p2colreset}{...} + + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt gtop:levelsof} +{varlist} +{ifin} +[{it:{help gtoplevelsof##weight:weight}}] +[{cmd:,} {it:options}] + +{synoptset 24 tabbed}{...} +{synopthdr} +{synoptline} +{syntab :Summary Options} +{synopt:{opth ntop(int)}} Display {opt ntop} most common levels (negative shows least common; {opt .} shows every level).{p_end} +{synopt:{opth freqabove(int)}} Only count freqs above this level.{p_end} +{synopt:{opth 
pctabove(real)}} Only count freqs that represent at least % of the total.{p_end} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GtoolsByLevels}){p_end} + +{syntab :Toggles} +{synopt:{opt missrow}} Add row with count of missing values.{p_end} +{synopt:{opt groupmiss:ing}} Count rows with any variable missing as missing.{p_end} +{synopt:{opt nomiss:ing}} Case-wise exclude rows with missing values from frequency count.{p_end} +{synopt:{opt nooth:er}} Do not group rest of levels into "other" row.{p_end} +{synopt:{opt nong:roups}} Do not specify number of groups in "other" row.{p_end} +{synopt:{opt alpha}} Sort the top levels of varlist by variables instead of frequencies.{p_end} +{synopt:{opt silent}} Do not display the top levels of varlist.{p_end} + +{syntab :Display Options} +{synopt:{opth pctfmt(format)}} Format for percentages.{p_end} +{synopt:{opth oth:erlabel(str)}} Specify label for row with "other" count.{p_end} +{synopt:{opth missrow:label(str)}} Specify the label for the row with "missing" count.{p_end} +{synopt:{opth varabb:rev(int)}} Abbreviate variables (which are displayed as a header to their levels) .{p_end} +{synopt:{opth colmax(numlist)}} Specify width limit for levels (can be single number of variable-specific).{p_end} +{synopt:{opth colstrmax(numlist)}} Specify width limit for string variables (can be single number of variable-specific).{p_end} +{synopt:{opt cols:eparate(separator)}} Column separator; default is double blank " ".{p_end} +{synopt:{opth numfmt(format)}} Format for numeric variables. 
Default is {opt %.8g} (or {opt %16.0g} with {opt matasave}).{p_end} +{synopt:{opt novaluelab:els}} Do not replace numeric variables with their value labels.{p_end} +{synopt:{opt hidecont:levels}} If a level is repeated in the subsequent row, display a blank.{p_end} + +{syntab :levelsof Options} +{synopt:{opt l:ocal(macname)}}insert top levels in the local macro {it:macname}{p_end} +{synopt:{opt s:eparate(separator)}}separator for the values of returned list; default is a space{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. +{p_end} + +{synoptline} +{p2colreset}{...} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed, in which +case the top levels by weight are printed (see {manhelp weight U:11.1.6 weight}) +{p_end} + + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gtoplevelsof} (alias {cmd:gtop}) displays a table with the +frequency counts, percentages, and cummulative counts and %s of the +most common levels of {varlist} that occur in the data. It is similar +to the user-written {cmd:group} with the {opt select} otpion or to +{opt contract} after keeping only the largest frequency counts. + +{pstd} +Unlike contract, it does not modify the original data and instead prints the +resulting table to the console. It also stores a matrix with the frequency +counts and stores the levels in the macro {opt r(levels)}. 
+ +{pstd} +{opt gtoplevelsof} is part of the {manhelp gtools R:gtools} project. + + +{marker options}{...} +{title:Options} + +{dlgtab:Summary Options} + +{phang} +{opth ntop(int)} Number of levels to display. This can be negative; +in that case, the smallest frequencies are displayed. Note cumulative +percentages and counts are computed within each generated table, +so for the smallest groups the table would display the cumulative +count for those frequencies, in descending order. {opt .} displays +every level from most to least frequent; {opt -.} displays every level +from least to most frequent. + +{phang} +{opth freqabove(int)} Skip frequencies below this level when determining the +largest levels. So if this is 10, only frequencies above 10 will be displayed +as part of the top frequencies. If every frequency that would be displayed is +above this level then this option has no effect. + +{phang} +{opth pctabove(real)} Skip frequencies that are a smaller percentage of the +data than {opt pctabove}. If this is 10, then only frequencies that represent +at least 10% of all observations are displayed as part of the top frequencies. +If every frequency that would be displayed is at least this percentage of the +data then this option has no effect. + +{phang} +{opt mata:save}[{cmd:(}{it:str}{cmd:)}] Save results in mata object (default +name is {bf:GtoolsByLevels}). See {opt GtoolsByLevels.desc()} for more. +This object contains the raw variable levels in {opt numx} and {opt charx} +(since mata does not allow matrices of mixed-type). The levels are saved +as a string in {opt printed} (with value labels correctly applied) unless +option {opt silent} is also specified. Last, the frequencies matrix is saved +in {opt toplevels}. + +{dlgtab:Toggles} + +{phang}{opt missrow} Add row with count of missing values. By default, +missing rows are treated as another group and will be displayed as part +of the top levels. 
With multiple variables, only rows with all values +missing are counted here unless {opt groupmissing} is also passed. If +this option is specified then a row is printed after the top levels +with the frequency count of missing rows. + +{phang}{opt groupmissing} This option specifies that a missing row is a +row where any of the variables have a missing value. See {opt missrow}. + +{phang}{opt nomissing} Case-wise exclude rows with missing values from +frequency count. By default missing values are treated as another level. + +{phang}{opt noother} By default a row is printed after the top levels +with the frequency count from groups not in the top levels and not +counted as missing. This option toggles display of that row. + +{phang}{opt nongroups} By default the number of groups comprising the +"Other" and "Missing" rows are printed as part of the "Other" and +"Missing" row labels (should they appear; for the missing row this +is only printed if more than 1 missing value type is present). This +option toggles display of the number of groups represented. + +{phang}{opt alpha} Sort the top levels of varlist by variables instead +of frequencies. Note that the top levels are still extracted; this just +affects the final sort order. To sort in inverse order, just pass +{opt gtop -var1 -var2 ...}. + +{phang}{opt silent} Do not display the top levels of varlist. With +option {opt matasave} it also does not store the printed levels in a +separate string matrix. + +{dlgtab:Display Options} + +{phang}{opth pctfmt(format)} Print format for percentage columns. + +{phang}{opth otherlabel(str)} Specify label for row with the count of the +rest of the levels. + +{phang}{opth missrowlabel(str)} Specify the label for the row the count of +the "missing" levels. + +{phang}{opth varabbrev(int)} Variables names are displayed above their +groups. This option specifies that variables should be abbreviated to at +most {opt varabbrev} characters. This is ignored if it is smaller than 5. 
+ +{phang}{opth colmax(numlist)} Specify width limit for levels (can be a single +number or variable-specific). + +{phang}{opth colstrmax(numlist)} Specify width limit for string variables (can +be a single number or variable-specific). This overrides {opt colmax} for strings +and allows the user to specify string and number widths separately. (Also see +{opth numfmt(format)}) + +{phang}{opth numfmt(format)} Format for numeric variables. Default is {opt %.8g} +(or {opt %16.0g} with {opt matasave}). By default the number levels are formatted +in C, so this must be a valid format for the C internal {opt printf}. The syntax +is very similar to mata's {opt printf}. Some examples are: %.2f, %10.6g, %5.0f, and +so on. With option {opt matasave} these are formatted in mata, and the format can +be any mata number format. + +{phang}{opt colseparate(separator)} Column separator; default is double blank " ". + +{phang}{opt novaluelabels} Do not replace numeric variables with their value +labels. Value label widths are governed by colmax and NOT colstrmax. + +{phang}{opt hidecontlevels} If a level is repeated in the subsequent row, +display a blank. This is only done if both observations are within the same +outer level. + +{dlgtab:levelsof Options} + +{phang} +{cmd:local(}{it:macname}{cmd:)} inserts the list of levels in local macro +{it:macname} within the calling program's space. Hence, that macro will +be accessible after {cmd:gtoplevelsof} has finished. This is helpful for +subsequent use. Note this uses {opt colseparate} to separate columns. The +default is " " so be careful when parsing! Rows are enclosed in double quotes +(`""') so parsing is possible, just not trivial. + +{phang} +{cmd:separate(}{it:separator}{cmd:)} specifies a separator +to serve as punctuation for the values of the returned list. +The default is a space. A useful alternative is a comma. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. 
The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + + +{marker remarks}{...} +{title:Remarks} + +{pstd} +{cmd:gtoplevelsof} has the main function of displaying the most common levels +of {it:varlist}. While {opt tab} is great, it cannot handle a large number +of levels, and it prints ALL the levels in alphabetical order. 
+ +{pstd} +Very often when exploring data I just want to have a quick look at the largest +levels of a variable that may have thousands of levels in a data set with +millions of rows. {opt gcontract} and {opt gcollapse} are great but they +modify the original data and doing a lot of subsequent preserve, sort, restore +gets very slow very fast. + +{pstd} +I have found this command extremely helpful when exploring big data. +Especially if a string is not clean, then having a look at the largest +values or the largest values that match a pattern is very helpful. + + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gtoplevelsof/index.html#examples":online documentation} +for more examples. + +{phang}{cmd:. sysuse auto}{p_end} +{phang}{cmd:. gtoplevelsof rep78}{p_end} +{phang}{cmd:. gtoplevelsof rep78, missrow local(toplevels)}{p_end} +{phang}{cmd:. gtop rep78, colsep(", ")}{p_end} +{phang}{cmd:. gtop foreign rep78, ntop(3) missrow}{p_end} + + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gtoplevelsof} stores the following in {cmd:r()}: + +{synoptset 15 tabbed}{...} +{p2col 5 15 19 2: Macros}{p_end} +{synopt:{cmd:r(levels)}}list of top (most common) levels (rows); not with {opt matasave}{p_end} +{synopt:{cmd:r(matalevels)}}name of GtoolsByLevels mata object; only with {opt matasave}{p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(minJ) }} smallest group size {p_end} +{synopt:{cmd:r(maxJ) }} largest group size {p_end} +{synopt:{cmd:r(ntop) }} number of top levels {p_end} +{synopt:{cmd:r(nrows)}} number of rows in {opt toplevels} {p_end} +{synopt:{cmd:r(alpha)}} sorted by levels instead of frequencies {p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} 
+{synopt:{cmd:r(toplevels)}}Table with frequency counts and percentages.{p_end} +{p2colreset}{...} + +{pstd} The missing and other rows are stored in the matrix with IDs 2 and 3, +respectively. With {opt matasave}, the following data is stored in {opt GtoolsByLevels}: + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + real rowvector charpos + position of kth character variable + + string matrix printed + formatted (printf-ed) variable levels (not with option -silent-) + + real matrix toplevels + frequencies of top levels; missing and other rows stored with ID 2 and 3. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gtoplevelsof} is maintained as part of {it:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. 
+{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + + +{title:Also see} + +{p 4 13 2} +help for +{help gcontract}, +{help glevelsof}, +{help gtools}; +{help flevelsof} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/g/gtoplevelsof.ado b/01.code/ado/g/gtoplevelsof.ado new file mode 100755 index 0000000..266f5fc --- /dev/null +++ b/01.code/ado/g/gtoplevelsof.ado @@ -0,0 +1,420 @@ +*! version 1.2.0 23Mar2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! Calculate the top groups by count of a varlist (jointly). + +cap program drop gtoplevelsof +program gtoplevelsof, rclass + global GTOP_RC 0 + version 13.1 /* Tabulates the most common levels of a varlist: parses the options, calls _gtools_internal with gfunction(top), prints the table and posts the results in r(). The global GTOP_RC reset above is the status flag that the gtop wrapper reads after the call. */ + + if ( `=_N < 1' ) { + global GTOP_RC 17001 + di as txt "no observations" + exit 0 /* An empty data set is flagged through GTOP_RC and is not treated as an error. */ + } + + global GTOOLS_CALLER gtoplevelsof + syntax anything /// + [if] [in] /// [if condition] [in start / end] + [aw fw pw] , /// [weight type = exp] + [ /// + ntop(str) /// Number of levels to display + freqabove(real 0) /// Only include above this count + pctabove(real 0) /// Only include above this pct + /// + alpha /// Sort top levels by level, not by freq + noOTHer /// Do not add summary row with "other" group to table + noNGroups /// Do not add number of groups to "Other" row + missrow /// Include missings as a separate row + GROUPMISSing /// Count as missing if any variable is missing + noMISSing /// Exclude missing values + NODS DS /// Parse - as varlist (ds) or negative (nods) + silent /// Do not try to print the levels + /// + MATAsave /// Save results in mata + MATAsavename(str) /// mata save name + OTHerlabel(str) /// Label for "other" row + MISSROWlabel(str) /// Label for "missing" row + pctfmt(str) /// How to format percentages + /// + noVALUELABels /// Do (not) map value labels + HIDECONTlevels /// Hide level name if previous level is the same + VARABBrev(int -1) /// Abbrev print of var 
names + colmax(numlist) /// Maximum number of characters to print per column + colstrmax(numlist) /// Maximum number of characters to print per column (strings) + numfmt(passthru) /// How to format numbers + /// + Separate(passthru) /// Levels separator + COLSeparate(passthru) /// Columns separator (only with 2+ vars) + Clean /// Clean strings + LOCal(str) /// Store variable levels in local + MATrix(str) /// Store result in matrix + /// + noWARNing /// Do not warn about how tab might sometimes be faster + debug(passthru) /// Print debugging info to console + compress /// Try to compress strL variables + forcestrl /// Force reading strL variables (stata 14 and above only) + Verbose /// debugging + _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix + BENCHmark /// Benchmark function + BENCHmarklevel(int 0) /// Benchmark various steps of the plugin + HASHmethod(passthru) /// Hashing method: 0 (default), 1 (biject), 2 (spooky) + oncollision(passthru) /// On collision, fall back or error + /// + group(str) /// + tag(passthru) /// + counts(passthru) /// + fill(passthru) /// + replace /// + ] + + if ( (`"`matasave'"' != "") & (`"`local'"' != "") ) { + disp as err "Option local() not allowed with option -matasave-" + exit 198 /* local() and matrix() are rejected with either spelling of the matasave option; the four checks here cover each combination. */ + } + + if ( (`"`matasavename'"' != "") & (`"`local'"' != "") ) { + disp as err "Option local() not allowed with option -matasave()-" + exit 198 + } + + if ( (`"`matasave'"' != "") & (`"`matrix'"' != "") ) { + disp as err "Option matrix() not allowed with option -matasave-" + exit 198 + } + + if ( (`"`matasavename'"' != "") & (`"`matrix'"' != "") ) { + disp as err "Option matrix() not allowed with option -matasave()-" + exit 198 + } + + if ( `"`matasavename'"' != "" ) local matasave matasave + if ( `"`matasavename'"' == "" ) local matasavename GtoolsByLevels + + if ( `benchmarklevel' > 0 ) local benchmark benchmark + local benchmarklevel benchmarklevel(`benchmarklevel') + + if ( `"`colseparate'"' == "" ) { + local 
colseparate colseparate(`" "') + } + + if ( `"`pctfmt'"' == "" ) { + local pctfmt `"%5.1f"' + } + + if ( `"`matasave'"' == "" ) { + if ( `"`numfmt'"' == "" ) { + local numfmt numfmt(`"%.8g"') + } + } + else { + if ( `"`numfmt'"' == "" ) { + local numfmt numfmt(`"%16.0g"') + } + } + + if !regexm(`"`pctfmt'"', "%[0-9]+\.[0-9]+(gc?|fc?|e)") { + di as err "Percent format must be %(width).(digits)(f|g); e.g. %5.1f (default), %20.5f" + exit 198 /* The pattern requires both an explicit width and a number of digits, so the examples in the message above must carry both. */ + } + + if ( ("`ds'" != "") & ("`nods'" != "") ) { + di as err "-ds- and -nods- mutually exclusive" + exit 198 + } + + * Get varlist + * ----------- + + if ( `"`anything'"' != "" ) { + local varlist: copy local anything + local varlist: subinstr local varlist "+" " ", all + if ( strpos(`"`varlist'"', "-") & ("`ds'`nods'" == "") ) { + disp as txt "'-' interpreted as negative; use option -ds- to interpret as varlist" + disp as txt "(to suppress this warning, use option -nods-)" + } + if ( "`ds'" != "" ) { + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid varlist: `anything'" + exit 198 + } + cap ds `varlist' + if ( _rc ) { + cap noi ds `varlist' + exit _rc + } + local varlist `r(varlist)' + local anything: copy local varlist + } + else { + local parse: copy local varlist + local varlist: subinstr local varlist "-" " ", all + local varlist `varlist' + if ( "`varlist'" == "" ) { + di as err "Invalid list: `anything'" + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit 198 + } + cap ds `varlist' + if ( _rc ) { + local notfound + foreach var of local varlist { + cap confirm var `var' + if ( _rc ) { + local notfound `notfound' `var' + } + } + if ( `:list sizeof notfound' > 0 ) { + if ( `:list sizeof notfound' > 1 ) { + di as err "Variables not found: `notfound'" + } + else { + di as err "Variable `notfound' not found" + } + } + exit 111 + } + local varlist + local anything + while ( `:list sizeof parse' ) { + gettoken var parse: parse, p(" -") + local neg + if inlist("`var'", "-") { + gettoken var 
parse: parse, p(" -") + local neg - + } + cap ds `var' + if ( _rc ) { + local rc = _rc + di as err "Variable '`var'' does not exist." + di as err "Syntax: [+|-]varname [[+|-]varname ...]" + exit `rc' + } + foreach v of varlist `var' { + local anything `anything' `neg'`v' + local varlist `varlist' `v' + } + } + } + } + if ( "`ds'" == "" ) local nods nods + + * Parse options + * ------------- + + if ( "`missing'" == "nomissing" ) { + if ( ("`missrow'" != "") | ("`groupmissing'" != "") ) { + di as err "-nomissing- not allowed with -groupmissing- or -missrow[()]-" + exit 198 + } + } + local missing = cond("`missing'" == "", "missing", "") + + if ( (`pctabove' < 0) | (`pctabove' > 100) ) { + di as err "-pctabove()- must be between 0 and 100" + exit 198 + } + + local invert + if ( `"`ntop'"' == "" ) { + local ntop 10 + } + else { + cap confirm number `ntop' /* ntop() may be a number or a missing value; a negative number, or a missing value written with a leading minus sign, sets invert below. */ + if ( _rc ) { + cap assert mi(`ntop') + if ( _rc ) { + disp as err "Option -ntop()- must be a number or missing." + exit 198 + } + } + if ( mi(`ntop') ) { + if ( (substr(`"`ntop'"', 1, 1) == "-") ) { + local invert invert + } + } + else { + if ( `ntop' < 0 ) { + local invert invert + } + } + local ntop = `ntop' + } + + local ntop ntop(`ntop') + local pct pct(`pctabove') + local freq freq(`freqabove') + + if ( ("`missrow'" != "") | ("`missrowlabel'" != "") ) { + if ( "`groupmissing'" != "" ) { + if ( "`missrowlabel'" != "" ) { + local groupmiss misslab(`"`missrowlabel'"') groupmiss + } + else { + local missrowlabel Missing (any) + local groupmiss misslab(Missing (any)) groupmiss + } + } + else { + if ( "`missrowlabel'" != "" ) { + local groupmiss misslab(`"`missrowlabel'"') + } + else { + local missrowlabel Missing + local groupmiss misslab(Missing) + } + } + } + + if ( ("`other'" == "") | ("`otherlabel'" != "") ) { + if ( "`otherlabel'" != "" ) { + local otherlab otherlab(`"`otherlabel'"') + } + else { + local otherlabel Other + local otherlab otherlab(Other) + } + } + + local gtop gtop(`ntop' /* + */ `pct' /* + */ 
`groupmiss' /* + */ `otherlab' /* + */ `freq' /* + */ `alpha' /* + */ `invert' /* + */ `matasave' /* + */ `valuelabels' /* + */ `silent' /* + */ matasavename(`matasavename')) + + + if ( `"`weight'"' != "" ) { + tempvar touse w /* With weights, the weight expression is materialized in a temporary variable and the sample is restricted to the observations that mark flags as usable under if, in and the weight. */ + qui gen double `w' `exp' `if' `in' + local wgt `"[`weight'=`w']"' + local weights weights(`weight' `w') + mark `touse' `if' `in' `wgt' + local if if `touse' + } + else local weights + + * Call the internals + * ------------------ + + local opts `clean' `separate' `colseparate' `missing' `gtop' `numfmt' `ds' `nods' + local sopts `compress' `forcestrl' `_ctolerance' + local sopts `sopts' `verbose' `benchmark' `benchmarklevel' + local sopts `sopts' `oncollision' `hashmethod' `debug' + + local gopts gen(`group') `tag' `counts' `fill' `replace' `weights' + cap noi _gtools_internal `anything' `if' `in', `opts' `sopts' `gopts' gfunction(top) + + local rc = _rc + global GTOOLS_CALLER "" + if ( `rc' == 17999 ) { + exit 17000 /* Return code 17999 from the internals is re-thrown as 17000; 17001 (no observations) is reported as a note and flagged in GTOP_RC; any other non-zero code is passed through unchanged. */ + } + else if ( `rc' == 17001 ) { + global GTOP_RC 17001 + di as txt "(no observations)" + exit 0 + } + else if ( `rc' ) { + exit `rc' + } + + local byvars = `"`r(byvars)'"' + local bynum = `"`r(bynum)'"' + local bystr = `"`r(bystr)'"' + + tempname invertmat + mata: `invertmat' = st_matrix("r(invert)") + + local abbrev = `varabbrev' + if ( `abbrev' == -1 ) { + foreach v of local varlist { + local abbrev = max(`abbrev', length("`v'")) + } + } + + local k = 0 + local abbrevlist "" + foreach v of local varlist { + local ++k + local abbrev = max(`abbrev', 5) + mata: st_local("invert", strofreal(`invertmat'[`k'])) + if ( `invert' ) { + local avar `:di %`abbrev's abbrev("`v'", `abbrev')' + local abbrevlist `abbrevlist' -`avar' + } + else { + local abbrevlist `abbrevlist' `:di %`abbrev's abbrev("`v'", `abbrev')' + } + } + + tempname gmat + if ( `"`silent'"' == "" ) { + if ( `"`matasave'"' == "" ) { + mata: GtoolsGtopPrintTop( /* + */ `:list sizeof varlist', /* + */ tokens("`abbrevlist'"), /* + */ __gtools_top_matrix, /* + */ 
__gtools_top_num, /* + */ "", /* + */ 0) + } + else { + mata: GtoolsGtopPrintTop( /* + */ `:list sizeof varlist', /* + */ tokens("`abbrevlist'"), /* + */ `matasavename'.toplevels, /* + */ `matasavename'.numx, /* + */ `matasavename'.printed, /* + */ 1) + } + } + else { + if ( `"`matasave'"' == "" ) { + cap mata st_matrix(`"`gmat'"', /* + */ __gtools_top_matrix[ /* + */ selectindex(__gtools_top_matrix[., 1] :!= 0), .]) + } + } + + if ( `"`_post_msg_gtop_matanote'"' != "" ) { + disp as txt `"`_post_msg_gtop_matanote'"' + } + + if ( `"`_post_msg_gtop_matawarn'"' != "" ) { + disp as err `"`_post_msg_gtop_matawarn'"' + } + + if ( `"`matasave'"' == "" ) { + mata st_local("vals", st_global("r(levels)")) + matrix colnames `gmat' = ID N Cum Pct PctCum + if ( "`local'" != "" ) c_local `local': copy local vals + if ( "`matrix'" != "" ) matrix `matrix' = `gmat' + return local levels: copy local vals + return matrix toplevels = `gmat' + } + else { + return local matalevels `"`matasavename'"' + } + + return scalar N = r(N) /* This and the following scalars are copied from the r() results still in memory at this point (presumably those left by _gtools_internal; confirm against that program). */ + return scalar J = r(J) + return scalar minJ = r(minJ) + return scalar maxJ = r(maxJ) + return scalar alpha = r(alpha) + return scalar ntop = r(ntop) + return scalar nrows = r(nrows) + + cap mata: mata drop __gtools_top_matrix + cap mata: mata drop __gtools_top_num + cap mata: mata drop `invertmat' + + * if ( `c(MP)' & (`r(J)' < 11) & ("`warning'" != "nowarning") & (`:list sizeof varlist' == 1) ) { + * disp as txt "(Note: {cmd:tab} can be faster than {cmd:gtop} with few groups.)" + * } +end diff --git a/01.code/ado/g/gtoplevelsof.sthlp b/01.code/ado/g/gtoplevelsof.sthlp new file mode 100755 index 0000000..4b3e235 --- /dev/null +++ b/01.code/ado/g/gtoplevelsof.sthlp @@ -0,0 +1,412 @@ +{smcl} +{* *! 
version 1.2.0 20Mar2019}{...} +{vieweralsosee "[P] gtoplevelsof" "mansection P gtoplevelsof"}{...} +{viewerjumpto "Syntax" "gtoplevelsof##syntax"}{...} +{viewerjumpto "Description" "gtoplevelsof##description"}{...} +{viewerjumpto "Options" "gtoplevelsof##options"}{...} +{viewerjumpto "Remarks" "gtoplevelsof##remarks"}{...} +{viewerjumpto "Stored results" "gtoplevelsof##results"}{...} +{title:Title} + +{p2colset 5 23 23 2}{...} +{p2col :{cmd:gtoplevelsof} {hline 2}}Quickly tabulate most common levels of variable list.{p_end} +{p2colreset}{...} + + +{marker syntax}{...} +{title:Syntax} + +{p 8 17 2} +{opt gtop:levelsof} +{varlist} +{ifin} +[{it:{help gtoplevelsof##weight:weight}}] +[{cmd:,} {it:options}] + +{synoptset 24 tabbed}{...} +{synopthdr} +{synoptline} +{syntab :Summary Options} +{synopt:{opth ntop(int)}} Display {opt ntop} most common levels (negative shows least common; {opt .} shows every level).{p_end} +{synopt:{opth freqabove(int)}} Only count freqs above this level.{p_end} +{synopt:{opth pctabove(real)}} Only count freqs that represent at least % of the total.{p_end} +{synopt:{opt mata:save}[{cmd:(}{it:str}{cmd:)}]}Save results in mata object (default name is {bf:GtoolsByLevels}){p_end} + +{syntab :Toggles} +{synopt:{opt missrow}} Add row with count of missing values.{p_end} +{synopt:{opt groupmiss:ing}} Count rows with any variable missing as missing.{p_end} +{synopt:{opt nomiss:ing}} Case-wise exclude rows with missing values from frequency count.{p_end} +{synopt:{opt nooth:er}} Do not group rest of levels into "other" row.{p_end} +{synopt:{opt nong:roups}} Do not specify number of groups in "other" row.{p_end} +{synopt:{opt alpha}} Sort the top levels of varlist by variables instead of frequencies.{p_end} +{synopt:{opt silent}} Do not display the top levels of varlist.{p_end} + +{syntab :Display Options} +{synopt:{opth pctfmt(format)}} Format for percentages.{p_end} +{synopt:{opth oth:erlabel(str)}} Specify label for row with "other" count.{p_end} 
+{synopt:{opth missrow:label(str)}} Specify the label for the row with "missing" count.{p_end} +{synopt:{opth varabb:rev(int)}} Abbreviate variables (which are displayed as a header to their levels) .{p_end} +{synopt:{opth colmax(numlist)}} Specify width limit for levels (can be single number of variable-specific).{p_end} +{synopt:{opth colstrmax(numlist)}} Specify width limit for string variables (can be single number of variable-specific).{p_end} +{synopt:{opt cols:eparate(separator)}} Column separator; default is double blank " ".{p_end} +{synopt:{opth numfmt(format)}} Format for numeric variables. Default is {opt %.8g} (or {opt %16.0g} with {opt matasave}).{p_end} +{synopt:{opt novaluelab:els}} Do not replace numeric variables with their value labels.{p_end} +{synopt:{opt hidecont:levels}} If a level is repeated in the subsequent row, display a blank.{p_end} + +{syntab :levelsof Options} +{synopt:{opt l:ocal(macname)}}insert top levels in the local macro {it:macname}{p_end} +{synopt:{opt s:eparate(separator)}}separator for the values of returned list; default is a space{p_end} + +{syntab:Gtools} +{synopt :{opt compress}}Try to compress strL to str#. +{p_end} +{synopt :{opt forcestrl}}Skip binary variable check and force gtools to read strL variables. +{p_end} +{synopt :{opt v:erbose}}Print info during function execution. +{p_end} +{synopt :{cmd:bench}[{cmd:(}{int}{cmd:)}]}Benchmark various steps of the plugin. Optionally specify depth level. +{p_end} +{synopt :{opth hash:method(str)}}Hash method (default, biject, or spooky). Intended for debugging. +{p_end} +{synopt :{opth oncollision(str)}}Collision handling (fallback or error). Intended for debugging. 
+{p_end} + +{synoptline} +{p2colreset}{...} + +{marker weight}{...} +{p 4 6 2} +{opt aweight}s, {opt fweight}s, and {opt pweight}s are allowed, in which +case the top levels by weight are printed (see {manhelp weight U:11.1.6 weight}) +{p_end} + + +{marker description}{...} +{title:Description} + +{pstd} +{cmd:gtoplevelsof} (alias {cmd:gtop}) displays a table with the +frequency counts, percentages, and cumulative counts and %s of the +most common levels of {varlist} that occur in the data. It is similar +to the user-written {cmd:group} with the {opt select} option or to +{opt contract} after keeping only the largest frequency counts. + +{pstd} +Unlike contract, it does not modify the original data and instead prints the +resulting table to the console. It also stores a matrix with the frequency +counts and stores the levels in the macro {opt r(levels)}. + +{pstd} +{opt gtoplevelsof} is part of the {manhelp gtools R:gtools} project. + + +{marker options}{...} +{title:Options} + +{dlgtab:Summary Options} + +{phang} +{opth ntop(int)} Number of levels to display. This can be negative; +in that case, the smallest frequencies are displayed. Note cumulative +percentages and counts are computed within each generated table, +so for the smallest groups the table would display the cumulative +count for those frequencies, in descending order. {opt .} displays +every level from most to least frequent; {opt -.} displays every level +from least to most frequent. + +{phang} +{opth freqabove(int)} Skip frequencies below this level when determining the +largest levels. So if this is 10, only frequencies above 10 will be displayed +as part of the top frequencies. If every frequency that would be displayed is +above this level then this option has no effect. + +{phang} +{opth pctabove(real)} Skip frequencies that are a smaller percentage of the +data than {opt pctabove}. 
If this is 10, then only frequencies that represent +at least 10% of all observations are displayed as part of the top frequencies. +If every frequency that would be displayed is at least this percentage of the +data then this option has no effect. + +{phang} +{opt mata:save}[{cmd:(}{it:str}{cmd:)}]Save results in mata object (default +name is {bf:GtoolsByLevels}). See {opt GtoolsByLevels.desc()} for more. +This object contains the raw variable levels in {opt numx} and {opt charx} +(since mata does not allow matrices of mixed-type). The levels are saved +as a string in {opt printed} (with value labels correctly applied) unless +option {opt silent} is also specified. Last, the frequencies matrix is saved +in {opt toplevels}. + +{dlgtab:Toggles} + +{phang}{opt missrow} Add row with count of missing values. By default, +missing rows are treated as another group and will be displayed as part +of the top levels. With multiple variables, only rows with all values +missing are counted here unless {opt groupmissing} is also passed. If +this option is specified then a row is printed after the top levels +with the frequency count of missing rows. + +{phang}{opt groupmissing} This option specifies that a missing row is a +row where any of the variables have a missing value. See {opt missrow}. + +{phang}{opt nomissing} Case-wise exclude rows with missing values from +frequency count. By default missing values are treated as another level. + +{phang}{opt noother} By default a row is printed after the top levels +with the frequency count from groups not in the top levels and not +counted as missing. This option toggles display of that row. + +{phang}{opt nongroups} By default the number of groups comprising the +"Other" and "Missing" rows are printed as part of the "Other" and +"Missing" row labels (should they appear; for the missing row this +is only printed if more than 1 missing value type is present). This +option toggles display of the number of groups represented. 
+ +{phang}{opt alpha} Sort the top levels of varlist by variables instead +of frequencies. Note that the top levels are still extracted; this just +affects the final sort order. To sort in inverse order, just pass +{opt gtop -var1 -var2 ...}. + +{phang}{opt silent} Do not display the top levels of varlist. With +option {opt matasave} it also does not store the printed levels in a +separate string matrix. + +{dlgtab:Display Options} + +{phang}{opth pctfmt(format)} Print format for percentage columns. + +{phang}{opth otherlabel(str)} Specify label for row with the count of the +rest of the levels. + +{phang}{opth missrowlabel(str)} Specify the label for the row with the count of +the "missing" levels. + +{phang}{opth varabbrev(int)} Variable names are displayed above their +groups. This option specifies that variables should be abbreviated to at +most {opt varabbrev} characters. This is ignored if it is smaller than 5. + +{phang}{opth colmax(numlist)} Specify width limit for levels (can be a single +number or variable-specific). + +{phang}{opth colstrmax(numlist)} Specify width limit for string variables (can +be a single number or variable-specific). This overrides {opt colmax} for strings +and allows the user to specify string and number widths separately. (Also see +{opth numfmt(format)}) + +{phang}{opth numfmt(format)} Format for numeric variables. Default is {opt %.8g} +(or {opt %16.0g} with {opt matasave}). By default the number levels are formatted +in C, so this must be a valid format for the C internal {opt printf}. The syntax +is very similar to mata's {opt printf}. Some examples are: %.2f, %10.6g, %5.0f, and +so on. With option {opt matasave} these are formatted in mata, and the format can +be any mata number format. + +{phang}{opt colseparate(separator)} Column separator; default is double blank " ". + +{phang}{opt novaluelabels} Do not replace numeric variables with their value +labels. Value label widths are governed by colmax and NOT colstrmax. 
+ +{phang}{opt hidecontlevels} If a level is repeated in the subsequent row, +display a blank. This is only done if both observations are within the same +outer level. + +{dlgtab:levelsof Options} + +{phang} +{cmd:local(}{it:macname}{cmd:)} inserts the list of levels in local macro +{it:macname} within the calling program's space. Hence, that macro will +be accessible after {cmd:gtoplevelsof} has finished. This is helpful for +subsequent use. Note this uses {opt colseparate} to sepparate columns. The +default is " " so be careful when parsing! Rows are enclosed in double quotes +(`""') so parsing is possible, just not trivial. + +{phang} +{cmd:separate(}{it:separator}{cmd:)} specifies a separator +to serve as punctuation for the values of the returned list. +The default is a space. A useful alternative is a comma. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. 
+ +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + + +{marker remarks}{...} +{title:Remarks} + +{pstd} +{cmd:gtoplevelsof} has the main function of displaying the most common levels +of {it:varlist}. While {opt tab} is great, it cannot handle a large number +of levels, and it prints ALL the levels in alphabetical order. + +{pstd} +Very often when exploring data I just want to have a quick look at the largest +levels of a variable that may have thousands of levels in a data set with +millions of rows. {opt gcontract} and {opt gcollapse} are great but they +modify the original data and doing a lot of subsequent preserve, sort, restore +gets very slow very fast. + +{pstd} +I have found this command extremely helpful when exploring big data. +Specially if a string is not clean, then having a look at the largest +values or the largest values that match a pattern is very helpful. + + +{marker examples}{...} +{title:Examples} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gtoplevelsof/index.html#examples":online documentation} +for more examples. + +{phang}{cmd:. sysuse auto}{p_end} +{phang}{cmd:. gtoplevelsof rep78}{p_end} +{phang}{cmd:. gtoplevelsof rep78, missrow local(toplevels)}{p_end} +{phang}{cmd:. gtop rep78, colsep(", ")}{p_end} +{phang}{cmd:. 
gtop foreign rep78, ntop(3) missrow}{p_end} + + +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gtoplevelsof} stores the following in {cmd:r()}: + +{synoptset 15 tabbed}{...} +{p2col 5 15 19 2: Macros}{p_end} +{synopt:{cmd:r(levels)}}list of top (most common) levels (rows); not with {opt matasave}{p_end} +{synopt:{cmd:r(matalevels)}}name of GtoolsByLevels mata object; only with {opt matasave}{p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(minJ) }} smallest group size {p_end} +{synopt:{cmd:r(maxJ) }} largest group size {p_end} +{synopt:{cmd:r(ntop) }} number of top levels {p_end} +{synopt:{cmd:r(nrows)}} number of rows in {opt toplevels} {p_end} +{synopt:{cmd:r(alpha)}} sorted by levels instead of frequencies {p_end} +{p2colreset}{...} + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Matrices}{p_end} +{synopt:{cmd:r(toplevels)}}Table with frequency counts and percentages.{p_end} +{p2colreset}{...} + +{pstd} The missing and other rows are stored in the matrix with IDs 2 and 3, +respectively. 
With {opt matasave}, the following data is stored in {opt GtoolsByLevels}: + + real scalar anyvars + 1: any by variables; 0: no by variables + + real scalar anychar + 1: any string by variables; 0: all numeric by variables + + real scalar anynum + 1: any numeric by variables; 0: all string by variables + + string rowvector byvars + by variable names + + real scalar kby + number of by variables + + real scalar rowbytes + number of bytes in one row of the internal by variable matrix + + real scalar J + number of levels + + real matrix numx + numeric by variables + + string matrix charx + string by variables + + real scalar knum + number of numeric by variables + + real scalar kchar + number of string by variables + + real rowvector lens + > 0: length of string by variables; <= 0: internal code for numeric variables + + real rowvector map + map from index to numx and charx + + real rowvector charpos + position of kth character variable + + string matrix printed + formatted (printf-ed) variable levels (not with option -silent-) + + real matrix toplevels + frequencies of top levels; missing and other rows stored with ID 2 and 3. + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gtoplevelsof} is maintained as part of {it:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. 
+{p_end}
+
+
+{title:Also see}
+
+{p 4 13 2}
+help for
+{help gcontract},
+{help glevelsof},
+{help gtools};
+{help flevelsof} (if installed),
+{help ftools} (if installed)
+
diff --git a/01.code/ado/g/gunique.ado b/01.code/ado/g/gunique.ado
new file mode 100755
index 0000000..790e8eb
--- /dev/null
+++ b/01.code/ado/g/gunique.ado
@@ -0,0 +1,164 @@
+*! version 1.0.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com
+*! -unique- implementation using C for faster processing
+
+* gunique varlist [if] [in] [, detail missing by() generate() replace ...]
+*
+* Counts the groups of varlist by delegating to _gtools_internal with
+* gfunction(unique) and returns r(N), r(J), r(unique) (same value as
+* r(J)), r(minJ) and r(maxJ). With by(), it additionally creates a
+* variable (default _Unique) holding the number of unique values of
+* varlist within each level of by. With detail, the group counts are
+* summarized via -gstats sum- and, under by(), the top levels are listed
+* via -gtoplevelsof-.
+capture program drop gunique
+program gunique, rclass
+    version 13.1
+
+    * Nothing to count in an empty dataset
+    if ( `=_N < 1' ) {
+        di as err "no observations"
+        exit 2000
+    }
+
+    syntax varlist [if] [in] , ///
+    [                         ///
+        Detail                /// Summary statistics for group counts
+        MISSing               /// Include missing values
+        by(str)               /// by variables: [+|-]varname [[+|-]varname ...]
+        GENerate(name)        /// Store uniques in generate (default _Unique)
+        replace               /// Replace variable specified by generate if it exists
+                              ///
+        compress              /// Try to compress strL variables
+        forcestrl             /// Force reading strL variables (stata 14 and above only)
+        Verbose               /// Print info during function execution
+        _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix
+        BENCHmark             /// Benchmark function
+        BENCHmarklevel(int 0) /// Benchmark various steps of the plugin
+        HASHmethod(passthru)  /// Hashing method: 0 (default), 1 (biject), 2 (spooky)
+        oncollision(passthru) /// error|fallback: On collision, use native command or throw error
+        debug(passthru)       /// Print debugging info to console
+    ]
+
+    * Flags forwarded to _gtools_internal; seecount and countonly are
+    * switched off below when by() is requested.
+    local seecount seecount
+    local unsorted unsorted
+    local countonly countonly
+
+    * A positive benchmarklevel implies benchmark; the level itself is
+    * re-wrapped as an option string so it can be passed through.
+    if ( `benchmarklevel' > 0 ) local benchmark benchmark
+    local benchmarklevel benchmarklevel(`benchmarklevel')
+
+    if ( "`by'" != "" ) {
+        * The output variable must be new unless replace was given (rc 110)
+        if ( "`generate'" == "" ) {
+            capture confirm new variable _Unique
+            if ( _rc ) {
+                if ( "`replace'" == "" ) {
+                    di as err "Variable _Unique already exists."
+                    di as err "Use the gen() option to specify a new variable."
+                    exit 110
+                }
+            }
+            local generate _Unique
+        }
+        else {
+            cap confirm new variable `generate'
+            if ( _rc ) {
+                if ( "`replace'" == "" ) {
+                    di as err "`generate' already exists."
+                    exit 110
+                }
+            }
+        }
+
+        local seecount ""
+        * local unsorted ""
+        local countonly ""
+
+        * Request a group ID in a temporary variable; it is used after the
+        * internal call to tag one observation per (by, varlist) group.
+        * Unless -missing- was given, rows with a missing ID are excluded.
+        tempvar id
+        local gopts gen(`id')
+        if ( "`missing'" == "" ) local ifid if !mi(`id')
+
+        * Storage type of the generated variable: long when _N < 2^21,
+        * double otherwise.
+        local type double
+        if ( `=_N' < 2^21 ) local type long
+    }
+
+    * Record the calling command for the duration of the internal call
+    * (NOTE(review): presumably read by _gtools_internal -- confirm); it is
+    * reset immediately after the call in both branches below.
+    global GTOOLS_CALLER gunique
+    local opts `missing' `seecount' `compress' `forcestrl'
+    local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance'
+    local opts `opts' `oncollision' `hashmethod' `debug' `gopts'
+
+    if ( "`detail'" != "" ) {
+        * detail: also store the per-group counts so they can be summarized
+        tempvar count
+        local dopts counts(`count') fill(data)
+        cap noi _gtools_internal `varlist' `if' `in', `unsorted' `opts' `dopts' gfunction(unique)
+        local rc = _rc
+        global GTOOLS_CALLER ""
+
+        * rc 17999: hand over to the user-written -unique- (by() is not
+        * forwarded). NOTE(review): presumably the internal fallback
+        * signal -- confirm against _gtools_internal.
+        * rc 17001: reported as "no observations"; every result is 0.
+        if ( `rc' == 17999 ) {
+            unique `varlist' `if' `in', `detail'
+            exit 0
+        }
+        else if ( `rc' == 17001 ) {
+            di as txt "(no observations)"
+            return scalar N = 0
+            return scalar J = 0
+            return scalar unique = 0
+            return scalar minJ = 0
+            return scalar maxJ = 0
+            exit 0
+        }
+        else if ( `rc' ) exit `rc'
+
+        * Copy the results out of r() before any later command runs
+        return scalar N = `r(N)'
+        return scalar J = `r(J)'
+        return scalar unique = `r(J)'
+        return scalar minJ = `r(minJ)'
+        return scalar maxJ = `r(maxJ)'
+
+        local nunique = `r(J)'
+        local r_Ndisp = trim(`"`: di %21.0gc `r(N)''"')
+        local r_Jdisp = trim(`"`: di %21.0gc `r(J)''"')
+
+        * Summarize the first r(J) rows of the counts variable
+        * (NOTE(review): assumes fill(data) stores one count per group in
+        * those rows -- confirm).
+        gstats sum `count' in 1 / `=r(J)', d
+    }
+    else {
+        * No detail: same call and result handling, without the counts
+        * variable; `countonly' is passed unless by() cleared it above.
+        cap noi _gtools_internal `varlist' `if' `in', `countonly' `unsorted' `opts' gfunction(unique)
+        local rc = _rc
+        global GTOOLS_CALLER ""
+
+        if ( `rc' == 17999 ) {
+            unique `varlist' `if' `in', `detail'
+            exit 0
+        }
+        else if ( `rc' == 17001 ) {
+            di as txt "(no observations)"
+            return scalar N = 0
+            return scalar J = 0
+            return scalar unique = 0
+            return scalar minJ = 0
+            return scalar maxJ = 0
+            exit 0
+        }
+        else if ( `rc' ) exit `rc'
+
+        return scalar N = `r(N)'
+        return scalar J = `r(J)'
+        return scalar unique = `r(J)'
+        return scalar minJ = `r(minJ)'
+        return scalar maxJ = `r(maxJ)'
+
+        local nunique = `r(J)'
+        local r_Ndisp = trim(`"`: di %21.0gc `r(N)''"')
+        local r_Jdisp = trim(`"`: di %21.0gc `r(J)''"')
+    }
+
+    if ( "`by'" != "" ) {
+        * NB: `id' should be the group ID for varlist, which should be
+        * correctly missing for non-if in observations. Hence ifid gives
+        * the right answer here.
+        *
+        * Two steps: tag one observation per distinct (by, id) pair, then
+        * sum the tags within by to get the number of unique values of
+        * varlist for each level of by.
+        gegen `type' `generate' = tag(`by' `id') `ifid', missing `replace'
+        gegen `generate' = sum(`generate') `ifid', by(`by') replace
+
+        di as txt ""
+        di as txt "'`varlist'' had `r_Jdisp' unique values in `r_Ndisp' observations."
+        di as txt "Variable `generate' has the number of unique values of '`varlist'' by '`by''."
+
+        if ( "`detail'" != "" ) {
+            * At most 5 levels are listed (ntop(5)); the header wording
+            * depends on whether the overall group count exceeds 5.
+            if ( `=`nunique'' > 5 ) {
+                local header = `"The top 5 frequency counts of `generate' for the levels of '`by'' are"'
+            }
+            else {
+                local header = `"The frequency counts of `generate' for the levels of '`by'' are"'
+            }
+            di as txt `"`header'"'
+            gtoplevelsof `by' `generate' if `generate' > 0, ntop(5)
+        }
+    }
+end
diff --git a/01.code/ado/g/gunique.sthlp b/01.code/ado/g/gunique.sthlp
new file mode 100755
index 0000000..c77615f
--- /dev/null
+++ b/01.code/ado/g/gunique.sthlp
@@ -0,0 +1,178 @@
+{smcl}
+{* *! version 1.0.2 23Jan2019}{...}
+{viewerdialog gunique "dialog gunique"}{...}
+{vieweralsosee "[D] gunique" "mansection D gunique"}{...}
+{viewerjumpto "Syntax" "gunique##syntax"}{...}
+{viewerjumpto "Description" "gunique##description"}{...}
+{viewerjumpto "Options" "gunique##options"}{...}
+{title:Title}
+
+
+{p2colset 5 18 23 2}{...}
+{p2col :{cmd:gunique} {hline 2}}Efficiently calculate unique values of a variable or group of variables.{p_end}
+{p2colreset}{...}
+
+{pstd}
+{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to
+the latest stable version.
+
+{marker syntax}{...}
+{title:Syntax}
+
+{phang}
+This is a fast option to the user written {opt unique}.
+It is 4 to 26 times faster in Stata/IC and 4-12 times faster in MP. + +{p 8 13 2} +{cmd:gunique} +{varlist} +{ifin} +[{cmd:,} +{opt d:etail}] + + +{marker description}{...} +{title:Description} + +{pstd} +{opt gunique} is a faster alternative to {help unique}. It reports the number +of unique values for the {it:varlist}. At the moment, its main difference from +{opt distinct} is that it always considers the variables jointly. It also has +slightly different options. For example, this supports the {opth by(varlist)} +option that also appears in the {opt unique} command, but does not support +computing the number of unique values for variables individually. + +{pstd} +{opt gunique} is part of the {manhelp gtools R:gtools} project. + +{marker options}{...} +{title:Options} + +{phang} +{opth by(varlist)} counts unique values within levels of {it:varlist} and +stores them in a new variable named {bf:_Unique}. The user can specify the +name of the new variable via the option {opth gen:erate(varname)}. + +{phang} +{opth gen:erate(varname)} supplies an alternative name for the new variable +created by {bf:by}. + +{phang} +{opt replace} replaces {bf:_Unique} or the variable specified via {opt +generate}, if it exists. + +{phang} +{opt detail} requests summary statistics on the number of records which are +present for unique values of the varlist. With {opt by()}, it also prints +the levels with the most unique values. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. + +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. 
+This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + + +{marker example}{...} +{title:Examples} + +{p 4 4 2}{cmd:. sysuse auto}{p_end} +{p 4 4 2}{cmd:. gunique *} {p_end} +{p 4 4 2}{cmd:. gunique *, miss} {p_end} +{p 4 4 2}{cmd:. gunique make-headroom}{p_end} +{p 4 4 2}{cmd:. gunique make-headroom, d}{p_end} + +{pstd} +See the +{browse "http://gtools.readthedocs.io/en/latest/usage/gunique/index.html#examples":online documentation} +for more examples. 
+ +{marker results}{...} +{title:Stored results} + +{pstd} +{cmd:gunique} stores the following in {cmd:r()}: + +{synoptset 20 tabbed}{...} +{p2col 5 20 24 2: Scalars}{p_end} +{synopt:{cmd:r(N) }} number of non-missing observations {p_end} +{synopt:{cmd:r(J) }} number of groups {p_end} +{synopt:{cmd:r(unique)}} number of groups {p_end} +{synopt:{cmd:r(minJ) }}smallest group size {p_end} +{synopt:{cmd:r(maxJ) }}largest group size {p_end} +{p2colreset}{...} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:gunique} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +{cmd:gunique} was written largely to mimic the functionality of the community-contributed command {cmd:unique}, +written by + +{p 8 8 2} +Michael Hills, retired{break} + +{p 8 8 2} +Tony Brady, Sealed Envelope Ltd, UK (tony@sealedenvelope.com) + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. +{p_end} + +{title:Also see} + +{p 4 13 2} +help for +{help gdistinct}, +{help gtools}; +{help unique} (if installed), +{help ftools} (if installed) + diff --git a/01.code/ado/h/hashsort.ado b/01.code/ado/h/hashsort.ado new file mode 100755 index 0000000..3fe212b --- /dev/null +++ b/01.code/ado/h/hashsort.ado @@ -0,0 +1,86 @@ +*! version 1.0.1 23Jan2019 Mauricio Caceres Bravo, mauricio.caceres.bravo@gmail.com +*! 
Hash-based implementation of -sort- and -gsort- using C-plugins
+
+* hashsort [+|-]varname [[+|-]varname ...] [, options]
+*
+* Sorts the data by delegating to _gtools_internal with gfunction(sort).
+* Missing values are sorted first unless mlast is given; this differs
+* from gsort, hence the note printed when a descending key is requested
+* without an explicit choice. Requesting both mfirst and mlast is an
+* error (rc 198). If the internal call returns 17999 the command falls
+* back to native -gsort- / -sort-; 17001 exits quietly with rc 0; any
+* other non-zero code is passed on to the caller.
+capture program drop hashsort
+program define hashsort
+    version 13.1
+
+    syntax anything, /// Variables to sort by: [+|-]varname [[+|-]varname ...]
+    [                         ///
+        GENerate(passthru)    /// Generate variable with sort order
+        replace               /// Replace generated variable, if it exists
+        sortgen               /// Sort by generated variable, if applicable
+        skipcheck             /// Turn off internal is sorted check
+                              ///
+        compress              /// Try to compress strL variables
+        forcestrl             /// Force reading strL variables (stata 14 and above only)
+        Verbose               /// Print info during function execution
+        _CTOLerance(passthru) /// (Undocumented) Counting sort tolerance; default is radix
+        BENCHmark             /// Benchmark function
+        BENCHmarklevel(int 0) /// Benchmark various steps of the plugin
+        HASHmethod(passthru)  /// Hashing method: 0 (default), 1 (biject), 2 (spooky)
+        oncollision(passthru) /// error|fallback: On collision, use native command or throw error
+        debug(passthru)       /// Print debugging info to console
+                              ///
+        tag(passthru)         ///
+        counts(passthru)      ///
+        fill(passthru)        ///
+        invertinmata          ///
+                              ///
+                              /// Unsupported sort options
+                              /// ------------------------
+                              ///
+        stable                /// Hashsort is always stable
+        mlast                 ///
+        Mfirst                ///
+    ]
+
+    * A positive benchmarklevel implies benchmark; the level itself is
+    * re-wrapped as an option string so it can be passed through.
+    if ( `benchmarklevel' > 0 ) local benchmark benchmark
+    local benchmarklevel benchmarklevel(`benchmarklevel')
+
+    if ( "`stable'" != "" ) {
+        di as txt "hashsort is always -stable-"
+    }
+
+    * mfirst and mlast contradict each other. Stop here instead of only
+    * printing the message: previously execution continued and the data
+    * were still sorted (with mlast) despite the error text.
+    if ( ("`mfirst'" != "") & ("`mlast'" != "") ) {
+        di as err "Cannot request both {opt mfirst} and {opt mlast}"
+        exit 198
+    }
+
+    * mfirst is set by default, unlike gsort
+    if ( ("`mfirst'" == "") & ("`mlast'" == "") & (strpos("`anything'", "-") > 0) ) {
+        di as txt "(note: missing values will be sorted first)"
+    }
+
+    * mfirst is set by default
+    if ( ("`mfirst'" == "") & ("`mlast'" == "") ) {
+        local mfirst mfirst
+    }
+
+    * Generating a variable always skips the internal is-sorted check
+    if ( "`generate'" != "" ) local skipcheck skipcheck
+
+    local opts `compress' `forcestrl' nods
+    local opts `opts' `verbose' `benchmark' `benchmarklevel' `_ctolerance'
+    local opts `opts' `oncollision' `hashmethod' `debug'
+    local eopts `invertinmata' `sortgen' `skipcheck'
+    local gopts `generate' `tag' `counts' `fill' `replace' `mlast'
+
+    * Set the caller flag only around the internal call (as gunique does)
+    * so that a syntax error or the early exit above cannot leave a stale
+    * value behind. NOTE(review): presumably read by _gtools_internal --
+    * confirm nothing else depends on it being set earlier.
+    global GTOOLS_CALLER hashsort
+    cap noi _gtools_internal `anything', missing `opts' `gopts' `eopts' gfunction(sort)
+    global GTOOLS_CALLER ""
+    local rc = _rc
+
+    if ( `rc' == 17999 ) {
+        * Fall back to native commands: gsort when any key carries an
+        * explicit +/- sign, plain sort otherwise.
+        if regexm("`anything'", "[\+\-]") {
+            gsort `anything', `generate' `mfirst'
+            exit 0
+        }
+        else {
+            sort `anything'
+            exit 0
+        }
+    }
+    else if ( `rc' == 17001 ) {
+        * Not treated as an error
+        exit 0
+    }
+    else if ( `rc' ) exit `rc'
+end
diff --git a/01.code/ado/h/hashsort.sthlp b/01.code/ado/h/hashsort.sthlp
new file mode 100755
index 0000000..ee73f75
--- /dev/null
+++ b/01.code/ado/h/hashsort.sthlp
@@ -0,0 +1,210 @@
+{smcl}
+{* *! version 1.0.2 23Jan2019}{...}
+{viewerdialog hashsort "dialog sort, message(-hashsort-)"}{...}
+{vieweralsosee "[D] hashsort" "mansection D hashsort"}{...}
+{vieweralsosee "" "--"}{...}
+{vieweralsosee "[D] sort" "help sort"}{...}
+{viewerjumpto "Syntax" "hashsort##syntax"}{...}
+{viewerjumpto "Menu" "hashsort##menu"}{...}
+{viewerjumpto "Description" "hashsort##description"}{...}
+{viewerjumpto "Options" "hashsort##options"}{...}
+{viewerjumpto "Examples" "hashsort##examples"}{...}
+{title:Title}
+
+{p2colset 5 18 23 2}{...}
+{p2col :{cmd:hashsort} {hline 2}}{opt sort} and {opt gsort} using hashes and C-plugins{p_end}
+{p2colreset}{...}
+
+
+{pstd}
+{it:Important}: Please run {stata gtools, upgrade} to update {cmd:gtools} to
+the latest stable version.
+ + +{marker syntax}{...} +{title:Syntax} + +{p 8 14 2} +{cmd:hashsort} +[{cmd:+}|{cmd:-}] +{varname} +[[{cmd:+}|{cmd:-}] +{varname} {it:...}] +[{cmd:,} {it:{help hashsort##options:options}}] + +{marker menu}{...} +{title:Menu} + +{marker description}{...} +{title:Description} + +{pstd} +{opt hashsort} uses C-plugins to implement a hash-based sort that is always +faster than {opt sort} for sorting groups and faster than {opt gsort} in +general. {opt hashsort} hashes the data and sorts the hash, and then it sorts +one observation per group. The fewer the number of groups relative to the +number of observations, the larger the speed gain. + +{pstd} +If the sort is expected to be unique or if the number of groups is large, then +this comes at a potentially large memory penalty and it may not be faster than +{opt sort} (the exception is when the sorting variables are all integers). + +{pstd} +Each {varname} can be numeric or a string. The observations are placed in +ascending order of {it:varname} if {opt +} or nothing is typed in front of the +name and are placed in descending order if {opt -} is typed. {opt hashsort} +always produces a stable sort. + +{pstd} +{opt hashsort} is part of the {manhelp gtools R:gtools} project. + +{marker options}{...} +{title:Options} + +{dlgtab:Options} + +{phang} +{opth gen:erate(varname)} Store group ID in {opt generate}. + +{phang} +{opt sortgen} Set data sortby variable to {opt generate}. + +{phang} +{opt replace} If {opt generate} exists, it is replaced. + +{phang} +{opt skipcheck} Skip internal is sorted check. + +{dlgtab:Gtools} + +{phang} +{opt compress} Try to compress strL to str#. The Stata Plugin Interface +has only limited support for strL variables. In Stata 13 and earlier +(version 2.0) there is no support, and in Stata 14 and later (version +3.0) there is read-only support. The user can try to compress strL +variables using this option. 
+ +{phang} +{opt forcestrl} Skip binary variable check and force gtools to read strL +variables (14 and above only). {opt Gtools gives incorrect results when there is binary data in strL variables}. +This option was included because on some windows systems Stata detects +binary data even when there is none. Only use this option if you are +sure you do not have binary data in your strL variables. + +{phang} +{opt verbose} prints some useful debugging info to the console. + +{phang} +{opt bench:mark} and {opt bench:marklevel(int)} print how long in +seconds various parts of the program take to execute. The user can also +pass {opth bench(int)} for finer control. {opt bench(1)} is the same +as benchmark but {opt bench(2)} and {opt bench(3)} additionally print +benchmarks for internal plugin steps. + +{phang} +{opth hashmethod(str)} Hash method to use. {opt default} automagically +chooses the algorithm. {opt biject} tries to biject the inputs into the +natural numbers. {opt spooky} hashes the data and then uses the hash. + +{phang} +{opth oncollision(str)} How to handle collisions. A collision should never +happen but just in case it does {opt gtools} will try to use native commands. +The user can specify it throw an error instead by passing {opt oncollision(error)}. + +{marker examples}{...} +{title:Examples} + +{pstd} +Also see the +{browse "http://gtools.readthedocs.io/en/latest/usage/hashsort/index.html#examples":online documentation} +for more examples. + + {hline} + Setup +{phang2}{cmd:. sysuse auto} + +{pstd}Place observations in ascending order of {cmd:price}{p_end} +{phang2}{cmd:. hashsort price} + +{pstd}Same as above command{p_end} +{phang2}{cmd:. hashsort +price} + +{pstd}List the 10 lowest-priced cars in the data{p_end} +{phang2}{cmd:. list make price in 1/10} + +{pstd}Place observations in descending order of {cmd:price}{p_end} +{phang2}{cmd:. hashsort -price} + +{pstd}List the 10 highest-priced cars in the data{p_end} +{phang2}{cmd:. 
list make price in 1/10} + +{pstd}Place observations in alphabetical order of {cmd:make}{p_end} +{phang2}{cmd:. hashsort make} + +{pstd}List {cmd:make} in alphabetical order{p_end} +{phang2}{cmd:. list make} + +{pstd}Place observations in reverse alphabetical order of {cmd:make}{p_end} +{phang2}{cmd:. hashsort -make} + +{pstd}List {cmd:make} in reverse alphabetical order{p_end} +{phang2}{cmd:. list make} + + {hline} + Setup +{phang2}{cmd:. webuse bp3} + +{pstd}Place observations in ascending order of {cmd:time} within ascending +order of {cmd:id}{p_end} +{phang2}{cmd:. hashsort id time} + +{pstd}List each patient's blood pressures in the order measurements were +taken{p_end} +{phang2}{cmd:. list id time bp} + +{pstd}Place observations in descending order of {cmd:time} within ascending +order of {cmd:id}{p_end} +{phang2}{cmd:. hashsort id -time} + +{pstd}List each patient's blood pressures in reverse-time order{p_end} +{phang2}{cmd:. list id time bp}{p_end} + {hline} + + +{marker author}{...} +{title:Author} + +{pstd}Mauricio Caceres Bravo{p_end} +{pstd}{browse "mailto:mauricio.caceres.bravo@gmail.com":mauricio.caceres.bravo@gmail.com }{p_end} +{pstd}{browse "https://mcaceresb.github.io":mcaceresb.github.io}{p_end} + +{title:Website} + +{pstd}{cmd:hashsort} is maintained as part of {manhelp gtools R:gtools} at {browse "https://github.com/mcaceresb/stata-gtools":github.com/mcaceresb/stata-gtools}{p_end} + +{marker acknowledgment}{...} +{title:Acknowledgment} + +{pstd} +This help file was based on StataCorp's own help file for {it:sort} and {it:gsort}. +{p_end} + +{pstd} +This project was largely inspired by Sergio Correia's {it:ftools}: +{browse "https://github.com/sergiocorreia/ftools"}. +{p_end} + +{pstd} +The OSX version of gtools was implemented with invaluable help from @fbelotti; +see {browse "https://github.com/mcaceresb/stata-gtools/issues/11"}. 
+{p_end}
+
+{title:Also see}
+
+{p 4 13 2}
+help for
+{help gtools};
+{help fsort} (if installed),
+{help ftools} (if installed)
+
diff --git a/01.code/ado/m/missings.ado b/01.code/ado/m/missings.ado
new file mode 100755
index 0000000..44772c7
--- /dev/null
+++ b/01.code/ado/m/missings.ado
@@ -0,0 +1,477 @@
+*! 1.4.0 NJC 27 Jan 2023
+* 1.3.0 NJC 3 Sep 2020
+* 1.2.0 NJC 11 Jun 2017
+* 1.1.1 NJC 11 May 2017
+* 1.1.0 NJC 26 Apr 2017
+* 1.0.1 NJC 24 Sep 2015
+* 1.0.0 NJC 26 Aug 2015
+*
+* missings: utilities for reporting, listing, tabulating, tagging, and
+* dropping missing values in the variables specified (default: all).
+* Syntax: missings subcommand [varlist] [if] [in] [, options]
+* report, breakdown, list, and table may be abbreviated and may be used
+* with the by prefix; tag, dropvars, and dropobs must be spelled out and
+* exit with error 190 under the by prefix.
+* Returns r(N) always; r(varlist) from report (counts by variables),
+* breakdown, list, table, and dropvars; r(n_dropped) from dropobs.
+program missings, rclass byable(recall)
+    version 12
+
+    // identify subcommand
+    gettoken cmd 0 : 0, parse(" ,")
+    local l = length("`cmd'")
+
+    if `l' == 0 {
+        di "{err}subcommand needed; see help on {help missings}"
+        exit 198
+    }
+
+    // report breakdown list table tag dropvars dropobs
+    if substr("report", 1, max(1, `l')) == "`cmd'" {
+        local cmd "report"
+    }
+    else if substr("breakdown", 1, max(1, `l')) == "`cmd'" {
+        local cmd "breakdown"
+    }
+    else if substr("list", 1, max(1, `l')) == "`cmd'" {
+        local cmd "list"
+    }
+    else if substr("table", 1, max(3, `l')) == "`cmd'" {
+        local cmd "table"
+    }
+    else if "tag" == "`cmd'" {
+        * -t- or -ta- would be ambiguous
+        if _by() error 190
+    }
+    else if "dropvars" == "`cmd'" {
+        * destructive subcommand spelled out
+        if _by() error 190
+    }
+    else if "dropobs" == "`cmd'" {
+        * destructive subcommand spelled out
+        if _by() error 190
+    }
+    else {
+        di "{err}illegal {cmd}missings {err}subcommand"
+        exit 198
+    }
+
+    // check rest of syntax
+    local common NUMeric STRing SYSmiss noHEADER
+
+    if "`cmd'" == "report" {
+        syntax [varlist(default=none)] [if] [in] ///
+        [ , `common' OBServations MINimum(numlist max=1 >=0) ///
+        Percent Format(str) SORT SORT2(str asis) SHOW(numlist int min=1 >0) ///
+        IDentify(varlist) * ]
+
+        if "`format'" == "" local format %5.2f
+
+        // not considered an error, just a misunderstanding!
+        if "`identify'" != "" & "`observations'" == "" {
+            noi di "identify() option ignored without observations option"
+        }
+    }
+    else if "`cmd'" == "breakdown" {
+        syntax [varlist(default=none)] [if] [in] ///
+        [, `common' MINimum(numlist max=1 >=0) ///
+        SORT SORT2(str asis) SHOW(numlist int min=1 >0) *]
+    }
+    else if "`cmd'" == "list" {
+        syntax [varlist(default=none)] [if] [in] ///
+        [ , `common' MINimum(numlist max=1 >=0) IDentify(varlist) * ]
+    }
+    else if "`cmd'" == "table" {
+        syntax [varlist(default=none)] [if] [in] ///
+        [ , `common' MINimum(numlist max=1 >=0) IDentify(varlist) * ]
+    }
+    else if "`cmd'" == "tag" {
+        syntax [varlist(default=none)] [if] [in], ///
+        Generate(str) [`common']
+
+        capture confirm new variable `generate'
+        if _rc {
+            di as err "generate() must specify new variable"
+            exit _rc
+        }
+    }
+    else if "`cmd'" == "dropvars" {
+        syntax [varlist] [, `common' force ]
+
+        if "`force'" == "" & c(changed) {
+            di as err "force option required with changed dataset"
+            exit 4
+        }
+    }
+    else if "`cmd'" == "dropobs" {
+        syntax [varlist] [if] [in] [, `common' force]
+
+        if "`force'" == "" & c(changed) {
+            di as err "force option required with changed dataset"
+            exit 4
+        }
+    }
+
+    // check syntax for sort options if specified
+    if "`sort'`sort2'" != "" {
+        if "`sort'" != "" local sort missing descending
+        local sort `sort' `sort2'
+
+        foreach opt in missing alpha descending {
+            local `opt' 0
+        }
+
+        foreach word of local sort {
+            local word = lower("`word'")
+            local length = length("`word'")
+
+            if "`word'" == substr("missings", 1, `length') {
+                local missing 1
+            }
+            else if "`word'" == substr("alpha", 1, `length') {
+                local alpha 1
+            }
+            else if "`word'" == substr("descending", 1, `length') {
+                local descending 1
+            }
+            else {
+                di as err "sort() option invalid?"
+                exit 198
+            }
+        }
+
+        local tosort = `alpha' + `missing'
+        if `tosort' > 1 | (`tosort' == 0 & `descending') {
+            di as err "sort request invalid"
+            exit 198
+        }
+    }
+
+    quietly {
+        // which variables are we looking at?
+        if "`varlist'" == "" {
+            local vartext "{txt} all variables"
+            unab varlist : _all
+            if _by() local varlist : ///
+            subinstr local varlist "`_byindex'" ""
+        }
+
+        if "`numeric'`string'" != "" {
+            if "`numeric'" != "" & "`string'" != "" {
+                * OK
+            }
+            else {
+                if "`numeric'" != "" ds `varlist', has(type numeric)
+                else ds `varlist', has(type string)
+                local varlist `r(varlist)'
+                if "`varlist'" == "" {
+                    di as err "no variables specified"
+                    exit 100
+                }
+
+                if "`vartext'" != "" {
+                    local vartext "{txt}all `numeric'`string' variables"
+                }
+            }
+        }
+
+        if "`vartext'" == "" local vartext "{res} `varlist'"
+    }
+
+    // looking at observations with missings is the point!
+    marksample touse, novarlist
+
+    // # of observations used
+    quietly count if `touse'
+    if r(N) == 0 error 2000
+    local N = r(N)
+    return scalar N = r(N)
+
+    // nmissing is count of missings on variables specified
+    tempvar nmissing
+
+    // basic counts of missing values
+    quietly {
+        if "`sysmiss'" != "" local system "system "
+        gen `nmissing' = 0 if `touse'
+        label var `nmissing' "# of `system'missing values"
+        local min `minimum'
+        // default minimum: 0 for table, 1 for every other subcommand
+        if "`min'" == "" local min = cond("`cmd'" == "table", 0, 1)
+
+        foreach v of local varlist {
+            capture confirm numeric variable `v'
+            local sys = (_rc == 0) & ("`sysmiss'" != "")
+
+            if `sys' count if `v' == . & `touse'
+            else count if missing(`v') & `touse'
+
+            if r(N) >= `min' {
+                local misslist `misslist' `v'
+                if r(N) == `N' {
+                    local droplist `droplist' `v'
+                }
+                local nmiss `nmiss' `r(N)'
+                if "`percent'" != "" {
+                    local pc = 100 * `r(N)'/`N'
+                    local pcmiss `pcmiss' `pc'
+                }
+            }
+
+            if `sys' replace `nmissing' = `nmissing' + (`v' == .) if `touse'
+            else replace `nmissing' = `nmissing' + missing(`v') if `touse'
+        }
+
+        // % missing if requested
+        if "`percent'" != "" & "`observations'" != "" {
+            local nvars : word count `varlist'
+            tempvar pcmissing
+            gen `pcmissing' = 100 * `nmissing'/`nvars' if `touse'
+            label var `pcmissing' "% of `system'missing values"
+            format `pcmissing' `format'
+        }
+    }
+
+    // show header by default and count of observations with missing values
+    if "`header'" == "" {
+        di _n "{p 0 4}{txt}Checking missings in `vartext':{txt}{p_end}"
+    }
+    else di
+
+    quietly count if `nmissing' & `touse'
+    local NM = r(N)
+    di "`NM' " cond(`NM' == 1, "observation", "observations") ///
+    " with `system'missing values"
+
+    // now actions for each subcommand
+    if "`cmd'" == "report" {
+        if `NM' == 0 exit 0
+
+        if "`observations'" != "" {
+            char `nmissing'[varname] "# `system'missing"
+            if "`percent'" != "" {
+                char `pcmissing'[varname] "% `system'missing"
+            }
+            // nmissing is missing outside the sample, and missing >= min
+            // is true in Stata, so touse must be tested explicitly
+            list `identify' `nmissing' `pcmissing' if `nmissing' >= `min' & `touse', ///
+            abbrev(9) subvarname `options'
+
+            exit 0
+        }
+
+        tokenize "`nmiss'"
+
+        // set up matrices in Mata
+        if "`percent'" != "" {
+            mata : results = J(0, 2, .)
+        }
+        else mata : results = J(0, 1, .)
+        mata : text = J(0, 1, "")
+
+        local j = 1
+        foreach v of local misslist {
+            mata : text = text \ "`v'"
+
+            if "`percent'" != "" {
+                local pcm : word `j' of `pcmiss'
+                mata : results = results \ (``j'', `pcm')
+            }
+            else mata : results = results \ (``j'')
+
+            local ++j
+        }
+
+        preserve
+        drop _all
+
+        local nvars : word count `misslist'
+        quietly set obs `nvars'
+        getmata which = text
+        getmata results* = results
+
+        char which[varname] " "
+        char results1[varname] "# missing"
+        if "`percent'" != "" {
+            char results2[varname] "% missing"
+            format results2 `format'
+        }
+
+        if "`sort'`sort2'" != "" {
+            if `missing' {
+                if `descending' gsort - results1
+                else sort results1
+            }
+            else if `alpha' {
+                if `descending' gsort - which
+                else sort which
+            }
+
+            if "`show'" != "" local inobs "in 1/`show'"
+        }
+
+        list `inobs', abbrev(9) subvarname noobs `options'
+
+        mata mata clear
+
+        return local varlist "`misslist'"
+    }
+    else if "`cmd'" == "breakdown" {
+        quietly ds `misslist', has(type string)
+        local strhere = "`r(varlist)'" != ""
+        quietly ds `misslist', has(type numeric)
+        local numhere = "`r(varlist)'" != ""
+
+        // zero numeric columns unless numeric variables are present;
+        // without this default, J() below would get an empty argument
+        local nlevels 0
+        quietly if `numhere' {
+            if "`sysmiss'" != "" local levels .
+
+            else {
+                foreach v of local misslist {
+                    levelsof `v' if missing(`v') & `touse', missing clean local(this)
+                    local levels `levels' `this'
+                }
+                local levels : list uniq levels
+                local levels : list sort levels
+            }
+
+            local nlevels : list sizeof levels
+        }
+
+        // set up matrices in Mata
+        mata : allresults = J(0, 1, .)
+        mata : numresults = J(0, `nlevels', .)
+        mata : strresults = J(0, `strhere', .)
+        mata : text = J(0, 1, "")
+
+        quietly foreach v of local misslist {
+            local j = 0
+
+            count if missing(`v') & `touse'
+            mata : allresults = allresults \ `r(N)'
+
+            capture confirm numeric variable `v'
+
+            if _rc {
+                mata : strresults = strresults \ `r(N)'
+                if `numhere' mata : numresults = numresults \ J(1, `nlevels', .)
+            }
+            else {
+                local counts
+                foreach x of local levels {
+                    local ++j
+                    count if `v' == `x' & `touse'
+                    local counts `counts' `r(N)'
+                }
+
+                local counts : subinstr local counts " " ",", all
+                mata : numresults = numresults \ (`counts')
+                if `strhere' mata : strresults = strresults \ .
+            }
+
+            mata : text = text \ ("`v'")
+        }
+
+        preserve
+        drop _all
+
+        local nvars : word count `misslist'
+        quietly set obs `nvars'
+
+        getmata which = text
+        char which[varname] " "
+
+        getmata allcount = allresults
+        char allcount[varname] "# missing"
+
+        if `strhere' {
+            getmata strcount = strresults
+            char strcount[varname] "empty"
+        }
+
+        if `numhere' {
+            getmata numcount* = numresults
+
+            tokenize "`levels'"
+            forval j = 1/`nlevels' {
+                char numcount`j'[varname] "``j''"
+            }
+        }
+
+        if "`sort'`sort2'" != "" {
+            if `missing' {
+                if `descending' gsort - allcount
+                else sort allcount
+            }
+            else if `alpha' {
+                if `descending' gsort - which
+                else sort which
+            }
+
+            if "`show'" != "" local inobs "in 1/`show'"
+        }
+
+        list `inobs', abbrev(9) subvarname noobs `options'
+
+        mata mata clear
+
+        return local varlist "`misslist'"
+    }
+    else if "`cmd'" == "list" {
+        if `NM' > 0 {
+            local show : list identify | misslist
+            local show : list uniq show
+            list `show' if `nmissing' >= `min' & `touse', `options'
+        }
+        return local varlist "`misslist'"
+    }
+    else if "`cmd'" == "table" {
+        if `NM' > 0 {
+            local cond `nmissing' >= `min' & `touse'
+
+            local nid : word count `identify'
+            if `nid' == 0 {
+                tab `nmissing' if `cond', `options'
+            }
+            else if `nid' == 1 {
+                qui tab `identify' if `cond'
+                local I = r(r)
+                qui tab `nmissing' if `cond'
+                local J = r(r)
+
+                if `J' <= `I' tab `identify' `nmissing' if `cond', `options'
+                else tab `nmissing' `identify' if `cond', `options'
+            }
+            else error 103
+        }
+        return local varlist "`misslist'"
+    }
+    else if "`cmd'" == "tag" {
+        gen double `generate' = `nmissing' if `touse'
+        quietly compress `generate'
+    }
+    else if "`cmd'" == "dropvars" {
+        di
+        if "`droplist'" != "" {
+            noisily drop `droplist'
+            di "{p}note: `droplist' dropped{p_end}"
+        }
+        else di "note: no variables qualify"
+        return local varlist "`droplist'"
+    }
+    else if "`cmd'" == "dropobs" {
+        di
+        local nvars : word count `varlist'
+        // an observation qualifies only if missing on every variable considered
+        quietly count if `nmissing' == `nvars' & `touse'
+        return scalar n_dropped = r(N)
+
+        if r(N) == 0 di "note: no observations qualify"
+        else noisily {
+            drop if `nmissing' == `nvars' & `touse'
+        }
+    }
+end
+
diff --git a/01.code/ado/m/missings.sthlp b/01.code/ado/m/missings.sthlp
new file mode 100755
index 0000000..b396c2a
--- /dev/null
+++ b/01.code/ado/m/missings.sthlp
@@ -0,0 +1,328 @@
+{smcl}
+{* 3sep2015/24sep2015/26nov2015/11may2017/27jun2017/3sep2020/27jan2023}{...}
+{cmd:help missings}{right: ({browse "http://www.stata-journal.com/article.html?article=up00!!":SJ23-2: dm0085_3})}
+{hline}
+
+{title:Title}
+
+{phang}
+{cmd:missings} {hline 2} Various utilities for managing missing values
+
+{title:Syntax}
+
+{p 8 16 2}
+{cmd:missings} {opt r:eport} [{varlist}] {ifin} [{cmd:,}
+{opt num:eric} {opt str:ing} {opt sys:miss}
+{opt obs:ervations} {opt min:imum(#)} {opt p:ercent} {opt f:ormat(format)}
+{opt id:entify(varlist)}
+{opt sort(specification)} {opt show(#)} {it:{help list:list_options}}]
+
+{p 8 16 2}
+{cmd:missings} {opt b:reakdown} [{varlist}] {ifin} [{cmd:,}
+{opt num:eric} {opt str:ing} {opt sys:miss} {opt min:imum(#)} {opt sort(specification)} {opt show(#)}
+{it:{help list:list_options}}]
+
+{p 8 16 2}
+{cmd:missings} {opt l:ist} [{varlist}] {ifin}
+[{cmd:,}
+{opt num:eric} {opt str:ing} {opt sys:miss}
+{opt min:imum(#)}
+{opt id:entify(varlist)}
+{it:{help list_options}}]
+
+{p 8 16 2}
+{cmd:missings} {opt tab:le} [{varlist}] {ifin}
+[{cmd:,}
+{opt num:eric} {opt str:ing} {opt sys:miss}
+{opt min:imum(#)}
+{opt id:entify(varlist)}
+{it:{help tabulate_oneway:tabulate_options}}]
+
+{p 8 16 2}
+{cmd:missings} {opt tag} [{varlist}] {ifin}{cmd:,} {opt gen:erate(newvar)}
+[{opt num:eric} {opt str:ing} {opt 
sys:miss}] + +{p 8 16 2} +{cmd:missings dropvars} [{varlist}] +[{cmd:,} +{opt num:eric} {opt str:ing} {opt sys:miss} +{opt force}] + + +{p 8 16 2} +{cmd:missings dropobs} [{varlist}] {ifin} +[{cmd:,} +{opt num:eric} {opt str:ing} {opt sys:miss} +{opt force}] + + +{pstd} +{cmd:by:} may be used with any of {cmd:missings report}, {cmd:missings breakdown}, {cmd:missings list}, +or {cmd:missings table}. See {manhelp by D}. + + +{title:Description} + +{pstd} +{cmd:missings} is a set of utility commands for managing variables that +may have missing values. By default, "missing" means numeric missing +(that is, the system missing value {cmd:.} or one of the extended missing +values {cmd:.a} to {cmd:.z}) for numeric variables and empty or {cmd:""} for +string variables. See {helpb missing:[U] 12.2.1 Missing values} for further +information. + +{pstd} +If {varlist} is not specified, it is interpreted by default as all +variables. + +{pstd} +{cmd:missings report} issues a report on the number of missing values in +{varlist}. By default, counts of missings are given by variables; +optionally, counts are given by observations. + +{pstd} +{cmd:missings breakdown} issues a report on different missing values +in {varlist}, that is, the numbers present of (1) empty strings {cmd:""} if string +variables are included and (2) system missing and extended missing +values if numeric variables are included. This subcommand is most obviously +useful as a check on the presence of extended missing values for numeric +variables. + +{pstd} +{cmd:missings list} lists observations with missing values in {varlist}. + +{pstd} +{cmd:missings table} tabulates observations by the number of missing +values in {varlist}. + +{pstd} +{cmd:missings tag} generates a variable containing the number of missing +values in each observation in {varlist}. + +{pstd} +{cmd:missings dropvars} drops any variables in {varlist} that are +missing on all values. 
+ +{pstd} +{cmd:missings dropobs} drops any observations that are missing on all +values in {varlist}. + + +{title:Options} + +{phang} +{opt numeric} (all subcommands) indicates to include numeric +variables only. If any string variables are named explicitly, such +variables will be ignored. + +{phang} +{opt string} (all subcommands) indicates to include string variables +only. If any numeric variables are named explicitly, such variables will +be ignored. + +{phang} +{opt sysmiss} (all subcommands) indicates to include system missing +{cmd:.} only. This option has no effect with string variables, for which +missing is deemed to be the empty string {cmd:""}, regardless. + +{phang} +{opt observations} (with {cmd:missings report}) indicates counting of +missing values by observations, not the default of counting by +variables. + +{phang} +{opt minimum(#)} (with {cmd:missings report}; {cmd:missings breakdown}; {cmd:missings list}; and +{cmd:missings table}) specifies the minimum number of missings to be +shown explicitly. With {cmd:missings table}, the default is {cmd:minimum(0)}; +otherwise, it is {cmd:minimum(1)}. + +{phang} +{opt percent} (with {cmd:missings report}) reports percents missing as well as +counts. Percents are calculated relative to the number of observations or +variables specified. + +{phang} +{opt format(format)} (with {cmd:missings report}) specifies a display +format for percents. The default is {cmd:format(%5.2f)}. This option has no +effect unless {opt percent} is also specified. + +{phang} +{opt identify(varlist)} or {opt identify(varname)} +(with {cmd:missings report, observations}; {cmd:missings list}; and {cmd:missings table}) +insists on showing {it:varlist} or {it:varname} in the display of results. +This can be especially useful to show (for example) identifier variables, which +typically will not be missing, or key categorical variables such as education +or gender. 
With {cmd:missings report, observations} +and {cmd:missings list}, {it:varlist} is included in the {cmd:list} results. +With {cmd:missings table}, {it:varname} is used to produce a two-way table in +contrast to a one-way table; two or more variables may not be specified. + +{phang} +{opt sort(specification)} (with {cmd:missings report} and {cmd:missings breakdown}) specifies output should be +sorted as specified. The {it:specification} must include either {cmd:missings} +or {cmd:alpha} or any abbreviation of either keyword. {cmd:missings} means +sorting by number of missing values. {cmd:alpha} means sorting by variable +name. The specification may include {cmd:descending} to indicate sorting in +descending order, for example, that variables with the most missing values will +be shown first. + +{p 8 8 2}Note: to maintain compatibility with previous versions, the bare +option {opt sort} is also supported, although not indicated in the syntax +diagram. {opt sort} is equivalent to {cmd:sort(missings descending)}. + +{phang} +{opt show(#)} (with {cmd:missings report} and {cmd:missings breakdown}) specifies that at most the first {it:#} +variables be shown. This option has no effect unless sorting is also specified +and is most obviously useful whenever the sort is on the number of missing +values. + +{phang} +{it:list_options} (with {cmd:missings report}, {cmd:missings breakdown} and {cmd:missings list}) are options +listed in {manhelp list D} that may be specified when {cmd:list} is used to +show results. + +{phang} +{it:tabulate_options} (with {cmd:missings table}) are options listed in +{manhelp tabulate_oneway R:tabulate oneway} or +{manhelp tabulate_twoway R:tabulate twoway} +that may be specified when {cmd:tabulate} is used to show results. + +{phang} +{opt generate(newvar)} (with {cmd:missings tag}) specifies the name of a new +variable. {cmd:generate()} is required. 
+ +{phang} +{opt force} (with {cmd:missings dropvars} and {cmd:missings dropobs}) signals +that the dataset in memory is being changed and is a required +option when data are being dropped and the dataset in memory has not been +saved as such. + + +{title:Remarks} + +{pstd} +{cmd:missings} is intended to unite and supersede the main ideas of +{cmd:nmissing} (Cox 1999, 2001a, 2003, 2005) and +{cmd:dropmiss} (Cox 2001b, 2008). + +{pstd} +Creating entirely empty observations (rows) and variables (columns) +is a habit of many spreadsheet users, but neither is helpful in Stata +datasets. The subcommands {cmd:dropobs} and {cmd:dropvars} should +help users clean up. Conversely, there is no explicit support here for +dropping observations or variables with some missing and some +nonmissing values. Users so minded will find other subcommands of use +as an intermediate step, but multiple imputation might be a better way +forward. + + +{title:Examples} + +{phang}{cmd:. webuse nlswork, clear}{p_end} +{phang}{cmd:. missings report}{p_end} +{phang}{cmd:. missings report, minimum(1000)}{p_end} +{phang}{cmd:. missings report, sort(miss desc)}{p_end} +{phang}{cmd:. missings report, sort(miss desc) show(10)}{p_end} +{phang}{cmd:. missings list, minimum(5)}{p_end} +{phang}{cmd:. missings list, minimum(5) id(race)}{p_end} +{phang}{cmd:. missings table}{p_end} +{phang}{cmd:. bysort race: missings table}{p_end} +{phang}{cmd:. missings table, identify(race)}{p_end} +{phang}{cmd:. missings tag, generate(nmissing)}{p_end} +{phang}{cmd:. generate frog = .}{p_end} +{phang}{cmd:. generate toad = .a}{p_end} +{phang}{cmd:. generate newt = ""}{p_end} +{phang}{cmd:. missings breakdown, sort(missings descending)}{p_end} +{phang}{cmd:. missings breakdown, numeric sort(missings descending)}{p_end} +{phang}{cmd:. missings dropvars frog toad newt, force sysmiss}{p_end} +{phang}{cmd:. missings dropvars toad, force sysmiss}{p_end} +{phang}{cmd:. set obs 30000}{p_end} +{phang}{cmd:. 
missings dropobs, force}{p_end} + + +{title:Stored results} + +{pstd} +{cmd:missings} stores the following in {cmd:r()}: + +{synoptset 16 tabbed}{...} +{p2col 5 16 18 2: Scalars}{p_end} +{synopt:{cmd:r(N)}}number of observations checked (all){p_end} +{synopt:{cmd:r(n_dropped)}}number of observations dropped ({cmd:missings dropobs}){p_end} + +{p2col 5 16 18 2: Macros}{p_end} +{synopt:{cmd:r(varlist)}}varlist used ({cmd:missings report}, {cmd:missings breakdown}, +{cmd:missings list}, {cmd:missings table}, and {cmd:missings dropvars}){p_end} +{p2colreset}{...} + + +{title:Author} + +{pstd}Nicholas J. Cox, Durham University, Durham, UK{p_end} +{pstd}n.j.cox@durham.ac.uk{p_end} + + +{title:Acknowledgments} + +{pstd} +Jeroen Weesie, Eric Uslaner, and Estie Sid Hudes contributed to the +earlier development of {cmd:nmissing} and {cmd:dropmiss}. + +{pstd} +A question from Fahim Ahmad on Statalist prompted the addition of sorting +and {opt show(#)} options to {cmd:missings report}. A question from +Martyn Sherriff on Statalist prompted the addition of the {cmd:identify()} +option. Discussion with Richard Goldstein led to clarification of the +scope of {cmd:identify()}. A question from J{c o/}rgen Carling on Statalist +led to adding {cmd:missings breakdown}. + + +{title:References} + +{phang} +Cox, N. J. 1999. +{browse "http://www.stata.com/products/stb/journals/stb49.pdf":dm67: Numbers of missing and present values.} +{it:Stata Technical Bulletin} 49: 7-8. +Reprinted in {it:Stata Technical Bulletin Reprints}, vol. 9, pp. 26-27. +College Station, TX: Stata Press. + +{phang} +------. 2001a. +{browse "http://www.stata.com/products/stb/journals/stb60.pdf":dm67.1: Enhancements to numbers of missing and present values}. +{it:Stata Technical Bulletin} 60: 2-3. +Reprinted in {it:Stata Technical Bulletin Reprints}, vol. 10, pp. 7-9. +College Station, TX: Stata Press. + +{phang} +------. 2001b. 
+{browse "http://www.stata.com/products/stb/journals/stb60.pdf":dm89: Dropping variables or observations with missing values}. +{it:Stata Technical Bulletin} 60: 7-8. +Reprinted in {it:Stata Technical Bulletin Reprints}, vol. 10, pp. 44-46. +College Station, TX: Stata Press. + +{phang} +------. 2003. Software Updates: +{browse "http://www.stata-journal.com/sjpdf.html?articlenum=up0005":dm67_2: Numbers of missing and present values}. +{it:Stata Journal} 3: 449. + +{phang} +------. 2005. Software Updates: +{browse "http://www.stata-journal.com/sjpdf.html?articlenum=up0013":dm67_3: Numbers of missing and present values}. +{it:Stata Journal} 5: 607. + +{phang} +------. 2008. Software Updates: +{browse "http://www.stata-journal.com/sjpdf.html?articlenum=up0023":dm89_1: Dropping variables or observations with missing values}. +{it:Stata Journal} 8: 594. + + +{title:Also see} + +{p 4 14 2}Article: {it:Stata Journal}, volume 17, number 3: {browse "http://www.stata-journal.com/article.html?article=up0056":dm0085_1},{break} + {it:Stata Journal}, volume 15, number 4: {browse "http://www.stata-journal.com/article.html?article=dm0085":dm0085}{p_end} + +{p 7 14 2}Help: {helpb missing:[U] 12.2.1 Missing values}, +{manhelp codebook D}, {manhelp egen D}, {manhelp ipolate D}, +{manhelp misstable R}, {manhelp mvencode D}, {manhelp recode D}, +{manhelp mi MI:intro},{break} +{helpb findname} (if installed), {helpb mipolate} (if installed){p_end} diff --git a/01.code/ado/p/pip.ado b/01.code/ado/p/pip.ado new file mode 100755 index 0000000..ee2abe3 --- /dev/null +++ b/01.code/ado/p/pip.ado @@ -0,0 +1,935 @@ +/*======================================================= +Program Name: pip.ado +Author: +R.Andres Castaneda +acastanedaa@worldbank.org + +Contributor: +Tefera Bekele Degefu +tdegefu@worldbank.org + +World Bank Group + +project: Adaptation Stata package (from povcalnet) to easily query the [PIP API] +Dependencies: The World Bank - DECIS 
+----------------------------------------------------------------------- +References: +Output: +=======================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip, rclass +version 16.0 + +syntax [anything(name=subcommand)] /// +[, /// +COUntry(string) /// +REGion(string) /// +YEAR(string) /// +POVLine(numlist) /// +POPShare(numlist) /// +PPP_year(numlist) /// +AGGregate /// +CLEAR /// +INFOrmation /// +COVerage(string) /// +ISO /// +SERver(string) /// +pause /// +FILLgaps /// +N2disp(integer 1) /// +DISPQuery /// +querytimes(integer 5) /// +TIMEr /// +POVCALNET_format /// +noEFFICIENT /// +KEEPFrames /// +frame_prefix(string) /// +replace /// +VERsion(string) /// +IDEntity(string) /// +RELease(numlist) /// +TABle(string) /// +path(string) /// +] + +if ("`pause'" == "pause") pause on +else pause off +set checksum off + + //======================================================== + // housekeeping + //======================================================== + local curframe = c(frame) + + if regexm("`subcommand'", "^clean") { + noi pip_cleanup + exit + } + + + if regexm("`subcommand'", "^dropframe") { + pip_drop frame, frame_prefix(`frame_prefix') + exit + } + + if regexm("`subcommand'", "^dropglobal") { + pip_drop global + exit + } + + if regexm("`subcommand'", "^install") { + local sscmd: word 2 of `subcommand' + noi pip_install `sscmd', path(`path') `pause' + exit + } + + if regexm("`subcommand'", "^uninstall") { + pip_install uninstall, path(`path') `pause' + exit + } + + if regexm("`subcommand'", "^update") { + noi pip_update, path(`path') `pause' + exit + } + + // ------------------------------------------------------------------------ + // New session procedure + // ------------------------------------------------------------------------ + +qui { + + if ("${pip_cmds_ssc}" == "") { + pip_new_session , `pause' + } + + 
//======================================================== + // setup defaults + //======================================================== + + local server = lower("`server'") + local identity = upper("`identity'") + local country = upper("`country'") + local coverage = lower("`coverage'") + local table = lower("`table'") + + * In case global server is specified + if ("${pip_server}" != "" & "`server'" == "") { + noi disp in red "warning:" in y "Global {it:pip_server} (${pip_server}) is in use" + local server = "${pip_server}" + } + + + //======================================================== + // Auxiliary tables + //======================================================== + if regexm("`subcommand'", "^tab") { + noi pip_tables `table', server(`server') /// + version(`version') /// + release(`release') /// + ppp_year(`ppp_year') /// + identity(`identity') /// + `clear' + return add + exit + } + + + //======================================================== + // Timer + //======================================================== + + local i = 0 + local crlf "`=char(10)'`=char(13)'" + scalar tt = "" + + if ("`timer'" != "") { + timer clear + local i = 1 + } + //======================================================== + // Conditions (Defenses) + //======================================================== + if ("`aggregate'" != "") { + noi disp in red "Option {it:aggregate} is disable for now." + exit + } + if ("`aggregate'" != "" & "`fillgaps'" != "") { + noi disp in red "options {it:aggregate} and {it:fillgaps} are mutually exclusive." _n /* + */ "Please select only one." 
+ error + } + + if ("`popshare'" != "" & (lower("`subcommand'") == "wb" | "`aggregate'" != "")) { + noi disp in red "option {it:popshare} can't be combined with option {it:aggregate} or with subcommand {it:wb}" _n + error + } + + if ("`frame_prefix'" == "") { + local frame_prefix "pip_" + } + + + /*================================================== + Defaults + ==================================================*/ + // --- timer + if ("`timer'" != "") { + local i_on = `i' + scalar tt = tt + "`crlf' `i': Set server" + local i_off = `i++' + } + // --- timer + + // --- timer + if ("`timer'" != "") timer on `i_on' + // --- timer + + *---------- API defaults + pip_set_server `server', `pause' + *return add + local url = "`r(url)'" + local server = "`r(server)'" + local base = "`r(base)'" + local base_grp = "`r(base_grp)'" + + // --- timer + if ("`timer'" != "") timer off `i_off' + // --- timer + + //======================================================== + // versions + //======================================================== + + // --- timer + if ("`timer'" != "") { + local i_on = `i' + scalar tt = tt + "`crlf' `i': Get version" + local i_off = `i++' + } + // --- timer + + // --- timer + if ("`timer'" != "") timer on `i_on' + // --- timer + + + if regexm("`subcommand'", "^ver") { + noi pip_versions, server(`server') availability + return add + exit + } + noi pip_versions, server(`server') /// + version(`version') /// + release(`release') /// + ppp_year(`ppp_year') /// + identity(`identity') + + local version_qr = "`r(version_qr)'" + local version = "`r(version)'" + local release = "`r(release)'" + local ppp_year = "`r(ppp_year)'" + local identity = "`r(identity)'" + + return local pip_version = "`version'" + + // --- timer + if ("`timer'" != "") timer off `i_off' + // --- timer + + + //======================================================== + // conditions + //======================================================== + *---------- lower case subcommand + local 
subcommand = lower("`subcommand'") + + *---------- Test + if ("`subcommand'" == "test") { + if ("${pip_query}" == "") { + noi di as err "global pip_query does not exist. You cannot test the query." + error + } + local fq = "`base'?${pip_query}" + noi disp in y "querying" in w _n "`fq'" + noi view browse "`fq'" + exit + } + + *---------- Modify country(all) with aggregate + if (lower("`country'") == "all" & "`aggregate'" != "") { + local country "" + local aggregate "" + local subcommand "wb" + local wb_change 1 + noi disp as err "Warning: " as text " {cmd:pip, country(all) aggregate} " /* + */ "is equivalent to {cmd:pip wb}. " _n /* + */ " if you want to aggregate all countries by survey years, " /* + */ "you need to parse the list of countries in {it:country()} option. See " /* + */ "{help pip##options:aggregate option description} for an example on how to do it" + } + else { + local wb_change 0 + } + + if ("`year'" == "") local year "all" + + if ("`year'" != "" & "`year'" != "all") { + local yrtemp + foreach yr of local year { + local tt = substr("`yr'", 1, 4) + local yrtemp `yrtemp' `tt' + } + local year "`yrtemp'" + } + * + + *---------- Coverage + if ("`coverage'" == "") local coverage = "all" + local coverage = lower("`coverage'") + + foreach c of local coverage { + + if !inlist(lower("`c'"), "national", "rural", "urban", "all") { + noi disp in red `"option {it:coverage()} must be "national", "rural", "urban" or "all" "' + error + } + + } + + *---------- Poverty line/population share + + // Blank popshare and blank povline = default povline 1.9 + if ("`popshare'" == "" & "`povline'" == "") { + + if ("`ppp_year'" == "2005") local povline = 1.25 + if ("`ppp_year'" == "2011") local povline = 1.9 + if ("`ppp_year'" == "2017") local povline = 2.15 + + local pcall = "povline" + } + + // defined popshare and defined povline = error + else if ("`popshare'" != "" & "`povline'" != "") { + noi disp as err "povline and popshare cannot be used at the same time" + error + 
} + + // blank popshare and defined povline + else if ("`popshare'" == "" & "`povline'" != "") { + local pcall = "povline" + } + + // defined popshare and blank povline + else { + local pcall = "popshare" + } + + *---------- Info + if regexm("`subcommand'", "^info") { + local information = "information" + local subcommand = "information" + } + + + //------------ Region + + if ("`region'" != "") { + local region = upper("`region'") + + if ("`country'" != "") { + noi disp in red "You must use either {it:country()} or {it:region()}." + error + } + + if (regexm("`region'", "SAR")) { + noi disp in red "Note: " in y "The official code of South Asia is" /// + "{it: SAS}, not {it:SAR}. We'll make the change for you" + local region: subinstr local region "SAR" "SAS", word + } + + tokenize "`version'", parse("_") + local _version = "_`1'_`3'_`9'" + + frame dir + local av_frames "`r(frames)'" + local av_frames: subinstr local av_frames " " "|", all + local av_frames = "^(" + "`av_frames'" + ")" + + //------------ Regions frame + local frpiprgn "_pip_regions`_version'" + if (!regexm("`frpiprgn'", "`av_frames'")) { + pip_info, clear justdata `pause' server(`server') version(`version') + } + frame `frpiprgn' { + levelsof region_code, local(av_regions) clean + } + + // Add all to have the same functionality as in country(all) + local av_regions = "`av_regions'" + " ALL" + + local inregion: list region in av_regions + if (`inregion' == 0) { + + noi disp in red "region `region' is not available." 
_n /// + "Only the following are available:" _n "`av_regions'" + + error + } + + } + + + + + *---------- WB aggregate + + if ("`subcommand'" == "wb") { + if ("`country'" != "") { + noi disp as err "option {it:country()} is not allowed with subcommand {it:wb}" + noi disp as res "Note: " as txt "subcommand {it:wb} only accepts options {it:region()} and {it:year()}" + error + } + } + + + *---------- Country + if ("`country'" == "" & "`region'" == "") local country "ALL" // to modify + if ("`country'" != "") { + if (lower("`country'") != "all") local country = upper("`country'") + else local country "ALL" // to modify + } + + + /*================================================== + Main conditions + ==================================================*/ + + if ("`information'" == "") { + + if (c(N) != 0 & "`clear'" == "" & "`information'" == "") { + + noi di as err "You must start with an empty dataset; or enable the option {it:clear}." + error 4 + } + drop _all + } + + + if ("`aggregate'" != "") { + noi disp as res "Note: " as text "Aggregation is only possible over reference years." 
+ local agg_display = "Aggregation in base year(s) `year'" + } + + /*================================================== + Execution + ==================================================*/ + pause pip - before execution + + *---------- Information + // --- timer + if ("`timer'" != "") { + local i_on = `i' + scalar tt = tt + "`crlf' `i': Get info" + local i_off = `i++' + } + // --- timer + + // --- timer + if ("`timer'" != "") timer on `i_on' + // --- timer + + + if ("`information'" != ""){ + noi pip_info, `clear' `pause' server(`server') version(`version') + return add + exit + } + + // --- timer + if ("`timer'" != "") timer off `i_off' + // --- timer + + *---------- Regular query and Aggregate Query + if ("`subcommand'" == "wb") { + local wb "wb" + } + else local wb "" + + + tempfile povcalf + save `povcalf', empty + + local f = 0 + + if ("`pcall'" == "povline") loc i_call "i_povline" + else loc i_call "i_popshare" + + + // --- timer + if ("`timer'" != "") { + local j = `i++' + local k = `i++' + local h = `i++' + scalar tt = tt + "`crlf' `j': bulding query" + scalar tt = tt + "`crlf' `k': downloading data" + scalar tt = tt + "`crlf' `h': cleaning data" + } + // --- timer + + foreach `i_call' of local `pcall' { + + // --- timer + if ("`timer'" != "") timer on `j' + // --- timer + + local ++f + + /*================================================== + Create Query + ==================================================*/ + pip_query, country("`country'") /// + region("`region'") /// + year("`year'") /// + povline("`i_povline'") /// + popshare("`i_popshare'") /// + ppp("`ppp_year'") /// + coverage(`coverage') /// + server(`server') /// + version(`version') /// + `clear' /// + `information' /// + `iso' /// + `fillgaps' /// + `aggregate' /// + `wb' /// + `pause' /// + `groupedby' // + + local query_ys = "`r(query_ys)'" + local query_ct = "`r(query_ct)'" + local query_pl = "`r(query_pl)'" + local query_ds = "`r(query_ds)'" + local query_pp = "`r(query_pp)'" + local query_ps = 
"`r(query_ps)'" + + local query_cv = "`r(query_cv)'" + + return local query_ys_`f' = "`query_ys'" + return local query_ct_`f' = "`query_ct'" + return local query_pl_`f' = "`query_pl'" + return local query_ds_`f' = "`query_ds'" + return local query_pp_`f' = "`query_pp'" + return local query_ps_`f' = "`query_ps'" + + return local query_cv_`f' = "`query_cv'" + + return local base = "`base'" + + *---------- Query + if ("`popshare'" == ""){ + local query = "`query_ys'&`query_ct'&`query_cv'&`query_pl'`query_pp'`query_ds'&`version_qr'" + } + else{ + local query = "`query_ys'&`query_ct'&`query_cv'&`query_ps'`query_pp'`query_ds'&`version_qr'" + } + return local query_`f' "`query'" + global pip_query = "`query'&format=csv" + + *---------- Base + query + if ("`aggregate'" != "" | "`subcommand'" == "wb"){ + local queryfull "`base_grp'?`query'" + } + else{ + local queryfull "`base'?`query'" + } + + return local queryfull_`f' = "`queryfull'" + + // --- timer + if ("`timer'" != "") timer off `j' + // --- timer + + /*================================================== + Download and clean data + ==================================================*/ + + + // --- timer + if ("`timer'" != "") timer on `k' + // --- timer + + *---------- download data + cap import delimited "`queryfull'&format=csv", `clear' varn(1) asdouble + if (_rc) { + noi dis "" + noi dis in red "It was not possible to download data from the PIP API." + noi dis "" + noi dis in white `"(1) Please check your Internet connection by "' _c + noi dis in white `"{browse "`url'/health-check" :clicking here}"' + noi dis in white `"(2) Test that the data is retrievable. By"' _c + noi dis in white `"{stata pip test, server(`server'): clicking here }"' _c + noi dis in white "you should be able to download the data." + noi dis in white `"(3) Please consider adjusting your Stata timeout parameters. 
For more details see {help netio}"' + noi dis in white `"(4) Please send us an email to:"' + noi dis in white _col(8) `"email: pip@worldbank.org"' + noi dis in white _col(8) `"subject: pip query error on `c(current_date)' `c(current_time)'"' + noi di "" + error 673 + } + * noi disp "`queryfull'&format=csv" + * exit + + + // --- timer + if ("`timer'" != "") timer off `k' + // --- timer + + * global qr = `qr' + + if ("`aggregate'" == "" & "`wb'" == "") { + local rtype 1 + } + else { + local rtype 2 + } + + pause after download + + // --- timer + if ("`timer'" != "") timer on `h' + // --- timer + + *---------- Clean data + noi pip_clean `rtype', year("`year'") `iso' server(`server') /* + */ region(`region') `pause' `fillgaps' version(`version') + + pause after cleaning + // --- timer + if ("`timer'" != "") timer off `h' + // --- timer + + /*================================================== + Display Query + ==================================================*/ + + if ("`dispquery'" != "") { + noi di as res _n "{ul: Query at \$`i_povline' poverty line}" + noi di as res "{hline}" + + + if ("`query_ys'" != "") { + noi di as res "Year:" as txt "{p 4 6 2} `query_ys' {p_end}" + } + + if ("`query_ct'" != "") { + noi di as res "Country:" as txt "{p 4 6 2} `query_ct' {p_end}" + } + + if ("`query_pl'" != "") { + noi di as res "Poverty line:" as txt "{p 4 6 2} `query_pl' {p_end}" + } + + if ("`query_ps'" != "") { + noi di as res "Population share:" as txt "{p 4 6 2} `query_ps' {p_end}" + } + + if ("`query_ds'" != "") { + noi di as res "Aggregation:" as txt "{p 4 6 2} `query_ds' {p_end}" + } + + if ("`query_pp'" != "") { + noi di as res "PPP:" as txt "{p 4 6 2} `query_pp' {p_end}" + } + + if ("`'&`version_qr''" != "") { + noi di as res "Version:" as txt "{p 4 6 2} `version_qr' {p_end}" + } + + noi di as res "full query:" as txt "{p 4 6 2} `queryfull' {p_end}" _n + noi di as res "See in browser: " `"{browse "`queryfull'":here}"' _n + noi di as res "Download .csv: " `"{browse 
"`queryfull'&format=csv":here}"' + + noi di as res _dup(20) "-" + noi di as res "No. Obs:" as txt _col(20) c(N) + noi di as res "{hline}" + } + + /*================================================== + Append data + ==================================================*/ + if (`wb_change' == 1) { + keep if regioncode == "WLD" + } + append using `povcalf' + save `povcalf', replace + + } // end of povline loop + + return local npl = `f' + + // ------------------------------ + // display results + // ------------------------------ + + local n2disp = min(`c(N)', `n2disp') + + if (`n2disp' > 1) { + noi di as res _n "{ul: first `n2disp' observations}" + } + else if (`n2disp' == 1) { + noi di as res _n "{ul: first observation}" + } + else { + noi di as res _n "{ul: No observations available}" + } + + + if ("`subcommand'" == "wb") { + sort region_code year + + tempname tolist + frame copy `c(frame)' `tolist' + frame `tolist' { + gsort region_code -year + + count if (region_code == "WLD") + local cwld = r(N) + if (`cwld' >= `n2disp') { + keep if (region_code == "WLD") + } + noi list region_code year poverty_line headcount mean /// + in 1/`n2disp', abbreviate(12) noobs + } + + } + + else { + if ("`aggregate'" == "") { + sort country_code year + local varstodisp "country_code year poverty_line headcount mean median welfare_type" + local sepby "country_code" + } + else { + sort year + local varstodisp "year poverty_line headcount mean" + local sepby "poverty_line" + } + + foreach v of local varstodisp { + cap confirm var `v', exact + if _rc continue + local v2d "`v2d' `v'" + } + + noi list `v2d' in 1/`n2disp', abbreviate(12) sepby(`sepby') noobs + + } + + + + //======================================================== + // Create notes + //======================================================== + + local pllabel "" + foreach p of local povline { + local pllabel "`pllabel' \$`p'" + } + local pllabel = trim("`pllabel'") + local pllabel: subinstr local pllabel " " ", ", all + + + if 
("`wb'" == "") { + if ("`aggregate'" == "" & "`fillgaps'" == "") { + local lvlabel "country level" + } + else if ("`aggregate'" != "" & "`fillgaps'" == "") { + local lvlabel "aggregated level" + } + else if ("`aggregate'" == "" & "`fillgaps'" != "") { + local lvlabel "Country level (lined up)" + } + else { + local lvlabel "" + } + } + else { + local lvlabel "regional and global level" + } + + + local datalabel "WB poverty at `lvlabel' using `pllabel'" + local datalabel = substr("`datalabel'", 1, 80) + + label data "`datalabel' (`c(current_date)')" + + //======================================================== + // Final messages + //======================================================== + + * citations + if ("${pip_cmds_ssc}" == "1") { + local cnoi "noi" + global pip_cmds_ssc = ${pip_cmds_ssc} + 1 + } + else { + local cnoi "qui" + noi disp `"Click {stata "pip_cite, reg_cite":here} to display how to cite"' + } + `cnoi' pip_cite, reg_cite + notes: `r(cite_data)' + + noi disp in y _n `"`cite'"' + + return local cite `"`cite'"' + + * Install alternative version + if ("${pip_cmds_ssc}" == "") { + noi pip_${pip_source} msg + } + + //======================================================== + // Convert to povcalnet format + //======================================================== + + if ("`timer'" != "") { + local i_on = `i' + scalar tt = tt + "`crlf' `i': formating to povcalnet" + local i_off = `i++' + } + // --- timer + + // --- timer + if ("`timer'" != "") timer on `i_on' + // --- timer + + if ("`povcalnet_format'" != "") { + pause before povcalnet format + pip_povcalnet_format `rtype', `pause' + } + + // --- timer + if ("`timer'" != "") timer off `i_off' + // --- timer + + //======================================================== + // Drop frames created in the middle of the process + //======================================================== + + if ("`timer'" != "") { + local i_on = `i' + scalar tt = tt + "`crlf' `i': remove frames" + local i_off = `i++' + } + 
// --- timer + + // --- timer + if ("`timer'" != "") timer on `i_on' + // --- timer + + frame dir + local av_frames "`r(frames)'" + + * set trace on + foreach fr of local av_frames { + + if (regexm("`fr'", "(^_pip_)(.+)")) { + + // If users wants to keep frames + if ("`keepframes'" != "") { + local frname = "`frame_prefix'" + regexs(2) + frame copy `fr' `frname', `replace' + } + // if user wants to drop them + if ("`efficient'" == "noefficient") { + frame drop `fr' + } + } + + } // condition to keep frames + + // --- timer + if ("`timer'" != "") timer off `i_on' + // --- timer + + + * set trace off + + + + // --- timer + if ("`timer'" != "") { + noi disp tt + noi timer list + } + // --- timer + +} // end of qui +end + + + +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: + +Version Control: + +*! version 0.9.5 <2023Feb14> +*! -- fix writing error in pip.pkg file that did not allow the installation of pip_update +*! version 0.9.2 <2023Feb14> +*! -- improve installation and update features +*! version 0.9.0 <2023Feb09> +*! -- Update help file +*! version 0.3.9 <2022Dec16> +*! -- BREAKING Change: Fix formating of aux tables. Rename some variables to make it consistent with other PIP outputs +*! -- Drop obs with missing values in poverty line or headcount +*! -- Fix display of citations +*! -- Improve Help file +*! -- fix bug with PPP_year and ppp parameters +*! -- Display only one observation +*! -- Fix big with options ppp and ppp_year. Only ppp_year remained. +*! -- Change order of returning variables. +*! -- change all labels to lower cases +*! -- BREAKING Change: remove distribution estimates from line up estimates. +*! version 0.3.8 <2022Oct06> +*! -- Testing version change +*! -- Fix bugs +*! version 0.3.7 <2022Oct06> +*! -- Add new routines to install and update pip +*! -- Fix bug in `pip wb, region(WLD)`, which used to return all regions, rather than just WLD. +*! 
-- Labels for variables `icp` and `ppp` now depend on the PPP year of the data. +*! version 0.3.6 <2022Sep08> +*! -- make it work with new API specifications +*! -- Fix problem with variable name version +*! -- Fix problem with variable name version +*! version 0.3.5 <2022Jul06> +*! -- Add `asdouble` in all calls of `import delimited` +*! version 0.3.4 <2022Jun10> +*! version 0.3.3 <2022may25> +*! version 0.3.2 <2022apr26> +*! version 0.3.1 <2022apr08> +*! version 0.3.0 <2022apr07> +*! version 0.2.2 <2022apr06> +*! version 0.2.1 <2022apr04> +*! version 0.2.0 <2022apr01> +*! version 0.1.7 <2022mar30> +*! version 0.1.6 <2022mar28> +*! version 0.1.5 <2022mar25> +*! version 0.1.4 <2022mar18> +*! version 0.1.3 <2022mar18> +*! version 0.1.2 <2022feb07> +*! version 0.1.1 <2022feb01> +*! version 0.1.0 <2022feb01> +*! version 0.0.2 <2022jan12> +*! version 0.0.1 <2021dec01> + + +*##s diff --git a/01.code/ado/p/pip.sthlp b/01.code/ado/p/pip.sthlp new file mode 100755 index 0000000..53a3fd0 --- /dev/null +++ b/01.code/ado/p/pip.sthlp @@ -0,0 +1,768 @@ +{smcl} +{* *! 
version 1.0.0 dec 2022}{...} +{vieweralsosee "" "--"}{...} +{vieweralsosee "Install wbopendata" "ssc install wbopendata"}{...} +{vieweralsosee "Help wbopendata (if installed)" "help wbopendata"}{...} +{viewerjumpto "Command description" "pip##desc"}{...} +{viewerjumpto "Parameters description" "pip##param"}{...} +{viewerjumpto "Options description" "pip##options"}{...} +{viewerjumpto "Subcommands" "pip##subcommands"}{...} +{viewerjumpto "Stored results" "pip##return"}{...} +{viewerjumpto "Examples" "pip##Examples"}{...} +{viewerjumpto "Disclaimer" "pip##disclaimer"}{...} +{viewerjumpto "How to cite" "pip##howtocite"}{...} +{viewerjumpto "References" "pip##references"}{...} +{viewerjumpto "Acknowledgments" "pip##acknowled"}{...} +{viewerjumpto "Authors" "pip##authors"}{...} +{viewerjumpto "Regions" "pip_countries##regions"}{...} +{viewerjumpto "Countries" "pip_countries##countries"}{...} +{title:Title} + +{p2colset 9 24 22 2}{...} +{p2col :{hi:pip} {hline 2}}Access poverty and inequality data from the +World Bank's {browse "https://pip.worldbank.org/":Poverty and Inequality Platform (PIP)}. +The {cmd:pip} command allows Stata users to access the poverty and inequality indicators +available in the PIP platform and estimate poverty at any line. PIP contains more +indicators than its predecessor(povcalnet). See {help pip##list:below} for a comparison +between the indicators in the pip and povcalnet commands. {p_end} +{p2col :{hi:Website: }}{browse "https://worldbank.github.io/pip/"}{p_end} +{p2colreset}{...} +{title:Syntax} + +{p 6 16 2} +{cmd:pip} [{it:{help pip##subcommands:subcommand}}]{cmd:,} +[{it:{help pip##param:Parameters}} {it:{help pip##options:Options}}] + +{pstd} +Description of parameters and options + +{synoptset 27 tabbed}{...} +{synopthdr:Parameters} +{synoptline} +{synopt :{opt coun:try:}(3-letter code)}List of {it:{help pip_countries##countries:country code}} (accepts multiples) or {it:all}. Default "{it:all}". 
+Cannot be used with option {it:region()}{p_end} +{synopt :{opt reg:ion}(WB code)}List of {it:{help pip_countries##regions:region code}} (accepts multiple) or {it:all}. Default "{it:all}". +Cannot be used with option {it:country()}{p_end} +{synopt :{opt coverage(string)}}Coverage level ("national", "urban", "rural", "all"). Default "all".{p_end} +{synopt :{opt year:}(numlist|string)}List of years (accepts up to 10), or {it:all}, or {it:last}. Default "all".{p_end} +{synopt :{opt pov:line:}(#)}List of poverty lines (in PPP specified, see option {cmd:ppp_year(#)}) to calculate + poverty measures (accepts up to 5). Default is 2.15 and 2017 PPPs.{p_end} +{synopt :{opt pops:hare:}(#)}List of quantiles. No default. Cannot be used with option {opt pov:line:}{p_end} +{synopt :{opt fill:gaps}}Loads country-level estimates (including extrapolations and interpolations) used to create regional and global aggregates.{p_end} + +{synoptset 27 tabbed}{...} +{synopthdr:Options} +{synoptline} +{synopt :{opt version(string)}}Combination of numbers in the format %Y%m%d_YYYY_RV_AV_SSS +(click {bf:{help pip_note:here}} for explanation of each component). This {it:version()} option supersedes +the next 3 options {it:ppp_year()}, {it:release()} & {it:identity()}, as the combination of these parameters uniquely identifies a dataset.{p_end} +{synopt :{opt ppp:_year:}(#)}PPP round (2011 or 2017). {p_end} +{synopt :{opt release(numlist)}}8 digit number with the PIP release date in the format {it:YYYYMMDD}.{p_end} +{synopt :{opt identity(string)}{err:*}}Version of data to run the query on (e.g., prod, int, test). See description of each identity {bf:{help pip_note:here}}.{p_end} +{synopt :{opt server(string)}{err:*}}Name of server to query (e.g, prod, dev, qa). See description of each server {bf:{help pip_note:here}}.{p_end} + +{pstd} +{err:*Note}: The {cmd:server()} and {cmd:identity()} options are available internally only for Bank staff. 
+For a detailed description of the {cmd:server()} and {cmd:identity()} options see {bf:{help pip_note:here}}. + +{synoptset 27 tabbed}{...} +{synopthdr:Operational} +{synoptline} +{synopt :{opt clear}}Replaces data in memory.{p_end} +{synopt :{opt querytimes(integer)}}Number of times the API is hit before defaulting to failure. +Default is 5. {it:Advanced option. Use only if internet connection is poor}.{p_end} +{synopt :{opt table(string)}}Loads one auxiliary table, this option is used along with the {cmd:tables} subcommand.{p_end} + +{synoptset 27 tabbed}{...} +{synopthdr:Subcommands} +{synoptline} +{synopt :{opt info:rmation}}Presents a clickable version of the available surveys, +countries and regions.{p_end} +{synopt :{opt wb}}Downloads World Bank's regional and global aggregation.{p_end} +{synopt :{opt tab:les}}Provides clickable list of auxiliary tables for download.{p_end} +{synopt :{opt clean:up}}Deletes all pip data from current stata memory.{p_end} +{synopt :{opt dropframe}}({it:Programmer's option}) Deletes auxiliary PIP frames in memory.{p_end} +{synopt :{opt dropglobal}}({it:Programmer's option}) Deletes auxiliary PIP global macros in memory.{p_end} +{synopt :{opt ver:sions}}Display available versions of PIP data.{p_end} +{synopt :{opt test}}Open in browser last pip call. Type {cmd:disp "${pip_query}"} to see the parameters of the API query.{p_end} + +{pstd} +{bf:Note}: {cmd:pip} requires an internet connection. 
+ +{marker sections}{...} +{title:Sections} + +{pstd} +Sections are presented under the following headings: + + {it:{help pip##desc:Command description}} + {it:{help pip##memory:Memory use and frames}} + {it:{help pip##param:Parameters description}} + {it:{help pip##options:Options description}} + {it:{help pip##operational:Operational description}} + {it:{help pip##subcommands:Subcommands description}} + {it:{help pip##list:List of pip and povcalnet variables}} + {it:{help pip##return:Stored results}} + {it:{help pip##Examples:Examples}} + {it:{help pip##disclaimer:Disclaimer}} + {it:{help pip##references:References}} + {it:{help pip##acknowled:Acknowledgments}} + {it:{help pip##authors:Authors}} + {it:{help pip##contact:Contact}} + {it:{help pip##howtocite:How to cite}} + {it:{help pip_countries:Region and country codes}} + +{marker desc}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:Description} + +{pstd} +The {cmd:pip} command has the same functionality as the {browse "https://pip.worldbank.org/":PIP website}. +It allows Stata users to compute poverty and inequality indicators for over 160 countries +in the World Bank's database of household surveys. PIP is a computational tool that allows +users to conduct country-specific, cross-country, as well as global and regional poverty analyses. +Users are able to estimate rates over time and at any poverty line specified. {cmd:pip} reports a +wide range of measures for poverty (at any chosen poverty line) and inequality. See the full list of indicators +available in {cmd:pip} {help pip##list:below}. + +{pstd} +The underlying welfare aggregate is the per capita household income or consumption + expressed in 2017 PPP USD (with an option to select the 2011 PPPs). Poverty lines are expressed in daily amounts, as well as + the means and medians. For more information on the methodology, {browse "https://worldbank.github.io/PIP-Methodology/":click here}. 
+ +{pstd} +PIP is the result of a close collaboration between World Bank staff across the Development Data Group, the Development Research Group, and the Poverty and Inequality Global Practice. + + +{marker memory}{...} +{title:Memory use and frames}: + +{pstd} +{cmd:pip} makes use of the `frames` feature--available since Stata 16--to store a lot of information in memory. This is partly the reason why the first call of pip in a new Stata session is slower compared to subsequent calls. When closing Stata, you may see a pop-up +message reading {bf:"Frames in memory have changed"}. That is perfectly normal and should not cause any concern. +However, make sure you save the frames that you created and wish to keep. You can list the frames in memory by typing {stata frames dir}. +Frames created by {cmd:pip} are prefixed by {it:_pip} and are marked by an {it:*}, meaning they have not been saved. If you do not wish to save any frames in use, just click "Exit without saving." You can also delete all PIP data in memory using the command {stata pip cleanup}. + + +{marker typesc}{...} +{title:Type of calculations}: + +{pstd} +The pip API reports two types of results: + +{phang} +{opt Survey-year}: Estimates refer to the survey period. + +{phang} +{opt Reference-year}: Loads poverty measures for a reference year that is common across countries. +Regional and global aggregates are calculated only for reference-years. Survey-year estimates are extrapolated +or interpolated to a common reference year. These extrapolations and interpolations require additional assumptions, +namely that (a) growth in household income or consumption can be approximated by growth in national accounts and +(b) all parts of the distribution grow at the same rate.{cmd: pip wb} returns the global and regional poverty aggregates +used by the World Bank. + +{pin} +{err:Important}: The option {it:fillgaps} reports the underlying country estimates for a reference-year. 
+These may coincide with the survey-year estimates if the country has a survey in the reference year. In other cases, +these would be extrapolated from the nearest survey or interpolated between two surveys. + +{pin} +Poverty measures that are calculated for both survey-years and reference-years include the headcount ratio, poverty gap, +and squared poverty gap. Inequality measures, including the Gini index, the mean log deviation and decile shares, +are calculated only in survey-years and are not reported for reference-years. + +{marker param}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:Parameters description} + +{phang} +{opt country(string)} {help pip_countries##countries:Countries and Economies Abbreviations}. +If specified with {opt year(#)}, this option will return all the countries for which there is +actual survey data in the year specified. When selecting multiple countries, use the corresponding +three-letter codes separated by spaces. The option {it:all} is a shorthand for calling all countries. + +{phang} +{opt region(string)} {help pip_countries##regions:Regions Abbreviations} If +specified with {opt year(#)}, this option will return all the countries in the specified region(s) +that have a survey in that year. For example, {opt region(LAC)} will return all countries in Latin +America and the Caribbean that have a survey in the specific year. When selecting multiple regions, +use the corresponding three-letter codes separated by spaces. The option {it:all} is a shorthand +for calling all regions, which is equivalent to calling all countries. + +{phang} +{opt coverage(string)} Selects the geographic coverage of the estimates. By default, all coverage +levels are loaded, but the user may select "national", "urban", or "rural". +Only one level of coverage can be selected per query. + +{phang} +{opt year(#)} Four digit years are accepted. When selecting multiple years, use +spaces to separate them. 
The option {it:all} is a shorthand for calling all +years, while the {it:last} option will download the latest available year +for each country. + +{phang} +{opt povline(#)} The poverty lines for which the poverty measures will be calculated. +When selecting multiple poverty lines, use fewer than 4 decimals and separate +each value with spaces. If left empty, the default poverty line of $2.15 is used. +By default, poverty lines are expressed in 2017 PPP USD per capita per day. +If option {opt ppp_year(2011)} is specified, the poverty lines are expressed in 2011 PPPs. + +{phang} +{opt popshare(#)} The desired quantile. For example, specifying popshare(0.1) returns the first +decile as the value of the poverty line. In other words, the estimated poverty line will be the +nearest income or consumption level such that the incomes of 10% of the population fall below it. +This has no default, and cannot be combined with {opt povline}. The quantile (recorded in the variable +poverty_line) is expressed in 2017 PPP USD per capita per day (unless option {opt ppp_year(2011)} is specified, +in which case it is reported in 2011 PPPs). + +{phang} +{opt fillgaps} Loads all country-level estimates that are used to create the +global and regional aggregates in the reference years. + +{p 8 8 2}{err:Note}: Countries without a survey in the reference-year have been +extrapolated or interpolated using national accounts growth rates and assuming +distribution-neutrality (see Chapter 6 +{browse "https://openknowledge.worldbank.org/bitstream/handle/10986/20384/9781464803611.pdf":here}). 
+Therefore, changes at the country-level from one reference year to the next need +to be interpreted carefully and may not be the result of a new household survey.{p_end} + + +{marker options}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:Options description} + +{phang} +{opt version} A detailed description of the {bf:version} option is available {bf:{help pip_note:here}}. + +{phang} +{opt ppp_year} Allows to specify PPP round (version) that will be used to calculate estimates. Default PPP round year is 2017. The other option are the 2011 PPPs. + +{phang} +{opt release} Allows to specify the PIP release date in the format YYYYMMDD. + +{phang} +{opt identity} A detailed description of the {bf:identity} option is available {bf:{help pip_note:here}}. + +{phang} +{opt server} A detailed description of the {bf:server} option is available {bf:{help pip_note:here}}. + +{marker operational}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:Operational description} + +{marker optinfo}{...} +{phang} +{opt clear} replaces data in memory. + +{phang} +{opt querytimes} Number of times the API is hit before defaulting to failure. Default is 5. Advanced option. Use only if internet connection is poor. + +{phang} +{opt table} Allows to load one auxiliary table, this option is used along with {cmd:tables} subcommand. + + + +{marker subcommands}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:Subcommands} + +{phang} +{opt information} Presents a clickable version of the available surveys, countries +and regions. Selecting countries from the menu loads the survey-year estimates. +Choosing regions loads the regional aggregates in the reference years. + +{p 8 8 2}{err:Note}: If option {it:clear} is added, data in memory is replaced +with a pip guidance database. 
If option {it:clear} is {ul:not} included, +{cmd:pip} preserves data in memory but displays a clickable interface of survey +availability in the results window.{p_end} + +{phang} +{opt wb} Download the World Bank's regional and global aggregates. It can be +combined with {it:year()} to filter the aggregated data. + +{phang} +{opt tables} Provides access to the auxiliary tables. +The default command {stata pip tables} displays a list of auxiliary tables using the 2017 PPPs. +Users can also specify PPP year as {stata pip tables, ppp_year(2017)}. + +{phang} +{opt cleanup} Deletes all PIP data from Stata's memory. + +{phang} +{opt test} By typing {stata pip test}, {cmd:pip} makes use of the global +"${pip_query}" to query your browser directly and test whether the data is +downloadable. + +{p 4 8 2} +{opt install} Install the stable version of {cmd:pip} from SSC ({cmd:pip install ssc}) or +the development version from GitHub ({cmd:pip install gh}). The {it:install} subcommand +is intended to keep your {help sysdir:search path} clean. Say that you install the +dev version from GitHub in the regular way and then +you install the stable version from SSC. By doing that, you are creating +two entries in the {it:stata.trk} file, making Stata believe that you have {cmd:pip} +installed twice, which in fact you do. 
You can confirm this by typing the following, {p_end} +{cmd} + github install worldbank/pip {text:// development} + ssc install pip, replace {text:// stable} + + * {text:You can't uninstall pip directly} + ado uninstall pip + {err:criterion matches more than one package} + + * {text:This is because you have two versions of {cmd:pip} installed} + ado dir pip +{result} + [318] package pip from https://raw.githubusercontent.com/worldbank/pip/master + 'PIP': Poverty and Inequality Platform Stata wrapper + + [319] package pip from http://fmwww.bc.edu/repec/bocode/p + 'PIP': module to access poverty and inequality data from the World Bank's Poverty and + Inequality Platform (PIP) +{text} +{p 8 8 2} +By using the {it:install} subcommand, {cmd:pip} makes sure all the conflicting installations +are resolved. You can install {cmd:pip} from SSC and from GitHub, one after the other, and you +won't have conflicting installations. +Be aware that if you have more than one version installed in your search path, +{cmd:pip} is going to request you to confirm that you want to uninstall both versions by typing +{it:yes} in the console and hitting enter. +{p_end} + + {cmd:pip install ssc} +{err} + There is more than one version of PIP installed in the same search path, PLUS. + You need to uninstall pip in PLUS or change installation path with option path() + Type yes in the console and hit enter to confirm you agree to uninstall pip. +{text} +{p 4 8 2} +{opt uninstall} You can uninstall any version of {cmd:pip} in your search path by typing, +{cmd:pip uninstall}. In this way, you can install {cmd:pip} from scratch from either SSC +or GitHub. +{p_end} + +{p 4 8 2} +{opt update} This subcommand makes sure your {cmd:pip} version is up to date. By default, +the first time that you use {cmd:pip} in a session, it will search for any new versions +available on either SSC or GitHub, depending on where you installed it from (this is +why the first time takes longer than the others). 
However, it could be the +case that you're using an old version on purpose and now want to get the newer version +without leaving your Stata session. Just type {cmd:pip update}. +{p_end} + +{marker return}{...} +{title:Stored results}{p 50 20 2}{p_end} + +{pstd} +{cmd:pip} stores the following in {cmd:r()}. Suffix _{it:#} is a count of the +poverty line included in {it:povlines()}: + +{p2col 5 20 24 2: queries}{p_end} +{synopt:{cmd:r(query_ys_{it:#})}}Years{p_end} +{synopt:{cmd:r(query_pl_{it:#})}}Poverty lines{p_end} +{synopt:{cmd:r(query_ct_{it:#})}}Countries{p_end} +{synopt:{cmd:r(query_cv_{it:#})}}Coverages{p_end} +{synopt:{cmd:r(query_ds_{it:#})}}Whether aggregation was used{p_end} +{synopt:{cmd:r(query_{it:#})}}Concatenation of the queries above{p_end} + +{p2col 5 20 24 2: API parts}{p_end} +{synopt:{cmd:r(server)}}Protocol (http://) and server name{p_end} +{synopt:{cmd:r(site_name)}}Site names{p_end} +{synopt:{cmd:r(handler)}}Action handler{p_end} +{synopt:{cmd:r(base)}}Concatenation of server, site_name, and handler{p_end} + +{p2col 5 20 24 2: additional info}{p_end} +{synopt:{cmd:r(queryfull_{it:#})}}Complete query{p_end} +{synopt:{cmd:r(npl)}}Total number of poverty lines{p_end} +{synopt:{cmd:pip_query}}Global macro with query information in case {cmd:pip} fails. +"${pip_query}" to display {p_end} + +{marker list}{...} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{title:List of pip and povcalnet variables}{p 50 20 2}{p_end} + +{pstd} +The following list compares the variables names available in {cmd:pip} with its predecessor command {cmd:povcalnet}. +Only the variables available in povcalnet are listed. 
+ + {hline 43} + pip variable {col 40}povcalnet variable + {hline 20}{col 40}{hline 20} + country_code {col 40}countrycode + country_name {col 40}countryname + region_code {col 40}regioncode + year {col 40}year + welfare_time {col 40}datayear + welfare_type {col 40}datatype + poverty_line {col 40}povertyline + mean {col 40}mean + headcount {col 40}headcount + poverty_gap {col 40}povgap + poverty_severity {col 40}povgapsqr + watts {col 40}watts + gini {col 40}gini + median {col 40}median + mld {col 40}mld + polarization {col 40}polarization + population {col 40}population + decile1 {col 40}decile1 + decile2 {col 40}decile2 + decile3 {col 40}decile3 + decile4 {col 40}decile4 + decile5 {col 40}decile5 + decile6 {col 40}decile6 + decile7 {col 40}decile7 + decile8 {col 40}decile8 + decile9 {col 40}decile9 + decile10 {col 40}decile10 + ppp {col 40}ppp + is_interpolated {col 40}isinterpolated + distribution_type {col 40}usemicrodata + survey_coverage {col 40}coveragetype + {hline 43} + + + +{marker Examples}{...} +{title:Examples}{p 50 20 2}{p_end} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} + +{dlgtab: 1. Basic examples} + +{phang} +1.1. Load latest available survey-year estimates for Colombia and Argentina + +{phang2} +{stata pip, country(col arg) year(last) clear} + +{phang} +1.2. Load clickable menu + +{phang2} +{stata pip, info} + +{phang} +1.3. Load only urban coverage level + +{phang2} +{stata pip, country(all) coverage("urban") clear} + + +{dlgtab: 2. Illustration of differences between queries } + +{phang} +2.1. Country estimation at $2.15 in 2015. Since there are no surveys in ARG in +2015, results are loaded only for COL, BRA and IND. + +{phang2} +{stata pip, country(COL BRA ARG IND) year(2015) clear} + +{phang} +2.2. Reference-year estimation. Filling gaps for ARG and moving the IND estimate +from 2015-2016 to 2015. Only works for reference years. 
+ +{phang2} +{stata pip, country(COL BRA ARG IND) year(2015) clear fillgaps} + +{phang} +2.4. World Bank aggregation ({it:country()} is not available) + +{phang2} +{stata pip wb, clear year(2015)}{p_end} +{phang2} +{stata pip wb, clear region(SAR LAC)}{p_end} +{phang2} +{stata pip wb, clear} // all regions and reference years{p_end} + + +{dlgtab: 3. Samples uniquely identified by country/year} + +{phang2} +{ul:3.1} Longest possible time series for each country, {it:even if} welfare type or survey coverage +changes from one year to another (national coverage is preferred). + +{cmd} + pip, clear + * Prepare reporting_level variable + label define level 3 "national" 2 "urban" 1 "rural" + encode reporting_level, gen(reporting_level_2) label(level) + + * keep only national when more than one is available + bysort country_code welfare_type year: egen _ncover = count(reporting_level_2) + gen _tokeepn = ( (inlist(reporting_level_2, 3, 4) & _ncover > 1) | _ncover == 1) + + keep if _tokeepn == 1 + + * Keep longest series per country + by country_code welfare_type, sort: gen _ndtype = _n == 1 + by country_code : replace _ndtype = sum(_ndtype) + by country_code : replace _ndtype = _ndtype[_N] // number of welfare_type per country + + duplicates tag country_code year, gen(_yrep) // duplicate year + + bysort country_code welfare_type: egen _type_length = count(year) // length of type series + bysort country_code: egen _type_max = max(_type_length) // longest type series + replace _type_max = (_type_max == _type_length) + + * in case of same length in series, keep consumption + by country_code _type_max, sort: gen _ntmax = _n == 1 + by country_code : replace _ntmax = sum(_ntmax) + by country_code : replace _ntmax = _ntmax[_N] // number of welfare_type per country + + + gen _tokeepl = ((_type_max == 1 & _ntmax == 2) | /// + (welfare_type == 1 & _ntmax == 1 & _ndtype == 2) | /// + _yrep == 0) + + keep if _tokeepl == 1 + drop _* + +{txt} ({stata "pip_examples pip_example08":click to 
run}) + +{phang2} +{ul:3.2} Longest possible time series for each country, restrict to same welfare type throughout, +but letting survey coverage vary (preferring national). + +{cmd} + pip, clear + + * Prepare reporting_level variable + label define level 3 "national" 2 "urban" 1 "rural" + encode reporting_level, gen(reporting_level_2) label(level) + + bysort country_code welfare_type year: egen _ncover = count(reporting_level_2) + gen _tokeepn = ( (inlist(reporting_level_2, 3, 4) & _ncover > 1) | _ncover == 1) + + keep if _tokeepn == 1 + * Keep longest series per country + by country_code welfare_type, sort: gen _ndtype = _n == 1 + by country_code : replace _ndtype = sum(_ndtype) + by country_code : replace _ndtype = _ndtype[_N] // number of welfare_type per country + + + bysort country_code welfare_type: egen _type_length = count(year) + bysort country_code: egen _type_max = max(_type_length) + replace _type_max = (_type_max == _type_length) + + * in case of same length in series, keep consumption + by country_code _type_max, sort: gen _ntmax = _n == 1 + by country_code : replace _ntmax = sum(_ntmax) + by country_code : replace _ntmax = _ntmax[_N] // max + + + gen _tokeepl = ((_type_max == 1 & _ntmax == 2) | /// + (welfare_type == 1 & _ntmax == 1 & _ndtype == 2)) | /// + _ndtype == 1 + + keep if _tokeepl == 1 + drop _* + +{txt} ({stata "pip_examples pip_example09":click to run}) + +{phang2} +{ul:3.3} Longest series for a country with the same welfare type. +Not necessarily the latest + +{cmd} + pip, clear + *Series length by welfare type + bysort country_code welfare_type: gen series = _N + *Longest + bysort country_code : egen longest_series=max(series) + tab country_code if series !=longest_series + keep if series == longest_series + + *2. 
If same length: keep most recent + bys country_code welfare_type series: egen latest_year=max(year) + bysort country_code: egen most_recent=max(latest_year) + + tab country_code if longest_series==series & latest_year!=most_recent + drop if most_recent>latest_year + + *3. Not Applicable: if equal length and most recent: keep consumption + bys country_code: egen preferred_welfare=min(welfare_type) + drop if welfare_type != preferred_welfare + +{txt} ({stata "pip_examples pip_example10":click to run}) + +{dlgtab: 4. Analytical examples} + +{phang2} +{ul:4.1} Graph of trend in poverty headcount ratio and number of poor for the world + +{cmd} + pip wb, clear + + keep if year > 1989 + keep if region_code == "WLD" + gen poorpop = headcount*population / 1000000 + gen hcpercent = round(headcount*100, 0.1) + gen poorpopround = round(poorpop, 1) + + twoway (sc hcpercent year, yaxis(1) mlab(hcpercent) /// + mlabpos(7) mlabsize(vsmall) c(l)) /// + (sc poorpopround year, yaxis(2) mlab(poorpopround) /// + mlabsize(vsmall) mlabpos(1) c(l)), /// + yti("Poverty Rate (%)" " ", size(small) axis(1)) /// + ylab(0(10)40, labs(small) nogrid angle(0) axis(1)) /// + yti("Number of Poor (million)", size(small) axis(2)) /// + ylab(0(400)2000, labs(small) angle(0) axis(2)) /// + xlabel(,labs(small)) xtitle("Year", size(small)) /// + graphregion(c(white)) ysize(5) xsize(5) /// + legend(order( /// + 1 "Poverty Rate (% of people living below $2.15)" /// + 2 "Number of people who live below $2.15") si(vsmall) /// + row(2)) scheme(s2color) + +{txt} ({stata "pip_examples pip_example01":click to run}) + +{phang2} +{ul:4.2} Graph of trends in poverty headcount ratio by region, multiple poverty lines ($2.15, $3.65, $6.85) + +{cmd} + pip wb, povline(2.15 3.65 6.85) clear + drop if inlist(region_code, "OHI", "WLD") | year<1990 + keep poverty_line region_name year headcount + replace poverty_line = poverty_line*100 + replace headcount = headcount*100 + + tostring poverty_line, replace format(%12.0f) 
force + reshape wide headcount,i(year region_name) j(poverty_line) string + + local title "Poverty Headcount Ratio (1990-2019), by region" + + twoway (sc headcount215 year, c(l) msiz(small)) /// + (sc headcount365 year, c(l) msiz(small)) /// + (sc headcount685 year, c(l) msiz(small)), /// + by(reg, title("`title'", si(med)) /// + note("Source: pip", si(vsmall)) graphregion(c(white))) /// + ylabel(, format(%2.0f)) /// + xlab(1990(5)2019 , labsi(vsmall)) xti("Year", si(vsmall)) /// + ylab(0(25)100, labsi(vsmall) angle(0)) /// + yti("Poverty headcount (%)", si(vsmall)) /// + leg(order(1 "$2.15" 2 "$3.65" 3 "$6.85") r(1) si(vsmall)) /// + sub(, si(small)) scheme(s2color) +{txt} ({stata "pip_examples pip_example07":click to run}) + +{phang2} +{ul:4.3} Graph of population distribution across income categories in Latin America, by country + +{cmd} + pip, region(lac) year(last) povline(2.15 3.65 6.85) clear + keep if welfare_type==2 & year>=2014 // keep income surveys + keep poverty_line country_code country_name year headcount + replace poverty_line = poverty_line*100 + replace headcount = headcount*100 + tostring poverty_line, replace format(%12.0f) force + reshape wide headcount,i(year country_code country_name ) j(poverty_line) string + + gen percentage_0 = headcount215 + gen percentage_1 = headcount365 - headcount215 + gen percentage_2 = headcount685 - headcount365 + gen percentage_3 = 100 - headcount685 + + keep country_code country_name year percentage_* + reshape long percentage_,i(year country_code country_name ) j(category) + la define category 0 "Extreme poor (< $2.15)" 1 "Poor LIMIC ($2.15-$3.65)" /// + 2 "Poor UMIC ($3.65-$6.85)" 3 "Non-poor (> $6.85)" + la val category category + la var category "" + + local title "Distribution of Income in Latin America and Caribbean, by country" + local note "Source: World Bank PIP, using the latest survey after 2014 for each country." 
+ local yti "Population share in each income category (%)" + + graph bar (mean) percentage, inten(*0.7) o(category) o(country_code, /// + lab(labsi(small) angle(vertical)) sort(1) descending) stack asy /// + blab(bar, pos(center) format(%3.1f) si(tiny)) /// + ti("`title'", si(small)) note("`note'", si(*.7)) /// + graphregion(c(white)) ysize(6) xsize(6.5) /// + legend(si(vsmall) r(3)) yti("`yti'", si(small)) /// + ylab(,labs(small) nogrid angle(0)) scheme(s2color) +{txt} ({stata "pip_examples pip_example03":click to run}) + +{marker disclaimer}{...} +{title:Disclaimer} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} + +{p 4 4 2}To calculate global poverty estimates, survey-year estimates are extrapolated +or interpolated to a common reference year. These extrapolations and interpolations require +additional assumptions, namely that (a) growth in household income or consumption can be +approximated by growth in national accounts and (b) all parts of the distribution grow at +the same rate. Given these assumptions, users are cautioned against using reference-year +estimates (available using the fillgaps option) for comparing a country's poverty trend over time. +For that purpose, users should rely on the survey-year estimates and are advised to take into +account breaks in survey comparability. For details on the methodology please visit the +{browse "https://worldbank.github.io/PIP-Methodology/":PIP Methodology Handbook} and the {browse "https://pip.worldbank.org/publication":Global Poverty Monitoring Technical Notes}. +{p_end} + +{p 4 4 2}The term country, used interchangeably with economy, does not imply political independence +but refers to any territory for which authorities report separate social or economic statistics. +{p_end} + + +{marker references}{...} +{title:References} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} + +{p 4 8 2}Castaneda Aguilar, R.Andres, T. Fujs, C. Lakner, S. K. 
Tetteh-Baah(2023) +"Estimating Global Poverty in Stata: The PIP command", +Global Poverty Monitoring Technical Notes, World Bank, Washington, DC{p_end} + +{marker acknowled}{...} +{title:Acknowledgments} +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} + +{pstd} +The author would like to thank Tefera Bekele Degefu, Ifeanyi Nzegwu Edochie, Tony Fujs, +Dean Jolliffe, Daniel Mahler, Minh +Cong Nguyen, Christoph Lakner, Marta Schoch, Samuel Kofi Tetteh Baah, Martha Viveros, Nishan Yonzan, +and Haoyu Wu for comments received on earlier versions of this code. This command builds on the earlier +povcalnet command, which was developed with the help of Espen Prydz, Jorge Soler Lopez, Ruoxuan Wu and Qinghua Zhao. + +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} +{marker authors}{...} +{title:Author} +{p 4 4 4}R.Andres Castaneda, The World Bank{p_end} +{p 6 6 4}Email: {browse "acastanedaa@worldbank.org": acastanedaa@worldbank.org}{p_end} +{p 6 6 4}GitHub:{browse "https://github.com/randrescastaneda": randrescastaneda }{p_end} + +{title:Contributor} +{pstd} +Tefera Bekele Degefu + +{title:Maintainer} +{p 4 4 4}PIP Technical Team, The World Bank{p_end} +{p 6 6 4}Email: {browse "pip@worldbank.org": pip@worldbank.org}{p_end} + +{marker contact}{...} +{title:Contact} +{pstd} +Any comments, suggestions, or bugs can be reported in the +{browse "https://github.com/worldbank/pip/issues":GitHub issues page}. +All the files are available in the {browse "https://github.com/worldbank/pip":GitHub repository} + +{marker howtocite}{...} +{title:Thanks for citing this Stata command as follows} + +{p 4 8 2}Castaneda, R.Andres. (2023) +"pip: Stata Module to Access World Bank’s Global Poverty and Inequality Data" + (version 0.9.0). Stata. Washington, DC: World Bank Group. + https://worldbank.github.io/pip/ {p_end} + +{title:Thanks for citing {cmd:pip} data as follows} + +{p 4 8 2} World Bank. (2022). 
Poverty and Inequality Platform (version {version_ID}) +[Data set]. World Bank Group. www.pip.worldbank.org. Accessed {date}{p_end} + +{p 4 8 2}Available version_IDs:{p_end} +{p 4 8 2}2017 PPPs: 20220909_2017_01_02_PROD{p_end} +{p 4 8 2}2011 PPPs: 20220909_2011_02_02_PROD{p_end} + +{pstd} +Please make reference to the date when the database was downloaded, as statistics may change. + +{p 40 20 2}(Go up to {it:{help pip##sections:Sections Menu}}){p_end} + + + + diff --git a/01.code/ado/p/pip_cache.ado b/01.code/ado/p/pip_cache.ado new file mode 100755 index 0000000..b518fb8 --- /dev/null +++ b/01.code/ado/p/pip_cache.ado @@ -0,0 +1,289 @@ +/*================================================== +project: submit the most popular queries +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 2 Feb 2022 - 11:05:15 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_cache, rclass +syntax [anything(name=subcmd)], [ /// +STime(integer 100) /// Sleep Time +server(string) /// +] +version 16 + +* subcmd selects what is requested: "all" (default), "global", or "country"/"countries" +/*================================================== +1: Submit the most popular queries +==================================================*/ + +qui { + + *----------1.1: + global pip_cmds_ssc = 1 // make sure it does not execute again per session + local pp = round(runiform()*100, .01) // random poverty line + * sleep time between calls comes from option stime() (default 100); it is not overridden here + + if ("`subcmd'" == "") local subcmd "all" + + *----------1.2: + + if (inlist("`subcmd'", "all", "global")) { + timer clear 1 + timer on 1 + pip, povline(`pp') clear server(`server') + pip wb, povline(`pp') clear server(`server') + timer off 1 + timer list 1 + + local first_time = r(t1) + disp `first_time' + + numlist "0.1(0.1)10" + local pvcents = "`r(numlist)'" + 
local npvc: word count `pvcents' + + + numlist "10(1)50" + local pvdollar = "`r(numlist)'" + local npvd: word count `pvdollar' + + local est_time = `first_time'*`npvc' + /// time on cents loop + `first_time'*`npvd' + /// time on dollars loop + (`npvd'+`npvc') * (`stime'/1000) // extra time if sleep between calls + + + /*============================= + // Loop over poverty lines by cents + =============================*/ + + noi disp as txt ". " in y "= saved successfully" + noi disp as txt "s " in y "= skipped - already exists (unchanged)" + noi disp as err "x " in y "= skipped - already exists (changed)" + noi disp as err "e " in y "= error" + noi disp "" + + local i = 0 + noi _dots 0, title(Caching all countries and WB request from \$0.1 to \$10 by increments of 10 cents) reps(`npvc') + + + noi pip_time_convertor `est_time', type(Estimated) + + timer clear 2 + timer on 2 + foreach pv of local pvcents { + local ++i + + cap { + pip wb, povline(`pv') clear server(`server') + sleep `stime' + pip, povline(`pv') clear server(`server') + sleep `stime' + } + if (_rc) { + noi _dots `i' 2 + } + else { + noi _dots `i' 0 + } + } + + //======================================================== + // loop over poverty lines by dollars + //======================================================== + + + local i = 0 + noi _dots 0, title(Caching all countries and WB request from \$10 to \$50 by dollar) reps(`npvd') + + foreach pv of local pvdollar { + local ++i + cap { + pip wb, povline(`pv') clear server(`server') + sleep `stime' + pip, povline(`pv') clear server(`server') + sleep `stime' + } + if (_rc) { + noi _dots `i' 2 + } + else { + noi _dots `i' 0 + } + + } + + timer off 2 + timer list 2 + local act_time = r(t2) + + noi pip_time_convertor `act_time', type(Actual) + + } // end of global condition + + if (inlist("`subcmd'", "all", "country", "countries")) { + + + timer clear 3 + timer on 3 + pip, countr(COL) povline(`pp') clear server(`server') // to initiate + pip, countr(COL) 
povline(`=`pp'+.01') clear server(`server') // to initiate + pip, countr(COL) povline(`=`pp'+.01') clear server(`server') // to initiate + timer off 3 + timer list 3 + + local cty_time = r(t3) + + + + frame _pip_cts { + levelsof country_code, local(countries) clean + } + + local ncty: word count `countries' + + * seconds of number of queries per country and poverty lines + local cty_n_queries = `cty_time'*`ncty'*3 + /// + (`ncty'*3) * (`stime'/1000) // extra time if sleep between calls + + + + noi disp as txt ". " in y "= saved successfully" + noi disp as txt "s " in y "= skipped - already exists (unchanged)" + noi disp as err "x " in y "= skipped - already exists (changed)" + noi disp as err "e " in y "= error" + noi disp "" + + local i = 0 + noi _dots 0, title(caching country queries with basic poverty lines 1.90, 3.20, and 5.50) reps(`ncty') + + + noi pip_time_convertor `cty_n_queries', type(Estimated) + + timer clear 4 + timer on 4 + + foreach country of local countries { + local ++i + + cap { + pip, countr(`country') clear server(`server') + sleep `stime' + pip, countr(`country') povline(3.2) clear server(`server') + sleep `stime' + pip, countr(`country') povline(5.5) clear server(`server') + sleep `stime' + } + if (_rc) { + + local cty_err "`cty_err' `country'" + + noi _dots `i' 2 + } + else { + noi _dots `i' 0 + } + + } + + + timer off 4 + timer list 4 + + local cty_actual = r(t4) + noi pip_time_convertor `cty_actual', type(Actual) + + if ("`cty_err'" != "") { + noi disp in red "countries with errors:" _n "`cty_err'" + } + + } // end of countries condition + +} + +/*================================================== +2: +==================================================*/ + + +*----------2.1: + + +*----------2.2: + + + + + +end + + +//======================================================== +// Extra programs +//======================================================== + + +program define pip_time_convertor , rclass +syntax anything(name=time id="time in 
seconds"), /// +type(string) /// +[ /// +noPRINT /// +] + +* Split a duration given in seconds into whole hours, minutes and seconds. +* floor()/mod() on the rounded total replaces the former mod(x, 2) parity test, +* which reported even hour counts (e.g. 2h30m) as 0 hours and 150 minutes and +* could round the minutes up while still adding the leftover seconds. +local total = round(`time') +local hour = floor(`total'/3600) +local minute = floor(mod(`total', 3600)/60) +local second = mod(`total', 60) + +* option noprint suppresses the on-screen summary; results are always returned in r() +if ("`print'" == "") { + disp _n "`type' time" + disp in y "hours: `hour'" _n "minutes: `minute'" _n "seconds: `second'" +} + +return local hours = `hour' +return local minutes = `minute' +return local seconds = `second' + + +end + + +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_cite.ado b/01.code/ado/p/pip_cite.ado new file mode 100755 index 0000000..487424d --- /dev/null +++ b/01.code/ado/p/pip_cite.ado @@ -0,0 +1,197 @@ +/*================================================== +project: Citation protocol for PIP wrapper and PIP database +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 13 Jun 2022 - 16:36:52 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_cite, rclass +syntax [anything(name=subcommand)], [ /// +version(string) /// +data_bibtext /// +ado_bibtext /// +reg_cite /// +] + +version 16.0 + + +/*================================================== +1: SET UP +==================================================*/ +*------------------ Initial 
Parameters ------------------ + +qui { + if ("${pip_ado_version}" == "") { + + findfile pip.ado + scalar pipado = fileread("`r(fn)'") + + mata: pip_ado() + + if regexm("`pipver'", "version +([0-9\.]+) +<([a-zA-Z0-9]+)>") { + global pip_ado_version = regexs(1) + global pip_ado_date = regexs(2) + } + + } // if global is not found + + global pip_adoyear = substr("${pip_ado_date}", 1, 4) + + + /*================================================== + 2: Regular citation + ==================================================*/ + *##s + if ("`version'" == "") { + qui cap pip_versions + local version = "`r(version)'" + } + + + //------------ display data bibtext + local data_date = substr("`version'", 1, 8) + local data_year = substr("`version'", 1, 4) + local data_date = date("`data_date'", "YMD") + local data_date: disp %tdCCYY-NN-DD `data_date' + local data_date = trim("`data_date'") + + local _version: subinstr local version "_" "\_", all +} + +if ("`reg_cite'" != "") { + local cite_ado = `"Castañeda, R.Andrés. (${pip_adoyear}) "pip: Stata Module to Access World Bank’s Global Poverty and Inequality Data" (version ${pip_ado_version}). Stata. Washington, DC: World Bank Group. https://worldbank.github.io/pip/"' + noi disp _n "{hline 90}" /// + as res in smcl "{p 2 8 2}Please cite this Stata tool as:{p_end}" /// + as text `"{p 6 10 4 90}`cite_ado'{p_end}"' /// + "{p 75 0 4}{stata pip_cite, ado_bibtext:bibtext}{p_end}" + + + local cite_data = `"World Bank. (`data_year'). Poverty and Inequality Platform (version `version') [Data set]. World Bank Group. 
https://pip.worldbank.org/"' + noi disp as res in smcl _n "{p 2 8 2}Please cite the PIP data as:{p_end}" /// + as text `"{p 6 10 4 100}`cite_data'{p_end}"' /// + "{p 75 0 4}{stata pip_cite, data_bibtext version(`version'):bibtext}{p_end}" + + return local cite_ado = `"`cite_ado'"' + return local cite_data = `"`cite_data'"' + + exit +} + + +/*================================================== +3: BibText +==================================================*/ +local date = date("`c(current_date)'", "DMY") // %tdDDmonCCYY +local time = clock("`c(current_time)'", "hms") // %tcHH:MM:SS +local date_time = `date'*24*60*60*1000 + `time' // %tcDDmonCCYY_HH:MM:SS +local datetimeHRF: disp %tcDDmonCCYY_HH:MM:SS `date_time' +local dateHRF: disp %tdCCYY-NN-DD `date' +local datetimeMaster: disp %tcCCYYNNDDHHMMSS `date_time' +local datetimeHRF = trim("`datetimeHRF'") +local dateHRF = trim("`dateHRF'") + + +//------------display ado bibtext +if ("${pip_cite_ado}" == "") { + * parse the ado date first, then format it (same sequence used for data_date above) + local ado_date = date("${pip_ado_date}", "YMD") + local ado_date: disp %tdCCYY-NN-DD `ado_date' + local ado_date = trim("`ado_date'") + + local crlf "`=char(10)'`=char(13)'" + global pip_cite_ado = /// + "{p 4 8 2}@software{castaneda${pip_adoyear},{p_end}" + /// + "{p 8 12 2}title = {\{pip\}: {{Stata}} Module to Access {{World Bank}}’s {{Global Poverty}} and {{Inequality}} Data},{p_end}" + /// + "{p 8 12 2}shorttitle = {PIP},{p_end}" + /// + "{p 8 12 2}author = {Castañeda, R.Andrés},{p_end}" + /// + "{p 8 12 2}date = {`ado_date'},{p_end}" + /// + "{p 8 12 2}location = {{Washington, DC}},{p_end}" + /// + "{p 8 12 2}url = {https://worldbank.github.io/pip/},{p_end}" + /// + "{p 8 12 2}urldate = {`dateHRF'},{p_end}" + /// + "{p 8 12 2}abstract = {Stata module to access World Bank’s Global Poverty and Inequality data},{p_end}" + /// + "{p 8 12 2}editora = {Degefu, Tefera Bekele},{p_end}" + /// + "{p 8 12 2}editoratype = {collaborator},{p_end}" + /// + "{p 8 12 2}organization = {{World Bank Group}},{p_end}" + /// + 
"{p 8 12 2}version = {${pip_ado_version}},{p_end}" + /// + "{p 8 12 2}keywords = {api-wrapper}{p_end}" + /// + "{p 4 8 2}}{p_end}" + +} + +if ("`ado_bibtext'" != "") { + disp _n in smcl `"${pip_cite_ado}"' + exit +} + +local pip_cite_data = /// +"{p 4 8 2}@dataset{worldbank`data_year',{p_end}" + /// +"{p 8 12 2}title = {Poverty and {{Inequality Platform}}},{p_end}" + /// +"{p 8 12 2}shorttitle = {{PIP} Database},{p_end}" + /// +"{p 8 12 2}author = {{World Bank}},{p_end}" + /// +"{p 8 12 2}date = {`data_year'},{p_end}" + /// +"{p 8 12 2}publisher = {{World Bank Group}},{p_end}" + /// +"{p 8 12 2}url = {https://pip.worldbank.org/},{p_end}" + /// +"{p 8 12 2}urldate = {`dateHRF'},{p_end}" + /// +"{p 8 12 2}langid = {english},{p_end}" + /// +"{p 8 12 2}version = {`_version'}{p_end}" + /// +"{p 4 8 2}}{p_end}" + +if ("`data_bibtext'" != "") { + disp _n in smcl `"`pip_cite_data'"' + exit +} + +*##e + +end + + +// ------------------------------------------------------------------------ +// MATA functions +// ------------------------------------------------------------------------ + + +* findfile stata.trk +* local fn = "`r(fn)'" + +cap mata: mata drop pip_*() +mata: + +// function to look for source of code +void pip_ado() { + lines = st_strscalar("pipado") + lines = ustrsplit(lines, "`=char(10)'")' + pipver = select(lines, regexm(lines, `"^\*!"'))[1] + st_local("pipver", pipver) +} + +end + + + + + +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. 
+ +Version Control: + + diff --git a/01.code/ado/p/pip_cl.ado b/01.code/ado/p/pip_cl.ado new file mode 100755 index 0000000..17e9035 --- /dev/null +++ b/01.code/ado/p/pip_cl.ado @@ -0,0 +1,208 @@ +/*================================================== +project: Interaction with the PIP API at the country level +Author: R.Andres Castaneda +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 5 Jun 2019 - 15:12:13 +Modification Date: October, 2021 +Do-file version: 01 +References: +Output: dta +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_cl, rclass +syntax , server(string) /// +handle(string) /// +[ /// +country(string) /// +year(string) /// +povline(numlist) /// +ppp_year(numlist) /// +coverage(string) /// +clear /// +pause /// +iso /// +noDIPSQuery /// +version(string) /// +] + +version 16.0 + +if ("`pause'" == "pause") pause on +else pause off + +qui { + /*================================================== + conditions and setup + ==================================================*/ + + + local base = "`server'/`handle'/pip" + + + if ("`povline'" == "") local povline 1.9 + if ("`ppp_year'" == "") local ppp_year -1 + if ("`coverage'" == "") local coverage -1 + + *---------- download guidance data + pip_info, clear justdata `pause' + + tempname pip_lkup + frame copy _pip_lkupb `pip_lkup', replace + + frame `pip_lkup' { + + + + + levelsof country_code, local(countries) clean + if (lower("`country'") != "all") { + + local uniq_country : list uniq country + + local ncountries: list uniq_country - countries + local avai_country: list uniq_country & countries + + local countries: list country | avai_country + + } + + if ("`ncountries'" != "") { + if wordcount("`ncountries'") == 1 local be "is" + if wordcount("`ncountries'") > 1 local be "are" + + noi disp as err "Warning: 
" _c + noi disp as input `"`ncountries' `be' not part of the country list"' /* + */ _n "available in PIP. See {stata pip, info}" + + } + + if ("`countries'" == "") { + noi disp in red "None of the countries provided in {it:country()} is available in PIP" + error 198 + } + + *---------- alternative macros + local ct = "`countries'" + local pl = "`povline'" + local pp = "`ppp_year'" + local yr = "`year'" + local cv = "`coverage'" + + /*================================================== + 1: Evaluate parameters + ==================================================*/ + + *----------1.1: counting words + + local nct = wordcount("`ct'") // number of countries + local npl = wordcount("`pl'") // number of poverty lines + local npp = wordcount("`pp'") // number of PPP values + local nyr = wordcount("`yr'") // number of years + local ncv = wordcount("`cv'") // number of coverage + + matrix A = `nct' \ `npl' \ `npp' \ `nyr' \ `ncv' + mata: A = st_matrix("A"); /* + */ B = ((A :== A[1]) + (A :== 1) :>= 1); /* + */ st_local("m", strofreal(mean(B))) + + if (`m' != 1) { + noi disp in r "number of elements in options {it:povline(), ppp_year(), year()} and " _n /* + */ "{it:coverage()} must be equal to 1 or to the number of countries in option {it:country()}" + error 197 + } + + *----------1.2: Expand macros of size one + local n = _n + foreach o in pl pp yr cv { + if (`n`o'' == 1) { + local `o': disp _dup(`nct') " ``o'' " + } + } + + /*================================================== + 2: Download data + ==================================================*/ + + *----------2.1: download data + tempfile clfile + local queryfull "`base'?format=csv" + return local queryfull = "`queryfull'" + + } // end of temp frame + + + cap import delimited "`queryfull'", `clear' asdouble + if (_rc) { + noi dis "" + noi dis in red "It was not possible to download data from the PIP API." 
+ noi dis "" + noi dis in white `"(1) Please check your Internet connection by "' _c + noi dis in white `"{browse "`server'/`handle'/health-check" :clicking here}"' + noi dis in white `"(2) Test that the data is retrievable. By"' _c + noi dis in white `"{stata pip test: clicking here }"' _c + noi dis in white "you should be able to download the data." + noi dis in white `"(3) Please consider adjusting your Stata timeout parameters. For more details see {help netio}"' + noi dis in white `"(4) Please send us an email to:"' + noi dis in white _col(8) `"email: data@worldbank.org"' + noi dis in white _col(8) `"subject: pip query error on `c(current_date)' `c(current_time)'"' + noi di "" + error 673 + } + + *---------- 2.2 create filter conditions in loop + local j = 0 + local n = 1 + local kquery = "" // whole filter condition to extract data + foreach ict of local countries { + + * corresponding element to each country + foreach o in pl yr pp cv { + local i`o': word `n' of ``o'' + } + + *---------- coverage + if inlist("`icv'", "-1", "all") & ("`ipp'" == "-1") { + local kquery "`kquery' | (country_code == "`ict'" & reporting_year == `iyr')" + return local kquery_`j' = "`kquery_`j''" + } + else if inlist("`icv'", "-1", "all") & ("`ipp'" != "-1") { + local kquery "`kquery' | (country_code == "`ict'" & reporting_year == `iyr' & ppp == "`ipp'")" + return local kquery_`j' = "`kquery_`j''" + } + else if !inlist("`icv'", "-1", "all") & ("`ipp'" == "-1") { + local kquery "`kquery' | (country_code == "`ict'" & reporting_year == `iyr' & survey_coverage == "`icv'")" + return local kquery_`j' = "`kquery_`j''" + } + else { + local kquery "`kquery' | (country_code == "`ict'" & reporting_year == `iyr' & ppp == "`ipp'" & survey_coverage == "`icv'")" + return local kquery_`j' = "`kquery_`j''" + } + + local ++j + local ++n + } + + local kquery : subinstr local kquery "|" " " + keep if `kquery' + +} +end +exit +/* End of do-file */ + 
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + + diff --git a/01.code/ado/p/pip_clean.ado b/01.code/ado/p/pip_clean.ado new file mode 100755 index 0000000..45c0a81 --- /dev/null +++ b/01.code/ado/p/pip_clean.ado @@ -0,0 +1,316 @@ +/*================================================== +project: Clean data downloaded from PIP API +Author: R.Andres Castaneda +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 5 Jun 2019 - 17:09:04 +Modification Date: September, 2021 +Do-file version: 02 +References: Adopted from povcalnet_clean +Output: dta +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_clean, rclass + +version 16.0 + +syntax anything(name=type), /// +[ /// +year(string) /// +region(string) /// +iso /// +fillgaps /// +nocensor /// +pause /// +version(string) /// +server(string) /// +] + +if ("`pause'" == "pause") pause on +else pause off + + +//------------ version +if ("`version'" != "") { + local version_qr = "&version=`version'" + tokenize "`version'", parse("_") + local _version = "_`1'_`3'_`9'" + local ppp_version = `3' +} +else { + local version_qr = "" + local _version = "" +} + + + +/*================================================== +1: type 1 +==================================================*/ + +qui if ("`type'" == "1") { + + if ("`year'" == "last"){ + bys country_code: egen maximum_y = max(reporting_year) + keep if maximum_y == reporting_year + drop maximum_y + } + + + *************************************************** + * 5. 
Labeling/cleaning + *************************************************** + // check if country data frame is available + + pip_info, clear justdata `pause' server(`server') version(`version') + + local orgvar reporting_pop reporting_pce + local newvar population reporting_hfce + + local i = 0 + foreach var of local orgvar { + local ++i + rename `var' `: word `i' of `newvar'' + } + + if "`iso'"!="" { + cap replace country_code = "XKX" if country_code == "KSV" + cap replace country_code = "TLS" if country_code == "TMP" + cap replace country_code = "PSE" if country_code == "WBG" + cap replace country_code = "COD" if country_code == "ZAR" + } + + *rename prmld mld + foreach v of varlist polarization median gini mld decile? decile10 { + qui cap replace `v'=. if `v'==-1 | `v' == 0 + } + + cap drop if ppp=="" + cap drop svyinfoid + + pause query - after replacing invalid values to missing values + + * cap drop polarization + qui count + local obs=`r(N)' + + tostring survey_coverage, replace + + replace survey_coverage = "1" if survey_coverage == "rural" + replace survey_coverage = "2" if survey_coverage == "urban" + replace survey_coverage = "4" if survey_coverage == "A" // not available in pip data + replace survey_coverage = "3" if survey_coverage == "national" + destring survey_coverage, force replace + label define survey_coverage 1 "rural" /* + */ 2 "urban" /* + */ 3 "national" /* + */ 4 "national (aggregate)", modify + + label values survey_coverage survey_coverage + + replace welfare_type = "1" if welfare_type == "consumption" + replace welfare_type = "2" if welfare_type == "income" + destring welfare_type, force replace + label define welfare_type 1 "consumption" 2 "income", modify + label values welfare_type welfare_type + + label var country_code "country/economy code" + label var country_name "country/economy name" + label var region_code "region code" + label var region_name "region name" + label var survey_coverage "survey coverage" + label var reporting_year 
"year" + label var survey_year "survey year" + label var welfare_type "welfare measured by income or consumption" + label var is_interpolated "data is interpolated" + label var distribution_type "data comes from grouped or microdata" + label var ppp "`ppp_version' purchasing power parity" + label var poverty_line "poverty line in `ppp_version' ppp prices (per capita per day)" + label var mean "average daily per capita income/consumption `ppp_version' ppp prices" + label var headcount "poverty headcount" + label var poverty_gap "poverty gap" + label var poverty_severity "squared poverty gap" + label var watts "watts index" + label var gini "gini index" + label var median "median daily income or expenditure in `ppp_version' ppp prices" + label var mld "mean log deviation" + label var polarization "polarization" + label var population "population in year" + + ds decile* + local vardec = "`r(varlist)'" + foreach var of local vardec { + if regexm("`var'", "([0-9]+)") local q = regexs(1) + label var `var' "decile `q' welfare share" + } + + label var reporting_level "reporting data level" + label var survey_acronym "survey acronym" + label var survey_comparability "survey comparability" + label var comparable_spell "comparability over time at country level" + label var cpi "consumer price index (cpi) in `ppp_version' base" + label var reporting_gdp "reported gdp" + label var reporting_hfce "reported per capita" + + sort country_code reporting_year survey_coverage + + //------------ Formatting + format headcount poverty_gap poverty_severity watts gini mld polarization /// + decile* mean /* survey_mean_ppp */ cpi %8.4f + + * format ppp survey_mean_lcu %10.2fc + format reporting_gdp reporting_hfce %15.2fc + + format population %15.0fc + + format poverty_line %6.2f + + //------------ New variable names + + local old "survey_year reporting_year reporting_gdp reporting_hfce" + local new "welfare_time year gdp hfce" + rename (`old') (`new') + */ + + //------------survey_time + + 
local frpipfw "_pip_fw`_version'" + + tempname frfw + frame copy `frpipfw' `frfw' + frame `frfw' { + drop year + rename reporting_year year + } + + frlink m:1 country_code year welfare_type, frame(`frfw') + frget survey_time, from(`frfw') + + + order country_code country_name region_code region_name reporting_level /// + year welfare_time welfare_type poverty_line mean headcount /// + poverty_gap poverty_severity watts gini /// + median mld polarization population decile? decile10 cpi ppp gdp hfce /// + survey_comparability /// + survey_acronym survey_time is_interpolated distribution_type survey_coverage + + + //------------remaining labels + label var welfare_time "time income or consumption refers to" + label var survey_time "time of survey in the field" + + //------------drop unnecesary variables + cap drop estimation_type + + if ("`fillgaps'" != "") { + drop ppp survey_time distribution_type gini mld polarization decile* median + } + + qui missings dropvars, force +} + +/*================================================== +2: for Aggregate requests +==================================================*/ +qui if ("`type'" == "2") { + + + if ("`year'" == "last") { + tempvar maximum_y + bys region_code: egen `maximum_y' = max(reporting_year) + keep if `maximum_y' == reporting_year + } + + *************************************************** + * 4. 
Renaming and labeling + *************************************************** + + rename reporting_pop population + + label var region_code "region code" + label var reporting_year "year" + label var poverty_line "poverty line in `ppp_version' ppp prices (per capita per day)" + label var mean "average daily per capita income/consumption in `ppp_version' ppp prices" + label var headcount "poverty headcount" + label var poverty_gap "poverty gap" + label var poverty_severity "squared poverty gap" + label var population "population in year" + label var pop_in_poverty "population in poverty" + label var watts "watts index" + label var region_name "world bank region" + + order region_name region_code reporting_year poverty_line /// + mean headcount poverty_gap poverty_severity watts /// + population + + //------------ Formatting + format headcount poverty_gap poverty_severity watts mean %8.4f + + format pop_in_poverty population %15.0fc + + format poverty_line %6.2f + + local old "reporting_year" + local new "year" + rename (`old') (`new') + + qui missings dropvars, force +} // end of type 2 + + +/*================================================== +Delete Obs with no headcount or Poverty line +==================================================*/ + +qui { + cap confirm var country_code, exact + if (_rc) { + local geo region_code + } + else { + local geo country_code + } + + tempvar misspl + gen `misspl' = poverty_line == . + qui count if `misspl' + if (r(N) > 0) { + noi disp as err "Warning: The API returned an invalid poverty " /// + "line for the following combinations. Observations are deleted." /// + _n "Please, contact PIP Technical Team at " + + noi list `geo' year if `misspl', clean noobs + drop if `misspl' + } + + tempvar misshc + gen `misshc' = headcount == . + qui count if `misshc' + if (r(N) > 0) { + noi disp as err "Warning: The following combinations do not " /// + "have valid poverty headcount. 
Obs will be deleted" + + noi list `geo' year poverty_line if `misshc', clean noobs + drop if `misshc' + } +} + + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_cleanup.ado b/01.code/ado/p/pip_cleanup.ado new file mode 100755 index 0000000..4b3c3eb --- /dev/null +++ b/01.code/ado/p/pip_cleanup.ado @@ -0,0 +1,22 @@ +/*================================================== +project: Clean up PIP files +Author: R.Andres Castaneda +---------------------------------------------------- +Creation Date: 28 Mar 2022 - 13:43:38 +==================================================*/ + +/*================================================== + 0: Program set up +==================================================*/ +program define pip_cleanup +* clean frames +pip_drop frame, frame_prefix(_pip_) qui + +* clean globals +pip_drop global +noi disp in y "PIP internal data has been cleaned up" +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< diff --git a/01.code/ado/p/pip_countries.sthlp b/01.code/ado/p/pip_countries.sthlp new file mode 100755 index 0000000..922e367 --- /dev/null +++ b/01.code/ado/p/pip_countries.sthlp @@ -0,0 +1,208 @@ +{smcl} +{* *! 
version 1.0 13 Mar 2014}{...} +{vieweralsosee "" "--"}{...} +{vieweralsosee "Install wbopendata" "ssc install wbopendata"}{...} +{vieweralsosee "Help wbopendata (if installed)" "help wbopendata"}{...} +{viewerjumpto "pip main help file" "pip"}{...} +{viewerjumpto "Regions" "pip_countries##regions"}{...} +{viewerjumpto "Countries" "pip_countries##countries"}{...} +{title:pip - Regions and Countries codes} +{phang} + +{marker regions}{...} +{p 40 20 2}(Get Back to {it:{help pip:main help file}}){p_end} +{title:Region Codes} + +{synoptset 33 tabbed}{...} +{col 7}{it:Code}{col 41}{cmd:Name} +{col 6}{hline 70} +{synopt:{opt EAP }} East Asia and Pacific{p_end} +{synopt:{opt ECA }} Europe and Central Asia{p_end} +{synopt:{opt OHI }} Other High Income{p_end} +{synopt:{opt LAC }} Latin America and the Caribbean{p_end} +{synopt:{opt MNA }} Middle East and North Africa{p_end} +{synopt:{opt SAS }} South Asia{p_end} +{synopt:{opt SSA }} Sub-Saharan Africa{p_end} +{col 6}{hline 70} +{p 6 6 0 76}{err:Note}: {it:Other High Income} includes mostly high-income economies +(for a list see p.
152 of World Bank, 2018, and the pip webpage); +all other economies are included in geographic regions, as defined by the World Bank.{p_end} + + + +{marker countries}{...} +{p 40 20 2}(Get Back to {it:{help pip:main help file}}){p_end} +{title:Country and Economy Acronyms} + +{synoptset 33 tabbed}{...} +{col 7}{it:Code}{col 41}{cmd:Name} +{synoptline} +{synopt:{opt ALB }} Albania{p_end} +{synopt:{opt DZA }} Algeria{p_end} +{synopt:{opt AGO }} Angola{p_end} +{synopt:{opt ARG }} Argentina{p_end} +{synopt:{opt ARM }} Armenia{p_end} +{synopt:{opt AUS }} Australia{p_end} +{synopt:{opt AUT }} Austria{p_end} +{synopt:{opt AZE }} Azerbaijan{p_end} +{synopt:{opt BGD }} Bangladesh{p_end} +{synopt:{opt BLR }} Belarus{p_end} +{synopt:{opt BEL }} Belgium{p_end} +{synopt:{opt BLZ }} Belize{p_end} +{synopt:{opt BEN }} Benin{p_end} +{synopt:{opt BTN }} Bhutan{p_end} +{synopt:{opt BOL }} Bolivia{p_end} +{synopt:{opt BIH }} Bosnia and Herzegovina{p_end} +{synopt:{opt BWA }} Botswana{p_end} +{synopt:{opt BRA }} Brazil{p_end} +{synopt:{opt BGR }} Bulgaria{p_end} +{synopt:{opt BFA }} Burkina Faso{p_end} +{synopt:{opt BDI }} Burundi{p_end} +{synopt:{opt CPV }} Cabo Verde{p_end} +{synopt:{opt CMR }} Cameroon{p_end} +{synopt:{opt CAN }} Canada{p_end} +{synopt:{opt CAF }} Central African Republic{p_end} +{synopt:{opt TCD }} Chad{p_end} +{synopt:{opt CHL }} Chile{p_end} +{synopt:{opt CHN }} China{p_end} +{synopt:{opt COL }} Colombia{p_end} +{synopt:{opt COM }} Comoros{p_end} +{synopt:{opt COD }} Congo, Democratic Republic of{p_end} +{synopt:{opt COG }} Congo, Republic of{p_end} +{synopt:{opt CRI }} Costa Rica{p_end} +{synopt:{opt CIV }} Cote d'Ivoire{p_end} +{synopt:{opt HRV }} Croatia{p_end} +{synopt:{opt CYP }} Cyprus{p_end} +{synopt:{opt CZE }} Czech Republic{p_end} +{synopt:{opt DNK }} Denmark{p_end} +{synopt:{opt DJI }} Djibouti{p_end} +{synopt:{opt DOM }} Dominican Republic{p_end} +{synopt:{opt ECU }} Ecuador{p_end} +{synopt:{opt EGY }} Egypt, Arab Republic of{p_end} +{synopt:{opt SLV 
}} El Salvador{p_end} +{synopt:{opt EST }} Estonia{p_end} +{synopt:{opt SWZ }} Eswatini{p_end} +{synopt:{opt ETH }} Ethiopia{p_end} +{synopt:{opt FJI }} Fiji{p_end} +{synopt:{opt FIN }} Finland{p_end} +{synopt:{opt FRA }} France{p_end} +{synopt:{opt GAB }} Gabon{p_end} +{synopt:{opt GMB }} Gambia, The{p_end} +{synopt:{opt GEO }} Georgia{p_end} +{synopt:{opt DEU }} Germany{p_end} +{synopt:{opt GHA }} Ghana{p_end} +{synopt:{opt GRC }} Greece{p_end} +{synopt:{opt GTM }} Guatemala{p_end} +{synopt:{opt GIN }} Guinea{p_end} +{synopt:{opt GNB }} Guinea-Bissau{p_end} +{synopt:{opt GUY }} Guyana{p_end} +{synopt:{opt HTI }} Haiti{p_end} +{synopt:{opt HND }} Honduras{p_end} +{synopt:{opt HUN }} Hungary{p_end} +{synopt:{opt ISL }} Iceland{p_end} +{synopt:{opt IND }} India{p_end} +{synopt:{opt IDN }} Indonesia{p_end} +{synopt:{opt IRN }} Iran, Islamic Republic of{p_end} +{synopt:{opt IRQ }} Iraq{p_end} +{synopt:{opt IRL }} Ireland{p_end} +{synopt:{opt ISR }} Israel{p_end} +{synopt:{opt ITA }} Italy{p_end} +{synopt:{opt JAM }} Jamaica{p_end} +{synopt:{opt JPN }} Japan{p_end} +{synopt:{opt JOR }} Jordan{p_end} +{synopt:{opt KAZ }} Kazakhstan{p_end} +{synopt:{opt KEN }} Kenya{p_end} +{synopt:{opt KIR }} Kiribati{p_end} +{synopt:{opt KOR }} Korea, Republic of{p_end} +{synopt:{opt XKX }} Kosovo{p_end} +{synopt:{opt KGZ }} Kyrgyz Republic{p_end} +{synopt:{opt LAO }} Lao People's Democratic Republic{p_end} +{synopt:{opt LVA }} Latvia{p_end} +{synopt:{opt LBN }} Lebanon{p_end} +{synopt:{opt LSO }} Lesotho{p_end} +{synopt:{opt LBR }} Liberia{p_end} +{synopt:{opt LTU }} Lithuania{p_end} +{synopt:{opt LUX }} Luxembourg{p_end} +{synopt:{opt MDG }} Madagascar{p_end} +{synopt:{opt MWI }} Malawi{p_end} +{synopt:{opt MYS }} Malaysia{p_end} +{synopt:{opt MDV }} Maldives{p_end} +{synopt:{opt MLI }} Mali{p_end} +{synopt:{opt MLT }} Malta{p_end} +{synopt:{opt MRT }} Mauritania{p_end} +{synopt:{opt MUS }} Mauritius{p_end} +{synopt:{opt MEX }} Mexico{p_end} +{synopt:{opt FSM }} Micronesia, Federated 
States of{p_end} +{synopt:{opt MDA }} Moldova{p_end} +{synopt:{opt MNG }} Mongolia{p_end} +{synopt:{opt MNE }} Montenegro{p_end} +{synopt:{opt MAR }} Morocco{p_end} +{synopt:{opt MOZ }} Mozambique{p_end} +{synopt:{opt MMR }} Myanmar{p_end} +{synopt:{opt NAM }} Namibia{p_end} +{synopt:{opt NPL }} Nepal{p_end} +{synopt:{opt NLD }} Netherlands{p_end} +{synopt:{opt NIC }} Nicaragua{p_end} +{synopt:{opt NER }} Niger{p_end} +{synopt:{opt NGA }} Nigeria{p_end} +{synopt:{opt MKD }} North Macedonia{p_end} +{synopt:{opt NOR }} Norway{p_end} +{synopt:{opt PAK }} Pakistan{p_end} +{synopt:{opt PAN }} Panama{p_end} +{synopt:{opt PNG }} Papua New Guinea{p_end} +{synopt:{opt PRY }} Paraguay{p_end} +{synopt:{opt PER }} Peru{p_end} +{synopt:{opt PHL }} Philippines{p_end} +{synopt:{opt POL }} Poland{p_end} +{synopt:{opt PRT }} Portugal{p_end} +{synopt:{opt ROU }} Romania{p_end} +{synopt:{opt RUS }} Russian Federation{p_end} +{synopt:{opt RWA }} Rwanda{p_end} +{synopt:{opt WSM }} Samoa{p_end} +{synopt:{opt STP }} Sao Tome and Principe{p_end} +{synopt:{opt SEN }} Senegal{p_end} +{synopt:{opt SRB }} Serbia{p_end} +{synopt:{opt SYC }} Seychelles{p_end} +{synopt:{opt SLE }} Sierra Leone{p_end} +{synopt:{opt SVK }} Slovak Republic{p_end} +{synopt:{opt SVN }} Slovenia{p_end} +{synopt:{opt SLB }} Solomon Islands{p_end} +{synopt:{opt SOM }} Somalia{p_end} +{synopt:{opt ZAF }} South Africa{p_end} +{synopt:{opt SSD }} South Sudan{p_end} +{synopt:{opt ESP }} Spain{p_end} +{synopt:{opt LKA }} Sri Lanka{p_end} +{synopt:{opt LCA }} St. 
Lucia{p_end} +{synopt:{opt SDN }} Sudan{p_end} +{synopt:{opt SUR }} Suriname{p_end} +{synopt:{opt SWE }} Sweden{p_end} +{synopt:{opt CHE }} Switzerland{p_end} +{synopt:{opt SYR }} Syrian Arab Republic{p_end} +{synopt:{opt TJK }} Tajikistan{p_end} +{synopt:{opt TZA }} Tanzania{p_end} +{synopt:{opt THA }} Thailand{p_end} +{synopt:{opt TLS }} Timor-Leste{p_end} +{synopt:{opt TGO }} Togo{p_end} +{synopt:{opt TON }} Tonga{p_end} +{synopt:{opt TTO }} Trinidad and Tobago{p_end} +{synopt:{opt TUN }} Tunisia{p_end} +{synopt:{opt TUR }} Turkey{p_end} +{synopt:{opt TKM }} Turkmenistan{p_end} +{synopt:{opt TUV }} Tuvalu{p_end} +{synopt:{opt UGA }} Uganda{p_end} +{synopt:{opt UKR }} Ukraine{p_end} +{synopt:{opt GBR }} United Kingdom{p_end} +{synopt:{opt USA }} United States{p_end} +{synopt:{opt URY }} Uruguay{p_end} +{synopt:{opt UZB }} Uzbekistan{p_end} +{synopt:{opt VUT }} Vanuatu{p_end} +{synopt:{opt VEN }} Venezuela, Republica Bolivariana de{p_end} +{synopt:{opt VNM }} Vietnam{p_end} +{synopt:{opt PSE }} West Bank and Gaza{p_end} +{synopt:{opt YEM }} Yemen, Republic of{p_end} +{synopt:{opt ZMB }} Zambia{p_end} +{synopt:{opt ZWE }} Zimbabwe{p_end} +{synoptline} + +{p 40 20 2}(Go up to {it:{help pip_countries:top}}){p_end} + diff --git a/01.code/ado/p/pip_drop.ado b/01.code/ado/p/pip_drop.ado new file mode 100755 index 0000000..ca4746c --- /dev/null +++ b/01.code/ado/p/pip_drop.ado @@ -0,0 +1,76 @@ +/*================================================== +project: +Author: R.Andres Castaneda +---------------------------------------------------- +Creation Date: 20 Jan 2022 - 09:44:03 +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_drop, rclass +syntax [anything(name=subcommand)], [frame_prefix(string) qui] + + +qui { + //======================================================== + // Frames + 
//======================================================== + if regexm("`subcommand'", "^frame") { + + if ("`frame_prefix'" == "") { + local frame_prefix "pip_" + } + + //------------ Remove frames + + frame dir + local av_frames "`r(frames)'" + + foreach fr of local av_frames { + + if (regexm("`fr'", "(^`frame_prefix')(.+)")) { + + frame drop `fr' + + local dropped "`dropped' `fr'" + } + + } // loop over frames + + if ("`dropped'" == "") { + if ("`qui'" == "") noi disp in y "NO frame was dropped" + } + else { + if ("`qui'" == "") { + noi disp in y "The following internal frames were dropped:" + foreach f of local dropped { + noi disp in w "`f'" + } + } + } + + + } + + //======================================================== + // Globals + //======================================================== + if regexm("`subcommand'", "^global") { + local pip_globals: all globals "pip_*" + * disp "`pip_globals'" + foreach gl of local pip_globals { + if regexm("`gl'", "svr") continue + global `gl' "" + } + + } + + +} // end of qui + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< diff --git a/01.code/ado/p/pip_drop_frames.ado b/01.code/ado/p/pip_drop_frames.ado new file mode 100755 index 0000000..64860bf --- /dev/null +++ b/01.code/ado/p/pip_drop_frames.ado @@ -0,0 +1,57 @@ +/*================================================== +project: +Author: R.Andres Castaneda +---------------------------------------------------- +Creation Date: 20 Jan 2022 - 09:44:03 +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_drop_frames, rclass +syntax, [frame_prefix(string)] + +//======================================================== +// Conditions +//======================================================== + +if ("`frame_prefix'" == "") { + local frame_prefix "pip_" +} + + 
+//======================================================== +// Remove frames +//======================================================== + +qui { + + frame dir + local av_frames "`r(frames)'" + + foreach fr of local av_frames { + + if (regexm("`fr'", "(^`frame_prefix')(.+)")) { + + frame drop `fr' + + local dropped "`dropped' `fr'" + } + + } // loop over frames + + if ("`dropped'" == "") { + noi disp in y "NO frame was dropped" + } + else { + noi disp in r "frames `dropped' were dropped" + } + +} // end of qui + + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< diff --git a/01.code/ado/p/pip_examples.ado b/01.code/ado/p/pip_examples.ado new file mode 100755 index 0000000..3890e9b --- /dev/null +++ b/01.code/ado/p/pip_examples.ado @@ -0,0 +1,416 @@ +********************************************************************************* +*pip_examples-: Auxiliary program for -pip- * +*! v1.0 sept2018 by Jorge Soler Lopez * +* Espen Beer Prydz * +* Christoph Lakner * +* Ruoxuan Wu * +* Qinghua Zhao * +* World Bank Group * +*! based on JP Azevedo wbopendata_examples * +********************************************************************************* + +* ---------------------------------------------------------------------------- +* 0. 
Main program +* ---------------------------------------------------------------------------- + +capture program drop pip_examples +program pip_examples +version 16.0 +args EXAMPLE +set more off +`EXAMPLE' +end + + +* ---------------------------------------------------------------------------- +* World Poverty Trend (reference year) +* ---------------------------------------------------------------------------- +program define pip_example01 + + pip wb, clear + + keep if year > 1989 + keep if region_code == "WLD" + gen poorpop = headcount * population/ 1000000 + gen hcpercent = round(headcount*100, 0.1) + gen poorpopround = round(poorpop, 1) + + twoway (sc hcpercent year, yaxis(1) mlab(hcpercent) /// + mlabpos(7) mlabsize(vsmall) c(l)) /// + (sc poorpopround year, yaxis(2) mlab(poorpopround) /// + mlabsize(vsmall) mlabpos(1) c(l)), /// + yti("Poverty Rate (%)" " ", size(small) axis(1)) /// + ylab(0(10)40, labs(small) nogrid angle(0) axis(1)) /// + yti("Number of Poor (million)", size(small) axis(2)) /// + ylab(0(400)2000, labs(small) angle(0) axis(2)) /// + xlabel(,labs(small)) xtitle("Year", size(small)) /// + graphregion(c(white)) ysize(5) xsize(5) /// + legend(order( /// + 1 "Poverty Rate (% of people living below $2.15)" /// + 2 "Number of people who live below $2.15") si(vsmall) /// + row(2)) scheme(s2color) + +end +* ---------------------------------------------------------------------------- +* Millions of poor by region (reference year) +* ---------------------------------------------------------------------------- +program define pip_example02 + pip wb, clear + keep if year > 1989 + gen poorpop = headcount * population /1000000 + gen hcpercent = round(headcount*100, 0.1) + gen poorpopround = round(poorpop, 1) + encode region_name, gen(rid) + + levelsof rid, local(regions) + foreach region of local regions { + local legend = `"`legend' `region' "`: label rid `region''" "' + } + + keep year rid poorpop + reshape wide poorpop,i(year) j(rid) + foreach i of 
numlist 2(1)7{ + egen poorpopacc`i'=rowtotal(poorpop1 - poorpop`i') + } + + twoway (area poorpop1 year) /// + (rarea poorpopacc2 poorpop1 year) /// + (rarea poorpopacc3 poorpopacc2 year) /// + (rarea poorpopacc4 poorpopacc3 year) /// + (rarea poorpopacc5 poorpopacc4 year) /// + (rarea poorpopacc6 poorpopacc5 year) /// + (rarea poorpopacc7 poorpopacc6 year) /// + (line poorpopacc7 year, lwidth(midthick) lcolor(gs0)), /// + ytitle("Millions of Poor" " ", size(small)) /// + xtitle(" " "", size(small)) scheme(s2color) /// + graphregion(c(white)) ysize(7) xsize(8) /// + ylabel(,labs(small) nogrid angle(vertical)) xlabel(,labs(small)) /// + legend(order(`legend') si(vsmall)) +end + +* ---------------------------------------------------------------------------- +* Categories of income and poverty in LAC +* ---------------------------------------------------------------------------- +program pip_example03 + pip, region(lac) year(last) povline(2.15 3.65 6.85) clear + keep if welfare_type ==2 & year>=2014 // keep income surveys + keep poverty_line country_code country_name year headcount + replace poverty_line = poverty_line*100 + replace headcount = headcount*100 + tostring poverty_line, replace format(%12.0f) force + reshape wide headcount,i(year country_code country_name ) j(poverty_line) string + + gen percentage_0 = headcount215 + gen percentage_1 = headcount365 - headcount215 + gen percentage_2 = headcount685 - headcount365 + gen percentage_3 = 100 - headcount685 + + keep country_code country_name year percentage_* + reshape long percentage_,i(year country_code country_name ) j(category) + la define category 0 "Extreme poor (< $2.15)" 1 "Poor LIMIC ($2.15-$3.65)" /// + 2 "Poor UMIC ($3.65-$6.85)" 3 "Non-poor (> $6.85)" + la val category category + la var category "" + + local title "Distribution of Income in Latin America and Caribbean, by country" + local note "Source: PIP, using the latest survey after 2014 for each country."
+ local yti "Population share in each income category (%)" + + graph bar (mean) percentage, inten(*0.7) o(category) o(country_code, /// + lab(labsi(small) angle(vertical)) sort(1) descending) stack asy /// + blab(bar, pos(center) format(%3.1f) si(tiny)) /// + ti("`title'", si(small)) note("`note'", si(*.7)) /// + graphregion(c(white)) ysize(6) xsize(6.5) /// + legend(si(vsmall) r(3)) yti("`yti'", si(small)) /// + ylab(,labs(small) nogrid angle(0)) scheme(s2color) +end + +* ---------------------------------------------------------------------------- +* Trend of Gini +* ---------------------------------------------------------------------------- +program pip_example04 +pip, country(arg gha tha) year(all) clear + replace gini = gini * 100 + keep if welfare_time > 1989 + twoway (connected gini welfare_time if country_code == "ARG") /// + (connected gini welfare_time if country_code == "GHA") /// + (connected gini welfare_time if country_code == "THA"), /// + ytitle("Gini Index" " ", size(small)) /// + xtitle(" " "", size(small)) ylabel(,labs(small) nogrid /// + angle(vertical)) xlabel(,labs(small)) /// + graphregion(c(white)) scheme(s2color) /// + legend(order(1 "Argentina" 2 "Ghana" 3 "Thailand") si(small) row(1)) + +end + +* ---------------------------------------------------------------------------- +* Growth incidence curves +* ---------------------------------------------------------------------------- +program pip_example05 + pip, country(arg gha tha) year(all) clear + reshape long decile, i(country_code welfare_time) j(dec) + + egen panelid=group(country_code dec) + replace welfare_time =int(welfare_time) + xtset panelid welfare_time + + replace decile=10*decile*mean + gen g=(((decile/L5.decile)^(1/5))-1)*100 + + replace g=(((decile/L7.decile)^(1/7))-1)*100 if country_code=="GHA" + replace dec=10*dec + + twoway (sc g dec if welfare_time ==2016 & country_code=="ARG", c(l)) /// + (sc g dec if welfare_time ==2005 & country_code=="GHA", c(l)) /// + (sc g dec if
welfare_time ==2015 & country_code=="THA", c(l)), /// + yti("Annual growth in decile average income (%)" " ", /// + size(small)) xlabel(0(10)100,labs(small)) /// + xtitle("Decile group", size(small)) graphregion(c(white)) /// + legend(order(1 "Argentina(2011-2016)" /// + 2 "Ghana(1998-2005)" 3 "Thailand(2010-2015)") /// + si(vsmall) row(1)) scheme(s2color) + +end + +* ---------------------------------------------------------------------------- +* Gini & per capita GDP +* ---------------------------------------------------------------------------- +program pip_example06 + set checksum off + wbopendata, indicator(NY.GDP.PCAP.PP.KD) long clear + rename countrycode country_code + tempfile PerCapitaGDP + save `PerCapitaGDP', replace + + pip, povline(2.15) country(all) year(last) clear iso + keep country_code country_name year gini + drop if gini == -1 + * Merge Gini coefficient with per capita GDP + merge m:1 country_code year using `PerCapitaGDP', keep(match) + replace gini = gini * 100 + drop if ny_gdp_pcap_pp_kd == . + + gen loggdp = log10(ny_gdp_pcap_pp_kd) + + twoway (scatter gini loggdp, mfcolor(%0) /// + msize(vsmall)) (lfit gini loggdp), /// + ylabel(, format(%2.0f)) /// + ytitle("Gini Index" " ", size(small)) /// + xtitle(" " "GDP per Capita per Year (in 2017 USD PPP)", /// + size(small)) graphregion(c(white)) ysize(5) xsize(7) /// + ylabel(,labs(small) nogrid angle(vertical)) /// + xlabel(,labs(small)) scheme(s2color) /// + legend(order(1 "Gini Index" 2 "Fitted Value") si(small)) +end + + + + +* ---------------------------------------------------------------------------- +* Regional Poverty Evolution +* ---------------------------------------------------------------------------- +program define pip_example07 + pip wb, povline(2.15 3.65 6.85) clear + drop if inlist(region_code, "OHI", "WLD") | year<1990 + keep poverty_line region_name year headcount + replace poverty_line = poverty_line*100 + replace headcount = headcount*100 + drop if headcount == .
+ + tostring poverty_line, replace format(%12.0f) force + reshape wide headcount,i(year region_name) j(poverty_line) string + + local title "Poverty Headcount Ratio (1990-2019), by region" + + twoway (sc headcount215 year, c(l) msiz(small)) /// + (sc headcount365 year, c(l) msiz(small)) /// + (sc headcount685 year, c(l) msiz(small)), /// + by(reg, title("`title'", si(med)) /// + note("Source: PIP", si(vsmall)) graphregion(c(white))) /// + ylabel(, format(%2.0f)) /// + xlab(1990(5)2019 , labsi(vsmall)) xti("Year", si(vsmall)) /// + ylab(0(25)100, labsi(vsmall) angle(0)) /// + yti("Poverty headcount (%)", si(vsmall)) /// + leg(order(1 "$2.15" 2 "$3.65" 3 "$6.85") r(1) si(vsmall)) /// + sub(, si(small)) scheme(s2color) +end + + + + + +// ------------------------------------------------------------------------ +// National level and longest available series (temporal change in welfare) +// ------------------------------------------------------------------------ + +program define pip_example08 + +pip, clear + +* Prepare reporting_level variable +label define level 3 "national" 2 "urban" 1 "rural" +encode reporting_level, gen(reporting_level_2) label(level) + +* keep only national +bysort country_code welfare_type year: egen _ncover = count(reporting_level_2 ) +gen _tokeepn = ( (inlist(reporting_level_2 , 3, 4) & _ncover > 1) | _ncover == 1) + +keep if _tokeepn == 1 + +* Keep longest series per country +by country_code welfare_type , sort: gen _ndtype = _n == 1 +by country_code : replace _ndtype = sum(_ndtype) +by country_code : replace _ndtype = _ndtype[_N] // number of welfare_type per country + +duplicates tag country_code year, gen(_yrep) // duplicate year + +bysort country_code welfare_type : egen _type_length = count(year) // length of type series +bysort country_code: egen _type_max = max(_type_length) // longest type series +replace _type_max = (_type_max == _type_length) + +* in case of same length in series, keep consumption +by country_code _type_max, sort: 
gen _ntmax = _n == 1 +by country_code : replace _ntmax = sum(_ntmax) +by country_code : replace _ntmax = _ntmax[_N] // number of welfare_type per country + + +gen _tokeepl = ((_type_max == 1 & _ntmax == 2) | /// + (welfare_type == 1 & _ntmax == 1 & _ndtype == 2) | /// + _yrep == 0) + +keep if _tokeepl == 1 +drop _* + +end + +// ------------------------------------------------------------------------ +// National level and longest available series of same welfare type +// ------------------------------------------------------------------------ + +program define pip_example09 + +pip, clear + +* Prepare reporting_level variable +label define level 3 "national" 2 "urban" 1 "rural" +encode reporting_level, gen(reporting_level_2) label(level) + +* keep only national +bysort country_code welfare_type year: egen _ncover = count(reporting_level_2 ) +gen _tokeepn = ( (inlist(reporting_level_2 , 3, 4) & _ncover > 1) | _ncover == 1) + +keep if _tokeepn == 1 +* Keep longest series per country +by country_code welfare_type , sort: gen _ndtype = _n == 1 +by country_code : replace _ndtype = sum(_ndtype) +by country_code : replace _ndtype = _ndtype[_N] // number of welfare_type per country + + +bysort country_code welfare_type : egen _type_length = count(year) +bysort country_code: egen _type_max = max(_type_length) +replace _type_max = (_type_max == _type_length) + +* in case of same length in series, keep consumption +by country_code _type_max, sort: gen _ntmax = _n == 1 +by country_code : replace _ntmax = sum(_ntmax) +by country_code : replace _ntmax = _ntmax[_N] // max + + +gen _tokeepl = ((_type_max == 1 & _ntmax == 2) | /// + (welfare_type == 1 & _ntmax == 1 & _ndtype == 2)) | /// + _ndtype == 1 + +keep if _tokeepl == 1 +drop _* + +end + +//======================================================== +// Longest series +//======================================================== + +program pip_example10 + pip, clear + *Series length by welfare type + bysort country_code 
welfare_type: gen series = _N + *Longest + bysort country_code : egen longest_series=max(series) + tab country_code if series !=longest_series + keep if series == longest_series + + *2. If same length: keep most recent + bys country_code welfare_type series: egen latest_year=max(year) + bysort country_code: egen most_recent=max(latest_year) + + tab country_code if longest_series==series & latest_year!=most_recent + drop if most_recent>latest_year + + *3. Not Applicable: if equal length and most recent: keep consumption + bys country_code: egen preferred_welfare=min(welfare_type) + drop if welfare_type != preferred_welfare + +end + +//======================================================== +// replicate lineup estimates +//======================================================== + +program pip_example11 + +ip cleanup +global country_code "NAM" + +//Load survey poverty estimates +tempname pip +frame create `pip' +frame `pip' { + pip, country(${country_code}) clear coverage(national) + decode welfare_type, gen(wt) +} + +// merge with pip results +pip tables, table(interpolated_means) clear +frlink m:1 country_code welfare_time welfare_type, /// + frame(`pip' country_code welfare_time wt) + +//Poverty line to query +gen double pl_to_query = 2.15 * frval(`pip', mean)/predicted_mean_ppp +keep if pl_to_query < . 
+ +//Weights for interpolated means +gen double interpol_wt = 1 / abs(welfare_time - year) +egen double interpol_wtt = total(interpol_wt),by(year) +gen double interpol_shr = interpol_wt/interpol_wtt +gen double survey_year = floor(welfare_time) +sort country_code year welfare_time + +keep if inrange(year, 2000, 2015) // modify to take less time +//Initialize empty data set to store results +tempname results dtloop +frame create `results' str3 country_code double(year hc wgt) +frame copy `c(frame)' `dtloop' +local N = _N +forvalues row=1/`N' { + + loc ccc = _frval(`dtloop', country_code, `row') + loc yy = _frval(`dtloop', year, `row') + loc yyyy = _frval(`dtloop', survey_year, `row') + loc pl = _frval(`dtloop', pl_to_query, `row') + loc wgt = _frval(`dtloop', interpol_shr, `row') + + pip, clear country(`ccc') year(`yyyy') coverage(national) povline(`pl') + frame post `results' ("`ccc'") (`yy') (headcount[1]) (`wgt') +} + +//Apply weights for interpolated poverty estimates +frame `results': collapse (mean) headcount=hc [w = wgt], by( country_code year) + +//Check results +pip, clear country(${country_code}) fillgaps +keep country_code year headcount +frlink 1:1 country_code year, frame(`results') +gen double d_hc = headcount/frval(`results', headcount, .a) +sum d_hc + + +end diff --git a/01.code/ado/p/pip_find_src.ado b/01.code/ado/p/pip_find_src.ado new file mode 100755 index 0000000..6bdc42f --- /dev/null +++ b/01.code/ado/p/pip_find_src.ado @@ -0,0 +1,130 @@ +/*================================================== +project: Find source of PIP installation +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 13 Feb 2023 - 17:27:32 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up 
+==================================================*/
+* pip_find_src: looks for the stata.trk tracking file in `path' (default
+* PLUS) and reports where pip was installed from.
+* Returns: r(path), r(src) ("ssc", "gh" or "NotInstalled"),
+*          r(trk_code) and r(trk_sources) for the pip entries found.
+program define pip_find_src, rclass
+syntax [anything(name=scmd)] ///
+[, ///
+path(string) ///
+pause ///
+]
+version 16.0
+
+
+if ("`pause'" == "pause") pause on
+else pause off
+
+
+/*==================================================
+1: if not found
+==================================================*/
+
+* This ado-file is inspired by the command `dependencies` by Diana Gold
+
+if ("`path'" == "") {
+  local path = "PLUS"
+}
+return local path = "`path'"
+capture findfile "stata.trk", path(`"`path'"') all
+local stata_trk_list `"`r(fn)'"'
+
+* _rc still holds the return code of the captured findfile above
+if _rc != 0 {
+  noi dis as res "{cmd: pip} has not been installed from either SSC or " ///
+    "GitHub in directory `path'. You could," as text ///
+    "{p 6 6 2} 1. Search for {cmd:pip} in a different directory using {it:path()} option {p_end}" ///
+    "{p 6 6 2} 2. Install stable version from SSC, {stata pip install ssc} {p_end}" ///
+    "{p 6 6 2} 3. Install development version from Github {stata pip install gh} {p_end}"
+  // return info
+  noi disp in res "Return {it:NotInstalled} in r(src)"
+  return local src = "NotInstalled"
+  exit
+}
+
+/*==================================================
+2: If found
+==================================================*/
+
+qui else {
+
+  * findfile, all may return several stata.trk files (a command can exist
+  * in two places, ie: PLUS & PERSONAL); the last file in the list is
+  * the one parsed below
+  local n_stata_trk : list sizeof stata_trk_list
+  local statatrk: word `n_stata_trk' of `stata_trk_list'
+
+
+  tempname trk
+  frame create `trk'
+  frame `trk' {
+    * Each line is considered a single observation - then parsed later
+    import delimited using `"`statatrk'"', delimiter(`"`=char(10)'"')
+
+    * First character marks: S (source) N (name) D (installation date) d (description) f (files) U(stata tracker) e(end)
+    gen marker = substr(v1, 1, 1)
+    keep if inlist(marker, "S", "N")
+    gen pkg_name = substr(v1, 3, .) if marker == "N"
+
+    * pkg_code numbers the packages in file order: it increases by one
+    * at every S (source) row
+    local p = 0
+    gen pkg_code = `p'
+    forvalues i = 1/`=_N' {
+      if (marker[`i'] == "S") {
+        local p = `p' + 1
+        * an S row takes the name stored in the row that follows it
+        replace pkg_name = pkg_name[`i' + 1] in `i'
+      }
+      else if (marker[`i'] == "N" ) {
+        local last_pkg_name = pkg_name[`i']
+      }
+      else {
+        * only S and N rows are kept above, so this branch is not
+        * expected to run
+        replace pkg_name = "`last_pkg_name'" in `i'
+      }
+      replace pkg_code = `p' in `i'
+    } // end of for loop by obs
+
+    // get those lines with pip package
+    keep if regexm(pkg_name, "^pip") & marker == "S"
+    levelsof pkg_code, clean local(trk_code)
+    return local trk_code = "`trk_code'"
+
+    levelsof v1 if marker == "S", local(trk_sources)
+    local trk_sources: subinstr local trk_sources "S " "", all
+
+    return local trk_sources = `"`trk_sources'"'
+
+    // get last source
+    // a source line containing "repec" is treated as SSC; an empty one
+    // (no pip rows left) as not installed; anything else as GitHub
+    if regexm(v1[_N], "repec") local src = "ssc"
+    else if (v1[_N] == "") local src = "NotInstalled"
+    else local src "gh"
+
+  }
+
+  return local src = "`src'"
+} // end of condition
+
+
+end
+exit
+/* End of do-file */
+
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><
+
+Notes:
+1.
+2.
+3.
+
+
+Version Control:
+
+
diff --git a/01.code/ado/p/pip_gh.ado b/01.code/ado/p/pip_gh.ado
new file mode 100755
index 0000000..14fe730
--- /dev/null
+++ b/01.code/ado/p/pip_gh.ado
@@ -0,0 +1,227 @@
+/*==================================================
+project: message to the user if file is installed from GitHub
+Author: R.Andres Castaneda
+E-email: acastanedaa@worldbank.org
+url:
+Dependencies: The World Bank
+----------------------------------------------------
+Creation Date: 6 Oct 2022 - 16:35:59
+Modification Date:
+Do-file version: 01
+References:
+Output:
+==================================================*/
+
+/*==================================================
+  0: Program set up
+==================================================*/
+* pip_gh: helper for installations hosted on GitHub.
+* Subcommands: update  - compare the installed version recorded in
+*                        github.dta with the latest GitHub release and
+*                        offer to install it; returns r(bye)
+*              msg | message - display a note about the GitHub source
+*              install - call pip_install gh
+* Options username() and cmd() default to "worldbank" and "pip".
+program define pip_gh, rclass
+version 16.0
+
+
+syntax [anything(name=subcommand)] ///
+[, ///
+username(string) ///
+cmd(string) ///
+version(string) ///
+pause ///
+]
+
+if ("`pause'" == "pause") pause on
+else pause off
+
+if ("`cmd'" == "") {
+  local cmd pip
+}
+
+if ("`username'" == "") {
+  local username "worldbank"
+}
+
+
+/*==================================================
+  1: Update
+==================================================*/
+
+if ("`subcommand'" == "update") {
+
+  * Check repository of files
+  * local cmd pip
+  cap findfile github.dta, path("`c(sysdir_plus)'g/")
+  if (_rc) {
+    github install `username'/`cmd', replace
+    cap window stopbox note "pip command has been reinstalled to " ///
+      "keep record of new updates. Please type {stata discard} and retry."
+    global pip_cmds_ssc = ""
+    exit
+  }
+  local ghfile "`r(fn)'"
+  * use "`ghfile'", clear
+
+  tempname ghdta
+  frame create `ghdta'
+  frame `ghdta' {
+    use "`ghfile'", clear
+    qui keep if name == "`cmd'"
+    if _N == 0 {
+      di in red "`cmd' package was not found"
+      github install `username'/`cmd', replace
+      cap window stopbox note "pip command has been reinstalled to " ///
+        "keep record of new updates. Please type discard and retry."
+      global pip_cmds_ssc = ""
+      exit
+    }
+    if _N > 1 {
+      di as err "{p}multiple {cmd:pip} packages found!" ///
+        "this can be caused if you had installed multiple " ///
+        "packages from different repositories, but with an " ///
+        "identical name..." _n
+      noi list
+    }
+    if _N == 1 {
+      local repo : di address[1]
+      local crrtversion : di version[1]
+    }
+  }
+
+  * When the tracked package is not unique, `repo' is empty and the query
+  * below would abort on empty input. Fall back to the default repository;
+  * the empty `crrtversion' then triggers the forced reinstall further down.
+  if ("`repo'" == "") local repo "`username'/`cmd'"
+
+  * github query `repo'
+  _tmp_githubquery `repo'
+  local latestversion = "`r(latestversion)'"
+  * disp "`latestversion'"
+
+  * Version components default to 0, so a missing or unparsable version
+  * string compares as the oldest possible version
+  foreach part in Major Minor Patch Devel {
+    local last`part' 0
+    local crr`part' 0
+  }
+
+  if regexm("`latestversion'", "([0-9]+)\.([0-9]+)\.([0-9]+)\.?([0-9]*)") {
+    local lastMajor = regexs(1)
+    local lastMinor = regexs(2)
+    local lastPatch = regexs(3)
+    local lastDevel = regexs(4)
+  }
+  if ("`lastDevel'" == "") local lastDevel 0
+
+  * github version `cmd'
+  * local crrtversion = "`r(version)'"
+  if regexm("`crrtversion'", "([0-9]+)\.([0-9]+)\.([0-9]+)\.?([0-9]*)"){
+    local crrMajor = regexs(1)
+    local crrMinor = regexs(2)
+    local crrPatch = regexs(3)
+    local crrDevel = regexs(4)
+  }
+  if ("`crrDevel'" == "") local crrDevel 0
+
+  * force installation
+  if ("`crrtversion'" == "") {
+    local username "worldbank" // to modify
+    github install `username'/`cmd', replace version(`latestversion')
+    cap window stopbox note "pip command has been reinstalled to " ///
+      "keep record of new updates. Please type discard and retry."
+    global pip_cmds_ssc = ""
+    exit
+  }
+
+  * Compare major, minor, patch and devel in that order; the first
+  * component that differs decides. Gluing the digits together
+  * (1.0.0 -> 100, 0.10.1 -> 101) mis-orders versions whose components
+  * have a different number of digits.
+  local newer = 0
+  foreach part in Major Minor Patch Devel {
+    if (`last`part'' > `crr`part'') {
+      local newer = 1
+      continue, break
+    }
+    if (`last`part'' < `crr`part'') continue, break
+  }
+
+  if (`newer') {
+    cap window stopbox rusure "There is a new version of `cmd' in Github (`latestversion')." ///
+      "Would you like to install it now?"
+
+    if (_rc == 0) {
+      cap github install `repo', replace version(`latestversion')
+      if (_rc == 0) {
+        cap window stopbox note "Installation complete. please type" ///
+          "discard in your command window to finish"
+        local bye "exit"
+      }
+      else {
+        noi disp as err "there was an error in the installation. " _n ///
+          "please run the following to retry, " _n(2) ///
+          "{stata github install `repo', replace}"
+        local bye "error"
+      }
+    }
+    else local bye ""
+
+  } // end of checking github update
+  else {
+    noi disp as result "Github version of {cmd:`cmd'} is up to date."
+    local bye ""
+  }
+
+  * r(bye) holds a command name ("exit", "error") or nothing
+  return local bye = "`bye'"
+
+} // end if installed from github
+
+
+
+/*==================================================
+  2: Message
+==================================================*/
+if (inlist("`subcommand'", "msg", "message")) {
+  noi disp "You're using Github as the host of the {cmd:pip} Stata package."
+  noi disp "If you want to install the SSC version type {stata pip_install ssc}"
+}
+
+//========================================================
+// Install
+//========================================================
+
+if (inlist("`subcommand'", "install")) {
+  pip_install gh, replace version(`version')
+}
+
+
+end
+
+//========================================================
+// Aux programs
+//========================================================
+
+// Temporary github query: reads the releases page of the GitHub API for
+// repository `anything' and returns in r(latestversion) the tag of the
+// first "releases/tag" entry found. Data in memory is preserved.
+program define _tmp_githubquery, rclass
+syntax anything
+
+qui {
+
+  preserve
+  drop _all
+
+  local page "https://api.github.com/repos/`anything'/releases"
+  scalar page = fileread(`"`page'"')
+  mata {
+    lines = st_strscalar("page")
+    lines = ustrsplit(lines, ",")'
+    lines = strtrim(lines)
+    lines = stritrim(lines)
+
+    lines = subinstr(lines, `"":""', "->")
+    lines = subinstr(lines, `"""', "")
+  }
+  getmata lines, replace
+
+  split lines, parse ("->")
+  rename lines? (code url)
+
+  keep if regexm(url, "releases/tag")
+  gen tag = regexs(2) if regexm(url, "(releases/tag/)(.*)")
+  local latestversion = tag[1]
+
+}
+
+return local latestversion `latestversion'
+
+end
+
+exit
+
+/* End of do-file */
+
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><
+
+Notes:
+1.
+2.
+3.
+
+
+Version Control:
+
+
diff --git a/01.code/ado/p/pip_info.ado b/01.code/ado/p/pip_info.ado
new file mode 100755
index 0000000..f3eb62a
--- /dev/null
+++ b/01.code/ado/p/pip_info.ado
@@ -0,0 +1,328 @@
+*********************************************************************************
+* pip_info *
+*********************************************************************************
+* Builds (once per session and data version) frames with the countries,
+* regions and framework auxiliary tables plus a lookup frame, and then
+* displays clickable lists of the available surveys:
+*   no country() and no region -> all countries and regions
+*   country()                  -> surveys of that country
+*   region                     -> years by region
+* With justdata the frames are created and nothing is displayed.
+
+
+program def pip_info, rclass
+
+version 16.0
+
+syntax [, ///
+COUntry(string) ///
+REGion ///
+AGGregate ///
+clear ///
+justdata /// programmers option
+pause /// debugging
+server(string) ///
+version(string) ///
+POVCALNET_format ///
+]
+
+if ("`pause'" == "pause") pause on
+else pause off
+
+qui {
+
+  //if ("`clear'" == "") preserve
+
+  *---------- API defaults
+  pip_set_server `server', `pause'
+  local server = "`r(server)'"
+  local url = "`r(url)'"
+  return add
+
+  //------------ version
+  * frame names carry a suffix made of the 1st, 2nd and 5th "_"-separated
+  * parts of version() (tokens 1, 3 and 9, since separators are tokens too)
+  if ("`version'" != "") {
+    local version_qr = "&version=`version'"
+    tokenize "`version'", parse("_")
+    local _version = "_`1'_`3'_`9'"
+  }
+  else {
+    local version_qr = ""
+    local _version = ""
+  }
+
+
+  ***************************************************
+  * 0. Info frame
+  ***************************************************
+
+  local curframe = c(frame)
+
+  //------------ Find available frames
+  * Regular expression matching exactly one existing frame name. The
+  * closing "$" is required: without it an existing frame such as
+  * _pip_cts also matches the longer, versioned name _pip_cts_<version>,
+  * and that frame would never be created.
+  frame dir
+  local av_frames "`r(frames)'"
+  local av_frames: subinstr local av_frames " " "|", all
+  local av_frames = "^(" + "`av_frames'" + ")$"
+
+  //------------ countries frame
+  local frpipcts "_pip_cts`_version'"
+  if (!regexm("`frpipcts'", "`av_frames'")) {
+
+    frame create `frpipcts'
+
+    frame `frpipcts' {
+
+      cap pip_tables countries, server(`server') version(`version') clear
+      local rc1 = _rc
+
+      if (`rc1' == 0) {
+        cap confirm new var iso2_code
+        if (_rc) {
+          drop iso2_code
+        }
+        sort country_code
+      }
+    }
+
+    // drop frame if error happened
+    if (`rc1' != 0) {
+      local csvfile0 = "`url'/aux?table=countries`version_qr'&format=csv"
+
+      noi disp in red "There is a problem accessing country name data."
+      noi disp in red "to check your connection, copy and paste in your browser the following address:" _n /*
+      */ _col(4) in w `"`csvfile0'"'
+      frame drop `frpipcts'
+      error
+    }
+
+  }
+
+  //------------ regions frame
+  local frpiprgn "_pip_regions`_version'"
+  if (!regexm("`frpiprgn'", "`av_frames'")) {
+
+    frame create `frpiprgn'
+
+    frame `frpiprgn' {
+
+      cap pip_tables regions, server(`server') version(`version') clear
+      local rc1 = _rc
+
+      if (`rc1' == 0) {
+        drop grouping_type
+        sort region_code
+      }
+    }
+
+    // drop frame if error happened
+    if (`rc1' != 0) {
+      local csvfilergn = "`url'/aux?table=regions`version_qr'&format=csv"
+      noi disp in red "There is a problem accessing region name data."
+      noi disp in red "to check your connection, copy and paste in your browser the following address:" _n /*
+      */ _col(4) in w `"`csvfilergn'"'
+      frame drop `frpiprgn'
+      error
+    }
+
+  }
+
+
+
+  //------------ regions price framework
+  local frpipfw "_pip_fw`_version'"
+  if (!regexm("`frpipfw'", "`av_frames'")) {
+    frame create `frpipfw'
+
+    frame `frpipfw' {
+
+      cap pip_tables framework, server(`server') version(`version') clear
+      * store the return code right after the captured call and only
+      * transform the data when the download worked; otherwise the
+      * uncaptured commands below would abort here and leave an empty
+      * frame behind
+      local rc1 = _rc
+
+      if (`rc1' == 0) {
+        //------------format variables to make them link to data.
+        rename welfare_type wt
+        label define welfare_type 1 "consumption" 2 "income"
+        encode wt, gen(welfare_type)
+      }
+    }
+
+    // drop frame if error happened
+    if (`rc1' != 0) {
+      local csvfile2 = "`url'/aux?table=framework`version_qr'&format=csv"
+
+      noi disp in red "There is a problem accessing framework name data."
+      noi disp in red "to check your connection, copy and paste in your browser the following address:" _n /*
+      */ _col(4) in w `"`csvfile2'"'
+      frame drop `frpipfw'
+      error
+    }
+
+  }
+
+  *if ("`justdata'" != "") exit
+
+  //========================================================
+  // generating a lookup data
+  //========================================================
+  // one row per country and coverage level, with the survey years
+  // collected in the comma-separated string variable year
+
+  local frlkupb "_pip_lkupb`_version'"
+  if (!regexm("`frlkupb'", "`av_frames'")) {
+
+    frame copy `frpipfw' `frlkupb'
+
+
+    frame `frlkupb' {
+
+      keep country_code country_name wb_region_code pcn_region_code survey_coverage surveyid_year
+
+      local orgvar survey_coverage surveyid_year
+      local newvar coverage_level reporting_year
+
+      local i = 0
+      foreach var of local orgvar {
+        local ++i
+        rename `var' `: word `i' of `newvar''
+      }
+
+      tostring reporting_year, replace
+      gen year = reporting_year
+      duplicates drop
+
+      reshape wide year, i( wb_region_code pcn_region_code country_code coverage_level country_name) j(reporting_year) string
+
+      egen year = concat(year*), p(" ")
+      replace year = stritrim(year)
+      replace year = subinstr(year," ", ",",.)
+
+      keep country_code country_name wb_region_code pcn_region_code coverage_level year
+      order country_code country_name wb_region_code pcn_region_code coverage_level year
+
+    }
+  }
+
+
+  * the listings below filter a working copy, so the lookup frame stays intact
+  local frlkupwr "_pip_lkup_wrk"
+  frame copy `frlkupb' `frlkupwr', replace
+  if ("`justdata'" != "") exit
+
+  ***************************************************
+  * 2. Initial listing with countries and regions
+  ***************************************************
+
+  if ("`country'" == "") & ("`region'" == "") {
+
+    noi disp in y _n "{title:Available Surveys}: " in g "Select a country or region"
+    noi disp in y _n "{title: Countries}"
+
+    frame `frlkupwr' {
+
+      quietly levelsof country_code , local(countries)
+      local current_line = 0
+      foreach cccc of local countries{
+        local current_line = `current_line' + 1
+        local display_this = "{stata pip_info, country(`cccc') clear server(`server') version(`version'): `cccc'} "
+        if (`current_line' < 8) noi display in y `"`display_this'"' _continue
+        else{
+          noi display in y `"`display_this'"'
+          local current_line = 0
+        }
+      }
+
+      noi disp in y _n(2) "{title: Regions}"
+      quietly levelsof wb_region, local(regions)
+
+      foreach i_reg of local regions{
+        local current_line = 0
+        local dipsthis "{stata pip, region(`i_reg') year(all) aggregate clear:`i_reg' }"
+        noi disp " `dipsthis' " _c
+      }
+    } // end of frame
+
+    noi display in y _n "{stata pip_info, region clear: World Bank regions by year}"
+    noi display _n ""
+
+    cwf `curframe'
+    exit
+  } // end of condition
+
+  ***************************************************
+  * 3. Listing of country surveys
+  ***************************************************
+
+  if ("`country'" != "") & ("`region'" == "") {
+
+    frame `frlkupwr' {
+
+      noi disp in y _n "{title:Available Surveys for `country'}"
+      local country = upper("`country'")
+      keep if country_code == "`country'"
+
+      local link_detail = "`url'/Docs/CountryDocs/`country'.htm"
+      noi display `"{browse "`link_detail'" : Detailed information (browser)}"'
+
+      local nobs = _N
+      local current_line = 0
+      local index_s = 1
+
+      foreach n of numlist 1/`nobs' {
+        noi disp in y _n "`=country_name[`index_s']'-`=coverage_level[`index_s']'"
+        noi disp in y "survey year"
+        local years_current = "`=year[`index_s']'"
+        local coverage = "`=coverage_level[`index_s']'"
+        local years_current: subinstr local years_current "," " ", all
+        local index_s = `index_s'+ 1
+
+        foreach ind_y of local years_current {
+          local current_line = `current_line' + 1
+          local ind_y_c=substr("`ind_y'",1,4)
+          local display_this = "{stata pip, country(`country') year(`ind_y') server(`server') coverage(`coverage') clear: `ind_y_c'}"
+          if (`current_line' < 10) noi display in y `"`display_this'"' _continue
+
+          else {
+            noi display in y `"`display_this'"'
+            local current_line = 0
+          }
+
+        } // end of inner loop
+
+        noi display `"{stata pip, country(`country') year(all) coverage(`coverage') clear: All}"'
+
+      } // end of loop
+
+
+      noi display _n ""
+
+    } // end of frame
+
+    cwf `curframe'
+    exit
+  } // end of condition
+
+  ***************************************************
+  * 4. Listing of regions
+  ***************************************************
+  if ("`country'" == "") & ("`region'" != "") {
+
+    frame `frlkupwr' {
+      noi disp in y _n "{title:Available Surveys}"
+      noi disp in y _n "{title:Select a Year}"
+
+      quietly levelsof wb_region, local(regions)
+
+      foreach i_reg of local regions{
+        local current_line = 0
+        noi disp in y _n "`i_reg'"
+        * the list of years is read from the global macro refyears
+        local years_current = "$refyears"
+        foreach ind_y of local years_current {
+          local current_line = `current_line' + 1
+          local display_this = "{stata pip, region(`i_reg') year(`ind_y') aggregate clear: `ind_y'}"
+          if (`current_line' < 10) noi display in y `"`display_this'"' _continue
+          else{
+            noi display in y `"`display_this'"'
+            local current_line = 0
+          }
+        }
+        noi display in y "{stata pip, region(`i_reg') year(all) aggregate clear: All}"
+      } // end of loop
+
+    } // end of frame
+    noi display _n ""
+    cwf `curframe'
+    exit
+
+  } // end of condition
+
+} // end of large quietly
+
+end
diff --git a/01.code/ado/p/pip_install.ado b/01.code/ado/p/pip_install.ado
new file mode 100755
index 0000000..afdef59
--- /dev/null
+++ b/01.code/ado/p/pip_install.ado
@@ -0,0 +1,177 @@
+/*==================================================
+project: utility to install pip easily
+Author: R.Andres Castaneda
+E-email: acastanedaa@worldbank.org
+url:
+Dependencies: The World Bank
+----------------------------------------------------
+Creation Date: 6 Oct 2022 - 18:43:11
+Modification Date:
+Do-file version: 01
+References:
+Output:
+==================================================*/
+
+/*==================================================
+0: Program set up
+==================================================*/
+* pip_install: installs pip from SSC ("ssc") or GitHub ("gh"/"github"),
+* or removes it ("uninstall"). When no source is given, the source
+* reported by pip_find_src is used. Sets global pip_source on success.
+program define pip_install, rclass
+version 16.0
+
+
+syntax [anything(name=src)] ///
+[, ///
+username(string) ///
+cmd(string) ///
+version(string) ///
+pause ///
+replace ///
+path(string) ///
+]
+
+if ("`pause'" == "pause") pause on
+else pause off
+
+if ("`cmd'" == "") {
+  local cmd pip
+}
+
+if ("`username'" == "") {
+  local username "worldbank"
+}
+
+
+/*==================================================
+1: Search source
+==================================================*/
+qui pip_find_src, path(`path')
+local osrc = "`r(src)'" // original or installed source
+if ("`src'" == "") {
+  local src = "`osrc'"
+}
+
+//========================================================
+// Uninstall
+//========================================================
+
+// number of pip versions installed
+local trk_code = "`r(trk_code)'"
+local trk_srcs = `"`r(trk_sources)'"'
+local path = "`r(path)'"
+
+local ncodes: list sizeof trk_code
+
+if ("`src'" == "uninstall" | `ncodes' > 1) {
+
+  if (`ncodes' > 1) {
+
+    // _request(_confirm) stores the typed answer in local macro confirm
+    noi disp as err "There is more than one version of PIP installed" ///
+      " in the same search path, `path'." _n ///
+      "You need to uninstall {cmd:pip} in `path' or change installation path" ///
+      " with option {it:path()}" _n ///
+      "Type {it:yes} in the console and hit enter to confirm you agree to uninstall {cmd:pip}" _request(_confirm)
+    if ("`confirm'" != "yes") {
+      error
+    }
+  }
+  // remove one tracked entry per iteration and query the tracker again
+  // until no pip entry is left
+  while ("`trk_code'" != "") {
+    local srcs : word 1 of `trk_srcs'
+    if regexm(`"`srcs'"', "repec") {
+      ado uninstall [`: word 1 of `trk_code'']
+    }
+    else {
+      github uninstall [`: word 1 of `trk_code'']
+    }
+    pip_find_src, path(`path')
+    local trk_code = "`r(trk_code)'"
+    local trk_srcs = `"`r(trk_sources)'"'
+  }
+
+  if ("`trk_code'" == "") {
+    noi disp as text "{cmd:pip} was successfully uninstalled"
+    if ("`src'" == "uninstall") exit
+  }
+  else {
+    error
+  }
+}
+
+
+
+/*==================================================
+Install
+==================================================*/
+
+if (inlist(lower("`src'"), "github", "gh")) {
+  local src "gh"
+  local source "GitHub"
+  local alt_src "ssc"
+  local alt_source "SSC"
+}
+else if (lower("`src'") == "ssc") {
+  local src "ssc"
+  local source "SSC"
+  local alt_src "gh"
+  local alt_source "GitHub"
+}
+else {
+  noi disp as error "source `src' is not available. Use either {it:gh} or {it:ssc}"
+  error
+}
+
+// remove the currently installed copy with the tool that matches its source
+if ("`osrc'" == "ssc") {
+  cap ado uninstall pip
+}
+else {
+  cap github uninstall pip
+}
+if (_rc) {
+  if (_rc == 111) {
+    noi disp "package pip does not seems to be installed" // this should not ever happen
+  }
+  else {
+    error _rc
+  }
+}
+
+if ("`src'" == "gh") {
+  cap which github
+  if (_rc) {
+    net install github, from("https://haghish.github.io/github/")
+  }
+
+  cap noi github install `username'/`cmd', `replace' version(`version')
+}
+else {
+  cap noi ssc install pip, `replace'
+}
+if (_rc) {
+  noi disp as error _n "Something went wrong with the installation from `source'."
+
+  if ("`src'" == "gh") {
+    noi disp `"For troubleshooting, you can follow the instructions {browse "https://github.com/worldbank/pip#from-github":here}"'
+  }
+
+  noi disp "Alternatively, you could install {cmd:pip} from `alt_source'. Just type {stata pip_install `alt_src', replace}"
+  error
+}
+global pip_source = "`src'"
+
+noi disp "You have successfully installed {cmd:pip} from `source'. " ///
+"Please type {stata discard} to load the recently installed version"
+
+end
+exit
+/* End of do-file */
+
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><
+
+Notes:
+1.
+2.
+3.
+
+
+Version Control:
+
+
diff --git a/01.code/ado/p/pip_new_session.ado b/01.code/ado/p/pip_new_session.ado
new file mode 100755
index 0000000..dd4c64d
--- /dev/null
+++ b/01.code/ado/p/pip_new_session.ado
@@ -0,0 +1,83 @@
+/*==================================================
+project: Execute in each new Stata session
+Author: R.Andres Castaneda
+E-email: acastanedaa@worldbank.org
+url:
+Dependencies: The World bank
+----------------------------------------------------
+Creation Date: 30 Nov 2021 - 16:05:24
+Modification Date:
+Do-file version: 01
+References:
+Output:
+==================================================*/
+
+/*==================================================
+0: Program set up
+==================================================*/
+* pip_new_session: runs pip_update, makes sure the SSC packages listed in
+* ssc_cmds are installed and up to date, and sets the globals
+* pip_cmds_ssc and pip_source.
+program define pip_new_session, rclass
+version 16.0
+
+syntax [anything(name=subcommand)] ///
+[, ///
+path(string) ///
+pause ///
+]
+
+if ("`pause'" == "pause") pause on
+else pause off
+
+
+/*==================================================
+1: Update PIP
+==================================================*/
+
+* r(src) and r(bye) are read right after pip_update, before any other
+* command replaces the r() results
+pip_update, path(`path')
+local pip_source = "`r(src)'"
+local bye = "`r(bye)'"
+
+/*==================================================
+2: Dependencies
+==================================================*/
+
+*---------- check SSC commands
+
+local ssc_cmds missings
+
+noi disp in y "Note: " in w "{cmd:pip} requires the packages " ///
+"below from SSC: " _n in g "`ssc_cmds'"
+
+foreach cmd of local ssc_cmds {
+  capture which `cmd'
+  if (_rc != 0) {
+    ssc install `cmd'
+    noi disp in g "{cmd:`cmd'} " in w _col(15) "installed"
+  }
+}
+
+* first call lists the packages with updates; they are updated only if any
+adoupdate `ssc_cmds', ssconly
+if ("`r(pkglist)'" != "") adoupdate `r(pkglist)', update ssconly
+
+
+* ----- Globals
+global pip_cmds_ssc = 1 // make sure it does not execute again per session
+global pip_source = "`pip_source'"
+* `bye' is executed as a command: it holds whatever pip_update returned
+* in r(bye), or nothing
+`bye'
+
+end
+
+exit
+
+/* End of do-file */
+
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><
+
+Notes:
+1.
+2.
+3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_note.sthlp b/01.code/ado/p/pip_note.sthlp new file mode 100755 index 0000000..3dea513 --- /dev/null +++ b/01.code/ado/p/pip_note.sthlp @@ -0,0 +1,97 @@ +{smcl} +{* *! version 1.0.0 20 sep 2019}{...} + +{title:Description of the components in the option {cmd:version()}} + +{phang} +{bf:Data Versioning} : One of the main features of PIP is to provide the user with the possibility to +use any vintage (or version) of the PIP data. The vintage control has the following structure: {it: {bf: %Y%m%d_YYYY_RV_AV_SSS}}, + where each component (separated by "_") is described as follows: + +{p 8 8 2}{it:{bf: %Y%m%d}} : Same as option {it:release()}, refers to an 8-digit number (format YYYYMMDD) that conforms, +in general, to the release date of PIP. However, it could refer to any date. Other dates are available only to +internal WB users; external users have access only to the publicly released versions. +What identifies a folder as an official release is the suffix SSS, which is explained below. + +{p 8 8 2}{it:{bf: YYYY}} : Refers to the PPP round (e.g. 2011, 2017 at present). + +{p 8 8 2}{it:{bf: RV}} : Refers to the {ul:release version of the PPP round}. The same round of PPPs may +be revised by the International Comparison Program (ICP) team. + +{p 8 8 2}{it:{bf: AV}} : Refers to the {ul:adaptation version of the release version of the PPP round}. +The PPPs that are published and revised by the ICP team are adapted by the PIP team. +This is only done after careful technical analysis and only affects a few countries. + +{p 8 8 2}{it:{bf:SSS}} : Refers to the {ul:identity} of the folder (PROD, INT, or TEST). See the description +of each {cmd:identity()} folder in the section below. {cmd:NOTE}: external users can only use the identity PROD +in the {it:_SSS} suffix. INT and TEST are for internal WB users only. + +{p 8 8 2}*An example of a specific version is the following: 20220909_2017_01_02_PROD. 
This version is a public release (PROD suffix). +It was released on 9 September 2022. It uses the first release of the 2017 PPPs (the 2017 PPPs were published in May 2020 +and have not been revised as of November 2022). It uses the second adaptation of the 2017 PPPs (identifying multiple +adaptations that were created internally in the preparation of the public release). + +{title:Description of the options {cmd:server()} and {cmd:identity()} {err:[For internal WB users ONLY]}} + +{phang} +{opt server(string)} Three servers (PROD, QA, and DEV) are available in PIP. +This option is only available internally for WB staff. The following are descriptions for each server: + +{p 8 8 2}{bf:1) PROD (Production)}: This server contains the data published by the Bank externally +in {browse "https://pip.worldbank.org/":pip.worldbank.org}. That is, it is available outside the Bank's intranet +and is the only version also available to external users. Use the {cmd:server(prod)} option to access +pip data in the PROD server. + +{p 8 8 2}{bf:2) QA (Quality Assurance server)}: This server is available within the Bank's intranet to check +new versions of pip data before they are released externally. This option can be used in the +pip stata command as {cmd:server(qa)}. + +{p 8 8 2}{bf:3) DEV (Development)}: This server is used for testing new PIP features and methodological improvements. +This option can be used in the pip stata command as {cmd:server(dev)}. + +{phang} +{err: IMPORTANT!:} In order to access data from the QA and DEV servers you need to {cmd:contact} +{browse "pip@worldbank.org":pip@worldbank.org}. + +{phang} +{opt identity(string)} Within the DEV server, there exist different versions of the data. +To specify the version of PIP data, include the optional parameter {cmd:identity()}. +The option {cmd:identity()} has three possible values (prod, int, and test). When identity() is not specified, the default is prod. 
+Here are descriptions of each of these values: + +{p 8 8 2}{bf: PROD}: Refers to production. This type of folder can be found in {ul:any of the three servers} +explained above (PROD, QA, and DEV). Within the PROD and QA servers, this is the only available data version, +so this distinction only matters for the DEV server. Only folders with identity(prod) will be considered by the API as production +folders that may be deployed in the main API and website. + +{p 8 8 2}{bf: INT}: Refers to folders that will be used internally by specific people for specific purposes. +These folders are available {ul:only in the DEV server}. These folders will not be sent to production. +These folders are static and should not be modified by the PIP technical team. +An INT folder might be created for a particular paper and will remain unchanged for replication and archiving purposes. +If the contents of an INT folder need to be sent to production, they have to be recreated as a PROD folder. + +{p 8 8 2}{bf: TEST}: Refers to testing folders that can be modified as needed by the PIP +technical team. These folders are available {ul:only in the DEV server}. + +{p 8 8 2}*An example might help to illustrate the differences between the server() and identity() options: Within the DEV server, a new version of the PIP data begins as identity(INT). For example, it is used to try out a methodological innovation. When the work is completed, a decision is made whether to archive and keep this dataset as identity(INT) or whether to eventually move it to production, i.e. turn it into identity(PROD). If it becomes a prod dataset, it then passes from the DEV server to the QA server for further testing and quality assurance. To publish the dataset externally, it is transferred from the QA to the PROD server. 
+ +{title:Examples} + +{phang} +Here are some examples that show the use of the {cmd:server()} and {cmd:identity()} options: + +{phang} +1) {stata pip versions, server(prod)} -> displays all the versions in the PROD server. All these datasets are publicly available. + +{phang} +2) {stata pip, country(PRY) year(2012) server(dev) clear} -> loads the estimates for Paraguay in 2012 from the DEV server. The order in which the data versions are searched within the DEV server are first {it:prod}, then {it:int}, +and then {it:test}. + +{phang} +3) {stata pip, country(PRY) year(2012) server(dev) identity(INT) clear} -> loads the estimates for Paraguay in 2012 from the DEV server from the INT folder. + +{phang} +{help pip:Go back to pip help page} + + + diff --git a/01.code/ado/p/pip_povcalnet_format.ado b/01.code/ado/p/pip_povcalnet_format.ado new file mode 100755 index 0000000..eb4f6ee --- /dev/null +++ b/01.code/ado/p/pip_povcalnet_format.ado @@ -0,0 +1,138 @@ +/*================================================== +project: Transform to PovcalNet old format +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 1 Dec 2021 - 11:27:35 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_povcalnet_format, rclass +syntax [anything(name=type)] /// +[, /// +pause /// +] + +if ("`pause'" == "pause") pause on +else pause off + +version 16.0 + + +noi disp in red "Warning: option {it:povcalnet_format} is intended only to " _n /// +"replicate results or to use Stata code that still" _n /// +"executes the deprecated {cmd:povcalnet} command." 
+ +/*================================================== +1: country data +==================================================*/ +ren year requestyear +ren population reqyearpopulation +if ("`type'" == "1") { + + local vars1 country_code region_code reporting_level welfare_time /* + */ welfare_type is_interpolated distribution_type poverty_line poverty_gap /* + */ poverty_severity country_name + + local vars2 countrycode regioncode coveragetype datayear datatype isinterpolated usemicrodata /* + */povertyline povgap povgapsqr countryname + + local i = 0 + foreach var of local vars1 { + local ++i + cap confirm var `var', exact + if _rc continue + rename `var' `: word `i' of `vars2'' + } + + local keepvars countrycode countryname regioncode coveragetype requestyear /* + */ datayear datatype isinterpolated usemicrodata /* + */ ppp povertyline mean headcount povgap povgapsqr watts gini /* + */ median mld polarization reqyearpopulation decile? decile10 + + foreach v of local keepvars { + cap confirm var `v', exact + if _rc continue + local tokeep "`tokeep' `v'" + } + keep `tokeep' + order `tokeep' + + + * Standardize names with R package + local Snames requestyear reqyearpopulation + + local Rnames year population + + local i = 0 + foreach var of local Snames { + local ++i + rename `var' `: word `i' of `Rnames'' + } + + sort countrycode year coveragetype + + //------------ Convert to monthly values + replace mean = mean * (360/12) + replace median = median * (360/12) + +} + +/*================================================== +2: for Aggregate requests +==================================================*/ +if ("`type'" == "2") { + + //------------ Renaming and labeling + + rename region_code regioncode + rename poverty_line povertyline + rename poverty_gap povgap + rename poverty_severity povgapsqr + + keep requestyear regioncode povertyline mean headcount povgap /// + povgapsqr reqyearpopulation + order requestyear regioncode povertyline mean headcount povgap /// + 
povgapsqr reqyearpopulation + + local Snames requestyear reqyearpopulation + + local Rnames year population + + local i = 0 + foreach var of local Snames { + local ++i + rename `var' `: word `i' of `Rnames'' + } + + //------------ Convert to monthly values + replace mean = mean * (360/12) + +} // end of type 2 + + + + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_query.ado b/01.code/ado/p/pip_query.ado new file mode 100755 index 0000000..9e620aa --- /dev/null +++ b/01.code/ado/p/pip_query.ado @@ -0,0 +1,283 @@ +********************************************************************************* +* pip_query * +********************************************************************************* +program def pip_query, rclass + +version 16.0 + +syntax [anything(name=subcommand)] /// +[, /// +YEAR(string) /// +COUntry(string) /// +REGion(string) /// +POVLine(string) /// +POPShare(string) /// +PPP(string) /// +NOSUMmary /// +ISO /// +CLEAR /// +AUXiliary /// +ORIginal /// +INFOrmation /// +COESP(string) /// +SERVER(string) /// +version(string) /// +groupedby(string) /// +coverage(string) /// +pause /// +fillgaps /// +aggregate /// +wb /// +] + +if ("`pause'" == "pause") pause on +else pause off + +quietly { + + local curframe = c(frame) + ************************************************ + * 0. Housekeeping + ************************************************ + + if ("`ppp'" != "") local ppp_q = "&PPP0=`ppp'" + return local query_pp = "`ppp_q'" + + local region = upper("`region'") + + *************************************************** + * 1. 
Will load guidance database + *************************************************** + pip_info, clear justdata `pause' server(`server') version(`version') + + //------------ version + if ("`version'" != "") { + local version_qr = "&version=`version'" + tokenize "`version'", parse("_") + local _version = "_`1'_`3'_`9'" + } + else { + local version_qr = "" + local _version = "" + } + + + *---------- Make sure at least one reference year is selected + local frpipfw "_pip_fw`_version'" + + if ("`year'" != "all" & ("`wb'" != "" | "`aggregate'" != "")) { + + * + frame `frpipfw': levelsof surveyid_year, local(ref_years_l) + local ref_years "`ref_years_l' last" + + local no_ref: list year - ref_years + + if (`: list no_ref === year') { + noi disp as err "Not even one of the years select belong to the following reference years: " _n /* + */ " `ref_years_l'" + error + } + + if ("`no_ref'" != "") { + noi disp in y "Warning: `no_ref' is/are not part of reference years: `ref_years_l'" + } + + */ + + } // end of 'if' condition + + + *************************************************** + * 2. Keep selected countries and save codes + *************************************************** + + *---------- Keep selected country + + + frame `frpipfw' { + + cap confirm var keep_this + if (_rc) { + gen keep_this = 0 + } + else { + replace keep_this = 0 + } + + if ("`country'" != "" & lower("`country'") != "all") { + + local countries_ : subinstr local country " " "|", all + + count if regexm(country_code, "`countries_'") + if (r(N) == 0) { + noi disp in red "None of the country codes provided corresponds to the " _n /// + "set of country codes available. 
Type {stata pip info} to get the full " _n /// + "list of the PIP inventory" + error + } + + replace keep_this = 1 if regexm(country_code, "`countries_'") + } + + if lower("`country'") == "all" replace keep_this = 1 + + + * If region is selected instead of countries + if ("`region'" != "") { + if ("`region'" == "WLD" | lower("`region'") == "all") { + replace keep_this = 1 + } + else { + local region_l: subinstr local region " " "|", all + replace keep_this = 1 if regexm(wb_region_code, "`region_l'") + } + + } + + local touse "keep_this == 1" + + local obs = _N + if (`obs' == 0) { + di as err "No surveys found matching your criteria. You could use the " /* + */ " {stata pip_info: guided selection} instead." + cwf `curframe' + error + } + + pause query - after filtering conditions of country and region + + *************************************************** + * 3. Keep selected years and construct the request + *************************************************** + + *---------- Check that at least one year is available + if ("`wb'" == "" & "`aggregate'" == "") { + if ("`year'"=="all") | ("`year'"=="last") | ("`fillgaps'"!="") { + local year_ok = 1 + } + else { + + local yearcheck 0 + levelsof country_code if `touse', local(cts) + local years_: subinstr local year " " "|", all + + foreach ct of local cts { + + count if country_code == "`ct'" & /// + regexm(strofreal(reporting_year), "`years_'") & `touse' + + local year_ok = r(N) + + if (`year_ok' == 0) { + + disp in r _n "Warning: years selected for `ct' do not " /// + "match any survey year." _n /// + "You could type {stata pip_info, country(`ct') version(`version') clear} to check availability." + } + else { + if (`yearcheck' == 0 ) { + local yearcheck 1 + } + } + + } // end of countries loop + + if (`yearcheck' == 0) { + noi disp as err _n "the countries and years selected do not match any year available." 
+                    error
+                }
+            }
+        }
+
+        /*==================================================
+        Create Queries: URL parameter fragments, each returned in r(query_*)
+        ==================================================*/
+
+        *---------- Year and Display query (returned as r(query_ys) and r(query_ds))
+        local y_comma: subinstr local year " " ",", all // blanks between years become commas
+        if ("`year'" == "last") local y_comma = "all" // "last" is sent to the API as "all"
+        local year_q = "year=`y_comma'"
+
+        if ("`fillgaps'" == "") {
+            local disp_q = ""
+        }
+        else {
+            local disp_q = "&fill_gaps=true"
+        }
+
+        if ("`aggregate'" != "") {
+            local disp_q = "&group_by=none" // replaces any fill_gaps fragment set above
+        }
+
+        if ("`wb'" != "") {
+            local disp_q = "&group_by=wb" // assigned last, so wb wins over aggregate
+        }
+
+        return local query_ys = "`year_q'"
+        return local query_ds = "`disp_q'"
+
+        *---------- Country query: ALL, the region codes, or the country codes flagged by `touse'
+
+        if ( lower("`country'") == "all" | lower("`region'") == "all") {
+
+            local country_q = "country=ALL" // to modify
+
+        }
+        else {
+            if ("`region'" != "") {
+                local country_q: subinstr local region " " ",", all
+            }
+            else {
+                levelsof country_code if `touse', local(country_q) sep(",") clean
+            }
+            local country_q = "country=`country_q'"
+        }
+        return local query_ct = "`country_q'"
+
+        if ("`popshare'" != "") {
+            *----------Population share query (used instead of povline when popshare() is given)
+            local popshare_q = "popshare=`popshare'"
+            return local query_ps = "`popshare_q'"
+        }
+        else {
+            *---------- Poverty lines query
+            local povline_q = "povline=`povline'"
+            return local query_pl = "`povline_q'"
+        }
+
+        *---------- Coverage query (returned as r(query_cv); defaults to all reporting levels)
+        if ("`coverage'" == "") {
+            local coverage_q = "all" // bare value only: the parameter name is prefixed on the next line
+            local coverage_q = "reporting_level=`coverage_q'"
+        }
+        else {
+            local coverage_q = `""`coverage'""'
+            local coverage_q: subinstr local coverage_q " " `"&reporting_level="', all // each blank is replaced by "&reporting_level="
+            local coverage_q: disp `coverage_q'
+            local coverage_q = "reporting_level=`coverage_q'"
+        }
+        return local query_cv = "`coverage_q'"
+
+    } // end of frame
+
+    cwf `curframe'
+
+} // end of qui
+
+end
+
+exit
+/* End of do-file */
+
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><
+
+Notes:
+1.
+2.
+3.
+ + +Version Control: + + diff --git a/01.code/ado/p/pip_set_server.ado b/01.code/ado/p/pip_set_server.ado new file mode 100755 index 0000000..c0e1f87 --- /dev/null +++ b/01.code/ado/p/pip_set_server.ado @@ -0,0 +1,99 @@ +/*================================================== +project: Define which server to use +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 1 Dec 2021 - 10:59:19 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_set_server, rclass +syntax [anything(name=server)] /// +[, /// +pause /// +] + +if ("`pause'" == "pause") pause on +else pause off + +version 16.0 + +//======================================================== +// Define server +//======================================================== +*##s +* local server "prod" + +//------------ If shortcut used +local current_server "https://api.worldbank.org/pip/v1" // production + +if (inlist(lower("`server'"), "qa", "dev")) { + local url "${pip_svr_`server'}" +} + +else if (lower("`server'") == "prod") { + local url "`current_server'" + local server "prod" +} + +/*================================================== +2: Server not defined +==================================================*/ +else if ("`server'" == "") { + local url "`current_server'" + local server "prod" +} + +else { + noi disp in red "server {it:`server'} not allowed" + error +} + +//======================================================== +// Test API Health +//======================================================== + + +cap scalar tpage = fileread(`"`url'/health-check"') +* disp tpage + +*##e + + +if (!regexm(tpage, "API is running") | _rc) { + noi disp in red "There is a problem with PIP API 
server. Try again later" + error +} + +//======================================================== +// Return values +//======================================================== + +return local server = "`server'" +return local url = "`url'" +return local base = "`url'/pip" +return local base_grp = "`url'/pip-grp" + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_ssc.ado b/01.code/ado/p/pip_ssc.ado new file mode 100755 index 0000000..b063e89 --- /dev/null +++ b/01.code/ado/p/pip_ssc.ado @@ -0,0 +1,97 @@ +/*================================================== +project: message to the user if file is installed from SSC +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 6 Oct 2022 - 16:08:28 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== + 0: Program set up +==================================================*/ +program define pip_ssc, rclass +version 16.0 + +syntax [anything(name=subcommand)] /// +[, /// +pause /// +] + +if ("`pause'" == "pause") pause on +else pause off + + +/*================================================== + 1: Update +==================================================*/ +if ("`subcommand'" == "update") { + qui adoupdate pip, ssconly + if ("`r(pkglist)'" == "pip") { + cap window stopbox rusure "There is a new version of pip in SSC." /// + "Would you like to install it now?" + + if (_rc == 0) { + cap ado update pip, ssconly update + if (_rc == 0) { + cap window stopbox note "Installation complete. please type" /// + "discard in your command window to finish" + local bye "exit" + } + else { + noi disp as err "there was an error in the installation. 
" _n /// + "please run the following to retry, " _n(2) /// + "{stata ado update pip, ssconly update}" + local bye "error" + } + } + else local bye "" + } // end of checking SSC update + else { + noi disp as result "SSC version of {cmd:pip} is up to date." + local bye "" + } + + return local bye = "`bye'" +} + + +/*================================================== + 2: Message +==================================================*/ +if (inlist("`subcommand'", "msg", "message")) { + noi disp "You're using SSC as the host of the {cmd:pip} Stata package." + noi disp "If you want to install the GitHub version type {stata pip_install gh}" +} + +//======================================================== +// Install +//======================================================== + +if (inlist("`subcommand'", "install")) { + pip_install ssc, replace +} + + + + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_tables.ado b/01.code/ado/p/pip_tables.ado new file mode 100755 index 0000000..f345154 --- /dev/null +++ b/01.code/ado/p/pip_tables.ado @@ -0,0 +1,120 @@ +/*================================================== +project: display and load auxiliary tables in PIP +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 25 Mar 2022 - 14:38:22 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_tables, rclass +syntax [anything(name=table)], [ /// +server(string) /// +version(string) /// +release(numlist) /// +PPP_year(numlist) /// +identity(string) /// +clear /// +] + +version 16 + + 
+/*================================================== +1: SET UP +==================================================*/ +qui { + + *---------- API defaults + qui pip_versions, /// + server(`server') /// + version(`version') /// + release(`release') /// + ppp_year(`ppp_year') /// + identity(`identity') + + local server = "`r(server)'" + local url = "`r(url)'" + local version = "`r(version)'" + local version_qr = "`r(version_qr)'" + + /*================================================== + 2: If table is selected + ==================================================*/ + + if ("`table'" != "") { + local table_call = "`url'/aux?table=`table'&`version_qr'&format=csv" + import delimit "`table_call'", varn(1) `clear' asdouble + return local table_call = "`table_call'" + + * rename vars. Modify the following locals + local oldvars "reporting_year survey_year" + local newvars "year welfare_time" + + gettoken old oldvars : oldvars + gettoken new newvars : newvars + qui while ("`old'" != "") { + cap confirm new var `old', exact + if (_rc) cap confirm var `new', exact + if (_rc) rename `old' `new' + + gettoken old oldvars : oldvars + gettoken new newvars : newvars + } + + * to lower cases + local tolvars "welfare_type" + foreach t of local tolvars { + cap confirm new var `t', exact + if (_rc) replace `t' = lower(`t') + } + + exit + } + + /*================================================== + 3: If table is NOT selected + ==================================================*/ + if ("`table'" == "") { + preserve + local table_call = "`url'/aux?`version_qr'&format=csv" + import delimit "`table_call'", varn(1) clear asdouble + return local table_call = "`table_call'" + + noi disp in y "Auxiliary tables available for `version':" + local _N = _N + forvalues i = 1/`_N' { + if (length("`i'") == 1) local j = "0" + "`i'" + else local j = "`i'" + + local table = tables[`i'] + local pip_code "pip tables, table(`table') server(`server') version(`version') clear" + + noi disp _col(6) `"`j' {c |} 
{stata `pip_code':`table'}"' + } + } +} // end qui + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_update.ado b/01.code/ado/p/pip_update.ado new file mode 100755 index 0000000..9a48f1f --- /dev/null +++ b/01.code/ado/p/pip_update.ado @@ -0,0 +1,84 @@ +/*================================================== +project: update pip depending on source +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 13 Feb 2023 - 19:35:59 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== + 0: Program set up +==================================================*/ +program define pip_update, rclass +syntax [anything(name=src)] /// +[, /// +username(string) /// +cmd(string) /// +version(string) /// +pause /// +replace /// +path(string) /// +] + +version 16.0 + + +/*================================================== + set up +==================================================*/ + +if ("`pause'" == "pause") pause on +else pause off + + +if ("`cmd'" == "") { + local cmd pip +} + +if ("`username'" == "") { + local username "worldbank" +} + +//======================================================== +// Update +//======================================================== + +pip_find_src , path(`path') +local src = "`r(src)'" +return add + +//------------ If PIP was installed from github +if ("`src'" == "gh") { + + pip_gh update, username(`username') cmd(`cmd') `pause' + return add +} // end if installed from github + +//------------ if pip was installed from SSC +else { + pip_ssc update, `pause' + return add +} // Finish checking pip update + + +end +exit +/* End of do-file */ + 
+><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/p/pip_versions.ado b/01.code/ado/p/pip_versions.ado new file mode 100755 index 0000000..4d160c4 --- /dev/null +++ b/01.code/ado/p/pip_versions.ado @@ -0,0 +1,258 @@ +/*================================================== +project: gather versions availble in a particular server +Author: R.Andres Castaneda +E-email: acastanedaa@worldbank.org +url: +Dependencies: The World Bank +---------------------------------------------------- +Creation Date: 18 Mar 2022 - 11:42:44 +Modification Date: +Do-file version: 01 +References: +Output: +==================================================*/ + +/*================================================== +0: Program set up +==================================================*/ +program define pip_versions, rclass + +syntax [anything(name=sbcmd)] , [ /// +server(string) /// +version(string) /// +release(numlist) /// +PPP_year(numlist) /// +identity(string) /// +AVAILability /// +] + +version 16 + +qui { + + /*================================================== + Check conditions + ==================================================*/ + + * local version has prvalence over everything else + if ("${pip_version}" != "" & "`version'" == "") { + noi disp in red "warning:" in y "Global {it:pip_version} (${pip_version}) is in use" + local version = "${pip_version}" + } + + * local version "2022484_2011_02_02_PROD" + if ("`version'" != "") { + * check format + local vintage_pattern = "[0-9]{8}_[0-9]{4}_[0-9]{2}_[0-9]{2}_(PROD|INT|TEST)$" + if (!ustrregexm("`version'", "`vintage_pattern'")) { + noi disp in red "version provided, {it:`version'}, does not meet the " _c /// + "format criteria: " _n in y "`vintage_pattern'" + error + } + } + + * locals have prevalence over globals + else { // if version is specified, all of this is ignored + if ("${pip_ppp_year}" != "" & "`ppp_year'" == "") { + noi 
disp in red "warning:" in y "Global {it:pip_ppp_year} (${pip_ppp_year}) is in use" + local ppp_year = "${pip_ppp_year}" + } + if ("${pip_identity}" != "" & "`identity'" == "") { + noi disp in red "warning:" in y "Global {it:pip_identity} (${pip_identity}) is in use" + local identity = "${pip_identity}" + } + if ("${pip_release}" != "" & "`release'" == "") { + noi disp in red "warning:" in y "Global {it:pip_release} (${pip_release}) is in use" + local release = "${pip_release}" + } + } + + + //======================================================== + // Create version frame + //======================================================== + + //------------ get server url + pip_set_server `server', `pause' + local url = "`r(url)'" + local server = "`r(server)'" + return add + + + + cap frame drop _pip_versions_`server' // change this later + frame create _pip_versions_`server' + + frame _pip_versions_`server' { + + import delimited using "`url'/versions?format=csv", clear varn(1) asdouble + keep version + + //------------* Split and rename + split version, parse("_") generate(sp) + local sp_names "release ppp_year ppp_rv ppp_av identity" + rename sp# (`sp_names') + + } + + //======================================================== + // Just show availability + //======================================================== + + if ("`availability'" != "") { + frame _pip_versions_`server' { + levelsof version, local(versions) clean + return local versions = "`versions'" + noi list version // fast list + } + exit + } + + + frame copy _pip_versions_`server' _pip_versions_wrk, replace + frame _pip_versions_wrk { + + //======================================================== + // Evaluate conditions + //======================================================== + + //------------ Version + if ("`version'" != "") { + + * check availability + * local server "http://wzlxdpip01.worldbank.org/api/v1" + * local version "20220408_2011_02_02_PROD" + count if version == "`version'" + + if 
(r(N) == 0) { + + levelsof version, local(vers) clean + local ver_avlb: list version in vers + noi disp in red "version {it:`version'} is not available in this server" _n /// + "Versions available are: " + foreach ver of local vers { + noi disp in y "`ver'" + } + error + } + else { + local version_qr = "version=`version'" + } + } // end of version different ot empty + + //------------ It no version is defined by the user + else { + //------------ Release + if ("`release'" != "") { + count if release == "`release'" + if (r(N) == 0) { + noi disp in red "release, {it:`release'}, is not available." _n /// + "Releases available are:" + levelsof release, local(releases) clean + foreach r of local releases { + noi disp in y "`r'" + } + error + } + else { + keep if release == "`release'" + } + } + //------------ PPP year + if ("`ppp_year'" != "") { + count if ppp_year == "`ppp_year'" + if (r(N) == 0) { + noi disp in red "ppp_year, {it:`ppp_year'}, is not available." _n /// + "PPP years available are:" + levelsof ppp_year, local(ppp_years) clean + foreach py of local ppp_years { + noi disp in y "`py'" + } + error + } + else { + keep if ppp_year == "`ppp_year'" + } + } // end of ppp year + + //------------ Identity + if ("`identity'" != "") { + local identity = upper("`identity'") + + count if identity == "`identity'" + if (r(N) == 0) { + noi disp in red "identity, {it:`identity'}, is not available." 
_n /// + "Identities available are:" + levelsof identity, local(identities) clean + foreach i of local identities { + noi disp in y "`i'" + } + error + } + else { + keep if identity == "`identity'" + } + } // end of identity defined + else { + count if identity == "PROD" + if (r(N) == 0) { + count if identity == "INT" + if (r(N) == 0) { + count if identity == "TEST" + if (r(N) == 0) { + noi disp in red "Valid identity was not found" + error + } + else { + keep if identity == "TEST" + } + } // If no INT + else { + keep if identity == "INT" + } + } // end if not PROD + else { + keep if identity == "PROD" + } + } // end of identity not defined by user + + + //======================================================== + // Sort remaining obs + //======================================================== + + // this guarantees only one version + sort release ppp_year ppp_rv ppp_av + keep in l + local version = version[1] + local version_qr = "version=`version'" + + } // end of version not specified + + } // end frame + tokenize "`version'", parse("_") + return local release = "`1'" + return local ppp_year = "`3'" + return local identity = "`9'" + return local version_qr = "`version_qr'" + return local version = "`version'" + + noi disp as res "Version in use:" as text " `version'" + +} // end of qui + +end +exit +/* End of do-file */ + +><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>< + +Notes: +1. +2. +3. + + +Version Control: + + diff --git a/01.code/ado/wsample.ado b/01.code/ado/wsample.ado old mode 100644 new mode 100755 index 8d0bd32..16867c5 --- a/01.code/ado/wsample.ado +++ b/01.code/ado/wsample.ado @@ -1,85 +1,85 @@ -*! 
wsample June 14, 2018 -* Paul Corral (World Bank Group - Poverty and Equity Global Practice) - -cap prog drop wsample -program define wsample, eclass - version 11.2 - #delimit; - syntax varlist (max=1 numeric) [if] [in] [aw], - newvar(string) - [percent(numlist max=1 >0) - value(numlist max=1 >0) - numsim(integer 1) - seed(string)]; -#delimit cr -marksample touse - -//Either value or percent -if ("`value'"!="" & "`percent'"!=""){ - dis as error "Only one option is available either value or percent" - error 119 -} - //Weights - if "`exp'"=="" { - tempvar w - qui:gen `w' = 1 - local wvar `w' - } - else{ - tempvar w - qui:gen double `w' `exp' - } - -mata: st_view(x1=., ., "`varlist'","`touse'") -mata: st_view(w1=., ., "`w'","`touse'") - -if ("`percent'"!=""){ - if (`percent'>100){ - dis as error "You have specified a value greater than 100 percent." - error 119 - } - qui: sum `w' if `touse' - local ccu = r(sum)*(`percent'/100) - dis in green "Value for target percent: `ccu'" -} -else{ - local ccu = `value' - dis in green "Target value: `ccu'" -} - -if (`numsim'==1){ - qui:gen double `newvar' = . - local mylist `mylist' `newvar' -} -else{ - forval z=1/`numsim'{ - qui:gen double `newvar'_`z' = . - local mylist `mylist' `newvar'_`z' - } -} -if ("`seed'"=="") set seed 69374255 -else set seed `seed' -//dis as error "_RanDomaSSign(x1,w1,`ccu',`numsim')" -mata: st_store(.,st_varindex(tokens("`mylist'")),"`touse'",_RanDomaSSign(x1,w1,`ccu',`numsim')) - -dis in green "Your new variable has been created in `newvar'*" -end - -cap set matastrict off -mata -//Function to create sim# permutation vectors -function _RanDomaSSign(x,w,cut,sim){ - - data = runningsum(J(rows(x),1,1)),x,w,runiform(rows(x),sim) //current order - - s=cols(data) - - //randomize assignment - for(i=4;i<=s;i++){ - _sort(data,i) - data[.,i] = ((runningsum(data[.,3])):<=cut):*data[.,2] - } - _sort(data,1) - return(data[|.,4 \ .,cols(data)|]) -} -end +*! 
wsample June 14, 2018 +* Paul Corral (World Bank Group - Poverty and Equity Global Practice) + +cap prog drop wsample +program define wsample, eclass + version 11.2 + #delimit; + syntax varlist (max=1 numeric) [if] [in] [aw], + newvar(string) + [percent(numlist max=1 >0) + value(numlist max=1 >0) + numsim(integer 1) + seed(string)]; +#delimit cr +marksample touse + +//Either value or percent +if ("`value'"!="" & "`percent'"!=""){ + dis as error "Only one option is available either value or percent" + error 119 +} + //Weights + if "`exp'"=="" { + tempvar w + qui:gen `w' = 1 + local wvar `w' + } + else{ + tempvar w + qui:gen double `w' `exp' + } + +mata: st_view(x1=., ., "`varlist'","`touse'") +mata: st_view(w1=., ., "`w'","`touse'") + +if ("`percent'"!=""){ + if (`percent'>100){ + dis as error "You have specified a value greater than 100 percent." + error 119 + } + qui: sum `w' if `touse' + local ccu = r(sum)*(`percent'/100) + dis in green "Value for target percent: `ccu'" +} +else{ + local ccu = `value' + dis in green "Target value: `ccu'" +} + +if (`numsim'==1){ + qui:gen double `newvar' = . + local mylist `mylist' `newvar' +} +else{ + forval z=1/`numsim'{ + qui:gen double `newvar'_`z' = . 
+ local mylist `mylist' `newvar'_`z' + } +} +if ("`seed'"=="") set seed 69374255 +else set seed `seed' +//dis as error "_RanDomaSSign(x1,w1,`ccu',`numsim')" +mata: st_store(.,st_varindex(tokens("`mylist'")),"`touse'",_RanDomaSSign(x1,w1,`ccu',`numsim')) + +dis in green "Your new variable has been created in `newvar'*" +end + +cap set matastrict off +mata +//Function to create sim# permutation vectors +function _RanDomaSSign(x,w,cut,sim){ + + data = runningsum(J(rows(x),1,1)),x,w,runiform(rows(x),sim) //current order + + s=cols(data) + + //randomize assignment + for(i=4;i<=s;i++){ + _sort(data,i) + data[.,i] = ((runningsum(data[.,3])):<=cut):*data[.,2] + } + _sort(data,1) + return(data[|.,4 \ .,cols(data)|]) +} +end diff --git a/01.code/ado/wsample.sthlp b/01.code/ado/wsample.sthlp old mode 100644 new mode 100755 index dfeea34..477e034 --- a/01.code/ado/wsample.sthlp +++ b/01.code/ado/wsample.sthlp @@ -1,70 +1,70 @@ -{smcl} -{* *! version 1.0.0 30April2018}{...} -{cmd:help wsample} -{hline} - -{title:Title} -{p2colset 5 24 26 2}{...} -{p2col :{cmd:wsample} {hline 1} Creates values or an indicator for a weighted random sample}{p_end} -{p2colreset}{...} - -{title:Syntax} - -{p 8 23 2} - -{opt wsample} {var} {ifin} [aw] {cmd:,} -{opt newvar(newvarname)} -[{opt percent(positive real)} -{opt value(positive real)} -{opt numsim(integer)} -{opt seed(integer)}] - -{title:Description} - -{pstd} -{cmd:wsample} Draws weighted random samples of the data in memory. It is useful for simulations where specific population targets are to be met. The size of the population to be drawn can be specified as a percentage (percent option) or as a total value (value option). - -The command allows for producing indicators of the desired sample, or may also keep the values for the variable specified in varlist. - -{title:Options} - -{phang} -{opt newvar(newvarname)} Variable containing the indicator for the sample. 
If a variable with values is placed in varlist, then the variable produced by the newvar option will have the values to ensure that a certain population has those specified values. - -{phang} -{opt percent(positive real)} Specifies that # percent of the weighted sample is desired. - -{phang} -{opt value(positive real)} Specifies that # in the weighted sample is desired. - -{phang} -{opt numsim(integer)} Specifies that # samples is desired, each will be created in a new variable where the name specified in newvar is used as a prefix. - -{phang} -{opt seed(string)} Seed to ensure replicability, one may use Stata's c(rngstate). - - -{title:Example} -sysuse auto, clear -//Take a 90 percent weighted sample of foreign vehicles -wsample foreign if foreign==1 [aw=weight], percent(90) newvar(myforeign) seed(3894) -//Assume that only 90 percent of all vehicles should have a price, the rest are assigned a price of 0. Get 10 samples. -wsample price [aw=weight], percent(90) newvar(price90) seed(3894) numsim(10) - - -{title:Author:} - -{pstd} -Paul Corral{break} -The World Bank - Poverty and Equity Global Practice {break} -Washington, DC{break} -pcorralrodas@worldbank.org{p_end} - - -{pstd} -Any error or omission is the author's responsibility alone. - - - - - +{smcl} +{* *! version 1.0.0 30April2018}{...} +{cmd:help wsample} +{hline} + +{title:Title} +{p2colset 5 24 26 2}{...} +{p2col :{cmd:wsample} {hline 1} Creates values or an indicator for a weighted random sample}{p_end} +{p2colreset}{...} + +{title:Syntax} + +{p 8 23 2} + +{opt wsample} {var} {ifin} [aw] {cmd:,} +{opt newvar(newvarname)} +[{opt percent(positive real)} +{opt value(positive real)} +{opt numsim(integer)} +{opt seed(integer)}] + +{title:Description} + +{pstd} +{cmd:wsample} Draws weighted random samples of the data in memory. It is useful for simulations where specific population targets are to be met. 
The size of the population to be drawn can be specified as a percentage (percent option) or as a total value (value option). + +The command allows for producing indicators of the desired sample, or may also keep the values for the variable specified in varlist. + +{title:Options} + +{phang} +{opt newvar(newvarname)} Variable containing the indicator for the sample. If a variable with values is placed in varlist, then the variable produced by the newvar option will have the values to ensure that a certain population has those specified values. + +{phang} +{opt percent(positive real)} Specifies that # percent of the weighted sample is desired. + +{phang} +{opt value(positive real)} Specifies that # in the weighted sample is desired. + +{phang} +{opt numsim(integer)} Specifies that # samples is desired, each will be created in a new variable where the name specified in newvar is used as a prefix. + +{phang} +{opt seed(string)} Seed to ensure replicability, one may use Stata's c(rngstate). + + +{title:Example} +sysuse auto, clear +//Take a 90 percent weighted sample of foreign vehicles +wsample foreign if foreign==1 [aw=weight], percent(90) newvar(myforeign) seed(3894) +//Assume that only 90 percent of all vehicles should have a price, the rest are assigned a price of 0. Get 10 samples. +wsample price [aw=weight], percent(90) newvar(price90) seed(3894) numsim(10) + + +{title:Author:} + +{pstd} +Paul Corral{break} +The World Bank - Poverty and Equity Global Practice {break} +Washington, DC{break} +pcorralrodas@worldbank.org{p_end} + + +{pstd} +Any error or omission is the author's responsibility alone. + + + + + diff --git a/01.code/dofile/0-0 GMD datacheck.do b/01.code/dofile/0-0 GMD datacheck.do old mode 100644 new mode 100755 index 6db8da3..008f221 --- a/01.code/dofile/0-0 GMD datacheck.do +++ b/01.code/dofile/0-0 GMD datacheck.do @@ -1,59 +1,46 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! 
Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . - -//Load all data and check subnational data together with other data - -clear -global rnd AM24 - -*global upath2 -tempfile data1 data2 -save `data2', replace emptyok - -use "${upath2}\\02.input\repo_${rnd}all", clear -local all=_N -save `data1', replace - -forv i=1(1)`all' { - use `data1', clear - - local code = country[`i'] - local year = years[`i'] - local surveyid = surveyid[`i'] - local mod = module[`i'] - - cap dlw, country(`code') year(`year') type(gmd) files nocpi mod(`mod') surveyid(`surveyid') - qui if _rc==0 { - ds3 - local vlist = r(varlist) - foreach var of local vlist { - gen ct_`var' = ~missing(`var') - } - gen x = 1 - collapse (sum) ct_* - - gen code = "`code'" - gen year = `year' - gen surveyid = "`surveyid'" - gen mod = "`mod'" - - append using `data2' - save `data2', replace - } -} -use `data2', clear +//Load all data and check subnational data together with other data +//sub* gaul* +//two steps + +//1 - get variable availabiity +//2 + +clear +global rnd AM24 + +tempfile data1 data2 +save `data2', replace emptyok + +use "${upath2}\\02.input\repo_${rnd}all", clear +local all=_N +save `data1', replace + +forv i=1(1)`all' { + use `data1', clear + + local code = country[`i'] + local year = years[`i'] + local surveyid = surveyid[`i'] + local mod = module[`i'] + + cap dlw, country(`code') year(`year') 
type(gmd) files nocpi mod(`mod') surveyid(`surveyid') + qui if _rc==0 { + ds3 + local vlist = r(varlist) + foreach var of local vlist { + gen ct_`var' = ~missing(`var') + } + gen x = 1 + collapse (sum) ct_* + + gen code = "`code'" + gen year = `year' + gen surveyid = "`surveyid'" + gen mod = "`mod'" + + append using `data2' + save `data2', replace + } +} +use `data2', clear save "${upath2}\\02.input\\GMD_variables_count${rnd}.dta", replace \ No newline at end of file diff --git a/01.code/dofile/0-1 Get PIP nat lineup number.do b/01.code/dofile/0-1 Get PIP nat lineup number.do old mode 100644 new mode 100755 index 9dea0ff..4cbe823 --- a/01.code/dofile/0-1 Get PIP nat lineup number.do +++ b/01.code/dofile/0-1 Get PIP nat lineup number.do @@ -1,19 +1,19 @@ -//Get PIP lineup numbers -global dataout "${upath2}\03.intermediate\PIPinput\" -//only for Aug-Sep running -global piptxt - -global ylist 2010 2015 2019 2020 2021 2022 -global plines 215 365 685 322 547 1027 430 730 1370 -foreach year of global ylist { - foreach num of global plines { - pip, country(all) year(`year') fillgap povline(`=`num'/100') ${piptxt} clear - replace headcount = headcount*100 - drop if country_code=="CHN" & (reporting_level=="urban"|reporting_level=="rural") - drop if country_code=="IND" & (reporting_level=="urban"|reporting_level=="rural") - drop if country_code=="IDN" & (reporting_level=="urban"|reporting_level=="rural") - isid country_code - saveold "${dataout}\PIP_`year'_`num'.dta", replace - } -} - +//Get PIP lineup numbers +global dataout "${upath2}\03.intermediate\PIPinput\" +//only for Aug-Sep running +global piptxt server(qa) version(20240627_2017_01_02_PROD) + +global ylist 2010 2015 2019 2020 2021 2022 +global plines 215 365 685 322 547 1027 430 730 1370 +foreach year of global ylist { + foreach num of global plines { + pip, country(all) year(`year') fillgap povline(`=`num'/100') ${piptxt} clear + replace headcount = headcount*100 + drop if country_code=="CHN" & 
(reporting_level=="urban"|reporting_level=="rural") + drop if country_code=="IND" & (reporting_level=="urban"|reporting_level=="rural") + drop if country_code=="IDN" & (reporting_level=="urban"|reporting_level=="rural") + isid country_code + saveold "${dataout}\PIP_`year'_`num'.dta", replace + } +} + diff --git a/01.code/dofile/0-2 Update pop class region.do b/01.code/dofile/0-2 Update pop class region.do old mode 100644 new mode 100755 index 152706c..ee550f2 --- a/01.code/dofile/0-2 Update pop class region.do +++ b/01.code/dofile/0-2 Update pop class region.do @@ -1,26 +1,25 @@ -//POP, PIP region, and income class -clear -*global upath2 - -tempfile pop pce gdp data1 data2 data3 -global ver 20240326_2017_01_02_PROD - -tempfile pop -pip tables, table(pop) server(prod) version($ver) clear -*replace value = "" if value=="NA" -*destring value, replace -gen double pop = value/1000000 -drop value -reshape wide pop, i(country_code year) j( data_level) string -ren popnational pop -ren poprural pop_rural -ren popurban pop_urban - -ren country_code code -gen year_data = year - -merge 1:1 code year_data using "${upath2}\02.input\CLASS" -drop if _merge==1 -drop _merge - -save "${upath2}\\02.input\code_inc_pop_regpcn", replace +//POP, PIP region, and income class +clear + +tempfile pop pce gdp data1 data2 data3 +global ver 20240326_2017_01_02_PROD + +tempfile pop +pip tables, table(pop) server(prod) version($ver) clear +*replace value = "" if value=="NA" +*destring value, replace +gen double pop = value/1000000 +drop value +reshape wide pop, i(country_code year) j( data_level) string +ren popnational pop +ren poprural pop_rural +ren popurban pop_urban + +ren country_code code +gen year_data = year + +merge 1:1 code year_data using "${upath2}\02.input\CLASS" +drop if _merge==1 +drop _merge + +save "${upath2}\\02.input\code_inc_pop_regpcn", replace diff --git a/01.code/dofile/0-3 Prep data for coverage.do b/01.code/dofile/0-3 Prep data for coverage.do old mode 100644 new mode 
100755 index 253f2a6..b8f789f --- a/01.code/dofile/0-3 Prep data for coverage.do +++ b/01.code/dofile/0-3 Prep data for coverage.do @@ -1,215 +1,213 @@ -//Data input for Coverage -clear all -tempfile data1 data2 data3 data4 class -global rnd AM2024 - -*global upath2 - -//load PIP pfw - one time. -/* -dlw, country(support) year(2005) type(gmdraw) filename(Survey_price_framework.dta) files surveyid(Support_2005_CPI_v11_M) -keep code year survname rep_year ref_year comparability datatype use_imputed use_microdata use_bin use_groupdata -isid code year survname -save "${upath2}\02.input\Survey_price_framework.dta", replace -*/ - -//Income class -use "${upath2}\02.input\CLASS.dta", clear -keep if year_data==2022 -save `class', replace - -//GMD content file -use "${upath2}\02.input\GMD_variables_countAM24.dta", clear -isid surveyid mod -replace ct_w_30m = ct_w_30mins if ct_w_30m==. & ct_w_30mins~=. -replace ct_w_30m = ct_w_30min if ct_w_30m==. & ct_w_30min~=. - -keep code year surveyid mod ct_urban ct_age ct_male ct_imp_wat_rec ct_imp_san_rec ct_electricity ct_educat4 ct_educat5 ct_educat7 ct_subnatid* ct_gaul_adm1_code ct_roof ct_wall ct_floor ct_w_30m ct_fin_account -bys surveyid (mod): gen nmod = _N - -gen keep = 1 if nmod==1 -replace keep = 1 if nmod==2 & mod=="ALL" -replace keep = 1 if nmod==3 & mod=="ALL" -keep if keep==1 -split surveyid, parse("_") -ren surveyid3 survname -drop surveyid1-surveyid2 surveyid4-surveyid8 -duplicates tag code year survname, gen(a) -drop if mod=="BIN" & a==1 & survname=="EU-SILC" -drop if mod=="HIST" & a==1 & code=="MYS" & year==2007 -drop if mod=="BIN" & a==1 & code=="LCA" & year==2015 -drop if mod=="HIST" & a==1 & code=="MEX" & year==1989 -//to be update in AM24 -drop if mod=="GPWG" & a==1 & code=="IRN" & year==2015 -drop if mod=="GPWG" & a==1 & code=="IRN" & year==2016 -drop if mod=="GPWG" & a==1 & code=="THA" & year==2009 -//temporary drop until the data is available -drop if code=="MYS" & year==2022 -drop if code=="KAZ" & year>=2019 
& year<=2021 - -drop a nmod keep -isid code year survname -save `data2', replace - -//need to update to v11/AM24 -import excel using "${upath2}\\02.input\Subnational list - 0.xlsx" , first clear sheet("List") - -drop if level=="" -duplicates tag surveyid, gen(a) -drop if a==1 & lowest=="" -drop a -isid surveyid -keep region code year surveyid mod level ct_* -split surveyid, parse("_") -ren surveyid3 survname -drop surveyid1-surveyid2 surveyid4-surveyid8 - -//drop duplicates (for subnational purposes) -drop if code=="THA" & year==2009 & mod=="GPWG" -isid code year survname -save `data3', replace - -//load master -use "${upath2}\02.input\Survey_price_framework.dta", clear -keep code survname year rep_year ref_year comparability use_imputed use_microdata use_bin use_groupdata -merge 1:1 code year survname using `data2' -drop if _merge==2 -drop _merge - -merge 1:1 code year survname using `data3', keepus(level) -drop if _merge==2 -drop _merge -order level, after(mod) - -//drop countries with 2 welfare and no/not use subnat -drop if code=="AFG" -drop if code=="BRA" & (year>=2012 & year<=2015) & survname=="PNAD" -drop if code=="GBR" & survname=="EU-SILC" -drop if code=="BGR" & survname=="MTHS" & year==2007 -drop if code=="EST" & survname=="HBS" -drop if code=="HUN" & (survname=="HBS" | survname=="THMS-LIS") -drop if code=="POL" & survname=="HBS" -drop if code=="LTU" & survname=="HBS" -drop if code=="LVA" & survname=="HBS" -drop if code=="SVK" & survname=="HBS" -drop if code=="ALB" & survname=="SILC-C" -drop if code=="HRV" & survname=="HBS" -drop if code=="MNE" & survname=="HBS" -drop if code=="ROU" & survname=="HBS" -drop if code=="TUR" & survname=="HICES" & (year==2017|year==2018|year==2019) -drop if code=="SRB" & survname=="EU-SILC" & (year>=2013 & year<=2020) -drop if code=="RUS" & survname=="VNDN" - -//check one data per year -isid code year - -//variables have value -ds3 ct_* -local vlist = r(varlist) -foreach var of local vlist { - replace `var' = 1 if `var'>0 & 
`var'~=. -} - -//EUSILC no subnat for 2022 rounds (Region will fix this) -drop if level=="" & survname=="EU-SILC" - -//Universal coverage and High income assumptions -cap drop _merge -merge 1:1 code year using "${upath2}\02.input\WDI_elec_water.dta" -drop if _merge==2 -drop _merge - -cap drop _merge -merge 1:1 code year using "${upath2}\02.input\GED_data.dta" -drop if _merge==2 -drop _merge - -//unesco -cap drop _merge -merge 1:1 code year using "${upath2}\02.input\UNESCO_data.dta" -drop if _merge==2 -drop _merge - -//OECD -merge m:1 code using "${upath2}\02.input\oecd_list.dta" -drop if _merge==2 -drop _merge - -//CLASS -merge m:1 code using `class', keepus(incgroup_current) -drop if _merge==2 -drop _merge - -//flag: 1 avaiable, 2 fillin universal -cap gen elec_flag = . -//Impute electricity for some countries (universal coverage - 0 deprived) -replace elec_wdi = round(elec_wdi,1) -replace ged_total = round(ged_total,1) -replace ct_electricity = 1 if elec_wdi>=97 & elec_wdi~=. & ct_electricity==. -replace elec_flag = 2 if elec_wdi>=97 & elec_wdi~=. - -replace ct_electricity = 1 if ged_total>=97 & ged_total~=. & ct_electricity==. -replace elec_flag = 2 if ged_total>=97 & ged_total~=. -replace elec_flag = 2 if (code=="TUR"|code=="SRB") & year>=2022 - -//MKD SRB BGR CZE EST HRV HUN LTU LVA SVK SVN UKR POL ROU TUR ARG AUT BEL CHE CYP DEU DNK ESP FIN FRA GBR GRC IRL ISL ITA LUX MLT NLD NOR PRT SWE - -//Impute water for some countries (universal coverage - 0 deprived) -cap gen water_flag = . -replace wat_jmp = round(wat_jmp,1) -replace ct_imp_wat_rec = 1 if wat_jmp>=97 & wat_jmp~=. & ct_imp_wat_rec==. -replace water_flag = 2 if wat_jmp>=97 & wat_jmp~=. 
-replace water_flag = 2 if (code=="IRL" | code=="HRV") -/* -local water_list BGR CZE EST HRV HUN LVA SVK SVN AUT BEL CHE CYP DEU DNK ESP FIN FRA GBR GRC ISL ITA LUX MLT NLD NOR PRT SWE //LTU:96 PAN 94 UKR 96 IRL 97.8 -foreach c of local water_list { - *replace dep_infra_impw2 = 0 if code=="UKR" //UKR special - replace ct_imp_wat_rec = 1 if code=="`c'" & ct_imp_wat_rec==. - replace water_flag = 1 if code=="`c'" -} -*/ - -//SP -gen sp_flag = . -replace sp_flag = 2 if oecd==1 -replace sp_flag = 2 if sp_flag==. & incgroup_current=="High income" - -//Findex -gen sp_findex = . -replace sp_findex = 2 if oecd==1 -replace sp_findex = 2 if sp_findex==. & incgroup_current=="High income" -replace sp_findex = 2 if sp_findex==. & code=="LUX" - -//impute LIS, and special CHN cases -foreach var of varlist ct_urban ct_age ct_male ct_educat4 ct_imp_wat_rec ct_electricity { - replace `var' = 1 if `var'==. & strpos(survname,"-LIS")>0 - replace `var' = 1 if `var'==. & code=="CHN" & (year==2013|year==2018) -} -replace use_microdata = 1 if code=="CHN" & (year==2013|year==2018) -replace use_microdata = 1 if strpos(survname,"-LIS")>0 -//special cases IND -foreach var of varlist ct_urban ct_age ct_male ct_imp_wat_rec ct_imp_san_rec ct_electricity ct_educat4 { - replace `var' = 1 if `var'==. & code=="IND" & (year>=2019 & year<=2021) -} - -//manual fix MDG2021 -foreach var of varlist ct_urban ct_age ct_male ct_imp_wat_rec ct_electricity ct_educat4 { - *replace `var' = 1 if `var'==. & code=="MDG" & year==2021 - replace `var' = 1 if `var'==. & code=="CMR" & year==2021 - replace `var' = 1 if `var'==. 
& code=="NPL" & year==2022 -} - -gen ct_poverty = 1 -order ct_poverty, after(level) - -isid code year -ren year surv_year -clonevar year = rep_year -isid code year -order year, after(surv_year) -//fix -drop if code=="MYS" & year==2021 -drop if code=="KAZ" & (year>=2019 & year<=2021) -compress - +//Data input for Coverage +clear all +tempfile data1 data2 data3 data4 class +global rnd AM2024 + +//load PIP pfw - one time. +/* +dlw, country(support) year(2005) type(gmdraw) filename(Survey_price_framework.dta) files surveyid(Support_2005_CPI_v11_M) +keep code year survname rep_year ref_year comparability datatype use_imputed use_microdata use_bin use_groupdata +isid code year survname +save "${upath2}\02.input\Survey_price_framework.dta", replace +*/ + +//Income class +use "${upath2}\02.input\CLASS.dta", clear +keep if year_data==2022 +save `class', replace + +//GMD content file +use "${upath2}\02.input\GMD_variables_countAM24.dta", clear +isid surveyid mod +replace ct_w_30m = ct_w_30mins if ct_w_30m==. & ct_w_30mins~=. +replace ct_w_30m = ct_w_30min if ct_w_30m==. & ct_w_30min~=. 
+ +keep code year surveyid mod ct_urban ct_age ct_male ct_imp_wat_rec ct_imp_san_rec ct_electricity ct_educat4 ct_educat5 ct_educat7 ct_subnatid* ct_gaul_adm1_code ct_roof ct_wall ct_floor ct_w_30m ct_fin_account +bys surveyid (mod): gen nmod = _N + +gen keep = 1 if nmod==1 +replace keep = 1 if nmod==2 & mod=="ALL" +replace keep = 1 if nmod==3 & mod=="ALL" +keep if keep==1 +split surveyid, parse("_") +ren surveyid3 survname +drop surveyid1-surveyid2 surveyid4-surveyid8 +duplicates tag code year survname, gen(a) +drop if mod=="BIN" & a==1 & survname=="EU-SILC" +drop if mod=="HIST" & a==1 & code=="MYS" & year==2007 +drop if mod=="BIN" & a==1 & code=="LCA" & year==2015 +drop if mod=="HIST" & a==1 & code=="MEX" & year==1989 +//to be update in AM24 +drop if mod=="GPWG" & a==1 & code=="IRN" & year==2015 +drop if mod=="GPWG" & a==1 & code=="IRN" & year==2016 +drop if mod=="GPWG" & a==1 & code=="THA" & year==2009 +//temporary drop until the data is available +drop if code=="MYS" & year==2022 +drop if code=="KAZ" & year>=2019 & year<=2021 + +drop a nmod keep +isid code year survname +save `data2', replace + +//need to update to v11/AM24 +import excel using "${upath2}\\02.input\Subnational list - 0.xlsx" , first clear sheet("List") + +drop if level=="" +duplicates tag surveyid, gen(a) +drop if a==1 & lowest=="" +drop a +isid surveyid +keep region code year surveyid mod level ct_* +split surveyid, parse("_") +ren surveyid3 survname +drop surveyid1-surveyid2 surveyid4-surveyid8 + +//drop duplicates (for subnational purposes) +drop if code=="THA" & year==2009 & mod=="GPWG" +isid code year survname +save `data3', replace + +//load master +use "${upath2}\02.input\Survey_price_framework.dta", clear +keep code survname year rep_year ref_year comparability use_imputed use_microdata use_bin use_groupdata +merge 1:1 code year survname using `data2' +drop if _merge==2 +drop _merge + +merge 1:1 code year survname using `data3', keepus(level) +drop if _merge==2 +drop _merge +order level, 
after(mod) + +//drop countries with 2 welfare and no/not use subnat +drop if code=="AFG" +drop if code=="BRA" & (year>=2012 & year<=2015) & survname=="PNAD" +drop if code=="GBR" & survname=="EU-SILC" +drop if code=="BGR" & survname=="MTHS" & year==2007 +drop if code=="EST" & survname=="HBS" +drop if code=="HUN" & (survname=="HBS" | survname=="THMS-LIS") +drop if code=="POL" & survname=="HBS" +drop if code=="LTU" & survname=="HBS" +drop if code=="LVA" & survname=="HBS" +drop if code=="SVK" & survname=="HBS" +drop if code=="ALB" & survname=="SILC-C" +drop if code=="HRV" & survname=="HBS" +drop if code=="MNE" & survname=="HBS" +drop if code=="ROU" & survname=="HBS" +drop if code=="TUR" & survname=="HICES" & (year==2017|year==2018|year==2019) +drop if code=="SRB" & survname=="EU-SILC" & (year>=2013 & year<=2020) +drop if code=="RUS" & survname=="VNDN" + +//check one data per year +isid code year + +//variables have value +ds3 ct_* +local vlist = r(varlist) +foreach var of local vlist { + replace `var' = 1 if `var'>0 & `var'~=. +} + +//EUSILC no subnat for 2022 rounds (Region will fix this) +drop if level=="" & survname=="EU-SILC" + +//Universal coverage and High income assumptions +cap drop _merge +merge 1:1 code year using "${upath2}\02.input\WDI_elec_water.dta" +drop if _merge==2 +drop _merge + +cap drop _merge +merge 1:1 code year using "${upath2}\02.input\GED_data.dta" +drop if _merge==2 +drop _merge + +//unesco +cap drop _merge +merge 1:1 code year using "${upath2}\02.input\UNESCO_data.dta" +drop if _merge==2 +drop _merge + +//OECD +merge m:1 code using "${upath2}\02.input\oecd_list.dta" +drop if _merge==2 +drop _merge + +//CLASS +merge m:1 code using `class', keepus(incgroup_current) +drop if _merge==2 +drop _merge + +//flag: 1 avaiable, 2 fillin universal +cap gen elec_flag = . 
+//Impute electricity for some countries (universal coverage - 0 deprived) +replace elec_wdi = round(elec_wdi,1) +replace ged_total = round(ged_total,1) +replace ct_electricity = 1 if elec_wdi>=97 & elec_wdi~=. & ct_electricity==. +replace elec_flag = 2 if elec_wdi>=97 & elec_wdi~=. + +replace ct_electricity = 1 if ged_total>=97 & ged_total~=. & ct_electricity==. +replace elec_flag = 2 if ged_total>=97 & ged_total~=. +replace elec_flag = 2 if (code=="TUR"|code=="SRB") & year>=2022 + +//MKD SRB BGR CZE EST HRV HUN LTU LVA SVK SVN UKR POL ROU TUR ARG AUT BEL CHE CYP DEU DNK ESP FIN FRA GBR GRC IRL ISL ITA LUX MLT NLD NOR PRT SWE + +//Impute water for some countries (universal coverage - 0 deprived) +cap gen water_flag = . +replace wat_jmp = round(wat_jmp,1) +replace ct_imp_wat_rec = 1 if wat_jmp>=97 & wat_jmp~=. & ct_imp_wat_rec==. +replace water_flag = 2 if wat_jmp>=97 & wat_jmp~=. +replace water_flag = 2 if (code=="IRL" | code=="HRV") +/* +local water_list BGR CZE EST HRV HUN LVA SVK SVN AUT BEL CHE CYP DEU DNK ESP FIN FRA GBR GRC ISL ITA LUX MLT NLD NOR PRT SWE //LTU:96 PAN 94 UKR 96 IRL 97.8 +foreach c of local water_list { + *replace dep_infra_impw2 = 0 if code=="UKR" //UKR special + replace ct_imp_wat_rec = 1 if code=="`c'" & ct_imp_wat_rec==. + replace water_flag = 1 if code=="`c'" +} +*/ + +//SP +gen sp_flag = . +replace sp_flag = 2 if oecd==1 +replace sp_flag = 2 if sp_flag==. & incgroup_current=="High income" + +//Findex +gen sp_findex = . +replace sp_findex = 2 if oecd==1 +replace sp_findex = 2 if sp_findex==. & incgroup_current=="High income" +replace sp_findex = 2 if sp_findex==. & code=="LUX" + +//impute LIS, and special CHN cases +foreach var of varlist ct_urban ct_age ct_male ct_educat4 ct_imp_wat_rec ct_electricity { + replace `var' = 1 if `var'==. & strpos(survname,"-LIS")>0 + replace `var' = 1 if `var'==. 
& code=="CHN" & (year==2013|year==2018) +} +replace use_microdata = 1 if code=="CHN" & (year==2013|year==2018) +replace use_microdata = 1 if strpos(survname,"-LIS")>0 +//special cases IND +foreach var of varlist ct_urban ct_age ct_male ct_imp_wat_rec ct_imp_san_rec ct_electricity ct_educat4 { + replace `var' = 1 if `var'==. & code=="IND" & (year>=2019 & year<=2021) +} + +//manual fix MDG2021 +foreach var of varlist ct_urban ct_age ct_male ct_imp_wat_rec ct_electricity ct_educat4 { + *replace `var' = 1 if `var'==. & code=="MDG" & year==2021 + replace `var' = 1 if `var'==. & code=="CMR" & year==2021 + replace `var' = 1 if `var'==. & code=="NPL" & year==2022 +} + +gen ct_poverty = 1 +order ct_poverty, after(level) + +isid code year +ren year surv_year +clonevar year = rep_year +isid code year +order year, after(surv_year) +//fix +drop if code=="MYS" & year==2021 +drop if code=="KAZ" & (year>=2019 & year<=2021) +compress + save "${upath2}\03.intermediate\Survey_varlist", replace \ No newline at end of file diff --git a/01.code/dofile/0-4a Coverage check.do b/01.code/dofile/0-4a Coverage check.do old mode 100644 new mode 100755 index 7d4162b..1b6dac5 --- a/01.code/dofile/0-4a Coverage check.do +++ b/01.code/dofile/0-4a Coverage check.do @@ -1,153 +1,151 @@ -//Coverage table (in theory) -clear all -tempfile data1 data2 data3 data4 dataall -save `dataall', replace emptyok - -global round AM24 -global popf code_inc_pop_regpcn.dta - -*global upath2 - -global lnystart 2010 -global lnyend 2022 - -global circa 3 -*global circa 1 - -//PIP region and WLD pop, pop_inc -use "${upath2}\\02.input\\${popf}" , clear -keep if year>=$lnystart &year<=$lnyend - -gen pop_inc = pop*(incgroup_historical=="Low income"|incgroup_historical=="Lower middle income") -collapse (sum) pop pop_inc, by( year region_pip) -ren pop allpop_reg -ren pop_inc allpop_inc -tempfile datax -save `datax', replace -collapse (sum) allpop_reg allpop_inc, by( year) -gen region_pip = "WLD" -append using `datax' -save 
`datax', replace - -//load survey list -use "${upath2}\03.intermediate\Survey_varlist", clear - -clonevar datayear = year - -//1.keep only year>=2012 for circa 2015 (-3) and above, add pcn_reg and pop, use reporting year -keep if datayear>= `=${lnystart}-${circa}' - -//add pop, pcn_reg -merge 1:1 code year using "${upath2}\\02.input\\${popf}" , keepus(pop region_pip) -drop if _merge==2 -ren pop pop_surv -drop _merge - -tempfile data1 -save `data1', replace - -//2.figure out the selection for circa -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code) - -//add more years -local start `=${lnystart}-${circa}' -local end `=${lnyend}+${circa}' -forv ly =`start'(1)`end' { - cap gen year`ly'=. -} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order - -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - forv j=1(1)${circa} { - //+ is prefered than -, and 1 is prefered than 2, so on - replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. - replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. - } -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code sel* over* -tempfile data2 -save `data2', replace - -//add back to the data so we can aggregate -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - use `data2', clear - gen year = sel`ly' - drop if year==. 
- merge 1:1 code year using `data1' - ta _merge - keep if _merge==3 - drop _merge - - //add pop and hist income group of lineup year - ren year datayear1 - gen year = `ly' - merge 1:1 code year using "${upath2}\\02.input\\${popf}" , keepus(pop incgroup_historical) - ta _merge - keep if _merge==3 - drop _merge - - gen pop_inc = pop*(incgroup_historical=="Low income"|incgroup_historical=="Lower middle income") - gen double overpop = pop if over`ly'==1 - gen double samelnypop = pop if datayear1==year - - gen economy = 1 - tempfile data3 - save `data3', replace - save "${upath2}\03.intermediate\Lineupcheck\Pov_cov_cir${circa}_lny`ly'", replace - - //WLD - collapse (rawsum) pop pop_inc overpop samelnypop economy [aw=pop], by(year) - gen region_pip = "WLD" - la var pop "Population of lineup year (survey)" - la var overpop "Population of overlapped lineup year (survey)" - la var samelnypop "Population of same lineup year (survey)" - append using `dataall' - save `dataall', replace - - //pcn_reg - use `data3', clear - collapse (rawsum) pop overpop samelnypop economy [aw=pop] , by(year region_pip) - append using `dataall' - save `dataall', replace - -} - -use `dataall', clear - -gen sh_newdata = 100*((pop-overpop)/ pop) -*gen sh_datainlny = 100*(samelnypop/ pop) -cap drop _merge -merge 1:1 year region_pip using `datax' -cap drop _merge -gen sh_datainlny = 100*(samelnypop/ allpop_reg) -gen sh_lmicpop = 100*(pop_inc/ allpop_inc) -gen sh_WLDpop = 100*(pop/allpop_reg) - -la var sh_WLDpop "share of world pop" -la var sh_lmicpop "share of lic and lmic pop" -la var sh_newdata "share of new data compared to previous year" -la var sh_datainlny "share of data in the same year as lineup year" - +//Coverage table (in theory) +clear all +tempfile data1 data2 data3 data4 dataall +save `dataall', replace emptyok + +global round AM24 +global popf code_inc_pop_regpcn.dta + +global lnystart 2010 +global lnyend 2022 + +global circa 3 +*global circa 1 + +//PIP region and WLD pop, pop_inc +use 
"${upath2}\\02.input\\${popf}" , clear +keep if year>=$lnystart &year<=$lnyend + +gen pop_inc = pop*(incgroup_historical=="Low income"|incgroup_historical=="Lower middle income") +collapse (sum) pop pop_inc, by( year region_pip) +ren pop allpop_reg +ren pop_inc allpop_inc +tempfile datax +save `datax', replace +collapse (sum) allpop_reg allpop_inc, by( year) +gen region_pip = "WLD" +append using `datax' +save `datax', replace + +//load survey list +use "${upath2}\03.intermediate\Survey_varlist", clear + +clonevar datayear = year + +//1.keep only year>=2012 for circa 2015 (-3) and above, add pcn_reg and pop, use reporting year +keep if datayear>= `=${lnystart}-${circa}' + +//add pop, pcn_reg +merge 1:1 code year using "${upath2}\\02.input\\${popf}" , keepus(pop region_pip) +drop if _merge==2 +ren pop pop_surv +drop _merge + +tempfile data1 +save `data1', replace + +//2.figure out the selection for circa +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. + replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code) + +//add more years +local start `=${lnystart}-${circa}' +local end `=${lnyend}+${circa}' +forv ly =`start'(1)`end' { + cap gen year`ly'=. +} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order + +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + gen sel`ly' = `ly' if year`ly'==1 + forv j=1(1)${circa} { + //+ is prefered than -, and 1 is prefered than 2, so on + replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. + replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. + } +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . 
+keep code sel* over* +tempfile data2 +save `data2', replace + +//add back to the data so we can aggregate +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + use `data2', clear + gen year = sel`ly' + drop if year==. + merge 1:1 code year using `data1' + ta _merge + keep if _merge==3 + drop _merge + + //add pop and hist income group of lineup year + ren year datayear1 + gen year = `ly' + merge 1:1 code year using "${upath2}\\02.input\\${popf}" , keepus(pop incgroup_historical) + ta _merge + keep if _merge==3 + drop _merge + + gen pop_inc = pop*(incgroup_historical=="Low income"|incgroup_historical=="Lower middle income") + gen double overpop = pop if over`ly'==1 + gen double samelnypop = pop if datayear1==year + + gen economy = 1 + tempfile data3 + save `data3', replace + save "${upath2}\03.intermediate\Lineupcheck\Pov_cov_cir${circa}_lny`ly'", replace + + //WLD + collapse (rawsum) pop pop_inc overpop samelnypop economy [aw=pop], by(year) + gen region_pip = "WLD" + la var pop "Population of lineup year (survey)" + la var overpop "Population of overlapped lineup year (survey)" + la var samelnypop "Population of same lineup year (survey)" + append using `dataall' + save `dataall', replace + + //pcn_reg + use `data3', clear + collapse (rawsum) pop overpop samelnypop economy [aw=pop] , by(year region_pip) + append using `dataall' + save `dataall', replace + +} + +use `dataall', clear + +gen sh_newdata = 100*((pop-overpop)/ pop) +*gen sh_datainlny = 100*(samelnypop/ pop) +cap drop _merge +merge 1:1 year region_pip using `datax' +cap drop _merge +gen sh_datainlny = 100*(samelnypop/ allpop_reg) +gen sh_lmicpop = 100*(pop_inc/ allpop_inc) +gen sh_WLDpop = 100*(pop/allpop_reg) + +la var sh_WLDpop "share of world pop" +la var sh_lmicpop "share of lic and lmic pop" +la var sh_newdata "share of new data compared to previous year" +la var sh_datainlny "share of data in the same year as lineup year" + save 
"${upath2}\03.intermediate\Lineupcheck\wld_pov_cov_circa${circa}", replace \ No newline at end of file diff --git a/01.code/dofile/0-7a Findex_quintiles 2010.do b/01.code/dofile/0-7a Findex_quintiles 2010.do deleted file mode 100644 index a4652f8..0000000 --- a/01.code/dofile/0-7a Findex_quintiles 2010.do +++ /dev/null @@ -1,55 +0,0 @@ -* Findex by national quintile and rural/urban (degurban, F2F only) -clear -tempfile data1 dataall -save `dataall', replace emptyok - -// ssc install gtools -*global upath2 - -global input ${upath2}\02.input\ - -*2011 -use "${input}/Findex/WLD_2011_FINDEX_v02_M.dta", clear -ren ecnmycode code -gen no_account = (account==2|account==3) if !mi(account) -gen year = 2011 -*clonevar degurban = urbanicity_f2f -*replace degurban = 3 if mi(degurban) -*lab def rural_2021 3 "Total", add - -tempfile data1 -save `data1', replace - -//quintile only -use `data1', clear -collapse (mean) no_account [aw=wgt], by(economy code year inc_q) -gen lvl = "national" -gen x = "q" -append using `dataall' -save `dataall', replace - -//national only -use `data1', clear -collapse (mean) no_account [aw=wgt], by(economy code year) -gen lvl = "national" -gen x = "nat" -append using `dataall' -save `dataall', replace - -gen y = "total" if x=="nat" -replace y = "q"+string(inc_q)+"total" if x=="q" -replace y = "q"+string(inc_q)+lvl if x=="q_urb_rur" - -replace no_account = no_account*100 -drop lvl x inc_q -reshape wide no_account , i( economy code year) j(y) string -order economy code year no_accounttotal no_accountq*total - -gen type = "Total" -replace code = "COD" if code=="ZAR" -replace code = "XKX" if code=="KSV" -replace code = "ROU" if code=="ROM" -replace code = "PSE" if code=="WBG" -isid code -compress -saveold "${input}/2010/findex_2010_quintiles.dta", replace diff --git a/01.code/dofile/0-7a Findex_quintiles 2021.do b/01.code/dofile/0-7a Findex_quintiles 2021.do old mode 100644 new mode 100755 index 1875eb1..73f30ac --- a/01.code/dofile/0-7a Findex_quintiles 
2021.do +++ b/01.code/dofile/0-7a Findex_quintiles 2021.do @@ -1,62 +1,59 @@ -* Findex by national quintile and rural/urban (degurban, F2F only) -clear -tempfile data1 dataall -save `dataall', replace emptyok - -// ssc install gtools -*global upath2 - -global input ${upath2}\02.input\ - -*2021 -use "${input}/Findex/WLD_2021_FINDEX_v03_M.dta", clear -ren economycode code -gen no_account = account==0 if !mi(account) -clonevar degurban = urbanicity_f2f -*replace degurban = 3 if mi(degurban) -*lab def rural_2021 3 "Total", add - -tempfile data1 -save `data1', replace - -//quintile and urban/rural only -drop if degurban==. -gcollapse (mean) no_account [aw=wgt], by(economy code year inc_q degurban) -decode degurban, gen(lvl) -replace lvl = lower(lvl) -gen x = "q_urb_rur" -drop degurban -append using `dataall' -save `dataall', replace - -//quintile only -use `data1', clear -gcollapse (mean) no_account [aw=wgt], by(economy code year inc_q) -gen lvl = "national" -gen x = "q" -append using `dataall' -save `dataall', replace - -//national only -use `data1', clear -gcollapse (mean) no_account [aw=wgt], by(economy code year) -gen lvl = "national" -gen x = "nat" -append using `dataall' -save `dataall', replace - -gen y = "total" if x=="nat" -replace y = "q"+string(inc_q)+"total" if x=="q" -replace y = "q"+string(inc_q)+lvl if x=="q_urb_rur" - -replace no_account = no_account*100 -drop lvl x inc_q -reshape wide no_account , i( economy code year) j(y) string -order economy code year no_accounttotal no_accountq*total *urban *rural - -gen type = "Urb_rur" if no_accountq1urban~=. -replace type = "Total" if no_accountq1total~=. & no_accountq1urban==. 
& type=="" - -isid code -compress -saveold "${input}/2021/findex_2021_quintiles.dta", replace +* Findex by national quintile and rural/urban (degurban, F2F only) +clear +tempfile data1 dataall +save `dataall', replace emptyok + +global input ${upath2}\02.input\ + +*2021 +use "${input}/Findex/WLD_2021_FINDEX_v03_M.dta", clear +ren economycode code +gen no_account = account==0 if !mi(account) +clonevar degurban = urbanicity_f2f +*replace degurban = 3 if mi(degurban) +*lab def rural_2021 3 "Total", add + +tempfile data1 +save `data1', replace + +//quintile and urban/rural only +drop if degurban==. +gcollapse (mean) no_account [aw=wgt], by(economy code year inc_q degurban) +decode degurban, gen(lvl) +replace lvl = lower(lvl) +gen x = "q_urb_rur" +drop degurban +append using `dataall' +save `dataall', replace + +//quintile only +use `data1', clear +gcollapse (mean) no_account [aw=wgt], by(economy code year inc_q) +gen lvl = "national" +gen x = "q" +append using `dataall' +save `dataall', replace + +//national only +use `data1', clear +gcollapse (mean) no_account [aw=wgt], by(economy code year) +gen lvl = "national" +gen x = "nat" +append using `dataall' +save `dataall', replace + +gen y = "total" if x=="nat" +replace y = "q"+string(inc_q)+"total" if x=="q" +replace y = "q"+string(inc_q)+lvl if x=="q_urb_rur" + +replace no_account = no_account*100 +drop lvl x inc_q +reshape wide no_account , i( economy code year) j(y) string +order economy code year no_accounttotal no_accountq*total *urban *rural + +gen type = "Urb_rur" if no_accountq1urban~=. +replace type = "Total" if no_accountq1total~=. & no_accountq1urban==. 
& type=="" + +isid code +compress +saveold "${input}/2021/findex_2021_quintiles.dta", replace diff --git a/01.code/dofile/0-7b Prep ASPIRE.do b/01.code/dofile/0-7b Prep ASPIRE.do old mode 100644 new mode 100755 index ccae58f..7350416 --- a/01.code/dofile/0-7b Prep ASPIRE.do +++ b/01.code/dofile/0-7b Prep ASPIRE.do @@ -1,16 +1,15 @@ -//ASPIRE - the data is only unique -clear -*global upath2 - -import excel using "${upath2}\02.input\ASPIRE\ASPIRE_data_touse.xlsx" , clear firstrow sheet(Data_ver2) - -gen type= "" -replace type = "Quintile" if _pop_All_SPL_q1~=. & _pop_All_SPL_q2~=. & _pop_All_SPL_q3~=. & _pop_All_SPL_q4~=. & _pop_All_SPL_q5~=. -replace type = "Urb" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur==. & _pop_All_SPL_urb~=. -replace type = "Rur" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur~=. & _pop_All_SPL_urb==. -replace type = "Urb_rur" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur~=. & _pop_All_SPL_urb~=. -replace type = "Total" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur==. & _pop_All_SPL_urb==. & _pop_All_SPL~=. - -compress -isid code +//ASPIRE - the data is only unique +clear + +import excel using "${upath2}\02.input\ASPIRE\ASPIRE_data_touse.xlsx" , clear firstrow sheet(Data_ver2) + +gen type= "" +replace type = "Quintile" if _pop_All_SPL_q1~=. & _pop_All_SPL_q2~=. & _pop_All_SPL_q3~=. & _pop_All_SPL_q4~=. & _pop_All_SPL_q5~=. +replace type = "Urb" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur==. & _pop_All_SPL_urb~=. +replace type = "Rur" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. 
& _pop_All_SPL_rur~=. & _pop_All_SPL_urb==. +replace type = "Urb_rur" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur~=. & _pop_All_SPL_urb~=. +replace type = "Total" if _pop_All_SPL_q1==. & _pop_All_SPL_q2==. & _pop_All_SPL_q3==. & _pop_All_SPL_q4==. & _pop_All_SPL_q5==. & _pop_All_SPL_rur==. & _pop_All_SPL_urb==. & _pop_All_SPL~=. + +compress +isid code saveold "${upath2}\02.input\2021\ASPIRE_data_2021.dta", replace \ No newline at end of file diff --git a/01.code/dofile/0-7c Prep JMP.do b/01.code/dofile/0-7c Prep JMP.do old mode 100644 new mode 100755 index 1b4d11f..df187cc --- a/01.code/dofile/0-7c Prep JMP.do +++ b/01.code/dofile/0-7c Prep JMP.do @@ -1,90 +1,88 @@ -* Improved water by national quintile and rural/urban -clear - -*global upath2 - -global circa 3 -global input ${upath2}\02.input\ - -global lnystart 2021 -global lnyend 2021 - -//Data -import excel using "${upath2}\02.input\jmp\data\jmp_clean.xlsx" , clear first sheet(estimates) -ren iso3 code - -drop source w_imp_prem w_imp_av w_imp_qual w_sm -replace type = "_" + type -reshape wide w_imp w_bas, i(code year) j(type) string - -tempfile data1 -save `data1', replace - -//2.figure out the selection for circa -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code) - -//add more years -local start `=${lnystart}-${circa}' -local end `=${lnyend}+${circa}' -forv ly =`start'(1)`end' { - cap gen year`ly'=. -} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order - -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - forv j=1(1)${circa} { - //+ is prefered than -, and 1 is prefered than 2, so on - replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. 
- replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. - } -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code sel* over* -tempfile data2 -save `data2', replace - -//add back to the data so we can aggregate -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - use `data2', clear - gen year = sel`ly' - drop if year==. - merge 1:1 code year using `data1' - ta _merge - keep if _merge==3 - drop _merge - - //add pop and hist income group of lineup year - ren year datayear1 - gen year = `ly' - - gen type= "" - replace type="Total" if w_imp_total~=. & w_imp_urban==. & w_imp_rural==. - replace type="Urb_rur" if type=="" - isid code - order code year w_imp_total w_imp_urban w_imp_rural w_bas_total w_bas_urban w_bas_rural - - compress - saveold "${upath2}\02.input\\`ly'\\JMP_cov_`ly'", replace -} - +* Improved water by national quintile and rural/urban +clear + +global circa 3 +global input ${upath2}\02.input\ + +global lnystart 2021 +global lnyend 2021 + +//Data +import excel using "${upath2}\02.input\jmp\data\jmp_clean.xlsx" , clear first sheet(estimates) +ren iso3 code + +drop source w_imp_prem w_imp_av w_imp_qual w_sm +replace type = "_" + type +reshape wide w_imp w_bas, i(code year) j(type) string + +tempfile data1 +save `data1', replace + +//2.figure out the selection for circa +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. + replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code) + +//add more years +local start `=${lnystart}-${circa}' +local end `=${lnyend}+${circa}' +forv ly =`start'(1)`end' { + cap gen year`ly'=. 
+} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order + +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + gen sel`ly' = `ly' if year`ly'==1 + forv j=1(1)${circa} { + //+ is prefered than -, and 1 is prefered than 2, so on + replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. + replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. + } +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . +keep code sel* over* +tempfile data2 +save `data2', replace + +//add back to the data so we can aggregate +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + use `data2', clear + gen year = sel`ly' + drop if year==. + merge 1:1 code year using `data1' + ta _merge + keep if _merge==3 + drop _merge + + //add pop and hist income group of lineup year + ren year datayear1 + gen year = `ly' + + gen type= "" + replace type="Total" if w_imp_total~=. & w_imp_urban==. & w_imp_rural==. 
+ replace type="Urb_rur" if type=="" + isid code + order code year w_imp_total w_imp_urban w_imp_rural w_bas_total w_bas_urban w_bas_rural + + compress + saveold "${upath2}\02.input\\`ly'\\JMP_cov_`ly'", replace +} + diff --git a/01.code/dofile/0-7d Prep GED.do b/01.code/dofile/0-7d Prep GED.do old mode 100644 new mode 100755 index f8574dd..72c6d90 --- a/01.code/dofile/0-7d Prep GED.do +++ b/01.code/dofile/0-7d Prep GED.do @@ -1,98 +1,96 @@ -//Electricity -clear - -*global upath2 - -global circa 3 -global input ${upath2}\02.input\ - -global lnystart 2021 -global lnyend 2021 - -import excel using "${upath2}\02.input\GED\sdg7.1.1-access_to_electricity.xlsx" , sheet(UN reporting) clear first -//TimePeriod Value Units Nature Location Reporting Type FootNote Source ISOalpha3 - -ren * , lower - -ren timeperiod year -ren isoalpha3 code -replace value = "" if value=="NaN" -destring value, replace -replace location = lower(location) -keep code year value location -replace location = "total" if location=="allarea" -replace location = "_" + location -drop if code=="NULL" -ren value ged -reshape wide ged, i(code year) j(location) string -saveold "${upath2}\02.input\GED_data", replace -tempfile data1 -save `data1', replace - -//2.figure out the selection for circa -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code) - -//add more years -local start `=${lnystart}-${circa}' -local end `=${lnyend}+${circa}' -forv ly =`start'(1)`end' { - cap gen year`ly'=. -} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order - -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - forv j=1(1)${circa} { - //+ is prefered than -, and 1 is prefered than 2, so on - replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. 
- replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. - } -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code sel* over* -tempfile data2 -save `data2', replace - -//add back to the data so we can aggregate -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - use `data2', clear - gen year = sel`ly' - drop if year==. - merge 1:1 code year using `data1' - ta _merge - keep if _merge==3 - drop _merge - - //add pop and hist income group of lineup year - ren year datayear1 - gen year = `ly' - - gen type= "" - replace type="Total" if ged_total~=. & ged_urban==. & ged_rural==. - replace type="Urb_rur" if type=="" - isid code - order code year ged_total ged_urban ged_rural - - compress - saveold "${upath2}\02.input\\`ly'\\GED_cov_`ly'", replace -} +//Electricity +clear + +global circa 3 +global input ${upath2}\02.input\ + +global lnystart 2021 +global lnyend 2021 + +import excel using "${upath2}\02.input\GED\sdg7.1.1-access_to_electricity.xlsx" , sheet(UN reporting) clear first +//TimePeriod Value Units Nature Location Reporting Type FootNote Source ISOalpha3 + +ren * , lower + +ren timeperiod year +ren isoalpha3 code +replace value = "" if value=="NaN" +destring value, replace +replace location = lower(location) +keep code year value location +replace location = "total" if location=="allarea" +replace location = "_" + location +drop if code=="NULL" +ren value ged +reshape wide ged, i(code year) j(location) string +saveold "${upath2}\02.input\GED_data", replace +tempfile data1 +save `data1', replace + +//2.figure out the selection for circa +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. 
+ replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code) + +//add more years +local start `=${lnystart}-${circa}' +local end `=${lnyend}+${circa}' +forv ly =`start'(1)`end' { + cap gen year`ly'=. +} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order + +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + gen sel`ly' = `ly' if year`ly'==1 + forv j=1(1)${circa} { + //+ is prefered than -, and 1 is prefered than 2, so on + replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. + replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. + } +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . +keep code sel* over* +tempfile data2 +save `data2', replace + +//add back to the data so we can aggregate +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + use `data2', clear + gen year = sel`ly' + drop if year==. + merge 1:1 code year using `data1' + ta _merge + keep if _merge==3 + drop _merge + + //add pop and hist income group of lineup year + ren year datayear1 + gen year = `ly' + + gen type= "" + replace type="Total" if ged_total~=. & ged_urban==. & ged_rural==. 
+ replace type="Urb_rur" if type=="" + isid code + order code year ged_total ged_urban ged_rural + + compress + saveold "${upath2}\02.input\\`ly'\\GED_cov_`ly'", replace +} diff --git a/01.code/dofile/0-7e Prep UNESCO.do b/01.code/dofile/0-7e Prep UNESCO.do old mode 100644 new mode 100755 index d05afc1..e101af3 --- a/01.code/dofile/0-7e Prep UNESCO.do +++ b/01.code/dofile/0-7e Prep UNESCO.do @@ -1,100 +1,98 @@ -//UNESCO -clear - -*global upath2 - -global circa 3 -global input ${upath2}\02.input\ - -global lnystart 2021 -global lnyend 2021 - -import excel using "${upath2}\02.input\UNESCO\UNESCO_Education_CompletedPrimaryorHigher.xlsx" , clear first -//TimePeriod Value Units Nature Location Reporting Type FootNote Source ISOalpha3 - -ren * , lower -ren completedprimaryeducationorh location -ren countrycode code -drop if code=="" -replace location = lower(location) -drop if location=="poorest quintile" | location=="richest quintile" -reshape long yr, i(country code location) j(year) -ren yr unesco -replace location = "_" + location -reshape wide unesco, i(country code year) j(location) string -drop if unesco_rural==. & unesco_total==. & unesco_urban==. -drop if unesco_total==. & (unesco_rural==. | unesco_urban==.) - -saveold "${upath2}\02.input\UNESCO_data", replace -tempfile data1 -save `data1', replace - -//2.figure out the selection for circa -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code) - -//add more years -local start `=${lnystart}-${circa}' -local end `=${lnyend}+${circa}' -forv ly =`start'(1)`end' { - cap gen year`ly'=. 
-} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order - -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - forv j=1(1)${circa} { - //+ is prefered than -, and 1 is prefered than 2, so on - replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. - replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. - } -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code sel* over* -tempfile data2 -save `data2', replace - -//add back to the data so we can aggregate -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - use `data2', clear - gen year = sel`ly' - drop if year==. - merge 1:1 code year using `data1' - ta _merge - keep if _merge==3 - drop _merge - - //add pop and hist income group of lineup year - ren year datayear1 - gen year = `ly' - - gen type= "" - replace type="Total" if unesco_total~=. & unesco_urban==. & unesco_rural==. - replace type="Urb_rur" if type=="" & unesco_urban~=. & unesco_rural~=. - replace type="Urb" if type=="" & unesco_urban~=. & unesco_rural==. - replace type="Rur" if type=="" & unesco_urban==. & unesco_rural~=. 
- isid code - order code year unesco_total unesco_urban unesco_rural - - compress - saveold "${upath2}\02.input\\`ly'\\UNESCO_cov_`ly'", replace -} +//UNESCO +clear + +global circa 3 +global input ${upath2}\02.input\ + +global lnystart 2021 +global lnyend 2021 + +import excel using "${upath2}\02.input\UNESCO\UNESCO_Education_CompletedPrimaryorHigher.xlsx" , clear first +//TimePeriod Value Units Nature Location Reporting Type FootNote Source ISOalpha3 + +ren * , lower +ren completedprimaryeducationorh location +ren countrycode code +drop if code=="" +replace location = lower(location) +drop if location=="poorest quintile" | location=="richest quintile" +reshape long yr, i(country code location) j(year) +ren yr unesco +replace location = "_" + location +reshape wide unesco, i(country code year) j(location) string +drop if unesco_rural==. & unesco_total==. & unesco_urban==. +drop if unesco_total==. & (unesco_rural==. | unesco_urban==.) + +saveold "${upath2}\02.input\UNESCO_data", replace +tempfile data1 +save `data1', replace + +//2.figure out the selection for circa +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. + replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code) + +//add more years +local start `=${lnystart}-${circa}' +local end `=${lnyend}+${circa}' +forv ly =`start'(1)`end' { + cap gen year`ly'=. +} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order + +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + gen sel`ly' = `ly' if year`ly'==1 + forv j=1(1)${circa} { + //+ is prefered than -, and 1 is prefered than 2, so on + replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. + replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. 
+ } +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . +keep code sel* over* +tempfile data2 +save `data2', replace + +//add back to the data so we can aggregate +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + use `data2', clear + gen year = sel`ly' + drop if year==. + merge 1:1 code year using `data1' + ta _merge + keep if _merge==3 + drop _merge + + //add pop and hist income group of lineup year + ren year datayear1 + gen year = `ly' + + gen type= "" + replace type="Total" if unesco_total~=. & unesco_urban==. & unesco_rural==. + replace type="Urb_rur" if type=="" & unesco_urban~=. & unesco_rural~=. + replace type="Urb" if type=="" & unesco_urban~=. & unesco_rural==. + replace type="Rur" if type=="" & unesco_urban==. & unesco_rural~=. + isid code + order code year unesco_total unesco_urban unesco_rural + + compress + saveold "${upath2}\02.input\\`ly'\\UNESCO_cov_`ly'", replace +} diff --git a/01.code/dofile/0-8 Water and Elec WDI.do b/01.code/dofile/0-8 Water and Elec WDI.do old mode 100644 new mode 100755 index b2f2924..2626dae --- a/01.code/dofile/0-8 Water and Elec WDI.do +++ b/01.code/dofile/0-8 Water and Elec WDI.do @@ -1,29 +1,27 @@ -//Data input for Coverage (water and JMP) -clear all -tempfile data1 data2 data3 data4 -global rnd AM2024 - -*global upath2 - -*ssc install wbopendata -// Access to electricity (% of population) -wbopendata, language(en - English) indicator(eg.elc.accs.zs) long clear -ren eg_elc_accs_zs elec_wdi -ren countrycode code -keep code year elec_wdi -tempfile data1 -save `data1', replace - -import excel "${upath2}\02.input\jmp\data\jmp_clean.xlsx", sheet("estimates") firstrow clear -ren iso3 code -rename w_imp wat_jmp -keep if type == "total" -keep code year wat_jmp -merge 1:1 code year using `data1' -keep if _merge==3 -drop _merge - -saveold 
"${upath2}\02.input\WDI_elec_water", replace - - - +//Data input for Coverage (water and JMP) +clear all +tempfile data1 data2 data3 data4 +global rnd AM2024 + +*ssc install wbopendata +// Access to electricity (% of population) +wbopendata, language(en - English) indicator(eg.elc.accs.zs) long clear +ren eg_elc_accs_zs elec_wdi +ren countrycode code +keep code year elec_wdi +tempfile data1 +save `data1', replace + +import excel "${upath2}\02.input\jmp\data\jmp_clean.xlsx", sheet("estimates") firstrow clear +ren iso3 code +rename w_imp wat_jmp +keep if type == "total" +keep code year wat_jmp +merge 1:1 code year using `data1' +keep if _merge==3 +drop _merge + +saveold "${upath2}\02.input\WDI_elec_water", replace + + + diff --git a/01.code/dofile/1-1 Get list for LISSY.do b/01.code/dofile/1-1 Get list for LISSY.do old mode 100644 new mode 100755 index c7d30a8..75b878a --- a/01.code/dofile/1-1 Get list for LISSY.do +++ b/01.code/dofile/1-1 Get list for LISSY.do @@ -1,159 +1,141 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
-clear all -tempfile data1 data2 data3 data4 cpidata -global rnd AM2024 -global rnd1 AM24 - -*global upath2 - -global circa 3 -global lnystart 2021 -global lnyend 2021 -global lnvalues 215 365 685 - -//load cpi -*dlw, country(support) year(2005) type(gmdraw) filename(Final_CPI_PPP_to_be_used.dta) files surveyid(Support_2005_CPI_v11_M) -use "${upath2}\02.input\Final_CPI_PPP_to_be_used.dta", clear -gen lis = strpos(survname,"-LIS")>0 -keep if lis==1 -keep if inlist(code,"AUS","CAN","DEU","GBR","ISR","JPN","KOR","TWN", "USA") -save `cpidata', replace - -//load pfw -*dlw, country(support) year(2005) type(gmdraw) filename(Survey_price_framework.dta) files surveyid(Support_2005_CPI_v11_M) -use "${upath2}\02.input\Survey_price_framework.dta", clear -gen lis = strpos(survname,"-LIS")>0 -keep if lis==1 -keep if inlist(code,"AUS","CAN","DEU","GBR","ISR","JPN","KOR","TWN", "USA") -gen code2 = "" -replace code2 = "au" if code=="AUS" -replace code2 = "ca" if code=="CAN" -replace code2 = "de" if code=="DEU" -replace code2 = "uk" if code=="GBR" -replace code2 = "il" if code=="ISR" -replace code2 = "jp" if code=="JPN" -replace code2 = "kr" if code=="KOR" -replace code2 = "tw" if code=="TWN" -replace code2 = "us" if code=="USA" -tostring year, gen(yr) -gen yr2 = substr(yr,3,2) -gen file = code2 + yr2 -keep code year file survname rep_year ref_year comparability datatype survey_coverage -save `data1', replace - -//get latest from data1 -bys code: egen ymax = max(year) -keep if year == ymax -drop ymax -save `data2', replace - -//2.figure out the selection for circa -use `data1', clear -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code datatype) - -//add more years -local start `=${lnystart}-3' -local end `=${lnyend}+3' -forv ly =`start'(1)`end' { - cap gen year`ly'=. 
-} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order - -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - //1 year - replace sel`ly' = `=`ly'+1' if year`=`ly'+1'==1 & sel`ly'==. - replace sel`ly' = `=`ly'-1' if year`=`ly'-1'==1 & sel`ly'==. - //2 years - replace sel`ly' = `=`ly'+2' if year`=`ly'+2'==1 & sel`ly'==. - replace sel`ly' = `=`ly'-2' if year`=`ly'-2'==1 & sel`ly'==. - //3 year2 - replace sel`ly' = `=`ly'+3' if year`=`ly'+3'==1 & sel`ly'==. - replace sel`ly' = `=`ly'-3' if year`=`ly'-3'==1 & sel`ly'==. -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code datatype sel* over* -save `data3', replace - -//add back to the data -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - use `data3', clear - gen ln = `ly' - gen year = sel`ly' - drop if year==. 
- merge 1:1 code year datatype using `data1' - ta _merge - keep if _merge==3 - drop _merge - gen type = 1 - save `data4', replace - - keep code - duplicates drop code, force - merge 1:1 code using `data2' - drop if _merge==3 | _merge==1 - drop _merge - gen type = 2 - gen ln = `ly' - append using `data4' - la def type 1 "Within +-3 years" 2 "latest but not within +-3 years" - la val type type - drop sel* over* - sort code - //bring in cpi icp - merge 1:1 code year survname using `cpidata', keepus(cpi2017 icp2017) - keep if _merge==3 - drop _merge - //bring in lineup values 215 and 685 - gen country_code = code - ren year surv_year - gen year = `ly' - foreach ln of global lnvalues { - merge 1:1 country_code year using "${upath2}\03.intermediate\PIPinput\PIP_`ly'_`ln'.dta", keepus(headcount) - keep if _merge==3 - drop _merge - ren headcount pov`ln' - } - drop country_code year - ren surv_year year - //only keep wtihin +-3 - keep if type==1 - order code year file cpi2017 icp2017 pov215 pov365 pov685 - - saveold "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_`ly'", replace -} +clear all +tempfile data1 data2 data3 data4 cpidata +global rnd AM2024 +global rnd1 AM24 + +global circa 3 +global lnystart 2021 +global lnyend 2021 +global lnvalues 215 365 685 + +//load cpi +*dlw, country(support) year(2005) type(gmdraw) filename(Final_CPI_PPP_to_be_used.dta) files surveyid(Support_2005_CPI_v11_M) +use "${upath2}\02.input\Final_CPI_PPP_to_be_used.dta", clear +gen lis = strpos(survname,"-LIS")>0 +keep if lis==1 +keep if inlist(code,"AUS","CAN","DEU","GBR","ISR","JPN","KOR","TWN", "USA") +save `cpidata', replace + +//load pfw +*dlw, country(support) year(2005) type(gmdraw) filename(Survey_price_framework.dta) files surveyid(Support_2005_CPI_v11_M) +use "${upath2}\02.input\Survey_price_framework.dta", clear +gen lis = strpos(survname,"-LIS")>0 +keep if lis==1 +keep if inlist(code,"AUS","CAN","DEU","GBR","ISR","JPN","KOR","TWN", "USA") +gen code2 = "" +replace code2 = "au" if 
code=="AUS" +replace code2 = "ca" if code=="CAN" +replace code2 = "de" if code=="DEU" +replace code2 = "uk" if code=="GBR" +replace code2 = "il" if code=="ISR" +replace code2 = "jp" if code=="JPN" +replace code2 = "kr" if code=="KOR" +replace code2 = "tw" if code=="TWN" +replace code2 = "us" if code=="USA" +tostring year, gen(yr) +gen yr2 = substr(yr,3,2) +gen file = code2 + yr2 +keep code year file survname rep_year ref_year comparability datatype survey_coverage +save `data1', replace + +//get latest from data1 +bys code: egen ymax = max(year) +keep if year == ymax +drop ymax +save `data2', replace + +//2.figure out the selection for circa +use `data1', clear +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. + replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code datatype) + +//add more years +local start `=${lnystart}-3' +local end `=${lnyend}+3' +forv ly =`start'(1)`end' { + cap gen year`ly'=. +} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order + +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + gen sel`ly' = `ly' if year`ly'==1 + //1 year + replace sel`ly' = `=`ly'+1' if year`=`ly'+1'==1 & sel`ly'==. + replace sel`ly' = `=`ly'-1' if year`=`ly'-1'==1 & sel`ly'==. + //2 years + replace sel`ly' = `=`ly'+2' if year`=`ly'+2'==1 & sel`ly'==. + replace sel`ly' = `=`ly'-2' if year`=`ly'-2'==1 & sel`ly'==. + //3 year2 + replace sel`ly' = `=`ly'+3' if year`=`ly'+3'==1 & sel`ly'==. + replace sel`ly' = `=`ly'-3' if year`=`ly'-3'==1 & sel`ly'==. +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . 
+keep code datatype sel* over* +save `data3', replace + +//add back to the data +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + use `data3', clear + gen ln = `ly' + gen year = sel`ly' + drop if year==. + merge 1:1 code year datatype using `data1' + ta _merge + keep if _merge==3 + drop _merge + gen type = 1 + save `data4', replace + + keep code + duplicates drop code, force + merge 1:1 code using `data2' + drop if _merge==3 | _merge==1 + drop _merge + gen type = 2 + gen ln = `ly' + append using `data4' + la def type 1 "Within +-3 years" 2 "latest but not within +-3 years" + la val type type + drop sel* over* + sort code + //bring in cpi icp + merge 1:1 code year survname using `cpidata', keepus(cpi2017 icp2017) + keep if _merge==3 + drop _merge + //bring in lineup values 215 and 685 + gen country_code = code + ren year surv_year + gen year = `ly' + foreach ln of global lnvalues { + merge 1:1 country_code year using "${upath2}\03.intermediate\PIPinput\PIP_`ly'_`ln'.dta", keepus(headcount) + keep if _merge==3 + drop _merge + ren headcount pov`ln' + } + drop country_code year + ren surv_year year + //only keep wtihin +-3 + keep if type==1 + order code year file cpi2017 icp2017 pov215 pov365 pov685 + + saveold "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_`ly'", replace +} diff --git a/01.code/dofile/1-2 Get list for GMD full.do b/01.code/dofile/1-2 Get list for GMD full.do old mode 100644 new mode 100755 index 8215c10..58a4afa --- a/01.code/dofile/1-2 Get list for GMD full.do +++ b/01.code/dofile/1-2 Get list for GMD full.do @@ -1,235 +1,217 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. 
- -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . - -//Load all data and check subnational data together with other data -clear all -tempfile data1 data2 data3 data4 class data1list data2a -global rnd AM2024 - -*global upath2 -global lnlist 2010 2021 -global circa 3 - -local lnlist2 : subinstr global lnlist " " ",",all -global lnystart = min(`lnlist2') -global lnyend = max(`lnlist2') - -*global lnystart 2010 -*global lnyend 2010 - -//Load survey list -use "${upath2}\03.intermediate\Survey_varlist", clear -drop if code=="MDG" & year==2021 -drop if code=="IRN" & year==2011 //no educat4 -drop if code=="NGA" & year==2022 - -keep if use_microdata==1 -*drop if strpos(survname,"-LIS")>0 -drop ct_subnatid* - -tempfile data1 -save `data1', replace - -keep code -duplicates drop code, force -save `data1list', replace - -//get latest from data1 -use `data1', clear -bys code: egen ymax = max(year) -keep if year == ymax -drop ymax -save `data2a', replace - -//2.figure out the selection for circa -use `data1', clear -su year,d -local miny = r(min) -local maxy = r(max) -forv y=`miny'(1)`maxy' { - gen year`y'=. - replace year`y'= 1 if year==`y' -} -drop year -collapse (sum) year*, by(code) - -//add more years -foreach ly1 of global lnlist { - local start `=`ly1'-${circa}' - local end `=`ly1'+${circa}' - forv ly =`start'(1)`end' { - cap gen year`ly'=. 
- } -} - -//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order -local start $lnystart -local end $lnyend -forv ly = `start'(1)`end' { - gen sel`ly' = `ly' if year`ly'==1 - forv j=1(1)${circa} { - //+ is prefered than -, and 1 is prefered than 2, so on - replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. - replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. - } -} - -//check overlapped -local start `=${lnystart}+1' -local end ${lnyend} -forv ly = `start'(1)`end' { - gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. -} -gen over${lnystart} = . -keep code sel* over* -tempfile data2 -save `data2', replace - -//add back to the data so we can aggregate -*local start $lnystart -*local end $lnyend -foreach ly of global lnlist { -*forv ly = `start'(1)`end' { - use `data2', clear - gen year = sel`ly' - drop if year==. - merge 1:1 code year using `data1' - ta _merge - keep if _merge==3 - drop _merge - gen type = 1 - save `data4', replace - - keep code - duplicates drop code, force - merge 1:1 code using `data1list' - keep if _merge==2 - drop _merge - - merge 1:m code using `data1' - keep if _merge==3 - drop _merge - - gen diff = abs(rep_year-`ly') - bys code (rep_year): egen yrmin = min(diff) - keep if diff==yrmin - bys code (diff): egen yrmax = max(rep_year) - gen yrm = rep_year ==yrmax - drop if yrm==0 - drop yrm diff yrmin yrmax - - gen type = 2 - gen ln = `ly' - append using `data4' - la def type 1 "Within +-3 years" 2 "latest but not within +-3 years" - la val type type - drop sel* over* - sort code - ren type surtype - - //add pop and hist income group of lineup year - ren year datayear1 - gen year = `ly' - - //ASPIRE - merge 1:1 code using "${upath2}\02.input\\`ly'\\ASPIRE_data_`ly'.dta", keepus(type _pop_All_SPL) - ren type type_aspire - drop if _merge==2 - drop _merge - - //Findex - merge 1:1 code using 
"${upath2}\02.input\\`ly'\\findex_`ly'_quintiles.dta", keepus(type) - ren type type_findex - drop if _merge==2 - drop _merge - - //UNESCO_cov_2021 - merge 1:1 code using "${upath2}\02.input\\`ly'\\UNESCO_cov_`ly'.dta", keepus(type) - ren type type_unesco - drop if _merge==2 - drop _merge - - //GED ==> review when it is 100 - merge 1:1 code using "${upath2}\02.input\\`ly'\\GED_cov_`ly'.dta", keepus(type) - ren type type_ged - drop if _merge==2 - drop _merge - - //JMP - merge 1:1 code using "${upath2}\02.input\\`ly'\\JMP_cov_`ly'.dta", keepus(type) - ren type type_jmp - drop if _merge==2 - drop _merge - - //flag system: 1 in survey, 2 universal, 3 to fuse, 4 missing, cant fuse - gen ct_sp = . - gen ct_findex = ct_fin_account - gen findex_flag = . - gen edu_flag = . - - //universal - gen All_SPL =round(_pop_All_SPL,1) - replace sp_flag = 2 if All_SPL>=97 & All_SPL~=. - replace sp_flag = . if (code=="COL"|code=="MEX"|code=="TUR") & All_SPL<97 //COL MEX is not universal - - replace ct_sp = 2 if sp_flag==2 - replace ct_imp_wat_rec = 2 if water_flag==2 - replace ct_electricity = 2 if elec_flag==2 - - //fused - replace water_flag=3 if type_jmp~="" & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) - replace ct_imp_wat_rec = 3 if water_flag==3 & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) - replace ct_imp_wat_rec = 2 if water_flag==2 & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) - - replace elec_flag=3 if type_ged~="" & (ct_electricity==0|ct_electricity==.) - replace ct_electricity = 3 if elec_flag==3 & (ct_electricity==0|ct_electricity==.) - replace ct_electricity = 2 if elec_flag==2 & (ct_electricity==0|ct_electricity==.) - - replace sp_flag=3 if type_aspire~="" & (ct_sp==0|ct_sp==.) - replace ct_sp = 2 if sp_flag==2 & (ct_sp==0|ct_sp==.) - replace ct_sp = 3 if sp_flag==3 & (ct_sp==0|ct_sp==.) - - replace findex_flag=3 if type_findex~="" & (ct_findex==0|ct_findex==.) - replace ct_findex = 2 if findex_flag==2 & (ct_findex==0|ct_findex==.) 
- replace ct_findex = 3 if findex_flag==3 & (ct_findex==0|ct_findex==.) - replace ct_findex = 2 if code=="LUX" - replace findex_flag = 2 if code=="LUX" - - replace edu_flag=3 if type_unesco~="" & (ct_educat4==0|ct_educat4==.) - replace ct_educat4 = 2 if edu_flag==2 & (ct_educat4==0|ct_educat4==.) - replace ct_educat4 = 3 if edu_flag==3 & (ct_educat4==0|ct_educat4==.) - - //Update flags - replace edu_flag = 1 if edu_flag==. & ct_educat4==1 - replace water_flag = 1 if water_flag==. & ct_imp_wat_rec==1 - replace elec_flag = 1 if elec_flag==. & ct_electricity==1 - replace sp_flag = 1 if sp_flag==. & ct_sp==1 - replace findex_flag = 1 if findex_flag==. & ct_findex==1 - - //todo when all variables are available, and surveys are within +-3, rep_year>= `ly' - ${circa} & rep_year<= `ly' + ${circa} - gen todo = . - replace todo = 1 if ct_poverty~=. & ct_imp_wat_rec~=. & ct_electricity~=. & ct_educat4~=. & ct_sp~=. & ct_findex~=. & (rep_year >= `ly' - ${circa} & rep_year <= `ly' + ${circa}) - - drop ct_gaul_adm1_code ct_w_30m ct_roof ct_wall ct_floor groupcode groupname country ct_male ct_imp_san_rec All_SPL - - isid code - order code year ct_* *_flag type_* - compress - - saveold "${upath2}\02.input\\`ly'\\GMD_list_`ly'", replace -} - +clear all +tempfile data1 data2 data3 data4 class data1list data2a +global rnd AM2024 + +global lnlist 2021 2021 +global circa 3 + +local lnlist2 : subinstr global lnlist " " ",",all +global lnystart = min(`lnlist2') +global lnyend = max(`lnlist2') + +*global lnystart 2021 +*global lnyend 2021 + +//Load survey list +use "${upath2}\03.intermediate\Survey_varlist", clear +drop if code=="MDG" & year==2021 +drop if code=="IRN" & year==2011 //no educat4 +drop if code=="NGA" & year==2022 + +keep if use_microdata==1 +*drop if strpos(survname,"-LIS")>0 +drop ct_subnatid* + +tempfile data1 +save `data1', replace + +keep code +duplicates drop code, force +save `data1list', replace + +//get latest from data1 +use `data1', clear +bys code: egen ymax = 
max(year) +keep if year == ymax +drop ymax +save `data2a', replace + +//2.figure out the selection for circa +use `data1', clear +su year,d +local miny = r(min) +local maxy = r(max) +forv y=`miny'(1)`maxy' { + gen year`y'=. + replace year`y'= 1 if year==`y' +} +drop year +collapse (sum) year*, by(code) + +//add more years +foreach ly1 of global lnlist { + local start `=`ly1'-${circa}' + local end `=`ly1'+${circa}' + forv ly =`start'(1)`end' { + cap gen year`ly'=. + } +} + +//lineup year, select the same year first, then +-1, then +-2, then +-3, prefers the latest data first and closer, in that order +local start $lnystart +local end $lnyend +forv ly = `start'(1)`end' { + di "Processing year: `ly'" + gen sel`ly' = `ly' if year`ly'==1 + forv j=1(1)${circa} { + //+ is prefered than -, and 1 is prefered than 2, so on + replace sel`ly' = `=`ly'+`j'' if year`=`ly'+`j''==1 & sel`ly'==. + replace sel`ly' = `=`ly'-`j'' if year`=`ly'-`j''==1 & sel`ly'==. + } +} + +//check overlapped +local start `=${lnystart}+1' +local end ${lnyend} +forv ly = `start'(1)`end' { + gen over`ly' = 1 if sel`ly'==sel`=`ly'-1' & sel`ly'~=. & sel`=`ly'-1'~=. +} +gen over${lnystart} = . +keep code sel* over* +tempfile data2 +save `data2', replace + +//add back to the data so we can aggregate +*local start $lnystart +*local end $lnyend +foreach ly of global lnlist { +*forv ly = `start'(1)`end' { + use `data2', clear + gen year = sel`ly' + drop if year==. 
+ merge 1:1 code year using `data1' + ta _merge + keep if _merge==3 + drop _merge + gen type = 1 + save `data4', replace + + keep code + duplicates drop code, force + merge 1:1 code using `data1list' + keep if _merge==2 + drop _merge + + merge 1:m code using `data1' + keep if _merge==3 + drop _merge + + gen diff = abs(rep_year-`ly') + bys code (rep_year): egen yrmin = min(diff) + keep if diff==yrmin + bys code (diff): egen yrmax = max(rep_year) + gen yrm = rep_year ==yrmax + drop if yrm==0 + drop yrm diff yrmin yrmax + + gen type = 2 + gen ln = `ly' + append using `data4' + la def type 1 "Within +-3 years" 2 "latest but not within +-3 years" + la val type type + drop sel* over* + sort code + ren type surtype + + //add pop and hist income group of lineup year + ren year datayear1 + gen year = `ly' + + //ASPIRE + merge 1:1 code using "${upath2}\02.input\\`ly'\\ASPIRE_data_`ly'.dta", keepus(type _pop_All_SPL) + ren type type_aspire + drop if _merge==2 + drop _merge + + //Findex + merge 1:1 code using "${upath2}\02.input\\`ly'\\findex_`ly'_quintiles.dta", keepus(type) + ren type type_findex + drop if _merge==2 + drop _merge + + //UNESCO_cov_2021 + merge 1:1 code using "${upath2}\02.input\\`ly'\\UNESCO_cov_`ly'.dta", keepus(type) + ren type type_unesco + drop if _merge==2 + drop _merge + + //GED ==> review when it is 100 + merge 1:1 code using "${upath2}\02.input\\`ly'\\GED_cov_`ly'.dta", keepus(type) + ren type type_ged + drop if _merge==2 + drop _merge + + //JMP + merge 1:1 code using "${upath2}\02.input\\`ly'\\JMP_cov_`ly'.dta", keepus(type) + ren type type_jmp + drop if _merge==2 + drop _merge + + //flag system: 1 in survey, 2 universal, 3 to fuse, 4 missing, cant fuse + gen ct_sp = . + gen ct_findex = ct_fin_account + gen findex_flag = . + gen edu_flag = . + + //universal + gen All_SPL =round(_pop_All_SPL,1) + replace sp_flag = 2 if All_SPL>=97 & All_SPL~=. + replace sp_flag = . 
if (code=="COL"|code=="MEX"|code=="TUR") & All_SPL<97 //COL MEX is not universal + + replace ct_sp = 2 if sp_flag==2 + replace ct_imp_wat_rec = 2 if water_flag==2 + replace ct_electricity = 2 if elec_flag==2 + + //fused + replace water_flag=3 if type_jmp~="" & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) + replace ct_imp_wat_rec = 3 if water_flag==3 & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) + replace ct_imp_wat_rec = 2 if water_flag==2 & (ct_imp_wat_rec==0|ct_imp_wat_rec==.) + + replace elec_flag=3 if type_ged~="" & (ct_electricity==0|ct_electricity==.) + replace ct_electricity = 3 if elec_flag==3 & (ct_electricity==0|ct_electricity==.) + replace ct_electricity = 2 if elec_flag==2 & (ct_electricity==0|ct_electricity==.) + + replace sp_flag=3 if type_aspire~="" & (ct_sp==0|ct_sp==.) + replace ct_sp = 2 if sp_flag==2 & (ct_sp==0|ct_sp==.) + replace ct_sp = 3 if sp_flag==3 & (ct_sp==0|ct_sp==.) + + replace findex_flag=3 if type_findex~="" & (ct_findex==0|ct_findex==.) + replace ct_findex = 2 if findex_flag==2 & (ct_findex==0|ct_findex==.) + replace ct_findex = 3 if findex_flag==3 & (ct_findex==0|ct_findex==.) + replace ct_findex = 2 if code=="LUX" + replace findex_flag = 2 if code=="LUX" + + replace edu_flag=3 if type_unesco~="" & (ct_educat4==0|ct_educat4==.) + replace ct_educat4 = 2 if edu_flag==2 & (ct_educat4==0|ct_educat4==.) + replace ct_educat4 = 3 if edu_flag==3 & (ct_educat4==0|ct_educat4==.) + + //Update flags + replace edu_flag = 1 if edu_flag==. & ct_educat4==1 + replace water_flag = 1 if water_flag==. & ct_imp_wat_rec==1 + replace elec_flag = 1 if elec_flag==. & ct_electricity==1 + replace sp_flag = 1 if sp_flag==. & ct_sp==1 + replace findex_flag = 1 if findex_flag==. & ct_findex==1 + + //todo when all variables are available, and surveys are within +-3, rep_year>= `ly' - ${circa} & rep_year<= `ly' + ${circa} + gen todo = . + replace todo = 1 if ct_poverty~=. & ct_imp_wat_rec~=. & ct_electricity~=. & ct_educat4~=. & ct_sp~=. & ct_findex~=. 
& (rep_year >= `ly' - ${circa} & rep_year <= `ly' + ${circa}) + + drop ct_gaul_adm1_code ct_w_30m ct_roof ct_wall ct_floor groupcode groupname country ct_male ct_imp_san_rec All_SPL + + isid code + order code year ct_* *_flag type_* + compress + + saveold "${upath2}\02.input\\`ly'\\GMD_list_`ly'", replace +} + diff --git a/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do b/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do old mode 100644 new mode 100755 index 8cc2021..d363409 --- a/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do +++ b/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do @@ -1,306 +1,286 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -************************************************************************* -*LISSY Vul - national level - Need to copy the code to run on LISSY -*Results will sent by email, clean it and run next steps -************************************************************************* - -clear -input str3 code year str4 file cpi2017 icp2017 pov215 pov365 pov685 -AUS 2018 au18 1.019114 1.5293353 0.4971 0.7382 0.9843 -CAN 2019 ca19 1.0426171 1.2866648 0.2486 0.4995 0.7154 -DEU 2020 de20 1.0335239 .7871803 0.2438 0.2438 0.4711 -GBR 2021 uk21 1.0774739 .77973002 0.2451 0.4983 0.7377 -ISR 2021 il21 1.0255175 4.2066031 0.2422 0.9950 3.4907 -KOR 2021 kr21 1.0497039 974.20563 0.0000 0.2488 0.4979 -TWN 2021 tw21 1.0367744 16.597717 0.0000 0.0000 0.2474 -USA 2021 us21 1.1054594 1 0.2487 0.4963 0.9996 -end - -replace file = trim(file) -local all= _N -tempfile data1 -save `data1', replace - -global plinelist 215 365 685 - -qui forv i=1(1)`all' { - - use `data1', clear - local code = code[`i'] - local year = year[`i'] - local cpi2017 = cpi2017[`i'] - local icp2017 = icp2017[`i'] - foreach num of numlist ${plinelist} { - local pov`num' = pov`num'[`i'] - } - - local x = file[`i'] - - use ${`x'h}, clear - gen code = "`code'" - - // urban dummy - gen urban= (rural==0) - - // welfare variable - qui drop if dhi<0 - qui drop if dhi == . - qui gen welfare = dhi/nhhmem/`cpi2017'/`icp2017'/365 - replace welfare = 0.25 if welfare<0.25 - - //merge with person-level file - qui collapse year rural hpopwgt nhhmem welfare, by(dname code hid region_c) - qui sum year - qui local y = substr("`r(mean)'", 3,.) 
- - qui merge 1:m hid using ${`x'p}, keepusing(pid ppopwgt age educlev illiterate enroll lfs inda1 sex) nogenerate - - //subnatid - cap des region_c - qui if _rc==0 { - cap decode region_c, gen(region_c1) - if _rc~=0 tostring region_c, gen(region_c1) - replace region_c1 = trim(region_c1) - - //recode region for ISR - if "`code'"=="ISR" { - replace region_c1 = "North" if region_c1=="[21]North: Zefat" | region_c1=="[21]North: Zefat, Kineret & Golan" | region_c1=="[22]North: Kinneret" | region_c1=="[23]North: Yizrael" | region_c1=="[23]North: Yizrael-Afula" | region_c1=="[24]North: Acre" | region_c1=="[25]North: Yizrael-Nazareth" | region_c1=="[29]North: Golan" - - replace region_c1 = "Haifa" if region_c1=="[31]Haifa: Haifa" | region_c1=="[32]Haifa: Hadera" - - replace region_c1 = "Center" if region_c1=="[41]Center: Sharon"|region_c1=="[42]Center: Petah Tiqwa"| region_c1=="[42]Center: Petah-Tikva"| region_c1=="[43]Center: Ramla" | region_c1=="[44]Center: Rehovot" - - replace region_c1 = "Tel Aviv" if region_c1=="[51]Tel aviv: Tel Aviv" |region_c1=="[52]Tel Aviv: Ramat Gan" | region_c1=="[52]Tel aviv: Ramat-gan" | region_c1=="[53]Tel aviv: Holon"|region_c1=="[51]Tel Aviv: Tel Aviv" |region_c1=="[53]Tel Aviv: Holon" - - replace region_c1 = "South" if region_c1=="[61]South: Ashkelon" | region_c1=="[61]South: Ashqelon" | region_c1=="[62]South: Be'er Sheva" - } - - drop region_c - ren region_c1 region_c - - //truncate region_c to 32 characters - replace region_c = substr(region_c,1,32) - } //region_c - else { - gen region_c = "MISSING" - } - - *decode rural, gen(rural2) - *replace rural2 = trim(rural2) - gen reg_rural = region_c + "*_*"+string(rural) - gen _all_ = "All Sample" - - /************************************** - 0. 
Generate comparable education vars - **************************************/ - qui gen educat4 = 1 if inlist(educlev,111) - qui replace educat4 = 2 if inlist(educlev,110,120) - qui replace educat4 = 3 if inlist(educlev,130,210) - qui replace educat4 = 4 if inlist(educlev,311,312,313,320) - - qui gen educat5 = 1 if inlist(educlev,111) - qui replace educat5 = 2 if inlist(educlev,110) - qui replace educat5 = 3 if inlist(educlev,120,130) - qui replace educat5 = 4 if inlist(educlev,210) - qui replace educat5 = 5 if inlist(educlev,220,311,312,313,320) - - qui gen educat7 = 1 if inlist(educlev,111) - qui replace educat7 = 2 if inlist(educlev,110) - qui replace educat7 = 3 if inlist(educlev,120) - qui replace educat7 = 4 if inlist(educlev,130) - qui replace educat7 = 5 if inlist(educlev,210) - qui replace educat7 = 6 if inlist(educlev,220,311) - qui replace educat7 = 4 if inlist(educlev,312,313,320) - - // school dummy - qui gen school = (enroll==1) - - // agricultural worker dummy - qui gen agri = (inda1==1) - - // labor force variable - qui gen lstatus1 = lfs==100 - qui gen lstatus2 = lfs==200 - qui gen lstatus3 = inlist(lfs,300,310,320,330,340) - - // male dummy - qui gen male = (sex==1) - - /*********************************** - **Dimension 1: Education - ***********************************/ - - **1a) Indicator: no one in hh with primary completion (age 15+) - //All adults - qui global eduage 15 - qui local eduflag = 0 - qui cap gen educat5 = . - qui cap gen educat7 = . - - qui cap su educat7 - qui if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) - } - qui else { //educat5 - cap su educat5 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) - } - else { //educat4 - cap su educat4 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. 
- gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) - } - else { //no education available - local eduflag = 1 - } - } //educat4 - } - - qui if `eduflag'==0 { - gen temp2a = 1 if age>=$eduage & age~=. - bys hid: egen educ_com_size = sum(temp2a) - bys hid: egen temp3 = sum(temp2) - bys hid: egen temp3c = sum(temp2c) - gen dep_educ_com = 0 - replace dep_educ_com = 1 if temp3==0 - gen dep_educ_com_lb = 0 - replace dep_educ_com_lb = 1 if temp3c==0 - ren temp3 educ_com_sum - ren temp3c educ_com_sum_lb - drop temp2 temp2a temp2c - } - qui else { - gen dep_educ_com = . - gen dep_educ_com_lb = . - gen educ_com_sum = . - gen educ_com_sum_lb = . - gen educ_com_size = . - } - - qui gen educ_com_appl = 1 - qui replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) - qui gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. - qui bys hid: egen educ_com_mis = sum(temp2b) - qui drop temp2b - qui gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
- - qui la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" - qui la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" - qui la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" - qui la var educ_com_appl_miss "School completion is applicable households but missing completion" - cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss - **************************************************** - **Dimension 2: Access to infrastructure - **************************************************** - - **************************************************** - //Indicator: Electricity - gen dep_infra_elec = 0 - qui la var dep_infra_elec "Deprived if HH has No access to electricity" - - **************************************************** - //Indicator: Water - gen dep_infra_impw = 0 - qui la var dep_infra_impw "Deprived if HH has No access to improved water" - - **************************************************** - **Dimension 3: Monetary - **************************************************** - //recalculate the 2.15 line for 2.15 poverty - qui foreach num of numlist ${plinelist} { - if `pov`num''==0 { - local pline`num' = `=`num'/100' - } - else { - _pctile welfare [aw=hpopwgt], p(`pov`num'') - local pline`num' = r(r1) - } - - gen poor`num'_ln = welfare < `pline`num'' if welfare~=. 
- gen pline`num' = `pline`num'' - } //num - - //findex - gen dep_fin = 0 - - //social protection - gen dep_sp = 0 - - qui gen file = dname - gen _count = 1 - gen h = hpopwgt - //multidimensional vulnerability - foreach num of numlist ${plinelist} { - //vulnerable and one dim - gen p1_edu_`num' = 0 - replace p1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 - - gen p1_sp_`num' = 0 - replace p1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 - - gen p1_fin_`num' = 0 - replace p1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 - - gen p1_elec_`num' = 0 - replace p1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 - - gen p1_water_`num' = 0 - replace p1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 - - //rsum - egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing - - //any of the 6 dimensions - deprived in education; dep_sp; dep_fin - gen multvul_`num' = 0 - replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. - - // any 2, 3, 4,...,6 - forv j=2(1)6 { - gen all`j'vul_`num' = 0 - replace all`j'vul_`num' = 1 if dim6_`num'==`j' - } - } //povlist - - - collapse (mean) welfare poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* p1_* dim6* (rawsum) _count h [aw=hpopwgt], by(file year _all_) - - gen data_`x' = 1 - global reglvl _all_ - - //tables - noi dis "`code'-`year' - `x'" - noi table ${reglvl} , c(mean welfare mean dep_infra_elec mean dep_infra_impw mean dep_fin mean dep_sp) format(%15.0g) missing - noi table ${reglvl} , c(mean dep_educ_com mean _count mean h) format(%15.0g) missing - - foreach num of numlist ${plinelist} { - noi table ${reglvl} , c(mean poor`num'_ln mean dim6_`num' mean multvul_`num') format(%15.0g) missing - noi table ${reglvl} , c(mean p1_edu_`num' mean p1_sp_`num' mean p1_fin_`num' mean p1_elec_`num' mean p1_water_`num') format(%15.0g) missing - noi table ${reglvl} , c(mean all2vul_`num' mean all3vul_`num' mean all4vul_`num' mean all5vul_`num' mean 
all6vul_`num') format(%15.0g) missing - } - -} //end loop all - - - +************************************************************************* +*LISSY Vul - national level +************************************************************************* + +clear +input str3 code year str4 file cpi2017 icp2017 pov215 pov365 pov685 +AUS 2018 au18 1.019114 1.5293353 0.4971 0.7382 0.9843 +CAN 2019 ca19 1.0426171 1.2866648 0.2486 0.4995 0.7154 +DEU 2020 de20 1.0335239 .7871803 0.2438 0.2438 0.4711 +GBR 2021 uk21 1.0774739 .77973002 0.2451 0.4983 0.7377 +ISR 2021 il21 1.0255175 4.2066031 0.2422 0.9950 3.4907 +KOR 2021 kr21 1.0497039 974.20563 0.0000 0.2488 0.4979 +TWN 2021 tw21 1.0367744 16.597717 0.0000 0.0000 0.2474 +USA 2021 us21 1.1054594 1 0.2487 0.4963 0.9996 +end + +replace file = trim(file) +local all= _N +tempfile data1 +save `data1', replace + +global plinelist 215 365 685 + +qui forv i=1(1)`all' { + + use `data1', clear + local code = code[`i'] + local year = year[`i'] + local cpi2017 = cpi2017[`i'] + local icp2017 = icp2017[`i'] + foreach num of numlist ${plinelist} { + local pov`num' = pov`num'[`i'] + } + + local x = file[`i'] + + use ${`x'h}, clear + gen code = "`code'" + + // urban dummy + gen urban= (rural==0) + + // welfare variable + qui drop if dhi<0 + qui drop if dhi == . + qui gen welfare = dhi/nhhmem/`cpi2017'/`icp2017'/365 + replace welfare = 0.25 if welfare<0.25 + + //merge with person-level file + qui collapse year rural hpopwgt nhhmem welfare, by(dname code hid region_c) + qui sum year + qui local y = substr("`r(mean)'", 3,.) 
+ + qui merge 1:m hid using ${`x'p}, keepusing(pid ppopwgt age educlev illiterate enroll lfs inda1 sex) nogenerate + + //subnatid + cap des region_c + qui if _rc==0 { + cap decode region_c, gen(region_c1) + if _rc~=0 tostring region_c, gen(region_c1) + replace region_c1 = trim(region_c1) + + //recode region for ISR + if "`code'"=="ISR" { + replace region_c1 = "North" if region_c1=="[21]North: Zefat" | region_c1=="[21]North: Zefat, Kineret & Golan" | region_c1=="[22]North: Kinneret" | region_c1=="[23]North: Yizrael" | region_c1=="[23]North: Yizrael-Afula" | region_c1=="[24]North: Acre" | region_c1=="[25]North: Yizrael-Nazareth" | region_c1=="[29]North: Golan" + + replace region_c1 = "Haifa" if region_c1=="[31]Haifa: Haifa" | region_c1=="[32]Haifa: Hadera" + + replace region_c1 = "Center" if region_c1=="[41]Center: Sharon"|region_c1=="[42]Center: Petah Tiqwa"| region_c1=="[42]Center: Petah-Tikva"| region_c1=="[43]Center: Ramla" | region_c1=="[44]Center: Rehovot" + + replace region_c1 = "Tel Aviv" if region_c1=="[51]Tel aviv: Tel Aviv" |region_c1=="[52]Tel Aviv: Ramat Gan" | region_c1=="[52]Tel aviv: Ramat-gan" | region_c1=="[53]Tel aviv: Holon"|region_c1=="[51]Tel Aviv: Tel Aviv" |region_c1=="[53]Tel Aviv: Holon" + + replace region_c1 = "South" if region_c1=="[61]South: Ashkelon" | region_c1=="[61]South: Ashqelon" | region_c1=="[62]South: Be'er Sheva" + } + + drop region_c + ren region_c1 region_c + + //truncate region_c to 32 characters + replace region_c = substr(region_c,1,32) + } //region_c + else { + gen region_c = "MISSING" + } + + *decode rural, gen(rural2) + *replace rural2 = trim(rural2) + gen reg_rural = region_c + "*_*"+string(rural) + gen _all_ = "All Sample" + + /************************************** + 0. 
Generate comparable education vars + **************************************/ + qui gen educat4 = 1 if inlist(educlev,111) + qui replace educat4 = 2 if inlist(educlev,110,120) + qui replace educat4 = 3 if inlist(educlev,130,210) + qui replace educat4 = 4 if inlist(educlev,311,312,313,320) + + qui gen educat5 = 1 if inlist(educlev,111) + qui replace educat5 = 2 if inlist(educlev,110) + qui replace educat5 = 3 if inlist(educlev,120,130) + qui replace educat5 = 4 if inlist(educlev,210) + qui replace educat5 = 5 if inlist(educlev,220,311,312,313,320) + + qui gen educat7 = 1 if inlist(educlev,111) + qui replace educat7 = 2 if inlist(educlev,110) + qui replace educat7 = 3 if inlist(educlev,120) + qui replace educat7 = 4 if inlist(educlev,130) + qui replace educat7 = 5 if inlist(educlev,210) + qui replace educat7 = 6 if inlist(educlev,220,311) + qui replace educat7 = 4 if inlist(educlev,312,313,320) + + // school dummy + qui gen school = (enroll==1) + + // agricultural worker dummy + qui gen agri = (inda1==1) + + // labor force variable + qui gen lstatus1 = lfs==100 + qui gen lstatus2 = lfs==200 + qui gen lstatus3 = inlist(lfs,300,310,320,330,340) + + // male dummy + qui gen male = (sex==1) + + /*********************************** + **Dimension 1: Education + ***********************************/ + + **1a) Indicator: no one in hh with primary completion (age 15+) + //All adults + qui global eduage 15 + qui local eduflag = 0 + qui cap gen educat5 = . + qui cap gen educat7 = . + + qui cap su educat7 + qui if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) + } + qui else { //educat5 + cap su educat5 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) + } + else { //educat4 + cap su educat4 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. 
+ gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) + } + else { //no education available + local eduflag = 1 + } + } //educat4 + } + + qui if `eduflag'==0 { + gen temp2a = 1 if age>=$eduage & age~=. + bys hid: egen educ_com_size = sum(temp2a) + bys hid: egen temp3 = sum(temp2) + bys hid: egen temp3c = sum(temp2c) + gen dep_educ_com = 0 + replace dep_educ_com = 1 if temp3==0 + gen dep_educ_com_lb = 0 + replace dep_educ_com_lb = 1 if temp3c==0 + ren temp3 educ_com_sum + ren temp3c educ_com_sum_lb + drop temp2 temp2a temp2c + } + qui else { + gen dep_educ_com = . + gen dep_educ_com_lb = . + gen educ_com_sum = . + gen educ_com_sum_lb = . + gen educ_com_size = . + } + + qui gen educ_com_appl = 1 + qui replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) + qui gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. + qui bys hid: egen educ_com_mis = sum(temp2b) + qui drop temp2b + qui gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
+ + qui la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" + qui la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" + qui la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" + qui la var educ_com_appl_miss "School completion is applicable households but missing completion" + cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss + **************************************************** + **Dimension 2: Access to infrastructure + **************************************************** + + **************************************************** + //Indicator: Electricity + gen dep_infra_elec = 0 + qui la var dep_infra_elec "Deprived if HH has No access to electricity" + + **************************************************** + //Indicator: Water + gen dep_infra_impw = 0 + qui la var dep_infra_impw "Deprived if HH has No access to improved water" + + **************************************************** + **Dimension 3: Monetary + **************************************************** + //recalculate the 2.15 line for 2.15 poverty + qui foreach num of numlist ${plinelist} { + if `pov`num''==0 { + local pline`num' = `=`num'/100' + } + else { + _pctile welfare [aw=hpopwgt], p(`pov`num'') + local pline`num' = r(r1) + } + + gen poor`num'_ln = welfare < `pline`num'' if welfare~=. 
+ gen pline`num' = `pline`num'' + } //num + + //findex + gen dep_fin = 0 + + //social protection + gen dep_sp = 0 + + qui gen file = dname + gen _count = 1 + gen h = hpopwgt + //multidimensional vulnerability + foreach num of numlist ${plinelist} { + //vulnerable and one dim + gen p1_edu_`num' = 0 + replace p1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 + + gen p1_sp_`num' = 0 + replace p1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 + + gen p1_fin_`num' = 0 + replace p1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 + + gen p1_elec_`num' = 0 + replace p1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 + + gen p1_water_`num' = 0 + replace p1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 + + //rsum + egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing + + //any of the 6 dimensions - deprived in education; dep_sp; dep_fin + gen multvul_`num' = 0 + replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. + + // any 2, 3, 4,...,6 + forv j=2(1)6 { + gen all`j'vul_`num' = 0 + replace all`j'vul_`num' = 1 if dim6_`num'==`j' + } + } //povlist + + + collapse (mean) welfare poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* p1_* dim6* (rawsum) _count h [aw=hpopwgt], by(file year _all_) + + gen data_`x' = 1 + global reglvl _all_ + + //tables + noi dis "`code'-`year' - `x'" + noi table ${reglvl} , c(mean welfare mean dep_infra_elec mean dep_infra_impw mean dep_fin mean dep_sp) format(%15.0g) missing + noi table ${reglvl} , c(mean dep_educ_com mean _count mean h) format(%15.0g) missing + + foreach num of numlist ${plinelist} { + noi table ${reglvl} , c(mean poor`num'_ln mean dim6_`num' mean multvul_`num') format(%15.0g) missing + noi table ${reglvl} , c(mean p1_edu_`num' mean p1_sp_`num' mean p1_fin_`num' mean p1_elec_`num' mean p1_water_`num') format(%15.0g) missing + noi table ${reglvl} , c(mean all2vul_`num' mean all3vul_`num' mean all4vul_`num' mean all5vul_`num' mean 
all6vul_`num') format(%15.0g) missing + } + +} //end loop all + + + diff --git a/01.code/dofile/2-1b Estimate vul rate for LISSY data.do b/01.code/dofile/2-1b Estimate vul rate for LISSY data.do old mode 100644 new mode 100755 index f050f16..c4dfbf5 --- a/01.code/dofile/2-1b Estimate vul rate for LISSY data.do +++ b/01.code/dofile/2-1b Estimate vul rate for LISSY data.do @@ -1,306 +1,287 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -************************************************************************* -*LISSY Vul -************************************************************************* - -clear -input str3 code year str4 file cpi2017 icp2017 pov215 pov365 pov685 -AUS 2018 au18 1.019114 1.5293353 0.4971 0.7382 0.9843 -CAN 2019 ca19 1.0426171 1.2866648 0.2486 0.4995 0.7154 -DEU 2020 de20 1.0335239 .7871803 0.2438 0.2438 0.4711 -GBR 2021 uk21 1.0774739 .77973002 0.2451 0.4983 0.7377 -ISR 2021 il21 1.0255175 4.2066031 0.2422 0.9950 3.4907 -KOR 2021 kr21 1.0497039 974.20563 0.0000 0.2488 0.4979 -TWN 2021 tw21 1.0367744 16.597717 0.0000 0.0000 0.2474 -USA 2021 us21 1.1054594 1 0.2487 0.4963 0.9996 -end - -replace file = trim(file) -local all= _N -tempfile data1 -save `data1', replace - -global plinelist 215 365 685 - -qui forv i=1(1)`all' { - - use `data1', clear - local code = code[`i'] - local year = year[`i'] - local cpi2017 = cpi2017[`i'] - local icp2017 = icp2017[`i'] - foreach num of numlist ${plinelist} { - local pov`num' = pov`num'[`i'] - } - - local x = file[`i'] - - use ${`x'h}, clear - gen code = "`code'" - - // urban dummy - gen urban= (rural==0) - - // welfare variable - qui drop if dhi<0 - qui drop if dhi == . - qui gen welfare = dhi/nhhmem/`cpi2017'/`icp2017'/365 - replace welfare = 0.25 if welfare<0.25 - - //merge with person-level file - qui collapse year rural hpopwgt nhhmem welfare, by(dname code hid region_c) - qui sum year - qui local y = substr("`r(mean)'", 3,.) 
- - qui merge 1:m hid using ${`x'p}, keepusing(pid ppopwgt age educlev illiterate enroll lfs inda1 sex) nogenerate - - //subnatid - cap des region_c - qui if _rc==0 { - cap decode region_c, gen(region_c1) - if _rc~=0 tostring region_c, gen(region_c1) - replace region_c1 = trim(region_c1) - - //recode region for ISR - if "`code'"=="ISR" { - replace region_c1 = "North" if region_c1=="[21]North: Zefat" | region_c1=="[21]North: Zefat, Kineret & Golan" | region_c1=="[22]North: Kinneret" | region_c1=="[23]North: Yizrael" | region_c1=="[23]North: Yizrael-Afula" | region_c1=="[24]North: Acre" | region_c1=="[25]North: Yizrael-Nazareth" | region_c1=="[29]North: Golan" - - replace region_c1 = "Haifa" if region_c1=="[31]Haifa: Haifa" | region_c1=="[32]Haifa: Hadera" - - replace region_c1 = "Center" if region_c1=="[41]Center: Sharon"|region_c1=="[42]Center: Petah Tiqwa"| region_c1=="[42]Center: Petah-Tikva"| region_c1=="[43]Center: Ramla" | region_c1=="[44]Center: Rehovot" - - replace region_c1 = "Tel Aviv" if region_c1=="[51]Tel aviv: Tel Aviv" |region_c1=="[52]Tel Aviv: Ramat Gan" | region_c1=="[52]Tel aviv: Ramat-gan" | region_c1=="[53]Tel aviv: Holon"|region_c1=="[51]Tel Aviv: Tel Aviv" |region_c1=="[53]Tel Aviv: Holon" - - replace region_c1 = "South" if region_c1=="[61]South: Ashkelon" | region_c1=="[61]South: Ashqelon" | region_c1=="[62]South: Be'er Sheva" - } - - drop region_c - ren region_c1 region_c - - //truncate region_c to 32 characters - replace region_c = substr(region_c,1,32) - } //region_c - else { - gen region_c = "MISSING" - } - - *decode rural, gen(rural2) - *replace rural2 = trim(rural2) - gen reg_rural = region_c + "*_*"+string(rural) - gen _all_ = "All Sample" - - /************************************** - 0. 
Generate comparable education vars - **************************************/ - qui gen educat4 = 1 if inlist(educlev,111) - qui replace educat4 = 2 if inlist(educlev,110,120) - qui replace educat4 = 3 if inlist(educlev,130,210) - qui replace educat4 = 4 if inlist(educlev,311,312,313,320) - - qui gen educat5 = 1 if inlist(educlev,111) - qui replace educat5 = 2 if inlist(educlev,110) - qui replace educat5 = 3 if inlist(educlev,120,130) - qui replace educat5 = 4 if inlist(educlev,210) - qui replace educat5 = 5 if inlist(educlev,220,311,312,313,320) - - qui gen educat7 = 1 if inlist(educlev,111) - qui replace educat7 = 2 if inlist(educlev,110) - qui replace educat7 = 3 if inlist(educlev,120) - qui replace educat7 = 4 if inlist(educlev,130) - qui replace educat7 = 5 if inlist(educlev,210) - qui replace educat7 = 6 if inlist(educlev,220,311) - qui replace educat7 = 4 if inlist(educlev,312,313,320) - - // school dummy - qui gen school = (enroll==1) - - // agricultural worker dummy - qui gen agri = (inda1==1) - - // labor force variable - qui gen lstatus1 = lfs==100 - qui gen lstatus2 = lfs==200 - qui gen lstatus3 = inlist(lfs,300,310,320,330,340) - - // male dummy - qui gen male = (sex==1) - - /*********************************** - **Dimension 1: Education - ***********************************/ - - **1a) Indicator: no one in hh with primary completion (age 15+) - //All adults - qui global eduage 15 - qui local eduflag = 0 - qui cap gen educat5 = . - qui cap gen educat7 = . - - qui cap su educat7 - qui if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) - } - qui else { //educat5 - cap su educat5 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) - } - else { //educat4 - cap su educat4 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. 
- gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) - } - else { //no education available - local eduflag = 1 - } - } //educat4 - } - - qui if `eduflag'==0 { - gen temp2a = 1 if age>=$eduage & age~=. - bys hid: egen educ_com_size = sum(temp2a) - bys hid: egen temp3 = sum(temp2) - bys hid: egen temp3c = sum(temp2c) - gen dep_educ_com = 0 - replace dep_educ_com = 1 if temp3==0 - gen dep_educ_com_lb = 0 - replace dep_educ_com_lb = 1 if temp3c==0 - ren temp3 educ_com_sum - ren temp3c educ_com_sum_lb - drop temp2 temp2a temp2c - } - qui else { - gen dep_educ_com = . - gen dep_educ_com_lb = . - gen educ_com_sum = . - gen educ_com_sum_lb = . - gen educ_com_size = . - } - - qui gen educ_com_appl = 1 - qui replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) - qui gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. - qui bys hid: egen educ_com_mis = sum(temp2b) - qui drop temp2b - qui gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
- - qui la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" - qui la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" - qui la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" - qui la var educ_com_appl_miss "School completion is applicable households but missing completion" - cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss - **************************************************** - **Dimension 2: Access to infrastructure - **************************************************** - - **************************************************** - //Indicator: Electricity - gen dep_infra_elec = 0 - qui la var dep_infra_elec "Deprived if HH has No access to electricity" - - **************************************************** - //Indicator: Water - gen dep_infra_impw = 0 - qui la var dep_infra_impw "Deprived if HH has No access to improved water" - - **************************************************** - **Dimension 3: Monetary - **************************************************** - //recalculate the 2.15 line for 2.15 poverty - qui foreach num of numlist ${plinelist} { - if `pov`num''==0 { - local pline`num' = `=`num'/100' - } - else { - _pctile welfare [aw=hpopwgt], p(`pov`num'') - local pline`num' = r(r1) - } - - gen poor`num'_ln = welfare < `pline`num'' if welfare~=. 
- gen pline`num' = `pline`num'' - } //num - - //findex - gen dep_fin = 0 - - //social protection - gen dep_sp = 0 - - qui gen file = dname - gen _count = 1 - gen h = hpopwgt - //multidimensional vulnerability - foreach num of numlist ${plinelist} { - //vulnerable and one dim - gen p1_edu_`num' = 0 - replace p1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 - - gen p1_sp_`num' = 0 - replace p1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 - - gen p1_fin_`num' = 0 - replace p1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 - - gen p1_elec_`num' = 0 - replace p1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 - - gen p1_water_`num' = 0 - replace p1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 - - //rsum - egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing - - //any of the 6 dimensions - deprived in education; dep_sp; dep_fin - gen multvul_`num' = 0 - replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. - - // any 2, 3, 4,...,6 - forv j=2(1)6 { - gen all`j'vul_`num' = 0 - replace all`j'vul_`num' = 1 if dim6_`num'==`j' - } - } //povlist - - - collapse (mean) welfare poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* p1_* dim6* (rawsum) _count h [aw=hpopwgt], by(file year reg_rural) - - gen data_`x' = 1 - - global reglvl reg_rural - - //regional level tables - noi dis "`code'-`year' - `x'" - noi table ${reglvl} , c(mean welfare mean dep_infra_elec mean dep_infra_impw mean dep_fin mean dep_sp) format(%15.0g) missing - noi table ${reglvl} , c(mean dep_educ_com mean _count mean h) format(%15.0g) missing - - foreach num of numlist ${plinelist} { - noi table ${reglvl} , c(mean poor`num'_ln mean dim6_`num' mean multvul_`num') format(%15.0g) missing - noi table ${reglvl} , c(mean p1_edu_`num' mean p1_sp_`num' mean p1_fin_`num' mean p1_elec_`num' mean p1_water_`num') format(%15.0g) missing - noi table ${reglvl} , c(mean all2vul_`num' mean all3vul_`num' mean all4vul_`num' mean 
all5vul_`num' mean all6vul_`num') format(%15.0g) missing - } - -} //end loop all - - - +************************************************************************* +*LISSY Vul +************************************************************************* + +clear +input str3 code year str4 file cpi2017 icp2017 pov215 pov365 pov685 +AUS 2018 au18 1.019114 1.5293353 0.4971 0.7382 0.9843 +CAN 2019 ca19 1.0426171 1.2866648 0.2486 0.4995 0.7154 +DEU 2020 de20 1.0335239 .7871803 0.2438 0.2438 0.4711 +GBR 2021 uk21 1.0774739 .77973002 0.2451 0.4983 0.7377 +ISR 2021 il21 1.0255175 4.2066031 0.2422 0.9950 3.4907 +KOR 2021 kr21 1.0497039 974.20563 0.0000 0.2488 0.4979 +TWN 2021 tw21 1.0367744 16.597717 0.0000 0.0000 0.2474 +USA 2021 us21 1.1054594 1 0.2487 0.4963 0.9996 +end + +replace file = trim(file) +local all= _N +tempfile data1 +save `data1', replace + +global plinelist 215 365 685 + +qui forv i=1(1)`all' { + + use `data1', clear + local code = code[`i'] + local year = year[`i'] + local cpi2017 = cpi2017[`i'] + local icp2017 = icp2017[`i'] + foreach num of numlist ${plinelist} { + local pov`num' = pov`num'[`i'] + } + + local x = file[`i'] + + use ${`x'h}, clear + gen code = "`code'" + + // urban dummy + gen urban= (rural==0) + + // welfare variable + qui drop if dhi<0 + qui drop if dhi == . + qui gen welfare = dhi/nhhmem/`cpi2017'/`icp2017'/365 + replace welfare = 0.25 if welfare<0.25 + + //merge with person-level file + qui collapse year rural hpopwgt nhhmem welfare, by(dname code hid region_c) + qui sum year + qui local y = substr("`r(mean)'", 3,.) 
+ + qui merge 1:m hid using ${`x'p}, keepusing(pid ppopwgt age educlev illiterate enroll lfs inda1 sex) nogenerate + + //subnatid + cap des region_c + qui if _rc==0 { + cap decode region_c, gen(region_c1) + if _rc~=0 tostring region_c, gen(region_c1) + replace region_c1 = trim(region_c1) + + //recode region for ISR + if "`code'"=="ISR" { + replace region_c1 = "North" if region_c1=="[21]North: Zefat" | region_c1=="[21]North: Zefat, Kineret & Golan" | region_c1=="[22]North: Kinneret" | region_c1=="[23]North: Yizrael" | region_c1=="[23]North: Yizrael-Afula" | region_c1=="[24]North: Acre" | region_c1=="[25]North: Yizrael-Nazareth" | region_c1=="[29]North: Golan" + + replace region_c1 = "Haifa" if region_c1=="[31]Haifa: Haifa" | region_c1=="[32]Haifa: Hadera" + + replace region_c1 = "Center" if region_c1=="[41]Center: Sharon"|region_c1=="[42]Center: Petah Tiqwa"| region_c1=="[42]Center: Petah-Tikva"| region_c1=="[43]Center: Ramla" | region_c1=="[44]Center: Rehovot" + + replace region_c1 = "Tel Aviv" if region_c1=="[51]Tel aviv: Tel Aviv" |region_c1=="[52]Tel Aviv: Ramat Gan" | region_c1=="[52]Tel aviv: Ramat-gan" | region_c1=="[53]Tel aviv: Holon"|region_c1=="[51]Tel Aviv: Tel Aviv" |region_c1=="[53]Tel Aviv: Holon" + + replace region_c1 = "South" if region_c1=="[61]South: Ashkelon" | region_c1=="[61]South: Ashqelon" | region_c1=="[62]South: Be'er Sheva" + } + + drop region_c + ren region_c1 region_c + + //truncate region_c to 32 characters + replace region_c = substr(region_c,1,32) + } //region_c + else { + gen region_c = "MISSING" + } + + *decode rural, gen(rural2) + *replace rural2 = trim(rural2) + gen reg_rural = region_c + "*_*"+string(rural) + gen _all_ = "All Sample" + + /************************************** + 0. 
Generate comparable education vars + **************************************/ + qui gen educat4 = 1 if inlist(educlev,111) + qui replace educat4 = 2 if inlist(educlev,110,120) + qui replace educat4 = 3 if inlist(educlev,130,210) + qui replace educat4 = 4 if inlist(educlev,311,312,313,320) + + qui gen educat5 = 1 if inlist(educlev,111) + qui replace educat5 = 2 if inlist(educlev,110) + qui replace educat5 = 3 if inlist(educlev,120,130) + qui replace educat5 = 4 if inlist(educlev,210) + qui replace educat5 = 5 if inlist(educlev,220,311,312,313,320) + + qui gen educat7 = 1 if inlist(educlev,111) + qui replace educat7 = 2 if inlist(educlev,110) + qui replace educat7 = 3 if inlist(educlev,120) + qui replace educat7 = 4 if inlist(educlev,130) + qui replace educat7 = 5 if inlist(educlev,210) + qui replace educat7 = 6 if inlist(educlev,220,311) + qui replace educat7 = 4 if inlist(educlev,312,313,320) + + // school dummy + qui gen school = (enroll==1) + + // agricultural worker dummy + qui gen agri = (inda1==1) + + // labor force variable + qui gen lstatus1 = lfs==100 + qui gen lstatus2 = lfs==200 + qui gen lstatus3 = inlist(lfs,300,310,320,330,340) + + // male dummy + qui gen male = (sex==1) + + /*********************************** + **Dimension 1: Education + ***********************************/ + + **1a) Indicator: no one in hh with primary completion (age 15+) + //All adults + qui global eduage 15 + qui local eduflag = 0 + qui cap gen educat5 = . + qui cap gen educat7 = . + + qui cap su educat7 + qui if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) + } + qui else { //educat5 + cap su educat5 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) + } + else { //educat4 + cap su educat4 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. 
+ gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) + } + else { //no education available + local eduflag = 1 + } + } //educat4 + } + + qui if `eduflag'==0 { + gen temp2a = 1 if age>=$eduage & age~=. + bys hid: egen educ_com_size = sum(temp2a) + bys hid: egen temp3 = sum(temp2) + bys hid: egen temp3c = sum(temp2c) + gen dep_educ_com = 0 + replace dep_educ_com = 1 if temp3==0 + gen dep_educ_com_lb = 0 + replace dep_educ_com_lb = 1 if temp3c==0 + ren temp3 educ_com_sum + ren temp3c educ_com_sum_lb + drop temp2 temp2a temp2c + } + qui else { + gen dep_educ_com = . + gen dep_educ_com_lb = . + gen educ_com_sum = . + gen educ_com_sum_lb = . + gen educ_com_size = . + } + + qui gen educ_com_appl = 1 + qui replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) + qui gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. + qui bys hid: egen educ_com_mis = sum(temp2b) + qui drop temp2b + qui gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
+ + qui la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" + qui la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" + qui la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" + qui la var educ_com_appl_miss "School completion is applicable households but missing completion" + cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss + **************************************************** + **Dimension 2: Access to infrastructure + **************************************************** + + **************************************************** + //Indicator: Electricity + gen dep_infra_elec = 0 + qui la var dep_infra_elec "Deprived if HH has No access to electricity" + + **************************************************** + //Indicator: Water + gen dep_infra_impw = 0 + qui la var dep_infra_impw "Deprived if HH has No access to improved water" + + **************************************************** + **Dimension 3: Monetary + **************************************************** + //recalculate the 2.15 line for 2.15 poverty + qui foreach num of numlist ${plinelist} { + if `pov`num''==0 { + local pline`num' = `=`num'/100' + } + else { + _pctile welfare [aw=hpopwgt], p(`pov`num'') + local pline`num' = r(r1) + } + + gen poor`num'_ln = welfare < `pline`num'' if welfare~=. 
+ gen pline`num' = `pline`num'' + } //num + + //findex + gen dep_fin = 0 + + //social protection + gen dep_sp = 0 + + qui gen file = dname + gen _count = 1 + gen h = hpopwgt + //multidimensional vulnerability + foreach num of numlist ${plinelist} { + //vulnerable and one dim + gen p1_edu_`num' = 0 + replace p1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 + + gen p1_sp_`num' = 0 + replace p1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 + + gen p1_fin_`num' = 0 + replace p1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 + + gen p1_elec_`num' = 0 + replace p1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 + + gen p1_water_`num' = 0 + replace p1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 + + //rsum + egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing + + //any of the 6 dimensions - deprived in education; dep_sp; dep_fin + gen multvul_`num' = 0 + replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. + + // any 2, 3, 4,...,6 + forv j=2(1)6 { + gen all`j'vul_`num' = 0 + replace all`j'vul_`num' = 1 if dim6_`num'==`j' + } + } //povlist + + + collapse (mean) welfare poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* p1_* dim6* (rawsum) _count h [aw=hpopwgt], by(file year reg_rural) + + gen data_`x' = 1 + + global reglvl reg_rural + + //regional level tables + noi dis "`code'-`year' - `x'" + noi table ${reglvl} , c(mean welfare mean dep_infra_elec mean dep_infra_impw mean dep_fin mean dep_sp) format(%15.0g) missing + noi table ${reglvl} , c(mean dep_educ_com mean _count mean h) format(%15.0g) missing + + foreach num of numlist ${plinelist} { + noi table ${reglvl} , c(mean poor`num'_ln mean dim6_`num' mean multvul_`num') format(%15.0g) missing + noi table ${reglvl} , c(mean p1_edu_`num' mean p1_sp_`num' mean p1_fin_`num' mean p1_elec_`num' mean p1_water_`num') format(%15.0g) missing + noi table ${reglvl} , c(mean all2vul_`num' mean all3vul_`num' mean all4vul_`num' mean 
all5vul_`num' mean all6vul_`num') format(%15.0g) missing + } + +} //end loop all + + + diff --git a/01.code/dofile/2-1c Extract national data - for LISSY data.do b/01.code/dofile/2-1c Extract national data - for LISSY data.do old mode 100644 new mode 100755 index f5de8bc..6a2723e --- a/01.code/dofile/2-1c Extract national data - for LISSY data.do +++ b/01.code/dofile/2-1c Extract national data - for LISSY data.do @@ -1,304 +1,285 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -//Extract national data LIS txt file -clear -version 18 -*global upath2 -global datain "${upath2}\03.intermediate\LISoutput\" - -global rnd AM2024 -global lnyear 2021 - -//update the filename here for each year after each run in LISSY -global f2021 "LIS nat 2021 listing_job_1245381.txt" -global f2010 "LIS nat 2010 listing_job_1245384.txt" - -local date = c(current_date) -local time = c(current_time) -local user = c(username) - -//number of tables for each data point -local ntable 11 -forv x=1(1)`ntable' { - tempfile c`x' - save `c`x'', replace emptyok -} - -//import data -import delimited using "${datain}\\${f${lnyear}}" , clear -gen drop = regexm(v1, "-{10,}") -drop if drop ==1 -drop drop -replace v1 = trim(v1) -gen v3 = v2 -split v3, parse("") -gen x = strpos(v1, "-------------") -drop if x>0 & x~=. -drop x -gen x1 = strpos(v1, "(") -gen x2 = strpos(v1, ")") -gen x3 = strpos(v1, "[") -gen x4 = strpos(v1, "]") - -drop if x1> 0 & x2>0 & x3==0 & x4==0 & v2=="" -drop x2 x1 x3 x4 - -//identify table -gen c1 = strpos(v2, "mean(welfare)") -gen c2 = strpos(v2, "mean(dep_edu~m)") -gen c3 = strpos(v2, "mean(poor215~n)") -gen c4 = strpos(v2, "mean(p1_ed~215)") -gen c5 = strpos(v2, "mean(all2v~215)") -gen c6 = strpos(v2, "mean(poor365~n)") -gen c7 = strpos(v2, "mean(p1_ed~365)") -gen c8 = strpos(v2, "mean(all2v~365)") -gen c9 = strpos(v2, "mean(poor685~n)") -gen c10 = strpos(v2, "mean(p1_ed~685)") -gen c11 = strpos(v2, "mean(all2v~685)") - -gen len = length(v1) -gen p4s = strpos(v1, "-") - -split v1 if len==15 & p4s==4, parse("-") -gen seq = _n -gen seq2 = seq if v11~="" -gen cum = sum(seq2) - -tempfile data1 -save `data1',replace -levelsof cum, local(lista) -foreach lvl of local lista { - use `data1', clear - keep if cum==`lvl' - ren v11 code - ren v12 year - ren v13 file - replace code = code[1] - replace year = year[1] - replace file = file[1] - gen gr = c1+c2+c3+c4+c5+c6+c7+c8+c9+c10+c11 - gen gr2 
= sum(gr) - drop if gr2==0 - levelsof gr2, local(listb) - - tempfile datax - save `datax', replace - local i = 0 - foreach gr of local listb { - use `datax', clear - keep if gr2==`gr' - local i = `i'+1 - drop if v31=="" - - if v31[1]== "mean(welfare)" { //c1 - drop if v31=="mean(welfare)" - ren v1 area - ren v31 welfare - ren v32 dep_infra_elec - ren v33 dep_infra_impw - ren v34 dep_fin - ren v35 dep_sp - keep area code year file welfare dep_infra_elec dep_infra_impw dep_fin dep_sp - destring welfare welfare dep_infra_elec dep_infra_impw dep_fin dep_sp, replace - replace area = trim(area) - recast str area - append using `c1' - save `c1', replace - } - else if v31[1]== "mean(dep_edu~m)" { //c2 - drop if v31== "mean(dep_edu~m)" - ren v1 area - ren v31 dep_educ_com - ren v32 _count - ren v33 h - keep area code year file dep_educ_com _count h - destring dep_educ_com _count h, replace - replace area = trim(area) - recast str area - append using `c2' - save `c2', replace - } - //215 - else if v31[1]== "mean(poor215~n)" { //c3 - drop if v31== "mean(poor215~n)" - ren v1 area - ren v31 poor215_ln - ren v32 dim6_215 - ren v33 multvul_215 - keep area code year file poor215_ln dim6_215 multvul_215 - destring poor215_ln dim6_215 multvul_215, replace - replace area = trim(area) - recast str area - append using `c3' - save `c3', replace - } - else if v31[1]== "mean(p1_ed~215)" { //c4 - drop if v31== "mean(p1_ed~215)" - ren v1 area - ren v31 p1_edu_215 - ren v32 p1_sp_215 - ren v33 p1_fin_215 - ren v34 p1_elec_215 - ren v35 p1_water_215 - keep area code year file p1*_215 - destring p1*_215, replace - replace area = trim(area) - recast str area - append using `c4' - save `c4', replace - } - else if v31[1]== "mean(all2v~215)" { //c5 - drop if v31== "mean(all2v~215)" - ren v1 area - ren v31 all2vul_215 - ren v32 all3vul_215 - ren v33 all4vul_215 - ren v34 all5vul_215 - ren v35 all6vul_215 - keep area code year file all*vul* - destring all*vul*, replace - replace area = trim(area) - 
recast str area - append using `c5' - save `c5', replace - } - //365 - else if v31[1]== "mean(poor365~n)" { //c6 - drop if v31== "mean(poor365~n)" - ren v1 area - ren v31 poor365_ln - ren v32 dim6_365 - ren v33 multvul_365 - keep area code year file poor365_ln dim6_365 multvul_365 - destring poor365_ln dim6_365 multvul_365, replace - - replace area = trim(area) - recast str area - append using `c6' - save `c6', replace - } - else if v31[1]== "mean(p1_ed~365)" { //c7 - drop if v31== "mean(p1_ed~365)" - ren v1 area - ren v31 p1_edu_365 - ren v32 p1_sp_365 - ren v33 p1_fin_365 - ren v34 p1_elec_365 - ren v35 p1_water_365 - keep area code year file p1*_365 - destring p1*_365, replace - replace area = trim(area) - recast str area - append using `c7' - save `c7', replace - } - else if v31[1]== "mean(all2v~365)" { //c8 - drop if v31== "mean(all2v~365)" - ren v1 area - ren v31 all2vul_365 - ren v32 all3vul_365 - ren v33 all4vul_365 - ren v34 all5vul_365 - ren v35 all6vul_365 - keep area code year file all*vul* - destring all*vul*, replace - replace area = trim(area) - recast str area - append using `c8' - save `c8', replace - } - //685 - else if v31[1]== "mean(poor685~n)" { //c9 - drop if v31== "mean(poor685~n)" - ren v1 area - ren v31 poor685_ln - ren v32 dim6_685 - ren v33 multvul_685 - keep area code year file poor685_ln dim6_685 multvul_685 - destring poor685_ln dim6_685 multvul_685, replace - - replace area = trim(area) - recast str area - append using `c9' - save `c9', replace - } - else if v31[1]== "mean(p1_ed~685)" { //c10 - drop if v31== "mean(p1_ed~685)" - ren v1 area - ren v31 p1_edu_685 - ren v32 p1_sp_685 - ren v33 p1_fin_685 - ren v34 p1_elec_685 - ren v35 p1_water_685 - keep area code year file p1*_685 - destring p1*_685, replace - replace area = trim(area) - recast str area - append using `c10' - save `c10', replace - } - else { //c11 mean(all2v~685) - drop if v31== "mean(all2v~685)" - ren v1 area - ren v31 all2vul_685 - ren v32 all3vul_685 - ren v33 
all4vul_685 - ren v34 all5vul_685 - ren v35 all6vul_685 - keep area code year file all*vul* - destring all*vul*, replace - replace area = trim(area) - recast str area - append using `c11' - save `c11', replace - } - } // gr listb -} //lista - -use `c1', clear -forv i=2(1)11 { - merge 1:1 code year file area using `c`i'' - drop _merge -} - -drop if area=="reg_rural" -ren area sample -gen level = "_all_" -ren welfare mean_ln -gen lineupyear = $lnyear -gen baseyear = year -order code year file /*survname*/ level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* p1* dim6* _count h - -destring year, replace -merge m:1 code year using "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_${lnyear}", keepus(survname survey_coverage datatype comparability) -keep if _merge==3 -drop _merge - -sort code year sample -ren p1* pov1* -order code year file survname survey_coverage datatype comparability level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* _count h -destring baseyear, replace -drop file -char define _dta[date] "`date' `time'" -char define _dta[user] "`user'" -compress - -saveold "${upath2}\03.intermediate\Sim\\${lnyear}\\${rnd}_LIS_nat_vul_${lnyear}", replace - +//Extract national data LIS txt file +clear +version 18 + +global datain "${upath2}\03.intermediate\LISoutput\" + +global rnd AM2024 +global lnyear 2021 + +//update the filename here for each year after each run in LISSY +global f2021 "LIS nat 2021 listing_job_1245381.txt" +global f2010 "LIS nat 2010 listing_job_1245384.txt" + +local date = c(current_date) +local time = c(current_time) +local user = c(username) + +//number of tables for each data point +local ntable 11 +forv x=1(1)`ntable' { + tempfile c`x' + save `c`x'', replace emptyok +} + +//import data +import delimited using "${datain}\\${f${lnyear}}" , clear +gen drop = regexm(v1, "-{10,}") +drop if drop ==1 +drop drop +replace v1 = trim(v1) +gen v3 = 
v2 +split v3, parse("") +gen x = strpos(v1, "-------------") +drop if x>0 & x~=. +drop x +gen x1 = strpos(v1, "(") +gen x2 = strpos(v1, ")") +gen x3 = strpos(v1, "[") +gen x4 = strpos(v1, "]") + +drop if x1> 0 & x2>0 & x3==0 & x4==0 & v2=="" +drop x2 x1 x3 x4 + +//identify table +gen c1 = strpos(v2, "mean(welfare)") +gen c2 = strpos(v2, "mean(dep_edu~m)") +gen c3 = strpos(v2, "mean(poor215~n)") +gen c4 = strpos(v2, "mean(p1_ed~215)") +gen c5 = strpos(v2, "mean(all2v~215)") +gen c6 = strpos(v2, "mean(poor365~n)") +gen c7 = strpos(v2, "mean(p1_ed~365)") +gen c8 = strpos(v2, "mean(all2v~365)") +gen c9 = strpos(v2, "mean(poor685~n)") +gen c10 = strpos(v2, "mean(p1_ed~685)") +gen c11 = strpos(v2, "mean(all2v~685)") + +gen len = length(v1) +gen p4s = strpos(v1, "-") + +split v1 if len==15 & p4s==4, parse("-") +gen seq = _n +gen seq2 = seq if v11~="" +gen cum = sum(seq2) + +tempfile data1 +save `data1',replace +levelsof cum, local(lista) +foreach lvl of local lista { + use `data1', clear + keep if cum==`lvl' + ren v11 code + ren v12 year + ren v13 file + replace code = code[1] + replace year = year[1] + replace file = file[1] + gen gr = c1+c2+c3+c4+c5+c6+c7+c8+c9+c10+c11 + gen gr2 = sum(gr) + drop if gr2==0 + levelsof gr2, local(listb) + + tempfile datax + save `datax', replace + local i = 0 + foreach gr of local listb { + use `datax', clear + keep if gr2==`gr' + local i = `i'+1 + drop if v31=="" + + if v31[1]== "mean(welfare)" { //c1 + drop if v31=="mean(welfare)" + ren v1 area + ren v31 welfare + ren v32 dep_infra_elec + ren v33 dep_infra_impw + ren v34 dep_fin + ren v35 dep_sp + keep area code year file welfare dep_infra_elec dep_infra_impw dep_fin dep_sp + destring welfare welfare dep_infra_elec dep_infra_impw dep_fin dep_sp, replace + replace area = trim(area) + recast str area + append using `c1' + save `c1', replace + } + else if v31[1]== "mean(dep_edu~m)" { //c2 + drop if v31== "mean(dep_edu~m)" + ren v1 area + ren v31 dep_educ_com + ren v32 _count + ren v33 h + 
keep area code year file dep_educ_com _count h + destring dep_educ_com _count h, replace + replace area = trim(area) + recast str area + append using `c2' + save `c2', replace + } + //215 + else if v31[1]== "mean(poor215~n)" { //c3 + drop if v31== "mean(poor215~n)" + ren v1 area + ren v31 poor215_ln + ren v32 dim6_215 + ren v33 multvul_215 + keep area code year file poor215_ln dim6_215 multvul_215 + destring poor215_ln dim6_215 multvul_215, replace + replace area = trim(area) + recast str area + append using `c3' + save `c3', replace + } + else if v31[1]== "mean(p1_ed~215)" { //c4 + drop if v31== "mean(p1_ed~215)" + ren v1 area + ren v31 p1_edu_215 + ren v32 p1_sp_215 + ren v33 p1_fin_215 + ren v34 p1_elec_215 + ren v35 p1_water_215 + keep area code year file p1*_215 + destring p1*_215, replace + replace area = trim(area) + recast str area + append using `c4' + save `c4', replace + } + else if v31[1]== "mean(all2v~215)" { //c5 + drop if v31== "mean(all2v~215)" + ren v1 area + ren v31 all2vul_215 + ren v32 all3vul_215 + ren v33 all4vul_215 + ren v34 all5vul_215 + ren v35 all6vul_215 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c5' + save `c5', replace + } + //365 + else if v31[1]== "mean(poor365~n)" { //c6 + drop if v31== "mean(poor365~n)" + ren v1 area + ren v31 poor365_ln + ren v32 dim6_365 + ren v33 multvul_365 + keep area code year file poor365_ln dim6_365 multvul_365 + destring poor365_ln dim6_365 multvul_365, replace + + replace area = trim(area) + recast str area + append using `c6' + save `c6', replace + } + else if v31[1]== "mean(p1_ed~365)" { //c7 + drop if v31== "mean(p1_ed~365)" + ren v1 area + ren v31 p1_edu_365 + ren v32 p1_sp_365 + ren v33 p1_fin_365 + ren v34 p1_elec_365 + ren v35 p1_water_365 + keep area code year file p1*_365 + destring p1*_365, replace + replace area = trim(area) + recast str area + append using `c7' + save `c7', replace + } + else if v31[1]== 
"mean(all2v~365)" { //c8 + drop if v31== "mean(all2v~365)" + ren v1 area + ren v31 all2vul_365 + ren v32 all3vul_365 + ren v33 all4vul_365 + ren v34 all5vul_365 + ren v35 all6vul_365 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c8' + save `c8', replace + } + //685 + else if v31[1]== "mean(poor685~n)" { //c9 + drop if v31== "mean(poor685~n)" + ren v1 area + ren v31 poor685_ln + ren v32 dim6_685 + ren v33 multvul_685 + keep area code year file poor685_ln dim6_685 multvul_685 + destring poor685_ln dim6_685 multvul_685, replace + + replace area = trim(area) + recast str area + append using `c9' + save `c9', replace + } + else if v31[1]== "mean(p1_ed~685)" { //c10 + drop if v31== "mean(p1_ed~685)" + ren v1 area + ren v31 p1_edu_685 + ren v32 p1_sp_685 + ren v33 p1_fin_685 + ren v34 p1_elec_685 + ren v35 p1_water_685 + keep area code year file p1*_685 + destring p1*_685, replace + replace area = trim(area) + recast str area + append using `c10' + save `c10', replace + } + else { //c11 mean(all2v~685) + drop if v31== "mean(all2v~685)" + ren v1 area + ren v31 all2vul_685 + ren v32 all3vul_685 + ren v33 all4vul_685 + ren v34 all5vul_685 + ren v35 all6vul_685 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c11' + save `c11', replace + } + } // gr listb +} //lista + +use `c1', clear +forv i=2(1)11 { + merge 1:1 code year file area using `c`i'' + drop _merge +} + +drop if area=="reg_rural" +ren area sample +gen level = "_all_" +ren welfare mean_ln +gen lineupyear = $lnyear +gen baseyear = year +order code year file /*survname*/ level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* p1* dim6* _count h + +destring year, replace +merge m:1 code year using "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_${lnyear}", keepus(survname survey_coverage datatype comparability) +keep 
if _merge==3 +drop _merge + +sort code year sample +ren p1* pov1* +order code year file survname survey_coverage datatype comparability level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* _count h +destring baseyear, replace +drop file +char define _dta[date] "`date' `time'" +char define _dta[user] "`user'" +compress + +saveold "${upath2}\03.intermediate\Sim\\${lnyear}\\${rnd}_LIS_nat_vul_${lnyear}", replace + diff --git a/01.code/dofile/2-1d Extract subnat data - for LISSY data.do b/01.code/dofile/2-1d Extract subnat data - for LISSY data.do old mode 100644 new mode 100755 index 64b7176..625e91e --- a/01.code/dofile/2-1d Extract subnat data - for LISSY data.do +++ b/01.code/dofile/2-1d Extract subnat data - for LISSY data.do @@ -1,304 +1,285 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -//Extract national data LIS txt file -clear -version 18 -*global upath2 -global datain "${upath2}\03.intermediate\LISoutput\" - -global rnd AM2024 -global lnyear 2021 - -//update the filename here for each year after each run in LISSY -global f2021 "LIS subnat 2021 listing_job_1245388.txt" -global f2010 "LIS subnat 2010 listing_job_1245393.txt" - -local date = c(current_date) -local time = c(current_time) -local user = c(username) - -//number of tables for each data point -local ntable 11 -forv x=1(1)`ntable' { - tempfile c`x' - save `c`x'', replace emptyok -} - -//import data -import delimited using "${datain}\\${f${lnyear}}" , clear -gen drop = regexm(v1, "-{10,}") -drop if drop ==1 -drop drop -replace v1 = trim(v1) -gen v3 = v2 -split v3, parse("") -gen x = strpos(v1, "-------------") -drop if x>0 & x~=. -drop x -gen x1 = strpos(v1, "(") -gen x2 = strpos(v1, ")") -gen x3 = strpos(v1, "[") -gen x4 = strpos(v1, "]") - -drop if x1> 0 & x2>0 & x3==0 & x4==0 & v2=="" -drop x2 x1 x3 x4 - -//identify table -gen c1 = strpos(v2, "mean(welfare)") -gen c2 = strpos(v2, "mean(dep_edu~m)") -gen c3 = strpos(v2, "mean(poor215~n)") -gen c4 = strpos(v2, "mean(p1_ed~215)") -gen c5 = strpos(v2, "mean(all2v~215)") -gen c6 = strpos(v2, "mean(poor365~n)") -gen c7 = strpos(v2, "mean(p1_ed~365)") -gen c8 = strpos(v2, "mean(all2v~365)") -gen c9 = strpos(v2, "mean(poor685~n)") -gen c10 = strpos(v2, "mean(p1_ed~685)") -gen c11 = strpos(v2, "mean(all2v~685)") - -gen len = length(v1) -gen p4s = strpos(v1, "-") - -split v1 if len==15 & p4s==4, parse("-") -gen seq = _n -gen seq2 = seq if v11~="" -gen cum = sum(seq2) - -tempfile data1 -save `data1',replace -levelsof cum, local(lista) -foreach lvl of local lista { - use `data1', clear - keep if cum==`lvl' - ren v11 code - ren v12 year - ren v13 file - replace code = code[1] - replace year = year[1] - replace file = file[1] - gen gr = c1+c2+c3+c4+c5+c6+c7+c8+c9+c10+c11 - 
gen gr2 = sum(gr) - drop if gr2==0 - levelsof gr2, local(listb) - - tempfile datax - save `datax', replace - local i = 0 - foreach gr of local listb { - use `datax', clear - keep if gr2==`gr' - local i = `i'+1 - drop if v31=="" - - if v31[1]== "mean(welfare)" { //c1 - drop if v31=="mean(welfare)" - ren v1 area - ren v31 welfare - ren v32 dep_infra_elec - ren v33 dep_infra_impw - ren v34 dep_fin - ren v35 dep_sp - keep area code year file welfare dep_infra_elec dep_infra_impw dep_fin dep_sp - destring welfare welfare dep_infra_elec dep_infra_impw dep_fin dep_sp, replace - replace area = trim(area) - recast str area - append using `c1' - save `c1', replace - } - else if v31[1]== "mean(dep_edu~m)" { //c2 - drop if v31== "mean(dep_edu~m)" - ren v1 area - ren v31 dep_educ_com - ren v32 _count - ren v33 h - keep area code year file dep_educ_com _count h - destring dep_educ_com _count h, replace - replace area = trim(area) - recast str area - append using `c2' - save `c2', replace - } - //215 - else if v31[1]== "mean(poor215~n)" { //c3 - drop if v31== "mean(poor215~n)" - ren v1 area - ren v31 poor215_ln - ren v32 dim6_215 - ren v33 multvul_215 - keep area code year file poor215_ln dim6_215 multvul_215 - destring poor215_ln dim6_215 multvul_215, replace - replace area = trim(area) - recast str area - append using `c3' - save `c3', replace - } - else if v31[1]== "mean(p1_ed~215)" { //c4 - drop if v31== "mean(p1_ed~215)" - ren v1 area - ren v31 p1_edu_215 - ren v32 p1_sp_215 - ren v33 p1_fin_215 - ren v34 p1_elec_215 - ren v35 p1_water_215 - keep area code year file p1*_215 - destring p1*_215, replace - replace area = trim(area) - recast str area - append using `c4' - save `c4', replace - } - else if v31[1]== "mean(all2v~215)" { //c5 - drop if v31== "mean(all2v~215)" - ren v1 area - ren v31 all2vul_215 - ren v32 all3vul_215 - ren v33 all4vul_215 - ren v34 all5vul_215 - ren v35 all6vul_215 - keep area code year file all*vul* - destring all*vul*, replace - replace area = 
trim(area) - recast str area - append using `c5' - save `c5', replace - } - //365 - else if v31[1]== "mean(poor365~n)" { //c6 - drop if v31== "mean(poor365~n)" - ren v1 area - ren v31 poor365_ln - ren v32 dim6_365 - ren v33 multvul_365 - keep area code year file poor365_ln dim6_365 multvul_365 - destring poor365_ln dim6_365 multvul_365, replace - - replace area = trim(area) - recast str area - append using `c6' - save `c6', replace - } - else if v31[1]== "mean(p1_ed~365)" { //c7 - drop if v31== "mean(p1_ed~365)" - ren v1 area - ren v31 p1_edu_365 - ren v32 p1_sp_365 - ren v33 p1_fin_365 - ren v34 p1_elec_365 - ren v35 p1_water_365 - keep area code year file p1*_365 - destring p1*_365, replace - replace area = trim(area) - recast str area - append using `c7' - save `c7', replace - } - else if v31[1]== "mean(all2v~365)" { //c8 - drop if v31== "mean(all2v~365)" - ren v1 area - ren v31 all2vul_365 - ren v32 all3vul_365 - ren v33 all4vul_365 - ren v34 all5vul_365 - ren v35 all6vul_365 - keep area code year file all*vul* - destring all*vul*, replace - replace area = trim(area) - recast str area - append using `c8' - save `c8', replace - } - //685 - else if v31[1]== "mean(poor685~n)" { //c9 - drop if v31== "mean(poor685~n)" - ren v1 area - ren v31 poor685_ln - ren v32 dim6_685 - ren v33 multvul_685 - keep area code year file poor685_ln dim6_685 multvul_685 - destring poor685_ln dim6_685 multvul_685, replace - - replace area = trim(area) - recast str area - append using `c9' - save `c9', replace - } - else if v31[1]== "mean(p1_ed~685)" { //c10 - drop if v31== "mean(p1_ed~685)" - ren v1 area - ren v31 p1_edu_685 - ren v32 p1_sp_685 - ren v33 p1_fin_685 - ren v34 p1_elec_685 - ren v35 p1_water_685 - keep area code year file p1*_685 - destring p1*_685, replace - replace area = trim(area) - recast str area - append using `c10' - save `c10', replace - } - else { //c11 mean(all2v~685) - drop if v31== "mean(all2v~685)" - ren v1 area - ren v31 all2vul_685 - ren v32 all3vul_685 - 
ren v33 all4vul_685 - ren v34 all5vul_685 - ren v35 all6vul_685 - keep area code year file all*vul* - destring all*vul*, replace - replace area = trim(area) - recast str area - append using `c11' - save `c11', replace - } - } // gr listb -} //lista - -use `c1', clear -forv i=2(1)11 { - merge 1:1 code year file area using `c`i'' - drop _merge -} - -drop if area=="reg_rural" -ren area sample -gen level = "reg_rural" -ren welfare mean_ln -gen lineupyear = $lnyear -gen baseyear = year -order code year file /*survname*/ level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* p1* dim6* _count h - -destring year, replace -merge m:1 code year using "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_${lnyear}", keepus(survname survey_coverage datatype comparability) -keep if _merge==3 -drop _merge - -sort code year sample -ren p1* pov1* -order code year file survname survey_coverage datatype comparability level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* _count h -destring baseyear, replace -drop file -char define _dta[date] "`date' `time'" -char define _dta[user] "`user'" -compress - -saveold "${upath2}\03.intermediate\Sim\\${lnyear}\\${rnd}_LIS_subnat_vul_${lnyear}", replace - +//Extract national data LIS txt file +clear +version 18 + +global datain "${upath2}\03.intermediate\LISoutput\" + +global rnd AM2024 +global lnyear 2021 + +//update the filename here for each year after each run in LISSY +global f2021 "LIS subnat 2021 listing_job_1245388.txt" +global f2010 "LIS subnat 2010 listing_job_1245393.txt" + +local date = c(current_date) +local time = c(current_time) +local user = c(username) + +//number of tables for each data point +local ntable 11 +forv x=1(1)`ntable' { + tempfile c`x' + save `c`x'', replace emptyok +} + +//import data +import delimited using "${datain}\\${f${lnyear}}" , clear +gen drop = regexm(v1, "-{10,}") +drop if drop ==1 +drop drop +replace v1 = 
trim(v1) +gen v3 = v2 +split v3, parse("") +gen x = strpos(v1, "-------------") +drop if x>0 & x~=. +drop x +gen x1 = strpos(v1, "(") +gen x2 = strpos(v1, ")") +gen x3 = strpos(v1, "[") +gen x4 = strpos(v1, "]") + +drop if x1> 0 & x2>0 & x3==0 & x4==0 & v2=="" +drop x2 x1 x3 x4 + +//identify table +gen c1 = strpos(v2, "mean(welfare)") +gen c2 = strpos(v2, "mean(dep_edu~m)") +gen c3 = strpos(v2, "mean(poor215~n)") +gen c4 = strpos(v2, "mean(p1_ed~215)") +gen c5 = strpos(v2, "mean(all2v~215)") +gen c6 = strpos(v2, "mean(poor365~n)") +gen c7 = strpos(v2, "mean(p1_ed~365)") +gen c8 = strpos(v2, "mean(all2v~365)") +gen c9 = strpos(v2, "mean(poor685~n)") +gen c10 = strpos(v2, "mean(p1_ed~685)") +gen c11 = strpos(v2, "mean(all2v~685)") + +gen len = length(v1) +gen p4s = strpos(v1, "-") + +split v1 if len==15 & p4s==4, parse("-") +gen seq = _n +gen seq2 = seq if v11~="" +gen cum = sum(seq2) + +tempfile data1 +save `data1',replace +levelsof cum, local(lista) +foreach lvl of local lista { + use `data1', clear + keep if cum==`lvl' + ren v11 code + ren v12 year + ren v13 file + replace code = code[1] + replace year = year[1] + replace file = file[1] + gen gr = c1+c2+c3+c4+c5+c6+c7+c8+c9+c10+c11 + gen gr2 = sum(gr) + drop if gr2==0 + levelsof gr2, local(listb) + + tempfile datax + save `datax', replace + local i = 0 + foreach gr of local listb { + use `datax', clear + keep if gr2==`gr' + local i = `i'+1 + drop if v31=="" + + if v31[1]== "mean(welfare)" { //c1 + drop if v31=="mean(welfare)" + ren v1 area + ren v31 welfare + ren v32 dep_infra_elec + ren v33 dep_infra_impw + ren v34 dep_fin + ren v35 dep_sp + keep area code year file welfare dep_infra_elec dep_infra_impw dep_fin dep_sp + destring welfare welfare dep_infra_elec dep_infra_impw dep_fin dep_sp, replace + replace area = trim(area) + recast str area + append using `c1' + save `c1', replace + } + else if v31[1]== "mean(dep_edu~m)" { //c2 + drop if v31== "mean(dep_edu~m)" + ren v1 area + ren v31 dep_educ_com + ren v32 
_count + ren v33 h + keep area code year file dep_educ_com _count h + destring dep_educ_com _count h, replace + replace area = trim(area) + recast str area + append using `c2' + save `c2', replace + } + //215 + else if v31[1]== "mean(poor215~n)" { //c3 + drop if v31== "mean(poor215~n)" + ren v1 area + ren v31 poor215_ln + ren v32 dim6_215 + ren v33 multvul_215 + keep area code year file poor215_ln dim6_215 multvul_215 + destring poor215_ln dim6_215 multvul_215, replace + replace area = trim(area) + recast str area + append using `c3' + save `c3', replace + } + else if v31[1]== "mean(p1_ed~215)" { //c4 + drop if v31== "mean(p1_ed~215)" + ren v1 area + ren v31 p1_edu_215 + ren v32 p1_sp_215 + ren v33 p1_fin_215 + ren v34 p1_elec_215 + ren v35 p1_water_215 + keep area code year file p1*_215 + destring p1*_215, replace + replace area = trim(area) + recast str area + append using `c4' + save `c4', replace + } + else if v31[1]== "mean(all2v~215)" { //c5 + drop if v31== "mean(all2v~215)" + ren v1 area + ren v31 all2vul_215 + ren v32 all3vul_215 + ren v33 all4vul_215 + ren v34 all5vul_215 + ren v35 all6vul_215 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c5' + save `c5', replace + } + //365 + else if v31[1]== "mean(poor365~n)" { //c6 + drop if v31== "mean(poor365~n)" + ren v1 area + ren v31 poor365_ln + ren v32 dim6_365 + ren v33 multvul_365 + keep area code year file poor365_ln dim6_365 multvul_365 + destring poor365_ln dim6_365 multvul_365, replace + + replace area = trim(area) + recast str area + append using `c6' + save `c6', replace + } + else if v31[1]== "mean(p1_ed~365)" { //c7 + drop if v31== "mean(p1_ed~365)" + ren v1 area + ren v31 p1_edu_365 + ren v32 p1_sp_365 + ren v33 p1_fin_365 + ren v34 p1_elec_365 + ren v35 p1_water_365 + keep area code year file p1*_365 + destring p1*_365, replace + replace area = trim(area) + recast str area + append using `c7' + save `c7', replace + } + 
else if v31[1]== "mean(all2v~365)" { //c8 + drop if v31== "mean(all2v~365)" + ren v1 area + ren v31 all2vul_365 + ren v32 all3vul_365 + ren v33 all4vul_365 + ren v34 all5vul_365 + ren v35 all6vul_365 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c8' + save `c8', replace + } + //685 + else if v31[1]== "mean(poor685~n)" { //c9 + drop if v31== "mean(poor685~n)" + ren v1 area + ren v31 poor685_ln + ren v32 dim6_685 + ren v33 multvul_685 + keep area code year file poor685_ln dim6_685 multvul_685 + destring poor685_ln dim6_685 multvul_685, replace + + replace area = trim(area) + recast str area + append using `c9' + save `c9', replace + } + else if v31[1]== "mean(p1_ed~685)" { //c10 + drop if v31== "mean(p1_ed~685)" + ren v1 area + ren v31 p1_edu_685 + ren v32 p1_sp_685 + ren v33 p1_fin_685 + ren v34 p1_elec_685 + ren v35 p1_water_685 + keep area code year file p1*_685 + destring p1*_685, replace + replace area = trim(area) + recast str area + append using `c10' + save `c10', replace + } + else { //c11 mean(all2v~685) + drop if v31== "mean(all2v~685)" + ren v1 area + ren v31 all2vul_685 + ren v32 all3vul_685 + ren v33 all4vul_685 + ren v34 all5vul_685 + ren v35 all6vul_685 + keep area code year file all*vul* + destring all*vul*, replace + replace area = trim(area) + recast str area + append using `c11' + save `c11', replace + } + } // gr listb +} //lista + +use `c1', clear +forv i=2(1)11 { + merge 1:1 code year file area using `c`i'' + drop _merge +} + +drop if area=="reg_rural" +ren area sample +gen level = "reg_rural" +ren welfare mean_ln +gen lineupyear = $lnyear +gen baseyear = year +order code year file /*survname*/ level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* p1* dim6* _count h + +destring year, replace +merge m:1 code year using "${upath2}\03.intermediate\Lineuplist\LISSY_ln_list_${lnyear}", keepus(survname survey_coverage datatype 
comparability) +keep if _merge==3 +drop _merge + +sort code year sample +ren p1* pov1* +order code year file survname survey_coverage datatype comparability level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* _count h +destring baseyear, replace +drop file +char define _dta[date] "`date' `time'" +char define _dta[user] "`user'" +compress + +saveold "${upath2}\03.intermediate\Sim\\${lnyear}\\${rnd}_LIS_subnat_vul_${lnyear}", replace + diff --git a/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do b/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do old mode 100644 new mode 100755 index c3dcf55..813972d --- a/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do +++ b/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do @@ -1,411 +1,391 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -//CHN vul 2021 - -clear all -tempfile data1 data2 data3 data4 dataall -save `dataall', replace emptyok - -*global upath2 -global sim 100 - -//Get from PIP for the lineup year 2021 -local pov215 = 0.00 -local pov365 = 0.0472 -local pov685 = 17.0336 - -/* from DLW -levelnote cpi2017 icp2017 (cpi is the same so one value) -rural 1.021 3.4950106 -urban 1.021 4.3184844 -*/ -local cpi2017 = 1.021 -local icp2017urb = 4.3184844 -local icp2017rur = 3.4950106 - -local code CHN -local lineupyear 2021 -local baseyear 2018 -local survname CHIP -local welfaretype CONS - -**************************************************** -//Get values for fusion ASPIRE FINDEX -//ASPIRE -use "${upath2}\02.input\2021\ASPIRE_data_2021.dta", clear -keep if code=="`code'" -forv i=1(1)5 { - local _pop_All_SPL_q`i' -} -local aspire_sp -count -if r(N)>0 { - local aspire_sp = 1 - forv i=1(1)5 { - local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] - } -} //rn -else { - local aspire_sp = 0 -} - -//FINDEX data (no account, which is dep_fin) -use "${upath2}\02.input\2021\findex_2021_quintiles.dta", clear -keep if code=="`code'" -forv i=1(1)5 { - local no_accountq`i'total -} - -local findex -count -if r(N)>0 { - local findex = 1 - forv i=1(1)5 { - local no_accountq`i'total = no_accountq`i'total[1] - } -} //rn -else { - local findex = 0 -} - -**************************************************** -//Load microdata -use "${upath2}\02.input\CHN\CHN_2018_CHIP.dta", clear - -la def urban 1 "Urban" 0 "Rural" -la val urban urban -decode urban, gen(urb2) -gen reg_urb = subnatid + "*_*" + urb2 -local subnatvar subnatid reg_urb - -// welfare variable -gen gallT_ppp = welfare/`cpi2017'/`icp2017urb'/365 if urban==1 -replace gallT_ppp = welfare/`cpi2017'/`icp2017rur'/365 if urban==0 -replace gallT_ppp = 0.25 if gallT_ppp<0.25 - -**************************************************** -**Dimension 1: Education 
-**************************************************** - -**1a) Indicator: no one in hh with primary completion (age 15+) -//All adults -global eduage 15 -local eduflag = 0 -cap gen educat5 = . -cap gen educat7 = . - -cap su educat7 -if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) -} -else { //educat5 - cap su educat5 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) - } - else { //educat4 - cap su educat4 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) - } - else { //no education available - local eduflag = 1 - } - } //educat4 -} - -if `eduflag'==0 { - gen temp2a = 1 if age>=$eduage & age~=. - bys hhid: egen educ_com_size = sum(temp2a) - bys hhid: egen temp3 = sum(temp2) - bys hhid: egen temp3c = sum(temp2c) - gen dep_educ_com = 0 - replace dep_educ_com = 1 if temp3==0 - gen dep_educ_com_lb = 0 - replace dep_educ_com_lb = 1 if temp3c==0 - ren temp3 educ_com_sum - ren temp3c educ_com_sum_lb - drop temp2 temp2a temp2c -} -else { - gen dep_educ_com = . - gen dep_educ_com_lb = . - gen educ_com_sum = . - gen educ_com_sum_lb = . - gen educ_com_size = . -} - -gen educ_com_appl = 1 -replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) -gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. -bys hhid: egen educ_com_mis = sum(temp2b) -drop temp2b -gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
- -la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" -la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" -la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" -la var educ_com_appl_miss "School completion is applicable households but missing completion" -cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss - -**************************************************** -**Dimension 2: Access to infrastructure -**************************************************** - -**************************************************** -//Indicator: Electricity -cap des electricity -if _rc==0 gen dep_infra_elec = electricity==0 if electricity~=. -else gen dep_infra_elec = . -la var dep_infra_elec "Deprived if HH has No access to electricity" - -**************************************************** -//Indicator: Water -cap des imp_wat_rec -if _rc==0 gen dep_infra_impw = imp_wat_rec==0 if imp_wat_rec~=. -else gen dep_infra_impw = . -la var dep_infra_impw "Deprived if HH has No access to improved drinking water" - -**************************************************** -**Dimension 3: Monetary -**************************************************** -//recalculate the 2.15 line for 2.15 poverty -qui foreach num of numlist 215 365 685 { - if `pov`num''==0 { - *local pline`num' = `=`num'/100' - local pline`num' = 0.25 - } - else { - _pctile gallT_ppp [aw=weight_p], p(`pov`num'') - local pline`num' = r(r1) - } - - gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. - gen pline`num' = `pline`num'' - -} //num - -//Scaled IND to HH -//get 15+ population size by quintile or quintile/urban rural only when age is available. 
-forv a1=1(1)5 { - local n15q`a1'total = 1 - local n15q`a1'urban = 1 - local n15q`a1'rural = 1 -} - -_ebin gallT_ppp [aw=weight_p], gen(q5ind) nq(5) -cap des age -if _rc==0 { - qui su age - if r(N)>0 { - gen tmp = age>=15 & age~=. - bys hhid (pid): egen n15 = total(tmp) - //`no_accountq`i'`nm'' `no_accountq`i'total' - forv a1=1(1)5 { - su n15 [aw=weight_p] if q5ind==`a1' - local n15q`a1'total = r(mean) - - su n15 [aw=weight_p] if q5ind==`a1' & urban==1 - local n15q`a1'urban = r(mean) - - su n15 [aw=weight_p] if q5ind==`a1' & urban==0 - local n15q`a1'rural = r(mean) - } //a1 - } //rN -} //age -cap drop q5ind tmp n15 - -//Convert to HH -bys hhid: egen double pop = total(weight_p) -duplicates drop hhid, force - -clonevar weight_use = pop - -//quintiles -_ebin gallT_ppp [aw=pop], gen(q5) nq(5) -gen g40 = q5==1|q5==2 -gen test = 1 -des,sh -tempfile databfsim -save `databfsim', replace - -//loop through random assignments - set seed 1234567 - clear - tempfile ctry1 ctry1ln - save `ctry1', replace emptyok - save `ctry1ln', replace emptyok - - noi display _n(1) - noi display in yellow "Number of simulations: $sim" _n(1) - noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") - - qui forv sim=1(1)$sim { - use `databfsim', clear - - //findex access no_accountq`i'total - if `findex'==1 { - gen dep_fin = . - forv i=1(1)5 { - cap drop _a`i' - if (`no_accountq`i'total' > 0) { - *wsample test [aw=pop] if q5==`i', percent(`no_accountq`i'total') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - local adjfin = 100*((`=`no_accountq`i'total'/100')^(0.6*`n15q`i'total')) - wsample test [aw=pop] if q5==`i', percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - - } - else { - gen _a`i' = 0 if q5==`i' - } - replace dep_fin = _a`i' if q5==`i' - drop _a`i' - } //i - gen fin_flag = 0 - } - else { //missing - gen dep_fin = . 
- gen fin_flag = 1 - } - - //SP access _pop_All_SPL_q`i' - /* - if `aspire_sp'==1 { - gen dep_sp = . - forv i=1(1)5 { - cap drop _a`i' - if (`_pop_All_SPL_q`i'' > 0) { - wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' - } - replace dep_sp = 1-_a`i' if q5==`i' - drop _a`i' - } //i - - gen sp_flag = 0 - } - else { //missing - gen dep_sp = . - gen sp_flag = 1 - } - */ - //ASPIRE CHN2018 is universal now. 96.7 - gen dep_sp = 0 - - //multidimensional vulnerability - foreach num of numlist 215 365 685 { - //vulnerable and one dim - gen pov1_edu_`num' = 0 - replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 - - gen pov1_sp_`num' = 0 - replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 - - gen pov1_fin_`num' = 0 - replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 - - gen pov1_elec_`num' = 0 - replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 - - gen pov1_water_`num' = 0 - replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 - - //rsum - egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing - - //any of the 6 dimensions - deprived in education; dep_sp; dep_fin - gen multvul_`num' = 0 - replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. 
- - // any 2, 3, 4,...,6 - forv j=2(1)6 { - gen all`j'vul_`num' = 0 - replace all`j'vul_`num' = 1 if dim6_`num'==`j' - } - } //povlist - gen _all_ = "All sample" - - gen sim = `sim' - gen _count=1 - //collapse to get indicators - compress - tempfile data2 - save `data2', replace - - *local lvllist _all_ urban2 subnatid subnatid1 /*db040 */ - local lvllist _all_ urb2 `subnatvar' - qui foreach var of local lvllist { - use `data2', clear - clonevar h = pop - *clonevar h_ln = pop - *clonevar wta_pov = pop - replace `var' = stritrim(`var') - replace `var' = ustrtrim(`var') - - levelsof `var', local(lvllist2) - cap confirm string variable `var' - if _rc==0 local st = 1 - else local st = 0 - - qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') - - - rename gallT_ppp mean_ln - ren _count nohh - ren h noind - egen double totalhh = total(nohh) - egen double totalind = total(noind) - gen sh_hh = nohh/totalhh - gen sh_pop = noind/totalind - - ren `var' sample - gen level = "`var'" - gen code = "`code'" - gen lineupyear = `lineupyear' - gen baseyear = `baseyear' - gen survname = "`survname'" - gen str welfaretype = "`welfaretype'" - - append using `ctry1ln' - order code baseyear lineupyear survname welfaretype level sample sim mean_ln poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind - save `ctry1ln', replace - } //foreach subnat - - if (mod(`sim',50)==0){ - noi display in white ". `sim'" _continue - noi display _n(0) - } - else noi display "." 
_continue - } //sim - //collapse across sim - - //save results - use `ctry1ln', replace - save "${upath2}\03.intermediate\Sim\2021\temp\CHN_2018_CHIP_2021_lnsim", replace - - groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) - - order code survname level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind - gen todo = 1 - saveold "${upath2}\03.intermediate\Sim\2021\CHN_2018_CHIP_2021", replace +//CHN vul 2021 + +clear all +tempfile data1 data2 data3 data4 dataall +save `dataall', replace emptyok + +global sim 100 + +//Get from PIP for the lineup year 2021 +local pov215 = 0.00 +local pov365 = 0.0472 +local pov685 = 17.0336 + +/* from DLW +levelnote cpi2017 icp2017 (cpi is the same so one value) +rural 1.021 3.4950106 +urban 1.021 4.3184844 +*/ +local cpi2017 = 1.021 +local icp2017urb = 4.3184844 +local icp2017rur = 3.4950106 + +local code CHN +local lineupyear 2021 +local baseyear 2018 +local survname CHIP +local welfaretype CONS + +**************************************************** +//Get values for fusion ASPIRE FINDEX +//ASPIRE +use "${upath2}\02.input\2021\ASPIRE_data_2021.dta", clear +keep if code=="`code'" +forv i=1(1)5 { + local _pop_All_SPL_q`i' +} +local aspire_sp +count +if r(N)>0 { + local aspire_sp = 1 + forv i=1(1)5 { + local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] + } +} //rn +else { + local aspire_sp = 0 +} + +//FINDEX data (no account, which is dep_fin) +use "${upath2}\02.input\2021\findex_2021_quintiles.dta", clear +keep if code=="`code'" +forv i=1(1)5 { + local no_accountq`i'total +} + +local findex +count +if r(N)>0 { + local findex = 1 + forv i=1(1)5 { + local no_accountq`i'total = no_accountq`i'total[1] + } +} //rn +else { + local findex = 0 +} + +**************************************************** +//Load microdata +use 
"${upath2}\02.input\CHN\CHN_2018_CHIP.dta", clear + +la def urban 1 "Urban" 0 "Rural" +la val urban urban +decode urban, gen(urb2) +gen reg_urb = subnatid + "*_*" + urb2 +local subnatvar subnatid reg_urb + +// welfare variable +gen gallT_ppp = welfare/`cpi2017'/`icp2017urb'/365 if urban==1 +replace gallT_ppp = welfare/`cpi2017'/`icp2017rur'/365 if urban==0 +replace gallT_ppp = 0.25 if gallT_ppp<0.25 + +**************************************************** +**Dimension 1: Education +**************************************************** + +**1a) Indicator: no one in hh with primary completion (age 15+) +//All adults +global eduage 15 +local eduflag = 0 +cap gen educat5 = . +cap gen educat7 = . + +cap su educat7 +if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) +} +else { //educat5 + cap su educat5 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) + } + else { //educat4 + cap su educat4 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) + } + else { //no education available + local eduflag = 1 + } + } //educat4 +} + +if `eduflag'==0 { + gen temp2a = 1 if age>=$eduage & age~=. + bys hhid: egen educ_com_size = sum(temp2a) + bys hhid: egen temp3 = sum(temp2) + bys hhid: egen temp3c = sum(temp2c) + gen dep_educ_com = 0 + replace dep_educ_com = 1 if temp3==0 + gen dep_educ_com_lb = 0 + replace dep_educ_com_lb = 1 if temp3c==0 + ren temp3 educ_com_sum + ren temp3c educ_com_sum_lb + drop temp2 temp2a temp2c +} +else { + gen dep_educ_com = . + gen dep_educ_com_lb = . + gen educ_com_sum = . + gen educ_com_sum_lb = . + gen educ_com_size = . +} + +gen educ_com_appl = 1 +replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) +gen temp2b = 1 if age>=$eduage & age~=. 
& educat4==. & educat5==. & educat7==. +bys hhid: egen educ_com_mis = sum(temp2b) +drop temp2b +gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. + +la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" +la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" +la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" +la var educ_com_appl_miss "School completion is applicable households but missing completion" +cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss + +**************************************************** +**Dimension 2: Access to infrastructure +**************************************************** + +**************************************************** +//Indicator: Electricity +cap des electricity +if _rc==0 gen dep_infra_elec = electricity==0 if electricity~=. +else gen dep_infra_elec = . +la var dep_infra_elec "Deprived if HH has No access to electricity" + +**************************************************** +//Indicator: Water +cap des imp_wat_rec +if _rc==0 gen dep_infra_impw = imp_wat_rec==0 if imp_wat_rec~=. +else gen dep_infra_impw = . +la var dep_infra_impw "Deprived if HH has No access to improved drinking water" + +**************************************************** +**Dimension 3: Monetary +**************************************************** +//recalculate the 2.15 line for 2.15 poverty +qui foreach num of numlist 215 365 685 { + if `pov`num''==0 { + *local pline`num' = `=`num'/100' + local pline`num' = 0.25 + } + else { + _pctile gallT_ppp [aw=weight_p], p(`pov`num'') + local pline`num' = r(r1) + } + + gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. + gen pline`num' = `pline`num'' + +} //num + +//Scaled IND to HH +//get 15+ population size by quintile or quintile/urban rural only when age is available. 
+forv a1=1(1)5 { + local n15q`a1'total = 1 + local n15q`a1'urban = 1 + local n15q`a1'rural = 1 +} + +_ebin gallT_ppp [aw=weight_p], gen(q5ind) nq(5) +cap des age +if _rc==0 { + qui su age + if r(N)>0 { + gen tmp = age>=15 & age~=. + bys hhid (pid): egen n15 = total(tmp) + //`no_accountq`i'`nm'' `no_accountq`i'total' + forv a1=1(1)5 { + su n15 [aw=weight_p] if q5ind==`a1' + local n15q`a1'total = r(mean) + + su n15 [aw=weight_p] if q5ind==`a1' & urban==1 + local n15q`a1'urban = r(mean) + + su n15 [aw=weight_p] if q5ind==`a1' & urban==0 + local n15q`a1'rural = r(mean) + } //a1 + } //rN +} //age +cap drop q5ind tmp n15 + +//Convert to HH +bys hhid: egen double pop = total(weight_p) +duplicates drop hhid, force + +clonevar weight_use = pop + +//quintiles +_ebin gallT_ppp [aw=pop], gen(q5) nq(5) +gen g40 = q5==1|q5==2 +gen test = 1 +des,sh +tempfile databfsim +save `databfsim', replace + +//loop through random assignments + set seed 1234567 + clear + tempfile ctry1 ctry1ln + save `ctry1', replace emptyok + save `ctry1ln', replace emptyok + + noi display _n(1) + noi display in yellow "Number of simulations: $sim" _n(1) + noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") + + qui forv sim=1(1)$sim { + use `databfsim', clear + + //findex access no_accountq`i'total + if `findex'==1 { + gen dep_fin = . + forv i=1(1)5 { + cap drop _a`i' + if (`no_accountq`i'total' > 0) { + *wsample test [aw=pop] if q5==`i', percent(`no_accountq`i'total') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + local adjfin = 100*((`=`no_accountq`i'total'/100')^(0.6*`n15q`i'total')) + wsample test [aw=pop] if q5==`i', percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + + } + else { + gen _a`i' = 0 if q5==`i' + } + replace dep_fin = _a`i' if q5==`i' + drop _a`i' + } //i + gen fin_flag = 0 + } + else { //missing + gen dep_fin = . 
+ gen fin_flag = 1 + } + + //SP access _pop_All_SPL_q`i' + /* + if `aspire_sp'==1 { + gen dep_sp = . + forv i=1(1)5 { + cap drop _a`i' + if (`_pop_All_SPL_q`i'' > 0) { + wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' + } + replace dep_sp = 1-_a`i' if q5==`i' + drop _a`i' + } //i + + gen sp_flag = 0 + } + else { //missing + gen dep_sp = . + gen sp_flag = 1 + } + */ + //ASPIRE CHN2018 is universal now. 96.7 + gen dep_sp = 0 + + //multidimensional vulnerability + foreach num of numlist 215 365 685 { + //vulnerable and one dim + gen pov1_edu_`num' = 0 + replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 + + gen pov1_sp_`num' = 0 + replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 + + gen pov1_fin_`num' = 0 + replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 + + gen pov1_elec_`num' = 0 + replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 + + gen pov1_water_`num' = 0 + replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 + + //rsum + egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing + + //any of the 6 dimensions - deprived in education; dep_sp; dep_fin + gen multvul_`num' = 0 + replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. 
+ + // any 2, 3, 4,...,6 + forv j=2(1)6 { + gen all`j'vul_`num' = 0 + replace all`j'vul_`num' = 1 if dim6_`num'==`j' + } + } //povlist + gen _all_ = "All sample" + + gen sim = `sim' + gen _count=1 + //collapse to get indicators + compress + tempfile data2 + save `data2', replace + + *local lvllist _all_ urban2 subnatid subnatid1 /*db040 */ + local lvllist _all_ urb2 `subnatvar' + qui foreach var of local lvllist { + use `data2', clear + clonevar h = pop + *clonevar h_ln = pop + *clonevar wta_pov = pop + replace `var' = stritrim(`var') + replace `var' = ustrtrim(`var') + + levelsof `var', local(lvllist2) + cap confirm string variable `var' + if _rc==0 local st = 1 + else local st = 0 + + qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') + + + rename gallT_ppp mean_ln + ren _count nohh + ren h noind + egen double totalhh = total(nohh) + egen double totalind = total(noind) + gen sh_hh = nohh/totalhh + gen sh_pop = noind/totalind + + ren `var' sample + gen level = "`var'" + gen code = "`code'" + gen lineupyear = `lineupyear' + gen baseyear = `baseyear' + gen survname = "`survname'" + gen str welfaretype = "`welfaretype'" + + append using `ctry1ln' + order code baseyear lineupyear survname welfaretype level sample sim mean_ln poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind + save `ctry1ln', replace + } //foreach subnat + + if (mod(`sim',50)==0){ + noi display in white ". `sim'" _continue + noi display _n(0) + } + else noi display "." 
_continue + } //sim + //collapse across sim + + //save results + use `ctry1ln', replace + save "${upath2}\03.intermediate\Sim\2021\temp\CHN_2018_CHIP_2021_lnsim", replace + + groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) + + order code survname level sample baseyear lineupyear mean_ln poor215_ln poor365_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind + gen todo = 1 + saveold "${upath2}\03.intermediate\Sim\2021\CHN_2018_CHIP_2021", replace diff --git a/01.code/dofile/2-3 Estimate vul rate for GMD data full.do b/01.code/dofile/2-3 Estimate vul rate for GMD data full.do old mode 100644 new mode 100755 index 80fa6f9..8102a65 --- a/01.code/dofile/2-3 Estimate vul rate for GMD data full.do +++ b/01.code/dofile/2-3 Estimate vul rate for GMD data full.do @@ -1,1485 +1,1476 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -//GMD vul - -clear all -set more off -set matsize 5000 -mat drop _all -set varabbrev off - -//setting -global rnd AM24 -global sim 100 -*global upath2 c:\Users\WB327173\OneDrive - WBG\Downloads\ECA\Global\Climate change and poverty\Vulnerable to poverty and climate 2.0\ -global reposource "${upath2}\02.input" -global repotxt repo(use ${rnd}all) reporoot(${reposource}) -global lnyear 2021 -global circa 3 -global plinelist 215 365 685 - -cap log close -log using "${upath2}\03.intermediate\Sim\\${lnyear}\\GMD_log_${lnyear}.txt", text replace -local date: di %tdMon-DD-CCYY date("$S_DATE", "DMY") -local user = "`c(username)'" -local fdataall_ln Vul_dataall_${lnyear}_`date' - -tempfile dataall dataall_ln data4 data5 ctry1 ctry1ln -save "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'", replace emptyok - -//GMD todo list -use "${upath2}\02.input\\${lnyear}\GMD_list_${lnyear}", clear -replace todo = 0 if todo==. - -//update manually -replace level = "subnatid1" if level == "subnatid" & code=="KGZ" & surv_year==2010 -replace level = "subnatid" if level == "" & code=="UGA" & surv_year==2009 - -//Doing outside GMD: India, CHN, and LIS countries -drop if (code=="CHN"|code=="IND") & ${lnyear}==2021 -drop if (code=="CHN") & ${lnyear}==2010 -drop if strpos(survname,"-LIS")>0 -drop if mod=="HIST" -drop if code=="SYR" -replace ct_urban = 0 if code=="SYC" & surv_year==2018 -//add flag missing =4 -foreach var of varlist elec_flag water_flag sp_flag findex_flag edu_flag { - replace `var'= 4 if `var'==. 
-} - -ren level lvlvar -replace lvlvar = "" if lvlvar=="national" -local allobs = _N -tempfile gmdlist -save `gmdlist', replace - -qui forv j=1(1)`allobs' { -*qui forv i=1(1)1 { - use `gmdlist', clear - foreach lc in code surv_year survname mod lvlvar elec_flag water_flag sp_flag findex_flag edu_flag ct_urban todo { - local `lc' = `lc'[`j'] - } - - //Load lineup poverty/pop at national level - foreach num of numlist ${plinelist} { - use "${upath2}\03.intermediate\PIPinput\PIP_${lnyear}_`num'.dta", clear - keep if country_code=="`code'" - count - local pcnpov`num' - local pcnpop - if r(N)>0 { - local pcnpov`num' = headcount[1] - local pcnpop = population[1] - } - } - - //Get values for fusion ASPIRE FINDEX, etc when flags==3 - **************************************************** - //ASPIRE - if `sp_flag'==3 { - use "${upath2}\02.input\\${lnyear}\ASPIRE_data_${lnyear}.dta", clear - keep if code=="`code'" - local aspire_sp - count - if r(N)>0 { - local type_aspire = type[1] - if "`type_aspire'"=="Quintile" { - local aspire_sp = 1 - forv i=1(1)5 { - local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] - } - } - else { //type_aspire == National - local _pop_All_SPL = _pop_All_SPL[1] - local aspire_sp = 1 - } - } //rn>0 - else { - local aspire_sp = 0 - } - } //sp_flag - - **************************************************** - //FINDEX data (no account, which is dep_fin) - if `findex_flag'==3 { - use "${upath2}\02.input\\${lnyear}\findex_${lnyear}_quintiles.dta", clear - keep if code=="`code'" - local findex - count - if r(N)>0 { - local type_findex = type[1] - local findex = 1 - if `ct_urban'==0 local type_findex "Total" - if "`type_findex'"=="Urb_rur" { //urban-rural quintiles - local findex = 1 - forv i=1(1)5 { - foreach nm in urban rural { - local no_accountq`i'`nm' = no_accountq`i'`nm'[1] - } - } - } - else { //total quintiles - local findex = 1 - forv i=1(1)5 { - local no_accountq`i'total = no_accountq`i'total[1] - } - } //types - } - else { //rn>0 - local findex = 0 - } 
- } //findex_flag - - **************************************************** - //JMP data - if `water_flag'==3 { - use "${upath2}\02.input\\${lnyear}\JMP_cov_${lnyear}.dta", clear - keep if code=="`code'" - local jmp - count - if r(N)>0 { - local type_jmp = type[1] - local jmp = 1 - if `ct_urban'==0 local type_jmp "Total" - if "`type_jmp'"=="Urb_rur" { //urban-rural only - local w_imp_urban = w_imp_urban[1] - local w_imp_rural = w_imp_rural[1] - } - else { //total - local w_imp_total = w_imp_total[1] - } //types - } - else { //rn>0 - local jmp = 0 - } - } //water_flag - - **************************************************** - //Electricity GED - if `elec_flag'==3 { - use "${upath2}\02.input\\${lnyear}\GED_cov_${lnyear}.dta", clear - keep if code=="`code'" - local ged - count - if r(N)>0 { - local type_ged = type[1] - local ged = 1 - if `ct_urban'==0 local type_ged "Total" - if "`type_ged'"=="Urb_rur" { //urban-rural only - local ged_urban = ged_urban[1] - local ged_rural = ged_rural[1] - } - else { //total quintiles - local ged_total = ged_total[1] - } //types - } - else { //rn>0 - local ged = 0 - } - } //elec_flag - - **************************************************** - //UNESCO - if `edu_flag'==3 { - use "${upath2}\02.input\\${lnyear}\UNESCO_cov_${lnyear}.dta", clear - keep if code=="`code'" - local unesco - count - if r(N)>0 { - local type_unesco = type[1] - local unesco = 1 - if `ct_urban'==0 local type_unesco "Total" - if "`type_unesco'"=="Urb_rur" { //urban-rural only - local unesco_urban = unesco_urban[1] - local unesco_rural = unesco_rural[1] - } - else { //total quintiles - local unesco_total = unesco_total[1] - } //types - } - else { //rn>0 - local unesco = 0 - } - } - - //microdata - cap dlw, country(`code') year(`surv_year') type(gmd) mod(`mod') surveyid(`survname') files $repotxt - if _rc==0 { - cap ren sim simsur - local baseyear = `surv_year' - local year = `surv_year' - local lineupyear = $lnyear - gen _all_ = "All sample" - noi dis "`j' - Working 
on `code'-`surv_year'-`survname'-`mod'" - - if "`mod'"=="GPWG" local wgt weight - else local wgt weight_p - - //Prep subnational level - if ("`lvlvar'"=="") local oklist _all_ - else { - cap confirm numeric variable `lvlvar' - if _rc==0 { - tempvar xvar - cap decode `lvlvar', gen(`xvar') - if _rc~=0 tostring `lvlvar', gen(`xvar') - cap drop `lvlvar' - rename `xvar' `lvlvar' - } - replace `lvlvar' = "MISSING" if `lvlvar'=="" - replace `lvlvar' = ustrtrim(`lvlvar') - local oklist `lvlvar' - } - - //adjustment for country specific fix subnational - qui { - if "`code'"=="PHL" & "`survname'"=="FIES" & (`year'==2018) { //ok - if "`lvlvar'"=="subnatid2" { - replace subnatid2="Basilan" if subnatid2=="7-Basilan" | subnatid2=="97-Isabela City" - replace subnatid2="North Cotabato" if subnatid2=="47-Cotabato" - replace subnatid2="Davao Del Sur" if subnatid2=="24-Davao de Sur" | subnatid2=="86-Davao Occidental" - replace subnatid2="Maguindanao" if subnatid2=="38-Maguindanao" | subnatid2=="98-Cotabato City" - replace subnatid2="Metropolitan Manila" if subnatid2=="39-Manila" | subnatid2=="74-NCR-2nd Dist." | subnatid2=="75-NCR-3rd Dist." | subnatid2=="76-NCR-4th Dist." - local oklist subnatid2 - } - } - - if "`code'"=="PHL" & "`survname'"=="FIES" & `year'==2021 { //ok - if "`lvlvar'"=="subnatid2" { - replace subnatid2="Basilan" if subnatid2=="7-Basilan" | subnatid2=="97-Isabela City" - replace subnatid2="North Cotabato" if subnatid2=="47-Cotabato" - replace subnatid2="Davao Del Sur" if subnatid2=="24-Davao de Sur" | subnatid2=="86-Davao Occidental" - replace subnatid2="Maguindanao" if subnatid2=="38-Maguindanao" | subnatid2=="98-Cotabato City" - replace subnatid2="Metropolitan Manila" if subnatid2=="39-Manila" | subnatid2=="74-NCR-2nd Distr." | subnatid2=="75-NCR-3rd Distr." | subnatid2=="76-NCR-4th Distr." 
- local oklist subnatid2 - } - } - - if "`code'"=="ALB" & "`survname'"=="LSMS" & `year'==2012 { //ok - decode strata, gen(subnatid) - replace subnatid = subinstr(subnatid, "_Rural","",.) - replace subnatid = subinstr(subnatid, "_Urban","",.) - local oklist subnatid - } - - if "`code'"=="BGD" & "`survname'"=="HIES" & (`year'==2016|`year'==2022) { //ok - replace subnatid = "40 - Khulna" if subnatid=="45 - Mymensingh" | subnatid=="45-Mymensingh"|subnatid=="40-Khulna" - local oklist subnatid - } - - if "`code'"=="GEO" & "`survname'"=="HIS" { //ok - if `year'>=2019 & `year'<=2021 { - replace subnatid="10 - Imereti, Racha-Lechkhumi and Kvemo Svan" if subnatid=="10 - Imereti" | subnatidsurvey=="13 - Racha-Lechkhumi and Kvemo Svaneti" - } - if `year'>=2002 & `year'<=2009 { - replace subnatid="10 - Imereti, Racha-Lechkhumi and Kvemo Svan" if subnatid=="10-Imereti" - replace subnatid="7 - Adjara A.R." if subnatid=="7-Ajara" - replace subnatid="9 - Samegrelo-Zemo Svaneti" if subnatid=="9-Samegrelo" - } - } - - if "`code'"=="IDN" & "`survname'"=="SUSENAS" { //ok - if (`year'>=2010 & `year'<=2023) { - replace `lvlvar' = "64-65-North and East Kalimantan" if `lvlvar'=="64-East Kalimantan" | `lvlvar'=="65-North Kalimantan" - } - - if `year'==2005 { - replace `lvlvar' = "21-Riau Islands" if `lvlvar'=="21-Riau Island" - replace `lvlvar' = "64-65-North and East Kalimantan" if `lvlvar'=="64-East Kalimantan" | `lvlvar'=="65-North Kalimantan" - } - } - - if "`code'"=="MNE" & "`survname'"=="SILC-C" & (`year'==2016 | `year'==2017) { //ok - gen subnatid1="" - replace subnatid1="1 – North" if subnatid=="1 – North urban" | subnatid=="5 – North rural" - replace subnatid1="2 – Center" if subnatid=="2 – Center urban" | subnatid=="6 – Center rural" - replace subnatid1="3 – South" if subnatid=="3 – South urban" | subnatid=="7 – South rural" - replace subnatid1="4 – Podgorica" if subnatid=="4 – Podgorica urban" | subnatid=="8 – Podgorica rural" - - local oklist subnatid1 - } - - if "`code'"=="DJI" 
& "`survname'"=="EDAM" & `year'==2017 { //ok - gen subnatid1=subnatid - replace subnatid1="10 - Djibouti" if subnatid=="11 - Djibouti-ville, 1er arrondissement" - replace subnatid1="10 - Djibouti" if subnatid=="12 - Djibouti-ville, 2eme arrondissement" - replace subnatid1="10 - Djibouti" if subnatid=="13 - Djibouti-ville, 3eme arrondissement" - replace subnatid1="10 - Djibouti" if subnatid=="14 - Djibouti-ville, 4eme arrondissement" - replace subnatid1="10 - Djibouti" if subnatid=="15 - Djibouti-ville, 5eme arrondissement" - - local oklist subnatid1 - } - if "`code'"=="STP" { //ok - if `year'==2010 { - replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 - Nord" | `lvlvar'=="2 - Centre" | `lvlvar'=="3 - Sud" - replace `lvlvar' = "2 - Principé" if `lvlvar'=="4 - Principé" - } - if `year'==2000 { - replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 – Nord" | `lvlvar'=="2 – Centre" | `lvlvar'=="3 – Sud" - replace `lvlvar' = "2 - Principé" if `lvlvar'=="4 – Principé" - } - if `year'==2017 { - replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 - Lobata"|`lvlvar'=="2 - Lembá"|`lvlvar'=="3 - Mezochi"|`lvlvar'=="4 - Agua Grande"|`lvlvar'=="5 - Cantagalo"|`lvlvar'=="6 - Caué" - replace `lvlvar' = "2 - Principé" if `lvlvar'=="7 - Príncipe" - } - } - - if "`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2010 | `year'==2012) { - replace subnatid="1-Metropolitan" if subnatid=="1-Metropolitan"|subnatid=="1 - Metropolitan" - replace subnatid="2-Lower" if subnatid=="2-Lower Urban"|subnatid=="3-Lower Rural" - replace subnatid="2-Lower" if subnatid=="2 - Lower Urban"|subnatid=="3 - Lower Rural" - replace subnatid="4-Upper" if subnatid=="4-Upper Urban"|subnatid=="5-Upper Rural" - replace subnatid="4-Upper" if subnatid=="4 - Upper Urban"|subnatid=="5 - Upper Rural" - replace subnatid="6-Borders" if subnatid=="6-Borders Urban"|subnatid=="7-Borders Rural" - replace subnatid="6-Borders" if subnatid=="6 - Borders Urban"|subnatid=="7 - Borders Rural" - local oklist subnatid - } - if 
"`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2017 | `year'==2015) { - if "`lvlvar'"=="subnatid1" { - replace subnatid1="1-Metropolitan" if subnatid1=="1-Metropolitan" - replace subnatid1="2-Lower" if subnatid1=="2-Lower Urban"|subnatid1=="3-Lower Rural" - replace subnatid1="4-Upper" if subnatid1=="4-Upper Urban"|subnatid1=="5-Upper Rural" - replace subnatid1="6-Borders" if subnatid1=="6-Borders Urban"|subnatid1=="7-Borders Rural" - local oklist subnatid1 - } - if "`lvlvar'"=="subnatid" local oklist subnatid - } - - if "`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2019) { - if "`lvlvar'"=="subnatid" { - replace subnatid="1-Metropolitan" if subnatid=="1-Metropolitan" - replace subnatid="2-Lower" if subnatid=="2-Lower Urban"|subnatid=="3-Lower Rural" - replace subnatid="4-Upper" if subnatid=="4-Upper Urban"|subnatid=="5-Upper Rural" - replace subnatid="6-Borders" if subnatid=="6-Borders Urban"|subnatid=="7-Borders Rural" - local oklist subnatid - } - } - - if "`code'"=="FIN" & "`survname'"=="EU-SILC" & (`year'>=2008 & `year'<=2010) { - replace subnatid="4-FI1C" if subnatid=="1-FI18" - local oklist subnatid - } - - if "`code'"=="CIV" & "`survname'"=="ENV" & `year'==2015 { - *decode gaul_adm1, gen(gaul_adm1_str) - *local oklist gaul_adm1_str - } - - if "`code'"=="CIV" & "`survname'"=="EHCVM" & (`year'==2018| `year'==2021) { - replace subnatid = trim(proper(lower( subnatid))) - gen gaul_adm1_str = "" - replace gaul_adm1_str="Folon" if subnatid=="10 - Kabadougou" - replace gaul_adm1_str="Folon" if subnatid=="24 - Folon" - replace gaul_adm1_str="Tchologo" if subnatid=="20 - Bagoue" - replace gaul_adm1_str="Tchologo" if subnatid=="3 - Poro" - replace gaul_adm1_str="Tchologo" if subnatid=="32 - Tchologo" - replace gaul_adm1_str="Hambol" if subnatid=="28 - Hambol" - replace gaul_adm1_str="Hambol" if subnatid=="4 - Gbeke" - replace gaul_adm1_str="Bounkani" if subnatid=="23 - Bounkani" - replace gaul_adm1_str="Bounkani" if subnatid=="8 - Gontougo" - replace 
gaul_adm1_str="Sud-Comoe" if subnatid=="13 - Sud-Comoe" - replace gaul_adm1_str="Sud-Comoe" if subnatid=="5 - Indenie-Djuablin" - replace gaul_adm1_str="District autonome D'abidjan" if subnatid=="1 - Autonome D'Abidjan" - replace gaul_adm1_str="District autonome de Yamoussou" if subnatid=="7 - Yamoussoukro" - replace gaul_adm1_str="Goh" if subnatid=="15 - LÔH-Djiboua" - replace gaul_adm1_str="Goh" if subnatid=="17 - GÔH" - replace gaul_adm1_str="Moronou" if subnatid=="11 - N'Zi" - replace gaul_adm1_str="Moronou" if subnatid=="21 - Belier" - replace gaul_adm1_str="Moronou" if subnatid=="29 - Iffou" - replace gaul_adm1_str="Moronou" if subnatid=="33 - Moronou" - replace gaul_adm1_str="La Me" if subnatid=="16 - Agneby-Tiassa" - replace gaul_adm1_str="La Me" if subnatid=="26 - Grands-Ponts" - replace gaul_adm1_str="La Me" if subnatid=="30 - La Me" - replace gaul_adm1_str="Guemon" if subnatid=="18 - Cavally" - replace gaul_adm1_str="Guemon" if subnatid=="27 - Guemon" - replace gaul_adm1_str="Guemon" if subnatid=="6 - Tonkpi" - replace gaul_adm1_str="Marahoue" if subnatid=="12 - Marahoue" - replace gaul_adm1_str="Marahoue" if subnatid=="2 - Haut-Sassandra" - replace gaul_adm1_str="Bere" if subnatid=="14 - Worodougou" - replace gaul_adm1_str="Bere" if subnatid=="19 - Bafing" - replace gaul_adm1_str="Bere" if subnatid=="22 - Bere" - replace gaul_adm1_str="Nawa" if subnatid=="25 - GbÔKle" - replace gaul_adm1_str="Nawa" if subnatid=="31 - Nawa" - replace gaul_adm1_str="Nawa" if subnatid=="9 - San-Pedro" - - local oklist gaul_adm1_str - } - - if "`code'"=="COM" & "`survname'"=="EESIC" & `year'==2013 { //ok - replace subnatid="1 - Moroni" if subnatid=="2 - Reste Ngazidja" - local oklist subnatid - } - /* - if "`file'"=="SSA_GMB_2015_IHS_LN2018_IND.dta" { - replace subnatid="6 – Kuntaur" if subnatid=="7 – Janjanbureh" - } - */ - - if "`code'"=="NAM" & "`survname'"=="NHIES" & `year'==2015 { //ok - replace subnatid="4-kavango east" if subnatid=="5-kavango west" - local oklist 
subnatid - } - - if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2018 { //ok - replace subnatid2="51-Western Area" if subnatid2=="51-Western Area Rural" | subnatid2=="52-Western Area Urban" - replace subnatid2="21–Bombali/32–Karene" if subnatid2=="21-Bombali" | subnatid2=="32-Karene" - replace subnatid2="22–Falaba/23–Koinadugu" if subnatid2=="22-Falaba" | subnatid2=="23-Koinadugu" - local oklist subnatid2 - } - - if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2011 { //ok - replace subnatid2="51-Western Area" if subnatid2=="41 - Western other" | subnatid2=="42 - Western urban (Freetown)" - local oklist subnatid2 - } - - if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2003 { //ok - replace subnatid="51-Western Area" if subnatid=="41 - Western other" | subnatid=="42 - Western urban" - local oklist subnatid - } - /* - if "`file'"=="SSA_GAB_2017_EGEP_LN2018_IND.dta" { //check - /* ALREADY DONE, strata is from SSAPOV module P - gen subnatid = strata - replace subnatid = "11-Ouest" if strata=="10-Reste Ouest Urbain" | strata=="11-Ouest Rural" - replace subnatid = "4-Nord" if strata=="4-Nord-Urbain" | strata=="5-Nord-Rural" - replace subnatid = "6-Sud" if strata=="6-Sud-Urbain" | strata=="7-Sud-Rural" - replace subnatid = "9-Est" if strata=="8-Reste Est Urbain" | strata=="9-Est Rural" - */ - } - - if "`file'"=="EAP_WSM_2008_HIES_LN2018_IND.dta" { - replace subnatid="Upolu" if subnatid=="1-Apia" - replace subnatid="Upolu" if subnatid=="2-NWU" - replace subnatid="Upolu" if subnatid=="3-RoU" - } - */ - /* - if "`code'"=="TLS" & "`survname'"=="TLSLS" & `year'==2014 { //ok - gen str subnatidx = "" - replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="01-Aileu" - replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="02-Dili" - replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="03-Ermera" - replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if subnatid1=="04-Ainaro" - replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if 
subnatid1=="06-Manufahi" - replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if subnatid1=="05-Manatuto" - replace subnatidx = "5-Oecussi" if subnatid1=="13-Oecussi" - replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="10-Bobonaro" - replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="11-Covalima" - replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="12-Liquica" - replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="09-Viqueque" - replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="08-Lautem" - replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="07-Baucau" - local oklist subnatidx - } - */ - if "`code'"=="MWI" { //ok - if `year'>=2010 { - replace `lvlvar'="105/107 Mzimba" if `lvlvar'=="105 - Mzimba" | `lvlvar'=="107 - Mzuzu City"|`lvlvar'=="107 – Mzuzu City" - replace `lvlvar'="305/315 Blantyre" if `lvlvar'=="315 - Blantyre City" | `lvlvar'=="305 - Blantyre"| `lvlvar'=="315 – Blantyre City" - replace `lvlvar'="206/210 Lilongwe" if `lvlvar'=="210 - Lilongwe City" | `lvlvar'=="206 - Lilongwe"| `lvlvar'=="210 – Lilongwe City" - replace `lvlvar'="303/314 Zomba" if `lvlvar'=="303 - Zomba" | `lvlvar'=="314 - Zomba City" - replace `lvlvar'="303/314 Zomba" if `lvlvar'=="303 – Zomba Non-City" | `lvlvar'=="314 – Zomba City" - } - if `year'==1997 { - replace `lvlvar'="105/107 Mzimba" if `lvlvar'=="130 – Mzimba" | `lvlvar'=="131 – Mzuzu City" - replace `lvlvar'="305/315 Blantyre" if `lvlvar'=="304 – Blantyre Rural" | `lvlvar'=="305 – Blantyre City" - replace `lvlvar'="206/210 Lilongwe" if `lvlvar'=="223 – Lilongwe Rural" | `lvlvar'=="224 – Lilongwe City" - replace `lvlvar'="303/314 Zomba" if `lvlvar'=="306 – Zomba Rural" | `lvlvar'=="307 – Zomba City" - } - } - - if "`code'"=="TCD" & "`survname'"=="EHCVM" & `year'==2018 { //ok - replace subnatid="3-Borkou-Ennedi-Tibesti" if subnatid=="2 - Borkou" | subnatid=="20 - Ennedi Ouest" - } - - /* - if 
"`file'"=="LAC_BOL_2018_EH_LN2018_IND.dta" { - replace subnatid2 = trim(subnatid2) - replace subnatid2="8/9 Beni and Pando" if subnatid2=="8 - Beni" | subnatid2=="9 - Pando" - } - */ - - if "`code'"=="DOM" & ("`survname'"=="ECNFT-Q03"|"`survname'"=="ENFT") { //ok - replace `lvlvar'= trim(`lvlvar') - replace `lvlvar'="2 - Norte o Cibao" if `lvlvar'=="2 - Cibao Norte" | `lvlvar'=="3 - Cibao Sur" | `lvlvar'=="4 - Cibao Nordeste" | `lvlvar'=="5 - Cibao Noroeste" - replace `lvlvar'="3 - Sur" if `lvlvar'=="6 - Valdesia" | `lvlvar'=="7 - El Valle" | `lvlvar'=="8 - Enriquillo" - replace `lvlvar'="4 - Este" if `lvlvar'=="9 - Higuamo" | `lvlvar'=="10 - Yuma" - } - - if "`code'"=="NER" & "`survname'"=="EHCVM" & `year'==2021 { //ok - replace subnatid="1 - Agadez" if subnatid=="1 - gadez" - } - - if "`code'"=="BRA" { //ok - replace subnatid2="43 - Rio Grande do Sul" if subnatid2== "43 - Rio Grande do Norte" - } - - if "`code'"=="BWA" { //ok - replace subnatid="3 – Other Towns" if subnatid== "3 - Other cities & towns" - replace subnatid="4 – South East" if subnatid== "4 - Rural South-East" - replace subnatid="5 – North East" if subnatid== "5 - Rural North-East" - replace subnatid="6 – North West" if subnatid== "6 - Rural North-West" - replace subnatid="7 – South West" if subnatid== "7 - Rural South-West" - } - - if "`code'"=="CAF" { //ok - replace subnatid2="3 - Yadé" if subnatid2== "3 - Yade" - } - - if "`code'"=="BFA" { //ok - replace subnatid="1 - Boucle du Mouhoun" if subnatid== "1 - Boucle du Mouhoum" - } - - if "`code'"=="CMR" & "`survname'"=="ECAM-V" & `year'==2021 { //ok - egen sample3 = sieve(subnatid), keep(a) - gen sample3UP = upper(sample3) - replace subnatid="10 -Sud-Oues" if subnatid== "11 - sud-ouest" - replace subnatid="2 - Centre" if sample3UP== "YAOUND" - replace subnatid="6 - Littoral" if subnatid== "3 - douala" - drop sample3 sample3UP - } - - if "`code'"=="IRN" { - replace `lvlvar' = "10 - Isfahan" if `lvlvar'=="11 - Esfahan" - replace `lvlvar' = "11 - 
Sistan" if `lvlvar'=="12 - SistanBalouchestan" - replace `lvlvar' = "12 - Kurdestan" if `lvlvar'=="13 - Kordestan" - replace `lvlvar' = "13 - Hamadan" if `lvlvar'=="14 - Hamedan" - replace `lvlvar' = "14 - Bakhtiari" if `lvlvar'=="15 - CharmahalBakhtiari" - replace `lvlvar' = "17 - Kohkiloyeh" if `lvlvar'=="18 - KohkilouyeBoyerahamad" - replace `lvlvar' = "18 - Bushehr" if `lvlvar'=="19 - Boushehr" - replace `lvlvar' = "28 - N. Khorasan" if `lvlvar'=="29 - KhorasanShomali" - replace `lvlvar' = "S. Khorasan" if `lvlvar'=="30 - KhorasanJonoubi" - replace `lvlvar' = "3 - E.Azarbaijan" if `lvlvar'=="4 - AzarbaijanSharghi" - replace `lvlvar' = "4 - W.Azarbaijan" if `lvlvar'=="5 - AzarbaijanGharbi" - replace `lvlvar' = "6 - Khuzestan" if `lvlvar'=="7 - Kouzestan" - } - - if "`code'"=="KAZ" { - replace `lvlvar' = "51 - South_Kaz" if `lvlvar'=="61 - Turkistan" - } - - if "`code'"=="KGZ" { - replace `lvlvar' = "2-Issyk-kul" if `lvlvar'=="2-Issyk-ku" - replace `lvlvar' = "3-Jalal-Abad" if `lvlvar'=="3-Jalalaba" - } - - if "`code'"=="LAO" { - replace `lvlvar' = "18-Xaysomboon" if `lvlvar'=="18-Xaisomboun" - } - - if "`code'"=="MAR" { //old shapefile for 2000 and 2006 - /* - replace `lvlvar' = "" if `lvlvar'=="1 - Regions sahariennes" - replace `lvlvar' = "" if `lvlvar'=="10 - Tadla-Azilal" - replace `lvlvar' = "" if `lvlvar'=="11 - Meknes-Tafilalet" - replace `lvlvar' = "" if `lvlvar'=="12 - Fes-Boulemane-Taounate" - replace `lvlvar' = "" if `lvlvar'=="13 - Taza-Hoceima" - replace `lvlvar' = "1 - Tanger-Tetouan-Al Hoceima" if `lvlvar'=="14 - Tanger-Tetouan" - replace `lvlvar' = "9 - Souss-Massa" if `lvlvar'=="2 - Souss- Massa-Draa" - replace `lvlvar' = "5 - Beni Mellal-Khenifra" if `lvlvar'=="3 - Gharb-Chrarda-Beni Hssen" - replace `lvlvar' = "" if `lvlvar'=="4 - Chaouia-Ouardigha" - replace `lvlvar' = "7 - Marrakech-Safi" if `lvlvar'=="5 - Marrakech-Tensift-Haouz" - replace `lvlvar' = "2 - Oriental" if `lvlvar'=="6 - Oriental" - replace `lvlvar' = "6 - Grand Casablanca" if 
`lvlvar'=="7 - Grand Casablanca" - replace `lvlvar' = "4 - Rabat-Salé-Kenitra" if `lvlvar'=="8 -Rabat-Sale-Zemmour-Zaer" - replace `lvlvar' = "" if `lvlvar'=="9 - Doukkala-Abda" - */ - } - - if "`code'"=="MOZ" & "`survname'"=="IOF" & `year'==2022 { //oklist - replace `lvlvar' = "Maputo Cidade" if `lvlvar'=="Cidade de Maputo" - } - - if "`code'"=="MLI" & "`survname'"=="EHCVM" & `year'==2021 { //ok - egen sample3 = sieve(subnatid), keep(a) - gen sample3UP = upper(sample3) - replace subnatid="4 - Sgou" if sample3UP== "SEGOU" - drop sample3 sample3UP - } - - if "`code'"=="MMR" & `year'==2017 { - replace `lvlvar' = "14-Ayeyawaddy" if trim(`lvlvar')=="14-Ayeyawaddy" - replace `lvlvar' = "2-Kayar" if trim(`lvlvar')=="2-Kayar" - replace `lvlvar' = "6-Taninthayi" if trim(`lvlvar')=="6-Taninthayi" - } - - if "`code'"=="MRT" { - replace `lvlvar' = "1 - Hodh El Charghi" if `lvlvar'=="1 - Hodh charghy" - replace `lvlvar' = "2 - Hodh El Gharbi" if `lvlvar'=="2 - Hodh Gharby" - replace `lvlvar' = "11 - Tiris Zemmour" if `lvlvar'=="11 - Tirs-ezemour" - replace `lvlvar' = "8 - Dakhlet Nouadhibou" if `lvlvar'=="8 - Dakhlett Nouadibou" - } - - if "`code'"=="PRY" & `year'>=2001 & `year'<=2017 { - egen sample3 = sieve(`lvlvar'), keep(a) - gen sample3UP = upper(sample3) - replace `lvlvar'="20 - Resto" if sample3UP== "CONCEPCION" - replace `lvlvar'="20 - Resto" if sample3UP== "NEEMBUCU" - replace `lvlvar'="20 - Resto" if sample3UP== "AMAMBAY" - replace `lvlvar'="20 - Resto" if sample3UP== "CANINDEYU" - replace `lvlvar'="20 - Resto" if sample3UP== "PRESIDENTEHAYES" - replace `lvlvar'="20 - Resto" if sample3UP== "CORDILLERA" - replace `lvlvar'="20 - Resto" if sample3UP== "GUAIRA" - replace `lvlvar'="20 - Resto" if sample3UP== "MISIONES" - replace `lvlvar'="20 - Resto" if sample3UP== "PARAGUARI" - drop sample3 sample3UP - } - - if "`code'"=="POL" & "`survname'"=="HBS" { - egen sample3 = sieve(`lvlvar'), keep(a) - gen sample3UP = upper(sample3) - *gen subnatid2x = `lvlvar' - replace 
`lvlvar'="1-PL2" if sample3UP=="MALOPOLSKIE" - replace `lvlvar'="1-PL2" if sample3UP=="SLASKIE" - replace `lvlvar'="2-PL4" if sample3UP=="WIELKOPOLSKIE" - replace `lvlvar'="2-PL4" if sample3UP=="ZACHODNIOPOMORSKIE" - replace `lvlvar'="2-PL4" if sample3UP=="LUBUSKIE" - replace `lvlvar'="3-PL5" if sample3UP=="DOLNOSLASKIE" - replace `lvlvar'="3-PL5" if sample3UP=="OPOLSKIE" - replace `lvlvar'="4-PL6" if sample3UP=="KUJAWSKOPOMORSKIE" - replace `lvlvar'="4-PL6" if sample3UP=="WARMINSKOMAZURSKIE" - replace `lvlvar'="4-PL6" if sample3UP=="POMORSKIE" - replace `lvlvar'="5-PL7" if sample3UP=="LODZKIE" - replace `lvlvar'="5-PL7" if sample3UP=="SWIETOKRZYSKIE" - replace `lvlvar'="6-PL8" if sample3UP=="LUBELSKIE" - replace `lvlvar'="6-PL8" if sample3UP=="PODKARPACKIE" - replace `lvlvar'="6-PL8" if sample3UP=="PODLASKIE" - replace `lvlvar'="7-PL9" if sample3UP=="MAZOWIECKIE" - local oklist `lvlvar' - } - - if "`code'"=="POL" & "`survname'"=="EU-SILC" & `year'<=2017 { - replace `lvlvar'="5-PL7" if `lvlvar'=="1-PL1" - replace `lvlvar'="1-PL2" if `lvlvar'=="2-PL2" - replace `lvlvar'="6-PL8" if `lvlvar'=="3-PL3" - replace `lvlvar'="2-PL4" if `lvlvar'=="4-PL4" - replace `lvlvar'="3-PL5" if `lvlvar'=="5-PL5" - replace `lvlvar'="4-PL6" if `lvlvar'=="6-PL6" - } - - if "`code'"=="ROU" { - replace `lvlvar'="2-RO2" if `lvlvar'=="1 - North-East" | `lvlvar'=="1-North-East" - replace `lvlvar'="2-RO2" if `lvlvar'=="2 - South-East" | `lvlvar'=="2-South-East" - replace `lvlvar'="3-RO3" if `lvlvar'=="3 - South" | `lvlvar'=="3-South" - replace `lvlvar'="4-RO4" if `lvlvar'=="4 - South-West" | `lvlvar'=="4-South-West" - replace `lvlvar'="4-RO4" if `lvlvar'=="5 - West" | `lvlvar'=="5-West" - replace `lvlvar'="1-RO1" if `lvlvar'=="6 - North-West" | `lvlvar'=="6-North-West" - replace `lvlvar'="1-RO1" if `lvlvar'=="7 - Centre" | `lvlvar'=="7-Centre" - replace `lvlvar'="3-RO3" if `lvlvar'=="8 - Bucharest-Ilfov" | `lvlvar'=="8-Bucharest-Ilfov" - - replace `lvlvar'="2-RO2" if `lvlvar'=="3-RO21" - 
replace `lvlvar'="2-RO2" if `lvlvar'=="4-RO22" - replace `lvlvar'="3-RO3" if `lvlvar'=="5-RO31" - replace `lvlvar'="4-RO4" if `lvlvar'=="7-RO41" - replace `lvlvar'="4-RO4" if `lvlvar'=="8-RO42" - replace `lvlvar'="1-RO1" if `lvlvar'=="1-RO11" - replace `lvlvar'="1-RO1" if `lvlvar'=="2-RO12" - replace `lvlvar'="3-RO3" if `lvlvar'=="6-RO32" - } - - if "`code'"=="THA" { - replace `lvlvar'="10-Bangkok" if `lvlvar'=="10" - replace `lvlvar'="11-Samut Prakan" if `lvlvar'=="11" - replace `lvlvar'="12-Nonthaburi" if `lvlvar'=="12" - replace `lvlvar'="13-Pathum Thani" if `lvlvar'=="13" - replace `lvlvar'="14-Phra Nakhon Si Ayu" if `lvlvar'=="14" - replace `lvlvar'="15-Ang Thong" if `lvlvar'=="15" - replace `lvlvar'="16-Lop Buri" if `lvlvar'=="16" - replace `lvlvar'="17-Sing Buri" if `lvlvar'=="17" - replace `lvlvar'="18-Chai Nat" if `lvlvar'=="18" - replace `lvlvar'="19-Saraburi" if `lvlvar'=="19" - replace `lvlvar'="20-Chon Buri" if `lvlvar'=="20" - replace `lvlvar'="21-Rayong" if `lvlvar'=="21" - replace `lvlvar'="22-Chanthaburi" if `lvlvar'=="22" - replace `lvlvar'="23-Trat" if `lvlvar'=="23" - replace `lvlvar'="24-Chachoengsao" if `lvlvar'=="24" - replace `lvlvar'="25-Prachin Buri" if `lvlvar'=="25" - replace `lvlvar'="26-Nakhon Nayok" if `lvlvar'=="26" - replace `lvlvar'="27-Sa Kaeo" if `lvlvar'=="27" - replace `lvlvar'="30-Nakhon Ratchasima" if `lvlvar'=="30" - replace `lvlvar'="31-Buri Ram" if `lvlvar'=="31" - replace `lvlvar'="32-Surin" if `lvlvar'=="32" - replace `lvlvar'="33-Si Sa Ket" if `lvlvar'=="33" - replace `lvlvar'="34-Ubon Ratchathani" if `lvlvar'=="34" - replace `lvlvar'="35-Yasothon" if `lvlvar'=="35" - replace `lvlvar'="36-Chaiyaphum" if `lvlvar'=="36" - replace `lvlvar'="37-Am Nat Charoen" if `lvlvar'=="37" - replace `lvlvar'="38-Bueng Kan" if `lvlvar'=="38" - replace `lvlvar'="39-Nong Bua Lam Phu" if `lvlvar'=="39" - replace `lvlvar'="40-Khon Kaen" if `lvlvar'=="40" - replace `lvlvar'="41-Udon Thani" if `lvlvar'=="41" - replace `lvlvar'="42-Loei" if 
`lvlvar'=="42" - replace `lvlvar'="43-Nong Khai" if `lvlvar'=="43" - replace `lvlvar'="44-Maha Sarakham" if `lvlvar'=="44" - replace `lvlvar'="45-Roi Et" if `lvlvar'=="45" - replace `lvlvar'="46-Kalasin" if `lvlvar'=="46" - replace `lvlvar'="47-Sakon Nakhon" if `lvlvar'=="47" - replace `lvlvar'="48-Nakhon Phanom" if `lvlvar'=="48" - replace `lvlvar'="49-Mukdahan" if `lvlvar'=="49" - replace `lvlvar'="50-Chiang Mai" if `lvlvar'=="50" - replace `lvlvar'="51-Lamphun" if `lvlvar'=="51" - replace `lvlvar'="52-Lampang" if `lvlvar'=="52" - replace `lvlvar'="53-Uttaradit" if `lvlvar'=="53" - replace `lvlvar'="54-Phrae" if `lvlvar'=="54" - replace `lvlvar'="55-Nan" if `lvlvar'=="55" - replace `lvlvar'="56-Phayao" if `lvlvar'=="56" - replace `lvlvar'="57-Chiang Rai" if `lvlvar'=="57" - replace `lvlvar'="58-Mae Hong Son" if `lvlvar'=="58" - replace `lvlvar'="60-Nakhon Sawan" if `lvlvar'=="60" - replace `lvlvar'="61-Uthai Thani" if `lvlvar'=="61" - replace `lvlvar'="62-Kamphaeng Phet" if `lvlvar'=="62" - replace `lvlvar'="63-Tak" if `lvlvar'=="63" - replace `lvlvar'="64-Sukhothai" if `lvlvar'=="64" - replace `lvlvar'="65-Phitsanulok" if `lvlvar'=="65" - replace `lvlvar'="66-Phichit" if `lvlvar'=="66" - replace `lvlvar'="67-Phetchabun" if `lvlvar'=="67" - replace `lvlvar'="70-Ratchaburi" if `lvlvar'=="70" - replace `lvlvar'="71-Kanchanaburi" if `lvlvar'=="71" - replace `lvlvar'="72-Suphun Buri" if `lvlvar'=="72" - replace `lvlvar'="73-Nakhon Pathom" if `lvlvar'=="73" - replace `lvlvar'="74-Samut Sakhon" if `lvlvar'=="74" - replace `lvlvar'="75-Samut Songkhram" if `lvlvar'=="75" - replace `lvlvar'="76-Phetchaburi" if `lvlvar'=="76" - replace `lvlvar'="77-Prachuap Khiri Kha" if `lvlvar'=="77" - replace `lvlvar'="80-Nakhon Si Thammara" if `lvlvar'=="80" - replace `lvlvar'="81-Krabi" if `lvlvar'=="81" - replace `lvlvar'="82-Phangnga" if `lvlvar'=="82" - replace `lvlvar'="83-Phuket" if `lvlvar'=="83" - replace `lvlvar'="84-Surat Thani" if `lvlvar'=="84" - replace 
`lvlvar'="85-Ranong" if `lvlvar'=="85" - replace `lvlvar'="86-Chumphon" if `lvlvar'=="86" - replace `lvlvar'="90-Songkhla" if `lvlvar'=="90" - replace `lvlvar'="91-Satun" if `lvlvar'=="91" - replace `lvlvar'="92-Trang" if `lvlvar'=="92" - replace `lvlvar'="93-Phatthalung" if `lvlvar'=="93" - replace `lvlvar'="94-Pattani" if `lvlvar'=="94" - replace `lvlvar'="95-Yala" if `lvlvar'=="95" - replace `lvlvar'="96-Narathiwat" if `lvlvar'=="96" - - replace `lvlvar' = "14-Phra Nakhon Si Ayu" if `lvlvar'=="14-Phra Nakhon Si Ayudhya" - replace `lvlvar' = "11-Samut Prakan" if `lvlvar'=="11-Samut Prakarn" - replace `lvlvar' = "23-Trat" if `lvlvar'=="23-Trad" - replace `lvlvar' = "25-Prachin Buri" if `lvlvar'=="25-Phachinburi" - replace `lvlvar' = "72-Suphun Buri" if `lvlvar'=="72-Suphanburi" - replace `lvlvar' = "75-Samut Songkhram" if `lvlvar'=="75-Samut Songkham" - replace `lvlvar' = "77-Prachuap Khiri Kha" if `lvlvar'=="77-Prachuap Khilikhan" - replace `lvlvar' = "80-Nakhon Si Thammara" if `lvlvar'=="80-Nakhon Si Thammarat" - - } - - if "`code'"=="TUN" { - replace `lvlvar' = "2 - NE" if `lvlvar'=="2 - Nord Est" - replace `lvlvar' = "3 - NW" if `lvlvar'=="3 - Nord Ouest" - replace `lvlvar' = "4 - CenterE" if `lvlvar'=="4 - Centre Est" - replace `lvlvar' = "5 - CenterW" if `lvlvar'=="5 - Centre Ouest" - replace `lvlvar' = "6 - SE" if `lvlvar'=="6 - Sud Est" - replace `lvlvar' = "7 - SW" if `lvlvar'=="7 - Sud ouest" - } - - if "`code'"=="UKR" { - replace `lvlvar' = "21 – Transcarpathian" if `lvlvar'=="7 – Transcarpathian" - } - - if "`code'"=="UZB" { - replace `lvlvar' = "4-Jizzak" if `lvlvar'=="1708 - Jizzakh" - replace `lvlvar' = "11-Tashkent" if `lvlvar'=="1727 - Tashkent (region)" - replace `lvlvar' = "13-Khorezm" if `lvlvar'=="1733 - Khorasm" - } - - if "`code'"=="VNM" { - replace `lvlvar' = "2-Midlands and Northern Mountainous Areas" if `lvlvar'=="1-North Mountain and Midland" - replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="3-Northern and 
Coastal Central region_num" - replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="3-North Central area and South Central Coastal area" - replace `lvlvar' = "6-Mekong Delta" if `lvlvar'=="6-Mekong River Delta" - //8 regions into 6 regions - replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="Central North" | `lvlvar'=="Central South" - replace `lvlvar' = "4-Central Highlands" if `lvlvar'=="Highlands" - replace `lvlvar' = "6-Mekong Delta" if `lvlvar'=="Mekong River Delta" - replace `lvlvar' = "2-Midlands and Northern Mountainous" if `lvlvar'=="Northeast" - replace `lvlvar' = "2-Midlands and Northern Mountainous" if `lvlvar'=="Northwest" - } - - if "`code'"=="WSM" { - replace `lvlvar' = "1-Apia Urban Areas" if `lvlvar'=="1-Apia" - replace `lvlvar' = "2-North West Upolu" if `lvlvar'=="2-NWU" - replace `lvlvar' = "3-Rest of Upolu" if `lvlvar'=="3-RoU" - } - - if "`code'"=="XKX" { - egen sample3 = sieve(`lvlvar'), keep(a) - gen sample3UP = upper(sample3) - replace `lvlvar'="1 -Gjakovע" if sample3UP== "GJAKOVE" - replace `lvlvar'="3 - Mitrovic" if sample3UP== "MITROVICE" - replace `lvlvar'="6 - Prishtin" if sample3UP== "PRISHTINE" - replace `lvlvar'="4 - Pej" if sample3UP== "PEJE" - drop sample3 sample3UP - } - - if "`code'"=="AZE" { - egen sample3 = sieve(`lvlvar'), keep(a) - gen sample3UP = upper(sample3) - replace `lvlvar'="" if sample3UP== "GJAKOVE" - - replace `lvlvar'="1 – Absheron" if sample3UP== "ABERONQUBA"|sample3UP== "ABSHERON"|sample3UP== "ABSHERONGUBA" - replace `lvlvar'="6 – Aran" if sample3UP== "ARANWITHYUHKARABAH" - replace `lvlvar'="8 – Baku City" if sample3UP== "BAKU"|sample3UP== "BAKUCITY" - replace `lvlvar'="9 – Daghlig Shirvan" if sample3UP== "DAGLIQSHIRVAN" | sample3UP== "SHIRVAN" |sample3UP== "IRVAN"|sample3UP== "DAGHLIGSHIRVAN" - replace `lvlvar'="2 – Ganja-Gazakh" if sample3UP== "GANJAGAZAKH" - replace `lvlvar'="5 – Guba-Khachmaz" if sample3UP== "GUBAHACHMAZ" - replace `lvlvar'="4 – Lankaran" if 
sample3UP== "LANKARANASTARA" - - replace `lvlvar'="NA" if sample3UP== "MUANSALYAN"|sample3UP== "MUGHANSALYAN" - replace `lvlvar'="0 – Nakhchyvan" if sample3UP== "NAKHCHIVAN"|sample3UP== "NAKHCHYVANAR" - replace `lvlvar'="7 – Yukhary Garabagh" if sample3UP== "QARABAGHMIL"|sample3UP== "QARABAMIL" - replace `lvlvar'="3 – Shaki-Zagatala" if sample3UP== "SHAKIZAGATALA"|sample3UP== "SHEKIZAGATALA" - replace `lvlvar'="7 – Yukhary Garabagh" if sample3UP== "YUHARSKARABAH" | sample3UP== "YUKHARYGARABAGH" - drop sample3 sample3UP - } - - } - - //urban and rural - local urbvar - cap des urban - if _rc==0 { - ta urban - if r(N)>0 { - cap decode urban, gen(_urban_) - if _rc~=0 tostring urban, gen(_urban_) - replace _urban_ = trim(_urban_) - gen reg_rural = `oklist' + "*_*" + _urban_ - local urbvar reg_rural - } - } //urban - - local oklist2 _all_ `oklist' `urbvar' - local oklist2 : list uniq oklist2 - - **************************************************** - **Dimension 1: Poverty - **************************************************** - gen double gallT_ppp = welfare/cpi2017/icp2017/365 - drop if gallT_ppp<0 - replace gallT_ppp = 0.25 if gallT_ppp<0.25 - - //reweight to lineup year pop - su year [aw=`wgt'] - local initial = r(sum_w) - gen double pop = (`wgt') * (`pcnpop'/`initial') - - //recalculate the 2.15 line for 2.15 poverty - qui foreach num of numlist ${plinelist} { - if `pcnpov`num''==0 { - local pline`num' = `=`num'/100' - } - else { - _pctile gallT_ppp [aw=pop], p(`pcnpov`num'') - local pline`num' = r(r1) - } - - gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. 
- gen pline`num' = `pline`num'' - } //num - - **************************************************** - **Dimension 2: Access to Education - **************************************************** - if `edu_flag'==1 { //data in survey - **1a) Indicator: have no one with primary completion (completed 15+) - //All adults - global eduage 15 - if "`=upper("`code'")'" == "UKR" { //2014 - global eduage 2 //2019 - drop age - *ren agecat age - //check whether it is string or not - //2019 only - gen age = 1 if agecat=="1 - Up to 18 years" - replace age = 2 if agecat=="2 - 18 - 35 years old" - replace age = 3 if agecat=="3 - 36 - 55 years old" - replace age = 4 if agecat=="4 - 56 - 59 years old" - replace age = 5 if agecat=="5 - 60 years and older" - } - - if "`=upper("`code'")'" == "NRU" { //2012 - global eduage 4 //2019 - drop age - *ren agecat age - //check whether it is string or not - //2012 only - gen age = . - replace age = 1 if agecat=="0-4 years" - replace age = 2 if agecat=="5-9 years" - replace age = 3 if agecat=="10-14 years" - replace age = 4 if agecat=="15-19 years" - replace age = 5 if agecat=="20-24 years" - replace age = 6 if agecat=="25-29 years" - replace age = 7 if agecat=="30-34 years" - replace age = 8 if agecat=="35-39 years" - replace age = 9 if agecat=="40-44 years" - replace age = 10 if agecat=="45-49 years" - replace age = 11 if agecat=="50-54 years" - replace age = 12 if agecat=="55-59 years" - replace age = 13 if agecat=="60-64 years" - replace age = 14 if agecat=="65-69 years" - replace age = 15 if agecat=="70-74 years" - replace age = 16 if agecat=="75 and older" - } - - local eduflag = 0 - cap gen educat5 = . - cap gen educat7 = . - - cap su educat7 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) - } - else { //educat5 - cap su educat5 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. 
- gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) - } - else { //educat4 - cap su educat4 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) - } - else { //no education available - local eduflag = 1 - } - } - } - - if `eduflag'==0 { - gen temp2a = 1 if age>=$eduage & age~=. - bys hhid: egen educ_com_size = sum(temp2a) - bys hhid: egen temp3 = sum(temp2) - bys hhid: egen temp3c = sum(temp2c) - gen dep_educ_com = 0 - replace dep_educ_com = 1 if temp3==0 - gen dep_educ_com_lb = 0 - replace dep_educ_com_lb = 1 if temp3c==0 - ren temp3 educ_com_sum - ren temp3c educ_com_sum_lb - drop temp2 temp2a temp2c - } - else { - gen dep_educ_com = . - gen dep_educ_com_lb = . - gen educ_com_sum = . - gen educ_com_sum_lb = . - gen educ_com_size = . - } - - gen educ_com_appl = 1 - replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) - gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. - bys hhid: egen educ_com_mis = sum(temp2b) - drop temp2b - gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. - - la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" - la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" - la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" - la var educ_com_appl_miss "School completion is applicable households but missing completion" - cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss - } //edu flag ==1 - if `edu_flag'==2 { //universal coverage - gen dep_educ_com = 0 - } //`edu_flag'==2 - if `edu_flag'==3 { //fused in below - gen dep_educ_com = . - } - if `edu_flag'==4 { //fused in below - gen dep_educ_com = . 
- gen unesco_flag = 1 - } - - **************************************************** - **Dimension 3: Access to Electricity - **************************************************** - if `elec_flag'==1 { //data in survey - cap des electricity - if _rc==0 gen dep_infra_elec = electricity==0 if electricity~=. - else local elec_flag 3 - *else gen dep_infra_elec = . - } - if `elec_flag'==2 { //universal - gen dep_infra_elec = 0 - } - if `elec_flag'==3 { - gen dep_infra_elec = . - } - if `elec_flag'==4 { - gen dep_infra_elec = . - gen elec_flag = 1 - } - la var dep_infra_elec "Deprived if HH has No access to electricity" - - **************************************************** - **Dimension 4: Access to Water - **************************************************** - if `water_flag'==1 { - cap des imp_wat_rec - if _rc==0 gen dep_infra_impw = imp_wat_rec==0 if imp_wat_rec~=. - *else gen dep_infra_impw = . - else local water_flag 3 - } - if `water_flag'==2 { - gen dep_infra_impw = 0 - } - if `water_flag'==3 { - gen dep_infra_impw = . - } - if `water_flag'==4 { - gen dep_infra_impw = . - gen water_flag = 1 - } - la var dep_infra_impw "Deprived if HH has No access to improved water" - - **************************************************** - **Dimension 5: Access to social protection - **************************************************** - if `sp_flag'==1 { - //nothing yet from survey - } - if `sp_flag'==2 { - gen dep_sp = 0 - } - if `sp_flag'==3 { - gen dep_sp = . - } - if `sp_flag'==4 { - gen dep_sp = . - gen sp_flag = 1 - - } - **************************************************** - **Dimension 6: Access to financial inclusion - **************************************************** - if `findex_flag'==1 { // from surveys - cap des fin_account - if _rc==0 gen dep_fin = fin_account==0 if fin_account~=. - else local findex_flag 3 - } - if `findex_flag'==2 { - gen dep_fin = 0 - } - if `findex_flag'==3 { - gen dep_fin = . - } - if `findex_flag'==4 { - gen dep_fin = . 
- gen fin_flag = 1 - } - **************************************************** - cap gen rural = urban==0 - - //get 15+ population size by quintile or quintile/urban rural only when age is available. - forv a1=1(1)5 { - local n15q`a1'total = 1 - local n15q`a1'urban = 1 - local n15q`a1'rural = 1 - } - - qui if "`mod'"=="ALL" { - _ebin gallT_ppp [aw=pop], gen(q5ind) nq(5) - cap des age - if _rc==0 { - qui su age - if r(N)>0 { - gen tmp = age>=15 & age~=. - bys hhid (pid): egen n15 = total(tmp) - //`no_accountq`i'`nm'' `no_accountq`i'total' - forv a1=1(1)5 { - su n15 [aw=pop] if q5ind==`a1' - local n15q`a1'total = r(mean) - if `ct_urban'==1 { - su n15 [aw=pop] if q5ind==`a1' & urban==1 - local n15q`a1'urban = r(mean) - - su n15 [aw=pop] if q5ind==`a1' & urban==0 - local n15q`a1'rural = r(mean) - } //ct_urban - } //a1 - } //rN - } //age - cap drop q5ind tmp n15 - } //ALL - - //POP WEIGHT at HH level - to convert all data to HH level as data comes in as either individual or HH level data - set varabbrev off - cap isid hhid - if _rc==0 { - cap des pop - if _rc==0 gen double pop2 = pop - else gen double pop2 = weight_p - } - else { - cap des pop - if _rc==0 { - drop if welfare==. - drop if pop==. - bys hhid: egen double pop2 = total(pop) - } - else { - drop if welfare==. - drop if weight_p==. 
- bys hhid: egen double pop2 = total(weight_p) - } - duplicates drop hhid, force - } - set varabbrev on - ren pop popold - ren pop2 pop - - cap drop region - gen region = "`regn'" - cap drop survname - gen str survname = "`survname'" - local welfaretype : char _dta[welfaretype] - clonevar weight_use = pop - - //quintiles - _ebin gallT_ppp [aw=pop], gen(q5) nq(5) - gen test = 1 - - tempfile databfsim - save `databfsim', replace - - //Data fusion loop through random assignments - set seed 1234567 - clear - tempfile ctry1 ctry1ln - save `ctry1', replace emptyok - save `ctry1ln', replace emptyok - - noi display _n(1) - noi display in yellow "Number of simulations: $sim" _n(1) - noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") - - qui forv sim=1(1)$sim { - use `databfsim', clear - **************************************************** FUSION - - //Education - if `edu_flag'==3 { - if `unesco'==1 { - gen unesco_flag = 0 - if "`type_unesco'"=="Urb_rur" { //urban-rural only - foreach nm in urban rural { - cap drop _a1 - if (`unesco_`nm'' > 0) { - wsample test [aw=pop] if `nm'==1, percent(`unesco_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 if `nm'==1 - } - replace dep_educ_com = 1- _a1 if `nm'==1 - drop _a1 - } //urb-rul - } - else { //total country - local nm total - cap drop _a1 - if (`unesco_`nm'' > 0) { - wsample test [aw=pop] , percent(`unesco_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 - } - replace dep_educ_com = 1- _a1 - drop _a1 - } //types - } - else { //missing - gen unesco_flag = 1 - } - } - - //Electricity - if `elec_flag'==3 { - if `ged'==1 { - gen elec_flag = 0 - if "`type_ged'"=="Urb_rur" { //urban-rural only - foreach nm in urban rural { - cap drop _a1 - if (`ged_`nm'' > 0) { - wsample test [aw=pop] if `nm'==1, percent(`ged_`nm'') newvar(_a1) seed(`=1234567+`sim'') 
numsim(1) - } - else { - gen _a1 = 0 if `nm'==1 - } - replace dep_infra_elec = 1- _a1 if `nm'==1 - drop _a1 - } //urb-rul - } - else { //total country - local nm total - cap drop _a1 - if (`ged_`nm'' > 0) { - wsample test [aw=pop] , percent(`ged_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 - } - replace dep_infra_elec = 1- _a1 - drop _a1 - } //types - } - else { //missing - gen elec_flag = 1 - } - } //elec_flag - - //Water - if `water_flag'==3 { - if `jmp'==1 { - gen water_flag = 0 - if "`type_jmp'"=="Urb_rur" { //urban-rural only - foreach nm in urban rural { - cap drop _a1 - if (`w_imp_`nm'' > 0) { - wsample test [aw=pop] if `nm'==1, percent(`w_imp_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 if `nm'==1 - } - replace dep_infra_impw = 1- _a1 if `nm'==1 - drop _a1 - } //urb-rul - } - else { //total country - local nm total - cap drop _a1 - if (`w_imp_`nm'' > 0) { - wsample test [aw=pop] , percent(`w_imp_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 - } - replace dep_infra_impw = 1- _a1 - drop _a1 - } //types - } - else { //missing - gen water_flag = 1 - } - } //water_flag - - //Findex fusion - if `findex_flag'==3 { //findex access no_accountq`i'total - if `findex'==1 { - gen fin_flag = 0 - //Add adjustment of individual level estimate to HH level estimate. N is number of 15+ in the group (national, quintile, or quitile & urban/rural). 
When the data is without age to figure n15, N==1, just hh = ind^0.6 - //hh = ind^(0.6*N) - if "`type_findex'"=="Urb_rur" { //urban-rural quintiles - foreach nm in urban rural { - forv i=1(1)5 { - cap drop _a`i' - if (`no_accountq`i'`nm'' > 0) { - *wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`no_accountq`i'`nm'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - local adjfin = 100*((`=`no_accountq`i'`nm''/100')^(0.6*`n15q`i'`nm'')) - wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' & `nm'==1 - } - replace dep_fin = _a`i' if q5==`i' & `nm'==1 - drop _a`i' - } //i - } //urb-rul - } //urb-rul quintiles - else { //total quintiles - forv i=1(1)5 { - cap drop _a`i' - if (`no_accountq`i'total' > 0) { - *wsample test [aw=pop] if q5==`i', percent(`no_accountq`i'total') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - local adjfin = 100*((`=`no_accountq`i'total'/100')^(0.6*`n15q`i'total')) - wsample test [aw=pop] if q5==`i', percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' - } - replace dep_fin = _a`i' if q5==`i' - drop _a`i' - } //i - } //types - } //findex - else { //missing - gen fin_flag = 1 - } - } //findex flag - - //SP access - if `sp_flag'==3 { - if `aspire_sp'==1 { - gen sp_flag = 0 - - if "`type_aspire'"=="Quintile" { - forv i=1(1)5 { - cap drop _a`i' - if (`_pop_All_SPL_q`i'' > 0) { - wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' - } - replace dep_sp = 1-_a`i' if q5==`i' - drop _a`i' - } //i - } //quintile type - else { //type_aspire == National - cap drop _a1 - if (`_pop_All_SPL' > 0) { - wsample test [aw=pop], percent(`_pop_All_SPL') newvar(_a1) seed(`=1234567+`sim'') numsim(1) - } - else { - gen _a1 = 0 - } - replace dep_sp = 1-_a1 - drop _a1 - } - } //aspire_sp==1 - else { 
//missing - gen sp_flag = 1 - } - } //sp_flag==3 - **************************************************** END FUSION - - //multidimensional vulnerability - foreach num of numlist ${plinelist} { - //vulnerable and one dim - gen pov1_edu_`num' = 0 - replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 - - gen pov1_sp_`num' = 0 - replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 - - gen pov1_fin_`num' = 0 - replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 - - gen pov1_elec_`num' = 0 - replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 - - gen pov1_water_`num' = 0 - replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 - - //rsum - egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing - - //any of the 6 dimensions - deprived in education; dep_sp; dep_fin - gen multvul_`num' = 0 - replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. - - // any 2, 3, 4,...,6 - forv j=2(1)6 { - gen all`j'vul_`num' = 0 - replace all`j'vul_`num' = 1 if dim6_`num'==`j' - } - } //povlist - - gen sim = `sim' - gen _count=1 - - //collapse to get indicators - compress - tempfile data2 - save `data2', replace - - foreach var of local oklist2 { - use `data2', clear - clonevar h = pop - clonevar h_ln = pop - clonevar wta_pov = pop - replace `var' = strtrim(`var') - replace `var' = ustrtrim(`var') - replace `var' = strproper(`var') - - levelsof `var', local(lvllist2) - cap confirm string variable `var' - if _rc==0 local st = 1 - else local st = 0 - - qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') - - rename gallT_ppp mean_ln - ren _count nohh - ren h noind - egen double totalhh = total(nohh) - egen double totalind = total(noind) - gen sh_hh = nohh/totalhh - gen sh_pop = noind/totalind - - ren `var' sample - - gen level = "`var'" - gen code = "`code'" - gen lineupyear = 
`lineupyear' - gen baseyear = `baseyear' - gen survname = "`survname'" - gen region = "`regn'" - gen str welfaretype = "`welfaretype'" - - append using `ctry1ln' - order region code baseyear lineupyear survname welfaretype level sample sim poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind - save `ctry1ln', replace - } //foreach - - if (mod(`sim',50)==0){ - noi display in white ". `sim'" _continue - noi display _n(0) - } - else noi display "." _continue - } //sim - //collapse across sim - - //save results - use `ctry1ln', replace - compress - save "${upath2}\03.intermediate\Sim\\${lnyear}\\temp\\`code'_`baseyear'_`survname'_${lnyear}_lnsim", replace - - groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) - gen todo = `todo' - order code survname level sample baseyear lineupyear todo mean_ln poor215_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind - save "${upath2}\03.intermediate\Sim\\${lnyear}\\`code'_`baseyear'_`survname'_${lnyear}", replace - - append using "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'" - compress - save "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'", replace - } //dlw rc - else { - noi dis "`j' - Failed to load DLW `code'-`surv_year'-`survname'-`mod'" - } -} //forvalue i -log close - -/* -use `dataall', clear -compress -save "${maindir}/output/sub_base", replace - -use `dataall_ln', clear -compress -save "${maindir}/output/sub_base_ln", replace -*/ +//GMD vul + +clear all +set more off +set matsize 5000 +mat drop _all +set varabbrev off + +//setting +global rnd AM24 +global sim 100 + +global reposource "${upath2}\02.input" +global repotxt repo(use ${rnd}all) reporoot(${reposource}) +global lnyear 2021 +global circa 3 +global plinelist 215 365 685 + +cap log close +*log using 
"${upath2}\03.intermediate\Sim\\${lnyear}a\\GMD_log_${lnyear}.txt", text replace +local date: di %tdMon-DD-CCYY date("$S_DATE", "DMY") +local user = "`c(username)'" +local fdataall_ln Vul_dataall_${lnyear}_`date' + +tempfile dataall dataall_ln data4 data5 ctry1 ctry1ln +save "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'", replace emptyok + +//GMD todo list +use "${upath2}\02.input\\${lnyear}\GMD_list_${lnyear}", clear +replace todo = 0 if todo==. + +//update manually +replace level = "subnatid1" if level == "subnatid" & code=="KGZ" & surv_year==2010 +replace level = "subnatid" if level == "" & code=="UGA" & surv_year==2009 +replace surv_year = 2019 if code=="MOZ" & surv_year==2022 +replace rep_year = 2019 if code=="MOZ" & rep_year==2022 +replace level = "subnatid" if code=="MOZ" & rep_year==2019 + +//Doing outside GMD: India, CHN, and LIS countries +drop if (code=="CHN"|code=="IND") & ${lnyear}==2021 +drop if (code=="CHN") & ${lnyear}==2010 +drop if strpos(survname,"-LIS")>0 +drop if mod=="HIST" +drop if code=="SYR" + +keep if code == "UKR" + +replace ct_urban = 0 if code=="SYC" & surv_year==2018 +//add flag missing =4 +foreach var of varlist elec_flag water_flag sp_flag findex_flag edu_flag { + replace `var'= 4 if `var'==. 
+} + +ren level lvlvar +replace lvlvar = "" if lvlvar=="national" +local allobs = _N +tempfile gmdlist +save `gmdlist', replace + +qui forv j=1(1)`allobs' { +*qui forv i=1(1)1 { + use `gmdlist', clear + foreach lc in code surv_year survname mod lvlvar elec_flag water_flag sp_flag findex_flag edu_flag ct_urban todo { + local `lc' = `lc'[`j'] + } + + //Load lineup poverty/pop at national level + foreach num of numlist ${plinelist} { + use "${upath2}\03.intermediate\PIPinput\PIP_${lnyear}_`num'.dta", clear + keep if country_code=="`code'" + count + local pcnpov`num' + local pcnpop + if r(N)>0 { + local pcnpov`num' = headcount[1] + local pcnpop = population[1] + } + } + + //Get values for fusion ASPIRE FINDEX, etc when flags==3 + **************************************************** + //ASPIRE + if `sp_flag'==3 { + use "${upath2}\02.input\\${lnyear}\ASPIRE_data_${lnyear}.dta", clear + keep if code=="`code'" + local aspire_sp + count + if r(N)>0 { + local type_aspire = type[1] + if "`type_aspire'"=="Quintile" { + local aspire_sp = 1 + forv i=1(1)5 { + local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] + } + } + else { //type_aspire == National + local _pop_All_SPL = _pop_All_SPL[1] + local aspire_sp = 1 + } + } //rn>0 + else { + local aspire_sp = 0 + } + } //sp_flag + + **************************************************** + //FINDEX data (no account, which is dep_fin) + if `findex_flag'==3 { + use "${upath2}\02.input\\${lnyear}\findex_${lnyear}_quintiles.dta", clear + keep if code=="`code'" + local findex + count + if r(N)>0 { + local type_findex = type[1] + local findex = 1 + if `ct_urban'==0 local type_findex "Total" + if "`type_findex'"=="Urb_rur" { //urban-rural quintiles + local findex = 1 + forv i=1(1)5 { + foreach nm in urban rural { + local no_accountq`i'`nm' = no_accountq`i'`nm'[1] + } + } + } + else { //total quintiles + local findex = 1 + forv i=1(1)5 { + local no_accountq`i'total = no_accountq`i'total[1] + } + } //types + } + else { //rn>0 + local findex = 0 + } 
+ } //findex_flag + + **************************************************** + //JMP data + if `water_flag'==3 { + use "${upath2}\02.input\\${lnyear}\JMP_cov_${lnyear}.dta", clear + keep if code=="`code'" + local jmp + count + if r(N)>0 { + local type_jmp = type[1] + local jmp = 1 + if `ct_urban'==0 local type_jmp "Total" + if "`type_jmp'"=="Urb_rur" { //urban-rural only + local w_imp_urban = w_imp_urban[1] + local w_imp_rural = w_imp_rural[1] + } + else { //total + local w_imp_total = w_imp_total[1] + } //types + } + else { //rn>0 + local jmp = 0 + } + } //water_flag + + **************************************************** + //Electricity GED + if `elec_flag'==3 { + use "${upath2}\02.input\\${lnyear}\GED_cov_${lnyear}.dta", clear + keep if code=="`code'" + local ged + count + if r(N)>0 { + local type_ged = type[1] + local ged = 1 + if `ct_urban'==0 local type_ged "Total" + if "`type_ged'"=="Urb_rur" { //urban-rural only + local ged_urban = ged_urban[1] + local ged_rural = ged_rural[1] + } + else { //total quintiles + local ged_total = ged_total[1] + } //types + } + else { //rn>0 + local ged = 0 + } + } //elec_flag + + **************************************************** + //UNESCO + if `edu_flag'==3 { + use "${upath2}\02.input\\${lnyear}\UNESCO_cov_${lnyear}.dta", clear + keep if code=="`code'" + local unesco + count + if r(N)>0 { + local type_unesco = type[1] + local unesco = 1 + if `ct_urban'==0 local type_unesco "Total" + if "`type_unesco'"=="Urb_rur" { //urban-rural only + local unesco_urban = unesco_urban[1] + local unesco_rural = unesco_rural[1] + } + else { //total quintiles + local unesco_total = unesco_total[1] + } //types + } + else { //rn>0 + local unesco = 0 + } + } + + //microdata + cap dlw, country(`code') year(`surv_year') type(gmd) mod(`mod') surveyid(`survname') files $repotxt + if _rc==0 { + cap ren sim simsur + local baseyear = `surv_year' + local year = `surv_year' + local lineupyear = $lnyear + gen _all_ = "All sample" + noi dis "`j' - Working 
on `code'-`surv_year'-`survname'-`mod'" + + if "`mod'"=="GPWG" local wgt weight + else local wgt weight_p + + //Prep subnational level + if ("`lvlvar'"=="") local oklist _all_ + else { + cap confirm numeric variable `lvlvar' + if _rc==0 { + tempvar xvar + cap decode `lvlvar', gen(`xvar') + if _rc~=0 tostring `lvlvar', gen(`xvar') + cap drop `lvlvar' + rename `xvar' `lvlvar' + } + replace `lvlvar' = "MISSING" if `lvlvar'=="" + replace `lvlvar' = ustrtrim(`lvlvar') + local oklist `lvlvar' + } + + //adjustment for country specific fix subnational + qui { + if "`code'"=="PHL" & "`survname'"=="FIES" & (`year'==2018) { //ok + if "`lvlvar'"=="subnatid2" { + replace subnatid2="Basilan" if subnatid2=="7-Basilan" | subnatid2=="97-Isabela City" + replace subnatid2="North Cotabato" if subnatid2=="47-Cotabato" + replace subnatid2="Davao Del Sur" if subnatid2=="24-Davao de Sur" | subnatid2=="86-Davao Occidental" + replace subnatid2="Maguindanao" if subnatid2=="38-Maguindanao" | subnatid2=="98-Cotabato City" + replace subnatid2="Metropolitan Manila" if subnatid2=="39-Manila" | subnatid2=="74-NCR-2nd Dist." | subnatid2=="75-NCR-3rd Dist." | subnatid2=="76-NCR-4th Dist." + local oklist subnatid2 + } + } + + if "`code'"=="PHL" & "`survname'"=="FIES" & `year'==2021 { //ok + if "`lvlvar'"=="subnatid2" { + replace subnatid2="Basilan" if subnatid2=="7-Basilan" | subnatid2=="97-Isabela City" + replace subnatid2="North Cotabato" if subnatid2=="47-Cotabato" + replace subnatid2="Davao Del Sur" if subnatid2=="24-Davao de Sur" | subnatid2=="86-Davao Occidental" + replace subnatid2="Maguindanao" if subnatid2=="38-Maguindanao" | subnatid2=="98-Cotabato City" + replace subnatid2="Metropolitan Manila" if subnatid2=="39-Manila" | subnatid2=="74-NCR-2nd Distr." | subnatid2=="75-NCR-3rd Distr." | subnatid2=="76-NCR-4th Distr." 
+ local oklist subnatid2 + } + } + + if "`code'"=="ALB" & "`survname'"=="LSMS" & `year'==2012 { //ok + decode strata, gen(subnatid) + replace subnatid = subinstr(subnatid, "_Rural","",.) + replace subnatid = subinstr(subnatid, "_Urban","",.) + local oklist subnatid + } + + if "`code'"=="BGD" & "`survname'"=="HIES" & (`year'==2016|`year'==2022) { //ok + replace subnatid = "40 - Khulna" if subnatid=="45 - Mymensingh" | subnatid=="45-Mymensingh"|subnatid=="40-Khulna" + local oklist subnatid + } + + if "`code'"=="GEO" & "`survname'"=="HIS" { //ok + if `year'>=2019 & `year'<=2021 { + replace subnatid="10 - Imereti, Racha-Lechkhumi and Kvemo Svan" if subnatid=="10 - Imereti" | subnatidsurvey=="13 - Racha-Lechkhumi and Kvemo Svaneti" + } + if `year'>=2002 & `year'<=2009 { + replace subnatid="10 - Imereti, Racha-Lechkhumi and Kvemo Svan" if subnatid=="10-Imereti" + replace subnatid="7 - Adjara A.R." if subnatid=="7-Ajara" + replace subnatid="9 - Samegrelo-Zemo Svaneti" if subnatid=="9-Samegrelo" + } + } + + if "`code'"=="IDN" & "`survname'"=="SUSENAS" { //ok + if (`year'>=2010 & `year'<=2023) { + replace `lvlvar' = "64-65-North and East Kalimantan" if `lvlvar'=="64-East Kalimantan" | `lvlvar'=="65-North Kalimantan" + } + + if `year'==2005 { + replace `lvlvar' = "21-Riau Islands" if `lvlvar'=="21-Riau Island" + replace `lvlvar' = "64-65-North and East Kalimantan" if `lvlvar'=="64-East Kalimantan" | `lvlvar'=="65-North Kalimantan" + } + } + + if "`code'"=="MNE" & "`survname'"=="SILC-C" & (`year'==2016 | `year'==2017) { //ok + gen subnatid1="" + replace subnatid1="1 – North" if subnatid=="1 – North urban" | subnatid=="5 – North rural" + replace subnatid1="2 – Center" if subnatid=="2 – Center urban" | subnatid=="6 – Center rural" + replace subnatid1="3 – South" if subnatid=="3 – South urban" | subnatid=="7 – South rural" + replace subnatid1="4 – Podgorica" if subnatid=="4 – Podgorica urban" | subnatid=="8 – Podgorica rural" + + local oklist subnatid1 + } + + if "`code'"=="DJI" 
& "`survname'"=="EDAM" & `year'==2017 { //ok + gen subnatid1=subnatid + replace subnatid1="10 - Djibouti" if subnatid=="11 - Djibouti-ville, 1er arrondissement" + replace subnatid1="10 - Djibouti" if subnatid=="12 - Djibouti-ville, 2eme arrondissement" + replace subnatid1="10 - Djibouti" if subnatid=="13 - Djibouti-ville, 3eme arrondissement" + replace subnatid1="10 - Djibouti" if subnatid=="14 - Djibouti-ville, 4eme arrondissement" + replace subnatid1="10 - Djibouti" if subnatid=="15 - Djibouti-ville, 5eme arrondissement" + + local oklist subnatid1 + } + if "`code'"=="STP" { //ok + if `year'==2010 { + replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 - Nord" | `lvlvar'=="2 - Centre" | `lvlvar'=="3 - Sud" + replace `lvlvar' = "2 - Principé" if `lvlvar'=="4 - Principé" + } + if `year'==2000 { + replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 – Nord" | `lvlvar'=="2 – Centre" | `lvlvar'=="3 – Sud" + replace `lvlvar' = "2 - Principé" if `lvlvar'=="4 – Principé" + } + if `year'==2017 { + replace `lvlvar' = "1 - São Tomé" if `lvlvar'=="1 - Lobata"|`lvlvar'=="2 - Lembá"|`lvlvar'=="3 - Mezochi"|`lvlvar'=="4 - Agua Grande"|`lvlvar'=="5 - Cantagalo"|`lvlvar'=="6 - Caué" + replace `lvlvar' = "2 - Principé" if `lvlvar'=="7 - Príncipe" + } + } + + if "`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2010 | `year'==2012) { + replace subnatid="1-Metropolitan" if subnatid=="1-Metropolitan"|subnatid=="1 - Metropolitan" + replace subnatid="2-Lower" if subnatid=="2-Lower Urban"|subnatid=="3-Lower Rural" + replace subnatid="2-Lower" if subnatid=="2 - Lower Urban"|subnatid=="3 - Lower Rural" + replace subnatid="4-Upper" if subnatid=="4-Upper Urban"|subnatid=="5-Upper Rural" + replace subnatid="4-Upper" if subnatid=="4 - Upper Urban"|subnatid=="5 - Upper Rural" + replace subnatid="6-Borders" if subnatid=="6-Borders Urban"|subnatid=="7-Borders Rural" + replace subnatid="6-Borders" if subnatid=="6 - Borders Urban"|subnatid=="7 - Borders Rural" + local oklist subnatid + } + if 
"`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2017 | `year'==2015) { + if "`lvlvar'"=="subnatid1" { + replace subnatid1="1-Metropolitan" if subnatid1=="1-Metropolitan" + replace subnatid1="2-Lower" if subnatid1=="2-Lower Urban"|subnatid1=="3-Lower Rural" + replace subnatid1="4-Upper" if subnatid1=="4-Upper Urban"|subnatid1=="5-Upper Rural" + replace subnatid1="6-Borders" if subnatid1=="6-Borders Urban"|subnatid1=="7-Borders Rural" + local oklist subnatid1 + } + if "`lvlvar'"=="subnatid" local oklist subnatid + } + + if "`code'"=="EGY" & "`survname'"=="HIECS" & (`year'==2019) { + if "`lvlvar'"=="subnatid" { + replace subnatid="1-Metropolitan" if subnatid=="1-Metropolitan" + replace subnatid="2-Lower" if subnatid=="2-Lower Urban"|subnatid=="3-Lower Rural" + replace subnatid="4-Upper" if subnatid=="4-Upper Urban"|subnatid=="5-Upper Rural" + replace subnatid="6-Borders" if subnatid=="6-Borders Urban"|subnatid=="7-Borders Rural" + local oklist subnatid + } + } + + if "`code'"=="FIN" & "`survname'"=="EU-SILC" & (`year'>=2008 & `year'<=2010) { + replace subnatid="4-FI1C" if subnatid=="1-FI18" + local oklist subnatid + } + + if "`code'"=="CIV" & "`survname'"=="ENV" & `year'==2015 { + *decode gaul_adm1, gen(gaul_adm1_str) + *local oklist gaul_adm1_str + } + + if "`code'"=="CIV" & "`survname'"=="EHCVM" & (`year'==2018| `year'==2021) { + replace subnatid = trim(proper(lower( subnatid))) + gen gaul_adm1_str = "" + replace gaul_adm1_str="Folon" if subnatid=="10 - Kabadougou" + replace gaul_adm1_str="Folon" if subnatid=="24 - Folon" + replace gaul_adm1_str="Tchologo" if subnatid=="20 - Bagoue" + replace gaul_adm1_str="Tchologo" if subnatid=="3 - Poro" + replace gaul_adm1_str="Tchologo" if subnatid=="32 - Tchologo" + replace gaul_adm1_str="Hambol" if subnatid=="28 - Hambol" + replace gaul_adm1_str="Hambol" if subnatid=="4 - Gbeke" + replace gaul_adm1_str="Bounkani" if subnatid=="23 - Bounkani" + replace gaul_adm1_str="Bounkani" if subnatid=="8 - Gontougo" + replace 
gaul_adm1_str="Sud-Comoe" if subnatid=="13 - Sud-Comoe" + replace gaul_adm1_str="Sud-Comoe" if subnatid=="5 - Indenie-Djuablin" + replace gaul_adm1_str="District autonome D'abidjan" if subnatid=="1 - Autonome D'Abidjan" + replace gaul_adm1_str="District autonome de Yamoussou" if subnatid=="7 - Yamoussoukro" + replace gaul_adm1_str="Goh" if subnatid=="15 - LÔH-Djiboua" + replace gaul_adm1_str="Goh" if subnatid=="17 - GÔH" + replace gaul_adm1_str="Moronou" if subnatid=="11 - N'Zi" + replace gaul_adm1_str="Moronou" if subnatid=="21 - Belier" + replace gaul_adm1_str="Moronou" if subnatid=="29 - Iffou" + replace gaul_adm1_str="Moronou" if subnatid=="33 - Moronou" + replace gaul_adm1_str="La Me" if subnatid=="16 - Agneby-Tiassa" + replace gaul_adm1_str="La Me" if subnatid=="26 - Grands-Ponts" + replace gaul_adm1_str="La Me" if subnatid=="30 - La Me" + replace gaul_adm1_str="Guemon" if subnatid=="18 - Cavally" + replace gaul_adm1_str="Guemon" if subnatid=="27 - Guemon" + replace gaul_adm1_str="Guemon" if subnatid=="6 - Tonkpi" + replace gaul_adm1_str="Marahoue" if subnatid=="12 - Marahoue" + replace gaul_adm1_str="Marahoue" if subnatid=="2 - Haut-Sassandra" + replace gaul_adm1_str="Bere" if subnatid=="14 - Worodougou" + replace gaul_adm1_str="Bere" if subnatid=="19 - Bafing" + replace gaul_adm1_str="Bere" if subnatid=="22 - Bere" + replace gaul_adm1_str="Nawa" if subnatid=="25 - GbÔKle" + replace gaul_adm1_str="Nawa" if subnatid=="31 - Nawa" + replace gaul_adm1_str="Nawa" if subnatid=="9 - San-Pedro" + + local oklist gaul_adm1_str + } + + if "`code'"=="COM" & "`survname'"=="EESIC" & `year'==2013 { //ok + replace subnatid="1 - Moroni" if subnatid=="2 - Reste Ngazidja" + local oklist subnatid + } + /* + if "`file'"=="SSA_GMB_2015_IHS_LN2018_IND.dta" { + replace subnatid="6 – Kuntaur" if subnatid=="7 – Janjanbureh" + } + */ + + if "`code'"=="NAM" & "`survname'"=="NHIES" & `year'==2015 { //ok + replace subnatid="4-kavango east" if subnatid=="5-kavango west" + local oklist 
subnatid + } + + if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2018 { //ok + replace subnatid2="51-Western Area" if subnatid2=="51-Western Area Rural" | subnatid2=="52-Western Area Urban" + replace subnatid2="21–Bombali/32–Karene" if subnatid2=="21-Bombali" | subnatid2=="32-Karene" + replace subnatid2="22–Falaba/23–Koinadugu" if subnatid2=="22-Falaba" | subnatid2=="23-Koinadugu" + local oklist subnatid2 + } + + if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2011 { //ok + replace subnatid2="51-Western Area" if subnatid2=="41 - Western other" | subnatid2=="42 - Western urban (Freetown)" + local oklist subnatid2 + } + + if "`code'"=="SLE" & "`survname'"=="SLIHS" & `year'==2003 { //ok + replace subnatid="51-Western Area" if subnatid=="41 - Western other" | subnatid=="42 - Western urban" + local oklist subnatid + } + /* + if "`file'"=="SSA_GAB_2017_EGEP_LN2018_IND.dta" { //check + /* ALREADY DONE, strata is from SSAPOV module P + gen subnatid = strata + replace subnatid = "11-Ouest" if strata=="10-Reste Ouest Urbain" | strata=="11-Ouest Rural" + replace subnatid = "4-Nord" if strata=="4-Nord-Urbain" | strata=="5-Nord-Rural" + replace subnatid = "6-Sud" if strata=="6-Sud-Urbain" | strata=="7-Sud-Rural" + replace subnatid = "9-Est" if strata=="8-Reste Est Urbain" | strata=="9-Est Rural" + */ + } + + if "`file'"=="EAP_WSM_2008_HIES_LN2018_IND.dta" { + replace subnatid="Upolu" if subnatid=="1-Apia" + replace subnatid="Upolu" if subnatid=="2-NWU" + replace subnatid="Upolu" if subnatid=="3-RoU" + } + */ + /* + if "`code'"=="TLS" & "`survname'"=="TLSLS" & `year'==2014 { //ok + gen str subnatidx = "" + replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="01-Aileu" + replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="02-Dili" + replace subnatidx = "1-Aileu,Dili and Emera" if subnatid1=="03-Ermera" + replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if subnatid1=="04-Ainaro" + replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if 
subnatid1=="06-Manufahi" + replace subnatidx = "2-Ainaro, Manatutao and Manufahi" if subnatid1=="05-Manatuto" + replace subnatidx = "5-Oecussi" if subnatid1=="13-Oecussi" + replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="10-Bobonaro" + replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="11-Covalima" + replace subnatidx = "4-Bobonaro, Cova Lima and Liquica" if subnatid1=="12-Liquica" + replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="09-Viqueque" + replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="08-Lautem" + replace subnatidx = "3-Baucau,Lautem and Viqueque" if subnatid1=="07-Baucau" + local oklist subnatidx + } + */ + if "`code'"=="MWI" { //ok + if `year'>=2010 { + replace `lvlvar'="105/107 Mzimba" if `lvlvar'=="105 - Mzimba" | `lvlvar'=="107 - Mzuzu City"|`lvlvar'=="107 – Mzuzu City" + replace `lvlvar'="305/315 Blantyre" if `lvlvar'=="315 - Blantyre City" | `lvlvar'=="305 - Blantyre"| `lvlvar'=="315 – Blantyre City" + replace `lvlvar'="206/210 Lilongwe" if `lvlvar'=="210 - Lilongwe City" | `lvlvar'=="206 - Lilongwe"| `lvlvar'=="210 – Lilongwe City" + replace `lvlvar'="303/314 Zomba" if `lvlvar'=="303 - Zomba" | `lvlvar'=="314 - Zomba City" + replace `lvlvar'="303/314 Zomba" if `lvlvar'=="303 – Zomba Non-City" | `lvlvar'=="314 – Zomba City" + } + if `year'==1997 { + replace `lvlvar'="105/107 Mzimba" if `lvlvar'=="130 – Mzimba" | `lvlvar'=="131 – Mzuzu City" + replace `lvlvar'="305/315 Blantyre" if `lvlvar'=="304 – Blantyre Rural" | `lvlvar'=="305 – Blantyre City" + replace `lvlvar'="206/210 Lilongwe" if `lvlvar'=="223 – Lilongwe Rural" | `lvlvar'=="224 – Lilongwe City" + replace `lvlvar'="303/314 Zomba" if `lvlvar'=="306 – Zomba Rural" | `lvlvar'=="307 – Zomba City" + } + } + + if "`code'"=="TCD" & "`survname'"=="EHCVM" & `year'==2018 { //ok + replace subnatid="3-Borkou-Ennedi-Tibesti" if subnatid=="2 - Borkou" | subnatid=="20 - Ennedi Ouest" + } + + /* + if 
"`file'"=="LAC_BOL_2018_EH_LN2018_IND.dta" { + replace subnatid2 = trim(subnatid2) + replace subnatid2="8/9 Beni and Pando" if subnatid2=="8 - Beni" | subnatid2=="9 - Pando" + } + */ + + if "`code'"=="DOM" & ("`survname'"=="ECNFT-Q03"|"`survname'"=="ENFT") { //ok + replace `lvlvar'= trim(`lvlvar') + replace `lvlvar'="2 - Norte o Cibao" if `lvlvar'=="2 - Cibao Norte" | `lvlvar'=="3 - Cibao Sur" | `lvlvar'=="4 - Cibao Nordeste" | `lvlvar'=="5 - Cibao Noroeste" + replace `lvlvar'="3 - Sur" if `lvlvar'=="6 - Valdesia" | `lvlvar'=="7 - El Valle" | `lvlvar'=="8 - Enriquillo" + replace `lvlvar'="4 - Este" if `lvlvar'=="9 - Higuamo" | `lvlvar'=="10 - Yuma" + } + + if "`code'"=="NER" & "`survname'"=="EHCVM" & `year'==2021 { //ok + replace subnatid="1 - Agadez" if subnatid=="1 - gadez" + } + + if "`code'"=="BRA" { //ok + replace subnatid2="43 - Rio Grande do Sul" if subnatid2== "43 - Rio Grande do Norte" + } + + if "`code'"=="BWA" { //ok + replace subnatid="3 – Other Towns" if subnatid== "3 - Other cities & towns" + replace subnatid="4 – South East" if subnatid== "4 - Rural South-East" + replace subnatid="5 – North East" if subnatid== "5 - Rural North-East" + replace subnatid="6 – North West" if subnatid== "6 - Rural North-West" + replace subnatid="7 – South West" if subnatid== "7 - Rural South-West" + } + + if "`code'"=="CAF" { //ok + replace subnatid2="3 - Yadé" if subnatid2== "3 - Yade" + } + + if "`code'"=="BFA" { //ok + replace subnatid="1 - Boucle du Mouhoun" if subnatid== "1 - Boucle du Mouhoum" + } + + if "`code'"=="CMR" & "`survname'"=="ECAM-V" & `year'==2021 { //ok + egen sample3 = sieve(subnatid), keep(a) + gen sample3UP = upper(sample3) + replace subnatid="10 -Sud-Oues" if subnatid== "11 - sud-ouest" + replace subnatid="2 - Centre" if sample3UP== "YAOUND" + replace subnatid="6 - Littoral" if subnatid== "3 - douala" + drop sample3 sample3UP + } + + if "`code'"=="IRN" { + replace `lvlvar' = "10 - Isfahan" if `lvlvar'=="11 - Esfahan" + replace `lvlvar' = "11 - 
Sistan" if `lvlvar'=="12 - SistanBalouchestan" + replace `lvlvar' = "12 - Kurdestan" if `lvlvar'=="13 - Kordestan" + replace `lvlvar' = "13 - Hamadan" if `lvlvar'=="14 - Hamedan" + replace `lvlvar' = "14 - Bakhtiari" if `lvlvar'=="15 - CharmahalBakhtiari" + replace `lvlvar' = "17 - Kohkiloyeh" if `lvlvar'=="18 - KohkilouyeBoyerahamad" + replace `lvlvar' = "18 - Bushehr" if `lvlvar'=="19 - Boushehr" + replace `lvlvar' = "28 - N. Khorasan" if `lvlvar'=="29 - KhorasanShomali" + replace `lvlvar' = "S. Khorasan" if `lvlvar'=="30 - KhorasanJonoubi" + replace `lvlvar' = "3 - E.Azarbaijan" if `lvlvar'=="4 - AzarbaijanSharghi" + replace `lvlvar' = "4 - W.Azarbaijan" if `lvlvar'=="5 - AzarbaijanGharbi" + replace `lvlvar' = "6 - Khuzestan" if `lvlvar'=="7 - Kouzestan" + } + + if "`code'"=="KAZ" { + replace `lvlvar' = "51 - South_Kaz" if `lvlvar'=="61 - Turkistan" + } + + if "`code'"=="KGZ" { + replace `lvlvar' = "2-Issyk-kul" if `lvlvar'=="2-Issyk-ku" + replace `lvlvar' = "3-Jalal-Abad" if `lvlvar'=="3-Jalalaba" + } + + if "`code'"=="LAO" { + replace `lvlvar' = "18-Xaysomboon" if `lvlvar'=="18-Xaisomboun" + } + + if "`code'"=="MAR" { //old shapefile for 2000 and 2006 + /* + replace `lvlvar' = "" if `lvlvar'=="1 - Regions sahariennes" + replace `lvlvar' = "" if `lvlvar'=="10 - Tadla-Azilal" + replace `lvlvar' = "" if `lvlvar'=="11 - Meknes-Tafilalet" + replace `lvlvar' = "" if `lvlvar'=="12 - Fes-Boulemane-Taounate" + replace `lvlvar' = "" if `lvlvar'=="13 - Taza-Hoceima" + replace `lvlvar' = "1 - Tanger-Tetouan-Al Hoceima" if `lvlvar'=="14 - Tanger-Tetouan" + replace `lvlvar' = "9 - Souss-Massa" if `lvlvar'=="2 - Souss- Massa-Draa" + replace `lvlvar' = "5 - Beni Mellal-Khenifra" if `lvlvar'=="3 - Gharb-Chrarda-Beni Hssen" + replace `lvlvar' = "" if `lvlvar'=="4 - Chaouia-Ouardigha" + replace `lvlvar' = "7 - Marrakech-Safi" if `lvlvar'=="5 - Marrakech-Tensift-Haouz" + replace `lvlvar' = "2 - Oriental" if `lvlvar'=="6 - Oriental" + replace `lvlvar' = "6 - Grand Casablanca" if 
`lvlvar'=="7 - Grand Casablanca" + replace `lvlvar' = "4 - Rabat-Salé-Kenitra" if `lvlvar'=="8 -Rabat-Sale-Zemmour-Zaer" + replace `lvlvar' = "" if `lvlvar'=="9 - Doukkala-Abda" + */ + } + + if "`code'"=="MOZ" & "`survname'"=="IOF" & `year'==2022 { //oklist + replace `lvlvar' = "Maputo Cidade" if `lvlvar'=="Cidade de Maputo" + } + + if "`code'"=="MLI" & "`survname'"=="EHCVM" & `year'==2021 { //ok + egen sample3 = sieve(subnatid), keep(a) + gen sample3UP = upper(sample3) + replace subnatid="4 - Sgou" if sample3UP== "SEGOU" + drop sample3 sample3UP + } + + if "`code'"=="MMR" & `year'==2017 { + replace `lvlvar' = "14-Ayeyawaddy" if trim(`lvlvar')=="14-Ayeyawaddy" + replace `lvlvar' = "2-Kayar" if trim(`lvlvar')=="2-Kayar" + replace `lvlvar' = "6-Taninthayi" if trim(`lvlvar')=="6-Taninthayi" + } + + if "`code'"=="MRT" { + replace `lvlvar' = "1 - Hodh El Charghi" if `lvlvar'=="1 - Hodh charghy" + replace `lvlvar' = "2 - Hodh El Gharbi" if `lvlvar'=="2 - Hodh Gharby" + replace `lvlvar' = "11 - Tiris Zemmour" if `lvlvar'=="11 - Tirs-ezemour" + replace `lvlvar' = "8 - Dakhlet Nouadhibou" if `lvlvar'=="8 - Dakhlett Nouadibou" + } + + if "`code'"=="PRY" & `year'>=2001 & `year'<=2017 { + egen sample3 = sieve(`lvlvar'), keep(a) + gen sample3UP = upper(sample3) + replace `lvlvar'="20 - Resto" if sample3UP== "CONCEPCION" + replace `lvlvar'="20 - Resto" if sample3UP== "NEEMBUCU" + replace `lvlvar'="20 - Resto" if sample3UP== "AMAMBAY" + replace `lvlvar'="20 - Resto" if sample3UP== "CANINDEYU" + replace `lvlvar'="20 - Resto" if sample3UP== "PRESIDENTEHAYES" + replace `lvlvar'="20 - Resto" if sample3UP== "CORDILLERA" + replace `lvlvar'="20 - Resto" if sample3UP== "GUAIRA" + replace `lvlvar'="20 - Resto" if sample3UP== "MISIONES" + replace `lvlvar'="20 - Resto" if sample3UP== "PARAGUARI" + drop sample3 sample3UP + } + + if "`code'"=="POL" & "`survname'"=="HBS" { + egen sample3 = sieve(`lvlvar'), keep(a) + gen sample3UP = upper(sample3) + *gen subnatid2x = `lvlvar' + replace 
`lvlvar'="1-PL2" if sample3UP=="MALOPOLSKIE" + replace `lvlvar'="1-PL2" if sample3UP=="SLASKIE" + replace `lvlvar'="2-PL4" if sample3UP=="WIELKOPOLSKIE" + replace `lvlvar'="2-PL4" if sample3UP=="ZACHODNIOPOMORSKIE" + replace `lvlvar'="2-PL4" if sample3UP=="LUBUSKIE" + replace `lvlvar'="3-PL5" if sample3UP=="DOLNOSLASKIE" + replace `lvlvar'="3-PL5" if sample3UP=="OPOLSKIE" + replace `lvlvar'="4-PL6" if sample3UP=="KUJAWSKOPOMORSKIE" + replace `lvlvar'="4-PL6" if sample3UP=="WARMINSKOMAZURSKIE" + replace `lvlvar'="4-PL6" if sample3UP=="POMORSKIE" + replace `lvlvar'="5-PL7" if sample3UP=="LODZKIE" + replace `lvlvar'="5-PL7" if sample3UP=="SWIETOKRZYSKIE" + replace `lvlvar'="6-PL8" if sample3UP=="LUBELSKIE" + replace `lvlvar'="6-PL8" if sample3UP=="PODKARPACKIE" + replace `lvlvar'="6-PL8" if sample3UP=="PODLASKIE" + replace `lvlvar'="7-PL9" if sample3UP=="MAZOWIECKIE" + local oklist `lvlvar' + } + + if "`code'"=="POL" & "`survname'"=="EU-SILC" & `year'<=2017 { + replace `lvlvar'="5-PL7" if `lvlvar'=="1-PL1" + replace `lvlvar'="1-PL2" if `lvlvar'=="2-PL2" + replace `lvlvar'="6-PL8" if `lvlvar'=="3-PL3" + replace `lvlvar'="2-PL4" if `lvlvar'=="4-PL4" + replace `lvlvar'="3-PL5" if `lvlvar'=="5-PL5" + replace `lvlvar'="4-PL6" if `lvlvar'=="6-PL6" + } + + if "`code'"=="ROU" { + replace `lvlvar'="2-RO2" if `lvlvar'=="1 - North-East" | `lvlvar'=="1-North-East" + replace `lvlvar'="2-RO2" if `lvlvar'=="2 - South-East" | `lvlvar'=="2-South-East" + replace `lvlvar'="3-RO3" if `lvlvar'=="3 - South" | `lvlvar'=="3-South" + replace `lvlvar'="4-RO4" if `lvlvar'=="4 - South-West" | `lvlvar'=="4-South-West" + replace `lvlvar'="4-RO4" if `lvlvar'=="5 - West" | `lvlvar'=="5-West" + replace `lvlvar'="1-RO1" if `lvlvar'=="6 - North-West" | `lvlvar'=="6-North-West" + replace `lvlvar'="1-RO1" if `lvlvar'=="7 - Centre" | `lvlvar'=="7-Centre" + replace `lvlvar'="3-RO3" if `lvlvar'=="8 - Bucharest-Ilfov" | `lvlvar'=="8-Bucharest-Ilfov" + + replace `lvlvar'="2-RO2" if `lvlvar'=="3-RO21" + 
replace `lvlvar'="2-RO2" if `lvlvar'=="4-RO22" + replace `lvlvar'="3-RO3" if `lvlvar'=="5-RO31" + replace `lvlvar'="4-RO4" if `lvlvar'=="7-RO41" + replace `lvlvar'="4-RO4" if `lvlvar'=="8-RO42" + replace `lvlvar'="1-RO1" if `lvlvar'=="1-RO11" + replace `lvlvar'="1-RO1" if `lvlvar'=="2-RO12" + replace `lvlvar'="3-RO3" if `lvlvar'=="6-RO32" + } + + if "`code'"=="THA" { + replace `lvlvar'="10-Bangkok" if `lvlvar'=="10" + replace `lvlvar'="11-Samut Prakan" if `lvlvar'=="11" + replace `lvlvar'="12-Nonthaburi" if `lvlvar'=="12" + replace `lvlvar'="13-Pathum Thani" if `lvlvar'=="13" + replace `lvlvar'="14-Phra Nakhon Si Ayu" if `lvlvar'=="14" + replace `lvlvar'="15-Ang Thong" if `lvlvar'=="15" + replace `lvlvar'="16-Lop Buri" if `lvlvar'=="16" + replace `lvlvar'="17-Sing Buri" if `lvlvar'=="17" + replace `lvlvar'="18-Chai Nat" if `lvlvar'=="18" + replace `lvlvar'="19-Saraburi" if `lvlvar'=="19" + replace `lvlvar'="20-Chon Buri" if `lvlvar'=="20" + replace `lvlvar'="21-Rayong" if `lvlvar'=="21" + replace `lvlvar'="22-Chanthaburi" if `lvlvar'=="22" + replace `lvlvar'="23-Trat" if `lvlvar'=="23" + replace `lvlvar'="24-Chachoengsao" if `lvlvar'=="24" + replace `lvlvar'="25-Prachin Buri" if `lvlvar'=="25" + replace `lvlvar'="26-Nakhon Nayok" if `lvlvar'=="26" + replace `lvlvar'="27-Sa Kaeo" if `lvlvar'=="27" + replace `lvlvar'="30-Nakhon Ratchasima" if `lvlvar'=="30" + replace `lvlvar'="31-Buri Ram" if `lvlvar'=="31" + replace `lvlvar'="32-Surin" if `lvlvar'=="32" + replace `lvlvar'="33-Si Sa Ket" if `lvlvar'=="33" + replace `lvlvar'="34-Ubon Ratchathani" if `lvlvar'=="34" + replace `lvlvar'="35-Yasothon" if `lvlvar'=="35" + replace `lvlvar'="36-Chaiyaphum" if `lvlvar'=="36" + replace `lvlvar'="37-Am Nat Charoen" if `lvlvar'=="37" + replace `lvlvar'="38-Bueng Kan" if `lvlvar'=="38" + replace `lvlvar'="39-Nong Bua Lam Phu" if `lvlvar'=="39" + replace `lvlvar'="40-Khon Kaen" if `lvlvar'=="40" + replace `lvlvar'="41-Udon Thani" if `lvlvar'=="41" + replace `lvlvar'="42-Loei" if 
`lvlvar'=="42" + replace `lvlvar'="43-Nong Khai" if `lvlvar'=="43" + replace `lvlvar'="44-Maha Sarakham" if `lvlvar'=="44" + replace `lvlvar'="45-Roi Et" if `lvlvar'=="45" + replace `lvlvar'="46-Kalasin" if `lvlvar'=="46" + replace `lvlvar'="47-Sakon Nakhon" if `lvlvar'=="47" + replace `lvlvar'="48-Nakhon Phanom" if `lvlvar'=="48" + replace `lvlvar'="49-Mukdahan" if `lvlvar'=="49" + replace `lvlvar'="50-Chiang Mai" if `lvlvar'=="50" + replace `lvlvar'="51-Lamphun" if `lvlvar'=="51" + replace `lvlvar'="52-Lampang" if `lvlvar'=="52" + replace `lvlvar'="53-Uttaradit" if `lvlvar'=="53" + replace `lvlvar'="54-Phrae" if `lvlvar'=="54" + replace `lvlvar'="55-Nan" if `lvlvar'=="55" + replace `lvlvar'="56-Phayao" if `lvlvar'=="56" + replace `lvlvar'="57-Chiang Rai" if `lvlvar'=="57" + replace `lvlvar'="58-Mae Hong Son" if `lvlvar'=="58" + replace `lvlvar'="60-Nakhon Sawan" if `lvlvar'=="60" + replace `lvlvar'="61-Uthai Thani" if `lvlvar'=="61" + replace `lvlvar'="62-Kamphaeng Phet" if `lvlvar'=="62" + replace `lvlvar'="63-Tak" if `lvlvar'=="63" + replace `lvlvar'="64-Sukhothai" if `lvlvar'=="64" + replace `lvlvar'="65-Phitsanulok" if `lvlvar'=="65" + replace `lvlvar'="66-Phichit" if `lvlvar'=="66" + replace `lvlvar'="67-Phetchabun" if `lvlvar'=="67" + replace `lvlvar'="70-Ratchaburi" if `lvlvar'=="70" + replace `lvlvar'="71-Kanchanaburi" if `lvlvar'=="71" + replace `lvlvar'="72-Suphun Buri" if `lvlvar'=="72" + replace `lvlvar'="73-Nakhon Pathom" if `lvlvar'=="73" + replace `lvlvar'="74-Samut Sakhon" if `lvlvar'=="74" + replace `lvlvar'="75-Samut Songkhram" if `lvlvar'=="75" + replace `lvlvar'="76-Phetchaburi" if `lvlvar'=="76" + replace `lvlvar'="77-Prachuap Khiri Kha" if `lvlvar'=="77" + replace `lvlvar'="80-Nakhon Si Thammara" if `lvlvar'=="80" + replace `lvlvar'="81-Krabi" if `lvlvar'=="81" + replace `lvlvar'="82-Phangnga" if `lvlvar'=="82" + replace `lvlvar'="83-Phuket" if `lvlvar'=="83" + replace `lvlvar'="84-Surat Thani" if `lvlvar'=="84" + replace 
`lvlvar'="85-Ranong" if `lvlvar'=="85" + replace `lvlvar'="86-Chumphon" if `lvlvar'=="86" + replace `lvlvar'="90-Songkhla" if `lvlvar'=="90" + replace `lvlvar'="91-Satun" if `lvlvar'=="91" + replace `lvlvar'="92-Trang" if `lvlvar'=="92" + replace `lvlvar'="93-Phatthalung" if `lvlvar'=="93" + replace `lvlvar'="94-Pattani" if `lvlvar'=="94" + replace `lvlvar'="95-Yala" if `lvlvar'=="95" + replace `lvlvar'="96-Narathiwat" if `lvlvar'=="96" + + replace `lvlvar' = "14-Phra Nakhon Si Ayu" if `lvlvar'=="14-Phra Nakhon Si Ayudhya" + replace `lvlvar' = "11-Samut Prakan" if `lvlvar'=="11-Samut Prakarn" + replace `lvlvar' = "23-Trat" if `lvlvar'=="23-Trad" + replace `lvlvar' = "25-Prachin Buri" if `lvlvar'=="25-Phachinburi" + replace `lvlvar' = "72-Suphun Buri" if `lvlvar'=="72-Suphanburi" + replace `lvlvar' = "75-Samut Songkhram" if `lvlvar'=="75-Samut Songkham" + replace `lvlvar' = "77-Prachuap Khiri Kha" if `lvlvar'=="77-Prachuap Khilikhan" + replace `lvlvar' = "80-Nakhon Si Thammara" if `lvlvar'=="80-Nakhon Si Thammarat" + + } + + if "`code'"=="TUN" { + replace `lvlvar' = "2 - NE" if `lvlvar'=="2 - Nord Est" + replace `lvlvar' = "3 - NW" if `lvlvar'=="3 - Nord Ouest" + replace `lvlvar' = "4 - CenterE" if `lvlvar'=="4 - Centre Est" + replace `lvlvar' = "5 - CenterW" if `lvlvar'=="5 - Centre Ouest" + replace `lvlvar' = "6 - SE" if `lvlvar'=="6 - Sud Est" + replace `lvlvar' = "7 - SW" if `lvlvar'=="7 - Sud ouest" + } + + if "`code'"=="UKR" { + replace `lvlvar' = "21 – Transcarpathian" if `lvlvar'=="7 – Transcarpathian" + } + + if "`code'"=="UZB" { + replace `lvlvar' = "4-Jizzak" if `lvlvar'=="1708 - Jizzakh" + replace `lvlvar' = "11-Tashkent" if `lvlvar'=="1727 - Tashkent (region)" + replace `lvlvar' = "13-Khorezm" if `lvlvar'=="1733 - Khorasm" + } + + if "`code'"=="VNM" { + replace `lvlvar' = "2-Midlands and Northern Mountainous Areas" if `lvlvar'=="1-North Mountain and Midland" + replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="3-Northern and 
Coastal Central region_num" + replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="3-North Central area and South Central Coastal area" + replace `lvlvar' = "6-Mekong Delta" if `lvlvar'=="6-Mekong River Delta" + //8 regions into 6 regions + replace `lvlvar' = "3-Northern and Coastal Central Region" if `lvlvar'=="Central North" | `lvlvar'=="Central South" + replace `lvlvar' = "4-Central Highlands" if `lvlvar'=="Highlands" + replace `lvlvar' = "6-Mekong Delta" if `lvlvar'=="Mekong River Delta" + replace `lvlvar' = "2-Midlands and Northern Mountainous" if `lvlvar'=="Northeast" + replace `lvlvar' = "2-Midlands and Northern Mountainous" if `lvlvar'=="Northwest" + } + + if "`code'"=="WSM" { + replace `lvlvar' = "1-Apia Urban Areas" if `lvlvar'=="1-Apia" + replace `lvlvar' = "2-North West Upolu" if `lvlvar'=="2-NWU" + replace `lvlvar' = "3-Rest of Upolu" if `lvlvar'=="3-RoU" + } + + if "`code'"=="XKX" { + egen sample3 = sieve(`lvlvar'), keep(a) + gen sample3UP = upper(sample3) + replace `lvlvar'="1 -Gjakovע" if sample3UP== "GJAKOVE" + replace `lvlvar'="3 - Mitrovic" if sample3UP== "MITROVICE" + replace `lvlvar'="6 - Prishtin" if sample3UP== "PRISHTINE" + replace `lvlvar'="4 - Pej" if sample3UP== "PEJE" + drop sample3 sample3UP + } + + if "`code'"=="AZE" { + egen sample3 = sieve(`lvlvar'), keep(a) + gen sample3UP = upper(sample3) + replace `lvlvar'="" if sample3UP== "GJAKOVE" + + replace `lvlvar'="1 – Absheron" if sample3UP== "ABERONQUBA"|sample3UP== "ABSHERON"|sample3UP== "ABSHERONGUBA" + replace `lvlvar'="6 – Aran" if sample3UP== "ARANWITHYUHKARABAH" + replace `lvlvar'="8 – Baku City" if sample3UP== "BAKU"|sample3UP== "BAKUCITY" + replace `lvlvar'="9 – Daghlig Shirvan" if sample3UP== "DAGLIQSHIRVAN" | sample3UP== "SHIRVAN" |sample3UP== "IRVAN"|sample3UP== "DAGHLIGSHIRVAN" + replace `lvlvar'="2 – Ganja-Gazakh" if sample3UP== "GANJAGAZAKH" + replace `lvlvar'="5 – Guba-Khachmaz" if sample3UP== "GUBAHACHMAZ" + replace `lvlvar'="4 – Lankaran" if 
sample3UP== "LANKARANASTARA" + + replace `lvlvar'="NA" if sample3UP== "MUANSALYAN"|sample3UP== "MUGHANSALYAN" + replace `lvlvar'="0 – Nakhchyvan" if sample3UP== "NAKHCHIVAN"|sample3UP== "NAKHCHYVANAR" + replace `lvlvar'="7 – Yukhary Garabagh" if sample3UP== "QARABAGHMIL"|sample3UP== "QARABAMIL" + replace `lvlvar'="3 – Shaki-Zagatala" if sample3UP== "SHAKIZAGATALA"|sample3UP== "SHEKIZAGATALA" + replace `lvlvar'="7 – Yukhary Garabagh" if sample3UP== "YUHARSKARABAH" | sample3UP== "YUKHARYGARABAGH" + drop sample3 sample3UP + } + + } + + //urban and rural + local urbvar + cap des urban + if _rc==0 { + ta urban + if r(N)>0 { + cap decode urban, gen(_urban_) + if _rc~=0 tostring urban, gen(_urban_) + replace _urban_ = trim(_urban_) + gen reg_rural = `oklist' + "*_*" + _urban_ + local urbvar reg_rural + } + } //urban + + local oklist2 _all_ `oklist' `urbvar' + local oklist2 : list uniq oklist2 + + **************************************************** + **Dimension 1: Poverty + **************************************************** + gen double gallT_ppp = welfare/cpi2017/icp2017/365 + drop if gallT_ppp<0 + replace gallT_ppp = 0.25 if gallT_ppp<0.25 + + //reweight to lineup year pop + su year [aw=`wgt'] + local initial = r(sum_w) + gen double pop = (`wgt') * (`pcnpop'/`initial') + + //recalculate the 2.15 line for 2.15 poverty + qui foreach num of numlist ${plinelist} { + if `pcnpov`num''==0 { + local pline`num' = `=`num'/100' + } + else { + _pctile gallT_ppp [aw=pop], p(`pcnpov`num'') + local pline`num' = r(r1) + } + + gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. 
+ gen pline`num' = `pline`num'' + } //num + + **************************************************** + **Dimension 2: Access to Education + **************************************************** + if `edu_flag'==1 { //data in survey + **1a) Indicator: have no one with primary completion (completed 15+) + //All adults + global eduage 15 + //Special cases: only for the countries without continuous age: UKR NRU + if "`=upper("`code'")'" == "UKR" { + //for all years from 2014-2020, each year has different agecat string categories. + if `surv_year'==2020 { //specific year from the listing file (GMD_list_YYYY) + global eduage 2 //2019 + drop age + gen age = 1 if agecat=="1 - Up to 18 years" + replace age = 2 if agecat=="2 - 18 - 35 years old" + replace age = 3 if agecat=="3 - 36 - 55 years old" + replace age = 4 if agecat=="4 - 56 - 59 years old" + replace age = 5 if agecat=="5 - 60 years and older" + } + } + + if "`=upper("`code'")'" == "NRU" { //2012 + if `surv_year'==2012 { + global eduage 4 + drop age + gen age = . + replace age = 1 if agecat=="0-4 years" + replace age = 2 if agecat=="5-9 years" + replace age = 3 if agecat=="10-14 years" + replace age = 4 if agecat=="15-19 years" + replace age = 5 if agecat=="20-24 years" + replace age = 6 if agecat=="25-29 years" + replace age = 7 if agecat=="30-34 years" + replace age = 8 if agecat=="35-39 years" + replace age = 9 if agecat=="40-44 years" + replace age = 10 if agecat=="45-49 years" + replace age = 11 if agecat=="50-54 years" + replace age = 12 if agecat=="55-59 years" + replace age = 13 if agecat=="60-64 years" + replace age = 14 if agecat=="65-69 years" + replace age = 15 if agecat=="70-74 years" + replace age = 16 if agecat=="75 and older" + } + } + + local eduflag = 0 + cap gen educat5 = . + cap gen educat7 = . + + cap su educat7 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) 
+ } + else { //educat5 + cap su educat5 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) + } + else { //educat4 + cap su educat4 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) + } + else { //no education available + local eduflag = 1 + } + } + } + + if `eduflag'==0 { + gen temp2a = 1 if age>=$eduage & age~=. + bys hhid: egen educ_com_size = sum(temp2a) + bys hhid: egen temp3 = sum(temp2) + bys hhid: egen temp3c = sum(temp2c) + gen dep_educ_com = 0 + replace dep_educ_com = 1 if temp3==0 + gen dep_educ_com_lb = 0 + replace dep_educ_com_lb = 1 if temp3c==0 + ren temp3 educ_com_sum + ren temp3c educ_com_sum_lb + drop temp2 temp2a temp2c + } + else { + gen dep_educ_com = . + gen dep_educ_com_lb = . + gen educ_com_sum = . + gen educ_com_sum_lb = . + gen educ_com_size = . + } + + gen educ_com_appl = 1 + replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) + gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. + bys hhid: egen educ_com_mis = sum(temp2b) + drop temp2b + gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
+ + la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" + la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" + la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" + la var educ_com_appl_miss "School completion is applicable households but missing completion" + cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss + + //drop fake age due to agecat string (only for the countries without continuous age) + if ("`=upper("`code'")'" == "UKR" & `surv_year'==2020) drop age + if ("`=upper("`code'")'" == "NRU" & `surv_year'==2012) drop age + } //edu flag ==1 + if `edu_flag'==2 { //universal coverage + gen dep_educ_com = 0 + } //`edu_flag'==2 + if `edu_flag'==3 { //fused in below + gen dep_educ_com = . + } + if `edu_flag'==4 { //fused in below + gen dep_educ_com = . + gen unesco_flag = 1 + } + + **************************************************** + **Dimension 3: Access to Electricity + **************************************************** + if `elec_flag'==1 { //data in survey + cap des electricity + if _rc==0 gen dep_infra_elec = electricity==0 if electricity~=. + else local elec_flag 3 + *else gen dep_infra_elec = . + } + if `elec_flag'==2 { //universal + gen dep_infra_elec = 0 + } + if `elec_flag'==3 { + gen dep_infra_elec = . + } + if `elec_flag'==4 { + gen dep_infra_elec = . + gen elec_flag = 1 + } + la var dep_infra_elec "Deprived if HH has No access to electricity" + + **************************************************** + **Dimension 4: Access to Water + **************************************************** + if `water_flag'==1 { + cap des imp_wat_rec + if _rc==0 gen dep_infra_impw = imp_wat_rec==0 if imp_wat_rec~=. + *else gen dep_infra_impw = . + else local water_flag 3 + } + if `water_flag'==2 { + gen dep_infra_impw = 0 + } + if `water_flag'==3 { + gen dep_infra_impw = . 
+ } + if `water_flag'==4 { + gen dep_infra_impw = . + gen water_flag = 1 + } + la var dep_infra_impw "Deprived if HH has No access to improved water" + + **************************************************** + **Dimension 5: Access to social protection + **************************************************** + if `sp_flag'==1 { + //nothing yet from survey + } + if `sp_flag'==2 { + gen dep_sp = 0 + } + if `sp_flag'==3 { + gen dep_sp = . + } + if `sp_flag'==4 { + gen dep_sp = . + gen sp_flag = 1 + + } + **************************************************** + **Dimension 6: Access to financial inclusion + **************************************************** + if `findex_flag'==1 { // from surveys + cap des fin_account + if _rc==0 gen dep_fin = fin_account==0 if fin_account~=. + else local findex_flag 3 + } + if `findex_flag'==2 { + gen dep_fin = 0 + } + if `findex_flag'==3 { + gen dep_fin = . + } + if `findex_flag'==4 { + gen dep_fin = . + gen fin_flag = 1 + } + **************************************************** + cap gen rural = urban==0 + + //get 15+ population size by quintile or quintile/urban rural only when cont. age is available. + forv a1=1(1)5 { + local n15q`a1'total = 1 + local n15q`a1'urban = 1 + local n15q`a1'rural = 1 + } + + qui if "`mod'"=="ALL" { + _ebin gallT_ppp [aw=pop], gen(q5ind) nq(5) + cap des age + if _rc==0 { + qui su age + if r(N)>0 { + gen tmp = age>=15 & age~=. 
+ bys hhid (pid): egen n15 = total(tmp) + //`no_accountq`i'`nm'' `no_accountq`i'total' + forv a1=1(1)5 { + su n15 [aw=pop] if q5ind==`a1' + local n15q`a1'total = r(mean) + if `ct_urban'==1 { + su n15 [aw=pop] if q5ind==`a1' & urban==1 + local n15q`a1'urban = r(mean) + + su n15 [aw=pop] if q5ind==`a1' & urban==0 + local n15q`a1'rural = r(mean) + } //ct_urban + } //a1 + } //rN + } //age + cap drop q5ind tmp n15 + } //ALL + + //POP WEIGHT at HH level - to convert all data to HH level as data comes in as either individual or HH level data + set varabbrev off + cap isid hhid + if _rc==0 { + cap des pop + if _rc==0 gen double pop2 = pop + else gen double pop2 = weight_p + } + else { + cap des pop + if _rc==0 { + drop if welfare==. + drop if pop==. + bys hhid: egen double pop2 = total(pop) + } + else { + drop if welfare==. + drop if weight_p==. + bys hhid: egen double pop2 = total(weight_p) + } + duplicates drop hhid, force + } + set varabbrev on + ren pop popold + ren pop2 pop + + cap drop region + gen region = "`regn'" + cap drop survname + gen str survname = "`survname'" + local welfaretype : char _dta[welfaretype] + clonevar weight_use = pop + + //quintiles + _ebin gallT_ppp [aw=pop], gen(q5) nq(5) + gen test = 1 + + tempfile databfsim + save `databfsim', replace + + //Data fusion loop through random assignments + set seed 1234567 + clear + tempfile ctry1 ctry1ln + save `ctry1', replace emptyok + save `ctry1ln', replace emptyok + + noi display _n(1) + noi display in yellow "Number of simulations: $sim" _n(1) + noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") + + qui forv sim=1(1)$sim { + use `databfsim', clear + **************************************************** FUSION + + //Education + if `edu_flag'==3 { + if `unesco'==1 { + gen unesco_flag = 0 + if "`type_unesco'"=="Urb_rur" { //urban-rural only + foreach nm in urban rural { + cap drop 
_a1 + if (`unesco_`nm'' > 0) { + wsample test [aw=pop] if `nm'==1, percent(`unesco_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 if `nm'==1 + } + replace dep_educ_com = 1- _a1 if `nm'==1 + drop _a1 + } //urb-rul + } + else { //total country + local nm total + cap drop _a1 + if (`unesco_`nm'' > 0) { + wsample test [aw=pop] , percent(`unesco_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 + } + replace dep_educ_com = 1- _a1 + drop _a1 + } //types + } + else { //missing + gen unesco_flag = 1 + } + } + + //Electricity + if `elec_flag'==3 { + if `ged'==1 { + gen elec_flag = 0 + if "`type_ged'"=="Urb_rur" { //urban-rural only + foreach nm in urban rural { + cap drop _a1 + if (`ged_`nm'' > 0) { + wsample test [aw=pop] if `nm'==1, percent(`ged_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 if `nm'==1 + } + replace dep_infra_elec = 1- _a1 if `nm'==1 + drop _a1 + } //urb-rul + } + else { //total country + local nm total + cap drop _a1 + if (`ged_`nm'' > 0) { + wsample test [aw=pop] , percent(`ged_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 + } + replace dep_infra_elec = 1- _a1 + drop _a1 + } //types + } + else { //missing + gen elec_flag = 1 + } + } //elec_flag + + //Water + if `water_flag'==3 { + if `jmp'==1 { + gen water_flag = 0 + if "`type_jmp'"=="Urb_rur" { //urban-rural only + foreach nm in urban rural { + cap drop _a1 + if (`w_imp_`nm'' > 0) { + wsample test [aw=pop] if `nm'==1, percent(`w_imp_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 if `nm'==1 + } + replace dep_infra_impw = 1- _a1 if `nm'==1 + drop _a1 + } //urb-rul + } + else { //total country + local nm total + cap drop _a1 + if (`w_imp_`nm'' > 0) { + wsample test [aw=pop] , percent(`w_imp_`nm'') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 + } + replace dep_infra_impw = 1- _a1 + drop _a1 + } //types + } + else { //missing 
+ gen water_flag = 1 + } + } //water_flag + + //Findex fusion + if `findex_flag'==3 { //findex access no_accountq`i'total + if `findex'==1 { + gen fin_flag = 0 + //Add adjustment of individual level estimate to HH level estimate. N is number of 15+ in the group (national, quintile, or quitile & urban/rural). When the data is without age to figure n15, N==1, just hh = ind^0.6 + //hh = ind^(0.6*N) + if "`type_findex'"=="Urb_rur" { //urban-rural quintiles + foreach nm in urban rural { + forv i=1(1)5 { + cap drop _a`i' + if (`no_accountq`i'`nm'' > 0) { + *wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`no_accountq`i'`nm'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + local adjfin = 100*((`=`no_accountq`i'`nm''/100')^(0.6*`n15q`i'`nm'')) + wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' & `nm'==1 + } + replace dep_fin = _a`i' if q5==`i' & `nm'==1 + drop _a`i' + } //i + } //urb-rul + } //urb-rul quintiles + else { //total quintiles + forv i=1(1)5 { + cap drop _a`i' + if (`no_accountq`i'total' > 0) { + *wsample test [aw=pop] if q5==`i', percent(`no_accountq`i'total') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + local adjfin = 100*((`=`no_accountq`i'total'/100')^(0.6*`n15q`i'total')) + wsample test [aw=pop] if q5==`i', percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' + } + replace dep_fin = _a`i' if q5==`i' + drop _a`i' + } //i + } //types + } //findex + else { //missing + gen fin_flag = 1 + } + } //findex flag + + //SP access + if `sp_flag'==3 { + if `aspire_sp'==1 { + gen sp_flag = 0 + + if "`type_aspire'"=="Quintile" { + forv i=1(1)5 { + cap drop _a`i' + if (`_pop_All_SPL_q`i'' > 0) { + wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' + } + replace dep_sp = 1-_a`i' if q5==`i' + drop 
_a`i' + } //i + } //quintile type + else { //type_aspire == National + cap drop _a1 + if (`_pop_All_SPL' > 0) { + wsample test [aw=pop], percent(`_pop_All_SPL') newvar(_a1) seed(`=1234567+`sim'') numsim(1) + } + else { + gen _a1 = 0 + } + replace dep_sp = 1-_a1 + drop _a1 + } + } //aspire_sp==1 + else { //missing + gen sp_flag = 1 + } + } //sp_flag==3 + **************************************************** END FUSION + + //multidimensional vulnerability + foreach num of numlist ${plinelist} { + //vulnerable and one dim + gen pov1_edu_`num' = 0 + replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 + + gen pov1_sp_`num' = 0 + replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 + + gen pov1_fin_`num' = 0 + replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 + + gen pov1_elec_`num' = 0 + replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 + + gen pov1_water_`num' = 0 + replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 + + //rsum + egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing + + //any of the 6 dimensions - deprived in education; dep_sp; dep_fin + gen multvul_`num' = 0 + replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. 
+ + // any 2, 3, 4,...,6 + forv j=2(1)6 { + gen all`j'vul_`num' = 0 + replace all`j'vul_`num' = 1 if dim6_`num'==`j' + } + } //povlist + + gen sim = `sim' + gen _count=1 + + //collapse to get indicators + compress + tempfile data2 + save `data2', replace + + foreach var of local oklist2 { + use `data2', clear + clonevar h = pop + clonevar h_ln = pop + clonevar wta_pov = pop + replace `var' = strtrim(`var') + replace `var' = ustrtrim(`var') + replace `var' = strproper(`var') + + levelsof `var', local(lvllist2) + cap confirm string variable `var' + if _rc==0 local st = 1 + else local st = 0 + + qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') + + rename gallT_ppp mean_ln + ren _count nohh + ren h noind + egen double totalhh = total(nohh) + egen double totalind = total(noind) + gen sh_hh = nohh/totalhh + gen sh_pop = noind/totalind + + ren `var' sample + + gen level = "`var'" + gen code = "`code'" + gen lineupyear = `lineupyear' + gen baseyear = `baseyear' + gen survname = "`survname'" + gen region = "`regn'" + gen str welfaretype = "`welfaretype'" + + append using `ctry1ln' + order region code baseyear lineupyear survname welfaretype level sample sim poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind + save `ctry1ln', replace + } //foreach + + if (mod(`sim',50)==0){ + noi display in white ". `sim'" _continue + noi display _n(0) + } + else noi display "." 
_continue + } //sim + //collapse across sim + + //save results + use `ctry1ln', replace + compress + save "${upath2}\03.intermediate\Sim\\${lnyear}\\temp\\`code'_`baseyear'_`survname'_${lnyear}_lnsim", replace + + groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) + gen todo = `todo' + order code survname level sample baseyear lineupyear todo mean_ln poor215_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind + save "${upath2}\03.intermediate\Sim\\${lnyear}\\`code'_`baseyear'_`survname'_${lnyear}", replace + + append using "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'" + compress + save "${upath2}\03.intermediate\Sim\Vintages\\`fdataall_ln'", replace + } //dlw rc + else { + noi dis "`j' - Failed to load DLW `code'-`surv_year'-`survname'-`mod'" + } +} //forvalue i +log close + +/* +use `dataall', clear +compress +save "${maindir}/output/sub_base", replace + +use `dataall_ln', clear +compress +save "${maindir}/output/sub_base_ln", replace +*/ diff --git a/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do b/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do old mode 100644 new mode 100755 index 32b7f26..fd56875 --- a/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do +++ b/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do @@ -1,442 +1,422 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. 
- -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . - -//Load all data and check subnational data together with other data - -//IND vul 2021 - -clear all -tempfile data1 data2 data3 data4 dataall -save `dataall', replace emptyok - -*global upath2 -global sim 100 - -//Get from PIP for the lineup year 2021 -local pov215 = 13.7069 -local pov365 = 45.2669 -local pov685 = 82.2031 - -/* from DLW -levelnote cpi2017 icp2017 -rural 1.216772 17.09197 -urban 1.2663806 20.787346 -*/ -local cpi2017rur = 1.216772 -local icp2017rur = 17.09197 -local cpi2017urb = 1.2663806 -local icp2017urb = 20.787346 - -local code IND -local lineupyear 2021 -local baseyear 2021 -local survname CPHS -local welfaretype CONS - -**************************************************** -//Get values for fusion ASPIRE FINDEX -qui { - //ASPIRE IND is 100 - /* - use "${upath2}\02.input\2021\ASPIRE_data_2021.dta", clear - keep if code=="`code'" - forv i=1(1)5 { - local _pop_All_SPL_q`i' - } - local aspire_sp - count - if r(N)>0 { - local aspire_sp = 1 - forv i=1(1)5 { - local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] - } - } //rn - else { - local aspire_sp = 0 - } - */ - - //FINDEX data (no account, which is dep_fin) - use "${upath2}\02.input\2021\findex_2021_quintiles.dta", clear - keep if code=="`code'" - forv i=1(1)5 { - local no_accountq`i'total - } - - local findex - count - if r(N)>0 { - local findex = 1 - forv i=1(1)5 { - foreach nm in urban rural { - local no_accountq`i'`nm' = no_accountq`i'`nm'[1] - } - } - } //rn - else { - local findex = 0 - } -} -**************************************************** -//Load microdata -*dlw, country(ind) year(2021) type(gmd) files mod(gpwg) -use 
"${upath2}\02.input\IND\IND_2021_CPHS_V01_M_V02_A_GMD_GPWG.dta" , clear -merge m:1 hhid using "${upath2}\02.input\IND\CPHS_2021_HHID_state.dta" -drop if _merge==2 -drop _merge -decode state, gen(subnatid) - -levelsof sim, local(simlist) -tempfile datax -save `datax', replace - -qui foreach simout of local simlist { - use `datax', clear - keep if sim==`simout' - ren sim simout - noi dis "doing imputed welfare - `simout'" - merge 1:m hhid using "${upath2}\02.input\IND\IND_2021_ALL.dta" - drop if _merge==2 - drop _merge - - //Telangana updated - replace subnatid = "Andhra Pradesh" if subnatid=="Telangana" - - la def urban 1 "Urban" 0 "Rural" - la val urban urban - decode urban, gen(urb2) - gen reg_urb = subnatid + "*_*" + urb2 - local subnatvar subnatid reg_urb - - // welfare variable - gen gallT_ppp = welfare/`cpi2017urb'/`icp2017urb'/365 if urban==1 - replace gallT_ppp = welfare/`cpi2017rur'/`icp2017rur'/365 if urban==0 - replace gallT_ppp = 0.25 if gallT_ppp<0.25 - - //ind weight - bys hhid (pid): gen hx = _N - gen double weight_p = weight/hx - - **************************************************** - **Dimension 1: Education - **************************************************** - - **1a) Indicator: no one in hh with primary completion (age 15+) - //All adults - global eduage 15 - local eduflag = 0 - cap gen educat5 = . - cap gen educat7 = . - - cap su educat7 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat7>=3 | educat7==.) - } - else { //educat5 - cap su educat5 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) - } - else { //educat4 - cap su educat4 - if r(N)>0 { - gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. - gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) 
- } - else { //no education available - local eduflag = 1 - } - } //educat4 - } - - if `eduflag'==0 { - gen temp2a = 1 if age>=$eduage & age~=. - bys hhid: egen educ_com_size = sum(temp2a) - bys hhid: egen temp3 = sum(temp2) - bys hhid: egen temp3c = sum(temp2c) - gen dep_educ_com = 0 - replace dep_educ_com = 1 if temp3==0 - gen dep_educ_com_lb = 0 - replace dep_educ_com_lb = 1 if temp3c==0 - ren temp3 educ_com_sum - ren temp3c educ_com_sum_lb - drop temp2 temp2a temp2c - } - else { - gen dep_educ_com = . - gen dep_educ_com_lb = . - gen educ_com_sum = . - gen educ_com_sum_lb = . - gen educ_com_size = . - } - - gen educ_com_appl = 1 - replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) - gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. - bys hhid: egen educ_com_mis = sum(temp2b) - drop temp2b - gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. - - la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" - la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" - la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" - la var educ_com_appl_miss "School completion is applicable households but missing completion" - cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss - - **************************************************** - **Dimension 2: Access to infrastructure - **************************************************** - - **************************************************** - //Indicator: Electricity - gen dep_infra_elec = 0 - - **************************************************** - //Indicator: Water - gen dep_infra_impw = 0 - - **************************************************** - **Dimension 3: Monetary - **************************************************** - //recalculate the 2.15 line for 2.15 poverty - qui foreach num of numlist 215 365 685 { - if 
`pov`num''==0 { - local pline`num' = `=`num'/100' - } - else { - _pctile gallT_ppp [aw=weight_p], p(`pov`num'') - local pline`num' = r(r1) - } - - gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. - gen pline`num' = `pline`num'' - } //num - - //Scaled IND to HH - //get 15+ population size by quintile or quintile/urban rural only when age is available. - forv a1=1(1)5 { - local n15q`a1'total = 1 - local n15q`a1'urban = 1 - local n15q`a1'rural = 1 - } - - _ebin gallT_ppp [aw=weight_p], gen(q5ind) nq(5) - cap des age - if _rc==0 { - qui su age - if r(N)>0 { - gen tmp = age>=15 & age~=. - bys hhid (pid): egen n15 = total(tmp) - //`no_accountq`i'`nm'' `no_accountq`i'total' - forv a1=1(1)5 { - su n15 [aw=weight_p] if q5ind==`a1' - local n15q`a1'total = r(mean) - - su n15 [aw=weight_p] if q5ind==`a1' & urban==1 - local n15q`a1'urban = r(mean) - - su n15 [aw=weight_p] if q5ind==`a1' & urban==0 - local n15q`a1'rural = r(mean) - } //a1 - } //rN - } //age - cap drop q5ind tmp n15 - - //Convert to HH - bys hhid: egen double pop = total(weight_p) - duplicates drop hhid, force - - clonevar weight_use = pop - - //quintiles - _ebin gallT_ppp [aw=pop], gen(q5) nq(5) - gen test = 1 - gen rural = urban==0 - des,sh - tempfile databfsim - save `databfsim', replace - - //loop through random assignments - set seed 1234567 - clear - tempfile ctry1 ctry1ln - save `ctry1', replace emptyok - save `ctry1ln', replace emptyok - - noi display _n(1) - noi display in yellow "Number of simulations: $sim" _n(1) - noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") - - qui forv sim=1(1)$sim { - use `databfsim', clear - - //findex access no_accountq`i'total - if `findex'==1 { - gen dep_fin = . 
- foreach nm in urban rural { - forv i=1(1)5 { - cap drop _a`i' - if (`no_accountq`i'`nm'' > 0) { - *wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`no_accountq`i'`nm'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - local adjfin = 100*((`=`no_accountq`i'`nm''/100')^(0.6*`n15q`i'`nm'')) - wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' & `nm'==1 - } - replace dep_fin = _a`i' if q5==`i' & `nm'==1 - drop _a`i' - } //i - } //urb-rul - gen fin_flag = 0 - } - else { //missing - gen dep_fin = . - gen fin_flag = 1 - } - - //SP access _pop_All_SPL_q`i' - /* - if `aspire_sp'==1 { - gen dep_sp = . - forv i=1(1)5 { - cap drop _a`i' - if (`_pop_All_SPL_q`i'' > 0) { - wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) - } - else { - gen _a`i' = 0 if q5==`i' - } - replace dep_sp = 1-_a`i' if q5==`i' - drop _a`i' - } //i - - gen sp_flag = 0 - } - else { //missing - gen dep_sp = . - gen sp_flag = 1 - } - */ - //ASPIRE IND = 100, June 15 version - gen dep_sp = 0 - - //multidimensional vulnerability - foreach num of numlist 215 365 685 { - //vulnerable and one dim - gen pov1_edu_`num' = 0 - replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 - - gen pov1_sp_`num' = 0 - replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 - - gen pov1_fin_`num' = 0 - replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 - - gen pov1_elec_`num' = 0 - replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 - - gen pov1_water_`num' = 0 - replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 - - //rsum - egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing - - //any of the 6 dimensions - deprived in education; dep_sp; dep_fin - gen multvul_`num' = 0 - replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. 
- - // any 2, 3, 4,...,6 - forv j=2(1)6 { - gen all`j'vul_`num' = 0 - replace all`j'vul_`num' = 1 if dim6_`num'==`j' - } - } //povlist - gen _all_ = "All sample" - - gen sim = `sim' - gen _count=1 - //collapse to get indicators - compress - tempfile data2 - save `data2', replace - - *local lvllist _all_ urban2 subnatid subnatid1 /*db040 */ - local lvllist _all_ urb2 `subnatvar' - qui foreach var of local lvllist { - use `data2', clear - clonevar h = pop - *clonevar h_ln = pop - *clonevar wta_pov = pop - replace `var' = stritrim(`var') - replace `var' = ustrtrim(`var') - - levelsof `var', local(lvllist2) - cap confirm string variable `var' - if _rc==0 local st = 1 - else local st = 0 - - qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') - - rename gallT_ppp mean_ln - ren _count nohh - ren h noind - egen double totalhh = total(nohh) - egen double totalind = total(noind) - gen sh_hh = nohh/totalhh - gen sh_pop = noind/totalind - - ren `var' sample - gen level = "`var'" - gen code = "`code'" - gen lineupyear = `lineupyear' - gen baseyear = `baseyear' - gen survname = "`survname'" - gen str welfaretype = "`welfaretype'" - - append using `ctry1ln' - order code baseyear lineupyear survname welfaretype level sample sim mean_ln poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind - save `ctry1ln', replace - } //foreach subnat - - if (mod(`sim',50)==0){ - noi display in white ". `sim'" _continue - noi display _n(0) - } - else noi display "." 
_continue - } //sim - //collapse across sim - - //save results - use `ctry1ln', replace - - groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) - gen simout = `simout' - order code survname level sample baseyear lineupyear mean_ln poor215_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind - saveold "${upath2}\03.intermediate\Sim\2021\temp\IND_2021_CPHS_2021_`simout'", replace - append using `dataall' - save `dataall', replace -} //simout - -use `dataall', clear -ta simout -groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) -gen todo = 1 -saveold "${upath2}\03.intermediate\Sim\2021\IND_2021_CPHS_2021", replace - +//IND vul 2021 + +clear all +tempfile data1 data2 data3 data4 dataall +save `dataall', replace emptyok + +global sim 100 + +//Get from PIP for the lineup year 2021 +local pov215 = 13.7069 +local pov365 = 45.2669 +local pov685 = 82.2031 + +/* from DLW +levelnote cpi2017 icp2017 +rural 1.216772 17.09197 +urban 1.2663806 20.787346 +*/ +local cpi2017rur = 1.216772 +local icp2017rur = 17.09197 +local cpi2017urb = 1.2663806 +local icp2017urb = 20.787346 + +local code IND +local lineupyear 2021 +local baseyear 2021 +local survname CPHS +local welfaretype CONS + +**************************************************** +//Get values for fusion ASPIRE FINDEX +qui { + //ASPIRE IND is 100 + /* + use "${upath2}\02.input\2021\ASPIRE_data_2021.dta", clear + keep if code=="`code'" + forv i=1(1)5 { + local _pop_All_SPL_q`i' + } + local aspire_sp + count + if r(N)>0 { + local aspire_sp = 1 + forv i=1(1)5 { + local _pop_All_SPL_q`i' = _pop_All_SPL_q`i'[1] + } + } //rn + else { + local aspire_sp = 0 + } + */ + + //FINDEX data (no account, which is dep_fin) + use 
"${upath2}\02.input\2021\findex_2021_quintiles.dta", clear + keep if code=="`code'" + forv i=1(1)5 { + local no_accountq`i'total + } + + local findex + count + if r(N)>0 { + local findex = 1 + forv i=1(1)5 { + foreach nm in urban rural { + local no_accountq`i'`nm' = no_accountq`i'`nm'[1] + } + } + } //rn + else { + local findex = 0 + } +} +**************************************************** +//Load microdata +*dlw, country(ind) year(2021) type(gmd) files mod(gpwg) +use "${upath2}\02.input\IND\IND_2021_CPHS_V01_M_V02_A_GMD_GPWG.dta" , clear +merge m:1 hhid using "${upath2}\02.input\IND\CPHS_2021_HHID_state.dta" +drop if _merge==2 +drop _merge +decode state, gen(subnatid) + +levelsof sim, local(simlist) +tempfile datax +save `datax', replace + +qui foreach simout of local simlist { + use `datax', clear + keep if sim==`simout' + ren sim simout + noi dis "doing imputed welfare - `simout'" + merge 1:m hhid using "${upath2}\02.input\IND\IND_2021_ALL.dta" + drop if _merge==2 + drop _merge + + //Telangana updated + replace subnatid = "Andhra Pradesh" if subnatid=="Telangana" + + la def urban 1 "Urban" 0 "Rural" + la val urban urban + decode urban, gen(urb2) + gen reg_urb = subnatid + "*_*" + urb2 + local subnatvar subnatid reg_urb + + // welfare variable + gen gallT_ppp = welfare/`cpi2017urb'/`icp2017urb'/365 if urban==1 + replace gallT_ppp = welfare/`cpi2017rur'/`icp2017rur'/365 if urban==0 + replace gallT_ppp = 0.25 if gallT_ppp<0.25 + + //ind weight + bys hhid (pid): gen hx = _N + gen double weight_p = weight/hx + + **************************************************** + **Dimension 1: Education + **************************************************** + + **1a) Indicator: no one in hh with primary completion (age 15+) + //All adults + global eduage 15 + local eduflag = 0 + cap gen educat5 = . + cap gen educat7 = . + + cap su educat7 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat7>=3 & educat7~=. + gen temp2c = 1 if age>=$eduage & age~=. 
& (educat7>=3 | educat7==.) + } + else { //educat5 + cap su educat5 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat5>=3 & educat5~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat5>=3 | educat5==.) + } + else { //educat4 + cap su educat4 + if r(N)>0 { + gen temp2 = 1 if age>=$eduage & age~=. & educat4>=2 & educat4~=. + gen temp2c = 1 if age>=$eduage & age~=. & (educat4>=2 | educat4==.) + } + else { //no education available + local eduflag = 1 + } + } //educat4 + } + + if `eduflag'==0 { + gen temp2a = 1 if age>=$eduage & age~=. + bys hhid: egen educ_com_size = sum(temp2a) + bys hhid: egen temp3 = sum(temp2) + bys hhid: egen temp3c = sum(temp2c) + gen dep_educ_com = 0 + replace dep_educ_com = 1 if temp3==0 + gen dep_educ_com_lb = 0 + replace dep_educ_com_lb = 1 if temp3c==0 + ren temp3 educ_com_sum + ren temp3c educ_com_sum_lb + drop temp2 temp2a temp2c + } + else { + gen dep_educ_com = . + gen dep_educ_com_lb = . + gen educ_com_sum = . + gen educ_com_sum_lb = . + gen educ_com_size = . + } + + gen educ_com_appl = 1 + replace educ_com_appl = 0 if (educ_com_size==0 | educ_com_size==.) + gen temp2b = 1 if age>=$eduage & age~=. & educat4==. & educat5==. & educat7==. + bys hhid: egen educ_com_mis = sum(temp2b) + drop temp2b + gen educ_com_appl_miss = educ_com_appl == 1 & educ_com_mis>0 & educ_com_mis~=. 
+ + la var dep_educ_com "Deprived if Households with NO adults $eduage+ with no primary completion" + la var dep_educ_com_lb "Deprived if Households with NO adults $eduage+ with no or missing primary completion" + la var educ_com_appl "School completion is applicable households, has $eduage or more individuals" + la var educ_com_appl_miss "School completion is applicable households but missing completion" + cap drop dep_educ_com_lb educ_com_appl educ_com_appl_miss + + **************************************************** + **Dimension 2: Access to infrastructure + **************************************************** + + **************************************************** + //Indicator: Electricity + gen dep_infra_elec = 0 + + **************************************************** + //Indicator: Water + gen dep_infra_impw = 0 + + **************************************************** + **Dimension 3: Monetary + **************************************************** + //recalculate the 2.15 line for 2.15 poverty + qui foreach num of numlist 215 365 685 { + if `pov`num''==0 { + local pline`num' = `=`num'/100' + } + else { + _pctile gallT_ppp [aw=weight_p], p(`pov`num'') + local pline`num' = r(r1) + } + + gen poor`num'_ln = gallT_ppp < `pline`num'' if gallT_ppp~=. + gen pline`num' = `pline`num'' + } //num + + //Scaled IND to HH + //get 15+ population size by quintile or quintile/urban rural only when age is available. + forv a1=1(1)5 { + local n15q`a1'total = 1 + local n15q`a1'urban = 1 + local n15q`a1'rural = 1 + } + + _ebin gallT_ppp [aw=weight_p], gen(q5ind) nq(5) + cap des age + if _rc==0 { + qui su age + if r(N)>0 { + gen tmp = age>=15 & age~=. 
+ bys hhid (pid): egen n15 = total(tmp) + //`no_accountq`i'`nm'' `no_accountq`i'total' + forv a1=1(1)5 { + su n15 [aw=weight_p] if q5ind==`a1' + local n15q`a1'total = r(mean) + + su n15 [aw=weight_p] if q5ind==`a1' & urban==1 + local n15q`a1'urban = r(mean) + + su n15 [aw=weight_p] if q5ind==`a1' & urban==0 + local n15q`a1'rural = r(mean) + } //a1 + } //rN + } //age + cap drop q5ind tmp n15 + + //Convert to HH + bys hhid: egen double pop = total(weight_p) + duplicates drop hhid, force + + clonevar weight_use = pop + + //quintiles + _ebin gallT_ppp [aw=pop], gen(q5) nq(5) + gen test = 1 + gen rural = urban==0 + des,sh + tempfile databfsim + save `databfsim', replace + + //loop through random assignments + set seed 1234567 + clear + tempfile ctry1 ctry1ln + save `ctry1', replace emptyok + save `ctry1ln', replace emptyok + + noi display _n(1) + noi display in yellow "Number of simulations: $sim" _n(1) + noi mata: display("{txt}{hline 4}{c +}{hline 3} 1 " + "{hline 3}{c +}{hline 3} 2 " + "{hline 3}{c +}{hline 3} 3 " + "{hline 3}{c +}{hline 3} 4 " + "{hline 3}{c +}{hline 3} 5 ") + + qui forv sim=1(1)$sim { + use `databfsim', clear + + //findex access no_accountq`i'total + if `findex'==1 { + gen dep_fin = . + foreach nm in urban rural { + forv i=1(1)5 { + cap drop _a`i' + if (`no_accountq`i'`nm'' > 0) { + *wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`no_accountq`i'`nm'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + local adjfin = 100*((`=`no_accountq`i'`nm''/100')^(0.6*`n15q`i'`nm'')) + wsample test [aw=pop] if q5==`i' & `nm'==1, percent(`adjfin') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' & `nm'==1 + } + replace dep_fin = _a`i' if q5==`i' & `nm'==1 + drop _a`i' + } //i + } //urb-rul + gen fin_flag = 0 + } + else { //missing + gen dep_fin = . + gen fin_flag = 1 + } + + //SP access _pop_All_SPL_q`i' + /* + if `aspire_sp'==1 { + gen dep_sp = . 
+ forv i=1(1)5 { + cap drop _a`i' + if (`_pop_All_SPL_q`i'' > 0) { + wsample test [aw=pop] if q5==`i', percent(`_pop_All_SPL_q`i'') newvar(_a`i') seed(`=1234567+`i'*`sim'') numsim(1) + } + else { + gen _a`i' = 0 if q5==`i' + } + replace dep_sp = 1-_a`i' if q5==`i' + drop _a`i' + } //i + + gen sp_flag = 0 + } + else { //missing + gen dep_sp = . + gen sp_flag = 1 + } + */ + //ASPIRE IND = 100, June 15 version + gen dep_sp = 0 + + //multidimensional vulnerability + foreach num of numlist 215 365 685 { + //vulnerable and one dim + gen pov1_edu_`num' = 0 + replace pov1_edu_`num' = 1 if poor`num'_ln==1 & dep_educ_com==1 + + gen pov1_sp_`num' = 0 + replace pov1_sp_`num' = 1 if poor`num'_ln==1 & dep_sp==1 + + gen pov1_fin_`num' = 0 + replace pov1_fin_`num' = 1 if poor`num'_ln==1 & dep_fin==1 + + gen pov1_elec_`num' = 0 + replace pov1_elec_`num' = 1 if poor`num'_ln==1 & dep_infra_elec==1 + + gen pov1_water_`num' = 0 + replace pov1_water_`num' = 1 if poor`num'_ln==1 & dep_infra_impw==1 + + //rsum + egen dim6_`num' = rowtotal(poor`num'_ln dep_educ_com dep_sp dep_fin dep_infra_elec dep_infra_impw), missing + + //any of the 6 dimensions - deprived in education; dep_sp; dep_fin + gen multvul_`num' = 0 + replace multvul_`num' = 1 if dim6_`num'>=1 & dim6_`num'~=. 
+ + // any 2, 3, 4,...,6 + forv j=2(1)6 { + gen all`j'vul_`num' = 0 + replace all`j'vul_`num' = 1 if dim6_`num'==`j' + } + } //povlist + gen _all_ = "All sample" + + gen sim = `sim' + gen _count=1 + //collapse to get indicators + compress + tempfile data2 + save `data2', replace + + *local lvllist _all_ urban2 subnatid subnatid1 /*db040 */ + local lvllist _all_ urb2 `subnatvar' + qui foreach var of local lvllist { + use `data2', clear + clonevar h = pop + *clonevar h_ln = pop + *clonevar wta_pov = pop + replace `var' = stritrim(`var') + replace `var' = ustrtrim(`var') + + levelsof `var', local(lvllist2) + cap confirm string variable `var' + if _rc==0 local st = 1 + else local st = 0 + + qui groupfunction [aw=pop], mean(gallT_ppp poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6*) rawsum(_count h) by(sim `var') + + rename gallT_ppp mean_ln + ren _count nohh + ren h noind + egen double totalhh = total(nohh) + egen double totalind = total(noind) + gen sh_hh = nohh/totalhh + gen sh_pop = noind/totalind + + ren `var' sample + gen level = "`var'" + gen code = "`code'" + gen lineupyear = `lineupyear' + gen baseyear = `baseyear' + gen survname = "`survname'" + gen str welfaretype = "`welfaretype'" + + append using `ctry1ln' + order code baseyear lineupyear survname welfaretype level sample sim mean_ln poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* total* sh_* nohh noind + save `ctry1ln', replace + } //foreach subnat + + if (mod(`sim',50)==0){ + noi display in white ". `sim'" _continue + noi display _n(0) + } + else noi display "." 
_continue + } //sim + //collapse across sim + + //save results + use `ctry1ln', replace + + groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) + gen simout = `simout' + order code survname level sample baseyear lineupyear mean_ln poor215_ln poor685_ln dep_* multvul_* all*vul* pov1* dim6* total* sh_* nohh noind + saveold "${upath2}\03.intermediate\Sim\2021\temp\IND_2021_CPHS_2021_`simout'", replace + append using `dataall' + save `dataall', replace +} //simout + +use `dataall', clear +ta simout +groupfunction, mean(poor* multvul_* all6vul_* all5vul_* all4vul_* all3vul_* all2vul_* dep_* pov1_* dim6* mean_ln total* sh_* nohh noind) by(code baseyear lineupyear survname level sample) +gen todo = 1 +saveold "${upath2}\03.intermediate\Sim\2021\IND_2021_CPHS_2021", replace + diff --git a/01.code/dofile/2-5 Combine vul estimates full.do b/01.code/dofile/2-5 Combine vul estimates full.do old mode 100644 new mode 100755 index edf79f2..811a3c1 --- a/01.code/dofile/2-5 Combine vul estimates full.do +++ b/01.code/dofile/2-5 Combine vul estimates full.do @@ -1,223 +1,206 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org -*! Ben James Brunckhorst - bbrunckhorst@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- -//Load all data and check subnational data together with other data - -clear all - -//setting -global rnd AM24 -global sim 100 -*global upath2 -global lnyear 2021 - -tempfile data1 pop geoiddata -save `data1', replace emptyok - -//import standard geoid -import excel using "${upath2}\\02.input\SPID boundaries.xlsx", sheet("SPID boundaries") first clear -gen sample_clean = ustrregexra(sample,`"[^a-zA-Z0-9]"',"") -replace sample_clean = upper(sample_clean) -isid code year survname sample_clean -save `geoiddata', replace - -//Population from PIP -use "${upath2}\\02.input\code_inc_pop_regpcn.dta", clear -keep if year==$lnyear -ren pop* pop*_pip -save `pop', replace - -//Combine data -local files : dir "${upath2}\\03.intermediate\Sim\\${lnyear}\" files "*.dta", nofail respect -qui foreach file of local files { - use "${upath2}\\03.intermediate\Sim\\${lnyear}\\`file'", clear - append using `data1' - save `data1', replace -} - -use `data1', clear -//update todo from LIS: AUS CAN DEU GBR ISR JPN KOR TWN USA -ta code if strpos(survname,"-LIS")>0 & todo==. - -replace todo = 1 if strpos(survname,"-LIS")>0 & todo==. - -replace nohh = _count if nohh==. & _count~=. -replace noind = h if noind==. & h~=. 
-drop h _count -replace noind = noind/1000000 -drop if code=="GMB" & level=="region" -merge m:1 code using `pop', keepus(pop_pip pop_rural_pip pop_urban_pip) -drop if _merge==2 -drop _merge -save `data1', replace - -use `data1', clear -keep code -duplicates drop code, force -saveold "${upath2}\\03.intermediate\Data_vul_${lnyear}_codelist.dta" , replace - -use `data1', clear -isid code survname level sample baseyear lineupyear -sort code survname level sample baseyear lineupyear -keep if level=="_all_" -saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_national.dta" , replace - -use `data1', clear -isid code survname level sample baseyear lineupyear -sort code survname level sample baseyear lineupyear -saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_temp.dta" , replace - -use "${upath2}\\03.intermediate\Survey_vul_${lnyear}_temp.dta", clear -ren level byvar - -//fix 4 countries -drop if code=="KOR" & sample==".*_*." -drop if code=="NLD" & sample=="1-." & byvar=="subnatid" -drop if code=="SVN" & byvar=="subnatid" -*drop if code=="SVN" & sample=="1-Si0" -drop if code=="TWN" & byvar=="reg_rural" - -split sample , parse("*_*") -ren sample sample_org -ren sample2 data_group -ren sample1 sample -drop if sample=="MISSING"| sample=="NA" | sample=="Missing" | sample==""|sample==". –"|sample==".-" -replace sample = "5 - Littoral" if sample=="6 - Littoral" & code=="CMR" - -//conflict MAR -drop if (sample=="11 - Laayoune-Sakia Al Hamra"|sample=="12 - Dakhla-Oued Eddahab") & code=="MAR" -drop if sample=="67 – Sevastopol City" & code=="RUS" - -replace byvar = "reg_rural" if (code=="CHN"|code=="IND") & byvar == "reg_urb" - -gen data = "" -replace data = "All" if byvar=="_all_" -replace data = "All" if byvar=="urb2" -replace data = "Subnat" if byvar=="reg_rural" -replace data = "All" if data=="" -drop if sample_org==".-*_*" | sample_org==".*_*." | sample_org==". 
– Not Applicable (No Stratification)*_*" | sample_org=="Missing" | sample_org=="*_*0" - -replace data_group = "national" if byvar=="_all_" -replace data_group = "urbrur" if byvar=="urb2" -replace data_group = "subnat" if byvar=="subnatid" |byvar=="subnatid1"|byvar=="subnatid2"|byvar=="subnatidsurvey"|byvar=="gaul_adm1_str"|byvar=="gaul_adm1_code" - -replace data_group = "Urban" if data_group=="1" & byvar=="reg_rural" -replace data_group = "Rural" if data_group=="0" & byvar=="reg_rural" - -replace data_group = "subnat" if data_group=="." & data=="Subnat" & (code=="AUS" | code=="GBR" | code=="TWN") -replace data = "All" if (code=="AUS"|code=="GBR"|code=="TWN") - -//Country with national level only (even when subnation is available but not representative) -drop if (code=="HND"|code=="JAM"|code=="TWN") & data=="Subnat" -drop if (code=="HND"|code=="JAM"|code=="TWN") & data=="All" & data_group=="subnat" -*ECU -drop if code=="IND" & data=="All" & data_group=="subnat" -drop if code=="PAN" & data=="All" & data_group=="subnat" - -//drop obs with missing data_group, selected countries -drop if data_group=="." 
& data=="Subnat" & (code=="USA") -drop if data_group=="" & data=="Subnat" & (code=="UKR") -drop if data_group=="" & data=="Subnat" & (code=="PSE") -drop if data_group=="" & data=="Subnat" & (code=="NAM") - -order code data data_group sample -sort code data data_group sample - -bys code sample: gen ct = _N -drop if ct==3 & data=="All" - -bys code data (sample): gen ct2 = _N -bys code: gen y = _N -drop if ct2==1 & data=="All" & y~=1 -drop if (ct2==2|ct2==3) & data=="All" & ct~=3 -bys code (byvar): egen ng = nvals(data_group) -drop if ng==2 & data_group=="national" - -drop if code=="IND" & (byvar=="urb2"|byvar=="_all_") -drop if code=="KAZ" & byvar=="subnatid" & data=="All" -drop if code=="LKA" & data=="All" & (byvar=="_all_"|byvar=="subnatid2") -drop if code=="ISR" & sample=="[70]Yehuda and Shomron" - -drop if code=="MWI" & data=="All" & (byvar=="subnatid2"|byvar=="_all_") -drop if code=="PAN" & data=="All" & (byvar=="_all_") -drop if code=="TUR" & data=="All" & (byvar=="_all_") -drop if code=="MDV" & data_group=="national" & baseyear==2019 - -drop if code=="MDV" & data=="Subnat" & baseyear==2019 -drop if code=="TON" & data_group=="national" & baseyear==2021 -drop if code=="TON" & data_group=="subnat" & baseyear==2021 - -//not sure sample -drop if code=="IND" & sample=="Telangana" - -//fix data with more than 2 data groups -bys code: egen ndata1 =nvals(data) -ta code data if ndata1==2 -drop if ndata1==2 & data=="All" & (code=="BWA" |code=="COG") -drop if ndata1==2 & data=="All" & (code=="ESP" |code=="KGZ") -drop if ndata1==2 & data=="All" & (code=="MUS" |code=="RWA") -drop if ndata1==2 & data=="All" & (code=="SYC" |code=="VUT") -drop if ndata1==2 & data=="All" & (code=="WSM") - -//merge with geoid data -replace year = baseyear if year==. 
-gen sample_clean = ustrregexra(sample,`"[^a-zA-Z0-9]"',"") -replace sample_clean = upper(sample_clean) - -merge m:1 code year survname sample_clean using `geoiddata', keepus(geo_code geo_code2_new) -drop if _merge==2 -drop _merge -sort code data sample data_group - -//Add geocode manually -bys code: gen ct0=_N -levelsof code if ct0==1,local(adm0list) -dis `"`adm0list'"' -foreach c1 of local adm0list { - replace geo_code = "`c1'_2020_WB0" if ct0==1 & code=="`c1'" & geo_code=="" -} - -//ctry with urban/rural at national level -local ctrynat1 CHE CYP DNK EST GAB GRD GTM HRV IRL ISL KIR KOR LCA LTU LUX LVA MHL NOR PRT SVK SVN TON TUR TUV URY ECU SSD -foreach c1 of loca ctrynat1 { - replace geo_code = "`c1'_2020_WB0" if code=="`c1'" & geo_code=="" & ct0==2 -} - -drop ct ct2 ng y ct0 ndata1 -bys code geo_code: gen x = _N - -gen degurban = "" -replace degurban = data_group if x==2 -replace degurban = "national" if x==1 -replace degurban = lower(degurban) -drop x - -bys code: egen double x1 = total(nohh) -bys code: egen double x2 = total(noind) -ren sh_hh sh_hh1 -ren sh_pop sh_pop1 -gen double sh_hh = nohh/x1 -gen double sh_pop = noind/x2 -drop x1 x2 - -sort code data sample data_group - -isid geo_code degurban -*br code data sample data_group geo* degurban baseyear todo - +clear all + +//setting +global rnd AM24 +global sim 100 + +global lnyear 2021 + +tempfile data1 pop geoiddata +save `data1', replace emptyok + +//import standard geoid +import excel using "${upath2}\\02.input\SPID boundaries.xlsx", sheet("SPID boundaries") first clear +gen sample_clean = ustrregexra(sample,`"[^a-zA-Z0-9]"',"") +replace sample_clean = upper(sample_clean) +isid code year survname sample_clean +save `geoiddata', replace + +//Population from PIP +use "${upath2}\\02.input\code_inc_pop_regpcn.dta", clear +keep if year==$lnyear +ren pop* pop*_pip +save `pop', replace + +//Combine data +local files : dir "${upath2}\\03.intermediate\Sim\\${lnyear}\" files "*.dta", nofail respect +qui foreach 
file of local files { + use "${upath2}\\03.intermediate\Sim\\${lnyear}\\`file'", clear + append using `data1' + save `data1', replace +} + +use `data1', clear +//update todo from LIS: AUS CAN DEU GBR ISR JPN KOR TWN USA +ta code if strpos(survname,"-LIS")>0 & todo==. + +replace todo = 1 if strpos(survname,"-LIS")>0 & todo==. + +replace nohh = _count if nohh==. & _count~=. +replace noind = h if noind==. & h~=. +drop h _count +replace noind = noind/1000000 +drop if code=="GMB" & level=="region" +merge m:1 code using `pop', keepus(pop_pip pop_rural_pip pop_urban_pip) +drop if _merge==2 +drop _merge +save `data1', replace + +use `data1', clear +keep code +duplicates drop code, force +saveold "${upath2}\\03.intermediate\Data_vul_${lnyear}_codelist.dta" , replace + +use `data1', clear +isid code survname level sample baseyear lineupyear +sort code survname level sample baseyear lineupyear +keep if level=="_all_" +saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_national.dta" , replace + +use `data1', clear +isid code survname level sample baseyear lineupyear +sort code survname level sample baseyear lineupyear +saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_temp.dta" , replace + +use "${upath2}\\03.intermediate\Survey_vul_${lnyear}_temp.dta", clear +ren level byvar + +//fix 4 countries +drop if code=="KOR" & sample==".*_*." +drop if code=="NLD" & sample=="1-." & byvar=="subnatid" +drop if code=="SVN" & byvar=="subnatid" +*drop if code=="SVN" & sample=="1-Si0" +drop if code=="TWN" & byvar=="reg_rural" + +split sample , parse("*_*") +ren sample sample_org +ren sample2 data_group +ren sample1 sample +drop if sample=="MISSING"| sample=="NA" | sample=="Missing" | sample==""|sample==". 
–"|sample==".-" +replace sample = "5 - Littoral" if sample=="6 - Littoral" & code=="CMR" +replace sample = "10 - Maputo Province" if sample=="10 – Maputo Provincia" & code=="MOZ" +//conflict MAR +drop if (sample=="11 - Laayoune-Sakia Al Hamra"|sample=="12 - Dakhla-Oued Eddahab") & code=="MAR" +drop if sample=="67 – Sevastopol City" & code=="RUS" + +replace byvar = "reg_rural" if (code=="CHN"|code=="IND") & byvar == "reg_urb" + +gen data = "" +replace data = "All" if byvar=="_all_" +replace data = "All" if byvar=="urb2" +replace data = "Subnat" if byvar=="reg_rural" +replace data = "All" if data=="" +drop if sample_org==".-*_*" | sample_org==".*_*." | sample_org==". – Not Applicable (No Stratification)*_*" | sample_org=="Missing" | sample_org=="*_*0" + +replace data_group = "national" if byvar=="_all_" +replace data_group = "urbrur" if byvar=="urb2" +replace data_group = "subnat" if byvar=="subnatid" |byvar=="subnatid1"|byvar=="subnatid2"|byvar=="subnatidsurvey"|byvar=="gaul_adm1_str"|byvar=="gaul_adm1_code" + +replace data_group = "Urban" if data_group=="1" & byvar=="reg_rural" +replace data_group = "Rural" if data_group=="0" & byvar=="reg_rural" + +replace data_group = "Urban" if data_group=="1.Urban" & byvar=="reg_rural" +replace data_group = "Rural" if data_group=="0.Rural" & byvar=="reg_rural" + +replace data_group = "subnat" if data_group=="." & data=="Subnat" & (code=="AUS" | code=="GBR" | code=="TWN") +replace data = "All" if (code=="AUS"|code=="GBR"|code=="TWN") + +//Country with national level only (even when subnation is available but not representative) +drop if (code=="HND"|code=="JAM"|code=="TWN") & data=="Subnat" +drop if (code=="HND"|code=="JAM"|code=="TWN") & data=="All" & data_group=="subnat" +*ECU +drop if code=="IND" & data=="All" & data_group=="subnat" +drop if code=="PAN" & data=="All" & data_group=="subnat" + +//drop obs with missing data_group, selected countries +drop if data_group=="." 
& data=="Subnat" & (code=="USA") +drop if data_group=="" & data=="Subnat" & (code=="UKR") +drop if data_group=="" & data=="Subnat" & (code=="PSE") +drop if data_group=="" & data=="Subnat" & (code=="NAM") + +order code data data_group sample +sort code data data_group sample + +bys code sample: gen ct = _N +drop if ct==3 & data=="All" + +bys code data (sample): gen ct2 = _N +bys code: gen y = _N +drop if ct2==1 & data=="All" & y~=1 +drop if (ct2==2|ct2==3) & data=="All" & ct~=3 +bys code (byvar): egen ng = nvals(data_group) +drop if ng==2 & data_group=="national" + +drop if code=="IND" & (byvar=="urb2"|byvar=="_all_") +drop if code=="KAZ" & byvar=="subnatid" & data=="All" +drop if code=="LKA" & data=="All" & (byvar=="_all_"|byvar=="subnatid2") +drop if code=="ISR" & sample=="[70]Yehuda and Shomron" + +drop if code=="MWI" & data=="All" & (byvar=="subnatid2"|byvar=="_all_") +drop if code=="PAN" & data=="All" & (byvar=="_all_") +drop if code=="TUR" & data=="All" & (byvar=="_all_") +drop if code=="MDV" & data_group=="national" & baseyear==2019 + +drop if code=="MDV" & data=="Subnat" & baseyear==2019 +drop if code=="TON" & data_group=="national" & baseyear==2021 +drop if code=="TON" & data_group=="subnat" & baseyear==2021 + +//not sure sample +drop if code=="IND" & sample=="Telangana" + +//fix data with more than 2 data groups +bys code: egen ndata1 =nvals(data) +ta code data if ndata1==2 +drop if ndata1==2 & data=="All" & (code=="BWA" |code=="COG") +drop if ndata1==2 & data=="All" & (code=="ESP" |code=="KGZ") +drop if ndata1==2 & data=="All" & (code=="MUS" |code=="RWA") +drop if ndata1==2 & data=="All" & (code=="SYC" |code=="VUT") +drop if ndata1==2 & data=="All" & (code=="WSM") + +//merge with geoid data +replace year = baseyear if year==. 
+gen sample_clean = ustrregexra(sample,`"[^a-zA-Z0-9]"',"") +replace sample_clean = upper(sample_clean) + +merge m:1 code year survname sample_clean using `geoiddata', keepus(geo_code geo_code2_new) +drop if _merge==2 +drop _merge +sort code data sample data_group + +//Add geocode manually +bys code: gen ct0=_N +levelsof code if ct0==1,local(adm0list) +dis `"`adm0list'"' +foreach c1 of local adm0list { + replace geo_code = "`c1'_2020_WB0" if ct0==1 & code=="`c1'" & geo_code=="" +} + +//ctry with urban/rural at national level +local ctrynat1 CHE CYP DNK EST GAB GRD GTM HRV IRL ISL KIR KOR LCA LTU LUX LVA MHL NOR PRT SVK SVN TON TUR TUV URY ECU SSD +foreach c1 of loca ctrynat1 { + replace geo_code = "`c1'_2020_WB0" if code=="`c1'" & geo_code=="" & ct0==2 +} + +drop ct ct2 ng y ct0 ndata1 +bys code geo_code: gen x = _N + +gen degurban = "" +replace degurban = data_group if x==2 +replace degurban = "national" if x==1 +replace degurban = lower(degurban) +drop x + +bys code: egen double x1 = total(nohh) +bys code: egen double x2 = total(noind) +ren sh_hh sh_hh1 +ren sh_pop sh_pop1 +gen double sh_hh = nohh/x1 +gen double sh_pop = noind/x2 +drop x1 x2 + +sort code data sample data_group + +isid geo_code degurban +*br code data sample data_group geo* degurban baseyear todo + saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}", replace \ No newline at end of file diff --git a/01.code/dofile/2-6a Merge exposure, rai, and vul estimates.do b/01.code/dofile/2-6a Merge exposure, rai, and vul estimates.do old mode 100644 new mode 100755 index 9a9bb69..f099c50 --- a/01.code/dofile/2-6a Merge exposure, rai, and vul estimates.do +++ b/01.code/dofile/2-6a Merge exposure, rai, and vul estimates.do @@ -1,327 +1,307 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org -*! 
Ben James Brunckhorst - bbrunckhorst@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . - - -clear -global rnd AM24 -global lnyear 2021 -tempfile data1 data2 data3 data4 fullctry missreg dataall -*global upath2 - -*** EXPOSED DATA - onetime run -//1-ANY exposure + RAI -import delimited "${upath2}\\03.intermediate\Exposure\\${lnyear}\\am24exp_clean.csv", clear delim(",") asdouble varn(1) -drop if code=="" & geo_code=="" -compress -save `data1', replace -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_raw", replace - -//national -use `data1', clear -collapse (sum) totalpop* exp_*, by(code scenario) -gen dtype = "National" - -ren exp*rai exprai* -ren exprai_drought_ exprai_drought -ren exprai_flood_ exprai_flood -ren exprai_heat_ exprai_heat -ren exprai_cyclone_ exprai_cyclone -ren exprai_any_ exprai_any - -reshape long exp_ exprai_, i(code scenario totalpop dtype) j(hazard) string -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_national", replace - -//national - dou_code exposure -use `data1', clear -collapse (sum) totalpop* exp_*, by(code scenario dou_code) -gen geo_code = code + "_2020_WB0" -gen dtype = "National-dou" -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou", replace - -//national - dou_code pop -*saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou_pop", replace - -//area/national pop -use 
`data1', clear -collapse (sum) totalpop* exp_*, by(code geo_code scenario) -gen dtype = "Area" -keep if scenario=="RP100*" -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop", replace - -//national boundary -gen x = strpos(geo_code, "_WB0")>0 -drop if x==1 -drop x -collapse (sum) totalpop* exp_*, by(code scenario) -gen geo_code = code + "_2020_WB0" -append using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop" -isid code geo_code -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop", replace - -//area exposure -use `data1', clear -collapse (sum) totalpop* exp_*, by(code geo_code scenario) -gen dtype = "Area" -ren exp*rai exprai* -ren exprai_drought_ exprai_drought -ren exprai_flood_ exprai_flood -ren exprai_heat_ exprai_heat -ren exprai_cyclone_ exprai_cyclone -ren exprai_any_ exprai_any - -reshape long exp_ exprai_, i(code geo_code scenario totalpop dtype) j(hazard) string -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area", replace - -//2-dtypes -use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear -keep code data data_group sample sh_pop geo_code -sort code geo_code data_group -bys code geo_code (data_group): egen t1 = total(sh_pop) -gen sh_urbrur = sh_pop/t1 -drop t1 sh_pop data - -bys code (sample): gen t1 = _N -bys code sample (data_group): gen t2 = _N - -//national boundary -gen x = strpos(geo_code, "_WB0")>0 - -gen dtype = "National" if t1==1 -replace dtype = "Area" if t2==1 & dtype=="" -replace dtype = "National-Urbrur" if t2==2 & dtype=="" & x==1 -replace dtype = "Area-Urbrur" if t2==2 & dtype=="" -*replace dtype = "National" if code=="URY" & geo_code=="URY_2020_WB0" -drop t1 t2 x - -reshape wide sh_urbrur, i(code sample geo_code dtype) j( data_group ) string - -saveold "${upath2}\\03.intermediate\Survey_dtype", replace - -//3-Adjust urban-rural in exposed data (dtypes: National-Urbrur and Area-Urbrur) -//3a-Area-Urbrur -use 
"${upath2}\\03.intermediate\Survey_dtype", clear -keep if dtype=="Area-Urbrur" -drop sh_urbrurnational sh_urbrursubnat -merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_raw" -//check if there is no _merge==1 -count if _merge==1 -assert r(N)==0 -drop if _merge==2 -drop _merge -save `data4', replace - -//3b-National-Urbrur -use "${upath2}\\03.intermediate\Survey_dtype", clear -keep if dtype=="National-Urbrur" -drop sh_urbrurnational sh_urbrursubnat -merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou" -//check if there is no _merge==1 -count if _merge==1 -assert r(N)==0 -drop if _merge==2 -drop _merge -append using `data4' - -sort code geo_code scenario dou_code -bys code geo_code scenario (dou_code): egen pop_area = total(totalpop) -gen sh_pop_e = totalpop/pop_area -gen x = -1*dou_code -bysort code geo_code scenario (x): gen seq = _n -bysort code geo_code scenario (x): gen x1 = sum(sh_pop_e) -gen diff = sh_urbrurUrban-x1 -gen urb = sh_urbrurUrban if seq==1 & diff<0 - -replace urb = sh_pop_e if diff>0 & diff~=. -bysort code geo_code scenario (x): gen x2 = sh_urbrurUrban-x1[_n-1] if diff<0 -replace urb = x2 if x20 & x2~=. -replace urb = 0 if urb==. - -//check urb -bysort code geo_code scenario (x): egen y = total(urb) -gen y1 = y- sh_urbrurUrban -su y1 //mean is 0. - -gen rur = . -replace rur = sh_pop_e - urb if seq==1 & sh_pop_e>urb & rur==. -replace rur = sh_pop_e if urb==0 -replace rur = sh_pop_e - x2 if x2>0 & x2~=. & rur==. -replace rur = 0 if rur==. 
- -//check rur -bysort code geo_code scenario (x): egen z = total(rur) -gen d = y+z -su d //mean is 1 - -drop y y1 x1 diff x2 z d - -local vlist2 totalpop exp_drought exp_flood exp_heat exp_cyclone exp_any totalpop_rai exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai -foreach var of local vlist2 { - gen `var'1 = `var'*urb/sh_pop_e - gen `var'2 = `var'*rur/sh_pop_e - drop `var' -} - -drop sh_pop_e x seq pop_area - -//collapse 7degurban into one level - 1 urban, 2 rural -collapse (sum) totalpop* exp_*, by(code geo_code scenario dtype) - -reshape long totalpop totalpop_rai exp_drought exp_flood exp_heat exp_cyclone exp_any exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai, i(code geo_code scenario dtype) j(urbrur) - -gen data_group = "Urban" if urbrur==1 -replace data_group = "Rural" if urbrur==2 -drop urbrur - -order totalpop exp_drought exp_flood exp_heat exp_cyclone exp_any totalpop_rai exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai, after(data_group) - -*gen dtype = "Area-Urbrur" -isid code geo_code scenario data_group -compress - -ren exp*rai exprai* -ren exprai_drought_ exprai_drought -ren exprai_flood_ exprai_flood -ren exprai_heat_ exprai_heat -ren exprai_cyclone_ exprai_cyclone -ren exprai_any_ exprai_any - -reshape long exp_ exprai_, i(code geo_code scenario data_group totalpop dtype) j(hazard) string -saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_urbrur", replace - -*** EXPOSED DATA AND SURVEY -//4-Add missing areas in countries to the list - NOT SIMPLE as there is no correct POP share for the missing areas. 
-use "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\exp_dou_rai_raw", clear -keep code geo_code code -duplicates drop code geo_code, force -bys code (geo_code): gen x = _N - -//drop country with no data -*drop if sample=="-1" & x==1 -//country with some data -save `fullctry', replace - -//Bring in missing areas in country with datam, assign national average numbers -use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear -keep code geo_code -duplicates drop code geo_code, force -gen survey = 1 -merge 1:1 code geo_code using `fullctry' -bys code: egen mn = mean(_merge) -keep if mn>2 & mn<3 -drop if survey==1 -drop x _merge mn -merge m:1 code using "${upath2}\\03.intermediate\Survey_vul_2021_national.dta" -drop if _merge==2 -gen degurban = "national" -gen missing_area = "yes" -drop _merge survey - -append using "${upath2}\\03.intermediate\Survey_vul_${lnyear}" - -//Add in grid pop to rescale the population of missing areas and existing areas in the surveys -merge m:1 code geo_code using "${upath2}\\03.intermediate\Exposure\\${lnyear}\\exp_dou_rai_area_pop", keepus(totalpop) -//check if there is no _merge==1 -count if _merge==1 -assert r(N)==0 - -keep if _merge==3 -drop _merge - -bys code geo_code (data_group): gen seq = _N -bys code geo_code (data_group): replace totalpop = . 
if seq == 2 & _n==2 -bys code: egen double t1 = total(totalpop) -bys code missing_area: egen double t2 = total(totalpop) - -gen double t1area = t2*sh_pop if missing_area=="" -replace t1area = totalpop if missing_area=="yes" - -bys code: egen double t1check = total(t1area) -compare t1 t1check -gen double sh_pop_new = t1area/t1 -drop t1 t1check t2 seq t1area -gen diff = sh_pop - sh_pop_new -ren sh_pop sh_pop_old -ren sh_pop_new sh_pop -drop diff totalpop -saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_withmissing", replace - -//Add exposed to Survey -tempfile data1 data2 data3 fullctry missreg dataall -use "${upath2}\\03.intermediate\Survey_vul_${lnyear}_withmissing", clear -*use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear -sort code geo_code data_group - -bys code (geo_code): gen t1 = _N -*bys code geo_code (data_group): gen t2 = _N -bys code geo_code (degurban): gen t2 = _N - -gen x = strpos(geo_code, "_WB0")>0 -gen dtype = "National" if t1==1 -replace dtype = "Area" if t2==1 & dtype=="" -replace dtype = "National-Urbrur" if t2==2 & dtype=="" & x==1 -replace dtype = "Area-Urbrur" if t2==2 & dtype=="" -drop t1 t2 x -save `data1', replace - -//Urban-rural -use `data1', clear -keep if dtype == "Area-Urbrur" | dtype=="National-Urbrur" -merge 1:m code geo_code data_group using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_urbrur" -ta _merge -keep if _merge==3 -drop _merge -save `data2', replace - -//area -use `data1', clear -keep if dtype == "Area" -merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area" -ta _merge -keep if _merge==3 -drop _merge -append using `data2' -save `data2', replace - -//national -use `data1', clear -keep if dtype == "National" -merge 1:m code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_national" -ta _merge -keep if _merge==3 -drop _merge -append using `data2' -sort scenario code geo_code data_group -save `data2', replace - -//rescale pop 
to WDI/PIP -gen double s = pop_pip*sh_pop - -//reestimate exp as share of population -local vlist totalpop_rai exp_ exprai_ -foreach var of local vlist { - replace `var'= (`var'/totalpop)*s -} - -drop totalpop -ren s totalpop - -saveold "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw_full", replace - -//only countries with all dimensions -keep if todo==1 +clear +global rnd AM24 +global lnyear 2021 +tempfile data1 data2 data3 data4 fullctry missreg dataall + +*** EXPOSED DATA - onetime run +//1-ANY exposure + RAI +import delimited "${upath2}\\03.intermediate\Exposure\\${lnyear}\\am24exp_clean.csv", clear delim(",") asdouble varn(1) +drop if code=="" & geo_code=="" +compress +save `data1', replace +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_raw", replace + +//national +use `data1', clear +collapse (sum) totalpop* exp_*, by(code scenario) +gen dtype = "National" + +ren exp*rai exprai* +ren exprai_drought_ exprai_drought +ren exprai_flood_ exprai_flood +ren exprai_heat_ exprai_heat +ren exprai_cyclone_ exprai_cyclone +ren exprai_any_ exprai_any + +reshape long exp_ exprai_, i(code scenario totalpop dtype) j(hazard) string +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_national", replace + +//national - dou_code exposure +use `data1', clear +collapse (sum) totalpop* exp_*, by(code scenario dou_code) +gen geo_code = code + "_2020_WB0" +gen dtype = "National-dou" +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou", replace + +//national - dou_code pop +*saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou_pop", replace + +//area/national pop +use `data1', clear +collapse (sum) totalpop* exp_*, by(code geo_code scenario) +gen dtype = "Area" +keep if scenario=="RP100*" +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop", replace + +//national boundary +gen x = strpos(geo_code, "_WB0")>0 +drop if x==1 +drop x +collapse (sum) totalpop* exp_*, 
by(code scenario) +gen geo_code = code + "_2020_WB0" +append using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop" +isid code geo_code +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area_pop", replace + +//area exposure +use `data1', clear +collapse (sum) totalpop* exp_*, by(code geo_code scenario) +gen dtype = "Area" +ren exp*rai exprai* +ren exprai_drought_ exprai_drought +ren exprai_flood_ exprai_flood +ren exprai_heat_ exprai_heat +ren exprai_cyclone_ exprai_cyclone +ren exprai_any_ exprai_any + +reshape long exp_ exprai_, i(code geo_code scenario totalpop dtype) j(hazard) string +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area", replace + +//2-dtypes +use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear +keep code data data_group sample sh_pop geo_code +sort code geo_code data_group +bys code geo_code (data_group): egen t1 = total(sh_pop) +gen sh_urbrur = sh_pop/t1 +drop t1 sh_pop data + +bys code (sample): gen t1 = _N +bys code sample (data_group): gen t2 = _N + +//national boundary +gen x = strpos(geo_code, "_WB0")>0 + +gen dtype = "National" if t1==1 +replace dtype = "Area" if t2==1 & dtype=="" +replace dtype = "National-Urbrur" if t2==2 & dtype=="" & x==1 +replace dtype = "Area-Urbrur" if t2==2 & dtype=="" +*replace dtype = "National" if code=="URY" & geo_code=="URY_2020_WB0" +drop t1 t2 x + +reshape wide sh_urbrur, i(code sample geo_code dtype) j( data_group ) string + +saveold "${upath2}\\03.intermediate\Survey_dtype", replace + +//3-Adjust urban-rural in exposed data (dtypes: National-Urbrur and Area-Urbrur) +//3a-Area-Urbrur +use "${upath2}\\03.intermediate\Survey_dtype", clear +keep if dtype=="Area-Urbrur" +drop sh_urbrurnational sh_urbrursubnat +merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_raw" +//check if there is no _merge==1 +count if _merge==1 +assert r(N)==0 +drop if _merge==2 +drop _merge +save `data4', replace + 
+//3b-National-Urbrur +use "${upath2}\\03.intermediate\Survey_dtype", clear +keep if dtype=="National-Urbrur" +drop sh_urbrurnational sh_urbrursubnat +merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_natdou" +//check if there is no _merge==1 +count if _merge==1 +assert r(N)==0 +drop if _merge==2 +drop _merge +append using `data4' + +sort code geo_code scenario dou_code +bys code geo_code scenario (dou_code): egen pop_area = total(totalpop) +gen sh_pop_e = totalpop/pop_area +gen x = -1*dou_code +bysort code geo_code scenario (x): gen seq = _n +bysort code geo_code scenario (x): gen x1 = sum(sh_pop_e) +gen diff = sh_urbrurUrban-x1 +gen urb = sh_urbrurUrban if seq==1 & diff<0 + +replace urb = sh_pop_e if diff>0 & diff~=. +bysort code geo_code scenario (x): gen x2 = sh_urbrurUrban-x1[_n-1] if diff<0 +replace urb = x2 if x20 & x2~=. +replace urb = 0 if urb==. + +//check urb +bysort code geo_code scenario (x): egen y = total(urb) +gen y1 = y- sh_urbrurUrban +su y1 //mean is 0. + +gen rur = . +replace rur = sh_pop_e - urb if seq==1 & sh_pop_e>urb & rur==. +replace rur = sh_pop_e if urb==0 +replace rur = sh_pop_e - x2 if x2>0 & x2~=. & rur==. +replace rur = 0 if rur==. 
+ +//check rur +bysort code geo_code scenario (x): egen z = total(rur) +gen d = y+z +su d //mean is 1 + +drop y y1 x1 diff x2 z d + +local vlist2 totalpop exp_drought exp_flood exp_heat exp_cyclone exp_any totalpop_rai exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai +foreach var of local vlist2 { + gen `var'1 = `var'*urb/sh_pop_e + gen `var'2 = `var'*rur/sh_pop_e + drop `var' +} + +drop sh_pop_e x seq pop_area + +//collapse 7degurban into one level - 1 urban, 2 rural +collapse (sum) totalpop* exp_*, by(code geo_code scenario dtype) + +reshape long totalpop totalpop_rai exp_drought exp_flood exp_heat exp_cyclone exp_any exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai, i(code geo_code scenario dtype) j(urbrur) + +gen data_group = "Urban" if urbrur==1 +replace data_group = "Rural" if urbrur==2 +drop urbrur + +order totalpop exp_drought exp_flood exp_heat exp_cyclone exp_any totalpop_rai exp_drought_rai exp_flood_rai exp_heat_rai exp_cyclone_rai exp_any_rai, after(data_group) + +*gen dtype = "Area-Urbrur" +isid code geo_code scenario data_group +compress + +ren exp*rai exprai* +ren exprai_drought_ exprai_drought +ren exprai_flood_ exprai_flood +ren exprai_heat_ exprai_heat +ren exprai_cyclone_ exprai_cyclone +ren exprai_any_ exprai_any + +reshape long exp_ exprai_, i(code geo_code scenario data_group totalpop dtype) j(hazard) string +saveold "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_urbrur", replace + +*** EXPOSED DATA AND SURVEY +//4-Add missing areas in countries to the list - NOT SIMPLE as there is no correct POP share for the missing areas. 
+use "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\exp_dou_rai_raw", clear +keep code geo_code code +duplicates drop code geo_code, force +bys code (geo_code): gen x = _N + +//drop country with no data +*drop if sample=="-1" & x==1 +//country with some data +save `fullctry', replace + +//Bring in missing areas in country with datam, assign national average numbers +use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear +keep code geo_code +duplicates drop code geo_code, force +gen survey = 1 +merge 1:1 code geo_code using `fullctry' +bys code: egen mn = mean(_merge) +keep if mn>2 & mn<3 +drop if survey==1 +drop x _merge mn +merge m:1 code using "${upath2}\\03.intermediate\Survey_vul_${lnyear}_national.dta" +drop if _merge==2 +gen degurban = "national" +gen missing_area = "yes" +drop _merge survey + +append using "${upath2}\\03.intermediate\Survey_vul_${lnyear}" + +//Add in grid pop to rescale the population of missing areas and existing areas in the surveys +merge m:1 code geo_code using "${upath2}\\03.intermediate\Exposure\\${lnyear}\\exp_dou_rai_area_pop", keepus(totalpop) +//check if there is no _merge==1 +count if _merge==1 +assert r(N)==0 + +keep if _merge==3 +drop _merge + +bys code geo_code (data_group): gen seq = _N +bys code geo_code (data_group): replace totalpop = . 
if seq == 2 & _n==2 +bys code: egen double t1 = total(totalpop) +bys code missing_area: egen double t2 = total(totalpop) + +gen double t1area = t2*sh_pop if missing_area=="" +replace t1area = totalpop if missing_area=="yes" + +bys code: egen double t1check = total(t1area) +compare t1 t1check +gen double sh_pop_new = t1area/t1 +drop t1 t1check t2 seq t1area +gen diff = sh_pop - sh_pop_new +ren sh_pop sh_pop_old +ren sh_pop_new sh_pop +drop diff totalpop +saveold "${upath2}\\03.intermediate\Survey_vul_${lnyear}_withmissing", replace + +//Add exposed to Survey +tempfile data1 data2 data3 fullctry missreg dataall +use "${upath2}\\03.intermediate\Survey_vul_${lnyear}_withmissing", clear +*use "${upath2}\\03.intermediate\Survey_vul_${lnyear}", clear +sort code geo_code data_group + +bys code (geo_code): gen t1 = _N +*bys code geo_code (data_group): gen t2 = _N +bys code geo_code (degurban): gen t2 = _N + +gen x = strpos(geo_code, "_WB0")>0 +gen dtype = "National" if t1==1 +replace dtype = "Area" if t2==1 & dtype=="" +replace dtype = "National-Urbrur" if t2==2 & dtype=="" & x==1 +replace dtype = "Area-Urbrur" if t2==2 & dtype=="" +drop t1 t2 x +save `data1', replace + +//Urban-rural +use `data1', clear +keep if dtype == "Area-Urbrur" | dtype=="National-Urbrur" +merge 1:m code geo_code data_group using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_urbrur" +ta _merge +keep if _merge==3 +drop _merge +save `data2', replace + +//area +use `data1', clear +keep if dtype == "Area" +merge 1:m code geo_code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_area" +ta _merge +keep if _merge==3 +drop _merge +append using `data2' +save `data2', replace + +//national +use `data1', clear +keep if dtype == "National" +merge 1:m code using "${upath2}\\03.intermediate\Exposure\\\${lnyear}\\\exp_dou_rai_national" +ta _merge +keep if _merge==3 +drop _merge +append using `data2' +sort scenario code geo_code data_group +save `data2', replace + +//rescale pop 
to WDI/PIP +gen double s = pop_pip*sh_pop + +//reestimate exp as share of population +local vlist totalpop_rai exp_ exprai_ +foreach var of local vlist { + replace `var'= (`var'/totalpop)*s +} + +drop totalpop +ren s totalpop + +saveold "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw_full", replace + +//only countries with all dimensions +keep if todo==1 saveold "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw", replace \ No newline at end of file diff --git a/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do b/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do old mode 100644 new mode 100755 index 06352a9..86630e0 --- a/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do +++ b/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do @@ -1,83 +1,61 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org -*! Ben James Brunckhorst - bbrunckhorst@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- - -clear -version 18 -global rnd AM24 -global lnyear 2021 -tempfile data1 data2 data3 fullctry missreg dataall - -*global upath2 - -global fileout Tables.xlsx - -/* OLD exposure only without spatial vulnerability -use "${upath2}\\04.output\Exp_vul_${lnyear}_raw", clear -gen multvul_215_exp = multvul_215*exposure -gen all2vul_215_exp = all2vul_215*exposure -*/ - -use "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw", clear -//todo = 1 is the list of countries with all dimensions, which is 104 countries. -replace dep_educ_com = 0 if code=="KOR" & dep_educ_com==. - -//expose and vulnerable (both HH vulnerable and spatial vulnerable) -foreach var of varlist multvul_215 all2vul_215 multvul_365 all2vul_365 multvul_685 all2vul_685 { - gen double `var'_exp = exprai_ + (exp_ - exprai_)*`var' -} - -foreach var of varlist poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin { - gen double `var'_exp = (exp_ )*`var' -} - -// totalpop_rai: total population with spatial vulnerability -// exprai_ : exposed and spatial vulnerability - -gen multvul_215_expold = multvul_215*exp_ -*collect: table (scenario hazard), statistic(sum multvul_215_exp multvul_215_expold totalpop totalpop_rai) nototal nformat(%4.0f) - -//Table WLD -collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ totalpop) nototal nformat(%4.2f) -*collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ multvul_215_expold all2vul_215_exp totalpop) nototal nformat(%4.2f) -collect style header scenario hazard , title(hide) -collect preview -s -collect export "${upath2}\\04.output\\${fileout}", sheet(WLD, replace) modify - -//Single dimension -collect: table (scenario hazard), statistic(sum poor215_ln_exp poor365_ln_exp poor685_ln_exp dep_educ_com_exp dep_infra_elec_exp dep_infra_impw_exp dep_sp_exp dep_fin_exp exprai_ totalpop) nototal nformat(%4.0f) -collect style header scenario hazard , title(hide) -collect preview - -su poor215_ln poor685_ln dep_educ_com 
dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop*pop_pip] if hazard=="any" & scenario=="RP100*" - -collect: table (scenario hazard), statistic(sum totalpop_rai exprai_ totalpop) nototal nformat(%4.0f) -* 2% of popultion is spatial vulnerable. and 1% of population is exposed and spatial vulnerable - -//Country level - any hazard and RP100 -collect: table (code) () if hazard=="any" & scenario=="RP100*", statistic(sum multvul_215_exp all2vul_215_exp totalpop) nformat(%4.0f) -collect style header code , title(hide) -collect preview -collect export "${upath2}\\04.output\\${fileout}", sheet(Country_any_RP100, replace) modify - -sss -table (scenario hazard), statistic(sum multvul_215_exp all2vul_215_exp totalpop) nototal nformat(%4.0f) - -collect: table (pcn_region_code ) (exp2) if (vul2==1 & line==215) , statistic(sum nvul ) nototal nformat(%4.0f) - +clear +version 18 +global rnd AM24 +global lnyear 2021 +tempfile data1 data2 data3 fullctry missreg dataall + +global fileout Tables.xlsx + +/* OLD exposure only without spatial vulnerability +use "${upath2}\\04.output\Exp_vul_${lnyear}_raw", clear +gen multvul_215_exp = multvul_215*exposure +gen all2vul_215_exp = all2vul_215*exposure +*/ + +use "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw", clear +//todo = 1 is the list of countries with all dimensions, which is 104 countries. +replace dep_educ_com = 0 if code=="KOR" & dep_educ_com==. 
+ +//expose and vulnerable (both HH vulnerable and spatial vulnerable) +foreach var of varlist multvul_215 all2vul_215 multvul_365 all2vul_365 multvul_685 all2vul_685 { + gen double `var'_exp = exprai_ + (exp_ - exprai_)*`var' +} + +foreach var of varlist poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin { + gen double `var'_exp = (exp_ )*`var' +} + +// totalpop_rai: total population with spatial vulnerability +// exprai_ : exposed and spatial vulnerability + +gen multvul_215_expold = multvul_215*exp_ +*collect: table (scenario hazard), statistic(sum multvul_215_exp multvul_215_expold totalpop totalpop_rai) nototal nformat(%4.0f) + +//Table WLD +collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ totalpop) nototal nformat(%4.2f) +*collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ multvul_215_expold all2vul_215_exp totalpop) nototal nformat(%4.2f) +collect style header scenario hazard , title(hide) +collect preview +collect export "${upath2}\\04.output\\${fileout}", sheet(WLD, replace) modify + +//Single dimension +collect: table (scenario hazard), statistic(sum poor215_ln_exp poor365_ln_exp poor685_ln_exp dep_educ_com_exp dep_infra_elec_exp dep_infra_impw_exp dep_sp_exp dep_fin_exp exprai_ totalpop) nototal nformat(%4.0f) +collect style header scenario hazard , title(hide) +collect preview + +su poor215_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop*pop_pip] if hazard=="any" & scenario=="RP100*" + +collect: table (scenario hazard), statistic(sum totalpop_rai exprai_ totalpop) nototal nformat(%4.0f) +* 2% of popultion is spatial vulnerable. 
and 1% of population is exposed and spatial vulnerable + +//Country level - any hazard and RP100 +collect: table (code) () if hazard=="any" & scenario=="RP100*", statistic(sum multvul_215_exp all2vul_215_exp totalpop) nformat(%4.0f) +collect style header code , title(hide) +collect preview +collect export "${upath2}\\04.output\\${fileout}", sheet(Country_any_RP100, replace) modify + + +table (scenario hazard), statistic(sum multvul_215_exp all2vul_215_exp totalpop) nototal nformat(%4.0f) + +collect: table (pcn_region_code ) (exp2) if (vul2==1 & line==215) , statistic(sum nvul ) nototal nformat(%4.0f) + diff --git a/01.code/dofile/2-8 Get tables for CSC.do b/01.code/dofile/2-8 Get tables for CSC.do old mode 100644 new mode 100755 index 30d1008..3eb2cea --- a/01.code/dofile/2-8 Get tables for CSC.do +++ b/01.code/dofile/2-8 Get tables for CSC.do @@ -1,259 +1,239 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org -*! Ben James Brunckhorst - bbrunckhorst@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . 
- - -clear -version 18 -global rnd AM24 -global lnyear 2021 -tempfile data1 data2 data3 fullctry missreg dataall dataall2 pop template -save `dataall', replace emptyok -save `dataall2', replace emptyok - -*global upath2 - -global fileout Tables_CSC.xlsx -global fileout1 Tables_Country.xlsx -//import template -import excel "${upath2}\\04.output\For CSC\CSC_Vision Indicator Template.xlsx", sheet(Vision Indicator Template) first -gen seq = _n -unab original : * -save `template', replace - -//Population from PIP -use "${upath2}\\02.input\code_inc_pop_regpcn.dta", clear -keep if year==$lnyear -ren pop* pop*_pip -save `pop', replace - -//Get total population for aggregates -use "${upath2}\\04.output\For CSC\ctrylist.dta", clear -merge 1:1 code using `pop', keepus(pop*_pip) -ta _merge -drop _merge - -gen reg_fy24= "" -replace reg_fy24 = "EAS" if region_fy24=="East Asia & Pacific" -replace reg_fy24 = "ECS" if region_fy24=="Europe & Central Asia" -replace reg_fy24 = "LCN" if region_fy24=="Latin America & Caribbean" -replace reg_fy24 = "MEA" if region_fy24=="Middle East & North Africa" -replace reg_fy24 = "NA" if region_fy24=="North America" -replace reg_fy24 = "SAS" if region_fy24=="South Asia" -replace reg_fy24 = "SSF" if region_fy24=="Sub-Saharan Africa" - -gen inc_fy24 = "" -replace inc_fy24 = "HIC" if income_fy24=="High income" -replace inc_fy24 = "LIC" if income_fy24=="Low income" -replace inc_fy24 = "LMC" if income_fy24=="Lower middle income" -replace inc_fy24 = "UMC" if income_fy24=="Upper middle income" - -gen obs = 1 -save `data1', replace - -//WLD -use `data1', clear -collapse (sum) pop_pip obs -gen group = "WLD" -gen section = "WLD" -save `dataall', replace - -//Regions -use `data1', clear -collapse (sum) pop_pip obs, by(reg_fy24) -ren reg_fy24 group -gen section = "Region" -append using `dataall' -save `dataall', replace - -//income groups -use `data1', clear -collapse (sum) pop_pip obs, by(inc_fy24) -gen section = "Income group" -ren inc_fy24 group -append 
using `dataall' -save `dataall', replace - -//LDC -use `data1', clear -keep if ldc_fy24=="LDC" -collapse (sum) pop_pip obs -gen section = "LDC" -gen group = "LDC" -append using `dataall' -save `dataall', replace - -//FCS -use `data1', clear -keep if fcs_fy24=="FCS" -collapse (sum) pop_pip obs -gen section = "FCS" -gen group = "FCS" -append using `dataall' -save `dataall', replace - -//SST -use `data1', clear -keep if small_states_fy24=="SS" -collapse (sum) pop_pip obs -gen group = "SST" -gen section = "Small states" -append using `dataall' -save `dataall', replace - -//SID -use `data1', clear -keep if sids_fy24=="SIDS" -collapse (sum) pop_pip obs -gen section = "SID" -gen group = "SID" -append using `dataall' -drop if group=="" - -ren pop_pip pop_full -ren obs obs_full -save `dataall', replace - -//Raw exposure and vulnerable with all dimension (todo==1) -use "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw", clear -keep if scenario=="RP100*" & hazard=="any" -replace dep_educ_com = 0 if code=="KOR" & dep_educ_com==. - -//todo = 1 is the list of countries with all dimensions, which is 104 countries. 
- -//expose and vulnerable (both HH vulnerable and spatial vulnerable) -foreach var of varlist multvul_215 all2vul_215 multvul_365 all2vul_365 multvul_685 all2vul_685 { - gen double `var'_exp = exprai_ + (exp_ - exprai_)*`var' -} - -foreach var of varlist poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin { - gen double `var'_exp = (exp_ )*`var' -} - -// totalpop_rai: total population with spatial vulnerability -// exprai_ : exposed and spatial vulnerability - -//WLD table -collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ totalpop) nototal nformat(%4.2f) -collect style header scenario hazard , title(hide) -collect preview - -su poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop*pop_pip] if hazard=="any" & scenario=="RP100*" - -*collapse (first) pop_pip (sum) multvul_215_exp exp_ totalpop, by(code) -collapse (first) pop_pip (rawsum) multvul_215_exp exp_ totalpop (mean) poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop], by(code) -gen double rate = (multvul_215_exp/totalpop)*100 - -//same number as the WLD table -su rate [aw= totalpop] -su rate [aw= pop_pip] -ren pop_pip pop_pip_vul -save `data3', replace - -//Ctry level number to CSC -gen ISO = code -ren rate YR2021 -export excel code YR2021 using "${upath2}\\04.output\For CSC\\${fileout}", sheet(country_level) replace firstrow(variables) keepcellfmt -keep ISO YR2021 -merge 1:1 ISO using `template', update replace -drop _merge -save `template', replace - -use `data3', clear -export excel code rate poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin pop_pip using "${upath2}\\04.output\For CSC\\${fileout1}", sheet(country_level) replace firstrow(variables) keepcellfmt -*save "c:\Temp\Am24", replace - -//merge in the CSC listing -merge 1:1 code using `data1' -keep if _merge==3 -drop _merge -save `data2', replace - -//VUL 
rates -//WLD -use `data2', clear -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] -gen group = "WLD" -save `dataall2', replace - -//Regions -use `data2', clear -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip], by(reg_fy24) -ren reg_fy24 group -append using `dataall2' -save `dataall2', replace - -//income groups -use `data2', clear -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip], by(inc_fy24) -ren inc_fy24 group -append using `dataall2' -save `dataall2', replace - -//LDC -use `data2', clear -keep if ldc_fy24=="LDC" -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] -gen group = "LDC" -append using `dataall2' -save `dataall2', replace - -//FCS -use `data2', clear -keep if fcs_fy24=="FCS" -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] -gen group = "FCS" -append using `dataall2' -save `dataall2', replace - -//SST -use `data2', clear -keep if small_states_fy24=="SS" -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] -gen group = "SST" -append using `dataall2' -save `dataall2', replace - -//SID -use `data2', clear -keep if sids_fy24=="SIDS" -collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] -gen group = "SID" -append using `dataall2' -drop if group=="" - -ren pop_pip pop_cov -ren obs obs_cov -save `dataall2', replace - -merge 1:1 group using `dataall' -gen double pop_share = (pop_cov/pop_full)*100 -drop _merge -sort section group -order section group pop_full obs_full rate pop_share pop_cov obs_cov -drop if group=="NA" & section=="Region" -drop if pop_share < 50 -ren rate YR2021 -export excel using "${upath2}\\04.output\For CSC\\${fileout}", sheet(agg_level, replace) firstrow(variables) keepcellfmt - -ren group ISO -keep ISO YR2021 -merge 1:1 ISO using `template', update replace -sort seq -order `original' -drop seq _merge -export excel using "${upath2}\\04.output\For CSC\\${fileout}", sheet(format, replace) firstrow(variables) keepcellfmt +clear +version 18 +global rnd AM24 +global lnyear 2021 +tempfile data1 data2 data3 
fullctry missreg dataall dataall2 pop template +save `dataall', replace emptyok +save `dataall2', replace emptyok + + +global fileout Tables_CSC.xlsx +global fileout1 Tables_Country.xlsx +//import template +import excel "${upath2}\\04.output\For CSC\CSC_Vision Indicator Template.xlsx", sheet(Vision Indicator Template) first +gen seq = _n +unab original : * +save `template', replace + +//Population from PIP +use "${upath2}\\02.input\code_inc_pop_regpcn.dta", clear +keep if year==$lnyear +ren pop* pop*_pip +save `pop', replace + +//Get total population for aggregates +use "${upath2}\\04.output\For CSC\ctrylist.dta", clear +merge 1:1 code using `pop', keepus(pop*_pip) +ta _merge +drop _merge + +gen reg_fy24= "" +replace reg_fy24 = "EAS" if region_fy24=="East Asia & Pacific" +replace reg_fy24 = "ECS" if region_fy24=="Europe & Central Asia" +replace reg_fy24 = "LCN" if region_fy24=="Latin America & Caribbean" +replace reg_fy24 = "MEA" if region_fy24=="Middle East & North Africa" +replace reg_fy24 = "NA" if region_fy24=="North America" +replace reg_fy24 = "SAS" if region_fy24=="South Asia" +replace reg_fy24 = "SSF" if region_fy24=="Sub-Saharan Africa" + +gen inc_fy24 = "" +replace inc_fy24 = "HIC" if income_fy24=="High income" +replace inc_fy24 = "LIC" if income_fy24=="Low income" +replace inc_fy24 = "LMC" if income_fy24=="Lower middle income" +replace inc_fy24 = "UMC" if income_fy24=="Upper middle income" + +gen obs = 1 +save `data1', replace + +//WLD +use `data1', clear +collapse (sum) pop_pip obs +gen group = "WLD" +gen section = "WLD" +save `dataall', replace + +//Regions +use `data1', clear +collapse (sum) pop_pip obs, by(reg_fy24) +ren reg_fy24 group +gen section = "Region" +append using `dataall' +save `dataall', replace + +//income groups +use `data1', clear +collapse (sum) pop_pip obs, by(inc_fy24) +gen section = "Income group" +ren inc_fy24 group +append using `dataall' +save `dataall', replace + +//LDC +use `data1', clear +keep if ldc_fy24=="LDC" +collapse 
(sum) pop_pip obs +gen section = "LDC" +gen group = "LDC" +append using `dataall' +save `dataall', replace + +//FCS +use `data1', clear +keep if fcs_fy24=="FCS" +collapse (sum) pop_pip obs +gen section = "FCS" +gen group = "FCS" +append using `dataall' +save `dataall', replace + +//SST +use `data1', clear +keep if small_states_fy24=="SS" +collapse (sum) pop_pip obs +gen group = "SST" +gen section = "Small states" +append using `dataall' +save `dataall', replace + +//SID +use `data1', clear +keep if sids_fy24=="SIDS" +collapse (sum) pop_pip obs +gen section = "SID" +gen group = "SID" +append using `dataall' +drop if group=="" + +ren pop_pip pop_full +ren obs obs_full +save `dataall', replace + +//Raw exposure and vulnerable with all dimension (todo==1) +use "${upath2}\\04.output\Exp_vul_rai_${lnyear}_raw", clear +keep if scenario=="RP100*" & hazard=="any" +replace dep_educ_com = 0 if code=="KOR" & dep_educ_com==. + +//todo = 1 is the list of countries with all dimensions, which is 104 countries. 
+ +//expose and vulnerable (both HH vulnerable and spatial vulnerable) +foreach var of varlist multvul_215 all2vul_215 multvul_365 all2vul_365 multvul_685 all2vul_685 { + gen double `var'_exp = exprai_ + (exp_ - exprai_)*`var' +} + +foreach var of varlist poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin { + gen double `var'_exp = (exp_ )*`var' +} + +// totalpop_rai: total population with spatial vulnerability +// exprai_ : exposed and spatial vulnerability + +//WLD table +collect: table (scenario hazard), statistic(sum multvul_215_exp exp_ totalpop) nototal nformat(%4.2f) +collect style header scenario hazard , title(hide) +collect preview + +su poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop*pop_pip] if hazard=="any" & scenario=="RP100*" + +*collapse (first) pop_pip (sum) multvul_215_exp exp_ totalpop, by(code) +collapse (first) pop_pip (rawsum) multvul_215_exp exp_ totalpop (mean) poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin [aw=sh_pop], by(code) +gen double rate = (multvul_215_exp/totalpop)*100 + +//same number as the WLD table +su rate [aw= totalpop] +su rate [aw= pop_pip] +ren pop_pip pop_pip_vul +save `data3', replace + +//Ctry level number to CSC +gen ISO = code +ren rate YR2021 +export excel code YR2021 using "${upath2}\\04.output\For CSC\\${fileout}", sheet(country_level) replace firstrow(variables) keepcellfmt +keep ISO YR2021 +merge 1:1 ISO using `template', update replace +drop _merge +save `template', replace + +use `data3', clear +export excel code rate poor215_ln poor365_ln poor685_ln dep_educ_com dep_infra_elec dep_infra_impw dep_sp dep_fin pop_pip_vul using "${upath2}\\04.output\For CSC\\${fileout1}", sheet(country_level) replace firstrow(variables) keepcellfmt +*save "c:\Temp\Am24", replace + +//merge in the CSC listing +merge 1:1 code using `data1' +keep if _merge==3 +drop _merge +save `data2', replace + 
+//VUL rates +//WLD +use `data2', clear +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] +gen group = "WLD" +save `dataall2', replace + +//Regions +use `data2', clear +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip], by(reg_fy24) +ren reg_fy24 group +append using `dataall2' +save `dataall2', replace + +//income groups +use `data2', clear +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip], by(inc_fy24) +ren inc_fy24 group +append using `dataall2' +save `dataall2', replace + +//LDC +use `data2', clear +keep if ldc_fy24=="LDC" +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] +gen group = "LDC" +append using `dataall2' +save `dataall2', replace + +//FCS +use `data2', clear +keep if fcs_fy24=="FCS" +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] +gen group = "FCS" +append using `dataall2' +save `dataall2', replace + +//SST +use `data2', clear +keep if small_states_fy24=="SS" +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] +gen group = "SST" +append using `dataall2' +save `dataall2', replace + +//SID +use `data2', clear +keep if sids_fy24=="SIDS" +collapse (rawsum) pop_pip obs (mean) rate [aw=pop_pip] +gen group = "SID" +append using `dataall2' +drop if group=="" + +ren pop_pip pop_cov +ren obs obs_cov +save `dataall2', replace + +merge 1:1 group using `dataall' +gen double pop_share = (pop_cov/pop_full)*100 +drop _merge +sort section group +order section group pop_full obs_full rate pop_share pop_cov obs_cov +drop if group=="NA" & section=="Region" +drop if pop_share < 40 +ren rate YR2021 +export excel using "${upath2}\\04.output\For CSC\\${fileout}", sheet(agg_level, replace) firstrow(variables) keepcellfmt + +ren group ISO +keep ISO YR2021 +merge 1:1 ISO using `template', update replace +sort seq +order `original' +drop seq _merge +export excel using "${upath2}\\04.output\For CSC\\${fileout}", sheet(format, replace) firstrow(variables) keepcellfmt diff --git a/01.code/dofile/MASTER dofile.do b/01.code/dofile/MASTER dofile.do 
old mode 100644 new mode 100755 index 56bf4f7..ef7c1d6 --- a/01.code/dofile/MASTER dofile.do +++ b/01.code/dofile/MASTER dofile.do @@ -1,72 +1,53 @@ -*! version 0.1.1 01Aug2024 -*! Copyright (C) World Bank 2024 -*! Minh Cong Nguyen - mnguyen3@worldbank.org -*! Ben James Brunckhorst - bbrunckhorst@worldbank.org - -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation, either version 3 of the License, or -* (at your option) any later version. - -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. - -* You should have received a copy of the GNU General Public License -* along with this program. If not, see . - - -//Vulnerable to poverty and climate 2.0 -//folder structure and sequences - -global rnd AM24 -global upath2 "Your path here" - -//See the excel file (readme) for the folder structure and the sequence of codes -/* -01.code\ - ado\ - dofile\ -02.input\ -03.intermediate\ - Lineupcheck\ - Lineuplist\ - LISoutput\ - PIPinput\ -04.output\ -05.references\ -06.note\ -07.presentations -*/ - -do "$upath2/01.code/dofile/0-0 GMD datacheck.do" -do "$upath2/01.code/dofile/0-1 Get PIP nat lineup number.do" -do "$upath2/01.code/dofile/0-2 Update pop class region.do" -do "$upath2/01.code/dofile/0-7a Findex_quintiles 2021.do" -do "$upath2/01.code/dofile/0-7a Findex_quintiles 2010.do" -do "$upath2/01.code/dofile/0-7b Prep ASPIRE.do" -do "$upath2/01.code/dofile/0-7c Prep JMP.do" -do "$upath2/01.code/dofile/0-7d Prep GED.do" -do "$upath2/01.code/dofile/0-7e Prep UNESCO.do" -do "$upath2/01.code/dofile/0-8 Water and Elec WDI.do" -do "$upath2/01.code/dofile/0-3 Prep data for coverage.do" - -do "$upath2/01.code/dofile/1-1 Get list for LISSY.do" -do "$upath2/01.code/dofile/1-2 
Get list for GMD full.do" - -//Below dofiles need to run in LISSY. -do "$upath2/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do" -do "$upath2/01.code/dofile/2-1b Estimate vul rate for LISSY data.do" -//Get the txt from LISSY, clean it and run the code to format the data -do "$upath2/01.code/dofile/2-1c Extract national data - for LISSY data.do" -do "$upath2/01.code/dofile/2-1d Extract subnat data - for LISSY data.do" - -//Run these in your machine -do "$upath2/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do" -do "$upath2/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do" -do "$upath2/01.code/dofile/2-3 Estimate vul rate for GMD data full.do" -do "$upath2/01.code/dofile/2-5 Combine vul estimates full.do" -do "$upath2/01.code/dofile/2-6a Merge exposure and vul estimates.do" -do "$upath2/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do" -do "$upath2/01.code/dofile/2-8 Get tables for CSC.do" +//Vulnerable to poverty and climate 2.0 +//folder structure and sequences + +global rnd AM24 +global upath2 "path in your computer" + +set varabbrev on + +sysdir set PLUS "$upath2/01.code/ado" + +** Run the needed do files + +*gtools, groupfunction, + +*no need to run +*do "$upath2/01.code/dofile/0-0 GMD datacheck.do" + +do "$upath2/01.code/dofile/0-1 Get PIP nat lineup number.do" +do "$upath2/01.code/dofile/0-2 Update pop class region.do" +do "$upath2/01.code/dofile/0-7a Findex_quintiles 2021.do" +do "$upath2/01.code/dofile/0-7b Prep ASPIRE.do" +do "$upath2/01.code/dofile/0-7c Prep JMP.do" +do "$upath2/01.code/dofile/0-7d Prep GED.do" +do "$upath2/01.code/dofile/0-7e Prep UNESCO.do" +do "$upath2/01.code/dofile/0-8 Water and Elec WDI.do" +do "$upath2/01.code/dofile/0-3 Prep data for coverage.do" + +* no need to run +*do "$upath2/01.code/dofile/0-4a Coverage check.do" + +do "$upath2/01.code/dofile/1-1 Get list for LISSY.do" +do "$upath2/01.code/dofile/1-2 Get list for GMD full.do" + +pause + +* these two do files have to be run in Lissy 
interface. See readme file for details. +*"$upath2/01.code/dofile/2-1a Estimate national vul rate for LISSY data.do" +*$upath2/01.code/dofile/2-1b Estimate vul rate for LISSY data.do" + +do "$upath2/01.code/dofile/2-1c Extract national data - for LISSY data.do" +do "$upath2/01.code/dofile/2-1d Extract subnat data - for LISSY data.do" +do "$upath2/01.code/dofile/2-2 Estimate vul rate for CHN data 2021.do" +do "$upath2/01.code/dofile/2-4 Estimate vul rate for IND data 2021.do" + +do "$upath2/01.code/dofile/2-3 Estimate vul rate for GMD data full.do" + +do "$upath2/01.code/dofile/2-5 Combine vul estimates full.do" +do "$upath2/01.code/dofile/2-6a Merge exposure, rai, and vul estimates.do" + +*do "$upath2/01.code/dofile/2-7 Vul_Exp - Get tables and figures.do" +do "$upath2/01.code/dofile/2-8 Get tables for CSC.do" + + diff --git a/02.input/2021/ASPIRE_data_2021.dta b/02.input/2021/ASPIRE_data_2021.dta deleted file mode 100644 index 73617e7..0000000 Binary files a/02.input/2021/ASPIRE_data_2021.dta and /dev/null differ diff --git a/02.input/2021/GED_cov_2021.dta b/02.input/2021/GED_cov_2021.dta deleted file mode 100644 index 481ebc0..0000000 Binary files a/02.input/2021/GED_cov_2021.dta and /dev/null differ diff --git a/02.input/2021/GMD_list_2021.dta b/02.input/2021/GMD_list_2021.dta deleted file mode 100644 index 424d97f..0000000 Binary files a/02.input/2021/GMD_list_2021.dta and /dev/null differ diff --git a/02.input/2021/JMP_cov_2021.dta b/02.input/2021/JMP_cov_2021.dta deleted file mode 100644 index b4c9b0c..0000000 Binary files a/02.input/2021/JMP_cov_2021.dta and /dev/null differ diff --git a/02.input/2021/UNESCO_cov_2021.dta b/02.input/2021/UNESCO_cov_2021.dta deleted file mode 100644 index 042f5e5..0000000 Binary files a/02.input/2021/UNESCO_cov_2021.dta and /dev/null differ diff --git a/02.input/2021/findex_2021_quintiles.dta b/02.input/2021/findex_2021_quintiles.dta deleted file mode 100644 index 9b3599f..0000000 Binary files 
a/02.input/2021/findex_2021_quintiles.dta and /dev/null differ diff --git a/LICENSE b/LICENSE index ef17e49..5ed8af1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,385 +1,14 @@ -Mozilla Public License Version 2.0 -================================== - -1. Definitions - --------------- - -1.1. "Contributor" - means each individual or legal entity that creates, contributes to - the creation of, or owns Covered Software. - -1.2. "Contributor Version" - means the combination of the Contributions of others (if any) used - by a Contributor and that particular Contributor's Contribution. - -1.3. "Contribution" - means Covered Software of a particular Contributor. - -1.4. "Covered Software" - means Source Code Form to which the initial Contributor has attached - the notice in Exhibit A, the Executable Form of such Source Code - Form, and Modifications of such Source Code Form, in each case - including portions thereof. - -1.5. "Incompatible With Secondary Licenses" - means - - (a) that the initial Contributor has attached the notice described - in Exhibit B to the Covered Software; or - - (b) that the Covered Software was made available under the terms of - version 1.1 or earlier of the License, but not also under the - terms of a Secondary License. - -1.6. "Executable Form" - means any form of the work other than Source Code Form. - -1.7. "Larger Work" - means a work that combines Covered Software with other material, in - a separate file or files, that is not Covered Software. - -1.8. "License" - means this document. - -1.9. "Licensable" - means having the right to grant, to the maximum extent possible, - whether at the time of the initial grant or subsequently, any and - all of the rights conveyed by this License. - -1.10. 
"Modifications" - means any of the following: - - (a) any file in Source Code Form that results from an addition to, - deletion from, or modification of the contents of Covered - Software; or - - (b) any new file in Source Code Form that contains any Covered - Software. - -1.11. "Patent Claims" of a Contributor - means any patent claim(s), including without limitation, method, - process, and apparatus claims, in any patent Licensable by such - Contributor that would be infringed, but for the grant of the - License, by the making, using, selling, offering for sale, having - made, import, or transfer of either its Contributions or its - Contributor Version. - -1.12. "Secondary License" - means either the GNU General Public License, Version 2.0, the GNU - Lesser General Public License, Version 2.1, the GNU Affero General - Public License, Version 3.0, or any later versions of those - licenses. - -1.13. "Source Code Form" - means the form of the work preferred for making modifications. - -1.14. "You" (or "Your") - means an individual or a legal entity exercising rights under this - License. For legal entities, "You" includes any entity that - controls, is controlled by, or is under common control with You. For - purposes of this definition, "control" means (a) the power, direct - or indirect, to cause the direction or management of such entity, - whether by contract or otherwise, or (b) ownership of more than - fifty percent (50%) of the outstanding shares or beneficial - ownership of such entity. - -2. License Grants and Conditions - --------------------------------- - -2.1. 
Grants - -Each Contributor hereby grants You a world-wide, royalty-free, -non-exclusive license: - -(a) under intellectual property rights (other than patent or trademark) - Licensable by such Contributor to use, reproduce, make available, - modify, display, perform, distribute, and otherwise exploit its - Contributions, either on an unmodified basis, with Modifications, or - as part of a Larger Work; and - -(b) under Patent Claims of such Contributor to make, use, sell, offer - for sale, have made, import, and otherwise transfer either its - Contributions or its Contributor Version. - -2.2. Effective Date - -The licenses granted in Section 2.1 with respect to any Contribution -become effective for each Contribution on the date the Contributor first -distributes such Contribution. - -2.3. Limitations on Grant Scope - -The licenses granted in this Section 2 are the only rights granted under -this License. No additional rights or licenses will be implied from the -distribution or licensing of Covered Software under this License. -Notwithstanding Section 2.1(b) above, no patent license is granted by a -Contributor: - -(a) for any code that a Contributor has removed from Covered Software; - or - -(b) for infringements caused by: (i) Your and any other third party's - modifications of Covered Software, or (ii) the combination of its - Contributions with other software (except as part of its Contributor - Version); or - -(c) under Patent Claims infringed by Covered Software in the absence of - its Contributions. - -This License does not grant any rights in the trademarks, service marks, -or logos of any Contributor (except as may be necessary to comply with -the notice requirements in Section 3.4). - -2.4. 
Subsequent Licenses - -No Contributor makes additional grants as a result of Your choice to -distribute the Covered Software under a subsequent version of this -License (see Section 10.2) or under the terms of a Secondary License (if -permitted under the terms of Section 3.3). - -2.5. Representation - -Each Contributor represents that the Contributor believes its -Contributions are its original creation(s) or it has sufficient rights -to grant the rights to its Contributions conveyed by this License. - -2.6. Fair Use - -This License is not intended to limit any rights You have under -applicable copyright doctrines of fair use, fair dealing, or other -equivalents. - -2.7. Conditions - -Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted -in Section 2.1. - -3. Responsibilities - -------------------- - -3.1. Distribution of Source Form - -All distribution of Covered Software in Source Code Form, including any -Modifications that You create or to which You contribute, must be under -the terms of this License. You must inform recipients that the Source -Code Form of the Covered Software is governed by the terms of this -License, and how they can obtain a copy of this License. You may not -attempt to alter or restrict the recipients' rights in the Source Code -Form. - -3.2. 
Distribution of Executable Form - -If You distribute Covered Software in Executable Form then: - -(a) such Covered Software must also be made available in Source Code - Form, as described in Section 3.1, and You must inform recipients of - the Executable Form how they can obtain a copy of such Source Code - Form by reasonable means in a timely manner, at a charge no more - than the cost of distribution to the recipient; and - -(b) You may distribute such Executable Form under the terms of this - License, or sublicense it under different terms, provided that the - license for the Executable Form does not attempt to limit or alter - the recipients' rights in the Source Code Form under this License. - -3.3. Distribution of a Larger Work - -You may create and distribute a Larger Work under terms of Your choice, -provided that You also comply with the requirements of this License for -the Covered Software. If the Larger Work is a combination of Covered -Software with a work governed by one or more Secondary Licenses, and the -Covered Software is not Incompatible With Secondary Licenses, this -License permits You to additionally distribute such Covered Software -under the terms of such Secondary License(s), so that the recipient of -the Larger Work may, at their option, further distribute the Covered -Software under the terms of either this License or such Secondary -License(s). - -3.4. Notices - -You may not remove or alter the substance of any license notices -(including copyright notices, patent notices, disclaimers of warranty, -or limitations of liability) contained within the Source Code Form of -the Covered Software, except that You may alter any license notices to -the extent required to remedy known factual inaccuracies. - -3.5. Application of Additional Terms - -You may choose to offer, and to charge a fee for, warranty, support, -indemnity or liability obligations to one or more recipients of Covered -Software. 
However, You may do so only on Your own behalf, and not on -behalf of any Contributor. You must make it absolutely clear that any -such warranty, support, indemnity, or liability obligation is offered by -You alone, and You hereby agree to indemnify every Contributor for any -liability incurred by such Contributor as a result of warranty, support, -indemnity or liability terms You offer. You may include additional -disclaimers of warranty and limitations of liability specific to any -jurisdiction. - -4. Inability to Comply Due to Statute or Regulation - ---------------------------------------------------- - -If it is impossible for You to comply with any of the terms of this -License with respect to some or all of the Covered Software due to -statute, judicial order, or regulation then You must: (a) comply with -the terms of this License to the maximum extent possible; and (b) -describe the limitations and the code they affect. Such description must -be placed in a text file included with all distributions of the Covered -Software under this License. Except to the extent prohibited by statute -or regulation, such description must be sufficiently detailed for a -recipient of ordinary skill to be able to understand it. - -5. Termination - --------------- - -5.1. The rights granted under this License will terminate automatically -if You fail to comply with any of its terms. However, if You become -compliant, then the rights granted under this License from a particular -Contributor are reinstated (a) provisionally, unless and until such -Contributor explicitly and finally terminates Your grants, and (b) on an -ongoing basis, if such Contributor fails to notify You of the -non-compliance by some reasonable means prior to 60 days after You have -come back into compliance. 
Moreover, Your grants from a particular -Contributor are reinstated on an ongoing basis if such Contributor -notifies You of the non-compliance by some reasonable means, this is the -first time You have received notice of non-compliance with this License -from such Contributor, and You become compliant prior to 30 days after -Your receipt of the notice. - -5.2. If You initiate litigation against any entity by asserting a patent -infringement claim (excluding declaratory judgment actions, -counter-claims, and cross-claims) alleging that a Contributor Version -directly or indirectly infringes any patent, then the rights granted to -You by any and all Contributors for the Covered Software under Section -2.1 of this License shall terminate. - -5.3. In the event of termination under Sections 5.1 or 5.2 above, all -end user license agreements (excluding distributors and resellers) which -have been validly granted by You or Your distributors under this License -prior to termination shall survive termination. - -************************************************************************ - -* * -* 6. Disclaimer of Warranty * -* ------------------------- * -* * -* Covered Software is provided under this License on an "as is" * -* basis, without warranty of any kind, either expressed, implied, or * -* statutory, including, without limitation, warranties that the * -* Covered Software is free of defects, merchantable, fit for a * -* particular purpose or non-infringing. The entire risk as to the * -* quality and performance of the Covered Software is with You. * -* Should any Covered Software prove defective in any respect, You * -* (not any Contributor) assume the cost of any necessary servicing, * -* repair, or correction. This disclaimer of warranty constitutes an * -* essential part of this License. No use of any Covered Software is * -* authorized under this License except under this disclaimer. 
* -* * - -************************************************************************ - -************************************************************************ - -* * -* 7. Limitation of Liability * -* -------------------------- * -* * -* Under no circumstances and under no legal theory, whether tort * -* (including negligence), contract, or otherwise, shall any * -* Contributor, or anyone who distributes Covered Software as * -* permitted above, be liable to You for any direct, indirect, * -* special, incidental, or consequential damages of any character * -* including, without limitation, damages for lost profits, loss of * -* goodwill, work stoppage, computer failure or malfunction, or any * -* and all other commercial damages or losses, even if such party * -* shall have been informed of the possibility of such damages. This * -* limitation of liability shall not apply to liability for death or * -* personal injury resulting from such party's negligence to the * -* extent applicable law prohibits such limitation. Some * -* jurisdictions do not allow the exclusion or limitation of * -* incidental or consequential damages, so this exclusion and * -* limitation may not apply to You. * -* * - -************************************************************************ - -8. Litigation - -------------- - -Any litigation relating to this License may be brought only in the -courts of a jurisdiction where the defendant maintains its principal -place of business and such litigation shall be governed by laws of that -jurisdiction, without reference to its conflict-of-law provisions. -Nothing in this Section shall prevent a party's ability to bring -cross-claims or counter-claims. - -9. Miscellaneous - ----------------- - -This License represents the complete agreement concerning the subject -matter hereof. If any provision of this License is held to be -unenforceable, such provision shall be reformed only to the extent -necessary to make it enforceable. 
Any law or regulation which provides -that the language of a contract shall be construed against the drafter -shall not be used to construe this License against a Contributor. - -10. Versions of the License - ---------------------------- - -10.1. New Versions - -Mozilla Foundation is the license steward. Except as provided in Section -10.3, no one other than the license steward has the right to modify or -publish new versions of this License. Each version will be given a -distinguishing version number. - -10.2. Effect of New Versions - -You may distribute the Covered Software under the terms of the version -of the License under which You originally received the Covered Software, -or under the terms of any subsequent version published by the license -steward. - -10.3. Modified Versions - -If you create software not governed by this License, and you want to -create a new license for such software, you may create and use a -modified version of this License if you rename the license and remove -any references to the name of the license steward (except to note that -such modified license differs from this License). - -10.4. Distributing Source Code Form that is Incompatible With Secondary -Licenses - -If You choose to distribute Source Code Form that is Incompatible With -Secondary Licenses under the terms of this version of the License, the -notice described in Exhibit B of this License must be attached. - -Exhibit A - Source Code Form License Notice -------------------------------------------- - - This Source Code Form is subject to the terms of the Mozilla Public - License, v. 2.0. If a copy of the MPL was not distributed with this - file, You can obtain one at https://mozilla.org/MPL/2.0/. - -If it is not possible or desirable to put the notice in a particular -file, then You may include the notice in a location (such as a LICENSE -file in a relevant directory) where a recipient would be likely to look -for such a notice. - -You may add additional accurate notices of copyright ownership.
- -Exhibit B - "Incompatible With Secondary Licenses" Notice ---------------------------------------------------------- - - This Source Code Form is "Incompatible With Secondary Licenses", as - defined by the Mozilla Public License, v. 2.0. +BSD-3-Clause + +================================================================ + +Copyright (2024), : Minh Cong Nguyen; Ben James Brunckhorst; Esther G. Naikal; Nisan Gorgulu; Ruth Vargas Hill; Stephane Hallegatte +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file