diff --git a/pdm.lock b/pdm.lock index c596c84d..97a173d7 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev", "doc", "test", "tox"] strategy = ["cross_platform"] lock_version = "4.4" -content_hash = "sha256:119fc550e1587b7dacafc59fac397079fa6afb6e8b9d56b557cbe43b82fb8055" +content_hash = "sha256:77a6ce5373d71ad88844c627629edab640a35d5de9bc5fc79e43ccd23cadabb2" [[package]] name = "alabaster" @@ -42,6 +42,9 @@ name = "astroid" version = "3.0.1" requires_python = ">=3.8.0" summary = "An abstract syntax tree for Python with inference support." +dependencies = [ + "typing-extensions>=4.0.0; python_version < \"3.11\"", +] files = [ {file = "astroid-3.0.1-py3-none-any.whl", hash = "sha256:7d5895c9825e18079c5aeac0572bc2e4c83205c95d416e0b4fee8bc361d2d9ca"}, {file = "astroid-3.0.1.tar.gz", hash = "sha256:86b0bb7d7da0be1a7c4aedb7974e391b32d4ed89e33de6ed6902b4b15c97577e"}, @@ -139,6 +142,21 @@ files = [ {file = "boto3-1.33.11.tar.gz", hash = "sha256:620f1eb3e18e780be58383b4a4e10db003d2314131190514153996032c8d932d"}, ] +[[package]] +name = "boto3-stubs-lite" +version = "1.33.13" +requires_python = ">=3.7" +summary = "Type annotations for boto3 1.33.13 generated with mypy-boto3-builder 7.21.0" +dependencies = [ + "botocore-stubs", + "types-s3transfer", + "typing-extensions>=4.1.0; python_version < \"3.12\"", +] +files = [ + {file = "boto3-stubs-lite-1.33.13.tar.gz", hash = "sha256:c60afcdd4a396c3565115b8a7adf147ec040c1bbc69324ba837bc8aadcd128f7"}, + {file = "boto3_stubs_lite-1.33.13-py3-none-any.whl", hash = "sha256:475d871300f60aa998bdf31d16737f02e6b72b30e697272c59576e0c9037e1bc"}, +] + [[package]] name = "botocore" version = "1.33.11" @@ -155,6 +173,19 @@ files = [ {file = "botocore-1.33.11.tar.gz", hash = "sha256:b14b328f902d120de0a09eaa657a9a701c0ceeb711197c2f01ef0523f855086c"}, ] +[[package]] +name = "botocore-stubs" +version = "1.33.13" +requires_python = ">=3.7,<4.0" +summary = "Type annotations and code completion for botocore" +dependencies = [ + "types-awscrt", +] +files = [ + {file = "botocore_stubs-1.33.13-py3-none-any.whl", hash = "sha256:138dfc7bebf6c16d3754ce625bd761f607438e778bfb397824906e5ab36fa301"}, + {file = "botocore_stubs-1.33.13.tar.gz", hash = "sha256:b7b08a1beec9e6605fba12ffcd007b695bfc0a29f379a039f1ed69ec8eb1f7c7"}, +] + [[package]] name = "cachecontrol" version = "0.13.1" @@ -454,6 +485,64 @@ files = [ {file = "coverage-7.3.2.tar.gz", hash = "sha256:be32ad29341b0170e795ca590e1c07e81fc061cb5b10c74ce7203491484404ef"}, ] +[[package]] +name = "cramjam" +version = "2.7.0" +requires_python = ">=3.7" +summary = "Thin Python bindings to de/compression algorithms in Rust" +files = [ + {file = "cramjam-2.7.0-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:aac9d49e16f473ceb4eaf74a53180eac3363127f01855c39122b400a988e80bf"}, + {file = "cramjam-2.7.0-cp310-cp310-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:a08dcb7c7b54f82db4ee9120aaace06326499c0d4108770ee7ac63d7bd1d803d"}, + {file = "cramjam-2.7.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5c411d785cec410d4164e4ecc76b6c152761fbb61325bcc4acbdc8926874c0b"}, + {file = "cramjam-2.7.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d07c5af763501bd23523658aeb535082eaac014746f7973df85f76b0d9b40967"}, + {file = "cramjam-2.7.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7f93316abc1abfd348b04afc6cadbbd4fba44cd91e7b9803c9330045a7a1885"}, + {file = 
"cramjam-2.7.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d56afb5f278a18743a218514825b6ab176f18a4084d8f6515c64e3acef19478"}, + {file = "cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ea1c781d3760df0b6ad80b7b19dc8e038e0638fb1cfabc68da96cedb8d0adca"}, + {file = "cramjam-2.7.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3544ea95d0e98ac926d92d652adc417e78091117cbe2ef7733e26c40601604c"}, + {file = "cramjam-2.7.0-cp310-none-win32.whl", hash = "sha256:0ffb891294e77f2a3b0137992ebd6eb9b1f1bc3728d7d4314632e30270855117"}, + {file = "cramjam-2.7.0-cp310-none-win_amd64.whl", hash = "sha256:79c36d95e89b43c29595c889c7a4d30d29aefc55d7c58a26a058b9bbe7abd5cf"}, + {file = "cramjam-2.7.0-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:71bf6a6632648333c402a8692fe61f45416066eb0d8b7f4530cdf37fee221a11"}, + {file = "cramjam-2.7.0-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:bee04fd1cdd5f2a2e91e4b271f22e228c698fe7b7f8ef209374d717f7889e80c"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60951e64d3e05ef2a46d2a92fc4e4563ae5e28bb3b6f231f2dca68a5078a72dc"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e6231fd3ac680c34c0d8405abfa8c3d12f92e28d0897d960aa905f053cc09e63"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2602c42ed101ada634fa37253d40946f0468b2b749689781cba715a7d78038e"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbc59b223750a901d65d96333461ab17076594fa34448ed2ef911bd4b628f068"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fab800ed93fb129d85c63f26660b695fb194efb29765a163f269321778e28a8d"}, + {file = "cramjam-2.7.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0b48b11c328d91250dadc63c00753f5ba26eb9df5fe75ba2ce8a78631260479d"}, + {file = "cramjam-2.7.0-cp311-none-win32.whl", hash = "sha256:bef07e7d4607c4d70627e58eb630fe60e48b80a61ab05b33314e3296eb90af78"}, + {file = "cramjam-2.7.0-cp311-none-win_amd64.whl", hash = "sha256:3f2e41dc8143d0e88ec9ba3ff66c8d2aea486b04931119abe9e394f9961d74bc"}, + {file = "cramjam-2.7.0-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:8e82276b120000834af3f776cbe22ac3016cd9e0ed00e7f3fb31e2ce95e17181"}, + {file = "cramjam-2.7.0-cp312-cp312-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:2b5a463b01d8867f78a361defb4fadca63635965e8204f7e2409b717d85f0c1d"}, + {file = "cramjam-2.7.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b84565611f36ee8c3944b96bb01ee4d44c101acf84f1430c9414986ab3a7fb03"}, + {file = "cramjam-2.7.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3cd45d09229960a89e0e3bbf71d29ac399391446f51279f3202846d7da106eb7"}, + {file = "cramjam-2.7.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb065d054e49645414bc84eaa49a6b7ed6c7185fb8ba2648c518344cf481144"}, + {file = "cramjam-2.7.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a3f45cd17730c00358643a814d5921a31c8a0d238582e08594fa86fdbc8401c"}, + {file = "cramjam-2.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bf4ec082a7647ed7463c5e0bd176850a9f013c258ad98f53055bcd1716c7fac"}, + {file = 
"cramjam-2.7.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9f9e403be6307e98a47cc7d0132fe1fe4683c873c5d7d9da099fbac9c299a4b0"}, + {file = "cramjam-2.7.0-cp312-none-win32.whl", hash = "sha256:fd2e81c69baacb95fa28cdf844f714d7c03f0c805f4fa2decc5e9565e6b4405d"}, + {file = "cramjam-2.7.0-cp312-none-win_amd64.whl", hash = "sha256:831951a3eccebd5911387e3fff14483747d3dae5ad496afbd50447ffe2d03dba"}, + {file = "cramjam-2.7.0-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:f52718e1c7aed3d0e3ffbbe5c085d9c449daa726379788ddb27cb62ffc2b6ba1"}, + {file = "cramjam-2.7.0-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:ee4bc46e5cf87d6097833bca33a66f2724b4773242a71ed642d13682fedefb71"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd77e784289fc44a5e6487b2e263f07cc271f25a8395e97213b6a934fe47a768"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53ac8df43546d3f70acbc5c17424a8c083418bd6d2cacfbd5108aaa8f3eb26db"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca6309d06edf29a9a5c22e25d1d7f7609abb8ae7281583bc486afb19fd645898"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96ff622f0db7f89d7c9aeadd9cc0c9bf61e804841a03a22ca919aa4955640d"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc87176851c50c5aaf6bacafb6bed5a86e3b4ee6a749d6ec13f3d37ae0e951a"}, + {file = "cramjam-2.7.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f9340c70d95ea102cf51a07ecc09f93f1363e585d97b5276734a5f8c4476e560"}, + {file = "cramjam-2.7.0-cp39-none-win32.whl", hash = "sha256:3d5ed0fa20b42e063ef66ad01d9948e868bbfc327bf86604e078b67f074f76f3"}, + {file = "cramjam-2.7.0-cp39-none-win_amd64.whl", hash = "sha256:7e44dda432a8c8a47cb73869201d2f9777604bd913d859def84c659fb736cfd3"}, + {file = "cramjam-2.7.0-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:22927dbdda85d8719074e061f9ec024c9bf16088e5e4c6c1c134c46e2d9153b7"}, + {file = "cramjam-2.7.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee80ebd85acec1031e7563cce3de3961bd3f2ec8947c5bf84a9356b25af67a75"}, + {file = "cramjam-2.7.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91bb03ca0d3857f319afd13525d5c6214a55aa49778ce46a02c16f0eee37907c"}, + {file = "cramjam-2.7.0-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:202e4a5a496ea49d0bb5252fbee8b6e421d256968773c7a8b3e86d98eec9228e"}, + {file = "cramjam-2.7.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19c44de9dee2ea0c586a5b12adc9bc28224544717bce88a94c3ee202b9ece25d"}, + {file = "cramjam-2.7.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03a72a689c93b8a2c7c08b529c1224c47bd469722e559af231016694b90f6442"}, + {file = "cramjam-2.7.0-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:fbffb1f63edf4cb4272a25de288c2f2e20914bb93e003883656774e61794b960"}, + {file = "cramjam-2.7.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df46019cb999d874ce86e08d71d7d2983c052d6a63f7aa6bce960e4e05e8ea37"}, + {file = "cramjam-2.7.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44fe99233ef2f42ff03d8395e7d97e0c45306eb356f6f01fa69bdb49783fdb8a"}, + {file = "cramjam-2.7.0.tar.gz", hash = 
"sha256:579fb724eec048b1a18ca8f7ad9a7ef296dc02eba5f87fd4d5031f0c32c5c9ac"}, +] + [[package]] name = "deprecated" version = "1.2.14" @@ -506,6 +595,51 @@ files = [ {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, ] +[[package]] +name = "fastparquet" +version = "2023.10.1" +requires_python = ">=3.8" +summary = "Python support for Parquet file format" +dependencies = [ + "cramjam>=2.3", + "fsspec", + "numpy>=1.20.3", + "packaging", + "pandas>=1.5.0", +] +files = [ + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:75a00475e96d26214dace147b27ab782da7a0ae230cade05ea9181c3aec2e637"}, + {file = "fastparquet-2023.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0c1d5559aa0a4fff8eb3b301c8177b6813bb15fe9d2007ad0dc89f8fa519c5"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b798cdfa8f01cd573b135a493a4d0686ebbcd3a412d6e59889a7ae41ff90efeb"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a746f521da7459707899fc33b334b2d21f590380f472fc27642f3ef28ee451d2"}, + {file = "fastparquet-2023.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e86d64898e846ed0f5745427436e5772fd7bb5d9a930f1dca8233e90385e126b"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5c3afafd4a0907216f5ee4f835f47ad16b84c5dede4c5ca4c0754dffe3eb72d7"}, + {file = "fastparquet-2023.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:68d26a1172be5b02018f6c28603f195807955d8262b913349385d977f3ae081f"}, + {file = "fastparquet-2023.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:b7086ca3a0d8ae8680b380da9b7057a1491d629945b1dd228eba5b362e2e39aa"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7f8d53f5e5049b21893964cd27154c2a7c8180f3ffd1f2693f80e0f834a3a35e"}, + {file = "fastparquet-2023.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ea74f28494fda892641a564f728d046a074fdea5b9ff664ef9554c0da563bad4"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab978612d21630033df0a0b12423ed826fe36e83a1710b155968c3c6e2b3174a"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc30c502feaa67c058c496eb4a734eba8bd373f0d24a32cc69360c79f7220ef"}, + {file = "fastparquet-2023.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99568ae6bbbd973b32d796cb664ba156b101e5d1931dba780fe2dc0d9b227dfd"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:53b9ad8d646c2609854cbe7d7b17be343664cabae1cd0eb119011e389df8484d"}, + {file = "fastparquet-2023.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b2a9ee49039223a1e216c409c25072be1f362de27197cbec5f90cf2e736df3b0"}, + {file = "fastparquet-2023.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:9133d2f975c6e05187be4b558060e6a4aafeba02dceaf849cf6ad46d32e59405"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b826696cd48f1defb6fcafb4c9798102233e54f3f3491251c034dde3d94f420a"}, + {file = "fastparquet-2023.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bf2d58bee17e0eea8565c2bcd2b339ee032472751651e21f000eb564ad3cd5cf"}, + {file = 
"fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9296098d06c6692ee477fe491381eda39fc0dcfe2fce210496491fe16ce27ef8"}, + {file = "fastparquet-2023.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c975d648ea491e684135e9e3c0a15b440d66d0772fe497269e5c9c4eaaeb62a2"}, + {file = "fastparquet-2023.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5208db1f38c8ac5f50f309f77bdb828fa7f247b82e2df88d847ad3bec38903"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1fac5319aabcbc4acc2feb5df68336de755de7d60a2ee9329fef178ac016e236"}, + {file = "fastparquet-2023.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c61d26705e9a2ad2d52ed1d527c75e96e6a9a04be35bd4c8d6f4accd778f9b05"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2635f0f37a983e35be9b8013b84361e3d0cdd4f514b822016445c029b1c6e007"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cde8f6798d37e2af38ada058fc7018c2157d90a8dd728c0c59fab85b8adb9215"}, + {file = "fastparquet-2023.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20c17c5e7186723a175c9e7da94285bdef3cb477cb7cca0e2812b1e245279671"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:91ee6b5b0efc18586e61da6662119de92fc7bf552c3a08a13eb2af16bc12f16a"}, + {file = "fastparquet-2023.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:332cb3b204e1de64dcfc4c5d0b517ea665856d19c139f693e8c9efc11992e19e"}, + {file = "fastparquet-2023.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:5eb06a70daf50d70290b87f3a5ca6f25eb24ad850bcc68197b5438d92b11c763"}, + {file = "fastparquet-2023.10.1.tar.gz", hash = "sha256:076fedfba2b56782b4823c1d351424425cfeaa5b8644c542416ca1363fe6d921"}, +] + [[package]] name = "filelock" version = "3.13.1" @@ -529,6 +663,16 @@ files = [ {file = "findpython-0.4.1.tar.gz", hash = "sha256:d7d014558681b3761d57a5b2342a713a8bf302f6c1fc9d99f81b9d8bd1681b04"}, ] +[[package]] +name = "fsspec" +version = "2023.12.2" +requires_python = ">=3.8" +summary = "File-system specification" +files = [ + {file = "fsspec-2023.12.2-py3-none-any.whl", hash = "sha256:d800d87f72189a745fa3d6b033b9dc4a34ad069f60ca60b943a63599f5501960"}, + {file = "fsspec-2023.12.2.tar.gz", hash = "sha256:8548d39e8810b59c38014934f6b31e57f40c1b20f911f4cc2b85389c7e9bf0cb"}, +] + [[package]] name = "globalwarmingpotentials" version = "0.9.3" @@ -797,6 +941,19 @@ files = [ {file = "msgpack-1.0.7.tar.gz", hash = "sha256:572efc93db7a4d27e404501975ca6d2d9775705c2d922390d878fcf768d92c87"}, ] +[[package]] +name = "mypy-boto3-s3" +version = "1.33.2" +requires_python = ">=3.7" +summary = "Type annotations for boto3.S3 1.33.2 service generated with mypy-boto3-builder 7.20.3" +dependencies = [ + "typing-extensions>=4.1.0; python_version < \"3.12\"", +] +files = [ + {file = "mypy-boto3-s3-1.33.2.tar.gz", hash = "sha256:f54a3ad3288f4e4719ebada3dde68c320507b0fc451d59bc68af7e6ab15cbdad"}, + {file = "mypy_boto3_s3-1.33.2-py3-none-any.whl", hash = "sha256:9d463df6def30de31a467d49ab92ff7795d46709d56eff6f52216a08bac27918"}, +] + [[package]] name = "ndindex" version = "1.7" @@ -1875,6 +2032,26 @@ files = [ {file = "truststore-0.8.0.tar.gz", hash = "sha256:dc70da89634944a579bfeec70a7a4523c53ffdb3cf52d1bb4a431fda278ddb96"}, ] +[[package]] +name = "types-awscrt" +version = "0.19.19" +requires_python = 
">=3.7,<4.0" +summary = "Type annotations and code completion for awscrt" +files = [ + {file = "types_awscrt-0.19.19-py3-none-any.whl", hash = "sha256:a577c4d60a7fb7e21b436a73207a66f6ba50329d578b347934c5d99d4d612901"}, + {file = "types_awscrt-0.19.19.tar.gz", hash = "sha256:850d5ad95d8f337b15fb154790f39af077faf5c08d43758fd750f379a87d5f73"}, +] + +[[package]] +name = "types-s3transfer" +version = "0.8.2" +requires_python = ">=3.7,<4.0" +summary = "Type annotations and code completion for s3transfer" +files = [ + {file = "types_s3transfer-0.8.2-py3-none-any.whl", hash = "sha256:5e084ebcf2704281c71b19d5da6e1544b50859367d034b50080d5316a76a9418"}, + {file = "types_s3transfer-0.8.2.tar.gz", hash = "sha256:2e41756fcf94775a9949afa856489ac4570308609b0493dfbd7b4d333eb423e6"}, +] + [[package]] name = "typing-extensions" version = "4.9.0" diff --git a/pyproject.toml b/pyproject.toml index bb3ebfbb..a1fe2359 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ITR" -version = "v1.0.11" +version = "v1.1.1" description = "Assess the temperature alignment of current targets, commitments, and investment and lending portfolios." authors = [ { name = "Michael Tiemann", email = "72577720+MichaelTiemannOSC@users.noreply.github.com" }, @@ -29,32 +29,35 @@ classifiers = [ ] dependencies = [ - "autoapi>=2.0.1", - "iam-units>=2022.10.27", - "numpy==1.24.3", - "openpyxl==3.0.10", - "openscm-units==0.5.2", - "orca==1.8", - "osc-ingest-tools>=0.5.2", - "pandas>=2.1.0", - "pip>=23.3.1", - "Pint>=0.22", - "Pint-Pandas>=0.5", - "psutil==5.9.5", - "pydantic>=2.3.0", - "pygithub==1.55", - "pytest==7.3.2", - "python-dotenv==1.0.0", - "setuptools>=65.7.0", - "sphinx<8,>=6", - "sphinx-autoapi>=2.0.1", - "sphinx-autodoc-typehints", - "sphinx-rtd-theme==1.3.0", - "SQLAlchemy>=2.0.20", - "tables>=3.8.0", - "trino==0.326.0", - "wheel>=0.41.0", - "xlrd==2.0.1", + "autoapi>=2.0.1", + "fastparquet>=2023.10.1", + "iam-units>=2022.10.27", + "numpy==1.24.3", + "openpyxl==3.0.10", + "openscm-units==0.5.2", + "orca==1.8", + "osc-ingest-tools>=0.5.2", + "pandas>=2.1.0", + "pip>=23.3.1", + "Pint>=0.23", + "Pint-Pandas>=0.5", + "psutil==5.9.5", + "pydantic>=2.3.0", + "pygithub==1.55", + "pytest==7.3.2", + "python-dotenv==1.0.0", + "setuptools>=65.7.0", + "sphinx<8,>=6", + "sphinx-autoapi>=2.0.1", + "sphinx-autodoc-typehints", + "sphinx-rtd-theme==1.3.0", + "SQLAlchemy>=2.0.20", + "tables>=3.8.0", + "trino==0.326.0", + "wheel>=0.41.0", + "xlrd==2.0.1", + "mypy-boto3-s3>=1.33.2", + "boto3-stubs-lite>=1.33.13", ] [project.urls] diff --git a/src/ITR/configs.py b/src/ITR/configs.py index 49f8b4a4..6cbf304a 100644 --- a/src/ITR/configs.py +++ b/src/ITR/configs.py @@ -55,11 +55,6 @@ class ColumnsConfig: BASE_YEAR_PRODUCTION = "base_year_production" GHG_SCOPE12 = "ghg_s1s2" GHG_SCOPE3 = "ghg_s3" - TEMPLATE_SCOPE1 = "em_s1" - TEMPLATE_SCOPE2 = "em_s2" - TEMPLATE_SCOPE12 = "em_s1s2" - TEMPLATE_SCOPE3 = "em_s3" - TEMPLATE_SCOPE123 = "em_s1s2s3" HISTORIC_DATA = "historic_data" TARGET_DATA = "target_data" TEMPLATE_PRODUCTION = "production" diff --git a/src/ITR/data/base_providers.py b/src/ITR/data/base_providers.py index 2c05da75..4182b33f 100644 --- a/src/ITR/data/base_providers.py +++ b/src/ITR/data/base_providers.py @@ -104,6 +104,7 @@ def __init__( super().__init__() self.column_config = column_config self._productions_benchmarks = production_benchmarks + self._own_data = True try: with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -116,8 +117,12 @@ def __init__( ) except AttributeError: assert False - 
# See comment above to understand use of `cumprod` function - self._prod_df = _prod_delta_df_t.add(1.0).cumprod(axis=0).astype("pint[dimensionless]").T + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + # See comment above to understand use of `cumprod` function + self._prod_df = _prod_delta_df_t.add(1.0).cumprod(axis=0).astype("pint[dimensionless]").T + self._prod_df.columns.name = "year" self._prod_df.index.names = [ self.column_config.SECTOR, self.column_config.REGION, @@ -179,10 +184,9 @@ def get_company_projected_production(self, company_sector_region_scope: pd.DataF ColumnsConfig.COMPANY_ID, ColumnsConfig.SECTOR, ColumnsConfig.REGION, ColumnsConfig.SCOPE :return: DataFrame of projected productions for [base_year through 2050] """ - # get_benchmark_projections is an expensive call. It's designed to return ALL benchmark info for ANY sector/region combo passed - # and it does all that work whether we need all the data or just one row. Best to lift this out of any inner loop - # and use the valuable DataFrame it creates. - company_benchmark_projections = self.get_benchmark_projections(company_sector_region_scope) + from ..utils import get_benchmark_projections + + company_benchmark_projections = get_benchmark_projections(self._prod_df, company_sector_region_scope) company_production = company_sector_region_scope.set_index(self.column_config.SCOPE, append=True)[ self.column_config.BASE_YEAR_PRODUCTION ] @@ -199,55 +203,6 @@ def get_company_projected_production(self, company_sector_region_scope: pd.DataF company_projected_productions_t = company_benchmark_projections.T.mul(company_production, axis=1) return company_projected_productions_t.T - def get_benchmark_projections( - self, company_sector_region_scope: pd.DataFrame, scope: EScope = EScope.AnyScope - ) -> pd.DataFrame: - """ - Overrides subclass method - returns a Dataframe with production benchmarks per company_id given a region and sector. - :param company_sector_region_scope: DataFrame indexed by ColumnsConfig.COMPANY_ID - with at least the following columns: ColumnsConfig.SECTOR, ColumnsConfig.REGION, and ColumnsConfig.SCOPE - :param scope: a scope - :return: A pint[dimensionless] DataFrame with partial production benchmark data per calendar year per row, indexed by company. - """ - - benchmark_projection = self._get_projected_production(scope) # TODO optimize performance - df = ( - company_sector_region_scope[["sector", "region", "scope"]] - .reset_index() - .drop_duplicates() - .set_index(["company_id", "scope"]) - ) - # We drop the meaningless S1S2/AnyScope from the production benchmark and replace it with the company's scope. - # This is needed to make indexes align when we go to multiply production times intensity for a scope. 
- company_benchmark_projections = df.merge( - benchmark_projection.droplevel("scope"), - left_on=["sector", "region"], - right_index=True, - how="left", - ) - mask = company_benchmark_projections.iloc[:, -1].isna() - if mask.any(): - # Patch up unknown regions as "Global" - global_benchmark_projections = ( - df[mask] - .merge( - benchmark_projection.loc[(slice(None), "Global"), :].droplevel(["region", "scope"]), - left_on=["sector"], - right_index=True, - how="left", - ) - .drop(columns="region") - ) - combined_benchmark_projections = pd.concat( - [ - company_benchmark_projections[~mask].drop(columns="region"), - global_benchmark_projections, - ] - ) - return combined_benchmark_projections.drop(columns="sector") - return company_benchmark_projections.drop(columns=["sector", "region"]) - class BaseProviderIntensityBenchmark(IntensityBenchmarkDataProvider): def __init__( @@ -261,6 +216,7 @@ def __init__( EI_benchmarks.benchmark_global_budget, EI_benchmarks.is_AFOLU_included, ) + self._own_data = True self._EI_benchmarks = EI_benchmarks self.column_config = column_config self.projection_controls = projection_controls @@ -273,6 +229,7 @@ def __init__( pass self._EI_df_t = pd.concat(benchmarks_as_series, axis=1) + self._EI_df_t.index.name = "year" self._EI_df_t.columns.set_names(["sector", "region", "scope"], inplace=True) # https://stackoverflow.com/a/56528071/1291237 self._EI_df_t.sort_index(axis=1, inplace=True) @@ -454,7 +411,8 @@ def __init__( projection_controls: ProjectionControls = ProjectionControls(), ): super().__init__() - self.column_config = column_config + self._own_data = True + self._column_config = column_config self.projection_controls = projection_controls # In the initialization phase, `companies` has minimal fundamental values (company_id, company_name, sector, region, # but not projected_intensities, projected_targets, etc) @@ -462,6 +420,20 @@ def __init__( # Initially we don't have to do any allocation of emissions across multiple sectors, but if we do, we'll update the index here. self._bm_allocation_index = pd.DataFrame().index + @property + def column_config(self) -> Type[ColumnsConfig]: + """ + :return: ColumnsConfig values for this Data Provider + """ + return self._column_config + + @property + def own_data(self) -> bool: + """ + Return True if this object contains its own data; false if data housed elsewhere + """ + return self._own_data + def get_projection_controls(self) -> ProjectionControls: return self.projection_controls @@ -478,21 +450,21 @@ def _validate_projected_trajectories( EI_BENCHMARKS are the benchmarks for all sectors, regions, and scopes In previous incarnations of this function, no benchmark data was needed for any reason. 
""" - if isinstance(ei_benchmarks, BaseProviderIntensityBenchmark): - ei_bm_df_t: pd.DataFrame = ei_benchmarks._EI_df_t + if hasattr(ei_benchmarks, "_EI_df_t"): + ei_df_t: pd.DataFrame = ei_benchmarks._EI_df_t else: - raise TypeError + raise AttributeError(f"object {ei_benchmarks} does not have _EI_df_t attribute") company_ids_without_data = [ - c.company_id for c in companies if c.historic_data.empty() and c.projected_intensities.empty() + c.company_id for c in companies if c.historic_data.empty and c.projected_intensities.empty ] if company_ids_without_data: error_message = ( - f"Provide either historic emission data or projections for companies with " + "Provide either historic emission data or projections for companies with " f"IDs {company_ids_without_data}" ) logger.error(error_message) raise ValueError(error_message) - companies_without_historic_data = [c for c in companies if c.historic_data.empty()] + companies_without_historic_data = [c for c in companies if c.historic_data.empty] if companies_without_historic_data: # Can arise from degenerate test cases pass @@ -500,7 +472,7 @@ def _validate_projected_trajectories( for company in companies_without_historic_data: scope_em = {} scope_ei = {} - if not company.projected_intensities.empty(): + if not company.projected_intensities.empty: for scope_name in EScope.get_scopes(): if isinstance( company.projected_intensities[scope_name], @@ -562,7 +534,7 @@ def _validate_projected_trajectories( companies_without_base_year_production = [] companies_without_projections = [] for c in companies: - if c.projected_intensities.empty(): + if c.projected_intensities.empty: companies_without_projections.append(c) else: companies_with_projections.append(c) @@ -576,9 +548,9 @@ def _validate_projected_trajectories( else: companies_without_base_year_production.append(c) if companies_without_projections: - new_company_projections = EITrajectoryProjector( - self.projection_controls, ei_bm_df_t - ).project_ei_trajectories(companies_without_projections) + new_company_projections = EITrajectoryProjector(self.projection_controls, ei_df_t).project_ei_trajectories( + companies_without_projections + ) for c in new_company_projections: assert c.base_year_production is not None production_units = c.base_year_production.units @@ -610,10 +582,10 @@ def _validate_projected_trajectories( for company in companies: sector = company.sector region = company.region - if (sector, region) in ei_bm_df_t.columns: - ei_dtype = ei_bm_df_t[(sector, region)].dtypes.iloc[0] - elif (sector, "Global") in ei_bm_df_t.columns: - ei_dtype = ei_bm_df_t[(sector, "Global")].dtypes.iloc[0] + if (sector, region) in ei_df_t.columns: + ei_dtype = ei_df_t[(sector, region)].dtypes.iloc[0] + elif (sector, "Global") in ei_df_t.columns: + ei_dtype = ei_df_t[(sector, "Global")].dtypes.iloc[0] else: continue for scope in EScope.get_scopes(): @@ -727,13 +699,12 @@ def _calculate_target_projections( warnings.simplefilter("ignore") # FIXME: Note that we don't need to call with a scope, because production is independent of scope. # We use the arbitrary EScope.AnyScope just to be explicit about that. 
- assert isinstance(production_bm, BaseProviderProductionBenchmark) - df_partial_pp = production_bm._get_projected_production(EScope.AnyScope) + df_partial_pp = getattr(production_bm, "_get_projected_production")(EScope.AnyScope) ei_df_t = ei_bm._get_intensity_benchmarks() for c in self._companies: - if not c.projected_targets.empty(): + if not c.projected_targets.empty: continue if c.target_data is None: logger.warning(f"No target data for {c.company_name}") @@ -1151,12 +1122,12 @@ class EITrajectoryProjector(EIProjector): def __init__( self, projection_controls: ProjectionControls = ProjectionControls(), - ei_bm_df_t=None, + ei_df_t=None, *args, **kwargs, ): super().__init__(projection_controls=projection_controls) - self._EI_df_t = pd.DataFrame() if ei_bm_df_t is None else ei_bm_df_t + self._EI_df_t = pd.DataFrame() if ei_df_t is None else ei_df_t def project_ei_trajectories(self, companies: List[ICompanyData], backfill_needed=True) -> List[ICompanyData]: historic_df = self._extract_historic_df(companies) @@ -1249,14 +1220,14 @@ def project_ei_trajectories(self, companies: List[ICompanyData], backfill_needed def _extract_historic_df(self, companies: List[ICompanyData]) -> pd.DataFrame: data = [] for company in companies: - if company.historic_data.empty(): + if company.historic_data.empty: continue c_hd = company.historic_data if len(c_hd.productions): data.append(self._historic_productions_to_dict(company.company_id, c_hd.productions)) - if not c_hd.emissions.empty(): + if not c_hd.emissions.empty: data.extend(self._historic_emissions_to_dicts(company.company_id, c_hd.emissions)) - if not c_hd.emissions_intensities.empty(): + if not c_hd.emissions_intensities.empty: data.extend(self._historic_ei_to_dicts(company.company_id, c_hd.emissions_intensities)) if not data: logger.error(f"No historic data for companies: {[c.company_id for c in companies]}") @@ -1405,7 +1376,7 @@ def _align_and_compute_missing_historic_ei(self, companies: List[ICompanyData], logger.warning(warning_message) if missing_data: error_message = ( - f"Provide either historic emissions intensity data, or historic emission and " + "Provide either historic emissions intensity data, or historic emission and " f"production data for these company - scope combinations: {missing_data}" ) logger.error(error_message) @@ -1482,7 +1453,7 @@ def _winsorize(self, historic_intensities: pd.DataFrame) -> pd.DataFrame: lambda col: col.map(ITR.std_devs, na_action="ignore") ) except ValueError: - logger.error(f"ValueError in _winsorize") + logger.error("ValueError in _winsorize") raise else: # pint.dequantify did all the hard work for us @@ -1680,7 +1651,7 @@ def calculate_nz_target_years(self, targets: List[ITargetData]) -> dict: # We then infer netzero year targets for constituents of compound scopes from compound scopes # and infer netzero year taregts for compound scopes as the last of all constituents if nz_target_years["S1S2S3"] < nz_target_years["S1S2"]: - logger.warn(f"target S1S2S3 date <= S1S2 date") + logger.warning("target S1S2S3 date <= S1S2 date") nz_target_years["S1S2"] = nz_target_years["S1S2S3"] nz_target_years["S1"] = min(nz_target_years["S1S2"], nz_target_years["S1"]) nz_target_years["S2"] = min(nz_target_years["S1S2"], nz_target_years["S2"]) @@ -1784,7 +1755,7 @@ def project_ei_targets( # for some sectors, and projecting a netzero target for S1 from S1+S2 makes that benchmark useable. 
# Note that we can only infer separate S1 and S2 targets from S1+S2 targets when S1+S2 = 0, because S1=0 + S2=0 is S1+S2=0 if no_scope_targets: - if company.historic_data.empty(): + if company.historic_data.empty: # This just defends against poorly constructed test cases nz_target_years[scope_name] = None continue @@ -1862,7 +1833,7 @@ def project_ei_targets( skip_first_year = 1 else: # When starting from scratch, use recent historic data if available. - if company.historic_data.empty(): + if company.historic_data.empty: ei_realizations = [] else: ei_realizations = company.historic_data.emissions_intensities[scope_name] @@ -1929,7 +1900,7 @@ def project_ei_targets( last_em_value = last_em_value.to(target_base_year_unit) skip_first_year = 1 else: - if company.historic_data.empty(): + if company.historic_data.empty: em_realizations = [] else: em_realizations = company.historic_data.emissions[scope_name] diff --git a/src/ITR/data/data_providers.py b/src/ITR/data/data_providers.py index 30ea3dee..58b9ea2d 100644 --- a/src/ITR/data/data_providers.py +++ b/src/ITR/data/data_providers.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import List, Optional +from typing import List, Optional, Type import pandas as pd @@ -37,6 +37,22 @@ def __init__(self, **kwargs): """ pass + @property + @abstractmethod + def column_config(self) -> Type[ColumnsConfig]: + """ + Return the ColumnsConfig associated with this Data Provider + """ + raise NotImplementedError + + @property + @abstractmethod + def own_data(self) -> bool: + """ + Return True if this object contains its own data; false if data housed elsewhere + """ + raise NotImplementedError + @abstractmethod def get_projection_controls(self) -> ProjectionControls: """ @@ -150,7 +166,14 @@ def __init__(self, **kwargs): :param config: A dictionary containing the configuration parameters for this data provider. """ - pass + self._own_data = False + + @property + def own_data(self) -> bool: + """ + :return: True if this object contains its own data; false if data housed elsewhere + """ + return self._own_data @abstractmethod def benchmark_changed(self, production_benchmark: ProductionBenchmarkDataProvider) -> bool: @@ -166,17 +189,6 @@ def get_company_projected_production(self, ghg_scope12: pd.DataFrame) -> pd.Data """ raise NotImplementedError - @abstractmethod - def get_benchmark_projections(self, company_secor_region_info: pd.DataFrame) -> pd.DataFrame: - """ - get the sector emissions for a list of companies. 
- If there is no data for the sector, then it will be replaced by the global value - :param company_secor_region_info: DataFrame with at least the following columns : - ColumnsConfig.COMPANY_ID, ColumnsConfig.SECTOR and ColumnsConfig.REGION - :return: A DataFrame with company and intensity benchmarks per calendar year per row - """ - raise NotImplementedError - class IntensityBenchmarkDataProvider(ABC): """ @@ -203,6 +215,14 @@ def __init__( self._benchmark_temperature = benchmark_temperature self._is_AFOLU_included = is_AFOLU_included self._benchmark_global_budget = benchmark_global_budget + self._own_data = False + + @property + def own_data(self) -> bool: + """ + :return: True if this object contains its own data; false if data housed elsewhere + """ + return self._own_data @abstractmethod def get_scopes(self) -> List[EScope]: diff --git a/src/ITR/data/data_warehouse.py b/src/ITR/data/data_warehouse.py index 0d6d520d..067da19b 100644 --- a/src/ITR/data/data_warehouse.py +++ b/src/ITR/data/data_warehouse.py @@ -51,7 +51,6 @@ def __init__( benchmark_projected_production: Optional[ProductionBenchmarkDataProvider], benchmarks_projected_ei: Optional[IntensityBenchmarkDataProvider], estimate_missing_data: Optional[Callable[["DataWarehouse", ICompanyData], None]] = None, - column_config: Type[ColumnsConfig] = ColumnsConfig, ): """ Create a new data warehouse instance. @@ -65,7 +64,6 @@ def __init__( # benchmarks_projected_ei._EI_df_t is the (transposed) EI dataframe for the benchmark # benchmark_projected_production.get_company_projected_production(company_sector_region_scope) gives production data per company (per year) # multiplying these two gives aligned emissions data for the company, in case we want to add missing data based on sector averages - self.column_config = column_config self.company_data = company_data self.estimate_missing_data = estimate_missing_data # Place to stash historic data before doing PC-conversion so it can be retreived when switching to non-PC benchmarks @@ -76,8 +74,17 @@ def __init__( # Trajectories + Emissions Intensities benchmark data are needed to estimate missing S3 data # Target projections rely both on Production benchmark data and S3 estimated data # Production-centric manipulations must happen after targets have been projected - if benchmark_projected_production is not None or benchmarks_projected_ei is not None: + if (benchmark_projected_production is not None and benchmark_projected_production.own_data) or ( + benchmarks_projected_ei is not None and benchmarks_projected_ei.own_data + ): self.update_benchmarks(benchmark_projected_production, benchmarks_projected_ei) + self._own_data = True + else: + self._own_data = False + + @property + def own_data(self) -> bool: + return self._own_data def _preserve_historic_data(self): for c in self.company_data._companies: @@ -137,19 +144,31 @@ def update_benchmarks( Update the benchmark data used in this instance of the DataWarehouse. If there is no change, do nothing. 
""" new_production_bm = new_ei_bm = new_prod_centric = False - if self.benchmark_projected_production is None or self.benchmark_projected_production.benchmark_changed( - benchmark_projected_production + if benchmark_projected_production is None: + pass + if self.benchmark_projected_production is None or ( + self.benchmark_projected_production.own_data + and self.benchmark_projected_production.benchmark_changed(benchmark_projected_production) ): self.benchmark_projected_production = benchmark_projected_production # type: ignore new_production_bm = True - if self.benchmarks_projected_ei is None: + if benchmarks_projected_ei is None: + pass + elif self.benchmarks_projected_ei is None: self.benchmarks_projected_ei = benchmarks_projected_ei # type: ignore - new_ei_bm = True - elif self.benchmarks_projected_ei.benchmarks_changed(benchmarks_projected_ei): - new_prod_centric = self.benchmarks_projected_ei.prod_centric_changed(benchmarks_projected_ei) + if benchmarks_projected_ei.own_data: + new_ei_bm = True + elif self.benchmarks_projected_ei.own_data: + if benchmarks_projected_ei.own_data: + if self.benchmarks_projected_ei.benchmarks_changed(benchmarks_projected_ei): + new_prod_centric = self.benchmarks_projected_ei.prod_centric_changed(benchmarks_projected_ei) + new_ei_bm = True self.benchmarks_projected_ei = benchmarks_projected_ei - new_ei_bm = True + + if not new_production_bm and not new_ei_bm: + return + assert self.benchmarks_projected_ei is not None # Production benchmark data is needed to project trajectories @@ -174,7 +193,7 @@ def update_benchmarks( # If we are missing S3 (or other) data, fill in before projecting targets if new_ei_bm and self.estimate_missing_data is not None: - logger.info(f"estimating missing data") + logger.info("estimating missing data") for c in self.company_data.get_company_data(): self.estimate_missing_data(self, c) @@ -190,7 +209,7 @@ def update_benchmarks( # If our benchmark is production-centric, migrate S3 data (including estimated S3 data) into S1S2 # If we shift before we project, then S3 targets will not be projected correctly. if new_ei_bm and benchmarks_projected_ei.is_production_centric(): - logger.info(f"Shifting S3 emissions data into S1 according to Production-Centric benchmark rules") + logger.info("Shifting S3 emissions data into S1 according to Production-Centric benchmark rules") if self.orig_historic_data != {}: self._restore_historic_data() else: @@ -201,7 +220,7 @@ def update_benchmarks( if not ITR.isna(c.ghg_s3): c.ghg_s1s2 = c.ghg_s1s2 + c.ghg_s3 c.ghg_s3 = None # Q_(0.0, c.ghg_s3.u) - if not c.historic_data.empty(): + if not c.historic_data.empty: def _adjust_historic_data(data, primary_scope_attr, data_adder): if data[primary_scope_attr]: @@ -365,7 +384,9 @@ def update_trajectories(self): # We cannot only update trajectories without regard for all that depend on those trajectories # For example, different benchmarks may have different scopes defined, units for benchmarks, etc. 
logger.info( - f"re-calculating trajectories for {len(self.company_data._companies)} companies\n (times {len(EScope.get_scopes())} scopes times {self.company_data.projection_controls.TARGET_YEAR-self.company_data.projection_controls.BASE_YEAR} years)" + f"re-calculating trajectories for {len(self.company_data._companies)} companies" + f"\n (times {len(EScope.get_scopes())} scopes times " + f"{self.company_data.projection_controls.TARGET_YEAR-self.company_data.projection_controls.BASE_YEAR} years)" ) for company in self.company_data._companies: company.projected_intensities = None @@ -526,9 +547,12 @@ def _process_company_data( df_budget = DataWarehouse._get_cumulative_emissions( projected_ei=budgeted_ei, projected_production=projected_production ) - base_year_scale = df_trajectory.loc[df_budget.index][base_year].mul( - df_budget[base_year].map(lambda x: Q_(0.0, f"1/({x.u})") if x.m == 0.0 else 1 / x) - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + base_year_scale = df_trajectory.loc[df_budget.index][base_year].mul( + df_budget[base_year].map(lambda x: Q_(0.0, f"1/({x.u})") if x.m == 0.0 else 1 / x) + ) df_scaled_budget = df_budget.mul(base_year_scale, axis=0) # FIXME: we calculate exceedance only against df_budget, not also df_scaled_budget # df_trajectory_exceedance = self._get_exceedance_year(df_trajectory, df_budget, self.company_data.projection_controls.TARGET_YEAR, None) @@ -577,7 +601,7 @@ def get_preprocessed_company_data(self, company_ids: List[str]) -> List[ICompany company_data = self.company_data.get_company_data(company_ids) df_company_data = pd.DataFrame.from_records([dict(c) for c in company_data]).set_index( - self.column_config.COMPANY_ID, drop=False + self.company_data.column_config.COMPANY_ID, drop=False ) valid_company_ids = df_company_data.index.to_list() @@ -649,7 +673,7 @@ def _convert_df_to_model(self, df_company_data: pd.DataFrame) -> List[ICompanyAg except ValidationError: logger.warning( "(one of) the input(s) of company %s is invalid and will be skipped" - % company_data[self.column_config.COMPANY_NAME] + % company_data[self.company_data.column_config.COMPANY_NAME] ) pass return model_companies @@ -671,31 +695,34 @@ def _get_cumulative_emissions(cls, projected_ei: pd.DataFrame, projected_product # Ensure that projected_production is ordered the same as projected_ei, preserving order of projected_ei # projected_production is constructed to be limited to the years we want to analyze - proj_prod_t = asPintDataFrame(projected_production.loc[projected_ei.index].T) - # Limit projected_ei to the year range of projected_production - proj_ei_t = asPintDataFrame(projected_ei[proj_prod_t.index].T) - units_CO2e = "t CO2e" - # We use pd.concat because pd.combine reverts our PintArrays into object arrays :-/ - proj_CO2e_m_t = pd.concat( - [ - ITR.data.osc_units.align_production_to_bm(proj_prod_t[col], proj_ei_t[col]) - .mul(proj_ei_t[col]) - .pint.m_as(units_CO2e) - for col in proj_ei_t.columns - ], - axis=1, - ) - # pd.concat names parameter refers to index.names; there's no way to set columns.names - proj_CO2e_m_t.columns.names = proj_ei_t.columns.names - if ITR.HAS_UNCERTAINTIES: - # Sum both the nominal and std_dev values, because these series are completely correlated - # Note that NaNs in this dataframe will be nan+/-nan, showing up in both nom and err - nom_CO2e_m_t = proj_CO2e_m_t.apply(ITR.nominal_values).cumsum() - err_CO2e_m_t = 
proj_CO2e_m_t.apply(ITR.std_devs).cumsum() - cumulative_emissions_m_t = nom_CO2e_m_t.combine(err_CO2e_m_t, ITR.recombine_nom_and_std) - else: - cumulative_emissions_m_t = proj_CO2e_m_t.cumsum() - return cumulative_emissions_m_t.T.astype(f"pint[{units_CO2e}]") + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + proj_prod_t = asPintDataFrame(projected_production.loc[projected_ei.index].T) + # Limit projected_ei to the year range of projected_production + proj_ei_t = asPintDataFrame(projected_ei[proj_prod_t.index].T) + units_CO2e = "t CO2e" + # We use pd.concat because pd.combine reverts our PintArrays into object arrays :-/ + proj_CO2e_m_t = pd.concat( + [ + ITR.data.osc_units.align_production_to_bm(proj_prod_t[col], proj_ei_t[col]) + .mul(proj_ei_t[col]) + .pint.m_as(units_CO2e) + for col in proj_ei_t.columns + ], + axis=1, + ) + # pd.concat names parameter refers to index.names; there's no way to set columns.names + proj_CO2e_m_t.columns.names = proj_ei_t.columns.names + if ITR.HAS_UNCERTAINTIES: + # Sum both the nominal and std_dev values, because these series are completely correlated + # Note that NaNs in this dataframe will be nan+/-nan, showing up in both nom and err + nom_CO2e_m_t = proj_CO2e_m_t.apply(ITR.nominal_values).cumsum() + err_CO2e_m_t = proj_CO2e_m_t.apply(ITR.std_devs).cumsum() + cumulative_emissions_m_t = nom_CO2e_m_t.combine(err_CO2e_m_t, ITR.recombine_nom_and_std) + else: + cumulative_emissions_m_t = proj_CO2e_m_t.cumsum() + return cumulative_emissions_m_t.T.astype(f"pint[{units_CO2e}]") @classmethod def _get_exceedance_year( diff --git a/src/ITR/data/template.py b/src/ITR/data/template.py index 4ad790de..4ca62c6b 100644 --- a/src/ITR/data/template.py +++ b/src/ITR/data/template.py @@ -253,10 +253,15 @@ def __init__( self.template_v2_start_year = None self.projection_controls = projection_controls # The initial population of companies' data - self._companies = self._init_from_template_company_data(excel_path) - super().__init__(self._companies, column_config, projection_controls) - # The perfection of historic ESG data (adding synthethic company sectors, dropping those with missing data) - self._companies = self._convert_from_template_company_data() + if excel_path: + self._own_data = True + self._companies = self._init_from_template_company_data(excel_path) + super().__init__(self._companies, column_config, projection_controls) + # The perfection of historic ESG data (adding synthethic company sectors, dropping those with missing data) + self._companies = self._convert_from_template_company_data() + else: + self._own_data = False + self._companies = [] # When rows of data are expressed in terms of scope intensities, solve for the implied production # This function is called before we've decided on "best" production, and indeed generates candidates for "best" emissions @@ -411,13 +416,15 @@ def _solve_intensities(self, df_fundamentals: pd.DataFrame, df_esg: pd.DataFrame ], axis=1, ) - if ITR.HAS_UNCERTAINTIES: - df4 = df3_t.astype( - "pint[t CO2e]" - ).T # .drop_duplicates() # When we have uncertainties, multiple observations influence the observed error term - # Also https://github.com/pandas-dev/pandas/issues/12693 - else: - df4 = df3_t.astype("pint[t CO2e]").T.drop_duplicates() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if ITR.HAS_UNCERTAINTIES: + df4 = df3_t.astype( + "pint[t CO2e]" + ).T # .drop_duplicates() # When we have uncertainties, multiple 
observations influence the observed error term + # Also https://github.com/pandas-dev/pandas/issues/12693 + else: + df4 = df3_t.astype("pint[t CO2e]").T.drop_duplicates() df5 = df4.droplevel( [ColumnsConfig.COMPANY_ID, ColumnsConfig.TEMPLATE_REPORT_DATE] ).swaplevel() # .sort_index() @@ -514,7 +521,7 @@ def _init_from_template_company_data(self, excel_path: str): ].ffill() # NA in exposure is how we drop rows we want to ignore - df = df[df.exposure.notna()] + df = df[df.exposure.notna()].copy() # TODO: Fix market_cap column naming inconsistency df.rename( @@ -569,7 +576,7 @@ def _init_from_template_company_data(self, excel_path: str): ) ) ): - error_message = f"All data should be in the same currency." + error_message = "All data should be in the same currency." logger.error(error_message) raise ValueError(error_message) elif fx_quote.any(): @@ -633,7 +640,7 @@ def convert_prefix_to_scalar(x): ) else: if len(df_fundamentals[ColumnsConfig.COMPANY_CURRENCY].unique()) != 1: - error_message = f"All data should be in the same currency." + error_message = "All data should be in the same currency." logger.error(error_message) raise ValueError(error_message) for col in fundamental_metrics: @@ -736,9 +743,11 @@ def convert_prefix_to_scalar(x): # Disable rows we do not yet handle df_esg = df_esg[~df_esg.metric.isin(["generation", "consumption"])] if ColumnsConfig.BASE_YEAR in df_esg.columns: - df_esg = df_esg[df_esg.base_year.map(lambda x: isinstance(x, str) or x.lower() != "x")] + df_esg = df_esg[ + df_esg[ColumnsConfig.BASE_YEAR].map(lambda x: not isinstance(x, str) or x.lower() != "x") + ] if "submetric" in df_esg.columns: - df_esg = df_esg[df_esg.submetric.map(lambda x: isinstance(x, str) or x.lower() != "ignore")] + df_esg = df_esg[df_esg.submetric.map(lambda x: not isinstance(x, str) or x.lower() != "ignore")] # FIXME: Should we move more df_esg work up here? 
self.df_esg = df_esg self.df_fundamentals = df_fundamentals @@ -862,7 +871,7 @@ def _fixup_name(x): f"Company {company_id} uses multiple units describing scopes " f"{[s for s in em_unit_ambig.loc[[company_id]]['metric']]}" ) - logger.warning(f"The ITR Tool will choose one and covert all to that") + logger.warning("The ITR Tool will choose one and covert all to that") em_units = em_metrics.groupby(by=["company_id"], group_keys=True).first() # We update the metrics we were told with the metrics we are given @@ -1244,14 +1253,21 @@ def _fixup_name(x): logger.warning(f"Dropping NULL-valued production data for these indexes\n{df3_null_idx}") df3_num_t = df3_num_t[~df3_null_idx] df3_denom_t = df3_denom_t[~df3_null_idx] - df4 = ( - df3_num_t - * df3_denom_t.rdiv(1.0).apply( - lambda x: x.map( - lambda y: x.dtype.na_value if ITR.isna(y) else Q_(0, x.dtype.units) if y.m == np.inf else y + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df4 = ( + df3_num_t + * df3_denom_t.rdiv(1.0).apply( + lambda x: x.map( + lambda y: x.dtype.na_value + if ITR.isna(y) + else Q_(0, x.dtype.units) + if np.isinf(ITR.nominal_values(y.m)) + else y + ) ) - ) - ).T + ).T df4["variable"] = VariablesConfig.EMISSIONS_INTENSITIES df4 = df4.reset_index().set_index(["company_id", "variable", "scope"]) # Build df5 from PintArrays, not object types @@ -1260,7 +1276,10 @@ def _fixup_name(x): df3_denom_t = pd.concat({VariablesConfig.PRODUCTIONS: df3_denom_t}, names=["variable"], axis=1) df3_denom_t = pd.concat({"production": df3_denom_t}, names=["scope"], axis=1) df3_denom_t.columns = df3_denom_t.columns.reorder_levels(["company_id", "variable", "scope"]) - df5 = pd.concat([df3_num_t.T, df3_denom_t.T, df4]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df5 = pd.concat([df3_num_t.T, df3_denom_t.T, df4]) df_historic_data = df5 @@ -1380,7 +1399,7 @@ def unique_ids(mask): ) if c_ids_with_nonnumeric_target: error_message = ( - f"Non-numeric target reduction ambition is invalid; please fix companies with ID: " + "Non-numeric target reduction ambition is invalid; please fix companies with ID: " f"{c_ids_with_nonnumeric_target}" ) logger.error(error_message) @@ -1388,7 +1407,7 @@ def unique_ids(mask): c_ids_with_increase_target = list(target_data[target_data["target_reduction_ambition"] < 0].index) if c_ids_with_increase_target: error_message = ( - f"Negative target reduction ambition is invalid and entered for companies with ID: " + "Negative target reduction ambition is invalid and entered for companies with ID: " f"{c_ids_with_increase_target}" ) logger.error(error_message) diff --git a/src/ITR/data/vault_providers.py b/src/ITR/data/vault_providers.py index 376d21c0..48671282 100644 --- a/src/ITR/data/vault_providers.py +++ b/src/ITR/data/vault_providers.py @@ -1,16 +1,21 @@ import logging import os import pathlib -from typing import List, Optional, Type +import warnings +from abc import ABC +from typing import Callable, List, Optional, Type, Union import numpy as np import osc_ingest_trino as osc import pandas as pd import sqlalchemy from dotenv import load_dotenv +from mypy_boto3_s3.service_resource import Bucket + +import ITR from ..configs import ColumnsConfig, LoggingConfig, ProjectionControls -from ..data import PintArray, ureg +from ..data import PintArray, PintType, ureg # Rather than duplicating a few methods from BaseCompanyDataProvider, we 
just call them to delegate to them from ..data.base_providers import BaseCompanyDataProvider @@ -20,7 +25,8 @@ ProductionBenchmarkDataProvider, ) from ..data.data_warehouse import DataWarehouse -from ..data.osc_units import Quantity +from ..data.osc_units import Q_, EmissionsQuantity, Quantity, delta_degC_Quantity +from ..data.template import TemplateProviderCompany from ..interfaces import ( EScope, IBenchmark, @@ -30,26 +36,16 @@ IProductionBenchmarkScopes, ) +# re_simplify_units = r" \/ (\w+)( \/ (\w+))? \* \1(?(3) \* \3|)" +re_simplify_units_both = r" \/ (\w+) \/ (\w+) \* \1 \* \2" +re_simplify_units_one = r" \/ (\w+) \* \1" + # TODO handle ways to append information (from other providers, other benchmarks, new scope info, new corp data updates, etc) logger = logging.getLogger(__name__) LoggingConfig.add_config_to_logger(logger) -# Load some standard environment variables from a dot-env file, if it exists. -# If no such file can be found, does not fail, and so allows these environment vars to -# be populated in some other way -dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", os.environ.get("HOME", "/opt/app-root/src")) -dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env" -if os.path.exists(dotenv_path): - load_dotenv(dotenv_path=dotenv_path, override=True) - -ingest_catalog = "osc_datacommons_dev" -ingest_schema = "demo_dv" -demo_schema = "demo_dv" - -engine = osc.attach_trino_engine(verbose=True, catalog=ingest_catalog, schema=ingest_schema) - # If DF_COL contains Pint quantities (because it is a PintArray or an array of Pint Quantities), # return a two-column dataframe of magnitudes and units. @@ -66,8 +62,11 @@ def dequantify_column(df_col: pd.Series) -> pd.DataFrame: elif df_col.size == 0: return df_col elif isinstance(df_col.iloc[0], Quantity): # type: ignore - m, u = list(zip(*df_col.map(lambda x: (np.nan, "dimensionless") if pd.isna(x) else (x.m, str(x.u))))) - return pd.DataFrame({df_col.name: m, df_col.name + "_units": u}, index=df_col.index) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + m, u = list(zip(*df_col.map(lambda x: (np.nan, "dimensionless") if pd.isna(x) else (x.m, str(x.u))))) + return pd.DataFrame({df_col.name: m, df_col.name + "_units": u}, index=df_col.index).convert_dtypes() else: return df_col @@ -104,8 +103,13 @@ def requantify_df(df: pd.DataFrame, typemap={}) -> pd.DataFrame: new_col = PintArray(df[col], dtype=f"pint[{ureg(df[units_col].iloc[0]).u}]") else: # Make a pd.Series of Quantity in a way that does not throw UnitStrippedWarning + if df[col].map(lambda x: x is None).any(): + # breakpoint() + raise new_col = pd.Series(data=df[col], name=col) * pd.Series( - data=df[units_col].map(lambda x: typemap.get(col, "dimensionless") if pd.isna(x) else ureg(x).u), + data=df[units_col].map( + lambda x: typemap.get(col, ureg("dimensionless").u) if pd.isna(x) else ureg(x).u + ), name=col, ) if col in typemap.keys(): @@ -118,28 +122,80 @@ def requantify_df(df: pd.DataFrame, typemap={}) -> pd.DataFrame: return df -def create_table_from_df( +class VaultInstance(ABC): + def __init__( + self, + engine: sqlalchemy.Engine, + schema: Optional[str] = "", + hive_bucket: Optional[Bucket] = None, + hive_catalog: Optional[str] = None, + hive_schema: Optional[str] = None, + ): + """ + As an alternative to using FastAPI interfaces, this creates an interface allowing access to Production benchmark data via the Data Vault. 
+ :param engine: the Sqlalchemy connect to the Data Vault + :param schema: The database schema where the Data Vault lives + :param hive_bucket, hive_catalog, hive_schema: Optional parameters to enable fast ingestion via Hive; otherwise uses Trino batch insertion (which is slow) + """ + super().__init__() + self.engine = engine + self.schema = schema or engine.dialect.default_schema_name or "demo_dv" + self.hive_bucket = hive_bucket + self.hive_catalog = hive_catalog + self.hive_schema = hive_schema + + +def create_vault_table_from_df( df: pd.DataFrame, - schemaname: str, tablename: str, - engine: sqlalchemy.engine.base.Engine, + vault: VaultInstance, verbose=False, ): - drop_table = f"drop table if exists {schemaname}.{tablename}" - qres = osc._do_sql(drop_table, engine, verbose) - logger.debug("dtypes, columns, and index of create_table_from_df(df...)") + """ + Create a table in the Data Vault + + :param df: The DataFrame to be written as a table in the Data Vault + :param schemaname: The schema where the table should be written + :param tablename: The name of the table in the Data Vault + :param engine: The SqlAlchemy connection to the Data Vault + :param hive_bucket: :param hive_catalog: :param hive_schema: Optional paramters. If given we attempt to use a fast Hive ingestion process. Otherwise use default (and slow) Trino ingestion. + :param verbose: If True, log information about actions of the Data Vault as they happen + """ + drop_table = f"drop table if exists {vault.schema}.{tablename}" + qres = osc._do_sql(drop_table, vault.engine, verbose) + logger.debug("dtypes, columns, and index of create_vault_table_from_df(df...)") logger.debug(df.dtypes) logger.debug(df.columns) logger.debug(df.index) - new_df = dequantify_df(df) - new_df.to_sql( - tablename, - con=engine, - schema=schemaname, - if_exists="append", - index=False, - method=osc.TrinoBatchInsert(batch_size=5000, verbose=True), - ) + new_df = dequantify_df(df).convert_dtypes() + if vault.hive_bucket is not None: + osc.fast_pandas_ingest_via_hive( + new_df, + vault.engine, + None, + vault.schema, + tablename, + vault.hive_bucket, + vault.hive_catalog, + vault.hive_schema, + partition_columns=["year"] if "year" in new_df.columns else [], + overwrite=True, + typemap={ + "datetime64[ns]": "timestamp(6)", + "datetime64[ns, UTC]": "timestamp(6)", + # "Int16":"integer", "int16":"integer" + }, + verbose=verbose, + ) + else: + new_df.to_sql( + tablename, + con=vault.engine, + schema=vault.schema, + if_exists="append", + index=False, + method=osc.TrinoBatchInsert(batch_size=5000, verbose=verbose), + ) # When reading SQL tables to import into DataFrames, it is up to the user to preserve {COL}, {COL}_units pairings so they can be reconstructed. 
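A minimal sketch of that {COL}/{COL}_units pairing convention using plain pandas and pint with toy data; the real code paths go through dequantify_df and requantify_df rather than the hand-rolled loop shown here:

import pandas as pd
import pint

ureg = pint.UnitRegistry()
Q_ = ureg.Quantity

# Outbound: a Quantity column is split into a magnitude column plus a parallel *_units column
prod = pd.Series([Q_(10.0, "t"), Q_(12.5, "t")], name="production_by_year")
flat = pd.DataFrame({
    "production_by_year": prod.map(lambda q: q.m),
    "production_by_year_units": prod.map(lambda q: str(q.u)),
})

# Inbound: the pair is recombined into Quantities again
restored = pd.Series(
    [Q_(m, u) for m, u in zip(flat["production_by_year"], flat["production_by_year_units"])],
    name="production_by_year",
)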
@@ -151,33 +207,33 @@ def create_table_from_df( def read_quantified_sql( sql: str, - tablename, - schemaname, - engine: sqlalchemy.engine.base.Engine, - index_col=None, + tablename: Union[str, None], + engine: sqlalchemy.Engine, + schemaname: Optional[str] = "", + index_col: Optional[Union[List[str], str, None]] = None, ) -> pd.DataFrame: - qres = osc._do_sql(f"describe {schemaname}.{tablename}", engine, verbose=False) - # tabledesc will be a list of tuples (column, type, extra, comment) - colnames = [x[0] for x in qres] # read columns normally...this will be missing any unit-related information sql_df = pd.read_sql(sql, engine, index_col) - # if the query requests columns that don't otherwise bring unit information along with them, get that information too - extra_unit_columns = [ - (i, f"{col}_units") - for i, col in enumerate(sql_df.columns) - if f"{col}_units" not in sql_df.columns and f"{col}_units" in colnames - ] - if extra_unit_columns: - extra_unit_columns_positions = [ - (i, extra_unit_columns[i][0], extra_unit_columns[i][1]) for i in range(len(extra_unit_columns)) + if tablename: + qres = osc._do_sql(f"describe {schemaname}.{tablename}", engine, verbose=False) + # tabledesc will be a list of tuples (column, type, extra, comment) + colnames = [x[0] for x in qres] + # if the query requests columns that don't otherwise bring unit information along with them, get that information too + extra_unit_columns = [ + (i, f"{col}_units") + for i, col in enumerate(sql_df.columns) + if f"{col}_units" not in sql_df.columns and f"{col}_units" in colnames ] - for col_tuple in extra_unit_columns_positions: - logger.error( - f"Missing units column '{col_tuple[2]}' after original column '{sql_df.columns[col_tuple[1]]}' (should be column #{col_tuple[0]+col_tuple[1]+1} in new query)" - ) - raise ValueError - else: - return requantify_df(sql_df).convert_dtypes() + if extra_unit_columns: + extra_unit_columns_positions = [ + (i, extra_unit_columns[i][0], extra_unit_columns[i][1]) for i in range(len(extra_unit_columns)) + ] + for col_tuple in extra_unit_columns_positions: + logger.error( + f"Missing units column '{col_tuple[2]}' after original column '{sql_df.columns[col_tuple[1]]}' (should be column #{col_tuple[0]+col_tuple[1]+1} in new query)" + ) + raise ValueError + return requantify_df(sql_df).convert_dtypes() # Basic Corp Data Asumptions @@ -193,65 +249,387 @@ def read_quantified_sql( # Benchmarks are named (e.g., 'OECM') -class VaultCompanyDataProvider(CompanyDataProvider): - """ - This class serves primarily for connecting to the ITR tool to the Data Vault via Trino. 
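A usage sketch for the reworked read_quantified_sql signature, assuming a configured Trino engine and a hypothetical table that follows the {COL}/{COL}_units convention; passing None for the tablename skips the DESCRIBE lookup because the units columns are selected explicitly:

df = read_quantified_sql(
    "select company_id, year, production_by_year, production_by_year_units"
    " from demo_dv.itr_production_data",   # hypothetical table name
    None,                                  # no tablename: units columns are already in the select list
    engine,                                # an existing SQLAlchemy Trino engine
    index_col=["company_id", "year"],
)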
- - :param company_table: the name of the Trino table that contains fundamental data for companies - :param target_table: the name of the Trino table that contains company (emissions intensity) target data (and possibly historical data) - :param trajectory_table: the name of the Trino table that contains company (emissions intensity) historical data (and possibly trajectory data) - :param company_schema: the name of the schema where the company_table is found - :param column_config: An optional ColumnsConfig object containing relevant variable names - """ - +class VaultProviderProductionBenchmark(ProductionBenchmarkDataProvider): def __init__( self, - engine: sqlalchemy.engine.base.Engine, - company_table: str, - target_table: str = "", - trajectory_table: str = "", - company_schema: str = "", + vault: VaultInstance, + benchmark_name: str, + prod_df: pd.DataFrame = pd.DataFrame(), column_config: Type[ColumnsConfig] = ColumnsConfig, ): + """ + As an alternative to using FastAPI interfaces, this creates an interface allowing access to Production benchmark data via the Data Vault. + :param vault: the Data Vault instance + :param benchmark_name: the table name of the benchmark (in Trino) + :param production_benchmarks: List of IBenchmarkScopes + :param column_config: An optional ColumnsConfig object containing relevant variable names + """ super().__init__() - self._engine = engine - self._schema = company_schema or engine.dialect.default_schema_name or "demo_dv" - self._company_table = company_table + self._v = vault + self._benchmark_name = benchmark_name self.column_config = column_config - # Validate and complete the projected trajectories - self._target_table = target_table or company_table.replace("company_", "target_") # target_data - self._trajectory_table = trajectory_table or company_table.replace("company_", "trajectory_") # trajectory_data - self._production_table = company_table.replace("company_", "production_") # production_data - self._emissions_table = company_table.replace("company_", "emissions_") # emissions_data - companies_without_projections = osc._do_sql( - f""" -select C.company_name, C.company_id from {self._schema}.{self._company_table} C left join {self._schema}.{self._target_table} EI on EI.company_id=C.company_id -where EI.ei_s1_by_year is NULL and EI.ei_s1s2_by_year is NULL and EI.ei_s1s2s3_by_year is NULL -""", - self._engine, - verbose=True, - ) - if companies_without_projections: - logger.error( - f"Provide either historic emissions data or projections for companies with IDs {companies_without_projections}" + if prod_df.empty: + self._own_data = False + # unstack and reshape what we read from SQL + prod_df = read_quantified_sql( + f"select sector, region, year, production, production_units from {self._benchmark_name}", + None, + self._v.engine, + index_col=["sector", "region", "year"], ) + prod_df["scope"] = EScope.AnyScope + self._prod_df = prod_df.set_index("scope", append=True).unstack(level=2) + else: + self._own_data = True + self._prod_df = prod_df + df = prod_df.stack(level=0).to_frame("production").reset_index() + df.scope = df.scope.map(lambda x: x.name) + create_vault_table_from_df(df, benchmark_name, self._v) - def get_projection_controls(self) -> ProjectionControls: - raise NotImplementedError + def benchmark_changed(self, new_projected_production: ProductionBenchmarkDataProvider) -> bool: + # The Data Vault does not keep its own copies of benchmarks + return False - def get_company_ids(self) -> List[str]: - raise NotImplementedError + # Production 
benchmarks are dimensionless, relevant for AnyScope + def _get_projected_production(self, scope: EScope = EScope.AnyScope) -> pd.DataFrame: + """ + Converts IProductionBenchmarkScopes into dataframe for a scope + :param scope: a scope + :return: a pint[dimensionless] pd.DataFrame + """ + return self._prod_df + + def get_company_projected_production(self, company_sector_region_scope: pd.DataFrame) -> pd.DataFrame: + """ + get the projected productions for list of companies + :param company_sector_region_scope: DataFrame with at least the following columns : + ColumnsConfig.COMPANY_ID, ColumnsConfig.SECTOR, ColumnsConfig.REGION, ColumnsConfig.SCOPE + :return: DataFrame of projected productions for [base_year through 2050] + """ + + if self._prod_df.empty: + # breakpoint() + raise + # select company_id, year, production_by_year, production_by_year_units from itr_production_data where company_id='US00130H1059' order by year; + else: + from ..utils import get_benchmark_projections + + company_benchmark_projections = get_benchmark_projections(self._prod_df, company_sector_region_scope) - def _validate_projected_trajectories( - self, companies: List[ICompanyData], ei_benchmarks: IntensityBenchmarkDataProvider + company_production = company_sector_region_scope.set_index(self.column_config.SCOPE, append=True)[ + self.column_config.BASE_YEAR_PRODUCTION + ] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # We have to use lambda function here because company_production is heterogeneous, not a PintArray + nan_production = company_production.map(lambda x: ITR.isna(x)) + if nan_production.any(): + # If we don't have valid production data for base year, we get back a nan result that's a pain to debug, so nag here + logger.error( + f"these companies are missing production data: {nan_production[nan_production].index.get_level_values(0).to_list()}" + ) + # We transpose the operation so that Pandas is happy to preserve the dtype integrity of the column + company_projected_productions_t = company_benchmark_projections.T.mul(company_production, axis=1) + return company_projected_productions_t.T + + +class VaultProviderIntensityBenchmark(IntensityBenchmarkDataProvider): + def __init__( + self, + vault: VaultInstance, + benchmark_name: str, + ei_df_t: pd.DataFrame = pd.DataFrame(), + benchmark_temperature: delta_degC_Quantity = Q_(1.5, "delta_degC"), + benchmark_global_budget: EmissionsQuantity = Q_(396, "Gt CO2e"), + is_AFOLU_included: bool = False, + production_centric: bool = False, + column_config: Type[ColumnsConfig] = ColumnsConfig, + projection_controls: ProjectionControls = ProjectionControls(), + # What to do about **kwargs? ): """ - Called when benchmark data is first known, or when projection control parameters or benchmark data changes. - COMPANIES are a list of companies with historic data that need to be projected. - EI_BENCHMARKS are the benchmarks for all sectors, regions, and scopes - In previous incarnations of this function, no benchmark data was needed for any reason. + As an alternative to using FastAPI interfaces, this creates an interface allowing access to Emission Intensity benchmark data via the Data Vault. 
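A toy, unit-free sketch of the transposed multiplication used in get_company_projected_production above; in the real code the columns carry pint dtypes, which is exactly what the transpose-then-multiply pattern preserves:

import pandas as pd

# benchmark growth factors: rows are companies, columns are years
bench = pd.DataFrame(
    {2020: [1.00, 1.00], 2021: [1.03, 0.97], 2022: [1.06, 0.95]},
    index=pd.Index(["US001", "US002"], name="company_id"),
)
base_prod = pd.Series([100.0, 250.0], index=bench.index, name="base_year_production")

# Multiply column-wise on the transposed frame, then transpose back
projected_t = bench.T.mul(base_prod, axis=1)
projected = projected_t.T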
+ :param vault: the Data Vault instance + :param benchmark_name: the table name of the benchmark (in Trino) + :param production_centric: FIXME + :param ei_df_t: FIXME + :param column_config: An optional ColumnsConfig object containing relevant variable names + :param projection_controls: Projection Controls set the target BASE_YEAR, START_YEAR, and END_YEAR parameters of the model """ + self._v = vault + self._benchmark_name = benchmark_name + self.projection_controls = projection_controls + if ei_df_t.empty: + self._own_data = False + # unstack and reshape what we read from SQL + self._EI_df_t = ( + read_quantified_sql( + f"select sector, region, scope, year, intensity, intensity_units from {self._benchmark_name}", + None, + self._v.engine, + index_col=["sector", "region", "scope", "year"], + ) + .unstack(level="year") + .T + ) + self._EI_df_t = ITR.data.osc_units.asPintDataFrame(self._EI_df_t) + ei_bm_parameters = read_quantified_sql( + "select benchmark_temp, benchmark_temp_units, global_budget, global_budget_units, is_AFOLU_included, production_centric" + f" from {self._benchmark_name} limit 1", + None, + self._v.engine, + ) + super().__init__( + ei_bm_parameters["benchmark_temp"].squeeze(), + ei_bm_parameters["global_budget"].squeeze(), + ei_bm_parameters["is_AFOLU_included"].squeeze(), + ) + self.production_centric = ei_bm_parameters["production_centric"].squeeze() + else: + super().__init__( + benchmark_temperature, + benchmark_global_budget, + is_AFOLU_included, + ) # type: ignore + self._own_data = True + self._EI_df_t = ei_df_t + self.production_centric = production_centric + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df = ei_df_t.T.stack(level=0).to_frame("intensity").reset_index() + df.scope = df.scope.map(lambda x: x.name) + df["global_budget"] = benchmark_global_budget + df["benchmark_temp"] = benchmark_temperature + df["is_AFOLU_included"] = is_AFOLU_included + df["production_centric"] = production_centric + create_vault_table_from_df(df, benchmark_name, self._v) + + def get_scopes(self) -> List[EScope]: + scopes = self._EI_df_t.columns.get_level_values("scope").unique() + return scopes.tolist() + + def benchmarks_changed(self, new_projected_ei: IntensityBenchmarkDataProvider) -> bool: + assert hasattr(new_projected_ei, "_EI_df_t") + return self._EI_df_t.compare(new_projected_ei._EI_df_t).empty + + def prod_centric_changed(self, new_projected_ei: IntensityBenchmarkDataProvider) -> bool: + prev_prod_centric = self.production_centric + next_prod_centric = False + assert hasattr(new_projected_ei, "_EI_benchmarks") + if getattr(new_projected_ei._EI_benchmarks, "S1S2", None): + next_prod_centric = new_projected_ei._EI_benchmarks["S1S2"].production_centric + return prev_prod_centric != next_prod_centric + + def is_production_centric(self) -> bool: + """ + returns True if benchmark is "production_centric" (as defined by OECM) + """ + return self.production_centric + + def _get_intensity_benchmarks( + self, company_sector_region_scope: Optional[pd.DataFrame] = None, scope_to_calc: Optional[EScope] = None + ) -> pd.DataFrame: + """ + Overrides subclass method + returns dataframe of all EI benchmarks if COMPANY_SECTOR_REGION_SCOPE is None. Otherwise + returns a Dataframe with intensity benchmarks per company_id given a region and sector. 
+ :param company_sector_region_scope: DataFrame indexed by ColumnsConfig.COMPANY_ID + with at least the following columns: ColumnsConfig.SECTOR, ColumnsConfig.REGION, and ColumnsConfig.SCOPE + :return: A DataFrame with company and intensity benchmarks; rows are calendar years, columns are company data + """ + if company_sector_region_scope is None: + return self._EI_df_t + sec_reg_scopes = company_sector_region_scope[["sector", "region", "scope"]] + if scope_to_calc is not None: + sec_reg_scopes = sec_reg_scopes[sec_reg_scopes.scope.eq(scope_to_calc)] + sec_reg_scopes_mi = pd.MultiIndex.from_frame(sec_reg_scopes).unique() + bm_proj_t = self._EI_df_t.loc[ + range( + self.projection_controls.BASE_YEAR, + self.projection_controls.TARGET_YEAR + 1, + ), + # Here we gather all requested combos as well as ensuring we have 'Global' regional coverage + # for sector/scope combinations that arrive with unknown region values + [ + col + for col in sec_reg_scopes_mi.append( + pd.MultiIndex.from_frame(sec_reg_scopes.assign(region="Global")) + ).unique() + if col in self._EI_df_t.columns + ], + ] + # This piece of work essentially does a column-based join (to avoid extra transpositions) + result = pd.concat( + [ + bm_proj_t[tuple(ser)].rename((idx, ser.iloc[2])) + if tuple(ser) in bm_proj_t + else bm_proj_t[ser_global].rename((idx, ser.iloc[2])) + if ( + ser_global := ( + ser.iloc[0], + "Global", + ser.iloc[2], + ) + ) + in bm_proj_t + else pd.Series() + for idx, ser in sec_reg_scopes.iterrows() + ], + axis=1, + ).dropna(axis=1, how="all") + result.columns = pd.MultiIndex.from_tuples(result.columns, names=["company_id", "scope"]) + return result + + # SDA stands for Sectoral Decarbonization Approach; see https://sciencebasedtargets.org/resources/files/SBTi-Power-Sector-15C-guide-FINAL.pdf + def get_SDA_intensity_benchmarks( + self, company_info_at_base_year: pd.DataFrame, scope_to_calc: Optional[EScope] = None + ) -> pd.DataFrame: + """ + Overrides subclass method + returns a Dataframe with intensity benchmarks per company_id given a region and sector. 
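The column-based join above falls back to a 'Global' benchmark column when a company's exact (sector, region, scope) combination is not benchmarked. A simplified standalone sketch of that fallback, with made-up intensities:

import pandas as pd

ei_t = pd.DataFrame(
    {
        ("Steel", "Europe", "S1S2"): [1.7, 1.6],
        ("Steel", "Global", "S1S2"): [1.9, 1.8],
    },
    index=pd.Index([2020, 2021], name="year"),
)
requests = [("Steel", "Europe", "S1S2"), ("Steel", "Asia", "S1S2")]  # Asia is not benchmarked
cols = [
    ei_t[key] if key in ei_t.columns else ei_t[(key[0], "Global", key[2])]
    for key in requests
]
result = pd.concat(cols, axis=1)  # the real code then relabels columns as (company_id, scope)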
+ :param company_info_at_base_year: DataFrame with at least the following columns : + ColumnsConfig.COMPANY_ID, ColumnsConfig.BASE_EI, ColumnsConfig.SECTOR, ColumnsConfig.REGION, ColumnsConfig.SCOPE + :return: A DataFrame with company and SDA intensity benchmarks per calendar year per row + """ + # To make pint happier, we do our math in columns that can be represented by PintArrays + intensity_benchmarks_t = self._get_intensity_benchmarks(company_info_at_base_year, scope_to_calc) raise NotImplementedError + decarbonization_paths_t = self._get_decarbonizations_paths(intensity_benchmarks_t) + last_ei = intensity_benchmarks_t.loc[self.projection_controls.TARGET_YEAR] + ei_base = intensity_benchmarks_t.loc[self.projection_controls.BASE_YEAR] + df_t = decarbonization_paths_t.mul((ei_base - last_ei), axis=1) + df_t = df_t.add(last_ei, axis=1) + df_t.index.name = "year" + idx = pd.Index.intersection( + df_t.columns, + pd.MultiIndex.from_arrays([company_info_at_base_year.index, company_info_at_base_year.scope]), + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # pint units don't like being twisted from columns to rows, but it's ok + df = df_t[idx].T + return df + + +class VaultCompanyDataProvider(BaseCompanyDataProvider): + def __init__( + self, + vault: VaultInstance, + company_table: str, + template_company_data: Union[TemplateProviderCompany, None], + column_config: Type[ColumnsConfig] = ColumnsConfig, + projection_controls: ProjectionControls = ProjectionControls(), + ): + """ + This class serves primarily for connecting to the ITR tool to the Data Vault via Trino. + + :param vault: the Data Vault instance + :param company_table: the name of the Trino table that contains fundamental data for companies + :param template_company_data: if not None, company data to ingest into company, target, and trajectory tables + :param column_config: An optional ColumnsConfig object containing relevant variable names + """ + super().__init__( + companies=[] if template_company_data is None else template_company_data._companies, + column_config=column_config, + ) + self._v = vault + self._company_table = company_table + self._production_table = "! uninitialized table !" + self._trajectory_table = "! uninitialized table !" 
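For reference, the SDA arithmetic behind get_SDA_intensity_benchmarks above: the benchmark intensity pathway is normalized into a decarbonization path running from 1 at the base year to 0 at the target year, then rescaled between each company's base-year intensity and the benchmark's end-point intensity. A toy numeric sketch with made-up values:

import pandas as pd

years = [2020, 2035, 2050]
bench_ei = pd.Series([2.0, 1.2, 0.4], index=years)  # benchmark intensity pathway
decarb_path = (bench_ei - bench_ei[2050]) / (bench_ei[2020] - bench_ei[2050])  # 1.0 ... 0.0

company_base_ei = 3.0  # company's own base-year intensity
sda_pathway = decarb_path * (company_base_ei - bench_ei[2050]) + bench_ei[2050]
# -> 3.0 at 2020, 1.7 at 2035, converging to the benchmark's 0.4 by 2050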
+ + if not template_company_data: + self._own_data = False + return + if not template_company_data.own_data: + # With our DataProvider object initialized, we'll use existing SQL table data for actual company data + return + + # Here we fill in the company data underlying the CompanyDataProvider + + df = ( + template_company_data.df_fundamentals[ + [ + "company_name", + "company_lei", + "company_id", + "sector", + "country", + "region", + "exposure", + "currency", + "report_date", + "company_market_cap", + "company_revenue", + "company_enterprise_value", + "company_ev_plus_cash", + "company_total_assets", + "cash", + "debt", + ] + ] + .copy() + .rename( + columns={ + "cash": "company_cash_equivalents", + "debt": "company_debt", + }, + ) + ) + df["year"] = df.report_date.dt.year + df.drop(columns="report_date", inplace=True) + + # ingest company data; no need to reset index because df_fundamentals also has "company_id" column + create_vault_table_from_df( + df, + self._company_table, + self._v, + verbose=True, + ) + + # We don't have any target nor trajectory projections until we connect benchmark data via DataWarehouse + + def get_company_fundamentals(self, company_ids: List[str]) -> pd.DataFrame: + """ + :param company_ids: A list of company IDs + :return: A pandas DataFrame with company fundamental info per company (company_id is a column) + """ + if self.own_data: + return super().get_company_fundamentals(company_ids) + + company_ids_sql = ",".join([f"'{cid}'" for cid in company_ids]) + # FIXME: doesn't work with heterogeneous currencies as written + df_fundamentals = read_quantified_sql( + f"select * from {self._company_table} where company_id in ({company_ids_sql})", + None, + self._v.engine, + index_col=self.column_config.COMPANY_ID, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df_prod = pd.read_sql( + f"select company_id, production_by_year, production_by_year_units from {self._production_table}" + f" where year={self.projection_controls.BASE_YEAR} and company_id in ({company_ids_sql})", + self._v.engine, + index_col=self.column_config.COMPANY_ID, + ).apply(lambda x: Q_(x.production_by_year, x.production_by_year_units), axis=1) + df_prod.name = self.column_config.BASE_YEAR_PRODUCTION + df_ei = pd.read_sql( + f"select company_id, ei_s1s2_by_year, ei_s1s2_by_year_units, ei_s3_by_year, ei_s3_by_year_units from {self._trajectory_table}" + f" where year={self.projection_controls.BASE_YEAR} and company_id in ({company_ids_sql})", + self._v.engine, + index_col=self.column_config.COMPANY_ID, + ).apply( + lambda x: [Q_(x.ei_s1s2_by_year, x.ei_s1s2_by_year_units), Q_(x.ei_s3_by_year, x.ei_s3_by_year_units)], + axis=1, + result_type="expand", + ) + df_em = df_ei.mul(df_prod, axis=0).rename( + columns={0: self.column_config.GHG_SCOPE12, 1: self.column_config.GHG_SCOPE3} + ) + df = pd.concat([df_fundamentals, df_prod, df_em], axis=1) + return df # The factors one would want to sum over companies for weighting purposes are: # * market_cap_usd @@ -260,6 +638,37 @@ def _validate_projected_trajectories( # * revenue_usd # * emissions + def get_company_projected_trajectories(self, company_ids: List[str], year=None) -> pd.DataFrame: + """ + :param company_ids: A list of company IDs + :param year: values for a specific year, or all years if None + :return: A pandas DataFrame with projected intensity trajectories per company, indexed by company_id and scope + """ + company_ids_sql = ",".join([f"'{cid}'" 
for cid in company_ids]) + if year is not None: + sql = f"select * from {self._trajectory_table} where year={self.projection_controls.BASE_YEAR} and company_id in ({company_ids_sql})" + else: + sql = f"select * from {self._trajectory_table} where company_id in ({company_ids_sql})" + df_ei = read_quantified_sql(sql, None, self._v.engine, index_col=self.column_config.COMPANY_ID) + if year: + df_ei.drop(columns="year", inplace=True) + for col in df_ei.columns: + if col.startswith("ei_") and col.endswith("_by_year"): + df_ei.rename(columns={col: EScope[col[3:-8].upper()]}, inplace=True) + elif col == "year": + pass + else: + df_ei.drop(columns=col, inplace=True) + df_ei.columns.name = "scope" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + if year is not None: + df_ei = df_ei.unstack(level=0) + else: + df_ei = df_ei.set_index("year", append=True).stack(level=0).unstack(level=1) + return df_ei.reorder_levels(["company_id", "scope"]) + # TODO: make return value a Quantity (USD or CO2) def sum_over_companies( self, @@ -269,38 +678,23 @@ def sum_over_companies( scope: EScope = EScope.S1S2, ) -> float: if factor == "enterprise_value_usd": - qres = osc._do_sql( - f"select sum(market_cap_usd + debt_usd - cash_usd) as {factor}_sum from {self._schema}.{self._company_table} where year={year}", - self._engine, - verbose=False, - ) + factor_sum = "select sum(market_cap_usd + debt_usd - cash_usd)" elif factor == "emissions": if scope in [EScope.S1, EScope.S2, EScope.S3]: - qres = osc._do_sql( - f"select sum(co2_{scope.name.lower()}_by_year) as {factor}_sum from {self._schema}.{self._emissions_table} where year={year}", - self._engine, - verbose=False, - ) + factor_sum = f"select sum(co2_{scope.name.lower()}_by_year)" elif scope == EScope.S1S2: - qres = osc._do_sql( - f"select sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)) as {factor}_sum from {self._schema}.{self._emissions_table} where year={year}", - self._engine, - verbose=False, - ) + factor_sum = "select sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year))" elif scope == EScope.S1S2S3: - qres = osc._do_sql( - f"select sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)+if(is_nan(co2_s3_by_year),0.0,co2_s3_by_year)) as {factor}_sum from {self._schema}.{self._emissions_table} where year={year}", - self._engine, - verbose=False, - ) + factor_sum = "select sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)+if(is_nan(co2_s3_by_year),0.0,co2_s3_by_year))" else: - assert False + raise ValueError(f"scope {scope} not supported") else: - qres = osc._do_sql( - f"select sum({factor}) as {factor}_sum from {self._schema}.{self._company_table} where year={year}", - self._engine, - verbose=False, - ) + factor_sum = f"select sum({factor})" + sql = f"{factor_sum} as {factor}_sum from {self._v.schema}.{self._company_table}" + if year is not None: + sql = f"{sql} where year={year}" + qres = osc._do_sql(sql, self._v.engine, verbose=False) + # qres[0] is the first row of the returned data; qres[0][0] is the first (and only) column of the row returned return qres[0][0] @@ -317,507 +711,345 @@ def compute_portfolio_weights( :param company_ids: A pd.Series of company IDs (ISINs) :return: A pd.Series weighted by the factor """ + from_sql = f"from {self._v.schema}.{self._company_table}" + group_sql = "group by company_id" if factor == "company_evic": - qres = osc._do_sql( - f"select company_id, sum(company_market_cap + 
company_cash_equivalents) as {factor} from {self._schema}.{self._company_table} group by company_id", - self._engine, - verbose=False, - ) + where_sql = "" + factor_sql = "select company_id, sum(company_market_cap + company_cash_equivalents)" elif factor == "emissions": + where_sql = f"where year = {year}" if scope in [EScope.S1, EScope.S2, EScope.S3]: - qres = osc._do_sql( - f"select company_id, sum(co2_{scope.name.lower()}_by_year) as {factor} from {self._schema}.{self._emissions_table} where year={year} group by company_id", - self._engine, - verbose=False, - ) + factor_sql = f"select company_id, sum(co2_{scope.name.lower()}_by_year)" elif scope == EScope.S1S2: - qres = osc._do_sql( - f"select company_id, sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)) as {factor} from {self._schema}.{self._emissions_table} where year={year} group by company_id", - self._engine, - verbose=False, - ) + factor_sql = "select company_id, sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year))" elif scope == EScope.S1S2: - qres = osc._do_sql( - f"select company_id, sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)+if(is_nan(co2_s3_by_year),0.0,co2_s3_by_year)) as {factor} from {self._schema}.{self._emissions_table} where year={year} group by company_id", - self._engne, - verbose=False, - ) + factor_sql = "select company_id, sum(co2_s1_by_year+if(is_nan(co2_s2_by_year),0.0,co2_s2_by_year)+if(is_nan(co2_s3_by_year),0.0,co2_s3_by_year))" else: - assert False + raise ValueError(f"scope {scope} not supported") else: - qres = osc._do_sql( - f"select company_id, sum({factor}) as {factor} from {self._schema}.{self._company_table} group by company_id", - self._engine, - verbose=False, - ) + sql = f"select company_id, sum({factor})" + qres = osc._do_sql( + f"{factor_sql} as {factor} {from_sql} {where_sql} {group_sql}", self._v.engine, verbose=False + ) weights = pd.Series(data=[s[1] for s in qres], index=[s[0] for s in qres], dtype=float) weights = weights.loc[pa_temp_scores.index.intersection(weights.index)] weight_sum = weights.sum() return pa_temp_scores * weights / weight_sum - def get_company_data(self, company_ids: Optional[List[str]] = None) -> List[ICompanyData]: - """ - Get all relevant data for a list of company ids. This method should return a list of ICompanyData - instances. 
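The compute_portfolio_weights refactor above ultimately returns each company's temperature score scaled by its share of the chosen factor (EVIC, emissions, revenue, ...). A toy sketch of that weighting, with hypothetical company IDs and factor values:

import pandas as pd

pa_temp_scores = pd.Series({"US001": 2.1, "US002": 1.7})  # per-company temperature scores
weights = pd.Series({"US001": 3.0e9, "US002": 1.0e9})     # e.g. EVIC per company
weights = weights.loc[pa_temp_scores.index.intersection(weights.index)]
weighted = pa_temp_scores * weights / weights.sum()       # per-company weighted contributions
portfolio_score = weighted.sum()                          # -> 2.0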
- - :param company_ids: A list of company IDs (ISINs) - :return: A list containing the company data - """ - raise NotImplementedError + # Commented out because this doesn't include necessary base_year_production nor ghg_s1s2 nor ghg_s3 + # def get_company_fundamentals(self, company_ids: List[str]) -> pd.DataFrame: + # """ + # :param company_ids: A list of company IDs + # :return: A pandas DataFrame with company fundamental info per company + # """ + # or_clause = " or ".join([f"company_id = '{c}'" for c in company_ids]) + # sql = f"select * from {self._schema}.{self._company_table} where {or_clause}" + # df = read_quantified_sql(sql, self._company_table, self._engine, self._schema) + # # df = df.drop(columns=['projected_targets', 'projected_intensities']) + # return df.set_index(self.column_config.COMPANY_ID) - def get_value(self, company_ids: List[str], variable_name: str) -> pd.Series: - """ - Gets the value of a variable for a list of companies ids - :param company_ids: list of company ids - :param variable_name: variable name of the projected feature - :return: series of values - """ - raise NotImplementedError - def _calculate_target_projections( - self, - production_bm: ProductionBenchmarkDataProvider, - ei_bm: IntensityBenchmarkDataProvider, - ): - """ - We cannot calculate target projections until after we have loaded benchmark data. - We do so when companies are associated with benchmarks, in the DataWarehouse construction - - :param production_bm: A Production Benchmark (multi-sector, single-scope, 2020-2050) - :param ei_bm: Intensity Benchmarks for all sectors and scopes defined by the benchmark, 2020-2050 - """ - raise NotImplementedError - - def get_company_intensity_and_production_at_base_year(self, company_ids: List[str]) -> pd.DataFrame: - """ - overrides subclass method - :param: company_ids: list of company ids - :return: DataFrame the following columns : - ColumnsConfig.COMPANY_ID, ColumnsConfig.GHG_S1S2, ColumnsConfig.BASE_EI, ColumnsConfig.SECTOR and - ColumnsConfig.REGION - """ - raise NotImplementedError - - def get_company_fundamentals(self, company_ids: List[str]) -> pd.DataFrame: - """ - :param company_ids: A list of company IDs - :return: A pandas DataFrame with company fundamental info per company - """ - or_clause = " or ".join([f"company_id = '{c}'" for c in company_ids]) - sql = f"select * from {self._schema}.{self._company_table} where {or_clause}" - df = read_quantified_sql(sql, self._company_table, self._schema, self._engine) - # df = df.drop(columns=['projected_targets', 'projected_intensities']) - return df - - def get_company_projected_trajectories(self, company_ids: List[str]) -> pd.DataFrame: - """ - :param company_ids: A list of company IDs - :return: A pandas DataFrame with projected intensities per company - """ - raise NotImplementedError - - def get_company_projected_targets(self, company_ids: List[str]) -> pd.DataFrame: - """ - :param company_ids: A list of company IDs - :return: A pandas DataFrame with projected targets per company - """ - raise NotImplementedError - - def _allocate_emissions( - self, - new_companies: List[ICompanyData], - benchmarks_projected_ei: IntensityBenchmarkDataProvider, - projection_controls: ProjectionControls, - ): - """ - Use benchmark data from `ei_benchmarks` to allocate sector-level emissions from aggregated emissions. - For example, a Utility may supply both Electricity and Gas to customers, reported separately. 
- When we split the company into Electricity and Gas lines of business, we can allocate Scope emissions - to the respective lines of business using benchmark averages to guide the allocation. - """ - raise NotImplementedError - - -class VaultProviderProductionBenchmark(ProductionBenchmarkDataProvider): +# FIXME: Need to reshape the tables TARGET_DATA and TRAJECTORY_DATA so scope is a column and the EI data relates only to that scope (wide to long) +class DataVaultWarehouse(DataWarehouse): def __init__( self, - engine: sqlalchemy.engine.base.Engine, - benchmark_name: str, - production_benchmarks: IProductionBenchmarkScopes, - ingest_schema: str = "", - column_config: Type[ColumnsConfig] = ColumnsConfig, + vault: VaultInstance, + company_data: VaultCompanyDataProvider, + benchmark_projected_production: VaultProviderProductionBenchmark, + benchmarks_projected_ei: VaultProviderIntensityBenchmark, + estimate_missing_data: Optional[Callable[["DataWarehouse", ICompanyData], None]] = None, + itr_prefix: Optional[str] = "", ): """ - Base provider that relies on pydantic interfaces. Default for FastAPI usage - :param benchmark_name: the table name of the benchmark (in Trino) - :param production_benchmarks: List of IBenchmarkScopes - :param column_config: An optional ColumnsConfig object containing relevant variable names + Construct Data Vault tables for cumulative emissions budgets, trajectories, and targets, + which rely on trajectory and target projections from benchmark production and SDA pathways. + + Fundamentally: DataWarehouse(benchmark_ei, benchmark_prod, company_data) + -> { production_data, trajectory_data, target_data } + -> { cumulative_budgets, cumulative_emissions } + + :param engine: The Sqlalchemy connector to the Data Vault + :param company_data: as a VaultCompanyDataProvider, this provides both a reference to a fundamental company data table and data structures containing historic ESG data. Trajectory and Target projections also get filled in here. + :param benchmark_projected_production: A reference to the benchmark production table as well as data structures used by the Data Vault for projections + :param benchmark_projected_ei: A reference to the benchmark emissions intensity table as well as data structures used by the Data Vault for projections + :param estimate_missing_data: If provided, a function that can fill in missing S3 data (possibly by aligning to benchmark statistics) + :param ingest_schema: The database schema where the Data Vault lives + :param itr_prefix: A prefix for all tables so that different users can use the same schema without conflicts + :param hive_bucket: :param hive_catalog: :param hive_schema: Optional paramters. If given we attempt to use a fast Hive ingestion process. Otherwise use default (and slow) Trino ingestion. 
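A minimal wiring sketch for the Vault classes described above, assuming a reachable Trino catalog and hypothetical table names; with template_company_data=None and empty benchmark frames, each provider reads tables already ingested into the Data Vault rather than re-ingesting:

import osc_ingest_trino as osc

engine = osc.attach_trino_engine(verbose=False, catalog="osc_datacommons_dev", schema="demo_dv")
vault = VaultInstance(engine=engine, schema="demo_dv")

company_data = VaultCompanyDataProvider(vault, "itr_company_data", None)
prod_bm = VaultProviderProductionBenchmark(vault, "itr_benchmark_prod")
ei_bm = VaultProviderIntensityBenchmark(vault, "itr_benchmark_ei")

warehouse = DataVaultWarehouse(vault, company_data, prod_bm, ei_bm, itr_prefix="itr_")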
""" - super().__init__(production_benchmarks=production_benchmarks, column_config=column_config) - self._engine = engine - self._schema = ingest_schema or engine.dialect.default_schema_name or "demo_dv" - self.benchmark_name = benchmark_name - qres = osc._do_sql( - f"drop table if exists {self._schema}.{benchmark_name}", - self._engine, - verbose=False, - ) - df = pd.DataFrame() - for scope in ["AnyScope"]: - if production_benchmarks.model_dump()[scope] is None: - continue - for benchmark in production_benchmarks.model_dump()[scope]["benchmarks"]: - bdf = pd.DataFrame.from_dict( - { - r["year"]: [ - r["value"], - benchmark["region"], - benchmark["sector"], - scope, - ] - for r in benchmark["projections"] - }, - columns=["production", "region", "sector", "scope"], - orient="index", - ) - df = pd.concat([df, bdf]) - df.reset_index(inplace=True) - df.rename(columns={"index": "year"}, inplace=True) - df = df.convert_dtypes() - create_table_from_df(df, self._schema, benchmark_name, engine) - - def benchmark_changed(self, new_projected_production: ProductionBenchmarkDataProvider) -> bool: - # The Data Vault does not keep its own copies of benchmarks - return False - - def get_company_projected_production(self, *args, **kwargs): - return BaseCompanyDataProvider.get_company_projected_production(*args, **kwargs) - - def get_benchmark_projections(self, *args, **kwargs): - return BaseCompanyDataProvider.get_benchmark_projections(*args, **kwargs) - - -class VaultProviderIntensityBenchmark(IntensityBenchmarkDataProvider): - def __init__( - self, - engine: sqlalchemy.engine.base.Engine, - benchmark_name: str, - EI_benchmarks: IEIBenchmarkScopes, - ingest_schema: str = "", - column_config: Type[ColumnsConfig] = ColumnsConfig, - projection_controls: ProjectionControls = ProjectionControls(), - ): + # This initialization step adds trajectory and target projections to `company_data` super().__init__( - EI_benchmarks.benchmark_temperature, - EI_benchmarks.benchmark_global_budget, - EI_benchmarks.is_AFOLU_included, + company_data=company_data, # type: ignore + benchmark_projected_production=benchmark_projected_production, + benchmarks_projected_ei=benchmarks_projected_ei, + estimate_missing_data=estimate_missing_data, ) - self._engine = engine - self._schema = ingest_schema or engine.dialect.default_schema_name or "demo_dv" - self.benchmark_name = benchmark_name - self.column_config = column_config - self.projection_controls = projection_controls - df = pd.DataFrame() - for scope in EScope.get_scopes(): - if EI_benchmarks.model_dump()[scope] is None: - continue - for benchmark in EI_benchmarks.model_dump()[scope]["benchmarks"]: - benchmark_df = pd.DataFrame.from_dict( - { - r["year"]: [ - r["value"], - benchmark["region"], - benchmark["sector"], - scope, - EI_benchmarks.benchmark_global_budget, - EI_benchmarks.benchmark_temperature, - ] - for r in benchmark["projections"] - }, - columns=[ - "intensity", - "region", - "sector", - "scope", - "global_budget", - "benchmark_temp", - ], - orient="index", - ) - # TODO: AFOLU correction - df = pd.concat([df, benchmark_df]) - df.reset_index(inplace=True) - df.rename(columns={"index": "year"}, inplace=True) - df = df.convert_dtypes() - create_table_from_df(df, self._schema, benchmark_name, engine) - - def get_scopes(self) -> List[EScope]: - raise NotImplementedError - - def benchmarks_changed(self, new_projected_ei: IntensityBenchmarkDataProvider) -> bool: - # The Data Vault does not keep its own copies of benchmarks - return False - - def prod_centric_changed(self, 
new_projected_ei: IntensityBenchmarkDataProvider) -> bool: - # The Data Vault does not keep its own copies of benchmarks - return False - - def is_production_centric(self) -> bool: - """ - returns True if benchmark is "production_centric" (as defined by OECM) - """ - raise NotImplementedError - - def get_SDA_intensity_benchmarks(self, company_info_at_base_year: pd.DataFrame) -> pd.DataFrame: - """ - Overrides subclass method - returns a Dataframe with intensity benchmarks per company_id given a region and sector. - :param company_info_at_base_year: DataFrame with at least the following columns : - ColumnsConfig.COMPANY_ID, ColumnsConfig.BASE_EI ColumnsConfig.SECTOR and ColumnsConfig.REGION - :return: A DataFrame with company and SDA intensity benchmarks per calendar year per row - """ - intensity_benchmarks = self._get_intensity_benchmarks(company_info_at_base_year) - decarbonization_paths = self._get_decarbonizations_paths(intensity_benchmarks) - last_ei = intensity_benchmarks[self.projection_controls.TARGET_YEAR] - ei_base = company_info_at_base_year[self.column_config.BASE_EI] - - return decarbonization_paths.mul((ei_base - last_ei), axis=0).add(last_ei, axis=0) - - def _get_decarbonizations_paths(self, intensity_benchmarks: pd.DataFrame) -> pd.DataFrame: - """ - Overrides subclass method - Returns a DataFrame with the projected decarbonization paths for the supplied companies in intensity_benchmarks. - :param: A DataFrame with company and intensity benchmarks per calendar year per row - :return: A pd.DataFrame with company and decarbonisation path s per calendar year per row - """ - return intensity_benchmarks.apply(lambda row: self._get_decarbonization(row), axis=1) + self._v = vault + self._benchmark_prod_name = benchmark_projected_production._benchmark_name + self._benchmarks_ei_name = benchmarks_projected_ei._benchmark_name + self._company_table = company_data._company_table + self._target_table = self._company_table.replace("company_", "target_") # target_data + self._trajectory_table = self._company_table.replace("company_", "trajectory_") # trajectory_data + self._production_table = self._company_table.replace("company_", "production_") # production_data + self._emissions_table = f"{itr_prefix}cumulative_emissions" # cumulative_emissions + self._budgets_table = f"{itr_prefix}cumulative_budgets" # cumulative_budgets + self._overshoot_table = f"{itr_prefix}overshoot_ratios" # overshoot_ratios + self._tempscore_table = f"{itr_prefix}temperature_scores" # temperature_scores + + if not company_data.own_data: + for slot in ["_production_table", "_target_table", "_trajectory_table"]: + setattr(self.company_data, slot, getattr(self, slot)) + return - def _get_decarbonization(self, intensity_benchmark_row: pd.Series) -> pd.Series: - """ - Overrides subclass method - returns a Series with the decarbonization path for a benchmark. 
- :param: A Series with company and intensity benchmarks per calendar year per row - :return: A pd.Series with company and decarbonisation path s per calendar year per row - """ - first_ei = intensity_benchmark_row[self.projection_controls.BASE_YEAR] - last_ei = intensity_benchmark_row[self.projection_controls.TARGET_YEAR] - return intensity_benchmark_row.apply(lambda x: (x - last_ei) / (first_ei - last_ei)) + assert benchmark_projected_production.own_data - def _convert_benchmark_to_series(self, benchmark: IBenchmark) -> pd.Series: - """ - extracts the company projected intensities or targets for a given scope - :param feature: PROJECTED_EI or PROJECTED_TARGETS - :param scope: a scope - :return: pd.Series - """ - return pd.Series( - {r.year: r.value for r in benchmark.projections}, # type: ignore - name=(benchmark.region, benchmark.sector), + # Calculate base production data (and base emissions) + company_idx, sector_data, region_data, prod_data = zip( + *[ + (c.company_id, c.sector, c.region, c.base_year_production) + for c in company_data._companies + if c.company_id in company_data.get_company_ids() + ] ) - - def _get_projected_intensities(self, scope: EScope = EScope.S1S2) -> pd.Series: - """ - Converts IBenchmarkScopes into dataframe for a scope - :param scope: a scope - :return: pd.Series - """ - raise NotImplementedError - - def _get_intensity_benchmarks( - self, company_sector_region_info: Optional[pd.DataFrame] = None, scope_to_calc: Optional[EScope] = None - ) -> pd.DataFrame: - """ - Overrides subclass method - returns dataframe of all EI benchmarks if COMPANY_SECTOR_REGION_SCOPE is None. Otherwise - returns a Dataframe with intensity benchmarks per company_id given a region and sector. - :param company_sector_region_scope: DataFrame indexed by ColumnsConfig.COMPANY_ID - with at least the following columns: ColumnsConfig.SECTOR, ColumnsConfig.REGION, and ColumnsConfig.SCOPE - :return: A DataFrame with company and intensity benchmarks; rows are calendar years, columns are company data - """ - raise NotImplementedError - - -# FIXME: Need to reshape the tables TARGET_DATA and TRAJECTORY_DATA so scope is a column and the EI data relates only to that scope (wide to long) -class DataVaultWarehouse(DataWarehouse): - def __init__( - self, - engine: sqlalchemy.engine.base.Engine, - company_data: VaultCompanyDataProvider, - # This arrives as a table instantiated in the database - benchmark_projected_production: VaultProviderProductionBenchmark, - # This arrives as a table instantiated in the database - benchmarks_projected_ei: VaultProviderIntensityBenchmark, - ingest_schema: str = "", - itr_prefix: str = "", - column_config: Type[ColumnsConfig] = ColumnsConfig, - ): - super().__init__( - company_data=None, # type: ignore - benchmark_projected_production=None, - benchmarks_projected_ei=None, - column_config=column_config, + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df = pd.DataFrame( + data={ + "sector": sector_data, + "region": region_data, + "scope": [EScope.AnyScope] * len(company_idx), + "base_year_production": prod_data, + }, + index=pd.Index(company_idx, name="company_id"), + ).drop_duplicates() + company_info_at_base_year = df[~df["base_year_production"].map(lambda x: pd.isna(x))] + projected_production = benchmark_projected_production.get_company_projected_production( + company_info_at_base_year + ).droplevel("scope") + projected_production.columns.name = "year" + # Ingest 
production projections into Data Vault + create_vault_table_from_df( + projected_production.stack(level="year").to_frame(name="production_by_year").reset_index(), + self._production_table, + self._v, + verbose=True, ) - self._engine = engine - self._schema = ingest_schema or engine.dialect.default_schema_name or "demo_dv" - self._tempscore_table = f"{itr_prefix}temperature_scores" - # intensity_projections = read_quantified_sql(f"select * from {self._schema}.{self._target_table}", self._target_table, self._schema, self._engine) - # intensity_projections['scope'] = 'S1S2' - # intensity_projections['source'] = self._schema + # If we have company data, we need to compute trajectories and targets + projection_slots = ["_target_table", "_trajectory_table"] + + target_dfs: List[pd.DataFrame] = [] + trajectory_dfs: List[pd.DataFrame] = [] + + # Ingest target and trajectory projections into the Data Vault + for i, projection in enumerate(["projected_targets", "projected_intensities"]): + projection_dfs = [] + for company in company_data._companies: + ei_dict = {} + for scope in EScope.get_scopes(): + if getattr(company, projection)[scope]: + ei_dict[scope] = getattr(company, projection)[scope].projections + else: + ei_dict[scope] = pd.Series(dtype="object") + ei_data = pd.concat([ei_dict[scope] for scope in EScope.get_scopes()], axis=1).reset_index() + ei_data.columns = ["year"] + [f"ei_{scope.lower()}_by_year" for scope in EScope.get_scopes()] + df = pd.DataFrame( + data=[[company.company_name, "", company.company_id, company.sector, company.region]] + * len(ei_data.index), + columns=["company_name", "company_lei", "company_id", "sector", "region"], + ) + projection_dfs.append(pd.concat([df, ei_data], axis=1)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # Quieting warnings due to https://github.com/hgrecco/pint/issues/1897 + df2 = pd.concat(projection_dfs).reset_index(drop=True) + create_vault_table_from_df( + df2, + getattr(self, projection_slots[i]), + self._v, + verbose=True, + ) + # Inject projection tablename into company data (needed for `get_company_projected_trajectories` + setattr(self.company_data, projection_slots[i], getattr(self, projection_slots[i])) + assert isinstance(self.company_data, VaultCompanyDataProvider) + self.company_data._production_table = self._production_table - # If there's no company data, we are just using the vault, not initializing it - if company_data is None: - return - if benchmark_projected_production is None and benchmarks_projected_ei is None: - return + # Drop cumulative emissions tables so they get recalculated + for cumulative_attr in ["_emissions_table", "_budgets_table"]: + drop_sql = f"drop table if exists {self._v.schema}.{getattr(self, cumulative_attr)}" + osc._do_sql(drop_sql, self._v.engine, verbose=False) - # The DataVaultWarehouse provides three calculations per company: + # The DataVaultWarehouse provides three calculations per company (using SQL code rather than Python): # * Cumulative trajectory of emissions # * Cumulative target of emissions # * Cumulative budget of emissions (separately for each benchmark) - qres = osc._do_sql( - f"drop table if exists {self._schema}.{itr_prefix}cumulative_emissions", - self._engine, - verbose=False, - ) - qres = osc._do_sql( - f""" -create table {self._schema}.{itr_prefix}cumulative_emissions with ( - format = 'ORC', - partitioning = array['scope'] -) as -select C.company_name, C.company_id, '{company_data._schema}' as source, 'S1S2' as scope, - 
sum((ET.ei_s1_by_year+if(is_nan(ET.ei_s2_by_year),0.0,ET.ei_s2_by_year)) * P.production_by_year) as cumulative_trajectory, - concat(ET.ei_s1_by_year_units, ' * ', P.production_by_year_units) as cumulative_trajectory_units, - sum((EI.ei_s1_by_year+if(is_nan(EI.ei_s2_by_year),0.0,EI.ei_s2_by_year)) * P.production_by_year) as cumulative_target, - concat(EI.ei_s1_by_year_units, ' * ', P.production_by_year_units) as cumulative_target_units -from {company_data._schema}.{company_data._company_table} C - join {company_data._schema}.{company_data._production_table} P on P.company_id=C.company_id - join {company_data._schema}.{company_data._target_table} EI on EI.company_id=C.company_id and EI.year=P.year and EI.ei_s1_by_year is not NULL - join {company_data._schema}.{company_data._trajectory_table} ET on ET.company_id=C.company_id and ET.year=P.year and ET.ei_s1_by_year is not NULL -where P.year>=2020 -group by C.company_name, C.company_id, '{company_data._schema}', 'S1S2', - concat(ET.ei_s1_by_year_units, ' * ', P.production_by_year_units), - concat(EI.ei_s1_by_year_units, ' * ', P.production_by_year_units) -UNION ALL -select C.company_name, C.company_id, '{company_data._schema}' as source, 'S1S2S3' as scope, - sum((ET.ei_s1_by_year+if(is_nan(ET.ei_s2_by_year),0.0,ET.ei_s2_by_year)+if(is_nan(ET.ei_s3_by_year),0.0,ET.ei_s3_by_year)) * P.production_by_year) as cumulative_trajectory, - concat(ET.ei_s1_by_year_units, ' * ', P.production_by_year_units) as cumulative_trajectory_units, - sum((EI.ei_s1_by_year+if(is_nan(EI.ei_s2_by_year),0.0,EI.ei_s2_by_year)+if(is_nan(EI.ei_s3_by_year),0.0,EI.ei_s3_by_year)) * P.production_by_year) as cumulative_target, - concat(EI.ei_s1_by_year_units, ' * ', P.production_by_year_units) as cumulative_target_units -from {company_data._schema}.{company_data._company_table} C - join {company_data._schema}.{company_data._production_table} P on P.company_id=C.company_id - join {company_data._schema}.{company_data._target_table} EI on EI.company_id=C.company_id and EI.year=P.year and EI.ei_s1_by_year is not NULL - join {company_data._schema}.{company_data._trajectory_table} ET on ET.company_id=C.company_id and ET.year=P.year and ET.ei_s1_by_year is not NULL -where P.year>=2020 -group by C.company_name, C.company_id, '{company_data._schema}', 'S1S2S3', - concat(ET.ei_s1_by_year_units, ' * ', P.production_by_year_units), - concat(EI.ei_s1_by_year_units, ' * ', P.production_by_year_units) -""", - self._engine, - verbose=True, - ) + qres = osc._do_sql(f"show tables like '{self._emissions_table}'", self._v.engine, verbose=False) + if len(qres) == 0: + emissions_from_tables = f""" + {self._v.schema}.{self._company_table} C + join {self._v.schema}.{self._production_table} P on P.company_id=C.company_id + left join {self._v.schema}.{self._trajectory_table} EI on EI.company_id=C.company_id and EI.year=P.year and EI.ei_SCOPE_by_year is not NULL + left join {self._v.schema}.{self._target_table} ET on ET.company_id=C.company_id and ET.year=P.year and ET.ei_SCOPE_by_year is not NULL +""" + + create_emissions_sql = f"create table {self._v.schema}.{self._emissions_table} with (format = 'ORC', partitioning = array['scope']) as" + emissions_scope_sql = "UNION ALL".join( + [ + f""" +select C.company_name, C.company_id, '{self._v.schema}' as source, P.year, + sum(EI.ei_{scope}_by_year * P.production_by_year) over (partition by C.company_id order by P.year) as cumulative_trajectory, + if (EI.ei_{scope}_by_year_units is NULL, 't CO2e', + 
regexp_replace(regexp_replace(concat(EI.ei_{scope}_by_year_units, ' * ', P.production_by_year_units), + '{re_simplify_units_both}', ''), '{re_simplify_units_one}', '')) as cumulative_trajectory_units, + sum(ET.ei_{scope}_by_year * P.production_by_year) over (partition by C.company_id order by P.year) as cumulative_target, + if (ET.ei_{scope}_by_year_units is NULL, 't CO2e', + regexp_replace(regexp_replace(concat(ET.ei_{scope}_by_year_units, ' * ', P.production_by_year_units), + '{re_simplify_units_both}', ''), '{re_simplify_units_one}', '')) as cumulative_target_units, + '{scope.upper()}' as scope +from {emissions_from_tables.replace('SCOPE', scope)} +""" + for scope in map(str.lower, EScope.get_scopes()) + ] + ) + qres = osc._do_sql(f"{create_emissions_sql} {emissions_scope_sql}", self._v.engine, verbose=True) + assert len(qres) and len(qres[0]) and qres[0][0] > 0 - qres = osc._do_sql( - f"drop table if exists {self._schema}.{itr_prefix}cumulative_budget_1", - self._engine, - verbose=False, - ) - qres = osc._do_sql( - f""" -create table {self._schema}.{itr_prefix}cumulative_budget_1 with ( + qres = osc._do_sql(f"show tables like '{self._budgets_table}'", self._v.engine, verbose=False) + if len(qres) == 0: + # base_year_scale = trajectory / budget at base year (a scalar) + # scaled cumulative budget = base_year_scale * cumulative budget (a time series) + + budgets_from_productions = f""" +create table {self._v.schema}.{self._budgets_table} with ( format = 'ORC', partitioning = array['scope'] ) as -select C.company_name, C.company_id, '{company_data._schema}' as source, B.scope, 'benchmark_1' as benchmark, - B.global_budget, B.benchmark_temp, - sum(B.intensity * P.production_by_year) as cumulative_budget, - concat(B.intensity_units, ' * ', P.production_by_year_units) as cumulative_budget_units -from {company_data._schema}.{company_data._company_table} C - join {company_data._schema}.{company_data._production_table} P on P.company_id=C.company_id - join {self._schema}.{benchmarks_projected_ei.benchmark_name} B on P.year=B.year and C.region=B.region and C.sector=B.sector -where P.year>=2020 -group by C.company_name, C.company_id, '{company_data._schema}', B.scope, 'benchmark_1', B.global_budget, B.benchmark_temp, - concat(B.intensity_units, ' * ', P.production_by_year_units) -""", - self._engine, - verbose=True, - ) +with P_BY as (select distinct company_id, + first_value(year) over (partition by company_id order by year) as base_year, + first_value(production_by_year) over (partition by company_id order by year) as production_by_year + from {self._v.schema}.{self._production_table}) +select C.company_name, C.company_id, '{self._v.schema}' as source, P.year, -- FIXME: should have scenario_name and year released + B.global_budget, B.global_budget_units, B.benchmark_temp, B.benchmark_temp_units, + sum(B.intensity * P.production_by_year) over (partition by C.company_id, B.scope order by P.year) as cumulative_budget, + regexp_replace(regexp_replace(concat(B.intensity_units, ' * ', P.production_by_year_units), + '{re_simplify_units_both}', ''), '{re_simplify_units_one}', '') as cumulative_budget_units, + CE_BY.cumulative_trajectory/(B_BY.intensity * P_BY.production_by_year) + * sum(B.intensity * P.production_by_year) over (partition by C.company_id, B.scope order by P.year) as cumulative_scaled_budget, + CE_BY.cumulative_trajectory_units as cumulative_scaled_budget_units, + B.scope +from {self._v.schema}.{self._company_table} C + join P_BY on P_BY.company_id=C.company_id + join 
{self._v.schema}.{self._production_table} P on P.company_id=C.company_id + join {self._v.schema}.{self._benchmarks_ei_name} B on P.year=B.year and C.sector=B.sector and B.region=if(C.region in ('North America', 'Europe'), C.region, 'Global') + join {self._v.schema}.{self._emissions_table} CE on CE.company_id=C.company_id and B.scope=CE.scope and CE.year=P.year + join {self._v.schema}.{self._emissions_table} CE_BY on CE_BY.company_id=C.company_id and CE_BY.scope=B.scope and CE_BY.year=P_BY.base_year + join {self._v.schema}.{self._benchmarks_ei_name} B_BY on B.scope=B_BY.scope and B.region=B_BY.region and B.sector=B_BY.sector and B_BY.year=P_BY.base_year +""" + + qres = osc._do_sql( + budgets_from_productions, + self._v.engine, + verbose=True, + ) + assert len(qres) and len(qres[0]) and qres[0][0] > 0 def quant_init( self, - engine: sqlalchemy.engine.base.Engine, - company_data: VaultCompanyDataProvider, - ingest_schema: str = "", + vault: VaultInstance, + company_data: Union[VaultCompanyDataProvider, None], itr_prefix: str = "", ): # The Quant users of the DataVaultWarehouse produces two calculations per company: # * Target and Trajectory overshoot ratios # * Temperature Scores + self._v = vault qres = osc._do_sql( - f"drop table if exists {self._schema}.{itr_prefix}overshoot_ratios", - self._engine, + f"drop table if exists {self._v.schema}.{self._overshoot_table}", + self._v.engine, verbose=False, ) - qres = osc._do_sql( + df_ratios = read_quantified_sql( f""" -create table {self._schema}.{itr_prefix}overshoot_ratios with ( - format = 'ORC', - partitioning = array['scope'] -) as -select E.company_name, E.company_id, '{company_data._schema}' as source, B.scope, 'benchmark_1' as benchmark, - B.global_budget, B.benchmark_temp, +select E.company_name, E.company_id, '{self._v.schema}' as source, B.year, -- FIXME: should have scenario_name and year released + B.global_budget, B.global_budget_units, B.benchmark_temp, B.benchmark_temp_units, E.cumulative_trajectory/B.cumulative_budget as trajectory_overshoot_ratio, concat(E.cumulative_trajectory_units, ' / (', B.cumulative_budget_units, ')') as trajectory_overshoot_ratio_units, E.cumulative_target/B.cumulative_budget as target_overshoot_ratio, - concat(E.cumulative_target_units, ' / (', B.cumulative_budget_units, ')') as target_overshoot_ratio_units -from {self._schema}.{itr_prefix}cumulative_emissions E - join {self._schema}.{itr_prefix}cumulative_budget_1 B on E.company_id=B.company_id and E.scope=B.scope + concat(E.cumulative_target_units, ' / (', B.cumulative_budget_units, ')') as target_overshoot_ratio_units, + B.scope +from {self._v.schema}.{self._emissions_table} E + join {self._v.schema}.{self._budgets_table} B on E.company_id=B.company_id and E.scope=B.scope and E.year=B.year """, - self._engine, + None, + self._v.engine, + index_col=(["company_id", "scope", "year"]), + ) + assert isinstance(df_ratios["global_budget"].dtype, PintType) + assert isinstance(df_ratios["benchmark_temp"].dtype, PintType) + df_ratios["trajectory_overshoot_ratio"] = df_ratios["trajectory_overshoot_ratio"].astype("pint[dimensionless]") + df_ratios["target_overshoot_ratio"] = df_ratios["target_overshoot_ratio"].astype("pint[dimensionless]") + create_vault_table_from_df( + df_ratios.reset_index()[df_ratios.index.names + df_ratios.columns.tolist()], + self._overshoot_table, + self._v, verbose=True, ) qres = osc._do_sql( - f"drop table if exists {self._schema}.{self._tempscore_table}", - self._engine, + f"drop table if exists 
{self._v.schema}.{self._tempscore_table}", + self._v.engine, verbose=False, ) qres = osc._do_sql( f""" -create table {self._schema}.{self._tempscore_table} with ( +create table {self._v.schema}.{self._tempscore_table} with ( format = 'ORC', partitioning = array['scope'] ) as -select R.company_name, R.company_id, '{company_data._schema}' as source, R.scope, 'benchmark_1' as benchmark, +select R.company_name, R.company_id, '{self._v.schema}' as source, R.year, -- FIXME: should have scenario_name and year released R.benchmark_temp + R.global_budget * (R.trajectory_overshoot_ratio-1) * 2.2/3664.0 as trajectory_temperature_score, - 'delta_degC' as trajectory_temperature_score_units, + R.benchmark_temp_units as trajectory_temperature_score_units, R.benchmark_temp + R.global_budget * (R.target_overshoot_ratio-1) * 2.2/3664.0 as target_temperature_score, - 'delta_degC' as target_temperature_score_units -from {self._schema}.{itr_prefix}overshoot_ratios R + R.benchmark_temp_units as target_temperature_score_units, + R.scope +from {self._v.schema}.{itr_prefix}overshoot_ratios R """, - self._engine, + self._v.engine, verbose=True, ) def get_preprocessed_company_data(self, company_ids: List[str]) -> List[ICompanyAggregates]: raise NotImplementedError - def get_pa_temp_scores(self, probability: float, company_ids: List[str]) -> pd.Series: + def get_pa_temp_scores( + self, + probability: float, + company_ids: List[str], + scope: EScope = EScope.S1S2, + year: int = 2050, + ) -> pd.Series: if probability < 0 or probability > 1: raise ValueError(f"probability value {probability} outside range [0.0, 1.0]") temp_scores = read_quantified_sql( - f"select company_id, scope, target_temperature_score, target_temperature_score_units, trajectory_temperature_score, trajectory_temperature_score_units from {self._schema}.{self._tempscore_table}", - self._tempscore_table, - self._schema, - self._engine, + "select company_id, scope, target_temperature_score, target_temperature_score_units, trajectory_temperature_score, trajectory_temperature_score_units, year" + f" from {self._tempscore_table} where scope='{scope.name}' and year={year}", + None, + self._v.engine, index_col=["company_id", "scope"], ) # We may have company_ids in our portfolio not in our database, and vice-versa. 
# Return proper pa_temp_scores for what we can find, and np.nan for those we cannot - retval = pd.Series(data=None, index=company_ids, dtype="float64") + retval = pd.Series( + data=None, + index=pd.MultiIndex.from_tuples( + [(company_id, scope.name) for company_id in company_ids], names=["company_id", "scope"] + ), + name="temp_score", + dtype="pint[delta_degC]", + ) retval.loc[ retval.index.intersection(temp_scores.index) ] = temp_scores.target_temperature_score * probability + temp_scores.trajectory_temperature_score * ( diff --git a/src/ITR/interfaces.py b/src/ITR/interfaces.py index 301dab49..1ee08afa 100644 --- a/src/ITR/interfaces.py +++ b/src/ITR/interfaces.py @@ -172,6 +172,7 @@ class Aggregation(BaseModel): def __getitem__(self, item): return getattr(self, item) + @property def empty(self): return len(self.contributions) == 0 @@ -189,8 +190,9 @@ class ScoreAggregation(BaseModel): def __getitem__(self, item): return getattr(self, item) + @property def empty(self): - return self.all.empty() + return self.all.empty emptyScoreAggregation = ScoreAggregation() @@ -289,7 +291,7 @@ def __init__( if p.year in range(ProjectionControls.BASE_YEAR, ProjectionControls.TARGET_YEAR + 1) ] elif not self.projections: - logger.warning(f"Empty Benchmark for sector {sector}, region {region}") + logger.warning(f"Empty Benchmark for sector {self.sector}, region {self.region}") def __getitem__(self, item): return getattr(self, item) @@ -535,6 +537,7 @@ def _align_and_sum_projected_targets(self, primary_scope_attr): ) ) + @property def empty(self): return self == empty_ICompanyEIProjectionsScopes @@ -598,6 +601,7 @@ def __str__(self): } return str(pd.DataFrame.from_dict(dict_items)) + @property def empty(self): return self == empty_IHistoricEmissionsScopes @@ -655,6 +659,7 @@ def __str__(self): } return str(pd.DataFrame.from_dict(dict_items)) + @property def empty(self): return self == empty_IHistoricEIScopes @@ -720,6 +725,7 @@ def _normalize_qty(value, metric) -> Quantity: ], ) + @property def empty(self) -> bool: if self.productions: return False @@ -910,7 +916,7 @@ def __init__( self.emissions_metric = EmissionsMetric("t CO2") # TODO: Should raise a warning here - if self.historic_data.empty(): + if self.historic_data.empty: # We are only partly initialized. 
Remaining will be done later return self.historic_data._normalize(self.production_metric, self.emissions_metric) diff --git a/src/ITR/temperature_score.py b/src/ITR/temperature_score.py index 4de93ee4..9652a420 100644 --- a/src/ITR/temperature_score.py +++ b/src/ITR/temperature_score.py @@ -27,6 +27,9 @@ logger = logging.getLogger(__name__) LoggingConfig.add_config_to_logger(logger) +nan_delta_degC = Q_(pd.NA, "delta_degC") +nan_dimensionless = Q_(pd.NA, "dimensionless") + class TemperatureScore(PortfolioAggregation): """ @@ -78,17 +81,17 @@ def get_score( ) or scorable_row[self.budget_column].m <= 0: return ( self.get_default_score(scorable_row), - np.nan, - np.nan, - np.nan, - np.nan, + nan_delta_degC, + nan_dimensionless, + nan_delta_degC, + nan_dimensionless, EScoreResultType.DEFAULT, ) # If only target data missing assign only trajectory_score to final score elif ITR.isna(scorable_row[self.c.COLS.CUMULATIVE_TARGET]) or scorable_row[self.c.COLS.CUMULATIVE_TARGET] == 0: - target_overshoot_ratio = np.nan - target_temperature_score = np.nan + target_overshoot_ratio = nan_dimensionless + target_temperature_score = nan_delta_degC trajectory_overshoot_ratio = ( scorable_row[self.c.COLS.CUMULATIVE_TRAJECTORY] / scorable_row[self.budget_column] ) @@ -200,7 +203,7 @@ def _prepare_data(self, data: pd.DataFrame, target_probability: float): company_id_and_scope = [self.c.COLS.COMPANY_ID, self.c.COLS.SCOPE] companies = data.index.get_level_values(self.c.COLS.COMPANY_ID).unique() - # If taregt score not provided, use non-specific probability + # If target score not provided, use non-specific probability data = data.fillna({self.c.COLS.TARGET_PROBABILITY: target_probability}) # If scope S1S2S3 is in the list of scopes to calculate, we need to calculate the other two as well @@ -287,7 +290,7 @@ def _calculate_company_score(self, data): ] ] .groupby([self.c.COLS.TIME_FRAME])[self.c.SCORE_RESULT_TYPE] - .transform(max) + .transform("max") == data[self.c.SCORE_RESULT_TYPE] ) diff --git a/src/ITR/utils.py b/src/ITR/utils.py index 70a39627..644d93fa 100644 --- a/src/ITR/utils.py +++ b/src/ITR/utils.py @@ -72,15 +72,15 @@ def dataframe_to_portfolio(df_portfolio: pd.DataFrame) -> List[PortfolioCompany] """ # Adding some non-empty checks for portfolio upload if df_portfolio[ColumnsConfig.INVESTMENT_VALUE].isnull().any(): - error_message = f"Investment values are missing for one or more companies in the input file." + error_message = "Investment values are missing for one or more companies in the input file." logger.error(error_message) raise ValueError(error_message) if df_portfolio[ColumnsConfig.COMPANY_ISIN].isnull().any(): - error_message = f"Company ISINs are missing for one or more companies in the input file." + error_message = "Company ISINs are missing for one or more companies in the input file." logger.error(error_message) raise ValueError(error_message) if df_portfolio[ColumnsConfig.COMPANY_ID].isnull().any(): - error_message = f"Company IDs are missing for one or more companies in the input file." + error_message = "Company IDs are missing for one or more companies in the input file." 
logger.error(error_message) raise ValueError(error_message) @@ -102,7 +102,7 @@ def get_data(data_warehouse: DataWarehouse, portfolio: List[PortfolioCompany]) - df_portfolio[ColumnsConfig.INVESTMENT_VALUE] = asPintSeries(df_portfolio[ColumnsConfig.INVESTMENT_VALUE]) if ColumnsConfig.COMPANY_ID not in df_portfolio.columns: - raise ValueError(f"Portfolio contains no company_id data") + raise ValueError("Portfolio contains no company_id data") # This transforms a dataframe of portfolio data into model data just so we can transform that back into a dataframe?! # It does this for all scopes, not only the scopes of interest @@ -151,6 +151,61 @@ def get_data(data_warehouse: DataWarehouse, portfolio: List[PortfolioCompany]) - return portfolio_data +def get_benchmark_projections( + prod_df: pd.DataFrame, company_sector_region_scope: Optional[pd.DataFrame] = None, scope: EScope = EScope.AnyScope +) -> pd.DataFrame: + """ + :param prod_df: DataFrame of production statistics by sector, region, scope (and year) + :param company_sector_region_scope: DataFrame indexed by ColumnsConfig.COMPANY_ID + with at least the following columns: ColumnsConfig.SECTOR, ColumnsConfig.REGION, and ColumnsConfig.SCOPE + :param scope: a scope + :return: A pint[dimensionless] DataFrame with partial production benchmark data per calendar year per row, indexed by company. + """ + + if company_sector_region_scope is None: + return prod_df + + # We drop the meaningless S1S2/AnyScope from the production benchmark and replace it with the company's scope. + # This is needed to make indexes align when we go to multiply production times intensity for a scope. + prod_df_anyscope = prod_df.droplevel("scope") + df = ( + company_sector_region_scope[["sector", "region", "scope"]] + .reset_index() + .drop_duplicates() + .set_index(["company_id", "scope"]) + ) + # We drop the meaningless S1S2/AnyScope from the production benchmark and replace it with the company's scope. + # This is needed to make indexes align when we go to multiply production times intensity for a scope. + company_benchmark_projections = df.merge( + prod_df_anyscope, + left_on=["sector", "region"], + right_index=True, + how="left", + ) + # If we don't get a match, then the projections will be `nan`. Look at the last year's column to find them. 
+ mask = company_benchmark_projections.iloc[:, -1].isna() + if mask.any(): + # Patch up unknown regions as "Global" + global_benchmark_projections = ( + df[mask] + .drop(columns="region") + .merge( + prod_df_anyscope.loc[(slice(None), "Global"), :].droplevel(["region"]), + left_on=["sector"], + right_index=True, + how="left", + ) + ) + combined_benchmark_projections = pd.concat( + [ + company_benchmark_projections[~mask].drop(columns="region"), + global_benchmark_projections, + ] + ) + return combined_benchmark_projections.drop(columns="sector") + return company_benchmark_projections.drop(columns=["sector", "region"]) + + def calculate( portfolio_data: pd.DataFrame, fallback_score: delta_degC_Quantity, diff --git a/test/test_projection.py b/test/test_projection.py index 21d31b0e..1e64973a 100644 --- a/test/test_projection.py +++ b/test/test_projection.py @@ -239,7 +239,7 @@ def test_targets(self): ) assert_pint_series_equal(self, test_projection, expected[i], places=3) else: - assert c.projected_targets.empty() + assert c.projected_targets.empty def test_extrapolate(self): with open(os.path.join(self.root, "inputs", "json", "test_fillna_companies.json"), "r") as file: diff --git a/test/test_vault_providers.py b/test/test_vault_providers.py index 4aa4ef91..52512ef4 100644 --- a/test/test_vault_providers.py +++ b/test/test_vault_providers.py @@ -1,28 +1,36 @@ -# Skip because right now this breaks CI/CD -import pytest - -if pytest.__version__ < "3.0.0": - pytest.skip() -else: - pytestmark = pytest.mark.skip - pytest.skip("skipping vault because Trino auth breaks CI/CD", allow_module_level=True) - import json import os import pathlib -import unittest +import re +from typing import Tuple +import numpy as np import osc_ingest_trino as osc import pandas as pd +import pytest import trino -from dotenv import load_dotenv -from numpy.testing import assert_array_equal from sqlalchemy.engine import create_engine +from sqlalchemy.exc import ProgrammingError import ITR # noqa F401 -from ITR import data_dir -from ITR.configs import ColumnsConfig, TemperatureScoreConfig +from ITR import data_dir as json_data_dir +from ITR.configs import ColumnsConfig, ProjectionControls, TemperatureScoreConfig +from ITR.data.base_providers import ( + BaseProviderIntensityBenchmark, + BaseProviderProductionBenchmark, +) from ITR.data.data_warehouse import DataWarehouse +from ITR.data.osc_units import Q_, requantify_df_from_columns +from ITR.data.template import TemplateProviderCompany +from ITR.data.vault_providers import ( + DataVaultWarehouse, + VaultCompanyDataProvider, + VaultInstance, + VaultProviderIntensityBenchmark, + VaultProviderProductionBenchmark, + read_quantified_sql, + requantify_df, +) from ITR.interfaces import ( EScope, ETimeFrames, @@ -34,127 +42,340 @@ from ITR.portfolio_aggregation import PortfolioAggregationMethod from ITR.temperature_score import TemperatureScore +xlsx_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "inputs") + +# If there's no credentials file, this fails silently without raising +osc.load_credentials_dotenv() + +ingest_catalog = "osc_datacommons_dev" +ingest_schema = "demo_dv" +itr_prefix = "itr_" + try: - from ITR.data.vault_providers import ( - DataVaultWarehouse, - VaultCompanyDataProvider, - VaultProviderIntensityBenchmark, - VaultProviderProductionBenchmark, - ) + # sqlstring = "trino://{user}@{host}:{port}/".format( + # user=os.environ["TRINO_USER_USER1"], + # host=os.environ["TRINO_HOST"], + # port=os.environ["TRINO_PORT"], + # ) + # sqlargs = { + # 
"auth": trino.auth.JWTAuthentication(os.environ["TRINO_PASSWD_USER1"]), + # "http_scheme": "https", + # "catalog": ingest_catalog, + # "schema": demo_schema, + # } + # engine_init = create_engine(sqlstring, connect_args=sqlargs) + # print("connecting with engine " + str(engine_init)) + # connection_init = engine_init.connect() - vault_initialized = True + engine_init = osc.attach_trino_engine(verbose=True, catalog=ingest_catalog, schema=ingest_schema) except KeyError: - vault_initialized = False - -if vault_initialized: - ingest_catalog = "osc_datacommons_dev" - demo_schema = "demo_dv" - - dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", os.environ.get("PWD", "/opt/app-root/src")) - dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env" - if os.path.exists(dotenv_path): - load_dotenv(dotenv_path=dotenv_path, override=True) - - sqlstring = "trino://{user}@{host}:{port}/".format( - user=os.environ["TRINO_USER_USER1"], - host=os.environ["TRINO_HOST"], - port=os.environ["TRINO_PORT"], - ) - sqlargs = { - "auth": trino.auth.JWTAuthentication(os.environ["TRINO_PASSWD_USER1"]), - "http_scheme": "https", - "catalog": ingest_catalog, - "schema": demo_schema, - } - engine_init = create_engine(sqlstring, connect_args=sqlargs) - print("connecting with engine " + str(engine_init)) - connection_init = engine_init.connect() - - -class TestVaultProvider(unittest.TestCase): - """ - Test the Value provider - """ - - def setUp(self) -> None: - self.benchmark_prod_json = os.path.join(data_dir, "benchmark_production_OECM.json") - self.benchmark_EI_json = os.path.join(data_dir, "benchmark_EI_OECM_S3.json") - - # load production benchmarks - with open(self.benchmark_prod_json) as json_file: - parsed_json = json.load(json_file) - prod_bms = IProductionBenchmarkScopes.model_validate(parsed_json) - self.vault_production_bm = VaultProviderProductionBenchmark( - engine_init, benchmark_name="benchmark_prod", production_benchmarks=prod_bms + if pytest.__version__ < "3.0.0": + pytest.skip() + else: + pytestmark = pytest.mark.skip + pytest.skip("skipping vault because Trino auth breaks CI/CD", allow_module_level=True) + +# bucket must be configured with credentials for trino, and accessible to the hive catalog +# You may need to use a different prefix here depending on how you name your credentials.env variables +try: + hive_bucket = osc.attach_s3_bucket("S3_OSCCL2") + hive_catalog = "osc_datacommons_hive_ingest" + hive_schema = "ingest" +except KeyError: + hive_bucket = None + hive_catalog = None + hive_schema = None + +company_data_path = os.path.join(xlsx_data_dir, "20230106 ITR V2 Sample Data.xlsx") + + +@pytest.fixture +def base_benchmarks() -> Tuple[BaseProviderProductionBenchmark, BaseProviderIntensityBenchmark]: + # load production benchmarks + with open(os.path.join(json_data_dir, "benchmark_production_OECM.json")) as json_file: + parsed_json = json.load(json_file) + prod_bms = IProductionBenchmarkScopes.model_validate(parsed_json) + base_production_bm = BaseProviderProductionBenchmark( + production_benchmarks=prod_bms, + ) + + # load intensity benchmarks + with open(os.path.join(json_data_dir, "benchmark_EI_OECM_S3.json")) as json_file: + parsed_json = json.load(json_file) + ei_bms = IEIBenchmarkScopes.model_validate(parsed_json) + base_EI_bm = BaseProviderIntensityBenchmark( + EI_benchmarks=ei_bms, + ) + return (base_production_bm, base_EI_bm) + + +@pytest.fixture +def base_company_data() -> TemplateProviderCompany: + company_data = TemplateProviderCompany(company_data_path, 
projection_controls=ProjectionControls()) + return company_data + + +@pytest.fixture +def base_warehouse(base_company_data, base_benchmarks) -> DataWarehouse: + prod_bm, EI_bm = base_benchmarks + warehouse = DataWarehouse( + base_company_data, + prod_bm, + EI_bm, + estimate_missing_data=DataWarehouse.estimate_missing_s3_data, + ) + return warehouse + + +@pytest.fixture +def vault() -> VaultInstance: + instance = VaultInstance( + engine=engine_init, + schema=ingest_schema, + hive_bucket=hive_bucket, + hive_catalog=hive_catalog, + hive_schema=hive_schema, + ) + return instance + + +@pytest.fixture +def vault_benchmarks_from_base( + vault, base_benchmarks +) -> Tuple[VaultProviderProductionBenchmark, VaultProviderIntensityBenchmark]: + base_prod_bm, base_EI_bm = base_benchmarks + vault_prod_bm = VaultProviderProductionBenchmark( + vault=vault, + benchmark_name=f"{itr_prefix}benchmark_prod", + prod_df=base_prod_bm._prod_df, + ) + assert vault_prod_bm.own_data + + vault_EI_bm = VaultProviderIntensityBenchmark( + vault, + benchmark_name=f"{itr_prefix}benchmark_ei", + ei_df_t=base_EI_bm._EI_df_t, + benchmark_temperature=base_EI_bm.benchmark_temperature, + benchmark_global_budget=base_EI_bm.benchmark_global_budget, + is_AFOLU_included=base_EI_bm.is_AFOLU_included, + production_centric=base_EI_bm.is_production_centric(), + ) + assert vault_EI_bm.own_data + return (vault_prod_bm, vault_EI_bm) + + +@pytest.fixture +def vault_warehouse_from_base(vault, vault_benchmarks_from_base, base_warehouse) -> DataVaultWarehouse: + vault_company_data = VaultCompanyDataProvider( + vault, + company_table=f"{itr_prefix}company_data", + # We don't use `base_company_data` because we need projections created by `base_warehouse` + template_company_data=base_warehouse.company_data, + ) + vault_prod_bm_from_base, vault_EI_bm_from_base = vault_benchmarks_from_base + vault_warehouse = DataVaultWarehouse( + vault, + vault_company_data, + vault_prod_bm_from_base, + vault_EI_bm_from_base, + estimate_missing_data=DataWarehouse.estimate_missing_s3_data, + itr_prefix=itr_prefix, + ) + return vault_warehouse + + +@pytest.fixture +def vault_benchmarks(vault, request) -> Tuple[VaultProviderProductionBenchmark, VaultProviderIntensityBenchmark]: + try: + vault_prod_bm = VaultProviderProductionBenchmark( + vault, + benchmark_name=f"{itr_prefix}benchmark_prod", + prod_df=pd.DataFrame(), ) - # load intensity benchmarks - with open(self.benchmark_EI_json) as json_file: - parsed_json = json.load(json_file) - ei_bms = IEIBenchmarkScopes.model_validate(parsed_json) - self.vault_EI_bm = VaultProviderIntensityBenchmark( - engine_init, benchmark_name="benchmark_ei", EI_benchmarks=ei_bms + vault_EI_bm = VaultProviderIntensityBenchmark( + vault, + benchmark_name=f"{itr_prefix}benchmark_ei", ) + except ProgrammingError: + vault_prod_bm_from_base, vault_ei_bm_from_base = request.getfixturevalue("vault_benchmarks_from_base") - # load company data - # TODO: ISIC code should read as int, not float - self.vault_company_data = VaultCompanyDataProvider(engine_init, "company_data") + vault_prod_bm = VaultProviderProductionBenchmark( + vault, + benchmark_name=f"{itr_prefix}benchmark_prod", + ) - self.vault_warehouse = DataVaultWarehouse( - engine_init, - self.vault_company_data, - self.vault_production_bm, - self.vault_EI_bm, + vault_EI_bm = VaultProviderIntensityBenchmark( + vault, + benchmark_name=f"{itr_prefix}benchmark_ei", ) - def test_N0_projections(self): - sqlstring = "trino://{user}@{host}:{port}/".format( - user=os.environ["TRINO_USER_USER1"], - 
host=os.environ["TRINO_HOST"], - port=os.environ["TRINO_PORT"], + assert not vault_prod_bm.own_data + assert not vault_EI_bm.own_data + return (vault_prod_bm, vault_EI_bm) + + +@pytest.fixture +def vault_warehouse(vault, vault_benchmarks) -> DataVaultWarehouse: + # This creates a wrapper around what should be existing data in the Data Vault. + # If no such data exists, it will fail + + vault_company_data = VaultCompanyDataProvider( + vault, + company_table=f"{itr_prefix}company_data", + # We don't use `base_company_data` because base_warehouse creates projections we need + template_company_data=None, + ) + vault_production_bm, vault_ei_bm = vault_benchmarks + + # Verify that we have all the tables we need + tablenames = [ + "company_data", + "benchmark_prod", + "benchmark_ei", + "production_data", + "trajectory_data", + "target_data", + "cumulative_emissions", + "cumulative_budgets", + ] + sql_counts = ",".join( + [f"{tablename}_cnt as (select count (*) as cnt from {itr_prefix}{tablename})" for tablename in tablenames] + ) + sql_sums = "+".join([f"{tablename}_cnt.cnt" for tablename in tablenames]) + sql_joins = ",".join([f"{tablename}_cnt" for tablename in tablenames]) + # One N-clause statement executes about N times faster than N individual checks + qres = osc._do_sql(f"with {sql_counts} select {sql_sums} from {sql_joins}", engine=vault.engine, verbose=True) + warehouse = DataVaultWarehouse( + vault, + company_data=vault_company_data, + benchmark_projected_production=vault_production_bm, + benchmarks_projected_ei=vault_ei_bm, + itr_prefix=itr_prefix, + ) + return warehouse + + +@pytest.mark.parametrize( + "base_warehouse_x,vault_warehouse_x", + [ + ("base_warehouse", "vault_warehouse_from_base"), + ("base_warehouse", "vault_warehouse"), + ], +) +def test_warehouse(base_warehouse_x: DataWarehouse, vault_warehouse_x: DataVaultWarehouse, request) -> None: + base_warehouse_x = request.getfixturevalue(base_warehouse_x) + vault_warehouse_x = request.getfixturevalue(vault_warehouse_x) + vault = vault_warehouse_x._v + base_company = next(iter(base_warehouse_x.company_data.get_company_data(["US00130H1059"]))) + vault_company_data = vault_warehouse_x.company_data + assert base_company.projected_targets.S1S2 is not None + company_0_id, company_0_s1s2_ser = ( + base_company.company_id, + base_company.projected_targets.S1S2.projections, + ) + + ser_from_vault = read_quantified_sql( + f"select year, ei_s1s2_by_year, ei_s1s2_by_year_units from {vault_warehouse_x._target_table} where company_id='{company_0_id}' order by year", + vault_warehouse_x._target_table, + vault.engine, + vault.schema, + index_col="year", + ).squeeze() + assert company_0_s1s2_ser.compare(ser_from_vault).empty + + company_info_at_base_year = vault_company_data.get_company_intensity_and_production_at_base_year( + [company_0_id], + ) + assert base_warehouse_x.benchmark_projected_production is not None + projected_production = base_warehouse_x.benchmark_projected_production.get_company_projected_production( + company_info_at_base_year + ) + company_proj_production = projected_production.loc[:, EScope.S1S2, :].stack(level=0) + company_proj_production.index.set_names(["company_id", "year"], inplace=True) + company_proj_production.name = "production_by_year" + ser_from_vault = read_quantified_sql( + f"select company_id, year, production_by_year, production_by_year_units from {vault_warehouse_x._production_table} where company_id='{company_0_id}' order by year", + vault_warehouse_x._production_table, + vault.engine, + vault.schema, + 
index_col=["company_id", "year"], + ).squeeze() + assert company_proj_production.compare(ser_from_vault).empty + + company_0_cumulative_em = DataWarehouse._get_cumulative_emissions( + base_warehouse_x.company_data.get_company_projected_targets([company_0_id]), + projected_production, + ).stack(level=0) + company_0_cumulative_em.index.set_names(["company_id", "scope", "year"], inplace=True) + company_0_cumulative_em.name = "cumulative_target" + + df_from_vault = read_quantified_sql( + f"select company_id, scope, year, cumulative_target, cumulative_target_units from {vault_warehouse_x._emissions_table} where company_id='{company_0_id}' order by year", + f"{itr_prefix}cumulative_emissions", + vault.engine, + vault.schema, + index_col=["company_id", "scope", "year"], + ).astype("pint[t CO2e]") + + assert ( + company_0_cumulative_em.loc[:, EScope.S1S2, :] + .pint.m.round(2) + .compare(df_from_vault.loc[:, "S1S2", :].squeeze().pint.m.round(2)) + .empty + ) + + qres = osc._do_sql("show tables", engine=engine_init) + assert len(qres) >= 8 + qres = osc._do_sql(f"select count (*) from {itr_prefix}benchmark_prod", engine=engine_init, verbose=True) + assert len(qres) > 0 and qres[0] == (2208,) + qres = osc._do_sql(f"select count (*) from {itr_prefix}benchmark_ei", engine=engine_init, verbose=True) + assert len(qres) > 0 and qres[0] == (11040,) + qres = osc._do_sql(f"select count (*) from {itr_prefix}company_data", engine=engine_init, verbose=True) + assert len(qres) > 0 and len(qres[0]) > 0 and qres[0][0] > 0 + + +def test_tempscore_from_base(base_warehouse) -> None: + df_portfolio = pd.read_excel(company_data_path, sheet_name="Portfolio").iloc[[0]] + + for i, col in enumerate(df_portfolio.columns): + if col.startswith("investment_value"): + if match := re.match(r".*\[([A-Z]{3})\]", col, re.I): + df_portfolio.rename(columns={col: "investment_value"}, inplace=True) + df_portfolio["investment_value"] = df_portfolio["investment_value"].astype(f"pint[{match.group(1)}]") + companies = ITR.utils.dataframe_to_portfolio(df_portfolio) + temperature_score = TemperatureScore(time_frames=[ETimeFrames.LONG], scopes=EScope.get_result_scopes()) + df = temperature_score.calculate( + data_warehouse=base_warehouse, + portfolio=companies, + target_probability=0.5, + ) + assert df[df.scope == EScope.S1S2].temperature_score.pint.m.round(2).item() == 2.41 + + +def test_temp_scores(vault_warehouse) -> None: + engine_quant = osc.attach_trino_engine(verbose=True, catalog=ingest_catalog, schema=ingest_schema) + + quant_vault = VaultInstance( + engine=engine_quant, + schema=ingest_schema, + hive_bucket=hive_bucket, + hive_catalog=hive_catalog, + hive_schema=hive_schema, + ) + vault_warehouse.quant_init(quant_vault, company_data=None, itr_prefix=itr_prefix) + df_portfolio = pd.read_excel(company_data_path, sheet_name="Portfolio", index_col="company_id") + + for i, col in enumerate(df_portfolio.columns): + if col.startswith("investment_value"): + if match := re.match(r".*\[([A-Z]{3})\]", col, re.I): + df_portfolio.rename(columns={col: "investment_value"}, inplace=True) + df_portfolio["investment_value"] = df_portfolio["investment_value"].astype(f"pint[{match.group(1)}]") + df_portfolio["pa_score"] = ( + vault_warehouse.get_pa_temp_scores( + probability=0.5, company_ids=df_portfolio.index.values, scope=EScope.S1S2, year=2050 ) - sqlargs = { - "auth": trino.auth.JWTAuthentication(os.environ["TRINO_PASSWD_USER1"]), - "http_scheme": "https", - } - engine_dev = create_engine(sqlstring, connect_args=sqlargs) - print("connecting with 
engine " + str(engine_dev)) - connection_dev = engine_dev.connect() - # Show projections for emissions trajectories, production, and emission targets (N0 only) - # Show cumulative emissions (trajectory, target) and budget (N1 can also see) - pass - - def test_N1_temp_scores(self): - sqlstring = "trino://{user}@{host}:{port}/".format( - user=os.environ["TRINO_USER_USER2"], - host=os.environ["TRINO_HOST"], - port=os.environ["TRINO_PORT"], - ) - sqlargs = { - "auth": trino.auth.JWTAuthentication(os.environ["TRINO_PASSWD_USER2"]), - "http_scheme": "https", - } - engine_quant = create_engine(sqlstring, connect_args=sqlargs) - print("connecting with engine " + str(engine_quant)) - connection_quant = engine_quant.connect() - # Show cumulative emissions (trajectory, target) and budget (N1 can see) - # Show overshoot ratios (trajectory, target) (N1 can see) - # Show trajectory and target temp scores (N2 can also see) - pass - - def test_N2_portfolio(self): - sqlstring = "trino://{user}@{host}:{port}/".format( - user=os.environ["TRINO_USER_USER3"], - host=os.environ["TRINO_HOST"], - port=os.environ["TRINO_PORT"], - ) - sqlargs = { - "auth": trino.auth.JWTAuthentication(os.environ["TRINO_PASSWD_USER3"]), - "http_scheme": "https", - } - engine_user = create_engine(sqlstring, connect_args=sqlargs) - print("connecting with engine " + str(engine_user)) - connection_user = engine_user.connect() - # Show weighted temp score over portfolio (N2 can see) - # Different weighting types give different coefficients - pass + .droplevel("scope") + .astype("pint[delta_degC]") + ) + assert df_portfolio.loc["US00130H1059"].pa_score.m.round(2).item() == 2.41